From 91447636331957f3d9b5ca5b508f07c526b0074d Mon Sep 17 00:00:00 2001
From: Apple <opensource@apple.com>
Date: Thu, 28 Apr 2005 00:52:30 +0000
Subject: [PATCH] xnu-792.tar.gz

---
EXTERNAL_HEADERS/Info.plist | 22 + EXTERNAL_HEADERS/Makefile | 39 + EXTERNAL_HEADERS/ar.h | 2 + EXTERNAL_HEADERS/architecture/Makefile | 32 + EXTERNAL_HEADERS/architecture/byte_order.h | 15 +- EXTERNAL_HEADERS/architecture/i386/Makefile | 37 + EXTERNAL_HEADERS/architecture/ppc/Makefile | 34 + .../architecture/ppc/byte_order.h | 78 +- EXTERNAL_HEADERS/architecture/ppc/cframe.h | 18 +- EXTERNAL_HEADERS/bsd/i386/ansi.h | 104 - EXTERNAL_HEADERS/bsd/ppc/ansi.h | 104 - EXTERNAL_HEADERS/i386/Makefile | 27 + .../i386/_limits.h | 8 +- EXTERNAL_HEADERS/{bsd => }/i386/limits.h | 7 +- EXTERNAL_HEADERS/mach-o/Makefile | 30 + EXTERNAL_HEADERS/mach-o/fat.h | 15 + EXTERNAL_HEADERS/mach-o/loader.h | 116 +- EXTERNAL_HEADERS/mach-o/nlist.h | 14 + EXTERNAL_HEADERS/machine/Makefile | 30 + EXTERNAL_HEADERS/ppc/Makefile | 27 + .../disk.h => EXTERNAL_HEADERS/ppc/_limits.h | 8 +- EXTERNAL_HEADERS/{bsd => }/ppc/limits.h | 6 +- Makefile | 6 +- bsd/Makefile | 2 + bsd/bsm/Makefile | 5 +- bsd/bsm/audit.h | 5 +- bsd/bsm/audit_kernel.h | 47 +- bsd/bsm/audit_klib.h | 10 +- bsd/bsm/audit_record.h | 14 +- bsd/conf/MASTER | 18 +- bsd/conf/MASTER.i386 | 4 +- bsd/conf/MASTER.ppc | 6 +- bsd/conf/Makefile | 4 + bsd/conf/Makefile.i386 | 294 + bsd/conf/Makefile.template | 38 +- bsd/conf/files | 69 +- bsd/conf/files.i386 | 1 - bsd/conf/files.ppc | 2 +- bsd/conf/param.c | 6 +- bsd/conf/tools/Makefile | 8 +- bsd/conf/tools/newvers/Makefile | 49 - bsd/conf/tools/newvers/newvers.csh | 33 - bsd/conf/version.major | 1 - bsd/conf/version.minor | 1 - bsd/conf/version.variant | 1 - bsd/crypto/Makefile | 4 +- bsd/crypto/{rijndael => aes}/Makefile | 8 +- bsd/crypto/aes/aes.h | 175 + bsd/crypto/aes/aescrypt.c | 407 + bsd/crypto/aes/aeskey.c | 455 ++ bsd/crypto/aes/aesopt.h | 753 ++ bsd/crypto/aes/aestab.c | 384 + bsd/crypto/aes/aestab.h | 175 + bsd/crypto/blowfish/Makefile | 2 +- bsd/crypto/blowfish/blowfish.h | 6 +- bsd/crypto/cast128/Makefile | 2 +- bsd/crypto/cast128/cast128.h | 14 +- bsd/crypto/des/Makefile | 2 +- bsd/crypto/des/des.h | 41 +- bsd/crypto/md5.c | 2 +- bsd/crypto/md5.h | 8 +- bsd/crypto/rc4/Makefile | 2 +- bsd/crypto/rijndael/boxes-fst.dat | 958 --- bsd/crypto/rijndael/rijndael-alg-fst.c | 488 -- bsd/crypto/rijndael/rijndael-alg-fst.h | 34 - bsd/crypto/rijndael/rijndael-api-fst.c | 484 -- bsd/crypto/rijndael/rijndael-api-fst.h | 104 - bsd/crypto/rijndael/rijndael.h | 4 - bsd/crypto/rijndael/rijndael_local.h | 11 - bsd/crypto/sha1.c | 2 +- bsd/crypto/sha1.h | 8 +- bsd/crypto/sha2/Makefile | 2 +- bsd/crypto/sha2/sha2.h | 34 +- bsd/dev/Makefile | 7 +- bsd/dev/i386/conf.c | 113 +- bsd/dev/i386/cons.c | 137 +- bsd/dev/i386/cons.h | 16 +- bsd/dev/i386/kern_machdep.c | 98 +- bsd/dev/i386/km.c | 92 +- bsd/dev/i386/mem.c | 78 +- bsd/dev/i386/memmove.c | 6 +- bsd/dev/i386/stubs.c | 91 +- bsd/dev/i386/sysctl.c | 37 +- bsd/dev/i386/unix_signal.c | 254 +- bsd/dev/i386/unix_startup.c | 160 - bsd/dev/memdev.c | 172 +- bsd/dev/ppc/chud/chud_bsd_callback.c | 27 +- bsd/dev/ppc/chud/chud_process.c | 3 +- bsd/dev/ppc/conf.c | 7 +- bsd/dev/ppc/cons.c | 72 +- bsd/dev/ppc/cons.h | 8 +- bsd/dev/ppc/kern_machdep.c | 256 +- bsd/dev/ppc/km.c | 127 +- bsd/dev/ppc/mem.c | 76 +- bsd/dev/ppc/memmove.c | 4 +- bsd/dev/ppc/munge.s | 356 + bsd/dev/ppc/nvram.c | 7 +- bsd/dev/ppc/stubs.c | 51 +- bsd/dev/ppc/systemcalls.c | 383 +- bsd/dev/ppc/unix_signal.c | 612 +- bsd/dev/ppc/xsumas.s
| 600 +- bsd/dev/random/YarrowCoreLib/port/smf.c | 7 +- bsd/dev/random/YarrowCoreLib/src/comp.c | 10 +- bsd/dev/random/YarrowCoreLib/src/prng.c | 13 +- bsd/dev/random/YarrowCoreLib/src/sha1mod.c | 11 +- bsd/dev/random/YarrowCoreLib/src/sha1mod.h | 4 +- bsd/dev/random/YarrowCoreLib/src/smf.h | 2 +- .../random/YarrowCoreLib/src/yarrowUtils.c | 1 + bsd/dev/random/randomdev.c | 44 +- bsd/dev/random/randomdev.h | 4 +- bsd/dev/{ppc => }/unix_startup.c | 129 +- bsd/dev/vn/shadow.c | 12 +- bsd/dev/vn/shadow.h | 5 + bsd/dev/vn/vn.c | 1060 ++- bsd/hfs/MacOSStubs.c | 16 +- bsd/hfs/Makefile | 2 +- bsd/hfs/hfs.h | 349 +- bsd/hfs/hfs_attrlist.c | 955 ++- bsd/hfs/hfs_btreeio.c | 305 +- bsd/hfs/hfs_catalog.c | 1055 ++- bsd/hfs/hfs_catalog.h | 103 +- bsd/hfs/hfs_chash.c | 392 +- bsd/hfs/hfs_cnode.c | 1257 +-- bsd/hfs/hfs_cnode.h | 202 +- bsd/hfs/hfs_dbg.h | 2 +- bsd/hfs/hfs_encodinghint.c | 10 +- bsd/hfs/hfs_encodings.c | 62 +- bsd/hfs/hfs_endian.c | 49 + bsd/hfs/hfs_format.h | 104 +- bsd/hfs/hfs_fsctl.h | 65 + bsd/hfs/hfs_hotfiles.c | 812 +- bsd/hfs/hfs_hotfiles.h | 12 +- bsd/hfs/hfs_link.c | 272 +- bsd/hfs/hfs_lockf.c | 707 -- bsd/hfs/hfs_lockf.h | 117 - bsd/hfs/hfs_lookup.c | 531 +- bsd/hfs/hfs_macos_defs.h | 3 +- bsd/hfs/hfs_mount.h | 11 +- bsd/hfs/hfs_notification.c | 11 +- bsd/hfs/hfs_quota.c | 543 +- bsd/hfs/hfs_quota.h | 26 +- bsd/hfs/hfs_readwrite.c | 3071 ++++---- bsd/hfs/hfs_search.c | 460 +- bsd/hfs/hfs_vfsops.c | 2721 ++++--- bsd/hfs/hfs_vfsutils.c | 969 ++- bsd/hfs/hfs_vnops.c | 4310 +++++------ bsd/hfs/hfs_xattr.c | 1062 +++ bsd/hfs/hfscommon/BTree/BTree.c | 12 +- bsd/hfs/hfscommon/BTree/BTreeNodeReserve.c | 40 +- bsd/hfs/hfscommon/BTree/BTreeScanner.c | 32 +- bsd/hfs/hfscommon/BTree/BTreeTreeOps.c | 2 +- bsd/hfs/hfscommon/Catalog/Catalog.c | 245 - bsd/hfs/hfscommon/Catalog/CatalogIterators.c | 643 -- bsd/hfs/hfscommon/Catalog/CatalogUtilities.c | 4 +- bsd/hfs/hfscommon/Catalog/FileIDsServices.c | 29 +- bsd/hfs/hfscommon/Misc/FileExtentMapping.c | 336 +- bsd/hfs/hfscommon/Misc/VolumeAllocation.c | 177 +- bsd/hfs/hfscommon/Unicode/UnicodeWrappers.c | 32 +- bsd/hfs/hfscommon/headers/BTreeScanner.h | 3 +- bsd/hfs/hfscommon/headers/BTreesInternal.h | 6 +- bsd/hfs/hfscommon/headers/CatalogPrivate.h | 84 +- bsd/hfs/hfscommon/headers/FileMgrInternal.h | 57 +- bsd/i386/Makefile | 14 +- bsd/i386/_types.h | 118 + bsd/i386/cpu.h | 36 - bsd/i386/endian.h | 49 +- bsd/i386/exec.h | 11 +- bsd/i386/label_t.h | 40 - bsd/i386/param.h | 8 +- bsd/i386/reboot.h | 6 +- bsd/i386/setjmp.h | 33 +- bsd/i386/signal.h | 8 +- bsd/i386/spl.h | 55 - bsd/i386/table.h | 33 - bsd/i386/types.h | 35 +- bsd/i386/ucontext.h | 26 +- bsd/i386/vmparam.h | 2 +- bsd/isofs/cd9660/Makefile | 4 +- bsd/isofs/cd9660/cd9660_bmap.c | 96 +- bsd/isofs/cd9660/cd9660_lookup.c | 156 +- bsd/isofs/cd9660/cd9660_mount.h | 30 +- bsd/isofs/cd9660/cd9660_node.c | 192 +- bsd/isofs/cd9660/cd9660_node.h | 88 +- bsd/isofs/cd9660/cd9660_rrip.c | 162 +- bsd/isofs/cd9660/cd9660_util.c | 367 +- bsd/isofs/cd9660/cd9660_vfsops.c | 869 ++- bsd/isofs/cd9660/cd9660_vnops.c | 974 +-- bsd/isofs/cd9660/iso.h | 83 +- bsd/isofs/cd9660/iso_rrip.h | 16 +- bsd/kern/ast.h | 14 +- bsd/kern/bsd_init.c | 286 +- bsd/kern/bsd_stubs.c | 14 +- bsd/kern/init_sysent.c | 1240 ++- bsd/kern/kdebug.c | 588 +- bsd/kern/kern_acct.c | 161 +- bsd/kern/kern_aio.c | 569 +- bsd/kern/kern_audit.c | 709 +- bsd/kern/kern_authorization.c | 1014 +++ bsd/kern/kern_bsm_audit.c | 55 +- bsd/kern/kern_bsm_klib.c | 70 +- bsd/kern/kern_bsm_token.c | 112 +- bsd/kern/kern_clock.c | 96 +- 
bsd/kern/kern_control.c | 1099 ++- bsd/kern/kern_core.c | 238 +- bsd/kern/kern_credential.c | 2268 ++++++ bsd/kern/kern_descrip.c | 2437 ++++-- bsd/kern/kern_event.c | 2098 +++-- bsd/kern/kern_exec.c | 2138 +++--- bsd/kern/kern_exit.c | 880 ++- bsd/kern/kern_fork.c | 276 +- bsd/kern/kern_ktrace.c | 322 +- bsd/kern/kern_lock.c | 23 +- .../ufs/ufs_lockf.c => kern/kern_lockf.c} | 469 +- bsd/kern/kern_malloc.c | 51 +- bsd/kern/kern_mib.c | 99 +- bsd/kern/kern_mman.c | 865 +-- bsd/kern/kern_newsysctl.c | 64 +- bsd/kern/kern_panicinfo.c | 273 +- bsd/kern/kern_pcsamples.c | 215 +- bsd/kern/kern_physio.c | 145 +- bsd/kern/kern_proc.c | 221 +- bsd/kern/kern_prot.c | 931 ++- bsd/kern/kern_resource.c | 178 +- bsd/kern/kern_shutdown.c | 124 +- bsd/kern/kern_sig.c | 954 +-- bsd/kern/kern_subr.c | 1046 ++- bsd/kern/kern_symfile.c | 115 +- bsd/kern/kern_synch.c | 267 +- bsd/kern/kern_sysctl.c | 1180 ++- bsd/kern/kern_time.c | 297 +- bsd/kern/kern_xxx.c | 134 +- bsd/kern/kpi_mbuf.c | 939 +++ bsd/kern/kpi_socket.c | 772 ++ bsd/kern/kpi_socketfilter.c | 595 ++ bsd/kern/mach_fat.c | 61 +- bsd/kern/mach_header.c | 157 +- bsd/kern/mach_header.h | 21 +- bsd/kern/mach_loader.c | 745 +- bsd/kern/mach_loader.h | 23 +- bsd/kern/mach_process.c | 194 +- bsd/kern/makesyscalls.sh | 694 ++ bsd/kern/netboot.c | 61 +- bsd/kern/posix_sem.c | 626 +- bsd/kern/posix_shm.c | 551 +- bsd/kern/qsort.c | 4 +- bsd/kern/spl.c | 33 +- bsd/kern/subr_log.c | 103 +- bsd/kern/subr_prf.c | 9 +- bsd/kern/subr_prof.c | 247 +- bsd/kern/subr_xxx.c | 7 +- bsd/kern/sys_domain.c | 5 +- bsd/kern/sys_generic.c | 2385 +++--- bsd/kern/sys_pipe.c | 1646 ++++ bsd/kern/sys_socket.c | 274 +- bsd/kern/syscalls.c | 825 +- bsd/kern/syscalls.master | 474 ++ bsd/kern/sysctl_init.c | 120 +- bsd/kern/sysv_ipc.c | 81 +- bsd/kern/sysv_msg.c | 668 +- bsd/kern/sysv_sem.c | 911 ++- bsd/kern/sysv_shm.c | 853 ++- bsd/kern/tty.c | 325 +- bsd/kern/tty_compat.c | 358 +- bsd/kern/tty_conf.c | 50 +- bsd/kern/tty_pty.c | 350 +- bsd/kern/tty_subr.c | 17 +- bsd/kern/tty_tb.c | 2 +- bsd/kern/tty_tty.c | 117 +- bsd/kern/ubc_subr.c | 1049 +-- bsd/kern/uipc_domain.c | 154 +- bsd/kern/uipc_mbuf.c | 1251 ++- bsd/kern/uipc_mbuf2.c | 189 +- bsd/kern/uipc_proto.c | 19 +- bsd/kern/uipc_socket.c | 1687 +++-- bsd/kern/uipc_socket2.c | 598 +- bsd/kern/uipc_syscalls.c | 1673 ++-- bsd/kern/uipc_usrreq.c | 597 +- bsd/libkern/Makefile | 3 - bsd/libkern/crc32.c | 104 + bsd/libkern/inet_ntoa.c | 70 - bsd/libkern/inet_ntop.c | 208 + bsd/libkern/libkern.h | 50 +- bsd/libkern/scanc.c | 5 +- bsd/machine/Makefile | 16 +- bsd/machine/{unix_traps.h => _limits.h} | 16 +- bsd/machine/{table.h => _types.h} | 16 +- bsd/machine/cons.h | 4 +- bsd/machine/disklabel.h | 4 +- bsd/machine/endian.h | 4 +- bsd/machine/exec.h | 4 +- bsd/machine/param.h | 4 +- bsd/machine/profile.h | 4 +- bsd/machine/psl.h | 4 +- bsd/machine/ptrace.h | 4 +- bsd/machine/reboot.h | 4 +- bsd/machine/reg.h | 4 +- bsd/machine/setjmp.h | 9 +- bsd/machine/signal.h | 4 +- bsd/machine/spl.h | 37 +- bsd/machine/types.h | 4 +- bsd/machine/ucontext.h | 4 +- bsd/machine/user.h | 35 - bsd/machine/vmparam.h | 4 +- bsd/man/man2/Makefile | 24 +- bsd/man/man2/accept.2 | 2 +- bsd/man/man2/aio_cancel.2 | 117 + bsd/man/man2/aio_error.2 | 100 + bsd/man/man2/aio_read.2 | 211 + bsd/man/man2/aio_return.2 | 103 + bsd/man/man2/aio_suspend.2 | 113 + bsd/man/man2/aio_write.2 | 204 + bsd/man/man2/bind.2 | 2 +- bsd/man/man2/brk.2 | 150 - bsd/man/man2/chflags.2 | 4 +- bsd/man/man2/chown.2 | 94 +- bsd/man/man2/connect.2 | 14 +- bsd/man/man2/exchangedata.2 | 190 + 
bsd/man/man2/fcntl.2 | 13 + bsd/man/man2/flock.2 | 2 +- bsd/man/man2/fsync.2 | 42 +- bsd/man/man2/getattrlist.2 | 1684 +++++ bsd/man/man2/getdirentriesattr.2 | 427 ++ bsd/man/man2/getfsstat.2 | 21 +- bsd/man/man2/getpeername.2 | 2 +- bsd/man/man2/getsockname.2 | 2 +- bsd/man/man2/getsockopt.2 | 7 +- bsd/man/man2/getxattr.2 | 165 + bsd/man/man2/intro.2 | 29 +- bsd/man/man2/listxattr.2 | 153 + bsd/man/man2/madvise.2 | 2 +- bsd/man/man2/mkfifo.2 | 2 +- bsd/man/man2/poll.2 | 198 + bsd/man/man2/posix_madvise.2 | 2 +- bsd/man/man2/ptrace.2 | 245 +- bsd/man/man2/quotactl.2 | 2 +- bsd/man/man2/recv.2 | 16 +- bsd/man/man2/removexattr.2 | 135 + bsd/man/man2/sbrk.2 | 1 - bsd/man/man2/searchfs.2 | 804 ++ bsd/man/man2/select.2 | 10 +- bsd/man/man2/semctl.2 | 2 - bsd/man/man2/semget.2 | 2 - bsd/man/man2/semop.2 | 2 - bsd/man/man2/send.2 | 2 +- bsd/man/man2/setattrlist.2 | 363 + bsd/man/man2/setxattr.2 | 175 + bsd/man/man2/shmget.2 | 4 +- bsd/man/man2/shutdown.2 | 12 +- bsd/man/man2/vfork.2 | 2 +- bsd/man/man4/Makefile | 2 + bsd/man/man4/arp.4 | 3 +- bsd/man/man4/bpf.4 | 4 + bsd/man/man4/dummynet.4 | 64 + bsd/man/man4/icmp6.4 | 366 +- bsd/man/man4/ifmib.4 | 196 + bsd/man/man4/ip6.4 | 1155 ++- bsd/man/man4/termios.4 | 4 +- bsd/man/man4/unix.4 | 29 + bsd/man/man5/types.5 | 4 +- bsd/man/man9/fetch.9 | 49 +- bsd/man/man9/store.9 | 47 +- bsd/miscfs/deadfs/dead_vnops.c | 303 +- bsd/miscfs/devfs/devfs.h | 12 +- bsd/miscfs/devfs/devfs_proto.h | 9 +- bsd/miscfs/devfs/devfs_tree.c | 606 +- bsd/miscfs/devfs/devfs_vfsops.c | 227 +- bsd/miscfs/devfs/devfs_vnops.c | 1256 ++- bsd/miscfs/devfs/devfsdefs.h | 83 +- bsd/miscfs/fdesc/fdesc.h | 29 +- bsd/miscfs/fdesc/fdesc_vfsops.c | 153 +- bsd/miscfs/fdesc/fdesc_vnops.c | 515 +- bsd/miscfs/fifofs/fifo.h | 105 +- bsd/miscfs/fifofs/fifo_vnops.c | 486 +- bsd/miscfs/nullfs/null.h | 13 +- bsd/miscfs/nullfs/null_subr.c | 28 +- bsd/miscfs/nullfs/null_vfsops.c | 155 +- bsd/miscfs/nullfs/null_vnops.c | 233 +- bsd/miscfs/specfs/spec_lockf.c | 706 -- bsd/miscfs/specfs/spec_vnops.c | 692 +- bsd/miscfs/specfs/specdev.h | 98 +- bsd/miscfs/synthfs/synthfs.h | 97 +- bsd/miscfs/synthfs/synthfs_util.c | 106 +- bsd/miscfs/synthfs/synthfs_vfsops.c | 377 +- bsd/miscfs/synthfs/synthfs_vnops.c | 1146 +-- bsd/miscfs/union/union.h | 60 +- bsd/miscfs/union/union_subr.c | 323 +- bsd/miscfs/union/union_vfsops.c | 298 +- bsd/miscfs/union/union_vnops.c | 936 +-- bsd/miscfs/volfs/volfs.h | 139 +- bsd/miscfs/volfs/volfs_vfsops.c | 326 +- bsd/miscfs/volfs/volfs_vnops.c | 1607 ++-- bsd/net/Makefile | 28 +- bsd/net/bpf.c | 783 +- bsd/net/bpf.h | 63 +- bsd/net/bpf_filter.c | 28 +- bsd/net/bpfdesc.h | 4 +- bsd/net/bridge.c | 166 +- bsd/net/bridge.h | 1 - bsd/net/bsd_comp.c | 39 +- bsd/net/devtimer.c | 276 + bsd/net/devtimer.h | 89 + bsd/net/dlil.c | 3865 +++++----- bsd/net/dlil.h | 301 +- bsd/net/dlil_pvt.h | 16 +- bsd/net/ether_at_pr_module.c | 350 +- bsd/net/ether_if_module.c | 891 +-- bsd/net/ether_inet6_pr_module.c | 378 +- bsd/net/ether_inet_pr_module.c | 609 +- bsd/net/ethernet.h | 23 +- bsd/{ppc/label_t.h => net/ieee8023ad.h} | 36 +- bsd/net/if.c | 1253 +-- bsd/net/if.h | 206 +- bsd/net/if_arp.h | 54 - bsd/net/if_atm.h | 19 +- bsd/net/if_bond.c | 4485 +++++++++++ bsd/net/if_bond_var.h | 92 + bsd/net/if_disc.c | 4 +- bsd/net/if_dl.h | 11 +- bsd/net/if_dummy.c | 70 +- bsd/net/if_ether.h | 52 + bsd/net/if_ethersubr.c | 219 +- bsd/net/if_faith.c | 56 +- bsd/net/if_faith.h | 8 +- bsd/net/if_gif.c | 267 +- bsd/net/if_gif.h | 21 +- bsd/net/if_loop.c | 468 +- bsd/net/if_media.c | 18 +- bsd/net/if_media.h | 38 
+- bsd/net/if_mib.c | 116 +- bsd/net/if_mib.h | 27 +- bsd/net/if_pppvar.h | 28 +- bsd/net/if_sppp.h | 205 - bsd/net/if_stf.c | 217 +- bsd/net/if_stf.h | 41 - bsd/net/if_tun.c | 764 -- bsd/net/if_tun.h | 66 - bsd/net/if_tunvar.h | 73 - bsd/net/if_types.h | 11 +- bsd/net/if_var.h | 599 +- bsd/net/if_vlan.c | 2108 ++++-- bsd/net/if_vlan_var.h | 4 + bsd/net/init.c | 107 + bsd/net/init.h | 59 + bsd/net/iso88025.h | 11 - bsd/net/kext_net.c | 5 +- bsd/net/kext_net.h | 209 +- bsd/net/kpi_interface.c | 1355 ++++ bsd/net/kpi_interface.h | 1617 ++++ bsd/net/kpi_interfacefilter.c | 47 + bsd/net/kpi_interfacefilter.h | 195 + bsd/net/kpi_protocol.c | 366 + bsd/net/kpi_protocol.h | 176 + bsd/net/lacp.h | 418 + bsd/net/multicast_list.c | 145 + bsd/{machine/ansi.h => net/multicast_list.h} | 49 +- bsd/net/ndrv.c | 571 +- bsd/net/ndrv.h | 15 +- bsd/net/ndrv_var.h | 15 +- bsd/net/net_osdep.c | 1 - bsd/net/net_osdep.h | 6 +- bsd/net/netisr.c | 133 - bsd/net/pfkeyv2.h | 8 +- bsd/net/ppp_comp.h | 38 +- bsd/net/ppp_deflate.c | 35 +- bsd/net/ppp_defs.h | 8 - bsd/net/radix.c | 74 +- bsd/net/radix.h | 65 +- bsd/net/raw_cb.c | 12 + bsd/net/raw_cb.h | 24 +- bsd/net/raw_usrreq.c | 70 +- bsd/net/route.c | 359 +- bsd/net/route.h | 116 +- bsd/net/rtsock.c | 793 +- bsd/net/slcompress.c | 635 -- bsd/net/slcompress.h | 188 - bsd/net/zlib.c | 15 +- bsd/net/zlib.h | 8 +- bsd/netat/Makefile | 20 +- bsd/netat/adsp.c | 285 +- bsd/netat/adsp.h | 42 +- bsd/netat/adsp_CLDeny.c | 4 +- bsd/netat/adsp_Close.c | 2 +- bsd/netat/adsp_Control.c | 2 +- bsd/netat/adsp_Timer.c | 4 +- bsd/netat/adsp_attention.c | 4 +- bsd/netat/adsp_internal.h | 37 +- bsd/netat/appletalk.h | 9 +- bsd/netat/asp.h | 9 +- bsd/netat/asp_proto.c | 65 +- bsd/netat/at.c | 47 +- bsd/netat/at_aarp.h | 15 +- bsd/netat/at_config.h | 3 + bsd/netat/at_ddp_brt.h | 6 +- bsd/netat/at_pat.h | 3 + bsd/netat/at_pcb.c | 7 +- bsd/netat/at_pcb.h | 27 +- bsd/netat/at_proto.c | 49 +- bsd/netat/at_snmp.h | 3 + bsd/netat/at_var.h | 40 +- bsd/netat/atp.h | 9 +- bsd/netat/atp_misc.c | 4 +- bsd/netat/atp_open.c | 2 +- bsd/netat/atp_read.c | 19 +- bsd/netat/atp_write.c | 117 +- bsd/netat/aurp.h | 87 +- bsd/netat/aurp_aurpd.c | 46 +- bsd/netat/aurp_cfg.c | 2 +- bsd/netat/aurp_misc.c | 7 +- bsd/netat/aurp_open.c | 12 +- bsd/netat/aurp_ri.c | 36 +- bsd/netat/aurp_rx.c | 1 + bsd/netat/aurp_tickle.c | 9 +- bsd/netat/ddp.c | 11 +- bsd/netat/ddp.h | 12 +- bsd/netat/ddp_aarp.c | 19 +- bsd/netat/ddp_brt.c | 14 +- bsd/netat/ddp_lap.c | 66 +- bsd/netat/ddp_nbp.c | 4 +- bsd/netat/ddp_proto.c | 1 - bsd/netat/ddp_r_rtmp.c | 51 +- bsd/netat/ddp_r_zip.c | 56 +- bsd/netat/ddp_rtmp.c | 13 +- bsd/netat/ddp_usrreq.c | 2 +- bsd/netat/debug.h | 6 +- bsd/netat/drv_dep.c | 64 +- bsd/netat/ep.h | 3 + bsd/netat/lap.h | 3 + bsd/netat/nbp.h | 19 +- bsd/netat/pap.h | 3 + bsd/netat/routing_tables.h | 17 +- bsd/netat/rtmp.h | 3 + bsd/netat/sys_dep.c | 185 +- bsd/netat/sys_glue.c | 176 +- bsd/netat/sysglue.h | 42 +- bsd/netat/zip.h | 3 + bsd/netinet/Makefile | 29 +- bsd/netinet/bootp.h | 2 + bsd/netinet/dhcp_options.h | 4 +- bsd/netinet/icmp6.h | 30 +- bsd/netinet/icmp_var.h | 12 +- bsd/netinet/if_atm.h | 10 +- bsd/netinet/if_ether.c | 923 --- bsd/netinet/if_ether.h | 76 +- bsd/netinet/if_fddi.h | 16 +- bsd/netinet/if_tun.h | 9 +- bsd/netinet/igmp.c | 45 +- bsd/netinet/igmp_var.h | 23 +- bsd/netinet/in.c | 432 +- bsd/netinet/in.h | 99 +- bsd/netinet/in_arp.c | 876 +++ bsd/netinet/in_arp.h | 131 + bsd/netinet/in_bootp.c | 63 +- bsd/netinet/in_gif.c | 16 +- bsd/netinet/in_gif.h | 14 +- bsd/netinet/in_pcb.c | 709 +- 
bsd/netinet/in_pcb.h | 293 +- bsd/netinet/in_proto.c | 116 +- bsd/netinet/in_rmx.c | 57 +- bsd/netinet/in_systm.h | 15 +- bsd/netinet/in_var.h | 49 +- bsd/netinet/ip.h | 4 + bsd/netinet/ip6.h | 18 +- bsd/netinet/ip_compat.h | 15 +- bsd/netinet/ip_divert.c | 300 +- bsd/netinet/ip_divert.h | 92 + bsd/netinet/ip_dummynet.c | 877 ++- bsd/netinet/ip_dummynet.h | 225 +- bsd/netinet/ip_ecn.h | 10 +- bsd/netinet/ip_encap.c | 10 +- bsd/netinet/ip_encap.h | 28 +- bsd/netinet/ip_flow.c | 4 +- bsd/netinet/ip_flow.h | 4 +- bsd/netinet/ip_fw.h | 21 +- bsd/netinet/ip_fw2.c | 3324 ++++++++ bsd/netinet/ip_fw2.h | 443 ++ bsd/netinet/ip_fw2_compat.c | 2253 ++++++ bsd/netinet/ip_fw2_compat.h | 375 + bsd/netinet/ip_icmp.c | 86 +- bsd/netinet/ip_icmp.h | 10 +- bsd/netinet/ip_id.c | 10 +- bsd/netinet/ip_input.c | 712 +- bsd/netinet/ip_mroute.c | 66 +- bsd/netinet/ip_mroute.h | 29 +- bsd/netinet/ip_output.c | 675 +- bsd/netinet/ip_var.h | 98 +- bsd/netinet/kpi_ipfilter.c | 496 ++ bsd/netinet/kpi_ipfilter.h | 193 + .../table.h => netinet/kpi_ipfilter_var.h} | 46 +- bsd/netinet/raw_ip.c | 247 +- bsd/netinet/tcp.h | 45 +- bsd/netinet/tcp_debug.h | 9 +- bsd/netinet/tcp_fsm.h | 10 +- bsd/netinet/tcp_input.c | 514 +- bsd/netinet/tcp_output.c | 146 +- bsd/netinet/tcp_seq.h | 10 +- bsd/netinet/tcp_subr.c | 217 +- bsd/netinet/tcp_timer.c | 236 +- bsd/netinet/tcp_timer.h | 20 +- bsd/netinet/tcp_usrreq.c | 105 +- bsd/netinet/tcp_var.h | 268 +- bsd/netinet/tcpip.h | 2 +- bsd/netinet/udp_usrreq.c | 494 +- bsd/netinet/udp_var.h | 28 +- bsd/netinet6/Makefile | 27 +- bsd/netinet6/ah.h | 32 +- bsd/netinet6/ah6.h | 18 +- bsd/netinet6/ah_core.c | 91 +- bsd/netinet6/ah_input.c | 53 +- bsd/netinet6/ah_output.c | 2 +- bsd/netinet6/dest6.c | 6 +- bsd/netinet6/esp.h | 48 +- bsd/netinet6/esp6.h | 16 +- bsd/netinet6/esp_core.c | 112 +- bsd/netinet6/esp_input.c | 49 +- bsd/netinet6/esp_output.c | 4 +- bsd/netinet6/esp_rijndael.c | 397 +- bsd/netinet6/esp_rijndael.h | 19 +- bsd/netinet6/frag6.c | 42 +- bsd/netinet6/icmp6.c | 168 +- bsd/netinet6/in6.c | 463 +- bsd/netinet6/in6.h | 305 +- bsd/netinet6/in6_gif.c | 14 +- bsd/netinet6/in6_gif.h | 12 +- bsd/netinet6/in6_ifattach.c | 220 +- bsd/netinet6/in6_ifattach.h | 22 +- bsd/netinet6/in6_pcb.c | 367 +- bsd/netinet6/in6_pcb.h | 66 +- bsd/netinet6/in6_prefix.c | 147 +- bsd/netinet6/in6_prefix.h | 9 +- bsd/netinet6/in6_proto.c | 89 +- bsd/netinet6/in6_rmx.c | 106 +- bsd/netinet6/in6_src.c | 112 +- bsd/netinet6/in6_var.h | 60 +- bsd/netinet6/ip6_ecn.h | 10 +- bsd/netinet6/ip6_forward.c | 77 +- bsd/netinet6/ip6_fw.c | 1369 ++++ bsd/netinet6/ip6_fw.h | 12 +- bsd/netinet6/ip6_input.c | 284 +- bsd/netinet6/ip6_mroute.c | 181 +- bsd/netinet6/ip6_mroute.h | 44 +- bsd/netinet6/ip6_output.c | 237 +- bsd/netinet6/ip6_var.h | 120 +- bsd/netinet6/ip6protosw.h | 41 +- bsd/netinet6/ipcomp.h | 18 +- bsd/netinet6/ipcomp6.h | 12 +- bsd/netinet6/ipcomp_core.c | 11 +- bsd/netinet6/ipcomp_input.c | 20 +- bsd/netinet6/ipcomp_output.c | 11 +- bsd/netinet6/ipsec.c | 185 +- bsd/netinet6/ipsec.h | 95 +- bsd/netinet6/ipsec6.h | 46 +- bsd/netinet6/mld6.c | 38 +- bsd/netinet6/mld6_var.h | 18 +- bsd/netinet6/nd6.c | 520 +- bsd/netinet6/nd6.h | 150 +- bsd/netinet6/nd6_nbr.c | 237 +- bsd/netinet6/nd6_rtr.c | 345 +- bsd/netinet6/pim6_var.h | 14 +- bsd/netinet6/raw_ip6.c | 126 +- bsd/netinet6/raw_ip6.h | 4 - bsd/netinet6/route6.c | 14 +- bsd/netinet6/scope6.c | 39 +- bsd/netinet6/scope6_var.h | 22 +- bsd/netinet6/tcp6_var.h | 16 +- bsd/netinet6/udp6_output.c | 25 +- bsd/netinet6/udp6_usrreq.c | 192 +- bsd/netinet6/udp6_var.h | 
16 +- bsd/netkey/Makefile | 11 +- bsd/netkey/key.c | 743 +- bsd/netkey/key.h | 57 +- bsd/netkey/key_debug.c | 18 +- bsd/netkey/key_debug.h | 32 +- bsd/netkey/key_var.h | 11 +- bsd/netkey/keydb.c | 2 +- bsd/netkey/keydb.h | 28 +- bsd/netkey/keysock.c | 225 +- bsd/netkey/keysock.h | 16 +- bsd/nfs/Makefile | 4 +- bsd/nfs/krpc.h | 18 +- bsd/nfs/krpc_subr.c | 257 +- bsd/nfs/nfs.h | 826 +- bsd/nfs/nfs_bio.c | 1821 +++-- bsd/nfs/nfs_boot.c | 216 +- bsd/nfs/nfs_lock.c | 309 +- bsd/nfs/nfs_lock.h | 36 +- bsd/nfs/nfs_node.c | 323 +- bsd/nfs/nfs_nqlease.c | 1353 ---- bsd/nfs/nfs_serv.c | 4186 +++++----- bsd/nfs/nfs_socket.c | 1728 ++--- bsd/nfs/nfs_srvcache.c | 114 +- bsd/nfs/nfs_subs.c | 2294 ++++-- bsd/nfs/nfs_syscalls.c | 1296 ++-- bsd/nfs/nfs_vfsops.c | 1260 +-- bsd/nfs/nfs_vnops.c | 4475 ++++++----- bsd/nfs/nfsdiskless.h | 26 +- bsd/nfs/nfsm_subs.h | 433 +- bsd/nfs/nfsmount.h | 28 +- bsd/nfs/nfsnode.h | 205 +- bsd/nfs/nfsproto.h | 40 +- bsd/nfs/nfsrtt.h | 3 +- bsd/nfs/nfsrvcache.h | 5 +- bsd/nfs/nlminfo.h | 52 - bsd/nfs/nqnfs.h | 244 - bsd/nfs/rpcv2.h | 3 +- bsd/nfs/xdr_subs.h | 14 +- bsd/ppc/Makefile | 14 +- bsd/ppc/_types.h | 118 + bsd/ppc/disklabel.h | 10 +- bsd/ppc/endian.h | 74 +- bsd/ppc/exec.h | 10 +- bsd/ppc/param.h | 13 +- bsd/ppc/reboot.h | 6 +- bsd/ppc/reg.h | 6 +- bsd/ppc/setjmp.h | 61 +- bsd/ppc/signal.h | 28 +- bsd/ppc/spl.h | 55 - bsd/ppc/types.h | 46 +- bsd/ppc/ucontext.h | 39 +- bsd/ppc/user.h | 30 - bsd/ppc/vmparam.h | 8 +- bsd/sys/Makefile | 86 +- bsd/{miscfs/specfs/lockf.h => sys/_endian.h} | 118 +- bsd/sys/_types.h | 198 + bsd/sys/acct.h | 10 +- bsd/sys/aio.h | 34 +- bsd/sys/aio_kern.h | 32 +- bsd/sys/attr.h | 188 +- bsd/sys/audit.h | 208 - bsd/sys/buf.h | 576 +- bsd/sys/buf_internal.h | 252 + bsd/sys/cdefs.h | 183 +- bsd/sys/clist.h | 7 +- bsd/sys/conf.h | 144 +- bsd/sys/dirent.h | 52 +- bsd/sys/disk.h | 15 +- bsd/sys/disklabel.h | 3 +- bsd/sys/dkstat.h | 8 +- bsd/sys/domain.h | 55 +- bsd/sys/errno.h | 75 +- bsd/sys/ev.h | 17 +- bsd/sys/event.h | 178 +- bsd/sys/eventvar.h | 29 +- bsd/sys/exec.h | 33 +- bsd/sys/fcntl.h | 205 +- bsd/sys/file.h | 137 +- bsd/sys/file_internal.h | 208 + bsd/sys/filedesc.h | 35 +- bsd/sys/fsctl.h | 5 +- bsd/sys/fsevents.h | 88 + bsd/{net/if_slvar.h => sys/imgact.h} | 102 +- bsd/sys/ioctl.h | 6 +- bsd/sys/ioctl_compat.h | 17 +- bsd/sys/ipc.h | 136 +- bsd/sys/ipcs.h | 94 + bsd/sys/kauth.h | 652 ++ bsd/sys/kdebug.h | 31 +- bsd/sys/kern_audit.h | 285 - bsd/sys/kern_control.h | 515 +- bsd/sys/kern_event.h | 212 +- bsd/sys/kernel.h | 23 +- bsd/sys/kernel_types.h | 127 + bsd/sys/kpi_mbuf.h | 1127 +++ bsd/sys/kpi_socket.h | 375 + bsd/sys/kpi_socketfilter.h | 604 ++ bsd/sys/ktrace.h | 30 +- bsd/sys/loadable_fs.h | 3 - bsd/sys/lock.h | 91 +- bsd/sys/lockf.h | 73 +- bsd/sys/mach_swapon.h | 6 +- bsd/sys/malloc.h | 80 +- bsd/sys/mbuf.h | 264 +- bsd/sys/md5.h | 2 +- bsd/sys/mman.h | 184 +- bsd/sys/mount.h | 527 +- bsd/sys/mount_internal.h | 301 + bsd/sys/msg.h | 215 +- bsd/sys/mtio.h | 6 +- bsd/sys/namei.h | 180 +- bsd/sys/param.h | 19 +- bsd/sys/pipe.h | 157 + bsd/sys/poll.h | 38 +- bsd/sys/proc.h | 392 +- bsd/sys/proc_internal.h | 369 + bsd/sys/protosw.h | 152 +- bsd/sys/ptrace.h | 17 +- bsd/{net/netisr.h => sys/ptrace_internal.h} | 66 +- bsd/sys/queue.h | 4 +- bsd/sys/quota.h | 101 +- bsd/sys/random.h | 5 +- bsd/sys/reboot.h | 3 + bsd/sys/resource.h | 199 +- bsd/sys/resourcevar.h | 32 +- bsd/sys/select.h | 122 +- bsd/sys/sem.h | 297 +- bsd/sys/sem_internal.h | 208 + bsd/sys/semaphore.h | 4 +- bsd/sys/shm.h | 146 +- bsd/sys/shm_internal.h | 117 + 
bsd/sys/signal.h | 327 +- bsd/sys/signalvar.h | 80 +- bsd/sys/socket.h | 305 +- bsd/sys/socketvar.h | 226 +- bsd/sys/sockio.h | 48 +- bsd/sys/stat.h | 386 +- bsd/sys/sys_domain.h | 13 +- bsd/sys/syscall.h | 761 +- bsd/sys/sysctl.h | 308 +- bsd/sys/sysent.h | 75 + bsd/sys/syslimits.h | 8 +- bsd/sys/syslog.h | 151 +- bsd/sys/sysproto.h | 1610 ++++ bsd/sys/systm.h | 184 +- bsd/sys/table.h | 121 - bsd/sys/termios.h | 164 +- bsd/sys/time.h | 204 +- bsd/sys/timeb.h | 27 +- bsd/sys/times.h | 26 +- bsd/sys/tprintf.h | 11 +- bsd/sys/trace.h | 2 +- bsd/sys/tty.h | 146 +- bsd/sys/ttycom.h | 6 + bsd/sys/ttydefaults.h | 2 +- bsd/sys/types.h | 318 +- bsd/sys/ubc.h | 159 +- bsd/sys/ubc_internal.h | 154 + bsd/sys/ucontext.h | 65 +- bsd/sys/ucred.h | 55 +- bsd/sys/uio.h | 215 +- bsd/sys/uio_internal.h | 445 ++ bsd/sys/un.h | 43 +- bsd/sys/unistd.h | 72 +- bsd/sys/unpcb.h | 87 +- bsd/sys/user.h | 96 +- bsd/sys/utfconv.h | 13 +- bsd/sys/utsname.h | 16 +- bsd/sys/ux_exception.h | 4 + bsd/sys/version.h | 2 +- bsd/sys/vfs_context.h | 14 + bsd/sys/vm.h | 57 +- bsd/sys/vmmeter.h | 3 - bsd/sys/vnioctl.h | 60 +- bsd/sys/vnode.h | 788 +- bsd/sys/vnode_if.h | 1799 ++--- bsd/sys/vnode_internal.h | 370 + bsd/sys/vstat.h | 6 +- bsd/sys/wait.h | 138 +- bsd/sys/xattr.h | 74 + bsd/ufs/ffs/ffs_alloc.c | 238 +- bsd/ufs/ffs/ffs_balloc.c | 211 +- bsd/ufs/ffs/ffs_extern.h | 86 +- bsd/ufs/ffs/ffs_inode.c | 202 +- bsd/ufs/ffs/ffs_subr.c | 88 +- bsd/ufs/ffs/ffs_vfsops.c | 1067 +-- bsd/ufs/ffs/ffs_vnops.c | 393 +- bsd/ufs/ufs/Makefile | 2 +- bsd/ufs/ufs/inode.h | 12 +- bsd/ufs/ufs/lockf.h | 113 - bsd/ufs/ufs/quota.h | 26 +- bsd/ufs/ufs/ufs_attrlist.c | 666 +- bsd/ufs/ufs/ufs_bmap.c | 466 +- bsd/ufs/ufs/ufs_byte_order.c | 6 +- bsd/ufs/ufs/ufs_byte_order.h | 34 +- bsd/ufs/ufs/ufs_extern.h | 145 +- bsd/ufs/ufs/ufs_ihash.c | 45 +- bsd/ufs/ufs/ufs_inode.c | 32 +- bsd/ufs/ufs/ufs_lookup.c | 290 +- bsd/ufs/ufs/ufs_quota.c | 560 +- bsd/ufs/ufs/ufs_readwrite.c | 228 +- bsd/ufs/ufs/ufs_vfsops.c | 94 +- bsd/ufs/ufs/ufs_vnops.c | 1729 ++--- bsd/ufs/ufs/ufsmount.h | 4 - bsd/uuid/Makefile | 60 + bsd/uuid/uuid.h | 74 + bsd/uxkern/ux_exception.c | 99 +- bsd/vfs/kpi_vfs.c | 4626 ++++++++++++ bsd/vfs/vfs_attrlist.c | 1632 ++++ bsd/vfs/vfs_bio.c | 3710 ++++++--- bsd/vfs/vfs_cache.c | 1223 ++- bsd/vfs/vfs_cluster.c | 2661 ++++--- bsd/vfs/vfs_conf.c | 38 +- bsd/vfs/vfs_fsevents.c | 1402 ++++ bsd/vfs/vfs_init.c | 258 +- bsd/vfs/vfs_journal.c | 624 +- bsd/vfs/vfs_journal.h | 24 +- bsd/vfs/vfs_lookup.c | 734 +- bsd/vfs/vfs_quota.c | 849 ++- bsd/vfs/vfs_subr.c | 6727 +++++++++++------ bsd/vfs/vfs_support.c | 861 +-- bsd/vfs/vfs_support.h | 224 +- bsd/vfs/vfs_syscalls.c | 6379 +++++++++------- bsd/vfs/vfs_utfconv.c | 85 + bsd/vfs/vfs_vnops.c | 1042 ++- bsd/vfs/vfs_xattr.c | 2007 +++++ bsd/vfs/vnode_if.c | 998 +-- bsd/vfs/vnode_if.sh | 24 +- bsd/vfs/vnode_if.src | 698 +- bsd/vm/Makefile | 3 +- bsd/vm/dp_backing_file.c | 204 +- bsd/vm/vm_pager.h | 10 +- bsd/vm/vm_unix.c | 1263 +++- bsd/vm/vnode_pager.c | 268 +- bsd/vm/vnode_pager.h | 22 +- config/BSDKernel.exports | 4022 ++-------- config/BSDKernel.ppc.exports | 489 -- config/IOKit.exports | 379 +- config/IOKit.ppc.exports | 184 - config/IPFirewall.kext/Info.plist | 26 + config/Libkern.exports | 97 +- config/Libkern.ppc.exports | 2 + config/Mach.exports | 2042 +---- config/Mach.ppc.exports | 582 -- config/Makefile | 36 +- config/MasterVersion | 19 + config/System.kext/Info.plist | 8 +- .../PlugIns/AppleNMI.kext/Info.plist | 6 +- .../ApplePlatformFamily.kext/Info.plist | 6 +- 
.../PlugIns/BSDKernel.kext/Info.plist | 8 +- .../PlugIns/BSDKernel6.0.kext/Info.plist | 6 +- .../System.kext/PlugIns/IOKit.kext/Info.plist | 6 +- .../PlugIns/IOKit6.0.kext/Info.plist | 6 +- .../PlugIns/IONVRAMFamily.kext/Info.plist | 6 +- .../IOSystemManagement.kext/Info.plist | 6 +- .../PlugIns/Libkern.kext/Info.plist | 8 +- .../PlugIns/Libkern6.0.kext/Info.plist | 6 +- .../System.kext/PlugIns/Mach.kext/Info.plist | 8 +- .../PlugIns/Mach6.0.kext/Info.plist | 6 +- .../PlugIns/System6.0.kext/Info.plist | 8 +- .../PlugIns/Unsupported.kext/Info.plist | 32 + config/System6.0.exports | 5902 +-------------- config/System6.0.i386.exports | 108 +- config/System6.0.ppc.exports | 1047 --- config/Unsupported.exports | 268 + config/Unsupported.i386.exports | 8 + config/Unsupported.ppc.exports | 25 + config/newvers.pl | 110 + config/version.c | 42 + .../drvAppleIntelClock/AppleIntelClock.h | 2 - .../drvAppleIntelClock/IntelClock.cpp | 11 +- .../Drivers/platform/drvAppleNMI/AppleNMI.cpp | 22 +- .../drvApplePlatformExpert/AppleCPU.cpp | 22 +- iokit/IOKit/IOBufferMemoryDescriptor.h | 18 +- iokit/IOKit/IOCPU.h | 3 + iokit/IOKit/IOCatalogue.h | 2 + iokit/IOKit/IOCommand.h | 7 +- iokit/IOKit/IOCommandPool.h | 39 +- iokit/IOKit/IODeviceMemory.h | 19 +- iokit/IOKit/IODeviceTreeSupport.h | 5 + iokit/IOKit/IOEventSource.h | 2 +- iokit/IOKit/IOFilterInterruptEventSource.h | 6 +- iokit/IOKit/IOInterruptEventSource.h | 2 +- iokit/IOKit/IOKitDebug.h | 3 +- iokit/IOKit/IOKitKeys.h | 9 + iokit/IOKit/IOKitKeysPrivate.h | 1 + iokit/IOKit/IOLib.h | 8 +- iokit/IOKit/IOLocks.h | 234 +- iokit/IOKit/IOMemoryCursor.h | 187 +- iokit/IOKit/IOMemoryDescriptor.h | 130 +- iokit/IOKit/IOMessage.h | 11 +- iokit/IOKit/IONVRAM.h | 10 +- iokit/IOKit/IOPolledInterface.h | 93 + iokit/IOKit/IORangeAllocator.h | 40 +- iokit/IOKit/IORegistryEntry.h | 55 +- iokit/IOKit/IOReturn.h | 25 +- iokit/IOKit/IOService.h | 14 +- iokit/IOKit/IOServicePM.h | 240 +- iokit/IOKit/IOTimeStamp.h | 6 +- iokit/IOKit/IOTimerEventSource.h | 13 +- iokit/IOKit/IOTypes.h | 13 +- iokit/IOKit/IOUserClient.h | 17 +- iokit/IOKit/IOWorkLoop.h | 132 +- iokit/IOKit/Makefile | 6 +- iokit/IOKit/OSMessageNotification.h | 9 + iokit/IOKit/i386/IOSharedLockImp.h | 13 - iokit/IOKit/i386/Makefile | 2 +- iokit/IOKit/ppc/IOSharedLockImp.h | 37 - iokit/IOKit/ppc/Makefile | 2 +- iokit/IOKit/pwr_mgt/IOPM.h | 6 +- iokit/IOKit/pwr_mgt/IOPMlog.h | 149 +- iokit/IOKit/pwr_mgt/Makefile | 9 +- iokit/IOKit/pwr_mgt/RootDomain.h | 8 +- iokit/IOKit/system.h | 64 +- iokit/Kernel/IOBufferMemoryDescriptor.cpp | 191 +- iokit/Kernel/IOCPU.cpp | 83 +- iokit/Kernel/IOCatalogue.cpp | 77 +- iokit/Kernel/IOCommand.cpp | 2 +- iokit/Kernel/IOCommandQueue.cpp | 4 +- iokit/Kernel/IODeviceTreeSupport.cpp | 181 +- iokit/Kernel/IOInterruptController.cpp | 143 +- iokit/Kernel/IOInterruptEventSource.cpp | 8 +- iokit/Kernel/IOKitDebug.cpp | 19 +- iokit/Kernel/IOKitKernelInternal.h | 56 + iokit/Kernel/IOLib.c | 119 +- iokit/Kernel/IOLocks.cpp | 87 +- iokit/Kernel/IOMapper.cpp | 8 +- iokit/Kernel/IOMemoryCursor.cpp | 4 +- iokit/Kernel/IOMemoryDescriptor.cpp | 1410 ++-- iokit/Kernel/IONVRAM.cpp | 364 +- iokit/Kernel/IOPMPagingPlexus.cpp | 235 - iokit/Kernel/IOPMchangeNoteList.cpp | 4 +- iokit/Kernel/IOPMrootDomain.cpp | 153 +- iokit/Kernel/IOPlatformExpert.cpp | 57 +- iokit/Kernel/IORegistryEntry.cpp | 182 +- iokit/Kernel/IOService.cpp | 300 +- iokit/Kernel/IOServicePM.cpp | 321 +- iokit/Kernel/IOServicePrivate.h | 9 +- iokit/Kernel/IOStartIOKit.cpp | 17 +- iokit/Kernel/IOTimerEventSource.cpp | 122 +- 
iokit/Kernel/IOUserClient.cpp | 159 +- iokit/Kernel/IOWorkLoop.cpp | 18 +- iokit/Kernel/RootDomainUserClient.cpp | 70 +- iokit/Kernel/RootDomainUserClient.h | 6 + iokit/KernelConfigTables.cpp | 30 +- iokit/bsddev/IOKitBSDInit.cpp | 137 +- iokit/conf/Makefile.i386 | 27 + iokit/conf/Makefile.template | 19 +- iokit/conf/files | 2 +- iokit/conf/tools/Makefile | 8 +- iokit/conf/tools/newvers/Makefile | 49 - iokit/conf/tools/newvers/newvers.csh | 34 - iokit/conf/version.major | 1 - iokit/conf/version.minor | 1 - iokit/conf/version.variant | 1 - iokit/include/mach/mach.h | 2 +- iokit/mach-o/mach_header.h | 21 +- kgmacros | 680 +- libkern/Makefile | 16 +- libkern/c++/OSArray.cpp | 98 +- libkern/c++/OSCollection.cpp | 49 +- libkern/c++/OSDictionary.cpp | 86 +- libkern/c++/OSMetaClass.cpp | 32 +- libkern/c++/OSNumber.cpp | 15 +- libkern/c++/OSObjectAsm.s | 2 +- libkern/c++/OSOrderedSet.cpp | 88 +- libkern/c++/OSRuntime.cpp | 16 +- libkern/c++/OSSet.cpp | 77 +- libkern/c++/OSUnserialize.cpp | 2 +- libkern/c++/OSUnserializeXML.cpp | 10 +- libkern/c++/OSUnserializeXML.y | 10 +- .../test2/test2.pbproj/project.pbxproj | 30 +- .../TestSerialization/test2/test2_main.cpp | 124 +- libkern/conf/Makefile.i386 | 28 + libkern/conf/Makefile.template | 14 +- libkern/conf/files | 4 + libkern/conf/tools/Makefile | 8 +- libkern/conf/tools/newvers/Makefile | 49 - libkern/conf/tools/newvers/newvers.csh | 34 - libkern/conf/version.major | 1 - libkern/conf/version.minor | 1 - libkern/conf/version.variant | 1 - libkern/gen/OSAtomicOperations.c | 67 +- libkern/gen/OSDebug.cpp | 170 + libkern/i386/OSAtomic.s | 23 +- libkern/libkern/Makefile | 17 +- libkern/libkern/OSAtomic.h | 36 +- libkern/libkern/OSBase.h | 28 +- libkern/libkern/OSByteOrder.h | 20 +- .../trap.h => libkern/libkern/OSDebug.h | 26 +- libkern/libkern/OSMalloc.h | 73 + libkern/libkern/c++/OSArray.h | 24 +- libkern/libkern/c++/OSBoolean.h | 11 +- libkern/libkern/c++/OSCollection.h | 103 +- libkern/libkern/c++/OSDictionary.h | 18 + libkern/libkern/c++/OSLib.h | 2 + libkern/libkern/c++/OSMetaClass.h | 65 +- libkern/libkern/c++/OSNumber.h | 1 + libkern/libkern/c++/OSOrderedSet.h | 18 + libkern/libkern/c++/OSSet.h | 17 + libkern/libkern/i386/OSByteOrder.h | 20 +- bsd/i386/user.h => libkern/libkern/locks.h | 27 +- libkern/libkern/ppc/OSByteOrder.h | 18 +- libkern/libkern/sysctl.h | 108 + libkern/libkern/version.h.template | 94 + libkern/mach-o/loader.h | 107 +- libkern/mach-o/mach_header.h | 25 +- libkern/ppc/OSAtomic.s | 10 +- libkern/stdio/scanf.c | 660 ++ libkern/uuid/Makefile | 37 + libkern/uuid/uuid.c | 200 + libsa/catalogue.cpp | 50 +- libsa/conf/Makefile.i386 | 27 + libsa/conf/Makefile.template | 14 +- libsa/conf/tools/Makefile | 8 +- libsa/conf/tools/newvers/Makefile | 49 - libsa/conf/tools/newvers/newvers.csh | 34 - libsa/conf/version.major | 1 - libsa/conf/version.minor | 1 - libsa/conf/version.variant | 1 - libsa/dgraph.c | 32 +- libsa/dgraph.h | 2 + libsa/kext.cpp | 31 +- libsa/kld_patch.c | 62 +- libsa/kld_patch.h | 10 +- libsa/kmod.cpp | 2 +- libsa/libsa/Makefile | 7 +- libsa/libsa/i386/Makefile | 2 + libsa/libsa/mach/Makefile | 2 + libsa/libsa/ppc/Makefile | 2 + libsa/libsa/stdlib.h | 7 +- libsa/load.c | 128 +- libsa/mach_loader.h | 3 + libsa/malloc.c | 22 +- libsa/mkext.c | 52 +- libsa/ppc/setjmp.s | 8 +- makedefs/MakeInc.def | 84 +- makedefs/MakeInc.dir | 53 +- makedefs/MakeInc.rule | 514 +- osfmk/Makefile | 25 +- .../UserNotification/KUNCUserNotifications.c | 36 +- .../UserNotification/KUNCUserNotifications.h | 48 +- 
osfmk/UserNotification/UNDReply.defs | 2 +- osfmk/UserNotification/UNDRequest.defs | 6 +- osfmk/UserNotification/UNDTypes.h | 5 +- osfmk/conf/MASTER | 32 +- osfmk/conf/MASTER.i386 | 3 +- osfmk/conf/MASTER.ppc | 13 +- osfmk/conf/Makefile | 234 + osfmk/conf/Makefile.i386 | 36 + osfmk/conf/Makefile.ppc | 7 +- osfmk/conf/Makefile.template | 20 +- osfmk/conf/files | 26 +- osfmk/conf/files.i386 | 25 +- osfmk/conf/files.ppc | 9 +- osfmk/conf/kernelversion.major | 1 - osfmk/conf/kernelversion.minor | 1 - osfmk/conf/kernelversion.variant | 1 - osfmk/conf/tools/Makefile | 12 +- osfmk/conf/tools/kernel_newvers/Makefile | 49 - .../tools/kernel_newvers/kernel_newvers.csh | 39 - osfmk/conf/tools/newvers/Makefile | 49 - osfmk/conf/tools/newvers/newvers.csh | 33 - osfmk/conf/version.major | 1 - osfmk/conf/version.minor | 1 - osfmk/conf/version.variant | 1 - osfmk/console/i386/serial_console.c | 188 +- osfmk/console/i386/text_console.c | 60 +- osfmk/console/i386/video_scroll.c | 9 +- osfmk/console/panic_dialog.c | 1171 +-- osfmk/console/panic_image.c | 3713 +++++---- osfmk/console/panic_ui/README | 65 + osfmk/console/panic_ui/appleclut8.h | 51 + .../panic_ui/generated_files/panic_image.c | 1953 +++++ .../generated_files/rendered_numbers.c | 376 + osfmk/console/panic_ui/genimage.c | 1621 ++++ .../console/panic_ui/images/panic_dialog.tiff | Bin 0 -> 136036 bytes .../panic_ui/images/panic_dialogWHD.raw | Bin 0 -> 120366 bytes .../panic_ui/images/rendered_numbers.tiff | Bin 0 -> 3218 bytes .../panic_ui/images/rendered_numbersWHD.raw | Bin 0 -> 1425 bytes osfmk/console/panic_ui/qtif2kraw.c | 892 +++ osfmk/console/panic_ui/setupdialog.c | 359 + osfmk/console/panic_ui/systemCLUT.act | Bin 0 -> 768 bytes osfmk/console/ppc/serial_console.c | 134 +- osfmk/console/video_console.c | 187 +- osfmk/ddb/Makefile | 2 - osfmk/ddb/db_aout.c | 6 +- osfmk/ddb/db_break.c | 50 +- osfmk/ddb/db_break.h | 2 +- osfmk/ddb/db_command.c | 25 +- osfmk/ddb/db_command.h | 85 +- osfmk/ddb/db_examine.c | 80 +- osfmk/ddb/db_expr.c | 6 +- osfmk/ddb/db_ext_symtab.c | 14 +- osfmk/ddb/db_macro.c | 6 +- osfmk/ddb/db_output.c | 8 +- osfmk/ddb/db_output.h | 73 +- osfmk/ddb/db_print.c | 184 +- osfmk/ddb/db_sym.c | 76 +- osfmk/ddb/db_sym.h | 144 +- osfmk/ddb/db_task_thread.c | 46 +- osfmk/ddb/db_task_thread.h | 16 +- osfmk/ddb/db_trap.c | 10 +- osfmk/ddb/db_variables.c | 20 +- osfmk/ddb/db_variables.h | 2 +- osfmk/ddb/db_watch.c | 90 +- osfmk/ddb/db_write_cmd.c | 88 +- osfmk/ddb/tr.c | 2 +- osfmk/default_pager/Makefile | 1 + osfmk/default_pager/default_pager.c | 58 +- osfmk/default_pager/default_pager_internal.h | 48 +- osfmk/default_pager/default_pager_object.defs | 4 + osfmk/default_pager/default_pager_types.defs | 87 +- osfmk/default_pager/default_pager_types.h | 26 +- osfmk/default_pager/diag.h | 4 +- osfmk/default_pager/dp_backing_store.c | 460 +- osfmk/default_pager/dp_memory_object.c | 354 +- osfmk/device/device.defs | 11 + osfmk/device/device_init.c | 13 +- osfmk/device/device_port.h | 2 +- osfmk/device/device_types.h | 6 +- osfmk/device/iokit_rpc.c | 17 +- osfmk/device/subrs.c | 43 + osfmk/i386/AT386/asm_startup.h | 270 - osfmk/i386/AT386/bbclock.c | 47 +- osfmk/i386/AT386/bbclock_entries.h | 1 + osfmk/i386/AT386/conf.c | 6 +- osfmk/i386/AT386/himem.c | 14 +- osfmk/i386/AT386/machdep.mk | 3 +- osfmk/i386/AT386/model_dep.c | 96 +- osfmk/i386/Makefile | 14 +- osfmk/i386/acpi.c | 109 + bsd/machine/proc.h => osfmk/i386/acpi.h | 28 +- osfmk/i386/acpi_wakeup.s | 381 + osfmk/i386/apic.h | 38 +- osfmk/i386/asm.h | 11 +- osfmk/i386/ast_check.c | 1 - 
osfmk/i386/bcopy.s | 51 + osfmk/i386/bsd_i386.c | 442 +- osfmk/i386/commpage/atomic.s | 149 + osfmk/i386/commpage/commpage.c | 226 +- osfmk/i386/commpage/commpage.h | 9 + .../commpage/commpage_mach_absolute_time.s | 87 +- osfmk/i386/commpage/commpage_sigs.h | 57 - osfmk/i386/commpage/commpage_sigs.s | 69 - osfmk/i386/commpage/spinlocks.s | 18 +- osfmk/i386/cpu.c | 194 +- osfmk/i386/cpu_capabilities.h | 130 +- osfmk/i386/cpu_data.h | 221 +- osfmk/i386/cpu_number.h | 11 +- osfmk/i386/cpu_threads.c | 110 + osfmk/i386/cpu_threads.h | 56 + osfmk/i386/cpuid.c | 504 +- osfmk/i386/cpuid.h | 150 +- osfmk/i386/cswitch.s | 59 +- osfmk/i386/db_interface.c | 125 +- osfmk/i386/db_machdep.h | 2 +- osfmk/i386/db_trace.c | 110 +- osfmk/i386/endian.h | 20 +- osfmk/i386/fpu.c | 249 +- osfmk/i386/fpu.h | 27 +- osfmk/i386/gdt.c | 24 +- osfmk/i386/genassym.c | 175 +- osfmk/i386/hardclock.c | 235 - osfmk/i386/hardclock_entries.h | 50 - osfmk/i386/hw_lock_types.h | 9 +- osfmk/i386/i386_init.c | 66 +- osfmk/i386/i386_lock.s | 902 +-- osfmk/i386/i386_vm_init.c | 328 +- osfmk/i386/io_emulate.c | 20 +- osfmk/i386/io_map.c | 7 +- osfmk/i386/io_map_entries.h | 9 +- osfmk/i386/iopb.c | 14 +- osfmk/i386/iopb_entries.h | 2 +- osfmk/i386/ipl.h | 10 - osfmk/i386/ldt.c | 22 +- osfmk/i386/lock.h | 170 +- osfmk/i386/locks.h | 144 + osfmk/i386/locks_i386.c | 1870 +++++ osfmk/i386/locore.s | 658 +- osfmk/i386/loose_ends.c | 719 +- osfmk/i386/mach_param.h | 58 - osfmk/i386/machdep_call.c | 63 +- osfmk/i386/machdep_call.h | 20 +- osfmk/i386/machine_cpu.h | 26 +- osfmk/i386/machine_routines.c | 207 +- osfmk/i386/machine_routines.h | 25 +- osfmk/i386/machine_routines_asm.s | 250 - osfmk/i386/machparam.h | 8 - osfmk/i386/mcount.s | 20 +- osfmk/i386/misc_protos.h | 38 +- osfmk/i386/mp.c | 684 +- osfmk/i386/mp.h | 182 +- osfmk/i386/mp_desc.c | 259 +- osfmk/i386/mp_desc.h | 104 +- osfmk/i386/mp_events.h | 15 +- osfmk/i386/mp_slave_boot.h | 2 +- osfmk/i386/mp_slave_boot.s | 35 +- osfmk/i386/mtrr.c | 643 ++ bsd/ppc/cpu.h => osfmk/i386/mtrr.h | 48 +- osfmk/i386/pcb.c | 407 +- osfmk/i386/perfmon.c | 552 ++ osfmk/i386/perfmon.h | 305 + osfmk/i386/phys.c | 124 +- osfmk/i386/pio.h | 1 - osfmk/i386/pit.h | 2 +- osfmk/i386/pmap.c | 1606 ++-- osfmk/i386/pmap.h | 311 +- osfmk/i386/postcode.h | 160 + osfmk/i386/proc_reg.h | 103 +- osfmk/i386/read_fault.c | 45 +- osfmk/i386/rtclock.c | 1379 ++-- osfmk/i386/rtclock_entries.h | 9 +- osfmk/i386/seg.h | 105 +- osfmk/{kern/time_out.h => i386/simple_lock.h} | 77 +- osfmk/i386/start.s | 495 +- osfmk/i386/thread.h | 173 +- osfmk/i386/thread_act.h | 185 - osfmk/i386/trap.c | 189 +- osfmk/i386/trap.h | 11 +- osfmk/i386/user_ldt.c | 48 +- osfmk/i386/user_ldt.h | 2 +- osfmk/i386/xpr.h | 9 - osfmk/ipc/ipc_entry.c | 10 +- osfmk/ipc/ipc_entry.h | 29 +- osfmk/ipc/ipc_hash.c | 41 +- osfmk/ipc/ipc_hash.h | 9 +- osfmk/ipc/ipc_init.c | 33 +- osfmk/ipc/ipc_kmsg.c | 1201 ++- osfmk/ipc/ipc_kmsg.h | 85 +- osfmk/ipc/ipc_mqueue.c | 152 +- osfmk/ipc/ipc_mqueue.h | 42 +- osfmk/ipc/ipc_object.c | 31 +- osfmk/ipc/ipc_object.h | 32 +- osfmk/ipc/ipc_port.c | 103 +- osfmk/ipc/ipc_port.h | 40 +- osfmk/ipc/ipc_print.h | 65 +- osfmk/ipc/ipc_pset.c | 9 +- osfmk/ipc/ipc_pset.h | 16 +- osfmk/ipc/ipc_right.c | 34 +- osfmk/ipc/ipc_space.c | 5 +- osfmk/ipc/ipc_space.h | 13 +- osfmk/ipc/ipc_splay.c | 58 +- osfmk/ipc/ipc_table.c | 93 +- osfmk/ipc/ipc_table.h | 103 +- osfmk/ipc/ipc_types.h | 50 +- osfmk/ipc/mach_debug.c | 137 +- osfmk/ipc/mach_msg.c | 349 +- osfmk/ipc/mach_port.c | 118 +- osfmk/ipc/port.h | 7 +- osfmk/kdp/kdp.c | 9 +- 
osfmk/kdp/kdp_core.h | 28 +- osfmk/kdp/kdp_internal.h | 26 +- osfmk/kdp/kdp_udp.c | 40 +- osfmk/kdp/ml/i386/kdp_machdep.c | 95 +- osfmk/kdp/ml/i386/kdp_vm.c | 4 +- osfmk/kdp/ml/ppc/kdp_machdep.c | 22 +- osfmk/kdp/ml/ppc/kdp_vm.c | 102 +- osfmk/kern/Makefile | 7 +- osfmk/kern/assert.h | 9 +- osfmk/kern/ast.c | 87 +- osfmk/kern/ast.h | 65 +- osfmk/kern/bsd_kern.c | 275 +- osfmk/kern/clock.c | 213 +- osfmk/kern/clock.h | 176 +- osfmk/kern/counters.c | 71 +- osfmk/kern/counters.h | 85 +- osfmk/kern/cpu_data.c | 33 - osfmk/kern/cpu_data.h | 35 +- osfmk/kern/cpu_number.h | 10 +- osfmk/kern/debug.c | 29 +- osfmk/kern/debug.h | 22 +- osfmk/kern/etap.c | 1866 ----- osfmk/kern/etap_macros.h | 456 -- osfmk/kern/etap_map.c | 174 - osfmk/kern/etap_map.h | 84 - osfmk/kern/etap_options.h | 102 - osfmk/kern/etap_pool.c | 224 - osfmk/kern/etap_pool.h | 107 - osfmk/kern/exception.c | 88 +- osfmk/kern/exception.h | 10 +- osfmk/kern/host.c | 425 +- osfmk/kern/host.h | 23 +- osfmk/kern/host_notify.c | 12 +- osfmk/kern/host_statistics.h | 15 +- osfmk/kern/ipc_host.c | 19 +- osfmk/kern/ipc_kobject.c | 196 +- osfmk/kern/ipc_kobject.h | 7 +- osfmk/kern/ipc_mig.c | 124 +- osfmk/kern/ipc_mig.h | 57 +- osfmk/kern/ipc_tt.c | 1042 ++- osfmk/kern/ipc_tt.h | 62 +- osfmk/kern/kalloc.c | 381 +- osfmk/kern/kalloc.h | 52 +- osfmk/kern/kern_types.h | 58 +- osfmk/kern/kmod.c | 208 +- osfmk/kern/ledger.c | 52 +- osfmk/kern/ledger.h | 26 +- osfmk/kern/lock.c | 2384 ------ osfmk/kern/lock.h | 303 +- osfmk/kern/lock_mon.c | 415 - osfmk/kern/locks.c | 1055 +++ osfmk/kern/locks.h | 409 + osfmk/kern/mach_clock.c | 91 +- osfmk/kern/mach_param.h | 12 +- osfmk/kern/machine.c | 217 +- osfmk/kern/machine.h | 85 +- osfmk/kern/misc_protos.h | 88 +- osfmk/kern/mk_sp.c | 687 +- osfmk/kern/mk_sp.h | 67 - osfmk/kern/mk_timer.c | 43 +- osfmk/kern/mk_timer.h | 1 - osfmk/kern/norma_protos.h | 74 +- osfmk/kern/printf.c | 28 +- osfmk/kern/priority.c | 230 +- osfmk/kern/processor.c | 478 +- osfmk/kern/processor.h | 97 +- .../kern/processor_data.c | 27 +- osfmk/kern/processor_data.h | 80 + osfmk/kern/profile.c | 76 +- osfmk/kern/profile.h | 25 +- osfmk/kern/queue.h | 105 +- osfmk/kern/sched.h | 101 +- osfmk/kern/{mach_factor.c => sched_average.c} | 102 +- osfmk/kern/sched_prim.c | 1387 ++-- osfmk/kern/sched_prim.h | 283 +- osfmk/kern/simple_lock.h | 328 +- osfmk/kern/simple_lock_types.h | 285 - osfmk/kern/spl.h | 4 +- osfmk/kern/sscanf.c | 93 - osfmk/kern/stack.c | 470 ++ osfmk/kern/startup.c | 233 +- osfmk/kern/startup.h | 53 +- osfmk/kern/sync_lock.c | 121 +- osfmk/kern/sync_lock.h | 62 +- osfmk/kern/sync_sema.c | 360 +- osfmk/kern/syscall_emulation.c | 112 +- osfmk/kern/syscall_emulation.h | 96 - osfmk/kern/syscall_subr.c | 284 +- osfmk/kern/syscall_subr.h | 24 +- osfmk/kern/syscall_sw.c | 282 +- osfmk/kern/syscall_sw.h | 30 +- osfmk/kern/task.c | 1009 ++- osfmk/kern/task.h | 175 +- osfmk/kern/task_policy.c | 26 +- osfmk/kern/task_swap.c | 5 +- osfmk/kern/thread.c | 1445 ++-- osfmk/kern/thread.h | 454 +- osfmk/kern/thread_act.c | 1374 +--- osfmk/kern/thread_act.h | 53 - osfmk/kern/thread_call.c | 70 +- osfmk/kern/thread_call.h | 105 +- osfmk/kern/thread_policy.c | 68 +- osfmk/kern/thread_swap.c | 195 - osfmk/kern/thread_swap.h | 53 - osfmk/kern/timer.c | 551 +- osfmk/kern/timer.h | 189 +- osfmk/kern/timer_call.c | 33 +- osfmk/kern/wait_queue.c | 129 +- osfmk/kern/wait_queue.h | 80 +- osfmk/kern/xpr.c | 192 +- osfmk/kern/xpr.h | 118 +- osfmk/kern/zalloc.c | 210 +- osfmk/kern/zalloc.h | 166 +- osfmk/libsa/string.h | 46 +- osfmk/libsa/types.h | 2 + 
osfmk/mach-o/loader.h | 4 + osfmk/mach-o/mach_header.c | 147 +- osfmk/mach-o/mach_header.h | 23 +- osfmk/mach/AT386/machdep.mk | 35 - osfmk/mach/Makefile | 26 +- osfmk/mach/boolean.h | 73 +- osfmk/mach/boot_info.h | 253 - osfmk/mach/clock_types.defs | 4 +- osfmk/mach/clock_types.h | 19 +- osfmk/mach/error.h | 92 +- osfmk/mach/etap.h | 276 - osfmk/mach/etap_events.h | 347 - osfmk/mach/events_info.h | 57 +- osfmk/mach/exception.h | 9 - osfmk/mach/exception_types.h | 3 +- osfmk/mach/host_info.h | 83 +- osfmk/mach/host_notify.h | 8 - osfmk/mach/host_notify_reply.defs | 8 - osfmk/mach/host_priv.defs | 22 +- osfmk/mach/host_reboot.h | 28 +- osfmk/mach/host_special_ports.h | 1 - osfmk/mach/i386/Makefile | 30 +- osfmk/mach/i386/boolean.h | 48 - osfmk/mach/i386/exception.h | 75 - osfmk/mach/i386/flipc_dep.h | 39 - osfmk/mach/i386/fp_reg.h | 51 +- osfmk/mach/i386/kern_return.h | 52 +- osfmk/mach/i386/mach_i386.defs | 155 - osfmk/mach/i386/mach_i386_types.h | 111 - osfmk/mach/i386/machine_types.defs | 21 +- osfmk/mach/i386/ndr_def.h | 30 - osfmk/mach/i386/processor_info.h | 1 - osfmk/mach/i386/rpc.h | 8 +- osfmk/mach/i386/syscall_sw.h | 108 +- osfmk/mach/i386/thread_state.h | 24 +- osfmk/mach/i386/thread_status.h | 50 +- osfmk/mach/i386/vm_param.h | 97 +- osfmk/mach/i386/vm_types.h | 91 +- osfmk/mach/kern_return.h | 5 - osfmk/mach/kmod.h | 70 +- osfmk/mach/mach_host.defs | 18 +- osfmk/mach/mach_interface.h | 31 +- osfmk/mach/mach_param.h | 76 - osfmk/mach/mach_port.defs | 8 +- osfmk/mach/mach_syscalls.h | 20 +- osfmk/mach/mach_time.h | 26 +- osfmk/mach/mach_traps.h | 525 +- osfmk/mach/mach_types.defs | 84 +- osfmk/mach/mach_types.h | 39 +- osfmk/mach/mach_vm.defs | 441 ++ osfmk/mach/machine.h | 97 +- osfmk/mach/machine/asm.h | 7 +- osfmk/mach/machine/boolean.h | 7 +- osfmk/mach/machine/exception.h | 7 +- osfmk/mach/machine/kern_return.h | 7 +- osfmk/mach/machine/machine_types.defs | 7 +- osfmk/mach/machine/ndr_def.h | 7 +- osfmk/mach/machine/processor_info.h | 7 +- osfmk/mach/machine/rpc.h | 9 +- osfmk/mach/machine/syscall_sw.h | 11 +- osfmk/mach/machine/thread_state.h | 7 +- osfmk/mach/machine/thread_status.h | 7 +- osfmk/mach/machine/vm_param.h | 7 +- osfmk/mach/machine/vm_types.h | 7 +- osfmk/mach/memory_object.defs | 19 +- osfmk/mach/memory_object.h | 29 +- osfmk/mach/memory_object_control.defs | 2 + osfmk/mach/memory_object_types.h | 380 +- osfmk/mach/message.h | 111 +- osfmk/mach/mig.h | 90 +- osfmk/mach/mig_errors.h | 12 +- osfmk/mach/mk_timer.h | 9 +- osfmk/mach/ndr.h | 7 +- osfmk/mach/notify.h | 4 - osfmk/mach/policy.h | 53 +- osfmk/mach/port.h | 208 +- osfmk/mach/ppc/Makefile | 2 +- osfmk/mach/ppc/_types.h | 226 + osfmk/mach/ppc/boolean.h | 73 +- osfmk/mach/ppc/exception.h | 3 - osfmk/mach/ppc/kern_return.h | 71 +- osfmk/mach/ppc/machine_types.defs | 51 +- osfmk/mach/ppc/ndr_def.h | 27 - osfmk/mach/ppc/processor_info.h | 30 +- osfmk/mach/ppc/rpc.h | 17 +- osfmk/mach/ppc/syscall_sw.h | 22 +- osfmk/mach/ppc/thread_state.h | 5 +- osfmk/mach/ppc/thread_status.h | 202 +- osfmk/mach/ppc/vm_param.h | 66 +- osfmk/mach/ppc/vm_types.h | 101 +- osfmk/mach/processor_info.h | 29 +- osfmk/mach/rpc.h | 6 - osfmk/mach/semaphore.h | 25 +- osfmk/mach/shared_memory_server.h | 83 +- osfmk/mach/std_types.h | 6 +- osfmk/mach/sync_policy.h | 16 +- osfmk/mach/syscall_sw.h | 33 +- osfmk/mach/task.defs | 17 +- osfmk/mach/task_info.h | 108 +- osfmk/mach/task_ledger.h | 7 +- osfmk/mach/task_policy.h | 17 +- osfmk/mach/task_special_ports.h | 8 +- osfmk/mach/thread_act.defs | 10 +- osfmk/mach/thread_info.h | 15 +- 
osfmk/mach/thread_policy.h | 29 +- osfmk/mach/thread_status.h | 6 +- osfmk/mach/thread_switch.h | 8 +- osfmk/mach/time_value.h | 32 +- osfmk/mach/upl.defs | 14 +- osfmk/mach/vm_attributes.h | 6 +- osfmk/mach/vm_inherit.h | 6 +- osfmk/mach/vm_map.defs | 42 +- osfmk/mach/vm_param.h | 109 +- osfmk/mach/vm_prot.h | 9 +- osfmk/mach/vm_purgable.h | 55 + osfmk/mach/vm_region.h | 105 +- osfmk/mach/vm_statistics.h | 90 +- osfmk/mach/vm_sync.h | 7 +- osfmk/mach/vm_types.h | 56 +- osfmk/mach_debug/Makefile | 2 +- osfmk/mach_debug/hash_info.h | 48 +- osfmk/mach_debug/ipc_info.h | 62 +- osfmk/mach_debug/lockgroup_info.h | 69 + osfmk/mach_debug/mach_debug_types.defs | 110 +- osfmk/mach_debug/mach_debug_types.h | 1 + osfmk/mach_debug/page_info.h | 44 +- osfmk/mach_debug/vm_info.h | 55 +- osfmk/mach_debug/zone_info.h | 66 +- osfmk/machine/Makefile | 27 +- osfmk/machine/cpu_capabilities.h | 16 +- osfmk/machine/cpu_number.h | 4 + osfmk/machine/disk.h | 35 - osfmk/machine/gdb_defs.h | 35 - osfmk/machine/hw_lock_types.h | 35 - osfmk/machine/io_map_entries.h | 4 + osfmk/machine/iobus.h | 35 - osfmk/machine/kgdb_defs.h | 35 - osfmk/machine/kgdb_setjmp.h | 35 - osfmk/machine/lock.h | 4 + osfmk/machine/{spl.h => locks.h} | 12 +- osfmk/machine/mach_param.h | 35 - .../label_t.h => osfmk/machine/machine_cpu.h | 12 +- osfmk/machine/machine_rpc.h | 2 +- .../cpu.h => osfmk/machine/simple_lock.h | 13 +- osfmk/machine/thread_act.h | 35 - osfmk/man/host_basic_info.html | 2 +- osfmk/ppc/AltiAssist.s | 1 - osfmk/ppc/Diagnostics.c | 169 +- osfmk/ppc/Diagnostics.h | 19 +- osfmk/ppc/Emulate.s | 32 +- osfmk/ppc/Emulate64.s | 49 +- osfmk/ppc/Firmware.s | 7 +- osfmk/ppc/FirmwareC.c | 1 - osfmk/ppc/Makefile | 11 +- osfmk/ppc/PPCcalls.c | 1 - osfmk/ppc/PseudoKernel.c | 163 +- osfmk/ppc/_setjmp.s | 8 +- osfmk/ppc/aligned_data.s | 82 +- osfmk/ppc/asm.h | 14 +- osfmk/ppc/ast.h | 5 +- osfmk/ppc/atomic_switch.s | 6 +- osfmk/ppc/bcopy.s | 837 +- osfmk/ppc/bcopytest.c | 2 - osfmk/ppc/cache.s | 108 + osfmk/ppc/chud/chud_cpu.c | 982 ++- osfmk/ppc/chud/chud_cpu_asm.h | 80 +- osfmk/ppc/chud/chud_cpu_asm.s | 761 +- osfmk/ppc/chud/chud_memory.c | 5 +- osfmk/ppc/chud/chud_osfmk_callback.c | 307 +- osfmk/ppc/chud/chud_spr.h | 1 + osfmk/ppc/chud/chud_thread.c | 921 ++- osfmk/ppc/chud/chud_xnu.h | 101 +- osfmk/ppc/chud/chud_xnu_private.h | 44 + osfmk/ppc/clock.h | 52 - osfmk/ppc/commpage/atomic.s | 274 + osfmk/ppc/commpage/bcopy_64.s | 7 +- osfmk/ppc/commpage/bcopy_970.s | 42 +- osfmk/ppc/commpage/bcopy_g3.s | 3 +- osfmk/ppc/commpage/bcopy_g4.s | 3 +- osfmk/ppc/commpage/bigcopy_970.s | 605 +- osfmk/ppc/commpage/bzero_128.s | 29 +- osfmk/ppc/commpage/bzero_32.s | 3 +- osfmk/ppc/commpage/cacheflush.s | 20 +- osfmk/ppc/commpage/commpage.c | 437 +- osfmk/ppc/commpage/commpage.h | 16 +- osfmk/ppc/commpage/commpage_asm.s | 31 +- osfmk/ppc/commpage/gettimeofday.s | 83 +- osfmk/ppc/commpage/mach_absolute_time.s | 22 +- osfmk/ppc/commpage/memset_64.s | 91 + osfmk/ppc/commpage/memset_g3.s | 127 + osfmk/ppc/commpage/memset_g4.s | 126 + osfmk/ppc/commpage/memset_g5.s | 163 + osfmk/ppc/commpage/pthread.s | 43 +- osfmk/ppc/commpage/spinlocks.s | 112 +- osfmk/ppc/console_feed.c | 2 +- osfmk/ppc/cpu.c | 1368 ++-- osfmk/ppc/cpu_capabilities.h | 70 +- osfmk/ppc/cpu_data.h | 55 +- osfmk/ppc/cpu_internal.h | 92 + osfmk/ppc/cpu_number.h | 8 +- osfmk/ppc/cswtch.s | 129 +- osfmk/ppc/db_interface.c | 37 +- osfmk/ppc/db_low_trace.c | 130 +- osfmk/ppc/db_machdep.h | 2 +- osfmk/ppc/db_trace.c | 110 +- osfmk/ppc/exception.h | 106 +- osfmk/ppc/fpu_protos.h | 6 +- 
osfmk/ppc/genassym.c | 342 +- osfmk/ppc/hw_exception.s | 267 +- osfmk/ppc/hw_lock.s | 1673 ++-- osfmk/ppc/hw_lock_types.h | 46 +- osfmk/ppc/hw_perfmon.c | 159 +- osfmk/ppc/hw_perfmon.h | 2 +- osfmk/ppc/hw_vm.s | 3656 ++++++++- osfmk/ppc/interrupt.c | 51 +- osfmk/ppc/io_map.c | 8 +- osfmk/ppc/io_map_entries.h | 3 + osfmk/ppc/lock.h | 57 +- osfmk/ppc/locks.h | 206 + osfmk/ppc/locks_ppc.c | 2054 +++++ osfmk/ppc/lowglobals.h | 18 +- osfmk/ppc/lowmem_vectors.s | 549 +- osfmk/ppc/mach_param.h | 57 - osfmk/ppc/machine_cpu.h | 20 +- osfmk/ppc/machine_routines.c | 626 +- osfmk/ppc/machine_routines.h | 311 +- osfmk/ppc/machine_routines_asm.s | 275 +- osfmk/ppc/mappings.c | 441 +- osfmk/ppc/mappings.h | 305 +- osfmk/ppc/mcount.s | 7 +- osfmk/ppc/mem.h | 7 +- osfmk/ppc/misc.c | 10 +- osfmk/ppc/misc_asm.s | 46 +- osfmk/ppc/misc_protos.h | 134 +- osfmk/ppc/model_dep.c | 95 +- osfmk/ppc/movc.s | 692 +- osfmk/ppc/new_screen.h | 5 +- osfmk/ppc/pcb.c | 353 +- osfmk/ppc/pmap.c | 764 +- osfmk/ppc/pmap.h | 133 +- osfmk/ppc/ppc_init.c | 223 +- osfmk/ppc/ppc_vm_init.c | 139 +- osfmk/ppc/proc_reg.h | 6 +- osfmk/ppc/rtclock.c | 382 +- osfmk/ppc/savearea.c | 7 +- osfmk/ppc/savearea.h | 35 +- osfmk/ppc/savearea_asm.s | 166 +- osfmk/ppc/serial_io.c | 33 +- osfmk/ppc/serial_io.h | 4 +- osfmk/ppc/simple_lock.h | 172 + osfmk/ppc/skiplists.s | 86 +- osfmk/ppc/start.s | 145 +- osfmk/ppc/status.c | 450 +- osfmk/ppc/thread.h | 154 +- osfmk/ppc/thread_act.h | 180 - osfmk/ppc/trap.c | 170 +- osfmk/ppc/trap.h | 13 +- osfmk/ppc/vmachmon.c | 796 +- osfmk/ppc/vmachmon.h | 90 +- osfmk/ppc/vmachmon_asm.s | 102 +- osfmk/profiling/i386/profile-asm.s | 6 +- osfmk/profiling/i386/profile-md.c | 2 +- osfmk/sys/types.h | 3 +- osfmk/sys/version.h | 135 - osfmk/vm/Makefile | 10 +- osfmk/vm/bsd_vm.c | 592 +- osfmk/vm/cpm.h | 2 + osfmk/vm/device_vm.c | 172 +- osfmk/vm/memory_object.c | 1070 +-- osfmk/vm/memory_object.h | 26 +- osfmk/vm/pmap.h | 187 +- osfmk/vm/task_working_set.c | 399 +- osfmk/vm/task_working_set.h | 110 +- osfmk/vm/vm_debug.c | 113 +- osfmk/vm/vm_debug.h | 7 +- osfmk/vm/vm_external.c | 7 +- osfmk/vm/vm_fault.c | 498 +- osfmk/vm/vm_fault.h | 56 +- osfmk/vm/vm_init.c | 8 +- osfmk/vm/vm_kern.c | 505 +- osfmk/vm/vm_kern.h | 52 +- osfmk/vm/vm_map.c | 5889 +++++++++------ osfmk/vm/vm_map.h | 567 +- osfmk/vm/vm_object.c | 1199 ++- osfmk/vm/vm_object.h | 124 +- osfmk/vm/vm_page.h | 88 +- osfmk/vm/vm_pageout.c | 3973 +++++++--- osfmk/vm/vm_pageout.h | 142 +- osfmk/vm/vm_print.h | 8 +- osfmk/vm/vm_protos.h | 330 + osfmk/vm/vm_resident.c | 519 +- osfmk/vm/vm_shared_memory_server.c | 1874 ++++- osfmk/vm/vm_shared_memory_server.h | 83 +- osfmk/vm/vm_user.c | 3435 +++++---- pexpert/conf/Makefile.i386 | 16 +- pexpert/conf/Makefile.ppc | 1 - pexpert/conf/Makefile.template | 14 +- pexpert/conf/tools/Makefile | 8 +- pexpert/conf/tools/newvers/Makefile | 49 - pexpert/conf/tools/newvers/newvers.csh | 33 - pexpert/conf/version.major | 1 - pexpert/conf/version.minor | 1 - pexpert/conf/version.variant | 1 - pexpert/gen/bootargs.c | 33 +- pexpert/gen/device_tree.c | 15 +- pexpert/gen/pe_gen.c | 2 + pexpert/i386/fakePPCDeviceTree.c | 45 +- pexpert/i386/fakePPCDeviceTree.h | 9 +- pexpert/i386/fakePPCStructs.h | 26 +- pexpert/i386/kd.c | 8 +- pexpert/i386/pe_identify_machine.c | 8 +- pexpert/i386/pe_init.c | 82 +- pexpert/i386/pe_interrupt.c | 22 +- pexpert/i386/pe_kprintf.c | 13 +- pexpert/i386/pe_serial.c | 6 +- pexpert/pexpert/i386/protos.h | 1 - pexpert/pexpert/pexpert.h | 19 +- pexpert/pexpert/ppc/protos.h | 7 +- pexpert/pexpert/protos.h | 16 +- 
pexpert/ppc/pe_init.c | 1 - pexpert/ppc/pe_kprintf.c | 39 +-
1754 files changed, 236521 insertions(+), 155169 deletions(-)
create mode 100644 EXTERNAL_HEADERS/Info.plist create mode 100644 EXTERNAL_HEADERS/Makefile create mode 100644 EXTERNAL_HEADERS/architecture/Makefile create mode 100644 EXTERNAL_HEADERS/architecture/i386/Makefile create mode 100644 EXTERNAL_HEADERS/architecture/ppc/Makefile delete mode 100644 EXTERNAL_HEADERS/bsd/i386/ansi.h delete mode 100644 EXTERNAL_HEADERS/bsd/ppc/ansi.h create mode 100644 EXTERNAL_HEADERS/i386/Makefile rename bsd/dev/disk_label.h => EXTERNAL_HEADERS/i386/_limits.h (81%) rename EXTERNAL_HEADERS/{bsd => }/i386/limits.h (97%) create mode 100644 EXTERNAL_HEADERS/mach-o/Makefile create mode 100644 EXTERNAL_HEADERS/machine/Makefile create mode 100644 EXTERNAL_HEADERS/ppc/Makefile rename bsd/dev/disk.h => EXTERNAL_HEADERS/ppc/_limits.h (81%) rename EXTERNAL_HEADERS/{bsd => }/ppc/limits.h (98%) delete mode 100644 bsd/conf/tools/newvers/Makefile delete mode 100644 bsd/conf/tools/newvers/newvers.csh delete mode 100644 bsd/conf/version.major delete mode 100644 bsd/conf/version.minor delete mode 100644 bsd/conf/version.variant rename bsd/crypto/{rijndael => aes}/Makefile (81%) create mode 100644 bsd/crypto/aes/aes.h create mode 100644 bsd/crypto/aes/aescrypt.c create mode 100644 bsd/crypto/aes/aeskey.c create mode 100644 bsd/crypto/aes/aesopt.h create mode 100644 bsd/crypto/aes/aestab.c create mode 100644 bsd/crypto/aes/aestab.h delete mode 100644 bsd/crypto/rijndael/boxes-fst.dat delete mode 100644 bsd/crypto/rijndael/rijndael-alg-fst.c delete mode 100644 bsd/crypto/rijndael/rijndael-alg-fst.h delete mode 100644 bsd/crypto/rijndael/rijndael-api-fst.c delete mode 100644 bsd/crypto/rijndael/rijndael-api-fst.h delete mode 100644 bsd/crypto/rijndael/rijndael.h delete mode 100644 bsd/crypto/rijndael/rijndael_local.h delete mode 100644 bsd/dev/i386/unix_startup.c create mode 100644 bsd/dev/ppc/munge.s rename bsd/dev/{ppc => }/unix_startup.c (55%) create mode 100644 bsd/hfs/hfs_fsctl.h delete mode 100644 bsd/hfs/hfs_lockf.c delete mode 100644 bsd/hfs/hfs_lockf.h create mode 100644 bsd/hfs/hfs_xattr.c delete mode 100644 bsd/hfs/hfscommon/Catalog/Catalog.c delete mode 100644 bsd/hfs/hfscommon/Catalog/CatalogIterators.c create mode 100644 bsd/i386/_types.h delete mode 100644 bsd/i386/cpu.h delete mode 100644 bsd/i386/label_t.h delete mode 100644 bsd/i386/spl.h delete mode 100644 bsd/i386/table.h create mode 100644 bsd/kern/kern_authorization.c create mode 100644 bsd/kern/kern_credential.c rename bsd/{ufs/ufs/ufs_lockf.c => kern/kern_lockf.c} (61%) create mode 100644 bsd/kern/kpi_mbuf.c create mode 100644 bsd/kern/kpi_socket.c create mode 100644 bsd/kern/kpi_socketfilter.c create mode 100755 bsd/kern/makesyscalls.sh create mode 100644 bsd/kern/sys_pipe.c create mode 100644 bsd/kern/syscalls.master create mode 100644 bsd/libkern/crc32.c delete mode 100644 bsd/libkern/inet_ntoa.c create mode 100644 bsd/libkern/inet_ntop.c rename bsd/machine/{unix_traps.h => _limits.h} (78%) rename bsd/machine/{table.h => _types.h} (78%) delete mode 100644 bsd/machine/user.h create mode 100644 bsd/man/man2/aio_cancel.2 create mode 100644 bsd/man/man2/aio_error.2 create mode 100644 bsd/man/man2/aio_read.2 create mode 100644 bsd/man/man2/aio_return.2 create mode 100644 bsd/man/man2/aio_suspend.2 create mode 100644 bsd/man/man2/aio_write.2 delete mode 100644 bsd/man/man2/brk.2 create mode 100644 bsd/man/man2/exchangedata.2 create mode 100644 bsd/man/man2/getattrlist.2 create mode 100644
bsd/man/man2/getdirentriesattr.2 create mode 100644 bsd/man/man2/getxattr.2 create mode 100644 bsd/man/man2/listxattr.2 create mode 100644 bsd/man/man2/poll.2 create mode 100644 bsd/man/man2/removexattr.2 delete mode 100644 bsd/man/man2/sbrk.2 create mode 100644 bsd/man/man2/searchfs.2 create mode 100644 bsd/man/man2/setattrlist.2 create mode 100644 bsd/man/man2/setxattr.2 create mode 100644 bsd/man/man4/dummynet.4 create mode 100644 bsd/man/man4/ifmib.4 delete mode 100644 bsd/miscfs/specfs/spec_lockf.c create mode 100644 bsd/net/devtimer.c create mode 100644 bsd/net/devtimer.h rename bsd/{ppc/label_t.h => net/ieee8023ad.h} (58%) create mode 100644 bsd/net/if_bond.c create mode 100644 bsd/net/if_bond_var.h create mode 100644 bsd/net/if_ether.h delete mode 100644 bsd/net/if_sppp.h delete mode 100644 bsd/net/if_stf.h delete mode 100644 bsd/net/if_tun.c delete mode 100644 bsd/net/if_tun.h delete mode 100644 bsd/net/if_tunvar.h create mode 100644 bsd/net/init.c create mode 100644 bsd/net/init.h create mode 100644 bsd/net/kpi_interface.c create mode 100644 bsd/net/kpi_interface.h create mode 100644 bsd/net/kpi_interfacefilter.c create mode 100644 bsd/net/kpi_interfacefilter.h create mode 100644 bsd/net/kpi_protocol.c create mode 100644 bsd/net/kpi_protocol.h create mode 100644 bsd/net/lacp.h create mode 100644 bsd/net/multicast_list.c rename bsd/{machine/ansi.h => net/multicast_list.h} (52%) delete mode 100644 bsd/net/netisr.c delete mode 100644 bsd/net/slcompress.c delete mode 100644 bsd/net/slcompress.h delete mode 100644 bsd/netinet/if_ether.c create mode 100644 bsd/netinet/in_arp.c create mode 100644 bsd/netinet/in_arp.h create mode 100644 bsd/netinet/ip_divert.h create mode 100644 bsd/netinet/ip_fw2.c create mode 100644 bsd/netinet/ip_fw2.h create mode 100644 bsd/netinet/ip_fw2_compat.c create mode 100644 bsd/netinet/ip_fw2_compat.h create mode 100644 bsd/netinet/kpi_ipfilter.c create mode 100644 bsd/netinet/kpi_ipfilter.h rename bsd/{ppc/table.h => netinet/kpi_ipfilter_var.h} (51%) create mode 100644 bsd/netinet6/ip6_fw.c delete mode 100644 bsd/nfs/nfs_nqlease.c delete mode 100644 bsd/nfs/nlminfo.h delete mode 100644 bsd/nfs/nqnfs.h create mode 100644 bsd/ppc/_types.h delete mode 100644 bsd/ppc/spl.h delete mode 100644 bsd/ppc/user.h rename bsd/{miscfs/specfs/lockf.h => sys/_endian.h} (53%) create mode 100644 bsd/sys/_types.h delete mode 100644 bsd/sys/audit.h create mode 100644 bsd/sys/buf_internal.h create mode 100644 bsd/sys/file_internal.h create mode 100644 bsd/sys/fsevents.h rename bsd/{net/if_slvar.h => sys/imgact.h} (54%) create mode 100644 bsd/sys/ipcs.h create mode 100644 bsd/sys/kauth.h delete mode 100644 bsd/sys/kern_audit.h create mode 100644 bsd/sys/kernel_types.h create mode 100644 bsd/sys/kpi_mbuf.h create mode 100644 bsd/sys/kpi_socket.h create mode 100644 bsd/sys/kpi_socketfilter.h create mode 100644 bsd/sys/mount_internal.h create mode 100644 bsd/sys/pipe.h create mode 100644 bsd/sys/proc_internal.h rename bsd/{net/netisr.h => sys/ptrace_internal.h} (63%) create mode 100644 bsd/sys/sem_internal.h create mode 100644 bsd/sys/shm_internal.h create mode 100644 bsd/sys/sysent.h create mode 100644 bsd/sys/sysproto.h delete mode 100644 bsd/sys/table.h create mode 100644 bsd/sys/ubc_internal.h create mode 100644 bsd/sys/uio_internal.h create mode 100644 bsd/sys/vfs_context.h create mode 100644 bsd/sys/vnode_internal.h create mode 100644 bsd/sys/xattr.h delete mode 100644 bsd/ufs/ufs/lockf.h create mode 100644 bsd/uuid/Makefile create mode 100644 bsd/uuid/uuid.h create mode 
100644 bsd/vfs/kpi_vfs.c create mode 100644 bsd/vfs/vfs_attrlist.c create mode 100644 bsd/vfs/vfs_fsevents.c create mode 100644 bsd/vfs/vfs_xattr.c create mode 100644 config/IPFirewall.kext/Info.plist create mode 100644 config/MasterVersion create mode 100644 config/System.kext/PlugIns/Unsupported.kext/Info.plist create mode 100644 config/Unsupported.exports create mode 100644 config/Unsupported.i386.exports create mode 100644 config/Unsupported.ppc.exports create mode 100755 config/newvers.pl create mode 100644 config/version.c create mode 100644 iokit/IOKit/IOPolledInterface.h create mode 100644 iokit/Kernel/IOKitKernelInternal.h delete mode 100644 iokit/Kernel/IOPMPagingPlexus.cpp delete mode 100644 iokit/conf/tools/newvers/Makefile delete mode 100644 iokit/conf/tools/newvers/newvers.csh delete mode 100644 iokit/conf/version.major delete mode 100644 iokit/conf/version.minor delete mode 100644 iokit/conf/version.variant delete mode 100644 libkern/conf/tools/newvers/Makefile delete mode 100644 libkern/conf/tools/newvers/newvers.csh delete mode 100644 libkern/conf/version.major delete mode 100644 libkern/conf/version.minor delete mode 100644 libkern/conf/version.variant create mode 100644 libkern/gen/OSDebug.cpp rename bsd/machine/trap.h => libkern/libkern/OSDebug.h (71%) create mode 100644 libkern/libkern/OSMalloc.h rename bsd/i386/user.h => libkern/libkern/locks.h (79%) create mode 100644 libkern/libkern/sysctl.h create mode 100644 libkern/libkern/version.h.template create mode 100644 libkern/stdio/scanf.c create mode 100644 libkern/uuid/Makefile create mode 100644 libkern/uuid/uuid.c delete mode 100644 libsa/conf/tools/newvers/Makefile delete mode 100644 libsa/conf/tools/newvers/newvers.csh delete mode 100644 libsa/conf/version.major delete mode 100644 libsa/conf/version.minor delete mode 100644 libsa/conf/version.variant delete mode 100644 osfmk/conf/kernelversion.major delete mode 100644 osfmk/conf/kernelversion.minor delete mode 100644 osfmk/conf/kernelversion.variant delete mode 100644 osfmk/conf/tools/kernel_newvers/Makefile delete mode 100644 osfmk/conf/tools/kernel_newvers/kernel_newvers.csh delete mode 100644 osfmk/conf/tools/newvers/Makefile delete mode 100644 osfmk/conf/tools/newvers/newvers.csh delete mode 100644 osfmk/conf/version.major delete mode 100644 osfmk/conf/version.minor delete mode 100644 osfmk/conf/version.variant create mode 100644 osfmk/console/panic_ui/README create mode 100644 osfmk/console/panic_ui/appleclut8.h create mode 100644 osfmk/console/panic_ui/generated_files/panic_image.c create mode 100644 osfmk/console/panic_ui/generated_files/rendered_numbers.c create mode 100644 osfmk/console/panic_ui/genimage.c create mode 100644 osfmk/console/panic_ui/images/panic_dialog.tiff create mode 100644 osfmk/console/panic_ui/images/panic_dialogWHD.raw create mode 100644 osfmk/console/panic_ui/images/rendered_numbers.tiff create mode 100644 osfmk/console/panic_ui/images/rendered_numbersWHD.raw create mode 100644 osfmk/console/panic_ui/qtif2kraw.c create mode 100644 osfmk/console/panic_ui/setupdialog.c create mode 100644 osfmk/console/panic_ui/systemCLUT.act delete mode 100644 osfmk/i386/AT386/asm_startup.h create mode 100644 osfmk/i386/acpi.c rename bsd/machine/proc.h => osfmk/i386/acpi.h (66%) create mode 100644 osfmk/i386/acpi_wakeup.s create mode 100644 osfmk/i386/commpage/atomic.s create mode 100644 osfmk/i386/cpu_threads.c create mode 100644 osfmk/i386/cpu_threads.h delete mode 100644 osfmk/i386/hardclock.c delete mode 100644 osfmk/i386/hardclock_entries.h create 
mode 100644 osfmk/i386/locks.h create mode 100644 osfmk/i386/locks_i386.c delete mode 100644 osfmk/i386/mach_param.h create mode 100644 osfmk/i386/mtrr.c rename bsd/ppc/cpu.h => osfmk/i386/mtrr.h (51%) create mode 100644 osfmk/i386/perfmon.c create mode 100644 osfmk/i386/perfmon.h create mode 100644 osfmk/i386/postcode.h rename osfmk/{kern/time_out.h => i386/simple_lock.h} (55%) delete mode 100644 osfmk/i386/thread_act.h delete mode 100644 osfmk/kern/cpu_data.c delete mode 100644 osfmk/kern/etap.c delete mode 100644 osfmk/kern/etap_macros.h delete mode 100644 osfmk/kern/etap_map.c delete mode 100644 osfmk/kern/etap_map.h delete mode 100644 osfmk/kern/etap_options.h delete mode 100644 osfmk/kern/etap_pool.c delete mode 100644 osfmk/kern/etap_pool.h delete mode 100644 osfmk/kern/lock.c delete mode 100644 osfmk/kern/lock_mon.c create mode 100644 osfmk/kern/locks.c create mode 100644 osfmk/kern/locks.h delete mode 100644 osfmk/kern/mk_sp.h rename bsd/vm/vm_pageout.h => osfmk/kern/processor_data.c (65%) create mode 100644 osfmk/kern/processor_data.h rename osfmk/kern/{mach_factor.c => sched_average.c} (69%) delete mode 100644 osfmk/kern/simple_lock_types.h delete mode 100644 osfmk/kern/sscanf.c create mode 100644 osfmk/kern/stack.c delete mode 100644 osfmk/kern/syscall_emulation.h delete mode 100644 osfmk/kern/thread_act.h delete mode 100644 osfmk/kern/thread_swap.c delete mode 100644 osfmk/kern/thread_swap.h delete mode 100644 osfmk/mach/AT386/machdep.mk delete mode 100644 osfmk/mach/boot_info.h delete mode 100644 osfmk/mach/etap.h delete mode 100644 osfmk/mach/etap_events.h delete mode 100644 osfmk/mach/i386/mach_i386.defs delete mode 100644 osfmk/mach/i386/mach_i386_types.h create mode 100644 osfmk/mach/mach_vm.defs create mode 100644 osfmk/mach/ppc/_types.h create mode 100644 osfmk/mach/vm_purgable.h create mode 100644 osfmk/mach_debug/lockgroup_info.h delete mode 100644 osfmk/machine/disk.h delete mode 100644 osfmk/machine/gdb_defs.h delete mode 100644 osfmk/machine/hw_lock_types.h delete mode 100644 osfmk/machine/iobus.h delete mode 100644 osfmk/machine/kgdb_defs.h delete mode 100644 osfmk/machine/kgdb_setjmp.h rename osfmk/machine/{spl.h => locks.h} (83%) delete mode 100644 osfmk/machine/mach_param.h rename bsd/machine/label_t.h => osfmk/machine/machine_cpu.h (81%) rename bsd/machine/cpu.h => osfmk/machine/simple_lock.h (84%) delete mode 100644 osfmk/machine/thread_act.h create mode 100644 osfmk/ppc/chud/chud_xnu_private.h delete mode 100644 osfmk/ppc/clock.h create mode 100644 osfmk/ppc/commpage/atomic.s create mode 100644 osfmk/ppc/commpage/memset_64.s create mode 100644 osfmk/ppc/commpage/memset_g3.s create mode 100644 osfmk/ppc/commpage/memset_g4.s create mode 100644 osfmk/ppc/commpage/memset_g5.s create mode 100644 osfmk/ppc/cpu_internal.h create mode 100644 osfmk/ppc/locks.h create mode 100644 osfmk/ppc/locks_ppc.c delete mode 100644 osfmk/ppc/mach_param.h create mode 100644 osfmk/ppc/simple_lock.h delete mode 100644 osfmk/ppc/thread_act.h delete mode 100644 osfmk/sys/version.h create mode 100644 osfmk/vm/vm_protos.h delete mode 100644 pexpert/conf/tools/newvers/Makefile delete mode 100644 pexpert/conf/tools/newvers/newvers.csh delete mode 100644 pexpert/conf/version.major delete mode 100644 pexpert/conf/version.minor delete mode 100644 pexpert/conf/version.variant diff --git a/EXTERNAL_HEADERS/Info.plist b/EXTERNAL_HEADERS/Info.plist new file mode 100644 index 000000000..848b9fe75 --- /dev/null +++ b/EXTERNAL_HEADERS/Info.plist @@ -0,0 +1,22 @@ +<?xml version="1.0" 
encoding="UTF-8"?> +<!DOCTYPE plist SYSTEM "file://localhost/System/Library/DTDs/PropertyList.dtd"> +<plist version="0.9"> +<dict> + <key>CFBundleDevelopmentRegion</key> + <string>English</string> + <key>CFBundleIdentifier</key> + <string>com.apple.framework.kernel</string> + <key>CFBundleName</key> + <string>Kernel</string> + <key>CFBundleInfoDictionaryVersion</key> + <string>6.0</string> + <key>CFBundlePackageType</key> + <string>FMWK</string> + <key>CFBundleShortVersionString</key> + <string>###KERNEL_VERSION_SHORT###</string> + <key>CFBundleVersion</key> + <string>###KERNEL_VERSION_LONG###</string> + <key>CFBundleSignature</key> + <string>????</string> +</dict> +</plist> diff --git a/EXTERNAL_HEADERS/Makefile b/EXTERNAL_HEADERS/Makefile new file mode 100644 index 000000000..0e05710a4 --- /dev/null +++ b/EXTERNAL_HEADERS/Makefile @@ -0,0 +1,39 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + + +include $(MakeInc_cmd) +include $(MakeInc_def) + +INSTINC_SUBDIRS = \ + architecture \ + machine \ + mach-o + +INSTINC_SUBDIRS_PPC = \ + architecture \ + ppc + +INSTINC_SUBDIRS_I386 = \ + architecture \ + i386 + +EXPORT_FILES = \ + ar.h \ + stdarg.h \ + stdint.h + +INSTALL_MI_LIST = + +INSTALL_MI_DIR = . + +EXPORT_MI_LIST = ${EXPORT_FILES} + +EXPORT_MI_DIR = . + +include $(MakeInc_rule) +include $(MakeInc_dir) + + diff --git a/EXTERNAL_HEADERS/ar.h b/EXTERNAL_HEADERS/ar.h index def1c4320..79ab2ade0 100644 --- a/EXTERNAL_HEADERS/ar.h +++ b/EXTERNAL_HEADERS/ar.h @@ -61,6 +61,7 @@ * * @(#)ar.h 8.2 (Berkeley) 1/21/94 */ +#ifdef KERNEL_PRIVATE #ifndef _AR_H_ #define _AR_H_ @@ -86,3 +87,4 @@ struct ar_hdr { }; #endif /* !_AR_H_ */ +#endif /* KERNEL_PRIVATE */ diff --git a/EXTERNAL_HEADERS/architecture/Makefile b/EXTERNAL_HEADERS/architecture/Makefile new file mode 100644 index 000000000..fd64ab197 --- /dev/null +++ b/EXTERNAL_HEADERS/architecture/Makefile @@ -0,0 +1,32 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + + +include $(MakeInc_cmd) +include $(MakeInc_def) + +INSTINC_SUBDIRS = + +INSTINC_SUBDIRS_PPC = \ + ppc + +INSTINC_SUBDIRS_I386 = \ + i386 + +EXPORT_FILES = \ + byte_order.h + +INSTALL_MI_LIST = + +INSTALL_MI_DIR = architecture + +EXPORT_MI_LIST = ${EXPORT_FILES} + +EXPORT_MI_DIR = architecture + +include $(MakeInc_rule) +include $(MakeInc_dir) + + diff --git a/EXTERNAL_HEADERS/architecture/byte_order.h b/EXTERNAL_HEADERS/architecture/byte_order.h index b39c272cb..fe80ee110 100644 --- a/EXTERNAL_HEADERS/architecture/byte_order.h +++ b/EXTERNAL_HEADERS/architecture/byte_order.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -23,17 +23,6 @@ * Copyright (c) 1992 NeXT Computer, Inc. * * Byte ordering conversion. - * - * HISTORY - * - * 20 October 1992 ? at NeXT - * Added #ifdef wrapper to prevent multiple inclusions of this file. - * - * 8 October 1992 ? at NeXT - * Converted to NXxxx versions. Condensed history. - * - * 18 May 1992 ? at NeXT - * Created. 
*/ #ifndef _ARCHITECTURE_BYTE_ORDER_H_ @@ -42,7 +31,7 @@ typedef unsigned long NXSwappedFloat; typedef unsigned long long NXSwappedDouble; -#if defined (__ppc__) +#if defined (__ppc__) || defined(__ppc64__) #include "architecture/ppc/byte_order.h" #elif defined (__i386__) #include "architecture/i386/byte_order.h" diff --git a/EXTERNAL_HEADERS/architecture/i386/Makefile b/EXTERNAL_HEADERS/architecture/i386/Makefile new file mode 100644 index 000000000..5140ed922 --- /dev/null +++ b/EXTERNAL_HEADERS/architecture/i386/Makefile @@ -0,0 +1,37 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + + +include $(MakeInc_cmd) +include $(MakeInc_def) + +INSTINC_SUBDIRS_I386 = + +EXPORT_FILES = \ + asm_help.h \ + cpu.h \ + fpu.h \ + io.h \ + sel.h \ + tss.h \ + byte_order.h \ + desc.h \ + frame.h \ + reg_help.h \ + table.h + + +INSTALL_MD_LIST = + +INSTALL_MD_DIR = + +EXPORT_MD_LIST = ${EXPORT_FILES} + +EXPORT_MD_DIR = architecture/i386 + +include $(MakeInc_rule) +include $(MakeInc_dir) + + diff --git a/EXTERNAL_HEADERS/architecture/ppc/Makefile b/EXTERNAL_HEADERS/architecture/ppc/Makefile new file mode 100644 index 000000000..8cf3b09c2 --- /dev/null +++ b/EXTERNAL_HEADERS/architecture/ppc/Makefile @@ -0,0 +1,34 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + + +include $(MakeInc_cmd) +include $(MakeInc_def) + +INSTINC_SUBDIRS_PPC = + +EXPORT_FILES = \ + asm_help.h \ + basic_regs.h \ + byte_order.h \ + cframe.h \ + fp_regs.h \ + macro_help.h \ + pseudo_inst.h \ + reg_help.h + + +INSTALL_MD_LIST = + +INSTALL_MD_DIR = + +EXPORT_MD_LIST = ${EXPORT_FILES} + +EXPORT_MD_DIR = architecture/ppc + +include $(MakeInc_rule) +include $(MakeInc_dir) + + diff --git a/EXTERNAL_HEADERS/architecture/ppc/byte_order.h b/EXTERNAL_HEADERS/architecture/ppc/byte_order.h index 77927e97a..4669264e4 100644 --- a/EXTERNAL_HEADERS/architecture/ppc/byte_order.h +++ b/EXTERNAL_HEADERS/architecture/ppc/byte_order.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -23,17 +23,6 @@ * Copyright (c) 1996 NeXT Software, Inc. * * Byte ordering conversion (for ppc). - * - * HISTORY - * - * 29-Dec-96 Umesh Vaishampayan (umeshv@NeXT.com) - * Ported from m98k. - * - * 8 October 1992 ? at NeXT - * Converted to NXxxx versions. Condensed history. - * - * 28 August 1992 Bruce Martin @NeXT - * Created. 
*/ static __inline__ @@ -77,34 +66,40 @@ NXSwapInt( } static __inline__ -unsigned long -NXSwapLong( - unsigned long inv +unsigned long long +NXSwapLongLong( + unsigned long long inv ) { - union lconv { - unsigned long ul; - unsigned char uc[4]; + union llconv { + unsigned long long ull; + unsigned char uc[8]; } *inp, outv; - inp = (union lconv *)&inv; + inp = (union llconv *)&inv; - outv.uc[0] = inp->uc[3]; - outv.uc[1] = inp->uc[2]; - outv.uc[2] = inp->uc[1]; - outv.uc[3] = inp->uc[0]; + outv.uc[0] = inp->uc[7]; + outv.uc[1] = inp->uc[6]; + outv.uc[2] = inp->uc[5]; + outv.uc[3] = inp->uc[4]; + outv.uc[4] = inp->uc[3]; + outv.uc[5] = inp->uc[2]; + outv.uc[6] = inp->uc[1]; + outv.uc[7] = inp->uc[0]; - return (outv.ul); + return (outv.ull); } +#if defined(__LP64__) + static __inline__ -unsigned long long -NXSwapLongLong( - unsigned long long inv +unsigned long +NXSwapLong( + unsigned long inv ) { union llconv { - unsigned long long ull; + unsigned long ul; unsigned char uc[8]; } *inp, outv; @@ -119,9 +114,34 @@ NXSwapLongLong( outv.uc[6] = inp->uc[1]; outv.uc[7] = inp->uc[0]; - return (outv.ull); + return (outv.ul); } +#else + +static __inline__ +unsigned long +NXSwapLong( + unsigned long inv +) +{ + union lconv { + unsigned long ul; + unsigned char uc[4]; + } *inp, outv; + + inp = (union lconv *)&inv; + + outv.uc[0] = inp->uc[3]; + outv.uc[1] = inp->uc[2]; + outv.uc[2] = inp->uc[1]; + outv.uc[3] = inp->uc[0]; + + return (outv.ul); +} + +#endif /* __LP64__ */ + #ifndef KERNEL static __inline__ NXSwappedFloat diff --git a/EXTERNAL_HEADERS/architecture/ppc/cframe.h b/EXTERNAL_HEADERS/architecture/ppc/cframe.h index 80a08ab6c..3dc63034c 100644 --- a/EXTERNAL_HEADERS/architecture/ppc/cframe.h +++ b/EXTERNAL_HEADERS/architecture/ppc/cframe.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -26,21 +26,19 @@ * * This include file defines C calling sequence defines * for ppc port. - * - * HISTORY - * 20-May-97 Umesh Vaishampayan (umeshv@apple.com) - * Added C_RED_ZONE. - * 29-Dec-96 Umesh Vaishampayan (umeshv@NeXT.com) - * Ported from m98k. - * 11-June-91 Mike DeMoney (mike@next.com) - * Created. */ #ifndef _ARCH_PPC_CFRAME_H_ #define _ARCH_PPC_CFRAME_H_ +#if defined (__ppc64__) +#define C_ARGSAVE_LEN 64 /* at least 64 bytes of arg save */ +#define C_STACK_ALIGN 32 /* stack must be 32 byte aligned */ +#define C_RED_ZONE 320 /* 320 bytes to skip over saved registers */ +#else #define C_ARGSAVE_LEN 32 /* at least 32 bytes of arg save */ #define C_STACK_ALIGN 16 /* stack must be 16 byte aligned */ -#define C_RED_ZONE 244 /* 224 bytes to skip over saved registers */ +#define C_RED_ZONE 224 /* 224 bytes to skip over saved registers */ +#endif #endif /* _ARCH_PPC_CFRAME_H_ */ diff --git a/EXTERNAL_HEADERS/bsd/i386/ansi.h b/EXTERNAL_HEADERS/bsd/i386/ansi.h deleted file mode 100644 index e20404611..000000000 --- a/EXTERNAL_HEADERS/bsd/i386/ansi.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. 
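
[Note on the NXSwapLong/NXSwapLongLong hunks above: under __LP64__ an unsigned long is eight bytes wide, so NXSwapLong must reverse all eight bytes, exactly as NXSwapLongLong does. As a quick cross-check, here is a minimal, self-contained sketch of the same union-based reversal — function and test names are ours, not part of the patch — compared against a shift-based reference:]

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Union-based 64-bit byte reversal, in the style of NXSwapLongLong. */
    static uint64_t
    swap64_union(uint64_t inv)
    {
        union llconv {
            uint64_t      ull;
            unsigned char uc[8];
        } *inp, outv;
        int i;

        inp = (union llconv *)&inv;
        for (i = 0; i < 8; i++)        /* the header unrolls this loop */
            outv.uc[i] = inp->uc[7 - i];
        return outv.ull;
    }

    /* Shift-based reference implementation for comparison. */
    static uint64_t
    swap64_shift(uint64_t v)
    {
        uint64_t r = 0;
        int i;

        for (i = 0; i < 8; i++)
            r = (r << 8) | ((v >> (8 * i)) & 0xff);
        return r;
    }

    int
    main(void)
    {
        uint64_t x = 0x0123456789abcdefULL;

        assert(swap64_union(x) == 0xefcdab8967452301ULL);
        assert(swap64_union(x) == swap64_shift(x));
        printf("%016llx -> %016llx\n", (unsigned long long)x,
               (unsigned long long)swap64_union(x));
        return 0;
    }

[Both forms are endian-agnostic: they unconditionally reverse bytes, which is why the header can use the same code on either side of a PPC/i386 exchange.]
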
- * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/*- - * Copyright (c) 1990, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)ansi.h 8.2 (Berkeley) 1/4/94 - */ - -#ifndef _ANSI_H_ -#define _ANSI_H_ - -/* - * Types which are fundamental to the implementation and may appear in - * more than one standard header are defined here. Standard headers - * then use: - * #ifdef _BSD_SIZE_T_ - * typedef _BSD_SIZE_T_ size_t; - * #undef _BSD_SIZE_T_ - * #endif - */ -#define _BSD_CLOCK_T_ unsigned long /* clock() */ -#if defined(__GNUC__) && defined(__PTRDIFF_TYPE__) && defined(__SIZE_TYPE__) -#define _BSD_PTRDIFF_T_ __PTRDIFF_TYPE__ /* ptr1 - ptr2 */ -#define _BSD_SIZE_T_ __SIZE_TYPE__ /* sizeof() */ -#else -#define _BSD_PTRDIFF_T_ int /* ptr1 - ptr2 */ -#define _BSD_SIZE_T_ unsigned long /* sizeof() */ -#endif /* __GNUC__ */ -#define _BSD_SSIZE_T_ int /* byte count or error */ -#define _BSD_TIME_T_ long /* time() */ -#define _BSD_VA_LIST_ void * /* va_list */ -#define _BSD_SOCKLEN_T_ int32_t /* socklen_t (duh) */ - -/* - * Runes (wchar_t) is declared to be an ``int'' instead of the more natural - * ``unsigned long'' or ``long''. Two things are happening here. It is not - * unsigned so that EOF (-1) can be naturally assigned to it and used. Also, - * it looks like 10646 will be a 31 bit standard. 
This means that if your - * ints cannot hold 32 bits, you will be in trouble. The reason an int was - * chosen over a long is that the is*() and to*() routines take ints (says - * ANSI C), but they use _RUNE_T_ instead of int. By changing it here, you - * lose a bit of ANSI conformance, but your programs will still work. - * - * Note that _WCHAR_T_ and _RUNE_T_ must be of the same type. When wchar_t - * and rune_t are typedef'd, _WCHAR_T_ will be undef'd, but _RUNE_T remains - * defined for ctype.h. - */ -#if defined(__GNUC__) && defined(__WCHAR_TYPE__) -#define _BSD_WCHAR_T_ __WCHAR_TYPE__ /* wchar_t */ -#define _BSD_RUNE_T_ __WCHAR_TYPE__ /* rune_t */ -#else -#define _BSD_WCHAR_T_ int /* wchar_t */ -#define _BSD_RUNE_T_ int /* rune_t */ -#endif /* __GNUC__ */ - -#endif /* _ANSI_H_ */ diff --git a/EXTERNAL_HEADERS/bsd/ppc/ansi.h b/EXTERNAL_HEADERS/bsd/ppc/ansi.h deleted file mode 100644 index 7bd74d59e..000000000 --- a/EXTERNAL_HEADERS/bsd/ppc/ansi.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/*- - * Copyright (c) 1990, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)ansi.h 8.2 (Berkeley) 1/4/94 - */ - -#ifndef _ANSI_H_ -#define _ANSI_H_ - -/* - * Types which are fundamental to the implementation and may appear in - * more than one standard header are defined here. Standard headers - * then use: - * #ifdef _BSD_SIZE_T_ - * typedef _BSD_SIZE_T_ size_t; - * #undef _BSD_SIZE_T_ - * #endif - */ -#define _BSD_CLOCK_T_ unsigned long /* clock() */ -#if defined(__GNUC__) && defined(__PTRDIFF_TYPE__) && defined(__SIZE_TYPE__) -#define _BSD_PTRDIFF_T_ __PTRDIFF_TYPE__ /* ptr1 - ptr2 */ -#define _BSD_SIZE_T_ __SIZE_TYPE__ /* sizeof() */ -#else -#define _BSD_PTRDIFF_T_ int /* ptr1 - ptr2 */ -#define _BSD_SIZE_T_ unsigned long /* sizeof() */ -#endif /* __GNUC__ */ -#define _BSD_SSIZE_T_ int /* byte count or error */ -#define _BSD_TIME_T_ long /* time() */ -#define _BSD_VA_LIST_ char * /* va_list */ -#define _BSD_SOCKLEN_T_ int32_t /* socklen_t (duh) */ - -/* - * Runes (wchar_t) is declared to be an ``int'' instead of the more natural - * ``unsigned long'' or ``long''. Two things are happening here. It is not - * unsigned so that EOF (-1) can be naturally assigned to it and used. Also, - * it looks like 10646 will be a 31 bit standard. This means that if your - * ints cannot hold 32 bits, you will be in trouble. The reason an int was - * chosen over a long is that the is*() and to*() routines take ints (says - * ANSI C), but they use _RUNE_T_ instead of int. By changing it here, you - * lose a bit of ANSI conformance, but your programs will still work. - * - * Note that _WCHAR_T_ and _RUNE_T_ must be of the same type. When wchar_t - * and rune_t are typedef'd, _WCHAR_T_ will be undef'd, but _RUNE_T remains - * defined for ctype.h. - */ -#if defined(__GNUC__) && defined(__WCHAR_TYPE__) -#define _BSD_WCHAR_T_ __WCHAR_TYPE__ /* wchar_t */ -#define _BSD_RUNE_T_ __WCHAR_TYPE__ /* rune_t */ -#else -#define _BSD_WCHAR_T_ int /* wchar_t */ -#define _BSD_RUNE_T_ int /* rune_t */ -#endif /* __GNUC__ */ - -#endif /* _ANSI_H_ */ diff --git a/EXTERNAL_HEADERS/i386/Makefile b/EXTERNAL_HEADERS/i386/Makefile new file mode 100644 index 000000000..850fc3599 --- /dev/null +++ b/EXTERNAL_HEADERS/i386/Makefile @@ -0,0 +1,27 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + + +include $(MakeInc_cmd) +include $(MakeInc_def) + +INSTINC_SUBDIRS_I386 = + +EXPORT_FILES = \ + _limits.h \ + limits.h + +INSTALL_MD_LIST = + +INSTALL_MD_DIR = + +EXPORT_MD_LIST = ${EXPORT_FILES} + +EXPORT_MD_DIR = i386 + +include $(MakeInc_rule) +include $(MakeInc_dir) + + diff --git a/bsd/dev/disk_label.h b/EXTERNAL_HEADERS/i386/_limits.h similarity index 81% rename from bsd/dev/disk_label.h rename to EXTERNAL_HEADERS/i386/_limits.h index 3819dcbdc..3b9e7a6f7 100644 --- a/bsd/dev/disk_label.h +++ b/EXTERNAL_HEADERS/i386/_limits.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. 
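
[The two ansi.h files deleted above implemented the old BSD convention their comments describe: each fundamental type is exposed as a #define so that the first standard header to need the type can materialize the typedef exactly once. In miniature — _BSD_SIZE_T_ is the name from the deleted header; the demo typedef name and main() are ours:]

    /* -- machine/ansi.h style provider ------------------------------ */
    #define _BSD_SIZE_T_ unsigned long     /* sizeof() */

    /* -- a standard header consuming it, as the deleted comment shows */
    #ifdef _BSD_SIZE_T_
    typedef _BSD_SIZE_T_ size_t_demo;      /* demo name; the real code
                                            * defines size_t here */
    #undef _BSD_SIZE_T_
    #endif

    /* A second consumer now finds the macro gone and emits nothing,
     * so the typedef can never appear twice in one translation unit. */
    #ifdef _BSD_SIZE_T_
    typedef _BSD_SIZE_T_ size_t_demo;
    #endif

    int
    main(void)
    {
        size_t_demo n = sizeof(int);       /* behaves like size_t */
        return (int)(n > 0 ? 0 : 1);
    }

[The patch retires this machinery in favor of the new per-architecture _types.h headers created elsewhere in this change.]
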
+ * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -19,5 +19,9 @@ * * @APPLE_LICENSE_HEADER_END@ */ +#ifndef _I386__LIMITS_H_ +#define _I386__LIMITS_H_ -#warning <dev/disk_label.h> is obsolete +#define __DARWIN_CLK_TCK 100 /* ticks per second */ + +#endif /* _I386__LIMITS_H_ */ diff --git a/EXTERNAL_HEADERS/bsd/i386/limits.h b/EXTERNAL_HEADERS/i386/limits.h similarity index 97% rename from EXTERNAL_HEADERS/bsd/i386/limits.h rename to EXTERNAL_HEADERS/i386/limits.h index 64eecd5a2..5c6e31df1 100644 --- a/EXTERNAL_HEADERS/bsd/i386/limits.h +++ b/EXTERNAL_HEADERS/i386/limits.h @@ -63,11 +63,14 @@ #ifndef _I386_LIMITS_H_ #define _I386_LIMITS_H_ +#include <i386/_limits.h> + #define CHAR_BIT 8 /* number of bits in a char */ #define MB_LEN_MAX 6 /* Allow 31 bit UTF2 */ - -#define CLK_TCK 100 /* ticks per second */ +#ifndef CLK_TCK +#define CLK_TCK __DARWIN_CLK_TCK /* ticks per second */ +#endif /* * According to ANSI (section 2.2.4.2), the values below must be usable by diff --git a/EXTERNAL_HEADERS/mach-o/Makefile b/EXTERNAL_HEADERS/mach-o/Makefile new file mode 100644 index 000000000..efac1ba25 --- /dev/null +++ b/EXTERNAL_HEADERS/mach-o/Makefile @@ -0,0 +1,30 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + + +include $(MakeInc_cmd) +include $(MakeInc_def) + +INSTINC_SUBDIRS = \ + +EXPORT_FILES = \ + fat.h \ + kld.h \ + loader.h \ + nlist.h \ + reloc.h + +INSTALL_MI_LIST = + +INSTALL_MI_DIR = + +EXPORT_MI_LIST = ${EXPORT_FILES} + +EXPORT_MI_DIR = mach-o + +include $(MakeInc_rule) +include $(MakeInc_dir) + + diff --git a/EXTERNAL_HEADERS/mach-o/fat.h b/EXTERNAL_HEADERS/mach-o/fat.h index 3558e4acf..e964a1965 100644 --- a/EXTERNAL_HEADERS/mach-o/fat.h +++ b/EXTERNAL_HEADERS/mach-o/fat.h @@ -57,3 +57,18 @@ struct fat_arch { unsigned long align; /* alignment as a power of 2 */ }; +#ifdef KERNEL + +#include <mach/mach_types.h> + +struct vnode; + +/* XXX return type should be load_return_t, but mach_loader.h is not in scope */ +int fatfile_getarch_affinity(struct vnode *vp, vm_offset_t data_ptr, + struct fat_arch *archret, int affinity); +int fatfile_getarch(struct vnode *vp, vm_offset_t data_ptr, + struct fat_arch *archret); +int fatfile_getarch_with_bits(struct vnode *vp, integer_t archbits, + vm_offset_t data_ptr, struct fat_arch *archret); + +#endif /* KERNEL */ diff --git a/EXTERNAL_HEADERS/mach-o/loader.h b/EXTERNAL_HEADERS/mach-o/loader.h index 707dd35d8..d0d148fa9 100644 --- a/EXTERNAL_HEADERS/mach-o/loader.h +++ b/EXTERNAL_HEADERS/mach-o/loader.h @@ -46,22 +46,47 @@ #include <architecture/byte_order.h> /* - * The mach header appears at the very beginning of the object file. + * The mach header appears at the very beginning of the object file; it + * is the same for both 32-bit and 64-bit architectures. 
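
[Aside on the fat.h additions above: the new kernel-only fatfile_getarch* prototypes pick a matching architecture slice out of a multi-architecture ("fat") binary. For readers unfamiliar with the on-disk layout, here is a hedged user-space sketch of the walk those routines perform. Fat headers are stored big-endian, so every field is byte-swapped before use; the struct mirrors and file-reading scaffolding are ours (the real definitions live in <mach-o/fat.h>):]

    #include <arpa/inet.h>   /* ntohl(): big-endian to host order */
    #include <stdint.h>
    #include <stdio.h>

    #define FAT_MAGIC 0xcafebabe /* value from <mach-o/fat.h> */

    /* Local mirrors of struct fat_header / struct fat_arch. */
    struct fat_header { uint32_t magic; uint32_t nfat_arch; };
    struct fat_arch {
        uint32_t cputype, cpusubtype;  /* really cpu_type_t etc. */
        uint32_t offset, size, align;
    };

    int
    main(int argc, char **argv)
    {
        struct fat_header fh;
        struct fat_arch   fa;
        uint32_t i, n;
        FILE *f;

        if (argc != 2 || (f = fopen(argv[1], "rb")) == NULL)
            return 1;
        if (fread(&fh, sizeof fh, 1, f) != 1 ||
            ntohl(fh.magic) != FAT_MAGIC) {
            fprintf(stderr, "not a fat file\n");
            return 1;
        }
        n = ntohl(fh.nfat_arch);       /* all fields big-endian on disk */
        for (i = 0; i < n && fread(&fa, sizeof fa, 1, f) == 1; i++)
            printf("arch %u: cputype 0x%x offset %u size %u align 2^%u\n",
                   i, ntohl(fa.cputype), ntohl(fa.offset),
                   ntohl(fa.size), ntohl(fa.align));
        fclose(f);
        return 0;
    }

[The kernel versions additionally score each slice against the host ("affinity") and, in the new _with_bits variant, against the 64-bit capability bits of the requesting task.]
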
*/ struct mach_header { - unsigned long magic; /* mach magic number identifier */ + uint32_t magic; /* mach magic number identifier */ cpu_type_t cputype; /* cpu specifier */ cpu_subtype_t cpusubtype; /* machine specifier */ - unsigned long filetype; /* type of file */ - unsigned long ncmds; /* number of load commands */ - unsigned long sizeofcmds; /* the size of all the load commands */ - unsigned long flags; /* flags */ + uint32_t filetype; /* type of file */ + uint32_t ncmds; /* number of load commands */ + uint32_t sizeofcmds; /* the size of all the load commands */ + uint32_t flags; /* flags */ +}; + +/* + * The 64-bit mach header appears at the very beginning of object files for + * 64-bit architectures. + */ +struct mach_header_64 { + uint32_t magic; /* mach magic number identifier */ + cpu_type_t cputype; /* cpu specifier */ + cpu_subtype_t cpusubtype; /* machine specifier */ + uint32_t filetype; /* type of file */ + uint32_t ncmds; /* number of load commands */ + uint32_t sizeofcmds; /* the size of all the load commands */ + uint32_t flags; /* flags */ + uint32_t reserved; /* reserved */ }; -/* Constant for the magic field of the mach_header */ +/* Constant for the magic field of the mach_header (32-bit architectures) */ #define MH_MAGIC 0xfeedface /* the mach magic number */ #define MH_CIGAM NXSwapInt(MH_MAGIC) +/* Constant for the magic field of the mach_header_64 (64-bit architectures) */ +#define MH_MAGIC_64 0xfeedfacf /* the 64-bit mach magic number */ +#define MH_CIGAM_64 NXSwapInt(MH_MAGIC_64) + +/* Constants for the cmd field of new load commands, the type */ +#define LC_SEGMENT_64 0x19 /* 64-bit segment of this file to be mapped */ +#define LC_ROUTINES_64 0x1a /* 64-bit image routines */ + + /* * The layout of the file depends on the filetype. For all but the MH_OBJECT * file type the segments are padded out and aligned on a segment alignment @@ -118,7 +143,9 @@ struct mach_header { * of the particular load command structure plus anything that follows it that * is a part of the load command (i.e. section structures, strings, etc.). To * advance to the next load command the cmdsize can be added to the offset or - * pointer of the current load command. The cmdsize MUST be a multiple of + * pointer of the current load command. The cmdsize for 32-bit architectures + * MUST be a multiple of 4 bytes and for 64-bit architectures MUST be a multiple + * of 8 bytes (these are forever the maximum alignment of any load commands). * sizeof(long) (this is forever the maximum alignment of any load commands). * The padded bytes must be zero. All tables in the object file must also * follow these rules so the file can be memory mapped. Otherwise the pointers @@ -174,7 +201,7 @@ union lc_str { * section structures directly follow the segment command and their size is * reflected in cmdsize. */ -struct segment_command { +struct segment_command { /* for 32-bit architectures */ unsigned long cmd; /* LC_SEGMENT */ unsigned long cmdsize; /* includes sizeof section structs */ char segname[16]; /* segment name */ @@ -188,6 +215,27 @@ struct segment_command { unsigned long flags; /* flags */ }; +/* + * The 64-bit segment load command indicates that a part of this file is to be + * mapped into a 64-bit task's address space. If the 64-bit segment has + * sections then section_64 structures directly follow the 64-bit segment + * command and their size is reflected in cmdsize. 
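
[The cmdsize rule spelled out in the comment above — a multiple of 4 bytes for 32-bit files, 8 for 64-bit files — is what lets a loader walk the command list without knowing every command type. A minimal sketch of that walk, assuming the image is already mapped in host byte order; the struct mirrors and helper name are ours:]

    #include <stdint.h>

    /* Minimal mirrors of the loader.h declarations shown above. */
    struct load_command { uint32_t cmd; uint32_t cmdsize; };
    struct mach_header_64 {
        uint32_t magic, cputype, cpusubtype, filetype;
        uint32_t ncmds, sizeofcmds, flags, reserved;
    };
    #define MH_MAGIC_64 0xfeedfacf

    /*
     * Visit each load command of a 64-bit image in memory.  Returns the
     * number of commands walked, or 0 on a malformed image.  A cmdsize
     * of zero or one that is not 8-byte aligned is rejected, since it
     * would loop forever or misalign every later command.
     */
    static uint32_t
    walk_commands(const void *image)
    {
        const struct mach_header_64 *mh = image;
        const uint8_t *p, *end;
        uint32_t i;

        if (mh->magic != MH_MAGIC_64)
            return 0;
        p   = (const uint8_t *)(mh + 1); /* commands follow the header */
        end = p + mh->sizeofcmds;
        for (i = 0; i < mh->ncmds; i++) {
            const struct load_command *lc =
                (const struct load_command *)p;
            if (lc->cmdsize == 0 || lc->cmdsize % 8 != 0 ||
                p + lc->cmdsize > end)
                return 0;                /* enforce the alignment rule */
            p += lc->cmdsize;            /* advance by cmdsize */
        }
        return i;
    }

[For a 32-bit image the same loop applies with struct mach_header, MH_MAGIC, and a modulus of 4.]
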
+ */ +struct segment_command_64 { /* for 64-bit architectures */ + uint32_t cmd; /* LC_SEGMENT_64 */ + uint32_t cmdsize; /* includes sizeof section_64 structs */ + char segname[16]; /* segment name */ + uint64_t vmaddr; /* memory address of this segment */ + uint64_t vmsize; /* memory size of this segment */ + uint64_t fileoff; /* file offset of this segment */ + uint64_t filesize; /* amount to map from the file */ + vm_prot_t maxprot; /* maximum VM protection */ + vm_prot_t initprot; /* initial VM protection */ + uint32_t nsects; /* number of sections in segment */ + uint32_t flags; /* flags */ +}; + + /* Constants for the flags field of the segment_command */ #define SG_HIGHVM 0x1 /* the file contents for this segment is for the high part of the VM space, the low part @@ -207,7 +255,9 @@ struct segment_command { * and load commands of the object file before it's first section. The zero * fill sections are always last in their segment (in all formats). This * allows the zeroed segment padding to be mapped into memory where zero fill - * sections might be. + * sections might be. The gigabyte zero fill sections, those with the section + * type S_GB_ZEROFILL, can only be in a segment with sections of this type. + * These segments are then placed after all other segments. * * The MH_OBJECT format has all of it's sections in one segment for * compactness. There is no padding to a specified segment boundary and the @@ -224,7 +274,7 @@ struct segment_command { * fields of the section structure for mach object files is described in the * header file <reloc.h>. */ -struct section { +struct section { /* for 32-bit architectures */ char sectname[16]; /* name of this section */ char segname[16]; /* segment this section goes in */ unsigned long addr; /* memory address of this section */ @@ -238,6 +288,22 @@ struct section { unsigned long reserved2; /* reserved */ }; +struct section_64 { /* for 64-bit architectures */ + char sectname[16]; /* name of this section */ + char segname[16]; /* segment this section goes in */ + uint64_t addr; /* memory address of this section */ + uint64_t size; /* size in bytes of this section */ + uint32_t offset; /* file offset of this section */ + uint32_t align; /* section alignment (power of 2) */ + uint32_t reloff; /* file offset of relocation entries */ + uint32_t nreloc; /* number of relocation entries */ + uint32_t flags; /* flags (section type and attributes)*/ + uint32_t reserved1; /* reserved (for offset or index) */ + uint32_t reserved2; /* reserved (for count or sizeof) */ + uint32_t reserved3; /* reserved */ +}; + + /* * The flags field of a section structure is separated into two parts a section * type and section attributes. 
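
[As the segment_command_64 comment above says, the section_64 structures sit directly behind the segment command itself, with their count in nsects. A sketch of how a reader recovers them — field-for-field struct mirrors with types simplified (maxprot/initprot are really vm_prot_t), pointer arithmetic ours:]

    #include <stdint.h>
    #include <stdio.h>

    struct segment_command_64 {
        uint32_t cmd, cmdsize;
        char     segname[16];
        uint64_t vmaddr, vmsize, fileoff, filesize;
        uint32_t maxprot, initprot, nsects, flags;
    };
    struct section_64 {
        char     sectname[16], segname[16];
        uint64_t addr, size;
        uint32_t offset, align, reloff, nreloc, flags;
        uint32_t reserved1, reserved2, reserved3;
    };

    /* The sections begin immediately after the segment command. */
    static void
    print_sections(const struct segment_command_64 *sg)
    {
        const struct section_64 *s = (const struct section_64 *)(sg + 1);
        uint32_t i;

        /* %.16s: segname/sectname are 16 bytes, not NUL-terminated
         * when the name uses the full width. */
        for (i = 0; i < sg->nsects; i++, s++)
            printf("(%.16s,%.16s) addr 0x%llx size %llu\n",
                   s->segname, s->sectname,
                   (unsigned long long)s->addr,
                   (unsigned long long)s->size);
    }

[Real code would first validate that cmdsize covers sizeof(*sg) plus nsects sections, per the walking rule shown earlier.]
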
The section types are mutually exclusive (it @@ -667,6 +733,34 @@ struct dylib_module { objc_module_info_size; /* the (__OBJC,__module_info) section */ }; +/* a 64-bit module table entry */ +struct dylib_module_64 { + uint32_t module_name; /* the module name (index into string table) */ + + uint32_t iextdefsym; /* index into externally defined symbols */ + uint32_t nextdefsym; /* number of externally defined symbols */ + uint32_t irefsym; /* index into reference symbol table */ + uint32_t nrefsym; /* number of reference symbol table entries */ + uint32_t ilocalsym; /* index into symbols for local symbols */ + uint32_t nlocalsym; /* number of local symbols */ + + uint32_t iextrel; /* index into external relocation entries */ + uint32_t nextrel; /* number of external relocation entries */ + + uint32_t iinit_iterm; /* low 16 bits are the index into the init + section, high 16 bits are the index into + the term section */ + uint32_t ninit_nterm; /* low 16 bits are the number of init section + entries, high 16 bits are the number of + term section entries */ + + uint32_t /* for this module size of the */ + objc_module_info_size; /* (__OBJC,__module_info) section */ + uint64_t /* for this module address of the start of */ + objc_module_info_addr; /* the (__OBJC,__module_info) section */ +}; + + /* * The entries in the reference symbol table are used when loading the module * (both by the static and dynamic link editors) and if the module is unloaded diff --git a/EXTERNAL_HEADERS/mach-o/nlist.h b/EXTERNAL_HEADERS/mach-o/nlist.h index 91763d098..3fe7c367c 100644 --- a/EXTERNAL_HEADERS/mach-o/nlist.h +++ b/EXTERNAL_HEADERS/mach-o/nlist.h @@ -81,6 +81,20 @@ struct nlist { unsigned long n_value; /* value of this symbol (or stab offset) */ }; +/* + * This is the symbol table entry structure for 64-bit architectures. + */ +struct nlist_64 { + union { + uint32_t n_strx; /* index into the string table */ + } n_un; + uint8_t n_type; /* type flag, see below */ + uint8_t n_sect; /* section number or NO_SECT */ + uint16_t n_desc; /* see <mach-o/stab.h> */ + uint64_t n_value; /* value of this symbol (or stab offset) */ +}; + + /* * Symbols with a index into the string table of zero (n_un.n_strx == 0) are * defined to have a null, "", name. Therefore all string indexes to non null diff --git a/EXTERNAL_HEADERS/machine/Makefile b/EXTERNAL_HEADERS/machine/Makefile new file mode 100644 index 000000000..f26245e7c --- /dev/null +++ b/EXTERNAL_HEADERS/machine/Makefile @@ -0,0 +1,30 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + + +include $(MakeInc_cmd) +include $(MakeInc_def) + +INSTINC_SUBDIRS = + +INSTINC_SUBDIRS_PPC = + +INSTINC_SUBDIRS_I386 = + +EXPORT_FILES = \ + limits.h + +INSTALL_MI_LIST = + +INSTALL_MI_DIR = . 
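
[On the nlist_64 structure added above: n_value widens to 64 bits, and the name is stored indirectly as n_strx, an offset into a separate string table, with 0 defined to mean the null name "". A small sketch of the lookup — the struct is a mirror of the declaration above; the bounds handling is ours:]

    #include <stdint.h>

    struct nlist_64 {
        union { uint32_t n_strx; } n_un; /* index into string table */
        uint8_t  n_type;                 /* type flag */
        uint8_t  n_sect;                 /* section number or NO_SECT */
        uint16_t n_desc;                 /* see <mach-o/stab.h> */
        uint64_t n_value;                /* value (or stab offset) */
    };

    /*
     * Resolve a symbol's name.  n_strx == 0 means the empty name by
     * definition; an out-of-range offset is treated the same way here
     * rather than walking off the end of the table.
     */
    static const char *
    symbol_name(const struct nlist_64 *sym,
                const char *strtab, uint32_t strsize)
    {
        if (sym->n_un.n_strx == 0 || sym->n_un.n_strx >= strsize)
            return "";
        return strtab + sym->n_un.n_strx;
    }
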
+ +EXPORT_MI_LIST = ${EXPORT_FILES} + +EXPORT_MI_DIR = machine + +include $(MakeInc_rule) +include $(MakeInc_dir) + + diff --git a/EXTERNAL_HEADERS/ppc/Makefile b/EXTERNAL_HEADERS/ppc/Makefile new file mode 100644 index 000000000..843510dfd --- /dev/null +++ b/EXTERNAL_HEADERS/ppc/Makefile @@ -0,0 +1,27 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + + +include $(MakeInc_cmd) +include $(MakeInc_def) + +INSTINC_SUBDIRS_PPC = + +EXPORT_FILES = \ + _limits.h \ + limits.h + +INSTALL_MD_LIST = + +INSTALL_MD_DIR = + +EXPORT_MD_LIST = ${EXPORT_FILES} + +EXPORT_MD_DIR = ppc + +include $(MakeInc_rule) +include $(MakeInc_dir) + + diff --git a/bsd/dev/disk.h b/EXTERNAL_HEADERS/ppc/_limits.h similarity index 81% rename from bsd/dev/disk.h rename to EXTERNAL_HEADERS/ppc/_limits.h index 001bfb171..d512ec411 100644 --- a/bsd/dev/disk.h +++ b/EXTERNAL_HEADERS/ppc/_limits.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -19,5 +19,9 @@ * * @APPLE_LICENSE_HEADER_END@ */ +#ifndef _PPC__LIMITS_H_ +#define _PPC__LIMITS_H_ -#warning <dev/disk.h> is obsolete, please use <sys/disk.h> instead +#define __DARWIN_CLK_TCK 100 /* ticks per second */ + +#endif /* _PPC__LIMITS_H_ */ diff --git a/EXTERNAL_HEADERS/bsd/ppc/limits.h b/EXTERNAL_HEADERS/ppc/limits.h similarity index 98% rename from EXTERNAL_HEADERS/bsd/ppc/limits.h rename to EXTERNAL_HEADERS/ppc/limits.h index a5bc0b5df..69d6991dc 100644 --- a/EXTERNAL_HEADERS/bsd/ppc/limits.h +++ b/EXTERNAL_HEADERS/ppc/limits.h @@ -63,10 +63,14 @@ #ifndef _PPC_LIMITS_H_ #define _PPC_LIMITS_H_ +#include <ppc/_limits.h> + #define CHAR_BIT 8 /* number of bits in a char */ #define MB_LEN_MAX 6 /* Allow 31 bit UTF2 */ -#define CLK_TCK 100 /* ticks per second */ +#ifndef CLK_TCK +#define CLK_TCK __DARWIN_CLK_TCK /* ticks per second */ +#endif /* * According to ANSI (section 2.2.4.2), the values below must be usable by diff --git a/Makefile b/Makefile index d2938322b..2855bb34c 100644 --- a/Makefile +++ b/Makefile @@ -33,11 +33,11 @@ ALL_SUBDIRS = \ CONFIG_SUBDIRS = config -INSTINC_SUBDIRS = $(ALL_SUBDIRS) +INSTINC_SUBDIRS = $(ALL_SUBDIRS) EXTERNAL_HEADERS -INSTINC_SUBDIRS_PPC = $(INSTINC_SUBDIRS) +INSTINC_SUBDIRS_PPC = $(INSTINC_SUBDIRS) EXTERNAL_HEADERS -INSTINC_SUBDIRS_I386 = $(INSTINC_SUBDIRS) +INSTINC_SUBDIRS_I386 = $(INSTINC_SUBDIRS) EXTERNAL_HEADERS EXPINC_SUBDIRS = $(ALL_SUBDIRS) diff --git a/bsd/Makefile b/bsd/Makefile index 5a07086bc..7d5fb5325 100644 --- a/bsd/Makefile +++ b/bsd/Makefile @@ -24,6 +24,7 @@ INSTINC_SUBDIRS = \ nfs \ sys \ ufs \ + uuid \ vfs INSTINC_SUBDIRS_PPC = \ @@ -49,6 +50,7 @@ EXPINC_SUBDIRS = \ nfs \ sys \ ufs \ + uuid \ vfs \ vm diff --git a/bsd/bsm/Makefile b/bsd/bsm/Makefile index b1e6c12ab..ec3b7ceb4 100644 --- a/bsd/bsm/Makefile +++ b/bsd/bsm/Makefile @@ -21,12 +21,15 @@ EXPINC_SUBDIRS_I386 = \ DATAFILES = \ audit.h audit_record.h audit_kevents.h +KERNFILES = \ + audit.h + INSTALL_MI_LIST = ${DATAFILES} INSTALL_MI_DIR = bsm -EXPORT_MI_LIST = ${DATAFILES} +EXPORT_MI_LIST = ${KERNFILES} EXPORT_MI_DIR = bsm diff --git a/bsd/bsm/audit.h b/bsd/bsm/audit.h index 4c48cff9f..7ee808102 100644 --- a/bsd/bsm/audit.h +++ b/bsd/bsm/audit.h @@ -25,9 +25,8 @@ #define _BSM_AUDIT_H #include <sys/queue.h> -#include <sys/ucred.h> +#include <sys/types.h> 
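
[The i386 and ppc limits.h hunks above stop hard-coding CLK_TCK and instead route it through __DARWIN_CLK_TCK, which the new <machine/_limits.h> headers own, guarded so that a prior definition wins. The effect in miniature — macro names as in the patch, scaffolding ours:]

    /* What <i386/_limits.h> and <ppc/_limits.h> now provide: */
    #define __DARWIN_CLK_TCK 100           /* ticks per second */

    /* What the limits.h wrappers do with it: a default, not a mandate. */
    #ifndef CLK_TCK
    #define CLK_TCK __DARWIN_CLK_TCK
    #endif

    #include <stdio.h>

    int
    main(void)
    {
        /* A build that defined CLK_TCK before this point keeps its own
         * value; otherwise the machine default applies. */
        printf("CLK_TCK = %d\n", CLK_TCK);
        return 0;
    }
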
#include <sys/param.h> -#include <sys/ipc.h> #include <sys/socket.h> #include <sys/cdefs.h> @@ -45,7 +44,7 @@ /* * Pre-defined audit IDs */ -#define AU_DEFAUDITID -1 +#define AU_DEFAUDITID ((uid_t)-1) /* * Define the masks for the classes of audit events. diff --git a/bsd/bsm/audit_kernel.h b/bsd/bsm/audit_kernel.h index 9dc1de446..5719f3107 100644 --- a/bsd/bsm/audit_kernel.h +++ b/bsd/bsm/audit_kernel.h @@ -29,8 +29,8 @@ #include <bsm/audit.h> #include <sys/sysctl.h> -#include <sys/eventvar.h> #include <sys/user.h> +#include <sys/ipc.h> /* * Audit subsystem condition flags. The audit_enabled flag is set and @@ -211,8 +211,8 @@ struct audit_record { int ar_arg_svipc_id; void * ar_arg_svipc_addr; struct posix_ipc_perm ar_arg_pipc_perm; - mach_port_t ar_arg_mach_port1; - mach_port_t ar_arg_mach_port2; + mach_port_name_t ar_arg_mach_port1; + mach_port_name_t ar_arg_mach_port2; union auditon_udata ar_arg_auditon; }; @@ -265,12 +265,12 @@ int kau_close(struct au_record *rec, struct timespec *endtime, short event); void kau_free(struct au_record *rec); void kau_init(void); -token_t *kau_to_file(char *file, struct timeval *tv); -token_t *kau_to_header(struct timespec *ctime, int rec_size, +token_t *kau_to_file(const char *file, const struct timeval *tv); +token_t *kau_to_header(const struct timespec *ctime, int rec_size, au_event_t e_type, au_emod_t e_mod); -token_t *kau_to_header32(struct timespec *ctime, int rec_size, +token_t *kau_to_header32(const struct timespec *ctime, int rec_size, au_event_t e_type, au_emod_t e_mod); -token_t *kau_to_header64(struct timespec *ctime, int rec_size, +token_t *kau_to_header64(const struct timespec *ctime, int rec_size, au_event_t e_type, au_emod_t e_mod); /* * The remaining kernel functions are conditionally compiled in as they @@ -278,17 +278,17 @@ token_t *kau_to_header64(struct timespec *ctime, int rec_size, * the source tree where these functions are referenced. 
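
[One small but consequential change above: AU_DEFAUDITID goes from a bare -1 to ((uid_t)-1). Since uid_t is unsigned, the cast makes comparisons against stored audit IDs well-defined and free of signed/unsigned-mixing warnings, instead of leaning on implicit conversion. A toy illustration — uid_t comes from <sys/types.h>; the rest is ours:]

    #include <sys/types.h>
    #include <stdio.h>

    #define AU_DEFAUDITID ((uid_t)-1)   /* as in the patched bsm/audit.h */

    int
    main(void)
    {
        uid_t auid = AU_DEFAUDITID;     /* "no audit ID assigned" marker */

        /* With the cast, both operands are uid_t: no implicit
         * sign conversion, no -Wsign-compare noise. */
        if (auid == AU_DEFAUDITID)
            printf("audit ID unset (0x%lx)\n", (unsigned long)auid);
        return 0;
    }
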
*/ #ifdef AUDIT -void audit_arg_addr(void * addr); -void audit_arg_len(int len); +void audit_arg_addr(user_addr_t addr); +void audit_arg_len(user_size_t len); void audit_arg_fd(int fd); void audit_arg_fflags(int fflags); void audit_arg_gid(gid_t gid, gid_t egid, gid_t rgid, gid_t sgid); void audit_arg_uid(uid_t uid, uid_t euid, uid_t ruid, uid_t suid); -void audit_arg_groupset(gid_t *gidset, u_int gidset_size); -void audit_arg_login(char *login); -void audit_arg_ctlname(int *name, int namelen); +void audit_arg_groupset(const gid_t *gidset, u_int gidset_size); +void audit_arg_login(const char *login); +void audit_arg_ctlname(const int *name, int namelen); void audit_arg_mask(int mask); void audit_arg_mode(mode_t mode); void audit_arg_dev(int dev); @@ -302,22 +302,23 @@ void audit_arg_socket(int sodomain, int sotype, void audit_arg_sockaddr(struct proc *p, struct sockaddr *so); void audit_arg_auid(uid_t auid); -void audit_arg_auditinfo(struct auditinfo *au_info); +void audit_arg_auditinfo(const struct auditinfo *au_info); void audit_arg_upath(struct proc *p, char *upath, u_int64_t flags); void audit_arg_vnpath(struct vnode *vp, u_int64_t flags); -void audit_arg_text(char *text); +void audit_arg_vnpath_withref(struct vnode *vp, u_int64_t flags); +void audit_arg_text(const char *text); void audit_arg_cmd(int cmd); void audit_arg_svipc_cmd(int cmd); -void audit_arg_svipc_perm(struct ipc_perm *perm); +void audit_arg_svipc_perm(const struct ipc_perm *perm); void audit_arg_svipc_id(int id); void audit_arg_svipc_addr(void *addr); void audit_arg_posix_ipc_perm(uid_t uid, gid_t gid, mode_t mode); -void audit_arg_auditon(union auditon_udata *udata); -void audit_arg_file(struct proc *p, struct file *fp); -void audit_arg_mach_port1(mach_port_t port); -void audit_arg_mach_port2(mach_port_t port); +void audit_arg_auditon(const union auditon_udata *udata); +void audit_arg_file(struct proc *p, const struct fileproc *fp); +void audit_arg_mach_port1(mach_port_name_t port); +void audit_arg_mach_port2(mach_port_name_t port); void audit_sysclose(struct proc *p, int fd); @@ -347,7 +348,7 @@ void audit_proc_free(struct proc *p); * possible that an audit record was begun before auditing was turned off. 
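
[The prototype changes above are of two kinds: pointers that are only read become const, and user-supplied addresses and lengths move from void*/int to user_addr_t/user_size_t, which stay 64 bits wide even in a 32-bit kernel so a 64-bit process's pointers survive the syscall boundary intact. A hedged sketch of why the fixed-width type matters — the typedefs below are illustrative stand-ins, not xnu's actual definitions (those live in <sys/types.h>):]

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-ins: the point is a fixed 64-bit integer, not a pointer. */
    typedef uint64_t user_addr_t;
    typedef uint64_t user_size_t;

    /* An audit hook can record a 64-bit user address faithfully ... */
    static void
    audit_arg_addr_sketch(user_addr_t addr)
    {
        printf("auditing user address 0x%016llx\n",
               (unsigned long long)addr);
    }

    int
    main(void)
    {
        /* ... even where a kernel void* is only 32 bits wide and
         * would silently truncate this value. */
        user_addr_t high = 0x00007fff5fbff000ULL;

        audit_arg_addr_sketch(high);
        return 0;
    }

[The AUDIT_SYSCALL_EXIT / AUDIT_MACH_SYSCALL_EXIT hunks just after this continue the same cleanup: the bitwise | becomes a short-circuiting ||, and the macro-local variable is renamed __uthread, presumably so it cannot collide with an identically named identifier at the expansion site.]
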
*/ #define AUDIT_SYSCALL_EXIT(error, proc, uthread) do { \ - if (audit_enabled | (uthread->uu_ar != NULL)) { \ + if (audit_enabled || (uthread->uu_ar != NULL)) { \ audit_syscall_exit(error, proc, uthread); \ } \ } while (0) @@ -363,9 +364,9 @@ void audit_proc_free(struct proc *p); } while (0) #define AUDIT_MACH_SYSCALL_EXIT(retval) do { \ - struct uthread *uthread = get_bsdthread_info(current_act()); \ - if (audit_enabled | (uthread->uu_ar != NULL)) { \ - audit_mach_syscall_exit(retval, uthread); \ + struct uthread *__uthread = get_bsdthread_info(current_thread()); \ + if (audit_enabled || (__uthread->uu_ar != NULL)) { \ + audit_mach_syscall_exit(retval, __uthread); \ } \ } while (0) diff --git a/bsd/bsm/audit_klib.h b/bsd/bsm/audit_klib.h index 9725885a8..00730f6a7 100644 --- a/bsd/bsm/audit_klib.h +++ b/bsd/bsm/audit_klib.h @@ -38,13 +38,21 @@ token_t *kau_to_socket(struct socket_au_info *soi); token_t *kau_to_attr32(struct vnode_au_info *vni); token_t *kau_to_attr64(struct vnode_au_info *vni); +int auditon_command_event(int cmd); int au_preselect(au_event_t event, au_mask_t *mask_p, int sorf); au_event_t flags_and_error_to_openevent(int oflags, int error); -void au_evclassmap_init(); +au_event_t ctlname_to_sysctlevent(int name[], uint64_t valid_arg); +au_event_t msgctl_to_event(int cmd); +au_event_t semctl_to_event(int cmd); +void au_evclassmap_init(void); void au_evclassmap_insert(au_event_t event, au_class_t class); au_class_t au_event_class(au_event_t event); int canon_path(struct proc *p, char *path, char *cpath); + + + + /* * Define a system call to audit event mapping table. */ diff --git a/bsd/bsm/audit_record.h b/bsd/bsm/audit_record.h index 3a64eabd0..da2c96388 100644 --- a/bsd/bsm/audit_record.h +++ b/bsd/bsm/audit_record.h @@ -26,7 +26,7 @@ #include <sys/cdefs.h> #include <sys/vnode.h> -#include <sys/ipc.h> +#include <sys/types.h> #include <sys/un.h> #include <sys/event.h> #include <netinet/in_systm.h> @@ -52,19 +52,19 @@ #define ADD_U_INT16(loc, val) \ do { \ - memcpy(loc, (u_char *)&val, sizeof(u_int16_t));\ + memcpy(loc, (const u_char *)&val, sizeof(u_int16_t));\ loc += sizeof(u_int16_t); \ }while(0) #define ADD_U_INT32(loc, val) \ do { \ - memcpy(loc, (u_char *)&val, sizeof(u_int32_t));\ + memcpy(loc, (const u_char *)&val, sizeof(u_int32_t));\ loc += sizeof(u_int32_t); \ }while(0) #define ADD_U_INT64(loc, val)\ do {\ - memcpy(loc, (u_char *)&val, sizeof(u_int64_t));\ + memcpy(loc, (const u_char *)&val, sizeof(u_int64_t));\ loc += sizeof(u_int64_t); \ }while(0) @@ -243,9 +243,9 @@ token_t *au_to_me(void); token_t *au_to_arg(char n, char *text, u_int32_t v); token_t *au_to_arg32(char n, char *text, u_int32_t v); token_t *au_to_arg64(char n, char *text, u_int64_t v); -token_t *au_to_attr(struct vattr *attr); -token_t *au_to_attr32(struct vattr *attr); -token_t *au_to_attr64(struct vattr *attr); +token_t *au_to_attr(struct vnode_attr *attr); +token_t *au_to_attr32(struct vnode_attr *attr); +token_t *au_to_attr64(struct vnode_attr *attr); token_t *au_to_data(char unit_print, char unit_type, char unit_count, char *p); token_t *au_to_exit(int retval, int err); diff --git a/bsd/conf/MASTER b/bsd/conf/MASTER index 10ac705f2..90cb6ba5c 100644 --- a/bsd/conf/MASTER +++ b/bsd/conf/MASTER @@ -115,7 +115,6 @@ options NORMA_VM # NORMA virtual memory support # <norma_vm> options NORMA_TASK # NORMA task support # <norma_task> options NORMA_ETHER # NORMA across ethernet # <norma_ether> options SIMPLE_CLOCK # don't assume fixed tick # <simple_clock> -options STAT_TIME # Use statistical 
timing # <!timing> options XPR_DEBUG # kernel tracing # <xpr_debug> options KDEBUG # kernel tracing # <kdebug> options DDM_DEBUG # driverkit-style tracing # <ddm_debug> @@ -133,12 +132,16 @@ options ROUTING # routing # <routing> options TPIP # # <tpip> options TUN # # <tun> options VLAN # # <vlan> +options BOND # # <bond> +options NETMIBS # # <netmibs> options IPDIVERT # Divert sockets (for NAT) # <ipdivert> options IPFIREWALL # IP Firewalling (used by NAT) # <ipfirewall> #options IPFIREWALL_VERBOSE # # <ipfirewall> options IPFIREWALL_FORWARD #Transparent proxy # <ipfirewall> options IPFIREWALL_DEFAULT_TO_ACCEPT # allow everything by default # <ipfirewall> -options IPFIREWALL_KEXT # Kernel extension # <ipfirewall> +#options IPFIREWALL_KEXT # Kernel extension # <ipfirewall> +options DUMMYNET # dummynet support # <dummynet> +options IPFW2 # IP firewall (new version) # <ipfw2> options MULTICAST # Internet Protocol Class-D $ options TCPDEBUG # TCP debug # <tcpdebug> options RANDOM_IP_ID # random (not sequential) ip ids # <randomipid> @@ -150,7 +153,8 @@ options AUDIT # Security event auditing # <audit> # # 4.4 general kernel # -options COMPAT_43 # 4.3 BSD compatibility # <compat_43> +options COMPAT_43_TTY # 4.3 BSD tty compat # <compat_43_tty> +options COMPAT_43_SOCKET # 4.3 BSD socket compat # <compat_43_socket> options DIAGNOSTIC # diagnostics # <diagnostic> options KTRACE # ktrace support # <ktrace> options GPROF # build profiling # <profile> @@ -202,8 +206,8 @@ options "INET6" # kernel IPv6 Support # <inet6> options IPSEC # IP security # <ipsec> options IPSEC_ESP # IP security # <ipsec> options "IPV6FIREWALL" # IPv6 Firewall Feature # <ipv6firewall> -options "IPV6FIREWALL_DEFAULT_TO_ACCEPT" #IPv6 Firewall Feature # <ipv6firewall> -options "IPV6FIREWALL_VERBOSE" #IPv6 Firewall Feature # <ipv6firewall> +options "IPV6FIREWALL_DEFAULT_TO_ACCEPT" #IPv6 Firewall Feature # <ipv6firewall> +#options "IPV6FIREWALL_VERBOSE" #IPv6 Firewall Feature # <ipv6firewall> pseudo-device gif 1 # <gif> pseudo-device dummy 2 # <dummy> @@ -258,6 +262,10 @@ pseudo-device mdevdevice 1 init mdevinit # pseudo-device bpfilter 4 init bpf_init +# +# fsevents device +pseudo-device fsevents 1 init fsevents_init + # # shim to "linux" mach disk drivers (mach drivers must also be turned on) # diff --git a/bsd/conf/MASTER.i386 b/bsd/conf/MASTER.i386 index d1fdbe7a7..6c9d460bc 100644 --- a/bsd/conf/MASTER.i386 +++ b/bsd/conf/MASTER.i386 @@ -45,9 +45,9 @@ # Standard Apple Research Configurations: # -------- ----- -------- --------------- # -# RELEASE = [intel pc mach medium event vol pst gdb kernobjc libdriver fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug nfsclient nfsserver quota fifo fdesc union ffs cd9660 compat_43 volfs devfs synthfs revfs hfs mrouting ipdivert ipfirewall inet6 ipsec gif tcpdrop_synfin ktrace stf vlan] +# RELEASE = [intel pc mach medium event vol pst gdb kernobjc libdriver fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug nfsclient nfsserver quota fifo fdesc union ffs cd9660 compat_oldsock volfs devfs revfs hfs mrouting ipdivert ipfirewall ipfw2 dummynet ipv6firewall inet6 ipsec gif tcpdrop_synfin ktrace stf compat_43_tty compat_43_socket vlan bond netmibs] # PROFILE = [RELEASE profile] -# DEBUG = [intel pc mach medium event vol pst gdb kernobjc libdriver_g fixpri debug simple_clock mdebug kernserv driverkit xpr_debug uxpr kernstack ipc_compat ipc_debug nfsclient nfsserver quota fifo fdesc union ffs cd9660 compat_43 revfs hfs volfs devfs 
synthfs mach_assert mrouting ipdivert ipfirewall inet6 ipsec gif tcpdrop_synfin ktrace stf vlan] +# DEBUG = [intel pc mach medium event vol pst gdb kernobjc libdriver_g fixpri debug simple_clock mdebug kernserv driverkit xpr_debug uxpr kernstack ipc_compat ipc_debug nfsclient nfsserver quota fifo fdesc union ffs cd9660 compat_oldsock revfs hfs volfs devfs mach_assert mrouting ipdivert ipfirewall ipfw2 dummynet ipv6firewall inet6 ipsec gif tcpdrop_synfin ktrace stf compat_43_tty compat_43_socket vlan bond netmibs] # ###################################################################### # diff --git a/bsd/conf/MASTER.ppc b/bsd/conf/MASTER.ppc index 8aea2f9d4..ee3e6839b 100644 --- a/bsd/conf/MASTER.ppc +++ b/bsd/conf/MASTER.ppc @@ -45,10 +45,10 @@ # Standard Apple Research Configurations: # -------- ----- -------- --------------- # -# RELEASE = [ppc mach medium vol pst gdb simple_clock kernstack nfsclient nfsserver quota fifo fdesc union ffs cd9660 compat_43 revfs noprofiling hfs volfs devfs synthfs netat mrouting ipdivert ipfirewall ktrace inet6 ipsec tcpdrop_synfin gif stf vlan] +# RELEASE = [ppc mach medium vol pst gdb simple_clock kernstack nfsclient nfsserver quota fifo fdesc union ffs cd9660 compat_oldsock revfs noprofiling hfs volfs devfs netat mrouting ipdivert ipfirewall ipfw2 dummynet ktrace inet6 ipv6firewall ipsec tcpdrop_synfin gif stf compat_43_tty compat_43_socket vlan bond netmibs] # RELEASE_TRACE = [RELEASE kdebug] -# PROFILE = [ppc mach medium vol pst gdb simple_clock kernstack nfsclient nfsserver quota fifo fdesc union ffs cd9660 compat_43 revfs profile hfs volfs devfs synthfs netat mrouting ipdivert ipfirewall ktrace inet6 ipsec tcpdrop_synfin gif stf vlan] -# DEBUG = [ppc mach medium vol pst gdb debug simple_clock kernstack nfsclient nfsserver quota fifo fdesc union ffs cd9660 compat_43 revfs profiling hfs volfs devfs synthfs netat mrouting mach_assert ipdivert ipfirewall ktrace inet6 ipsec tcpdrop_synfin gif stf vlan] +# PROFILE = [ppc mach medium vol pst gdb simple_clock kernstack nfsclient nfsserver quota fifo fdesc union ffs cd9660 compat_oldsock revfs profile hfs volfs devfs netat mrouting ipdivert ipfirewall ipfw2 dummynet ktrace inet6 ipv6firewall ipsec tcpdrop_synfin gif stf compat_43_tty compat_43_socket vlan bond] +# DEBUG = [ppc mach medium vol pst gdb debug simple_clock kernstack nfsclient nfsserver quota fifo fdesc union ffs cd9660 compat_oldsock revfs profiling hfs volfs devfs netat mrouting mach_assert ipdivert ipfirewall ipfw2 dummynet ktrace inet6 ipv6firewall ipsec tcpdrop_synfin gif stf compat_43_tty compat_43_socket vlan bond netmibs] # DEBUG_TRACE = [DEBUG kdebug] # ###################################################################### diff --git a/bsd/conf/Makefile b/bsd/conf/Makefile index 3fbb79f00..b4b5a7d2f 100644 --- a/bsd/conf/Makefile +++ b/bsd/conf/Makefile @@ -3,6 +3,10 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir +export vnode_pager.o_CFLAGS_ADD=-Werror +export vm_unix.o_CFLAGS_ADD=-Werror +export dp_backing_file.o_CFLAGS_ADD=-Werror +export if_mib.o_CFLAGS_ADD=-Wno-unused-parameter include $(MakeInc_cmd) include $(MakeInc_def) diff --git a/bsd/conf/Makefile.i386 b/bsd/conf/Makefile.i386 index 2f6232c14..0b3a6a0d1 100644 --- a/bsd/conf/Makefile.i386 +++ b/bsd/conf/Makefile.i386 @@ -2,6 +2,300 @@ #BEGIN Machine dependent Makefile fragment for i386 ###################################################################### +# Enable 
-Werror for i386 builds +CFLAGS+=$(WERROR) +CWARNFLAGS= $(filter-out -Wbad-function-cast, $(CWARNFLAGS_STD)) + +# Objects that don't compile cleanly: +OBJS_NO_WERROR = \ + ioconf.o \ + aescrypt.o \ + aeskey.o \ + des_setkey.o \ + sha2.o \ + shadow.o \ + vn.o \ + MacOSStubs.o \ + hfs_attrlist.o \ + hfs_btreeio.o \ + hfs_catalog.o \ + hfs_chash.o \ + hfs_cnode.o \ + hfs_encodinghint.o \ + hfs_encodings.o \ + hfs_endian.o \ + hfs_hotfiles.o \ + hfs_link.o \ + hfs_lookup.o \ + hfs_notification.o \ + hfs_quota.o \ + hfs_readwrite.o \ + hfs_search.o \ + hfs_vfsops.o \ + hfs_vfsutils.o \ + hfs_vnops.o \ + hfs_xattr.o \ + BTree.o \ + BTreeAllocate.o \ + BTreeMiscOps.o \ + BTreeNodeOps.o \ + BTreeNodeReserve.o \ + BTreeScanner.o \ + BTreeTreeOps.o \ + CatalogUtilities.o \ + FileIDsServices.o \ + BTreeWrapper.o \ + FileExtentMapping.o \ + VolumeAllocation.o \ + UnicodeWrappers.o \ + cd9660_bmap.o \ + cd9660_lookup.o \ + cd9660_node.o \ + cd9660_rrip.o \ + cd9660_util.o \ + cd9660_vfsops.o \ + cd9660_vnops.o \ + bsd_init.o \ + bsd_stubs.o \ + kdebug.o \ + kern_acct.o \ + kern_aio.o \ + kern_audit.o \ + kern_authorization.o \ + kern_bsm_audit.o \ + kern_bsm_klib.o \ + kern_bsm_token.o \ + kern_clock.o \ + kern_control.o \ + kern_core.o \ + kern_credential.o \ + kern_descrip.o \ + kern_event.o \ + kern_exec.o \ + kern_exit.o \ + kern_fork.o \ + kern_ktrace.o \ + kern_lock.o \ + kern_malloc.o \ + kern_mib.o \ + kern_mman.o \ + kern_newsysctl.o \ + kern_panicinfo.o \ + kern_pcsamples.o \ + kern_physio.o \ + kern_prot.o \ + kern_resource.o \ + kern_shutdown.o \ + kern_sig.o \ + kern_symfile.o \ + kern_synch.o \ + kern_sysctl.o \ + kern_time.o \ + kern_xxx.o \ + kpi_mbuf.o \ + kpi_socket.o \ + kpi_socketfilter.o \ + mach_fat.o \ + mach_header.o \ + mach_loader.o \ + mach_process.o \ + netboot.o \ + posix_sem.o \ + posix_shm.o \ + qsort.o \ + spl.o \ + subr_log.o \ + subr_prf.o \ + subr_prof.o \ + subr_xxx.o \ + sys_domain.o \ + sys_generic.o \ + sys_socket.o \ + sysctl_init.o \ + sysv_ipc.o \ + sys_pipe.o \ + sysv_sem.o \ + sysv_shm.o \ + tty.o \ + tty_compat.o \ + tty_conf.o \ + tty_pty.o \ + tty_subr.o \ + tty_tty.o \ + ubc_subr.o \ + uipc_domain.o \ + uipc_mbuf.o \ + uipc_mbuf2.o \ + uipc_proto.o \ + uipc_socket.o \ + uipc_socket2.o \ + uipc_syscalls.o \ + uipc_usrreq.o \ + random.o \ + dead_vnops.o \ + devfs_tree.o \ + devfs_vfsops.o \ + devfs_vnops.o \ + fdesc_vfsops.o \ + fdesc_vnops.o \ + fifo_vnops.o \ + spec_vnops.o \ + synthfs_util.o \ + synthfs_vfsops.o \ + synthfs_vnops.o \ + union_subr.o \ + union_vfsops.o \ + union_vnops.o \ + volfs_vfsops.o \ + volfs_vnops.o \ + bpf.o \ + dlil.o \ + ether_at_pr_module.o \ + ether_if_module.o \ + ether_inet6_pr_module.o \ + ether_inet_pr_module.o \ + if.o \ + if_bond.o \ + if_ethersubr.o \ + if_gif.o \ + if_loop.o \ + if_media.o \ + if_stf.o \ + if_vlan.o \ + kext_net.o \ + kpi_interface.o \ + kpi_protocol.o \ + ndrv.o \ + netisr.o \ + net_osdep.o \ + radix.o \ + raw_usrreq.o \ + route.o \ + rtsock.o \ + zlib.o \ + dhcp_options.o \ + if_ether.o \ + igmp.o \ + in.o \ + in_bootp.o \ + in_cksum.o \ + ip_fw2.o \ + ip_fw2_compat.o \ + kpi_ipfilter.o \ + in_gif.o \ + in_pcb.o \ + in_proto.o \ + in_rmx.o \ + ip_divert.o \ + ip_dummynet.o \ + ip_encap.o \ + ip_flow.o \ + ip_icmp.o \ + ip_input.o \ + ip_mroute.o \ + ip_output.o \ + raw_ip.o \ + tcp_input.o \ + tcp_output.o \ + tcp_subr.o \ + tcp_timer.o \ + tcp_usrreq.o \ + udp_usrreq.o \ + ah_core.o \ + ah_input.o \ + ah_output.o \ + dest6.o \ + esp_core.o \ + esp_input.o \ + esp_output.o \ + esp_rijndael.o \ + 
frag6.o \ + icmp6.o \ + in6.o \ + in6_cksum.o \ + in6_gif.o \ + in6_ifattach.o \ + in6_pcb.o \ + in6_prefix.o \ + in6_proto.o \ + in6_rmx.o \ + in6_src.o \ + ip6_forward.o \ + ip6_fw.o \ + ip6_input.o \ + ip6_mroute.o \ + ip6_output.o \ + ipcomp_core.o \ + ipcomp_input.o \ + ipcomp_output.o \ + ipsec.o \ + mld6.o \ + nd6.o \ + nd6_nbr.o \ + nd6_rtr.o \ + raw_ip6.o \ + route6.o \ + scope6.o \ + udp6_output.o \ + udp6_usrreq.o \ + key.o \ + keydb.o \ + keysock.o \ + krpc_subr.o \ + nfs_bio.o \ + nfs_boot.o \ + nfs_node.o \ + nfs_nqlease.o \ + nfs_socket.o \ + nfs_srvcache.o \ + nfs_subs.o \ + nfs_syscalls.o \ + nfs_vfsops.o \ + nfs_vnops.o \ + ffs_alloc.o \ + ffs_balloc.o \ + ffs_inode.o \ + ffs_subr.o \ + ffs_vfsops.o \ + ffs_vnops.o \ + ufs_attrlist.o \ + ufs_bmap.o \ + ufs_byte_order.o \ + ufs_ihash.o \ + ufs_inode.o \ + ufs_lockf.o \ + ufs_lookup.o \ + ufs_quota.o \ + ufs_readwrite.o \ + ufs_vfsops.o \ + ufs_vnops.o \ + ux_exception.o \ + vfs_bio.o \ + vfs_cache.o \ + vfs_cluster.o \ + vfs_conf.o \ + vfs_fsevents.o \ + vfs_init.o \ + vfs_journal.o \ + vfs_lookup.o \ + vfs_quota.o \ + vfs_subr.o \ + vfs_support.o \ + vfs_syscalls.o \ + vfs_utfconv.o \ + vfs_vnops.o \ + vfs_xattr.o \ + kpi_vfs.o \ + vnode_if.o \ + sysctl.o \ + unix_startup.o \ + memdev.o \ + init_sysent.o + +OBJS_WERROR=$(filter-out $(OBJS_NO_WERROR),$(OBJS)) + +$(OBJS_WERROR): WERROR=-Werror + ###################################################################### #END Machine dependent Makefile fragment for i386 ###################################################################### diff --git a/bsd/conf/Makefile.template b/bsd/conf/Makefile.template index cd0355d62..7ba478229 100644 --- a/bsd/conf/Makefile.template +++ b/bsd/conf/Makefile.template @@ -1,3 +1,25 @@ +# +# Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. +# +# @APPLE_LICENSE_HEADER_START@ +# +# The contents of this file constitute Original Code as defined in and +# are subject to the Apple Public Source License Version 1.1 (the +# "License"). You may not use this file except in compliance with the +# License. Please obtain a copy of the License at +# http://www.apple.com/publicsource and read it before using this file. +# +# This Original Code and all software distributed under the License are +# distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +# EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +# INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +# License for the specific language governing rights and limitations +# under the License. +# +# @APPLE_LICENSE_HEADER_END@ +# + # # Mach Operating System # Copyright (c) 1986 Carnegie-Mellon University @@ -24,7 +46,7 @@ include $(MakeInc_def) CFLAGS+= -imacros meta_features.h -DARCH_PRIVATE -DKERNEL -DDRIVER_PRIVATE \ -D_KERNEL_BUILD -DKERNEL_BUILD -DMACH_KERNEL -DBSD_BUILD \ -DBSD_KERNEL_PRIVATE -DNCPUS=1 -Wno-four-char-constants -fpascal-strings \ - -D__APPLE__ -I. + -D__APPLE__ -DLP64KERN=1 -DLP64_DEBUG=0 -I. # XXX: ld flags for bsd.o export LDFLAGS_COMPONENT += -keep_private_externs @@ -40,14 +62,6 @@ COMP_SUBDIRS = # .PRECIOUS: Makefile -VERSION_FILES= \ - $(SOURCE_DIR)/$(COMPONENT)/conf/version.major \ - $(SOURCE_DIR)/$(COMPONENT)/conf/version.minor \ - $(SOURCE_DIR)/$(COMPONENT)/conf/version.variant - -COPYRIGHT_FILES = \ - $(SOURCE_DIR)/$(COMPONENT)/conf/copyright - # # Theses macros are filled in by the config program depending on the # current configuration. 
The MACHDEP macro is replaced by the @@ -90,12 +104,8 @@ LDOBJS = $(OBJS) $(COMPONENT).o: $(LDOBJS) @echo "[ creating $(COMPONENT).o ]" - $(RM) $(RMFLAGS) vers.c - $(COMPOBJROOT)/newvers \ - `$(CAT) ${VERSION_FILES}` ${COPYRIGHT_FILES} - ${KCC} $(CFLAGS) $(INCLUDES) -c vers.c @echo [ updating $(COMPONENT).o ${BSD_KERNEL_CONFIG} ] - $(LD) $(LDFLAGS_COMPONENT) -o $(COMPONENT).o ${LDOBJS} vers.o + $(LD) $(LDFLAGS_COMPONENT) -o $(COMPONENT).o ${LDOBJS} do_depend: do_all ${MD} -u Makedep -f -d `ls *.d`; diff --git a/bsd/conf/files b/bsd/conf/files index 1a4bd5ee8..2b6779be7 100644 --- a/bsd/conf/files +++ b/bsd/conf/files @@ -37,7 +37,6 @@ OPTIONS/mach_xp optional mach_xp OPTIONS/mach_xp_fpd optional mach_xp_fpd OPTIONS/quota optional quota OPTIONS/simple_clock optional simple_clock -OPTIONS/stat_time optional stat_time OPTIONS/xpr_debug optional xpr_debug OPTIONS/kdebug optional kdebug OPTIONS/nfsclient optional nfsclient @@ -56,11 +55,14 @@ OPTIONS/norma_ether optional norma_ether OPTIONS/new_vm_code optional new_vm_code OPTIONS/old_vm_code optional old_vm_code OPTIONS/compat_43 optional compat_43 +OPTIONS/compat_43_tty optional compat_43_tty +OPTIONS/compat_43_socket optional compat_43_socket OPTIONS/diagnostic optional diagnostic OPTIONS/ktrace optional ktrace OPTIONS/profiling optional profiling OPTIONS/vndevice optional vndevice OPTIONS/audit optional audit +OPTIONS/fsevents optional fsevents # # Network options @@ -88,7 +90,9 @@ OPTIONS/fddi optional fddi OPTIONS/ipdivert optional ipdivert OPTIONS/dummynet optional dummynet +OPTIONS/ipfw2 optional ipfw2 OPTIONS/ipfirewall optional ipfirewall +OPTIONS/ipv6firewall optional ipv6firewall OPTIONS/tcpdebug optional tcpdebug OPTIONS/bridge optional bridge OPTIONS/faith optional faith @@ -117,17 +121,21 @@ bsd/dev/random/YarrowCoreLib/src/prng.c standard bsd/dev/random/YarrowCoreLib/src/sha1mod.c standard bsd/dev/random/YarrowCoreLib/src/yarrowUtils.c standard -bsd/dev/memdev.c standard +bsd/dev/memdev.c standard + +bsd/dev/unix_startup.c standard bsd/dev/vn/vn.c optional vndevice bsd/dev/vn/shadow.c optional vndevice +bsd/libkern/crc32.c standard bsd/libkern/random.c standard bsd/libkern/scanc.c standard bsd/libkern/skpc.c standard -bsd/libkern/inet_ntoa.c standard +bsd/libkern/inet_ntop.c standard bsd/libkern/bcd.c standard +bsd/vfs/vfs_attrlist.c standard bsd/vfs/vfs_bio.c standard bsd/vfs/vfs_cache.c standard bsd/vfs/vfs_cluster.c standard @@ -140,8 +148,12 @@ bsd/vfs/vfs_syscalls.c standard bsd/vfs/vfs_support.c standard bsd/vfs/vfs_utfconv.c standard bsd/vfs/vfs_vnops.c standard +bsd/vfs/vfs_xattr.c standard bsd/vfs/vnode_if.c standard +bsd/vfs/kpi_vfs.c standard bsd/vfs/vfs_journal.c standard +#bsd/vfs/vfs_fsevents.c optional fsevents +bsd/vfs/vfs_fsevents.c standard bsd/miscfs/deadfs/dead_vnops.c standard bsd/miscfs/fdesc/fdesc_vfsops.c optional fdesc @@ -151,7 +163,6 @@ bsd/miscfs/nullfs/null_subr.c optional nullfs bsd/miscfs/nullfs/null_vfsops.c optional nullfs bsd/miscfs/nullfs/null_vnops.c optional nullfs bsd/miscfs/specfs/spec_vnops.c standard -bsd/miscfs/specfs/spec_lockf.c standard bsd/miscfs/union/union_subr.c optional union bsd/miscfs/union/union_vfsops.c optional union bsd/miscfs/union/union_vnops.c optional union @@ -175,7 +186,7 @@ bsd/isofs/cd9660/cd9660_util.c optional cd9660 bsd/isofs/cd9660/cd9660_vfsops.c optional cd9660 bsd/isofs/cd9660/cd9660_vnops.c optional cd9660 -bsd/net/slcompress.c optional i4bipr +#bsd/net/slcompress.c optional i4bipr bsd/net/bpf.c optional bpfilter bsd/net/bpf_filter.c optional bpfilter 
bsd/net/bridge.c optional bridge @@ -183,19 +194,23 @@ bsd/net/bsd_comp.c optional ppp_bsdcomp bsd/net/if.c standard bsd/net/if_atmsubr.c optional atm bsd/net/if_disc.c optional disc +bsd/net/init.c standard bsd/net/dlil.c standard bsd/net/ether_if_module.c optional ether bsd/net/ether_at_pr_module.c optional ether bsd/net/ether_inet_pr_module.c optional ether bsd/net/ether_inet6_pr_module.c optional ether inet6 -bsd/net/if_ethersubr.c optional ether +#bsd/net/if_ethersubr.c optional ether bsd/net/if_loop.c optional loop -bsd/net/if_media.c standard -bsd/net/if_mib.c standard +#bsd/net/if_media.c standard +bsd/net/if_mib.c optional netmibs bsd/net/if_sl.c optional sl bsd/net/if_tun.c optional tun bsd/net/if_vlan.c optional vlan -bsd/net/kext_net.c standard +bsd/net/multicast_list.c standard +bsd/net/if_bond.c optional bond +bsd/net/devtimer.c optional bond +#bsd/net/kext_net.c standard bsd/net/ndrv.c standard bsd/net/ppp_deflate.c optional ppp_deflate bsd/net/radix.c standard @@ -206,7 +221,7 @@ bsd/net/rtsock.c standard bsd/net/slcompress.c optional ppp bsd/net/slcompress.c optional sl bsd/net/zlib.c optional ppp_deflate -bsd/net/netisr.c standard +#bsd/net/netisr.c standard bsd/net/zlib.c optional ipsec bsd/net/if_dummy.c optional dummy bsd/net/if_gif.c optional gif @@ -215,20 +230,25 @@ bsd/net/if_stf.c optional stf bsd/net/if_faith.c optional faith bsd/net/net_osdep.c optional ipsec bsd/net/net_osdep.c optional inet6 +bsd/net/kpi_interface.c standard +bsd/net/kpi_protocol.c standard +bsd/net/kpi_interfacefilter.c standard bsd/netinet/if_atm.c optional atm -bsd/netinet/if_ether.c optional ether bsd/netinet/igmp.c standard bsd/netinet/in.c standard bsd/netinet/in_bootp.c standard bsd/netinet/dhcp_options.c standard +bsd/netinet/in_arp.c standard bsd/netinet/in_pcb.c standard bsd/netinet/in_proto.c standard bsd/netinet/in_rmx.c standard bsd/netinet/ip_divert.c optional ipdivert bsd/netinet/ip_dummynet.c optional dummynet bsd/netinet/ip_flow.c standard +bsd/netinet/ip_fw2.c optional ipfw2 +bsd/netinet/ip_fw2_compat.c optional ipfw2 bsd/netinet/ip_icmp.c standard bsd/netinet/ip_id.c optional randomipid bsd/netinet/ip_input.c standard @@ -246,6 +266,7 @@ bsd/netinet/in_gif.c optional gif inet bsd/netinet/ip_ecn.c optional inet inet6 bsd/netinet/ip_ecn.c optional inet ipsec bsd/netinet/ip_encap.c optional inet +bsd/netinet/kpi_ipfilter.c standard bsd/netinet6/ah_core.c optional ipsec bsd/netinet6/ah_input.c optional ipsec bsd/netinet6/ah_output.c optional ipsec @@ -260,10 +281,11 @@ bsd/netinet6/icmp6.c optional inet6 bsd/netinet6/in6.c optional inet6 bsd/netinet6/in6_cksum.c optional inet6 bsd/netinet6/in6_gif.c optional gif inet6 +bsd/netinet6/ip6_fw.c optional inet6 bsd/netinet6/ip6_forward.c optional inet6 bsd/netinet6/in6_ifattach.c optional inet6 bsd/netinet6/ip6_input.c optional inet6 -bsd/netinet6/ip6_mroute.c optional inet6 +bsd/netinet6/ip6_mroute.c optional inet6 bsd/netinet6/ip6_output.c optional inet6 bsd/netinet6/in6_src.c optional inet6 bsd/netinet6/ipcomp_core.c optional ipsec @@ -299,8 +321,9 @@ bsd/crypto/des/des_setkey.c optional crypto bsd/crypto/blowfish/bf_enc.c optional crypto bsd/crypto/blowfish/bf_skey.c optional crypto bsd/crypto/cast128/cast128.c optional crypto -bsd/crypto/rijndael/rijndael-alg-fst.c optional crypto -bsd/crypto/rijndael/rijndael-api-fst.c optional crypto +bsd/crypto/aes/aescrypt.c optional crypto +bsd/crypto/aes/aeskey.c optional crypto +bsd/crypto/aes/aestab.c optional crypto bsd/crypto/rc4/rc4.c optional crypto #bsd/netpm/pm_aTT.c optional pm @@ 
-378,7 +401,6 @@ bsd/nfs/krpc_subr.c optional nfsclient bsd/nfs/nfs_bio.c optional nfsclient bsd/nfs/nfs_boot.c optional nfsclient bsd/nfs/nfs_node.c optional nfsclient -bsd/nfs/nfs_nqlease.c optional nfsclient nfsserver bsd/nfs/nfs_serv.c optional nfsserver bsd/nfs/nfs_socket.c optional nfsclient nfsserver bsd/nfs/nfs_srvcache.c optional nfsserver @@ -404,7 +426,6 @@ bsd/ufs/ufs/ufs_bmap.c standard bsd/ufs/ufs/ufs_byte_order.c optional rev_endian_fs bsd/ufs/ufs/ufs_ihash.c standard bsd/ufs/ufs/ufs_inode.c standard -bsd/ufs/ufs/ufs_lockf.c standard bsd/ufs/ufs/ufs_lookup.c standard bsd/ufs/ufs/ufs_quota.c optional quota bsd/ufs/ufs/ufs_vfsops.c standard @@ -420,7 +441,6 @@ bsd/hfs/hfs_encodings.c optional hfs bsd/hfs/hfs_endian.c optional hfs bsd/hfs/hfs_hotfiles.c optional hfs bsd/hfs/hfs_link.c optional hfs -bsd/hfs/hfs_lockf.c optional hfs bsd/hfs/hfs_lookup.c optional hfs bsd/hfs/hfs_notification.c optional hfs bsd/hfs/hfs_quota.c optional quota @@ -429,6 +449,7 @@ bsd/hfs/hfs_search.c optional hfs bsd/hfs/hfs_vfsops.c optional hfs bsd/hfs/hfs_vfsutils.c optional hfs bsd/hfs/hfs_vnops.c optional hfs +bsd/hfs/hfs_xattr.c optional hfs bsd/hfs/MacOSStubs.c optional hfs bsd/hfs/rangelist.c optional hfs bsd/hfs/hfscommon/BTree/BTree.c optional hfs @@ -438,8 +459,6 @@ bsd/hfs/hfscommon/BTree/BTreeNodeOps.c optional hfs bsd/hfs/hfscommon/BTree/BTreeNodeReserve.c optional hfs bsd/hfs/hfscommon/BTree/BTreeScanner.c optional hfs bsd/hfs/hfscommon/BTree/BTreeTreeOps.c optional hfs -bsd/hfs/hfscommon/Catalog/Catalog.c optional hfs -bsd/hfs/hfscommon/Catalog/CatalogIterators.c optional hfs bsd/hfs/hfscommon/Catalog/CatalogUtilities.c optional hfs bsd/hfs/hfscommon/Catalog/FileIDsServices.c optional hfs bsd/hfs/hfscommon/Misc/BTreeWrapper.c optional hfs @@ -451,19 +470,22 @@ bsd/kern/bsd_init.c standard bsd/kern/init_sysent.c standard bsd/kern/kdebug.c standard bsd/kern/kern_acct.c standard -bsd/kern/kern_aio.c standard +bsd/kern/kern_aio.c standard bsd/kern/kern_audit.c standard +bsd/kern/kern_authorization.c standard bsd/kern/kern_bsm_token.c standard bsd/kern/kern_bsm_audit.c standard bsd/kern/kern_bsm_klib.c standard bsd/kern/kern_clock.c standard bsd/kern/kern_core.c standard +bsd/kern/kern_credential.c standard bsd/kern/kern_symfile.c standard bsd/kern/kern_descrip.c standard bsd/kern/kern_event.c standard bsd/kern/kern_control.c standard bsd/kern/kern_exec.c standard bsd/kern/kern_exit.c standard +bsd/kern/kern_lockf.c standard bsd/kern/kern_fork.c standard bsd/kern/kern_ktrace.c standard bsd/kern/kern_lock.c optional cpus @@ -481,6 +503,7 @@ bsd/kern/kern_synch.c standard bsd/kern/kern_sysctl.c standard bsd/kern/kern_newsysctl.c standard bsd/kern/kern_mib.c standard +bsd/kern/kpi_mbuf.c standard bsd/kern/sysctl_init.c standard bsd/kern/kern_time.c standard bsd/kern/kern_xxx.c standard @@ -492,11 +515,12 @@ bsd/kern/subr_prf.c standard bsd/kern/subr_prof.c standard bsd/kern/subr_xxx.c standard bsd/kern/sys_generic.c standard +bsd/kern/sys_pipe.c standard bsd/kern/sys_socket.c standard bsd/kern/sys_domain.c standard bsd/kern/syscalls.c standard bsd/kern/tty.c standard -bsd/kern/tty_compat.c optional compat_43 +bsd/kern/tty_compat.c optional compat_43_tty bsd/kern/tty_conf.c standard bsd/kern/tty_pty.c optional pty bsd/kern/tty_subr.c standard @@ -513,6 +537,7 @@ bsd/kern/uipc_usrreq.c standard bsd/kern/sysv_ipc.c standard bsd/kern/sysv_shm.c standard bsd/kern/sysv_sem.c standard +bsd/kern/sysv_msg.c standard bsd/kern/mach_fat.c standard bsd/kern/mach_header.c standard bsd/kern/mach_loader.c 
standard @@ -520,6 +545,8 @@ bsd/kern/posix_sem.c standard bsd/kern/posix_shm.c standard # XXXdbg - I need this in the journaling and block cache code bsd/kern/qsort.c standard +bsd/kern/kpi_socket.c standard +bsd/kern/kpi_socketfilter.c standard bsd/vm/vnode_pager.c standard bsd/vm/vm_unix.c standard diff --git a/bsd/conf/files.i386 b/bsd/conf/files.i386 index 73da06e97..cc998565b 100644 --- a/bsd/conf/files.i386 +++ b/bsd/conf/files.i386 @@ -13,7 +13,6 @@ bsd/dev/i386/stubs.c standard bsd/dev/i386/lock_stubs.c standard bsd/dev/i386/sysctl.c standard bsd/dev/i386/unix_signal.c standard -bsd/dev/i386/unix_startup.c standard bsd/kern/bsd_stubs.c standard diff --git a/bsd/conf/files.ppc b/bsd/conf/files.ppc index d1b636110..36adc9a65 100644 --- a/bsd/conf/files.ppc +++ b/bsd/conf/files.ppc @@ -7,7 +7,6 @@ bsd/netinet/in_cksum.c standard bsd/dev/ppc/conf.c standard bsd/dev/ppc/cons.c standard bsd/dev/ppc/mem.c standard -bsd/dev/ppc/unix_startup.c standard bsd/dev/ppc/unix_signal.c standard bsd/dev/ppc/ffs.s standard bsd/dev/ppc/memmove.c standard @@ -18,6 +17,7 @@ bsd/dev/ppc/systemcalls.c standard bsd/dev/ppc/km.c standard bsd/dev/ppc/xsumas.s standard bsd/dev/ppc/sysctl.c standard +bsd/dev/ppc/munge.s standard bsd/dev/ppc/chud/chud_bsd_callback.c standard bsd/dev/ppc/chud/chud_process.c standard diff --git a/bsd/conf/param.c b/bsd/conf/param.c index 5016d2029..0aede52e9 100644 --- a/bsd/conf/param.c +++ b/bsd/conf/param.c @@ -63,8 +63,8 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/socket.h> -#include <sys/vnode.h> -#include <sys/file.h> +#include <sys/vnode_internal.h> +#include <sys/file_internal.h> #include <sys/callout.h> #include <sys/clist.h> #include <sys/mbuf.h> @@ -73,7 +73,7 @@ #include <sys/quota.h> #include <ufs/ufs/inode.h> #include <miscfs/fifofs/fifo.h> -#include <sys/shm.h> +#include <sys/shm_internal.h> #include <sys/aio_kern.h> struct timezone tz = { TIMEZONE, PST }; diff --git a/bsd/conf/tools/Makefile b/bsd/conf/tools/Makefile index 9df86ce8c..4f9ccd553 100644 --- a/bsd/conf/tools/Makefile +++ b/bsd/conf/tools/Makefile @@ -7,13 +7,9 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) -SETUP_SUBDIRS = \ - doconf \ - newvers +SETUP_SUBDIRS = doconf -COMP_SUBDIRS = \ - doconf \ - newvers +COMP_SUBDIRS = doconf INST_SUBDIRS = \ diff --git a/bsd/conf/tools/newvers/Makefile b/bsd/conf/tools/newvers/Makefile deleted file mode 100644 index 73603c753..000000000 --- a/bsd/conf/tools/newvers/Makefile +++ /dev/null @@ -1,49 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -COMP_SUBDIRS = \ - -INST_SUBDIRS = \ - - -# -# Who and where -# -BINDIR= -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT)/) -PROGRAM= $(DSTDIR)newvers - -# -# How to install it -# -IFLAGS= -c -m 555 - -$(PROGRAM): $(DSTDIR)% : $(SOURCE)%.csh - @echo "[ $(SOURCE) ] make setup_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)"; - -$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS - sed -e "s/#PROGRAM.*/#`vers_string $(notdir $(PROGRAM))`/" \ - < $< >$(notdir $(PROGRAM)).VERS; - install $(IFLAGS) $(notdir $(PROGRAM)).VERS $(PROGRAM); - -$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS; - -do_build_setup: $(PROGRAM) - -do_build_all: - @echo "[ $(SOURCE) ] make do_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - 
-setup_build_install: - @echo "[ $(SOURCE) ] make setup_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -do_build_install: - @echo "[ $(SOURCE) ] make do_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/bsd/conf/tools/newvers/newvers.csh b/bsd/conf/tools/newvers/newvers.csh deleted file mode 100644 index 75324d3bc..000000000 --- a/bsd/conf/tools/newvers/newvers.csh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/sh - -# -# Mach Operating System -# Copyright (c) 1990 Carnegie-Mellon University -# Copyright (c) 1989 Carnegie-Mellon University -# All rights reserved. The CMU software License Agreement specifies -# the terms and conditions for use and redistribution. -# - -# -# newvers.sh copyright major minor variant -# - -major="$1"; minor="$2"; variant="$3" -v="${major}.${minor}" d=`pwd` h="rcbuilder" t=`date` w=`whoami` -if [ -z "$d" -o -z "$h" -o -z "$t" ]; then - exit 1 -fi -CONFIG=`expr "$d" : '.*/\([^/]*\)$'` -d=`expr "$d" : '.*/\([^/]*/[^/]*/[^/]*\)$'` -( - /bin/echo "int ${COMPONENT}_version_major = ${major};" ; - /bin/echo "int ${COMPONENT}_version_minor = ${minor};" ; - /bin/echo "char ${COMPONENT}_version_variant[] = \"${variant}\";" ; - /bin/echo "char ${COMPONENT}_version[] = \"BSD Component Version ${v}:\\n${t}; $w($h):$d\\n\";" ; - /bin/echo "char ${COMPONENT}_osrelease[] = \"${major}.${minor}\";" ; - /bin/echo "char ${COMPONENT}_ostype[] = \"BSD\";" ; -) > vers.c -if [ -s vers.suffix -o ! -f vers.suffix ]; then - rm -f vers.suffix - echo ".${variant}.${CONFIG}" > vers.suffix -fi -exit 0 diff --git a/bsd/conf/version.major b/bsd/conf/version.major deleted file mode 100644 index 7f8f011eb..000000000 --- a/bsd/conf/version.major +++ /dev/null @@ -1 +0,0 @@ -7 diff --git a/bsd/conf/version.minor b/bsd/conf/version.minor deleted file mode 100644 index ec635144f..000000000 --- a/bsd/conf/version.minor +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/bsd/conf/version.variant b/bsd/conf/version.variant deleted file mode 100644 index 573541ac9..000000000 --- a/bsd/conf/version.variant +++ /dev/null @@ -1 +0,0 @@ -0 diff --git a/bsd/crypto/Makefile b/bsd/crypto/Makefile index 4e0880559..e878376c2 100644 --- a/bsd/crypto/Makefile +++ b/bsd/crypto/Makefile @@ -12,7 +12,7 @@ INSTINC_SUBDIRS = \ cast128 \ des \ rc4 \ - rijndael \ + aes \ sha2 @@ -33,7 +33,7 @@ INSTALL_MI_DIR = crypto EXPORT_MI_DIR = ${INSTALL_MI_DIR} -INSTALL_MI_LCL_KERN_LIST = ${PRIVATE_DATAFILES} +INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES} include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/crypto/rijndael/Makefile b/bsd/crypto/aes/Makefile similarity index 81% rename from bsd/crypto/rijndael/Makefile rename to bsd/crypto/aes/Makefile index 92d360eb6..9a6c0e847 100644 --- a/bsd/crypto/rijndael/Makefile +++ b/bsd/crypto/aes/Makefile @@ -20,13 +20,17 @@ EXPINC_SUBDIRS_PPC = \ EXPINC_SUBDIRS_I386 = \ PRIVATE_DATAFILES = \ - rijndael-alg-fst.h rijndael-api-fst.h rijndael.h + aes.h aesopt.h aestab.h INSTALL_MI_DIR = crypto EXPORT_MI_DIR = ${INSTALL_MI_DIR} -INSTALL_MI_LCL_KERN_LIST = ${PRIVATE_DATAFILES} +EXPORT_MI_LIST = aes.h + +INSTALL_KF_MI_LIST = + +INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES} include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/crypto/aes/aes.h b/bsd/crypto/aes/aes.h new file mode 100644 index 000000000..d2dd335c3 --- /dev/null +++ b/bsd/crypto/aes/aes.h @@ -0,0 +1,175 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. 
All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 28/01/2004 + + This file contains the definitions required to use AES in C. See aesopt.h + for optimisation details. +*/ + +#if !defined( _AES_H ) +#define _AES_H + +/* This include is used to find 8 & 32 bit unsigned integer types */ +#include <machine/limits.h> + +#if defined(__cplusplus) +extern "C" +{ +#endif + +#define AES_128 /* define if AES with 128 bit keys is needed */ +#define AES_192 /* define if AES with 192 bit keys is needed */ +#define AES_256 /* define if AES with 256 bit keys is needed */ +#define AES_VAR /* define if a variable key size is needed */ + +/* The following must also be set in assembler files if being used */ + +#define AES_ENCRYPT /* if support for encryption is needed */ +#define AES_DECRYPT /* if support for decryption is needed */ +//#define AES_ERR_CHK /* for parameter checks & error return codes */ + +#if UCHAR_MAX == 0xff /* an unsigned 8 bit type */ + typedef unsigned char aes_08t; +#else +# error Please define aes_08t as an 8-bit unsigned integer type in aes.h +#endif + +#if UINT_MAX == 4294967295 /* an unsigned 32 bit type */ + typedef unsigned int aes_32t; +#elif ULONG_MAX == 4294967295ul + typedef unsigned long aes_32t; +#else +# error Please define aes_32t as a 32-bit unsigned integer type in aes.h +#endif + +#define AES_BLOCK_SIZE 16 /* the AES block size in bytes */ +#define N_COLS 4 /* the number of columns in the state */ + +/* The key schedule length is 11, 13 or 15 16-byte blocks for 128, */ +/* 192 or 256-bit keys respectively. That is 176, 208 or 240 bytes */ +/* or 44, 52 or 60 32-bit words. 
For simplicity this code allocates */ +/* the maximum 60 word array for the key schedule for all key sizes */ + +#if defined( AES_VAR ) || defined( AES_256 ) +#define KS_LENGTH 60 +#elif defined( AES_192 ) +#define KS_LENGTH 52 +#else +#define KS_LENGTH 44 +#endif + +#if defined( AES_ERR_CHK ) +#define aes_ret int +#define aes_good 0 +#define aes_error -1 +#else +#define aes_ret void +#endif + +#if !defined( AES_DLL ) /* implement normal/DLL functions */ +#define aes_rval aes_ret +#else +#define aes_rval aes_ret __declspec(dllexport) _stdcall +#endif + +typedef struct +{ aes_32t ks[KS_LENGTH]; + aes_32t rn; +} aes_encrypt_ctx; + +typedef struct +{ aes_32t ks[KS_LENGTH]; + aes_32t rn; +} aes_decrypt_ctx; + +typedef struct +{ + aes_decrypt_ctx decrypt; + aes_encrypt_ctx encrypt; +} aes_ctx; + + +/* This routine must be called before first use if non-static */ +/* tables are being used */ + +void gen_tabs(void); + +/* The key length (klen) is input in bytes when it is in the range */ +/* 16 <= klen <= 32 or in bits when in the range 128 <= klen <= 256 */ + +#if defined( AES_ENCRYPT ) + +#if defined(AES_128) || defined(AES_VAR) +aes_rval aes_encrypt_key128(const unsigned char *in_key, aes_encrypt_ctx cx[1]); +#endif + +#if defined(AES_192) || defined(AES_VAR) +aes_rval aes_encrypt_key192(const unsigned char *in_key, aes_encrypt_ctx cx[1]); +#endif + +#if defined(AES_256) || defined(AES_VAR) +aes_rval aes_encrypt_key256(const unsigned char *in_key, aes_encrypt_ctx cx[1]); +#endif + +#if defined(AES_VAR) +aes_rval aes_encrypt_key(const unsigned char *in_key, int key_len, aes_encrypt_ctx cx[1]); +#endif + +aes_rval aes_encrypt_cbc(const unsigned char *in_blk, const unsigned char *in_iv, unsigned int num_blk, + unsigned char *out_blk, const aes_encrypt_ctx cx[1]); +#endif + +#if defined( AES_DECRYPT ) + +#if defined(AES_128) || defined(AES_VAR) +aes_rval aes_decrypt_key128(const unsigned char *in_key, aes_decrypt_ctx cx[1]); +#endif + +#if defined(AES_192) || defined(AES_VAR) +aes_rval aes_decrypt_key192(const unsigned char *in_key, aes_decrypt_ctx cx[1]); +#endif + +#if defined(AES_256) || defined(AES_VAR) +aes_rval aes_decrypt_key256(const unsigned char *in_key, aes_decrypt_ctx cx[1]); +#endif + +#if defined(AES_VAR) +aes_rval aes_decrypt_key(const unsigned char *in_key, int key_len, aes_decrypt_ctx cx[1]); +#endif + +aes_rval aes_decrypt_cbc(const unsigned char *in_blk, const unsigned char *in_iv, unsigned int num_blk, + unsigned char *out_blk, const aes_decrypt_ctx cx[1]); +#endif + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/bsd/crypto/aes/aescrypt.c b/bsd/crypto/aes/aescrypt.c new file mode 100644 index 000000000..141cd3fbf --- /dev/null +++ b/bsd/crypto/aes/aescrypt.c @@ -0,0 +1,407 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. 
+ + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 28/01/2004 + + This file contains the code for implementing encryption and decryption + for AES (Rijndael) for block and key sizes of 16, 24 and 32 bytes. It + can optionally be replaced by code written in assembler using NASM. For + further details see the file aesopt.h +*/ + +#include "aesopt.h" +#include "aestab.h" + +#if defined(__cplusplus) +extern "C" +{ +#endif + +#define ki(y,x,k,c) (s(y,c) = s(x, c) ^ (k)[c]) +#define xo(y,x,c) (s(y,c) ^= s(x, c)) +#define si(y,x,c) (s(y,c) = word_in(x, c)) +#define so(y,x,c) word_out(y, c, s(x,c)) + +#if defined(ARRAYS) +#define locals(y,x) x[4],y[4] +#else +#define locals(y,x) x##0,x##1,x##2,x##3,y##0,y##1,y##2,y##3 +#endif + +#define dtables(tab) const aes_32t *tab##0, *tab##1, *tab##2, *tab##3 +#define itables(tab) tab##0 = tab[0]; tab##1 = tab[1]; tab##2 = tab[2]; tab##3 = tab[3] + +#define l_copy(y, x) s(y,0) = s(x,0); s(y,1) = s(x,1); \ + s(y,2) = s(x,2); s(y,3) = s(x,3); + +#define key_in(y,x,k) ki(y,x,k,0); ki(y,x,k,1); ki(y,x,k,2); ki(y,x,k,3) +#define cbc(y,x) xo(y,x,0); xo(y,x,1); xo(y,x,2); xo(y,x,3) +#define state_in(y,x) si(y,x,0); si(y,x,1); si(y,x,2); si(y,x,3) +#define state_out(y,x) so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3) +#define round(rm,y,x,k) rm(y,x,k,0); rm(y,x,k,1); rm(y,x,k,2); rm(y,x,k,3) + +#if defined(ENCRYPTION) && !defined(AES_ASM) + +/* Visual C++ .Net v7.1 provides the fastest encryption code when using + Pentium optimisation with small code but this is poor for decryption + so we need to control this with the following VC++ pragmas +*/ + +#if defined(_MSC_VER) +#pragma optimize( "s", on ) +#endif + +/* Given the column (c) of the output state variable, the following + macros give the input state variables which are needed in its + computation for each row (r) of the state. All the alternative + macros give the same end values but expand into different ways + of calculating these values. In particular the complex macro + used for dynamically variable block sizes is designed to expand + to a compile time constant whenever possible but will expand to + conditional clauses on some branches (I am grateful to Frank + Yellin for this construction) +*/ + +#define fwd_var(x,r,c)\ + ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\ + : r == 1 ? ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0))\ + : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\ + : ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ?
s(x,1) : s(x,2))) + +#if defined(FT4_SET) +#undef dec_fmvars +# if defined(ENC_ROUND_CACHE_TABLES) +#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_cached_tables(x,t_fn,fwd_var,rf1,c)) +# else +#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_fn,fwd_var,rf1,c)) +# endif +#elif defined(FT1_SET) +#undef dec_fmvars +#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,upr,t_fn,fwd_var,rf1,c)) +#else +#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ fwd_mcol(no_table(x,t_sbox,fwd_var,rf1,c))) +#endif + +#if defined(FL4_SET) +# if defined(LAST_ENC_ROUND_CACHE_TABLES) +#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_cached_tables(x,t_fl,fwd_var,rf1,c)) +# else +#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_fl,fwd_var,rf1,c)) +# endif +#elif defined(FL1_SET) +#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,ups,t_fl,fwd_var,rf1,c)) +#else +#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ no_table(x,t_sbox,fwd_var,rf1,c)) +#endif + +aes_rval aes_encrypt_cbc(const unsigned char *in, const unsigned char *in_iv, unsigned int num_blk, + unsigned char *out, const aes_encrypt_ctx cx[1]) +{ aes_32t locals(b0, b1); + const aes_32t *kp = cx->ks; +#if defined(ENC_ROUND_CACHE_TABLES) + dtables(t_fn); +#endif +#if defined(LAST_ENC_ROUND_CACHE_TABLES) + dtables(t_fl); +#endif + +#if defined( dec_fmvars ) + dec_fmvars; /* declare variables for fwd_mcol() if needed */ +#endif + +#if defined( AES_ERR_CHK ) + if( cx->rn != 10 && cx->rn != 12 && cx->rn != 14 ) + return aes_error; +#endif + + // Load IV into b0. + state_in(b0, in_iv); + + for (;num_blk; in += AES_BLOCK_SIZE, out += AES_BLOCK_SIZE, --num_blk) + { +#if 0 + // Read the plaintext into b1 + state_in(b1, in); + // Do the CBC with b0 which is either the iv or the ciphertext of the previous block. + cbc(b1, b0); + + // Xor b1 with the key schedule to get things started. 
+ key_in(b0, b1, kp); +#else + // Since xor is associative we mess with the ordering here to get the loads started early + key_in(b1, b0, kp); // Xor b0(IV) with the key schedule and assign to b1 + state_in(b0, in); // Load block into b0 + cbc(b0, b1); // Xor b0 with b1 and store in b0 +#endif + +#if defined(ENC_ROUND_CACHE_TABLES) + itables(t_fn); +#endif + +#if (ENC_UNROLL == FULL) + + switch(cx->rn) + { + case 14: + round(fwd_rnd, b1, b0, kp + 1 * N_COLS); + round(fwd_rnd, b0, b1, kp + 2 * N_COLS); + kp += 2 * N_COLS; + case 12: + round(fwd_rnd, b1, b0, kp + 1 * N_COLS); + round(fwd_rnd, b0, b1, kp + 2 * N_COLS); + kp += 2 * N_COLS; + case 10: + default: + round(fwd_rnd, b1, b0, kp + 1 * N_COLS); + round(fwd_rnd, b0, b1, kp + 2 * N_COLS); + round(fwd_rnd, b1, b0, kp + 3 * N_COLS); + round(fwd_rnd, b0, b1, kp + 4 * N_COLS); + round(fwd_rnd, b1, b0, kp + 5 * N_COLS); + round(fwd_rnd, b0, b1, kp + 6 * N_COLS); + round(fwd_rnd, b1, b0, kp + 7 * N_COLS); + round(fwd_rnd, b0, b1, kp + 8 * N_COLS); + round(fwd_rnd, b1, b0, kp + 9 * N_COLS); +#if defined(LAST_ENC_ROUND_CACHE_TABLES) + itables(t_fl); +#endif + round(fwd_lrnd, b0, b1, kp +10 * N_COLS); + } + +#else + + { aes_32t rnd; +#if (ENC_UNROLL == PARTIAL) + for(rnd = 0; rnd < (cx->rn >> 1) - 1; ++rnd) + { + kp += N_COLS; + round(fwd_rnd, b1, b0, kp); + kp += N_COLS; + round(fwd_rnd, b0, b1, kp); + } + kp += N_COLS; + round(fwd_rnd, b1, b0, kp); +#else + for(rnd = 0; rnd < cx->rn - 1; ++rnd) + { + kp += N_COLS; + round(fwd_rnd, b1, b0, kp); + l_copy(b0, b1); + } +#endif +#if defined(LAST_ENC_ROUND_CACHE_TABLES) + itables(t_fl); +#endif + kp += N_COLS; + round(fwd_lrnd, b0, b1, kp); + } +#endif + + state_out(out, b0); + } + +#if defined( AES_ERR_CHK ) + return aes_good; +#endif +} + +#endif + +#if defined(DECRYPTION) && !defined(AES_ASM) + +/* Visual C++ .Net v7.1 provides the fastest encryption code when using + Pentium optimisation with small code but this is poor for decryption + so we need to control this with the following VC++ pragmas +*/ + +#if defined(_MSC_VER) +#pragma optimize( "t", on ) +#endif + +/* Given the column (c) of the output state variable, the following + macros give the input state variables which are needed in its + computation for each row (r) of the state. All the alternative + macros give the same end values but expand into different ways + of calculating these values. In particular the complex macro + used for dynamically variable block sizes is designed to expand + to a compile time constant whenever possible but will expand to + conditional clauses on some branches (I am grateful to Frank + Yellin for this construction) +*/ + +#define inv_var(x,r,c)\ + ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\ + : r == 1 ? ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2))\ + : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\ + : ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ?
s(x,3) : s(x,0))) + +#if defined(IT4_SET) +#undef dec_imvars +# if defined(DEC_ROUND_CACHE_TABLES) +#define inv_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_cached_tables(x,t_in,inv_var,rf1,c)) +# else +#define inv_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_in,inv_var,rf1,c)) +# endif +#elif defined(IT1_SET) +#undef dec_imvars +#define inv_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,upr,t_in,inv_var,rf1,c)) +#else +#define inv_rnd(y,x,k,c) (s(y,c) = inv_mcol((k)[c] ^ no_table(x,t_ibox,inv_var,rf1,c))) +#endif + +#if defined(IL4_SET) +# if defined(LAST_DEC_ROUND_CACHE_TABLES) +#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_cached_tables(x,t_il,inv_var,rf1,c)) +# else +#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_il,inv_var,rf1,c)) +# endif +#elif defined(IL1_SET) +#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,ups,t_il,inv_var,rf1,c)) +#else +#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ no_table(x,t_ibox,inv_var,rf1,c)) +#endif + +aes_rval aes_decrypt_cbc(const unsigned char *in, const unsigned char *in_iv, unsigned int num_blk, + unsigned char *out, const aes_decrypt_ctx cx[1]) +{ aes_32t locals(b0, b1); + const aes_32t *kp = cx->ks + cx->rn * N_COLS; +#if defined(DEC_ROUND_CACHE_TABLES) + dtables(t_in); +#endif +#if defined(LAST_DEC_ROUND_CACHE_TABLES) + dtables(t_il); +#endif + +#if defined( dec_imvars ) + dec_imvars; /* declare variables for inv_mcol() if needed */ +#endif + +#if defined( AES_ERR_CHK ) + if( cx->rn != 10 && cx->rn != 12 && cx->rn != 14 ) + return aes_error; +#endif + +#if defined(DEC_ROUND_CACHE_TABLES) + itables(t_in); +#endif + + in += AES_BLOCK_SIZE * (num_blk - 1); + out += AES_BLOCK_SIZE * (num_blk - 1); + // Load the last block's ciphertext into b1 + state_in(b1, in); + + for (;num_blk; out -= AES_BLOCK_SIZE, --num_blk) + { + // Do the xor part of state_in, where b1 is the previous block's ciphertext. + key_in(b0, b1, kp); + +#if (DEC_UNROLL == FULL) + + switch(cx->rn) + { + case 14: + round(inv_rnd, b1, b0, kp - 1 * N_COLS); + round(inv_rnd, b0, b1, kp - 2 * N_COLS); + kp -= 2 * N_COLS; + case 12: + round(inv_rnd, b1, b0, kp - 1 * N_COLS); + round(inv_rnd, b0, b1, kp - 2 * N_COLS); + kp -= 2 * N_COLS; + case 10: + default: + round(inv_rnd, b1, b0, kp - 1 * N_COLS); + round(inv_rnd, b0, b1, kp - 2 * N_COLS); + round(inv_rnd, b1, b0, kp - 3 * N_COLS); + round(inv_rnd, b0, b1, kp - 4 * N_COLS); + round(inv_rnd, b1, b0, kp - 5 * N_COLS); + round(inv_rnd, b0, b1, kp - 6 * N_COLS); + round(inv_rnd, b1, b0, kp - 7 * N_COLS); + round(inv_rnd, b0, b1, kp - 8 * N_COLS); + round(inv_rnd, b1, b0, kp - 9 * N_COLS); +#if defined(LAST_DEC_ROUND_CACHE_TABLES) + itables(t_il); +#endif + round(inv_lrnd, b0, b1, kp - 10 * N_COLS); + } + +#else + + { aes_32t rnd; +#if (DEC_UNROLL == PARTIAL) + for(rnd = 0; rnd < (cx->rn >> 1) - 1; ++rnd) + { + kp -= N_COLS; + round(inv_rnd, b1, b0, kp); + kp -= N_COLS; + round(inv_rnd, b0, b1, kp); + } + kp -= N_COLS; + round(inv_rnd, b1, b0, kp); +#else + for(rnd = 0; rnd < cx->rn - 1; ++rnd) + { + kp -= N_COLS; + round(inv_rnd, b1, b0, kp); + l_copy(b0, b1); + } +#endif +#if defined(LAST_DEC_ROUND_CACHE_TABLES) + itables(t_il); +#endif + kp -= N_COLS; + round(inv_lrnd, b0, b1, kp); + } +#endif + + if (num_blk == 1) + { + // We are doing the first block so we need the IV rather than the previous + // block for CBC (there is no previous block) + state_in(b1, in_iv); + } + else + { + in -= AES_BLOCK_SIZE; + state_in(b1, in); + } + + // Do the CBC with b1 which is either the IV or the ciphertext of the previous block. 
+ cbc(b0, b1); + + state_out(out, b0); + } +#if defined( AES_ERR_CHK ) + return aes_good; +#endif +} + +#endif + +#if defined(__cplusplus) +} +#endif diff --git a/bsd/crypto/aes/aeskey.c b/bsd/crypto/aes/aeskey.c new file mode 100644 index 000000000..0120e0c7d --- /dev/null +++ b/bsd/crypto/aes/aeskey.c @@ -0,0 +1,455 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue Date: 26/08/2003 + + This file contains the code for implementing the key schedule for AES + (Rijndael) for block and key sizes of 16, 24, and 32 bytes. See aesopt.h + for further details including optimisation. +*/ + +#include "aesopt.h" +#include "aestab.h" + +#if defined(__cplusplus) +extern "C" +{ +#endif + +/* Initialise the key schedule from the user supplied key. The key + length can be specified in bytes, with legal values of 16, 24 + and 32, or in bits, with legal values of 128, 192 and 256. These + values correspond with Nk values of 4, 6 and 8 respectively. + + The following macros implement a single cycle in the key + schedule generation process. 
The number of cycles needed + for each cx->n_col and nk value is: + + nk = 4 5 6 7 8 + ------------------------------ + cx->n_col = 4 10 9 8 7 7 + cx->n_col = 5 14 11 10 9 9 + cx->n_col = 6 19 15 12 11 11 + cx->n_col = 7 21 19 16 13 14 + cx->n_col = 8 29 23 19 17 14 +*/ + +#define ke4(k,i) \ +{ k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[4*(i)+5] = ss[1] ^= ss[0]; \ + k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2]; \ +} +#define kel4(k,i) \ +{ k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[4*(i)+5] = ss[1] ^= ss[0]; \ + k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2]; \ +} + +#define ke6(k,i) \ +{ k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[6*(i)+ 7] = ss[1] ^= ss[0]; \ + k[6*(i)+ 8] = ss[2] ^= ss[1]; k[6*(i)+ 9] = ss[3] ^= ss[2]; \ + k[6*(i)+10] = ss[4] ^= ss[3]; k[6*(i)+11] = ss[5] ^= ss[4]; \ +} +#define kel6(k,i) \ +{ k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[6*(i)+ 7] = ss[1] ^= ss[0]; \ + k[6*(i)+ 8] = ss[2] ^= ss[1]; k[6*(i)+ 9] = ss[3] ^= ss[2]; \ +} + +#define ke8(k,i) \ +{ k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[8*(i)+ 9] = ss[1] ^= ss[0]; \ + k[8*(i)+10] = ss[2] ^= ss[1]; k[8*(i)+11] = ss[3] ^= ss[2]; \ + k[8*(i)+12] = ss[4] ^= ls_box(ss[3],0); k[8*(i)+13] = ss[5] ^= ss[4]; \ + k[8*(i)+14] = ss[6] ^= ss[5]; k[8*(i)+15] = ss[7] ^= ss[6]; \ +} +#define kel8(k,i) \ +{ k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[8*(i)+ 9] = ss[1] ^= ss[0]; \ + k[8*(i)+10] = ss[2] ^= ss[1]; k[8*(i)+11] = ss[3] ^= ss[2]; \ +} + +#if defined(ENCRYPTION_KEY_SCHEDULE) + +#if defined(AES_128) || defined(AES_VAR) + +aes_rval aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]) +{ aes_32t ss[4]; + + cx->ks[0] = ss[0] = word_in(key, 0); + cx->ks[1] = ss[1] = word_in(key, 1); + cx->ks[2] = ss[2] = word_in(key, 2); + cx->ks[3] = ss[3] = word_in(key, 3); + +#if ENC_UNROLL == NONE + { aes_32t i; + + for(i = 0; i < ((11 * N_COLS - 5) / 4); ++i) + ke4(cx->ks, i); + } +#else + ke4(cx->ks, 0); ke4(cx->ks, 1); + ke4(cx->ks, 2); ke4(cx->ks, 3); + ke4(cx->ks, 4); ke4(cx->ks, 5); + ke4(cx->ks, 6); ke4(cx->ks, 7); + ke4(cx->ks, 8); +#endif + kel4(cx->ks, 9); + cx->rn = 10; +#if defined( AES_ERR_CHK ) + return aes_good; +#endif +} + +#endif + +#if defined(AES_192) || defined(AES_VAR) + +aes_rval aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]) +{ aes_32t ss[6]; + + cx->ks[0] = ss[0] = word_in(key, 0); + cx->ks[1] = ss[1] = word_in(key, 1); + cx->ks[2] = ss[2] = word_in(key, 2); + cx->ks[3] = ss[3] = word_in(key, 3); + cx->ks[4] = ss[4] = word_in(key, 4); + cx->ks[5] = ss[5] = word_in(key, 5); + +#if ENC_UNROLL == NONE + { aes_32t i; + + for(i = 0; i < (13 * N_COLS - 7) / 6; ++i) + ke6(cx->ks, i); + } +#else + ke6(cx->ks, 0); ke6(cx->ks, 1); + ke6(cx->ks, 2); ke6(cx->ks, 3); + ke6(cx->ks, 4); ke6(cx->ks, 5); + ke6(cx->ks, 6); +#endif + kel6(cx->ks, 7); + cx->rn = 12; +#if defined( AES_ERR_CHK ) + return aes_good; +#endif +} + +#endif + +#if defined(AES_256) || defined(AES_VAR) + +aes_rval aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]) +{ aes_32t ss[8]; + + cx->ks[0] = ss[0] = word_in(key, 0); + cx->ks[1] = ss[1] = word_in(key, 1); + cx->ks[2] = ss[2] = word_in(key, 2); + cx->ks[3] = ss[3] = word_in(key, 3); + cx->ks[4] = ss[4] = word_in(key, 4); + cx->ks[5] = ss[5] = word_in(key, 5); + cx->ks[6] = ss[6] = word_in(key, 6); + cx->ks[7] = ss[7] = word_in(key, 7); + +#if ENC_UNROLL == NONE + { aes_32t i; + + for(i = 0; i < (15 * N_COLS - 9) / 8; ++i) + ke8(cx->ks, i); + 
} +#else + ke8(cx->ks, 0); ke8(cx->ks, 1); + ke8(cx->ks, 2); ke8(cx->ks, 3); + ke8(cx->ks, 4); ke8(cx->ks, 5); +#endif + kel8(cx->ks, 6); + cx->rn = 14; +#if defined( AES_ERR_CHK ) + return aes_good; +#endif +} + +#endif + +#if defined(AES_VAR) + +aes_rval aes_encrypt_key(const unsigned char *key, int key_len, aes_encrypt_ctx cx[1]) +{ + switch(key_len) + { +#if defined( AES_ERR_CHK ) + case 16: case 128: return aes_encrypt_key128(key, cx); + case 24: case 192: return aes_encrypt_key192(key, cx); + case 32: case 256: return aes_encrypt_key256(key, cx); + default: return aes_error; +#else + case 16: case 128: aes_encrypt_key128(key, cx); return; + case 24: case 192: aes_encrypt_key192(key, cx); return; + case 32: case 256: aes_encrypt_key256(key, cx); return; +#endif + } +} + +#endif + +#endif + +#if defined(DECRYPTION_KEY_SCHEDULE) + +#if DEC_ROUND == NO_TABLES +#define ff(x) (x) +#else +#define ff(x) inv_mcol(x) +#if defined( dec_imvars ) +#define d_vars dec_imvars +#endif +#endif + +#if 1 +#define kdf4(k,i) \ +{ ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; ss[1] = ss[1] ^ ss[3]; ss[2] = ss[2] ^ ss[3]; ss[3] = ss[3]; \ + ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; ss[i % 4] ^= ss[4]; \ + ss[4] ^= k[4*(i)]; k[4*(i)+4] = ff(ss[4]); ss[4] ^= k[4*(i)+1]; k[4*(i)+5] = ff(ss[4]); \ + ss[4] ^= k[4*(i)+2]; k[4*(i)+6] = ff(ss[4]); ss[4] ^= k[4*(i)+3]; k[4*(i)+7] = ff(ss[4]); \ +} +#define kd4(k,i) \ +{ ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; ss[i % 4] ^= ss[4]; ss[4] = ff(ss[4]); \ + k[4*(i)+4] = ss[4] ^= k[4*(i)]; k[4*(i)+5] = ss[4] ^= k[4*(i)+1]; \ + k[4*(i)+6] = ss[4] ^= k[4*(i)+2]; k[4*(i)+7] = ss[4] ^= k[4*(i)+3]; \ +} +#define kdl4(k,i) \ +{ ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; ss[i % 4] ^= ss[4]; \ + k[4*(i)+4] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; k[4*(i)+5] = ss[1] ^ ss[3]; \ + k[4*(i)+6] = ss[0]; k[4*(i)+7] = ss[1]; \ +} +#else +#define kdf4(k,i) \ +{ ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[4*(i)+ 4] = ff(ss[0]); ss[1] ^= ss[0]; k[4*(i)+ 5] = ff(ss[1]); \ + ss[2] ^= ss[1]; k[4*(i)+ 6] = ff(ss[2]); ss[3] ^= ss[2]; k[4*(i)+ 7] = ff(ss[3]); \ +} +#define kd4(k,i) \ +{ ss[4] = ls_box(ss[3],3) ^ t_use(r,c)[i]; \ + ss[0] ^= ss[4]; ss[4] = ff(ss[4]); k[4*(i)+ 4] = ss[4] ^= k[4*(i)]; \ + ss[1] ^= ss[0]; k[4*(i)+ 5] = ss[4] ^= k[4*(i)+ 1]; \ + ss[2] ^= ss[1]; k[4*(i)+ 6] = ss[4] ^= k[4*(i)+ 2]; \ + ss[3] ^= ss[2]; k[4*(i)+ 7] = ss[4] ^= k[4*(i)+ 3]; \ +} +#define kdl4(k,i) \ +{ ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[4*(i)+ 4] = ss[0]; ss[1] ^= ss[0]; k[4*(i)+ 5] = ss[1]; \ + ss[2] ^= ss[1]; k[4*(i)+ 6] = ss[2]; ss[3] ^= ss[2]; k[4*(i)+ 7] = ss[3]; \ +} +#endif + +#define kdf6(k,i) \ +{ ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[6*(i)+ 6] = ff(ss[0]); ss[1] ^= ss[0]; k[6*(i)+ 7] = ff(ss[1]); \ + ss[2] ^= ss[1]; k[6*(i)+ 8] = ff(ss[2]); ss[3] ^= ss[2]; k[6*(i)+ 9] = ff(ss[3]); \ + ss[4] ^= ss[3]; k[6*(i)+10] = ff(ss[4]); ss[5] ^= ss[4]; k[6*(i)+11] = ff(ss[5]); \ +} +#define kd6(k,i) \ +{ ss[6] = ls_box(ss[5],3) ^ t_use(r,c)[i]; \ + ss[0] ^= ss[6]; ss[6] = ff(ss[6]); k[6*(i)+ 6] = ss[6] ^= k[6*(i)]; \ + ss[1] ^= ss[0]; k[6*(i)+ 7] = ss[6] ^= k[6*(i)+ 1]; \ + ss[2] ^= ss[1]; k[6*(i)+ 8] = ss[6] ^= k[6*(i)+ 2]; \ + ss[3] ^= ss[2]; k[6*(i)+ 9] = ss[6] ^= k[6*(i)+ 3]; \ + ss[4] ^= ss[3]; k[6*(i)+10] = ss[6] ^= k[6*(i)+ 4]; \ + ss[5] ^= ss[4]; k[6*(i)+11] = ss[6] ^= k[6*(i)+ 5]; \ +} +#define kdl6(k,i) \ +{ ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[6*(i)+ 6] = ss[0]; ss[1] ^= ss[0]; k[6*(i)+ 7] = ss[1]; \ + ss[2] ^= ss[1]; k[6*(i)+ 8] = ss[2]; ss[3] ^= ss[2]; k[6*(i)+ 9] 
= ss[3]; \ +} + +#define kdf8(k,i) \ +{ ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[8*(i)+ 8] = ff(ss[0]); ss[1] ^= ss[0]; k[8*(i)+ 9] = ff(ss[1]); \ + ss[2] ^= ss[1]; k[8*(i)+10] = ff(ss[2]); ss[3] ^= ss[2]; k[8*(i)+11] = ff(ss[3]); \ + ss[4] ^= ls_box(ss[3],0); k[8*(i)+12] = ff(ss[4]); ss[5] ^= ss[4]; k[8*(i)+13] = ff(ss[5]); \ + ss[6] ^= ss[5]; k[8*(i)+14] = ff(ss[6]); ss[7] ^= ss[6]; k[8*(i)+15] = ff(ss[7]); \ +} +#define kd8(k,i) \ +{ aes_32t g = ls_box(ss[7],3) ^ t_use(r,c)[i]; \ + ss[0] ^= g; g = ff(g); k[8*(i)+ 8] = g ^= k[8*(i)]; \ + ss[1] ^= ss[0]; k[8*(i)+ 9] = g ^= k[8*(i)+ 1]; \ + ss[2] ^= ss[1]; k[8*(i)+10] = g ^= k[8*(i)+ 2]; \ + ss[3] ^= ss[2]; k[8*(i)+11] = g ^= k[8*(i)+ 3]; \ + g = ls_box(ss[3],0); \ + ss[4] ^= g; g = ff(g); k[8*(i)+12] = g ^= k[8*(i)+ 4]; \ + ss[5] ^= ss[4]; k[8*(i)+13] = g ^= k[8*(i)+ 5]; \ + ss[6] ^= ss[5]; k[8*(i)+14] = g ^= k[8*(i)+ 6]; \ + ss[7] ^= ss[6]; k[8*(i)+15] = g ^= k[8*(i)+ 7]; \ +} +#define kdl8(k,i) \ +{ ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[8*(i)+ 8] = ss[0]; ss[1] ^= ss[0]; k[8*(i)+ 9] = ss[1]; \ + ss[2] ^= ss[1]; k[8*(i)+10] = ss[2]; ss[3] ^= ss[2]; k[8*(i)+11] = ss[3]; \ +} + +#if defined(AES_128) || defined(AES_VAR) + +aes_rval aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]) +{ aes_32t ss[5]; +#if defined( d_vars ) + d_vars; +#endif + cx->ks[0] = ss[0] = word_in(key, 0); + cx->ks[1] = ss[1] = word_in(key, 1); + cx->ks[2] = ss[2] = word_in(key, 2); + cx->ks[3] = ss[3] = word_in(key, 3); + +#if DEC_UNROLL == NONE + { aes_32t i; + + for(i = 0; i < (11 * N_COLS - 5) / 4; ++i) + ke4(cx->ks, i); + kel4(cx->ks, 9); +#if !(DEC_ROUND == NO_TABLES) + for(i = N_COLS; i < 10 * N_COLS; ++i) + cx->ks[i] = inv_mcol(cx->ks[i]); +#endif + } +#else + kdf4(cx->ks, 0); kd4(cx->ks, 1); + kd4(cx->ks, 2); kd4(cx->ks, 3); + kd4(cx->ks, 4); kd4(cx->ks, 5); + kd4(cx->ks, 6); kd4(cx->ks, 7); + kd4(cx->ks, 8); kdl4(cx->ks, 9); +#endif + cx->rn = 10; +#if defined( AES_ERR_CHK ) + return aes_good; +#endif +} + +#endif + +#if defined(AES_192) || defined(AES_VAR) + +aes_rval aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]) +{ aes_32t ss[7]; +#if defined( d_vars ) + d_vars; +#endif + cx->ks[0] = ss[0] = word_in(key, 0); + cx->ks[1] = ss[1] = word_in(key, 1); + cx->ks[2] = ss[2] = word_in(key, 2); + cx->ks[3] = ss[3] = word_in(key, 3); + +#if DEC_UNROLL == NONE + cx->ks[4] = ss[4] = word_in(key, 4); + cx->ks[5] = ss[5] = word_in(key, 5); + { aes_32t i; + + for(i = 0; i < (13 * N_COLS - 7) / 6; ++i) + ke6(cx->ks, i); + kel6(cx->ks, 7); +#if !(DEC_ROUND == NO_TABLES) + for(i = N_COLS; i < 12 * N_COLS; ++i) + cx->ks[i] = inv_mcol(cx->ks[i]); +#endif + } +#else + cx->ks[4] = ff(ss[4] = word_in(key, 4)); + cx->ks[5] = ff(ss[5] = word_in(key, 5)); + kdf6(cx->ks, 0); kd6(cx->ks, 1); + kd6(cx->ks, 2); kd6(cx->ks, 3); + kd6(cx->ks, 4); kd6(cx->ks, 5); + kd6(cx->ks, 6); kdl6(cx->ks, 7); +#endif + cx->rn = 12; +#if defined( AES_ERR_CHK ) + return aes_good; +#endif +} + +#endif + +#if defined(AES_256) || defined(AES_VAR) + +aes_rval aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]) +{ aes_32t ss[8]; +#if defined( d_vars ) + d_vars; +#endif + cx->ks[0] = ss[0] = word_in(key, 0); + cx->ks[1] = ss[1] = word_in(key, 1); + cx->ks[2] = ss[2] = word_in(key, 2); + cx->ks[3] = ss[3] = word_in(key, 3); + +#if DEC_UNROLL == NONE + cx->ks[4] = ss[4] = word_in(key, 4); + cx->ks[5] = ss[5] = word_in(key, 5); + cx->ks[6] = ss[6] = word_in(key, 6); + cx->ks[7] = ss[7] = word_in(key, 7); + { aes_32t i; + + for(i = 0; i < (15 * N_COLS 
- 9) / 8; ++i) + ke8(cx->ks, i); + kel8(cx->ks, i); +#if !(DEC_ROUND == NO_TABLES) + for(i = N_COLS; i < 14 * N_COLS; ++i) + cx->ks[i] = inv_mcol(cx->ks[i]); + +#endif + } +#else + cx->ks[4] = ff(ss[4] = word_in(key, 4)); + cx->ks[5] = ff(ss[5] = word_in(key, 5)); + cx->ks[6] = ff(ss[6] = word_in(key, 6)); + cx->ks[7] = ff(ss[7] = word_in(key, 7)); + kdf8(cx->ks, 0); kd8(cx->ks, 1); + kd8(cx->ks, 2); kd8(cx->ks, 3); + kd8(cx->ks, 4); kd8(cx->ks, 5); + kdl8(cx->ks, 6); +#endif + cx->rn = 14; +#if defined( AES_ERR_CHK ) + return aes_good; +#endif +} + +#endif + +#if defined(AES_VAR) + +aes_rval aes_decrypt_key(const unsigned char *key, int key_len, aes_decrypt_ctx cx[1]) +{ + switch(key_len) + { +#if defined( AES_ERR_CHK ) + case 16: case 128: return aes_decrypt_key128(key, cx); + case 24: case 192: return aes_decrypt_key192(key, cx); + case 32: case 256: return aes_decrypt_key256(key, cx); + default: return aes_error; +#else + case 16: case 128: aes_decrypt_key128(key, cx); return; + case 24: case 192: aes_decrypt_key192(key, cx); return; + case 32: case 256: aes_decrypt_key256(key, cx); return; +#endif + } +} + +#endif + +#endif + +#if defined(__cplusplus) +} +#endif diff --git a/bsd/crypto/aes/aesopt.h b/bsd/crypto/aes/aesopt.h new file mode 100644 index 000000000..7b2ea04f0 --- /dev/null +++ b/bsd/crypto/aes/aesopt.h @@ -0,0 +1,753 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 28/01/2004 + + My thanks go to Dag Arne Osvik for devising the schemes used here for key + length derivation from the form of the key schedule + + This file contains the compilation options for AES (Rijndael) and code + that is common across encryption, key scheduling and table generation. + + OPERATION + + These source code files implement the AES algorithm Rijndael designed by + Joan Daemen and Vincent Rijmen. This version is designed for the standard + block size of 16 bytes and for key sizes of 128, 192 and 256 bits (16, 24 + and 32 bytes). + + This version is designed for flexibility and speed using operations on + 32-bit words rather than operations on bytes. It can be compiled with + either big or little endian internal byte order but is faster when the + native byte order for the processor is used. 
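+
+   For illustration (bytes2word() is the packing macro defined later in
+   this file): the four bytes b0..b3 of a column pack as
+
+       bytes2word(b0,b1,b2,b3) == (b3<<24)|(b2<<16)|(b1<<8)|b0   little endian
+       bytes2word(b0,b1,b2,b3) == (b0<<24)|(b1<<16)|(b2<<8)|b3   big endian
+
+   so AES byte number 0 is always supplied as the first argument
+   whichever internal byte order is selected.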
+ + THE CIPHER INTERFACE + + The cipher interface is implemented as an array of bytes in which lower + AES bit sequence indexes map to higher numeric significance within bytes. + + aes_08t (an unsigned 8-bit type) + aes_32t (an unsigned 32-bit type) + struct aes_encrypt_ctx (structure for the cipher encryption context) + struct aes_decrypt_ctx (structure for the cipher decryption context) + aes_rval the function return type + + C subroutine calls: + + aes_rval aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]); + aes_rval aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]); + aes_rval aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]); + aes_rval aes_encrypt(const unsigned char *in, unsigned char *out, + const aes_encrypt_ctx cx[1]); + + aes_rval aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]); + aes_rval aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]); + aes_rval aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]); + aes_rval aes_decrypt(const unsigned char *in, unsigned char *out, + const aes_decrypt_ctx cx[1]); + + IMPORTANT NOTE: If you are using this C interface with dynamic tables make sure that + you call gen_tabs() before AES is used so that the tables are initialised. + + C++ aes class subroutines: + + Class AESencrypt for encryption + + Constructors: + AESencrypt(void) + AESencrypt(const unsigned char *key) - 128 bit key + Members: + aes_rval key128(const unsigned char *key) + aes_rval key192(const unsigned char *key) + aes_rval key256(const unsigned char *key) + aes_rval encrypt(const unsigned char *in, unsigned char *out) const + + Class AESdecrypt for decryption + Constructors: + AESdecrypt(void) + AESdecrypt(const unsigned char *key) - 128 bit key + Members: + aes_rval key128(const unsigned char *key) + aes_rval key192(const unsigned char *key) + aes_rval key256(const unsigned char *key) + aes_rval decrypt(const unsigned char *in, unsigned char *out) const + + COMPILATION + + The files used to provide AES (Rijndael) are + + a. aes.h for the definitions needed for use in C. + b. aescpp.h for the definitions needed for use in C++. + c. aesopt.h for setting compilation options (also includes common code). + d. aescrypt.c for encryption and decryption, or + e. aeskey.c for key scheduling. + f. aestab.c for table loading or generation. + g. aescrypt.asm for encryption and decryption using assembler code. + h. aescrypt.mmx.asm for encryption and decryption using MMX assembler. + + To compile AES (Rijndael) for use in C code use aes.h and set the + defines here for the facilities you need (key lengths, encryption + and/or decryption). Do not define AES_DLL or AES_CPP. Set the options + for optimisations and table sizes here. + + To compile AES (Rijndael) for use in C++ code use aescpp.h but do + not define AES_DLL. + + To compile AES (Rijndael) in C as a Dynamic Link Library (DLL) use + aes.h and include the AES_DLL define. + + CONFIGURATION OPTIONS (here and in aes.h) + + a. set AES_DLL in aes.h if AES (Rijndael) is to be compiled as a DLL + b. You may need to set PLATFORM_BYTE_ORDER to define the byte order. + c. If you want the code to run in a specific internal byte order, then + ALGORITHM_BYTE_ORDER must be set accordingly. + d. set other configuration options described below.
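+
+   For illustration, a minimal decryption sequence using the C interface
+   listed above (a sketch only; when FIXED_TABLES is in use no prior
+   gen_tabs() call is needed):
+
+       aes_decrypt_ctx ctx[1];
+       unsigned char key[16], in[16], out[16];
+
+       aes_decrypt_key128(key, ctx);    set up a 128 bit key schedule
+       aes_decrypt(in, out, ctx);       decrypt one 16 byte block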
+*/ + +#if !defined( _AESOPT_H ) +#define _AESOPT_H + +#include "aes.h" + +/* CONFIGURATION - USE OF DEFINES + + Later in this section there are a number of defines that control the + operation of the code. In each section, the purpose of each define is + explained so that the relevant form can be included or excluded by + setting either 1's or 0's respectively on the branches of the related + #if clauses. + + PLATFORM SPECIFIC INCLUDES AND BYTE ORDER IN 32-BIT WORDS + + To obtain the highest speed on processors with 32-bit words, this code + needs to determine the byte order of the target machine. The following + block of code is an attempt to capture the most obvious ways in which + various environments define byte order. It may well fail, in which case + the definitions will need to be set by editing at the points marked + **** EDIT HERE IF NECESSARY **** below. My thanks go to Peter Gutmann + for his assistance with this endian detection nightmare. +*/ + +#define BRG_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ +#define BRG_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ + +#if defined(__GNUC__) || defined(__GNU_LIBRARY__) +# if defined(__FreeBSD__) || defined(__OpenBSD__) +# include <sys/endian.h> +# elif defined( BSD ) && BSD >= 199103 +# include <machine/endian.h> +# elif defined(__APPLE__) +# if defined(__BIG_ENDIAN__) && !defined( BIG_ENDIAN ) +# define BIG_ENDIAN +# elif defined(__LITTLE_ENDIAN__) && !defined( LITTLE_ENDIAN ) +# define LITTLE_ENDIAN +# endif +# else +# include <endian.h> +# if defined(__BEOS__) +# include <byteswap.h> +# endif +# endif +#endif + +#if !defined(PLATFORM_BYTE_ORDER) +# if defined(LITTLE_ENDIAN) || defined(BIG_ENDIAN) +# if defined(LITTLE_ENDIAN) && !defined(BIG_ENDIAN) +# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN +# elif !defined(LITTLE_ENDIAN) && defined(BIG_ENDIAN) +# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN +# elif defined(BYTE_ORDER) && (BYTE_ORDER == LITTLE_ENDIAN) +# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN +# elif defined(BYTE_ORDER) && (BYTE_ORDER == BIG_ENDIAN) +# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN +# endif +# elif defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN) +# if defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN) +# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN +# elif !defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) +# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN +# elif defined(_BYTE_ORDER) && (_BYTE_ORDER == _LITTLE_ENDIAN) +# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN +# elif defined(_BYTE_ORDER) && (_BYTE_ORDER == _BIG_ENDIAN) +# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN +# endif +# elif defined(__LITTLE_ENDIAN__) || defined(__BIG_ENDIAN__) +# if defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) +# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN +# elif !defined(__LITTLE_ENDIAN__) && defined(__BIG_ENDIAN__) +# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN +# elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __LITTLE_ENDIAN__) +# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN +# elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __BIG_ENDIAN__) +# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN +# endif +# endif +#endif + +/* if the platform is still unknown, try to find its byte order */ +/* from commonly used machine defines */ + +#if !defined(PLATFORM_BYTE_ORDER) + +#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ + defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ + defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ + defined( vax ) || defined(
vms ) || defined( VMS ) || \ + defined( __VMS ) +# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN + +#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ + defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ + defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ + defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ + defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ + defined( __TANDEM ) || defined( THINK_C ) || defined( __VMCMS__ ) +# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN + +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN +#else +# error Please edit aesopt.h (line 234 or 236) to set the platform byte order +#endif + +#endif + +/* SOME LOCAL DEFINITIONS */ + +#define NO_TABLES 0 +#define ONE_TABLE 1 +#define FOUR_TABLES 4 +#define NONE 0 +#define PARTIAL 1 +#define FULL 2 + +#if defined(bswap32) +#define aes_sw32 bswap32 +#elif defined(bswap_32) +#define aes_sw32 bswap_32 +#else +#define brot(x,n) (((aes_32t)(x) << n) | ((aes_32t)(x) >> (32 - n))) +#define aes_sw32(x) ((brot((x),8) & 0x00ff00ff) | (brot((x),24) & 0xff00ff00)) +#endif + +/* 1. FUNCTIONS REQUIRED + + This implementation provides subroutines for encryption, decryption + and for setting the three key lengths (separately) for encryption + and decryption. When the assembler code is not being used the following + definition blocks allow the selection of the routines that are to be + included in the compilation. +*/ +#if defined( AES_ENCRYPT ) +#define ENCRYPTION +#define ENCRYPTION_KEY_SCHEDULE +#endif + +#if defined( AES_DECRYPT ) +#define DECRYPTION +#define DECRYPTION_KEY_SCHEDULE +#endif + +/* 2. ASSEMBLER SUPPORT + + This define (which can be on the command line) enables the use of the + assembler code routines for encryption and decryption with the C code + only providing key scheduling +*/ +#if 0 && !defined(AES_ASM) +#define AES_ASM +#endif + +/* 3. BYTE ORDER WITHIN 32 BIT WORDS + + The fundamental data processing units in Rijndael are 8-bit bytes. The + input, output and key input are all enumerated arrays of bytes in which + bytes are numbered starting at zero and increasing to one less than the + number of bytes in the array in question. This enumeration is only used + for naming bytes and does not imply any adjacency or order relationship + from one byte to another. When these inputs and outputs are considered + as bit sequences, bits 8*n to 8*n+7 of the bit sequence are mapped to + byte[n] with bit 8n+i in the sequence mapped to bit 7-i within the byte. + In this implementation bits are numbered from 0 to 7 starting at the + numerically least significant end of each byte (bit n represents 2^n). + + However, Rijndael can be implemented more efficiently using 32-bit + words by packing bytes into words so that bytes 4*n to 4*n+3 are placed + into word[n]. While in principle these bytes can be assembled into words + in any positions, this implementation only supports the two formats in + which bytes in adjacent positions within words also have adjacent byte + numbers. This order is called big-endian if the lowest numbered bytes + in words have the highest numeric significance and little-endian if the + opposite applies. + + This code can work in either order irrespective of the order used by the + machine on which it runs. 
Normally the internal byte order will be set + to the order of the processor on which the code is to be run but this + define can be used to reverse this in special situations + + NOTE: Assembler code versions rely on PLATFORM_BYTE_ORDER being set +*/ +#if 1 || defined(AES_ASM) +#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER +#elif 0 +#define ALGORITHM_BYTE_ORDER BRG_LITTLE_ENDIAN +#elif 0 +#define ALGORITHM_BYTE_ORDER BRG_BIG_ENDIAN +#else +#error The algorithm byte order is not defined +#endif + +/* 4. FAST INPUT/OUTPUT OPERATIONS. + + On some machines it is possible to improve speed by transferring the + bytes in the input and output arrays to and from the internal 32-bit + variables by addressing these arrays as if they are arrays of 32-bit + words. On some machines this will always be possible but there may + be a large performance penalty if the byte arrays are not aligned on + the normal word boundaries. On other machines this technique will + lead to memory access errors when such 32-bit word accesses are not + properly aligned. The option SAFE_IO avoids such problems but will + often be slower on those machines that support misaligned access + (especially so if care is taken to align the input and output byte + arrays on 32-bit word boundaries). If SAFE_IO is not defined it is + assumed that access to byte arrays as if they are arrays of 32-bit + words will not cause problems when such accesses are misaligned. +*/ +#if 0 && !defined(_MSC_VER) +#define SAFE_IO +#endif + +/* 5. LOOP UNROLLING + + The code for encryption and decryption cycles through a number of rounds + that can be implemented either in a loop or by expanding the code into a + long sequence of instructions, the latter producing a larger program but + one that will often be much faster. This expansion is called loop unrolling. + There are also potential speed advantages in expanding two iterations in + a loop with half the number of iterations, which is called partial loop + unrolling. The following options allow partial or full loop unrolling + to be set independently for encryption and decryption. +*/ +#if 1 +#define ENC_UNROLL FULL +#elif 0 +#define ENC_UNROLL PARTIAL +#else +#define ENC_UNROLL NONE +#endif + +#if 1 +#define DEC_UNROLL FULL +#elif 0 +#define DEC_UNROLL PARTIAL +#else +#define DEC_UNROLL NONE +#endif + +/* 6. FAST FINITE FIELD OPERATIONS + + If this section is included, tables are used to provide faster finite + field arithmetic (this has no effect if FIXED_TABLES is defined). +*/ +#if 1 +#define FF_TABLES +#endif + +/* 7. INTERNAL STATE VARIABLE FORMAT + + The internal state of Rijndael is stored in a number of local 32-bit + word variables which can be defined either as an array or as individually + named variables. Include this section if you want to store these local + variables in arrays. Otherwise individual local variables will be used. +*/ +#if 0 +#define ARRAYS +#endif + +/* In this implementation the columns of the state array are each held in + 32-bit words. The state array can be held in various ways: in an array + of words, in a number of individual word variables or in a number of + processor registers. The following define maps a variable name x and + a column number c to the way the state array variable is to be held. + The first define below maps the state into an array x[c] whereas the + second form maps the state into a number of individual variables x0, + x1, etc. Another form could map individual state columns to machine + register names.
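+
+   For illustration: a round step written as s(y,0) = s(x,0) ^ k[0]
+   expands to y[0] = x[0] ^ k[0] when ARRAYS is defined, and to
+   y0 = x0 ^ k[0] otherwise (y, x and k here are hypothetical local
+   names used only for this example).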
+*/ + +#if defined(ARRAYS) +#define s(x,c) x[c] +#else +#define s(x,c) x##c +#endif + +/* 8. FIXED OR DYNAMIC TABLES + + When this section is included the tables used by the code are compiled + statically into the binary file. Otherwise the subroutine gen_tabs() + must be called to compute them before the code is first used. +*/ +#if 1 +#define FIXED_TABLES +#endif + +/* 9. TABLE ALIGNMENT + + On some systems speed will be improved by aligning the AES large lookup + tables on particular boundaries. This define should be set to a power of + two giving the desired alignment. It can be left undefined if alignment + is not needed. This option is specific to the Microsoft VC++ compiler - + it seems to sometimes cause trouble for the VC++ version 6 compiler. +*/ + +#if 0 && defined(_MSC_VER) && (_MSC_VER >= 1300) +#define TABLE_ALIGN 64 +#endif + +/* 10. INTERNAL TABLE CONFIGURATION + + This cipher proceeds in a number of cycles known as 'rounds', each + implemented by a round function that can optionally be speeded + up using tables. The basic tables are each 256 32-bit words, with either + one or four tables being required for each round function depending on + how much speed is required. The encryption and decryption round functions + are different and the last encryption and decryption round functions are + different again, making four different round functions in all. + + This means that: + 1. Normal encryption and decryption rounds can each use either 0, 1 + or 4 tables and table spaces of 0, 1024 or 4096 bytes each. + 2. The last encryption and decryption rounds can also use either 0, 1 + or 4 tables and table spaces of 0, 1024 or 4096 bytes each. + + Include or exclude the appropriate definitions below to set the number + of tables used by this implementation. +*/ + +#if 1 /* set tables for the normal encryption round */ +#define ENC_ROUND FOUR_TABLES +#elif 0 +#define ENC_ROUND ONE_TABLE +#else +#define ENC_ROUND NO_TABLES +#endif + +#if 1 /* set tables for the last encryption round */ +#define LAST_ENC_ROUND FOUR_TABLES +#elif 0 +#define LAST_ENC_ROUND ONE_TABLE +#else +#define LAST_ENC_ROUND NO_TABLES +#endif + +#if 1 /* set tables for the normal decryption round */ +#define DEC_ROUND FOUR_TABLES +#elif 0 +#define DEC_ROUND ONE_TABLE +#else +#define DEC_ROUND NO_TABLES +#endif + +#if 1 /* set tables for the last decryption round */ +#define LAST_DEC_ROUND FOUR_TABLES +#elif 0 +#define LAST_DEC_ROUND ONE_TABLE +#else +#define LAST_DEC_ROUND NO_TABLES +#endif + +/* The decryption key schedule can be speeded up with tables in the same + way that the round functions can. Include or exclude the following + defines to set this requirement. +*/ +#if 1 +#define KEY_SCHED FOUR_TABLES +#elif 0 +#define KEY_SCHED ONE_TABLE +#else +#define KEY_SCHED NO_TABLES +#endif + +/* 11. TABLE POINTER CACHING + + Normally tables are referenced directly. Enable this option if you wish to + cache pointers to the tables in the encrypt/decrypt code. Note that this + only works if you are using FOUR_TABLES for the ROUND you enable this for.
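+
+   As an illustrative sketch only (the local names t0..t3 are
+   hypothetical): with caching enabled the round code first takes local
+   copies of the four sub-table addresses, for example
+
+       const aes_32t *t0 = t_use(f,n)[0], *t1 = t_use(f,n)[1],
+                     *t2 = t_use(f,n)[2], *t3 = t_use(f,n)[3];
+
+   and then indexes t0..t3 in each round rather than re-deriving the
+   table addresses from the global arrays every time.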
+*/ +#if 1 +#define ENC_ROUND_CACHE_TABLES +#endif +#if 1 +#define LAST_ENC_ROUND_CACHE_TABLES +#endif +#if 1 +#define DEC_ROUND_CACHE_TABLES +#endif +#if 1 +#define LAST_DEC_ROUND_CACHE_TABLES +#endif + + +/* END OF CONFIGURATION OPTIONS */ + +#define RC_LENGTH (5 * (AES_BLOCK_SIZE / 4 - 2)) + +/* Disable or report errors on some combinations of options */ + +#if ENC_ROUND == NO_TABLES && LAST_ENC_ROUND != NO_TABLES +#undef LAST_ENC_ROUND +#define LAST_ENC_ROUND NO_TABLES +#elif ENC_ROUND == ONE_TABLE && LAST_ENC_ROUND == FOUR_TABLES +#undef LAST_ENC_ROUND +#define LAST_ENC_ROUND ONE_TABLE +#endif + +#if ENC_ROUND == NO_TABLES && ENC_UNROLL != NONE +#undef ENC_UNROLL +#define ENC_UNROLL NONE +#endif + +#if DEC_ROUND == NO_TABLES && LAST_DEC_ROUND != NO_TABLES +#undef LAST_DEC_ROUND +#define LAST_DEC_ROUND NO_TABLES +#elif DEC_ROUND == ONE_TABLE && LAST_DEC_ROUND == FOUR_TABLES +#undef LAST_DEC_ROUND +#define LAST_DEC_ROUND ONE_TABLE +#endif + +#if DEC_ROUND == NO_TABLES && DEC_UNROLL != NONE +#undef DEC_UNROLL +#define DEC_UNROLL NONE +#endif + +/* upr(x,n): rotates bytes within words by n positions, moving bytes to + higher index positions with wrap around into low positions + ups(x,n): moves bytes by n positions to higher index positions in + words but without wrap around + bval(x,n): extracts a byte from a word + + NOTE: The definitions given here are intended only for use with + unsigned variables and with shift counts that are compile + time constants +*/ + +#if (ALGORITHM_BYTE_ORDER == BRG_LITTLE_ENDIAN) +#define upr(x,n) (((aes_32t)(x) << (8 * (n))) | ((aes_32t)(x) >> (32 - 8 * (n)))) +#define ups(x,n) ((aes_32t) (x) << (8 * (n))) +#define bval(x,n) ((aes_08t)((x) >> (8 * (n)))) +#define bytes2word(b0, b1, b2, b3) \ + (((aes_32t)(b3) << 24) | ((aes_32t)(b2) << 16) | ((aes_32t)(b1) << 8) | (b0)) +#endif + +#if (ALGORITHM_BYTE_ORDER == BRG_BIG_ENDIAN) +#define upr(x,n) (((aes_32t)(x) >> (8 * (n))) | ((aes_32t)(x) << (32 - 8 * (n)))) +#define ups(x,n) ((aes_32t) (x) >> (8 * (n))) +#define bval(x,n) ((aes_08t)((x) >> (24 - 8 * (n)))) +#define bytes2word(b0, b1, b2, b3) \ + (((aes_32t)(b0) << 24) | ((aes_32t)(b1) << 16) | ((aes_32t)(b2) << 8) | (b3)) +#endif + +#if defined(SAFE_IO) + +#define word_in(x,c) bytes2word(((aes_08t*)(x)+4*c)[0], ((aes_08t*)(x)+4*c)[1], \ + ((aes_08t*)(x)+4*c)[2], ((aes_08t*)(x)+4*c)[3]) +#define word_out(x,c,v) { ((aes_08t*)(x)+4*c)[0] = bval(v,0); ((aes_08t*)(x)+4*c)[1] = bval(v,1); \ + ((aes_08t*)(x)+4*c)[2] = bval(v,2); ((aes_08t*)(x)+4*c)[3] = bval(v,3); } + +#elif (ALGORITHM_BYTE_ORDER == PLATFORM_BYTE_ORDER) + +#define word_in(x,c) (*((aes_32t*)(x)+(c))) +#define word_out(x,c,v) (*((aes_32t*)(x)+(c)) = (v)) + +#else + +#define word_in(x,c) aes_sw32(*((aes_32t*)(x)+(c))) +#define word_out(x,c,v) (*((aes_32t*)(x)+(c)) = aes_sw32(v)) + +#endif + +/* the finite field modular polynomial and elements */ + +#define WPOLY 0x011b +#define BPOLY 0x1b + +/* multiply four bytes in GF(2^8) by 'x' {02} in parallel */ + +#define m1 0x80808080 +#define m2 0x7f7f7f7f +#define gf_mulx(x) ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY)) + +/* The following defines provide alternative definitions of gf_mulx that might + give improved performance if a fast 32-bit multiply is not available. Note + that a temporary variable u needs to be defined where gf_mulx is used.
+ +#define gf_mulx(x) (u = (x) & m1, u |= (u >> 1), ((x) & m2) << 1) ^ ((u >> 3) | (u >> 6)) +#define m4 (0x01010101 * BPOLY) +#define gf_mulx(x) (u = (x) & m1, ((x) & m2) << 1) ^ ((u - (u >> 7)) & m4) +*/ + +/* Work out which tables are needed for the different options */ + +#if defined( AES_ASM ) +#if defined( ENC_ROUND ) +#undef ENC_ROUND +#endif +#define ENC_ROUND FOUR_TABLES +#if defined( LAST_ENC_ROUND ) +#undef LAST_ENC_ROUND +#endif +#define LAST_ENC_ROUND FOUR_TABLES +#if defined( DEC_ROUND ) +#undef DEC_ROUND +#endif +#define DEC_ROUND FOUR_TABLES +#if defined( LAST_DEC_ROUND ) +#undef LAST_DEC_ROUND +#endif +#define LAST_DEC_ROUND FOUR_TABLES +#if defined( KEY_SCHED ) +#undef KEY_SCHED +#define KEY_SCHED FOUR_TABLES +#endif +#endif + +#if defined(ENCRYPTION) || defined(AES_ASM) +#if ENC_ROUND == ONE_TABLE +#define FT1_SET +#elif ENC_ROUND == FOUR_TABLES +#define FT4_SET +#else +#define SBX_SET +#endif +#if LAST_ENC_ROUND == ONE_TABLE +#define FL1_SET +#elif LAST_ENC_ROUND == FOUR_TABLES +#define FL4_SET +#elif !defined(SBX_SET) +#define SBX_SET +#endif +#endif + +#if defined(DECRYPTION) || defined(AES_ASM) +#if DEC_ROUND == ONE_TABLE +#define IT1_SET +#elif DEC_ROUND == FOUR_TABLES +#define IT4_SET +#else +#define ISB_SET +#endif +#if LAST_DEC_ROUND == ONE_TABLE +#define IL1_SET +#elif LAST_DEC_ROUND == FOUR_TABLES +#define IL4_SET +#elif !defined(ISB_SET) +#define ISB_SET +#endif +#endif + +#if defined(ENCRYPTION_KEY_SCHEDULE) || defined(DECRYPTION_KEY_SCHEDULE) +#if KEY_SCHED == ONE_TABLE +#define LS1_SET +#define IM1_SET +#elif KEY_SCHED == FOUR_TABLES +#define LS4_SET +#define IM4_SET +#elif !defined(SBX_SET) +#define SBX_SET +#endif +#endif + +/* generic definitions of Rijndael macros that use tables */ + +#define no_table(x,box,vf,rf,c) bytes2word( \ + box[bval(vf(x,0,c),rf(0,c))], \ + box[bval(vf(x,1,c),rf(1,c))], \ + box[bval(vf(x,2,c),rf(2,c))], \ + box[bval(vf(x,3,c),rf(3,c))]) + +#define one_table(x,op,tab,vf,rf,c) \ + ( tab[bval(vf(x,0,c),rf(0,c))] \ + ^ op(tab[bval(vf(x,1,c),rf(1,c))],1) \ + ^ op(tab[bval(vf(x,2,c),rf(2,c))],2) \ + ^ op(tab[bval(vf(x,3,c),rf(3,c))],3)) + +#define four_tables(x,tab,vf,rf,c) \ + ( tab[0][bval(vf(x,0,c),rf(0,c))] \ + ^ tab[1][bval(vf(x,1,c),rf(1,c))] \ + ^ tab[2][bval(vf(x,2,c),rf(2,c))] \ + ^ tab[3][bval(vf(x,3,c),rf(3,c))]) + +#define four_cached_tables(x,tab,vf,rf,c) \ +( tab##0[bval(vf(x,0,c),rf(0,c))] \ + ^ tab##1[bval(vf(x,1,c),rf(1,c))] \ + ^ tab##2[bval(vf(x,2,c),rf(2,c))] \ + ^ tab##3[bval(vf(x,3,c),rf(3,c))]) + +#define vf1(x,r,c) (x) +#define rf1(r,c) (r) +#define rf2(r,c) ((8+r-c)&3) + +/* perform forward and inverse column mix operation on four bytes in long word x in */ +/* parallel. NOTE: x must be a simple variable, NOT an expression in these macros. 
*/ + +#if defined(FM4_SET) /* not currently used */ +#define fwd_mcol(x) four_tables(x,t_use(f,m),vf1,rf1,0) +#elif defined(FM1_SET) /* not currently used */ +#define fwd_mcol(x) one_table(x,upr,t_use(f,m),vf1,rf1,0) +#else +#define dec_fmvars aes_32t g2 +#define fwd_mcol(x) (g2 = gf_mulx(x), g2 ^ upr((x) ^ g2, 3) ^ upr((x), 2) ^ upr((x), 1)) +#endif + +#if defined(IM4_SET) +#define inv_mcol(x) four_tables(x,t_use(i,m),vf1,rf1,0) +#elif defined(IM1_SET) +#define inv_mcol(x) one_table(x,upr,t_use(i,m),vf1,rf1,0) +#else +#define dec_imvars aes_32t g2, g4, g9 +#define inv_mcol(x) (g2 = gf_mulx(x), g4 = gf_mulx(g2), g9 = (x) ^ gf_mulx(g4), g4 ^= g9, \ + (x) ^ g2 ^ g4 ^ upr(g2 ^ g9, 3) ^ upr(g4, 2) ^ upr(g9, 1)) +#endif + +#if defined(FL4_SET) +#define ls_box(x,c) four_tables(x,t_use(f,l),vf1,rf2,c) +#elif defined(LS4_SET) +#define ls_box(x,c) four_tables(x,t_use(l,s),vf1,rf2,c) +#elif defined(FL1_SET) +#define ls_box(x,c) one_table(x,upr,t_use(f,l),vf1,rf2,c) +#elif defined(LS1_SET) +#define ls_box(x,c) one_table(x,upr,t_use(l,s),vf1,rf2,c) +#else +#define ls_box(x,c) no_table(x,t_use(s,box),vf1,rf2,c) +#endif + +#endif diff --git a/bsd/crypto/aes/aestab.c b/bsd/crypto/aes/aestab.c new file mode 100644 index 000000000..7997f2978 --- /dev/null +++ b/bsd/crypto/aes/aestab.c @@ -0,0 +1,384 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. 
+ --------------------------------------------------------------------------- + Issue 28/01/2004 + +*/ + +#if defined(__cplusplus) +extern "C" +{ +#endif + +#define DO_TABLES + +#include "aesopt.h" + +#if defined(FIXED_TABLES) + +#define sb_data(w) {\ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\ + w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\ + w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\ + w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\ + w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\ + w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\ + w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\ + w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\ + w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\ + w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\ + w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\ + w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\ + w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\ + w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\ + w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\ + w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\ + w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\ + w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\ + w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\ + w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\ + w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\ + w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\ + w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\ + w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\ + w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\ + w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\ + w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\ + w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) } + +#define isb_data(w) {\ + w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), w(0x38),\ + w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), w(0xd7), w(0xfb),\ + w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), w(0x2f), w(0xff), w(0x87),\ + w(0x34), w(0x8e), w(0x43), w(0x44), w(0xc4), w(0xde), w(0xe9), w(0xcb),\ + w(0x54), w(0x7b), w(0x94), w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d),\ + w(0xee), w(0x4c), w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e),\ + w(0x08), w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2),\ + w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), w(0x25),\ + w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), w(0x98), w(0x16),\ + w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), w(0x65), w(0xb6), w(0x92),\ + w(0x6c), w(0x70), w(0x48), w(0x50), w(0xfd), w(0xed), w(0xb9), w(0xda),\ + w(0x5e), w(0x15), w(0x46), w(0x57), w(0xa7), 
w(0x8d), w(0x9d), w(0x84),\ + w(0x90), w(0xd8), w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a),\ + w(0xf7), w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06),\ + w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), w(0x02),\ + w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), w(0x8a), w(0x6b),\ + w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), w(0x67), w(0xdc), w(0xea),\ + w(0x97), w(0xf2), w(0xcf), w(0xce), w(0xf0), w(0xb4), w(0xe6), w(0x73),\ + w(0x96), w(0xac), w(0x74), w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85),\ + w(0xe2), w(0xf9), w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e),\ + w(0x47), w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89),\ + w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), w(0x1b),\ + w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), w(0x79), w(0x20),\ + w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), w(0xcd), w(0x5a), w(0xf4),\ + w(0x1f), w(0xdd), w(0xa8), w(0x33), w(0x88), w(0x07), w(0xc7), w(0x31),\ + w(0xb1), w(0x12), w(0x10), w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f),\ + w(0x60), w(0x51), w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d),\ + w(0x2d), w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef),\ + w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), w(0xb0),\ + w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), w(0x99), w(0x61),\ + w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), w(0x77), w(0xd6), w(0x26),\ + w(0xe1), w(0x69), w(0x14), w(0x63), w(0x55), w(0x21), w(0x0c), w(0x7d) } + +#define mm_data(w) {\ + w(0x00), w(0x01), w(0x02), w(0x03), w(0x04), w(0x05), w(0x06), w(0x07),\ + w(0x08), w(0x09), w(0x0a), w(0x0b), w(0x0c), w(0x0d), w(0x0e), w(0x0f),\ + w(0x10), w(0x11), w(0x12), w(0x13), w(0x14), w(0x15), w(0x16), w(0x17),\ + w(0x18), w(0x19), w(0x1a), w(0x1b), w(0x1c), w(0x1d), w(0x1e), w(0x1f),\ + w(0x20), w(0x21), w(0x22), w(0x23), w(0x24), w(0x25), w(0x26), w(0x27),\ + w(0x28), w(0x29), w(0x2a), w(0x2b), w(0x2c), w(0x2d), w(0x2e), w(0x2f),\ + w(0x30), w(0x31), w(0x32), w(0x33), w(0x34), w(0x35), w(0x36), w(0x37),\ + w(0x38), w(0x39), w(0x3a), w(0x3b), w(0x3c), w(0x3d), w(0x3e), w(0x3f),\ + w(0x40), w(0x41), w(0x42), w(0x43), w(0x44), w(0x45), w(0x46), w(0x47),\ + w(0x48), w(0x49), w(0x4a), w(0x4b), w(0x4c), w(0x4d), w(0x4e), w(0x4f),\ + w(0x50), w(0x51), w(0x52), w(0x53), w(0x54), w(0x55), w(0x56), w(0x57),\ + w(0x58), w(0x59), w(0x5a), w(0x5b), w(0x5c), w(0x5d), w(0x5e), w(0x5f),\ + w(0x60), w(0x61), w(0x62), w(0x63), w(0x64), w(0x65), w(0x66), w(0x67),\ + w(0x68), w(0x69), w(0x6a), w(0x6b), w(0x6c), w(0x6d), w(0x6e), w(0x6f),\ + w(0x70), w(0x71), w(0x72), w(0x73), w(0x74), w(0x75), w(0x76), w(0x77),\ + w(0x78), w(0x79), w(0x7a), w(0x7b), w(0x7c), w(0x7d), w(0x7e), w(0x7f),\ + w(0x80), w(0x81), w(0x82), w(0x83), w(0x84), w(0x85), w(0x86), w(0x87),\ + w(0x88), w(0x89), w(0x8a), w(0x8b), w(0x8c), w(0x8d), w(0x8e), w(0x8f),\ + w(0x90), w(0x91), w(0x92), w(0x93), w(0x94), w(0x95), w(0x96), w(0x97),\ + w(0x98), w(0x99), w(0x9a), w(0x9b), w(0x9c), w(0x9d), w(0x9e), w(0x9f),\ + w(0xa0), w(0xa1), w(0xa2), w(0xa3), w(0xa4), w(0xa5), w(0xa6), w(0xa7),\ + w(0xa8), w(0xa9), w(0xaa), w(0xab), w(0xac), w(0xad), w(0xae), w(0xaf),\ + w(0xb0), w(0xb1), w(0xb2), w(0xb3), w(0xb4), w(0xb5), w(0xb6), w(0xb7),\ + w(0xb8), w(0xb9), w(0xba), w(0xbb), w(0xbc), w(0xbd), w(0xbe), w(0xbf),\ + w(0xc0), w(0xc1), w(0xc2), w(0xc3), w(0xc4), w(0xc5), w(0xc6), w(0xc7),\ + w(0xc8), w(0xc9), w(0xca), w(0xcb), w(0xcc), w(0xcd), w(0xce), w(0xcf),\ + w(0xd0), w(0xd1), w(0xd2), w(0xd3), w(0xd4), 
w(0xd5), w(0xd6), w(0xd7),\ + w(0xd8), w(0xd9), w(0xda), w(0xdb), w(0xdc), w(0xdd), w(0xde), w(0xdf),\ + w(0xe0), w(0xe1), w(0xe2), w(0xe3), w(0xe4), w(0xe5), w(0xe6), w(0xe7),\ + w(0xe8), w(0xe9), w(0xea), w(0xeb), w(0xec), w(0xed), w(0xee), w(0xef),\ + w(0xf0), w(0xf1), w(0xf2), w(0xf3), w(0xf4), w(0xf5), w(0xf6), w(0xf7),\ + w(0xf8), w(0xf9), w(0xfa), w(0xfb), w(0xfc), w(0xfd), w(0xfe), w(0xff) } + +#define rc_data(w) {\ + w(0x01), w(0x02), w(0x04), w(0x08), w(0x10), w(0x20), w(0x40), w(0x80),\ + w(0x1b), w(0x36) } + +#define h0(x) (x) + +#define w0(p) bytes2word(p, 0, 0, 0) +#define w1(p) bytes2word(0, p, 0, 0) +#define w2(p) bytes2word(0, 0, p, 0) +#define w3(p) bytes2word(0, 0, 0, p) + +#define u0(p) bytes2word(f2(p), p, p, f3(p)) +#define u1(p) bytes2word(f3(p), f2(p), p, p) +#define u2(p) bytes2word(p, f3(p), f2(p), p) +#define u3(p) bytes2word(p, p, f3(p), f2(p)) + +#define v0(p) bytes2word(fe(p), f9(p), fd(p), fb(p)) +#define v1(p) bytes2word(fb(p), fe(p), f9(p), fd(p)) +#define v2(p) bytes2word(fd(p), fb(p), fe(p), f9(p)) +#define v3(p) bytes2word(f9(p), fd(p), fb(p), fe(p)) + +#endif + +#if defined(FIXED_TABLES) || !defined(FF_TABLES) + +#define f2(x) ((x<<1) ^ (((x>>7) & 1) * WPOLY)) +#define f4(x) ((x<<2) ^ (((x>>6) & 1) * WPOLY) ^ (((x>>6) & 2) * WPOLY)) +#define f8(x) ((x<<3) ^ (((x>>5) & 1) * WPOLY) ^ (((x>>5) & 2) * WPOLY) \ + ^ (((x>>5) & 4) * WPOLY)) +#define f3(x) (f2(x) ^ x) +#define f9(x) (f8(x) ^ x) +#define fb(x) (f8(x) ^ f2(x) ^ x) +#define fd(x) (f8(x) ^ f4(x) ^ x) +#define fe(x) (f8(x) ^ f4(x) ^ f2(x)) + +#else + +#define f2(x) ((x) ? pow[log[x] + 0x19] : 0) +#define f3(x) ((x) ? pow[log[x] + 0x01] : 0) +#define f9(x) ((x) ? pow[log[x] + 0xc7] : 0) +#define fb(x) ((x) ? pow[log[x] + 0x68] : 0) +#define fd(x) ((x) ? pow[log[x] + 0xee] : 0) +#define fe(x) ((x) ? pow[log[x] + 0xdf] : 0) +#define fi(x) ((x) ? pow[ 255 - log[x]] : 0) + +#endif + +#include "aestab.h" + +#if defined(FIXED_TABLES) + +/* null routine provided in case gen_tabs() is called when fixed tables are in use */ + +void gen_tabs(void) +{ +} + +#else /* dynamic table generation */ + +#if !defined(FF_TABLES) + +/* Generate the tables for the dynamic table option + + It will generally be sensible to use tables to compute finite + field multiplies and inverses but where memory is scarce this + code might sometimes be better. But it only has effect during + initialisation so it's pretty unimportant in overall terms. +*/ + +/* return 2 ^ (n - 1) where n is the bit number of the highest bit + set in x with x in the range 1 < x < 0x00000200.
This form is + used so that locals within fi can be bytes rather than words +*/ + +static aes_08t hibit(const aes_32t x) +{ aes_08t r = (aes_08t)((x >> 1) | (x >> 2)); + + r |= (r >> 2); + r |= (r >> 4); + return (r + 1) >> 1; +} + +/* return the inverse of the finite field element x */ + +static aes_08t fi(const aes_08t x) +{ aes_08t p1 = x, p2 = BPOLY, n1 = hibit(x), n2 = 0x80, v1 = 1, v2 = 0; + + if(x < 2) return x; + + for(;;) + { + if(!n1) return v1; + + while(n2 >= n1) + { + n2 /= n1; p2 ^= p1 * n2; v2 ^= v1 * n2; n2 = hibit(p2); + } + + if(!n2) return v2; + + while(n1 >= n2) + { + n1 /= n2; p1 ^= p2 * n1; v1 ^= v2 * n1; n1 = hibit(p1); + } + } +} + +#endif + +/* The forward and inverse affine transformations used in the S-box */ + +#define fwd_affine(x) \ + (w = (aes_32t)x, w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(aes_08t)(w^(w>>8))) + +#define inv_affine(x) \ + (w = (aes_32t)x, w = (w<<1)^(w<<3)^(w<<6), 0x05^(aes_08t)(w^(w>>8))) + +static int init = 0; + +void gen_tabs(void) +{ aes_32t i, w; + +#if defined(FF_TABLES) + + aes_08t pow[512], log[256]; + + if(init) return; + /* log and power tables for GF(2^8) finite field with + WPOLY as modular polynomial - the simplest primitive + root is 0x03, used here to generate the tables + */ + + i = 0; w = 1; + do + { + pow[i] = (aes_08t)w; + pow[i + 255] = (aes_08t)w; + log[w] = (aes_08t)i++; + w ^= (w << 1) ^ (w & 0x80 ? WPOLY : 0); + } + while (w != 1); + +#else + if(init) return; +#endif + + for(i = 0, w = 1; i < RC_LENGTH; ++i) + { + t_set(r,c)[i] = bytes2word(w, 0, 0, 0); + w = f2(w); + } + + for(i = 0; i < 256; ++i) + { aes_08t b; + + b = fwd_affine(fi((aes_08t)i)); + w = bytes2word(f2(b), b, b, f3(b)); + +#if defined( SBX_SET ) + t_set(s,box)[i] = b; +#endif + +#if defined( FT1_SET ) /* tables for a normal encryption round */ + t_set(f,n)[i] = w; +#endif +#if defined( FT4_SET ) + t_set(f,n)[0][i] = w; + t_set(f,n)[1][i] = upr(w,1); + t_set(f,n)[2][i] = upr(w,2); + t_set(f,n)[3][i] = upr(w,3); +#endif + w = bytes2word(b, 0, 0, 0); + +#if defined( FL1_SET ) /* tables for last encryption round (may also */ + t_set(f,l)[i] = w; /* be used in the key schedule) */ +#endif +#if defined( FL4_SET ) + t_set(f,l)[0][i] = w; + t_set(f,l)[1][i] = upr(w,1); + t_set(f,l)[2][i] = upr(w,2); + t_set(f,l)[3][i] = upr(w,3); +#endif + +#if defined( LS1_SET ) /* table for key schedule if t_set(f,l) above is */ + t_set(l,s)[i] = w; /* not of the required form */ +#endif +#if defined( LS4_SET ) + t_set(l,s)[0][i] = w; + t_set(l,s)[1][i] = upr(w,1); + t_set(l,s)[2][i] = upr(w,2); + t_set(l,s)[3][i] = upr(w,3); +#endif + + b = fi(inv_affine((aes_08t)i)); + w = bytes2word(fe(b), f9(b), fd(b), fb(b)); + +#if defined( IM1_SET ) /* tables for the inverse mix column operation */ + t_set(i,m)[b] = w; +#endif +#if defined( IM4_SET ) + t_set(i,m)[0][b] = w; + t_set(i,m)[1][b] = upr(w,1); + t_set(i,m)[2][b] = upr(w,2); + t_set(i,m)[3][b] = upr(w,3); +#endif + +#if defined( ISB_SET ) + t_set(i,box)[i] = b; +#endif +#if defined( IT1_SET ) /* tables for a normal decryption round */ + t_set(i,n)[i] = w; +#endif +#if defined( IT4_SET ) + t_set(i,n)[0][i] = w; + t_set(i,n)[1][i] = upr(w,1); + t_set(i,n)[2][i] = upr(w,2); + t_set(i,n)[3][i] = upr(w,3); +#endif + w = bytes2word(b, 0, 0, 0); +#if defined( IL1_SET ) /* tables for last decryption round */ + t_set(i,l)[i] = w; +#endif +#if defined( IL4_SET ) + t_set(i,l)[0][i] = w; + t_set(i,l)[1][i] = upr(w,1); + t_set(i,l)[2][i] = upr(w,2); + t_set(i,l)[3][i] = upr(w,3); +#endif + } + init = 1; +} + +#endif + +#if 
defined(__cplusplus) +} +#endif + diff --git a/bsd/crypto/aes/aestab.h b/bsd/crypto/aes/aestab.h new file mode 100644 index 000000000..c610f9d43 --- /dev/null +++ b/bsd/crypto/aes/aestab.h @@ -0,0 +1,175 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 28/01/2004 + + This file contains the code for declaring the tables needed to implement + AES. The file aesopt.h is assumed to be included before this header file. + If there are no global variables, the definitions here can be used to put + the AES tables in a structure so that a pointer can then be added to the + AES context to pass them to the AES routines that need them. If this + facility is used, the calling program has to ensure that this pointer is + managed appropriately. In particular, the value of the t_dec(in,it) item + in the table structure must be set to zero in order to ensure that the + tables are initialised. In practice the three code sequences in aeskey.c + that control the calls to gen_tabs() and the gen_tabs() routine itself will + have to be changed for a specific implementation. If global variables are + available it will generally be preferable to use them with the precomputed + FIXED_TABLES option that uses static global tables. 
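+
+   For illustration: with FIXED_TABLES defined the tables are compiled
+   into the binary and no initialisation call is required; without it,
+   a caller must run
+
+       gen_tabs();
+
+   once before the first key is set so that the dynamic tables in
+   aestab.c are computed.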
+ + The following defines can be used to control the way the tables + are defined, initialised and used in embedded environments that + require special features for these purposes + + the 't_dec' construction is used to declare fixed table arrays + the 't_set' construction is used to set fixed table values + the 't_use' construction is used to access fixed table values + + 256 byte tables: + + t_xxx(s,box) => forward S box + t_xxx(i,box) => inverse S box + + 256 32-bit word OR 4 x 256 32-bit word tables: + + t_xxx(f,n) => forward normal round + t_xxx(f,l) => forward last round + t_xxx(i,n) => inverse normal round + t_xxx(i,l) => inverse last round + t_xxx(l,s) => key schedule table + t_xxx(i,m) => key schedule table + + Other variables and tables: + + t_xxx(r,c) => the rcon table +*/ + +#if !defined( _AESTAB_H ) +#define _AESTAB_H + +#define t_dec(m,n) t_##m##n +#define t_set(m,n) t_##m##n +#define t_use(m,n) t_##m##n + +#if defined(FIXED_TABLES) +#define Const const +#else +#define Const +#endif + +#if defined(DO_TABLES) +#define Extern +#else +#define Extern extern +#endif + +#if defined(_MSC_VER) && defined(TABLE_ALIGN) +#define Align __declspec(align(TABLE_ALIGN)) +#else +#define Align +#endif + +#if defined(__cplusplus) +extern "C" +{ +#endif + +#if defined(DO_TABLES) && defined(FIXED_TABLES) +#define d_1(t,n,b,e) Align Const t n[256] = b(e) +#define d_4(t,n,b,e,f,g,h) Align Const t n[4][256] = { b(e), b(f), b(g), b(h) } +Extern Align Const aes_32t t_dec(r,c)[RC_LENGTH] = rc_data(w0); +#else +#define d_1(t,n,b,e) Extern Align Const t n[256] +#define d_4(t,n,b,e,f,g,h) Extern Align Const t n[4][256] +Extern Align Const aes_32t t_dec(r,c)[RC_LENGTH]; +#endif + +#if defined( SBX_SET ) + d_1(aes_08t, t_dec(s,box), sb_data, h0); +#endif +#if defined( ISB_SET ) + d_1(aes_08t, t_dec(i,box), isb_data, h0); +#endif + +#if defined( FT1_SET ) + d_1(aes_32t, t_dec(f,n), sb_data, u0); +#endif +#if defined( FT4_SET ) + d_4(aes_32t, t_dec(f,n), sb_data, u0, u1, u2, u3); +#endif + +#if defined( FL1_SET ) + d_1(aes_32t, t_dec(f,l), sb_data, w0); +#endif +#if defined( FL4_SET ) + d_4(aes_32t, t_dec(f,l), sb_data, w0, w1, w2, w3); +#endif + +#if defined( IT1_SET ) + d_1(aes_32t, t_dec(i,n), isb_data, v0); +#endif +#if defined( IT4_SET ) + d_4(aes_32t, t_dec(i,n), isb_data, v0, v1, v2, v3); +#endif + +#if defined( IL1_SET ) + d_1(aes_32t, t_dec(i,l), isb_data, w0); +#endif +#if defined( IL4_SET ) + d_4(aes_32t, t_dec(i,l), isb_data, w0, w1, w2, w3); +#endif + +#if defined( LS1_SET ) +#if defined( FL1_SET ) +#undef LS1_SET +#else + d_1(aes_32t, t_dec(l,s), sb_data, w0); +#endif +#endif + +#if defined( LS4_SET ) +#if defined( FL4_SET ) +#undef LS4_SET +#else + d_4(aes_32t, t_dec(l,s), sb_data, w0, w1, w2, w3); +#endif +#endif + +#if defined( IM1_SET ) + d_1(aes_32t, t_dec(i,m), mm_data, v0); +#endif +#if defined( IM4_SET ) + d_4(aes_32t, t_dec(i,m), mm_data, v0, v1, v2, v3); +#endif + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/bsd/crypto/blowfish/Makefile b/bsd/crypto/blowfish/Makefile index e4885864b..0521cc6fd 100644 --- a/bsd/crypto/blowfish/Makefile +++ b/bsd/crypto/blowfish/Makefile @@ -26,7 +26,7 @@ INSTALL_MI_DIR = crypto EXPORT_MI_DIR = ${INSTALL_MI_DIR} -INSTALL_MI_LCL_KERN_LIST = ${PRIVATE_DATAFILES} +INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES} include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/crypto/blowfish/blowfish.h b/bsd/crypto/blowfish/blowfish.h index 69b902426..121e9c394 100644 --- a/bsd/crypto/blowfish/blowfish.h +++ b/bsd/crypto/blowfish/blowfish.h @@ 
-80,9 +80,9 @@ typedef struct bf_key_st { BF_LONG S[4*256]; } BF_KEY; -void BF_set_key __P((BF_KEY *, int, unsigned char *)); -void BF_encrypt __P((BF_LONG *, BF_KEY *)); -void BF_decrypt __P((BF_LONG *, BF_KEY *)); +void BF_set_key(BF_KEY *, int, unsigned char *); +void BF_encrypt(BF_LONG *, BF_KEY *); +void BF_decrypt(BF_LONG *, BF_KEY *); void BF_cbc_encrypt(const unsigned char *, unsigned char *, long, const BF_KEY *, unsigned char *, int); diff --git a/bsd/crypto/cast128/Makefile b/bsd/crypto/cast128/Makefile index d67b89e11..d214498bb 100644 --- a/bsd/crypto/cast128/Makefile +++ b/bsd/crypto/cast128/Makefile @@ -26,7 +26,7 @@ INSTALL_MI_DIR = crypto EXPORT_MI_DIR = ${INSTALL_MI_DIR} -INSTALL_MI_LCL_KERN_LIST = ${PRIVATE_DATAFILES} +INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES} include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/crypto/cast128/cast128.h b/bsd/crypto/cast128/cast128.h index 2dc90d318..d79eea55c 100644 --- a/bsd/crypto/cast128/cast128.h +++ b/bsd/crypto/cast128/cast128.h @@ -46,14 +46,10 @@ #define CAST128_DECRYPT 0 -extern void set_cast128_subkey __P((u_int32_t *, u_int8_t *, int)); -extern void cast128_encrypt_round16 __P((u_int8_t *, const u_int8_t *, - u_int32_t *)); -extern void cast128_decrypt_round16 __P((u_int8_t *, const u_int8_t *, - u_int32_t *)); -extern void cast128_encrypt_round12 __P((u_int8_t *, const u_int8_t *, - u_int32_t *)); -extern void cast128_decrypt_round12 __P((u_int8_t *, const u_int8_t *, - u_int32_t *)); +extern void set_cast128_subkey(u_int32_t *, u_int8_t *, int); +extern void cast128_encrypt_round16(u_int8_t *, const u_int8_t *, u_int32_t *); +extern void cast128_decrypt_round16(u_int8_t *, const u_int8_t *, u_int32_t *); +extern void cast128_encrypt_round12(u_int8_t *, const u_int8_t *, u_int32_t *); +extern void cast128_decrypt_round12(u_int8_t *, const u_int8_t *, u_int32_t *); #endif diff --git a/bsd/crypto/des/Makefile b/bsd/crypto/des/Makefile index 58c13ed5f..08483b21a 100644 --- a/bsd/crypto/des/Makefile +++ b/bsd/crypto/des/Makefile @@ -26,7 +26,7 @@ INSTALL_MI_DIR = crypto EXPORT_MI_DIR = ${INSTALL_MI_DIR} -INSTALL_MI_LCL_KERN_LIST = ${PRIVATE_DATAFILES} +INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES} include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/crypto/des/des.h b/bsd/crypto/des/des.h index a21b6bfa2..9f232b185 100644 --- a/bsd/crypto/des/des.h +++ b/bsd/crypto/des/des.h @@ -81,35 +81,34 @@ typedef struct des_ks_struct extern int des_check_key; /* defaults to false */ -char *des_options __P((void)); -void des_ecb_encrypt __P((des_cblock *, des_cblock *, - des_key_schedule, int)); +char *des_options(void); +void des_ecb_encrypt(des_cblock *, des_cblock *, des_key_schedule, int); -void des_encrypt1 __P((DES_LONG *, des_key_schedule, int)); -void des_encrypt2 __P((DES_LONG *, des_key_schedule, int)); -void des_encrypt3 __P((DES_LONG *, des_key_schedule, des_key_schedule, - des_key_schedule)); -void des_decrypt3 __P((DES_LONG *, des_key_schedule, des_key_schedule, - des_key_schedule)); +void des_encrypt1(DES_LONG *, des_key_schedule, int); +void des_encrypt2(DES_LONG *, des_key_schedule, int); +void des_encrypt3(DES_LONG *, des_key_schedule, des_key_schedule, + des_key_schedule); +void des_decrypt3(DES_LONG *, des_key_schedule, des_key_schedule, + des_key_schedule); -void des_ecb3_encrypt __P((des_cblock *, des_cblock *, des_key_schedule, - des_key_schedule, des_key_schedule, int)); +void des_ecb3_encrypt(des_cblock *, des_cblock *, des_key_schedule, + des_key_schedule, des_key_schedule, int); -void 
des_ncbc_encrypt __P((const unsigned char *, unsigned char *, long, - des_key_schedule, des_cblock *, int)); +void des_ncbc_encrypt(const unsigned char *, unsigned char *, long, + des_key_schedule, des_cblock *, int); void des_ede3_cbc_encrypt(const unsigned char *, unsigned char *, long, des_key_schedule, des_key_schedule, des_key_schedule, des_cblock *, int); -void des_set_odd_parity __P((des_cblock *)); -void des_fixup_key_parity __P((des_cblock *)); -int des_is_weak_key __P((des_cblock *)); -int des_set_key __P((des_cblock *, des_key_schedule)); -int des_key_sched __P((des_cblock *, des_key_schedule)); -int des_set_key_checked __P((des_cblock *, des_key_schedule)); -void des_set_key_unchecked __P((des_cblock *, des_key_schedule)); -int des_check_key_parity __P((des_cblock *)); +void des_set_odd_parity(des_cblock *); +void des_fixup_key_parity(des_cblock *); +int des_is_weak_key(des_cblock *); +int des_set_key(des_cblock *, des_key_schedule); +int des_key_sched(des_cblock *, des_key_schedule); +int des_set_key_checked(des_cblock *, des_key_schedule); +void des_set_key_unchecked(des_cblock *, des_key_schedule); +int des_check_key_parity(des_cblock *); #ifdef __cplusplus } diff --git a/bsd/crypto/md5.c b/bsd/crypto/md5.c index 6fd600bd9..734232dac 100644 --- a/bsd/crypto/md5.c +++ b/bsd/crypto/md5.c @@ -127,7 +127,7 @@ static const u_int8_t md5_paddat[MD5_BUFLEN] = { 0, 0, 0, 0, 0, 0, 0, 0, }; -static void md5_calc __P((u_int8_t *, md5_ctxt *)); +static void md5_calc(u_int8_t *, md5_ctxt *); void md5_init(ctxt) md5_ctxt *ctxt; diff --git a/bsd/crypto/md5.h b/bsd/crypto/md5.h index 3d02afe6b..8a99300b8 100644 --- a/bsd/crypto/md5.h +++ b/bsd/crypto/md5.h @@ -58,10 +58,10 @@ typedef struct { u_int8_t md5_buf[MD5_BUFLEN]; } md5_ctxt; -extern void md5_init __P((md5_ctxt *)); -extern void md5_loop __P((md5_ctxt *, u_int8_t *, u_int)); -extern void md5_pad __P((md5_ctxt *)); -extern void md5_result __P((u_int8_t *, md5_ctxt *)); +extern void md5_init(md5_ctxt *); +extern void md5_loop(md5_ctxt *, u_int8_t *, u_int); +extern void md5_pad(md5_ctxt *); +extern void md5_result(u_int8_t *, md5_ctxt *); /* compatibility */ #define MD5_CTX md5_ctxt diff --git a/bsd/crypto/rc4/Makefile b/bsd/crypto/rc4/Makefile index 09d432842..23432a57e 100644 --- a/bsd/crypto/rc4/Makefile +++ b/bsd/crypto/rc4/Makefile @@ -26,7 +26,7 @@ INSTALL_MI_DIR = crypto EXPORT_MI_DIR = ${INSTALL_MI_DIR} -INSTALL_MI_LCL_KERN_LIST = ${PRIVATE_DATAFILES} +INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES} include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/crypto/rijndael/boxes-fst.dat b/bsd/crypto/rijndael/boxes-fst.dat deleted file mode 100644 index 8b9e26c33..000000000 --- a/bsd/crypto/rijndael/boxes-fst.dat +++ /dev/null @@ -1,958 +0,0 @@ -/* $FreeBSD: src/sys/crypto/rijndael/boxes-fst.dat,v 1.2.2.1 2001/07/03 11:01:35 ume Exp $ */ -/* $KAME: boxes-fst.dat,v 1.6 2001/05/27 00:23:22 itojun Exp $ */ - -const word8 S[256] = { - 99, 124, 119, 123, 242, 107, 111, 197, 48, 1, 103, 43, 254, 215, 171, 118, -202, 130, 201, 125, 250, 89, 71, 240, 173, 212, 162, 175, 156, 164, 114, 192, -183, 253, 147, 38, 54, 63, 247, 204, 52, 165, 229, 241, 113, 216, 49, 21, - 4, 199, 35, 195, 24, 150, 5, 154, 7, 18, 128, 226, 235, 39, 178, 117, - 9, 131, 44, 26, 27, 110, 90, 160, 82, 59, 214, 179, 41, 227, 47, 132, - 83, 209, 0, 237, 32, 252, 177, 91, 106, 203, 190, 57, 74, 76, 88, 207, -208, 239, 170, 251, 67, 77, 51, 133, 69, 249, 2, 127, 80, 60, 159, 168, - 81, 163, 64, 143, 146, 157, 56, 245, 188, 182, 218, 33, 16, 255, 243, 210, -205, 12, 
19, 236, 95, 151, 68, 23, 196, 167, 126, 61, 100, 93, 25, 115, - 96, 129, 79, 220, 34, 42, 144, 136, 70, 238, 184, 20, 222, 94, 11, 219, -224, 50, 58, 10, 73, 6, 36, 92, 194, 211, 172, 98, 145, 149, 228, 121, -231, 200, 55, 109, 141, 213, 78, 169, 108, 86, 244, 234, 101, 122, 174, 8, -186, 120, 37, 46, 28, 166, 180, 198, 232, 221, 116, 31, 75, 189, 139, 138, -112, 62, 181, 102, 72, 3, 246, 14, 97, 53, 87, 185, 134, 193, 29, 158, -225, 248, 152, 17, 105, 217, 142, 148, 155, 30, 135, 233, 206, 85, 40, 223, -140, 161, 137, 13, 191, 230, 66, 104, 65, 153, 45, 15, 176, 84, 187, 22 -}; - -#ifdef INTERMEDIATE_VALUE_KAT -static const word8 Si[256] = { - 82, 9, 106, 213, 48, 54, 165, 56, 191, 64, 163, 158, 129, 243, 215, 251, -124, 227, 57, 130, 155, 47, 255, 135, 52, 142, 67, 68, 196, 222, 233, 203, - 84, 123, 148, 50, 166, 194, 35, 61, 238, 76, 149, 11, 66, 250, 195, 78, - 8, 46, 161, 102, 40, 217, 36, 178, 118, 91, 162, 73, 109, 139, 209, 37, -114, 248, 246, 100, 134, 104, 152, 22, 212, 164, 92, 204, 93, 101, 182, 146, -108, 112, 72, 80, 253, 237, 185, 218, 94, 21, 70, 87, 167, 141, 157, 132, -144, 216, 171, 0, 140, 188, 211, 10, 247, 228, 88, 5, 184, 179, 69, 6, -208, 44, 30, 143, 202, 63, 15, 2, 193, 175, 189, 3, 1, 19, 138, 107, - 58, 145, 17, 65, 79, 103, 220, 234, 151, 242, 207, 206, 240, 180, 230, 115, -150, 172, 116, 34, 231, 173, 53, 133, 226, 249, 55, 232, 28, 117, 223, 110, - 71, 241, 26, 113, 29, 41, 197, 137, 111, 183, 98, 14, 170, 24, 190, 27, -252, 86, 62, 75, 198, 210, 121, 32, 154, 219, 192, 254, 120, 205, 90, 244, - 31, 221, 168, 51, 136, 7, 199, 49, 177, 18, 16, 89, 39, 128, 236, 95, - 96, 81, 127, 169, 25, 181, 74, 13, 45, 229, 122, 159, 147, 201, 156, 239, -160, 224, 59, 77, 174, 42, 245, 176, 200, 235, 187, 60, 131, 83, 153, 97, - 23, 43, 4, 126, 186, 119, 214, 38, 225, 105, 20, 99, 85, 33, 12, 125 -}; -#endif /* INTERMEDIATE_VALUE_KAT */ - -union xtab { - word32 xt32[256]; - word8 xt8[256][4]; -}; - -static const union xtab xT1 = { - .xt8 = { -{0xc6,0x63,0x63,0xa5}, {0xf8,0x7c,0x7c,0x84}, {0xee,0x77,0x77,0x99}, {0xf6,0x7b,0x7b,0x8d}, -{0xff,0xf2,0xf2,0x0d}, {0xd6,0x6b,0x6b,0xbd}, {0xde,0x6f,0x6f,0xb1}, {0x91,0xc5,0xc5,0x54}, -{0x60,0x30,0x30,0x50}, {0x02,0x01,0x01,0x03}, {0xce,0x67,0x67,0xa9}, {0x56,0x2b,0x2b,0x7d}, -{0xe7,0xfe,0xfe,0x19}, {0xb5,0xd7,0xd7,0x62}, {0x4d,0xab,0xab,0xe6}, {0xec,0x76,0x76,0x9a}, -{0x8f,0xca,0xca,0x45}, {0x1f,0x82,0x82,0x9d}, {0x89,0xc9,0xc9,0x40}, {0xfa,0x7d,0x7d,0x87}, -{0xef,0xfa,0xfa,0x15}, {0xb2,0x59,0x59,0xeb}, {0x8e,0x47,0x47,0xc9}, {0xfb,0xf0,0xf0,0x0b}, -{0x41,0xad,0xad,0xec}, {0xb3,0xd4,0xd4,0x67}, {0x5f,0xa2,0xa2,0xfd}, {0x45,0xaf,0xaf,0xea}, -{0x23,0x9c,0x9c,0xbf}, {0x53,0xa4,0xa4,0xf7}, {0xe4,0x72,0x72,0x96}, {0x9b,0xc0,0xc0,0x5b}, -{0x75,0xb7,0xb7,0xc2}, {0xe1,0xfd,0xfd,0x1c}, {0x3d,0x93,0x93,0xae}, {0x4c,0x26,0x26,0x6a}, -{0x6c,0x36,0x36,0x5a}, {0x7e,0x3f,0x3f,0x41}, {0xf5,0xf7,0xf7,0x02}, {0x83,0xcc,0xcc,0x4f}, -{0x68,0x34,0x34,0x5c}, {0x51,0xa5,0xa5,0xf4}, {0xd1,0xe5,0xe5,0x34}, {0xf9,0xf1,0xf1,0x08}, -{0xe2,0x71,0x71,0x93}, {0xab,0xd8,0xd8,0x73}, {0x62,0x31,0x31,0x53}, {0x2a,0x15,0x15,0x3f}, -{0x08,0x04,0x04,0x0c}, {0x95,0xc7,0xc7,0x52}, {0x46,0x23,0x23,0x65}, {0x9d,0xc3,0xc3,0x5e}, -{0x30,0x18,0x18,0x28}, {0x37,0x96,0x96,0xa1}, {0x0a,0x05,0x05,0x0f}, {0x2f,0x9a,0x9a,0xb5}, -{0x0e,0x07,0x07,0x09}, {0x24,0x12,0x12,0x36}, {0x1b,0x80,0x80,0x9b}, {0xdf,0xe2,0xe2,0x3d}, -{0xcd,0xeb,0xeb,0x26}, {0x4e,0x27,0x27,0x69}, {0x7f,0xb2,0xb2,0xcd}, {0xea,0x75,0x75,0x9f}, -{0x12,0x09,0x09,0x1b}, {0x1d,0x83,0x83,0x9e}, {0x58,0x2c,0x2c,0x74}, 
{0x34,0x1a,0x1a,0x2e}, -{0x36,0x1b,0x1b,0x2d}, {0xdc,0x6e,0x6e,0xb2}, {0xb4,0x5a,0x5a,0xee}, {0x5b,0xa0,0xa0,0xfb}, -{0xa4,0x52,0x52,0xf6}, {0x76,0x3b,0x3b,0x4d}, {0xb7,0xd6,0xd6,0x61}, {0x7d,0xb3,0xb3,0xce}, -{0x52,0x29,0x29,0x7b}, {0xdd,0xe3,0xe3,0x3e}, {0x5e,0x2f,0x2f,0x71}, {0x13,0x84,0x84,0x97}, -{0xa6,0x53,0x53,0xf5}, {0xb9,0xd1,0xd1,0x68}, {0x00,0x00,0x00,0x00}, {0xc1,0xed,0xed,0x2c}, -{0x40,0x20,0x20,0x60}, {0xe3,0xfc,0xfc,0x1f}, {0x79,0xb1,0xb1,0xc8}, {0xb6,0x5b,0x5b,0xed}, -{0xd4,0x6a,0x6a,0xbe}, {0x8d,0xcb,0xcb,0x46}, {0x67,0xbe,0xbe,0xd9}, {0x72,0x39,0x39,0x4b}, -{0x94,0x4a,0x4a,0xde}, {0x98,0x4c,0x4c,0xd4}, {0xb0,0x58,0x58,0xe8}, {0x85,0xcf,0xcf,0x4a}, -{0xbb,0xd0,0xd0,0x6b}, {0xc5,0xef,0xef,0x2a}, {0x4f,0xaa,0xaa,0xe5}, {0xed,0xfb,0xfb,0x16}, -{0x86,0x43,0x43,0xc5}, {0x9a,0x4d,0x4d,0xd7}, {0x66,0x33,0x33,0x55}, {0x11,0x85,0x85,0x94}, -{0x8a,0x45,0x45,0xcf}, {0xe9,0xf9,0xf9,0x10}, {0x04,0x02,0x02,0x06}, {0xfe,0x7f,0x7f,0x81}, -{0xa0,0x50,0x50,0xf0}, {0x78,0x3c,0x3c,0x44}, {0x25,0x9f,0x9f,0xba}, {0x4b,0xa8,0xa8,0xe3}, -{0xa2,0x51,0x51,0xf3}, {0x5d,0xa3,0xa3,0xfe}, {0x80,0x40,0x40,0xc0}, {0x05,0x8f,0x8f,0x8a}, -{0x3f,0x92,0x92,0xad}, {0x21,0x9d,0x9d,0xbc}, {0x70,0x38,0x38,0x48}, {0xf1,0xf5,0xf5,0x04}, -{0x63,0xbc,0xbc,0xdf}, {0x77,0xb6,0xb6,0xc1}, {0xaf,0xda,0xda,0x75}, {0x42,0x21,0x21,0x63}, -{0x20,0x10,0x10,0x30}, {0xe5,0xff,0xff,0x1a}, {0xfd,0xf3,0xf3,0x0e}, {0xbf,0xd2,0xd2,0x6d}, -{0x81,0xcd,0xcd,0x4c}, {0x18,0x0c,0x0c,0x14}, {0x26,0x13,0x13,0x35}, {0xc3,0xec,0xec,0x2f}, -{0xbe,0x5f,0x5f,0xe1}, {0x35,0x97,0x97,0xa2}, {0x88,0x44,0x44,0xcc}, {0x2e,0x17,0x17,0x39}, -{0x93,0xc4,0xc4,0x57}, {0x55,0xa7,0xa7,0xf2}, {0xfc,0x7e,0x7e,0x82}, {0x7a,0x3d,0x3d,0x47}, -{0xc8,0x64,0x64,0xac}, {0xba,0x5d,0x5d,0xe7}, {0x32,0x19,0x19,0x2b}, {0xe6,0x73,0x73,0x95}, -{0xc0,0x60,0x60,0xa0}, {0x19,0x81,0x81,0x98}, {0x9e,0x4f,0x4f,0xd1}, {0xa3,0xdc,0xdc,0x7f}, -{0x44,0x22,0x22,0x66}, {0x54,0x2a,0x2a,0x7e}, {0x3b,0x90,0x90,0xab}, {0x0b,0x88,0x88,0x83}, -{0x8c,0x46,0x46,0xca}, {0xc7,0xee,0xee,0x29}, {0x6b,0xb8,0xb8,0xd3}, {0x28,0x14,0x14,0x3c}, -{0xa7,0xde,0xde,0x79}, {0xbc,0x5e,0x5e,0xe2}, {0x16,0x0b,0x0b,0x1d}, {0xad,0xdb,0xdb,0x76}, -{0xdb,0xe0,0xe0,0x3b}, {0x64,0x32,0x32,0x56}, {0x74,0x3a,0x3a,0x4e}, {0x14,0x0a,0x0a,0x1e}, -{0x92,0x49,0x49,0xdb}, {0x0c,0x06,0x06,0x0a}, {0x48,0x24,0x24,0x6c}, {0xb8,0x5c,0x5c,0xe4}, -{0x9f,0xc2,0xc2,0x5d}, {0xbd,0xd3,0xd3,0x6e}, {0x43,0xac,0xac,0xef}, {0xc4,0x62,0x62,0xa6}, -{0x39,0x91,0x91,0xa8}, {0x31,0x95,0x95,0xa4}, {0xd3,0xe4,0xe4,0x37}, {0xf2,0x79,0x79,0x8b}, -{0xd5,0xe7,0xe7,0x32}, {0x8b,0xc8,0xc8,0x43}, {0x6e,0x37,0x37,0x59}, {0xda,0x6d,0x6d,0xb7}, -{0x01,0x8d,0x8d,0x8c}, {0xb1,0xd5,0xd5,0x64}, {0x9c,0x4e,0x4e,0xd2}, {0x49,0xa9,0xa9,0xe0}, -{0xd8,0x6c,0x6c,0xb4}, {0xac,0x56,0x56,0xfa}, {0xf3,0xf4,0xf4,0x07}, {0xcf,0xea,0xea,0x25}, -{0xca,0x65,0x65,0xaf}, {0xf4,0x7a,0x7a,0x8e}, {0x47,0xae,0xae,0xe9}, {0x10,0x08,0x08,0x18}, -{0x6f,0xba,0xba,0xd5}, {0xf0,0x78,0x78,0x88}, {0x4a,0x25,0x25,0x6f}, {0x5c,0x2e,0x2e,0x72}, -{0x38,0x1c,0x1c,0x24}, {0x57,0xa6,0xa6,0xf1}, {0x73,0xb4,0xb4,0xc7}, {0x97,0xc6,0xc6,0x51}, -{0xcb,0xe8,0xe8,0x23}, {0xa1,0xdd,0xdd,0x7c}, {0xe8,0x74,0x74,0x9c}, {0x3e,0x1f,0x1f,0x21}, -{0x96,0x4b,0x4b,0xdd}, {0x61,0xbd,0xbd,0xdc}, {0x0d,0x8b,0x8b,0x86}, {0x0f,0x8a,0x8a,0x85}, -{0xe0,0x70,0x70,0x90}, {0x7c,0x3e,0x3e,0x42}, {0x71,0xb5,0xb5,0xc4}, {0xcc,0x66,0x66,0xaa}, -{0x90,0x48,0x48,0xd8}, {0x06,0x03,0x03,0x05}, {0xf7,0xf6,0xf6,0x01}, {0x1c,0x0e,0x0e,0x12}, -{0xc2,0x61,0x61,0xa3}, {0x6a,0x35,0x35,0x5f}, {0xae,0x57,0x57,0xf9}, 
{0x69,0xb9,0xb9,0xd0}, -{0x17,0x86,0x86,0x91}, {0x99,0xc1,0xc1,0x58}, {0x3a,0x1d,0x1d,0x27}, {0x27,0x9e,0x9e,0xb9}, -{0xd9,0xe1,0xe1,0x38}, {0xeb,0xf8,0xf8,0x13}, {0x2b,0x98,0x98,0xb3}, {0x22,0x11,0x11,0x33}, -{0xd2,0x69,0x69,0xbb}, {0xa9,0xd9,0xd9,0x70}, {0x07,0x8e,0x8e,0x89}, {0x33,0x94,0x94,0xa7}, -{0x2d,0x9b,0x9b,0xb6}, {0x3c,0x1e,0x1e,0x22}, {0x15,0x87,0x87,0x92}, {0xc9,0xe9,0xe9,0x20}, -{0x87,0xce,0xce,0x49}, {0xaa,0x55,0x55,0xff}, {0x50,0x28,0x28,0x78}, {0xa5,0xdf,0xdf,0x7a}, -{0x03,0x8c,0x8c,0x8f}, {0x59,0xa1,0xa1,0xf8}, {0x09,0x89,0x89,0x80}, {0x1a,0x0d,0x0d,0x17}, -{0x65,0xbf,0xbf,0xda}, {0xd7,0xe6,0xe6,0x31}, {0x84,0x42,0x42,0xc6}, {0xd0,0x68,0x68,0xb8}, -{0x82,0x41,0x41,0xc3}, {0x29,0x99,0x99,0xb0}, {0x5a,0x2d,0x2d,0x77}, {0x1e,0x0f,0x0f,0x11}, -{0x7b,0xb0,0xb0,0xcb}, {0xa8,0x54,0x54,0xfc}, {0x6d,0xbb,0xbb,0xd6}, {0x2c,0x16,0x16,0x3a} - } -}; -#define T1 xT1.xt8 - -static const union xtab xT2 = { - .xt8 = { -{0xa5,0xc6,0x63,0x63}, {0x84,0xf8,0x7c,0x7c}, {0x99,0xee,0x77,0x77}, {0x8d,0xf6,0x7b,0x7b}, -{0x0d,0xff,0xf2,0xf2}, {0xbd,0xd6,0x6b,0x6b}, {0xb1,0xde,0x6f,0x6f}, {0x54,0x91,0xc5,0xc5}, -{0x50,0x60,0x30,0x30}, {0x03,0x02,0x01,0x01}, {0xa9,0xce,0x67,0x67}, {0x7d,0x56,0x2b,0x2b}, -{0x19,0xe7,0xfe,0xfe}, {0x62,0xb5,0xd7,0xd7}, {0xe6,0x4d,0xab,0xab}, {0x9a,0xec,0x76,0x76}, -{0x45,0x8f,0xca,0xca}, {0x9d,0x1f,0x82,0x82}, {0x40,0x89,0xc9,0xc9}, {0x87,0xfa,0x7d,0x7d}, -{0x15,0xef,0xfa,0xfa}, {0xeb,0xb2,0x59,0x59}, {0xc9,0x8e,0x47,0x47}, {0x0b,0xfb,0xf0,0xf0}, -{0xec,0x41,0xad,0xad}, {0x67,0xb3,0xd4,0xd4}, {0xfd,0x5f,0xa2,0xa2}, {0xea,0x45,0xaf,0xaf}, -{0xbf,0x23,0x9c,0x9c}, {0xf7,0x53,0xa4,0xa4}, {0x96,0xe4,0x72,0x72}, {0x5b,0x9b,0xc0,0xc0}, -{0xc2,0x75,0xb7,0xb7}, {0x1c,0xe1,0xfd,0xfd}, {0xae,0x3d,0x93,0x93}, {0x6a,0x4c,0x26,0x26}, -{0x5a,0x6c,0x36,0x36}, {0x41,0x7e,0x3f,0x3f}, {0x02,0xf5,0xf7,0xf7}, {0x4f,0x83,0xcc,0xcc}, -{0x5c,0x68,0x34,0x34}, {0xf4,0x51,0xa5,0xa5}, {0x34,0xd1,0xe5,0xe5}, {0x08,0xf9,0xf1,0xf1}, -{0x93,0xe2,0x71,0x71}, {0x73,0xab,0xd8,0xd8}, {0x53,0x62,0x31,0x31}, {0x3f,0x2a,0x15,0x15}, -{0x0c,0x08,0x04,0x04}, {0x52,0x95,0xc7,0xc7}, {0x65,0x46,0x23,0x23}, {0x5e,0x9d,0xc3,0xc3}, -{0x28,0x30,0x18,0x18}, {0xa1,0x37,0x96,0x96}, {0x0f,0x0a,0x05,0x05}, {0xb5,0x2f,0x9a,0x9a}, -{0x09,0x0e,0x07,0x07}, {0x36,0x24,0x12,0x12}, {0x9b,0x1b,0x80,0x80}, {0x3d,0xdf,0xe2,0xe2}, -{0x26,0xcd,0xeb,0xeb}, {0x69,0x4e,0x27,0x27}, {0xcd,0x7f,0xb2,0xb2}, {0x9f,0xea,0x75,0x75}, -{0x1b,0x12,0x09,0x09}, {0x9e,0x1d,0x83,0x83}, {0x74,0x58,0x2c,0x2c}, {0x2e,0x34,0x1a,0x1a}, -{0x2d,0x36,0x1b,0x1b}, {0xb2,0xdc,0x6e,0x6e}, {0xee,0xb4,0x5a,0x5a}, {0xfb,0x5b,0xa0,0xa0}, -{0xf6,0xa4,0x52,0x52}, {0x4d,0x76,0x3b,0x3b}, {0x61,0xb7,0xd6,0xd6}, {0xce,0x7d,0xb3,0xb3}, -{0x7b,0x52,0x29,0x29}, {0x3e,0xdd,0xe3,0xe3}, {0x71,0x5e,0x2f,0x2f}, {0x97,0x13,0x84,0x84}, -{0xf5,0xa6,0x53,0x53}, {0x68,0xb9,0xd1,0xd1}, {0x00,0x00,0x00,0x00}, {0x2c,0xc1,0xed,0xed}, -{0x60,0x40,0x20,0x20}, {0x1f,0xe3,0xfc,0xfc}, {0xc8,0x79,0xb1,0xb1}, {0xed,0xb6,0x5b,0x5b}, -{0xbe,0xd4,0x6a,0x6a}, {0x46,0x8d,0xcb,0xcb}, {0xd9,0x67,0xbe,0xbe}, {0x4b,0x72,0x39,0x39}, -{0xde,0x94,0x4a,0x4a}, {0xd4,0x98,0x4c,0x4c}, {0xe8,0xb0,0x58,0x58}, {0x4a,0x85,0xcf,0xcf}, -{0x6b,0xbb,0xd0,0xd0}, {0x2a,0xc5,0xef,0xef}, {0xe5,0x4f,0xaa,0xaa}, {0x16,0xed,0xfb,0xfb}, -{0xc5,0x86,0x43,0x43}, {0xd7,0x9a,0x4d,0x4d}, {0x55,0x66,0x33,0x33}, {0x94,0x11,0x85,0x85}, -{0xcf,0x8a,0x45,0x45}, {0x10,0xe9,0xf9,0xf9}, {0x06,0x04,0x02,0x02}, {0x81,0xfe,0x7f,0x7f}, -{0xf0,0xa0,0x50,0x50}, {0x44,0x78,0x3c,0x3c}, {0xba,0x25,0x9f,0x9f}, {0xe3,0x4b,0xa8,0xa8}, 
-{0xf3,0xa2,0x51,0x51}, {0xfe,0x5d,0xa3,0xa3}, {0xc0,0x80,0x40,0x40}, {0x8a,0x05,0x8f,0x8f}, -{0xad,0x3f,0x92,0x92}, {0xbc,0x21,0x9d,0x9d}, {0x48,0x70,0x38,0x38}, {0x04,0xf1,0xf5,0xf5}, -{0xdf,0x63,0xbc,0xbc}, {0xc1,0x77,0xb6,0xb6}, {0x75,0xaf,0xda,0xda}, {0x63,0x42,0x21,0x21}, -{0x30,0x20,0x10,0x10}, {0x1a,0xe5,0xff,0xff}, {0x0e,0xfd,0xf3,0xf3}, {0x6d,0xbf,0xd2,0xd2}, -{0x4c,0x81,0xcd,0xcd}, {0x14,0x18,0x0c,0x0c}, {0x35,0x26,0x13,0x13}, {0x2f,0xc3,0xec,0xec}, -{0xe1,0xbe,0x5f,0x5f}, {0xa2,0x35,0x97,0x97}, {0xcc,0x88,0x44,0x44}, {0x39,0x2e,0x17,0x17}, -{0x57,0x93,0xc4,0xc4}, {0xf2,0x55,0xa7,0xa7}, {0x82,0xfc,0x7e,0x7e}, {0x47,0x7a,0x3d,0x3d}, -{0xac,0xc8,0x64,0x64}, {0xe7,0xba,0x5d,0x5d}, {0x2b,0x32,0x19,0x19}, {0x95,0xe6,0x73,0x73}, -{0xa0,0xc0,0x60,0x60}, {0x98,0x19,0x81,0x81}, {0xd1,0x9e,0x4f,0x4f}, {0x7f,0xa3,0xdc,0xdc}, -{0x66,0x44,0x22,0x22}, {0x7e,0x54,0x2a,0x2a}, {0xab,0x3b,0x90,0x90}, {0x83,0x0b,0x88,0x88}, -{0xca,0x8c,0x46,0x46}, {0x29,0xc7,0xee,0xee}, {0xd3,0x6b,0xb8,0xb8}, {0x3c,0x28,0x14,0x14}, -{0x79,0xa7,0xde,0xde}, {0xe2,0xbc,0x5e,0x5e}, {0x1d,0x16,0x0b,0x0b}, {0x76,0xad,0xdb,0xdb}, -{0x3b,0xdb,0xe0,0xe0}, {0x56,0x64,0x32,0x32}, {0x4e,0x74,0x3a,0x3a}, {0x1e,0x14,0x0a,0x0a}, -{0xdb,0x92,0x49,0x49}, {0x0a,0x0c,0x06,0x06}, {0x6c,0x48,0x24,0x24}, {0xe4,0xb8,0x5c,0x5c}, -{0x5d,0x9f,0xc2,0xc2}, {0x6e,0xbd,0xd3,0xd3}, {0xef,0x43,0xac,0xac}, {0xa6,0xc4,0x62,0x62}, -{0xa8,0x39,0x91,0x91}, {0xa4,0x31,0x95,0x95}, {0x37,0xd3,0xe4,0xe4}, {0x8b,0xf2,0x79,0x79}, -{0x32,0xd5,0xe7,0xe7}, {0x43,0x8b,0xc8,0xc8}, {0x59,0x6e,0x37,0x37}, {0xb7,0xda,0x6d,0x6d}, -{0x8c,0x01,0x8d,0x8d}, {0x64,0xb1,0xd5,0xd5}, {0xd2,0x9c,0x4e,0x4e}, {0xe0,0x49,0xa9,0xa9}, -{0xb4,0xd8,0x6c,0x6c}, {0xfa,0xac,0x56,0x56}, {0x07,0xf3,0xf4,0xf4}, {0x25,0xcf,0xea,0xea}, -{0xaf,0xca,0x65,0x65}, {0x8e,0xf4,0x7a,0x7a}, {0xe9,0x47,0xae,0xae}, {0x18,0x10,0x08,0x08}, -{0xd5,0x6f,0xba,0xba}, {0x88,0xf0,0x78,0x78}, {0x6f,0x4a,0x25,0x25}, {0x72,0x5c,0x2e,0x2e}, -{0x24,0x38,0x1c,0x1c}, {0xf1,0x57,0xa6,0xa6}, {0xc7,0x73,0xb4,0xb4}, {0x51,0x97,0xc6,0xc6}, -{0x23,0xcb,0xe8,0xe8}, {0x7c,0xa1,0xdd,0xdd}, {0x9c,0xe8,0x74,0x74}, {0x21,0x3e,0x1f,0x1f}, -{0xdd,0x96,0x4b,0x4b}, {0xdc,0x61,0xbd,0xbd}, {0x86,0x0d,0x8b,0x8b}, {0x85,0x0f,0x8a,0x8a}, -{0x90,0xe0,0x70,0x70}, {0x42,0x7c,0x3e,0x3e}, {0xc4,0x71,0xb5,0xb5}, {0xaa,0xcc,0x66,0x66}, -{0xd8,0x90,0x48,0x48}, {0x05,0x06,0x03,0x03}, {0x01,0xf7,0xf6,0xf6}, {0x12,0x1c,0x0e,0x0e}, -{0xa3,0xc2,0x61,0x61}, {0x5f,0x6a,0x35,0x35}, {0xf9,0xae,0x57,0x57}, {0xd0,0x69,0xb9,0xb9}, -{0x91,0x17,0x86,0x86}, {0x58,0x99,0xc1,0xc1}, {0x27,0x3a,0x1d,0x1d}, {0xb9,0x27,0x9e,0x9e}, -{0x38,0xd9,0xe1,0xe1}, {0x13,0xeb,0xf8,0xf8}, {0xb3,0x2b,0x98,0x98}, {0x33,0x22,0x11,0x11}, -{0xbb,0xd2,0x69,0x69}, {0x70,0xa9,0xd9,0xd9}, {0x89,0x07,0x8e,0x8e}, {0xa7,0x33,0x94,0x94}, -{0xb6,0x2d,0x9b,0x9b}, {0x22,0x3c,0x1e,0x1e}, {0x92,0x15,0x87,0x87}, {0x20,0xc9,0xe9,0xe9}, -{0x49,0x87,0xce,0xce}, {0xff,0xaa,0x55,0x55}, {0x78,0x50,0x28,0x28}, {0x7a,0xa5,0xdf,0xdf}, -{0x8f,0x03,0x8c,0x8c}, {0xf8,0x59,0xa1,0xa1}, {0x80,0x09,0x89,0x89}, {0x17,0x1a,0x0d,0x0d}, -{0xda,0x65,0xbf,0xbf}, {0x31,0xd7,0xe6,0xe6}, {0xc6,0x84,0x42,0x42}, {0xb8,0xd0,0x68,0x68}, -{0xc3,0x82,0x41,0x41}, {0xb0,0x29,0x99,0x99}, {0x77,0x5a,0x2d,0x2d}, {0x11,0x1e,0x0f,0x0f}, -{0xcb,0x7b,0xb0,0xb0}, {0xfc,0xa8,0x54,0x54}, {0xd6,0x6d,0xbb,0xbb}, {0x3a,0x2c,0x16,0x16} - } -}; -#define T2 xT2.xt8 - -static const union xtab xT3 = { - .xt8 = { -{0x63,0xa5,0xc6,0x63}, {0x7c,0x84,0xf8,0x7c}, {0x77,0x99,0xee,0x77}, {0x7b,0x8d,0xf6,0x7b}, -{0xf2,0x0d,0xff,0xf2}, 
{0x6b,0xbd,0xd6,0x6b}, {0x6f,0xb1,0xde,0x6f}, {0xc5,0x54,0x91,0xc5}, -{0x30,0x50,0x60,0x30}, {0x01,0x03,0x02,0x01}, {0x67,0xa9,0xce,0x67}, {0x2b,0x7d,0x56,0x2b}, -{0xfe,0x19,0xe7,0xfe}, {0xd7,0x62,0xb5,0xd7}, {0xab,0xe6,0x4d,0xab}, {0x76,0x9a,0xec,0x76}, -{0xca,0x45,0x8f,0xca}, {0x82,0x9d,0x1f,0x82}, {0xc9,0x40,0x89,0xc9}, {0x7d,0x87,0xfa,0x7d}, -{0xfa,0x15,0xef,0xfa}, {0x59,0xeb,0xb2,0x59}, {0x47,0xc9,0x8e,0x47}, {0xf0,0x0b,0xfb,0xf0}, -{0xad,0xec,0x41,0xad}, {0xd4,0x67,0xb3,0xd4}, {0xa2,0xfd,0x5f,0xa2}, {0xaf,0xea,0x45,0xaf}, -{0x9c,0xbf,0x23,0x9c}, {0xa4,0xf7,0x53,0xa4}, {0x72,0x96,0xe4,0x72}, {0xc0,0x5b,0x9b,0xc0}, -{0xb7,0xc2,0x75,0xb7}, {0xfd,0x1c,0xe1,0xfd}, {0x93,0xae,0x3d,0x93}, {0x26,0x6a,0x4c,0x26}, -{0x36,0x5a,0x6c,0x36}, {0x3f,0x41,0x7e,0x3f}, {0xf7,0x02,0xf5,0xf7}, {0xcc,0x4f,0x83,0xcc}, -{0x34,0x5c,0x68,0x34}, {0xa5,0xf4,0x51,0xa5}, {0xe5,0x34,0xd1,0xe5}, {0xf1,0x08,0xf9,0xf1}, -{0x71,0x93,0xe2,0x71}, {0xd8,0x73,0xab,0xd8}, {0x31,0x53,0x62,0x31}, {0x15,0x3f,0x2a,0x15}, -{0x04,0x0c,0x08,0x04}, {0xc7,0x52,0x95,0xc7}, {0x23,0x65,0x46,0x23}, {0xc3,0x5e,0x9d,0xc3}, -{0x18,0x28,0x30,0x18}, {0x96,0xa1,0x37,0x96}, {0x05,0x0f,0x0a,0x05}, {0x9a,0xb5,0x2f,0x9a}, -{0x07,0x09,0x0e,0x07}, {0x12,0x36,0x24,0x12}, {0x80,0x9b,0x1b,0x80}, {0xe2,0x3d,0xdf,0xe2}, -{0xeb,0x26,0xcd,0xeb}, {0x27,0x69,0x4e,0x27}, {0xb2,0xcd,0x7f,0xb2}, {0x75,0x9f,0xea,0x75}, -{0x09,0x1b,0x12,0x09}, {0x83,0x9e,0x1d,0x83}, {0x2c,0x74,0x58,0x2c}, {0x1a,0x2e,0x34,0x1a}, -{0x1b,0x2d,0x36,0x1b}, {0x6e,0xb2,0xdc,0x6e}, {0x5a,0xee,0xb4,0x5a}, {0xa0,0xfb,0x5b,0xa0}, -{0x52,0xf6,0xa4,0x52}, {0x3b,0x4d,0x76,0x3b}, {0xd6,0x61,0xb7,0xd6}, {0xb3,0xce,0x7d,0xb3}, -{0x29,0x7b,0x52,0x29}, {0xe3,0x3e,0xdd,0xe3}, {0x2f,0x71,0x5e,0x2f}, {0x84,0x97,0x13,0x84}, -{0x53,0xf5,0xa6,0x53}, {0xd1,0x68,0xb9,0xd1}, {0x00,0x00,0x00,0x00}, {0xed,0x2c,0xc1,0xed}, -{0x20,0x60,0x40,0x20}, {0xfc,0x1f,0xe3,0xfc}, {0xb1,0xc8,0x79,0xb1}, {0x5b,0xed,0xb6,0x5b}, -{0x6a,0xbe,0xd4,0x6a}, {0xcb,0x46,0x8d,0xcb}, {0xbe,0xd9,0x67,0xbe}, {0x39,0x4b,0x72,0x39}, -{0x4a,0xde,0x94,0x4a}, {0x4c,0xd4,0x98,0x4c}, {0x58,0xe8,0xb0,0x58}, {0xcf,0x4a,0x85,0xcf}, -{0xd0,0x6b,0xbb,0xd0}, {0xef,0x2a,0xc5,0xef}, {0xaa,0xe5,0x4f,0xaa}, {0xfb,0x16,0xed,0xfb}, -{0x43,0xc5,0x86,0x43}, {0x4d,0xd7,0x9a,0x4d}, {0x33,0x55,0x66,0x33}, {0x85,0x94,0x11,0x85}, -{0x45,0xcf,0x8a,0x45}, {0xf9,0x10,0xe9,0xf9}, {0x02,0x06,0x04,0x02}, {0x7f,0x81,0xfe,0x7f}, -{0x50,0xf0,0xa0,0x50}, {0x3c,0x44,0x78,0x3c}, {0x9f,0xba,0x25,0x9f}, {0xa8,0xe3,0x4b,0xa8}, -{0x51,0xf3,0xa2,0x51}, {0xa3,0xfe,0x5d,0xa3}, {0x40,0xc0,0x80,0x40}, {0x8f,0x8a,0x05,0x8f}, -{0x92,0xad,0x3f,0x92}, {0x9d,0xbc,0x21,0x9d}, {0x38,0x48,0x70,0x38}, {0xf5,0x04,0xf1,0xf5}, -{0xbc,0xdf,0x63,0xbc}, {0xb6,0xc1,0x77,0xb6}, {0xda,0x75,0xaf,0xda}, {0x21,0x63,0x42,0x21}, -{0x10,0x30,0x20,0x10}, {0xff,0x1a,0xe5,0xff}, {0xf3,0x0e,0xfd,0xf3}, {0xd2,0x6d,0xbf,0xd2}, -{0xcd,0x4c,0x81,0xcd}, {0x0c,0x14,0x18,0x0c}, {0x13,0x35,0x26,0x13}, {0xec,0x2f,0xc3,0xec}, -{0x5f,0xe1,0xbe,0x5f}, {0x97,0xa2,0x35,0x97}, {0x44,0xcc,0x88,0x44}, {0x17,0x39,0x2e,0x17}, -{0xc4,0x57,0x93,0xc4}, {0xa7,0xf2,0x55,0xa7}, {0x7e,0x82,0xfc,0x7e}, {0x3d,0x47,0x7a,0x3d}, -{0x64,0xac,0xc8,0x64}, {0x5d,0xe7,0xba,0x5d}, {0x19,0x2b,0x32,0x19}, {0x73,0x95,0xe6,0x73}, -{0x60,0xa0,0xc0,0x60}, {0x81,0x98,0x19,0x81}, {0x4f,0xd1,0x9e,0x4f}, {0xdc,0x7f,0xa3,0xdc}, -{0x22,0x66,0x44,0x22}, {0x2a,0x7e,0x54,0x2a}, {0x90,0xab,0x3b,0x90}, {0x88,0x83,0x0b,0x88}, -{0x46,0xca,0x8c,0x46}, {0xee,0x29,0xc7,0xee}, {0xb8,0xd3,0x6b,0xb8}, {0x14,0x3c,0x28,0x14}, -{0xde,0x79,0xa7,0xde}, 
{0x5e,0xe2,0xbc,0x5e}, {0x0b,0x1d,0x16,0x0b}, {0xdb,0x76,0xad,0xdb}, -{0xe0,0x3b,0xdb,0xe0}, {0x32,0x56,0x64,0x32}, {0x3a,0x4e,0x74,0x3a}, {0x0a,0x1e,0x14,0x0a}, -{0x49,0xdb,0x92,0x49}, {0x06,0x0a,0x0c,0x06}, {0x24,0x6c,0x48,0x24}, {0x5c,0xe4,0xb8,0x5c}, -{0xc2,0x5d,0x9f,0xc2}, {0xd3,0x6e,0xbd,0xd3}, {0xac,0xef,0x43,0xac}, {0x62,0xa6,0xc4,0x62}, -{0x91,0xa8,0x39,0x91}, {0x95,0xa4,0x31,0x95}, {0xe4,0x37,0xd3,0xe4}, {0x79,0x8b,0xf2,0x79}, -{0xe7,0x32,0xd5,0xe7}, {0xc8,0x43,0x8b,0xc8}, {0x37,0x59,0x6e,0x37}, {0x6d,0xb7,0xda,0x6d}, -{0x8d,0x8c,0x01,0x8d}, {0xd5,0x64,0xb1,0xd5}, {0x4e,0xd2,0x9c,0x4e}, {0xa9,0xe0,0x49,0xa9}, -{0x6c,0xb4,0xd8,0x6c}, {0x56,0xfa,0xac,0x56}, {0xf4,0x07,0xf3,0xf4}, {0xea,0x25,0xcf,0xea}, -{0x65,0xaf,0xca,0x65}, {0x7a,0x8e,0xf4,0x7a}, {0xae,0xe9,0x47,0xae}, {0x08,0x18,0x10,0x08}, -{0xba,0xd5,0x6f,0xba}, {0x78,0x88,0xf0,0x78}, {0x25,0x6f,0x4a,0x25}, {0x2e,0x72,0x5c,0x2e}, -{0x1c,0x24,0x38,0x1c}, {0xa6,0xf1,0x57,0xa6}, {0xb4,0xc7,0x73,0xb4}, {0xc6,0x51,0x97,0xc6}, -{0xe8,0x23,0xcb,0xe8}, {0xdd,0x7c,0xa1,0xdd}, {0x74,0x9c,0xe8,0x74}, {0x1f,0x21,0x3e,0x1f}, -{0x4b,0xdd,0x96,0x4b}, {0xbd,0xdc,0x61,0xbd}, {0x8b,0x86,0x0d,0x8b}, {0x8a,0x85,0x0f,0x8a}, -{0x70,0x90,0xe0,0x70}, {0x3e,0x42,0x7c,0x3e}, {0xb5,0xc4,0x71,0xb5}, {0x66,0xaa,0xcc,0x66}, -{0x48,0xd8,0x90,0x48}, {0x03,0x05,0x06,0x03}, {0xf6,0x01,0xf7,0xf6}, {0x0e,0x12,0x1c,0x0e}, -{0x61,0xa3,0xc2,0x61}, {0x35,0x5f,0x6a,0x35}, {0x57,0xf9,0xae,0x57}, {0xb9,0xd0,0x69,0xb9}, -{0x86,0x91,0x17,0x86}, {0xc1,0x58,0x99,0xc1}, {0x1d,0x27,0x3a,0x1d}, {0x9e,0xb9,0x27,0x9e}, -{0xe1,0x38,0xd9,0xe1}, {0xf8,0x13,0xeb,0xf8}, {0x98,0xb3,0x2b,0x98}, {0x11,0x33,0x22,0x11}, -{0x69,0xbb,0xd2,0x69}, {0xd9,0x70,0xa9,0xd9}, {0x8e,0x89,0x07,0x8e}, {0x94,0xa7,0x33,0x94}, -{0x9b,0xb6,0x2d,0x9b}, {0x1e,0x22,0x3c,0x1e}, {0x87,0x92,0x15,0x87}, {0xe9,0x20,0xc9,0xe9}, -{0xce,0x49,0x87,0xce}, {0x55,0xff,0xaa,0x55}, {0x28,0x78,0x50,0x28}, {0xdf,0x7a,0xa5,0xdf}, -{0x8c,0x8f,0x03,0x8c}, {0xa1,0xf8,0x59,0xa1}, {0x89,0x80,0x09,0x89}, {0x0d,0x17,0x1a,0x0d}, -{0xbf,0xda,0x65,0xbf}, {0xe6,0x31,0xd7,0xe6}, {0x42,0xc6,0x84,0x42}, {0x68,0xb8,0xd0,0x68}, -{0x41,0xc3,0x82,0x41}, {0x99,0xb0,0x29,0x99}, {0x2d,0x77,0x5a,0x2d}, {0x0f,0x11,0x1e,0x0f}, -{0xb0,0xcb,0x7b,0xb0}, {0x54,0xfc,0xa8,0x54}, {0xbb,0xd6,0x6d,0xbb}, {0x16,0x3a,0x2c,0x16} - } -}; -#define T3 xT3.xt8 - -static const union xtab xT4 = { - .xt8 = { -{0x63,0x63,0xa5,0xc6}, {0x7c,0x7c,0x84,0xf8}, {0x77,0x77,0x99,0xee}, {0x7b,0x7b,0x8d,0xf6}, -{0xf2,0xf2,0x0d,0xff}, {0x6b,0x6b,0xbd,0xd6}, {0x6f,0x6f,0xb1,0xde}, {0xc5,0xc5,0x54,0x91}, -{0x30,0x30,0x50,0x60}, {0x01,0x01,0x03,0x02}, {0x67,0x67,0xa9,0xce}, {0x2b,0x2b,0x7d,0x56}, -{0xfe,0xfe,0x19,0xe7}, {0xd7,0xd7,0x62,0xb5}, {0xab,0xab,0xe6,0x4d}, {0x76,0x76,0x9a,0xec}, -{0xca,0xca,0x45,0x8f}, {0x82,0x82,0x9d,0x1f}, {0xc9,0xc9,0x40,0x89}, {0x7d,0x7d,0x87,0xfa}, -{0xfa,0xfa,0x15,0xef}, {0x59,0x59,0xeb,0xb2}, {0x47,0x47,0xc9,0x8e}, {0xf0,0xf0,0x0b,0xfb}, -{0xad,0xad,0xec,0x41}, {0xd4,0xd4,0x67,0xb3}, {0xa2,0xa2,0xfd,0x5f}, {0xaf,0xaf,0xea,0x45}, -{0x9c,0x9c,0xbf,0x23}, {0xa4,0xa4,0xf7,0x53}, {0x72,0x72,0x96,0xe4}, {0xc0,0xc0,0x5b,0x9b}, -{0xb7,0xb7,0xc2,0x75}, {0xfd,0xfd,0x1c,0xe1}, {0x93,0x93,0xae,0x3d}, {0x26,0x26,0x6a,0x4c}, -{0x36,0x36,0x5a,0x6c}, {0x3f,0x3f,0x41,0x7e}, {0xf7,0xf7,0x02,0xf5}, {0xcc,0xcc,0x4f,0x83}, -{0x34,0x34,0x5c,0x68}, {0xa5,0xa5,0xf4,0x51}, {0xe5,0xe5,0x34,0xd1}, {0xf1,0xf1,0x08,0xf9}, -{0x71,0x71,0x93,0xe2}, {0xd8,0xd8,0x73,0xab}, {0x31,0x31,0x53,0x62}, {0x15,0x15,0x3f,0x2a}, -{0x04,0x04,0x0c,0x08}, {0xc7,0xc7,0x52,0x95}, 
{0x23,0x23,0x65,0x46}, {0xc3,0xc3,0x5e,0x9d}, -{0x18,0x18,0x28,0x30}, {0x96,0x96,0xa1,0x37}, {0x05,0x05,0x0f,0x0a}, {0x9a,0x9a,0xb5,0x2f}, -{0x07,0x07,0x09,0x0e}, {0x12,0x12,0x36,0x24}, {0x80,0x80,0x9b,0x1b}, {0xe2,0xe2,0x3d,0xdf}, -{0xeb,0xeb,0x26,0xcd}, {0x27,0x27,0x69,0x4e}, {0xb2,0xb2,0xcd,0x7f}, {0x75,0x75,0x9f,0xea}, -{0x09,0x09,0x1b,0x12}, {0x83,0x83,0x9e,0x1d}, {0x2c,0x2c,0x74,0x58}, {0x1a,0x1a,0x2e,0x34}, -{0x1b,0x1b,0x2d,0x36}, {0x6e,0x6e,0xb2,0xdc}, {0x5a,0x5a,0xee,0xb4}, {0xa0,0xa0,0xfb,0x5b}, -{0x52,0x52,0xf6,0xa4}, {0x3b,0x3b,0x4d,0x76}, {0xd6,0xd6,0x61,0xb7}, {0xb3,0xb3,0xce,0x7d}, -{0x29,0x29,0x7b,0x52}, {0xe3,0xe3,0x3e,0xdd}, {0x2f,0x2f,0x71,0x5e}, {0x84,0x84,0x97,0x13}, -{0x53,0x53,0xf5,0xa6}, {0xd1,0xd1,0x68,0xb9}, {0x00,0x00,0x00,0x00}, {0xed,0xed,0x2c,0xc1}, -{0x20,0x20,0x60,0x40}, {0xfc,0xfc,0x1f,0xe3}, {0xb1,0xb1,0xc8,0x79}, {0x5b,0x5b,0xed,0xb6}, -{0x6a,0x6a,0xbe,0xd4}, {0xcb,0xcb,0x46,0x8d}, {0xbe,0xbe,0xd9,0x67}, {0x39,0x39,0x4b,0x72}, -{0x4a,0x4a,0xde,0x94}, {0x4c,0x4c,0xd4,0x98}, {0x58,0x58,0xe8,0xb0}, {0xcf,0xcf,0x4a,0x85}, -{0xd0,0xd0,0x6b,0xbb}, {0xef,0xef,0x2a,0xc5}, {0xaa,0xaa,0xe5,0x4f}, {0xfb,0xfb,0x16,0xed}, -{0x43,0x43,0xc5,0x86}, {0x4d,0x4d,0xd7,0x9a}, {0x33,0x33,0x55,0x66}, {0x85,0x85,0x94,0x11}, -{0x45,0x45,0xcf,0x8a}, {0xf9,0xf9,0x10,0xe9}, {0x02,0x02,0x06,0x04}, {0x7f,0x7f,0x81,0xfe}, -{0x50,0x50,0xf0,0xa0}, {0x3c,0x3c,0x44,0x78}, {0x9f,0x9f,0xba,0x25}, {0xa8,0xa8,0xe3,0x4b}, -{0x51,0x51,0xf3,0xa2}, {0xa3,0xa3,0xfe,0x5d}, {0x40,0x40,0xc0,0x80}, {0x8f,0x8f,0x8a,0x05}, -{0x92,0x92,0xad,0x3f}, {0x9d,0x9d,0xbc,0x21}, {0x38,0x38,0x48,0x70}, {0xf5,0xf5,0x04,0xf1}, -{0xbc,0xbc,0xdf,0x63}, {0xb6,0xb6,0xc1,0x77}, {0xda,0xda,0x75,0xaf}, {0x21,0x21,0x63,0x42}, -{0x10,0x10,0x30,0x20}, {0xff,0xff,0x1a,0xe5}, {0xf3,0xf3,0x0e,0xfd}, {0xd2,0xd2,0x6d,0xbf}, -{0xcd,0xcd,0x4c,0x81}, {0x0c,0x0c,0x14,0x18}, {0x13,0x13,0x35,0x26}, {0xec,0xec,0x2f,0xc3}, -{0x5f,0x5f,0xe1,0xbe}, {0x97,0x97,0xa2,0x35}, {0x44,0x44,0xcc,0x88}, {0x17,0x17,0x39,0x2e}, -{0xc4,0xc4,0x57,0x93}, {0xa7,0xa7,0xf2,0x55}, {0x7e,0x7e,0x82,0xfc}, {0x3d,0x3d,0x47,0x7a}, -{0x64,0x64,0xac,0xc8}, {0x5d,0x5d,0xe7,0xba}, {0x19,0x19,0x2b,0x32}, {0x73,0x73,0x95,0xe6}, -{0x60,0x60,0xa0,0xc0}, {0x81,0x81,0x98,0x19}, {0x4f,0x4f,0xd1,0x9e}, {0xdc,0xdc,0x7f,0xa3}, -{0x22,0x22,0x66,0x44}, {0x2a,0x2a,0x7e,0x54}, {0x90,0x90,0xab,0x3b}, {0x88,0x88,0x83,0x0b}, -{0x46,0x46,0xca,0x8c}, {0xee,0xee,0x29,0xc7}, {0xb8,0xb8,0xd3,0x6b}, {0x14,0x14,0x3c,0x28}, -{0xde,0xde,0x79,0xa7}, {0x5e,0x5e,0xe2,0xbc}, {0x0b,0x0b,0x1d,0x16}, {0xdb,0xdb,0x76,0xad}, -{0xe0,0xe0,0x3b,0xdb}, {0x32,0x32,0x56,0x64}, {0x3a,0x3a,0x4e,0x74}, {0x0a,0x0a,0x1e,0x14}, -{0x49,0x49,0xdb,0x92}, {0x06,0x06,0x0a,0x0c}, {0x24,0x24,0x6c,0x48}, {0x5c,0x5c,0xe4,0xb8}, -{0xc2,0xc2,0x5d,0x9f}, {0xd3,0xd3,0x6e,0xbd}, {0xac,0xac,0xef,0x43}, {0x62,0x62,0xa6,0xc4}, -{0x91,0x91,0xa8,0x39}, {0x95,0x95,0xa4,0x31}, {0xe4,0xe4,0x37,0xd3}, {0x79,0x79,0x8b,0xf2}, -{0xe7,0xe7,0x32,0xd5}, {0xc8,0xc8,0x43,0x8b}, {0x37,0x37,0x59,0x6e}, {0x6d,0x6d,0xb7,0xda}, -{0x8d,0x8d,0x8c,0x01}, {0xd5,0xd5,0x64,0xb1}, {0x4e,0x4e,0xd2,0x9c}, {0xa9,0xa9,0xe0,0x49}, -{0x6c,0x6c,0xb4,0xd8}, {0x56,0x56,0xfa,0xac}, {0xf4,0xf4,0x07,0xf3}, {0xea,0xea,0x25,0xcf}, -{0x65,0x65,0xaf,0xca}, {0x7a,0x7a,0x8e,0xf4}, {0xae,0xae,0xe9,0x47}, {0x08,0x08,0x18,0x10}, -{0xba,0xba,0xd5,0x6f}, {0x78,0x78,0x88,0xf0}, {0x25,0x25,0x6f,0x4a}, {0x2e,0x2e,0x72,0x5c}, -{0x1c,0x1c,0x24,0x38}, {0xa6,0xa6,0xf1,0x57}, {0xb4,0xb4,0xc7,0x73}, {0xc6,0xc6,0x51,0x97}, -{0xe8,0xe8,0x23,0xcb}, {0xdd,0xdd,0x7c,0xa1}, 
{0x74,0x74,0x9c,0xe8}, {0x1f,0x1f,0x21,0x3e}, -{0x4b,0x4b,0xdd,0x96}, {0xbd,0xbd,0xdc,0x61}, {0x8b,0x8b,0x86,0x0d}, {0x8a,0x8a,0x85,0x0f}, -{0x70,0x70,0x90,0xe0}, {0x3e,0x3e,0x42,0x7c}, {0xb5,0xb5,0xc4,0x71}, {0x66,0x66,0xaa,0xcc}, -{0x48,0x48,0xd8,0x90}, {0x03,0x03,0x05,0x06}, {0xf6,0xf6,0x01,0xf7}, {0x0e,0x0e,0x12,0x1c}, -{0x61,0x61,0xa3,0xc2}, {0x35,0x35,0x5f,0x6a}, {0x57,0x57,0xf9,0xae}, {0xb9,0xb9,0xd0,0x69}, -{0x86,0x86,0x91,0x17}, {0xc1,0xc1,0x58,0x99}, {0x1d,0x1d,0x27,0x3a}, {0x9e,0x9e,0xb9,0x27}, -{0xe1,0xe1,0x38,0xd9}, {0xf8,0xf8,0x13,0xeb}, {0x98,0x98,0xb3,0x2b}, {0x11,0x11,0x33,0x22}, -{0x69,0x69,0xbb,0xd2}, {0xd9,0xd9,0x70,0xa9}, {0x8e,0x8e,0x89,0x07}, {0x94,0x94,0xa7,0x33}, -{0x9b,0x9b,0xb6,0x2d}, {0x1e,0x1e,0x22,0x3c}, {0x87,0x87,0x92,0x15}, {0xe9,0xe9,0x20,0xc9}, -{0xce,0xce,0x49,0x87}, {0x55,0x55,0xff,0xaa}, {0x28,0x28,0x78,0x50}, {0xdf,0xdf,0x7a,0xa5}, -{0x8c,0x8c,0x8f,0x03}, {0xa1,0xa1,0xf8,0x59}, {0x89,0x89,0x80,0x09}, {0x0d,0x0d,0x17,0x1a}, -{0xbf,0xbf,0xda,0x65}, {0xe6,0xe6,0x31,0xd7}, {0x42,0x42,0xc6,0x84}, {0x68,0x68,0xb8,0xd0}, -{0x41,0x41,0xc3,0x82}, {0x99,0x99,0xb0,0x29}, {0x2d,0x2d,0x77,0x5a}, {0x0f,0x0f,0x11,0x1e}, -{0xb0,0xb0,0xcb,0x7b}, {0x54,0x54,0xfc,0xa8}, {0xbb,0xbb,0xd6,0x6d}, {0x16,0x16,0x3a,0x2c} - } -}; -#define T4 xT4.xt8 - -static const union xtab xT5 = { - .xt8 = { -{0x51,0xf4,0xa7,0x50}, {0x7e,0x41,0x65,0x53}, {0x1a,0x17,0xa4,0xc3}, {0x3a,0x27,0x5e,0x96}, -{0x3b,0xab,0x6b,0xcb}, {0x1f,0x9d,0x45,0xf1}, {0xac,0xfa,0x58,0xab}, {0x4b,0xe3,0x03,0x93}, -{0x20,0x30,0xfa,0x55}, {0xad,0x76,0x6d,0xf6}, {0x88,0xcc,0x76,0x91}, {0xf5,0x02,0x4c,0x25}, -{0x4f,0xe5,0xd7,0xfc}, {0xc5,0x2a,0xcb,0xd7}, {0x26,0x35,0x44,0x80}, {0xb5,0x62,0xa3,0x8f}, -{0xde,0xb1,0x5a,0x49}, {0x25,0xba,0x1b,0x67}, {0x45,0xea,0x0e,0x98}, {0x5d,0xfe,0xc0,0xe1}, -{0xc3,0x2f,0x75,0x02}, {0x81,0x4c,0xf0,0x12}, {0x8d,0x46,0x97,0xa3}, {0x6b,0xd3,0xf9,0xc6}, -{0x03,0x8f,0x5f,0xe7}, {0x15,0x92,0x9c,0x95}, {0xbf,0x6d,0x7a,0xeb}, {0x95,0x52,0x59,0xda}, -{0xd4,0xbe,0x83,0x2d}, {0x58,0x74,0x21,0xd3}, {0x49,0xe0,0x69,0x29}, {0x8e,0xc9,0xc8,0x44}, -{0x75,0xc2,0x89,0x6a}, {0xf4,0x8e,0x79,0x78}, {0x99,0x58,0x3e,0x6b}, {0x27,0xb9,0x71,0xdd}, -{0xbe,0xe1,0x4f,0xb6}, {0xf0,0x88,0xad,0x17}, {0xc9,0x20,0xac,0x66}, {0x7d,0xce,0x3a,0xb4}, -{0x63,0xdf,0x4a,0x18}, {0xe5,0x1a,0x31,0x82}, {0x97,0x51,0x33,0x60}, {0x62,0x53,0x7f,0x45}, -{0xb1,0x64,0x77,0xe0}, {0xbb,0x6b,0xae,0x84}, {0xfe,0x81,0xa0,0x1c}, {0xf9,0x08,0x2b,0x94}, -{0x70,0x48,0x68,0x58}, {0x8f,0x45,0xfd,0x19}, {0x94,0xde,0x6c,0x87}, {0x52,0x7b,0xf8,0xb7}, -{0xab,0x73,0xd3,0x23}, {0x72,0x4b,0x02,0xe2}, {0xe3,0x1f,0x8f,0x57}, {0x66,0x55,0xab,0x2a}, -{0xb2,0xeb,0x28,0x07}, {0x2f,0xb5,0xc2,0x03}, {0x86,0xc5,0x7b,0x9a}, {0xd3,0x37,0x08,0xa5}, -{0x30,0x28,0x87,0xf2}, {0x23,0xbf,0xa5,0xb2}, {0x02,0x03,0x6a,0xba}, {0xed,0x16,0x82,0x5c}, -{0x8a,0xcf,0x1c,0x2b}, {0xa7,0x79,0xb4,0x92}, {0xf3,0x07,0xf2,0xf0}, {0x4e,0x69,0xe2,0xa1}, -{0x65,0xda,0xf4,0xcd}, {0x06,0x05,0xbe,0xd5}, {0xd1,0x34,0x62,0x1f}, {0xc4,0xa6,0xfe,0x8a}, -{0x34,0x2e,0x53,0x9d}, {0xa2,0xf3,0x55,0xa0}, {0x05,0x8a,0xe1,0x32}, {0xa4,0xf6,0xeb,0x75}, -{0x0b,0x83,0xec,0x39}, {0x40,0x60,0xef,0xaa}, {0x5e,0x71,0x9f,0x06}, {0xbd,0x6e,0x10,0x51}, -{0x3e,0x21,0x8a,0xf9}, {0x96,0xdd,0x06,0x3d}, {0xdd,0x3e,0x05,0xae}, {0x4d,0xe6,0xbd,0x46}, -{0x91,0x54,0x8d,0xb5}, {0x71,0xc4,0x5d,0x05}, {0x04,0x06,0xd4,0x6f}, {0x60,0x50,0x15,0xff}, -{0x19,0x98,0xfb,0x24}, {0xd6,0xbd,0xe9,0x97}, {0x89,0x40,0x43,0xcc}, {0x67,0xd9,0x9e,0x77}, -{0xb0,0xe8,0x42,0xbd}, {0x07,0x89,0x8b,0x88}, {0xe7,0x19,0x5b,0x38}, 
{0x79,0xc8,0xee,0xdb}, -{0xa1,0x7c,0x0a,0x47}, {0x7c,0x42,0x0f,0xe9}, {0xf8,0x84,0x1e,0xc9}, {0x00,0x00,0x00,0x00}, -{0x09,0x80,0x86,0x83}, {0x32,0x2b,0xed,0x48}, {0x1e,0x11,0x70,0xac}, {0x6c,0x5a,0x72,0x4e}, -{0xfd,0x0e,0xff,0xfb}, {0x0f,0x85,0x38,0x56}, {0x3d,0xae,0xd5,0x1e}, {0x36,0x2d,0x39,0x27}, -{0x0a,0x0f,0xd9,0x64}, {0x68,0x5c,0xa6,0x21}, {0x9b,0x5b,0x54,0xd1}, {0x24,0x36,0x2e,0x3a}, -{0x0c,0x0a,0x67,0xb1}, {0x93,0x57,0xe7,0x0f}, {0xb4,0xee,0x96,0xd2}, {0x1b,0x9b,0x91,0x9e}, -{0x80,0xc0,0xc5,0x4f}, {0x61,0xdc,0x20,0xa2}, {0x5a,0x77,0x4b,0x69}, {0x1c,0x12,0x1a,0x16}, -{0xe2,0x93,0xba,0x0a}, {0xc0,0xa0,0x2a,0xe5}, {0x3c,0x22,0xe0,0x43}, {0x12,0x1b,0x17,0x1d}, -{0x0e,0x09,0x0d,0x0b}, {0xf2,0x8b,0xc7,0xad}, {0x2d,0xb6,0xa8,0xb9}, {0x14,0x1e,0xa9,0xc8}, -{0x57,0xf1,0x19,0x85}, {0xaf,0x75,0x07,0x4c}, {0xee,0x99,0xdd,0xbb}, {0xa3,0x7f,0x60,0xfd}, -{0xf7,0x01,0x26,0x9f}, {0x5c,0x72,0xf5,0xbc}, {0x44,0x66,0x3b,0xc5}, {0x5b,0xfb,0x7e,0x34}, -{0x8b,0x43,0x29,0x76}, {0xcb,0x23,0xc6,0xdc}, {0xb6,0xed,0xfc,0x68}, {0xb8,0xe4,0xf1,0x63}, -{0xd7,0x31,0xdc,0xca}, {0x42,0x63,0x85,0x10}, {0x13,0x97,0x22,0x40}, {0x84,0xc6,0x11,0x20}, -{0x85,0x4a,0x24,0x7d}, {0xd2,0xbb,0x3d,0xf8}, {0xae,0xf9,0x32,0x11}, {0xc7,0x29,0xa1,0x6d}, -{0x1d,0x9e,0x2f,0x4b}, {0xdc,0xb2,0x30,0xf3}, {0x0d,0x86,0x52,0xec}, {0x77,0xc1,0xe3,0xd0}, -{0x2b,0xb3,0x16,0x6c}, {0xa9,0x70,0xb9,0x99}, {0x11,0x94,0x48,0xfa}, {0x47,0xe9,0x64,0x22}, -{0xa8,0xfc,0x8c,0xc4}, {0xa0,0xf0,0x3f,0x1a}, {0x56,0x7d,0x2c,0xd8}, {0x22,0x33,0x90,0xef}, -{0x87,0x49,0x4e,0xc7}, {0xd9,0x38,0xd1,0xc1}, {0x8c,0xca,0xa2,0xfe}, {0x98,0xd4,0x0b,0x36}, -{0xa6,0xf5,0x81,0xcf}, {0xa5,0x7a,0xde,0x28}, {0xda,0xb7,0x8e,0x26}, {0x3f,0xad,0xbf,0xa4}, -{0x2c,0x3a,0x9d,0xe4}, {0x50,0x78,0x92,0x0d}, {0x6a,0x5f,0xcc,0x9b}, {0x54,0x7e,0x46,0x62}, -{0xf6,0x8d,0x13,0xc2}, {0x90,0xd8,0xb8,0xe8}, {0x2e,0x39,0xf7,0x5e}, {0x82,0xc3,0xaf,0xf5}, -{0x9f,0x5d,0x80,0xbe}, {0x69,0xd0,0x93,0x7c}, {0x6f,0xd5,0x2d,0xa9}, {0xcf,0x25,0x12,0xb3}, -{0xc8,0xac,0x99,0x3b}, {0x10,0x18,0x7d,0xa7}, {0xe8,0x9c,0x63,0x6e}, {0xdb,0x3b,0xbb,0x7b}, -{0xcd,0x26,0x78,0x09}, {0x6e,0x59,0x18,0xf4}, {0xec,0x9a,0xb7,0x01}, {0x83,0x4f,0x9a,0xa8}, -{0xe6,0x95,0x6e,0x65}, {0xaa,0xff,0xe6,0x7e}, {0x21,0xbc,0xcf,0x08}, {0xef,0x15,0xe8,0xe6}, -{0xba,0xe7,0x9b,0xd9}, {0x4a,0x6f,0x36,0xce}, {0xea,0x9f,0x09,0xd4}, {0x29,0xb0,0x7c,0xd6}, -{0x31,0xa4,0xb2,0xaf}, {0x2a,0x3f,0x23,0x31}, {0xc6,0xa5,0x94,0x30}, {0x35,0xa2,0x66,0xc0}, -{0x74,0x4e,0xbc,0x37}, {0xfc,0x82,0xca,0xa6}, {0xe0,0x90,0xd0,0xb0}, {0x33,0xa7,0xd8,0x15}, -{0xf1,0x04,0x98,0x4a}, {0x41,0xec,0xda,0xf7}, {0x7f,0xcd,0x50,0x0e}, {0x17,0x91,0xf6,0x2f}, -{0x76,0x4d,0xd6,0x8d}, {0x43,0xef,0xb0,0x4d}, {0xcc,0xaa,0x4d,0x54}, {0xe4,0x96,0x04,0xdf}, -{0x9e,0xd1,0xb5,0xe3}, {0x4c,0x6a,0x88,0x1b}, {0xc1,0x2c,0x1f,0xb8}, {0x46,0x65,0x51,0x7f}, -{0x9d,0x5e,0xea,0x04}, {0x01,0x8c,0x35,0x5d}, {0xfa,0x87,0x74,0x73}, {0xfb,0x0b,0x41,0x2e}, -{0xb3,0x67,0x1d,0x5a}, {0x92,0xdb,0xd2,0x52}, {0xe9,0x10,0x56,0x33}, {0x6d,0xd6,0x47,0x13}, -{0x9a,0xd7,0x61,0x8c}, {0x37,0xa1,0x0c,0x7a}, {0x59,0xf8,0x14,0x8e}, {0xeb,0x13,0x3c,0x89}, -{0xce,0xa9,0x27,0xee}, {0xb7,0x61,0xc9,0x35}, {0xe1,0x1c,0xe5,0xed}, {0x7a,0x47,0xb1,0x3c}, -{0x9c,0xd2,0xdf,0x59}, {0x55,0xf2,0x73,0x3f}, {0x18,0x14,0xce,0x79}, {0x73,0xc7,0x37,0xbf}, -{0x53,0xf7,0xcd,0xea}, {0x5f,0xfd,0xaa,0x5b}, {0xdf,0x3d,0x6f,0x14}, {0x78,0x44,0xdb,0x86}, -{0xca,0xaf,0xf3,0x81}, {0xb9,0x68,0xc4,0x3e}, {0x38,0x24,0x34,0x2c}, {0xc2,0xa3,0x40,0x5f}, -{0x16,0x1d,0xc3,0x72}, {0xbc,0xe2,0x25,0x0c}, {0x28,0x3c,0x49,0x8b}, 
{0xff,0x0d,0x95,0x41}, -{0x39,0xa8,0x01,0x71}, {0x08,0x0c,0xb3,0xde}, {0xd8,0xb4,0xe4,0x9c}, {0x64,0x56,0xc1,0x90}, -{0x7b,0xcb,0x84,0x61}, {0xd5,0x32,0xb6,0x70}, {0x48,0x6c,0x5c,0x74}, {0xd0,0xb8,0x57,0x42} - } -}; -#define T5 xT5.xt8 - -static const union xtab xT6 = { - .xt8 = { -{0x50,0x51,0xf4,0xa7}, {0x53,0x7e,0x41,0x65}, {0xc3,0x1a,0x17,0xa4}, {0x96,0x3a,0x27,0x5e}, -{0xcb,0x3b,0xab,0x6b}, {0xf1,0x1f,0x9d,0x45}, {0xab,0xac,0xfa,0x58}, {0x93,0x4b,0xe3,0x03}, -{0x55,0x20,0x30,0xfa}, {0xf6,0xad,0x76,0x6d}, {0x91,0x88,0xcc,0x76}, {0x25,0xf5,0x02,0x4c}, -{0xfc,0x4f,0xe5,0xd7}, {0xd7,0xc5,0x2a,0xcb}, {0x80,0x26,0x35,0x44}, {0x8f,0xb5,0x62,0xa3}, -{0x49,0xde,0xb1,0x5a}, {0x67,0x25,0xba,0x1b}, {0x98,0x45,0xea,0x0e}, {0xe1,0x5d,0xfe,0xc0}, -{0x02,0xc3,0x2f,0x75}, {0x12,0x81,0x4c,0xf0}, {0xa3,0x8d,0x46,0x97}, {0xc6,0x6b,0xd3,0xf9}, -{0xe7,0x03,0x8f,0x5f}, {0x95,0x15,0x92,0x9c}, {0xeb,0xbf,0x6d,0x7a}, {0xda,0x95,0x52,0x59}, -{0x2d,0xd4,0xbe,0x83}, {0xd3,0x58,0x74,0x21}, {0x29,0x49,0xe0,0x69}, {0x44,0x8e,0xc9,0xc8}, -{0x6a,0x75,0xc2,0x89}, {0x78,0xf4,0x8e,0x79}, {0x6b,0x99,0x58,0x3e}, {0xdd,0x27,0xb9,0x71}, -{0xb6,0xbe,0xe1,0x4f}, {0x17,0xf0,0x88,0xad}, {0x66,0xc9,0x20,0xac}, {0xb4,0x7d,0xce,0x3a}, -{0x18,0x63,0xdf,0x4a}, {0x82,0xe5,0x1a,0x31}, {0x60,0x97,0x51,0x33}, {0x45,0x62,0x53,0x7f}, -{0xe0,0xb1,0x64,0x77}, {0x84,0xbb,0x6b,0xae}, {0x1c,0xfe,0x81,0xa0}, {0x94,0xf9,0x08,0x2b}, -{0x58,0x70,0x48,0x68}, {0x19,0x8f,0x45,0xfd}, {0x87,0x94,0xde,0x6c}, {0xb7,0x52,0x7b,0xf8}, -{0x23,0xab,0x73,0xd3}, {0xe2,0x72,0x4b,0x02}, {0x57,0xe3,0x1f,0x8f}, {0x2a,0x66,0x55,0xab}, -{0x07,0xb2,0xeb,0x28}, {0x03,0x2f,0xb5,0xc2}, {0x9a,0x86,0xc5,0x7b}, {0xa5,0xd3,0x37,0x08}, -{0xf2,0x30,0x28,0x87}, {0xb2,0x23,0xbf,0xa5}, {0xba,0x02,0x03,0x6a}, {0x5c,0xed,0x16,0x82}, -{0x2b,0x8a,0xcf,0x1c}, {0x92,0xa7,0x79,0xb4}, {0xf0,0xf3,0x07,0xf2}, {0xa1,0x4e,0x69,0xe2}, -{0xcd,0x65,0xda,0xf4}, {0xd5,0x06,0x05,0xbe}, {0x1f,0xd1,0x34,0x62}, {0x8a,0xc4,0xa6,0xfe}, -{0x9d,0x34,0x2e,0x53}, {0xa0,0xa2,0xf3,0x55}, {0x32,0x05,0x8a,0xe1}, {0x75,0xa4,0xf6,0xeb}, -{0x39,0x0b,0x83,0xec}, {0xaa,0x40,0x60,0xef}, {0x06,0x5e,0x71,0x9f}, {0x51,0xbd,0x6e,0x10}, -{0xf9,0x3e,0x21,0x8a}, {0x3d,0x96,0xdd,0x06}, {0xae,0xdd,0x3e,0x05}, {0x46,0x4d,0xe6,0xbd}, -{0xb5,0x91,0x54,0x8d}, {0x05,0x71,0xc4,0x5d}, {0x6f,0x04,0x06,0xd4}, {0xff,0x60,0x50,0x15}, -{0x24,0x19,0x98,0xfb}, {0x97,0xd6,0xbd,0xe9}, {0xcc,0x89,0x40,0x43}, {0x77,0x67,0xd9,0x9e}, -{0xbd,0xb0,0xe8,0x42}, {0x88,0x07,0x89,0x8b}, {0x38,0xe7,0x19,0x5b}, {0xdb,0x79,0xc8,0xee}, -{0x47,0xa1,0x7c,0x0a}, {0xe9,0x7c,0x42,0x0f}, {0xc9,0xf8,0x84,0x1e}, {0x00,0x00,0x00,0x00}, -{0x83,0x09,0x80,0x86}, {0x48,0x32,0x2b,0xed}, {0xac,0x1e,0x11,0x70}, {0x4e,0x6c,0x5a,0x72}, -{0xfb,0xfd,0x0e,0xff}, {0x56,0x0f,0x85,0x38}, {0x1e,0x3d,0xae,0xd5}, {0x27,0x36,0x2d,0x39}, -{0x64,0x0a,0x0f,0xd9}, {0x21,0x68,0x5c,0xa6}, {0xd1,0x9b,0x5b,0x54}, {0x3a,0x24,0x36,0x2e}, -{0xb1,0x0c,0x0a,0x67}, {0x0f,0x93,0x57,0xe7}, {0xd2,0xb4,0xee,0x96}, {0x9e,0x1b,0x9b,0x91}, -{0x4f,0x80,0xc0,0xc5}, {0xa2,0x61,0xdc,0x20}, {0x69,0x5a,0x77,0x4b}, {0x16,0x1c,0x12,0x1a}, -{0x0a,0xe2,0x93,0xba}, {0xe5,0xc0,0xa0,0x2a}, {0x43,0x3c,0x22,0xe0}, {0x1d,0x12,0x1b,0x17}, -{0x0b,0x0e,0x09,0x0d}, {0xad,0xf2,0x8b,0xc7}, {0xb9,0x2d,0xb6,0xa8}, {0xc8,0x14,0x1e,0xa9}, -{0x85,0x57,0xf1,0x19}, {0x4c,0xaf,0x75,0x07}, {0xbb,0xee,0x99,0xdd}, {0xfd,0xa3,0x7f,0x60}, -{0x9f,0xf7,0x01,0x26}, {0xbc,0x5c,0x72,0xf5}, {0xc5,0x44,0x66,0x3b}, {0x34,0x5b,0xfb,0x7e}, -{0x76,0x8b,0x43,0x29}, {0xdc,0xcb,0x23,0xc6}, {0x68,0xb6,0xed,0xfc}, {0x63,0xb8,0xe4,0xf1}, 
-{0xca,0xd7,0x31,0xdc}, {0x10,0x42,0x63,0x85}, {0x40,0x13,0x97,0x22}, {0x20,0x84,0xc6,0x11}, -{0x7d,0x85,0x4a,0x24}, {0xf8,0xd2,0xbb,0x3d}, {0x11,0xae,0xf9,0x32}, {0x6d,0xc7,0x29,0xa1}, -{0x4b,0x1d,0x9e,0x2f}, {0xf3,0xdc,0xb2,0x30}, {0xec,0x0d,0x86,0x52}, {0xd0,0x77,0xc1,0xe3}, -{0x6c,0x2b,0xb3,0x16}, {0x99,0xa9,0x70,0xb9}, {0xfa,0x11,0x94,0x48}, {0x22,0x47,0xe9,0x64}, -{0xc4,0xa8,0xfc,0x8c}, {0x1a,0xa0,0xf0,0x3f}, {0xd8,0x56,0x7d,0x2c}, {0xef,0x22,0x33,0x90}, -{0xc7,0x87,0x49,0x4e}, {0xc1,0xd9,0x38,0xd1}, {0xfe,0x8c,0xca,0xa2}, {0x36,0x98,0xd4,0x0b}, -{0xcf,0xa6,0xf5,0x81}, {0x28,0xa5,0x7a,0xde}, {0x26,0xda,0xb7,0x8e}, {0xa4,0x3f,0xad,0xbf}, -{0xe4,0x2c,0x3a,0x9d}, {0x0d,0x50,0x78,0x92}, {0x9b,0x6a,0x5f,0xcc}, {0x62,0x54,0x7e,0x46}, -{0xc2,0xf6,0x8d,0x13}, {0xe8,0x90,0xd8,0xb8}, {0x5e,0x2e,0x39,0xf7}, {0xf5,0x82,0xc3,0xaf}, -{0xbe,0x9f,0x5d,0x80}, {0x7c,0x69,0xd0,0x93}, {0xa9,0x6f,0xd5,0x2d}, {0xb3,0xcf,0x25,0x12}, -{0x3b,0xc8,0xac,0x99}, {0xa7,0x10,0x18,0x7d}, {0x6e,0xe8,0x9c,0x63}, {0x7b,0xdb,0x3b,0xbb}, -{0x09,0xcd,0x26,0x78}, {0xf4,0x6e,0x59,0x18}, {0x01,0xec,0x9a,0xb7}, {0xa8,0x83,0x4f,0x9a}, -{0x65,0xe6,0x95,0x6e}, {0x7e,0xaa,0xff,0xe6}, {0x08,0x21,0xbc,0xcf}, {0xe6,0xef,0x15,0xe8}, -{0xd9,0xba,0xe7,0x9b}, {0xce,0x4a,0x6f,0x36}, {0xd4,0xea,0x9f,0x09}, {0xd6,0x29,0xb0,0x7c}, -{0xaf,0x31,0xa4,0xb2}, {0x31,0x2a,0x3f,0x23}, {0x30,0xc6,0xa5,0x94}, {0xc0,0x35,0xa2,0x66}, -{0x37,0x74,0x4e,0xbc}, {0xa6,0xfc,0x82,0xca}, {0xb0,0xe0,0x90,0xd0}, {0x15,0x33,0xa7,0xd8}, -{0x4a,0xf1,0x04,0x98}, {0xf7,0x41,0xec,0xda}, {0x0e,0x7f,0xcd,0x50}, {0x2f,0x17,0x91,0xf6}, -{0x8d,0x76,0x4d,0xd6}, {0x4d,0x43,0xef,0xb0}, {0x54,0xcc,0xaa,0x4d}, {0xdf,0xe4,0x96,0x04}, -{0xe3,0x9e,0xd1,0xb5}, {0x1b,0x4c,0x6a,0x88}, {0xb8,0xc1,0x2c,0x1f}, {0x7f,0x46,0x65,0x51}, -{0x04,0x9d,0x5e,0xea}, {0x5d,0x01,0x8c,0x35}, {0x73,0xfa,0x87,0x74}, {0x2e,0xfb,0x0b,0x41}, -{0x5a,0xb3,0x67,0x1d}, {0x52,0x92,0xdb,0xd2}, {0x33,0xe9,0x10,0x56}, {0x13,0x6d,0xd6,0x47}, -{0x8c,0x9a,0xd7,0x61}, {0x7a,0x37,0xa1,0x0c}, {0x8e,0x59,0xf8,0x14}, {0x89,0xeb,0x13,0x3c}, -{0xee,0xce,0xa9,0x27}, {0x35,0xb7,0x61,0xc9}, {0xed,0xe1,0x1c,0xe5}, {0x3c,0x7a,0x47,0xb1}, -{0x59,0x9c,0xd2,0xdf}, {0x3f,0x55,0xf2,0x73}, {0x79,0x18,0x14,0xce}, {0xbf,0x73,0xc7,0x37}, -{0xea,0x53,0xf7,0xcd}, {0x5b,0x5f,0xfd,0xaa}, {0x14,0xdf,0x3d,0x6f}, {0x86,0x78,0x44,0xdb}, -{0x81,0xca,0xaf,0xf3}, {0x3e,0xb9,0x68,0xc4}, {0x2c,0x38,0x24,0x34}, {0x5f,0xc2,0xa3,0x40}, -{0x72,0x16,0x1d,0xc3}, {0x0c,0xbc,0xe2,0x25}, {0x8b,0x28,0x3c,0x49}, {0x41,0xff,0x0d,0x95}, -{0x71,0x39,0xa8,0x01}, {0xde,0x08,0x0c,0xb3}, {0x9c,0xd8,0xb4,0xe4}, {0x90,0x64,0x56,0xc1}, -{0x61,0x7b,0xcb,0x84}, {0x70,0xd5,0x32,0xb6}, {0x74,0x48,0x6c,0x5c}, {0x42,0xd0,0xb8,0x57} - } -}; -#define T6 xT6.xt8 - -static const union xtab xT7 = { - .xt8 = { -{0xa7,0x50,0x51,0xf4}, {0x65,0x53,0x7e,0x41}, {0xa4,0xc3,0x1a,0x17}, {0x5e,0x96,0x3a,0x27}, -{0x6b,0xcb,0x3b,0xab}, {0x45,0xf1,0x1f,0x9d}, {0x58,0xab,0xac,0xfa}, {0x03,0x93,0x4b,0xe3}, -{0xfa,0x55,0x20,0x30}, {0x6d,0xf6,0xad,0x76}, {0x76,0x91,0x88,0xcc}, {0x4c,0x25,0xf5,0x02}, -{0xd7,0xfc,0x4f,0xe5}, {0xcb,0xd7,0xc5,0x2a}, {0x44,0x80,0x26,0x35}, {0xa3,0x8f,0xb5,0x62}, -{0x5a,0x49,0xde,0xb1}, {0x1b,0x67,0x25,0xba}, {0x0e,0x98,0x45,0xea}, {0xc0,0xe1,0x5d,0xfe}, -{0x75,0x02,0xc3,0x2f}, {0xf0,0x12,0x81,0x4c}, {0x97,0xa3,0x8d,0x46}, {0xf9,0xc6,0x6b,0xd3}, -{0x5f,0xe7,0x03,0x8f}, {0x9c,0x95,0x15,0x92}, {0x7a,0xeb,0xbf,0x6d}, {0x59,0xda,0x95,0x52}, -{0x83,0x2d,0xd4,0xbe}, {0x21,0xd3,0x58,0x74}, {0x69,0x29,0x49,0xe0}, {0xc8,0x44,0x8e,0xc9}, -{0x89,0x6a,0x75,0xc2}, 
{0x79,0x78,0xf4,0x8e}, {0x3e,0x6b,0x99,0x58}, {0x71,0xdd,0x27,0xb9}, -{0x4f,0xb6,0xbe,0xe1}, {0xad,0x17,0xf0,0x88}, {0xac,0x66,0xc9,0x20}, {0x3a,0xb4,0x7d,0xce}, -{0x4a,0x18,0x63,0xdf}, {0x31,0x82,0xe5,0x1a}, {0x33,0x60,0x97,0x51}, {0x7f,0x45,0x62,0x53}, -{0x77,0xe0,0xb1,0x64}, {0xae,0x84,0xbb,0x6b}, {0xa0,0x1c,0xfe,0x81}, {0x2b,0x94,0xf9,0x08}, -{0x68,0x58,0x70,0x48}, {0xfd,0x19,0x8f,0x45}, {0x6c,0x87,0x94,0xde}, {0xf8,0xb7,0x52,0x7b}, -{0xd3,0x23,0xab,0x73}, {0x02,0xe2,0x72,0x4b}, {0x8f,0x57,0xe3,0x1f}, {0xab,0x2a,0x66,0x55}, -{0x28,0x07,0xb2,0xeb}, {0xc2,0x03,0x2f,0xb5}, {0x7b,0x9a,0x86,0xc5}, {0x08,0xa5,0xd3,0x37}, -{0x87,0xf2,0x30,0x28}, {0xa5,0xb2,0x23,0xbf}, {0x6a,0xba,0x02,0x03}, {0x82,0x5c,0xed,0x16}, -{0x1c,0x2b,0x8a,0xcf}, {0xb4,0x92,0xa7,0x79}, {0xf2,0xf0,0xf3,0x07}, {0xe2,0xa1,0x4e,0x69}, -{0xf4,0xcd,0x65,0xda}, {0xbe,0xd5,0x06,0x05}, {0x62,0x1f,0xd1,0x34}, {0xfe,0x8a,0xc4,0xa6}, -{0x53,0x9d,0x34,0x2e}, {0x55,0xa0,0xa2,0xf3}, {0xe1,0x32,0x05,0x8a}, {0xeb,0x75,0xa4,0xf6}, -{0xec,0x39,0x0b,0x83}, {0xef,0xaa,0x40,0x60}, {0x9f,0x06,0x5e,0x71}, {0x10,0x51,0xbd,0x6e}, -{0x8a,0xf9,0x3e,0x21}, {0x06,0x3d,0x96,0xdd}, {0x05,0xae,0xdd,0x3e}, {0xbd,0x46,0x4d,0xe6}, -{0x8d,0xb5,0x91,0x54}, {0x5d,0x05,0x71,0xc4}, {0xd4,0x6f,0x04,0x06}, {0x15,0xff,0x60,0x50}, -{0xfb,0x24,0x19,0x98}, {0xe9,0x97,0xd6,0xbd}, {0x43,0xcc,0x89,0x40}, {0x9e,0x77,0x67,0xd9}, -{0x42,0xbd,0xb0,0xe8}, {0x8b,0x88,0x07,0x89}, {0x5b,0x38,0xe7,0x19}, {0xee,0xdb,0x79,0xc8}, -{0x0a,0x47,0xa1,0x7c}, {0x0f,0xe9,0x7c,0x42}, {0x1e,0xc9,0xf8,0x84}, {0x00,0x00,0x00,0x00}, -{0x86,0x83,0x09,0x80}, {0xed,0x48,0x32,0x2b}, {0x70,0xac,0x1e,0x11}, {0x72,0x4e,0x6c,0x5a}, -{0xff,0xfb,0xfd,0x0e}, {0x38,0x56,0x0f,0x85}, {0xd5,0x1e,0x3d,0xae}, {0x39,0x27,0x36,0x2d}, -{0xd9,0x64,0x0a,0x0f}, {0xa6,0x21,0x68,0x5c}, {0x54,0xd1,0x9b,0x5b}, {0x2e,0x3a,0x24,0x36}, -{0x67,0xb1,0x0c,0x0a}, {0xe7,0x0f,0x93,0x57}, {0x96,0xd2,0xb4,0xee}, {0x91,0x9e,0x1b,0x9b}, -{0xc5,0x4f,0x80,0xc0}, {0x20,0xa2,0x61,0xdc}, {0x4b,0x69,0x5a,0x77}, {0x1a,0x16,0x1c,0x12}, -{0xba,0x0a,0xe2,0x93}, {0x2a,0xe5,0xc0,0xa0}, {0xe0,0x43,0x3c,0x22}, {0x17,0x1d,0x12,0x1b}, -{0x0d,0x0b,0x0e,0x09}, {0xc7,0xad,0xf2,0x8b}, {0xa8,0xb9,0x2d,0xb6}, {0xa9,0xc8,0x14,0x1e}, -{0x19,0x85,0x57,0xf1}, {0x07,0x4c,0xaf,0x75}, {0xdd,0xbb,0xee,0x99}, {0x60,0xfd,0xa3,0x7f}, -{0x26,0x9f,0xf7,0x01}, {0xf5,0xbc,0x5c,0x72}, {0x3b,0xc5,0x44,0x66}, {0x7e,0x34,0x5b,0xfb}, -{0x29,0x76,0x8b,0x43}, {0xc6,0xdc,0xcb,0x23}, {0xfc,0x68,0xb6,0xed}, {0xf1,0x63,0xb8,0xe4}, -{0xdc,0xca,0xd7,0x31}, {0x85,0x10,0x42,0x63}, {0x22,0x40,0x13,0x97}, {0x11,0x20,0x84,0xc6}, -{0x24,0x7d,0x85,0x4a}, {0x3d,0xf8,0xd2,0xbb}, {0x32,0x11,0xae,0xf9}, {0xa1,0x6d,0xc7,0x29}, -{0x2f,0x4b,0x1d,0x9e}, {0x30,0xf3,0xdc,0xb2}, {0x52,0xec,0x0d,0x86}, {0xe3,0xd0,0x77,0xc1}, -{0x16,0x6c,0x2b,0xb3}, {0xb9,0x99,0xa9,0x70}, {0x48,0xfa,0x11,0x94}, {0x64,0x22,0x47,0xe9}, -{0x8c,0xc4,0xa8,0xfc}, {0x3f,0x1a,0xa0,0xf0}, {0x2c,0xd8,0x56,0x7d}, {0x90,0xef,0x22,0x33}, -{0x4e,0xc7,0x87,0x49}, {0xd1,0xc1,0xd9,0x38}, {0xa2,0xfe,0x8c,0xca}, {0x0b,0x36,0x98,0xd4}, -{0x81,0xcf,0xa6,0xf5}, {0xde,0x28,0xa5,0x7a}, {0x8e,0x26,0xda,0xb7}, {0xbf,0xa4,0x3f,0xad}, -{0x9d,0xe4,0x2c,0x3a}, {0x92,0x0d,0x50,0x78}, {0xcc,0x9b,0x6a,0x5f}, {0x46,0x62,0x54,0x7e}, -{0x13,0xc2,0xf6,0x8d}, {0xb8,0xe8,0x90,0xd8}, {0xf7,0x5e,0x2e,0x39}, {0xaf,0xf5,0x82,0xc3}, -{0x80,0xbe,0x9f,0x5d}, {0x93,0x7c,0x69,0xd0}, {0x2d,0xa9,0x6f,0xd5}, {0x12,0xb3,0xcf,0x25}, -{0x99,0x3b,0xc8,0xac}, {0x7d,0xa7,0x10,0x18}, {0x63,0x6e,0xe8,0x9c}, {0xbb,0x7b,0xdb,0x3b}, -{0x78,0x09,0xcd,0x26}, 
{0x18,0xf4,0x6e,0x59}, {0xb7,0x01,0xec,0x9a}, {0x9a,0xa8,0x83,0x4f}, -{0x6e,0x65,0xe6,0x95}, {0xe6,0x7e,0xaa,0xff}, {0xcf,0x08,0x21,0xbc}, {0xe8,0xe6,0xef,0x15}, -{0x9b,0xd9,0xba,0xe7}, {0x36,0xce,0x4a,0x6f}, {0x09,0xd4,0xea,0x9f}, {0x7c,0xd6,0x29,0xb0}, -{0xb2,0xaf,0x31,0xa4}, {0x23,0x31,0x2a,0x3f}, {0x94,0x30,0xc6,0xa5}, {0x66,0xc0,0x35,0xa2}, -{0xbc,0x37,0x74,0x4e}, {0xca,0xa6,0xfc,0x82}, {0xd0,0xb0,0xe0,0x90}, {0xd8,0x15,0x33,0xa7}, -{0x98,0x4a,0xf1,0x04}, {0xda,0xf7,0x41,0xec}, {0x50,0x0e,0x7f,0xcd}, {0xf6,0x2f,0x17,0x91}, -{0xd6,0x8d,0x76,0x4d}, {0xb0,0x4d,0x43,0xef}, {0x4d,0x54,0xcc,0xaa}, {0x04,0xdf,0xe4,0x96}, -{0xb5,0xe3,0x9e,0xd1}, {0x88,0x1b,0x4c,0x6a}, {0x1f,0xb8,0xc1,0x2c}, {0x51,0x7f,0x46,0x65}, -{0xea,0x04,0x9d,0x5e}, {0x35,0x5d,0x01,0x8c}, {0x74,0x73,0xfa,0x87}, {0x41,0x2e,0xfb,0x0b}, -{0x1d,0x5a,0xb3,0x67}, {0xd2,0x52,0x92,0xdb}, {0x56,0x33,0xe9,0x10}, {0x47,0x13,0x6d,0xd6}, -{0x61,0x8c,0x9a,0xd7}, {0x0c,0x7a,0x37,0xa1}, {0x14,0x8e,0x59,0xf8}, {0x3c,0x89,0xeb,0x13}, -{0x27,0xee,0xce,0xa9}, {0xc9,0x35,0xb7,0x61}, {0xe5,0xed,0xe1,0x1c}, {0xb1,0x3c,0x7a,0x47}, -{0xdf,0x59,0x9c,0xd2}, {0x73,0x3f,0x55,0xf2}, {0xce,0x79,0x18,0x14}, {0x37,0xbf,0x73,0xc7}, -{0xcd,0xea,0x53,0xf7}, {0xaa,0x5b,0x5f,0xfd}, {0x6f,0x14,0xdf,0x3d}, {0xdb,0x86,0x78,0x44}, -{0xf3,0x81,0xca,0xaf}, {0xc4,0x3e,0xb9,0x68}, {0x34,0x2c,0x38,0x24}, {0x40,0x5f,0xc2,0xa3}, -{0xc3,0x72,0x16,0x1d}, {0x25,0x0c,0xbc,0xe2}, {0x49,0x8b,0x28,0x3c}, {0x95,0x41,0xff,0x0d}, -{0x01,0x71,0x39,0xa8}, {0xb3,0xde,0x08,0x0c}, {0xe4,0x9c,0xd8,0xb4}, {0xc1,0x90,0x64,0x56}, -{0x84,0x61,0x7b,0xcb}, {0xb6,0x70,0xd5,0x32}, {0x5c,0x74,0x48,0x6c}, {0x57,0x42,0xd0,0xb8} - } -}; -#define T7 xT7.xt8 - -static const union xtab xT8 = { - .xt8 = { -{0xf4,0xa7,0x50,0x51}, {0x41,0x65,0x53,0x7e}, {0x17,0xa4,0xc3,0x1a}, {0x27,0x5e,0x96,0x3a}, -{0xab,0x6b,0xcb,0x3b}, {0x9d,0x45,0xf1,0x1f}, {0xfa,0x58,0xab,0xac}, {0xe3,0x03,0x93,0x4b}, -{0x30,0xfa,0x55,0x20}, {0x76,0x6d,0xf6,0xad}, {0xcc,0x76,0x91,0x88}, {0x02,0x4c,0x25,0xf5}, -{0xe5,0xd7,0xfc,0x4f}, {0x2a,0xcb,0xd7,0xc5}, {0x35,0x44,0x80,0x26}, {0x62,0xa3,0x8f,0xb5}, -{0xb1,0x5a,0x49,0xde}, {0xba,0x1b,0x67,0x25}, {0xea,0x0e,0x98,0x45}, {0xfe,0xc0,0xe1,0x5d}, -{0x2f,0x75,0x02,0xc3}, {0x4c,0xf0,0x12,0x81}, {0x46,0x97,0xa3,0x8d}, {0xd3,0xf9,0xc6,0x6b}, -{0x8f,0x5f,0xe7,0x03}, {0x92,0x9c,0x95,0x15}, {0x6d,0x7a,0xeb,0xbf}, {0x52,0x59,0xda,0x95}, -{0xbe,0x83,0x2d,0xd4}, {0x74,0x21,0xd3,0x58}, {0xe0,0x69,0x29,0x49}, {0xc9,0xc8,0x44,0x8e}, -{0xc2,0x89,0x6a,0x75}, {0x8e,0x79,0x78,0xf4}, {0x58,0x3e,0x6b,0x99}, {0xb9,0x71,0xdd,0x27}, -{0xe1,0x4f,0xb6,0xbe}, {0x88,0xad,0x17,0xf0}, {0x20,0xac,0x66,0xc9}, {0xce,0x3a,0xb4,0x7d}, -{0xdf,0x4a,0x18,0x63}, {0x1a,0x31,0x82,0xe5}, {0x51,0x33,0x60,0x97}, {0x53,0x7f,0x45,0x62}, -{0x64,0x77,0xe0,0xb1}, {0x6b,0xae,0x84,0xbb}, {0x81,0xa0,0x1c,0xfe}, {0x08,0x2b,0x94,0xf9}, -{0x48,0x68,0x58,0x70}, {0x45,0xfd,0x19,0x8f}, {0xde,0x6c,0x87,0x94}, {0x7b,0xf8,0xb7,0x52}, -{0x73,0xd3,0x23,0xab}, {0x4b,0x02,0xe2,0x72}, {0x1f,0x8f,0x57,0xe3}, {0x55,0xab,0x2a,0x66}, -{0xeb,0x28,0x07,0xb2}, {0xb5,0xc2,0x03,0x2f}, {0xc5,0x7b,0x9a,0x86}, {0x37,0x08,0xa5,0xd3}, -{0x28,0x87,0xf2,0x30}, {0xbf,0xa5,0xb2,0x23}, {0x03,0x6a,0xba,0x02}, {0x16,0x82,0x5c,0xed}, -{0xcf,0x1c,0x2b,0x8a}, {0x79,0xb4,0x92,0xa7}, {0x07,0xf2,0xf0,0xf3}, {0x69,0xe2,0xa1,0x4e}, -{0xda,0xf4,0xcd,0x65}, {0x05,0xbe,0xd5,0x06}, {0x34,0x62,0x1f,0xd1}, {0xa6,0xfe,0x8a,0xc4}, -{0x2e,0x53,0x9d,0x34}, {0xf3,0x55,0xa0,0xa2}, {0x8a,0xe1,0x32,0x05}, {0xf6,0xeb,0x75,0xa4}, -{0x83,0xec,0x39,0x0b}, {0x60,0xef,0xaa,0x40}, 
{0x71,0x9f,0x06,0x5e}, {0x6e,0x10,0x51,0xbd}, -{0x21,0x8a,0xf9,0x3e}, {0xdd,0x06,0x3d,0x96}, {0x3e,0x05,0xae,0xdd}, {0xe6,0xbd,0x46,0x4d}, -{0x54,0x8d,0xb5,0x91}, {0xc4,0x5d,0x05,0x71}, {0x06,0xd4,0x6f,0x04}, {0x50,0x15,0xff,0x60}, -{0x98,0xfb,0x24,0x19}, {0xbd,0xe9,0x97,0xd6}, {0x40,0x43,0xcc,0x89}, {0xd9,0x9e,0x77,0x67}, -{0xe8,0x42,0xbd,0xb0}, {0x89,0x8b,0x88,0x07}, {0x19,0x5b,0x38,0xe7}, {0xc8,0xee,0xdb,0x79}, -{0x7c,0x0a,0x47,0xa1}, {0x42,0x0f,0xe9,0x7c}, {0x84,0x1e,0xc9,0xf8}, {0x00,0x00,0x00,0x00}, -{0x80,0x86,0x83,0x09}, {0x2b,0xed,0x48,0x32}, {0x11,0x70,0xac,0x1e}, {0x5a,0x72,0x4e,0x6c}, -{0x0e,0xff,0xfb,0xfd}, {0x85,0x38,0x56,0x0f}, {0xae,0xd5,0x1e,0x3d}, {0x2d,0x39,0x27,0x36}, -{0x0f,0xd9,0x64,0x0a}, {0x5c,0xa6,0x21,0x68}, {0x5b,0x54,0xd1,0x9b}, {0x36,0x2e,0x3a,0x24}, -{0x0a,0x67,0xb1,0x0c}, {0x57,0xe7,0x0f,0x93}, {0xee,0x96,0xd2,0xb4}, {0x9b,0x91,0x9e,0x1b}, -{0xc0,0xc5,0x4f,0x80}, {0xdc,0x20,0xa2,0x61}, {0x77,0x4b,0x69,0x5a}, {0x12,0x1a,0x16,0x1c}, -{0x93,0xba,0x0a,0xe2}, {0xa0,0x2a,0xe5,0xc0}, {0x22,0xe0,0x43,0x3c}, {0x1b,0x17,0x1d,0x12}, -{0x09,0x0d,0x0b,0x0e}, {0x8b,0xc7,0xad,0xf2}, {0xb6,0xa8,0xb9,0x2d}, {0x1e,0xa9,0xc8,0x14}, -{0xf1,0x19,0x85,0x57}, {0x75,0x07,0x4c,0xaf}, {0x99,0xdd,0xbb,0xee}, {0x7f,0x60,0xfd,0xa3}, -{0x01,0x26,0x9f,0xf7}, {0x72,0xf5,0xbc,0x5c}, {0x66,0x3b,0xc5,0x44}, {0xfb,0x7e,0x34,0x5b}, -{0x43,0x29,0x76,0x8b}, {0x23,0xc6,0xdc,0xcb}, {0xed,0xfc,0x68,0xb6}, {0xe4,0xf1,0x63,0xb8}, -{0x31,0xdc,0xca,0xd7}, {0x63,0x85,0x10,0x42}, {0x97,0x22,0x40,0x13}, {0xc6,0x11,0x20,0x84}, -{0x4a,0x24,0x7d,0x85}, {0xbb,0x3d,0xf8,0xd2}, {0xf9,0x32,0x11,0xae}, {0x29,0xa1,0x6d,0xc7}, -{0x9e,0x2f,0x4b,0x1d}, {0xb2,0x30,0xf3,0xdc}, {0x86,0x52,0xec,0x0d}, {0xc1,0xe3,0xd0,0x77}, -{0xb3,0x16,0x6c,0x2b}, {0x70,0xb9,0x99,0xa9}, {0x94,0x48,0xfa,0x11}, {0xe9,0x64,0x22,0x47}, -{0xfc,0x8c,0xc4,0xa8}, {0xf0,0x3f,0x1a,0xa0}, {0x7d,0x2c,0xd8,0x56}, {0x33,0x90,0xef,0x22}, -{0x49,0x4e,0xc7,0x87}, {0x38,0xd1,0xc1,0xd9}, {0xca,0xa2,0xfe,0x8c}, {0xd4,0x0b,0x36,0x98}, -{0xf5,0x81,0xcf,0xa6}, {0x7a,0xde,0x28,0xa5}, {0xb7,0x8e,0x26,0xda}, {0xad,0xbf,0xa4,0x3f}, -{0x3a,0x9d,0xe4,0x2c}, {0x78,0x92,0x0d,0x50}, {0x5f,0xcc,0x9b,0x6a}, {0x7e,0x46,0x62,0x54}, -{0x8d,0x13,0xc2,0xf6}, {0xd8,0xb8,0xe8,0x90}, {0x39,0xf7,0x5e,0x2e}, {0xc3,0xaf,0xf5,0x82}, -{0x5d,0x80,0xbe,0x9f}, {0xd0,0x93,0x7c,0x69}, {0xd5,0x2d,0xa9,0x6f}, {0x25,0x12,0xb3,0xcf}, -{0xac,0x99,0x3b,0xc8}, {0x18,0x7d,0xa7,0x10}, {0x9c,0x63,0x6e,0xe8}, {0x3b,0xbb,0x7b,0xdb}, -{0x26,0x78,0x09,0xcd}, {0x59,0x18,0xf4,0x6e}, {0x9a,0xb7,0x01,0xec}, {0x4f,0x9a,0xa8,0x83}, -{0x95,0x6e,0x65,0xe6}, {0xff,0xe6,0x7e,0xaa}, {0xbc,0xcf,0x08,0x21}, {0x15,0xe8,0xe6,0xef}, -{0xe7,0x9b,0xd9,0xba}, {0x6f,0x36,0xce,0x4a}, {0x9f,0x09,0xd4,0xea}, {0xb0,0x7c,0xd6,0x29}, -{0xa4,0xb2,0xaf,0x31}, {0x3f,0x23,0x31,0x2a}, {0xa5,0x94,0x30,0xc6}, {0xa2,0x66,0xc0,0x35}, -{0x4e,0xbc,0x37,0x74}, {0x82,0xca,0xa6,0xfc}, {0x90,0xd0,0xb0,0xe0}, {0xa7,0xd8,0x15,0x33}, -{0x04,0x98,0x4a,0xf1}, {0xec,0xda,0xf7,0x41}, {0xcd,0x50,0x0e,0x7f}, {0x91,0xf6,0x2f,0x17}, -{0x4d,0xd6,0x8d,0x76}, {0xef,0xb0,0x4d,0x43}, {0xaa,0x4d,0x54,0xcc}, {0x96,0x04,0xdf,0xe4}, -{0xd1,0xb5,0xe3,0x9e}, {0x6a,0x88,0x1b,0x4c}, {0x2c,0x1f,0xb8,0xc1}, {0x65,0x51,0x7f,0x46}, -{0x5e,0xea,0x04,0x9d}, {0x8c,0x35,0x5d,0x01}, {0x87,0x74,0x73,0xfa}, {0x0b,0x41,0x2e,0xfb}, -{0x67,0x1d,0x5a,0xb3}, {0xdb,0xd2,0x52,0x92}, {0x10,0x56,0x33,0xe9}, {0xd6,0x47,0x13,0x6d}, -{0xd7,0x61,0x8c,0x9a}, {0xa1,0x0c,0x7a,0x37}, {0xf8,0x14,0x8e,0x59}, {0x13,0x3c,0x89,0xeb}, -{0xa9,0x27,0xee,0xce}, {0x61,0xc9,0x35,0xb7}, 
{0x1c,0xe5,0xed,0xe1}, {0x47,0xb1,0x3c,0x7a}, -{0xd2,0xdf,0x59,0x9c}, {0xf2,0x73,0x3f,0x55}, {0x14,0xce,0x79,0x18}, {0xc7,0x37,0xbf,0x73}, -{0xf7,0xcd,0xea,0x53}, {0xfd,0xaa,0x5b,0x5f}, {0x3d,0x6f,0x14,0xdf}, {0x44,0xdb,0x86,0x78}, -{0xaf,0xf3,0x81,0xca}, {0x68,0xc4,0x3e,0xb9}, {0x24,0x34,0x2c,0x38}, {0xa3,0x40,0x5f,0xc2}, -{0x1d,0xc3,0x72,0x16}, {0xe2,0x25,0x0c,0xbc}, {0x3c,0x49,0x8b,0x28}, {0x0d,0x95,0x41,0xff}, -{0xa8,0x01,0x71,0x39}, {0x0c,0xb3,0xde,0x08}, {0xb4,0xe4,0x9c,0xd8}, {0x56,0xc1,0x90,0x64}, -{0xcb,0x84,0x61,0x7b}, {0x32,0xb6,0x70,0xd5}, {0x6c,0x5c,0x74,0x48}, {0xb8,0x57,0x42,0xd0} - } -}; -#define T8 xT8.xt8 - -static const word8 S5[256] = { -0x52,0x09,0x6a,0xd5, -0x30,0x36,0xa5,0x38, -0xbf,0x40,0xa3,0x9e, -0x81,0xf3,0xd7,0xfb, -0x7c,0xe3,0x39,0x82, -0x9b,0x2f,0xff,0x87, -0x34,0x8e,0x43,0x44, -0xc4,0xde,0xe9,0xcb, -0x54,0x7b,0x94,0x32, -0xa6,0xc2,0x23,0x3d, -0xee,0x4c,0x95,0x0b, -0x42,0xfa,0xc3,0x4e, -0x08,0x2e,0xa1,0x66, -0x28,0xd9,0x24,0xb2, -0x76,0x5b,0xa2,0x49, -0x6d,0x8b,0xd1,0x25, -0x72,0xf8,0xf6,0x64, -0x86,0x68,0x98,0x16, -0xd4,0xa4,0x5c,0xcc, -0x5d,0x65,0xb6,0x92, -0x6c,0x70,0x48,0x50, -0xfd,0xed,0xb9,0xda, -0x5e,0x15,0x46,0x57, -0xa7,0x8d,0x9d,0x84, -0x90,0xd8,0xab,0x00, -0x8c,0xbc,0xd3,0x0a, -0xf7,0xe4,0x58,0x05, -0xb8,0xb3,0x45,0x06, -0xd0,0x2c,0x1e,0x8f, -0xca,0x3f,0x0f,0x02, -0xc1,0xaf,0xbd,0x03, -0x01,0x13,0x8a,0x6b, -0x3a,0x91,0x11,0x41, -0x4f,0x67,0xdc,0xea, -0x97,0xf2,0xcf,0xce, -0xf0,0xb4,0xe6,0x73, -0x96,0xac,0x74,0x22, -0xe7,0xad,0x35,0x85, -0xe2,0xf9,0x37,0xe8, -0x1c,0x75,0xdf,0x6e, -0x47,0xf1,0x1a,0x71, -0x1d,0x29,0xc5,0x89, -0x6f,0xb7,0x62,0x0e, -0xaa,0x18,0xbe,0x1b, -0xfc,0x56,0x3e,0x4b, -0xc6,0xd2,0x79,0x20, -0x9a,0xdb,0xc0,0xfe, -0x78,0xcd,0x5a,0xf4, -0x1f,0xdd,0xa8,0x33, -0x88,0x07,0xc7,0x31, -0xb1,0x12,0x10,0x59, -0x27,0x80,0xec,0x5f, -0x60,0x51,0x7f,0xa9, -0x19,0xb5,0x4a,0x0d, -0x2d,0xe5,0x7a,0x9f, -0x93,0xc9,0x9c,0xef, -0xa0,0xe0,0x3b,0x4d, -0xae,0x2a,0xf5,0xb0, -0xc8,0xeb,0xbb,0x3c, -0x83,0x53,0x99,0x61, -0x17,0x2b,0x04,0x7e, -0xba,0x77,0xd6,0x26, -0xe1,0x69,0x14,0x63, -0x55,0x21,0x0c,0x7d -}; - -static const union xtab xU1 = { - .xt8 = { -{0x00,0x00,0x00,0x00}, {0x0e,0x09,0x0d,0x0b}, {0x1c,0x12,0x1a,0x16}, {0x12,0x1b,0x17,0x1d}, -{0x38,0x24,0x34,0x2c}, {0x36,0x2d,0x39,0x27}, {0x24,0x36,0x2e,0x3a}, {0x2a,0x3f,0x23,0x31}, -{0x70,0x48,0x68,0x58}, {0x7e,0x41,0x65,0x53}, {0x6c,0x5a,0x72,0x4e}, {0x62,0x53,0x7f,0x45}, -{0x48,0x6c,0x5c,0x74}, {0x46,0x65,0x51,0x7f}, {0x54,0x7e,0x46,0x62}, {0x5a,0x77,0x4b,0x69}, -{0xe0,0x90,0xd0,0xb0}, {0xee,0x99,0xdd,0xbb}, {0xfc,0x82,0xca,0xa6}, {0xf2,0x8b,0xc7,0xad}, -{0xd8,0xb4,0xe4,0x9c}, {0xd6,0xbd,0xe9,0x97}, {0xc4,0xa6,0xfe,0x8a}, {0xca,0xaf,0xf3,0x81}, -{0x90,0xd8,0xb8,0xe8}, {0x9e,0xd1,0xb5,0xe3}, {0x8c,0xca,0xa2,0xfe}, {0x82,0xc3,0xaf,0xf5}, -{0xa8,0xfc,0x8c,0xc4}, {0xa6,0xf5,0x81,0xcf}, {0xb4,0xee,0x96,0xd2}, {0xba,0xe7,0x9b,0xd9}, -{0xdb,0x3b,0xbb,0x7b}, {0xd5,0x32,0xb6,0x70}, {0xc7,0x29,0xa1,0x6d}, {0xc9,0x20,0xac,0x66}, -{0xe3,0x1f,0x8f,0x57}, {0xed,0x16,0x82,0x5c}, {0xff,0x0d,0x95,0x41}, {0xf1,0x04,0x98,0x4a}, -{0xab,0x73,0xd3,0x23}, {0xa5,0x7a,0xde,0x28}, {0xb7,0x61,0xc9,0x35}, {0xb9,0x68,0xc4,0x3e}, -{0x93,0x57,0xe7,0x0f}, {0x9d,0x5e,0xea,0x04}, {0x8f,0x45,0xfd,0x19}, {0x81,0x4c,0xf0,0x12}, -{0x3b,0xab,0x6b,0xcb}, {0x35,0xa2,0x66,0xc0}, {0x27,0xb9,0x71,0xdd}, {0x29,0xb0,0x7c,0xd6}, -{0x03,0x8f,0x5f,0xe7}, {0x0d,0x86,0x52,0xec}, {0x1f,0x9d,0x45,0xf1}, {0x11,0x94,0x48,0xfa}, -{0x4b,0xe3,0x03,0x93}, {0x45,0xea,0x0e,0x98}, {0x57,0xf1,0x19,0x85}, {0x59,0xf8,0x14,0x8e}, -{0x73,0xc7,0x37,0xbf}, 
{0x7d,0xce,0x3a,0xb4}, {0x6f,0xd5,0x2d,0xa9}, {0x61,0xdc,0x20,0xa2}, -{0xad,0x76,0x6d,0xf6}, {0xa3,0x7f,0x60,0xfd}, {0xb1,0x64,0x77,0xe0}, {0xbf,0x6d,0x7a,0xeb}, -{0x95,0x52,0x59,0xda}, {0x9b,0x5b,0x54,0xd1}, {0x89,0x40,0x43,0xcc}, {0x87,0x49,0x4e,0xc7}, -{0xdd,0x3e,0x05,0xae}, {0xd3,0x37,0x08,0xa5}, {0xc1,0x2c,0x1f,0xb8}, {0xcf,0x25,0x12,0xb3}, -{0xe5,0x1a,0x31,0x82}, {0xeb,0x13,0x3c,0x89}, {0xf9,0x08,0x2b,0x94}, {0xf7,0x01,0x26,0x9f}, -{0x4d,0xe6,0xbd,0x46}, {0x43,0xef,0xb0,0x4d}, {0x51,0xf4,0xa7,0x50}, {0x5f,0xfd,0xaa,0x5b}, -{0x75,0xc2,0x89,0x6a}, {0x7b,0xcb,0x84,0x61}, {0x69,0xd0,0x93,0x7c}, {0x67,0xd9,0x9e,0x77}, -{0x3d,0xae,0xd5,0x1e}, {0x33,0xa7,0xd8,0x15}, {0x21,0xbc,0xcf,0x08}, {0x2f,0xb5,0xc2,0x03}, -{0x05,0x8a,0xe1,0x32}, {0x0b,0x83,0xec,0x39}, {0x19,0x98,0xfb,0x24}, {0x17,0x91,0xf6,0x2f}, -{0x76,0x4d,0xd6,0x8d}, {0x78,0x44,0xdb,0x86}, {0x6a,0x5f,0xcc,0x9b}, {0x64,0x56,0xc1,0x90}, -{0x4e,0x69,0xe2,0xa1}, {0x40,0x60,0xef,0xaa}, {0x52,0x7b,0xf8,0xb7}, {0x5c,0x72,0xf5,0xbc}, -{0x06,0x05,0xbe,0xd5}, {0x08,0x0c,0xb3,0xde}, {0x1a,0x17,0xa4,0xc3}, {0x14,0x1e,0xa9,0xc8}, -{0x3e,0x21,0x8a,0xf9}, {0x30,0x28,0x87,0xf2}, {0x22,0x33,0x90,0xef}, {0x2c,0x3a,0x9d,0xe4}, -{0x96,0xdd,0x06,0x3d}, {0x98,0xd4,0x0b,0x36}, {0x8a,0xcf,0x1c,0x2b}, {0x84,0xc6,0x11,0x20}, -{0xae,0xf9,0x32,0x11}, {0xa0,0xf0,0x3f,0x1a}, {0xb2,0xeb,0x28,0x07}, {0xbc,0xe2,0x25,0x0c}, -{0xe6,0x95,0x6e,0x65}, {0xe8,0x9c,0x63,0x6e}, {0xfa,0x87,0x74,0x73}, {0xf4,0x8e,0x79,0x78}, -{0xde,0xb1,0x5a,0x49}, {0xd0,0xb8,0x57,0x42}, {0xc2,0xa3,0x40,0x5f}, {0xcc,0xaa,0x4d,0x54}, -{0x41,0xec,0xda,0xf7}, {0x4f,0xe5,0xd7,0xfc}, {0x5d,0xfe,0xc0,0xe1}, {0x53,0xf7,0xcd,0xea}, -{0x79,0xc8,0xee,0xdb}, {0x77,0xc1,0xe3,0xd0}, {0x65,0xda,0xf4,0xcd}, {0x6b,0xd3,0xf9,0xc6}, -{0x31,0xa4,0xb2,0xaf}, {0x3f,0xad,0xbf,0xa4}, {0x2d,0xb6,0xa8,0xb9}, {0x23,0xbf,0xa5,0xb2}, -{0x09,0x80,0x86,0x83}, {0x07,0x89,0x8b,0x88}, {0x15,0x92,0x9c,0x95}, {0x1b,0x9b,0x91,0x9e}, -{0xa1,0x7c,0x0a,0x47}, {0xaf,0x75,0x07,0x4c}, {0xbd,0x6e,0x10,0x51}, {0xb3,0x67,0x1d,0x5a}, -{0x99,0x58,0x3e,0x6b}, {0x97,0x51,0x33,0x60}, {0x85,0x4a,0x24,0x7d}, {0x8b,0x43,0x29,0x76}, -{0xd1,0x34,0x62,0x1f}, {0xdf,0x3d,0x6f,0x14}, {0xcd,0x26,0x78,0x09}, {0xc3,0x2f,0x75,0x02}, -{0xe9,0x10,0x56,0x33}, {0xe7,0x19,0x5b,0x38}, {0xf5,0x02,0x4c,0x25}, {0xfb,0x0b,0x41,0x2e}, -{0x9a,0xd7,0x61,0x8c}, {0x94,0xde,0x6c,0x87}, {0x86,0xc5,0x7b,0x9a}, {0x88,0xcc,0x76,0x91}, -{0xa2,0xf3,0x55,0xa0}, {0xac,0xfa,0x58,0xab}, {0xbe,0xe1,0x4f,0xb6}, {0xb0,0xe8,0x42,0xbd}, -{0xea,0x9f,0x09,0xd4}, {0xe4,0x96,0x04,0xdf}, {0xf6,0x8d,0x13,0xc2}, {0xf8,0x84,0x1e,0xc9}, -{0xd2,0xbb,0x3d,0xf8}, {0xdc,0xb2,0x30,0xf3}, {0xce,0xa9,0x27,0xee}, {0xc0,0xa0,0x2a,0xe5}, -{0x7a,0x47,0xb1,0x3c}, {0x74,0x4e,0xbc,0x37}, {0x66,0x55,0xab,0x2a}, {0x68,0x5c,0xa6,0x21}, -{0x42,0x63,0x85,0x10}, {0x4c,0x6a,0x88,0x1b}, {0x5e,0x71,0x9f,0x06}, {0x50,0x78,0x92,0x0d}, -{0x0a,0x0f,0xd9,0x64}, {0x04,0x06,0xd4,0x6f}, {0x16,0x1d,0xc3,0x72}, {0x18,0x14,0xce,0x79}, -{0x32,0x2b,0xed,0x48}, {0x3c,0x22,0xe0,0x43}, {0x2e,0x39,0xf7,0x5e}, {0x20,0x30,0xfa,0x55}, -{0xec,0x9a,0xb7,0x01}, {0xe2,0x93,0xba,0x0a}, {0xf0,0x88,0xad,0x17}, {0xfe,0x81,0xa0,0x1c}, -{0xd4,0xbe,0x83,0x2d}, {0xda,0xb7,0x8e,0x26}, {0xc8,0xac,0x99,0x3b}, {0xc6,0xa5,0x94,0x30}, -{0x9c,0xd2,0xdf,0x59}, {0x92,0xdb,0xd2,0x52}, {0x80,0xc0,0xc5,0x4f}, {0x8e,0xc9,0xc8,0x44}, -{0xa4,0xf6,0xeb,0x75}, {0xaa,0xff,0xe6,0x7e}, {0xb8,0xe4,0xf1,0x63}, {0xb6,0xed,0xfc,0x68}, -{0x0c,0x0a,0x67,0xb1}, {0x02,0x03,0x6a,0xba}, {0x10,0x18,0x7d,0xa7}, {0x1e,0x11,0x70,0xac}, -{0x34,0x2e,0x53,0x9d}, 
{0x3a,0x27,0x5e,0x96}, {0x28,0x3c,0x49,0x8b}, {0x26,0x35,0x44,0x80}, -{0x7c,0x42,0x0f,0xe9}, {0x72,0x4b,0x02,0xe2}, {0x60,0x50,0x15,0xff}, {0x6e,0x59,0x18,0xf4}, -{0x44,0x66,0x3b,0xc5}, {0x4a,0x6f,0x36,0xce}, {0x58,0x74,0x21,0xd3}, {0x56,0x7d,0x2c,0xd8}, -{0x37,0xa1,0x0c,0x7a}, {0x39,0xa8,0x01,0x71}, {0x2b,0xb3,0x16,0x6c}, {0x25,0xba,0x1b,0x67}, -{0x0f,0x85,0x38,0x56}, {0x01,0x8c,0x35,0x5d}, {0x13,0x97,0x22,0x40}, {0x1d,0x9e,0x2f,0x4b}, -{0x47,0xe9,0x64,0x22}, {0x49,0xe0,0x69,0x29}, {0x5b,0xfb,0x7e,0x34}, {0x55,0xf2,0x73,0x3f}, -{0x7f,0xcd,0x50,0x0e}, {0x71,0xc4,0x5d,0x05}, {0x63,0xdf,0x4a,0x18}, {0x6d,0xd6,0x47,0x13}, -{0xd7,0x31,0xdc,0xca}, {0xd9,0x38,0xd1,0xc1}, {0xcb,0x23,0xc6,0xdc}, {0xc5,0x2a,0xcb,0xd7}, -{0xef,0x15,0xe8,0xe6}, {0xe1,0x1c,0xe5,0xed}, {0xf3,0x07,0xf2,0xf0}, {0xfd,0x0e,0xff,0xfb}, -{0xa7,0x79,0xb4,0x92}, {0xa9,0x70,0xb9,0x99}, {0xbb,0x6b,0xae,0x84}, {0xb5,0x62,0xa3,0x8f}, -{0x9f,0x5d,0x80,0xbe}, {0x91,0x54,0x8d,0xb5}, {0x83,0x4f,0x9a,0xa8}, {0x8d,0x46,0x97,0xa3} - } -}; -#define U1 xU1.xt8 - -static const union xtab xU2 = { - .xt8 = { -{0x00,0x00,0x00,0x00}, {0x0b,0x0e,0x09,0x0d}, {0x16,0x1c,0x12,0x1a}, {0x1d,0x12,0x1b,0x17}, -{0x2c,0x38,0x24,0x34}, {0x27,0x36,0x2d,0x39}, {0x3a,0x24,0x36,0x2e}, {0x31,0x2a,0x3f,0x23}, -{0x58,0x70,0x48,0x68}, {0x53,0x7e,0x41,0x65}, {0x4e,0x6c,0x5a,0x72}, {0x45,0x62,0x53,0x7f}, -{0x74,0x48,0x6c,0x5c}, {0x7f,0x46,0x65,0x51}, {0x62,0x54,0x7e,0x46}, {0x69,0x5a,0x77,0x4b}, -{0xb0,0xe0,0x90,0xd0}, {0xbb,0xee,0x99,0xdd}, {0xa6,0xfc,0x82,0xca}, {0xad,0xf2,0x8b,0xc7}, -{0x9c,0xd8,0xb4,0xe4}, {0x97,0xd6,0xbd,0xe9}, {0x8a,0xc4,0xa6,0xfe}, {0x81,0xca,0xaf,0xf3}, -{0xe8,0x90,0xd8,0xb8}, {0xe3,0x9e,0xd1,0xb5}, {0xfe,0x8c,0xca,0xa2}, {0xf5,0x82,0xc3,0xaf}, -{0xc4,0xa8,0xfc,0x8c}, {0xcf,0xa6,0xf5,0x81}, {0xd2,0xb4,0xee,0x96}, {0xd9,0xba,0xe7,0x9b}, -{0x7b,0xdb,0x3b,0xbb}, {0x70,0xd5,0x32,0xb6}, {0x6d,0xc7,0x29,0xa1}, {0x66,0xc9,0x20,0xac}, -{0x57,0xe3,0x1f,0x8f}, {0x5c,0xed,0x16,0x82}, {0x41,0xff,0x0d,0x95}, {0x4a,0xf1,0x04,0x98}, -{0x23,0xab,0x73,0xd3}, {0x28,0xa5,0x7a,0xde}, {0x35,0xb7,0x61,0xc9}, {0x3e,0xb9,0x68,0xc4}, -{0x0f,0x93,0x57,0xe7}, {0x04,0x9d,0x5e,0xea}, {0x19,0x8f,0x45,0xfd}, {0x12,0x81,0x4c,0xf0}, -{0xcb,0x3b,0xab,0x6b}, {0xc0,0x35,0xa2,0x66}, {0xdd,0x27,0xb9,0x71}, {0xd6,0x29,0xb0,0x7c}, -{0xe7,0x03,0x8f,0x5f}, {0xec,0x0d,0x86,0x52}, {0xf1,0x1f,0x9d,0x45}, {0xfa,0x11,0x94,0x48}, -{0x93,0x4b,0xe3,0x03}, {0x98,0x45,0xea,0x0e}, {0x85,0x57,0xf1,0x19}, {0x8e,0x59,0xf8,0x14}, -{0xbf,0x73,0xc7,0x37}, {0xb4,0x7d,0xce,0x3a}, {0xa9,0x6f,0xd5,0x2d}, {0xa2,0x61,0xdc,0x20}, -{0xf6,0xad,0x76,0x6d}, {0xfd,0xa3,0x7f,0x60}, {0xe0,0xb1,0x64,0x77}, {0xeb,0xbf,0x6d,0x7a}, -{0xda,0x95,0x52,0x59}, {0xd1,0x9b,0x5b,0x54}, {0xcc,0x89,0x40,0x43}, {0xc7,0x87,0x49,0x4e}, -{0xae,0xdd,0x3e,0x05}, {0xa5,0xd3,0x37,0x08}, {0xb8,0xc1,0x2c,0x1f}, {0xb3,0xcf,0x25,0x12}, -{0x82,0xe5,0x1a,0x31}, {0x89,0xeb,0x13,0x3c}, {0x94,0xf9,0x08,0x2b}, {0x9f,0xf7,0x01,0x26}, -{0x46,0x4d,0xe6,0xbd}, {0x4d,0x43,0xef,0xb0}, {0x50,0x51,0xf4,0xa7}, {0x5b,0x5f,0xfd,0xaa}, -{0x6a,0x75,0xc2,0x89}, {0x61,0x7b,0xcb,0x84}, {0x7c,0x69,0xd0,0x93}, {0x77,0x67,0xd9,0x9e}, -{0x1e,0x3d,0xae,0xd5}, {0x15,0x33,0xa7,0xd8}, {0x08,0x21,0xbc,0xcf}, {0x03,0x2f,0xb5,0xc2}, -{0x32,0x05,0x8a,0xe1}, {0x39,0x0b,0x83,0xec}, {0x24,0x19,0x98,0xfb}, {0x2f,0x17,0x91,0xf6}, -{0x8d,0x76,0x4d,0xd6}, {0x86,0x78,0x44,0xdb}, {0x9b,0x6a,0x5f,0xcc}, {0x90,0x64,0x56,0xc1}, -{0xa1,0x4e,0x69,0xe2}, {0xaa,0x40,0x60,0xef}, {0xb7,0x52,0x7b,0xf8}, {0xbc,0x5c,0x72,0xf5}, -{0xd5,0x06,0x05,0xbe}, {0xde,0x08,0x0c,0xb3}, 
{0xc3,0x1a,0x17,0xa4}, {0xc8,0x14,0x1e,0xa9}, -{0xf9,0x3e,0x21,0x8a}, {0xf2,0x30,0x28,0x87}, {0xef,0x22,0x33,0x90}, {0xe4,0x2c,0x3a,0x9d}, -{0x3d,0x96,0xdd,0x06}, {0x36,0x98,0xd4,0x0b}, {0x2b,0x8a,0xcf,0x1c}, {0x20,0x84,0xc6,0x11}, -{0x11,0xae,0xf9,0x32}, {0x1a,0xa0,0xf0,0x3f}, {0x07,0xb2,0xeb,0x28}, {0x0c,0xbc,0xe2,0x25}, -{0x65,0xe6,0x95,0x6e}, {0x6e,0xe8,0x9c,0x63}, {0x73,0xfa,0x87,0x74}, {0x78,0xf4,0x8e,0x79}, -{0x49,0xde,0xb1,0x5a}, {0x42,0xd0,0xb8,0x57}, {0x5f,0xc2,0xa3,0x40}, {0x54,0xcc,0xaa,0x4d}, -{0xf7,0x41,0xec,0xda}, {0xfc,0x4f,0xe5,0xd7}, {0xe1,0x5d,0xfe,0xc0}, {0xea,0x53,0xf7,0xcd}, -{0xdb,0x79,0xc8,0xee}, {0xd0,0x77,0xc1,0xe3}, {0xcd,0x65,0xda,0xf4}, {0xc6,0x6b,0xd3,0xf9}, -{0xaf,0x31,0xa4,0xb2}, {0xa4,0x3f,0xad,0xbf}, {0xb9,0x2d,0xb6,0xa8}, {0xb2,0x23,0xbf,0xa5}, -{0x83,0x09,0x80,0x86}, {0x88,0x07,0x89,0x8b}, {0x95,0x15,0x92,0x9c}, {0x9e,0x1b,0x9b,0x91}, -{0x47,0xa1,0x7c,0x0a}, {0x4c,0xaf,0x75,0x07}, {0x51,0xbd,0x6e,0x10}, {0x5a,0xb3,0x67,0x1d}, -{0x6b,0x99,0x58,0x3e}, {0x60,0x97,0x51,0x33}, {0x7d,0x85,0x4a,0x24}, {0x76,0x8b,0x43,0x29}, -{0x1f,0xd1,0x34,0x62}, {0x14,0xdf,0x3d,0x6f}, {0x09,0xcd,0x26,0x78}, {0x02,0xc3,0x2f,0x75}, -{0x33,0xe9,0x10,0x56}, {0x38,0xe7,0x19,0x5b}, {0x25,0xf5,0x02,0x4c}, {0x2e,0xfb,0x0b,0x41}, -{0x8c,0x9a,0xd7,0x61}, {0x87,0x94,0xde,0x6c}, {0x9a,0x86,0xc5,0x7b}, {0x91,0x88,0xcc,0x76}, -{0xa0,0xa2,0xf3,0x55}, {0xab,0xac,0xfa,0x58}, {0xb6,0xbe,0xe1,0x4f}, {0xbd,0xb0,0xe8,0x42}, -{0xd4,0xea,0x9f,0x09}, {0xdf,0xe4,0x96,0x04}, {0xc2,0xf6,0x8d,0x13}, {0xc9,0xf8,0x84,0x1e}, -{0xf8,0xd2,0xbb,0x3d}, {0xf3,0xdc,0xb2,0x30}, {0xee,0xce,0xa9,0x27}, {0xe5,0xc0,0xa0,0x2a}, -{0x3c,0x7a,0x47,0xb1}, {0x37,0x74,0x4e,0xbc}, {0x2a,0x66,0x55,0xab}, {0x21,0x68,0x5c,0xa6}, -{0x10,0x42,0x63,0x85}, {0x1b,0x4c,0x6a,0x88}, {0x06,0x5e,0x71,0x9f}, {0x0d,0x50,0x78,0x92}, -{0x64,0x0a,0x0f,0xd9}, {0x6f,0x04,0x06,0xd4}, {0x72,0x16,0x1d,0xc3}, {0x79,0x18,0x14,0xce}, -{0x48,0x32,0x2b,0xed}, {0x43,0x3c,0x22,0xe0}, {0x5e,0x2e,0x39,0xf7}, {0x55,0x20,0x30,0xfa}, -{0x01,0xec,0x9a,0xb7}, {0x0a,0xe2,0x93,0xba}, {0x17,0xf0,0x88,0xad}, {0x1c,0xfe,0x81,0xa0}, -{0x2d,0xd4,0xbe,0x83}, {0x26,0xda,0xb7,0x8e}, {0x3b,0xc8,0xac,0x99}, {0x30,0xc6,0xa5,0x94}, -{0x59,0x9c,0xd2,0xdf}, {0x52,0x92,0xdb,0xd2}, {0x4f,0x80,0xc0,0xc5}, {0x44,0x8e,0xc9,0xc8}, -{0x75,0xa4,0xf6,0xeb}, {0x7e,0xaa,0xff,0xe6}, {0x63,0xb8,0xe4,0xf1}, {0x68,0xb6,0xed,0xfc}, -{0xb1,0x0c,0x0a,0x67}, {0xba,0x02,0x03,0x6a}, {0xa7,0x10,0x18,0x7d}, {0xac,0x1e,0x11,0x70}, -{0x9d,0x34,0x2e,0x53}, {0x96,0x3a,0x27,0x5e}, {0x8b,0x28,0x3c,0x49}, {0x80,0x26,0x35,0x44}, -{0xe9,0x7c,0x42,0x0f}, {0xe2,0x72,0x4b,0x02}, {0xff,0x60,0x50,0x15}, {0xf4,0x6e,0x59,0x18}, -{0xc5,0x44,0x66,0x3b}, {0xce,0x4a,0x6f,0x36}, {0xd3,0x58,0x74,0x21}, {0xd8,0x56,0x7d,0x2c}, -{0x7a,0x37,0xa1,0x0c}, {0x71,0x39,0xa8,0x01}, {0x6c,0x2b,0xb3,0x16}, {0x67,0x25,0xba,0x1b}, -{0x56,0x0f,0x85,0x38}, {0x5d,0x01,0x8c,0x35}, {0x40,0x13,0x97,0x22}, {0x4b,0x1d,0x9e,0x2f}, -{0x22,0x47,0xe9,0x64}, {0x29,0x49,0xe0,0x69}, {0x34,0x5b,0xfb,0x7e}, {0x3f,0x55,0xf2,0x73}, -{0x0e,0x7f,0xcd,0x50}, {0x05,0x71,0xc4,0x5d}, {0x18,0x63,0xdf,0x4a}, {0x13,0x6d,0xd6,0x47}, -{0xca,0xd7,0x31,0xdc}, {0xc1,0xd9,0x38,0xd1}, {0xdc,0xcb,0x23,0xc6}, {0xd7,0xc5,0x2a,0xcb}, -{0xe6,0xef,0x15,0xe8}, {0xed,0xe1,0x1c,0xe5}, {0xf0,0xf3,0x07,0xf2}, {0xfb,0xfd,0x0e,0xff}, -{0x92,0xa7,0x79,0xb4}, {0x99,0xa9,0x70,0xb9}, {0x84,0xbb,0x6b,0xae}, {0x8f,0xb5,0x62,0xa3}, -{0xbe,0x9f,0x5d,0x80}, {0xb5,0x91,0x54,0x8d}, {0xa8,0x83,0x4f,0x9a}, {0xa3,0x8d,0x46,0x97} - } -}; -#define U2 xU2.xt8 - -static const union xtab xU3 = { - 
.xt8 = { -{0x00,0x00,0x00,0x00}, {0x0d,0x0b,0x0e,0x09}, {0x1a,0x16,0x1c,0x12}, {0x17,0x1d,0x12,0x1b}, -{0x34,0x2c,0x38,0x24}, {0x39,0x27,0x36,0x2d}, {0x2e,0x3a,0x24,0x36}, {0x23,0x31,0x2a,0x3f}, -{0x68,0x58,0x70,0x48}, {0x65,0x53,0x7e,0x41}, {0x72,0x4e,0x6c,0x5a}, {0x7f,0x45,0x62,0x53}, -{0x5c,0x74,0x48,0x6c}, {0x51,0x7f,0x46,0x65}, {0x46,0x62,0x54,0x7e}, {0x4b,0x69,0x5a,0x77}, -{0xd0,0xb0,0xe0,0x90}, {0xdd,0xbb,0xee,0x99}, {0xca,0xa6,0xfc,0x82}, {0xc7,0xad,0xf2,0x8b}, -{0xe4,0x9c,0xd8,0xb4}, {0xe9,0x97,0xd6,0xbd}, {0xfe,0x8a,0xc4,0xa6}, {0xf3,0x81,0xca,0xaf}, -{0xb8,0xe8,0x90,0xd8}, {0xb5,0xe3,0x9e,0xd1}, {0xa2,0xfe,0x8c,0xca}, {0xaf,0xf5,0x82,0xc3}, -{0x8c,0xc4,0xa8,0xfc}, {0x81,0xcf,0xa6,0xf5}, {0x96,0xd2,0xb4,0xee}, {0x9b,0xd9,0xba,0xe7}, -{0xbb,0x7b,0xdb,0x3b}, {0xb6,0x70,0xd5,0x32}, {0xa1,0x6d,0xc7,0x29}, {0xac,0x66,0xc9,0x20}, -{0x8f,0x57,0xe3,0x1f}, {0x82,0x5c,0xed,0x16}, {0x95,0x41,0xff,0x0d}, {0x98,0x4a,0xf1,0x04}, -{0xd3,0x23,0xab,0x73}, {0xde,0x28,0xa5,0x7a}, {0xc9,0x35,0xb7,0x61}, {0xc4,0x3e,0xb9,0x68}, -{0xe7,0x0f,0x93,0x57}, {0xea,0x04,0x9d,0x5e}, {0xfd,0x19,0x8f,0x45}, {0xf0,0x12,0x81,0x4c}, -{0x6b,0xcb,0x3b,0xab}, {0x66,0xc0,0x35,0xa2}, {0x71,0xdd,0x27,0xb9}, {0x7c,0xd6,0x29,0xb0}, -{0x5f,0xe7,0x03,0x8f}, {0x52,0xec,0x0d,0x86}, {0x45,0xf1,0x1f,0x9d}, {0x48,0xfa,0x11,0x94}, -{0x03,0x93,0x4b,0xe3}, {0x0e,0x98,0x45,0xea}, {0x19,0x85,0x57,0xf1}, {0x14,0x8e,0x59,0xf8}, -{0x37,0xbf,0x73,0xc7}, {0x3a,0xb4,0x7d,0xce}, {0x2d,0xa9,0x6f,0xd5}, {0x20,0xa2,0x61,0xdc}, -{0x6d,0xf6,0xad,0x76}, {0x60,0xfd,0xa3,0x7f}, {0x77,0xe0,0xb1,0x64}, {0x7a,0xeb,0xbf,0x6d}, -{0x59,0xda,0x95,0x52}, {0x54,0xd1,0x9b,0x5b}, {0x43,0xcc,0x89,0x40}, {0x4e,0xc7,0x87,0x49}, -{0x05,0xae,0xdd,0x3e}, {0x08,0xa5,0xd3,0x37}, {0x1f,0xb8,0xc1,0x2c}, {0x12,0xb3,0xcf,0x25}, -{0x31,0x82,0xe5,0x1a}, {0x3c,0x89,0xeb,0x13}, {0x2b,0x94,0xf9,0x08}, {0x26,0x9f,0xf7,0x01}, -{0xbd,0x46,0x4d,0xe6}, {0xb0,0x4d,0x43,0xef}, {0xa7,0x50,0x51,0xf4}, {0xaa,0x5b,0x5f,0xfd}, -{0x89,0x6a,0x75,0xc2}, {0x84,0x61,0x7b,0xcb}, {0x93,0x7c,0x69,0xd0}, {0x9e,0x77,0x67,0xd9}, -{0xd5,0x1e,0x3d,0xae}, {0xd8,0x15,0x33,0xa7}, {0xcf,0x08,0x21,0xbc}, {0xc2,0x03,0x2f,0xb5}, -{0xe1,0x32,0x05,0x8a}, {0xec,0x39,0x0b,0x83}, {0xfb,0x24,0x19,0x98}, {0xf6,0x2f,0x17,0x91}, -{0xd6,0x8d,0x76,0x4d}, {0xdb,0x86,0x78,0x44}, {0xcc,0x9b,0x6a,0x5f}, {0xc1,0x90,0x64,0x56}, -{0xe2,0xa1,0x4e,0x69}, {0xef,0xaa,0x40,0x60}, {0xf8,0xb7,0x52,0x7b}, {0xf5,0xbc,0x5c,0x72}, -{0xbe,0xd5,0x06,0x05}, {0xb3,0xde,0x08,0x0c}, {0xa4,0xc3,0x1a,0x17}, {0xa9,0xc8,0x14,0x1e}, -{0x8a,0xf9,0x3e,0x21}, {0x87,0xf2,0x30,0x28}, {0x90,0xef,0x22,0x33}, {0x9d,0xe4,0x2c,0x3a}, -{0x06,0x3d,0x96,0xdd}, {0x0b,0x36,0x98,0xd4}, {0x1c,0x2b,0x8a,0xcf}, {0x11,0x20,0x84,0xc6}, -{0x32,0x11,0xae,0xf9}, {0x3f,0x1a,0xa0,0xf0}, {0x28,0x07,0xb2,0xeb}, {0x25,0x0c,0xbc,0xe2}, -{0x6e,0x65,0xe6,0x95}, {0x63,0x6e,0xe8,0x9c}, {0x74,0x73,0xfa,0x87}, {0x79,0x78,0xf4,0x8e}, -{0x5a,0x49,0xde,0xb1}, {0x57,0x42,0xd0,0xb8}, {0x40,0x5f,0xc2,0xa3}, {0x4d,0x54,0xcc,0xaa}, -{0xda,0xf7,0x41,0xec}, {0xd7,0xfc,0x4f,0xe5}, {0xc0,0xe1,0x5d,0xfe}, {0xcd,0xea,0x53,0xf7}, -{0xee,0xdb,0x79,0xc8}, {0xe3,0xd0,0x77,0xc1}, {0xf4,0xcd,0x65,0xda}, {0xf9,0xc6,0x6b,0xd3}, -{0xb2,0xaf,0x31,0xa4}, {0xbf,0xa4,0x3f,0xad}, {0xa8,0xb9,0x2d,0xb6}, {0xa5,0xb2,0x23,0xbf}, -{0x86,0x83,0x09,0x80}, {0x8b,0x88,0x07,0x89}, {0x9c,0x95,0x15,0x92}, {0x91,0x9e,0x1b,0x9b}, -{0x0a,0x47,0xa1,0x7c}, {0x07,0x4c,0xaf,0x75}, {0x10,0x51,0xbd,0x6e}, {0x1d,0x5a,0xb3,0x67}, -{0x3e,0x6b,0x99,0x58}, {0x33,0x60,0x97,0x51}, {0x24,0x7d,0x85,0x4a}, {0x29,0x76,0x8b,0x43}, 
-{0x62,0x1f,0xd1,0x34}, {0x6f,0x14,0xdf,0x3d}, {0x78,0x09,0xcd,0x26}, {0x75,0x02,0xc3,0x2f}, -{0x56,0x33,0xe9,0x10}, {0x5b,0x38,0xe7,0x19}, {0x4c,0x25,0xf5,0x02}, {0x41,0x2e,0xfb,0x0b}, -{0x61,0x8c,0x9a,0xd7}, {0x6c,0x87,0x94,0xde}, {0x7b,0x9a,0x86,0xc5}, {0x76,0x91,0x88,0xcc}, -{0x55,0xa0,0xa2,0xf3}, {0x58,0xab,0xac,0xfa}, {0x4f,0xb6,0xbe,0xe1}, {0x42,0xbd,0xb0,0xe8}, -{0x09,0xd4,0xea,0x9f}, {0x04,0xdf,0xe4,0x96}, {0x13,0xc2,0xf6,0x8d}, {0x1e,0xc9,0xf8,0x84}, -{0x3d,0xf8,0xd2,0xbb}, {0x30,0xf3,0xdc,0xb2}, {0x27,0xee,0xce,0xa9}, {0x2a,0xe5,0xc0,0xa0}, -{0xb1,0x3c,0x7a,0x47}, {0xbc,0x37,0x74,0x4e}, {0xab,0x2a,0x66,0x55}, {0xa6,0x21,0x68,0x5c}, -{0x85,0x10,0x42,0x63}, {0x88,0x1b,0x4c,0x6a}, {0x9f,0x06,0x5e,0x71}, {0x92,0x0d,0x50,0x78}, -{0xd9,0x64,0x0a,0x0f}, {0xd4,0x6f,0x04,0x06}, {0xc3,0x72,0x16,0x1d}, {0xce,0x79,0x18,0x14}, -{0xed,0x48,0x32,0x2b}, {0xe0,0x43,0x3c,0x22}, {0xf7,0x5e,0x2e,0x39}, {0xfa,0x55,0x20,0x30}, -{0xb7,0x01,0xec,0x9a}, {0xba,0x0a,0xe2,0x93}, {0xad,0x17,0xf0,0x88}, {0xa0,0x1c,0xfe,0x81}, -{0x83,0x2d,0xd4,0xbe}, {0x8e,0x26,0xda,0xb7}, {0x99,0x3b,0xc8,0xac}, {0x94,0x30,0xc6,0xa5}, -{0xdf,0x59,0x9c,0xd2}, {0xd2,0x52,0x92,0xdb}, {0xc5,0x4f,0x80,0xc0}, {0xc8,0x44,0x8e,0xc9}, -{0xeb,0x75,0xa4,0xf6}, {0xe6,0x7e,0xaa,0xff}, {0xf1,0x63,0xb8,0xe4}, {0xfc,0x68,0xb6,0xed}, -{0x67,0xb1,0x0c,0x0a}, {0x6a,0xba,0x02,0x03}, {0x7d,0xa7,0x10,0x18}, {0x70,0xac,0x1e,0x11}, -{0x53,0x9d,0x34,0x2e}, {0x5e,0x96,0x3a,0x27}, {0x49,0x8b,0x28,0x3c}, {0x44,0x80,0x26,0x35}, -{0x0f,0xe9,0x7c,0x42}, {0x02,0xe2,0x72,0x4b}, {0x15,0xff,0x60,0x50}, {0x18,0xf4,0x6e,0x59}, -{0x3b,0xc5,0x44,0x66}, {0x36,0xce,0x4a,0x6f}, {0x21,0xd3,0x58,0x74}, {0x2c,0xd8,0x56,0x7d}, -{0x0c,0x7a,0x37,0xa1}, {0x01,0x71,0x39,0xa8}, {0x16,0x6c,0x2b,0xb3}, {0x1b,0x67,0x25,0xba}, -{0x38,0x56,0x0f,0x85}, {0x35,0x5d,0x01,0x8c}, {0x22,0x40,0x13,0x97}, {0x2f,0x4b,0x1d,0x9e}, -{0x64,0x22,0x47,0xe9}, {0x69,0x29,0x49,0xe0}, {0x7e,0x34,0x5b,0xfb}, {0x73,0x3f,0x55,0xf2}, -{0x50,0x0e,0x7f,0xcd}, {0x5d,0x05,0x71,0xc4}, {0x4a,0x18,0x63,0xdf}, {0x47,0x13,0x6d,0xd6}, -{0xdc,0xca,0xd7,0x31}, {0xd1,0xc1,0xd9,0x38}, {0xc6,0xdc,0xcb,0x23}, {0xcb,0xd7,0xc5,0x2a}, -{0xe8,0xe6,0xef,0x15}, {0xe5,0xed,0xe1,0x1c}, {0xf2,0xf0,0xf3,0x07}, {0xff,0xfb,0xfd,0x0e}, -{0xb4,0x92,0xa7,0x79}, {0xb9,0x99,0xa9,0x70}, {0xae,0x84,0xbb,0x6b}, {0xa3,0x8f,0xb5,0x62}, -{0x80,0xbe,0x9f,0x5d}, {0x8d,0xb5,0x91,0x54}, {0x9a,0xa8,0x83,0x4f}, {0x97,0xa3,0x8d,0x46} - } -}; -#define U3 xU3.xt8 - -static const union xtab xU4 = { - .xt8 = { -{0x00,0x00,0x00,0x00}, {0x09,0x0d,0x0b,0x0e}, {0x12,0x1a,0x16,0x1c}, {0x1b,0x17,0x1d,0x12}, -{0x24,0x34,0x2c,0x38}, {0x2d,0x39,0x27,0x36}, {0x36,0x2e,0x3a,0x24}, {0x3f,0x23,0x31,0x2a}, -{0x48,0x68,0x58,0x70}, {0x41,0x65,0x53,0x7e}, {0x5a,0x72,0x4e,0x6c}, {0x53,0x7f,0x45,0x62}, -{0x6c,0x5c,0x74,0x48}, {0x65,0x51,0x7f,0x46}, {0x7e,0x46,0x62,0x54}, {0x77,0x4b,0x69,0x5a}, -{0x90,0xd0,0xb0,0xe0}, {0x99,0xdd,0xbb,0xee}, {0x82,0xca,0xa6,0xfc}, {0x8b,0xc7,0xad,0xf2}, -{0xb4,0xe4,0x9c,0xd8}, {0xbd,0xe9,0x97,0xd6}, {0xa6,0xfe,0x8a,0xc4}, {0xaf,0xf3,0x81,0xca}, -{0xd8,0xb8,0xe8,0x90}, {0xd1,0xb5,0xe3,0x9e}, {0xca,0xa2,0xfe,0x8c}, {0xc3,0xaf,0xf5,0x82}, -{0xfc,0x8c,0xc4,0xa8}, {0xf5,0x81,0xcf,0xa6}, {0xee,0x96,0xd2,0xb4}, {0xe7,0x9b,0xd9,0xba}, -{0x3b,0xbb,0x7b,0xdb}, {0x32,0xb6,0x70,0xd5}, {0x29,0xa1,0x6d,0xc7}, {0x20,0xac,0x66,0xc9}, -{0x1f,0x8f,0x57,0xe3}, {0x16,0x82,0x5c,0xed}, {0x0d,0x95,0x41,0xff}, {0x04,0x98,0x4a,0xf1}, -{0x73,0xd3,0x23,0xab}, {0x7a,0xde,0x28,0xa5}, {0x61,0xc9,0x35,0xb7}, {0x68,0xc4,0x3e,0xb9}, -{0x57,0xe7,0x0f,0x93}, 
{0x5e,0xea,0x04,0x9d}, {0x45,0xfd,0x19,0x8f}, {0x4c,0xf0,0x12,0x81}, -{0xab,0x6b,0xcb,0x3b}, {0xa2,0x66,0xc0,0x35}, {0xb9,0x71,0xdd,0x27}, {0xb0,0x7c,0xd6,0x29}, -{0x8f,0x5f,0xe7,0x03}, {0x86,0x52,0xec,0x0d}, {0x9d,0x45,0xf1,0x1f}, {0x94,0x48,0xfa,0x11}, -{0xe3,0x03,0x93,0x4b}, {0xea,0x0e,0x98,0x45}, {0xf1,0x19,0x85,0x57}, {0xf8,0x14,0x8e,0x59}, -{0xc7,0x37,0xbf,0x73}, {0xce,0x3a,0xb4,0x7d}, {0xd5,0x2d,0xa9,0x6f}, {0xdc,0x20,0xa2,0x61}, -{0x76,0x6d,0xf6,0xad}, {0x7f,0x60,0xfd,0xa3}, {0x64,0x77,0xe0,0xb1}, {0x6d,0x7a,0xeb,0xbf}, -{0x52,0x59,0xda,0x95}, {0x5b,0x54,0xd1,0x9b}, {0x40,0x43,0xcc,0x89}, {0x49,0x4e,0xc7,0x87}, -{0x3e,0x05,0xae,0xdd}, {0x37,0x08,0xa5,0xd3}, {0x2c,0x1f,0xb8,0xc1}, {0x25,0x12,0xb3,0xcf}, -{0x1a,0x31,0x82,0xe5}, {0x13,0x3c,0x89,0xeb}, {0x08,0x2b,0x94,0xf9}, {0x01,0x26,0x9f,0xf7}, -{0xe6,0xbd,0x46,0x4d}, {0xef,0xb0,0x4d,0x43}, {0xf4,0xa7,0x50,0x51}, {0xfd,0xaa,0x5b,0x5f}, -{0xc2,0x89,0x6a,0x75}, {0xcb,0x84,0x61,0x7b}, {0xd0,0x93,0x7c,0x69}, {0xd9,0x9e,0x77,0x67}, -{0xae,0xd5,0x1e,0x3d}, {0xa7,0xd8,0x15,0x33}, {0xbc,0xcf,0x08,0x21}, {0xb5,0xc2,0x03,0x2f}, -{0x8a,0xe1,0x32,0x05}, {0x83,0xec,0x39,0x0b}, {0x98,0xfb,0x24,0x19}, {0x91,0xf6,0x2f,0x17}, -{0x4d,0xd6,0x8d,0x76}, {0x44,0xdb,0x86,0x78}, {0x5f,0xcc,0x9b,0x6a}, {0x56,0xc1,0x90,0x64}, -{0x69,0xe2,0xa1,0x4e}, {0x60,0xef,0xaa,0x40}, {0x7b,0xf8,0xb7,0x52}, {0x72,0xf5,0xbc,0x5c}, -{0x05,0xbe,0xd5,0x06}, {0x0c,0xb3,0xde,0x08}, {0x17,0xa4,0xc3,0x1a}, {0x1e,0xa9,0xc8,0x14}, -{0x21,0x8a,0xf9,0x3e}, {0x28,0x87,0xf2,0x30}, {0x33,0x90,0xef,0x22}, {0x3a,0x9d,0xe4,0x2c}, -{0xdd,0x06,0x3d,0x96}, {0xd4,0x0b,0x36,0x98}, {0xcf,0x1c,0x2b,0x8a}, {0xc6,0x11,0x20,0x84}, -{0xf9,0x32,0x11,0xae}, {0xf0,0x3f,0x1a,0xa0}, {0xeb,0x28,0x07,0xb2}, {0xe2,0x25,0x0c,0xbc}, -{0x95,0x6e,0x65,0xe6}, {0x9c,0x63,0x6e,0xe8}, {0x87,0x74,0x73,0xfa}, {0x8e,0x79,0x78,0xf4}, -{0xb1,0x5a,0x49,0xde}, {0xb8,0x57,0x42,0xd0}, {0xa3,0x40,0x5f,0xc2}, {0xaa,0x4d,0x54,0xcc}, -{0xec,0xda,0xf7,0x41}, {0xe5,0xd7,0xfc,0x4f}, {0xfe,0xc0,0xe1,0x5d}, {0xf7,0xcd,0xea,0x53}, -{0xc8,0xee,0xdb,0x79}, {0xc1,0xe3,0xd0,0x77}, {0xda,0xf4,0xcd,0x65}, {0xd3,0xf9,0xc6,0x6b}, -{0xa4,0xb2,0xaf,0x31}, {0xad,0xbf,0xa4,0x3f}, {0xb6,0xa8,0xb9,0x2d}, {0xbf,0xa5,0xb2,0x23}, -{0x80,0x86,0x83,0x09}, {0x89,0x8b,0x88,0x07}, {0x92,0x9c,0x95,0x15}, {0x9b,0x91,0x9e,0x1b}, -{0x7c,0x0a,0x47,0xa1}, {0x75,0x07,0x4c,0xaf}, {0x6e,0x10,0x51,0xbd}, {0x67,0x1d,0x5a,0xb3}, -{0x58,0x3e,0x6b,0x99}, {0x51,0x33,0x60,0x97}, {0x4a,0x24,0x7d,0x85}, {0x43,0x29,0x76,0x8b}, -{0x34,0x62,0x1f,0xd1}, {0x3d,0x6f,0x14,0xdf}, {0x26,0x78,0x09,0xcd}, {0x2f,0x75,0x02,0xc3}, -{0x10,0x56,0x33,0xe9}, {0x19,0x5b,0x38,0xe7}, {0x02,0x4c,0x25,0xf5}, {0x0b,0x41,0x2e,0xfb}, -{0xd7,0x61,0x8c,0x9a}, {0xde,0x6c,0x87,0x94}, {0xc5,0x7b,0x9a,0x86}, {0xcc,0x76,0x91,0x88}, -{0xf3,0x55,0xa0,0xa2}, {0xfa,0x58,0xab,0xac}, {0xe1,0x4f,0xb6,0xbe}, {0xe8,0x42,0xbd,0xb0}, -{0x9f,0x09,0xd4,0xea}, {0x96,0x04,0xdf,0xe4}, {0x8d,0x13,0xc2,0xf6}, {0x84,0x1e,0xc9,0xf8}, -{0xbb,0x3d,0xf8,0xd2}, {0xb2,0x30,0xf3,0xdc}, {0xa9,0x27,0xee,0xce}, {0xa0,0x2a,0xe5,0xc0}, -{0x47,0xb1,0x3c,0x7a}, {0x4e,0xbc,0x37,0x74}, {0x55,0xab,0x2a,0x66}, {0x5c,0xa6,0x21,0x68}, -{0x63,0x85,0x10,0x42}, {0x6a,0x88,0x1b,0x4c}, {0x71,0x9f,0x06,0x5e}, {0x78,0x92,0x0d,0x50}, -{0x0f,0xd9,0x64,0x0a}, {0x06,0xd4,0x6f,0x04}, {0x1d,0xc3,0x72,0x16}, {0x14,0xce,0x79,0x18}, -{0x2b,0xed,0x48,0x32}, {0x22,0xe0,0x43,0x3c}, {0x39,0xf7,0x5e,0x2e}, {0x30,0xfa,0x55,0x20}, -{0x9a,0xb7,0x01,0xec}, {0x93,0xba,0x0a,0xe2}, {0x88,0xad,0x17,0xf0}, {0x81,0xa0,0x1c,0xfe}, -{0xbe,0x83,0x2d,0xd4}, 
{0xb7,0x8e,0x26,0xda}, {0xac,0x99,0x3b,0xc8}, {0xa5,0x94,0x30,0xc6}, -{0xd2,0xdf,0x59,0x9c}, {0xdb,0xd2,0x52,0x92}, {0xc0,0xc5,0x4f,0x80}, {0xc9,0xc8,0x44,0x8e}, -{0xf6,0xeb,0x75,0xa4}, {0xff,0xe6,0x7e,0xaa}, {0xe4,0xf1,0x63,0xb8}, {0xed,0xfc,0x68,0xb6}, -{0x0a,0x67,0xb1,0x0c}, {0x03,0x6a,0xba,0x02}, {0x18,0x7d,0xa7,0x10}, {0x11,0x70,0xac,0x1e}, -{0x2e,0x53,0x9d,0x34}, {0x27,0x5e,0x96,0x3a}, {0x3c,0x49,0x8b,0x28}, {0x35,0x44,0x80,0x26}, -{0x42,0x0f,0xe9,0x7c}, {0x4b,0x02,0xe2,0x72}, {0x50,0x15,0xff,0x60}, {0x59,0x18,0xf4,0x6e}, -{0x66,0x3b,0xc5,0x44}, {0x6f,0x36,0xce,0x4a}, {0x74,0x21,0xd3,0x58}, {0x7d,0x2c,0xd8,0x56}, -{0xa1,0x0c,0x7a,0x37}, {0xa8,0x01,0x71,0x39}, {0xb3,0x16,0x6c,0x2b}, {0xba,0x1b,0x67,0x25}, -{0x85,0x38,0x56,0x0f}, {0x8c,0x35,0x5d,0x01}, {0x97,0x22,0x40,0x13}, {0x9e,0x2f,0x4b,0x1d}, -{0xe9,0x64,0x22,0x47}, {0xe0,0x69,0x29,0x49}, {0xfb,0x7e,0x34,0x5b}, {0xf2,0x73,0x3f,0x55}, -{0xcd,0x50,0x0e,0x7f}, {0xc4,0x5d,0x05,0x71}, {0xdf,0x4a,0x18,0x63}, {0xd6,0x47,0x13,0x6d}, -{0x31,0xdc,0xca,0xd7}, {0x38,0xd1,0xc1,0xd9}, {0x23,0xc6,0xdc,0xcb}, {0x2a,0xcb,0xd7,0xc5}, -{0x15,0xe8,0xe6,0xef}, {0x1c,0xe5,0xed,0xe1}, {0x07,0xf2,0xf0,0xf3}, {0x0e,0xff,0xfb,0xfd}, -{0x79,0xb4,0x92,0xa7}, {0x70,0xb9,0x99,0xa9}, {0x6b,0xae,0x84,0xbb}, {0x62,0xa3,0x8f,0xb5}, -{0x5d,0x80,0xbe,0x9f}, {0x54,0x8d,0xb5,0x91}, {0x4f,0x9a,0xa8,0x83}, {0x46,0x97,0xa3,0x8d} - } -}; -#define U4 xU4.xt8 - -static const word32 rcon[30] = { - 0x01,0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91 -}; diff --git a/bsd/crypto/rijndael/rijndael-alg-fst.c b/bsd/crypto/rijndael/rijndael-alg-fst.c deleted file mode 100644 index 5cd4857e4..000000000 --- a/bsd/crypto/rijndael/rijndael-alg-fst.c +++ /dev/null @@ -1,488 +0,0 @@ -/* $FreeBSD: src/sys/crypto/rijndael/rijndael-alg-fst.c,v 1.3.2.1 2001/07/03 11:01:35 ume Exp $ */ -/* $KAME: rijndael-alg-fst.c,v 1.7 2001/05/27 00:23:23 itojun Exp $ */ - -/* - * rijndael-alg-fst.c v2.3 April '2000 - * - * Optimised ANSI C code - * - * authors: v1.0: Antoon Bosselaers - * v2.0: Vincent Rijmen - * v2.3: Paulo Barreto - * - * This code is placed in the public domain. 
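
Stepping back to the rcon table that closes the deleted boxes-fst.dat above: the 30 round constants are successive doublings of 1 in GF(2^8), i.e. rcon[i] = x^i modulo the AES polynomial 0x11b; 30 entries is more than any supported key length consumes (a 128-bit key schedule uses only the first 10). A sketch that regenerates them (make_rcon is a hypothetical helper):

    #include <stdint.h>

    /* rcon[i] = x^i in GF(2^8): start at 1 and keep doubling, folding in
     * 0x1b whenever the high bit falls off (reduction by 0x11b). */
    static void make_rcon(uint8_t rcon[30])
    {
        uint8_t v = 1;
        int i;

        for (i = 0; i < 30; i++) {
            rcon[i] = v;
            v = (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1b : 0));
        }
    }
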
- */ - -#include <sys/cdefs.h> -#include <sys/types.h> -#ifdef KERNEL -#include <sys/systm.h> -#else -#include <string.h> -#endif -#include <crypto/rijndael/rijndael-alg-fst.h> -#include <crypto/rijndael/rijndael_local.h> - -#include <crypto/rijndael/boxes-fst.dat> - -int rijndaelKeySched(word8 k[MAXKC][4], word8 W[MAXROUNDS+1][4][4], int ROUNDS) { - /* Calculate the necessary round keys - * The number of calculations depends on keyBits and blockBits - */ - int j, r, t, rconpointer = 0; - union { - word8 x8[MAXKC][4]; - word32 x32[MAXKC]; - } xtk; -#define tk xtk.x8 - int KC = ROUNDS - 6; - - for (j = KC-1; j >= 0; j--) { - *((word32*)tk[j]) = *((word32*)k[j]); - } - r = 0; - t = 0; - /* copy values into round key array */ - for (j = 0; (j < KC) && (r < ROUNDS + 1); ) { - for (; (j < KC) && (t < 4); j++, t++) { - *((word32*)W[r][t]) = *((word32*)tk[j]); - } - if (t == 4) { - r++; - t = 0; - } - } - - while (r < ROUNDS + 1) { /* while not enough round key material calculated */ - /* calculate new values */ - tk[0][0] ^= S[tk[KC-1][1]]; - tk[0][1] ^= S[tk[KC-1][2]]; - tk[0][2] ^= S[tk[KC-1][3]]; - tk[0][3] ^= S[tk[KC-1][0]]; - tk[0][0] ^= rcon[rconpointer++]; - - if (KC != 8) { - for (j = 1; j < KC; j++) { - *((word32*)tk[j]) ^= *((word32*)tk[j-1]); - } - } else { - for (j = 1; j < KC/2; j++) { - *((word32*)tk[j]) ^= *((word32*)tk[j-1]); - } - tk[KC/2][0] ^= S[tk[KC/2 - 1][0]]; - tk[KC/2][1] ^= S[tk[KC/2 - 1][1]]; - tk[KC/2][2] ^= S[tk[KC/2 - 1][2]]; - tk[KC/2][3] ^= S[tk[KC/2 - 1][3]]; - for (j = KC/2 + 1; j < KC; j++) { - *((word32*)tk[j]) ^= *((word32*)tk[j-1]); - } - } - /* copy values into round key array */ - for (j = 0; (j < KC) && (r < ROUNDS + 1); ) { - for (; (j < KC) && (t < 4); j++, t++) { - *((word32*)W[r][t]) = *((word32*)tk[j]); - } - if (t == 4) { - r++; - t = 0; - } - } - } - return 0; -#undef tk -} - -int rijndaelKeyEncToDec(word8 W[MAXROUNDS+1][4][4], int ROUNDS) { - int r; - word8 *w; - - for (r = 1; r < ROUNDS; r++) { - w = W[r][0]; - *((word32*)w) = - *((const word32*)U1[w[0]]) - ^ *((const word32*)U2[w[1]]) - ^ *((const word32*)U3[w[2]]) - ^ *((const word32*)U4[w[3]]); - - w = W[r][1]; - *((word32*)w) = - *((const word32*)U1[w[0]]) - ^ *((const word32*)U2[w[1]]) - ^ *((const word32*)U3[w[2]]) - ^ *((const word32*)U4[w[3]]); - - w = W[r][2]; - *((word32*)w) = - *((const word32*)U1[w[0]]) - ^ *((const word32*)U2[w[1]]) - ^ *((const word32*)U3[w[2]]) - ^ *((const word32*)U4[w[3]]); - - w = W[r][3]; - *((word32*)w) = - *((const word32*)U1[w[0]]) - ^ *((const word32*)U2[w[1]]) - ^ *((const word32*)U3[w[2]]) - ^ *((const word32*)U4[w[3]]); - } - return 0; -} - -/** - * Encrypt a single block. 
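
On the two key-schedule routines above: rijndaelKeySched expands the user key (KC = ROUNDS - 6 words, so 4/6/8 words for 10/12/14 rounds) into ROUNDS+1 round keys, and rijndaelKeyEncToDec converts an encryption schedule for decryption by pushing every middle round key (r = 1 .. ROUNDS-1) through InvMixColumns, which is exactly what its U1 ^ U2 ^ U3 ^ U4 lookup chain computes. In scalar form the transform looks like this (a sketch; gmul is the GF(2^8) helper sketched earlier, declared here so the fragment stands alone):

    #include <stdint.h>

    static uint8_t gmul(uint8_t a, uint8_t b);  /* GF(2^8) multiply, sketched earlier */

    /* InvMixColumns on one 4-byte round-key word; matrix rows are
     * (0e 0b 0d 09) rotating right, the same result as the table lookups. */
    static void inv_mix_word(uint8_t w[4])
    {
        uint8_t c0 = w[0], c1 = w[1], c2 = w[2], c3 = w[3];

        w[0] = gmul(c0, 0x0e) ^ gmul(c1, 0x0b) ^ gmul(c2, 0x0d) ^ gmul(c3, 0x09);
        w[1] = gmul(c0, 0x09) ^ gmul(c1, 0x0e) ^ gmul(c2, 0x0b) ^ gmul(c3, 0x0d);
        w[2] = gmul(c0, 0x0d) ^ gmul(c1, 0x09) ^ gmul(c2, 0x0e) ^ gmul(c3, 0x0b);
        w[3] = gmul(c0, 0x0b) ^ gmul(c1, 0x0d) ^ gmul(c2, 0x09) ^ gmul(c3, 0x0e);
    }

Round keys 0 and ROUNDS are left untouched because the equivalent inverse cipher only swaps the order of AddRoundKey and InvMixColumns in the middle rounds.
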
- */ -int rijndaelEncrypt(word8 in[16], word8 out[16], word8 rk[MAXROUNDS+1][4][4], int ROUNDS) { - int r; - union { - word8 x8[16]; - word32 x32[4]; - } xa, xb; -#define a xa.x8 -#define b xb.x8 - union { - word8 x8[4][4]; - word32 x32[4]; - } xtemp; -#define temp xtemp.x8 - - memcpy(a, in, sizeof a); - - *((word32*)temp[0]) = *((word32*)(a )) ^ *((word32*)rk[0][0]); - *((word32*)temp[1]) = *((word32*)(a+ 4)) ^ *((word32*)rk[0][1]); - *((word32*)temp[2]) = *((word32*)(a+ 8)) ^ *((word32*)rk[0][2]); - *((word32*)temp[3]) = *((word32*)(a+12)) ^ *((word32*)rk[0][3]); - *((word32*)(b )) = *((const word32*)T1[temp[0][0]]) - ^ *((const word32*)T2[temp[1][1]]) - ^ *((const word32*)T3[temp[2][2]]) - ^ *((const word32*)T4[temp[3][3]]); - *((word32*)(b + 4)) = *((const word32*)T1[temp[1][0]]) - ^ *((const word32*)T2[temp[2][1]]) - ^ *((const word32*)T3[temp[3][2]]) - ^ *((const word32*)T4[temp[0][3]]); - *((word32*)(b + 8)) = *((const word32*)T1[temp[2][0]]) - ^ *((const word32*)T2[temp[3][1]]) - ^ *((const word32*)T3[temp[0][2]]) - ^ *((const word32*)T4[temp[1][3]]); - *((word32*)(b +12)) = *((const word32*)T1[temp[3][0]]) - ^ *((const word32*)T2[temp[0][1]]) - ^ *((const word32*)T3[temp[1][2]]) - ^ *((const word32*)T4[temp[2][3]]); - for (r = 1; r < ROUNDS-1; r++) { - *((word32*)temp[0]) = *((word32*)(b )) ^ *((word32*)rk[r][0]); - *((word32*)temp[1]) = *((word32*)(b+ 4)) ^ *((word32*)rk[r][1]); - *((word32*)temp[2]) = *((word32*)(b+ 8)) ^ *((word32*)rk[r][2]); - *((word32*)temp[3]) = *((word32*)(b+12)) ^ *((word32*)rk[r][3]); - - *((word32*)(b )) = *((const word32*)T1[temp[0][0]]) - ^ *((const word32*)T2[temp[1][1]]) - ^ *((const word32*)T3[temp[2][2]]) - ^ *((const word32*)T4[temp[3][3]]); - *((word32*)(b + 4)) = *((const word32*)T1[temp[1][0]]) - ^ *((const word32*)T2[temp[2][1]]) - ^ *((const word32*)T3[temp[3][2]]) - ^ *((const word32*)T4[temp[0][3]]); - *((word32*)(b + 8)) = *((const word32*)T1[temp[2][0]]) - ^ *((const word32*)T2[temp[3][1]]) - ^ *((const word32*)T3[temp[0][2]]) - ^ *((const word32*)T4[temp[1][3]]); - *((word32*)(b +12)) = *((const word32*)T1[temp[3][0]]) - ^ *((const word32*)T2[temp[0][1]]) - ^ *((const word32*)T3[temp[1][2]]) - ^ *((const word32*)T4[temp[2][3]]); - } - /* last round is special */ - *((word32*)temp[0]) = *((word32*)(b )) ^ *((word32*)rk[ROUNDS-1][0]); - *((word32*)temp[1]) = *((word32*)(b+ 4)) ^ *((word32*)rk[ROUNDS-1][1]); - *((word32*)temp[2]) = *((word32*)(b+ 8)) ^ *((word32*)rk[ROUNDS-1][2]); - *((word32*)temp[3]) = *((word32*)(b+12)) ^ *((word32*)rk[ROUNDS-1][3]); - b[ 0] = T1[temp[0][0]][1]; - b[ 1] = T1[temp[1][1]][1]; - b[ 2] = T1[temp[2][2]][1]; - b[ 3] = T1[temp[3][3]][1]; - b[ 4] = T1[temp[1][0]][1]; - b[ 5] = T1[temp[2][1]][1]; - b[ 6] = T1[temp[3][2]][1]; - b[ 7] = T1[temp[0][3]][1]; - b[ 8] = T1[temp[2][0]][1]; - b[ 9] = T1[temp[3][1]][1]; - b[10] = T1[temp[0][2]][1]; - b[11] = T1[temp[1][3]][1]; - b[12] = T1[temp[3][0]][1]; - b[13] = T1[temp[0][1]][1]; - b[14] = T1[temp[1][2]][1]; - b[15] = T1[temp[2][3]][1]; - *((word32*)(b )) ^= *((word32*)rk[ROUNDS][0]); - *((word32*)(b+ 4)) ^= *((word32*)rk[ROUNDS][1]); - *((word32*)(b+ 8)) ^= *((word32*)rk[ROUNDS][2]); - *((word32*)(b+12)) ^= *((word32*)rk[ROUNDS][3]); - - memcpy(out, b, sizeof b /* XXX out */); - - return 0; -#undef a -#undef b -#undef temp -} - -#ifdef INTERMEDIATE_VALUE_KAT -/** - * Encrypt only a certain number of rounds. - * Only used in the Intermediate Value Known Answer Test. 
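
In rijndaelEncrypt above, each 32-bit assignment fuses SubBytes, ShiftRows and MixColumns for one output column into four table loads and three XORs. The T tables live in the deleted boxes-fst.dat, which this excerpt only partially shows; their shape, as a sketch (S, gmul and t1_entry are illustrative, not quoted from the file):

    #include <stdint.h>

    extern const uint8_t S[256];                /* AES S-box (assumed given) */
    static uint8_t gmul(uint8_t a, uint8_t b);  /* GF(2^8) multiply, sketched earlier */

    /* T1[x] = { 2*S[x], S[x], S[x], 3*S[x] }; T2..T4 rotate these bytes right. */
    static void t1_entry(uint8_t x, uint8_t out[4])
    {
        out[0] = gmul(S[x], 2);
        out[1] = S[x];
        out[2] = S[x];
        out[3] = gmul(S[x], 3);
    }

Note that T1[x][1] == S[x]: the final AES round omits MixColumns, so the "last round is special" block above indexes [1] to recover the bare S-box byte instead of carrying a separate table.
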
- */ -int rijndaelEncryptRound(word8 a[4][4], word8 rk[MAXROUNDS+1][4][4], int ROUNDS, int rounds) { - int r; - word8 temp[4][4]; - - /* make number of rounds sane */ - if (rounds > ROUNDS) { - rounds = ROUNDS; - } - - *((word32*)a[0]) = *((word32*)a[0]) ^ *((word32*)rk[0][0]); - *((word32*)a[1]) = *((word32*)a[1]) ^ *((word32*)rk[0][1]); - *((word32*)a[2]) = *((word32*)a[2]) ^ *((word32*)rk[0][2]); - *((word32*)a[3]) = *((word32*)a[3]) ^ *((word32*)rk[0][3]); - - for (r = 1; (r <= rounds) && (r < ROUNDS); r++) { - *((word32*)temp[0]) = *((word32*)T1[a[0][0]]) - ^ *((word32*)T2[a[1][1]]) - ^ *((word32*)T3[a[2][2]]) - ^ *((word32*)T4[a[3][3]]); - *((word32*)temp[1]) = *((word32*)T1[a[1][0]]) - ^ *((word32*)T2[a[2][1]]) - ^ *((word32*)T3[a[3][2]]) - ^ *((word32*)T4[a[0][3]]); - *((word32*)temp[2]) = *((word32*)T1[a[2][0]]) - ^ *((word32*)T2[a[3][1]]) - ^ *((word32*)T3[a[0][2]]) - ^ *((word32*)T4[a[1][3]]); - *((word32*)temp[3]) = *((word32*)T1[a[3][0]]) - ^ *((word32*)T2[a[0][1]]) - ^ *((word32*)T3[a[1][2]]) - ^ *((word32*)T4[a[2][3]]); - *((word32*)a[0]) = *((word32*)temp[0]) ^ *((word32*)rk[r][0]); - *((word32*)a[1]) = *((word32*)temp[1]) ^ *((word32*)rk[r][1]); - *((word32*)a[2]) = *((word32*)temp[2]) ^ *((word32*)rk[r][2]); - *((word32*)a[3]) = *((word32*)temp[3]) ^ *((word32*)rk[r][3]); - } - if (rounds == ROUNDS) { - /* last round is special */ - temp[0][0] = T1[a[0][0]][1]; - temp[0][1] = T1[a[1][1]][1]; - temp[0][2] = T1[a[2][2]][1]; - temp[0][3] = T1[a[3][3]][1]; - temp[1][0] = T1[a[1][0]][1]; - temp[1][1] = T1[a[2][1]][1]; - temp[1][2] = T1[a[3][2]][1]; - temp[1][3] = T1[a[0][3]][1]; - temp[2][0] = T1[a[2][0]][1]; - temp[2][1] = T1[a[3][1]][1]; - temp[2][2] = T1[a[0][2]][1]; - temp[2][3] = T1[a[1][3]][1]; - temp[3][0] = T1[a[3][0]][1]; - temp[3][1] = T1[a[0][1]][1]; - temp[3][2] = T1[a[1][2]][1]; - temp[3][3] = T1[a[2][3]][1]; - *((word32*)a[0]) = *((word32*)temp[0]) ^ *((word32*)rk[ROUNDS][0]); - *((word32*)a[1]) = *((word32*)temp[1]) ^ *((word32*)rk[ROUNDS][1]); - *((word32*)a[2]) = *((word32*)temp[2]) ^ *((word32*)rk[ROUNDS][2]); - *((word32*)a[3]) = *((word32*)temp[3]) ^ *((word32*)rk[ROUNDS][3]); - } - - return 0; -} -#endif /* INTERMEDIATE_VALUE_KAT */ - -/** - * Decrypt a single block. 
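
rijndaelEncryptRound above performs the initial key addition plus the first `rounds` rounds in place, solely so the Intermediate Value Known Answer Test can inspect the state between rounds. A hypothetical harness (not in the patch, written against the file's own types) would replay it from a fresh copy each time:

    #include <string.h>

    /* Dump the state after 1, 2, ... ROUNDS rounds for KAT comparison. */
    static void kat_trace(word8 pt[4][4], word8 rk[MAXROUNDS+1][4][4], int ROUNDS)
    {
        word8 state[4][4];
        int r;

        for (r = 1; r <= ROUNDS; r++) {
            memcpy(state, pt, sizeof state);
            rijndaelEncryptRound(state, rk, ROUNDS, r);
            /* ... print state and compare against the published vector ... */
        }
    }
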
- */ -int rijndaelDecrypt(word8 in[16], word8 out[16], word8 rk[MAXROUNDS+1][4][4], int ROUNDS) { - int r; - union { - word8 x8[16]; - word32 x32[4]; - } xa, xb; -#define a xa.x8 -#define b xb.x8 - union { - word8 x8[4][4]; - word32 x32[4]; - } xtemp; -#define temp xtemp.x8 - - memcpy(a, in, sizeof a); - - *((word32*)temp[0]) = *((word32*)(a )) ^ *((word32*)rk[ROUNDS][0]); - *((word32*)temp[1]) = *((word32*)(a+ 4)) ^ *((word32*)rk[ROUNDS][1]); - *((word32*)temp[2]) = *((word32*)(a+ 8)) ^ *((word32*)rk[ROUNDS][2]); - *((word32*)temp[3]) = *((word32*)(a+12)) ^ *((word32*)rk[ROUNDS][3]); - - *((word32*)(b )) = *((const word32*)T5[temp[0][0]]) - ^ *((const word32*)T6[temp[3][1]]) - ^ *((const word32*)T7[temp[2][2]]) - ^ *((const word32*)T8[temp[1][3]]); - *((word32*)(b+ 4)) = *((const word32*)T5[temp[1][0]]) - ^ *((const word32*)T6[temp[0][1]]) - ^ *((const word32*)T7[temp[3][2]]) - ^ *((const word32*)T8[temp[2][3]]); - *((word32*)(b+ 8)) = *((const word32*)T5[temp[2][0]]) - ^ *((const word32*)T6[temp[1][1]]) - ^ *((const word32*)T7[temp[0][2]]) - ^ *((const word32*)T8[temp[3][3]]); - *((word32*)(b+12)) = *((const word32*)T5[temp[3][0]]) - ^ *((const word32*)T6[temp[2][1]]) - ^ *((const word32*)T7[temp[1][2]]) - ^ *((const word32*)T8[temp[0][3]]); - for (r = ROUNDS-1; r > 1; r--) { - *((word32*)temp[0]) = *((word32*)(b )) ^ *((word32*)rk[r][0]); - *((word32*)temp[1]) = *((word32*)(b+ 4)) ^ *((word32*)rk[r][1]); - *((word32*)temp[2]) = *((word32*)(b+ 8)) ^ *((word32*)rk[r][2]); - *((word32*)temp[3]) = *((word32*)(b+12)) ^ *((word32*)rk[r][3]); - *((word32*)(b )) = *((const word32*)T5[temp[0][0]]) - ^ *((const word32*)T6[temp[3][1]]) - ^ *((const word32*)T7[temp[2][2]]) - ^ *((const word32*)T8[temp[1][3]]); - *((word32*)(b+ 4)) = *((const word32*)T5[temp[1][0]]) - ^ *((const word32*)T6[temp[0][1]]) - ^ *((const word32*)T7[temp[3][2]]) - ^ *((const word32*)T8[temp[2][3]]); - *((word32*)(b+ 8)) = *((const word32*)T5[temp[2][0]]) - ^ *((const word32*)T6[temp[1][1]]) - ^ *((const word32*)T7[temp[0][2]]) - ^ *((const word32*)T8[temp[3][3]]); - *((word32*)(b+12)) = *((const word32*)T5[temp[3][0]]) - ^ *((const word32*)T6[temp[2][1]]) - ^ *((const word32*)T7[temp[1][2]]) - ^ *((const word32*)T8[temp[0][3]]); - } - /* last round is special */ - *((word32*)temp[0]) = *((word32*)(b )) ^ *((word32*)rk[1][0]); - *((word32*)temp[1]) = *((word32*)(b+ 4)) ^ *((word32*)rk[1][1]); - *((word32*)temp[2]) = *((word32*)(b+ 8)) ^ *((word32*)rk[1][2]); - *((word32*)temp[3]) = *((word32*)(b+12)) ^ *((word32*)rk[1][3]); - b[ 0] = S5[temp[0][0]]; - b[ 1] = S5[temp[3][1]]; - b[ 2] = S5[temp[2][2]]; - b[ 3] = S5[temp[1][3]]; - b[ 4] = S5[temp[1][0]]; - b[ 5] = S5[temp[0][1]]; - b[ 6] = S5[temp[3][2]]; - b[ 7] = S5[temp[2][3]]; - b[ 8] = S5[temp[2][0]]; - b[ 9] = S5[temp[1][1]]; - b[10] = S5[temp[0][2]]; - b[11] = S5[temp[3][3]]; - b[12] = S5[temp[3][0]]; - b[13] = S5[temp[2][1]]; - b[14] = S5[temp[1][2]]; - b[15] = S5[temp[0][3]]; - *((word32*)(b )) ^= *((word32*)rk[0][0]); - *((word32*)(b+ 4)) ^= *((word32*)rk[0][1]); - *((word32*)(b+ 8)) ^= *((word32*)rk[0][2]); - *((word32*)(b+12)) ^= *((word32*)rk[0][3]); - - memcpy(out, b, sizeof b /* XXX out */); - - return 0; -#undef a -#undef b -#undef temp -} - - -#ifdef INTERMEDIATE_VALUE_KAT -/** - * Decrypt only a certain number of rounds. - * Only used in the Intermediate Value Known Answer Test. - * Operations rearranged such that the intermediate values - * of decryption correspond with the intermediate values - * of encryption. 
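
rijndaelDecrypt above mirrors the encrypt path with the inverse tables. Their shape, again as a sketch (Si, gmul and t5_entry are illustrative):

    #include <stdint.h>

    extern const uint8_t Si[256];               /* inverse S-box (assumed given) */
    static uint8_t gmul(uint8_t a, uint8_t b);  /* GF(2^8) multiply, sketched earlier */

    /* T5[x]: InvSubBytes first, then one InvMixColumns column;
     * T6..T8 rotate these bytes right, and S5[x] == Si[x] serves the
     * last round, which omits InvMixColumns. */
    static void t5_entry(uint8_t x, uint8_t out[4])
    {
        out[0] = gmul(Si[x], 0x0e);
        out[1] = gmul(Si[x], 0x09);
        out[2] = gmul(Si[x], 0x0d);
        out[3] = gmul(Si[x], 0x0b);
    }

Since Si[S[x]] == x, it follows that U1[x] == T5[S[x]]: the key-schedule tables at the top of this hunk are the decryption tables pre-composed with the forward S-box.
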
- */ -int rijndaelDecryptRound(word8 a[4][4], word8 rk[MAXROUNDS+1][4][4], int ROUNDS, int rounds) { - int r, i; - word8 temp[4], shift; - - /* make number of rounds sane */ - if (rounds > ROUNDS) { - rounds = ROUNDS; - } - /* first round is special: */ - *(word32 *)a[0] ^= *(word32 *)rk[ROUNDS][0]; - *(word32 *)a[1] ^= *(word32 *)rk[ROUNDS][1]; - *(word32 *)a[2] ^= *(word32 *)rk[ROUNDS][2]; - *(word32 *)a[3] ^= *(word32 *)rk[ROUNDS][3]; - for (i = 0; i < 4; i++) { - a[i][0] = Si[a[i][0]]; - a[i][1] = Si[a[i][1]]; - a[i][2] = Si[a[i][2]]; - a[i][3] = Si[a[i][3]]; - } - for (i = 1; i < 4; i++) { - shift = (4 - i) & 3; - temp[0] = a[(0 + shift) & 3][i]; - temp[1] = a[(1 + shift) & 3][i]; - temp[2] = a[(2 + shift) & 3][i]; - temp[3] = a[(3 + shift) & 3][i]; - a[0][i] = temp[0]; - a[1][i] = temp[1]; - a[2][i] = temp[2]; - a[3][i] = temp[3]; - } - /* ROUNDS-1 ordinary rounds */ - for (r = ROUNDS-1; r > rounds; r--) { - *(word32 *)a[0] ^= *(word32 *)rk[r][0]; - *(word32 *)a[1] ^= *(word32 *)rk[r][1]; - *(word32 *)a[2] ^= *(word32 *)rk[r][2]; - *(word32 *)a[3] ^= *(word32 *)rk[r][3]; - - *((word32*)a[0]) = - *((word32*)U1[a[0][0]]) - ^ *((word32*)U2[a[0][1]]) - ^ *((word32*)U3[a[0][2]]) - ^ *((word32*)U4[a[0][3]]); - - *((word32*)a[1]) = - *((word32*)U1[a[1][0]]) - ^ *((word32*)U2[a[1][1]]) - ^ *((word32*)U3[a[1][2]]) - ^ *((word32*)U4[a[1][3]]); - - *((word32*)a[2]) = - *((word32*)U1[a[2][0]]) - ^ *((word32*)U2[a[2][1]]) - ^ *((word32*)U3[a[2][2]]) - ^ *((word32*)U4[a[2][3]]); - - *((word32*)a[3]) = - *((word32*)U1[a[3][0]]) - ^ *((word32*)U2[a[3][1]]) - ^ *((word32*)U3[a[3][2]]) - ^ *((word32*)U4[a[3][3]]); - for (i = 0; i < 4; i++) { - a[i][0] = Si[a[i][0]]; - a[i][1] = Si[a[i][1]]; - a[i][2] = Si[a[i][2]]; - a[i][3] = Si[a[i][3]]; - } - for (i = 1; i < 4; i++) { - shift = (4 - i) & 3; - temp[0] = a[(0 + shift) & 3][i]; - temp[1] = a[(1 + shift) & 3][i]; - temp[2] = a[(2 + shift) & 3][i]; - temp[3] = a[(3 + shift) & 3][i]; - a[0][i] = temp[0]; - a[1][i] = temp[1]; - a[2][i] = temp[2]; - a[3][i] = temp[3]; - } - } - if (rounds == 0) { - /* End with the extra key addition */ - *(word32 *)a[0] ^= *(word32 *)rk[0][0]; - *(word32 *)a[1] ^= *(word32 *)rk[0][1]; - *(word32 *)a[2] ^= *(word32 *)rk[0][2]; - *(word32 *)a[3] ^= *(word32 *)rk[0][3]; - } - return 0; -} -#endif /* INTERMEDIATE_VALUE_KAT */ diff --git a/bsd/crypto/rijndael/rijndael-alg-fst.h b/bsd/crypto/rijndael/rijndael-alg-fst.h deleted file mode 100644 index 811ce60d1..000000000 --- a/bsd/crypto/rijndael/rijndael-alg-fst.h +++ /dev/null @@ -1,34 +0,0 @@ -/* $FreeBSD: src/sys/crypto/rijndael/rijndael-alg-fst.h,v 1.2.2.1 2001/07/03 11:01:35 ume Exp $ */ -/* $KAME: rijndael-alg-fst.h,v 1.4 2000/10/02 17:14:26 itojun Exp $ */ - -/* - * rijndael-alg-fst.h v2.3 April '2000 - * - * Optimised ANSI C code - * - * #define INTERMEDIATE_VALUE_KAT to generate the Intermediate Value Known Answer Test. 
- */ - -#ifndef __RIJNDAEL_ALG_FST_H -#define __RIJNDAEL_ALG_FST_H - -#define RIJNDAEL_MAXKC (256/32) -#define RIJNDAEL_MAXROUNDS 14 - -int rijndaelKeySched(u_int8_t k[RIJNDAEL_MAXKC][4], u_int8_t rk[RIJNDAEL_MAXROUNDS+1][4][4], int ROUNDS); - -int rijndaelKeyEncToDec(u_int8_t W[RIJNDAEL_MAXROUNDS+1][4][4], int ROUNDS); - -int rijndaelEncrypt(u_int8_t a[16], u_int8_t b[16], u_int8_t rk[RIJNDAEL_MAXROUNDS+1][4][4], int ROUNDS); - -#ifdef INTERMEDIATE_VALUE_KAT -int rijndaelEncryptRound(u_int8_t a[4][4], u_int8_t rk[RIJNDAEL_MAXROUNDS+1][4][4], int ROUNDS, int rounds); -#endif /* INTERMEDIATE_VALUE_KAT */ - -int rijndaelDecrypt(u_int8_t a[16], u_int8_t b[16], u_int8_t rk[RIJNDAEL_MAXROUNDS+1][4][4], int ROUNDS); - -#ifdef INTERMEDIATE_VALUE_KAT -int rijndaelDecryptRound(u_int8_t a[4][4], u_int8_t rk[RIJNDAEL_MAXROUNDS+1][4][4], int ROUNDS, int rounds); -#endif /* INTERMEDIATE_VALUE_KAT */ - -#endif /* __RIJNDAEL_ALG_FST_H */ diff --git a/bsd/crypto/rijndael/rijndael-api-fst.c b/bsd/crypto/rijndael/rijndael-api-fst.c deleted file mode 100644 index 295bab387..000000000 --- a/bsd/crypto/rijndael/rijndael-api-fst.c +++ /dev/null @@ -1,484 +0,0 @@ -/* $FreeBSD: src/sys/crypto/rijndael/rijndael-api-fst.c,v 1.2.2.1 2001/07/03 11:01:35 ume Exp $ */ -/* $KAME: rijndael-api-fst.c,v 1.10 2001/05/27 09:34:18 itojun Exp $ */ - -/* - * rijndael-api-fst.c v2.3 April '2000 - * - * Optimised ANSI C code - * - * authors: v1.0: Antoon Bosselaers - * v2.0: Vincent Rijmen - * v2.1: Vincent Rijmen - * v2.2: Vincent Rijmen - * v2.3: Paulo Barreto - * v2.4: Vincent Rijmen - * - * This code is placed in the public domain. - */ - -#include <sys/param.h> -#include <sys/types.h> -#ifdef KERNEL -#include <sys/systm.h> -#else -#include <string.h> -#endif -#include <crypto/rijndael/rijndael-alg-fst.h> -#include <crypto/rijndael/rijndael-api-fst.h> -#include <crypto/rijndael/rijndael_local.h> - -int rijndael_makeKey(keyInstance *key, BYTE direction, int keyLen, char *keyMaterial) { - word8 k[MAXKC][4]; - int i; - char *keyMat; - - if (key == NULL) { - return BAD_KEY_INSTANCE; - } - - if ((direction == DIR_ENCRYPT) || (direction == DIR_DECRYPT)) { - key->direction = direction; - } else { - return BAD_KEY_DIR; - } - - if ((keyLen == 128) || (keyLen == 192) || (keyLen == 256)) { - key->keyLen = keyLen; - } else { - return BAD_KEY_MAT; - } - - if (keyMaterial != NULL) { - bcopy(keyMaterial, key->keyMaterial, keyLen/8); - } - - key->ROUNDS = keyLen/32 + 6; - - /* initialize key schedule: */ - keyMat = key->keyMaterial; - for (i = 0; i < key->keyLen/8; i++) { - k[i >> 2][i & 3] = (word8)keyMat[i]; - } - rijndaelKeySched(k, key->keySched, key->ROUNDS); - if (direction == DIR_DECRYPT) { - rijndaelKeyEncToDec(key->keySched, key->ROUNDS); - } - - return TRUE; -} - -int rijndael_cipherInit(cipherInstance *cipher, BYTE mode, char *IV) { - if ((mode == MODE_ECB) || (mode == MODE_CBC) || (mode == MODE_CFB1)) { - cipher->mode = mode; - } else { - return BAD_CIPHER_MODE; - } - if (IV != NULL) { - bcopy(IV, cipher->IV, MAX_IV_SIZE); - } else { - bzero(cipher->IV, MAX_IV_SIZE); - } - return TRUE; -} - -int rijndael_blockEncrypt(cipherInstance *cipher, keyInstance *key, - BYTE *input, int inputLen, BYTE *outBuffer) { - int i, k, numBlocks; - word8 block[16], iv[4][4]; - - if (cipher == NULL || - key == NULL || - key->direction == DIR_DECRYPT) { - return BAD_CIPHER_STATE; - } - if (input == NULL || inputLen <= 0) { - return 0; /* nothing to do */ - } - - numBlocks = inputLen/128; - - switch (cipher->mode) { - case MODE_ECB: - for (i = 
numBlocks; i > 0; i--) { - rijndaelEncrypt(input, outBuffer, key->keySched, key->ROUNDS); - input += 16; - outBuffer += 16; - } - break; - - case MODE_CBC: -#if 1 /*STRICT_ALIGN*/ - bcopy(cipher->IV, block, 16); - bcopy(input, iv, 16); - ((word32*)block)[0] ^= ((word32*)iv)[0]; - ((word32*)block)[1] ^= ((word32*)iv)[1]; - ((word32*)block)[2] ^= ((word32*)iv)[2]; - ((word32*)block)[3] ^= ((word32*)iv)[3]; -#else - ((word32*)block)[0] = ((word32*)cipher->IV)[0] ^ ((word32*)input)[0]; - ((word32*)block)[1] = ((word32*)cipher->IV)[1] ^ ((word32*)input)[1]; - ((word32*)block)[2] = ((word32*)cipher->IV)[2] ^ ((word32*)input)[2]; - ((word32*)block)[3] = ((word32*)cipher->IV)[3] ^ ((word32*)input)[3]; -#endif - rijndaelEncrypt(block, outBuffer, key->keySched, key->ROUNDS); - input += 16; - for (i = numBlocks - 1; i > 0; i--) { -#if 1 /*STRICT_ALIGN*/ - bcopy(outBuffer, block, 16); - ((word32*)block)[0] ^= ((word32*)iv)[0]; - ((word32*)block)[1] ^= ((word32*)iv)[1]; - ((word32*)block)[2] ^= ((word32*)iv)[2]; - ((word32*)block)[3] ^= ((word32*)iv)[3]; -#else - ((word32*)block)[0] = ((word32*)outBuffer)[0] ^ ((word32*)input)[0]; - ((word32*)block)[1] = ((word32*)outBuffer)[1] ^ ((word32*)input)[1]; - ((word32*)block)[2] = ((word32*)outBuffer)[2] ^ ((word32*)input)[2]; - ((word32*)block)[3] = ((word32*)outBuffer)[3] ^ ((word32*)input)[3]; -#endif - outBuffer += 16; - rijndaelEncrypt(block, outBuffer, key->keySched, key->ROUNDS); - input += 16; - } - break; - - case MODE_CFB1: -#if 1 /*STRICT_ALIGN*/ - bcopy(cipher->IV, iv, 16); -#else /* !STRICT_ALIGN */ - *((word32*)iv[0]) = *((word32*)(cipher->IV )); - *((word32*)iv[1]) = *((word32*)(cipher->IV+ 4)); - *((word32*)iv[2]) = *((word32*)(cipher->IV+ 8)); - *((word32*)iv[3]) = *((word32*)(cipher->IV+12)); -#endif /* ?STRICT_ALIGN */ - for (i = numBlocks; i > 0; i--) { - for (k = 0; k < 128; k++) { - *((word32*) block ) = *((word32*)iv[0]); - *((word32*)(block+ 4)) = *((word32*)iv[1]); - *((word32*)(block+ 8)) = *((word32*)iv[2]); - *((word32*)(block+12)) = *((word32*)iv[3]); - rijndaelEncrypt(block, block, key->keySched, key->ROUNDS); - outBuffer[k/8] ^= (block[0] & 0x80) >> (k & 7); - iv[0][0] = (iv[0][0] << 1) | (iv[0][1] >> 7); - iv[0][1] = (iv[0][1] << 1) | (iv[0][2] >> 7); - iv[0][2] = (iv[0][2] << 1) | (iv[0][3] >> 7); - iv[0][3] = (iv[0][3] << 1) | (iv[1][0] >> 7); - iv[1][0] = (iv[1][0] << 1) | (iv[1][1] >> 7); - iv[1][1] = (iv[1][1] << 1) | (iv[1][2] >> 7); - iv[1][2] = (iv[1][2] << 1) | (iv[1][3] >> 7); - iv[1][3] = (iv[1][3] << 1) | (iv[2][0] >> 7); - iv[2][0] = (iv[2][0] << 1) | (iv[2][1] >> 7); - iv[2][1] = (iv[2][1] << 1) | (iv[2][2] >> 7); - iv[2][2] = (iv[2][2] << 1) | (iv[2][3] >> 7); - iv[2][3] = (iv[2][3] << 1) | (iv[3][0] >> 7); - iv[3][0] = (iv[3][0] << 1) | (iv[3][1] >> 7); - iv[3][1] = (iv[3][1] << 1) | (iv[3][2] >> 7); - iv[3][2] = (iv[3][2] << 1) | (iv[3][3] >> 7); - iv[3][3] = (iv[3][3] << 1) | ((outBuffer[k/8] >> (7-(k&7))) & 1); - } - } - break; - - default: - return BAD_CIPHER_STATE; - } - - return 128*numBlocks; -} - -/** - * Encrypt data partitioned in octets, using RFC 2040-like padding. - * - * @param input data to be encrypted (octet sequence) - * @param inputOctets input length in octets (not bits) - * @param outBuffer encrypted output data - * - * @return length in octets (not bits) of the encrypted output buffer. 
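
Two observations on rijndael_blockEncrypt above. First, the CFB1 case pays one full AES encryption per plaintext bit: the 16-byte IV acts as a shift register that slides left one bit per step, with the fresh ciphertext bit entering on the right. Distilled (names are illustrative; `encrypt` stands in for rijndaelEncrypt with a fixed key schedule):

    #include <stdint.h>

    /* One CFB-1 step over a 16-byte shift register, as in the loop above. */
    static uint8_t cfb1_bit(uint8_t sr[16], uint8_t pt_bit,
                            void (*encrypt)(const uint8_t in[16], uint8_t out[16]))
    {
        uint8_t ks[16];
        uint8_t ct_bit;
        int i;

        encrypt(sr, ks);                            /* one full AES per bit */
        ct_bit = (uint8_t)(pt_bit ^ (ks[0] >> 7));  /* only the MSB is used */
        for (i = 0; i < 15; i++)                    /* shift register left */
            sr[i] = (uint8_t)((sr[i] << 1) | (sr[i + 1] >> 7));
        sr[15] = (uint8_t)((sr[15] << 1) | ct_bit); /* feed the cipher bit in */
        return ct_bit;
    }

Second, note that the STRICT_ALIGN branch of the CBC case copies the first input block into iv once and never reloads it inside the loop, so every block after the first is XORed against the first plaintext block rather than the current one; the unaligned #else branch chains correctly. This routine is deleted by the patch in any case.
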
- */ -int rijndael_padEncrypt(cipherInstance *cipher, keyInstance *key, - BYTE *input, int inputOctets, BYTE *outBuffer) { - int i, numBlocks, padLen; - word8 block[16], *iv, *cp; - - if (cipher == NULL || - key == NULL || - key->direction == DIR_DECRYPT) { - return BAD_CIPHER_STATE; - } - if (input == NULL || inputOctets <= 0) { - return 0; /* nothing to do */ - } - - numBlocks = inputOctets/16; - - switch (cipher->mode) { - case MODE_ECB: - for (i = numBlocks; i > 0; i--) { - rijndaelEncrypt(input, outBuffer, key->keySched, key->ROUNDS); - input += 16; - outBuffer += 16; - } - padLen = 16 - (inputOctets - 16*numBlocks); - if (padLen > 0 && padLen <= 16) - panic("rijndael_padEncrypt(ECB)"); - bcopy(input, block, 16 - padLen); - for (cp = block + 16 - padLen; cp < block + 16; cp++) - *cp = padLen; - rijndaelEncrypt(block, outBuffer, key->keySched, key->ROUNDS); - break; - - case MODE_CBC: - iv = cipher->IV; - for (i = numBlocks; i > 0; i--) { - ((word32*)block)[0] = ((word32*)input)[0] ^ ((word32*)iv)[0]; - ((word32*)block)[1] = ((word32*)input)[1] ^ ((word32*)iv)[1]; - ((word32*)block)[2] = ((word32*)input)[2] ^ ((word32*)iv)[2]; - ((word32*)block)[3] = ((word32*)input)[3] ^ ((word32*)iv)[3]; - rijndaelEncrypt(block, outBuffer, key->keySched, key->ROUNDS); - iv = outBuffer; - input += 16; - outBuffer += 16; - } - padLen = 16 - (inputOctets - 16*numBlocks); - if (padLen > 0 && padLen <= 16) - panic("rijndael_padEncrypt(CBC)"); - for (i = 0; i < 16 - padLen; i++) { - block[i] = input[i] ^ iv[i]; - } - for (i = 16 - padLen; i < 16; i++) { - block[i] = (BYTE)padLen ^ iv[i]; - } - rijndaelEncrypt(block, outBuffer, key->keySched, key->ROUNDS); - break; - - default: - return BAD_CIPHER_STATE; - } - - return 16*(numBlocks + 1); -} - -int rijndael_blockDecrypt(cipherInstance *cipher, keyInstance *key, - BYTE *input, int inputLen, BYTE *outBuffer) { - int i, k, numBlocks; - word8 block[16], iv[4][4]; - - if (cipher == NULL || - key == NULL || - (cipher->mode != MODE_CFB1 && key->direction == DIR_ENCRYPT)) { - return BAD_CIPHER_STATE; - } - if (input == NULL || inputLen <= 0) { - return 0; /* nothing to do */ - } - - numBlocks = inputLen/128; - - switch (cipher->mode) { - case MODE_ECB: - for (i = numBlocks; i > 0; i--) { - rijndaelDecrypt(input, outBuffer, key->keySched, key->ROUNDS); - input += 16; - outBuffer += 16; - } - break; - - case MODE_CBC: -#if 1 /*STRICT_ALIGN */ - bcopy(cipher->IV, iv, 16); -#else - *((word32*)iv[0]) = *((word32*)(cipher->IV )); - *((word32*)iv[1]) = *((word32*)(cipher->IV+ 4)); - *((word32*)iv[2]) = *((word32*)(cipher->IV+ 8)); - *((word32*)iv[3]) = *((word32*)(cipher->IV+12)); -#endif - for (i = numBlocks; i > 0; i--) { - rijndaelDecrypt(input, block, key->keySched, key->ROUNDS); - ((word32*)block)[0] ^= *((word32*)iv[0]); - ((word32*)block)[1] ^= *((word32*)iv[1]); - ((word32*)block)[2] ^= *((word32*)iv[2]); - ((word32*)block)[3] ^= *((word32*)iv[3]); -#if 1 /*STRICT_ALIGN*/ - bcopy(input, iv, 16); - bcopy(block, outBuffer, 16); -#else - *((word32*)iv[0]) = ((word32*)input)[0]; ((word32*)outBuffer)[0] = ((word32*)block)[0]; - *((word32*)iv[1]) = ((word32*)input)[1]; ((word32*)outBuffer)[1] = ((word32*)block)[1]; - *((word32*)iv[2]) = ((word32*)input)[2]; ((word32*)outBuffer)[2] = ((word32*)block)[2]; - *((word32*)iv[3]) = ((word32*)input)[3]; ((word32*)outBuffer)[3] = ((word32*)block)[3]; -#endif - input += 16; - outBuffer += 16; - } - break; - - case MODE_CFB1: -#if 1 /*STRICT_ALIGN */ - bcopy(cipher->IV, iv, 16); -#else - *((word32*)iv[0]) = 
*((word32*)(cipher->IV)); - *((word32*)iv[1]) = *((word32*)(cipher->IV+ 4)); - *((word32*)iv[2]) = *((word32*)(cipher->IV+ 8)); - *((word32*)iv[3]) = *((word32*)(cipher->IV+12)); -#endif - for (i = numBlocks; i > 0; i--) { - for (k = 0; k < 128; k++) { - *((word32*) block ) = *((word32*)iv[0]); - *((word32*)(block+ 4)) = *((word32*)iv[1]); - *((word32*)(block+ 8)) = *((word32*)iv[2]); - *((word32*)(block+12)) = *((word32*)iv[3]); - rijndaelEncrypt(block, block, key->keySched, key->ROUNDS); - iv[0][0] = (iv[0][0] << 1) | (iv[0][1] >> 7); - iv[0][1] = (iv[0][1] << 1) | (iv[0][2] >> 7); - iv[0][2] = (iv[0][2] << 1) | (iv[0][3] >> 7); - iv[0][3] = (iv[0][3] << 1) | (iv[1][0] >> 7); - iv[1][0] = (iv[1][0] << 1) | (iv[1][1] >> 7); - iv[1][1] = (iv[1][1] << 1) | (iv[1][2] >> 7); - iv[1][2] = (iv[1][2] << 1) | (iv[1][3] >> 7); - iv[1][3] = (iv[1][3] << 1) | (iv[2][0] >> 7); - iv[2][0] = (iv[2][0] << 1) | (iv[2][1] >> 7); - iv[2][1] = (iv[2][1] << 1) | (iv[2][2] >> 7); - iv[2][2] = (iv[2][2] << 1) | (iv[2][3] >> 7); - iv[2][3] = (iv[2][3] << 1) | (iv[3][0] >> 7); - iv[3][0] = (iv[3][0] << 1) | (iv[3][1] >> 7); - iv[3][1] = (iv[3][1] << 1) | (iv[3][2] >> 7); - iv[3][2] = (iv[3][2] << 1) | (iv[3][3] >> 7); - iv[3][3] = (iv[3][3] << 1) | ((input[k/8] >> (7-(k&7))) & 1); - outBuffer[k/8] ^= (block[0] & 0x80) >> (k & 7); - } - } - break; - - default: - return BAD_CIPHER_STATE; - } - - return 128*numBlocks; -} - -int rijndael_padDecrypt(cipherInstance *cipher, keyInstance *key, - BYTE *input, int inputOctets, BYTE *outBuffer) { - int i, numBlocks, padLen; - word8 block[16]; - word32 iv[4]; - - if (cipher == NULL || - key == NULL || - key->direction == DIR_ENCRYPT) { - return BAD_CIPHER_STATE; - } - if (input == NULL || inputOctets <= 0) { - return 0; /* nothing to do */ - } - if (inputOctets % 16 != 0) { - return BAD_DATA; - } - - numBlocks = inputOctets/16; - - switch (cipher->mode) { - case MODE_ECB: - /* all blocks but last */ - for (i = numBlocks - 1; i > 0; i--) { - rijndaelDecrypt(input, outBuffer, key->keySched, key->ROUNDS); - input += 16; - outBuffer += 16; - } - /* last block */ - rijndaelDecrypt(input, block, key->keySched, key->ROUNDS); - padLen = block[15]; - if (padLen >= 16) { - return BAD_DATA; - } - for (i = 16 - padLen; i < 16; i++) { - if (block[i] != padLen) { - return BAD_DATA; - } - } - bcopy(block, outBuffer, 16 - padLen); - break; - - case MODE_CBC: - bcopy(cipher->IV, iv, 16); - /* all blocks but last */ - for (i = numBlocks - 1; i > 0; i--) { - rijndaelDecrypt(input, block, key->keySched, key->ROUNDS); - ((word32*)block)[0] ^= iv[0]; - ((word32*)block)[1] ^= iv[1]; - ((word32*)block)[2] ^= iv[2]; - ((word32*)block)[3] ^= iv[3]; - bcopy(input, iv, 16); - bcopy(block, outBuffer, 16); - input += 16; - outBuffer += 16; - } - /* last block */ - rijndaelDecrypt(input, block, key->keySched, key->ROUNDS); - ((word32*)block)[0] ^= iv[0]; - ((word32*)block)[1] ^= iv[1]; - ((word32*)block)[2] ^= iv[2]; - ((word32*)block)[3] ^= iv[3]; - padLen = block[15]; - if (padLen <= 0 || padLen > 16) { - return BAD_DATA; - } - for (i = 16 - padLen; i < 16; i++) { - if (block[i] != padLen) { - return BAD_DATA; - } - } - bcopy(block, outBuffer, 16 - padLen); - break; - - default: - return BAD_CIPHER_STATE; - } - - return 16*numBlocks - padLen; -} - -#ifdef INTERMEDIATE_VALUE_KAT -/** - * cipherUpdateRounds: - * - * Encrypts/Decrypts exactly one full block a specified number of rounds. - * Only used in the Intermediate Value Known Answer Test. 
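
On the padded variants above: rijndael_padEncrypt and rijndael_padDecrypt implement RFC 2040-style (PKCS#7-like) padding, where the final block ends in n bytes of value n, 1 <= n <= 16, with a full block of padding when the input length is already a multiple of 16. One quirk worth flagging: the range check in rijndael_padEncrypt reads `if (padLen > 0 && padLen <= 16) panic(...)`, which fires precisely when the pad length is valid (which is always), so the routine as deleted could never have completed a call and cannot have been in active use; the intended test is the negation. A sketch of the padding rule itself, with the check the right way around (pad_tail and unpad_tail are hypothetical helpers):

    #include <stdint.h>
    #include <string.h>

    /* Build the final padded block from the input's trailing bytes. */
    static void pad_tail(const uint8_t *in, size_t len, uint8_t block[16])
    {
        size_t rem = len % 16;              /* payload bytes in final block */
        uint8_t pad = (uint8_t)(16 - rem);  /* always in 1..16 */

        memcpy(block, in + len - rem, rem);
        memset(block + rem, pad, pad);
    }

    /* Validate and strip padding; returns payload length, or -1 (BAD_DATA). */
    static int unpad_tail(const uint8_t block[16])
    {
        uint8_t pad = block[15];
        int i;

        if (pad < 1 || pad > 16)
            return -1;
        for (i = 16 - pad; i < 16; i++)
            if (block[i] != pad)
                return -1;
        return 16 - pad;
    }

For orientation, a caller drove this API roughly as rijndael_makeKey(&key, DIR_ENCRYPT, 128, material), then rijndael_cipherInit(&cipher, MODE_CBC, iv), then one of the encrypt calls; lengths are given in bits to the block routines and in octets to the pad routines.
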
- * - * Returns: - * TRUE - on success - * BAD_CIPHER_STATE - cipher in bad state (e.g., not initialized) - */ -int rijndael_cipherUpdateRounds(cipherInstance *cipher, keyInstance *key, - BYTE *input, int inputLen, BYTE *outBuffer, int rounds) { - int j; - word8 block[4][4]; - - if (cipher == NULL || key == NULL) { - return BAD_CIPHER_STATE; - } - - for (j = 3; j >= 0; j--) { - /* parse input stream into rectangular array */ - *((word32*)block[j]) = *((word32*)(input+4*j)); - } - - switch (key->direction) { - case DIR_ENCRYPT: - rijndaelEncryptRound(block, key->keySched, key->ROUNDS, rounds); - break; - - case DIR_DECRYPT: - rijndaelDecryptRound(block, key->keySched, key->ROUNDS, rounds); - break; - - default: - return BAD_KEY_DIR; - } - - for (j = 3; j >= 0; j--) { - /* parse rectangular array into output ciphertext bytes */ - *((word32*)(outBuffer+4*j)) = *((word32*)block[j]); - } - - return TRUE; -} -#endif /* INTERMEDIATE_VALUE_KAT */ diff --git a/bsd/crypto/rijndael/rijndael-api-fst.h b/bsd/crypto/rijndael/rijndael-api-fst.h deleted file mode 100644 index 682f2da69..000000000 --- a/bsd/crypto/rijndael/rijndael-api-fst.h +++ /dev/null @@ -1,104 +0,0 @@ -/* $FreeBSD: src/sys/crypto/rijndael/rijndael-api-fst.h,v 1.2.2.1 2001/07/03 11:01:36 ume Exp $ */ -/* $KAME: rijndael-api-fst.h,v 1.6 2001/05/27 00:23:23 itojun Exp $ */ - -/* - * rijndael-api-fst.h v2.3 April '2000 - * - * Optimised ANSI C code - * - * #define INTERMEDIATE_VALUE_KAT to generate the Intermediate Value Known Answer Test. - */ - -#ifndef __RIJNDAEL_API_FST_H -#define __RIJNDAEL_API_FST_H - -#include <crypto/rijndael/rijndael-alg-fst.h> - -/* Defines: - Add any additional defines you need -*/ - -#define DIR_ENCRYPT 0 /* Are we encrypting? */ -#define DIR_DECRYPT 1 /* Are we decrypting? */ -#define MODE_ECB 1 /* Are we ciphering in ECB mode? */ -#define MODE_CBC 2 /* Are we ciphering in CBC mode? */ -#define MODE_CFB1 3 /* Are we ciphering in 1-bit CFB mode? */ -#define TRUE 1 -#define FALSE 0 -#define BITSPERBLOCK 128 /* Default number of bits in a cipher block */ - -/* Error Codes - CHANGE POSSIBLE: inclusion of additional error codes */ -#define BAD_KEY_DIR -1 /* Key direction is invalid, e.g., unknown value */ -#define BAD_KEY_MAT -2 /* Key material not of correct length */ -#define BAD_KEY_INSTANCE -3 /* Key passed is not valid */ -#define BAD_CIPHER_MODE -4 /* Params struct passed to cipherInit invalid */ -#define BAD_CIPHER_STATE -5 /* Cipher in wrong state (e.g., not initialized) */ -#define BAD_BLOCK_LENGTH -6 -#define BAD_CIPHER_INSTANCE -7 -#define BAD_DATA -8 /* Data contents are invalid, e.g., invalid padding */ -#define BAD_OTHER -9 /* Unknown error */ - -/* CHANGE POSSIBLE: inclusion of algorithm specific defines */ -#define MAX_KEY_SIZE 64 /* # of ASCII char's needed to represent a key */ -#define MAX_IV_SIZE 16 /* # bytes needed to represent an IV */ - -/* Typedefs: - - Typedef'ed data storage elements. Add any algorithm specific -parameters at the bottom of the structs as appropriate. -*/ - -/* The structure for key information */ -typedef struct { - u_int8_t direction; /* Key used for encrypting or decrypting?
*/ - int keyLen; /* Length of the key */ - char keyMaterial[MAX_KEY_SIZE+1]; /* Raw key data in ASCII, e.g., user input or KAT values */ - /* The following parameters are algorithm dependent, replace or add as necessary */ - int ROUNDS; /* key-length-dependent number of rounds */ - int blockLen; /* block length */ - union { - u_int8_t xkS8[RIJNDAEL_MAXROUNDS+1][4][4]; /* key schedule */ - u_int32_t xkS32[RIJNDAEL_MAXROUNDS+1][4]; /* key schedule */ - } xKeySched; -#define keySched xKeySched.xkS8 -} keyInstance; - -/* The structure for cipher information */ -typedef struct { /* changed order of the components */ - u_int8_t mode; /* MODE_ECB, MODE_CBC, or MODE_CFB1 */ - u_int8_t IV[MAX_IV_SIZE]; /* A possible Initialization Vector for ciphering */ - /* Add any algorithm specific parameters needed here */ - int blockLen; /* Sample: Handles non-128 bit block sizes (if available) */ -} cipherInstance; - -/* Function prototypes */ -/* CHANGED: nothing - TODO: implement the following extensions to setup 192-bit and 256-bit block lengths: - makeKeyEx(): parameter blockLen added - -- this parameter is absolutely necessary if you want to - setup the round keys in a variable block length setting - cipherInitEx(): parameter blockLen added (for obvious reasons) - */ - -int rijndael_makeKey(keyInstance *key, u_int8_t direction, int keyLen, char *keyMaterial); - -int rijndael_cipherInit(cipherInstance *cipher, u_int8_t mode, char *IV); - -int rijndael_blockEncrypt(cipherInstance *cipher, keyInstance *key, - u_int8_t *input, int inputLen, u_int8_t *outBuffer); - -int rijndael_padEncrypt(cipherInstance *cipher, keyInstance *key, - u_int8_t *input, int inputOctets, u_int8_t *outBuffer); - -int rijndael_blockDecrypt(cipherInstance *cipher, keyInstance *key, - u_int8_t *input, int inputLen, u_int8_t *outBuffer); - -int rijndael_padDecrypt(cipherInstance *cipher, keyInstance *key, - u_int8_t *input, int inputOctets, u_int8_t *outBuffer); - -#ifdef INTERMEDIATE_VALUE_KAT -int rijndael_cipherUpdateRounds(cipherInstance *cipher, keyInstance *key, - u_int8_t *input, int inputLen, u_int8_t *outBuffer, int Rounds); -#endif /* INTERMEDIATE_VALUE_KAT */ - -#endif /* __RIJNDAEL_API_FST_H */ diff --git a/bsd/crypto/rijndael/rijndael.h b/bsd/crypto/rijndael/rijndael.h deleted file mode 100644 index 8dafa3b71..000000000 --- a/bsd/crypto/rijndael/rijndael.h +++ /dev/null @@ -1,4 +0,0 @@ -/* $KAME: rijndael.h,v 1.2 2000/10/02 17:14:27 itojun Exp $ */ -/* $FreeBSD: src/sys/crypto/rijndael/rijndael.h,v 1.1.1.1.2.1 2001/07/03 11:01:36 ume Exp $ */ - -#include <crypto/rijndael/rijndael-api-fst.h> diff --git a/bsd/crypto/rijndael/rijndael_local.h b/bsd/crypto/rijndael/rijndael_local.h deleted file mode 100644 index 81e79604a..000000000 --- a/bsd/crypto/rijndael/rijndael_local.h +++ /dev/null @@ -1,11 +0,0 @@ -/* $KAME: rijndael_local.h,v 1.3 2000/10/02 17:14:27 itojun Exp $ */ -/* $FreeBSD: src/sys/crypto/rijndael/rijndael_local.h,v 1.3.2.1 2001/07/03 11:01:36 ume Exp $ */ - -/* the file should not be used from outside */ -typedef u_int8_t BYTE; -typedef u_int8_t word8; -typedef u_int16_t word16; -typedef u_int32_t word32; - -#define MAXKC RIJNDAEL_MAXKC -#define MAXROUNDS RIJNDAEL_MAXROUNDS diff --git a/bsd/crypto/sha1.c b/bsd/crypto/sha1.c index c5c7b27cd..cf6bbe72a 100644 --- a/bsd/crypto/sha1.c +++ b/bsd/crypto/sha1.c @@ -84,7 +84,7 @@ static u_int32_t _K[] = { 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 }; sha1_step(ctxt); \ } -static void sha1_step __P((struct sha1_ctxt *)); +static void sha1_step(struct sha1_ctxt *); 
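
This hunk, like the sha1.h and sha2.h hunks below, drops the pre-ANSI __P() portability wrapper in favor of plain prototypes. For context, the wrapper's conventional definition in <sys/cdefs.h> looks like this (a sketch; the doubled parentheses at call sites exist so the whole parameter list passes as a single macro argument):

    #if defined(__STDC__) || defined(__cplusplus)
    #define __P(protos)     protos          /* ANSI C: keep the prototype */
    #else
    #define __P(protos)     ()              /* K&R C: discard the list */
    #endif

Under any ANSI compiler, `static void sha1_step __P((struct sha1_ctxt *));` therefore already expanded to exactly the new declaration; the change removes dead portability machinery rather than altering the compiled interface.
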
static void sha1_step(ctxt) diff --git a/bsd/crypto/sha1.h b/bsd/crypto/sha1.h index a3ee2d834..f5dbac6eb 100644 --- a/bsd/crypto/sha1.h +++ b/bsd/crypto/sha1.h @@ -55,10 +55,10 @@ struct sha1_ctxt { }; #ifdef KERNEL -extern void sha1_init __P((struct sha1_ctxt *)); -extern void sha1_pad __P((struct sha1_ctxt *)); -extern void sha1_loop __P((struct sha1_ctxt *, const u_int8_t *, size_t)); -extern void sha1_result __P((struct sha1_ctxt *, caddr_t)); +extern void sha1_init(struct sha1_ctxt *); +extern void sha1_pad(struct sha1_ctxt *); +extern void sha1_loop(struct sha1_ctxt *, const u_int8_t *, size_t); +extern void sha1_result(struct sha1_ctxt *, caddr_t); /* compatibilty with other SHA1 source codes */ typedef struct sha1_ctxt SHA1_CTX; diff --git a/bsd/crypto/sha2/Makefile b/bsd/crypto/sha2/Makefile index 95aff4dee..72820c951 100644 --- a/bsd/crypto/sha2/Makefile +++ b/bsd/crypto/sha2/Makefile @@ -26,7 +26,7 @@ INSTALL_MI_DIR = crypto EXPORT_MI_DIR = ${INSTALL_MI_DIR} -INSTALL_MI_LCL_KERN_LIST = ${PRIVATE_DATAFILES} +INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES} include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/crypto/sha2/sha2.h b/bsd/crypto/sha2/sha2.h index 1f063d26e..3997e63f1 100644 --- a/bsd/crypto/sha2/sha2.h +++ b/bsd/crypto/sha2/sha2.h @@ -115,23 +115,23 @@ typedef SHA512_CTX SHA384_CTX; /*** SHA-256/384/512 Function Prototypes ******************************/ -void SHA256_Init __P((SHA256_CTX *)); -void SHA256_Update __P((SHA256_CTX*, const u_int8_t*, size_t)); -void SHA256_Final __P((u_int8_t[SHA256_DIGEST_LENGTH], SHA256_CTX*)); -char* SHA256_End __P((SHA256_CTX*, char[SHA256_DIGEST_STRING_LENGTH])); -char* SHA256_Data __P((const u_int8_t*, size_t, char[SHA256_DIGEST_STRING_LENGTH])); - -void SHA384_Init __P((SHA384_CTX*)); -void SHA384_Update __P((SHA384_CTX*, const u_int8_t*, size_t)); -void SHA384_Final __P((u_int8_t[SHA384_DIGEST_LENGTH], SHA384_CTX*)); -char* SHA384_End __P((SHA384_CTX*, char[SHA384_DIGEST_STRING_LENGTH])); -char* SHA384_Data __P((const u_int8_t*, size_t, char[SHA384_DIGEST_STRING_LENGTH])); - -void SHA512_Init __P((SHA512_CTX*)); -void SHA512_Update __P((SHA512_CTX*, const u_int8_t*, size_t)); -void SHA512_Final __P((u_int8_t[SHA512_DIGEST_LENGTH], SHA512_CTX*)); -char* SHA512_End __P((SHA512_CTX*, char[SHA512_DIGEST_STRING_LENGTH])); -char* SHA512_Data __P((const u_int8_t*, size_t, char[SHA512_DIGEST_STRING_LENGTH])); +void SHA256_Init(SHA256_CTX *); +void SHA256_Update(SHA256_CTX*, const u_int8_t*, size_t); +void SHA256_Final(u_int8_t[SHA256_DIGEST_LENGTH], SHA256_CTX*); +char* SHA256_End(SHA256_CTX*, char[SHA256_DIGEST_STRING_LENGTH]); +char* SHA256_Data(const u_int8_t*, size_t, char[SHA256_DIGEST_STRING_LENGTH]); + +void SHA384_Init(SHA384_CTX*); +void SHA384_Update(SHA384_CTX*, const u_int8_t*, size_t); +void SHA384_Final(u_int8_t[SHA384_DIGEST_LENGTH], SHA384_CTX*); +char* SHA384_End(SHA384_CTX*, char[SHA384_DIGEST_STRING_LENGTH]); +char* SHA384_Data(const u_int8_t*, size_t, char[SHA384_DIGEST_STRING_LENGTH]); + +void SHA512_Init(SHA512_CTX*); +void SHA512_Update(SHA512_CTX*, const u_int8_t*, size_t); +void SHA512_Final(u_int8_t[SHA512_DIGEST_LENGTH], SHA512_CTX*); +char* SHA512_End(SHA512_CTX*, char[SHA512_DIGEST_STRING_LENGTH]); +char* SHA512_Data(const u_int8_t*, size_t, char[SHA512_DIGEST_STRING_LENGTH]); #ifdef __cplusplus } diff --git a/bsd/dev/Makefile b/bsd/dev/Makefile index 8e8cf9321..66b0e557c 100644 --- a/bsd/dev/Makefile +++ b/bsd/dev/Makefile @@ -19,14 +19,11 @@ EXPINC_SUBDIRS_PPC = \ EXPINC_SUBDIRS_I386 = \ -DATAFILES = 
\ - disk.h disk_label.h kmreg_com.h - -INSTALL_MI_LIST = ${DATAFILES} +INSTALL_MI_LIST = INSTALL_MI_DIR = dev -EXPORT_MI_LIST = ${DATAFILES} +EXPORT_MI_LIST = EXPORT_MI_DIR = dev diff --git a/bsd/dev/i386/conf.c b/bsd/dev/i386/conf.c index 1d872abbd..0e5ea99b4 100644 --- a/bsd/dev/i386/conf.c +++ b/bsd/dev/i386/conf.c @@ -35,13 +35,15 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/buf.h> #include <sys/ioctl.h> #include <sys/tty.h> #include <sys/conf.h> - -extern int nulldev(); +/* Prototypes that should be elsewhere: */ +extern int isdisk(dev_t dev, int type); +extern dev_t chrtoblk(dev_t dev); +extern int chrtoblk_set(int cdev, int bdev); +extern int iskmemdev(dev_t dev); struct bdevsw bdevsw[] = { @@ -89,32 +91,62 @@ struct bdevsw bdevsw[] = int nblkdev = sizeof (bdevsw) / sizeof (bdevsw[0]); extern struct tty *km_tty[]; -extern int cnopen(),cnclose(),cnread(),cnwrite(),cnioctl(), - cnselect(),cngetc(), cnputc(dev_t dev, char c); -extern int kmopen(),kmclose(),kmread(),kmwrite(),kmioctl(), - kmgetc(), kmputc(dev_t dev, char c); -extern int sgopen(),sgclose(), sgioctl(); +extern d_open_t cnopen; +extern d_close_t cnclose; +extern d_read_t cnread; +extern d_write_t cnwrite; +extern d_ioctl_t cnioctl; +extern d_select_t cnselect; +extern d_getc_t cngetc; +extern d_putc_t cnputc; +extern d_open_t kmopen; +extern d_close_t kmclose; +extern d_read_t kmread; +extern d_write_t kmwrite; +extern d_ioctl_t kmioctl; +extern d_getc_t kmgetc; +extern d_putc_t kmputc; +extern d_open_t sgopen; +extern d_close_t sgclose; +extern d_ioctl_t sgioctl; #if NVOL > 0 -extern int volopen(),volclose(),volioctl(); +extern d_open_t volopen; +extern d_close_t volclose; +extern d_ioctl_t volioctl; #else #define volopen eno_opcl #define volclose eno_opcl #define volioctl eno_ioctl #endif -extern int cttyopen(), cttyread(), cttywrite(), cttyioctl(), cttyselect(); +extern d_open_t cttyopen; +extern d_read_t cttyread; +extern d_write_t cttywrite; +extern d_ioctl_t cttyioctl; +extern d_select_t cttyselect; -extern int mmread(),mmwrite(); +extern d_read_t mmread; +extern d_write_t mmwrite; +extern d_ioctl_t mmioctl; #define mmselect (select_fcn_t *)seltrue #define mmmmap eno_mmap #include <pty.h> #if NPTY > 0 extern struct tty *pt_tty[]; -extern int ptsopen(),ptsclose(),ptsread(),ptswrite(),ptsstop(),ptsputc(); -extern int ptcopen(),ptcclose(),ptcread(),ptcwrite(),ptcselect(), - ptyioctl(); +extern d_open_t ptsopen; +extern d_close_t ptsclose; +extern d_read_t ptsread; +extern d_write_t ptswrite; +extern d_stop_t ptsstop; +extern d_putc_t ptsputc; +extern d_open_t ptcopen; +extern d_close_t ptcclose; +extern d_read_t ptcread; +extern d_write_t ptcwrite; +extern d_select_t ptcselect; +extern d_ioctl_t ptyioctl; #else #define ptsopen eno_opcl #define ptsclose eno_opcl @@ -131,9 +163,25 @@ extern int ptcopen(),ptcclose(),ptcread(),ptcwrite(),ptcselect(), #define ptyioctl eno_ioctl #endif -extern int logopen(),logclose(),logread(),logioctl(),logselect(); -extern int fdesc_open(), fdesc_read(), fdesc_write(), - fdesc_ioctl(), fdesc_select(); +extern d_open_t logopen; +extern d_close_t logclose; +extern d_read_t logread; +extern d_ioctl_t logioctl; +extern d_select_t logselect; +extern d_open_t fdesc_open; +extern d_read_t fdesc_read; +extern d_write_t fdesc_write; +extern d_ioctl_t fdesc_ioctl; +extern d_select_t fdesc_select; + +#define nullopen (d_open_t *)&nulldev +#define nullclose (d_close_t *)&nulldev +#define nullread (d_read_t *)&nulldev +#define nullwrite (d_write_t *)&nulldev +#define nullioctl 
(d_ioctl_t *)&nulldev +#define nullselect (d_select_t *)&nulldev +#define nullstop (d_stop_t *)&nulldev +#define nullreset (d_reset_t *)&nulldev struct cdevsw cdevsw[] = { @@ -150,33 +198,33 @@ struct cdevsw cdevsw[] = { cnopen, cnclose, cnread, cnwrite, /* 0*/ - cnioctl, nulldev, nulldev, 0, cnselect, + cnioctl, nullstop, nullreset, 0, cnselect, eno_mmap, eno_strat, cngetc, cnputc, D_TTY }, NO_CDEVICE, /* 1*/ { - cttyopen, nulldev, cttyread, cttywrite, /* 2*/ - cttyioctl, nulldev, nulldev, 0, cttyselect, + cttyopen, nullclose, cttyread, cttywrite, /* 2*/ + cttyioctl, nullstop, nullreset, 0, cttyselect, eno_mmap, eno_strat, eno_getc, eno_putc, D_TTY }, { - nulldev, nulldev, mmread, mmwrite, /* 3*/ - eno_ioctl, nulldev, nulldev, 0, mmselect, - mmmmap, eno_strat, eno_getc, eno_putc, 0 + nullopen, nullclose, mmread, mmwrite, /* 3*/ + mmioctl, nullstop, nullreset, 0, mmselect, + mmmmap, eno_strat, eno_getc, eno_putc, D_DISK }, { ptsopen, ptsclose, ptsread, ptswrite, /* 4*/ - ptyioctl, ptsstop, nulldev, pt_tty, ttselect, + ptyioctl, ptsstop, nullreset, pt_tty, ttselect, eno_mmap, eno_strat, eno_getc, eno_putc, D_TTY }, { ptcopen, ptcclose, ptcread, ptcwrite, /* 5*/ - ptyioctl, nulldev, nulldev, 0, ptcselect, + ptyioctl, nullstop, nullreset, 0, ptcselect, eno_mmap, eno_strat, eno_getc, eno_putc, D_TTY }, { logopen, logclose, logread, eno_rdwrt, /* 6*/ - logioctl, eno_stop, nulldev, 0, logselect, + logioctl, eno_stop, nullreset, 0, logselect, eno_mmap, eno_strat, eno_getc, eno_putc, 0 }, NO_CDEVICE, /* 7*/ @@ -186,7 +234,7 @@ struct cdevsw cdevsw[] = NO_CDEVICE, /*11*/ { kmopen, kmclose, kmread, kmwrite, /*12*/ - kmioctl, nulldev, nulldev, km_tty, ttselect, + kmioctl, nullstop, nullreset, km_tty, ttselect, eno_mmap, eno_strat, kmgetc, kmputc, 0 }, NO_CDEVICE, /*13*/ @@ -248,9 +296,7 @@ int nchrdev = sizeof (cdevsw) / sizeof (cdevsw[0]); * return true if a disk */ int -isdisk(dev, type) - dev_t dev; - int type; +isdisk(dev_t dev, int type) { dev_t maj = major(dev); @@ -301,8 +347,7 @@ static int chrtoblktab[] = { * convert chr dev to blk dev */ dev_t -chrtoblk(dev) - dev_t dev; +chrtoblk(dev_t dev) { int blkmaj; @@ -328,9 +373,7 @@ chrtoblk_set(int cdev, int bdev) /* * Returns true if dev is /dev/mem or /dev/kmem. 
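
The conf.c hunks above replace the old untyped `extern int foo()` declarations with the d_*_t function typedefs, so the bdevsw/cdevsw initializers (and the null* fallback casts) are checked against real signatures. The typedefs come from bsd/sys/conf.h, which this patch excerpt does not show; their shapes run along these lines (a sketch; consult conf.h for the authoritative versions):

    #include <sys/types.h>

    struct proc;
    struct tty;
    struct uio;

    typedef int d_open_t(dev_t dev, int flags, int devtype, struct proc *p);
    typedef int d_close_t(dev_t dev, int flags, int devtype, struct proc *p);
    typedef int d_read_t(dev_t dev, struct uio *uio, int ioflag);
    typedef int d_write_t(dev_t dev, struct uio *uio, int ioflag);
    typedef int d_ioctl_t(dev_t dev, u_long cmd, caddr_t data, int fflag,
                          struct proc *p);
    typedef int d_stop_t(struct tty *tp, int rw);
    typedef int d_reset_t(int uban);
    typedef int d_select_t(dev_t dev, int which, void *wql, struct proc *p);
    typedef int d_getc_t(dev_t dev);
    typedef int d_putc_t(dev_t dev, char c);

With these in place, routing the catch-all nulldev into a slot requires an explicit cast (the nullstop/nullreset defines above), which is the point: accidental signature mismatches no longer compile silently.
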
*/ -int iskmemdev(dev) - dev_t dev; +int iskmemdev(dev_t dev) { - return (major(dev) == 3 && minor(dev) < 2); } diff --git a/bsd/dev/i386/cons.c b/bsd/dev/i386/cons.c index 118878bd1..43f98d1ef 100644 --- a/bsd/dev/i386/cons.c +++ b/bsd/dev/i386/cons.c @@ -44,80 +44,111 @@ struct tty cons; struct tty *constty; /* current console device */ +int cnopen(__unused dev_t dev, int flag, int devtype, struct proc *pp); +int cnclose(__unused dev_t dev, int flag, int mode, struct proc *pp); +int cnread(__unused dev_t dev, struct uio *uio, int ioflag); +int cnwrite(__unused dev_t dev, struct uio *uio, int ioflag); +int cnioctl(__unused dev_t dev, int cmd, caddr_t addr, int flg, struct proc *p); +int cnselect(__unused dev_t dev, int flag, void * wql, struct proc *p); + +void slave_cnenable(void); + +int alert( + __unused int width, + __unused int height, + __unused const char *title, + const char *msg, + int p1, int p2, int p3, int p4, int p5, int p6, int p7, int p8); +int alert_done(void); + /*ARGSUSED*/ int -cnopen(dev, flag, devtype, pp) - dev_t dev; - int flag, devtype; - struct proc *pp; +cnopen(__unused dev_t dev, int flag, int devtype, struct proc *pp) { dev_t device; + boolean_t funnel_state; + int error; + + funnel_state = thread_funnel_set(kernel_flock, TRUE); if (constty) device = constty->t_dev; else device = cons.t_dev; - return ((*cdevsw[major(device)].d_open)(device, flag, devtype, pp)); + error = (*cdevsw[major(device)].d_open)(device, flag, devtype, pp); + thread_funnel_set(kernel_flock, funnel_state); + + return(error); } /*ARGSUSED*/ int -cnclose(dev, flag, mode, pp) - dev_t dev; - int flag, mode; - struct proc *pp; +cnclose(__unused dev_t dev, int flag, int mode, struct proc *pp) { dev_t device; + boolean_t funnel_state; + int error; + funnel_state = thread_funnel_set(kernel_flock, TRUE); if (constty) device = constty->t_dev; else device = cons.t_dev; - return ((*cdevsw[major(device)].d_close)(device, flag, mode, pp)); + error = (*cdevsw[major(device)].d_close)(device, flag, mode, pp); + thread_funnel_set(kernel_flock, funnel_state); + + return(error); + + } /*ARGSUSED*/ int -cnread(dev, uio, ioflag) - dev_t dev; - struct uio *uio; - int ioflag; +cnread(__unused dev_t dev, struct uio *uio, int ioflag) { dev_t device; + boolean_t funnel_state; + int error; + funnel_state = thread_funnel_set(kernel_flock, TRUE); if (constty) device = constty->t_dev; else device = cons.t_dev; - return ((*cdevsw[major(device)].d_read)(device, uio, ioflag)); + error = (*cdevsw[major(device)].d_read)(device, uio, ioflag); + thread_funnel_set(kernel_flock, funnel_state); + + return(error); } /*ARGSUSED*/ int -cnwrite(dev, uio, ioflag) - dev_t dev; - struct uio *uio; - int ioflag; +cnwrite(__unused dev_t dev, struct uio *uio, int ioflag) { dev_t device; + boolean_t funnel_state; + int error; + funnel_state = thread_funnel_set(kernel_flock, TRUE); if (constty) device = constty->t_dev; else device = cons.t_dev; - return ((*cdevsw[major(device)].d_write)(device, uio, ioflag)); + error = (*cdevsw[major(device)].d_write)(device, uio, ioflag); + thread_funnel_set(kernel_flock, funnel_state); + + return(error); } /*ARGSUSED*/ int -cnioctl(dev, cmd, addr, flag, p) - dev_t dev; - int cmd; - caddr_t addr; - int flag; - struct proc *p; +cnioctl(__unused dev_t dev, int cmd, caddr_t addr, int flag, struct proc *p) { dev_t device; + boolean_t funnel_state; + int error; + + funnel_state = thread_funnel_set(kernel_flock, TRUE); if (constty) device = constty->t_dev; @@ -127,23 +158,26 @@ cnioctl(dev, cmd, addr, flag, p) * 
Superuser can always use this to wrest control of console * output from the "virtual" console. */ - if (cmd == TIOCCONS && constty) { - int error = suser(p->p_ucred, (u_short *) NULL); - if (error) - return (error); + if ((unsigned) cmd == TIOCCONS && constty) { + error = proc_suser(p); + if (error) { + goto out; + } constty = NULL; - return (0); + error = 0; + goto out; } - return ((*cdevsw[major(device)].d_ioctl)(device, cmd, addr, flag, p)); + error = (*cdevsw[major(device)].d_ioctl)(device, cmd, addr, flag, p); +out: + thread_funnel_set(kernel_flock, funnel_state); + + return(error); } /*ARGSUSED*/ +/* called with funnel held */ int -cnselect(dev, flag, wql, p) - dev_t dev; - int flag; - void * wql; - struct proc *p; +cnselect(__unused dev_t dev, int flag, void * wql, struct proc *p) { dev_t device; @@ -159,12 +193,18 @@ int cngetc() { dev_t device; + boolean_t funnel_state; + int error; + funnel_state = thread_funnel_set(kernel_flock, TRUE); if (constty) device = constty->t_dev; else device = cons.t_dev; - return ((*cdevsw[major(device)].d_getc)(device)); + error = (*cdevsw[major(device)].d_getc)(device); + thread_funnel_set(kernel_flock, funnel_state); + + return(error); } /*ARGSUSED*/ @@ -173,21 +213,26 @@ cnputc(c) char c; { dev_t device; + boolean_t funnel_state; + int error; + funnel_state = thread_funnel_set(kernel_flock, TRUE); if (constty) device = constty->t_dev; else device = cons.t_dev; - return ((*cdevsw[major(device)].d_putc)(device, c)); + error = (*cdevsw[major(device)].d_putc)(device, c); + thread_funnel_set(kernel_flock, funnel_state); + + return(error); } #endif -#if NCPUS > 1 -slave_cnenable() +void +slave_cnenable(void) { /* FIXME: what to do here? */ } -#endif NCPUS > 1 #if 0 void @@ -207,9 +252,9 @@ kprintf( const char *format, ...) */ int alert( - int width, - int height, - const char *title, + __unused int width, + __unused int height, + __unused const char *title, const char *msg, int p1, int p2, @@ -233,7 +278,7 @@ alert( } int -alert_done() +alert_done(void) { /* DoRestore(); */ return 0; diff --git a/bsd/dev/i386/cons.h b/bsd/dev/i386/cons.h index 00d91a155..e14004aa2 100644 --- a/bsd/dev/i386/cons.h +++ b/bsd/dev/i386/cons.h @@ -24,14 +24,14 @@ */ struct consdev { - char *cn_name; /* name of device in dev_name_list */ - int (*cn_probe)(); /* probe hardware and fill in consdev info */ - int (*cn_init)(); /* turn on as console */ - int (*cn_getc)(); /* kernel getchar interface */ - int (*cn_putc)(); /* kernel putchar interface */ - struct tty *cn_tp; /* tty structure for console device */ - dev_t cn_dev; /* major/minor of device */ - short cn_pri; /* pecking order; the higher the better */ + char *cn_name; /* name of device in dev_name_list */ + int (*cn_probe)(void); /* probe hardware, fill consdev info */ + int (*cn_init)(void); /* turn on as console */ + int (*cn_getc)(void); /* kernel getchar interface */ + int (*cn_putc)(void); /* kernel putchar interface */ + struct tty *cn_tp; /* tty structure for console device */ + dev_t cn_dev; /* major/minor of device */ + short cn_pri; /* pecking order; higher the better */ }; /* values for cn_pri - reflect our policy for console selection */ diff --git a/bsd/dev/i386/kern_machdep.c b/bsd/dev/i386/kern_machdep.c index 0c3684b58..a78df67ed 100644 --- a/bsd/dev/i386/kern_machdep.c +++ b/bsd/dev/i386/kern_machdep.c @@ -26,78 +26,30 @@ * Author: John Seamons * * Machine-specific kernel routines. - * - * 8-Dec-91 Peter King (king) at NeXT - * Added grade_cpu_subtype(). 
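Every cons.c conversion above follows one pattern: take the kernel funnel, pick the live console device, call through cdevsw, and leave by a single exit so the funnel is always restored. Condensed from the cnioctl() hunk (a sketch reusing the declarations from the surrounding file, not new code in the patch):

int
cnioctl(__unused dev_t dev, int cmd, caddr_t addr, int flag, struct proc *p)
{
	dev_t device;
	boolean_t funnel_state;
	int error;

	funnel_state = thread_funnel_set(kernel_flock, TRUE);	/* take funnel */
	device = constty ? constty->t_dev : cons.t_dev;		/* live console */

	if ((unsigned)cmd == TIOCCONS && constty) {
		error = proc_suser(p);	/* only the superuser may steal output */
		if (error)
			goto out;
		constty = NULL;
		error = 0;
		goto out;
	}
	error = (*cdevsw[major(device)].d_ioctl)(device, cmd, addr, flag, p);
out:
	thread_funnel_set(kernel_flock, funnel_state);	/* restore prior state */
	return (error);
}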
- * FIXME: Do we want to merge this with check_cpu_subtype()? - * - * 5-Mar-90 John Seamons (jks) at NeXT - * Created. */ #include <sys/types.h> #include <mach/machine.h> #include <kern/cpu_number.h> -check_cpu_subtype (cpu_subtype) - cpu_subtype_t cpu_subtype; -{ - struct machine_slot *ms = &machine_slot[cpu_number()]; - - switch (ms->cpu_subtype) { - case CPU_SUBTYPE_386: - if (cpu_subtype == CPU_SUBTYPE_386) - return (TRUE); - break; - - case CPU_SUBTYPE_486: - case CPU_SUBTYPE_486SX: - if ( cpu_subtype == CPU_SUBTYPE_486 || - cpu_subtype == CPU_SUBTYPE_486SX || - cpu_subtype == CPU_SUBTYPE_386 ) - return (TRUE); - break; - - case CPU_SUBTYPE_586: - if ( cpu_subtype == CPU_SUBTYPE_586 || - cpu_subtype == CPU_SUBTYPE_486 || - cpu_subtype == CPU_SUBTYPE_486SX || - cpu_subtype == CPU_SUBTYPE_386 ) - return (TRUE); - break; - - default: - if ( CPU_SUBTYPE_INTEL_MODEL(cpu_subtype) == - CPU_SUBTYPE_INTEL_MODEL_ALL) { - if ( CPU_SUBTYPE_INTEL_FAMILY(ms->cpu_subtype) >= - CPU_SUBTYPE_INTEL_FAMILY(cpu_subtype)) - return (TRUE); - } - else { - if ( ms->cpu_subtype == cpu_subtype) - return (TRUE); - } - break; - } - - return (FALSE); -} +extern int grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype); /********************************************************************** - * Routine: grade_cpu_subtype() + * Routine: grade_binary() * - * Function: Return a relative preference for cpu_subtypes in fat - * executable files. The higher the grade, the higher the - * preference. A grade of 0 means not acceptable. + * Function: Return a relative preference for exectypes and + * execsubtypes in fat executable files. The higher the + * grade, the higher the preference. A grade of 0 means + * not acceptable. **********************************************************************/ -grade_cpu_subtype (cpu_subtype) - cpu_subtype_t cpu_subtype; +int +grade_binary(__unused cpu_type_t exectype, cpu_subtype_t execsubtype) { - struct machine_slot *ms = &machine_slot[cpu_number()]; + int cpusubtype = cpu_subtype(); - switch (ms->cpu_subtype) { + switch (cpusubtype) { case CPU_SUBTYPE_386: - switch (cpu_subtype) { + switch (execsubtype) { case CPU_SUBTYPE_386: return 1; default: @@ -105,7 +57,7 @@ grade_cpu_subtype (cpu_subtype) } case CPU_SUBTYPE_486: - switch (cpu_subtype) { + switch (execsubtype) { case CPU_SUBTYPE_386: return 1; @@ -120,7 +72,7 @@ grade_cpu_subtype (cpu_subtype) } case CPU_SUBTYPE_486SX: - switch (cpu_subtype) { + switch (execsubtype) { case CPU_SUBTYPE_386: return 1; @@ -135,7 +87,7 @@ grade_cpu_subtype (cpu_subtype) } case CPU_SUBTYPE_586: - switch (cpu_subtype) { + switch (execsubtype) { case CPU_SUBTYPE_386: return 1; @@ -153,18 +105,28 @@ grade_cpu_subtype (cpu_subtype) } default: - if ( CPU_SUBTYPE_INTEL_MODEL(cpu_subtype) == + if ( CPU_SUBTYPE_INTEL_MODEL(execsubtype) == CPU_SUBTYPE_INTEL_MODEL_ALL) { - if ( CPU_SUBTYPE_INTEL_FAMILY(ms->cpu_subtype) >= - CPU_SUBTYPE_INTEL_FAMILY(cpu_subtype)) + if ( CPU_SUBTYPE_INTEL_FAMILY(cpusubtype) >= + CPU_SUBTYPE_INTEL_FAMILY(execsubtype)) return CPU_SUBTYPE_INTEL_FAMILY_MAX - - CPU_SUBTYPE_INTEL_FAMILY(ms->cpu_subtype) - - CPU_SUBTYPE_INTEL_FAMILY(cpu_subtype); + CPU_SUBTYPE_INTEL_FAMILY(cpusubtype) - + CPU_SUBTYPE_INTEL_FAMILY(execsubtype); } else { - if ( ms->cpu_subtype == cpu_subtype) + if ( cpusubtype == execsubtype) return CPU_SUBTYPE_INTEL_FAMILY_MAX + 1; } return 0; } } + +extern void md_prepare_for_shutdown(int, int, char *); + +void +md_prepare_for_shutdown( + __unused int paniced, + __unused int howto, + __unused char * command) +{ 
+} diff --git a/bsd/dev/i386/km.c b/bsd/dev/i386/km.c index 048cb3951..6de6ff0bb 100644 --- a/bsd/dev/i386/km.c +++ b/bsd/dev/i386/km.c @@ -41,10 +41,25 @@ #include <dev/kmreg_com.h> #include <pexpert/pexpert.h> +extern int hz; + +extern void cnputcusr(char); +extern int cngetc(void); + +void kminit(void); +int kmopen(dev_t dev, int flag, int devtype, struct proc *pp); +int kmclose(dev_t dev, int flag, int mode, struct proc *p); +int kmread(dev_t dev, struct uio *uio, int ioflag); +int kmwrite(dev_t dev, struct uio *uio, int ioflag); +int kmioctl(dev_t dev, int cmd, caddr_t data, int flag, struct proc *p); +int kmputc(int c); +int kmgetc(dev_t dev); +int kmgetc_silent(dev_t dev); +void cons_cinput(char ch); + /* * 'Global' variables, shared only by this file and conf.c. */ -extern struct tty cons; struct tty *km_tty[1] = { &cons }; /* @@ -63,9 +78,10 @@ static void kmstart(struct tty *tp); extern void KeyboardOpen(void); -int kminit() +void +kminit(void) { - cons.t_dev = makedev(12, 0); + cons.t_dev = makedev(12, 0); initialized = 1; } /* @@ -75,10 +91,9 @@ int kmopen( dev_t dev, int flag, - int devtype, + __unused int devtype, struct proc *pp) { - int rtn; int unit; struct tty *tp; struct winsize *wp; @@ -101,7 +116,7 @@ kmopen( tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED; termioschars(&tp->t_termios); ttsetwater(tp); - } else if ((tp->t_state & TS_XCLUDE) && pp->p_ucred->cr_uid != 0) + } else if ((tp->t_state & TS_XCLUDE) && proc_suser(pp)) return EBUSY; tp->t_state |= TS_CARR_ON; /* lie and say carrier exists and is on. */ @@ -133,10 +148,10 @@ kmopen( int kmclose( - dev_t dev, + __unused dev_t dev, int flag, - int mode, - struct proc *p) + __unused int mode, + __unused struct proc *p) { struct tty *tp; @@ -149,7 +164,7 @@ kmclose( int kmread( - dev_t dev, + __unused dev_t dev, struct uio *uio, int ioflag) { @@ -161,7 +176,7 @@ kmread( int kmwrite( - dev_t dev, + __unused dev_t dev, struct uio *uio, int ioflag) { @@ -173,7 +188,7 @@ kmwrite( int kmioctl( - dev_t dev, + __unused dev_t dev, int cmd, caddr_t data, int flag, @@ -209,16 +224,9 @@ kmioctl( } default: error = (*linesw[tp->t_line].l_ioctl)(tp, cmd, data, flag, p); - if (error >= 0) { + if (ENOTTY != error) return error; - } - error = ttioctl (tp, cmd, data, flag, p); - if (error >= 0) { - return error; - } - else { - return ENOTTY; - } + return ttioctl (tp, cmd, data, flag, p); } } @@ -234,16 +242,16 @@ kmputc( return( 0); if(c == '\n') - cnputc('\r'); + cnputcusr('\r'); - cnputc(c); + cnputcusr(c); return 0; } int kmgetc( - dev_t dev) + __unused dev_t dev) { int c; @@ -252,13 +260,13 @@ kmgetc( if (c == '\r') { c = '\n'; } - cnputc(c); + cnputcusr(c); return c; } int kmgetc_silent( - dev_t dev) + __unused dev_t dev) { int c; @@ -279,31 +287,17 @@ static void kmstart( struct tty *tp) { - extern int hz; if (tp->t_state & (TS_TIMEOUT | TS_BUSY | TS_TTSTOP)) goto out; if (tp->t_outq.c_cc == 0) goto out; tp->t_state |= TS_BUSY; - if (tp->t_outq.c_cc > tp->t_lowat) { - /* - * Start immediately. - */ - kmoutput(tp); - } - else { - /* - * Wait a bit... 
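The kmioctl() default case above fixes the fallback logic: the old code treated any non-negative return from the line discipline as handled, while the new code treats ENOTTY as the only "not my ioctl" answer and only then falls back to the generic tty handler. Condensed into a helper (the helper name is illustrative; the calls are the ones in the hunk):

static int
km_ioctl_fallback(struct tty *tp, int cmd, caddr_t data, int flag,
    struct proc *p)
{
	int error;

	error = (*linesw[tp->t_line].l_ioctl)(tp, cmd, data, flag, p);
	if (error != ENOTTY)
		return (error);		/* discipline handled it, or really failed */
	return (ttioctl(tp, cmd, data, flag, p));	/* generic tty ioctls */
}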
- */ -#if 0 - /* FIXME */ - timeout(kmtimeout, tp, hz); -#else - kmoutput(tp); -#endif - } + kmoutput(tp); + return; + out: - ttwwakeup(tp); + (*linesw[tp->t_line].l_start)(tp); + return; } static void @@ -328,7 +322,6 @@ kmoutput( char buf[80]; char *cp; int cc = -1; - extern int hz; while (tp->t_outq.c_cc > 0) { @@ -345,17 +338,16 @@ kmoutput( timeout(kmtimeout, tp, hz); } tp->t_state &= ~TS_BUSY; - ttwwakeup(tp); + (*linesw[tp->t_line].l_start)(tp); return 0; } + +void cons_cinput(char ch) { struct tty *tp = &cons; - boolean_t funnel_state; - (*linesw[tp->t_line].l_rint) (ch, tp); - } diff --git a/bsd/dev/i386/mem.c b/bsd/dev/i386/mem.c index 5c3422a64..892be2473 100644 --- a/bsd/dev/i386/mem.c +++ b/bsd/dev/i386/mem.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -70,52 +70,67 @@ #include <sys/param.h> #include <sys/dir.h> #include <sys/proc.h> -#include <sys/buf.h> #include <sys/systm.h> #include <sys/vm.h> -#include <sys/uio.h> +#include <sys/uio_internal.h> #include <sys/malloc.h> #include <mach/vm_types.h> #include <mach/vm_param.h> #include <vm/vm_kern.h> /* for kernel_map */ +extern vm_offset_t kvtophys(vm_offset_t va); +extern boolean_t kernacc(off_t, size_t ); + static caddr_t devzerobuf; -mmread(dev, uio) - dev_t dev; - struct uio *uio; +int mmread(dev_t dev, struct uio *uio); +int mmwrite(dev_t dev, struct uio *uio); +int mmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p); +int mmrw(dev_t dev, struct uio *uio, enum uio_rw rw); + +int +mmread(dev_t dev, struct uio *uio) { return (mmrw(dev, uio, UIO_READ)); } -mmwrite(dev, uio) - dev_t dev; - struct uio *uio; +int +mmwrite(dev_t dev, struct uio *uio) { return (mmrw(dev, uio, UIO_WRITE)); } -mmrw(dev, uio, rw) - dev_t dev; - struct uio *uio; - enum uio_rw rw; +int +mmioctl(__unused dev_t dev, u_long cmd, __unused caddr_t data, + __unused int flag, __unused struct proc *p) +{ + switch (cmd) { + case FIONBIO: + case FIOASYNC: + /* OK to do nothing: we always return immediately */ + break; + default: + return ENODEV; + } + + return (0); +} + +int +mmrw(dev_t dev, struct uio *uio, enum uio_rw rw) { register int o; register u_int c, v; - register struct iovec *iov; int error = 0; vm_offset_t where; - int spl; vm_size_t size; - extern boolean_t kernacc(off_t, size_t ); - while (uio->uio_resid > 0 && error == 0) { - iov = uio->uio_iov; - if (iov->iov_len == 0) { - uio->uio_iov++; + while (uio_resid(uio) > 0 && error == 0) { + if (uio_iov_len(uio) == 0) { + uio_next_iov(uio); uio->uio_iovcnt--; if (uio->uio_iovcnt < 0) panic("mmrw"); @@ -135,7 +150,8 @@ mmrw(dev, uio, rw) goto fault; } o = uio->uio_offset - v; - c = min(PAGE_SIZE - o, (u_int)iov->iov_len); + // LP64todo - fix this! 
+ c = min(PAGE_SIZE - o, (u_int)uio_iov_len(uio)); error = uiomove((caddr_t) (where + o), c, uio); kmem_free(kernel_map, where, PAGE_SIZE); continue; @@ -146,17 +162,18 @@ mmrw(dev, uio, rw) if (((vm_address_t)uio->uio_offset >= VM_MAX_KERNEL_ADDRESS) || ((vm_address_t)uio->uio_offset <= VM_MIN_KERNEL_ADDRESS)) goto fault; - c = iov->iov_len; + c = uio_iov_len(uio); if (!kernacc(uio->uio_offset, c)) goto fault; - error = uiomove((caddr_t)uio->uio_offset, (int)c, uio); + error = uiomove((caddr_t)(uintptr_t)uio->uio_offset, + (int)c, uio); continue; /* minor device 2 is EOF/RATHOLE */ case 2: if (rw == UIO_READ) return (0); - c = iov->iov_len; + c = uio_iov_len(uio); break; case 3: if(devzerobuf == NULL) { @@ -164,10 +181,11 @@ mmrw(dev, uio, rw) bzero(devzerobuf, PAGE_SIZE); } if(uio->uio_rw == UIO_WRITE) { - c = iov->iov_len; + c = uio_iov_len(uio); break; } - c = min(iov->iov_len, PAGE_SIZE); + // LP64todo - fix this! + c = min(uio_iov_len(uio), PAGE_SIZE); error = uiomove(devzerobuf, (int)c, uio); continue; default: @@ -177,10 +195,10 @@ mmrw(dev, uio, rw) if (error) break; - iov->iov_base += c; - iov->iov_len -= c; + uio_iov_base_add(uio, c); + uio_iov_len_add(uio, -((int)c)); uio->uio_offset += c; - uio->uio_resid -= c; + uio_setresid(uio, (uio_resid(uio) - c)); } return (error); fault: @@ -201,7 +219,7 @@ kernacc( end = start + len; while (base < end) { - if(kvtophys((vm_offset_t)base) == NULL) + if(kvtophys((vm_offset_t)base) == 0ULL) return(FALSE); base += page_size; } diff --git a/bsd/dev/i386/memmove.c b/bsd/dev/i386/memmove.c index 12b0e2070..5ef7f1291 100644 --- a/bsd/dev/i386/memmove.c +++ b/bsd/dev/i386/memmove.c @@ -34,7 +34,7 @@ * */ - +#include <string.h> #if 0 void *memcpy(void *dst, const void *src, unsigned int ulen) @@ -43,7 +43,9 @@ void *memcpy(void *dst, const void *src, unsigned int ulen) return dst; } #endif /* 0 */ -void *memmove(void *dst, const void *src, unsigned int ulen) + +void * +memmove(void *dst, const void *src, size_t ulen) { bcopy(src, dst, ulen); return dst; diff --git a/bsd/dev/i386/stubs.c b/bsd/dev/i386/stubs.c index 1ae89fe67..5a15de0ea 100644 --- a/bsd/dev/i386/stubs.c +++ b/bsd/dev/i386/stubs.c @@ -27,17 +27,22 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/buf.h> #include <sys/ioctl.h> #include <sys/tty.h> #include <sys/conf.h> -#include <sys/proc.h> +#include <sys/kauth.h> +#include <sys/ucred.h> +#include <sys/proc_internal.h> #include <sys/user.h> #include <kern/task.h> #include <kern/thread.h> -#include <kern/thread_act.h> #include <vm/vm_map.h> +/* XXX should be elsewhere (cpeak) */ +extern int set_bsduthreadargs(thread_t, void *, void *); +extern void *get_bsduthreadarg(thread_t); +extern int *get_bsduthreadrval(thread_t); +extern int *get_bsduthreadlowpridelay(thread_t); /* * copy a null terminated string from the kernel address space into @@ -50,11 +55,11 @@ * the number of bytes copied is always returned in lencopied. 
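The mmrw() rewrite above is the template for the new opaque uio accessors: uio_resid(), uio_iov_len(), uio_next_iov() and friends replace direct uio_iov/uio_resid field pokes, so one routine can serve both 32-bit and 64-bit iovec layouts. A condensed /dev/zero-style read loop in the same style (helper and buffer names illustrative):

static char zeros[PAGE_SIZE];		/* stands in for devzerobuf */

static int
zero_read(struct uio *uio)
{
	u_int c;
	int error = 0;

	while (uio_resid(uio) > 0 && error == 0) {
		if (uio_iov_len(uio) == 0) {	/* current iovec consumed */
			uio_next_iov(uio);
			uio->uio_iovcnt--;
			continue;
		}
		c = min(PAGE_SIZE, (u_int)uio_iov_len(uio));
		error = uiomove(zeros, (int)c, uio);	/* advances base/len/resid */
	}
	return (error);
}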
*/ int -copyoutstr(from, to, maxlen, lencopied) - void * from, * to; - size_t maxlen, *lencopied; +copyoutstr(const void *from, user_addr_t to, size_t maxlen, size_t *lencopied) { - int slen,len,error=0; + size_t slen; + size_t len; + int error = 0; slen = strlen(from) + 1; if (slen > maxlen) @@ -81,42 +86,58 @@ copyoutstr(from, to, maxlen, lencopied) */ /* from ppc/fault_copy.c -Titan1T4 VERSION */ int -copystr(vfrom, vto, maxlen, lencopied) - register void * vfrom, *vto; - size_t maxlen, *lencopied; +copystr(const void *vfrom, void *vto, size_t maxlen, size_t *lencopied) { - register unsigned l; - int error; -caddr_t from, to; - - from = vfrom; - to = vto; - for (l = 0; l < maxlen; l++) - if ((*to++ = *from++) == '\0') { - if (lencopied) - *lencopied = l + 1; - return 0; - } - if (lencopied) - *lencopied = maxlen; - return ENAMETOOLONG; + size_t l; + char const *from = (char const *) vfrom; + char *to = (char *) vto; + + for (l = 0; l < maxlen; l++) { + if ((*to++ = *from++) == '\0') { + if (lencopied) + *lencopied = l + 1; + return 0; + } + } + if (lencopied) + *lencopied = maxlen; + return ENAMETOOLONG; } -int copywithin(src, dst, count) -void * src, *dst; -size_t count; +int +copywithin(void *src, void *dst, size_t count) { bcopy(src,dst,count); return 0; } -set_bsduthreadargs(thread_t th, void * pcb, void *ignored_arg) +int +set_bsduthreadargs(thread_t th, void * pcb, __unused void *ignored_arg) { -struct uthread * ut; + struct uthread * ut; + struct proc *p = current_proc(); ut = get_bsdthread_info(th); ut->uu_ar0 = (int *)pcb; + /* + * Delayed binding of thread credential to process credential. + * + * XXX This doesn't really belong here, but the i386 code has a + * XXX number of seemingly gratuitous structural differences that + * XXX make this the most appropriate place to do the work. + */ + if (ut->uu_ucred != p->p_ucred && + (ut->uu_flag & UT_SETUID) == 0) { + kauth_cred_t old = ut->uu_ucred; + proc_lock(p); + ut->uu_ucred = p->p_ucred; + kauth_cred_ref(ut->uu_ucred); + proc_unlock(p); + if (old != NOCRED) + kauth_cred_rele(old); + } + return(1); } @@ -129,9 +150,17 @@ struct uthread *ut; } int * -get_bsduthreadrval(thread_act_t th) +get_bsduthreadrval(thread_t th) { struct uthread *ut; ut = get_bsdthread_info(th); return(&ut->uu_rval[0]); } + +int * +get_bsduthreadlowpridelay(thread_t th) +{ +struct uthread *ut; + ut = get_bsdthread_info(th); + return(&ut->uu_lowpri_delay); +} diff --git a/bsd/dev/i386/sysctl.c b/bsd/dev/i386/sysctl.c index a0c07910e..e3fe2fac6 100644 --- a/bsd/dev/i386/sysctl.c +++ b/bsd/dev/i386/sysctl.c @@ -20,6 +20,7 @@ * @APPLE_LICENSE_HEADER_END@ */ +#include <string.h> #include <sys/param.h> #include <sys/kernel.h> #include <sys/sysctl.h> @@ -28,31 +29,38 @@ static int hw_cpu_sysctl SYSCTL_HANDLER_ARGS { - i386_cpu_info_t cpu_info; - void *ptr = (uint8_t *)&cpu_info + (uint32_t)arg1; + __unused struct sysctl_oid *unused_oidp = oidp; + i386_cpu_info_t *cpu_info = cpuid_info(); + void *ptr = (uint8_t *)cpu_info + (uint32_t)arg1; int value; - cpuid_get_info(&cpu_info); + if (arg2 == -1) { + ptr = *(char **)ptr; + arg2 = 0; + } + + if (arg2 == 0 && ((char *)ptr)[0] == '\0') { + return ENOENT; + } if (arg2 == sizeof(uint8_t)) { value = (uint32_t) *(uint8_t *)ptr; ptr = &value; arg2 = sizeof(uint32_t); } - return SYSCTL_OUT(req, ptr, arg2 ? arg2 : strlen((char *)ptr)+1); - return 0; + return SYSCTL_OUT(req, ptr, arg2 ? 
(size_t) arg2 : strlen((char *)ptr)+1); } static int hw_cpu_features SYSCTL_HANDLER_ARGS { - i386_cpu_info_t cpu_info; + __unused struct sysctl_oid *unused_oidp = oidp; + __unused void *unused_arg1 = arg1; + __unused int unused_arg2 = arg2; char buf[256]; - vm_size_t size; - cpuid_get_info(&cpu_info); buf[0] = '\0'; - cpuid_get_feature_names(cpu_info.cpuid_features, buf, sizeof(buf)); + cpuid_get_feature_names(cpuid_features(), buf, sizeof(buf)); return SYSCTL_OUT(req, buf, strlen(buf) + 1); } @@ -68,6 +76,10 @@ SYSCTL_PROC(_machdep_cpu, OID_AUTO, brand_string, CTLTYPE_STRING | CTLFLAG_RD, (void *)offsetof(i386_cpu_info_t, cpuid_brand_string), 0, hw_cpu_sysctl, "A", "CPU brand string"); +SYSCTL_PROC(_machdep_cpu, OID_AUTO, model_string, CTLTYPE_STRING | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, cpuid_model_string), -1, + hw_cpu_sysctl, "A", "CPU model string"); + SYSCTL_PROC(_machdep_cpu, OID_AUTO, value, CTLTYPE_INT | CTLFLAG_RD, (void *)offsetof(i386_cpu_info_t, cpuid_value), sizeof(uint32_t), hw_cpu_sysctl, "I", "CPU value"); @@ -104,12 +116,6 @@ SYSCTL_PROC(_machdep_cpu, OID_AUTO, brand, CTLTYPE_INT | CTLFLAG_RD, (void *)offsetof(i386_cpu_info_t, cpuid_brand), sizeof(uint8_t), hw_cpu_sysctl, "I", "CPU brand"); -#if 0 -SYSCTL_PROC(_machdep_cpu, OID_AUTO, model_string, CTLTYPE_STRING | CTLFLAG_RD, - (void *)offsetof(i386_cpu_info_t, model_string), 0, - hw_cpu_sysctl, "A", "CPU model string"); -#endif - SYSCTL_PROC(_machdep_cpu, OID_AUTO, features, CTLTYPE_STRING | CTLFLAG_RD, 0, 0, hw_cpu_features, "A", "CPU feature names"); @@ -120,6 +126,7 @@ struct sysctl_oid *machdep_sysctl_list[] = &sysctl__machdep_cpu, &sysctl__machdep_cpu_vendor, &sysctl__machdep_cpu_brand_string, + &sysctl__machdep_cpu_model_string, &sysctl__machdep_cpu_value, &sysctl__machdep_cpu_family, &sysctl__machdep_cpu_model, diff --git a/bsd/dev/i386/unix_signal.c b/bsd/dev/i386/unix_signal.c index efd73bfb3..ca3b9f2df 100644 --- a/bsd/dev/i386/unix_signal.c +++ b/bsd/dev/i386/unix_signal.c @@ -31,36 +31,31 @@ #include <mach/exception.h> #include <kern/thread.h> -#include <kern/thread_act.h> +#include <sys/systm.h> #include <sys/param.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/user.h> +#include <sys/sysproto.h> +#include <sys/sysent.h> +#include <mach/thread_act.h> /* for thread_abort_safely */ #include <i386/psl.h> +#include <i386/seg.h> #include <mach/i386/thread_status.h> -#include <dev/i386/sel_inline.h> +extern struct i386_saved_state *get_user_regs(thread_t); -/* - * FIXME.. should be included from mach_kernel/i386/seg.h - */ - -#define USER_CS 0x17 -#define USER_DS 0x1f -#define USER_CTHREAD 0x27 - -#define UDATA_SEL USER_DS -#define UCODE_SEL USER_CS -#define UCTHREAD_SEL USER_CTHREAD - -#define valid_user_code_selector(x) (TRUE) -#define valid_user_data_selector(x) (TRUE) -#define valid_user_stack_selector(x) (TRUE) - +extern boolean_t valid_user_segment_selectors(uint16_t cs, + uint16_t ss, + uint16_t ds, + uint16_t es, + uint16_t fs, + uint16_t gs); -#define NULL_SEG 0 +/* Forward: */ +extern boolean_t machine_exception(int, int, int, int *, int *); /* Signal handler flavors supported */ /* These defns should match the Libc implmn */ @@ -76,11 +71,10 @@ * pointer, and the argument pointer, it returns * to the user specified pc, psl. 
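In the sysctl.c hunks above, each machdep.cpu node passes offsetof(i386_cpu_info_t, member) as arg1 and the member size as arg2, so one handler serves every field: arg2 == 0 means a NUL-terminated array, and the new arg2 == -1 convention means the member is a char pointer to follow (which is how model_string is exported). The registration side, taken from the hunk, plus the decode in brief:

SYSCTL_PROC(_machdep_cpu, OID_AUTO, brand_string, CTLTYPE_STRING | CTLFLAG_RD,
    (void *)offsetof(i386_cpu_info_t, cpuid_brand_string), 0,	/* array */
    hw_cpu_sysctl, "A", "CPU brand string");

/* inside hw_cpu_sysctl: */
void *ptr = (uint8_t *)cpuid_info() + (uint32_t)arg1;	/* member address */
if (arg2 == -1) {		/* member is a char *, follow the pointer */
	ptr = *(char **)ptr;
	arg2 = 0;		/* then treat it as NUL-terminated */
}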
*/ - void sendsig(p, catcher, sig, mask, code) struct proc *p; - sig_t catcher; + user_addr_t catcher; /* sig_t */ int sig, mask; u_long code; { @@ -96,13 +90,11 @@ sendsig(p, catcher, sig, mask, code) struct sigacts *ps = p->p_sigacts; int oonstack; thread_t thread = current_thread(); - thread_act_t th_act = current_act(); struct uthread * ut; - struct i386_saved_state * saved_state = (struct i386_saved_state *) - get_user_regs(th_act); + struct i386_saved_state * saved_state = get_user_regs(thread); sig_t trampact; - ut = get_bsdthread_info(th_act); + ut = get_bsdthread_info(thread); oonstack = ps->ps_sigstk.ss_flags & SA_ONSTACK; if ((ps->ps_flags & SAS_ALTSTACK) && !oonstack && (ps->ps_sigonstack & sigmask(sig))) { @@ -116,10 +108,10 @@ sendsig(p, catcher, sig, mask, code) * Build the argument list for the signal handler. */ - trampact = ps->ps_trampact[sig]; + trampact = (sig_t)ps->ps_trampact[sig]; /* Handler should call sigreturn to get out of it */ frame.retaddr = 0xffffffff; - frame.catcher = catcher; + frame.catcher = CAST_DOWN(sig_t,catcher); /* XXX LP64 */ frame.sigstyle = UC_TRAD; frame.sig = sig; @@ -128,19 +120,9 @@ sendsig(p, catcher, sig, mask, code) } else frame.code = 0; frame.scp = scp; - if (copyout((caddr_t)&frame, (caddr_t)fp, sizeof (frame))) + if (copyout((caddr_t)&frame, (user_addr_t)fp, sizeof (frame))) goto bad; -#if PC_SUPPORT - { - PCcontext_t context = threadPCContext(thread); - - if (context && context->running) { - oonstack |= 02; - context->running = FALSE; - } - } -#endif /* * Build the signal context to be used by sigreturn. */ @@ -171,17 +153,17 @@ sendsig(p, catcher, sig, mask, code) context.sc_fs = saved_state->fs; context.sc_gs = saved_state->gs; } - if (copyout((caddr_t)&context, (caddr_t)scp, sizeof (context))) + if (copyout((caddr_t)&context, (user_addr_t)scp, sizeof (context))) goto bad; saved_state->eip = (unsigned int)trampact; - saved_state->cs = UCODE_SEL; + saved_state->cs = USER_CS; saved_state->uesp = (unsigned int)fp; - saved_state->ss = UDATA_SEL; + saved_state->ss = USER_DS; - saved_state->ds = UDATA_SEL; - saved_state->es = UDATA_SEL; + saved_state->ds = USER_DS; + saved_state->es = USER_DS; saved_state->fs = NULL_SEG; saved_state->gs = USER_CTHREAD; return; @@ -207,83 +189,88 @@ bad: * psl to gain improper priviledges or to cause * a machine fault. */ -struct sigreturn_args { - struct sigcontext *sigcntxp; -}; /* ARGSUSED */ int -sigreturn(p, uap, retval) - struct proc *p; - struct sigreturn_args *uap; - int *retval; +sigreturn( + struct proc *p, + struct sigreturn_args *uap, + __unused int *retval) { struct sigcontext context; thread_t thread = current_thread(); - thread_act_t th_act = current_act(); int error; struct i386_saved_state* saved_state = (struct i386_saved_state*) - get_user_regs(th_act); + get_user_regs(thread); struct uthread * ut; - if (saved_state == NULL) - return EINVAL; + if (saved_state == NULL) + return EINVAL; - if (error = copyin((caddr_t)uap->sigcntxp, (caddr_t)&context, - sizeof (context))) - return(error); - ut = (struct uthread *)get_bsdthread_info(th_act); + if ((error = copyin(CAST_USER_ADDR_T(uap->sigcntxp), (void *)&context, + sizeof (context)))) + return(error); - if (context.sc_onstack & 01) - p->p_sigacts->ps_sigstk.ss_flags |= SA_ONSTACK; + /* + * Validate segment selectors. + * Bad values would result in kernel exception at context switch + * back to user mode. If other state is invalid an exception will + * occur in user context. 
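valid_user_segment_selectors() itself lives on the osfmk side and is not part of this hunk; the shape of such a check is roughly that CS and SS must be nonzero selectors requesting user privilege, while data selectors may also be null. An illustrative sketch only, not the kernel's actual predicate:

typedef unsigned short sel_t;

static int
user_selector_ok(sel_t sel, int allow_null)
{
	if (sel == 0)
		return (allow_null);	/* null selector: fine for ds/es/fs/gs */
	return ((sel & 3) == 3);	/* low 2 bits = RPL; must request ring 3 */
}

/* e.g. user_selector_ok(cs, 0) && user_selector_ok(ss, 0) &&
 *      user_selector_ok(ds, 1) && user_selector_ok(es, 1) && ... */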
+ */ + if (!valid_user_segment_selectors(context.sc_cs, + context.sc_ss, + context.sc_ds, + context.sc_es, + context.sc_fs, + context.sc_gs)) { + return EINVAL; + } + + ut = (struct uthread *)get_bsdthread_info(thread); + + if (context.sc_onstack & 01) + p->p_sigacts->ps_sigstk.ss_flags |= SA_ONSTACK; else p->p_sigacts->ps_sigstk.ss_flags &= ~SA_ONSTACK; + ut->uu_sigmask = context.sc_mask &~ sigcantmask; if(ut->uu_siglist & ~ut->uu_sigmask) - signal_setast(current_act()); - saved_state->eax = context.sc_eax; - saved_state->ebx = context.sc_ebx; - saved_state->ecx = context.sc_ecx; - saved_state->edx = context.sc_edx; - saved_state->edi = context.sc_edi; - saved_state->esi = context.sc_esi; - saved_state->ebp = context.sc_ebp; - saved_state->uesp = context.sc_esp; - saved_state->ss = context.sc_ss; - saved_state->efl = context.sc_eflags; - saved_state->efl &= ~EFL_USERCLR; - saved_state->efl |= EFL_USERSET; - saved_state->eip = context.sc_eip; - saved_state->cs = context.sc_cs; - - if (context.sc_eflags & EFL_VM) { - saved_state->ds = NULL_SEG; - saved_state->es = NULL_SEG; - saved_state->fs = NULL_SEG; - saved_state->gs = NULL_SEG; - saved_state->v86_segs.v86_ds = context.sc_ds; - saved_state->v86_segs.v86_es = context.sc_es; - saved_state->v86_segs.v86_fs = context.sc_fs; - saved_state->v86_segs.v86_gs = context.sc_gs; - - saved_state->efl |= EFL_VM; - } - else { - saved_state->ds = context.sc_ds; - saved_state->es = context.sc_es; - saved_state->fs = context.sc_fs; - saved_state->gs = context.sc_gs; - } - -#if PC_SUPPORT - if (context.sc_onstack & 02) { - PCcontext_t context = threadPCContext(thread); - - if (context) - context->running = TRUE; - } -#endif + signal_setast(thread); + + saved_state->eax = context.sc_eax; + saved_state->ebx = context.sc_ebx; + saved_state->ecx = context.sc_ecx; + saved_state->edx = context.sc_edx; + saved_state->edi = context.sc_edi; + saved_state->esi = context.sc_esi; + saved_state->ebp = context.sc_ebp; + saved_state->uesp = context.sc_esp; + saved_state->ss = context.sc_ss; + saved_state->efl = context.sc_eflags; + saved_state->efl &= ~EFL_USERCLR; + saved_state->efl |= EFL_USERSET; + saved_state->eip = context.sc_eip; + saved_state->cs = context.sc_cs; + + if (context.sc_eflags & EFL_VM) { + saved_state->ds = NULL_SEG; + saved_state->es = NULL_SEG; + saved_state->fs = NULL_SEG; + saved_state->gs = NULL_SEG; + saved_state->v86_segs.v86_ds = context.sc_ds; + saved_state->v86_segs.v86_es = context.sc_es; + saved_state->v86_segs.v86_fs = context.sc_fs; + saved_state->v86_segs.v86_gs = context.sc_gs; + + saved_state->efl |= EFL_VM; + } + else { + saved_state->ds = context.sc_ds; + saved_state->es = context.sc_es; + saved_state->fs = context.sc_fs; + saved_state->gs = context.sc_gs; + } return (EJUSTRETURN); } @@ -295,11 +282,11 @@ sigreturn(p, uap, retval) boolean_t machine_exception( - int exception, - int code, - int subcode, - int *unix_signal, - int *unix_code + int exception, + int code, + __unused int subcode, + int *unix_signal, + int *unix_code ) { @@ -321,3 +308,52 @@ machine_exception( return(TRUE); } + +#include <sys/systm.h> +#include <sys/sysent.h> + +int __pthread_cset(struct sysent *); +void __pthread_creset(struct sysent *); + +int +__pthread_cset(struct sysent *callp) +{ + unsigned int cancel_enable; + thread_t thread; + struct uthread * uthread; + + thread = current_thread(); + uthread = get_bsdthread_info(thread); + + cancel_enable = callp->sy_cancel; + if (cancel_enable == _SYSCALL_CANCEL_NONE) { + uthread->uu_flag |= UT_NOTCANCELPT; + } 
else { + if((uthread->uu_flag & (UT_CANCELDISABLE | UT_CANCEL | UT_CANCELED)) == UT_CANCEL) { + if (cancel_enable == _SYSCALL_CANCEL_PRE) + return(EINTR); + else + thread_abort_safely(thread); + } + } + return(0); +} + + +void +__pthread_creset(struct sysent *callp) +{ + + unsigned int cancel_enable; + thread_t thread; + struct uthread * uthread; + + thread = current_thread(); + uthread = get_bsdthread_info(thread); + + cancel_enable = callp->sy_cancel; + if (!cancel_enable) + uthread->uu_flag &= ~UT_NOTCANCELPT; + +} + diff --git a/bsd/dev/i386/unix_startup.c b/bsd/dev/i386/unix_startup.c deleted file mode 100644 index f341af01a..000000000 --- a/bsd/dev/i386/unix_startup.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1992,7 NeXT Computer, Inc. - * - * Unix data structure initialization. - * - */ - -#include <mach/mach_types.h> - -#include <vm/vm_kern.h> -#include <mach/vm_prot.h> - -#include <sys/param.h> -#include <sys/buf.h> -#include <sys/clist.h> -#include <sys/mbuf.h> -#include <sys/systm.h> -#include <sys/tty.h> -#include <dev/ppc/cons.h> - -extern vm_map_t mb_map; - -/* - * Declare these as initialized data so we can patch them. 
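__pthread_cset() and __pthread_creset() above bracket a system call: cset either marks the thread as not-a-cancellation-point or, if a cancel is already pending, turns it into EINTR (or a safe abort) before the call body runs; creset removes the mark afterwards. A hypothetical dispatch site showing the intended pairing (the dispatcher itself is not in this patch):

int
dispatch_syscall(struct sysent *callp, struct proc *p, void *uap, int *rval)
{
	int error;

	error = __pthread_cset(callp);		/* honor a pending cancel first */
	if (error)
		return (error);			/* EINTR: canceled before entry */

	error = (*callp->sy_call)(p, uap, rval);	/* the call proper */

	__pthread_creset(callp);		/* clear UT_NOTCANCELPT if set */
	return (error);
}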
- */ - -#ifdef NBUF -int nbuf = NBUF; -int niobuf = NBUF/2; -#else -int nbuf = 0; -int niobuf = 0; -#endif - -int srv = 0; /* Flag indicates a server boot when set */ -int ncl = 0; - -vm_map_t buffer_map; -vm_map_t bufferhdr_map; - -void -bsd_startupearly() -{ - vm_offset_t firstaddr; - vm_size_t size; - kern_return_t ret; - - if (nbuf == 0) - nbuf = atop(mem_size / 100); /* 1% */ - if (nbuf > 8192) - nbuf = 8192; - if (nbuf < 256) - nbuf = 256; - - if (niobuf == 0) - niobuf = (nbuf / 2) + 64; /* 64 reserved buffers */ - if (niobuf > 4096) - niobuf = 4096; - if (niobuf < 128) - niobuf = 128; - - size = (nbuf + niobuf) * sizeof (struct buf); - size = round_page(size); - - ret = kmem_suballoc(kernel_map, - &firstaddr, - size, - FALSE, - TRUE, - &bufferhdr_map); - - if (ret != KERN_SUCCESS) - panic("Failed to create bufferhdr_map"); - - ret = kernel_memory_allocate(bufferhdr_map, - &firstaddr, - size, - 0, - KMA_HERE | KMA_KOBJECT); - - if (ret != KERN_SUCCESS) - panic("Failed to allocate bufferhdr_map"); - - buf = (struct buf * )firstaddr; - bzero(buf,size); - - if (mem_size > (64 * 1024 * 1024)) { - int scale; - extern u_long tcp_sendspace; - extern u_long tcp_recvspace; - - if ((nmbclusters = ncl) == 0) { - if ((nmbclusters = ((mem_size / 16) / MCLBYTES)) > 16384) - nmbclusters = 16384; - } - if ((scale = nmbclusters / NMBCLUSTERS) > 1) { - tcp_sendspace *= scale; - tcp_recvspace *= scale; - - if (tcp_sendspace > (32 * 1024)) - tcp_sendspace = 32 * 1024; - if (tcp_recvspace > (32 * 1024)) - tcp_recvspace = 32 * 1024; - } - } -} - -void -bsd_bufferinit() -{ - unsigned int i; - vm_size_t size; - kern_return_t ret; - vm_offset_t firstaddr; - - cons.t_dev = makedev(12, 0); - - bsd_startupearly(); - - ret = kmem_suballoc(kernel_map, - (vm_offset_t *)&mbutl, - (vm_size_t) (nmbclusters * MCLBYTES), - FALSE, - TRUE, - &mb_map); - - if (ret != KERN_SUCCESS) - panic("Failed to allocate mb_map\n"); - - /* - * Set up buffers, so they can be used to read disk labels. - */ - bufinit(); -} - -void -md_prepare_for_shutdown(int paniced, int howto, char * command) -{ -} diff --git a/bsd/dev/memdev.c b/bsd/dev/memdev.c index 490d030cd..a01fcb258 100644 --- a/bsd/dev/memdev.c +++ b/bsd/dev/memdev.c @@ -1,3 +1,24 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ /* * Copyright (c) 1988 University of Utah. 
* Copyright (c) 1990, 1993 @@ -48,9 +69,7 @@ * */ - #include <sys/param.h> -#include <sys/systm.h> #include <sys/kernel.h> #include <sys/mount.h> #include <sys/namei.h> @@ -58,30 +77,38 @@ #include <sys/buf.h> #include <sys/malloc.h> #include <sys/mount.h> -#include <sys/vnode.h> #include <sys/fcntl.h> #include <sys/conf.h> #include <sys/disk.h> #include <sys/stat.h> - #include <sys/vm.h> +#include <sys/uio_internal.h> +#include <libkern/libkern.h> +#include <vm/pmap.h> #include <vm/vm_pager.h> -#include <vm/vm_pageout.h> #include <mach/memory_object_types.h> #include <miscfs/devfs/devfs.h> -static open_close_fcn_t mdevopen; -static open_close_fcn_t mdevclose; + +void mdevinit(int the_cnt); + +static open_close_fcn_t mdevopen; +static open_close_fcn_t mdevclose; static psize_fcn_t mdevsize; -static strategy_fcn_t mdevstrategy; -static int mdevbioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p); -static int mdevcioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p); -static int mdevrw(dev_t dev, struct uio *uio, int ioflag); -static char *nonspace(char *pos, char *end); -static char *getspace(char *pos, char *end); -static char *cvtnum(char *pos, char *end, unsigned int *num); +static strategy_fcn_t mdevstrategy; +static int mdevbioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p); +static int mdevcioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p); +static int mdevrw(dev_t dev, struct uio *uio, int ioflag); +static char * nonspace(char *pos, char *end); +static char * getspace(char *pos, char *end); +static char * cvtnum(char *pos, char *end, unsigned int *num); + +extern void bcopy_phys(addr64_t from, addr64_t to, vm_size_t bytes); +extern void mapping_set_mod(ppnum_t pn); +extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); + /* * cdevsw @@ -139,11 +166,13 @@ static int mdevioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc dev_t mdevadd(int devid, ppnum_t base, unsigned int size, int phys); dev_t mdevlookup(int devid); -static int mdevclose(dev_t dev, int flags, int devtype, struct proc *p) { +static int mdevclose(__unused dev_t dev, __unused int flags, + __unused int devtype, __unused struct proc *p) { + return (0); } -static int mdevopen(dev_t dev, int flags, int devtype, struct proc *p) { +static int mdevopen(dev_t dev, int flags, __unused int devtype, __unused struct proc *p) { int devid; @@ -156,12 +185,11 @@ static int mdevopen(dev_t dev, int flags, int devtype, struct proc *p) { return(0); } -static int mdevrw(dev_t dev, struct uio *uio, int ioflag) { +static int mdevrw(dev_t dev, struct uio *uio, __unused int ioflag) { int status; - int unit; addr64_t mdata; - int devid; - enum uio_seg saveflag; + int devid; + enum uio_seg saveflag; devid = minor(dev); /* Get minor device number */ @@ -171,82 +199,77 @@ static int mdevrw(dev_t dev, struct uio *uio, int ioflag) { mdata = ((addr64_t)mdev[devid].mdBase << 12) + uio->uio_offset; /* Point to the area in "file" */ saveflag = uio->uio_segflg; /* Remember what the request is */ - if (mdev[devid].mdFlags & mdPhys) uio->uio_segflg = UIO_PHYS_USERSPACE; /* Make sure we are moving from physical ram if physical device */ - status = uiomove64(mdata, uio->uio_resid, uio); /* Move the data */ +#if LP64_DEBUG + if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) { + panic("mdevrw - invalid uio_segflg\n"); + } +#endif /* LP64_DEBUG */ + /* Make sure we are moving from physical ram if physical device */ + if (mdev[devid].mdFlags & mdPhys) { + if (uio->uio_segflg == 
UIO_USERSPACE64) + uio->uio_segflg = UIO_PHYS_USERSPACE64; + else if (uio->uio_segflg == UIO_USERSPACE32) + uio->uio_segflg = UIO_PHYS_USERSPACE32; + else + uio->uio_segflg = UIO_PHYS_USERSPACE; + } + status = uiomove64(mdata, uio_resid(uio), uio); /* Move the data */ uio->uio_segflg = saveflag; /* Restore the flag */ return (status); } static void mdevstrategy(struct buf *bp) { - int unmap; - unsigned int sz, left, lop, csize; - kern_return_t ret; + unsigned int left, lop, csize; vm_offset_t vaddr, blkoff; - struct buf *tbuf; int devid; addr64_t paddr, fvaddr; ppnum_t pp; - devid = minor(bp->b_dev); /* Get minor device number */ + devid = minor(buf_device(bp)); /* Get minor device number */ if ((mdev[devid].mdFlags & mdInited) == 0) { /* Have we actually been defined yet? */ - bp->b_error = ENXIO; - bp->b_flags |= B_ERROR; - biodone(bp); + buf_seterror(bp, ENXIO); + buf_biodone(bp); return; } - bp->b_resid = bp->b_bcount; /* Set byte count */ + buf_setresid(bp, buf_count(bp)); /* Set byte count */ - blkoff = bp->b_blkno * mdev[devid].mdSecsize; /* Get offset into file */ + blkoff = buf_blkno(bp) * mdev[devid].mdSecsize; /* Get offset into file */ /* * Note that reading past end is an error, but reading at end is an EOF. For these - * we just return with b_resid == b_bcount. + * we just return with resid == count. */ if (blkoff >= (mdev[devid].mdSize << 12)) { /* Are they trying to read/write at/after end? */ if(blkoff != (mdev[devid].mdSize << 12)) { /* Are we trying to read after EOF? */ - bp->b_error = EINVAL; /* Yeah, this is an error */ - bp->b_flags |= B_ERROR | B_INVAL; + buf_seterror(bp, EINVAL); /* Yeah, this is an error */ } - biodone(bp); /* Return */ + buf_biodone(bp); /* Return */ return; } - if ((blkoff + bp->b_bcount) > (mdev[devid].mdSize << 12)) { /* Will this read go past end? */ - bp->b_bcount = ((mdev[devid].mdSize << 12) - blkoff); /* Yes, trim to max */ + if ((blkoff + buf_count(bp)) > (mdev[devid].mdSize << 12)) { /* Will this read go past end? */ + buf_setcount(bp, ((mdev[devid].mdSize << 12) - blkoff)); /* Yes, trim to max */ } + /* + * make sure the buffer's data area is + * accessible + */ + if (buf_map(bp, (caddr_t *)&vaddr)) + panic("ramstrategy: buf_map failed\n"); - vaddr = 0; /* Assume not mapped yet */ - unmap = 0; - - if (bp->b_flags & B_VECTORLIST) { /* Do we have a list of UPLs? */ - tbuf = (struct buf *)bp->b_real_bp; /* Get this for C's inadequacies */ - if((bp->b_flags & B_NEED_IODONE) && /* If we have a UPL, is it already mapped? */ - tbuf && - tbuf->b_data) { - vaddr = (vm_offset_t)tbuf->b_data; /* We already have this mapped in, get base address */ - } - else { /* Not mapped yet */ - ret = ubc_upl_map(bp->b_pagelist, &vaddr); /* Map it in */ - if(ret != KERN_SUCCESS) panic("ramstrategy: ubc_upl_map failed, rc = %08X\n", ret); - unmap = 1; /* Remember to unmap later */ - } - vaddr = vaddr += bp->b_uploffset; /* Calculate actual vaddr */ - } - else vaddr = (vm_offset_t)bp->b_data; /* No UPL, we already have address */ - fvaddr = (mdev[devid].mdBase << 12) + blkoff; /* Point to offset into ram disk */ - if(bp->b_flags & B_READ) { /* Is this a read? */ + if (buf_flags(bp) & B_READ) { /* Is this a read? */ if(!(mdev[devid].mdFlags & mdPhys)) { /* Physical mapped disk? 
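mdevstrategy() above is rewritten against the opaque buffer accessors: buf_device()/buf_count()/buf_blkno() read what used to be b_dev/b_bcount/b_blkno, buf_seterror() plus buf_biodone() replace the B_ERROR flag dance, and buf_map()/buf_unmap() absorb the manual UPL bookkeeping the old B_VECTORLIST branch did by hand. The error/EOF prologue, condensed into an illustrative helper:

static void
strategy_prologue(struct buf *bp, int inited, unsigned int size_pages,
    unsigned int secsize)
{
	vm_offset_t blkoff;

	if (!inited) {				/* device never configured */
		buf_seterror(bp, ENXIO);	/* was: b_error + B_ERROR */
		buf_biodone(bp);		/* was: biodone(bp) */
		return;
	}
	buf_setresid(bp, buf_count(bp));	/* nothing transferred yet */

	blkoff = buf_blkno(bp) * secsize;
	if (blkoff >= ((vm_offset_t)size_pages << 12)) {
		if (blkoff != ((vm_offset_t)size_pages << 12))
			buf_seterror(bp, EINVAL);	/* past EOF: error */
		buf_biodone(bp);	/* at EOF: return with resid == count */
		return;
	}
}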
*/ bcopy((void *)((uintptr_t)fvaddr), - (void *)vaddr, (size_t)bp->b_bcount); /* This is virtual, just get the data */ + (void *)vaddr, (size_t)buf_count(bp)); /* This is virtual, just get the data */ } else { - left = bp->b_bcount; /* Init the amount left to copy */ + left = buf_count(bp); /* Init the amount left to copy */ while(left) { /* Go until it is all copied */ lop = min((4096 - (vaddr & 4095)), (4096 - (fvaddr & 4095))); /* Get smallest amount left on sink and source */ @@ -269,10 +292,10 @@ static void mdevstrategy(struct buf *bp) { else { /* This is a write */ if(!(mdev[devid].mdFlags & mdPhys)) { /* Physical mapped disk? */ bcopy((void *)vaddr, (void *)((uintptr_t)fvaddr), - (size_t)bp->b_bcount); /* This is virtual, just put the data */ + (size_t)buf_count(bp)); /* This is virtual, just put the data */ } else { - left = bp->b_bcount; /* Init the amount left to copy */ + left = buf_count(bp); /* Init the amount left to copy */ while(left) { /* Go until it is all copied */ lop = min((4096 - (vaddr & 4095)), (4096 - (fvaddr & 4095))); /* Get smallest amount left on sink and source */ @@ -292,13 +315,16 @@ static void mdevstrategy(struct buf *bp) { } } } - - if (unmap) { /* Do we need to unmap this? */ - ubc_upl_unmap(bp->b_pagelist); /* Yes, unmap it */ - } - - bp->b_resid = 0; /* Nothing more to do */ - biodone(bp); /* Say we've finished */ + /* + * buf_unmap takes care of all the cases + * it will unmap the buffer from kernel + * virtual space if that was the state + * when we mapped it. + */ + buf_unmap(bp); + + buf_setresid(bp, 0); /* Nothing more to do */ + buf_biodone(bp); /* Say we've finished */ } static int mdevbioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) { @@ -309,8 +335,8 @@ static int mdevcioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc return (mdevioctl(dev, cmd, data, flag, p, 1)); } -static int mdevioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p, int is_char) { - +static int mdevioctl(dev_t dev, u_long cmd, caddr_t data, __unused int flag, + struct proc *p, int is_char) { int error; u_long *f; u_int64_t *o; @@ -320,7 +346,7 @@ static int mdevioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc if (devid > 16) return (ENXIO); /* Not valid */ - error = suser(p->p_ucred, &p->p_acflag); /* Are we superman? */ + error = proc_suser(p); /* Are we superman? */ if (error) return (error); /* Nope... 
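The physical-copy loops above never let one bcopy_phys() span a page boundary on either side: lop is the room left in the current source and sink pages, and csize is that clipped to what remains. A standalone illustration of the chunking:

#include <stdio.h>

int
main(void)
{
	unsigned long vaddr  = 0x1000ff0;	/* sink: 16 bytes below a page end */
	unsigned long fvaddr = 0x2000800;	/* source: mid-page */
	unsigned long left   = 8192;		/* bytes remaining */

	while (left) {
		unsigned long lop = 4096 - (vaddr & 4095);	/* room in sink page */
		unsigned long src = 4096 - (fvaddr & 4095);	/* room in source page */
		if (src < lop)
			lop = src;			/* smallest of the two */
		unsigned long csize = (lop < left) ? lop : left;

		printf("copy %4lu bytes %#09lx -> %#09lx\n", csize, fvaddr, vaddr);
		vaddr += csize;
		fvaddr += csize;
		left -= csize;
	}
	return (0);
}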
*/ f = (u_long*)data; @@ -392,7 +418,7 @@ static int mdevsize(dev_t dev) { #include <pexpert/pexpert.h> -void mdevinit(int cnt) { +void mdevinit(__unused int the_cnt) { int devid, phys; ppnum_t base; diff --git a/bsd/dev/ppc/chud/chud_bsd_callback.c b/bsd/dev/ppc/chud/chud_bsd_callback.c index e212ebe82..0302458f5 100644 --- a/bsd/dev/ppc/chud/chud_bsd_callback.c +++ b/bsd/dev/ppc/chud/chud_bsd_callback.c @@ -28,12 +28,7 @@ #include <sys/types.h> /* u_int */ #include <sys/proc.h> /* struct proc */ #include <sys/systm.h> /* struct sysent */ - -struct exit_args { - int rval; -}; -extern void exit(struct proc *p, struct exit_args *uap, int *retval); -extern struct sysent sysent[]; +#include <sys/sysproto.h> #pragma mark **** kern debug **** typedef void (*chudxnu_kdebug_callback_func_t)(uint32_t debugid, uint32_t arg0, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint32_t arg4); @@ -69,23 +64,3 @@ kern_return_t chudxnu_kdebug_callback_cancel(void) return KERN_SUCCESS; } - -#pragma mark **** task will exit **** - -typedef kern_return_t (*chudxnu_exit_callback_func_t)(int pid); - -__private_extern__ -kern_return_t chudxnu_exit_callback_enter(chudxnu_exit_callback_func_t func) -{ - - return KERN_FAILURE; - -} - -__private_extern__ -kern_return_t chudxnu_exit_callback_cancel(void) -{ - - return KERN_FAILURE; - -} diff --git a/bsd/dev/ppc/chud/chud_process.c b/bsd/dev/ppc/chud/chud_process.c index c0dcd504d..1fad77e4c 100644 --- a/bsd/dev/ppc/chud/chud_process.c +++ b/bsd/dev/ppc/chud/chud_process.c @@ -21,7 +21,8 @@ */ #include <sys/systm.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/vnode_internal.h> // vn_getpath() __private_extern__ int chudxnu_pid_for_task(task_t task) diff --git a/bsd/dev/ppc/conf.c b/bsd/dev/ppc/conf.c index e4a751fca..463700245 100644 --- a/bsd/dev/ppc/conf.c +++ b/bsd/dev/ppc/conf.c @@ -35,7 +35,6 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/buf.h> #include <sys/ioctl.h> #include <sys/tty.h> #include <sys/conf.h> @@ -96,7 +95,7 @@ extern int kmopen(),kmclose(),kmread(),kmwrite(),kmioctl(), extern int cttyopen(), cttyread(), cttywrite(), cttyioctl(), cttyselect(); -extern int mmread(),mmwrite(); +extern int mmread(),mmwrite(),mmioctl(); #define mmselect seltrue #if 1 @@ -157,8 +156,8 @@ struct cdevsw cdevsw[] = }, { nulldev, nulldev, mmread, mmwrite, /* 3*/ - eno_ioctl, nulldev, nulldev, 0, (select_fcn_t *)mmselect, - eno_mmap, eno_strat, eno_getc, eno_putc, 0 + mmioctl, nulldev, nulldev, 0, (select_fcn_t *)mmselect, + eno_mmap, eno_strat, eno_getc, eno_putc, D_DISK }, { ptsopen, ptsclose, ptsread, ptswrite, /* 4*/ diff --git a/bsd/dev/ppc/cons.c b/bsd/dev/ppc/cons.c index 26290d9a3..b9d966909 100644 --- a/bsd/dev/ppc/cons.c +++ b/bsd/dev/ppc/cons.c @@ -53,12 +53,19 @@ consopen(dev, flag, devtype, pp) struct proc *pp; { dev_t device; + boolean_t funnel_state; + int error; + + funnel_state = thread_funnel_set(kernel_flock, TRUE); if (constty) device = constty->t_dev; else device = cons.t_dev; - return ((*cdevsw[major(device)].d_open)(device, flag, devtype, pp)); + error = (*cdevsw[major(device)].d_open)(device, flag, devtype, pp); + thread_funnel_set(kernel_flock, funnel_state); + + return(error); } /*ARGSUSED*/ @@ -69,12 +76,20 @@ consclose(dev, flag, mode, pp) struct proc *pp; { dev_t device; + boolean_t funnel_state; + int error; + funnel_state = thread_funnel_set(kernel_flock, TRUE); if (constty) device = constty->t_dev; else device = cons.t_dev; - return ((*cdevsw[major(device)].d_close)(device, flag, mode, pp)); + error = 
(*cdevsw[major(device)].d_close)(device, flag, mode, pp); + thread_funnel_set(kernel_flock, funnel_state); + + return(error); + + } /*ARGSUSED*/ @@ -85,12 +100,18 @@ consread(dev, uio, ioflag) int ioflag; { dev_t device; + boolean_t funnel_state; + int error; + funnel_state = thread_funnel_set(kernel_flock, TRUE); if (constty) device = constty->t_dev; else device = cons.t_dev; - return ((*cdevsw[major(device)].d_read)(device, uio, ioflag)); + error = (*cdevsw[major(device)].d_read)(device, uio, ioflag); + thread_funnel_set(kernel_flock, funnel_state); + + return(error); } /*ARGSUSED*/ @@ -101,12 +122,18 @@ conswrite(dev, uio, ioflag) int ioflag; { dev_t device; + boolean_t funnel_state; + int error; + funnel_state = thread_funnel_set(kernel_flock, TRUE); if (constty) device = constty->t_dev; else device = cons.t_dev; - return ((*cdevsw[major(device)].d_write)(device, uio, ioflag)); + error = (*cdevsw[major(device)].d_write)(device, uio, ioflag); + thread_funnel_set(kernel_flock, funnel_state); + + return(error); } /*ARGSUSED*/ @@ -119,6 +146,10 @@ consioctl(dev, cmd, addr, flag, p) struct proc *p; { dev_t device; + boolean_t funnel_state; + int error; + + funnel_state = thread_funnel_set(kernel_flock, TRUE); if (constty) device = constty->t_dev; @@ -129,16 +160,23 @@ consioctl(dev, cmd, addr, flag, p) * output from the "virtual" console. */ if (cmd == TIOCCONS && constty) { - int error = suser(p->p_ucred, (u_short *) NULL); - if (error) - return (error); + error = proc_suser(p); + if (error) { + goto out; + } constty = NULL; - return (0); + error = 0; + goto out; } - return ((*cdevsw[major(device)].d_ioctl)(device, cmd, addr, flag, p)); + error = (*cdevsw[major(device)].d_ioctl)(device, cmd, addr, flag, p); +out: + thread_funnel_set(kernel_flock, funnel_state); + + return(error); } /*ARGSUSED*/ +/* called with funnel held */ int consselect(dev, flag, wql, p) dev_t dev; @@ -159,12 +197,18 @@ int cons_getc() { dev_t device; + boolean_t funnel_state; + int error; + funnel_state = thread_funnel_set(kernel_flock, TRUE); if (constty) device = constty->t_dev; else device = cons.t_dev; - return ((*cdevsw[major(device)].d_getc)(device)); + error = (*cdevsw[major(device)].d_getc)(device); + thread_funnel_set(kernel_flock, funnel_state); + + return(error); } /*ARGSUSED*/ @@ -173,12 +217,18 @@ cons_putc(c) char c; { dev_t device; + boolean_t funnel_state; + int error; + funnel_state = thread_funnel_set(kernel_flock, TRUE); if (constty) device = constty->t_dev; else device = cons.t_dev; - return ((*cdevsw[major(device)].d_putc)(device, c)); + error = (*cdevsw[major(device)].d_putc)(device, c); + thread_funnel_set(kernel_flock, funnel_state); + + return(error); } /* diff --git a/bsd/dev/ppc/cons.h b/bsd/dev/ppc/cons.h index 00d91a155..6da1b0ae7 100644 --- a/bsd/dev/ppc/cons.h +++ b/bsd/dev/ppc/cons.h @@ -25,10 +25,10 @@ struct consdev { char *cn_name; /* name of device in dev_name_list */ - int (*cn_probe)(); /* probe hardware and fill in consdev info */ - int (*cn_init)(); /* turn on as console */ - int (*cn_getc)(); /* kernel getchar interface */ - int (*cn_putc)(); /* kernel putchar interface */ + int (*cn_probe)(void); /* probe and fill in consdev info */ + int (*cn_init)(void); /* turn on as console */ + int (*cn_getc)(void); /* kernel getchar interface */ + int (*cn_putc)(void); /* kernel putchar interface */ struct tty *cn_tp; /* tty structure for console device */ dev_t cn_dev; /* major/minor of device */ short cn_pri; /* pecking order; the higher the better */ diff --git 
a/bsd/dev/ppc/kern_machdep.c b/bsd/dev/ppc/kern_machdep.c index ba6b61d50..9aefbe8ba 100644 --- a/bsd/dev/ppc/kern_machdep.c +++ b/bsd/dev/ppc/kern_machdep.c @@ -27,14 +27,6 @@ * Author: John Seamons * * Machine-specific kernel routines. - * - * HISTORY - * 8-Dec-91 Peter King (king) at NeXT - * Added grade_cpu_subtype(). - * FIXME: Do we want to merge this with check_cpu_subtype()? - * - * 5-Mar-90 John Seamons (jks) at NeXT - * Created. */ #include <sys/types.h> @@ -44,80 +36,189 @@ #include <mach/vm_param.h> #include <kern/cpu_number.h> -int -check_cpu_subtype(cpu_subtype_t cpu_subtype) -{ - struct machine_slot *ms = &machine_slot[cpu_number()]; - - if (cpu_subtype == ms->cpu_subtype) - return (TRUE); - - switch (cpu_subtype) { - case CPU_SUBTYPE_POWERPC_970: - /* Do not allow a 970 binary to run on non-970 systems */ - if (ms->cpu_subtype != CPU_SUBTYPE_POWERPC_970) - break; - case CPU_SUBTYPE_POWERPC_7450: - case CPU_SUBTYPE_POWERPC_7400: - case CPU_SUBTYPE_POWERPC_750: - case CPU_SUBTYPE_POWERPC_ALL: - return (TRUE); - } - - return (FALSE); -} +int grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype); /* - * Routine: grade_cpu_subtype() + * Routine: grade_binary() * * Function: - * Return a relative preference for cpu_subtypes in fat executable files. - * The higher the grade, the higher the preference. + * Return a relative preference for exectypes and execsubtypes in fat + * executable files. The higher the grade, the higher the preference. * A grade of 0 means not acceptable. + * + * Note: We really don't care about the real cpu_type() here, + * because machines can only have one type. */ - int -grade_cpu_subtype(cpu_subtype_t cpu_subtype) +grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype) { - struct machine_slot *ms = &machine_slot[cpu_number()]; + int cpusubtype = cpu_subtype(); /* - * This code should match cpusubtype_findbestarch() in best_arch.c in the - * cctools project. As of 2/16/98 this is what has been agreed upon for - * the PowerPC subtypes. If an exact match is not found the subtype will - * be picked from the following order: + * This code should match cpusubtype_findbestarch() in best_arch.c + * in the cctools project. As of 2/16/98 this is what has been + * agreed upon for the PowerPC subtypes. If an exact match is not + * found the subtype will be picked from the following order: * 970(but only on 970), 7450, 7400, 750, ALL - * Note the 601 is NOT in the list above. It is only picked via an exact - * match. For details see Radar 2213821. - * - * To implement this function to follow what was agreed upon above, we use - * the fact there are currently 4 different subtypes. Exact matches return - * the value 6, and the values 5 thru 1 are returned for the - * subtypes listed in the order above. + * Note the 601 is NOT in the list above. It is only picked via + * an exact match. For details see Radar 2213821. 
*/ - if (ms->cpu_subtype == cpu_subtype) - return 6; - switch (cpu_subtype) { - case CPU_SUBTYPE_POWERPC_970: - /* Do not allow a 970 binary to run on non-970 systems */ - if (ms->cpu_subtype != CPU_SUBTYPE_POWERPC_970) - break; - return 5; - case CPU_SUBTYPE_POWERPC_7450: - return 4; - case CPU_SUBTYPE_POWERPC_7400: - return 3; - case CPU_SUBTYPE_POWERPC_750: - return 2; - case CPU_SUBTYPE_POWERPC_ALL: - return 1; + + switch (cpusubtype) { + case CPU_SUBTYPE_POWERPC_970: + switch(exectype) { + case CPU_TYPE_POWERPC64: /* CPU_IS64BIT | CPU_POWERPC */ + switch(execsubtype) { + /* + * Prefer 64 bit architecture specific binaries; note + * that this value does not mean the same thing here + * as it does below. + */ + case CPU_SUBTYPE_POWERPC_970: + return 8; + /* Prefer generic binaries */ + case CPU_SUBTYPE_POWERPC_ALL: + return 7; + default: + return 0; + } + /* NOTREACHED */ + + case CPU_TYPE_POWERPC: + switch(execsubtype) { + /* + * Prefer 32 bit binaries with 64 bit leaf functions; + * this is actually bogus use of the subtype to encode + * CPU feature bits. + */ + case CPU_SUBTYPE_POWERPC_970: + return 6; + case CPU_SUBTYPE_POWERPC_7450: + return 4; + case CPU_SUBTYPE_POWERPC_7400: + return 3; + case CPU_SUBTYPE_POWERPC_750: + return 2; + case CPU_SUBTYPE_POWERPC_ALL: + return 1; + default: + return 0; + } + /* NOTREACHED */ + + default: + return 0; + } + /* NOTREACHED */ + + case CPU_SUBTYPE_POWERPC_7450: + switch(exectype) { + case CPU_TYPE_POWERPC64: /* CPU_IS64BIT | CPU_POWERPC */ + return 0; + + case CPU_TYPE_POWERPC: + switch(execsubtype) { + case CPU_SUBTYPE_POWERPC_7450: + return 6; + case CPU_SUBTYPE_POWERPC_7400: + return 4; + case CPU_SUBTYPE_POWERPC_750: + return 3; + case CPU_SUBTYPE_POWERPC_ALL: + return 1; + default: + return 0; + } + /* NOTREACHED */ + + default: + return 0; + } + /* NOTREACHED */ + + case CPU_SUBTYPE_POWERPC_7400: + switch(exectype) { + case CPU_TYPE_POWERPC64: /* CPU_IS64BIT | CPU_POWERPC */ + return 0; + + case CPU_TYPE_POWERPC: + switch(execsubtype) { + case CPU_SUBTYPE_POWERPC_7400: + return 6; + case CPU_SUBTYPE_POWERPC_7450: + return 4; + case CPU_SUBTYPE_POWERPC_750: + return 3; + case CPU_SUBTYPE_POWERPC_ALL: + return 1; + default: + return 0; + } + /* NOTREACHED */ + + default: + return 0; + } + /* NOTREACHED */ + + case CPU_SUBTYPE_POWERPC_750: + switch(exectype) { + case CPU_TYPE_POWERPC64: /* CPU_IS64BIT | CPU_POWERPC */ + return 0; + + case CPU_TYPE_POWERPC: + switch(execsubtype) { + case CPU_SUBTYPE_POWERPC_750: + return 6; +#ifndef ADDRESS_RADAR_2678019 + /* + * Currently implemented because dropping this would + * turn the executable subtype into a "has Altivec" + * flag, which we do not want to permit. It could + * also break working third party applications + * already in use in the field. + */ + case CPU_SUBTYPE_POWERPC_7400: + return 4; + case CPU_SUBTYPE_POWERPC_7450: + return 3; +#endif /* ADDRESS_RADAR_2678019 */ + case CPU_SUBTYPE_POWERPC_ALL: + return 1; + default: + return 0; + } + /* NOTREACHED */ + + default: + return 0; + } + /* NOTREACHED */ + + default: + switch(exectype) { + case CPU_TYPE_POWERPC64: /* CPU_IS64BIT | CPU_POWERPC */ + return 0; + + case CPU_TYPE_POWERPC: + /* Special case for PPC601 */ + if (cpusubtype == execsubtype) + return 6; + /* + * If we get here it is because it is a cpusubtype we + * don't support or a new cpusubtype that was added + * since this code was written. Both will be + * considered unacceptable. 
+ */ + return 0; + /* NOTREACHED */ + + default: + return 0; + } + /* NOTREACHED */ } - /* - * If we get here it is because it is a cpusubtype we don't support - * or a new cpusubtype that was added since this code was written. Both - * will be considered unacceptable. - */ - return 0; + /* NOTREACHED */ } boolean_t @@ -140,3 +241,20 @@ kernacc( return (TRUE); } + +void +md_prepare_for_shutdown(int paniced, int howto, char * command); + +extern void IOSystemShutdownNotification(void); + +void +md_prepare_for_shutdown(__unused int paniced, __unused int howto, + __unused char * command) +{ + + /* + * Temporary hack to notify the power management root domain + * that the system will shut down. + */ + IOSystemShutdownNotification(); +} diff --git a/bsd/dev/ppc/km.c b/bsd/dev/ppc/km.c index cc6a8f009..db5d95169 100644 --- a/bsd/dev/ppc/km.c +++ b/bsd/dev/ppc/km.c @@ -26,7 +26,7 @@ * HISTORY */ -#include <sys/param.h> +#include <sys/kernel.h> #include <sys/tty.h> #include <dev/ppc/cons.h> @@ -44,7 +44,6 @@ /* * 'Global' variables, shared only by this file and conf.c. */ -extern struct tty cons; struct tty *km_tty[1] = { &cons }; /* @@ -53,10 +52,23 @@ struct tty *km_tty[1] = { &cons }; */ int disableConsoleOutput; -/* - * 'Global' variables, shared only by this file and kmDevice.m. - */ -int initialized = 0; +static int initialized = 0; + +// Function prototypes +extern d_open_t kmopen; +extern d_close_t kmclose; +extern d_read_t kmread; +extern d_write_t kmwrite; +extern d_ioctl_t kmioctl; +extern d_getc_t kmgetc; +extern d_putc_t kmputc; + +extern void kminit(void); + +// used by or implemented in the osfmk project +extern void cnputcusr(char); // From osfmk +extern int cngetc(void); // From osfmk +extern void cons_cinput(char ch); // Used by osfmk static int kmoutput(struct tty *tp); static void kmtimeout(struct tty *tp); @@ -64,7 +76,8 @@ static void kmstart(struct tty *tp); extern void KeyboardOpen(void); -int kminit() +void +kminit(void) { cons.t_dev = makedev(12, 0); initialized = 1; @@ -73,13 +86,8 @@ int kminit() * cdevsw interface to km driver. */ int -kmopen( - dev_t dev, - int flag, - int devtype, - struct proc *pp) +kmopen(dev_t dev, int flag, __unused int devtype, struct proc *pp) { - int rtn; int unit; struct tty *tp; struct winsize *wp; @@ -102,7 +110,7 @@ kmopen( tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED; termioschars(&tp->t_termios); ttsetwater(tp); - } else if ((tp->t_state & TS_XCLUDE) && pp->p_ucred->cr_uid != 0) + } else if ((tp->t_state & TS_XCLUDE) && proc_suser(pp)) return EBUSY; tp->t_state |= TS_CARR_ON; /* lie and say carrier exists and is on. 
*/ @@ -133,11 +141,8 @@ kmopen( } int -kmclose( - dev_t dev, - int flag, - int mode, - struct proc *p) +kmclose(__unused dev_t dev, __unused int flag, __unused int mode, + __unused struct proc *p) { struct tty *tp; @@ -149,10 +154,7 @@ kmclose( } int -kmread( - dev_t dev, - struct uio *uio, - int ioflag) +kmread(__unused dev_t dev, struct uio *uio, int ioflag) { register struct tty *tp; @@ -161,10 +163,7 @@ kmread( } int -kmwrite( - dev_t dev, - struct uio *uio, - int ioflag) +kmwrite(__unused dev_t dev, struct uio *uio, int ioflag) { register struct tty *tp; @@ -173,11 +172,7 @@ kmwrite( } int -kmioctl( - dev_t dev, - int cmd, - caddr_t data, - int flag, +kmioctl( __unused dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) { int error; @@ -210,22 +205,14 @@ kmioctl( } default: error = (*linesw[tp->t_line].l_ioctl)(tp, cmd, data, flag, p); - if (error >= 0) { + if (ENOTTY != error) return error; - } - error = ttioctl (tp, cmd, data, flag, p); - if (error >= 0) { - return error; - } - else { - return ENOTTY; - } + return ttioctl (tp, cmd, data, flag, p); } } int -kmputc( - int c) +kmputc(__unused dev_t dev, char c) { if( disableConsoleOutput) @@ -243,8 +230,7 @@ kmputc( } int -kmgetc( - dev_t dev) +kmgetc(__unused dev_t dev) { int c; @@ -257,9 +243,10 @@ kmgetc( return c; } +#if 0 int kmgetc_silent( - dev_t dev) + __unused dev_t dev) { int c; @@ -269,6 +256,7 @@ kmgetc_silent( } return c; } +#endif /* 0 */ /* * Callouts from linesw. @@ -277,38 +265,23 @@ kmgetc_silent( #define KM_LOWAT_DELAY ((ns_time_t)1000) static void -kmstart( - struct tty *tp) +kmstart(struct tty *tp) { - extern int hz; if (tp->t_state & (TS_TIMEOUT | TS_BUSY | TS_TTSTOP)) goto out; if (tp->t_outq.c_cc == 0) goto out; tp->t_state |= TS_BUSY; - if (tp->t_outq.c_cc > tp->t_lowat) { - /* - * Start immediately. - */ - kmoutput(tp); - } - else { - /* - * Wait a bit... - */ -#if 0 - /* FIXME */ - timeout(kmtimeout, tp, hz); -#else - kmoutput(tp); -#endif - } + kmoutput(tp); + return; + out: - ttwwakeup(tp); + (*linesw[tp->t_line].l_start)(tp); + return; } static void -kmtimeout( struct tty *tp) +kmtimeout(struct tty *tp) { boolean_t funnel_state; @@ -319,8 +292,7 @@ kmtimeout( struct tty *tp) } static int -kmoutput( - struct tty *tp) +kmoutput(struct tty *tp) { /* * FIXME - to be grokked...copied from m68k km.c. @@ -328,8 +300,6 @@ kmoutput( char buf[80]; char *cp; int cc = -1; - extern int hz; - while (tp->t_outq.c_cc > 0) { cc = ndqb(&tp->t_outq, 0); @@ -337,25 +307,22 @@ kmoutput( break; cc = min(cc, sizeof buf); (void) q_to_b(&tp->t_outq, buf, cc); - for (cp = buf; cp < &buf[cc]; cp++) { - kmputc(*cp & 0x7f); - } + for (cp = buf; cp < &buf[cc]; cp++) + kmputc(tp->t_dev, *cp & 0x7f); } if (tp->t_outq.c_cc > 0) { timeout((timeout_fcn_t)kmtimeout, tp, hz); } tp->t_state &= ~TS_BUSY; - ttwwakeup(tp); + (*linesw[tp->t_line].l_start)(tp); return 0; } -cons_cinput(char ch) + +void cons_cinput(char ch) { struct tty *tp = &cons; - boolean_t funnel_state; - (*linesw[tp->t_line].l_rint) (ch, tp); - } diff --git a/bsd/dev/ppc/mem.c b/bsd/dev/ppc/mem.c index 9c44dc246..4e7c8f8c3 100644 --- a/bsd/dev/ppc/mem.c +++ b/bsd/dev/ppc/mem.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -70,10 +70,9 @@ #include <sys/param.h> #include <sys/dir.h> #include <sys/proc.h> -#include <sys/buf.h> #include <sys/systm.h> #include <sys/vm.h> -#include <sys/uio.h> +#include <sys/uio_internal.h> #include <sys/malloc.h> #include <vm/pmap.h> @@ -87,7 +86,14 @@ static caddr_t devzerobuf; extern pmap_t kernel_pmap; +extern boolean_t kernacc(off_t, size_t ); +int mmread(dev_t dev, struct uio *uio); +int mmrw(dev_t dev, struct uio *uio, enum uio_rw rw); +int mmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p); +int mmwrite(dev_t dev, struct uio *uio); + +int mmread(dev, uio) dev_t dev; struct uio *uio; @@ -96,6 +102,7 @@ mmread(dev, uio) return (mmrw(dev, uio, UIO_READ)); } +int mmwrite(dev, uio) dev_t dev; struct uio *uio; @@ -104,25 +111,41 @@ mmwrite(dev, uio) return (mmrw(dev, uio, UIO_WRITE)); } +int +mmioctl(__unused dev_t dev, u_long cmd, __unused caddr_t data, + __unused int flag, __unused struct proc *p) +{ + switch (cmd) { + case FIONBIO: + case FIOASYNC: + /* OK to do nothing: we always return immediately */ + break; + default: + return ENODEV; + } + + return (0); +} + +int mmrw(dev, uio, rw) dev_t dev; struct uio *uio; enum uio_rw rw; { register int o; - register u_int c, v; +#if LP64KERN + register uint64_t c; +#else + register uint c; +#endif addr64_t vll; - register struct iovec *iov; int error = 0; vm_offset_t where; - int spl; - vm_size_t size; - extern boolean_t kernacc(off_t, size_t ); - - while (uio->uio_resid > 0 && error == 0) { - iov = uio->uio_iov; - if (iov->iov_len == 0) { - uio->uio_iov++; + + while (uio_resid(uio) > 0 && error == 0) { + if (uio_iov_len(uio) == 0) { + uio_next_iov(uio); uio->uio_iovcnt--; if (uio->uio_iovcnt < 0) panic("mmrw"); @@ -157,7 +180,8 @@ mmrw(dev, uio, rw) } } o = uio->uio_offset - vll; - c = min(PAGE_SIZE - o, (u_int)iov->iov_len); + // LP64todo - fix this! + c = min(PAGE_SIZE - o, uio_iov_len(uio)); error = uiomove((caddr_t)(where + o), c, uio); if(dgWork.dgFlags & enaDiagDM) (void)mapping_remove(kernel_pmap, (addr64_t)where); /* Unmap it */ @@ -170,17 +194,17 @@ mmrw(dev, uio, rw) if (((addr64_t)uio->uio_offset > vm_last_addr) || ((addr64_t)uio->uio_offset < VM_MIN_KERNEL_ADDRESS)) goto fault; - c = iov->iov_len; + c = uio_iov_len(uio); if (!kernacc(uio->uio_offset, c)) goto fault; - error = uiomove64(uio->uio_offset, (int)c, uio); + error = uiomove64(uio->uio_offset, c, uio); continue; /* minor device 2 is EOF/RATHOLE */ case 2: if (rw == UIO_READ) return (0); - c = iov->iov_len; + c = uio_iov_len(uio); break; /* minor device 3 is ZERO/RATHOLE */ case 3: @@ -189,11 +213,12 @@ mmrw(dev, uio, rw) bzero(devzerobuf, PAGE_SIZE); } if(uio->uio_rw == UIO_WRITE) { - c = iov->iov_len; + c = uio_iov_len(uio); break; } - c = min(iov->iov_len, PAGE_SIZE); - error = uiomove(devzerobuf, (int)c, uio); + // LP64todo - fix this! 
+ c = min(uio_iov_len(uio), PAGE_SIZE); + error = uiomove(devzerobuf, c, uio); continue; default: goto fault; @@ -202,10 +227,15 @@ mmrw(dev, uio, rw) if (error) break; - iov->iov_base += c; - iov->iov_len -= c; + uio_iov_base_add(uio, c); uio->uio_offset += c; - uio->uio_resid -= c; +#if LP64KERN + uio_setresid(uio, (uio_resid(uio) - c)); + uio_iov_len_add(uio, -((int64_t)c)); +#else + uio_setresid(uio, (uio_resid(uio) - c)); + uio_iov_len_add(uio, -((int)c)); +#endif } return (error); fault: diff --git a/bsd/dev/ppc/memmove.c b/bsd/dev/ppc/memmove.c index e36599aa8..c9a091bb1 100644 --- a/bsd/dev/ppc/memmove.c +++ b/bsd/dev/ppc/memmove.c @@ -36,11 +36,13 @@ void *memcpy(void *dst, const void *src, unsigned int ulen) bcopy(src, dst, ulen); return dst; } -#endif /* 0 */ + void *memmove(void *dst, const void *src, unsigned int ulen) { bcopy(src, dst, ulen); return dst; } +#endif /* 0 */ + diff --git a/bsd/dev/ppc/munge.s b/bsd/dev/ppc/munge.s new file mode 100644 index 000000000..6c835dddd --- /dev/null +++ b/bsd/dev/ppc/munge.s @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_EN + */ + +/* + * Syscall argument mungers. + * + * Passed a pointer to the users register array in the savearea, we copy args into + * the uu_arg[] array, padding etc as appropriate. The issue is that parameters + * passed in registers from a 32-bit address space do not map directly into the uu_args. + * For example, a 32-bit long-long comes in two registers, but we need to combine + * them into one 64-bit long-long in the uu_args. + * + * There are several functions in this file. Each takes two parameters: + * + * void munge_XXXX( const void *regs, void *uu_args); + * + * The name of the function encodes the number and type of the parameters, as follows: + * + * w = a 32-bit value such as an int or a 32-bit ptr, that does not require + * sign extension. These are handled by skipping a word in the input, + * zeroing a word of output, and copying a word from input to output. + * + * s = a 32-bit value such as a long, which must be sign-extended to a 64-bit + * long-long in the uu_args. These are handled by skipping a word of + * input, loading a word of input and sign extending it to a double, + * and storing two words of output. + * + * l = a 64-bit long-long, passed in two registers. These are handled by skipping + * a word of input, copying a word, skipping another word of input, and + * copying another word. + * + * d = a 32-bit int or a 64-bit ptr or long, passed in via a 64-bit GPR + * from a 64-bit process. We copy two words from input to output. + * + * For example, "munge_wls" takes a word, a long-long, and a word. 
This takes + * four registers: the first word is in one, the long-long takes two, and the + * final word is in the fourth. We store six words: a 0, the low words of the + * first three registers, and the two words resulting from sign-extending the + * low word of the fourth register. + * + * As you can see, we save a lot of code by collapsing mungers that are prefixes + * of each other, into the more general routine. This ends up copying a few extra + * bytes of parameters, but big deal. The old kernel copied all eight words for + * every system call. + * + * These routines assume explicit pad words in the uu_arg structures, that fill out + * int parameters to 64 bits. Having pad words makes munging args for 64-bit + * processes the equivalent of a simple bcopy(), though it does introduce an + * endian dependency. + */ + + .align 5 + .globl _munge_dddddddd // that is 8 'd's +_munge_dddddddd: + .globl _munge_ddddddd +_munge_ddddddd: + .globl _munge_dddddd +_munge_dddddd: + .globl _munge_ddddd +_munge_ddddd: + ld r5,0*8+0(r3) + ld r6,1*8+0(r3) + ld r7,2*8+0(r3) + ld r8,3*8+0(r3) + ld r9,4*8+0(r3) + ld r10,5*8+0(r3) + ld r11,6*8+0(r3) + ld r12,7*8+0(r3) + + std r5,0*8+0(r4) + std r6,1*8+0(r4) + std r7,2*8+0(r4) + std r8,3*8+0(r4) + std r9,4*8+0(r4) + std r10,5*8+0(r4) + std r11,6*8+0(r4) + std r12,7*8+0(r4) + + blr + + + .align 5 + .globl _munge_dddd +_munge_dddd: + .globl _munge_ddd +_munge_ddd: + .globl _munge_dd +_munge_dd: + .globl _munge_d +_munge_d: + ld r5,0*8+0(r3) + ld r6,1*8+0(r3) + ld r7,2*8+0(r3) + ld r8,3*8+0(r3) + + std r5,0*8+0(r4) + std r6,1*8+0(r4) + std r7,2*8+0(r4) + std r8,3*8+0(r4) + + blr + + + .align 5 + .globl _munge_wwwwwwww // that is 8 'w's +_munge_wwwwwwww: + .globl _munge_wwwwwww +_munge_wwwwwww: + .globl _munge_wwwwww +_munge_wwwwww: + .globl _munge_wwwww +_munge_wwwww: + li r0,0 + lwz r5,0*8+4(r3) + lwz r6,1*8+4(r3) + lwz r7,2*8+4(r3) + lwz r8,3*8+4(r3) + lwz r9,4*8+4(r3) + lwz r10,5*8+4(r3) + lwz r11,6*8+4(r3) + lwz r12,7*8+4(r3) + + stw r0,0*8+0(r4) + stw r5,0*8+4(r4) + stw r0,1*8+0(r4) + stw r6,1*8+4(r4) + stw r0,2*8+0(r4) + stw r7,2*8+4(r4) + stw r0,3*8+0(r4) + stw r8,3*8+4(r4) + stw r0,4*8+0(r4) + stw r9,4*8+4(r4) + stw r0,5*8+0(r4) + stw r10,5*8+4(r4) + stw r0,6*8+0(r4) + stw r11,6*8+4(r4) + stw r0,7*8+0(r4) + stw r12,7*8+4(r4) + + blr + + + .align 5 + .globl _munge_wwww +_munge_wwww: + .globl _munge_www +_munge_www: + .globl _munge_ww +_munge_ww: + .globl _munge_w +_munge_w: + li r0,0 + lwz r5,0*8+4(r3) + lwz r6,1*8+4(r3) + lwz r7,2*8+4(r3) + lwz r8,3*8+4(r3) + + stw r0,0*8+0(r4) + stw r5,0*8+4(r4) + stw r0,1*8+0(r4) + stw r6,1*8+4(r4) + stw r0,2*8+0(r4) + stw r7,2*8+4(r4) + stw r0,3*8+0(r4) + stw r8,3*8+4(r4) + + blr + + .align 5 + .globl _munge_l +_munge_l: + li r0,0 + lwz r5,0*8+4(r3) + lwz r6,1*8+4(r3) + + stw r5,0*8+0(r4) + stw r6,0*8+4(r4) + + blr + + .align 5 + .globl _munge_wlw +_munge_wlw: + .globl _munge_wl +_munge_wl: + li r0,0 + lwz r5,0*8+4(r3) + lwz r6,1*8+4(r3) + lwz r7,2*8+4(r3) + lwz r8,3*8+4(r3) + + stw r0,0*8+0(r4) + stw r5,0*8+4(r4) + stw r6,1*8+0(r4) + stw r7,1*8+4(r4) + stw r0,2*8+0(r4) + stw r8,2*8+4(r4) + + blr + + + .align 5 + .globl _munge_wwwl +_munge_wwwl: + li r0,0 + lwz r5,0*8+4(r3) + lwz r6,1*8+4(r3) + lwz r7,2*8+4(r3) + lwz r8,3*8+4(r3) + lwz r9,4*8+4(r3) + + stw r0,0*8+0(r4) + stw r5,0*8+4(r4) + stw r0,1*8+0(r4) + stw r6,1*8+4(r4) + stw r0,2*8+0(r4) + stw r7,2*8+4(r4) + stw r8,3*8+0(r4) + stw r9,3*8+4(r4) + + blr + + + .align 5 + .globl _munge_wwwwl // 4 'w's and an l +_munge_wwwwl: + li r0,0 + lwz r5,0*8+4(r3) + lwz 
r6,1*8+4(r3) + lwz r7,2*8+4(r3) + lwz r8,3*8+4(r3) + lwz r9,4*8+4(r3) + lwz r10,5*8+4(r3) + + stw r0,0*8+0(r4) + stw r5,0*8+4(r4) + stw r0,1*8+0(r4) + stw r6,1*8+4(r4) + stw r0,2*8+0(r4) + stw r7,2*8+4(r4) + stw r0,3*8+0(r4) + stw r8,3*8+4(r4) + stw r9,4*8+0(r4) + stw r10,4*8+4(r4) + + blr + + + .align 5 + .globl _munge_wwwwwl // 5 'w's and an l +_munge_wwwwwl: + li r0,0 + lwz r5,0*8+4(r3) + lwz r6,1*8+4(r3) + lwz r7,2*8+4(r3) + lwz r8,3*8+4(r3) + lwz r9,4*8+4(r3) + lwz r10,5*8+4(r3) + lwz r11,6*8+4(r3) + + stw r0,0*8+0(r4) + stw r5,0*8+4(r4) + stw r0,1*8+0(r4) + stw r6,1*8+4(r4) + stw r0,2*8+0(r4) + stw r7,2*8+4(r4) + stw r0,3*8+0(r4) + stw r8,3*8+4(r4) + stw r0,4*8+0(r4) + stw r9,4*8+4(r4) + stw r10,5*8+0(r4) + stw r11,5*8+4(r4) + + blr + + + .align 5 + .globl _munge_wsw +_munge_wsw: + li r0,0 + lwz r5,0*8+4(r3) + lwz r6,1*8+4(r3) + lwz r7,2*8+4(r3) + + stw r0,0*8+0(r4) + srawi r2,r6,31 + stw r5,0*8+4(r4) + stw r2,1*8+0(r4) + stw r6,1*8+4(r4) + stw r0,2*8+0(r4) + stw r7,2*8+4(r4) + + blr + + + .align 5 + .globl _munge_wws +_munge_wws: + li r0,0 + lwz r5,0*8+4(r3) + lwz r6,1*8+4(r3) + lwz r7,2*8+4(r3) + + stw r0,0*8+0(r4) + stw r5,0*8+4(r4) + stw r0,1*8+0(r4) + srawi r2,r7,31 + stw r6,1*8+4(r4) + stw r2,2*8+0(r4) + stw r7,2*8+4(r4) + + blr + + + .align 5 + .globl _munge_wwwsw +_munge_wwwsw: + li r0,0 + lwz r5,0*8+4(r3) + lwz r6,1*8+4(r3) + lwz r7,2*8+4(r3) + lwz r8,3*8+4(r3) + lwz r9,4*8+4(r3) + + stw r0,0*8+0(r4) + stw r5,0*8+4(r4) + stw r0,1*8+0(r4) + stw r6,1*8+4(r4) + srawi r2,r8,31 + stw r0,2*8+0(r4) + stw r7,2*8+4(r4) + stw r2,3*8+0(r4) + stw r8,3*8+4(r4) + stw r0,4*8+0(r4) + stw r9,4*8+4(r4) + + blr diff --git a/bsd/dev/ppc/nvram.c b/bsd/dev/ppc/nvram.c index d4900147f..bf466872a 100644 --- a/bsd/dev/ppc/nvram.c +++ b/bsd/dev/ppc/nvram.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -68,7 +68,7 @@ nvread(dev, uio, ioflag) int error = 0; offset = uio->uio_offset; - size = uio->uio_resid; + size = uio_resid(uio); for (read = 0; read < size; read++, offset++) { error = PEnvread(offset, 1, &cc); @@ -88,7 +88,6 @@ nvread(dev, uio, ioflag) nvwrite(dev_t dev, struct uio *uio, int ioflag) { - register struct iovec *iov; long offset; long size; int c; @@ -97,7 +96,7 @@ nvwrite(dev_t dev, struct uio *uio, int ioflag) int error = 0; offset = uio->uio_offset; - size = uio->uio_resid; + size = uio_resid(uio); for (wrote = 0; wrote < size; wrote++, offset++) { c = uwritec(uio); diff --git a/bsd/dev/ppc/stubs.c b/bsd/dev/ppc/stubs.c index 27bf27f71..3f0df507d 100644 --- a/bsd/dev/ppc/stubs.c +++ b/bsd/dev/ppc/stubs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -27,14 +27,12 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/buf.h> #include <sys/ioctl.h> #include <sys/tty.h> #include <sys/conf.h> #include <sys/proc.h> #include <sys/user.h> #include <kern/thread.h> -#include <kern/thread_act.h> #include <kern/task.h> #include <vm/vm_map.h> @@ -51,13 +49,10 @@ */ /* from ppc/fault_copy.c -Titan1T4 VERSION */ int -copystr(vfrom, vto, maxlen, lencopied) - register void * vfrom, *vto; - size_t maxlen, *lencopied; +copystr(const void *vfrom, void *vto, size_t maxlen, size_t *lencopied) { register unsigned l; - int error; -caddr_t from, to; + caddr_t from, to; from = vfrom; to = vto; @@ -80,44 +75,6 @@ size_t count; return 0; } -struct unix_syscallargs { - int flavor; - int r3; - int arg1, arg2,arg3,arg4,arg5,arg6,arg7; -}; - -set_bsduthreadargs(thread_t th, void * pcb, struct unix_syscallargs * sarg) -{ -struct uthread * ut; - - ut = get_bsdthread_info(th); - ut->uu_ar0 = (int *)pcb; - - if (sarg->flavor) - { - ut->uu_arg[0] = sarg->arg1; - ut->uu_arg[1] = sarg->arg2; - ut->uu_arg[2] = sarg->arg3; - ut->uu_arg[3] = sarg->arg4; - ut->uu_arg[4] = sarg->arg5; - ut->uu_arg[5] = sarg->arg6; - ut->uu_arg[7] = sarg->arg7; - } - else - { - ut->uu_arg[0] = sarg->r3; - ut->uu_arg[1] = sarg->arg1; - ut->uu_arg[2] = sarg->arg2; - ut->uu_arg[3] = sarg->arg3; - ut->uu_arg[4] = sarg->arg4; - ut->uu_arg[5] = sarg->arg5; - ut->uu_arg[6] = sarg->arg6; - ut->uu_arg[7] = sarg->arg7; - } - - return(1); -} - void * get_bsduthreadarg(thread_t th) { @@ -127,7 +84,7 @@ struct uthread *ut; } int * -get_bsduthreadrval(thread_act_t th) +get_bsduthreadrval(thread_t th) { struct uthread *ut; ut = get_bsdthread_info(th); diff --git a/bsd/dev/ppc/systemcalls.c b/bsd/dev/ppc/systemcalls.c index 79dcb99f5..a20314a54 100644 --- a/bsd/dev/ppc/systemcalls.c +++ b/bsd/dev/ppc/systemcalls.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -22,32 +22,36 @@ #include <kern/task.h> #include <kern/thread.h> -#include <kern/thread_act.h> #include <kern/assert.h> #include <kern/clock.h> +#include <kern/locks.h> +#include <kern/sched_prim.h> #include <mach/machine/thread_status.h> #include <ppc/savearea.h> #include <sys/kernel.h> #include <sys/vm.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/syscall.h> #include <sys/systm.h> #include <sys/user.h> #include <sys/errno.h> #include <sys/ktrace.h> #include <sys/kdebug.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> +#include <sys/kauth.h> #include <bsm/audit_kernel.h> extern void -unix_syscall( - struct savearea *regs -); +unix_syscall(struct savearea *regs); +void +unix_syscall_return(int error); extern struct savearea * find_user_regs( - thread_act_t act); + thread_t act); extern void enter_funnel_section(funnel_t *funnel_lock); extern void exit_funnel_section(void); @@ -60,11 +64,9 @@ extern void exit_funnel_section(void); * Outputs: none */ void -unix_syscall( - struct savearea *regs -) +unix_syscall(struct savearea *regs) { - thread_act_t thread_act; + thread_t thread_act; struct uthread *uthread; struct proc *proc; struct sysent *callp; @@ -72,8 +74,9 @@ unix_syscall( unsigned short code; boolean_t flavor; int funnel_type; + unsigned int cancel_enable; - flavor = (((unsigned int)regs->save_r0) == NULL)? 1: 0; + flavor = (((unsigned int)regs->save_r0) == 0)? 
1: 0; if (flavor) code = regs->save_r3; @@ -88,49 +91,83 @@ unix_syscall( KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, regs->save_r3, regs->save_r4, regs->save_r5, regs->save_r6, 0); } - thread_act = current_act(); + thread_act = current_thread(); uthread = get_bsdthread_info(thread_act); - if (!(uthread->uu_flag & P_VFORK)) + if (!(uthread->uu_flag & UT_VFORK)) proc = (struct proc *)get_bsdtask_info(current_task()); else proc = current_proc(); + /* + * Delayed binding of thread credential to process credential, if we + * are not running with an explicitly set thread credential. + */ + if (uthread->uu_ucred != proc->p_ucred && + (uthread->uu_flag & UT_SETUID) == 0) { + kauth_cred_t old = uthread->uu_ucred; + proc_lock(proc); + uthread->uu_ucred = proc->p_ucred; + kauth_cred_ref(uthread->uu_ucred); + proc_unlock(proc); + if (old != NOCRED) + kauth_cred_rele(old); + } + uthread->uu_ar0 = (int *)regs; callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; -#ifdef DEBUG - if (callp->sy_narg > 8) - panic("unix_syscall: max arg count exceeded"); -#endif - if (callp->sy_narg != 0) { + void *regsp; + sy_munge_t *mungerp; + + if (IS_64BIT_PROCESS(proc)) { + /* XXX Turn 64 bit unsafe calls into nosys() */ + if (callp->sy_funnel & UNSAFE_64BIT) { + callp = &sysent[63]; + goto unsafe; + } + mungerp = callp->sy_arg_munge64; + } + else { + mungerp = callp->sy_arg_munge32; + } if ( !flavor) { - uthread->uu_arg[0] = regs->save_r3; - uthread->uu_arg[1] = regs->save_r4; - uthread->uu_arg[2] = regs->save_r5; - uthread->uu_arg[3] = regs->save_r6; - uthread->uu_arg[4] = regs->save_r7; - uthread->uu_arg[5] = regs->save_r8; - uthread->uu_arg[6] = regs->save_r9; - uthread->uu_arg[7] = regs->save_r10; + regsp = (void *) ®s->save_r3; } else { - uthread->uu_arg[0] = regs->save_r4; - uthread->uu_arg[1] = regs->save_r5; - uthread->uu_arg[2] = regs->save_r6; - uthread->uu_arg[3] = regs->save_r7; - uthread->uu_arg[4] = regs->save_r8; - uthread->uu_arg[5] = regs->save_r9; - uthread->uu_arg[7] = regs->save_r10; + /* indirect system call consumes an argument so only 7 are supported */ + if (callp->sy_narg > 7) { + callp = &sysent[63]; + goto unsafe; + } + regsp = (void *) ®s->save_r4; } + /* call syscall argument munger to copy in arguments (see xnu/bsd/dev/ppc/munge.s) */ + (*mungerp)(regsp, (void *) &uthread->uu_arg[0]); } - funnel_type = (int)callp->sy_funnel; +unsafe: + cancel_enable = callp->sy_cancel; + + if (cancel_enable == _SYSCALL_CANCEL_NONE) { + uthread->uu_flag |= UT_NOTCANCELPT; + } else { + if((uthread->uu_flag & (UT_CANCELDISABLE | UT_CANCEL | UT_CANCELED)) == UT_CANCEL) { + if (cancel_enable == _SYSCALL_CANCEL_PRE) { + /* system call cancelled; return to handle cancellation */ + regs->save_r3 = (long long)EINTR; + thread_exception_return(); + /* NOTREACHED */ + } else { + thread_abort_safely(thread_act); + } + } + } + + funnel_type = (int)(callp->sy_funnel & FUNNEL_MASK); if (funnel_type == KERNEL_FUNNEL) enter_funnel_section(kernel_flock); - else if (funnel_type == NETWORK_FUNNEL) - enter_funnel_section(network_flock); uthread->uu_rval[0] = 0; @@ -150,12 +187,20 @@ unix_syscall( regs->save_srr0 += 4; if (KTRPOINT(proc, KTR_SYSCALL)) - ktrsyscall(proc, code, callp->sy_narg, uthread->uu_arg, funnel_type); + ktrsyscall(proc, code, callp->sy_narg, uthread->uu_arg); +#ifdef JOE_DEBUG + uthread->uu_iocount = 0; + uthread->uu_vpindex = 0; +#endif AUDIT_SYSCALL_ENTER(code, proc, uthread); error = (*(callp->sy_call))(proc, (void *)uthread->uu_arg, &(uthread->uu_rval[0])); 
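[Aside: the effect of the (*mungerp)(regsp, &uthread->uu_arg[0]) call above is easiest to see in C. Below is a minimal sketch of what a "wlw" munger does under the conventions documented in munge.s, assuming the big-endian savearea layout in which each GPR occupies a 64-bit slot with a 32-bit argument in its low word. The name munge_wlw_c and the standalone form are hypothetical; the kernel uses the hand-written PPC assembly, not this code.]

/*
 * Hypothetical C rendering of the "wlw" munger: a word, a long-long
 * passed in two registers, then another word. Assumes each register
 * is a 64-bit slot whose low (offset +4) word holds the 32-bit value,
 * matching the 0*8+4 / 1*8+4 / ... loads in the assembly above.
 */
#include <stdint.h>

void
munge_wlw_c(const void *regs, void *uu_args)
{
	const uint32_t *in = (const uint32_t *)regs;	/* word view of the register array */
	uint64_t *out = (uint64_t *)uu_args;

	/* 'w': zero-extend the low word of the first register */
	out[0] = (uint64_t)in[1];
	/* 'l': a 64-bit long-long arrives in the low words of two registers */
	out[1] = ((uint64_t)in[3] << 32) | (uint64_t)in[5];
	/* trailing 'w': zero-extend the low word of the fourth register */
	out[2] = (uint64_t)in[7];
}

[Composing the 64-bit value in C sidesteps the endian dependency that the munge.s comment notes; the real mungers instead store explicit zero/pad words and low words separately. End of aside; the diff resumes below.]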
AUDIT_SYSCALL_EXIT(error, proc, uthread); +#ifdef JOE_DEBUG + if (uthread->uu_iocount) + joe_debug("system call returned with uu_iocount != 0"); +#endif regs = find_user_regs(thread_act); if (error == ERESTART) { @@ -166,42 +211,124 @@ unix_syscall( /* set the "pc" to execute cerror routine */ regs->save_srr0 -= 4; } else { /* (not error) */ - regs->save_r3 = uthread->uu_rval[0]; - regs->save_r4 = uthread->uu_rval[1]; + switch (callp->sy_return_type) { + case _SYSCALL_RET_INT_T: + regs->save_r3 = uthread->uu_rval[0]; + regs->save_r4 = uthread->uu_rval[1]; + break; + case _SYSCALL_RET_UINT_T: + regs->save_r3 = ((u_int)uthread->uu_rval[0]); + regs->save_r4 = ((u_int)uthread->uu_rval[1]); + break; + case _SYSCALL_RET_OFF_T: + /* off_t returns 64 bits split across two registers for 32 bit */ + /* process and in one register for 64 bit process */ + if (IS_64BIT_PROCESS(proc)) { + u_int64_t *retp = (u_int64_t *)&uthread->uu_rval[0]; + regs->save_r3 = *retp; + regs->save_r4 = 0; + } + else { + regs->save_r3 = uthread->uu_rval[0]; + regs->save_r4 = uthread->uu_rval[1]; + } + break; + case _SYSCALL_RET_ADDR_T: + case _SYSCALL_RET_SIZE_T: + case _SYSCALL_RET_SSIZE_T: + /* the variable length return types (user_addr_t, user_ssize_t, + * and user_size_t) are always the largest possible size in the + * kernel (we use uu_rval[0] and [1] as one 64 bit value). + */ + { + user_addr_t *retp = (user_addr_t *)&uthread->uu_rval[0]; + regs->save_r3 = *retp; + regs->save_r4 = 0; + } + break; + case _SYSCALL_RET_NONE: + break; + default: + panic("unix_syscall: unknown return type"); + break; + } } } /* else (error == EJUSTRETURN) { nothing } */ - if (KTRPOINT(proc, KTR_SYSRET)) - ktrsysret(proc, code, error, uthread->uu_rval[0], funnel_type); - exit_funnel_section(); + if (KTRPOINT(proc, KTR_SYSRET)) { + switch(callp->sy_return_type) { + case _SYSCALL_RET_ADDR_T: + case _SYSCALL_RET_SIZE_T: + case _SYSCALL_RET_SSIZE_T: + /* + * Trace the value of the least significant bits, + * until we can revise the ktrace API safely. + */ + ktrsysret(proc, code, error, uthread->uu_rval[1]); + break; + default: + ktrsysret(proc, code, error, uthread->uu_rval[0]); + break; + } + } + if (cancel_enable == _SYSCALL_CANCEL_NONE) + uthread->uu_flag &= ~UT_NOTCANCELPT; + + exit_funnel_section(); + + if (uthread->uu_lowpri_delay) { + /* + * task is marked as a low priority I/O type + * and the I/O we issued while in this system call + * collided with normal I/O operations... 
we'll + * delay in order to mitigate the impact of this + * task on the normal operation of the system + */ + IOSleep(uthread->uu_lowpri_delay); + uthread->uu_lowpri_delay = 0; + } if (kdebug_enable && (code != 180)) { - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[0], uthread->uu_rval[1], 0, 0); + + if (callp->sy_return_type == _SYSCALL_RET_SSIZE_T) + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, + error, uthread->uu_rval[1], 0, 0, 0); + else + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, + error, uthread->uu_rval[0], uthread->uu_rval[1], 0, 0); } thread_exception_return(); /* NOTREACHED */ } -unix_syscall_return(error) +void +unix_syscall_return(int error) { - thread_act_t thread_act; + thread_t thread_act; struct uthread *uthread; struct proc *proc; struct savearea *regs; unsigned short code; struct sysent *callp; int funnel_type; + unsigned int cancel_enable; - thread_act = current_act(); + thread_act = current_thread(); proc = current_proc(); uthread = get_bsdthread_info(thread_act); regs = find_user_regs(thread_act); + if (regs->save_r0 != 0) + code = regs->save_r0; + else + code = regs->save_r3; + + callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; + /* * Get index into sysent table */ @@ -213,29 +340,91 @@ unix_syscall_return(error) /* set the "pc" to execute cerror routine */ regs->save_srr0 -= 4; } else { /* (not error) */ - regs->save_r3 = uthread->uu_rval[0]; - regs->save_r4 = uthread->uu_rval[1]; + switch (callp->sy_return_type) { + case _SYSCALL_RET_INT_T: + regs->save_r3 = uthread->uu_rval[0]; + regs->save_r4 = uthread->uu_rval[1]; + break; + case _SYSCALL_RET_UINT_T: + regs->save_r3 = ((u_int)uthread->uu_rval[0]); + regs->save_r4 = ((u_int)uthread->uu_rval[1]); + break; + case _SYSCALL_RET_OFF_T: + /* off_t returns 64 bits split across two registers for 32 bit */ + /* process and in one register for 64 bit process */ + if (IS_64BIT_PROCESS(proc)) { + u_int64_t *retp = (u_int64_t *)&uthread->uu_rval[0]; + regs->save_r3 = *retp; + } + else { + regs->save_r3 = uthread->uu_rval[0]; + regs->save_r4 = uthread->uu_rval[1]; + } + break; + case _SYSCALL_RET_ADDR_T: + case _SYSCALL_RET_SIZE_T: + case _SYSCALL_RET_SSIZE_T: + /* the variable length return types (user_addr_t, user_ssize_t, + * and user_size_t) are always the largest possible size in the + * kernel (we use uu_rval[0] and [1] as one 64 bit value). + */ + { + u_int64_t *retp = (u_int64_t *)&uthread->uu_rval[0]; + regs->save_r3 = *retp; + } + break; + case _SYSCALL_RET_NONE: + break; + default: + panic("unix_syscall: unknown return type"); + break; + } } } /* else (error == EJUSTRETURN) { nothing } */ - if (regs->save_r0 != NULL) - code = regs->save_r0; - else - code = regs->save_r3; - - callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; + if (KTRPOINT(proc, KTR_SYSRET)) { + switch(callp->sy_return_type) { + case _SYSCALL_RET_ADDR_T: + case _SYSCALL_RET_SIZE_T: + case _SYSCALL_RET_SSIZE_T: + /* + * Trace the value of the least significant bits, + * until we can revise the ktrace API safely. 
+ */ + ktrsysret(proc, code, error, uthread->uu_rval[1]); + break; + default: + ktrsysret(proc, code, error, uthread->uu_rval[0]); + break; + } + } - funnel_type = (int)callp->sy_funnel; + cancel_enable = callp->sy_cancel; - if (KTRPOINT(proc, KTR_SYSRET)) - ktrsysret(proc, code, error, uthread->uu_rval[0], funnel_type); + if (cancel_enable == _SYSCALL_CANCEL_NONE) + uthread->uu_flag &= ~UT_NOTCANCELPT; - exit_funnel_section(); + exit_funnel_section(); + if (uthread->uu_lowpri_delay) { + /* + * task is marked as a low priority I/O type + * and the I/O we issued while in this system call + * collided with normal I/O operations... we'll + * delay in order to mitigate the impact of this + * task on the normal operation of the system + */ + IOSleep(uthread->uu_lowpri_delay); + uthread->uu_lowpri_delay = 0; + } if (kdebug_enable && (code != 180)) { - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[0], uthread->uu_rval[1], 0, 0); + if (callp->sy_return_type == _SYSCALL_RET_SSIZE_T) + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, + error, uthread->uu_rval[1], 0, 0, 0); + else + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, + error, uthread->uu_rval[0], uthread->uu_rval[1], 0, 0); } thread_exception_return(); @@ -251,35 +440,81 @@ unix_syscall_return(error) * and decrementing interval timers, optionally reloading the interval * timers when they expire. */ -struct gettimeofday_args{ - struct timeval *tp; - struct timezone *tzp; -}; /* NOTE THIS implementation is for ppc architectures only. * It is infrequently called, since the commpage intercepts * most calls in user mode. + * + * XXX Y2038 bug because of assumed return of 32 bit seconds value, and + * XXX first parameter to clock_gettimeofday() */ int -ppc_gettimeofday(p, uap, retval) - struct proc *p; - register struct gettimeofday_args *uap; - register_t *retval; +ppc_gettimeofday(__unused struct proc *p, + register struct ppc_gettimeofday_args *uap, + register_t *retval) { int error = 0; + extern lck_spin_t * tz_slock; if (uap->tp) clock_gettimeofday(&retval[0], &retval[1]); if (uap->tzp) { struct timezone ltz; - extern simple_lock_data_t tz_slock; - usimple_lock(&tz_slock); + lck_spin_lock(tz_slock); ltz = tz; - usimple_unlock(&tz_slock); - error = copyout((caddr_t)<z, (caddr_t)uap->tzp, sizeof (tz)); + lck_spin_unlock(tz_slock); + error = copyout((caddr_t)<z, uap->tzp, sizeof (tz)); } return (error); } +#ifdef JOE_DEBUG +joe_debug(char *p) { + + printf("%s\n", p); +} +#endif + + +/* + * WARNING - this is a temporary workaround for binary compatibility issues + * with anti-piracy software that relies on patching ptrace (3928003). + * This KPI will be removed in the system release after Tiger. 
+ */ +uintptr_t temp_patch_ptrace(uintptr_t new_ptrace) +{ + struct sysent * callp; + sy_call_t * old_ptrace; + + if (new_ptrace == 0) + return(0); + + enter_funnel_section(kernel_flock); + callp = &sysent[26]; + old_ptrace = callp->sy_call; + + /* only allow one patcher of ptrace */ + if (old_ptrace == (sy_call_t *) ptrace) { + callp->sy_call = (sy_call_t *) new_ptrace; + } + else { + old_ptrace = NULL; + } + exit_funnel_section( ); + + return((uintptr_t)old_ptrace); +} + +void temp_unpatch_ptrace(void) +{ + struct sysent * callp; + + enter_funnel_section(kernel_flock); + callp = &sysent[26]; + callp->sy_call = (sy_call_t *) ptrace; + exit_funnel_section( ); + + return; +} diff --git a/bsd/dev/ppc/unix_signal.c b/bsd/dev/ppc/unix_signal.c index 501bc87e8..75a700d51 100644 --- a/bsd/dev/ppc/unix_signal.c +++ b/bsd/dev/ppc/unix_signal.c @@ -27,24 +27,48 @@ #include <mach/exception_types.h> #include <sys/param.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/user.h> #include <sys/ucontext.h> +#include <sys/sysproto.h> +#include <sys/systm.h> +#include <sys/ux_exception.h> #include <ppc/signal.h> #include <sys/signalvar.h> #include <sys/kdebug.h> #include <sys/wait.h> #include <kern/thread.h> -#include <kern/thread_act.h> #include <mach/ppc/thread_status.h> #include <ppc/proc_reg.h> -#define C_REDZONE_LEN 224 -#define C_STK_ALIGN 16 -#define C_PARAMSAVE_LEN 64 -#define C_LINKAGE_LEN 48 -#define TRUNC_DOWN(a,b,c) (((((unsigned)a)-(b))/(c)) * (c)) +// #include <machine/thread.h> XXX include path messed up for some reason... + +/* XXX functions not in a Mach headers */ +extern kern_return_t thread_getstatus(register thread_t act, int flavor, + thread_state_t tstate, mach_msg_type_number_t *count); +extern int is_64signalregset(void); +extern unsigned int get_msr_exportmask(void); +extern kern_return_t thread_setstatus(thread_t thread, int flavor, + thread_state_t tstate, mach_msg_type_number_t count); +extern void ppc_checkthreadstate(void *, int); +extern struct savearea_vec *find_user_vec_curr(void); +extern int thread_enable_fpe(thread_t act, int onoff); + + + +#define C_32_REDZONE_LEN 224 +#define C_32_STK_ALIGN 16 +#define C_32_PARAMSAVE_LEN 64 +#define C_32_LINKAGE_LEN 48 + +#define C_64_REDZONE_LEN 320 +#define C_64_STK_ALIGN 32 +#define C_64_PARAMSAVE_LEN 64 +#define C_64_LINKAGE_LEN 48 + +#define TRUNC_DOWN32(a,b,c) ((((uint32_t)a)-(b)) & ((uint32_t)(-(c)))) +#define TRUNC_DOWN64(a,b,c) ((((uint64_t)a)-(b)) & ((uint64_t)(-(c)))) /* * The stack layout possibilities (info style); This needs to mach with signal trampoline code @@ -82,42 +106,99 @@ #define UC_FLAVOR64_VEC_SIZE ((PPC_THREAD_STATE64_COUNT + PPC_EXCEPTION_STATE64_COUNT + PPC_FLOAT_STATE_COUNT + PPC_VECTOR_STATE_COUNT) * sizeof(int)) +/* + * NOTE: Source and target may *NOT* overlap! + */ +static void +ucontext_32to64(struct ucontext64 *in, struct user_ucontext64 *out) +{ + out->uc_onstack = in->uc_onstack; + out->uc_sigmask = in->uc_sigmask; + + /* internal "structure assign" */ + out->uc_stack.ss_sp = CAST_USER_ADDR_T(in->uc_stack.ss_sp); + out->uc_stack.ss_size = in->uc_stack.ss_size; + out->uc_stack.ss_flags = in->uc_stack.ss_flags; + + out->uc_link = CAST_USER_ADDR_T(in->uc_link); + out->uc_mcsize = in->uc_mcsize; + out->uc_mcontext64 = CAST_USER_ADDR_T(in->uc_mcontext64); +} + +/* + * This conversion is safe, since if we are converting for a 32 bit process, + * then it's values of uc-stack.ss_size and uc_mcsize will never exceed 4G. + * + * NOTE: Source and target may *NOT* overlap! 
+ */ +static void +ucontext_64to32(struct user_ucontext64 *in, struct ucontext64 *out) +{ + out->uc_onstack = in->uc_onstack; + out->uc_sigmask = in->uc_sigmask; + + /* internal "structure assign" */ + out->uc_stack.ss_sp = CAST_DOWN(void *,in->uc_stack.ss_sp); + out->uc_stack.ss_size = in->uc_stack.ss_size; /* range reduction */ + out->uc_stack.ss_flags = in->uc_stack.ss_flags; + + out->uc_link = CAST_DOWN(void *,in->uc_link); + out->uc_mcsize = in->uc_mcsize; /* range reduction */ + out->uc_mcontext64 = CAST_DOWN(void *,in->uc_mcontext64); +} + +/* + * NOTE: Source and target may *NOT* overlap! + */ +static void +siginfo_64to32(user_siginfo_t *in, siginfo_t *out) +{ + out->si_signo = in->si_signo; + out->si_errno = in->si_errno; + out->si_code = in->si_code; + out->si_pid = in->si_pid; + out->si_uid = in->si_uid; + out->si_status = in->si_status; + out->si_addr = CAST_DOWN(void *,in->si_addr); + /* following cast works for sival_int because of padding */ + out->si_value.sival_ptr = CAST_DOWN(void *,in->si_value.sival_ptr); + out->si_band = in->si_band; /* range reduction */ + out->pad[0] = in->pad[0]; /* mcontext.ss.r1 */ +} + + /* * Arrange for this process to run a signal handler */ void -sendsig(p, catcher, sig, mask, code) - struct proc *p; - sig_t catcher; - int sig, mask; - u_long code; +sendsig(struct proc *p, user_addr_t catcher, int sig, int mask, __unused u_long code) { kern_return_t kretn; - struct mcontext mctx, *p_mctx; - struct mcontext64 mctx64, *p_mctx64; - struct ucontext uctx, *p_uctx; - siginfo_t sinfo, *p_sinfo; + struct mcontext mctx; + user_addr_t p_mctx = USER_ADDR_NULL; /* mcontext dest. */ + struct mcontext64 mctx64; + user_addr_t p_mctx64 = USER_ADDR_NULL; /* mcontext dest. */ + struct user_ucontext64 uctx; + user_addr_t p_uctx; /* user stack addr top copy ucontext */ + user_siginfo_t sinfo; + user_addr_t p_sinfo; /* user stack addr top copy siginfo */ struct sigacts *ps = p->p_sigacts; - int framesize; int oonstack; - unsigned long sp; - unsigned long state_count; - thread_act_t th_act; + user_addr_t sp; + mach_msg_type_number_t state_count; + thread_t th_act; struct uthread *ut; - unsigned long paramp,linkp; int infostyle = UC_TRAD; int dualcontext =0; - sig_t trampact; + user_addr_t trampact; int vec_used = 0; int stack_size = 0; - int stack_flags = 0; void * tstate; int flavor; int ctx32 = 1; - int is_64signalregset(void); - th_act = current_act(); + th_act = current_thread(); ut = get_bsdthread_info(th_act); @@ -128,15 +209,21 @@ sendsig(p, catcher, sig, mask, code) dualcontext = 1; infostyle = UC_DUAL; } - if (p->p_sigacts->ps_64regset & sigmask(sig)) { + if (p->p_sigacts->ps_64regset & sigmask(sig)) { dualcontext = 0; ctx32 = 0; infostyle = UC_FLAVOR64; } - if (is_64signalregset() && (infostyle == UC_TRAD)) { + /* treat 64 bit processes as having used 64 bit registers */ + if ((IS_64BIT_PROCESS(p) || is_64signalregset()) && + (infostyle == UC_TRAD)) { ctx32=0; infostyle = UC_TRAD64; - } + } + if (IS_64BIT_PROCESS(p)) { + ctx32=0; + dualcontext = 0; + } /* I need this for SIGINFO anyway */ flavor = PPC_THREAD_STATE; @@ -199,6 +286,7 @@ sendsig(p, catcher, sig, mask, code) state_count = PPC_VECTOR_STATE_COUNT; if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) goto bad; + infostyle += 5; } if ((ctx32 == 0) || dualcontext) { @@ -207,8 +295,8 @@ sendsig(p, catcher, sig, mask, code) state_count = PPC_VECTOR_STATE_COUNT; if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) goto bad; 
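[Aside: a quick sanity check of the TRUNC_DOWN32/TRUNC_DOWN64 arithmetic defined earlier in this file. The sketch below subtracts the 32-bit ABI red zone and masks the stack pointer down to the required alignment, the same two steps sendsig performs when carving the signal frame; the starting stack pointer value is made up for illustration.]

/*
 * Worked example of TRUNC_DOWN32: step over the red zone, then round
 * down to the alignment boundary by masking with -(align).
 */
#include <assert.h>
#include <stdint.h>

#define TRUNC_DOWN32(a,b,c)	((((uint32_t)(a)) - (b)) & ((uint32_t)(-(c))))

int
main(void)
{
	uint32_t sp = 0xbffff3a9;	/* arbitrary misaligned user stack pointer */

	/* C_32_REDZONE_LEN = 224, C_32_STK_ALIGN = 16 */
	uint32_t new_sp = TRUNC_DOWN32(sp, 224, 16);	/* 0xbffff2c0 */

	assert((new_sp & 15) == 0);	/* 16-byte aligned */
	assert(sp - new_sp >= 224);	/* red zone left intact */
	return 0;
}

[The same pattern with C_64_REDZONE_LEN (320) and C_64_STK_ALIGN (32) covers the 64-bit case via TRUNC_DOWN64. End of aside; the diff resumes below.]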
+ infostyle += 5; } - infostyle += 5; } trampact = ps->ps_trampact[sig]; @@ -217,54 +305,69 @@ sendsig(p, catcher, sig, mask, code) /* figure out where our new stack lives */ if ((ps->ps_flags & SAS_ALTSTACK) && !oonstack && (ps->ps_sigonstack & sigmask(sig))) { - sp = (unsigned long)(ps->ps_sigstk.ss_sp); + sp = ps->ps_sigstk.ss_sp; sp += ps->ps_sigstk.ss_size; stack_size = ps->ps_sigstk.ss_size; ps->ps_sigstk.ss_flags |= SA_ONSTACK; } else { if (ctx32 == 0) - sp = (unsigned int)mctx64.ss.r1; + sp = mctx64.ss.r1; else - sp = mctx.ss.r1; + sp = CAST_USER_ADDR_T(mctx.ss.r1); } /* put siginfo on top */ /* preserve RED ZONE area */ - sp = TRUNC_DOWN(sp, C_REDZONE_LEN, C_STK_ALIGN); + if (IS_64BIT_PROCESS(p)) + sp = TRUNC_DOWN64(sp, C_64_REDZONE_LEN, C_64_STK_ALIGN); + else + sp = TRUNC_DOWN32(sp, C_32_REDZONE_LEN, C_32_STK_ALIGN); /* next are the saved registers */ if ((ctx32 == 0) || dualcontext) { - sp -= sizeof(*p_mctx64); - p_mctx64 = (struct mcontext64 *)sp; + sp -= sizeof(struct mcontext64); + p_mctx64 = sp; } if ((ctx32 == 1) || dualcontext) { - sp -= sizeof(*p_mctx); - p_mctx = (struct mcontext *)sp; + sp -= sizeof(struct mcontext); + p_mctx = sp; } - /* context goes first on stack */ - sp -= sizeof(*p_uctx); - p_uctx = (struct ucontext *) sp; - - /* this is where siginfo goes on stack */ - sp -= sizeof(*p_sinfo); - p_sinfo = (siginfo_t *) sp; - - /* C calling conventions, create param save and linkage - * areas - */ - - sp = TRUNC_DOWN(sp, C_PARAMSAVE_LEN, C_STK_ALIGN); - paramp = sp; - sp -= C_LINKAGE_LEN; - linkp = sp; + if (IS_64BIT_PROCESS(p)) { + /* context goes first on stack */ + sp -= sizeof(struct user_ucontext64); + p_uctx = sp; + + /* this is where siginfo goes on stack */ + sp -= sizeof(user_siginfo_t); + p_sinfo = sp; + + sp = TRUNC_DOWN64(sp, C_64_PARAMSAVE_LEN+C_64_LINKAGE_LEN, C_64_STK_ALIGN); + } else { + /* + * struct ucontext and struct ucontext64 are identical in + * size and content; the only difference is the internal + * pointer type for the last element, which makes no + * difference for the copyout(). 
+ */ + + /* context goes first on stack */ + sp -= sizeof(struct ucontext64); + p_uctx = sp; + + /* this is where siginfo goes on stack */ + sp -= sizeof(siginfo_t); + p_sinfo = sp; + + sp = TRUNC_DOWN32(sp, C_32_PARAMSAVE_LEN+C_32_LINKAGE_LEN, C_32_STK_ALIGN); + } uctx.uc_onstack = oonstack; uctx.uc_sigmask = mask; - uctx.uc_stack.ss_sp = (char *)sp; + uctx.uc_stack.ss_sp = sp; uctx.uc_stack.ss_size = stack_size; if (oonstack) uctx.uc_stack.ss_flags |= SS_ONSTACK; @@ -279,15 +382,20 @@ sendsig(p, catcher, sig, mask, code) uctx.uc_mcsize += (size_t)(PPC_VECTOR_STATE_COUNT * sizeof(int)); if (ctx32 == 0) - uctx.uc_mcontext = (void *)p_mctx64; + uctx.uc_mcontext64 = p_mctx64; else - uctx.uc_mcontext = (void *)p_mctx; + uctx.uc_mcontext64 = p_mctx; /* setup siginfo */ - bzero((caddr_t)&sinfo, sizeof(siginfo_t)); + bzero((caddr_t)&sinfo, sizeof(user_siginfo_t)); sinfo.si_signo = sig; - sinfo.si_addr = (void *)mctx.ss.srr0; - sinfo.pad[0] = (unsigned int)mctx.ss.r1; + if (ctx32 == 0) { + sinfo.si_addr = mctx64.ss.srr0; + sinfo.pad[0] = mctx64.ss.r1; + } else { + sinfo.si_addr = CAST_USER_ADDR_T(mctx.ss.srr0); + sinfo.pad[0] = CAST_USER_ADDR_T(mctx.ss.r1); + } switch (sig) { case SIGCHLD: @@ -311,15 +419,30 @@ sendsig(p, catcher, sig, mask, code) } break; case SIGILL: - sinfo.si_addr = (void *)mctx.ss.srr0; - if (mctx.ss.srr1 & (1 << (31 - SRR1_PRG_ILL_INS_BIT))) - sinfo.si_code = ILL_ILLOPC; - else if (mctx.ss.srr1 & (1 << (31 - SRR1_PRG_PRV_INS_BIT))) - sinfo.si_code = ILL_PRVOPC; - else if (mctx.ss.srr1 & (1 << (31 - SRR1_PRG_TRAP_BIT))) - sinfo.si_code = ILL_ILLTRP; - else - sinfo.si_code = ILL_NOOP; + /* + * If it's 64 bit and not a dual context, mctx will + * contain uninitialized data, so we have to use + * mctx64 here. + */ + if(ctx32 == 0) { + if (mctx64.ss.srr1 & (1 << (31 - SRR1_PRG_ILL_INS_BIT))) + sinfo.si_code = ILL_ILLOPC; + else if (mctx64.ss.srr1 & (1 << (31 - SRR1_PRG_PRV_INS_BIT))) + sinfo.si_code = ILL_PRVOPC; + else if (mctx64.ss.srr1 & (1 << (31 - SRR1_PRG_TRAP_BIT))) + sinfo.si_code = ILL_ILLTRP; + else + sinfo.si_code = ILL_NOOP; + } else { + if (mctx.ss.srr1 & (1 << (31 - SRR1_PRG_ILL_INS_BIT))) + sinfo.si_code = ILL_ILLOPC; + else if (mctx.ss.srr1 & (1 << (31 - SRR1_PRG_PRV_INS_BIT))) + sinfo.si_code = ILL_PRVOPC; + else if (mctx.ss.srr1 & (1 << (31 - SRR1_PRG_TRAP_BIT))) + sinfo.si_code = ILL_ILLTRP; + else + sinfo.si_code = ILL_NOOP; + } break; case SIGFPE: #define FPSCR_VX 2 @@ -327,36 +450,75 @@ sendsig(p, catcher, sig, mask, code) #define FPSCR_UX 4 #define FPSCR_ZX 5 #define FPSCR_XX 6 - sinfo.si_addr = (void *)mctx.ss.srr0; - if (mctx.fs.fpscr & (1 << (31 - FPSCR_VX))) - sinfo.si_code = FPE_FLTINV; - else if (mctx.fs.fpscr & (1 << (31 - FPSCR_OX))) - sinfo.si_code = FPE_FLTOVF; - else if (mctx.fs.fpscr & (1 << (31 - FPSCR_UX))) - sinfo.si_code = FPE_FLTUND; - else if (mctx.fs.fpscr & (1 << (31 - FPSCR_ZX))) - sinfo.si_code = FPE_FLTDIV; - else if (mctx.fs.fpscr & (1 << (31 - FPSCR_XX))) - sinfo.si_code = FPE_FLTRES; - else - sinfo.si_code = FPE_NOOP; + /* + * If it's 64 bit and not a dual context, mctx will + * contain uninitialized data, so we have to use + * mctx64 here. 
+ */ + if(ctx32 == 0) { + if (mctx64.fs.fpscr & (1 << (31 - FPSCR_VX))) + sinfo.si_code = FPE_FLTINV; + else if (mctx64.fs.fpscr & (1 << (31 - FPSCR_OX))) + sinfo.si_code = FPE_FLTOVF; + else if (mctx64.fs.fpscr & (1 << (31 - FPSCR_UX))) + sinfo.si_code = FPE_FLTUND; + else if (mctx64.fs.fpscr & (1 << (31 - FPSCR_ZX))) + sinfo.si_code = FPE_FLTDIV; + else if (mctx64.fs.fpscr & (1 << (31 - FPSCR_XX))) + sinfo.si_code = FPE_FLTRES; + else + sinfo.si_code = FPE_NOOP; + } else { + if (mctx.fs.fpscr & (1 << (31 - FPSCR_VX))) + sinfo.si_code = FPE_FLTINV; + else if (mctx.fs.fpscr & (1 << (31 - FPSCR_OX))) + sinfo.si_code = FPE_FLTOVF; + else if (mctx.fs.fpscr & (1 << (31 - FPSCR_UX))) + sinfo.si_code = FPE_FLTUND; + else if (mctx.fs.fpscr & (1 << (31 - FPSCR_ZX))) + sinfo.si_code = FPE_FLTDIV; + else if (mctx.fs.fpscr & (1 << (31 - FPSCR_XX))) + sinfo.si_code = FPE_FLTRES; + else + sinfo.si_code = FPE_NOOP; + } break; case SIGBUS: - sinfo.si_addr = (void *)mctx.ss.srr0; + if (ctx32 == 0) { + sinfo.si_addr = mctx64.es.dar; + } else { + sinfo.si_addr = CAST_USER_ADDR_T(mctx.es.dar); + } /* on ppc we generate only if EXC_PPC_UNALIGNED */ sinfo.si_code = BUS_ADRALN; break; case SIGSEGV: - sinfo.si_addr = (void *)mctx.ss.srr0; - /* First check in srr1 and then in dsisr */ - if (mctx.ss.srr1 & (1 << (31 - DSISR_PROT_BIT))) - sinfo.si_code = SEGV_ACCERR; - else if (mctx.es.dsisr & (1 << (31 - DSISR_PROT_BIT))) - sinfo.si_code = SEGV_ACCERR; - else - sinfo.si_code = SEGV_MAPERR; + /* + * If it's 64 bit and not a dual context, mctx will + * contain uninitialized data, so we have to use + * mctx64 here. + */ + if (ctx32 == 0) { + sinfo.si_addr = mctx64.es.dar; + /* First check in srr1 and then in dsisr */ + if (mctx64.ss.srr1 & (1 << (31 - DSISR_PROT_BIT))) + sinfo.si_code = SEGV_ACCERR; + else if (mctx64.es.dsisr & (1 << (31 - DSISR_PROT_BIT))) + sinfo.si_code = SEGV_ACCERR; + else + sinfo.si_code = SEGV_MAPERR; + } else { + sinfo.si_addr = CAST_USER_ADDR_T(mctx.es.dar); + /* First check in srr1 and then in dsisr */ + if (mctx.ss.srr1 & (1 << (31 - DSISR_PROT_BIT))) + sinfo.si_code = SEGV_ACCERR; + else if (mctx.es.dsisr & (1 << (31 - DSISR_PROT_BIT))) + sinfo.si_code = SEGV_ACCERR; + else + sinfo.si_code = SEGV_MAPERR; + } break; default: break; @@ -364,37 +526,69 @@ sendsig(p, catcher, sig, mask, code) /* copy info out to user space */ - if (copyout((caddr_t)&uctx, (caddr_t)p_uctx, sizeof(struct ucontext))) - goto bad; - if (copyout((caddr_t)&sinfo, (caddr_t)p_sinfo, sizeof(siginfo_t))) - goto bad; - if ((ctx32 == 0) || dualcontext) { - tstate = &mctx64; - if (copyout((caddr_t)tstate, (caddr_t)p_mctx64, (vec_used? UC_FLAVOR64_VEC_SIZE: UC_FLAVOR64_SIZE))) + if (IS_64BIT_PROCESS(p)) { + if (copyout(&uctx, p_uctx, sizeof(struct user_ucontext64))) goto bad; - } - if ((ctx32 == 1) || dualcontext) { - tstate = &mctx; - if (copyout((caddr_t)tstate, (caddr_t)p_mctx, uctx.uc_mcsize)) + if (copyout(&sinfo, p_sinfo, sizeof(user_siginfo_t))) goto bad; - } + } else { + struct ucontext64 uctx32; + siginfo_t sinfo32; + ucontext_64to32(&uctx, &uctx32); + if (copyout(&uctx32, p_uctx, sizeof(struct ucontext64))) + goto bad; - /* Place our arguments in arg registers: rtm dependent */ + siginfo_64to32(&sinfo,&sinfo32); + if (copyout(&sinfo32, p_sinfo, sizeof(siginfo_t))) + goto bad; + } + if ((ctx32 == 0) || dualcontext) { + /* + * NOTE: Size of mcontext is not variant between 64bit and + * 32bit programs usng 64bit registers. + */ + if (copyout(&mctx64, p_mctx64, (vec_used? 
UC_FLAVOR64_VEC_SIZE: UC_FLAVOR64_SIZE))) + goto bad; + } + if ((ctx32 == 1) || dualcontext) { + if (copyout(&mctx, p_mctx, uctx.uc_mcsize)) + goto bad; + } - mctx.ss.r3 = (unsigned long)catcher; - mctx.ss.r4 = (unsigned long)infostyle; - mctx.ss.r5 = (unsigned long)sig; - mctx.ss.r6 = (unsigned long)p_sinfo; - mctx.ss.r7 = (unsigned long)p_uctx; - mctx.ss.srr0 = (unsigned long)trampact; - mctx.ss.srr1 = get_msr_exportmask(); /* MSR_EXPORT_MASK_SET */ - mctx.ss.r1 = sp; - state_count = PPC_THREAD_STATE_COUNT; - if ((kretn = thread_setstatus(th_act, PPC_THREAD_STATE, &mctx.ss, &state_count)) != KERN_SUCCESS) { - panic("sendsig: thread_setstatus failed, ret = %08X\n", kretn); - } + /* Place our arguments in arg registers: rtm dependent */ + if(IS_64BIT_PROCESS(p)) { + mctx64.ss.r3 = catcher; + mctx64.ss.r4 = CAST_USER_ADDR_T(infostyle); + mctx64.ss.r5 = CAST_USER_ADDR_T(sig); + mctx64.ss.r6 = p_sinfo; + mctx64.ss.r7 = p_uctx; + + mctx64.ss.srr0 = trampact; + /* MSR_EXPORT_MASK_SET */ + mctx64.ss.srr1 = CAST_USER_ADDR_T(get_msr_exportmask()); + mctx64.ss.r1 = sp; + state_count = PPC_THREAD_STATE64_COUNT; + if ((kretn = thread_setstatus(th_act, PPC_THREAD_STATE64, (void *)&mctx64.ss, state_count)) != KERN_SUCCESS) { + panic("sendsig: thread_setstatus failed, ret = %08X\n", kretn); + } + } else { + mctx.ss.r3 = CAST_DOWN(unsigned long,catcher); + mctx.ss.r4 = (unsigned long)infostyle; + mctx.ss.r5 = (unsigned long)sig; + mctx.ss.r6 = CAST_DOWN(unsigned long,p_sinfo); + mctx.ss.r7 = CAST_DOWN(unsigned long,p_uctx); + + mctx.ss.srr0 = CAST_DOWN(unsigned long,trampact); + /* MSR_EXPORT_MASK_SET */ + mctx.ss.srr1 = get_msr_exportmask(); + mctx.ss.r1 = CAST_DOWN(unsigned long,sp); + state_count = PPC_THREAD_STATE_COUNT; + if ((kretn = thread_setstatus(th_act, PPC_THREAD_STATE, (void *)&mctx.ss, state_count)) != KERN_SUCCESS) { + panic("sendsig: thread_setstatus failed, ret = %08X\n", kretn); + } + } return; bad: @@ -419,167 +613,50 @@ bad: * a machine fault. 
*/ -#define FOR64_TRANSITION 1 - - -#ifdef FOR64_TRANSITION - -struct osigreturn_args { - struct ucontext *uctx; -}; - /* ARGSUSED */ int -osigreturn(p, uap, retval) - struct proc *p; - struct osigreturn_args *uap; - int *retval; +sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval) { - struct ucontext uctx; - struct ucontext *p_uctx; - struct mcontext64 mctx64; - struct mcontext64 *p_64mctx; - struct mcontext *p_mctx; - int error; - thread_act_t th_act; - struct sigacts *ps = p->p_sigacts; - sigset_t mask; - register sig_t action; - unsigned long state_count; - unsigned int state_flavor; - struct uthread * ut; - int vec_used = 0; - void *tsptr, *fptr, *vptr, *mactx; - void ppc_checkthreadstate(void *, int); - - th_act = current_act(); - /* lets use the larger one */ - mactx = (void *)&mctx64; - - ut = (struct uthread *)get_bsdthread_info(th_act); - if (error = copyin(uap->uctx, &uctx, sizeof(struct ucontext))) { - return(error); - } - - /* validate the machine context size */ - switch (uctx.uc_mcsize) { - case UC_FLAVOR64_VEC_SIZE : - case UC_FLAVOR64_SIZE : - case UC_FLAVOR_VEC_SIZE : - case UC_FLAVOR_SIZE: - break; - default: - return(EINVAL); - } - - if (error = copyin(uctx.uc_mcontext, mactx, uctx.uc_mcsize)) { - return(error); - } - - if (uctx.uc_onstack & 01) - p->p_sigacts->ps_sigstk.ss_flags |= SA_ONSTACK; - else - p->p_sigacts->ps_sigstk.ss_flags &= ~SA_ONSTACK; - - ut->uu_sigmask = uctx.uc_sigmask & ~sigcantmask; - if (ut->uu_siglist & ~ut->uu_sigmask) - signal_setast(current_act()); - - vec_used = 0; - switch (uctx.uc_mcsize) { - case UC_FLAVOR64_VEC_SIZE : - vec_used = 1; - case UC_FLAVOR64_SIZE : { - p_64mctx = (struct mcontext64 *)mactx; - tsptr = (void *)&p_64mctx->ss; - fptr = (void *)&p_64mctx->fs; - vptr = (void *)&p_64mctx->vs; - state_flavor = PPC_THREAD_STATE64; - state_count = PPC_THREAD_STATE64_COUNT; - } - break; - case UC_FLAVOR_VEC_SIZE : - vec_used = 1; - case UC_FLAVOR_SIZE: - default: { - p_mctx = (struct mcontext *)mactx; - tsptr = (void *)&p_mctx->ss; - fptr = (void *)&p_mctx->fs; - vptr = (void *)&p_mctx->vs; - state_flavor = PPC_THREAD_STATE; - state_count = PPC_THREAD_STATE_COUNT; - } - break; - } /* switch () */ - - /* validate the thread state, set/reset appropriate mode bits in srr1 */ - (void)ppc_checkthreadstate(tsptr, state_flavor); + struct user_ucontext64 uctx; - if (thread_setstatus(th_act, state_flavor, tsptr, &state_count) != KERN_SUCCESS) { - return(EINVAL); - } - - state_count = PPC_FLOAT_STATE_COUNT; - if (thread_setstatus(th_act, PPC_FLOAT_STATE, fptr, &state_count) != KERN_SUCCESS) { - return(EINVAL); - } - - mask = sigmask(SIGFPE); - if (((ut->uu_sigmask & mask) == 0) && (p->p_sigcatch & mask) && ((p->p_sigignore & mask) == 0)) { - action = ps->ps_sigact[SIGFPE]; - if((action != SIG_DFL) && (action != SIG_IGN)) { - thread_enable_fpe(th_act, 1); - } - } - - if (vec_used) { - state_count = PPC_VECTOR_STATE_COUNT; - if (thread_setstatus(th_act, PPC_VECTOR_STATE, vptr, &state_count) != KERN_SUCCESS) { - return(EINVAL); - } - } - return (EJUSTRETURN); -} - -#endif /* FOR64_TRANSITION */ - -struct sigreturn_args { - struct ucontext *uctx; - int infostyle; -}; - -/* ARGSUSED */ -int -sigreturn(p, uap, retval) - struct proc *p; - struct sigreturn_args *uap; - int *retval; -{ - struct ucontext uctx; - struct ucontext *p_uctx; - char mactx[sizeof(struct mcontext64)]; - struct mcontext *p_mctx; + char mactx[sizeof(struct mcontext64)]; + struct mcontext *p_mctx; struct mcontext64 *p_64mctx; int error; - thread_act_t th_act; + 
thread_t th_act; struct sigacts *ps = p->p_sigacts; sigset_t mask; - register sig_t action; + user_addr_t action; unsigned long state_count; unsigned int state_flavor; struct uthread * ut; int vec_used = 0; void *tsptr, *fptr, *vptr; int infostyle = uap->infostyle; - void ppc_checkthreadstate(void *, int); - th_act = current_act(); + th_act = current_thread(); ut = (struct uthread *)get_bsdthread_info(th_act); - if (error = copyin(uap->uctx, &uctx, sizeof(struct ucontext))) { - return(error); + if (IS_64BIT_PROCESS(p)) { + error = copyin(uap->uctx, &uctx, sizeof(struct user_ucontext64)); + if (error) + return(error); + } else { + struct ucontext64 uctx32; + + /* + * struct ucontext and struct ucontext64 are identical in + * size and content; the only difference is the internal + * pointer type for the last element, which makes no + * difference for the copyin(). + */ + error = copyin(uap->uctx, &uctx32, sizeof(struct ucontext)); + if (error) + return(error); + ucontext_32to64(&uctx32, &uctx); } + /* validate the machine context size */ switch (uctx.uc_mcsize) { case UC_FLAVOR64_VEC_SIZE: @@ -590,18 +667,23 @@ sigreturn(p, uap, retval) default: return(EINVAL); } - if (error = copyin(uctx.uc_mcontext, mactx, uctx.uc_mcsize)) { + + /* + * The 64 bit process mcontext is identical to the mcontext64, so + * there is no conversion necessary. + */ + error = copyin(uctx.uc_mcontext64, mactx, uctx.uc_mcsize); + if (error) return(error); - } - if (uctx.uc_onstack & 01) + if ((uctx.uc_onstack & 01)) p->p_sigacts->ps_sigstk.ss_flags |= SA_ONSTACK; else p->p_sigacts->ps_sigstk.ss_flags &= ~SA_ONSTACK; ut->uu_sigmask = uctx.uc_sigmask & ~sigcantmask; if (ut->uu_siglist & ~ut->uu_sigmask) - signal_setast(current_act()); + signal_setast(current_thread()); vec_used = 0; switch (infostyle) { @@ -637,12 +719,12 @@ sigreturn(p, uap, retval) /* validate the thread state, set/reset appropriate mode bits in srr1 */ (void)ppc_checkthreadstate(tsptr, state_flavor); - if (thread_setstatus(th_act, state_flavor, tsptr, &state_count) != KERN_SUCCESS) { + if (thread_setstatus(th_act, state_flavor, tsptr, state_count) != KERN_SUCCESS) { return(EINVAL); } state_count = PPC_FLOAT_STATE_COUNT; - if (thread_setstatus(th_act, PPC_FLOAT_STATE, fptr, &state_count) != KERN_SUCCESS) { + if (thread_setstatus(th_act, PPC_FLOAT_STATE, fptr, state_count) != KERN_SUCCESS) { return(EINVAL); } @@ -656,7 +738,7 @@ sigreturn(p, uap, retval) if (vec_used) { state_count = PPC_VECTOR_STATE_COUNT; - if (thread_setstatus(th_act, PPC_VECTOR_STATE, vptr, &state_count) != KERN_SUCCESS) { + if (thread_setstatus(th_act, PPC_VECTOR_STATE, vptr, state_count) != KERN_SUCCESS) { return(EINVAL); } } @@ -672,7 +754,7 @@ boolean_t machine_exception( int exception, int code, - int subcode, + __unused int subcode, int *unix_signal, int *unix_code ) diff --git a/bsd/dev/ppc/xsumas.s b/bsd/dev/ppc/xsumas.s index c83a688f1..dae54fb13 100644 --- a/bsd/dev/ppc/xsumas.s +++ b/bsd/dev/ppc/xsumas.s @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -19,231 +19,377 @@ * * @APPLE_LICENSE_HEADER_END@ */ -#define STANDALONE 0 - -#if STANDALONE -#include "asm.h" -#include "assym.h" -#include "proc_reg.h" /* For CACHE_LINE_SIZE */ - -#else - -#include <mach/ppc/asm.h> -#if 0 -/* #include <assym.h> */ -#include <ppc/proc_reg.h> /* For CACHE_LINE_SIZE */ -#endif 0 -#endif + +#define kShort 11 +#define cr1_gt 5 // bit 1 of cr1 /* - * Reg 3 - Pointer to data - * Reg 4 - Length of data - * Reg 5 - Accumulated sum value - * Reg 6 - Starting on odd boundary flag (relative to byte 0 of the checksumed data) - */ - -ENTRY(xsum_assym, TAG_NO_FRAME_USED) - - mr r11, r6 ; Swapped flag - addi r8, 0, 0 - addi r10, 0, 0x1f - addi r7, 0, 1 - addic r7, r7, 0 ; This clears the carry bit! - mr r12, r5 ; Save the passed-in checksum value - - /* - * Sum bytes before cache line boundary - */ - - cmpi cr0,0,r4,0 ; Check for length of 0 - beq Lleftovers - - and. r9, r3, r10 - beq Laligned32 ; 32 byte aligned - - andi. r9, r3, 0x3 - beq Laligned4 - - andi. r9, r3, 0x1 - beq Laligned2 ; 2 byte aligned - - addi r11, 0, 1 ; swap bytes at end - lbz r8, 0(r3) - add r3, r3, r7 - subf. r4, r7, r4 - beq Ldone - -Laligned2: - cmpi cr0,0,r4,2 ; If remaining length is less than two - go to wrap-up - blt Lleftovers - andi. r9, r3, 0x3 ; If aligned on a 4-byte boundary, go to that code - beq Laligned4 - lhz r5, 0(r3) ; Load and add a halfword to the checksum - adde r8, r8, r5 - slwi r7, r7, 1 - add r3, r3, r7 - subf. r4, r7, r4 - beq Ldone - - - /* - Add longwords up to the 32 byte boundary - */ - -Laligned4: - addi r7, 0, 4 -Lloop4: - cmpi cr0,0,r4,4 - blt Lleftovers - and. r9, r3, r10 - beq Laligned32 - lwz r5, 0(r3) - adde r8, r8, r5 - add r3, r3, r7 - subf. r4, r7, r4 - bne Lloop4 - b Ldone - - - /* - We're aligned on a 32 byte boundary now - add 8 longwords to checksum - until the remaining length is less than 32 - */ -Laligned32: - andis. r6, r4, 0xffff - bne Lmainloop - andi. r6, r4, 0xffe0 - beq Lleftovers - -Lmainloop: - addi r9, 0, 64 - addi r10, 0, 32 - cmpi cr0,0,r4,64 - blt Lnopretouch - dcbt r3, r10 ; Touch one cache-line ahead -Lnopretouch: - lwz r5, 0(r3) - - /* - * This is the main meat of the checksum. I attempted to arrange this code - * such that the processor would execute as many instructions as possible - * in parallel. - */ - -Lloop: - cmpi cr0,0,r4,96 - blt Lnotouch - dcbt r3, r9 ; Touch two cache lines ahead -Lnotouch: - adde r8, r8, r5 - lwz r5, 4(r3) - lwz r6, 8(r3) - lwz r7, 12(r3) - adde r8, r8, r5 - lwz r5, 16(r3) - adde r8, r8, r6 - lwz r6, 20(r3) - adde r8, r8, r7 - lwz r7, 24(r3) - adde r8, r8, r5 - lwz r5, 28(r3) - add r3, r3, r10 - adde r8, r8, r6 - adde r8, r8, r7 - adde r8, r8, r5 - subf r4, r10, r4 - andi. r6, r4, 0xffe0 - beq Lleftovers - lwz r5, 0(r3) - b Lloop - - /* - * Handle whatever bytes are left - */ - -Lleftovers: - /* - * Handle leftover bytes - */ - cmpi cr0,0,r4,0 - beq Ldone - - addi r7, 0, 1 - addi r10, 0, 0x7ffc - - and. r9, r4, r10 - bne Lfourormore - srw r10, r10, r7 - and. r9, r4, r10 - bne Ltwoormore - b Loneleft - -Lfourormore: - addi r10, 0, 4 - -Lfourloop: - lwz r5, 0(r3) - adde r8, r8, r5 - add r3, r3, r10 - subf r4, r10, r4 - andi. r6, r4, 0xfffc - bne Lfourloop - -Ltwoormore: - andi. 
r6, r4, 0xfffe - beq Loneleft - lhz r5, 0(r3) - adde r8, r8, r5 - addi r3, r3, 2 - subi r4, r4, 2 - -Loneleft: - cmpi cr0,0,r4,0 - beq Ldone - lbz r5, 0(r3) - slwi r5, r5, 8 - adde r8, r8, r5 - - /* - * Wrap the longword around, adding the two 16-bit portions - * to each other along with any previous and subsequent carries. - */ -Ldone: - addze r8, r8 ; Add the carry - addze r8, r8 ; Add the carry again (the last add may have carried) - andis. r6, r8, 0xffff ; Stuff r6 with the high order 16 bits of sum word - srwi r6, r6, 16 ; Shift it to the low order word - andi. r8, r8, 0xffff ; Zero out the high order word - add r8, r8, r6 ; Add the two halves - - andis. r6, r8, 0xffff ; Do the above again in case we carried into the - srwi r6, r6, 16 ; high order word with the last add. - andi. r8, r8, 0xffff - add r3, r8, r6 - - cmpi cr0,0,r11,0 ; Check to see if we need to swap the bytes - beq Ldontswap - - /* - * Our buffer began on an odd boundary, so we need to swap - * the checksum bytes. - */ - slwi r8, r3, 8 ; shift byte 0 to byte 1 - clrlwi r8, r8, 16 ; Clear top 16 bits - srwi r3, r3, 8 ; shift byte 1 to byte 0 - or r3, r8, r3 ; or them - -Ldontswap: - add r3, r3, r12 ; Add in the passed-in checksum - andis. r6, r3, 0xffff ; Wrap and add any carries into the top 16 bits - srwi r6, r6, 16 - andi. r3, r3, 0xffff - add r3, r3, r6 - - andis. r6, r3, 0xffff ; Do the above again in case we carried into the - srwi r6, r6, 16 ; high order word with the last add. - andi. r3, r3, 0xffff - add r3, r3, r6 - blr - - + * short xsum_assym( short *p, int len, short xsum, boolean odd); + * + * r3 - Pointer to data + * r4 - Length of data + * r5 - Accumulated sum value + * r6 - "Starting on odd address" flag (relative to byte 0 of the checksummed data) + * + * Note: If the "odd" flag is set, the address in r3 will be even. Nonetheless, we + * correctly handle the case where the flag is set and the address is odd. + * + * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum + * of the data, treated as an array of 16-bit integers. 1s-complement sums are done + * via "add with carry" operations on a 2s-complement machine like PPC. Note that + * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the + * final sum is folded down to 16 bits. On 32-bit machines we use "adde", which is + * perfect except that it serializes the adds on the carry bit. On 64-bit machines + * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding + * all 64 bits into a 16-bit sum at the end. We cannot use "adde" on 64-bit sums, + * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit + * is set on the low 32 bits of the sum.) + * + * Using Altivec is tempting, but the performance impact of the greatly increased + * number of exceptions and register save/restore traffic probably makes it impractical + * for now. + */ + .globl _xsum_assym + .globl _xsum_nop_if_32bit + .text + .align 5 +_xsum_assym: + cmplwi cr0,r4,kShort ; too short to word align? + rlwinm r2,r3,0,0x3 ; get byte offset in word + dcbt 0,r3 ; touch in 1st cache line + cmpwi cr6,r2,0 ; is address word aligned? + ble cr0,Lshort ; skip if too short to bother aligning + + subfic r0,r2,4 ; get #bytes in partial word + cmplwi cr1,r6,0 ; set cr1_gt if "starting on odd address" flag is set + addic r0,r0,0 ; turn off carry + beq cr6,Laligned ; skip if already word aligned (r2==0 if aligned) + +; Partial word at start: zero filled on left, it becomes initial checksum.
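; In C terms the fill below is roughly (a sketch, assuming big-endian byte
; order, with byte_offset = r2):
;   mask = 0xFFFFFFFF >> (byte_offset * 8);  // keep bytes at/after the start
;   sum  = first_word & mask;                // bytes before the start become 0
; The crxor also flips the "odd address" flag whenever byte_offset itself is
; odd, so the final byte-lane swap decision stays correct.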
+ + rlwinm r3,r3,0,0,29 ; word align address + mtcrf 0x01,r2 ; move byte offset to cr7 + lwz r6,0(r3) ; get partial word + li r7,-1 ; start of mask for partial fill + slwi r8,r2,3 ; multiply byte offset by 8 + sub r4,r4,r0 ; adjust length for bytes in partial word + crxor cr1_gt,31,cr1_gt; set flag if byte-lane swap will be necessary + srw r7,r7,r8 ; get mask for bytes to keep in partial word + addi r3,r3,4 ; point to next word of input + and r2,r6,r7 ; zero fill on left + +; Address is now word aligned. Prepare for inner loop over 32-byte chunks. +; r2 = initial checksum +; r3 = word aligned address +; r4 = length remaining +; r5 = accumulated sum parameter +; carry = off +; cr1_gt = "starting on odd address" flag + +Laligned: + srwi. r0,r4,5 ; get count of 32-byte chunks + mtcrf 0x02,r4 ; move residual length to cr6 and cr7 + mtcrf 0x01,r4 + beq cr0,Lleftovers ; no chunks + + mtctr r0 ; set up loop count + li r4,32 ; offset to next chunk +_xsum_nop_if_32bit: + b L64BitPath ; use the 64-bit path (patched to nop on 32-bit machine) + dcbt r4,r3 ; touch in 2nd cache line + li r0,96 ; get touch offset + b LInnerLoop32 ; enter 32-bit loop + +; Inner loop for 32-bit machines. + + .align 4 +LInnerLoop32: + lwz r4,0(r3) + lwz r6,4(r3) + lwz r7,8(r3) + lwz r8,12(r3) + adde r2,r2,r4 + lwz r9,16(r3) + adde r2,r2,r6 + lwz r10,20(r3) + adde r2,r2,r7 + lwz r11,24(r3) + adde r2,r2,r8 + lwz r12,28(r3) + adde r2,r2,r9 + dcbt r3,r0 + adde r2,r2,r10 + addi r3,r3,32 + adde r2,r2,r11 + adde r2,r2,r12 + bdnz+ LInnerLoop32 + +; Handle leftover bytes. +; r2 = checksum so far +; r3 = word aligned address +; r5 = accumulated sum parameter +; carry = live +; cr1_gt = "starting on odd address" flag +; cr6,cr7 = residual length + +Lleftovers: + bf 27,Lleftover8 ; test 0x10 bit of residual length + lwz r4,0(r3) + lwz r6,4(r3) + lwz r7,8(r3) + lwz r8,12(r3) + addi r3,r3,16 + adde r2,r2,r4 + adde r2,r2,r6 + adde r2,r2,r7 + adde r2,r2,r8 +Lleftover8: + bf 28,Lleftover4 + lwz r4,0(r3) + lwz r6,4(r3) + addi r3,r3,8 + adde r2,r2,r4 + adde r2,r2,r6 +Lleftover4: + bf 29,Lleftover2 + lwz r4,0(r3) + addi r3,r3,4 + adde r2,r2,r4 +Lleftover2: + bf 30,Lleftover1 + lhz r4,0(r3) + addi r3,r3,2 + adde r2,r2,r4 +Lleftover1: + bf 31,Lwrapup + lbz r4,0(r3) + slwi r4,r4,8 ; shift last byte into proper lane + adde r2,r2,r4 + +; All data bytes checksummed. Wrap up. +; r2 = checksum so far (word parallel) +; r5 = accumulated sum parameter +; carry = live +; cr1_gt = "starting on odd address" flag + +Lwrapup: + addze r2,r2 ; add in last carry + addze r2,r2 ; in case the "addze" carries +Lwrapupx: ; here from short-operand case, with xer(ca) undefined + srwi r6,r2,16 ; top half of 32-bit checksum + rlwinm r7,r2,0,0xFFFF ; lower half + add r2,r6,r7 ; add them together + srwi r6,r2,16 ; then do it again, in case first carried + rlwinm r7,r2,0,0xFFFF + add r2,r6,r7 + bf cr1_gt,Lswapped ; test "starting on odd address" flag + +; The checksum began on an odd address, so swap bytes. + + rlwinm r6,r2,24,0x00FF ; move top byte to bottom + rlwinm r7,r2,8,0xFF00 ; bottom to top + or r2,r6,r7 ; rejoin + +; Finally, add in checksum passed in as a parameter. + +Lswapped: + add r2,r2,r5 ; add passed-in checksum + srwi r6,r2,16 ; top half of 32-bit checksum + rlwinm r7,r2,0,0xFFFF ; lower half + add r2,r6,r7 ; add them together + srwi r6,r2,16 ; then do it again, in case first carried + rlwinm r7,r2,0,0xFFFF + add r3,r6,r7 ; steer result into r3 + blr + +; Handle short operands. Do a halfword at a time. 
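; (With at most kShort = 11 bytes there are no more than five halfword adds,
; so a plain "add" cannot carry out of 32 bits and "adde" is unnecessary.
; The fold performed at Lwrapup/Lwrapupx above is, as a C sketch:
;   sum = (sum >> 16) + (sum & 0xFFFF);   // fold high half into low
;   sum = (sum >> 16) + (sum & 0xFFFF);   // again, in case that add carried
; )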
+; r3 = address +; r4 = length (<= kShort) +; r5 = accumulated sum parameter +; r6 = "starting on odd byte" flag + +Lshort: + cmpwi cr6,r4,2 ; at least two bytes? + andi. r0,r4,1 ; odd length? + li r2,0 ; initialize checksum + cmplwi cr1,r6,0 ; set cr1_gt if "starting on odd address" flag is set + blt cr6,Lshort2 ; fewer than two bytes, so skip +Lshort1: + cmpwi cr6,r4,4 ; two more bytes (after we decrement)? + lhz r7,0(r3) + subi r4,r4,2 + addi r3,r3,2 + add r2,r2,r7 ; note no need for "adde" + bge cr6,Lshort1 ; loop for 2 more bytes +Lshort2: + beq Lwrapupx ; no byte at end, proceed to checkout with carry undefined + lbz r7,0(r3) + slwi r7,r7,8 ; shift last byte into proper lane + add r2,r2,r7 + b Lwrapupx + +; Handle 64-bit machine. The major improvement over the 32-bit path is that we use +; four parallel 32-bit accumulators, which carry into the upper half naturally so we +; do not have to use "adde", which serializes on the carry bit. Note that we cannot +; do 64-bit "adde"s, because we run in 32-bit mode so carry would not be set correctly. +; r2 = checksum so far (ie, the zero-filled partial first word) +; r3 = word aligned address +; r5 = accumulated sum parameter +; ctr = number of 32-byte chunks of input +; carry = unused in this code +; cr1_gt = "starting on odd address" flag +; cr6,cr7 = residual length + +L64BitPath: + stw r13,-4(r1) ; save a few nonvolatile regs in red zone so we can use them + stw r14,-8(r1) + stw r15,-12(r1) + stw r16,-16(r1) + li r0,128 ; to touch next line + li r13,0 ; r13-r15 are the accumulators, so initialize them + dcbt r3,r0 ; touch in next cache line, and keep loads away from the above stores + lwz r4,0(r3) ; start pipeline by loading first 32 bytes into r4, r6-r12 + lwz r6,4(r3) + lwz r7,8(r3) + mr r14,r2 ; just copy incoming partial word into one of the accumulators + li r15,0 + lwz r8,12(r3) + lwz r9,16(r3) + li r16,0 + li r0,256 ; get touch offset + lwz r10,20(r3) + lwz r11,24(r3) + lwz r12,28(r3) ; load last word of previous chunk + addi r3,r3,32 ; skip past the chunk + bdnz++ LInnerLoop64 ; enter loop if another chunk to go + + b LAddLastChunk ; only one chunk + +; Inner loop for 64-bit processors. This loop is scheduled for the 970. +; It is pipelined (loads are one iteration ahead of adds), and unrolled. +; It should take 9-10 cycles per iteration, which consumes 64 bytes of input. + + .align 5 +LInnerLoop64: ; 64 bytes/iteration + add r13,r13,r4 ; cycle 1 + add r14,r14,r6 + dcbt r3,r0 ; touch in 2 lines ahead + lwz r4,0(r3) + + add r15,r15,r7 ; cycle 2, etc + lwz r6,4(r3) + lwz r7,8(r3) + add r16,r16,r8 + + lwz r8,12(r3) + add r13,r13,r9 + add r14,r14,r10 + lwz r9,16(r3) + + add r15,r15,r11 + lwz r10,20(r3) + lwz r11,24(r3) + add r16,r16,r12 + bdz-- LEarlyExit ; early exit if no more chunks + + lwz r12,28(r3) + add r13,r13,r4 + add r14,r14,r6 + lwz r4,32(r3) + + add r15,r15,r7 + lwz r6,36(r3) + lwz r7,40(r3) + add r16,r16,r8 + + lwz r8,44(r3) + add r13,r13,r9 + add r14,r14,r10 + lwz r9,48(r3) + + add r15,r15,r11 + lwz r10,52(r3) + lwz r11,56(r3) + add r16,r16,r12 + + nop ; position last load in 2nd dispatch slot + lwz r12,60(r3) + addi r3,r3,64 + bdnz++ LInnerLoop64 + + b LAddLastChunk + +; Add in the last 32-byte chunk, and any leftover bytes. 
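; (The 64-bit inner loop keeps its loads one iteration ahead of the adds, so
; on arrival here the final 32 bytes are still live in r4 and r6-r12. The
; accumulator scheme, as a C sketch with hypothetical names:
;   uint64_t a0, a1, a2, a3;        // four independent 64-bit sums
;   a0 += w0; a1 += w1; ...         // 32-bit data words; carries collect in
;                                   // the upper halves, no serialization
; )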
+; r3 = word aligned address of next byte of data +; r5 = accumulated sum parameter +; r13-r16 = the four accumulators +; cr1_gt = "starting on odd address" flag +; cr6,cr7 = residual length + +LEarlyExit: ; here from middle of inner loop + lwz r12,28(r3) ; load last word of last chunk + addi r3,r3,32 +LAddLastChunk: ; last 32-byte chunk of input is in r4,r6-r12 + add r13,r13,r4 ; add in last chunk + add r14,r14,r6 ; these are 64-bit adds + add r15,r15,r7 + add r16,r16,r8 + add r13,r13,r9 + add r14,r14,r10 + add r15,r15,r11 + add r16,r16,r12 + +; Handle leftover bytes, if any. + + bf 27,Lleft1 ; test 0x10 bit of residual length + lwz r4,0(r3) + lwz r6,4(r3) + lwz r7,8(r3) + lwz r8,12(r3) + addi r3,r3,16 + add r13,r13,r4 + add r14,r14,r6 + add r15,r15,r7 + add r16,r16,r8 +Lleft1: + bf 28,Lleft2 + lwz r4,0(r3) + lwz r6,4(r3) + addi r3,r3,8 + add r13,r13,r4 + add r14,r14,r6 +Lleft2: + bf 29,Lleft3 + lwz r4,0(r3) + addi r3,r3,4 + add r14,r14,r4 +Lleft3: + bf 30,Lleft4 + lhz r4,0(r3) + addi r3,r3,2 + add r15,r15,r4 +Lleft4: + bf 31,Lleft5 + lbz r4,0(r3) + slwi r4,r4,8 ; shift last byte into proper lane + add r16,r16,r4 + +; All data bytes have been checksummed. Now we must add together the four +; accumulators and restore the regs from the red zone. +; r3 = word aligned address of next byte of data +; r5 = accumulated sum parameter +; r13-r16 = the four accumulators +; carry = not used so far +; cr1_gt = "starting on odd address" flag + +Lleft5: + add r8,r13,r14 ; add the four accumulators together + add r9,r15,r16 + lwz r13,-4(r1) ; start to restore nonvolatiles from red zone + lwz r14,-8(r1) + add r8,r8,r9 ; now r8 is 64-bit sum of the four accumulators + lwz r15,-12(r1) + lwz r16,-16(r1) + srdi r7,r8,32 ; get upper half of 64-bit sum + addc r2,r7,r8 ; finally, do a 32-bit add of the two halves of r8 (setting carry) + b Lwrapup ; merge r2, r5, and carry into a 16-bit checksum diff --git a/bsd/dev/random/YarrowCoreLib/port/smf.c b/bsd/dev/random/YarrowCoreLib/port/smf.c index 838a87601..297fe0f58 100644 --- a/bsd/dev/random/YarrowCoreLib/port/smf.c +++ b/bsd/dev/random/YarrowCoreLib/port/smf.c @@ -29,10 +29,12 @@ #include <dev/random/YarrowCoreLib/src/smf.h> #include <sys/malloc.h> +#include <sys/systm.h> -SMFAPI void mmInit() +SMFAPI void mmInit( void ) { + return; } SMFAPI MMPTR mmMalloc(DWORD request) @@ -62,8 +64,9 @@ SMFAPI LPVOID mmGetPtr(MMPTR ptrnum) return (LPVOID)ptrnum; } -SMFAPI void mmReturnPtr(MMPTR ptrnum) +SMFAPI void mmReturnPtr(__unused MMPTR ptrnum) { /* nothing */ + return; } diff --git a/bsd/dev/random/YarrowCoreLib/src/comp.c b/bsd/dev/random/YarrowCoreLib/src/comp.c index 91c1844a0..8d2faeea0 100644 --- a/bsd/dev/random/YarrowCoreLib/src/comp.c +++ b/bsd/dev/random/YarrowCoreLib/src/comp.c @@ -32,24 +32,26 @@ #ifdef YARROW_KERNEL /* null compression */ -comp_error_status comp_init(COMP_CTX* ctx) +comp_error_status comp_init(__unused COMP_CTX* ctx) { return COMP_SUCCESS; } -comp_error_status comp_add_data(COMP_CTX* ctx,Bytef* inp,uInt inplen) +comp_error_status comp_add_data( __unused COMP_CTX* ctx, + __unused Bytef* inp, + __unused uInt inplen ) { return COMP_SUCCESS; } -comp_error_status comp_get_ratio(COMP_CTX* ctx,float* out) +comp_error_status comp_get_ratio( __unused COMP_CTX* ctx,float* out ) { *out = 1.0; return COMP_SUCCESS; } -comp_error_status comp_end(COMP_CTX* ctx) +comp_error_status comp_end( __unused COMP_CTX* ctx ) { return COMP_SUCCESS; } diff --git a/bsd/dev/random/YarrowCoreLib/src/prng.c b/bsd/dev/random/YarrowCoreLib/src/prng.c index 
5fe3bfd94..e2ba0a2ee 100644 --- a/bsd/dev/random/YarrowCoreLib/src/prng.c +++ b/bsd/dev/random/YarrowCoreLib/src/prng.c @@ -182,9 +182,10 @@ cleanup_slow_init: /* In-place modified bubble sort */ static void -bubbleSort(UINT *data,UINT len) +bubbleSort( UINT *data, LONG len ) { - UINT i,last,newlast,temp; + LONG i,last,newlast; + UINT temp; last = len-1; while(last!=-1) @@ -476,7 +477,7 @@ prngStretch(BYTE *inbuf,UINT inbuflen,BYTE *outbuf,UINT outbuflen) { /* Add entropy to the PRNG from a source */ prng_error_status -prngInput(PRNG *p, BYTE *inbuf,UINT inbuflen,UINT poolnum,UINT estbits) +prngInput(PRNG *p, BYTE *inbuf,UINT inbuflen,UINT poolnum, __unused UINT estbits) { #ifndef YARROW_KERNEL comp_error_status resp; @@ -513,13 +514,15 @@ prng_error_status prngAllowReseed(PRNG *p, LONGLONG ticks) { UINT temp[TOTAL_SOURCES]; - UINT i,sum; + LONG i; + UINT sum; #ifndef KERNEL_BUILD float ratio; #endif +#ifndef KERNEL_BUILD comp_error_status resp; - +#endif CHECKSTATE(p); diff --git a/bsd/dev/random/YarrowCoreLib/src/sha1mod.c b/bsd/dev/random/YarrowCoreLib/src/sha1mod.c index 03dfa5a23..f58585865 100644 --- a/bsd/dev/random/YarrowCoreLib/src/sha1mod.c +++ b/bsd/dev/random/YarrowCoreLib/src/sha1mod.c @@ -27,6 +27,9 @@ By Steve Reid <steve@edmweb.com> */ /* Header portion split from main code for convenience (AYB 3/02/98) */ #include "sha1mod.h" +#ifdef SHA1HANDSOFF +#include <string.h> +#endif #define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits)))) @@ -58,7 +61,7 @@ By Steve Reid <steve@edmweb.com> /* Hash a single 512-bit block. This is the core of the algorithm. */ -void SHA1Transform(unsigned long state[5], unsigned char buffer[64]) +void SHA1Transform(unsigned long state[5], const unsigned char buffer[64]) { unsigned long a, b, c, d, e; typedef union { @@ -127,7 +130,7 @@ void SHA1Init(SHA1_CTX* context) /* Run your data through this. */ -void SHA1Update(SHA1_CTX* context, unsigned char* data, unsigned int len) +void SHA1Update(SHA1_CTX* context, const unsigned char* data, unsigned int len) { unsigned int i, j; @@ -158,9 +161,9 @@ unsigned char finalcount[8]; finalcount[i] = (unsigned char)((context->count[(i >= 4 ?
0 : 1)] >> ((3-(i & 3)) * 8) ) & 255); /* Endian independent */ } - SHA1Update(context, (unsigned char *)"\200", 1); + SHA1Update(context, "\200", 1); while ((context->count[0] & 504) != 448) { - SHA1Update(context, (unsigned char *)"\0", 1); + SHA1Update(context, "\0", 1); } SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */ for (i = 0; i < 20; i++) { diff --git a/bsd/dev/random/YarrowCoreLib/src/sha1mod.h b/bsd/dev/random/YarrowCoreLib/src/sha1mod.h index 839168e8b..c066767bb 100644 --- a/bsd/dev/random/YarrowCoreLib/src/sha1mod.h +++ b/bsd/dev/random/YarrowCoreLib/src/sha1mod.h @@ -53,9 +53,9 @@ typedef struct { } SHA1_CTX; //Function forward declarations -void SHA1Transform(unsigned long state[5], unsigned char buffer[64]); +void SHA1Transform(unsigned long state[5], const unsigned char buffer[64]); void SHA1Init(SHA1_CTX* context); -void SHA1Update(SHA1_CTX* context, unsigned char* data, unsigned int len); +void SHA1Update(SHA1_CTX* context, const unsigned char* data, unsigned int len); void SHA1Final(unsigned char digest[20], SHA1_CTX* context); #endif /* __SHA1_H__ */ diff --git a/bsd/dev/random/YarrowCoreLib/src/smf.h b/bsd/dev/random/YarrowCoreLib/src/smf.h index 538b815f4..ad4fcf321 100644 --- a/bsd/dev/random/YarrowCoreLib/src/smf.h +++ b/bsd/dev/random/YarrowCoreLib/src/smf.h @@ -79,7 +79,7 @@ extern "C" { #define MM_NULL 0 /* Function forward declarations */ -SMFAPI void mmInit(); +SMFAPI void mmInit( void ); SMFAPI MMPTR mmMalloc(DWORD request); SMFAPI void mmFree(MMPTR ptrnum); SMFAPI LPVOID mmGetPtr(MMPTR ptrnum); diff --git a/bsd/dev/random/YarrowCoreLib/src/yarrowUtils.c b/bsd/dev/random/YarrowCoreLib/src/yarrowUtils.c index a441c2780..bd8b19794 100644 --- a/bsd/dev/random/YarrowCoreLib/src/yarrowUtils.c +++ b/bsd/dev/random/YarrowCoreLib/src/yarrowUtils.c @@ -36,6 +36,7 @@ */ #include "dev/random/YarrowCoreLib/include/yarrowUtils.h" +#include <string.h> void trashMemory(void* mem, int len) diff --git a/bsd/dev/random/randomdev.c b/bsd/dev/random/randomdev.c index dfb37cf51..747656750 100644 --- a/bsd/dev/random/randomdev.c +++ b/bsd/dev/random/randomdev.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999, 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -31,6 +31,7 @@ #include <kern/lock.h> #include <sys/time.h> #include <sys/malloc.h> +#include <sys/uio_internal.h> #include <dev/random/randomdev.h> #include <dev/random/YarrowCoreLib/include/yarrow.h> @@ -49,9 +50,9 @@ static struct cdevsw random_cdevsw = random_close, /* close */ random_read, /* read */ random_write, /* write */ - random_ioctl, /* ioctl */ - nulldev, /* stop */ - nulldev, /* reset */ + random_ioctl, /* ioctl */ + (stop_fcn_t *)nulldev, /* stop */ + (reset_fcn_t *)nulldev, /* reset */ NULL, /* tty's */ eno_select, /* select */ eno_mmap, /* mmap */ @@ -69,10 +70,11 @@ static mutex_t *gYarrowMutex = 0; #define RESEED_TICKS 50 /* how long a reseed operation can take */ + /* * Initialize ONLY the Yarrow generator.
*/ -void PreliminarySetup () +void PreliminarySetup( void ) { prng_error_status perr; struct timeval tt; @@ -120,7 +122,7 @@ void PreliminarySetup () * and to register ourselves with devfs */ void -random_init() +random_init( void ) { int ret; @@ -152,12 +154,8 @@ random_init() } int -random_ioctl(dev, cmd, data, flag, p) - dev_t dev; - u_long cmd; - caddr_t data; - int flag; - struct proc *p; +random_ioctl( __unused dev_t dev, u_long cmd, __unused caddr_t data, + __unused int flag, __unused struct proc *p ) { switch (cmd) { case FIONBIO: @@ -176,7 +174,7 @@ random_ioctl(dev, cmd, data, flag, p) */ int -random_open(dev_t dev, int flags, int devtype, struct proc *p) +random_open(__unused dev_t dev, int flags, __unused int devtype, __unused struct proc *p) { if (gRandomError != 0) { /* forget it, yarrow didn't come up */ @@ -191,7 +189,7 @@ random_open(dev_t dev, int flags, int devtype, struct proc *p) if (securelevel >= 2) return (EPERM); #ifndef __APPLE__ - if ((securelevel >= 1) && suser(p->p_ucred, &p->p_acflag)) + if ((securelevel >= 1) && proc_suser(p)) return (EPERM); #endif /* !__APPLE__ */ } @@ -205,7 +203,7 @@ random_open(dev_t dev, int flags, int devtype, struct proc *p) */ int -random_close(dev_t dev, int flags, int mode, struct proc *p) +random_close(__unused dev_t dev, __unused int flags, __unused int mode, __unused struct proc *p) { return (0); } @@ -216,7 +214,7 @@ random_close(dev_t dev, int flags, int mode, struct proc *p) * prng. */ int -random_write (dev_t dev, struct uio *uio, int ioflag) +random_write (__unused dev_t dev, struct uio *uio, __unused int ioflag) { int retCode = 0; char rdBuffer[256]; @@ -230,9 +228,10 @@ random_write (dev_t dev, struct uio *uio, int ioflag) /* Security server is sending us entropy */ - while (uio->uio_resid > 0 && retCode == 0) { + while (uio_resid(uio) > 0 && retCode == 0) { /* get the user's data */ - int bytesToInput = min(uio->uio_resid, sizeof (rdBuffer)); + // LP64todo - fix this! uio_resid may be 64-bit value + int bytesToInput = min(uio_resid(uio), sizeof (rdBuffer)); retCode = uiomove(rdBuffer, bytesToInput, uio); if (retCode != 0) goto /*ugh*/ error_exit; @@ -263,7 +262,7 @@ error_exit: /* do this to make sure the mutex unlocks. */ * return data to the caller. Results unpredictable. */ int -random_read(dev_t dev, struct uio *uio, int ioflag) +random_read(__unused dev_t dev, struct uio *uio, __unused int ioflag) { int retCode = 0; char wrBuffer[512]; @@ -274,9 +273,10 @@ random_read(dev_t dev, struct uio *uio, int ioflag) /* lock down the mutex */ mutex_lock(gYarrowMutex); - while (uio->uio_resid > 0 && retCode == 0) { + while (uio_resid(uio) > 0 && retCode == 0) { /* get the user's data */ - int bytesToRead = min(uio->uio_resid, sizeof (wrBuffer)); + // LP64todo - fix this! uio_resid may be 64-bit value + int bytesToRead = min(uio_resid(uio), sizeof (wrBuffer)); /* get the data from Yarrow */ if (prngOutput(gPrngRef, (BYTE *) wrBuffer, sizeof (wrBuffer)) != 0) { @@ -317,7 +317,7 @@ read_random(void* buffer, u_int numbytes) * Return an unsigned long pseudo-random number. 
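* The result is drawn from the Yarrow pool via read_random(), so callers
* get cryptographically generated bits rather than a linear-congruential
* sequence.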
*/ u_long -RandomULong() +RandomULong( void ) { u_long buf; read_random(&buf, sizeof (buf)); diff --git a/bsd/dev/random/randomdev.h b/bsd/dev/random/randomdev.h index efa6703e9..e5c65aea1 100644 --- a/bsd/dev/random/randomdev.h +++ b/bsd/dev/random/randomdev.h @@ -29,12 +29,14 @@ #include <sys/random.h> +void PreliminarySetup( void ); +void random_init( void ); int random_open(dev_t dev, int flags, int devtype, struct proc *pp); int random_close(dev_t dev, int flags, int mode, struct proc *pp); int random_read(dev_t dev, struct uio *uio, int ioflag); int random_write(dev_t dev, struct uio *uio, int ioflag); -u_long RandomULong(); +u_long RandomULong( void ); #endif /* __APPLE_API_PRIVATE */ #endif /* __DEV_RANDOMDEV_H__ */ diff --git a/bsd/dev/ppc/unix_startup.c b/bsd/dev/unix_startup.c similarity index 55% rename from bsd/dev/ppc/unix_startup.c rename to bsd/dev/unix_startup.c index 717fddc9f..33070c045 100644 --- a/bsd/dev/ppc/unix_startup.c +++ b/bsd/dev/unix_startup.c @@ -1,14 +1,14 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ - * + * * The contents of this file constitute Original Code as defined in and * are subject to the Apple Public Source License Version 1.1 (the * "License"). You may not use this file except in compliance with the * License. Please obtain a copy of the License at * http://www.apple.com/publicsource and read it before using this file. - * + * * This Original Code and all software distributed under the License are * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -16,13 +16,14 @@ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the * License for the specific language governing rights and limitations * under the License. - * + * * @APPLE_LICENSE_HEADER_END@ */ /* * Copyright (c) 1992,7 NeXT Computer, Inc. * * Unix data structure initialization. + * */ #include <mach/mach_types.h> @@ -31,41 +32,53 @@ #include <mach/vm_prot.h> #include <sys/param.h> -#include <sys/buf.h> +#include <sys/buf_internal.h> #include <sys/clist.h> #include <sys/mbuf.h> #include <sys/systm.h> #include <sys/tty.h> #include <dev/ppc/cons.h> -extern vm_map_t mb_map; +extern vm_map_t mb_map; + +extern u_long tcp_sendspace; +extern u_long tcp_recvspace; + +void bsd_bufferinit(void); +extern void md_prepare_for_shutdown(int, int, char *); /* * Declare these as initialized data so we can patch them. 
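* (When NBUF is not configured, nbuf and niobuf start out zero and are
* sized from sane_size in bsd_startupearly() below.)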
*/ #ifdef NBUF -int nbuf = NBUF; -int niobuf = NBUF/2; +int nbuf = NBUF; +int niobuf = NBUF / 2; + #else -int nbuf = 0; -int niobuf = 0; +int nbuf = 0; +int niobuf = 0; + #endif -int srv = 0; /* Flag indicates a server boot when set */ -int ncl = 0; +int srv = 0; /* Flag indicates a server boot when set */ +int ncl = 0; + +vm_map_t buffer_map; +vm_map_t bufferhdr_map; -vm_map_t bufferhdr_map; + +extern void bsd_startupearly(void); void -bsd_startupearly() +bsd_startupearly(void) { - vm_offset_t firstaddr; - vm_size_t size; - kern_return_t ret; + vm_offset_t firstaddr; + vm_size_t size; + kern_return_t ret; if (nbuf == 0) - nbuf = atop_64(sane_size / 100); /* Get 1% of ram, but no more than we can map */ + nbuf = atop(sane_size / 100); /* Get 1% of ram, but no more than we can map */ if (nbuf > 8192) nbuf = 8192; if (nbuf < 256) @@ -78,39 +91,37 @@ bsd_startupearly() if (niobuf < 128) niobuf = 128; - size = (nbuf + niobuf) * sizeof (struct buf); - size = round_page_32(size); + size = (nbuf + niobuf) * sizeof(struct buf); + size = round_page(size); ret = kmem_suballoc(kernel_map, - &firstaddr, - size, - FALSE, - TRUE, - &bufferhdr_map); + &firstaddr, + size, + FALSE, + VM_FLAGS_ANYWHERE, + &bufferhdr_map); - if (ret != KERN_SUCCESS) + if (ret != KERN_SUCCESS) panic("Failed to create bufferhdr_map"); - + ret = kernel_memory_allocate(bufferhdr_map, - &firstaddr, - size, - 0, - KMA_HERE | KMA_KOBJECT); + &firstaddr, + size, + 0, + KMA_HERE | KMA_KOBJECT); if (ret != KERN_SUCCESS) panic("Failed to allocate bufferhdr_map"); - buf = (struct buf * )firstaddr; - bzero(buf,size); + buf = (struct buf *) firstaddr; + bzero(buf, size); - if ((sane_size > (64 * 1024 * 1024)) || ncl) { - int scale; - extern u_long tcp_sendspace; - extern u_long tcp_recvspace; + if (sane_size > (64 * 1024 * 1024) || ncl) { + int scale; if ((nmbclusters = ncl) == 0) { - if ((nmbclusters = ((sane_size / 16) / MCLBYTES)) > 16384) - nmbclusters = 16384; + if ((nmbclusters = ((sane_size / 16)/MCLBYTES)) > 32768) + nmbclusters = 32768; } if ((scale = nmbclusters / NMBCLUSTERS) > 1) { tcp_sendspace *= scale; @@ -125,38 +136,26 @@ bsd_startupearly() } void -bsd_bufferinit() +bsd_bufferinit(void) { - kern_return_t ret; + kern_return_t ret; - cons.t_dev = makedev(12, 0); + cons.t_dev = makedev(12, 0); bsd_startupearly(); - ret = kmem_suballoc(kernel_map, - (vm_offset_t *) &mbutl, - (vm_size_t) (nmbclusters * MCLBYTES), - FALSE, - TRUE, - &mb_map); + ret = kmem_suballoc(kernel_map, + (vm_offset_t *) & mbutl, + (vm_size_t) (nmbclusters * MCLBYTES), + FALSE, + VM_FLAGS_ANYWHERE, + &mb_map); - if (ret != KERN_SUCCESS) + if (ret != KERN_SUCCESS) panic("Failed to allocate mb_map\n"); - - /* - * Set up buffers, so they can be used to read disk labels. - */ - bufinit(); -} - -void -md_prepare_for_shutdown(int paniced, int howto, char * command) -{ - extern void IOSystemShutdownNotification(); - /* - * Temporary hack to notify the power management root domain - * that the system will shut down. - */ - IOSystemShutdownNotification(); + /* + * Set up buffers, so they can be used to read disk labels. 
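+ * bufinit() initializes the buffer headers that bsd_startupearly()
+ * allocated in bufferhdr_map, so it must run after that allocation.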
+ */ + bufinit(); } diff --git a/bsd/dev/vn/shadow.c b/bsd/dev/vn/shadow.c index 12a20c725..cb5fbfd6d 100644 --- a/bsd/dev/vn/shadow.c +++ b/bsd/dev/vn/shadow.c @@ -62,6 +62,7 @@ #include <sys/malloc.h> #define my_malloc(a) _MALLOC(a, M_TEMP, M_WAITOK) #define my_free(a) FREE(a, M_TEMP) +#include <libkern/libkern.h> #endif /* TEST_SHADOW */ #include "shadow.h" @@ -289,7 +290,7 @@ bitmap_get(u_char * map, u_long start_bit, u_long bit_count, } end: - for (i = start.bit; i < end.bit; i++) { + for (i = start.bit; i < (int)end.bit; i++) { boolean_t this_is_set = (map[start.byte] & bit(i)) ? TRUE : FALSE; if (this_is_set != is_set) { @@ -525,6 +526,15 @@ shadow_map_write(shadow_map_t * map, u_long block_offset, return (shadow_grew); } +boolean_t +shadow_map_is_written(shadow_map_t * map, u_long block_offset) +{ + bitmap_offset_t b; + + b = bitmap_offset(block_offset); + return ((map->block_bitmap[b.byte] & bit(b.bit)) ? TRUE : FALSE); +} + /* * Function: shadow_map_shadow_size * diff --git a/bsd/dev/vn/shadow.h b/bsd/dev/vn/shadow.h index 074ba9e4c..b610fd828 100644 --- a/bsd/dev/vn/shadow.h +++ b/bsd/dev/vn/shadow.h @@ -35,12 +35,17 @@ shadow_map_read(shadow_map_t * map, u_long block_offset, u_long block_count, boolean_t shadow_map_write(shadow_map_t * map, u_long block_offset, u_long block_count, u_long * incr_block_offset, u_long * incr_block_count); +boolean_t +shadow_map_is_written(shadow_map_t * map, u_long block_offset); + u_long shadow_map_shadow_size(shadow_map_t * map); shadow_map_t * shadow_map_create(off_t file_size, off_t shadow_size, unsigned long band_size, unsigned long block_size); +void +shadow_map_free(shadow_map_t * map); #endif /* __APPLE_API_PRIVATE */ #endif /* __VN_SHADOW_H__ */ diff --git a/bsd/dev/vn/vn.c b/bsd/dev/vn/vn.c index 19f246616..265270a3a 100644 --- a/bsd/dev/vn/vn.c +++ b/bsd/dev/vn/vn.c @@ -1,3 +1,24 @@ +/* + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ /* * Copyright (c) 1988 University of Utah. @@ -48,7 +69,7 @@ * Block/character interface to a vnode. Allows one to treat a file * as a disk (e.g. build a filesystem in it, mount it, etc.). * - * NOTE 1: This uses the VOP_BMAP/VOP_STRATEGY interface to the vnode + * NOTE 1: This uses the vnop_blockmap/vnop_strategy interface to the vnode * instead of a simple VOP_RDWR. We do this to avoid distorting the * local buffer cache. 
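* (The shadow-file read and write paths below likewise go through
* VNOP_READ/VNOP_WRITE with a private vfs_context, presumably to keep
* cache behavior consistent.)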
* @@ -71,35 +92,26 @@ #include <sys/mount.h> #include <sys/namei.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <sys/buf.h> #include <sys/malloc.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/fcntl.h> #include <sys/conf.h> #include <sys/disk.h> #include <sys/stat.h> #include <sys/conf.h> +#include <sys/uio_internal.h> #include <sys/vnioctl.h> #include <sys/vm.h> #include <vm/vm_pager.h> -#include <vm/vm_pageout.h> #include <mach/memory_object_types.h> #include <miscfs/devfs/devfs.h> -extern void -vfs_io_maxsegsize(struct vnode *vp, - int flags, /* B_READ or B_WRITE */ - int *maxsegsize); - -extern void -vfs_io_attributes(struct vnode *vp, - int flags, /* B_READ or B_WRITE */ - int *iosize, - int *vectors); #include "shadow.h" @@ -152,13 +164,15 @@ struct vn_softc { u_int64_t sc_fsize; /* file size in bytes */ u_int64_t sc_size; /* size of vn, sc_secsize scale */ int sc_flags; /* flags */ - int sc_secsize; /* sector size */ + u_long sc_secsize; /* sector size */ struct vnode *sc_vp; /* vnode if not NULL */ + uint32_t sc_vid; int sc_open_flags; struct vnode *sc_shadow_vp; /* shadow vnode if not NULL */ + uint32_t sc_shadow_vid; shadow_map_t * sc_shadow_map; /* shadow map if not NULL */ - struct ucred *sc_cred; /* credentials */ - u_long sc_options; /* options */ + kauth_cred_t sc_cred; /* credentials */ + u_int32_t sc_options; /* options */ void * sc_bdev; void * sc_cdev; } vn_table[NVNDEVICE]; @@ -169,40 +183,45 @@ struct vn_softc { #define VNF_INITED 0x01 #define VNF_READONLY 0x02 -static u_long vn_options; +static u_int32_t vn_options; #define IFOPT(vn,opt) if (((vn)->sc_options|vn_options) & (opt)) #define TESTOPT(vn,opt) (((vn)->sc_options|vn_options) & (opt)) -static int vnsetcred (struct vn_softc *vn, struct proc *p); -static void vnclear (struct vn_softc *vn); +static int setcred(struct vnode * vp, struct proc * p, + kauth_cred_t cred); +static void vnclear (struct vn_softc *vn, struct proc * p); +static void vn_ioctl_to_64(struct vn_ioctl *from, struct user_vn_ioctl *to); +void vndevice_init(void); +int vndevice_root_image(char * path, char devname[], dev_t * dev_p); static int vniocattach_file(struct vn_softc *vn, - struct vn_ioctl *vio, + struct user_vn_ioctl *vniop, dev_t dev, int in_kernel, struct proc *p); static int vniocattach_shadow(struct vn_softc * vn, - struct vn_ioctl *vio, + struct user_vn_ioctl *vniop, dev_t dev, int in_kernel, struct proc *p); -static __inline__ +static __inline__ int vnunit(dev_t dev) { return (minor(dev)); } static int -vnclose(dev_t dev, int flags, int devtype, struct proc *p) +vnclose(__unused dev_t dev, __unused int flags, + __unused int devtype, __unused struct proc *p) { return (0); } static int -vnopen(dev_t dev, int flags, int devtype, struct proc *p) +vnopen(dev_t dev, int flags, __unused int devtype, __unused struct proc *p) { struct vn_softc *vn; int unit; @@ -218,11 +237,260 @@ vnopen(dev_t dev, int flags, int devtype, struct proc *p) return(0); } +static int +file_io(struct vnode * vp, struct vfs_context * context_p, + enum uio_rw op, char * base, off_t offset, user_ssize_t count, + user_ssize_t * resid) +{ + uio_t auio; + int error; + char uio_buf[UIO_SIZEOF(1)]; + + auio = uio_createwithbuffer(1, offset, UIO_SYSSPACE, op, + &uio_buf[0], sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(base), count); + if (op == UIO_READ) + error = VNOP_READ(vp, auio, IO_SYNC, context_p); + else + error = VNOP_WRITE(vp, auio, IO_SYNC, context_p); + + if (resid != NULL) { + *resid = uio_resid(auio); + } + return 
(error); +} + +static __inline__ off_t +block_round(off_t o, int blocksize) +{ + return ((o + blocksize - 1) / blocksize); +} + +static __inline__ off_t +block_truncate(off_t o, int blocksize) +{ + return (o / blocksize); +} + +static __inline__ int +block_remainder(off_t o, int blocksize) +{ + return (o % blocksize); +} + +static int +vnread_shadow(struct vn_softc * vn, struct uio *uio, int ioflag, + struct vfs_context * context_p) +{ + u_long blocksize = vn->sc_secsize; + int error = 0; + off_t offset; + user_ssize_t resid; + off_t orig_offset; + user_ssize_t orig_resid; + + orig_resid = resid = uio_resid(uio); + orig_offset = offset = uio_offset(uio); + + while (resid > 0) { + u_long remainder; + u_long this_block_number; + u_long this_block_count; + off_t this_offset; + user_ssize_t this_resid; + struct vnode * vp; + + /* figure out which blocks to read */ + remainder = block_remainder(offset, blocksize); + if (shadow_map_read(vn->sc_shadow_map, + block_truncate(offset, blocksize), + block_round(resid + remainder, blocksize), + &this_block_number, &this_block_count)) { + vp = vn->sc_shadow_vp; + } + else { + vp = vn->sc_vp; + } + + /* read the blocks (or parts thereof) */ + this_offset = (off_t)this_block_number * blocksize + remainder; + uio_setoffset(uio, this_offset); + this_resid = this_block_count * blocksize - remainder; + if (this_resid > resid) { + this_resid = resid; + } + uio_setresid(uio, this_resid); + error = VNOP_READ(vp, uio, ioflag, context_p); + if (error) { + break; + } + + /* figure out how much we actually read */ + this_resid -= uio_resid(uio); + if (this_resid == 0) { + printf("vn device: vnread_shadow zero length read\n"); + break; + } + resid -= this_resid; + offset += this_resid; + } + uio_setresid(uio, resid); + uio_setoffset(uio, offset); + return (error); +} + +static int +vncopy_block_to_shadow(struct vn_softc * vn, struct vfs_context * context_p, + u_long file_block, u_long shadow_block) +{ + int error; + char * tmpbuf; + + tmpbuf = _MALLOC(vn->sc_secsize, M_TEMP, M_WAITOK); + if (tmpbuf == NULL) { + return (ENOMEM); + } + /* read one block from file at file_block offset */ + error = file_io(vn->sc_vp, context_p, UIO_READ, + tmpbuf, (off_t)file_block * vn->sc_secsize, + vn->sc_secsize, NULL); + if (error) { + goto done; + } + /* write one block to shadow file at shadow_block offset */ + error = file_io(vn->sc_shadow_vp, context_p, UIO_WRITE, + tmpbuf, (off_t)shadow_block * vn->sc_secsize, + vn->sc_secsize, NULL); + done: + FREE(tmpbuf, M_TEMP); + return (error); +} + +enum { + FLAGS_FIRST_BLOCK_PARTIAL = 0x1, + FLAGS_LAST_BLOCK_PARTIAL = 0x2 +}; + +static int +vnwrite_shadow(struct vn_softc * vn, struct uio *uio, int ioflag, + struct vfs_context * context_p) +{ + u_long blocksize = vn->sc_secsize; + int error = 0; + user_ssize_t resid; + off_t offset; + + resid = uio_resid(uio); + offset = uio_offset(uio); + + while (resid > 0) { + int flags = 0; + u_long offset_block_number; + u_long remainder; + u_long resid_block_count; + u_long shadow_block_count; + u_long shadow_block_number; + user_ssize_t this_resid; + + /* figure out which blocks to write */ + offset_block_number = block_truncate(offset, blocksize); + remainder = block_remainder(offset, blocksize); + resid_block_count = block_round(resid + remainder, blocksize); + /* figure out if the first or last blocks are partial writes */ + if (remainder > 0 + && !shadow_map_is_written(vn->sc_shadow_map, + offset_block_number)) { + /* the first block is a partial write */ + flags |= FLAGS_FIRST_BLOCK_PARTIAL; 
+ } + if (resid_block_count > 1 + && !shadow_map_is_written(vn->sc_shadow_map, + offset_block_number + + resid_block_count - 1) + && block_remainder(offset + resid, blocksize) > 0) { + /* the last block is a partial write */ + flags |= FLAGS_LAST_BLOCK_PARTIAL; + } + if (shadow_map_write(vn->sc_shadow_map, + offset_block_number, resid_block_count, + &shadow_block_number, + &shadow_block_count)) { + /* shadow file is growing */ +#if 0 + /* truncate the file to its new length before write */ + off_t size; + size = (off_t)shadow_map_shadow_size(vn->sc_shadow_map) + * vn->sc_secsize; + vnode_setsize(vn->sc_shadow_vp, size, IO_SYNC, + context_p); +#endif 0 + } + /* write the blocks (or parts thereof) */ + uio_setoffset(uio, (off_t) + shadow_block_number * blocksize + remainder); + this_resid = (off_t)shadow_block_count * blocksize - remainder; + if (this_resid >= resid) { + this_resid = resid; + if ((flags & FLAGS_LAST_BLOCK_PARTIAL) != 0) { + /* copy the last block to the shadow */ + u_long d; + u_long s; + + s = offset_block_number + + resid_block_count - 1; + d = shadow_block_number + + shadow_block_count - 1; + error = vncopy_block_to_shadow(vn, context_p, + s, d); + if (error) { + printf("vnwrite_shadow: failed to copy" + " block %d to shadow block %d\n", + s, d); + break; + } + } + } + uio_setresid(uio, this_resid); + if ((flags & FLAGS_FIRST_BLOCK_PARTIAL) != 0) { + /* copy the first block to the shadow */ + error = vncopy_block_to_shadow(vn, context_p, + offset_block_number, + shadow_block_number); + if (error) { + printf("vnwrite_shadow: failed to" + " copy block %d to shadow block %d\n", + offset_block_number, + shadow_block_number); + break; + } + } + error = VNOP_WRITE(vn->sc_shadow_vp, uio, ioflag, context_p); + if (error) { + break; + } + /* figure out how much we actually wrote */ + this_resid -= uio_resid(uio); + if (this_resid == 0) { + printf("vn device: vnwrite_shadow zero length write\n"); + break; + } + resid -= this_resid; + offset += this_resid; + } + uio_setresid(uio, resid); + uio_setoffset(uio, offset); + return (error); +} + static int vnread(dev_t dev, struct uio *uio, int ioflag) { - struct proc * p = current_proc(); - int status; + struct vfs_context context; + int error = 0; + boolean_t funnel_state; + off_t offset; + struct proc * p; + user_ssize_t resid; struct vn_softc * vn; int unit; @@ -230,25 +498,74 @@ vnread(dev_t dev, struct uio *uio, int ioflag) if (vnunit(dev) >= NVNDEVICE) { return (ENXIO); } + p = current_proc(); + funnel_state = thread_funnel_set(kernel_flock, TRUE); vn = vn_table + unit; if ((vn->sc_flags & VNF_INITED) == 0) { - return (ENXIO); + error = ENXIO; + goto done; } - if (vn->sc_shadow_vp != NULL) { - return (ENODEV); + error = vnode_getwithvid(vn->sc_vp, vn->sc_vid); + if (error != 0) { + /* the vnode is no longer available, abort */ + error = ENXIO; + vnclear(vn, p); + goto done; + } + + resid = uio_resid(uio); + offset = uio_offset(uio); + + /* + * If out of bounds return an error. If at the EOF point, + * simply read less. + */ + if (offset >= (off_t)vn->sc_fsize) { + if (offset > (off_t)vn->sc_fsize) { + error = EINVAL; + } + goto done; + } + /* + * If the request crosses EOF, truncate the request. 
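+ * The caller then sees a short read rather than an error.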
+ */ + if ((offset + resid) > (off_t)vn->sc_fsize) { + resid = vn->sc_fsize - offset; + uio_setresid(uio, resid); } - vn_lock(vn->sc_vp, LK_EXCLUSIVE | LK_RETRY, p); - status = VOP_READ(vn->sc_vp, uio, ioflag, vn->sc_cred); - VOP_UNLOCK(vn->sc_vp, 0, p); - return (status); + context.vc_proc = p; + context.vc_ucred = vn->sc_cred; + if (vn->sc_shadow_vp != NULL) { + error = vnode_getwithvid(vn->sc_shadow_vp, + vn->sc_shadow_vid); + if (error != 0) { + /* the vnode is no longer available, abort */ + error = ENXIO; + vnode_put(vn->sc_vp); + vnclear(vn, p); + goto done; + } + error = vnread_shadow(vn, uio, ioflag, &context); + vnode_put(vn->sc_shadow_vp); + } else { + error = VNOP_READ(vn->sc_vp, uio, ioflag, &context); + } + vnode_put(vn->sc_vp); + done: + (void) thread_funnel_set(kernel_flock, funnel_state); + return (error); } static int vnwrite(dev_t dev, struct uio *uio, int ioflag) { - struct proc * p = current_proc(); - int status; + struct vfs_context context; + int error; + boolean_t funnel_state; + off_t offset; + struct proc * p; + user_ssize_t resid; struct vn_softc * vn; int unit; @@ -256,83 +573,86 @@ vnwrite(dev_t dev, struct uio *uio, int ioflag) if (vnunit(dev) >= NVNDEVICE) { return (ENXIO); } + p = current_proc(); + funnel_state = thread_funnel_set(kernel_flock, TRUE); vn = vn_table + unit; if ((vn->sc_flags & VNF_INITED) == 0) { - return (ENXIO); - } - if (vn->sc_shadow_vp != NULL) { - return (ENODEV); + error = ENXIO; + goto done; } if (vn->sc_flags & VNF_READONLY) { - return (EROFS); + error = EROFS; + goto done; } + error = vnode_getwithvid(vn->sc_vp, vn->sc_vid); + if (error != 0) { + /* the vnode is no longer available, abort */ + error = ENXIO; + vnclear(vn, p); + goto done; + } + resid = uio_resid(uio); + offset = uio_offset(uio); - vn_lock(vn->sc_vp, LK_EXCLUSIVE | LK_RETRY, p); - status = VOP_WRITE(vn->sc_vp, uio, ioflag, vn->sc_cred); - VOP_UNLOCK(vn->sc_vp, 0, p); - - return (status); -} - -static boolean_t -bp_is_mapped(struct buf * bp, vm_offset_t * vaddr) -{ - boolean_t is_mapped = FALSE; + /* + * If out of bounds return an error. If at the EOF point, + * simply write less. + */ + if (offset >= (off_t)vn->sc_fsize) { + if (offset > (off_t)vn->sc_fsize) { + error = EINVAL; + } + goto done; + } + /* + * If the request crosses EOF, truncate the request. 
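+ * Bytes beyond the end of the virtual disk are never written.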
+ */ + if ((offset + resid) > (off_t)vn->sc_fsize) { + resid = (off_t)vn->sc_fsize - offset; + uio_setresid(uio, resid); + } - if (bp->b_flags & B_NEED_IODONE) { - struct buf * real_bp = (struct buf *)bp->b_real_bp; + context.vc_proc = p; + context.vc_ucred = vn->sc_cred; - if (real_bp && real_bp->b_data) { - *vaddr = (vm_offset_t)real_bp->b_data; - is_mapped = TRUE; + if (vn->sc_shadow_vp != NULL) { + error = vnode_getwithvid(vn->sc_shadow_vp, + vn->sc_shadow_vid); + if (error != 0) { + /* the vnode is no longer available, abort */ + error = ENXIO; + vnode_put(vn->sc_vp); + vnclear(vn, p); + goto done; } + error = vnwrite_shadow(vn, uio, ioflag, &context); + vnode_put(vn->sc_shadow_vp); + } else { + error = VNOP_WRITE(vn->sc_vp, uio, ioflag, &context); } - return (is_mapped); -} - -static __inline__ int -file_io(struct vnode * vp, struct ucred * cred, - enum uio_rw op, char * base, off_t offset, long count, - struct proc * p, long * resid) -{ - struct uio auio; - struct iovec aiov; - int error; - - bzero(&auio, sizeof(auio)); - aiov.iov_base = base; - aiov.iov_len = count; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_offset = offset; - auio.uio_rw = op; - auio.uio_resid = count; - auio.uio_procp = p; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - if (op == UIO_READ) - error = VOP_READ(vp, &auio, IO_SYNC, cred); - else - error = VOP_WRITE(vp, &auio, IO_SYNC, cred); - VOP_UNLOCK(vp, 0, p); - *resid = auio.uio_resid; + vnode_put(vn->sc_vp); + done: + (void) thread_funnel_set(kernel_flock, funnel_state); return (error); } static int shadow_read(struct vn_softc * vn, struct buf * bp, char * base, struct proc * p) { + u_long blocksize = vn->sc_secsize; + struct vfs_context context; int error = 0; u_long offset; boolean_t read_shadow; u_long resid; u_long start = 0; - offset = bp->b_blkno; - resid = bp->b_bcount / vn->sc_secsize; - + context.vc_proc = p; + context.vc_ucred = vn->sc_cred; + offset = buf_blkno(bp); + resid = buf_resid(bp) / blocksize; while (resid > 0) { - u_long temp_resid; + user_ssize_t temp_resid; u_long this_offset; u_long this_resid; struct vnode * vp; @@ -346,23 +666,23 @@ shadow_read(struct vn_softc * vn, struct buf * bp, char * base, struct proc * p) else { vp = vn->sc_vp; } - error = file_io(vp, vn->sc_cred, UIO_READ, base + start, - (off_t)this_offset * vn->sc_secsize, - this_resid * vn->sc_secsize, p, &temp_resid); - if (error) + error = file_io(vp, &context, UIO_READ, base + start, + (off_t)this_offset * blocksize, + (user_ssize_t)this_resid * blocksize, + &temp_resid); + if (error) { break; - temp_resid = this_resid - temp_resid / vn->sc_secsize; - if (temp_resid == 0) { - static int printed = 0; - printf("vn device: shadow_write zero length read (printed %d)\n", printed); - printed++; + } + this_resid -= (temp_resid / blocksize); + if (this_resid == 0) { + printf("vn device: shadow_read zero length read\n"); break; } - resid -= temp_resid; - offset += temp_resid; - start += temp_resid * vn->sc_secsize;; + resid -= this_resid; + offset += this_resid; + start += this_resid * blocksize; } - bp->b_resid = resid * vn->sc_secsize; + buf_setresid(bp, resid * blocksize); return (error); } @@ -370,20 +690,22 @@ static int shadow_write(struct vn_softc * vn, struct buf * bp, char * base, struct proc * p) { + u_long blocksize = vn->sc_secsize; + struct vfs_context context; int error = 0; u_long offset; boolean_t shadow_grew; u_long resid; u_long start = 0; - offset = bp->b_blkno; - resid = bp->b_bcount / vn->sc_secsize; - + 
context.vc_proc = p; + context.vc_ucred = vn->sc_cred; + offset = buf_blkno(bp); + resid = buf_resid(bp) / blocksize; while (resid > 0) { - u_long temp_resid; + user_ssize_t temp_resid; u_long this_offset; u_long this_resid; - struct vnode * vp; shadow_grew = shadow_map_write(vn->sc_shadow_map, offset, resid, @@ -393,80 +715,66 @@ shadow_write(struct vn_softc * vn, struct buf * bp, char * base, off_t size; /* truncate the file to its new length before write */ size = (off_t)shadow_map_shadow_size(vn->sc_shadow_map) - * vn->sc_secsize; - vn_lock(vn->sc_shadow_vp, LK_EXCLUSIVE | LK_RETRY, p); - VOP_TRUNCATE(vn->sc_shadow_vp, size, - IO_SYNC, vn->sc_cred, p); - VOP_UNLOCK(vn->sc_shadow_vp, 0, p); + * blocksize; + vnode_setsize(vn->sc_shadow_vp, size, IO_SYNC, + &context); #endif } - error = file_io(vn->sc_shadow_vp, vn->sc_cred, UIO_WRITE, + error = file_io(vn->sc_shadow_vp, &context, UIO_WRITE, base + start, - (off_t)this_offset * vn->sc_secsize, - this_resid * vn->sc_secsize, p, &temp_resid); + (off_t)this_offset * blocksize, + (user_ssize_t)this_resid * blocksize, + &temp_resid); if (error) { break; } - temp_resid = this_resid - temp_resid / vn->sc_secsize; - if (temp_resid == 0) { - static int printed = 0; - printf("vn device: shadow_write zero length write (printed %d)\n", printed); - printed++; + this_resid -= (temp_resid / blocksize); + if (this_resid == 0) { + printf("vn device: shadow_write zero length write\n"); break; } - resid -= temp_resid; - offset += temp_resid; - start += temp_resid * vn->sc_secsize;; + resid -= this_resid; + offset += this_resid; + start += this_resid * blocksize; } - bp->b_resid = resid * vn->sc_secsize; + buf_setresid(bp, resid * blocksize); return (error); } static int -vn_readwrite_io(struct vn_softc * vn, struct buf * bp) +vn_readwrite_io(struct vn_softc * vn, struct buf * bp, struct proc * p) { int error = 0; char * iov_base; - boolean_t need_unmap = FALSE; - struct proc * p = current_proc(); - vm_offset_t vaddr = NULL; + caddr_t vaddr; - if (bp->b_flags & B_VECTORLIST) { - if (bp_is_mapped(bp, &vaddr) == FALSE) { - if (ubc_upl_map(bp->b_pagelist, &vaddr) - != KERN_SUCCESS) { - panic("vn device: ubc_upl_map failed"); - } - else { - need_unmap = TRUE; - } - } - } - if (error) - return (error); - if (vaddr != NULL) - iov_base = (caddr_t)(vaddr + bp->b_uploffset); - else - iov_base = bp->b_data; + if (buf_map(bp, &vaddr)) + panic("vn device: buf_map failed"); + iov_base = (char *)vaddr; + if (vn->sc_shadow_vp == NULL) { - error = file_io(vn->sc_vp, vn->sc_cred, - bp->b_flags & B_READ ? UIO_READ : UIO_WRITE, - iov_base, (off_t)bp->b_blkno * vn->sc_secsize, - bp->b_bcount, p, &bp->b_resid); + struct vfs_context context; + user_ssize_t temp_resid; + + context.vc_proc = p; + context.vc_ucred = vn->sc_cred; + + error = file_io(vn->sc_vp, &context, + buf_flags(bp) & B_READ ? 
UIO_READ : UIO_WRITE, + iov_base, + (off_t)buf_blkno(bp) * vn->sc_secsize, + buf_resid(bp), &temp_resid); + buf_setresid(bp, temp_resid); } else { - if (bp->b_flags & B_READ) + if (buf_flags(bp) & B_READ) error = shadow_read(vn, bp, iov_base, p); else error = shadow_write(vn, bp, iov_base, p); - if (error == 0) - bp->b_resid = 0; - - } - if (need_unmap) { - ubc_upl_unmap(bp->b_pagelist); } + buf_unmap(bp); + return (error); } @@ -476,94 +784,123 @@ vnstrategy(struct buf *bp) struct vn_softc *vn; int error = 0; long sz; /* in sc_secsize chunks */ + daddr64_t blk_num; + boolean_t funnel_state; + struct proc * p = current_proc(); + struct vnode * shadow_vp = NULL; + struct vnode * vp = NULL; - vn = vn_table + vnunit(bp->b_dev); + funnel_state = thread_funnel_set(kernel_flock, TRUE); + vn = vn_table + vnunit(buf_device(bp)); if ((vn->sc_flags & VNF_INITED) == 0) { - bp->b_error = ENXIO; - bp->b_flags |= B_ERROR; - biodone(bp); - return; + error = ENXIO; + goto done; } - bp->b_resid = bp->b_bcount; + buf_setresid(bp, buf_count(bp)); /* * Check for required alignment. Transfers must be a valid * multiple of the sector size. */ - if (bp->b_bcount % vn->sc_secsize != 0 || - bp->b_blkno % (vn->sc_secsize / DEV_BSIZE) != 0) { - bp->b_error = EINVAL; - bp->b_flags |= B_ERROR | B_INVAL; - biodone(bp); - return; + blk_num = buf_blkno(bp); + if (buf_count(bp) % vn->sc_secsize != 0) { + error = EINVAL; + goto done; } - sz = howmany(bp->b_bcount, vn->sc_secsize); + sz = howmany(buf_count(bp), vn->sc_secsize); /* * If out of bounds return an error. If at the EOF point, * simply read or write less. */ - if (bp->b_blkno >= vn->sc_size) { - if (bp->b_blkno > vn->sc_size) { - bp->b_error = EINVAL; - bp->b_flags |= B_ERROR | B_INVAL; + if (blk_num >= 0 && (u_int64_t)blk_num >= vn->sc_size) { + if (blk_num > 0 && (u_int64_t)blk_num > vn->sc_size) { + error = EINVAL; } - biodone(bp); - return; + goto done; } /* * If the request crosses EOF, truncate the request. 
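* (The arithmetic here is in sc_secsize-sized blocks, since vnstrategy
* deals in block numbers rather than byte offsets.)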
*/ - if ((bp->b_blkno + sz) > vn->sc_size) { - bp->b_bcount = (vn->sc_size - bp->b_blkno) * vn->sc_secsize; - bp->b_resid = bp->b_bcount; + if ((blk_num + sz) > 0 && ((u_int64_t)(blk_num + sz)) > vn->sc_size) { + buf_setcount(bp, (vn->sc_size - blk_num) * vn->sc_secsize); + buf_setresid(bp, buf_count(bp)); } - - if (vn->sc_vp) { - error = vn_readwrite_io(vn, bp); - if (error) { - bp->b_error = error; - bp->b_flags |= B_ERROR; + vp = vn->sc_vp; + if (vp == NULL) { + error = ENXIO; + goto done; + } + error = vnode_getwithvid(vp, vn->sc_vid); + if (error != 0) { + /* the vnode is no longer available, abort */ + error = ENXIO; + vnclear(vn, p); + goto done; + } + shadow_vp = vn->sc_shadow_vp; + if (shadow_vp != NULL) { + error = vnode_getwithvid(shadow_vp, + vn->sc_shadow_vid); + if (error != 0) { + /* the vnode is no longer available, abort */ + error = ENXIO; + vnode_put(vn->sc_vp); + vnclear(vn, p); + goto done; } - biodone(bp); } - else { - bp->b_flags |= B_ERROR; - bp->b_error = EINVAL; - biodone(bp); + error = vn_readwrite_io(vn, bp, p); + vnode_put(vp); + if (shadow_vp != NULL) { + vnode_put(shadow_vp); } + + done: + (void) thread_funnel_set(kernel_flock, funnel_state); + if (error) { + buf_seterror(bp, error); + } + buf_biodone(bp); + return; } /* ARGSUSED */ static int -vnioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p, +vnioctl(dev_t dev, u_long cmd, caddr_t data, + __unused int flag, struct proc *p, int is_char) { struct vn_softc *vn; - struct vn_ioctl *vio; + struct user_vn_ioctl *viop; int error; - u_long *f; - int num = 0; + u_int32_t *f; u_int64_t * o; int unit; - int size = 0; + struct vfsioattr ioattr; + struct user_vn_ioctl user_vnio; + boolean_t funnel_state; unit = vnunit(dev); if (vnunit(dev) >= NVNDEVICE) { return (ENXIO); } + + funnel_state = thread_funnel_set(kernel_flock, TRUE); vn = vn_table + unit; - error = suser(p->p_ucred, &p->p_acflag); - if (error) - return (error); + error = proc_suser(p); + if (error) { + goto done; + } - vio = (struct vn_ioctl *)data; - f = (u_long*)data; + viop = (struct user_vn_ioctl *)data; + f = (u_int32_t *)data; o = (u_int64_t *)data; switch (cmd) { case VNIOCDETACH: + case VNIOCDETACH64: case DKIOCGETBLOCKSIZE: - case DKIOCSETBLOCKSIZE: + case DKIOCSETBLOCKSIZE: case DKIOCGETMAXBLOCKCOUNTREAD: case DKIOCGETMAXBLOCKCOUNTWRITE: case DKIOCGETMAXSEGMENTCOUNTREAD: @@ -573,59 +910,67 @@ vnioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p, case DKIOCGETBLOCKCOUNT: case DKIOCGETBLOCKCOUNT32: if ((vn->sc_flags & VNF_INITED) == 0) { - return (ENXIO); + error = ENXIO; + goto done; } break; default: break; } + + if (vn->sc_vp != NULL) + vfs_ioattr(vnode_mount(vn->sc_vp), &ioattr); + else + bzero(&ioattr, sizeof(ioattr)); + switch (cmd) { + case DKIOCISVIRTUAL: + *f = 1; + break; case DKIOCGETMAXBLOCKCOUNTREAD: - vfs_io_attributes(vn->sc_vp, B_READ, &size, &num); - *o = size / vn->sc_secsize; + *o = ioattr.io_maxreadcnt / vn->sc_secsize; break; case DKIOCGETMAXBLOCKCOUNTWRITE: - vfs_io_attributes(vn->sc_vp, B_WRITE, &size, &num); - *o = size / vn->sc_secsize; + *o = ioattr.io_maxwritecnt / vn->sc_secsize; break; case DKIOCGETMAXBYTECOUNTREAD: - vfs_io_attributes(vn->sc_vp, B_READ, &size, &num); - *o = size; + *o = ioattr.io_maxreadcnt; break; case DKIOCGETMAXBYTECOUNTWRITE: - vfs_io_attributes(vn->sc_vp, B_WRITE, &size, &num); - *o = size; + *o = ioattr.io_maxwritecnt; break; case DKIOCGETMAXSEGMENTCOUNTREAD: - vfs_io_attributes(vn->sc_vp, B_READ, &size, &num); - *o = num; + *o = ioattr.io_segreadcnt; break; 
case DKIOCGETMAXSEGMENTCOUNTWRITE: - vfs_io_attributes(vn->sc_vp, B_WRITE, &size, &num); - *o = num; + *o = ioattr.io_segwritecnt; break; case DKIOCGETMAXSEGMENTBYTECOUNTREAD: - vfs_io_maxsegsize(vn->sc_vp, B_READ, &size); - *o = size; + *o = ioattr.io_maxsegreadsize; break; case DKIOCGETMAXSEGMENTBYTECOUNTWRITE: - vfs_io_maxsegsize(vn->sc_vp, B_WRITE, &size); - *o = size; + *o = ioattr.io_maxsegwritesize; break; - case DKIOCGETBLOCKSIZE: - *f = vn->sc_secsize; + case DKIOCGETBLOCKSIZE: + *f = vn->sc_secsize; break; - case DKIOCSETBLOCKSIZE: + case DKIOCSETBLOCKSIZE: if (is_char) { /* can only set block size on block device */ - return (ENODEV); - } - if (vn->sc_shadow_vp != NULL) { - /* can't set the block size if already shadowing */ - return (EBUSY); + error = ENODEV; + break; } if (*f < DEV_BSIZE) { - return (EINVAL); + error = EINVAL; + break; + } + if (vn->sc_shadow_vp != NULL) { + if (*f == (unsigned)vn->sc_secsize) { + break; + } + /* can't change the block size if already shadowing */ + error = EBUSY; + break; } vn->sc_secsize = *f; /* recompute the size in terms of the new blocksize */ @@ -641,37 +986,57 @@ vnioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p, *o = vn->sc_size; break; case VNIOCSHADOW: + case VNIOCSHADOW64: if (vn->sc_shadow_vp != NULL) { - return (EBUSY); + error = EBUSY; + break; } if (vn->sc_vp == NULL) { /* must be attached before we can shadow */ - return (EINVAL); + error = EINVAL; + break; + } + if (!proc_is64bit(p)) { + /* downstream code expects LP64 version of vn_ioctl structure */ + vn_ioctl_to_64((struct vn_ioctl *)viop, &user_vnio); + viop = &user_vnio; } - if (vio->vn_file == NULL) { - return (EINVAL); + if (viop->vn_file == USER_ADDR_NULL) { + error = EINVAL; + break; } - error = vniocattach_shadow(vn, vio, dev, 0, p); + error = vniocattach_shadow(vn, viop, dev, 0, p); break; case VNIOCATTACH: + case VNIOCATTACH64: if (is_char) { /* attach only on block device */ - return (ENODEV); + error = ENODEV; + break; } if (vn->sc_flags & VNF_INITED) { - return (EBUSY); + error = EBUSY; + break; + } + if (!proc_is64bit(p)) { + /* downstream code expects LP64 version of vn_ioctl structure */ + vn_ioctl_to_64((struct vn_ioctl *)viop, &user_vnio); + viop = &user_vnio; } - if (vio->vn_file == NULL) { - return (EINVAL); + if (viop->vn_file == USER_ADDR_NULL) { + error = EINVAL; + break; } - error = vniocattach_file(vn, vio, dev, 0, p); + error = vniocattach_file(vn, viop, dev, 0, p); break; case VNIOCDETACH: + case VNIOCDETACH64: if (is_char) { /* detach only on block device */ - return (ENODEV); + error = ENODEV; + break; } /* Note: spec_open won't open a mounted block device */ @@ -683,7 +1048,7 @@ vnioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p, * How are these problems handled for removable and failing * hardware devices?
(Hint: They are not) */ - vnclear(vn); + vnclear(vn, p); break; case VNIOCGSET: @@ -710,6 +1075,8 @@ vnioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p, error = ENOTTY; break; } + done: + (void) thread_funnel_set(kernel_flock, funnel_state); return(error); } @@ -734,34 +1101,44 @@ vnioctl_blk(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) static int vniocattach_file(struct vn_softc *vn, - struct vn_ioctl *vio, + struct user_vn_ioctl *vniop, dev_t dev, int in_kernel, struct proc *p) { - struct vattr vattr; + dev_t cdev; + struct vfs_context context; + kauth_cred_t cred; struct nameidata nd; + off_t file_size; int error, flags; + + context.vc_proc = p; + context.vc_ucred = proc_ucred(p); flags = FREAD|FWRITE; if (in_kernel) { - NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, vio->vn_file, p); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE32, vniop->vn_file, &context); } else { - NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, vio->vn_file, p); + NDINIT(&nd, LOOKUP, FOLLOW, + (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), + vniop->vn_file, &context); } + /* vn_open gives both long- and short-term references */ error = vn_open(&nd, flags, 0); if (error) { if (error != EACCES && error != EPERM && error != EROFS) return (error); flags &= ~FWRITE; if (in_kernel) { - NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, - vio->vn_file, p); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE32, + vniop->vn_file, &context); } else { - NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, - vio->vn_file, p); + NDINIT(&nd, LOOKUP, FOLLOW, + (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), + vniop->vn_file, &context); } error = vn_open(&nd, flags, 0); if (error) @@ -770,99 +1147,93 @@ vniocattach_file(struct vn_softc *vn, if (nd.ni_vp->v_type != VREG) { error = EINVAL; } - else if (ubc_isinuse(nd.ni_vp, 1)) { - error = EBUSY; - } else { - error = VOP_GETATTR(nd.ni_vp, &vattr, p->p_ucred, p); + error = vnode_size(nd.ni_vp, &file_size, &context); } if (error != 0) { - VOP_UNLOCK(nd.ni_vp, 0, p); - (void) vn_close(nd.ni_vp, flags, p->p_ucred, p); + (void) vn_close(nd.ni_vp, flags, proc_ucred(p), p); + vnode_put(nd.ni_vp); return (error); } - vn->sc_vp = nd.ni_vp; - vn->sc_vp->v_flag |= VNOCACHE_DATA; - VOP_UNLOCK(nd.ni_vp, 0, p); - - vn->sc_open_flags = flags; - - /* - * If the size is specified, override the file attributes. Note that - * the vn_size argument is in PAGE_SIZE sized blocks. 
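vniocattach_file above first tries vn_open with FREAD|FWRITE and retries read-only only when the failure is EACCES, EPERM, or EROFS. A userspace sketch of that fallback (the helper name is invented; this is not part of the patch):

    #include <errno.h>
    #include <fcntl.h>

    /* Open a backing file read/write if possible, read-only otherwise;
     * mirrors the vn_open retry above. Returns an fd or -1. */
    static int
    open_backing(const char *path, int *writable)
    {
        int fd = open(path, O_RDWR);

        *writable = 1;
        if (fd < 0) {
            if (errno != EACCES && errno != EPERM && errno != EROFS)
                return -1;              /* a real failure: report it */
            *writable = 0;
            fd = open(path, O_RDONLY);  /* retry without write access */
        }
        return fd;
    }

    int
    main(int argc, char **argv)
    {
        int writable;

        return (argc > 1 && open_backing(argv[1], &writable) >= 0) ? 0 : 1;
    }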
- */ -#if 0 - if (vio->vn_size) - vn->sc_size = (quad_t)vio->vn_size * PAGE_SIZE / vn->sc_secsize; - else - vn->sc_size = vattr.va_size / vn->sc_secsize; -#endif - vn->sc_secsize = DEV_BSIZE; - vn->sc_fsize = vattr.va_size; - vn->sc_size = vattr.va_size / vn->sc_secsize; - error = vnsetcred(vn, p); + cred = kauth_cred_proc_ref(p); + nd.ni_vp->v_flag |= VNOCACHE_DATA; + error = setcred(nd.ni_vp, p, cred); if (error) { - (void) vn_close(nd.ni_vp, flags, p->p_ucred, p); + (void)vn_close(nd.ni_vp, flags, proc_ucred(p), p); + vnode_put(nd.ni_vp); + kauth_cred_rele(cred); return(error); } - { - dev_t cdev = makedev(vndevice_cdev_major, - minor(dev)); - vn->sc_cdev = devfs_make_node(cdev, DEVFS_CHAR, - UID_ROOT, GID_OPERATOR, - 0600, "rvn%d", - minor(dev)); - } + vn->sc_secsize = DEV_BSIZE; + vn->sc_fsize = file_size; + vn->sc_size = file_size / vn->sc_secsize; + vn->sc_vp = nd.ni_vp; + vn->sc_vid = vnode_vid(nd.ni_vp); + vn->sc_open_flags = flags; + vn->sc_cred = cred; + cdev = makedev(vndevice_cdev_major, minor(dev)); + vn->sc_cdev = devfs_make_node(cdev, DEVFS_CHAR, + UID_ROOT, GID_OPERATOR, + 0600, "rvn%d", + minor(dev)); vn->sc_flags |= VNF_INITED; if (flags == FREAD) vn->sc_flags |= VNF_READONLY; + /* lose the short-term reference */ + vnode_put(nd.ni_vp); return(0); } static int -vniocattach_shadow(vn, vio, dev, in_kernel, p) - struct vn_softc *vn; - struct vn_ioctl *vio; - dev_t dev; - int in_kernel; - struct proc *p; +vniocattach_shadow(struct vn_softc *vn, struct user_vn_ioctl *vniop, + __unused int dev, int in_kernel, struct proc *p) { - struct vattr vattr; + struct vfs_context context; struct nameidata nd; int error, flags; shadow_map_t * map; + off_t file_size; + + context.vc_proc = p; + context.vc_ucred = proc_ucred(p); flags = FREAD|FWRITE; if (in_kernel) { - NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, vio->vn_file, p); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE32, vniop->vn_file, &context); } else { - NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, vio->vn_file, p); + NDINIT(&nd, LOOKUP, FOLLOW, + (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), + vniop->vn_file, &context); } + /* vn_open gives both long- and short-term references */ error = vn_open(&nd, flags, 0); if (error) { /* shadow MUST be writable! */ return (error); } - if (nd.ni_vp->v_type != VREG || - (error = VOP_GETATTR(nd.ni_vp, &vattr, p->p_ucred, p))) { - VOP_UNLOCK(nd.ni_vp, 0, p); - (void) vn_close(nd.ni_vp, flags, p->p_ucred, p); + if (nd.ni_vp->v_type != VREG + || (error = vnode_size(nd.ni_vp, &file_size, &context))) { + (void)vn_close(nd.ni_vp, flags, proc_ucred(p), p); + vnode_put(nd.ni_vp); return (error ? 
error : EINVAL); } - vn->sc_shadow_vp = nd.ni_vp; - vn->sc_shadow_vp->v_flag |= VNOCACHE_DATA; - VOP_UNLOCK(nd.ni_vp, 0, p); - - map = shadow_map_create(vn->sc_fsize, vattr.va_size, + map = shadow_map_create(vn->sc_fsize, file_size, 0, vn->sc_secsize); if (map == NULL) { - (void) vn_close(nd.ni_vp, flags, p->p_ucred, p); + (void)vn_close(nd.ni_vp, flags, proc_ucred(p), p); + vnode_put(nd.ni_vp); vn->sc_shadow_vp = NULL; return (ENOMEM); } + vn->sc_shadow_vp = nd.ni_vp; + vn->sc_shadow_vid = vnode_vid(nd.ni_vp); + vn->sc_shadow_vp->v_flag |= VNOCACHE_DATA; vn->sc_shadow_map = map; vn->sc_flags &= ~VNF_READONLY; /* we're now read/write */ + + /* lose the short-term reference */ + vnode_put(nd.ni_vp); return(0); } @@ -870,18 +1241,17 @@ int vndevice_root_image(char * path, char devname[], dev_t * dev_p) { int error = 0; - int flags; - struct vn_softc * vn; - struct vn_ioctl vio; + struct vn_softc * vn; + struct user_vn_ioctl vnio; - vio.vn_file = path; - vio.vn_size = 0; + vnio.vn_file = CAST_USER_ADDR_T(path); + vnio.vn_size = 0; vn = vn_table + ROOT_IMAGE_UNIT; *dev_p = makedev(vndevice_bdev_major, ROOT_IMAGE_UNIT); sprintf(devname, "vn%d", ROOT_IMAGE_UNIT); - error = vniocattach_file(vn, &vio, *dev_p, 1, current_proc()); + error = vniocattach_file(vn, &vnio, *dev_p, 1, current_proc()); return (error); } @@ -891,60 +1261,34 @@ vndevice_root_image(char * path, char devname[], dev_t * dev_p) * to this "disk" is essentially as root. Note that credentials may change * if some other uid can write directly to the mapped file (NFS). */ -int -vnsetcred(struct vn_softc *vn, struct proc * p) +static int +setcred(struct vnode * vp, struct proc * p, kauth_cred_t cred) { char *tmpbuf; int error = 0; - struct proc * current_proc(); - struct ucred * cred = p->p_ucred; - - /* - * Set credits in our softc - */ - - if (vn->sc_cred) - crfree(vn->sc_cred); - vn->sc_cred = crdup(cred); + struct vfs_context context; /* * Horrible kludge to establish credentials for NFS XXX. 
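vndevice_root_image above widens a kernel pointer with CAST_USER_ADDR_T, and the vn_ioctl_to_64 helper (defined near the end of this file's diff) performs the same widening for ioctl arguments arriving from 32-bit processes. A minimal model of that conversion; the field layouts here are assumptions for illustration, not the real xnu definitions:

    #include <stdint.h>
    #include <stdio.h>

    struct vn_ioctl32 {
        uint32_t vn_file;       /* 32-bit user pointer */
        int32_t  vn_size;
        int32_t  vn_control;
    };

    struct vn_ioctl64 {
        uint64_t vn_file;       /* user_addr_t-style 64-bit pointer */
        int32_t  vn_size;
        int32_t  vn_control;
    };

    /* Zero-extend the pointer field, as CAST_USER_ADDR_T does, so the
     * downstream code can handle one (LP64) layout only. */
    static void
    vn_ioctl_32_to_64(const struct vn_ioctl32 *from, struct vn_ioctl64 *to)
    {
        to->vn_file    = (uint64_t)from->vn_file;
        to->vn_size    = from->vn_size;
        to->vn_control = from->vn_control;
    }

    int
    main(void)
    {
        struct vn_ioctl32 in = { 0x1000u, 4096, 0 };
        struct vn_ioctl64 out;

        vn_ioctl_32_to_64(&in, &out);
        printf("file=0x%llx size=%d\n",
               (unsigned long long)out.vn_file, out.vn_size);
        return 0;
    }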
*/ - - if (vn->sc_vp) { - struct uio auio; - struct iovec aiov; - - tmpbuf = _MALLOC(vn->sc_secsize, M_TEMP, M_WAITOK); - bzero(&auio, sizeof(auio)); - - aiov.iov_base = tmpbuf; - aiov.iov_len = vn->sc_secsize; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_offset = 0; - auio.uio_rw = UIO_READ; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_resid = aiov.iov_len; - vn_lock(vn->sc_vp, LK_EXCLUSIVE | LK_RETRY, p); - error = VOP_READ(vn->sc_vp, &auio, 0, vn->sc_cred); - VOP_UNLOCK(vn->sc_vp, 0, p); - FREE(tmpbuf, M_TEMP); - } + context.vc_proc = p; + context.vc_ucred = cred; + tmpbuf = _MALLOC(DEV_BSIZE, M_TEMP, M_WAITOK); + error = file_io(vp, &context, UIO_READ, tmpbuf, 0, DEV_BSIZE, NULL); + FREE(tmpbuf, M_TEMP); return (error); } void -vnclear(struct vn_softc *vn) +vnclear(struct vn_softc *vn, struct proc * p) { - int flags; - struct proc * p = current_proc(); /* XXX */ - if (vn->sc_vp != NULL) { + /* release long-term reference */ (void)vn_close(vn->sc_vp, vn->sc_open_flags, vn->sc_cred, p); vn->sc_vp = NULL; } if (vn->sc_shadow_vp != NULL) { + /* release long-term reference */ (void)vn_close(vn->sc_shadow_vp, FREAD | FWRITE, vn->sc_cred, p); vn->sc_shadow_vp = NULL; @@ -953,9 +1297,9 @@ vnclear(struct vn_softc *vn) shadow_map_free(vn->sc_shadow_map); vn->sc_shadow_map = NULL; } - vn->sc_flags = ~(VNF_INITED | VNF_READONLY); + vn->sc_flags &= ~(VNF_INITED | VNF_READONLY); if (vn->sc_cred) { - crfree(vn->sc_cred); + kauth_cred_rele(vn->sc_cred); vn->sc_cred = NULL; } vn->sc_size = 0; @@ -969,19 +1313,24 @@ vnclear(struct vn_softc *vn) static int vnsize(dev_t dev) { + int secsize; struct vn_softc *vn; int unit; + boolean_t funnel_state; unit = vnunit(dev); if (vnunit(dev) >= NVNDEVICE) { - return (ENXIO); + return (-1); } - vn = vn_table + unit; + funnel_state = thread_funnel_set(kernel_flock, TRUE); + vn = vn_table + unit; if ((vn->sc_flags & VNF_INITED) == 0) - return(-1); - - return(vn->sc_secsize); + secsize = -1; + else + secsize = vn->sc_secsize; + (void) thread_funnel_set(kernel_flock, funnel_state); + return (secsize); } #define CDEV_MAJOR -1 @@ -989,7 +1338,7 @@ vnsize(dev_t dev) static int vndevice_inited = 0; void -vndevice_init() +vndevice_init(void) { int i; @@ -1019,4 +1368,13 @@ vndevice_init() printf("vninit: devfs_make_node failed!\n"); } } + +static void +vn_ioctl_to_64(struct vn_ioctl *from, struct user_vn_ioctl *to) +{ + to->vn_file = CAST_USER_ADDR_T(from->vn_file); + to->vn_size = from->vn_size; + to->vn_control = from->vn_control; +} + #endif /* NVNDEVICE */ diff --git a/bsd/hfs/MacOSStubs.c b/bsd/hfs/MacOSStubs.c index 1dd6e6860..1b0fee293 100644 --- a/bsd/hfs/MacOSStubs.c +++ b/bsd/hfs/MacOSStubs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -43,7 +43,11 @@ struct timezone gTimeZone = {8*60,1}; */ UInt32 GetTimeUTC(void) { - return (time.tv_sec + MAC_GMT_FACTOR); + struct timeval tv; + + microtime(&tv); + + return (tv.tv_sec + MAC_GMT_FACTOR); } @@ -93,7 +97,7 @@ UInt32 UTCToLocal(UInt32 utcTime) * to_bsd_time - convert from Mac OS time (seconds since 1/1/1904) * to BSD time (seconds since 1/1/1970) */ -u_int32_t to_bsd_time(u_int32_t hfs_time) +time_t to_bsd_time(u_int32_t hfs_time) { u_int32_t gmt = hfs_time; @@ -102,16 +106,16 @@ u_int32_t to_bsd_time(u_int32_t hfs_time) else gmt = 0; /* don't let date go negative! 
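The GetTimeUTC and to_bsd_time/to_hfs_time changes above keep the long-standing epoch conversion: HFS timestamps count seconds from 1904, BSD from 1970, and pre-1970 dates clamp to zero rather than going negative. A compilable restatement of the two helpers (a sketch mirroring the patch's logic; the addition step in to_hfs_time is implied by the surrounding code):

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    /* Seconds between the Mac OS epoch (1/1/1904) and the Unix epoch
     * (1/1/1970); matches MAC_GMT_FACTOR in the patch. */
    #define MAC_GMT_FACTOR 2082844800UL

    static time_t
    to_bsd_time(uint32_t hfs_time)
    {
        /* Clamp dates before 1970 to zero, as the code above does. */
        return (hfs_time >= MAC_GMT_FACTOR)
            ? (time_t)(hfs_time - MAC_GMT_FACTOR) : 0;
    }

    static uint32_t
    to_hfs_time(time_t bsd_time)
    {
        uint32_t t = (uint32_t)bsd_time;

        /* Zero means "uninitialized" on disk, so leave it alone. */
        return (t != 0) ? (uint32_t)(t + MAC_GMT_FACTOR) : 0;
    }

    int
    main(void)
    {
        time_t now = time(NULL);

        printf("hfs=%u bsd=%ld\n", to_hfs_time(now),
               (long)to_bsd_time(to_hfs_time(now)));
        return 0;
    }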
*/ - return gmt; + return (time_t)gmt; } /* * to_hfs_time - convert from BSD time (seconds since 1/1/1970) * to Mac OS time (seconds since 1/1/1904) */ -u_int32_t to_hfs_time(u_int32_t bsd_time) +u_int32_t to_hfs_time(time_t bsd_time) { - u_int32_t hfs_time = bsd_time; + u_int32_t hfs_time = (u_int32_t)bsd_time; /* don't adjust zero - treat as uninitialized */ if (hfs_time != 0) diff --git a/bsd/hfs/Makefile b/bsd/hfs/Makefile index bea8d9526..cdc1fb8ba 100644 --- a/bsd/hfs/Makefile +++ b/bsd/hfs/Makefile @@ -24,7 +24,7 @@ DATAFILES = \ PRIVATE_DATAFILES = \ hfs.h hfs_attrlist.h hfs_catalog.h hfs_cnode.h hfs_endian.h \ - hfs_lockf.h hfs_macos_defs.h hfs_quota.h rangelist.h + hfs_fsctl.h hfs_macos_defs.h hfs_quota.h rangelist.h INSTALL_MI_LIST = ${DATAFILES} diff --git a/bsd/hfs/hfs.h b/bsd/hfs/hfs.h index defa4dc72..e9b96c239 100644 --- a/bsd/hfs/hfs.h +++ b/bsd/hfs/hfs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -30,15 +30,15 @@ #ifdef KERNEL #ifdef __APPLE_API_PRIVATE #include <sys/param.h> -#include <sys/lock.h> #include <sys/queue.h> #include <sys/mount.h> -#include <sys/namei.h> #include <sys/vnode.h> #include <sys/quota.h> #include <sys/dirent.h> #include <sys/event.h> +#include <kern/locks.h> + #include <vfs/vfs_journal.h> #include <hfs/hfs_format.h> @@ -48,10 +48,6 @@ #include <hfs/hfs_encodings.h> #include <hfs/hfs_hotfiles.h> - -struct uio; // This is more effective than #include <sys/uio.h> in case KERNEL is undefined... -struct hfslockf; /* For advisory locking */ - /* * Just reported via MIG interface. */ @@ -72,17 +68,8 @@ enum { kMDBSize = 512 }; /* Size of I/O transfer to read entire MDB */ enum { kMasterDirectoryBlock = 2 }; /* MDB offset on disk in 512-byte blocks */ enum { kMDBOffset = kMasterDirectoryBlock * 512 }; /* MDB offset on disk in bytes */ -enum { - kUnknownID = 0, - kRootParID = 1, - kRootDirID = 2 -}; +#define kRootDirID kHFSRootFolderID -enum { - kDataFork, - kRsrcFork, - kDirectory -}; /* number of locked buffer caches to hold for b-tree meta data */ #define kMaxLockedMetaBuffers 32 @@ -131,103 +118,79 @@ extern struct timezone gTimeZone; /* Internal Data structures*/ -struct vcb_t { - u_int16_t vcbSigWord; - int16_t vcbAtrb; - int16_t vcbFlags; - int16_t vcbspare; - u_int32_t vcbJinfoBlock; - - u_int32_t vcbCrDate; - u_int32_t vcbLsMod; - u_int32_t vcbVolBkUp; - - int32_t vcbFilCnt; - int32_t vcbDirCnt; - u_int32_t blockSize; /* size of allocation blocks */ - u_int32_t totalBlocks; /* total allocation blocks */ - u_int32_t freeBlocks; /* free allocation blocks */ - u_int32_t nextAllocation; /* start of next allocation search */ - int32_t vcbClpSiz; - u_int32_t vcbNxtCNID; - u_int32_t vcbCNIDGen; - int32_t vcbWrCnt; - - int32_t vcbFndrInfo[8]; - - u_int64_t encodingsBitmap; /* HFS Plus only */ - - u_int16_t vcbNmFls; /* HFS only */ - u_int16_t vcbNmRtDirs; /* HFS only */ - int16_t vcbVBMSt; /* HFS only */ - int16_t vcbAlBlSt; /* HFS only */ - - struct vnode * extentsRefNum; - struct vnode * catalogRefNum; - struct vnode * allocationsRefNum; - - u_int8_t vcbVN[256]; /* volume name in UTF-8 */ - u_int32_t volumeNameEncodingHint; - u_int32_t hfsPlusIOPosOffset; /* Disk block where HFS+ starts */ - u_int32_t vcbVBMIOSize; /* volume bitmap I/O size */ - - /* cache of largest known free extents */ - u_int32_t vcbFreeExtCnt; - HFSPlusExtentDescriptor vcbFreeExt[kMaxFreeExtents]; - - u_int32_t reserveBlocks; /*
free block reserve */ - u_int32_t loanedBlocks; /* blocks on loan for delayed allocations */ - - u_int32_t localCreateDate; /* creation times for HFS+ volumes are in local time */ - simple_lock_data_t vcbSimpleLock; /* simple lock to allow concurrent access to vcb data */ -}; -typedef struct vcb_t ExtendedVCB; - #define kHFS_DamagedVolume 0x1 /* This volume has errors, unmount dirty */ /* XXX */ #define MARK_VOLUMEDAMAGED(fcb) -/* - * NOTE: The code relies on being able to cast an ExtendedVCB* to a vfsVCB* in order - * to gain access to the mount point pointer from a pointer - * to an ExtendedVCB. DO NOT INSERT OTHER FIELDS BEFORE THE vcb FIELD!! - * - * vcbFlags, vcbLsMod, vcbFilCnt, vcbDirCnt, vcbNxtCNID, etc - * are locked by the hfs_lock simple lock. - */ -typedef struct vfsVCB { - ExtendedVCB vcb_vcb; - struct hfsmount *vcb_hfsmp; /* Pointer to hfsmount structure */ -} vfsVCB_t; - - /* This structure describes the HFS specific mount structure data. */ typedef struct hfsmount { - u_int32_t hfs_flags; /* see below */ - + u_int32_t hfs_flags; /* see below */ + /* Physical Description */ - u_long hfs_phys_block_count; /* Num of PHYSICAL blocks of volume */ - u_long hfs_phys_block_size; /* Always a multiple of 512 */ + u_long hfs_phys_block_size; /* Always a multiple of 512 */ + daddr64_t hfs_phys_block_count; /* Num of PHYSICAL blocks of volume */ + daddr64_t hfs_alt_id_sector; /* location of alternate VH/MDB */ /* Access to VFS and devices */ struct mount *hfs_mp; /* filesystem vfs structure */ struct vnode *hfs_devvp; /* block device mounted vnode */ - dev_t hfs_raw_dev; /* device mounted */ - struct netexport hfs_export; /* Export information */ - u_int32_t hfs_logBlockSize; /* Size of buffer cache buffer for I/O */ + struct vnode * hfs_extents_vp; + struct vnode * hfs_catalog_vp; + struct vnode * hfs_allocation_vp; + struct vnode * hfs_attribute_vp; + dev_t hfs_raw_dev; /* device mounted */ + u_int32_t hfs_logBlockSize; /* Size of buffer cache buffer for I/O */ /* Default values for HFS standard and non-init access */ - uid_t hfs_uid; /* uid to set as owner of the files */ - gid_t hfs_gid; /* gid to set as owner of the files */ - mode_t hfs_dir_mask; /* mask to and with directory protection bits */ - mode_t hfs_file_mask; /* mask to and with file protection bits */ - u_long hfs_encoding; /* Default encoding for non hfs+ volumes */ - - /* HFS Specific */ - struct vfsVCB hfs_vcb; + uid_t hfs_uid; /* uid to set as owner of the files */ + gid_t hfs_gid; /* gid to set as owner of the files */ + mode_t hfs_dir_mask; /* mask to and with directory protection bits */ + mode_t hfs_file_mask; /* mask to and with file protection bits */ + u_long hfs_encoding; /* Default encoding for non hfs+ volumes */ + + /* Persistent fields (on disk, dynamic) */ + time_t hfs_mtime; /* file system last modification time */ + u_int32_t hfs_filecount; /* number of files in file system */ + u_int32_t hfs_dircount; /* number of directories in file system */ + u_int32_t freeBlocks; /* free allocation blocks */ + u_int32_t nextAllocation; /* start of next allocation search */ + u_int32_t vcbNxtCNID; /* next unused catalog node ID */ + u_int32_t vcbWrCnt; /* file system write count */ + u_int64_t encodingsBitmap; /* in-use encodings */ + u_int16_t vcbNmFls; /* HFS Only - root dir file count */ + u_int16_t vcbNmRtDirs; /* HFS Only - root dir directory count */ + + /* Persistent fields (on disk, static) */ + u_int16_t vcbSigWord; + int16_t vcbFlags; + u_int32_t vcbAtrb; + u_int32_t vcbJinfoBlock; + time_t hfs_itime; /*
file system creation time */ + time_t hfs_btime; /* file system last backup time */ + u_int32_t blockSize; /* size of allocation blocks */ + u_int32_t totalBlocks; /* total allocation blocks */ + int32_t vcbClpSiz; + u_int32_t vcbFndrInfo[8]; + int16_t vcbVBMSt; /* HFS only */ + int16_t vcbAlBlSt; /* HFS only */ + + /* vcb stuff */ + u_int8_t vcbVN[256]; /* volume name in UTF-8 */ + u_int32_t volumeNameEncodingHint; + u_int32_t hfsPlusIOPosOffset; /* Disk block where HFS+ starts */ + u_int32_t vcbVBMIOSize; /* volume bitmap I/O size */ + + /* cache of largest known free extents */ + u_int32_t vcbFreeExtCnt; + HFSPlusExtentDescriptor vcbFreeExt[kMaxFreeExtents]; + + u_int32_t reserveBlocks; /* free block reserve */ + u_int32_t loanedBlocks; /* blocks on loan for delayed allocations */ + + u_int32_t localCreateDate; /* creation times for HFS+ volumes are in local time */ struct cat_desc hfs_privdir_desc; struct cat_attr hfs_privdir_attr; u_int32_t hfs_metadata_createdate; @@ -244,14 +207,18 @@ typedef struct hfsmount { u_int32_t jnl_size; u_int32_t hfs_jnlfileid; u_int32_t hfs_jnlinfoblkid; - volatile int readers; - volatile int blocker; + lck_rw_t hfs_global_lock; + u_int32_t hfs_global_lock_nesting; /* Notification variables: */ unsigned long hfs_notification_conditions; u_int32_t hfs_freespace_notify_warninglimit; u_int32_t hfs_freespace_notify_desiredlevel; + /* time mounted and last mounted mod time "snapshot" */ + time_t hfs_mount_time; + time_t hfs_last_mounted_mtime; + /* Metadata allocation zone variables: */ u_int32_t hfs_metazone_start; u_int32_t hfs_metazone_end; @@ -263,6 +230,7 @@ typedef struct hfsmount { int hfs_catalog_maxblks; /* Hot File Clustering variables: */ + lck_mtx_t hfc_mutex; /* serialize hot file stages */ enum hfc_stage hfc_stage; /* what are we up to... 
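The new lck_rw_t hfs_global_lock field above replaces the hand-rolled readers/blocker protocol whose tsleep/wakeup macros are deleted a little further below. In POSIX terms the whole scheme reduces to a plain read/write lock; this contrast is purely illustrative and the function names are invented:

    #include <pthread.h>

    /* Shared acquisitions model the old readers++ path; the exclusive
     * pair models the blocker path that waited out all readers. */
    static pthread_rwlock_t global_lock = PTHREAD_RWLOCK_INITIALIZER;

    static void journal_modify_begin(void)  { pthread_rwlock_rdlock(&global_lock); }
    static void journal_modify_end(void)    { pthread_rwlock_unlock(&global_lock); }
    static void journal_disable_begin(void) { pthread_rwlock_wrlock(&global_lock); }
    static void journal_disable_end(void)   { pthread_rwlock_unlock(&global_lock); }

    int
    main(void)
    {
        journal_modify_begin();
        journal_modify_end();
        journal_disable_begin();
        journal_disable_end();
        return 0;
    }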
*/ time_t hfc_timebase; /* recording period start time */ time_t hfc_timeout; /* recording period stop time */ @@ -275,8 +243,24 @@ typedef struct hfsmount { struct vnode * hfs_backingfs_rootvp; int hfs_sparsebandblks; #endif + size_t hfs_max_inline_attrsize; + + lck_mtx_t hfs_mutex; /* protects access to hfsmount data */ + void *hfs_freezing_proc; /* who froze the fs */ } hfsmount_t; +typedef hfsmount_t ExtendedVCB; + +/* Aliases for legacy field names */ +#define vcbCrDate hfs_itime +#define vcbLsMod hfs_mtime +#define vcbVolBkUp hfs_btime +#define extentsRefNum hfs_extents_vp +#define catalogRefNum hfs_catalog_vp +#define allocationsRefNum hfs_allocation_vp +#define vcbFilCnt hfs_filecount +#define vcbDirCnt hfs_dircount + /* HFS mount point flags */ #define HFS_READ_ONLY 0x001 @@ -289,47 +273,24 @@ typedef struct hfsmount { #define HFS_METADATA_ZONE 0x080 #define HFS_FRAGMENTED_FREESPACE 0x100 #define HFS_NEED_JNL_RESET 0x200 - -#ifdef HFS_SPARSE_DEV #define HFS_HAS_SPARSE_DEVICE 0x400 -#endif -#define hfs_global_shared_lock_acquire(hfsmp) \ - do { \ - if (hfsmp->blocker) { \ - tsleep((caddr_t)&hfsmp->blocker, PRIBIO, "journal_blocker", 0); \ - continue; \ - } \ - hfsmp->readers++; \ - break; \ - } while (1) - -#define hfs_global_shared_lock_release(hfsmp) \ - do { \ - hfsmp->readers--; \ - if (hfsmp->readers == 0) { \ - wakeup((caddr_t)&hfsmp->readers); \ - } \ - } while (0) - -#define hfs_global_exclusive_lock_acquire(hfsmp) \ - do { \ - if (hfsmp->blocker) { \ - tsleep((caddr_t)&hfsmp->blocker, PRIBIO, "journal_blocker", 0); \ - continue; \ - } \ - if (hfsmp->readers != 0) { \ - tsleep((caddr_t)&hfsmp->readers, PRIBIO, "journal_enable/disble", 0); \ - continue; \ - } \ - hfsmp->blocker = 1; \ - break; \ - } while (1) - -#define hfs_global_exclusive_lock_release(hfsmp) \ - hfsmp->blocker = 0; \ - wakeup((caddr_t)&hfsmp->blocker) +#define HFS_MOUNT_LOCK(hfsmp, metadata) \ + { \ + if ((metadata) && 1) \ + lck_mtx_lock(&(hfsmp)->hfs_mutex); \ + } \ + +#define HFS_MOUNT_UNLOCK(hfsmp, metadata) \ + { \ + if ((metadata) && 1) \ + lck_mtx_unlock(&(hfsmp)->hfs_mutex); \ + } \ + +#define hfs_global_exclusive_lock_acquire(hfsmp) lck_rw_lock_exclusive(&(hfsmp)->hfs_global_lock) +#define hfs_global_exclusive_lock_release(hfsmp) lck_rw_done(&(hfsmp)->hfs_global_lock) + #define MAXHFSVNODELEN 31 @@ -341,28 +302,19 @@ typedef struct filefork FCB; (void) sprintf((name), "%s%d", HFS_INODE_PREFIX, (linkno)) -/* structure to hold a "." or ".." directory entry (12 bytes) */ -typedef struct hfsdotentry { - u_int32_t d_fileno; /* unique file number */ - u_int16_t d_reclen; /* length of this structure */ - u_int8_t d_type; /* dirent file type */ - u_int8_t d_namelen; /* len of filename */ - char d_name[4]; /* "." or ".." 
*/ -} hfsdotentry; #define HFS_AVERAGE_NAME_SIZE 22 #define AVERAGE_HFSDIRENTRY_SIZE (8+HFS_AVERAGE_NAME_SIZE+4) -#define MAX_HFSDIRENTRY_SIZE sizeof(struct dirent) -#define DIRENTRY_SIZE(namlen) \ - ((sizeof(struct dirent) - (NAME_MAX+1)) + (((namlen)+1 + 3) &~ 3)) +#define STD_DIRENT_LEN(namlen) \ + ((sizeof(struct dirent) - (NAME_MAX+1)) + (((namlen)+1 + 3) &~ 3)) +#define EXT_DIRENT_LEN(namlen) \ + ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 3) & ~3) enum { kHFSPlusMaxFileNameBytes = kHFSPlusMaxFileNameChars * 3 }; -enum { kdirentMaxNameBytes = NAME_MAX }; - /* macro to determine if hfs or hfsplus */ #define ISHFSPLUS(VCB) ((VCB)->vcbSigWord == kHFSPlusSigWord) @@ -372,25 +324,25 @@ enum { kdirentMaxNameBytes = NAME_MAX }; /* * Various ways to acquire a VFS mount point pointer: */ -#define VTOVFS(VP) ((VP)->v_mount) +#define VTOVFS(VP) vnode_mount((VP)) #define HFSTOVFS(HFSMP) ((HFSMP)->hfs_mp) -#define VCBTOVFS(VCB) (((struct vfsVCB *)(VCB))->vcb_hfsmp->hfs_mp) +#define VCBTOVFS(VCB) HFSTOVFS(VCB) /* * Various ways to acquire an HFS mount point pointer: */ -#define VTOHFS(VP) ((struct hfsmount *)((VP)->v_mount->mnt_data)) -#define VFSTOHFS(MP) ((struct hfsmount *)(MP)->mnt_data) -#define VCBTOHFS(VCB) (((struct vfsVCB *)(VCB))->vcb_hfsmp) -#define FCBTOHFS(FCB) ((struct hfsmount *)(FCB)->ff_cp->c_vp->v_mount->mnt_data) +#define VTOHFS(VP) ((struct hfsmount *)vfs_fsprivate(vnode_mount((VP)))) +#define VFSTOHFS(MP) ((struct hfsmount *)vfs_fsprivate((MP))) +#define VCBTOHFS(VCB) (VCB) +#define FCBTOHFS(FCB) ((struct hfsmount *)vfs_fsprivate(vnode_mount((FCB)->ff_cp->c_vp))) /* - * Various ways to acquire a VCB pointer: + * Various ways to acquire a VCB (legacy) pointer: */ -#define VTOVCB(VP) (&(((struct hfsmount *)((VP)->v_mount->mnt_data))->hfs_vcb.vcb_vcb)) -#define VFSTOVCB(MP) (&(((struct hfsmount *)(MP)->mnt_data)->hfs_vcb.vcb_vcb)) -#define HFSTOVCB(HFSMP) (&(HFSMP)->hfs_vcb.vcb_vcb) -#define FCBTOVCB(FCB) (&(((struct hfsmount *)((FCB)->ff_cp->c_vp->v_mount->mnt_data))->hfs_vcb.vcb_vcb)) +#define VTOVCB(VP) VTOHFS(VP) +#define VFSTOVCB(MP) VFSTOHFS(MP) +#define HFSTOVCB(HFSMP) (HFSMP) +#define FCBTOVCB(FCB) FCBTOHFS(FCB) #define HFS_KNOTE(vp, hint) KNOTE(&VTOC(vp)->c_knotes, (hint)) @@ -408,6 +360,17 @@ enum { kdirentMaxNameBytes = NAME_MAX }; #define HFS_ALT_SECTOR(blksize, blkcnt) (((blkcnt) - 1) - (512 / (blksize))) #define HFS_ALT_OFFSET(blksize) ((blksize) > 1024 ? 
(blksize) - 1024 : 0) + +/* + * HFS specific fcntl()'s + */ +#define HFS_BULKACCESS (FCNTL_FS_SPECIFIC_BASE + 0x00001) +#define HFS_GET_MOUNT_TIME (FCNTL_FS_SPECIFIC_BASE + 0x00002) +#define HFS_GET_LAST_MTIME (FCNTL_FS_SPECIFIC_BASE + 0x00003) +#define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004) +#define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005) + + /* * This is the straight GMT conversion constant: * 00:00:00 January 1, 1970 - 00:00:00 January 1, 1904 @@ -416,25 +379,18 @@ enum { kdirentMaxNameBytes = NAME_MAX }; #define MAC_GMT_FACTOR 2082844800UL -u_int32_t to_bsd_time(u_int32_t hfs_time); -u_int32_t to_hfs_time(u_int32_t bsd_time); +time_t to_bsd_time(u_int32_t hfs_time); +u_int32_t to_hfs_time(time_t bsd_time); int hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush); #define HFS_ALTFLUSH 1 extern int hfsUnmount(struct hfsmount *hfsmp, struct proc *p); - -extern int hfs_getcnode(struct hfsmount *hfsmp, cnid_t cnid, struct cat_desc *descp, - int wantrsrc, struct cat_attr *attrp, struct cat_fork *forkp, - struct vnode **vpp); - -extern int hfs_getnewvnode(struct hfsmount *hfsmp, struct cnode *cp, +extern int hfs_getnewvnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, struct cat_desc *descp, int wantrsrc, struct cat_attr *attrp, struct cat_fork *forkp, struct vnode **vpp); -extern int hfs_metafilelocking(struct hfsmount *hfsmp, u_long fileID, u_int flags, struct proc *p); - extern u_int32_t hfs_freeblks(struct hfsmount * hfsmp, int wantreserve); extern void hfs_remove_orphans(struct hfsmount *); @@ -464,13 +420,13 @@ unsigned long BestBlockSizeFit(unsigned long allocationBlockSize, OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, struct proc *p); OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, - off_t embeddedOffset, u_int64_t disksize, struct proc *p, void *args); + off_t embeddedOffset, u_int64_t disksize, struct proc *p, void *args, kauth_cred_t cred); extern int hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, - void *_args, int embeddedOffset, int mdb_offset, + void *_args, off_t embeddedOffset, daddr64_t mdb_offset, HFSMasterDirectoryBlock *mdbp, struct ucred *cred); -extern u_long GetFileInfo(ExtendedVCB *vcb, u_int32_t dirid, char *name, - struct cat_attr *fattr, struct cat_fork *forkinfo); +extern u_long GetFileInfo(ExtendedVCB *vcb, u_int32_t dirid, const char *name, + struct cat_attr *fattr, struct cat_fork *forkinfo); int hfs_getconverter(u_int32_t encoding, hfs_to_unicode_func_t *get_unicode, unicode_to_hfs_func_t *get_hfsname); @@ -505,6 +461,47 @@ extern int hfs_virtualmetafile(struct cnode *); void hfs_generate_volume_notifications(struct hfsmount *hfsmp); +__private_extern__ u_int32_t hfs_getencodingbias(void); +__private_extern__ void hfs_setencodingbias(u_int32_t bias); + +extern int hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, + struct vnode **rvpp, struct proc *p); + +extern int hfs_update(struct vnode *, int); + +extern int hfs_truncate(struct vnode *, off_t, int, int, vfs_context_t); + +extern int hfs_fsync(struct vnode *, int, int, struct proc *); + +extern int hfs_access(struct vnode *, mode_t, struct ucred *, struct proc *); + +extern int hfs_vget(struct hfsmount *, cnid_t, struct vnode **, int); + +extern int hfs_bmap(struct vnode *, daddr_t, struct vnode **, daddr64_t *, int *); + +extern int hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid); + +__private_extern__ int 
hfs_start_transaction(struct hfsmount *hfsmp); +__private_extern__ int hfs_end_transaction(struct hfsmount *hfsmp); + +extern int hfs_setextendedsecurity(struct hfsmount *hfsmp, int state); +extern void hfs_checkextendedsecurity(struct hfsmount *hfsmp); + +extern int hfs_extendfs(struct hfsmount *, u_int64_t, vfs_context_t); +extern int hfs_truncatefs(struct hfsmount *, u_int64_t, vfs_context_t); + +extern int hfs_isallocated(struct hfsmount *, u_long, u_long); + + +/* HFS System file locking */ +#define SFL_CATALOG 0x0001 +#define SFL_EXTENTS 0x0002 +#define SFL_BITMAP 0x0004 +#define SFL_ATTRIBUTE 0x0008 +#define SFL_VALIDMASK (SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE) + +extern int hfs_systemfile_lock(struct hfsmount *, int, enum hfslocktype); +extern void hfs_systemfile_unlock(struct hfsmount *, int); #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ diff --git a/bsd/hfs/hfs_attrlist.c b/bsd/hfs/hfs_attrlist.c index 23084bcda..c3d29a9e8 100644 --- a/bsd/hfs/hfs_attrlist.c +++ b/bsd/hfs/hfs_attrlist.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -33,6 +33,10 @@ #include <sys/attr.h> #include <sys/stat.h> #include <sys/unistd.h> +#include <sys/mount_internal.h> +#include <sys/kauth.h> + +#include <kern/locks.h> #include "hfs.h" #include "hfs_cnode.h" @@ -43,23 +47,23 @@ /* Routines that are shared by hfs_setattr: */ -extern int hfs_write_access(struct vnode *vp, struct ucred *cred, +extern int hfs_write_access(struct vnode *vp, kauth_cred_t cred, struct proc *p, Boolean considerFlags); -extern int hfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred, +extern int hfs_chflags(struct vnode *vp, uint32_t flags, kauth_cred_t cred, struct proc *p); -extern int hfs_chmod(struct vnode *vp, int mode, struct ucred *cred, +extern int hfs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct proc *p); -extern int hfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, +extern int hfs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred, struct proc *p); -extern char * hfs_getnamehint(struct cnode *dcp, int index); +__private_extern__ int hfs_vnop_readdirattr(struct vnop_readdirattr_args *ap); -extern void hfs_savenamehint(struct cnode *dcp, int index, const char * namehint); +__private_extern__ int hfs_vnop_setattrlist(struct vnop_setattrlist_args *ap); -extern void hfs_relnamehint(struct cnode *dcp, int index); +__private_extern__ int hfs_vnop_getattrlist(struct vnop_getattrlist_args *ap); /* Packing routines: */ @@ -68,7 +72,7 @@ static void packvolcommonattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *vp, struct proc *p); static void packvolattr(struct attrblock *abp, struct hfsmount *hfsmp, - struct vnode *vp, struct proc *p); + struct vnode *vp); static void packcommonattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *vp, struct cat_desc * cdp, @@ -76,60 +80,52 @@ static void packcommonattr(struct attrblock *abp, struct hfsmount *hfsmp, static void packfileattr(struct attrblock *abp, struct hfsmount *hfsmp, struct cat_attr *cattrp, struct cat_fork *datafork, - struct cat_fork *rsrcfork, struct proc *p); + struct cat_fork *rsrcfork); static void packdirattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *vp, struct cat_desc * descp, - struct cat_attr * cattrp, struct proc *p); + struct cat_attr * cattrp); -static void unpackattrblk(struct 
attrblock *abp, struct vnode *vp); + +#if 0 +static int unpackattrblk(struct attrblock *abp, struct vnode *vp); static void unpackcommonattr(struct attrblock *abp, struct vnode *vp); -static void unpackvolattr(struct attrblock *abp, struct hfsmount *hfsmp, - struct vnode *rootvp); +static int unpackvolattr(struct attrblock *abp, struct hfsmount *hfsmp, + struct vnode *root_vp); /* - -# -#% getattrlist vp = = = -# - vop_getattrlist { - IN struct vnode *vp; - IN struct attrlist *alist; - INOUT struct uio *uio; - IN struct ucred *cred; - IN struct proc *p; - }; - + * Get a list of attributes. */ __private_extern__ int -hfs_getattrlist(ap) - struct vop_getattrlist_args /* { +hfs_vnop_getattrlist(ap) + struct vnop_getattrlist_args /* { struct vnode *a_vp; struct attrlist *a_alist struct uio *a_uio; - struct ucred *a_cred; - struct proc *a_p; + int a_options; + vfs_context_t a_context; } */ *ap; { struct vnode *vp = ap->a_vp; - struct cnode *cp = VTOC(vp); - struct hfsmount *hfsmp = VTOHFS(vp); + struct cnode *cp; + struct hfsmount *hfsmp; struct attrlist *alist = ap->a_alist; - struct timeval tv; + proc_t p = vfs_context_proc(ap->a_context); int fixedblocksize; int attrblocksize; int attrbufsize; - void *attrbufptr; + void *attrbufptr = NULL; void *attrptr; void *varptr; struct attrblock attrblk; struct cat_fork *datafp = NULL; struct cat_fork *rsrcfp = NULL; - struct cat_fork rsrcfork = {0}; + struct cat_fork rsrcfork; + int lockflags; int error = 0; if ((alist->bitmapcount != ATTR_BIT_MAP_COUNT) || @@ -157,68 +153,75 @@ hfs_getattrlist(ap) return (EINVAL); } - /* Requesting volume information requires root vnode */ - if ((alist->volattr) && cp->c_fileid != kRootDirID) - return (EINVAL); + if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) + return (error); + cp = VTOC(vp); + hfsmp = VTOHFS(vp); + /* Requesting volume information requires root vnode */ + if ((alist->volattr) && cp->c_fileid != kHFSRootFolderID) { + error = EINVAL; + goto exit; + } /* Asking for data fork attributes from the rsrc fork is not supported */ - if (VNODE_IS_RSRC(vp) && (alist->fileattr & ATTR_DATAFORK_MASK)) - return (EINVAL); - + if (VNODE_IS_RSRC(vp) && (alist->fileattr & ATTR_DATAFORK_MASK)) { + error = EINVAL; + goto exit; + } /* This file no longer exists! */ - if (cp->c_flag & (C_NOEXISTS | C_DELETED)) - return (ENOENT); - + if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { + error = ENOENT; + goto exit; + } /* This file doesn't have a name! */ - if ((cp->c_desc.cd_namelen == 0) && (alist->commonattr & ATTR_CMN_NAME)) - return (ENOENT); + if ((cp->c_desc.cd_namelen == 0) && (alist->commonattr & ATTR_CMN_NAME)) { + error = ENOENT; + goto exit; + } /* Update cnode times if needed */ - tv = time; - CTIMES(cp, &tv, &tv); + hfs_touchtimes(hfsmp, cp); /* * If a File ID (ATTR_CMN_OBJPERMANENTID) is requested on * an HFS volume we must be sure to create the thread * record before returning it. (yikes) */ - if ((vp->v_type == VREG) && + if (vnode_isreg(vp) && (alist->commonattr & ATTR_CMN_OBJPERMANENTID) && (VTOVCB(vp)->vcbSigWord != kHFSPlusSigWord)) { - cat_cookie_t cookie = {0}; - - if (hfsmp->hfs_flags & HFS_READ_ONLY) - return (EROFS); - if ((error = hfs_write_access(vp, ap->a_cred, ap->a_p, false)) != 0) - return (error); + cat_cookie_t cookie; + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + error = EROFS; + goto exit; + } + if ((error = hfs_write_access(vp, vfs_context_ucred(ap->a_context), + p, false)) != 0) { + goto exit; + } /* * Reserve some space in the Catalog file. 
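The packing code later in this function lays each record out as a leading 32-bit length word, a fixed-size attribute area, and a trailing variable-length area reached through offset references. A standalone sketch of that layout; the attr_ref structure here is a hypothetical stand-in for the getattrlist attrreference convention, where the offset is relative to the referencing field:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct attr_ref {
        int32_t  attr_dataoffset;   /* offset from this field to the data */
        uint32_t attr_length;
    };

    /* Pack one name attribute: length word, fixed area (one attr_ref),
     * then the variable-length string. Returns total bytes generated. */
    static uint32_t
    pack_name(void *bufp, const char *name)
    {
        uint32_t *lenp = (uint32_t *)bufp;          /* filled in last */
        struct attr_ref *ref = (struct attr_ref *)(lenp + 1);
        char *var = (char *)(ref + 1);              /* variable area */
        uint32_t namelen = (uint32_t)strlen(name) + 1;

        ref->attr_dataoffset = (int32_t)(var - (char *)ref);
        ref->attr_length = namelen;
        memcpy(var, name, namelen);
        *lenp = (uint32_t)((var + namelen) - (char *)bufp);
        return *lenp;
    }

    int
    main(void)
    {
        uint32_t buf[64];   /* aligned scratch space */

        printf("packed %u bytes\n", pack_name(buf, "example.txt"));
        return 0;
    }

Writing the length word last, once varptr has advanced, is what lets the kernel copy out exactly the bytes that were generated and no more.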
*/ - error = cat_preflight(hfsmp, CAT_CREATE, &cookie, ap->a_p); - if (error) - return (error); - - /* Lock catalog b-tree */ - error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, - LK_EXCLUSIVE, ap->a_p); + bzero(&cookie, sizeof(cookie)); + error = cat_preflight(hfsmp, CAT_CREATE, &cookie, p); if (error) { - cat_postflight(hfsmp, &cookie, ap->a_p); - return (error); - } + goto exit; + } + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); error = cat_insertfilethread(hfsmp, &cp->c_desc); - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, - ap->a_p); + hfs_systemfile_unlock(hfsmp, lockflags); - cat_postflight(hfsmp, &cookie, ap->a_p); + cat_postflight(hfsmp, &cookie, p); if (error) - return (error); + goto exit; } - + bzero(&rsrcfork, sizeof(rsrcfork)); /* Establish known fork data */ if (cp->c_datafork != NULL) { datafp = &cp->c_datafork->ff_data; @@ -235,25 +238,23 @@ hfs_getattrlist(ap) * fetched from the catalog. */ if ((alist->fileattr & ATTR_RSRCFORK_MASK) && (rsrcfp == NULL)) { - /* Lock catalog b-tree */ - error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, ap->a_p); - if (error) - return (error); + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); /* Get resource fork data */ error = cat_lookup(hfsmp, &cp->c_desc, 1, - (struct cat_desc *)0, (struct cat_attr *)0, &rsrcfork); + (struct cat_desc *)0, (struct cat_attr *)0, &rsrcfork, NULL); + + hfs_systemfile_unlock(hfsmp, lockflags); - /* Unlock the Catalog */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, ap->a_p); if (error) - return (error); + goto exit; rsrcfp = &rsrcfork; } fixedblocksize = hfs_attrblksize(alist); - attrblocksize = fixedblocksize + (sizeof(u_long)); /* u_long for length longword */ + attrblocksize = fixedblocksize + (sizeof(uint32_t)); /* uint32_t for length word */ if (alist->commonattr & ATTR_CMN_NAME) attrblocksize += kHFSPlusMaxFileNameBytes + 1; if (alist->volattr & ATTR_VOL_MOUNTPOINT) @@ -266,11 +267,11 @@ hfs_getattrlist(ap) if (alist->fileattr & ATTR_FILE_FORKLIST) attrblocksize += 0; #endif - attrbufsize = MIN(ap->a_uio->uio_resid, attrblocksize); + attrbufsize = MIN(uio_resid(ap->a_uio), attrblocksize); MALLOC(attrbufptr, void *, attrblocksize, M_TEMP, M_WAITOK); attrptr = attrbufptr; - *((u_long *)attrptr) = 0; /* Set buffer length in case of errors */ - ++((u_long *)attrptr); /* Reserve space for length field */ + *((uint32_t *)attrptr) = 0; /* Set buffer length in case of errors */ + ++((uint32_t *)attrptr); /* Reserve space for length field */ varptr = ((char *)attrptr) + fixedblocksize; attrblk.ab_attrlist = alist; @@ -280,50 +281,41 @@ hfs_getattrlist(ap) attrblk.ab_blocksize = attrblocksize; hfs_packattrblk(&attrblk, hfsmp, vp, &cp->c_desc, &cp->c_attr, - datafp, rsrcfp, ap->a_p); + datafp, rsrcfp, p); /* Don't copy out more data than was generated */ - attrbufsize = MIN(attrbufsize, (u_int)varptr - (u_int)attrbufptr); + attrbufsize = MIN((u_int)attrbufsize, (u_int)varptr - (u_int)attrbufptr); /* Set actual buffer length for return to caller */ - *((u_long *)attrbufptr) = attrbufsize; + *((uint32_t *)attrbufptr) = attrbufsize; error = uiomove((caddr_t)attrbufptr, attrbufsize, ap->a_uio); - - FREE(attrbufptr, M_TEMP); +exit: + if (attrbufptr) + FREE(attrbufptr, M_TEMP); + hfs_unlock(cp); return (error); } /* - -# -#% setattrlist vp L L L -# - vop_setattrlist { - IN struct vnode *vp; - IN struct attrlist *alist; - INOUT struct uio *uio; - IN struct ucred *cred; - IN struct proc *p; - }; - + * Set a 
list of attributes. */ __private_extern__ int -hfs_setattrlist(ap) - struct vop_setattrlist_args /* { +hfs_vnop_setattrlist(ap) + struct vnop_setattrlist_args /* { struct vnode *a_vp; struct attrlist *a_alist struct uio *a_uio; - struct ucred *a_cred; - struct proc *a_p; + int a_options; + vfs_context_t a_context; } */ *ap; { struct vnode *vp = ap->a_vp; - struct cnode *cp = VTOC(vp); - struct hfsmount * hfsmp = VTOHFS(vp); + struct cnode *cp; + struct hfsmount * hfsmp; struct attrlist *alist = ap->a_alist; - struct ucred *cred = ap->a_cred; - struct proc *p = ap->a_p; + kauth_cred_t cred = vfs_context_ucred(ap->a_context); + struct proc *p = vfs_context_proc(ap->a_context); int attrblocksize; void *attrbufptr = NULL; void *attrptr; @@ -332,9 +324,11 @@ hfs_setattrlist(ap) uid_t saved_uid; gid_t saved_gid; mode_t saved_mode; - u_long saved_flags; + uint32_t saved_flags; int error = 0; + hfsmp = VTOHFS(vp); + if (hfsmp->hfs_flags & HFS_READ_ONLY) return (EROFS); if ((alist->bitmapcount != ATTR_BIT_MAP_COUNT) || @@ -344,6 +338,10 @@ hfs_setattrlist(ap) ((alist->fileattr & ~ATTR_FILE_SETMASK) != 0)) { return (EINVAL); } + if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) + return (error); + cp = VTOC(vp); + /* * When setting volume attributes make sure * that ATTR_VOL_INFO is set and that all @@ -352,24 +350,27 @@ hfs_setattrlist(ap) if ((alist->volattr != 0) && (((alist->volattr & ATTR_VOL_INFO) == 0) || (alist->commonattr & ~ATTR_CMN_VOLSETMASK) || - (cp->c_fileid != kRootDirID))) { + (cp->c_fileid != kHFSRootFolderID))) { if ((alist->volattr & ATTR_VOL_INFO) == 0) printf("hfs_setattrlist: you forgot to set ATTR_VOL_INFO bit!\n"); else printf("hfs_setattrlist: you cannot set bits 0x%08X!\n", alist->commonattr & ~ATTR_CMN_VOLSETMASK); - return (EINVAL); + error = EINVAL; + goto ErrorExit; + } + if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { + error = ENOENT; + goto ErrorExit; } - if (cp->c_flag & (C_NOEXISTS | C_DELETED)) - return (ENOENT); - // XXXdbg - don't allow modifying the journal or journal_info_block if (hfsmp->jnl && cp->c_datafork) { struct HFSPlusExtentDescriptor *extd; extd = &cp->c_datafork->ff_extents[0]; if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { - return EPERM; + error = EPERM; + goto ErrorExit; } } @@ -394,27 +395,30 @@ hfs_setattrlist(ap) * change so this check is sufficient for now. */ if ((error = hfs_owner_rights(hfsmp, cp->c_uid, cred, p, true)) != 0) - return (error); + goto ErrorExit; } /* * For any other attributes, check to see if the user has - * write access to the cnode in question [unlike VOP_ACCESS, + * write access to the cnode in question [unlike vn_access, * ignore IMMUTABLE here]: */ if (((alist->commonattr & ~ATTR_OWNERSHIP_SETMASK) != 0) || (alist->volattr != 0) || (alist->dirattr != 0) || (alist->fileattr != 0)) { if ((error = hfs_write_access(vp, cred, p, false)) != 0) - return (error); + goto ErrorExit; } /* * Allocate the buffer now to minimize the time we might * be blocked holding the catalog lock. 
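hfs_vnop_setattrlist snapshots the cnode's owner, mode, and flags before unpacking the caller's buffer, then (as the hunks below show) invokes hfs_chown, hfs_chmod, and hfs_chflags only for values that actually changed. A simplified model of that snapshot-compare-apply idea, with stand-in types and helpers rather than the exact sequence the driver uses:

    #include <stdio.h>
    #include <sys/types.h>

    struct node { uid_t uid; gid_t gid; mode_t mode; };

    /* Trivial stand-ins for the heavyweight hfs_chown/hfs_chmod paths. */
    static int do_chown(struct node *np, uid_t u, gid_t g)
    {
        np->uid = u; np->gid = g; return 0;
    }
    static int do_chmod(struct node *np, mode_t m)
    {
        np->mode = m; return 0;
    }

    static int
    apply_attrs(struct node *np, const struct node *wanted)
    {
        struct node saved = *np;    /* snapshot before unpacking */
        int error = 0;

        *np = *wanted;              /* models unpackattrblk() overwriting fields */
        if (saved.uid != np->uid || saved.gid != np->gid)
            error = do_chown(np, np->uid, np->gid);
        if (error == 0 && saved.mode != np->mode)
            error = do_chmod(np, np->mode);
        return error;
    }

    int
    main(void)
    {
        struct node n = { 0, 0, 0644 };
        struct node w = { 501, 20, 0600 };

        printf("error=%d mode=%o\n", apply_attrs(&n, &w), (unsigned)n.mode);
        return 0;
    }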
*/ - attrblocksize = ap->a_uio->uio_resid; - if (attrblocksize < hfs_attrblksize(alist)) - return (EINVAL); + // LP64todo - fix this + attrblocksize = uio_resid(ap->a_uio); + if (attrblocksize < hfs_attrblksize(alist)) { + error = EINVAL; + goto ErrorExit; + } MALLOC(attrbufptr, void *, attrblocksize, M_TEMP, M_WAITOK); @@ -434,7 +438,9 @@ hfs_setattrlist(ap) attrblk.ab_varbufpp = &varptr; attrblk.ab_flags = 0; attrblk.ab_blocksize = attrblocksize; - unpackattrblk(&attrblk, vp); + error = unpackattrblk(&attrblk, vp); + if (error) + goto ErrorExit; /* If unpacking changed the owner/group then call hfs_chown() */ if ((saved_uid != cp->c_uid) || (saved_gid != cp->c_gid)) { @@ -459,7 +465,7 @@ hfs_setattrlist(ap) } /* If unpacking changed the flags then call hfs_chflags() */ if (saved_flags !=cp->c_flags) { - u_long flags; + uint32_t flags; flags = cp->c_flags; cp->c_flags = saved_flags; @@ -470,13 +476,10 @@ hfs_setattrlist(ap) * If any cnode attributes changed then do an update. */ if (alist->volattr == 0) { - struct timeval tv; - cp->c_flag |= C_MODIFIED; - tv = time; - CTIMES(cp, &tv, &tv); - if ((error = VOP_UPDATE(vp, &tv, &tv, 1))) + if ((error = hfs_update(vp, TRUE))) { goto ErrorExit; + } } /* Volume Rename */ if (alist->volattr & ATTR_VOL_NAME) { @@ -489,32 +492,34 @@ hfs_setattrlist(ap) */ copystr(cp->c_desc.cd_nameptr, vcb->vcbVN, sizeof(vcb->vcbVN), NULL); } else { - struct cat_desc to_desc = {0}; - struct cat_desc todir_desc = {0}; - struct cat_desc new_desc = {0}; - cat_cookie_t cookie = {0}; + struct cat_desc to_desc; + struct cat_desc todir_desc; + struct cat_desc new_desc; + cat_cookie_t cookie; int catreserve = 0; int catlocked = 0; int started_tr = 0; + int lockflags; + + bzero(&to_desc, sizeof(to_desc)); + bzero(&todir_desc, sizeof(todir_desc)); + bzero(&new_desc, sizeof(new_desc)); + bzero(&cookie, sizeof(cookie)); - todir_desc.cd_parentcnid = kRootParID; - todir_desc.cd_cnid = kRootParID; + todir_desc.cd_parentcnid = kHFSRootParentID; + todir_desc.cd_cnid = kHFSRootFolderID; todir_desc.cd_flags = CD_ISDIR; to_desc.cd_nameptr = vcb->vcbVN; to_desc.cd_namelen = strlen(vcb->vcbVN); - to_desc.cd_parentcnid = kRootParID; + to_desc.cd_parentcnid = kHFSRootParentID; to_desc.cd_cnid = cp->c_cnid; to_desc.cd_flags = CD_ISDIR; - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if ((error = journal_start_transaction(hfsmp->jnl) != 0)) { - goto rename_out; - } - started_tr = 1; + if ((error = hfs_start_transaction(hfsmp) != 0)) { + goto rename_out; } + started_tr = 1; /* * Reserve some space in the Catalog file. 
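The volume-rename path here acquires a transaction, a catalog reservation, and the catalog lock in order, tracking each with a flag so that the rename_out path in the next hunk can release exactly what was taken. A distilled version of that single-exit unwind idiom; every function below is hypothetical:

    #include <stdio.h>

    static int  take_transaction(void)  { return 0; }
    static int  take_reservation(void)  { return 0; }
    static int  take_lock(void)         { return 0; }
    static void drop_transaction(void)  { }
    static void drop_reservation(void)  { }
    static void drop_lock(void)         { }

    static int
    rename_volume(void)
    {
        int started_tr = 0, reserved = 0, locked = 0;
        int error;

        if ((error = take_transaction()) != 0)
            goto out;
        started_tr = 1;
        if ((error = take_reservation()) != 0)
            goto out;
        reserved = 1;
        if ((error = take_lock()) != 0)
            goto out;
        locked = 1;
        /* ... the cat_rename work would go here ... */
    out:
        /* Release in reverse order, only what was actually taken. */
        if (locked)     drop_lock();
        if (reserved)   drop_reservation();
        if (started_tr) drop_transaction();
        return error;
    }

    int
    main(void)
    {
        return rename_volume();
    }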
@@ -525,25 +530,21 @@ hfs_setattrlist(ap) } catreserve = 1; - /* Lock catalog b-tree */ - error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); - if (error) { - goto rename_out; - } + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); catlocked = 1; error = cat_rename(hfsmp, &cp->c_desc, &todir_desc, &to_desc, &new_desc); rename_out: if (catlocked) { - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + hfs_systemfile_unlock(hfsmp, lockflags); } if (catreserve) { cat_postflight(hfsmp, &cookie, p); } + (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); if (started_tr) { - journal_end_transaction(hfsmp->jnl); + hfs_end_transaction(hfsmp); } - hfs_global_shared_lock_release(hfsmp); if (error) { /* Restore the old name in the VCB */ @@ -558,12 +559,12 @@ rename_out: cp->c_desc.cd_nameptr = 0; cp->c_desc.cd_namelen = 0; cp->c_desc.cd_flags &= ~CD_HASBUF; - remove_name(name); + vfs_removename(name); } /* Update cnode's catalog descriptor */ replace_desc(cp, &new_desc); vcb->volumeNameEncodingHint = new_desc.cd_encoding; - cp->c_flag |= C_CHANGE; + cp->c_touch_chgtime = TRUE; } } @@ -580,9 +581,10 @@ ErrorExit: if (attrbufptr) FREE(attrbufptr, M_TEMP); + hfs_unlock(cp); return (error); } - +#endif /* * readdirattr operation will return attributes for the items in the @@ -604,7 +606,7 @@ ErrorExit: # #% readdirattr vp L L L # -vop_readdirattr { +vnop_readdirattr { IN struct vnode *vp; IN struct attrlist *alist; INOUT struct uio *uio; @@ -614,13 +616,13 @@ vop_readdirattr { OUT int *eofflag; OUT u_long *actualCount; OUT u_long **cookies; - IN struct ucred *cred; + IN kauth_cred_t cred; }; */ __private_extern__ int -hfs_readdirattr(ap) - struct vop_readdirattr_args /* { +hfs_vnop_readdirattr(ap) + struct vnop_readdirattr_args /* { struct vnode *a_vp; struct attrlist *a_alist; struct uio *a_uio; @@ -629,49 +631,39 @@ hfs_readdirattr(ap) u_long *a_newstate; int *a_eofflag; u_long *a_actualcount; - u_long **a_cookies; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { struct vnode *dvp = ap->a_vp; - struct cnode *dcp = VTOC(dvp); - struct hfsmount * hfsmp = VTOHFS(dvp); + struct cnode *dcp; + struct hfsmount * hfsmp; struct attrlist *alist = ap->a_alist; - struct uio *uio = ap->a_uio; + uio_t uio = ap->a_uio; int maxcount = ap->a_maxcount; - struct proc *p = current_proc(); - u_long fixedblocksize; - u_long maxattrblocksize; - u_long currattrbufsize; + struct proc *p = vfs_context_proc(ap->a_context); + uint32_t fixedblocksize; + uint32_t maxattrblocksize; + uint32_t currattrbufsize; void *attrbufptr = NULL; void *attrptr; void *varptr; struct attrblock attrblk; int error = 0; int depleted = 0; - int index, startindex; + int index; int i, dir_entries; struct cat_desc *lastdescp = NULL; - struct cat_desc prevdesc; - char * prevnamebuf = NULL; struct cat_entrylist *ce_list = NULL; - - dir_entries = dcp->c_entries; - if (dcp->c_attr.ca_fileid == kHFSRootFolderID && hfsmp->jnl) { - dir_entries -= 3; - } + directoryhint_t *dirhint = NULL; + unsigned int tag; + int shared_cnode_lock = 0; *(ap->a_actualcount) = 0; *(ap->a_eofflag) = 0; - - if (ap->a_cookies != NULL) { - printf("readdirattr: no cookies!\n"); - return (EINVAL); - } /* Check for invalid options and buffer space. 
*/ if (((ap->a_options & ~(FSOPT_NOINMEMUPDATE | FSOPT_NOFOLLOW)) != 0) - || (uio->uio_resid <= 0) || (uio->uio_iovcnt > 1) || (maxcount <= 0)) + || (uio_resid(uio) <= 0) || (uio_iovcnt(uio) > 1) || (maxcount <= 0)) return (EINVAL); /* This call doesn't take volume attributes. */ @@ -682,17 +674,29 @@ hfs_readdirattr(ap) ((alist->fileattr & ~ATTR_FILE_VALIDMASK) != 0)) return (EINVAL); + if ((error = hfs_lock(VTOC(dvp), HFS_EXCLUSIVE_LOCK))) + return (error); + dcp = VTOC(dvp); + hfsmp = VTOHFS(dvp); + /* Reject requests for unsupported options. */ if ((alist->commonattr & (ATTR_CMN_NAMEDATTRCOUNT | ATTR_CMN_NAMEDATTRLIST | ATTR_CMN_OBJPERMANENTID)) || (alist->fileattr & (ATTR_FILE_FILETYPE | ATTR_FILE_FORKCOUNT | ATTR_FILE_FORKLIST | ATTR_FILE_DATAEXTENTS | ATTR_FILE_RSRCEXTENTS))) { printf("readdirattr: unsupported attributes! (%s)\n", dcp->c_desc.cd_nameptr); - return (EINVAL); + error = EINVAL; + goto exit; + } + + dir_entries = dcp->c_entries; + if (dcp->c_attr.ca_fileid == kHFSRootFolderID && hfsmp->jnl) { + dir_entries -= 3; } /* Convert uio_offset into a directory index. */ - startindex = index = uio->uio_offset / sizeof(struct dirent); + index = uio_offset(uio) & HFS_INDEX_MASK; + tag = uio_offset(uio) & ~HFS_INDEX_MASK; if ((index + 1) > dir_entries) { *(ap->a_eofflag) = 1; error = 0; @@ -700,7 +704,7 @@ hfs_readdirattr(ap) } /* Get a buffer to hold packed attributes. */ - fixedblocksize = (sizeof(u_long) + hfs_attrblksize(alist)); /* u_long for length */ + fixedblocksize = (sizeof(uint32_t) + hfs_attrblksize(alist)); /* 4 bytes for length */ maxattrblocksize = fixedblocksize; if (alist->commonattr & ATTR_CMN_NAME) maxattrblocksize += kHFSPlusMaxFileNameBytes + 1; @@ -713,38 +717,48 @@ hfs_readdirattr(ap) bzero(ce_list, sizeof(*ce_list)); ce_list->maxentries = MAXCATENTRIES; - /* Initialize a starting descriptor. */ - bzero(&prevdesc, sizeof(prevdesc)); - prevdesc.cd_flags = CD_DECOMPOSED; - prevdesc.cd_hint = dcp->c_childhint; - prevdesc.cd_parentcnid = dcp->c_cnid; - prevdesc.cd_nameptr = hfs_getnamehint(dcp, index); - prevdesc.cd_namelen = prevdesc.cd_nameptr ? strlen(prevdesc.cd_nameptr) : 0; - + /* Get a directory hint (cnode must be locked exclusive) */ + dirhint = hfs_getdirhint(dcp, ((index - 1) & HFS_INDEX_MASK) | tag); + + /* Hide tag from catalog layer. */ + dirhint->dh_index &= HFS_INDEX_MASK; + if (dirhint->dh_index == HFS_INDEX_MASK) { + dirhint->dh_index = -1; + } + + /* + * An ATTR_CMN_USERACCESS attribute request can result in a + * call to kauth_cred_ismember_gid(). So when requesting + * this attribute we downgrade our exclusive lock on dcp to + * a shared lock in case kauth_cred_ismember_gid generates + * an indirect call back into the file system. + */ + if (alist->commonattr & ATTR_CMN_USERACCESS) { + lck_rw_lock_exclusive_to_shared(&dcp->c_rwlock); + dcp->c_lockowner = HFS_SHARED_OWNER; + shared_cnode_lock = 1; + } /* * Obtain a list of catalog entries and pack their attributes until * the output buffer is full or maxcount entries have been packed. */ while (!depleted) { int maxentries; + int lockflags; /* Constrain our list size. */ - maxentries = uio->uio_resid / (fixedblocksize + HFS_AVERAGE_NAME_SIZE); + maxentries = uio_resid(uio) / (fixedblocksize + HFS_AVERAGE_NAME_SIZE); maxentries = min(maxentries, dcp->c_entries - index); maxentries = min(maxentries, maxcount); ce_list->maxentries = min(maxentries, ce_list->maxentries); lastdescp = NULL; - /* Lock catalog b-tree. 
 */ - error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, p); - if (error) - goto exit; + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - error = cat_getentriesattr(hfsmp, &prevdesc, index, ce_list); + error = cat_getentriesattr(hfsmp, dirhint, ce_list); /* Don't forget to release the descriptors later! */ - /* Unlock catalog b-tree. */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + hfs_systemfile_unlock(hfsmp, lockflags); if (error == ENOENT) { *(ap->a_eofflag) = TRUE; @@ -755,15 +769,16 @@ hfs_readdirattr(ap) break; /* Process the catalog entries. */ - for (i = 0; i < ce_list->realentries; ++i) { + for (i = 0; i < (int)ce_list->realentries; ++i) { struct cnode *cp = NULL; struct vnode *vp = NULL; - struct vnode *rvp = NULL; struct cat_desc * cdescp; struct cat_attr * cattrp; - struct cat_fork c_datafork = {0}; - struct cat_fork c_rsrcfork = {0}; + struct cat_fork c_datafork; + struct cat_fork c_rsrcfork; + bzero(&c_datafork, sizeof(c_datafork)); + bzero(&c_rsrcfork, sizeof(c_rsrcfork)); cdescp = &ce_list->entry[i].ce_desc; cattrp = &ce_list->entry[i].ce_attr; c_datafork.cf_size = ce_list->entry[i].ce_datasize; @@ -774,8 +789,10 @@ hfs_readdirattr(ap) * Get in-memory cnode data (if any). */ if (!(ap->a_options & FSOPT_NOINMEMUPDATE)) { - cp = hfs_chashget(dcp->c_dev, cattrp->ca_fileid, 0, &vp, &rvp); - if (cp != NULL) { + vp = hfs_chash_getvnode(dcp->c_dev, cattrp->ca_fileid, 0, 0); + + if (vp != NULL) { + cp = VTOC(vp); /* Only use cnode's descriptor for non-hardlinks */ if (!(cp->c_flag & C_HARDLINK)) cdescp = &cp->c_desc; @@ -790,7 +807,7 @@ hfs_readdirattr(ap) } } } - *((u_long *)attrptr)++ = 0; /* move it past length */ + *((uint32_t *)attrptr)++ = 0; /* move it past length */ attrblk.ab_attrlist = alist; attrblk.ab_attrbufpp = &attrptr; attrblk.ab_varbufpp = &varptr; @@ -803,21 +820,20 @@ hfs_readdirattr(ap) currattrbufsize = ((char *)varptr - (char *)attrbufptr); /* All done with cnode. */ - if (vp) { - vput(vp); + if (vp != NULL) { + hfs_unlock(VTOC(vp)); + vnode_put(vp); vp = NULL; - } else if (rvp) { - vput(rvp); - rvp = NULL; + cp = NULL; } - cp = NULL; /* Make sure there's enough buffer space remaining. */ - if (currattrbufsize > uio->uio_resid) { + // LP64todo - fix this! + if (uio_resid(uio) < 0 || currattrbufsize > (uint32_t)uio_resid(uio)) { depleted = 1; break; } else { - *((u_long *)attrbufptr) = currattrbufsize; + *((uint32_t *)attrbufptr) = currattrbufsize; error = uiomove((caddr_t)attrbufptr, currattrbufsize, ap->a_uio); if (error != E_NONE) { depleted = 1; @@ -832,7 +848,9 @@ hfs_readdirattr(ap) /* Termination checks */ if ((--maxcount <= 0) || - (uio->uio_resid < (fixedblocksize + HFS_AVERAGE_NAME_SIZE)) || + // LP64todo - fix this!
+ uio_resid(uio) < 0 || + ((uint32_t)uio_resid(uio) < (fixedblocksize + HFS_AVERAGE_NAME_SIZE)) || (index >= dir_entries)) { depleted = 1; break; @@ -844,46 +862,56 @@ hfs_readdirattr(ap) if (index < dir_entries && !(*(ap->a_eofflag)) && lastdescp != NULL) { - if (prevnamebuf == NULL) - MALLOC(prevnamebuf, char *, kHFSPlusMaxFileNameBytes + 1, M_TEMP, M_WAITOK); - bcopy(lastdescp->cd_nameptr, prevnamebuf, lastdescp->cd_namelen + 1); - if (!depleted) { - prevdesc.cd_hint = lastdescp->cd_hint; - prevdesc.cd_nameptr = prevnamebuf; - prevdesc.cd_namelen = lastdescp->cd_namelen + 1; + + /* Remember last entry */ + if (dirhint->dh_desc.cd_nameptr != NULL) { + vfs_removename(dirhint->dh_desc.cd_nameptr); } + dirhint->dh_desc.cd_namelen = lastdescp->cd_namelen; + dirhint->dh_desc.cd_nameptr = + vfs_addname(lastdescp->cd_nameptr, lastdescp->cd_namelen, 0, 0); + dirhint->dh_index = index - 1; + dirhint->dh_desc.cd_cnid = lastdescp->cd_cnid; + dirhint->dh_desc.cd_hint = lastdescp->cd_hint; + dirhint->dh_desc.cd_encoding = lastdescp->cd_encoding; } /* All done with the catalog descriptors. */ - for (i = 0; i < ce_list->realentries; ++i) + for (i = 0; i < (int)ce_list->realentries; ++i) cat_releasedesc(&ce_list->entry[i].ce_desc); ce_list->realentries = 0; } /* while not depleted */ *ap->a_newstate = dcp->c_mtime; - - /* All done with last name hint */ - hfs_relnamehint(dcp, startindex); - startindex = 0; - /* Convert directory index into uio_offset. */ - uio->uio_offset = index * sizeof(struct dirent); + /* Make sure dcp is locked exclusive before changing c_dirhinttag. */ + if (shared_cnode_lock) { + lck_rw_lock_shared_to_exclusive(&dcp->c_rwlock); + dcp->c_lockowner = current_thread(); + shared_cnode_lock = 0; + } - /* Save a name hint if there are more entries */ - if ((error == 0) && prevnamebuf && (index + 1) < dcp->c_entries) - hfs_savenamehint(dcp, index, prevnamebuf); -exit: - if (startindex > 0) - hfs_relnamehint(dcp, startindex); + /* Convert directory index back into a uio_offset. 
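+ * The offset handed back to user space is a composite value: the low
+ * HFS_INDEX_BITS carry the directory index and the high bits carry a
+ * per-directory tag drawn from dcp->c_dirhinttag.  The loop below
+ * skips a tag of zero, so a resumed offset can never be mistaken for
+ * a fresh scan, and a mismatched tag presumably lets a stale or
+ * recycled directory hint be detected on a later call.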
*/ + while (tag == 0) tag = (++dcp->c_dirhinttag) << HFS_INDEX_BITS; + uio_setoffset(uio, index | tag); + dirhint->dh_index |= tag; +exit: + /* Drop directory hint on error or if there are no more entries */ + if (dirhint && (error || index >= dir_entries)) { + if (shared_cnode_lock) { + lck_rw_lock_shared_to_exclusive(&dcp->c_rwlock); + dcp->c_lockowner = current_thread(); + } + hfs_reldirhint(dcp, dirhint); + } if (attrbufptr) FREE(attrbufptr, M_TEMP); if (ce_list) FREE(ce_list, M_TEMP); - if (prevnamebuf) - FREE(prevnamebuf, M_TEMP); + hfs_unlock(dcp); return (error); } @@ -911,16 +939,16 @@ hfs_packattrblk(struct attrblock *abp, packvolcommonattr(abp, hfsmp, vp, p); if (attrlistp->volattr & ~ATTR_VOL_INFO) - packvolattr(abp, hfsmp, vp, p); + packvolattr(abp, hfsmp, vp); } else { if (attrlistp->commonattr) packcommonattr(abp, hfsmp, vp, descp, attrp, p); if (attrlistp->dirattr && S_ISDIR(attrp->ca_mode)) - packdirattr(abp, hfsmp, vp, descp,attrp, p); + packdirattr(abp, hfsmp, vp, descp,attrp); if (attrlistp->fileattr && !S_ISDIR(attrp->ca_mode)) - packfileattr(abp, hfsmp, attrp, datafork, rsrcfork, p); + packfileattr(abp, hfsmp, attrp, datafork, rsrcfork); } } @@ -928,7 +956,7 @@ hfs_packattrblk(struct attrblock *abp, static char* mountpointname(struct mount *mp) { - size_t namelength = strlen(mp->mnt_stat.f_mntonname); + size_t namelength = strlen(mp->mnt_vfsstat.f_mntonname); int foundchars = 0; char *c; @@ -940,7 +968,7 @@ mountpointname(struct mount *mp) * the first slash encountered (which must precede the * last part of the pathname). */ - for (c = mp->mnt_stat.f_mntonname + namelength - 1; + for (c = mp->mnt_vfsstat.f_mntonname + namelength - 1; namelength > 0; --c, --namelength) { if (*c != '/') { foundchars = 1; @@ -949,7 +977,7 @@ mountpointname(struct mount *mp) } } - return (mp->mnt_stat.f_mntonname); + return (mp->mnt_vfsstat.f_mntonname); } @@ -958,14 +986,13 @@ packnameattr( struct attrblock *abp, struct vnode *vp, char *name, - int namelen, - struct proc *p) + int namelen) { void *varbufptr; struct attrreference * attr_refptr; char *mpname; size_t mpnamelen; - u_long attrlength; + uint32_t attrlength; char empty = 0; /* A cnode's name may be incorrect for the root of a mounted @@ -974,8 +1001,8 @@ packnameattr( * root directory, it's best to return the last element of the location where the volume's mounted: */ - if ((vp != NULL) && (vp->v_flag & VROOT) && - (mpname = mountpointname(vp->v_mount))) { + if ((vp != NULL) && vnode_isvroot(vp) && + (mpname = mountpointname(vnode_mount(vp)))) { mpnamelen = strlen(mpname); /* Trim off any trailing slashes: */ @@ -1023,12 +1050,13 @@ packvolcommonattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *v struct cnode *cp = VTOC(vp); struct mount *mp = VTOVFS(vp); ExtendedVCB *vcb = HFSTOVCB(hfsmp); - u_long attrlength; + u_int32_t attrlength; + boolean_t is_64_bit = proc_is64bit(p); attr = abp->ab_attrlist->commonattr; if (ATTR_CMN_NAME & attr) { - packnameattr(abp, vp, cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen, p); + packnameattr(abp, vp, cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen); attrbufptr = *abp->ab_attrbufpp; varbufptr = *abp->ab_varbufpp; } @@ -1036,7 +1064,11 @@ packvolcommonattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *v *((dev_t *)attrbufptr)++ = hfsmp->hfs_raw_dev; } if (ATTR_CMN_FSID & attr) { - *((fsid_t *)attrbufptr) = mp->mnt_stat.f_fsid; + fsid_t fsid; + + fsid.val[0] = (long)hfsmp->hfs_raw_dev; + fsid.val[1] = (long)vfs_typenum(mp); + *((fsid_t *)attrbufptr) = fsid; ++((fsid_t 
*)attrbufptr); } if (ATTR_CMN_OBJTYPE & attr) { @@ -1061,7 +1093,7 @@ packvolcommonattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *v ++((fsobj_id_t *)attrbufptr); } if (ATTR_CMN_SCRIPT & attr) { - u_long encoding; + uint32_t encoding; if (vcb->vcbSigWord == kHFSPlusSigWord) encoding = vcb->volumeNameEncodingHint; @@ -1070,29 +1102,64 @@ packvolcommonattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *v *((text_encoding_t *)attrbufptr)++ = encoding; } if (ATTR_CMN_CRTIME & attr) { - ((struct timespec *)attrbufptr)->tv_sec = vcb->vcbCrDate; - ((struct timespec *)attrbufptr)->tv_nsec = 0; - ++((struct timespec *)attrbufptr); + if (is_64_bit) { + ((struct user_timespec *)attrbufptr)->tv_sec = vcb->vcbCrDate; + ((struct user_timespec *)attrbufptr)->tv_nsec = 0; + ++((struct user_timespec *)attrbufptr); + } + else { + ((struct timespec *)attrbufptr)->tv_sec = vcb->vcbCrDate; + ((struct timespec *)attrbufptr)->tv_nsec = 0; + ++((struct timespec *)attrbufptr); + } } if (ATTR_CMN_MODTIME & attr) { - ((struct timespec *)attrbufptr)->tv_sec = vcb->vcbLsMod; - ((struct timespec *)attrbufptr)->tv_nsec = 0; - ++((struct timespec *)attrbufptr); + if (is_64_bit) { + ((struct user_timespec *)attrbufptr)->tv_sec = vcb->vcbLsMod; + ((struct user_timespec *)attrbufptr)->tv_nsec = 0; + ++((struct user_timespec *)attrbufptr); + } + else { + ((struct timespec *)attrbufptr)->tv_sec = vcb->vcbLsMod; + ((struct timespec *)attrbufptr)->tv_nsec = 0; + ++((struct timespec *)attrbufptr); + } } if (ATTR_CMN_CHGTIME & attr) { - ((struct timespec *)attrbufptr)->tv_sec = vcb->vcbLsMod; - ((struct timespec *)attrbufptr)->tv_nsec = 0; - ++((struct timespec *)attrbufptr); + if (is_64_bit) { + ((struct user_timespec *)attrbufptr)->tv_sec = vcb->vcbLsMod; + ((struct user_timespec *)attrbufptr)->tv_nsec = 0; + ++((struct user_timespec *)attrbufptr); + } + else { + ((struct timespec *)attrbufptr)->tv_sec = vcb->vcbLsMod; + ((struct timespec *)attrbufptr)->tv_nsec = 0; + ++((struct timespec *)attrbufptr); + } } if (ATTR_CMN_ACCTIME & attr) { - ((struct timespec *)attrbufptr)->tv_sec = vcb->vcbLsMod; - ((struct timespec *)attrbufptr)->tv_nsec = 0; - ++((struct timespec *)attrbufptr); + if (is_64_bit) { + ((struct user_timespec *)attrbufptr)->tv_sec = vcb->vcbLsMod; + ((struct user_timespec *)attrbufptr)->tv_nsec = 0; + ++((struct user_timespec *)attrbufptr); + } + else { + ((struct timespec *)attrbufptr)->tv_sec = vcb->vcbLsMod; + ((struct timespec *)attrbufptr)->tv_nsec = 0; + ++((struct timespec *)attrbufptr); + } } if (ATTR_CMN_BKUPTIME & attr) { - ((struct timespec *)attrbufptr)->tv_sec = vcb->vcbVolBkUp; - ((struct timespec *)attrbufptr)->tv_nsec = 0; - ++((struct timespec *)attrbufptr); + if (is_64_bit) { + ((struct user_timespec *)attrbufptr)->tv_sec = vcb->vcbVolBkUp; + ((struct user_timespec *)attrbufptr)->tv_nsec = 0; + ++((struct user_timespec *)attrbufptr); + } + else { + ((struct timespec *)attrbufptr)->tv_sec = vcb->vcbVolBkUp; + ((struct timespec *)attrbufptr)->tv_nsec = 0; + ++((struct timespec *)attrbufptr); + } } if (ATTR_CMN_FNDRINFO & attr) { bcopy (&vcb->vcbFndrInfo, attrbufptr, sizeof(vcb->vcbFndrInfo)); @@ -1100,13 +1167,14 @@ packvolcommonattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *v } if (ATTR_CMN_OWNERID & attr) { if (cp->c_uid == UNKNOWNUID) - *((uid_t *)attrbufptr)++ = p->p_ucred->cr_uid; + *((uid_t *)attrbufptr)++ = kauth_cred_getuid(proc_ucred(p)); else *((uid_t *)attrbufptr)++ = cp->c_uid; } if (ATTR_CMN_GRPID & attr) { *((gid_t *)attrbufptr)++ = 
cp->c_gid; } + if (ATTR_CMN_ACCESSMASK & attr) { /* * [2856576] Since we are dynamically changing the owner, also @@ -1115,11 +1183,11 @@ packvolcommonattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *v * a security hole where set-user-id programs run as whoever is * logged on (or root if nobody is logged in yet!) */ - *((u_long *)attrbufptr)++ = + *((uint32_t *)attrbufptr)++ = (cp->c_uid == UNKNOWNUID) ? cp->c_mode & ~(S_ISUID | S_ISGID) : cp->c_mode; } if (ATTR_CMN_NAMEDATTRCOUNT & attr) { - *((u_long *)attrbufptr)++ = 0; /* XXX PPD TBC */ + *((uint32_t *)attrbufptr)++ = 0; /* XXX PPD TBC */ } if (ATTR_CMN_NAMEDATTRLIST & attr) { attrlength = 0; @@ -1133,12 +1201,12 @@ packvolcommonattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *v ++((struct attrreference *)attrbufptr); } if (ATTR_CMN_FLAGS & attr) { - *((u_long *)attrbufptr)++ = cp->c_flags; + *((uint32_t *)attrbufptr)++ = cp->c_flags; } if (ATTR_CMN_USERACCESS & attr) { - *((u_long *)attrbufptr)++ = + *((uint32_t *)attrbufptr)++ = DerivePermissionSummary(cp->c_uid, cp->c_gid, cp->c_mode, - VTOVFS(vp), current_proc()->p_ucred, current_proc()); + VTOVFS(vp), kauth_cred_get(), proc_self()); } *abp->ab_attrbufpp = attrbufptr; @@ -1147,7 +1215,7 @@ packvolcommonattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *v static void -packvolattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *vp, struct proc *p) +packvolattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *vp) { attrgroup_t attr; void *attrbufptr = *abp->ab_attrbufpp; @@ -1155,15 +1223,15 @@ packvolattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *vp, str struct cnode *cp = VTOC(vp); struct mount *mp = VTOVFS(vp); ExtendedVCB *vcb = HFSTOVCB(hfsmp); - u_long attrlength; + uint32_t attrlength; attr = abp->ab_attrlist->volattr; if (ATTR_VOL_FSTYPE & attr) { - *((u_long *)attrbufptr)++ = (u_long)mp->mnt_vfc->vfc_typenum; + *((uint32_t *)attrbufptr)++ = (uint32_t)vfs_typenum(mp); } if (ATTR_VOL_SIGNATURE & attr) { - *((u_long *)attrbufptr)++ = (u_long)vcb->vcbSigWord; + *((uint32_t *)attrbufptr)++ = (uint32_t)vcb->vcbSigWord; } if (ATTR_VOL_SIZE & attr) { *((off_t *)attrbufptr)++ = @@ -1184,30 +1252,30 @@ packvolattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *vp, str *((off_t *)attrbufptr)++ = (off_t)(vcb->vcbClpSiz); } if (ATTR_VOL_IOBLOCKSIZE & attr) { - *((u_long *)attrbufptr)++ = (u_long)hfsmp->hfs_logBlockSize; + *((uint32_t *)attrbufptr)++ = hfsmp->hfs_logBlockSize; } if (ATTR_VOL_OBJCOUNT & attr) { - *((u_long *)attrbufptr)++ = - (u_long)vcb->vcbFilCnt + (u_long)vcb->vcbDirCnt; + *((uint32_t *)attrbufptr)++ = + (uint32_t)vcb->vcbFilCnt + (uint32_t)vcb->vcbDirCnt; } if (ATTR_VOL_FILECOUNT & attr) { - *((u_long *)attrbufptr)++ = (u_long)vcb->vcbFilCnt; + *((uint32_t *)attrbufptr)++ = (uint32_t)vcb->vcbFilCnt; } if (ATTR_VOL_DIRCOUNT & attr) { - *((u_long *)attrbufptr)++ = (u_long)vcb->vcbDirCnt; + *((uint32_t *)attrbufptr)++ = (uint32_t)vcb->vcbDirCnt; } if (ATTR_VOL_MAXOBJCOUNT & attr) { - *((u_long *)attrbufptr)++ = 0xFFFFFFFF; + *((uint32_t *)attrbufptr)++ = 0xFFFFFFFF; } if (ATTR_VOL_MOUNTPOINT & attr) { ((struct attrreference *)attrbufptr)->attr_dataoffset = (char *)varbufptr - (char *)attrbufptr; ((struct attrreference *)attrbufptr)->attr_length = - strlen(mp->mnt_stat.f_mntonname) + 1; + strlen(mp->mnt_vfsstat.f_mntonname) + 1; attrlength = ((struct attrreference *)attrbufptr)->attr_length; /* round up to the next 4-byte boundary: */ attrlength = attrlength + ((4 - 
(attrlength & 3)) & 3); - (void) bcopy(mp->mnt_stat.f_mntonname, varbufptr, attrlength); + (void) bcopy(mp->mnt_vfsstat.f_mntonname, varbufptr, attrlength); /* Advance beyond the space just allocated: */ (char *)varbufptr += attrlength; @@ -1228,18 +1296,18 @@ packvolattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *vp, str (char *)varbufptr += attrlength; ++((struct attrreference *)attrbufptr); } - if (ATTR_VOL_MOUNTFLAGS & attr) { - *((u_long *)attrbufptr)++ = (u_long)mp->mnt_flag; - } + if (ATTR_VOL_MOUNTFLAGS & attr) { + *((uint32_t *)attrbufptr)++ = (uint32_t)vfs_flags(mp); + } if (ATTR_VOL_MOUNTEDDEVICE & attr) { ((struct attrreference *)attrbufptr)->attr_dataoffset = (char *)varbufptr - (char *)attrbufptr; ((struct attrreference *)attrbufptr)->attr_length = - strlen(mp->mnt_stat.f_mntfromname) + 1; + strlen(mp->mnt_vfsstat.f_mntfromname) + 1; attrlength = ((struct attrreference *)attrbufptr)->attr_length; /* round up to the next 4-byte boundary: */ attrlength = attrlength + ((4 - (attrlength & 3)) & 3); - (void) bcopy(mp->mnt_stat.f_mntfromname, varbufptr, attrlength); + (void) bcopy(mp->mnt_vfsstat.f_mntfromname, varbufptr, attrlength); /* Advance beyond the space just allocated: */ (char *)varbufptr += attrlength; @@ -1255,13 +1323,13 @@ packvolattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *vp, str vcapattrptr = (vol_capabilities_attr_t *)attrbufptr; if (vcb->vcbSigWord == kHFSPlusSigWord) { - u_int32_t journal_active; + u_int32_t journal_active_cap; u_int32_t case_sensitive; if (hfsmp->jnl) - journal_active = VOL_CAP_FMT_JOURNAL_ACTIVE; + journal_active_cap = VOL_CAP_FMT_JOURNAL_ACTIVE; else - journal_active = 0; + journal_active_cap = 0; if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE) case_sensitive = VOL_CAP_FMT_CASE_SENSITIVE; @@ -1273,10 +1341,11 @@ packvolattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *vp, str VOL_CAP_FMT_SYMBOLICLINKS | VOL_CAP_FMT_HARDLINKS | VOL_CAP_FMT_JOURNAL | - journal_active | + journal_active_cap | case_sensitive | VOL_CAP_FMT_CASE_PRESERVING | - VOL_CAP_FMT_FAST_STATFS ; + VOL_CAP_FMT_FAST_STATFS | + VOL_CAP_FMT_2TB_FILESIZE; } else { /* Plain HFS */ vcapattrptr->capabilities[VOL_CAPABILITIES_FORMAT] = VOL_CAP_FMT_PERSISTENTOBJECTIDS | @@ -1307,7 +1376,8 @@ packvolattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *vp, str VOL_CAP_FMT_ZERO_RUNS | VOL_CAP_FMT_CASE_SENSITIVE | VOL_CAP_FMT_CASE_PRESERVING | - VOL_CAP_FMT_FAST_STATFS ; + VOL_CAP_FMT_FAST_STATFS | + VOL_CAP_FMT_2TB_FILESIZE; vcapattrptr->valid[VOL_CAPABILITIES_INTERFACES] = VOL_CAP_INT_SEARCHFS | VOL_CAP_INT_ATTRLIST | @@ -1360,10 +1430,11 @@ packcommonattr( struct mount *mp = HFSTOVFS(hfsmp); void *attrbufptr = *abp->ab_attrbufpp; void *varbufptr = *abp->ab_varbufpp; - u_long attrlength = 0; + uint32_t attrlength = 0; + boolean_t is_64_bit = proc_is64bit(p); if (ATTR_CMN_NAME & attr) { - packnameattr(abp, vp, cdp->cd_nameptr, cdp->cd_namelen, p); + packnameattr(abp, vp, cdp->cd_nameptr, cdp->cd_namelen); attrbufptr = *abp->ab_attrbufpp; varbufptr = *abp->ab_varbufpp; } @@ -1371,7 +1442,11 @@ packcommonattr( *((dev_t *)attrbufptr)++ = hfsmp->hfs_raw_dev; } if (ATTR_CMN_FSID & attr) { - *((fsid_t *)attrbufptr) = mp->mnt_stat.f_fsid; + fsid_t fsid; + + fsid.val[0] = (long)hfsmp->hfs_raw_dev; + fsid.val[1] = (long)vfs_typenum(mp); + *((fsid_t *)attrbufptr) = fsid; ++((fsid_t *)attrbufptr); } if (ATTR_CMN_OBJTYPE & attr) { @@ -1392,7 +1467,7 @@ packcommonattr( * and Carbon APIs, which are hardlink-ignorant, will always * receive 
the c_cnid (from getattrlist). */ - if (ATTR_CMN_OBJID & attr) { + if (ATTR_CMN_OBJID & attr) { ((fsobj_id_t *)attrbufptr)->fid_objno = cdp->cd_cnid; ((fsobj_id_t *)attrbufptr)->fid_generation = 0; ++((fsobj_id_t *)attrbufptr); @@ -1411,29 +1486,64 @@ packcommonattr( *((text_encoding_t *)attrbufptr)++ = cdp->cd_encoding; } if (ATTR_CMN_CRTIME & attr) { - ((struct timespec *)attrbufptr)->tv_sec = cap->ca_itime; - ((struct timespec *)attrbufptr)->tv_nsec = 0; - ++((struct timespec *)attrbufptr); + if (is_64_bit) { + ((struct user_timespec *)attrbufptr)->tv_sec = cap->ca_itime; + ((struct user_timespec *)attrbufptr)->tv_nsec = 0; + ++((struct user_timespec *)attrbufptr); + } + else { + ((struct timespec *)attrbufptr)->tv_sec = cap->ca_itime; + ((struct timespec *)attrbufptr)->tv_nsec = 0; + ++((struct timespec *)attrbufptr); + } } if (ATTR_CMN_MODTIME & attr) { - ((struct timespec *)attrbufptr)->tv_sec = cap->ca_mtime; - ((struct timespec *)attrbufptr)->tv_nsec = 0; - ++((struct timespec *)attrbufptr); + if (is_64_bit) { + ((struct user_timespec *)attrbufptr)->tv_sec = cap->ca_mtime; + ((struct user_timespec *)attrbufptr)->tv_nsec = 0; + ++((struct user_timespec *)attrbufptr); + } + else { + ((struct timespec *)attrbufptr)->tv_sec = cap->ca_mtime; + ((struct timespec *)attrbufptr)->tv_nsec = 0; + ++((struct timespec *)attrbufptr); + } } if (ATTR_CMN_CHGTIME & attr) { - ((struct timespec *)attrbufptr)->tv_sec = cap->ca_ctime; - ((struct timespec *)attrbufptr)->tv_nsec = 0; - ++((struct timespec *)attrbufptr); + if (is_64_bit) { + ((struct user_timespec *)attrbufptr)->tv_sec = cap->ca_ctime; + ((struct user_timespec *)attrbufptr)->tv_nsec = 0; + ++((struct user_timespec *)attrbufptr); + } + else { + ((struct timespec *)attrbufptr)->tv_sec = cap->ca_ctime; + ((struct timespec *)attrbufptr)->tv_nsec = 0; + ++((struct timespec *)attrbufptr); + } } if (ATTR_CMN_ACCTIME & attr) { - ((struct timespec *)attrbufptr)->tv_sec = cap->ca_atime; - ((struct timespec *)attrbufptr)->tv_nsec = 0; - ++((struct timespec *)attrbufptr); + if (is_64_bit) { + ((struct user_timespec *)attrbufptr)->tv_sec = cap->ca_atime; + ((struct user_timespec *)attrbufptr)->tv_nsec = 0; + ++((struct user_timespec *)attrbufptr); + } + else { + ((struct timespec *)attrbufptr)->tv_sec = cap->ca_atime; + ((struct timespec *)attrbufptr)->tv_nsec = 0; + ++((struct timespec *)attrbufptr); + } } if (ATTR_CMN_BKUPTIME & attr) { - ((struct timespec *)attrbufptr)->tv_sec = cap->ca_btime; - ((struct timespec *)attrbufptr)->tv_nsec = 0; - ++((struct timespec *)attrbufptr); + if (is_64_bit) { + ((struct user_timespec *)attrbufptr)->tv_sec = cap->ca_btime; + ((struct user_timespec *)attrbufptr)->tv_nsec = 0; + ++((struct user_timespec *)attrbufptr); + } + else { + ((struct timespec *)attrbufptr)->tv_sec = cap->ca_btime; + ((struct timespec *)attrbufptr)->tv_nsec = 0; + ++((struct timespec *)attrbufptr); + } } if (ATTR_CMN_FNDRINFO & attr) { bcopy(&cap->ca_finderinfo, attrbufptr, sizeof(u_int8_t) * 32); @@ -1441,7 +1551,7 @@ packcommonattr( } if (ATTR_CMN_OWNERID & attr) { *((uid_t *)attrbufptr)++ = - (cap->ca_uid == UNKNOWNUID) ? p->p_ucred->cr_uid : cap->ca_uid; + (cap->ca_uid == UNKNOWNUID) ? kauth_cred_getuid(proc_ucred(p)) : cap->ca_uid; } if (ATTR_CMN_GRPID & attr) { *((gid_t *)attrbufptr)++ = cap->ca_gid; @@ -1454,11 +1564,11 @@ packcommonattr( * a security hole where set-user-id programs run as whoever is * logged on (or root if nobody is logged in yet!) 
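 * For example, a 04755 set-uid binary whose on-disk owner is
 * UNKNOWNUID is reported below with an access mask of 0755.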
*/ - *((u_long *)attrbufptr)++ = + *((uint32_t *)attrbufptr)++ = (cap->ca_uid == UNKNOWNUID) ? cap->ca_mode & ~(S_ISUID | S_ISGID) : cap->ca_mode; } if (ATTR_CMN_NAMEDATTRCOUNT & attr) { - *((u_long *)attrbufptr)++ = 0; + *((uint32_t *)attrbufptr)++ = 0; } if (ATTR_CMN_NAMEDATTRLIST & attr) { attrlength = 0; @@ -1472,12 +1582,12 @@ packcommonattr( ++((struct attrreference *)attrbufptr); } if (ATTR_CMN_FLAGS & attr) { - *((u_long *)attrbufptr)++ = cap->ca_flags; + *((uint32_t *)attrbufptr)++ = cap->ca_flags; } if (ATTR_CMN_USERACCESS & attr) { - *((u_long *)attrbufptr)++ = + *((uint32_t *)attrbufptr)++ = DerivePermissionSummary(cap->ca_uid, cap->ca_gid, - cap->ca_mode, mp, current_proc()->p_ucred, + cap->ca_mode, mp, proc_ucred(current_proc()), current_proc()); } @@ -1491,31 +1601,30 @@ packdirattr( struct hfsmount *hfsmp, struct vnode *vp, struct cat_desc * descp, - struct cat_attr * cattrp, - struct proc *p) + struct cat_attr * cattrp) { attrgroup_t attr = abp->ab_attrlist->dirattr; void *attrbufptr = *abp->ab_attrbufpp; if (ATTR_DIR_LINKCOUNT & attr) - *((u_long *)attrbufptr)++ = cattrp->ca_nlink; + *((uint32_t *)attrbufptr)++ = cattrp->ca_nlink; if (ATTR_DIR_ENTRYCOUNT & attr) { - u_long entries = cattrp->ca_entries; + uint32_t entries = cattrp->ca_entries; - if (descp->cd_parentcnid == kRootParID) { + if (descp->cd_parentcnid == kHFSRootParentID) { if (hfsmp->hfs_privdir_desc.cd_cnid != 0) --entries; /* hide private dir */ if (hfsmp->jnl) entries -= 2; /* hide the journal files */ } - *((u_long *)attrbufptr)++ = entries; + *((uint32_t *)attrbufptr)++ = entries; } if (ATTR_DIR_MOUNTSTATUS & attr) { - if (vp != NULL && vp->v_mountedhere != NULL) - *((u_long *)attrbufptr)++ = DIR_MNTSTATUS_MNTPOINT; + if (vp != NULL && vnode_mountedhere(vp) != NULL) + *((uint32_t *)attrbufptr)++ = DIR_MNTSTATUS_MNTPOINT; else - *((u_long *)attrbufptr)++ = 0; + *((uint32_t *)attrbufptr)++ = 0; } *abp->ab_attrbufpp = attrbufptr; } @@ -1526,19 +1635,18 @@ packfileattr( struct hfsmount *hfsmp, struct cat_attr *cattrp, struct cat_fork *datafork, - struct cat_fork *rsrcfork, - struct proc *p) + struct cat_fork *rsrcfork) { attrgroup_t attr = abp->ab_attrlist->fileattr; void *attrbufptr = *abp->ab_attrbufpp; void *varbufptr = *abp->ab_varbufpp; - u_long attrlength; - u_long allocblksize; + uint32_t attrlength; + uint32_t allocblksize; allocblksize = HFSTOVCB(hfsmp)->blockSize; if (ATTR_FILE_LINKCOUNT & attr) { - *((u_long *)attrbufptr)++ = cattrp->ca_nlink; + *((uint32_t *)attrbufptr)++ = cattrp->ca_nlink; } if (ATTR_FILE_TOTALSIZE & attr) { *((off_t *)attrbufptr)++ = datafork->cf_size + rsrcfork->cf_size; @@ -1548,22 +1656,22 @@ packfileattr( (off_t)cattrp->ca_blocks * (off_t)allocblksize; } if (ATTR_FILE_IOBLOCKSIZE & attr) { - *((u_long *)attrbufptr)++ = hfsmp->hfs_logBlockSize; + *((uint32_t *)attrbufptr)++ = hfsmp->hfs_logBlockSize; } if (ATTR_FILE_CLUMPSIZE & attr) { - *((u_long *)attrbufptr)++ = HFSTOVCB(hfsmp)->vcbClpSiz; + *((uint32_t *)attrbufptr)++ = HFSTOVCB(hfsmp)->vcbClpSiz; } if (ATTR_FILE_DEVTYPE & attr) { if (S_ISBLK(cattrp->ca_mode) || S_ISCHR(cattrp->ca_mode)) - *((u_long *)attrbufptr)++ = (u_long)cattrp->ca_rdev; + *((uint32_t *)attrbufptr)++ = (uint32_t)cattrp->ca_rdev; else - *((u_long *)attrbufptr)++ = 0; + *((uint32_t *)attrbufptr)++ = 0; } if (ATTR_FILE_FILETYPE & attr) { - *((u_long *)attrbufptr)++ = 0; + *((uint32_t *)attrbufptr)++ = 0; } if (ATTR_FILE_FORKCOUNT & attr) { - *((u_long *)attrbufptr)++ = 2; + *((uint32_t *)attrbufptr)++ = 2; } if (ATTR_FILE_FORKLIST & attr) { attrlength = 
0; @@ -1602,16 +1710,21 @@ packfileattr( *abp->ab_varbufpp = varbufptr; } - -static void +#if 0 +static int unpackattrblk(struct attrblock *abp, struct vnode *vp) { struct attrlist *attrlistp = abp->ab_attrlist; + int error; - if (attrlistp->volattr) - unpackvolattr(abp, VTOHFS(vp), vp); - else if (attrlistp->commonattr) + if (attrlistp->volattr) { + error = unpackvolattr(abp, VTOHFS(vp), vp); + if (error) + return (error); + } else if (attrlistp->commonattr) { unpackcommonattr(abp, vp); + } + return (0); } @@ -1623,30 +1736,36 @@ unpackcommonattr( attrgroup_t attr = abp->ab_attrlist->commonattr; void *attrbufptr = *abp->ab_attrbufpp; struct cnode *cp = VTOC(vp); + boolean_t is_64_bit = proc_is64bit(current_proc()); if (ATTR_CMN_SCRIPT & attr) { cp->c_encoding = (u_int32_t)*((text_encoding_t *)attrbufptr)++; hfs_setencodingbits(VTOHFS(vp), cp->c_encoding); } if (ATTR_CMN_CRTIME & attr) { - cp->c_itime = ((struct timespec *)attrbufptr)->tv_sec; - ++((struct timespec *)attrbufptr); + if (is_64_bit) { + cp->c_itime = ((struct user_timespec *)attrbufptr)->tv_sec; + ++((struct user_timespec *)attrbufptr); + } + else { + cp->c_itime = ((struct timespec *)attrbufptr)->tv_sec; + ++((struct timespec *)attrbufptr); + } } if (ATTR_CMN_MODTIME & attr) { cp->c_mtime = ((struct timespec *)attrbufptr)->tv_sec; - cp->c_mtime_nsec = ((struct timespec *)attrbufptr)->tv_nsec; ++((struct timespec *)attrbufptr); - cp->c_flag &= ~C_UPDATE; + cp->c_touch_modtime = FALSE; } if (ATTR_CMN_CHGTIME & attr) { cp->c_ctime = ((struct timespec *)attrbufptr)->tv_sec; ++((struct timespec *)attrbufptr); - cp->c_flag &= ~C_CHANGE; + cp->c_touch_chgtime = FALSE; } if (ATTR_CMN_ACCTIME & attr) { cp->c_atime = ((struct timespec *)attrbufptr)->tv_sec; ++((struct timespec *)attrbufptr); - cp->c_flag &= ~C_ACCESS; + cp->c_touch_acctime = FALSE; } if (ATTR_CMN_BKUPTIME & attr) { cp->c_btime = ((struct timespec *)attrbufptr)->tv_sec; @@ -1674,7 +1793,7 @@ unpackcommonattr( } } if (ATTR_CMN_ACCESSMASK & attr) { - u_int16_t mode = (u_int16_t)*((u_long *)attrbufptr)++; + u_int16_t mode = (u_int16_t)*((uint32_t *)attrbufptr)++; if (VTOVCB(vp)->vcbSigWord == kHFSPlusSigWord) { if (mode != (mode_t)VNOVAL) { cp->c_mode &= ~ALLPERMS; @@ -1683,7 +1802,7 @@ unpackcommonattr( } } if (ATTR_CMN_FLAGS & attr) { - u_long flags = *((u_long *)attrbufptr)++; + uint32_t flags = *((uint32_t *)attrbufptr)++; /* * Flags are settable only on HFS+ volumes. 
A special * exception is made for the IMMUTABLE flags @@ -1693,7 +1812,7 @@ unpackcommonattr( if ((VTOVCB(vp)->vcbSigWord == kHFSPlusSigWord) || ((VTOVCB(vp)->vcbSigWord == kHFSSigWord) && ((flags & ~IMMUTABLE) == 0))) { - if (flags != (u_long)VNOVAL) { + if (flags != (uint32_t)VNOVAL) { cp->c_flags = flags; } } @@ -1702,47 +1821,56 @@ unpackcommonattr( } -static void +static int unpackvolattr( struct attrblock *abp, struct hfsmount *hfsmp, - struct vnode *rootvp) + struct vnode *root_vp) { void *attrbufptr = *abp->ab_attrbufpp; - ExtendedVCB *vcb = HFSTOVCB(hfsmp); attrgroup_t attr; + int error = 0; + boolean_t is_64_bit = proc_is64bit(current_proc()); + + HFS_MOUNT_LOCK(hfsmp, TRUE); attr = abp->ab_attrlist->commonattr; if (attr == 0) goto volattr; if (ATTR_CMN_SCRIPT & attr) { - vcb->volumeNameEncodingHint = + hfsmp->volumeNameEncodingHint = (u_int32_t)*(((text_encoding_t *)attrbufptr)++); } if (ATTR_CMN_CRTIME & attr) { - vcb->vcbCrDate = ((struct timespec *)attrbufptr)->tv_sec; - ++((struct timespec *)attrbufptr); + if (is_64_bit) { + hfsmp->vcbCrDate = ((struct user_timespec *)attrbufptr)->tv_sec; + ++((struct user_timespec *)attrbufptr); + } + else { + hfsmp->vcbCrDate = ((struct timespec *)attrbufptr)->tv_sec; + ++((struct timespec *)attrbufptr); + } /* The volume's create date comes from the root directory */ - VTOC(rootvp)->c_itime = vcb->vcbCrDate; - VTOC(rootvp)->c_flag |= C_MODIFIED; + VTOC(root_vp)->c_itime = hfsmp->vcbCrDate; + VTOC(root_vp)->c_flag |= C_MODIFIED; /* * XXX Should we also do a relative change to the * the volume header's create date in local time? */ } if (ATTR_CMN_MODTIME & attr) { - vcb->vcbLsMod = ((struct timespec *)attrbufptr)->tv_sec; + hfsmp->vcbLsMod = ((struct timespec *)attrbufptr)->tv_sec; ++((struct timespec *)attrbufptr); } if (ATTR_CMN_BKUPTIME & attr) { - vcb->vcbVolBkUp = ((struct timespec *)attrbufptr)->tv_sec; + hfsmp->vcbVolBkUp = ((struct timespec *)attrbufptr)->tv_sec; ++((struct timespec *)attrbufptr); } if (ATTR_CMN_FNDRINFO & attr) { - bcopy(attrbufptr, &vcb->vcbFndrInfo, sizeof(vcb->vcbFndrInfo)); - (char *)attrbufptr += sizeof(vcb->vcbFndrInfo); + bcopy(attrbufptr, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo)); + (char *)attrbufptr += sizeof(hfsmp->vcbFndrInfo); } volattr: @@ -1752,14 +1880,22 @@ volattr: * It could be empty or garbage (bad UTF-8). */ if (ATTR_VOL_NAME & attr) { - copystr(((char *)attrbufptr) + *((u_long *)attrbufptr), - vcb->vcbVN, sizeof(vcb->vcbVN), NULL); - (char *)attrbufptr += sizeof(struct attrreference); + attrreference_t * attr_refp = (attrreference_t *) attrbufptr; + + error = copystr(((char *)attrbufptr) + attr_refp->attr_dataoffset, + hfsmp->vcbVN, MIN(attr_refp->attr_length, sizeof(hfsmp->vcbVN)), + NULL); + if (error == 0) + (char *)attrbufptr += sizeof(struct attrreference); } *abp->ab_attrbufpp = attrbufptr; - vcb->vcbFlags |= 0xFF00; + hfsmp->vcbFlags |= 0xFF00; + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + + return (error); } +#endif /* * Calculate the total size of an attribute block. 
@@ -1770,7 +1906,14 @@ hfs_attrblksize(struct attrlist *attrlist) { int size; attrgroup_t a; + int sizeof_timespec; + boolean_t is_64_bit = proc_is64bit(current_proc()); + if (is_64_bit) + sizeof_timespec = sizeof(struct user_timespec); + else + sizeof_timespec = sizeof(struct timespec); + #if ((ATTR_CMN_NAME | ATTR_CMN_DEVID | ATTR_CMN_FSID | ATTR_CMN_OBJTYPE | \ ATTR_CMN_OBJTAG | ATTR_CMN_OBJID | ATTR_CMN_OBJPERMANENTID | \ ATTR_CMN_PAROBJID | ATTR_CMN_SCRIPT | ATTR_CMN_CRTIME | \ @@ -1828,55 +1971,55 @@ hfs_attrblksize(struct attrlist *attrlist) if (a & ATTR_CMN_OBJPERMANENTID) size += sizeof(fsobj_id_t); if (a & ATTR_CMN_PAROBJID) size += sizeof(fsobj_id_t); if (a & ATTR_CMN_SCRIPT) size += sizeof(text_encoding_t); - if (a & ATTR_CMN_CRTIME) size += sizeof(struct timespec); - if (a & ATTR_CMN_MODTIME) size += sizeof(struct timespec); - if (a & ATTR_CMN_CHGTIME) size += sizeof(struct timespec); - if (a & ATTR_CMN_ACCTIME) size += sizeof(struct timespec); - if (a & ATTR_CMN_BKUPTIME) size += sizeof(struct timespec); + if (a & ATTR_CMN_CRTIME) size += sizeof_timespec; + if (a & ATTR_CMN_MODTIME) size += sizeof_timespec; + if (a & ATTR_CMN_CHGTIME) size += sizeof_timespec; + if (a & ATTR_CMN_ACCTIME) size += sizeof_timespec; + if (a & ATTR_CMN_BKUPTIME) size += sizeof_timespec; if (a & ATTR_CMN_FNDRINFO) size += 32 * sizeof(u_int8_t); if (a & ATTR_CMN_OWNERID) size += sizeof(uid_t); if (a & ATTR_CMN_GRPID) size += sizeof(gid_t); - if (a & ATTR_CMN_ACCESSMASK) size += sizeof(u_long); - if (a & ATTR_CMN_NAMEDATTRCOUNT) size += sizeof(u_long); + if (a & ATTR_CMN_ACCESSMASK) size += sizeof(uint32_t); + if (a & ATTR_CMN_NAMEDATTRCOUNT) size += sizeof(uint32_t); if (a & ATTR_CMN_NAMEDATTRLIST) size += sizeof(struct attrreference); - if (a & ATTR_CMN_FLAGS) size += sizeof(u_long); - if (a & ATTR_CMN_USERACCESS) size += sizeof(u_long); + if (a & ATTR_CMN_FLAGS) size += sizeof(uint32_t); + if (a & ATTR_CMN_USERACCESS) size += sizeof(uint32_t); }; if ((a = attrlist->volattr) != 0) { - if (a & ATTR_VOL_FSTYPE) size += sizeof(u_long); - if (a & ATTR_VOL_SIGNATURE) size += sizeof(u_long); + if (a & ATTR_VOL_FSTYPE) size += sizeof(uint32_t); + if (a & ATTR_VOL_SIGNATURE) size += sizeof(uint32_t); if (a & ATTR_VOL_SIZE) size += sizeof(off_t); if (a & ATTR_VOL_SPACEFREE) size += sizeof(off_t); if (a & ATTR_VOL_SPACEAVAIL) size += sizeof(off_t); if (a & ATTR_VOL_MINALLOCATION) size += sizeof(off_t); if (a & ATTR_VOL_ALLOCATIONCLUMP) size += sizeof(off_t); - if (a & ATTR_VOL_IOBLOCKSIZE) size += sizeof(u_long); - if (a & ATTR_VOL_OBJCOUNT) size += sizeof(u_long); - if (a & ATTR_VOL_FILECOUNT) size += sizeof(u_long); - if (a & ATTR_VOL_DIRCOUNT) size += sizeof(u_long); - if (a & ATTR_VOL_MAXOBJCOUNT) size += sizeof(u_long); + if (a & ATTR_VOL_IOBLOCKSIZE) size += sizeof(uint32_t); + if (a & ATTR_VOL_OBJCOUNT) size += sizeof(uint32_t); + if (a & ATTR_VOL_FILECOUNT) size += sizeof(uint32_t); + if (a & ATTR_VOL_DIRCOUNT) size += sizeof(uint32_t); + if (a & ATTR_VOL_MAXOBJCOUNT) size += sizeof(uint32_t); if (a & ATTR_VOL_MOUNTPOINT) size += sizeof(struct attrreference); if (a & ATTR_VOL_NAME) size += sizeof(struct attrreference); - if (a & ATTR_VOL_MOUNTFLAGS) size += sizeof(u_long); + if (a & ATTR_VOL_MOUNTFLAGS) size += sizeof(uint32_t); if (a & ATTR_VOL_MOUNTEDDEVICE) size += sizeof(struct attrreference); if (a & ATTR_VOL_ENCODINGSUSED) size += sizeof(unsigned long long); if (a & ATTR_VOL_CAPABILITIES) size += sizeof(vol_capabilities_attr_t); if (a & ATTR_VOL_ATTRIBUTES) size += sizeof(vol_attributes_attr_t); 
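	/*
	 * Worked example of the sizing above, assuming the usual 32-bit
	 * kernel ABI (4-byte uint32_t, 8-byte struct attrreference and
	 * struct timespec): a 32-bit caller asking for ATTR_CMN_NAME |
	 * ATTR_CMN_MODTIME | ATTR_CMN_FLAGS accumulates 8 + 8 + 4 = 20
	 * bytes here; hfs_vnop_readdirattr() then adds sizeof(uint32_t)
	 * for the leading length word (fixedblocksize = 24) plus
	 * kHFSPlusMaxFileNameBytes + 1 bytes of variable space for the
	 * name data.  A 64-bit caller is charged sizeof(struct
	 * user_timespec) instead, so each requested time attribute costs
	 * 16 bytes rather than 8.
	 */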
}; if ((a = attrlist->dirattr) != 0) { - if (a & ATTR_DIR_LINKCOUNT) size += sizeof(u_long); - if (a & ATTR_DIR_ENTRYCOUNT) size += sizeof(u_long); - if (a & ATTR_DIR_MOUNTSTATUS) size += sizeof(u_long); + if (a & ATTR_DIR_LINKCOUNT) size += sizeof(uint32_t); + if (a & ATTR_DIR_ENTRYCOUNT) size += sizeof(uint32_t); + if (a & ATTR_DIR_MOUNTSTATUS) size += sizeof(uint32_t); }; if ((a = attrlist->fileattr) != 0) { - if (a & ATTR_FILE_LINKCOUNT) size += sizeof(u_long); + if (a & ATTR_FILE_LINKCOUNT) size += sizeof(uint32_t); if (a & ATTR_FILE_TOTALSIZE) size += sizeof(off_t); if (a & ATTR_FILE_ALLOCSIZE) size += sizeof(off_t); - if (a & ATTR_FILE_IOBLOCKSIZE) size += sizeof(size_t); - if (a & ATTR_FILE_CLUMPSIZE) size += sizeof(off_t); - if (a & ATTR_FILE_DEVTYPE) size += sizeof(u_long); - if (a & ATTR_FILE_FILETYPE) size += sizeof(u_long); - if (a & ATTR_FILE_FORKCOUNT) size += sizeof(u_long); + if (a & ATTR_FILE_IOBLOCKSIZE) size += sizeof(uint32_t); + if (a & ATTR_FILE_CLUMPSIZE) size += sizeof(uint32_t); + if (a & ATTR_FILE_DEVTYPE) size += sizeof(uint32_t); + if (a & ATTR_FILE_FILETYPE) size += sizeof(uint32_t); + if (a & ATTR_FILE_FORKCOUNT) size += sizeof(uint32_t); if (a & ATTR_FILE_FORKLIST) size += sizeof(struct attrreference); if (a & ATTR_FILE_DATALENGTH) size += sizeof(off_t); if (a & ATTR_FILE_DATAALLOCSIZE) size += sizeof(off_t); @@ -1897,17 +2040,15 @@ hfs_attrblksize(struct attrlist *attrlist) __private_extern__ unsigned long DerivePermissionSummary(uid_t obj_uid, gid_t obj_gid, mode_t obj_mode, - struct mount *mp, struct ucred *cred, struct proc *p) + struct mount *mp, kauth_cred_t cred, struct proc *p) { - register gid_t *gp; unsigned long permissions; - int i; if (obj_uid == UNKNOWNUID) - obj_uid = p->p_ucred->cr_uid; + obj_uid = kauth_cred_getuid(proc_ucred(p)); /* User id 0 (root) always gets access. */ - if (cred->cr_uid == 0) { + if (!suser(cred, NULL)) { permissions = R_OK | W_OK | X_OK; goto Exit; }; @@ -1919,12 +2060,12 @@ DerivePermissionSummary(uid_t obj_uid, gid_t obj_gid, mode_t obj_mode, } /* Otherwise, check the groups. */ - if (! (mp->mnt_flag & MNT_UNKNOWNPERMISSIONS)) { - for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) { - if (obj_gid == *gp) { - permissions = ((unsigned long)obj_mode & S_IRWXG) >> 3; - goto Exit; - } + if (! (((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS)) { + int is_member; + + if (kauth_cred_ismember_gid(cred, obj_gid, &is_member) == 0 && is_member) { + permissions = ((unsigned long)obj_mode & S_IRWXG) >> 3; + goto Exit; } } diff --git a/bsd/hfs/hfs_btreeio.c b/bsd/hfs/hfs_btreeio.c index 3791361fa..688983419 100644 --- a/bsd/hfs/hfs_btreeio.c +++ b/bsd/hfs/hfs_btreeio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -24,6 +24,7 @@ #include <sys/systm.h> #include <sys/buf.h> #include <sys/kernel.h> +#include <sys/malloc.h> #include <sys/mount.h> #include <sys/vnode.h> @@ -65,23 +66,26 @@ OSStatus GetBTreeBlock(FileReference vp, UInt32 blockNum, GetBlockOptions option OSStatus retval = E_NONE; struct buf *bp = NULL; - if (options & kGetEmptyBlock) - bp = getblk(vp, blockNum, block->blockSize, 0, 0, BLK_META); - else - retval = meta_bread(vp, blockNum, block->blockSize, NOCRED, &bp); - - DBG_ASSERT(bp != NULL); - DBG_ASSERT(bp->b_data != NULL); - DBG_ASSERT(bp->b_bcount == block->blockSize); - DBG_ASSERT(bp->b_lblkno == blockNum); + if (options & kGetEmptyBlock) { + daddr64_t blkno; + off_t offset; + offset = (daddr64_t)blockNum * (daddr64_t)block->blockSize; + bp = buf_getblk(vp, (daddr64_t)blockNum, block->blockSize, 0, 0, BLK_META); + if (bp && + VNOP_BLOCKMAP(vp, offset, block->blockSize, &blkno, NULL, NULL, 0, NULL) == 0) { + buf_setblkno(bp, blkno); + } + } else { + retval = buf_meta_bread(vp, (daddr64_t)blockNum, block->blockSize, NOCRED, &bp); + } if (bp == NULL) retval = -1; //XXX need better error if (retval == E_NONE) { block->blockHeader = bp; - block->buffer = bp->b_data; - block->blockReadFromDisk = (bp->b_flags & B_CACHE) == 0; /* not found in cache ==> came from disk */ + block->buffer = (char *)buf_dataptr(bp); + block->blockReadFromDisk = (buf_fromcache(bp) == 0); /* not found in cache ==> came from disk */ // XXXdbg block->isModified = 0; @@ -91,8 +95,8 @@ OSStatus GetBTreeBlock(FileReference vp, UInt32 blockNum, GetBlockOptions option if (!(options & kGetEmptyBlock)) { /* This happens when we first open the b-tree, we might not have all the node data on hand */ if ((((BTNodeDescriptor *)block->buffer)->kind == kBTHeaderNode) && - (((BTHeaderRec *)((char *)block->buffer + 14))->nodeSize != bp->b_bcount) && - (SWAP_BE16 (((BTHeaderRec *)((char *)block->buffer + 14))->nodeSize) != bp->b_bcount)) { + (((BTHeaderRec *)((char *)block->buffer + 14))->nodeSize != buf_count(bp)) && + (SWAP_BE16 (((BTHeaderRec *)((char *)block->buffer + 14))->nodeSize) != buf_count(bp))) { /* Don't swap the descriptors at all, we don't care (this block will be invalidated) */ SWAP_BT_NODE (block, ISHFSPLUS(VTOVCB(vp)), VTOC(vp)->c_fileid, 3); @@ -110,7 +114,7 @@ OSStatus GetBTreeBlock(FileReference vp, UInt32 blockNum, GetBlockOptions option #endif } else { if (bp) - brelse(bp); + buf_brelse(bp); block->blockHeader = NULL; block->buffer = NULL; } @@ -143,15 +147,15 @@ static int btree_journal_modify_block_end(struct hfsmount *hfsmp, struct buf *bp) { #if BYTE_ORDER == LITTLE_ENDIAN - struct vnode *vp = bp->b_vp; + struct vnode *vp = buf_vnode(bp); BlockDescriptor block; /* Prepare the block pointer */ block.blockHeader = bp; - block.buffer = bp->b_data; + block.buffer = (char *)buf_dataptr(bp); /* not found in cache ==> came from disk */ - block.blockReadFromDisk = (bp->b_flags & B_CACHE) == 0; - block.blockSize = bp->b_bcount; + block.blockReadFromDisk = (buf_fromcache(bp) == 0); + block.blockSize = buf_count(bp); // XXXdbg have to swap the data before it goes in the journal SWAP_BT_NODE (&block, ISHFSPLUS (VTOVCB(vp)), VTOC(vp)->c_fileid, 1); @@ -177,11 +181,12 @@ OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlock } if (options & kTrashBlock) { - bp->b_flags |= B_INVAL; - if (hfsmp->jnl && (bp->b_flags & B_LOCKED)) { + buf_markinvalid(bp); + + if (hfsmp->jnl && (buf_flags(bp) & B_LOCKED)) { journal_kill_block(hfsmp->jnl, bp); } else { - 
brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */ + buf_brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */ } } else { if (options & kForceWriteBlock) { @@ -193,27 +198,29 @@ OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlock retval = btree_journal_modify_block_end(hfsmp, bp); blockPtr->isModified = 0; } else { - retval = VOP_BWRITE(bp); + retval = VNOP_BWRITE(bp); } } else if (options & kMarkBlockDirty) { + struct timeval tv; + microuptime(&tv); if ((options & kLockTransaction) && hfsmp->jnl == NULL) { /* * - * Set the B_LOCKED flag and unlock the buffer, causing brelse to move + * Set the B_LOCKED flag and unlock the buffer, causing buf_brelse to move * the buffer onto the LOCKED free list. This is necessary, otherwise - * getnewbuf() would try to reclaim the buffers using bawrite, which + * getnewbuf() would try to reclaim the buffers using buf_bawrite, which * isn't going to work. * */ - extern int count_lock_queue __P((void)); + extern int count_lock_queue(void); + /* Don't hog all the buffers... */ if (count_lock_queue() > kMaxLockedMetaBuffers) { hfs_btsync(vp, HFS_SYNCTRANS); /* Rollback sync time to cause a sync on lock release... */ - (void) BTSetLastSync(VTOF(vp), time.tv_sec - (kMaxSecsForFsync + 1)); + (void) BTSetLastSync(VTOF(vp), tv.tv_sec - (kMaxSecsForFsync + 1)); } - - bp->b_flags |= B_LOCKED; + buf_setflags(bp, B_LOCKED); } /* @@ -230,13 +237,14 @@ OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlock } else if (bdwrite_internal(bp, 1) != 0) { hfs_btsync(vp, 0); /* Rollback sync time to cause a sync on lock release... */ - (void) BTSetLastSync(VTOF(vp), time.tv_sec - (kMaxSecsForFsync + 1)); - bp->b_flags &= ~B_LOCKED; - bawrite(bp); + (void) BTSetLastSync(VTOF(vp), tv.tv_sec - (kMaxSecsForFsync + 1)); + + buf_clearflags(bp, B_LOCKED); + buf_bawrite(bp); } } else { // check if we had previously called journal_modify_block_start() - // on this block and if so, abort it (which will call brelse()). + // on this block and if so, abort it (which will call buf_brelse()). if (hfsmp->jnl && blockPtr->isModified) { // XXXdbg - I don't want to call modify_block_abort() // because I think it may be screwing up the @@ -248,7 +256,7 @@ OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlock btree_journal_modify_block_end(hfsmp, bp); blockPtr->isModified = 0; } else { - brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */ + buf_brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */ } }; }; @@ -263,7 +271,7 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) { #pragma unused (maxEOF) - OSStatus retval, ret; + OSStatus retval = 0, ret = 0; UInt64 actualBytesAdded, origSize; UInt64 bytesToAdd; u_int32_t startAllocation; @@ -272,7 +280,8 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) ExtendedVCB *vcb; FCB *filePtr; struct proc *p = NULL; - UInt64 trim = 0; + UInt64 trim = 0; + int lockflags = 0; filePtr = GetFileControlBlock(vp); @@ -295,17 +304,14 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) * return an error if an attempt is made to extend the Extents B-tree * when the resident extents are exhausted. 
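 * The allocation bitmap lock is therefore always taken below, and the
 * extents overflow lock is added only when some other B-tree is being
 * grown, so ExtendFileC() never runs with the volume bitmap
 * unprotected.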
*/ - /* XXX warning - this can leave the volume bitmap unprotected during ExtendFileC call */ - if(VTOC(vp)->c_fileid != kHFSExtentsFileID) - { - p = current_proc(); - /* lock extents b-tree (also protects volume bitmap) */ - retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, p); - if (retval) - return (retval); - } - (void) BTGetInformation(filePtr, 0, &btInfo); + /* Protect allocation bitmap and extents overflow file. */ + lockflags = SFL_BITMAP; + if (VTOC(vp)->c_fileid != kHFSExtentsFileID) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(vcb, lockflags, HFS_EXCLUSIVE_LOCK); + + (void) BTGetInformation(filePtr, 0, &btInfo); #if 0 // XXXdbg /* @@ -350,7 +356,7 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) } } } while (retval == dskFulErr && actualBytesAdded == 0); - + /* * If a new extent was added then move the roving allocator * reference forward by the current b-tree file size so @@ -411,7 +417,7 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) // XXXdbg - this probably doesn't need to be a panic() panic("hfs: error truncating btree files (sz 0x%llx, trim %lld, ret %d)\n", filePtr->fcbEOF, trim, ret); - return ret; + goto out; } actualBytesAdded -= trim; } @@ -421,10 +427,10 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) * Get any extents overflow b-tree changes to disk ASAP! */ (void) BTFlushPath(VTOF(vcb->extentsRefNum)); - (void) VOP_FSYNC(vcb->extentsRefNum, NOCRED, MNT_WAIT, p); - - (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, p); + (void) hfs_fsync(vcb->extentsRefNum, MNT_WAIT, 0, p); } + hfs_systemfile_unlock(vcb, lockflags); + lockflags = 0; if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) { panic("hfs: extendbtree: fcb 0x%x has eof 0x%llx not a multiple of 0x%x (trim %llx)\n", @@ -438,18 +444,22 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) (VTOC(vp)->c_fileid == kHFSCatalogFileID) || (VTOC(vp)->c_fileid == kHFSAttributesFileID) ) { + VTOC(vp)->c_flag |= C_MODIFIED; MarkVCBDirty( vcb ); ret = hfs_flushvolumeheader(VCBTOHFS(vcb), MNT_WAIT, HFS_ALTFLUSH); } else { - struct timeval tv = time; - - VTOC(vp)->c_flag |= C_CHANGE | C_UPDATE; - (void) VOP_UPDATE(vp, &tv, &tv, MNT_WAIT); + VTOC(vp)->c_touch_chgtime = TRUE; + VTOC(vp)->c_touch_modtime = TRUE; + (void) hfs_update(vp, TRUE); } ret = ClearBTNodes(vp, btInfo.nodeSize, filePtr->fcbEOF - actualBytesAdded, actualBytesAdded); - if (ret) - return (ret); +out: + if (retval == 0) + retval = ret; + + if (lockflags) + hfs_systemfile_unlock(vcb, lockflags); return retval; } @@ -463,14 +473,14 @@ ClearBTNodes(struct vnode *vp, long blksize, off_t offset, off_t amount) { struct hfsmount *hfsmp = VTOHFS(vp); struct buf *bp = NULL; - daddr_t blk; - daddr_t blkcnt; + daddr64_t blk; + daddr64_t blkcnt; blk = offset / blksize; blkcnt = amount / blksize; while (blkcnt > 0) { - bp = getblk(vp, blk, blksize, 0, 0, BLK_META); + bp = buf_getblk(vp, blk, blksize, 0, 0, BLK_META); if (bp == NULL) continue; @@ -480,9 +490,9 @@ ClearBTNodes(struct vnode *vp, long blksize, off_t offset, off_t amount) // become *way* too large //journal_modify_block_start(hfsmp->jnl, bp); } + bzero((char *)buf_dataptr(bp), blksize); - bzero((char *)bp->b_data, blksize); - bp->b_flags |= B_AGE; + buf_markaged(bp); // XXXdbg if (hfsmp->jnl) { @@ -493,15 +503,15 @@ ClearBTNodes(struct vnode *vp, long blksize, off_t offset, off_t amount) // XXXdbg - remove this once we decide what to do with the // 
writes to the journal if ((blk % 32) == 0) - VOP_BWRITE(bp); + VNOP_BWRITE(bp); else - bawrite(bp); + buf_bawrite(bp); } else { /* wait/yield every 32 blocks so we don't hog all the buffers */ if ((blk % 32) == 0) - VOP_BWRITE(bp); + VNOP_BWRITE(bp); else - bawrite(bp); + buf_bawrite(bp); } --blkcnt; ++blk; @@ -509,3 +519,166 @@ ClearBTNodes(struct vnode *vp, long blksize, off_t offset, off_t amount) return (0); } + + +extern char hfs_attrname[]; + +extern int hfs_attrkeycompare(HFSPlusAttrKey *searchKey, HFSPlusAttrKey *trialKey); + +int hfs_create_attr_btree(struct hfsmount *hfsmp, uint32_t nodesize, uint32_t nodecnt); + +/* + * Create an HFS+ Attribute B-tree File. + * + * A journal transaction must be already started. + */ +int +hfs_create_attr_btree(struct hfsmount *hfsmp, uint32_t nodesize, uint32_t nodecnt) +{ + struct vnode* vp = NULL; + struct cat_desc cndesc; + struct cat_attr cnattr; + struct cat_fork cfork; + BlockDescriptor blkdesc; + BTNodeDescriptor *ndp; + BTHeaderRec *bthp; + BTreeControlBlockPtr btcb = NULL; + struct buf *bp = NULL; + void * buffer; + u_int16_t *index; + u_int16_t offset; + int result; + + printf("Creating HFS+ Attribute B-tree File (%d nodes) on %s\n", nodecnt, hfsmp->vcbVN); + + /* + * Set up Attribute B-tree vnode + */ + bzero(&cndesc, sizeof(cndesc)); + cndesc.cd_parentcnid = kHFSRootParentID; + cndesc.cd_flags |= CD_ISMETA; + cndesc.cd_nameptr = hfs_attrname; + cndesc.cd_namelen = strlen(hfs_attrname); + cndesc.cd_cnid = kHFSAttributesFileID; + + bzero(&cnattr, sizeof(cnattr)); + cnattr.ca_nlink = 1; + cnattr.ca_mode = S_IFREG; + cnattr.ca_fileid = cndesc.cd_cnid; + + bzero(&cfork, sizeof(cfork)); + cfork.cf_clump = nodesize * nodecnt; + + result = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, &vp); + if (result) + return (result); + + /* + * Set up Attribute B-tree control block + */ + MALLOC(btcb, BTreeControlBlock *, sizeof(BTreeControlBlock), M_TEMP, M_WAITOK); + bzero(btcb, sizeof(BTreeControlBlock)); + + btcb->nodeSize = nodesize; + btcb->maxKeyLength = kHFSPlusAttrKeyMaximumLength; + btcb->btreeType = 0xFF; + btcb->attributes = kBTVariableIndexKeysMask | kBTBigKeysMask; + btcb->version = kBTreeVersion; + btcb->writeCount = 1; + btcb->flags = 0; /* kBTHeaderDirty */ + btcb->fileRefNum = vp; + btcb->getBlockProc = GetBTreeBlock; + btcb->releaseBlockProc = ReleaseBTreeBlock; + btcb->setEndOfForkProc = ExtendBTreeFile; + btcb->keyCompareProc = (KeyCompareProcPtr)hfs_attrkeycompare; + VTOF(vp)->fcbBTCBPtr = btcb; + + /* + * Allocate some space + */ + result = ExtendBTreeFile(vp, nodesize, cfork.cf_clump); + if (result) + goto exit; + + btcb->totalNodes = VTOF(vp)->ff_size / nodesize; + btcb->freeNodes = btcb->totalNodes - 1; + + /* + * Initialize the b-tree header on disk + */ + bp = buf_getblk(vp, 0, nodesize, 0, 0, BLK_META); + if (bp == NULL) { + result = EIO; + goto exit; + } + + buffer = (void *)buf_dataptr(bp); + blkdesc.buffer = buffer; + blkdesc.blockHeader = (void *)bp; + blkdesc.blockReadFromDisk = 0; + blkdesc.isModified = 0; + + ModifyBlockStart(vp, &blkdesc); + + if (buf_size(bp) != nodesize) + panic("hfs_create_attr_btree: bad buffer size (%d)\n", buf_size(bp)); + + bzero(buffer, nodesize); + index = (int16_t *)buffer; + + /* FILL IN THE NODE DESCRIPTOR: */ + ndp = (BTNodeDescriptor *)buffer; + ndp->kind = kBTHeaderNode; + ndp->numRecords = 3; + offset = sizeof(BTNodeDescriptor); + index[(nodesize / 2) - 1] = offset; + + /* FILL IN THE HEADER RECORD: */ + bthp = (BTHeaderRec *)((UInt8 *)buffer + offset); + 
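+ /*
+ * Layout sketch of the header node being assembled here: the three
+ * records (header, user data, map) are packed upward after the
+ * 14-byte node descriptor, while their offsets fill a u_int16_t
+ * index growing down from the end of the node;
+ * index[(nodesize / 2) - 1] holds the offset of the first record
+ * and index[(nodesize / 2) - 4] the offset of the free space that
+ * follows the map record.
+ */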
bthp->nodeSize = nodesize; + bthp->totalNodes = btcb->totalNodes; + bthp->freeNodes = btcb->freeNodes; + bthp->clumpSize = cfork.cf_clump; + bthp->btreeType = 0xFF; + bthp->attributes = kBTVariableIndexKeysMask | kBTBigKeysMask; + bthp->maxKeyLength = kHFSPlusAttrKeyMaximumLength; + bthp->keyCompareType = kHFSBinaryCompare; + offset += sizeof(BTHeaderRec); + index[(nodesize / 2) - 2] = offset; + + /* FILL IN THE USER RECORD: */ + offset += kBTreeHeaderUserBytes; + index[(nodesize / 2) - 3] = offset; + + /* FILL IN THE MAP RECORD (only one node in use). */ + *((u_int8_t *)buffer + offset) = 0x80; + offset += nodesize - sizeof(BTNodeDescriptor) - sizeof(BTHeaderRec) + - kBTreeHeaderUserBytes - (4 * sizeof(int16_t)); + index[(nodesize / 2) - 4] = offset; + + if (hfsmp->jnl) { + result = btree_journal_modify_block_end(hfsmp, bp); + } else { + result = VNOP_BWRITE(bp); + } + if (result) + goto exit; + + /* Publish new btree file */ + hfsmp->hfs_attribute_vp = vp; + (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); + +exit: + hfs_unlock(VTOC(vp)); + if (result) { + if (btcb) { + FREE (btcb, M_TEMP); + } + vnode_put(vp); + // hfs_truncate(); /* XXX need to give back blocks */ + } + return (result); +} + + + diff --git a/bsd/hfs/hfs_catalog.c b/bsd/hfs/hfs_catalog.c index 87a7e6bf0..64dd3c8fb 100644 --- a/bsd/hfs/hfs_catalog.c +++ b/bsd/hfs/hfs_catalog.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -26,7 +26,6 @@ #include <sys/stat.h> #include <sys/mount.h> #include <sys/vnode.h> -#include <sys/namei.h> #include <sys/dirent.h> #include <vfs/vfs_support.h> #include <libkern/libkern.h> @@ -39,10 +38,8 @@ #include "hfs_endian.h" #include "hfscommon/headers/BTreesInternal.h" -#include "hfscommon/headers/CatalogPrivate.h" #include "hfscommon/headers/HFSUnicodeWrappers.h" -extern OSErr PositionIterator(CatalogIterator *cip, UInt32 offset, BTreeIterator *bip, UInt16 *op); /* * Initialization of an FSBufferDescriptor structure. 
@@ -68,9 +65,26 @@ struct update_state { struct hfsmount * s_hfsmp; }; +struct position_state { + int error; + u_int32_t count; + u_int32_t index; + u_int32_t parentID; + struct hfsmount *hfsmp; +}; + +/* Map file mode type to directory entry types */ +u_char modetodirtype[16] = { + DT_REG, DT_FIFO, DT_CHR, DT_UNKNOWN, + DT_DIR, DT_UNKNOWN, DT_BLK, DT_UNKNOWN, + DT_REG, DT_UNKNOWN, DT_LNK, DT_UNKNOWN, + DT_SOCK, DT_UNKNOWN, DT_WHT, DT_UNKNOWN +}; +#define MODE_TO_DT(mode) (modetodirtype[((mode) & S_IFMT) >> 12]) + static int cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, u_long hint, int wantrsrc, - struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *forkp); + struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *forkp, cnid_t *desc_cnid); static int cat_lookupmangled(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, struct cat_desc *outdescp, struct cat_attr *attrp, struct cat_fork *forkp); @@ -84,7 +98,8 @@ extern int unicode_to_hfs(ExtendedVCB *vcb, ByteCount srcLen, /* Internal catalog support routines */ -int resolvelink(struct hfsmount *hfsmp, u_long linkref, struct HFSPlusCatalogFile *recp); +static int cat_findposition(const CatalogKey *ckp, const CatalogRecord *crp, + struct position_state *state); static int resolvelinkid(struct hfsmount *hfsmp, u_long linkref, ino_t *ino); @@ -97,7 +112,7 @@ static void buildthreadkey(HFSCatalogNodeID parentID, int std_hfs, CatalogKey *k static void buildrecord(struct cat_attr *attrp, cnid_t cnid, int std_hfs, u_int32_t encoding, CatalogRecord *crp, int *recordSize); -static int catrec_update(const CatalogKey *ckp, CatalogRecord *crp, u_int16_t reclen, struct update_state *state); +static int catrec_update(const CatalogKey *ckp, CatalogRecord *crp, struct update_state *state); static int builddesc(const HFSPlusCatalogKey *key, cnid_t cnid, u_long hint, u_long encoding, int isdir, struct cat_desc *descp); @@ -122,21 +137,18 @@ int cat_preflight(struct hfsmount *hfsmp, catops_t ops, cat_cookie_t *cookie, struct proc *p) { FCB *fcb; + int lockflags; int result; - fcb = GetFileControlBlock(HFSTOVCB(hfsmp)->catalogRefNum); + fcb = GetFileControlBlock(hfsmp->hfs_catalog_vp); - /* Lock catalog b-tree */ - result = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); - if (result) - return (result); + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); result = BTReserveSpace(fcb, ops, (void*)cookie); - /* Unlock catalog b-tree */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + hfs_systemfile_unlock(hfsmp, lockflags); - MacToVFSError(result); + return MacToVFSError(result); } __private_extern__ @@ -144,15 +156,15 @@ void cat_postflight(struct hfsmount *hfsmp, cat_cookie_t *cookie, struct proc *p) { FCB *fcb; - int error; + int lockflags; - fcb = GetFileControlBlock(HFSTOVCB(hfsmp)->catalogRefNum); + fcb = GetFileControlBlock(hfsmp->hfs_catalog_vp); + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); - error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); (void) BTReleaseReserve(fcb, (void*)cookie); - if (error == 0) { - hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); - } + + hfs_systemfile_unlock(hfsmp, lockflags); } @@ -261,7 +273,7 @@ cat_releasedesc(struct cat_desc *descp) descp->cd_nameptr = NULL; descp->cd_namelen = 0; descp->cd_flags &= ~CD_HASBUF; - remove_name(name); + vfs_removename(name); } descp->cd_nameptr = NULL; descp->cd_namelen = 0; @@ -279,7 +291,7 @@ __private_extern__ int 
cat_lookup(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, struct cat_desc *outdescp, struct cat_attr *attrp, - struct cat_fork *forkp) + struct cat_fork *forkp, cnid_t *desc_cnid) { CatalogKey * keyp; int std_hfs; @@ -293,11 +305,23 @@ cat_lookup(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, if (result) goto exit; - result = cat_lookupbykey(hfsmp, keyp, descp->cd_hint, wantrsrc, outdescp, attrp, forkp); + result = cat_lookupbykey(hfsmp, keyp, descp->cd_hint, wantrsrc, outdescp, attrp, forkp, desc_cnid); if (result == ENOENT) { if (!std_hfs) { + struct cat_desc temp_desc; + if (outdescp == NULL) { + bzero(&temp_desc, sizeof(temp_desc)); + outdescp = &temp_desc; + } result = cat_lookupmangled(hfsmp, descp, wantrsrc, outdescp, attrp, forkp); + if (desc_cnid) { + *desc_cnid = outdescp->cd_cnid; + } + if (outdescp == &temp_desc) { + /* Release the local copy of desc */ + cat_releasedesc(outdescp); + } } else if (hfsmp->hfs_encoding != kTextEncodingMacRoman) { // make MacRoman key from utf-8 // result = cat_lookupbykey(hfsmp, keyp, descp->cd_hint, attrp, forkp); @@ -366,6 +390,78 @@ exit: } +/* + * cat_findname - obtain a descriptor from cnid + * + * Only a thread lookup is performed. + */ +__private_extern__ +int +cat_findname(struct hfsmount *hfsmp, cnid_t cnid, struct cat_desc *outdescp) +{ + struct BTreeIterator * iterator; + FSBufferDescriptor btdata; + CatalogKey * keyp; + CatalogRecord * recp; + int isdir; + int result; + int std_hfs; + + isdir = 0; + std_hfs = (hfsmp->hfs_flags & HFS_STANDARD); + + MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); + buildthreadkey(cnid, std_hfs, (CatalogKey *)&iterator->key); + iterator->hint.nodeNum = 0; + + MALLOC(recp, CatalogRecord *, sizeof(CatalogRecord), M_TEMP, M_WAITOK); + BDINIT(btdata, recp); + + result = BTSearchRecord(VTOF(hfsmp->hfs_catalog_vp), iterator, &btdata, NULL, NULL); + if (result) + goto exit; + + /* Turn thread record into a cnode key (in place). 
*/ + switch (recp->recordType) { + case kHFSFolderThreadRecord: + isdir = 1; + /* fall through */ + case kHFSFileThreadRecord: + keyp = (CatalogKey *)((char *)&recp->hfsThread.reserved + 6); + keyp->hfs.keyLength = kHFSCatalogKeyMinimumLength + keyp->hfs.nodeName[0]; + break; + + case kHFSPlusFolderThreadRecord: + isdir = 1; + /* fall through */ + case kHFSPlusFileThreadRecord: + keyp = (CatalogKey *)&recp->hfsPlusThread.reserved; + keyp->hfsPlus.keyLength = kHFSPlusCatalogKeyMinimumLength + + (keyp->hfsPlus.nodeName.length * 2); + break; + default: + result = ENOENT; + goto exit; + } + if (std_hfs) { + HFSPlusCatalogKey * pluskey = NULL; + u_long encoding; + + MALLOC(pluskey, HFSPlusCatalogKey *, sizeof(HFSPlusCatalogKey), M_TEMP, M_WAITOK); + promotekey(hfsmp, &keyp->hfs, pluskey, &encoding); + builddesc(pluskey, cnid, 0, encoding, isdir, outdescp); + FREE(pluskey, M_TEMP); + + } else { + builddesc((HFSPlusCatalogKey *)keyp, cnid, 0, 0, isdir, outdescp); + } +exit: + FREE(recp, M_TEMP); + FREE(iterator, M_TEMP); + + return MacToVFSError(result); +} + /* * cat_idlookup - lookup a catalog node using a cnode id */ @@ -416,7 +512,7 @@ cat_idlookup(struct hfsmount *hfsmp, cnid_t cnid, struct cat_desc *outdescp, goto exit; } - result = cat_lookupbykey(hfsmp, keyp, 0, 0, outdescp, attrp, forkp); + result = cat_lookupbykey(hfsmp, keyp, 0, 0, outdescp, attrp, forkp, NULL); exit: FREE(recp, M_TEMP); FREE(iterator, M_TEMP); @@ -468,7 +564,7 @@ falsematch: */ static int cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, u_long hint, int wantrsrc, - struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *forkp) + struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *forkp, cnid_t *desc_cnid) { struct BTreeIterator * iterator; FSBufferDescriptor btdata; @@ -516,8 +612,8 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, u_long hint, int wantr && (recp->recordType == kHFSPlusFileRecord) && (SWAP_BE32(recp->hfsPlusFile.userInfo.fdType) == kHardLinkFileType) && (SWAP_BE32(recp->hfsPlusFile.userInfo.fdCreator) == kHFSPlusCreator) - && ((to_bsd_time(recp->hfsPlusFile.createDate) == HFSTOVCB(hfsmp)->vcbCrDate) || - (to_bsd_time(recp->hfsPlusFile.createDate) == hfsmp->hfs_metadata_createdate))) { + && ((to_bsd_time(recp->hfsPlusFile.createDate) == (time_t)HFSTOVCB(hfsmp)->vcbCrDate) || + (to_bsd_time(recp->hfsPlusFile.createDate) == (time_t)hfsmp->hfs_metadata_createdate))) { ilink = recp->hfsPlusFile.bsdInfo.special.iNodeNum; @@ -588,6 +684,10 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, u_long hint, int wantr FREE(pluskey, M_TEMP); } } + + if (desc_cnid != NULL) { + *desc_cnid = cnid; + } exit: FREE(iterator, M_TEMP); FREE(recp, M_TEMP); @@ -611,23 +711,46 @@ cat_create(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr u_int32_t nextCNID; u_int32_t datalen; int std_hfs; - int result; + int result = 0; u_long encoding; int modeformat; + int mntlock = 0; modeformat = attrp->ca_mode & S_IFMT; vcb = HFSTOVCB(hfsmp); fcb = GetFileControlBlock(vcb->catalogRefNum); - nextCNID = vcb->vcbNxtCNID; std_hfs = (vcb->vcbSigWord == kHFSSigWord); - if (std_hfs && nextCNID == 0xFFFFFFFF) - return (ENOSPC); + /* + * Atomically get the next CNID. If we have wrapped the CNIDs + * then keep the hfsmp lock held until we have found a CNID. 
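/*
 * The comment above summarizes the policy implemented next: the next
 * CNID comes out of the volume header under the mount lock, and once
 * the 32-bit ID space has wrapped, the lock is held across the whole
 * catalog insert so two creates cannot hand out the same recycled ID.
 * A condensed sketch of just the wrap rule (simplified: no locking and
 * no btExists retry; kHFSFirstUserCatalogNodeID is 16, the first ID
 * available to user files):
 */
static u_int32_t
cnid_next(u_int32_t *nextp, u_int32_t *vcbatrbp)
{
	u_int32_t cnid = *nextp;

	if (cnid == 0xFFFFFFFF) {
		/* Wrapped: restart at the first user ID and remember
		 * that CNIDs are now being reused. */
		*nextp = kHFSFirstUserCatalogNodeID;
		*vcbatrbp |= kHFSCatalogNodeIDsReusedMask;
	} else {
		*nextp = cnid + 1;
	}
	return (cnid);
}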
+ */ + HFS_MOUNT_LOCK(hfsmp, TRUE); + mntlock = 1; + nextCNID = hfsmp->vcbNxtCNID; + if (nextCNID == 0xFFFFFFFF) { + if (std_hfs) { + result = ENOSPC; + } else { + hfsmp->vcbNxtCNID = kHFSFirstUserCatalogNodeID; + hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask; + } + } else { + hfsmp->vcbNxtCNID++; + } + hfsmp->vcbFlags |= 0xFF00; + /* OK to drop lock if CNIDs are not wrapping */ + if ((hfsmp->vcbAtrb & kHFSCatalogNodeIDsReusedMask) == 0) { + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + mntlock = 0; + if (result) + return (result); /* HFS only exit */ + } /* Get space for iterator, key and data */ MALLOC(bto, struct btobj *, sizeof(struct btobj), M_TEMP, M_WAITOK); - bzero(bto, sizeof(struct btobj)); + bto->iterator.hint.nodeNum = 0; result = buildkey(hfsmp, descp, &bto->key, 0); if (result) @@ -653,14 +776,11 @@ cat_create(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr buildthreadkey(nextCNID, std_hfs, (CatalogKey *) &bto->iterator.key); result = BTInsertRecord(fcb, &bto->iterator, &btdata, datalen); - if (result == btExists && !std_hfs) { + if ((result == btExists) && !std_hfs && mntlock) { /* * Allow CNIDs on HFS Plus volumes to wrap around */ - ++nextCNID; - if (nextCNID < kHFSFirstUserCatalogNodeID) { - vcb->vcbAtrb |= kHFSCatalogNodeIDsReusedMask; - vcb->vcbFlags |= 0xFF00; + if (++nextCNID < kHFSFirstUserCatalogNodeID) { nextCNID = kHFSFirstUserCatalogNodeID; } continue; @@ -669,6 +789,19 @@ cat_create(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr } if (result) goto exit; } + + /* + * CNID is now established. If we have wrapped then + * update the vcbNxtCNID and drop the vcb lock. + */ + if (mntlock) { + hfsmp->vcbNxtCNID = nextCNID + 1; + if (hfsmp->vcbNxtCNID < kHFSFirstUserCatalogNodeID) { + hfsmp->vcbNxtCNID = kHFSFirstUserCatalogNodeID; + } + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + mntlock = 0; + } /* * Now insert the file/directory record @@ -716,18 +849,10 @@ cat_create(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr } attrp->ca_fileid = nextCNID; - /* Update parent stats */ - TrashCatalogIterator(vcb, descp->cd_parentcnid); - - /* Update volume stats */ - if (++nextCNID < kHFSFirstUserCatalogNodeID) { - vcb->vcbAtrb |= kHFSCatalogNodeIDsReusedMask; - nextCNID = kHFSFirstUserCatalogNodeID; - } - vcb->vcbNxtCNID = nextCNID; - vcb->vcbFlags |= 0xFF00; - exit: + if (mntlock) + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + (void) BTFlushPath(fcb); FREE(bto, M_TEMP); @@ -796,7 +921,7 @@ cat_rename ( * When moving a directory, make sure its a valid move. */ if (directory && (from_cdp->cd_parentcnid != to_cdp->cd_parentcnid)) { - struct BTreeIterator iterator = {0}; + struct BTreeIterator iterator; cnid_t cnid = from_cdp->cd_cnid; cnid_t pathcnid = todir_cdp->cd_parentcnid; @@ -807,7 +932,7 @@ cat_rename ( result = EINVAL; goto exit; } - + bzero(&iterator, sizeof(iterator)); /* * Traverese destination path all the way back to the root * making sure that source directory is not encountered. @@ -833,8 +958,33 @@ cat_rename ( */ result = BTSearchRecord(fcb, from_iterator, &btdata, &datasize, from_iterator); - if (result) - goto exit; + if (result) { + if (std_hfs || (result != btNotFound)) + goto exit; + + struct cat_desc temp_desc; + + /* Probably the node has mangled name */ + result = cat_lookupmangled(hfsmp, from_cdp, 0, &temp_desc, NULL, NULL); + if (result) + goto exit; + + /* The file has mangled name. 
Search the cnode data using the full name */
+ bzero(from_iterator, sizeof(*from_iterator));
+ result = buildkey(hfsmp, &temp_desc, (HFSPlusCatalogKey *)&from_iterator->key, 0);
+ if (result) {
+ cat_releasedesc(&temp_desc);
+ goto exit;
+ }
+
+ result = BTSearchRecord(fcb, from_iterator, &btdata, &datasize, from_iterator);
+ if (result) {
+ cat_releasedesc(&temp_desc);
+ goto exit;
+ }
+
+ cat_releasedesc(&temp_desc);
+ }
 /* Update the text encoding (on disk and in descriptor) */
 if (!std_hfs) {
@@ -862,11 +1012,6 @@ cat_rename (
 goto exit;
 #endif
- /* Trash the iterator caches */
- TrashCatalogIterator(vcb, from_cdp->cd_parentcnid);
- if (from_cdp->cd_parentcnid != to_cdp->cd_parentcnid)
- TrashCatalogIterator(vcb, to_cdp->cd_parentcnid);
-
 /* Step 2: Insert cnode at new location */
 result = BTInsertRecord(fcb, to_iterator, &btdata, datasize);
 if (result == btExists) {
@@ -1014,22 +1159,22 @@ cat_delete(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr
 * A file must be zero length (no blocks)
 */
 if (descp->cd_cnid < kHFSFirstUserCatalogNodeID ||
- descp->cd_parentcnid == kRootParID)
+ descp->cd_parentcnid == kHFSRootParentID)
 return (EINVAL);
 /* XXX Preflight Missing */
 /* Get space for iterator */
 MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK);
- bzero(iterator, sizeof(*iterator));
+ iterator->hint.nodeNum = 0;
 /*
 * Derive a key from either the file ID (for a virtual inode)
 * or the descriptor.
 */
 if (descp->cd_namelen == 0) {
- result = getkey(hfsmp, attrp->ca_fileid, (CatalogKey *)&iterator->key);
- cnid = attrp->ca_fileid;
+ result = getkey(hfsmp, attrp->ca_fileid, (CatalogKey *)&iterator->key);
+ cnid = attrp->ca_fileid;
 } else {
 result = buildkey(hfsmp, descp, (HFSPlusCatalogKey *)&iterator->key, 0);
 cnid = descp->cd_cnid;
@@ -1039,15 +1184,39 @@ cat_delete(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr
 /* Delete record */
 result = BTDeleteRecord(fcb, iterator);
- if (result)
- goto exit;
+ if (result) {
+ if (std_hfs || (result != btNotFound))
+ goto exit;
+
+ struct cat_desc temp_desc;
+
+ /* The node probably has a mangled name */
+ result = cat_lookupmangled(hfsmp, descp, 0, &temp_desc, attrp, NULL);
+ if (result)
+ goto exit;
+
+ /* The file has a mangled name. Delete the file using the full name */
+ bzero(iterator, sizeof(*iterator));
+ result = buildkey(hfsmp, &temp_desc, (HFSPlusCatalogKey *)&iterator->key, 0);
+ cnid = temp_desc.cd_cnid;
+ if (result) {
+ cat_releasedesc(&temp_desc);
+ goto exit;
+ }
+
+ result = BTDeleteRecord(fcb, iterator);
+ if (result) {
+ cat_releasedesc(&temp_desc);
+ goto exit;
+ }
+
+ cat_releasedesc(&temp_desc);
+ }
 /* Delete thread record, ignore errors */
 buildthreadkey(cnid, std_hfs, (CatalogKey *)&iterator->key);
 (void) BTDeleteRecord(fcb, iterator);
- TrashCatalogIterator(vcb, descp->cd_parentcnid);
-
 exit:
 (void) BTFlushPath(fcb);
 FREE(iterator, M_TEMP);
@@ -1084,7 +1253,6 @@ cat_update(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr
 /* Get space for iterator */
 MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK);
- bzero(iterator, sizeof(*iterator));
 /*
 * For open-deleted files we need to do a lookup by cnid
@@ -1124,8 +1292,7 @@ exit:
 * This is called from within BTUpdateRecord.
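/*
 * catrec_update() below follows the BTUpdateRecord callback contract:
 * the b-tree layer locates the record, passes the callback a pointer
 * to the live record bytes, and the callback edits them in place,
 * returning 0 to commit or a b-tree error such as btNotFound to abort
 * when a sanity check fails. A skeletal example of that shape (a
 * hypothetical callback, not the HFS one; field names assumed):
 */
static int
sample_update(const CatalogKey *ckp, CatalogRecord *crp,
              struct update_state *state)
{
	/* Abort if the located record does not match the descriptor. */
	if (ckp->hfsPlus.parentID != state->s_desc->cd_parentcnid)
		return (btNotFound);

	/* Edit in place; the b-tree layer writes the record back. */
	crp->hfsPlusFile.contentModDate = to_hfs_time(state->s_attr->ca_mtime);
	return (0);
}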
*/ static int -catrec_update(const CatalogKey *ckp, CatalogRecord *crp, u_int16_t reclen, - struct update_state *state) +catrec_update(const CatalogKey *ckp, CatalogRecord *crp, struct update_state *state) { struct cat_desc *descp; struct cat_attr *attrp; @@ -1199,15 +1366,18 @@ catrec_update(const CatalogKey *ckp, CatalogRecord *crp, u_int16_t reclen, dir = (struct HFSPlusCatalogFolder *)crp; /* Do a quick sanity check */ if ((ckp->hfsPlus.parentID != descp->cd_parentcnid) || - (dir->folderID != descp->cd_cnid)) + (dir->folderID != descp->cd_cnid)) return (btNotFound); + dir->flags = attrp->ca_recflags; dir->valence = attrp->ca_entries; dir->createDate = to_hfs_time(attrp->ca_itime); dir->contentModDate = to_hfs_time(attrp->ca_mtime); dir->backupDate = to_hfs_time(attrp->ca_btime); dir->accessDate = to_hfs_time(attrp->ca_atime); + attrp->ca_atimeondisk = attrp->ca_atime; dir->attributeModDate = to_hfs_time(attrp->ca_ctime); dir->textEncoding = descp->cd_encoding; + dir->attrBlocks = attrp->ca_attrblks; bcopy(&attrp->ca_finderinfo[0], &dir->userInfo, 32); /* * Update the BSD Info if it was already initialized on @@ -1237,8 +1407,7 @@ catrec_update(const CatalogKey *ckp, CatalogRecord *crp, u_int16_t reclen, ((attrp->ca_mode & ALLPERMS) != (hfsmp->hfs_dir_mask & ACCESSPERMS))) { if ((dir->bsdInfo.fileMode == 0) || - (HFSTOVFS(hfsmp)->mnt_flag & - MNT_UNKNOWNPERMISSIONS) == 0) { + (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) == 0) { dir->bsdInfo.ownerID = attrp->ca_uid; dir->bsdInfo.groupID = attrp->ca_gid; } @@ -1255,12 +1424,15 @@ catrec_update(const CatalogKey *ckp, CatalogRecord *crp, u_int16_t reclen, /* Do a quick sanity check */ if (file->fileID != attrp->ca_fileid) return (btNotFound); + file->flags = attrp->ca_recflags; file->createDate = to_hfs_time(attrp->ca_itime); file->contentModDate = to_hfs_time(attrp->ca_mtime); file->backupDate = to_hfs_time(attrp->ca_btime); file->accessDate = to_hfs_time(attrp->ca_atime); + attrp->ca_atimeondisk = attrp->ca_atime; file->attributeModDate = to_hfs_time(attrp->ca_ctime); file->textEncoding = descp->cd_encoding; + file->attrBlocks = attrp->ca_attrblks; bcopy(&attrp->ca_finderinfo[0], &file->userInfo, 32); /* * Update the BSD Info if it was already initialized on @@ -1290,8 +1462,7 @@ catrec_update(const CatalogKey *ckp, CatalogRecord *crp, u_int16_t reclen, ((attrp->ca_mode & ALLPERMS) != (hfsmp->hfs_file_mask & ACCESSPERMS))) { if ((file->bsdInfo.fileMode == 0) || - (HFSTOVFS(hfsmp)->mnt_flag & - MNT_UNKNOWNPERMISSIONS) == 0) { + (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) == 0) { file->bsdInfo.ownerID = attrp->ca_uid; file->bsdInfo.groupID = attrp->ca_gid; } @@ -1316,7 +1487,7 @@ catrec_update(const CatalogKey *ckp, CatalogRecord *crp, u_int16_t reclen, bcopy(&forkp->cf_extents[0], &file->dataFork.extents, sizeof(HFSPlusExtentRecord)); /* Push blocks read to disk */ - file->resourceFork.clumpSize = + file->dataFork.clumpSize = howmany(forkp->cf_bytesread, blksize); } @@ -1346,8 +1517,8 @@ catrec_update(const CatalogKey *ckp, CatalogRecord *crp, u_int16_t reclen, } /* - * catrec_readattr - - * This is called from within BTIterateRecords. + * Callback to collect directory entries. + * Called with readattr_state for each item in a directory. 
*/ struct readattr_state { struct hfsmount *hfsmp; @@ -1358,8 +1529,8 @@ struct readattr_state { }; static int -catrec_readattr(const CatalogKey *key, const CatalogRecord *rec, - u_long node, struct readattr_state *state) +cat_readattr(const CatalogKey *key, const CatalogRecord *rec, + struct readattr_state *state) { struct cat_entrylist *list = state->list; struct hfsmount *hfsmp = state->hfsmp; @@ -1387,7 +1558,7 @@ catrec_readattr(const CatalogKey *key, const CatalogRecord *rec, } /* Hide the private meta data directory and journal files */ - if (parentcnid == kRootDirID) { + if (parentcnid == kHFSRootFolderID) { if ((rec->recordType == kHFSPlusFolderRecord) && (rec->hfsPlusFolder.folderID == hfsmp->hfs_privdir_desc.cd_cnid)) { return (1); /* continue */ @@ -1401,7 +1572,6 @@ catrec_readattr(const CatalogKey *key, const CatalogRecord *rec, } } - cep = &list->entry[list->realentries++]; if (state->stdhfs) { @@ -1414,7 +1584,7 @@ catrec_readattr(const CatalogKey *key, const CatalogRecord *rec, MALLOC(pluskey, HFSPlusCatalogKey *, sizeof(HFSPlusCatalogKey), M_TEMP, M_WAITOK); promotekey(hfsmp, (HFSCatalogKey *)key, pluskey, &encoding); - builddesc(pluskey, getcnid(rec), node, encoding, isadir(rec), &cep->ce_desc); + builddesc(pluskey, getcnid(rec), 0, encoding, isadir(rec), &cep->ce_desc); FREE(pluskey, M_TEMP); if (rec->recordType == kHFSFileRecord) { @@ -1427,7 +1597,7 @@ catrec_readattr(const CatalogKey *key, const CatalogRecord *rec, } } else { getbsdattr(hfsmp, (struct HFSPlusCatalogFile *)rec, &cep->ce_attr); - builddesc((HFSPlusCatalogKey *)key, getcnid(rec), node, getencoding(rec), + builddesc((HFSPlusCatalogKey *)key, getcnid(rec), 0, getencoding(rec), isadir(rec), &cep->ce_desc); if (rec->recordType == kHFSPlusFileRecord) { @@ -1447,12 +1617,13 @@ catrec_readattr(const CatalogKey *key, const CatalogRecord *rec, } /* + * Pack a cat_entrylist buffer with attributes from the catalog + * * Note: index is zero relative */ __private_extern__ int -cat_getentriesattr(struct hfsmount *hfsmp, struct cat_desc *prevdesc, int index, - struct cat_entrylist *ce_list) +cat_getentriesattr(struct hfsmount *hfsmp, directoryhint_t *dirhint, struct cat_entrylist *ce_list) { FCB* fcb; CatalogKey * key; @@ -1461,13 +1632,15 @@ cat_getentriesattr(struct hfsmount *hfsmp, struct cat_desc *prevdesc, int index, cnid_t parentcnid; int i; int std_hfs; + int index; + int have_key; int result = 0; ce_list->realentries = 0; fcb = GetFileControlBlock(HFSTOVCB(hfsmp)->catalogRefNum); std_hfs = (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord); - parentcnid = prevdesc->cd_parentcnid; + parentcnid = dirhint->dh_desc.cd_parentcnid; state.hfsmp = hfsmp; state.list = ce_list; @@ -1478,37 +1651,63 @@ cat_getentriesattr(struct hfsmount *hfsmp, struct cat_desc *prevdesc, int index, MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); bzero(iterator, sizeof(*iterator)); key = (CatalogKey *)&iterator->key; - iterator->hint.nodeNum = prevdesc->cd_hint; + have_key = 0; + iterator->hint.nodeNum = dirhint->dh_desc.cd_hint; + index = dirhint->dh_index + 1; /* - * If the last entry wasn't cached then establish the iterator + * Attempt to build a key from cached filename */ - if ((index == 0) || - (prevdesc->cd_namelen == 0) || - (buildkey(hfsmp, prevdesc, (HFSPlusCatalogKey *)key, 0) != 0)) { - int i; + if (dirhint->dh_desc.cd_namelen != 0) { + if (buildkey(hfsmp, &dirhint->dh_desc, (HFSPlusCatalogKey *)key, 0) == 0) { + have_key = 1; + } + } + + /* + * If the last entry wasn't cached then position the btree 
iterator + */ + if ((index == 0) || !have_key) { /* - * Position the iterator at the directory thread. - * (ie just before the first entry) + * Position the iterator at the directory's thread record. + * (i.e. just before the first entry) */ - buildthreadkey(parentcnid, std_hfs, key); + buildthreadkey(dirhint->dh_desc.cd_parentcnid, (hfsmp->hfs_flags & HFS_STANDARD), key); result = BTSearchRecord(fcb, iterator, NULL, NULL, iterator); - if (result) - goto exit; /* bad news */ + if (result) { + result = MacToVFSError(result); + goto exit; + } + /* * Iterate until we reach the entry just * before the one we want to start with. */ - for (i = 0; i < index; ++i) { - result = BTIterateRecord(fcb, kBTreeNextRecord, iterator, NULL, NULL); - if (result) - goto exit; /* bad news */ + if (index > 0) { + struct position_state ps; + + ps.error = 0; + ps.count = 0; + ps.index = index; + ps.parentID = dirhint->dh_desc.cd_parentcnid; + ps.hfsmp = hfsmp; + + result = BTIterateRecords(fcb, kBTreeNextRecord, iterator, + (IterateCallBackProcPtr)cat_findposition, &ps); + if (ps.error) + result = ps.error; + else + result = MacToVFSError(result); + if (result) { + result = MacToVFSError(result); + goto exit; + } } } - /* Fill list with entries. */ + /* Fill list with entries starting at iterator->key. */ result = BTIterateRecords(fcb, kBTreeNextRecord, iterator, - (IterateCallBackProcPtr)catrec_readattr, &state); + (IterateCallBackProcPtr)cat_readattr, &state); if (state.error) result = state.error; @@ -1523,7 +1722,7 @@ cat_getentriesattr(struct hfsmount *hfsmp, struct cat_desc *prevdesc, int index, /* * Resolve any hard links. */ - for (i = 0; i < ce_list->realentries; ++i) { + for (i = 0; i < (int)ce_list->realentries; ++i) { struct FndrFileInfo *fip; struct cat_entry *cep; struct HFSPlusCatalogFile filerec; @@ -1539,8 +1738,8 @@ cat_getentriesattr(struct hfsmount *hfsmp, struct cat_desc *prevdesc, int index, if ((cep->ce_attr.ca_rdev != 0) && (SWAP_BE32(fip->fdType) == kHardLinkFileType) && (SWAP_BE32(fip->fdCreator) == kHFSPlusCreator) - && ((cep->ce_attr.ca_itime == HFSTOVCB(hfsmp)->vcbCrDate) || - (cep->ce_attr.ca_itime == hfsmp->hfs_metadata_createdate))) { + && ((cep->ce_attr.ca_itime == (time_t)HFSTOVCB(hfsmp)->vcbCrDate) || + (cep->ce_attr.ca_itime == (time_t)hfsmp->hfs_metadata_createdate))) { if (resolvelink(hfsmp, cep->ce_attr.ca_rdev, &filerec) != 0) continue; @@ -1558,109 +1757,113 @@ exit: return MacToVFSError(result); } -struct linkinfo { - u_long link_ref; - void * dirent_addr; -}; +#define SMALL_DIRENTRY_SIZE (int)(sizeof(struct dirent) - (MAXNAMLEN + 1) + 8) -struct read_state { - u_int32_t cbs_parentID; - u_int32_t cbs_hiddenDirID; - u_int32_t cbs_hiddenJournalID; - u_int32_t cbs_hiddenInfoBlkID; - off_t cbs_lastoffset; - struct uio * cbs_uio; - ExtendedVCB * cbs_vcb; - int8_t cbs_hfsPlus; - int8_t cbs_case_sensitive; - int16_t cbs_result; - int32_t cbs_numresults; - u_long *cbs_cookies; - int32_t cbs_ncookies; - int32_t cbs_nlinks; - int32_t cbs_maxlinks; - struct linkinfo *cbs_linkinfo; -}; +/* + * Callback to pack directory entries. + * Called with packdirentry_state for each item in a directory. + */ -/* Map file mode type to directory entry types */ -u_char modetodirtype[16] = { - DT_REG, DT_FIFO, DT_CHR, DT_UNKNOWN, - DT_DIR, DT_UNKNOWN, DT_BLK, DT_UNKNOWN, - DT_REG, DT_UNKNOWN, DT_LNK, DT_UNKNOWN, - DT_SOCK, DT_UNKNOWN, DT_WHT, DT_UNKNOWN +/* Hard link information collected during cat_getdirentries. 
*/ +struct linkinfo { + u_long link_ref; + user_addr_t dirent_addr; +}; +typedef struct linkinfo linkinfo_t; + +/* State information for the cat_packdirentry callback function. */ +struct packdirentry_state { + int cbs_extended; + u_int32_t cbs_parentID; + u_int32_t cbs_index; + uio_t cbs_uio; + ExtendedVCB * cbs_hfsmp; + int cbs_result; + int32_t cbs_nlinks; + int32_t cbs_maxlinks; + linkinfo_t * cbs_linkinfo; + struct cat_desc * cbs_desc; +// struct dirent * cbs_stdentry; + struct direntry * cbs_direntry; }; - -#define MODE_TO_DT(mode) (modetodirtype[((mode) & S_IFMT) >> 12]) static int -catrec_read(const CatalogKey *ckp, const CatalogRecord *crp, - u_int16_t recordLen, struct read_state *state) +cat_packdirentry(const CatalogKey *ckp, const CatalogRecord *crp, + struct packdirentry_state *state) { struct hfsmount *hfsmp; CatalogName *cnp; - size_t utf8chars; - u_int32_t curID; + cnid_t curID; OSErr result; struct dirent catent; + struct direntry * entry = NULL; time_t itime; u_long ilinkref = 0; - void * uiobase; + cnid_t cnid; + int hide = 0; + u_int8_t type; + u_int8_t is_mangled = 0; + char *nameptr; + user_addr_t uiobase; + size_t namelen = 0; + size_t maxnamelen; + size_t uiosize = 0; + caddr_t uioaddr; - if (state->cbs_hfsPlus) - curID = ckp->hfsPlus.parentID; - else + hfsmp = state->cbs_hfsmp; + + if (hfsmp->hfs_flags & HFS_STANDARD) curID = ckp->hfs.parentID; + else + curID = ckp->hfsPlus.parentID; /* We're done when parent directory changes */ if (state->cbs_parentID != curID) { -lastitem: -/* - * The NSDirectoryList class chokes on empty records (it doesnt check d_reclen!) - * so remove padding for now... - */ -#if 0 - /* - * Pad the end of list with an empty record. - * This eliminates an extra call by readdir(3c). - */ - catent.d_fileno = 0; - catent.d_reclen = 0; - catent.d_type = 0; - catent.d_namlen = 0; - *(int32_t*)&catent.d_name[0] = 0; - - state->cbs_lastoffset = state->cbs_uio->uio_offset; - - state->cbs_result = uiomove((caddr_t) &catent, 12, state->cbs_uio); - if (state->cbs_result == 0) - state->cbs_result = ENOENT; -#else - state->cbs_lastoffset = state->cbs_uio->uio_offset; state->cbs_result = ENOENT; -#endif return (0); /* stop */ } - if (state->cbs_hfsPlus) { + if (state->cbs_extended) { + entry = state->cbs_direntry; + nameptr = &entry->d_name[0]; + maxnamelen = NAME_MAX; + } else { + nameptr = &catent.d_name[0]; + maxnamelen = NAME_MAX; + } + + if (!(hfsmp->hfs_flags & HFS_STANDARD)) { switch(crp->recordType) { case kHFSPlusFolderRecord: - catent.d_type = DT_DIR; - catent.d_fileno = crp->hfsPlusFolder.folderID; + type = DT_DIR; + cnid = crp->hfsPlusFolder.folderID; + /* Hide our private meta data directory */ + if ((curID == kHFSRootFolderID) && + (cnid == hfsmp->hfs_privdir_desc.cd_cnid)) { + hide = 1; + } + break; case kHFSPlusFileRecord: itime = to_bsd_time(crp->hfsPlusFile.createDate); - hfsmp = VCBTOHFS(state->cbs_vcb); /* * When a hardlink link is encountered save its link ref. 
*/ if ((SWAP_BE32(crp->hfsPlusFile.userInfo.fdType) == kHardLinkFileType) && (SWAP_BE32(crp->hfsPlusFile.userInfo.fdCreator) == kHFSPlusCreator) && - ((itime == state->cbs_vcb->vcbCrDate) || - (itime == hfsmp->hfs_metadata_createdate))) { + ((itime == (time_t)hfsmp->hfs_itime) || + (itime == (time_t)hfsmp->hfs_metadata_createdate))) { ilinkref = crp->hfsPlusFile.bsdInfo.special.iNodeNum; } - catent.d_type = MODE_TO_DT(crp->hfsPlusFile.bsdInfo.fileMode); - catent.d_fileno = crp->hfsPlusFile.fileID; + type = MODE_TO_DT(crp->hfsPlusFile.bsdInfo.fileMode); + cnid = crp->hfsPlusFile.fileID; + /* Hide the journal files */ + if ((curID == kHFSRootFolderID) && + (hfsmp->jnl) && + ((cnid == hfsmp->hfs_jnlfileid) || + (cnid == hfsmp->hfs_jnlinfoblkid))) { + hide = 1; + } break; default: return (0); /* stop */ @@ -1668,83 +1871,119 @@ lastitem: cnp = (CatalogName*) &ckp->hfsPlus.nodeName; result = utf8_encodestr(cnp->ustr.unicode, cnp->ustr.length * sizeof(UniChar), - catent.d_name, &utf8chars, kdirentMaxNameBytes + 1, ':', 0); + nameptr, &namelen, maxnamelen + 1, ':', 0); if (result == ENAMETOOLONG) { result = ConvertUnicodeToUTF8Mangled(cnp->ustr.length * sizeof(UniChar), - cnp->ustr.unicode, kdirentMaxNameBytes + 1, (ByteCount*)&utf8chars, catent.d_name, catent.d_fileno); + cnp->ustr.unicode, maxnamelen + 1, + (ByteCount*)&namelen, nameptr, + cnid); + is_mangled = 1; } } else { /* hfs */ switch(crp->recordType) { case kHFSFolderRecord: - catent.d_type = DT_DIR; - catent.d_fileno = crp->hfsFolder.folderID; + type = DT_DIR; + cnid = crp->hfsFolder.folderID; break; case kHFSFileRecord: - catent.d_type = DT_REG; - catent.d_fileno = crp->hfsFile.fileID; + type = DT_REG; + cnid = crp->hfsFile.fileID; break; default: return (0); /* stop */ }; cnp = (CatalogName*) ckp->hfs.nodeName; - result = hfs_to_utf8(state->cbs_vcb, cnp->pstr, kdirentMaxNameBytes + 1, - (ByteCount *)&utf8chars, catent.d_name); + result = hfs_to_utf8(hfsmp, cnp->pstr, maxnamelen + 1, + (ByteCount *)&namelen, nameptr); /* * When an HFS name cannot be encoded with the current * volume encoding we use MacRoman as a fallback. */ if (result) - result = mac_roman_to_utf8(cnp->pstr, kdirentMaxNameBytes + 1, - (ByteCount *)&utf8chars, catent.d_name); + result = mac_roman_to_utf8(cnp->pstr, maxnamelen + 1, + (ByteCount *)&namelen, nameptr); } - catent.d_namlen = utf8chars; - catent.d_reclen = DIRENTRY_SIZE(utf8chars); - - /* hide our private meta data directory */ - if (curID == kRootDirID && - catent.d_fileno == state->cbs_hiddenDirID && - catent.d_type == DT_DIR) { - if (state->cbs_case_sensitive) { - // This is how we skip over these entries. The next - // time we fill in a real item the uio_offset will - // point to the correct place in the "virtual" directory - // so that PositionIterator() will do the right thing - // when scanning to get to a particular position in the - // directory. 
- state->cbs_uio->uio_offset += catent.d_reclen; - state->cbs_lastoffset = state->cbs_uio->uio_offset; - - return (1); /* skip and continue */ - } else - goto lastitem; - } - - /* Hide the journal files */ - if ((curID == kRootDirID) && - (catent.d_type == DT_REG) && - ((catent.d_fileno == state->cbs_hiddenJournalID) || - (catent.d_fileno == state->cbs_hiddenInfoBlkID))) { - - // see comment up above for why this is here - state->cbs_uio->uio_offset += catent.d_reclen; - state->cbs_lastoffset = state->cbs_uio->uio_offset; + if (state->cbs_extended) { + entry->d_type = type; + entry->d_namlen = namelen; + entry->d_reclen = uiosize = EXT_DIRENT_LEN(namelen); + if (hide) + entry->d_fileno = 0; /* file number = 0 means skip entry */ + else + entry->d_fileno = cnid; - return (1); /* skip and continue */ + /* + * The index is 1 relative and includes "." and ".." + * + * Also stuff the cnid in the upper 32 bits of the cookie. + */ + entry->d_seekoff = (state->cbs_index + 3) | ((u_int64_t)cnid << 32); + uioaddr = (caddr_t) entry; + } else { + catent.d_type = type; + catent.d_namlen = namelen; + catent.d_reclen = uiosize = STD_DIRENT_LEN(namelen); + if (hide) + catent.d_fileno = 0; /* file number = 0 means skip entry */ + else + catent.d_fileno = cnid; + uioaddr = (caddr_t) &catent; } - state->cbs_lastoffset = state->cbs_uio->uio_offset; - uiobase = state->cbs_uio->uio_iov->iov_base; + /* Save current base address for post processing of hard-links. */ + uiobase = uio_curriovbase(state->cbs_uio); - /* if this entry won't fit then we're done */ - if (catent.d_reclen > state->cbs_uio->uio_resid || - (ilinkref != 0 && state->cbs_nlinks == state->cbs_maxlinks) || - (state->cbs_ncookies != 0 && state->cbs_numresults >= state->cbs_ncookies)) + /* If this entry won't fit then we're done */ + if ((uiosize > uio_resid(state->cbs_uio)) || + (ilinkref != 0 && state->cbs_nlinks == state->cbs_maxlinks)) { return (0); /* stop */ + } - state->cbs_result = uiomove((caddr_t) &catent, catent.d_reclen, state->cbs_uio); + state->cbs_result = uiomove(uioaddr, uiosize, state->cbs_uio); + if (state->cbs_result == 0) { + ++state->cbs_index; + /* Remember previous entry */ + state->cbs_desc->cd_cnid = cnid; + if (type == DT_DIR) { + state->cbs_desc->cd_flags |= CD_ISDIR; + } else { + state->cbs_desc->cd_flags &= ~CD_ISDIR; + } + if (state->cbs_desc->cd_nameptr != NULL) { + vfs_removename(state->cbs_desc->cd_nameptr); + } +#if 0 + state->cbs_desc->cd_encoding = xxxx; +#endif + if (!is_mangled) { + state->cbs_desc->cd_namelen = namelen; + state->cbs_desc->cd_nameptr = vfs_addname(nameptr, namelen, 0, 0); + } else { + /* Store unmangled name for the directory hint else it will + * restart readdir at the last location again + */ + char *new_nameptr; + size_t bufsize; + + cnp = (CatalogName *)&ckp->hfsPlus.nodeName; + bufsize = 1 + utf8_encodelen(cnp->ustr.unicode, + cnp->ustr.length * sizeof(UniChar), + ':', 0); + MALLOC(new_nameptr, char *, bufsize, M_TEMP, M_WAITOK); + result = utf8_encodestr(cnp->ustr.unicode, + cnp->ustr.length * sizeof(UniChar), + new_nameptr, &namelen, + bufsize, ':', 0); + + state->cbs_desc->cd_namelen = namelen; + state->cbs_desc->cd_nameptr = vfs_addname(new_nameptr, namelen, 0, 0); + + FREE(new_nameptr, M_TEMP); + } + } /* * Record any hard links for post processing. 
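/*
 * For extended entries the d_seekoff cookie built above packs two
 * values into 64 bits: the low half holds the 1-relative directory
 * index (offset by 3 to account for "." and ".."), the high half the
 * entry's cnid. Unpacking is the inverse shift and mask (illustrative
 * helpers, not part of the patch):
 */
static u_int32_t
cookie_to_index(u_int64_t seekoff)
{
	return ((u_int32_t)seekoff);		/* low 32 bits: index + 3 */
}

static u_int32_t
cookie_to_cnid(u_int64_t seekoff)
{
	return ((u_int32_t)(seekoff >> 32));	/* high 32 bits: cnid */
}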
*/ @@ -1756,161 +1995,211 @@ lastitem: state->cbs_nlinks++; } - if (state->cbs_cookies) { - state->cbs_cookies[state->cbs_numresults++] = state->cbs_uio->uio_offset; - } else { - state->cbs_numresults++; - } - - /* continue iteration if there's room */ + /* Continue iteration if there's room */ return (state->cbs_result == 0 && - state->cbs_uio->uio_resid >= AVERAGE_HFSDIRENTRY_SIZE); + uio_resid(state->cbs_uio) >= SMALL_DIRENTRY_SIZE); } -#define SMALL_DIRENTRY_SIZE (sizeof(struct dirent) - (MAXNAMLEN + 1) + 8) + /* - * + * Pack a uio buffer with directory entries from the catalog */ __private_extern__ int -cat_getdirentries(struct hfsmount *hfsmp, struct cat_desc *descp, int entrycnt, - struct uio *uio, int *eofflag, u_long *cookies, int ncookies) +cat_getdirentries(struct hfsmount *hfsmp, int entrycnt, directoryhint_t *dirhint, + uio_t uio, int extended, int * items) { - ExtendedVCB *vcb = HFSTOVCB(hfsmp); + FCB* fcb; BTreeIterator * iterator; - CatalogIterator *cip; - u_int32_t diroffset; - u_int16_t op; - struct read_state state; - u_int32_t dirID = descp->cd_cnid; + CatalogKey * key; + struct packdirentry_state state; void * buffer; int bufsize; - int maxdirentries; + int maxlinks; int result; - - diroffset = uio->uio_offset; - *eofflag = 0; - maxdirentries = MIN(entrycnt, uio->uio_resid / SMALL_DIRENTRY_SIZE); + int index; + int have_key; + + fcb = GetFileControlBlock(hfsmp->hfs_catalog_vp); /* Get a buffer for collecting link info and for a btree iterator */ - bufsize = (maxdirentries * sizeof(struct linkinfo)) + sizeof(*iterator); + maxlinks = MIN(entrycnt, uio_resid(uio) / SMALL_DIRENTRY_SIZE); + bufsize = (maxlinks * sizeof(linkinfo_t)) + sizeof(*iterator); + if (extended) { + bufsize += sizeof(struct direntry); + } MALLOC(buffer, void *, bufsize, M_TEMP, M_WAITOK); bzero(buffer, bufsize); + state.cbs_extended = extended; state.cbs_nlinks = 0; - state.cbs_maxlinks = maxdirentries; - state.cbs_linkinfo = (struct linkinfo *) buffer; - iterator = (BTreeIterator *) ((char *)buffer + (maxdirentries * sizeof(struct linkinfo))); - - /* get an iterator and position it */ - cip = GetCatalogIterator(vcb, dirID, diroffset); + state.cbs_maxlinks = maxlinks; + state.cbs_linkinfo = (linkinfo_t *) buffer; - result = PositionIterator(cip, diroffset, iterator, &op); - if (result == cmNotFound) { - *eofflag = 1; - result = 0; - AgeCatalogIterator(cip); - goto cleanup; - } else if ((result = MacToVFSError(result))) - goto cleanup; + iterator = (BTreeIterator *) ((char *)buffer + (maxlinks * sizeof(linkinfo_t))); + key = (CatalogKey *)&iterator->key; + have_key = 0; + index = dirhint->dh_index + 1; + if (extended) { + state.cbs_direntry = (struct direntry *)((char *)buffer + sizeof(BTreeIterator)); + } + /* + * Attempt to build a key from cached filename + */ + if (dirhint->dh_desc.cd_namelen != 0) { + if (buildkey(hfsmp, &dirhint->dh_desc, (HFSPlusCatalogKey *)key, 0) == 0) { + have_key = 1; + } + } - state.cbs_hiddenDirID = hfsmp->hfs_privdir_desc.cd_cnid; - if (hfsmp->jnl) { - state.cbs_hiddenJournalID = hfsmp->hfs_jnlfileid; - state.cbs_hiddenInfoBlkID = hfsmp->hfs_jnlinfoblkid; + /* + * If the last entry wasn't cached then position the btree iterator + */ + if ((index == 0) || !have_key) { + /* + * Position the iterator at the directory's thread record. + * (i.e. 
just before the first entry) + */ + buildthreadkey(dirhint->dh_desc.cd_parentcnid, (hfsmp->hfs_flags & HFS_STANDARD), key); + result = BTSearchRecord(fcb, iterator, NULL, NULL, iterator); + if (result) { + result = MacToVFSError(result); + goto cleanup; + } + + /* + * Iterate until we reach the entry just + * before the one we want to start with. + */ + if (index > 0) { + struct position_state ps; + + ps.error = 0; + ps.count = 0; + ps.index = index; + ps.parentID = dirhint->dh_desc.cd_parentcnid; + ps.hfsmp = hfsmp; + + result = BTIterateRecords(fcb, kBTreeNextRecord, iterator, + (IterateCallBackProcPtr)cat_findposition, &ps); + if (ps.error) + result = ps.error; + else + result = MacToVFSError(result); + if (result) { + result = MacToVFSError(result); + goto cleanup; + } + } } - state.cbs_lastoffset = cip->currentOffset; - state.cbs_vcb = vcb; + state.cbs_index = index; + state.cbs_hfsmp = hfsmp; state.cbs_uio = uio; + state.cbs_desc = &dirhint->dh_desc; state.cbs_result = 0; - state.cbs_parentID = dirID; - if (diroffset <= 2*sizeof(struct hfsdotentry)) { - state.cbs_numresults = diroffset/sizeof(struct hfsdotentry); - } else { - state.cbs_numresults = 0; - } - state.cbs_cookies = cookies; - state.cbs_ncookies = ncookies; + state.cbs_parentID = dirhint->dh_desc.cd_parentcnid; - if (vcb->vcbSigWord == kHFSPlusSigWord) - state.cbs_hfsPlus = 1; - else - state.cbs_hfsPlus = 0; - - if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE) - state.cbs_case_sensitive = 1; - else - state.cbs_case_sensitive = 0; - - /* process as many entries as possible... */ - result = BTIterateRecords(GetFileControlBlock(vcb->catalogRefNum), op, - iterator, (IterateCallBackProcPtr)catrec_read, &state); + /* + * Process as many entries as possible starting at iterator->key. + */ + result = BTIterateRecords(fcb, kBTreeNextRecord, iterator, + (IterateCallBackProcPtr)cat_packdirentry, &state); + /* Note that state.cbs_index is still valid on errors */ + *items = state.cbs_index - index; + index = state.cbs_index; + + /* Finish updating the catalog iterator. */ + dirhint->dh_desc.cd_hint = iterator->hint.nodeNum; + dirhint->dh_desc.cd_flags |= CD_DECOMPOSED; + dirhint->dh_index = index - 1; + /* * Post process any hard links to get the real file id. */ if (state.cbs_nlinks > 0) { - struct iovec aiov; - struct uio auio; - u_int32_t fileid; + u_int32_t fileid = 0; + user_addr_t address; int i; - u_int32_t tempid; - - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_segflg = uio->uio_segflg; - auio.uio_rw = UIO_READ; /* read kernel memory into user memory */ - auio.uio_procp = uio->uio_procp; for (i = 0; i < state.cbs_nlinks; ++i) { - fileid = 0; - if (resolvelinkid(hfsmp, state.cbs_linkinfo[i].link_ref, &fileid) != 0) continue; - - /* Update the file id in the user's buffer */ - aiov.iov_base = (char *) state.cbs_linkinfo[i].dirent_addr; - aiov.iov_len = sizeof(fileid); - auio.uio_offset = 0; - auio.uio_resid = aiov.iov_len; - (void) uiomove((caddr_t)&fileid, sizeof(fileid), &auio); + /* This assumes that d_ino is always first field. */ + address = state.cbs_linkinfo[i].dirent_addr; + if (address == (user_addr_t)0) + continue; + if (uio_isuserspace(uio)) { + (void) copyout(&fileid, address, + extended ? 
sizeof(ino64_t) : sizeof(ino_t)); + } else /* system space */ { + ino64_t *inoptr = (ino64_t *)CAST_DOWN(caddr_t, address); + *inoptr = fileid; + } } } + if (state.cbs_result) result = state.cbs_result; else result = MacToVFSError(result); if (result == ENOENT) { - *eofflag = 1; result = 0; } - if (result == 0) { - cip->currentOffset = state.cbs_lastoffset; - cip->nextOffset = uio->uio_offset; - UpdateCatalogIterator(iterator, cip); - } - cleanup: - if (result) { - cip->volume = 0; - cip->folderID = 0; - AgeCatalogIterator(cip); - } - - (void) ReleaseCatalogIterator(cip); FREE(buffer, M_TEMP); return (result); } +/* + * Callback to establish directory position. + * Called with position_state for each item in a directory. + */ +static int +cat_findposition(const CatalogKey *ckp, const CatalogRecord *crp, + struct position_state *state) +{ + cnid_t curID; + + if (state->hfsmp->hfs_flags & HFS_STANDARD) + curID = ckp->hfs.parentID; + else + curID = ckp->hfsPlus.parentID; + + /* Make sure parent directory didn't change */ + if (state->parentID != curID) { + state->error = EINVAL; + return (0); /* stop */ + } + + /* Count this entry */ + switch(crp->recordType) { + case kHFSPlusFolderRecord: + case kHFSPlusFileRecord: + case kHFSFolderRecord: + case kHFSFileRecord: + ++state->count; + break; + default: + printf("cat_findposition: invalid record type %d in dir %d\n", + crp->recordType, curID); + state->error = EINVAL; + return (0); /* stop */ + }; + + return (state->count < state->index); +} + + /* * cat_binarykeycompare - compare two HFS Plus catalog keys. - * The name portion of the key is comapred using a 16-bit binary comparison. + * The name portion of the key is compared using a 16-bit binary comparison. * This is called from the b-tree code. */ __private_extern__ @@ -1965,6 +2254,69 @@ cat_binarykeycompare(HFSPlusCatalogKey *searchKey, HFSPlusCatalogKey *trialKey) } +/* + * Compare two standard HFS catalog keys + * + * Result: +n search key > trial key + * 0 search key = trial key + * -n search key < trial key + */ +int +CompareCatalogKeys(HFSCatalogKey *searchKey, HFSCatalogKey *trialKey) +{ + cnid_t searchParentID, trialParentID; + int result; + + searchParentID = searchKey->parentID; + trialParentID = trialKey->parentID; + + if (searchParentID > trialParentID) + result = 1; + else if (searchParentID < trialParentID) + result = -1; + else /* parent dirID's are equal, compare names */ + result = FastRelString(searchKey->nodeName, trialKey->nodeName); + + return result; +} + + +/* + * Compare two HFS+ catalog keys + * + * Result: +n search key > trial key + * 0 search key = trial key + * -n search key < trial key + */ +int +CompareExtendedCatalogKeys(HFSPlusCatalogKey *searchKey, HFSPlusCatalogKey *trialKey) +{ + cnid_t searchParentID, trialParentID; + int result; + + searchParentID = searchKey->parentID; + trialParentID = trialKey->parentID; + + if (searchParentID > trialParentID) { + result = 1; + } + else if (searchParentID < trialParentID) { + result = -1; + } else { + /* parent node ID's are equal, compare names */ + if ( searchKey->nodeName.length == 0 || trialKey->nodeName.length == 0 ) + result = searchKey->nodeName.length - trialKey->nodeName.length; + else + result = FastUnicodeCompare(&searchKey->nodeName.unicode[0], + searchKey->nodeName.length, + &trialKey->nodeName.unicode[0], + trialKey->nodeName.length); + } + + return result; +} + + /* * buildkey - build a Catalog b-tree key from a cnode descriptor */ @@ -2146,6 +2498,26 @@ exit: return MacToVFSError(result); } +/* + * 
getkeyplusattr - From id, fetch the key and the bsd attrs for a file/dir (could pass + * null arguments to cat_idlookup instead, but we save around 10% by not building the + * cat_desc here). Both key and attrp must point to real structures. + */ +__private_extern__ +int +cat_getkeyplusattr(struct hfsmount *hfsmp, cnid_t cnid, CatalogKey * key, struct cat_attr *attrp) +{ + int result; + + result = getkey(hfsmp, cnid, key); + + if (result == 0) { + result = cat_lookupbykey(hfsmp, key, 0, 0, NULL, attrp, NULL, NULL); + } + + return MacToVFSError(result); +} + /* * buildrecord - build a default catalog directory or file record @@ -2182,33 +2554,41 @@ buildrecord(struct cat_attr *attrp, cnid_t cnid, int std_hfs, u_int32_t encoding struct FndrFileInfo * fip = NULL; if (type == S_IFDIR) { - bzero(crp, sizeof(HFSPlusCatalogFolder)); crp->recordType = kHFSPlusFolderRecord; + crp->hfsPlusFolder.flags = 0; + crp->hfsPlusFolder.valence = 0; crp->hfsPlusFolder.folderID = cnid; crp->hfsPlusFolder.createDate = createtime; crp->hfsPlusFolder.contentModDate = createtime; - crp->hfsPlusFolder.accessDate = createtime; crp->hfsPlusFolder.attributeModDate = createtime; + crp->hfsPlusFolder.accessDate = createtime; + crp->hfsPlusFolder.backupDate = 0; crp->hfsPlusFolder.textEncoding = encoding; + crp->hfsPlusFolder.attrBlocks = 0; bcopy(attrp->ca_finderinfo, &crp->hfsPlusFolder.userInfo, 32); bsdp = &crp->hfsPlusFolder.bsdInfo; + bsdp->special.rawDevice = 0; *recordSize = sizeof(HFSPlusCatalogFolder); } else { - bzero(crp, sizeof(HFSPlusCatalogFile)); crp->recordType = kHFSPlusFileRecord; + crp->hfsPlusFile.flags = kHFSThreadExistsMask; + crp->hfsPlusFile.reserved1 = 0; crp->hfsPlusFile.fileID = cnid; crp->hfsPlusFile.createDate = createtime; crp->hfsPlusFile.contentModDate = createtime; crp->hfsPlusFile.accessDate = createtime; crp->hfsPlusFile.attributeModDate = createtime; - crp->hfsPlusFile.flags |= kHFSThreadExistsMask; + crp->hfsPlusFile.backupDate = 0; crp->hfsPlusFile.textEncoding = encoding; + crp->hfsPlusFile.attrBlocks = 0; bsdp = &crp->hfsPlusFile.bsdInfo; + bsdp->special.rawDevice = 0; switch(type) { case S_IFBLK: case S_IFCHR: /* BLK/CHR need to save the device info */ bsdp->special.rawDevice = attrp->ca_rdev; + bzero(&crp->hfsPlusFile.userInfo, 32); break; case S_IFREG: /* Hardlink links need to save the linkref */ @@ -2224,6 +2604,7 @@ buildrecord(struct cat_attr *attrp, cnid_t cnid, int std_hfs, u_int32_t encoding bcopy(attrp->ca_finderinfo, &crp->hfsPlusFile.userInfo, 32); break; } + bzero(&crp->hfsPlusFile.dataFork, 2*sizeof(HFSPlusForkData)); *recordSize = sizeof(HFSPlusCatalogFile); } bsdp->ownerID = attrp->ca_uid; @@ -2244,13 +2625,13 @@ builddesc(const HFSPlusCatalogKey *key, cnid_t cnid, u_long hint, u_long encodin { int result = 0; char * nameptr; - long bufsize; + size_t bufsize; size_t utf8len; char tmpbuff[128]; /* guess a size... 
*/ bufsize = (3 * key->nodeName.length) + 1; - if (bufsize >= sizeof(tmpbuff)-1) { + if (bufsize >= sizeof(tmpbuff) - 1) { MALLOC(nameptr, char *, bufsize, M_TEMP, M_WAITOK); } else { nameptr = &tmpbuff[0]; @@ -2274,7 +2655,7 @@ builddesc(const HFSPlusCatalogKey *key, cnid_t cnid, u_long hint, u_long encodin bufsize, ':', 0); } descp->cd_parentcnid = key->parentID; - descp->cd_nameptr = add_name(nameptr, utf8len, 0, 0); + descp->cd_nameptr = vfs_addname(nameptr, utf8len, 0, 0); descp->cd_namelen = utf8len; descp->cd_cnid = cnid; descp->cd_hint = hint; @@ -2299,10 +2680,11 @@ getbsdattr(struct hfsmount *hfsmp, const struct HFSPlusCatalogFile *crp, struct int isDirectory = (crp->recordType == kHFSPlusFolderRecord); const struct HFSPlusBSDInfo *bsd = &crp->bsdInfo; + attrp->ca_recflags = crp->flags; attrp->ca_nlink = 1; attrp->ca_atime = to_bsd_time(crp->accessDate); + attrp->ca_atimeondisk = attrp->ca_atime; attrp->ca_mtime = to_bsd_time(crp->contentModDate); - attrp->ca_mtime_nsec = 0; attrp->ca_ctime = to_bsd_time(crp->attributeModDate); attrp->ca_itime = to_bsd_time(crp->createDate); attrp->ca_btime = to_bsd_time(crp->backupDate); @@ -2334,7 +2716,7 @@ getbsdattr(struct hfsmount *hfsmp, const struct HFSPlusCatalogFile *crp, struct break; } - if (HFSTOVFS(hfsmp)->mnt_flag & MNT_UNKNOWNPERMISSIONS) { + if (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) { /* * Override the permissions as determined by the mount auguments * in ALMOST the same way unset permissions are treated but keep @@ -2354,6 +2736,7 @@ getbsdattr(struct hfsmount *hfsmp, const struct HFSPlusCatalogFile *crp, struct } attrp->ca_nlink = 2 + ((HFSPlusCatalogFolder *)crp)->valence; attrp->ca_entries = ((HFSPlusCatalogFolder *)crp)->valence; + attrp->ca_attrblks = ((HFSPlusCatalogFolder *)crp)->attrBlocks; } else { /* Keep IMMUTABLE bits in sync with HFS locked flag */ if (crp->flags & kHFSFileLockedMask) { @@ -2367,6 +2750,7 @@ getbsdattr(struct hfsmount *hfsmp, const struct HFSPlusCatalogFile *crp, struct } /* get total blocks (both forks) */ attrp->ca_blocks = crp->dataFork.totalBlocks + crp->resourceFork.totalBlocks; + attrp->ca_attrblks = crp->attrBlocks; } attrp->ca_fileid = crp->fileID; @@ -2485,7 +2869,7 @@ promoteattr(struct hfsmount *hfsmp, const CatalogRecord *dataPtr, struct HFSPlus crp->attributeModDate = crp->contentModDate; crp->accessDate = crp->contentModDate; bzero(&crp->bsdInfo, sizeof(HFSPlusBSDInfo)); - crp->reserved2 = 0; + crp->attrBlocks = 0; } /* @@ -2590,7 +2974,7 @@ getcnid(const CatalogRecord *crp) cnid = crp->hfsPlusFile.fileID; break; default: - panic("hfs: getcnid: unknown recordType (crp @ 0x%x)\n", crp); + printf("hfs: getcnid: unknown recordType (crp @ 0x%x)\n", crp); break; } @@ -2633,4 +3017,3 @@ isadir(const CatalogRecord *crp) crp->recordType == kHFSPlusFolderRecord); } - diff --git a/bsd/hfs/hfs_catalog.h b/bsd/hfs/hfs_catalog.h index 0f67eadf8..991478558 100644 --- a/bsd/hfs/hfs_catalog.h +++ b/bsd/hfs/hfs_catalog.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2002-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -26,9 +26,7 @@ #ifdef KERNEL #ifdef __APPLE_API_PRIVATE -#include <sys/namei.h> #include <sys/vnode.h> -#include <sys/lock.h> #include <hfs/hfs_format.h> @@ -71,22 +69,24 @@ struct cat_desc { struct cat_attr { cnid_t ca_fileid; /* inode number (for stat) normally == cnid */ mode_t ca_mode; /* file access mode and type (16 bits) */ - nlink_t ca_nlink; /* file link count (16 bit integer) */ + u_int16_t ca_recflags; /* catalog record flags (16 bit integer) */ + u_int32_t ca_nlink; /* file link count */ uid_t ca_uid; /* file owner */ gid_t ca_gid; /* file group */ dev_t ca_rdev; /* device a special file represents */ time_t ca_atime; /* last access time */ + time_t ca_atimeondisk; /* access time value on disk */ time_t ca_mtime; /* last data modification time */ - int32_t ca_mtime_nsec; /* last data modification time nanosec */ time_t ca_ctime; /* last file status change */ time_t ca_itime; /* file initialization time */ time_t ca_btime; /* last backup time */ - u_long ca_flags; /* status flags (chflags) */ + u_int32_t ca_flags; /* status flags (chflags) */ union { u_int32_t cau_blocks; /* total file blocks used (rsrc + data) */ u_int32_t cau_entries; /* total directory entries (valence) */ } ca_union; u_int8_t ca_finderinfo[32]; /* Opaque Finder information */ + u_int32_t ca_attrblks; /* cached count of attribute data blocks */ }; /* Aliases for common fields */ #define ca_blocks ca_union.cau_blocks @@ -112,6 +112,26 @@ struct cat_fork { #define cf_bytesread cf_union.cfu_bytesread +/* + * Directory Hint + * Used to hold state across directory enumerations. + * + */ +struct directoryhint { + SLIST_ENTRY(directoryhint) dh_link; /* chain */ + int dh_index; /* index into directory (zero relative) */ + u_int32_t dh_time; + struct cat_desc dh_desc; /* entry's descriptor */ +}; +typedef struct directoryhint directoryhint_t; + +#define HFS_MAXDIRHINTS 32 +#define HFS_DIRHINT_TTL 45 + +#define HFS_INDEX_MASK 0x03ffffff +#define HFS_INDEX_BITS 26 + + /* * Catalog Node Entry * @@ -160,6 +180,26 @@ typedef struct cat_cookie_t { char opaque[24]; } cat_cookie_t; +/* Universal catalog key */ +union CatalogKey { + HFSCatalogKey hfs; + HFSPlusCatalogKey hfsPlus; +}; +typedef union CatalogKey CatalogKey; + +/* Universal catalog data record */ +union CatalogRecord { + int16_t recordType; + HFSCatalogFolder hfsFolder; + HFSCatalogFile hfsFile; + HFSCatalogThread hfsThread; + HFSPlusCatalogFolder hfsPlusFolder; + HFSPlusCatalogFile hfsPlusFile; + HFSPlusCatalogThread hfsPlusThread; +}; +typedef union CatalogRecord CatalogRecord; + + /* * Catalog Interface * @@ -186,7 +226,8 @@ extern int cat_lookup ( struct hfsmount *hfsmp, int wantrsrc, struct cat_desc *outdescp, struct cat_attr *attrp, - struct cat_fork *forkp); + struct cat_fork *forkp, + cnid_t *desc_cnid); extern int cat_idlookup (struct hfsmount *hfsmp, cnid_t cnid, @@ -194,10 +235,13 @@ extern int cat_idlookup (struct hfsmount *hfsmp, struct cat_attr *attrp, struct cat_fork *forkp); +extern int cat_findname (struct hfsmount *hfsmp, + cnid_t cnid, + struct cat_desc *outdescp); + extern int cat_getentriesattr( struct hfsmount *hfsmp, - struct cat_desc *prevdesc, - int index, + directoryhint_t *dirhint, struct cat_entrylist *ce_list); extern int cat_rename ( struct hfsmount * hfsmp, @@ -214,12 +258,11 @@ extern int cat_update ( struct hfsmount *hfsmp, extern int cat_getdirentries( struct hfsmount *hfsmp, - struct cat_desc *descp, int entrycnt, - struct uio *uio, - int *eofflag, - u_long *cookies, - int ncookies); + 
directoryhint_t *dirhint, + uio_t uio, + int extended, + int * items); extern int cat_insertfilethread ( struct hfsmount *hfsmp, @@ -240,6 +283,38 @@ extern int cat_binarykeycompare( HFSPlusCatalogKey *searchKey, HFSPlusCatalogKey *trialKey); +extern int CompareCatalogKeys( + HFSCatalogKey *searchKey, + HFSCatalogKey *trialKey); + +extern int CompareExtendedCatalogKeys( + HFSPlusCatalogKey *searchKey, + HFSPlusCatalogKey *trialKey); + +extern void cat_convertattr( + struct hfsmount *hfsmp, + CatalogRecord * recp, + struct cat_attr *attrp, + struct cat_fork *datafp, + struct cat_fork *rsrcfp); + +extern int cat_convertkey( + struct hfsmount *hfsmp, + CatalogKey *key, + CatalogRecord * recp, + struct cat_desc *descp); + +extern int resolvelink( + struct hfsmount *hfsmp, + u_long linkref, + struct HFSPlusCatalogFile *recp); + +extern int cat_getkeyplusattr( + struct hfsmount *hfsmp, + cnid_t cnid, + CatalogKey *key, + struct cat_attr *attrp); + #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ #endif /* __HFS_CATALOG__ */ diff --git a/bsd/hfs/hfs_chash.c b/bsd/hfs/hfs_chash.c index 6a4b950ee..1cbaf8186 100644 --- a/bsd/hfs/hfs_chash.c +++ b/bsd/hfs/hfs_chash.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2002-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -59,12 +59,22 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/vnode.h> +#include <sys/kernel.h> #include <sys/malloc.h> #include <sys/proc.h> #include <sys/queue.h> + +#include "hfs.h" /* XXX bringup */ #include "hfs_cnode.h" +extern lck_attr_t * hfs_lock_attr; +extern lck_grp_t * hfs_mutex_group; +extern lck_grp_t * hfs_rwlock_group; + +lck_grp_t * chash_lck_grp; +lck_grp_attr_t * chash_lck_grp_attr; +lck_attr_t * chash_lck_attr; /* * Structures associated with cnode caching. @@ -72,7 +82,9 @@ LIST_HEAD(cnodehashhead, cnode) *cnodehashtbl; u_long cnodehash; /* size of hash table - 1 */ #define CNODEHASH(device, inum) (&cnodehashtbl[((device) + (inum)) & cnodehash]) -struct slock hfs_chash_slock; + +lck_mtx_t hfs_chash_mutex; + /* * Initialize cnode hash table. @@ -82,7 +94,15 @@ void hfs_chashinit() { cnodehashtbl = hashinit(desiredvnodes, M_HFSMNT, &cnodehash); - simple_lock_init(&hfs_chash_slock); + + chash_lck_grp_attr= lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(chash_lck_grp_attr); + chash_lck_grp = lck_grp_alloc_init("cnode_hash", chash_lck_grp_attr); + + chash_lck_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(chash_lck_attr); + + lck_mtx_init(&hfs_chash_mutex, chash_lck_grp, chash_lck_attr); } @@ -90,123 +110,288 @@ hfs_chashinit() * Use the device, inum pair to find the incore cnode. * * If it is in core, but locked, wait for it. - * - * If the requested vnode (fork) is not available, then - * take a reference on the other vnode (fork) so that - * the upcoming getnewvnode can not aquire it. */ __private_extern__ -struct cnode * -hfs_chashget(dev_t dev, ino_t inum, int wantrsrc, - struct vnode **vpp, struct vnode **rvpp) +struct vnode * +hfs_chash_getvnode(dev_t dev, ino_t inum, int wantrsrc, int skiplock) { - struct proc *p = current_proc(); struct cnode *cp; struct vnode *vp; int error; + uint32_t vid; - *vpp = NULLVP; - *rvpp = NULLVP; /* * Go through the hash list * If a cnode is in the process of being cleaned out or being * allocated, wait for it to be finished and then try again. 
*/ loop: - simple_lock(&hfs_chash_slock); + lck_mtx_lock(&hfs_chash_mutex); for (cp = CNODEHASH(dev, inum)->lh_first; cp; cp = cp->c_hash.le_next) { if ((cp->c_fileid != inum) || (cp->c_dev != dev)) continue; - if (ISSET(cp->c_flag, C_ALLOC)) { - /* - * cnode is being created. Wait for it to finish. - */ - SET(cp->c_flag, C_WALLOC); - simple_unlock(&hfs_chash_slock); - (void) tsleep((caddr_t)cp, PINOD, "hfs_chashget-1", 0); + /* Wait if cnode is being created or reclaimed. */ + if (ISSET(cp->c_hflag, H_ALLOC | H_TRANSIT | H_ATTACH)) { + SET(cp->c_hflag, H_WAITING); + + (void) msleep(cp, &hfs_chash_mutex, PDROP | PINOD, + "hfs_chash_getvnode", 0); goto loop; - } - if (ISSET(cp->c_flag, C_TRANSIT)) { - /* - * cnode is getting reclaimed wait for - * the operation to complete and return - * error + } + /* + * Skip cnodes that are not in the name space anymore + * note that this check is done outside of the proper + * lock to catch nodes already in this state... this + * state must be rechecked after we acquire the cnode lock + */ + if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { + continue; + } + /* Obtain the desired vnode. */ + vp = wantrsrc ? cp->c_rsrc_vp : cp->c_vp; + if (vp == NULLVP) + goto exit; + + vid = vnode_vid(vp); + lck_mtx_unlock(&hfs_chash_mutex); + + if ((error = vnode_getwithvid(vp, vid))) { + /* + * If vnode is being reclaimed, or has + * already changed identity, no need to wait */ - SET(cp->c_flag, C_WTRANSIT); - simple_unlock(&hfs_chash_slock); - (void)tsleep((caddr_t)cp, PINOD, "hfs_chashget-2", 0); - goto loop; + return (NULL); } - if (cp->c_flag & (C_NOEXISTS | C_DELETED)) + if (!skiplock && hfs_lock(cp, HFS_EXCLUSIVE_LOCK) != 0) { + vnode_put(vp); + return (NULL); + } + + /* + * Skip cnodes that are not in the name space anymore + * we need to check again with the cnode lock held + * because we may have blocked acquiring the vnode ref + * or the lock on the cnode which would allow the node + * to be unlinked + */ + if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { + if (!skiplock) + hfs_unlock(cp); + vnode_put(vp); + + return (NULL); + } + return (vp); + } +exit: + lck_mtx_unlock(&hfs_chash_mutex); + return (NULL); +} + + +/* + * Use the device, fileid pair to find the incore cnode. + * If no cnode if found one is created + * + * If it is in core, but locked, wait for it. + */ +__private_extern__ +int +hfs_chash_snoop(dev_t dev, ino_t inum, int (*callout)(const struct cat_desc *, + const struct cat_attr *, void *), void * arg) +{ + struct cnode *cp; + int result = ENOENT; + + /* + * Go through the hash list + * If a cnode is in the process of being cleaned out or being + * allocated, wait for it to be finished and then try again. + */ + lck_mtx_lock(&hfs_chash_mutex); + for (cp = CNODEHASH(dev, inum)->lh_first; cp; cp = cp->c_hash.le_next) { + if ((cp->c_fileid != inum) || (cp->c_dev != dev)) continue; + /* Skip cnodes being created or reclaimed. */ + if (!ISSET(cp->c_hflag, H_ALLOC | H_TRANSIT | H_ATTACH)) { + result = callout(&cp->c_desc, &cp->c_attr, arg); + } + break; + } + lck_mtx_unlock(&hfs_chash_mutex); + return (result); +} + +/* + * Use the device, fileid pair to find the incore cnode. + * If no cnode if found one is created + * + * If it is in core, but locked, wait for it. 
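/*
 * Both hash lookups in this file use the vnode-id revalidation idiom
 * that comes with the new VFS interfaces: capture vnode_vid(vp) while
 * the hash mutex is still held, drop the mutex, call
 * vnode_getwithvid() (which fails if the vnode was reclaimed or
 * recycled in the window), then recheck C_NOEXISTS/C_DELETED once the
 * cnode lock is taken. Boiled down:
 */
vid = vnode_vid(vp);			/* capture identity under the mutex */
lck_mtx_unlock(&hfs_chash_mutex);

if (vnode_getwithvid(vp, vid))		/* reclaimed or reused: give up */
	return (NULL);

if (cp->c_flag & (C_NOEXISTS | C_DELETED)) {
	vnode_put(vp);			/* went away while we blocked */
	return (NULL);
}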
+ */ +__private_extern__ +struct cnode * +hfs_chash_getcnode(dev_t dev, ino_t inum, struct vnode **vpp, int wantrsrc, int skiplock) +{ + struct cnode *cp; + struct cnode *ncp = NULL; + vnode_t vp; + uint32_t vid; + + /* + * Go through the hash list + * If a cnode is in the process of being cleaned out or being + * allocated, wait for it to be finished and then try again. + */ +loop: + lck_mtx_lock(&hfs_chash_mutex); + +loop_with_lock: + for (cp = CNODEHASH(dev, inum)->lh_first; cp; cp = cp->c_hash.le_next) { + if ((cp->c_fileid != inum) || (cp->c_dev != dev)) + continue; /* - * Try getting the desired vnode first. If - * it isn't available then take a reference - * on the other vnode. + * Wait if cnode is being created, attached to or reclaimed. */ - vp = wantrsrc ? cp->c_rsrc_vp : cp->c_vp; - if (vp == NULLVP) - vp = wantrsrc ? cp->c_vp : cp->c_rsrc_vp; - if (vp == NULLVP) - panic("hfs_chashget: orphaned cnode in hash"); + if (ISSET(cp->c_hflag, H_ALLOC | H_ATTACH | H_TRANSIT)) { + SET(cp->c_hflag, H_WAITING); - simple_lock(&vp->v_interlock); - simple_unlock(&hfs_chash_slock); - if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) - goto loop; - else if (cp->c_flag & C_NOEXISTS) { + (void) msleep(cp, &hfs_chash_mutex, PINOD, + "hfs_chash_getcnode", 0); + goto loop_with_lock; + } + /* + * Skip cnodes that are not in the name space anymore + * note that this check is done outside of the proper + * lock to catch nodes already in this state... this + * state must be rechecked after we acquire the cnode lock + */ + if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { + continue; + } + vp = wantrsrc ? cp->c_rsrc_vp : cp->c_vp; + if (vp == NULL) { /* - * While we were blocked the cnode got deleted. + * The desired vnode isn't there so tag the cnode. */ - vput(vp); - goto loop; + SET(cp->c_hflag, H_ATTACH); + + lck_mtx_unlock(&hfs_chash_mutex); + } else { + vid = vnode_vid(vp); + + lck_mtx_unlock(&hfs_chash_mutex); + + if (vnode_getwithvid(vp, vid)) + goto loop; + } + if (ncp) { + /* + * someone else won the race to create + * this cnode and add it to the hash + * just dump our allocation + */ + FREE_ZONE(ncp, sizeof(struct cnode), M_HFSNODE); + ncp = NULL; } + if (!skiplock && hfs_lock(cp, HFS_EXCLUSIVE_LOCK) != 0) { + if (vp != NULLVP) + vnode_put(vp); + lck_mtx_lock(&hfs_chash_mutex); - if (VNODE_IS_RSRC(vp)) - *rvpp = vp; - else - *vpp = vp; + if (vp == NULLVP) + CLR(cp->c_hflag, H_ATTACH); + goto loop_with_lock; + } /* - * Note that vget can block before aquiring the - * cnode lock. So we need to check if the vnode - * we wanted was created while we blocked. 
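hfs_chash_getcnode() above allocates its cnode optimistically: it drops the hash mutex before blocking in the allocator, then rescans and discards the allocation if another thread inserted the node first. A generic, simplified sketch of that lookup-or-create pattern (the demo names and M_TEMP usage are illustrative, not from the patch):

#include <sys/param.h>
#include <sys/queue.h>
#include <sys/malloc.h>
#include <kern/locks.h>

struct demo_node {
    LIST_ENTRY(demo_node) link;
    int key;
};
LIST_HEAD(demo_head, demo_node);

static struct demo_node *
demo_lookup_or_create(struct demo_head *head, lck_mtx_t *mtx, int key)
{
    struct demo_node *np, *newnp = NULL;

loop:
    lck_mtx_lock(mtx);
    LIST_FOREACH(np, head, link) {
        if (np->key != key)
            continue;
        lck_mtx_unlock(mtx);
        if (newnp != NULL)
            FREE(newnp, M_TEMP);    /* lost the race; discard ours */
        return (np);
    }
    if (newnp == NULL) {
        /* Can't block in the allocator with the mutex held. */
        lck_mtx_unlock(mtx);
        MALLOC(newnp, struct demo_node *, sizeof(*newnp), M_TEMP, M_WAITOK);
        newnp->key = key;
        goto loop;    /* re-scan: the node may exist by now */
    }
    LIST_INSERT_HEAD(head, newnp, link);
    lck_mtx_unlock(mtx);
    return (newnp);
}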
+ * Skip cnodes that are not in the name space anymore + * we need to check again with the cnode lock held + * because we may have blocked acquiring the vnode ref + * or the lock on the cnode which would allow the node + * to be unlinked */ - if (wantrsrc && *rvpp == NULL && cp->c_rsrc_vp) { - error = vget(cp->c_rsrc_vp, 0, p); - vrele(*vpp); /* ref no longer needed */ - *vpp = NULL; - if (error) - goto loop; - *rvpp = cp->c_rsrc_vp; - - } else if (!wantrsrc && *vpp == NULL && cp->c_vp) { - error = vget(cp->c_vp, 0, p); - vrele(*rvpp); /* ref no longer needed */ - *rvpp = NULL; - if (error) - goto loop; - *vpp = cp->c_vp; + if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { + if (!skiplock) + hfs_unlock(cp); + if (vp != NULLVP) + vnode_put(vp); + lck_mtx_lock(&hfs_chash_mutex); + + if (vp == NULLVP) + CLR(cp->c_hflag, H_ATTACH); + goto loop_with_lock; } + *vpp = vp; return (cp); } - simple_unlock(&hfs_chash_slock); - return (NULL); + + /* + * Allocate a new cnode + */ + if (skiplock) + panic("%s - should never get here when skiplock is set \n", __FUNCTION__); + + if (ncp == NULL) { + lck_mtx_unlock(&hfs_chash_mutex); + + MALLOC_ZONE(ncp, struct cnode *, sizeof(struct cnode), M_HFSNODE, M_WAITOK); + /* + * since we dropped the chash lock, + * we need to go back and re-verify + * that this node hasn't come into + * existence... + */ + goto loop; + } + bzero(ncp, sizeof(struct cnode)); + SET(ncp->c_hflag, H_ALLOC); + ncp->c_fileid = inum; + ncp->c_dev = dev; + + lck_rw_init(&ncp->c_rwlock, hfs_rwlock_group, hfs_lock_attr); + if (!skiplock) + (void) hfs_lock(ncp, HFS_EXCLUSIVE_LOCK); + + /* Insert the new cnode with its H_ALLOC flag set */ + LIST_INSERT_HEAD(CNODEHASH(dev, inum), ncp, c_hash); + lck_mtx_unlock(&hfs_chash_mutex); + + *vpp = NULL; + return (ncp); +} + + +__private_extern__ +void +hfs_chashwakeup(struct cnode *cp, int hflags) +{ + lck_mtx_lock(&hfs_chash_mutex); + + CLR(cp->c_hflag, hflags); + + if (ISSET(cp->c_hflag, H_WAITING)) { + CLR(cp->c_hflag, H_WAITING); + wakeup((caddr_t)cp); + } + lck_mtx_unlock(&hfs_chash_mutex); } /* - * Insert a cnode into the hash table. + * Re-hash two cnodes in the hash table. */ __private_extern__ void -hfs_chashinsert(struct cnode *cp) +hfs_chash_rehash(struct cnode *cp1, struct cnode *cp2) { - if (cp->c_fileid != 0) { - simple_lock(&hfs_chash_slock); - LIST_INSERT_HEAD(CNODEHASH(cp->c_dev, cp->c_fileid), cp, c_hash); - simple_unlock(&hfs_chash_slock); - } + lck_mtx_lock(&hfs_chash_mutex); + + LIST_REMOVE(cp1, c_hash); + LIST_REMOVE(cp2, c_hash); + LIST_INSERT_HEAD(CNODEHASH(cp1->c_dev, cp1->c_fileid), cp1, c_hash); + LIST_INSERT_HEAD(CNODEHASH(cp2->c_dev, cp2->c_fileid), cp2, c_hash); + + lck_mtx_unlock(&hfs_chash_mutex); } @@ -214,13 +399,56 @@ hfs_chashinsert(struct cnode *cp) * Remove a cnode from the hash table. */ __private_extern__ -void +int hfs_chashremove(struct cnode *cp) { - simple_lock(&hfs_chash_slock); + lck_mtx_lock(&hfs_chash_mutex); + + /* Check if a vnode is getting attached */ + if (ISSET(cp->c_hflag, H_ATTACH)) { + lck_mtx_unlock(&hfs_chash_mutex); + return (EBUSY); + } + LIST_REMOVE(cp, c_hash); + cp->c_hash.le_next = NULL; + cp->c_hash.le_prev = NULL; + + lck_mtx_unlock(&hfs_chash_mutex); + return (0); +} + +/* + * Remove a cnode from the hash table and wakeup any waiters.
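The H_WAITING protocol used by hfs_chashwakeup() above is the classic flag-plus-channel handshake: sleepers mark the object before sleeping, and whoever clears the busy state wakes the channel only if the mark is set. A reduced sketch (the D_* flags and demo_obj are stand-ins for H_ALLOC/H_WAITING and the cnode):

#include <sys/param.h>
#include <sys/systm.h>
#include <kern/locks.h>

struct demo_obj { int flags; };

#define D_BUSY    0x01   /* stand-in for H_ALLOC | H_TRANSIT | H_ATTACH */
#define D_WAITING 0x02   /* stand-in for H_WAITING */

static void
demo_wait_while_busy(struct demo_obj *op, lck_mtx_t *mtx)
{
    lck_mtx_lock(mtx);
    while (op->flags & D_BUSY) {
        op->flags |= D_WAITING;
        /* msleep() atomically drops and retakes the mutex around the sleep. */
        (void) msleep(op, mtx, PINOD, "demo_wait", 0);
    }
    lck_mtx_unlock(mtx);
}

static void
demo_end_busy(struct demo_obj *op, lck_mtx_t *mtx)
{
    lck_mtx_lock(mtx);
    op->flags &= ~D_BUSY;
    if (op->flags & D_WAITING) {
        op->flags &= ~D_WAITING;
        wakeup((caddr_t)op);    /* wake everyone sleeping on op */
    }
    lck_mtx_unlock(mtx);
}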
+ */ +__private_extern__ +void +hfs_chash_abort(struct cnode *cp) +{ + lck_mtx_lock(&hfs_chash_mutex); + LIST_REMOVE(cp, c_hash); cp->c_hash.le_next = NULL; cp->c_hash.le_prev = NULL; - simple_unlock(&hfs_chash_slock); + + CLR(cp->c_hflag, H_ATTACH | H_ALLOC); + if (ISSET(cp->c_hflag, H_WAITING)) { + CLR(cp->c_hflag, H_WAITING); + wakeup((caddr_t)cp); + } + lck_mtx_unlock(&hfs_chash_mutex); } + +/* + * mark a cnode as in transition + */ +__private_extern__ +void +hfs_chash_mark_in_transit(struct cnode *cp) +{ + lck_mtx_lock(&hfs_chash_mutex); + + SET(cp->c_hflag, H_TRANSIT); + + lck_mtx_unlock(&hfs_chash_mutex); +} diff --git a/bsd/hfs/hfs_cnode.c b/bsd/hfs/hfs_cnode.c index 3abf79b4f..1fb30d020 100644 --- a/bsd/hfs/hfs_cnode.c +++ b/bsd/hfs/hfs_cnode.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2002-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -26,8 +26,12 @@ #include <sys/mount.h> #include <sys/kernel.h> #include <sys/malloc.h> +#include <sys/time.h> #include <sys/ubc.h> #include <sys/quota.h> +#include <sys/kdebug.h> + +#include <kern/locks.h> #include <miscfs/specfs/specdev.h> #include <miscfs/fifofs/fifo.h> @@ -39,8 +43,21 @@ extern int prtactive; +extern lck_attr_t * hfs_lock_attr; +extern lck_grp_t * hfs_mutex_group; +extern lck_grp_t * hfs_rwlock_group; + +static int hfs_filedone(struct vnode *vp, vfs_context_t context); + +static void hfs_reclaim_cnode(struct cnode *); + +static int hfs_valid_cnode(struct hfsmount *, struct vnode *, struct componentname *, cnid_t); + +static int hfs_isordered(struct cnode *, struct cnode *); + +int hfs_vnop_inactive(struct vnop_inactive_args *); -extern void hfs_relnamehints(struct cnode *dcp); +int hfs_vnop_reclaim(struct vnop_reclaim_args *); /* @@ -48,35 +65,64 @@ extern void hfs_relnamehints(struct cnode *dcp); */ __private_extern__ int -hfs_inactive(ap) - struct vop_inactive_args /* { - struct vnode *a_vp; - } */ *ap; +hfs_vnop_inactive(struct vnop_inactive_args *ap) { struct vnode *vp = ap->a_vp; - struct cnode *cp = VTOC(vp); + struct cnode *cp; struct hfsmount *hfsmp = VTOHFS(vp); - struct proc *p = ap->a_p; - struct timeval tv; + struct proc *p = vfs_context_proc(ap->a_context); int error = 0; int recycle = 0; int forkcount = 0; int truncated = 0; - int started_tr = 0, grabbed_lock = 0; + int started_tr = 0; + int took_trunc_lock = 0; cat_cookie_t cookie; int cat_reserve = 0; + int lockflags; + enum vtype v_type; - if (prtactive && vp->v_usecount != 0) - vprint("hfs_inactive: pushing active", vp); + v_type = vnode_vtype(vp); + cp = VTOC(vp); + if ((hfsmp->hfs_flags & HFS_READ_ONLY) || vnode_issystem(vp)) { + return (0); + } /* * Ignore nodes related to stale file handles. */ - if (cp->c_mode == 0) - goto out; + if (cp->c_mode == 0) { + vnode_recycle(vp); + return (0); + } + + if ((v_type == VREG) && + (ISSET(cp->c_flag, C_DELETED) || VTOF(vp)->ff_blocks)) { + hfs_lock_truncate(cp, TRUE); + took_trunc_lock = 1; + } + + /* + * We do the ubc_setsize before we take the cnode + * lock and before the hfs_truncate (since we'll + * be inside a transaction).
+ */ + if ((v_type == VREG || v_type == VLNK) && + (cp->c_flag & C_DELETED) && + (VTOF(vp)->ff_blocks != 0)) { + ubc_setsize(vp, 0); + } + + (void) hfs_lock(cp, HFS_FORCE_LOCK); - if (hfsmp->hfs_flags & HFS_READ_ONLY) - goto out; + if (v_type == VREG && !ISSET(cp->c_flag, C_DELETED) && VTOF(vp)->ff_blocks) { + hfs_filedone(vp, ap->a_context); + } + /* + * Remove any directory hints + */ + if (v_type == VDIR) + hfs_reldirhints(cp, 0); if (cp->c_datafork) ++forkcount; @@ -84,9 +130,29 @@ hfs_inactive(ap) ++forkcount; /* If needed, get rid of any fork's data for a deleted file */ - if ((vp->v_type == VREG) && (cp->c_flag & C_DELETED)) { + if ((v_type == VREG || v_type == VLNK) && (cp->c_flag & C_DELETED)) { if (VTOF(vp)->ff_blocks != 0) { - error = VOP_TRUNCATE(vp, (off_t)0, IO_NDELAY, NOCRED, p); + // start the transaction out here so that + // the truncate and the removal of the file + // are all in one transaction. otherwise + // because this cnode is marked for deletion + // the truncate won't cause the catalog entry + // to get updated which means that we could + // free blocks but still keep a reference to + // them in the catalog entry and then double + // free them later. + // + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out; + } + started_tr = 1; + + /* + * Since we're already inside a transaction, + * tell hfs_truncate to skip the ubc_setsize. + */ + error = hfs_truncate(vp, (off_t)0, IO_NDELAY, 1, ap->a_context); if (error) goto out; truncated = 1; @@ -103,21 +169,20 @@ hfs_inactive(ap) * Mark cnode in transit so that no one can get this * cnode from cnode hash. */ - SET(cp->c_flag, C_TRANSIT); + hfs_chash_mark_in_transit(cp); + cp->c_flag &= ~C_DELETED; + cp->c_flag |= C_NOEXISTS; // XXXdbg cp->c_rdev = 0; - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - grabbed_lock = 1; - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - error = EINVAL; - goto out; + if (started_tr == 0) { + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out; } started_tr = 1; } - + /* * Reserve some space in the Catalog file. */ @@ -126,14 +191,21 @@ hfs_inactive(ap) } cat_reserve = 1; - - /* Lock catalog b-tree */ - error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); - if (error) goto out; + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); if (cp->c_blocks > 0) printf("hfs_inactive: attempting to delete a non-empty file!"); + + // + // release the name pointer in the descriptor so that + // cat_delete() will use the file-id to do the deletion. + // in the case of hard links this is imperative (in the + // case of regular files the fileid and cnid are the + // same so it doesn't matter). + // + cat_releasedesc(&cp->c_desc); + /* * The descriptor name may be zero, * in which case the fileid is used. 
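The comment above explains why the truncate and the catalog removal must share one transaction: otherwise a deleted cnode's truncate would free blocks that a stale catalog entry still references. A condensed sketch of that bracket, reusing the hfs_start_transaction()/hfs_end_transaction() and hfs_truncate() calls shown in the patch (the catalog-removal helper is hypothetical):

static int demo_remove_catalog_entry(struct hfsmount *, vnode_t);  /* hypothetical */

static int
demo_truncate_deleted_file(struct hfsmount *hfsmp, vnode_t vp, vfs_context_t ctx)
{
    int error;

    /* One transaction covers both the truncate and the removal, so
       freed blocks can never be double-freed via a stale catalog entry. */
    if (hfs_start_transaction(hfsmp) != 0)
        return (EINVAL);

    /* Fourth argument 1 = skip ubc_setsize(); it was done pre-transaction. */
    error = hfs_truncate(vp, (off_t)0, IO_NDELAY, 1, ctx);
    if (error == 0)
        error = demo_remove_catalog_entry(hfsmp, vp);

    hfs_end_transaction(hfsmp);
    return (error);
}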
@@ -150,30 +222,32 @@ hfs_inactive(ap) &hfsmp->hfs_privdir_attr, NULL, NULL); } - /* Unlock catalog b-tree */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); - if (error) goto out; + if (error == 0) { + /* Delete any attributes, ignore errors */ + (void) hfs_removeallattr(hfsmp, cp->c_fileid); + } + + hfs_systemfile_unlock(hfsmp, lockflags); + + if (error) + goto out; #if QUOTA (void)hfs_chkiq(cp, -1, NOCRED, 0); #endif /* QUOTA */ cp->c_mode = 0; - cp->c_flag |= C_NOEXISTS | C_CHANGE | C_UPDATE; + cp->c_flag |= C_NOEXISTS; + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; if (error == 0) hfs_volupdate(hfsmp, VOL_RMFILE, 0); } - if (cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) { - tv = time; - // if the only thing being updated is the access time - // then set the modified bit too so that update will - // flush it to disk. otherwise it'll get dropped. - if ((cp->c_flag & C_CHANGEMASK) == C_ACCESS) { - cp->c_flag |= C_MODIFIED; - } - VOP_UPDATE(vp, &tv, &tv, 0); + if ((cp->c_flag & C_MODIFIED) || + cp->c_touch_acctime || cp->c_touch_chgtime || cp->c_touch_modtime) { + hfs_update(vp, 0); } out: if (cat_reserve) @@ -181,424 +255,256 @@ out: // XXXdbg - have to do this because a goto could have come here if (started_tr) { - journal_end_transaction(hfsmp->jnl); + hfs_end_transaction(hfsmp); started_tr = 0; } - if (grabbed_lock) { - hfs_global_shared_lock_release(hfsmp); - } - VOP_UNLOCK(vp, 0, p); + hfs_unlock(cp); + + if (took_trunc_lock) + hfs_unlock_truncate(cp); + /* * If we are done with the vnode, reclaim it * so that it can be reused immediately. */ if (cp->c_mode == 0 || recycle) - vrecycle(vp, (struct slock *)0, p); + vnode_recycle(vp); return (error); } +/* + * File clean-up (zero fill and shrink peof). + */ +static int +hfs_filedone(struct vnode *vp, vfs_context_t context) +{ + struct cnode *cp; + struct filefork *fp; + struct hfsmount *hfsmp; + off_t leof; + u_long blks, blocksize; + + cp = VTOC(vp); + fp = VTOF(vp); + hfsmp = VTOHFS(vp); + leof = fp->ff_size; + + if ((hfsmp->hfs_flags & HFS_READ_ONLY) || (fp->ff_blocks == 0)) + return (0); + + hfs_unlock(cp); + (void) cluster_push(vp, IO_CLOSE); + hfs_lock(cp, HFS_FORCE_LOCK); + + /* + * Explicitly zero out the areas of file + * that are currently marked invalid. + */ + while (!CIRCLEQ_EMPTY(&fp->ff_invalidranges)) { + struct rl_entry *invalid_range = CIRCLEQ_FIRST(&fp->ff_invalidranges); + off_t start = invalid_range->rl_start; + off_t end = invalid_range->rl_end; + + /* The range about to be written must be validated + * first, so that VNOP_BLOCKMAP() will return the + * appropriate mapping for the cluster code: + */ + rl_remove(start, end, &fp->ff_invalidranges); + + hfs_unlock(cp); + (void) cluster_write(vp, (struct uio *) 0, + leof, end + 1, start, (off_t)0, + IO_HEADZEROFILL | IO_NOZERODIRTY | IO_NOCACHE); + hfs_lock(cp, HFS_FORCE_LOCK); + cp->c_flag |= C_MODIFIED; + } + cp->c_flag &= ~C_ZFWANTSYNC; + cp->c_zftimeout = 0; + blocksize = VTOVCB(vp)->blockSize; + blks = leof / blocksize; + if (((off_t)blks * (off_t)blocksize) != leof) + blks++; + /* + * Shrink the peof to the smallest size necessary to contain the leof.
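The block arithmetic at the end of hfs_filedone() rounds the logical EOF up to whole allocation blocks before deciding whether the physical EOF can shrink. The same computation as a tiny helper, with a worked example:

/* Round a logical EOF up to whole allocation blocks, as hfs_filedone does. */
static u_long
demo_blocks_for_eof(off_t leof, u_long blocksize)
{
    u_long blks = leof / blocksize;

    if (((off_t)blks * (off_t)blocksize) != leof)
        blks++;    /* a partial final block still occupies a whole block */
    return (blks);
}

/* e.g. leof = 10000, blocksize = 4096: 10000/4096 = 2, 2*4096 != 10000, so 3 */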
+ */ + if (blks < fp->ff_blocks) + (void) hfs_truncate(vp, leof, IO_NDELAY, 0, context); + hfs_unlock(cp); + (void) cluster_push(vp, IO_CLOSE); + hfs_lock(cp, HFS_FORCE_LOCK); + + /* + * If the hfs_truncate didn't happen to flush the vnode's + * information out to disk, force it to be updated now that + * all invalid ranges have been zero-filled and validated: + */ + if (cp->c_flag & C_MODIFIED) { + hfs_update(vp, 0); + } + return (0); +} + /* * Reclaim a cnode so that it can be used for other purposes. */ __private_extern__ int -hfs_reclaim(ap) - struct vop_reclaim_args /* { - struct vnode *a_vp; - } */ *ap; +hfs_vnop_reclaim(struct vnop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; - struct cnode *cp = VTOC(vp); - struct vnode *devvp = NULL; + struct cnode *cp; struct filefork *fp = NULL; struct filefork *altfp = NULL; - int i; + int reclaim_cnode = 0; - if (prtactive && vp->v_usecount != 0) - vprint("hfs_reclaim(): pushing active", vp); + (void) hfs_lock(VTOC(vp), HFS_FORCE_LOCK); + cp = VTOC(vp); /* * Keep track of an inactive hot file. */ - (void) hfs_addhotfile(vp); + if (!vnode_isdir(vp) && !vnode_issystem(vp)) + (void) hfs_addhotfile(vp); - devvp = cp->c_devvp; /* For later releasing */ + vnode_removefsref(vp); /* * Find file fork for this vnode (if any) * Also check if another fork is active */ - if ((fp = cp->c_datafork) && (cp->c_vp == vp)) { + if (cp->c_vp == vp) { + fp = cp->c_datafork; + altfp = cp->c_rsrcfork; + cp->c_datafork = NULL; cp->c_vp = NULL; - altfp = cp->c_rsrcfork; - } else if ((fp = cp->c_rsrcfork) && (cp->c_rsrc_vp == vp)) { + } else if (cp->c_rsrc_vp == vp) { + fp = cp->c_rsrcfork; + altfp = cp->c_datafork; + cp->c_rsrcfork = NULL; cp->c_rsrc_vp = NULL; - if (VPARENT(vp) == cp->c_vp) { - cp->c_flag &= ~C_VPREFHELD; - } - altfp = cp->c_datafork; } else { - cp->c_vp = NULL; - fp = NULL; - altfp = NULL; + panic("hfs_vnop_reclaim: vp points to wrong cnode\n"); } - /* * On the last fork, remove the cnode from its hash chain. */ - if (altfp == NULL) - hfs_chashremove(cp); - - /* Release the file fork and related data (can block) */ + if (altfp == NULL) { + /* If we can't remove it then the cnode must persist! */ + if (hfs_chashremove(cp) == 0) + reclaim_cnode = 1; + /* + * Remove any directory hints + */ + if (vnode_isdir(vp)) { + hfs_reldirhints(cp, 0); + } + } + /* Release the file fork and related data */ if (fp) { - fp->ff_cp = NULL; /* Dump cached symlink data */ - if ((vp->v_type == VLNK) && (fp->ff_symlinkptr != NULL)) { + if (vnode_islnk(vp) && (fp->ff_symlinkptr != NULL)) { FREE(fp->ff_symlinkptr, M_TEMP); - fp->ff_symlinkptr = NULL; - } + } FREE_ZONE(fp, sizeof(struct filefork), M_HFSFORK); - fp = NULL; } - /* - * Purge old data structures associated with the cnode. - */ - cache_purge(vp); - if (devvp && altfp == NULL) { - cp->c_devvp = NULL; - vrele(devvp); - } - - vp->v_data = NULL; - /* * If there was only one active fork then we can release the cnode. 
*/ - if (altfp == NULL) { -#if QUOTA - for (i = 0; i < MAXQUOTAS; i++) { - if (cp->c_dquot[i] != NODQUOT) { - dqreclaim(vp, cp->c_dquot[i]); - cp->c_dquot[i] = NODQUOT; - } - } -#endif /* QUOTA */ - /* - * Free any left over directory indices - */ - if (vp->v_type == VDIR) - hfs_relnamehints(cp); - - /* - * If the descriptor has a name then release it - */ - if (cp->c_desc.cd_flags & CD_HASBUF) { - char *nameptr; - - nameptr = cp->c_desc.cd_nameptr; - cp->c_desc.cd_nameptr = 0; - cp->c_desc.cd_flags &= ~CD_HASBUF; - cp->c_desc.cd_namelen = 0; - remove_name(nameptr); - } - CLR(cp->c_flag, (C_ALLOC | C_TRANSIT)); - if (ISSET(cp->c_flag, C_WALLOC) || ISSET(cp->c_flag, C_WTRANSIT)) - wakeup(cp); - FREE_ZONE(cp, sizeof(struct cnode), M_HFSNODE); - + if (reclaim_cnode) { + hfs_chashwakeup(cp, H_ALLOC | H_TRANSIT); + hfs_reclaim_cnode(cp); + } else /* cnode in use */ { + hfs_unlock(cp); } + vnode_clearfsnode(vp); return (0); } -/* - * get a cnode - * - * called by hfs_lookup and hfs_vget (descp == NULL) - * - * returns a locked vnode for cnode for given cnid/fileid - */ -__private_extern__ -int -hfs_getcnode(struct hfsmount *hfsmp, cnid_t cnid, struct cat_desc *descp, int wantrsrc, - struct cat_attr *attrp, struct cat_fork *forkp, struct vnode **vpp) -{ - dev_t dev = hfsmp->hfs_raw_dev; - struct vnode *vp = NULL; - struct vnode *rvp = NULL; - struct vnode *new_vp = NULL; - struct cnode *cp = NULL; - struct proc *p = current_proc(); - int retval = E_NONE; - - /* Check if unmount in progress */ - if (HFSTOVFS(hfsmp)->mnt_kern_flag & MNTK_UNMOUNT) { - *vpp = NULL; - return (EPERM); - } - - /* - * Check the hash for an active cnode - */ - cp = hfs_chashget(dev, cnid, wantrsrc, &vp, &rvp); - if (cp != NULL) { - /* hide open files that have been deleted */ - if ((hfsmp->hfs_privdir_desc.cd_cnid != 0) - && (cp->c_parentcnid == hfsmp->hfs_privdir_desc.cd_cnid) - && (cp->c_nlink == 0)) { - retval = ENOENT; - goto exit; - } - - /* Hide private journal files */ - if (hfsmp->jnl && - (cp->c_parentcnid == kRootDirID) && - ((cp->c_cnid == hfsmp->hfs_jnlfileid) || - (cp->c_cnid == hfsmp->hfs_jnlinfoblkid))) { - retval = ENOENT; - goto exit; - } - - if (wantrsrc && rvp != NULL) { - vp = rvp; - rvp = NULL; - goto done; - } - if (!wantrsrc && vp != NULL) { - /* Hardlinks need an updated catalog descriptor */ - if (descp && cp->c_flag & C_HARDLINK) { - replace_desc(cp, descp); - } - /* We have a vnode so we're done. */ - goto done; - } - } - - /* - * There was no active vnode so get a new one. - * Use the existing cnode (if any). 
- */ - if (descp != NULL) { - /* - * hfs_lookup case, use descp, attrp and forkp - */ - retval = hfs_getnewvnode(hfsmp, cp, descp, wantrsrc, attrp, - forkp, &new_vp); - } else { - struct cat_desc cndesc = {0}; - struct cat_attr cnattr = {0}; - struct cat_fork cnfork = {0}; - - /* - * hfs_vget case, need to lookup entry (by file id) - */ - if (cnid == kRootParID) { - static char hfs_rootname[] = "/"; - - cndesc.cd_nameptr = &hfs_rootname[0]; - cndesc.cd_namelen = 1; - cndesc.cd_parentcnid = kRootParID; - cndesc.cd_cnid = kRootParID; - cndesc.cd_flags = CD_ISDIR; - - cnattr.ca_fileid = kRootParID; - cnattr.ca_nlink = 2; - cnattr.ca_entries = 1; - cnattr.ca_mode = (S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO); - } else { - /* Lock catalog b-tree */ - retval = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, p); - if (retval) - goto exit; - - retval = cat_idlookup(hfsmp, cnid, &cndesc, &cnattr, &cnfork); - - /* Unlock catalog b-tree */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); - if (retval) - goto exit; - - /* Hide open files that have been deleted */ - if ((hfsmp->hfs_privdir_desc.cd_cnid != 0) && - (cndesc.cd_parentcnid == hfsmp->hfs_privdir_desc.cd_cnid) && - (cnattr.ca_nlink == 0)) { - cat_releasedesc(&cndesc); - retval = ENOENT; - goto exit; - } - } - - retval = hfs_getnewvnode(hfsmp, cp, &cndesc, 0, &cnattr, &cnfork, &new_vp); - - /* Hardlinks may need an updated catalog descriptor */ - if (retval == 0 - && new_vp - && (VTOC(new_vp)->c_flag & C_HARDLINK) - && cndesc.cd_nameptr - && cndesc.cd_namelen > 0) { - replace_desc(VTOC(new_vp), &cndesc); - } - - cat_releasedesc(&cndesc); - } - -exit: - /* Release reference taken on opposite vnode (if any). */ - if (vp) - vrele(vp); - else if (rvp) - vrele(rvp); - - if (retval) { - *vpp = NULL; - return (retval); - } - vp = new_vp; -done: - /* The cnode's vnode should be in vp. 
*/ - if (vp == NULL) - panic("hfs_getcnode: missing vp!"); - - if (UBCISVALID(vp)) - UBCINFOCHECK("hfs_getcnode", vp); - *vpp = vp; - return (0); -} - +extern int (**hfs_vnodeop_p) (void *); +extern int (**hfs_specop_p) (void *); +extern int (**hfs_fifoop_p) (void *); /* * hfs_getnewvnode - get new default vnode * - * the vnode is returned locked + * The vnode is returned with an iocount and the cnode locked */ -extern int (**hfs_vnodeop_p) (void *); -extern int (**hfs_specop_p) (void *); -extern int (**hfs_fifoop_p) (void *); - __private_extern__ int -hfs_getnewvnode(struct hfsmount *hfsmp, struct cnode *cp, - struct cat_desc *descp, int wantrsrc, - struct cat_attr *attrp, struct cat_fork *forkp, +hfs_getnewvnode( + struct hfsmount *hfsmp, + struct vnode *dvp, + struct componentname *cnp, + struct cat_desc *descp, + int wantrsrc, + struct cat_attr *attrp, + struct cat_fork *forkp, struct vnode **vpp) { struct mount *mp = HFSTOVFS(hfsmp); struct vnode *vp = NULL; - struct vnode *rvp = NULL; - struct vnode *new_vp = NULL; - struct cnode *cp2 = NULL; + struct vnode **cvpp; + struct vnode *tvp = NULLVP; + struct cnode *cp = NULL; struct filefork *fp = NULL; - int allocated = 0; int i; int retval; - dev_t dev; - struct proc *p = current_proc(); -#if 0 - /* Bail when unmount is in progress */ - if (mp->mnt_kern_flag & MNTK_UNMOUNT) { + int issystemfile; + struct vnode_fsparam vfsp; + enum vtype vtype; + + if (attrp->ca_fileid == 0) { *vpp = NULL; - return (EPERM); + return (ENOENT); } -#endif #if !FIFO if (IFTOVT(attrp->ca_mode) == VFIFO) { *vpp = NULL; - return (EOPNOTSUPP); + return (ENOTSUP); } #endif - dev = hfsmp->hfs_raw_dev; - - /* If no cnode was passed in then create one */ - if (cp == NULL) { - MALLOC_ZONE(cp2, struct cnode *, sizeof(struct cnode), - M_HFSNODE, M_WAITOK); - bzero(cp2, sizeof(struct cnode)); - allocated = 1; - SET(cp2->c_flag, C_ALLOC); - cp2->c_cnid = descp->cd_cnid; - cp2->c_fileid = attrp->ca_fileid; - if (cp2->c_fileid == 0) { - FREE_ZONE(cp2, sizeof(struct cnode), M_HFSNODE); - *vpp = NULL; - return (ENOENT); - } - cp2->c_dev = dev; - lockinit(&cp2->c_lock, PINOD, "cnode", 0, 0); - (void) lockmgr(&cp2->c_lock, LK_EXCLUSIVE, (struct slock *)0, p); - /* - * There were several blocking points since we first - * checked the hash. Now that we're through blocking, - * check the hash again in case we're racing for the - * same cnode. - */ - cp = hfs_chashget(dev, attrp->ca_fileid, wantrsrc, &vp, &rvp); - if (cp != NULL) { - /* We lost the race - use the winner's cnode */ - FREE_ZONE(cp2, sizeof(struct cnode), M_HFSNODE); - allocated = 0; - if (wantrsrc && rvp != NULL) { - *vpp = rvp; - return (0); - } - if (!wantrsrc && vp != NULL) { - *vpp = vp; - return (0); - } - } else /* allocated */ { - cp = cp2; - hfs_chashinsert(cp); - } + vtype = IFTOVT(attrp->ca_mode); + issystemfile = (descp->cd_flags & CD_ISMETA) && (vtype == VREG); + + /* + * Get a cnode (new or existing) + * skip getting the cnode lock if we are getting resource fork (wantrsrc == 2) + */ + cp = hfs_chash_getcnode(hfsmp->hfs_raw_dev, attrp->ca_fileid, vpp, wantrsrc, (wantrsrc == 2)); + + /* Hardlinks may need an updated catalog descriptor */ + if ((cp->c_flag & C_HARDLINK) && descp->cd_nameptr && descp->cd_namelen > 0) { + replace_desc(cp, descp); } + /* Check if we found a matching vnode */ + if (*vpp != NULL) + return (0); - /* Allocate a new vnode. 
If unsuccesful, leave after freeing memory */ - if ((retval = getnewvnode(VT_HFS, mp, hfs_vnodeop_p, &new_vp))) { - if (allocated) { - hfs_chashremove(cp); - if (ISSET(cp->c_flag, C_WALLOC)) { - CLR(cp->c_flag, C_WALLOC); - wakeup(cp); - } - FREE_ZONE(cp2, sizeof(struct cnode), M_HFSNODE); - allocated = 0; - } else if (rvp) { - vput(rvp); - } else if (vp) { - vput(vp); + /* + * If this is a new cnode then initialize it. + */ + if (ISSET(cp->c_hflag, H_ALLOC)) { + lck_rw_init(&cp->c_truncatelock, hfs_rwlock_group, hfs_lock_attr); + + /* Make sure its still valid (ie exists on disk). */ + if (!hfs_valid_cnode(hfsmp, dvp, cnp, cp->c_fileid)) { + hfs_chash_abort(cp); + hfs_reclaim_cnode(cp); + *vpp = NULL; + return (ENOENT); } - *vpp = NULL; - return (retval); - } - if (allocated) { bcopy(attrp, &cp->c_attr, sizeof(struct cat_attr)); bcopy(descp, &cp->c_desc, sizeof(struct cat_desc)); - } - new_vp->v_data = cp; - if (wantrsrc && S_ISREG(cp->c_mode)) - cp->c_rsrc_vp = new_vp; - else - cp->c_vp = new_vp; - - /* Release reference taken on opposite vnode (if any). */ - if (rvp) - vrele(rvp); - if (vp) - vrele(vp); - - vp = new_vp; - vp->v_ubcinfo = UBC_NOINFO; - /* - * If this is a new cnode then initialize it using descp and attrp... - */ - if (allocated) { /* The name was inherited so clear descriptor state... */ descp->cd_namelen = 0; descp->cd_nameptr = NULL; @@ -613,7 +519,7 @@ hfs_getnewvnode(struct hfsmount *hfsmp, struct cnode *cp, /* Take one dev reference for each non-directory cnode */ if (IFTOVT(cp->c_mode) != VDIR) { cp->c_devvp = hfsmp->hfs_devvp; - VREF(cp->c_devvp); + vnode_ref(cp->c_devvp); } #if QUOTA for (i = 0; i < MAXQUOTAS; i++) @@ -621,7 +527,11 @@ hfs_getnewvnode(struct hfsmount *hfsmp, struct cnode *cp, #endif /* QUOTA */ } - if (IFTOVT(cp->c_mode) != VDIR) { + if (IFTOVT(cp->c_mode) == VDIR) { + if (cp->c_vp != NULL) + panic("hfs_getnewvnode: orphaned vnode (data)"); + cvpp = &cp->c_vp; + } else { if (forkp && attrp->ca_blocks < forkp->cf_blocks) panic("hfs_getnewvnode: bad ca_blocks (too small)"); /* @@ -629,89 +539,578 @@ hfs_getnewvnode(struct hfsmount *hfsmp, struct cnode *cp, */ MALLOC_ZONE(fp, struct filefork *, sizeof(struct filefork), M_HFSFORK, M_WAITOK); - bzero(fp, sizeof(struct filefork)); fp->ff_cp = cp; if (forkp) bcopy(forkp, &fp->ff_data, sizeof(struct cat_fork)); + else + bzero(&fp->ff_data, sizeof(struct cat_fork)); rl_init(&fp->ff_invalidranges); + fp->ff_sysfileinfo = 0; + if (wantrsrc) { if (cp->c_rsrcfork != NULL) - panic("stale rsrc fork"); + panic("hfs_getnewvnode: orphaned rsrc fork"); + if (cp->c_rsrc_vp != NULL) + panic("hfs_getnewvnode: orphaned vnode (rsrc)"); cp->c_rsrcfork = fp; + cvpp = &cp->c_rsrc_vp; + if ( (tvp = cp->c_vp) != NULLVP ) + cp->c_flag |= C_NEED_DVNODE_PUT; } else { if (cp->c_datafork != NULL) - panic("stale data fork"); + panic("hfs_getnewvnode: orphaned data fork"); + if (cp->c_vp != NULL) + panic("hfs_getnewvnode: orphaned vnode (data)"); cp->c_datafork = fp; + cvpp = &cp->c_vp; + if ( (tvp = cp->c_rsrc_vp) != NULLVP) + cp->c_flag |= C_NEED_RVNODE_PUT; } } + if (tvp != NULLVP) { + /* + * grab an iocount on the vnode we weren't + * interested in (i.e. we want the resource fork + * but the cnode already has the data fork) + * to prevent it from being + * recycled by us when we call vnode_create + * which will result in a deadlock when we + * try to take the cnode lock in hfs_vnop_fsync or + * hfs_vnop_reclaim... 
vnode_get can be called here + * because we already hold the cnode lock which will + * prevent the vnode from changing identity until + * we drop it... vnode_get will not block waiting for + * a change of state... however, it will return an + * error if the current iocount == 0 and we've already + * started to terminate the vnode... we don't need/want to + * grab an iocount in that case since we can't cause + * the filesystem to be re-entered on this thread for this vp + * + * the matching vnode_put will happen in hfs_unlock + * after we've dropped the cnode lock + */ + if ( vnode_get(tvp) != 0) + cp->c_flag &= ~(C_NEED_RVNODE_PUT | C_NEED_DVNODE_PUT); + } + vfsp.vnfs_mp = mp; + vfsp.vnfs_vtype = vtype; + vfsp.vnfs_str = "hfs"; + vfsp.vnfs_dvp = dvp; + vfsp.vnfs_fsnode = cp; + vfsp.vnfs_cnp = cnp; + if (vtype == VFIFO ) + vfsp.vnfs_vops = hfs_fifoop_p; + else if (vtype == VBLK || vtype == VCHR) + vfsp.vnfs_vops = hfs_specop_p; + else + vfsp.vnfs_vops = hfs_vnodeop_p; + + if (vtype == VBLK || vtype == VCHR) + vfsp.vnfs_rdev = attrp->ca_rdev; + else + vfsp.vnfs_rdev = 0; - /* - * Finish vnode initialization. - * Setting the v_type 'stamps' the vnode as 'complete', - * so should be done almost last. - * - * At this point the vnode should be locked and fully - * allocated. And ready to be used or accessed. (though - * having it locked prevents most of this, it can still - * be accessed through lists and hashes). - */ - vp->v_type = IFTOVT(cp->c_mode); + if (forkp) + vfsp.vnfs_filesize = forkp->cf_size; + else + vfsp.vnfs_filesize = 0; + + if (dvp && cnp && (cnp->cn_flags & MAKEENTRY)) + vfsp.vnfs_flags = 0; + else + vfsp.vnfs_flags = VNFS_NOCACHE; /* Tag system files */ - if ((descp->cd_flags & CD_ISMETA) && (vp->v_type == VREG)) - vp->v_flag |= VSYSTEM; + vfsp.vnfs_marksystem = issystemfile; + /* Tag root directory */ - if (cp->c_cnid == kRootDirID) - vp->v_flag |= VROOT; + if (descp->cd_cnid == kHFSRootFolderID) + vfsp.vnfs_markroot = 1; + else + vfsp.vnfs_markroot = 0; + + if ((retval = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, cvpp))) { + if (fp) { + if (fp == cp->c_datafork) + cp->c_datafork = NULL; + else + cp->c_rsrcfork = NULL; + + FREE_ZONE(fp, sizeof(struct filefork), M_HFSFORK); + } + /* + * If this is a newly created cnode or a vnode reclaim + * occurred during the attachment, then cleanup the cnode. + */ + if ((cp->c_vp == NULL) && (cp->c_rsrc_vp == NULL)) { + hfs_chash_abort(cp); + hfs_reclaim_cnode(cp); + } else { + hfs_chashwakeup(cp, H_ALLOC | H_ATTACH); + hfs_unlock(cp); + } + *vpp = NULL; + return (retval); + } + vp = *cvpp; + vnode_addfsref(vp); + vnode_settag(vp, VT_HFS); + if (cp->c_flag & C_HARDLINK) + vnode_set_hard_link(vp); + hfs_chashwakeup(cp, H_ALLOC | H_ATTACH); + + /* + * Stop tracking an active hot file.
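hfs_getnewvnode() above is built around the vnode_fsparam/vnode_create() KPI. A stripped-down sketch of the same call sequence (demo_vnodeop_p and the "demo" tag are illustrative; the field names are the ones the patch itself sets):

extern int (**demo_vnodeop_p)(void *);   /* hypothetical vnode-op vector */

static int
demo_make_vnode(mount_t mp, void *fsnode, enum vtype vtype, vnode_t *vpp)
{
    struct vnode_fsparam vfsp;
    int error;

    bzero(&vfsp, sizeof(vfsp));          /* unused fields must be zero */
    vfsp.vnfs_mp = mp;
    vfsp.vnfs_vtype = vtype;
    vfsp.vnfs_str = "demo";              /* short tag for debugging */
    vfsp.vnfs_fsnode = fsnode;           /* later fetched via vnode_fsnode() */
    vfsp.vnfs_vops = demo_vnodeop_p;
    vfsp.vnfs_flags = VNFS_NOCACHE;      /* no dvp/cnp, so skip name cache */

    error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, vpp);
    if (error == 0) {
        vnode_addfsref(*vpp);            /* vnode now carries fs state */
        vnode_settag(*vpp, VT_OTHER);
    }
    return (error);
}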
+ */ + if (!vnode_isdir(vp) && !vnode_issystem(vp)) + (void) hfs_removehotfile(vp); + + *vpp = vp; + return (0); +} + + +static void +hfs_reclaim_cnode(struct cnode *cp) +{ +#if QUOTA + int i; + + for (i = 0; i < MAXQUOTAS; i++) { + if (cp->c_dquot[i] != NODQUOT) { + dqreclaim(cp->c_dquot[i]); + cp->c_dquot[i] = NODQUOT; + } + } +#endif /* QUOTA */ + + if (cp->c_devvp) { + struct vnode *tmp_vp = cp->c_devvp; + + cp->c_devvp = NULL; + vnode_rele(tmp_vp); + } + + /* + * If the descriptor has a name then release it + */ + if (cp->c_desc.cd_flags & CD_HASBUF) { + char *nameptr; + + nameptr = cp->c_desc.cd_nameptr; + cp->c_desc.cd_nameptr = 0; + cp->c_desc.cd_flags &= ~CD_HASBUF; + cp->c_desc.cd_namelen = 0; + vfs_removename(nameptr); + } + + lck_rw_destroy(&cp->c_rwlock, hfs_rwlock_group); + lck_rw_destroy(&cp->c_truncatelock, hfs_rwlock_group); + bzero(cp, sizeof(struct cnode)); + FREE_ZONE(cp, sizeof(struct cnode), M_HFSNODE); +} + + +static int +hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, cnid_t cnid) +{ + struct cat_attr attr; + struct cat_desc cndesc; + int stillvalid = 0; + int lockflags; - if ((vp->v_type == VREG) && !(vp->v_flag & VSYSTEM) - && (UBCINFOMISSING(vp) || UBCINFORECLAIMED(vp))) { - ubc_info_init(vp); + /* System files are always valid */ + if (cnid < kHFSFirstUserCatalogNodeID) + return (1); + + /* XXX optimization: check write count in dvp */ + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + if (dvp && cnp) { + bzero(&cndesc, sizeof(cndesc)); + cndesc.cd_nameptr = cnp->cn_nameptr; + cndesc.cd_namelen = cnp->cn_namelen; + cndesc.cd_parentcnid = VTOC(dvp)->c_cnid; + cndesc.cd_hint = VTOC(dvp)->c_childhint; + + if ((cat_lookup(hfsmp, &cndesc, 0, NULL, &attr, NULL, NULL) == 0) && + (cnid == attr.ca_fileid)) { + stillvalid = 1; + } } else { - vp->v_ubcinfo = UBC_NOINFO; + if (cat_idlookup(hfsmp, cnid, NULL, NULL, NULL) == 0) { + stillvalid = 1; + } + } + hfs_systemfile_unlock(hfsmp, lockflags); + + return (stillvalid); +} + +/* + * Touch cnode times based on c_touch_xxx flags + * + * cnode must be locked exclusive + * + * This will also update the volume modify time + */ +__private_extern__ +void +hfs_touchtimes(struct hfsmount *hfsmp, struct cnode* cp) +{ + /* HFS Standard doesn't support access times */ + if (hfsmp->hfs_flags & HFS_STANDARD) { + cp->c_touch_acctime = FALSE; } - if (vp->v_type == VCHR || vp->v_type == VBLK) { - struct vnode *nvp; + if (cp->c_touch_acctime || cp->c_touch_chgtime || cp->c_touch_modtime) { + struct timeval tv; + int touchvol = 0; - vp->v_op = hfs_specop_p; - if ((nvp = checkalias(vp, cp->c_rdev, mp))) { + microtime(&tv); + + if (cp->c_touch_acctime) { + cp->c_atime = tv.tv_sec; /* - * Discard unneeded vnode, but save its cnode. - * Note that the lock is carried over in the - * cnode to the replacement vnode. + * When the access time is the only thing changing + * then make sure its sufficiently newer before + * committing it to disk. */ - nvp->v_data = vp->v_data; - vp->v_data = NULL; - vp->v_op = spec_vnodeop_p; - vrele(vp); - vgone(vp); + if ((((u_int32_t)cp->c_atime - (u_int32_t)(cp)->c_attr.ca_atimeondisk) > + ATIME_ONDISK_ACCURACY)) { + cp->c_flag |= C_MODIFIED; + } + cp->c_touch_acctime = FALSE; + } + if (cp->c_touch_modtime) { + cp->c_mtime = tv.tv_sec; + cp->c_touch_modtime = FALSE; + cp->c_flag |= C_MODIFIED; + touchvol = 1; +#if 1 /* - * Reinitialize aliased cnode. - * Assume its not a resource fork. 
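hfs_touchtimes() above turns the per-cnode c_touch_xxx flags into real timestamp updates, deferring the work until a convenient moment. A sketch of the typical caller side (cnode already locked exclusive):

static void
demo_mark_written(struct hfsmount *hfsmp, struct cnode *cp)
{
    cp->c_touch_chgtime = TRUE;    /* ctime: attributes changed */
    cp->c_touch_modtime = TRUE;    /* mtime: contents changed */

    /* Folds the flags into c_ctime/c_mtime, sets C_MODIFIED, and
       bumps the volume modify time when appropriate. */
    hfs_touchtimes(hfsmp, cp);
}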
+ * HFS dates that WE set must be adjusted for DST */ - cp->c_vp = nvp; - vp = nvp; - } - } else if (vp->v_type == VFIFO) { -#if FIFO - vp->v_op = hfs_fifoop_p; #endif + if ((hfsmp->hfs_flags & HFS_STANDARD) && gTimeZone.tz_dsttime) { + cp->c_mtime += 3600; + } #endif + } + if (cp->c_touch_chgtime) { + cp->c_ctime = tv.tv_sec; + cp->c_touch_chgtime = FALSE; + cp->c_flag |= C_MODIFIED; + touchvol = 1; + } + + /* Touch the volume modtime if needed */ + if (touchvol) { + HFSTOVCB(hfsmp)->vcbFlags |= 0xFF00; + HFSTOVCB(hfsmp)->vcbLsMod = tv.tv_sec; + } } +} + +/* + * Lock a cnode. + */ +__private_extern__ +int +hfs_lock(struct cnode *cp, enum hfslocktype locktype) +{ + void * thread = current_thread(); + + /* System files need to keep track of owner */ + if ((cp->c_fileid < kHFSFirstUserCatalogNodeID) && + (cp->c_fileid > kHFSRootFolderID) && + (locktype != HFS_SHARED_LOCK)) { + /* + * The extents and bitmap file locks support + * recursion and are always taken exclusive. + */ + if (cp->c_fileid == kHFSExtentsFileID || + cp->c_fileid == kHFSAllocationFileID) { + if (cp->c_lockowner == thread) { + cp->c_syslockcount++; + } else { + lck_rw_lock_exclusive(&cp->c_rwlock); + cp->c_lockowner = thread; + cp->c_syslockcount = 1; + } + } else { + lck_rw_lock_exclusive(&cp->c_rwlock); + cp->c_lockowner = thread; + } + } else if (locktype == HFS_SHARED_LOCK) { + lck_rw_lock_shared(&cp->c_rwlock); + cp->c_lockowner = HFS_SHARED_OWNER; + } else { + lck_rw_lock_exclusive(&cp->c_rwlock); + cp->c_lockowner = thread; + } /* - * Stop tracking an active hot file. + * Skip cnodes that no longer exist (were deleted). */ - (void) hfs_removehotfile(vp); + if ((locktype != HFS_FORCE_LOCK) && + ((cp->c_desc.cd_flags & CD_ISMETA) == 0) && + (cp->c_flag & C_NOEXISTS)) { + hfs_unlock(cp); + return (ENOENT); + } + return (0); +} - /* Vnode is now initialized - see if anyone was waiting for it. */ - CLR(cp->c_flag, C_ALLOC); - if (ISSET(cp->c_flag, C_WALLOC)) { - CLR(cp->c_flag, C_WALLOC); - wakeup((caddr_t)cp); +/* + * Lock a pair of cnodes. + */ +__private_extern__ +int +hfs_lockpair(struct cnode *cp1, struct cnode *cp2, enum hfslocktype locktype) +{ + struct cnode *first, *last; + int error; + + /* + * If cnodes match then just lock one. + */ + if (cp1 == cp2) { + return hfs_lock(cp1, locktype); } - *vpp = vp; + /* + * Lock in cnode parent-child order (if there is a relationship); + * otherwise lock in cnode address order. + */ + if ((IFTOVT(cp1->c_mode) == VDIR) && (cp1->c_fileid == cp2->c_parentcnid)) { + first = cp1; + last = cp2; + } else if (cp1 < cp2) { + first = cp1; + last = cp2; + } else { + first = cp2; + last = cp1; + } + + if ( (error = hfs_lock(first, locktype))) { + return (error); + } + if ( (error = hfs_lock(last, locktype))) { + hfs_unlock(first); + return (error); + } return (0); } +/* + * Check ordering of two cnodes. Return true if they are in-order. + */ +static int +hfs_isordered(struct cnode *cp1, struct cnode *cp2) +{ + if (cp1 == cp2) + return (0); + if (cp1 == NULL || cp2 == (struct cnode *)0xffffffff) + return (1); + if (cp2 == NULL || cp1 == (struct cnode *)0xffffffff) + return (0); + if (cp1->c_fileid == cp2->c_parentcnid) + return (1); /* cp1 is the parent and should go first */ + if (cp2->c_fileid == cp1->c_parentcnid) + return (0); /* cp1 is the child and should go last */ + + return (cp1 < cp2); /* fall-back is to use address order */ +} + +/* + * Acquire 4 cnode locks.
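hfs_lockpair() above removes lock-order deadlocks by sorting the pair itself, so two threads locking the same two cnodes in opposite argument order still acquire them in the same sequence. Sketch of a typical caller (names illustrative):

/* Lock a directory and a child together, e.g. for a remove operation. */
static int
demo_lock_parent_child(struct cnode *dcp, struct cnode *cp)
{
    int error;

    /* hfs_lockpair() orders the cnodes internally: parent before
       child when related, otherwise lower address first. */
    if ((error = hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK)))
        return (error);

    /* ... modify both cnodes here ... */

    hfs_unlockpair(dcp, cp);
    return (0);
}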
+ * - locked in cnode parent-child order (if there is a relationship) + * otherwise lock in cnode address order (lesser address first). + * - all or none of the locks are taken + * - only one lock taken per cnode (dup cnodes are skipped) + * - some of the cnode pointers may be null + */ +__private_extern__ +int +hfs_lockfour(struct cnode *cp1, struct cnode *cp2, struct cnode *cp3, + struct cnode *cp4, enum hfslocktype locktype) +{ + struct cnode * a[3]; + struct cnode * b[3]; + struct cnode * list[4]; + struct cnode * tmp; + int i, j, k; + int error; + + if (hfs_isordered(cp1, cp2)) { + a[0] = cp1; a[1] = cp2; + } else { + a[0] = cp2; a[1] = cp1; + } + if (hfs_isordered(cp3, cp4)) { + b[0] = cp3; b[1] = cp4; + } else { + b[0] = cp4; b[1] = cp3; + } + a[2] = (struct cnode *)0xffffffff; /* sentinel value */ + b[2] = (struct cnode *)0xffffffff; /* sentinel value */ + + /* + * Build the lock list, skipping over duplicates + */ + for (i = 0, j = 0, k = 0; (i < 2 || j < 2); ) { + tmp = hfs_isordered(a[i], b[j]) ? a[i++] : b[j++]; + if (k == 0 || tmp != list[k-1]) + list[k++] = tmp; + } + + /* + * Now we can lock using list[0 - k]. + * Skip over NULL entries. + */ + for (i = 0; i < k; ++i) { + if (list[i]) + if ((error = hfs_lock(list[i], locktype))) { + /* Drop any locks we acquired. */ + while (--i >= 0) { + if (list[i]) + hfs_unlock(list[i]); + } + return (error); + } + } + return (0); +} + + +/* + * Unlock a cnode. + */ +__private_extern__ +void +hfs_unlock(struct cnode *cp) +{ + vnode_t rvp = NULLVP; + vnode_t dvp = NULLVP; + + /* System files need to keep track of owner */ + if ((cp->c_fileid < kHFSFirstUserCatalogNodeID) && + (cp->c_fileid > kHFSRootFolderID) && + (cp->c_datafork != NULL)) { + /* + * The extents and bitmap file locks support + * recursion and are always taken exclusive. + */ + if (cp->c_fileid == kHFSExtentsFileID || + cp->c_fileid == kHFSAllocationFileID) { + if (--cp->c_syslockcount > 0) { + return; + } + } + } + if (cp->c_flag & C_NEED_DVNODE_PUT) + dvp = cp->c_vp; + + if (cp->c_flag & C_NEED_RVNODE_PUT) + rvp = cp->c_rsrc_vp; + + cp->c_flag &= ~(C_NEED_DVNODE_PUT | C_NEED_RVNODE_PUT); + + cp-> c_lockowner = NULL; + lck_rw_done(&cp->c_rwlock); + + if (dvp) + vnode_put(dvp); + if (rvp) + vnode_put(rvp); +} + +/* + * Unlock a pair of cnodes. + */ +__private_extern__ +void +hfs_unlockpair(struct cnode *cp1, struct cnode *cp2) +{ + hfs_unlock(cp1); + if (cp2 != cp1) + hfs_unlock(cp2); +} + +/* + * Unlock a group of cnodes. + */ +__private_extern__ +void +hfs_unlockfour(struct cnode *cp1, struct cnode *cp2, struct cnode *cp3, struct cnode *cp4) +{ + struct cnode * list[4]; + int i, k = 0; + + if (cp1) { + hfs_unlock(cp1); + list[k++] = cp1; + } + if (cp2) { + for (i = 0; i < k; ++i) { + if (list[i] == cp2) + goto skip1; + } + hfs_unlock(cp2); + list[k++] = cp2; + } +skip1: + if (cp3) { + for (i = 0; i < k; ++i) { + if (list[i] == cp3) + goto skip2; + } + hfs_unlock(cp3); + list[k++] = cp3; + } +skip2: + if (cp4) { + for (i = 0; i < k; ++i) { + if (list[i] == cp4) + return; + } + hfs_unlock(cp4); + } +} + + +/* + * Protect a cnode against a truncation. + * + * Used mainly by read/write since they don't hold the + * cnode lock across calls to the cluster layer. + * + * The process doing a truncation must take the lock + * exclusive. The read/write processes can take it + * non-exclusive. 
+ */ +__private_extern__ +void +hfs_lock_truncate(struct cnode *cp, int exclusive) +{ + if (cp->c_lockowner == current_thread()) + panic("hfs_lock_truncate: cnode 0x%08x locked!", cp); + + if (exclusive) + lck_rw_lock_exclusive(&cp->c_truncatelock); + else + lck_rw_lock_shared(&cp->c_truncatelock); +} + +__private_extern__ +void +hfs_unlock_truncate(struct cnode *cp) +{ + lck_rw_done(&cp->c_truncatelock); +} + + + + diff --git a/bsd/hfs/hfs_cnode.h b/bsd/hfs/hfs_cnode.h index 38ca8996e..64d2fd70d 100644 --- a/bsd/hfs/hfs_cnode.h +++ b/bsd/hfs/hfs_cnode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2002-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -27,12 +27,13 @@ #ifdef KERNEL #ifdef __APPLE_API_PRIVATE #include <sys/types.h> -#include <sys/lock.h> #include <sys/queue.h> #include <sys/stat.h> #include <sys/vnode.h> #include <sys/quota.h> +#include <kern/locks.h> + #include <hfs/hfs_catalog.h> #include <hfs/rangelist.h> @@ -42,45 +43,35 @@ * Reading or writing any of these fields requires holding cnode lock. */ struct filefork { - struct cnode *ff_cp; /* cnode associated with this fork */ - struct rl_head ff_invalidranges; /* Areas of disk that should read back as zeroes */ - long ff_evtonly_refs; /* number of vnode references used solely for events (O_EVTONLY) */ + struct cnode *ff_cp; /* cnode associated with this fork */ + struct rl_head ff_invalidranges; /* Areas of disk that should read back as zeroes */ union { - struct hfslockf *ffu_lockf; /* Head of byte-level lock list. */ - void *ffu_sysdata; /* private data for system files */ - char *ffu_symlinkptr; /* symbolic link pathname */ - } ff_un; - struct cat_fork ff_data; + void *ffu_sysfileinfo; /* additional info for system files */ + char *ffu_symlinkptr; /* symbolic link pathname */ + } ff_union; + struct cat_fork ff_data; /* fork data (size, extents) */ }; typedef struct filefork filefork_t; /* Aliases for common fields */ -#define ff_size ff_data.cf_size -#define ff_clumpsize ff_data.cf_clump -#define ff_bytesread ff_data.cf_bytesread -#define ff_blocks ff_data.cf_blocks -#define ff_extents ff_data.cf_extents +#define ff_size ff_data.cf_size +#define ff_clumpsize ff_data.cf_clump +#define ff_bytesread ff_data.cf_bytesread +#define ff_blocks ff_data.cf_blocks +#define ff_extents ff_data.cf_extents #define ff_unallocblocks ff_data.cf_vblocks -#define ff_symlinkptr ff_un.ffu_symlinkptr -#define ff_lockf ff_un.ffu_lockf +#define ff_symlinkptr ff_union.ffu_symlinkptr +#define ff_sysfileinfo ff_union.ffu_sysfileinfo /* The btree code still needs these... */ -#define fcbEOF ff_size -#define fcbExtents ff_extents -#define fcbBTCBPtr ff_un.ffu_sysdata - - -/* - * Directory index entry - */ -struct hfs_index { - SLIST_ENTRY(hfs_index) hi_link; - int hi_index; - char hi_name[1]; -}; +#define fcbEOF ff_size +#define fcbExtents ff_extents +#define fcbBTCBPtr ff_sysfileinfo +typedef u_int8_t atomicflag_t; + /* * The cnode is used to represent each active (or recently active) * file or directory in the HFS filesystem. @@ -88,22 +79,32 @@ struct hfs_index { * Reading or writing any of these fields requires holding c_lock. 
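hfs_lock_truncate()/hfs_unlock_truncate() above implement the reader-versus-truncation protocol described in the comment: readers and writers take the lock shared, a truncating thread takes it exclusive. A sketch of the read-side pairing (the cluster call is elided):

static void
demo_read_section(struct cnode *cp)
{
    hfs_lock_truncate(cp, FALSE);   /* shared: concurrent readers allowed */

    /* The cnode lock may be taken and dropped around cluster I/O here,
       but the truncate lock stays held so the file cannot shrink
       underneath the transfer. */

    hfs_unlock_truncate(cp);
}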
*/ struct cnode { - struct lock__bsd__ c_lock; /* cnode's lock */ + lck_rw_t c_rwlock; /* cnode's lock */ + void * c_lockowner; /* cnode's lock owner (exclusive case only) */ + lck_rw_t c_truncatelock; /* protects file from truncation during read/write */ LIST_ENTRY(cnode) c_hash; /* cnode's hash chain */ u_int32_t c_flag; /* cnode's runtime flags */ + u_int32_t c_hflag; /* cnode's flags for maintaining hash - protected by global hash lock */ struct vnode *c_vp; /* vnode for data fork or dir */ struct vnode *c_rsrc_vp; /* vnode for resource fork */ struct vnode *c_devvp; /* vnode for block I/O */ dev_t c_dev; /* cnode's device */ struct dquot *c_dquot[MAXQUOTAS]; /* cnode's quota info */ struct klist c_knotes; /* knotes attached to this vnode */ - cnid_t c_childhint; /* catalog hint for children */ + u_long c_childhint; /* catalog hint for children */ struct cat_desc c_desc; /* cnode's descriptor */ struct cat_attr c_attr; /* cnode's attributes */ - SLIST_HEAD(hfs_indexhead, hfs_index) c_indexlist; /* directory index list */ - long c_evtonly_refs; /* number of vnode references used solely for events (O_EVTONLY) */ + SLIST_HEAD(hfs_hinthead, directoryhint) c_hintlist; /* directory hint list */ + int16_t c_dirhinttag; /* directory hint tag */ + union { + int16_t cu_dirhintcnt; /* directory hint count */ + int16_t cu_syslockcount; /* system file use only */ + } c_union; struct filefork *c_datafork; /* cnode's data fork */ struct filefork *c_rsrcfork; /* cnode's rsrc fork */ + atomicflag_t c_touch_acctime; + atomicflag_t c_touch_chgtime; + atomicflag_t c_touch_modtime; }; typedef struct cnode cnode_t; @@ -121,40 +122,40 @@ typedef struct cnode cnode_t; #define c_rdev c_attr.ca_rdev #define c_atime c_attr.ca_atime #define c_mtime c_attr.ca_mtime -#define c_mtime_nsec c_attr.ca_mtime_nsec #define c_ctime c_attr.ca_ctime #define c_itime c_attr.ca_itime #define c_btime c_attr.ca_btime #define c_flags c_attr.ca_flags #define c_finderinfo c_attr.ca_finderinfo #define c_blocks c_attr.ca_blocks +#define c_attrblks c_attr.ca_attrblks #define c_entries c_attr.ca_entries #define c_zftimeout c_childhint +#define c_dirhintcnt c_union.cu_dirhintcnt +#define c_syslockcount c_union.cu_syslockcount -/* Runtime cnode flags (kept in c_flag) */ -#define C_ACCESS 0x00001 /* Access time update request */ -#define C_CHANGE 0x00002 /* Change time update request */ -#define C_UPDATE 0x00004 /* Modification time update request */ -#define C_MODIFIED 0x00008 /* CNode has been modified */ -#define C_RELOCATING 0x00010 /* CNode's fork is being relocated */ -#define C_NOEXISTS 0x00020 /* CNode has been deleted, catalog entry is gone */ -#define C_DELETED 0x00040 /* CNode has been marked to be deleted */ -#define C_HARDLINK 0x00080 /* CNode is a hard link */ +/* hash maintenance flags kept in c_hflag and protected by hfs_chash_mutex */ +#define H_ALLOC 0x00001 /* CNode is being allocated */ +#define H_ATTACH 0x00002 /* CNode is being attached to by another vnode */ +#define H_TRANSIT 0x00004 /* CNode is getting recycled */ +#define H_WAITING 0x00008 /* CNode is being waited for */ + -#define C_ALLOC 0x00100 /* CNode is being allocated */ -#define C_WALLOC 0x00200 /* Waiting for allocation to finish */ -#define C_TRANSIT 0x00400 /* CNode is getting recycled */ -#define C_WTRANSIT 0x00800 /* Waiting for cnode getting recycled */ -#define C_NOBLKMAP 0x01000 /* CNode blocks cannot be mapped */ -#define C_WBLKMAP 0x02000 /* Waiting for block map */ +/* Runtime cnode flags (kept in c_flag) */ +#define C_NEED_RVNODE_PUT 0x00001 /* 
Need to do a vnode_put on c_rsrc_vp after the unlock */ +#define C_NEED_DVNODE_PUT 0x00002 /* Need to do a vnode_put on c_vp after the unlock */ +#define C_ZFWANTSYNC 0x00004 /* fsync requested and file has holes */ +#define C_FROMSYNC 0x00008 /* fsync was called from sync */ -#define C_ZFWANTSYNC 0x04000 /* fsync requested and file has holes */ -#define C_VPREFHELD 0x08000 /* resource fork has done a vget() on c_vp (for its parent ptr) */ +#define C_MODIFIED 0x00010 /* CNode has been modified */ +#define C_NOEXISTS 0x00020 /* CNode has been deleted, catalog entry is gone */ +#define C_DELETED 0x00040 /* CNode has been marked to be deleted */ +#define C_HARDLINK 0x00080 /* CNode is a hard link */ -#define C_FROMSYNC 0x10000 /* fsync was called from sync */ -#define C_FORCEUPDATE 0x20000 /* force the catalog entry update */ +#define C_FORCEUPDATE 0x00100 /* force the catalog entry update */ +#define C_HASXATTRS 0x00200 /* cnode has extended attributes */ #define ZFTIMELIMIT (5 * 60) @@ -162,7 +163,7 @@ typedef struct cnode cnode_t; /* * Convert between cnode pointers and vnode pointers */ -#define VTOC(vp) ((struct cnode *)(vp)->v_data) +#define VTOC(vp) ((struct cnode *)vnode_fsnode((vp))) #define CTOV(cp,rsrc) (((rsrc) && S_ISREG((cp)->c_mode)) ? \ (cp)->c_rsrc_vp : (cp)->c_vp) @@ -183,7 +184,6 @@ typedef struct cnode cnode_t; FTOC(fp)->c_rsrc_vp : \ FTOC(fp)->c_vp) -#define EVTONLYREFS(vp) ((vp->v_type == VREG) ? VTOF(vp)->ff_evtonly_refs : VTOC(vp)->c_evtonly_refs) /* * Test for a resource fork @@ -193,57 +193,71 @@ typedef struct cnode cnode_t; #define VNODE_IS_RSRC(vp) ((vp) == VTOC((vp))->c_rsrc_vp) -/* - * CTIMES should be an inline function... - */ -#define C_TIMEMASK (C_ACCESS | C_CHANGE | C_UPDATE) - -#define C_CHANGEMASK (C_ACCESS | C_CHANGE | C_UPDATE | C_MODIFIED) - -#define ATIME_ACCURACY 1 #define ATIME_ONDISK_ACCURACY 300 -#define CTIMES(cp, t1, t2) { \ - if ((cp)->c_flag & C_TIMEMASK) { \ - /* \ - * Only do the update if it is more than just \ - * the C_ACCESS field being updated. \ - */ \ - if (((cp)->c_flag & C_CHANGEMASK) != C_ACCESS) { \ - if ((cp)->c_flag & C_ACCESS) { \ - (cp)->c_atime = (t1)->tv_sec; \ - } \ - if ((cp)->c_flag & C_UPDATE) { \ - (cp)->c_mtime = (t2)->tv_sec; \ - (cp)->c_mtime_nsec = (t2)->tv_usec * 1000; \ - } \ - if ((cp)->c_flag & C_CHANGE) { \ - (cp)->c_ctime = time.tv_sec; \ - } \ - (cp)->c_flag |= C_MODIFIED; \ - (cp)->c_flag &= ~C_TIMEMASK; \ - } \ - } \ -} - -/* This overlays the fid structure (see mount.h). */ + +/* This overlays the FileID portion of NFS file handles. */ struct hfsfid { - u_int16_t hfsfid_len; /* Length of structure. */ - u_int16_t hfsfid_pad; /* Force 32-bit alignment. */ - /* The following data is filesystem-dependent, up to MAXFIDSZ (16) bytes: */ u_int32_t hfsfid_cnid; /* Catalog node ID. */ u_int32_t hfsfid_gen; /* Generation number (create date). */ }; +extern void hfs_touchtimes(struct hfsmount *, struct cnode *); + /* * HFS cnode hash functions. 
*/ extern void hfs_chashinit(void); extern void hfs_chashinsert(struct cnode *cp); -extern void hfs_chashremove(struct cnode *cp); -extern struct cnode * hfs_chashget(dev_t dev, ino_t inum, int wantrsrc, - struct vnode **vpp, struct vnode **rvpp); +extern int hfs_chashremove(struct cnode *cp); +extern void hfs_chash_abort(struct cnode *cp); +extern void hfs_chash_rehash(struct cnode *cp1, struct cnode *cp2); +extern void hfs_chashwakeup(struct cnode *cp, int flags); +extern void hfs_chash_mark_in_transit(struct cnode *cp); + +extern struct vnode * hfs_chash_getvnode(dev_t dev, ino_t inum, int wantrsrc, int skiplock); +extern struct cnode * hfs_chash_getcnode(dev_t dev, ino_t inum, struct vnode **vpp, int wantrsrc, int skiplock); +extern int hfs_chash_snoop(dev_t, ino_t, int (*)(const struct cat_desc *, + const struct cat_attr *, void *), void *); + +/* + * HFS directory hint functions. + */ +extern directoryhint_t * hfs_getdirhint(struct cnode *, int); +extern void hfs_reldirhint(struct cnode *, directoryhint_t *); +extern void hfs_reldirhints(struct cnode *, int); + +/* + * HFS cnode lock functions. + * + * HFS Locking Order: + * + * 1. cnode truncate lock (if needed) + * 2. cnode lock (in parent-child order if related, otherwise by address order) + * 3. journal (if needed) + * 4. system files (as needed) + * A. Catalog B-tree file + * B. Attributes B-tree file + * C. Allocation Bitmap file (always exclusive, supports recursion) + * D. Overflow Extents B-tree file (always exclusive, supports recursion) + * 5. hfs mount point (always last) + * + */ +enum hfslocktype {HFS_SHARED_LOCK = 1, HFS_EXCLUSIVE_LOCK = 2, HFS_FORCE_LOCK = 3}; +#define HFS_SHARED_OWNER (void *)0xffffffff + +extern int hfs_lock(struct cnode *, enum hfslocktype); +extern int hfs_lockpair(struct cnode *, struct cnode *, enum hfslocktype); +extern int hfs_lockfour(struct cnode *, struct cnode *, struct cnode *, struct cnode *, + enum hfslocktype); + +extern void hfs_unlock(struct cnode *); +extern void hfs_unlockpair(struct cnode *, struct cnode *); +extern void hfs_unlockfour(struct cnode *, struct cnode *, struct cnode *, struct cnode *); + +extern void hfs_lock_truncate(struct cnode *, int); +extern void hfs_unlock_truncate(struct cnode *); #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ diff --git a/bsd/hfs/hfs_dbg.h b/bsd/hfs/hfs_dbg.h index 0b4942f36..fe1d71c89 100644 --- a/bsd/hfs/hfs_dbg.h +++ b/bsd/hfs/hfs_dbg.h @@ -98,7 +98,7 @@ extern int hfs_dbg_err; #define DBG_ERR(x) { \ if(hfs_dbg_all || hfs_dbg_err) { \ - PRINTIT("%X: ", current_proc()->p_pid); \ + PRINTIT("%X: ", proc_selfpid()); \ PRINTIT("HFS ERROR: "); \ PRINTIT x; \ PRINT_DELAY; \ diff --git a/bsd/hfs/hfs_encodinghint.c b/bsd/hfs/hfs_encodinghint.c index 2aede276a..02a1fce32 100644 --- a/bsd/hfs/hfs_encodinghint.c +++ b/bsd/hfs/hfs_encodinghint.c @@ -22,6 +22,7 @@ #include <sys/param.h> #include <hfs/hfs_macos_defs.h> +#include <hfs/hfs.h> /* CJK Mac Encoding Bits */ @@ -42,6 +43,9 @@ u_int8_t cjk_lastunique = 0; u_int32_t hfs_encodingbias = 0; int hfs_islatinbias = 0; +extern lck_mtx_t encodinglst_mutex; + + /* Map CJK bits to Mac encoding */ u_int8_t cjk_encoding[] = { /* 0000 */ kTextEncodingMacUnicode, @@ -889,7 +893,7 @@ hfs_pickencoding(const u_int16_t *src, int len) __private_extern__ u_int32_t -hfs_getencodingbias() +hfs_getencodingbias(void) { return (hfs_encodingbias); } @@ -899,6 +903,8 @@ __private_extern__ void hfs_setencodingbias(u_int32_t bias) { + lck_mtx_lock(&encodinglst_mutex); + hfs_encodingbias = bias; switch (bias) { @@ 
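The locking-order comment in hfs_cnode.h above prescribes: truncate lock first, then the cnode lock, then the journal, then the system-file b-trees, releasing in reverse. A sketch of a metadata update obeying that order, using only calls that appear elsewhere in this patch:

static int
demo_metadata_update(struct hfsmount *hfsmp, struct cnode *cp)
{
    int lockflags;
    int error;

    hfs_lock_truncate(cp, TRUE);                        /* 1. truncate lock */
    if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {   /* 2. cnode lock */
        hfs_unlock_truncate(cp);
        return (error);
    }
    if (hfs_start_transaction(hfsmp) != 0) {            /* 3. journal */
        error = EINVAL;
        goto out;
    }
    lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE,
                                    HFS_EXCLUSIVE_LOCK); /* 4. system files */

    /* ... b-tree updates go here ... */

    hfs_systemfile_unlock(hfsmp, lockflags);
    hfs_end_transaction(hfsmp);
out:
    hfs_unlock(cp);                                     /* release in reverse */
    hfs_unlock_truncate(cp);
    return (error);
}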
-914,5 +920,7 @@ hfs_setencodingbias(u_int32_t bias) hfs_islatinbias = 0; break; } + + lck_mtx_unlock(&encodinglst_mutex); } diff --git a/bsd/hfs/hfs_encodings.c b/bsd/hfs/hfs_encodings.c index 4c2229c95..94029ef73 100644 --- a/bsd/hfs/hfs_encodings.c +++ b/bsd/hfs/hfs_encodings.c @@ -23,7 +23,6 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> -#include <sys/lock.h> #include <sys/malloc.h> #include <sys/queue.h> #include <sys/utfconv.h> @@ -31,9 +30,16 @@ #include "hfs.h" +lck_grp_t * encodinglst_lck_grp; +lck_grp_attr_t * encodinglst_lck_grp_attr; +lck_attr_t * encodinglst_lck_attr; + + /* hfs encoding converter list */ SLIST_HEAD(encodinglst, hfs_encoding) hfs_encoding_list = {0}; -decl_simple_lock_data(,hfs_encoding_list_slock); + +lck_mtx_t encodinglst_mutex; + /* hfs encoding converter entry */ @@ -61,7 +67,15 @@ void hfs_converterinit(void) { SLIST_INIT(&hfs_encoding_list); - simple_lock_init(&hfs_encoding_list_slock); + + encodinglst_lck_grp_attr= lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(encodinglst_lck_grp_attr); + encodinglst_lck_grp = lck_grp_alloc_init("cnode_hash", encodinglst_lck_grp_attr); + + encodinglst_lck_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(encodinglst_lck_attr); + + lck_mtx_init(&encodinglst_mutex, encodinglst_lck_grp, encodinglst_lck_attr); /* * add resident MacRoman converter and take a reference @@ -87,7 +101,7 @@ hfs_addconverter(int id, UInt32 encoding, hfs_to_unicode_func_t get_unicode, uni MALLOC(encp, struct hfs_encoding *, sizeof(struct hfs_encoding), M_TEMP, M_WAITOK); - simple_lock(&hfs_encoding_list_slock); + lck_mtx_lock(&encodinglst_mutex); encp->link.sle_next = NULL; encp->refcount = 0; @@ -97,7 +111,7 @@ hfs_addconverter(int id, UInt32 encoding, hfs_to_unicode_func_t get_unicode, uni encp->kmod_id = id; SLIST_INSERT_HEAD(&hfs_encoding_list, encp, link); - simple_unlock(&hfs_encoding_list_slock); + lck_mtx_unlock(&encodinglst_mutex); return (0); } @@ -117,9 +131,8 @@ int hfs_remconverter(int id, UInt32 encoding) { struct hfs_encoding *encp; - int busy = 0; - simple_lock(&hfs_encoding_list_slock); + lck_mtx_lock(&encodinglst_mutex); SLIST_FOREACH(encp, &hfs_encoding_list, link) { if (encp->encoding == encoding && encp->kmod_id == id) { encp->refcount--; @@ -127,16 +140,19 @@ hfs_remconverter(int id, UInt32 encoding) /* if converter is no longer in use, release it */ if (encp->refcount <= 0 && encp->kmod_id != 0) { SLIST_REMOVE(&hfs_encoding_list, encp, hfs_encoding, link); + lck_mtx_unlock(&encodinglst_mutex); FREE(encp, M_TEMP); + return (0); } else { - busy = 1; + lck_mtx_unlock(&encodinglst_mutex); + return (1); /* busy */ } break; } } - simple_unlock(&hfs_encoding_list_slock); + lck_mtx_unlock(&encodinglst_mutex); - return (busy); + return (0); } @@ -151,7 +167,7 @@ hfs_getconverter(UInt32 encoding, hfs_to_unicode_func_t *get_unicode, unicode_to struct hfs_encoding *encp; int found = 0; - simple_lock(&hfs_encoding_list_slock); + lck_mtx_lock(&encodinglst_mutex); SLIST_FOREACH(encp, &hfs_encoding_list, link) { if (encp->encoding == encoding) { found = 1; @@ -161,7 +177,7 @@ hfs_getconverter(UInt32 encoding, hfs_to_unicode_func_t *get_unicode, unicode_to break; } } - simple_unlock(&hfs_encoding_list_slock); + lck_mtx_unlock(&encodinglst_mutex); if (!found) { *get_unicode = NULL; @@ -182,12 +198,10 @@ int hfs_relconverter(UInt32 encoding) { struct hfs_encoding *encp; - int found = 0; - simple_lock(&hfs_encoding_list_slock); + lck_mtx_lock(&encodinglst_mutex); SLIST_FOREACH(encp, &hfs_encoding_list, link) { 
if (encp->encoding == encoding) { - found = 1; encp->refcount--; /* if converter is no longer in use, release it */ @@ -195,19 +209,19 @@ hfs_relconverter(UInt32 encoding) int id = encp->kmod_id; SLIST_REMOVE(&hfs_encoding_list, encp, hfs_encoding, link); - FREE(encp, M_TEMP); - encp = NULL; - - simple_unlock(&hfs_encoding_list_slock); - kmod_destroy((host_priv_t) host_priv_self(), id); - simple_lock(&hfs_encoding_list_slock); + lck_mtx_unlock(&encodinglst_mutex); + + FREE(encp, M_TEMP); + kmod_destroy((host_priv_t) host_priv_self(), id); + return (0); } - break; + lck_mtx_unlock(&encodinglst_mutex); + return (0); } } - simple_unlock(&hfs_encoding_list_slock); + lck_mtx_unlock(&encodinglst_mutex); - return (found ? 0 : EINVAL); + return (EINVAL); } diff --git a/bsd/hfs/hfs_endian.c b/bsd/hfs/hfs_endian.c index cd5843aaf..0341f15db 100644 --- a/bsd/hfs/hfs_endian.c +++ b/bsd/hfs/hfs_endian.c @@ -355,6 +355,55 @@ hfs_swap_HFSPlusBTInternalNode ( if (unswap) srcPtr[0] = SWAP_BE16 (srcPtr[0]); } + } else if (fileID == kHFSAttributesFileID) { + HFSPlusAttrKey *srcKey; + HFSPlusAttrRecord *srcRec; + + for (i = 0; i < srcDesc->numRecords; i++) { + srcKey = (HFSPlusAttrKey *)((char *)src->buffer + srcOffs[i]); + + if (!unswap) srcKey->keyLength = SWAP_BE16(srcKey->keyLength); + srcRec = (HFSPlusAttrRecord *)((char *)srcKey + srcKey->keyLength + 2); + if (unswap) srcKey->keyLength = SWAP_BE16(srcKey->keyLength); + + srcKey->fileID = SWAP_BE32(srcKey->fileID); + srcKey->startBlock = SWAP_BE32(srcKey->startBlock); + + if (!unswap) srcKey->attrNameLen = SWAP_BE16(srcKey->attrNameLen); + for (j = 0; j < srcKey->attrNameLen; j++) + srcKey->attrName[j] = SWAP_BE16(srcKey->attrName[j]); + if (unswap) srcKey->attrNameLen = SWAP_BE16(srcKey->attrNameLen); + + /* If this is an index node, just swap the child node number */ + if (srcDesc->kind == kBTIndexNode) { + *((UInt32 *)srcRec) = SWAP_BE32 (*((UInt32 *)srcRec)); + continue; + } + + /* Swap the data record */ + if (!unswap) srcRec->recordType = SWAP_BE32(srcRec->recordType); + switch (srcRec->recordType) { + case kHFSPlusAttrInlineData: + /* We're not swapping the reserved fields */ + srcRec->attrData.attrSize = SWAP_BE32(srcRec->attrData.attrSize); + /* Not swapping the attrData */ + break; + case kHFSPlusAttrForkData: + /* We're not swapping the reserved field */ + hfs_swap_HFSPlusForkData(&srcRec->forkData.theFork); + break; + case kHFSPlusAttrExtents: + /* We're not swapping the reserved field */ + for (j = 0; j < kHFSPlusExtentDensity; j++) { + srcRec->overflowExtents.extents[j].startBlock = + SWAP_BE32(srcRec->overflowExtents.extents[j].startBlock); + srcRec->overflowExtents.extents[j].blockCount = + SWAP_BE32(srcRec->overflowExtents.extents[j].blockCount); + } + break; + } + if (unswap) srcRec->recordType = SWAP_BE32(srcRec->recordType); + } } else if (fileID > kHFSFirstUserCatalogNodeID) { HotFileKey *srcKey; UInt32 *srcRec; diff --git a/bsd/hfs/hfs_format.h b/bsd/hfs/hfs_format.h index 5caa3d2c8..001206d45 100644 --- a/bsd/hfs/hfs_format.h +++ b/bsd/hfs/hfs_format.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
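A note on the swap helpers used throughout the new attributes case above: SWAP_BE16/SWAP_BE32 convert between big-endian (on-disk) and host byte order, so on a little-endian host SWAP_BE16(0x1234) yields 0x3412, while on a big-endian host it is the identity. Because keyLength is itself byte-swapped data, the code must swap it before using it on the read path (unswap == 0) and swap it back only after its last use on the write path (unswap != 0). A hedged sketch of that convention, with an illustrative helper:

    /* Illustrative: the swap-before-use / swap-back-after-use idiom. */
    static void
    example_swap_field(u_int16_t *fieldp, int unswap)
    {
        if (!unswap)
            *fieldp = SWAP_BE16(*fieldp);  /* disk -> host, before use */

        /* ... use *fieldp in host byte order here ... */

        if (unswap)
            *fieldp = SWAP_BE16(*fieldp);  /* host -> disk, after use */
    }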
* * @APPLE_LICENSE_HEADER_START@ * @@ -22,8 +22,6 @@ #ifndef __HFS_FORMAT__ #define __HFS_FORMAT__ -#ifndef __HFSVOLUMES__ - #include <sys/types.h> #include <sys/appleapiopts.h> @@ -54,7 +52,8 @@ enum { kHFSXVersion = 0x0005, /* 'HX' volumes start with version 5 */ kHFSPlusMountVersion = 0x31302E30, /* '10.0' for Mac OS X */ - kHFSJMountVersion = 0x4846534a /* 'HFSJ' for journaled HFS+ on OS X */ + kHFSJMountVersion = 0x4846534a, /* 'HFSJ' for journaled HFS+ on OS X */ + kFSKMountVersion = 0x46534b21 /* 'FSK!' for failed journal replay */ }; @@ -91,7 +90,8 @@ enum { }; -#ifndef __FILES__ +#ifndef _HFSUNISTR255_DEFINED_ +#define _HFSUNISTR255_DEFINED_ /* Unicode strings are used for HFS Plus file and folder names */ struct HFSUniStr255 { u_int16_t length; /* number of unicode characters */ @@ -99,7 +99,7 @@ struct HFSUniStr255 { }; typedef struct HFSUniStr255 HFSUniStr255; typedef const HFSUniStr255 *ConstHFSUniStr255Param; -#endif /* __FILES__ */ +#endif /* _HFSUNISTR255_DEFINED_ */ enum { kHFSMaxVolumeNameChars = 27, @@ -242,7 +242,7 @@ struct HFSCatalogKey { u_int8_t keyLength; /* key length (in bytes) */ u_int8_t reserved; /* reserved (set to zero) */ u_int32_t parentID; /* parent folder ID */ - u_char nodeName[kHFSMaxFileNameChars + 1]; /* catalog node name */ + u_int8_t nodeName[kHFSMaxFileNameChars + 1]; /* catalog node name */ }; typedef struct HFSCatalogKey HFSCatalogKey; @@ -274,8 +274,15 @@ enum { enum { kHFSFileLockedBit = 0x0000, /* file is locked and cannot be written to */ kHFSFileLockedMask = 0x0001, + kHFSThreadExistsBit = 0x0001, /* a file thread record exists for this file */ - kHFSThreadExistsMask = 0x0002 + kHFSThreadExistsMask = 0x0002, + + kHFSHasAttributesBit = 0x0002, /* object has extended attributes */ + kHFSHasAttributesMask = 0x0004, + + kHFSHasSecurityBit = 0x0003, /* object has security data (ACLs) */ + kHFSHasSecurityMask = 0x0008 }; @@ -309,7 +316,7 @@ struct HFSPlusCatalogFolder { FndrDirInfo userInfo; /* Finder information */ FndrOpaqueInfo finderInfo; /* additional Finder information */ u_int32_t textEncoding; /* hint for name conversions */ - u_int32_t reserved; /* reserved - initialized as zero */ + u_int32_t attrBlocks; /* cached count of attribute data blocks */ }; typedef struct HFSPlusCatalogFolder HFSPlusCatalogFolder; @@ -352,7 +359,7 @@ struct HFSPlusCatalogFile { FndrFileInfo userInfo; /* Finder information */ FndrOpaqueInfo finderInfo; /* additional Finder information */ u_int32_t textEncoding; /* hint for name conversions */ - u_int32_t reserved2; /* reserved - initialized as zero */ + u_int32_t attrBlocks; /* cached count of attribute data blocks */ /* Note: these start on double long (64 bit) boundry */ HFSPlusForkData dataFork; /* size and block data for data fork */ @@ -365,7 +372,7 @@ struct HFSCatalogThread { int16_t recordType; /* == kHFSFolderThreadRecord or kHFSFileThreadRecord */ int32_t reserved[2]; /* reserved - initialized as zero */ u_int32_t parentID; /* parent ID for this catalog node */ - u_char nodeName[kHFSMaxFileNameChars + 1]; /* name of this catalog node */ + u_int8_t nodeName[kHFSMaxFileNameChars + 1]; /* name of this catalog node */ }; typedef struct HFSCatalogThread HFSCatalogThread; @@ -384,25 +391,10 @@ typedef struct HFSPlusCatalogThread HFSPlusCatalogThread; chosen so that they wouldn't conflict with the catalog record types. 
*/ enum { - kHFSPlusAttrInlineData = 0x10, /* if size < kAttrOverflowSize */ - kHFSPlusAttrForkData = 0x20, /* if size >= kAttrOverflowSize */ - kHFSPlusAttrExtents = 0x30 /* overflow extents for large attributes */ -}; - - -/* - HFSPlusAttrInlineData - For small attributes, whose entire value is stored within this one - B-tree record. - There would not be any other records for this attribute. -*/ -struct HFSPlusAttrInlineData { - u_int32_t recordType; /* == kHFSPlusAttrInlineData*/ - u_int32_t reserved; - u_int32_t logicalSize; /* size in bytes of userData*/ - u_int8_t userData[2]; /* variable length; space allocated is a multiple of 2 bytes*/ + kHFSPlusAttrInlineData = 0x10, /* if size < kAttrOverflowSize */ + kHFSPlusAttrForkData = 0x20, /* if size >= kAttrOverflowSize */ + kHFSPlusAttrExtents = 0x30 /* overflow extents for large attributes */ }; -typedef struct HFSPlusAttrInlineData HFSPlusAttrInlineData; /* @@ -430,15 +422,58 @@ struct HFSPlusAttrExtents { }; typedef struct HFSPlusAttrExtents HFSPlusAttrExtents; +/* + * Attributes B-tree Data Record + * + * For small attributes, whose entire value is stored + * within a single B-tree record. + */ +struct HFSPlusAttrData { + u_int32_t recordType; /* == kHFSPlusAttrInlineData */ + u_int32_t reserved[2]; + u_int32_t attrSize; /* size of attribute data in bytes */ + u_int8_t attrData[2]; /* variable length */ +}; +typedef struct HFSPlusAttrData HFSPlusAttrData; + + +/* HFSPlusAttrInlineData is obsolete; use HFSPlusAttrData instead */ +struct HFSPlusAttrInlineData { + u_int32_t recordType; + u_int32_t reserved; + u_int32_t logicalSize; + u_int8_t userData[2]; +}; +typedef struct HFSPlusAttrInlineData HFSPlusAttrInlineData; + + /* A generic Attribute Record*/ union HFSPlusAttrRecord { u_int32_t recordType; - HFSPlusAttrInlineData inlineData; + HFSPlusAttrInlineData inlineData; /* NOT USED */ + HFSPlusAttrData attrData; HFSPlusAttrForkData forkData; HFSPlusAttrExtents overflowExtents; }; typedef union HFSPlusAttrRecord HFSPlusAttrRecord; +/* Attribute key */ +struct HFSPlusAttrKey { + u_int16_t keyLength; /* key length (in bytes) */ + u_int16_t pad; /* set to zero */ + u_int32_t fileID; /* file associated with attribute */ + u_int32_t startBlock; /* first attribute allocation block number for extents */ + u_int16_t attrNameLen; /* number of unicode characters */ + u_int16_t attrName[127]; /* attribute name (Unicode) */ +}; +typedef struct HFSPlusAttrKey HFSPlusAttrKey; + +#define kHFSPlusAttrKeyMaximumLength (sizeof(HFSPlusAttrKey) - sizeof(u_int16_t)) +#define kHFSPlusAttrKeyMinimumLength (kHFSPlusAttrKeyMaximumLength - (127 * sizeof(u_int16_t))) + +#endif /* __APPLE_API_UNSTABLE */ + + /* Key and node lengths */ enum { kHFSPlusExtentKeyMaximumLength = sizeof(HFSPlusExtentKey) - sizeof(u_int16_t), @@ -451,7 +486,6 @@ enum { kHFSPlusExtentMinNodeSize = 512, kHFSPlusAttrMinNodeSize = 4096 }; -#endif /* __APPLE_API_UNSTABLE */ /* HFS and HFS Plus volume attribute bits */ enum { @@ -495,7 +529,7 @@ struct HFSMasterDirectoryBlock { u_int16_t drAlBlSt; /* first allocation block in volume */ u_int32_t drNxtCNID; /* next unused catalog node ID */ u_int16_t drFreeBks; /* number of unused allocation blocks */ - u_char drVN[kHFSMaxVolumeNameChars + 1]; /* volume name */ + u_int8_t drVN[kHFSMaxVolumeNameChars + 1]; /* volume name */ u_int32_t drVolBkUp; /* date and time of last backup */ u_int16_t drVSeqNum; /* volume backup sequence number */ u_int32_t drWrCnt; /* volume write count */ @@ -626,7 +660,7 @@ enum { /* Catalog Key Name Comparison Type */
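To make the new key-length macros concrete: keyLength covers every field of HFSPlusAttrKey except keyLength itself, i.e. pad (2) + fileID (4) + startBlock (4) + attrNameLen (2) + 2 bytes per name unit. With the maximum 127-unit name that is 12 + 254 = 266 bytes, which is exactly sizeof(HFSPlusAttrKey) - sizeof(u_int16_t) as the macro states; with an empty name it is the 12-byte minimum. A small illustrative helper (not part of the patch):

    /* Illustrative: on-disk key length for an n-unit attribute name. */
    static u_int16_t
    example_attrkey_len(u_int16_t attrNameLen)
    {
        /* pad(2) + fileID(4) + startBlock(4) + attrNameLen(2) + name(2n) */
        return ((u_int16_t)(12 + 2 * attrNameLen));
    }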
enum { kHFSCaseFolding = 0xCF, /* case folding (case-insensitive) */ - kHFSBinaryCompare = 0xBC, /* binary compare (case-sensitive) */ + kHFSBinaryCompare = 0xBC /* binary compare (case-sensitive) */ }; /* JournalInfoBlock - Structure that describes where our journal lives */ @@ -652,8 +686,4 @@ enum { } #endif -#else -#warning hfs_format.h is not compatible with HFSVolumes.h (include only one) -#endif /* __HFSVOLUMES__ */ - #endif /* __HFS_FORMAT__ */ diff --git a/bsd/hfs/hfs_fsctl.h b/bsd/hfs/hfs_fsctl.h new file mode 100644 index 000000000..573b0c9e0 --- /dev/null +++ b/bsd/hfs/hfs_fsctl.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef _HFS_FSCTL_H_ +#define _HFS_FSCTL_H_ + +#include <sys/appleapiopts.h> + +#include <sys/ioccom.h> +#include <sys/time.h> + + +#ifdef __APPLE_API_UNSTABLE + +struct hfs_backingstoreinfo { + int signature; /* == 3419115 */ + int version; /* version of this struct (1) */ + int backingfd; /* disk image file (on backing fs) */ + int bandsize; /* sparse disk image band size */ +}; + + +/* HFS FS CONTROL COMMANDS */ + +#define HFSIOC_RESIZE_VOLUME _IOW('h', 2, u_int64_t) +#define HFS_RESIZE_VOLUME IOCBASECMD(HFSIOC_RESIZE_VOLUME) + +#define HFSIOC_CHANGE_NEXT_ALLOCATION _IOWR('h', 3, u_int32_t) +#define HFS_CHANGE_NEXT_ALLOCATION IOCBASECMD(HFSIOC_CHANGE_NEXT_ALLOCATION) + +#define HFSIOC_GETCREATETIME _IOR('h', 4, time_t) +#define HFS_GETCREATETIME IOCBASECMD(HFSIOC_GETCREATETIME) + +#define HFSIOC_SETBACKINGSTOREINFO _IOW('h', 7, struct hfs_backingstoreinfo) +#define HFS_SETBACKINGSTOREINFO IOCBASECMD(HFSIOC_SETBACKINGSTOREINFO) + +#define HFSIOC_CLRBACKINGSTOREINFO _IO('h', 8) +#define HFS_CLRBACKINGSTOREINFO IOCBASECMD(HFSIOC_CLRBACKINGSTOREINFO) + +#define HFSIOC_SETACLSTATE _IOW('h', 10, int32_t) +#define HFS_SETACLSTATE IOCBASECMD(HFSIOC_SETACLSTATE) + +#endif /* __APPLE_API_UNSTABLE */ + + +#endif /* ! _HFS_FSCTL_H_ */ diff --git a/bsd/hfs/hfs_hotfiles.c b/bsd/hfs/hfs_hotfiles.c index e5f94cd64..536136fe6 100644 --- a/bsd/hfs/hfs_hotfiles.c +++ b/bsd/hfs/hfs_hotfiles.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2003-2005 Apple Computer, Inc. All rights reserved. 
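As a usage note for the new hfs_fsctl.h above: user space issues these commands through fsctl(2), and the kernel side matches on the IOCBASECMD form (the HFS_* names). A minimal sketch, under the assumption that <sys/fsctl.h> declares fsctl() with this signature and that the caller has the needed privileges; error handling elided:

    #include <sys/types.h>
    #include <sys/fsctl.h>      /* fsctl(2) -- assumed location */
    #include <hfs/hfs_fsctl.h>

    /* Illustrative: ask HFS+ to grow the mounted volume to newsize bytes. */
    static int
    example_resize(const char *mntonname, u_int64_t newsize)
    {
        return (fsctl(mntonname, HFSIOC_RESIZE_VOLUME, &newsize, 0));
    }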
* * @APPLE_LICENSE_HEADER_START@ * @@ -27,6 +27,8 @@ #include <sys/malloc.h> #include <sys/ubc.h> #include <sys/vnode.h> +#include <sys/vnode_internal.h> +#include <sys/kauth.h> #include <hfs/hfs.h> #include <hfs/hfs_endian.h> @@ -90,6 +92,8 @@ typedef struct hotfile_data { hotfile_entry_t entries[1]; } hotfile_data_t; +static int hfs_recording_start (struct hfsmount *); +static int hfs_recording_stop (struct hfsmount *); /* @@ -97,21 +101,27 @@ typedef struct hotfile_data { */ static void hf_insert (hotfile_data_t *, hotfile_entry_t *); static void hf_delete (hotfile_data_t *, u_int32_t, u_int32_t); -static hotfile_entry_t * hf_lookup (hotfile_data_t *, u_int32_t, u_int32_t); static hotfile_entry_t * hf_coldest (hotfile_data_t *); static hotfile_entry_t * hf_getnewentry (hotfile_data_t *); -static int hf_getsortedlist (hotfile_data_t *, hotfilelist_t *); -static void hf_printtree (hotfile_entry_t *); +static void hf_getsortedlist (hotfile_data_t *, hotfilelist_t *); + +#if HFC_DEBUG +static hotfile_entry_t * hf_lookup (hotfile_data_t *, u_int32_t, u_int32_t); +static void hf_maxdepth(hotfile_entry_t *, int, int *); +static void hf_printtree (hotfile_entry_t *); +#endif /* * Hot File misc support functions. */ -static int hotfiles_collect (struct hfsmount *, struct proc *); -static int hotfiles_age (struct hfsmount *, struct proc *); -static int hotfiles_adopt (struct hfsmount *, struct proc *); +static int hotfiles_collect (struct hfsmount *); +static int hotfiles_age (struct hfsmount *); +static int hotfiles_adopt (struct hfsmount *); static int hotfiles_evict (struct hfsmount *, struct proc *); -static int hotfiles_refine (struct hfsmount *, struct proc *); +static int hotfiles_refine (struct hfsmount *); static int hotextents(struct hfsmount *, HFSPlusExtentDescriptor *); +static int hfs_addhotfile_internal(struct vnode *); + /* * Hot File Cluster B-tree (on disk) functions. @@ -124,6 +134,9 @@ static int hfc_comparekeys (HotFileKey *, HotFileKey *); char hfc_tag[] = "CLUSTERED HOT FILES B-TREE "; +extern int UBCINFOEXISTS(struct vnode * vp); +extern int hfs_vnop_write(struct vnop_write_args *ap); + /* *======================================================================== @@ -134,12 +147,13 @@ char hfc_tag[] = "CLUSTERED HOT FILES B-TREE "; /* * Start recording the hotest files on a file system. * + * Requires that the hfc_mutex be held. */ -__private_extern__ -int -hfs_recording_start(struct hfsmount *hfsmp, struct proc *p) +static int +hfs_recording_start(struct hfsmount *hfsmp) { hotfile_data_t *hotdata; + struct timeval tv; int maxentries; size_t size; int i; @@ -150,7 +164,7 @@ hfs_recording_start(struct hfsmount *hfsmp, struct proc *p) (hfsmp->hfs_flags & HFS_METADATA_ZONE) == 0) { return (EPERM); } - if (HFSTOVCB(hfsmp)->freeBlocks < (2 * hfsmp->hfs_hotfile_maxblks)) { + if (HFSTOVCB(hfsmp)->freeBlocks < (2 * (u_int32_t)hfsmp->hfs_hotfile_maxblks)) { return (ENOSPC); } if (hfsmp->hfc_stage != HFC_IDLE) { @@ -169,6 +183,8 @@ hfs_recording_start(struct hfsmount *hfsmp, struct proc *p) FREE(tmp, M_TEMP); } + microuptime(&tv); + /* * On first startup check for suspended recording. 
*/ @@ -182,14 +198,15 @@ hfs_recording_start(struct hfsmount *hfsmp, struct proc *p) (SWAP_BE32 (hotfileinfo.timeleft) > 0) && (SWAP_BE32 (hotfileinfo.timebase) > 0)) { hfsmp->hfc_maxfiles = SWAP_BE32 (hotfileinfo.maxfilecnt); - hfsmp->hfc_timeout = SWAP_BE32 (hotfileinfo.timeleft) + time.tv_sec ; + hfsmp->hfc_timeout = SWAP_BE32 (hotfileinfo.timeleft) + tv.tv_sec ; hfsmp->hfc_timebase = SWAP_BE32 (hotfileinfo.timebase); #if HFC_VERBOSE - printf("HFS: resume recording hot files (%d left)\n", SWAP_BE32 (hotfileinfo.timeleft)); + printf("Resume recording hot files on %s (%d secs left)\n", + hfsmp->vcbVN, SWAP_BE32 (hotfileinfo.timeleft)); #endif } else { hfsmp->hfc_maxfiles = HFC_DEFAULT_FILE_COUNT; - hfsmp->hfc_timebase = time.tv_sec + 1; + hfsmp->hfc_timebase = tv.tv_sec + 1; hfsmp->hfc_timeout = hfsmp->hfc_timebase + HFC_DEFAULT_DURATION; } (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); @@ -210,17 +227,16 @@ hfs_recording_start(struct hfsmount *hfsmp, struct proc *p) return (error); } #if HFC_VERBOSE - printf("HFS: begin recording hot files\n"); + printf("HFS: begin recording hot files on %s\n", hfsmp->vcbVN); #endif hfsmp->hfc_maxfiles = HFC_DEFAULT_FILE_COUNT; - hfsmp->hfc_timeout = time.tv_sec + HFC_DEFAULT_DURATION; + hfsmp->hfc_timeout = tv.tv_sec + HFC_DEFAULT_DURATION; /* Reset time base. */ if (hfsmp->hfc_timebase == 0) { - hfsmp->hfc_timebase = time.tv_sec + 1; + hfsmp->hfc_timebase = tv.tv_sec + 1; } else { - u_int32_t cumulativebase; - u_int32_t oldbase = hfsmp->hfc_timebase; + time_t cumulativebase; cumulativebase = hfsmp->hfc_timeout - (HFC_CUMULATIVE_CYCLES * HFC_DEFAULT_DURATION); hfsmp->hfc_timebase = MAX(hfsmp->hfc_timebase, cumulativebase); @@ -249,7 +265,6 @@ hfs_recording_start(struct hfsmount *hfsmp, struct proc *p) hotdata->hfsmp = hfsmp; hfsmp->hfc_recdata = hotdata; -out: hfsmp->hfc_stage = HFC_RECORDING; wakeup((caddr_t)&hfsmp->hfc_stage); return (0); @@ -257,23 +272,23 @@ out: /* * Stop recording the hotest files on a file system. + * + * Requires that the hfc_mutex be held. */ -__private_extern__ -int -hfs_recording_stop(struct hfsmount *hfsmp, struct proc *p) +static int +hfs_recording_stop(struct hfsmount *hfsmp) { hotfile_data_t *hotdata; hotfilelist_t *listp; + struct timeval tv; size_t size; enum hfc_stage newstage = HFC_IDLE; - void * tmp; int error; - if (hfsmp->hfc_stage != HFC_RECORDING) return (EPERM); - hotfiles_collect(hfsmp, p); + hotfiles_collect(hfsmp); if (hfsmp->hfc_stage != HFC_RECORDING) return (0); @@ -286,7 +301,7 @@ hfs_recording_stop(struct hfsmount *hfsmp, struct proc *p) * then dump the sample data */ #if HFC_VERBOSE - printf("HFS: end of hot file recording\n"); + printf("HFS: end of hot file recording on %s\n", hfsmp->vcbVN); #endif hotdata = (hotfile_data_t *)hfsmp->hfc_recdata; if (hotdata == NULL) @@ -318,7 +333,7 @@ hfs_recording_stop(struct hfsmount *hfsmp, struct proc *p) /* * Age the previous set of clustered hot files. */ - error = hotfiles_age(hfsmp, p); + error = hotfiles_age(hfsmp); if (error) { (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); hfsmp->hfc_filevp = NULL; @@ -333,14 +348,15 @@ hfs_recording_stop(struct hfsmount *hfsmp, struct proc *p) MALLOC(listp, hotfilelist_t *, size, M_TEMP, M_WAITOK); bzero(listp, size); - hf_getsortedlist(hotdata, listp); - listp->hfl_duration = time.tv_sec - hfsmp->hfc_timebase; + hf_getsortedlist(hotdata, listp); /* NOTE: destroys hot file tree! */ + microuptime(&tv); + listp->hfl_duration = tv.tv_sec - hfsmp->hfc_timebase; hfsmp->hfc_recdata = listp; /* * Account for duplicates. 
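The time.tv_sec to microuptime() conversions in these hunks all follow the same shape: take a monotonic uptime snapshot into a local struct timeval and do interval arithmetic on tv_sec, instead of reading the old global time. A minimal sketch of the pattern (the function name is illustrative; microuptime() is the real KPI used by the patch):

    /* Illustrative: elapsed seconds since a recorded timebase. */
    static time_t
    example_elapsed(time_t timebase)
    {
        struct timeval tv;

        microuptime(&tv);       /* monotonic uptime, not wall-clock time */
        return (tv.tv_sec - timebase);
    }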
*/ - error = hotfiles_refine(hfsmp, p); + error = hotfiles_refine(hfsmp); if (error) { (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); hfsmp->hfc_filevp = NULL; @@ -389,24 +405,34 @@ out: */ __private_extern__ int -hfs_recording_suspend(struct hfsmount *hfsmp, struct proc *p) +hfs_recording_suspend(struct hfsmount *hfsmp) { HotFilesInfo hotfileinfo; - hotfile_data_t *hotdata; + hotfile_data_t *hotdata = NULL; + struct timeval tv; int error; - if (hfsmp->hfc_stage != HFC_RECORDING) + if (hfsmp->hfc_stage == HFC_DISABLED) return (0); + lck_mtx_lock(&hfsmp->hfc_mutex); + + /* + * XXX NOTE + * A suspend can occur during eval/evict/adopt stage. + * In that case we would need to write out info and + * flush our HFBT vnode. Currently we just bail. + */ + hotdata = (hotfile_data_t *)hfsmp->hfc_recdata; - if (hotdata == NULL) { - hfsmp->hfc_stage = HFC_DISABLED; - return (0); + if (hotdata == NULL || hfsmp->hfc_stage != HFC_RECORDING) { + error = 0; + goto out; } hfsmp->hfc_stage = HFC_BUSY; #if HFC_VERBOSE - printf("HFS: suspend hot file recording\n"); + printf("HFS: suspend hot file recording on %s\n", hfsmp->vcbVN); #endif error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp); if (error) { @@ -414,78 +440,52 @@ hfs_recording_suspend(struct hfsmount *hfsmp, struct proc *p) goto out; } - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - hfs_global_shared_lock_release(hfsmp); - error = EINVAL; - goto out; - } + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out; + } + if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK) != 0) { + error = EPERM; + goto out; } - vn_lock(hfsmp->hfc_filevp, LK_EXCLUSIVE | LK_RETRY, p); + microuptime(&tv); hotfileinfo.magic = SWAP_BE32 (HFC_MAGIC); hotfileinfo.version = SWAP_BE32 (HFC_VERSION); hotfileinfo.duration = SWAP_BE32 (HFC_DEFAULT_DURATION); hotfileinfo.timebase = SWAP_BE32 (hfsmp->hfc_timebase); - hotfileinfo.timeleft = SWAP_BE32 (hfsmp->hfc_timeout - time.tv_sec); + hotfileinfo.timeleft = SWAP_BE32 (hfsmp->hfc_timeout - tv.tv_sec); hotfileinfo.threshold = SWAP_BE32 (hotdata->threshold); hotfileinfo.maxfileblks = SWAP_BE32 (hotdata->maxblocks); hotfileinfo.maxfilecnt = SWAP_BE32 (HFC_DEFAULT_FILE_COUNT); strcpy(hotfileinfo.tag, hfc_tag); (void) BTSetUserData(VTOF(hfsmp->hfc_filevp), &hotfileinfo, sizeof(hotfileinfo)); - (void) VOP_UNLOCK(hfsmp->hfc_filevp, 0, p); - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); - - (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); - hfsmp->hfc_filevp = NULL; + hfs_unlock(VTOC(hfsmp->hfc_filevp)); + hfs_end_transaction(hfsmp); out: - FREE(hotdata, M_TEMP); - - hfsmp->hfc_stage = HFC_DISABLED; - wakeup((caddr_t)&hfsmp->hfc_stage); - return (error); -} - -/* - * Abort a hot file recording session. 
- */ -__private_extern__ -int -hfs_recording_abort(struct hfsmount *hfsmp, struct proc *p) -{ - void * tmp; - - if (hfsmp->hfc_stage == HFC_DISABLED) - return (0); - - if (hfsmp->hfc_stage == HFC_BUSY) { - (void) tsleep((caddr_t)&hfsmp->hfc_stage, PINOD, "hfs_recording_abort", 0); + if (hfsmp->hfc_filevp) { + (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); + hfsmp->hfc_filevp = NULL; } - hfsmp->hfc_stage = HFC_BUSY; - - printf("HFS: terminate hot file recording\n"); - - if (hfsmp->hfc_recdata) { - tmp = hfsmp->hfc_recdata; + if (hotdata) { + FREE(hotdata, M_TEMP); hfsmp->hfc_recdata = NULL; - FREE(tmp, M_TEMP); } hfsmp->hfc_stage = HFC_DISABLED; wakeup((caddr_t)&hfsmp->hfc_stage); - return (0); +exit: + lck_mtx_unlock(&hfsmp->hfc_mutex); + return (error); } + /* * */ __private_extern__ int -hfs_recording_init(struct hfsmount *hfsmp, struct proc *p) +hfs_recording_init(struct hfsmount *hfsmp) { CatalogKey * keyp; CatalogRecord * datap; @@ -504,6 +504,14 @@ hfs_recording_init(struct hfsmount *hfsmp, struct proc *p) int inserted = 0; /* debug variables */ int filecount = 0; + /* + * For now, only the boot volume is supported. + */ + if ((vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) == 0) { + hfsmp->hfc_stage = HFC_DISABLED; + return (EPERM); + } + /* * If the Hot File btree exists then metadata zone is ready. */ @@ -513,15 +521,11 @@ hfs_recording_init(struct hfsmount *hfsmp, struct proc *p) hfsmp->hfc_stage = HFC_IDLE; return (0); } - /* - * For now, only the boot volume is supported. - */ - if ((HFSTOVFS(hfsmp)->mnt_flag & MNT_ROOTFS) == 0) { - hfsmp->hfs_flags &= ~HFS_METADATA_ZONE; - return (EPERM); - } error = hfc_btree_create(hfsmp, HFSTOVCB(hfsmp)->blockSize, HFC_DEFAULT_FILE_COUNT); if (error) { +#if HFC_VERBOSE + printf("Error %d creating hot file b-tree on %s \n", error, hfsmp->vcbVN); +#endif return (error); } /* @@ -531,6 +535,9 @@ hfs_recording_init(struct hfsmount *hfsmp, struct proc *p) panic("hfs_recording_init: hfc_filevp exists (vp = 0x%08x)", hfsmp->hfc_filevp); error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp); if (error) { +#if HFC_VERBOSE + printf("Error %d opening hot file b-tree on %s \n", error, hfsmp->vcbVN); +#endif return (error); } MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); @@ -557,15 +564,14 @@ hfs_recording_init(struct hfsmount *hfsmp, struct proc *p) /* * The writes to Hot File B-tree file are journaled. 
*/ - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - hfs_global_shared_lock_release(hfsmp); - error = EINVAL; - goto out1; - } + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out1; } - vn_lock(hfsmp->hfc_filevp, LK_EXCLUSIVE | LK_RETRY, p); + if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK) != 0) { + error = EPERM; + goto out1; + } filefork = VTOF(hfsmp->hfc_filevp); /* @@ -614,7 +620,7 @@ hfs_recording_init(struct hfsmount *hfsmp, struct proc *p) key->fileID = cnid; key->forkType = 0; data = 0x3f3f3f3f; - error = BTInsertRecord(filefork, iterator, &record, sizeof(data)); + error = BTInsertRecord(filefork, iterator, &record, record.itemSize); if (error) { printf("hfs_recording_init: BTInsertRecord failed %d (fileid %d)\n", error, key->fileID); error = MacToVFSError(error); @@ -627,7 +633,7 @@ hfs_recording_init(struct hfsmount *hfsmp, struct proc *p) key->fileID = cnid; key->forkType = 0; data = HFC_MINIMUM_TEMPERATURE; - error = BTInsertRecord(filefork, iterator, &record, sizeof(data)); + error = BTInsertRecord(filefork, iterator, &record, record.itemSize); if (error) { printf("hfs_recording_init: BTInsertRecord failed %d (fileid %d)\n", error, key->fileID); error = MacToVFSError(error); @@ -636,12 +642,9 @@ hfs_recording_init(struct hfsmount *hfsmp, struct proc *p) inserted++; } (void) BTFlushPath(filefork); - (void) VOP_UNLOCK(hfsmp->hfc_filevp, 0, p); + hfs_unlock(VTOC(hfsmp->hfc_filevp)); - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); + hfs_end_transaction(hfsmp); #if HFC_VERBOSE printf("%d files identified out of %d\n", inserted, filecount); #endif @@ -667,15 +670,20 @@ __private_extern__ int hfs_hotfilesync(struct hfsmount *hfsmp, struct proc *p) { - if ((HFSTOVFS(hfsmp)->mnt_kern_flag & MNTK_UNMOUNT) == 0 && hfsmp->hfc_stage) { + if (hfsmp->hfc_stage) { + struct timeval tv; + + lck_mtx_lock(&hfsmp->hfc_mutex); + switch (hfsmp->hfc_stage) { case HFC_IDLE: - (void) hfs_recording_start(hfsmp, p); + (void) hfs_recording_start(hfsmp); break; case HFC_RECORDING: - if (time.tv_sec > hfsmp->hfc_timeout) - (void) hfs_recording_stop(hfsmp, p); + microuptime(&tv); + if (tv.tv_sec > hfsmp->hfc_timeout) + (void) hfs_recording_stop(hfsmp); break; case HFC_EVICTION: @@ -683,9 +691,13 @@ hfs_hotfilesync(struct hfsmount *hfsmp, struct proc *p) break; case HFC_ADOPTION: - (void) hotfiles_adopt(hfsmp, p); + (void) hotfiles_adopt(hfsmp); + break; + default: break; } + + lck_mtx_unlock(&hfsmp->hfc_mutex); } return (0); } @@ -699,10 +711,27 @@ hfs_hotfilesync(struct hfsmount *hfsmp, struct proc *p) * NOTE: Since both the data and resource fork can be hot, * there can be two entries for the same file id. * + * Note: the cnode is locked on entry. 
*/ __private_extern__ int hfs_addhotfile(struct vnode *vp) +{ + hfsmount_t *hfsmp; + int error; + + hfsmp = VTOHFS(vp); + if (hfsmp->hfc_stage != HFC_RECORDING) + return (0); + + lck_mtx_lock(&hfsmp->hfc_mutex); + error = hfs_addhotfile_internal(vp); + lck_mtx_unlock(&hfsmp->hfc_mutex); + return (error); +} + +static int +hfs_addhotfile_internal(struct vnode *vp) { hotfile_data_t *hotdata; hotfile_entry_t *entry; @@ -714,9 +743,8 @@ hfs_addhotfile(struct vnode *vp) hfsmp = VTOHFS(vp); if (hfsmp->hfc_stage != HFC_RECORDING) return (0); - - if (!(vp->v_type == VREG || vp->v_type == VLNK) || - (vp->v_flag & (VSYSTEM | VSWAP))) { + + if ((!vnode_isreg(vp) && !vnode_islnk(vp)) || vnode_issystem(vp)) { return (0); } /* Skip resource forks for now. */ @@ -763,13 +791,14 @@ hfs_addhotfile(struct vnode *vp) } /* - * Remove a hot file to the recording list. + * Remove a hot file from the recording list. * * This can happen when a hot file becomes * an active vnode (active hot files are * not kept in the recording list until the * end of the recording period). * + * Note: the cnode is locked on entry. */ __private_extern__ int @@ -785,12 +814,9 @@ hfs_removehotfile(struct vnode *vp) if (hfsmp->hfc_stage != HFC_RECORDING) return (0); - if (!(vp->v_type == VREG || vp->v_type == VLNK) || - (vp->v_flag & (VSYSTEM | VSWAP))) { + if ((!vnode_isreg(vp) && !vnode_islnk(vp)) || vnode_issystem(vp)) { return (0); } - if ((hotdata = (hotfile_data_t *)hfsmp->hfc_recdata) == NULL) - return (0); ffp = VTOF(vp); cp = VTOC(vp); @@ -800,16 +826,23 @@ hfs_removehotfile(struct vnode *vp) return (0); } + lck_mtx_lock(&hfsmp->hfc_mutex); + if (hfsmp->hfc_stage != HFC_RECORDING) + goto out; + if ((hotdata = (hotfile_data_t *)hfsmp->hfc_recdata) == NULL) + goto out; + temperature = ffp->ff_bytesread / ffp->ff_size; if (temperature < hotdata->threshold) - return (0); + goto out; if (hotdata->coldest && (temperature >= hotdata->coldest->temperature)) { ++hotdata->refcount; hf_delete(hotdata, VTOC(vp)->c_fileid, temperature); --hotdata->refcount; } - +out: + lck_mtx_unlock(&hfsmp->hfc_mutex); return (0); } @@ -820,58 +853,35 @@ hfs_removehotfile(struct vnode *vp) *======================================================================== */ +static int +hotfiles_collect_callback(struct vnode *vp, __unused void *cargs) +{ + if ((vnode_isreg(vp) || vnode_islnk(vp)) && !vnode_issystem(vp)) + (void) hfs_addhotfile_internal(vp); + + return (VNODE_RETURNED); +} + /* * Add all active hot files to the recording list. 
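For intuition about the temperature checks in hfs_addhotfile/hfs_removehotfile: temperature is the integer ratio ff_bytesread / ff_size, roughly how many times the file's full contents have been read during the recording period. For example, 3.5 MB of cumulative reads on a 1 MB file gives temperature 3, and the file is only tracked when that meets the current threshold. The same computation as a hedged standalone sketch:

    /* Illustrative: hot file temperature as computed in this file. */
    static u_int32_t
    example_temperature(u_int64_t bytesread, u_int64_t size)
    {
        if (size == 0)          /* callers skip zero-length files */
            return (0);
        return ((u_int32_t)(bytesread / size));
    }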
*/ static int -hotfiles_collect(struct hfsmount *hfsmp, struct proc *p) +hotfiles_collect(struct hfsmount *hfsmp) { struct mount *mp = HFSTOVFS(hfsmp); - struct vnode *nvp, *vp; - struct cnode *cp; - int error; - if (vfs_busy(mp, LK_NOWAIT, 0, p)) + if (vfs_busy(mp, LK_NOWAIT)) return (0); -loop: - simple_lock(&mntvnode_slock); - for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { - if (vp->v_mount != mp) { - simple_unlock(&mntvnode_slock); - goto loop; - } - simple_lock(&vp->v_interlock); - nvp = vp->v_mntvnodes.le_next; - - if ((vp->v_flag & VSYSTEM) || - !(vp->v_type == VREG || vp->v_type == VLNK)) { - simple_unlock(&vp->v_interlock); - continue; - } - cp = VTOC(vp); - if (cp == NULL || vp->v_flag & (VXLOCK|VORECLAIM)) { - simple_unlock(&vp->v_interlock); - continue; - } - - simple_unlock(&mntvnode_slock); - error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, p); - if (error) { - if (error == ENOENT) - goto loop; - simple_lock(&mntvnode_slock); - continue; - } - (void) hfs_addhotfile(vp); - vput(vp); - - simple_lock(&mntvnode_slock); - } - - simple_unlock(&mntvnode_slock); + /* + * hotfiles_collect_callback will be called for each vnode + * hung off of this mount point + * the vnode will be + * properly referenced and unreferenced around the callback + */ + vnode_iterate(mp, 0, hotfiles_collect_callback, (void *)NULL); - vfs_unbusy(mp, p); + vfs_unbusy(mp); return (0); } @@ -882,7 +892,7 @@ loop: * This is called from within BTUpdateRecord. */ static int -update_callback(const HotFileKey *key, u_int32_t *data, u_int16_t datalen, u_int32_t *state) +update_callback(const HotFileKey *key, u_int32_t *data, u_int32_t *state) { if (key->temperature == HFC_LOOKUPTAG) *data = *state; @@ -893,11 +903,10 @@ update_callback(const HotFileKey *key, u_int32_t *data, u_int16_t datalen, u_int * Identify files already in hot area. 
*/ static int -hotfiles_refine(struct hfsmount *hfsmp, struct proc *p) +hotfiles_refine(struct hfsmount *hfsmp) { BTreeIterator * iterator; struct mount *mp; - struct vnode *vp; filefork_t * filefork; hotfilelist_t *listp; FSBufferDescriptor record; @@ -920,15 +929,14 @@ hotfiles_refine(struct hfsmount *hfsmp, struct proc *p) record.itemSize = sizeof(u_int32_t); record.itemCount = 1; - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - hfs_global_shared_lock_release(hfsmp); - error = EINVAL; - goto out; - } + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out; } - vn_lock(hfsmp->hfc_filevp, LK_EXCLUSIVE | LK_RETRY, p); + if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK) != 0) { + error = EPERM; + goto out; + } filefork = VTOF(hfsmp->hfc_filevp); for (i = 0; i < listp->hfl_count; ++i) { @@ -975,7 +983,7 @@ hotfiles_refine(struct hfsmount *hfsmp, struct proc *p) key->temperature = listp->hfl_hotfile[i].hf_temperature; key->fileID = listp->hfl_hotfile[i].hf_fileid; key->forkType = 0; - error = BTInsertRecord(filefork, iterator, &record, sizeof(data)); + error = BTInsertRecord(filefork, iterator, &record, record.itemSize); if (error) { printf("hotfiles_refine: BTInsertRecord failed %d (file %d)\n", error, key->fileID); error = MacToVFSError(error); @@ -991,12 +999,9 @@ hotfiles_refine(struct hfsmount *hfsmp, struct proc *p) } /* end for */ (void) BTFlushPath(filefork); - (void) VOP_UNLOCK(hfsmp->hfc_filevp, 0, p); + hfs_unlock(VTOC(hfsmp->hfc_filevp)); - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); + hfs_end_transaction(hfsmp); out: FREE(iterator, M_TEMP); return (error); @@ -1004,12 +1009,13 @@ out: /* * Move new hot files into hot area. + * + * Requires that the hfc_mutex be held. */ static int -hotfiles_adopt(struct hfsmount *hfsmp, struct proc *p) +hotfiles_adopt(struct hfsmount *hfsmp) { BTreeIterator * iterator; - struct mount *mp; struct vnode *vp; filefork_t * filefork; hotfilelist_t *listp; @@ -1023,7 +1029,6 @@ hotfiles_adopt(struct hfsmount *hfsmp, struct proc *p) int last; int error = 0; int startedtrans = 0; - int aquiredlock = 0; if ((listp = (hotfilelist_t *)hfsmp->hfc_recdata) == NULL) return (0); @@ -1031,10 +1036,13 @@ hotfiles_adopt(struct hfsmount *hfsmp, struct proc *p) if (hfsmp->hfc_stage != HFC_ADOPTION) { return (EBUSY); } + if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK) != 0) { + return (EPERM); + } + stage = hfsmp->hfc_stage; hfsmp->hfc_stage = HFC_BUSY; - mp = HFSTOVFS(hfsmp); blksmoved = 0; last = listp->hfl_next + HFC_FILESPERSYNC; if (last > listp->hfl_count) @@ -1049,7 +1057,6 @@ hotfiles_adopt(struct hfsmount *hfsmp, struct proc *p) record.itemSize = sizeof(u_int32_t); record.itemCount = 1; - vn_lock(hfsmp->hfc_filevp, LK_EXCLUSIVE | LK_RETRY, p); filefork = VTOF(hfsmp->hfc_filevp); for (i = listp->hfl_next; (i < last) && (blksmoved < HFC_BLKSPERSYNC); ++i) { @@ -1063,7 +1070,7 @@ hotfiles_adopt(struct hfsmount *hfsmp, struct proc *p) /* * Acquire a vnode for this file. 
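hotfiles_refine above shows the locking shape this patch substitutes everywhere for the old global-shared-lock plus journal_start_transaction pair: open a transaction, take the cnode lock on the hot-files B-tree vnode, do the B-tree work, then unwind in reverse order. A condensed sketch of that shape (the wrapper function is illustrative; the calls are the ones used in the patch):

    /* Illustrative: transaction + cnode-lock shape for B-tree updates. */
    static int
    example_btree_update(struct hfsmount *hfsmp)
    {
        int error;

        if (hfs_start_transaction(hfsmp) != 0)
            return (EINVAL);

        error = hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK);
        if (error == 0) {
            /* ... BTInsertRecord()/BTDeleteRecord() work goes here ... */
            (void) BTFlushPath(VTOF(hfsmp->hfc_filevp));
            hfs_unlock(VTOC(hfsmp->hfc_filevp));
        }
        hfs_end_transaction(hfsmp);
        return (error);
    }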
*/ - error = VFS_VGET(mp, &listp->hfl_hotfile[i].hf_fileid, &vp); + error = hfs_vget(hfsmp, listp->hfl_hotfile[i].hf_fileid, &vp, 0); if (error) { if (error == ENOENT) { error = 0; @@ -1072,23 +1079,26 @@ hotfiles_adopt(struct hfsmount *hfsmp, struct proc *p) } break; } - if (vp->v_type != VREG && vp->v_type != VLNK) { + if (!vnode_isreg(vp) && !vnode_islnk(vp)) { printf("hotfiles_adopt: huh, not a file %d (%d)\n", listp->hfl_hotfile[i].hf_fileid, VTOC(vp)->c_cnid); - vput(vp); - listp->hfl_hotfile[i].hf_temperature == 0; + hfs_unlock(VTOC(vp)); + vnode_put(vp); + listp->hfl_hotfile[i].hf_temperature = 0; listp->hfl_next++; continue; /* stale entry, go to next */ } if (hotextents(hfsmp, &VTOF(vp)->ff_extents[0])) { - vput(vp); - listp->hfl_hotfile[i].hf_temperature == 0; + hfs_unlock(VTOC(vp)); + vnode_put(vp); + listp->hfl_hotfile[i].hf_temperature = 0; listp->hfl_next++; listp->hfl_totalblocks -= listp->hfl_hotfile[i].hf_blocks; continue; /* stale entry, go to next */ } fileblocks = VTOF(vp)->ff_blocks; if (fileblocks > hfsmp->hfs_hotfile_freeblks) { - vput(vp); + hfs_unlock(VTOC(vp)); + vnode_put(vp); listp->hfl_next++; listp->hfl_totalblocks -= fileblocks; continue; /* entry too big, go to next */ @@ -1096,23 +1106,27 @@ hotfiles_adopt(struct hfsmount *hfsmp, struct proc *p) if ((blksmoved > 0) && (blksmoved + fileblocks) > HFC_BLKSPERSYNC) { - vput(vp); - break; + hfs_unlock(VTOC(vp)); + vnode_put(vp); + break; /* adopt this entry the next time around */ } /* Start a new transaction. */ - hfs_global_shared_lock_acquire(hfsmp); - aquiredlock = 1; - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - error = EINVAL; - vput(vp); - break; - } - startedtrans = 1; - } + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + hfs_unlock(VTOC(vp)); + vnode_put(vp); + break; + } + startedtrans = 1; + + if (VTOC(vp)->c_desc.cd_nameptr) + data = *(u_int32_t *)(VTOC(vp)->c_desc.cd_nameptr); + else + data = 0x3f3f3f3f; - error = hfs_relocate(vp, hfsmp->hfs_hotfile_start, p->p_ucred, p); - vput(vp); + error = hfs_relocate(vp, hfsmp->hfs_hotfile_start, kauth_cred_get(), current_proc()); + hfs_unlock(VTOC(vp)); + vnode_put(vp); if (error) break; @@ -1125,12 +1139,8 @@ hotfiles_adopt(struct hfsmount *hfsmp, struct proc *p) key->temperature = listp->hfl_hotfile[i].hf_temperature; key->fileID = listp->hfl_hotfile[i].hf_fileid; key->forkType = 0; - if (VTOC(vp)->c_desc.cd_nameptr) - data = *(u_int32_t *)(VTOC(vp)->c_desc.cd_nameptr); - else - data = 0x3f3f3f3f; - error = BTInsertRecord(filefork, iterator, &record, sizeof(data)); + error = BTInsertRecord(filefork, iterator, &record, record.itemSize); if (error) { printf("hotfiles_adopt: BTInsertRecord failed %d (fileid %d)\n", error, key->fileID); error = MacToVFSError(error); @@ -1144,7 +1154,7 @@ hotfiles_adopt(struct hfsmount *hfsmp, struct proc *p) key->fileID = listp->hfl_hotfile[i].hf_fileid; key->forkType = 0; data = listp->hfl_hotfile[i].hf_temperature; - error = BTInsertRecord(filefork, iterator, &record, sizeof(data)); + error = BTInsertRecord(filefork, iterator, &record, record.itemSize); if (error) { printf("hotfiles_adopt: BTInsertRecord failed %d (fileid %d)\n", error, key->fileID); error = MacToVFSError(error); @@ -1155,11 +1165,9 @@ hotfiles_adopt(struct hfsmount *hfsmp, struct proc *p) /* Transaction complete. 
*/ if (startedtrans) { - journal_end_transaction(hfsmp->jnl); - startedtrans = 0; + hfs_end_transaction(hfsmp); + startedtrans = 0; } - hfs_global_shared_lock_release(hfsmp); - aquiredlock = 0; blksmoved += fileblocks; listp->hfl_next++; @@ -1180,14 +1188,10 @@ hotfiles_adopt(struct hfsmount *hfsmp, struct proc *p) /* Finish any outstanding transactions. */ if (startedtrans) { (void) BTFlushPath(filefork); - journal_end_transaction(hfsmp->jnl); + hfs_end_transaction(hfsmp); startedtrans = 0; } - if (aquiredlock) { - hfs_global_shared_lock_release(hfsmp); - aquiredlock = 0; - } - (void) VOP_UNLOCK(hfsmp->hfc_filevp, 0, p); + hfs_unlock(VTOC(hfsmp->hfc_filevp)); if ((listp->hfl_next >= listp->hfl_count) || (hfsmp->hfs_hotfile_freeblks <= 0)) { #if HFC_VERBOSE @@ -1209,12 +1213,13 @@ hotfiles_adopt(struct hfsmount *hfsmp, struct proc *p) /* * Reclaim space by evicting the coldest files. + * + * Requires that the hfc_mutex be held. */ static int hotfiles_evict(struct hfsmount *hfsmp, struct proc *p) { BTreeIterator * iterator; - struct mount *mp; struct vnode *vp; HotFileKey * key; filefork_t * filefork; @@ -1225,7 +1230,6 @@ hotfiles_evict(struct hfsmount *hfsmp, struct proc *p) int fileblocks; int error = 0; int startedtrans = 0; - int aquiredlock = 0; if (hfsmp->hfc_stage != HFC_EVICTION) { return (EBUSY); @@ -1234,17 +1238,19 @@ hotfiles_evict(struct hfsmount *hfsmp, struct proc *p) if ((listp = (hotfilelist_t *)hfsmp->hfc_recdata) == NULL) return (0); + if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK) != 0) { + return (EPERM); + } + stage = hfsmp->hfc_stage; hfsmp->hfc_stage = HFC_BUSY; - mp = HFSTOVFS(hfsmp); filesmoved = blksmoved = 0; MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); bzero(iterator, sizeof(*iterator)); key = (HotFileKey*) &iterator->key; - vn_lock(hfsmp->hfc_filevp, LK_EXCLUSIVE | LK_RETRY, p); filefork = VTOF(hfsmp->hfc_filevp); while (listp->hfl_reclaimblks > 0 && @@ -1278,20 +1284,19 @@ hotfiles_evict(struct hfsmount *hfsmp, struct proc *p) /* * Aquire the vnode for this file. */ - error = VFS_VGET(mp, &key->fileID, &vp); + error = hfs_vget(hfsmp, key->fileID, &vp, 0); /* Start a new transaction. 
*/ - hfs_global_shared_lock_acquire(hfsmp); - aquiredlock = 1; - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - if (error == 0) - vput(vp); - error = EINVAL; - break; - } - startedtrans = 1; - } + if (hfs_start_transaction(hfsmp) != 0) { + if (error == 0) { + hfs_unlock(VTOC(vp)); + vnode_put(vp); + } + error = EINVAL; + break; + } + startedtrans = 1; + if (error) { if (error == ENOENT) { (void) BTDeleteRecord(filefork, iterator); @@ -1299,14 +1304,15 @@ hotfiles_evict(struct hfsmount *hfsmp, struct proc *p) (void) BTDeleteRecord(filefork, iterator); goto next; /* stale entry, go to next */ } else { - printf("hotfiles_evict: err %d getting file %d (%d)\n", + printf("hotfiles_evict: err %d getting file %d\n", error, key->fileID); } break; } - if (vp->v_type != VREG && vp->v_type != VLNK) { + if (!vnode_isreg(vp) && !vnode_islnk(vp)) { printf("hotfiles_evict: huh, not a file %d\n", key->fileID); - vput(vp); + hfs_unlock(VTOC(vp)); + vnode_put(vp); (void) BTDeleteRecord(filefork, iterator); key->temperature = HFC_LOOKUPTAG; (void) BTDeleteRecord(filefork, iterator); @@ -1315,7 +1321,8 @@ hotfiles_evict(struct hfsmount *hfsmp, struct proc *p) fileblocks = VTOF(vp)->ff_blocks; if ((blksmoved > 0) && (blksmoved + fileblocks) > HFC_BLKSPERSYNC) { - vput(vp); + hfs_unlock(VTOC(vp)); + vnode_put(vp); break; } /* @@ -1325,7 +1332,8 @@ hotfiles_evict(struct hfsmount *hfsmp, struct proc *p) #if HFC_VERBOSE printf("hotfiles_evict: file %d isn't hot!\n", key->fileID); #endif - vput(vp); + hfs_unlock(VTOC(vp)); + vnode_put(vp); (void) BTDeleteRecord(filefork, iterator); key->temperature = HFC_LOOKUPTAG; (void) BTDeleteRecord(filefork, iterator); @@ -1335,16 +1343,23 @@ hotfiles_evict(struct hfsmount *hfsmp, struct proc *p) /* * Relocate file out of hot area. */ - error = hfs_relocate(vp, HFSTOVCB(hfsmp)->nextAllocation, p->p_ucred, p); + error = hfs_relocate(vp, HFSTOVCB(hfsmp)->nextAllocation, proc_ucred(p), p); if (error) { - /* XXX skip to next record here! */ - printf("hotfiles_evict: err % relocating file\n", error, key->fileID); - vput(vp); - break; + printf("hotfiles_evict: err %d relocating file %d\n", error, key->fileID); + hfs_unlock(VTOC(vp)); + vnode_put(vp); + goto next; /* go to next */ } - (void) VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p); - vput(vp); + // + // We do not believe that this call to hfs_fsync() is + // necessary and it causes a journal transaction + // deadlock so we are removing it. + // + // (void) hfs_fsync(vp, MNT_WAIT, 0, p); + + hfs_unlock(VTOC(vp)); + vnode_put(vp); hfsmp->hfs_hotfile_freeblks += fileblocks; listp->hfl_reclaimblks -= fileblocks; @@ -1371,11 +1386,9 @@ next: /* Transaction complete. */ if (startedtrans) { - journal_end_transaction(hfsmp->jnl); + hfs_end_transaction(hfsmp); startedtrans = 0; } - hfs_global_shared_lock_release(hfsmp); - aquiredlock = 0; } /* end while */ @@ -1385,14 +1398,10 @@ next: /* Finish any outstanding transactions. */ if (startedtrans) { (void) BTFlushPath(filefork); - journal_end_transaction(hfsmp->jnl); + hfs_end_transaction(hfsmp); startedtrans = 0; } - if (aquiredlock) { - hfs_global_shared_lock_release(hfsmp); - aquiredlock = 0; - } - (void) VOP_UNLOCK(hfsmp->hfc_filevp, 0, p); + hfs_unlock(VTOC(hfsmp->hfc_filevp)); /* * Move to next stage when finished. @@ -1413,7 +1422,7 @@ next: * Age the existing records in the hot files b-tree. 
*/ static int -hotfiles_age(struct hfsmount *hfsmp, struct proc *p) +hotfiles_age(struct hfsmount *hfsmp) { BTreeInfoRec btinfo; BTreeIterator * iterator; @@ -1450,15 +1459,14 @@ hotfiles_age(struct hfsmount *hfsmp, struct proc *p) /* * Capture b-tree changes inside a transaction */ - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - hfs_global_shared_lock_release(hfsmp); - error = EINVAL; - goto out2; - } + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out2; } - vn_lock(hfsmp->hfc_filevp, LK_EXCLUSIVE | LK_RETRY, p); + if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK) != 0) { + error = EPERM; + goto out1; + } filefork = VTOF(hfsmp->hfc_filevp); error = BTGetInformation(filefork, 0, &btinfo); @@ -1527,7 +1535,7 @@ hotfiles_age(struct hfsmount *hfsmp, struct proc *p) newtemp = MAX(prev_key->temperature >> 1, 4); prev_key->temperature = newtemp; - error = BTInsertRecord(filefork, prev_iterator, &prev_record, sizeof(data)); + error = BTInsertRecord(filefork, prev_iterator, &prev_record, prev_record.itemSize); if (error) { printf("hfs_agehotfiles: BTInsertRecord failed %d (file %d)\n", error, prev_key->fileID); error = MacToVFSError(error); @@ -1559,13 +1567,9 @@ hotfiles_age(struct hfsmount *hfsmp, struct proc *p) #endif (void) BTFlushPath(filefork); out: - (void) VOP_UNLOCK(hfsmp->hfc_filevp, 0, p); - - if (hfsmp->jnl) { - // hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); + hfs_unlock(VTOC(hfsmp->hfc_filevp)); +out1: + hfs_end_transaction(hfsmp); out2: FREE(iterator, M_TEMP); return (error); @@ -1608,36 +1612,34 @@ hotextents(struct hfsmount *hfsmp, HFSPlusExtentDescriptor * extents) /* * Open the hot files b-tree for writing. * - * On successful exit the vnode has a reference but is unlocked. + * On successful exit the vnode has a reference but not an iocount. 
*/ static int hfc_btree_open(struct hfsmount *hfsmp, struct vnode **vpp) { struct proc *p; struct vnode *vp; - struct cat_desc cdesc = {0}; + struct cat_desc cdesc; struct cat_attr cattr; struct cat_fork cfork; static char filename[] = HFC_FILENAME; int error; int retry = 0; + int lockflags; *vpp = NULL; p = current_proc(); + bzero(&cdesc, sizeof(cdesc)); cdesc.cd_parentcnid = kRootDirID; cdesc.cd_nameptr = filename; cdesc.cd_namelen = strlen(filename); - /* Lock catalog b-tree */ - error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, p); - if (error) - return (error); + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - error = cat_lookup(hfsmp, &cdesc, 0, &cdesc, &cattr, &cfork); + error = cat_lookup(hfsmp, &cdesc, 0, &cdesc, &cattr, &cfork, NULL); - /* Unlock catalog b-tree */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + hfs_systemfile_unlock(hfsmp, lockflags); if (error) { printf("hfc_btree_open: cat_lookup error %d\n", error); @@ -1645,18 +1647,19 @@ hfc_btree_open(struct hfsmount *hfsmp, struct vnode **vpp) } again: cdesc.cd_flags |= CD_ISMETA; - error = hfs_getnewvnode(hfsmp, NULL, &cdesc, 0, &cattr, &cfork, &vp); + error = hfs_getnewvnode(hfsmp, NULL, NULL, &cdesc, 0, &cattr, &cfork, &vp); if (error) { printf("hfc_btree_open: hfs_getnewvnode error %d\n", error); cat_releasedesc(&cdesc); return (error); } - if ((vp->v_flag & VSYSTEM) == 0) { + if (!vnode_issystem(vp)) { #if HFC_VERBOSE printf("hfc_btree_open: file has UBC, try again\n"); #endif - vput(vp); - vgone(vp); + hfs_unlock(VTOC(vp)); + vnode_recycle(vp); + vnode_put(vp); if (retry++ == 0) goto again; else @@ -1668,28 +1671,16 @@ again: if (error) { printf("hfc_btree_open: BTOpenPath error %d\n", error); error = MacToVFSError(error); - } else { -#if HFC_VERBOSE - struct BTreeInfoRec btinfo; - - if (BTGetInformation(VTOF(vp), 0, &btinfo) == 0) { - printf("btinfo: nodeSize %d\n", btinfo.nodeSize); - printf("btinfo: maxKeyLength %d\n", btinfo.maxKeyLength); - printf("btinfo: treeDepth %d\n", btinfo.treeDepth); - printf("btinfo: numRecords %d\n", btinfo.numRecords); - printf("btinfo: numNodes %d\n", btinfo.numNodes); - printf("btinfo: numFreeNodes %d\n", btinfo.numFreeNodes); - } -#endif } - VOP_UNLOCK(vp, 0, p); /* unlocked with a single reference */ - if (error) - vrele(vp); - else + hfs_unlock(VTOC(vp)); + if (error == 0) { *vpp = vp; + vnode_ref(vp); /* keep a reference while its open */ + } + vnode_put(vp); - if ((vp->v_flag & VSYSTEM) == 0) + if (!vnode_issystem(vp)) panic("hfc_btree_open: not a system file (vp = 0x%08x)", vp); if (UBCINFOEXISTS(vp)) @@ -1701,31 +1692,32 @@ again: /* * Close the hot files b-tree. * - * On entry the vnode is not locked but has a reference. + * On entry the vnode has a reference. 
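The revised comment above ("a reference but not an iocount") is the behavioral crux of the hfc_btree_open changes: the hot-files vnode is pinned only with a persistent usecount via vnode_ref(), so every later use must bracket itself with a transient iocount, exactly as hfc_btree_close below does. A minimal sketch of that pattern (the wrapper is illustrative):

    /* Illustrative: using a vnode held only by vnode_ref(). */
    static void
    example_use_hfc_vnode(struct hfsmount *hfsmp)
    {
        struct vnode *vp = hfsmp->hfc_filevp;

        if (vp != NULL && vnode_get(vp) == 0) { /* take an iocount */
            /* ... safe to issue I/O against vp here ... */
            vnode_put(vp);                      /* drop the iocount */
        }
    }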
*/ static int hfc_btree_close(struct hfsmount *hfsmp, struct vnode *vp) { struct proc *p = current_proc(); - int error; + int error = 0; if (hfsmp->jnl) { journal_flush(hfsmp->jnl); } - if (vget(vp, LK_EXCLUSIVE, p) == 0) { - (void) VOP_FSYNC(vp, NOCRED, MNT_WAIT, p); - error = BTClosePath(VTOF(vp)); - if (error) - printf("hfc_btree_close: BTClosePath error %d\n", error); - vput(vp); + if (vnode_get(vp) == 0) { + error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK); + if (error == 0) { + (void) hfs_fsync(vp, MNT_WAIT, 0, p); + error = BTClosePath(VTOF(vp)); + hfs_unlock(VTOC(vp)); + } + vnode_rele(vp); + vnode_recycle(vp); + vnode_put(vp); } - vrele(vp); - vgone(vp); - vp = NULL; - return (0); + return (error); } /* @@ -1735,34 +1727,65 @@ hfc_btree_close(struct hfsmount *hfsmp, struct vnode *vp) static int hfc_btree_create(struct hfsmount *hfsmp, int nodesize, int entries) { - struct proc *p; - struct nameidata nd; - struct vnode *vp; - char path[128]; + struct vnode *dvp = NULL; + struct vnode *vp = NULL; + struct cnode *cp = NULL; + struct vfs_context context; + struct vnode_attr va; + struct componentname cname; + static char filename[] = HFC_FILENAME; int error; + context.vc_proc = current_proc(); + context.vc_ucred = kauth_cred_get(); if (hfsmp->hfc_filevp) panic("hfc_btree_create: hfc_filevp exists (vp = 0x%08x)", hfsmp->hfc_filevp); - p = current_proc(); - snprintf(path, sizeof(path), "%s/%s", - hfsmp->hfs_mp->mnt_stat.f_mntonname, HFC_FILENAME); - NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, p); - if ((error = vn_open(&nd, O_CREAT | FWRITE, S_IRUSR | S_IWUSR)) != 0) { + error = VFS_ROOT(HFSTOVFS(hfsmp), &dvp, &context); + if (error) { return (error); } - vp = nd.ni_vp; - + cname.cn_nameiop = CREATE; + cname.cn_flags = ISLASTCN; + cname.cn_context = &context; + cname.cn_pnbuf = filename; + cname.cn_pnlen = sizeof(filename); + cname.cn_nameptr = filename; + cname.cn_namelen = strlen(filename); + cname.cn_hash = 0; + cname.cn_consume = 0; + + VATTR_INIT(&va); + VATTR_SET(&va, va_type, VREG); + VATTR_SET(&va, va_mode, S_IFREG | S_IRUSR | S_IWUSR); + VATTR_SET(&va, va_uid, 0); + VATTR_SET(&va, va_gid, 0); + + /* call ourselves directly, ignore the higher-level VFS file creation code */ + error = VNOP_CREATE(dvp, &vp, &cname, &va, &context); + if (error) { + printf("HFS: error %d creating HFBT on %s\n", error, HFSTOVCB(hfsmp)->vcbVN); + goto out; + } + if (dvp) { + vnode_put(dvp); + dvp = NULL; + } + if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) { + goto out; + } + cp = VTOC(vp); + /* Don't use non-regular files or files with links. */ - if (vp->v_type != VREG || VTOC(vp)->c_nlink != 1) { + if (!vnode_isreg(vp) || cp->c_nlink != 1) { error = EFTYPE; goto out; } printf("HFS: created HFBT on %s\n", HFSTOVCB(hfsmp)->vcbVN); - if (VTOF(vp)->ff_size < nodesize) { + if (VTOF(vp)->ff_size < (u_int64_t)nodesize) { caddr_t buffer; u_int16_t *index; u_int16_t offset; @@ -1776,7 +1799,7 @@ hfc_btree_create(struct hfsmount *hfsmp, int nodesize, int entries) /* * Mark it invisible (truncate will pull these changes). 
*/ - ((FndrFileInfo *)&VTOC(vp)->c_finderinfo[0])->fdFlags |= + ((FndrFileInfo *)&cp->c_finderinfo[0])->fdFlags |= SWAP_BE16 (kIsInvisible + kNameLocked); if (kmem_alloc(kernel_map, (vm_offset_t *)&buffer, nodesize)) { @@ -1831,29 +1854,49 @@ hfc_btree_create(struct hfsmount *hfsmp, int nodesize, int entries) - kBTreeHeaderUserBytes - (4 * sizeof(int16_t)); index[(nodesize / 2) - 4] = SWAP_BE16 (offset); - vp->v_flag |= VNOFLUSH; - error = VOP_TRUNCATE(vp, (off_t)filesize, IO_NDELAY, NOCRED, p); + vnode_setnoflush(vp); + error = hfs_truncate(vp, (off_t)filesize, IO_NDELAY, 0, &context); + if (error) { + printf("HFS: error %d growing HFBT on %s\n", error, HFSTOVCB(hfsmp)->vcbVN); + goto out; + } + cp->c_flag |= C_ZFWANTSYNC; + cp->c_zftimeout = 1; + if (error == 0) { - struct iovec aiov; - struct uio auio; - - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - aiov.iov_base = buffer; - aiov.iov_len = filesize; - auio.uio_resid = nodesize; - auio.uio_offset = (off_t)(0); - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_WRITE; - auio.uio_procp = (struct proc *)0; - error = VOP_WRITE(vp, &auio, 0, kernproc->p_ucred); + struct vnop_write_args args; + uio_t auio; + + auio = uio_create(1, 0, UIO_SYSSPACE32, UIO_WRITE); + uio_addiov(auio, (uintptr_t)buffer, nodesize); + + args.a_desc = &vnop_write_desc; + args.a_vp = vp; + args.a_uio = auio; + args.a_ioflag = 0; + args.a_context = &context; + + hfs_unlock(cp); + cp = NULL; + + error = hfs_vnop_write(&args); + if (error) + printf("HFS: error %d writing HFBT on %s\n", error, HFSTOVCB(hfsmp)->vcbVN); + + uio_free(auio); } kmem_free(kernel_map, (vm_offset_t)buffer, nodesize); } out: - (void) VOP_UNLOCK(vp, 0, p); - (void) vn_close(vp, FWRITE, kernproc->p_ucred, p); - vgone(vp); + if (dvp) { + vnode_put(dvp); + } + if (vp) { + if (cp) + hfs_unlock(cp); + vnode_recycle(vp); + vnode_put(vp); + } return (error); } @@ -1903,6 +1946,7 @@ hfc_comparekeys(HotFileKey *searchKey, HotFileKey *trialKey) /* * Lookup a hot file entry in the tree. */ +#if HFC_DEBUG static hotfile_entry_t * hf_lookup(hotfile_data_t *hotdata, u_int32_t fileid, u_int32_t temperature) { @@ -1923,6 +1967,7 @@ hf_lookup(hotfile_data_t *hotdata, u_int32_t fileid, u_int32_t temperature) } return (entry); } +#endif /* * Insert a hot file entry into the tree. @@ -1993,6 +2038,21 @@ hf_coldest(hotfile_data_t *hotdata) return (entry); } +/* + * Find the hottest entry in the tree. + */ +static hotfile_entry_t * +hf_hottest(hotfile_data_t *hotdata) +{ + hotfile_entry_t *entry = hotdata->rootentry; + + if (entry) { + while (entry->right) + entry = entry->right; + } + return (entry); +} + /* * Delete a hot file entry from the tree. */ @@ -2093,38 +2153,32 @@ hf_getnewentry(hotfile_data_t *hotdata) /* - * Visit the tree in desending order. + * Generate a sorted list of hot files (hottest to coldest). + * + * As a side effect, every node in the hot file tree will be + * deleted (moved to the free list). */ static void -hf_sortlist(hotfile_entry_t * root, int *index, hotfilelist_t *sortedlist) -{ - if (root) { - int i; - - hf_sortlist(root->right, index, sortedlist); - i = *index; - ++(*index); - sortedlist->hfl_hotfile[i].hf_fileid = root->fileid; - sortedlist->hfl_hotfile[i].hf_temperature = root->temperature; - sortedlist->hfl_hotfile[i].hf_blocks = root->blocks; - sortedlist->hfl_totalblocks += root->blocks; - hf_sortlist(root->left, index, sortedlist); - } -} - -/* - * Generate a sorted list of hot files. 
- */ -static int hf_getsortedlist(hotfile_data_t * hotdata, hotfilelist_t *sortedlist) { - int index = 0; - - hf_sortlist(hotdata->rootentry, &index, sortedlist); + int i = 0; + hotfile_entry_t *entry; + + while ((entry = hf_hottest(hotdata)) != NULL) { + sortedlist->hfl_hotfile[i].hf_fileid = entry->fileid; + sortedlist->hfl_hotfile[i].hf_temperature = entry->temperature; + sortedlist->hfl_hotfile[i].hf_blocks = entry->blocks; + sortedlist->hfl_totalblocks += entry->blocks; + ++i; - sortedlist->hfl_count = hotdata->activefiles; + hf_delete(hotdata, entry->fileid, entry->temperature); + } - return (index); + sortedlist->hfl_count = i; + +#if HFC_VERBOSE + printf("HFS: hf_getsortedlist returned %d entries\n", i); +#endif } diff --git a/bsd/hfs/hfs_hotfiles.h b/bsd/hfs/hfs_hotfiles.h index b1370b849..a9db6d619 100644 --- a/bsd/hfs/hfs_hotfiles.h +++ b/bsd/hfs/hfs_hotfiles.h @@ -39,7 +39,7 @@ #define HFC_CUMULATIVE_CYCLES 4 #define HFC_MAXIMUM_FILE_COUNT 5000 #define HFC_MAXIMUM_FILESIZE (10 * 1024 * 1024) -#define HFC_MINIMUM_TEMPERATURE 16 +#define HFC_MINIMUM_TEMPERATURE 24 /* @@ -107,15 +107,15 @@ struct vnode; */ int hfs_hotfilesync (struct hfsmount *, struct proc *); -int hfs_recording_init(struct hfsmount *, struct proc *); -int hfs_recording_start (struct hfsmount *, struct proc *); -int hfs_recording_stop (struct hfsmount *, struct proc *); -int hfs_recording_suspend (struct hfsmount *, struct proc *); -int hfs_recording_abort (struct hfsmount *, struct proc *); +int hfs_recording_init(struct hfsmount *); +int hfs_recording_suspend (struct hfsmount *); int hfs_addhotfile (struct vnode *); int hfs_removehotfile (struct vnode *); +int hfs_relocate(struct vnode *, u_int32_t, kauth_cred_t, struct proc *); + + #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ #endif /* __HFS_HOTFILES__ */ diff --git a/bsd/hfs/hfs_link.c b/bsd/hfs/hfs_link.c index 97dfde7bb..8ab33cf97 100644 --- a/bsd/hfs/hfs_link.c +++ b/bsd/hfs/hfs_link.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -25,7 +25,6 @@ #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/mount.h> -#include <sys/namei.h> #include <sys/stat.h> #include <sys/vnode.h> #include <vfs/vfs_support.h> @@ -37,6 +36,9 @@ #include "hfs_endian.h" +static int cur_link_id = 0; + + /* * Create a new indirect link * @@ -72,25 +74,12 @@ createindirectlink(struct hfsmount *hfsmp, u_int32_t linknum, fip->fdCreator = SWAP_BE32 (kHFSPlusCreator); /* 'hfs+' */ fip->fdFlags = SWAP_BE16 (kHasBeenInited); - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - hfs_global_shared_lock_release(hfsmp); - return EINVAL; - } - } - /* Create the indirect link directly in the catalog */ result = cat_create(hfsmp, &desc, &attr, NULL); if (result == 0 && linkcnid != NULL) *linkcnid = attr.ca_fileid; - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); - return (result); } @@ -107,15 +96,22 @@ static int hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp, struct componentname *cnp) { - struct proc *p = cnp->cn_proc; + vfs_context_t ctx = cnp->cn_context; + struct proc *p = vfs_context_proc(ctx); u_int32_t indnodeno = 0; - char inodename[32]; + char inodename[32]; struct cat_desc to_desc; int newlink = 0; + int lockflags; int retval; - cat_cookie_t cookie = {0}; - + cat_cookie_t cookie; + cnid_t orig_cnid; + if (cur_link_id == 0) { + cur_link_id = ((random() & 0x3fffffff) + 100); + // printf("hfs: initializing cur link id to: 0x%.8x\n", cur_link_id); + } + /* We don't allow link nodes in our Private Meta Data folder! */ if (dcp->c_fileid == hfsmp->hfs_privdir_desc.cd_cnid) return (EPERM); @@ -123,16 +119,17 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp, if (hfs_freeblks(hfsmp, 0) == 0) return (ENOSPC); + bzero(&cookie, sizeof(cat_cookie_t)); /* Reserve some space in the Catalog file. */ if ((retval = cat_preflight(hfsmp, (2 * CAT_CREATE)+ CAT_RENAME, &cookie, p))) { return (retval); } - /* Lock catalog b-tree */ - retval = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); - if (retval) { - goto out2; - } + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + + // save off a copy of the current cnid so we can put + // it back if we get errors down below + orig_cnid = cp->c_desc.cd_cnid; /* * If this is a new hardlink then we need to create the data @@ -146,7 +143,12 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp, do { /* get a unique indirect node number */ - indnodeno = ((random() & 0x3fffffff) + 100); + if (retval == 0) { + indnodeno = cp->c_fileid; + } else { + indnodeno = cur_link_id++; + } + MAKE_INODE_NAME(inodename, indnodeno); /* move source file to data node directory */ @@ -156,6 +158,11 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp, retval = cat_rename(hfsmp, &cp->c_desc, &hfsmp->hfs_privdir_desc, &to_desc, NULL); + if (retval != 0 && retval != EEXIST) { + printf("hfs_makelink: cat_rename to %s failed (%d). 
fileid %d\n", + inodename, retval, cp->c_fileid); + } + } while (retval == EEXIST); if (retval) goto out; @@ -164,19 +171,16 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp, retval = createindirectlink(hfsmp, indnodeno, cp->c_parentcnid, cp->c_desc.cd_nameptr, &cp->c_desc.cd_cnid); if (retval) { - /* put it source file back */ - // XXXdbg - #if 1 - { - int err; - err = cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL); - if (err) - panic("hfs_makelink: error %d from cat_rename backout 1", err); - } - #else - (void) cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL); - #endif - goto out; + /* put the source file back */ + int err; + + // Put this back to what it was before. + cp->c_desc.cd_cnid = orig_cnid; + + err = cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL); + if (err) + panic("hfs_makelink: error %d from cat_rename backout 1", err); + goto out; } cp->c_rdev = indnodeno; } else { @@ -188,22 +192,20 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp, */ retval = createindirectlink(hfsmp, indnodeno, dcp->c_fileid, cnp->cn_nameptr, NULL); if (retval && newlink) { - /* Get rid of new link */ - (void) cat_delete(hfsmp, &cp->c_desc, &cp->c_attr); - - /* Put the source file back */ - // XXXdbg - #if 1 - { - int err; - err = cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL); - if (err) - panic("hfs_makelink: error %d from cat_rename backout 2", err); - } - #else - (void) cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL); - #endif - goto out; + int err; + + /* Get rid of new link */ + (void) cat_delete(hfsmp, &cp->c_desc, &cp->c_attr); + + // Put this back to what it was before. + cp->c_desc.cd_cnid = orig_cnid; + + /* Put the source file back */ + err = cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL); + if (err) + panic("hfs_makelink: error %d from cat_rename backout 2", err); + + goto out; } /* @@ -212,17 +214,41 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp, * - mark the cnode as a hard link */ if (newlink) { + vnode_t vp; + + if (retval != 0) { + panic("hfs_makelink: retval %d but newlink = 1!\n", retval); + } + hfsmp->hfs_privdir_attr.ca_entries++; - (void)cat_update(hfsmp, &hfsmp->hfs_privdir_desc, - &hfsmp->hfs_privdir_attr, NULL, NULL); + retval = cat_update(hfsmp, &hfsmp->hfs_privdir_desc, + &hfsmp->hfs_privdir_attr, NULL, NULL); + if (retval != 0) { + panic("hfs_makelink: cat_update of privdir failed! 
(%d)\n", + retval); + } hfs_volupdate(hfsmp, VOL_MKFILE, 0); - cp->c_flag |= (C_CHANGE | C_HARDLINK); + cp->c_flag |= C_HARDLINK; + if ((vp = cp->c_vp) != NULLVP) { + if (vnode_get(vp) == 0) { + vnode_set_hard_link(vp); + vnode_put(vp); + } + } + if ((vp = cp->c_rsrc_vp) != NULLVP) { + if (vnode_get(vp) == 0) { + vnode_set_hard_link(vp); + vnode_put(vp); + } + } + cp->c_touch_chgtime = TRUE; + cp->c_flag |= C_FORCEUPDATE; } + dcp->c_flag |= C_FORCEUPDATE; out: - /* Unlock catalog b-tree */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); -out2: + hfs_systemfile_unlock(hfsmp, lockflags); + cat_postflight(hfsmp, &cookie, p); return (retval); } @@ -233,128 +259,106 @@ out2: #% link vp U U U #% link tdvp L U U # - vop_link { + vnop_link { IN WILLRELE struct vnode *vp; IN struct vnode *targetPar_vp; IN struct componentname *cnp; + IN vfs_context_t context; */ __private_extern__ int -hfs_link(ap) - struct vop_link_args /* { - struct vnode *a_vp; - struct vnode *a_tdvp; - struct componentname *a_cnp; - } */ *ap; +hfs_vnop_link(struct vnop_link_args *ap) { struct hfsmount *hfsmp; struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; - struct proc *p = cnp->cn_proc; struct cnode *cp; struct cnode *tdcp; - struct timeval tv; - int error; + enum vtype v_type; + int error, ret, lockflags; + struct cat_desc cndesc; - hfsmp = VTOHFS(vp); - -#if HFS_DIAGNOSTIC - if ((cnp->cn_flags & HASBUF) == 0) - panic("hfs_link: no name"); -#endif - if (tdvp->v_mount != vp->v_mount) { - VOP_ABORTOP(tdvp, cnp); - error = EXDEV; - goto out2; - } - if (VTOVCB(tdvp)->vcbSigWord != kHFSPlusSigWord) + if (VTOVCB(tdvp)->vcbSigWord != kHFSPlusSigWord) { return err_link(ap); /* hfs disks don't support hard links */ - - if (hfsmp->hfs_privdir_desc.cd_cnid == 0) + } + if (VTOHFS(vp)->hfs_privdir_desc.cd_cnid == 0) { return err_link(ap); /* no private metadata dir, no links possible */ - - if (tdvp != vp && (error = vn_lock(vp, LK_EXCLUSIVE, p))) { - VOP_ABORTOP(tdvp, cnp); - goto out2; } - cp = VTOC(vp); + if (vnode_mount(tdvp) != vnode_mount(vp)) { + return (EXDEV); + } + if ((error = hfs_lockpair(VTOC(tdvp), VTOC(vp), HFS_EXCLUSIVE_LOCK))) { + return (error); + } tdcp = VTOC(tdvp); + cp = VTOC(vp); + hfsmp = VTOHFS(vp); if (cp->c_nlink >= HFS_LINK_MAX) { - VOP_ABORTOP(tdvp, cnp); error = EMLINK; - goto out1; + goto out; } if (cp->c_flags & (IMMUTABLE | APPEND)) { - VOP_ABORTOP(tdvp, cnp); error = EPERM; - goto out1; + goto out; } - if (vp->v_type == VBLK || vp->v_type == VCHR) { - VOP_ABORTOP(tdvp, cnp); + if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { + error = ENOENT; + goto out; + } + + v_type = vnode_vtype(vp); + if (v_type == VBLK || v_type == VCHR) { error = EINVAL; /* cannot link to a special file */ - goto out1; + goto out; } - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - hfs_global_shared_lock_release(hfsmp); - VOP_ABORTOP(tdvp, cnp); - error = EINVAL; /* cannot link to a special file */ - goto out1; - } + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; /* cannot link to a special file */ + goto out; } cp->c_nlink++; - cp->c_flag |= C_CHANGE; - tv = time; + cp->c_touch_chgtime = TRUE; - error = VOP_UPDATE(vp, &tv, &tv, 1); - if (!error) { - error = hfs_makelink(hfsmp, cp, tdcp, cnp); - } + error = hfs_makelink(hfsmp, cp, tdcp, cnp); if (error) { cp->c_nlink--; - cp->c_flag |= C_CHANGE; + hfs_volupdate(hfsmp, VOL_UPDATE, 0); } else { + /* Invalidate negative cache entries in 
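The do/while loop in hfs_makelink() above retries the rename until the generated inode name stops colliding: the first pass reuses the file's own id, and only collisions fall back to the cur_link_id counter. The same control flow in isolation, assuming MAKE_INODE_NAME produces names of the form iNode<number>, with rename_into_private_dir() as a hypothetical stand-in for cat_rename():

    #include <errno.h>
    #include <stdio.h>

    /* Hypothetical: returns 0, EEXIST on a name collision, or another errno. */
    extern int rename_into_private_dir(char *name);

    static unsigned int next_link_id = 100;     /* plays the role of cur_link_id */

    static int
    pick_unique_inode_name(unsigned int fileid, char name[32])
    {
            int err = 0;

            do {
                    /* First try the file id itself; fall back to the counter. */
                    unsigned int id = (err == 0) ? fileid : next_link_id++;

                    snprintf(name, 32, "iNode%u", id);
                    err = rename_into_private_dir(name);
            } while (err == EEXIST);

            return err;     /* 0, or the first non-EEXIST failure */
    }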
the destination directory */ + if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE) + cache_purge_negatives(tdvp); + /* Update the target directory and volume stats */ tdcp->c_nlink++; tdcp->c_entries++; - tdcp->c_flag |= C_CHANGE | C_UPDATE; - tv = time; - (void) VOP_UPDATE(tdvp, &tv, &tv, 0); + tdcp->c_touch_chgtime = TRUE; + tdcp->c_touch_modtime = TRUE; + tdcp->c_flag |= C_FORCEUPDATE; + + error = hfs_update(tdvp, 0); + if (error) { + panic("hfs_vnop_link: error updating tdvp 0x%x\n", tdvp); + } hfs_volupdate(hfsmp, VOL_MKFILE, (tdcp->c_cnid == kHFSRootFolderID)); } - // XXXdbg - need to do this here as well because cp could have changed - (void) VOP_UPDATE(vp, &tv, &tv, 1); - + cp->c_flag |= C_FORCEUPDATE; // otherwise hfs_update() might skip the update - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); - - /* free the pathname buffer */ - { - char *tmp = cnp->cn_pnbuf; - cnp->cn_pnbuf = NULL; - cnp->cn_flags &= ~HASBUF; - FREE_ZONE(tmp, cnp->cn_pnlen, M_NAMEI); + if ((ret = hfs_update(vp, TRUE)) != 0) { + panic("hfs_vnop_link: error %d updating vp @ 0x%x\n", ret, vp); } + + hfs_end_transaction(hfsmp); HFS_KNOTE(vp, NOTE_LINK); HFS_KNOTE(tdvp, NOTE_WRITE); - -out1: - if (tdvp != vp) - VOP_UNLOCK(vp, 0, p); -out2: - vput(tdvp); +out: + hfs_unlockpair(tdcp, cp); return (error); } diff --git a/bsd/hfs/hfs_lockf.c b/bsd/hfs/hfs_lockf.c deleted file mode 100644 index 63c1c0cb4..000000000 --- a/bsd/hfs/hfs_lockf.c +++ /dev/null @@ -1,707 +0,0 @@ -/* - * Copyright (c) 1999,2001-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ - -/* (c) 1997-1998,2001 Apple Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1982, 1986, 1989, 1993 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Scooter Morris at Genentech Inc. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. 
Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)hfs_lockf.c 1.0 - * derived from @(#)ufs_lockf.c 8.4 (Berkeley) 10/26/94 - */ - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/kernel.h> -#include <sys/file.h> -#include <sys/proc.h> -#include <sys/vnode.h> -#include <sys/malloc.h> -#include <sys/fcntl.h> - -#include "hfs_cnode.h" -#include "hfs_lockf.h" - -/* - * This variable controls the maximum number of processes that will - * be checked in doing deadlock detection. - */ -int hfsmaxlockdepth = MAXDEPTH; - -#ifdef LOCKF_DEBUG -#include <vm/vm.h> -#include <sys/sysctl.h> -int lockf_debug = 0; -struct ctldebug debug4 = { "lockf_debug", &lockf_debug }; -#endif - -#define NOLOCKF (struct hfslockf *)0 -#define SELF 0x1 -#define OTHERS 0x2 - -/* - * Set a byte-range lock. - */ -int -hfs_setlock(lock) - register struct hfslockf *lock; -{ - register struct hfslockf *block; - struct filefork *fork = lock->lf_fork; - struct hfslockf **prev, *overlap, *ltmp; - static char lockstr[] = "hfslockf"; - int ovcase, priority, needtolink, error; - -#ifdef LOCKF_DEBUG - if (lockf_debug & 1) - hfs_lprint("hfs_setlock", lock); -#endif /* LOCKF_DEBUG */ - - /* - * Set the priority - */ - priority = PLOCK; - if (lock->lf_type == F_WRLCK) - priority += 4; - priority |= PCATCH; - /* - * Scan lock list for this file looking for locks that would block us. - */ - while ((block = hfs_getblock(lock))) { - /* - * Free the structure and return if nonblocking. - */ - if ((lock->lf_flags & F_WAIT) == 0) { - FREE(lock, M_LOCKF); - return (EAGAIN); - } - /* - * We are blocked. Since flock style locks cover - * the whole file, there is no chance for deadlock. - * For byte-range locks we must check for deadlock. - * - * Deadlock detection is done by looking through the - * wait channels to see if there are any cycles that - * involve us. MAXDEPTH is set just to make sure we - * do not go off into neverland. 
- */ - if ((lock->lf_flags & F_POSIX) && - (block->lf_flags & F_POSIX)) { - register struct proc *wproc; - register struct hfslockf *waitblock; - int i = 0; - - /* The block is waiting on something */ - wproc = (struct proc *)block->lf_id; - while (wproc->p_wchan && - (wproc->p_wmesg == lockstr) && - (i++ < hfsmaxlockdepth)) { - waitblock = (struct hfslockf *)wproc->p_wchan; - /* Get the owner of the blocking lock */ - waitblock = waitblock->lf_next; - if ((waitblock->lf_flags & F_POSIX) == 0) - break; - wproc = (struct proc *)waitblock->lf_id; - if (wproc == (struct proc *)lock->lf_id) { - _FREE(lock, M_LOCKF); - return (EDEADLK); - } - } - } - /* - * For flock type locks, we must first remove - * any shared locks that we hold before we sleep - * waiting for an exclusive lock. - */ - if ((lock->lf_flags & F_FLOCK) && - lock->lf_type == F_WRLCK) { - lock->lf_type = F_UNLCK; - (void) hfs_clearlock(lock); - lock->lf_type = F_WRLCK; - } - /* - * Add our lock to the blocked list and sleep until we're free. - * Remember who blocked us (for deadlock detection). - */ - lock->lf_next = block; - TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block); -#ifdef LOCKF_DEBUG - if (lockf_debug & 1) { - hfs_lprint("hfs_setlock: blocking on", block); - hfs_lprintlist("hfs_setlock", block); - } -#endif /* LOCKF_DEBUG */ - if ((error = tsleep((caddr_t)lock, priority, lockstr, 0))) { - /* - * We may have been awakened by a signal (in - * which case we must remove ourselves from the - * blocked list) and/or by another process - * releasing a lock (in which case we have already - * been removed from the blocked list and our - * lf_next field set to NOLOCKF). - */ - if (lock->lf_next) - TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, - lf_block); - _FREE(lock, M_LOCKF); - return (error); - } - } - /* - * No blocks!! Add the lock. Note that we will - * downgrade or upgrade any overlapping locks this - * process already owns. - * - * Skip over locks owned by other processes. - * Handle any locks that overlap and are owned by ourselves. - */ - prev = &fork->ff_lockf; - block = fork->ff_lockf; - needtolink = 1; - for (;;) { - if ((ovcase = hfs_findoverlap(block, lock, SELF, &prev, &overlap))) - block = overlap->lf_next; - /* - * Six cases: - * 0) no overlap - * 1) overlap == lock - * 2) overlap contains lock - * 3) lock contains overlap - * 4) overlap starts before lock - * 5) overlap ends after lock - */ - switch (ovcase) { - case 0: /* no overlap */ - if (needtolink) { - *prev = lock; - lock->lf_next = overlap; - } - break; - - case 1: /* overlap == lock */ - /* - * If downgrading lock, others may be - * able to acquire it. - */ - if (lock->lf_type == F_RDLCK && - overlap->lf_type == F_WRLCK) - hfs_wakelock(overlap); - overlap->lf_type = lock->lf_type; - FREE(lock, M_LOCKF); - lock = overlap; /* for debug output below */ - break; - - case 2: /* overlap contains lock */ - /* - * Check for common starting point and different types. - */ - if (overlap->lf_type == lock->lf_type) { - _FREE(lock, M_LOCKF); - lock = overlap; /* for debug output below */ - break; - } - if (overlap->lf_start == lock->lf_start) { - *prev = lock; - lock->lf_next = overlap; - overlap->lf_start = lock->lf_end + 1; - } else - hfs_split(overlap, lock); - hfs_wakelock(overlap); - break; - - case 3: /* lock contains overlap */ - /* - * If downgrading lock, others may be able to - * acquire it, otherwise take the list. 
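The deadlock check in the deleted hfs_setlock() above walks the chain of "who is the blocker waiting on" edges, bailing out after MAXDEPTH hops and reporting EDEADLK when the chain cycles back to the requester. Its shape, separated from the proc/wchan plumbing (the waiter struct is an illustrative stand-in, not a kernel type):

    #define MAXDEPTH 50

    struct waiter {
            struct waiter *blocked_on;      /* NULL when runnable */
    };

    static int
    would_deadlock(struct waiter *self, struct waiter *blocker)
    {
            int i = 0;

            while (blocker != NULL && i++ < MAXDEPTH) {
                    if (blocker == self)
                            return 1;       /* chain cycles back to us */
                    blocker = blocker->blocked_on;
            }
            return 0;       /* chain ended, or too deep to keep checking */
    }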
- */ - if (lock->lf_type == F_RDLCK && - overlap->lf_type == F_WRLCK) { - hfs_wakelock(overlap); - } else { - while ((ltmp = overlap->lf_blkhd.tqh_first)) { - TAILQ_REMOVE(&overlap->lf_blkhd, ltmp, - lf_block); - TAILQ_INSERT_TAIL(&lock->lf_blkhd, - ltmp, lf_block); - } - } - /* - * Add the new lock if necessary and delete the overlap. - */ - if (needtolink) { - *prev = lock; - lock->lf_next = overlap->lf_next; - prev = &lock->lf_next; - needtolink = 0; - } else - *prev = overlap->lf_next; - _FREE(overlap, M_LOCKF); - continue; - - case 4: /* overlap starts before lock */ - /* - * Add lock after overlap on the list. - */ - lock->lf_next = overlap->lf_next; - overlap->lf_next = lock; - overlap->lf_end = lock->lf_start - 1; - prev = &lock->lf_next; - hfs_wakelock(overlap); - needtolink = 0; - continue; - - case 5: /* overlap ends after lock */ - /* - * Add the new lock before overlap. - */ - if (needtolink) { - *prev = lock; - lock->lf_next = overlap; - } - overlap->lf_start = lock->lf_end + 1; - hfs_wakelock(overlap); - break; - } - break; - } -#ifdef LOCKF_DEBUG - if (lockf_debug & 1) { - hfs_lprint("hfs_setlock: got the lock", lock); - hfs_lprintlist("hfs_setlock", lock); - } -#endif /* LOCKF_DEBUG */ - return (0); -} - -/* - * Remove a file fork's byte-range lock. - * - * Generally, find the lock (or an overlap to that lock) - * and remove it (or shrink it), then wakeup anyone we can. - */ -int -hfs_clearlock(unlock) - register struct hfslockf *unlock; -{ - struct filefork *fork = unlock->lf_fork; - register struct hfslockf *lf = fork->ff_lockf; - struct hfslockf *overlap, **prev; - int ovcase; - - if (lf == NOLOCKF) - return (0); -#ifdef LOCKF_DEBUG - if (unlock->lf_type != F_UNLCK) - panic("hfs_clearlock: bad type"); - if (lockf_debug & 1) - hfs_lprint("hfs_clearlock", unlock); -#endif /* LOCKF_DEBUG */ - prev = &fork->ff_lockf; - while ((ovcase = hfs_findoverlap(lf, unlock, SELF, &prev, &overlap))) { - /* - * Wakeup the list of locks to be retried. - */ - hfs_wakelock(overlap); - - switch (ovcase) { - - case 1: /* overlap == lock */ - *prev = overlap->lf_next; - FREE(overlap, M_LOCKF); - break; - - case 2: /* overlap contains lock: split it */ - if (overlap->lf_start == unlock->lf_start) { - overlap->lf_start = unlock->lf_end + 1; - break; - } - hfs_split(overlap, unlock); - overlap->lf_next = unlock->lf_next; - break; - - case 3: /* lock contains overlap */ - *prev = overlap->lf_next; - lf = overlap->lf_next; - _FREE(overlap, M_LOCKF); - continue; - - case 4: /* overlap starts before lock */ - overlap->lf_end = unlock->lf_start - 1; - prev = &overlap->lf_next; - lf = overlap->lf_next; - continue; - - case 5: /* overlap ends after lock */ - overlap->lf_start = unlock->lf_end + 1; - break; - } - break; - } -#ifdef LOCKF_DEBUG - if (lockf_debug & 1) - hfs_lprintlist("hfs_clearlock", unlock); -#endif /* LOCKF_DEBUG */ - return (0); -} - -/* - * Check whether there is a blocking lock, - * and if so return its process identifier. 
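The six overlap cases enumerated above reduce to interval arithmetic once the lf_id filtering is stripped away. The checks below are transcribed directly from the deleted hfs_findoverlap(), with [ls,le] as the existing lock and [s,e] as the request; an end of -1 means "through EOF":

    /* Returns the case numbers 0-5 used throughout this file. */
    static int
    overlap_case(long long ls, long long le, long long s, long long e)
    {
            if ((le != -1 && s > le) || (e != -1 && ls > e))
                    return 0;                       /* no overlap */
            if (ls == s && le == e)
                    return 1;                       /* overlap == lock */
            if (ls <= s && e != -1 && (le >= e || le == -1))
                    return 2;                       /* overlap contains lock */
            if (s <= ls && (e == -1 || (le != -1 && e >= le)))
                    return 3;                       /* lock contains overlap */
            if (ls < s && (le >= s || le == -1))
                    return 4;                       /* overlap starts before lock */
            if (ls > s && e != -1 && (le > e || le == -1))
                    return 5;                       /* overlap ends after lock */
            return -1;                              /* unreachable; the original panics */
    }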
- */ -int -hfs_getlock(lock, fl) - register struct hfslockf *lock; - register struct flock *fl; -{ - register struct hfslockf *block; - -#ifdef LOCKF_DEBUG - if (lockf_debug & 1) - hfs_lprint("hfs_getlock", lock); -#endif /* LOCKF_DEBUG */ - - if ((block = hfs_getblock(lock))) { - fl->l_type = block->lf_type; - fl->l_whence = SEEK_SET; - fl->l_start = block->lf_start; - if (block->lf_end == -1) - fl->l_len = 0; - else - fl->l_len = block->lf_end - block->lf_start + 1; - if (block->lf_flags & F_POSIX) - fl->l_pid = ((struct proc *)(block->lf_id))->p_pid; - else - fl->l_pid = -1; - } else { - fl->l_type = F_UNLCK; - } - return (0); -} - -/* - * Walk a file fork's list of locks and - * return the first blocking lock. - */ -struct hfslockf * -hfs_getblock(lock) - register struct hfslockf *lock; -{ - struct hfslockf **prev, *overlap, *lf = lock->lf_fork->ff_lockf; - int ovcase; - - prev = &lock->lf_fork->ff_lockf; - while ((ovcase = hfs_findoverlap(lf, lock, OTHERS, &prev, &overlap))) { - /* - * We've found an overlap, see if it blocks us - */ - if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) - return (overlap); - /* - * Nope, point to the next one on the list and - * see if it blocks us - */ - lf = overlap->lf_next; - } - return (NOLOCKF); -} - -/* - * Walk a file fork's list of locks to - * find an overlapping lock (if any). - * - * NOTE: this returns only the FIRST overlapping lock. There - * may be more than one. - */ -int -hfs_findoverlap(lf, lock, type, prev, overlap) - register struct hfslockf *lf; - struct hfslockf *lock; - int type; - struct hfslockf ***prev; - struct hfslockf **overlap; -{ - off_t start, end; - - *overlap = lf; - if (lf == NOLOCKF) - return (0); -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) - hfs_lprint("hfs_findoverlap: looking for overlap in", lock); -#endif /* LOCKF_DEBUG */ - start = lock->lf_start; - end = lock->lf_end; - while (lf != NOLOCKF) { - if (((type & SELF) && lf->lf_id != lock->lf_id) || - ((type & OTHERS) && lf->lf_id == lock->lf_id)) { - *prev = &lf->lf_next; - *overlap = lf = lf->lf_next; - continue; - } -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) - hfs_lprint("\tchecking", lf); -#endif /* LOCKF_DEBUG */ - /* - * OK, check for overlap - * - * Six cases: - * 0) no overlap - * 1) overlap == lock - * 2) overlap contains lock - * 3) lock contains overlap - * 4) overlap starts before lock - * 5) overlap ends after lock - */ - if ((lf->lf_end != -1 && start > lf->lf_end) || - (end != -1 && lf->lf_start > end)) { - /* Case 0 */ -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) - printf("no overlap\n"); -#endif /* LOCKF_DEBUG */ - if ((type & SELF) && end != -1 && lf->lf_start > end) - return (0); - *prev = &lf->lf_next; - *overlap = lf = lf->lf_next; - continue; - } - if ((lf->lf_start == start) && (lf->lf_end == end)) { - /* Case 1 */ -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) - printf("overlap == lock\n"); -#endif /* LOCKF_DEBUG */ - return (1); - } - if ((lf->lf_start <= start) && - (end != -1) && - ((lf->lf_end >= end) || (lf->lf_end == -1))) { - /* Case 2 */ -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) - printf("overlap contains lock\n"); -#endif /* LOCKF_DEBUG */ - return (2); - } - if (start <= lf->lf_start && - (end == -1 || - (lf->lf_end != -1 && end >= lf->lf_end))) { - /* Case 3 */ -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) - printf("lock contains overlap\n"); -#endif /* LOCKF_DEBUG */ - return (3); - } - if ((lf->lf_start < start) && - ((lf->lf_end >= start) || (lf->lf_end == -1))) { - /* Case 4 */ -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) - 
printf("overlap starts before lock\n"); -#endif /* LOCKF_DEBUG */ - return (4); - } - if ((lf->lf_start > start) && - (end != -1) && - ((lf->lf_end > end) || (lf->lf_end == -1))) { - /* Case 5 */ -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) - printf("overlap ends after lock\n"); -#endif /* LOCKF_DEBUG */ - return (5); - } - panic("hfs_findoverlap: default"); - } - return (0); -} - -/* - * Split a lock and a contained region into - * two or three locks as necessary. - */ -void -hfs_split(lock1, lock2) - register struct hfslockf *lock1; - register struct hfslockf *lock2; -{ - register struct hfslockf *splitlock; - -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) { - hfs_lprint("hfs_split", lock1); - hfs_lprint("splitting from", lock2); - } -#endif /* LOCKF_DEBUG */ - /* - * Check to see if spliting into only two pieces. - */ - if (lock1->lf_start == lock2->lf_start) { - lock1->lf_start = lock2->lf_end + 1; - lock2->lf_next = lock1; - return; - } - if (lock1->lf_end == lock2->lf_end) { - lock1->lf_end = lock2->lf_start - 1; - lock2->lf_next = lock1->lf_next; - lock1->lf_next = lock2; - return; - } - /* - * Make a new lock consisting of the last part of - * the encompassing lock - */ - MALLOC(splitlock, struct hfslockf *, sizeof *splitlock, M_LOCKF, M_WAITOK); - bcopy((caddr_t)lock1, (caddr_t)splitlock, sizeof *splitlock); - splitlock->lf_start = lock2->lf_end + 1; - TAILQ_INIT(&splitlock->lf_blkhd); - lock1->lf_end = lock2->lf_start - 1; - /* - * OK, now link it in - */ - splitlock->lf_next = lock1->lf_next; - lock2->lf_next = splitlock; - lock1->lf_next = lock2; -} - -/* - * Wakeup a blocklist - */ -void -hfs_wakelock(listhead) - struct hfslockf *listhead; -{ - register struct hfslockf *wakelock; - - while ((wakelock = listhead->lf_blkhd.tqh_first)) { - TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block); - wakelock->lf_next = NOLOCKF; -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) - hfs_lprint("hfs_wakelock: awakening", wakelock); -#endif /* LOCKF_DEBUG */ - wakeup((caddr_t)wakelock); - } -} - -#ifdef LOCKF_DEBUG -/* - * Print out a lock. - */ -hfs_lprint(tag, lock) - char *tag; - register struct hfslockf *lock; -{ - - printf("%s: lock 0x%lx for ", tag, lock); - if (lock->lf_flags & F_POSIX) - printf("proc %d", ((struct proc *)(lock->lf_id))->p_pid); - else - printf("id 0x%x", lock->lf_id); - printf(" in ino %d on dev <%d, %d>, %s, start %d, end %d", - FTOC(lock->lf_fork)->c_fileid, - major(FTOC(lock->lf_fork)->c_dev), - minor(FTOC(lock->lf_fork)->c_dev), - lock->lf_type == F_RDLCK ? "shared" : - lock->lf_type == F_WRLCK ? "exclusive" : - lock->lf_type == F_UNLCK ? "unlock" : - "unknown", lock->lf_start, lock->lf_end); - if (lock->lf_blkhd.tqh_first) - printf(" block 0x%x\n", lock->lf_blkhd.tqh_first); - else - printf("\n"); -} - -hfs_lprintlist(tag, lock) - char *tag; - struct hfslockf *lock; -{ - register struct hfslockf *lf, *blk; - - printf("%s: Lock list for ino %d on dev <%d, %d>:\n", - tag, FTOC(lock->lf_fork)->i_number, - major(FTOC(lock->lf_fork)->c_dev), - minor(FTOC(lock->lf_fork)->c_dev)); - for (lf = lock->lf_fork->ff_lockf; lf; lf = lf->lf_next) { - printf("\tlock 0x%lx for ", lf); - if (lf->lf_flags & F_POSIX) - printf("proc %d", ((struct proc *)(lf->lf_id))->p_pid); - else - printf("id 0x%x", lf->lf_id); - printf(", %s, start %d, end %d", - lf->lf_type == F_RDLCK ? "shared" : - lf->lf_type == F_WRLCK ? "exclusive" : - lf->lf_type == F_UNLCK ? 
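hfs_split() above carves a contained request out of an encompassing lock: equal starts or equal ends leave two locks, anything else leaves three (hence the MALLOC for the trailing piece). The underlying range arithmetic, sketched without the -1 EOF encoding:

    struct range { long long start, end; };

    /*
     * Remove cut from big (cut is assumed contained in big); writes the
     * surviving pieces of big to out[] and returns how many there are.
     * Together with cut itself that is the two- or three-lock result.
     */
    static int
    split_range(struct range big, struct range cut, struct range out[2])
    {
            int n = 0;

            if (big.start < cut.start)      /* leading remainder */
                    out[n++] = (struct range){ big.start, cut.start - 1 };
            if (big.end > cut.end)          /* trailing remainder */
                    out[n++] = (struct range){ cut.end + 1, big.end };
            return n;
    }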
"unlock" : - "unknown", lf->lf_start, lf->lf_end); - for (blk = lf->lf_blkhd.tqh_first; blk; - blk = blk->lf_block.tqe_next) { - printf("\n\t\tlock request 0x%lx for ", blk); - if (blk->lf_flags & F_POSIX) - printf("proc %d", - ((struct proc *)(blk->lf_id))->p_pid); - else - printf("id 0x%x", blk->lf_id); - printf(", %s, start %d, end %d", - blk->lf_type == F_RDLCK ? "shared" : - blk->lf_type == F_WRLCK ? "exclusive" : - blk->lf_type == F_UNLCK ? "unlock" : - "unknown", blk->lf_start, blk->lf_end); - if (blk->lf_blkhd.tqh_first) - panic("hfs_lprintlist: bad list"); - } - printf("\n"); - } -} -#endif /* LOCKF_DEBUG */ diff --git a/bsd/hfs/hfs_lockf.h b/bsd/hfs/hfs_lockf.h deleted file mode 100644 index 0ae8db758..000000000 --- a/bsd/hfs/hfs_lockf.h +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 1999,2001-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ - -/* (c) 1997-1998,2001 Apple Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1991, 1993 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Scooter Morris at Genentech Inc. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * - * @(#)hfs_lockf.h 1.0 5/5/98 - * derived from @(#)lockf.h 8.2 (Berkeley) 10/26/94 - */ - -#ifndef __HFS_LOCKF__ -#define __HFS_LOCKF__ - -#include <sys/appleapiopts.h> - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -/* - * The hfslockf structure is a kernel structure which contains the information - * associated with a byte range lock. The hfslockf structures are linked into - * a cnode's file fork. Locks are sorted by the starting byte of the lock for - * efficiency. - */ -TAILQ_HEAD(locklist, hfslockf); - -struct hfslockf { - short lf_flags; /* Semantics: F_POSIX, F_FLOCK, F_WAIT */ - short lf_type; /* Lock type: F_RDLCK, F_WRLCK */ - off_t lf_start; /* Byte # of the start of the lock */ - off_t lf_end; /* Byte # of the end of the lock (-1=EOF) */ - caddr_t lf_id; /* Id of the resource holding the lock */ - struct filefork *lf_fork; /* Back pointer to the file fork */ - struct hfslockf *lf_next; /* Pointer to the next lock on this fork */ - struct locklist lf_blkhd; /* List of requests blocked on this lock */ - TAILQ_ENTRY(hfslockf) lf_block;/* A request waiting for a lock */ -}; - -/* Maximum length of sleep chains to traverse to try and detect deadlock. */ -#define MAXDEPTH 50 - -__BEGIN_DECLS -void hfs_addblock __P((struct hfslockf *, struct hfslockf *)); -int hfs_clearlock __P((struct hfslockf *)); -int hfs_findoverlap __P((struct hfslockf *, - struct hfslockf *, int, struct hfslockf ***, struct hfslockf **)); -struct hfslockf * - hfs_getblock __P((struct hfslockf *)); -int hfs_getlock __P((struct hfslockf *, struct flock *)); -int hfs_setlock __P((struct hfslockf *)); -void hfs_split __P((struct hfslockf *, struct hfslockf *)); -void hfs_wakelock __P((struct hfslockf *)); -__END_DECLS - -#ifdef LOCKF_DEBUG -extern int lockf_debug; - -__BEGIN_DECLS -void hfs_lprint __P((char *, struct hfslockf *)); -void hfs_lprintlist __P((char *, struct hfslockf *)); -__END_DECLS -#endif /* LOCKF_DEBUG */ -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ -#endif /* __HFS_LOCKF__ */ diff --git a/bsd/hfs/hfs_lookup.c b/bsd/hfs/hfs_lookup.c index d707d1b18..1942c91d0 100644 --- a/bsd/hfs/hfs_lookup.c +++ b/bsd/hfs/hfs_lookup.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -65,27 +65,27 @@ * * hfs_lookup.c -- code to handle directory traversal on HFS/HFS+ volume */ -#define LEGACY_FORK_NAMES 0 #include <sys/param.h> -#include <sys/buf.h> #include <sys/file.h> #include <sys/mount.h> #include <sys/vnode.h> -#include <sys/namei.h> #include <sys/malloc.h> #include <sys/paths.h> +#include <sys/kdebug.h> +#include <sys/kauth.h> #include "hfs.h" #include "hfs_catalog.h" #include "hfs_cnode.h" +#define LEGACY_FORK_NAMES 1 static int forkcomponent(struct componentname *cnp, int *rsrcfork); #define _PATH_DATAFORKSPEC "/..namedfork/data" -#ifdef LEGACY_FORK_NAMES +#if LEGACY_FORK_NAMES #define LEGACY_RSRCFORKSPEC "/rsrc" #endif @@ -102,13 +102,6 @@ static int forkcomponent(struct componentname *cnp, int *rsrcfork); * creating, renaming, or deleting a directory entry may be calculated. * Notice that these are the only operations that can affect the directory of the target. * - * If flag has LOCKPARENT or'ed into it and the target of the pathname - * exists, lookup returns both the target and its parent directory locked. - * When creating or renaming and LOCKPARENT is specified, the target may - * not be ".". When deleting and LOCKPARENT is specified, the target may - * be "."., but the caller must check to ensure it does an vrele and vput - * instead of two vputs. - * * LOCKPARENT and WANTPARENT actually refer to the parent of the last item, * so if ISLASTCN is not set, they should be ignored. Also they are mutually exclusive, or * WANTPARENT really implies DONTLOCKPARENT. Either of them set means that the calling @@ -117,10 +110,6 @@ static int forkcomponent(struct componentname *cnp, int *rsrcfork); * Keeping the parent locked as long as possible protects from other processes * looking up the same item, so it has to be locked until the cnode is totally finished * - * This routine is actually used as VOP_CACHEDLOOKUP method, and the - * filesystem employs the generic hfs_cache_lookup() as VOP_LOOKUP - * method. - * * hfs_cache_lookup() performs the following for us: * check that it is a directory * check accessibility of directory @@ -130,7 +119,7 @@ static int forkcomponent(struct componentname *cnp, int *rsrcfork); * drop it * else * return name. - * return VOP_CACHEDLOOKUP() + * return hfs_lookup() * * Overall outline of hfs_lookup: * @@ -147,9 +136,10 @@ static int forkcomponent(struct componentname *cnp, int *rsrcfork); * nor deleting, add name to cache */ + /* - * Lookup *nm in directory *pvp, return it in *a_vpp. - * **a_vpp is held on exit. + * Lookup *cnp in directory *dvp, return it in *vpp. + * **vpp is held on exit. * We create a cnode for the file, but we do NOT open the file here. #% lookup dvp L ? ? @@ -162,65 +152,66 @@ static int forkcomponent(struct componentname *cnp, int *rsrcfork); * When should we lock parent_hp in here ?? 
*/ - -__private_extern__ -int -hfs_lookup(ap) - struct vop_cachedlookup_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - } */ *ap; +static int +hfs_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, vfs_context_t context, int *cnode_locked) { - struct vnode *dvp; /* vnode for directory being searched */ struct cnode *dcp; /* cnode for directory being searched */ struct vnode *tvp; /* target vnode */ struct hfsmount *hfsmp; - struct componentname *cnp; - struct ucred *cred; + kauth_cred_t cred; struct proc *p; int wantrsrc = 0; int forknamelen = 0; int flags; - int wantparent; int nameiop; int retval = 0; int isDot; - struct cat_desc desc = {0}; + struct cat_desc desc; struct cat_desc cndesc; struct cat_attr attr; struct cat_fork fork; - struct vnode **vpp; + int lockflags; - vpp = ap->a_vpp; - cnp = ap->a_cnp; - dvp = ap->a_dvp; dcp = VTOC(dvp); hfsmp = VTOHFS(dvp); *vpp = NULL; + *cnode_locked = 0; isDot = FALSE; tvp = NULL; nameiop = cnp->cn_nameiop; - cred = cnp->cn_cred; - p = cnp->cn_proc; flags = cnp->cn_flags; - wantparent = flags & (LOCKPARENT|WANTPARENT); + bzero(&desc, sizeof(desc)); + + cred = vfs_context_ucred(context); + p = vfs_context_proc(context); /* * First check to see if it is a . or .., else look it up. */ if (flags & ISDOTDOT) { /* Wanting the parent */ + cnp->cn_flags &= ~MAKEENTRY; goto found; /* .. is always defined */ } else if ((cnp->cn_nameptr[0] == '.') && (cnp->cn_namelen == 1)) { isDot = TRUE; + cnp->cn_flags &= ~MAKEENTRY; goto found; /* We always know who we are */ } else { /* Check fork suffix to see if we want the resource fork */ forknamelen = forkcomponent(cnp, &wantrsrc); + + /* Resource fork names are not cached. */ + if (wantrsrc) + cnp->cn_flags &= ~MAKEENTRY; + + if (hfs_lock(dcp, HFS_EXCLUSIVE_LOCK) != 0) { + goto notfound; + } /* No need to go to catalog if there are no children */ - if (dcp->c_entries == 0) + if (dcp->c_entries == 0) { + hfs_unlock(dcp); goto notfound; + } bzero(&cndesc, sizeof(cndesc)); cndesc.cd_nameptr = cnp->cn_nameptr; @@ -228,22 +219,27 @@ hfs_lookup(ap) cndesc.cd_parentcnid = dcp->c_cnid; cndesc.cd_hint = dcp->c_childhint; - /* Lock catalog b-tree */ - retval = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, p); - if (retval) - goto exit; - retval = cat_lookup(hfsmp, &cndesc, wantrsrc, &desc, &attr, &fork); + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + retval = cat_lookup(hfsmp, &cndesc, wantrsrc, &desc, &attr, &fork, NULL); - if (retval == 0 && S_ISREG(attr.ca_mode) && attr.ca_blocks < fork.cf_blocks) - panic("hfs_lookup: bad ca_blocks (too small)"); - - /* Unlock catalog b-tree */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + hfs_systemfile_unlock(hfsmp, lockflags); + if (retval == 0) { dcp->c_childhint = desc.cd_hint; + hfs_unlock(dcp); goto found; } + hfs_unlock(dcp); notfound: + /* ENAMETOOLONG supersedes other errors */ + if (((nameiop != CREATE) && (nameiop != RENAME)) && + (retval != ENAMETOOLONG) && + (cnp->cn_namelen > kHFSPlusMaxFileNameChars)) { + retval = ENAMETOOLONG; + } else if (retval == 0) { + retval = ENOENT; + } /* * This is a non-existing entry * @@ -253,34 +249,23 @@ notfound: */ if ((nameiop == CREATE || nameiop == RENAME || (nameiop == DELETE && - (ap->a_cnp->cn_flags & DOWHITEOUT) && - (ap->a_cnp->cn_flags & ISWHITEOUT))) && - (flags & ISLASTCN)) { - /* - * Access for write is interpreted as allowing - * creation of files in the directory. 
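The new notfound: path above folds its error handling into one priority rule: a too-long name reports ENAMETOOLONG ahead of any other failure, except for CREATE and RENAME, and a clean miss becomes ENOENT. Restated on its own; the nameiop constants and the 255-character limit are stand-ins for the real kernel definitions:

    #include <errno.h>

    enum { LOOKUP, CREATE, DELETE, RENAME };        /* stand-in nameiop values */
    #define kHFSPlusMaxFileNameChars 255            /* defined in the HFS headers */

    static int
    notfound_status(int nameiop, int retval, unsigned long namelen)
    {
            if (nameiop != CREATE && nameiop != RENAME &&
                retval != ENAMETOOLONG &&
                namelen > kHFSPlusMaxFileNameChars)
                    return ENAMETOOLONG;            /* supersedes other errors */
            return retval ? retval : ENOENT;        /* a clean miss becomes ENOENT */
    }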
- */ - retval = VOP_ACCESS(dvp, VWRITE, cred, cnp->cn_proc); - if (retval) { - goto exit; - } - - cnp->cn_flags |= SAVENAME; - if (!(flags & LOCKPARENT)) - VOP_UNLOCK(dvp, 0, p); + (cnp->cn_flags & DOWHITEOUT) && + (cnp->cn_flags & ISWHITEOUT))) && + (flags & ISLASTCN) && + (retval == ENOENT)) { retval = EJUSTRETURN; goto exit; } - /* * Insert name into cache (as non-existent) if appropriate. * * Only done for case-sensitive HFS+ volumes. */ - if ((hfsmp->hfs_flags & HFS_CASE_SENSITIVE) && - (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) - cache_enter(dvp, *vpp, cnp); - retval = ENOENT; + if ((retval == ENOENT) && + (hfsmp->hfs_flags & HFS_CASE_SENSITIVE) && + (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) { + cache_enter(dvp, NULL, cnp); + } goto exit; } @@ -300,186 +285,57 @@ found: wantrsrc = 0; forknamelen = 0; } - - /* - * If deleting, and at end of pathname, return - * parameters which can be used to remove file. - */ - if (nameiop == DELETE && (flags & ISLASTCN)) { - /* - * Write access to directory required to delete files. - */ - if ((retval = VOP_ACCESS(dvp, VWRITE, cred, cnp->cn_proc))) - goto exit; - - if (isDot) { /* Want to return ourselves */ - VREF(dvp); - *vpp = dvp; - goto exit; - } else if (flags & ISDOTDOT) { - retval = hfs_getcnode(hfsmp, dcp->c_parentcnid, - NULL, 0, NULL, NULL, &tvp); - if (retval) - goto exit; - } else { - retval = hfs_getcnode(hfsmp, attr.ca_fileid, - &desc, wantrsrc, &attr, &fork, &tvp); - if (retval) - goto exit; - } - - /* - * If directory is "sticky", then user must own - * the directory, or the file in it, else she - * may not delete it (unless she's root). This - * implements append-only directories. - */ - if ((dcp->c_mode & S_ISTXT) && - (cred->cr_uid != 0) && - (cred->cr_uid != dcp->c_uid) && - (tvp->v_type != VLNK) && - (hfs_owner_rights(hfsmp, VTOC(tvp)->c_uid, cred, p, false))) { - vput(tvp); - retval = EPERM; - goto exit; - } - - /* - * If this is a link node then we need to save the name - * (of the link) so we can delete it from the catalog b-tree. - * In this case, hfs_remove will then free the component name. - * - * DJB - IS THIS STILL NEEDED???? - */ - if (tvp && (VTOC(tvp)->c_flag & C_HARDLINK)) - cnp->cn_flags |= SAVENAME; - - if (!(flags & LOCKPARENT)) - VOP_UNLOCK(dvp, 0, p); - *vpp = tvp; - goto exit; - } - - /* - * If renaming, return the cnode and save the current name. - */ - if (nameiop == RENAME && wantparent && (flags & ISLASTCN)) { - if ((retval = VOP_ACCESS(dvp, VWRITE, cred, cnp->cn_proc)) != 0) - goto exit; - /* - * Careful about locking second cnode. - */ - if (isDot) { - retval = EISDIR; - goto exit; - } else if (flags & ISDOTDOT) { - retval = hfs_getcnode(hfsmp, dcp->c_parentcnid, - NULL, 0, NULL, NULL, &tvp); - if (retval) - goto exit; - } else { - retval = hfs_getcnode(hfsmp, attr.ca_fileid, - &desc, wantrsrc, &attr, &fork, &tvp); - if (retval) + if (flags & ISLASTCN) { + switch(nameiop) { + case DELETE: + cnp->cn_flags &= ~MAKEENTRY; + break; + + case RENAME: + cnp->cn_flags &= ~MAKEENTRY; + if (isDot) { + retval = EISDIR; goto exit; + } + break; } - cnp->cn_flags |= SAVENAME; - if (!(flags & LOCKPARENT)) - VOP_UNLOCK(dvp, 0, p); - *vpp = tvp; - goto exit; - } + } - /* - * We must get the target cnode before unlocking - * the directory to insure that the cnode will not be removed - * before we get it. We prevent deadlock by always fetching - * cnodes from the root, moving down the directory tree. Thus - * when following backward pointers ".." 
we must unlock the - * parent directory before getting the requested directory. - * There is a potential race condition here if both the current - * and parent directories are removed before the VFS_VGET for the - * cnode associated with ".." returns. We hope that this occurs - * infrequently since we cannot avoid this race condition without - * implementing a sophisticated deadlock detection algorithm. - */ - if (flags & ISDOTDOT) { - VOP_UNLOCK(dvp, 0, p); /* race to get the cnode */ - retval = hfs_getcnode(hfsmp, dcp->c_parentcnid, - NULL, 0, NULL, NULL, &tvp); - if (retval) { - vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, p); + if (isDot) { + if ((retval = vnode_get(dvp))) goto exit; - } - if ((flags & LOCKPARENT) && (flags & ISLASTCN) && (dvp != tvp) && - (retval = vn_lock(dvp, LK_EXCLUSIVE, p))) { - vput(tvp); + *vpp = dvp; + } else if (flags & ISDOTDOT) { + if ((retval = hfs_vget(hfsmp, dcp->c_parentcnid, &tvp, 0))) goto exit; - } + *cnode_locked = 1; *vpp = tvp; - } else if (isDot) { - VREF(dvp); /* we want ourself, ie "." */ - *vpp = dvp; } else { int type = (attr.ca_mode & S_IFMT); - if (!(flags & ISLASTCN) && type != S_IFDIR && type != S_IFLNK) { + if (!(flags & ISLASTCN) && (type != S_IFDIR) && (type != S_IFLNK)) { retval = ENOTDIR; goto exit; } - retval = hfs_getcnode(hfsmp, attr.ca_fileid, - &desc, wantrsrc, &attr, &fork, &tvp); - if (retval) - goto exit; + /* Names with composed chars are not cached. */ + if (cnp->cn_namelen != desc.cd_namelen) + cnp->cn_flags &= ~MAKEENTRY; - if (!(flags & LOCKPARENT) || !(flags & ISLASTCN)) - VOP_UNLOCK(dvp, 0, p); - *vpp = tvp; - } + /* Resource fork vnode names include the fork specifier. */ + if (wantrsrc && (flags & ISLASTCN)) + cnp->cn_namelen += forknamelen; - /* - * Insert name in cache if appropriate. - * - "." and ".." are not cached. - * - Resource fork names are not cached. - * - Names with composed chars are not cached. - */ - if ((cnp->cn_flags & MAKEENTRY) - && !isDot - && !(flags & ISDOTDOT) - && !wantrsrc - && (cnp->cn_namelen == VTOC(*vpp)->c_desc.cd_namelen)) { - cache_enter(dvp, *vpp, cnp); - } + retval = hfs_getnewvnode(hfsmp, dvp, cnp, &desc, wantrsrc, &attr, &fork, &tvp); + if (wantrsrc && (flags & ISLASTCN)) + cnp->cn_namelen -= forknamelen; - // - // have to patch up the resource fork name because - // it won't happen properly in the layers above us. - // - if (wantrsrc) { - if (VTOC(*vpp)->c_vp == NULL) { - if (VNAME(*vpp) == NULL) { - VNAME(*vpp) = add_name(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0); - } - if (VPARENT(*vpp) == NULL) { - vget(dvp, 0, p); - VPARENT(*vpp) = dvp; - } - } else { - if (VNAME(*vpp) == NULL) { - // the +1/-2 thing is to skip the leading "/" on the rsrc fork spec - // and to not count the trailing null byte at the end of the string. 
- VNAME(*vpp) = add_name(_PATH_RSRCFORKSPEC+1, sizeof(_PATH_RSRCFORKSPEC)-2, 0, 0); - } - if (VPARENT(*vpp) == NULL && *vpp != VTOC(*vpp)->c_vp) { - VPARENT(*vpp) = VTOC(*vpp)->c_vp; - VTOC(*vpp)->c_flag |= C_VPREFHELD; - vget(VTOC(*vpp)->c_vp, 0, p); - } - } + if (retval) + goto exit; + *cnode_locked = 1; + *vpp = tvp; } - exit: cat_releasedesc(&desc); return (retval); @@ -488,8 +344,6 @@ exit: /* - * Based on vn_cache_lookup (which is vfs_cache_lookup in FreeBSD 3.1) - * * Name caching works as follows: * * Names found by directory scans are retained in a cache @@ -511,65 +365,44 @@ exit: __private_extern__ int -hfs_cache_lookup(ap) - struct vop_lookup_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - } */ *ap; +hfs_vnop_lookup(struct vnop_lookup_args *ap) { - struct vnode *dvp; + struct vnode *dvp = ap->a_dvp; struct vnode *vp; struct cnode *cp; struct cnode *dcp; - int lockparent; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; int flags = cnp->cn_flags; - struct proc *p = cnp->cn_proc; - u_long vpid; /* capability number of vnode */ - - dvp = ap->a_dvp; - lockparent = flags & LOCKPARENT; + int cnode_locked; - /* - * Check accessiblity of directory. - */ - if (dvp->v_type != VDIR) - return (ENOTDIR); - if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && - (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { - error = EROFS; - goto err_exit; - } + *vpp = NULL; dcp = VTOC(dvp); - if (((dcp->c_mode & S_IXALL) != S_IXALL) && (cnp->cn_cred->cr_uid != 0)) { - if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, p))) { - goto err_exit; - } - } /* * Lookup an entry in the cache - * If the lookup succeeds, the vnode is returned in *vpp, and a status of -1 is - * returned. If the lookup determines that the name does not exist - * (negative cacheing), a status of ENOENT is returned. If the lookup - * fails, a status of zero is returned. + * + * If the lookup succeeds, the vnode is returned in *vpp, + * and a status of -1 is returned. + * + * If the lookup determines that the name does not exist + * (negative cacheing), a status of ENOENT is returned. + * + * If the lookup fails, a status of zero is returned. */ error = cache_lookup(dvp, vpp, cnp); if (error != -1) { - if (error == 0) { /* Unsuccessfull */ - goto lookup; - } - - if (error == ENOENT) { - goto err_exit; - } + if (error == ENOENT) /* found a negative cache entry */ + goto exit; + goto lookup; /* did not find it in the cache */ } - /* We have a name that matched */ + /* + * We have a name that matched + * cache_lookup returns the vp with an iocount reference already taken + */ + error = 0; vp = *vpp; - vpid = vp->v_id; /* * If this is a hard-link vnode then we need to update @@ -578,40 +411,32 @@ hfs_cache_lookup(ap) * getattrlist calls to return the correct link info. 
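That three-way cache_lookup() convention drives the rest of hfs_vnop_lookup(). Compressed to its control flow, in the file's own idiom (this mirrors the function around it and assumes the surrounding kernel headers; it is a sketch, not a drop-in replacement):

    static int
    lookup_via_cache(struct vnode *dvp, struct vnode **vpp,
                     struct componentname *cnp, vfs_context_t ctx)
    {
            int cnode_locked = 0;
            int error = cache_lookup(dvp, vpp, cnp);

            if (error == -1)                /* hit: *vpp comes with an iocount */
                    return (0);
            if (error == ENOENT)            /* negative hit: name known absent */
                    return (ENOENT);

            /* miss (0): fall through to a real catalog lookup */
            error = hfs_lookup(dvp, vpp, cnp, ctx, &cnode_locked);
            if (cnode_locked)
                    hfs_unlock(VTOC(*vpp));
            return (error);
    }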
*/ cp = VTOC(vp); - if ((flags & ISLASTCN) && (cp->c_flag & C_HARDLINK) && - ((cp->c_parentcnid != VTOC(ap->a_dvp)->c_cnid) || - (bcmp(cnp->cn_nameptr, cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen) != 0))) { - - struct cat_desc desc; - /* - * Get an updated descriptor - */ - bzero(&desc, sizeof(desc)); - desc.cd_nameptr = cnp->cn_nameptr; - desc.cd_namelen = cnp->cn_namelen; - desc.cd_parentcnid = VTOC(ap->a_dvp)->c_cnid; - desc.cd_hint = VTOC(ap->a_dvp)->c_childhint; - if (cat_lookup(VTOHFS(vp), &desc, 0, &desc, NULL, NULL) == 0) - replace_desc(cp, &desc); - } + if ((flags & ISLASTCN) && (cp->c_flag & C_HARDLINK)) { + hfs_lock(cp, HFS_FORCE_LOCK); + if ((cp->c_parentcnid != VTOC(dvp)->c_cnid) || + (bcmp(cnp->cn_nameptr, cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen) != 0)) { + struct cat_desc desc; + int lockflags; - if (dvp == vp) { /* lookup on "." */ - VREF(vp); - error = 0; - } else if (flags & ISDOTDOT) { - /* - * Carefull on the locking policy, - * remember we always lock from parent to child, so have - * to release lock on child before trying to lock parent - * then regain lock if needed - */ - VOP_UNLOCK(dvp, 0, p); - error = vget(vp, LK_EXCLUSIVE, p); - if (!error && lockparent && (flags & ISLASTCN)) - error = vn_lock(dvp, LK_EXCLUSIVE, p); - } else { - if ((flags & ISLASTCN) == 0 && vp->v_type == VREG) { + /* + * Get an updated descriptor + */ + bzero(&desc, sizeof(desc)); + desc.cd_nameptr = cnp->cn_nameptr; + desc.cd_namelen = cnp->cn_namelen; + desc.cd_parentcnid = VTOC(dvp)->c_cnid; + desc.cd_hint = VTOC(dvp)->c_childhint; + + lockflags = hfs_systemfile_lock(VTOHFS(dvp), SFL_CATALOG, HFS_SHARED_LOCK); + if (cat_lookup(VTOHFS(vp), &desc, 0, &desc, NULL, NULL, NULL) == 0) + replace_desc(cp, &desc); + hfs_systemfile_unlock(VTOHFS(dvp), lockflags); + } + hfs_unlock(cp); + } + if (dvp != vp && !(flags & ISDOTDOT)) { + if ((flags & ISLASTCN) == 0 && vnode_isreg(vp)) { int wantrsrc = 0; cnp->cn_consume = forkcomponent(cnp, &wantrsrc); @@ -620,70 +445,51 @@ hfs_cache_lookup(ap) /* Fork names are only for lookups */ if (cnp->cn_nameiop != LOOKUP && cnp->cn_nameiop != CREATE) { + vnode_put(vp); error = EPERM; - - goto err_exit; + goto exit; } } - + /* + * Use cnode's rsrcfork vnode if possible. + */ if (wantrsrc) { - /* Use cnode's rsrcfork vnode (if available) */ - if (cp->c_rsrc_vp != NULL) { - *vpp = vp = cp->c_rsrc_vp; - if (VNAME(vp) == NULL) { - // the +1/-2 thing is to skip the leading "/" on the rsrc fork spec - // and to not count the trailing null byte at the end of the string. - VNAME(vp) = add_name(_PATH_RSRCFORKSPEC+1, sizeof(_PATH_RSRCFORKSPEC)-2, 0, 0); - } - if (VPARENT(vp) == NULL) { - vget(cp->c_vp, 0, p); - VPARENT(vp) = cp->c_vp; - } - vpid = vp->v_id; - } else { - goto lookup; + int vid; + + *vpp = NULL; + + if (cp->c_rsrc_vp == NULL) { + vnode_put(vp); + goto lookup; } + vid = vnode_vid(cp->c_rsrc_vp); + + error = vnode_getwithvid(cp->c_rsrc_vp, vid); + if (error) { + vnode_put(vp); + goto lookup; + } + *vpp = cp->c_rsrc_vp; + vnode_put(vp); + vp = *vpp; } } - error = vget(vp, 0, p); - if (error == 0) { - if (VTOC(vp) == NULL || vp->v_data != (void *)cp) { - panic("hfs: cache lookup: my cnode disappeared/went bad! 
vp 0x%x 0x%x 0x%x\n", - vp, vp->v_data, cp); - } - if (cnp->cn_nameiop == LOOKUP && - (!(flags & ISLASTCN) || (flags & SHAREDLEAF))) - error = lockmgr(&VTOC(vp)->c_lock, LK_SHARED, NULL, p); - else - error = lockmgr(&VTOC(vp)->c_lock, LK_EXCLUSIVE, NULL, p); - } - if (!lockparent || error || !(flags & ISLASTCN)) { - (void) lockmgr(&dcp->c_lock, LK_RELEASE, NULL, p); - } } + return (error); + +lookup: /* - * Check that the capability number did not change - * while we were waiting for the lock. + * The vnode was not in the name cache or it was stale. + * + * So we need to do a real lookup. */ - if (!error) { - if (vpid == vp->v_id) - return (0); - /* - * The above is the NORMAL exit, after this point is an error - * condition. - */ - vput(vp); - if (lockparent && (dvp != vp) && (flags & ISLASTCN)) - VOP_UNLOCK(dvp, 0, p); - } - - if ((error = vn_lock(dvp, LK_EXCLUSIVE, p))) - return (error); -lookup: - return (hfs_lookup(ap)); + cnode_locked = 0; -err_exit: - *vpp = NULL; + error = hfs_lookup(dvp, vpp, cnp, ap->a_context, &cnode_locked); + + if (cnode_locked) + hfs_unlock(VTOC(*vpp)); +exit: return (error); } @@ -714,10 +520,11 @@ forkcomponent(struct componentname *cnp, int *rsrcfork) consume = sizeof(_PATH_DATAFORKSPEC) - 1; } -#ifdef LEGACY_FORK_NAMES +#if LEGACY_FORK_NAMES else if (bcmp(suffix, LEGACY_RSRCFORKSPEC, sizeof(LEGACY_RSRCFORKSPEC)) == 0) { consume = sizeof(LEGACY_RSRCFORKSPEC) - 1; *rsrcfork = 1; + printf("HFS: /rsrc paths are deprecated (%s)\n", cnp->cn_nameptr); } #endif return (consume); diff --git a/bsd/hfs/hfs_macos_defs.h b/bsd/hfs/hfs_macos_defs.h index 33461cc17..56a0f2296 100644 --- a/bsd/hfs/hfs_macos_defs.h +++ b/bsd/hfs/hfs_macos_defs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -37,7 +37,6 @@ #include <sys/types.h> #include <sys/time.h> #include <sys/proc.h> -#include <sys/lock.h> #define TARGET_OS_MAC 0 diff --git a/bsd/hfs/hfs_mount.h b/bsd/hfs/hfs_mount.h index 28ee2fa90..d09334757 100644 --- a/bsd/hfs/hfs_mount.h +++ b/bsd/hfs/hfs_mount.h @@ -44,17 +44,18 @@ #ifdef __APPLE_API_UNSTABLE struct hfs_mount_args { +#ifndef KERNEL char *fspec; /* block special device to mount */ - struct export_args export; /* network export information */ +#endif uid_t hfs_uid; /* uid that owns hfs files (standard HFS only) */ gid_t hfs_gid; /* gid that owns hfs files (standard HFS only) */ mode_t hfs_mask; /* mask to be applied for hfs perms (standard HFS only) */ - u_long hfs_encoding; /* encoding for this volume (standard HFS only) */ + uint32_t hfs_encoding; /* encoding for this volume (standard HFS only) */ struct timezone hfs_timezone; /* user time zone info (standard HFS only) */ - int flags; /* mounting flags, see below */ + int flags; /* mounting flags, see below */ int journal_tbuffer_size; /* size in bytes of the journal transaction buffer */ - int journal_flags; /* flags to pass to journal_open/create */ - int journal_disable; /* don't use journaling (potentially dangerous) */ + int journal_flags; /* flags to pass to journal_open/create */ + int journal_disable; /* don't use journaling (potentially dangerous) */ }; #define HFSFSMNT_NOXONFILES 0x1 /* disable execute permissions for files */ diff --git a/bsd/hfs/hfs_notification.c b/bsd/hfs/hfs_notification.c index cb85a7ea8..60e96c5b1 100644 --- a/bsd/hfs/hfs_notification.c +++ b/bsd/hfs/hfs_notification.c @@ -25,11 +25,9 @@ #include <sys/file.h> #include <sys/dirent.h> #include <sys/stat.h> -#include <sys/buf.h> #include <sys/mount.h> #include <sys/vnode.h> #include <sys/malloc.h> -#include <sys/namei.h> #include <sys/ubc.h> #include <sys/quota.h> @@ -38,7 +36,6 @@ #include "hfs.h" #include "hfs_catalog.h" #include "hfs_cnode.h" -#include "hfs_lockf.h" #include "hfs_dbg.h" #include "hfs_mount.h" #include "hfs_quota.h" @@ -51,18 +48,22 @@ void hfs_generate_volume_notifications(struct hfsmount *hfsmp) { ExtendedVCB *vcb = HFSTOVCB(hfsmp); + fsid_t fsid; + + fsid.val[0] = (long)hfsmp->hfs_raw_dev; + fsid.val[1] = (long)vfs_typenum(HFSTOVFS(hfsmp)); if (hfsmp->hfs_notification_conditions & VQ_LOWDISK) { /* Check to see whether the free space is back above the minimal level: */ if (hfs_freeblks(hfsmp, 1) > hfsmp->hfs_freespace_notify_desiredlevel) { hfsmp->hfs_notification_conditions &= ~VQ_LOWDISK; - vfs_event_signal(&HFSTOVFS(hfsmp)->mnt_stat.f_fsid, hfsmp->hfs_notification_conditions, NULL); + vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, NULL); } } else { /* Check to see whether the free space fell below the requested limit: */ if (hfs_freeblks(hfsmp, 1) < hfsmp->hfs_freespace_notify_warninglimit) { hfsmp->hfs_notification_conditions |= VQ_LOWDISK; - vfs_event_signal(&HFSTOVFS(hfsmp)->mnt_stat.f_fsid, hfsmp->hfs_notification_conditions, NULL); + vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, NULL); } }; } diff --git a/bsd/hfs/hfs_quota.c b/bsd/hfs/hfs_quota.c index cfe328de1..80b01d62c 100644 --- a/bsd/hfs/hfs_quota.c +++ b/bsd/hfs/hfs_quota.c @@ -62,12 +62,14 @@ #include <sys/kernel.h> #include <sys/systm.h> #include <sys/mount.h> -#include <sys/namei.h> #include <sys/malloc.h> #include <sys/file.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <sys/vnode.h> #include <sys/quota.h> +#include <sys/proc_internal.h> +#include <kern/kalloc.h> #include 
<hfs/hfs.h> #include <hfs/hfs_cnode.h> @@ -77,7 +79,9 @@ /* * Quota name to error message mapping. */ +#if 0 static char *quotatypes[] = INITQFNAMES; +#endif /* * Set up the quotas for a cnode. @@ -96,14 +100,14 @@ hfs_getinoquota(cp) int error; vp = cp->c_vp ? cp->c_vp : cp->c_rsrc_vp; - hfsmp = VFSTOHFS(vp->v_mount); + hfsmp = VTOHFS(vp); /* * Set up the user quota based on file uid. * EINVAL means that quotas are not enabled. */ if (cp->c_dquot[USRQUOTA] == NODQUOT && (error = - dqget(vp, cp->c_uid, &hfsmp->hfs_qfiles[USRQUOTA], USRQUOTA, &cp->c_dquot[USRQUOTA])) && + dqget(cp->c_uid, &hfsmp->hfs_qfiles[USRQUOTA], USRQUOTA, &cp->c_dquot[USRQUOTA])) && error != EINVAL) return (error); /* @@ -112,7 +116,7 @@ hfs_getinoquota(cp) */ if (cp->c_dquot[GRPQUOTA] == NODQUOT && (error = - dqget(vp, cp->c_gid, &hfsmp->hfs_qfiles[GRPQUOTA], GRPQUOTA, &cp->c_dquot[GRPQUOTA])) && + dqget(cp->c_gid, &hfsmp->hfs_qfiles[GRPQUOTA], GRPQUOTA, &cp->c_dquot[GRPQUOTA])) && error != EINVAL) return (error); return (0); @@ -125,7 +129,7 @@ int hfs_chkdq(cp, change, cred, flags) register struct cnode *cp; int64_t change; - struct ucred *cred; + kauth_cred_t cred; int flags; { register struct dquot *dq; @@ -144,10 +148,8 @@ hfs_chkdq(cp, change, cred, flags) for (i = 0; i < MAXQUOTAS; i++) { if ((dq = cp->c_dquot[i]) == NODQUOT) continue; - while (dq->dq_flags & DQ_LOCK) { - dq->dq_flags |= DQ_WANT; - sleep((caddr_t)dq, PINOD+1); - } + dqlock(dq); + ncurbytes = dq->dq_curbytes + change; if (ncurbytes >= 0) dq->dq_curbytes = ncurbytes; @@ -155,13 +157,15 @@ hfs_chkdq(cp, change, cred, flags) dq->dq_curbytes = 0; dq->dq_flags &= ~DQ_BLKS; dq->dq_flags |= DQ_MOD; + + dqunlock(dq); } return (0); } p = current_proc(); if (cred == NOCRED) - cred = kernproc->p_ucred; - if ((cred->cr_uid != 0) || (p->p_flag & P_FORCEQUOTA)) { + cred = proc_ucred(kernproc); + if (suser(cred, NULL) || proc_forcequota(p)) { for (i = 0; i < MAXQUOTAS; i++) { if ((dq = cp->c_dquot[i]) == NODQUOT) continue; @@ -175,12 +179,12 @@ hfs_chkdq(cp, change, cred, flags) for (i = 0; i < MAXQUOTAS; i++) { if ((dq = cp->c_dquot[i]) == NODQUOT) continue; - while (dq->dq_flags & DQ_LOCK) { - dq->dq_flags |= DQ_WANT; - sleep((caddr_t)dq, PINOD+1); - } + dqlock(dq); + dq->dq_curbytes += change; dq->dq_flags |= DQ_MOD; + + dqunlock(dq); } } return (error); @@ -194,26 +198,30 @@ int hfs_chkdqchg(cp, change, cred, type) struct cnode *cp; int64_t change; - struct ucred *cred; + kauth_cred_t cred; int type; { register struct dquot *dq = cp->c_dquot[type]; - u_int64_t ncurbytes = dq->dq_curbytes + change; + u_int64_t ncurbytes; struct vnode *vp = cp->c_vp ? cp->c_vp : cp->c_rsrc_vp; - + + dqlock(dq); + + ncurbytes = dq->dq_curbytes + change; /* * If user would exceed their hard limit, disallow space allocation. */ if (ncurbytes >= dq->dq_bhardlimit && dq->dq_bhardlimit) { if ((dq->dq_flags & DQ_BLKS) == 0 && - cp->c_uid == cred->cr_uid) { + cp->c_uid == kauth_cred_getuid(cred)) { #if 0 - printf("\n%s: write failed, %s disk limit reached\n", - vp->v_mount->mnt_stat.f_mntonname, + printf("\nwrite failed, %s disk limit reached\n", quotatypes[type]); #endif dq->dq_flags |= DQ_BLKS; } + dqunlock(dq); + return (EDQUOT); } /* @@ -221,31 +229,38 @@ hfs_chkdqchg(cp, change, cred, type) * allocation. Reset time limit as they cross their soft limit. 
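 *
 * A related change in this function: the open-coded DQ_LOCK/DQ_WANT
 * sleep loop is gone, and every read-modify-check of the dquot now runs
 * under dqlock()/dqunlock(). A hedged sketch of the pattern this file
 * now follows everywhere (the hard-limit predicate shown inline is
 * illustrative, not a real helper):
 *
 *	dqlock(dq);				// serialize dquot access
 *	ncurbytes = dq->dq_curbytes + change;
 *	if (dq->dq_bhardlimit && ncurbytes >= dq->dq_bhardlimit) {
 *		dqunlock(dq);			// never return with it held
 *		return (EDQUOT);
 *	}
 *	dqunlock(dq);				// every exit path unlocks
 *	return (0);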
*/ if (ncurbytes >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) { + struct timeval tv; + + microuptime(&tv); if (dq->dq_curbytes < dq->dq_bsoftlimit) { - dq->dq_btime = time.tv_sec + - VFSTOHFS(vp->v_mount)->hfs_qfiles[type].qf_btime; + dq->dq_btime = tv.tv_sec + + VTOHFS(vp)->hfs_qfiles[type].qf_btime; #if 0 - if (cp->c_uid == cred->cr_uid) - printf("\n%s: warning, %s %s\n", - vp->v_mount->mnt_stat.f_mntonname, + if (cp->c_uid == kauth_cred_getuid(cred)) + printf("\nwarning, %s %s\n", quotatypes[type], "disk quota exceeded"); #endif + dqunlock(dq); + return (0); } - if (time.tv_sec > dq->dq_btime) { + if (tv.tv_sec > dq->dq_btime) { if ((dq->dq_flags & DQ_BLKS) == 0 && - cp->c_uid == cred->cr_uid) { + cp->c_uid == kauth_cred_getuid(cred)) { #if 0 - printf("\n%s: write failed, %s %s\n", - vp->v_mount->mnt_stat.f_mntonname, + printf("\nwrite failed, %s %s\n", quotatypes[type], "disk quota exceeded for too long"); #endif dq->dq_flags |= DQ_BLKS; } + dqunlock(dq); + return (EDQUOT); } } + dqunlock(dq); + return (0); } @@ -256,7 +271,7 @@ int hfs_chkiq(cp, change, cred, flags) register struct cnode *cp; long change; - struct ucred *cred; + kauth_cred_t cred; int flags; { register struct dquot *dq; @@ -274,10 +289,8 @@ hfs_chkiq(cp, change, cred, flags) for (i = 0; i < MAXQUOTAS; i++) { if ((dq = cp->c_dquot[i]) == NODQUOT) continue; - while (dq->dq_flags & DQ_LOCK) { - dq->dq_flags |= DQ_WANT; - sleep((caddr_t)dq, PINOD+1); - } + dqlock(dq); + ncurinodes = dq->dq_curinodes + change; if (ncurinodes >= 0) dq->dq_curinodes = ncurinodes; @@ -285,13 +298,15 @@ hfs_chkiq(cp, change, cred, flags) dq->dq_curinodes = 0; dq->dq_flags &= ~DQ_INODS; dq->dq_flags |= DQ_MOD; + + dqunlock(dq); } return (0); } p = current_proc(); if (cred == NOCRED) - cred = kernproc->p_ucred; - if ((cred->cr_uid != 0) || (p->p_flag & P_FORCEQUOTA)) { + cred = proc_ucred(kernproc); + if (suser(cred, NULL) || proc_forcequota(p)) { for (i = 0; i < MAXQUOTAS; i++) { if ((dq = cp->c_dquot[i]) == NODQUOT) continue; @@ -305,12 +320,12 @@ hfs_chkiq(cp, change, cred, flags) for (i = 0; i < MAXQUOTAS; i++) { if ((dq = cp->c_dquot[i]) == NODQUOT) continue; - while (dq->dq_flags & DQ_LOCK) { - dq->dq_flags |= DQ_WANT; - sleep((caddr_t)dq, PINOD+1); - } + dqlock(dq); + dq->dq_curinodes += change; dq->dq_flags |= DQ_MOD; + + dqunlock(dq); } } return (error); @@ -324,26 +339,30 @@ int hfs_chkiqchg(cp, change, cred, type) struct cnode *cp; long change; - struct ucred *cred; + kauth_cred_t cred; int type; { register struct dquot *dq = cp->c_dquot[type]; - long ncurinodes = dq->dq_curinodes + change; + long ncurinodes; struct vnode *vp = cp->c_vp ? cp->c_vp : cp->c_rsrc_vp; + dqlock(dq); + + ncurinodes = dq->dq_curinodes + change; /* * If user would exceed their hard limit, disallow cnode allocation. */ if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) { if ((dq->dq_flags & DQ_INODS) == 0 && - cp->c_uid == cred->cr_uid) { + cp->c_uid == kauth_cred_getuid(cred)) { #if 0 - printf("\n%s: write failed, %s cnode limit reached\n", - vp->v_mount->mnt_stat.f_mntonname, + printf("\nwrite failed, %s cnode limit reached\n", quotatypes[type]); #endif dq->dq_flags |= DQ_INODS; } + dqunlock(dq); + return (EDQUOT); } /* @@ -351,31 +370,38 @@ hfs_chkiqchg(cp, change, cred, type) * allocation. Reset time limit as they cross their soft limit. 
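 *
 * A worked example of the grace-period arithmetic below, with
 * illustrative numbers only: if qf_itime is 7 days (604800 s) and
 * microuptime() reports tv_sec == 1000 when the soft limit is first
 * crossed, dq_itime becomes 605800. Allocations over the soft limit
 * keep succeeding until tv_sec passes 605800; after that the check
 * fails exactly as a hard-limit hit would:
 *
 *	microuptime(&tv);
 *	if (tv.tv_sec > dq->dq_itime) {		// grace period expired
 *		dqunlock(dq);
 *		return (EDQUOT);		// treated like a hard limit
 *	}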
*/ if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) { + struct timeval tv; + + microuptime(&tv); if (dq->dq_curinodes < dq->dq_isoftlimit) { - dq->dq_itime = time.tv_sec + - VFSTOHFS(vp->v_mount)->hfs_qfiles[type].qf_itime; + dq->dq_itime = tv.tv_sec + + VTOHFS(vp)->hfs_qfiles[type].qf_itime; #if 0 - if (cp->c_uid == cred->cr_uid) - printf("\n%s: warning, %s %s\n", - vp->v_mount->mnt_stat.f_mntonname, + if (cp->c_uid == kauth_cred_getuid(cred)) + printf("\nwarning, %s %s\n", quotatypes[type], "cnode quota exceeded"); #endif + dqunlock(dq); + return (0); } - if (time.tv_sec > dq->dq_itime) { + if (tv.tv_sec > dq->dq_itime) { if ((dq->dq_flags & DQ_INODS) == 0 && - cp->c_uid == cred->cr_uid) { + cp->c_uid == kauth_cred_getuid(cred)) { #if 0 - printf("\n%s: write failed, %s %s\n", - vp->v_mount->mnt_stat.f_mntonname, + printf("\nwrite failed, %s %s\n", quotatypes[type], "cnode quota exceeded for too long"); #endif dq->dq_flags |= DQ_INODS; } + dqunlock(dq); + return (EDQUOT); } } + dqunlock(dq); + return (0); } @@ -389,12 +415,11 @@ hfs_chkdquot(cp) register struct cnode *cp; { struct vnode *vp = cp->c_vp ? cp->c_vp : cp->c_rsrc_vp; - struct hfsmount *hfsmp = VFSTOHFS(vp->v_mount); + struct hfsmount *hfsmp = VTOHFS(vp); register int i; for (i = 0; i < MAXQUOTAS; i++) { - if (hfsmp->hfs_qfiles[i].qf_vp == NULLVP || - (hfsmp->hfs_qfiles[i].qf_qflags & (QTF_OPENING|QTF_CLOSING))) + if (hfsmp->hfs_qfiles[i].qf_vp == NULLVP) continue; if (cp->c_dquot[i] == NODQUOT) { vprint("chkdquot: missing dquot", vp); @@ -411,132 +436,181 @@ hfs_chkdquot(cp) /* * Q_QUOTAON - set up a quota file for a particular file system. */ +struct hfs_quotaon_cargs { + int error; +}; + +static int +hfs_quotaon_callback(struct vnode *vp, void *cargs) +{ + struct hfs_quotaon_cargs *args; + + args = (struct hfs_quotaon_cargs *)cargs; + + args->error = hfs_getinoquota(VTOC(vp)); + if (args->error) + return (VNODE_RETURNED_DONE); + + return (VNODE_RETURNED); +} + int -hfs_quotaon(p, mp, type, fname, segflg) +hfs_quotaon(p, mp, type, fnamep) struct proc *p; struct mount *mp; register int type; - caddr_t fname; - enum uio_seg segflg; + caddr_t fnamep; { struct hfsmount *hfsmp = VFSTOHFS(mp); - struct vnode *vp, **vpp; - struct vnode *nextvp; - struct dquot *dq; - int error; - struct nameidata nd; + struct quotafile *qfp; + struct vnode *vp; + int error = 0; + struct hfs_quotaon_cargs args; - vpp = &hfsmp->hfs_qfiles[type].qf_vp; - NDINIT(&nd, LOOKUP, FOLLOW, segflg, fname, p); - if (error = vn_open(&nd, FREAD|FWRITE, 0)) - return (error); - vp = nd.ni_vp; - VOP_UNLOCK(vp, 0, p); - if (vp->v_type != VREG) { - (void) vn_close(vp, FREAD|FWRITE, p->p_ucred, p); - return (EACCES); + qfp = &hfsmp->hfs_qfiles[type]; + + if ( (qf_get(qfp, QTF_OPENING)) ) + return (0); + + error = vnode_open(fnamep, FREAD|FWRITE, 0, 0, &vp, NULL); + if (error) { + goto out; } - if (*vpp != vp) - hfs_quotaoff(p, mp, type); - hfsmp->hfs_qfiles[type].qf_qflags |= QTF_OPENING; - mp->mnt_flag |= MNT_QUOTA; - vp->v_flag |= VNOFLUSH; - *vpp = vp; + if (!vnode_isreg(vp)) { + (void) vnode_close(vp, FREAD|FWRITE, NULL); + error = EACCES; + goto out; + } + vfs_setflags(mp, (uint64_t)((unsigned int)MNT_QUOTA)); + vnode_setnoflush(vp); /* * Save the credential of the process that turned on quotas. 
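 *
 * The reference taken here has to be balanced on every path; a hedged
 * sketch of the pairing (the error path just below and hfs_quotaoff()
 * are the two places that drop it):
 *
 *	qfp->qf_cred = kauth_cred_proc_ref(p);	// +1 on the proc credential
 *	...
 *	if (qfp->qf_cred != NOCRED) {
 *		kauth_cred_rele(qfp->qf_cred);	// -1, releases our hold
 *		qfp->qf_cred = NOCRED;		// no dangling pointer
 *	}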
*/ - crhold(p->p_ucred); - hfsmp->hfs_qfiles[type].qf_cred = p->p_ucred; - /* Finish initializing the quota file */ - if (error = dqfileopen(&hfsmp->hfs_qfiles[type], type)) - goto exit; + qfp->qf_cred = kauth_cred_proc_ref(p); + qfp->qf_vp = vp; + /* + * Finish initializing the quota file + */ + error = dqfileopen(qfp, type); + if (error) { + (void) vnode_close(vp, FREAD|FWRITE, NULL); + + kauth_cred_rele(qfp->qf_cred); + qfp->qf_cred = NOCRED; + qfp->qf_vp = NULLVP; + goto out; + } + qf_put(qfp, QTF_OPENING); + /* * Search vnodes associated with this mount point, * adding references to quota file being opened. * NB: only need to add dquot's for cnodes being modified. + * + * hfs_quotaon_callback will be called for each vnode open for + * 'write' (VNODE_WRITEABLE) hung off of this mount point + * the vnode will be in an 'unbusy' state (VNODE_WAIT) and + * properly referenced and unreferenced around the callback */ -again: - for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nextvp) { - nextvp = vp->v_mntvnodes.le_next; - if (vp->v_writecount == 0) - continue; - if (vget(vp, LK_EXCLUSIVE, p)) - goto again; - if (error = hfs_getinoquota(VTOC(vp))) { - vput(vp); - break; - } - vput(vp); - if (vp->v_mntvnodes.le_next != nextvp || vp->v_mount != mp) - goto again; - } -exit: - hfsmp->hfs_qfiles[type].qf_qflags &= ~QTF_OPENING; - if (error) + args.error = 0; + + vnode_iterate(mp, VNODE_WRITEABLE | VNODE_WAIT, hfs_quotaon_callback, (void *)&args); + + error = args.error; + + if (error) { hfs_quotaoff(p, mp, type); + } + return (error); + +out: + qf_put(qfp, QTF_OPENING); + return (error); } + /* * Q_QUOTAOFF - turn off disk quotas for a filesystem. */ +struct hfs_quotaoff_cargs { + int type; +}; + +static int +hfs_quotaoff_callback(struct vnode *vp, void *cargs) +{ + struct hfs_quotaoff_cargs *args; + struct cnode *cp; + struct dquot *dq; + + args = (struct hfs_quotaoff_cargs *)cargs; + + cp = VTOC(vp); + + dq = cp->c_dquot[args->type]; + cp->c_dquot[args->type] = NODQUOT; + + dqrele(dq); + + return (VNODE_RETURNED); +} + int -hfs_quotaoff(p, mp, type) - struct proc *p; - struct mount *mp; - register int type; +hfs_quotaoff(__unused struct proc *p, struct mount *mp, register int type) { - struct vnode *vp; - struct vnode *qvp, *nextvp; + struct vnode *qvp; struct hfsmount *hfsmp = VFSTOHFS(mp); - struct dquot *dq; - struct cnode *cp; + struct quotafile *qfp; int error; - struct ucred *cred; + kauth_cred_t cred; + struct hfs_quotaoff_cargs args; + + qfp = &hfsmp->hfs_qfiles[type]; - if ((qvp = hfsmp->hfs_qfiles[type].qf_vp) == NULLVP) - return (0); - hfsmp->hfs_qfiles[type].qf_qflags |= QTF_CLOSING; + if ( (qf_get(qfp, QTF_CLOSING)) ) + return (0); + qvp = qfp->qf_vp; /* * Sync out any orphaned dirty dquot entries. */ - dqsync_orphans(&hfsmp->hfs_qfiles[type]); + dqsync_orphans(qfp); /* * Search vnodes associated with this mount point, * deleting any references to quota file being closed.
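 *
 * The iteration follows the standard vnode_iterate() contract: the
 * callback does the per-vnode work and returns VNODE_RETURNED to keep
 * going or VNODE_RETURNED_DONE to stop early, while the iterator takes
 * and drops the per-vnode references itself. A minimal hedged sketch
 * (callback name and counter argument are illustrative only):
 *
 *	static int
 *	count_callback(struct vnode *vp, void *cargs)
 *	{
 *		int *count = (int *)cargs;
 *
 *		(*count)++;			// per-vnode work goes here
 *		return (VNODE_RETURNED);	// VNODE_RETURNED_DONE stops
 *	}
 *
 *	vnode_iterate(mp, VNODE_WAIT, count_callback, (void *)&count);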
+ * + * hfs_quotaoff_callback will be called for each vnode + * hung off of this mount point + * the vnode will be in an 'unbusy' state (VNODE_WAIT) and + * properly referenced and unreferenced around the callback */ -again: - for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nextvp) { - nextvp = vp->v_mntvnodes.le_next; - if (vget(vp, LK_EXCLUSIVE, p)) - goto again; - cp = VTOC(vp); - dq = cp->c_dquot[type]; - cp->c_dquot[type] = NODQUOT; - dqrele(vp, dq); - vput(vp); - if (vp->v_mntvnodes.le_next != nextvp || vp->v_mount != mp) - goto again; - } + args.type = type; + + vnode_iterate(mp, VNODE_WAIT, hfs_quotaoff_callback, (void *)&args); + dqflush(qvp); /* Finish tearing down the quota file */ - dqfileclose(&hfsmp->hfs_qfiles[type], type); - qvp->v_flag &= ~VNOFLUSH; - error = vn_close(qvp, FREAD|FWRITE, p->p_ucred, p); - hfsmp->hfs_qfiles[type].qf_vp = NULLVP; - cred = hfsmp->hfs_qfiles[type].qf_cred; + dqfileclose(qfp, type); + + vnode_clearnoflush(qvp); + error = vnode_close(qvp, FREAD|FWRITE, NULL); + + qfp->qf_vp = NULLVP; + cred = qfp->qf_cred; if (cred != NOCRED) { - hfsmp->hfs_qfiles[type].qf_cred = NOCRED; - crfree(cred); + qfp->qf_cred = NOCRED; + kauth_cred_rele(cred); } - hfsmp->hfs_qfiles[type].qf_qflags &= ~QTF_CLOSING; for (type = 0; type < MAXQUOTAS; type++) if (hfsmp->hfs_qfiles[type].qf_vp != NULLVP) break; if (type == MAXQUOTAS) - mp->mnt_flag &= ~MNT_QUOTA; + vfs_clearflags(mp, (uint64_t)((unsigned int)MNT_QUOTA)); + + qf_put(qfp, QTF_CLOSING); + return (error); } @@ -544,19 +618,25 @@ again: * Q_GETQUOTA - return current values in a dqblk structure. */ int -hfs_getquota(mp, id, type, addr) +hfs_getquota(mp, id, type, datap) struct mount *mp; u_long id; int type; - caddr_t addr; + caddr_t datap; { struct dquot *dq; int error; - if (error = dqget(NULLVP, id, &VFSTOHFS(mp)->hfs_qfiles[type], type, &dq)) + error = dqget(id, &VFSTOHFS(mp)->hfs_qfiles[type], type, &dq); + if (error) return (error); - error = copyout((caddr_t)&dq->dq_dqb, addr, sizeof (struct dqblk)); - dqrele(NULLVP, dq); + dqlock(dq); + + bcopy(&dq->dq_dqb, datap, sizeof(dq->dq_dqb)); + + dqunlock(dq); + dqrele(dq); + return (error); } @@ -564,47 +644,47 @@ hfs_getquota(mp, id, type, addr) * Q_SETQUOTA - assign an entire dqblk structure. */ int -hfs_setquota(mp, id, type, addr) +hfs_setquota(mp, id, type, datap) struct mount *mp; u_long id; int type; - caddr_t addr; + caddr_t datap; { - register struct dquot *dq; - struct dquot *ndq; + struct dquot *dq; struct hfsmount *hfsmp = VFSTOHFS(mp); - struct dqblk newlim; + struct dqblk * newlimp = (struct dqblk *) datap; + struct timeval tv; int error; - if (error = copyin(addr, (caddr_t)&newlim, sizeof (struct dqblk))) - return (error); - if (error = dqget(NULLVP, id, &hfsmp->hfs_qfiles[type], type, &ndq)) + error = dqget(id, &hfsmp->hfs_qfiles[type], type, &dq); + if (error) return (error); - dq = ndq; - while (dq->dq_flags & DQ_LOCK) { - dq->dq_flags |= DQ_WANT; - sleep((caddr_t)dq, PINOD+1); - } + dqlock(dq); + /* * Copy all but the current values. * Reset time limit if previously had no soft limit or were * under it, but now have a soft limit and are over it. 
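 *
 * Concretely, with illustrative numbers: if dq_curbytes is 120 MB, the
 * old soft limit was 0 (none) and the new one is 100 MB, the id is
 * instantly over quota without having allocated anything, so dqb_btime
 * is re-primed below to "now + qf_btime" rather than left at a stale or
 * zero deadline that would make the very next allocation fail:
 *
 *	microuptime(&tv);
 *	newlimp->dqb_btime = tv.tv_sec + hfsmp->hfs_qfiles[type].qf_btime;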
*/ - newlim.dqb_curbytes = dq->dq_curbytes; - newlim.dqb_curinodes = dq->dq_curinodes; + newlimp->dqb_curbytes = dq->dq_curbytes; + newlimp->dqb_curinodes = dq->dq_curinodes; if (dq->dq_id != 0) { - newlim.dqb_btime = dq->dq_btime; - newlim.dqb_itime = dq->dq_itime; + newlimp->dqb_btime = dq->dq_btime; + newlimp->dqb_itime = dq->dq_itime; } - if (newlim.dqb_bsoftlimit && - dq->dq_curbytes >= newlim.dqb_bsoftlimit && - (dq->dq_bsoftlimit == 0 || dq->dq_curbytes < dq->dq_bsoftlimit)) - newlim.dqb_btime = time.tv_sec + hfsmp->hfs_qfiles[type].qf_btime; - if (newlim.dqb_isoftlimit && - dq->dq_curinodes >= newlim.dqb_isoftlimit && - (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) - newlim.dqb_itime = time.tv_sec + hfsmp->hfs_qfiles[type].qf_itime; - dq->dq_dqb = newlim; + if (newlimp->dqb_bsoftlimit && + dq->dq_curbytes >= newlimp->dqb_bsoftlimit && + (dq->dq_bsoftlimit == 0 || dq->dq_curbytes < dq->dq_bsoftlimit)) { + microuptime(&tv); + newlimp->dqb_btime = tv.tv_sec + hfsmp->hfs_qfiles[type].qf_btime; + } + if (newlimp->dqb_isoftlimit && + dq->dq_curinodes >= newlimp->dqb_isoftlimit && + (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) { + microuptime(&tv); + newlimp->dqb_itime = tv.tv_sec + hfsmp->hfs_qfiles[type].qf_itime; + } + bcopy(newlimp, &dq->dq_dqb, sizeof(dq->dq_dqb)); if (dq->dq_curbytes < dq->dq_bsoftlimit) dq->dq_flags &= ~DQ_BLKS; if (dq->dq_curinodes < dq->dq_isoftlimit) @@ -615,7 +695,10 @@ hfs_setquota(mp, id, type, addr) else dq->dq_flags &= ~DQ_FAKE; dq->dq_flags |= DQ_MOD; - dqrele(NULLVP, dq); + + dqunlock(dq); + dqrele(dq); + return (0); } @@ -623,60 +706,78 @@ hfs_setquota(mp, id, type, addr) * Q_SETUSE - set current cnode and byte usage. */ int -hfs_setuse(mp, id, type, addr) +hfs_setuse(mp, id, type, datap) struct mount *mp; u_long id; int type; - caddr_t addr; + caddr_t datap; { - register struct dquot *dq; struct hfsmount *hfsmp = VFSTOHFS(mp); - struct dquot *ndq; - struct dqblk usage; + struct dquot *dq; + struct timeval tv; int error; + struct dqblk *quotablkp = (struct dqblk *) datap; + + error = dqget(id, &hfsmp->hfs_qfiles[type], type, &dq); + if (error) + return (error); + dqlock(dq); - if (error = copyin(addr, (caddr_t)&usage, sizeof (struct dqblk))) - return (error); - if (error = dqget(NULLVP, id, &hfsmp->hfs_qfiles[type], type, &ndq)) - return (error); - dq = ndq; - while (dq->dq_flags & DQ_LOCK) { - dq->dq_flags |= DQ_WANT; - sleep((caddr_t)dq, PINOD+1); - } /* * Reset time limit if have a soft limit and were * previously under it, but are now over it. 
*/ if (dq->dq_bsoftlimit && dq->dq_curbytes < dq->dq_bsoftlimit && - usage.dqb_curbytes >= dq->dq_bsoftlimit) - dq->dq_btime = time.tv_sec + hfsmp->hfs_qfiles[type].qf_btime; + quotablkp->dqb_curbytes >= dq->dq_bsoftlimit) { + microuptime(&tv); + dq->dq_btime = tv.tv_sec + hfsmp->hfs_qfiles[type].qf_btime; + } if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit && - usage.dqb_curinodes >= dq->dq_isoftlimit) - dq->dq_itime = time.tv_sec + hfsmp->hfs_qfiles[type].qf_itime; - dq->dq_curbytes = usage.dqb_curbytes; - dq->dq_curinodes = usage.dqb_curinodes; + quotablkp->dqb_curinodes >= dq->dq_isoftlimit) { + microuptime(&tv); + dq->dq_itime = tv.tv_sec + hfsmp->hfs_qfiles[type].qf_itime; + } + dq->dq_curbytes = quotablkp->dqb_curbytes; + dq->dq_curinodes = quotablkp->dqb_curinodes; if (dq->dq_curbytes < dq->dq_bsoftlimit) dq->dq_flags &= ~DQ_BLKS; if (dq->dq_curinodes < dq->dq_isoftlimit) dq->dq_flags &= ~DQ_INODS; dq->dq_flags |= DQ_MOD; - dqrele(NULLVP, dq); + + dqunlock(dq); + dqrele(dq); + return (0); } + /* * Q_SYNC - sync quota files to disk. */ +static int +hfs_qsync_callback(struct vnode *vp, __unused void *cargs) +{ + struct cnode *cp; + struct dquot *dq; + int i; + + cp = VTOC(vp); + + for (i = 0; i < MAXQUOTAS; i++) { + dq = cp->c_dquot[i]; + if (dq != NODQUOT && (dq->dq_flags & DQ_MOD)) + dqsync(dq); + } + return (VNODE_RETURNED); +} + int hfs_qsync(mp) struct mount *mp; { struct hfsmount *hfsmp = VFSTOHFS(mp); - struct proc *p = current_proc(); /* XXX */ - struct vnode *vp, *nextvp; - struct dquot *dq; - int i, error; + int i; /* * Check if the mount point has any quotas. @@ -698,44 +799,14 @@ hfs_qsync(mp) /* * Search vnodes associated with this mount point, * synchronizing any modified dquot structures. + * + * hfs_qsync_callback will be called for each vnode + * hung off of this mount point + * the vnode will be + * properly referenced and unreferenced around the callback */ - simple_lock(&mntvnode_slock); -again: - for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nextvp) { - if (vp->v_mount != mp) - goto again; - nextvp = vp->v_mntvnodes.le_next; - simple_lock(&vp->v_interlock); - simple_unlock(&mntvnode_slock); - error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, p); - if (error) { - simple_lock(&mntvnode_slock); - if (error == ENOENT) - goto again; - continue; - } - - /* Make sure that this is really an hfs vnode. 
*/ - if (vp->v_mount != mp || - vp->v_type == VNON || - vp->v_tag != VT_HFS || - VTOC(vp) == NULL) { - vput(vp); - simple_lock(&mntvnode_slock); - goto again; - } + vnode_iterate(mp, 0, hfs_qsync_callback, (void *)NULL); - for (i = 0; i < MAXQUOTAS; i++) { - dq = VTOC(vp)->c_dquot[i]; - if (dq != NODQUOT && (dq->dq_flags & DQ_MOD)) - dqsync(vp, dq); - } - vput(vp); - simple_lock(&mntvnode_slock); - if (vp->v_mntvnodes.le_next != nextvp) - goto again; - } - simple_unlock(&mntvnode_slock); return (0); } @@ -743,21 +814,21 @@ again: * Q_QUOTASTAT - get quota on/off status */ int -hfs_quotastat(mp, type, addr) +hfs_quotastat(mp, type, datap) struct mount *mp; register int type; - caddr_t addr; + caddr_t datap; { struct hfsmount *hfsmp = VFSTOHFS(mp); int error = 0; int qstat; - if ((mp->mnt_flag & MNT_QUOTA) && (hfsmp->hfs_qfiles[type].qf_vp != NULLVP)) + if ((((unsigned int)vfs_flags(mp)) & MNT_QUOTA) && (hfsmp->hfs_qfiles[type].qf_vp != NULLVP)) qstat = 1; /* quotas are on for this type */ else qstat = 0; /* quotas are off for this type */ - error = copyout ((caddr_t)&qstat, addr, sizeof(qstat)); + *((int *)datap) = qstat; return (error); } diff --git a/bsd/hfs/hfs_quota.h b/bsd/hfs/hfs_quota.h index 2b4ded302..bde8fc5cd 100644 --- a/bsd/hfs/hfs_quota.h +++ b/bsd/hfs/hfs_quota.h @@ -74,23 +74,23 @@ struct mount; struct proc; struct ucred; __BEGIN_DECLS -int hfs_chkdq __P((struct cnode *, int64_t, struct ucred *, int)); -int hfs_chkdqchg __P((struct cnode *, int64_t, struct ucred *, int)); -int hfs_chkiq __P((struct cnode *, long, struct ucred *, int)); -int hfs_chkiqchg __P((struct cnode *, long, struct ucred *, int)); -int hfs_getinoquota __P((struct cnode *)); -int hfs_getquota __P((struct mount *, u_long, int, caddr_t)); -int hfs_qsync __P((struct mount *mp)); -int hfs_quotaoff __P((struct proc *, struct mount *, int)); -int hfs_quotaon __P((struct proc *, struct mount *, int, caddr_t, enum uio_seg)); -int hfs_setquota __P((struct mount *, u_long, int, caddr_t)); -int hfs_setuse __P((struct mount *, u_long, int, caddr_t)); -int hfs_quotactl __P((struct mount *, int, uid_t, caddr_t, struct proc *)); +int hfs_chkdq(struct cnode *, int64_t, struct ucred *, int); +int hfs_chkdqchg(struct cnode *, int64_t, struct ucred *, int); +int hfs_chkiq(struct cnode *, long, struct ucred *, int); +int hfs_chkiqchg(struct cnode *, long, struct ucred *, int); +int hfs_getinoquota(struct cnode *); +int hfs_getquota(struct mount *, u_long, int, caddr_t); +int hfs_qsync(struct mount *mp); +int hfs_quotaoff(struct proc *, struct mount *, int); +int hfs_quotaon(struct proc *, struct mount *, int, caddr_t); +int hfs_quotastat(struct mount *, int, caddr_t); +int hfs_setquota(struct mount *, u_long, int, caddr_t); +int hfs_setuse(struct mount *, u_long, int, caddr_t); __END_DECLS #if DIAGNOSTIC __BEGIN_DECLS -void hfs_chkdquot __P((struct cnode *)); +void hfs_chkdquot(struct cnode *); __END_DECLS #endif #endif /* __APPLE_API_PRIVATE */ diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c index d49ca795c..3a54712da 100644 --- a/bsd/hfs/hfs_readwrite.c +++ b/bsd/hfs/hfs_readwrite.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -36,18 +36,22 @@ #include <sys/stat.h> #include <sys/buf.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <sys/vnode.h> #include <sys/uio.h> +#include <sys/vfs_context.h> #include <miscfs/specfs/specdev.h> #include <sys/ubc.h> #include <vm/vm_pageout.h> +#include <vm/vm_kern.h> #include <sys/kdebug.h> #include "hfs.h" #include "hfs_endian.h" +#include "hfs_fsctl.h" #include "hfs_quota.h" #include "hfscommon/headers/FileMgrInternal.h" #include "hfscommon/headers/BTreesInternal.h" @@ -64,204 +68,186 @@ enum { extern u_int32_t GetLogicalBlockSize(struct vnode *vp); -static int hfs_clonelink(struct vnode *, int, struct ucred *, struct proc *); -static int hfs_clonefile(struct vnode *, int, int, int, struct ucred *, struct proc *); -static int hfs_clonesysfile(struct vnode *, int, int, int, struct ucred *, struct proc *); +extern int hfs_setextendedsecurity(struct hfsmount *, int); + + +static int hfs_clonelink(struct vnode *, int, kauth_cred_t, struct proc *); +static int hfs_clonefile(struct vnode *, int, int, int); +static int hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *); /***************************************************************************** * -* Operations on vnodes +* I/O Operations on vnodes * *****************************************************************************/ +int hfs_vnop_read(struct vnop_read_args *); +int hfs_vnop_write(struct vnop_write_args *); +int hfs_vnop_ioctl(struct vnop_ioctl_args *); +int hfs_vnop_select(struct vnop_select_args *); +int hfs_vnop_blktooff(struct vnop_blktooff_args *); +int hfs_vnop_offtoblk(struct vnop_offtoblk_args *); +int hfs_vnop_blockmap(struct vnop_blockmap_args *); +int hfs_vnop_strategy(struct vnop_strategy_args *); +int hfs_vnop_allocate(struct vnop_allocate_args *); +int hfs_vnop_pagein(struct vnop_pagein_args *); +int hfs_vnop_pageout(struct vnop_pageout_args *); +int hfs_vnop_bwrite(struct vnop_bwrite_args *); -/* -#% read vp L L L -# - vop_read { - IN struct vnode *vp; - INOUT struct uio *uio; - IN int ioflag; - IN struct ucred *cred; - - */ +/* + * Read data from a file. + */ int -hfs_read(ap) - struct vop_read_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; +hfs_vnop_read(struct vnop_read_args *ap) { - register struct uio *uio = ap->a_uio; - register struct vnode *vp = ap->a_vp; + uio_t uio = ap->a_uio; + struct vnode *vp = ap->a_vp; struct cnode *cp; struct filefork *fp; - int devBlockSize = 0; + struct hfsmount *hfsmp; + off_t filesize; + off_t filebytes; + off_t start_resid = uio_resid(uio); + off_t offset = uio_offset(uio); int retval = 0; - off_t filesize; - off_t filebytes; - off_t start_resid = uio->uio_resid; /* Preflight checks */ - if ((vp->v_type != VREG) || !UBCINFOEXISTS(vp)) - return (EPERM); /* can only read regular files */ - if (uio->uio_resid == 0) + if (!vnode_isreg(vp)) { + /* can only read regular files */ + if (vnode_isdir(vp)) + return (EISDIR); + else + return (EPERM); + } + if (start_resid == 0) return (0); /* Nothing left to do */ - if (uio->uio_offset < 0) + if (offset < 0) return (EINVAL); /* can't read from a negative offset */ cp = VTOC(vp); fp = VTOF(vp); + hfsmp = VTOHFS(vp); + + /* Protect against a size change.
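 *
 * The truncate lock is taken non-exclusive here (the 0 argument): a
 * reader only needs ff_size to stay put for the duration of the
 * cluster_read(); hfs_vnop_write() and the truncate path take it
 * exclusive. A hedged sketch of the discipline every path through this
 * function must keep:
 *
 *	hfs_lock_truncate(cp, 0);	// shared: size can't move under us
 *	// ... sample ff_size, issue cluster_read() ...
 *	hfs_unlock_truncate(cp);	// the common exit label drops it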
*/ + hfs_lock_truncate(cp, 0); + filesize = fp->ff_size; - filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize; - if (uio->uio_offset > filesize) { - if ((!ISHFSPLUS(VTOVCB(vp))) && (uio->uio_offset > (off_t)MAXHFSFILESIZE)) - return (EFBIG); - else - return (0); + filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; + if (offset > filesize) { + if ((hfsmp->hfs_flags & HFS_STANDARD) && + (offset > (off_t)MAXHFSFILESIZE)) { + retval = EFBIG; + } + goto exit; } - VOP_DEVBLOCKSIZE(cp->c_devvp, &devBlockSize); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START, - (int)uio->uio_offset, uio->uio_resid, (int)filesize, (int)filebytes, 0); + (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0); - retval = cluster_read(vp, uio, filesize, devBlockSize, 0); + retval = cluster_read(vp, uio, filesize, 0); - cp->c_flag |= C_ACCESS; + cp->c_touch_acctime = TRUE; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END, - (int)uio->uio_offset, uio->uio_resid, (int)filesize, (int)filebytes, 0); + (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0); /* * Keep track of blocks read */ if (VTOHFS(vp)->hfc_stage == HFC_RECORDING && retval == 0) { + int took_cnode_lock = 0; + off_t bytesread; + + bytesread = start_resid - uio_resid(uio); + + /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */ + if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) { + hfs_lock(cp, HFS_FORCE_LOCK); + took_cnode_lock = 1; + } /* * If this file hasn't been seen since the start of * the current sampling period then start over. */ if (cp->c_atime < VTOHFS(vp)->hfc_timebase) { - fp->ff_bytesread = start_resid - uio->uio_resid; - cp->c_atime = time.tv_sec; + struct timeval tv; + + fp->ff_bytesread = bytesread; + microtime(&tv); + cp->c_atime = tv.tv_sec; } else { - fp->ff_bytesread += start_resid - uio->uio_resid; + fp->ff_bytesread += bytesread; } + if (took_cnode_lock) + hfs_unlock(cp); } - +exit: + hfs_unlock_truncate(cp); return (retval); } /* - * Write data to a file or directory. -#% write vp L L L -# - vop_write { - IN struct vnode *vp; - INOUT struct uio *uio; - IN int ioflag; - IN struct ucred *cred; - - */ + * Write data to a file. + */ int -hfs_write(ap) - struct vop_write_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; +hfs_vnop_write(struct vnop_write_args *ap) { + uio_t uio = ap->a_uio; struct vnode *vp = ap->a_vp; - struct uio *uio = ap->a_uio; struct cnode *cp; struct filefork *fp; - struct proc *p; - struct timeval tv; - ExtendedVCB *vcb; - int devBlockSize = 0; - off_t origFileSize, writelimit, bytesToAdd; + struct hfsmount *hfsmp; + kauth_cred_t cred = NULL; + off_t origFileSize; + off_t writelimit; + off_t bytesToAdd; off_t actualBytesAdded; - u_long resid; - int eflags, ioflag; - int retval; off_t filebytes; - struct hfsmount *hfsmp; - int started_tr = 0, grabbed_lock = 0; + off_t offset; + size_t resid; + int eflags; + int ioflag = ap->a_ioflag; + int retval = 0; + int lockflags; + int cnode_locked = 0; + // LP64todo - fix this! uio_resid may be 64-bit value + resid = uio_resid(uio); + offset = uio_offset(uio); - if (uio->uio_offset < 0) + if (offset < 0) return (EINVAL); - if (uio->uio_resid == 0) + if (resid == 0) return (E_NONE); - if ((vp->v_type != VREG) || !UBCINFOEXISTS(vp)) - return (EPERM); /* Can only write regular files */ + if (!vnode_isreg(vp)) + return (EPERM); /* Can only write regular files */ + + /* Protect against a size change.
*/ + hfs_lock_truncate(VTOC(vp), TRUE); - ioflag = ap->a_ioflag; + if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) { + hfs_unlock_truncate(VTOC(vp)); + return (retval); + } + cnode_locked = 1; cp = VTOC(vp); fp = VTOF(vp); - vcb = VTOVCB(vp); - filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; - - if (ioflag & IO_APPEND) - uio->uio_offset = fp->ff_size; - if ((cp->c_flags & APPEND) && uio->uio_offset != fp->ff_size) - return (EPERM); - - // XXXdbg - don't allow modification of the journal or journal_info_block - if (VTOHFS(vp)->jnl && cp->c_datafork) { - struct HFSPlusExtentDescriptor *extd; + hfsmp = VTOHFS(vp); + filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; - extd = &cp->c_datafork->ff_extents[0]; - if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) { - return EPERM; - } + if (ioflag & IO_APPEND) { + uio_setoffset(uio, fp->ff_size); + offset = fp->ff_size; } - - writelimit = uio->uio_offset + uio->uio_resid; - - /* - * Maybe this should be above the vnode op call, but so long as - * file servers have no limits, I don't think it matters. - */ - p = uio->uio_procp; - if (vp->v_type == VREG && p && - writelimit > p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { - psignal(p, SIGXFSZ); - return (EFBIG); + if ((cp->c_flags & APPEND) && offset != fp->ff_size) { + retval = EPERM; + goto exit; } - p = current_proc(); - - VOP_DEVBLOCKSIZE(cp->c_devvp, &devBlockSize); - resid = uio->uio_resid; origFileSize = fp->ff_size; eflags = kEFDeferMask; /* defer file block allocations */ - filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START, - (int)uio->uio_offset, uio->uio_resid, (int)fp->ff_size, (int)filebytes, 0); - retval = 0; - - /* Now test if we need to extend the file */ - /* Doing so will adjust the filebytes for us */ - -#if QUOTA - if(writelimit > filebytes) { - bytesToAdd = writelimit - filebytes; - - retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, vcb->blockSize)), - ap->a_cred, 0); - if (retval) - return (retval); - } -#endif /* QUOTA */ - - hfsmp = VTOHFS(vp); #ifdef HFS_SPARSE_DEV /* @@ -276,66 +262,70 @@ hfs_write(ap) } #endif /* HFS_SPARSE_DEV */ - if (writelimit > filebytes) { - hfs_global_shared_lock_acquire(hfsmp); - grabbed_lock = 1; - } - if (hfsmp->jnl && (writelimit > filebytes)) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - hfs_global_shared_lock_release(hfsmp); - return EINVAL; - } - started_tr = 1; + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START, + (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0); + + /* Now test if we need to extend the file */ + /* Doing so will adjust the filebytes for us */ + + writelimit = offset + resid; + if (writelimit <= filebytes) + goto sizeok; + + cred = vfs_context_ucred(ap->a_context); +#if QUOTA + bytesToAdd = writelimit - filebytes; + retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)), + cred, 0); + if (retval) + goto exit; +#endif /* QUOTA */ + + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto exit; } while (writelimit > filebytes) { bytesToAdd = writelimit - filebytes; - if (ap->a_cred && suser(ap->a_cred, NULL) != 0) + if (cred && suser(cred, NULL) != 0) eflags |= kEFReserveMask; - /* lock extents b-tree (also protects volume bitmap) */ - retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, current_proc()); - if (retval != E_NONE) - break; + /* Protect extents b-tree and allocation bitmap */ + lockflags = 
SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); /* Files that are changing size are not hot file candidates. */ if (hfsmp->hfc_stage == HFC_RECORDING) { fp->ff_bytesread = 0; } - retval = MacToVFSError(ExtendFileC (vcb, (FCB*)fp, bytesToAdd, + retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd, 0, eflags, &actualBytesAdded)); - (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, p); + hfs_systemfile_unlock(hfsmp, lockflags); + if ((actualBytesAdded == 0) && (retval == E_NONE)) retval = ENOSPC; if (retval != E_NONE) break; - filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; + filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_NONE, - (int)uio->uio_offset, uio->uio_resid, (int)fp->ff_size, (int)filebytes, 0); - } - - // XXXdbg - if (started_tr) { - tv = time; - VOP_UPDATE(vp, &tv, &tv, 1); - - hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); - journal_end_transaction(hfsmp->jnl); - started_tr = 0; - } - if (grabbed_lock) { - hfs_global_shared_lock_release(hfsmp); - grabbed_lock = 0; + (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0); } + (void) hfs_update(vp, TRUE); + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + (void) hfs_end_transaction(hfsmp); +sizeok: if (retval == E_NONE) { off_t filesize; off_t zero_off; off_t tail_off; off_t inval_start; off_t inval_end; - off_t io_start, io_end; + off_t io_start; int lflag; struct rl_entry *invalid_range; @@ -346,15 +336,15 @@ hfs_write(ap) lflag = (ioflag & IO_SYNC); - if (uio->uio_offset <= fp->ff_size) { - zero_off = uio->uio_offset & ~PAGE_MASK_64; + if (offset <= fp->ff_size) { + zero_off = offset & ~PAGE_MASK_64; /* Check whether the area between the zero_offset and the start of the transfer is invalid and should be zero-filled as part of the transfer: */ - if (uio->uio_offset > zero_off) { - if (rl_scan(&fp->ff_invalidranges, zero_off, uio->uio_offset - 1, &invalid_range) != RL_NOOVERLAP) + if (offset > zero_off) { + if (rl_scan(&fp->ff_invalidranges, zero_off, offset - 1, &invalid_range) != RL_NOOVERLAP) lflag |= IO_HEADZEROFILL; } } else { @@ -373,7 +363,7 @@ hfs_write(ap) will be handled by the cluster_write of the actual data. */ inval_start = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64; - inval_end = uio->uio_offset & ~PAGE_MASK_64; + inval_end = offset & ~PAGE_MASK_64; zero_off = fp->ff_size; if ((fp->ff_size & PAGE_MASK_64) && @@ -397,6 +387,7 @@ hfs_write(ap) }; if (inval_start < inval_end) { + struct timeval tv; /* There's some range of data that's going to be marked invalid */ if (zero_off < inval_start) { @@ -404,20 +395,26 @@ hfs_write(ap) and the actual write will start on a page past inval_end.
Now's the last chance to zero-fill the page containing the EOF: */ - retval = cluster_write(vp, (struct uio *) 0, + hfs_unlock(cp); + cnode_locked = 0; + retval = cluster_write(vp, (uio_t) 0, fp->ff_size, inval_start, - zero_off, (off_t)0, devBlockSize, + zero_off, (off_t)0, lflag | IO_HEADZEROFILL | IO_NOZERODIRTY); + hfs_lock(cp, HFS_FORCE_LOCK); + cnode_locked = 1; if (retval) goto ioerr_exit; + offset = uio_offset(uio); }; /* Mark the remaining area of the newly allocated space as invalid: */ rl_add(inval_start, inval_end - 1 , &fp->ff_invalidranges); - cp->c_zftimeout = time.tv_sec + ZFTIMELIMIT; + microuptime(&tv); + cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT; zero_off = fp->ff_size = inval_end; }; - if (uio->uio_offset > zero_off) lflag |= IO_HEADZEROFILL; + if (offset > zero_off) lflag |= IO_HEADZEROFILL; }; /* Check to see whether the area between the end of the write and the end of @@ -441,23 +438,32 @@ hfs_write(ap) * made readable (removed from the invalid ranges) before cluster_write * tries to write it: */ - io_start = (lflag & IO_HEADZEROFILL) ? zero_off : uio->uio_offset; - io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit; + io_start = (lflag & IO_HEADZEROFILL) ? zero_off : offset; if (io_start < fp->ff_size) { + off_t io_end; + + io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit; rl_remove(io_start, io_end - 1, &fp->ff_invalidranges); }; + + hfs_unlock(cp); + cnode_locked = 0; retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off, - tail_off, devBlockSize, lflag | IO_NOZERODIRTY); - - if (uio->uio_offset > fp->ff_size) { - fp->ff_size = uio->uio_offset; + tail_off, lflag | IO_NOZERODIRTY); + offset = uio_offset(uio); + if (offset > fp->ff_size) { + fp->ff_size = offset; ubc_setsize(vp, fp->ff_size); /* XXX check errors */ + /* Files that are changing size are not hot file candidates. */ + if (hfsmp->hfc_stage == HFC_RECORDING) + fp->ff_bytesread = 0; + } + if (resid > uio_resid(uio)) { + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; } - if (resid > uio->uio_resid) - cp->c_flag |= C_CHANGE | C_UPDATE; } - HFS_KNOTE(vp, NOTE_WRITE); ioerr_exit: @@ -466,109 +472,466 @@ ioerr_exit: * we clear the setuid and setgid bits as a precaution against * tampering. */ - if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0) - cp->c_mode &= ~(S_ISUID | S_ISGID); - + if (cp->c_mode & (S_ISUID | S_ISGID)) { + cred = vfs_context_ucred(ap->a_context); + if (resid > uio_resid(uio) && cred && suser(cred, NULL)) { + if (!cnode_locked) { + hfs_lock(cp, HFS_FORCE_LOCK); + cnode_locked = 1; + } + cp->c_mode &= ~(S_ISUID | S_ISGID); + } + } if (retval) { if (ioflag & IO_UNIT) { - (void)VOP_TRUNCATE(vp, origFileSize, - ioflag & IO_SYNC, ap->a_cred, uio->uio_procp); - uio->uio_offset -= resid - uio->uio_resid; - uio->uio_resid = resid; - filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; + if (!cnode_locked) { + hfs_lock(cp, HFS_FORCE_LOCK); + cnode_locked = 1; + } + (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC, + 0, ap->a_context); + // LP64todo - fix this! 
resid needs to be user_ssize_t + uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio)))); + uio_setresid(uio, resid); + filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; + } + } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) { + if (!cnode_locked) { + hfs_lock(cp, HFS_FORCE_LOCK); + cnode_locked = 1; } - } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) { - tv = time; - retval = VOP_UPDATE(vp, &tv, &tv, 1); + retval = hfs_update(vp, TRUE); } - vcb->vcbWrCnt++; + /* Updating vcbWrCnt doesn't need to be atomic. */ + hfsmp->vcbWrCnt++; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_END, - (int)uio->uio_offset, uio->uio_resid, (int)fp->ff_size, (int)filebytes, 0); - + (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0); +exit: + if (cnode_locked) + hfs_unlock(cp); + hfs_unlock_truncate(cp); return (retval); } +/* support for the "bulk-access" fcntl */ -#ifdef HFS_SPARSE_DEV -struct hfs_backingstoreinfo { - int signature; /* == 3419115 */ - int version; /* version of this struct (1) */ - int backingfd; /* disk image file (on backing fs) */ - int bandsize; /* sparse disk image band size */ +#define CACHE_ELEMS 64 +#define CACHE_LEVELS 16 +#define PARENT_IDS_FLAG 0x100 + +/* from hfs_attrlist.c */ +extern unsigned long DerivePermissionSummary(uid_t obj_uid, gid_t obj_gid, + mode_t obj_mode, struct mount *mp, + kauth_cred_t cred, struct proc *p); + +/* from vfs/vfs_fsevents.c */ +extern char *get_pathbuff(void); +extern void release_pathbuff(char *buff); + +struct access_cache { + int numcached; + int cachehits; /* these two for statistics gathering */ + int lookups; + unsigned int *acache; + Boolean *haveaccess; }; -#define HFSIOC_SETBACKINGSTOREINFO _IOW('h', 7, struct hfs_backingstoreinfo) -#define HFSIOC_CLRBACKINGSTOREINFO _IO('h', 8) +struct access_t { + uid_t uid; /* IN: effective user id */ + short flags; /* IN: access requested (i.e. R_OK) */ + short num_groups; /* IN: number of groups user belongs to */ + int num_files; /* IN: number of files to process */ + int *file_ids; /* IN: array of file ids */ + gid_t *groups; /* IN: array of groups */ + short *access; /* OUT: access info for each file (0 for 'has access') */ +}; -#define HFS_SETBACKINGSTOREINFO IOCBASECMD(HFSIOC_SETBACKINGSTOREINFO) -#define HFS_CLRBACKINGSTOREINFO IOCBASECMD(HFSIOC_CLRBACKINGSTOREINFO) +struct user_access_t { + uid_t uid; /* IN: effective user id */ + short flags; /* IN: access requested (i.e. R_OK) */ + short num_groups; /* IN: number of groups user belongs to */ + int num_files; /* IN: number of files to process */ + user_addr_t file_ids; /* IN: array of file ids */ + user_addr_t groups; /* IN: array of groups */ + user_addr_t access; /* OUT: access info for each file (0 for 'has access') */ +}; -#endif /* HFS_SPARSE_DEV */ +/* + * Perform a binary search for the given parent_id. Return value is + * found/not found boolean, and indexp will be the index of the item + * or the index at which to insert the item if it's not found. + */ +static int +lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id) +{ + unsigned int lo, hi; + int index, matches = 0; + + if (cache->numcached == 0) { + *indexp = 0; + return 0; // table is empty, so insert at index=0 and report no match + } + + if (cache->numcached > CACHE_ELEMS) { + /*printf("EGAD! numcached is %d...
cut our losses and trim to %d\n", + cache->numcached, CACHE_ELEMS);*/ + cache->numcached = CACHE_ELEMS; + } + + lo = 0; + hi = cache->numcached - 1; + index = -1; + + /* perform binary search for parent_id */ + do { + unsigned int mid = (hi - lo)/2 + lo; + unsigned int this_id = cache->acache[mid]; + + if (parent_id == this_id) { + index = mid; + break; + } + + if (parent_id < this_id) { + hi = mid; + continue; + } + + if (parent_id > this_id) { + lo = mid + 1; + continue; + } + } while(lo < hi); + + /* check if lo and hi converged on the match */ + if (parent_id == cache->acache[hi]) { + index = hi; + } + + /* if no existing entry found, find index for new one */ + if (index == -1) { + index = (parent_id < cache->acache[hi]) ? hi : hi + 1; + matches = 0; + } else { + matches = 1; + } + + *indexp = index; + return matches; +} + +/* + * Add a node to the access_cache at the given index (or do a lookup first + * to find the index if -1 is passed in). We currently do a replace rather + * than an insert if the cache is full. + */ +static void +add_node(struct access_cache *cache, int index, cnid_t nodeID, int access) +{ + int lookup_index = -1; + + /* need to do a lookup first if -1 passed for index */ + if (index == -1) { + if (lookup_bucket(cache, &lookup_index, nodeID)) { + if (cache->haveaccess[lookup_index] != access) { + /* change access info for existing entry... should never happen */ + cache->haveaccess[lookup_index] = access; + } + + /* mission accomplished */ + return; + } else { + index = lookup_index; + } + + } + + /* if the cache is full, do a replace rather than an insert */ + if (cache->numcached >= CACHE_ELEMS) { + //printf("cache is full (%d). replace at index %d\n", cache->numcached, index); + cache->numcached = CACHE_ELEMS-1; + + if (index > cache->numcached) { + // printf("index %d pinned to %d\n", index, cache->numcached); + index = cache->numcached; + } + } else if (index >= 0 && index < cache->numcached) { + /* only do bcopy if we're inserting */ + bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) ); + bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(Boolean) ); + } + + cache->acache[index] = nodeID; + cache->haveaccess[index] = access; + cache->numcached++; +} + + +struct cinfo { + uid_t uid; + gid_t gid; + mode_t mode; + cnid_t parentcnid; +}; + +static int +snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void * arg) +{ + struct cinfo *cip = (struct cinfo *)arg; + + cip->uid = attrp->ca_uid; + cip->gid = attrp->ca_gid; + cip->mode = attrp->ca_mode; + cip->parentcnid = descp->cd_parentcnid; + + return (0); +} + +/* + * Lookup the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item + * isn't incore, then go to the catalog. 
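 *
 * The lookup escalates through three sources, cheapest first: the cnode
 * the fsctl was issued against, the in-core cnode hash, and only then
 * the catalog B-tree (which needs the shared catalog lock). A hedged
 * sketch of the shape, with a hypothetical fill_from() standing in for
 * the field copies done below:
 *
 *	if (cnid == skip_cp->c_cnid)
 *		fill_from(skip_cp);			// caller's own cnode
 *	else if (hfs_chash_snoop(dev, cnid, cb, &ci) == 0)
 *		fill_from(&ci);				// found in core
 *	else
 *		error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);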
+ */ +static int +do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, dev_t dev, cnid_t cnid, + struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp, struct proc *p) +{ + int error = 0; + + /* if this id matches the one the fsctl was called with, skip the lookup */ + if (cnid == skip_cp->c_cnid) { + cnattrp->ca_uid = skip_cp->c_uid; + cnattrp->ca_gid = skip_cp->c_gid; + cnattrp->ca_mode = skip_cp->c_mode; + keyp->hfsPlus.parentID = skip_cp->c_parentcnid; + } else { + struct cinfo c_info; + + /* otherwise, check the cnode hash in case the file/dir is incore */ + if (hfs_chash_snoop(dev, cnid, snoop_callback, &c_info) == 0) { + cnattrp->ca_uid = c_info.uid; + cnattrp->ca_gid = c_info.gid; + cnattrp->ca_mode = c_info.mode; + keyp->hfsPlus.parentID = c_info.parentcnid; + } else { + int lockflags; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + /* lookup this cnid in the catalog */ + error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp); + + hfs_systemfile_unlock(hfsmp, lockflags); + + cache->lookups++; + } + } + + return (error); +} /* + * Compute whether we have access to the given directory (nodeID) and all its parents. Cache + * up to CACHE_LEVELS as we progress towards the root. + */ +static int +do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID, + struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred, dev_t dev ) +{ + int myErr = 0; + int myResult; + HFSCatalogNodeID thisNodeID; + unsigned long myPerms; + struct cat_attr cnattr; + int cache_index = -1; + CatalogKey catkey; + + int i = 0, ids_to_cache = 0; + int parent_ids[CACHE_LEVELS]; + + /* root always has access */ + if (!suser(myp_ucred, NULL)) { + return (1); + } + + thisNodeID = nodeID; + while (thisNodeID >= kRootDirID) { + myResult = 0; /* default to "no access" */ + + /* check the cache before resorting to hitting the catalog */ + + /* ASSUMPTION: access info of cached entries is "final"... i.e.
no need + * to look any further after hitting cached dir */ + + if (lookup_bucket(cache, &cache_index, thisNodeID)) { + cache->cachehits++; + myResult = cache->haveaccess[cache_index]; + goto ExitThisRoutine; + } + + /* remember which parents we want to cache */ + if (ids_to_cache < CACHE_LEVELS) { + parent_ids[ids_to_cache] = thisNodeID; + ids_to_cache++; + } + + /* do the lookup (checks the cnode hash, then the catalog) */ + myErr = do_attr_lookup(hfsmp, cache, dev, thisNodeID, skip_cp, &catkey, &cnattr, theProcPtr); + if (myErr) { + goto ExitThisRoutine; /* no access */ + } + + myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, + cnattr.ca_mode, hfsmp->hfs_mp, + myp_ucred, theProcPtr); + + if ( (myPerms & X_OK) == 0 ) { + myResult = 0; + goto ExitThisRoutine; /* no access */ + } + + /* up the hierarchy we go */ + thisNodeID = catkey.hfsPlus.parentID; + } + + /* if here, we have access to this node */ + myResult = 1; + + ExitThisRoutine: + if (myErr) { + //printf("*** error %d from catalog looking up parent %d/%d!\n", myErr, dev, thisNodeID); + myResult = 0; + } + *err = myErr; + + /* cache the parent directory(ies) */ + for (i = 0; i < ids_to_cache; i++) { + /* small optimization: get rid of double-lookup for all these */ + // printf("adding %d to cache with result: %d\n", parent_ids[i], myResult); + add_node(cache, -1, parent_ids[i], myResult); + } + + return (myResult); +} +/* end "bulk-access" support */ -#% ioctl vp U U U -# - vop_ioctl { - IN struct vnode *vp; - IN u_long command; - IN caddr_t data; - IN int fflag; - IN struct ucred *cred; - IN struct proc *p; - */ +/* + * Callback for use with freeze ioctl. + */ +static int +hfs_freezewrite_callback(struct vnode *vp, void *cargs) +{ + vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze"); + + return 0; +} -/* ARGSUSED */ +/* + * Control filesystem operating characteristics. + */ int -hfs_ioctl(ap) - struct vop_ioctl_args /* { - struct vnode *a_vp; +hfs_vnop_ioctl( struct vnop_ioctl_args /* { + vnode_t a_vp; int a_command; caddr_t a_data; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; + vfs_context_t a_context; + } */ *ap) { + struct vnode * vp = ap->a_vp; + struct hfsmount *hfsmp = VTOHFS(vp); + vfs_context_t context = ap->a_context; + kauth_cred_t cred = vfs_context_ucred(context); + proc_t p = vfs_context_proc(context); + struct vfsstatfs *vfsp; + boolean_t is64bit; + + is64bit = proc_is64bit(p); + switch (ap->a_command) { + case HFS_RESIZE_VOLUME: { + u_int64_t newsize; + u_int64_t cursize; + + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); /* must be owner of file system */ + } + if (!vnode_isvroot(vp)) { + return (EINVAL); + } + newsize = *(u_int64_t *)ap->a_data; + cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; + + if (newsize > cursize) { + return hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context); + } else if (newsize < cursize) { + return hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context); + } else { + return (0); + } + } + case HFS_CHANGE_NEXT_ALLOCATION: { + u_int32_t location; + + if (vnode_vfsisrdonly(vp)) { + return (EROFS); + } + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); /* must be owner of file system */ + } + if (!vnode_isvroot(vp)) { + return (EINVAL); + } + location = *(u_int32_t *)ap->a_data; + if (location > hfsmp->totalBlocks - 1) { + return (EINVAL); + } + /* Return previous value. 
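 *
 * The buffer is used both ways: the caller passes in the desired next
 * allocation block and gets the old value back in the same word. From
 * user space the call would look roughly like this hedged sketch
 * (fsctl(2) against the volume root; the path and block number are
 * made up):
 *
 *	u_int32_t blk = 4096;	// desired next-allocation block
 *	if (fsctl("/Volumes/Test", HFS_CHANGE_NEXT_ALLOCATION, &blk, 0) == 0)
 *		printf("previous next-allocation block: %u\n", blk);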
*/ + *(u_int32_t *)ap->a_data = hfsmp->nextAllocation; + HFS_MOUNT_LOCK(hfsmp, TRUE); + hfsmp->nextAllocation = location; + hfsmp->vcbFlags |= 0xFF00; + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + return (0); + } + #ifdef HFS_SPARSE_DEV case HFS_SETBACKINGSTOREINFO: { - struct hfsmount * hfsmp; struct vnode * bsfs_rootvp; struct vnode * di_vp; - struct file * di_fp; struct hfs_backingstoreinfo *bsdata; int error = 0; - hfsmp = VTOHFS(ap->a_vp); if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { return (EALREADY); } - if (ap->a_p->p_ucred->cr_uid != 0 && - ap->a_p->p_ucred->cr_uid != (HFSTOVFS(hfsmp))->mnt_stat.f_owner) { + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { return (EACCES); /* must be owner of file system */ } bsdata = (struct hfs_backingstoreinfo *)ap->a_data; if (bsdata == NULL) { return (EINVAL); } - if (error = fdgetf(ap->a_p, bsdata->backingfd, &di_fp)) { + if ((error = file_vnode(bsdata->backingfd, &di_vp))) { return (error); } - if (fref(di_fp) == -1) { - return (EBADF); + if ((error = vnode_getwithref(di_vp))) { + file_drop(bsdata->backingfd); + return(error); } - if (di_fp->f_type != DTYPE_VNODE) { - frele(di_fp); - return (EINVAL); - } - di_vp = (struct vnode *)di_fp->f_data; - if (ap->a_vp->v_mount == di_vp->v_mount) { - frele(di_fp); + + if (vnode_mount(vp) == vnode_mount(di_vp)) { + (void)vnode_put(di_vp); + file_drop(bsdata->backingfd); return (EINVAL); } @@ -576,28 +939,30 @@ hfs_ioctl(ap) * Obtain the backing fs root vnode and keep a reference * on it. This reference will be dropped in hfs_unmount. */ - error = VFS_ROOT(di_vp->v_mount, &bsfs_rootvp); + error = VFS_ROOT(vnode_mount(di_vp), &bsfs_rootvp, NULL); /* XXX use context! */ if (error) { - frele(di_fp); + (void)vnode_put(di_vp); + file_drop(bsdata->backingfd); return (error); } - VOP_UNLOCK(bsfs_rootvp, 0, ap->a_p); /* Hold on to the reference */ + vnode_ref(bsfs_rootvp); + vnode_put(bsfs_rootvp); hfsmp->hfs_backingfs_rootvp = bsfs_rootvp; hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE; hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize; hfsmp->hfs_sparsebandblks *= 4; - frele(di_fp); + (void)vnode_put(di_vp); + file_drop(bsdata->backingfd); return (0); } case HFS_CLRBACKINGSTOREINFO: { - struct hfsmount * hfsmp; struct vnode * tmpvp; - hfsmp = VTOHFS(ap->a_vp); - if (ap->a_p->p_ucred->cr_uid != 0 && - ap->a_p->p_ucred->cr_uid != (HFSTOVFS(hfsmp))->mnt_stat.f_owner) { + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { return (EACCES); /* must be owner of file system */ } if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && @@ -607,166 +972,502 @@ hfs_ioctl(ap) tmpvp = hfsmp->hfs_backingfs_rootvp; hfsmp->hfs_backingfs_rootvp = NULLVP; hfsmp->hfs_sparsebandblks = 0; - vrele(tmpvp); + vnode_rele(tmpvp); } return (0); } #endif /* HFS_SPARSE_DEV */ - case 6: { + case F_FREEZE_FS: { + struct mount *mp; + task_t task; + + if (!is_suser()) + return (EACCES); + + mp = vnode_mount(vp); + hfsmp = VFSTOHFS(mp); + + if (!(hfsmp->jnl)) + return (ENOTSUP); + + task = current_task(); + task_working_set_disable(task); + + // flush things before we get started to try and prevent + // dirty data from being paged out while we're frozen. + // note: can't do this after taking the lock as it will + // deadlock against ourselves. 
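	// The ordering below is deliberate: (1) wait out in-flight writes
	// on every vnode, (2) take the global exclusive lock so nothing
	// new starts, (3) flush the journal, then (4) re-wait on just the
	// system files and the device vnode for anything that slipped in
	// between steps (1) and (2). The per-vnode wait is the same call
	// the callback above makes; a hedged reading of its arguments:
	//
	//	vnode_waitforwrites(vp, 0,	// drain to zero outstanding
	//	    0, 0,			// no sleep flags, no timeout
	//	    "hfs freeze");		// wait channel message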
+ vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL); + hfs_global_exclusive_lock_acquire(hfsmp); + journal_flush(hfsmp->jnl); + // don't need to iterate on all vnodes, we just need to + // wait for writes to the system files and the device vnode + // vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL); + if (HFSTOVCB(hfsmp)->extentsRefNum) + vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze"); + if (HFSTOVCB(hfsmp)->catalogRefNum) + vnode_waitforwrites(HFSTOVCB(hfsmp)->catalogRefNum, 0, 0, 0, "hfs freeze"); + if (HFSTOVCB(hfsmp)->allocationsRefNum) + vnode_waitforwrites(HFSTOVCB(hfsmp)->allocationsRefNum, 0, 0, 0, "hfs freeze"); + if (hfsmp->hfs_attribute_vp) + vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs freeze"); + vnode_waitforwrites(hfsmp->hfs_devvp, 0, 0, 0, "hfs freeze"); + + hfsmp->hfs_freezing_proc = current_proc(); + + return (0); + } + + case F_THAW_FS: { + if (!is_suser()) + return (EACCES); + + // if we're not the one who froze the fs then we + // can't thaw it. + if (hfsmp->hfs_freezing_proc != current_proc()) { + return EINVAL; + } + + // NOTE: if you add code here, also go check the + // code that "thaws" the fs in hfs_vnop_close() + // + hfsmp->hfs_freezing_proc = NULL; + hfs_global_exclusive_lock_release(hfsmp); + + return (0); + } + +#define HFSIOC_BULKACCESS _IOW('h', 9, struct access_t) +#define HFS_BULKACCESS_FSCTL IOCBASECMD(HFSIOC_BULKACCESS) + + case HFS_BULKACCESS_FSCTL: + case HFS_BULKACCESS: { + /* + * NOTE: on entry, the vnode is locked. In case this vnode + * happens to be in our list of file_ids, we'll note it to + * avoid calling hfs_chashget_nowait() on that id as that + * will cause a "locking against myself" panic. + */ + Boolean check_leaf = true; + + struct user_access_t *user_access_structp; + struct user_access_t tmp_user_access_t; + struct access_cache cache; + + int error = 0, i; + + dev_t dev = VTOC(vp)->c_dev; + + short flags; + struct ucred myucred; /* XXX ILLEGAL */ + int num_files; + int *file_ids = NULL; + short *access = NULL; + + cnid_t cnid; + cnid_t prevParent_cnid = 0; + unsigned long myPerms; + short myaccess = 0; + struct cat_attr cnattr; + CatalogKey catkey; + struct cnode *skip_cp = VTOC(vp); + struct vfs_context my_context; + + /* first, return error if not run as root */ + if (cred->cr_ruid != 0) { + return EPERM; + } + + /* initialize the local cache and buffers */ + cache.numcached = 0; + cache.cachehits = 0; + cache.lookups = 0; + + file_ids = (int *) get_pathbuff(); + access = (short *) get_pathbuff(); + cache.acache = (int *) get_pathbuff(); + cache.haveaccess = (Boolean *) get_pathbuff(); + + if (file_ids == NULL || access == NULL || cache.acache == NULL || cache.haveaccess == NULL) { + release_pathbuff((char *) file_ids); + release_pathbuff((char *) access); + release_pathbuff((char *) cache.acache); + release_pathbuff((char *) cache.haveaccess); + + return ENOMEM; + } + + /* struct copyin done during dispatch...
need to copy file_id array separately */ + if (ap->a_data == NULL) { + error = EINVAL; + goto err_exit_bulk_access; + } + + if (is64bit) { + user_access_structp = (struct user_access_t *)ap->a_data; + } + else { + struct access_t * accessp = (struct access_t *)ap->a_data; + tmp_user_access_t.uid = accessp->uid; + tmp_user_access_t.flags = accessp->flags; + tmp_user_access_t.num_groups = accessp->num_groups; + tmp_user_access_t.num_files = accessp->num_files; + tmp_user_access_t.file_ids = CAST_USER_ADDR_T(accessp->file_ids); + tmp_user_access_t.groups = CAST_USER_ADDR_T(accessp->groups); + tmp_user_access_t.access = CAST_USER_ADDR_T(accessp->access); + user_access_structp = &tmp_user_access_t; + } + + num_files = user_access_structp->num_files; + if (num_files < 1) { + goto err_exit_bulk_access; + } + if (num_files > 256) { + error = EINVAL; + goto err_exit_bulk_access; + } + + if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids, + num_files * sizeof(int)))) { + goto err_exit_bulk_access; + } + + /* fill in the ucred structure */ + flags = user_access_structp->flags; + if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) { + flags = R_OK; + } + + /* check if we've been passed leaf node ids or parent ids */ + if (flags & PARENT_IDS_FLAG) { + check_leaf = false; + } + + memset(&myucred, 0, sizeof(myucred)); + myucred.cr_ref = 1; + myucred.cr_uid = myucred.cr_ruid = myucred.cr_svuid = user_access_structp->uid; + myucred.cr_ngroups = user_access_structp->num_groups; + if (myucred.cr_ngroups < 1 || myucred.cr_ngroups > 16) { + myucred.cr_ngroups = 0; + } else if ((error = copyin(user_access_structp->groups, (caddr_t)myucred.cr_groups, + myucred.cr_ngroups * sizeof(gid_t)))) { + goto err_exit_bulk_access; + } + myucred.cr_rgid = myucred.cr_svgid = myucred.cr_groups[0]; + + my_context.vc_proc = p; + my_context.vc_ucred = &myucred; + + /* Check access to each file_id passed in */ + for (i = 0; i < num_files; i++) { +#if 0 + cnid = (cnid_t) file_ids[i]; + + /* root always has access */ + if (!suser(&myucred, NULL)) { + access[i] = 0; + continue; + } + + if (check_leaf) { + + /* do the lookup (checks the cnode hash, then the catalog) */ + error = do_attr_lookup(hfsmp, &cache, dev, cnid, skip_cp, &catkey, &cnattr, p); + if (error) { + access[i] = (short) error; + continue; + } + + /* before calling CheckAccess(), check the target file for read access */ + myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, + cnattr.ca_mode, hfsmp->hfs_mp, &myucred, p ); + + + /* fail fast if no access */ + if ((myPerms & flags) == 0) { + access[i] = EACCES; + continue; + } + } else { + /* we were passed an array of parent ids */ + catkey.hfsPlus.parentID = cnid; + } + + /* if the last guy had the same parent and had access, we're done */ + if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0) { + cache.cachehits++; + access[i] = 0; + continue; + } + + myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID, + skip_cp, p, &myucred, dev); + + if ( myaccess ) { + access[i] = 0; // have access.. no errors to report + } else { + access[i] = (error != 0 ? 
(short) error : EACCES);
+ }
+
+ prevParent_cnid = catkey.hfsPlus.parentID;
+#else
+ int myErr;
+
+ cnid = (cnid_t)file_ids[i];
+
+ while (cnid >= kRootDirID) {
+ /* get the vnode for this cnid */
+ myErr = hfs_vget(hfsmp, cnid, &vp, 0);
+ if ( myErr ) {
+ access[i] = EACCES;
+ break;
+ }
+
+ cnid = VTOC(vp)->c_parentcnid;
+
+ hfs_unlock(VTOC(vp));
+ if (vnode_vtype(vp) == VDIR) {
+ myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, &my_context);
+ if (myErr) {
+ // try again with just read-access
+ myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, &my_context);
+ }
+ } else {
+ myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, &my_context);
+ }
+ vnode_put(vp);
+ access[i] = myErr;
+ if (myErr) {
+ break;
+ }
+ }
+#endif
+ }
+
+ /* copyout the access array */
+ if ((error = copyout((caddr_t)access, user_access_structp->access,
+ num_files * sizeof (short)))) {
+ goto err_exit_bulk_access;
+ }
+
+ err_exit_bulk_access:
+
+ //printf("on exit (err %d), numfiles/numcached/cachehits/lookups is %d/%d/%d/%d\n", error, num_files, cache.numcached, cache.cachehits, cache.lookups);
+
+ release_pathbuff((char *) cache.acache);
+ release_pathbuff((char *) cache.haveaccess);
+ release_pathbuff((char *) file_ids);
+ release_pathbuff((char *) access);
+
+ return (error);
+ } /* HFS_BULKACCESS */
+
+ case HFS_SETACLSTATE: {
+ int state;
+
+ if (!is_suser()) {
+ return (EPERM);
+ }
+ if (ap->a_data == NULL) {
+ return (EINVAL);
+ }
+ state = *(int *)ap->a_data;
+ if (state == 0 || state == 1)
+ return hfs_setextendedsecurity(hfsmp, state);
+ else
+ return (EINVAL);
+ }
+
+ case F_FULLFSYNC: {
 int error;
- ap->a_vp->v_flag |= VFULLFSYNC;
- error = VOP_FSYNC(ap->a_vp, ap->a_cred, MNT_NOWAIT, ap->a_p);
- ap->a_vp->v_flag &= ~VFULLFSYNC;
+ error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
+ if (error == 0) {
+ error = hfs_fsync(vp, MNT_NOWAIT, TRUE, p);
+ hfs_unlock(VTOC(vp));
+ }
 return error;
 }
- case 5: {
- register struct vnode *vp;
+
+ case F_CHKCLEAN: {
 register struct cnode *cp;
- struct filefork *fp;
 int error;
- vp = ap->a_vp;
- cp = VTOC(vp);
- fp = VTOF(vp);
-
- if (vp->v_type != VREG)
+ if (!vnode_isreg(vp))
 return EINVAL;
- VOP_LEASE(vp, ap->a_p, ap->a_cred, LEASE_READ);
- error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p);
- if (error)
- return (error);
-
- /*
- * used by regression test to determine if
- * all the dirty pages (via write) have been cleaned
- * after a call to 'fsysnc'.
- */
- error = is_file_clean(vp, fp->ff_size);
- VOP_UNLOCK(vp, 0, ap->a_p);
-
+ error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
+ if (error == 0) {
+ cp = VTOC(vp);
+ /*
+ * used by regression test to determine if
+ * all the dirty pages (via write) have been cleaned
+ * after a call to 'fsync'.
+ */
+ error = is_file_clean(vp, VTOF(vp)->ff_size);
+ hfs_unlock(cp);
+ }
 return (error);
 }
- case 1: {
- register struct vnode *vp;
+ case F_RDADVISE: {
 register struct radvisory *ra;
- register struct cnode *cp;
 struct filefork *fp;
- int devBlockSize = 0;
 int error;
- vp = ap->a_vp;
-
- if (vp->v_type != VREG)
+ if (!vnode_isreg(vp))
 return EINVAL;
- VOP_LEASE(vp, ap->a_p, ap->a_cred, LEASE_READ);
- error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p);
- if (error)
- return (error);
 ra = (struct radvisory *)(ap->a_data);
- cp = VTOC(vp);
 fp = VTOF(vp);
+ /* Protect against a size change. 
*/ + hfs_lock_truncate(VTOC(vp), TRUE); + if (ra->ra_offset >= fp->ff_size) { - VOP_UNLOCK(vp, 0, ap->a_p); - return (EFBIG); + error = EFBIG; + } else { + error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count); } - VOP_DEVBLOCKSIZE(cp->c_devvp, &devBlockSize); - - error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count, devBlockSize); - VOP_UNLOCK(vp, 0, ap->a_p); + hfs_unlock_truncate(VTOC(vp)); return (error); } - case 2: /* F_READBOOTBLOCKS */ - case 3: /* F_WRITEBOOTBLOCKS */ - { - struct vnode *vp = ap->a_vp; + case F_READBOOTSTRAP: + case F_WRITEBOOTSTRAP: + { struct vnode *devvp = NULL; - struct fbootstraptransfer *btd = (struct fbootstraptransfer *)ap->a_data; + user_fbootstraptransfer_t *user_bootstrapp; int devBlockSize; int error; - struct iovec aiov; - struct uio auio; - u_long blockNumber; + uio_t auio; + daddr64_t blockNumber; u_long blockOffset; u_long xfersize; struct buf *bp; + user_fbootstraptransfer_t user_bootstrap; - if ((vp->v_flag & VROOT) == 0) return EINVAL; - if (btd->fbt_offset + btd->fbt_length > 1024) return EINVAL; + if (!vnode_isvroot(vp)) + return (EINVAL); + /* LP64 - when caller is a 64 bit process then we are passed a pointer + * to a user_fbootstraptransfer_t else we get a pointer to a + * fbootstraptransfer_t which we munge into a user_fbootstraptransfer_t + */ + if (is64bit) { + user_bootstrapp = (user_fbootstraptransfer_t *)ap->a_data; + } + else { + fbootstraptransfer_t *bootstrapp = (fbootstraptransfer_t *)ap->a_data; + user_bootstrapp = &user_bootstrap; + user_bootstrap.fbt_offset = bootstrapp->fbt_offset; + user_bootstrap.fbt_length = bootstrapp->fbt_length; + user_bootstrap.fbt_buffer = CAST_USER_ADDR_T(bootstrapp->fbt_buffer); + } + if (user_bootstrapp->fbt_offset + user_bootstrapp->fbt_length > 1024) + return EINVAL; devvp = VTOHFS(vp)->hfs_devvp; - aiov.iov_base = btd->fbt_buffer; - aiov.iov_len = btd->fbt_length; - - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_offset = btd->fbt_offset; - auio.uio_resid = btd->fbt_length; - auio.uio_segflg = UIO_USERSPACE; - auio.uio_rw = (ap->a_command == 3) ? UIO_WRITE : UIO_READ; /* F_WRITEBOOTSTRAP / F_READBOOTSTRAP */ - auio.uio_procp = ap->a_p; - - VOP_DEVBLOCKSIZE(devvp, &devBlockSize); - - while (auio.uio_resid > 0) { - blockNumber = auio.uio_offset / devBlockSize; - error = bread(devvp, blockNumber, devBlockSize, ap->a_cred, &bp); - if (error) { - if (bp) brelse(bp); - return error; - }; - - blockOffset = auio.uio_offset % devBlockSize; - xfersize = devBlockSize - blockOffset; - error = uiomove((caddr_t)bp->b_data + blockOffset, (int)xfersize, &auio); - if (error) { - brelse(bp); - return error; - }; - if (auio.uio_rw == UIO_WRITE) { - error = VOP_BWRITE(bp); - if (error) return error; - } else { - brelse(bp); - }; - }; - }; - return 0; - - case _IOC(IOC_OUT,'h', 4, 0): /* Create date in local time */ - { - *(time_t *)(ap->a_data) = to_bsd_time(VTOVCB(ap->a_vp)->localCreateDate); - return 0; - } - - default: - return (ENOTTY); - } + auio = uio_create(1, user_bootstrapp->fbt_offset, + is64bit ? UIO_USERSPACE64 : UIO_USERSPACE32, + (ap->a_command == F_WRITEBOOTSTRAP) ? 
UIO_WRITE : UIO_READ); + uio_addiov(auio, user_bootstrapp->fbt_buffer, user_bootstrapp->fbt_length); + + devBlockSize = vfs_devblocksize(vnode_mount(vp)); + + while (uio_resid(auio) > 0) { + blockNumber = uio_offset(auio) / devBlockSize; + error = (int)buf_bread(devvp, blockNumber, devBlockSize, cred, &bp); + if (error) { + if (bp) buf_brelse(bp); + uio_free(auio); + return error; + }; + + blockOffset = uio_offset(auio) % devBlockSize; + xfersize = devBlockSize - blockOffset; + error = uiomove((caddr_t)buf_dataptr(bp) + blockOffset, (int)xfersize, auio); + if (error) { + buf_brelse(bp); + uio_free(auio); + return error; + }; + if (uio_rw(auio) == UIO_WRITE) { + error = VNOP_BWRITE(bp); + if (error) { + uio_free(auio); + return error; + } + } else { + buf_brelse(bp); + }; + }; + uio_free(auio); + }; + return 0; + + case _IOC(IOC_OUT,'h', 4, 0): /* Create date in local time */ + { + if (is64bit) { + *(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate)); + } + else { + *(time_t *)(ap->a_data) = to_bsd_time(VTOVCB(vp)->localCreateDate); + } + return 0; + } + + case HFS_GET_MOUNT_TIME: + return copyout(&hfsmp->hfs_mount_time, CAST_USER_ADDR_T(ap->a_data), sizeof(hfsmp->hfs_mount_time)); + break; + + case HFS_GET_LAST_MTIME: + return copyout(&hfsmp->hfs_last_mounted_mtime, CAST_USER_ADDR_T(ap->a_data), sizeof(hfsmp->hfs_last_mounted_mtime)); + break; + + case HFS_SET_BOOT_INFO: + if (!vnode_isvroot(vp)) + return(EINVAL); + if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner)) + return(EACCES); /* must be superuser or owner of filesystem */ + HFS_MOUNT_LOCK(hfsmp, TRUE); + bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo)); + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); + break; + + case HFS_GET_BOOT_INFO: + if (!vnode_isvroot(vp)) + return(EINVAL); + HFS_MOUNT_LOCK(hfsmp, TRUE); + bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo)); + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + break; + + default: + return (ENOTTY); + } /* Should never get here */ return 0; } -/* ARGSUSED */ +/* + * select + */ int -hfs_select(ap) - struct vop_select_args /* { - struct vnode *a_vp; +hfs_vnop_select(__unused struct vnop_select_args *ap) +/* + struct vnop_select_args { + vnode_t a_vp; int a_which; int a_fflags; - struct ucred *a_cred; void *a_wql; - struct proc *a_p; - } */ *ap; + vfs_context_t a_context; + }; +*/ { /* * We should really check to see if I/O is possible. @@ -774,192 +1475,139 @@ hfs_select(ap) return (1); } -/* - * Bmap converts a the logical block number of a file to its physical block - * number on the disk. - */ - -/* - * vp - address of vnode file the file - * bn - which logical block to convert to a physical block number. - * vpp - returns the vnode for the block special file holding the filesystem - * containing the file of interest - * bnp - address of where to return the filesystem physical block number -#% bmap vp L L L -#% bmap vpp - U - -# - vop_bmap { - IN struct vnode *vp; - IN daddr_t bn; - OUT struct vnode **vpp; - IN daddr_t *bnp; - OUT int *runp; - */ /* * Converts a logical block number to a physical block, and optionally returns * the amount of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize. * The physical block number is based on the device block size, currently its 512. 
* The block run is returned in logical blocks, and is the REMAINING amount of blocks */ - int -hfs_bmap(ap) - struct vop_bmap_args /* { - struct vnode *a_vp; - daddr_t a_bn; - struct vnode **a_vpp; - daddr_t *a_bnp; - int *a_runp; - } */ *ap; +hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, int *runp) { - struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct filefork *fp = VTOF(vp); struct hfsmount *hfsmp = VTOHFS(vp); - int retval = E_NONE; - daddr_t logBlockSize; - size_t bytesContAvail = 0; - off_t blockposition; - struct proc *p = NULL; - int lockExtBtree; - struct rl_entry *invalid_range; - enum rl_overlaptype overlaptype; + int retval = E_NONE; + daddr_t logBlockSize; + size_t bytesContAvail = 0; + off_t blockposition; + int lockExtBtree; + int lockflags = 0; /* * Check for underlying vnode requests and ensure that logical * to physical mapping is requested. */ - if (ap->a_vpp != NULL) - *ap->a_vpp = cp->c_devvp; - if (ap->a_bnp == NULL) + if (vpp != NULL) + *vpp = cp->c_devvp; + if (bnp == NULL) return (0); - /* Only clustered I/O should have delayed allocations. */ - DBG_ASSERT(fp->ff_unallocblocks == 0); - logBlockSize = GetLogicalBlockSize(vp); - blockposition = (off_t)ap->a_bn * (off_t)logBlockSize; + blockposition = (off_t)bn * (off_t)logBlockSize; lockExtBtree = overflow_extents(fp); - if (lockExtBtree) { - p = current_proc(); - retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, - LK_EXCLUSIVE | LK_CANRECURSE, p); - if (retval) - return (retval); - } + + if (lockExtBtree) + lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK); retval = MacToVFSError( MapFileBlockC (HFSTOVCB(hfsmp), (FCB*)fp, MAXPHYSIO, blockposition, - ap->a_bnp, + bnp, &bytesContAvail)); - if (lockExtBtree) (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p); - - if (retval == E_NONE) { - /* Adjust the mapping information for invalid file ranges: */ - overlaptype = rl_scan(&fp->ff_invalidranges, - blockposition, - blockposition + MAXPHYSIO - 1, - &invalid_range); - if (overlaptype != RL_NOOVERLAP) { - switch(overlaptype) { - case RL_MATCHINGOVERLAP: - case RL_OVERLAPCONTAINSRANGE: - case RL_OVERLAPSTARTSBEFORE: - /* There's no valid block for this byte offset: */ - *ap->a_bnp = (daddr_t)-1; - bytesContAvail = invalid_range->rl_end + 1 - blockposition; - break; - - case RL_OVERLAPISCONTAINED: - case RL_OVERLAPENDSAFTER: - /* The range of interest hits an invalid block before the end: */ - if (invalid_range->rl_start == blockposition) { - /* There's actually no valid information to be had starting here: */ - *ap->a_bnp = (daddr_t)-1; - if ((fp->ff_size > (invalid_range->rl_end + 1)) && - (invalid_range->rl_end + 1 - blockposition < bytesContAvail)) { - bytesContAvail = invalid_range->rl_end + 1 - blockposition; - }; - } else { - bytesContAvail = invalid_range->rl_start - blockposition; - }; - break; - }; - if (bytesContAvail > MAXPHYSIO) bytesContAvail = MAXPHYSIO; - }; - - /* Figure out how many read ahead blocks there are */ - if (ap->a_runp != NULL) { - if (can_cluster(logBlockSize)) { - /* Make sure this result never goes negative: */ - *ap->a_runp = (bytesContAvail < logBlockSize) ? 
0 : (bytesContAvail / logBlockSize) - 1; - } else { - *ap->a_runp = 0; - }; - }; - }; - - return (retval); -} + if (lockExtBtree) + hfs_systemfile_unlock(hfsmp, lockflags); -/* blktooff converts logical block number to file offset */ + if (retval == E_NONE) { + /* Figure out how many read ahead blocks there are */ + if (runp != NULL) { + if (can_cluster(logBlockSize)) { + /* Make sure this result never goes negative: */ + *runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1; + } else { + *runp = 0; + } + } + } + return (retval); +} +/* + * Convert logical block number to file offset. + */ int -hfs_blktooff(ap) - struct vop_blktooff_args /* { - struct vnode *a_vp; - daddr_t a_lblkno; +hfs_vnop_blktooff(struct vnop_blktooff_args *ap) +/* + struct vnop_blktooff_args { + vnode_t a_vp; + daddr64_t a_lblkno; off_t *a_offset; - } */ *ap; + }; +*/ { if (ap->a_vp == NULL) return (EINVAL); - *ap->a_offset = (off_t)ap->a_lblkno * PAGE_SIZE_64; + *ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp); return(0); } +/* + * Convert file offset to logical block number. + */ int -hfs_offtoblk(ap) - struct vop_offtoblk_args /* { - struct vnode *a_vp; +hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap) +/* + struct vnop_offtoblk_args { + vnode_t a_vp; off_t a_offset; - daddr_t *a_lblkno; - } */ *ap; + daddr64_t *a_lblkno; + }; +*/ { if (ap->a_vp == NULL) return (EINVAL); - *ap->a_lblkno = ap->a_offset / PAGE_SIZE_64; + *ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp)); return(0); } +/* + * Map file offset to physical block number. + * + * System file cnodes are expected to be locked (shared or exclusive). + */ int -hfs_cmap(ap) - struct vop_cmap_args /* { - struct vnode *a_vp; +hfs_vnop_blockmap(struct vnop_blockmap_args *ap) +/* + struct vnop_blockmap_args { + vnode_t a_vp; off_t a_foffset; size_t a_size; - daddr_t *a_bpn; + daddr64_t *a_bpn; size_t *a_run; void *a_poff; - } */ *ap; + int a_flags; + vfs_context_t a_context; + }; +*/ { - struct hfsmount *hfsmp = VTOHFS(ap->a_vp); - struct filefork *fp = VTOF(ap->a_vp); - size_t bytesContAvail = 0; - int retval = E_NONE; - int lockExtBtree = 0; - struct proc *p = NULL; - struct rl_entry *invalid_range; - enum rl_overlaptype overlaptype; - int started_tr = 0, grabbed_lock = 0; - struct timeval tv; + struct vnode *vp = ap->a_vp; + struct cnode *cp; + struct filefork *fp; + struct hfsmount *hfsmp; + size_t bytesContAvail = 0; + int retval = E_NONE; + int syslocks = 0; + int lockflags = 0; + struct rl_entry *invalid_range; + enum rl_overlaptype overlaptype; + int started_tr = 0; + int tooklock = 0; /* * Check for underlying vnode requests and ensure that logical @@ -968,56 +1616,43 @@ hfs_cmap(ap) if (ap->a_bpn == NULL) return (0); - p = current_proc(); - - if (ISSET(VTOC(ap->a_vp)->c_flag, C_NOBLKMAP)) { - /* - * File blocks are getting remapped. Wait until its finished. - */ - SET(VTOC(ap->a_vp)->c_flag, C_WBLKMAP); - (void) tsleep((caddr_t)VTOC(ap->a_vp), PINOD, "hfs_cmap", 0); - if (ISSET(VTOC(ap->a_vp)->c_flag, C_NOBLKMAP)) - panic("hfs_cmap: no mappable blocks"); - } + if ( !vnode_issystem(vp) && !vnode_islnk(vp)) { + if (VTOC(vp)->c_lockowner != current_thread()) { + hfs_lock(VTOC(vp), HFS_FORCE_LOCK); + tooklock = 1; + } else { + cp = VTOC(vp); + panic("blockmap: %s cnode lock already held!\n", + cp->c_desc.cd_nameptr ? 
cp->c_desc.cd_nameptr : "");
+ }
+ }
+ hfsmp = VTOHFS(vp);
+ cp = VTOC(vp);
+ fp = VTOF(vp);
- retry:
+retry:
 if (fp->ff_unallocblocks) {
- lockExtBtree = 1;
-
- // XXXdbg
- hfs_global_shared_lock_acquire(hfsmp);
- grabbed_lock = 1;
-
- if (hfsmp->jnl) {
- if (journal_start_transaction(hfsmp->jnl) != 0) {
- hfs_global_shared_lock_release(hfsmp);
- return EINVAL;
- } else {
- started_tr = 1;
- }
- }
-
- if (retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE | LK_CANRECURSE, p)) {
- if (started_tr) {
- journal_end_transaction(hfsmp->jnl);
- }
- if (grabbed_lock) {
- hfs_global_shared_lock_release(hfsmp);
- }
- return (retval);
+ if (hfs_start_transaction(hfsmp) != 0) {
+ retval = EINVAL;
+ goto exit;
+ } else {
+ started_tr = 1;
 }
+ syslocks = SFL_EXTENTS | SFL_BITMAP;
+
 } else if (overflow_extents(fp)) {
- lockExtBtree = 1;
- if (retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE | LK_CANRECURSE, p)) {
- return retval;
- }
+ syslocks = SFL_EXTENTS;
 }
+
+ if (syslocks)
+ lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);
 /*
 * Check for any delayed allocations.
 */
 if (fp->ff_unallocblocks) {
- SInt64 reqbytes, actbytes;
+ SInt64 actbytes;
+ u_int32_t loanedBlocks;
 //
 // Make sure we have a transaction. It's possible
@@ -1026,345 +1661,144 @@ hfs_cmap(ap)
 // btree, ff_unallocblocks became non-zero and so we
 // will need to start a transaction.
 //
- if (hfsmp->jnl && started_tr == 0) {
- if (lockExtBtree) {
- (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p);
- lockExtBtree = 0;
- }
-
- goto retry;
+ if (started_tr == 0) {
+ if (syslocks) {
+ hfs_systemfile_unlock(hfsmp, lockflags);
+ syslocks = 0;
+ }
+ goto retry;
 }
- reqbytes = (SInt64)fp->ff_unallocblocks *
- (SInt64)HFSTOVCB(hfsmp)->blockSize;
 /*
- * Release the blocks on loan and aquire some real ones.
- * Note that we can race someone else for these blocks
- * (and lose) so cmap needs to handle a failure here.
- * Currently this race can't occur because all allocations
- * are protected by an exclusive lock on the Extents
- * Overflow file.
+ * Note: ExtendFileC will release any blocks on loan and
+ * acquire real blocks. So we ask to extend by zero bytes
+ * since ExtendFileC will account for the virtual blocks.
 */
- HFSTOVCB(hfsmp)->loanedBlocks -= fp->ff_unallocblocks;
- FTOC(fp)->c_blocks -= fp->ff_unallocblocks;
- fp->ff_blocks -= fp->ff_unallocblocks;
- fp->ff_unallocblocks = 0;
- /* Files that are changing size are not hot file candidates. 
*/ - if (hfsmp->hfc_stage == HFC_RECORDING) { - fp->ff_bytesread = 0; - } - while (retval == 0 && reqbytes > 0) { - retval = MacToVFSError(ExtendFileC(HFSTOVCB(hfsmp), - (FCB*)fp, reqbytes, 0, - kEFAllMask | kEFNoClumpMask, &actbytes)); - if (retval == 0 && actbytes == 0) - retval = ENOSPC; - - if (retval) { - fp->ff_unallocblocks = - reqbytes / HFSTOVCB(hfsmp)->blockSize; - HFSTOVCB(hfsmp)->loanedBlocks += fp->ff_unallocblocks; - FTOC(fp)->c_blocks += fp->ff_unallocblocks; - fp->ff_blocks += fp->ff_unallocblocks; - } - reqbytes -= actbytes; + loanedBlocks = fp->ff_unallocblocks; + retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0, + kEFAllMask | kEFNoClumpMask, &actbytes); + + if (retval) { + fp->ff_unallocblocks = loanedBlocks; + cp->c_blocks += loanedBlocks; + fp->ff_blocks += loanedBlocks; + + HFS_MOUNT_LOCK(hfsmp, TRUE); + hfsmp->loanedBlocks += loanedBlocks; + HFS_MOUNT_UNLOCK(hfsmp, TRUE); } if (retval) { - (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p); - VTOC(ap->a_vp)->c_flag |= C_MODIFIED; + hfs_systemfile_unlock(hfsmp, lockflags); + cp->c_flag |= C_MODIFIED; if (started_tr) { - tv = time; - VOP_UPDATE(ap->a_vp, &tv, &tv, 1); + (void) hfs_update(vp, TRUE); + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); - hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); - journal_end_transaction(hfsmp->jnl); - } - if (grabbed_lock) { - hfs_global_shared_lock_release(hfsmp); + hfs_end_transaction(hfsmp); } - return (retval); + goto exit; } } - retval = MacToVFSError( - MapFileBlockC (HFSTOVCB(hfsmp), - (FCB *)fp, - ap->a_size, - ap->a_foffset, - ap->a_bpn, - &bytesContAvail)); - - if (lockExtBtree) - (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p); + retval = MapFileBlockC(hfsmp, (FCB *)fp, ap->a_size, ap->a_foffset, + ap->a_bpn, &bytesContAvail); + if (syslocks) { + hfs_systemfile_unlock(hfsmp, lockflags); + syslocks = 0; + } - // XXXdbg if (started_tr) { - tv = time; - retval = VOP_UPDATE(ap->a_vp, &tv, &tv, 1); - - hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); - journal_end_transaction(hfsmp->jnl); + (void) hfs_update(vp, TRUE); + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + hfs_end_transaction(hfsmp); started_tr = 0; - } - if (grabbed_lock) { - hfs_global_shared_lock_release(hfsmp); - grabbed_lock = 0; - } - - if (retval == E_NONE) { - /* Adjust the mapping information for invalid file ranges: */ - overlaptype = rl_scan(&fp->ff_invalidranges, - ap->a_foffset, - ap->a_foffset + (off_t)bytesContAvail - 1, - &invalid_range); - if (overlaptype != RL_NOOVERLAP) { - switch(overlaptype) { - case RL_MATCHINGOVERLAP: - case RL_OVERLAPCONTAINSRANGE: - case RL_OVERLAPSTARTSBEFORE: - /* There's no valid block for this byte offset: */ - *ap->a_bpn = (daddr_t)-1; - - /* There's no point limiting the amount to be returned if the - invalid range that was hit extends all the way to the EOF - (i.e. 
there's no valid bytes between the end of this range - and the file's EOF): - */ - if ((fp->ff_size > (invalid_range->rl_end + 1)) && - (invalid_range->rl_end + 1 - ap->a_foffset < bytesContAvail)) { - bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset; - }; - break; - - case RL_OVERLAPISCONTAINED: - case RL_OVERLAPENDSAFTER: - /* The range of interest hits an invalid block before the end: */ - if (invalid_range->rl_start == ap->a_foffset) { - /* There's actually no valid information to be had starting here: */ - *ap->a_bpn = (daddr_t)-1; - if ((fp->ff_size > (invalid_range->rl_end + 1)) && - (invalid_range->rl_end + 1 - ap->a_foffset < bytesContAvail)) { - bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset; - }; - } else { - bytesContAvail = invalid_range->rl_start - ap->a_foffset; - }; - break; - }; - if (bytesContAvail > ap->a_size) bytesContAvail = ap->a_size; - }; - - if (ap->a_run) *ap->a_run = bytesContAvail; - }; - - if (ap->a_poff) - *(int *)ap->a_poff = 0; - - return (retval); -} - + } + if (retval) { + goto exit; + } -/* - * Read or write a buffer that is not contiguous on disk. We loop over - * each device block, copying to or from caller's buffer. - * - * We could be a bit more efficient by transferring as much data as is - * contiguous. But since this routine should rarely be called, and that - * would be more complicated; best to keep it simple. - */ -static int -hfs_strategy_fragmented(struct buf *bp) -{ - register struct vnode *vp = bp->b_vp; - register struct cnode *cp = VTOC(vp); - register struct vnode *devvp = cp->c_devvp; - caddr_t ioaddr; /* Address of fragment within bp */ - struct buf *frag = NULL; /* For reading or writing a single block */ - int retval = 0; - long remaining; /* Bytes (in bp) left to transfer */ - off_t offset; /* Logical offset of current fragment in vp */ - u_long block_size; /* Size of one device block (and one I/O) */ - - /* Make sure we redo this mapping for the next I/O */ - bp->b_blkno = bp->b_lblkno; - - /* Set up the logical position and number of bytes to read/write */ - offset = (off_t) bp->b_lblkno * (off_t) GetLogicalBlockSize(vp); - block_size = VTOHFS(vp)->hfs_phys_block_size; + /* Adjust the mapping information for invalid file ranges: */ + overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset, + ap->a_foffset + (off_t)bytesContAvail - 1, + &invalid_range); + if (overlaptype != RL_NOOVERLAP) { + switch(overlaptype) { + case RL_MATCHINGOVERLAP: + case RL_OVERLAPCONTAINSRANGE: + case RL_OVERLAPSTARTSBEFORE: + /* There's no valid block for this byte offset: */ + *ap->a_bpn = (daddr64_t)-1; + /* There's no point limiting the amount to be returned + * if the invalid range that was hit extends all the way + * to the EOF (i.e. 
there's no valid bytes between the + * end of this range and the file's EOF): + */ + if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) && + (invalid_range->rl_end + 1 - ap->a_foffset < bytesContAvail)) { + bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset; + } + break; - /* Get an empty buffer to do the deblocking */ - frag = geteblk(block_size); - if (ISSET(bp->b_flags, B_READ)) - SET(frag->b_flags, B_READ); - - for (ioaddr = bp->b_data, remaining = bp->b_bcount; remaining != 0; - ioaddr += block_size, offset += block_size, - remaining -= block_size) { - frag->b_resid = frag->b_bcount; - CLR(frag->b_flags, B_DONE); - - /* Map the current position to a physical block number */ - retval = VOP_CMAP(vp, offset, block_size, &frag->b_lblkno, - NULL, NULL); - if (retval != 0) + case RL_OVERLAPISCONTAINED: + case RL_OVERLAPENDSAFTER: + /* The range of interest hits an invalid block before the end: */ + if (invalid_range->rl_start == ap->a_foffset) { + /* There's actually no valid information to be had starting here: */ + *ap->a_bpn = (daddr64_t)-1; + if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) && + (invalid_range->rl_end + 1 - ap->a_foffset < bytesContAvail)) { + bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset; + } + } else { + bytesContAvail = invalid_range->rl_start - ap->a_foffset; + } break; - /* - * Did we try to read a hole? - * (Should never happen for metadata!) - */ - if ((long)frag->b_lblkno == -1) { - bzero(ioaddr, block_size); - continue; - } - - /* If writing, copy before I/O */ - if (!ISSET(bp->b_flags, B_READ)) - bcopy(ioaddr, frag->b_data, block_size); - - /* Call the device to do the I/O and wait for it */ - frag->b_blkno = frag->b_lblkno; - frag->b_vp = devvp; /* Used to dispatch via VOP_STRATEGY */ - frag->b_dev = devvp->v_rdev; - retval = VOP_STRATEGY(frag); - frag->b_vp = NULL; - if (retval != 0) - break; - retval = biowait(frag); - if (retval != 0) + case RL_NOOVERLAP: break; - - /* If reading, copy after the I/O */ - if (ISSET(bp->b_flags, B_READ)) - bcopy(frag->b_data, ioaddr, block_size); + } /* end switch */ + if (bytesContAvail > ap->a_size) + bytesContAvail = ap->a_size; } - - frag->b_vp = NULL; - // - // XXXdbg - in the case that this is a meta-data block, it won't affect - // the journal because this bp is for a physical disk block, - // not a logical block that is part of the catalog or extents - // files. - SET(frag->b_flags, B_INVAL); - brelse(frag); - - if ((bp->b_error = retval) != 0) - SET(bp->b_flags, B_ERROR); - - biodone(bp); /* This I/O is now complete */ - return retval; + if (ap->a_run) + *ap->a_run = bytesContAvail; + + if (ap->a_poff) + *(int *)ap->a_poff = 0; +exit: + if (tooklock) + hfs_unlock(cp); + + return (MacToVFSError(retval)); } /* - * Calculate the logical to physical mapping if not done already, - * then call the device strategy routine. 
-#
-#vop_strategy {
-# IN struct buf *bp;
- */
+ * prepare and issue the I/O
+ * buf_strategy knows how to deal
+ * with requests that require
+ * fragmented I/Os
+ */
 int
-hfs_strategy(ap)
- struct vop_strategy_args /* {
- struct buf *a_bp;
- } */ *ap;
+hfs_vnop_strategy(struct vnop_strategy_args *ap)
 {
- register struct buf *bp = ap->a_bp;
- register struct vnode *vp = bp->b_vp;
- register struct cnode *cp = VTOC(vp);
- int retval = 0;
- off_t offset;
- size_t bytes_contig;
-
- if ( !(bp->b_flags & B_VECTORLIST)) {
- if (vp->v_type == VBLK || vp->v_type == VCHR)
- panic("hfs_strategy: device vnode passed!");
-
- if (bp->b_flags & B_PAGELIST) {
- /*
- * If we have a page list associated with this bp,
- * then go through cluster_bp since it knows how to
- * deal with a page request that might span non-
- * contiguous physical blocks on the disk...
- */
- retval = cluster_bp(bp);
- vp = cp->c_devvp;
- bp->b_dev = vp->v_rdev;
-
- return (retval);
- }
-
- /*
- * If we don't already know the filesystem relative block
- * number then get it using VOP_BMAP(). If VOP_BMAP()
- * returns the block number as -1 then we've got a hole in
- * the file. Although HFS filesystems don't create files with
- * holes, invalidating of subranges of the file (lazy zero
- * filling) may create such a situation.
- */
- if (bp->b_blkno == bp->b_lblkno) {
- offset = (off_t) bp->b_lblkno *
- (off_t) GetLogicalBlockSize(vp);
-
- if ((retval = VOP_CMAP(vp, offset, bp->b_bcount,
- &bp->b_blkno, &bytes_contig, NULL))) {
- bp->b_error = retval;
- bp->b_flags |= B_ERROR;
- biodone(bp);
- return (retval);
- }
- if (bytes_contig < bp->b_bcount)
- {
- /*
- * We were asked to read a block that wasn't
- * contiguous, so we have to read each of the
- * pieces and copy them into the buffer.
- * Since ordinary file I/O goes through
- * cluster_io (which won't ask us for
- * discontiguous data), this is probably an
- * attempt to read or write metadata.
- */
- return hfs_strategy_fragmented(bp);
- }
- if ((long)bp->b_blkno == -1)
- clrbuf(bp);
- }
- if ((long)bp->b_blkno == -1) {
- biodone(bp);
- return (0);
- }
- if (bp->b_validend == 0) {
- /*
- * Record the exact size of the I/O transfer about to
- * be made:
- */
- bp->b_validend = bp->b_bcount;
- }
- }
- vp = cp->c_devvp;
- bp->b_dev = vp->v_rdev;
+ buf_t bp = ap->a_bp;
+ vnode_t vp = buf_vnode(bp);
+ struct cnode *cp = VTOC(vp);
- return VOCALL (vp->v_op, VOFFSET(vop_strategy), ap);
+ return (buf_strategy(cp->c_devvp, ap));
 }
-static int do_hfs_truncate(ap)
- struct vop_truncate_args /* {
- struct vnode *a_vp;
- off_t a_length;
- int a_flags;
- struct ucred *a_cred;
- struct proc *a_p;
- } */ *ap;
+static int
+do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_context_t context)
 {
- register struct vnode *vp = ap->a_vp;
 register struct cnode *cp = VTOC(vp);
 struct filefork *fp = VTOF(vp);
- off_t length;
- long vflags;
- struct timeval tv;
+ struct proc *p = vfs_context_proc(context);
+ kauth_cred_t cred = vfs_context_ucred(context);
 int retval;
 off_t bytesToAdd;
 off_t actualBytesAdded;
@@ -1372,11 +1806,8 @@ static int do_hfs_truncate(ap)
 u_long fileblocks;
 int blksize;
 struct hfsmount *hfsmp;
+ int lockflags;
-
- if (vp->v_type != VREG && vp->v_type != VLNK)
- return (EISDIR); /* cannot truncate an HFS directory! 
*/ - - length = ap->a_length; blksize = VTOVCB(vp)->blockSize; fileblocks = fp->ff_blocks; filebytes = (off_t)fileblocks * (off_t)blksize; @@ -1392,7 +1823,6 @@ static int do_hfs_truncate(ap) hfsmp = VTOHFS(vp); - tv = time; retval = E_NONE; /* Files that are changing size are not hot file candidates. */ @@ -1405,7 +1835,7 @@ static int do_hfs_truncate(ap) * since there may be extra physical blocks that also need truncation. */ #if QUOTA - if (retval = hfs_getinoquota(cp)) + if ((retval = hfs_getinoquota(cp))) return(retval); #endif /* QUOTA */ @@ -1414,10 +1844,10 @@ static int do_hfs_truncate(ap) * last byte of the file is allocated. Since the smallest * value of ff_size is 0, length will be at least 1. */ - if (length > fp->ff_size) { + if (length > (off_t)fp->ff_size) { #if QUOTA retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)), - ap->a_cred, 0); + cred, 0); if (retval) goto Err_Exit; #endif /* QUOTA */ @@ -1432,7 +1862,7 @@ static int do_hfs_truncate(ap) /* All or nothing and don't round up to clumpsize. */ eflags = kEFAllMask | kEFNoClumpMask; - if (ap->a_cred && suser(ap->a_cred, NULL) != 0) + if (cred && suser(cred, NULL) != 0) eflags |= kEFReserveMask; /* keep a reserve */ /* @@ -1444,25 +1874,16 @@ static int do_hfs_truncate(ap) eflags |= kEFMetadataMask; blockHint = hfsmp->hfs_metazone_start; } - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - retval = EINVAL; - goto Err_Exit; - } + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto Err_Exit; } - /* lock extents b-tree (also protects volume bitmap) */ - retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p); - if (retval) { - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); - - goto Err_Exit; - } + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); while ((length > filebytes) && (retval == E_NONE)) { bytesToAdd = length - filebytes; @@ -1481,17 +1902,14 @@ static int do_hfs_truncate(ap) } } /* endwhile */ - (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p); + hfs_systemfile_unlock(hfsmp, lockflags); - // XXXdbg if (hfsmp->jnl) { - tv = time; - VOP_UPDATE(vp, &tv, &tv, 1); + (void) hfs_update(vp, TRUE); + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + } - hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); + hfs_end_transaction(hfsmp); if (retval) goto Err_Exit; @@ -1500,16 +1918,17 @@ static int do_hfs_truncate(ap) (int)length, (int)fp->ff_size, (int)filebytes, 0, 0); } - if (!(ap->a_flags & IO_NOZEROFILL)) { + if (!(flags & IO_NOZEROFILL)) { if (UBCINFOEXISTS(vp) && retval == E_NONE) { struct rl_entry *invalid_range; - int devBlockSize; off_t zero_limit; zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64; if (length < zero_limit) zero_limit = length; - if (length > fp->ff_size) { + if (length > (off_t)fp->ff_size) { + struct timeval tv; + /* Extending the file: time to fill out the current last page w. zeroes? */ if ((fp->ff_size & PAGE_MASK_64) && (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64, @@ -1519,50 +1938,66 @@ static int do_hfs_truncate(ap) of the file, so zero out the remainder of that page to ensure the entire page contains valid data. 
Since there is no invalid range possible past the (current) eof, there's no need to remove anything - from the invalid range list before calling cluster_write(): */ - VOP_DEVBLOCKSIZE(cp->c_devvp, &devBlockSize); + from the invalid range list before calling cluster_write(): */ + hfs_unlock(cp); retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit, - fp->ff_size, (off_t)0, devBlockSize, - (ap->a_flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY); + fp->ff_size, (off_t)0, + (flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY); + hfs_lock(cp, HFS_FORCE_LOCK); if (retval) goto Err_Exit; /* Merely invalidate the remaining area, if necessary: */ if (length > zero_limit) { + microuptime(&tv); rl_add(zero_limit, length - 1, &fp->ff_invalidranges); - cp->c_zftimeout = time.tv_sec + ZFTIMELIMIT; + cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT; } } else { /* The page containing the (current) eof is invalid: just add the remainder of the page to the invalid list, along with the area being newly allocated: */ + microuptime(&tv); rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges); - cp->c_zftimeout = time.tv_sec + ZFTIMELIMIT; + cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT; }; } } else { panic("hfs_truncate: invoked on non-UBC object?!"); }; } - cp->c_flag |= C_UPDATE; + cp->c_touch_modtime = TRUE; fp->ff_size = length; - if (UBCISVALID(vp)) - ubc_setsize(vp, fp->ff_size); /* XXX check errors */ + /* Nested transactions will do their own ubc_setsize. */ + if (!skipsetsize) { + /* + * ubc_setsize can cause a pagein here + * so we need to drop cnode lock. + */ + hfs_unlock(cp); + ubc_setsize(vp, length); + hfs_lock(cp, HFS_FORCE_LOCK); + } } else { /* Shorten the size of the file */ - if (fp->ff_size > length) { + if ((off_t)fp->ff_size > length) { /* * Any buffers that are past the truncation point need to be - * invalidated (to maintain buffer cache consistency). For - * simplicity, we invalidate all the buffers by calling vinvalbuf. + * invalidated (to maintain buffer cache consistency). */ - if (UBCISVALID(vp)) - ubc_setsize(vp, length); /* XXX check errors */ - vflags = ((length > 0) ? V_SAVE : 0) | V_SAVEMETA; - retval = vinvalbuf(vp, vflags, ap->a_cred, ap->a_p, 0, 0); + /* Nested transactions will do their own ubc_setsize. */ + if (!skipsetsize) { + /* + * ubc_setsize can cause a pageout here + * so we need to drop cnode lock. 
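+ * (hfs_vnop_pagein may itself take this cnode lock to update
+ * the file's read statistics)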
+ */ + hfs_unlock(cp); + ubc_setsize(vp, length); + hfs_lock(cp, HFS_FORCE_LOCK); + } /* Any space previously marked as invalid is now irrelevant: */ rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges); @@ -1574,28 +2009,28 @@ static int do_hfs_truncate(ap) */ if (fp->ff_unallocblocks > 0) { u_int32_t finalblks; + u_int32_t loanedBlocks; - /* lock extents b-tree */ - retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, - LK_EXCLUSIVE, ap->a_p); - if (retval) - goto Err_Exit; + HFS_MOUNT_LOCK(hfsmp, TRUE); + + loanedBlocks = fp->ff_unallocblocks; + cp->c_blocks -= loanedBlocks; + fp->ff_blocks -= loanedBlocks; + fp->ff_unallocblocks = 0; - VTOVCB(vp)->loanedBlocks -= fp->ff_unallocblocks; - cp->c_blocks -= fp->ff_unallocblocks; - fp->ff_blocks -= fp->ff_unallocblocks; - fp->ff_unallocblocks = 0; + hfsmp->loanedBlocks -= loanedBlocks; finalblks = (length + blksize - 1) / blksize; if (finalblks > fp->ff_blocks) { /* calculate required unmapped blocks */ - fp->ff_unallocblocks = finalblks - fp->ff_blocks; - VTOVCB(vp)->loanedBlocks += fp->ff_unallocblocks; - cp->c_blocks += fp->ff_unallocblocks; - fp->ff_blocks += fp->ff_unallocblocks; + loanedBlocks = finalblks - fp->ff_blocks; + hfsmp->loanedBlocks += loanedBlocks; + + fp->ff_unallocblocks = loanedBlocks; + cp->c_blocks += loanedBlocks; + fp->ff_blocks += loanedBlocks; } - (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, - LK_RELEASE, ap->a_p); + HFS_MOUNT_UNLOCK(hfsmp, TRUE); } /* @@ -1604,44 +2039,33 @@ static int do_hfs_truncate(ap) * truncate with the IO_NDELAY flag set. So when IO_NDELAY * isn't set, we make sure this isn't a TBE process. */ - if ((ap->a_flags & IO_NDELAY) || (!ISSET(ap->a_p->p_flag, P_TBE))) { + if ((flags & IO_NDELAY) || (proc_tbe(p) == 0)) { #if QUOTA off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize); #endif /* QUOTA */ - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - retval = EINVAL; - goto Err_Exit; - } - } + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto Err_Exit; + } + + if (fp->ff_unallocblocks == 0) { + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - /* lock extents b-tree (also protects volume bitmap) */ - retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p); - if (retval) { - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); - goto Err_Exit; - } - - if (fp->ff_unallocblocks == 0) retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, false)); - (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p); - - // XXXdbg + hfs_systemfile_unlock(hfsmp, lockflags); + } if (hfsmp->jnl) { - tv = time; - VOP_UPDATE(vp, &tv, &tv, 1); - - hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); - journal_end_transaction(hfsmp->jnl); + (void) hfs_update(vp, TRUE); + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); } - hfs_global_shared_lock_release(hfsmp); + + hfs_end_transaction(hfsmp); filebytes = (off_t)fp->ff_blocks * (off_t)blksize; if (retval) @@ -1652,12 +2076,12 @@ static int do_hfs_truncate(ap) #endif /* QUOTA */ } /* Only set update flag if the logical length changes */ - if (fp->ff_size != length) - cp->c_flag |= C_UPDATE; + if ((off_t)fp->ff_size != length) + cp->c_touch_modtime = TRUE; fp->ff_size = length; } - cp->c_flag |= 
C_CHANGE; - retval = VOP_UPDATE(vp, &tv, &tv, MNT_WAIT); + cp->c_touch_chgtime = TRUE; + retval = hfs_update(vp, MNT_WAIT); if (retval) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE, -1, -1, -1, retval, 0); @@ -1672,42 +2096,24 @@ Err_Exit: } + /* -# -#% truncate vp L L L -# -vop_truncate { - IN struct vnode *vp; - IN off_t length; - IN int flags; (IO_SYNC) - IN struct ucred *cred; - IN struct proc *p; -}; * Truncate a cnode to at most length size, freeing (or adding) the * disk blocks. */ -int hfs_truncate(ap) - struct vop_truncate_args /* { - struct vnode *a_vp; - off_t a_length; - int a_flags; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; +__private_extern__ +int +hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, + vfs_context_t context) { - register struct vnode *vp = ap->a_vp; - register struct cnode *cp = VTOC(vp); struct filefork *fp = VTOF(vp); - off_t length; off_t filebytes; u_long fileblocks; - int blksize, error; - u_int64_t nsize; + int blksize, error = 0; - if (vp->v_type != VREG && vp->v_type != VLNK) + if (vnode_isdir(vp)) return (EISDIR); /* cannot truncate an HFS directory! */ - length = ap->a_length; blksize = VTOVCB(vp)->blockSize; fileblocks = fp->ff_blocks; filebytes = (off_t)fileblocks * (off_t)blksize; @@ -1715,96 +2121,94 @@ int hfs_truncate(ap) // have to loop truncating or growing files that are // really big because otherwise transactions can get // enormous and consume too many kernel resources. - if (length < filebytes && (filebytes - length) > HFS_BIGFILE_SIZE) { - while (filebytes > length) { - if ((filebytes - length) > HFS_BIGFILE_SIZE) { - filebytes -= HFS_BIGFILE_SIZE; - } else { - filebytes = length; - } - - ap->a_length = filebytes; - error = do_hfs_truncate(ap); - if (error) - break; - } - } else if (length > filebytes && (length - filebytes) > HFS_BIGFILE_SIZE) { - while (filebytes < length) { - if ((length - filebytes) > HFS_BIGFILE_SIZE) { - filebytes += HFS_BIGFILE_SIZE; - } else { - filebytes = (length - filebytes); + + if (length < filebytes) { + while (filebytes > length) { + if ((filebytes - length) > HFS_BIGFILE_SIZE) { + filebytes -= HFS_BIGFILE_SIZE; + } else { + filebytes = length; + } + error = do_hfs_truncate(vp, filebytes, flags, skipsetsize, context); + if (error) + break; + } + } else if (length > filebytes) { + while (filebytes < length) { + if ((length - filebytes) > HFS_BIGFILE_SIZE) { + filebytes += HFS_BIGFILE_SIZE; + } else { + filebytes = length; + } + error = do_hfs_truncate(vp, filebytes, flags, skipsetsize, context); + if (error) + break; } + } else /* Same logical size */ { - ap->a_length = filebytes; - error = do_hfs_truncate(ap); - if (error) - break; - } - } else { - error = do_hfs_truncate(ap); + error = do_hfs_truncate(vp, length, flags, skipsetsize, context); + } + /* Files that are changing size are not hot file candidates. */ + if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { + fp->ff_bytesread = 0; } - return error; + return (error); } /* -# -#% allocate vp L L L -# -vop_allocate { - IN struct vnode *vp; - IN off_t length; - IN int flags; - OUT off_t *bytesallocated; - IN off_t offset; - IN struct ucred *cred; - IN struct proc *p; -}; - * allocate a cnode to at most length size + * Preallocate file storage space. 
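+ * (ALLOCATECONTIG and ALLOCATEALL map to kEFContigMask and
+ * kEFAllMask in the flags passed to ExtendFileC below.)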
*/ -int hfs_allocate(ap) - struct vop_allocate_args /* { - struct vnode *a_vp; +int +hfs_vnop_allocate(struct vnop_allocate_args /* { + vnode_t a_vp; off_t a_length; u_int32_t a_flags; off_t *a_bytesallocated; off_t a_offset; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; + vfs_context_t a_context; + } */ *ap) { struct vnode *vp = ap->a_vp; - struct cnode *cp = VTOC(vp); - struct filefork *fp = VTOF(vp); - ExtendedVCB *vcb = VTOVCB(vp); + struct cnode *cp; + struct filefork *fp; + ExtendedVCB *vcb; off_t length = ap->a_length; off_t startingPEOF; off_t moreBytesRequested; off_t actualBytesAdded; off_t filebytes; u_long fileblocks; - long vflags; - struct timeval tv; int retval, retval2; UInt32 blockHint; UInt32 extendFlags; /* For call to ExtendFileC */ struct hfsmount *hfsmp; + kauth_cred_t cred = vfs_context_ucred(ap->a_context); + int lockflags; + + *(ap->a_bytesallocated) = 0; + + if (!vnode_isreg(vp)) + return (EISDIR); + if (length < (off_t)0) + return (EINVAL); + if ((retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) + return (retval); + cp = VTOC(vp); + fp = VTOF(vp); hfsmp = VTOHFS(vp); + vcb = VTOVCB(vp); - *(ap->a_bytesallocated) = 0; fileblocks = fp->ff_blocks; filebytes = (off_t)fileblocks * (off_t)vcb->blockSize; - if (length < (off_t)0) - return (EINVAL); - if (vp->v_type != VREG) - return (EISDIR); - if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) - return (EINVAL); + if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) { + retval = EINVAL; + goto Err_Exit; + } /* Fill in the flags word for the call to Extend the file */ @@ -1813,10 +2217,9 @@ int hfs_allocate(ap) extendFlags |= kEFContigMask; if (ap->a_flags & ALLOCATEALL) extendFlags |= kEFAllMask; - if (ap->a_cred && suser(ap->a_cred, NULL) != 0) + if (cred && suser(cred, NULL) != 0) extendFlags |= kEFReserveMask; - tv = time; retval = E_NONE; blockHint = 0; startingPEOF = filebytes; @@ -1841,9 +2244,9 @@ int hfs_allocate(ap) #if QUOTA retval = hfs_chkdq(cp, (int64_t)(roundup(moreBytesRequested, vcb->blockSize)), - ap->a_cred, 0); + cred, 0); if (retval) - return (retval); + goto Err_Exit; #endif /* QUOTA */ /* @@ -1865,24 +2268,16 @@ int hfs_allocate(ap) } } - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - retval = EINVAL; - goto Err_Exit; - } + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto Err_Exit; } - /* lock extents b-tree (also protects volume bitmap) */ - retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p); - if (retval) { - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); - goto Err_Exit; - } + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); retval = MacToVFSError(ExtendFileC(vcb, (FCB*)fp, @@ -1894,21 +2289,18 @@ int hfs_allocate(ap) *(ap->a_bytesallocated) = actualBytesAdded; filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; - (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p); + hfs_systemfile_unlock(hfsmp, lockflags); - // XXXdbg if (hfsmp->jnl) { - tv = time; - VOP_UPDATE(vp, &tv, &tv, 1); - - hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); - journal_end_transaction(hfsmp->jnl); + (void) hfs_update(vp, TRUE); + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); } - hfs_global_shared_lock_release(hfsmp); + + 
hfs_end_transaction(hfsmp); /* * if we get an error and no changes were made then exit - * otherwise we must do the VOP_UPDATE to reflect the changes + * otherwise we must do the hfs_update to reflect the changes */ if (retval && (startingPEOF == filebytes)) goto Err_Exit; @@ -1929,55 +2321,38 @@ int hfs_allocate(ap) if (fp->ff_size > length) { /* * Any buffers that are past the truncation point need to be - * invalidated (to maintain buffer cache consistency). For - * simplicity, we invalidate all the buffers by calling vinvalbuf. + * invalidated (to maintain buffer cache consistency). */ - vflags = ((length > 0) ? V_SAVE : 0) | V_SAVEMETA; - (void) vinvalbuf(vp, vflags, ap->a_cred, ap->a_p, 0, 0); } - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - retval = EINVAL; - goto Err_Exit; - } + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto Err_Exit; } - /* lock extents b-tree (also protects volume bitmap) */ - retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p); - if (retval) { - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - goto Err_Exit; - } + retval = MacToVFSError(TruncateFileC(vcb, (FCB*)fp, length, false)); + + hfs_systemfile_unlock(hfsmp, lockflags); - retval = MacToVFSError( - TruncateFileC( - vcb, - (FCB*)fp, - length, - false)); - (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p); filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; if (hfsmp->jnl) { - tv = time; - VOP_UPDATE(vp, &tv, &tv, 1); - - hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); - journal_end_transaction(hfsmp->jnl); + (void) hfs_update(vp, TRUE); + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); } - hfs_global_shared_lock_release(hfsmp); + + hfs_end_transaction(hfsmp); /* * if we get an error and no changes were made then exit - * otherwise we must do the VOP_UPDATE to reflect the changes + * otherwise we must do the hfs_update to reflect the changes */ if (retval && (startingPEOF == filebytes)) goto Err_Exit; #if QUOTA @@ -1988,158 +2363,179 @@ int hfs_allocate(ap) if (fp->ff_size > filebytes) { fp->ff_size = filebytes; - if (UBCISVALID(vp)) - ubc_setsize(vp, fp->ff_size); /* XXX check errors */ + hfs_unlock(cp); + ubc_setsize(vp, fp->ff_size); + hfs_lock(cp, HFS_FORCE_LOCK); } } Std_Exit: - cp->c_flag |= C_CHANGE | C_UPDATE; - retval2 = VOP_UPDATE(vp, &tv, &tv, MNT_WAIT); + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + retval2 = hfs_update(vp, MNT_WAIT); if (retval == 0) retval = retval2; Err_Exit: + hfs_unlock(cp); return (retval); } /* - * pagein for HFS filesystem + * Pagein for HFS filesystem */ int -hfs_pagein(ap) - struct vop_pagein_args /* { - struct vnode *a_vp, +hfs_vnop_pagein(struct vnop_pagein_args *ap) +/* + struct vnop_pagein_args { + vnode_t a_vp, upl_t a_pl, vm_offset_t a_pl_offset, off_t a_f_offset, size_t a_size, - struct ucred *a_cred, int a_flags - } */ *ap; + vfs_context_t a_context; + }; +*/ { - register struct vnode *vp = ap->a_vp; - int devBlockSize = 0; + vnode_t vp = ap->a_vp; int error; - if (vp->v_type != VREG) - panic("hfs_pagein: vp not UBC type\n"); - - VOP_DEVBLOCKSIZE(VTOC(vp)->c_devvp, &devBlockSize); - error = cluster_pagein(vp, ap->a_pl, 
ap->a_pl_offset, ap->a_f_offset, - ap->a_size, (off_t)VTOF(vp)->ff_size, devBlockSize, - ap->a_flags); + ap->a_size, (off_t)VTOF(vp)->ff_size, ap->a_flags); /* - * Keep track blocks read + * Keep track of blocks read. */ if (VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) { struct cnode *cp; + struct filefork *fp; + int bytesread; + int took_cnode_lock = 0; - cp = VTOC(vp); + cp = VTOC(vp); + fp = VTOF(vp); + + if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE) + bytesread = fp->ff_size; + else + bytesread = ap->a_size; + + /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */ + if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) { + hfs_lock(cp, HFS_FORCE_LOCK); + took_cnode_lock = 1; + } /* * If this file hasn't been seen since the start of * the current sampling period then start over. */ - if (cp->c_atime < VTOHFS(vp)->hfc_timebase) - VTOF(vp)->ff_bytesread = ap->a_size; - else - VTOF(vp)->ff_bytesread += ap->a_size; + if (cp->c_atime < VTOHFS(vp)->hfc_timebase) { + struct timeval tv; - cp->c_flag |= C_ACCESS; + fp->ff_bytesread = bytesread; + microtime(&tv); + cp->c_atime = tv.tv_sec; + } else { + fp->ff_bytesread += bytesread; + } + cp->c_touch_acctime = TRUE; + if (took_cnode_lock) + hfs_unlock(cp); } - return (error); } /* - * pageout for HFS filesystem. + * Pageout for HFS filesystem. */ int -hfs_pageout(ap) - struct vop_pageout_args /* { - struct vnode *a_vp, +hfs_vnop_pageout(struct vnop_pageout_args *ap) +/* + struct vnop_pageout_args { + vnode_t a_vp, upl_t a_pl, vm_offset_t a_pl_offset, off_t a_f_offset, size_t a_size, - struct ucred *a_cred, int a_flags - } */ *ap; + vfs_context_t a_context; + }; +*/ { - struct vnode *vp = ap->a_vp; - struct cnode *cp = VTOC(vp); - struct filefork *fp = VTOF(vp); + vnode_t vp = ap->a_vp; + struct cnode *cp; + struct filefork *fp; int retval; - int devBlockSize = 0; off_t end_of_range; off_t filesize; - if (UBCINVALID(vp)) - panic("hfs_pageout: Not a VREG: vp=%x", vp); + cp = VTOC(vp); + if (cp->c_lockowner == current_thread()) { + panic("pageout: %s cnode lock already held!\n", + cp->c_desc.cd_nameptr ? cp->c_desc.cd_nameptr : ""); + } + if ( (retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) { + return (retval); + } + fp = VTOF(vp); - VOP_DEVBLOCKSIZE(cp->c_devvp, &devBlockSize); filesize = fp->ff_size; end_of_range = ap->a_f_offset + ap->a_size - 1; - if (cp->c_flag & C_RELOCATING) { - if (end_of_range < (filesize / 2)) { - return (EBUSY); - } - } - - if (end_of_range >= filesize) + if (end_of_range >= filesize) { end_of_range = (off_t)(filesize - 1); + } if (ap->a_f_offset < filesize) { rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges); cp->c_flag |= C_MODIFIED; /* leof is dirty */ } + hfs_unlock(cp); - retval = cluster_pageout(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, ap->a_size, - filesize, devBlockSize, ap->a_flags); + retval = cluster_pageout(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, + ap->a_size, filesize, ap->a_flags); /* - * If we successfully wrote any data, and we are not the superuser - * we clear the setuid and setgid bits as a precaution against - * tampering. + * If data was written, and setuid or setgid bits are set and + * this process is not the superuser then clear the setuid and + * setgid bits as a precaution against tampering. 
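+ * The cnode lock is retaken briefly below to update c_mode
+ * and mark the change time.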
*/ - if (retval == 0 && ap->a_cred && ap->a_cred->cr_uid != 0) + if ((retval == 0) && + (cp->c_mode & (S_ISUID | S_ISGID)) && + (vfs_context_suser(ap->a_context) != 0)) { + hfs_lock(cp, HFS_FORCE_LOCK); cp->c_mode &= ~(S_ISUID | S_ISGID); - + cp->c_touch_chgtime = TRUE; + hfs_unlock(cp); + } return (retval); } /* * Intercept B-Tree node writes to unswap them if necessary. -# -#vop_bwrite { -# IN struct buf *bp; */ int -hfs_bwrite(ap) - struct vop_bwrite_args /* { - struct buf *a_bp; - } */ *ap; +hfs_vnop_bwrite(struct vnop_bwrite_args *ap) { int retval = 0; register struct buf *bp = ap->a_bp; - register struct vnode *vp = bp->b_vp; + register struct vnode *vp = buf_vnode(bp); #if BYTE_ORDER == LITTLE_ENDIAN BlockDescriptor block; /* Trap B-Tree writes */ if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) || - (VTOC(vp)->c_fileid == kHFSCatalogFileID)) { + (VTOC(vp)->c_fileid == kHFSCatalogFileID) || + (VTOC(vp)->c_fileid == kHFSAttributesFileID)) { /* Swap if the B-Tree node is in native byte order */ - if (((UInt16 *)((char *)bp->b_data + bp->b_bcount - 2))[0] == 0x000e) { + if (((UInt16 *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) { /* Prepare the block pointer */ block.blockHeader = bp; - block.buffer = bp->b_data; + block.buffer = (char *)buf_dataptr(bp); /* not found in cache ==> came from disk */ - block.blockReadFromDisk = (bp->b_flags & B_CACHE) == 0; - block.blockSize = bp->b_bcount; + block.blockReadFromDisk = (buf_fromcache(bp) == 0); + block.blockSize = buf_count(bp); /* Endian un-swap B-Tree node */ SWAP_BT_NODE (&block, ISHFSPLUS (VTOVCB(vp)), VTOC(vp)->c_fileid, 1); @@ -2149,13 +2545,12 @@ hfs_bwrite(ap) } #endif /* This buffer shouldn't be locked anymore but if it is clear it */ - if (ISSET(bp->b_flags, B_LOCKED)) { - // XXXdbg - if (VTOHFS(vp)->jnl) { - panic("hfs: CLEARING the lock bit on bp 0x%x\n", bp); - } - CLR(bp->b_flags, B_LOCKED); - printf("hfs_bwrite: called with lock bit set\n"); + if ((buf_flags(bp) & B_LOCKED)) { + // XXXdbg + if (VTOHFS(vp)->jnl) { + panic("hfs: CLEARING the lock bit on bp 0x%x\n", bp); + } + buf_clearflags(bp, B_LOCKED); } retval = vn_bwrite (ap); @@ -2198,30 +2593,29 @@ hfs_bwrite(ap) */ __private_extern__ int -hfs_relocate(vp, blockHint, cred, p) - struct vnode *vp; - u_int32_t blockHint; - struct ucred *cred; - struct proc *p; +hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, + struct proc *p) { + struct cnode *cp; struct filefork *fp; struct hfsmount *hfsmp; - ExtendedVCB *vcb; - u_int32_t headblks; u_int32_t datablks; u_int32_t blksize; - u_int32_t realsize; u_int32_t growsize; u_int32_t nextallocsave; - u_int32_t sector_a; - u_int32_t sector_b; + daddr64_t sector_a, sector_b; + int disabled_caching = 0; int eflags; - u_int32_t oldstart; /* debug only */ off_t newbytes; - int retval, need_vinval=0; - - if (vp->v_type != VREG && vp->v_type != VLNK) { + int retval; + int lockflags = 0; + int took_trunc_lock = 0; + int started_tr = 0; + enum vtype vnodetype; + + vnodetype = vnode_vtype(vp); + if (vnodetype != VREG && vnodetype != VLNK) { return (EPERM); } @@ -2230,41 +2624,63 @@ hfs_relocate(vp, blockHint, cred, p) return (ENOSPC); } + cp = VTOC(vp); fp = VTOF(vp); if (fp->ff_unallocblocks) return (EINVAL); - vcb = VTOVCB(vp); - blksize = vcb->blockSize; + blksize = hfsmp->blockSize; if (blockHint == 0) - blockHint = vcb->nextAllocation; + blockHint = hfsmp->nextAllocation; if ((fp->ff_size > (u_int64_t)0x7fffffff) || - (vp->v_type == VLNK && fp->ff_size > blksize)) { + ((fp->ff_size > blksize) && 
vnodetype == VLNK)) { return (EFBIG); } + // + // We do not believe that this call to hfs_fsync() is + // necessary and it causes a journal transaction + // deadlock so we are removing it. + // + //if (vnodetype == VREG && !vnode_issystem(vp)) { + // retval = hfs_fsync(vp, MNT_WAIT, 0, p); + // if (retval) + // return (retval); + //} + + if (!vnode_issystem(vp) && (vnodetype != VLNK)) { + hfs_unlock(cp); + hfs_lock_truncate(cp, TRUE); + if ((retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) { + hfs_unlock_truncate(cp); + return (retval); + } + took_trunc_lock = 1; + } headblks = fp->ff_blocks; datablks = howmany(fp->ff_size, blksize); growsize = datablks * blksize; - realsize = fp->ff_size; eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask; if (blockHint >= hfsmp->hfs_metazone_start && blockHint <= hfsmp->hfs_metazone_end) eflags |= kEFMetadataMask; - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - return (EINVAL); - } + if (hfs_start_transaction(hfsmp) != 0) { + if (took_trunc_lock) + hfs_unlock_truncate(cp); + return (EINVAL); } + started_tr = 1; + /* + * Protect the extents b-tree and the allocation bitmap + * during MapFileBlockC and ExtendFileC operations. + */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - /* Lock extents b-tree (also protects volume bitmap) */ - retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE, p); - if (retval) - goto out2; - - retval = MapFileBlockC(vcb, (FCB *)fp, 1, growsize - 1, §or_a, NULL); + retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, §or_a, NULL); if (retval) { retval = MacToVFSError(retval); goto out; @@ -2273,14 +2689,23 @@ hfs_relocate(vp, blockHint, cred, p) /* * STEP 1 - aquire new allocation blocks. */ - nextallocsave = vcb->nextAllocation; - retval = ExtendFileC(vcb, (FCB*)fp, growsize, blockHint, eflags, &newbytes); - if (eflags & kEFMetadataMask) - vcb->nextAllocation = nextallocsave; + if (!vnode_isnocache(vp)) { + vnode_setnocache(vp); + disabled_caching = 1; + + } + nextallocsave = hfsmp->nextAllocation; + retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes); + if (eflags & kEFMetadataMask) { + HFS_MOUNT_LOCK(hfsmp, TRUE); + hfsmp->nextAllocation = nextallocsave; + hfsmp->vcbFlags |= 0xFF00; + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + } retval = MacToVFSError(retval); if (retval == 0) { - VTOC(vp)->c_flag |= C_MODIFIED; + cp->c_flag |= C_MODIFIED; if (newbytes < growsize) { retval = ENOSPC; goto restore; @@ -2290,7 +2715,7 @@ hfs_relocate(vp, blockHint, cred, p) goto restore; } - retval = MapFileBlockC(vcb, (FCB *)fp, 1, growsize, §or_b, NULL); + retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, §or_b, NULL); if (retval) { retval = MacToVFSError(retval); } else if ((sector_a + 1) == sector_b) { @@ -2304,101 +2729,106 @@ hfs_relocate(vp, blockHint, cred, p) goto restore; } } + /* Done with system locks and journal for now. */ + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = 0; + hfs_end_transaction(hfsmp); + started_tr = 0; + if (retval) { /* * Check to see if failure is due to excessive fragmentation. 
*/ - if (retval == ENOSPC && - hfs_freeblks(hfsmp, 0) > (datablks * 2)) { + if ((retval == ENOSPC) && + (hfs_freeblks(hfsmp, 0) > (datablks * 2))) { hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE; } goto out; } - - fp->ff_size = fp->ff_blocks * blksize; - if (UBCISVALID(vp)) - (void) ubc_setsize(vp, fp->ff_size); - /* - * STEP 2 - clone data into the new allocation blocks. + * STEP 2 - clone file data into the new allocation blocks. */ - // XXXdbg - unlock the extents overflow file because hfs_clonefile() - // calls vinvalbuf() which calls hfs_fsync() which can - // call hfs_metasync() which may need to lock the catalog - // file -- but the catalog file may be locked and blocked - // waiting for the extents overflow file if we're unlucky. - // see radar 3742973 for more details. - (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, p); - - if (vp->v_type == VLNK) + if (vnodetype == VLNK) retval = hfs_clonelink(vp, blksize, cred, p); - else if (vp->v_flag & VSYSTEM) + else if (vnode_issystem(vp)) retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p); else - retval = hfs_clonefile(vp, headblks, datablks, blksize, cred, p); - - // XXXdbg - relock the extents overflow file - (void)hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE, p); + retval = hfs_clonefile(vp, headblks, datablks, blksize); + /* Start transaction for step 3 or for a restore. */ + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto out; + } + started_tr = 1; if (retval) goto restore; - - oldstart = fp->ff_extents[0].startBlock; /* - * STEP 3 - switch to clone and remove old blocks. + * STEP 3 - switch to cloned data and remove old blocks. */ - SET(VTOC(vp)->c_flag, C_NOBLKMAP); /* suspend page-ins */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - retval = HeadTruncateFile(vcb, (FCB*)fp, headblks); + retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks); - CLR(VTOC(vp)->c_flag, C_NOBLKMAP); /* resume page-ins */ - if (ISSET(VTOC(vp)->c_flag, C_WBLKMAP)) - wakeup(VTOC(vp)); + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = 0; if (retval) goto restore; - - fp->ff_size = realsize; - if (UBCISVALID(vp)) { - (void) ubc_setsize(vp, realsize); - need_vinval = 1; - } - - CLR(VTOC(vp)->c_flag, C_RELOCATING); /* Resume page-outs for this file. */ out: - (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, p); + if (took_trunc_lock) + hfs_unlock_truncate(cp); - // XXXdbg - do this after unlocking the extents-overflow - // file to avoid deadlocks (see comment above by STEP 2) - if (need_vinval) { - (void) vinvalbuf(vp, V_SAVE, cred, p, 0, 0); + if (lockflags) { + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = 0; } - retval = VOP_FSYNC(vp, cred, MNT_WAIT, p); -out2: + // See comment up above about calls to hfs_fsync() + // + //if (retval == 0) + // retval = hfs_fsync(vp, MNT_WAIT, 0, p); + if (hfsmp->jnl) { - if (VTOC(vp)->c_cnid < kHFSFirstUserCatalogNodeID) + if (cp->c_cnid < kHFSFirstUserCatalogNodeID) (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); else (void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); - journal_end_transaction(hfsmp->jnl); } - hfs_global_shared_lock_release(hfsmp); +exit: + if (disabled_caching) { + vnode_clearnocache(vp); + } + if (started_tr) + hfs_end_transaction(hfsmp); return (retval); restore: + if (fp->ff_blocks == headblks) + goto exit; /* * Give back any newly allocated space. 
*/ - if (fp->ff_size != realsize) - fp->ff_size = realsize; - (void) TruncateFileC(vcb, (FCB*)fp, fp->ff_size, false); - if (UBCISVALID(vp)) - (void) ubc_setsize(vp, fp->ff_size); - CLR(VTOC(vp)->c_flag, C_RELOCATING); - goto out; + if (lockflags == 0) { + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + } + + (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, false); + + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = 0; + + if (took_trunc_lock) + hfs_unlock_truncate(cp); + goto exit; } @@ -2407,30 +2837,30 @@ restore: * */ static int -hfs_clonelink(struct vnode *vp, int blksize, struct ucred *cred, struct proc *p) +hfs_clonelink(struct vnode *vp, int blksize, kauth_cred_t cred, struct proc *p) { struct buf *head_bp = NULL; struct buf *tail_bp = NULL; int error; - error = meta_bread(vp, 0, blksize, cred, &head_bp); + error = (int)buf_meta_bread(vp, (daddr64_t)0, blksize, cred, &head_bp); if (error) goto out; - tail_bp = getblk(vp, 1, blksize, 0, 0, BLK_META); + tail_bp = buf_getblk(vp, (daddr64_t)1, blksize, 0, 0, BLK_META); if (tail_bp == NULL) { error = EIO; goto out; } - bcopy(head_bp->b_data, tail_bp->b_data, blksize); - error = bwrite(tail_bp); + bcopy((char *)buf_dataptr(head_bp), (char *)buf_dataptr(tail_bp), blksize); + error = (int)buf_bwrite(tail_bp); out: if (head_bp) { - head_bp->b_flags |= B_INVAL; - brelse(head_bp); + buf_markinvalid(head_bp); + buf_brelse(head_bp); } - (void) vinvalbuf(vp, V_SAVE, cred, p, 0, 0); + (void) buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0); return (error); } @@ -2440,39 +2870,19 @@ out: * */ static int -hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize, - struct ucred *cred, struct proc *p) +hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize) { caddr_t bufp; size_t writebase; size_t bufsize; size_t copysize; size_t iosize; - size_t filesize; + off_t filesize; size_t offset; - struct uio auio; - struct iovec aiov; - int devblocksize; - int didhold; - int error; - - - if ((error = vinvalbuf(vp, V_SAVE, cred, p, 0, 0))) { - printf("hfs_clonefile: vinvalbuf failed - %d\n", error); - return (error); - } - - if (!ubc_clean(vp, 1)) { - printf("hfs_clonefile: not ubc_clean\n"); - return (EIO); /* XXX error code */ - } - - /* - * Suspend page-outs for this file. 
- */ - SET(VTOC(vp)->c_flag, C_RELOCATING); + uio_t auio; + int error = 0; - filesize = VTOF(vp)->ff_size; + filesize = VTOF(vp)->ff_blocks * blksize; /* virtual file size */ writebase = blkstart * blksize; copysize = blkcnt * blksize; iosize = bufsize = MIN(copysize, 4096 * 16); @@ -2481,71 +2891,54 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize, if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) { return (ENOMEM); } + hfs_unlock(VTOC(vp)); - VOP_DEVBLOCKSIZE(VTOC(vp)->c_devvp, &devblocksize); - - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_procp = p; + auio = uio_create(1, 0, UIO_SYSSPACE32, UIO_READ); while (offset < copysize) { iosize = MIN(copysize - offset, iosize); - aiov.iov_base = bufp; - aiov.iov_len = iosize; - auio.uio_resid = iosize; - auio.uio_offset = offset; - auio.uio_rw = UIO_READ; + uio_reset(auio, offset, UIO_SYSSPACE32, UIO_READ); + uio_addiov(auio, (uintptr_t)bufp, iosize); - error = cluster_read(vp, &auio, copysize, devblocksize, 0); + error = cluster_read(vp, auio, copysize, 0); if (error) { printf("hfs_clonefile: cluster_read failed - %d\n", error); break; } - if (auio.uio_resid != 0) { - printf("clonedata: cluster_read: uio_resid = %d\n", (int)auio.uio_resid); + if (uio_resid(auio) != 0) { + printf("clonedata: cluster_read: uio_resid = %lld\n", uio_resid(auio)); error = EIO; break; } + uio_reset(auio, writebase + offset, UIO_SYSSPACE32, UIO_WRITE); + uio_addiov(auio, (uintptr_t)bufp, iosize); - aiov.iov_base = bufp; - aiov.iov_len = iosize; - auio.uio_resid = iosize; - auio.uio_offset = writebase + offset; - auio.uio_rw = UIO_WRITE; - - error = cluster_write(vp, &auio, filesize + offset, + error = cluster_write(vp, auio, filesize + offset, filesize + offset + iosize, - auio.uio_offset, 0, devblocksize, 0); + uio_offset(auio), 0, IO_NOCACHE | IO_SYNC); if (error) { printf("hfs_clonefile: cluster_write failed - %d\n", error); break; } - if (auio.uio_resid != 0) { + if (uio_resid(auio) != 0) { printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n"); error = EIO; break; } offset += iosize; } - if (error == 0) { - /* Clean the pages in VM. */ - didhold = ubc_hold(vp); - if (didhold) - (void) ubc_clean(vp, 1); - - /* - * Clean out all associated buffers. - */ - (void) vinvalbuf(vp, V_SAVE, cred, p, 0, 0); - - if (didhold) - ubc_rele(vp); - } + uio_free(auio); + + /* + * No need to call ubc_sync_range or hfs_invalbuf + * since the file was copied using IO_NOCACHE. 
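The struct uio conversion in hfs_clonefile() above is mechanical; a condensed sketch of the old/new equivalence, with field names taken from the removed lines (bufp, iosize and offset stand in for the loop state):

    /* Before: caller-owned iovec and uio on the stack. */
    struct iovec aiov;
    struct uio auio;
    aiov.iov_base   = bufp;
    aiov.iov_len    = iosize;
    auio.uio_iov    = &aiov;
    auio.uio_iovcnt = 1;
    auio.uio_offset = offset;
    auio.uio_resid  = iosize;
    auio.uio_segflg = UIO_SYSSPACE;
    auio.uio_rw     = UIO_READ;

    /* After: an opaque uio_t, reused across iterations via uio_reset(). */
    uio_t auio = uio_create(1, offset, UIO_SYSSPACE32, UIO_READ);
    uio_addiov(auio, (uintptr_t)bufp, iosize);
    /* ... cluster_read(vp, auio, copysize, 0); ... */
    uio_free(auio);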
+ */ + kmem_free(kernel_map, (vm_offset_t)bufp, bufsize); - + + hfs_lock(VTOC(vp), HFS_FORCE_LOCK); return (error); } @@ -2555,15 +2948,17 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize, */ static int hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize, - struct ucred *cred, struct proc *p) + kauth_cred_t cred, struct proc *p) { caddr_t bufp; char * offset; size_t bufsize; size_t iosize; struct buf *bp = NULL; - daddr_t blkno; - daddr_t blk; + daddr64_t blkno; + daddr64_t blk; + daddr64_t start_blk; + daddr64_t last_blk; int breadcnt; int i; int error = 0; @@ -2576,30 +2971,31 @@ hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize, if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) { return (ENOMEM); } - blkstart = (blkstart * blksize) / iosize; - blkcnt = (blkcnt * blksize) / iosize; + start_blk = ((daddr64_t)blkstart * blksize) / iosize; + last_blk = ((daddr64_t)blkcnt * blksize) / iosize; blkno = 0; - while (blkno < blkcnt) { + while (blkno < last_blk) { /* * Read up to a megabyte */ offset = bufp; - for (i = 0, blk = blkno; (i < breadcnt) && (blk < blkcnt); ++i, ++blk) { - error = meta_bread(vp, blk, iosize, cred, &bp); + for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) { + error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp); if (error) { printf("hfs_clonesysfile: meta_bread error %d\n", error); goto out; } - if (bp->b_bcount != iosize) { - printf("hfs_clonesysfile: b_bcount is only %d\n", bp->b_bcount); + if (buf_count(bp) != iosize) { + printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp)); goto out; } - - bcopy(bp->b_data, offset, iosize); - bp->b_flags |= B_INVAL; - brelse(bp); + bcopy((char *)buf_dataptr(bp), offset, iosize); + + buf_markinvalid(bp); + buf_brelse(bp); bp = NULL; + offset += iosize; } @@ -2607,15 +3003,15 @@ hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize, * Write up to a megabyte */ offset = bufp; - for (i = 0; (i < breadcnt) && (blkno < blkcnt); ++i, ++blkno) { - bp = getblk(vp, blkstart + blkno, iosize, 0, 0, BLK_META); + for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) { + bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META); if (bp == NULL) { - printf("hfs_clonesysfile: getblk failed on blk %d\n", blkstart + blkno); + printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno); error = EIO; goto out; } - bcopy(offset, bp->b_data, iosize); - error = bwrite(bp); + bcopy(offset, (char *)buf_dataptr(bp), iosize); + error = (int)buf_bwrite(bp); bp = NULL; if (error) goto out; @@ -2624,13 +3020,12 @@ hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize, } out: if (bp) { - brelse(bp); + buf_brelse(bp); } kmem_free(kernel_map, (vm_offset_t)bufp, bufsize); - error = VOP_FSYNC(vp, cred, MNT_WAIT, p); + error = hfs_fsync(vp, MNT_WAIT, 0, p); return (error); } - diff --git a/bsd/hfs/hfs_search.c b/bsd/hfs/hfs_search.c index 83fef8f2e..930f9776c 100644 --- a/bsd/hfs/hfs_search.c +++ b/bsd/hfs/hfs_search.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1997-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -26,7 +26,6 @@ #include <sys/systm.h> #include <sys/kernel.h> #include <sys/file.h> -#include <sys/buf.h> #include <sys/proc.h> #include <sys/conf.h> #include <mach/machine/vm_types.h> @@ -35,6 +34,8 @@ #include <sys/signalvar.h> #include <sys/attr.h> #include <sys/utfconv.h> +#include <sys/kauth.h> +#include <sys/vnode_internal.h> #include "hfs.h" #include "hfs_dbg.h" @@ -43,16 +44,15 @@ #include "hfs_endian.h" #include "hfscommon/headers/FileMgrInternal.h" -#include "hfscommon/headers/CatalogPrivate.h" #include "hfscommon/headers/HFSUnicodeWrappers.h" #include "hfscommon/headers/BTreesPrivate.h" #include "hfscommon/headers/BTreeScanner.h" - +#include "hfscommon/headers/CatalogPrivate.h" /* Search criterea. */ struct directoryInfoSpec { - u_long numFiles; + u_int32_t numFiles; }; struct fileInfoSpec @@ -75,7 +75,7 @@ struct searchinfospec struct timespec changeDate; struct timespec accessDate; struct timespec lastBackupDate; - u_long finderInfo[8]; + uint8_t finderInfo[32]; uid_t uid; gid_t gid; mode_t mask; @@ -87,7 +87,7 @@ typedef struct searchinfospec searchinfospec_t; static void ResolveHardlink(ExtendedVCB *vcb, HFSPlusCatalogFile *recp); -static int UnpackSearchAttributeBlock(struct vnode *vp, struct attrlist *alist, +static int UnpackSearchAttributeBlock(struct hfsmount *hfsmp, struct attrlist *alist, searchinfospec_t *searchInfo, void *attributeBuffer); static int CheckCriteria( ExtendedVCB *vcb, @@ -101,10 +101,10 @@ static int CheckCriteria( ExtendedVCB *vcb, static int CheckAccess(ExtendedVCB *vcb, u_long searchBits, CatalogKey *key, struct proc *p); -static int InsertMatch(struct vnode *vp, struct uio *a_uio, CatalogRecord *rec, +static int InsertMatch(struct hfsmount *hfsmp, uio_t a_uio, CatalogRecord *rec, CatalogKey *key, struct attrlist *returnAttrList, void *attributesBuffer, void *variableBuffer, - u_long bufferSize, u_long * nummatches ); + u_long * nummatches ); static Boolean CompareRange(u_long val, u_long low, u_long high); static Boolean CompareWideRange(u_int64_t val, u_int64_t low, u_int64_t high); @@ -124,21 +124,8 @@ static Boolean CompareWideRange( u_int64_t val, u_int64_t low, u_int64_t high ) static Boolean IsTargetName( searchinfospec_t * searchInfoPtr, Boolean isHFSPlus ); #endif // Installer workaround -extern int cat_convertkey( - struct hfsmount *hfsmp, - CatalogKey *key, - CatalogRecord * recp, - struct cat_desc *descp); +__private_extern__ int hfs_vnop_search(struct vnop_searchfs_args *ap); -extern void cat_convertattr( - struct hfsmount *hfsmp, - CatalogRecord * recp, - struct cat_attr *attrp, - struct cat_fork *datafp, - struct cat_fork *rsrcfp); - -extern int resolvelink(struct hfsmount *hfsmp, u_long linkref, - struct HFSPlusCatalogFile *recp); /************************************************************************/ /* Entry for searchfs() */ @@ -149,19 +136,19 @@ extern int resolvelink(struct hfsmount *hfsmp, u_long linkref, # #% searchfs vp L L L # -vop_searchfs { +vnop_searchfs { IN struct vnode *vp; IN off_t length; IN int flags; - IN struct ucred *cred; + IN kauth_cred_t cred; IN struct proc *p; }; */ __private_extern__ int -hfs_search( ap ) - struct vop_searchfs_args *ap; /* +hfs_vnop_search(ap) + struct vnop_searchfs_args *ap; /* struct vnodeop_desc *a_desc; struct vnode *a_vp; void *a_searchparams1; @@ -175,9 +162,11 @@ hfs_search( ap ) u_long a_options; struct uio *a_uio; struct searchstate *a_searchstate; + vfs_context_t a_context; */ { ExtendedVCB *vcb = VTOVCB(ap->a_vp); + struct hfsmount 
*hfsmp; FCB * catalogFCB; searchinfospec_t searchInfo1; searchinfospec_t searchInfo2; @@ -185,7 +174,7 @@ hfs_search( ap ) void *variableBuffer; u_long fixedBlockSize; u_long eachReturnBufferSize; - struct proc *p = current_proc(); + struct proc *p = proc_self(); int err = E_NONE; int isHFSPlus; int timerExpired = false; @@ -194,9 +183,10 @@ hfs_search( ap ) CatalogRecord * myCurrentDataPtr; CatPosition * myCatPositionPtr; BTScanState myBTScanState; - void *user_start = NULL; - int user_len; + user_addr_t user_start = 0; + user_size_t user_len = 0; int32_t searchTime; + int lockflags; /* XXX Parameter check a_searchattrs? */ @@ -216,14 +206,15 @@ hfs_search( ap ) attrs = ap->a_searchattrs->commonattr | ap->a_returnattrs->commonattr; if (attrs & (ATTR_CMN_NAME | ATTR_CMN_PAROBJID)) return (EINVAL); - if ((err = suser(p->p_ucred, &p->p_acflag))) + if ((err = suser(kauth_cred_get(), 0))) return (err); } - if (ap->a_uio->uio_resid <= 0) + if (uio_resid(ap->a_uio) <= 0) return (EINVAL); isHFSPlus = (vcb->vcbSigWord == kHFSPlusSigWord); + hfsmp = VTOHFS(ap->a_vp); searchTime = kMaxMicroSecsInKernel; if (ap->a_timelimit->tv_sec == 0 && @@ -233,14 +224,15 @@ hfs_search( ap ) } /* UnPack the search boundries, searchInfo1, searchInfo2 */ - err = UnpackSearchAttributeBlock(ap->a_vp, ap->a_searchattrs, + err = UnpackSearchAttributeBlock(hfsmp, ap->a_searchattrs, &searchInfo1, ap->a_searchparams1); if (err) return err; - err = UnpackSearchAttributeBlock(ap->a_vp, ap->a_searchattrs, + err = UnpackSearchAttributeBlock(hfsmp, ap->a_searchattrs, &searchInfo2, ap->a_searchparams2); if (err) return err; - fixedBlockSize = sizeof(u_long) + hfs_attrblksize(ap->a_returnattrs); /* u_long for length longword */ + fixedBlockSize = sizeof(uint32_t) + hfs_attrblksize(ap->a_returnattrs); /* uint32_t for length word */ + eachReturnBufferSize = fixedBlockSize; if ( ap->a_returnattrs->commonattr & ATTR_CMN_NAME ) /* XXX should be more robust! */ @@ -253,20 +245,17 @@ hfs_search( ap ) // while holding the shared catalog file lock. see the comment // in hfs_readdir() for more details. // - if (VTOHFS(ap->a_vp)->jnl && ap->a_uio->uio_segflg == UIO_USERSPACE) { - user_start = ap->a_uio->uio_iov->iov_base; - user_len = ap->a_uio->uio_iov->iov_len; + if (hfsmp->jnl && uio_isuserspace(ap->a_uio)) { + user_start = uio_curriovbase(ap->a_uio); + user_len = uio_curriovlen(ap->a_uio); if ((err = vslock(user_start, user_len)) != 0) { - user_start = NULL; + user_start = 0; goto ExitThisRoutine; } } - /* Lock catalog b-tree */ - err = hfs_metafilelocking(VTOHFS(ap->a_vp), kHFSCatalogFileID, LK_SHARED, p); - if (err) - goto ExitThisRoutine; + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); catalogFCB = GetFileControlBlock(vcb->catalogRefNum); myCurrentKeyPtr = NULL; @@ -276,9 +265,11 @@ hfs_search( ap ) if (ap->a_options & SRCHFS_START) { /* Starting a new search. 
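The vslock() call above matters on journaled volumes: a page fault taken while copying results out to an unwired user buffer would occur with the shared catalog lock held, which can deadlock against the journal (the same reasoning the comment borrows from hfs_readdir). The shape of the pattern, using only calls present in the diff:

    user_addr_t user_start = 0;
    user_size_t user_len = 0;

    if (hfsmp->jnl && uio_isuserspace(uio)) {
    	user_start = uio_curriovbase(uio);
    	user_len = uio_curriovlen(uio);
    	if ((err = vslock(user_start, user_len)) != 0)
    		return (err);		/* nothing wired, nothing to undo */
    }
    /* ... scan the catalog and copy matches into the uio ... */
    if (hfsmp->jnl && user_start)
    	vsunlock(user_start, user_len, TRUE);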
*/ /* Make sure the on-disk Catalog file is current */ - (void) VOP_FSYNC(vcb->catalogRefNum, NOCRED, MNT_WAIT, p); - if (VTOHFS(ap->a_vp)->jnl) { - journal_flush(VTOHFS(ap->a_vp)->jnl); + (void) hfs_fsync(vcb->catalogRefNum, MNT_WAIT, 0, p); + if (hfsmp->jnl) { + hfs_systemfile_unlock(hfsmp, lockflags); + journal_flush(hfsmp->jnl); + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); } ap->a_options &= ~SRCHFS_START; @@ -312,17 +303,20 @@ hfs_search( ap ) result = BTSearchRecord( catalogFCB, &iterator, &btrec, &reclen, &iterator ); if ( result == E_NONE ) { + // need to unlock since CheckAccess assumes no lock held + hfs_systemfile_unlock(hfsmp, lockflags); if (CheckCriteria(vcb, ap->a_options, ap->a_searchattrs, &rec, keyp, &searchInfo1, &searchInfo2, false) && - CheckAccess(vcb, ap->a_options, keyp, ap->a_uio->uio_procp)) { + CheckAccess(vcb, ap->a_options, keyp, p)) { - result = InsertMatch(ap->a_vp, ap->a_uio, &rec, + result = InsertMatch(hfsmp, ap->a_uio, &rec, keyp, ap->a_returnattrs, attributesBuffer, variableBuffer, - eachReturnBufferSize, ap->a_nummatches); + ap->a_nummatches); if (result == E_NONE && *(ap->a_nummatches) >= ap->a_maxmatches) doQuickExit = true; } + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); } } #endif // Installer workaround @@ -340,9 +334,8 @@ hfs_search( ap ) err = EBUSY; /* catChangedErr */ } } + hfs_systemfile_unlock(hfsmp, lockflags); - /* Unlock catalog b-tree */ - (void) hfs_metafilelocking(VTOHFS(ap->a_vp), kHFSCatalogFileID, LK_RELEASE, p); if (err) goto ExitThisRoutine; #if 1 // Installer workaround (2940423) @@ -365,16 +358,17 @@ hfs_search( ap ) break; /* Resolve any hardlinks */ - if (isHFSPlus && (ap->a_options & SRCHFS_SKIPLINKS) == 0) + if (isHFSPlus && (ap->a_options & SRCHFS_SKIPLINKS) == 0) { + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); ResolveHardlink(vcb, (HFSPlusCatalogFile *) myCurrentDataPtr); - + hfs_systemfile_unlock(hfsmp, lockflags); + } if (CheckCriteria( vcb, ap->a_options, ap->a_searchattrs, myCurrentDataPtr, myCurrentKeyPtr, &searchInfo1, &searchInfo2, true ) - && CheckAccess(vcb, ap->a_options, myCurrentKeyPtr, ap->a_uio->uio_procp)) { - err = InsertMatch(ap->a_vp, ap->a_uio, myCurrentDataPtr, + && CheckAccess(vcb, ap->a_options, myCurrentKeyPtr, p)) { + err = InsertMatch(hfsmp, ap->a_uio, myCurrentDataPtr, myCurrentKeyPtr, ap->a_returnattrs, - attributesBuffer, variableBuffer, - eachReturnBufferSize, ap->a_nummatches); + attributesBuffer, variableBuffer, ap->a_nummatches); if (err) { /* * The last match didn't fit so come back @@ -394,7 +388,7 @@ hfs_search( ap ) * The idea here is to throttle the amount of time we * spend in the kernel. 
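Replacing the removed global `time` with an explicit microuptime() sample, the search throttle below condenses to:

    struct timeval now, elapsed;

    microuptime(&now);
    timersub(&now, &myBTScanState.startTime, &elapsed);
    /* assumes searchTime is under 1,000,000 usec, per the comment below */
    if (elapsed.tv_sec > 0 || elapsed.tv_usec >= searchTime)
    	timerExpired = true;	/* return to user space; caller resumes the scan */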
*/ - myCurrentTime = time; + microuptime(&myCurrentTime); timersub(&myCurrentTime, &myBTScanState.startTime, &myElapsedTime); /* Note: assumes kMaxMicroSecsInKernel is less than 1,000,000 */ if (myElapsedTime.tv_sec > 0 @@ -425,9 +419,9 @@ QuickExit: } ExitThisRoutine: - FREE( attributesBuffer, M_TEMP ); + FREE( attributesBuffer, M_TEMP ); - if (VTOHFS(ap->a_vp)->jnl && user_start) { + if (hfsmp->jnl && user_start) { vsunlock(user_start, user_len, TRUE); } @@ -514,100 +508,6 @@ ComparePartialPascalName ( register ConstStr31Param str, register ConstStr31Para } - -static char *extension_table=NULL; -static int nexts; -static int max_ext_width; - -static int -extension_cmp(void *a, void *b) -{ - return (strlen((char *)a) - strlen((char *)b)); -} - - -// -// This is the api LaunchServices uses to inform the kernel -// the list of package extensions to ignore. -// -// Internally we keep the list sorted by the length of the -// the extension (from longest to shortest). We sort the -// list of extensions so that we can speed up our searches -// when comparing file names -- we only compare extensions -// that could possibly fit into the file name, not all of -// them (i.e. a short 8 character name can't have an 8 -// character extension). -// -__private_extern__ int -set_package_extensions_table(void *data, int nentries, int maxwidth) -{ - char *new_exts, *ptr; - int error, i, len; - - if (nentries <= 0 || nentries > 1024 || maxwidth <= 0 || maxwidth > 255) { - return EINVAL; - } - - MALLOC(new_exts, char *, nentries * maxwidth, M_TEMP, M_WAITOK); - - error = copyin(data, new_exts, nentries * maxwidth); - if (error) { - FREE(new_exts, M_TEMP); - return error; - } - - if (extension_table) { - FREE(extension_table, M_TEMP); - } - extension_table = new_exts; - nexts = nentries; - max_ext_width = maxwidth; - - qsort(extension_table, nexts, maxwidth, extension_cmp); - - return 0; -} - - -static int -is_package_name(char *name, int len) -{ - int i, extlen; - char *ptr, *name_ext; - - if (len <= 3) { - return 0; - } - - name_ext = NULL; - for(ptr=name; *ptr != '\0'; ptr++) { - if (*ptr == '.') { - name_ext = ptr; - } - } - - // if there is no "." extension, it can't match - if (name_ext == NULL) { - return 0; - } - - // advance over the "." - name_ext++; - - // now iterate over all the extensions to see if any match - ptr = &extension_table[0]; - for(i=0; i < nexts; i++, ptr+=max_ext_width) { - extlen = strlen(ptr); - if (strncmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') { - // aha, a match! - return 1; - } - } - - // if we get here, no extension matched - return 0; -} - // // Determine if a name is "inappropriate" where the definition // of "inappropriate" is up to higher level execs. 
Currently @@ -616,11 +516,11 @@ is_package_name(char *name, int len) static int is_inappropriate_name(char *name, int len) { - char *bad_names[] = { "System" }; + const char *bad_names[] = { "System" }; int bad_len[] = { 6 }; int i; - for(i=0; i < sizeof(bad_names) / sizeof(bad_names[0]); i++) { + for(i=0; i < (int) (sizeof(bad_names) / sizeof(bad_names[0])); i++) { if (len == bad_len[i] && strcmp(name, bad_names[i]) == 0) { return 1; } @@ -639,27 +539,25 @@ is_inappropriate_name(char *name, int len) static int CheckAccess(ExtendedVCB *theVCBPtr, u_long searchBits, CatalogKey *theKeyPtr, struct proc *theProcPtr) { - Boolean isHFSPlus; - int myErr; - int myResult; + Boolean isHFSPlus; + int myErr; + int myResult; HFSCatalogNodeID myNodeID; - unsigned long myPerms; - hfsmount_t * my_hfsmountPtr; - struct cat_desc my_cat_desc; - struct cat_attr my_cat_attr; - struct FndrDirInfo *finder_info; + hfsmount_t * hfsmp; + struct FndrDirInfo *finfop; + struct vnode * vp = NULL; + struct vfs_context my_context; - myResult = 0; /* default to "no access" */ - my_cat_desc.cd_nameptr = NULL; - my_cat_desc.cd_namelen = 0; + my_context.vc_proc = theProcPtr; + my_context.vc_ucred = kauth_cred_get(); - if ( theProcPtr->p_ucred->cr_uid == 0 ) { + if (!proc_suser(theProcPtr)) { myResult = 1; /* allow access */ goto ExitThisRoutine; /* root always has access */ } - my_hfsmountPtr = VCBTOHFS( theVCBPtr ); + hfsmp = VCBTOHFS( theVCBPtr ); isHFSPlus = ( theVCBPtr->vcbSigWord == kHFSPlusSigWord ); if ( isHFSPlus ) myNodeID = theKeyPtr->hfsPlus.parentID; @@ -667,54 +565,57 @@ CheckAccess(ExtendedVCB *theVCBPtr, u_long searchBits, CatalogKey *theKeyPtr, st myNodeID = theKeyPtr->hfs.parentID; while ( myNodeID >= kRootDirID ) { + cnode_t * cp; + /* now go get catalog data for this directory */ - myErr = hfs_metafilelocking( my_hfsmountPtr, kHFSCatalogFileID, LK_SHARED, theProcPtr ); - if ( myErr ) - goto ExitThisRoutine; /* no access */ - - myErr = cat_idlookup( my_hfsmountPtr, myNodeID, &my_cat_desc, &my_cat_attr, NULL ); - (void) hfs_metafilelocking( my_hfsmountPtr, kHFSCatalogFileID, LK_RELEASE, theProcPtr ); - if ( myErr ) + myErr = hfs_vget(hfsmp, myNodeID, &vp, 0); + if ( myErr ) { goto ExitThisRoutine; /* no access */ + } - if (searchBits & SRCHFS_SKIPPACKAGES) { - if (is_package_name(my_cat_desc.cd_nameptr, my_cat_desc.cd_namelen)) { - myResult = 0; - goto ExitThisRoutine; + cp = VTOC(vp); + finfop = (struct FndrDirInfo *)&cp->c_attr.ca_finderinfo[0]; + + if ( searchBits & SRCHFS_SKIPPACKAGES ) { + if ( (SWAP_BE16(finfop->frFlags) & kHasBundle) + || (cp->c_desc.cd_nameptr != NULL + && is_package_name(cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen)) ) { + myResult = 0; + goto ExitThisRoutine; } } - if (searchBits & SRCHFS_SKIPINAPPROPRIATE) { - if ( my_cat_desc.cd_parentcnid == kRootDirID - && is_inappropriate_name(my_cat_desc.cd_nameptr, my_cat_desc.cd_namelen)) { - myResult = 0; - goto ExitThisRoutine; + if ( searchBits & SRCHFS_SKIPINAPPROPRIATE ) { + if ( cp->c_parentcnid == kRootDirID && cp->c_desc.cd_nameptr != NULL && + is_inappropriate_name(cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen) ) { + myResult = 0; + goto ExitThisRoutine; } } - finder_info = (struct FndrDirInfo *)&my_cat_attr.ca_finderinfo[0]; - if ( (searchBits & SRCHFS_SKIPINVISIBLE) - && (SWAP_BE16(finder_info->frFlags) & kIsInvisible)) { - + if ( (searchBits & SRCHFS_SKIPINVISIBLE) && + (SWAP_BE16(finfop->frFlags) & kIsInvisible) ) { myResult = 0; goto ExitThisRoutine; } - myNodeID = my_cat_desc.cd_parentcnid; /* move up the hierarchy */ - 
myPerms = DerivePermissionSummary(my_cat_attr.ca_uid, my_cat_attr.ca_gid, - my_cat_attr.ca_mode, my_hfsmountPtr->hfs_mp, - theProcPtr->p_ucred, theProcPtr ); - - cat_releasedesc( &my_cat_desc ); - - if ( (myPerms & X_OK) == 0 ) + myNodeID = cp->c_parentcnid; /* move up the hierarchy */ + hfs_unlock(VTOC(vp)); + myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH), &my_context); + //myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), &my_context); + vnode_put(vp); + vp = NULL; + if ( myErr ) { goto ExitThisRoutine; /* no access */ + } } - myResult = 1; /* allow access */ ExitThisRoutine: - cat_releasedesc( &my_cat_desc ); + if ( vp != NULL ) { + hfs_unlock(VTOC(vp)); + vnode_put(vp); + } return ( myResult ); } @@ -732,10 +633,11 @@ CheckCriteria( ExtendedVCB *vcb, Boolean matched, atleastone; Boolean isHFSPlus; attrgroup_t searchAttributes; - struct cat_attr c_attr = {0}; + struct cat_attr c_attr; struct cat_fork datafork; struct cat_fork rsrcfork; + bzero(&c_attr, sizeof(c_attr)); isHFSPlus = (vcb->vcbSigWord == kHFSPlusSigWord); switch (rec->recordType) { @@ -1056,9 +958,9 @@ CheckCriteria( ExtendedVCB *vcb, /* mode */ if ( searchAttributes & ATTR_CMN_ACCESSMASK ) { - matched = CompareRange((u_long)c_attr.ca_mode, - (u_long)searchInfo1->mask, - (u_long)searchInfo2->mask); + matched = CompareRange((uint32_t)c_attr.ca_mode, + (uint32_t)searchInfo1->mask, + (uint32_t)searchInfo2->mask); if (matched == false) goto TestDone; atleastone = true; } @@ -1084,28 +986,28 @@ TestDone: * Adds another record to the packed array for output */ static int -InsertMatch( struct vnode *root_vp, struct uio *a_uio, CatalogRecord *rec, - CatalogKey *key, struct attrlist *returnAttrList, void *attributesBuffer, - void *variableBuffer, u_long bufferSize, u_long * nummatches ) +InsertMatch(struct hfsmount *hfsmp, uio_t a_uio, CatalogRecord *rec, + CatalogKey *key, struct attrlist *returnAttrList, + void *attributesBuffer, void *variableBuffer, u_long * nummatches) { int err; void *rovingAttributesBuffer; void *rovingVariableBuffer; u_long packedBufferSize; - ExtendedVCB *vcb = VTOVCB(root_vp); - Boolean isHFSPlus = vcb->vcbSigWord == kHFSPlusSigWord; - u_long privateDir = VTOHFS(root_vp)->hfs_privdir_desc.cd_cnid; + u_long privateDir = hfsmp->hfs_privdir_desc.cd_cnid; struct attrblock attrblk; - struct cat_desc c_desc = {0}; - struct cat_attr c_attr = {0}; + struct cat_desc c_desc; + struct cat_attr c_attr; struct cat_fork datafork; struct cat_fork rsrcfork; + bzero(&c_desc, sizeof(c_desc)); + bzero(&c_attr, sizeof(c_attr)); rovingAttributesBuffer = (char*)attributesBuffer + sizeof(u_long); /* Reserve space for length field */ rovingVariableBuffer = variableBuffer; /* Convert catalog record into cat_attr format. 
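CheckAccess() now delegates to the kauth layer instead of recomputing a permission summary with DerivePermissionSummary(). Stripped of the skip-flag handling, the parent-directory walk above reduces to the following sketch (dir_id stands in for myNodeID):

    struct vfs_context ctx;
    struct vnode *dvp;

    ctx.vc_proc = p;
    ctx.vc_ucred = kauth_cred_get();
    while (dir_id >= kRootDirID) {
    	if (hfs_vget(hfsmp, dir_id, &dvp, 0))
    		return (0);				/* no access */
    	dir_id = VTOC(dvp)->c_parentcnid;	/* move up the hierarchy */
    	hfs_unlock(VTOC(dvp));
    	if (vnode_authorize(dvp, NULL, KAUTH_VNODE_SEARCH, &ctx)) {
    		vnode_put(dvp);
    		return (0);				/* no access */
    	}
    	vnode_put(dvp);
    }
    return (1);					/* allow access */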
*/ - cat_convertattr(VTOHFS(root_vp), rec, &c_attr, &datafork, &rsrcfork); + cat_convertattr(hfsmp, rec, &c_attr, &datafork, &rsrcfork); /* hide our private meta data directory */ if ((privateDir != 0) && (c_attr.ca_fileid == privateDir)) { @@ -1114,21 +1016,21 @@ InsertMatch( struct vnode *root_vp, struct uio *a_uio, CatalogRecord *rec, } /* Hide the private journal files */ - if (VTOHFS(root_vp)->jnl && - ((c_attr.ca_fileid == VTOHFS(root_vp)->hfs_jnlfileid) || - (c_attr.ca_fileid == VTOHFS(root_vp)->hfs_jnlinfoblkid))) { + if (hfsmp->jnl && + ((c_attr.ca_fileid == hfsmp->hfs_jnlfileid) || + (c_attr.ca_fileid == hfsmp->hfs_jnlinfoblkid))) { err = 0; goto exit; } if (returnAttrList->commonattr & ATTR_CMN_NAME) { - cat_convertkey(VTOHFS(root_vp), key, rec, &c_desc); + cat_convertkey(hfsmp, key, rec, &c_desc); } else { c_desc.cd_cnid = c_attr.ca_fileid; - if (isHFSPlus) - c_desc.cd_parentcnid = key->hfsPlus.parentID; - else + if (hfsmp->hfs_flags & HFS_STANDARD) c_desc.cd_parentcnid = key->hfs.parentID; + else + c_desc.cd_parentcnid = key->hfsPlus.parentID; } attrblk.ab_attrlist = returnAttrList; @@ -1137,11 +1039,11 @@ InsertMatch( struct vnode *root_vp, struct uio *a_uio, CatalogRecord *rec, attrblk.ab_flags = 0; attrblk.ab_blocksize = 0; - hfs_packattrblk(&attrblk, VTOHFS(root_vp), NULL, &c_desc, &c_attr, &datafork, &rsrcfork, a_uio->uio_procp); + hfs_packattrblk(&attrblk, hfsmp, NULL, &c_desc, &c_attr, &datafork, &rsrcfork, current_proc()); packedBufferSize = (char*)rovingVariableBuffer - (char*)attributesBuffer; - if ( packedBufferSize > a_uio->uio_resid ) + if ( packedBufferSize > uio_resid(a_uio) ) return( errSearchBufferFull ); (* nummatches)++; @@ -1157,18 +1059,21 @@ exit: static int -UnpackSearchAttributeBlock( struct vnode *vp, struct attrlist *alist, searchinfospec_t *searchInfo, void *attributeBuffer ) +UnpackSearchAttributeBlock( struct hfsmount *hfsmp, struct attrlist *alist, searchinfospec_t *searchInfo, void *attributeBuffer ) { attrgroup_t a; u_long bufferSize; + boolean_t is_64_bit; DBG_ASSERT(searchInfo != NULL); + + is_64_bit = proc_is64bit(current_proc()); - bufferSize = *((u_long *)attributeBuffer); + bufferSize = *((uint32_t *)attributeBuffer); if (bufferSize == 0) return (EINVAL); /* XXX -DJB is a buffer size of zero ever valid for searchfs? 
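Each timestamp attribute unpacked below repeats the same 64-bit-aware block: a 64-bit caller lays the field out as struct user_timespec (64-bit tv_sec) where a 32-bit caller passes struct timespec. A hypothetical helper, not part of the patch, that would factor the five copies:

    static void
    unpack_search_timespec(void **bufp, struct timespec *ts, boolean_t is_64_bit)
    {
    	if (is_64_bit) {
    		struct user_timespec tmp = *((struct user_timespec *)*bufp);
    		ts->tv_sec  = (time_t)tmp.tv_sec;
    		ts->tv_nsec = tmp.tv_nsec;
    		*bufp = ((struct user_timespec *)*bufp) + 1;
    	} else {
    		*ts = *((struct timespec *)*bufp);
    		*bufp = ((struct timespec *)*bufp) + 1;
    	}
    }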
*/ - ++((u_long *)attributeBuffer); /* advance past the size */ + ++((uint32_t *)attributeBuffer); /* advance past the size */ /* * UnPack common attributes @@ -1176,39 +1081,41 @@ UnpackSearchAttributeBlock( struct vnode *vp, struct attrlist *alist, searchinfo a = alist->commonattr; if ( a != 0 ) { if ( a & ATTR_CMN_NAME ) { - char *s = (char*) attributeBuffer + ((attrreference_t *) attributeBuffer)->attr_dataoffset; - size_t len = ((attrreference_t *) attributeBuffer)->attr_length; + char *s; + u_int32_t len; + + s = (char*) attributeBuffer + ((attrreference_t *) attributeBuffer)->attr_dataoffset; + len = ((attrreference_t *) attributeBuffer)->attr_length; if (len > sizeof(searchInfo->name)) return (EINVAL); - if (VTOVCB(vp)->vcbSigWord == kHFSPlusSigWord) { - size_t ucslen; - /* Convert name to Unicode to match HFS Plus B-Tree names */ + if (hfsmp->hfs_flags & HFS_STANDARD) { + /* Convert name to pascal string to match HFS B-Tree names */ if (len > 0) { - if (utf8_decodestr(s, len-1, (UniChar*)searchInfo->name, &ucslen, - sizeof(searchInfo->name), ':', UTF_DECOMPOSED)) + if (utf8_to_hfs(HFSTOVCB(hfsmp), len-1, s, (u_char*)searchInfo->name) != 0) return (EINVAL); - searchInfo->nameLength = ucslen / sizeof(UniChar); + searchInfo->nameLength = searchInfo->name[0]; } else { - searchInfo->nameLength = 0; + searchInfo->name[0] = searchInfo->nameLength = 0; } - ++((attrreference_t *)attributeBuffer); - + ++((attrreference_t *)attributeBuffer); } else { - /* Convert name to pascal string to match HFS B-Tree names */ + size_t ucslen; + /* Convert name to Unicode to match HFS Plus B-Tree names */ if (len > 0) { - if (utf8_to_hfs(VTOVCB(vp), len-1, s, (u_char*)searchInfo->name) != 0) + if (utf8_decodestr(s, len-1, (UniChar*)searchInfo->name, &ucslen, + sizeof(searchInfo->name), ':', UTF_DECOMPOSED)) return (EINVAL); - searchInfo->nameLength = searchInfo->name[0]; + searchInfo->nameLength = ucslen / sizeof(UniChar); } else { - searchInfo->name[0] = searchInfo->nameLength = 0; + searchInfo->nameLength = 0; } - ++((attrreference_t *)attributeBuffer); + ++((attrreference_t *)attributeBuffer); } } if ( a & ATTR_CMN_OBJID ) { @@ -1220,28 +1127,73 @@ UnpackSearchAttributeBlock( struct vnode *vp, struct attrlist *alist, searchinfo ++((fsobj_id_t *)attributeBuffer); } if ( a & ATTR_CMN_CRTIME ) { - searchInfo->creationDate = *((struct timespec *)attributeBuffer); - ++((struct timespec *)attributeBuffer); + if (is_64_bit) { + struct user_timespec tmp; + tmp = *((struct user_timespec *)attributeBuffer); + searchInfo->creationDate.tv_sec = (time_t)tmp.tv_sec; + searchInfo->creationDate.tv_nsec = tmp.tv_nsec; + ++((struct user_timespec *)attributeBuffer); + } + else { + searchInfo->creationDate = *((struct timespec *)attributeBuffer); + ++((struct timespec *)attributeBuffer); + } } if ( a & ATTR_CMN_MODTIME ) { - searchInfo->modificationDate = *((struct timespec *)attributeBuffer); - ++((struct timespec *)attributeBuffer); + if (is_64_bit) { + struct user_timespec tmp; + tmp = *((struct user_timespec *)attributeBuffer); + searchInfo->modificationDate.tv_sec = (time_t)tmp.tv_sec; + searchInfo->modificationDate.tv_nsec = tmp.tv_nsec; + ++((struct user_timespec *)attributeBuffer); + } + else { + searchInfo->modificationDate = *((struct timespec *)attributeBuffer); + ++((struct timespec *)attributeBuffer); + } } if ( a & ATTR_CMN_CHGTIME ) { - searchInfo->changeDate = *((struct timespec *)attributeBuffer); - ++((struct timespec *)attributeBuffer); + if (is_64_bit) { + struct user_timespec tmp; + tmp = *((struct 
user_timespec *)attributeBuffer); + searchInfo->changeDate.tv_sec = (time_t)tmp.tv_sec; + searchInfo->changeDate.tv_nsec = tmp.tv_nsec; + ++((struct user_timespec *)attributeBuffer); + } + else { + searchInfo->changeDate = *((struct timespec *)attributeBuffer); + ++((struct timespec *)attributeBuffer); + } } if ( a & ATTR_CMN_ACCTIME ) { - searchInfo->accessDate = *((struct timespec *)attributeBuffer); - ++((struct timespec *)attributeBuffer); + if (is_64_bit) { + struct user_timespec tmp; + tmp = *((struct user_timespec *)attributeBuffer); + searchInfo->accessDate.tv_sec = (time_t)tmp.tv_sec; + searchInfo->accessDate.tv_nsec = tmp.tv_nsec; + ++((struct user_timespec *)attributeBuffer); + } + else { + searchInfo->accessDate = *((struct timespec *)attributeBuffer); + ++((struct timespec *)attributeBuffer); + } } if ( a & ATTR_CMN_BKUPTIME ) { - searchInfo->lastBackupDate = *((struct timespec *)attributeBuffer); - ++((struct timespec *)attributeBuffer); + if (is_64_bit) { + struct user_timespec tmp; + tmp = *((struct user_timespec *)attributeBuffer); + searchInfo->lastBackupDate.tv_sec = (time_t)tmp.tv_sec; + searchInfo->lastBackupDate.tv_nsec = tmp.tv_nsec; + ++((struct user_timespec *)attributeBuffer); + } + else { + searchInfo->lastBackupDate = *((struct timespec *)attributeBuffer); + ++((struct timespec *)attributeBuffer); + } } if ( a & ATTR_CMN_FNDRINFO ) { - bcopy( attributeBuffer, searchInfo->finderInfo, sizeof(u_long) * 8 ); - (u_long *)attributeBuffer += 8; + bcopy( attributeBuffer, searchInfo->finderInfo, sizeof(searchInfo->finderInfo) ); + (uint8_t *)attributeBuffer += 32; } if ( a & ATTR_CMN_OWNERID ) { searchInfo->uid = *((uid_t *)attributeBuffer); @@ -1260,8 +1212,8 @@ UnpackSearchAttributeBlock( struct vnode *vp, struct attrlist *alist, searchinfo a = alist->dirattr; if ( a != 0 ) { if ( a & ATTR_DIR_ENTRYCOUNT ) { - searchInfo->d.numFiles = *((u_long *)attributeBuffer); - ++((u_long *)attributeBuffer); + searchInfo->d.numFiles = *((u_int32_t *)attributeBuffer); + ++((u_int32_t *)attributeBuffer); } } diff --git a/bsd/hfs/hfs_vfsops.c b/bsd/hfs/hfs_vfsops.c index af66e398b..f6569bf71 100644 --- a/bsd/hfs/hfs_vfsops.c +++ b/bsd/hfs/hfs_vfsops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -66,21 +66,22 @@ */ #include <sys/param.h> #include <sys/systm.h> +#include <sys/kauth.h> #include <sys/ubc.h> -#include <sys/namei.h> -#include <sys/vnode.h> -#include <sys/mount.h> +#include <sys/vnode_internal.h> +#include <sys/mount_internal.h> #include <sys/sysctl.h> #include <sys/malloc.h> #include <sys/stat.h> -#include <sys/lock.h> #include <sys/quota.h> #include <sys/disk.h> #include <sys/paths.h> #include <sys/utfconv.h> +#include <sys/kdebug.h> + +#include <kern/locks.h> -// XXXdbg #include <vfs/vfs_journal.h> #include <miscfs/specfs/specdev.h> @@ -91,6 +92,7 @@ #include "hfs_cnode.h" #include "hfs_dbg.h" #include "hfs_endian.h" +#include "hfs_hotfiles.h" #include "hfs_quota.h" #include "hfscommon/headers/FileMgrInternal.h" @@ -103,65 +105,60 @@ int hfs_dbg_err = 0; #endif +lck_grp_attr_t * hfs_group_attr; +lck_attr_t * hfs_lock_attr; +lck_grp_t * hfs_mutex_group; +lck_grp_t * hfs_rwlock_group; + + extern struct vnodeopv_desc hfs_vnodeop_opv_desc; extern void hfs_converterinit(void); -extern void inittodr( time_t base); +extern void inittodr(time_t base); +extern int hfs_write_access(struct vnode *, kauth_cred_t, struct proc *, Boolean); -static int hfs_changefs __P((struct mount *mp, struct hfs_mount_args *args, - struct proc *p)); -static int hfs_reload __P((struct mount *mp, struct ucred *cred, struct proc *p)); -static int hfs_mountfs __P((struct vnode *devvp, struct mount *mp, struct proc *p, - struct hfs_mount_args *args)); -static int hfs_statfs __P((struct mount *mp, register struct statfs *sbp, - struct proc *p)); -static int hfs_flushfiles __P((struct mount *, int, struct proc *)); +static int hfs_changefs(struct mount *mp, struct hfs_mount_args *args); +static int hfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, vfs_context_t context); +static int hfs_flushfiles(struct mount *, int, struct proc *); +static int hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush); +static int hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp); +static int hfs_init(struct vfsconf *vfsp); +static int hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context); +static int hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, vfs_context_t context); +static int hfs_reload(struct mount *mp, kauth_cred_t cred, struct proc *p); +static int hfs_vfs_root(struct mount *mp, struct vnode **vpp, vfs_context_t context); +static int hfs_quotactl(struct mount *, int, uid_t, caddr_t, vfs_context_t context); +static int hfs_start(struct mount *mp, int flags, vfs_context_t context); +static int hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, vfs_context_t context); +static int hfs_sync(struct mount *mp, int waitfor, vfs_context_t context); +static int hfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, vfs_context_t context); +static int hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context); +static int hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context); +static int hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, vfs_context_t context); + +static int hfs_reclaimspace(struct hfsmount *hfsmp, u_long startblk); -static int hfs_extendfs __P((struct mount *, u_int64_t, struct proc *)); /* * Called by vfs_mountroot when mounting HFS Plus as root. 
*/ + __private_extern__ int -hfs_mountroot() +hfs_mountroot(mount_t mp, vnode_t rvp, vfs_context_t context) { - extern struct vnode *rootvp; - struct mount *mp; - struct proc *p = current_proc(); /* XXX */ struct hfsmount *hfsmp; ExtendedVCB *vcb; + struct vfsstatfs *vfsp; int error; - /* - * Get vnode for rootdev. - */ - if ((error = bdevvp(rootdev, &rootvp))) { - printf("hfs_mountroot: can't setup bdevvp"); + if ((error = hfs_mountfs(rvp, mp, NULL, context))) return (error); - } - if ((error = vfs_rootmountalloc("hfs", "root_device", &mp))) { - vrele(rootvp); /* release the reference from bdevvp() */ - return (error); - } - if ((error = hfs_mountfs(rootvp, mp, p, NULL))) { - mp->mnt_vfc->vfc_refcount--; - - if (mp->mnt_kern_flag & MNTK_IO_XINFO) - FREE(mp->mnt_xinfo_ptr, M_TEMP); - vfs_unbusy(mp, p); - vrele(rootvp); /* release the reference from bdevvp() */ - FREE_ZONE(mp, sizeof (struct mount), M_MOUNT); - return (error); - } - simple_lock(&mountlist_slock); - CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); - simple_unlock(&mountlist_slock); - /* Init hfsmp */ hfsmp = VFSTOHFS(mp); @@ -175,10 +172,9 @@ hfs_mountroot() vcb->reserveBlocks = ((u_int64_t)vcb->totalBlocks * HFS_MINFREE) / 100; vcb->reserveBlocks = MIN(vcb->reserveBlocks, HFS_MAXRESERVE / vcb->blockSize); - (void)hfs_statfs(mp, &mp->mnt_stat, p); - - vfs_unbusy(mp, p); - inittodr(HFSTOVCB(hfsmp)->vcbLsMod); + vfsp = vfs_statfs(mp); + (void)hfs_statfs(mp, vfsp, NULL); + return (0); } @@ -190,57 +186,62 @@ hfs_mountroot() */ static int -hfs_mount(mp, path, data, ndp, p) - register struct mount *mp; - char *path; - caddr_t data; - struct nameidata *ndp; - struct proc *p; +hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context) { + struct proc *p = vfs_context_proc(context); struct hfsmount *hfsmp = NULL; - struct vnode *devvp; struct hfs_mount_args args; - size_t size; int retval = E_NONE; - int flags; - mode_t accessmode; - - if ((retval = copyin(data, (caddr_t)&args, sizeof(args)))) - goto error_exit; + uint32_t cmdflags; - /* - * If updating, check whether changing from read-only to - * read/write; if there is no device name, that's all we do. - */ - if (mp->mnt_flag & MNT_UPDATE) { - + if ((retval = copyin(data, (caddr_t)&args, sizeof(args)))) { + return (retval); + } + cmdflags = (uint32_t)vfs_flags(mp) & MNT_CMDFLAGS; + if (cmdflags & MNT_UPDATE) { hfsmp = VFSTOHFS(mp); + + /* Reload incore data after an fsck. */ + if (cmdflags & MNT_RELOAD) { + if (vfs_isrdonly(mp)) + return hfs_reload(mp, vfs_context_ucred(context), p); + else + return (EINVAL); + } + + /* Change to a read-only file system. 
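The read-only downgrade handled in the update path below is order-sensitive; collapsing the added lines, the sequence is:

    retval = VFS_SYNC(mp, MNT_WAIT, context);	/* push out System (btree) files */
    retval = hfs_flushfiles(mp, flags, p);	/* flags = WRITECLOSE [| FORCECLOSE] */
    hfsmp->hfs_flags |= HFS_READ_ONLY;		/* fence off new writes */
    retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
    retval = VNOP_FSYNC(hfsmp->hfs_devvp, MNT_WAIT, context);	/* volume bitmap blocks */
    /* any failure clears HFS_READ_ONLY again and aborts the update */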
*/ if (((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) && - (mp->mnt_flag & MNT_RDONLY)) { - + vfs_isrdonly(mp)) { + int flags; + /* use VFS_SYNC to push out System (btree) files */ - retval = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p); - if (retval && ((mp->mnt_flag & MNT_FORCE) == 0)) - goto error_exit; + retval = VFS_SYNC(mp, MNT_WAIT, context); + if (retval && ((cmdflags & MNT_FORCE) == 0)) + goto out; flags = WRITECLOSE; - if (mp->mnt_flag & MNT_FORCE) + if (cmdflags & MNT_FORCE) flags |= FORCECLOSE; if ((retval = hfs_flushfiles(mp, flags, p))) - goto error_exit; + goto out; hfsmp->hfs_flags |= HFS_READ_ONLY; retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); /* also get the volume bitmap blocks */ - if (!retval) - retval = VOP_FSYNC(hfsmp->hfs_devvp, NOCRED, MNT_WAIT, p); - + if (!retval) { + if (vnode_mount(hfsmp->hfs_devvp) == mp) { + retval = hfs_fsync(hfsmp->hfs_devvp, MNT_WAIT, 0, p); + } else { + vnode_get(hfsmp->hfs_devvp); + retval = VNOP_FSYNC(hfsmp->hfs_devvp, MNT_WAIT, context); + vnode_put(hfsmp->hfs_devvp); + } + } if (retval) { hfsmp->hfs_flags &= ~HFS_READ_ONLY; - goto error_exit; + goto out; } - if (hfsmp->jnl) { hfs_global_exclusive_lock_acquire(hfsmp); @@ -255,29 +256,11 @@ hfs_mount(mp, path, data, ndp, p) } } - if ((mp->mnt_flag & MNT_RELOAD) && - (retval = hfs_reload(mp, ndp->ni_cnd.cn_cred, p))) - goto error_exit; - - if ((hfsmp->hfs_flags & HFS_READ_ONLY) && - (mp->mnt_kern_flag & MNTK_WANTRDWR)) { - /* - * If upgrade to read-write by non-root, then verify - * that user has necessary permissions on the device. - */ - if (p->p_ucred->cr_uid != 0) { - devvp = hfsmp->hfs_devvp; - vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); - if ((retval = VOP_ACCESS(devvp, VREAD | VWRITE, p->p_ucred, p))) { - VOP_UNLOCK(devvp, 0, p); - goto error_exit; - } - VOP_UNLOCK(devvp, 0, p); - } + /* Change to a writable file system. */ + if (vfs_iswriteupgrade(mp)) { retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); - if (retval != E_NONE) - goto error_exit; + goto out; // If the journal was shut-down previously because we were // asked to be read-only, let's start it back up again now @@ -285,12 +268,12 @@ hfs_mount(mp, path, data, ndp, p) if ( (HFSTOVCB(hfsmp)->vcbAtrb & kHFSVolumeJournaledMask) && hfsmp->jnl == NULL && hfsmp->jvp != NULL) { - int flags; + int jflags; if (hfsmp->hfs_flags & HFS_NEED_JNL_RESET) { - flags = JOURNAL_RESET; + jflags = JOURNAL_RESET; } else { - flags = 0; + jflags = 0; } hfs_global_exclusive_lock_acquire(hfsmp); @@ -300,7 +283,7 @@ hfs_mount(mp, path, data, ndp, p) hfsmp->jnl_size, hfsmp->hfs_devvp, hfsmp->hfs_phys_block_size, - flags, + jflags, 0, hfs_sync_metadata, hfsmp->hfs_mp); @@ -308,7 +291,7 @@ hfs_mount(mp, path, data, ndp, p) if (hfsmp->jnl == NULL) { retval = EINVAL; - goto error_exit; + goto out; } else { hfsmp->hfs_flags &= ~HFS_NEED_JNL_RESET; } @@ -317,141 +300,125 @@ hfs_mount(mp, path, data, ndp, p) /* Only clear HFS_READ_ONLY after a successfull write */ hfsmp->hfs_flags &= ~HFS_READ_ONLY; - } - if (((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) && - (HFSTOVCB(hfsmp)->vcbSigWord == kHFSPlusSigWord)) { - /* setup private/hidden directory for unlinked files */ - FindMetaDataDirectory(HFSTOVCB(hfsmp)); - if (hfsmp->jnl) + if (!(hfsmp->hfs_flags & (HFS_READ_ONLY & HFS_STANDARD))) { + /* setup private/hidden directory for unlinked files */ + FindMetaDataDirectory(HFSTOVCB(hfsmp)); hfs_remove_orphans(hfsmp); - - /* - * Allow hot file clustering if conditions allow. 
- */ - if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) && - (mp->mnt_flag & MNT_RDONLY) && - (mp->mnt_kern_flag & MNTK_WANTRDWR)) { - (void) hfs_recording_init(hfsmp, p); + + /* + * Allow hot file clustering if conditions allow. + */ + if (hfsmp->hfs_flags & HFS_METADATA_ZONE) { + (void) hfs_recording_init(hfsmp); + } } } - if (args.fspec == 0) { - /* - * Process export requests. - */ - return vfs_export(mp, &hfsmp->hfs_export, &args.export); - } - } + /* Update file system parameters. */ + retval = hfs_changefs(mp, &args); - /* - * Not an update, or updating the name: look up the name - * and verify that it refers to a sensible block device. - */ - NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); - retval = namei(ndp); - if (retval != E_NONE) { - DBG_ERR(("hfs_mount: CAN'T GET DEVICE: %s, %x\n", args.fspec, ndp->ni_vp->v_rdev)); - goto error_exit; - } + } else /* not an update request */ { - devvp = ndp->ni_vp; + /* Set the mount flag to indicate that we support volfs */ + vfs_setflags(mp, (uint64_t)((unsigned int)MNT_DOVOLFS)); - if (devvp->v_type != VBLK) { - vrele(devvp); - retval = ENOTBLK; - goto error_exit; + retval = hfs_mountfs(devvp, mp, &args, context); } - if (major(devvp->v_rdev) >= nblkdev) { - vrele(devvp); - retval = ENXIO; - goto error_exit; +out: + if (retval == 0) { + (void)hfs_statfs(mp, vfs_statfs(mp), context); } + return (retval); +} - /* - * If mount by non-root, then verify that user has necessary - * permissions on the device. - */ - if (p->p_ucred->cr_uid != 0) { - accessmode = VREAD; - if ((mp->mnt_flag & MNT_RDONLY) == 0) - accessmode |= VWRITE; - vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); - if ((retval = VOP_ACCESS(devvp, accessmode, p->p_ucred, p))) { - vput(devvp); - goto error_exit; - } - VOP_UNLOCK(devvp, 0, p); - } - if ((mp->mnt_flag & MNT_UPDATE) == 0) { - retval = hfs_mountfs(devvp, mp, p, &args); - if (retval != E_NONE) - vrele(devvp); - } else { - if (devvp != hfsmp->hfs_devvp) - retval = EINVAL; /* needs translation */ - else - retval = hfs_changefs(mp, &args, p); - vrele(devvp); - } +struct hfs_changefs_cargs { + struct hfsmount *hfsmp; + int namefix; + int permfix; + int permswitch; +}; - if (retval != E_NONE) { - goto error_exit; - } +static int +hfs_changefs_callback(struct vnode *vp, void *cargs) +{ + ExtendedVCB *vcb; + struct cnode *cp; + struct cat_desc cndesc; + struct cat_attr cnattr; + struct hfs_changefs_cargs *args; - /* Set the mount flag to indicate that we support volfs */ - mp->mnt_flag |= MNT_DOVOLFS; - if (VFSTOVCB(mp)->vcbSigWord == kHFSSigWord) { - /* HFS volumes only want roman-encoded names: */ - mp->mnt_flag |= MNT_FIXEDSCRIPTENCODING; - } - (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN-1, &size); + args = (struct hfs_changefs_cargs *)cargs; - bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); - (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); - bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); - (void)hfs_statfs(mp, &mp->mnt_stat, p); - return (E_NONE); + cp = VTOC(vp); + vcb = HFSTOVCB(args->hfsmp); -error_exit: + if (cat_lookup(args->hfsmp, &cp->c_desc, 0, &cndesc, &cnattr, NULL, NULL)) { + /* + * If we couldn't find this guy skip to the next one + */ + if (args->namefix) + cache_purge(vp); - return (retval); -} + return (VNODE_RETURNED); + } + /* + * Get the real uid/gid and perm mask from disk. 
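hfs_changefs_callback() here (and hfs_reload_callback() below) follow the vnode_iterate() contract: the iterator presents each vnode already referenced, and the callback's return value steers the walk. A skeleton, with my_cargs and do_work as hypothetical stand-ins:

    struct my_cargs { int error; };		/* hypothetical per-walk state */

    static int
    example_callback(struct vnode *vp, void *cargs)
    {
    	struct my_cargs *args = (struct my_cargs *)cargs;

    	if ((args->error = do_work(vp)) != 0)	/* do_work is hypothetical */
    		return (VNODE_RETURNED_DONE);	/* stop the iteration */
    	return (VNODE_RETURNED);		/* continue with the next vnode */
    }

    /* driven as: vnode_iterate(mp, 0, example_callback, &args); */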
+ */ + if (args->permswitch || args->permfix) { + cp->c_uid = cnattr.ca_uid; + cp->c_gid = cnattr.ca_gid; + cp->c_mode = cnattr.ca_mode; + } + /* + * If we're switching name converters then... + * Remove the existing entry from the namei cache. + * Update name to one based on new encoder. + */ + if (args->namefix) { + cache_purge(vp); + replace_desc(cp, &cndesc); + if (cndesc.cd_cnid == kHFSRootFolderID) { + strncpy(vcb->vcbVN, cp->c_desc.cd_nameptr, NAME_MAX); + cp->c_desc.cd_encoding = args->hfsmp->hfs_encoding; + } + } else { + cat_releasedesc(&cndesc); + } + return (VNODE_RETURNED); +} /* Change fs mount parameters */ static int -hfs_changefs(mp, args, p) - struct mount *mp; - struct hfs_mount_args *args; - struct proc *p; +hfs_changefs(struct mount *mp, struct hfs_mount_args *args) { int retval = 0; int namefix, permfix, permswitch; struct hfsmount *hfsmp; - struct cnode *cp; ExtendedVCB *vcb; - register struct vnode *vp, *nvp; hfs_to_unicode_func_t get_unicode_func; unicode_to_hfs_func_t get_hfsname_func; - struct cat_desc cndesc; - struct cat_attr cnattr; - u_long old_encoding; + u_long old_encoding = 0; + struct hfs_changefs_cargs cargs; + uint32_t mount_flags; hfsmp = VFSTOHFS(mp); vcb = HFSTOVCB(hfsmp); + mount_flags = (unsigned int)vfs_flags(mp); + permswitch = (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) && - ((mp->mnt_flag & MNT_UNKNOWNPERMISSIONS) == 0)) || + ((mount_flags & MNT_UNKNOWNPERMISSIONS) == 0)) || (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) == 0) && - (mp->mnt_flag & MNT_UNKNOWNPERMISSIONS))); + (mount_flags & MNT_UNKNOWNPERMISSIONS))); /* The root filesystem must operate with actual permissions: */ - if (permswitch && (mp->mnt_flag & MNT_ROOTFS) && (mp->mnt_flag & MNT_UNKNOWNPERMISSIONS)) { - mp->mnt_flag &= ~MNT_UNKNOWNPERMISSIONS; /* Just say "No". */ + if (permswitch && (mount_flags & MNT_ROOTFS) && (mount_flags & MNT_UNKNOWNPERMISSIONS)) { + vfs_clearflags(mp, (uint64_t)((unsigned int)MNT_UNKNOWNPERMISSIONS)); /* Just say "No". */ return EINVAL; } - if (mp->mnt_flag & MNT_UNKNOWNPERMISSIONS) + if (mount_flags & MNT_UNKNOWNPERMISSIONS) hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS; else hfsmp->hfs_flags &= ~HFS_UNKNOWN_PERMS; @@ -466,12 +433,12 @@ hfs_changefs(mp, args, p) /* Change the default uid, gid and/or mask */ if ((args->hfs_uid != (uid_t)VNOVAL) && (hfsmp->hfs_uid != args->hfs_uid)) { hfsmp->hfs_uid = args->hfs_uid; - if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSPlusSigWord) + if (vcb->vcbSigWord == kHFSPlusSigWord) ++permfix; } if ((args->hfs_gid != (gid_t)VNOVAL) && (hfsmp->hfs_gid != args->hfs_gid)) { hfsmp->hfs_gid = args->hfs_gid; - if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSPlusSigWord) + if (vcb->vcbSigWord == kHFSPlusSigWord) ++permfix; } if (args->hfs_mask != (mode_t)VNOVAL) { @@ -480,13 +447,13 @@ hfs_changefs(mp, args, p) hfsmp->hfs_file_mask = args->hfs_mask & ALLPERMS; if ((args->flags != VNOVAL) && (args->flags & HFSFSMNT_NOXONFILES)) hfsmp->hfs_file_mask = (args->hfs_mask & DEFFILEMODE); - if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSPlusSigWord) + if (vcb->vcbSigWord == kHFSPlusSigWord) ++permfix; } } /* Change the hfs encoding value (hfs only) */ - if ((HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) && + if ((vcb->vcbSigWord == kHFSSigWord) && (args->hfs_encoding != (u_long)VNOVAL) && (hfsmp->hfs_encoding != args->hfs_encoding)) { @@ -513,77 +480,30 @@ hfs_changefs(mp, args, p) if (!(namefix || permfix || permswitch)) goto exit; + /* XXX 3762912 hack to support HFS filesystem 'owner' */ + if (permfix) + vfs_setowner(mp, + hfsmp->hfs_uid == UNKNOWNUID ? 
KAUTH_UID_NONE : hfsmp->hfs_uid, + hfsmp->hfs_gid == UNKNOWNGID ? KAUTH_GID_NONE : hfsmp->hfs_gid); + /* * For each active vnode fix things that changed * * Note that we can visit a vnode more than once * and we can race with fsync. + * + * hfs_changefs_callback will be called for each vnode + * hung off of this mount point + * the vnode will be + * properly referenced and unreferenced around the callback */ - simple_lock(&mntvnode_slock); -loop: - for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { - /* - * If the vnode that we are about to fix is no longer - * associated with this mount point, start over. - */ - if (vp->v_mount != mp) - goto loop; - - simple_lock(&vp->v_interlock); - nvp = vp->v_mntvnodes.le_next; - if (vp->v_flag & VSYSTEM) { - simple_unlock(&vp->v_interlock); - continue; - } - simple_unlock(&mntvnode_slock); - retval = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, p); - if (retval) { - simple_lock(&mntvnode_slock); - if (retval == ENOENT) - goto loop; - continue; - } - - cp = VTOC(vp); - - retval = cat_lookup(hfsmp, &cp->c_desc, 0, &cndesc, &cnattr, NULL); - /* If we couldn't find this guy skip to the next one */ - if (retval) { - if (namefix) - cache_purge(vp); - vput(vp); - simple_lock(&mntvnode_slock); - continue; - } - - /* Get the real uid/gid and perm mask from disk. */ - if (permswitch || permfix) { - cp->c_uid = cnattr.ca_uid; - cp->c_gid = cnattr.ca_gid; - cp->c_mode = cnattr.ca_mode; - } - - /* - * If we're switching name converters then... - * Remove the existing entry from the namei cache. - * Update name to one based on new encoder. - */ - if (namefix) { - cache_purge(vp); - replace_desc(cp, &cndesc); + cargs.hfsmp = hfsmp; + cargs.namefix = namefix; + cargs.permfix = permfix; + cargs.permswitch = permswitch; - if (cndesc.cd_cnid == kHFSRootFolderID) { - strncpy(vcb->vcbVN, cp->c_desc.cd_nameptr, NAME_MAX); - cp->c_desc.cd_encoding = hfsmp->hfs_encoding; - } - } else { - cat_releasedesc(&cndesc); - } - vput(vp); - simple_lock(&mntvnode_slock); + vnode_iterate(mp, 0, hfs_changefs_callback, (void *)&cargs); - } /* end for (vp...) */ - simple_unlock(&mntvnode_slock); /* * If we're switching name converters we can now * connect the new hfs_get_hfsname converter and @@ -599,6 +519,51 @@ exit: } +struct hfs_reload_cargs { + struct hfsmount *hfsmp; + kauth_cred_t cred; + struct proc *p; + int error; +}; + +static int +hfs_reload_callback(struct vnode *vp, void *cargs) +{ + struct cnode *cp; + struct hfs_reload_cargs *args; + + args = (struct hfs_reload_cargs *)cargs; + /* + * flush all the buffers associated with this node + */ + (void) buf_invalidateblks(vp, 0, 0, 0); + + cp = VTOC(vp); + /* + * Remove any directory hints + */ + if (vnode_isdir(vp)) + hfs_reldirhints(cp, 0); + + /* + * Re-read cnode data for all active vnodes (non-metadata files). + */ + if (!VNODE_IS_RSRC(vp)) { + struct cat_fork *datafork; + struct cat_desc desc; + + datafork = cp->c_datafork ? &cp->c_datafork->ff_data : NULL; + + /* lookup by fileID since name could have changed */ + if ((args->error = cat_idlookup(args->hfsmp, cp->c_fileid, &desc, &cp->c_attr, datafork))) + return (VNODE_RETURNED_DONE); + + /* update cnode's catalog descriptor */ + (void) replace_desc(cp, &desc); + } + return (VNODE_RETURNED); +} + /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must @@ -614,13 +579,9 @@ exit: * re-read cnode data for all active vnodes. 
*/ static int -hfs_reload(mountp, cred, p) - register struct mount *mountp; - struct ucred *cred; - struct proc *p; +hfs_reload(struct mount *mountp, kauth_cred_t cred, struct proc *p) { - register struct vnode *vp, *nvp, *devvp; - struct cnode *cp; + register struct vnode *devvp; struct buf *bp; int sectorsize; int error, i; @@ -629,9 +590,8 @@ hfs_reload(mountp, cred, p) ExtendedVCB *vcb; struct filefork *forkp; struct cat_desc cndesc; - - if ((mountp->mnt_flag & MNT_RDONLY) == 0) - return (EINVAL); + struct hfs_reload_cargs args; + int lockflags; hfsmp = VFSTOHFS(mountp); vcb = HFSTOVCB(hfsmp); @@ -643,75 +603,42 @@ hfs_reload(mountp, cred, p) * Invalidate all cached meta-data. */ devvp = hfsmp->hfs_devvp; - if (vinvalbuf(devvp, 0, cred, p, 0, 0)) + if (buf_invalidateblks(devvp, 0, 0, 0)) panic("hfs_reload: dirty1"); - InvalidateCatalogCache(vcb); - -loop: - simple_lock(&mntvnode_slock); - for (vp = mountp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { - if (vp->v_mount != mountp) { - simple_unlock(&mntvnode_slock); - goto loop; - } - nvp = vp->v_mntvnodes.le_next; - - /* - * Invalidate all inactive vnodes. - */ - if (vrecycle(vp, &mntvnode_slock, p)) - goto loop; - - /* - * Invalidate all cached file data. - */ - simple_lock(&vp->v_interlock); - simple_unlock(&mntvnode_slock); - if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) { - goto loop; - } - if (vinvalbuf(vp, 0, cred, p, 0, 0)) - panic("hfs_reload: dirty2"); - - /* - * Re-read cnode data for all active vnodes (non-metadata files). - */ - cp = VTOC(vp); - if ((vp->v_flag & VSYSTEM) == 0 && !VNODE_IS_RSRC(vp)) { - struct cat_fork *datafork; - struct cat_desc desc; - datafork = cp->c_datafork ? &cp->c_datafork->ff_data : NULL; - - /* lookup by fileID since name could have changed */ - if ((error = cat_idlookup(hfsmp, cp->c_fileid, &desc, &cp->c_attr, datafork))) { - vput(vp); - return (error); - } + args.hfsmp = hfsmp; + args.cred = cred; + args.p = p; + args.error = 0; + /* + * hfs_reload_callback will be called for each vnode + * hung off of this mount point that can't be recycled... + * vnode_iterate will recycle those that it can (the VNODE_RELOAD option) + * the vnode will be in an 'unbusy' state (VNODE_WAIT) and + * properly referenced and unreferenced around the callback + */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + vnode_iterate(mountp, VNODE_RELOAD | VNODE_WAIT, hfs_reload_callback, (void *)&args); + hfs_systemfile_unlock(hfsmp, lockflags); - /* update cnode's catalog descriptor */ - (void) replace_desc(cp, &desc); - } - vput(vp); - simple_lock(&mntvnode_slock); - } - simple_unlock(&mntvnode_slock); + if (args.error) + return (args.error); /* * Re-read VolumeHeader from disk. 
*/ sectorsize = hfsmp->hfs_phys_block_size; - error = meta_bread(hfsmp->hfs_devvp, - (vcb->hfsPlusIOPosOffset / sectorsize) + HFS_PRI_SECTOR(sectorsize), + error = (int)buf_meta_bread(hfsmp->hfs_devvp, + (daddr64_t)((vcb->hfsPlusIOPosOffset / sectorsize) + HFS_PRI_SECTOR(sectorsize)), sectorsize, NOCRED, &bp); if (error) { if (bp != NULL) - brelse(bp); + buf_brelse(bp); return (error); } - vhp = (HFSPlusVolumeHeader *) (bp->b_data + HFS_PRI_OFFSET(sectorsize)); + vhp = (HFSPlusVolumeHeader *) (buf_dataptr(bp) + HFS_PRI_OFFSET(sectorsize)); /* Do a quick sanity check */ if ((SWAP_BE16(vhp->signature) != kHFSPlusSigWord && @@ -719,12 +646,12 @@ loop: (SWAP_BE16(vhp->version) != kHFSPlusVersion && SWAP_BE16(vhp->version) != kHFSXVersion) || SWAP_BE32(vhp->blockSize) != vcb->blockSize) { - brelse(bp); + buf_brelse(bp); return (EIO); } vcb->vcbLsMod = to_bsd_time(SWAP_BE32(vhp->modifyDate)); - vcb->vcbAtrb = (UInt16) SWAP_BE32 (vhp->attributes); /* VCB only uses lower 16 bits */ + vcb->vcbAtrb = SWAP_BE32 (vhp->attributes); vcb->vcbJinfoBlock = SWAP_BE32(vhp->journalInfoBlock); vcb->vcbClpSiz = SWAP_BE32 (vhp->rsrcClumpSize); vcb->vcbNxtCNID = SWAP_BE32 (vhp->nextCatalogID); @@ -765,6 +692,18 @@ loop: forkp->ff_blocks = SWAP_BE32 (vhp->catalogFile.totalBlocks); forkp->ff_clumpsize = SWAP_BE32 (vhp->catalogFile.clumpSize); + if (hfsmp->hfs_attribute_vp) { + forkp = VTOF(hfsmp->hfs_attribute_vp); + for (i = 0; i < kHFSPlusExtentDensity; i++) { + forkp->ff_extents[i].startBlock = + SWAP_BE32 (vhp->attributesFile.extents[i].startBlock); + forkp->ff_extents[i].blockCount = + SWAP_BE32 (vhp->attributesFile.extents[i].blockCount); + } + forkp->ff_size = SWAP_BE64 (vhp->attributesFile.logicalSize); + forkp->ff_blocks = SWAP_BE32 (vhp->attributesFile.totalBlocks); + forkp->ff_clumpsize = SWAP_BE32 (vhp->attributesFile.clumpSize); + } forkp = VTOF((struct vnode *)vcb->allocationsRefNum); for (i = 0; i < kHFSPlusExtentDensity; i++) { @@ -777,20 +716,26 @@ loop: forkp->ff_blocks = SWAP_BE32 (vhp->allocationFile.totalBlocks); forkp->ff_clumpsize = SWAP_BE32 (vhp->allocationFile.clumpSize); - brelse(bp); + buf_brelse(bp); vhp = NULL; /* * Re-load B-tree header data */ forkp = VTOF((struct vnode *)vcb->extentsRefNum); - if (error = MacToVFSError( BTReloadData((FCB*)forkp) )) + if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) ) return (error); forkp = VTOF((struct vnode *)vcb->catalogRefNum); - if (error = MacToVFSError( BTReloadData((FCB*)forkp) )) + if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) ) return (error); + if (hfsmp->hfs_attribute_vp) { + forkp = VTOF(hfsmp->hfs_attribute_vp); + if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) ) + return (error); + } + /* Reload the volume name */ if ((error = cat_idlookup(hfsmp, kHFSRootFolderID, &cndesc, NULL, NULL))) return (error); @@ -808,91 +753,14 @@ loop: } -static int -get_raw_device(char *fspec, int is_user, int ronly, struct vnode **rvp, struct ucred *cred, struct proc *p) -{ - char *rawbuf; - char *dp; - size_t namelen; - struct nameidata nd; - int retval; - - *rvp = NULL; - - MALLOC(rawbuf, char *, MAXPATHLEN, M_HFSMNT, M_WAITOK); - if (rawbuf == NULL) { - retval = ENOMEM; - goto error_exit; - } - - if (is_user) { - retval = copyinstr(fspec, rawbuf, MAXPATHLEN - 1, &namelen); - if (retval != E_NONE) { - FREE(rawbuf, M_HFSMNT); - goto error_exit; - } - } else { - strcpy(rawbuf, fspec); - namelen = strlen(rawbuf); - } - - /* make sure it's null terminated */ - rawbuf[MAXPATHLEN-1] = '\0'; - - dp = &rawbuf[namelen-1]; - 
while(dp >= rawbuf && *dp != '/') { - dp--; - } - - if (dp != NULL) { - dp++; - } else { - dp = rawbuf; - } - - /* make room for and insert the 'r' for the raw device */ - memmove(dp+1, dp, strlen(dp)+1); - *dp = 'r'; - - NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, rawbuf, p); - retval = namei(&nd); - if (retval != E_NONE) { - DBG_ERR(("hfs_mountfs: can't open raw device for journal: %s, %x\n", rawbuf, nd.ni_vp->v_rdev)); - FREE(rawbuf, M_HFSMNT); - goto error_exit; - } - - *rvp = nd.ni_vp; - if ((retval = VOP_OPEN(*rvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p))) { - *rvp = NULL; - goto error_exit; - } - - // don't need this any more - FREE(rawbuf, M_HFSMNT); - - return 0; - - error_exit: - if (*rvp) { - (void)VOP_CLOSE(*rvp, ronly ? FREAD : FREAD|FWRITE, cred, p); - } - - if (rawbuf) { - FREE(rawbuf, M_HFSMNT); - } - return retval; -} - - - /* * Common code for mount and mountroot */ static int -hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p, - struct hfs_mount_args *args) +hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, + vfs_context_t context) { + struct proc *p = vfs_context_proc(context); int retval = E_NONE; struct hfsmount *hfsmp; struct buf *bp; @@ -901,41 +769,29 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p, int ronly; int i; int mntwrapper; - struct ucred *cred; + kauth_cred_t cred; u_int64_t disksize; - u_int64_t blkcnt; + daddr64_t blkcnt; u_int32_t blksize; u_int32_t minblksize; u_int32_t iswritable; - daddr_t mdb_offset; + daddr64_t mdb_offset; - dev = devvp->v_rdev; - cred = p ? p->p_ucred : NOCRED; + ronly = vfs_isrdonly(mp); + dev = vnode_specrdev(devvp); + cred = p ? vfs_context_ucred(context) : NOCRED; mntwrapper = 0; - /* - * Disallow multiple mounts of the same device. - * Disallow mounting of a device that is currently in use - * (except for root, which might share swap device for miniroot). - * Flush out any old buffers remaining from a previous use. - */ - if ((retval = vfs_mountedon(devvp))) - return (retval); - if ((vcount(devvp) > 1) && (devvp != rootvp)) - return (EBUSY); - if ((retval = vinvalbuf(devvp, V_SAVE, cred, p, 0, 0))) - return (retval); - - ronly = (mp->mnt_flag & MNT_RDONLY) != 0; - if ((retval = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p))) - return (retval); bp = NULL; hfsmp = NULL; mdbp = NULL; minblksize = kHFSBlockSize; + /* Advisory locking should be handled at the VFS layer */ + vfs_setlocklocal(mp); + /* Get the real physical block size. */ - if (VOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, cred, p)) { + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, context)) { retval = ENXIO; goto error_exit; } @@ -943,90 +799,88 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p, if (blksize > 512) { u_int32_t size512 = 512; - if (VOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, cred, p)) { + if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, context)) { retval = ENXIO; goto error_exit; } } /* Get the number of 512 byte physical blocks. */ - if (VOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, cred, p)) { + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, context)) { retval = ENXIO; goto error_exit; } /* Compute an accurate disk size (i.e. within 512 bytes) */ - disksize = blkcnt * (u_int64_t)512; + disksize = (u_int64_t)blkcnt * (u_int64_t)512; /* - * There are only 31 bits worth of block count in - * the buffer cache. 
So for large volumes a 4K
-	 * physical block size is needed.
+	 * On Tiger it is not necessary to switch the device
+	 * block size to be 4K if there are more than 31 bits'
+	 * worth of blocks, but to ensure compatibility with
+	 * pre-Tiger systems we have to do it.
 	 */
 	if (blkcnt > (u_int64_t)0x000000007fffffff) {
 		minblksize = blksize = 4096;
 	}
+
 	/* Now switch to our preferred physical block size. */
 	if (blksize > 512) {
-		if (VOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, cred, p)) {
+		if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, context)) {
 			retval = ENXIO;
 			goto error_exit;
 		}
+
 		/* Get the count of physical blocks. */
-		if (VOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, cred, p)) {
+		if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, context)) {
 			retval = ENXIO;
 			goto error_exit;
 		}
 	}
-
 	/*
 	 * At this point:
 	 *	minblksize is the minimum physical block size
 	 *	blksize has our preferred physical block size
 	 *	blkcnt has the total number of physical blocks
 	 */
-	devvp->v_specsize = blksize;
-
-	/* cache the IO attributes */
-	if ((retval = vfs_init_io_attributes(devvp, mp))) {
-		printf("hfs_mountfs: vfs_init_io_attributes returned %d\n",
-			retval);
-		return (retval);
-	}
-
-	mdb_offset = HFS_PRI_SECTOR(blksize);
-	if ((retval = meta_bread(devvp, HFS_PRI_SECTOR(blksize), blksize, cred, &bp))) {
+	mdb_offset = (daddr64_t)HFS_PRI_SECTOR(blksize);
+	if ((retval = (int)buf_meta_bread(devvp, mdb_offset, blksize, cred, &bp))) {
 		goto error_exit;
 	}
 	MALLOC(mdbp, HFSMasterDirectoryBlock *, kMDBSize, M_TEMP, M_WAITOK);
-	bcopy(bp->b_data + HFS_PRI_OFFSET(blksize), mdbp, kMDBSize);
-	brelse(bp);
+	bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(blksize), mdbp, kMDBSize);
+	buf_brelse(bp);
 	bp = NULL;
 
 	MALLOC(hfsmp, struct hfsmount *, sizeof(struct hfsmount), M_HFSMNT, M_WAITOK);
 	bzero(hfsmp, sizeof(struct hfsmount));
 
 	/*
-	 * Init the volume information structure
-	 */
-	mp->mnt_data = (qaddr_t)hfsmp;
+	 *  Init the volume information structure
+	 */
+
+	lck_mtx_init(&hfsmp->hfs_mutex, hfs_mutex_group, hfs_lock_attr);
+	lck_mtx_init(&hfsmp->hfc_mutex, hfs_mutex_group, hfs_lock_attr);
+	lck_rw_init(&hfsmp->hfs_global_lock, hfs_rwlock_group, hfs_lock_attr);
+
+	vfs_setfsprivate(mp, hfsmp);
 	hfsmp->hfs_mp = mp;			/* Make VFSTOHFS work */
-	hfsmp->hfs_vcb.vcb_hfsmp = hfsmp;	/* Make VCBTOHFS work */
-	hfsmp->hfs_raw_dev = devvp->v_rdev;
+	hfsmp->hfs_raw_dev = vnode_specrdev(devvp);
 	hfsmp->hfs_devvp = devvp;
 	hfsmp->hfs_phys_block_size = blksize;
 	hfsmp->hfs_phys_block_count = blkcnt;
 	hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA;
 	if (ronly)
 		hfsmp->hfs_flags |= HFS_READ_ONLY;
-	if (mp->mnt_flag & MNT_UNKNOWNPERMISSIONS)
+	if (((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS)
 		hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS;
 	for (i = 0; i < MAXQUOTAS; i++)
-		hfsmp->hfs_qfiles[i].qf_vp = NULLVP;
+		dqfileinit(&hfsmp->hfs_qfiles[i]);
 
 	if (args) {
 		hfsmp->hfs_uid = (args->hfs_uid == (uid_t)VNOVAL) ? UNKNOWNUID : args->hfs_uid;
 		if (hfsmp->hfs_uid == 0xfffffffd) hfsmp->hfs_uid = UNKNOWNUID;
 		hfsmp->hfs_gid = (args->hfs_gid == (gid_t)VNOVAL) ?
UNKNOWNGID : args->hfs_gid; if (hfsmp->hfs_gid == 0xfffffffd) hfsmp->hfs_gid = UNKNOWNGID; + vfs_setowner(mp, hfsmp->hfs_uid, hfsmp->hfs_gid); /* tell the VFS */ if (args->hfs_mask != (mode_t)VNOVAL) { hfsmp->hfs_dir_mask = args->hfs_mask & ALLPERMS; if (args->flags & HFSFSMNT_NOXONFILES) { @@ -1042,41 +896,48 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p, mntwrapper = 1; } else { /* Even w/o explicit mount arguments, MNT_UNKNOWNPERMISSIONS requires setting up uid, gid, and mask: */ - if (mp->mnt_flag & MNT_UNKNOWNPERMISSIONS) { + if (((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS) { hfsmp->hfs_uid = UNKNOWNUID; hfsmp->hfs_gid = UNKNOWNGID; + vfs_setowner(mp, hfsmp->hfs_uid, hfsmp->hfs_gid); /* tell the VFS */ hfsmp->hfs_dir_mask = UNKNOWNPERMISSIONS & ALLPERMS; /* 0777: rwx---rwx */ hfsmp->hfs_file_mask = UNKNOWNPERMISSIONS & DEFFILEMODE; /* 0666: no --x by default? */ } } /* Find out if disk media is writable. */ - if (VOP_IOCTL(devvp, DKIOCISWRITABLE, (caddr_t)&iswritable, 0, cred, p) == 0) { + if (VNOP_IOCTL(devvp, DKIOCISWRITABLE, (caddr_t)&iswritable, 0, context) == 0) { if (iswritable) hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA; else hfsmp->hfs_flags &= ~HFS_WRITEABLE_MEDIA; } + // record the current time at which we're mounting this volume + { + struct timeval tv; + microtime(&tv); + hfsmp->hfs_mount_time = tv.tv_sec; + } + /* Mount a standard HFS disk */ if ((SWAP_BE16(mdbp->drSigWord) == kHFSSigWord) && (mntwrapper || (SWAP_BE16(mdbp->drEmbedSigWord) != kHFSPlusSigWord))) { - if (devvp == rootvp) { + if ((vfs_flags(mp) & MNT_ROOTFS)) { retval = EINVAL; /* Cannot root from HFS standard disks */ goto error_exit; } /* HFS disks can only use 512 byte physical blocks */ if (blksize > kHFSBlockSize) { blksize = kHFSBlockSize; - if (VOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, cred, p)) { + if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, context)) { retval = ENXIO; goto error_exit; } - if (VOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, cred, p)) { + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, context)) { retval = ENXIO; goto error_exit; } - devvp->v_specsize = blksize; hfsmp->hfs_phys_block_size = blksize; hfsmp->hfs_phys_block_count = blkcnt; } @@ -1119,18 +980,16 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p, " a multiple of physical block size (%d);" " switching to 512\n", blksize); blksize = 512; - if (VOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, - (caddr_t)&blksize, FWRITE, cred, p)) { + if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, + (caddr_t)&blksize, FWRITE, context)) { retval = ENXIO; goto error_exit; } - if (VOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, - (caddr_t)&blkcnt, 0, cred, p)) { + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, + (caddr_t)&blkcnt, 0, context)) { retval = ENXIO; goto error_exit; } - /* XXX do we need to call vfs_init_io_attributes again? 
*/ - devvp->v_specsize = blksize; /* Note: relative block count adjustment */ hfsmp->hfs_phys_block_count *= hfsmp->hfs_phys_block_size / blksize; @@ -1142,12 +1001,12 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p, hfsmp->hfs_phys_block_count = disksize / blksize; - mdb_offset = (embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize); - retval = meta_bread(devvp, mdb_offset, blksize, cred, &bp); + mdb_offset = (daddr64_t)((embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize)); + retval = (int)buf_meta_bread(devvp, mdb_offset, blksize, cred, &bp); if (retval) goto error_exit; - bcopy(bp->b_data + HFS_PRI_OFFSET(blksize), mdbp, 512); - brelse(bp); + bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(blksize), mdbp, 512); + buf_brelse(bp); bp = NULL; vhp = (HFSPlusVolumeHeader*) mdbp; @@ -1181,42 +1040,45 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p, // point as journaled. // if (hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred) == 0) { - mp->mnt_flag |= MNT_JOURNALED; + vfs_setflags(mp, (uint64_t)((unsigned int)MNT_JOURNALED)); } else { // if the journal failed to open, then set the lastMountedVersion // to be "FSK!" which fsck_hfs will see and force the fsck instead // of just bailing out because the volume is journaled. - if (ronly != 0 || devvp == rootvp) { - HFSPlusVolumeHeader *vhp; + if (!ronly) { + HFSPlusVolumeHeader *jvhp; hfsmp->hfs_flags |= HFS_NEED_JNL_RESET; if (mdb_offset == 0) { - mdb_offset = (embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize); + mdb_offset = (daddr64_t)((embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize)); } bp = NULL; - retval = meta_bread(devvp, mdb_offset, blksize, cred, &bp); + retval = (int)buf_meta_bread(devvp, mdb_offset, blksize, cred, &bp); if (retval == 0) { - vhp = (HFSPlusVolumeHeader *)(bp->b_data + HFS_PRI_OFFSET(blksize)); + jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(blksize)); - if (SWAP_BE16(vhp->signature) == kHFSPlusSigWord || SWAP_BE16(vhp->signature) == kHFSXSigWord) { - vhp->lastMountedVersion = SWAP_BE32('FSK!'); - bwrite(bp); + if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) { + printf ("hfs(1): Journal replay fail. Writing lastMountVersion as FSK!\n"); + jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion); + buf_bwrite(bp); } else { - brelse(bp); + buf_brelse(bp); } bp = NULL; } else if (bp) { - brelse(bp); + buf_brelse(bp); + // clear this so the error exit path won't try to use it + bp = NULL; } } // if this isn't the root device just bail out. - // if it is the root device we just continue on + // If it is the root device we just continue on // in the hopes that fsck_hfs will be able to // fix any damage that exists on the volume. - if (devvp != rootvp) { + if ( !(vfs_flags(mp) & MNT_ROOTFS)) { retval = EINVAL; goto error_exit; } @@ -1226,7 +1088,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p, (void) hfs_getconverter(0, &hfsmp->hfs_get_unicode, &hfsmp->hfs_get_hfsname); - retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args); + retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args, cred); /* * If the backend didn't like our physical blocksize * then retry with physical blocksize of 512. 
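
Both journal-failure paths in hfs_mountfs (the one above and its near-verbatim duplicate in the retry hunk that follows) inline the same buffer-cache read-modify-write sequence: read the volume-header sector, stamp lastMountedVersion with kFSKMountVersion ("FSK!"), and write it back so fsck_hfs is forced to run on the next mount instead of skipping the volume because it is journaled. A minimal sketch of that sequence follows; stamp_fsk_version is a hypothetical helper named only for illustration (the patch itself inlines this logic), and it assumes the same KPIs and HFS definitions the patch already uses (buf_meta_bread, buf_dataptr, buf_bwrite, buf_brelse, SWAP_BE16/SWAP_BE32) are in scope:

static int
stamp_fsk_version(struct vnode *devvp, daddr64_t sector, u_int32_t blksize)
{
	struct buf *bp = NULL;
	HFSPlusVolumeHeader *jvhp;
	int error;

	/* Read the sector holding the volume header through the metadata cache. */
	error = (int)buf_meta_bread(devvp, sector, blksize, NOCRED, &bp);
	if (error) {
		if (bp)
			buf_brelse(bp);		/* release without writing */
		return (error);
	}
	jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(blksize));

	if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord ||
	    SWAP_BE16(jvhp->signature) == kHFSXSigWord) {
		/* Mark the volume so fsck_hfs will not trust the journal. */
		jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion);
		return (buf_bwrite(bp));	/* synchronous write; consumes bp */
	}
	buf_brelse(bp);
	return (0);
}

Note the asymmetry in buffer ownership: buf_bwrite() consumes the buffer whether or not the write succeeds, so only the paths that leave the block unmodified call buf_brelse().
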
@@ -1235,11 +1097,11 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p, printf("HFS Mount: could not use physical block size " "(%d) switching to 512\n", blksize); blksize = 512; - if (VOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, cred, p)) { + if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, context)) { retval = ENXIO; goto error_exit; } - if (VOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, cred, p)) { + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, context)) { retval = ENXIO; goto error_exit; } @@ -1253,25 +1115,71 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p, journal_close(hfsmp->jnl); hfsmp->jnl = NULL; if (hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred) == 0) { - mp->mnt_flag |= MNT_JOURNALED; - } + vfs_setflags(mp, (uint64_t)((unsigned int)MNT_JOURNALED)); + } else { + // if the journal failed to open, then set the lastMountedVersion + // to be "FSK!" which fsck_hfs will see and force the fsck instead + // of just bailing out because the volume is journaled. + if (!ronly) { + HFSPlusVolumeHeader *jvhp; + + hfsmp->hfs_flags |= HFS_NEED_JNL_RESET; + + if (mdb_offset == 0) { + mdb_offset = (daddr64_t)((embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize)); + } + + bp = NULL; + retval = (int)buf_meta_bread(devvp, mdb_offset, blksize, cred, &bp); + if (retval == 0) { + jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(blksize)); + + if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) { + printf ("hfs(2): Journal replay fail. Writing lastMountVersion as FSK!\n"); + jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion); + buf_bwrite(bp); + } else { + buf_brelse(bp); + } + bp = NULL; + } else if (bp) { + buf_brelse(bp); + // clear this so the error exit path won't try to use it + bp = NULL; + } + } + + // if this isn't the root device just bail out. + // If it is the root device we just continue on + // in the hopes that fsck_hfs will be able to + // fix any damage that exists on the volume. + if ( !(vfs_flags(mp) & MNT_ROOTFS)) { + retval = EINVAL; + goto error_exit; + } + } } /* Try again with a smaller block size... */ - retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args); + retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args, cred); } if (retval) (void) hfs_relconverter(0); } + // save off a snapshot of the mtime from the previous mount + // (for matador). + hfsmp->hfs_last_mounted_mtime = hfsmp->hfs_mtime; + if ( retval ) { goto error_exit; } - mp->mnt_stat.f_fsid.val[0] = (long)dev; - mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; - mp->mnt_maxsymlinklen = 0; - devvp->v_specflags |= SI_MOUNTEDON; + mp->mnt_vfsstat.f_fsid.val[0] = (long)dev; + mp->mnt_vfsstat.f_fsid.val[1] = vfs_typenum(mp); + vfs_setmaxsymlen(mp, 0); + mp->mnt_vtable->vfc_threadsafe = TRUE; + mp->mnt_vtable->vfc_vfsflags |= VFC_VFSNATIVEXATTR; if (args) { /* @@ -1320,17 +1228,17 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p, error_exit: if (bp) - brelse(bp); + buf_brelse(bp); if (mdbp) FREE(mdbp, M_TEMP); - (void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, cred, p); + if (hfsmp && hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) { - (void)VOP_CLOSE(hfsmp->jvp, ronly ? FREAD : FREAD|FWRITE, cred, p); + (void)VNOP_CLOSE(hfsmp->jvp, ronly ? 
FREAD : FREAD|FWRITE, context); hfsmp->jvp = NULL; } if (hfsmp) { FREE(hfsmp, M_HFSMNT); - mp->mnt_data = (qaddr_t)0; + vfs_setfsprivate(mp, NULL); } return (retval); } @@ -1342,10 +1250,7 @@ error_exit: */ /* ARGSUSED */ static int -hfs_start(mp, flags, p) - struct mount *mp; - int flags; - struct proc *p; +hfs_start(__unused struct mount *mp, __unused int flags, __unused vfs_context_t context) { return (0); } @@ -1355,16 +1260,14 @@ hfs_start(mp, flags, p) * unmount system call */ static int -hfs_unmount(mp, mntflags, p) - struct mount *mp; - int mntflags; - struct proc *p; +hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) { + struct proc *p = vfs_context_proc(context); struct hfsmount *hfsmp = VFSTOHFS(mp); int retval = E_NONE; int flags; int force; - int started_tr = 0, grabbed_lock = 0; + int started_tr = 0; flags = 0; force = 0; @@ -1377,47 +1280,48 @@ hfs_unmount(mp, mntflags, p) return (retval); if (hfsmp->hfs_flags & HFS_METADATA_ZONE) - (void) hfs_recording_suspend(hfsmp, p); + (void) hfs_recording_suspend(hfsmp); /* * Flush out the b-trees, volume bitmap and Volume Header */ if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) { - hfs_global_shared_lock_acquire(hfsmp); - grabbed_lock = 1; - if (hfsmp->jnl) { - journal_start_transaction(hfsmp->jnl); - started_tr = 1; + hfs_start_transaction(hfsmp); + started_tr = 1; + + if (hfsmp->hfs_attribute_vp) { + (void) hfs_lock(VTOC(hfsmp->hfs_attribute_vp), HFS_EXCLUSIVE_LOCK); + retval = hfs_fsync(hfsmp->hfs_attribute_vp, MNT_WAIT, 0, p); + hfs_unlock(VTOC(hfsmp->hfs_attribute_vp)); + if (retval && !force) + goto err_exit; } - - retval = VOP_FSYNC(HFSTOVCB(hfsmp)->catalogRefNum, NOCRED, MNT_WAIT, p); + + (void) hfs_lock(VTOC(hfsmp->hfs_catalog_vp), HFS_EXCLUSIVE_LOCK); + retval = hfs_fsync(hfsmp->hfs_catalog_vp, MNT_WAIT, 0, p); + hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); if (retval && !force) goto err_exit; - retval = VOP_FSYNC(HFSTOVCB(hfsmp)->extentsRefNum, NOCRED, MNT_WAIT, p); + (void) hfs_lock(VTOC(hfsmp->hfs_extents_vp), HFS_EXCLUSIVE_LOCK); + retval = hfs_fsync(hfsmp->hfs_extents_vp, MNT_WAIT, 0, p); + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); if (retval && !force) goto err_exit; - // if we have an allocation file, sync it too so we don't leave dirty - // blocks around - if (HFSTOVCB(hfsmp)->allocationsRefNum) { - if (retval = VOP_FSYNC(HFSTOVCB(hfsmp)->allocationsRefNum, NOCRED, MNT_WAIT, p)) { - if (!force) - goto err_exit; - } - } - - if (hfsmp->hfc_filevp && (hfsmp->hfc_filevp->v_flag & VSYSTEM)) { - retval = VOP_FSYNC(hfsmp->hfc_filevp, NOCRED, MNT_WAIT, p); + if (hfsmp->hfs_allocation_vp) { + (void) hfs_lock(VTOC(hfsmp->hfs_allocation_vp), HFS_EXCLUSIVE_LOCK); + retval = hfs_fsync(hfsmp->hfs_allocation_vp, MNT_WAIT, 0, p); + hfs_unlock(VTOC(hfsmp->hfs_allocation_vp)); if (retval && !force) goto err_exit; } - if (retval = VOP_FSYNC(hfsmp->hfs_devvp, NOCRED, MNT_WAIT, p)) { - if (!force) + if (hfsmp->hfc_filevp && vnode_issystem(hfsmp->hfc_filevp)) { + retval = hfs_fsync(hfsmp->hfc_filevp, MNT_WAIT, 0, p); + if (retval && !force) goto err_exit; } - #if 0 /* See if this volume is damaged, is so do not unmount cleanly */ if (HFSTOVCB(hfsmp)->vcbFlags & kHFS_DamagedVolume) { @@ -1428,21 +1332,15 @@ hfs_unmount(mp, mntflags, p) #else HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeUnmountedMask; #endif - retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1); + retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); if (retval) { HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask; if (!force) goto err_exit; /* could not flush everything 
*/ } - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - started_tr = 0; - } - if (grabbed_lock) { - hfs_global_shared_lock_release(hfsmp); - grabbed_lock = 0; - } + hfs_end_transaction(hfsmp); + started_tr = 0; } if (hfsmp->jnl) { @@ -1468,11 +1366,13 @@ hfs_unmount(mp, mntflags, p) hfsmp->jnl = NULL; } + VNOP_FSYNC(hfsmp->hfs_devvp, MNT_WAIT, context); + if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) { - retval = VOP_CLOSE(hfsmp->jvp, + retval = VNOP_CLOSE(hfsmp->jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, - NOCRED, p); - vrele(hfsmp->jvp); + context); + vnode_put(hfsmp->jvp); hfsmp->jvp = NULL; } // XXXdbg @@ -1485,28 +1385,17 @@ hfs_unmount(mp, mntflags, p) hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE; tmpvp = hfsmp->hfs_backingfs_rootvp; hfsmp->hfs_backingfs_rootvp = NULLVP; - vrele(tmpvp); + vnode_rele(tmpvp); } #endif /* HFS_SPARSE_DEV */ - - hfsmp->hfs_devvp->v_specflags &= ~SI_MOUNTEDON; - retval = VOP_CLOSE(hfsmp->hfs_devvp, - hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, - NOCRED, p); - if (retval && !force) - return(retval); - - vrele(hfsmp->hfs_devvp); + lck_mtx_destroy(&hfsmp->hfc_mutex, hfs_mutex_group); FREE(hfsmp, M_HFSMNT); - mp->mnt_data = (qaddr_t)0; + return (0); err_exit: - if (hfsmp->jnl && started_tr) { - journal_end_transaction(hfsmp->jnl); - } - if (grabbed_lock) { - hfs_global_shared_lock_release(hfsmp); + if (started_tr) { + hfs_end_transaction(hfsmp); } return retval; } @@ -1514,44 +1403,28 @@ hfs_unmount(mp, mntflags, p) /* * Return the root of a filesystem. - * - * OUT - vpp, should be locked and vget()'d (to increment usecount and lock) */ static int -hfs_root(mp, vpp) - struct mount *mp; - struct vnode **vpp; +hfs_vfs_root(struct mount *mp, struct vnode **vpp, __unused vfs_context_t context) { - struct vnode *nvp; - int retval; - UInt32 rootObjID = kRootDirID; - - if ((retval = VFS_VGET(mp, &rootObjID, &nvp))) - return (retval); - - *vpp = nvp; - return (0); + return hfs_vget(VFSTOHFS(mp), (cnid_t)kHFSRootFolderID, vpp, 1); } /* * Do operations associated with quotas */ -int -hfs_quotactl(mp, cmds, uid, arg, p) - struct mount *mp; - int cmds; - uid_t uid; - caddr_t arg; - struct proc *p; +static int +hfs_quotactl(struct mount *mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t context) { + struct proc *p = vfs_context_proc(context); int cmd, type, error; #if !QUOTA - return (EOPNOTSUPP); + return (ENOTSUP); #else if (uid == -1) - uid = p->p_cred->p_ruid; + uid = vfs_context_ucred(context)->cr_ruid; cmd = cmds >> SUBCMDSHIFT; switch (cmd) { @@ -1559,24 +1432,24 @@ hfs_quotactl(mp, cmds, uid, arg, p) case Q_QUOTASTAT: break; case Q_GETQUOTA: - if (uid == p->p_cred->p_ruid) + if (uid == vfs_context_ucred(context)->cr_ruid) break; /* fall through */ default: - if (error = suser(p->p_ucred, &p->p_acflag)) + if ( (error = vfs_context_suser(context)) ) return (error); } type = cmds & SUBCMDMASK; if ((u_int)type >= MAXQUOTAS) return (EINVAL); - if (vfs_busy(mp, LK_NOWAIT, 0, p)) + if (vfs_busy(mp, LK_NOWAIT)) return (0); switch (cmd) { case Q_QUOTAON: - error = hfs_quotaon(p, mp, type, arg, UIO_USERSPACE); + error = hfs_quotaon(p, mp, type, datap); break; case Q_QUOTAOFF: @@ -1584,15 +1457,15 @@ hfs_quotactl(mp, cmds, uid, arg, p) break; case Q_SETQUOTA: - error = hfs_setquota(mp, uid, type, arg); + error = hfs_setquota(mp, uid, type, datap); break; case Q_SETUSE: - error = hfs_setuse(mp, uid, type, arg); + error = hfs_setuse(mp, uid, type, datap); break; case Q_GETQUOTA: - error = hfs_getquota(mp, uid, type, arg); + error 
= hfs_getquota(mp, uid, type, datap); break; case Q_SYNC: @@ -1600,52 +1473,66 @@ hfs_quotactl(mp, cmds, uid, arg, p) break; case Q_QUOTASTAT: - error = hfs_quotastat(mp, type, arg); + error = hfs_quotastat(mp, type, datap); break; default: error = EINVAL; break; } - vfs_unbusy(mp, p); + vfs_unbusy(mp); + return (error); #endif /* QUOTA */ } - - +/* Subtype is composite of bits */ +#define HFS_SUBTYPE_JOURNALED 0x01 +#define HFS_SUBTYPE_CASESENSITIVE 0x02 +/* bits 2 - 6 reserved */ +#define HFS_SUBTYPE_STANDARDHFS 0x80 /* * Get file system statistics. */ static int -hfs_statfs(mp, sbp, p) - struct mount *mp; - register struct statfs *sbp; - struct proc *p; +hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, __unused vfs_context_t context) { ExtendedVCB *vcb = VFSTOVCB(mp); struct hfsmount *hfsmp = VFSTOHFS(mp); u_long freeCNIDs; + uint16_t subtype = 0; freeCNIDs = (u_long)0xFFFFFFFF - (u_long)vcb->vcbNxtCNID; - sbp->f_bsize = vcb->blockSize; - sbp->f_iosize = hfsmp->hfs_logBlockSize; - sbp->f_blocks = vcb->totalBlocks; - sbp->f_bfree = hfs_freeblks(hfsmp, 0); - sbp->f_bavail = hfs_freeblks(hfsmp, 1); - sbp->f_files = vcb->totalBlocks - 2; /* max files is constrained by total blocks */ - sbp->f_ffree = MIN(freeCNIDs, sbp->f_bavail); - - sbp->f_type = 0; - if (sbp != &mp->mnt_stat) { - sbp->f_type = mp->mnt_vfc->vfc_typenum; - bcopy((caddr_t)mp->mnt_stat.f_mntonname, - (caddr_t)&sbp->f_mntonname[0], MNAMELEN); - bcopy((caddr_t)mp->mnt_stat.f_mntfromname, - (caddr_t)&sbp->f_mntfromname[0], MNAMELEN); + sbp->f_bsize = (uint32_t)vcb->blockSize; + sbp->f_iosize = (size_t)(MAX_UPL_TRANSFER * PAGE_SIZE); + sbp->f_blocks = (uint64_t)((unsigned long)vcb->totalBlocks); + sbp->f_bfree = (uint64_t)((unsigned long )hfs_freeblks(hfsmp, 0)); + sbp->f_bavail = (uint64_t)((unsigned long )hfs_freeblks(hfsmp, 1)); + sbp->f_files = (uint64_t)((unsigned long )(vcb->totalBlocks - 2)); /* max files is constrained by total blocks */ + sbp->f_ffree = (uint64_t)((unsigned long )(MIN(freeCNIDs, sbp->f_bavail))); + + /* + * Subtypes (flavors) for HFS + * 0: Mac OS Extended + * 1: Mac OS Extended (Journaled) + * 2: Mac OS Extended (Case Sensitive) + * 3: Mac OS Extended (Case Sensitive, Journaled) + * 4 - 127: Reserved + * 128: Mac OS Standard + * + */ + if (hfsmp->hfs_flags & HFS_STANDARD) { + subtype = HFS_SUBTYPE_STANDARDHFS; + } else /* HFS Plus */ { + if (hfsmp->jnl) + subtype |= HFS_SUBTYPE_JOURNALED; + if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE) + subtype |= HFS_SUBTYPE_CASESENSITIVE; } + sbp->f_fssubtype = subtype; + return (0); } @@ -1663,57 +1550,81 @@ void hfs_sync_metadata(void *arg) { struct mount *mp = (struct mount *)arg; - struct cnode *cp; struct hfsmount *hfsmp; ExtendedVCB *vcb; - struct vnode *meta_vp[3]; - struct buf *bp; - int i, sectorsize, priIDSector, altIDSector, retval; - int error, allerror = 0; - + buf_t bp; + int sectorsize, retval; + daddr64_t priIDSector; hfsmp = VFSTOHFS(mp); vcb = HFSTOVCB(hfsmp); - bflushq(BQ_META, mp); - - -#if 1 // XXXdbg - I do not believe this is necessary... - // but if I pull it out, then the journal - // does not seem to get flushed properly - // when it is closed.... 
- // now make sure the super block is flushed sectorsize = hfsmp->hfs_phys_block_size; - priIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) + - HFS_PRI_SECTOR(sectorsize); - retval = meta_bread(hfsmp->hfs_devvp, priIDSector, sectorsize, NOCRED, &bp); + priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / sectorsize) + + HFS_PRI_SECTOR(sectorsize)); + retval = (int)buf_meta_bread(hfsmp->hfs_devvp, priIDSector, sectorsize, NOCRED, &bp); if (retval != 0) { panic("hfs: sync_metadata: can't read super-block?! (retval 0x%x, priIDSector)\n", retval, priIDSector); } - if (retval == 0 && (bp->b_flags & B_DELWRI) && (bp->b_flags & B_LOCKED) == 0) { - bwrite(bp); + if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) { + buf_bwrite(bp); } else if (bp) { - brelse(bp); + buf_brelse(bp); } // the alternate super block... // XXXdbg - we probably don't need to do this each and every time. // hfs_btreeio.c:FlushAlternate() should flag when it was // written... - altIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) + - HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_phys_block_count); - retval = meta_bread(hfsmp->hfs_devvp, altIDSector, sectorsize, NOCRED, &bp); - if (retval == 0 && (bp->b_flags & B_DELWRI) && (bp->b_flags & B_LOCKED) == 0) { - bwrite(bp); - } else if (bp) { - brelse(bp); + if (hfsmp->hfs_alt_id_sector) { + retval = (int)buf_meta_bread(hfsmp->hfs_devvp, hfsmp->hfs_alt_id_sector, sectorsize, NOCRED, &bp); + if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) { + buf_bwrite(bp); + } else if (bp) { + buf_brelse(bp); + } } -#endif - } + +struct hfs_sync_cargs { + kauth_cred_t cred; + struct proc *p; + int waitfor; + int error; +}; + + +static int +hfs_sync_callback(struct vnode *vp, void *cargs) +{ + struct cnode *cp; + struct hfs_sync_cargs *args; + int error; + + args = (struct hfs_sync_cargs *)cargs; + + if (hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK) != 0) { + return (VNODE_RETURNED); + } + cp = VTOC(vp); + + if ((cp->c_flag & C_MODIFIED) || + (cp->c_touch_acctime | cp->c_touch_chgtime | cp->c_touch_modtime) || + vnode_hasdirtyblks(vp)) { + error = hfs_fsync(vp, args->waitfor, 0, args->p); + + if (error) + args->error = error; + } + hfs_unlock(cp); + return (VNODE_RETURNED); +} + + + /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; @@ -1722,155 +1633,88 @@ hfs_sync_metadata(void *arg) * Note: we are always called with the filesystem marked `MPBUSY'. */ static int -hfs_sync(mp, waitfor, cred, p) - struct mount *mp; - int waitfor; - struct ucred *cred; - struct proc *p; +hfs_sync(struct mount *mp, int waitfor, vfs_context_t context) { - struct vnode *nvp, *vp; + struct proc *p = vfs_context_proc(context); struct cnode *cp; struct hfsmount *hfsmp; ExtendedVCB *vcb; - struct vnode *meta_vp[3]; + struct vnode *meta_vp[4]; int i; int error, allerror = 0; + struct hfs_sync_cargs args; /* * During MNT_UPDATE hfs_changefs might be manipulating * vnodes so back off */ - if (mp->mnt_flag & MNT_UPDATE) + if (((uint32_t)vfs_flags(mp)) & MNT_UPDATE) /* XXX MNT_UPDATE may not be visible here */ return (0); hfsmp = VFSTOHFS(mp); if (hfsmp->hfs_flags & HFS_READ_ONLY) return (EROFS); -#if 0 - // XXXdbg first go through and flush out any modified - // meta data blocks so they go out in order... 
-	bflushq(BQ_META, mp);
-	bflushq(BQ_LRU,  mp);
-	// only flush locked blocks if we're not doing journaling
-	if (hfsmp->jnl == NULL) {
-	        bflushq(BQ_LOCKED, mp);
-	}
-#endif
-
+	args.cred = vfs_context_ucred(context);
+	args.waitfor = waitfor;
+	args.p = p;
+	args.error = 0;
 	/*
-	 * Write back each 'modified' vnode
+	 * hfs_sync_callback will be called for each vnode
+	 * hung off of this mount point... the vnode will be
+	 * properly referenced and unreferenced around the callback
 	 */
+	vnode_iterate(mp, 0, hfs_sync_callback, (void *)&args);
 
-loop:
-	simple_lock(&mntvnode_slock);
-	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {
-		int didhold;
-		/*
-		 * If the vnode that we are about to sync is no longer
-		 * associated with this mount point, start over.
-		 */
-		if (vp->v_mount != mp) {
-			simple_unlock(&mntvnode_slock);
-			goto loop;
-		}
-
-		simple_lock(&vp->v_interlock);
-		nvp = vp->v_mntvnodes.le_next;
-
-		cp = VTOC(vp);
-
-		// restart our whole search if this guy is locked
-		// or being reclaimed.
-		if (vp->v_tag != VT_HFS || cp == NULL || vp->v_flag & (VXLOCK|VORECLAIM)) {
-			simple_unlock(&vp->v_interlock);
-			continue;
-		}
-
-		if ((vp->v_flag & VSYSTEM) || (vp->v_type == VNON) ||
-		    (((cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) == 0) &&
-		     (vp->v_dirtyblkhd.lh_first == NULL) && !(vp->v_flag & VHASDIRTY))) {
-			simple_unlock(&vp->v_interlock);
-			simple_unlock(&mntvnode_slock);
-			simple_lock(&mntvnode_slock);
-			continue;
-		}
-
-		simple_unlock(&mntvnode_slock);
-		error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, p);
-		if (error) {
-			if (error == ENOENT) {
-				/*
-				 * If vnode is being reclaimed, yield so
-				 * that it can be removed from our list.
-				 */
-				if (UBCISVALID(vp))
-					(void) tsleep((caddr_t)&lbolt, PINOD, "hfs_sync", 0);
-				goto loop;
-			}
-			simple_lock(&mntvnode_slock);
-			continue;
-		}
-
-		didhold = ubc_hold(vp);
-
-		// mark the cnode so that fsync won't flush
-		// the journal since we're going to do that...
-		cp->c_flag |= C_FROMSYNC;
-		if ((error = VOP_FSYNC(vp, cred, waitfor, p))) {
-			allerror = error;
-		};
-		cp->c_flag &= ~C_FROMSYNC;
-
-		VOP_UNLOCK(vp, 0, p);
-		if (didhold)
-			ubc_rele(vp);
-		vrele(vp);
-		simple_lock(&mntvnode_slock);
-	};
+	if (args.error)
+		allerror = args.error;
 
 	vcb = HFSTOVCB(hfsmp);
 
 	meta_vp[0] = vcb->extentsRefNum;
 	meta_vp[1] = vcb->catalogRefNum;
 	meta_vp[2] = vcb->allocationsRefNum;  /* This is NULL for standard HFS */
+	meta_vp[3] = hfsmp->hfs_attribute_vp; /* Optional file */
 
-	/* Now sync our three metadata files */
-	for (i = 0; i < 3; ++i) {
+	/* Now sync our four metadata files */
+	for (i = 0; i < 4; ++i) {
 		struct vnode *btvp;
 
-		btvp = btvp = meta_vp[i];;
-		if ((btvp==0) || (btvp->v_type == VNON) || (btvp->v_mount != mp))
+		btvp = meta_vp[i];
+		if ((btvp==0) || (vnode_mount(btvp) != mp))
 			continue;
-		simple_lock(&btvp->v_interlock);
+		/* XXX use hfs_systemfile_lock instead ?
*/ + (void) hfs_lock(VTOC(btvp), HFS_EXCLUSIVE_LOCK); cp = VTOC(btvp); - if (((cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) == 0) && - (btvp->v_dirtyblkhd.lh_first == NULL) && !(btvp->v_flag & VHASDIRTY)) { - simple_unlock(&btvp->v_interlock); + + if (((cp->c_flag & C_MODIFIED) == 0) && + (cp->c_touch_acctime == 0) && + (cp->c_touch_chgtime == 0) && + (cp->c_touch_modtime == 0) && + vnode_hasdirtyblks(btvp) == 0) { + hfs_unlock(VTOC(btvp)); continue; } - simple_unlock(&mntvnode_slock); - error = vget(btvp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, p); + error = vnode_get(btvp); if (error) { - simple_lock(&mntvnode_slock); + hfs_unlock(VTOC(btvp)); continue; } - if ((error = VOP_FSYNC(btvp, cred, waitfor, p))) + if ((error = hfs_fsync(btvp, waitfor, 0, p))) allerror = error; - VOP_UNLOCK(btvp, 0, p); - vrele(btvp); - simple_lock(&mntvnode_slock); - }; - simple_unlock(&mntvnode_slock); + hfs_unlock(cp); + vnode_put(btvp); + }; /* * Force stale file system control information to be flushed. */ if (vcb->vcbSigWord == kHFSSigWord) { - if ((error = VOP_FSYNC(hfsmp->hfs_devvp, cred, waitfor, p))) + if ((error = VNOP_FSYNC(hfsmp->hfs_devvp, waitfor, context))) { allerror = error; + } } #if QUOTA hfs_qsync(mp); @@ -1882,12 +1726,6 @@ loop: */ if (IsVCBDirty(vcb)) { - // XXXdbg - debugging, remove - if (hfsmp->jnl) { - //printf("hfs: sync: strange, a journaled volume w/dirty VCB? jnl 0x%x hfsmp 0x%x\n", - // hfsmp->jnl, hfsmp); - } - error = hfs_flushvolumeheader(hfsmp, waitfor, 0); if (error) allerror = error; @@ -1897,7 +1735,6 @@ loop: journal_flush(hfsmp->jnl); } - err_exit: return (allerror); } @@ -1913,33 +1750,24 @@ loop: * those rights via. exflagsp and credanonp */ static int -hfs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) - register struct mount *mp; - struct fid *fhp; - struct mbuf *nam; - struct vnode **vpp; - int *exflagsp; - struct ucred **credanonp; +hfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, vfs_context_t context) { struct hfsfid *hfsfhp; struct vnode *nvp; int result; - struct netcred *np; *vpp = NULL; hfsfhp = (struct hfsfid *)fhp; - /* - * Get the export permission structure for this <mp, client> tuple. - */ - np = vfs_export_lookup(mp, &VFSTOHFS(mp)->hfs_export, nam); - if (nam && (np == NULL)) { - return EACCES; - }; + if (fhlen < sizeof(struct hfsfid)) + return (EINVAL); - result = VFS_VGET(mp, &hfsfhp->hfsfid_cnid, &nvp); - if (result) return result; - if (nvp == NULL) return ESTALE; + result = hfs_vget(VFSTOHFS(mp), hfsfhp->hfsfid_cnid, &nvp, 0); + if (result) { + if (result == ENOENT) + result = ESTALE; + return result; + } /* The createtime can be changed by hfs_setattr or hfs_setattrlist. * For NFS, we are assuming that only if the createtime was moved @@ -1953,28 +1781,13 @@ hfs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) * wrap number and use that for generation number. For now do this. */ if ((hfsfhp->hfsfid_gen < VTOC(nvp)->c_itime)) { - vput(nvp); + hfs_unlock(VTOC(nvp)); + vnode_put(nvp); return (ESTALE); - }; - - if (VNAME(nvp) == NULL) { - struct cnode *cp = VTOC(nvp); - - if (nvp == cp->c_rsrc_vp) { - // the +1/-2 thing is to skip the leading "/" on the rsrc fork spec - // and to not count the trailing null byte at the end of the string. 
- VNAME(nvp) = add_name(_PATH_RSRCFORKSPEC+1, sizeof(_PATH_RSRCFORKSPEC)-2, 0, 0); - } else { - VNAME(nvp) = add_name(cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen, 0, 0); - } } - *vpp = nvp; - if (np) { - *exflagsp = np->netc_exflags; - *credanonp = &np->netc_anon; - } - + + hfs_unlock(VTOC(nvp)); return (0); } @@ -1984,22 +1797,22 @@ hfs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) */ /* ARGSUSED */ static int -hfs_vptofh(vp, fhp) - struct vnode *vp; - struct fid *fhp; +hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, vfs_context_t context) { struct cnode *cp; struct hfsfid *hfsfhp; if (ISHFS(VTOVCB(vp))) - return (EOPNOTSUPP); /* hfs standard is not exportable */ + return (ENOTSUP); /* hfs standard is not exportable */ + + if (*fhlenp < (int)sizeof(struct hfsfid)) + return (EOVERFLOW); cp = VTOC(vp); hfsfhp = (struct hfsfid *)fhp; - hfsfhp->hfsfid_len = sizeof(struct hfsfid); - hfsfhp->hfsfid_pad = 0; hfsfhp->hfsfid_cnid = cp->c_fileid; hfsfhp->hfsfid_gen = cp->c_itime; + *fhlenp = sizeof(struct hfsfid); return (0); } @@ -2009,8 +1822,7 @@ hfs_vptofh(vp, fhp) * Initial HFS filesystems, done only once. */ static int -hfs_init(vfsp) - struct vfsconf *vfsp; +hfs_init(__unused struct vfsconf *vfsp) { static int done = 0; @@ -2024,11 +1836,16 @@ hfs_init(vfsp) #endif /* QUOTA */ BTReserveSetup(); + + + hfs_lock_attr = lck_attr_alloc_init(); + hfs_group_attr = lck_grp_attr_alloc_init(); + hfs_mutex_group = lck_grp_alloc_init("hfs-mutex", hfs_group_attr); + hfs_rwlock_group = lck_grp_alloc_init("hfs-rwlock", hfs_group_attr); + + /* Turn on lock debugging */ + //lck_attr_setdebug(hfs_lock_attr); - /* - * Allocate Catalog Iterator cache... - */ - (void) InitCatalogCache(); return (0); } @@ -2039,14 +1856,16 @@ hfs_getmountpoint(vp, hfsmpp) struct hfsmount **hfsmpp; { struct hfsmount * hfsmp; + char fstypename[MFSNAMELEN]; if (vp == NULL) return (EINVAL); - if ((vp->v_flag & VROOT) == 0) + if (!vnode_isvroot(vp)) return (EINVAL); - if (strcmp(vp->v_mount->mnt_stat.f_fstypename, "hfs") != 0) + vnode_vfsname(vp, fstypename); + if (strcmp(fstypename, "hfs") != 0) return (EINVAL); hfsmp = VTOHFS(vp); @@ -2062,29 +1881,16 @@ hfs_getmountpoint(vp, hfsmpp) // XXXdbg #include <sys/filedesc.h> - /* * HFS filesystem related variables. 
*/ static int -hfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; +hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, vfs_context_t context) { - extern u_int32_t hfs_getencodingbias(void); - extern void hfs_setencodingbias(u_int32_t); - + struct proc *p = vfs_context_proc(context); int error; - struct sysctl_req *req; - struct vfsidctl vc; - struct mount *mp; struct hfsmount *hfsmp; - struct vfsquery vq; /* all sysctl names at this level are terminal */ @@ -2098,17 +1904,18 @@ hfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) return (error); } else if (name[0] == HFS_EXTEND_FS) { - u_int64_t newsize; - - if (newp == NULL) + u_int64_t newsize; + vnode_t vp = p->p_fd->fd_cdir; + + if (newp == USER_ADDR_NULL || vp == NULL) return (EINVAL); - if ((error = hfs_getmountpoint(p->p_fd->fd_cdir, &hfsmp))) + if ((error = hfs_getmountpoint(vp, &hfsmp))) return (error); error = sysctl_quad(oldp, oldlenp, newp, newlen, &newsize); if (error) return (error); - error = hfs_extendfs(HFSTOVFS(hfsmp), newsize, p); + error = hfs_extendfs(hfsmp, newsize, context); return (error); } else if (name[0] == HFS_ENCODINGHINT) { @@ -2128,7 +1935,7 @@ hfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) &bytes, bufsize, 0, UTF_DECOMPOSED); if (error == 0) { hint = hfs_pickencoding(unicode_name, bytes / 2); - error = sysctl_int(oldp, oldlenp, NULL, NULL, &hint); + error = sysctl_int(oldp, oldlenp, USER_ADDR_NULL, 0, &hint); } } FREE(unicode_name, M_TEMP); @@ -2139,15 +1946,17 @@ hfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) // make the file system journaled... struct vnode *vp = p->p_fd->fd_cdir, *jvp; ExtendedVCB *vcb; - int retval; struct cat_attr jnl_attr, jinfo_attr; struct cat_fork jnl_fork, jinfo_fork; void *jnl = NULL; + int lockflags; /* Only root can enable journaling */ - if (current_proc()->p_ucred->cr_uid != 0) { + if (!is_suser()) { return (EPERM); } + if (vp == NULL) + return EINVAL; hfsmp = VTOHFS(vp); if (hfsmp->hfs_flags & HFS_READ_ONLY) { @@ -2159,27 +1968,29 @@ hfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) } if (hfsmp->jnl) { - printf("hfs: volume @ mp 0x%x is already journaled!\n", vp->v_mount); + printf("hfs: volume @ mp 0x%x is already journaled!\n", vnode_mount(vp)); return EAGAIN; } vcb = HFSTOVCB(hfsmp); + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); if (BTHasContiguousNodes(VTOF(vcb->catalogRefNum)) == 0 || BTHasContiguousNodes(VTOF(vcb->extentsRefNum)) == 0) { printf("hfs: volume has a btree w/non-contiguous nodes. can not enable journaling.\n"); + hfs_systemfile_unlock(hfsmp, lockflags); return EINVAL; } + hfs_systemfile_unlock(hfsmp, lockflags); // make sure these both exist! 
- if ( GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jinfo_attr, &jinfo_fork) == 0 - || GetFileInfo(vcb, kRootDirID, ".journal", &jnl_attr, &jnl_fork) == 0) { + if ( GetFileInfo(vcb, kHFSRootFolderID, ".journal_info_block", &jinfo_attr, &jinfo_fork) == 0 + || GetFileInfo(vcb, kHFSRootFolderID, ".journal", &jnl_attr, &jnl_fork) == 0) { return EINVAL; } - hfs_sync(hfsmp->hfs_mp, MNT_WAIT, FSCRED, p); - bflushq(BQ_META); + hfs_sync(hfsmp->hfs_mp, MNT_WAIT, context); printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n", (off_t)name[2], (off_t)name[3]); @@ -2198,7 +2009,7 @@ hfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) if (jnl == NULL) { printf("hfs: FAILED to create the journal!\n"); if (jvp && jvp != hfsmp->hfs_devvp) { - VOP_CLOSE(jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, FSCRED, p); + VNOP_CLOSE(jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, context); } jvp = NULL; @@ -2218,7 +2029,7 @@ hfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) hfsmp->hfs_jnlinfoblkid = jinfo_attr.ca_fileid; hfsmp->hfs_jnlfileid = jnl_attr.ca_fileid; - hfsmp->hfs_mp->mnt_flag |= MNT_JOURNALED; + vfs_setflags(hfsmp->hfs_mp, (uint64_t)((unsigned int)MNT_JOURNALED)); hfs_global_exclusive_lock_release(hfsmp); hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1); @@ -2227,32 +2038,29 @@ hfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) } else if (name[0] == HFS_DISABLE_JOURNALING) { // clear the journaling bit struct vnode *vp = p->p_fd->fd_cdir; - void *jnl; - int retval; /* Only root can disable journaling */ - if (current_proc()->p_ucred->cr_uid != 0) { + if (!is_suser()) { return (EPERM); } + if (vp == NULL) + return EINVAL; hfsmp = VTOHFS(vp); - printf("hfs: disabling journaling for mount @ 0x%x\n", vp->v_mount); + printf("hfs: disabling journaling for mount @ 0x%x\n", vnode_mount(vp)); - jnl = hfsmp->jnl; - hfs_global_exclusive_lock_acquire(hfsmp); // Lights out for you buddy! + journal_close(hfsmp->jnl); hfsmp->jnl = NULL; - journal_close(jnl); if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) { - VOP_CLOSE(hfsmp->jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, FSCRED, p); + VNOP_CLOSE(hfsmp->jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, context); } - hfsmp->jnl = NULL; hfsmp->jvp = NULL; - hfsmp->hfs_mp->mnt_flag &= ~MNT_JOURNALED; + vfs_clearflags(hfsmp->hfs_mp, (uint64_t)((unsigned int)MNT_JOURNALED)); hfsmp->jnl_start = 0; hfsmp->hfs_jnlinfoblkid = 0; hfsmp->hfs_jnlfileid = 0; @@ -2267,6 +2075,9 @@ hfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) struct vnode *vp = p->p_fd->fd_cdir; off_t jnl_start, jnl_size; + if (vp == NULL) + return EINVAL; + hfsmp = VTOHFS(vp); if (hfsmp->jnl == NULL) { jnl_start = 0; @@ -2276,10 +2087,10 @@ hfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) jnl_size = (off_t)hfsmp->jnl_size; } - if ((error = copyout((caddr_t)&jnl_start, (void *)name[1], sizeof(off_t))) != 0) { + if ((error = copyout((caddr_t)&jnl_start, CAST_USER_ADDR_T(name[1]), sizeof(off_t))) != 0) { return error; } - if ((error = copyout((caddr_t)&jnl_size, (void *)name[2], sizeof(off_t))) != 0) { + if ((error = copyout((caddr_t)&jnl_size, CAST_USER_ADDR_T(name[2]), sizeof(off_t))) != 0) { return error; } @@ -2289,13 +2100,29 @@ hfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) return set_package_extensions_table((void *)name[1], name[2], name[3]); } else if (name[0] == VFS_CTL_QUERY) { - req = oldp; /* we're new style vfs sysctl. 
*/ - - error = SYSCTL_IN(req, &vc, sizeof(vc)); - if (error) return (error); + struct sysctl_req *req; + struct vfsidctl vc; + struct user_vfsidctl user_vc; + struct mount *mp; + struct vfsquery vq; + boolean_t is_64_bit; + + is_64_bit = proc_is64bit(p); + req = CAST_DOWN(struct sysctl_req *, oldp); /* we're new style vfs sysctl. */ - mp = vfs_getvfs(&vc.vc_fsid); - if (mp == NULL) return (ENOENT); + if (is_64_bit) { + error = SYSCTL_IN(req, &user_vc, sizeof(user_vc)); + if (error) return (error); + + mp = vfs_getvfs(&user_vc.vc_fsid); + } + else { + error = SYSCTL_IN(req, &vc, sizeof(vc)); + if (error) return (error); + + mp = vfs_getvfs(&vc.vc_fsid); + } + if (mp == NULL) return (ENOENT); hfsmp = VFSTOHFS(mp); bzero(&vq, sizeof(vq)); @@ -2303,101 +2130,126 @@ hfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) return SYSCTL_OUT(req, &vq, sizeof(vq));; }; - return (EOPNOTSUPP); + return (ENOTSUP); } -/* This will return a vnode of either a directory or a data vnode based on an object id. If - * it is a file id, its data fork will be returned. - */ static int -hfs_vget(mp, ino, vpp) - struct mount *mp; - void *ino; - struct vnode **vpp; +hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, __unused vfs_context_t context) +{ + return hfs_vget(VFSTOHFS(mp), (cnid_t)ino, vpp, 1); +} + + +/* + * Look up an HFS object by ID. + * + * The object is returned with an iocount reference and the cnode locked. + * + * If the object is a file then it will represent the data fork. + */ +__private_extern__ +int +hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock) { - cnid_t cnid = *(cnid_t *)ino; + struct vnode *vp = NULL; + struct cat_desc cndesc; + struct cat_attr cnattr; + struct cat_fork cnfork; + struct componentname cn; + int error; /* Check for cnids that should't be exported. */ if ((cnid < kHFSFirstUserCatalogNodeID) && (cnid != kHFSRootFolderID && cnid != kHFSRootParentID)) return (ENOENT); + /* Don't export HFS Private Data dir. */ - if (cnid == VFSTOHFS(mp)->hfs_privdir_desc.cd_cnid) + if (cnid == hfsmp->hfs_privdir_desc.cd_cnid) return (ENOENT); - return (hfs_getcnode(VFSTOHFS(mp), cnid, NULL, 0, NULL, NULL, vpp)); -} + /* + * Check the hash first + */ + vp = hfs_chash_getvnode(hfsmp->hfs_raw_dev, cnid, 0, skiplock); + if (vp) { + *vpp = vp; + return(0); + } -/* - * Check to see if a given vnode is only referenced for events: - * [ entered with vp->v_interlock locked ] - */ -static int -hfs_evtonly(struct vnode *vp) -{ - int ubc_refcount; + bzero(&cndesc, sizeof(cndesc)); + bzero(&cnattr, sizeof(cnattr)); + bzero(&cnfork, sizeof(cnfork)); - ubc_refcount = UBCINFOEXISTS(vp) ? 
1 : 0; - return (vp->v_usecount == (ubc_refcount + EVTONLYREFS(vp))); -} + /* + * Not in hash, lookup in catalog + */ + if (cnid == kHFSRootParentID) { + static char hfs_rootname[] = "/"; + + cndesc.cd_nameptr = &hfs_rootname[0]; + cndesc.cd_namelen = 1; + cndesc.cd_parentcnid = kHFSRootParentID; + cndesc.cd_cnid = kHFSRootFolderID; + cndesc.cd_flags = CD_ISDIR; + + cnattr.ca_fileid = kHFSRootFolderID; + cnattr.ca_nlink = 2; + cnattr.ca_entries = 1; + cnattr.ca_mode = (S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO); + } else { + int lockflags; -/* - * Check to see if all non-system vnodes for a given mountpoint are events-only - */ -static int -hfs_flush_evtonly(struct mount *mp, int flags, int dispose, struct proc *p) -{ - struct vnode *vp, *nvp; - int busy = 0; - - simple_lock(&mntvnode_slock); -loop: - for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) { - if (vp->v_mount != mp) goto loop; - nvp = vp->v_mntvnodes.le_next; - - simple_lock(&vp->v_interlock); - /* - * Skip over a vnodes marked VSYSTEM or VNOFLUSH. - */ - if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || (vp->v_flag & VNOFLUSH))) { - simple_unlock(&vp->v_interlock); - continue; - }; - /* - * Skip over a vnodes marked VSWAP. - */ - if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) { - simple_unlock(&vp->v_interlock); - continue; + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + error = cat_idlookup(hfsmp, cnid, &cndesc, &cnattr, &cnfork); + hfs_systemfile_unlock(hfsmp, lockflags); + + if (error) { + *vpp = NULL; + return (error); } - if (hfs_evtonly(vp)) { - if (dispose) { - /* "dispose" implies "forcibly", a la "FORCECLOSE": */ - simple_unlock(&mntvnode_slock); - vgonel(vp, p); - simple_lock(&mntvnode_slock); - } else { - simple_unlock(&vp->v_interlock); - }; - continue; - }; - - simple_unlock(&vp->v_interlock); - ++busy; - /* If asked to dispose, keep trying. If only checking, the answer is now known. */ - if (dispose) { - continue; - } else { - break; - }; - } - simple_unlock(&mntvnode_slock); - - return (busy == 0); + + /* Hide open files that have been deleted */ + if ((hfsmp->hfs_privdir_desc.cd_cnid != 0) && + (cndesc.cd_parentcnid == hfsmp->hfs_privdir_desc.cd_cnid)) { + // XXXdbg - if this is a hardlink, we could call + // hfs_chash_snoop() to see if there is + // already a cnode and vnode present for + // this fileid. however I'd rather not + // risk it at this point in Tiger. + cat_releasedesc(&cndesc); + error = ENOENT; + *vpp = NULL; + return (error); + } + } + + /* + * Supply hfs_getnewvnode with a component name. + */ + MALLOC_ZONE(cn.cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = ISLASTCN | HASBUF; + cn.cn_context = NULL; + cn.cn_pnlen = MAXPATHLEN; + cn.cn_nameptr = cn.cn_pnbuf; + cn.cn_namelen = cndesc.cd_namelen; + cn.cn_hash = 0; + cn.cn_consume = 0; + bcopy(cndesc.cd_nameptr, cn.cn_nameptr, cndesc.cd_namelen + 1); + + /* XXX should we supply the parent as well... ? */ + error = hfs_getnewvnode(hfsmp, NULLVP, &cn, &cndesc, 0, &cnattr, &cnfork, &vp); + FREE_ZONE(cn.cn_pnbuf, cn.cn_pnlen, M_NAMEI); + + cat_releasedesc(&cndesc); + *vpp = vp; + if (vp && skiplock) + hfs_unlock(VTOC(vp)); + return (error); } + /* * Flush out all the files in a filesystem. 
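 * Below, a first vflush() pass skips both system and swap files, and a
 * second pass then catches the swap files as well. When quota files hold
 * the root vnode it is skipped too, and its use count is checked afterwards.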
*/ @@ -2406,7 +2258,6 @@ hfs_flushfiles(struct mount *mp, int flags, struct proc *p) { struct hfsmount *hfsmp; struct vnode *skipvp = NULLVP; - struct vnode *rsrcvp; int quotafilecnt; int i; int error; @@ -2420,7 +2271,7 @@ hfs_flushfiles(struct mount *mp, int flags, struct proc *p) * extra reference when doing the intial vflush. */ quotafilecnt = 0; - if (mp->mnt_flag & MNT_QUOTA) { + if (((unsigned int)vfs_flags(mp)) & MNT_QUOTA) { /* Find out how many quota files we have open. */ for (i = 0; i < MAXQUOTAS; i++) { @@ -2429,37 +2280,30 @@ hfs_flushfiles(struct mount *mp, int flags, struct proc *p) } /* Obtain the root vnode so we can skip over it. */ - if (hfs_chashget(hfsmp->hfs_raw_dev, kRootDirID, 0, - &skipvp, &rsrcvp) == NULL) { - skipvp = NULLVP; - } + skipvp = hfs_chash_getvnode(hfsmp->hfs_raw_dev, kHFSRootFolderID, 0, 0); } #endif /* QUOTA */ error = vflush(mp, skipvp, SKIPSYSTEM | SKIPSWAP | flags); - /* - * If the vflush() call failed solely because there are - * some event-only vnodes in the list, then forcibly get - * rid of those vnodes before the final vflush() pass. - */ - if ((error == EBUSY) && hfs_flush_evtonly(mp, SKIPSYSTEM | SKIPSWAP, 0, p)) { - (void) hfs_flush_evtonly(mp, SKIPSYSTEM | SKIPSWAP, 1, p); - }; + if (error != 0) + return(error); + error = vflush(mp, skipvp, SKIPSYSTEM | flags); #if QUOTA - if (mp->mnt_flag & MNT_QUOTA) { + if (((unsigned int)vfs_flags(mp)) & MNT_QUOTA) { if (skipvp) { /* * See if there are additional references on the * root vp besides the ones obtained from the open - * quota files and the hfs_chashget call above. + * quota files and the hfs_chash_getvnode call above. */ if ((error == 0) && - (skipvp->v_usecount > (1 + quotafilecnt))) { + (vnode_isinuse(skipvp, quotafilecnt))) { error = EBUSY; /* root directory is still open */ } - vput(skipvp); + hfs_unlock(VTOC(skipvp)); + vnode_put(skipvp); } if (error && (flags & FORCECLOSE) == 0) return (error); @@ -2501,53 +2345,62 @@ hfs_setencodingbits(struct hfsmount *hfsmp, u_int32_t encoding) } if (index < 64) { - HFSTOVCB(hfsmp)->encodingsBitmap |= (u_int64_t)(1ULL << index); - HFSTOVCB(hfsmp)->vcbFlags |= 0xFF00; + HFS_MOUNT_LOCK(hfsmp, TRUE) + hfsmp->encodingsBitmap |= (u_int64_t)(1ULL << index); + hfsmp->vcbFlags |= 0xFF00; + HFS_MOUNT_UNLOCK(hfsmp, TRUE); } } /* * Update volume stats + * + * On journal volumes this will cause a volume header flush */ __private_extern__ int hfs_volupdate(struct hfsmount *hfsmp, enum volop op, int inroot) { - ExtendedVCB *vcb; + struct timeval tv; - vcb = HFSTOVCB(hfsmp); - vcb->vcbFlags |= 0xFF00; - vcb->vcbLsMod = time.tv_sec; + microtime(&tv); + + lck_mtx_lock(&hfsmp->hfs_mutex); + + hfsmp->vcbFlags |= 0xFF00; + hfsmp->hfs_mtime = tv.tv_sec; switch (op) { case VOL_UPDATE: break; case VOL_MKDIR: - if (vcb->vcbDirCnt != 0xFFFFFFFF) - ++vcb->vcbDirCnt; - if (inroot && vcb->vcbNmRtDirs != 0xFFFF) - ++vcb->vcbNmRtDirs; + if (hfsmp->hfs_dircount != 0xFFFFFFFF) + ++hfsmp->hfs_dircount; + if (inroot && hfsmp->vcbNmRtDirs != 0xFFFF) + ++hfsmp->vcbNmRtDirs; break; case VOL_RMDIR: - if (vcb->vcbDirCnt != 0) - --vcb->vcbDirCnt; - if (inroot && vcb->vcbNmRtDirs != 0xFFFF) - --vcb->vcbNmRtDirs; + if (hfsmp->hfs_dircount != 0) + --hfsmp->hfs_dircount; + if (inroot && hfsmp->vcbNmRtDirs != 0xFFFF) + --hfsmp->vcbNmRtDirs; break; case VOL_MKFILE: - if (vcb->vcbFilCnt != 0xFFFFFFFF) - ++vcb->vcbFilCnt; - if (inroot && vcb->vcbNmFls != 0xFFFF) - ++vcb->vcbNmFls; + if (hfsmp->hfs_filecount != 0xFFFFFFFF) + ++hfsmp->hfs_filecount; + if (inroot && hfsmp->vcbNmFls != 0xFFFF) + 
++hfsmp->vcbNmFls; break; case VOL_RMFILE: - if (vcb->vcbFilCnt != 0) - --vcb->vcbFilCnt; - if (inroot && vcb->vcbNmFls != 0xFFFF) - --vcb->vcbNmFls; + if (hfsmp->hfs_filecount != 0) + --hfsmp->hfs_filecount; + if (inroot && hfsmp->vcbNmFls != 0xFFFF) + --hfsmp->vcbNmFls; break; } + lck_mtx_unlock(&hfsmp->hfs_mutex); + if (hfsmp->jnl) { hfs_flushvolumeheader(hfsmp, 0, 0); } @@ -2568,22 +2421,16 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) ByteCount namelen; sectorsize = hfsmp->hfs_phys_block_size; - retval = bread(hfsmp->hfs_devvp, HFS_PRI_SECTOR(sectorsize), sectorsize, NOCRED, &bp); + retval = (int)buf_bread(hfsmp->hfs_devvp, (daddr64_t)HFS_PRI_SECTOR(sectorsize), sectorsize, NOCRED, &bp); if (retval) { if (bp) - brelse(bp); + buf_brelse(bp); return retval; } - DBG_ASSERT(bp != NULL); - DBG_ASSERT(bp->b_data != NULL); - DBG_ASSERT(bp->b_bcount == size); - - if (hfsmp->jnl) { - panic("hfs: standard hfs volumes should not be journaled!\n"); - } + lck_mtx_lock(&hfsmp->hfs_mutex); - mdb = (HFSMasterDirectoryBlock *)(bp->b_data + HFS_PRI_OFFSET(sectorsize)); + mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp) + HFS_PRI_OFFSET(sectorsize)); mdb->drCrDate = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbCrDate))); mdb->drLsMod = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbLsMod))); @@ -2617,6 +2464,7 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) mdb->drXTExtRec[2].blockCount = SWAP_BE16 (fp->ff_extents[2].blockCount); mdb->drXTFlSize = SWAP_BE32 (fp->ff_blocks * vcb->blockSize); mdb->drXTClpSiz = SWAP_BE32 (fp->ff_clumpsize); + FTOC(fp)->c_flag &= ~C_MODIFIED; fp = VTOF(vcb->catalogRefNum); mdb->drCTExtRec[0].startBlock = SWAP_BE16 (fp->ff_extents[0].startBlock); @@ -2627,28 +2475,28 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) mdb->drCTExtRec[2].blockCount = SWAP_BE16 (fp->ff_extents[2].blockCount); mdb->drCTFlSize = SWAP_BE32 (fp->ff_blocks * vcb->blockSize); mdb->drCTClpSiz = SWAP_BE32 (fp->ff_clumpsize); + FTOC(fp)->c_flag &= ~C_MODIFIED; + + MarkVCBClean( vcb ); + + lck_mtx_unlock(&hfsmp->hfs_mutex); /* If requested, flush out the alternate MDB */ if (altflush) { struct buf *alt_bp = NULL; - u_long altIDSector; - - altIDSector = HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_phys_block_count); - if (meta_bread(hfsmp->hfs_devvp, altIDSector, sectorsize, NOCRED, &alt_bp) == 0) { - bcopy(mdb, alt_bp->b_data + HFS_ALT_OFFSET(sectorsize), kMDBSize); + if (buf_meta_bread(hfsmp->hfs_devvp, hfsmp->hfs_alt_id_sector, sectorsize, NOCRED, &alt_bp) == 0) { + bcopy(mdb, (char *)buf_dataptr(alt_bp) + HFS_ALT_OFFSET(sectorsize), kMDBSize); - (void) VOP_BWRITE(alt_bp); + (void) VNOP_BWRITE(alt_bp); } else if (alt_bp) - brelse(alt_bp); + buf_brelse(alt_bp); } if (waitfor != MNT_WAIT) - bawrite(bp); + buf_bawrite(bp); else - retval = VOP_BWRITE(bp); - - MarkVCBClean( vcb ); + retval = VNOP_BWRITE(bp); return (retval); } @@ -2672,10 +2520,10 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) struct buf *bp; int i; int sectorsize; - int priIDSector; + daddr64_t priIDSector; int critical = 0; u_int16_t signature; - u_int16_t version; + u_int16_t hfsversion; if (hfsmp->hfs_flags & HFS_READ_ONLY) { return(0); @@ -2686,27 +2534,19 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) if (altflush) critical = 1; sectorsize = hfsmp->hfs_phys_block_size; - priIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) + - HFS_PRI_SECTOR(sectorsize); + priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / sectorsize) + + 
HFS_PRI_SECTOR(sectorsize)); - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - hfs_global_shared_lock_release(hfsmp); - return EINVAL; - } + if (hfs_start_transaction(hfsmp) != 0) { + return EINVAL; } - retval = meta_bread(hfsmp->hfs_devvp, priIDSector, sectorsize, NOCRED, &bp); + retval = (int)buf_meta_bread(hfsmp->hfs_devvp, priIDSector, sectorsize, NOCRED, &bp); if (retval) { if (bp) - brelse(bp); + buf_brelse(bp); - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); + hfs_end_transaction(hfsmp); printf("HFS: err %d reading VH blk (%s)\n", retval, vcb->vcbVN); return (retval); @@ -2716,23 +2556,23 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) journal_modify_block_start(hfsmp->jnl, bp); } - volumeHeader = (HFSPlusVolumeHeader *)((char *)bp->b_data + HFS_PRI_OFFSET(sectorsize)); + volumeHeader = (HFSPlusVolumeHeader *)((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(sectorsize)); /* * Sanity check what we just read. */ signature = SWAP_BE16 (volumeHeader->signature); - version = SWAP_BE16 (volumeHeader->version); + hfsversion = SWAP_BE16 (volumeHeader->version); if ((signature != kHFSPlusSigWord && signature != kHFSXSigWord) || - (version < kHFSPlusVersion) || (version > 100) || + (hfsversion < kHFSPlusVersion) || (hfsversion > 100) || (SWAP_BE32 (volumeHeader->blockSize) != vcb->blockSize)) { #if 1 panic("HFS: corrupt VH on %s, sig 0x%04x, ver %d, blksize %d", - vcb->vcbVN, signature, version, + vcb->vcbVN, signature, hfsversion, SWAP_BE32 (volumeHeader->blockSize)); #endif printf("HFS: corrupt VH blk (%s)\n", vcb->vcbVN); - brelse(bp); + buf_brelse(bp); return (EIO); } @@ -2745,42 +2585,44 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) struct buf *bp2; HFSMasterDirectoryBlock *mdb; - retval = meta_bread(hfsmp->hfs_devvp, HFS_PRI_SECTOR(sectorsize), + retval = (int)buf_meta_bread(hfsmp->hfs_devvp, (daddr64_t)HFS_PRI_SECTOR(sectorsize), sectorsize, NOCRED, &bp2); if (retval) { if (bp2) - brelse(bp2); + buf_brelse(bp2); retval = 0; } else { - mdb = (HFSMasterDirectoryBlock *)(bp2->b_data + + mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp2) + HFS_PRI_OFFSET(sectorsize)); if ( SWAP_BE32 (mdb->drCrDate) != vcb->localCreateDate ) { - // XXXdbg if (hfsmp->jnl) { journal_modify_block_start(hfsmp->jnl, bp2); } mdb->drCrDate = SWAP_BE32 (vcb->localCreateDate); /* pick up the new create date */ - // XXXdbg if (hfsmp->jnl) { journal_modify_block_end(hfsmp->jnl, bp2); } else { - (void) VOP_BWRITE(bp2); /* write out the changes */ + (void) VNOP_BWRITE(bp2); /* write out the changes */ } } else { - brelse(bp2); /* just release it */ + buf_brelse(bp2); /* just release it */ } } } + if (1 /* hfsmp->jnl == 0 */) { + lck_mtx_lock(&hfsmp->hfs_mutex); + } + /* Note: only update the lower 16 bits worth of attributes */ - volumeHeader->attributes = SWAP_BE32 ((SWAP_BE32 (volumeHeader->attributes) & 0xFFFF0000) + (UInt16) vcb->vcbAtrb); - volumeHeader->journalInfoBlock = SWAP_BE32(vcb->vcbJinfoBlock); + volumeHeader->attributes = SWAP_BE32 (vcb->vcbAtrb); + volumeHeader->journalInfoBlock = SWAP_BE32 (vcb->vcbJinfoBlock); if (hfsmp->jnl) { volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSJMountVersion); } else { @@ -2791,6 +2633,7 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) volumeHeader->backupDate = SWAP_BE32 (to_hfs_time(vcb->vcbVolBkUp)); volumeHeader->fileCount = SWAP_BE32 (vcb->vcbFilCnt); 
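/*
 * Every multi-byte field stored through volumeHeader here is kept
 * big-endian on disk. A self-contained sketch of the conversion the
 * SWAP_BE32 macro performs (the real macro comes from the kernel
 * byte-order headers): identity on a big-endian host such as PowerPC,
 * a four-byte reversal on a little-endian one.
 */
#include <stdint.h>

static uint32_t
be32_store(uint32_t x)
{
	const union { uint8_t c[4]; uint32_t w; } probe = { { 1, 0, 0, 0 } };

	if (probe.w == 1) {	/* little-endian host: reverse the bytes */
		return ((x >> 24) | ((x >> 8) & 0x0000ff00) |
		        ((x << 8) & 0x00ff0000) | (x << 24));
	}
	return (x);		/* big-endian host: already in disk order */
}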
volumeHeader->folderCount = SWAP_BE32 (vcb->vcbDirCnt); + volumeHeader->totalBlocks = SWAP_BE32 (vcb->totalBlocks); volumeHeader->freeBlocks = SWAP_BE32 (vcb->freeBlocks); volumeHeader->nextAllocation = SWAP_BE32 (vcb->nextAllocation); volumeHeader->rsrcClumpSize = SWAP_BE32 (vcb->vcbClpSiz); @@ -2799,92 +2642,113 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) volumeHeader->writeCount = SWAP_BE32 (vcb->vcbWrCnt); volumeHeader->encodingsBitmap = SWAP_BE64 (vcb->encodingsBitmap); - if (bcmp(vcb->vcbFndrInfo, volumeHeader->finderInfo, sizeof(volumeHeader->finderInfo)) != 0) + if (bcmp(vcb->vcbFndrInfo, volumeHeader->finderInfo, sizeof(volumeHeader->finderInfo)) != 0) { + bcopy(vcb->vcbFndrInfo, volumeHeader->finderInfo, sizeof(volumeHeader->finderInfo)); critical = 1; - bcopy(vcb->vcbFndrInfo, volumeHeader->finderInfo, sizeof(volumeHeader->finderInfo)); + } /* Sync Extents over-flow file meta data */ fp = VTOF(vcb->extentsRefNum); - for (i = 0; i < kHFSPlusExtentDensity; i++) { - volumeHeader->extentsFile.extents[i].startBlock = - SWAP_BE32 (fp->ff_extents[i].startBlock); - volumeHeader->extentsFile.extents[i].blockCount = - SWAP_BE32 (fp->ff_extents[i].blockCount); + if (FTOC(fp)->c_flag & C_MODIFIED) { + for (i = 0; i < kHFSPlusExtentDensity; i++) { + volumeHeader->extentsFile.extents[i].startBlock = + SWAP_BE32 (fp->ff_extents[i].startBlock); + volumeHeader->extentsFile.extents[i].blockCount = + SWAP_BE32 (fp->ff_extents[i].blockCount); + } + volumeHeader->extentsFile.logicalSize = SWAP_BE64 (fp->ff_size); + volumeHeader->extentsFile.totalBlocks = SWAP_BE32 (fp->ff_blocks); + volumeHeader->extentsFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize); + FTOC(fp)->c_flag &= ~C_MODIFIED; } - FTOC(fp)->c_flag &= ~C_MODIFIED; - volumeHeader->extentsFile.logicalSize = SWAP_BE64 (fp->ff_size); - volumeHeader->extentsFile.totalBlocks = SWAP_BE32 (fp->ff_blocks); - volumeHeader->extentsFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize); /* Sync Catalog file meta data */ fp = VTOF(vcb->catalogRefNum); - for (i = 0; i < kHFSPlusExtentDensity; i++) { - volumeHeader->catalogFile.extents[i].startBlock = - SWAP_BE32 (fp->ff_extents[i].startBlock); - volumeHeader->catalogFile.extents[i].blockCount = - SWAP_BE32 (fp->ff_extents[i].blockCount); + if (FTOC(fp)->c_flag & C_MODIFIED) { + for (i = 0; i < kHFSPlusExtentDensity; i++) { + volumeHeader->catalogFile.extents[i].startBlock = + SWAP_BE32 (fp->ff_extents[i].startBlock); + volumeHeader->catalogFile.extents[i].blockCount = + SWAP_BE32 (fp->ff_extents[i].blockCount); + } + volumeHeader->catalogFile.logicalSize = SWAP_BE64 (fp->ff_size); + volumeHeader->catalogFile.totalBlocks = SWAP_BE32 (fp->ff_blocks); + volumeHeader->catalogFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize); + FTOC(fp)->c_flag &= ~C_MODIFIED; } - FTOC(fp)->c_flag &= ~C_MODIFIED; - volumeHeader->catalogFile.logicalSize = SWAP_BE64 (fp->ff_size); - volumeHeader->catalogFile.totalBlocks = SWAP_BE32 (fp->ff_blocks); - volumeHeader->catalogFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize); /* Sync Allocation file meta data */ fp = VTOF(vcb->allocationsRefNum); - for (i = 0; i < kHFSPlusExtentDensity; i++) { - volumeHeader->allocationFile.extents[i].startBlock = - SWAP_BE32 (fp->ff_extents[i].startBlock); - volumeHeader->allocationFile.extents[i].blockCount = - SWAP_BE32 (fp->ff_extents[i].blockCount); + if (FTOC(fp)->c_flag & C_MODIFIED) { + for (i = 0; i < kHFSPlusExtentDensity; i++) { + volumeHeader->allocationFile.extents[i].startBlock = + SWAP_BE32 (fp->ff_extents[i].startBlock); 
+ volumeHeader->allocationFile.extents[i].blockCount = + SWAP_BE32 (fp->ff_extents[i].blockCount); + } + volumeHeader->allocationFile.logicalSize = SWAP_BE64 (fp->ff_size); + volumeHeader->allocationFile.totalBlocks = SWAP_BE32 (fp->ff_blocks); + volumeHeader->allocationFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize); + FTOC(fp)->c_flag &= ~C_MODIFIED; + } + + /* Sync Attribute file meta data */ + if (hfsmp->hfs_attribute_vp) { + fp = VTOF(hfsmp->hfs_attribute_vp); + for (i = 0; i < kHFSPlusExtentDensity; i++) { + volumeHeader->attributesFile.extents[i].startBlock = + SWAP_BE32 (fp->ff_extents[i].startBlock); + volumeHeader->attributesFile.extents[i].blockCount = + SWAP_BE32 (fp->ff_extents[i].blockCount); + } + FTOC(fp)->c_flag &= ~C_MODIFIED; + volumeHeader->attributesFile.logicalSize = SWAP_BE64 (fp->ff_size); + volumeHeader->attributesFile.totalBlocks = SWAP_BE32 (fp->ff_blocks); + volumeHeader->attributesFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize); + } + + vcb->vcbFlags &= 0x00FF; + + if (1 /* hfsmp->jnl == 0 */) { + lck_mtx_unlock(&hfsmp->hfs_mutex); } - FTOC(fp)->c_flag &= ~C_MODIFIED; - volumeHeader->allocationFile.logicalSize = SWAP_BE64 (fp->ff_size); - volumeHeader->allocationFile.totalBlocks = SWAP_BE32 (fp->ff_blocks); - volumeHeader->allocationFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize); /* If requested, flush out the alternate volume header */ - if (altflush) { + if (altflush && hfsmp->hfs_alt_id_sector) { struct buf *alt_bp = NULL; - u_long altIDSector; - - altIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) + - HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_phys_block_count); - if (meta_bread(hfsmp->hfs_devvp, altIDSector, sectorsize, NOCRED, &alt_bp) == 0) { + if (buf_meta_bread(hfsmp->hfs_devvp, hfsmp->hfs_alt_id_sector, sectorsize, NOCRED, &alt_bp) == 0) { if (hfsmp->jnl) { journal_modify_block_start(hfsmp->jnl, alt_bp); } - bcopy(volumeHeader, alt_bp->b_data + HFS_ALT_OFFSET(sectorsize), kMDBSize); + bcopy(volumeHeader, (char *)buf_dataptr(alt_bp) + HFS_ALT_OFFSET(sectorsize), kMDBSize); if (hfsmp->jnl) { journal_modify_block_end(hfsmp->jnl, alt_bp); } else { - (void) VOP_BWRITE(alt_bp); + (void) VNOP_BWRITE(alt_bp); } } else if (alt_bp) - brelse(alt_bp); + buf_brelse(alt_bp); } - // XXXdbg if (hfsmp->jnl) { journal_modify_block_end(hfsmp->jnl, bp); - journal_end_transaction(hfsmp->jnl); } else { if (waitfor != MNT_WAIT) - bawrite(bp); + buf_bawrite(bp); else { - retval = VOP_BWRITE(bp); + retval = VNOP_BWRITE(bp); /* When critical data changes, flush the device cache */ if (critical && (retval == 0)) { - (void) VOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, - NULL, FWRITE, NOCRED, current_proc()); + (void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, + NULL, FWRITE, NULL); } } } - hfs_global_shared_lock_release(hfsmp); + hfs_end_transaction(hfsmp); - vcb->vcbFlags &= 0x00FF; return (retval); } @@ -2892,26 +2756,29 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) /* * Extend a file system. 
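 * (In outline: grow the allocation bitmap file if the new block count
 * requires it, mark the added space free, update the volume header, and
 * invalidate the old alternate header, all in a single transaction with
 * the extents and allocation files held exclusive.)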
*/ -static int -hfs_extendfs(struct mount *mp, u_int64_t newsize, struct proc *p) +__private_extern__ +int +hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) { + struct proc *p = vfs_context_proc(context); + kauth_cred_t cred = vfs_context_ucred(context); struct vnode *vp; struct vnode *devvp; struct buf *bp; - struct hfsmount *hfsmp; struct filefork *fp = NULL; ExtendedVCB *vcb; struct cat_fork forkdata; u_int64_t oldsize; u_int64_t newblkcnt; + u_int64_t prev_phys_block_count; u_int32_t addblks; u_int64_t sectorcnt; u_int32_t sectorsize; - daddr_t prev_alt_sector; - daddr_t bitmapblks; + daddr64_t prev_alt_sector; + daddr_t bitmapblks; + int lockflags; int error; - hfsmp = VFSTOHFS(mp); devvp = hfsmp->hfs_devvp; vcb = HFSTOVCB(hfsmp); @@ -2929,44 +2796,43 @@ hfs_extendfs(struct mount *mp, u_int64_t newsize, struct proc *p) * If extending file system by non-root, then verify * ownership and check permissions. */ - if (p->p_ucred->cr_uid != 0) { - error = hfs_root(mp, &vp); + if (suser(cred, NULL)) { + error = hfs_vget(hfsmp, kHFSRootFolderID, &vp, 0); + if (error) return (error); - error = hfs_owner_rights(hfsmp, VTOC(vp)->c_uid, p->p_ucred, p, 0); + error = hfs_owner_rights(hfsmp, VTOC(vp)->c_uid, cred, p, 0); if (error == 0) { - error = hfs_write_access(vp, p->p_ucred, p, false); + error = hfs_write_access(vp, cred, p, false); } - vput(vp); + hfs_unlock(VTOC(vp)); + vnode_put(vp); if (error) return (error); - vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); - error = VOP_ACCESS(devvp, VREAD | VWRITE, p->p_ucred, p); - VOP_UNLOCK(devvp, 0, p); + error = vnode_authorize(devvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, context); if (error) return (error); } - if (VOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&sectorsize, 0, FSCRED, p)) { + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&sectorsize, 0, context)) { return (ENXIO); } if (sectorsize != hfsmp->hfs_phys_block_size) { return (ENXIO); } - if (VOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&sectorcnt, 0, FSCRED, p)) { + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&sectorcnt, 0, context)) { return (ENXIO); } if ((sectorsize * sectorcnt) < newsize) { printf("hfs_extendfs: not enough space on device\n"); return (ENOSPC); } - oldsize = (u_int64_t)hfsmp->hfs_phys_block_count * - (u_int64_t)hfsmp->hfs_phys_block_size; + oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; /* * Validate new size. */ - if ((newsize <= oldsize) || (newsize % vcb->blockSize)) { + if ((newsize <= oldsize) || (newsize % sectorsize)) { printf("hfs_extendfs: invalid size\n"); return (EINVAL); } @@ -2980,23 +2846,12 @@ hfs_extendfs(struct mount *mp, u_int64_t newsize, struct proc *p) /* * Enclose changes inside a transaction. */ - hfs_global_shared_lock_acquire(hfsmp); - if (journal_start_transaction(hfsmp->jnl) != 0) { - hfs_global_shared_lock_release(hfsmp); + if (hfs_start_transaction(hfsmp) != 0) { return (EINVAL); } - /* - * Remember the location of existing alternate VH. - */ - prev_alt_sector = (vcb->hfsPlusIOPosOffset / sectorsize) + - HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_phys_block_count); - + lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); vp = vcb->allocationsRefNum; - error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - if (error) { - goto out2; - } fp = VTOF(vp); bcopy(&fp->ff_data, &forkdata, sizeof(forkdata)); @@ -3004,13 +2859,13 @@ hfs_extendfs(struct mount *mp, u_int64_t newsize, struct proc *p) * Calculate additional space required (if any) by allocation bitmap.
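 * An illustrative calculation: growing to newblkcnt = 1048576 allocation
 * blocks needs 1048576/8 = 131072 bytes of bitmap; with a 4 KB blockSize
 * and a 4 KB vcbVBMIOSize that is roundup(131072, 4096)/4096 = 32 blocks,
 * of which only the shortfall beyond the fork's current ff_blocks is added.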
*/ bitmapblks = roundup(newblkcnt / 8, vcb->vcbVBMIOSize) / vcb->blockSize; - if (bitmapblks > fp->ff_blocks) + if (bitmapblks > (daddr_t)fp->ff_blocks) bitmapblks -= fp->ff_blocks; else bitmapblks = 0; if (bitmapblks > 0) { - daddr_t blkno; + daddr64_t blkno; daddr_t blkcnt; /* @@ -3022,7 +2877,7 @@ hfs_extendfs(struct mount *mp, u_int64_t newsize, struct proc *p) goto out; } blkcnt = bitmapblks; - blkno = fp->ff_blocks; + blkno = (daddr64_t)fp->ff_blocks; fp->ff_blocks += bitmapblks; fp->ff_size += (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize; VTOC(vp)->c_blocks = fp->ff_blocks; @@ -3033,16 +2888,16 @@ hfs_extendfs(struct mount *mp, u_int64_t newsize, struct proc *p) bp = NULL; while (blkcnt > 0) { - error = meta_bread(vp, blkno, vcb->blockSize, NOCRED, &bp); + error = (int)buf_meta_bread(vp, blkno, vcb->blockSize, NOCRED, &bp); if (error) { if (bp) { - brelse(bp); + buf_brelse(bp); } break; } - bzero((char *)bp->b_data, vcb->blockSize); - bp->b_flags |= B_AGE; - error = bwrite(bp); + bzero((char *)buf_dataptr(bp), vcb->blockSize); + buf_markaged(bp); + error = (int)buf_bwrite(bp); if (error) break; --blkcnt; @@ -3080,14 +2935,17 @@ hfs_extendfs(struct mount *mp, u_int64_t newsize, struct proc *p) (void) BlockMarkFree(vcb, vcb->totalBlocks - 2, 2); else (void) BlockMarkFree(vcb, vcb->totalBlocks - 1, 1); - /* * Adjust file system variables for new space. */ + prev_phys_block_count = hfsmp->hfs_phys_block_count; + prev_alt_sector = hfsmp->hfs_alt_id_sector; + vcb->totalBlocks += addblks; vcb->freeBlocks += addblks - bitmapblks; hfsmp->hfs_phys_block_count = newsize / sectorsize; - + hfsmp->hfs_alt_id_sector = (hfsmp->hfsPlusIOPosOffset / sectorsize) + + HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_phys_block_count); MarkVCBDirty(vcb); error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); if (error) { @@ -3098,7 +2956,8 @@ hfs_extendfs(struct mount *mp, u_int64_t newsize, struct proc *p) fp->ff_size -= (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize; vcb->totalBlocks -= addblks; vcb->freeBlocks -= addblks - bitmapblks; - hfsmp->hfs_phys_block_count = oldsize / sectorsize; + hfsmp->hfs_phys_block_count = prev_phys_block_count; + hfsmp->hfs_alt_id_sector = prev_alt_sector; MarkVCBDirty(vcb); if (vcb->blockSize == 512) (void) BlockMarkAllocated(vcb, vcb->totalBlocks - 2, 2); @@ -3110,13 +2969,17 @@ hfs_extendfs(struct mount *mp, u_int64_t newsize, struct proc *p) * Invalidate the old alternate volume header. */ bp = NULL; - if (meta_bread(hfsmp->hfs_devvp, prev_alt_sector, sectorsize, - NOCRED, &bp) == 0) { - journal_modify_block_start(hfsmp->jnl, bp); - bzero(bp->b_data + HFS_ALT_OFFSET(sectorsize), kMDBSize); - journal_modify_block_end(hfsmp->jnl, bp); - } else if (bp) { - brelse(bp); + if (prev_alt_sector) { + if (buf_meta_bread(hfsmp->hfs_devvp, prev_alt_sector, sectorsize, + NOCRED, &bp) == 0) { + journal_modify_block_start(hfsmp->jnl, bp); + + bzero((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(sectorsize), kMDBSize); + + journal_modify_block_end(hfsmp->jnl, bp); + } else if (bp) { + buf_brelse(bp); + } } out: if (error && fp) { @@ -3125,14 +2988,601 @@ out: VTOC(vp)->c_blocks = fp->ff_blocks; } - VOP_UNLOCK(vp, 0, p); -out2: - journal_end_transaction(hfsmp->jnl); - hfs_global_shared_lock_release(hfsmp); + hfs_systemfile_unlock(hfsmp, lockflags); + hfs_end_transaction(hfsmp); + + return (error); +} + +#define HFS_MIN_SIZE (32LL * 1024LL * 1024LL) + +/* + * Truncate a file system (while still mounted). 
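+ * Preconditions enforced below: journaled HFS Plus only, no embedded
+ * volumes, a 32 MB floor (HFS_MIN_SIZE), and no more than a quarter of
+ * the currently free blocks reclaimed in a single call.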
+ */ +__private_extern__ +int +hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, __unused vfs_context_t context) +{ + struct vnode* rvp = NULL; + struct buf *bp = NULL; + u_int64_t oldsize; + u_int32_t newblkcnt; + u_int32_t reclaimblks; + int lockflags = 0; + int transaction_begun = 0; + int error; + + /* + * Grab the root vnode to serialize with another hfs_truncatefs call. + */ + error = hfs_vget(hfsmp, kHFSRootFolderID, &rvp, 0); + if (error) { + return (error); + } + /* + * - HFS Plus file systems only. + * - Journaling must be enabled. + * - No embedded volumes. + */ + if ((hfsmp->hfs_flags & HFS_STANDARD) || + (hfsmp->jnl == NULL) || + (hfsmp->hfsPlusIOPosOffset != 0)) { + error = EPERM; + goto out; + } + oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; + newblkcnt = newsize / hfsmp->blockSize; + reclaimblks = hfsmp->totalBlocks - newblkcnt; + + /* Make sure new size is valid. */ + if ((newsize < HFS_MIN_SIZE) || + (newsize >= oldsize) || + (newsize % hfsmp->hfs_phys_block_size)) { + error = EINVAL; + goto out; + } + /* Make sure there's enough space to work with. */ + if (reclaimblks > (hfsmp->freeBlocks / 4)) { + error = ENOSPC; + goto out; + } + + printf("hfs_truncatefs: shrinking %s by %d blocks out of %d\n", + hfsmp->vcbVN, reclaimblks, hfsmp->totalBlocks); + + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out; + } + transaction_begun = 1; + + /* + * Look for files that have blocks beyond newblkcnt. + */ + if (hfs_isallocated(hfsmp, newblkcnt, reclaimblks - 1)) { + /* + * hfs_reclaimspace will use separate transactions when + * relocating files (so we don't overwhelm the journal). + */ + hfs_end_transaction(hfsmp); + transaction_begun = 0; + + /* Attempt to reclaim some space. */ + if (hfs_reclaimspace(hfsmp, newblkcnt) != 0) { + printf("hfs_truncatefs: couldn't reclaim space on %s\n", hfsmp->vcbVN); + error = ENOSPC; + goto out; + } + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out; + } + transaction_begun = 1; + + /* Check if we're clear now. */ + if (hfs_isallocated(hfsmp, newblkcnt, reclaimblks - 1)) { + printf("hfs_truncatefs: didn't reclaim enough space on %s\n", hfsmp->vcbVN); + error = ENOSPC; + goto out; + } + } + lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + /* + * Mark the old alternate volume header as free. + * We don't bother shrinking allocation bitmap file. + */ + if (hfsmp->blockSize == 512) + (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 2, 2); + else + (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 1, 1); + + /* + * Allocate last block for alternate volume header. + */ + if (hfsmp->blockSize == 512) + error = BlockMarkAllocated(hfsmp, newblkcnt - 2, 2); + else + error = BlockMarkAllocated(hfsmp, newblkcnt - 1, 1); + + if (error) { + goto out; + } + /* + * Invalidate the existing alternate volume header. + */ + if (hfsmp->hfs_alt_id_sector) { + if (buf_meta_bread(hfsmp->hfs_devvp, hfsmp->hfs_alt_id_sector, + hfsmp->hfs_phys_block_size, NOCRED, &bp) == 0) { + journal_modify_block_start(hfsmp->jnl, bp); + + bzero((void*)((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(hfsmp->hfs_phys_block_size)), kMDBSize); + + journal_modify_block_end(hfsmp->jnl, bp); + } else if (bp) { + buf_brelse(bp); + } + bp = NULL; + } + + /* + * Adjust file system variables and flush them to disk. 
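+ * The alternate volume header occupies the last 1024 bytes of the
+ * volume, so hfs_alt_id_sector is recomputed from the reduced physical
+ * block count before the header is flushed.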
+ */ + hfsmp->freeBlocks -= hfsmp->totalBlocks - newblkcnt; + hfsmp->totalBlocks = newblkcnt; + hfsmp->hfs_phys_block_count = newsize / hfsmp->hfs_phys_block_size; + hfsmp->hfs_alt_id_sector = HFS_ALT_SECTOR(hfsmp->hfs_phys_block_size, hfsmp->hfs_phys_block_count); + MarkVCBDirty(hfsmp); + error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); + if (error) + panic("hfs_truncatefs: unexpected error flushing volume header (%d)\n", error); +out: + if (lockflags) { + hfs_systemfile_unlock(hfsmp, lockflags); + } + if (transaction_begun) { + hfs_end_transaction(hfsmp); + } + if (rvp) { + hfs_unlock(VTOC(rvp)); + vnode_put(rvp); + } return (error); } +/* + * Reclaim space at the end of a file system. + */ +static int +hfs_reclaimspace(struct hfsmount *hfsmp, u_long startblk) +{ + struct vnode *vp = NULL; + FCB *fcb; + struct BTreeIterator * iterator = NULL; + struct FSBufferDescriptor btdata; + struct HFSPlusCatalogFile filerec; + u_int32_t saved_next_allocation; + cnid_t * cnidbufp; + size_t cnidbufsize; + int filecnt; + int maxfilecnt; + u_long block; + int lockflags; + int i; + int error; + + /* + * Check if Attributes file overlaps. + */ + if (hfsmp->hfs_attribute_vp) { + struct filefork *fp; + + fp = VTOF(hfsmp->hfs_attribute_vp); + for (i = 0; i < kHFSPlusExtentDensity; ++i) { + block = fp->ff_extents[i].startBlock + + fp->ff_extents[i].blockCount; + if (block >= startblk) { + printf("hfs_reclaimspace: Attributes file can't move\n"); + return (EPERM); + } + } + } + + /* For now we'll move a maximum of 16,384 files. */ + maxfilecnt = MIN(hfsmp->hfs_filecount, 16384); + cnidbufsize = maxfilecnt * sizeof(cnid_t); + if (kmem_alloc(kernel_map, (vm_offset_t *)&cnidbufp, cnidbufsize)) { + return (ENOMEM); + } + if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) { + kmem_free(kernel_map, (vm_offset_t)cnidbufp, cnidbufsize); + return (ENOMEM); + } + + saved_next_allocation = hfsmp->nextAllocation; + hfsmp->nextAllocation = hfsmp->hfs_metazone_start; + + fcb = VTOF(hfsmp->hfs_catalog_vp); + bzero(iterator, sizeof(*iterator)); + + btdata.bufferAddress = &filerec; + btdata.itemSize = sizeof(filerec); + btdata.itemCount = 1; + + /* Keep the Catalog file locked during iteration. */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + error = BTIterateRecord(fcb, kBTreeFirstRecord, iterator, NULL, NULL); + if (error) { + hfs_systemfile_unlock(hfsmp, lockflags); + goto out; + } + + /* + * Iterate over all the catalog records looking for files + * that overlap into the space we're trying to free up. + */ + for (filecnt = 0; filecnt < maxfilecnt; ) { + error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); + if (error) { + if (error == btNotFound) + error = 0; + break; + } + if (filerec.recordType != kHFSPlusFileRecord || + filerec.fileID == hfsmp->hfs_jnlfileid) + continue; + /* + * Check if either fork overlaps target space. + */ + for (i = 0; i < kHFSPlusExtentDensity; ++i) { + block = filerec.dataFork.extents[i].startBlock + + filerec.dataFork.extents[i].blockCount; + if (block >= startblk) { + if (filerec.fileID == hfsmp->hfs_jnlfileid) { + printf("hfs_reclaimspace: cannot move active journal\n"); + error = EPERM; + break; + } + cnidbufp[filecnt++] = filerec.fileID; + break; + } + block = filerec.resourceFork.extents[i].startBlock + + filerec.resourceFork.extents[i].blockCount; + if (block >= startblk) { + cnidbufp[filecnt++] = filerec.fileID; + break; + } + } + } + /* All done with catalog. 
*/ + hfs_systemfile_unlock(hfsmp, lockflags); + if (error) + goto out; + + /* Now move any files that are in the way. */ + for (i = 0; i < filecnt; ++i) { + struct vnode * rvp; + + if (hfs_vget(hfsmp, cnidbufp[i], &vp, 0) != 0) + continue; + + /* Relocate any data fork blocks. */ + if (VTOF(vp)->ff_blocks > 0) { + error = hfs_relocate(vp, hfsmp->hfs_metazone_end + 1, kauth_cred_get(), current_proc()); + } + hfs_unlock(VTOC(vp)); + if (error) + break; + + /* Relocate any resource fork blocks. */ + if ((VTOC((vp))->c_blocks - VTOF((vp))->ff_blocks) > 0) { + error = hfs_vgetrsrc(hfsmp, vp, &rvp, current_proc()); + if (error) + break; + hfs_lock(VTOC(rvp), HFS_EXCLUSIVE_LOCK); + error = hfs_relocate(rvp, hfsmp->hfs_metazone_end + 1, kauth_cred_get(), current_proc()); + hfs_unlock(VTOC(rvp)); + vnode_put(rvp); + if (error) + break; + } + vnode_put(vp); + vp = NULL; + } + if (vp) { + vnode_put(vp); + vp = NULL; + } + + /* + * Note: this implementation doesn't handle overflow extents. + */ +out: + kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); + kmem_free(kernel_map, (vm_offset_t)cnidbufp, cnidbufsize); + + /* On errors restore the roving allocation pointer. */ + if (error) { + hfsmp->nextAllocation = saved_next_allocation; + } + return (error); +} + + +/* + * Get file system attributes. + */ +static int +hfs_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t context) +{ + ExtendedVCB *vcb = VFSTOVCB(mp); + struct hfsmount *hfsmp = VFSTOHFS(mp); + u_long freeCNIDs; + + freeCNIDs = (u_long)0xFFFFFFFF - (u_long)hfsmp->vcbNxtCNID; + + VFSATTR_RETURN(fsap, f_objcount, (uint64_t)hfsmp->vcbFilCnt + (uint64_t)hfsmp->vcbDirCnt); + VFSATTR_RETURN(fsap, f_filecount, (uint64_t)hfsmp->vcbFilCnt); + VFSATTR_RETURN(fsap, f_dircount, (uint64_t)hfsmp->vcbDirCnt); + VFSATTR_RETURN(fsap, f_maxobjcount, (uint64_t)0xFFFFFFFF); + VFSATTR_RETURN(fsap, f_iosize, (size_t)(MAX_UPL_TRANSFER * PAGE_SIZE)); + VFSATTR_RETURN(fsap, f_blocks, (uint64_t)hfsmp->totalBlocks); + VFSATTR_RETURN(fsap, f_bfree, (uint64_t)hfs_freeblks(hfsmp, 0)); + VFSATTR_RETURN(fsap, f_bavail, (uint64_t)hfs_freeblks(hfsmp, 1)); + VFSATTR_RETURN(fsap, f_bsize, (uint32_t)vcb->blockSize); + /* XXX needs clarification */ + VFSATTR_RETURN(fsap, f_bused, hfsmp->totalBlocks - hfs_freeblks(hfsmp, 1)); + /* Maximum files is constrained by total blocks. */ + VFSATTR_RETURN(fsap, f_files, (uint64_t)(hfsmp->totalBlocks - 2)); + VFSATTR_RETURN(fsap, f_ffree, MIN((uint64_t)freeCNIDs, (uint64_t)hfs_freeblks(hfsmp, 1))); + + fsap->f_fsid.val[0] = hfsmp->hfs_raw_dev; + fsap->f_fsid.val[1] = vfs_typenum(mp); + VFSATTR_SET_SUPPORTED(fsap, f_fsid); + + VFSATTR_RETURN(fsap, f_signature, vcb->vcbSigWord); + VFSATTR_RETURN(fsap, f_carbon_fsid, 0); + + if (VFSATTR_IS_ACTIVE(fsap, f_capabilities)) { + vol_capabilities_attr_t *cap; + + cap = &fsap->f_capabilities; + + if (hfsmp->hfs_flags & HFS_STANDARD) { + cap->capabilities[VOL_CAPABILITIES_FORMAT] = + VOL_CAP_FMT_PERSISTENTOBJECTIDS | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS; + } else { + cap->capabilities[VOL_CAPABILITIES_FORMAT] = + VOL_CAP_FMT_PERSISTENTOBJECTIDS | + VOL_CAP_FMT_SYMBOLICLINKS | + VOL_CAP_FMT_HARDLINKS | + VOL_CAP_FMT_JOURNAL | + (hfsmp->jnl ? VOL_CAP_FMT_JOURNAL_ACTIVE : 0) | + (hfsmp->hfs_flags & HFS_CASE_SENSITIVE ? 
VOL_CAP_FMT_CASE_SENSITIVE : 0) | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS | + VOL_CAP_FMT_2TB_FILESIZE; + } + cap->capabilities[VOL_CAPABILITIES_INTERFACES] = + VOL_CAP_INT_SEARCHFS | + VOL_CAP_INT_ATTRLIST | + VOL_CAP_INT_NFSEXPORT | + VOL_CAP_INT_READDIRATTR | + VOL_CAP_INT_EXCHANGEDATA | + VOL_CAP_INT_ALLOCATE | + VOL_CAP_INT_VOL_RENAME | + VOL_CAP_INT_ADVLOCK | + VOL_CAP_INT_FLOCK; + cap->capabilities[VOL_CAPABILITIES_RESERVED1] = 0; + cap->capabilities[VOL_CAPABILITIES_RESERVED2] = 0; + + cap->valid[VOL_CAPABILITIES_FORMAT] = + VOL_CAP_FMT_PERSISTENTOBJECTIDS | + VOL_CAP_FMT_SYMBOLICLINKS | + VOL_CAP_FMT_HARDLINKS | + VOL_CAP_FMT_JOURNAL | + VOL_CAP_FMT_JOURNAL_ACTIVE | + VOL_CAP_FMT_NO_ROOT_TIMES | + VOL_CAP_FMT_SPARSE_FILES | + VOL_CAP_FMT_ZERO_RUNS | + VOL_CAP_FMT_CASE_SENSITIVE | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS | + VOL_CAP_FMT_2TB_FILESIZE; + cap->valid[VOL_CAPABILITIES_INTERFACES] = + VOL_CAP_INT_SEARCHFS | + VOL_CAP_INT_ATTRLIST | + VOL_CAP_INT_NFSEXPORT | + VOL_CAP_INT_READDIRATTR | + VOL_CAP_INT_EXCHANGEDATA | + VOL_CAP_INT_COPYFILE | + VOL_CAP_INT_ALLOCATE | + VOL_CAP_INT_VOL_RENAME | + VOL_CAP_INT_ADVLOCK | + VOL_CAP_INT_FLOCK; + cap->valid[VOL_CAPABILITIES_RESERVED1] = 0; + cap->valid[VOL_CAPABILITIES_RESERVED2] = 0; + VFSATTR_SET_SUPPORTED(fsap, f_capabilities); + } + if (VFSATTR_IS_ACTIVE(fsap, f_attributes)) { + vol_attributes_attr_t *attrp = &fsap->f_attributes; + + attrp->validattr.commonattr = ATTR_CMN_VALIDMASK; + attrp->validattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO; + attrp->validattr.dirattr = ATTR_DIR_VALIDMASK; + attrp->validattr.fileattr = ATTR_FILE_VALIDMASK; + attrp->validattr.forkattr = 0; + + attrp->nativeattr.commonattr = ATTR_CMN_VALIDMASK; + attrp->nativeattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO; + attrp->nativeattr.dirattr = ATTR_DIR_VALIDMASK; + attrp->nativeattr.fileattr = ATTR_FILE_VALIDMASK; + attrp->nativeattr.forkattr = 0; + VFSATTR_SET_SUPPORTED(fsap, f_attributes); + } + fsap->f_create_time.tv_sec = hfsmp->vcbCrDate; + fsap->f_create_time.tv_nsec = 0; + VFSATTR_SET_SUPPORTED(fsap, f_create_time); + fsap->f_modify_time.tv_sec = hfsmp->vcbLsMod; + fsap->f_modify_time.tv_nsec = 0; + VFSATTR_SET_SUPPORTED(fsap, f_modify_time); + + fsap->f_backup_time.tv_sec = hfsmp->vcbVolBkUp; + fsap->f_backup_time.tv_nsec = 0; + VFSATTR_SET_SUPPORTED(fsap, f_backup_time); + if (VFSATTR_IS_ACTIVE(fsap, f_fssubtype)) { + uint16_t subtype = 0; + + /* + * Subtypes (flavors) for HFS + * 0: Mac OS Extended + * 1: Mac OS Extended (Journaled) + * 2: Mac OS Extended (Case Sensitive) + * 3: Mac OS Extended (Case Sensitive, Journaled) + * 4 - 127: Reserved + * 128: Mac OS Standard + * + */ + if (hfsmp->hfs_flags & HFS_STANDARD) { + subtype = HFS_SUBTYPE_STANDARDHFS; + } else /* HFS Plus */ { + if (hfsmp->jnl) + subtype |= HFS_SUBTYPE_JOURNALED; + if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE) + subtype |= HFS_SUBTYPE_CASESENSITIVE; + } + fsap->f_fssubtype = subtype; + VFSATTR_SET_SUPPORTED(fsap, f_fssubtype); + } + + if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) { + strncpy(fsap->f_vol_name, hfsmp->vcbVN, MAXPATHLEN); + fsap->f_vol_name[MAXPATHLEN - 1] = 0; + VFSATTR_SET_SUPPORTED(fsap, f_vol_name); + } + return (0); +} + +/* + * Perform a volume rename. Requires the FS' root vp. 
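+ * The rename is carried out as a cat_rename() of the root folder's
+ * catalog entry inside a transaction; on success the new name is also
+ * copied into vcbVN and the volume header is flushed so it persists.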
+ */ +static int +hfs_rename_volume(struct vnode *vp, const char *name, proc_t p) +{ + ExtendedVCB *vcb = VTOVCB(vp); + struct cnode *cp = VTOC(vp); + struct hfsmount *hfsmp = VTOHFS(vp); + struct cat_desc to_desc; + struct cat_desc todir_desc; + struct cat_desc new_desc; + cat_cookie_t cookie; + int lockflags; + int error = 0; + + /* + * Ignore attempts to rename a volume to a zero-length name. + */ + if (name[0] == 0) + return(0); + + bzero(&to_desc, sizeof(to_desc)); + bzero(&todir_desc, sizeof(todir_desc)); + bzero(&new_desc, sizeof(new_desc)); + bzero(&cookie, sizeof(cookie)); + + todir_desc.cd_parentcnid = kHFSRootParentID; + todir_desc.cd_cnid = kHFSRootFolderID; + todir_desc.cd_flags = CD_ISDIR; + + to_desc.cd_nameptr = name; + to_desc.cd_namelen = strlen(name); + to_desc.cd_parentcnid = kHFSRootParentID; + to_desc.cd_cnid = cp->c_cnid; + to_desc.cd_flags = CD_ISDIR; + + if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK)) == 0) { + if ((error = hfs_start_transaction(hfsmp)) == 0) { + if ((error = cat_preflight(hfsmp, CAT_RENAME, &cookie, p)) == 0) { + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + + error = cat_rename(hfsmp, &cp->c_desc, &todir_desc, &to_desc, &new_desc); + + /* + * If successful, update the name in the VCB, ensure it's terminated. + */ + if (!error) { + strncpy(vcb->vcbVN, name, sizeof(vcb->vcbVN)); + vcb->vcbVN[sizeof(vcb->vcbVN) - 1] = 0; + } + + hfs_systemfile_unlock(hfsmp, lockflags); + cat_postflight(hfsmp, &cookie, p); + + if (error) + vcb->vcbFlags |= 0xFF00; + (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); + } + hfs_end_transaction(hfsmp); + } + if (!error) { + /* Release old allocated name buffer */ + if (cp->c_desc.cd_flags & CD_HASBUF) { + char *name = cp->c_desc.cd_nameptr; + + cp->c_desc.cd_nameptr = 0; + cp->c_desc.cd_namelen = 0; + cp->c_desc.cd_flags &= ~CD_HASBUF; + vfs_removename(name); + } + /* Update cnode's catalog descriptor */ + replace_desc(cp, &new_desc); + vcb->volumeNameEncodingHint = new_desc.cd_encoding; + cp->c_touch_chgtime = TRUE; + } + + hfs_unlock(cp); + } + + return(error); +} + +/* + * Get file system attributes. + */ +static int +hfs_vfs_setattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t context) +{ + kauth_cred_t cred = vfs_context_ucred(context); + int error = 0; + + /* + * Must be superuser or owner of filesystem to change volume attributes + */ + if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(mp)->f_owner)) + return(EACCES); + + if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) { + vnode_t root_vp; + + error = hfs_vfs_root(mp, &root_vp, context); + if (error) + goto out; + + error = hfs_rename_volume(root_vp, fsap->f_vol_name, vfs_context_proc(context)); + (void) vnode_put(root_vp); + if (error) + goto out; + + VFSATTR_SET_SUPPORTED(fsap, f_vol_name); + } + +out: + return error; +} + /* * hfs vfs operations. @@ -3141,13 +3591,14 @@ struct vfsops hfs_vfsops = { hfs_mount, hfs_start, hfs_unmount, - hfs_root, + hfs_vfs_root, hfs_quotactl, - hfs_statfs, + hfs_vfs_getattr, /* was hfs_statfs */ hfs_sync, - hfs_vget, + hfs_vfs_vget, hfs_fhtovp, hfs_vptofh, hfs_init, - hfs_sysctl + hfs_sysctl, + hfs_vfs_setattr }; diff --git a/bsd/hfs/hfs_vfsutils.c b/bsd/hfs/hfs_vfsutils.c index 68263e6b4..c35236e69 100644 --- a/bsd/hfs/hfs_vfsutils.c +++ b/bsd/hfs/hfs_vfsutils.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -32,11 +32,11 @@ #include <sys/malloc.h> #include <sys/stat.h> #include <sys/mount.h> -#include <sys/namei.h> -#include <sys/lock.h> #include <sys/buf.h> #include <sys/ubc.h> #include <sys/unistd.h> +#include <sys/utfconv.h> +#include <sys/kauth.h> #include "hfs.h" #include "hfs_catalog.h" @@ -50,7 +50,7 @@ #include "hfscommon/headers/HFSUnicodeWrappers.h" -extern int count_lock_queue __P((void)); +extern int count_lock_queue(void); static void ReleaseMetaFileVNode(struct vnode *vp); @@ -63,11 +63,8 @@ static u_int32_t hfs_hotfile_freeblocks(struct hfsmount *); u_int32_t GetLogicalBlockSize(struct vnode *vp); -/* BTree accessor routines */ -extern OSStatus GetBTreeBlock(FileReference vp, UInt32 blockNum, GetBlockOptions options, BlockDescriptor *block); -extern OSStatus SetBTreeBlockSize(FileReference vp, ByteCount blockSize, ItemCount minBlockCount); -extern OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF); -extern OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlockOptions options); +extern int hfs_attrkeycompare(HFSPlusAttrKey *searchKey, HFSPlusAttrKey *trialKey); + //******************************************************************************* // Note: Finder information in the HFS/HFS+ metadata are considered opaque and @@ -83,6 +80,7 @@ extern OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, Relea char hfs_catname[] = "Catalog B-tree"; char hfs_extname[] = "Extents B-tree"; char hfs_vbmname[] = "Volume Bitmap"; +char hfs_attrname[] = "Attribute B-tree"; char hfs_privdirname[] = "\xE2\x90\x80\xE2\x90\x80\xE2\x90\x80\xE2\x90\x80HFS+ Private Data"; @@ -149,10 +147,11 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, hfsmp->hfs_logBlockSize = BestBlockSizeFit(vcb->blockSize, MAXBSIZE, hfsmp->hfs_phys_block_size); vcb->vcbVBMIOSize = kHFSBlockSize; - VCB_LOCK_INIT(vcb); + hfsmp->hfs_alt_id_sector = HFS_ALT_SECTOR(hfsmp->hfs_phys_block_size, + hfsmp->hfs_phys_block_count); bzero(&cndesc, sizeof(cndesc)); - cndesc.cd_parentcnid = kRootParID; + cndesc.cd_parentcnid = kHFSRootParentID; cndesc.cd_flags |= CD_ISMETA; bzero(&cnattr, sizeof(cnattr)); cnattr.ca_nlink = 1; @@ -177,13 +176,13 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, fork.cf_extents[2].blockCount = SWAP_BE16(mdb->drXTExtRec[2].blockCount); cnattr.ca_blocks = fork.cf_blocks; - error = hfs_getnewvnode(hfsmp, NULL, &cndesc, 0, &cnattr, &fork, - &vcb->extentsRefNum); + error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &fork, + &hfsmp->hfs_extents_vp); if (error) goto MtVolErr; - error = MacToVFSError(BTOpenPath(VTOF(vcb->extentsRefNum), + error = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_extents_vp), (KeyCompareProcPtr)CompareExtentKeys)); if (error) { - VOP_UNLOCK(vcb->extentsRefNum, 0, p); + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); goto MtVolErr; } @@ -205,17 +204,34 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, fork.cf_extents[2].blockCount = SWAP_BE16(mdb->drCTExtRec[2].blockCount); cnattr.ca_blocks = fork.cf_blocks; - error = hfs_getnewvnode(hfsmp, NULL, &cndesc, 0, &cnattr, &fork, - &vcb->catalogRefNum); + error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &fork, + &hfsmp->hfs_catalog_vp); if (error) { - VOP_UNLOCK(vcb->extentsRefNum, 0, p); + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); goto MtVolErr; } - error = MacToVFSError(BTOpenPath(VTOF(vcb->catalogRefNum), + error = 
MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_catalog_vp), (KeyCompareProcPtr)CompareCatalogKeys)); if (error) { - VOP_UNLOCK(vcb->catalogRefNum, 0, p); - VOP_UNLOCK(vcb->extentsRefNum, 0, p); + hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); + goto MtVolErr; + } + + /* + * Set up dummy Allocation file vnode (used only for locking bitmap) + */ + cndesc.cd_nameptr = hfs_vbmname; + cndesc.cd_namelen = strlen(hfs_vbmname); + cndesc.cd_cnid = cnattr.ca_fileid = kHFSAllocationFileID; + bzero(&fork, sizeof(fork)); + cnattr.ca_blocks = 0; + + error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &fork, + &hfsmp->hfs_allocation_vp); + if (error) { + hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); goto MtVolErr; } @@ -223,10 +239,11 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, vcb->vcbAtrb &= ~kHFSVolumeUnmountedMask; /* - * all done with b-trees so we can unlock now... + * all done with system files so we can unlock now... */ - VOP_UNLOCK(vcb->catalogRefNum, 0, p); - VOP_UNLOCK(vcb->extentsRefNum, 0, p); + hfs_unlock(VTOC(hfsmp->hfs_allocation_vp)); + hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); if ( error == noErr ) { @@ -239,8 +256,8 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, //-- Release any resources allocated so far before exiting with an error: MtVolErr: - ReleaseMetaFileVNode(vcb->catalogRefNum); - ReleaseMetaFileVNode(vcb->extentsRefNum); + ReleaseMetaFileVNode(hfsmp->hfs_catalog_vp); + ReleaseMetaFileVNode(hfsmp->hfs_extents_vp); CmdDone: return (error); @@ -254,14 +271,14 @@ CmdDone: __private_extern__ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, - off_t embeddedOffset, u_int64_t disksize, struct proc *p, void *args) + off_t embeddedOffset, u_int64_t disksize, struct proc *p, void *args, kauth_cred_t cred) { register ExtendedVCB *vcb; struct cat_desc cndesc; struct cat_attr cnattr; struct cat_fork cfork; UInt32 blockSize; - u_int64_t volumesize; + daddr64_t spare_sectors; struct BTreeInfoRec btinfo; u_int16_t signature; u_int16_t version; @@ -285,7 +302,9 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, signature = kHFSPlusSigWord; hfsmp->hfs_flags |= HFS_X; } else { - printf("hfs_mount: invalid HFS+ sig 0x%04x\n", signature); + /* Removed printf for invalid HFS+ signature because it gives + * false error for UFS root volume + */ return (EINVAL); } @@ -314,7 +333,7 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, vcb->vcbSigWord = signature; vcb->vcbJinfoBlock = SWAP_BE32(vhp->journalInfoBlock); vcb->vcbLsMod = to_bsd_time(SWAP_BE32(vhp->modifyDate)); - vcb->vcbAtrb = (UInt16)SWAP_BE32(vhp->attributes); + vcb->vcbAtrb = SWAP_BE32(vhp->attributes); vcb->vcbClpSiz = SWAP_BE32(vhp->rsrcClumpSize); vcb->vcbNxtCNID = SWAP_BE32(vhp->nextCatalogID); vcb->vcbVolBkUp = to_bsd_time(SWAP_BE32(vhp->backupDate)); @@ -329,8 +348,6 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) vcb->vcbWrCnt++; /* compensate for write of Volume Header on last flush */ - VCB_LOCK_INIT(vcb); - /* Now fill in the Extended VCB info */ vcb->nextAllocation = SWAP_BE32(vhp->nextAllocation); vcb->totalBlocks = SWAP_BE32(vhp->totalBlocks); @@ -352,8 +369,23 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, hfsmp->hfs_logBlockSize = 
BestBlockSizeFit(vcb->blockSize, MAXBSIZE, hfsmp->hfs_phys_block_size); vcb->vcbVBMIOSize = min(vcb->blockSize, MAXPHYSIO); + /* + * Validate and initialize the location of the alternate volume header. + */ + spare_sectors = hfsmp->hfs_phys_block_count - + (((daddr64_t)vcb->totalBlocks * blockSize) / + hfsmp->hfs_phys_block_size); + + if (spare_sectors > (blockSize / hfsmp->hfs_phys_block_size)) { + hfsmp->hfs_alt_id_sector = 0; /* partition has grown! */ + } else { + hfsmp->hfs_alt_id_sector = (hfsmp->hfsPlusIOPosOffset / hfsmp->hfs_phys_block_size) + + HFS_ALT_SECTOR(hfsmp->hfs_phys_block_size, + hfsmp->hfs_phys_block_count); + } + bzero(&cndesc, sizeof(cndesc)); - cndesc.cd_parentcnid = kRootParID; + cndesc.cd_parentcnid = kHFSRootParentID; cndesc.cd_flags |= CD_ISMETA; bzero(&cnattr, sizeof(cnattr)); cnattr.ca_nlink = 1; @@ -377,14 +409,14 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, cfork.cf_extents[i].blockCount = SWAP_BE32 (vhp->extentsFile.extents[i].blockCount); } - retval = hfs_getnewvnode(hfsmp, NULL, &cndesc, 0, &cnattr, &cfork, - &vcb->extentsRefNum); + retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, + &hfsmp->hfs_extents_vp); if (retval) goto ErrorExit; - retval = MacToVFSError(BTOpenPath(VTOF(vcb->extentsRefNum), + retval = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_extents_vp), (KeyCompareProcPtr) CompareExtentKeysPlus)); if (retval) { - VOP_UNLOCK(vcb->extentsRefNum, 0, p); + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); goto ErrorExit; } @@ -406,25 +438,25 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, cfork.cf_extents[i].blockCount = SWAP_BE32 (vhp->catalogFile.extents[i].blockCount); } - retval = hfs_getnewvnode(hfsmp, NULL, &cndesc, 0, &cnattr, &cfork, - &vcb->catalogRefNum); + retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, + &hfsmp->hfs_catalog_vp); if (retval) { - VOP_UNLOCK(vcb->extentsRefNum, 0, p); + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); goto ErrorExit; } - retval = MacToVFSError(BTOpenPath(VTOF(vcb->catalogRefNum), + retval = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_catalog_vp), (KeyCompareProcPtr) CompareExtendedCatalogKeys)); if (retval) { - VOP_UNLOCK(vcb->catalogRefNum, 0, p); - VOP_UNLOCK(vcb->extentsRefNum, 0, p); + hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); goto ErrorExit; } if ((hfsmp->hfs_flags & HFS_X) && - BTGetInformation(VTOF(vcb->catalogRefNum), 0, &btinfo) == 0) { + BTGetInformation(VTOF(hfsmp->hfs_catalog_vp), 0, &btinfo) == 0) { if (btinfo.keyCompareType == kHFSBinaryCompare) { hfsmp->hfs_flags |= HFS_CASE_SENSITIVE; /* Install a case-sensitive key compare */ - (void) BTOpenPath(VTOF(vcb->catalogRefNum), + (void) BTOpenPath(VTOF(hfsmp->hfs_catalog_vp), (KeyCompareProcPtr)cat_binarykeycompare); } } @@ -447,20 +479,59 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, cfork.cf_extents[i].blockCount = SWAP_BE32 (vhp->allocationFile.extents[i].blockCount); } - retval = hfs_getnewvnode(hfsmp, NULL, &cndesc, 0, &cnattr, &cfork, - &vcb->allocationsRefNum); + retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, + &hfsmp->hfs_allocation_vp); if (retval) { - VOP_UNLOCK(vcb->catalogRefNum, 0, p); - VOP_UNLOCK(vcb->extentsRefNum, 0, p); + hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); goto ErrorExit; } + /* + * Set up Attribute B-tree vnode + */ + if (vhp->attributesFile.totalBlocks != 0) { + cndesc.cd_nameptr = 
hfs_attrname; + cndesc.cd_namelen = strlen(hfs_attrname); + cndesc.cd_cnid = cnattr.ca_fileid = kHFSAttributesFileID; + + cfork.cf_size = SWAP_BE64 (vhp->attributesFile.logicalSize); + cfork.cf_clump = SWAP_BE32 (vhp->attributesFile.clumpSize); + cfork.cf_blocks = SWAP_BE32 (vhp->attributesFile.totalBlocks); + cfork.cf_vblocks = 0; + cnattr.ca_blocks = cfork.cf_blocks; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + cfork.cf_extents[i].startBlock = + SWAP_BE32 (vhp->attributesFile.extents[i].startBlock); + cfork.cf_extents[i].blockCount = + SWAP_BE32 (vhp->attributesFile.extents[i].blockCount); + } + retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, + &hfsmp->hfs_attribute_vp); + if (retval) { + hfs_unlock(VTOC(hfsmp->hfs_allocation_vp)); + hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); + goto ErrorExit; + } + retval = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_attribute_vp), + (KeyCompareProcPtr) hfs_attrkeycompare)); + if (retval) { + hfs_unlock(VTOC(hfsmp->hfs_attribute_vp)); + hfs_unlock(VTOC(hfsmp->hfs_allocation_vp)); + hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); + goto ErrorExit; + } + } + + /* Pick up volume name and create date */ retval = cat_idlookup(hfsmp, kHFSRootFolderID, &cndesc, &cnattr, NULL); if (retval) { - VOP_UNLOCK(vcb->allocationsRefNum, 0, p); - VOP_UNLOCK(vcb->catalogRefNum, 0, p); - VOP_UNLOCK(vcb->extentsRefNum, 0, p); + hfs_unlock(VTOC(hfsmp->hfs_allocation_vp)); + hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); goto ErrorExit; } vcb->vcbCrDate = cnattr.ca_itime; @@ -471,15 +542,17 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, /* mark the volume dirty (clear clean unmount bit) */ vcb->vcbAtrb &= ~kHFSVolumeUnmountedMask; if (hfsmp->jnl && (hfsmp->hfs_flags & HFS_READ_ONLY) == 0) { - hfs_flushvolumeheader(hfsmp, TRUE, TRUE); + hfs_flushvolumeheader(hfsmp, TRUE, 0); } /* * all done with metadata files so we can unlock now... */ - VOP_UNLOCK(vcb->allocationsRefNum, 0, p); - VOP_UNLOCK(vcb->catalogRefNum, 0, p); - VOP_UNLOCK(vcb->extentsRefNum, 0, p); + if (hfsmp->hfs_attribute_vp) + hfs_unlock(VTOC(hfsmp->hfs_attribute_vp)); + hfs_unlock(VTOC(hfsmp->hfs_allocation_vp)); + hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); // // Check if we need to do late journal initialization. This only @@ -494,9 +567,42 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, retval = hfs_late_journal_init(hfsmp, vhp, args); if (retval != 0) { hfsmp->jnl = NULL; + + // if the journal failed to open, then set the lastMountedVersion + // to be "FSK!" which fsck_hfs will see and force the fsck instead + // of just bailing out because the volume is journaled. + if (!(hfsmp->hfs_flags & HFS_READ_ONLY)) { + HFSPlusVolumeHeader *jvhp; + daddr64_t mdb_offset; + struct buf *bp = NULL; + + hfsmp->hfs_flags |= HFS_NEED_JNL_RESET; + + mdb_offset = (daddr64_t)((embeddedOffset / blockSize) + HFS_PRI_SECTOR(blockSize)); + + retval = (int)buf_meta_bread(hfsmp->hfs_devvp, mdb_offset, blockSize, cred, &bp); + if (retval == 0) { + jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(blockSize)); + + if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) { + printf ("hfs(3): Journal replay fail. 
Writing lastMountVersion as FSK!\n"); + jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion); + buf_bwrite(bp); + } else { + buf_brelse(bp); + } + bp = NULL; + } else if (bp) { + buf_brelse(bp); + // clear this so the error exit path won't try to use it + bp = NULL; + } + } + + retval = EINVAL; goto ErrorExit; } else if (hfsmp->jnl) { - hfsmp->hfs_mp->mnt_flag |= MNT_JOURNALED; + vfs_setflags(hfsmp->hfs_mp, (uint64_t)((unsigned int)MNT_JOURNALED)); } } else if (hfsmp->jnl) { struct cat_attr jinfo_attr, jnl_attr; @@ -529,7 +635,7 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, /* setup private/hidden directory for unlinked files */ FindMetaDataDirectory(vcb); - if (hfsmp->jnl && ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0)) + if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) hfs_remove_orphans(hfsmp); if ( !(vcb->vcbAtrb & kHFSVolumeHardwareLockMask) ) // if the disk is not write protected @@ -537,27 +643,28 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, MarkVCBDirty( vcb ); // mark VCB dirty so it will be written } - /* * Allow hot file clustering if conditions allow. */ if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) && ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0)) { - (void) hfs_recording_init(hfsmp, p); + (void) hfs_recording_init(hfsmp); } + hfs_checkextendedsecurity(hfsmp); + return (0); ErrorExit: /* - * A fatal error occured and the volume cannot be mounted + * A fatal error occurred and the volume cannot be mounted * release any resources that we aquired... */ - - InvalidateCatalogCache(vcb); - ReleaseMetaFileVNode(vcb->allocationsRefNum); - ReleaseMetaFileVNode(vcb->catalogRefNum); - ReleaseMetaFileVNode(vcb->extentsRefNum); + if (hfsmp->hfs_attribute_vp) + ReleaseMetaFileVNode(hfsmp->hfs_attribute_vp); + ReleaseMetaFileVNode(hfsmp->hfs_allocation_vp); + ReleaseMetaFileVNode(hfsmp->hfs_catalog_vp); + ReleaseMetaFileVNode(hfsmp->hfs_extents_vp); return (retval); } @@ -573,12 +680,15 @@ static void ReleaseMetaFileVNode(struct vnode *vp) struct filefork *fp; if (vp && (fp = VTOF(vp))) { - if (fp->fcbBTCBPtr != NULL) + if (fp->fcbBTCBPtr != NULL) { + (void)hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK); (void) BTClosePath(fp); + hfs_unlock(VTOC(vp)); + } /* release the node even if BTClosePath fails */ - vrele(vp); - vgone(vp); + vnode_recycle(vp); + vnode_put(vp); } } @@ -594,28 +704,21 @@ __private_extern__ int hfsUnmount( register struct hfsmount *hfsmp, struct proc *p) { - ExtendedVCB *vcb = HFSTOVCB(hfsmp); - int retval = E_NONE; - - InvalidateCatalogCache( vcb ); + if (hfsmp->hfs_allocation_vp) + ReleaseMetaFileVNode(hfsmp->hfs_allocation_vp); - if (hfsmp->hfc_filevp) { - ReleaseMetaFileVNode(hfsmp->hfc_filevp); - hfsmp->hfc_filevp = NULL; - } - - if (vcb->vcbSigWord == kHFSPlusSigWord) - ReleaseMetaFileVNode(vcb->allocationsRefNum); + if (hfsmp->hfs_attribute_vp) + ReleaseMetaFileVNode(hfsmp->hfs_attribute_vp); - ReleaseMetaFileVNode(vcb->catalogRefNum); - ReleaseMetaFileVNode(vcb->extentsRefNum); + ReleaseMetaFileVNode(hfsmp->hfs_catalog_vp); + ReleaseMetaFileVNode(hfsmp->hfs_extents_vp); - return (retval); + return (0); } /* - * Test is fork has overflow extents. + * Test if fork has overflow extents. */ __private_extern__ int @@ -649,55 +752,128 @@ overflow_extents(struct filefork *fp) /* - * Lock/Unlock a metadata file. + * Lock HFS system file(s). 
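+ *
+ * Callers pass in a combination of SFL_* flags and must hand the
+ * *returned* flags to hfs_systemfile_unlock(), since this routine can
+ * take more locks than asked for (or fewer).  A minimal usage sketch,
+ * mirroring the catalog lookups later in this file (desc and attr stand
+ * in for the caller's catalog descriptor and attributes):
+ *
+ *	int lockflags;
+ *
+ *	lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
+ *	error = cat_lookup(hfsmp, &desc, 0, NULL, &attr, NULL, NULL);
+ *	hfs_systemfile_unlock(hfsmp, lockflags);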
*/ __private_extern__ int -hfs_metafilelocking(struct hfsmount *hfsmp, u_long fileID, u_int flags, struct proc *p) +hfs_systemfile_lock(struct hfsmount *hfsmp, int flags, enum hfslocktype locktype) { - ExtendedVCB *vcb; - struct vnode *vp = NULL; - int numOfLockedBuffs; - int retval = 0; - - vcb = HFSTOVCB(hfsmp); - - switch (fileID) { - case kHFSExtentsFileID: - vp = vcb->extentsRefNum; - break; + if (flags & ~SFL_VALIDMASK) + panic("hfs_systemfile_lock: invalid lock request (0x%x)", (unsigned long) flags); + /* + * Locking order is Catalog file, Attributes file, Bitmap file, Extents file + */ + if (flags & SFL_CATALOG) { + (void) hfs_lock(VTOC(hfsmp->hfs_catalog_vp), locktype); + /* + * When the catalog file has overflow extents then + * also acquire the extents b-tree lock if it's not + * already requested. + */ + if ((flags & SFL_EXTENTS) == 0 && + overflow_extents(VTOF(hfsmp->hfs_catalog_vp))) { + flags |= SFL_EXTENTS; + } + } + if (flags & SFL_ATTRIBUTE) { + if (hfsmp->hfs_attribute_vp) { + (void) hfs_lock(VTOC(hfsmp->hfs_attribute_vp), locktype); + /* + * When the attribute file has overflow extents then + * also acquire the extents b-tree lock if it's not + * already requested. + */ + if ((flags & SFL_EXTENTS) == 0 && + overflow_extents(VTOF(hfsmp->hfs_attribute_vp))) { + flags |= SFL_EXTENTS; + } + } else { + flags &= ~SFL_ATTRIBUTE; + } + } + if (flags & SFL_BITMAP) { + /* + * Since the only bitmap operations are clearing and + * setting bits we always need exclusive access. And + * when we have a journal, we can "hide" behind that + * lock since we can only change the bitmap from + * within a transaction. + */ + if (hfsmp->jnl) { + flags &= ~SFL_BITMAP; + } else { + (void) hfs_lock(VTOC(hfsmp->hfs_allocation_vp), HFS_EXCLUSIVE_LOCK); + } + } + if (flags & SFL_EXTENTS) { + /* + * Since the extents btree lock is recursive we always + * need exclusive access. + */ + (void) hfs_lock(VTOC(hfsmp->hfs_extents_vp), HFS_EXCLUSIVE_LOCK); + } + return (flags); +} - case kHFSCatalogFileID: - vp = vcb->catalogRefNum; - break; +/* + * unlock HFS system file(s). 
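+ *
+ * Besides dropping the cnode locks taken above, on non-journaled volumes
+ * this also syncs a b-tree before unlocking it whenever too many locked
+ * buffers have accumulated or the last sync was long ago (see the
+ * count_lock_queue() / kMaxSecsForFsync checks below).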
+ */ +__private_extern__ +void +hfs_systemfile_unlock(struct hfsmount *hfsmp, int flags) +{ + struct timeval tv; + u_int32_t lastfsync; + int numOfLockedBuffs; - case kHFSAllocationFileID: - /* bitmap is covered by Extents B-tree locking */ - /* FALL THROUGH */ - default: - panic("hfs_lockmetafile: invalid fileID"); + microuptime(&tv); + lastfsync = tv.tv_sec; + + if (flags & ~SFL_VALIDMASK) + panic("hfs_systemfile_unlock: invalid lock request (0x%x)", (unsigned long) flags); + + if (flags & SFL_ATTRIBUTE && hfsmp->hfs_attribute_vp) { + if (hfsmp->jnl == NULL) { + BTGetLastSync((FCB*)VTOF(hfsmp->hfs_attribute_vp), &lastfsync); + numOfLockedBuffs = count_lock_queue(); + if ((numOfLockedBuffs > kMaxLockedMetaBuffers) || + ((numOfLockedBuffs > 1) && ((tv.tv_sec - lastfsync) > + kMaxSecsForFsync))) { + hfs_btsync(hfsmp->hfs_attribute_vp, HFS_SYNCTRANS); + } + } + hfs_unlock(VTOC(hfsmp->hfs_attribute_vp)); } - - if ((flags & LK_TYPE_MASK) != LK_RELEASE) { - flags |= LK_RETRY; - } else if (hfsmp->jnl == NULL) { - struct timeval tv = time; - u_int32_t lastfsync = tv.tv_sec; - - (void) BTGetLastSync((FCB*)VTOF(vp), &lastfsync); - - numOfLockedBuffs = count_lock_queue(); - if ((numOfLockedBuffs > kMaxLockedMetaBuffers) || - ((numOfLockedBuffs > 1) && ((tv.tv_sec - lastfsync) > kMaxSecsForFsync))) { - hfs_btsync(vp, HFS_SYNCTRANS); + if (flags & SFL_CATALOG) { + if (hfsmp->jnl == NULL) { + BTGetLastSync((FCB*)VTOF(hfsmp->hfs_catalog_vp), &lastfsync); + numOfLockedBuffs = count_lock_queue(); + if ((numOfLockedBuffs > kMaxLockedMetaBuffers) || + ((numOfLockedBuffs > 1) && ((tv.tv_sec - lastfsync) > + kMaxSecsForFsync))) { + hfs_btsync(hfsmp->hfs_catalog_vp, HFS_SYNCTRANS); + } } + hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); + } + if (flags & SFL_BITMAP) { + hfs_unlock(VTOC(hfsmp->hfs_allocation_vp)); + } + if (flags & SFL_EXTENTS) { + if (hfsmp->jnl == NULL) { + BTGetLastSync((FCB*)VTOF(hfsmp->hfs_extents_vp), &lastfsync); + numOfLockedBuffs = count_lock_queue(); + if ((numOfLockedBuffs > kMaxLockedMetaBuffers) || + ((numOfLockedBuffs > 1) && ((tv.tv_sec - lastfsync) > + kMaxSecsForFsync))) { + hfs_btsync(hfsmp->hfs_extents_vp, HFS_SYNCTRANS); + } + } + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); } - - retval = lockmgr(&VTOC(vp)->c_lock, flags, &vp->v_interlock, p); - - return (retval); } + /* * RequireFileLock * @@ -707,37 +883,32 @@ hfs_metafilelocking(struct hfsmount *hfsmp, u_long fileID, u_int flags, struct p #if HFS_DIAGNOSTIC void RequireFileLock(FileReference vp, int shareable) { - struct lock__bsd__ *lkp; - int locked = false; - pid_t pid; - void * self; - - pid = current_proc()->p_pid; - self = (void *) current_act(); - lkp = &VTOC(vp)->c_lock; + int locked; - simple_lock(&lkp->lk_interlock); + /* The extents btree and allocation bitmap are always exclusive. */ + if (VTOC(vp)->c_fileid == kHFSExtentsFileID || + VTOC(vp)->c_fileid == kHFSAllocationFileID) { + shareable = 0; + } - if (shareable && (lkp->lk_sharecount > 0) && (lkp->lk_lockholder == LK_NOPROC)) - locked = true; - else if ((lkp->lk_exclusivecount > 0) && (lkp->lk_lockholder == pid) && (lkp->lk_lockthread == self)) - locked = true; - - simple_unlock(&lkp->lk_interlock); + locked = VTOC(vp)->c_lockowner == (void *)current_thread(); - if (!locked) { + if (!locked && !shareable) { switch (VTOC(vp)->c_fileid) { - case 3: - DEBUG_BREAK_MSG((" #\n # RequireFileLock: extent btree vnode not locked! v: 0x%08X\n #\n", (u_int)vp)); - break; - - case 4: - DEBUG_BREAK_MSG((" #\n # RequireFileLock: catalog btree vnode not locked! 
v: 0x%08X\n #\n", (u_int)vp)); - break; - - default: - DEBUG_BREAK_MSG((" #\n # RequireFileLock: file (%d) not locked! v: 0x%08X\n #\n", VTOC(vp)->c_fileid, (u_int)vp)); - break; + case kHFSExtentsFileID: + panic("extents btree not locked! v: 0x%08X\n #\n", (u_int)vp); + break; + case kHFSCatalogFileID: + panic("catalog btree not locked! v: 0x%08X\n #\n", (u_int)vp); + break; + case kHFSAllocationFileID: + /* The allocation file can hide behind the journal lock. */ + if (VTOHFS(vp)->jnl == NULL) + panic("allocation file not locked! v: 0x%08X\n #\n", (u_int)vp); + break; + case kHFSAttributesFileID: + panic("attributes btree not locked! v: 0x%08X\n #\n", (u_int)vp); + break; } } } @@ -757,15 +928,15 @@ void RequireFileLock(FileReference vp, int shareable) * */ int -hfs_owner_rights(struct hfsmount *hfsmp, uid_t cnode_uid, struct ucred *cred, +hfs_owner_rights(struct hfsmount *hfsmp, uid_t cnode_uid, kauth_cred_t cred, struct proc *p, int invokesuperuserstatus) { - if ((cred->cr_uid == cnode_uid) || /* [1a] */ + if ((kauth_cred_getuid(cred) == cnode_uid) || /* [1a] */ (cnode_uid == UNKNOWNUID) || /* [1b] */ - ((HFSTOVFS(hfsmp)->mnt_flag & MNT_UNKNOWNPERMISSIONS) && /* [2] */ - ((cred->cr_uid == hfsmp->hfs_uid) || /* [2a] */ + ((((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) && /* [2] */ + ((kauth_cred_getuid(cred) == hfsmp->hfs_uid) || /* [2a] */ (hfsmp->hfs_uid == UNKNOWNUID))) || /* [2b] */ - (invokesuperuserstatus && (suser(cred, &p->p_acflag) == 0))) { /* [3] */ + (invokesuperuserstatus && (suser(cred, 0) == 0))) { /* [3] */ return (0); } else { return (EPERM); @@ -834,6 +1005,7 @@ FindMetaDataDirectory(ExtendedVCB *vcb) struct proc *p = current_proc(); struct timeval tv; cat_cookie_t cookie; + int lockflags; int error; if (vcb->vcbSigWord != kHFSPlusSigWord) @@ -848,15 +1020,12 @@ FindMetaDataDirectory(ExtendedVCB *vcb) hfsmp->hfs_privdir_desc.cd_flags = CD_ISDIR; } - /* Lock catalog b-tree */ - if (hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, p) != 0) - return (0); + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); error = cat_lookup(hfsmp, &hfsmp->hfs_privdir_desc, 0, NULL, - &hfsmp->hfs_privdir_attr, NULL); + &hfsmp->hfs_privdir_attr, NULL, NULL); - /* Unlock catalog b-tree */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + hfs_systemfile_unlock(hfsmp, lockflags); if (error == 0) { hfsmp->hfs_metadata_createdate = hfsmp->hfs_privdir_attr.ca_itime; @@ -868,22 +1037,16 @@ FindMetaDataDirectory(ExtendedVCB *vcb) (hfsmp->hfs_flags & HFS_READ_ONLY) == 0) { hfsmp->hfs_privdir_attr.ca_flags &= ~SF_IMMUTABLE; - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { - hfs_global_shared_lock_release(hfsmp); - return (hfsmp->hfs_privdir_attr.ca_fileid); - } + if ((error = hfs_start_transaction(hfsmp)) != 0) { + return (hfsmp->hfs_privdir_attr.ca_fileid); } - if (hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, p) == 0) { - (void)cat_update(hfsmp, &hfsmp->hfs_privdir_desc, - &hfsmp->hfs_privdir_attr, NULL, NULL); - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); - } - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + (void) cat_update(hfsmp, &hfsmp->hfs_privdir_desc, + &hfsmp->hfs_privdir_attr, NULL, NULL); + hfs_systemfile_unlock(hfsmp, lockflags); + + hfs_end_transaction(hfsmp); } return 
(hfsmp->hfs_privdir_attr.ca_fileid); @@ -897,7 +1060,8 @@ FindMetaDataDirectory(ExtendedVCB *vcb) hfsmp->hfs_privdir_attr.ca_mode = S_IFDIR; hfsmp->hfs_privdir_attr.ca_nlink = 2; hfsmp->hfs_privdir_attr.ca_itime = vcb->vcbCrDate; - hfsmp->hfs_privdir_attr.ca_mtime = time.tv_sec; + microtime(&tv); + hfsmp->hfs_privdir_attr.ca_mtime = tv.tv_sec; /* hidden and off the desktop view */ fndrinfo = (struct FndrDirInfo *)&hfsmp->hfs_privdir_attr.ca_finderinfo; @@ -905,61 +1069,51 @@ FindMetaDataDirectory(ExtendedVCB *vcb) fndrinfo->frLocation.h = SWAP_BE16 (22460); fndrinfo->frFlags |= SWAP_BE16 (kIsInvisible + kNameLocked); - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { - hfs_global_shared_lock_release(hfsmp); - return (0); - } + if ((error = hfs_start_transaction(hfsmp)) != 0) { + return (0); } /* Reserve some space in the Catalog file. */ if (cat_preflight(hfsmp, CAT_CREATE, &cookie, p) != 0) { - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); - return (0); + hfs_end_transaction(hfsmp); + + return (0); } - if (hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p) == 0) { + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + error = cat_create(hfsmp, &hfsmp->hfs_privdir_desc, &hfsmp->hfs_privdir_attr, &out_desc); - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); - } + hfs_systemfile_unlock(hfsmp, lockflags); cat_postflight(hfsmp, &cookie, p); if (error) { - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); + hfs_volupdate(hfsmp, VOL_UPDATE, 0); - return (0); + hfs_end_transaction(hfsmp); + + return (0); } hfsmp->hfs_privdir_desc.cd_hint = out_desc.cd_hint; hfsmp->hfs_privdir_desc.cd_cnid = out_desc.cd_cnid; hfsmp->hfs_privdir_attr.ca_fileid = out_desc.cd_cnid; hfsmp->hfs_metadata_createdate = vcb->vcbCrDate; - - if (VFS_ROOT(HFSTOVFS(hfsmp), &dvp) == 0) { + + if (hfs_vget(hfsmp, kRootDirID, &dvp, 0) == 0) { dcp = VTOC(dvp); dcp->c_childhint = out_desc.cd_hint; dcp->c_nlink++; dcp->c_entries++; - dcp->c_flag |= C_CHANGE | C_UPDATE; - tv = time; - (void) VOP_UPDATE(dvp, &tv, &tv, 0); - vput(dvp); + dcp->c_touch_chgtime = TRUE; + dcp->c_touch_modtime = TRUE; + (void) hfs_update(dvp, 0); + hfs_unlock(dcp); + vnode_put(dvp); } hfs_volupdate(hfsmp, VOL_MKDIR, 1); - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); + hfs_end_transaction(hfsmp); cat_releasedesc(&out_desc); @@ -968,7 +1122,7 @@ FindMetaDataDirectory(ExtendedVCB *vcb) __private_extern__ u_long -GetFileInfo(ExtendedVCB *vcb, u_int32_t dirid, char *name, +GetFileInfo(ExtendedVCB *vcb, u_int32_t dirid, const char *name, struct cat_attr *fattr, struct cat_fork *forkinfo) { struct hfsmount * hfsmp; @@ -976,7 +1130,7 @@ GetFileInfo(ExtendedVCB *vcb, u_int32_t dirid, char *name, struct cnode * dcp = NULL; struct FndrDirInfo * fndrinfo; struct cat_desc jdesc; - struct timeval tv; + int lockflags; int error; if (vcb->vcbSigWord != kHFSPlusSigWord) @@ -989,25 +1143,22 @@ GetFileInfo(ExtendedVCB *vcb, u_int32_t dirid, char *name, jdesc.cd_nameptr = name; jdesc.cd_namelen = strlen(name); - /* Lock catalog b-tree */ - error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, current_proc()); - if (error) - return (0); - - error = cat_lookup(hfsmp, &jdesc, 0, NULL, fattr, forkinfo); - - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, 
LK_RELEASE, current_proc()); + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + error = cat_lookup(hfsmp, &jdesc, 0, NULL, fattr, forkinfo, NULL); + hfs_systemfile_unlock(hfsmp, lockflags); if (error == 0) { return (fattr->ca_fileid); } else if (hfsmp->hfs_flags & HFS_READ_ONLY) { return (0); } + + return (0); /* XXX what callers expect on an error */ } /* - * On Journaled HFS, there can be orphaned files. These + * On an HFS Plus volume, there can be orphaned files. These * are files that were unlinked while busy. If the volume * was not cleanly unmounted then some of these files may * have persisted and need to be removed. @@ -1026,18 +1177,21 @@ hfs_remove_orphans(struct hfsmount * hfsmp) char filename[32]; char tempname[32]; size_t namelen; - cat_cookie_t cookie = {0}; + cat_cookie_t cookie; int catlock = 0; int catreserve = 0; int started_tr = 0; - int shared_lock = 0; + int lockflags; int result; - + int orphanedlinks = 0; + + bzero(&cookie, sizeof(cookie)); + if (hfsmp->hfs_flags & HFS_CLEANED_ORPHANS) return; vcb = HFSTOVCB(hfsmp); - fcb = VTOF(vcb->catalogRefNum); + fcb = VTOF(hfsmp->hfs_catalog_vp); btdata.bufferAddress = &filerec; btdata.itemSize = sizeof(filerec); @@ -1045,34 +1199,31 @@ hfs_remove_orphans(struct hfsmount * hfsmp) MALLOC(iterator, struct BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); bzero(iterator, sizeof(*iterator)); + + /* Build a key to "temp" */ keyp = (HFSPlusCatalogKey*)&iterator->key; keyp->parentID = hfsmp->hfs_privdir_desc.cd_cnid; + keyp->nodeName.length = 4; /* "temp" */ + keyp->keyLength = kHFSPlusCatalogKeyMinimumLength + keyp->nodeName.length * 2; + keyp->nodeName.unicode[0] = 't'; + keyp->nodeName.unicode[1] = 'e'; + keyp->nodeName.unicode[2] = 'm'; + keyp->nodeName.unicode[3] = 'p'; - result = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); - if (result) - goto exit; /* - * Position the iterator at the folder thread record. - * (i.e. one record before first child) + * Position the iterator just before the first real temp file. */ - result = BTSearchRecord(fcb, iterator, NULL, NULL, iterator); - - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); - if (result) - goto exit; + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + (void) BTSearchRecord(fcb, iterator, NULL, NULL, iterator); + hfs_systemfile_unlock(hfsmp, lockflags); - /* Visit all the children in the HFS+ private directory. */ + /* Visit all the temp files in the HFS+ private directory. 
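+ * Orphans are the entries in the private metadata directory whose
+ * names begin with "temp"; the BTSearchRecord() call above parked the
+ * iterator just before the first of them.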
*/ for (;;) { - result = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); - if (result) - goto exit; - + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); result = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); - - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + hfs_systemfile_unlock(hfsmp, lockflags); if (result) break; - if (keyp->parentID != hfsmp->hfs_privdir_desc.cd_cnid) break; if (filerec.recordType != kHFSPlusFileRecord) @@ -1089,33 +1240,30 @@ hfs_remove_orphans(struct hfsmount * hfsmp) * */ if (bcmp(tempname, filename, namelen) == 0) { - struct filefork dfork = {0}; - struct filefork rfork = {0}; - struct cnode cnode = {0}; - - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - shared_lock = 1; - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - goto exit; - } - started_tr = 1; + struct filefork dfork; + struct filefork rfork; + struct cnode cnode; + + bzero(&dfork, sizeof(dfork)); + bzero(&rfork, sizeof(rfork)); + bzero(&cnode, sizeof(cnode)); + + if (hfs_start_transaction(hfsmp) != 0) { + printf("hfs_remove_orphans: failed to start transaction\n"); + goto exit; } + started_tr = 1; /* * Reserve some space in the Catalog file. */ if (cat_preflight(hfsmp, CAT_DELETE, &cookie, p) != 0) { + printf("hfs_remove_orphans: cat_preflight failed\n"); goto exit; } catreserve = 1; - /* Lock catalog b-tree */ - if (hfs_metafilelocking(hfsmp, kHFSCatalogFileID, - LK_EXCLUSIVE, p) != 0) { - goto exit; - } + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); catlock = 1; /* Build a fake cnode */ @@ -1159,8 +1307,8 @@ hfs_remove_orphans(struct hfsmount * hfsmp) // that no one transaction gets too big. 
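// (Transactions are deliberately kept small here: before truncating an
// orphan with a non-empty fork, the current transaction is committed and
// a fresh one started, as the code just below shows.)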
// if (fsize > 0 && started_tr) { - journal_end_transaction(hfsmp->jnl); - if (journal_start_transaction(hfsmp->jnl) != 0) { + hfs_end_transaction(hfsmp); + if (hfs_start_transaction(hfsmp) != 0) { started_tr = 0; break; } @@ -1180,9 +1328,14 @@ hfs_remove_orphans(struct hfsmount * hfsmp) /* Remove the file record from the Catalog */ if (cat_delete(hfsmp, &cnode.c_desc, &cnode.c_attr) != 0) { - printf("error deleting cat rec!\n"); + printf("hfs_remove_orphans: error deleting cat rec for id %d!\n", cnode.c_desc.cd_cnid); + hfs_volupdate(hfsmp, VOL_UPDATE, 0); break; } + ++orphanedlinks; + + /* Delete any attributes, ignore errors */ + (void) hfs_removeallattr(hfsmp, cnode.c_fileid); /* Update parent and volume counts */ hfsmp->hfs_privdir_attr.ca_entries--; @@ -1191,31 +1344,27 @@ hfs_remove_orphans(struct hfsmount * hfsmp) hfs_volupdate(hfsmp, VOL_RMFILE, 0); /* Drop locks and end the transaction */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + hfs_systemfile_unlock(hfsmp, lockflags); cat_postflight(hfsmp, &cookie, p); catlock = catreserve = 0; if (started_tr) { - journal_end_transaction(hfsmp->jnl); + hfs_end_transaction(hfsmp); started_tr = 0; } - hfs_global_shared_lock_release(hfsmp); - shared_lock = 0; } /* end if */ } /* end for */ - + if (orphanedlinks > 0) + printf("HFS: Removed %d orphaned unlinked files\n", orphanedlinks); exit: if (catlock) { - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + hfs_systemfile_unlock(hfsmp, lockflags); } if (catreserve) { cat_postflight(hfsmp, &cookie, p); } if (started_tr) { - journal_end_transaction(hfsmp->jnl); - } - if (shared_lock) { - hfs_global_shared_lock_release(hfsmp); + hfs_end_transaction(hfsmp); } FREE(iterator, M_TEMP); @@ -1238,7 +1387,7 @@ u_int32_t logBlockSize; /* start with default */ logBlockSize = VTOHFS(vp)->hfs_logBlockSize; - if (vp->v_flag & VSYSTEM) { + if (vnode_issystem(vp)) { if (VTOF(vp)->fcbBTCBPtr != NULL) { BTreeInfoRec bTreeInfo; @@ -1268,9 +1417,10 @@ __private_extern__ u_int32_t hfs_freeblks(struct hfsmount * hfsmp, int wantreserve) { - struct vcb_t *vcb = HFSTOVCB(hfsmp); + ExtendedVCB *vcb = HFSTOVCB(hfsmp); u_int32_t freeblks; + HFS_MOUNT_LOCK(hfsmp, TRUE); freeblks = vcb->freeBlocks; if (wantreserve) { if (freeblks > vcb->reserveBlocks) @@ -1282,6 +1432,7 @@ hfs_freeblks(struct hfsmount * hfsmp, int wantreserve) freeblks -= vcb->loanedBlocks; else freeblks = 0; + HFS_MOUNT_UNLOCK(hfsmp, TRUE); #ifdef HFS_SPARSE_DEV /* @@ -1289,18 +1440,19 @@ hfs_freeblks(struct hfsmount * hfsmp, int wantreserve) * available space on the backing store volume. */ if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && hfsmp->hfs_backingfs_rootvp) { - struct statfs statbuf; /* 272 bytes */ + struct vfsstatfs *vfsp; u_int32_t vfreeblks; u_int32_t loanedblks; struct mount * backingfs_mp; - backingfs_mp = hfsmp->hfs_backingfs_rootvp->v_mount; + backingfs_mp = vnode_mount(hfsmp->hfs_backingfs_rootvp); - if (VFS_STATFS(backingfs_mp, &statbuf, current_proc()) == 0) { - vfreeblks = statbuf.f_bavail; + if (vfsp = vfs_statfs(backingfs_mp)) { + HFS_MOUNT_LOCK(hfsmp, TRUE); + vfreeblks = (u_int32_t)vfsp->f_bavail; /* Normalize block count if needed. 
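* For example, with a 512-byte backing-store f_bsize and an 8192-byte HFS
 * allocation block, 16 backing-store blocks count as a single free
 * allocation block here (vfreeblks * 512 / 8192).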
*/ - if (statbuf.f_bsize != vcb->blockSize) { - vfreeblks = ((u_int64_t)vfreeblks * (u_int64_t)statbuf.f_bsize) / vcb->blockSize; + if (vfsp->f_bsize != vcb->blockSize) { + vfreeblks = ((u_int64_t)vfreeblks * (u_int64_t)(vfsp->f_bsize)) / vcb->blockSize; } if (vfreeblks > hfsmp->hfs_sparsebandblks) vfreeblks -= hfsmp->hfs_sparsebandblks; @@ -1315,6 +1467,7 @@ hfs_freeblks(struct hfsmount * hfsmp, int wantreserve) vfreeblks = 0; freeblks = MIN(vfreeblks, freeblks); + HFS_MOUNT_UNLOCK(hfsmp, TRUE); } } #endif /* HFS_SPARSE_DEV */ @@ -1378,87 +1531,121 @@ short MacToVFSError(OSErr err) /* - * Get the directory entry name hint for a given index. - * The directory cnode (dcp) must be locked. + * Find the current thread's directory hint for a given index. + * + * Requires an exclusive lock on directory cnode. */ __private_extern__ -char * -hfs_getnamehint(struct cnode *dcp, int index) +directoryhint_t * +hfs_getdirhint(struct cnode *dcp, int index) { - struct hfs_index *entry; - - if (index > 0) { - SLIST_FOREACH(entry, &dcp->c_indexlist, hi_link) { - if (entry->hi_index == index) - return (entry->hi_name); + struct timeval tv; + directoryhint_t *hint, *next, *oldest; + char * name; + + oldest = NULL; + microuptime(&tv); + + /* Look for an existing hint first */ + for(hint = dcp->c_hintlist.slh_first; hint != NULL; hint = next) { + next = hint->dh_link.sle_next; + if (hint->dh_index == index) { + goto out; + } else if (oldest == NULL || (hint->dh_time < oldest->dh_time)) { + oldest = hint; } } - - return (NULL); -} - -/* - * Save a directory entry name hint for a given index. - * The directory cnode (dcp) must be locked. - */ -__private_extern__ -void -hfs_savenamehint(struct cnode *dcp, int index, const char * namehint) -{ - struct hfs_index *entry; - int len; - - if (index > 0) { - len = strlen(namehint); - MALLOC(entry, struct hfs_index *, len + sizeof(struct hfs_index), - M_TEMP, M_WAITOK); - entry->hi_index = index; - bcopy(namehint, entry->hi_name, len + 1); - SLIST_INSERT_HEAD(&dcp->c_indexlist, entry, hi_link); + /* Recycle one if we have too many already. */ + if ((dcp->c_dirhintcnt >= HFS_MAXDIRHINTS) && (oldest != NULL)) { + hint = oldest; + if ((name = hint->dh_desc.cd_nameptr)) { + hint->dh_desc.cd_nameptr = NULL; + vfs_removename(name); + } + goto init; } + + /* Create a default directory hint */ + MALLOC_ZONE(hint, directoryhint_t *, sizeof(directoryhint_t), M_HFSDIRHINT, M_WAITOK); + SLIST_INSERT_HEAD(&dcp->c_hintlist, hint, dh_link); + ++dcp->c_dirhintcnt; +init: + hint->dh_index = index; + hint->dh_desc.cd_flags = 0; + hint->dh_desc.cd_encoding = 0; + hint->dh_desc.cd_namelen = 0; + hint->dh_desc.cd_nameptr = NULL; + hint->dh_desc.cd_parentcnid = dcp->c_cnid; + hint->dh_desc.cd_hint = dcp->c_childhint; + hint->dh_desc.cd_cnid = 0; +out: + hint->dh_time = tv.tv_sec; + return (hint); } /* - * Release the directory entry name hint for a given index. - * The directory cnode (dcp) must be locked. + * Release a single directory hint. + * + * Requires an exclusive lock on directory cnode. 
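+ *
+ * A hint obtained from hfs_getdirhint() stays on the cnode's c_hintlist
+ * until it is released here, recycled as the oldest hint once
+ * HFS_MAXDIRHINTS are outstanding, or swept by hfs_reldirhints() below.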
*/ __private_extern__ void -hfs_relnamehint(struct cnode *dcp, int index) +hfs_reldirhint(struct cnode *dcp, directoryhint_t * relhint) { - struct hfs_index *entry; - - if (index > 0) { - SLIST_FOREACH(entry, &dcp->c_indexlist, hi_link) { - if (entry->hi_index == index) { - SLIST_REMOVE(&dcp->c_indexlist, entry, hfs_index, - hi_link); - FREE(entry, M_TEMP); - break; + directoryhint_t *hint; + char * name; + + SLIST_FOREACH(hint, &dcp->c_hintlist, dh_link) { + if (hint == relhint) { + SLIST_REMOVE(&dcp->c_hintlist, hint, directoryhint, dh_link); + name = hint->dh_desc.cd_nameptr; + if (name != NULL) { + hint->dh_desc.cd_nameptr = NULL; + vfs_removename(name); } + FREE_ZONE(hint, sizeof(directoryhint_t), M_HFSDIRHINT); + --dcp->c_dirhintcnt; + break; } } } /* - * Release all directory entry name hints. + * Release directory hints for given directory + * + * Requires an exclusive lock on directory cnode. */ __private_extern__ void -hfs_relnamehints(struct cnode *dcp) +hfs_reldirhints(struct cnode *dcp, int stale_hints_only) { - struct hfs_index *entry; - struct hfs_index *next; - - if (!SLIST_EMPTY(&dcp->c_indexlist)) { - for(entry = SLIST_FIRST(&dcp->c_indexlist); - entry != NULL; - entry = next) { - next = SLIST_NEXT(entry, hi_link); - SLIST_REMOVE(&dcp->c_indexlist, entry, hfs_index, hi_link); - FREE(entry, M_TEMP); + struct timeval tv; + directoryhint_t *hint, *next; + char * name; + + if (stale_hints_only) + microuptime(&tv); + else + tv.tv_sec = 0; + + for (hint = dcp->c_hintlist.slh_first; hint != NULL; hint = next) { + next = hint->dh_link.sle_next; + if (stale_hints_only) { + /* Skip over newer entries. */ + if ((tv.tv_sec - hint->dh_time) < HFS_DIRHINT_TTL) + continue; + SLIST_REMOVE(&dcp->c_hintlist, hint, directoryhint, dh_link); + } + name = hint->dh_desc.cd_nameptr; + if (name != NULL) { + hint->dh_desc.cd_nameptr = NULL; + vfs_removename(name); } + FREE_ZONE(hint, sizeof(directoryhint_t), M_HFSDIRHINT); + --dcp->c_dirhintcnt; } + if (!stale_hints_only) + dcp->c_hintlist.slh_first = NULL; } @@ -1498,8 +1685,8 @@ out: __private_extern__ int hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, - void *_args, int embeddedOffset, int mdb_offset, - HFSMasterDirectoryBlock *mdbp, struct ucred *cred) + void *_args, off_t embeddedOffset, daddr64_t mdb_offset, + HFSMasterDirectoryBlock *mdbp, kauth_cred_t cred) { JournalInfoBlock *jibp; struct buf *jinfo_bp, *bp; @@ -1517,14 +1704,14 @@ hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, sectors_per_fsblock = SWAP_BE32(vhp->blockSize) / blksize; - retval = meta_bread(devvp, - embeddedOffset/blksize + - (SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock), + retval = (int)buf_meta_bread(devvp, + (daddr64_t)((embeddedOffset/blksize) + + (SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock)), SWAP_BE32(vhp->blockSize), cred, &jinfo_bp); if (retval) return retval; - jibp = (JournalInfoBlock *)jinfo_bp->b_data; + jibp = (JournalInfoBlock *)buf_dataptr(jinfo_bp); jibp->flags = SWAP_BE32(jibp->flags); jibp->offset = SWAP_BE64(jibp->offset); jibp->size = SWAP_BE64(jibp->size); @@ -1533,7 +1720,7 @@ hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, hfsmp->jvp = hfsmp->hfs_devvp; } else { printf("hfs: journal not stored in fs! 
don't know what to do.\n"); - brelse(jinfo_bp); + buf_brelse(jinfo_bp); return EINVAL; } @@ -1543,9 +1730,9 @@ hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, if (jibp->flags & kJIJournalNeedInitMask) { printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n", - jibp->offset + (off_t)embeddedOffset, jibp->size); + jibp->offset + embeddedOffset, jibp->size); hfsmp->jnl = journal_create(hfsmp->jvp, - jibp->offset + (off_t)embeddedOffset, + jibp->offset + embeddedOffset, jibp->size, devvp, blksize, @@ -1559,16 +1746,16 @@ hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, jibp->flags = SWAP_BE32(jibp->flags); jibp->offset = SWAP_BE64(jibp->offset); jibp->size = SWAP_BE64(jibp->size); - bwrite(jinfo_bp); + buf_bwrite(jinfo_bp); jinfo_bp = NULL; jibp = NULL; } else { //printf("hfs: Opening the journal (joffset 0x%llx sz 0x%llx vhp_blksize %d)...\n", - // jibp->offset + (off_t)embeddedOffset, + // jibp->offset + embeddedOffset, // jibp->size, SWAP_BE32(vhp->blockSize)); hfsmp->jnl = journal_open(hfsmp->jvp, - jibp->offset + (off_t)embeddedOffset, + jibp->offset + embeddedOffset, jibp->size, devvp, blksize, @@ -1576,7 +1763,7 @@ hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, arg_tbufsz, hfs_sync_metadata, hfsmp->hfs_mp); - brelse(jinfo_bp); + buf_brelse(jinfo_bp); jinfo_bp = NULL; jibp = NULL; @@ -1584,17 +1771,17 @@ hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, // reload the mdb because it could have changed // if the journal had to be replayed. if (mdb_offset == 0) { - mdb_offset = (embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize); + mdb_offset = (daddr64_t)((embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize)); } - retval = meta_bread(devvp, mdb_offset, blksize, cred, &bp); + retval = (int)buf_meta_bread(devvp, mdb_offset, blksize, cred, &bp); if (retval) { - brelse(bp); + buf_brelse(bp); printf("hfs: failed to reload the mdb after opening the journal (retval %d)!\n", retval); return retval; } - bcopy(bp->b_data + HFS_PRI_OFFSET(blksize), mdbp, 512); - brelse(bp); + bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(blksize), mdbp, 512); + buf_brelse(bp); bp = NULL; } } @@ -1673,9 +1860,9 @@ hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_a sectors_per_fsblock = SWAP_BE32(vhp->blockSize) / hfsmp->hfs_phys_block_size; - retval = meta_bread(devvp, - vcb->hfsPlusIOPosOffset / hfsmp->hfs_phys_block_size + - (SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock), + retval = (int)buf_meta_bread(devvp, + (daddr64_t)(vcb->hfsPlusIOPosOffset / hfsmp->hfs_phys_block_size + + (SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock)), SWAP_BE32(vhp->blockSize), NOCRED, &jinfo_bp); if (retval) { printf("hfs: can't read journal info block. disabling journaling.\n"); @@ -1683,7 +1870,7 @@ hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_a return 0; } - jibp = (JournalInfoBlock *)jinfo_bp->b_data; + jibp = (JournalInfoBlock *)buf_dataptr(jinfo_bp); jibp->flags = SWAP_BE32(jibp->flags); jibp->offset = SWAP_BE64(jibp->offset); jibp->size = SWAP_BE64(jibp->size); @@ -1692,7 +1879,7 @@ hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_a if (fid == 0 || jfork.cf_extents[0].startBlock == 0 || jfork.cf_size == 0) { printf("hfs: can't find the journal file! 
disabling journaling (start: %d)\n", jfork.cf_extents[0].startBlock); - brelse(jinfo_bp); + buf_brelse(jinfo_bp); vcb->vcbAtrb &= ~kHFSVolumeJournaledMask; return 0; } @@ -1720,7 +1907,7 @@ hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_a hfsmp->jvp = hfsmp->hfs_devvp; } else { printf("hfs: journal not stored in fs! don't know what to do.\n"); - brelse(jinfo_bp); + buf_brelse(jinfo_bp); return EINVAL; } @@ -1776,9 +1963,9 @@ hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_a jibp->offset = SWAP_BE64(jibp->offset); jibp->size = SWAP_BE64(jibp->size); - bwrite(jinfo_bp); + buf_bwrite(jinfo_bp); } else { - brelse(jinfo_bp); + buf_brelse(jinfo_bp); } jinfo_bp = NULL; jibp = NULL; @@ -1828,7 +2015,6 @@ static void hfs_metadatazone_init(struct hfsmount *hfsmp) { ExtendedVCB *vcb; - struct BTreeInfoRec btinfo; u_int64_t fs_size; u_int64_t zonesize; u_int64_t temp; @@ -1931,7 +2117,6 @@ hfs_metadatazone_init(struct hfsmount *hfsmp) } } filesize += (items + 1) * sizeof(struct dqblk); - hfsmp->hfs_hotfile_maxblks = filesize / vcb->blockSize; zonesize += filesize; /* @@ -1944,6 +2129,8 @@ hfs_metadatazone_init(struct hfsmount *hfsmp) filesize += temp / 3; hfsmp->hfs_catalog_maxblks += (temp - (temp / 3)) / vcb->blockSize; + hfsmp->hfs_hotfile_maxblks = filesize / vcb->blockSize; + /* Convert to allocation blocks. */ blk = zonesize / vcb->blockSize; @@ -1968,15 +2155,19 @@ static u_int32_t hfs_hotfile_freeblocks(struct hfsmount *hfsmp) { ExtendedVCB *vcb = HFSTOVCB(hfsmp); + int lockflags; int freeblocks; + lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); freeblocks = MetaZoneFreeBlocks(vcb); + hfs_systemfile_unlock(hfsmp, lockflags); + /* Minus Extents overflow file reserve. */ freeblocks -= - hfsmp->hfs_overflow_maxblks - VTOF(vcb->extentsRefNum)->ff_blocks; + hfsmp->hfs_overflow_maxblks - VTOF(hfsmp->hfs_extents_vp)->ff_blocks; /* Minus catalog file reserve. */ freeblocks -= - hfsmp->hfs_catalog_maxblks - VTOF(vcb->catalogRefNum)->ff_blocks; + hfsmp->hfs_catalog_maxblks - VTOF(hfsmp->hfs_catalog_vp)->ff_blocks; if (freeblocks < 0) freeblocks = 0; @@ -2011,3 +2202,55 @@ hfs_virtualmetafile(struct cnode *cp) return (0); } + +__private_extern__ +int +hfs_start_transaction(struct hfsmount *hfsmp) +{ + int ret; + + if (hfsmp->jnl == NULL || journal_owner(hfsmp->jnl) != current_thread()) { + lck_rw_lock_shared(&hfsmp->hfs_global_lock); + } + + if (hfsmp->jnl) { + ret = journal_start_transaction(hfsmp->jnl); + if (ret == 0) { + OSAddAtomic(1, &hfsmp->hfs_global_lock_nesting); + } + } else { + ret = 0; + } + + if (ret != 0) { + lck_rw_done(&hfsmp->hfs_global_lock); + } + + return ret; +} + +__private_extern__ +int +hfs_end_transaction(struct hfsmount *hfsmp) +{ + int need_unlock=0, ret; + + if ( hfsmp->jnl == NULL + || ( journal_owner(hfsmp->jnl) == current_thread() + && (OSAddAtomic(-1, &hfsmp->hfs_global_lock_nesting) == 1)) ) { + + need_unlock = 1; + } + + if (hfsmp->jnl) { + ret = journal_end_transaction(hfsmp->jnl); + } else { + ret = 0; + } + + if (need_unlock) { + lck_rw_done(&hfsmp->hfs_global_lock); + } + + return ret; +} diff --git a/bsd/hfs/hfs_vnops.c b/bsd/hfs/hfs_vnops.c index 0f4848c24..873ff095c 100644 --- a/bsd/hfs/hfs_vnops.c +++ b/bsd/hfs/hfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -22,18 +22,20 @@ #include <sys/systm.h> #include <sys/kernel.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/dirent.h> #include <sys/stat.h> #include <sys/buf.h> #include <sys/mount.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/malloc.h> -#include <sys/namei.h> #include <sys/ubc.h> +#include <sys/paths.h> #include <sys/quota.h> #include <sys/time.h> #include <sys/disk.h> +#include <sys/kauth.h> +#include <sys/uio_internal.h> #include <miscfs/specfs/specdev.h> #include <miscfs/fifofs/fifo.h> @@ -45,7 +47,6 @@ #include "hfs.h" #include "hfs_catalog.h" #include "hfs_cnode.h" -#include "hfs_lockf.h" #include "hfs_dbg.h" #include "hfs_mount.h" #include "hfs_quota.h" @@ -67,37 +68,45 @@ extern unsigned long strtoul(const char *, char **, int); -extern int groupmember(gid_t gid, struct ucred *cred); +static int hfs_makenode(struct vnode *dvp, struct vnode **vpp, + struct componentname *cnp, struct vnode_attr *vap, + vfs_context_t ctx); -static int hfs_makenode(int mode, struct vnode *dvp, struct vnode **vpp, - struct componentname *cnp); - -static int hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, - struct vnode **rvpp, struct proc *p); - -static int hfs_metasync(struct hfsmount *hfsmp, daddr_t node, struct proc *p); +static int hfs_metasync(struct hfsmount *hfsmp, daddr64_t node, struct proc *p); static int hfs_removedir(struct vnode *, struct vnode *, struct componentname *, int); static int hfs_removefile(struct vnode *, struct vnode *, struct componentname *, - int); + int, int); + +static int hfs_vnop_close(struct vnop_close_args*); +static int hfs_vnop_create(struct vnop_create_args*); +static int hfs_vnop_exchange(struct vnop_exchange_args*); +static int hfs_vnop_fsync(struct vnop_fsync_args*); +static int hfs_vnop_mkdir(struct vnop_mkdir_args*); +static int hfs_vnop_mknod(struct vnop_mknod_args*); +static int hfs_vnop_getattr(struct vnop_getattr_args*); +static int hfs_vnop_open(struct vnop_open_args*); +static int hfs_vnop_readdir(struct vnop_readdir_args*); +static int hfs_vnop_remove(struct vnop_remove_args*); +static int hfs_vnop_rename(struct vnop_rename_args*); +static int hfs_vnop_rmdir(struct vnop_rmdir_args*); +static int hfs_vnop_symlink(struct vnop_symlink_args*); +static int hfs_vnop_setattr(struct vnop_setattr_args*); /* Options for hfs_removedir and hfs_removefile */ -#define HFSRM_PARENT_LOCKED 0x01 -#define HFSRM_SKIP_RESERVE 0x02 -#define HFSRM_SAVE_NAME 0x04 -#define HFSRM_RENAMEOPTS 0x07 +#define HFSRM_SKIP_RESERVE 0x01 -int hfs_write_access(struct vnode *vp, struct ucred *cred, struct proc *p, Boolean considerFlags); +int hfs_write_access(struct vnode *vp, kauth_cred_t cred, struct proc *p, Boolean considerFlags); -int hfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred, +int hfs_chflags(struct vnode *vp, uint32_t flags, kauth_cred_t cred, struct proc *p); -int hfs_chmod(struct vnode *vp, int mode, struct ucred *cred, +int hfs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct proc *p); int hfs_chown(struct vnode *vp, uid_t uid, gid_t gid, - struct ucred *cred, struct proc *p); + kauth_cred_t cred, struct proc *p); /***************************************************************************** * @@ -106,140 +115,83 @@ int hfs_chown(struct vnode *vp, uid_t uid, gid_t gid, *****************************************************************************/ /* - * Create a regular file -#% create dvp L U U -#% create vpp - L - -# - vop_create { - IN WILLRELE 
struct vnode *dvp; - OUT struct vnode **vpp; - IN struct componentname *cnp; - IN struct vattr *vap; - - We are responsible for freeing the namei buffer, - it is done in hfs_makenode() -*/ - + * Create a regular file. + */ static int -hfs_create(ap) - struct vop_create_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vattr *a_vap; - } */ *ap; +hfs_vnop_create(struct vnop_create_args *ap) { - struct vattr *vap = ap->a_vap; - - return (hfs_makenode(MAKEIMODE(vap->va_type, vap->va_mode), - ap->a_dvp, ap->a_vpp, ap->a_cnp)); + return hfs_makenode(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap, ap->a_context); } - /* - * Mknod vnode call - -#% mknod dvp L U U -#% mknod vpp - X - -# - vop_mknod { - IN WILLRELE struct vnode *dvp; - OUT WILLRELE struct vnode **vpp; - IN struct componentname *cnp; - IN struct vattr *vap; - */ -/* ARGSUSED */ - + * Make device special file. + */ static int -hfs_mknod(ap) - struct vop_mknod_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vattr *a_vap; - } */ *ap; +hfs_vnop_mknod(struct vnop_mknod_args *ap) { - struct vattr *vap = ap->a_vap; + struct vnode_attr *vap = ap->a_vap; + struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct cnode *cp; int error; - if (VTOVCB(ap->a_dvp)->vcbSigWord != kHFSPlusSigWord) { - VOP_ABORTOP(ap->a_dvp, ap->a_cnp); - vput(ap->a_dvp); - return (EOPNOTSUPP); + if (VTOVCB(dvp)->vcbSigWord != kHFSPlusSigWord) { + return (ENOTSUP); } /* Create the vnode */ - error = hfs_makenode(MAKEIMODE(vap->va_type, vap->va_mode), - ap->a_dvp, vpp, ap->a_cnp); + error = hfs_makenode(dvp, vpp, ap->a_cnp, vap, ap->a_context); if (error) return (error); + cp = VTOC(*vpp); - cp->c_flag |= C_ACCESS | C_CHANGE | C_UPDATE; + cp->c_touch_acctime = TRUE; + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + if ((vap->va_rdev != VNOVAL) && (vap->va_type == VBLK || vap->va_type == VCHR)) cp->c_rdev = vap->va_rdev; - /* - * Remove cnode so that it will be reloaded by lookup and - * checked to see if it is an alias of an existing vnode. - * Note: unlike UFS, we don't bash v_type here. - */ - vput(*vpp); - vgone(*vpp); - *vpp = 0; + return (0); } - /* - * Open called. -#% open vp L L L -# - vop_open { - IN struct vnode *vp; - IN int mode; - IN struct ucred *cred; - IN struct proc *p; - */ - - + * Open a file/directory. + */ static int -hfs_open(ap) - struct vop_open_args /* { - struct vnode *a_vp; - int a_mode; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; +hfs_vnop_open(struct vnop_open_args *ap) { struct vnode *vp = ap->a_vp; - struct filefork *fp = VTOF(vp); + struct filefork *fp; struct timeval tv; + int error; /* * Files marked append-only must be opened for appending. */ - if ((vp->v_type != VDIR) && (VTOC(vp)->c_flags & APPEND) && + if ((VTOC(vp)->c_flags & APPEND) && !vnode_isdir(vp) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); - if (ap->a_mode & O_EVTONLY) { - if (vp->v_type == VREG) { - ++VTOF(vp)->ff_evtonly_refs; - } else { - ++VTOC(vp)->c_evtonly_refs; - }; - }; + if (vnode_isreg(vp) && !UBCINFOEXISTS(vp)) + return (EBUSY); /* file is in use by the kernel */ + /* Don't allow journal file to be opened externally. */ + if (VTOC(vp)->c_fileid == VTOHFS(vp)->hfs_jnlfileid) + return (EPERM); /* * On the first (non-busy) open of a fragmented * file attempt to de-frag it (if its less than 20MB). 
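* (Here a fragmented file is one whose eighth and last in-catalog extent
 * slot is in use -- the ff_extents[7].blockCount test below -- meaning
 * further extents have spilled into the extents overflow b-tree.)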
*/ if ((VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) || - !UBCISVALID(vp) || ubc_isinuse(vp, 1)) { + (VTOHFS(vp)->jnl == NULL) || + !vnode_isreg(vp) || vnode_isinuse(vp, 0)) { return (0); } + + if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) + return (error); fp = VTOF(vp); if (fp->ff_blocks && fp->ff_extents[7].blockCount != 0 && @@ -248,299 +200,219 @@ hfs_open(ap) * Wait until system bootup is done (3 min). */ microuptime(&tv); - if (tv.tv_sec < (60 * 3)) { - return (0); + if (tv.tv_sec > (60 * 3)) { + (void) hfs_relocate(vp, VTOVCB(vp)->nextAllocation + 4096, + vfs_context_ucred(ap->a_context), + vfs_context_proc(ap->a_context)); } - (void) hfs_relocate(vp, VTOVCB(vp)->nextAllocation + 4096, ap->a_cred, ap->a_p); } + hfs_unlock(VTOC(vp)); return (0); } -/* - * Close called. - * - * Update the times on the cnode. -#% close vp U U U -# - vop_close { - IN struct vnode *vp; - IN int fflag; - IN struct ucred *cred; - IN struct proc *p; - */ - +/* + * Close a file/directory. + */ static int -hfs_close(ap) - struct vop_close_args /* { +hfs_vnop_close(ap) + struct vnop_close_args /* { struct vnode *a_vp; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { register struct vnode *vp = ap->a_vp; - register struct cnode *cp = VTOC(vp); - register struct filefork *fp = VTOF(vp); - struct proc *p = ap->a_p; - struct timeval tv; - off_t leof; - u_long blks, blocksize; - int devBlockSize; - int error; - - simple_lock(&vp->v_interlock); - if ((!UBCISVALID(vp) && vp->v_usecount > 1) - || (UBCISVALID(vp) && ubc_isinuse(vp, 1))) { - tv = time; - CTIMES(cp, &tv, &tv); - } - simple_unlock(&vp->v_interlock); - - if (ap->a_fflag & O_EVTONLY) { - if (vp->v_type == VREG) { - --VTOF(vp)->ff_evtonly_refs; - } else { - --VTOC(vp)->c_evtonly_refs; - }; - }; + register struct cnode *cp; + struct proc *p = vfs_context_proc(ap->a_context); + struct hfsmount *hfsmp; + int busy; - /* - * VOP_CLOSE can be called with vp locked (from vclean). - * We check for this case using VOP_ISLOCKED and bail. - * - * XXX During a force unmount we won't do the cleanup below! - */ - if (vp->v_type == VDIR || VOP_ISLOCKED(vp)) + if ( hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK) != 0) return (0); + cp = VTOC(vp); + hfsmp = VTOHFS(vp); - leof = fp->ff_size; - - if ((fp->ff_blocks > 0) && - !ISSET(cp->c_flag, C_DELETED) && - ((VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) == 0)) { - enum vtype our_type = vp->v_type; - u_long our_id = vp->v_id; - int was_nocache = ISSET(vp->v_flag, VNOCACHE_DATA); - - error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - if (error) - return (0); - /* - * Since we can context switch in vn_lock our vnode - * could get recycled (eg umount -f). Double check - * that its still ours. 
- */ - if (vp->v_type != our_type || vp->v_id != our_id - || cp != VTOC(vp) || !UBCINFOEXISTS(vp)) { - VOP_UNLOCK(vp, 0, p); - return (0); - } - - /* - * Last chance to explicitly zero out the areas - * that are currently marked invalid: - */ - VOP_DEVBLOCKSIZE(cp->c_devvp, &devBlockSize); - (void) cluster_push(vp); - SET(vp->v_flag, VNOCACHE_DATA); /* Don't cache zeros */ - while (!CIRCLEQ_EMPTY(&fp->ff_invalidranges)) { - struct rl_entry *invalid_range = CIRCLEQ_FIRST(&fp->ff_invalidranges); - off_t start = invalid_range->rl_start; - off_t end = invalid_range->rl_end; - - /* The range about to be written must be validated - * first, so that VOP_CMAP() will return the - * appropriate mapping for the cluster code: - */ - rl_remove(start, end, &fp->ff_invalidranges); - - (void) cluster_write(vp, (struct uio *) 0, leof, - invalid_range->rl_end + 1, invalid_range->rl_start, - (off_t)0, devBlockSize, IO_HEADZEROFILL | IO_NOZERODIRTY); - - if (ISSET(vp->v_flag, VHASDIRTY)) - (void) cluster_push(vp); + // if we froze the fs and we're exiting, then "thaw" the fs + if (hfsmp->hfs_freezing_proc == p && proc_exiting(p)) { + hfsmp->hfs_freezing_proc = NULL; + hfs_global_exclusive_lock_release(hfsmp); + } - cp->c_flag |= C_MODIFIED; - } - cp->c_flag &= ~C_ZFWANTSYNC; - cp->c_zftimeout = 0; - blocksize = VTOVCB(vp)->blockSize; - blks = leof / blocksize; - if (((off_t)blks * (off_t)blocksize) != leof) - blks++; - /* - * Shrink the peof to the smallest size neccessary to contain the leof. - */ - if (blks < fp->ff_blocks) - (void) VOP_TRUNCATE(vp, leof, IO_NDELAY, ap->a_cred, p); - (void) cluster_push(vp); + busy = vnode_isinuse(vp, 1); - if (!was_nocache) - CLR(vp->v_flag, VNOCACHE_DATA); - - /* - * If the VOP_TRUNCATE didn't happen to flush the vnode's - * information out to disk, force it to be updated now that - * all invalid ranges have been zero-filled and validated: - */ - if (cp->c_flag & C_MODIFIED) { - tv = time; - VOP_UPDATE(vp, &tv, &tv, 0); - } - VOP_UNLOCK(vp, 0, p); + if (busy) { + hfs_touchtimes(VTOHFS(vp), cp); + } + if (vnode_isdir(vp)) { + hfs_reldirhints(cp, busy); + } else if (vnode_issystem(vp) && !busy) { + vnode_recycle(vp); } - if ((vp->v_flag & VSYSTEM) && (vp->v_usecount == 1)) - vgone(vp); + + hfs_unlock(cp); return (0); } /* -#% access vp L L L -# - vop_access { - IN struct vnode *vp; - IN int mode; - IN struct ucred *cred; - IN struct proc *p; - - */ - + * Get basic attributes. + */ static int -hfs_access(ap) - struct vop_access_args /* { - struct vnode *a_vp; - int a_mode; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; +hfs_vnop_getattr(struct vnop_getattr_args *ap) { struct vnode *vp = ap->a_vp; - struct cnode *cp = VTOC(vp); - struct ucred *cred = ap->a_cred; - register gid_t *gp; - mode_t mode = ap->a_mode; - mode_t mask = 0; - int i; - int error; + struct vnode_attr *vap = ap->a_vap; + struct vnode *rvp = NULL; + struct hfsmount *hfsmp; + struct cnode *cp; + enum vtype v_type; + int error = 0; - /* - * Disallow write attempts on read-only file systems; - * unless the file is a socket, fifo, or a block or - * character device resident on the file system. - */ - if (mode & VWRITE) { - switch (vp->v_type) { - case VDIR: - case VLNK: - case VREG: - if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) - return (EROFS); -#if QUOTA - if ((error = hfs_getinoquota(cp))) - return (error); -#endif /* QUOTA */ - break; - } - /* If immutable bit set, nobody gets to write it. 
*/ - if (cp->c_flags & IMMUTABLE) - return (EPERM); } + if ((error = hfs_lock(VTOC(vp), HFS_SHARED_LOCK))) { + return (error); + } + cp = VTOC(vp); + hfsmp = VTOHFS(vp); + hfs_touchtimes(hfsmp, cp); + v_type = vnode_vtype(vp); + VATTR_RETURN(vap, va_rdev, (v_type == VBLK || v_type == VCHR) ? cp->c_rdev : 0); + if (v_type == VDIR) { + if (VATTR_IS_ACTIVE(vap, va_nlink)) { + int entries; + + entries = cp->c_nlink; + if (vnode_isvroot(vp)) { + if (hfsmp->hfs_privdir_desc.cd_cnid != 0) + --entries; /* hide private dir */ + if (hfsmp->jnl) + entries -= 2; /* hide the journal files */ + } + VATTR_RETURN(vap, va_nlink, (uint64_t)entries); + } + + if (VATTR_IS_ACTIVE(vap, va_nchildren)) { + int entries; + + entries = cp->c_entries; + if (vnode_isvroot(vp)) { + if (hfsmp->hfs_privdir_desc.cd_cnid != 0) + --entries; /* hide private dir */ + if (hfsmp->jnl) + entries -= 2; /* hide the journal files */ + } + VATTR_RETURN(vap, va_nchildren, entries); + } + } else { + VATTR_RETURN(vap, va_nlink, (uint64_t)cp->c_nlink); + } - /* Otherwise, user id 0 always gets access. */ - if (cred->cr_uid == 0) - return (0); + /* conditional because 64-bit arithmetic can be expensive */ + if (VATTR_IS_ACTIVE(vap, va_total_size)) { + if (v_type == VDIR) { + VATTR_RETURN(vap, va_total_size, cp->c_nlink * AVERAGE_HFSDIRENTRY_SIZE); + } else { + uint64_t total_size = 0; + struct cnode *rcp; + + if (cp->c_datafork) { + total_size = cp->c_datafork->ff_size; + } - mask = 0; + if (cp->c_blocks - VTOF(vp)->ff_blocks) { + /* hfs_vgetrsrc does not use struct proc - therefore passing NULL */ + error = hfs_vgetrsrc(hfsmp, vp, &rvp, NULL); + if (error) { + goto out; + } + + rcp = VTOC(rvp); + if (rcp && rcp->c_rsrcfork) { + total_size += rcp->c_rsrcfork->ff_size; + } + } - /* Otherwise, check the owner. */ - if ( (cp->c_uid == cred->cr_uid) || (cp->c_uid == UNKNOWNUID) ) { - if (mode & VEXEC) - mask |= S_IXUSR; - if (mode & VREAD) - mask |= S_IRUSR; - if (mode & VWRITE) - mask |= S_IWUSR; - return ((cp->c_mode & mask) == mask ? 0 : EACCES); + VATTR_RETURN(vap, va_total_size, total_size); + /* Include size of attribute data (extents), if any */ + if (cp->c_attrblks) { + vap->va_total_size += (uint64_t)cp->c_attrblks * (uint64_t)hfsmp->blockSize; + } + } } - - /* Otherwise, check the groups. */ - if (! (VTOVFS(vp)->mnt_flag & MNT_UNKNOWNPERMISSIONS)) { - for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) - if (cp->c_gid == *gp) { - if (mode & VEXEC) - mask |= S_IXGRP; - if (mode & VREAD) - mask |= S_IRGRP; - if (mode & VWRITE) - mask |= S_IWGRP; - return ((cp->c_mode & mask) == mask ? 0 : EACCES); + if (VATTR_IS_ACTIVE(vap, va_total_alloc)) { + if (v_type == VDIR) { + VATTR_RETURN(vap, va_total_alloc, 0); + } else { + VATTR_RETURN(vap, va_total_alloc, (uint64_t)cp->c_blocks * (uint64_t)hfsmp->blockSize); + /* Include size of attribute data (extents), if any */ + if (cp->c_attrblks) { + vap->va_total_alloc += (uint64_t)cp->c_attrblks * (uint64_t)hfsmp->blockSize; } + } } + /* XXX broken... if you ask for "data size" of rsrc fork vp you get rsrc fork size! */ + if (v_type == VDIR) { + VATTR_RETURN(vap, va_data_size, cp->c_nlink * AVERAGE_HFSDIRENTRY_SIZE); + } else { + VATTR_RETURN(vap, va_data_size, VTOF(vp)->ff_size); + } + if (VATTR_IS_ACTIVE(vap, va_data_alloc) && (v_type != VDIR)) { + /* XXX do we need to account for ff_unallocblocks ? */ + VATTR_RETURN(vap, va_data_alloc, (uint64_t)VTOF(vp)->ff_blocks * (uint64_t)hfsmp->blockSize); + } + /* XXX is this really a good 'optimal I/O size'? 
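+	 * (hfs_logBlockSize is chosen at mount time by
+	 * BestBlockSizeFit(vcb->blockSize, MAXBSIZE, hfs_phys_block_size).)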
+	VATTR_RETURN(vap, va_iosize, hfsmp->hfs_logBlockSize);
+	VATTR_RETURN(vap, va_uid, cp->c_uid);
+	VATTR_RETURN(vap, va_gid, cp->c_gid);
+	VATTR_RETURN(vap, va_mode, cp->c_mode);
+#if 0
+	/* XXX is S_IFXATTR still needed ??? */
+	if (VNODE_IS_RSRC(vp))
+		vap->va_mode |= S_IFXATTR;
+#endif
+	VATTR_RETURN(vap, va_flags, cp->c_flags);
 
-	/* Otherwise, check everyone else. */
-	if (mode & VEXEC)
-		mask |= S_IXOTH;
-	if (mode & VREAD)
-		mask |= S_IROTH;
-	if (mode & VWRITE)
-		mask |= S_IWOTH;
-	return ((cp->c_mode & mask) == mask ? 0 : EACCES);
-}
-
-
-
-/*
-#% getattr	vp	=	=	=
-#
- vop_getattr {
-	IN struct vnode *vp;
-	IN struct vattr *vap;
-	IN struct ucred *cred;
-	IN struct proc *p;
-
- */
-
-
-/* ARGSUSED */
-static int
-hfs_getattr(ap)
-	struct vop_getattr_args /* {
-		struct vnode *a_vp;
-		struct vattr *a_vap;
-		struct ucred *a_cred;
-		struct proc *a_p;
-	} */ *ap;
-{
-	struct vnode *vp = ap->a_vp;
-	struct cnode *cp = VTOC(vp);
-	struct vattr *vap = ap->a_vap;
-	struct timeval tv;
-
-	tv = time;
-	CTIMES(cp, &tv, &tv);
-
-	vap->va_type = vp->v_type;
-	vap->va_mode = cp->c_mode;
-	vap->va_nlink = cp->c_nlink;
 	/*
-	 * [2856576] Since we are dynamically changing the owner, also
-	 * effectively turn off the set-user-id and set-group-id bits,
-	 * just like chmod(2) would when changing ownership.  This prevents
-	 * a security hole where set-user-id programs run as whoever is
-	 * logged on (or root if nobody is logged in yet!)
+	 * If the VFS wants extended security data, and we know that we
+	 * don't have any (because it never told us it was setting any)
+	 * then we can return the supported bit and no data.  If we do
+	 * have extended security, we can just leave the bit alone and
+	 * the VFS will use the fallback path to fetch it.
 	 */
-	if (cp->c_uid == UNKNOWNUID) {
-		vap->va_mode &= ~(S_ISUID | S_ISGID);
-		vap->va_uid = ap->a_cred->cr_uid;
-	} else {
-		vap->va_uid = cp->c_uid;
+	if (VATTR_IS_ACTIVE(vap, va_acl)) {
+		if ((cp->c_attr.ca_recflags & kHFSHasSecurityMask) == 0) {
+			vap->va_acl = KAUTH_FILESEC_NONE;
+			VATTR_SET_SUPPORTED(vap, va_acl);
+		}
 	}
-	vap->va_gid = cp->c_gid;
-	vap->va_fsid = cp->c_dev;
+	vap->va_create_time.tv_sec = cp->c_itime;
+	vap->va_create_time.tv_nsec = 0;
+	VATTR_SET_SUPPORTED(vap, va_create_time);
+
+	if (VATTR_IS_ACTIVE(vap, va_access_time)) {
+		/* Access times are lazily updated, get current time if needed */
+		if (cp->c_touch_acctime) {
+			struct timeval tv;
+
+			microtime(&tv);
+			vap->va_access_time.tv_sec = tv.tv_sec;
+		} else {
+			vap->va_access_time.tv_sec = cp->c_atime;
+		}
+		vap->va_access_time.tv_nsec = 0;
+		VATTR_SET_SUPPORTED(vap, va_access_time);
+	}
+	vap->va_modify_time.tv_sec = cp->c_mtime;
+	vap->va_modify_time.tv_nsec = 0;
+	VATTR_SET_SUPPORTED(vap, va_modify_time);
+	vap->va_change_time.tv_sec = cp->c_ctime;
+	vap->va_change_time.tv_nsec = 0;
+	VATTR_SET_SUPPORTED(vap, va_change_time);
+	vap->va_backup_time.tv_sec = cp->c_btime;
+	vap->va_backup_time.tv_nsec = 0;
+	VATTR_SET_SUPPORTED(vap, va_backup_time);
+
 	/*
 	 * Exporting file IDs from HFS Plus:
 	 *
@@ -549,175 +421,204 @@ hfs_getattr(ap)
 	 * c_cnid belongs to the active directory entry (ie the link)
 	 * and the c_fileid is for the actual inode (ie the data file).
 	 *
-	 * The stat call (getattr) will always return the c_fileid
-	 * and Carbon APIs, which are hardlink-ignorant, will always
-	 * receive the c_cnid (from getattrlist).
+	 * The stat call (getattr) uses va_fileid and the Carbon APIs,
+	 * which are hardlink-ignorant, will ask for va_linkid.
 	 */
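The file-ID rules in the comment above are visible from user space: for a hard link, stat(2) reports the inode number (va_fileid), while getattrlist(2) asking for ATTR_CMN_OBJID reports the link's own CNID (va_linkid). A small sketch (error handling trimmed):

#include <sys/attr.h>
#include <sys/stat.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(int argc, char *argv[])
{
	struct stat st;
	struct attrlist alist;
	struct {
		u_int32_t  length;
		fsobj_id_t objid;
	} __attribute__((packed)) abuf;

	if (argc < 2)
		return (1);
	memset(&alist, 0, sizeof(alist));
	alist.bitmapcount = ATTR_BIT_MAP_COUNT;
	alist.commonattr = ATTR_CMN_OBJID;

	if (stat(argv[1], &st) == 0)
		printf("va_fileid (st_ino):         %lu\n", (unsigned long)st.st_ino);
	if (getattrlist(argv[1], &alist, &abuf, sizeof(abuf), 0) == 0)
		printf("va_linkid (ATTR_CMN_OBJID): %u\n", (unsigned int)abuf.objid.fid_objno);
	return (0);
}

On one link of a multiply-linked file the two numbers differ; on an ordinary file they match.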
-	vap->va_fileid = cp->c_fileid;
-	vap->va_atime.tv_sec = cp->c_atime;
-	vap->va_atime.tv_nsec = 0;
-	vap->va_mtime.tv_sec = cp->c_mtime;
-	vap->va_mtime.tv_nsec = cp->c_mtime_nsec;
-	vap->va_ctime.tv_sec = cp->c_ctime;
-	vap->va_ctime.tv_nsec = 0;
-	vap->va_gen = 0;
-	vap->va_flags = cp->c_flags;
-	vap->va_rdev = 0;
-	vap->va_blocksize = VTOVFS(vp)->mnt_stat.f_iosize;
-	vap->va_filerev = 0;
-	if (vp->v_type == VDIR) {
-		vap->va_size = cp->c_nlink * AVERAGE_HFSDIRENTRY_SIZE;
-		vap->va_bytes = 0;
-	} else {
-		vap->va_size = VTOF(vp)->ff_size;
-		vap->va_bytes = (u_quad_t)cp->c_blocks *
-				(u_quad_t)VTOVCB(vp)->blockSize;
-		if (vp->v_type == VBLK || vp->v_type == VCHR)
-			vap->va_rdev = cp->c_rdev;
+	VATTR_RETURN(vap, va_fileid, (uint64_t)cp->c_fileid);
+	VATTR_RETURN(vap, va_linkid, (uint64_t)cp->c_cnid);
+	VATTR_RETURN(vap, va_parentid, (uint64_t)cp->c_parentcnid);
+	VATTR_RETURN(vap, va_fsid, cp->c_dev);
+	VATTR_RETURN(vap, va_filerev, 0);
+
+	VATTR_RETURN(vap, va_encoding, cp->c_encoding);
+
+	/* if this is the root, let the VFS find out the mount name, which may be different from the real name */
+	if (VATTR_IS_ACTIVE(vap, va_name) && !vnode_isvroot(vp)) {
+		/* Return the name for ATTR_CMN_NAME */
+		if (cp->c_desc.cd_namelen == 0) {
+			error = ENOENT;
+			goto out;
+		}
+
+		strncpy(vap->va_name, cp->c_desc.cd_nameptr, MAXPATHLEN);
+		vap->va_name[MAXPATHLEN-1] = '\0';
+		VATTR_SET_SUPPORTED(vap, va_name);
 	}
-	return (0);
-}
+out:
+	hfs_unlock(cp);
+	if (rvp) {
+		vnode_put(rvp);
+	}
+	return (error);
+}
 
 static int
-hfs_setattr(ap)
-	struct vop_setattr_args /* {
+hfs_vnop_setattr(ap)
+	struct vnop_setattr_args /* {
 		struct vnode *a_vp;
-		struct vattr *a_vap;
-		struct ucred *a_cred;
-		struct proc *a_p;
+		struct vnode_attr *a_vap;
+		vfs_context_t a_context;
 	} */ *ap;
 {
-	struct vattr *vap = ap->a_vap;
+	struct vnode_attr *vap = ap->a_vap;
 	struct vnode *vp = ap->a_vp;
-	struct cnode *cp = VTOC(vp);
-	struct ucred *cred = ap->a_cred;
-	struct proc *p = ap->a_p;
-	struct timeval atimeval, mtimeval;
-	int error;
+	struct cnode *cp = NULL;
+	struct hfsmount *hfsmp;
+	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
+	struct proc *p = vfs_context_proc(ap->a_context);
+	int error = 0;
+	uid_t nuid;
+	gid_t ngid;
 
-	/*
-	 * Check for unsettable attributes.
-	 */
-	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
-	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
-	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
-	    ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
-		return (EINVAL);
-	}
+	hfsmp = VTOHFS(vp);
 
-	// XXXdbg
-	// don't allow people to set the attributes of symlinks
-	// (nfs has a bad habit of doing ths and it can cause
-	// problems for journaling).
-	//
-	if (vp->v_type == VLNK) {
-		return 0;
+	/* Don't allow modification of the journal file. */
+	if (hfsmp->hfs_jnlfileid == VTOC(vp)->c_fileid) {
+		return (EPERM);
 	}
 
+	/*
+	 * File size change request.
+	 * We are guaranteed that this is not a directory, and that
+	 * the filesystem object is writeable.
+	 */
+	VATTR_SET_SUPPORTED(vap, va_data_size);
+	if (VATTR_IS_ACTIVE(vap, va_data_size) && !vnode_islnk(vp)) {
+		/* Take truncate lock before taking cnode lock.
*/ + hfs_lock_truncate(VTOC(vp), TRUE); + if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) { + hfs_unlock_truncate(VTOC(vp)); + return (error); + } + cp = VTOC(vp); - if (vap->va_flags != VNOVAL) { - if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) - return (EROFS); - if ((error = hfs_chflags(vp, vap->va_flags, cred, p))) + error = hfs_truncate(vp, vap->va_data_size, vap->va_vaflags & 0xffff, 0, ap->a_context); + + hfs_unlock_truncate(cp); + if (error) + goto out; + } + if (cp == NULL) { + if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) return (error); - if (vap->va_flags & (IMMUTABLE | APPEND)) - return (0); + cp = VTOC(vp); } - if (cp->c_flags & (IMMUTABLE | APPEND)) - return (EPERM); + /* + * Owner/group change request. + * We are guaranteed that the new owner/group is valid and legal. + */ + VATTR_SET_SUPPORTED(vap, va_uid); + VATTR_SET_SUPPORTED(vap, va_gid); + nuid = VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : (uid_t)VNOVAL; + ngid = VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : (gid_t)VNOVAL; + if (((nuid != (uid_t)VNOVAL) || (ngid != (gid_t)VNOVAL)) && + ((error = hfs_chown(vp, nuid, ngid, cred, p)) != 0)) + goto out; - // XXXdbg - don't allow modification of the journal or journal_info_block - if (VTOHFS(vp)->jnl && cp->c_datafork) { - struct HFSPlusExtentDescriptor *extd; + /* + * Mode change request. + * We are guaranteed that the mode value is valid and that in + * conjunction with the owner and group, this change is legal. + */ + VATTR_SET_SUPPORTED(vap, va_mode); + if (VATTR_IS_ACTIVE(vap, va_mode) && + ((error = hfs_chmod(vp, (int)vap->va_mode, cred, p)) != 0)) + goto out; - extd = &cp->c_datafork->ff_extents[0]; - if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) { - return EPERM; - } - } + /* + * File flags change. + * We are guaranteed that only flags allowed to change given the + * current securelevel are being changed. + */ + VATTR_SET_SUPPORTED(vap, va_flags); + if (VATTR_IS_ACTIVE(vap, va_flags) && + ((error = hfs_chflags(vp, vap->va_flags, cred, p)) != 0)) + goto out; /* - * Go through the fields and update iff not VNOVAL. + * If the file's extended security data is being changed, we + * need to note the change. Note that because we don't store + * the data, we do not set the SUPPORTED bit; this will cause + * the VFS to use a fallback strategy. */ - if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { - if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) - return (EROFS); - if ((error = hfs_chown(vp, vap->va_uid, vap->va_gid, cred, p))) - return (error); - } - if (vap->va_size != VNOVAL) { - /* - * Disallow write attempts on read-only file systems; - * unless the file is a socket, fifo, or a block or - * character device resident on the file system. - */ - switch (vp->v_type) { - case VDIR: - return (EISDIR); - case VLNK: - case VREG: - if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) - return (EROFS); - break; - default: - break; + if (VATTR_IS_ACTIVE(vap, va_acl)) { + /* Remember if any ACL data was set or cleared. 
*/ + if (vap->va_acl == NULL) { + /* being cleared */ + if (cp->c_attr.ca_recflags & kHFSHasSecurityMask) { + cp->c_attr.ca_recflags &= ~kHFSHasSecurityMask; + cp->c_touch_chgtime = TRUE; + } + } else { + /* being set */ + if ((cp->c_attr.ca_recflags & kHFSHasSecurityMask) == 0) { + cp->c_attr.ca_recflags |= kHFSHasSecurityMask; + cp->c_touch_chgtime = TRUE; + } } - if ((error = VOP_TRUNCATE(vp, vap->va_size, 0, cred, p))) - return (error); } - cp = VTOC(vp); - if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { - if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) - return (EROFS); - if (((error = hfs_owner_rights(VTOHFS(vp), cp->c_uid, cred, p, true)) != 0) && - ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || - (error = VOP_ACCESS(vp, VWRITE, cred, p)))) { - return (error); + + /* + * Timestamp updates. + */ + VATTR_SET_SUPPORTED(vap, va_create_time); + VATTR_SET_SUPPORTED(vap, va_access_time); + VATTR_SET_SUPPORTED(vap, va_modify_time); + VATTR_SET_SUPPORTED(vap, va_backup_time); + VATTR_SET_SUPPORTED(vap, va_change_time); + if (VATTR_IS_ACTIVE(vap, va_create_time) || + VATTR_IS_ACTIVE(vap, va_access_time) || + VATTR_IS_ACTIVE(vap, va_modify_time) || + VATTR_IS_ACTIVE(vap, va_backup_time)) { + if (vnode_islnk(vp)) + goto done; + if (VATTR_IS_ACTIVE(vap, va_create_time)) + cp->c_itime = vap->va_create_time.tv_sec; + if (VATTR_IS_ACTIVE(vap, va_access_time)) { + cp->c_atime = vap->va_access_time.tv_sec; + cp->c_touch_acctime = FALSE; } - if (vap->va_atime.tv_sec != VNOVAL) - cp->c_flag |= C_ACCESS; - if (vap->va_mtime.tv_sec != VNOVAL) { - cp->c_flag |= C_CHANGE | C_UPDATE; + if (VATTR_IS_ACTIVE(vap, va_modify_time)) { + cp->c_mtime = vap->va_modify_time.tv_sec; + cp->c_touch_modtime = FALSE; + cp->c_touch_chgtime = TRUE; + /* * The utimes system call can reset the modification * time but it doesn't know about HFS create times. - * So we need to insure that the creation time is + * So we need to ensure that the creation time is * always at least as old as the modification time. */ if ((VTOVCB(vp)->vcbSigWord == kHFSPlusSigWord) && - (cp->c_cnid != kRootDirID) && - (vap->va_mtime.tv_sec < cp->c_itime)) { - cp->c_itime = vap->va_mtime.tv_sec; + (cp->c_cnid != kHFSRootFolderID) && + (cp->c_mtime < cp->c_itime)) { + cp->c_itime = cp->c_mtime; } } - atimeval.tv_sec = vap->va_atime.tv_sec; - atimeval.tv_usec = 0; - mtimeval.tv_sec = vap->va_mtime.tv_sec; - mtimeval.tv_usec = 0; - if ((error = VOP_UPDATE(vp, &atimeval, &mtimeval, 1))) - return (error); + if (VATTR_IS_ACTIVE(vap, va_backup_time)) + cp->c_btime = vap->va_backup_time.tv_sec; + cp->c_flag |= C_MODIFIED; } - error = 0; - if (vap->va_mode != (mode_t)VNOVAL) { - if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) - return (EROFS); - error = hfs_chmod(vp, (int)vap->va_mode, cred, p); + + /* + * Set name encoding. 
+ */ + VATTR_SET_SUPPORTED(vap, va_encoding); + if (VATTR_IS_ACTIVE(vap, va_encoding)) { + cp->c_encoding = vap->va_encoding; + hfs_setencodingbits(hfsmp, cp->c_encoding); } + +done: + if ((error = hfs_update(vp, TRUE)) != 0) + goto out; HFS_KNOTE(vp, NOTE_ATTRIB); +out: + if (cp) + hfs_unlock(cp); return (error); } @@ -728,11 +629,7 @@ hfs_setattr(ap) */ __private_extern__ int -hfs_chmod(vp, mode, cred, p) - register struct vnode *vp; - register int mode; - register struct ucred *cred; - struct proc *p; +hfs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct proc *p) { register struct cnode *cp = VTOC(vp); int error; @@ -751,40 +648,31 @@ hfs_chmod(vp, mode, cred, p) } #if OVERRIDE_UNKNOWN_PERMISSIONS - if (VTOVFS(vp)->mnt_flag & MNT_UNKNOWNPERMISSIONS) { + if (((unsigned int)vfs_flags(VTOVFS(vp))) & MNT_UNKNOWNPERMISSIONS) { return (0); }; #endif - if ((error = hfs_owner_rights(VTOHFS(vp), cp->c_uid, cred, p, true)) != 0) - return (error); - if (cred->cr_uid) { - if (vp->v_type != VDIR && (mode & S_ISTXT)) - return (EFTYPE); - if (!groupmember(cp->c_gid, cred) && (mode & S_ISGID)) - return (EPERM); - } cp->c_mode &= ~ALLPERMS; cp->c_mode |= (mode & ALLPERMS); - cp->c_flag |= C_CHANGE; + cp->c_touch_chgtime = TRUE; return (0); } __private_extern__ int -hfs_write_access(struct vnode *vp, struct ucred *cred, struct proc *p, Boolean considerFlags) +hfs_write_access(struct vnode *vp, kauth_cred_t cred, struct proc *p, Boolean considerFlags) { struct cnode *cp = VTOC(vp); - gid_t *gp; int retval = 0; - int i; + int is_member; /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ - switch (vp->v_type) { + switch (vnode_vtype(vp)) { case VDIR: case VLNK: case VREG: @@ -800,7 +688,7 @@ hfs_write_access(struct vnode *vp, struct ucred *cred, struct proc *p, Boolean c return (EPERM); /* Otherwise, user id 0 always gets access. */ - if (cred->cr_uid == 0) + if (!suser(cred, NULL)) return (0); /* Otherwise, check the owner. */ @@ -808,9 +696,8 @@ hfs_write_access(struct vnode *vp, struct ucred *cred, struct proc *p, Boolean c return ((cp->c_mode & S_IWUSR) == S_IWUSR ? 0 : EACCES); /* Otherwise, check the groups. */ - for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) { - if (cp->c_gid == *gp) - return ((cp->c_mode & S_IWGRP) == S_IWGRP ? 0 : EACCES); + if (kauth_cred_ismember_gid(cred, cp->c_gid, &is_member) == 0 && is_member) { + return ((cp->c_mode & S_IWGRP) == S_IWGRP ? 0 : EACCES); } /* Otherwise, check everyone else. 
*/ @@ -825,38 +712,13 @@ hfs_write_access(struct vnode *vp, struct ucred *cred, struct proc *p, Boolean c */ __private_extern__ int -hfs_chflags(vp, flags, cred, p) - register struct vnode *vp; - register u_long flags; - register struct ucred *cred; - struct proc *p; +hfs_chflags(struct vnode *vp, uint32_t flags, __unused kauth_cred_t cred, __unused struct proc *p) { register struct cnode *cp = VTOC(vp); - int retval; - if (VTOVCB(vp)->vcbSigWord == kHFSSigWord) { - if ((retval = hfs_write_access(vp, cred, p, false)) != 0) { - return retval; - }; - } else if ((retval = hfs_owner_rights(VTOHFS(vp), cp->c_uid, cred, p, true)) != 0) { - return retval; - }; - - if (cred->cr_uid == 0) { - if ((cp->c_flags & (SF_IMMUTABLE | SF_APPEND)) && - securelevel > 0) { - return EPERM; - }; - cp->c_flags = flags; - } else { - if (cp->c_flags & (SF_IMMUTABLE | SF_APPEND) || - (flags & UF_SETTABLE) != flags) { - return EPERM; - }; - cp->c_flags &= SF_SETTABLE; - cp->c_flags |= (flags & UF_SETTABLE); - } - cp->c_flag |= C_CHANGE; + cp->c_flags &= SF_SETTABLE; + cp->c_flags |= (flags & UF_SETTABLE); + cp->c_touch_chgtime = TRUE; return (0); } @@ -868,41 +730,42 @@ hfs_chflags(vp, flags, cred, p) */ __private_extern__ int -hfs_chown(vp, uid, gid, cred, p) - register struct vnode *vp; - uid_t uid; - gid_t gid; - struct ucred *cred; - struct proc *p; +hfs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred, + struct proc *p) { register struct cnode *cp = VTOC(vp); uid_t ouid; gid_t ogid; int error = 0; + int is_member; #if QUOTA register int i; int64_t change; #endif /* QUOTA */ if (VTOVCB(vp)->vcbSigWord != kHFSPlusSigWord) - return (EOPNOTSUPP); + return (ENOTSUP); - if (VTOVFS(vp)->mnt_flag & MNT_UNKNOWNPERMISSIONS) + if (((unsigned int)vfs_flags(VTOVFS(vp))) & MNT_UNKNOWNPERMISSIONS) return (0); if (uid == (uid_t)VNOVAL) uid = cp->c_uid; if (gid == (gid_t)VNOVAL) gid = cp->c_gid; + +#if 0 /* we are guaranteed that this is already the case */ /* * If we don't own the file, are trying to change the owner * of the file, or are not a member of the target group, * the caller must be superuser or the call fails. 
 	 */
-	if ((cred->cr_uid != cp->c_uid || uid != cp->c_uid ||
-	    (gid != cp->c_gid && !groupmember((gid_t)gid, cred))) &&
-	    (error = suser(cred, &p->p_acflag)))
+	if ((kauth_cred_getuid(cred) != cp->c_uid || uid != cp->c_uid ||
+	    (gid != cp->c_gid &&
+	     (kauth_cred_ismember_gid(cred, gid, &is_member) || !is_member))) &&
+	    (error = suser(cred, 0)))
 		return (error);
+#endif
 
 	ogid = cp->c_gid;
 	ouid = cp->c_uid;
@@ -910,26 +773,26 @@ hfs_chown(vp, uid, gid, cred, p)
 		if ((error = hfs_getinoquota(cp)))
 			return (error);
 		if (ouid == uid) {
-			dqrele(vp, cp->c_dquot[USRQUOTA]);
+			dqrele(cp->c_dquot[USRQUOTA]);
 			cp->c_dquot[USRQUOTA] = NODQUOT;
 		}
 		if (ogid == gid) {
-			dqrele(vp, cp->c_dquot[GRPQUOTA]);
+			dqrele(cp->c_dquot[GRPQUOTA]);
 			cp->c_dquot[GRPQUOTA] = NODQUOT;
 		}
 		/*
 		 * Eventually need to account for (fake) a block per directory
-		 *if (vp->v_type == VDIR)
-		 *change = VTOVCB(vp)->blockSize;
-		 *else
+		 * if (vnode_isdir(vp))
+		 *	change = VTOHFS(vp)->blockSize;
+		 * else
 		 */
 		change = (int64_t)(cp->c_blocks) * (int64_t)VTOVCB(vp)->blockSize;
 		(void) hfs_chkdq(cp, -change, cred, CHOWN);
 		(void) hfs_chkiq(cp, -1, cred, CHOWN);
 		for (i = 0; i < MAXQUOTAS; i++) {
-			dqrele(vp, cp->c_dquot[i]);
+			dqrele(cp->c_dquot[i]);
 			cp->c_dquot[i] = NODQUOT;
 		}
 #endif /* QUOTA */
@@ -938,11 +801,11 @@ hfs_chown(vp, uid, gid, cred, p)
 #if QUOTA
 	if ((error = hfs_getinoquota(cp)) == 0) {
 		if (ouid == uid) {
-			dqrele(vp, cp->c_dquot[USRQUOTA]);
+			dqrele(cp->c_dquot[USRQUOTA]);
 			cp->c_dquot[USRQUOTA] = NODQUOT;
 		}
 		if (ogid == gid) {
-			dqrele(vp, cp->c_dquot[GRPQUOTA]);
+			dqrele(cp->c_dquot[GRPQUOTA]);
 			cp->c_dquot[GRPQUOTA] = NODQUOT;
 		}
 		if ((error = hfs_chkdq(cp, change, cred, CHOWN)) == 0) {
@@ -952,7 +815,7 @@ hfs_chown(vp, uid, gid, cred, p)
 			(void) hfs_chkdq(cp, -change, cred, CHOWN|FORCE);
 		}
 		for (i = 0; i < MAXQUOTAS; i++) {
-			dqrele(vp, cp->c_dquot[i]);
+			dqrele(cp->c_dquot[i]);
 			cp->c_dquot[i] = NODQUOT;
 		}
 	}
@@ -960,11 +823,11 @@ hfs_chown(vp, uid, gid, cred, p)
 		cp->c_uid = ouid;
 		if (hfs_getinoquota(cp) == 0) {
 			if (ouid == uid) {
-				dqrele(vp, cp->c_dquot[USRQUOTA]);
+				dqrele(cp->c_dquot[USRQUOTA]);
 				cp->c_dquot[USRQUOTA] = NODQUOT;
 			}
 			if (ogid == gid) {
-				dqrele(vp, cp->c_dquot[GRPQUOTA]);
+				dqrele(cp->c_dquot[GRPQUOTA]);
 				cp->c_dquot[GRPQUOTA] = NODQUOT;
 			}
 			(void) hfs_chkdq(cp, change, cred, FORCE|CHOWN);
@@ -978,57 +841,59 @@ good:
 #endif /* QUOTA */
 
 	if (ouid != uid || ogid != gid)
-		cp->c_flag |= C_CHANGE;
-	if (ouid != uid && cred->cr_uid != 0)
-		cp->c_mode &= ~S_ISUID;
-	if (ogid != gid && cred->cr_uid != 0)
-		cp->c_mode &= ~S_ISGID;
+		cp->c_touch_chgtime = TRUE;
 	return (0);
 }
 
 /*
-#
-#% exchange fvp L L L
-#% exchange tvp L L L
-#
+ * The hfs_exchange routine swaps the fork data in two files by
+ * exchanging some of the information in the cnode.  It is used
+ * to preserve the file ID when updating an existing file, in
+ * case the file is being tracked through its file ID. Typically
+ * it's used after creating a new file during a safe-save.
 */
- /*
-  * The hfs_exchange routine swaps the fork data in two files by
-  * exchanging some of the information in the cnode.  It is used
-  * to preserve the file ID when updating an existing file, in
-  * case the file is being tracked through its file ID. Typically
-  * its used after creating a new file during a safe-save.
- */ - static int -hfs_exchange(ap) - struct vop_exchange_args /* { +hfs_vnop_exchange(ap) + struct vnop_exchange_args /* { struct vnode *a_fvp; struct vnode *a_tvp; - struct ucred *a_cred; - struct proc *a_p; + int a_options; + vfs_context_t a_context; } */ *ap; { struct vnode *from_vp = ap->a_fvp; struct vnode *to_vp = ap->a_tvp; - struct cnode *from_cp = VTOC(from_vp); - struct cnode *to_cp = VTOC(to_vp); - struct hfsmount *hfsmp = VTOHFS(from_vp); + struct cnode *from_cp; + struct cnode *to_cp; + struct hfsmount *hfsmp; struct cat_desc tempdesc; struct cat_attr tempattr; - int error = 0, started_tr = 0, grabbed_lock = 0; - cat_cookie_t cookie = {0}; + int lockflags; + int error = 0, started_tr = 0, got_cookie = 0; + cat_cookie_t cookie; /* The files must be on the same volume. */ - if (from_vp->v_mount != to_vp->v_mount) + if (vnode_mount(from_vp) != vnode_mount(to_vp)) return (EXDEV); + if (from_vp == to_vp) + return (EINVAL); + + if ((error = hfs_lockpair(VTOC(from_vp), VTOC(to_vp), HFS_EXCLUSIVE_LOCK))) + return (error); + + from_cp = VTOC(from_vp); + to_cp = VTOC(to_vp); + hfsmp = VTOHFS(from_vp); + /* Only normal files can be exchanged. */ - if ((from_vp->v_type != VREG) || (to_vp->v_type != VREG) || + if (!vnode_isreg(from_vp) || !vnode_isreg(to_vp) || (from_cp->c_flag & C_HARDLINK) || (to_cp->c_flag & C_HARDLINK) || - VNODE_IS_RSRC(from_vp) || VNODE_IS_RSRC(to_vp)) - return (EINVAL); + VNODE_IS_RSRC(from_vp) || VNODE_IS_RSRC(to_vp)) { + error = EINVAL; + goto exit; + } // XXXdbg - don't allow modification of the journal or journal_info_block if (hfsmp->jnl) { @@ -1037,60 +902,58 @@ hfs_exchange(ap) if (from_cp->c_datafork) { extd = &from_cp->c_datafork->ff_extents[0]; if (extd->startBlock == VTOVCB(from_vp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { - return EPERM; + error = EPERM; + goto exit; } } if (to_cp->c_datafork) { extd = &to_cp->c_datafork->ff_extents[0]; if (extd->startBlock == VTOVCB(to_vp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { - return EPERM; + error = EPERM; + goto exit; } } } - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - grabbed_lock = 1; - if (hfsmp->jnl) { - if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { - goto Err_Exit; - } - started_tr = 1; + if ((error = hfs_start_transaction(hfsmp)) != 0) { + goto exit; } + started_tr = 1; /* * Reserve some space in the Catalog file. */ - if ((error = cat_preflight(hfsmp, CAT_EXCHANGE, &cookie, ap->a_p))) { - goto Err_Exit; + bzero(&cookie, sizeof(cookie)); + if ((error = cat_preflight(hfsmp, CAT_EXCHANGE, &cookie, vfs_context_proc(ap->a_context)))) { + goto exit; } - - /* Lock catalog b-tree */ - error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, ap->a_p); - if (error) goto Err_Exit; + got_cookie = 1; /* The backend code always tries to delete the virtual * extent id for exchanging files so we neeed to lock * the extents b-tree. 
*/ - error = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p); - if (error) { - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, ap->a_p); - goto Err_Exit; - } + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_EXTENTS | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); /* Do the exchange */ - error = MacToVFSError(ExchangeFileIDs(HFSTOVCB(hfsmp), - from_cp->c_desc.cd_nameptr, to_cp->c_desc.cd_nameptr, - from_cp->c_parentcnid, to_cp->c_parentcnid, - from_cp->c_hint, to_cp->c_hint)); + error = ExchangeFileIDs(hfsmp, + from_cp->c_desc.cd_nameptr, + to_cp->c_desc.cd_nameptr, + from_cp->c_parentcnid, + to_cp->c_parentcnid, + from_cp->c_hint, + to_cp->c_hint); + hfs_systemfile_unlock(hfsmp, lockflags); - (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, ap->a_p); - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, ap->a_p); + /* + * Note that we don't need to exchange any extended attributes + * since the attributes are keyed by file ID. + */ if (error != E_NONE) { - goto Err_Exit; + error = MacToVFSError(error); + goto exit; } /* Purge the vnodes from the name cache */ @@ -1134,12 +997,8 @@ hfs_exchange(ap) to_cp->c_mode = tempattr.ca_mode; bcopy(tempattr.ca_finderinfo, to_cp->c_finderinfo, 32); - /* Reinsert into the cnode hash under new file IDs*/ - hfs_chashremove(from_cp); - hfs_chashremove(to_cp); - - hfs_chashinsert(from_cp); - hfs_chashinsert(to_cp); + /* Rehash the cnodes using their new file IDs */ + hfs_chash_rehash(from_cp, to_cp); /* * When a file moves out of "Cleanup At Startup" @@ -1148,84 +1007,73 @@ hfs_exchange(ap) if ((from_cp->c_flags & UF_NODUMP) && (from_cp->c_parentcnid != to_cp->c_parentcnid)) { from_cp->c_flags &= ~UF_NODUMP; - from_cp->c_flag |= C_CHANGE; + from_cp->c_touch_chgtime = TRUE; } if ((to_cp->c_flags & UF_NODUMP) && (to_cp->c_parentcnid != from_cp->c_parentcnid)) { to_cp->c_flags &= ~UF_NODUMP; - to_cp->c_flag |= C_CHANGE; + to_cp->c_touch_chgtime = TRUE; } HFS_KNOTE(from_vp, NOTE_ATTRIB); HFS_KNOTE(to_vp, NOTE_ATTRIB); -Err_Exit: - cat_postflight(hfsmp, &cookie, ap->a_p); - - // XXXdbg - if (started_tr) { - journal_end_transaction(hfsmp->jnl); +exit: + if (got_cookie) { + cat_postflight(hfsmp, &cookie, vfs_context_proc(ap->a_context)); } - if (grabbed_lock) { - hfs_global_shared_lock_release(hfsmp); + if (started_tr) { + hfs_end_transaction(hfsmp); } + hfs_unlockpair(from_cp, to_cp); return (error); } /* - -#% fsync vp L L L -# - vop_fsync { - IN struct vnode *vp; - IN struct ucred *cred; - IN int waitfor; - IN struct proc *p; - - */ -static int -hfs_fsync(ap) - struct vop_fsync_args /* { - struct vnode *a_vp; - struct ucred *a_cred; - int a_waitfor; - struct proc *a_p; - } */ *ap; + * cnode must be locked + */ +__private_extern__ +int +hfs_fsync(struct vnode *vp, int waitfor, int fullsync, struct proc *p) { - struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct filefork *fp = NULL; int retval = 0; - register struct buf *bp; + struct hfsmount *hfsmp = VTOHFS(vp); struct timeval tv; - struct buf *nbp; - struct hfsmount *hfsmp = VTOHFS(ap->a_vp); - int s; int wait; - int retry = 0; + int lockflag; + int took_trunc_lock = 0; - wait = (ap->a_waitfor == MNT_WAIT); + wait = (waitfor == MNT_WAIT); /* HFS directories don't have any data blocks. 
 	 */
-	if (vp->v_type == VDIR)
+	if (vnode_isdir(vp))
 		goto metasync;
 
 	/*
 	 * For system files flush the B-tree header and
 	 * for regular files write out any clusters
 	 */
-	if (vp->v_flag & VSYSTEM) {
+	if (vnode_issystem(vp)) {
 	    if (VTOF(vp)->fcbBTCBPtr != NULL) {
 			// XXXdbg
 			if (hfsmp->jnl == NULL) {
 				BTFlushPath(VTOF(vp));
 			}
 	    }
-	} else if (UBCINFOEXISTS(vp))
-		(void) cluster_push(vp);
+	} else if (UBCINFOEXISTS(vp)) {
+		hfs_unlock(cp);
+		hfs_lock_truncate(cp, TRUE);
+		took_trunc_lock = 1;
+		/* Don't hold cnode lock when calling into cluster layer. */
+		(void) cluster_push(vp, 0);
+
+		hfs_lock(cp, HFS_FORCE_LOCK);
+	}
 	/*
 	 * When MNT_WAIT is requested and the zero fill timeout
 	 * has expired then we must explicitly zero out any areas
@@ -1237,17 +1085,18 @@ hfs_fsync(ap)
 	    ((cp->c_flags & UF_NODUMP) == 0) &&
 	    UBCINFOEXISTS(vp) && (fp = VTOF(vp)) &&
 	    cp->c_zftimeout != 0) {
-		int devblksize;
-		int was_nocache;
-
-		if (time.tv_sec < cp->c_zftimeout) {
+		microuptime(&tv);
+		if (tv.tv_sec < cp->c_zftimeout) {
 			/* Remember that a force sync was requested. */
 			cp->c_flag |= C_ZFWANTSYNC;
-			goto loop;
-		}
-		VOP_DEVBLOCKSIZE(cp->c_devvp, &devblksize);
-		was_nocache = ISSET(vp->v_flag, VNOCACHE_DATA);
-		SET(vp->v_flag, VNOCACHE_DATA);	/* Don't cache zeros */
+			goto datasync;
+		}
+		if (!took_trunc_lock) {
+			hfs_unlock(cp);
+			hfs_lock_truncate(cp, TRUE);
+			hfs_lock(cp, HFS_FORCE_LOCK);
+			took_trunc_lock = 1;
+		}
 
 		while (!CIRCLEQ_EMPTY(&fp->ff_invalidranges)) {
 			struct rl_entry *invalid_range = CIRCLEQ_FIRST(&fp->ff_invalidranges);
@@ -1255,125 +1104,78 @@ hfs_fsync(ap)
 			off_t end = invalid_range->rl_end;
 
 			/* The range about to be written must be validated
-			 * first, so that VOP_CMAP() will return the
+			 * first, so that VNOP_BLOCKMAP() will return the
			 * appropriate mapping for the cluster code:
			 */
 			rl_remove(start, end, &fp->ff_invalidranges);
 
+			/* Don't hold cnode lock when calling into cluster layer. */
+			hfs_unlock(cp);
 			(void) cluster_write(vp, (struct uio *) 0,
-					fp->ff_size,
-					invalid_range->rl_end + 1,
-					invalid_range->rl_start,
-					(off_t)0, devblksize,
-					IO_HEADZEROFILL | IO_NOZERODIRTY);
+					fp->ff_size, end + 1, start, (off_t)0,
+					IO_HEADZEROFILL | IO_NOZERODIRTY | IO_NOCACHE);
+			hfs_lock(cp, HFS_FORCE_LOCK);
 			cp->c_flag |= C_MODIFIED;
 		}
-		(void) cluster_push(vp);
-		if (!was_nocache)
-			CLR(vp->v_flag, VNOCACHE_DATA);
+		hfs_unlock(cp);
+		(void) cluster_push(vp, 0);
+		hfs_lock(cp, HFS_FORCE_LOCK);
+
 		cp->c_flag &= ~C_ZFWANTSYNC;
 		cp->c_zftimeout = 0;
 	}
+datasync:
+	if (took_trunc_lock)
+		hfs_unlock_truncate(cp);
+
+	/*
+	 * if we have a journal and if journal_active() returns != 0 then
+	 * we shouldn't do anything to a locked block (because it is part
+	 * of a transaction).  otherwise we'll just go through the normal
+	 * code path and flush the buffer.  note journal_active() can return
+	 * -1 if the journal is invalid -- however we still need to skip any
+	 * locked blocks as they get cleaned up when we finish the transaction
+	 * or close the journal.
+	 */
+	// if (hfsmp->jnl && journal_active(hfsmp->jnl) >= 0)
+	if (hfsmp->jnl)
+		lockflag = BUF_SKIP_LOCKED;
+	else
+		lockflag = 0;
 
 	/*
 	 * Flush all dirty buffers associated with a vnode.
*/ -loop: - s = splbio(); - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - if ((bp->b_flags & B_BUSY)) - continue; - if ((bp->b_flags & B_DELWRI) == 0) - panic("hfs_fsync: bp 0x% not dirty (hfsmp 0x%x)", bp, hfsmp); - // XXXdbg - if (hfsmp->jnl && (bp->b_flags & B_LOCKED)) { - if ((bp->b_flags & B_META) == 0) { - panic("hfs: bp @ 0x%x is locked but not meta! jnl 0x%x\n", - bp, hfsmp->jnl); - } - // if journal_active() returns >= 0 then the journal is ok and we - // shouldn't do anything to this locked block (because it is part - // of a transaction). otherwise we'll just go through the normal - // code path and flush the buffer. - if (journal_active(hfsmp->jnl) >= 0) { - continue; - } - } - - bremfree(bp); - bp->b_flags |= B_BUSY; - /* Clear B_LOCKED, should only be set on meta files */ - bp->b_flags &= ~B_LOCKED; - - splx(s); - /* - * Wait for I/O associated with indirect blocks to complete, - * since there is no way to quickly wait for them below. - */ - if (bp->b_vp == vp || ap->a_waitfor == MNT_NOWAIT) - (void) bawrite(bp); - else - (void) VOP_BWRITE(bp); - goto loop; - } - - if (wait) { - while (vp->v_numoutput) { - vp->v_flag |= VBWAIT; - tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "hfs_fsync", 0); - } - - // XXXdbg -- is checking for hfsmp->jnl == NULL the right - // thing to do? - if (hfsmp->jnl == NULL && vp->v_dirtyblkhd.lh_first) { - /* still have some dirty buffers */ - if (retry++ > 10) { - vprint("hfs_fsync: dirty", vp); - splx(s); - /* - * Looks like the requests are not - * getting queued to the driver. - * Retrying here causes a cpu bound loop. - * Yield to the other threads and hope - * for the best. - */ - (void)tsleep((caddr_t)&vp->v_numoutput, - PRIBIO + 1, "hfs_fsync", hz/10); - retry = 0; - } else { - splx(s); - } - /* try again */ - goto loop; - } - } - splx(s); + buf_flushdirtyblks(vp, wait, lockflag, "hfs_fsync"); metasync: - tv = time; - if (vp->v_flag & VSYSTEM) { - if (VTOF(vp)->fcbBTCBPtr != NULL) + if (vnode_isreg(vp) && vnode_issystem(vp)) { + if (VTOF(vp)->fcbBTCBPtr != NULL) { + microuptime(&tv); BTSetLastSync(VTOF(vp), tv.tv_sec); - cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE); + } + cp->c_touch_acctime = FALSE; + cp->c_touch_chgtime = FALSE; + cp->c_touch_modtime = FALSE; } else /* User file */ { - retval = VOP_UPDATE(ap->a_vp, &tv, &tv, wait); + retval = hfs_update(vp, wait); /* When MNT_WAIT is requested push out any delayed meta data */ if ((retval == 0) && wait && cp->c_hint && !ISSET(cp->c_flag, C_DELETED | C_NOEXISTS)) { - hfs_metasync(VTOHFS(vp), cp->c_hint, ap->a_p); + hfs_metasync(VTOHFS(vp), (daddr64_t)cp->c_hint, p); } // make sure that we've really been called from the user // fsync() and if so push out any pending transactions // that this file might is a part of (and get them on // stable storage). - if (vp->v_flag & VFULLFSYNC) { + if (fullsync) { if (hfsmp->jnl) { journal_flush(hfsmp->jnl); } else { - VOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NOCRED, ap->a_p); + /* XXX need to pass context! 
*/ + VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL); } } } @@ -1381,14 +1183,14 @@ metasync: return (retval); } + /* Sync an hfs catalog b-tree node */ static int -hfs_metasync(struct hfsmount *hfsmp, daddr_t node, struct proc *p) +hfs_metasync(struct hfsmount *hfsmp, daddr64_t node, struct proc *p) { - struct vnode *vp; - struct buf *bp; - struct buf *nbp; - int s; + vnode_t vp; + buf_t bp; + int lockflags; vp = HFSTOVCB(hfsmp)->catalogRefNum; @@ -1397,168 +1199,143 @@ hfs_metasync(struct hfsmount *hfsmp, daddr_t node, struct proc *p) return 0; } - if (hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p) != 0) - return (0); - + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); /* * Look for a matching node that has been delayed * but is not part of a set (B_LOCKED). + * + * BLK_ONLYVALID causes buf_getblk to return a + * buf_t for the daddr64_t specified only if it's + * currently resident in the cache... the size + * parameter to buf_getblk is ignored when this flag + * is set */ - s = splbio(); - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - if (bp->b_flags & B_BUSY) - continue; - if (bp->b_lblkno == node) { - if (bp->b_flags & B_LOCKED) - break; - - bremfree(bp); - bp->b_flags |= B_BUSY; - splx(s); - (void) VOP_BWRITE(bp); - goto exit; - } + bp = buf_getblk(vp, node, 0, 0, 0, BLK_META | BLK_ONLYVALID); + + if (bp) { + if ((buf_flags(bp) & (B_LOCKED | B_DELWRI)) == B_DELWRI) + (void) VNOP_BWRITE(bp); + else + buf_brelse(bp); } - splx(s); -exit: - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + + hfs_systemfile_unlock(hfsmp, lockflags); return (0); } + +/*ARGSUSED 1*/ +static int +hfs_btsync_callback(struct buf *bp, void *dummy) +{ + buf_clearflags(bp, B_LOCKED); + (void) buf_bawrite(bp); + + return(BUF_CLAIMED); +} + + __private_extern__ int hfs_btsync(struct vnode *vp, int sync_transaction) { struct cnode *cp = VTOC(vp); - register struct buf *bp; struct timeval tv; - struct buf *nbp; - struct hfsmount *hfsmp = VTOHFS(vp); - int s; + int flags = 0; + if (sync_transaction) + flags |= BUF_SKIP_NONLOCKED; /* * Flush all dirty buffers associated with b-tree. */ -loop: - s = splbio(); - - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - if ((bp->b_flags & B_BUSY)) - continue; - if ((bp->b_flags & B_DELWRI) == 0) - panic("hfs_btsync: not dirty (bp 0x%x hfsmp 0x%x)", bp, hfsmp); - - // XXXdbg - if (hfsmp->jnl && (bp->b_flags & B_LOCKED)) { - if ((bp->b_flags & B_META) == 0) { - panic("hfs: bp @ 0x%x is locked but not meta! jnl 0x%x\n", - bp, hfsmp->jnl); - } - // if journal_active() returns >= 0 then the journal is ok and we - // shouldn't do anything to this locked block (because it is part - // of a transaction). otherwise we'll just go through the normal - // code path and flush the buffer. 
- if (journal_active(hfsmp->jnl) >= 0) { - continue; - } - } - - if (sync_transaction && !(bp->b_flags & B_LOCKED)) - continue; - - bremfree(bp); - bp->b_flags |= B_BUSY; - bp->b_flags &= ~B_LOCKED; + buf_iterate(vp, hfs_btsync_callback, flags, 0); - splx(s); - - (void) bawrite(bp); - - goto loop; - } - splx(s); - - tv = time; - if ((vp->v_flag & VSYSTEM) && (VTOF(vp)->fcbBTCBPtr != NULL)) + microuptime(&tv); + if (vnode_issystem(vp) && (VTOF(vp)->fcbBTCBPtr != NULL)) (void) BTSetLastSync(VTOF(vp), tv.tv_sec); - cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE); + cp->c_touch_acctime = FALSE; + cp->c_touch_chgtime = FALSE; + cp->c_touch_modtime = FALSE; return 0; } /* - * Rmdir system call. -#% rmdir dvp L U U -#% rmdir vp L U U -# - vop_rmdir { - IN WILLRELE struct vnode *dvp; - IN WILLRELE struct vnode *vp; - IN struct componentname *cnp; - + * Remove a directory. */ static int -hfs_rmdir(ap) - struct vop_rmdir_args /* { +hfs_vnop_rmdir(ap) + struct vnop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; + vfs_context_t a_context; } */ *ap; { - return (hfs_removedir(ap->a_dvp, ap->a_vp, ap->a_cnp, 0)); + struct vnode *dvp = ap->a_dvp; + struct vnode *vp = ap->a_vp; + int error; + + if (!vnode_isdir(vp)) { + return (ENOTDIR); + } + if (dvp == vp) { + return (EINVAL); + } + if ((error = hfs_lockpair(VTOC(dvp), VTOC(vp), HFS_EXCLUSIVE_LOCK))) + return (error); + + error = hfs_removedir(dvp, vp, ap->a_cnp, 0); + + hfs_unlockpair(VTOC(dvp), VTOC(vp)); + + return (error); } /* - * hfs_removedir + * Remove a directory + * + * Both dvp and vp cnodes are locked */ static int -hfs_removedir(dvp, vp, cnp, options) - struct vnode *dvp; - struct vnode *vp; - struct componentname *cnp; - int options; +hfs_removedir(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, + int skip_reserve) { - struct proc *p = cnp->cn_proc; + vfs_context_t ctx = cnp->cn_context; + struct proc *p = vfs_context_proc(ctx); struct cnode *cp; struct cnode *dcp; struct hfsmount * hfsmp; - struct timeval tv; - cat_cookie_t cookie = {0}; - int error = 0, started_tr = 0, grabbed_lock = 0; + struct cat_desc desc; + cat_cookie_t cookie; + int lockflags; + int error = 0, started_tr = 0, got_cookie = 0; cp = VTOC(vp); dcp = VTOC(dvp); hfsmp = VTOHFS(vp); - if (dcp == cp) { - vrele(dvp); - vput(vp); + if (dcp == cp) return (EINVAL); /* cannot remove "." */ - } #if QUOTA (void)hfs_getinoquota(cp); #endif - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - grabbed_lock = 1; - if (hfsmp->jnl) { - if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { - goto out; - } - started_tr = 1; + if ((error = hfs_start_transaction(hfsmp)) != 0) { + goto out; } + started_tr = 1; - if (!(options & HFSRM_SKIP_RESERVE)) { + if (!skip_reserve) { /* * Reserve some space in the Catalog file. */ + bzero(&cookie, sizeof(cookie)); if ((error = cat_preflight(hfsmp, CAT_DELETE, &cookie, p))) { goto out; } + got_cookie = 1; } /* @@ -1577,21 +1354,34 @@ hfs_removedir(dvp, vp, cnp, options) goto out; } + if (cp->c_entries > 0) + panic("hfs_rmdir: attempting to delete a non-empty directory!"); + /* Remove the entry from the namei cache: */ cache_purge(vp); - /* Lock catalog b-tree */ - error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); - if (error) goto out; + /* + * Protect against a race with rename by using the component + * name passed in and parent id from dvp (instead of using + * the cp->c_desc which may have changed). 
+ */ + bzero(&desc, sizeof(desc)); + desc.cd_nameptr = cnp->cn_nameptr; + desc.cd_namelen = cnp->cn_namelen; + desc.cd_parentcnid = dcp->c_cnid; + desc.cd_cnid = cp->c_cnid; - if (cp->c_entries > 0) - panic("hfs_rmdir: attempting to delete a non-empty directory!"); /* Remove entry from catalog */ - error = cat_delete(hfsmp, &cp->c_desc, &cp->c_attr); + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); + error = cat_delete(hfsmp, &desc, &cp->c_attr); + if (error == 0) { + /* Delete any attributes, ignore errors */ + (void) hfs_removeallattr(hfsmp, cp->c_fileid); + } + hfs_systemfile_unlock(hfsmp, lockflags); - /* Unlock catalog b-tree */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); - if (error) goto out; + if (error) + goto out; #if QUOTA (void)hfs_chkiq(cp, -1, NOCRED, 0); @@ -1602,9 +1392,12 @@ hfs_removedir(dvp, vp, cnp, options) dcp->c_entries--; if (dcp->c_nlink > 0) dcp->c_nlink--; - dcp->c_flag |= C_CHANGE | C_UPDATE; - tv = time; - (void) VOP_UPDATE(dvp, &tv, &tv, 0); + dcp->c_touch_chgtime = TRUE; + dcp->c_touch_modtime = TRUE; + + dcp->c_flag |= C_FORCEUPDATE; // XXXdbg - don't screw around, force this guy out + + (void) hfs_update(dvp, 0); HFS_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); hfs_volupdate(hfsmp, VOL_RMDIR, (dcp->c_cnid == kHFSRootFolderID)); @@ -1612,106 +1405,129 @@ hfs_removedir(dvp, vp, cnp, options) cp->c_mode = 0; /* Makes the vnode go away...see inactive */ cp->c_flag |= C_NOEXISTS; out: - if (!(options & HFSRM_PARENT_LOCKED)) { - vput(dvp); - } HFS_KNOTE(vp, NOTE_DELETE); - vput(vp); - if (!(options & HFSRM_SKIP_RESERVE)) { + if (got_cookie) { cat_postflight(hfsmp, &cookie, p); } - // XXXdbg if (started_tr) { - journal_end_transaction(hfsmp->jnl); - } - if (grabbed_lock) { - hfs_global_shared_lock_release(hfsmp); + hfs_end_transaction(hfsmp); } return (error); } -/* - -#% remove dvp L U U -#% remove vp L U U -# - vop_remove { - IN WILLRELE struct vnode *dvp; - IN WILLRELE struct vnode *vp; - IN struct componentname *cnp; - - */ +/* + * Remove a file or link. + */ static int -hfs_remove(ap) - struct vop_remove_args /* { +hfs_vnop_remove(ap) + struct vnop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; + int a_flags; + vfs_context_t a_context; } */ *ap; { - return (hfs_removefile(ap->a_dvp, ap->a_vp, ap->a_cnp, 0)); + struct vnode *dvp = ap->a_dvp; + struct vnode *vp = ap->a_vp; + int error; + + if (dvp == vp) { + return (EINVAL); + } + + hfs_lock_truncate(VTOC(vp), TRUE); + + if ((error = hfs_lockpair(VTOC(dvp), VTOC(vp), HFS_EXCLUSIVE_LOCK))) + goto out; + + error = hfs_removefile(dvp, vp, ap->a_cnp, ap->a_flags, 0); + + hfs_unlockpair(VTOC(dvp), VTOC(vp)); +out: + hfs_unlock_truncate(VTOC(vp)); + return (error); } +static int +hfs_removefile_callback(struct buf *bp, void *hfsmp) { + + if ( !(buf_flags(bp) & B_META)) + panic("hfs: symlink bp @ 0x%x is not marked meta-data!\n", bp); + /* + * it's part of the current transaction, kill it. + */ + journal_kill_block(((struct hfsmount *)hfsmp)->jnl, bp); + + return (BUF_CLAIMED); +} /* * hfs_removefile * - * Similar to hfs_remove except there are additional options. + * Similar to hfs_vnop_remove except there are additional options. + * + * Requires cnode and truncate locks to be held. 
+ */
 static int
-hfs_removefile(dvp, vp, cnp, options)
-	struct vnode *dvp;
-	struct vnode *vp;
-	struct componentname *cnp;
-	int options;
+hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
+	int flags, int skip_reserve)
 {
 	struct vnode *rvp = NULL;
 	struct cnode *cp;
 	struct cnode *dcp;
 	struct hfsmount *hfsmp;
-	struct proc *p = cnp->cn_proc;
+	struct cat_desc desc;
+	struct timeval tv;
+	vfs_context_t ctx = cnp->cn_context;
 	int dataforkbusy = 0;
 	int rsrcforkbusy = 0;
 	int truncated = 0;
-	struct timeval tv;
-	cat_cookie_t cookie = {0};
+	cat_cookie_t cookie;
+	int lockflags;
 	int error = 0;
-	int started_tr = 0, grabbed_lock = 0;
-	int refcount, isbigfile = 0;
+	int started_tr = 0, got_cookie = 0;
+	int isbigfile = 0;
+	cnid_t real_cnid = 0;
 
 	/* Directories should call hfs_rmdir! */
-	if (vp->v_type == VDIR) {
-		error = EISDIR;
-		goto out;
+	if (vnode_isdir(vp)) {
+		return (EISDIR);
 	}
 
 	cp = VTOC(vp);
 	dcp = VTOC(dvp);
 	hfsmp = VTOHFS(vp);
+
+	if (cp->c_flag & (C_NOEXISTS | C_DELETED)) {
+		return 0;
	}
 
-	if (cp->c_parentcnid != dcp->c_cnid) {
+	// if it's not a hardlink, check that the parent
+	// cnid is the same as the directory cnid
+	if (   (cp->c_flag & C_HARDLINK) == 0
+	    && (cp->c_parentcnid != hfsmp->hfs_privdir_desc.cd_cnid)
+	    && (cp->c_parentcnid != dcp->c_cnid)) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/* Make sure a remove is permitted */
-	if ((cp->c_flags & (IMMUTABLE | APPEND)) ||
-	    (VTOC(dvp)->c_flags & APPEND) ||
-	    VNODE_IS_RSRC(vp)) {
+	if (VNODE_IS_RSRC(vp)) {
 		error = EPERM;
 		goto out;
 	}
 
 	/*
 	 * Aquire a vnode for a non-empty resource fork.
-	 * (needed for VOP_TRUNCATE)
+	 * (needed for hfs_truncate)
 	 */
 	if (cp->c_blocks - VTOF(vp)->ff_blocks) {
-		error = hfs_vgetrsrc(hfsmp, vp, &rvp, p);
+		error = hfs_vgetrsrc(hfsmp, vp, &rvp, 0);
 		if (error)
 			goto out;
 	}
@@ -1729,19 +1545,10 @@ hfs_removefile(dvp, vp, cnp, options)
 
 	/*
 	 * Check if this file is being used.
-	 *
-	 * The namei done for the remove took a reference on the
-	 * vnode (vp).  And we took a ref on the resource vnode (rvp).
-	 * Hence set 1 in the tookref parameter of ubc_isinuse().
 	 */
-	if (VTOC(vp)->c_flag & C_VPREFHELD) {
-	    refcount = 2;
-	} else {
-	    refcount = 1;
-	}
-	if (UBCISVALID(vp) && ubc_isinuse(vp, refcount))
+	if (vnode_isinuse(vp, 0))
 		dataforkbusy = 1;
-	if (rvp && UBCISVALID(rvp) && ubc_isinuse(rvp, 1))
+	if (rvp && vnode_isinuse(rvp, 0))
 		rsrcforkbusy = 1;
 
 	// need this to check if we have to break the deletion
@@ -1750,96 +1557,134 @@ hfs_removefile(dvp, vp, cnp, options)
 
 	/*
 	 * Carbon semantics prohibit deleting busy files.
-	 * (enforced when NODELETEBUSY is requested)
+	 * (enforced when VNODE_REMOVE_NODELETEBUSY is requested)
	 */
-	if ((dataforkbusy || rsrcforkbusy) &&
-	    ((cnp->cn_flags & NODELETEBUSY) ||
-	     (hfsmp->hfs_privdir_desc.cd_cnid == 0))) {
-		error = EBUSY;
-		goto out;
+	if (dataforkbusy || rsrcforkbusy) {
+		if ((flags & VNODE_REMOVE_NODELETEBUSY) ||
+		    (hfsmp->hfs_privdir_desc.cd_cnid == 0)) {
+			error = EBUSY;
+			goto out;
+		}
 	}
 
 #if QUOTA
 	(void)hfs_getinoquota(cp);
 #endif /* QUOTA */
 
-	// XXXdbg
-	hfs_global_shared_lock_acquire(hfsmp);
-	grabbed_lock = 1;
-	if (hfsmp->jnl) {
-	    if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
-			goto out;
+	/*
+	 * We do the ubc_setsize before the hfs_truncate
+	 * since we'll be inside a transaction.
+	 */
+	if ((cp->c_flag & C_HARDLINK) == 0 &&
+	    (!dataforkbusy || !rsrcforkbusy)) {
+		/*
+		 * A ubc_setsize can cause a pagein here
+		 * so we need to drop the cnode lock. Note
+		 * that we still hold the truncate lock.
+ */ + hfs_unlock(cp); + if (!dataforkbusy && cp->c_datafork->ff_blocks && !isbigfile) { + ubc_setsize(vp, 0); + } + if (!rsrcforkbusy && rvp) { + ubc_setsize(rvp, 0); + } + hfs_lock(cp, HFS_FORCE_LOCK); + } else { + struct cat_desc cndesc; + + // for hard links, re-lookup the name that was passed + // in so we get the correct cnid for the name (as + // opposed to the c_cnid in the cnode which could have + // been changed before this node got locked). + bzero(&cndesc, sizeof(cndesc)); + cndesc.cd_nameptr = cnp->cn_nameptr; + cndesc.cd_namelen = cnp->cn_namelen; + cndesc.cd_parentcnid = VTOC(dvp)->c_cnid; + cndesc.cd_hint = VTOC(dvp)->c_childhint; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + if (cat_lookup(hfsmp, &cndesc, 0, NULL, NULL, NULL, &real_cnid) != 0) { + hfs_systemfile_unlock(hfsmp, lockflags); + error = ENOENT; + goto out; } - started_tr = 1; + + hfs_systemfile_unlock(hfsmp, lockflags); + } + + if ((error = hfs_start_transaction(hfsmp)) != 0) { + goto out; } + started_tr = 1; - if (!(options & HFSRM_SKIP_RESERVE)) { + if (!skip_reserve) { /* * Reserve some space in the Catalog file. */ - if ((error = cat_preflight(hfsmp, CAT_DELETE, &cookie, p))) { + if ((error = cat_preflight(hfsmp, CAT_DELETE, &cookie, 0))) { goto out; } + got_cookie = 1; } /* Remove our entry from the namei cache. */ cache_purge(vp); // XXXdbg - if we're journaled, kill any dirty symlink buffers - if (hfsmp->jnl && vp->v_type == VLNK && vp->v_dirtyblkhd.lh_first) { - struct buf *bp, *nbp; - - recheck: - for (bp=vp->v_dirtyblkhd.lh_first; bp; bp=nbp) { - nbp = bp->b_vnbufs.le_next; - - if ((bp->b_flags & B_BUSY)) { - // if it was busy, someone else must be dealing - // with it so just move on. - continue; - } - - if (!(bp->b_flags & B_META)) { - panic("hfs: symlink bp @ 0x%x is not marked meta-data!\n", bp); - } - - // if it's part of the current transaction, kill it. - if (bp->b_flags & B_LOCKED) { - bremfree(bp); - bp->b_flags |= B_BUSY; - journal_kill_block(hfsmp->jnl, bp); - goto recheck; - } - } - } - // XXXdbg + if (hfsmp->jnl && vnode_islnk(vp)) + buf_iterate(vp, hfs_removefile_callback, BUF_SKIP_NONLOCKED, (void *)hfsmp); /* * Truncate any non-busy forks. Busy forks will * get trucated when their vnode goes inactive. * + * Since we're already inside a transaction, + * tell hfs_truncate to skip the ubc_setsize. + * * (Note: hard links are truncated in VOP_INACTIVE) */ if ((cp->c_flag & C_HARDLINK) == 0) { int mode = cp->c_mode; if (!dataforkbusy && !isbigfile && cp->c_datafork->ff_blocks != 0) { - cp->c_mode = 0; /* Suppress VOP_UPDATES */ - error = VOP_TRUNCATE(vp, (off_t)0, IO_NDELAY, NOCRED, p); + cp->c_mode = 0; /* Suppress hfs_update */ + error = hfs_truncate(vp, (off_t)0, IO_NDELAY, 1, ctx); cp->c_mode = mode; if (error) goto out; truncated = 1; } if (!rsrcforkbusy && rvp) { - cp->c_mode = 0; /* Suppress VOP_UPDATES */ - error = VOP_TRUNCATE(rvp, (off_t)0, IO_NDELAY, NOCRED, p); + cp->c_mode = 0; /* Suppress hfs_update */ + error = hfs_truncate(rvp, (off_t)0, IO_NDELAY, 1, ctx); cp->c_mode = mode; if (error) goto out; truncated = 1; } } + + /* + * Protect against a race with rename by using the component + * name passed in and parent id from dvp (instead of using + * the cp->c_desc which may have changed). 
+	 */
+	desc.cd_flags = 0;
+	desc.cd_encoding = cp->c_desc.cd_encoding;
+	desc.cd_nameptr = cnp->cn_nameptr;
+	desc.cd_namelen = cnp->cn_namelen;
+	desc.cd_parentcnid = dcp->c_cnid;
+	desc.cd_hint = cp->c_desc.cd_hint;
+	if (real_cnid) {
+		// if it was a hardlink we had to re-lookup the cnid
+		desc.cd_cnid = real_cnid;
+	} else {
+		desc.cd_cnid = cp->c_cnid;
+	}
+	microtime(&tv);
+
 	/*
 	 * There are 3 remove cases to consider:
 	 *   1. File is a hardlink    ==> remove the link
 	 *   2. File is busy (in use) ==> move/rename the file
 	 *   3. File is not in use    ==> remove the file
	 */
 
 	if (cp->c_flag & C_HARDLINK) {
-		struct cat_desc desc;
-
-		if ((cnp->cn_flags & HASBUF) == 0 ||
-		    cnp->cn_nameptr[0] == '\0') {
-			error = ENOENT;	/* name missing! */
-			goto out;
-		}
-
-		/* Setup a descriptor for the link */
-		bzero(&desc, sizeof(desc));
-		desc.cd_nameptr = cnp->cn_nameptr;
-		desc.cd_namelen = cnp->cn_namelen;
-		desc.cd_parentcnid = dcp->c_cnid;
-		/* XXX - if cnid is out of sync then the wrong thread rec will get deleted. */
-		desc.cd_cnid = cp->c_cnid;
-
-		/* Lock catalog b-tree */
-		error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
-		if (error)
-			goto out;
+		lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
 
 		/* Delete the link record */
 		error = cat_delete(hfsmp, &desc, &cp->c_attr);
+		if (error == 0) {
+			/* Update the parent directory */
+			if (dcp->c_entries > 0)
+				dcp->c_entries--;
+			if (dcp->c_nlink > 0)
+				dcp->c_nlink--;
+			dcp->c_ctime = tv.tv_sec;
+			dcp->c_mtime = tv.tv_sec;
+			(void ) cat_update(hfsmp, &dcp->c_desc, &dcp->c_attr, NULL, NULL);
+
+			if (--cp->c_nlink < 1) {
+				char inodename[32];
+				char delname[32];
+				struct cat_desc to_desc;
+				struct cat_desc from_desc;
-		if ((error == 0) && (--cp->c_nlink < 1)) {
-			char inodename[32];
-			char delname[32];
-			struct cat_desc to_desc;
-			struct cat_desc from_desc;
-
-			/*
-			 * This is now esentially an open deleted file.
-			 * Rename it to reflect this state which makes
-			 * orphan file cleanup easier (see hfs_remove_orphans).
-			 * Note: a rename failure here is not fatal.
-			 */
-			MAKE_INODE_NAME(inodename, cp->c_rdev);
-			bzero(&from_desc, sizeof(from_desc));
-			from_desc.cd_nameptr = inodename;
-			from_desc.cd_namelen = strlen(inodename);
-			from_desc.cd_parentcnid = hfsmp->hfs_privdir_desc.cd_cnid;
-			from_desc.cd_flags = 0;
-			from_desc.cd_cnid = cp->c_fileid;
-
-			MAKE_DELETED_NAME(delname, cp->c_fileid);
-			bzero(&to_desc, sizeof(to_desc));
-			to_desc.cd_nameptr = delname;
-			to_desc.cd_namelen = strlen(delname);
-			to_desc.cd_parentcnid = hfsmp->hfs_privdir_desc.cd_cnid;
-			to_desc.cd_flags = 0;
-			to_desc.cd_cnid = cp->c_fileid;
+				/*
+				 * This is now essentially an open deleted file.
+				 * Rename it to reflect this state which makes
+				 * orphan file cleanup easier (see hfs_remove_orphans).
+				 * Note: a rename failure here is not fatal.
+				 */
+				MAKE_INODE_NAME(inodename, cp->c_rdev);
+				bzero(&from_desc, sizeof(from_desc));
+				from_desc.cd_nameptr = inodename;
+				from_desc.cd_namelen = strlen(inodename);
+				from_desc.cd_parentcnid = hfsmp->hfs_privdir_desc.cd_cnid;
+				from_desc.cd_flags = 0;
+				from_desc.cd_cnid = cp->c_fileid;
+
+				MAKE_DELETED_NAME(delname, cp->c_fileid);
+				bzero(&to_desc, sizeof(to_desc));
+				to_desc.cd_nameptr = delname;
+				to_desc.cd_namelen = strlen(delname);
+				to_desc.cd_parentcnid = hfsmp->hfs_privdir_desc.cd_cnid;
+				to_desc.cd_flags = 0;
+				to_desc.cd_cnid = cp->c_fileid;
 
-			(void) cat_rename(hfsmp, &from_desc, &hfsmp->hfs_privdir_desc,
-			                  &to_desc, (struct cat_desc *)NULL);
-			cp->c_flag |= C_DELETED;
+				error = cat_rename(hfsmp, &from_desc, &hfsmp->hfs_privdir_desc,
+				                   &to_desc, (struct cat_desc *)NULL);
+				if (error != 0) {
+					panic("hfs_removefile: error %d from cat_rename(%s %s) cp 0x%x\n",
+						error, inodename, delname, cp);
+				}
+				if (error == 0) {
+					/* Update the file's state */
+					cp->c_flag |= C_DELETED;
+					cp->c_ctime = tv.tv_sec;
+					(void) cat_update(hfsmp, &to_desc, &cp->c_attr, NULL, NULL);
+				}
+			} else {
+				/* Update the file's state */
+				cp->c_ctime = tv.tv_sec;
+				(void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL);
+			}
 		}
-
-		/* Unlock the Catalog */
-		(void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
-
+		hfs_systemfile_unlock(hfsmp, lockflags);
 		if (error != 0)
 			goto out;
 
-		cp->c_flag |= C_CHANGE;
-		tv = time;
-		(void) VOP_UPDATE(vp, &tv, &tv, 0);
-
 		hfs_volupdate(hfsmp, VOL_RMFILE, (dcp->c_cnid == kHFSRootFolderID));
 
 	} else if (dataforkbusy || rsrcforkbusy || isbigfile) {
@@ -1936,49 +1778,47 @@ hfs_removefile(dvp, vp, cnp, options)
 		to_desc.cd_flags = 0;
 		to_desc.cd_cnid = cp->c_cnid;
 
-		/* Lock catalog b-tree */
-		error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
-		if (error)
-			goto out;
+		lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
 
-		error = cat_rename(hfsmp, &cp->c_desc, &todir_desc,
+		error = cat_rename(hfsmp, &desc, &todir_desc,
 				&to_desc, (struct cat_desc *)NULL);
 
-		// XXXdbg - only bump this count if we were successful
 		if (error == 0) {
 			hfsmp->hfs_privdir_attr.ca_entries++;
+			(void) cat_update(hfsmp, &hfsmp->hfs_privdir_desc,
+			                  &hfsmp->hfs_privdir_attr, NULL, NULL);
+
+			/* Update the parent directory */
+			if (dcp->c_entries > 0)
+				dcp->c_entries--;
+			if (dcp->c_nlink > 0)
+				dcp->c_nlink--;
+			dcp->c_ctime = tv.tv_sec;
+			dcp->c_mtime = tv.tv_sec;
+			(void) cat_update(hfsmp, &dcp->c_desc, &dcp->c_attr, NULL, NULL);
+
+			/* Update the file's state */
+			cp->c_flag |= C_DELETED;
+			cp->c_ctime = tv.tv_sec;
+			--cp->c_nlink;
+			(void) cat_update(hfsmp, &to_desc, &cp->c_attr, NULL, NULL);
 		}
-		(void)cat_update(hfsmp, &hfsmp->hfs_privdir_desc,
-			&hfsmp->hfs_privdir_attr, NULL, NULL);
-
-		/* Unlock the Catalog */
-		(void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
-		if (error) goto out;
-
-		cp->c_flag |= C_CHANGE | C_DELETED | C_NOEXISTS;
-		--cp->c_nlink;
-		tv = time;
-		(void) VOP_UPDATE(vp, &tv, &tv, 0);
+		hfs_systemfile_unlock(hfsmp, lockflags);
+		if (error)
+			goto out;
 
 	} else /* Not busy */ {
 
 		if (cp->c_blocks > 0) {
-#if 0
-			panic("hfs_remove: attempting to delete a non-empty file!");
-#else
 			printf("hfs_remove: attempting to delete a non-empty file %s\n",
 				cp->c_desc.cd_nameptr);
 			error = EBUSY;
 			goto out;
-#endif
 		}
 
-		/* Lock catalog b-tree */
-		error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
-		if (error)
-			goto out;
+		lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE,
HFS_EXCLUSIVE_LOCK); - error = cat_delete(hfsmp, &cp->c_desc, &cp->c_attr); + error = cat_delete(hfsmp, &desc, &cp->c_attr); if (error && error != ENXIO && error != ENOENT && truncated) { if ((cp->c_datafork && cp->c_datafork->ff_size != 0) || @@ -1990,10 +1830,22 @@ hfs_removefile(dvp, vp, cnp, options) cp->c_desc.cd_nameptr, cp->c_attr.ca_fileid, error); } } - - /* Unlock the Catalog */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); - if (error) goto out; + if (error == 0) { + /* Delete any attributes, ignore errors */ + (void) hfs_removeallattr(hfsmp, cp->c_fileid); + + /* Update the parent directory */ + if (dcp->c_entries > 0) + dcp->c_entries--; + if (dcp->c_nlink > 0) + dcp->c_nlink--; + dcp->c_ctime = tv.tv_sec; + dcp->c_mtime = tv.tv_sec; + (void) cat_update(hfsmp, &dcp->c_desc, &dcp->c_attr, NULL, NULL); + } + hfs_systemfile_unlock(hfsmp, lockflags); + if (error) + goto out; #if QUOTA (void)hfs_chkiq(cp, -1, NOCRED, 0); @@ -2001,8 +1853,10 @@ hfs_removefile(dvp, vp, cnp, options) cp->c_mode = 0; truncated = 0; // because the catalog entry is gone - cp->c_flag |= C_CHANGE | C_NOEXISTS; + cp->c_flag |= C_NOEXISTS; + cp->c_touch_chgtime = TRUE; /* XXX needed ? */ --cp->c_nlink; + hfs_volupdate(hfsmp, VOL_RMFILE, (dcp->c_cnid == kHFSRootFolderID)); } @@ -2016,66 +1870,32 @@ hfs_removefile(dvp, vp, cnp, options) */ cat_releasedesc(&cp->c_desc); - /* In all three cases the parent lost a child */ - if (dcp->c_entries > 0) - dcp->c_entries--; - if (dcp->c_nlink > 0) - dcp->c_nlink--; - dcp->c_flag |= C_CHANGE | C_UPDATE; - tv = time; - (void) VOP_UPDATE(dvp, &tv, &tv, 0); HFS_KNOTE(dvp, NOTE_WRITE); out: - /* All done with component name... */ - if ((options & HFSRM_SAVE_NAME) == 0 && - (cnp != 0) && - (cnp->cn_flags & (HASBUF | SAVENAME)) == (HASBUF | SAVENAME)) { - char *tmp = cnp->cn_pnbuf; - cnp->cn_pnbuf = NULL; - cnp->cn_flags &= ~HASBUF; - FREE_ZONE(tmp, cnp->cn_pnlen, M_NAMEI); - } - - if (!(options & HFSRM_SKIP_RESERVE)) { - cat_postflight(hfsmp, &cookie, p); + if (got_cookie) { + cat_postflight(hfsmp, &cookie, 0); } /* Commit the truncation to the catalog record */ if (truncated) { - cp->c_flag |= C_CHANGE | C_UPDATE | C_FORCEUPDATE; - tv = time; - (void) VOP_UPDATE(vp, &tv, &tv, 0); + cp->c_flag |= C_FORCEUPDATE; + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + (void) hfs_update(vp, 0); } - // XXXdbg if (started_tr) { - journal_end_transaction(hfsmp->jnl); - } - if (grabbed_lock) { - hfs_global_shared_lock_release(hfsmp); + hfs_end_transaction(hfsmp); } HFS_KNOTE(vp, NOTE_DELETE); if (rvp) { HFS_KNOTE(rvp, NOTE_DELETE); - vrele(rvp); + /* Defer the vnode_put on rvp until the hfs_unlock(). */ + cp->c_flag |= C_NEED_RVNODE_PUT; }; - if (error) { - vput(vp); - } else { - VOP_UNLOCK(vp, 0, p); - // XXXdbg - try to prevent the lost ubc_info panic - if ((cp->c_flag & C_HARDLINK) == 0 || cp->c_nlink == 0) { - (void) ubc_uncache(vp); - } - vrele(vp); - } - if (!(options & HFSRM_PARENT_LOCKED)) { - vput(dvp); - } - return (error); } @@ -2090,7 +1910,7 @@ replace_desc(struct cnode *cp, struct cat_desc *cdp) cp->c_desc.cd_nameptr = 0; cp->c_desc.cd_namelen = 0; cp->c_desc.cd_flags &= ~CD_HASBUF; - remove_name(name); + vfs_removename(name); } bcopy(cdp, &cp->c_desc, sizeof(cp->c_desc)); @@ -2101,36 +1921,26 @@ replace_desc(struct cnode *cp, struct cat_desc *cdp) } -/* -# -#% rename fdvp U U U -#% rename fvp U U U -#% rename tdvp L U U -#% rename tvp X U U -# -*/ /* * Rename a cnode. 
* - * The VFS layer guarantees that source and destination will - * either both be directories, or both not be directories. - * - * When the target is a directory, hfs_rename must ensure - * that it is empty. + * The VFS layer guarantees that: + * - source and destination will either both be directories, or + * both not be directories. + * - all the vnodes are from the same file system * - * The rename system call is responsible for freeing - * the pathname buffers (ie no need to call VOP_ABORTOP). + * When the target is a directory, HFS must ensure that it is empty. */ - static int -hfs_rename(ap) - struct vop_rename_args /* { +hfs_vnop_rename(ap) + struct vnop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; + vfs_context_t a_context; } */ *ap; { struct vnode *tvp = ap->a_tvp; @@ -2139,70 +1949,48 @@ hfs_rename(ap) struct vnode *fdvp = ap->a_fdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; - struct proc *p = fcnp->cn_proc; - struct cnode *fcp = NULL; - struct cnode *fdcp = NULL; - struct cnode *tdcp = VTOC(tdvp); + struct proc *p = vfs_context_proc(ap->a_context); + struct cnode *fcp; + struct cnode *fdcp; + struct cnode *tdcp; + struct cnode *tcp; struct cat_desc from_desc; struct cat_desc to_desc; struct cat_desc out_desc; - struct hfsmount *hfsmp = NULL; - struct timeval tv; - cat_cookie_t cookie = {0}; - int fdvp_locked, fvp_locked, tdvp_locked, tvp_locked; - int tvp_deleted; - int started_tr = 0, grabbed_lock = 0; - int error = 0; - + struct hfsmount *hfsmp; + cat_cookie_t cookie; + int tvp_deleted = 0; + int started_tr = 0, got_cookie = 0; + int took_trunc_lock = 0; + int lockflags; + int error; - /* Establish our vnode lock state. */ - tdvp_locked = 1; - tvp_locked = (tvp != 0); - fdvp_locked = 0; - fvp_locked = 0; - tvp_deleted = 0; + /* When tvp exists, take the truncate lock for the hfs_removefile(). */ + if (tvp && vnode_isreg(tvp)) { + hfs_lock_truncate(VTOC(tvp), TRUE); + took_trunc_lock = 1; + } - /* - * Check for cross-device rename. - */ - if ((fvp->v_mount != tdvp->v_mount) || - (tvp && (fvp->v_mount != tvp->v_mount))) { - error = EXDEV; - goto out; + error = hfs_lockfour(VTOC(fdvp), VTOC(fvp), VTOC(tdvp), tvp ? VTOC(tvp) : NULL, + HFS_EXCLUSIVE_LOCK); + if (error) { + if (took_trunc_lock) + hfs_unlock_truncate(VTOC(tvp)); + return (error); } - /* - * When fvp matches tvp they must be case variants - * or hard links. - * - * In some cases tvp will be locked in other cases - * it be unlocked with no reference. Normalize the - * state here (unlocked with a reference) so that - * we can exit in a known state. - */ - if (fvp == tvp) { - if (VOP_ISLOCKED(tvp) && - (VTOC(tvp)->c_lock.lk_lockholder == p->p_pid) && - (VTOC(tvp)->c_lock.lk_lockthread == current_thread())) { - vput(tvp); - } - tvp = NULL; - tvp_locked = 0; + fdcp = VTOC(fdvp); + fcp = VTOC(fvp); + tdcp = VTOC(tdvp); + tcp = tvp ? VTOC(tvp) : NULL; + hfsmp = VTOHFS(tdvp); - /* - * If this a hard link with different parents - * and its not a case variant then keep tvp - * around for removal. - */ - if ((VTOC(fvp)->c_flag & C_HARDLINK) && - ((fdvp != tdvp) || - (hfs_namecmp(fcnp->cn_nameptr, fcnp->cn_namelen, - tcnp->cn_nameptr, tcnp->cn_namelen) != 0))) { - tvp = fvp; - vref(tvp); - } + /* Check for a race against unlink. 
*/ + if (fcp->c_flag & C_NOEXISTS) { + error = ENOENT; + goto out; } - + /* * The following edge case is caught here: * (to cannot be a descendent of from) @@ -2218,7 +2006,7 @@ hfs_rename(ap) * / * o tvp */ - if (tdcp->c_parentcnid == VTOC(fvp)->c_cnid) { + if (tdcp->c_parentcnid == fcp->c_cnid) { error = EINVAL; goto out; } @@ -2238,7 +2026,7 @@ hfs_rename(ap) * / * o fvp */ - if (tvp && (tvp->v_type == VDIR) && (VTOC(tvp)->c_entries != 0)) { + if (tvp && vnode_isdir(tvp) && (tcp->c_entries != 0) && fvp != tvp) { error = ENOTEMPTY; goto out; } @@ -2260,14 +2048,11 @@ hfs_rename(ap) /* * Make sure "from" vnode and its parent are changeable. */ - if ((VTOC(fvp)->c_flags & (IMMUTABLE | APPEND)) || - (VTOC(fdvp)->c_flags & APPEND)) { + if ((fcp->c_flags & (IMMUTABLE | APPEND)) || (fdcp->c_flags & APPEND)) { error = EPERM; goto out; } - hfsmp = VTOHFS(tdvp); - /* * If the destination parent directory is "sticky", then the * user must own the parent directory, or the destination of @@ -2275,120 +2060,21 @@ hfs_rename(ap) * (except by root). This implements append-only directories. * * Note that checks for immutable and write access are done - * by the call to VOP_REMOVE. + * by the call to hfs_removefile. */ if (tvp && (tdcp->c_mode & S_ISTXT) && - (tcnp->cn_cred->cr_uid != 0) && - (tcnp->cn_cred->cr_uid != tdcp->c_uid) && - (hfs_owner_rights(hfsmp, VTOC(tvp)->c_uid, tcnp->cn_cred, p, false)) ) { + (suser(vfs_context_ucred(tcnp->cn_context), NULL)) && + (kauth_cred_getuid(vfs_context_ucred(tcnp->cn_context)) != tdcp->c_uid) && + (hfs_owner_rights(hfsmp, tcp->c_uid, vfs_context_ucred(tcnp->cn_context), p, false)) ) { error = EPERM; goto out; } #if QUOTA if (tvp) - (void)hfs_getinoquota(VTOC(tvp)); + (void)hfs_getinoquota(tcp); #endif - - /* - * Lock all the vnodes before starting a journal transaction. - */ - - /* - * Simple case (same parent) - just lock child (fvp). - */ - if (fdvp == tdvp) { - if (error = vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p)) - goto out; - fvp_locked = 1; - goto vnlocked; - } - - /* - * If fdvp is the parent of tdvp then we'll need to - * drop tdvp's lock before acquiring a lock on fdvp. - * - * fdvp - * o - * / \ - * / \ - * tdvp o o fvp - * \ - * \ - * o tvp - * - * - * If the parent directories are unrelated then we'll - * need to aquire their vnode locks in vnode address - * order. Otherwise we can race with another rename - * call that involves the same vnodes except that to - * and from are switched and potentially deadlock. - * [ie rename("a/b", "c/d") vs rename("c/d", "a/b")] - * - * If its not either of the two above cases then we - * can safely lock fdvp and fvp. - */ - if ((VTOC(fdvp)->c_cnid == VTOC(tdvp)->c_parentcnid) || - ((VTOC(tdvp)->c_cnid != VTOC(fdvp)->c_parentcnid) && - (fdvp < tdvp))) { - - /* Drop locks on tvp and tdvp */ - if (tvp_locked) { - VOP_UNLOCK(tvp, 0, p); - tvp_locked = 0; - } - VOP_UNLOCK(tdvp, 0, p); - tdvp_locked = 0; - - /* Aquire locks in correct order */ - if ((error = vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p))) - goto out; - fdvp_locked = 1; - if ((error = vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY, p))) - goto out; - tdvp_locked = 1; - - /* - * Now that the parents are locked only one thread - * can continue. 
So the lock order of the children - * doesn't really matter - */ - if (tvp == fvp) { - if ((error = vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, p))) - goto out; - tvp_locked = 1; - } else { - if (tvp) { - if ((error = vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, p))) - goto out; - tvp_locked = 1; - } - if ((error = vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p))) - goto out; - fvp_locked = 1; - } - - } else /* OK to lock fdvp and fvp */ { - if ((error = vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p))) - goto out; - fdvp_locked = 1; - if (error = vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p)) - goto out; - if (tvp == fvp) - tvp_locked = 1; - else - fvp_locked = 1; - } - -vnlocked: - fdcp = VTOC(fdvp); - fcp = VTOC(fvp); - - /* - * While fvp is still locked, purge it from the name cache and - * grab it's c_cnid value. Note that the removal of tvp (below) - * can drop fvp's lock when fvp == tvp. - */ + /* Preflighting done, take fvp out of the name space. */ cache_purge(fvp); /* @@ -2396,14 +2082,13 @@ vnlocked: * we can drop its NODUMP status. */ if ((fcp->c_flags & UF_NODUMP) && - (fvp->v_type == VREG) && + vnode_isreg(fvp) && (fdvp != tdvp) && (fdcp->c_desc.cd_nameptr != NULL) && (strcmp(fdcp->c_desc.cd_nameptr, CARBON_TEMP_DIR_NAME) == 0)) { fcp->c_flags &= ~UF_NODUMP; - fcp->c_flag |= C_CHANGE; - tv = time; - (void) VOP_UPDATE(fvp, &tv, &tv, 0); + fcp->c_touch_chgtime = TRUE; + (void) hfs_update(fvp, 0); } bzero(&from_desc, sizeof(from_desc)); @@ -2420,80 +2105,108 @@ vnlocked: to_desc.cd_flags = fcp->c_desc.cd_flags & ~(CD_HASBUF | CD_DECOMPOSED); to_desc.cd_cnid = fcp->c_cnid; - hfs_global_shared_lock_acquire(hfsmp); - grabbed_lock = 1; - if (hfsmp->jnl) { - if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { - goto out; + if ((error = hfs_start_transaction(hfsmp)) != 0) { + goto out; + } + started_tr = 1; + + // if it's a hardlink then re-lookup the name so + // that we get the correct cnid in from_desc (see + // the comment in hfs_removefile for more details) + // + if (fcp->c_flag & C_HARDLINK) { + struct cat_desc tmpdesc; + cnid_t real_cnid; + + bzero(&tmpdesc, sizeof(tmpdesc)); + tmpdesc.cd_nameptr = fcnp->cn_nameptr; + tmpdesc.cd_namelen = fcnp->cn_namelen; + tmpdesc.cd_parentcnid = fdcp->c_cnid; + tmpdesc.cd_hint = fdcp->c_childhint; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + if (cat_lookup(hfsmp, &tmpdesc, 0, NULL, NULL, NULL, &real_cnid) != 0) { + hfs_systemfile_unlock(hfsmp, lockflags); + goto out; } - started_tr = 1; + + // use the real cnid instead of whatever happened to be there + from_desc.cd_cnid = real_cnid; + hfs_systemfile_unlock(hfsmp, lockflags); } /* * Reserve some space in the Catalog file. */ + bzero(&cookie, sizeof(cookie)); if ((error = cat_preflight(hfsmp, CAT_RENAME + CAT_DELETE, &cookie, p))) { goto out; } + got_cookie = 1; /* - * If the destination exists then it needs to be removed. + * If the destination exists then it may need to be removed. */ - if (tvp) { - if (tvp != fvp) - cache_purge(tvp); /* - * Note that hfs_removedir and hfs_removefile - * will keep tdvp locked with a reference. - * But tvp will lose its lock and reference. + * When fvp matches tvp they must be case variants + * or hard links. */ - if (tvp->v_type == VDIR) - error = hfs_removedir(tdvp, tvp, tcnp, HFSRM_RENAMEOPTS); - else - error = hfs_removefile(tdvp, tvp, tcnp, HFSRM_RENAMEOPTS); + if (fvp == tvp) { + /* + * If this is a hard link with different parents + * and it's not a case variant then tvp should + * be removed. 
+ */ + if (!((fcp->c_flag & C_HARDLINK) && + ((fdvp != tdvp) || + (hfs_namecmp(fcnp->cn_nameptr, fcnp->cn_namelen, + tcnp->cn_nameptr, tcnp->cn_namelen) != 0)))) { + goto skip; + } + } else { + cache_purge(tvp); + } + + if (vnode_isdir(tvp)) + error = hfs_removedir(tdvp, tvp, tcnp, HFSRM_SKIP_RESERVE); + else { + error = hfs_removefile(tdvp, tvp, tcnp, 0, HFSRM_SKIP_RESERVE); + } - if (tvp == fvp) - fvp_locked = 0; - tvp = NULL; - tvp_locked = 0; - tvp_deleted = 1; if (error) goto out; + tvp_deleted = 1; } - +skip: /* * All done with tvp and fvp */ - /* Lock catalog b-tree */ - error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); - if (error) - goto out; - + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); error = cat_rename(hfsmp, &from_desc, &tdcp->c_desc, &to_desc, &out_desc); - - /* Unlock catalog b-tree */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + hfs_systemfile_unlock(hfsmp, lockflags); if (error) { goto out; } + /* Invalidate negative cache entries in the destination directory */ + if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE) + cache_purge_negatives(tdvp); + /* Update cnode's catalog descriptor */ - if (fvp_locked) { - replace_desc(fcp, &out_desc); - fcp->c_parentcnid = tdcp->c_cnid; - fcp->c_hint = 0; - } + replace_desc(fcp, &out_desc); + fcp->c_parentcnid = tdcp->c_cnid; + fcp->c_hint = 0; - hfs_volupdate(hfsmp, fvp->v_type == VDIR ? VOL_RMDIR : VOL_RMFILE, + hfs_volupdate(hfsmp, vnode_isdir(fvp) ? VOL_RMDIR : VOL_RMFILE, (fdcp->c_cnid == kHFSRootFolderID)); - hfs_volupdate(hfsmp, fvp->v_type == VDIR ? VOL_MKDIR : VOL_MKFILE, + hfs_volupdate(hfsmp, vnode_isdir(fvp) ? VOL_MKDIR : VOL_MKFILE, (tdcp->c_cnid == kHFSRootFolderID)); /* Update both parent directories. */ - tv = time; if (fdvp != tdvp) { tdcp->c_nlink++; tdcp->c_entries++; @@ -2501,22 +2214,24 @@ vnlocked: fdcp->c_nlink--; if (fdcp->c_entries > 0) fdcp->c_entries--; - fdcp->c_flag |= C_CHANGE | C_UPDATE; - (void) VOP_UPDATE(fdvp, &tv, &tv, 0); + fdcp->c_touch_chgtime = TRUE; + fdcp->c_touch_modtime = TRUE; + + fdcp->c_flag |= C_FORCEUPDATE; // XXXdbg - force it out! + (void) hfs_update(fdvp, 0); } tdcp->c_childhint = out_desc.cd_hint; /* Cache directory's location */ - tdcp->c_flag |= C_CHANGE | C_UPDATE; - (void) VOP_UPDATE(tdvp, &tv, &tv, 0); + tdcp->c_touch_chgtime = TRUE; + tdcp->c_touch_modtime = TRUE; + tdcp->c_flag |= C_FORCEUPDATE; // XXXdbg - force it out! 
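/*
 * Editor's note -- a minimal sketch of the timestamp idiom used all
 * through this patch.  The old code set C_CHANGE/C_UPDATE and called
 * VOP_UPDATE() with explicit timevals; the new code just records which
 * times need touching and lets hfs_update() fold them into the cnode
 * via hfs_touchtimes() (seen later in this patch):
 *
 *	cp->c_touch_chgtime = TRUE;	// ctime wants refreshing
 *	cp->c_touch_modtime = TRUE;	// mtime wants refreshing
 *	(void) hfs_update(vp, 0);	// hfs_touchtimes() applies both
 */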
+ (void) hfs_update(tdvp, 0); out: - if (hfsmp) { + if (got_cookie) { cat_postflight(hfsmp, &cookie, p); } if (started_tr) { - journal_end_transaction(hfsmp->jnl); - } - if (grabbed_lock) { - hfs_global_shared_lock_release(hfsmp); + hfs_end_transaction(hfsmp); } /* Note that if hfs_removedir or hfs_removefile was invoked above they will already have @@ -2527,24 +2242,11 @@ out: HFS_KNOTE(fdvp, NOTE_WRITE); if (tdvp != fdvp) HFS_KNOTE(tdvp, NOTE_WRITE); }; - if (fvp_locked) { - VOP_UNLOCK(fvp, 0, p); - } - if (fdvp_locked) { - VOP_UNLOCK(fdvp, 0, p); - } - if (tdvp_locked) { - VOP_UNLOCK(tdvp, 0, p); - } - if (tvp_locked) { - VOP_UNLOCK(tvp, 0, p); - } - vrele(fvp); - vrele(fdvp); - if (tvp) - vrele(tvp); - vrele(tdvp); + if (took_trunc_lock) + hfs_unlock_truncate(VTOC(tvp)); + + hfs_unlockfour(fdcp, fcp, tdcp, tcp); /* After tvp is removed the only acceptable error is EIO */ if (error && tvp_deleted) @@ -2554,239 +2256,276 @@ out: } - /* - * Mkdir system call -#% mkdir dvp L U U -#% mkdir vpp - L - -# - vop_mkdir { - IN WILLRELE struct vnode *dvp; - OUT struct vnode **vpp; - IN struct componentname *cnp; - IN struct vattr *vap; - - We are responsible for freeing the namei buffer, - it is done in hfs_makenode() -*/ - + * Make a directory. + */ static int -hfs_mkdir(ap) - struct vop_mkdir_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vattr *a_vap; - } */ *ap; +hfs_vnop_mkdir(struct vnop_mkdir_args *ap) { - struct vattr *vap = ap->a_vap; - - return (hfs_makenode(MAKEIMODE(vap->va_type, vap->va_mode), - ap->a_dvp, ap->a_vpp, ap->a_cnp)); + /***** HACK ALERT ********/ + ap->a_cnp->cn_flags |= MAKEENTRY; + return hfs_makenode(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap, ap->a_context); } /* - * symlink -- make a symbolic link -#% symlink dvp L U U -#% symlink vpp - U - -# -# XXX - note that the return vnode has already been VRELE'ed -# by the filesystem layer. To use it you must use vget, -# possibly with a further namei. -# - vop_symlink { - IN WILLRELE struct vnode *dvp; - OUT WILLRELE struct vnode **vpp; - IN struct componentname *cnp; - IN struct vattr *vap; - IN char *target; - - We are responsible for freeing the namei buffer, - it is done in hfs_makenode(). - -*/ - + * Create a symbolic link. 
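 */

/*
 * Editor's aside -- illustrative only, not part of the patch.  The
 * handler below stores the target string in one buffer rounded up to
 * the physical block size; e.g. for a 13-byte target on a device with
 * 512-byte blocks:
 *
 *	len = strlen("/tmp/targetXX");			// 13
 *	bp = buf_getblk(vp, (daddr64_t)0,
 *	        roundup(len, 512), 0, 0, BLK_META);	// one 512-byte block
 *	// the block is zero-filled, then the 13 target bytes are copied in
 *
 * Empty targets are rejected with EINVAL, and HFS standard volumes get
 * ENOTSUP since they never supported symlinks.
 */

/* (hfs_vnop_symlink follows)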
+ */ static int -hfs_symlink(ap) - struct vop_symlink_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vattr *a_vap; - char *a_target; - } */ *ap; +hfs_vnop_symlink(struct vnop_symlink_args *ap) { - register struct vnode *vp, **vpp = ap->a_vpp; + struct vnode **vpp = ap->a_vpp; + struct vnode *dvp = ap->a_dvp; + struct vnode *vp = NULL; struct hfsmount *hfsmp; struct filefork *fp; - int len, error; struct buf *bp = NULL; + char *datap; + int started_tr = 0; + int len, error; /* HFS standard disks don't support symbolic links */ - if (VTOVCB(ap->a_dvp)->vcbSigWord != kHFSPlusSigWord) { - VOP_ABORTOP(ap->a_dvp, ap->a_cnp); - vput(ap->a_dvp); - return (EOPNOTSUPP); - } + if (VTOVCB(dvp)->vcbSigWord != kHFSPlusSigWord) + return (ENOTSUP); /* Check for empty target name */ - if (ap->a_target[0] == 0) { - VOP_ABORTOP(ap->a_dvp, ap->a_cnp); - vput(ap->a_dvp); + if (ap->a_target[0] == 0) return (EINVAL); - } - - - hfsmp = VTOHFS(ap->a_dvp); /* Create the vnode */ - if ((error = hfs_makenode(S_IFLNK | ap->a_vap->va_mode, - ap->a_dvp, vpp, ap->a_cnp))) { - return (error); + ap->a_vap->va_mode |= S_IFLNK; + if ((error = hfs_makenode(dvp, vpp, ap->a_cnp, ap->a_vap, ap->a_context))) { + goto out; } - vp = *vpp; - len = strlen(ap->a_target); + if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) + return (error); fp = VTOF(vp); + hfsmp = VTOHFS(dvp); + len = strlen(ap->a_target); #if QUOTA (void)hfs_getinoquota(VTOC(vp)); #endif /* QUOTA */ - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { - hfs_global_shared_lock_release(hfsmp); - vput(vp); - return error; - } + if ((error = hfs_start_transaction(hfsmp)) != 0) { + goto out; } + started_tr = 1; - /* Allocate space for the link */ - error = VOP_TRUNCATE(vp, len, IO_NOZEROFILL, - ap->a_cnp->cn_cred, ap->a_cnp->cn_proc); + /* + * Allocate space for the link. + * + * Since we're already inside a transaction, + * tell hfs_truncate to skip the ubc_setsize. + * + * Don't need truncate lock since a symlink is treated as a system file. + */ + error = hfs_truncate(vp, len, IO_NOZEROFILL, 1, ap->a_context); if (error) goto out; /* XXX need to remove link */ /* Write the link to disk */ - bp = getblk(vp, 0, roundup((int)fp->ff_size, VTOHFS(vp)->hfs_phys_block_size), + bp = buf_getblk(vp, (daddr64_t)0, roundup((int)fp->ff_size, VTOHFS(vp)->hfs_phys_block_size), 0, 0, BLK_META); if (hfsmp->jnl) { journal_modify_block_start(hfsmp->jnl, bp); } - bzero(bp->b_data, bp->b_bufsize); - bcopy(ap->a_target, bp->b_data, len); + datap = (char *)buf_dataptr(bp); + bzero(datap, buf_size(bp)); + bcopy(ap->a_target, datap, len); + if (hfsmp->jnl) { journal_modify_block_end(hfsmp->jnl, bp); } else { - bawrite(bp); + buf_bawrite(bp); } + /* + * We defered the ubc_setsize for hfs_truncate + * since we were inside a transaction. + * + * We don't need to drop the cnode lock here + * since this is a symlink. + */ + ubc_setsize(vp, len); out: - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); + if (started_tr) + hfs_end_transaction(hfsmp); + if (vp) { + hfs_unlock(VTOC(vp)); } - hfs_global_shared_lock_release(hfsmp); - vput(vp); return (error); } -/* - * Dummy dirents to simulate the "." and ".." entries of the directory - * in a hfs filesystem. HFS doesn't provide these on disk. Note that - * the size of these entries is the smallest needed to represent them - * (only 12 byte each). 
- */ -static hfsdotentry rootdots[2] = { - { - 1, /* d_fileno */ - sizeof(struct hfsdotentry), /* d_reclen */ - DT_DIR, /* d_type */ - 1, /* d_namlen */ - "." /* d_name */ - }, - { - 1, /* d_fileno */ - sizeof(struct hfsdotentry), /* d_reclen */ - DT_DIR, /* d_type */ - 2, /* d_namlen */ - ".." /* d_name */ - } +/* structures to hold a "." or ".." directory entry */ +struct hfs_stddotentry { + u_int32_t d_fileno; /* unique file number */ + u_int16_t d_reclen; /* length of this structure */ + u_int8_t d_type; /* dirent file type */ + u_int8_t d_namlen; /* len of filename */ + char d_name[4]; /* "." or ".." */ }; -/* 4.3 Note: -* There is some confusion as to what the semantics of uio_offset are. -* In ufs, it represents the actual byte offset within the directory -* "file." HFS, however, just uses it as an entry counter - essentially -* assuming that it has no meaning except to the hfs_readdir function. -* This approach would be more efficient here, but some callers may -* assume the uio_offset acts like a byte offset. NFS in fact -* monkeys around with the offset field a lot between readdir calls. -* -* The use of the resid uiop->uio_resid and uiop->uio_iov->iov_len -* fields is a mess as well. The libc function readdir() returns -* NULL (indicating the end of a directory) when either -* the getdirentries() syscall (which calls this and returns -* the size of the buffer passed in less the value of uiop->uio_resid) -* returns 0, or a direct record with a d_reclen of zero. -* nfs_server.c:rfs_readdir(), on the other hand, checks for the end -* of the directory by testing uiop->uio_resid == 0. The solution -* is to pad the size of the last struct direct in a given -* block to fill the block if we are not at the end of the directory. -*/ +struct hfs_extdotentry { + u_int64_t d_fileno; /* unique file number */ + u_int64_t d_seekoff; /* seek offset (optional, used by servers) */ + u_int16_t d_reclen; /* length of this structure */ + u_int16_t d_namlen; /* len of filename */ + u_int8_t d_type; /* dirent file type */ + u_char d_name[3]; /* "." or ".." */ +}; +typedef union { + struct hfs_stddotentry std; + struct hfs_extdotentry ext; +} hfs_dotentry_t; /* - * NOTE: We require a minimal buffer size of DIRBLKSIZ for two reasons. One, it is the same value - * returned be stat() call as the block size. This is mentioned in the man page for getdirentries(): - * "Nbytes must be greater than or equal to the block size associated with the file, - * see stat(2)". Might as well settle on the same size of ufs. Second, this makes sure there is enough - * room for the . and .. entries that have to added manually. + * hfs_vnop_readdir reads directory entries into the buffer pointed + * to by uio, in a filesystem independent format. Up to uio_resid + * bytes of data can be transferred. The data in the buffer is a + * series of packed dirent structures where each one contains the + * following entries: + * + * u_int32_t d_fileno; // file number of entry + * u_int16_t d_reclen; // length of this record + * u_int8_t d_type; // file type + * u_int8_t d_namlen; // length of string in d_name + * char d_name[MAXNAMELEN+1]; // null terminated file name + * + * The current position (uio_offset) refers to the next block of + * entries. The offset can only be set to a value previously + * returned by hfs_vnop_readdir or zero. This offset does not have + * to match the number of bytes returned (in uio_resid). + * + * In fact, the offset used by HFS is essentially an index (26 bits) + * with a tag (6 bits). 
The tag is for associating the next request + * with the current request. This enables us to have multiple threads + * reading the directory while the directory is also being modified. + * + * Each tag/index pair is tied to a unique directory hint. The hint + * contains information (filename) needed to build the catalog b-tree + * key for finding the next set of entries. */ - -/* -#% readdir vp L L L -# -vop_readdir { - IN struct vnode *vp; - INOUT struct uio *uio; - IN struct ucred *cred; - INOUT int *eofflag; - OUT int *ncookies; - INOUT u_long **cookies; - */ static int -hfs_readdir(ap) - struct vop_readdir_args /* { - struct vnode *vp; - struct uio *uio; - struct ucred *cred; - int *eofflag; - int *ncookies; - u_long **cookies; +hfs_vnop_readdir(ap) + struct vnop_readdir_args /* { + vnode_t a_vp; + uio_t a_uio; + int a_flags; + int *a_eofflag; + int *a_numdirent; + vfs_context_t a_context; } */ *ap; { - register struct uio *uio = ap->a_uio; - struct cnode *cp = VTOC(ap->a_vp); - struct hfsmount *hfsmp = VTOHFS(ap->a_vp); - struct proc *p = current_proc(); - off_t off = uio->uio_offset; - int retval = 0; + struct vnode *vp = ap->a_vp; + uio_t uio = ap->a_uio; + struct cnode *cp; + struct hfsmount *hfsmp; + directoryhint_t *dirhint = NULL; + directoryhint_t localhint; + off_t offset; + off_t startoffset; + int error = 0; int eofflag = 0; - void *user_start = NULL; - int user_len; + user_addr_t user_start = 0; + user_size_t user_len = 0; + int index; + unsigned int tag; + int items; + int lockflags; + int extended; + int nfs_cookies; + caddr_t bufstart; + cnid_t cnid_hint = 0; + + items = 0; + startoffset = offset = uio_offset(uio); + bufstart = CAST_DOWN(caddr_t, uio_iov_base(uio)); + extended = (ap->a_flags & VNODE_READDIR_EXTENDED); + nfs_cookies = extended && (ap->a_flags & VNODE_READDIR_REQSEEKOFF); + + /* Sanity check the uio data. */ + if ((uio_iovcnt(uio) > 1) || + (uio_resid(uio) < (int)sizeof(struct dirent))) { + return (EINVAL); + } + /* Note that the dirhint calls require an exclusive lock. */ + if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) + return (error); + cp = VTOC(vp); + hfsmp = VTOHFS(vp); - int ncookies=0; - u_long *cookies=NULL; - u_long *cookiep=NULL; - - /* We assume it's all one big buffer... */ - if (uio->uio_iovcnt > 1 || uio->uio_resid < AVERAGE_HFSDIRENTRY_SIZE) - return EINVAL; + /* Pick up cnid hint (if any). */ + if (nfs_cookies) { + cnid_hint = (cnid_t)(uio_offset(uio) >> 32); + uio_setoffset(uio, uio_offset(uio) & 0x00000000ffffffffLL); + } + /* + * Synthesize entries for "." and ".." 
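 */

/*
 * Editor's note -- why d_name is 4 (or 3) bytes in the structures
 * above: with natural alignment and no extra compiler padding the two
 * layouts pack to constant sizes, so every "." and ".." record has a
 * fixed reclen:
 *
 *	sizeof(struct hfs_stddotentry) == 4+2+1+1+4 == 12 bytes
 *	sizeof(struct hfs_extdotentry) == 8+8+2+2+1+3 == 24 bytes
 *
 * which is what the uiosize arithmetic below relies on.
 */

/* Synthesize the "." and ".." entries: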
+ */ + if (offset == 0) { + hfs_dotentry_t dotentry[2]; + size_t uiosize; + + if (extended) { + struct hfs_extdotentry *entry = &dotentry[0].ext; + + entry->d_fileno = cp->c_cnid; + entry->d_reclen = sizeof(struct hfs_extdotentry); + entry->d_type = DT_DIR; + entry->d_namlen = 1; + entry->d_name[0] = '.'; + entry->d_name[1] = '\0'; + entry->d_name[2] = '\0'; + entry->d_seekoff = 1; + + ++entry; + entry->d_fileno = cp->c_parentcnid; + entry->d_reclen = sizeof(struct hfs_extdotentry); + entry->d_type = DT_DIR; + entry->d_namlen = 2; + entry->d_name[0] = '.'; + entry->d_name[1] = '.'; + entry->d_name[2] = '\0'; + entry->d_seekoff = 2; + uiosize = 2 * sizeof(struct hfs_extdotentry); + } else { + struct hfs_stddotentry *entry = &dotentry[0].std; + + entry->d_fileno = cp->c_cnid; + entry->d_reclen = sizeof(struct hfs_stddotentry); + entry->d_type = DT_DIR; + entry->d_namlen = 1; + *(int *)&entry->d_name[0] = 0; + entry->d_name[0] = '.'; + + ++entry; + entry->d_fileno = cp->c_parentcnid; + entry->d_reclen = sizeof(struct hfs_stddotentry); + entry->d_type = DT_DIR; + entry->d_namlen = 2; + *(int *)&entry->d_name[0] = 0; + entry->d_name[0] = '.'; + entry->d_name[1] = '.'; + uiosize = 2 * sizeof(struct hfs_stddotentry); + } + if ((error = uiomove((caddr_t)&dotentry, uiosize, uio))) { + goto out; + } + offset += 2; + } - // XXXdbg + /* If there are no real entries then we're done. */ + if (cp->c_entries == 0) { + error = 0; + eofflag = 1; + uio_setoffset(uio, offset); + goto seekoffcalc; + } + + // // We have to lock the user's buffer here so that we won't // fault on it after we've acquired a shared lock on the // catalog file. The issue is that you can get a 3-way @@ -2804,168 +2543,129 @@ hfs_readdir(ap) // currently (10/30/02) that can fault on user data with a // shared lock on the catalog file. // - if (hfsmp->jnl && uio->uio_segflg == UIO_USERSPACE) { - user_start = uio->uio_iov->iov_base; - user_len = uio->uio_iov->iov_len; + if (hfsmp->jnl && uio_isuserspace(uio)) { + user_start = uio_curriovbase(uio); + user_len = uio_curriovlen(uio); - if ((retval = vslock(user_start, user_len)) != 0) { - return retval; + if ((error = vslock(user_start, user_len)) != 0) { + user_start = 0; + goto out; } } - - /* Create the entries for . and .. */ - if (uio->uio_offset < sizeof(rootdots)) { - caddr_t dep; - size_t dotsize; - - rootdots[0].d_fileno = cp->c_cnid; - rootdots[1].d_fileno = cp->c_parentcnid; - - if (uio->uio_offset == 0) { - dep = (caddr_t) &rootdots[0]; - dotsize = 2* sizeof(struct hfsdotentry); - } else if (uio->uio_offset == sizeof(struct hfsdotentry)) { - dep = (caddr_t) &rootdots[1]; - dotsize = sizeof(struct hfsdotentry); - } else { - retval = EINVAL; - goto Exit; + /* Convert offset into a catalog directory index. */ + index = (offset & HFS_INDEX_MASK) - 2; + tag = offset & ~HFS_INDEX_MASK; + + /* Lock catalog during cat_findname and cat_getdirentries. */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + /* When called from NFS, try and resolve a cnid hint. 
*/ + if (nfs_cookies && cnid_hint != 0) { + if (cat_findname(hfsmp, cnid_hint, &localhint.dh_desc) == 0) { + if ( localhint.dh_desc.cd_parentcnid == cp->c_cnid) { + localhint.dh_index = index - 1; + localhint.dh_time = 0; + localhint.dh_link.sle_next = 0; + dirhint = &localhint; /* don't forget to release the descriptor */ + } else { + cat_releasedesc(&localhint.dh_desc); + } } - - retval = uiomove(dep, dotsize, uio); - if (retval != 0) - goto Exit; } - if (ap->a_ncookies != NULL) { - /* - * These cookies are handles that allow NFS to restart - * scanning through a directory. If a directory is large - * enough, NFS will issue a successive readdir() with a - * uio->uio_offset that is equal to one of these cookies. - * - * The cookies that we generate are synthesized byte-offsets. - * The offset is where the dirent the dirent would be if the - * directory were an array of packed dirent structs. It is - * synthetic because that's not how directories are stored in - * HFS but other code expects that the cookie is a byte offset. - * - * We have to pre-allocate the cookies because cat_getdirentries() - * is the only one that can properly synthesize the offsets (since - * it may have to skip over entries and only it knows the true - * virtual offset of any particular directory entry). So we allocate - * a cookie table here and pass it in to cat_getdirentries(). - * - * Note that the handling of "." and ".." is mostly done here but - * cat_getdirentries() is aware of. - * - * Only the NFS server uses cookies so fortunately this code is - * not executed unless the NFS server is issuing the readdir - * request. - * - * Also note that the NFS server is the one responsible for - * free'ing the cookies even though we allocated them. Ick. - * - * We allocate a reasonable number of entries for the size of - * the buffer that we're going to fill in. cat_getdirentries() - * is smart enough to not overflow if there's more room in the - * buffer but not enough room in the cookie table. - */ - if (uio->uio_segflg != UIO_SYSSPACE) - panic("hfs_readdir: unexpected uio from NFS server"); - - ncookies = uio->uio_iov->iov_len / (AVERAGE_HFSDIRENTRY_SIZE/2); - MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); + /* Get a directory hint (cnode must be locked exclusive) */ + if (dirhint == NULL) { + dirhint = hfs_getdirhint(cp, ((index - 1) & HFS_INDEX_MASK) | tag); - *ap->a_ncookies = ncookies; - *ap->a_cookies = cookies; - - /* handle cookies for "." and ".." */ - if (off == 0) { - cookies[0] = 0; - cookies[1] = sizeof(struct hfsdotentry); - } else if (off == sizeof(struct hfsdotentry)) { - cookies[0] = sizeof(struct hfsdotentry); + /* Hide tag from catalog layer. */ + dirhint->dh_index &= HFS_INDEX_MASK; + if (dirhint->dh_index == HFS_INDEX_MASK) { + dirhint->dh_index = -1; } } + + /* Pack the buffer with dirent entries. */ + error = cat_getdirentries(hfsmp, cp->c_entries, dirhint, uio, extended, &items); - /* If there are no children then we're done */ - if (cp->c_entries == 0) { + hfs_systemfile_unlock(hfsmp, lockflags); + + if (error != 0) { + goto out; + } + + /* Get index to the next item */ + index += items; + + if (items >= (int)cp->c_entries) { eofflag = 1; - retval = 0; - if (cookies) { - cookies[0] = 0; - cookies[1] = sizeof(struct hfsdotentry); - } - goto Exit; } - /* Lock catalog b-tree */ - retval = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, p); - if (retval) goto Exit; + /* Convert catalog directory index back into an offset. 
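 */

/*
 * Editor's sketch -- how the 26-bit index and 6-bit tag described in
 * the hfs_vnop_readdir comment pack into one 32-bit offset, assuming
 * HFS_INDEX_BITS == 26 and HFS_INDEX_MASK == ((1 << 26) - 1) to match
 * the masking used in this function:
 *
 *	index  = (offset & HFS_INDEX_MASK) - 2;	// entries 0,1 are "." and ".."
 *	tag    =  offset & ~HFS_INDEX_MASK;	// kept pre-shifted, high 6 bits
 *	offset = (index + 2) | tag;		// recombined just below
 */

/* Convert the index back into an offset: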
*/ + while (tag == 0) + tag = (++cp->c_dirhinttag) << HFS_INDEX_BITS; + uio_setoffset(uio, (index + 2) | tag); + dirhint->dh_index |= tag; - retval = cat_getdirentries(hfsmp, &cp->c_desc, cp->c_entries, uio, &eofflag, cookies, ncookies); +seekoffcalc: + cp->c_touch_acctime = TRUE; - /* Unlock catalog b-tree */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); - - if (retval != E_NONE) { - goto Exit; - } - - /* were we already past eof ? */ - if (uio->uio_offset == off) { - retval = E_NONE; - goto Exit; + if (ap->a_numdirent) { + if (startoffset == 0) + items += 2; + *ap->a_numdirent = items; } - - cp->c_flag |= C_ACCESS; -Exit:; +out: if (hfsmp->jnl && user_start) { vsunlock(user_start, user_len, TRUE); } - - if (ap->a_eofflag) + /* If we didn't do anything then go ahead and dump the hint. */ + if ((dirhint != NULL) && + (dirhint != &localhint) && + (uio_offset(uio) == startoffset)) { + hfs_reldirhint(cp, dirhint); + eofflag = 1; + } + if (ap->a_eofflag) { *ap->a_eofflag = eofflag; - - return (retval); + } + if (dirhint == &localhint) { + cat_releasedesc(&localhint.dh_desc); + } + hfs_unlock(cp); + return (error); } /* - * Return target name of a symbolic link -#% readlink vp L L L -# - vop_readlink { - IN struct vnode *vp; - INOUT struct uio *uio; - IN struct ucred *cred; - */ - + * Read contents of a symbolic link. + */ static int -hfs_readlink(ap) - struct vop_readlink_args /* { +hfs_vnop_readlink(ap) + struct vnop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { - int retval; struct vnode *vp = ap->a_vp; struct cnode *cp; struct filefork *fp; + int error; - if (vp->v_type != VLNK) + if (!vnode_islnk(vp)) return (EINVAL); + if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) + return (error); cp = VTOC(vp); fp = VTOF(vp); /* Zero length sym links are not allowed */ if (fp->ff_size == 0 || fp->ff_size > MAXPATHLEN) { VTOVCB(vp)->vcbFlags |= kHFS_DamagedVolume; - return (EINVAL); + error = EINVAL; + goto exit; } /* Cache the path so we don't waste buffer cache resources */ @@ -2973,182 +2673,79 @@ hfs_readlink(ap) struct buf *bp = NULL; MALLOC(fp->ff_symlinkptr, char *, fp->ff_size, M_TEMP, M_WAITOK); - retval = meta_bread(vp, 0, - roundup((int)fp->ff_size, - VTOHFS(vp)->hfs_phys_block_size), - ap->a_cred, &bp); - if (retval) { + error = (int)buf_meta_bread(vp, (daddr64_t)0, + roundup((int)fp->ff_size, + VTOHFS(vp)->hfs_phys_block_size), + vfs_context_ucred(ap->a_context), &bp); + if (error) { if (bp) - brelse(bp); + buf_brelse(bp); if (fp->ff_symlinkptr) { FREE(fp->ff_symlinkptr, M_TEMP); fp->ff_symlinkptr = NULL; } - return (retval); - } - bcopy(bp->b_data, fp->ff_symlinkptr, (size_t)fp->ff_size); - if (bp) { - if (VTOHFS(vp)->jnl && (bp->b_flags & B_LOCKED) == 0) { - bp->b_flags |= B_INVAL; /* data no longer needed */ - } - brelse(bp); + goto exit; } - } - retval = uiomove((caddr_t)fp->ff_symlinkptr, (int)fp->ff_size, ap->a_uio); -#if 1 - /* - * Keep track blocks read - */ - if ((VTOHFS(vp)->hfc_stage == HFC_RECORDING) && (retval == 0)) { - - /* - * If this file hasn't been seen since the start of - * the current sampling period then start over. - */ - if (cp->c_atime < VTOHFS(vp)->hfc_timebase) - VTOF(vp)->ff_bytesread = fp->ff_size; - else - VTOF(vp)->ff_bytesread += fp->ff_size; - - // if (VTOF(vp)->ff_bytesread > fp->ff_size) - // cp->c_flag |= C_ACCESS; - } -#endif - return (retval); -} - -/* - * Lock an cnode. If its already locked, set the WANT bit and sleep. 
-#% lock vp U L U -# - vop_lock { - IN struct vnode *vp; - IN int flags; - IN struct proc *p; - */ - -static int -hfs_lock(ap) - struct vop_lock_args /* { - struct vnode *a_vp; - int a_flags; - struct proc *a_p; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - struct cnode *cp = VTOC(vp); - - return (lockmgr(&cp->c_lock, ap->a_flags, &vp->v_interlock, ap->a_p)); -} + bcopy((char *)buf_dataptr(bp), fp->ff_symlinkptr, (size_t)fp->ff_size); -/* - * Unlock an cnode. -#% unlock vp L U L -# - vop_unlock { - IN struct vnode *vp; - IN int flags; - IN struct proc *p; - - */ -static int -hfs_unlock(ap) - struct vop_unlock_args /* { - struct vnode *a_vp; - int a_flags; - struct proc *a_p; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - struct cnode *cp = VTOC(vp); -#if 0 - if (!lockstatus(&cp->c_lock)) { - printf("hfs_unlock: vnode %s wasn't locked!\n", - cp->c_desc.cd_nameptr ? cp->c_desc.cd_nameptr : ""); + if (VTOHFS(vp)->jnl && (buf_flags(bp) & B_LOCKED) == 0) { + buf_markinvalid(bp); /* data no longer needed */ + } + buf_brelse(bp); } -#endif - return (lockmgr(&cp->c_lock, ap->a_flags | LK_RELEASE, - &vp->v_interlock, ap->a_p)); -} + error = uiomove((caddr_t)fp->ff_symlinkptr, (int)fp->ff_size, ap->a_uio); + /* + * Keep track blocks read + */ + if ((VTOHFS(vp)->hfc_stage == HFC_RECORDING) && (error == 0)) { + + /* + * If this file hasn't been seen since the start of + * the current sampling period then start over. + */ + if (cp->c_atime < VTOHFS(vp)->hfc_timebase) + VTOF(vp)->ff_bytesread = fp->ff_size; + else + VTOF(vp)->ff_bytesread += fp->ff_size; + + // if (VTOF(vp)->ff_bytesread > fp->ff_size) + // cp->c_touch_acctime = TRUE; + } -/* - * Print out the contents of a cnode. -#% print vp = = = -# - vop_print { - IN struct vnode *vp; - */ -static int -hfs_print(ap) - struct vop_print_args /* { - struct vnode *a_vp; - } */ *ap; -{ - struct vnode * vp = ap->a_vp; - struct cnode *cp = VTOC(vp); - - printf("tag VT_HFS, cnid %d, on dev %d, %d", cp->c_cnid, - major(cp->c_dev), minor(cp->c_dev)); -#if FIFO - if (vp->v_type == VFIFO) - fifo_printinfo(vp); -#endif /* FIFO */ - lockmgr_printinfo(&cp->c_lock); - printf("\n"); - return (0); +exit: + hfs_unlock(cp); + return (error); } /* - * Check for a locked cnode. -#% islocked vp = = = -# - vop_islocked { - IN struct vnode *vp; - - */ -static int -hfs_islocked(ap) - struct vop_islocked_args /* { - struct vnode *a_vp; - } */ *ap; -{ - return (lockstatus(&VTOC(ap->a_vp)->c_lock)); -} - -/* - -#% pathconf vp L L L -# - vop_pathconf { - IN struct vnode *vp; - IN int name; - OUT register_t *retval; - - */ + * Get configurable pathname variables. 
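 */

/*
 * Editor's aside -- an illustrative user-space counterpart, not part
 * of the patch.  The handler below is what ultimately services
 * pathconf(2) on an HFS volume ("/Volumes/HFSVol" is a hypothetical
 * mount point):
 *
 *	#include <unistd.h>
 *	long namemax = pathconf("/Volumes/HFSVol", _PC_NAME_MAX);
 *	// 255 on HFS Plus, 31 on HFS standard
 */

/* (hfs_vnop_pathconf follows)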
+ */ static int -hfs_pathconf(ap) - struct vop_pathconf_args /* { +hfs_vnop_pathconf(ap) + struct vnop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; + vfs_context_t a_context; } */ *ap; { - int retval = 0; - switch (ap->a_name) { case _PC_LINK_MAX: - if (VTOVCB(ap->a_vp)->vcbSigWord == kHFSPlusSigWord) - *ap->a_retval = HFS_LINK_MAX; - else + if (VTOHFS(ap->a_vp)->hfs_flags & HFS_STANDARD) *ap->a_retval = 1; + else + *ap->a_retval = HFS_LINK_MAX; break; case _PC_NAME_MAX: - *ap->a_retval = kHFSPlusMaxFileNameBytes; /* max # of characters x max utf8 representation */ + if (VTOHFS(ap->a_vp)->hfs_flags & HFS_STANDARD) + *ap->a_retval = kHFSMaxFileNameChars; /* 31 */ + else + *ap->a_retval = kHFSPlusMaxFileNameChars; /* 255 */ break; case _PC_PATH_MAX: - *ap->a_retval = PATH_MAX; /* 1024 */ + *ap->a_retval = PATH_MAX; /* 1024 */ break; case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; @@ -3172,214 +2769,60 @@ hfs_pathconf(ap) *ap->a_retval = 1; break; default: - retval = EINVAL; - } - - return (retval); -} - - -/* - * Advisory record locking support -#% advlock vp U U U -# - vop_advlock { - IN struct vnode *vp; - IN caddr_t id; - IN int op; - IN struct flock *fl; - IN int flags; - - */ -static int -hfs_advlock(ap) - struct vop_advlock_args /* { - struct vnode *a_vp; - caddr_t a_id; - int a_op; - struct flock *a_fl; - int a_flags; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - struct flock *fl = ap->a_fl; - struct hfslockf *lock; - struct filefork *fork; - off_t start, end; - int retval; - - /* Only regular files can have locks */ - if (vp->v_type != VREG) - return (EISDIR); - - fork = VTOF(ap->a_vp); - /* - * Avoid the common case of unlocking when cnode has no locks. - */ - if (fork->ff_lockf == (struct hfslockf *)0) { - if (ap->a_op != F_SETLK) { - fl->l_type = F_UNLCK; - return (0); - } - } - /* - * Convert the flock structure into a start and end. - */ - start = 0; - switch (fl->l_whence) { - case SEEK_SET: - case SEEK_CUR: - /* - * Caller is responsible for adding any necessary offset - * when SEEK_CUR is used. - */ - start = fl->l_start; - break; - case SEEK_END: - start = fork->ff_size + fl->l_start; - break; - default: - return (EINVAL); - } - - if (fl->l_len == 0) - end = -1; - else if (fl->l_len > 0) - end = start + fl->l_len - 1; - else { /* l_len is negative */ - end = start - 1; - start += fl->l_len; - } - if (start < 0) return (EINVAL); - - /* - * Create the hfslockf structure - */ - MALLOC(lock, struct hfslockf *, sizeof *lock, M_LOCKF, M_WAITOK); - lock->lf_start = start; - lock->lf_end = end; - lock->lf_id = ap->a_id; - lock->lf_fork = fork; - lock->lf_type = fl->l_type; - lock->lf_next = (struct hfslockf *)0; - TAILQ_INIT(&lock->lf_blkhd); - lock->lf_flags = ap->a_flags; - /* - * Do the requested operation. - */ - switch(ap->a_op) { - case F_SETLK: - retval = hfs_setlock(lock); - break; - case F_UNLCK: - retval = hfs_clearlock(lock); - FREE(lock, M_LOCKF); - break; - case F_GETLK: - retval = hfs_getlock(lock, fl); - FREE(lock, M_LOCKF); - break; - default: - retval = EINVAL; - _FREE(lock, M_LOCKF); - break; + return (EINVAL); } - return (retval); + return (0); } - /* - * Update the access, modified, and node change times as specified - * by the C_ACCESS, C_UPDATE, and C_CHANGE flags respectively. The - * C_MODIFIED flag is used to specify that the node needs to be - * updated but that the times have already been set. The access and - * modified times are input parameters but the node change time is - * always taken from the current time. 
If waitfor is set, then wait - * for the disk write of the node to complete. + * Update a cnode's on-disk metadata. + * + * If waitfor is set, then wait for the disk write of + * the node to complete. + * + * The cnode must be locked exclusive */ -/* -#% update vp L L L - IN struct vnode *vp; - IN struct timeval *access; - IN struct timeval *modify; - IN int waitfor; -*/ -static int -hfs_update(ap) - struct vop_update_args /* { - struct vnode *a_vp; - struct timeval *a_access; - struct timeval *a_modify; - int a_waitfor; - } */ *ap; +__private_extern__ +int +hfs_update(struct vnode *vp, __unused int waitfor) { - struct vnode *vp = ap->a_vp; - struct cnode *cp = VTOC(ap->a_vp); + struct cnode *cp = VTOC(vp); struct proc *p; struct cat_fork *dataforkp = NULL; struct cat_fork *rsrcforkp = NULL; struct cat_fork datafork; - int updateflag; struct hfsmount *hfsmp; + int lockflags; int error; + p = current_proc(); hfsmp = VTOHFS(vp); - /* XXX do we really want to clear the sytem cnode flags here???? */ - if (((vp->v_flag & VSYSTEM) && (cp->c_cnid < kHFSFirstUserCatalogNodeID))|| - (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) || - (cp->c_mode == 0)) { - cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE); + if (vnode_issystem(vp) && (cp->c_cnid < kHFSFirstUserCatalogNodeID)) { + return (0); + } + if ((hfsmp->hfs_flags & HFS_READ_ONLY) || (cp->c_mode == 0)) { + cp->c_flag &= ~C_MODIFIED; + cp->c_touch_acctime = 0; + cp->c_touch_chgtime = 0; + cp->c_touch_modtime = 0; return (0); } - updateflag = cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE | C_FORCEUPDATE); + hfs_touchtimes(hfsmp, cp); /* Nothing to update. */ - if (updateflag == 0) { + if ((cp->c_flag & (C_MODIFIED | C_FORCEUPDATE)) == 0) { return (0); } - /* HFS standard doesn't have access times. */ - if ((updateflag == C_ACCESS) && (VTOVCB(vp)->vcbSigWord == kHFSSigWord)) { - return (0); - } - if (updateflag & C_ACCESS) { - /* - * When the access time is the only thing changing - * then make sure its sufficiently newer before - * committing it to disk. 
- */ - if ((updateflag == C_ACCESS) && - (ap->a_access->tv_sec < (cp->c_atime + ATIME_ONDISK_ACCURACY))) { - return (0); - } - cp->c_atime = ap->a_access->tv_sec; - } - if (updateflag & C_UPDATE) { - cp->c_mtime = ap->a_modify->tv_sec; - cp->c_mtime_nsec = ap->a_modify->tv_usec * 1000; - } - if (updateflag & C_CHANGE) { - cp->c_ctime = time.tv_sec; - /* - * HFS dates that WE set must be adjusted for DST - */ - if ((VTOVCB(vp)->vcbSigWord == kHFSSigWord) && gTimeZone.tz_dsttime) { - cp->c_ctime += 3600; - cp->c_mtime = cp->c_ctime; - } - } if (cp->c_datafork) dataforkp = &cp->c_datafork->ff_data; if (cp->c_rsrcfork) rsrcforkp = &cp->c_rsrcfork->ff_data; - p = current_proc(); - /* * For delayed allocations updates are * postponed until an fsync or the file @@ -3394,9 +2837,7 @@ hfs_update(ap) (ISSET(cp->c_flag, C_DELETED) || (dataforkp && cp->c_datafork->ff_unallocblocks) || (rsrcforkp && cp->c_rsrcfork->ff_unallocblocks))) { - if (updateflag & (C_CHANGE | C_UPDATE)) - hfs_volupdate(hfsmp, VOL_UPDATE, 0); - cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_UPDATE); + // cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_UPDATE); cp->c_flag |= C_MODIFIED; HFS_KNOTE(vp, NOTE_ATTRIB); @@ -3404,16 +2845,9 @@ hfs_update(ap) return (0); } - - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { - hfs_global_shared_lock_release(hfsmp); - return error; - } + if ((error = hfs_start_transaction(hfsmp)) != 0) { + return error; } - /* * For files with invalid ranges (holes) the on-disk @@ -3444,32 +2878,17 @@ hfs_update(ap) * A shared lock is sufficient since an update doesn't change * the tree and the lock on vp protects the cnode. */ - error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, p); - if (error) { - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); - return (error); - } + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); /* XXX - waitfor is not enforced */ error = cat_update(hfsmp, &cp->c_desc, &cp->c_attr, dataforkp, rsrcforkp); - /* Unlock the Catalog b-tree file. */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); - - if (updateflag & (C_CHANGE | C_UPDATE | C_FORCEUPDATE)) - hfs_volupdate(hfsmp, VOL_UPDATE, 0); + hfs_systemfile_unlock(hfsmp, lockflags); /* After the updates are finished, clear the flags */ - cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE | C_FORCEUPDATE); + cp->c_flag &= ~(C_MODIFIED | C_FORCEUPDATE); - // XXXdbg - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); + hfs_end_transaction(hfsmp); HFS_KNOTE(vp, NOTE_ATTRIB); @@ -3478,78 +2897,63 @@ hfs_update(ap) /* * Allocate a new node - * - * Upon leaving, namei buffer must be freed. 
- * */ static int -hfs_makenode(mode, dvp, vpp, cnp) - int mode; - struct vnode *dvp; - struct vnode **vpp; - struct componentname *cnp; +hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, + struct vnode_attr *vap, vfs_context_t ctx) { - struct cnode *cp; + struct cnode *cp = NULL; struct cnode *dcp; struct vnode *tvp; struct hfsmount *hfsmp; - struct timeval tv; - struct proc *p; struct cat_desc in_desc, out_desc; struct cat_attr attr; - cat_cookie_t cookie = {0}; - int error, started_tr = 0, grabbed_lock = 0; + struct timeval tv; + cat_cookie_t cookie; + int lockflags; + int error, started_tr = 0, got_cookie = 0; enum vtype vnodetype; + int mode; - p = cnp->cn_proc; + if ((error = hfs_lock(VTOC(dvp), HFS_EXCLUSIVE_LOCK))) + return (error); dcp = VTOC(dvp); hfsmp = VTOHFS(dvp); *vpp = NULL; tvp = NULL; - bzero(&out_desc, sizeof(out_desc)); + out_desc.cd_flags = 0; + out_desc.cd_nameptr = NULL; + + mode = MAKEIMODE(vap->va_type, vap->va_mode); if ((mode & S_IFMT) == 0) mode |= S_IFREG; vnodetype = IFTOVT(mode); - /* Check if unmount in progress */ - if (VTOVFS(dvp)->mnt_kern_flag & MNTK_UNMOUNT) { - error = EPERM; - goto exit; - } /* Check if were out of usable disk space. */ - if ((suser(cnp->cn_cred, NULL) != 0) && (hfs_freeblks(hfsmp, 1) <= 0)) { + if ((hfs_freeblks(hfsmp, 1) <= 0) && (suser(vfs_context_ucred(ctx), NULL) != 0)) { error = ENOSPC; goto exit; } + microtime(&tv); + /* Setup the default attributes */ bzero(&attr, sizeof(attr)); attr.ca_mode = mode; attr.ca_nlink = vnodetype == VDIR ? 2 : 1; - attr.ca_mtime = time.tv_sec; - attr.ca_mtime_nsec = time.tv_usec * 1000; + attr.ca_mtime = tv.tv_sec; if ((VTOVCB(dvp)->vcbSigWord == kHFSSigWord) && gTimeZone.tz_dsttime) { attr.ca_mtime += 3600; /* Same as what hfs_update does */ } attr.ca_atime = attr.ca_ctime = attr.ca_itime = attr.ca_mtime; - if (VTOVFS(dvp)->mnt_flag & MNT_UNKNOWNPERMISSIONS) { - attr.ca_uid = hfsmp->hfs_uid; - attr.ca_gid = hfsmp->hfs_gid; - } else { - if (vnodetype == VLNK) - attr.ca_uid = dcp->c_uid; - else - attr.ca_uid = cnp->cn_cred->cr_uid; - attr.ca_gid = dcp->c_gid; - } - /* - * Don't tag as a special file (BLK or CHR) until *after* - * hfs_getnewvnode is called. This insures that any - * alias checking is defered until hfs_mknod completes. - */ - if (vnodetype == VBLK || vnodetype == VCHR) - attr.ca_mode = (attr.ca_mode & ~S_IFMT) | S_IFREG; + attr.ca_atimeondisk = attr.ca_atime; + + attr.ca_uid = vap->va_uid; + attr.ca_gid = vap->va_gid; + VATTR_SET_SUPPORTED(vap, va_mode); + VATTR_SET_SUPPORTED(vap, va_uid); + VATTR_SET_SUPPORTED(vap, va_gid); /* Tag symlinks with a type and creator. */ if (vnodetype == VLNK) { @@ -3559,30 +2963,21 @@ hfs_makenode(mode, dvp, vpp, cnp) fip->fdType = SWAP_BE32(kSymLinkFileType); fip->fdCreator = SWAP_BE32(kSymLinkCreator); } - if ((attr.ca_mode & S_ISGID) && - !groupmember(dcp->c_gid, cnp->cn_cred) && - suser(cnp->cn_cred, NULL)) { - attr.ca_mode &= ~S_ISGID; - } if (cnp->cn_flags & ISWHITEOUT) attr.ca_flags |= UF_OPAQUE; /* Setup the descriptor */ - bzero(&in_desc, sizeof(in_desc)); in_desc.cd_nameptr = cnp->cn_nameptr; in_desc.cd_namelen = cnp->cn_namelen; in_desc.cd_parentcnid = dcp->c_cnid; in_desc.cd_flags = S_ISDIR(mode) ? 
CD_ISDIR : 0; + in_desc.cd_hint = dcp->c_childhint; + in_desc.cd_encoding = 0; - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - grabbed_lock = 1; - if (hfsmp->jnl) { - if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { - goto exit; - } - started_tr = 1; + if ((error = hfs_start_transaction(hfsmp)) != 0) { + goto exit; } + started_tr = 1; /* * Reserve some space in the Catalog file. @@ -3591,29 +2986,31 @@ hfs_makenode(mode, dvp, vpp, cnp) * request can cause an hfs_inactive call to * delete an unlinked file) */ - if ((error = cat_preflight(hfsmp, CAT_CREATE | CAT_DELETE, &cookie, p))) { + if ((error = cat_preflight(hfsmp, CAT_CREATE | CAT_DELETE, &cookie, 0))) { goto exit; } + got_cookie = 1; - /* Lock catalog b-tree */ - error = hfs_metafilelocking(VTOHFS(dvp), kHFSCatalogFileID, LK_EXCLUSIVE, p); - if (error) - goto exit; - + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); error = cat_create(hfsmp, &in_desc, &attr, &out_desc); - - /* Unlock catalog b-tree */ - (void) hfs_metafilelocking(VTOHFS(dvp), kHFSCatalogFileID, LK_RELEASE, p); + if (error == 0) { + /* Update the parent directory */ + dcp->c_childhint = out_desc.cd_hint; /* Cache directory's location */ + dcp->c_nlink++; + dcp->c_entries++; + dcp->c_ctime = tv.tv_sec; + dcp->c_mtime = tv.tv_sec; + (void) cat_update(hfsmp, &dcp->c_desc, &dcp->c_attr, NULL, NULL); + HFS_KNOTE(dvp, NOTE_ATTRIB); + } + hfs_systemfile_unlock(hfsmp, lockflags); if (error) goto exit; - /* Update the parent directory */ - dcp->c_childhint = out_desc.cd_hint; /* Cache directory's location */ - dcp->c_nlink++; - dcp->c_entries++; - dcp->c_flag |= C_CHANGE | C_UPDATE; - tv = time; - (void) VOP_UPDATE(dvp, &tv, &tv, 0); + /* Invalidate negative cache entries in the directory */ + if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE) + cache_purge_negatives(dvp); + if (vnodetype == VDIR) { HFS_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); } else { @@ -3630,77 +3027,74 @@ hfs_makenode(mode, dvp, vpp, cnp) // deadlock with someone on that other file system (since we could be // holding two transaction locks as well as various vnodes and we did // not obtain the locks on them in the proper order). - // + // // NOTE: this means that if the quota check fails or we have to update // the change time on a block-special device that those changes // will happen as part of independent transactions. // if (started_tr) { - journal_end_transaction(hfsmp->jnl); - started_tr = 0; - } - if (grabbed_lock) { - hfs_global_shared_lock_release(hfsmp); - grabbed_lock = 0; + hfs_end_transaction(hfsmp); + started_tr = 0; } - /* Create a vnode for the object just created: */ - error = hfs_getnewvnode(hfsmp, NULL, &out_desc, 0, &attr, NULL, &tvp); + /* + * Create a vnode for the object just created. + * + * The cnode is locked on successful return. 
+ */ + error = hfs_getnewvnode(hfsmp, dvp, cnp, &out_desc, 0, &attr, NULL, &tvp); if (error) goto exit; // XXXdbg - cache_enter(dvp, tvp, cnp); + //cache_enter(dvp, tvp, cnp); -#if QUOTA cp = VTOC(tvp); +#if QUOTA /* * We call hfs_chkiq with FORCE flag so that if we * fall through to the rmdir we actually have * accounted for the inode */ - if ((error = hfs_getinoquota(cp)) || - (error = hfs_chkiq(cp, 1, cnp->cn_cred, FORCE))) { - if (tvp->v_type == VDIR) - VOP_RMDIR(dvp,tvp, cnp); - else - VOP_REMOVE(dvp,tvp, cnp); - - // because VOP_RMDIR and VOP_REMOVE already - // have done the vput() - dvp = NULL; - goto exit; - } -#endif /* QUOTA */ - - /* - * restore vtype and mode for VBLK and VCHR - */ - if (vnodetype == VBLK || vnodetype == VCHR) { - struct cnode *cp; - - cp = VTOC(tvp); - cp->c_mode = mode; - tvp->v_type = IFTOVT(mode); - cp->c_flag |= C_CHANGE; - tv = time; - if ((error = VOP_UPDATE(tvp, &tv, &tv, 1))) { - vput(tvp); + if (vfs_flags(HFSTOVFS(hfsmp)) & MNT_QUOTA) { + if ((error = hfs_getinoquota(cp)) || + (error = hfs_chkiq(cp, 1, vfs_context_ucred(ctx), FORCE))) { + + if (vnode_isdir(tvp)) + (void) hfs_removedir(dvp, tvp, cnp, 0); + else { + hfs_unlock(cp); + hfs_lock_truncate(cp, TRUE); + hfs_lock(cp, HFS_FORCE_LOCK); + (void) hfs_removefile(dvp, tvp, cnp, 0, 0); + hfs_unlock_truncate(cp); + } + /* + * we successfully allocated a new vnode, but + * the quota check is telling us we're beyond + * our limit, so we need to dump our lock + reference + */ + hfs_unlock(cp); + vnode_put(tvp); + goto exit; } } +#endif /* QUOTA */ + /* Remember if any ACL data was set. */ + if (VATTR_IS_ACTIVE(vap, va_acl) && + (vap->va_acl != NULL)) { + cp->c_attr.ca_recflags |= kHFSHasSecurityMask; + cp->c_touch_chgtime = TRUE; + (void) hfs_update(tvp, TRUE); + } *vpp = tvp; exit: cat_releasedesc(&out_desc); - cat_postflight(hfsmp, &cookie, p); - - if ((cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) { - char *tmp = cnp->cn_pnbuf; - cnp->cn_pnbuf = NULL; - cnp->cn_flags &= ~HASBUF; - FREE_ZONE(tmp, cnp->cn_pnlen, M_NAMEI); + if (got_cookie) { + cat_postflight(hfsmp, &cookie, 0); } /* * Check if a file is located in the "Cleanup At Startup" @@ -3711,55 +3105,61 @@ exit: (dcp->c_desc.cd_nameptr != NULL) && (strcmp(dcp->c_desc.cd_nameptr, CARBON_TEMP_DIR_NAME) == 0)) { struct vnode *ddvp; - cnid_t parid; - parid = dcp->c_parentcnid; - vput(dvp); + hfs_unlock(dcp); dvp = NULL; /* * The parent of "Cleanup At Startup" should * have the ASCII name of the userid. 
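 */

/*
 * Editor's sketch -- how the resource fork vnode is named in
 * hfs_vgetrsrc() further below.  _PATH_RSRCFORKSPEC is assumed to be
 * the usual "/..namedfork/rsrc" suffix, so for a file named "Report":
 *
 *	sprintf(cn.cn_nameptr, "%s%s", "Report", _PATH_RSRCFORKSPEC);
 *	// yields "Report/..namedfork/rsrc"
 *
 * the same path user space can open to reach the fork.
 */

/* The parent of "Cleanup At Startup" carries the uid as its name: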
*/ - if (VFS_VGET(HFSTOVFS(hfsmp), &parid, &ddvp) == 0) { - if (VTOC(ddvp)->c_desc.cd_nameptr) { - uid_t uid; - - uid = strtoul(VTOC(ddvp)->c_desc.cd_nameptr, 0, 0); - if (uid == cp->c_uid || uid == cnp->cn_cred->cr_uid) { - cp->c_flags |= UF_NODUMP; - cp->c_flag |= C_CHANGE; - } + if (hfs_vget(hfsmp, dcp->c_parentcnid, &ddvp, 0) == 0) { + if (VTOC(ddvp)->c_desc.cd_nameptr) { + uid_t uid; + + uid = strtoul(VTOC(ddvp)->c_desc.cd_nameptr, 0, 0); + if ((uid == cp->c_uid) || + (uid == vfs_context_ucred(ctx)->cr_uid)) { + cp->c_flags |= UF_NODUMP; + cp->c_touch_chgtime = TRUE; + } } - vput(ddvp); + hfs_unlock(VTOC(ddvp)); + vnode_put(ddvp); } } - if (dvp) - vput(dvp); - - if (started_tr) { - journal_end_transaction(hfsmp->jnl); - started_tr = 0; + if (dvp) { + hfs_unlock(dcp); + } + if (error == 0 && cp != NULL) { + hfs_unlock(cp); } - if (grabbed_lock) { - hfs_global_shared_lock_release(hfsmp); - grabbed_lock = 0; + if (started_tr) { + hfs_end_transaction(hfsmp); + started_tr = 0; } return (error); } -static int -hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, struct vnode **rvpp, struct proc *p) +/* + * WARNING - assumes caller has cnode lock. + */ +__private_extern__ +int +hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, struct vnode **rvpp, __unused struct proc *p) { struct vnode *rvp; struct cnode *cp = VTOC(vp); int error; + int vid; if ((rvp = cp->c_rsrc_vp)) { + vid = vnode_vid(rvp); + /* Use existing vnode */ - error = vget(rvp, 0, p); + error = vnode_getwithvid(rvp, vid); if (error) { char * name = VTOC(vp)->c_desc.cd_nameptr; @@ -3770,23 +3170,38 @@ hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, struct vnode **rvpp, stru } } else { struct cat_fork rsrcfork; + struct componentname cn; + int lockflags; - /* Lock catalog b-tree */ - error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, p); - if (error) - return (error); + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); /* Get resource fork data */ error = cat_lookup(hfsmp, &cp->c_desc, 1, (struct cat_desc *)0, - (struct cat_attr *)0, &rsrcfork); + (struct cat_attr *)0, &rsrcfork, NULL); - /* Unlock the Catalog */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + hfs_systemfile_unlock(hfsmp, lockflags); if (error) return (error); - error = hfs_getnewvnode(hfsmp, cp, &cp->c_desc, 1, &cp->c_attr, - &rsrcfork, &rvp); + /* + * Supply hfs_getnewvnode with a component name. + */ + cn.cn_pnbuf = NULL; + if (cp->c_desc.cd_nameptr) { + MALLOC_ZONE(cn.cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = ISLASTCN | HASBUF; + cn.cn_context = NULL; + cn.cn_pnlen = MAXPATHLEN; + cn.cn_nameptr = cn.cn_pnbuf; + cn.cn_hash = 0; + cn.cn_consume = 0; + cn.cn_namelen = sprintf(cn.cn_nameptr, "%s%s", cp->c_desc.cd_nameptr, _PATH_RSRCFORKSPEC); + } + error = hfs_getnewvnode(hfsmp, vnode_parent(vp), cn.cn_pnbuf ? &cn : NULL, + &cp->c_desc, 2, &cp->c_attr, &rsrcfork, &rvp); + if (cn.cn_pnbuf) + FREE_ZONE(cn.cn_pnbuf, cn.cn_pnlen, M_NAMEI); if (error) return (error); } @@ -3800,28 +3215,34 @@ static void filt_hfsdetach(struct knote *kn) { struct vnode *vp; - int result; - struct proc *p = current_proc(); vp = (struct vnode *)kn->kn_hook; - if (1) { /* ! KNDETACH_VNLOCKED */ - result = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - if (result) return; - }; + if (vnode_getwithvid(vp, kn->kn_hookid)) + return; - result = KNOTE_DETACH(&VTOC(vp)->c_knotes, kn); - - if (1) { /* ! 
KNDETACH_VNLOCKED */ + if (hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK) == 0) { + (void) KNOTE_DETACH(&VTOC(vp)->c_knotes, kn); + hfs_unlock(VTOC(vp)); + } + } + + vnode_put(vp); } /*ARGSUSED*/ static int filt_hfsread(struct knote *kn, long hint) { - struct vnode *vp = (struct vnode *)kn->kn_fp->f_data; + struct vnode *vp = (struct vnode *)kn->kn_hook; + int dropvp = 0; + if (hint == 0) { + if ((vnode_getwithvid(vp, kn->kn_hookid) != 0)) { + hint = NOTE_REVOKE; + } else + dropvp = 1; + } if (hint == NOTE_REVOKE) { /* * filesystem is gone, so set the EOF flag and schedule @@ -3831,7 +3252,13 @@ filt_hfsread(struct knote *kn, long hint) return (1); } - kn->kn_data = VTOF(vp)->ff_size - kn->kn_fp->f_offset; + /* poll(2) semantics dictate always saying there is data */ + kn->kn_data = (!(kn->kn_flags & EV_POLL)) ? + VTOF(vp)->ff_size - kn->kn_fp->f_fglob->fg_offset : 1; + + if (dropvp) + vnode_put(vp); + return (kn->kn_data != 0); } @@ -3839,14 +3266,23 @@ filt_hfsread(struct knote *kn, long hint) static int filt_hfswrite(struct knote *kn, long hint) { + int dropvp = 0; + + if (hint == 0) { + if ((vnode_getwithvid(kn->kn_hook, kn->kn_hookid) != 0)) { + hint = NOTE_REVOKE; + } else + vnode_put(kn->kn_hook); + } if (hint == NOTE_REVOKE) { /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ + kn->kn_data = 0; kn->kn_flags |= (EV_EOF | EV_ONESHOT); + return (1); } - kn->kn_data = 0; return (1); } @@ -3855,12 +3291,19 @@ static int filt_hfsvnode(struct knote *kn, long hint) { + if (hint == 0) { + if ((vnode_getwithvid(kn->kn_hook, kn->kn_hookid) != 0)) { + hint = NOTE_REVOKE; + } else + vnode_put(kn->kn_hook); + } if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; - if (hint == NOTE_REVOKE) { - kn->kn_flags |= EV_EOF; + if ((hint == NOTE_REVOKE)) { + kn->kn_flags |= (EV_EOF | EV_ONESHOT); return (1); } + return (kn->kn_fflags != 0); } @@ -3872,35 +3315,31 @@ static struct filterops hfsvnode_filtops = { 1, NULL, filt_hfsdetach, filt_hfsvnode }; /* - # - #% kqfilt_add vp L L L - # - vop_kqfilt_add - IN struct vnode *vp; - IN struct knote *kn; - IN struct proc *p; + * Add a kqueue filter. 
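The hint == 0 branches above all follow the same vid pattern: a knote caches a raw vnode pointer, so before dereferencing it the filter must re-establish the vnode's identity. A sketch of the pattern using the KPIs from this diff (fragment only, kernel context assumed):

    /* At attach time: remember the pointer and its generation count. */
    kn->kn_hook   = (caddr_t)vp;
    kn->kn_hookid = vnode_vid(vp);

    /* At event time: the pointer alone proves nothing; the vnode may
     * have been recycled. vnode_getwithvid() takes an iocount only if
     * the saved vid still matches. */
    if (vnode_getwithvid(vp, kn->kn_hookid) != 0) {
        hint = NOTE_REVOKE;      /* identity changed: treat as revoked */
    } else {
        /* ... safe to use vp here ... */
        vnode_put(vp);           /* drop the iocount when done */
    }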
*/ static int -hfs_kqfilt_add(ap) - struct vop_kqfilt_add_args /* { +hfs_vnop_kqfiltadd( + struct vnop_kqfilt_add_args /* { struct vnode *a_vp; struct knote *a_kn; struct proc *p; - } */ *ap; + vfs_context_t a_context; + } */ *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; + int error; switch (kn->kn_filter) { case EVFILT_READ: - if (vp->v_type == VREG) { + if (vnode_isreg(vp)) { kn->kn_fop = &hfsread_filtops; } else { return EINVAL; }; break; case EVFILT_WRITE: - if (vp->v_type == VREG) { + if (vnode_isreg(vp)) { kn->kn_fop = &hfswrite_filtops; } else { return EINVAL; @@ -3914,33 +3353,27 @@ hfs_kqfilt_add(ap) } kn->kn_hook = (caddr_t)vp; + kn->kn_hookid = vnode_vid(vp); - /* simple_lock(&vp->v_pollinfo.vpi_lock); */ + if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) + return (error); KNOTE_ATTACH(&VTOC(vp)->c_knotes, kn); - /* simple_unlock(&vp->v_pollinfo.vpi_lock); */ + hfs_unlock(VTOC(vp)); return (0); } /* - # - #% kqfilt_remove vp L L L - # - vop_kqfilt_remove - IN struct vnode *vp; - IN uintptr_t ident; - IN struct proc *p; + * Remove a kqueue filter */ static int -hfs_kqfilt_remove(ap) - struct vop_kqfilt_remove_args /* { +hfs_vnop_kqfiltremove(ap) + struct vnop_kqfilt_remove_args /* { struct vnode *a_vp; uintptr_t ident; - struct proc *p; + vfs_context_t a_context; } */ *ap; { - struct vnode *vp = ap->a_vp; - uintptr_t ident = ap->a_ident; int result; result = ENOTSUP; /* XXX */ @@ -3953,18 +3386,18 @@ hfs_kqfilt_remove(ap) */ static int hfsspec_read(ap) - struct vop_read_args /* { + struct vnop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { /* * Set access flag. */ - VTOC(ap->a_vp)->c_flag |= C_ACCESS; - return (VOCALL (spec_vnodeop_p, VOFFSET(vop_read), ap)); + VTOC(ap->a_vp)->c_touch_acctime = TRUE; + return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_read), ap)); } /* @@ -3972,18 +3405,19 @@ hfsspec_read(ap) */ static int hfsspec_write(ap) - struct vop_write_args /* { + struct vnop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { /* * Set update and change flags. 
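The read/write paths in this patch stop setting C_ACCESS/C_CHANGE/C_UPDATE and instead mark cheap c_touch_* booleans that hfs_touchtimes() folds into real timestamps later. A small user-space model of that deferral (types and names are illustrative only):

    #include <stdbool.h>
    #include <time.h>

    /* Toy model of the c_touch_* scheme: I/O paths set cheap flags;
     * timestamps are applied later in one place, the way
     * hfs_touchtimes() does for HFS. */
    struct times {
        bool touch_acc, touch_mod, touch_chg;
        time_t atime, mtime, chgtime;
    };

    static void mark_read(struct times *t)  { t->touch_acc = true; }
    static void mark_write(struct times *t) { t->touch_mod = t->touch_chg = true; }

    static void touch_times(struct times *t)   /* deferred application */
    {
        time_t now = time(NULL);
        if (t->touch_acc) { t->atime   = now; t->touch_acc = false; }
        if (t->touch_mod) { t->mtime   = now; t->touch_mod = false; }
        if (t->touch_chg) { t->chgtime = now; t->touch_chg = false; }
    }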
*/ - VTOC(ap->a_vp)->c_flag |= C_CHANGE | C_UPDATE; - return (VOCALL (spec_vnodeop_p, VOFFSET(vop_write), ap)); + VTOC(ap->a_vp)->c_touch_chgtime = TRUE; + VTOC(ap->a_vp)->c_touch_modtime = TRUE; + return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_write), ap)); } /* @@ -3993,21 +3427,23 @@ hfsspec_write(ap) */ static int hfsspec_close(ap) - struct vop_close_args /* { + struct vnop_close_args /* { struct vnode *a_vp; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { struct vnode *vp = ap->a_vp; - struct cnode *cp = VTOC(vp); + struct cnode *cp; - simple_lock(&vp->v_interlock); - if (ap->a_vp->v_usecount > 1) - CTIMES(cp, &time, &time); - simple_unlock(&vp->v_interlock); - return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap)); + if (vnode_isinuse(ap->a_vp, 1)) { + if (hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK) == 0) { + cp = VTOC(vp); + hfs_touchtimes(VTOHFS(vp), cp); + hfs_unlock(cp); + } + } + return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_close), ap)); } #if FIFO @@ -4016,11 +3452,11 @@ hfsspec_close(ap) */ static int hfsfifo_read(ap) - struct vop_read_args /* { + struct vnop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { extern int (**fifo_vnodeop_p)(void *); @@ -4028,8 +3464,8 @@ hfsfifo_read(ap) /* * Set access flag. */ - VTOC(ap->a_vp)->c_flag |= C_ACCESS; - return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_read), ap)); + VTOC(ap->a_vp)->c_touch_acctime = TRUE; + return (VOCALL (fifo_vnodeop_p, VOFFSET(vnop_read), ap)); } /* @@ -4037,11 +3473,11 @@ hfsfifo_read(ap) */ static int hfsfifo_write(ap) - struct vop_write_args /* { + struct vnop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { extern int (**fifo_vnodeop_p)(void *); @@ -4049,8 +3485,9 @@ hfsfifo_write(ap) /* * Set update and change flags. 
*/ - VTOC(ap->a_vp)->c_flag |= C_CHANGE | C_UPDATE; - return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_write), ap)); + VTOC(ap->a_vp)->c_touch_chgtime = TRUE; + VTOC(ap->a_vp)->c_touch_modtime = TRUE; + return (VOCALL (fifo_vnodeop_p, VOFFSET(vnop_write), ap)); } /* @@ -4060,22 +3497,24 @@ hfsfifo_write(ap) */ static int hfsfifo_close(ap) - struct vop_close_args /* { + struct vnop_close_args /* { struct vnode *a_vp; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { extern int (**fifo_vnodeop_p)(void *); struct vnode *vp = ap->a_vp; - struct cnode *cp = VTOC(vp); + struct cnode *cp; - simple_lock(&vp->v_interlock); - if (ap->a_vp->v_usecount > 1) - CTIMES(cp, &time, &time); - simple_unlock(&vp->v_interlock); - return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap)); + if (vnode_isinuse(ap->a_vp, 1)) { + if (hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK) == 0) { + cp = VTOC(vp); + hfs_touchtimes(VTOHFS(vp), cp); + hfs_unlock(cp); + } + } + return (VOCALL (fifo_vnodeop_p, VOFFSET(vnop_close), ap)); } /* @@ -4085,14 +3524,14 @@ hfsfifo_close(ap) */ int hfsfifo_kqfilt_add(ap) - struct vop_kqfilt_add_args *ap; + struct vnop_kqfilt_add_args *ap; { extern int (**fifo_vnodeop_p)(void *); int error; - error = VOCALL(fifo_vnodeop_p, VOFFSET(vop_kqfilt_add), ap); + error = VOCALL(fifo_vnodeop_p, VOFFSET(vnop_kqfilt_add), ap); if (error) - error = hfs_kqfilt_add(ap); + error = hfs_vnop_kqfiltadd(ap); return (error); } @@ -4103,109 +3542,126 @@ hfsfifo_kqfilt_add(ap) */ int hfsfifo_kqfilt_remove(ap) - struct vop_kqfilt_remove_args *ap; + struct vnop_kqfilt_remove_args *ap; { extern int (**fifo_vnodeop_p)(void *); int error; - error = VOCALL(fifo_vnodeop_p, VOFFSET(vop_kqfilt_remove), ap); + error = VOCALL(fifo_vnodeop_p, VOFFSET(vnop_kqfilt_remove), ap); if (error) - error = hfs_kqfilt_remove(ap); + error = hfs_vnop_kqfiltremove(ap); return (error); } #endif /* FIFO */ +/* + * Synchronize a file's in-core state with that on disk. + */ +static int +hfs_vnop_fsync(ap) + struct vnop_fsync_args /* { + struct vnode *a_vp; + int a_waitfor; + vfs_context_t a_context; + } */ *ap; +{ + struct vnode* vp = ap->a_vp; + int error; + + /* + * We need to allow ENOENT lock errors since unlink + * systenm call can call VNOP_FSYNC during vclean. 
+ */ + error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK); + if (error) + return (0); + + error = hfs_fsync(vp, ap->a_waitfor, 0, vfs_context_proc(ap->a_context)); + + hfs_unlock(VTOC(vp)); + return (error); +} /***************************************************************************** * * VOP Tables * *****************************************************************************/ -int hfs_cache_lookup(); /* in hfs_lookup.c */ -int hfs_lookup(); /* in hfs_lookup.c */ -int hfs_read(); /* in hfs_readwrite.c */ -int hfs_write(); /* in hfs_readwrite.c */ -int hfs_ioctl(); /* in hfs_readwrite.c */ -int hfs_select(); /* in hfs_readwrite.c */ -int hfs_bmap(); /* in hfs_readwrite.c */ -int hfs_strategy(); /* in hfs_readwrite.c */ -int hfs_truncate(); /* in hfs_readwrite.c */ -int hfs_allocate(); /* in hfs_readwrite.c */ -int hfs_pagein(); /* in hfs_readwrite.c */ -int hfs_pageout(); /* in hfs_readwrite.c */ -int hfs_search(); /* in hfs_search.c */ -int hfs_bwrite(); /* in hfs_readwrite.c */ -int hfs_link(); /* in hfs_link.c */ -int hfs_blktooff(); /* in hfs_readwrite.c */ -int hfs_offtoblk(); /* in hfs_readwrite.c */ -int hfs_cmap(); /* in hfs_readwrite.c */ -int hfs_getattrlist(); /* in hfs_attrlist.c */ -int hfs_setattrlist(); /* in hfs_attrlist.c */ -int hfs_readdirattr(); /* in hfs_attrlist.c */ -int hfs_inactive(); /* in hfs_cnode.c */ -int hfs_reclaim(); /* in hfs_cnode.c */ +int hfs_vnop_readdirattr(struct vnop_readdirattr_args *); /* in hfs_attrlist.c */ +int hfs_vnop_inactive(struct vnop_inactive_args *); /* in hfs_cnode.c */ +int hfs_vnop_reclaim(struct vnop_reclaim_args *); /* in hfs_cnode.c */ +int hfs_vnop_link(struct vnop_link_args *); /* in hfs_link.c */ +int hfs_vnop_lookup(struct vnop_lookup_args *); /* in hfs_lookup.c */ +int hfs_vnop_search(struct vnop_searchfs_args *); /* in hfs_search.c */ + +int hfs_vnop_read(struct vnop_read_args *); /* in hfs_readwrite.c */ +int hfs_vnop_write(struct vnop_write_args *); /* in hfs_readwrite.c */ +int hfs_vnop_ioctl(struct vnop_ioctl_args *); /* in hfs_readwrite.c */ +int hfs_vnop_select(struct vnop_select_args *); /* in hfs_readwrite.c */ +int hfs_vnop_strategy(struct vnop_strategy_args *); /* in hfs_readwrite.c */ +int hfs_vnop_allocate(struct vnop_allocate_args *); /* in hfs_readwrite.c */ +int hfs_vnop_pagein(struct vnop_pagein_args *); /* in hfs_readwrite.c */ +int hfs_vnop_pageout(struct vnop_pageout_args *); /* in hfs_readwrite.c */ +int hfs_vnop_bwrite(struct vnop_bwrite_args *); /* in hfs_readwrite.c */ +int hfs_vnop_blktooff(struct vnop_blktooff_args *); /* in hfs_readwrite.c */ +int hfs_vnop_offtoblk(struct vnop_offtoblk_args *); /* in hfs_readwrite.c */ +int hfs_vnop_blockmap(struct vnop_blockmap_args *); /* in hfs_readwrite.c */ +int hfs_vnop_getxattr(struct vnop_getxattr_args *); /* in hfs_xattr.c */ +int hfs_vnop_setxattr(struct vnop_setxattr_args *); /* in hfs_xattr.c */ +int hfs_vnop_removexattr(struct vnop_removexattr_args *); /* in hfs_xattr.c */ +int hfs_vnop_listxattr(struct vnop_listxattr_args *); /* in hfs_xattr.c */ int (**hfs_vnodeop_p)(void *); #define VOPFUNC int (*)(void *) struct vnodeopv_entry_desc hfs_vnodeop_entries[] = { - { &vop_default_desc, (VOPFUNC)vn_default_error }, - { &vop_lookup_desc, (VOPFUNC)hfs_cache_lookup }, /* lookup */ - { &vop_create_desc, (VOPFUNC)hfs_create }, /* create */ - { &vop_mknod_desc, (VOPFUNC)hfs_mknod }, /* mknod */ - { &vop_open_desc, (VOPFUNC)hfs_open }, /* open */ - { &vop_close_desc, (VOPFUNC)hfs_close }, /* close */ - { &vop_access_desc, (VOPFUNC)hfs_access }, /* access */ - { 
&vop_getattr_desc, (VOPFUNC)hfs_getattr }, /* getattr */ - { &vop_setattr_desc, (VOPFUNC)hfs_setattr }, /* setattr */ - { &vop_read_desc, (VOPFUNC)hfs_read }, /* read */ - { &vop_write_desc, (VOPFUNC)hfs_write }, /* write */ - { &vop_ioctl_desc, (VOPFUNC)hfs_ioctl }, /* ioctl */ - { &vop_select_desc, (VOPFUNC)hfs_select }, /* select */ - { &vop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */ - { &vop_exchange_desc, (VOPFUNC)hfs_exchange }, /* exchange */ - { &vop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ - { &vop_fsync_desc, (VOPFUNC)hfs_fsync }, /* fsync */ - { &vop_seek_desc, (VOPFUNC)nop_seek }, /* seek */ - { &vop_remove_desc, (VOPFUNC)hfs_remove }, /* remove */ - { &vop_link_desc, (VOPFUNC)hfs_link }, /* link */ - { &vop_rename_desc, (VOPFUNC)hfs_rename }, /* rename */ - { &vop_mkdir_desc, (VOPFUNC)hfs_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (VOPFUNC)hfs_rmdir }, /* rmdir */ - { &vop_mkcomplex_desc, (VOPFUNC)err_mkcomplex }, /* mkcomplex */ - { &vop_getattrlist_desc, (VOPFUNC)hfs_getattrlist }, /* getattrlist */ - { &vop_setattrlist_desc, (VOPFUNC)hfs_setattrlist }, /* setattrlist */ - { &vop_symlink_desc, (VOPFUNC)hfs_symlink }, /* symlink */ - { &vop_readdir_desc, (VOPFUNC)hfs_readdir }, /* readdir */ - { &vop_readdirattr_desc, (VOPFUNC)hfs_readdirattr }, /* readdirattr */ - { &vop_readlink_desc, (VOPFUNC)hfs_readlink }, /* readlink */ - { &vop_abortop_desc, (VOPFUNC)nop_abortop }, /* abortop */ - { &vop_inactive_desc, (VOPFUNC)hfs_inactive }, /* inactive */ - { &vop_reclaim_desc, (VOPFUNC)hfs_reclaim }, /* reclaim */ - { &vop_lock_desc, (VOPFUNC)hfs_lock }, /* lock */ - { &vop_unlock_desc, (VOPFUNC)hfs_unlock }, /* unlock */ - { &vop_bmap_desc, (VOPFUNC)hfs_bmap }, /* bmap */ - { &vop_strategy_desc, (VOPFUNC)hfs_strategy }, /* strategy */ - { &vop_print_desc, (VOPFUNC)hfs_print }, /* print */ - { &vop_islocked_desc, (VOPFUNC)hfs_islocked }, /* islocked */ - { &vop_pathconf_desc, (VOPFUNC)hfs_pathconf }, /* pathconf */ - { &vop_advlock_desc, (VOPFUNC)hfs_advlock }, /* advlock */ - { &vop_reallocblks_desc, (VOPFUNC)err_reallocblks }, /* reallocblks */ - { &vop_truncate_desc, (VOPFUNC)hfs_truncate }, /* truncate */ - { &vop_allocate_desc, (VOPFUNC)hfs_allocate }, /* allocate */ - { &vop_update_desc, (VOPFUNC)hfs_update }, /* update */ - { &vop_searchfs_desc, (VOPFUNC)hfs_search }, /* search fs */ - { &vop_bwrite_desc, (VOPFUNC)hfs_bwrite }, /* bwrite */ - { &vop_pagein_desc, (VOPFUNC)hfs_pagein }, /* pagein */ - { &vop_pageout_desc,(VOPFUNC) hfs_pageout }, /* pageout */ - { &vop_copyfile_desc, (VOPFUNC)err_copyfile }, /* copyfile */ - { &vop_blktooff_desc, (VOPFUNC)hfs_blktooff }, /* blktooff */ - { &vop_offtoblk_desc, (VOPFUNC)hfs_offtoblk }, /* offtoblk */ - { &vop_cmap_desc, (VOPFUNC)hfs_cmap }, /* cmap */ - { &vop_kqfilt_add_desc, (VOPFUNC)hfs_kqfilt_add }, /* kqfilt_add */ - { &vop_kqfilt_remove_desc, (VOPFUNC)hfs_kqfilt_remove }, /* kqfilt_remove */ + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)hfs_vnop_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)hfs_vnop_create }, /* create */ + { &vnop_mknod_desc, (VOPFUNC)hfs_vnop_mknod }, /* mknod */ + { &vnop_open_desc, (VOPFUNC)hfs_vnop_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)hfs_vnop_close }, /* close */ + { &vnop_getattr_desc, (VOPFUNC)hfs_vnop_getattr }, /* getattr */ + { &vnop_setattr_desc, (VOPFUNC)hfs_vnop_setattr }, /* setattr */ + { &vnop_read_desc, (VOPFUNC)hfs_vnop_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)hfs_vnop_write }, /* write */ + { &vnop_ioctl_desc, 
(VOPFUNC)hfs_vnop_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)hfs_vnop_select }, /* select */ + { &vnop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */ + { &vnop_exchange_desc, (VOPFUNC)hfs_vnop_exchange }, /* exchange */ + { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)hfs_vnop_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)hfs_vnop_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)hfs_vnop_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)hfs_vnop_rename }, /* rename */ + { &vnop_mkdir_desc, (VOPFUNC)hfs_vnop_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)hfs_vnop_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)hfs_vnop_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)hfs_vnop_readdir }, /* readdir */ + { &vnop_readdirattr_desc, (VOPFUNC)hfs_vnop_readdirattr }, /* readdirattr */ + { &vnop_readlink_desc, (VOPFUNC)hfs_vnop_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)hfs_vnop_inactive }, /* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)hfs_vnop_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)hfs_vnop_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)hfs_vnop_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ + { &vnop_allocate_desc, (VOPFUNC)hfs_vnop_allocate }, /* allocate */ + { &vnop_searchfs_desc, (VOPFUNC)hfs_vnop_search }, /* search fs */ + { &vnop_bwrite_desc, (VOPFUNC)hfs_vnop_bwrite }, /* bwrite */ + { &vnop_pagein_desc, (VOPFUNC)hfs_vnop_pagein }, /* pagein */ + { &vnop_pageout_desc,(VOPFUNC) hfs_vnop_pageout }, /* pageout */ + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* copyfile */ + { &vnop_blktooff_desc, (VOPFUNC)hfs_vnop_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)hfs_vnop_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (VOPFUNC)hfs_vnop_blockmap }, /* blockmap */ + { &vnop_kqfilt_add_desc, (VOPFUNC)hfs_vnop_kqfiltadd }, /* kqfilt_add */ + { &vnop_kqfilt_remove_desc, (VOPFUNC)hfs_vnop_kqfiltremove }, /* kqfilt_remove */ + { &vnop_getxattr_desc, (VOPFUNC)hfs_vnop_getxattr}, + { &vnop_setxattr_desc, (VOPFUNC)hfs_vnop_setxattr}, + { &vnop_removexattr_desc, (VOPFUNC)hfs_vnop_removexattr}, + { &vnop_listxattr_desc, (VOPFUNC)hfs_vnop_listxattr}, { NULL, (VOPFUNC)NULL } }; @@ -4214,57 +3670,41 @@ struct vnodeopv_desc hfs_vnodeop_opv_desc = int (**hfs_specop_p)(void *); struct vnodeopv_entry_desc hfs_specop_entries[] = { - { &vop_default_desc, (VOPFUNC)vn_default_error }, - { &vop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */ - { &vop_create_desc, (VOPFUNC)spec_create }, /* create */ - { &vop_mknod_desc, (VOPFUNC)spec_mknod }, /* mknod */ - { &vop_open_desc, (VOPFUNC)spec_open }, /* open */ - { &vop_close_desc, (VOPFUNC)hfsspec_close }, /* close */ - { &vop_access_desc, (VOPFUNC)hfs_access }, /* access */ - { &vop_getattr_desc, (VOPFUNC)hfs_getattr }, /* getattr */ - { &vop_setattr_desc, (VOPFUNC)hfs_setattr }, /* setattr */ - { &vop_read_desc, (VOPFUNC)hfsspec_read }, /* read */ - { &vop_write_desc, (VOPFUNC)hfsspec_write }, /* write */ - { &vop_lease_desc, (VOPFUNC)spec_lease_check }, /* lease */ - { &vop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */ - { &vop_select_desc, (VOPFUNC)spec_select }, /* select */ - { &vop_revoke_desc, (VOPFUNC)spec_revoke }, /* revoke */ - { &vop_mmap_desc, (VOPFUNC)spec_mmap }, /* mmap */ - { &vop_fsync_desc, (VOPFUNC)hfs_fsync }, /* fsync */ - { &vop_seek_desc, (VOPFUNC)spec_seek }, /* seek */ - { &vop_remove_desc, (VOPFUNC)spec_remove }, /* remove */ - { &vop_link_desc, 
(VOPFUNC)spec_link }, /* link */ - { &vop_rename_desc, (VOPFUNC)spec_rename }, /* rename */ - { &vop_mkdir_desc, (VOPFUNC)spec_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (VOPFUNC)spec_rmdir }, /* rmdir */ - { &vop_getattrlist_desc, (VOPFUNC)hfs_getattrlist }, - { &vop_symlink_desc, (VOPFUNC)spec_symlink }, /* symlink */ - { &vop_readdir_desc, (VOPFUNC)spec_readdir }, /* readdir */ - { &vop_readlink_desc, (VOPFUNC)spec_readlink }, /* readlink */ - { &vop_abortop_desc, (VOPFUNC)spec_abortop }, /* abortop */ - { &vop_inactive_desc, (VOPFUNC)hfs_inactive }, /* inactive */ - { &vop_reclaim_desc, (VOPFUNC)hfs_reclaim }, /* reclaim */ - { &vop_lock_desc, (VOPFUNC)hfs_lock }, /* lock */ - { &vop_unlock_desc, (VOPFUNC)hfs_unlock }, /* unlock */ - { &vop_bmap_desc, (VOPFUNC)spec_bmap }, /* bmap */ - { &vop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */ - { &vop_print_desc, (VOPFUNC)hfs_print }, /* print */ - { &vop_islocked_desc, (VOPFUNC)hfs_islocked }, /* islocked */ - { &vop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */ - { &vop_advlock_desc, (VOPFUNC)spec_advlock }, /* advlock */ - { &vop_blkatoff_desc, (VOPFUNC)spec_blkatoff }, /* blkatoff */ - { &vop_valloc_desc, (VOPFUNC)spec_valloc }, /* valloc */ - { &vop_reallocblks_desc, (VOPFUNC)spec_reallocblks }, /* reallocblks */ - { &vop_vfree_desc, (VOPFUNC)err_vfree }, /* vfree */ - { &vop_truncate_desc, (VOPFUNC)spec_truncate }, /* truncate */ - { &vop_update_desc, (VOPFUNC)hfs_update }, /* update */ - { &vop_bwrite_desc, (VOPFUNC)hfs_bwrite }, - { &vop_devblocksize_desc, (VOPFUNC)spec_devblocksize }, /* devblocksize */ - { &vop_pagein_desc, (VOPFUNC)hfs_pagein }, /* Pagein */ - { &vop_pageout_desc, (VOPFUNC)hfs_pageout }, /* Pageout */ - { &vop_copyfile_desc, (VOPFUNC)err_copyfile }, /* copyfile */ - { &vop_blktooff_desc, (VOPFUNC)hfs_blktooff }, /* blktooff */ - { &vop_offtoblk_desc, (VOPFUNC)hfs_offtoblk }, /* offtoblk */ + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)spec_create }, /* create */ + { &vnop_mknod_desc, (VOPFUNC)spec_mknod }, /* mknod */ + { &vnop_open_desc, (VOPFUNC)spec_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)hfsspec_close }, /* close */ + { &vnop_getattr_desc, (VOPFUNC)hfs_vnop_getattr }, /* getattr */ + { &vnop_setattr_desc, (VOPFUNC)hfs_vnop_setattr }, /* setattr */ + { &vnop_read_desc, (VOPFUNC)hfsspec_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)hfsspec_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)spec_select }, /* select */ + { &vnop_revoke_desc, (VOPFUNC)spec_revoke }, /* revoke */ + { &vnop_mmap_desc, (VOPFUNC)spec_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)hfs_vnop_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)spec_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)spec_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)spec_rename }, /* rename */ + { &vnop_mkdir_desc, (VOPFUNC)spec_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)spec_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)spec_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)spec_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)spec_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)hfs_vnop_inactive }, /* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)hfs_vnop_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */ 
+ { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ + { &vnop_bwrite_desc, (VOPFUNC)hfs_vnop_bwrite }, + { &vnop_devblocksize_desc, (VOPFUNC)spec_devblocksize }, /* devblocksize */ + { &vnop_pagein_desc, (VOPFUNC)hfs_vnop_pagein }, /* Pagein */ + { &vnop_pageout_desc, (VOPFUNC)hfs_vnop_pageout }, /* Pageout */ + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* copyfile */ + { &vnop_blktooff_desc, (VOPFUNC)hfs_vnop_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)hfs_vnop_offtoblk }, /* offtoblk */ { (struct vnodeop_desc*)NULL, (VOPFUNC)NULL } }; struct vnodeopv_desc hfs_specop_opv_desc = @@ -4273,59 +3713,43 @@ struct vnodeopv_desc hfs_specop_opv_desc = #if FIFO int (**hfs_fifoop_p)(void *); struct vnodeopv_entry_desc hfs_fifoop_entries[] = { - { &vop_default_desc, (VOPFUNC)vn_default_error }, - { &vop_lookup_desc, (VOPFUNC)fifo_lookup }, /* lookup */ - { &vop_create_desc, (VOPFUNC)fifo_create }, /* create */ - { &vop_mknod_desc, (VOPFUNC)fifo_mknod }, /* mknod */ - { &vop_open_desc, (VOPFUNC)fifo_open }, /* open */ - { &vop_close_desc, (VOPFUNC)hfsfifo_close }, /* close */ - { &vop_access_desc, (VOPFUNC)hfs_access }, /* access */ - { &vop_getattr_desc, (VOPFUNC)hfs_getattr }, /* getattr */ - { &vop_setattr_desc, (VOPFUNC)hfs_setattr }, /* setattr */ - { &vop_read_desc, (VOPFUNC)hfsfifo_read }, /* read */ - { &vop_write_desc, (VOPFUNC)hfsfifo_write }, /* write */ - { &vop_lease_desc, (VOPFUNC)fifo_lease_check }, /* lease */ - { &vop_ioctl_desc, (VOPFUNC)fifo_ioctl }, /* ioctl */ - { &vop_select_desc, (VOPFUNC)fifo_select }, /* select */ - { &vop_revoke_desc, (VOPFUNC)fifo_revoke }, /* revoke */ - { &vop_mmap_desc, (VOPFUNC)fifo_mmap }, /* mmap */ - { &vop_fsync_desc, (VOPFUNC)hfs_fsync }, /* fsync */ - { &vop_seek_desc, (VOPFUNC)fifo_seek }, /* seek */ - { &vop_remove_desc, (VOPFUNC)fifo_remove }, /* remove */ - { &vop_link_desc, (VOPFUNC)fifo_link }, /* link */ - { &vop_rename_desc, (VOPFUNC)fifo_rename }, /* rename */ - { &vop_mkdir_desc, (VOPFUNC)fifo_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (VOPFUNC)fifo_rmdir }, /* rmdir */ - { &vop_getattrlist_desc, (VOPFUNC)hfs_getattrlist }, - { &vop_symlink_desc, (VOPFUNC)fifo_symlink }, /* symlink */ - { &vop_readdir_desc, (VOPFUNC)fifo_readdir }, /* readdir */ - { &vop_readlink_desc, (VOPFUNC)fifo_readlink }, /* readlink */ - { &vop_abortop_desc, (VOPFUNC)fifo_abortop }, /* abortop */ - { &vop_inactive_desc, (VOPFUNC)hfs_inactive }, /* inactive */ - { &vop_reclaim_desc, (VOPFUNC)hfs_reclaim }, /* reclaim */ - { &vop_lock_desc, (VOPFUNC)hfs_lock }, /* lock */ - { &vop_unlock_desc, (VOPFUNC)hfs_unlock }, /* unlock */ - { &vop_bmap_desc, (VOPFUNC)fifo_bmap }, /* bmap */ - { &vop_strategy_desc, (VOPFUNC)fifo_strategy }, /* strategy */ - { &vop_print_desc, (VOPFUNC)hfs_print }, /* print */ - { &vop_islocked_desc, (VOPFUNC)hfs_islocked }, /* islocked */ - { &vop_pathconf_desc, (VOPFUNC)fifo_pathconf }, /* pathconf */ - { &vop_advlock_desc, (VOPFUNC)fifo_advlock }, /* advlock */ - { &vop_blkatoff_desc, (VOPFUNC)fifo_blkatoff }, /* blkatoff */ - { &vop_valloc_desc, (VOPFUNC)fifo_valloc }, /* valloc */ - { &vop_reallocblks_desc, (VOPFUNC)fifo_reallocblks }, /* reallocblks */ - { &vop_vfree_desc, (VOPFUNC)err_vfree }, /* vfree */ - { &vop_truncate_desc, (VOPFUNC)fifo_truncate }, /* truncate */ - { &vop_update_desc, (VOPFUNC)hfs_update }, /* update */ - { &vop_bwrite_desc, (VOPFUNC)hfs_bwrite }, - { &vop_pagein_desc, (VOPFUNC)hfs_pagein }, /* Pagein */ - { &vop_pageout_desc, (VOPFUNC)hfs_pageout }, /* Pageout */ - { 
&vop_copyfile_desc, (VOPFUNC)err_copyfile }, /* copyfile */ - { &vop_blktooff_desc, (VOPFUNC)hfs_blktooff }, /* blktooff */ - { &vop_offtoblk_desc, (VOPFUNC)hfs_offtoblk }, /* offtoblk */ - { &vop_cmap_desc, (VOPFUNC)hfs_cmap }, /* cmap */ - { &vop_kqfilt_add_desc, (VOPFUNC)hfsfifo_kqfilt_add }, /* kqfilt_add */ - { &vop_kqfilt_remove_desc, (VOPFUNC)hfsfifo_kqfilt_remove }, /* kqfilt_remove */ + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)fifo_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)fifo_create }, /* create */ + { &vnop_mknod_desc, (VOPFUNC)fifo_mknod }, /* mknod */ + { &vnop_open_desc, (VOPFUNC)fifo_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)hfsfifo_close }, /* close */ + { &vnop_getattr_desc, (VOPFUNC)hfs_vnop_getattr }, /* getattr */ + { &vnop_setattr_desc, (VOPFUNC)hfs_vnop_setattr }, /* setattr */ + { &vnop_read_desc, (VOPFUNC)hfsfifo_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)hfsfifo_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)fifo_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)fifo_select }, /* select */ + { &vnop_revoke_desc, (VOPFUNC)fifo_revoke }, /* revoke */ + { &vnop_mmap_desc, (VOPFUNC)fifo_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)hfs_vnop_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)fifo_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)fifo_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)fifo_rename }, /* rename */ + { &vnop_mkdir_desc, (VOPFUNC)fifo_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)fifo_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)fifo_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)fifo_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)fifo_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)hfs_vnop_inactive }, /* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)hfs_vnop_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)fifo_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)fifo_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ + { &vnop_bwrite_desc, (VOPFUNC)hfs_vnop_bwrite }, + { &vnop_pagein_desc, (VOPFUNC)hfs_vnop_pagein }, /* Pagein */ + { &vnop_pageout_desc, (VOPFUNC)hfs_vnop_pageout }, /* Pageout */ + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* copyfile */ + { &vnop_blktooff_desc, (VOPFUNC)hfs_vnop_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)hfs_vnop_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (VOPFUNC)hfs_vnop_blockmap }, /* blockmap */ + { &vnop_kqfilt_add_desc, (VOPFUNC)hfsfifo_kqfilt_add }, /* kqfilt_add */ + { &vnop_kqfilt_remove_desc, (VOPFUNC)hfsfifo_kqfilt_remove }, /* kqfilt_remove */ { (struct vnodeop_desc*)NULL, (VOPFUNC)NULL } }; struct vnodeopv_desc hfs_fifoop_opv_desc = diff --git a/bsd/hfs/hfs_xattr.c b/bsd/hfs/hfs_xattr.c new file mode 100644 index 000000000..5030db050 --- /dev/null +++ b/bsd/hfs/hfs_xattr.c @@ -0,0 +1,1062 @@ +/* + * Copyright (c) 2004-2005 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. 
+ * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/utfconv.h> +#include <sys/vnode.h> +#include <sys/xattr.h> + +#include "hfs.h" +#include "hfs_cnode.h" +#include "hfs_mount.h" +#include "hfs_format.h" +#include "hfs_endian.h" + +#include "hfscommon/headers/BTreesInternal.h" + + +#define ATTRIBUTE_FILE_NODE_SIZE 8192 + + +/* State information for the listattr_callback callback function. */ +struct listattr_callback_state { + u_int32_t fileID; + int result; + uio_t uio; + size_t size; +}; + +#define HFS_MAXATTRIBUTESIZE (1024*1024) + +/* HFS Internal Names */ +#define XATTR_EXTENDEDSECURITY_NAME "system.extendedsecurity" + + +#define RESOURCE_FORK_EXISTS(VP) \ + ((VTOC((VP))->c_blocks - VTOF((VP))->ff_blocks) > 0) + +static u_int32_t emptyfinfo[8] = {0}; + + +extern int hfs_create_attr_btree(struct hfsmount *hfsmp, uint32_t nodesize, uint32_t nodecnt); + + +int hfs_vnop_getxattr(struct vnop_getxattr_args *ap); +int hfs_vnop_setxattr(struct vnop_setxattr_args *ap); +int hfs_vnop_removexattr(struct vnop_removexattr_args *ap); +int hfs_vnop_listxattr(struct vnop_listxattr_args *ap); +int hfs_attrkeycompare(HFSPlusAttrKey *searchKey, HFSPlusAttrKey *trialKey); + + + +static int listattr_callback(const HFSPlusAttrKey *key, const HFSPlusAttrData *data, + struct listattr_callback_state *state); + +static int buildkey(u_int32_t fileID, const char *attrname, HFSPlusAttrKey *key); + +static int getnodecount(struct hfsmount *hfsmp, size_t nodesize); + +static size_t getmaxinlineattrsize(struct vnode * attrvp); + +/* + * Retrieve the data of an extended attribute. + */ +__private_extern__ +int +hfs_vnop_getxattr(struct vnop_getxattr_args *ap) +/* + struct vnop_getxattr_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + char * a_name; + uio_t a_uio; + size_t *a_size; + int a_options; + vfs_context_t a_context; + }; +*/ +{ + struct vnode *vp = ap->a_vp; + struct hfsmount *hfsmp; + uio_t uio = ap->a_uio; + struct BTreeIterator * iterator = NULL; + struct filefork *btfile; + FSBufferDescriptor btdata; + HFSPlusAttrData * datap = NULL; + size_t bufsize; + UInt16 datasize; + int lockflags; + int result; + + if (ap->a_name == NULL || ap->a_name[0] == '\0') { + return (EINVAL); /* invalid name */ + } + hfsmp = VTOHFS(vp); + + if (!VNODE_IS_RSRC(vp)) { + /* Get the Finder Info. */ + if (bcmp(ap->a_name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME)) == 0) { + bufsize = 32; + + /* If Finder Info is empty then it doesn't exist. */ + if (bcmp(VTOC(vp)->c_finderinfo, emptyfinfo, sizeof(emptyfinfo)) == 0) { + return (ENOATTR); + } + if (uio == NULL) { + *ap->a_size = bufsize; + return (0); + } + if (uio_resid(uio) < bufsize) + return (ERANGE); + + result = uiomove((caddr_t) &VTOC(vp)->c_finderinfo , bufsize, uio); + + return (result); + } + /* Read the Resource Fork. 
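That Finder Info fast path is what ultimately serves a user-space getxattr(2) of com.apple.FinderInfo: a fixed 32-byte blob copied straight out of the cnode, with no b-tree I/O. A runnable illustration (the path is a placeholder):

    #include <sys/xattr.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned char finfo[32];
        ssize_t n = getxattr("/tmp/somefile", XATTR_FINDERINFO_NAME,
                             finfo, sizeof (finfo), 0, 0);
        if (n < 0)
            perror("getxattr");  /* ENOATTR when the 32 bytes are all zero */
        else
            printf("got %zd bytes of Finder Info\n", n);
        return 0;
    }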
*/ + if (bcmp(ap->a_name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME)) == 0) { + struct vnode *rvp = NULL; + + if ( !vnode_isreg(vp) ) { + return (EPERM); + } + if ( !RESOURCE_FORK_EXISTS(vp)) { + return (ENOATTR); + } + if ((result = hfs_vgetrsrc(hfsmp, vp, &rvp, vfs_context_proc(ap->a_context)))) { + return (result); + } + if (uio == NULL) { + *ap->a_size = (size_t)VTOF(rvp)->ff_size; + } else { + result = VNOP_READ(rvp, uio, 0, ap->a_context); + } + vnode_put(rvp); + return (result); + } + } + /* + * Standard HFS only supports native FinderInfo and Resource Forks. + */ + if (hfsmp->hfs_flags & HFS_STANDARD) { + return (EPERM); + } + /* Bail if we don't have any extended attributes. */ + if ((hfsmp->hfs_attribute_vp == NULL) || + (VTOC(vp)->c_attr.ca_recflags & kHFSHasAttributesMask) == 0) { + return (ENOATTR); + } + btfile = VTOF(hfsmp->hfs_attribute_vp); + + MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); + bzero(iterator, sizeof(*iterator)); + + bufsize = sizeof(HFSPlusAttrData) - 2; + if (uio) + bufsize += uio_resid(uio); + MALLOC(datap, HFSPlusAttrData *, bufsize, M_TEMP, M_WAITOK); + btdata.bufferAddress = datap; + btdata.itemSize = bufsize; + btdata.itemCount = 1; + + result = buildkey(VTOC(vp)->c_fileid, ap->a_name, (HFSPlusAttrKey *)&iterator->key); + if (result) + goto exit; + + /* Lookup the attribute. */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); + result = BTSearchRecord(btfile, iterator, &btdata, &datasize, NULL); + hfs_systemfile_unlock(hfsmp, lockflags); + + if (result) { + if (result == btNotFound) + result = ENOATTR; + goto exit; + } + + *ap->a_size = datap->attrSize; + + /* Copy out the attribute data. */ + if (uio) { + if (datap->attrSize > uio_resid(uio)) + result = ERANGE; + else + result = uiomove((caddr_t) &datap->attrData , datap->attrSize, uio); + } +exit: + FREE(datap, M_TEMP); + FREE(iterator, M_TEMP); + + return MacToVFSError(result); +} + +/* + * Set the data of an extended attribute. + */ +__private_extern__ +int +hfs_vnop_setxattr(struct vnop_setxattr_args *ap) +/* + struct vnop_setxattr_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + char * a_name; + uio_t a_uio; + int a_options; + vfs_context_t a_context; + }; +*/ +{ + struct vnode *vp = ap->a_vp; + struct hfsmount *hfsmp; + uio_t uio = ap->a_uio; + struct BTreeIterator * iterator = NULL; + struct filefork *btfile; + size_t attrsize; + FSBufferDescriptor btdata; + HFSPlusAttrData * datap = NULL; + UInt16 datasize; + int lockflags; + int result; + + if (ap->a_name == NULL || ap->a_name[0] == '\0') { + return (EINVAL); /* invalid name */ + } + hfsmp = VTOHFS(vp); + if (VNODE_IS_RSRC(vp)) { + return (EPERM); + } + /* Set the Finder Info. */ + if (bcmp(ap->a_name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME)) == 0) { + attrsize = 32; + + if (bcmp(VTOC(vp)->c_finderinfo, emptyfinfo, sizeof(emptyfinfo))) { + /* attr exists and "create" was specified. */ + if (ap->a_options & XATTR_CREATE) { + return (EEXIST); + } + } else { + /* attr doesn't exists and "replace" was specified. */ + if (ap->a_options & XATTR_REPLACE) { + return (ENOATTR); + } + } + if (uio_resid(uio) != attrsize) + return (ERANGE); + + result = uiomove((caddr_t) &VTOC(vp)->c_finderinfo , attrsize, uio); + if (result == 0) { + VTOC(vp)->c_touch_chgtime = TRUE; + VTOC(vp)->c_flag |= C_MODIFIED; + result = hfs_update(vp, FALSE); + } + return (result); + } + /* Write the Resource Fork. 
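The EEXIST/ENOATTR checks above implement the standard xattr flag semantics as seen from user space; for reference (path and attribute name are placeholders):

    #include <sys/xattr.h>
    #include <stdio.h>

    int main(void)
    {
        const char value[] = "hello";

        /* fails with EEXIST if "user.note" is already present */
        if (setxattr("/tmp/somefile", "user.note", value, sizeof value,
                     0, XATTR_CREATE) != 0)
            perror("XATTR_CREATE");

        /* fails with ENOATTR if "user.note" is not present yet */
        if (setxattr("/tmp/somefile", "user.note", value, sizeof value,
                     0, XATTR_REPLACE) != 0)
            perror("XATTR_REPLACE");
        return 0;
    }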
*/ + if (bcmp(ap->a_name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME)) == 0) { + struct vnode *rvp = NULL; + + if (!vnode_isreg(vp)) { + return (EPERM); + } + if (RESOURCE_FORK_EXISTS(vp)) { + /* attr exists and "create" was specified. */ + if (ap->a_options & XATTR_CREATE) { + return (EEXIST); + } + } else { + /* attr doesn't exists and "replace" was specified. */ + if (ap->a_options & XATTR_REPLACE) { + return (ENOATTR); + } + } + if ((result = hfs_vgetrsrc(hfsmp, vp, &rvp, vfs_context_proc(ap->a_context)))) { + return (result); + } + result = VNOP_WRITE(rvp, uio, 0, ap->a_context); + vnode_put(rvp); + return (result); + } + /* + * Standard HFS only supports native FinderInfo and Resource Forks. + */ + if (hfsmp->hfs_flags & HFS_STANDARD) { + return (EPERM); + } + if (hfsmp->hfs_max_inline_attrsize == 0) { + hfsmp->hfs_max_inline_attrsize = getmaxinlineattrsize(hfsmp->hfs_attribute_vp); + } + attrsize = uio_resid(uio); + if (attrsize > hfsmp->hfs_max_inline_attrsize) { + /* + * XXX Need to support extent-based attributes XXX + */ + return (E2BIG); + } + /* Calculate size of record rounded up to multiple of 2 bytes. */ + datasize = sizeof(HFSPlusAttrData) - 2 + attrsize + ((attrsize & 1) ? 1 : 0); + + MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); + bzero(iterator, sizeof(*iterator)); + + MALLOC(datap, HFSPlusAttrData *, datasize, M_TEMP, M_WAITOK); + btdata.bufferAddress = datap; + btdata.itemSize = datasize; + btdata.itemCount = 1; + datap->recordType = kHFSPlusAttrInlineData; + datap->reserved[0] = 0; + datap->reserved[1] = 0; + datap->attrSize = attrsize; + + /* Copy in the attribute data. */ + result = uiomove((caddr_t) &datap->attrData , attrsize, uio); + if (result) { + goto exit2; + } + /* Build a b-tree key. */ + result = buildkey(VTOC(vp)->c_fileid, ap->a_name, (HFSPlusAttrKey *)&iterator->key); + if (result) { + goto exit2; + } + /* Start a transaction for our changes. */ + if (hfs_start_transaction(hfsmp) != 0) { + result = EINVAL; + goto exit2; + } + + /* once we started the transaction, nobody can compete with us, so make sure this file is still there */ + struct cnode *cp; + cp = VTOC(vp); + if (cp->c_flag & C_NOEXISTS) { /* this file has already been removed */ + result = ENOENT; + goto exit1; + } + + /* + * If there isn't an attributes b-tree then create one. + */ + if (hfsmp->hfs_attribute_vp == NULL) { + lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); + result = hfs_create_attr_btree(hfsmp, ATTRIBUTE_FILE_NODE_SIZE, + getnodecount(hfsmp, ATTRIBUTE_FILE_NODE_SIZE)); + hfs_systemfile_unlock(hfsmp, lockflags); + if (result) { + goto exit1; + } + } + btfile = VTOF(hfsmp->hfs_attribute_vp); + + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); + + if (ap->a_options & XATTR_REPLACE) { + result = BTReplaceRecord(btfile, iterator, &btdata, datasize); + if (result) + goto exit0; + else + goto exit; + } + + /* Insert the attribute. 
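The record sizing above packs the payload directly behind the fixed header: the declared attrData[2] overlaps the first two payload bytes (hence the "- 2") and the total is padded to a 2-byte boundary. A worked example, assuming the packed 18-byte HFSPlusAttrData layout of the HFS+ format:

    #include <stdio.h>

    #define ATTR_DATA_HDR 18   /* packed sizeof(HFSPlusAttrData) on disk */

    static unsigned record_size(unsigned attrsize)
    {
        return ATTR_DATA_HDR - 2 + attrsize + ((attrsize & 1) ? 1 : 0);
    }

    int main(void)
    {
        printf("%u\n", record_size(5));   /* 18 - 2 + 5 + 1 = 22 */
        printf("%u\n", record_size(6));   /* 18 - 2 + 6 + 0 = 22 */
        return 0;
    }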
*/ + result = BTInsertRecord(btfile, iterator, &btdata, datasize); + if (result) { + if (result != btExists) { + goto exit0; + } + + // if it exists and XATTR_CREATE was specified, + // the spec says to return EEXIST + if (ap->a_options & XATTR_CREATE) { + result = EEXIST; + goto exit0; + } + /* XXX need to account for old size in c_attrblks */ + result = BTReplaceRecord(btfile, iterator, &btdata, datasize); + } +exit: + (void) BTFlushPath(btfile); +exit0: + hfs_systemfile_unlock(hfsmp, lockflags); + if (result == 0) { + struct cnode * cp; + + cp = VTOC(vp); + cp->c_touch_chgtime = TRUE; + if ((cp->c_attr.ca_recflags & kHFSHasAttributesMask) == 0) { + cp->c_attr.ca_recflags |= kHFSHasAttributesMask; + (void) hfs_update(vp, 0); + } + HFS_KNOTE(vp, NOTE_ATTRIB); + } +exit1: + /* Finish the transaction of our changes. */ + hfs_end_transaction(hfsmp); +exit2: + FREE(datap, M_TEMP); + FREE(iterator, M_TEMP); + + if (result == btNotFound) + result = ENOATTR; + else + result = MacToVFSError(result); + + return (result); +} + +/* + * Remove an extended attribute. + */ +__private_extern__ +int +hfs_vnop_removexattr(struct vnop_removexattr_args *ap) +/* + struct vnop_removexattr_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + char * a_name; + int a_options; + vfs_context_t a_context; + }; +*/ +{ + struct vnode *vp = ap->a_vp; + struct hfsmount *hfsmp; + struct BTreeIterator * iterator = NULL; + struct filefork *btfile; + struct proc *p = vfs_context_proc(ap->a_context); + FSBufferDescriptor btdata; + HFSPlusAttrData attrdata; + int lockflags; + int result; + + if (ap->a_name == NULL || ap->a_name[0] == '\0') { + return (EINVAL); /* invalid name */ + } + hfsmp = VTOHFS(vp); + if (VNODE_IS_RSRC(vp)) { + return (EPERM); + } + + /* If Resource Fork is non-empty then truncate it. */ + if (bcmp(ap->a_name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME)) == 0) { + struct vnode *rvp = NULL; + + if ( !vnode_isreg(vp) ) { + return (EPERM); + } + if ( !RESOURCE_FORK_EXISTS(vp) ) { + return (ENOATTR); + } + if ((result = hfs_vgetrsrc(hfsmp, vp, &rvp, p))) { + return (result); + } + hfs_lock_truncate(VTOC(rvp), TRUE); + if ((result = hfs_lock(VTOC(rvp), HFS_EXCLUSIVE_LOCK))) { + hfs_unlock_truncate(VTOC(vp)); + vnode_put(rvp); + return (result); + } + result = hfs_truncate(rvp, (off_t)0, IO_NDELAY, 0, ap->a_context); + + hfs_unlock_truncate(VTOC(rvp)); + hfs_unlock(VTOC(rvp)); + + vnode_put(rvp); + return (result); + } + /* Clear out the Finder Info. */ + if (bcmp(ap->a_name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME)) == 0) { + if (bcmp(VTOC(vp)->c_finderinfo, emptyfinfo, sizeof(emptyfinfo)) == 0) { + return (ENOATTR); + } + bzero(VTOC(vp)->c_finderinfo, sizeof(emptyfinfo)); + return (0); + } + /* + * Standard HFS only supports native FinderInfo and Resource Forks. 
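Note the asymmetry this gives removexattr(2): deleting com.apple.ResourceFork truncates the fork to zero length rather than removing a b-tree record. For reference (path is a placeholder):

    #include <sys/xattr.h>
    #include <stdio.h>

    int main(void)
    {
        if (removexattr("/tmp/somefile", XATTR_RESOURCEFORK_NAME, 0) != 0)
            perror("removexattr"); /* ENOATTR if the fork is already empty */
        return 0;
    }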
+ */ + if (hfsmp->hfs_flags & HFS_STANDARD) { + return (EPERM); + } + if (hfsmp->hfs_attribute_vp == NULL) { + return (ENOATTR); + } + btfile = VTOF(hfsmp->hfs_attribute_vp); + + MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); + bzero(iterator, sizeof(*iterator)); + + if (hfs_start_transaction(hfsmp) != 0) { + result = EINVAL; + goto exit2; + } + + result = buildkey(VTOC(vp)->c_fileid, ap->a_name, (HFSPlusAttrKey *)&iterator->key); + if (result) + goto exit2; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); + + btdata.bufferAddress = &attrdata; + btdata.itemSize = sizeof(attrdata); + btdata.itemCount = 1; + result = BTSearchRecord(btfile, iterator, &btdata, NULL, NULL); + if (result) + goto exit1; + + result = BTDeleteRecord(btfile, iterator); + (void) BTFlushPath(btfile); +exit1: + hfs_systemfile_unlock(hfsmp, lockflags); + if (result == 0) { + VTOC(vp)->c_touch_chgtime = TRUE; + HFS_KNOTE(vp, NOTE_ATTRIB); + } +exit2: + if (result == btNotFound) { + result = ENOATTR; + } + hfs_end_transaction(hfsmp); + + FREE(iterator, M_TEMP); + + return MacToVFSError(result); +} + + +/* + * Retrieve the list of extended attribute names. + */ +__private_extern__ +int +hfs_vnop_listxattr(struct vnop_listxattr_args *ap) +/* + struct vnop_listxattr_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + uio_t a_uio; + size_t *a_size; + int a_options; + vfs_context_t a_context; +*/ +{ + struct vnode *vp = ap->a_vp; + struct hfsmount *hfsmp; + uio_t uio = ap->a_uio; + struct BTreeIterator * iterator = NULL; + struct filefork *btfile; + struct listattr_callback_state state; + int lockflags; + int result; + + if (VNODE_IS_RSRC(vp)) { + return (EPERM); + } + hfsmp = VTOHFS(vp); + *ap->a_size = 0; + + /* If Finder Info is non-empty then export it. */ + if (bcmp(VTOC(vp)->c_finderinfo, emptyfinfo, sizeof(emptyfinfo)) != 0) { + if (uio == NULL) { + *ap->a_size += sizeof(XATTR_FINDERINFO_NAME); + } else if (uio_resid(uio) < sizeof(XATTR_FINDERINFO_NAME)) { + return (ERANGE); + } else { + result = uiomove((caddr_t)XATTR_FINDERINFO_NAME, + sizeof(XATTR_FINDERINFO_NAME), uio); + if (result) + return (result); + } + } + /* If Resource Fork is non-empty then export it. */ + if (vnode_isreg(vp) && RESOURCE_FORK_EXISTS(vp)) { + if (uio == NULL) { + *ap->a_size += sizeof(XATTR_RESOURCEFORK_NAME); + } else if (uio_resid(uio) < sizeof(XATTR_RESOURCEFORK_NAME)) { + return (ERANGE); + } else { + result = uiomove((caddr_t)XATTR_RESOURCEFORK_NAME, + sizeof(XATTR_RESOURCEFORK_NAME), uio); + if (result) + return (result); + } + } + /* + * Standard HFS only supports native FinderInfo and Resource Forks. + * Return at this point. + */ + if (hfsmp->hfs_flags & HFS_STANDARD) { + return (0); + } + /* Bail if we don't have any extended attributes. 
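The uio == NULL branches above implement the usual size-query protocol: listxattr(2) is called once with a NULL buffer to learn the required size and once more to fetch the names, which arrive as back-to-back NUL-terminated strings. A runnable sketch (path is a placeholder):

    #include <sys/xattr.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
        ssize_t size = listxattr("/tmp/somefile", NULL, 0, 0);
        if (size <= 0)
            return 0;                      /* no attributes (or error) */

        char *names = malloc(size);
        size = listxattr("/tmp/somefile", names, size, 0);
        /* result is a run of NUL-terminated names, back to back */
        for (char *p = names; size > 0 && p < names + size; p += strlen(p) + 1)
            printf("%s\n", p);
        free(names);
        return 0;
    }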
*/ + if ((hfsmp->hfs_attribute_vp == NULL) || + (VTOC(vp)->c_attr.ca_recflags & kHFSHasAttributesMask) == 0) { + return (0); + } + btfile = VTOF(hfsmp->hfs_attribute_vp); + + MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); + bzero(iterator, sizeof(*iterator)); + result = buildkey(VTOC(vp)->c_fileid, NULL, (HFSPlusAttrKey *)&iterator->key); + if (result) + goto exit; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); + + result = BTSearchRecord(btfile, iterator, NULL, NULL, NULL); + if (result && result != btNotFound) { + hfs_systemfile_unlock(hfsmp, lockflags); + goto exit; + } + + state.fileID = VTOC(vp)->c_fileid; + state.result = 0; + state.uio = uio; + state.size = 0; + + /* + * Process entries starting just after iterator->key. + */ + result = BTIterateRecords(btfile, kBTreeNextRecord, iterator, + (IterateCallBackProcPtr)listattr_callback, &state); + hfs_systemfile_unlock(hfsmp, lockflags); + if (uio == NULL) { + *ap->a_size += state.size; + } +exit: + FREE(iterator, M_TEMP); + + if (state.result || result == btNotFound) + result = state.result; + + return MacToVFSError(result); +} + + +/* + * Callback - called for each attribute + */ +static int +listattr_callback(const HFSPlusAttrKey *key, __unused const HFSPlusAttrData *data, struct listattr_callback_state *state) +{ + char attrname[XATTR_MAXNAMELEN + 1]; + size_t bytecount; + int result; + + if (state->fileID != key->fileID) { + state->result = 0; + return (0); /* stop */ + } + /* + * Skip over non-primary keys + */ + if (key->startBlock != 0) { + return (1); /* continue */ + } + + result = utf8_encodestr(key->attrName, key->attrNameLen * sizeof(UniChar), + attrname, &bytecount, sizeof(attrname), 0, 0); + if (result) { + state->result = result; + return (0); /* stop */ + } + bytecount++; /* account for null termination char */ + + if (xattr_protected(attrname)) + return (1); /* continue */ + + if (state->uio == NULL) { + state->size += bytecount; + } else { + if (bytecount > uio_resid(state->uio)) { + state->result = ERANGE; + return (0); /* stop */ + } + result = uiomove((caddr_t) attrname, bytecount, state->uio); + if (result) { + state->result = result; + return (0); /* stop */ + } + } + return (1); /* continue */ +} + + +/* + * Remove all the attributes from a cnode. + * + * A journal transaction must already be started. + * Attributes b-tree must have exclusive lock held.
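BTIterateRecords() drives listattr_callback() under a simple contract: the callback returns 1 to continue and 0 to stop, and reports errors through its state block rather than through the return value. A minimal user-space model of that contract (hypothetical types):

    /* The callback returns 1 to keep iterating, 0 to stop; any error
     * is parked in the shared state block. */
    struct cb_state { int result; };

    typedef int (*iter_cb)(int key, struct cb_state *st);

    static int iterate(int first, int last, iter_cb cb, struct cb_state *st)
    {
        for (int k = first; k <= last; k++)
            if (cb(k, st) == 0)
                break;          /* callback asked to stop */
        return st->result;      /* errors travel via state, not return */
    }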
+ */ +__private_extern__ +int +hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid) +{ + BTreeIterator *next_iterator, *del_iterator; + HFSPlusAttrKey *next_key; + struct filefork *btfile; + int result, iter_result; + + if (hfsmp->hfs_attribute_vp == NULL) { + return (0); + } + btfile = VTOF(hfsmp->hfs_attribute_vp); + + MALLOC(next_iterator, BTreeIterator *, sizeof(BTreeIterator) * 2, M_TEMP, M_WAITOK); + bzero(next_iterator, sizeof(BTreeIterator) * 2); + del_iterator = &next_iterator[1]; + next_key = (HFSPlusAttrKey *)&next_iterator->key; + + /* + * Go to first possible attribute key/record pair + */ + (void) buildkey(fileid, NULL, next_key); + result = BTIterateRecord(btfile, kBTreeNextRecord, next_iterator, NULL, NULL); + if (result || next_key->fileID != fileid) { + goto exit; + } + /* Remember iterator of attribute to delete */ + bcopy(next_iterator, del_iterator, sizeof(BTreeIterator)); + + /* Loop until there are no more attributes for this file id */ + for(;;) { + iter_result = BTIterateRecord(btfile, kBTreeNextRecord, next_iterator, NULL, NULL); + + /* XXX need to free and extents for record types 0x20 and 0x30 */ + result = BTDeleteRecord(btfile, del_iterator); + if (result) { + goto exit; + } + if (iter_result) { + result = iter_result; + break; + } + if (iter_result || next_key->fileID != fileid) { + break; /* end of attributes for this file id */ + } + bcopy(next_iterator, del_iterator, sizeof(BTreeIterator)); + } +exit: + (void) BTFlushPath(btfile); + + if (result == btNotFound) { + result = 0; + } + FREE(next_iterator, M_TEMP); + return (result); +} + +/* + * Enable/Disable extended security (ACLs). + */ +__private_extern__ +int +hfs_setextendedsecurity(struct hfsmount *hfsmp, int state) +{ + struct BTreeIterator * iterator = NULL; + struct filefork *btfile; + int lockflags; + int result; + + if (hfsmp->hfs_flags & HFS_STANDARD) { + return (ENOTSUP); + } + + MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); + bzero(iterator, sizeof(*iterator)); + + /* + * Build a b-tree key. + * We use the root's parent id (1) to hold this volume attribute. + */ + (void) buildkey(kHFSRootParentID, XATTR_EXTENDEDSECURITY_NAME, + (HFSPlusAttrKey *)&iterator->key); + + /* Start a transaction for our changes. */ + if (hfs_start_transaction(hfsmp) != 0) { + result = EINVAL; + goto exit2; + } + /* + * If there isn't an attributes b-tree then create one. + */ + if (hfsmp->hfs_attribute_vp == NULL) { + lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); + result = hfs_create_attr_btree(hfsmp, ATTRIBUTE_FILE_NODE_SIZE, + getnodecount(hfsmp, ATTRIBUTE_FILE_NODE_SIZE)); + hfs_systemfile_unlock(hfsmp, lockflags); + if (result) { + goto exit1; + } + } + btfile = VTOF(hfsmp->hfs_attribute_vp); + + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); + + if (state == 0) { + /* Remove the attribute. */ + result = BTDeleteRecord(btfile, iterator); + if (result == btNotFound) + result = 0; + } else { + FSBufferDescriptor btdata; + HFSPlusAttrData attrdata; + UInt16 datasize; + + datasize = sizeof(attrdata); + btdata.bufferAddress = &attrdata; + btdata.itemSize = datasize; + btdata.itemCount = 1; + attrdata.recordType = kHFSPlusAttrInlineData; + attrdata.reserved[0] = 0; + attrdata.reserved[1] = 0; + attrdata.attrSize = 2; + attrdata.attrData[0] = 0; + attrdata.attrData[1] = 0; + + /* Insert the attribute. 
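hfs_removeallattr() walks with two iterators so that deleting a record never invalidates the position used to find the next one: the lookahead iterator is advanced first, then the trailing copy is deleted. A user-space model of that ordering (types and helpers are hypothetical):

    #include <string.h>

    struct iter { char pos[32]; };

    /* step() advances the lookahead iterator and returns nonzero at the
     * end of the run; del() deletes the record an iterator points at. */
    static void remove_all(struct iter *next_it, struct iter *del_it,
                           int (*step)(struct iter *),
                           void (*del)(struct iter *))
    {
        memcpy(del_it, next_it, sizeof *del_it);    /* remember victim */
        for (;;) {
            int done = step(next_it);               /* move off victim */
            del(del_it);                            /* now safe to delete */
            if (done)
                break;
            memcpy(del_it, next_it, sizeof *del_it);
        }
    }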
*/ + result = BTInsertRecord(btfile, iterator, &btdata, datasize); + if (result == btExists) + result = 0; + } + (void) BTFlushPath(btfile); + + hfs_systemfile_unlock(hfsmp, lockflags); +exit1: + /* Finish the transaction of our changes. */ + hfs_end_transaction(hfsmp); +exit2: + FREE(iterator, M_TEMP); + + if (result == 0) { + if (state == 0) + vfs_clearextendedsecurity(HFSTOVFS(hfsmp)); + else + vfs_setextendedsecurity(HFSTOVFS(hfsmp)); + printf("hfs: %s extended security on %s\n", + state == 0 ? "disabling" : "enabling", hfsmp->vcbVN); + } + + return MacToVFSError(result); +} + +/* + * Check for extended security (ACLs). + */ +__private_extern__ +void +hfs_checkextendedsecurity(struct hfsmount *hfsmp) +{ + struct BTreeIterator * iterator; + struct filefork *btfile; + int lockflags; + int result; + + if (hfsmp->hfs_flags & HFS_STANDARD || + hfsmp->hfs_attribute_vp == NULL) { + return; + } + + MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); + bzero(iterator, sizeof(*iterator)); + + /* + * Build a b-tree key. + * We use the root's parent id (1) to hold this volume attribute. + */ + (void) buildkey(kHFSRootParentID, XATTR_EXTENDEDSECURITY_NAME, + (HFSPlusAttrKey *)&iterator->key); + + btfile = VTOF(hfsmp->hfs_attribute_vp); + + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); + + /* Check for our attribute. */ + result = BTSearchRecord(btfile, iterator, NULL, NULL, NULL); + + hfs_systemfile_unlock(hfsmp, lockflags); + FREE(iterator, M_TEMP); + + if (result == 0) { + vfs_setextendedsecurity(HFSTOVFS(hfsmp)); + printf("hfs mount: enabling extended security on %s\n", hfsmp->vcbVN); + } +} + + +/* + * hfs_attrkeycompare - compare two attribute b-tree keys. + * + * The name portion of the key is compared using a 16-bit binary comparison. + * This is called from the b-tree code. + */ +__private_extern__ +int +hfs_attrkeycompare(HFSPlusAttrKey *searchKey, HFSPlusAttrKey *trialKey) +{ + u_int32_t searchFileID, trialFileID; + int result; + + searchFileID = searchKey->fileID; + trialFileID = trialKey->fileID; + result = 0; + + if (searchFileID > trialFileID) { + ++result; + } else if (searchFileID < trialFileID) { + --result; + } else { + u_int16_t * str1 = &searchKey->attrName[0]; + u_int16_t * str2 = &trialKey->attrName[0]; + int length1 = searchKey->attrNameLen; + int length2 = trialKey->attrNameLen; + u_int16_t c1, c2; + int length; + + if (length1 < length2) { + length = length1; + --result; + } else if (length1 > length2) { + length = length2; + ++result; + } else { + length = length1; + } + + while (length--) { + c1 = *(str1++); + c2 = *(str2++); + + if (c1 > c2) { + result = 1; + break; + } + if (c1 < c2) { + result = -1; + break; + } + } + if (result) + return (result); + /* + * Names are equal; compare startBlock + */ + if (searchKey->startBlock == trialKey->startBlock) + return (0); + else + return (searchKey->startBlock < trialKey->startBlock ? 
-1 : 1); + } + + return result; +} + + +/* + * buildkey - build an Attribute b-tree key + */ +static int +buildkey(u_int32_t fileID, const char *attrname, HFSPlusAttrKey *key) +{ + int result = 0; + size_t unicodeBytes = 0; + + if (attrname != NULL) { + /* + * Convert filename from UTF-8 into Unicode + */ + result = utf8_decodestr(attrname, strlen(attrname), key->attrName, + &unicodeBytes, sizeof(key->attrName), 0, 0); + if (result) { + if (result != ENAMETOOLONG) + result = EINVAL; /* name has invalid characters */ + return (result); + } + key->attrNameLen = unicodeBytes / sizeof(UniChar); + key->keyLength = kHFSPlusAttrKeyMinimumLength + unicodeBytes; + } else { + key->attrNameLen = 0; + key->keyLength = kHFSPlusAttrKeyMinimumLength; + } + key->pad = 0; + key->fileID = fileID; + key->startBlock = 0; + + return (0); + } + +/* + * getnodecount - calculate starting node count for attributes b-tree. + */ +static int +getnodecount(struct hfsmount *hfsmp, size_t nodesize) +{ + int avedatasize; + int recpernode; + int count; + + avedatasize = sizeof(u_int16_t); /* index slot */ + avedatasize += kHFSPlusAttrKeyMinimumLength + HFS_AVERAGE_NAME_SIZE * sizeof(u_int16_t); + avedatasize += sizeof(HFSPlusAttrData) + 32; + + recpernode = (nodesize - sizeof(BTNodeDescriptor)) / avedatasize; + + count = (hfsmp->hfs_filecount + hfsmp->hfs_dircount) / 8; + count /= recpernode; + + /* XXX should also consider volume size XXX */ + + return (MAX(count, (int)(1024 * 1024) / (int)nodesize)); +} + + +/* + * getmaxinlineattrsize - calculate maximum inline attribute size. + * + * This yields 3,802 bytes for an 8K node size. + */ +static size_t +getmaxinlineattrsize(struct vnode * attrvp) +{ + struct BTreeInfoRec btinfo; + size_t nodesize = ATTRIBUTE_FILE_NODE_SIZE; + size_t maxsize; + + if (attrvp != NULL) { + (void) hfs_lock(VTOC(attrvp), HFS_SHARED_LOCK); + if (BTGetInformation(VTOF(attrvp), 0, &btinfo) == 0) + nodesize = btinfo.nodeSize; + hfs_unlock(VTOC(attrvp)); + } + maxsize = nodesize; + maxsize -= sizeof(BTNodeDescriptor); /* minus node descriptor */ + maxsize -= 3 * sizeof(UInt16); /* minus 3 index slots */ + maxsize /= 2; /* 2 key/rec pairs minumum */ + maxsize -= sizeof(HFSPlusAttrKey); /* minus maximum key size */ + maxsize -= sizeof(HFSPlusAttrData) - 2; /* minus data header */ + maxsize &= 0xFFFFFFFE; /* multiple of 2 bytes */ + + return (maxsize); +} + + diff --git a/bsd/hfs/hfscommon/BTree/BTree.c b/bsd/hfs/hfscommon/BTree/BTree.c index dc6c30940..f11af332a 100644 --- a/bsd/hfs/hfscommon/BTree/BTree.c +++ b/bsd/hfs/hfscommon/BTree/BTree.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
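The "3,802 bytes for an 8K node size" figure in getmaxinlineattrsize() can be checked by hand using the packed on-disk sizes from the HFS+ format: a 14-byte BTNodeDescriptor, a 268-byte maximal HFSPlusAttrKey (2+2+4+4+2 bytes plus 127 UniChars), and an 18-byte HFSPlusAttrData header whose last 2 bytes overlap the payload:

    #include <stdio.h>

    int main(void)
    {
        size_t maxsize = 8192;                  /* node size              */
        maxsize -= 14;                          /* node descriptor -> 8178 */
        maxsize -= 3 * sizeof(unsigned short);  /* 3 index slots   -> 8172 */
        maxsize /= 2;                           /* 2 key/rec pairs -> 4086 */
        maxsize -= 268;                         /* maximum key     -> 3818 */
        maxsize -= 18 - 2;                      /* data header     -> 3802 */
        maxsize &= ~(size_t)1;                  /* 2-byte multiple -> 3802 */
        printf("%zu\n", maxsize);               /* matches the quoted 3,802 */
        return 0;
    }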
* * @APPLE_LICENSE_HEADER_START@ * @@ -1120,7 +1120,7 @@ ProcessData: } while (err == 0) { - if (callBackProc(keyPtr, recordPtr, len, callBackState) == 0) + if (callBackProc(keyPtr, recordPtr, callBackState) == 0) break; if ((index+1) < ((NodeDescPtr)node.buffer)->numRecords) { @@ -1548,7 +1548,7 @@ BTUpdateRecord(FCB *filePtr, BTreeIterator *iterator, btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); ////////////////////////////// Take A Hint ////////////////////////////////// @@ -1571,7 +1571,7 @@ BTUpdateRecord(FCB *filePtr, BTreeIterator *iterator, // XXXdbg ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); - err = callBackProc(keyPtr, recordPtr, recordLen, callBackState); + err = callBackProc(keyPtr, recordPtr, callBackState); M_ExitOnError (err); err = UpdateNode (btreePtr, &nodeRec, 0, 0); @@ -1606,7 +1606,7 @@ BTUpdateRecord(FCB *filePtr, BTreeIterator *iterator, // XXXdbg ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); - err = callBackProc(keyPtr, recordPtr, recordLen, callBackState); + err = callBackProc(keyPtr, recordPtr, callBackState); M_ExitOnError (err); err = UpdateNode (btreePtr, &nodeRec, 0, 0); @@ -1786,7 +1786,7 @@ OSStatus BTFlushPath (FCB *filePtr) M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr); - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); err = UpdateHeader (btreePtr, false); diff --git a/bsd/hfs/hfscommon/BTree/BTreeNodeReserve.c b/bsd/hfs/hfscommon/BTree/BTreeNodeReserve.c index 980541c3f..7baf03fb4 100644 --- a/bsd/hfs/hfscommon/BTree/BTreeNodeReserve.c +++ b/bsd/hfs/hfscommon/BTree/BTreeNodeReserve.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -21,6 +21,7 @@ */ #include "../headers/BTreesPrivate.h" #include "sys/malloc.h" +#include <kern/locks.h> /* @@ -53,7 +54,7 @@ struct nreserve { void *nr_tag; /* unique tag (per thread) */ }; -#define NR_GET_TAG() (current_act()) +#define NR_GET_TAG() (current_thread()) #define NR_CACHE 17 @@ -64,6 +65,11 @@ LIST_HEAD(nodereserve, nreserve) *nr_hashtbl; u_long nr_hashmask; +lck_grp_t * nr_lck_grp; +lck_grp_attr_t * nr_lck_grp_attr; +lck_attr_t * nr_lck_attr; + +lck_mtx_t nr_mutex; /* Internal Node Reserve Hash Routines (private) */ static void nr_insert (struct vnode *, struct nreserve *nrp, int); @@ -83,6 +89,15 @@ BTReserveSetup() panic("BTReserveSetup: nreserve size != opaque struct size"); nr_hashtbl = hashinit(NR_CACHE, M_HFSMNT, &nr_hashmask); + + nr_lck_grp_attr= lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(nr_lck_grp_attr); + nr_lck_grp = lck_grp_alloc_init("btree_node_reserve", nr_lck_grp_attr); + + nr_lck_attr = lck_attr_alloc_init(); + lck_attr_setdebug(nr_lck_attr); + + lck_mtx_init(&nr_mutex, nr_lck_grp, nr_lck_attr); } @@ -142,7 +157,7 @@ BTReserveSpace(FCB *file, int operations, void* data) totalNodes = rsrvNodes + btree->totalNodes - availNodes; /* See if we also need a map node */ - if (totalNodes > CalcMapBits(btree)) + if (totalNodes > (int)CalcMapBits(btree)) ++totalNodes; if ((err = ExtendBTree(btree, totalNodes))) return (err); @@ -179,7 +194,7 @@ BTReleaseReserve(FCB *file, void* data) } /* - * BTUpdateReserve - update a node reserve for allocations that occured. + * BTUpdateReserve - update a node reserve for allocations that occurred. 
*/ __private_extern__ void @@ -209,11 +224,13 @@ nr_insert(struct vnode * btvp, struct nreserve *nrp, int nodecnt) /* * Check the cache - there may already be a reserve */ + lck_mtx_lock(&nr_mutex); nrhead = NR_HASH(btvp, tag); for (tmp_nrp = nrhead->lh_first; tmp_nrp; tmp_nrp = tmp_nrp->nr_hash.le_next) { if ((tmp_nrp->nr_tag == tag) && (tmp_nrp->nr_btvp == btvp)) { nrp->nr_tag = 0; + lck_mtx_unlock(&nr_mutex); return; } } @@ -224,6 +241,7 @@ nr_insert(struct vnode * btvp, struct nreserve *nrp, int nodecnt) nrp->nr_tag = tag; LIST_INSERT_HEAD(nrhead, nrp, nr_hash); ++nrinserts; + lck_mtx_unlock(&nr_mutex); } /* @@ -234,6 +252,7 @@ nr_delete(struct vnode * btvp, struct nreserve *nrp, int *nodecnt) { void * tag = NR_GET_TAG(); + lck_mtx_lock(&nr_mutex); if (nrp->nr_tag) { if ((nrp->nr_tag != tag) || (nrp->nr_btvp != btvp)) panic("nr_delete: invalid NR (%08x)", nrp); @@ -244,6 +263,7 @@ nr_delete(struct vnode * btvp, struct nreserve *nrp, int *nodecnt) } else { *nodecnt = 0; } + lck_mtx_unlock(&nr_mutex); } /* @@ -256,16 +276,21 @@ nr_lookup(struct vnode * btvp) struct nreserve *nrp; void* tag = NR_GET_TAG(); + lck_mtx_lock(&nr_mutex); + nrhead = NR_HASH(btvp, tag); for (nrp = nrhead->lh_first; nrp; nrp = nrp->nr_hash.le_next) { - if ((nrp->nr_tag == tag) && (nrp->nr_btvp == btvp)) + if ((nrp->nr_tag == tag) && (nrp->nr_btvp == btvp)) { + lck_mtx_unlock(&nr_mutex); return (nrp->nr_nodecnt - nrp->nr_newnodes); + } } + lck_mtx_unlock(&nr_mutex); return (0); } /* - * Update a node reserve for any allocations that occured. + * Update a node reserve for any allocations that occurred. */ static void nr_update(struct vnode * btvp, int nodecnt) @@ -274,6 +299,8 @@ nr_update(struct vnode * btvp, int nodecnt) struct nreserve *nrp; void* tag = NR_GET_TAG(); + lck_mtx_lock(&nr_mutex); + nrhead = NR_HASH(btvp, tag); for (nrp = nrhead->lh_first; nrp; nrp = nrp->nr_hash.le_next) { if ((nrp->nr_tag == tag) && (nrp->nr_btvp == btvp)) { @@ -281,4 +308,5 @@ nr_update(struct vnode * btvp, int nodecnt) break; } } + lck_mtx_unlock(&nr_mutex); } diff --git a/bsd/hfs/hfscommon/BTree/BTreeScanner.c b/bsd/hfs/hfscommon/BTree/BTreeScanner.c index 06e15a807..66521dbbd 100644 --- a/bsd/hfs/hfscommon/BTree/BTreeScanner.c +++ b/bsd/hfs/hfscommon/BTree/BTreeScanner.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1996-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1996-2005 Apple Computer, Inc. All rights reserved. 
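Note the discipline the nr_lookup()/nr_insert() changes above follow: every path out of the hash walk drops nr_mutex, including the early returns from inside the loop. A userspace analogue of that shape, with pthreads standing in for lck_mtx and a hypothetical table type:

#include <pthread.h>
#include <stddef.h>

struct node {
    struct node *next;
    void        *tag;
    int          count;
};

static pthread_mutex_t table_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct node *table_head;

/* Return the count for `tag`, or 0 if absent; the lock is released
 * on both the found and not-found paths, as in nr_lookup(). */
int
lookup_count(void *tag)
{
    struct node *np;

    pthread_mutex_lock(&table_mutex);
    for (np = table_head; np != NULL; np = np->next) {
        if (np->tag == tag) {
            int count = np->count;
            pthread_mutex_unlock(&table_mutex);
            return count;
        }
    }
    pthread_mutex_unlock(&table_mutex);
    return 0;
}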
* * @APPLE_LICENSE_HEADER_START@ * @@ -230,7 +230,7 @@ static int ReadMultipleNodes( BTScanState *theScanStatePtr ) { int myErr = E_NONE; BTreeControlBlockPtr myBTreeCBPtr; - daddr_t myPhyBlockNum; + daddr64_t myPhyBlockNum; u_int32_t myBufferSize; struct vnode * myDevPtr; int myBlockRun; @@ -239,8 +239,8 @@ static int ReadMultipleNodes( BTScanState *theScanStatePtr ) // release old buffer if we have one if ( theScanStatePtr->bufferPtr != NULL ) { - theScanStatePtr->bufferPtr->b_flags |= (B_INVAL | B_AGE); - brelse( theScanStatePtr->bufferPtr ); + buf_markinvalid(theScanStatePtr->bufferPtr); + buf_brelse( theScanStatePtr->bufferPtr ); theScanStatePtr->bufferPtr = NULL; theScanStatePtr->currentNodePtr = NULL; } @@ -248,8 +248,8 @@ static int ReadMultipleNodes( BTScanState *theScanStatePtr ) myBTreeCBPtr = theScanStatePtr->btcb; // map logical block in catalog btree file to physical block on volume - myErr = VOP_BMAP( myBTreeCBPtr->fileRefNum, theScanStatePtr->nodeNum, - &myDevPtr, &myPhyBlockNum, &myBlockRun ); + myErr = hfs_bmap(myBTreeCBPtr->fileRefNum, theScanStatePtr->nodeNum, + &myDevPtr, &myPhyBlockNum, &myBlockRun); if ( myErr != E_NONE ) { goto ExitThisRoutine; @@ -266,18 +266,18 @@ static int ReadMultipleNodes( BTScanState *theScanStatePtr ) } // now read blocks from the device - myErr = bread( myDevPtr, - myPhyBlockNum, - myBufferSize, - NOCRED, - &theScanStatePtr->bufferPtr ); + myErr = (int)buf_bread(myDevPtr, + myPhyBlockNum, + myBufferSize, + NOCRED, + &theScanStatePtr->bufferPtr ); if ( myErr != E_NONE ) { goto ExitThisRoutine; } - theScanStatePtr->nodesLeftInBuffer = theScanStatePtr->bufferPtr->b_bcount / theScanStatePtr->btcb->nodeSize; - theScanStatePtr->currentNodePtr = (BTNodeDescriptor *) theScanStatePtr->bufferPtr->b_data; + theScanStatePtr->nodesLeftInBuffer = buf_count(theScanStatePtr->bufferPtr) / theScanStatePtr->btcb->nodeSize; + theScanStatePtr->currentNodePtr = (BTNodeDescriptor *) buf_dataptr(theScanStatePtr->bufferPtr); ExitThisRoutine: return myErr; @@ -357,7 +357,7 @@ int BTScanInitialize( const FCB * btreeFile, scanState->currentNodePtr = NULL; scanState->nodesLeftInBuffer = 0; // no nodes currently in buffer scanState->recordsFound = recordsFound; - scanState->startTime = time; // initialize our throttle + microuptime(&scanState->startTime); // initialize our throttle return noErr; @@ -391,8 +391,8 @@ int BTScanTerminate( BTScanState * scanState, if ( scanState->bufferPtr != NULL ) { - scanState->bufferPtr->b_flags |= (B_INVAL | B_AGE); - brelse( scanState->bufferPtr ); + buf_markinvalid(scanState->bufferPtr); + buf_brelse( scanState->bufferPtr ); scanState->bufferPtr = NULL; scanState->currentNodePtr = NULL; } diff --git a/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c b/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c index 3a8463911..777e6f0fc 100644 --- a/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c +++ b/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c @@ -537,7 +537,7 @@ ErrorExit: (void) ReleaseNode (btreePtr, targetNode); (void) ReleaseNode (btreePtr, &leftNode); - Panic ("\p InsertLevel: an error occured!"); + Panic ("\p InsertLevel: an error occurred!"); return err; diff --git a/bsd/hfs/hfscommon/Catalog/Catalog.c b/bsd/hfs/hfscommon/Catalog/Catalog.c deleted file mode 100644 index e7134028f..000000000 --- a/bsd/hfs/hfscommon/Catalog/Catalog.c +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. 
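The ReadMultipleNodes() hunk above is a compact tour of this release's buffer-cache KPI: direct struct buf field access (b_flags, b_bcount, b_data) becomes accessor calls, brelse() becomes buf_brelse(), and physical block numbers widen to daddr64_t. A sketch of the same read-inspect-release cycle, assuming a hypothetical caller and eliding the scanner's bookkeeping:

#include <sys/buf.h>
#include <sys/vnode.h>

/* Read `size` bytes at physical block `blkno` from `devvp`, look at
 * the data, then invalidate and release the buffer as the scanner does. */
static int
read_and_release(vnode_t devvp, daddr64_t blkno, int size)
{
    buf_t bp = NULL;
    int err;

    err = (int)buf_bread(devvp, blkno, size, NOCRED, &bp);
    if (err)
        return err;

    /* Accessors replace direct field access on struct buf. */
    void *data   = (void *)buf_dataptr(bp);   /* was bp->b_data */
    uint32_t len = buf_count(bp);             /* was bp->b_bcount */
    (void)data;
    (void)len;

    buf_markinvalid(bp);   /* was bp->b_flags |= (B_INVAL | B_AGE) */
    buf_brelse(bp);        /* was brelse(bp) */
    return 0;
}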
- * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ - -#pragma segment Catalog - -#include <sys/param.h> -#include <sys/utfconv.h> - -#include "../../hfs_endian.h" - -#include "../headers/FileMgrInternal.h" -#include "../headers/BTreesInternal.h" -#include "../headers/CatalogPrivate.h" -#include "../headers/HFSUnicodeWrappers.h" - - -// External routines - -extern SInt32 FastRelString( ConstStr255Param str1, ConstStr255Param str2 ); - - -//_________________________________________________________________________________ -// Exported Routines -// -// CompareCatalogKeys - Compares two catalog keys. -// -//_________________________________________________________________________________ - - - -UInt32 -GetDirEntrySize(BTreeIterator *bip, ExtendedVCB * vol) -{ - CatalogKey * ckp; - CatalogName * cnp; - ByteCount utf8chars; - UInt8 name[kdirentMaxNameBytes + 1]; - OSErr result; - - ckp = (CatalogKey*) &bip->key; - - if (vol->vcbSigWord == kHFSPlusSigWord) { - cnp = (CatalogName*) &ckp->hfsPlus.nodeName; - utf8chars = utf8_encodelen(cnp->ustr.unicode, - cnp->ustr.length * sizeof(UniChar), ':', 0); - if (utf8chars > kdirentMaxNameBytes) - utf8chars = kdirentMaxNameBytes; - } else { /* hfs */ - cnp = (CatalogName*) ckp->hfs.nodeName; - result = hfs_to_utf8(vol, cnp->pstr, kdirentMaxNameBytes + 1, - &utf8chars, name); - if (result) { - /* - * When an HFS name cannot be encoded with the current - * volume encoding we use MacRoman as a fallback. - */ - result = mac_roman_to_utf8(cnp->pstr, MAXHFSVNODELEN + 1, - &utf8chars, name); - } - } - - return DIRENTRY_SIZE(utf8chars); -} -/* - * NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE - * - * This is assuming maxinum size of a name is 255 (kdirentMaxNameBytes), which is incorrect. - * Any caller of this has to make sure names > 255 are mangled!!!!!!!! - */ - -OSErr -PositionIterator(CatalogIterator *cip, UInt32 offset, BTreeIterator *bip, UInt16 *op) -{ -#define CAT_START_OFFSET (2 * sizeof(struct hfsdotentry)) - ExtendedVCB * vol; - FCB * fcb; - OSErr result = 0; - - /* are we past the end of a directory? 
*/ - if (cip->folderID != cip->parentID) - return(cmNotFound); - - vol = cip->volume; - fcb = GetFileControlBlock(vol->catalogRefNum); - - /* make a btree iterator from catalog iterator */ - UpdateBtreeIterator(cip, bip); - - if (cip->currentOffset == offset) { - *op = kBTreeCurrentRecord; - - } else if (cip->nextOffset == offset) { - *op = kBTreeNextRecord; - - } else { /* start from beginning */ - *op = kBTreeNextRecord; - - /* Position iterator at the folder's thread record */ - result = BTSearchRecord(fcb, bip, NULL, NULL, bip); - if (result) - goto exit; - - /* find offset (note: n^2 / 2) */ - if (offset > CAT_START_OFFSET) { - HFSCatalogNodeID pid, *idp; - UInt32 curOffset, nextOffset; - - /* get first record (ie offset 24) */ - result = BTIterateRecord( fcb, kBTreeNextRecord, bip, NULL, NULL ); - if (result) - goto exit; - - if (vol->vcbSigWord == kHFSPlusSigWord) - idp = &((CatalogKey*) &bip->key)->hfsPlus.parentID; - else - idp = &((CatalogKey*) &bip->key)->hfs.parentID; - - pid = *idp; - - curOffset = CAT_START_OFFSET; - nextOffset = CAT_START_OFFSET + GetDirEntrySize(bip, vol); - - while (nextOffset < offset) { - result = BTIterateRecord( fcb, kBTreeNextRecord, bip, NULL, NULL ); - if (result) - goto exit; - - /* check for parent change */ - if (pid != *idp) { - result = cmNotFound; /* offset past end of directory */ - goto exit; - } - - curOffset = nextOffset; - nextOffset += GetDirEntrySize(bip, vol); - }; - - if (nextOffset != offset) { - result = cmNotFound; - goto exit; - } - - UpdateCatalogIterator(bip, cip); - cip->currentOffset = curOffset; - cip->nextOffset = nextOffset; - } - } - -exit: - if (result == btNotFound) - result = cmNotFound; - - return result; - -} /* end PositionIterator */ - - -//_________________________________________________________________________________ -// Routine: CompareCatalogKeys -// -// Function: Compares two catalog keys (a search key and a trial key). -// -// Result: +n search key > trial key -// 0 search key = trial key -// -n search key < trial key -//_________________________________________________________________________________ - -SInt32 -CompareCatalogKeys(HFSCatalogKey *searchKey, HFSCatalogKey *trialKey) -{ - HFSCatalogNodeID searchParentID, trialParentID; - SInt32 result; - - searchParentID = searchKey->parentID; - trialParentID = trialKey->parentID; - - if ( searchParentID > trialParentID ) // parent dirID is unsigned - result = 1; - else if ( searchParentID < trialParentID ) - result = -1; - else // parent dirID's are equal, compare names - result = FastRelString(searchKey->nodeName, trialKey->nodeName); - - return result; -} - - -//_________________________________________________________________________________ -// Routine: CompareExtendedCatalogKeys -// -// Function: Compares two large catalog keys (a search key and a trial key). 
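The "parent dirID is unsigned" remark in CompareCatalogKeys() above is doing real work: returning a plain subtraction of two unsigned 32-bit IDs can wrap and report the wrong ordering, which is why the code branches instead. A small self-contained demonstration with made-up values:

#include <stdio.h>
#include <stdint.h>

/* Wrong: the subtraction wraps for large unsigned IDs. */
static int32_t cmp_subtract(uint32_t a, uint32_t b)
{
    return (int32_t)(a - b);
}

/* Right: the branchy compare used by CompareCatalogKeys. */
static int32_t cmp_branch(uint32_t a, uint32_t b)
{
    if (a > b)  return 1;
    if (a < b)  return -1;
    return 0;
}

int main(void)
{
    uint32_t small = 1, large = 0xFFFFFFF0u;

    /* small < large, yet the subtraction claims "greater". */
    printf("subtract: %d\n", cmp_subtract(small, large)); /* 17: wrong sign */
    printf("branch:   %d\n", cmp_branch(small, large));   /* -1: correct   */
    return 0;
}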
-// -// Result: +n search key > trial key -// 0 search key = trial key -// -n search key < trial key -//_________________________________________________________________________________ - -SInt32 -CompareExtendedCatalogKeys(HFSPlusCatalogKey *searchKey, HFSPlusCatalogKey *trialKey) -{ - SInt32 result; - HFSCatalogNodeID searchParentID, trialParentID; - - searchParentID = searchKey->parentID; - trialParentID = trialKey->parentID; - - if ( searchParentID > trialParentID ) // parent node IDs are unsigned - { - result = 1; - } - else if ( searchParentID < trialParentID ) - { - result = -1; - } - else // parent node ID's are equal, compare names - { - if ( searchKey->nodeName.length == 0 || trialKey->nodeName.length == 0 ) - result = searchKey->nodeName.length - trialKey->nodeName.length; - else - result = FastUnicodeCompare(&searchKey->nodeName.unicode[0], searchKey->nodeName.length, - &trialKey->nodeName.unicode[0], trialKey->nodeName.length); - } - - return result; -} - diff --git a/bsd/hfs/hfscommon/Catalog/CatalogIterators.c b/bsd/hfs/hfscommon/Catalog/CatalogIterators.c deleted file mode 100644 index ddca514d4..000000000 --- a/bsd/hfs/hfscommon/Catalog/CatalogIterators.c +++ /dev/null @@ -1,643 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - File: CatalogIterators.c - - Contains: Catalog Iterator Implementation - - Version: HFS Plus 1.0 - - Copyright: © 1997-1998 by Apple Computer, Inc., all rights reserved. - - File Ownership: - - DRI: Don Brady - - Other Contact: Mark Day - - Technology: Mac OS File System - - Writers: - - (msd) Mark Day - (djb) Don Brady - - Change History (most recent first): - <MacOSX> 4/23/98 djb Re-enable InvalidateCatalogCache (was commented out). - <MacOSX> 4/6/98 djb Add locking for cache globals (list) and iterators. - <MacOSX> 4/2/98 djb Define gCatalogCacheGlobals here instead of FSVars. - <MacOSX> 3/31/98 djb Sync up with final HFSVolumes.h header file. - - <CS3> 11/13/97 djb Radar #1683572 - Fix for indexed GetFileInfo. - <CS2> 10/17/97 msd Bug 1683506. Add support for long Unicode names in - CatalogIterators. Added a single global buffer for long Unicode - names; it is used by at most one CatalogIterator at a time. 
- <CS1> 10/1/97 djb first checked in -*/ - - -#include "../../hfs_macos_defs.h" -#include "../../hfs.h" -#include "../../hfs_dbg.h" -#include "../../hfs_format.h" - -#include "../headers/FileMgrInternal.h" -#include "../headers/BTreesInternal.h" -#include "../headers/CatalogPrivate.h" - - -#include <sys/param.h> -#include <sys/systm.h> -#include <libkern/libkern.h> -#include <sys/lock.h> - -static void InsertCatalogIteratorAsMRU( CatalogCacheGlobals *cacheGlobals, CatalogIterator *iterator ); - -static void InsertCatalogIteratorAsLRU( CatalogCacheGlobals *cacheGlobals, CatalogIterator *iterator ); - -static void PrepareForLongName( CatalogIterator *iterator ); - - -#if TARGET_API_MACOS_X - CatalogCacheGlobals *gCatalogCacheGlobals; - - #define GetCatalogCacheGlobals() (gCatalogCacheGlobals) - - #define CATALOG_ITER_LIST_LOCK(g) simple_lock(&(g)->simplelock) - - #define CATALOG_ITER_LIST_UNLOCK(g) simple_unlock(&(g)->simplelock) - - #define CI_LOCK(i) lockmgr(&(i)->iterator_lock, LK_EXCLUSIVE, (simple_lock_t) 0, current_proc()) - -#define CI_UNLOCK(i) lockmgr(&(i)->iterator_lock, LK_RELEASE, (simple_lock_t) 0, current_proc()) - -#define CI_SLEEPLESS_LOCK(i) lockmgr(&(i)->iterator_lock, LK_EXCLUSIVE | LK_NOWAIT, (simple_lock_t) 0, current_proc()) - -#define CI_LOCK_FROM_LIST(g,i) lockmgr(&(i)->iterator_lock, LK_EXCLUSIVE | LK_INTERLOCK, &(g)->simplelock, current_proc()) - -#else /* TARGET_API_MACOS_X */ - - #define GetCatalogCacheGlobals() ((CatalogCacheGlobals*) ((FSVarsRec*) LMGetFSMVars()->gCatalogCacheGlobals)) - - #define CATALOG_ITER_LIST_LOCK(g) - - #define CATALOG_ITER_LIST_UNLOCK(g) - - #define CI_LOCK(i) 0 - - #define CI_UNLOCK(i) 0 - - #define CI_SLEEPLESS_LOCK(i) 0 - - #define CI_LOCK_FROM_LIST(g,i) 0 - -#endif - - -//_______________________________________________________________________________ -// Routine: InitCatalogCache -// -// Function: Allocates cache, and initializes all the cache structures. 
-// -//_______________________________________________________________________________ -OSErr -InitCatalogCache(void) -{ - CatalogCacheGlobals * cacheGlobals; - CatalogIterator * iterator; - UInt32 cacheSize; - UInt16 i; - UInt16 lastIterator; - OSErr err; - - - cacheSize = sizeof(CatalogCacheGlobals) + ( kCatalogIteratorCount * sizeof(CatalogIterator) ); - cacheGlobals = (CatalogCacheGlobals *) NewPtrSysClear( cacheSize ); - - cacheGlobals->iteratorCount = kCatalogIteratorCount; - - lastIterator = kCatalogIteratorCount - 1; // last iterator number, since they start at 0 - - // Initialize the MRU order for the cache - cacheGlobals->mru = (CatalogIterator *) ( (Ptr)cacheGlobals + sizeof(CatalogCacheGlobals) ); - - // Initialize the LRU order for the cache - cacheGlobals->lru = (CatalogIterator *) ( (Ptr)(cacheGlobals->mru) + (lastIterator * sizeof(CatalogIterator)) ); - - - // Traverse iterators, setting initial mru, lru, and default values - for ( i = 0, iterator = cacheGlobals->mru; i < kCatalogIteratorCount ; i++, iterator = iterator->nextMRU ) - { - if ( i == lastIterator ) - iterator->nextMRU = nil; // terminate the list - else - iterator->nextMRU = (CatalogIterator *) ( (Ptr)iterator + sizeof(CatalogIterator) ); - - if ( i == 0 ) - iterator->nextLRU = nil; // terminate the list - else - iterator->nextLRU = (CatalogIterator *) ( (Ptr)iterator - sizeof(CatalogIterator) ); - - #if TARGET_API_MACOS_X - lockinit(&iterator->iterator_lock, PINOD, "hfs_catalog_iterator", 0, 0); - #endif - } - - #if TARGET_API_MAC_OS8 - (FSVarsRec*) LMGetFSMVars()->gCatalogCacheGlobals = (Ptr) cacheGlobals; - #endif - - #if TARGET_API_MACOS_X - gCatalogCacheGlobals = cacheGlobals; - simple_lock_init(&cacheGlobals->simplelock); - #endif - - return noErr; -} - - -//_______________________________________________________________________________ -// Routine: InvalidateCatalogCache -// -// Function: Trash any interators matching volume parameter -// -//_______________________________________________________________________________ -void PrintCatalogIterator( void ); - -void -InvalidateCatalogCache( ExtendedVCB *volume ) -{ - TrashCatalogIterator( volume, 0 ); -} - - -//_______________________________________________________________________________ -// Routine: PrintCatalogIterator -// -// Function: Prints all interators -// -//_______________________________________________________________________________ -#if HFS_DIAGNOSTIC -void -PrintCatalogIterator( void ) -{ - CatalogIterator *iterator; - CatalogCacheGlobals *cacheGlobals = GetCatalogCacheGlobals(); - int i; - - PRINTIT("CatalogCacheGlobals @ 0x%08lX are:\n", (unsigned long)cacheGlobals); - PRINTIT("\titeratorCount: %ld \n", cacheGlobals->iteratorCount); - PRINTIT("\tmru: 0x%08lX \n", (unsigned long)cacheGlobals->mru); - PRINTIT("\tlru: 0x%08lX \n", (unsigned long)cacheGlobals->lru); - - for ( iterator = cacheGlobals->mru, i=0 ; iterator != nil && i<32 ; iterator = iterator->nextMRU, i++) - { - PRINTIT("%d: ", i); - PRINTIT(" i: 0x%08lX", (unsigned long)iterator); - PRINTIT(" M: 0x%08lX", (unsigned long)iterator->nextMRU); - PRINTIT(" L: 0x%08lX", (unsigned long)iterator->nextLRU); - PRINTIT("\n"); - } -} -#endif - -//_______________________________________________________________________________ -// Routine: TrashCatalogIterator -// -// Function: Trash any interators matching volume and folder parameters -// -//_______________________________________________________________________________ -void -TrashCatalogIterator( const ExtendedVCB *volume, 
HFSCatalogNodeID folderID ) -{ - CatalogIterator *iterator; - CatalogCacheGlobals *cacheGlobals = GetCatalogCacheGlobals(); - - CATALOG_ITER_LIST_LOCK(cacheGlobals); - - for ( iterator = cacheGlobals->mru ; iterator != nil ; iterator = iterator->nextMRU ) - { - top: - - // first match the volume - if ( iterator->volume != volume ) - continue; - - // now match the folder (or all folders if 0) - if ( (folderID == 0) || (folderID == iterator->folderID) ) - { - CatalogIterator *next; - - iterator->volume = 0; // trash it - iterator->folderID = 0; - - next = iterator->nextMRU; // remember the next iterator - - // if iterator is not already last then make it last - if ( next != nil ) - { - InsertCatalogIteratorAsLRU( cacheGlobals, iterator ); - - // iterator->nextMRU will always be zero (since we moved it to the end) - // so set up the next iterator manually (we know its not nil) - iterator = next; - goto top; // process the next iterator - } - } - } - - CATALOG_ITER_LIST_UNLOCK(cacheGlobals); -} - - -//_______________________________________________________________________________ -// Routine: AgeCatalogIterator -// -// Function: Move iterator to the end of the list... -// -//_______________________________________________________________________________ -void -AgeCatalogIterator ( CatalogIterator *catalogIterator ) -{ - CatalogCacheGlobals * cacheGlobals = GetCatalogCacheGlobals(); - - CATALOG_ITER_LIST_LOCK(cacheGlobals); - - //PRINTIT(" AgeCatalogIterator: v=%d, d=%ld, i=%d\n", catalogIterator->volRefNum, catalogIterator->folderID, catalogIterator->currentIndex); - - InsertCatalogIteratorAsLRU( cacheGlobals, catalogIterator ); - - CATALOG_ITER_LIST_UNLOCK(cacheGlobals); -} - - -//_______________________________________________________________________________ -// Routine: GetCatalogIterator -// -// Function: Release interest in Catalog iterator -// -//_______________________________________________________________________________ -OSErr -ReleaseCatalogIterator( CatalogIterator* catalogIterator) -{ -#if TARGET_API_MACOS_X - //PRINTIT(" ReleaseCatalogIterator: v=%d, d=%ld, i=%d\n", catalogIterator->volRefNum, catalogIterator->folderID, catalogIterator->currentIndex); - return CI_UNLOCK(catalogIterator); -#else - return noErr; -#endif -} - - -//_______________________________________________________________________________ -// Routine: GetCatalogIterator -// -// Function: Returns an iterator associated with the volume, folderID, index, -// and iterationType (kIterateFilesOnly or kIterateAll). -// Searches the cache in MRU order. -// Inserts the resulting iterator at the head of mru automatically -// -// Note: The returned iterator is locked and ReleaseCatalogIterator must -// be called to unlock it. 
-// -//_______________________________________________________________________________ - -CatalogIterator* -GetCatalogIterator(ExtendedVCB *volume, HFSCatalogNodeID folderID, UInt32 offset) -{ - CatalogCacheGlobals *cacheGlobals = GetCatalogCacheGlobals(); - CatalogIterator *iterator; - CatalogIterator *bestIterator; - - bestIterator = NULL; - - CATALOG_ITER_LIST_LOCK(cacheGlobals); - - for (iterator = cacheGlobals->mru ; iterator != nil ; iterator = iterator->nextMRU) { - - /* first make sure volume and folder id match */ - if ((iterator->volume != volume) || (iterator->folderID != folderID)) { - continue; - } - - /* ignore busy iterators */ - if ( CI_SLEEPLESS_LOCK(iterator) == EBUSY ) { - //PRINTIT(" GetCatalogIterator: busy v=%d, d=%ld, i=%d\n", volume, folderID, iterator->currentIndex); - continue; - } - - /* we matched volume, folder id, now check the offset */ - if ( iterator->currentOffset == offset || iterator->nextOffset == offset) { - bestIterator = iterator; // we scored! - so get out of this loop - break; // break with iterator locked - } - - (void) CI_UNLOCK(iterator); // unlock iterator before moving to the next one - } - - // check if we didn't get one or if the one we got is too far away... - if (bestIterator == NULL) - { - bestIterator = cacheGlobals->lru; // start over with a new iterator - - //PRINTIT(" GetCatalogIterator: recycle v=%d, d=%ld, i=%d\n", bestIterator->volume, bestIterator->folderID, bestIterator->currentIndex); - (void) CI_LOCK_FROM_LIST(cacheGlobals, bestIterator); // XXX we should not eat the error! - - CATALOG_ITER_LIST_LOCK(cacheGlobals); // grab the lock again for MRU Insert below... - - bestIterator->volume = volume; // update the iterator's volume - bestIterator->folderID = folderID; // ... and folderID - bestIterator->currentIndex = 0xFFFF; // ... and offspring index marker - bestIterator->currentOffset = 0xFFFFFFFF; - bestIterator->nextOffset = 0xFFFFFFFF; - - bestIterator->btreeNodeHint = 0; - bestIterator->btreeIndexHint = 0; - bestIterator->parentID = folderID; // set key to folderID + empty name - bestIterator->folderName.unicodeName.length = 0; // clear pascal/unicode name - - if ( volume->vcbSigWord == kHFSPlusSigWord ) - bestIterator->nameType = kShortUnicodeName; - else - bestIterator->nameType = kShortPascalName; - } - else { - //PRINTIT(" GetCatalogIterator: found v=%d, d=%ld, i=%d\n", bestIterator->volume, bestIterator->folderID, bestIterator->currentIndex); - } - - // put this iterator at the front of the list - InsertCatalogIteratorAsMRU( cacheGlobals, bestIterator ); - - CATALOG_ITER_LIST_UNLOCK(cacheGlobals); - - return bestIterator; // return our best shot - -} /* GetCatalogIterator */ - - -//_______________________________________________________________________________ -// Routine: UpdateBtreeIterator -// -// Function: Fills in a BTreeIterator from a CatalogIterator -// -// Assumes: catalogIterator->nameType is correctly initialized! 
-// catalogIterator is locked (MacOS X) -//_______________________________________________________________________________ -void -UpdateBtreeIterator(const CatalogIterator *catalogIterator, BTreeIterator *btreeIterator) -{ - CatalogName * nodeName; - Boolean isHFSPlus; - - - btreeIterator->hint.writeCount = 0; - btreeIterator->hint.nodeNum = catalogIterator->btreeNodeHint; - btreeIterator->hint.index = catalogIterator->btreeIndexHint; - - switch (catalogIterator->nameType) - { - case kShortPascalName: - if ( catalogIterator->folderName.pascalName[0] > 0 ) - nodeName = (CatalogName *) catalogIterator->folderName.pascalName; - else - nodeName = NULL; - - isHFSPlus = false; - break; - - case kShortUnicodeName: - if ( catalogIterator->folderName.unicodeName.length > 0 ) - nodeName = (CatalogName *) &catalogIterator->folderName.unicodeName; - else - nodeName = NULL; - - isHFSPlus = true; - break; - - case kLongUnicodeName: - if ( catalogIterator->folderName.longNamePtr->length > 0 ) - nodeName = (CatalogName *) catalogIterator->folderName.longNamePtr; - else - nodeName = NULL; - - isHFSPlus = true; - break; - - default: - return; - } - - BuildCatalogKey(catalogIterator->parentID, nodeName, isHFSPlus, (CatalogKey*) &btreeIterator->key); -} - - -//_______________________________________________________________________________ -// Routine: UpdateCatalogIterator -// -// Function: Updates a CatalogIterator from a BTreeIterator -// -// Assumes: catalogIterator->nameType is correctly initialized! -// catalogIterator is locked (MacOS X) -//_______________________________________________________________________________ -void -UpdateCatalogIterator (const BTreeIterator *btreeIterator, CatalogIterator *catalogIterator) -{ - void * srcName; - void * dstName; - UInt16 nameSize; - CatalogKey * catalogKey; - - - catalogIterator->btreeNodeHint = btreeIterator->hint.nodeNum; - catalogIterator->btreeIndexHint = btreeIterator->hint.index; - - catalogKey = (CatalogKey*) &btreeIterator->key; - - switch (catalogIterator->nameType) - { - case kShortPascalName: - catalogIterator->parentID = catalogKey->hfs.parentID; - - dstName = catalogIterator->folderName.pascalName; - srcName = catalogKey->hfs.nodeName; - nameSize = catalogKey->hfs.nodeName[0] + sizeof(UInt8); - break; - - case kShortUnicodeName: - catalogIterator->parentID = catalogKey->hfsPlus.parentID; - - dstName = &catalogIterator->folderName.unicodeName; - srcName = &catalogKey->hfsPlus.nodeName; - nameSize = (catalogKey->hfsPlus.nodeName.length + 1) * sizeof(UInt16); - - // See if we need to make this iterator use long names - if ( nameSize > sizeof(catalogIterator->folderName.unicodeName) ) - { - PrepareForLongName(catalogIterator); // Find a long name buffer to use - dstName = catalogIterator->folderName.longNamePtr; - } - break; - - case kLongUnicodeName: - catalogIterator->parentID = catalogKey->hfsPlus.parentID; - - dstName = catalogIterator->folderName.longNamePtr; - srcName = &catalogKey->hfsPlus.nodeName; - nameSize = (catalogKey->hfsPlus.nodeName.length + 1) * sizeof(UInt16); - break; - - default: - return; - } - - if (catalogIterator->parentID != catalogIterator->folderID) - catalogIterator->nextOffset = 0xFFFFFFFF; - - BlockMoveData(srcName, dstName, nameSize); - -} // end UpdateCatalogIterator - - -//_______________________________________________________________________________ -// Routine: InsertCatalogIteratorAsMRU -// -// Function: Moves catalog iterator to head of mru order in double linked list -// -// Assumes list simple lock is held 
-//_______________________________________________________________________________ -static void -InsertCatalogIteratorAsMRU ( CatalogCacheGlobals *cacheGlobals, CatalogIterator *iterator ) -{ - CatalogIterator *swapIterator; - - if ( cacheGlobals->mru != iterator ) // if it's not already the mru iterator - { - swapIterator = cacheGlobals->mru; // put it in the front of the double queue - cacheGlobals->mru = iterator; - iterator->nextLRU->nextMRU = iterator->nextMRU; - if ( iterator->nextMRU != nil ) - iterator->nextMRU->nextLRU = iterator->nextLRU; - else - cacheGlobals->lru= iterator->nextLRU; - iterator->nextMRU = swapIterator; - iterator->nextLRU = nil; - swapIterator->nextLRU = iterator; - } -} - - -//________________________________________________________________________________ -// Routine: InsertCatalogIteratorAsLRU -// -// Function: Moves catalog iterator to head of lru order in double linked list -// -// Assumes list simple lock is held -//_______________________________________________________________________________ -static void -InsertCatalogIteratorAsLRU ( CatalogCacheGlobals *cacheGlobals, CatalogIterator *iterator ) -{ - CatalogIterator *swapIterator; - - if ( cacheGlobals->lru != iterator ) - { - swapIterator = cacheGlobals->lru; - cacheGlobals->lru = iterator; - iterator->nextMRU->nextLRU = iterator->nextLRU; - if ( iterator->nextLRU != nil ) - iterator->nextLRU->nextMRU = iterator->nextMRU; - else - cacheGlobals->mru= iterator->nextMRU; - iterator->nextLRU = swapIterator; - iterator->nextMRU = nil; - swapIterator->nextMRU = iterator; - } -} - - - -//_______________________________________________________________________________ -// Routine: PrepareForLongName -// -// Function: Takes a CatalogIterator whose nameType is kShortUnicodeName, and -// changes the nameType to kLongUnicodeName. -// -// Since long Unicode names aren't stored in the CatalogIterator itself, we have -// to point to an HFSUniStr255 for storage. In the current implementation, we have -// just one such global buffer in the cache globals. We'll set the iterator to -// point to the global buffer and invalidate the iterator that was using it -// (i.e. the iterator whose nameType is kLongUnicodeName). -// -// Eventually, we might want to have a list of long name buffers which we recycle -// using an LRU algorithm. Or perhaps, some other way.... -// -// Assumes: catalogIterator is locked (MacOS X) -//_______________________________________________________________________________ -static void -PrepareForLongName ( CatalogIterator *iterator ) -{ - CatalogCacheGlobals *cacheGlobals = GetCatalogCacheGlobals(); - CatalogIterator *iter; - - if (DEBUG_BUILD && iterator->nameType != kShortUnicodeName) - DebugStr("\p PrepareForLongName: nameType is wrong!"); - - // - // Walk through all the iterators. The first iterator whose nameType - // is kLongUnicodeName is invalidated (because it is using the global - // long name buffer). 
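The two insert routines above splice an iterator to either end of a single doubly linked list threaded through nextMRU/nextLRU, in constant time. A simplified, runnable analogue of the move-to-front case (hypothetical struct names; head and tail play the roles of mru and lru, and the caller is assumed to hold the list lock, as the original's comments require):

#include <stddef.h>

struct iter {
    struct iter *next_mru;  /* toward the LRU end */
    struct iter *next_lru;  /* toward the MRU end */
};

struct cache {
    struct iter *mru;   /* head */
    struct iter *lru;   /* tail */
};

/* Splice `it` out and reinsert it at the MRU end. */
static void
move_to_mru(struct cache *c, struct iter *it)
{
    if (c->mru == it)
        return;                        /* already most recent */

    /* unlink: it->next_lru is non-NULL because it isn't the head */
    it->next_lru->next_mru = it->next_mru;
    if (it->next_mru != NULL)
        it->next_mru->next_lru = it->next_lru;
    else
        c->lru = it->next_lru;         /* it was the tail */

    /* relink at the head */
    it->next_mru = c->mru;
    it->next_lru = NULL;
    c->mru->next_lru = it;
    c->mru = it;
}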
- // - - CATALOG_ITER_LIST_LOCK(cacheGlobals); - - for ( iter = cacheGlobals->mru ; iter != nil ; iter = iter->nextMRU ) - { - if (iter->nameType == kLongUnicodeName) - { - // if iterator is not already last then make it last - if ( iter->nextMRU != nil ) - InsertCatalogIteratorAsLRU( cacheGlobals, iter ); - - (void) CI_LOCK_FROM_LIST(cacheGlobals,iter); - iter->volume = 0; // trash it - iter->folderID = 0; - (void) CI_UNLOCK(iter); - - #if TARGET_API_MACOS_X - break; - #endif - } - } - - /* - * if iter is nil then none of the iterators was using the LongUnicodeName buffer - */ - if (iter == nil) - CATALOG_ITER_LIST_UNLOCK(cacheGlobals); - - // - // Change the nameType of this iterator and point to the global - // long name buffer. Note - this iterator is already locked - // - iterator->nameType = kLongUnicodeName; - iterator->folderName.longNamePtr = &cacheGlobals->longName; -} - diff --git a/bsd/hfs/hfscommon/Catalog/CatalogUtilities.c b/bsd/hfs/hfscommon/Catalog/CatalogUtilities.c index d1a43afb8..cad8b871e 100644 --- a/bsd/hfs/hfscommon/Catalog/CatalogUtilities.c +++ b/bsd/hfs/hfscommon/Catalog/CatalogUtilities.c @@ -291,10 +291,10 @@ FlushCatalog(ExtendedVCB *volume) if ( 0 /*fcb->fcbFlags & fcbModifiedMask*/ ) { - VCB_LOCK(volume); + HFS_MOUNT_LOCK(volume, TRUE); volume->vcbFlags |= 0xFF00; // Mark the VCB dirty volume->vcbLsMod = GetTimeUTC(); // update last modified date - VCB_UNLOCK(volume); + HFS_MOUNT_UNLOCK(volume, TRUE); // result = FlushVolumeControlBlock(volume); } diff --git a/bsd/hfs/hfscommon/Catalog/FileIDsServices.c b/bsd/hfs/hfscommon/Catalog/FileIDsServices.c index 80d7da83b..812f3e58c 100644 --- a/bsd/hfs/hfscommon/Catalog/FileIDsServices.c +++ b/bsd/hfs/hfscommon/Catalog/FileIDsServices.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -35,12 +35,12 @@ struct ExtentsRecBuffer { typedef struct ExtentsRecBuffer ExtentsRecBuffer; -UInt32 CheckExtents( void *extents, UInt32 blocks, Boolean isHFSPlus ); -OSErr DeleteExtents( ExtendedVCB *vcb, UInt32 fileNumber, Boolean isHFSPlus ); -OSErr MoveExtents( ExtendedVCB *vcb, UInt32 srcFileID, UInt32 destFileID, Boolean isHFSPlus ); -void CopyCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ); -void CopyBigCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ); -void CopyExtentInfo( ExtentKey *key, ExtentRecord *data, ExtentsRecBuffer *buffer, UInt16 bufferCount ); +static UInt32 CheckExtents( void *extents, UInt32 blocks, Boolean isHFSPlus ); +static OSErr DeleteExtents( ExtendedVCB *vcb, UInt32 fileNumber, Boolean isHFSPlus ); +static OSErr MoveExtents( ExtendedVCB *vcb, UInt32 srcFileID, UInt32 destFileID, Boolean isHFSPlus ); +static void CopyCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ); +static void CopyBigCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ); +static void CopyExtentInfo( ExtentKey *key, ExtentRecord *data, ExtentsRecBuffer *buffer, UInt16 bufferCount ); @@ -56,9 +56,6 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param OSErr err; Boolean isHFSPlus = ( vcb->vcbSigWord == kHFSPlusSigWord ); - TrashCatalogIterator(vcb, srcID); // invalidate any iterators for this parentID - TrashCatalogIterator(vcb, destID); // invalidate any iterators for this parentID - err = BuildCatalogKeyUTF8(vcb, srcID, srcName, kUndefinedStrLen, &srcKey, NULL); ReturnIfError(err); @@ -351,7 +348,7 @@ FlushAndReturn: } -void CopyCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ) +static void CopyCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ) { dest->hfsFile.dataLogicalSize = src->hfsFile.dataLogicalSize; dest->hfsFile.dataPhysicalSize = src->hfsFile.dataPhysicalSize; @@ -362,7 +359,7 @@ void CopyCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ) BlockMoveData( src->hfsFile.rsrcExtents, dest->hfsFile.rsrcExtents, sizeof(HFSExtentRecord) ); } -void CopyBigCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ) +static void CopyBigCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ) { BlockMoveData( &src->hfsPlusFile.dataFork, &dest->hfsPlusFile.dataFork, sizeof(HFSPlusForkData) ); BlockMoveData( &src->hfsPlusFile.resourceFork, &dest->hfsPlusFile.resourceFork, sizeof(HFSPlusForkData) ); @@ -370,7 +367,7 @@ void CopyBigCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ) } -OSErr MoveExtents( ExtendedVCB *vcb, UInt32 srcFileID, UInt32 destFileID, Boolean isHFSPlus ) +static OSErr MoveExtents( ExtendedVCB *vcb, UInt32 srcFileID, UInt32 destFileID, Boolean isHFSPlus ) { FCB * fcb; ExtentsRecBuffer extentsBuffer[kNumExtentsToCache]; @@ -528,7 +525,7 @@ OSErr MoveExtents( ExtendedVCB *vcb, UInt32 srcFileID, UInt32 destFileID, Boolea } -void CopyExtentInfo( ExtentKey *key, ExtentRecord *data, ExtentsRecBuffer *buffer, UInt16 bufferCount ) +static void CopyExtentInfo( ExtentKey *key, ExtentRecord *data, ExtentsRecBuffer *buffer, UInt16 bufferCount ) { BlockMoveData( key, &(buffer[bufferCount].extentKey), sizeof( ExtentKey ) ); BlockMoveData( data, &(buffer[bufferCount].extentData), sizeof( ExtentRecord ) ); @@ -536,7 +533,7 @@ void CopyExtentInfo( ExtentKey *key, ExtentRecord *data, ExtentsRecBuffer *buffe //-- Delete all extents in extent file that have the ID given. 
-OSErr DeleteExtents( ExtendedVCB *vcb, UInt32 fileID, Boolean isHFSPlus ) +static OSErr DeleteExtents( ExtendedVCB *vcb, UInt32 fileID, Boolean isHFSPlus ) { FCB * fcb; ExtentKey * extentKeyPtr; @@ -614,7 +611,7 @@ OSErr DeleteExtents( ExtendedVCB *vcb, UInt32 fileID, Boolean isHFSPlus ) // Check if there are extents represented in the extents overflow file. -UInt32 CheckExtents( void *extents, UInt32 totalBlocks, Boolean isHFSPlus ) +static UInt32 CheckExtents( void *extents, UInt32 totalBlocks, Boolean isHFSPlus ) { UInt32 extentAllocationBlocks; UInt16 i; diff --git a/bsd/hfs/hfscommon/Misc/FileExtentMapping.c b/bsd/hfs/hfscommon/Misc/FileExtentMapping.c index 6ac3df68d..76c6a407a 100644 --- a/bsd/hfs/hfscommon/Misc/FileExtentMapping.c +++ b/bsd/hfs/hfscommon/Misc/FileExtentMapping.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -19,136 +19,6 @@ * * @APPLE_LICENSE_HEADER_END@ */ -/* - File: FileExtentMapping.c - - Contains: xxx put contents here xxx - - Version: HFS Plus 1.0 - - Written by: Dave Heller, Mark Day - - Copyright: © 1996-1999 by Apple Computer, Inc., all rights reserved. - - File Ownership: - - DRI: Mark Day - - Other Contact: xxx put other contact here xxx - - Technology: xxx put technology here xxx - - Writers: - - (DSH) Deric Horn - (msd) Mark Day - (djb) Don Brady - - Change History (most recent first): - <MacOSX> 9/9/99 djb Fix fcbModifiedMask flag testing logic. - <MacOSX> 8/25/98 djb Flush extents b-tree header if dirty (2371088). - <MacOSX> 6/30/98 djb Add functions NodesAreContiguous and ExtentsAreIntegral (for radar #2249539). - <MacOSX> 6/23/98 djb Changed DeallocFile to DeleteFile which now deletes the catalog record. - Fixed UpdateExtentRecord to pass correct fcb to Btree routines. Fixed - hfs+ bug in CreateExtentRecord (double dereference). - <MacOSX> 5/20/98 djb In ExtendFileC don't lie about the peof! (radar #2230094). - <MacOSX> 4/17/98 djb Add VCB locking. - <MacOSX> 4/2/98 djb Switch over to real BTree interface (no more BTreeWrapper.c). - <MacOSX> 3/31/98 djb Sync up with final HFSVolumes.h header file. - - <CS24> 1/23/98 msd Bug 2208024: AllocContig is actually allocating one extent even - though there is not enough contiguous space. - <CS23> 12/2/97 DSH GetFCBExtentRecord no longer static so DFA can use it. - <CS22> 10/20/97 msd When allocating more space for a file, do the clump size - calculations in ExtendFileC, not BlockAllocate. Undo change from - <CS18>. - <CS21> 10/17/97 msd Conditionalize DebugStrs. - <CS20> 10/16/97 msd Simplify the code path for MapFileBlockC (logical to physical - block mapping) in the typical case where the file isn't - fragmented so badly that it has extents in the extents B-tree. - Simplified some of the calculations for all cases. - <CS19> 10/13/97 DSH FindExtentRecord & DeleteExtentRecord are also being used by DFA - no longer static. - <CS18> 10/6/97 msd When extending a file, set the physical EOF to include any extra - space allocated due to a file's clump size. - <CS17> 9/19/97 msd Remove the MapLogicalToPhysical SPI. It was never used and is - not being tested anyway. - <CS16> 9/5/97 msd In CompareExtentKeys and CompareExtentKeysPlus, use the symbolic - constants for key length. Don't DebugStr unless DEBUG_BUILD is - set. 
- <CS15> 7/24/97 djb Add instrumentation to MapFileBlockC - <CS14> 7/16/97 DSH FilesInternal.i renamed FileMgrInternal.i to avoid name - collision - <CS13> 7/15/97 DSH AdjEOF() mark the FCB as modified. (1664389) - <CS12> 7/8/97 DSH Loading PrecompiledHeaders from define passed in on C line - <CS11> 7/3/97 msd Bug #1663518. Remove DebugStr when setting the FCB extent record - for a volume control file. - <CS10> 6/27/97 msd Moved enum kFirstFileRefnum to FilesInternal. - <CS9> 6/24/97 djb Include "CatalogPrivate.h" - <CS8> 6/16/97 msd Finish implementation of CreateLargeFile SPI. - <CS7> 6/12/97 msd Add stub for CreateLargeFile SPI. - <CS6> 6/5/97 msd Add MapLogicalToPhysical. - <CS5> 6/2/97 msd In TruncateFileC, don't update the extent record unless it was - actually changed (prevents extra updates when truncating to the - end of the extent, and it is the last extent of the file.) Added - an AdjustEOF routine called by the assembly AdjEOF routine. It - copies the EOF, physical length, and extent information from one - FCB to all other FCBs for that fork. - <CS4> 5/20/97 DSH Removed const declaration in MapFileBlocC, const is benign when - passing by value, and SC requires it to match prototype. - <CS3> 5/15/97 msd Change enum kResourceForkType from -1 to 0xFF since it is now - unsigned. Change all forkType parameters to UInt8. - <CS2> 5/7/97 msd When checking for an unused extent descriptor, check the length, - not the starting block. - <CS1> 4/24/97 djb first checked in - <HFS25> 4/11/97 DSH use extended VCB fields catalogRefNum, and extentsRefNum. - <HFS24> 4/4/97 djb Get in sync with volume format changes. - <HFS23> 3/17/97 DSH Casting to compile with SC. - <HFS22> 2/26/97 msd Add instrumentation in ExtendFileC and TruncateFileC. In - CompareExtentKeys and CompareExtentKeysPlus, make sure the key - lengths are correct. - <HFS21> 2/5/97 msd The comparison with fsBTStartOfIterationErr didn't work because - the enum is an unsigned long; it is now casted to an OSErr - before comparing. - <HFS20> 1/31/97 msd In FindExtentRecord, turn an fsBTStartOfIterationErr error into - btNotFound. - <HFS19> 1/28/97 msd Fixed bug in MapFileBlockC where it returned the wrong number of - bytes available at the given block number. This could - potentially cause programs to read or write over other files. - <HFS18> 1/16/97 djb Extent key compare procs now return SInt32. Fixed - UpdateExtentRecord - it was passing a pointer to an ExtentKey - pointer. - <HFS17> 1/10/97 msd Change TruncateFileC to call DellocateFork when the new PEOF is - 0. Fixes a fxRangeErr returned when no extents existed. - <HFS16> 1/6/97 msd Previous change prevents extent records from being removed if - the files new PEOF is in the local (FCB/catalog) extents. - <HFS15> 1/3/97 djb Temp fix in TruncateFileC to prevent unwanted calls to - TruncateExtents. - <HFS14> 12/23/96 msd Previous change to SearchExtentFile didn't set up the outputs - for hint and key when the FCB extent record wasn't full. - <HFS13> 12/20/96 msd In SearchExtentFile, don't bother searching the extents file if - the FCB's extent record wasn't full, or if the FCB was for the - extents file itself. Modified SearchExtentRecord to return a - Boolean to indicate that the record was not full. - <HFS12> 12/19/96 DSH Changed refs from VCB to ExtendedVCB - <HFS11> 12/19/96 djb Updated for new B-tree Manager interface. - <HFS10> 12/12/96 djb Really use new SPI for GetCatalogNode. - <HFS9> 12/12/96 djb Use new Catalog SPI for GetCatalogNode. 
Added Mark's changes to - MapFileBlockC. - <HFS8> 12/11/96 msd TruncateFileC must always release extents, even if PEOF hasn't - changed (since allocation may have been rounded up due to clump - size). - <HFS7> 12/10/96 msd Check PRAGMA_LOAD_SUPPORTED before loading precompiled headers. - <HFS6> 12/4/96 DSH Precompiled headers - <HFS5> 11/26/96 msd Add an exported routine to grow the parallel FCB table to - accomodate the HFS+ ExtentRecord. - <HFS4> 11/26/96 msd Convert internal routines to use ExtentKey and ExtentRecord - (instead of the raw HFS structures). - <HFS3> 11/21/96 msd Added CompareExtentKeysPlus(). - <HFS2> 11/20/96 msd Finish porting FXM to C. - <HFS1> 11/6/96 DKH first checked in - -*/ #include "../../hfs.h" @@ -157,7 +27,6 @@ #include "../headers/FileMgrInternal.h" #include "../headers/BTreesInternal.h" -#include "../headers/CatalogPrivate.h" // calling a private catalog routine (LocateCatalogNode) #include <sys/malloc.h> @@ -165,8 +34,6 @@ ============================================================ Public (Exported) Routines: ============================================================ - DeAllocFile Deallocate all disk space allocated to a specified file. - Both forks are deallocated. ExtendFileC Allocate more space to a given file. @@ -193,21 +60,8 @@ Public (Exported) Routines: FlushExtentFile Flush the extents file for a given volume. - GrowParallelFCBs - Make sure the parallel FCB entries are big enough to support - the HFS+ ExtentRecord. If not, the array is grown and the - pre-existing data copied over. - AdjustEOF - Copy EOF, physical length, and extent records from one FCB - to all other FCBs for that fork. This is used when a file is - grown or shrunk as the result of a Write, SetEOF, or Allocate. - MapLogicalToPhysical - Map some position in a file to a volume block number. Also - returns the number of contiguous bytes that are mapped there. - This is a queued HFSDispatch call that does the equivalent of - MapFileBlockC, using a parameter block. 
 ============================================================
 Internal Routines:
@@ -269,7 +123,7 @@ static OSErr DeleteExtentRecord(
     UInt32               startBlock);
 
 static OSErr CreateExtentRecord(
-    const ExtendedVCB    *vcb,
+    ExtendedVCB          *vcb,
     HFSPlusExtentKey     *key,
     HFSPlusExtentRecord  extents,
     UInt32               *hint);
@@ -280,7 +134,7 @@ static OSErr GetFCBExtentRecord(
     HFSPlusExtentRecord  extents);
 
 static OSErr SearchExtentFile(
-    const ExtendedVCB    *vcb,
+    ExtendedVCB          *vcb,
     const FCB            *fcb,
     SInt64               filePosition,
     HFSPlusExtentKey     *foundExtentKey,
@@ -290,7 +144,7 @@ static OSErr SearchExtentFile(
     UInt32               *endingFABNPlusOne );
 
 static OSErr SearchExtentRecord(
-    const ExtendedVCB    *vcb,
+    ExtendedVCB          *vcb,
     UInt32               searchFABN,
     const HFSPlusExtentRecord  extentData,
     UInt32               extentDataStartFABN,
@@ -319,7 +173,7 @@ static OSErr TruncateExtents(
     Boolean *            recordDeleted);
 
 static OSErr UpdateExtentRecord (
-    const ExtendedVCB    *vcb,
+    ExtendedVCB          *vcb,
     FCB                  *fcb,
     const HFSPlusExtentKey     *extentFileKey,
     const HFSPlusExtentRecord  extentData,
@@ -484,22 +338,32 @@ static OSErr FindExtentRecord(
 
 
 static OSErr CreateExtentRecord(
-    const ExtendedVCB    *vcb,
+    ExtendedVCB          *vcb,
     HFSPlusExtentKey     *key,
     HFSPlusExtentRecord  extents,
     UInt32               *hint)
 {
     BTreeIterator * btIterator;
     FSBufferDescriptor btRecord;
-    UInt16 btRecordSize;
-    OSErr err;
+    UInt16  btRecordSize;
+    int  lockflags;
+    OSErr  err;
 
     err = noErr;
     *hint = 0;
 
     MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK);
     bzero(btIterator, sizeof(*btIterator));
-
+
+    /*
+     * The lock taken by callers of ExtendFileC is speculative and
+     * only occurs when the file already has overflow extents. So
+     * we need to make sure we have the lock here. The extents
+     * btree lock can be nested (it's recursive) so we always take
+     * it here.
+     */
+    lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
+
     if (vcb->vcbSigWord == kHFSSigWord) {
         HFSExtentKey *  keyPtr;
         HFSExtentRecord data;
@@ -534,6 +398,8 @@ static OSErr CreateExtentRecord(
 
     (void) BTFlushPath(GetFileControlBlock(vcb->extentsRefNum));
 
+    hfs_systemfile_unlock(vcb, lockflags);
+
     FREE(btIterator, M_TEMP);
     return err;
 }
@@ -588,20 +454,6 @@ static OSErr DeleteExtentRecord(
 //
 // Function: Maps a file position into a physical disk address.
 //
-//  Input:      A2.L  -  VCB pointer
-//              (A1,D1.W)  -  FCB pointer
-//              D4.L  -  number of bytes desired
-//              D5.L  -  file position (byte address)
-//
-//  Output:     D3.L  -  physical start block
-//              D6.L  -  number of contiguous bytes available (up to D4 bytes)
-//              D0.L  -  result code                                       <01Oct85>
-//                         0 = ok
-//                         FXRangeErr = file position beyond mapped range  <17Oct85>
-//                         FXOvFlErr = extents file overflow               <17Oct85>
-//                         other = error                                   <17Oct85>
-//
-// Called By:   Log2Phys (read/write in place), Cache (map a file block).
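The comment added to CreateExtentRecord() above states the invariant this group of hunks relies on: callers may or may not already hold the extents B-tree lock, and because hfs_systemfile_lock() is recursive for SFL_EXTENTS, the leaf routine simply takes it unconditionally. Sketched in isolation (hypothetical body; the lock calls are the ones used in the patch):

/* Pattern used by CreateExtentRecord, TruncateExtents, and
 * UpdateExtentRecord in this patch: take the recursive extents
 * b-tree lock around the B-tree operation, whether or not the
 * caller speculatively took it already. */
static OSErr
extents_btree_op(ExtendedVCB *vcb)
{
    OSErr err;
    int lockflags;

    lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);

    err = /* ... the actual B-tree search/insert/replace call ... */ noErr;

    hfs_systemfile_unlock(vcb, lockflags);
    return err;
}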
 //_________________________________________________________________________________
 
 __private_extern__
@@ -610,7 +462,7 @@ OSErr MapFileBlockC (
     FCB             *fcb,               // FCB of file
     size_t          numberOfBytes,      // number of contiguous bytes desired
     off_t           offset,             // starting offset within file (in bytes)
-    daddr_t         *startSector,       // first sector (NOT an allocation block)
+    daddr64_t       *startSector,       // first sector (NOT an allocation block)
     size_t          *availableBytes)    // number of contiguous bytes (up to numberOfBytes)
 {
     OSErr  err;
@@ -625,12 +477,12 @@ OSErr MapFileBlockC (
     off_t   dataEnd;            // (offset) end of range that is contiguous
     UInt32  sectorsPerBlock;    // Number of sectors per allocation block
     UInt32  startBlock;         // volume allocation block corresponding to firstFABN
-    daddr_t  temp;
+    daddr64_t  temp;
     off_t   tmpOff;
 
     allocBlockSize = vcb->blockSize;
     sectorSize = VCBTOHFS(vcb)->hfs_phys_block_size;
-    
+
     err = SearchExtentFile(vcb, fcb, offset, &foundKey, foundData, &foundIndex, &hint, &nextFABN);
     if (err == noErr) {
         startBlock = foundData[foundIndex].startBlock;
@@ -658,7 +510,7 @@ OSErr MapFileBlockC (
     //  offset in sectors from start of the extent +
     //  offset in sectors from start of allocation block space
     //
-    temp = (daddr_t)((offset - (off_t)((off_t)(firstFABN) * (off_t)(allocBlockSize)))/sectorSize);
+    temp = (daddr64_t)((offset - (off_t)((off_t)(firstFABN) * (off_t)(allocBlockSize)))/sectorSize);
     temp += startBlock * sectorsPerBlock;
 
     /* Add in any volume offsets */
@@ -682,6 +534,7 @@ OSErr MapFileBlockC (
     else
         *availableBytes = tmpOff;
     }
+
     return noErr;
 }
@@ -762,6 +615,16 @@ static OSErr TruncateExtents(
     UInt32              hint;
     HFSPlusExtentKey    key;
     HFSPlusExtentRecord extents;
+    int  lockflags;
+
+    /*
+     * The lock taken by callers of TruncateFileC is speculative and
+     * only occurs when the file already has overflow extents. So
+     * we need to make sure we have the lock here. The extents
+     * btree lock can be nested (it's recursive) so we always take
+     * it here.
+     */
+    lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
 
     while (true) {
         err = FindExtentRecord(vcb, forkType, fileID, startBlock, false, &key, extents, &hint);
@@ -780,6 +643,7 @@ static OSErr TruncateExtents(
         *recordDeleted = true;
         startBlock += numberExtentsReleased;
     }
+    hfs_systemfile_unlock(vcb, lockflags);
 
     return err;
 }
@@ -823,9 +687,14 @@ OSErr FlushExtentFile( ExtendedVCB *vcb )
 {
     FCB *   fcb;
     OSErr   err;
+    int  lockflags;
 
     fcb = GetFileControlBlock(vcb->extentsRefNum);
+
+    lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
     err = BTFlushPath(fcb);
+    hfs_systemfile_unlock(vcb, lockflags);
+
     if ( err == noErr )
     {
         // If the FCB for the extent "file" is dirty, mark the VCB as dirty.
@@ -1041,23 +910,6 @@ AddFileExtent(ExtendedVCB *vcb, FCB *fcb, UInt32 startBlock, UInt32 blockCount)
 //
 // Function:    Extends the disk space allocated to a file.
 //
-//  Input:      A2.L  -  VCB pointer
-//              A1.L  -  pointer to FCB array
-//              D1.W  -  file refnum
-//              D3.B  -  option flags
-//                         kEFContigMask - force contiguous allocation
-//                         kEFAllMask - allocate all requested bytes or none
-//                         NOTE: You may not set both options.
-//              D4.L  -  number of additional bytes to allocate
-//
-//  Output:     D0.W  -  result code
-//                         0 = ok
-//                         -n = IO error
-//              D6.L  -  number of bytes allocated
-//
-// Called by:   FileAloc,FileWrite,SetEof
-//
-// Note:        ExtendFile updates the PEOF in the FCB.
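The temp computation in MapFileBlockC() above is pure unit conversion: bytes past the start of the extent's first allocation block, scaled down to sectors, plus the extent's start expressed in sectors. A worked example with hypothetical geometry (4K allocation blocks, 512-byte sectors), omitting the volume-offset step that follows in the real code:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* hypothetical geometry and extent */
    uint32_t allocBlockSize = 4096;   /* vcb->blockSize */
    uint32_t sectorSize     = 512;    /* hfs_phys_block_size */
    uint32_t firstFABN      = 100;    /* first file allocation block of extent */
    uint32_t startBlock     = 5000;   /* extent's start allocation block */
    int64_t  offset         = 100ll * 4096 + 1000;  /* byte offset in file */

    uint32_t sectorsPerBlock = allocBlockSize / sectorSize;   /* 8 */

    /* sectors into the extent, as in MapFileBlockC */
    int64_t temp = (offset - (int64_t)firstFABN * allocBlockSize) / sectorSize;
    temp += (int64_t)startBlock * sectorsPerBlock;

    /* 1000 / 512 = 1 sector in, 5000 * 8 = 40000 -> sector 40001 */
    printf("physical sector: %lld\n", (long long)temp);
    return 0;
}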
 //_________________________________________________________________________________
 
 __private_extern__
@@ -1127,8 +979,11 @@ OSErr ExtendFileC (
         && (vcb->vcbSigWord == kHFSPlusSigWord)
         && (bytesToAdd < (SInt64)HFS_MAX_DEFERED_ALLOC)
         && (blocksToAdd < hfs_freeblks(VCBTOHFS(vcb), 1))) {
+        HFS_MOUNT_LOCK(vcb, TRUE);
+        vcb->loanedBlocks += blocksToAdd;
+        HFS_MOUNT_UNLOCK(vcb, TRUE);
+
         fcb->ff_unallocblocks += blocksToAdd;
-        vcb->loanedBlocks += blocksToAdd;
         FTOC(fcb)->c_blocks += blocksToAdd;
         fcb->ff_blocks += blocksToAdd;
 
@@ -1140,13 +995,18 @@ OSErr ExtendFileC (
      * Give back any unallocated blocks before doing real allocations.
      */
     if (fcb->ff_unallocblocks > 0) {
-        blocksToAdd += fcb->ff_unallocblocks;
-        bytesToAdd = (SInt64)blocksToAdd * (SInt64)volumeBlockSize;
+        u_int32_t loanedBlocks;
 
-        vcb->loanedBlocks -= fcb->ff_unallocblocks;
-        FTOC(fcb)->c_blocks -= fcb->ff_unallocblocks;
-        fcb->ff_blocks -= fcb->ff_unallocblocks;
+        loanedBlocks = fcb->ff_unallocblocks;
+        blocksToAdd += loanedBlocks;
+        bytesToAdd = (SInt64)blocksToAdd * (SInt64)volumeBlockSize;
+        FTOC(fcb)->c_blocks -= loanedBlocks;
+        fcb->ff_blocks -= loanedBlocks;
         fcb->ff_unallocblocks = 0;
+
+        HFS_MOUNT_LOCK(vcb, TRUE);
+        vcb->loanedBlocks -= loanedBlocks;
+        HFS_MOUNT_UNLOCK(vcb, TRUE);
     }
 
     //
@@ -1154,7 +1014,7 @@ OSErr ExtendFileC (
     //  then set the maximum number of bytes to the requested number of bytes
     //  rounded up to a multiple of the clump size.
     //
-    if ((vcb->vcbClpSiz > volumeBlockSize)
+    if ((vcb->vcbClpSiz > (int32_t)volumeBlockSize)
         && (bytesToAdd < (SInt64)HFS_MAX_DEFERED_ALLOC)
         && (flags & kEFNoClumpMask) == 0) {
         maximumBytes = (SInt64)howmany(bytesToAdd, vcb->vcbClpSiz);
@@ -1166,13 +1026,15 @@ OSErr ExtendFileC (
     //
     //  Compute new physical EOF, rounded up to a multiple of a block.
     //
-    if ((vcb->vcbSigWord == kHFSSigWord) && ((((SInt64)fcb->ff_blocks * (SInt64)volumeBlockSize) + bytesToAdd) >= kTwoGigabytes))    //  Too big?
+    if ( (vcb->vcbSigWord == kHFSSigWord) &&    //  Too big?
+         ((((SInt64)fcb->ff_blocks * (SInt64)volumeBlockSize) + bytesToAdd) >= kTwoGigabytes) ) {
         if (allOrNothing)                   //  Yes, must they have it all?
             goto Overflow;                  //  Yes, can't have it
         else {
             --blocksToAdd;                  //  No, give 'em one block less
             bytesToAdd -= volumeBlockSize;
         }
+    }
 
     //
     //  If allocation is all-or-nothing, make sure there are
@@ -1379,10 +1241,17 @@ Exit:
             /* Keep the roving allocator out of the metadata zone. */
             if (vcb->nextAllocation >= VCBTOHFS(vcb)->hfs_metazone_start &&
                 vcb->nextAllocation <= VCBTOHFS(vcb)->hfs_metazone_end) {
+                HFS_MOUNT_LOCK(vcb, TRUE);
                 vcb->nextAllocation = VCBTOHFS(vcb)->hfs_metazone_end + 1;
+                vcb->vcbFlags |= 0xFF00;
+                HFS_MOUNT_UNLOCK(vcb, TRUE);
             }
         }
-    *actualBytesAdded = (SInt64)(fcb->ff_blocks - prevblocks) * (SInt64)volumeBlockSize;
+    if (prevblocks < fcb->ff_blocks) {
+        *actualBytesAdded = (SInt64)(fcb->ff_blocks - prevblocks) * (SInt64)volumeBlockSize;
+    } else {
+        *actualBytesAdded = 0;
+    }
 
     if (needsFlush)
         (void) FlushExtentFile(vcb);
@@ -1405,18 +1274,6 @@ Overflow:
 //  block boundary. If the 'TFTrunExt' option is specified, the file is
 //  truncated to the end of the extent containing the new PEOF.
 //
-//  Input:      A2.L  -  VCB pointer
-//              A1.L  -  pointer to FCB array
-//              D1.W  -  file refnum
-//              D2.B  -  option flags
-//                         TFTrunExt - truncate to the extent containing new PEOF
-//              D3.L  -  new PEOF
-//
-//  Output:     D0.W  -  result code
-//                         0 = ok
-//                         -n = IO error
-//
-// Note:        TruncateFile updates the PEOF in the FCB.
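The loaned-block hunks above implement deferred allocation: small HFS Plus extensions borrow free blocks (vcb->loanedBlocks volume-wide, fcb->ff_unallocblocks per fork) and repay the loan before any real allocation. The change in this release is only that the volume-wide counter now moves under the mount lock. A condensed sketch of the two movements, with the cnode accounting elided (the lock macros are the ones used in the patch; the helper names are hypothetical):

/* Borrow: note the loan volume-wide under the mount lock and
 * credit the fork, as the first hunk does. */
static void
loan_blocks(ExtendedVCB *vcb, FCB *fcb, u_int32_t blocks)
{
    HFS_MOUNT_LOCK(vcb, TRUE);
    vcb->loanedBlocks += blocks;
    HFS_MOUNT_UNLOCK(vcb, TRUE);

    fcb->ff_unallocblocks += blocks;
    fcb->ff_blocks += blocks;
}

/* Repay: fold the loan back into the real allocation request
 * before touching the allocator, as the second hunk does. */
static void
repay_loan(ExtendedVCB *vcb, FCB *fcb, u_int32_t *blocksToAdd)
{
    u_int32_t loaned = fcb->ff_unallocblocks;

    *blocksToAdd += loaned;
    fcb->ff_blocks -= loaned;
    fcb->ff_unallocblocks = 0;

    HFS_MOUNT_LOCK(vcb, TRUE);
    vcb->loanedBlocks -= loaned;
    HFS_MOUNT_UNLOCK(vcb, TRUE);
}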
//_________________________________________________________________________________ __private_extern__ @@ -1441,7 +1298,6 @@ OSErr TruncateFileC ( UInt8 forkType; Boolean extentChanged; // true if we actually changed an extent Boolean recordDeleted; // true if an extent record got deleted - recordDeleted = false; @@ -1585,7 +1441,6 @@ OSErr TruncateFileC ( Done: ErrorExit: - if (recordDeleted) (void) FlushExtentFile(vcb); @@ -1611,7 +1466,8 @@ OSErr HeadTruncateFile ( UInt32 startblk; UInt32 blksfreed; int i, j; - int error; + int error = 0; + int lockflags; if (vcb->vcbSigWord != kHFSPlusSigWord) @@ -1663,6 +1519,8 @@ OSErr HeadTruncateFile ( if (blkcnt == 0) goto CopyExtents; + lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); + /* * Process overflow extents */ @@ -1716,6 +1574,7 @@ OSErr HeadTruncateFile ( startblk += extblks; } + hfs_systemfile_unlock(vcb, lockflags); CopyExtents: if (blksfreed) { @@ -1724,7 +1583,8 @@ CopyExtents: FTOC(fcb)->c_blocks -= blkcnt; fcb->ff_blocks = blkcnt; - FTOC(fcb)->c_flag |= C_CHANGE | C_FORCEUPDATE; + FTOC(fcb)->c_flag |= C_FORCEUPDATE; + FTOC(fcb)->c_touch_chgtime = TRUE; (void) FlushExtentFile(vcb); } @@ -1758,7 +1618,7 @@ ErrorExit: //‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ static OSErr SearchExtentRecord( - const ExtendedVCB *vcb, + ExtendedVCB *vcb, UInt32 searchFABN, const HFSPlusExtentRecord extentData, UInt32 extentDataStartFABN, @@ -1859,7 +1719,7 @@ static OSErr SearchExtentRecord( //‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ static OSErr SearchExtentFile( - const ExtendedVCB *vcb, + ExtendedVCB *vcb, const FCB *fcb, SInt64 filePosition, HFSPlusExtentKey *foundExtentKey, @@ -1872,6 +1732,7 @@ static OSErr SearchExtentFile( UInt32 filePositionBlock; SInt64 temp64; Boolean noMoreExtents; + int lockflags; temp64 = filePosition / (SInt64)vcb->blockSize; filePositionBlock = (UInt32)temp64; @@ -1904,8 +1765,11 @@ static OSErr SearchExtentFile( // // Find the desired record, or the previous record if it is the same fork // + lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); + err = FindExtentRecord(vcb, FORK_IS_RSRC(fcb) ? kResourceForkType : kDataForkType, FTOC(fcb)->c_fileid, filePositionBlock, true, foundExtentKey, foundExtentData, extentBTreeHint); + hfs_systemfile_unlock(vcb, lockflags); if (err == btNotFound) { // @@ -1938,7 +1802,7 @@ Exit: -//‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ +//============================================================================ // Routine: UpdateExtentRecord // // Function: Write new extent data to an existing extent record with a given key. 
@@ -1955,14 +1819,14 @@ Exit: // // Result: noErr = ok // (other) = error from BTree -//‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ +//============================================================================ static OSErr UpdateExtentRecord ( - const ExtendedVCB *vcb, - FCB *fcb, - const HFSPlusExtentKey *extentFileKey, - const HFSPlusExtentRecord extentData, - UInt32 extentBTreeHint) + ExtendedVCB *vcb, + FCB *fcb, + const HFSPlusExtentKey *extentFileKey, + const HFSPlusExtentRecord extentData, + UInt32 extentBTreeHint) { OSErr err = noErr; @@ -1975,6 +1839,7 @@ static OSErr UpdateExtentRecord ( FSBufferDescriptor btRecord; UInt16 btRecordSize; FCB * btFCB; + int lockflags; // // Need to find and change a record in Extents BTree @@ -1984,6 +1849,15 @@ static OSErr UpdateExtentRecord ( MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK); bzero(btIterator, sizeof(*btIterator)); + /* + * The lock taken by callers of ExtendFileC/TruncateFileC is + * speculative and only occurs when the file already has + * overflow extents. So we need to make sure we have the lock + * here. The extents btree lock can be nested (its recursive) + * so we always take it here. + */ + lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); + if (vcb->vcbSigWord == kHFSSigWord) { HFSExtentKey * key; // Actual extent key used on disk in HFS HFSExtentRecord foundData; // The extent data actually found @@ -2030,6 +1904,7 @@ static OSErr UpdateExtentRecord ( } (void) BTFlushPath(btFCB); } + hfs_systemfile_unlock(vcb, lockflags); FREE(btIterator, M_TEMP); } @@ -2141,6 +2016,7 @@ Boolean NodesAreContiguous( HFSPlusExtentRecord extents; OSErr result; Boolean lastExtentReached; + int lockflags; if (vcb->blockSize >= nodeSize) @@ -2153,23 +2029,27 @@ Boolean NodesAreContiguous( if ( !ExtentsAreIntegral(extents, mask, &blocksChecked, &lastExtentReached) ) return FALSE; - if (lastExtentReached || (SInt64)((SInt64)blocksChecked * (SInt64)vcb->blockSize) >= fcb->ff_size) + if ( lastExtentReached || + (SInt64)((SInt64)blocksChecked * (SInt64)vcb->blockSize) >= (SInt64)fcb->ff_size) return TRUE; startBlock = blocksChecked; + lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); + // check the overflow extents (if any) while ( !lastExtentReached ) { result = FindExtentRecord(vcb, kDataForkType, fcb->ff_cp->c_fileid, startBlock, FALSE, &key, extents, &hint); if (result) break; - if ( !ExtentsAreIntegral(extents, mask, &blocksChecked, &lastExtentReached) ) + if ( !ExtentsAreIntegral(extents, mask, &blocksChecked, &lastExtentReached) ) { + hfs_systemfile_unlock(vcb, lockflags); return FALSE; - + } startBlock += blocksChecked; } - + hfs_systemfile_unlock(vcb, lockflags); return TRUE; } diff --git a/bsd/hfs/hfscommon/Misc/VolumeAllocation.c b/bsd/hfs/hfscommon/Misc/VolumeAllocation.c index 1fa82a921..157b7fb57 100644 --- a/bsd/hfs/hfscommon/Misc/VolumeAllocation.c +++ b/bsd/hfs/hfscommon/Misc/VolumeAllocation.c @@ -240,9 +240,9 @@ OSErr BlockAllocate ( // next block to allocate from. 
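/*
 * The NodesAreContiguous hunk above also illustrates the discipline
 * that goes with that lock: once SFL_EXTENTS is held, every exit
 * path, including early returns, must drop it. Restated from the
 * hunk (identifiers are the hunk's own; fragment only):
 */
	lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);

	while ( !lastExtentReached ) {
		result = FindExtentRecord(vcb, kDataForkType, fcb->ff_cp->c_fileid,
		                          startBlock, FALSE, &key, extents, &hint);
		if (result) break;

		if ( !ExtentsAreIntegral(extents, mask, &blocksChecked, &lastExtentReached) ) {
			hfs_systemfile_unlock(vcb, lockflags);	/* early exit: unlock first */
			return FALSE;
		}
		startBlock += blocksChecked;
	}
	hfs_systemfile_unlock(vcb, lockflags);	/* normal exit */
	return TRUE;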
// if (startingBlock == 0) { - VCB_LOCK(vcb); + HFS_MOUNT_LOCK(vcb, TRUE); startingBlock = vcb->nextAllocation; - VCB_UNLOCK(vcb); + HFS_MOUNT_UNLOCK(vcb, TRUE); updateAllocPtr = true; } if (startingBlock >= vcb->totalBlocks) { @@ -264,7 +264,9 @@ OSErr BlockAllocate ( (*actualStartBlock > startingBlock) && ((*actualStartBlock < VCBTOHFS(vcb)->hfs_metazone_start) || (*actualStartBlock > VCBTOHFS(vcb)->hfs_metazone_end))) { - vcb->nextAllocation = *actualStartBlock; /* XXX */ + HFS_MOUNT_LOCK(vcb, TRUE); + vcb->nextAllocation = *actualStartBlock; + HFS_MOUNT_UNLOCK(vcb, TRUE); } } else { /* @@ -285,7 +287,13 @@ OSErr BlockAllocate ( actualNumBlocks); } - if (err == noErr) { +Exit: + // if we actually allocated something then go update the + // various bits of state that we maintain regardless of + // whether there was an error (i.e. partial allocations + // still need to update things like the free block count). + // + if (*actualNumBlocks != 0) { // // If we used the volume's roving allocation pointer, then we need to update it. // Adding in the length of the current allocation might reduce the next allocate @@ -294,7 +302,7 @@ OSErr BlockAllocate ( // the file is closed or its EOF changed. Leaving the allocation pointer at the // start of the last allocation will avoid unnecessary fragmentation in this case. // - VCB_LOCK(vcb); + HFS_MOUNT_LOCK(vcb, TRUE); if (updateAllocPtr && ((*actualStartBlock < VCBTOHFS(vcb)->hfs_metazone_start) || @@ -305,14 +313,12 @@ OSErr BlockAllocate ( // Update the number of free blocks on the volume // vcb->freeBlocks -= *actualNumBlocks; - hfs_generate_volume_notifications(VCBTOHFS(vcb)); - VCB_UNLOCK(vcb); - MarkVCBDirty(vcb); + HFS_MOUNT_UNLOCK(vcb, TRUE); + + hfs_generate_volume_notifications(VCBTOHFS(vcb)); } -Exit: - return err; } @@ -363,14 +369,14 @@ OSErr BlockDeallocate ( // // Update the volume's free block count, and mark the VCB as dirty. 
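/*
 * BlockDeallocate, whose body follows, applies the same shape the
 * BlockAllocate hunk above just introduced: shared VCB counters are
 * mutated under the mount mutex, and hfs_generate_volume_notifications()
 * is issued only after the mutex is dropped -- presumably so the
 * callout cannot stall other allocators. Condensed from the
 * surrounding hunks (fragment only):
 */
	HFS_MOUNT_LOCK(vcb, TRUE);
	vcb->freeBlocks += numBlocks;		/* shared counter */
	if (vcb->nextAllocation == (firstBlock + numBlocks))
		vcb->nextAllocation -= numBlocks;
	MarkVCBDirty(vcb);
	HFS_MOUNT_UNLOCK(vcb, TRUE);

	hfs_generate_volume_notifications(VCBTOHFS(vcb));	/* outside the lock */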
// - VCB_LOCK(vcb); + HFS_MOUNT_LOCK(vcb, TRUE); vcb->freeBlocks += numBlocks; - hfs_generate_volume_notifications(VCBTOHFS(vcb)); if (vcb->nextAllocation == (firstBlock + numBlocks)) vcb->nextAllocation -= numBlocks; - VCB_UNLOCK(vcb); MarkVCBDirty(vcb); - + HFS_MOUNT_UNLOCK(vcb, TRUE); + + hfs_generate_volume_notifications(VCBTOHFS(vcb)); Exit: return err; @@ -395,8 +401,10 @@ MetaZoneFreeBlocks(ExtendedVCB *vcb) int bytesperblock; UInt8 byte; UInt8 *buffer; + blockRef = 0; bytesleft = freeblocks = 0; + buffer = NULL; bit = VCBTOHFS(vcb)->hfs_metazone_start; if (bit == 1) bit = 0; @@ -484,35 +492,35 @@ static OSErr ReadBitmapBlock( OSErr err; struct buf *bp = NULL; struct vnode *vp = NULL; - UInt32 block; + daddr64_t block; UInt32 blockSize; /* - * volume bitmap blocks are protected by the Extents B-tree lock + * volume bitmap blocks are protected by the allocation file lock */ - REQUIRE_FILE_LOCK(vcb->extentsRefNum, false); + REQUIRE_FILE_LOCK(vcb->hfs_allocation_vp, false); blockSize = (UInt32)vcb->vcbVBMIOSize; - block = bit / (blockSize * kBitsPerByte); + block = (daddr64_t)(bit / (blockSize * kBitsPerByte)); if (vcb->vcbSigWord == kHFSPlusSigWord) { - vp = vcb->allocationsRefNum; /* use allocation file vnode */ + vp = vcb->hfs_allocation_vp; /* use allocation file vnode */ } else /* hfs */ { vp = VCBTOHFS(vcb)->hfs_devvp; /* use device I/O vnode */ block += vcb->vcbVBMSt; /* map to physical block */ } - err = meta_bread(vp, block, blockSize, NOCRED, &bp); + err = (int)buf_meta_bread(vp, block, blockSize, NOCRED, &bp); if (bp) { if (err) { - brelse(bp); + buf_brelse(bp); *blockRef = NULL; *buffer = NULL; } else { *blockRef = (UInt32)bp; - *buffer = (UInt32 *)bp->b_data; + *buffer = (UInt32 *)buf_dataptr(bp); } } @@ -554,10 +562,10 @@ static OSErr ReleaseBitmapBlock( if (hfsmp->jnl) { journal_modify_block_end(hfsmp->jnl, bp); } else { - bdwrite(bp); + buf_bdwrite(bp); } } else { - brelse(bp); + buf_brelse(bp); } } @@ -1616,4 +1624,125 @@ ErrorExit: return err; } +/* + * Test to see if any blocks in a range are allocated. + * + * The journal or allocation file lock must be held. + */ +__private_extern__ +int +hfs_isallocated(struct hfsmount *hfsmp, u_long startingBlock, u_long numBlocks) +{ + UInt32 *currentWord; // Pointer to current word within bitmap block + UInt32 wordsLeft; // Number of words left in this bitmap block + UInt32 bitMask; // Word with given bits already set (ready to test) + UInt32 firstBit; // Bit index within word of first bit to allocate + UInt32 numBits; // Number of bits in word to allocate + UInt32 *buffer = NULL; + UInt32 blockRef; + UInt32 bitsPerBlock; + UInt32 wordsPerBlock; + int inuse = 0; + int error; + + /* + * Pre-read the bitmap block containing the first word of allocation + */ + error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef); + if (error) + return (error); + + /* + * Initialize currentWord, and wordsLeft. + */ + { + UInt32 wordIndexInBlock; + + bitsPerBlock = hfsmp->vcbVBMIOSize * kBitsPerByte; + wordsPerBlock = hfsmp->vcbVBMIOSize / kBytesPerWord; + + wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; + currentWord = buffer + wordIndexInBlock; + wordsLeft = wordsPerBlock - wordIndexInBlock; + } + + /* + * First test any non word aligned bits. 
+ */ + firstBit = startingBlock % kBitsPerWord; + if (firstBit != 0) { + bitMask = kAllBitsSetInWord >> firstBit; + numBits = kBitsPerWord - firstBit; + if (numBits > numBlocks) { + numBits = numBlocks; + bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits)); + } + if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { + inuse = 1; + goto Exit; + } + numBlocks -= numBits; + ++currentWord; + --wordsLeft; + } + + /* + * Test whole words (32 blocks) at a time. + */ + while (numBlocks >= kBitsPerWord) { + if (wordsLeft == 0) { + /* Read in the next bitmap block. */ + startingBlock += bitsPerBlock; + + buffer = NULL; + error = ReleaseBitmapBlock(hfsmp, blockRef, false); + if (error) goto Exit; + + error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef); + if (error) goto Exit; + + /* Readjust currentWord and wordsLeft. */ + currentWord = buffer; + wordsLeft = wordsPerBlock; + } + if (*currentWord != 0) { + inuse = 1; + goto Exit; + } + numBlocks -= kBitsPerWord; + ++currentWord; + --wordsLeft; + } + + /* + * Test any remaining blocks. + */ + if (numBlocks != 0) { + bitMask = ~(kAllBitsSetInWord >> numBlocks); + if (wordsLeft == 0) { + /* Read in the next bitmap block */ + startingBlock += bitsPerBlock; + + buffer = NULL; + error = ReleaseBitmapBlock(hfsmp, blockRef, false); + if (error) goto Exit; + + error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef); + if (error) goto Exit; + + currentWord = buffer; + wordsLeft = wordsPerBlock; + } + if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { + inuse = 1; + goto Exit; + } + } +Exit: + if (buffer) { + (void)ReleaseBitmapBlock(hfsmp, blockRef, false); + } + return (inuse); +} + diff --git a/bsd/hfs/hfscommon/Unicode/UnicodeWrappers.c b/bsd/hfs/hfscommon/Unicode/UnicodeWrappers.c index 1e02d0932..91b3e7a98 100644 --- a/bsd/hfs/hfscommon/Unicode/UnicodeWrappers.c +++ b/bsd/hfs/hfscommon/Unicode/UnicodeWrappers.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -51,36 +51,10 @@ enum { static void GetFilenameExtension( ItemCount length, ConstUniCharArrayPtr unicodeStr, char* extStr ); -static void GetFileIDString( HFSCatalogNodeID fileID, char* fileIDStr ); static UInt32 HexStringToInteger( UInt32 length, const UInt8 *hexStr ); - -/* - * Convert file ID into a hexidecimal string with no leading zeros - */ -static void -GetFileIDString( HFSCatalogNodeID fileID, char * fileIDStr ) -{ - SInt32 i, b; - UInt8 *translate = (UInt8 *) "0123456789ABCDEF"; - UInt8 c; - - fileIDStr[0] = '#'; - - for ( i = 0, b = 28; b >= 0; b -= 4 ) { - c = *(translate + ((fileID >> b) & 0x0000000F)); - - /* if its not a leading zero add it to our string */ - if ( (c != (UInt8) '0') || (i > 1) || (b == 0) ) - fileIDStr[++i] = c; - } - - fileIDStr[++i] = '\0'; -} - - /* * Get filename extension (if any) as a C string */ @@ -235,7 +209,7 @@ static UInt32 HexStringToInteger(UInt32 length, const UInt8 *hexStr) { UInt32 value; - short i; + UInt32 i; UInt8 c; const UInt8 *p; @@ -448,7 +422,7 @@ ConvertUnicodeToUTF8Mangled(ByteCount srcLen, ConstUniCharArrayPtr srcStr, ByteC char fileIDStr[15]; char extStr[15]; - GetFileIDString(cnid, fileIDStr); + sprintf(fileIDStr, "#%X", cnid); GetFilenameExtension(srcLen/sizeof(UniChar), srcStr, extStr); /* remove extension chars from source */ diff --git a/bsd/hfs/hfscommon/headers/BTreeScanner.h b/bsd/hfs/hfscommon/headers/BTreeScanner.h index ce9cf3002..368dd18c1 100644 --- a/bsd/hfs/hfscommon/headers/BTreeScanner.h +++ b/bsd/hfs/hfscommon/headers/BTreeScanner.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1996-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1996-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -36,7 +36,6 @@ // amount of time we are allowed to process a catalog search (in µ secs) // NOTE - code assumes kMaxMicroSecsInKernel is less than 1,000,000 -// jertodo - what should we set this to? enum { kMaxMicroSecsInKernel = (1000 * 100) }; // 1 tenth of a second // btree node scanner buffer size. at 32K we get 8 nodes. this is the size used diff --git a/bsd/hfs/hfscommon/headers/BTreesInternal.h b/bsd/hfs/hfscommon/headers/BTreesInternal.h index 650d82099..0cce7eb23 100644 --- a/bsd/hfs/hfscommon/headers/BTreesInternal.h +++ b/bsd/hfs/hfscommon/headers/BTreesInternal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -280,7 +280,7 @@ typedef BTreeIterator *BTreeIteratorPtr; //typedef SInt32 (* KeyCompareProcPtr)(BTreeKeyPtr a, BTreeKeyPtr b); -typedef SInt32 (* IterateCallBackProcPtr)(BTreeKeyPtr key, void * record, UInt16 recordLen, void * state); +typedef SInt32 (* IterateCallBackProcPtr)(BTreeKeyPtr key, void * record, void * state); extern OSStatus BTOpenPath(FCB *filePtr, KeyCompareProcPtr keyCompareProc); @@ -323,7 +323,7 @@ extern OSStatus BTDeleteRecord (FCB *filePtr, BTreeIterator *iterator ); extern OSStatus BTGetInformation (FCB *filePtr, - UInt16 version, + UInt16 vers, BTreeInfoRec *info ); extern OSStatus BTFlushPath (FCB *filePtr ); diff --git a/bsd/hfs/hfscommon/headers/CatalogPrivate.h b/bsd/hfs/hfscommon/headers/CatalogPrivate.h index e18592ebc..fcf12ac7c 100644 --- a/bsd/hfs/hfscommon/headers/CatalogPrivate.h +++ b/bsd/hfs/hfscommon/headers/CatalogPrivate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000, 2002 Apple Computer, Inc. All rights reserved. 
+ * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -79,73 +79,6 @@ #include "FileMgrInternal.h" #include "BTreesInternal.h" - #include <sys/lock.h> - -// private catalog data cache - - - -enum { - kCatalogIteratorCount = 16 // total number of Catalog iterators (shared by all HFS/HFS Plus volumes) -}; - - -// Catalog Iterator Name Types -enum { - kShortPascalName, - kShortUnicodeName, - kLongUnicodeName // non-local name -}; - - -// short unicode name (used by CatalogIterator) -struct UniStr63 { - UInt16 length; /* number of unicode characters */ - UniChar unicode[63]; /* unicode characters */ -}; -typedef struct UniStr63 UniStr63; - - -struct CatalogIterator -{ - struct CatalogIterator *nextMRU; // next iterator in MRU order - struct CatalogIterator *nextLRU; // next iterator in LRU order - - ExtendedVCB *volume; - SInt16 currentIndex; - SInt16 reserved; - UInt32 currentOffset; - UInt32 nextOffset; - HFSCatalogNodeID folderID; - - UInt32 btreeNodeHint; // node the key was last seen in - UInt16 btreeIndexHint; // index the key was last seen at - UInt16 nameType; // { 0 = Pascal, 1 = Unicode, 3 = long name} - HFSCatalogNodeID parentID; // parent folder ID - union - { - Str31 pascalName; - UniStr63 unicodeName; - HFSUniStr255 * longNamePtr; - } folderName; - - struct lock__bsd__ iterator_lock; -}; -typedef struct CatalogIterator CatalogIterator; - - -struct CatalogCacheGlobals { - UInt32 iteratorCount; // Number of iterators in cache - CatalogIterator * mru; - CatalogIterator * lru; - UInt32 reserved; - HFSUniStr255 longName; // used by a single kLongUnicodeName iterator - - simple_lock_data_t simplelock; -}; -typedef struct CatalogCacheGlobals CatalogCacheGlobals; - - // // Private Catalog Manager Routines (for use only by Catalog Manager, CatSearch and FileID Services) // @@ -188,21 +121,6 @@ extern OSErr ExchangeFiles( FIDParam *filePB, WDCBRecPtr *wdcbPtr ); extern void UpdateCatalogName( ConstStr31Param srcName, Str31 destName ); -// Catalog Iterator Routines - -extern CatalogIterator* GetCatalogIterator(ExtendedVCB *volume, HFSCatalogNodeID folderID, UInt32 offset); - -extern OSErr ReleaseCatalogIterator( CatalogIterator *catalogIterator ); - -extern void TrashCatalogIterator( const ExtendedVCB *volume, HFSCatalogNodeID folderID ); - -void AgeCatalogIterator( CatalogIterator *catalogIterator ); - -extern void UpdateBtreeIterator( const CatalogIterator *catalogIterator, BTreeIterator *btreeIterator ); - -extern void UpdateCatalogIterator( const BTreeIterator *btreeIterator, CatalogIterator *catalogIterator ); - - #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ #endif //__CATALOGPRIVATE__ diff --git a/bsd/hfs/hfscommon/headers/FileMgrInternal.h b/bsd/hfs/hfscommon/headers/FileMgrInternal.h index 8ed75e35f..15ccb6e63 100644 --- a/bsd/hfs/hfscommon/headers/FileMgrInternal.h +++ b/bsd/hfs/hfscommon/headers/FileMgrInternal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -151,25 +151,6 @@ union ExtentRecord { HFSPlusExtentRecord hfsPlus; }; typedef union ExtentRecord ExtentRecord; -/* Universal catalog key */ - -union CatalogKey { - HFSCatalogKey hfs; - HFSPlusCatalogKey hfsPlus; -}; -typedef union CatalogKey CatalogKey; -/* Universal catalog data record */ - -union CatalogRecord { - SInt16 recordType; - HFSCatalogFolder hfsFolder; - HFSCatalogFile hfsFile; - HFSCatalogThread hfsThread; - HFSPlusCatalogFolder hfsPlusFolder; - HFSPlusCatalogFile hfsPlusFile; - HFSPlusCatalogThread hfsPlusThread; -}; -typedef union CatalogRecord CatalogRecord; enum { @@ -205,10 +186,6 @@ EXTERN_API_C( Boolean ) IsVCBDirty (ExtendedVCB *vcb); -#define VCB_LOCK_INIT(vcb) simple_lock_init(&vcb->vcbSimpleLock) -#define VCB_LOCK(vcb) simple_lock(&vcb->vcbSimpleLock) -#define VCB_UNLOCK(vcb) simple_unlock(&vcb->vcbSimpleLock) - #define MarkVCBDirty(vcb) { ((vcb)->vcbFlags |= 0xFF00); } #define MarkVCBClean(vcb) { ((vcb)->vcbFlags &= 0x00FF); } #define IsVCBDirty(vcb) ((Boolean) ((vcb->vcbFlags & 0xFF00) != 0)) @@ -219,12 +196,7 @@ EXTERN_API_C( void ) ReturnIfError (OSErr result); #define ReturnIfError(result) if ( (result) != noErr ) return (result); else ; -/* Test for passed condition and return if true*/ -EXTERN_API_C( void ) -ReturnErrorIf (Boolean condition, - OSErr result); -#define ReturnErrorIf(condition, error) if ( (condition) ) return( (error) ); /* Exit function on error*/ EXTERN_API_C( void ) ExitOnError (OSErr result); @@ -244,21 +216,6 @@ ExchangeFileIDs (ExtendedVCB * volume, UInt32 srcHint, UInt32 destHint ); -EXTERN_API_C( SInt32 ) -CompareCatalogKeys (HFSCatalogKey * searchKey, - HFSCatalogKey * trialKey); - -EXTERN_API_C( SInt32 ) -CompareExtendedCatalogKeys (HFSPlusCatalogKey * searchKey, - HFSPlusCatalogKey * trialKey); - -EXTERN_API_C( OSErr ) -InitCatalogCache (void); - -EXTERN_API_C( void ) -InvalidateCatalogCache (ExtendedVCB * volume); - - /* BTree Manager Routines*/ @@ -305,10 +262,6 @@ BlockMarkAllocated(ExtendedVCB *vcb, UInt32 startingBlock, UInt32 numBlocks); EXTERN_API_C( OSErr ) BlockMarkFree( ExtendedVCB *vcb, UInt32 startingBlock, UInt32 numBlocks); -EXTERN_API_C( UInt32 ) -FileBytesToBlocks (SInt64 numerator, - UInt32 denominator); - EXTERN_API_C( UInt32 ) MetaZoneFreeBlocks(ExtendedVCB *vcb); @@ -343,9 +296,11 @@ MapFileBlockC (ExtendedVCB * vcb, FCB * fcb, size_t numberOfBytes, off_t offset, - daddr_t * startBlock, + daddr64_t * startBlock, size_t * availableBytes); +OSErr HeadTruncateFile(ExtendedVCB *vcb, FCB *fcb, UInt32 headblks); + EXTERN_API_C( int ) AddFileExtent (ExtendedVCB *vcb, FCB *fcb, UInt32 startBlock, UInt32 blockCount); @@ -356,10 +311,6 @@ NodesAreContiguous (ExtendedVCB * vcb, UInt32 nodeSize); #endif -/* Utility routines*/ - -EXTERN_API_C( OSErr ) -VolumeWritable (ExtendedVCB * vcb); /* Get the current time in UTC (GMT)*/ diff --git a/bsd/i386/Makefile b/bsd/i386/Makefile index e3f4901de..bb3988af6 100644 --- a/bsd/i386/Makefile +++ b/bsd/i386/Makefile @@ -8,16 +8,22 @@ include $(MakeInc_cmd) include $(MakeInc_def) DATAFILES = \ - cpu.h disklabel.h endian.h exec.h label_t.h param.h \ - profile.h psl.h ptrace.h reboot.h setjmp.h signal.h \ - spl.h table.h types.h ucontext.h user.h vmparam.h + endian.h param.h \ + profile.h setjmp.h signal.h \ + types.h ucontext.h vmparam.h _types.h + +KERNELFILES = \ + endian.h param.h \ + profile.h setjmp.h signal.h \ + types.h vmparam.h _types.h INSTALL_MD_LIST = ${DATAFILES} +INSTALL_MD_LCL_LIST = ${DATAFILES} disklabel.h INSTALL_MD_DIR = i386 
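# A note on the bsd/i386/Makefile hunk above (explanatory, not part of
# the patch): the old catch-all DATAFILES list is split in two.
# DATAFILES still drives the user-level header installs, with
# disklabel.h surviving only in the local-install list
# (INSTALL_MD_LCL_LIST), while the new KERNELFILES list -- DATAFILES
# minus the userland-only ucontext.h -- feeds the kernel export list,
# as the EXPORT_MD_LIST hunk just below switches it to ${KERNELFILES}.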
-EXPORT_MD_LIST = ${DATAFILES}
+EXPORT_MD_LIST = ${KERNELFILES}
 EXPORT_MD_DIR = i386
diff --git a/bsd/i386/_types.h b/bsd/i386/_types.h
new file mode 100644
index 000000000..2a69df571
--- /dev/null
+++ b/bsd/i386/_types.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+#ifndef _BSD_I386__TYPES_H_
+#define _BSD_I386__TYPES_H_
+
+/*
+ * This header file contains integer types. It's intended to also contain
+ * floating point and other arithmetic types, as needed, later.
+ */
+
+#ifdef __GNUC__
+typedef __signed char __int8_t;
+#else /* !__GNUC__ */
+typedef char __int8_t;
+#endif /* !__GNUC__ */
+typedef unsigned char __uint8_t;
+typedef short __int16_t;
+typedef unsigned short __uint16_t;
+typedef int __int32_t;
+typedef unsigned int __uint32_t;
+typedef long long __int64_t;
+typedef unsigned long long __uint64_t;
+
+typedef long __darwin_intptr_t;
+typedef unsigned int __darwin_natural_t;
+
+/*
+ * The rune type below is declared to be an ``int'' instead of the more natural
+ * ``unsigned long'' or ``long''. Two things are happening here. It is not
+ * unsigned so that EOF (-1) can be naturally assigned to it and used. Also,
+ * it looks like 10646 will be a 31 bit standard. This means that if your
+ * ints cannot hold 32 bits, you will be in trouble. The reason an int was
+ * chosen over a long is that the is*() and to*() routines take ints (says
+ * ANSI C), but they use __darwin_ct_rune_t instead of int. By changing it
+ * here, you lose a bit of ANSI conformance, but your programs will still
+ * work.
+ *
+ * NOTE: rune_t is not covered by ANSI nor other standards, and should not
+ * be instantiated outside of lib/libc/locale. Use wchar_t. wchar_t and
+ * rune_t must be the same type. Also wint_t must be no narrower than
+ * wchar_t, and should also be able to hold all members of the largest
+ * character set plus one extra value (WEOF). wint_t must be at least 16 bits.
+ */
+
+typedef int __darwin_ct_rune_t; /* ct_rune_t */
+
+/*
+ * mbstate_t is an opaque object to keep conversion state, during multibyte
+ * stream conversions. The content must not be referenced by user programs.
+ */ +typedef union { + char __mbstate8[128]; + long long _mbstateL; /* for alignment */ +} __mbstate_t; + +typedef __mbstate_t __darwin_mbstate_t; /* mbstate_t */ + +#if defined(__GNUC__) && defined(__PTRDIFF_TYPE__) +typedef __PTRDIFF_TYPE__ __darwin_ptrdiff_t; /* ptr1 - ptr2 */ +#else +typedef int __darwin_ptrdiff_t; /* ptr1 - ptr2 */ +#endif /* __GNUC__ */ + +#if defined(__GNUC__) && defined(__SIZE_TYPE__) +typedef __SIZE_TYPE__ __darwin_size_t; /* sizeof() */ +#else +typedef unsigned long __darwin_size_t; /* sizeof() */ +#endif + +#ifdef KERNEL +typedef void * __darwin_va_list; /* va_list */ +#else /* !KERNEL */ +#if (__GNUC__ > 2) +typedef __builtin_va_list __darwin_va_list; /* va_list */ +#else +typedef void * __darwin_va_list; /* va_list */ +#endif +#endif /* KERNEL */ + +#if defined(__GNUC__) && defined(__WCHAR_TYPE__) +typedef __WCHAR_TYPE__ __darwin_wchar_t; /* wchar_t */ +#else +typedef __darwin_ct_rune_t __darwin_wchar_t; /* wchar_t */ +#endif + +typedef __darwin_wchar_t __darwin_rune_t; /* rune_t */ + +#if defined(__GNUC__) && defined(__WINT_TYPE__) +typedef __WINT_TYPE__ __darwin_wint_t; /* wint_t */ +#else +typedef __darwin_ct_rune_t __darwin_wint_t; /* wint_t */ +#endif + +typedef unsigned long __darwin_clock_t; /* clock() */ +typedef __uint32_t __darwin_socklen_t; /* socklen_t (duh) */ +typedef long __darwin_ssize_t; /* byte count or error */ +typedef long __darwin_time_t; /* time() */ + +#endif /* _BSD_I386__TYPES_H_ */ diff --git a/bsd/i386/cpu.h b/bsd/i386/cpu.h deleted file mode 100644 index fbace41fc..000000000 --- a/bsd/i386/cpu.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * HISTORY - * - */ - -#ifndef _BSD_I386_CPU_H_ -#define _BSD_I386_CPU_H_ - -#include <sys/appleapiopts.h> - -#ifdef __APPLE_API_OBSOLETE -#define cpu_number() (0) -#endif /* __APPLE_API_OBSOLETE */ - -#endif /* _BSD_I386_CPU_H_ */ diff --git a/bsd/i386/endian.h b/bsd/i386/endian.h index 2d9a1d3a1..3e42f8a8b 100644 --- a/bsd/i386/endian.h +++ b/bsd/i386/endian.h @@ -71,52 +71,25 @@ #define _QUAD_HIGHWORD 1 #define _QUAD_LOWWORD 0 -#if defined(KERNEL) || !defined(_POSIX_SOURCE) /* * Definitions for byte order, according to byte significance from low * address to high. 
*/ -#define LITTLE_ENDIAN 1234 /* LSB first: i386, vax */ -#define BIG_ENDIAN 4321 /* MSB first: 68000, ibm, net */ -#define PDP_ENDIAN 3412 /* LSB first in word, MSW first in long */ +#define __DARWIN_LITTLE_ENDIAN 1234 /* LSB first: i386, vax */ +#define __DARWIN_BIG_ENDIAN 4321 /* MSB first: 68000, ibm, net */ +#define __DARWIN_PDP_ENDIAN 3412 /* LSB first in word, MSW first in long */ -#define BYTE_ORDER LITTLE_ENDIAN +#define __DARWIN_BYTE_ORDER __DARWIN_LITTLE_ENDIAN -#include <sys/cdefs.h> +#if defined(KERNEL) || !defined(_POSIX_C_SOURCE) -__BEGIN_DECLS -unsigned long htonl __P((unsigned long)); -unsigned short htons __P((unsigned short)); -unsigned long ntohl __P((unsigned long)); -unsigned short ntohs __P((unsigned short)); -__END_DECLS +#define LITTLE_ENDIAN __DARWIN_LITTLE_ENDIAN +#define BIG_ENDIAN __DARWIN_BIG_ENDIAN +#define PDP_ENDIAN __DARWIN_PDP_ENDIAN -/* - * Macros for network/external number representation conversion. - */ -#if BYTE_ORDER == BIG_ENDIAN && !defined(lint) -#define ntohl(x) (x) -#define ntohs(x) (x) -#define htonl(x) (x) -#define htons(x) (x) - -#define NTOHL(x) (x) -#define NTOHS(x) (x) -#define HTONL(x) (x) -#define HTONS(x) (x) +#define BYTE_ORDER __DARWIN_BYTE_ORDER -#else -#include <machine/byte_order.h> - -#define ntohl(x) NXSwapBigLongToHost(x) -#define ntohs(x) NXSwapBigShortToHost(x) -#define htonl(x) NXSwapHostLongToBig(x) -#define htons(x) NXSwapHostShortToBig(x) +#include <sys/_endian.h> -#define NTOHL(x) (x) = ntohl((u_long)x) -#define NTOHS(x) (x) = ntohs((u_short)x) -#define HTONL(x) (x) = htonl((u_long)x) -#define HTONS(x) (x) = htons((u_short)x) -#endif -#endif /* defined(KERNEL) || !defined(_POSIX_SOURCE) */ +#endif /* defined(KERNEL) || !defined(_POSIX_C_SOURCE) */ #endif /* !_I386__ENDIAN_H_ */ diff --git a/bsd/i386/exec.h b/bsd/i386/exec.h index 237ecd5fc..882e9cd79 100644 --- a/bsd/i386/exec.h +++ b/bsd/i386/exec.h @@ -58,9 +58,7 @@ #define _BSD_I386_EXEC_H_ -#include <sys/appleapiopts.h> - -#ifdef __APPLE_API_OBSOLETE +#ifdef BSD_KERNEL_PRIVATE /* Size of a page in an object file. */ #define __LDPGSZ 4096 @@ -111,11 +109,6 @@ struct exec { unsigned int a_drsize; /* data relocation size */ }; -/* - * Address of ps_strings structure (in user space). - */ -#define PS_STRINGS \ - ((struct ps_strings *)(USRSTACK - sizeof(struct ps_strings))) -#endif /* __APPLE_API_OBSOLETE */ +#endif /* BSD_KERNEL_PRIVATE */ #endif /* _BSD_I386_EXEC_H_ */ diff --git a/bsd/i386/label_t.h b/bsd/i386/label_t.h deleted file mode 100644 index f47065aff..000000000 --- a/bsd/i386/label_t.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. 
- * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1992 NeXT Computer, Inc. - * - * Intel386 Family: For setjmp/longjmp (kernel version). - * - */ - -#ifndef _BSD_I386_LABEL_T_H_ -#define _BSD_I386_LABEL_T_H_ - -#include <sys/appleapiopts.h> - -#ifdef __APPLE_API_OBSOLETE -typedef struct label_t { - int val[14]; -} label_t; -#endif /* __APPLE_API_OBSOLETE */ - -#endif /* _BSD_I386_LABEL_T_H_ */ diff --git a/bsd/i386/param.h b/bsd/i386/param.h index f5e43d3f6..6be5ae90a 100644 --- a/bsd/i386/param.h +++ b/bsd/i386/param.h @@ -68,11 +68,11 @@ /* * Round p (pointer or byte index) up to a correctly-aligned value for all - * data types (int, long, ...). The result is u_int and must be cast to - * any desired pointer type. + * data types (int, long, ...). The result is unsigned int and must be + * cast to any desired pointer type. */ #define ALIGNBYTES 3 -#define ALIGN(p) (((u_int)(p) + ALIGNBYTES) &~ ALIGNBYTES) +#define ALIGN(p) (((unsigned int)(p) + ALIGNBYTES) &~ ALIGNBYTES) #define NBPG 4096 /* bytes/page */ #define PGOFSET (NBPG-1) /* byte offset into page */ @@ -83,8 +83,6 @@ #define BLKDEV_IOSIZE 2048 #define MAXPHYS (64 * 1024) /* max raw I/O transfer size */ -#define STACK_GROWS_UP 0 /* stack grows to lower addresses */ - #define CLSIZE 1 #define CLSIZELOG2 0 diff --git a/bsd/i386/reboot.h b/bsd/i386/reboot.h index dad563257..0724538c9 100644 --- a/bsd/i386/reboot.h +++ b/bsd/i386/reboot.h @@ -29,8 +29,7 @@ #include <sys/appleapiopts.h> -#ifdef KERNEL_PRIVATE -#ifdef __APPLE_API_PRIVATE +#ifdef BSD_KERNEL_PRIVATE /* * Use most significant 16 bits to avoid collisions with @@ -46,7 +45,6 @@ #define RB_BOOTDOS 0x00800000 /* reboot into DOS */ #define RB_PRETTY 0x01000000 /* shutdown with pretty graphics */ -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL_PRIVATE */ +#endif /* BSD_KERNEL_PRIVATE */ #endif /* _BSD_I386_REBOOT_H_ */ diff --git a/bsd/i386/setjmp.h b/bsd/i386/setjmp.h index b2584eb47..4d7ba1573 100644 --- a/bsd/i386/setjmp.h +++ b/bsd/i386/setjmp.h @@ -29,26 +29,39 @@ #define _BSD_I386_SETJMP_H #include <sys/cdefs.h> -#include <i386/signal.h> +#include <machine/signal.h> -typedef struct sigcontext jmp_buf[1]; +/* + * _JBLEN is number of ints required to save the following: + * eax, ebx, ecx, edx, edi, esi, ebp, esp, ss, eflags, eip, + * cs, de, es, fs, gs == 16 ints + * onstack, mask = 2 ints + */ +#if defined(KERNEL) +typedef struct sigcontext jmp_buf[1]; #define _JBLEN ((sizeof(struct sigcontext)) / sizeof(int)) typedef int sigjmp_buf[_JBLEN+1]; +#else +#define _JBLEN (18) +typedef int jmp_buf[_JBLEN]; +typedef int sigjmp_buf[_JBLEN + 1]; +#endif + __BEGIN_DECLS -extern int setjmp __P((jmp_buf env)); -extern void longjmp __P((jmp_buf env, int val)); +extern int setjmp(jmp_buf env); +extern void longjmp(jmp_buf env, int val); #ifndef _ANSI_SOURCE -int sigsetjmp __P((sigjmp_buf env, int val)); -void siglongjmp __P((sigjmp_buf env, int val)); +int _setjmp(jmp_buf env); +void _longjmp(jmp_buf, int val); +int sigsetjmp(sigjmp_buf env, int val); +void siglongjmp(sigjmp_buf env, int val); #endif /* _ANSI_SOURCE */ -#if !defined(_ANSI_SOURCE) && !defined(_POSIX_SOURCE) -int _setjmp __P((jmp_buf env)); -void _longjmp __P((jmp_buf, int val)); -void longjmperror __P((void)); +#if !defined(_ANSI_SOURCE) && !defined(_POSIX_C_SOURCE) +void longjmperror(void); #endif /* neither ANSI nor POSIX */ __END_DECLS #endif /* !_BSD_I386_SETJMP_H */ diff --git a/bsd/i386/signal.h b/bsd/i386/signal.h index 3c73b16a9..d1316b5df 100644 --- a/bsd/i386/signal.h +++ b/bsd/i386/signal.h @@ 
-27,10 +27,14 @@ #ifndef _i386_SIGNAL_ #define _i386_SIGNAL_ 1 +#ifndef _ANSI_SOURCE +typedef int sig_atomic_t; + +#ifndef _POSIX_C_SOURCE + #include <sys/appleapiopts.h> #ifdef __APPLE_API_OBSOLETE -typedef int sig_atomic_t; /* * Information pushed on stack when a signal is delivered. @@ -61,6 +65,8 @@ struct sigcontext { }; #endif /* __APPLE_API_OBSOLETE */ +#endif /* ! _POSIX_C_SOURCE */ +#endif /* ! _ANSI_SOURCE */ #endif /* _i386_SIGNAL_ */ diff --git a/bsd/i386/spl.h b/bsd/i386/spl.h deleted file mode 100644 index 0f6be5565..000000000 --- a/bsd/i386/spl.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -#ifndef _BSD_I386_SPL_H_ -#define _BSD_I386_SPL_H_ - -#ifdef KERNEL -#ifndef __ASSEMBLER__ -/* - * Machine-dependent SPL definitions. - * - */ -typedef unsigned spl_t; - -extern unsigned sploff(void); -extern unsigned splhigh(void); -extern unsigned splsched(void); -extern unsigned splclock(void); -extern unsigned splpower(void); -extern unsigned splvm(void); -extern unsigned splbio(void); -extern unsigned splimp(void); -extern unsigned spltty(void); -extern unsigned splnet(void); -extern unsigned splsoftclock(void); - -extern void spllo(void); -extern void splon(unsigned level); -extern void splx(unsigned level); -extern void spln(unsigned level); -#define splstatclock() splhigh() - -#endif /* __ASSEMBLER__ */ - -#endif - -#endif /* _BSD_I386_SPL_H_ */ diff --git a/bsd/i386/table.h b/bsd/i386/table.h deleted file mode 100644 index f55484ddc..000000000 --- a/bsd/i386/table.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1989 Next, Inc. - */ - -#ifndef _BSD_I386_TABLE_H_ -#define _BSD_I386_TABLE_H_ - -/* - * Empty file. 
- */ - -#endif /* _BSD_I386_TABLE_H_ */ diff --git a/bsd/i386/types.h b/bsd/i386/types.h index f370e9bf1..ab1c10837 100644 --- a/bsd/i386/types.h +++ b/bsd/i386/types.h @@ -61,26 +61,59 @@ #define _MACHTYPES_H_ #ifndef __ASSEMBLER__ +#include <i386/_types.h> #include <sys/cdefs.h> /* * Basic integral types. Omit the typedef if * not possible for a machine/compiler combination. */ +#ifndef _INT8_T +#define _INT8_T typedef __signed char int8_t; +#endif typedef unsigned char u_int8_t; +#ifndef _INT16_T +#define _INT16_T typedef short int16_t; +#endif typedef unsigned short u_int16_t; +#ifndef _INT32_T +#define _INT32_T typedef int int32_t; +#endif typedef unsigned int u_int32_t; +#ifndef _INT64_T +#define _INT64_T typedef long long int64_t; +#endif typedef unsigned long long u_int64_t; typedef int32_t register_t; -typedef long int intptr_t; +#ifndef _INTPTR_T +#define _INTPTR_T +typedef __darwin_intptr_t intptr_t; +#endif +#ifndef _UINTPTR_T +#define _UINTPTR_T typedef unsigned long int uintptr_t; +#endif +/* These types are used for reserving the largest possible size. */ +// LP64todo - typedef mach_vm_address_t user_addr_t; /* varying length pointers from user space */ +// LP64todo - typedef mach_vm_size_t user_size_t; /* varying length values from user space (unsigned) */ +typedef u_int32_t user_addr_t; +typedef u_int32_t user_size_t; +typedef int32_t user_ssize_t; +typedef int32_t user_long_t; +typedef u_int32_t user_ulong_t; +typedef int32_t user_time_t; +#define USER_ADDR_NULL ((user_addr_t) 0) +#define CAST_USER_ADDR_T(a_ptr) ((user_addr_t)(a_ptr)) + +#ifndef __offsetof #define __offsetof(type, field) ((size_t)(&((type *)0)->field)) +#endif #endif /* __ASSEMBLER__ */ #endif /* _MACHTYPES_H_ */ diff --git a/bsd/i386/ucontext.h b/bsd/i386/ucontext.h index 8ba671a2f..7f7a04474 100644 --- a/bsd/i386/ucontext.h +++ b/bsd/i386/ucontext.h @@ -26,26 +26,42 @@ #include <sys/appleapiopts.h> #include <mach/thread_status.h> -#include <signal.h> - #ifdef __APPLE_API_UNSTABLE /* WARNING: THIS WILL CHANGE; DO NOT COUNT ON THIS */ /* Needs to be finalized as to what it should contain */ -struct mcontext { +#ifndef _POSIX_C_SOURCE +struct mcontext +#else /* _POSIX_C_SOURCE */ +struct __darwin_mcontext +#endif /* _POSIX_C_SOURCE */ +{ struct sigcontext sc; }; +#ifndef _POSIX_C_SOURCE #define I386_MCONTEXT_SIZE sizeof(struct mcontext) +#endif /* _POSIX_C_SOURCE */ + +#ifndef _MCONTEXT_T +#define _MCONTEXT_T +typedef __darwin_mcontext_t mcontext_t; +#endif -typedef struct mcontext * mcontext_t; +#ifndef _POSIX_C_SOURCE -struct mcontext64 { +struct mcontext64 +{ struct sigcontext sc; }; #define I386_MCONTEXT64_SIZE sizeof(struct mcontext64) +#ifndef _MCONTEXT64_T +#define _MCONTEXT64_T typedef struct mcontext64 * mcontext64_t; +#endif + +#endif /* _POSIX_C_SOURCE */ #endif /* __APPLE_API_UNSTABLE */ diff --git a/bsd/i386/vmparam.h b/bsd/i386/vmparam.h index cba3d8b69..ffb77f55b 100644 --- a/bsd/i386/vmparam.h +++ b/bsd/i386/vmparam.h @@ -25,7 +25,7 @@ #include <sys/resource.h> -#define USRSTACK 0xbfff9000 +#define USRSTACK (0xC0000000) /* * Virtual memory related constants, all in bytes diff --git a/bsd/isofs/cd9660/Makefile b/bsd/isofs/cd9660/Makefile index 27f7df03a..fe4e6d369 100644 --- a/bsd/isofs/cd9660/Makefile +++ b/bsd/isofs/cd9660/Makefile @@ -26,9 +26,9 @@ INSTALL_MI_LIST = ${DATAFILES} INSTALL_MI_DIR = isofs/cd9660 -EXPORT_MI_LIST = ${DATAFILES} +EXPORT_MI_LIST = -EXPORT_MI_DIR = isofs/cd9660 +EXPORT_MI_DIR = include $(MakeInc_rule) diff --git a/bsd/isofs/cd9660/cd9660_bmap.c 
b/bsd/isofs/cd9660/cd9660_bmap.c index f01787f5f..53cbb55d9 100644 --- a/bsd/isofs/cd9660/cd9660_bmap.c +++ b/bsd/isofs/cd9660/cd9660_bmap.c @@ -65,84 +65,15 @@ #include <sys/vnode.h> #include <sys/mount.h> #include <sys/namei.h> -#include <sys/buf.h> #include <sys/file.h> #include <isofs/cd9660/iso.h> #include <isofs/cd9660/cd9660_node.h> -/* - * Bmap converts the logical block number of a file to its physical block - * number on the disk. The conversion is done by using the logical block - * number to index into the data block (extent) for the file. - */ -int -cd9660_bmap(ap) - struct vop_bmap_args /* { - struct vnode *a_vp; - daddr_t a_bn; - struct vnode **a_vpp; - daddr_t *a_bnp; - int *a_runp; - } */ *ap; -{ - struct iso_node *ip = VTOI(ap->a_vp); - daddr_t lblkno = ap->a_bn; - int bshift; - - /* - * Check for underlying vnode requests and ensure that logical - * to physical mapping is requested. - */ - if (ap->a_vpp != NULL) - *ap->a_vpp = ip->i_devvp; - if (ap->a_bnp == NULL) - return (0); - - /* - * Associated files have an Apple Double header - */ - if ((ip->i_flag & ISO_ASSOCIATED) && (lblkno > (ADH_BLKS - 1))) { - lblkno -= ADH_BLKS; - *ap->a_bnp = (ip->iso_start + lblkno); - if (ap->a_runp) - *ap->a_runp = 0; - return (0); - } - - /* - * Compute the requested block number - */ - bshift = ip->i_mnt->im_bshift; - *ap->a_bnp = (ip->iso_start + lblkno); - - /* - * Determine maximum number of readahead blocks following the - * requested block. - */ - if (ap->a_runp) { - int nblk; - - nblk = (ip->i_size >> bshift) - (lblkno + 1); - if (nblk <= 0) - *ap->a_runp = 0; - else if (nblk >= (MAXBSIZE >> bshift)) - *ap->a_runp = (MAXBSIZE >> bshift) - 1; - else - *ap->a_runp = nblk; - } - - return (0); -} /* blktooff converts a logical block number to a file offset */ int -cd9660_blktooff(ap) - struct vop_blktooff_args /* { - struct vnode *a_vp; - daddr_t a_lblkno; - off_t *a_offset; - } */ *ap; +cd9660_blktooff(struct vnop_blktooff_args *ap) { register struct iso_node *ip; register struct iso_mnt *imp; @@ -159,12 +90,7 @@ cd9660_blktooff(ap) /* offtoblk converts a file offset to a logical block number */ int -cd9660_offtoblk(ap) -struct vop_offtoblk_args /* { - struct vnode *a_vp; - off_t a_offset; - daddr_t *a_lblkno; - } */ *ap; +cd9660_offtoblk(struct vnop_offtoblk_args *ap) { register struct iso_node *ip; register struct iso_mnt *imp; @@ -175,20 +101,12 @@ struct vop_offtoblk_args /* { ip = VTOI(ap->a_vp); imp = ip->i_mnt; - *ap->a_lblkno = (daddr_t)lblkno(imp, ap->a_offset); + *ap->a_lblkno = (daddr64_t)lblkno(imp, ap->a_offset); return (0); } int -cd9660_cmap(ap) -struct vop_cmap_args /* { - struct vnode *a_vp; - off_t a_offset; - size_t a_size; - daddr_t *a_bpn; - size_t *a_run; - void *a_poff; -} */ *ap; +cd9660_blockmap(struct vnop_blockmap_args *ap) { struct iso_node *ip = VTOI(ap->a_vp); size_t cbytes; @@ -202,7 +120,7 @@ struct vop_cmap_args /* { if (ap->a_bpn == NULL) return (0); - VOP_DEVBLOCKSIZE(ip->i_devvp, &devBlockSize); + devBlockSize = vfs_devblocksize(vnode_mount(ap->a_vp)); /* * Associated files have an Apple Double header @@ -211,14 +129,14 @@ struct vop_cmap_args /* { if (offset < ADH_SIZE) { if (ap->a_run) *ap->a_run = 0; - *ap->a_bpn = -1; + *ap->a_bpn = (daddr64_t)-1; goto out; } else { offset -= ADH_SIZE; } } - *ap->a_bpn = (daddr_t)(ip->iso_start + lblkno(ip->i_mnt, offset)); + *ap->a_bpn = (daddr64_t)(ip->iso_start + lblkno(ip->i_mnt, offset)); /* * Determine maximum number of contiguous bytes following the diff --git a/bsd/isofs/cd9660/cd9660_lookup.c 
b/bsd/isofs/cd9660/cd9660_lookup.c index 34aaadd58..2ecf2568e 100644 --- a/bsd/isofs/cd9660/cd9660_lookup.c +++ b/bsd/isofs/cd9660/cd9660_lookup.c @@ -115,12 +115,7 @@ struct nchstats iso_nchstats; * NOTE: (LOOKUP | LOCKPARENT) currently returns the parent inode unlocked. */ int -cd9660_lookup(ap) - struct vop_lookup_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - } */ *ap; +cd9660_lookup(struct vnop_lookup_args *ap) { register struct vnode *vdp; /* vnode for directory being searched */ register struct iso_node *dp; /* inode for directory being searched */ @@ -150,8 +145,8 @@ cd9660_lookup(ap) struct componentname *cnp = ap->a_cnp; int flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; - struct proc *p = cnp->cn_proc; - int devBlockSize=0; + vfs_context_t ctx = cnp->cn_context; + struct proc *p = vfs_context_proc(ctx); size_t altlen; bp = NULL; @@ -164,14 +159,6 @@ cd9660_lookup(ap) wantassoc = 0; - /* - * Check accessiblity of directory. - */ - if (vdp->v_type != VDIR) - return (ENOTDIR); - if ( (error = VOP_ACCESS(vdp, VEXEC, cnp->cn_cred, p)) ) - return (error); - /* * We now have a segment name to search for, and a directory to search. * @@ -180,48 +167,9 @@ cd9660_lookup(ap) * we are looking for is known already. */ if ((error = cache_lookup(vdp, vpp, cnp))) { - int vpid; /* capability number of vnode */ - if (error == ENOENT) return (error); - /* - * Get the next vnode in the path. - * See comment below starting `Step through' for - * an explaination of the locking protocol. - */ - pdp = vdp; - dp = VTOI(*vpp); - vdp = *vpp; - vpid = vdp->v_id; - if (pdp == vdp) { - VREF(vdp); - error = 0; - } else if (flags & ISDOTDOT) { - VOP_UNLOCK(pdp, 0, p); - error = vget(vdp, LK_EXCLUSIVE | LK_RETRY, p); - if (!error && lockparent && (flags & ISLASTCN)) - error = VOP_LOCK(pdp, LK_EXCLUSIVE | LK_RETRY, p); - } else { - error = vget(vdp, LK_EXCLUSIVE | LK_RETRY, p); - if (!lockparent || error || !(flags & ISLASTCN)) - VOP_UNLOCK(pdp, 0, p); - } - /* - * Check that the capability number did not change - * while we were waiting for the lock. 
- */ - if (!error) { - if (vpid == vdp->v_id) - return (0); - vput(vdp); - if (lockparent && pdp != vdp && (flags & ISLASTCN)) - VOP_UNLOCK(pdp, 0, p); - } - if ( (error = VOP_LOCK(pdp, LK_EXCLUSIVE | LK_RETRY, p)) ) - return (error); - vdp = pdp; - dp = VTOI(pdp); - *vpp = NULL; + return (0); } len = cnp->cn_namelen; @@ -241,13 +189,13 @@ cd9660_lookup(ap) */ if ((imp->iso_ftype == ISO_FTYPE_JOLIET) && !((len == 1 && *name == '.') || (flags & ISDOTDOT))) { - int flags = UTF_PRECOMPOSED; + int flags1 = UTF_PRECOMPOSED; if (BYTE_ORDER != BIG_ENDIAN) - flags |= UTF_REVERSE_ENDIAN; + flags1 |= UTF_REVERSE_ENDIAN; (void) utf8_decodestr(name, len, (u_int16_t*) altname, &altlen, - sizeof(altname), 0, flags); + sizeof(altname), 0, flags1); name = altname; len = altlen; } @@ -272,7 +220,7 @@ cd9660_lookup(ap) dp->i_offset = dp->i_diroff; if ((entryoffsetinblock = dp->i_offset & bmask) && - (error = VOP_BLKATOFF(vdp, SECTOFF(imp, dp->i_offset), NULL, &bp))) + (error = cd9660_blkatoff(vdp, SECTOFF(imp, dp->i_offset), NULL, &bp))) return (error); numdirpasses = 2; iso_nchstats.ncs_2passes++; @@ -288,8 +236,8 @@ searchloop: */ if ((dp->i_offset & bmask) == 0) { if (bp != NULL) - brelse(bp); - if ( (error = VOP_BLKATOFF(vdp, SECTOFF(imp,dp->i_offset), NULL, &bp)) ) + buf_brelse(bp); + if ( (error = cd9660_blkatoff(vdp, SECTOFF(imp,dp->i_offset), NULL, &bp)) ) return (error); entryoffsetinblock = 0; } @@ -297,7 +245,7 @@ searchloop: * Get pointer to next entry. */ ep = (struct iso_directory_record *) - ((char *)bp->b_data + entryoffsetinblock); + ((char *)buf_dataptr(bp) + entryoffsetinblock); reclen = isonum_711(ep->length); if (reclen == 0) { @@ -328,7 +276,7 @@ searchloop: if (isoflags & directoryBit) ino = isodirino(ep, imp); else - ino = (bp->b_blkno << imp->im_bshift) + entryoffsetinblock; + ino = ((daddr_t)buf_blkno(bp) << imp->im_bshift) + entryoffsetinblock; dp->i_ino = ino; cd9660_rrip_getname(ep,altname,&namelen,&dp->i_ino,imp); if (namelen == cnp->cn_namelen @@ -357,14 +305,14 @@ searchloop: if ( isoflags & directoryBit ) ino = isodirino(ep, imp); else - ino = (bp->b_blkno << imp->im_bshift) + entryoffsetinblock; + ino = ((daddr_t)buf_blkno(bp) << imp->im_bshift) + entryoffsetinblock; saveoffset = dp->i_offset; } else if (imp->iso_ftype == ISO_FTYPE_JOLIET && !(res = ucsfncmp((u_int16_t*)name, len, (u_int16_t*) ep->name, namelen))) { if ( isoflags & directoryBit ) ino = isodirino(ep, imp); else - ino = (bp->b_blkno << imp->im_bshift) + entryoffsetinblock; + ino = ((daddr_t)buf_blkno(bp) << imp->im_bshift) + entryoffsetinblock; saveoffset = dp->i_offset; } else if (ino) goto foundino; @@ -387,13 +335,13 @@ foundino: if (lblkno(imp, dp->i_offset) != lblkno(imp, saveoffset)) { if (bp != NULL) - brelse(bp); - if ( (error = VOP_BLKATOFF(vdp, SECTOFF(imp, saveoffset), NULL, &bp)) ) + buf_brelse(bp); + if ( (error = cd9660_blkatoff(vdp, SECTOFF(imp, saveoffset), NULL, &bp)) ) return (error); } entryoffsetinblock = saveoffset & bmask; ep = (struct iso_directory_record *) - ((char *)bp->b_data + entryoffsetinblock); + ((char *)buf_dataptr(bp) + entryoffsetinblock); dp->i_offset = saveoffset; } goto found; @@ -410,20 +358,13 @@ notfound: goto searchloop; } if (bp != NULL) - brelse(bp); + buf_brelse(bp); /* * Insert name into cache (as non-existent) if appropriate. */ if (cnp->cn_flags & MAKEENTRY) cache_enter(vdp, *vpp, cnp); - if (nameiop == CREATE || nameiop == RENAME) { - /* - * return EROFS (NOT EJUSTRETURN). The caller will then unlock - * the parent for us. 
- */ - return (EROFS); - } return (ENOENT); found: @@ -463,44 +404,26 @@ found: * it's a relocated directory. */ if (flags & ISDOTDOT) { - VOP_UNLOCK(pdp, 0, p); /* race to get the inode */ - error = cd9660_vget_internal(vdp->v_mount, dp->i_ino, &tdp, + error = cd9660_vget_internal(vnode_mount(vdp), dp->i_ino, &tdp, NULL, NULL, dp->i_ino != ino, ep, p); VTOI(tdp)->i_parent = VTOI(pdp)->i_number; - brelse(bp); - if (error) { - VOP_LOCK(pdp, LK_EXCLUSIVE | LK_RETRY, p); - return (error); - } - if (lockparent && (flags & ISLASTCN) && - (error = VOP_LOCK(pdp, LK_EXCLUSIVE | LK_RETRY, p))) { - vput(tdp); - return (error); - } + buf_brelse(bp); + *vpp = tdp; } else if (dp->i_number == dp->i_ino) { - brelse(bp); - VREF(vdp); /* we want ourself, ie "." */ + buf_brelse(bp); + vnode_get(vdp); /* we want ourself, ie "." */ *vpp = vdp; } else { - error = cd9660_vget_internal(vdp->v_mount, dp->i_ino, &tdp, + error = cd9660_vget_internal(vnode_mount(vdp), dp->i_ino, &tdp, vdp, cnp, dp->i_ino != ino, ep, p); /* save parent inode number */ VTOI(tdp)->i_parent = VTOI(pdp)->i_number; - brelse(bp); + buf_brelse(bp); if (error) return (error); - if (!lockparent || !(flags & ISLASTCN)) - VOP_UNLOCK(pdp, 0, p); *vpp = tdp; } - - /* - * Insert name into cache if appropriate. - */ - if (cnp->cn_flags & MAKEENTRY) - cache_enter(vdp, *vpp, cnp); - return (0); } @@ -511,37 +434,32 @@ found: * remaining space in the directory. */ int -cd9660_blkatoff(ap) - struct vop_blkatoff_args /* { - struct vnode *a_vp; - off_t a_offset; - char **a_res; - struct buf **a_bpp; - } */ *ap; +cd9660_blkatoff(vnode_t vp, off_t offset, char **res, buf_t *bpp) { struct iso_node *ip; register struct iso_mnt *imp; - struct buf *bp; + buf_t bp; daddr_t lbn; int bsize, error; - ip = VTOI(ap->a_vp); + ip = VTOI(vp); imp = ip->i_mnt; - lbn = lblkno(imp, ap->a_offset); + lbn = lblkno(imp, offset); bsize = blksize(imp, ip, lbn); + if ((bsize != imp->im_sector_size) && - (ap->a_offset & (imp->im_sector_size - 1)) == 0) { + (offset & (imp->im_sector_size - 1)) == 0) { bsize = imp->im_sector_size; } - if ( (error = bread(ap->a_vp, lbn, bsize, NOCRED, &bp)) ) { - brelse(bp); - *ap->a_bpp = NULL; + if ( (error = (int)buf_bread(vp, (daddr64_t)((unsigned)lbn), bsize, NOCRED, &bp)) ) { + buf_brelse(bp); + *bpp = NULL; return (error); } - if (ap->a_res) - *ap->a_res = (char *)bp->b_data + blkoff(imp, ap->a_offset); - *ap->a_bpp = bp; + if (res) + *res = (char *)buf_dataptr(bp) + blkoff(imp, offset); + *bpp = bp; return (0); } diff --git a/bsd/isofs/cd9660/cd9660_mount.h b/bsd/isofs/cd9660/cd9660_mount.h index 462b612ae..c673db1b1 100644 --- a/bsd/isofs/cd9660/cd9660_mount.h +++ b/bsd/isofs/cd9660/cd9660_mount.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -63,6 +63,7 @@ #define __ISOFS_CD9660_CD9660_MOUNT_H__ #include <sys/appleapiopts.h> +#include <sys/cdefs.h> #ifdef __APPLE_API_UNSTABLE /* @@ -70,8 +71,9 @@ */ struct CDTOC; struct iso_args { +#ifndef KERNEL char *fspec; /* block special device to mount */ - struct export_args export; /* network export info */ +#endif int flags; /* mounting flags, see below */ int ssector; /* starting sector, 0 for 1st session */ int toc_length; /* Size of *toc, including the toc.length field */ @@ -83,5 +85,29 @@ struct iso_args { #define ISOFSMNT_NOJOLIET 0x00000008 /* disable Joliet Ext.*/ #define ISOFSMNT_TOC 0x00000010 /* iso_args.toc is valid */ +#ifdef KERNEL +/* LP64 version of iso_args. all pointers + * grow when we're dealing with a 64-bit process. + * WARNING - keep in sync with iso_args + */ +/* LP64todo - should this move? */ + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_iso_args { + int flags; /* mounting flags, see below */ + int ssector; /* starting sector, 0 for 1st session */ + int toc_length; /* Size of *toc, including the toc.length field */ + user_addr_t toc; +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +#endif /* KERNEL */ + #endif /* __APPLE_API_UNSTABLE */ #endif /* __ISOFS_CD9660_CD9660_MOUNT_H__ */ diff --git a/bsd/isofs/cd9660/cd9660_node.c b/bsd/isofs/cd9660/cd9660_node.c index 5ff664592..4418c9147 100644 --- a/bsd/isofs/cd9660/cd9660_node.c +++ b/bsd/isofs/cd9660/cd9660_node.c @@ -107,7 +107,7 @@ extern u_char isonullname[]; * Initialize hash links for inodes and dnodes. */ int -cd9660_init() +cd9660_init(__unused struct vfsconf *cp) { isohashtbl = hashinit(desiredvnodes, M_ISOFSMNT, &isohash); @@ -122,10 +122,7 @@ cd9660_init() * Enter a new node into the device hash list */ struct iso_dnode * -iso_dmap(device, inum, create) - dev_t device; - ino_t inum; - int create; +iso_dmap(dev_t device, ino_t inum, int create) { register struct iso_dnode **dpp, *dp, *dq; @@ -154,8 +151,7 @@ iso_dmap(device, inum, create) } void -iso_dunmap(device) - dev_t device; +iso_dunmap(dev_t device) { struct iso_dnode **dpp, *dp, *dq; @@ -178,48 +174,60 @@ iso_dunmap(device) * to it. If it is in core, but locked, wait for it. */ struct vnode * -cd9660_ihashget(device, inum, p) - dev_t device; - ino_t inum; - struct proc *p; +cd9660_ihashget(dev_t device, ino_t inum, struct proc *p) { register struct iso_node *ip; struct vnode *vp; - - for (;;) - for (ip = isohashtbl[INOHASH(device, inum)];; ip = ip->i_next) { - if (ip == NULL) - return (NULL); - if (inum == ip->i_number && device == ip->i_dev) { - /* - * This is my most dangerous change. I am not waiting for - * the inode lock anymore (ufs doesn't, why should we) and - * I'm worried because there is not lock on the hashtable, - * but there wasn't before so I'll let it go for now. - * -- chw -- + uint32_t vid; + +retry: + for (ip = isohashtbl[INOHASH(device, inum)]; ip; ip = ip->i_next) { + if (inum == ip->i_number && device == ip->i_dev) { + + if (ISSET(ip->i_flag, ISO_INALLOC)) { + /* + * inode is being created... 
wait for it + * to be ready for consumption */ - vp = ITOV(ip); - simple_lock(&vp->v_interlock); - if (!vget(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY, p)) - return (vp); - break; + SET(ip->i_flag, ISO_INWALLOC); + tsleep((caddr_t)ip, PINOD, "cd9960_ihashget", 0); + goto retry; } + vp = ITOV(ip); + /* + * the vid needs to be grabbed before we drop + * lock protecting the hash + */ + vid = vnode_vid(vp); + + /* + * we currently depend on running under the FS funnel + * when we do proper locking and advertise ourselves + * as thread safe, we'll need a lock to protect the + * hash lookup... this is where we would drop it + */ + if (vnode_getwithvid(vp, vid)) { + /* + * If vnode is being reclaimed, or has + * already changed identity, no need to wait + */ + return (NULL); + } + return (vp); } - /* NOTREACHED */ + } + return (NULL); } /* * Insert the inode into the hash table, and return it locked. */ void -cd9660_ihashins(ip) - struct iso_node *ip; +cd9660_ihashins(struct iso_node *ip) { struct iso_node **ipp, *iq; - struct proc *p = current_proc(); /* lock the inode, then put it on the appropriate hash list */ - lockmgr(&ip->i_lock, LK_EXCLUSIVE, (struct slock *)0, p); ipp = &isohashtbl[INOHASH(ip->i_dev, ip->i_number)]; if ((iq = *ipp)) @@ -227,14 +235,13 @@ cd9660_ihashins(ip) ip->i_next = iq; ip->i_prev = ipp; *ipp = ip; - } +} /* * Remove the inode from the hash table. */ void -cd9660_ihashrem(ip) - register struct iso_node *ip; +cd9660_ihashrem(register struct iso_node *ip) { register struct iso_node *iq; @@ -248,73 +255,53 @@ cd9660_ihashrem(ip) } /* - * Last reference to an inode, write the inode out and if necessary, - * truncate and deallocate the file. + * Last reference to an inode... if we're done with + * it, go ahead and recycle it for other use */ int -cd9660_inactive(ap) - struct vop_inactive_args /* { - struct vnode *a_vp; - struct proc *a_p; - } */ *ap; +cd9660_inactive(struct vnop_inactive_args *ap) { - struct vnode *vp = ap->a_vp; - struct proc *p = ap->a_p; - register struct iso_node *ip = VTOI(vp); - int error = 0; + vnode_t vp = ap->a_vp; + struct iso_node *ip = VTOI(vp); - if (prtactive && vp->v_usecount != 0) - vprint("cd9660_inactive: pushing active", vp); - /* - * We need to unlock the inode here. If we don't panics or - * hangs will ensue. Our callers expect us to take care of this. - */ - - VOP_UNLOCK(vp,0,p); - /* * If we are done with the inode, reclaim it * so that it can be reused immediately. */ - if (vp->v_usecount == 0 && ip->inode.iso_mode == 0) - vgone(vp); + if (ip->inode.iso_mode == 0) + vnode_recycle(vp); - return error; + return 0; } /* * Reclaim an inode so that it can be used for other purposes. */ int -cd9660_reclaim(ap) - struct vop_reclaim_args /* { - struct vnode *a_vp; - } */ *ap; +cd9660_reclaim(struct vnop_reclaim_args *ap) { - register struct vnode *vp = ap->a_vp; - register struct iso_node *ip = VTOI(vp); + vnode_t vp = ap->a_vp; + struct iso_node *ip = VTOI(vp); - if (prtactive && vp->v_usecount != 0) - vprint("cd9660_reclaim: pushing active", vp); + vnode_removefsref(vp); /* * Remove the inode from its hash chain. */ cd9660_ihashrem(ip); - /* - * Purge old data structures associated with the inode. 
- */ - cache_purge(vp); + if (ip->i_devvp) { - struct vnode *tvp = ip->i_devvp; + vnode_t devvp = ip->i_devvp; ip->i_devvp = NULL; - vrele(tvp); + vnode_rele(devvp); } + vnode_clearfsnode(vp); + if (ip->i_namep != isonullname) FREE(ip->i_namep, M_TEMP); if (ip->i_riff != NULL) FREE(ip->i_riff, M_TEMP); - FREE_ZONE(vp->v_data, sizeof(struct iso_node), M_ISOFSNODE); - vp->v_data = NULL; + FREE_ZONE(ip, sizeof(struct iso_node), M_ISOFSNODE); + return (0); } @@ -322,10 +309,8 @@ cd9660_reclaim(ap) * File attributes */ void -cd9660_defattr(isodir, inop, bp) - struct iso_directory_record *isodir; - struct iso_node *inop; - struct buf *bp; +cd9660_defattr(struct iso_directory_record *isodir, struct iso_node *inop, + struct buf *bp) { struct buf *bp2 = NULL; struct iso_mnt *imp; @@ -346,12 +331,11 @@ cd9660_defattr(isodir, inop, bp) if (!bp && ((imp = inop->i_mnt)->im_flags & ISOFSMNT_EXTATT) && (off = isonum_711(isodir->ext_attr_length))) { - VOP_BLKATOFF(ITOV(inop), (off_t)-(off << imp->im_bshift), NULL, - &bp2); + cd9660_blkatoff(ITOV(inop), (off_t)-(off << imp->im_bshift), NULL, &bp2); bp = bp2; } if (bp) { - ap = (struct iso_extended_attributes *)bp->b_data; + ap = (struct iso_extended_attributes *)buf_dataptr(bp); if (isonum_711(ap->version) == 1) { if (!(ap->perm[0]&0x40)) @@ -372,22 +356,20 @@ cd9660_defattr(isodir, inop, bp) ap = NULL; } if (!ap) { - inop->inode.iso_mode |= VREAD|VEXEC|(VREAD|VEXEC)>>3|(VREAD|VEXEC)>>6; - inop->inode.iso_uid = (uid_t)0; - inop->inode.iso_gid = (gid_t)0; + inop->inode.iso_mode |= VREAD|VWRITE|VEXEC|(VREAD|VEXEC)>>3|(VREAD|VEXEC)>>6; + inop->inode.iso_uid = ISO_UNKNOWNUID; + inop->inode.iso_gid = ISO_UNKNOWNGID; } if (bp2) - brelse(bp2); + buf_brelse(bp2); } /* * Time stamps */ void -cd9660_deftstamp(isodir,inop,bp) - struct iso_directory_record *isodir; - struct iso_node *inop; - struct buf *bp; +cd9660_deftstamp(struct iso_directory_record *isodir, struct iso_node *inop, + struct buf *bp) { struct buf *bp2 = NULL; struct iso_mnt *imp; @@ -398,11 +380,11 @@ cd9660_deftstamp(isodir,inop,bp) && ((imp = inop->i_mnt)->im_flags & ISOFSMNT_EXTATT) && (off = isonum_711(isodir->ext_attr_length))) { - VOP_BLKATOFF(ITOV(inop), (off_t)-(off << imp->im_bshift), NULL, &bp2); + cd9660_blkatoff(ITOV(inop), (off_t)-(off << imp->im_bshift), NULL, &bp2); bp = bp2; } if (bp) { - ap = (struct iso_extended_attributes *)bp->b_data; + ap = (struct iso_extended_attributes *)buf_dataptr(bp); if (isonum_711(ap->version) == 1) { if (!cd9660_tstamp_conv17(ap->ftime,&inop->inode.iso_atime)) @@ -420,16 +402,14 @@ cd9660_deftstamp(isodir,inop,bp) inop->inode.iso_mtime = inop->inode.iso_ctime; } if (bp2) - brelse(bp2); + buf_brelse(bp2); } int -cd9660_tstamp_conv7(pi,pu) - u_char *pi; - struct timespec *pu; +cd9660_tstamp_conv7(u_char *pi, struct timespec *pu) { int crtime, days; - int y, m, d, hour, minute, second, tz; + int y, m, d, hour, minute, second, mytz; y = pi[0] + 1900; m = pi[1]; @@ -437,7 +417,7 @@ cd9660_tstamp_conv7(pi,pu) hour = pi[3]; minute = pi[4]; second = pi[5]; - tz = pi[6]; + mytz = pi[6]; if (y < 1970) { pu->tv_sec = 0; @@ -458,8 +438,8 @@ cd9660_tstamp_conv7(pi,pu) crtime = ((((days * 24) + hour) * 60 + minute) * 60) + second; /* timezone offset is unreliable on some disks */ - if (-48 <= tz && tz <= 52) - crtime -= tz * 15 * 60; + if (-48 <= mytz && mytz <= 52) + crtime -= mytz * 15 * 60; } pu->tv_sec = crtime; pu->tv_nsec = 0; @@ -467,9 +447,7 @@ cd9660_tstamp_conv7(pi,pu) } static u_int -cd9660_chars2ui(begin,len) - u_char *begin; - int len; 
+cd9660_chars2ui(u_char *begin, int len) { u_int rc; @@ -481,9 +459,7 @@ cd9660_chars2ui(begin,len) } int -cd9660_tstamp_conv17(pi,pu) - u_char *pi; - struct timespec *pu; +cd9660_tstamp_conv17(u_char *pi, struct timespec *pu) { u_char buf[7]; @@ -512,9 +488,7 @@ cd9660_tstamp_conv17(pi,pu) } ino_t -isodirino(isodir, imp) - struct iso_directory_record *isodir; - struct iso_mnt *imp; +isodirino(struct iso_directory_record *isodir, struct iso_mnt *imp) { ino_t ino; diff --git a/bsd/isofs/cd9660/cd9660_node.h b/bsd/isofs/cd9660/cd9660_node.h index 68d471496..faa7450dd 100644 --- a/bsd/isofs/cd9660/cd9660_node.h +++ b/bsd/isofs/cd9660/cd9660_node.h @@ -75,6 +75,7 @@ */ #include <sys/lock.h> +#include <sys/kauth.h> #include <isofs/cd9660/iso.h> #ifndef doff_t @@ -121,7 +122,7 @@ struct iso_node { doff_t i_diroff; /* offset in dir, where we found last entry */ doff_t i_offset; /* offset of free space in directory */ ino_t i_ino; /* inode number of found directory */ - struct lock__bsd__ i_lock; /* Inode lock. */ + daddr_t i_lastr; /* last read (read ahead) */ long iso_extent; /* extent of file */ long i_size; long iso_start; /* actual start of data of file (may be different */ @@ -147,54 +148,65 @@ struct iso_node { /* These flags are kept in i_flag. */ #define ISO_ASSOCIATED 0x0001 /* node is an associated file. */ +#define ISO_INALLOC 0x0002 +#define ISO_INWALLOC 0x0004 + /* <ufs/inode.h> defines VTOI and ITOV macros */ #undef VTOI #undef ITOV -#define VTOI(vp) ((struct iso_node *)(vp)->v_data) +#define VTOI(vp) ((struct iso_node *)(vnode_fsnode(vp))) #define ITOV(ip) ((ip)->i_vnode) +/* similar in <hfs/hfs_mount.h> as default UID and GID */ +#define ISO_UNKNOWNUID ((uid_t)99) +#define ISO_UNKNOWNGID ((gid_t)99) + +int cd9660_access_internal(vnode_t, mode_t, kauth_cred_t); + /* * Prototypes for ISOFS vnode operations */ -int cd9660_lookup __P((struct vop_lookup_args *)); -int cd9660_open __P((struct vop_open_args *)); -int cd9660_close __P((struct vop_close_args *)); -int cd9660_access __P((struct vop_access_args *)); -int cd9660_getattr __P((struct vop_getattr_args *)); -int cd9660_read __P((struct vop_read_args *)); -int cd9660_xa_read __P((struct vop_read_args *)); -int cd9660_ioctl __P((struct vop_ioctl_args *)); -int cd9660_select __P((struct vop_select_args *)); -int cd9660_mmap __P((struct vop_mmap_args *)); -int cd9660_seek __P((struct vop_seek_args *)); -int cd9660_readdir __P((struct vop_readdir_args *)); -int cd9660_readlink __P((struct vop_readlink_args *)); -int cd9660_inactive __P((struct vop_inactive_args *)); -int cd9660_reclaim __P((struct vop_reclaim_args *)); -int cd9660_bmap __P((struct vop_bmap_args *)); -int cd9660_lock __P((struct vop_lock_args *)); -int cd9660_unlock __P((struct vop_unlock_args *)); -int cd9660_strategy __P((struct vop_strategy_args *)); -int cd9660_print __P((struct vop_print_args *)); -int cd9660_islocked __P((struct vop_islocked_args *)); -int cd9660_pathconf __P((struct vop_pathconf_args *)); -int cd9660_blkatoff __P((struct vop_blkatoff_args *)); - -void cd9660_defattr __P((struct iso_directory_record *, - struct iso_node *, struct buf *)); -void cd9660_deftstamp __P((struct iso_directory_record *, - struct iso_node *, struct buf *)); -struct vnode *cd9660_ihashget __P((dev_t, ino_t, struct proc *)); -void cd9660_ihashins __P((struct iso_node *)); -void cd9660_ihashrem __P((struct iso_node *)); -int cd9660_tstamp_conv7 __P((u_char *, struct timespec *)); -int cd9660_tstamp_conv17 __P((u_char *, struct timespec *)); -ino_t isodirino __P((struct 
iso_directory_record *, struct iso_mnt *)); +int cd9660_lookup (struct vnop_lookup_args *); +int cd9660_open (struct vnop_open_args *); +int cd9660_close (struct vnop_close_args *); +int cd9660_access (struct vnop_access_args *); +int cd9660_getattr (struct vnop_getattr_args *); +int cd9660_read (struct vnop_read_args *); +int cd9660_xa_read (struct vnop_read_args *); +int cd9660_ioctl (struct vnop_ioctl_args *); +int cd9660_select (struct vnop_select_args *); +int cd9660_mmap (struct vnop_mmap_args *); +int cd9660_readdir (struct vnop_readdir_args *); +int cd9660_readlink (struct vnop_readlink_args *); +int cd9660_inactive (struct vnop_inactive_args *); +int cd9660_reclaim (struct vnop_reclaim_args *); +int cd9660_strategy (struct vnop_strategy_args *); +int cd9660_pathconf (struct vnop_pathconf_args *); +int cd9660_enotsupp(void); +int cd9660_pagein(struct vnop_pagein_args *ap); +int cd9660_remove(struct vnop_remove_args *ap); +int cd9660_rmdir(struct vnop_rmdir_args *ap); +int cd9660_getattrlist(struct vnop_getattrlist_args *ap); + +__private_extern__ void cd9660_xa_init(struct iso_node *ip, + struct iso_directory_record *isodir); +__private_extern__ int cd9660_blkatoff (vnode_t, off_t, char **, buf_t *); + +void cd9660_defattr (struct iso_directory_record *, + struct iso_node *, struct buf *); +void cd9660_deftstamp (struct iso_directory_record *, + struct iso_node *, struct buf *); +struct vnode *cd9660_ihashget (dev_t, ino_t, struct proc *); +void cd9660_ihashins (struct iso_node *); +void cd9660_ihashrem (struct iso_node *); +int cd9660_tstamp_conv7 (u_char *, struct timespec *); +int cd9660_tstamp_conv17 (u_char *, struct timespec *); +ino_t isodirino (struct iso_directory_record *, struct iso_mnt *); #ifdef ISODEVMAP -struct iso_dnode *iso_dmap __P((dev_t, ino_t, int)); -void iso_dunmap __P((dev_t)); +struct iso_dnode *iso_dmap (dev_t, ino_t, int); +void iso_dunmap (dev_t); #endif #endif /* __APPLE_API_PRIVATE */ diff --git a/bsd/isofs/cd9660/cd9660_rrip.c b/bsd/isofs/cd9660/cd9660_rrip.c index 1481c728c..66d8e231f 100644 --- a/bsd/isofs/cd9660/cd9660_rrip.c +++ b/bsd/isofs/cd9660/cd9660_rrip.c @@ -89,9 +89,7 @@ * POSIX file attribute */ static int -cd9660_rrip_attr(p,ana) - ISO_RRIP_ATTR *p; - ISO_RRIP_ANALYZE *ana; +cd9660_rrip_attr(ISO_RRIP_ATTR *p, ISO_RRIP_ANALYZE *ana) { ana->inop->inode.iso_mode = isonum_733(p->mode); ana->inop->inode.iso_uid = isonum_733(p->uid); @@ -102,9 +100,7 @@ cd9660_rrip_attr(p,ana) } static void -cd9660_rrip_defattr(isodir,ana) - struct iso_directory_record *isodir; - ISO_RRIP_ANALYZE *ana; +cd9660_rrip_defattr(struct iso_directory_record *isodir, ISO_RRIP_ANALYZE *ana) { /* But this is a required field! 
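 * (RRIP 1991A makes the PX entry mandatory for every directory record,
 * so a record without one is technically malformed; we log it and fall
 * back to plain ISO 9660 default attributes.)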
*/ printf("RRIP without PX field?\n"); @@ -115,9 +111,7 @@ cd9660_rrip_defattr(isodir,ana) * Symbolic Links */ static int -cd9660_rrip_slink(p,ana) - ISO_RRIP_SLINK *p; - ISO_RRIP_ANALYZE *ana; +cd9660_rrip_slink(ISO_RRIP_SLINK *p, ISO_RRIP_ANALYZE *ana) { register ISO_RRIP_SLINK_COMPONENT *pcomp; register ISO_RRIP_SLINK_COMPONENT *pcompe; @@ -173,7 +167,7 @@ cd9660_rrip_slink(p,ana) /* same as above */ outbuf -= len; len = 0; - inbuf = ana->imp->im_mountp->mnt_stat.f_mntonname; + inbuf = &(vfs_statfs(ana->imp->im_mountp)->f_mntonname); wlen = strlen(inbuf); break; @@ -226,9 +220,7 @@ cd9660_rrip_slink(p,ana) * Alternate name */ static int -cd9660_rrip_altname(p,ana) - ISO_RRIP_ALTNAME *p; - ISO_RRIP_ANALYZE *ana; +cd9660_rrip_altname(ISO_RRIP_ALTNAME *p, ISO_RRIP_ANALYZE *ana) { char *inbuf; int wlen; @@ -289,9 +281,7 @@ cd9660_rrip_altname(p,ana) } static void -cd9660_rrip_defname(isodir,ana) - struct iso_directory_record *isodir; - ISO_RRIP_ANALYZE *ana; +cd9660_rrip_defname(struct iso_directory_record *isodir, ISO_RRIP_ANALYZE *ana) { strcpy(ana->outbuf,".."); switch (*isodir->name) { @@ -313,9 +303,7 @@ cd9660_rrip_defname(isodir,ana) * Parent or Child Link */ static int -cd9660_rrip_pclink(p,ana) - ISO_RRIP_CLINK *p; - ISO_RRIP_ANALYZE *ana; +cd9660_rrip_pclink(ISO_RRIP_CLINK *p, ISO_RRIP_ANALYZE *ana) { *ana->inump = isonum_733(p->dir_loc) << ana->imp->im_bshift; ana->fields &= ~(ISO_SUSP_CLINK|ISO_SUSP_PLINK); @@ -325,10 +313,9 @@ cd9660_rrip_pclink(p,ana) /* * Relocated directory */ +/* ARGSUSED */ static int -cd9660_rrip_reldir(p,ana) - ISO_RRIP_RELDIR *p; - ISO_RRIP_ANALYZE *ana; +cd9660_rrip_reldir(__unused ISO_RRIP_RELDIR *p, ISO_RRIP_ANALYZE *ana) { /* special hack to make caller aware of RE field */ *ana->outlen = 0; @@ -337,9 +324,7 @@ cd9660_rrip_reldir(p,ana) } static int -cd9660_rrip_tstamp(p,ana) - ISO_RRIP_TSTAMP *p; - ISO_RRIP_ANALYZE *ana; +cd9660_rrip_tstamp(ISO_RRIP_TSTAMP *p, ISO_RRIP_ANALYZE *ana) { u_char *ptime; @@ -394,9 +379,8 @@ cd9660_rrip_tstamp(p,ana) } static void -cd9660_rrip_deftstamp(isodir,ana) - struct iso_directory_record *isodir; - ISO_RRIP_ANALYZE *ana; +cd9660_rrip_deftstamp(struct iso_directory_record *isodir, + ISO_RRIP_ANALYZE *ana) { cd9660_deftstamp(isodir,ana->inop,NULL); } @@ -405,9 +389,7 @@ cd9660_rrip_deftstamp(isodir,ana) * POSIX device modes */ static int -cd9660_rrip_device(p,ana) - ISO_RRIP_DEVICE *p; - ISO_RRIP_ANALYZE *ana; +cd9660_rrip_device(ISO_RRIP_DEVICE *p, ISO_RRIP_ANALYZE *ana) { u_int high, low; @@ -426,9 +408,7 @@ cd9660_rrip_device(p,ana) * Flag indicating */ static int -cd9660_rrip_idflag(p,ana) - ISO_RRIP_IDFLAG *p; - ISO_RRIP_ANALYZE *ana; +cd9660_rrip_idflag(ISO_RRIP_IDFLAG *p, ISO_RRIP_ANALYZE *ana) { ana->fields &= isonum_711(p->flags)|~0xff; /* don't touch high bits */ /* special handling of RE field */ @@ -442,9 +422,7 @@ cd9660_rrip_idflag(p,ana) * Continuation pointer */ static int -cd9660_rrip_cont(p,ana) - ISO_RRIP_CONT *p; - ISO_RRIP_ANALYZE *ana; +cd9660_rrip_cont(ISO_RRIP_CONT *p, ISO_RRIP_ANALYZE *ana) { ana->iso_ce_blk = isonum_733(p->location); ana->iso_ce_off = isonum_733(p->offset); @@ -455,10 +433,9 @@ cd9660_rrip_cont(p,ana) /* * System Use end */ +/* ARGSUSED */ static int -cd9660_rrip_stop(p,ana) - ISO_SUSP_HEADER *p; - ISO_RRIP_ANALYZE *ana; +cd9660_rrip_stop(__unused ISO_SUSP_HEADER *p, __unused ISO_RRIP_ANALYZE *ana) { return ISO_SUSP_STOP; } @@ -467,9 +444,7 @@ cd9660_rrip_stop(p,ana) * Extension reference */ static int -cd9660_rrip_extref(p,ana) - ISO_RRIP_EXTREF *p; - ISO_RRIP_ANALYZE 
*ana; +cd9660_rrip_extref(ISO_RRIP_EXTREF *p, ISO_RRIP_ANALYZE *ana) { if (isonum_711(p->len_id) != 10 || bcmp((char *)p + 8,"RRIP_1991A",10) @@ -479,18 +454,19 @@ cd9660_rrip_extref(p,ana) return ISO_SUSP_EXTREF; } +typedef int (*rrip_table_func)(ISO_SUSP_HEADER *phead, ISO_RRIP_ANALYZE *ana); +typedef int (*rrip_table_func2)(struct iso_directory_record *isodir, + ISO_RRIP_ANALYZE *ana); typedef struct { char type[2]; - int (*func)(); - void (*func2)(); + rrip_table_func func; + rrip_table_func2 func2; int result; } RRIP_TABLE; static int -cd9660_rrip_loop(isodir,ana,table) - struct iso_directory_record *isodir; - ISO_RRIP_ANALYZE *ana; - RRIP_TABLE *table; +cd9660_rrip_loop(struct iso_directory_record *isodir, ISO_RRIP_ANALYZE *ana, + RRIP_TABLE *table) { register RRIP_TABLE *ptable; register ISO_SUSP_HEADER *phead; @@ -528,7 +504,7 @@ cd9660_rrip_loop(isodir,ana,table) for (ptable = table; ptable->func; ptable++) { if (*phead->type == *ptable->type && phead->type[1] == ptable->type[1]) { - result |= ptable->func(phead,ana); + result |= (ptable->func(phead,ana)); break; } } @@ -552,22 +528,22 @@ cd9660_rrip_loop(isodir,ana,table) if (ana->fields && ana->iso_ce_len) { if (ana->iso_ce_blk >= ana->imp->volume_space_size || ana->iso_ce_off + ana->iso_ce_len > ana->imp->logical_block_size - || bread(ana->imp->im_devvp, + || buf_bread(ana->imp->im_devvp, #if 1 // radar 1669467 - logical and physical blocksize are the same - ana->iso_ce_blk, + (daddr64_t)((unsigned)ana->iso_ce_blk), #else - ana->iso_ce_blk << (ana->imp->im_bshift - DEV_BSHIFT), + (daddr64_t)((unsigned)(ana->iso_ce_blk << (ana->imp->im_bshift - DEV_BSHIFT))), #endif // radar 1669467 ana->imp->logical_block_size, NOCRED, &bp)) /* what to do now? */ break; - phead = (ISO_SUSP_HEADER *)(bp->b_data + ana->iso_ce_off); + phead = (ISO_SUSP_HEADER *)((char *)buf_dataptr(bp) + ana->iso_ce_off); pend = (ISO_SUSP_HEADER *) ((char *)phead + ana->iso_ce_len); } else break; } if (bp) - brelse(bp); + buf_brelse(bp); /* * If we don't find the Basic SUSP stuffs, just set default value * (attribute/time stamp) @@ -583,20 +559,25 @@ cd9660_rrip_loop(isodir,ana,table) * Get Attributes. */ static RRIP_TABLE rrip_table_analyze[] = { - { "PX", cd9660_rrip_attr, cd9660_rrip_defattr, ISO_SUSP_ATTR }, - { "TF", cd9660_rrip_tstamp, cd9660_rrip_deftstamp, ISO_SUSP_TSTAMP }, - { "PN", cd9660_rrip_device, 0, ISO_SUSP_DEVICE }, - { "RR", cd9660_rrip_idflag, 0, ISO_SUSP_IDFLAG }, - { "CE", cd9660_rrip_cont, 0, ISO_SUSP_CONT }, + { "PX", (rrip_table_func)cd9660_rrip_attr, + (rrip_table_func2)cd9660_rrip_defattr, + ISO_SUSP_ATTR }, + { "TF", (rrip_table_func)cd9660_rrip_tstamp, + (rrip_table_func2)cd9660_rrip_deftstamp, + ISO_SUSP_TSTAMP }, + { "PN", (rrip_table_func)cd9660_rrip_device, + 0, ISO_SUSP_DEVICE }, + { "RR", (rrip_table_func)cd9660_rrip_idflag, + 0, ISO_SUSP_IDFLAG }, + { "CE", (rrip_table_func)cd9660_rrip_cont, + 0, ISO_SUSP_CONT }, { "ST", cd9660_rrip_stop, 0, ISO_SUSP_STOP }, { "", 0, 0, 0 } }; int -cd9660_rrip_analyze(isodir,inop,imp) - struct iso_directory_record *isodir; - struct iso_node *inop; - struct iso_mnt *imp; +cd9660_rrip_analyze(struct iso_directory_record *isodir, struct iso_node *inop, + struct iso_mnt *imp) { ISO_RRIP_ANALYZE analyze; @@ -611,23 +592,26 @@ cd9660_rrip_analyze(isodir,inop,imp) * Get Alternate Name. 
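 * (NM records carry the Rock Ridge long name; CL, PL and RE describe
 * directory relocation, which is why they share this table and may
 * rewrite the inode number via cd9660_rrip_pclink().)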
*/ static RRIP_TABLE rrip_table_getname[] = { - { "NM", cd9660_rrip_altname, cd9660_rrip_defname, ISO_SUSP_ALTNAME }, - { "CL", cd9660_rrip_pclink, 0, ISO_SUSP_CLINK|ISO_SUSP_PLINK }, - { "PL", cd9660_rrip_pclink, 0, ISO_SUSP_CLINK|ISO_SUSP_PLINK }, - { "RE", cd9660_rrip_reldir, 0, ISO_SUSP_RELDIR }, - { "RR", cd9660_rrip_idflag, 0, ISO_SUSP_IDFLAG }, - { "CE", cd9660_rrip_cont, 0, ISO_SUSP_CONT }, + { "NM", (rrip_table_func)cd9660_rrip_altname, + (rrip_table_func2)cd9660_rrip_defname, + ISO_SUSP_ALTNAME }, + { "CL", (rrip_table_func)cd9660_rrip_pclink, + 0, ISO_SUSP_CLINK|ISO_SUSP_PLINK }, + { "PL", (rrip_table_func)cd9660_rrip_pclink, + 0, ISO_SUSP_CLINK|ISO_SUSP_PLINK }, + { "RE", (rrip_table_func)cd9660_rrip_reldir, + 0, ISO_SUSP_RELDIR }, + { "RR", (rrip_table_func)cd9660_rrip_idflag, + 0, ISO_SUSP_IDFLAG }, + { "CE", (rrip_table_func)cd9660_rrip_cont, + 0, ISO_SUSP_CONT }, { "ST", cd9660_rrip_stop, 0, ISO_SUSP_STOP }, { "", 0, 0, 0 } }; int -cd9660_rrip_getname(isodir,outbuf,outlen,inump,imp) - struct iso_directory_record *isodir; - char *outbuf; - u_short *outlen; - ino_t *inump; - struct iso_mnt *imp; +cd9660_rrip_getname(struct iso_directory_record *isodir, char *outbuf, + u_short *outlen, ino_t *inump, struct iso_mnt *imp) { ISO_RRIP_ANALYZE analyze; RRIP_TABLE *tab; @@ -656,19 +640,19 @@ cd9660_rrip_getname(isodir,outbuf,outlen,inump,imp) * Get Symbolic Link. */ static RRIP_TABLE rrip_table_getsymname[] = { - { "SL", cd9660_rrip_slink, 0, ISO_SUSP_SLINK }, - { "RR", cd9660_rrip_idflag, 0, ISO_SUSP_IDFLAG }, - { "CE", cd9660_rrip_cont, 0, ISO_SUSP_CONT }, + { "SL", (rrip_table_func)cd9660_rrip_slink, + 0, ISO_SUSP_SLINK }, + { "RR", (rrip_table_func)cd9660_rrip_idflag, + 0, ISO_SUSP_IDFLAG }, + { "CE", (rrip_table_func)cd9660_rrip_cont, + 0, ISO_SUSP_CONT }, { "ST", cd9660_rrip_stop, 0, ISO_SUSP_STOP }, { "", 0, 0, 0 } }; int -cd9660_rrip_getsymname(isodir,outbuf,outlen,imp) - struct iso_directory_record *isodir; - char *outbuf; - u_short *outlen; - struct iso_mnt *imp; +cd9660_rrip_getsymname(struct iso_directory_record *isodir, char *outbuf, + u_short *outlen, struct iso_mnt *imp) { ISO_RRIP_ANALYZE analyze; @@ -684,8 +668,10 @@ cd9660_rrip_getsymname(isodir,outbuf,outlen,imp) } static RRIP_TABLE rrip_table_extref[] = { - { "ER", cd9660_rrip_extref, 0, ISO_SUSP_EXTREF }, - { "CE", cd9660_rrip_cont, 0, ISO_SUSP_CONT }, + { "ER", (rrip_table_func)cd9660_rrip_extref, + 0, ISO_SUSP_EXTREF }, + { "CE", (rrip_table_func)cd9660_rrip_cont, + 0, ISO_SUSP_CONT }, { "ST", cd9660_rrip_stop, 0, ISO_SUSP_STOP }, { "", 0, 0, 0 } }; @@ -695,9 +681,7 @@ static RRIP_TABLE rrip_table_extref[] = { * Note: We insist on the ER field. */ int -cd9660_rrip_offset(isodir,imp) - struct iso_directory_record *isodir; - struct iso_mnt *imp; +cd9660_rrip_offset(struct iso_directory_record *isodir, struct iso_mnt *imp) { ISO_RRIP_OFFSET *p; ISO_RRIP_ANALYZE analyze; diff --git a/bsd/isofs/cd9660/cd9660_util.c b/bsd/isofs/cd9660/cd9660_util.c index 2a3798e1d..a858d848b 100644 --- a/bsd/isofs/cd9660/cd9660_util.c +++ b/bsd/isofs/cd9660/cd9660_util.c @@ -76,6 +76,7 @@ #include <sys/stat.h> #include <sys/buf.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <sys/conf.h> #include <sys/utfconv.h> #include <miscfs/specfs/specdev.h> /* XXX */ @@ -95,9 +96,7 @@ * Note: Version number plus ';' may be omitted. 
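 * For example, the on-disc name "FOO.BAR;1" is meant to compare equal
 * to a lookup for "FOO.BAR". A rough, hypothetical use, with ep being
 * some struct iso_directory_record:
 *
 *	if (isofncmp(fn, fnlen, (u_char *)ep->name, isonum_711(ep->name_len)) == 0)
 *		...treat as a match (the return is strcmp-style, zero on equality)...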
*/ int -isofncmp(fn, fnlen, isofn, isolen) - u_char *fn, *isofn; - int fnlen, isolen; +isofncmp(u_char *fn, int fnlen, u_char *isofn, int isolen) { int i, j; char c; @@ -160,11 +159,7 @@ isofncmp(fn, fnlen, isofn, isolen) */ int -ucsfncmp(fn, fnlen, ucsfn, ucslen) - u_int16_t *fn; - int fnlen; - u_int16_t *ucsfn; - int ucslen; +ucsfncmp(u_int16_t *fn, int fnlen, u_int16_t *ucsfn, int ucslen) { int i, j; u_int16_t c; @@ -216,12 +211,8 @@ ucsfncmp(fn, fnlen, ucsfn, ucslen) * translate a filename */ void -isofntrans(infn, infnlen, outfn, outfnlen, original, assoc) - u_char *infn, *outfn; - int infnlen; - u_short *outfnlen; - int original; - int assoc; +isofntrans(u_char *infn, int infnlen, u_char *outfn, u_short *outfnlen, + int original, int assoc) { int fnidx = 0; @@ -266,13 +257,8 @@ isofntrans(infn, infnlen, outfn, outfnlen, original, assoc) * translate a UCS-2 filename to UTF-8 */ void -ucsfntrans(infn, infnlen, outfn, outfnlen, dir, assoc) - u_int16_t *infn; - int infnlen; - u_char *outfn; - u_short *outfnlen; - int dir; - int assoc; +ucsfntrans(u_int16_t *infn, int infnlen, u_char *outfn, u_short *outfnlen, + int dir, int assoc) { if (infnlen == 1) { strcpy(outfn, ".."); @@ -325,22 +311,19 @@ ucsfntrans(infn, infnlen, outfn, outfnlen, dir, assoc) * count the number of children by enumerating the directory */ static int -isochildcount(vdp, dircnt, filcnt) - struct vnode *vdp; - int *dircnt; - int *filcnt; +isochildcount(struct vnode *vdp, int *dircnt, int *filcnt) { struct iso_node *dp; struct buf *bp = NULL; struct iso_mnt *imp; struct iso_directory_record *ep; - u_long bmask; + uint32_t bmask; int error = 0; int reclen; int dirs, files; int blkoffset; int logblksize; - long diroffset; + int32_t diroffset; dp = VTOI(vdp); imp = dp->i_mnt; @@ -356,14 +339,14 @@ isochildcount(vdp, dircnt, filcnt) */ if ((diroffset & bmask) == 0) { if (bp != NULL) - brelse(bp); - if ( (error = VOP_BLKATOFF(vdp, SECTOFF(imp, diroffset), NULL, &bp)) ) + buf_brelse(bp); + if ( (error = cd9660_blkatoff(vdp, SECTOFF(imp, diroffset), NULL, &bp)) ) break; blkoffset = 0; } ep = (struct iso_directory_record *) - ((char *)bp->b_data + blkoffset); + (buf_dataptr(bp) + blkoffset); reclen = isonum_711(ep->length); if (reclen == 0) { @@ -399,7 +382,7 @@ isochildcount(vdp, dircnt, filcnt) } if (bp) - brelse (bp); + buf_brelse (bp); *dircnt = dirs; *filcnt = files; @@ -408,47 +391,33 @@ isochildcount(vdp, dircnt, filcnt) } -/* - * There are two ways to qualify for ownership rights on an object: - * - * 1. Your UID matches the UID of the vnode - * 2. You are root - * - */ -static int cd9660_owner_rights(uid_t owner, struct iso_mnt *imp, struct ucred *cred, struct proc *p, int invokesuperuserstatus) { - return ((cred->cr_uid == owner) || /* [1] */ - (invokesuperuserstatus && (suser(cred, &p->p_acflag) == 0))) ? 0 : EPERM; /* [2] */ -} - - - -static unsigned long DerivePermissionSummary(uid_t owner, gid_t group, mode_t obj_mode, struct iso_mnt *imp, struct ucred *cred, struct proc *p) { - register gid_t *gp; - unsigned long permissions; - int i; +static uint32_t +DerivePermissionSummary(uid_t owner, gid_t group, mode_t obj_mode, __unused struct iso_mnt *imp) +{ + kauth_cred_t cred = kauth_cred_get(); + uint32_t permissions; + int is_member; /* User id 0 (root) always gets access. */ - if (cred->cr_uid == 0) { + if (!suser(cred, NULL)) { permissions = R_OK | X_OK; goto Exit; }; /* Otherwise, check the owner. 
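 * (kauth_cred_getuid() returns the credential's effective uid, so this
 * is equivalent to the cr_uid comparison it replaces.)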
*/ - if (cd9660_owner_rights(owner, imp, cred, p, 0) == 0) { - permissions = ((unsigned long)obj_mode & S_IRWXU) >> 6; + if (owner == kauth_cred_getuid(cred)) { + permissions = ((uint32_t)obj_mode & S_IRWXU) >> 6; goto Exit; } /* Otherwise, check the groups. */ - for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) { - if (group == *gp) { - permissions = ((unsigned long)obj_mode & S_IRWXG) >> 3; + if (kauth_cred_ismember_gid(cred, group, &is_member) == 0 && is_member) { + permissions = ((uint32_t)obj_mode & S_IRWXG) >> 3; goto Exit; - } - }; + } /* Otherwise, settle for 'others' access. */ - permissions = (unsigned long)obj_mode & S_IRWXO; + permissions = (uint32_t)obj_mode & S_IRWXO; Exit: return permissions & ~W_OK; /* Write access is always impossible */ @@ -460,6 +429,7 @@ attrcalcsize(struct attrlist *attrlist) { int size; attrgroup_t a; + boolean_t is_64_bit = proc_is64bit(current_proc()); #if ((ATTR_CMN_NAME | ATTR_CMN_DEVID | ATTR_CMN_FSID | ATTR_CMN_OBJTYPE | \ ATTR_CMN_OBJTAG | ATTR_CMN_OBJID | ATTR_CMN_OBJPERMANENTID | ATTR_CMN_PAROBJID | \ @@ -509,55 +479,80 @@ attrcalcsize(struct attrlist *attrlist) if (a & ATTR_CMN_OBJPERMANENTID) size += sizeof(fsobj_id_t); if (a & ATTR_CMN_PAROBJID) size += sizeof(fsobj_id_t); if (a & ATTR_CMN_SCRIPT) size += sizeof(text_encoding_t); - if (a & ATTR_CMN_CRTIME) size += sizeof(struct timespec); - if (a & ATTR_CMN_MODTIME) size += sizeof(struct timespec); - if (a & ATTR_CMN_CHGTIME) size += sizeof(struct timespec); - if (a & ATTR_CMN_ACCTIME) size += sizeof(struct timespec); - if (a & ATTR_CMN_BKUPTIME) size += sizeof(struct timespec); + if (a & ATTR_CMN_CRTIME) { + if (is_64_bit) + size += sizeof(struct user_timespec); + else + size += sizeof(struct timespec); + } + if (a & ATTR_CMN_MODTIME) { + if (is_64_bit) + size += sizeof(struct user_timespec); + else + size += sizeof(struct timespec); + } + if (a & ATTR_CMN_CHGTIME) { + if (is_64_bit) + size += sizeof(struct user_timespec); + else + size += sizeof(struct timespec); + } + if (a & ATTR_CMN_ACCTIME) { + if (is_64_bit) + size += sizeof(struct user_timespec); + else + size += sizeof(struct timespec); + } + if (a & ATTR_CMN_BKUPTIME) { + if (is_64_bit) + size += sizeof(struct user_timespec); + else + size += sizeof(struct timespec); + } if (a & ATTR_CMN_FNDRINFO) size += 32 * sizeof(u_int8_t); if (a & ATTR_CMN_OWNERID) size += sizeof(uid_t); if (a & ATTR_CMN_GRPID) size += sizeof(gid_t); - if (a & ATTR_CMN_ACCESSMASK) size += sizeof(u_long); - if (a & ATTR_CMN_NAMEDATTRCOUNT) size += sizeof(u_long); + if (a & ATTR_CMN_ACCESSMASK) size += sizeof(uint32_t); + if (a & ATTR_CMN_NAMEDATTRCOUNT) size += sizeof(uint32_t); if (a & ATTR_CMN_NAMEDATTRLIST) size += sizeof(struct attrreference); - if (a & ATTR_CMN_FLAGS) size += sizeof(u_long); - if (a & ATTR_CMN_USERACCESS) size += sizeof(u_long); + if (a & ATTR_CMN_FLAGS) size += sizeof(uint32_t); + if (a & ATTR_CMN_USERACCESS) size += sizeof(uint32_t); }; if ((a = attrlist->volattr) != 0) { - if (a & ATTR_VOL_FSTYPE) size += sizeof(u_long); - if (a & ATTR_VOL_SIGNATURE) size += sizeof(u_long); + if (a & ATTR_VOL_FSTYPE) size += sizeof(uint32_t); + if (a & ATTR_VOL_SIGNATURE) size += sizeof(uint32_t); if (a & ATTR_VOL_SIZE) size += sizeof(off_t); if (a & ATTR_VOL_SPACEFREE) size += sizeof(off_t); if (a & ATTR_VOL_SPACEAVAIL) size += sizeof(off_t); if (a & ATTR_VOL_MINALLOCATION) size += sizeof(off_t); if (a & ATTR_VOL_ALLOCATIONCLUMP) size += sizeof(off_t); - if (a & ATTR_VOL_IOBLOCKSIZE) size += sizeof(size_t); - if (a & 
ATTR_VOL_OBJCOUNT) size += sizeof(u_long); - if (a & ATTR_VOL_FILECOUNT) size += sizeof(u_long); - if (a & ATTR_VOL_DIRCOUNT) size += sizeof(u_long); - if (a & ATTR_VOL_MAXOBJCOUNT) size += sizeof(u_long); + if (a & ATTR_VOL_IOBLOCKSIZE) size += sizeof(uint32_t); + if (a & ATTR_VOL_OBJCOUNT) size += sizeof(uint32_t); + if (a & ATTR_VOL_FILECOUNT) size += sizeof(uint32_t); + if (a & ATTR_VOL_DIRCOUNT) size += sizeof(uint32_t); + if (a & ATTR_VOL_MAXOBJCOUNT) size += sizeof(uint32_t); if (a & ATTR_VOL_MOUNTPOINT) size += sizeof(struct attrreference); if (a & ATTR_VOL_NAME) size += sizeof(struct attrreference); - if (a & ATTR_VOL_MOUNTFLAGS) size += sizeof(u_long); + if (a & ATTR_VOL_MOUNTFLAGS) size += sizeof(uint32_t); if (a & ATTR_VOL_MOUNTEDDEVICE) size += sizeof(struct attrreference); if (a & ATTR_VOL_ENCODINGSUSED) size += sizeof(unsigned long long); if (a & ATTR_VOL_CAPABILITIES) size += sizeof(vol_capabilities_attr_t); if (a & ATTR_VOL_ATTRIBUTES) size += sizeof(vol_attributes_attr_t); }; if ((a = attrlist->dirattr) != 0) { - if (a & ATTR_DIR_LINKCOUNT) size += sizeof(u_long); - if (a & ATTR_DIR_ENTRYCOUNT) size += sizeof(u_long); - if (a & ATTR_DIR_MOUNTSTATUS) size += sizeof(u_long); + if (a & ATTR_DIR_LINKCOUNT) size += sizeof(uint32_t); + if (a & ATTR_DIR_ENTRYCOUNT) size += sizeof(uint32_t); + if (a & ATTR_DIR_MOUNTSTATUS) size += sizeof(uint32_t); }; if ((a = attrlist->fileattr) != 0) { - if (a & ATTR_FILE_LINKCOUNT) size += sizeof(u_long); + if (a & ATTR_FILE_LINKCOUNT) size += sizeof(uint32_t); if (a & ATTR_FILE_TOTALSIZE) size += sizeof(off_t); if (a & ATTR_FILE_ALLOCSIZE) size += sizeof(off_t); - if (a & ATTR_FILE_IOBLOCKSIZE) size += sizeof(size_t); - if (a & ATTR_FILE_CLUMPSIZE) size += sizeof(off_t); - if (a & ATTR_FILE_DEVTYPE) size += sizeof(u_long); - if (a & ATTR_FILE_FILETYPE) size += sizeof(u_long); - if (a & ATTR_FILE_FORKCOUNT) size += sizeof(u_long); + if (a & ATTR_FILE_IOBLOCKSIZE) size += sizeof(uint32_t); + if (a & ATTR_FILE_CLUMPSIZE) size += sizeof(uint32_t); + if (a & ATTR_FILE_DEVTYPE) size += sizeof(uint32_t); + if (a & ATTR_FILE_FILETYPE) size += sizeof(uint32_t); + if (a & ATTR_FILE_FORKCOUNT) size += sizeof(uint32_t); if (a & ATTR_FILE_FORKLIST) size += sizeof(struct attrreference); if (a & ATTR_FILE_DATALENGTH) size += sizeof(off_t); if (a & ATTR_FILE_DATAALLOCSIZE) size += sizeof(off_t); @@ -576,7 +571,7 @@ attrcalcsize(struct attrlist *attrlist) -void +static void packvolattr (struct attrlist *alist, struct iso_node *ip, /* ip for root directory */ void **attrbufptrptr, @@ -587,7 +582,8 @@ packvolattr (struct attrlist *alist, struct iso_mnt *imp; struct mount *mp; attrgroup_t a; - u_long attrlength; + uint32_t attrlength; + boolean_t is_64_bit = proc_is64bit(current_proc()); attrbufptr = *attrbufptrptr; varbufptr = *varbufptrptr; @@ -605,8 +601,8 @@ packvolattr (struct attrlist *alist, (u_int8_t *)varbufptr += attrlength + ((4 - (attrlength & 3)) & 3); ++((struct attrreference *)attrbufptr); }; - if (a & ATTR_CMN_DEVID) *((dev_t *)attrbufptr)++ = imp->im_devvp->v_rdev; - if (a & ATTR_CMN_FSID) *((fsid_t *)attrbufptr)++ = ITOV(ip)->v_mount->mnt_stat.f_fsid; + if (a & ATTR_CMN_DEVID) *((dev_t *)attrbufptr)++ = vnode_specrdev(imp->im_devvp); + if (a & ATTR_CMN_FSID) *((fsid_t *)attrbufptr)++ = vfs_statfs(vnode_mount(ITOV(ip)))->f_fsid; if (a & ATTR_CMN_OBJTYPE) *((fsobj_type_t *)attrbufptr)++ = 0; if (a & ATTR_CMN_OBJTAG) *((fsobj_tag_t *)attrbufptr)++ = VT_ISOFS; if (a & ATTR_CMN_OBJID) { @@ -625,10 +621,46 @@ packvolattr (struct attrlist *alist, 
++((fsobj_id_t *)attrbufptr); }; if (a & ATTR_CMN_SCRIPT) *((text_encoding_t *)attrbufptr)++ = 0; - if (a & ATTR_CMN_CRTIME) *((struct timespec *)attrbufptr)++ = imp->creation_date; - if (a & ATTR_CMN_MODTIME) *((struct timespec *)attrbufptr)++ = imp->modification_date; - if (a & ATTR_CMN_CHGTIME) *((struct timespec *)attrbufptr)++ = imp->modification_date; - if (a & ATTR_CMN_ACCTIME) *((struct timespec *)attrbufptr)++ = imp->modification_date; + if (a & ATTR_CMN_CRTIME) { + if (is_64_bit) { + struct user_timespec *tmpp = ((struct user_timespec *)attrbufptr)++; + tmpp->tv_sec = (user_time_t) imp->creation_date.tv_sec; + tmpp->tv_nsec = imp->creation_date.tv_nsec; + } + else { + *((struct timespec *)attrbufptr)++ = imp->creation_date; + } + } + if (a & ATTR_CMN_MODTIME) { + if (is_64_bit) { + struct user_timespec *tmpp = ((struct user_timespec *)attrbufptr)++; + tmpp->tv_sec = (user_time_t) imp->modification_date.tv_sec; + tmpp->tv_nsec = imp->modification_date.tv_nsec; + } + else { + *((struct timespec *)attrbufptr)++ = imp->modification_date; + } + } + if (a & ATTR_CMN_CHGTIME) { + if (is_64_bit) { + struct user_timespec *tmpp = ((struct user_timespec *)attrbufptr)++; + tmpp->tv_sec = (user_time_t) imp->modification_date.tv_sec; + tmpp->tv_nsec = imp->modification_date.tv_nsec; + } + else { + *((struct timespec *)attrbufptr)++ = imp->modification_date; + } + } + if (a & ATTR_CMN_ACCTIME) { + if (is_64_bit) { + struct user_timespec *tmpp = ((struct user_timespec *)attrbufptr)++; + tmpp->tv_sec = (user_time_t) imp->modification_date.tv_sec; + tmpp->tv_nsec = imp->modification_date.tv_nsec; + } + else { + *((struct timespec *)attrbufptr)++ = imp->modification_date; + } + } if (a & ATTR_CMN_BKUPTIME) { ((struct timespec *)attrbufptr)->tv_sec = 0; ((struct timespec *)attrbufptr)->tv_nsec = 0; @@ -640,34 +672,32 @@ packvolattr (struct attrlist *alist, }; if (a & ATTR_CMN_OWNERID) *((uid_t *)attrbufptr)++ = ip->inode.iso_uid; if (a & ATTR_CMN_GRPID) *((gid_t *)attrbufptr)++ = ip->inode.iso_gid; - if (a & ATTR_CMN_ACCESSMASK) *((u_long *)attrbufptr)++ = (u_long)ip->inode.iso_mode; - if (a & ATTR_CMN_FLAGS) *((u_long *)attrbufptr)++ = 0; + if (a & ATTR_CMN_ACCESSMASK) *((uint32_t *)attrbufptr)++ = (uint32_t)ip->inode.iso_mode; + if (a & ATTR_CMN_FLAGS) *((uint32_t *)attrbufptr)++ = 0; if (a & ATTR_CMN_USERACCESS) { - *((u_long *)attrbufptr)++ = + *((uint32_t *)attrbufptr)++ = DerivePermissionSummary(ip->inode.iso_uid, ip->inode.iso_gid, ip->inode.iso_mode, - imp, - current_proc()->p_ucred, - current_proc()); + imp); }; }; if ((a = alist->volattr) != 0) { off_t blocksize = (off_t)imp->logical_block_size; - if (a & ATTR_VOL_FSTYPE) *((u_long *)attrbufptr)++ = (u_long)imp->im_mountp->mnt_vfc->vfc_typenum; - if (a & ATTR_VOL_SIGNATURE) *((u_long *)attrbufptr)++ = (u_long)ISO9660SIGNATURE; + if (a & ATTR_VOL_FSTYPE) *((uint32_t *)attrbufptr)++ = (uint32_t)vfs_typenum(mp); + if (a & ATTR_VOL_SIGNATURE) *((uint32_t *)attrbufptr)++ = (uint32_t)ISO9660SIGNATURE; if (a & ATTR_VOL_SIZE) *((off_t *)attrbufptr)++ = (off_t)imp->volume_space_size * blocksize; if (a & ATTR_VOL_SPACEFREE) *((off_t *)attrbufptr)++ = 0; if (a & ATTR_VOL_SPACEAVAIL) *((off_t *)attrbufptr)++ = 0; if (a & ATTR_VOL_MINALLOCATION) *((off_t *)attrbufptr)++ = blocksize; if (a & ATTR_VOL_ALLOCATIONCLUMP) *((off_t *)attrbufptr)++ = blocksize; - if (a & ATTR_VOL_IOBLOCKSIZE) *((size_t *)attrbufptr)++ = blocksize; - if (a & ATTR_VOL_OBJCOUNT) *((u_long *)attrbufptr)++ = 0; - if (a & ATTR_VOL_FILECOUNT) *((u_long *)attrbufptr)++ = 0; - if (a & 
ATTR_VOL_DIRCOUNT) *((u_long *)attrbufptr)++ = 0; - if (a & ATTR_VOL_MAXOBJCOUNT) *((u_long *)attrbufptr)++ = 0xFFFFFFFF; + if (a & ATTR_VOL_IOBLOCKSIZE) *((uint32_t *)attrbufptr)++ = (uint32_t)blocksize; + if (a & ATTR_VOL_OBJCOUNT) *((uint32_t *)attrbufptr)++ = 0; + if (a & ATTR_VOL_FILECOUNT) *((uint32_t *)attrbufptr)++ = 0; + if (a & ATTR_VOL_DIRCOUNT) *((uint32_t *)attrbufptr)++ = 0; + if (a & ATTR_VOL_MAXOBJCOUNT) *((uint32_t *)attrbufptr)++ = 0xFFFFFFFF; if (a & ATTR_VOL_NAME) { attrlength = strlen( imp->volume_id ) + 1; ((struct attrreference *)attrbufptr)->attr_dataoffset = (u_int8_t *)varbufptr - (u_int8_t *)attrbufptr; @@ -678,13 +708,15 @@ packvolattr (struct attrlist *alist, (u_int8_t *)varbufptr += attrlength + ((4 - (attrlength & 3)) & 3); ++((struct attrreference *)attrbufptr); }; - if (a & ATTR_VOL_MOUNTFLAGS) *((u_long *)attrbufptr)++ = (u_long)imp->im_mountp->mnt_flag; + if (a & ATTR_VOL_MOUNTFLAGS) { + *((uint32_t *)attrbufptr)++ = (uint32_t)vfs_flags(mp); + } if (a & ATTR_VOL_MOUNTEDDEVICE) { ((struct attrreference *)attrbufptr)->attr_dataoffset = (u_int8_t *)varbufptr - (u_int8_t *)attrbufptr; - ((struct attrreference *)attrbufptr)->attr_length = strlen(mp->mnt_stat.f_mntfromname) + 1; + ((struct attrreference *)attrbufptr)->attr_length = strlen(vfs_statfs(mp)->f_mntfromname) + 1; attrlength = ((struct attrreference *)attrbufptr)->attr_length; attrlength = attrlength + ((4 - (attrlength & 3)) & 3); /* round up to the next 4-byte boundary: */ - (void) bcopy(mp->mnt_stat.f_mntfromname, varbufptr, attrlength); + (void) bcopy(vfs_statfs(mp)->f_mntfromname, varbufptr, attrlength); /* Advance beyond the space just allocated: */ (u_int8_t *)varbufptr += attrlength; @@ -716,7 +748,8 @@ packvolattr (struct attrlist *alist, VOL_CAP_FMT_ZERO_RUNS | VOL_CAP_FMT_CASE_SENSITIVE | VOL_CAP_FMT_CASE_PRESERVING | - VOL_CAP_FMT_FAST_STATFS; + VOL_CAP_FMT_FAST_STATFS | + VOL_CAP_FMT_2TB_FILESIZE; ((vol_capabilities_attr_t *)attrbufptr)->valid[VOL_CAPABILITIES_INTERFACES] = VOL_CAP_INT_SEARCHFS | VOL_CAP_INT_ATTRLIST | @@ -764,7 +797,8 @@ packcommonattr (struct attrlist *alist, void *attrbufptr; void *varbufptr; attrgroup_t a; - u_long attrlength; + uint32_t attrlength; + boolean_t is_64_bit = proc_is64bit(current_proc()); attrbufptr = *attrbufptrptr; varbufptr = *varbufptrptr; @@ -774,7 +808,7 @@ packcommonattr (struct attrlist *alist, if (a & ATTR_CMN_NAME) { /* special case root since we know how to get its name */ - if (ITOV(ip)->v_flag & VROOT) { + if (vnode_isvroot(ITOV(ip))) { attrlength = strlen( imp->volume_id ) + 1; (void) strncpy((unsigned char *)varbufptr, imp->volume_id, attrlength); } else { @@ -789,11 +823,11 @@ packcommonattr (struct attrlist *alist, ++((struct attrreference *)attrbufptr); }; if (a & ATTR_CMN_DEVID) *((dev_t *)attrbufptr)++ = ip->i_dev; - if (a & ATTR_CMN_FSID) *((fsid_t *)attrbufptr)++ = ITOV(ip)->v_mount->mnt_stat.f_fsid; - if (a & ATTR_CMN_OBJTYPE) *((fsobj_type_t *)attrbufptr)++ = ITOV(ip)->v_type; - if (a & ATTR_CMN_OBJTAG) *((fsobj_tag_t *)attrbufptr)++ = ITOV(ip)->v_tag; + if (a & ATTR_CMN_FSID) *((fsid_t *)attrbufptr)++ = vfs_statfs(vnode_mount(ITOV(ip)))->f_fsid; + if (a & ATTR_CMN_OBJTYPE) *((fsobj_type_t *)attrbufptr)++ = vnode_vtype(ITOV(ip)); + if (a & ATTR_CMN_OBJTAG) *((fsobj_tag_t *)attrbufptr)++ = vnode_tag(ITOV(ip)); if (a & ATTR_CMN_OBJID) { - if (ITOV(ip)->v_flag & VROOT) + if (vnode_isvroot(ITOV(ip))) ((fsobj_id_t *)attrbufptr)->fid_objno = 2; /* force root to be 2 */ else ((fsobj_id_t *)attrbufptr)->fid_objno = ip->i_number; @@ 
-801,7 +835,7 @@ packcommonattr (struct attrlist *alist, ++((fsobj_id_t *)attrbufptr); }; if (a & ATTR_CMN_OBJPERMANENTID) { - if (ITOV(ip)->v_flag & VROOT) + if (vnode_isvroot(ITOV(ip))) ((fsobj_id_t *)attrbufptr)->fid_objno = 2; /* force root to be 2 */ else ((fsobj_id_t *)attrbufptr)->fid_objno = ip->i_number; @@ -822,22 +856,66 @@ packcommonattr (struct attrlist *alist, ++((fsobj_id_t *)attrbufptr); }; if (a & ATTR_CMN_SCRIPT) *((text_encoding_t *)attrbufptr)++ = 0; - if (a & ATTR_CMN_CRTIME) *((struct timespec *)attrbufptr)++ = ip->inode.iso_mtime; - if (a & ATTR_CMN_MODTIME) *((struct timespec *)attrbufptr)++ = ip->inode.iso_mtime; - if (a & ATTR_CMN_CHGTIME) *((struct timespec *)attrbufptr)++ = ip->inode.iso_ctime; - if (a & ATTR_CMN_ACCTIME) *((struct timespec *)attrbufptr)++ = ip->inode.iso_atime; + if (a & ATTR_CMN_CRTIME) { + if (is_64_bit) { + struct user_timespec *tmpp = ((struct user_timespec *)attrbufptr)++; + tmpp->tv_sec = (user_time_t) ip->inode.iso_mtime.tv_sec; + tmpp->tv_nsec = ip->inode.iso_mtime.tv_nsec; + } + else { + *((struct timespec *)attrbufptr)++ = ip->inode.iso_mtime; + } + } + if (a & ATTR_CMN_MODTIME) { + if (is_64_bit) { + struct user_timespec *tmpp = ((struct user_timespec *)attrbufptr)++; + tmpp->tv_sec = (user_time_t) ip->inode.iso_mtime.tv_sec; + tmpp->tv_nsec = ip->inode.iso_mtime.tv_nsec; + } + else { + *((struct timespec *)attrbufptr)++ = ip->inode.iso_mtime; + } + } + if (a & ATTR_CMN_CHGTIME) { + if (is_64_bit) { + struct user_timespec *tmpp = ((struct user_timespec *)attrbufptr)++; + tmpp->tv_sec = (user_time_t) ip->inode.iso_ctime.tv_sec; + tmpp->tv_nsec = ip->inode.iso_ctime.tv_nsec; + } + else { + *((struct timespec *)attrbufptr)++ = ip->inode.iso_ctime; + } + } + if (a & ATTR_CMN_ACCTIME) { + if (is_64_bit) { + struct user_timespec *tmpp = ((struct user_timespec *)attrbufptr)++; + tmpp->tv_sec = (user_time_t) ip->inode.iso_atime.tv_sec; + tmpp->tv_nsec = ip->inode.iso_atime.tv_nsec; + } + else { + *((struct timespec *)attrbufptr)++ = ip->inode.iso_atime; + } + } if (a & ATTR_CMN_BKUPTIME) { - ((struct timespec *)attrbufptr)->tv_sec = 0; - ((struct timespec *)attrbufptr)->tv_nsec = 0; - ++((struct timespec *)attrbufptr); - }; + if (is_64_bit) { + struct user_timespec *tmpp = ((struct user_timespec *)attrbufptr)++; + tmpp->tv_sec = (user_time_t) 0; + tmpp->tv_nsec = 0; + } + else { + ((struct timespec *)attrbufptr)->tv_sec = 0; + ((struct timespec *)attrbufptr)->tv_nsec = 0; + ++((struct timespec *)attrbufptr); + } + } if (a & ATTR_CMN_FNDRINFO) { - struct finder_info finfo = {0}; + struct finder_info finfo; + bzero(&finfo, sizeof(finfo)); finfo.fdFlags = ip->i_FinderFlags; finfo.fdLocation.v = -1; finfo.fdLocation.h = -1; - if (ITOV(ip)->v_type == VREG) { + if (vnode_isreg(ITOV(ip))) { finfo.fdType = ip->i_FileType; finfo.fdCreator = ip->i_Creator; } @@ -848,16 +927,14 @@ packcommonattr (struct attrlist *alist, }; if (a & ATTR_CMN_OWNERID) *((uid_t *)attrbufptr)++ = ip->inode.iso_uid; if (a & ATTR_CMN_GRPID) *((gid_t *)attrbufptr)++ = ip->inode.iso_gid; - if (a & ATTR_CMN_ACCESSMASK) *((u_long *)attrbufptr)++ = (u_long)ip->inode.iso_mode; - if (a & ATTR_CMN_FLAGS) *((u_long *)attrbufptr)++ = 0; /* could also use ip->i_flag */ + if (a & ATTR_CMN_ACCESSMASK) *((uint32_t *)attrbufptr)++ = (uint32_t)ip->inode.iso_mode; + if (a & ATTR_CMN_FLAGS) *((uint32_t *)attrbufptr)++ = 0; /* could also use ip->i_flag */ if (a & ATTR_CMN_USERACCESS) { - *((u_long *)attrbufptr)++ = + *((uint32_t 
*)attrbufptr)++ = DerivePermissionSummary(ip->inode.iso_uid, ip->inode.iso_gid, ip->inode.iso_mode, - imp, - current_proc()->p_ucred, - current_proc()); + imp); }; }; @@ -870,7 +947,7 @@ void packdirattr(struct attrlist *alist, struct iso_node *ip, void **attrbufptrptr, - void **varbufptrptr) + __unused void **varbufptrptr) { void *attrbufptr; attrgroup_t a; @@ -880,7 +957,7 @@ packdirattr(struct attrlist *alist, filcnt = dircnt = 0; a = alist->dirattr; - if ((ITOV(ip)->v_type == VDIR) && (a != 0)) { + if (vnode_isdir(ITOV(ip)) && (a != 0)) { /* * if we haven't counted our children yet, do it now... */ @@ -895,17 +972,17 @@ packdirattr(struct attrlist *alist, } if (a & ATTR_DIR_LINKCOUNT) { - *((u_long *)attrbufptr)++ = ip->inode.iso_links; + *((uint32_t *)attrbufptr)++ = ip->inode.iso_links; } if (a & ATTR_DIR_ENTRYCOUNT) { /* exclude '.' and '..' from total count */ - *((u_long *)attrbufptr)++ = ((ip->i_entries <= 2) ? 0 : (ip->i_entries - 2)); + *((uint32_t *)attrbufptr)++ = ((ip->i_entries <= 2) ? 0 : (ip->i_entries - 2)); } if (a & ATTR_DIR_MOUNTSTATUS) { - if (ITOV(ip)->v_mountedhere) { - *((u_long *)attrbufptr)++ = DIR_MNTSTATUS_MNTPOINT; + if (vnode_mountedhere(ITOV(ip))) { + *((uint32_t *)attrbufptr)++ = DIR_MNTSTATUS_MNTPOINT; } else { - *((u_long *)attrbufptr)++ = 0; + *((uint32_t *)attrbufptr)++ = 0; }; }; }; @@ -924,19 +1001,19 @@ packfileattr(struct attrlist *alist, void *varbufptr = *varbufptrptr; attrgroup_t a = alist->fileattr; - if ((ITOV(ip)->v_type == VREG) && (a != 0)) { + if (vnode_isreg(ITOV(ip)) && (a != 0)) { if (a & ATTR_FILE_LINKCOUNT) - *((u_long *)attrbufptr)++ = ip->inode.iso_links; + *((uint32_t *)attrbufptr)++ = ip->inode.iso_links; if (a & ATTR_FILE_TOTALSIZE) *((off_t *)attrbufptr)++ = (off_t)ip->i_size; if (a & ATTR_FILE_ALLOCSIZE) *((off_t *)attrbufptr)++ = (off_t)ip->i_size; if (a & ATTR_FILE_IOBLOCKSIZE) - *((u_long *)attrbufptr)++ = ip->i_mnt->logical_block_size; + *((uint32_t *)attrbufptr)++ = ip->i_mnt->logical_block_size; if (a & ATTR_FILE_CLUMPSIZE) - *((u_long *)attrbufptr)++ = ip->i_mnt->logical_block_size; + *((uint32_t *)attrbufptr)++ = ip->i_mnt->logical_block_size; if (a & ATTR_FILE_DEVTYPE) - *((u_long *)attrbufptr)++ = (u_long)ip->inode.iso_rdev; + *((uint32_t *)attrbufptr)++ = (uint32_t)ip->inode.iso_rdev; if (a & ATTR_FILE_DATALENGTH) *((off_t *)attrbufptr)++ = (off_t)ip->i_size; if (a & ATTR_FILE_DATAALLOCSIZE) @@ -965,7 +1042,7 @@ packattrblk(struct attrlist *alist, } else { packcommonattr(alist, ip, attrbufptrptr, varbufptrptr); - switch (ITOV(ip)->v_type) { + switch (vnode_vtype(ITOV(ip))) { case VDIR: packdirattr(alist, ip, attrbufptrptr, varbufptrptr); break; diff --git a/bsd/isofs/cd9660/cd9660_vfsops.c b/bsd/isofs/cd9660/cd9660_vfsops.c index a6e51cc5c..f026c811f 100644 --- a/bsd/isofs/cd9660/cd9660_vfsops.c +++ b/bsd/isofs/cd9660/cd9660_vfsops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -63,10 +63,11 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <sys/kernel.h> #include <miscfs/specfs/specdev.h> #include <sys/buf.h> @@ -119,15 +120,13 @@ struct CDTOC { u_char isonullname[] = "\0"; -extern int enodev (); - struct vfsops cd9660_vfsops = { cd9660_mount, cd9660_start, cd9660_unmount, cd9660_root, - cd9660_quotactl, - cd9660_statfs, + NULL, /* quotactl */ + cd9660_vfs_getattr, cd9660_sync, cd9660_vget, cd9660_fhtovp, @@ -143,8 +142,8 @@ struct vfsops cd9660_vfsops = { */ #define ROOTNAME "root_device" -static int iso_mountfs __P((struct vnode *devvp, struct mount *mp, - struct proc *p, struct iso_args *argp)); +static int iso_mountfs(struct vnode *devvp, struct mount *mp, struct user_iso_args *argp, + vfs_context_t context); static void DRGetTypeCreatorAndFlags( struct iso_mnt * theMountPointPtr, @@ -153,67 +152,22 @@ static void DRGetTypeCreatorAndFlags( u_int32_t * theCreatorPtr, u_int16_t * theFlagsPtr); -int cd9660_vget_internal( - struct mount *mp, - ino_t ino, - struct vnode **vpp, - int relocated, - struct iso_directory_record *isodir, - struct proc *p); - int -cd9660_mountroot() +cd9660_mountroot(mount_t mp, vnode_t rvp, vfs_context_t context) { - register struct mount *mp; - extern struct vnode *rootvp; - struct proc *p = current_proc(); /* XXX */ - struct iso_mnt *imp; - size_t size; - int error; - struct iso_args args; - - /* - * Get vnodes for swapdev and rootdev. - */ - if ( bdevvp(rootdev, &rootvp)) - panic("cd9660_mountroot: can't setup bdevvp's"); + int error; + struct user_iso_args args; - MALLOC_ZONE(mp, struct mount *, - sizeof(struct mount), M_MOUNT, M_WAITOK); - bzero((char *)mp, (u_long)sizeof(struct mount)); - - /* Initialize the default IO constraints */ - mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS; - mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32; - - mp->mnt_op = &cd9660_vfsops; - mp->mnt_flag = MNT_RDONLY; - LIST_INIT(&mp->mnt_vnodelist); args.flags = ISOFSMNT_ROOT; args.ssector = 0; - args.fspec = 0; args.toc_length = 0; - args.toc = 0; - if ((error = iso_mountfs(rootvp, mp, p, &args))) { - vrele(rootvp); /* release the reference from bdevvp() */ + args.toc = USER_ADDR_NULL; - if (mp->mnt_kern_flag & MNTK_IO_XINFO) - FREE(mp->mnt_xinfo_ptr, M_TEMP); - FREE_ZONE(mp, sizeof (struct mount), M_MOUNT); + if ((error = iso_mountfs(rvp, mp, &args, context))) return (error); - } - simple_lock(&mountlist_slock); - CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); - simple_unlock(&mountlist_slock); - mp->mnt_vnodecovered = NULLVP; - imp = VFSTOISOFS(mp); - (void) copystr("/", mp->mnt_stat.f_mntonname, MNAMELEN - 1, - &size); - bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); - (void) copystr(ROOTNAME, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, - &size); - bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); - (void)cd9660_statfs(mp, &mp->mnt_stat, p); + + (void)cd9660_statfs(mp, vfs_statfs(mp), context); + return (0); } @@ -223,72 +177,51 @@ cd9660_mountroot() * mount system call */ int -cd9660_mount(mp, path, data, ndp, p) - register struct mount *mp; - char *path; - caddr_t data; - struct nameidata *ndp; - struct proc *p; +cd9660_mount(mount_t mp, vnode_t devvp, user_addr_t data, vfs_context_t context) { - struct vnode *devvp; - struct iso_args args; - size_t size; + struct user_iso_args args; int error; struct iso_mnt *imp = NULL; - - if ((error 
= copyin(data, (caddr_t)&args, sizeof (struct iso_args)))) + + if (vfs_context_is64bit(context)) { + error = copyin(data, (caddr_t)&args, sizeof (args)); + } + else { + struct iso_args temp; + error = copyin(data, (caddr_t)&temp, sizeof (temp)); + args.flags = temp.flags; + args.ssector = temp.ssector; + args.toc_length = temp.toc_length; + args.toc = CAST_USER_ADDR_T(temp.toc); + } + if (error) return (error); - if ((mp->mnt_flag & MNT_RDONLY) == 0) + if (vfs_isrdwr(mp)) return (EROFS); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ - if (mp->mnt_flag & MNT_UPDATE) { + if (vfs_isupdate(mp)) { imp = VFSTOISOFS(mp); - if (args.fspec == 0) - return (vfs_export(mp, &imp->im_export, &args.export)); - } - /* - * Not an update, or updating the name: look up the name - * and verify that it refers to a sensible block device. - */ - NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); - if ((error = namei(ndp))) - return (error); - devvp = ndp->ni_vp; - - if (devvp->v_type != VBLK) { - vrele(devvp); - return (ENOTBLK); - } - if (major(devvp->v_rdev) >= nblkdev) { - vrele(devvp); - return (ENXIO); + if (devvp == 0) + return (0); } - if ((mp->mnt_flag & MNT_UPDATE) == 0) - error = iso_mountfs(devvp, mp, p, &args); + if ( !vfs_isupdate(mp)) + error = iso_mountfs(devvp, mp, &args, context); else { if (devvp != imp->im_devvp) error = EINVAL; /* needs translation */ - else - vrele(devvp); } if (error) { - vrele(devvp); return (error); } /* Indicate that we don't support volfs */ - mp->mnt_flag &= ~MNT_DOVOLFS; + vfs_clearflags(mp, MNT_DOVOLFS); - (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); - bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); - (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, - &size); - bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); return (0); } @@ -305,23 +238,23 @@ cd9660_mount(mp, path, data, ndp, p) * device's path. It is assumed to be in user memory. */ static struct vnode * -cd9660_phys_device(char *path, struct proc *p) +cd9660_phys_device(mount_t mp, vfs_context_t context) { int err; - char *whole_path = NULL; // path to "whole" device + char whole_path[64]; // path to "whole" device char *s, *saved; struct nameidata nd; struct vnode *result; - size_t actual_size; + struct vfsstatfs * sfs; - if (path == NULL) - return NULL; - + sfs = vfs_statfs(mp); result = NULL; + if (strlen(sfs->f_mntfromname) >= sizeof(whole_path)) + return (NULL); + /* Make a copy of the mount from name, then remove trailing "s...". */ - MALLOC(whole_path, char *, MNAMELEN, M_ISOFSMNT, M_WAITOK); - copyinstr(path, whole_path, MNAMELEN-1, &actual_size); + strncpy(whole_path, sfs->f_mntfromname, sizeof(whole_path)-1); /* * I would use strrchr or rindex here, but those are declared __private_extern__, @@ -333,25 +266,23 @@ cd9660_phys_device(char *path, struct proc *p) *saved = '\0'; /* Lookup the "whole" device. */ - NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, whole_path, p); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, CAST_USER_ADDR_T(whole_path), context); err = namei(&nd); if (err) { printf("isofs: Cannot find physical device: %s\n", whole_path); goto done; } - + nameidone(&nd); + /* Open the "whole" device. 
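 * (Video CD support reads raw 2352-byte XA sectors, which is why the
 * "whole" device is opened here rather than the partition device the
 * volume was mounted from.)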
*/ - err = VOP_OPEN(nd.ni_vp, FREAD, FSCRED, p); + err = VNOP_OPEN(nd.ni_vp, FREAD, context); if (err) { - vrele(nd.ni_vp); + vnode_put(nd.ni_vp); printf("isofs: Cannot open physical device: %s\n", whole_path); goto done; } - result = nd.ni_vp; - done: - FREE(whole_path, M_ISOFSMNT); return result; } @@ -366,41 +297,42 @@ static int cd9660_find_video_dir(struct iso_mnt *isomp) { int result, err; - struct vnode *rootvp = NULL; + struct vnode *rvp = NULL; struct vnode *videovp = NULL; struct componentname cn; + struct vfs_context context; char dirname[] = "MPEGAV"; result = 0; /* Assume not a video CD */ - err = cd9660_root(isomp->im_mountp, &rootvp); + err = cd9660_root(isomp->im_mountp, &rvp, NULL); if (err) { printf("cd9660_find_video_dir: cd9660_root failed (%d)\n", err); return 0; /* couldn't find video dir */ } + context.vc_proc = current_proc(); + context.vc_ucred = kauth_cred_get(); + cn.cn_nameiop = LOOKUP; - cn.cn_flags = LOCKPARENT|ISLASTCN; - cn.cn_proc = current_proc(); - cn.cn_cred = cn.cn_proc->p_ucred; + cn.cn_flags = ISLASTCN; + cn.cn_context = &context; cn.cn_pnbuf = dirname; cn.cn_pnlen = sizeof(dirname)-1; cn.cn_nameptr = cn.cn_pnbuf; cn.cn_namelen = cn.cn_pnlen; - err = VOP_LOOKUP(rootvp, &videovp, &cn); + err = VNOP_LOOKUP(rvp, &videovp, &cn, &context); if (err == 0) { struct iso_node *ip = VTOI(videovp); result = 1; /* Looks like video CD */ isomp->video_dir_start = ip->iso_start; isomp->video_dir_end = ip->iso_start + (ip->i_size >> isomp->im_bshift); isomp->im_flags2 |= IMF2_IS_VCD; - } - if (videovp != NULL) - vput(videovp); - if (rootvp != NULL) - vput(rootvp); + vnode_put(videovp); + } + vnode_put(rvp); return result; } @@ -409,20 +341,19 @@ cd9660_find_video_dir(struct iso_mnt *isomp) * Common code for mount and mountroot */ static int -iso_mountfs(devvp, mp, p, argp) +iso_mountfs(devvp, mp, argp, context) register struct vnode *devvp; struct mount *mp; - struct proc *p; - struct iso_args *argp; + struct user_iso_args *argp; + vfs_context_t context; { + struct proc *p; register struct iso_mnt *isomp = (struct iso_mnt *)0; struct buf *bp = NULL; struct buf *pribp = NULL, *supbp = NULL; - dev_t dev = devvp->v_rdev; + dev_t dev = vnode_specrdev(devvp); int error = EINVAL; int breaderr = 0; - int needclose = 0; - extern struct vnode *rootvp; u_long iso_bsize; int iso_blknum; int joliet_level; @@ -434,26 +365,9 @@ iso_mountfs(devvp, mp, p, argp) u_int8_t vdtype; int blkoff = argp->ssector; - if (!(mp->mnt_flag & MNT_RDONLY)) + if (vfs_isrdwr(mp)) return (EROFS); - /* - * Disallow multiple mounts of the same device. - * Disallow mounting of a device that is currently in use - * (except for root, which might share swap device for miniroot). - * Flush out any old buffers remaining from a previous use. - */ - if ((error = vfs_mountedon(devvp))) - return (error); - if (vcount(devvp) > 1 && devvp != rootvp) - return (EBUSY); - if ((error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0))) - return (error); - - if ((error = VOP_OPEN(devvp, FREAD, FSCRED, p))) - return (error); - needclose = 1; - /* This is the "logical sector size". The standard says this * should be 2048 or the physical sector size on the device, * whichever is greater. For now, we'll just use a constant. 
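 * (ECMA-119 allows any power of two no smaller than 2048 bytes; CD
 * media in practice always uses exactly 2048, so the constant is safe.)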
@@ -461,24 +375,24 @@ iso_mountfs(devvp, mp, p, argp) iso_bsize = ISO_DEFAULT_BLOCK_SIZE; /* tell IOKit that we're assuming 2K sectors */ - if ((error = VOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, - (caddr_t)&iso_bsize, FWRITE, p->p_ucred, p))) + if ((error = VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, + (caddr_t)&iso_bsize, FWRITE, context))) return (error); - devvp->v_specsize = iso_bsize; + joliet_level = 0; for (iso_blknum = 16 + blkoff; iso_blknum < (100 + blkoff); iso_blknum++) { - if ((error = bread(devvp, iso_blknum, iso_bsize, NOCRED, &bp))) { + if ((error = (int)buf_bread(devvp, (daddr64_t)((unsigned)iso_blknum), iso_bsize, NOCRED, &bp))) { if (bp) { - bp->b_flags |= B_AGE; - brelse(bp); + buf_markaged(bp); + buf_brelse(bp); bp = NULL; } breaderr = error; - printf("iso_mountfs: bread error %d reading block %d\n", error, iso_blknum); + printf("iso_mountfs: buf_bread error %d reading block %d\n", error, iso_blknum); continue; } - vdp = (struct iso_volume_descriptor *)bp->b_data; + vdp = (struct iso_volume_descriptor *)buf_dataptr(bp); if (bcmp (vdp->volume_desc_id, ISO_STANDARD_ID, sizeof(vdp->volume_desc_id)) != 0) { #ifdef DEBUG printf("cd9660_vfsops.c: iso_mountfs: " @@ -531,15 +445,15 @@ iso_mountfs(devvp, mp, p, argp) } if (bp) { - bp->b_flags |= B_AGE; - brelse(bp); + buf_markaged(bp); + buf_brelse(bp); bp = NULL; } } if (bp) { - bp->b_flags |= B_AGE; - brelse(bp); + buf_markaged(bp); + buf_brelse(bp); bp = NULL; } @@ -612,22 +526,20 @@ iso_mountfs(devvp, mp, p, argp) while ((1 << isomp->im_bshift) < isomp->logical_block_size) isomp->im_bshift++; - pribp->b_flags |= B_AGE; - brelse(pribp); + buf_markaged(pribp); + buf_brelse(pribp); pribp = NULL; - mp->mnt_data = (qaddr_t)isomp; - mp->mnt_stat.f_fsid.val[0] = (long)dev; - mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; - mp->mnt_maxsymlinklen = 0; - mp->mnt_flag |= MNT_LOCAL; + vfs_setfsprivate(mp, (void *)isomp); + vfs_statfs(mp)->f_fsid.val[0] = (long)dev; + vfs_statfs(mp)->f_fsid.val[1] = vfs_typenum(mp); + vfs_setmaxsymlen(mp, 0); + vfs_setflags(mp, MNT_LOCAL); isomp->im_mountp = mp; isomp->im_dev = dev; isomp->im_devvp = devvp; - devvp->v_specflags |= SI_MOUNTEDON; - /* * If the logical block size is not 2K then we must * set the block device's physical block size to this @@ -636,24 +548,23 @@ iso_mountfs(devvp, mp, p, argp) */ if (logical_block_size != iso_bsize) { iso_bsize = logical_block_size; - if ((error = VOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, - (caddr_t)&iso_bsize, FWRITE, p->p_ucred, p))) + if ((error = VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, + (caddr_t)&iso_bsize, FWRITE, context))) goto out; - devvp->v_specsize = iso_bsize; } /* Check the Rock Ridge Extension support */ if (!(argp->flags & ISOFSMNT_NORRIP)) { - if ( (error = bread(isomp->im_devvp, - (isomp->root_extent + isonum_711(rootp->ext_attr_length)), - isomp->logical_block_size, NOCRED, &bp)) ) { + if ( (error = (int)buf_bread(isomp->im_devvp, + (daddr64_t)((unsigned)((isomp->root_extent + isonum_711(rootp->ext_attr_length)))), + isomp->logical_block_size, NOCRED, &bp)) ) { - printf("iso_mountfs: bread error %d reading block %d\n", + printf("iso_mountfs: buf_bread error %d reading block %d\n", error, isomp->root_extent + isonum_711(rootp->ext_attr_length)); argp->flags |= ISOFSMNT_NORRIP; goto skipRRIP; } - rootp = (struct iso_directory_record *)bp->b_data; + rootp = (struct iso_directory_record *)buf_dataptr(bp); if ((isomp->rr_skip = cd9660_rrip_offset(rootp,isomp)) < 0) { argp->flags |= ISOFSMNT_NORRIP; @@ -665,8 +576,8 @@ iso_mountfs(devvp, mp, p, argp) * The 
contents are valid, * but they will get reread as part of another vnode, so... */ - bp->b_flags |= B_AGE; - brelse(bp); + buf_markaged(bp); + buf_brelse(bp); bp = NULL; } skipRRIP: @@ -721,12 +632,12 @@ skipRRIP: bcopy (rootp, isomp->root, sizeof isomp->root); isomp->root_extent = isonum_733 (rootp->extent); isomp->root_size = isonum_733 (rootp->size); - supbp->b_flags |= B_AGE; + buf_markaged(supbp); isomp->iso_ftype = ISO_FTYPE_JOLIET; } if (supbp) { - brelse(supbp); + buf_brelse(supbp); supbp = NULL; } @@ -740,34 +651,28 @@ skipRRIP: /* See if this could be a Video CD */ if ((isomp->im_flags2 & IMF2_IS_CDXA) && cd9660_find_video_dir(isomp)) { /* Get the 2352-bytes-per-block device. */ - isomp->phys_devvp = cd9660_phys_device(argp->fspec, p); + isomp->phys_devvp = cd9660_phys_device(mp, context); } + /* Fill the default statfs information */ + (void) cd9660_statfs(mp, vfs_statfs(mp), context); + return (0); out: if (bp) - brelse(bp); + buf_brelse(bp); if (pribp) - brelse(pribp); + buf_brelse(pribp); if (supbp) - brelse(supbp); - if (needclose) - (void)VOP_CLOSE(devvp, FREAD, NOCRED, p); + buf_brelse(supbp); + if (isomp) { if (isomp->toc) FREE((caddr_t)isomp->toc, M_ISOFSMNT); FREE((caddr_t)isomp, M_ISOFSMNT); - mp->mnt_data = (qaddr_t)0; - } - - /* Clear the mounted on bit in the devvp If it */ - /* not set, this is a nop and there is no way to */ - /* get here with it set unless we did it. If you*/ - /* are making code changes which makes the above */ - /* assumption not true, change this code. */ - - devvp->v_specflags &= ~SI_MOUNTEDON; + vfs_setfsprivate(mp, (void *)0); + } return (error); } @@ -777,10 +682,8 @@ out: */ /* ARGSUSED */ int -cd9660_start(mp, flags, p) - struct mount *mp; - int flags; - struct proc *p; +cd9660_start(__unused struct mount *mp, __unused int flags, + __unused vfs_context_t context) { return (0); } @@ -789,10 +692,7 @@ cd9660_start(mp, flags, p) * unmount system call */ int -cd9660_unmount(mp, mntflags, p) - struct mount *mp; - int mntflags; - struct proc *p; +cd9660_unmount(struct mount *mp, int mntflags, vfs_context_t context) { register struct iso_mnt *isomp; int error, flags = 0; @@ -812,27 +712,17 @@ cd9660_unmount(mp, mntflags, p) if (isomp->iso_ftype == ISO_FTYPE_RRIP) iso_dunmap(isomp->im_dev); #endif - - isomp->im_devvp->v_specflags &= ~SI_MOUNTEDON; - error = VOP_CLOSE(isomp->im_devvp, FREAD, NOCRED, p); - if (error && !force ) - return(error); - - vrele(isomp->im_devvp); - if (isomp->phys_devvp) { - error = VOP_CLOSE(isomp->phys_devvp, FREAD, FSCRED, p); + error = VNOP_CLOSE(isomp->phys_devvp, FREAD, context); if (error && !force) return error; - vrele(isomp->phys_devvp); + vnode_put(isomp->phys_devvp); } if (isomp->toc) FREE((caddr_t)isomp->toc, M_ISOFSMNT); - FREE((caddr_t)isomp, M_ISOFSMNT); - mp->mnt_data = (qaddr_t)0; - mp->mnt_flag &= ~MNT_LOCAL; + return (0); } @@ -840,9 +730,7 @@ cd9660_unmount(mp, mntflags, p) * Return root of a filesystem */ int -cd9660_root(mp, vpp) - struct mount *mp; - struct vnode **vpp; +cd9660_root(struct mount *mp, struct vnode **vpp, __unused vfs_context_t context) { struct iso_mnt *imp = VFSTOISOFS(mp); struct iso_directory_record *dp = @@ -853,58 +741,50 @@ cd9660_root(mp, vpp) * With RRIP we must use the `.' entry of the root directory. * Simply tell vget, that it's a relocated directory. 
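The mount path above also fixes the buffer-cache idiom used throughout this patch: buf_bread() takes a daddr64_t block number, data is reached through buf_dataptr() instead of b_data, and one-shot blocks are aged before release rather than poking b_flags. The same pattern in isolation, as a sketch; read_one_block and its copy-out destination are hypothetical:

#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/ucred.h>

/*
 * Read one logical block through the buffer cache and copy it out.
 * Hypothetical helper; the calls mirror the iso_mountfs hunks above.
 */
static int
read_one_block(vnode_t devvp, int blkno, int bsize, caddr_t out)
{
	buf_t bp = NULL;
	int error;

	/* block numbers are daddr64_t now; widen through unsigned */
	error = (int)buf_bread(devvp, (daddr64_t)((unsigned)blkno),
	    bsize, NOCRED, &bp);
	if (error) {
		if (bp)
			buf_brelse(bp);
		return (error);
	}
	bcopy((caddr_t)buf_dataptr(bp), out, bsize);

	buf_markaged(bp);	/* one-shot data: let the cache recycle it early */
	buf_brelse(bp);
	return (0);
}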
*/ - return (cd9660_vget_internal(mp, ino, vpp, + return (cd9660_vget_internal(mp, ino, vpp, NULL, NULL, imp->iso_ftype == ISO_FTYPE_RRIP, dp, current_proc())); } -/* - * Do operations associated with quotas, not supported - */ -/* ARGSUSED */ -int -cd9660_quotactl(mp, cmd, uid, arg, p) - struct mount *mp; - int cmd; - uid_t uid; - caddr_t arg; - struct proc *p; -{ - - return (EOPNOTSUPP); -} - /* * Get file system statistics. */ +/* ARGSUSED */ int -cd9660_statfs(mp, sbp, p) - struct mount *mp; - register struct statfs *sbp; - struct proc *p; +cd9660_statfs(struct mount *mp, register struct vfsstatfs *sbp, + __unused vfs_context_t context) { register struct iso_mnt *isomp; isomp = VFSTOISOFS(mp); +#if 0 #ifdef COMPAT_09 sbp->f_type = 5; #else sbp->f_type = 0; #endif - sbp->f_bsize = isomp->logical_block_size; - sbp->f_iosize = sbp->f_bsize; /* XXX */ - sbp->f_blocks = isomp->volume_space_size; - sbp->f_bfree = 0; /* total free blocks */ - sbp->f_bavail = 0; /* blocks free for non superuser */ - sbp->f_files = 0; /* total files */ - sbp->f_ffree = 0; /* free file nodes */ - if (sbp != &mp->mnt_stat) { - bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); - bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); - } +#endif + sbp->f_bsize = (uint32_t)isomp->logical_block_size; + sbp->f_iosize = (size_t)sbp->f_bsize; /* XXX */ + sbp->f_blocks = (uint64_t)((unsigned long)isomp->volume_space_size); + sbp->f_bfree = (uint64_t)0; /* total free blocks */ + sbp->f_bavail = (uint64_t)0; /* blocks free for non superuser */ + sbp->f_files = (uint64_t)0; /* total files */ + sbp->f_ffree = (uint64_t)0; /* free file nodes */ + sbp->f_fstypename[(MFSTYPENAMELEN - 1)] = '\0'; - strncpy( sbp->f_fstypename, mp->mnt_vfc->vfc_name, (MFSNAMELEN - 1) ); - sbp->f_fstypename[(MFSNAMELEN - 1)] = '\0'; + /* + * Subtypes (flavors) for ISO 9660 + * 0: ISO-9660 + * 1: ISO-9660 (Joliet) + * 2: ISO-9660 (Rockridge) + */ + if (isomp->iso_ftype == ISO_FTYPE_JOLIET) + sbp->f_fssubtype = 1; + else if (isomp->iso_ftype == ISO_FTYPE_RRIP) + sbp->f_fssubtype = 2; + else + sbp->f_fssubtype = 0; /* DO NOT use the first spare for flags; it's been reassigned for another use: */ /* sbp->f_spare[0] = isomp->im_flags; */ @@ -912,13 +792,109 @@ cd9660_statfs(mp, sbp, p) return (0); } +int cd9660_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, vfs_context_t context) +{ + struct iso_mnt *imp; + struct vfsstatfs *stats = vfs_statfs(mp); + + imp = VFSTOISOFS(mp); + + /* + * We don't know reasonable values for f_objcount, f_filecount, + * f_dircount, f_maxobjcount so don't bother making up (poor) + * numbers like 10.3.x and earlier did. + */ + + VFSATTR_RETURN(fsap, f_iosize, stats->f_iosize); + VFSATTR_RETURN(fsap, f_blocks, stats->f_blocks); + VFSATTR_RETURN(fsap, f_bfree, stats->f_bfree); + VFSATTR_RETURN(fsap, f_bavail, stats->f_bavail); + VFSATTR_RETURN(fsap, f_bused, stats->f_blocks); + + /* We don't have file counts, so don't return them */ + + /* f_fsid and f_owner should be handled by VFS */ + + /* We don't have a value for f_uuid */ + + if (VFSATTR_IS_ACTIVE(fsap, f_capabilities)) { + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] = + (imp->iso_ftype == ISO_FTYPE_RRIP ? VOL_CAP_FMT_SYMBOLICLINKS : 0) | + (imp->iso_ftype == ISO_FTYPE_RRIP ? VOL_CAP_FMT_HARDLINKS : 0) | + (imp->iso_ftype == ISO_FTYPE_RRIP || imp->iso_ftype == ISO_FTYPE_JOLIET + ? 
VOL_CAP_FMT_CASE_SENSITIVE : 0) | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS; + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] = + VOL_CAP_INT_ATTRLIST | + VOL_CAP_INT_NFSEXPORT; + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_RESERVED1] = 0; + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_RESERVED2] = 0; + + fsap->f_capabilities.valid[VOL_CAPABILITIES_FORMAT] = + VOL_CAP_FMT_PERSISTENTOBJECTIDS | + VOL_CAP_FMT_SYMBOLICLINKS | + VOL_CAP_FMT_HARDLINKS | + VOL_CAP_FMT_JOURNAL | + VOL_CAP_FMT_JOURNAL_ACTIVE | + VOL_CAP_FMT_NO_ROOT_TIMES | + VOL_CAP_FMT_SPARSE_FILES | + VOL_CAP_FMT_ZERO_RUNS | + VOL_CAP_FMT_CASE_SENSITIVE | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS | + VOL_CAP_FMT_2TB_FILESIZE; + fsap->f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] = + VOL_CAP_INT_SEARCHFS | + VOL_CAP_INT_ATTRLIST | + VOL_CAP_INT_NFSEXPORT | + VOL_CAP_INT_READDIRATTR | + VOL_CAP_INT_EXCHANGEDATA | + VOL_CAP_INT_COPYFILE | + VOL_CAP_INT_ALLOCATE | + VOL_CAP_INT_VOL_RENAME | + VOL_CAP_INT_ADVLOCK | + VOL_CAP_INT_FLOCK; + fsap->f_capabilities.valid[VOL_CAPABILITIES_RESERVED1] = 0; + fsap->f_capabilities.valid[VOL_CAPABILITIES_RESERVED2] = 0; + + VFSATTR_SET_SUPPORTED(fsap, f_capabilities); + } + + if (VFSATTR_IS_ACTIVE(fsap, f_attributes)) { + /* + * VFS should really set these based on the vfs_attr and vnop_attr + * fields the file system supports, combined with the conversions + * VFS has implemented. + */ + + fsap->f_attributes.validattr.commonattr = ATTR_CMN_VALIDMASK; + fsap->f_attributes.validattr.volattr = ATTR_VOL_VALIDMASK; + fsap->f_attributes.validattr.dirattr = ATTR_DIR_VALIDMASK; + fsap->f_attributes.validattr.fileattr = ATTR_FILE_VALIDMASK; + fsap->f_attributes.validattr.forkattr = ATTR_FORK_VALIDMASK; + + fsap->f_attributes.nativeattr.commonattr = ATTR_CMN_VALIDMASK; + fsap->f_attributes.nativeattr.volattr = ATTR_VOL_VALIDMASK; + fsap->f_attributes.nativeattr.dirattr = ATTR_DIR_VALIDMASK; + fsap->f_attributes.nativeattr.fileattr = ATTR_FILE_VALIDMASK; + fsap->f_attributes.nativeattr.forkattr = ATTR_FORK_VALIDMASK; + + VFSATTR_SET_SUPPORTED(fsap, f_attributes); + } + + VFSATTR_RETURN(fsap, f_create_time, imp->creation_date); + VFSATTR_RETURN(fsap, f_modify_time, imp->modification_date); + /* No explicit access time, so let VFS pick a default value */ + /* No explicit backup time, so let VFS pick a default value */ + + return 0; +} + /* ARGSUSED */ int -cd9660_sync(mp, waitfor, cred, p) - struct mount *mp; - int waitfor; - struct ucred *cred; - struct proc *p; +cd9660_sync(__unused struct mount *mp, __unused int waitfor, + __unused vfs_context_t context) { return (0); @@ -935,56 +911,38 @@ cd9660_sync(mp, waitfor, cred, p) */ struct ifid { - ushort ifid_len; - ushort ifid_pad; int ifid_ino; long ifid_start; }; /* ARGSUSED */ int -cd9660_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) - register struct mount *mp; - struct fid *fhp; - struct mbuf *nam; - struct vnode **vpp; - int *exflagsp; - struct ucred **credanonp; +cd9660_fhtovp(mount_t mp, int fhlen, unsigned char *fhp, vnode_t *vpp, vfs_context_t context) { struct ifid *ifhp = (struct ifid *)fhp; register struct iso_node *ip; - register struct netcred *np; - register struct iso_mnt *imp = VFSTOISOFS(mp); struct vnode *nvp; int error; + if (fhlen < (int)sizeof(struct ifid)) + return (EINVAL); + #ifdef ISOFS_DBG printf("fhtovp: ino %d, start %ld\n", ifhp->ifid_ino, ifhp->ifid_start); #endif - /* - * Get the export permission structure for this <mp, client> tuple. 
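With the NFS export checks gone, the handle routines reduce to a length-checked encode/decode of the two ifid words above. A sketch of both directions under that layout; iso_fh_encode and iso_fh_decode are hypothetical names (the real entry points are cd9660_vptofh and cd9660_fhtovp, shown in this patch):

#include <sys/errno.h>

/*
 * Hypothetical helpers showing the explicit-length handle protocol.
 * The layout matches the trimmed struct ifid above.
 */
static int
iso_fh_encode(struct iso_node *ip, unsigned char *fhp, int *fhlenp)
{
	struct ifid *ifhp = (struct ifid *)fhp;

	if (*fhlenp < (int)sizeof(struct ifid))
		return (EOVERFLOW);		/* caller's buffer too small */
	ifhp->ifid_ino = ip->i_number;
	ifhp->ifid_start = ip->iso_start;
	*fhlenp = sizeof(struct ifid);		/* report bytes actually used */
	return (0);
}

static int
iso_fh_decode(int fhlen, unsigned char *fhp, ino64_t *inop)
{
	struct ifid *ifhp = (struct ifid *)fhp;

	if (fhlen < (int)sizeof(struct ifid))
		return (EINVAL);		/* short or garbage handle */
	*inop = (ino64_t)ifhp->ifid_ino;	/* widened for VFS_VGET */
	return (0);
}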
- */ - np = vfs_export_lookup(mp, &imp->im_export, nam); - if (nam && (np == NULL)) - return (EACCES); - - if ( (error = VFS_VGET(mp, &ifhp->ifid_ino, &nvp)) ) { + if ( (error = VFS_VGET(mp, (ino64_t)ifhp->ifid_ino, &nvp, context)) ) { *vpp = NULLVP; return (error); } ip = VTOI(nvp); if (ip->inode.iso_mode == 0) { - vput(nvp); + vnode_put(nvp); *vpp = NULLVP; return (ESTALE); } *vpp = nvp; - if (np) { - *exflagsp = np->netc_exflags; - *credanonp = &np->netc_anon; - } return (0); } @@ -1075,10 +1033,7 @@ cd9660_is_video_file(struct iso_node *ip, struct iso_mnt *imp) } int -cd9660_vget(mp, ino, vpp) - struct mount *mp; - void *ino; - struct vnode **vpp; +cd9660_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, __unused vfs_context_t context) { /* * XXXX @@ -1087,55 +1042,56 @@ cd9660_vget(mp, ino, vpp) * that right now. */ - return ( cd9660_vget_internal( mp, *(ino_t*)ino, vpp, 0, - (struct iso_directory_record *) 0, - current_proc()) ); + return ( cd9660_vget_internal( mp, (ino_t)ino, vpp, NULL, NULL, + 0, (struct iso_directory_record *) 0, current_proc()) ); } int -cd9660_vget_internal(mp, ino, vpp, relocated, isodir, p) - struct mount *mp; - ino_t ino; - struct vnode **vpp; - int relocated; - struct iso_directory_record *isodir; - struct proc *p; +cd9660_vget_internal(mount_t mp, ino_t ino, vnode_t *vpp, vnode_t dvp, + struct componentname *cnp, int relocated, + struct iso_directory_record *isodir, proc_t p) { register struct iso_mnt *imp; struct iso_node *ip; - struct buf *bp; - struct vnode *vp, *nvp; - dev_t dev; - int error; - - imp = VFSTOISOFS(mp); - dev = imp->im_dev; - + buf_t bp = NULL; + vnode_t vp; + dev_t dev; + int error; + struct vnode_fsparam vfsp; + enum vtype vtype; + int is_video_file = 0; + + *vpp = NULLVP; + imp = VFSTOISOFS(mp); + dev = imp->im_dev; +#if 0 /* Check for unmount in progress */ - if (mp->mnt_kern_flag & MNTK_UNMOUNT) { - *vpp = NULLVP; - return (EPERM); - } - - if ((*vpp = cd9660_ihashget(dev, ino, p)) != NULLVP) - return (0); + if (mp->mnt_kern_flag & MNTK_UNMOUNT) + return (EPERM); +#endif MALLOC_ZONE(ip, struct iso_node *, sizeof(struct iso_node), - M_ISOFSNODE, M_WAITOK); - /* Allocate a new vnode/iso_node. */ - if ( (error = getnewvnode(VT_ISOFS, mp, cd9660_vnodeop_p, &vp)) ) { - FREE_ZONE(ip,sizeof(struct iso_node), M_ISOFSNODE); - *vpp = NULLVP; - return (error); + M_ISOFSNODE, M_WAITOK); + /* + * MALLOC_ZONE may block, so check for the inode being + * present in the hash after we get back... + * we also assume that we're under a filesystem lock + * so that we're not reentered between the ihashget and + * the ihashins... 
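The comment below describes the race that motivates the allocate-then-recheck dance in cd9660_vget_internal: MALLOC_ZONE can block, so the hash must be consulted again once the thread wakes. The control flow in isolation, as a sketch; node_zalloc, node_zfree, hash_lookup, and hash_insert are hypothetical stand-ins for the cd9660 zone and ihash routines:

/*
 * Allocate first, recheck second.  Assumes a filesystem-wide lock
 * spans the lookup and the insert, per the comment below; every
 * name in this sketch is hypothetical.
 */
static struct iso_node *
node_get(dev_t dev, ino_t ino)
{
	struct iso_node *ip, *existing;

	ip = node_zalloc();			/* may block and lose the CPU */

	if ((existing = hash_lookup(dev, ino)) != NULL) {
		node_zfree(ip);			/* another thread won the race */
		return (existing);
	}
	hash_insert(dev, ino, ip);		/* publish before blocking again */
	return (ip);
}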
+ */ + if ((*vpp = cd9660_ihashget(dev, ino, p)) != NULLVP) { + FREE_ZONE(ip, sizeof(struct iso_node), M_ISOFSNODE); + return (0); } bzero((caddr_t)ip, sizeof(struct iso_node)); - lockinit(&ip->i_lock, PINOD,"isonode",0,0); - vp->v_data = ip; - ip->i_vnode = vp; + ip->i_dev = dev; ip->i_number = ino; ip->i_namep = &isonullname[0]; + ip->i_mnt = imp; + ip->i_devvp = imp->im_devvp; + SET(ip->i_flag, ISO_INALLOC); /* * Put it onto its hash chain and lock it so that other requests for * this inode will block if they arrive while we are sleeping waiting @@ -1148,40 +1104,36 @@ cd9660_vget_internal(mp, ino, vpp, relocated, isodir, p) int lbn, off; lbn = lblkno(imp, ino); + if (lbn >= imp->volume_space_size) { - vput(vp); printf("fhtovp: lbn exceed volume space %d\n", lbn); - return (ESTALE); + error = ESTALE; + goto errout; } - off = blkoff(imp, ino); + if (off + ISO_DIRECTORY_RECORD_SIZE > imp->logical_block_size) { - vput(vp); printf("fhtovp: crosses block boundary %d\n", off + ISO_DIRECTORY_RECORD_SIZE); - return (ESTALE); + error = ESTALE; + goto errout; } - error = bread(imp->im_devvp, lbn, - imp->logical_block_size, NOCRED, &bp); + error = (int)buf_bread(imp->im_devvp, (daddr64_t)((unsigned)lbn), + imp->logical_block_size, NOCRED, &bp); if (error) { - vput(vp); - brelse(bp); - printf("fhtovp: bread error %d\n",error); - return (error); + printf("fhtovp: buf_bread error %d\n",error); + goto errout; } - isodir = (struct iso_directory_record *)(bp->b_data + off); + isodir = (struct iso_directory_record *)(buf_dataptr(bp) + off); - if (off + isonum_711(isodir->length) > - imp->logical_block_size) { - vput(vp); - if (bp != 0) - brelse(bp); + if (off + isonum_711(isodir->length) > imp->logical_block_size) { printf("fhtovp: directory crosses block boundary " "%d[off=%d/len=%d]\n", off +isonum_711(isodir->length), off, isonum_711(isodir->length)); - return (ESTALE); + error = ESTALE; + goto errout; } /* @@ -1193,31 +1145,40 @@ cd9660_vget_internal(mp, ino, vpp, relocated, isodir, p) struct iso_directory_record *pdp; pdp = (struct iso_directory_record *) - ((char *)bp->b_data + isonum_711(isodir->length)); + ((char *)buf_dataptr(bp) + isonum_711(isodir->length)); if ((isonum_711(pdp->flags) & directoryBit) && (pdp->name[0] == 1)) ip->i_parent = isodirino(pdp, imp); } - } else - bp = 0; - - ip->i_mnt = imp; - ip->i_devvp = imp->im_devvp; - VREF(ip->i_devvp); - + } if (relocated) { + daddr64_t lbn; + + if (bp) { + buf_brelse(bp); + bp = NULL; + } /* * On relocated directories we must * read the `.' entry out of a dir. */ ip->iso_start = ino >> imp->im_bshift; - if (bp != 0) - brelse(bp); - if ( (error = VOP_BLKATOFF(vp, (off_t)0, NULL, &bp)) ) { - vput(vp); - return (error); - } - isodir = (struct iso_directory_record *)bp->b_data; + /* + * caclulate the correct lbn to read block 0 + * of this node... this used to be a cd9660_blkatoff, but + * that requires the vnode to already be 'cooked'... in + * the new world, we don't create a vnode until the inode + * has been fully initialized... 
cd9660_blkatoff generates + * a buf_bread for im_sector_size associated with the node's vp + * I'm replacing it with a buf_bread for the same size and from + * the same location on the disk, but associated with the devvp + */ + lbn = (daddr64_t)((unsigned)ip->iso_start) + 0; + + if ((error = (int)buf_bread(imp->im_devvp, lbn, imp->im_sector_size, NOCRED, &bp))) + goto errout; + + isodir = (struct iso_directory_record *)buf_dataptr(bp); } /* @@ -1287,32 +1248,40 @@ cd9660_vget_internal(mp, ino, vpp, relocated, isodir, p) /* * Setup time stamp, attribute */ - vp->v_type = VNON; switch (imp->iso_ftype) { default: /* ISO_FTYPE_9660 */ { - struct buf *bp2; - int off; - if ((imp->im_flags & ISOFSMNT_EXTATT) - && (off = isonum_711(isodir->ext_attr_length))) - VOP_BLKATOFF(vp, (off_t)-(off << imp->im_bshift), NULL, &bp2); - else + buf_t bp2 = NULL; + daddr64_t lbn; + int off; + + if ((imp->im_flags & ISOFSMNT_EXTATT) && (off = isonum_711(isodir->ext_attr_length))) { + + lbn = (daddr64_t)((unsigned)ip->iso_start - off); + + if ((error = (int)buf_bread(imp->im_devvp, lbn, imp->im_sector_size, NOCRED, &bp2))) { + if (bp2) + buf_brelse(bp2); + goto errout; + } + } else bp2 = NULL; + cd9660_defattr(isodir, ip, bp2); cd9660_deftstamp(isodir, ip, bp2); + if (bp2) - brelse(bp2); + buf_brelse(bp2); break; } case ISO_FTYPE_RRIP: cd9660_rrip_analyze(isodir, ip, imp); break; } - /* * See if this is a Video CD file. If so, we must adjust the * length to account for larger sectors plus the RIFF header. - * We also must substitute the VOP_READ and VOP_PAGEIN functions. + * We also must substitute the vnop_read and vnop_pagein functions. * * The cd9660_is_video_file routine assumes that the inode has * been completely set up; it refers to several fields. @@ -1322,75 +1291,101 @@ cd9660_vget_internal(mp, ino, vpp, relocated, isodir, p) */ if (cd9660_is_video_file(ip, imp)) { - cd9660_xa_init(vp, isodir); + cd9660_xa_init(ip, isodir); + + is_video_file = 1; } - - if (bp != 0) - brelse(bp); - - /* - * Initialize the associated vnode - */ - if (ip->iso_extent == imp->root_extent) { - vp->v_flag |= VROOT; ip->i_parent = 1; /* root's parent is always 1 by convention */ /* mode type must be S_IFDIR */ ip->inode.iso_mode = (ip->inode.iso_mode & ~S_IFMT) | S_IFDIR; } - - switch (vp->v_type = IFTOVT(ip->inode.iso_mode)) { - case VFIFO: -#if FIFO - vp->v_op = cd9660_fifoop_p; - break; -#else - vput(vp); - return (EOPNOTSUPP); -#endif /* FIFO */ - case VCHR: - case VBLK: - /* - * if device, look at device number table for translation - */ -#ifdef ISODEVMAP - if (dp = iso_dmap(dev, ino, 0)) - ip->inode.iso_rdev = dp->d_dev; + vtype = IFTOVT(ip->inode.iso_mode); +#if !FIFO + if (vtype == VFIFO) { + error = ENOTSUP; + goto errout; + } #endif - vp->v_op = cd9660_specop_p; - if ( (nvp = checkalias(vp, ip->inode.iso_rdev, mp)) ) { - /* - * Discard unneeded vnode, but save its iso_node. - */ - cd9660_ihashrem(ip); - VOP_UNLOCK(vp, 0, p); - nvp->v_data = vp->v_data; - vp->v_data = NULL; - vp->v_op = spec_vnodeop_p; - vrele(vp); - vgone(vp); - /* - * Reinitialize aliased inode. - */ - vp = nvp; - ip->i_vnode = vp; - cd9660_ihashins(ip); - } - break; - case VREG: - ubc_info_init(vp); - break; - default: - break; +#ifdef ISODEVMAP + if (vtype == VCHR || vtype == VBLK) { + struct iso_dnode *dp; + + if (dp = iso_dmap(dev, ino, 0)) + ip->inode.iso_rdev = dp->d_dev; } - +#endif /* - * XXX need generation number? 
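This is the setup for the other structural change in cd9660_vget_internal: the iso_node is fully initialized first, and only then is a vnode manufactured around it. The hunk that follows fills in the vnode_fsparam fields for cd9660; the shape of that handshake as a fragment, with illustrative field values:

#include <sys/vnode.h>

/*
 * Create a vnode for an already-initialized fs node.  Fragment only;
 * field values here are illustrative, the following hunk has the
 * cd9660 specifics.
 */
struct vnode_fsparam vfsp;
vnode_t vp;
int error;

vfsp.vnfs_mp = mp;			/* owning mount */
vfsp.vnfs_vtype = VREG;			/* from IFTOVT() on the node's mode */
vfsp.vnfs_str = "cd9660";		/* tag string for debugging */
vfsp.vnfs_dvp = dvp;			/* parent vnode, may be NULL */
vfsp.vnfs_fsnode = ip;			/* fs-private node, already cooked */
vfsp.vnfs_cnp = cnp;			/* component name for the name cache */
vfsp.vnfs_vops = cd9660_vnodeop_p;	/* op vector to dispatch through */
vfsp.vnfs_rdev = 0;			/* only meaningful for VBLK/VCHR */
vfsp.vnfs_filesize = ip->i_size;
vfsp.vnfs_flags = VNFS_NOCACHE;		/* no cache entry unless MAKEENTRY */
vfsp.vnfs_markroot = 0;			/* nonzero tags the volume root */
vfsp.vnfs_marksystem = 0;

if ((error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &vp)) == 0)
	ip->i_vnode = vp;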
+ * create the associated vnode */ + //bzero(&vfsp, sizeof(struct vnode_fsparam)); + vfsp.vnfs_mp = mp; + vfsp.vnfs_vtype = vtype; + vfsp.vnfs_str = "cd9660"; + vfsp.vnfs_dvp = dvp; + vfsp.vnfs_fsnode = ip; + vfsp.vnfs_cnp = cnp; + + if (is_video_file) + vfsp.vnfs_vops = cd9660_cdxaop_p; + else if (vtype == VFIFO ) + vfsp.vnfs_vops = cd9660_fifoop_p; + else if (vtype == VBLK || vtype == VCHR) + vfsp.vnfs_vops = cd9660_specop_p; + else + vfsp.vnfs_vops = cd9660_vnodeop_p; + + if (vtype == VBLK || vtype == VCHR) + vfsp.vnfs_rdev = ip->inode.iso_rdev; + else + vfsp.vnfs_rdev = 0; + + vfsp.vnfs_filesize = ip->i_size; + if (dvp && cnp && (cnp->cn_flags & MAKEENTRY)) + vfsp.vnfs_flags = 0; + else + vfsp.vnfs_flags = VNFS_NOCACHE; + + /* Tag root directory */ + if (ip->iso_extent == imp->root_extent) + vfsp.vnfs_markroot = 1; + else + vfsp.vnfs_markroot = 0; + + vfsp.vnfs_marksystem = 0; + + if ( (error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &vp)) ) + goto errout; + + ip->i_vnode = vp; + + vnode_ref(ip->i_devvp); + vnode_addfsref(vp); + vnode_settag(vp, VT_ISOFS); + + if (bp) + buf_brelse(bp); *vpp = vp; + CLR(ip->i_flag, ISO_INALLOC); + + if (ISSET(ip->i_flag, ISO_INWALLOC)) + wakeup(ip); + return (0); + +errout: + if (bp) + buf_brelse(bp); + cd9660_ihashrem(ip); + + if (ISSET(ip->i_flag, ISO_INWALLOC)) + wakeup(ip); + + FREE_ZONE(ip, sizeof(struct iso_node), M_ISOFSNODE); + + return (error); } @@ -1587,18 +1582,19 @@ DoneLooking: */ /* ARGSUSED */ int -cd9660_vptofh(vp, fhp) - struct vnode *vp; - struct fid *fhp; +cd9660_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, __unused vfs_context_t context) { register struct iso_node *ip = VTOI(vp); register struct ifid *ifhp; + + if (*fhlenp < (int)sizeof(struct ifid)) + return (EOVERFLOW); ifhp = (struct ifid *)fhp; - ifhp->ifid_len = sizeof(struct ifid); ifhp->ifid_ino = ip->i_number; ifhp->ifid_start = ip->iso_start; + *fhlenp = sizeof(struct ifid); #ifdef ISOFS_DBG printf("vptofh: ino %d, start %ld\n", @@ -1611,15 +1607,10 @@ cd9660_vptofh(vp, fhp) * Fast-FileSystem only? */ int -cd9660_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int * name; - u_int namelen; - void* oldp; - size_t * oldlenp; - void * newp; - size_t newlen; - struct proc * p; +cd9660_sysctl(__unused int *name, __unused u_int namelen, __unused user_addr_t oldp, + __unused size_t *oldlenp, __unused user_addr_t newp, + __unused size_t newlen, __unused vfs_context_t context) { - return (EOPNOTSUPP); + return (ENOTSUP); } diff --git a/bsd/isofs/cd9660/cd9660_vnops.c b/bsd/isofs/cd9660/cd9660_vnops.c index 9484b5084..6789bfc1b 100644 --- a/bsd/isofs/cd9660/cd9660_vnops.c +++ b/bsd/isofs/cd9660/cd9660_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -80,6 +80,7 @@ #include <sys/stat.h> #include <sys/buf.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <sys/conf.h> #include <miscfs/specfs/specdev.h> #include <miscfs/fifofs/fifo.h> @@ -87,10 +88,16 @@ #include <sys/dir.h> #include <sys/attr.h> #include <vfs/vfs_support.h> +#include <vm/vm_kern.h> #include <sys/ubc.h> #include <sys/lock.h> +#include <sys/ubc_internal.h> +#include <sys/uio_internal.h> #include <architecture/byte_order.h> +#include <vm/vm_map.h> +#include <vm/vm_kern.h> /* kmem_alloc, kmem_free */ + #include <isofs/cd9660/iso.h> #include <isofs/cd9660/cd9660_node.h> #include <isofs/cd9660/iso_rrip.h> @@ -100,15 +107,8 @@ * * Nothing to do. */ -/* ARGSUSED */ int -cd9660_open(ap) - struct vop_open_args /* { - struct vnode *a_vp; - int a_mode; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; +cd9660_open(__unused struct vnop_open_args *ap) { return (0); } @@ -118,157 +118,55 @@ cd9660_open(ap) * * Update the times on the inode on writeable file systems. */ -/* ARGSUSED */ int -cd9660_close(ap) - struct vop_close_args /* { - struct vnode *a_vp; - int a_fflag; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; +cd9660_close(__unused struct vnop_close_args *ap) { return (0); } -/* - * Check mode permission on inode pointer. Mode is READ, WRITE or EXEC. - * The mode is shifted to select the owner/group/other fields. The - * super user is granted all permissions. - */ -/* ARGSUSED */ -int -cd9660_access(ap) - struct vop_access_args /* { - struct vnode *a_vp; - int a_mode; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - struct iso_node *ip = VTOI(vp); - struct ucred *cred = ap->a_cred; - mode_t mask, mode = ap->a_mode; - register gid_t *gp; - int i, error; - - /* - * Disallow write attempts on read-only file systems; - * unless the file is a socket, fifo, or a block or - * character device resident on the file system. - */ - if (mode & VWRITE) { - switch (vp->v_type) { - case VDIR: - case VLNK: - case VREG: - return (EROFS); - /* NOT REACHED */ - default: - break; - } - } - - /* If immutable bit set, nobody gets to write it. */ -#if 0 - if ((mode & VWRITE) && (ip->i_flag & IMMUTABLE)) - return (EPERM); -#endif - /* Otherwise, user id 0 always gets access. */ - if (cred->cr_uid == 0) - return (0); - - mask = 0; - - /* Otherwise, check the owner. */ - if (cred->cr_uid == ip->inode.iso_uid) { - if (mode & VEXEC) - mask |= S_IXUSR; - if (mode & VREAD) - mask |= S_IRUSR; - if (mode & VWRITE) - mask |= S_IWUSR; - return ((ip->inode.iso_mode & mask) == mask ? 0 : EACCES); - } - - /* Otherwise, check the groups. */ - for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) - if (ip->inode.iso_gid == *gp) { - if (mode & VEXEC) - mask |= S_IXGRP; - if (mode & VREAD) - mask |= S_IRGRP; - if (mode & VWRITE) - mask |= S_IWGRP; - return ((ip->inode.iso_mode & mask) == mask ? 0 : EACCES); - } - - /* Otherwise, check everyone else. */ - if (mode & VEXEC) - mask |= S_IXOTH; - if (mode & VREAD) - mask |= S_IROTH; - if (mode & VWRITE) - mask |= S_IWOTH; - return ((ip->inode.iso_mode & mask) == mask ? 
0 : EACCES); -} - int -cd9660_getattr(ap) - struct vop_getattr_args /* { - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; - +cd9660_getattr(struct vnop_getattr_args *ap) { struct vnode *vp = ap->a_vp; - register struct vattr *vap = ap->a_vap; + register struct vnode_attr *vap = ap->a_vap; register struct iso_node *ip = VTOI(vp); - vap->va_fsid = ip->i_dev; - vap->va_fileid = ip->i_number; + VATTR_RETURN(vap, va_fsid, ip->i_dev); + VATTR_RETURN(vap, va_fileid, ip->i_number); - vap->va_mode = ip->inode.iso_mode; - vap->va_nlink = ip->inode.iso_links; - vap->va_uid = ip->inode.iso_uid; - vap->va_gid = ip->inode.iso_gid; - vap->va_atime = ip->inode.iso_atime; - vap->va_mtime = ip->inode.iso_mtime; - vap->va_ctime = ip->inode.iso_ctime; - vap->va_rdev = ip->inode.iso_rdev; + VATTR_RETURN(vap, va_mode, ip->inode.iso_mode); + VATTR_RETURN(vap, va_nlink, ip->inode.iso_links); + VATTR_RETURN(vap, va_uid, ip->inode.iso_uid); + VATTR_RETURN(vap, va_gid, ip->inode.iso_gid); + VATTR_RETURN(vap, va_access_time, ip->inode.iso_atime); + VATTR_RETURN(vap, va_modify_time, ip->inode.iso_mtime); + VATTR_RETURN(vap, va_change_time, ip->inode.iso_ctime); + VATTR_RETURN(vap, va_rdev, ip->inode.iso_rdev); - vap->va_size = (u_quad_t) ip->i_size; + VATTR_RETURN(vap, va_data_size, (off_t)ip->i_size); if (ip->i_size == 0 && (vap->va_mode & S_IFMT) == S_IFLNK) { - struct vop_readlink_args rdlnk; - struct iovec aiov; - struct uio auio; + struct vnop_readlink_args rdlnk; + uio_t auio; + char uio_buf[ UIO_SIZEOF(1) ]; char *cp; MALLOC(cp, char *, MAXPATHLEN, M_TEMP, M_WAITOK); - aiov.iov_base = cp; - aiov.iov_len = MAXPATHLEN; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_offset = 0; - auio.uio_rw = UIO_READ; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_procp = ap->a_p; - auio.uio_resid = MAXPATHLEN; - rdlnk.a_uio = &auio; + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, + &uio_buf[0], sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(cp), MAXPATHLEN); + + rdlnk.a_uio = auio; rdlnk.a_vp = ap->a_vp; - rdlnk.a_cred = ap->a_cred; + rdlnk.a_context = ap->a_context; if (cd9660_readlink(&rdlnk) == 0) - vap->va_size = MAXPATHLEN - auio.uio_resid; + // LP64todo - fix this! + VATTR_RETURN(vap, va_data_size, MAXPATHLEN - uio_resid(auio)); FREE(cp, M_TEMP); } - vap->va_flags = 0; - vap->va_gen = 1; - vap->va_blocksize = ip->i_mnt->logical_block_size; - vap->va_bytes = (u_quad_t) (ip->i_size + ip->i_rsrcsize); - vap->va_type = vp->v_type; + VATTR_RETURN(vap, va_flags, 0); + VATTR_RETURN(vap, va_gen, 1); + VATTR_RETURN(vap, va_iosize, ip->i_mnt->logical_block_size); + VATTR_RETURN(vap, va_total_size, ip->i_size + ip->i_rsrcsize); return (0); } @@ -278,34 +176,27 @@ cd9660_getattr(ap) * Vnode op for reading. 
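The symlink-size probe in cd9660_getattr above also demonstrates the new in-kernel uio construction: uio_createwithbuffer() builds the uio over a stack buffer sized with UIO_SIZEOF(), avoiding a heap allocation for the iovec. The same pattern in isolation; link_target_len is a hypothetical wrapper around the readlink vnop:

#include <sys/uio.h>
#include <sys/malloc.h>

/*
 * Read a symlink target through a kernel-space uio built on the
 * stack.  Hypothetical wrapper; the calls mirror the hunk above,
 * including the direct call that leaves a_desc unset.
 */
static int
link_target_len(vnode_t vp, vfs_context_t ctx, size_t *lenp)
{
	struct vnop_readlink_args rdlnk;
	uio_t auio;
	char uio_buf[UIO_SIZEOF(1)];	/* backing store for one iovec */
	char *cp;
	int error;

	MALLOC(cp, char *, MAXPATHLEN, M_TEMP, M_WAITOK);

	auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, CAST_USER_ADDR_T(cp), MAXPATHLEN);

	rdlnk.a_vp = vp;
	rdlnk.a_uio = auio;
	rdlnk.a_context = ctx;

	error = cd9660_readlink(&rdlnk);
	if (error == 0)
		*lenp = MAXPATHLEN - uio_resid(auio);	/* bytes produced */

	FREE(cp, M_TEMP);
	return (error);
}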
*/ int -cd9660_read(ap) - struct vop_read_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; +cd9660_read(struct vnop_read_args *ap) { struct vnode *vp = ap->a_vp; register struct uio *uio = ap->a_uio; register struct iso_node *ip = VTOI(vp); register struct iso_mnt *imp; struct buf *bp; - daddr_t lbn, rablock; + daddr_t lbn; + daddr64_t rablock; off_t diff; int rasize, error = 0; - long size, n, on; - int devBlockSize = 0; + int32_t size, n, on; - if (uio->uio_resid == 0) + if (uio_resid(uio) == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); imp = ip->i_mnt; - VOP_DEVBLOCKSIZE(ip->i_devvp, &devBlockSize); - if (UBCISVALID(vp)) { + if (UBCINFOEXISTS(vp)) { /* * Copy any part of the Apple Double header. */ @@ -330,93 +221,75 @@ cd9660_read(ap) header.finfo.fdLocation.h = -1; header.finfo.fdReserved = 0; - bytes = min(uio->uio_resid, sizeof(apple_double_header_t) - uio->uio_offset); + bytes = min(uio_resid(uio), sizeof(apple_double_header_t) - uio->uio_offset); error = uiomove(((char *) &header) + uio->uio_offset, bytes, uio); if (error) return error; } - if (uio->uio_resid && uio->uio_offset < ADH_SIZE) { + if (uio_resid(uio) && uio->uio_offset < ADH_SIZE) { caddr_t buffer; if (kmem_alloc(kernel_map, (vm_offset_t *)&buffer, ADH_SIZE)) { return (ENOMEM); } - bytes = min(uio->uio_resid, ADH_SIZE - uio->uio_offset); + bytes = min(uio_resid(uio), ADH_SIZE - uio->uio_offset); error = uiomove(((char *) buffer) + uio->uio_offset, bytes, uio); kmem_free(kernel_map, (vm_offset_t)buffer, ADH_SIZE); if (error) return error; } } - if (uio->uio_resid > 0) - error = cluster_read(vp, uio, (off_t)ip->i_size, devBlockSize, 0); + if (uio_resid(uio) > 0) + error = cluster_read(vp, uio, (off_t)ip->i_size, 0); } else { do { lbn = lblkno(imp, uio->uio_offset); on = blkoff(imp, uio->uio_offset); n = min((u_int)(imp->logical_block_size - on), - uio->uio_resid); + uio_resid(uio)); diff = (off_t)ip->i_size - uio->uio_offset; if (diff <= 0) return (0); if (diff < n) n = diff; size = blksize(imp, ip, lbn); - rablock = lbn + 1; + rablock = (daddr64_t)lbn + 1; - if (vp->v_lastr + 1 == lbn && + if (ip->i_lastr + 1 == lbn && lblktosize(imp, rablock) < ip->i_size) { - rasize = blksize(imp, ip, rablock); - error = breadn(vp, lbn, size, &rablock, + rasize = blksize(imp, ip, (daddr_t)rablock); + error = (int)buf_breadn(vp, (daddr64_t)((unsigned)lbn), size, &rablock, &rasize, 1, NOCRED, &bp); } else - error = bread(vp, lbn, size, NOCRED, &bp); + error = (int)buf_bread(vp, (daddr64_t)((unsigned)lbn), size, NOCRED, &bp); - vp->v_lastr = lbn; - n = min(n, size - bp->b_resid); + ip->i_lastr = lbn; + n = min(n, size - buf_resid(bp)); if (error) { - brelse(bp); + buf_brelse(bp); return (error); } - error = uiomove(bp->b_data + on, (int)n, uio); + error = uiomove((caddr_t)(buf_dataptr(bp) + on), (int)n, uio); if (n + on == imp->logical_block_size || uio->uio_offset == (off_t)ip->i_size) - bp->b_flags |= B_AGE; - brelse(bp); - } while (error == 0 && uio->uio_resid > 0 && n != 0); + buf_markaged(bp); + buf_brelse(bp); + } while (error == 0 && uio_resid(uio) > 0 && n != 0); } return (error); } -/* ARGSUSED */ int -cd9660_ioctl(ap) - struct vop_ioctl_args /* { - struct vnode *a_vp; - u_long a_command; - caddr_t a_data; - int a_fflag; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; +cd9660_ioctl(__unused struct vnop_ioctl_args *ap) { return (ENOTTY); } -/* ARGSUSED */ int -cd9660_select(ap) - struct vop_select_args /* { - struct vnode *a_vp; - int a_which; - int 
a_fflags; - struct ucred *a_cred; - void *a_wql; - struct proc *a_p; - } */ *ap; +cd9660_select(__unused struct vnop_select_args *ap) { /* * We should really check to see if I/O is possible. @@ -429,39 +302,13 @@ cd9660_select(ap) * * NB Currently unsupported. */ -/* ARGSUSED */ int -cd9660_mmap(ap) - struct vop_mmap_args /* { - struct vnode *a_vp; - int a_fflags; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; +cd9660_mmap(__unused struct vnop_mmap_args *ap) { return (EINVAL); } -/* - * Seek on a file - * - * Nothing to do, so just return. - */ -/* ARGSUSED */ -int -cd9660_seek(ap) - struct vop_seek_args /* { - struct vnode *a_vp; - off_t a_oldoff; - off_t a_newoff; - struct ucred *a_cred; - } */ *ap; -{ - - return (0); -} - /* * Structure for reading directories */ @@ -478,17 +325,14 @@ struct isoreaddir { }; static int -iso_uiodir(idp,dp,off) - struct isoreaddir *idp; - struct dirent *dp; - off_t off; +iso_uiodir(struct isoreaddir *idp, struct dirent *dp, off_t off) { int error; dp->d_name[dp->d_namlen] = 0; dp->d_reclen = DIRSIZ(dp); - if (idp->uio->uio_resid < dp->d_reclen) { + if (uio_resid(idp->uio) < dp->d_reclen) { idp->eofflag = 0; return (-1); } @@ -512,8 +356,7 @@ iso_uiodir(idp,dp,off) } static int -iso_shipdir(idp) - struct isoreaddir *idp; +iso_shipdir(struct isoreaddir *idp) { struct dirent *dp; int cl, sl; @@ -550,16 +393,7 @@ iso_shipdir(idp) * a sector. */ int -cd9660_readdir(ap) - struct vop_readdir_args /* { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct uio *a_uio; - struct ucred *a_cred; - int *a_eofflag; - int *a_ncookies; - u_long **a_cookies; - } */ *ap; +cd9660_readdir(struct vnop_readdir_args *ap) { register struct uio *uio = ap->a_uio; off_t startingOffset = uio->uio_offset; @@ -572,11 +406,14 @@ cd9660_readdir(ap) struct iso_directory_record *ep; int entryoffsetinblock; doff_t endsearch; - u_long bmask; + uint32_t bmask; int error = 0; int reclen; u_short namelen; + if (ap->a_flags & (VNODE_READDIR_EXTENDED | VNODE_READDIR_REQSEEKOFF)) + return (EINVAL); + dp = VTOI(vdp); imp = dp->i_mnt; bmask = imp->im_sector_size - 1; @@ -593,7 +430,7 @@ cd9660_readdir(ap) idp->curroff = uio->uio_offset; if ((entryoffsetinblock = idp->curroff & bmask) && - (error = VOP_BLKATOFF(vdp, SECTOFF(imp, idp->curroff), NULL, &bp))) { + (error = cd9660_blkatoff(vdp, SECTOFF(imp, idp->curroff), NULL, &bp))) { FREE(idp, M_TEMP); return (error); } @@ -607,8 +444,8 @@ cd9660_readdir(ap) */ if ((idp->curroff & bmask) == 0) { if (bp != NULL) - brelse(bp); - if ((error = VOP_BLKATOFF(vdp, SECTOFF(imp, idp->curroff), NULL, &bp))) + buf_brelse(bp); + if ((error = cd9660_blkatoff(vdp, SECTOFF(imp, idp->curroff), NULL, &bp))) break; entryoffsetinblock = 0; } @@ -616,7 +453,7 @@ cd9660_readdir(ap) * Get pointer to next entry. 
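In the read loop of cd9660_read above, the read-ahead state moves from the vnode (v_lastr is gone from the public vnode) into the iso_node. The heuristic itself is unchanged; one pass of it as a fragment, with names as in that hunk:

/*
 * Sequential read-ahead: if this block continues the previous read,
 * prefetch the next one with buf_breadn.  Fragment only; variables
 * follow the cd9660_read hunk above.
 */
rablock = (daddr64_t)lbn + 1;

if (ip->i_lastr + 1 == lbn && lblktosize(imp, rablock) < ip->i_size) {
	rasize = blksize(imp, ip, (daddr_t)rablock);
	error = (int)buf_breadn(vp, (daddr64_t)((unsigned)lbn), size,
	    &rablock, &rasize, 1, NOCRED, &bp);
} else {
	error = (int)buf_bread(vp, (daddr64_t)((unsigned)lbn), size,
	    NOCRED, &bp);
}
ip->i_lastr = lbn;		/* remember for the next call's heuristic */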
*/ ep = (struct iso_directory_record *) - ((char *)bp->b_data + entryoffsetinblock); + (buf_dataptr(bp) + entryoffsetinblock); reclen = isonum_711(ep->length); if (reclen == 0) { @@ -658,7 +495,7 @@ cd9660_readdir(ap) if ( isonum_711(ep->flags) & directoryBit ) idp->current.d_fileno = isodirino(ep, imp); else { - idp->current.d_fileno = (bp->b_blkno << imp->im_bshift) + + idp->current.d_fileno = ((daddr_t)buf_blkno(bp) << imp->im_bshift) + entryoffsetinblock; } @@ -717,9 +554,9 @@ cd9660_readdir(ap) idp->current.d_namlen = 0; error = iso_shipdir(idp); } - +#if 0 if (!error && ap->a_ncookies) { - struct dirent *dp, *dpstart; + struct dirent *dirp, *dpstart; off_t bufferOffset; u_long *cookies; int ncookies; @@ -731,21 +568,23 @@ cd9660_readdir(ap) * * We assume the entire transfer is done to a single contiguous buffer. */ - if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) + if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg) || uio->uio_iovcnt != 1) panic("ufs_readdir: lost in space"); /* * Make a first pass over the buffer just generated, * counting the number of entries: */ - dpstart = (struct dirent *) (uio->uio_iov->iov_base - (uio->uio_offset - startingOffset)); - for (dp = dpstart, bufferOffset = startingOffset, ncookies = 0; + // LP64todo - fix this! + dpstart = (struct dirent *) + CAST_DOWN(caddr_t, (uio_iov_base(uio) - (uio->uio_offset - startingOffset))); + for (dirp = dpstart, bufferOffset = startingOffset, ncookies = 0; bufferOffset < uio->uio_offset; ) { - if (dp->d_reclen == 0) + if (dirp->d_reclen == 0) break; - bufferOffset += dp->d_reclen; + bufferOffset += dirp->d_reclen; ncookies++; - dp = (struct dirent *)((caddr_t)dp + dp->d_reclen); + dirp = (struct dirent *)((caddr_t)dirp + dirp->d_reclen); } lost += uio->uio_offset - bufferOffset; uio->uio_offset = bufferOffset; @@ -760,18 +599,18 @@ cd9660_readdir(ap) /* * Fill in the offsets for each entry in the buffer just allocated: */ - for (bufferOffset = startingOffset, dp = dpstart; bufferOffset < uio->uio_offset; ) { + for (bufferOffset = startingOffset, dirp = dpstart; bufferOffset < uio->uio_offset; ) { *(cookies++) = bufferOffset; - bufferOffset += dp->d_reclen; - dp = (struct dirent *)((caddr_t)dp + dp->d_reclen); + bufferOffset += dirp->d_reclen; + dirp = (struct dirent *)((caddr_t)dirp + dirp->d_reclen); } } - +#endif if (error < 0) error = 0; if (bp) - brelse (bp); + buf_brelse (bp); uio->uio_offset = idp->uio_off; *ap->a_eofflag = idp->eofflag; @@ -791,12 +630,7 @@ typedef struct iso_directory_record ISODIR; typedef struct iso_node ISONODE; typedef struct iso_mnt ISOMNT; int -cd9660_readlink(ap) - struct vop_readlink_args /* { - struct vnode *a_vp; - struct uio *a_uio; - struct ucred *a_cred; - } */ *ap; +cd9660_readlink(struct vnop_readlink_args *ap) { ISONODE *ip; ISODIR *dirp; @@ -817,18 +651,18 @@ cd9660_readlink(ap) /* * Get parents directory record block that this inode included. */ - error = bread(imp->im_devvp, - (ip->i_number >> imp->im_bshift), + error = (int)buf_bread(imp->im_devvp, + (daddr64_t)((unsigned)(ip->i_number >> imp->im_bshift)), imp->logical_block_size, NOCRED, &bp); if (error) { - brelse(bp); + buf_brelse(bp); return (EINVAL); } /* * Setup the directory pointer for this inode */ - dirp = (ISODIR *)(bp->b_data + (ip->i_number & imp->im_bmask)); + dirp = (ISODIR *)(buf_dataptr(bp) + (ip->i_number & imp->im_bmask)); /* * Just make sure, we have a right one.... 
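cd9660 encodes a file's identity as the byte offset of its directory record on the volume, which is why cd9660_readlink below can recover the record by shifting for the block and masking for the offset within it. That arithmetic as a sketch, assuming im_bshift and im_bmask are the logical-block log2 and mask from the mount:

/*
 * Recover the on-disk directory record for an inode: high bits of
 * i_number select the logical block, low bits the offset inside it.
 * Sketch; names follow the readlink hunk below, with a NULL guard
 * added on the error path.
 */
error = (int)buf_bread(imp->im_devvp,
    (daddr64_t)((unsigned)(ip->i_number >> imp->im_bshift)),	/* block */
    imp->logical_block_size, NOCRED, &bp);
if (error) {
	if (bp)
		buf_brelse(bp);
	return (EINVAL);
}
dirp = (ISODIR *)(buf_dataptr(bp) + (ip->i_number & imp->im_bmask));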
@@ -836,7 +670,7 @@ cd9660_readlink(ap) */ if ((ip->i_number & imp->im_bmask) + isonum_711(dirp->length) > imp->logical_block_size) { - brelse(bp); + buf_brelse(bp); return (EINVAL); } @@ -844,151 +678,65 @@ cd9660_readlink(ap) * Now get a buffer * Abuse a namei buffer for now. */ - if (uio->uio_segflg == UIO_SYSSPACE) - symname = uio->uio_iov->iov_base; - else + if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) MALLOC_ZONE(symname, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + else + // LP64todo - fix this! + symname = CAST_DOWN(caddr_t, uio_iov_base(uio)); /* * Ok, we just gathering a symbolic name in SL record. */ if (cd9660_rrip_getsymname(dirp, symname, &symlen, imp) == 0) { - if (uio->uio_segflg != UIO_SYSSPACE) + if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) FREE_ZONE(symname, MAXPATHLEN, M_NAMEI); - brelse(bp); + buf_brelse(bp); return (EINVAL); } /* * Don't forget before you leave from home ;-) */ - brelse(bp); + buf_brelse(bp); /* * return with the symbolic name to caller's. */ - if (uio->uio_segflg != UIO_SYSSPACE) { + if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) { error = uiomove(symname, symlen, uio); FREE_ZONE(symname, MAXPATHLEN, M_NAMEI); return (error); } - uio->uio_resid -= symlen; - uio->uio_iov->iov_base += symlen; - uio->uio_iov->iov_len -= symlen; +#if LP64KERN + uio_setresid(uio, (uio_resid(uio) - symlen)); + uio_iov_len_add(uio, -((int64_t)symlen)); +#else + uio_setresid(uio, (uio_resid(uio) - symlen)); + uio_iov_len_add(uio, -((int)symlen)); +#endif + uio_iov_base_add(uio, symlen); return (0); } -/* - * Lock an inode. - */ - -int -cd9660_lock(ap) - struct vop_lock_args /* { - struct vnode *a_vp; - int a_flags; - struct proc *a_p; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - - if (VTOI(vp) == (struct iso_node *) NULL) - panic ("cd9660_lock: null inode"); - return (lockmgr(&VTOI(vp)->i_lock, ap->a_flags, &vp->v_interlock,ap->a_p)); -} - -/* - * Unlock an inode. - */ - -int -cd9660_unlock(ap) - struct vop_unlock_args /* { - struct vnode *a_vp; - int a_flags; - struct proc *a_p; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - - return (lockmgr(&VTOI(vp)->i_lock, ap->a_flags | LK_RELEASE, &vp->v_interlock,ap->a_p)); - -} /* - * Calculate the logical to physical mapping if not done already, - * then call the device strategy routine. + * prepare and issue the I/O */ int -cd9660_strategy(ap) - struct vop_strategy_args /* { - struct buf *a_bp; - } */ *ap; -{ - register struct buf *bp = ap->a_bp; - register struct vnode *vp = bp->b_vp; - register struct iso_node *ip; - int error; - - ip = VTOI(vp); - if (vp->v_type == VBLK || vp->v_type == VCHR) - panic("cd9660_strategy: spec"); - if (bp->b_blkno == bp->b_lblkno) { - if ( (error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL)) ) { - bp->b_error = error; - bp->b_flags |= B_ERROR; - biodone(bp); - return (error); - } - if ((long)bp->b_blkno == -1) - clrbuf(bp); - } - if ((long)bp->b_blkno == -1) { - biodone(bp); - return (0); - } - vp = ip->i_devvp; - bp->b_dev = vp->v_rdev; - VOCALL (vp->v_op, VOFFSET(vop_strategy), ap); - return (0); -} - -/* - * Print out the contents of an inode. - */ -int -cd9660_print(ap) - struct vop_print_args /* { - struct vnode *a_vp; - } */ *ap; +cd9660_strategy(struct vnop_strategy_args *ap) { + buf_t bp = ap->a_bp; + vnode_t vp = buf_vnode(bp); + struct iso_node *ip = VTOI(vp); - printf("tag VT_ISOFS, isofs vnode\n"); - return (0); + return (buf_strategy(ip->i_devvp, ap)); } -/* - * Check for a locked inode. 
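cd9660_readlink above now branches on UIO_SEG_IS_USER_SPACE() rather than comparing uio_segflg against UIO_SYSSPACE, since the new uio carries several user address-space flavors. The two paths side by side, as a sketch; fill_symname is a hypothetical stand-in for cd9660_rrip_getsymname:

/*
 * Copy a symlink target out through a uio that may aim at user or
 * kernel space.  Sketch; fill_symname is hypothetical, the rest
 * mirrors the readlink hunk above.
 */
if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
	/* user buffer: bounce through a pathname zone, then uiomove */
	MALLOC_ZONE(symname, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
	fill_symname(symname, &symlen);
	error = uiomove(symname, symlen, uio);
	FREE_ZONE(symname, MAXPATHLEN, M_NAMEI);
} else {
	/* kernel buffer: write in place, then advance the iovec by hand */
	symname = CAST_DOWN(caddr_t, uio_iov_base(uio));
	fill_symname(symname, &symlen);
	uio_setresid(uio, (uio_resid(uio) - symlen));
	uio_iov_len_add(uio, -((int64_t)symlen));
	uio_iov_base_add(uio, symlen);
}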
- */ -int -cd9660_islocked(ap) - struct vop_islocked_args /* { - struct vnode *a_vp; - } */ *ap; -{ - - return (lockstatus(&VTOI(ap->a_vp)->i_lock)); -} /* * Return POSIX pathconf information applicable to cd9660 filesystems. */ int -cd9660_pathconf(ap) - struct vop_pathconf_args /* { - struct vnode *a_vp; - int a_name; - register_t *a_retval; - } */ *ap; +cd9660_pathconf(struct vnop_pathconf_args *ap) { switch (ap->a_name) { @@ -1029,23 +777,13 @@ cd9660_pathconf(ap) * Unsupported operation */ int -cd9660_enotsupp() +cd9660_enotsupp(void) { - - return (EOPNOTSUPP); + return (ENOTSUP); } /* Pagein. similar to read */ int -cd9660_pagein(ap) - struct vop_pagein_args /* { - struct vnode *a_vp, - upl_t a_pl, - vm_offset_t a_pl_offset, - off_t a_f_offset, - size_t a_size, - struct ucred *a_cred, - int a_flags - } */ *ap; +cd9660_pagein(struct vnop_pagein_args *ap) { struct vnode *vp = ap->a_vp; upl_t pl = ap->a_pl; @@ -1096,17 +834,13 @@ cd9660_pagein(ap) ubc_upl_commit_range(pl, pl_offset, size, UPL_COMMIT_FREE_ON_EMPTY); } } else { - int devBlockSize = 0; - /* check pageouts are for reg file only and ubc info is present*/ if (UBCINVALID(vp)) panic("cd9660_pagein: Not a VREG"); UBCINFOCHECK("cd9660_pagein", vp); - VOP_DEVBLOCKSIZE(ip->i_devvp, &devBlockSize); - error = cluster_pagein(vp, pl, pl_offset, f_offset, size, - (off_t)ip->i_size, devBlockSize, flags); + (off_t)ip->i_size, flags); } return (error); } @@ -1117,16 +851,8 @@ cd9660_pagein(ap) * Locking policy: a_dvp and vp locked on entry, unlocked on exit */ int -cd9660_remove(ap) - struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; - struct componentname *a_cnp; } */ *ap; +cd9660_remove(__unused struct vnop_remove_args *ap) { - if (ap->a_dvp == ap->a_vp) - vrele(ap->a_vp); - else - vput(ap->a_vp); - vput(ap->a_dvp); - return (EROFS); } @@ -1137,9 +863,7 @@ cd9660_remove(ap) * Locking policy: a_dvp and vp locked on entry, unlocked on exit */ int -cd9660_rmdir(ap) - struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; - struct componentname *a_cnp; } */ *ap; +cd9660_rmdir(struct vnop_rmdir_args *ap) { (void) nop_rmdir(ap); return (EROFS); @@ -1150,24 +874,16 @@ cd9660_rmdir(ap) # #% getattrlist vp = = = # - vop_getattrlist { + vnop_getattrlist { IN struct vnode *vp; IN struct attrlist *alist; INOUT struct uio *uio; - IN struct ucred *cred; - IN struct proc *p; + IN vfs_context_t context; }; */ int -cd9660_getattrlist(ap) - struct vop_getattrlist_args /* { - struct vnode *a_vp; - struct attrlist *a_alist - struct uio *a_uio; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; +cd9660_getattrlist(struct vnop_getattrlist_args *ap) { struct attrlist *alist = ap->a_alist; int fixedblocksize; @@ -1215,24 +931,24 @@ cd9660_getattrlist(ap) fixedblocksize = attrcalcsize(alist); - attrblocksize = fixedblocksize + (sizeof(u_long)); /* u_long for length longword */ + attrblocksize = fixedblocksize + (sizeof(uint32_t)); /* uint32_t for length word */ if (alist->commonattr & ATTR_CMN_NAME) attrblocksize += NAME_MAX; if (alist->commonattr & ATTR_CMN_NAMEDATTRLIST) attrblocksize += 0; /* XXX PPD */ if (alist->volattr & ATTR_VOL_MOUNTPOINT) attrblocksize += PATH_MAX; if (alist->volattr & ATTR_VOL_NAME) attrblocksize += NAME_MAX; if (alist->fileattr & ATTR_FILE_FORKLIST) attrblocksize += 0; /* XXX PPD */ - attrbufsize = MIN(ap->a_uio->uio_resid, attrblocksize); + attrbufsize = MIN(uio_resid(ap->a_uio), attrblocksize); MALLOC(attrbufptr, void *, attrblocksize, M_TEMP, M_WAITOK); attrptr = attrbufptr; - *((u_long 
*)attrptr) = 0; /* Set buffer length in case of errors */ - ++((u_long *)attrptr); /* Reserve space for length field */ + *((uint32_t *)attrptr) = 0; /* Set buffer length in case of errors */ + ++((uint32_t *)attrptr); /* Reserve space for length field */ varptr = ((char *)attrptr) + fixedblocksize; /* Point to variable-length storage */ packattrblk(alist, ap->a_vp, &attrptr, &varptr); /* Store length of fixed + var block */ - *((u_long *)attrbufptr) = ((char*)varptr - (char*)attrbufptr); + *((uint32_t *)attrbufptr) = ((char*)varptr - (char*)attrbufptr); /* Don't copy out more data than was generated */ attrbufsize = MIN(attrbufsize, (char*)varptr - (char*)attrbufptr); @@ -1247,10 +963,9 @@ cd9660_getattrlist(ap) * Make a RIFF file header for a CD-ROM XA media file. */ __private_extern__ void -cd9660_xa_init(struct vnode *vp, struct iso_directory_record *isodir) +cd9660_xa_init(struct iso_node *ip, struct iso_directory_record *isodir) { - u_long sectors; - struct iso_node *ip = VTOI(vp); + uint32_t sectors; struct riff_header *header; u_char name_len; char *cdxa; @@ -1285,17 +1000,14 @@ cd9660_xa_init(struct vnode *vp, struct iso_directory_record *isodir) * device. This allows cd9660_strategy to be ignorant of the block * (sector) size. */ - vrele(ip->i_devvp); ip->i_devvp = ip->i_mnt->phys_devvp; - VREF(ip->i_devvp); ip->i_size = sectors * CDXA_SECTOR_SIZE + sizeof(struct riff_header); ip->i_riff = header; - vp->v_op = cd9660_cdxaop_p; } /* - * Helper routine for VOP_READ and VOP_PAGEIN of CD-ROM XA multimedia files. + * Helper routine for vnop_read and vnop_pagein of CD-ROM XA multimedia files. * This routine determines the physical location of the file, then reads * sectors directly from the device into a buffer. It also handles inserting * the RIFF header at the beginning of the file. @@ -1303,7 +1015,7 @@ cd9660_xa_init(struct vnode *vp, struct iso_directory_record *isodir) * Exactly one of buffer or uio must be non-zero. It will either bcopy to * buffer, or uiomove via uio. * - * XXX Should this code be using breadn and vp->v_lastr to support single-block + * XXX Should this code be using buf_breadn and ip->i_lastr to support single-block * read-ahead? Should we try more aggressive read-ahead like cluster_io does? * * XXX This could be made to do larger I/O to the device (reading all the @@ -1361,13 +1073,13 @@ cd9660_xa_read_common( /* Get a block from the underlying device */ block = ip->iso_start + (offset - sizeof(struct riff_header))/CDXA_SECTOR_SIZE; - error = bread(ip->i_devvp, block, CDXA_SECTOR_SIZE, NOCRED, &bp); + error = (int)buf_bread(ip->i_devvp, (daddr64_t)((unsigned)block), CDXA_SECTOR_SIZE, NOCRED, &bp); if (error) { - brelse(bp); + buf_brelse(bp); return error; } - if (bp->b_resid) { - printf("isofs: cd9660_xa_read_common: bread didn't read full sector\n"); + if (buf_resid(bp)) { + printf("isofs: cd9660_xa_read_common: buf_bread didn't read full sector\n"); return EIO; } @@ -1378,10 +1090,10 @@ cd9660_xa_read_common( count = diff; if (buffer) { - bcopy(bp->b_data+sect_off, buffer, count); + bcopy(CAST_DOWN(caddr_t, (buf_dataptr(bp)+sect_off)), buffer, count); buffer += count; } else { - error = uiomove(bp->b_data+sect_off, count, uio); + error = uiomove(CAST_DOWN(caddr_t, (buf_dataptr(bp)+sect_off)), count, uio); } amount -= count; offset += count; @@ -1391,8 +1103,8 @@ cd9660_xa_read_common( * age the device block. This is optimized for sequential access. 
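The copy loop in cd9660_xa_read_common above has to split every request along 2352-byte CD-XA sector boundaries while also respecting logical EOF. The bounding arithmetic on its own, as a fragment; the derivation of sect_off is elided in the hunk above, so the modulo here is inferred rather than quoted:

/*
 * Bound one copy step by (a) the end of the current XA sector and
 * (b) logical EOF.  Fragment; sect_off is inferred, other names
 * follow the hunk above.
 */
off_t  hdr = sizeof(struct riff_header);	/* synthesized RIFF prefix */
u_long sect_off = (offset - hdr) % CDXA_SECTOR_SIZE;
u_long count = min(CDXA_SECTOR_SIZE - sect_off, amount);
off_t  diff = (off_t)ip->i_size - offset;

if (diff < (off_t)count)
	count = diff;				/* never copy past EOF */

block = ip->iso_start + (offset - hdr) / CDXA_SECTOR_SIZE;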
*/ if (sect_off+count == CDXA_SECTOR_SIZE || offset == (off_t)ip->i_size) - bp->b_flags |= B_AGE; - brelse(bp); + buf_markaged(bp); + buf_brelse(bp); } return error; @@ -1416,19 +1128,14 @@ cd9660_xa_read_common( * important. */ int -cd9660_xa_read(ap) - struct vop_read_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; +cd9660_xa_read(struct vnop_read_args *ap) { struct vnode *vp = ap->a_vp; register struct uio *uio = ap->a_uio; register struct iso_node *ip = VTOI(vp); off_t offset = uio->uio_offset; - size_t size = uio->uio_resid; + // LP64todo - fix this! + size_t size = uio_resid(uio); /* Check for some obvious parameter problems */ if (offset < 0) @@ -1452,16 +1159,7 @@ cd9660_xa_read(ap) * cluster_pagein. Instead, we have to map the page and read into it. */ static int -cd9660_xa_pagein(ap) - struct vop_pagein_args /* { - struct vnode *a_vp, - upl_t a_pl, - vm_offset_t a_pl_offset, - off_t a_f_offset, - size_t a_size, - struct ucred *a_cred, - int a_flags - } */ *ap; +cd9660_xa_pagein(struct vnop_pagein_args *ap) { struct vnode *vp = ap->a_vp; upl_t pl = ap->a_pl; @@ -1518,45 +1216,27 @@ cd9660_xa_pagein(ap) * Global vfs data structures for isofs */ #define cd9660_create \ - ((int (*) __P((struct vop_create_args *)))err_create) -#define cd9660_mknod ((int (*) __P((struct vop_mknod_args *)))err_mknod) -#define cd9660_setattr \ - ((int (*) __P((struct vop_setattr_args *)))cd9660_enotsupp) -#define cd9660_write ((int (*) __P((struct vop_write_args *)))cd9660_enotsupp) -#if NFSSERVER -int lease_check __P((struct vop_lease_args *)); -#define cd9660_lease_check lease_check -#else -#define cd9660_lease_check ((int (*) __P((struct vop_lease_args *)))nullop) -#endif -#define cd9660_fsync ((int (*) __P((struct vop_fsync_args *)))nullop) + ((int (*)(struct vnop_create_args *))err_create) +#define cd9660_mknod ((int (*)(struct vnop_mknod_args *))err_mknod) +#define cd9660_write ((int (*)(struct vnop_write_args *))cd9660_enotsupp) +#define cd9660_fsync ((int (*)(struct vnop_fsync_args *))nullop) #define cd9660_rename \ - ((int (*) __P((struct vop_rename_args *)))err_rename) + ((int (*)(struct vnop_rename_args *))err_rename) #define cd9660_copyfile \ - ((int (*) __P((struct vop_copyfile_args *)))err_copyfile) -#define cd9660_link ((int (*) __P((struct vop_link_args *)))err_link) -#define cd9660_mkdir ((int (*) __P((struct vop_mkdir_args *)))err_mkdir) + ((int (*)(struct vnop_copyfile_args *))err_copyfile) +#define cd9660_link ((int (*)(struct vnop_link_args *))err_link) +#define cd9660_mkdir ((int (*)(struct vnop_mkdir_args *))err_mkdir) #define cd9660_symlink \ - ((int (*) __P((struct vop_symlink_args *)))err_symlink) + ((int (*)(struct vnop_symlink_args *))err_symlink) #define cd9660_advlock \ - ((int (*) __P((struct vop_advlock_args *)))cd9660_enotsupp) -#define cd9660_valloc ((int(*) __P(( \ - struct vnode *pvp, \ - int mode, \ - struct ucred *cred, \ - struct vnode **vpp))) cd9660_enotsupp) -#define cd9660_vfree ((int (*) __P((struct vop_vfree_args *)))cd9660_enotsupp) -#define cd9660_truncate \ - ((int (*) __P((struct vop_truncate_args *)))cd9660_enotsupp) -#define cd9660_update \ - ((int (*) __P((struct vop_update_args *)))cd9660_enotsupp) + ((int (*)(struct vnop_advlock_args *))cd9660_enotsupp) #define cd9660_bwrite \ - ((int (*) __P((struct vop_bwrite_args *)))cd9660_enotsupp) + ((int (*)(struct vnop_bwrite_args *))cd9660_enotsupp) #define cd9660_pageout \ - ((int (*) __P((struct vop_pageout_args *)))cd9660_enotsupp) -int 
cd9660_blktooff(struct vop_blktooff_args *ap); -int cd9660_offtoblk(struct vop_offtoblk_args *ap); -int cd9660_cmap(struct vop_cmap_args *ap); + ((int (*)(struct vnop_pageout_args *))cd9660_enotsupp) +int cd9660_blktooff(struct vnop_blktooff_args *ap); +int cd9660_offtoblk(struct vnop_offtoblk_args *ap); +int cd9660_blockmap(struct vnop_blockmap_args *ap); #define VOPFUNC int (*)(void *) /* @@ -1564,55 +1244,40 @@ int cd9660_cmap(struct vop_cmap_args *ap); */ int (**cd9660_vnodeop_p)(void *); struct vnodeopv_entry_desc cd9660_vnodeop_entries[] = { - { &vop_default_desc, (VOPFUNC)vn_default_error }, - { &vop_lookup_desc, (VOPFUNC)cd9660_lookup }, /* lookup */ - { &vop_create_desc, (VOPFUNC)cd9660_create }, /* create */ - { &vop_mknod_desc, (VOPFUNC)cd9660_mknod }, /* mknod */ - { &vop_open_desc, (VOPFUNC)cd9660_open }, /* open */ - { &vop_close_desc, (VOPFUNC)cd9660_close }, /* close */ - { &vop_access_desc, (VOPFUNC)cd9660_access }, /* access */ - { &vop_getattr_desc, (VOPFUNC)cd9660_getattr }, /* getattr */ - { &vop_setattr_desc, (VOPFUNC)cd9660_setattr }, /* setattr */ - { &vop_read_desc, (VOPFUNC)cd9660_read }, /* read */ - { &vop_write_desc, (VOPFUNC)cd9660_write }, /* write */ - { &vop_lease_desc, (VOPFUNC)cd9660_lease_check },/* lease */ - { &vop_ioctl_desc, (VOPFUNC)cd9660_ioctl }, /* ioctl */ - { &vop_select_desc, (VOPFUNC)cd9660_select }, /* select */ - { &vop_mmap_desc, (VOPFUNC)cd9660_mmap }, /* mmap */ - { &vop_fsync_desc, (VOPFUNC)cd9660_fsync }, /* fsync */ - { &vop_seek_desc, (VOPFUNC)cd9660_seek }, /* seek */ - { &vop_remove_desc, (VOPFUNC)cd9660_remove }, /* remove */ - { &vop_link_desc, (VOPFUNC)cd9660_link }, /* link */ - { &vop_rename_desc, (VOPFUNC)cd9660_rename }, /* rename */ - { &vop_copyfile_desc, (VOPFUNC)cd9660_copyfile },/* copyfile */ - { &vop_mkdir_desc, (VOPFUNC)cd9660_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (VOPFUNC)cd9660_rmdir }, /* rmdir */ - { &vop_symlink_desc, (VOPFUNC)cd9660_symlink }, /* symlink */ - { &vop_readdir_desc, (VOPFUNC)cd9660_readdir }, /* readdir */ - { &vop_readlink_desc, (VOPFUNC)cd9660_readlink },/* readlink */ - { &vop_abortop_desc, (VOPFUNC)nop_abortop }, /* abortop */ - { &vop_inactive_desc, (VOPFUNC)cd9660_inactive },/* inactive */ - { &vop_reclaim_desc, (VOPFUNC)cd9660_reclaim }, /* reclaim */ - { &vop_lock_desc, (VOPFUNC)cd9660_lock }, /* lock */ - { &vop_unlock_desc, (VOPFUNC)cd9660_unlock }, /* unlock */ - { &vop_bmap_desc, (VOPFUNC)cd9660_bmap }, /* bmap */ - { &vop_strategy_desc, (VOPFUNC)cd9660_strategy },/* strategy */ - { &vop_print_desc, (VOPFUNC)cd9660_print }, /* print */ - { &vop_islocked_desc, (VOPFUNC)cd9660_islocked },/* islocked */ - { &vop_pathconf_desc, (VOPFUNC)cd9660_pathconf },/* pathconf */ - { &vop_advlock_desc, (VOPFUNC)cd9660_advlock }, /* advlock */ - { &vop_blkatoff_desc, (VOPFUNC)cd9660_blkatoff },/* blkatoff */ - { &vop_valloc_desc, (VOPFUNC)cd9660_valloc }, /* valloc */ - { &vop_vfree_desc, (VOPFUNC)cd9660_vfree }, /* vfree */ - { &vop_truncate_desc, (VOPFUNC)cd9660_truncate },/* truncate */ - { &vop_update_desc, (VOPFUNC)cd9660_update }, /* update */ - { &vop_bwrite_desc, (VOPFUNC)vn_bwrite }, - { &vop_pagein_desc, (VOPFUNC)cd9660_pagein }, /* Pagein */ - { &vop_pageout_desc, (VOPFUNC)cd9660_pageout }, /* Pageout */ - { &vop_getattrlist_desc, (VOPFUNC)cd9660_getattrlist }, /* getattrlist */ - { &vop_blktooff_desc, (VOPFUNC)cd9660_blktooff }, /* blktooff */ - { &vop_offtoblk_desc, (VOPFUNC)cd9660_offtoblk }, /* offtoblk */ - { &vop_cmap_desc, (VOPFUNC)cd9660_cmap }, /* cmap */ + { 
&vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)cd9660_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)cd9660_create }, /* create */ + { &vnop_mknod_desc, (VOPFUNC)cd9660_mknod }, /* mknod */ + { &vnop_open_desc, (VOPFUNC)cd9660_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)cd9660_close }, /* close */ + { &vnop_getattr_desc, (VOPFUNC)cd9660_getattr }, /* getattr */ + { &vnop_read_desc, (VOPFUNC)cd9660_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)cd9660_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)cd9660_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)cd9660_select }, /* select */ + { &vnop_mmap_desc, (VOPFUNC)cd9660_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)cd9660_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)cd9660_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)cd9660_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)cd9660_rename }, /* rename */ + { &vnop_copyfile_desc, (VOPFUNC)cd9660_copyfile },/* copyfile */ + { &vnop_mkdir_desc, (VOPFUNC)cd9660_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)cd9660_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)cd9660_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)cd9660_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)cd9660_readlink },/* readlink */ + { &vnop_inactive_desc, (VOPFUNC)cd9660_inactive },/* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)cd9660_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)cd9660_strategy },/* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)cd9660_pathconf },/* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)cd9660_advlock }, /* advlock */ + { &vnop_bwrite_desc, (VOPFUNC)vn_bwrite }, + { &vnop_pagein_desc, (VOPFUNC)cd9660_pagein }, /* Pagein */ + { &vnop_pageout_desc, (VOPFUNC)cd9660_pageout }, /* Pageout */ + { &vnop_getattrlist_desc, (VOPFUNC)cd9660_getattrlist }, /* getattrlist */ + { &vnop_blktooff_desc, (VOPFUNC)cd9660_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)cd9660_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (VOPFUNC)cd9660_blockmap }, /* blockmap */ { (struct vnodeop_desc*)NULL, (VOPFUNC)NULL } }; struct vnodeopv_desc cd9660_vnodeop_opv_desc = @@ -1621,57 +1286,44 @@ struct vnodeopv_desc cd9660_vnodeop_opv_desc = /* * The VOP table for CD-ROM XA (media) files is almost the same * as for ordinary files, except for read, and pagein. - * Note that cd9660_xa_read doesn't use cluster I/O, so cmap + * Note that cd9660_xa_read doesn't use cluster I/O, so blockmap * isn't needed, and isn't implemented. Similarly, it doesn't - * do bread() on CD XA vnodes, so bmap, blktooff, offtoblk + * do buf_bread() on CD XA vnodes, so bmap, blktooff, offtoblk * aren't needed. 
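Unsupported operations in these tables all funnel through one stub via a cast, which keeps each vector total without writing a per-op function. The pattern in isolation, illustrative only; the stub and macro are the ones defined above these tables:

/* Shared stub: the filesystem is read-only, so the op cannot work */
int
cd9660_enotsupp(void)
{
	return (ENOTSUP);
}

/* Cast the stub to the specific vnop signature it stands in for */
#define cd9660_write \
	((int (*)(struct vnop_write_args *))cd9660_enotsupp)

/* ...then wire it into the vector like any real operation: */
/* { &vnop_write_desc, (VOPFUNC)cd9660_write },   write */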
*/ int (**cd9660_cdxaop_p)(void *); struct vnodeopv_entry_desc cd9660_cdxaop_entries[] = { - { &vop_default_desc, (VOPFUNC)vn_default_error }, - { &vop_lookup_desc, (VOPFUNC)cd9660_lookup }, /* lookup */ - { &vop_create_desc, (VOPFUNC)cd9660_create }, /* create */ - { &vop_mknod_desc, (VOPFUNC)cd9660_mknod }, /* mknod */ - { &vop_open_desc, (VOPFUNC)cd9660_open }, /* open */ - { &vop_close_desc, (VOPFUNC)cd9660_close }, /* close */ - { &vop_access_desc, (VOPFUNC)cd9660_access }, /* access */ - { &vop_getattr_desc, (VOPFUNC)cd9660_getattr }, /* getattr */ - { &vop_setattr_desc, (VOPFUNC)cd9660_setattr }, /* setattr */ - { &vop_read_desc, (VOPFUNC)cd9660_xa_read }, /* read */ - { &vop_write_desc, (VOPFUNC)cd9660_write }, /* write */ - { &vop_lease_desc, (VOPFUNC)cd9660_lease_check },/* lease */ - { &vop_ioctl_desc, (VOPFUNC)cd9660_ioctl }, /* ioctl */ - { &vop_select_desc, (VOPFUNC)cd9660_select }, /* select */ - { &vop_mmap_desc, (VOPFUNC)cd9660_mmap }, /* mmap */ - { &vop_fsync_desc, (VOPFUNC)cd9660_fsync }, /* fsync */ - { &vop_seek_desc, (VOPFUNC)cd9660_seek }, /* seek */ - { &vop_remove_desc, (VOPFUNC)cd9660_remove }, /* remove */ - { &vop_link_desc, (VOPFUNC)cd9660_link }, /* link */ - { &vop_rename_desc, (VOPFUNC)cd9660_rename }, /* rename */ - { &vop_copyfile_desc, (VOPFUNC)cd9660_copyfile },/* copyfile */ - { &vop_mkdir_desc, (VOPFUNC)cd9660_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (VOPFUNC)cd9660_rmdir }, /* rmdir */ - { &vop_symlink_desc, (VOPFUNC)cd9660_symlink }, /* symlink */ - { &vop_readdir_desc, (VOPFUNC)cd9660_readdir }, /* readdir */ - { &vop_readlink_desc, (VOPFUNC)cd9660_readlink },/* readlink */ - { &vop_inactive_desc, (VOPFUNC)cd9660_inactive },/* inactive */ - { &vop_reclaim_desc, (VOPFUNC)cd9660_reclaim }, /* reclaim */ - { &vop_lock_desc, (VOPFUNC)cd9660_lock }, /* lock */ - { &vop_unlock_desc, (VOPFUNC)cd9660_unlock }, /* unlock */ - { &vop_strategy_desc, (VOPFUNC)cd9660_strategy },/* strategy */ - { &vop_print_desc, (VOPFUNC)cd9660_print }, /* print */ - { &vop_islocked_desc, (VOPFUNC)cd9660_islocked },/* islocked */ - { &vop_pathconf_desc, (VOPFUNC)cd9660_pathconf },/* pathconf */ - { &vop_advlock_desc, (VOPFUNC)cd9660_advlock }, /* advlock */ - { &vop_blkatoff_desc, (VOPFUNC)cd9660_blkatoff },/* blkatoff */ - { &vop_valloc_desc, (VOPFUNC)cd9660_valloc }, /* valloc */ - { &vop_vfree_desc, (VOPFUNC)cd9660_vfree }, /* vfree */ - { &vop_truncate_desc, (VOPFUNC)cd9660_truncate },/* truncate */ - { &vop_update_desc, (VOPFUNC)cd9660_update }, /* update */ - { &vop_bwrite_desc, (VOPFUNC)vn_bwrite }, - { &vop_pagein_desc, (VOPFUNC)cd9660_xa_pagein }, /* Pagein */ - { &vop_pageout_desc, (VOPFUNC)cd9660_pageout }, /* Pageout */ - { &vop_getattrlist_desc, (VOPFUNC)cd9660_getattrlist }, /* getattrlist */ + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)cd9660_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)cd9660_create }, /* create */ + { &vnop_mknod_desc, (VOPFUNC)cd9660_mknod }, /* mknod */ + { &vnop_open_desc, (VOPFUNC)cd9660_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)cd9660_close }, /* close */ + { &vnop_getattr_desc, (VOPFUNC)cd9660_getattr }, /* getattr */ + { &vnop_read_desc, (VOPFUNC)cd9660_xa_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)cd9660_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)cd9660_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)cd9660_select }, /* select */ + { &vnop_mmap_desc, (VOPFUNC)cd9660_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)cd9660_fsync }, /* fsync */ + { 
&vnop_remove_desc, (VOPFUNC)cd9660_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)cd9660_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)cd9660_rename }, /* rename */ + { &vnop_copyfile_desc, (VOPFUNC)cd9660_copyfile },/* copyfile */ + { &vnop_mkdir_desc, (VOPFUNC)cd9660_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)cd9660_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)cd9660_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)cd9660_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)cd9660_readlink },/* readlink */ + { &vnop_inactive_desc, (VOPFUNC)cd9660_inactive },/* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)cd9660_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)cd9660_strategy },/* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)cd9660_pathconf },/* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)cd9660_advlock }, /* advlock */ + { &vnop_bwrite_desc, (VOPFUNC)vn_bwrite }, + { &vnop_pagein_desc, (VOPFUNC)cd9660_xa_pagein }, /* Pagein */ + { &vnop_pageout_desc, (VOPFUNC)cd9660_pageout }, /* Pageout */ + { &vnop_getattrlist_desc, (VOPFUNC)cd9660_getattrlist }, /* getattrlist */ { (struct vnodeop_desc*)NULL, (VOPFUNC)NULL } }; struct vnodeopv_desc cd9660_cdxaop_opv_desc = @@ -1682,54 +1334,39 @@ struct vnodeopv_desc cd9660_cdxaop_opv_desc = */ int (**cd9660_specop_p)(void *); struct vnodeopv_entry_desc cd9660_specop_entries[] = { - { &vop_default_desc, (VOPFUNC)vn_default_error }, - { &vop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */ - { &vop_create_desc, (VOPFUNC)spec_create }, /* create */ - { &vop_mknod_desc, (VOPFUNC)spec_mknod }, /* mknod */ - { &vop_open_desc, (VOPFUNC)spec_open }, /* open */ - { &vop_close_desc, (VOPFUNC)spec_close }, /* close */ - { &vop_access_desc, (VOPFUNC)cd9660_access }, /* access */ - { &vop_getattr_desc, (VOPFUNC)cd9660_getattr }, /* getattr */ - { &vop_setattr_desc, (VOPFUNC)cd9660_setattr }, /* setattr */ - { &vop_read_desc, (VOPFUNC)spec_read }, /* read */ - { &vop_write_desc, (VOPFUNC)spec_write }, /* write */ - { &vop_lease_desc, (VOPFUNC)spec_lease_check }, /* lease */ - { &vop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */ - { &vop_select_desc, (VOPFUNC)spec_select }, /* select */ - { &vop_mmap_desc, (VOPFUNC)spec_mmap }, /* mmap */ - { &vop_fsync_desc, (VOPFUNC)spec_fsync }, /* fsync */ - { &vop_seek_desc, (VOPFUNC)spec_seek }, /* seek */ - { &vop_remove_desc, (VOPFUNC)spec_remove }, /* remove */ - { &vop_link_desc, (VOPFUNC)spec_link }, /* link */ - { &vop_rename_desc, (VOPFUNC)spec_rename }, /* rename */ - { &vop_mkdir_desc, (VOPFUNC)spec_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (VOPFUNC)spec_rmdir }, /* rmdir */ - { &vop_symlink_desc, (VOPFUNC)spec_symlink }, /* symlink */ - { &vop_readdir_desc, (VOPFUNC)spec_readdir }, /* readdir */ - { &vop_readlink_desc, (VOPFUNC)spec_readlink }, /* readlink */ - { &vop_abortop_desc, (VOPFUNC)spec_abortop }, /* abortop */ - { &vop_inactive_desc, (VOPFUNC)cd9660_inactive },/* inactive */ - { &vop_reclaim_desc, (VOPFUNC)cd9660_reclaim }, /* reclaim */ - { &vop_lock_desc, (VOPFUNC)cd9660_lock }, /* lock */ - { &vop_unlock_desc, (VOPFUNC)cd9660_unlock }, /* unlock */ - { &vop_bmap_desc, (VOPFUNC)spec_bmap }, /* bmap */ - { &vop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */ - { &vop_print_desc, (VOPFUNC)cd9660_print }, /* print */ - { &vop_islocked_desc, (VOPFUNC)cd9660_islocked },/* islocked */ - { &vop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */ - { &vop_advlock_desc, (VOPFUNC)spec_advlock }, /* advlock */ - { &vop_blkatoff_desc, 
(VOPFUNC)spec_blkatoff }, /* blkatoff */ - { &vop_valloc_desc, (VOPFUNC)spec_valloc }, /* valloc */ - { &vop_vfree_desc, (VOPFUNC)spec_vfree }, /* vfree */ - { &vop_truncate_desc, (VOPFUNC)spec_truncate }, /* truncate */ - { &vop_update_desc, (VOPFUNC)cd9660_update }, /* update */ - { &vop_bwrite_desc, (VOPFUNC)vn_bwrite }, - { &vop_devblocksize_desc, (VOPFUNC)spec_devblocksize }, /* devblocksize */ - { &vop_pagein_desc, (VOPFUNC)cd9660_pagein }, /* Pagein */ - { &vop_pageout_desc, (VOPFUNC)cd9660_pageout }, /* Pageout */ - { &vop_blktooff_desc, (VOPFUNC)cd9660_blktooff }, /* blktooff */ - { &vop_offtoblk_desc, (VOPFUNC)cd9660_offtoblk }, /* offtoblk */ - { &vop_cmap_desc, (VOPFUNC)cd9660_cmap }, /* cmap */ + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)spec_create }, /* create */ + { &vnop_mknod_desc, (VOPFUNC)spec_mknod }, /* mknod */ + { &vnop_open_desc, (VOPFUNC)spec_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)spec_close }, /* close */ + { &vnop_getattr_desc, (VOPFUNC)cd9660_getattr }, /* getattr */ + { &vnop_read_desc, (VOPFUNC)spec_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)spec_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)spec_select }, /* select */ + { &vnop_mmap_desc, (VOPFUNC)spec_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)spec_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)spec_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)spec_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)spec_rename }, /* rename */ + { &vnop_mkdir_desc, (VOPFUNC)spec_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)spec_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)spec_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)spec_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)spec_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)cd9660_inactive },/* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)cd9660_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)spec_advlock }, /* advlock */ + { &vnop_bwrite_desc, (VOPFUNC)vn_bwrite }, + { &vnop_devblocksize_desc, (VOPFUNC)spec_devblocksize }, /* devblocksize */ + { &vnop_pagein_desc, (VOPFUNC)cd9660_pagein }, /* Pagein */ + { &vnop_pageout_desc, (VOPFUNC)cd9660_pageout }, /* Pageout */ + { &vnop_blktooff_desc, (VOPFUNC)cd9660_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)cd9660_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (VOPFUNC)cd9660_blockmap }, /* blockmap */ { (struct vnodeop_desc*)NULL, (VOPFUNC)NULL } }; struct vnodeopv_desc cd9660_specop_opv_desc = @@ -1738,52 +1375,37 @@ struct vnodeopv_desc cd9660_specop_opv_desc = #if FIFO int (**cd9660_fifoop_p)(void *); struct vnodeopv_entry_desc cd9660_fifoop_entries[] = { - { &vop_default_desc, (VOPFUNC)vn_default_error }, - { &vop_lookup_desc, (VOPFUNC)fifo_lookup }, /* lookup */ - { &vop_create_desc, (VOPFUNC)fifo_create }, /* create */ - { &vop_mknod_desc, (VOPFUNC)fifo_mknod }, /* mknod */ - { &vop_open_desc, (VOPFUNC)fifo_open }, /* open */ - { &vop_close_desc, (VOPFUNC)fifo_close }, /* close */ - { &vop_access_desc, (VOPFUNC)cd9660_access }, /* access */ - { &vop_getattr_desc, (VOPFUNC)cd9660_getattr }, /* getattr */ - { &vop_setattr_desc, (VOPFUNC)cd9660_setattr }, /* setattr */ - { &vop_read_desc, (VOPFUNC)fifo_read }, /* 
read */ - { &vop_write_desc, (VOPFUNC)fifo_write }, /* write */ - { &vop_lease_desc, (VOPFUNC)fifo_lease_check }, /* lease */ - { &vop_ioctl_desc, (VOPFUNC)fifo_ioctl }, /* ioctl */ - { &vop_select_desc, (VOPFUNC)fifo_select }, /* select */ - { &vop_mmap_desc, (VOPFUNC)fifo_mmap }, /* mmap */ - { &vop_fsync_desc, (VOPFUNC)fifo_fsync }, /* fsync */ - { &vop_seek_desc, (VOPFUNC)fifo_seek }, /* seek */ - { &vop_remove_desc, (VOPFUNC)fifo_remove }, /* remove */ - { &vop_link_desc, (VOPFUNC)fifo_link } , /* link */ - { &vop_rename_desc, (VOPFUNC)fifo_rename }, /* rename */ - { &vop_mkdir_desc, (VOPFUNC)fifo_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (VOPFUNC)fifo_rmdir }, /* rmdir */ - { &vop_symlink_desc, (VOPFUNC)fifo_symlink }, /* symlink */ - { &vop_readdir_desc, (VOPFUNC)fifo_readdir }, /* readdir */ - { &vop_readlink_desc, (VOPFUNC)fifo_readlink }, /* readlink */ - { &vop_abortop_desc, (VOPFUNC)fifo_abortop }, /* abortop */ - { &vop_inactive_desc, (VOPFUNC)cd9660_inactive },/* inactive */ - { &vop_reclaim_desc, (VOPFUNC)cd9660_reclaim }, /* reclaim */ - { &vop_lock_desc, (VOPFUNC)cd9660_lock }, /* lock */ - { &vop_unlock_desc, (VOPFUNC)cd9660_unlock }, /* unlock */ - { &vop_bmap_desc, (VOPFUNC)fifo_bmap }, /* bmap */ - { &vop_strategy_desc, (VOPFUNC)fifo_strategy }, /* strategy */ - { &vop_print_desc, (VOPFUNC)cd9660_print }, /* print */ - { &vop_islocked_desc, (VOPFUNC)cd9660_islocked },/* islocked */ - { &vop_pathconf_desc, (VOPFUNC)fifo_pathconf }, /* pathconf */ - { &vop_advlock_desc, (VOPFUNC)fifo_advlock }, /* advlock */ - { &vop_blkatoff_desc, (VOPFUNC)fifo_blkatoff }, /* blkatoff */ - { &vop_valloc_desc, (VOPFUNC)fifo_valloc }, /* valloc */ - { &vop_vfree_desc, (VOPFUNC)fifo_vfree }, /* vfree */ - { &vop_truncate_desc, (VOPFUNC)fifo_truncate }, /* truncate */ - { &vop_update_desc, (VOPFUNC)cd9660_update }, /* update */ - { &vop_bwrite_desc, (VOPFUNC)vn_bwrite }, - { &vop_pagein_desc, (VOPFUNC)cd9660_pagein }, /* Pagein */ - { &vop_pageout_desc, (VOPFUNC)cd9660_pageout }, /* Pageout */ - { &vop_blktooff_desc, (VOPFUNC)cd9660_blktooff }, /* blktooff */ - { &vop_offtoblk_desc, (VOPFUNC)cd9660_offtoblk }, /* offtoblk */ + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)fifo_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)fifo_create }, /* create */ + { &vnop_mknod_desc, (VOPFUNC)fifo_mknod }, /* mknod */ + { &vnop_open_desc, (VOPFUNC)fifo_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)fifo_close }, /* close */ + { &vnop_getattr_desc, (VOPFUNC)cd9660_getattr }, /* getattr */ + { &vnop_read_desc, (VOPFUNC)fifo_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)fifo_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)fifo_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)fifo_select }, /* select */ + { &vnop_mmap_desc, (VOPFUNC)fifo_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)fifo_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)fifo_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)fifo_link } , /* link */ + { &vnop_rename_desc, (VOPFUNC)fifo_rename }, /* rename */ + { &vnop_mkdir_desc, (VOPFUNC)fifo_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)fifo_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)fifo_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)fifo_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)fifo_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)cd9660_inactive },/* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)cd9660_reclaim }, /* reclaim */ + { &vnop_strategy_desc, 
(VOPFUNC)fifo_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)fifo_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)fifo_advlock }, /* advlock */ + { &vnop_bwrite_desc, (VOPFUNC)vn_bwrite }, + { &vnop_pagein_desc, (VOPFUNC)cd9660_pagein }, /* Pagein */ + { &vnop_pageout_desc, (VOPFUNC)cd9660_pageout }, /* Pageout */ + { &vnop_blktooff_desc, (VOPFUNC)cd9660_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)cd9660_offtoblk }, /* offtoblk */ { (struct vnodeop_desc*)NULL, (VOPFUNC)NULL } }; struct vnodeopv_desc cd9660_fifoop_opv_desc = diff --git a/bsd/isofs/cd9660/iso.h b/bsd/isofs/cd9660/iso.h index b97e89154..683f9f0e1 100644 --- a/bsd/isofs/cd9660/iso.h +++ b/bsd/isofs/cd9660/iso.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -261,7 +261,6 @@ struct iso_mnt { int im_sector_size; int volume_space_size; - struct netexport im_export; char root[ISODCL (157, 190)]; int root_extent; @@ -291,7 +290,7 @@ struct iso_mnt { /* CD is Video CD (version < 2.0) */ #define IMF2_IS_VCD 0x00000002 -#define VFSTOISOFS(mp) ((struct iso_mnt *)((mp)->mnt_data)) +#define VFSTOISOFS(mp) ((struct iso_mnt *)(vfs_fsprivate(mp))) #define blkoff(imp, loc) ((loc) & (imp)->im_bmask) #define lblktosize(imp, blk) ((blk) << (imp)->im_bshift) @@ -302,23 +301,22 @@ struct iso_mnt { (off_t)(((off) / (imp)->im_sector_size) * (imp)->im_sector_size) -int cd9660_mount __P((struct mount *, - char *, caddr_t, struct nameidata *, struct proc *)); -int cd9660_start __P((struct mount *, int, struct proc *)); -int cd9660_unmount __P((struct mount *, int, struct proc *)); -int cd9660_root __P((struct mount *, struct vnode **)); -int cd9660_quotactl __P((struct mount *, int, uid_t, caddr_t, struct proc *)); -int cd9660_statfs __P((struct mount *, struct statfs *, struct proc *)); -int cd9660_sync __P((struct mount *, int, struct ucred *, struct proc *)); -int cd9660_vget __P((struct mount *, void *, struct vnode **)); -int cd9660_fhtovp __P((struct mount *, struct fid *, struct mbuf *, - struct vnode **, int *, struct ucred **)); -int cd9660_vptofh __P((struct vnode *, struct fid *)); -int cd9660_init __P(()); +int cd9660_mount(struct mount *, vnode_t, user_addr_t, vfs_context_t); +int cd9660_start(struct mount *, int, vfs_context_t); +int cd9660_unmount(struct mount *, int, vfs_context_t); +int cd9660_root(struct mount *, struct vnode **, vfs_context_t); +int cd9660_statfs(struct mount *, struct vfsstatfs *, vfs_context_t); +int cd9660_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, vfs_context_t context); +int cd9660_sync(struct mount *, int, vfs_context_t); +int cd9660_vget(struct mount *, ino64_t, struct vnode **, vfs_context_t); +int cd9660_fhtovp(struct mount *, int, unsigned char *, struct vnode **, vfs_context_t); +int cd9660_vptofh(struct vnode *, int *, unsigned char *, vfs_context_t); +int cd9660_init(struct vfsconf *); +int cd9660_mountroot(mount_t, vnode_t, vfs_context_t); +int cd9660_sysctl(int *, u_int, user_addr_t, size_t *, user_addr_t, size_t, vfs_context_t); -int cd9660_mountroot __P((void)); - -int cd9660_sysctl __P((int *, u_int, void *, size_t *, void *, size_t, struct proc *)); +int cd9660_vget_internal(mount_t, ino_t, vnode_t *, vnode_t, struct componentname *, + int, struct iso_directory_record *, proc_t); extern int (**cd9660_vnodeop_p)(void *); extern int (**cd9660_specop_p)(void *); @@ -328,15 +326,13 @@ extern int 
(**cd9660_fifoop_p)(void *); extern int (**cd9660_cdxaop_p)(void *); static __inline int -isonum_711(p) - u_char *p; +isonum_711(u_char *p) { return *p; } static __inline int -isonum_712(p) - char *p; +isonum_712(char *p) { return *p; } @@ -344,15 +340,13 @@ isonum_712(p) #ifndef UNALIGNED_ACCESS static __inline int -isonum_723(p) - u_char *p; +isonum_723(u_char *p) { return *p|(p[1] << 8); } static __inline int -isonum_733(p) - u_char *p; +isonum_733(u_char *p) { return *p|(p[1] << 8)|(p[2] << 16)|(p[3] << 24); } @@ -362,15 +356,13 @@ isonum_733(p) #if BYTE_ORDER == LITTLE_ENDIAN static __inline int -isonum_723(p) - u_char *p +isonum_723(u_char *p) { return *(u_int16t *)p; } static __inline int -isonum_733(p) - u_char *p; +isonum_733(u_char *p) { return *(u_int32t *)p; } @@ -380,15 +372,13 @@ isonum_733(p) #if BYTE_ORDER == BIG_ENDIAN static __inline int -isonum_723(p) - u_char *p +isonum_723(u_char *p) { return *(u_int16t *)(p + 2); } static __inline int -isonum_733(p) - u_char *p; +isonum_733(u_char *p) { return *(u_int32t *)(p + 4); } @@ -397,14 +387,21 @@ isonum_733(p) #endif /* UNALIGNED_ACCESS */ -int isofncmp __P((u_char *, int, u_char *, int)); -int ucsfncmp __P((u_int16_t *, int, u_int16_t *, int)); -void isofntrans __P((u_char *, int, u_char *, u_short *, int, int)); -void ucsfntrans __P((u_int16_t *, int, u_char *, u_short *, int, int)); -ino_t isodirino __P((struct iso_directory_record *, struct iso_mnt *)); -int attrcalcsize __P((struct attrlist *attrlist)); -void packattrblk __P((struct attrlist *alist, struct vnode *vp, - void **attrbufptrptr, void **varbufptrptr)); +int isofncmp(u_char *fn, int fnlen, u_char *isofn, int isolen); +int ucsfncmp(u_int16_t *, int, u_int16_t *, int); +void isofntrans(u_char *infn, int infnlen, u_char *outfn, u_short *outfnlen, + int original, int assoc); +void ucsfntrans(u_int16_t *, int, u_char *, u_short *, int, int); +int attrcalcsize(struct attrlist *attrlist); +struct iso_node; +void packcommonattr(struct attrlist *alist, struct iso_node *ip, + void **attrbufptrptr, void **varbufptrptr); +void packdirattr(struct attrlist *alist, struct iso_node *ip, + void **attrbufptrptr, void **varbufptrptr); +void packfileattr(struct attrlist *alist, struct iso_node *ip, + void **attrbufptrptr, void **varbufptrptr); +void packattrblk(struct attrlist *alist, struct vnode *vp, + void **attrbufptrptr, void **varbufptrptr); /* diff --git a/bsd/isofs/cd9660/iso_rrip.h b/bsd/isofs/cd9660/iso_rrip.h index 5081d1bdc..cfc5f1397 100644 --- a/bsd/isofs/cd9660/iso_rrip.h +++ b/bsd/isofs/cd9660/iso_rrip.h @@ -99,15 +99,15 @@ typedef struct { int cont; /* continuation of above */ } ISO_RRIP_ANALYZE; -int cd9660_rrip_analyze __P((struct iso_directory_record *isodir, - struct iso_node *inop, struct iso_mnt *imp)); -int cd9660_rrip_getname __P((struct iso_directory_record *isodir, +int cd9660_rrip_analyze(struct iso_directory_record *isodir, + struct iso_node *inop, struct iso_mnt *imp); +int cd9660_rrip_getname(struct iso_directory_record *isodir, char *outbuf, u_short *outlen, - ino_t *inump, struct iso_mnt *imp)); -int cd9660_rrip_getsymname __P((struct iso_directory_record *isodir, + ino_t *inump, struct iso_mnt *imp); +int cd9660_rrip_getsymname(struct iso_directory_record *isodir, char *outbuf, u_short *outlen, - struct iso_mnt *imp)); -int cd9660_rrip_offset __P((struct iso_directory_record *isodir, - struct iso_mnt *imp)); + struct iso_mnt *imp); +int cd9660_rrip_offset(struct iso_directory_record *isodir, + struct iso_mnt *imp); #endif /* __APPLE_API_PRIVATE */ 
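The header cleanups above do two things: the pre-ANSI __P(()) prototype wrappers and K&R parameter lists are replaced by plain ANSI prototypes, and the isonum_7xx readers are ANSI-fied as well. The isonum encodings come from ISO 9660's "both-byte order" fields: a 7.1.1 field is a single unsigned byte, a 7.2.3 field stores a 16-bit value twice (little-endian then big-endian, 4 bytes total), and a 7.3.3 field stores a 32-bit value twice (8 bytes total), which is why the big-endian fast paths above read at p + 2 and p + 4 while the little-endian paths read at p. A byte-order-independent reader, essentially restating the UNALIGNED_ACCESS-safe variant above:

    #include <stdint.h>

    /*
     * 7.3.3: 32-bit value stored little-endian at p[0..3] (and again
     * big-endian at p[4..7]); assembling the LE copy a byte at a time
     * makes alignment and host byte order irrelevant.
     */
    static inline uint32_t
    isonum_733_portable(const unsigned char *p)
    {
            return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
                   ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
    }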
#endif /* __ISOFS_CD9660_ISO_RRIP_H__ */ diff --git a/bsd/kern/ast.h b/bsd/kern/ast.h index f2b6aa07e..d320003a9 100644 --- a/bsd/kern/ast.h +++ b/bsd/kern/ast.h @@ -28,18 +28,10 @@ #ifndef _KERN_AST_H_ #define _KERN_AST_H_ -#include <kern/thread_act.h> - -#ifdef BSD_USE_APC - -extern thread_apc_handler_t bsd_ast; - -#else /* !BSD_USE_APC */ +#include <kern/thread.h> extern void astbsd_on(void); -extern void act_set_astbsd(thread_act_t); -extern void bsd_ast(thread_act_t); - -#endif /* !BSD_USE_APC */ +extern void act_set_astbsd(thread_t); +extern void bsd_ast(thread_t); #endif /* _KERN_AST_H_ */ diff --git a/bsd/kern/bsd_init.c b/bsd/kern/bsd_init.c index 2864a6cfd..a0d66765f 100644 --- a/bsd/kern/bsd_init.c +++ b/bsd/kern/bsd_init.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -70,23 +70,25 @@ #include <sys/param.h> #include <sys/filedesc.h> #include <sys/kernel.h> -#include <sys/mount.h> -#include <sys/proc.h> +#include <sys/mount_internal.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/systm.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/conf.h> -#include <sys/buf.h> +#include <sys/buf_internal.h> #include <sys/clist.h> #include <sys/user.h> #include <sys/time.h> #include <sys/systm.h> +#include <sys/mman.h> #include <bsm/audit_kernel.h> #include <sys/malloc.h> #include <sys/dkstat.h> -#include <machine/spl.h> +#include <kern/startup.h> #include <kern/thread.h> #include <kern/task.h> #include <kern/ast.h> @@ -113,6 +115,8 @@ #include <mach/shared_memory_server.h> #include <vm/vm_shared_memory_server.h> +#include <net/init.h> + extern int app_profile; /* on/off switch for pre-heat cache */ char copyright[] = @@ -126,7 +130,6 @@ extern void ux_handler(); struct proc proc0; struct session session0; struct pgrp pgrp0; -struct pcred cred0; struct filedesc filedesc0; struct plimit limit0; struct pstats pstats0; @@ -138,6 +141,7 @@ long tk_nin; long tk_nout; long tk_rawcc; +int lock_trace = 0; /* Global variables to make pstat happy. We do swapping differently */ int nswdev, nswap; int nswapmap; @@ -153,11 +157,10 @@ int hostnamelen; char domainname[MAXDOMNAMELEN]; int domainnamelen; char classichandler[32] = {0}; -long classichandler_fsid = -1L; +uint32_t classichandler_fsid = -1L; long classichandler_fileid = -1L; char rootdevice[16]; /* hfs device names have at least 9 chars */ -struct timeval boottime; /* GRODY! This has to go... */ #ifdef KMEMSTATS struct kmemstats kmemstats[M_LAST]; @@ -179,11 +182,17 @@ extern int bsd_hardclockinit; extern task_t bsd_init_task; extern char init_task_failure_data[]; extern void time_zone_slock_init(void); +static void process_name(char *, struct proc *); + +static void setconf(void); funnel_t *kernel_flock; -funnel_t *network_flock; -int disable_funnel = 0; /* disables split funnel */ -int enable_funnel = 0; /* disables split funnel */ + +extern void sysv_shm_lock_init(void); +extern void sysv_sem_lock_init(void); +extern void sysv_msg_lock_init(void); +extern void pshm_lock_init(); +extern void psem_lock_init(); /* * Initialization code. @@ -200,8 +209,8 @@ int enable_funnel = 0; /* disables split funnel */ /* * Sets the name for the given task. 
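The helper defined next is renamed from proc_name() to a file-static process_name(), presumably to free the old name for wider use; note that the definition itself is still written in K&R style. Its ANSI equivalent would look like the sketch below (the body is elided in this hunk, so the comment only paraphrases the function's stated purpose):

    /*
     * ANSI form of the K&R definition below. The body, not shown in
     * this hunk, presumably copies s into the proc's name field.
     */
    static void
    process_name(char *s, struct proc *p)
    {
            /* ... */
    }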
*/ -void -proc_name(s, p) +static void +process_name(s, p) char *s; struct proc *p; { @@ -218,31 +227,47 @@ struct rlimit vm_initial_limit_stack = { DFLSSIZ, MAXSSIZ }; struct rlimit vm_initial_limit_data = { DFLDSIZ, MAXDSIZ }; struct rlimit vm_initial_limit_core = { DFLCSIZ, MAXCSIZ }; -extern thread_t first_thread; -extern thread_act_t cloneproc(struct proc *, int); -extern int (*mountroot) __P((void)); +extern thread_t cloneproc(struct proc *, int); +extern int (*mountroot)(void); extern int netboot_mountroot(); /* netboot.c */ extern int netboot_setup(struct proc * p); +lck_grp_t * proc_lck_grp; +lck_grp_attr_t * proc_lck_grp_attr; +lck_attr_t * proc_lck_attr; + /* hook called after root is mounted XXX temporary hack */ void (*mountroot_post_hook)(void); +/* + * This function is called very early on in the Mach startup, from the + * function start_kernel_threads() in osfmk/kern/startup.c. It's called + * in the context of the current (startup) task using a call to the + * function kernel_thread_create() to jump into start_kernel_threads(). + * Internally, kernel_thread_create() calls thread_create_internal(), + * which calls uthread_alloc(). The function of uthread_alloc() is + * normally to allocate a uthread structure, and fill out the uu_sigmask, + * uu_act, and uu_ucred fields. It skips filling these out in the case + * of the "task" being "kernel_task", because the order of operation is + * inverted. To account for that, we need to manually fill in at least + * the uu_cred field so that the uthread structure can be used like any + * other. + */ void bsd_init() { register struct proc *p; - extern struct ucred *rootcred; + struct uthread *ut; + extern kauth_cred_t rootcred; register int i; int s; thread_t th; + struct vfs_context context; void lightning_bolt(void ); kern_return_t ret; boolean_t funnel_state; - extern void uthread_zone_init(); - - - /* split funnel is enabled by default */ - PE_parse_boot_arg("dfnl", &disable_funnel); + struct ucred temp_cred; + extern void file_lock_init(void); kernel_flock = funnel_alloc(KERNEL_FUNNEL); if (kernel_flock == (funnel_t *)0 ) { @@ -251,29 +276,19 @@ bsd_init() funnel_state = thread_funnel_set(kernel_flock, TRUE); - if (!disable_funnel) { - network_flock = funnel_alloc(NETWORK_FUNNEL); - if (network_flock == (funnel_t *)0 ) { - panic("bsd_init: Failed to allocate network funnel"); - } - } else { - network_flock = kernel_flock; - } - printf(copyright); - + kmeminit(); parse_bsd_args(); - bsd_bufferinit(); - /* Initialize the uthread zone */ - uthread_zone_init(); + //uthread_zone_init(); /* XXX redundant: previous uthread_alloc() */ - /* - * Initialize process and pgrp structures. - */ + /* Initialize kauth subsystem before instancing the first credential */ + kauth_init(); + + /* Initialize process and pgrp structures. 
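As the block comment above bsd_init() explains, the startup thread's uthread was allocated before any credential existed, so once proc0 is set up the code below creates the first kauth credential from a zeroed template and hands the already-running thread a reference manually. Condensed from this hunk (p is proc0, ut the current uthread):

    struct ucred temp_cred;

    bzero(&temp_cred, sizeof(temp_cred));
    temp_cred.cr_ngroups = 1;                      /* group 0 */
    p->p_ucred = kauth_cred_create(&temp_cred);    /* first kauth credential */

    kauth_cred_ref(p->p_ucred);                    /* a reference for the thread */
    ut->uu_ucred = p->p_ucred;                     /* what uthread_alloc() skipped */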
*/ procinit(); kernproc = &proc0; @@ -285,12 +300,29 @@ bsd_init() p->p_pid = 0; /* give kernproc a name */ - proc_name("kernel_task", p); + process_name("kernel_task", p); + + + /* allocate proc lock group attribute and group */ + proc_lck_grp_attr= lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(proc_lck_grp_attr); + + proc_lck_grp = lck_grp_alloc_init("proc", proc_lck_grp_attr); + + + /* Allocate proc lock attribute */ + proc_lck_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(proc_lck_attr); + + lck_mtx_init(&p->p_mlock, proc_lck_grp, proc_lck_attr); + lck_mtx_init(&p->p_fdmlock, proc_lck_grp, proc_lck_attr); if (current_task() != kernel_task) printf("bsd_init: We have a problem, " "current task is not kernel task\n"); + ut = (uthread_t)get_bsdthread_info(current_thread()); + /* * Create process 0. */ @@ -307,7 +339,7 @@ bsd_init() p->task = kernel_task; p->p_stat = SRUN; - p->p_flag = P_INMEM|P_SYSTEM; + p->p_flag = P_SYSTEM; p->p_nice = NZERO; p->p_pptr = p; lockinit(&p->signal_lock, PVM, "signal", 0, 0); @@ -316,20 +348,26 @@ bsd_init() p->sigwait_thread = THREAD_NULL; p->exit_thread = THREAD_NULL; - /* Create credentials. */ - lockinit(&cred0.pc_lock, PLOCK, "proc0 cred", 0, 0); - cred0.p_refcnt = 1; - p->p_cred = &cred0; - p->p_ucred = crget(); - p->p_ucred->cr_ngroups = 1; /* group 0 */ + /* + * Create credential. This also Initializes the audit information. + * XXX It is not clear what the initial values should be for audit ID, + * XXX session ID, etc.. + */ + bzero(&temp_cred, sizeof(temp_cred)); + temp_cred.cr_ngroups = 1; + + p->p_ucred = kauth_cred_create(&temp_cred); + + /* give the (already exisiting) initial thread a reference on it */ + kauth_cred_ref(p->p_ucred); + ut->uu_ucred = p->p_ucred; TAILQ_INIT(&p->aio_activeq); TAILQ_INIT(&p->aio_doneq); p->aio_active_count = 0; p->aio_done_count = 0; - /* Set the audit info for this process */ - audit_proc_init(p); + file_lock_init(); /* Create the file descriptor table. */ filedesc0.fd_refcnt = 1+1; /* +1 so shutdown will not _FREE_ZONE */ @@ -357,7 +395,7 @@ bsd_init() p->p_sigacts = &sigacts0; /* - * Charge root for one process. + * Charge root for two processes: init and mach_init. */ (void)chgproccnt(0, 1); @@ -372,12 +410,21 @@ bsd_init() &min, (vm_size_t)BSD_PAGABLE_MAP_SIZE, TRUE, - TRUE, + VM_FLAGS_ANYWHERE, &bsd_pageable_map); if (ret != KERN_SUCCESS) panic("bsd_init: Failed to allocate bsd pageable map"); } + /* + * Initialize buffers and hash links for buffers + * + * SIDE EFFECT: Starts a thread for bcleanbuf_thread(), so must + * happen after a credential has been associated with + * the kernel task. + */ + bsd_bufferinit(); + /* Initialize the execve() semaphore */ ret = semaphore_create(kernel_task, &execve_semaphore, SYNC_POLICY_FIFO, (BSD_PAGABLE_MAP_SIZE / NCARGS)); @@ -397,9 +444,6 @@ bsd_init() /* Initialize mbuf's. */ mbinit(); - /* Initialize syslog */ - log_init(); - /* * Initializes security event auditing. * XXX: Should/could this occur later? @@ -412,6 +456,18 @@ bsd_init() /* Initialize for async IO */ aio_init(); + /* Initialize pipes */ + pipeinit(); + + /* Initialize SysV shm subsystem locks; the subsystem proper is + * initialized through a sysctl. + */ + sysv_shm_lock_init(); + sysv_sem_lock_init(); + sysv_msg_lock_init(); + pshm_lock_init(); + psem_lock_init(); + /* POSIX Shm and Sem */ pshm_cache_init(); psem_cache_init(); @@ -421,13 +477,12 @@ bsd_init() * Initialize protocols. Block reception of incoming packets * until everything is ready. 
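The proc mutexes above use the new lck_* KPI; the four-step bring-up seen there (group attribute, named group, lock attribute, mutex init) is the standard sequence. A self-contained sketch with hypothetical myfs names, using only the calls that appear in this hunk:

    static lck_grp_attr_t  *myfs_lck_grp_attr;
    static lck_grp_t       *myfs_lck_grp;
    static lck_attr_t      *myfs_lck_attr;
    static lck_mtx_t        myfs_mtx;

    static void
    myfs_locks_init(void)
    {
            /* group attribute, then a named group (handy for lock statistics) */
            myfs_lck_grp_attr = lck_grp_attr_alloc_init();
            myfs_lck_grp = lck_grp_alloc_init("myfs", myfs_lck_grp_attr);

            /* per-lock attribute, then the mutex itself */
            myfs_lck_attr = lck_attr_alloc_init();
            lck_mtx_init(&myfs_mtx, myfs_lck_grp, myfs_lck_attr);
    }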
*/ - s = splimp(); sysctl_register_fixed(); sysctl_mib_init(); dlil_init(); + proto_kpi_init(); socketinit(); domaininit(); - splx(s); p->p_fd->fd_cdir = NULL; p->p_fd->fd_rdir = NULL; @@ -456,42 +511,53 @@ bsd_init() /* Register the built-in dlil ethernet interface family */ ether_family_init(); + /* Call any kext code that wants to run just after network init */ + net_init_run(); + vnode_pager_bootstrap(); +#if 0 + /* XXX Hack for early debug stop */ + printf("\nabout to sleep for 10 seconds\n"); + IOSleep( 10 * 1000 ); + /* Debugger("hello"); */ +#endif + + inittodr(0); /* Mount the root file system. */ while( TRUE) { int err; setconf(); - /* - * read the time after clock_initialize_calendar() - * and before nfs mount - */ - microtime((struct timeval *)&time); - bsd_hardclockinit = -1; /* start ticking */ if (0 == (err = vfs_mountroot())) break; +#if NFSCLIENT if (mountroot == netboot_mountroot) { printf("cannot mount network root, errno = %d\n", err); mountroot = NULL; if (0 == (err = vfs_mountroot())) break; } +#endif printf("cannot mount root, errno = %d\n", err); boothowto |= RB_ASKNAME; } - mountlist.cqh_first->mnt_flag |= MNT_ROOTFS; + context.vc_proc = p; + context.vc_ucred = p->p_ucred; + mountlist.tqh_first->mnt_flag |= MNT_ROOTFS; /* Get the vnode for '/'. Set fdp->fd_fd.fd_cdir to reference it. */ - if (VFS_ROOT(mountlist.cqh_first, &rootvnode)) + if (VFS_ROOT(mountlist.tqh_first, &rootvnode, &context)) panic("bsd_init: cannot find root vnode"); - VREF(rootvnode); + rootvnode->v_flag |= VROOT; + (void)vnode_ref(rootvnode); + (void)vnode_put(rootvnode); filedesc0.fd_cdir = rootvnode; - VOP_UNLOCK(rootvnode, 0, p); +#if NFSCLIENT if (mountroot == netboot_mountroot) { int err; /* post mount setup */ @@ -499,14 +565,10 @@ bsd_init() panic("bsd_init: NetBoot could not find root, %d", err); } } +#endif - /* - * Now can look at time, having had a chance to verify the time - * from the file system. Reset p->p_rtime as it may have been - * munched in mi_switch() after the time got set. - */ - p->p_stats->p_start = boottime = time; + microtime(&p->p_stats->p_start); p->p_rtime.tv_sec = p->p_rtime.tv_usec = 0; #if DEVFS @@ -536,14 +598,14 @@ bsdinit_task(void) struct proc *p = current_proc(); struct uthread *ut; kern_return_t kr; - thread_act_t th_act; + thread_t th_act; shared_region_mapping_t system_region; - proc_name("init", p); + process_name("init", p); ux_handler_init(); - th_act = current_act(); + th_act = current_thread(); (void) host_set_exception_ports(host_priv_self(), EXC_MASK_ALL & ~(EXC_MASK_SYSCALL | EXC_MASK_MACH_SYSCALL | @@ -567,17 +629,16 @@ bsdinit_task(void) bsd_hardclockinit = 1; /* Start bsd hardclock */ bsd_init_task = get_threadtask(th_act); init_task_failure_data[0] = 0; - system_region = lookup_default_shared_region(ENV_DEFAULT_ROOT, - machine_slot[cpu_number()].cpu_type); + system_region = lookup_default_shared_region(ENV_DEFAULT_ROOT, cpu_type()); if (system_region == NULL) { - shared_file_boot_time_init(ENV_DEFAULT_ROOT, - machine_slot[cpu_number()].cpu_type); + shared_file_boot_time_init(ENV_DEFAULT_ROOT, cpu_type()); } else { vm_set_shared_region(get_threadtask(th_act), system_region); } load_init_program(p); /* turn on app-profiling i.e. 
pre-heating */ app_profile = 1; + lock_trace = 1; } void @@ -617,7 +678,8 @@ bsd_autoconf() #include <sys/disklabel.h> /* for MAXPARTITIONS */ -setconf() +static void +setconf(void) { extern kern_return_t IOFindBSDRoot( char * rootName, dev_t * root, u_int32_t * flags ); @@ -640,25 +702,29 @@ setconf() flags = 0; } +#if NFSCLIENT if( flags & 1 ) { /* network device */ mountroot = netboot_mountroot; } else { +#endif /* otherwise have vfs determine root filesystem */ mountroot = NULL; +#if NFSCLIENT } +#endif } bsd_utaskbootstrap() { - thread_act_t th_act; + thread_t th_act; struct uthread *ut; th_act = cloneproc(kernproc, 0); initproc = pfind(1); /* Set the launch time for init */ - initproc->p_stats->p_start = time; + microtime(&initproc->p_stats->p_start); ut = (struct uthread *)get_bsdthread_info(th_act); ut->uu_sigmask = 0; @@ -733,56 +799,10 @@ parse_bsd_args() return 0; } -boolean_t -thread_funnel_switch( - int oldfnl, - int newfnl) +#if !NFSCLIENT +int +netboot_root(void) { - boolean_t funnel_state_prev; - int curfnl; - funnel_t * curflock; - funnel_t * oldflock; - funnel_t * newflock; - funnel_t * exist_funnel; - extern int disable_funnel; - - - if (disable_funnel) - return(TRUE); - - if(oldfnl == newfnl) { - panic("thread_funnel_switch: can't switch to same funnel"); - } - - if ((oldfnl != NETWORK_FUNNEL) && (oldfnl != KERNEL_FUNNEL)) { - panic("thread_funnel_switch: invalid oldfunnel"); - } - if ((newfnl != NETWORK_FUNNEL) && (newfnl != KERNEL_FUNNEL)) { - panic("thread_funnel_switch: invalid newfunnel"); - } - - if((curflock = thread_funnel_get()) == THR_FUNNEL_NULL) { - panic("thread_funnel_switch: no funnel held"); - } - - if ((oldfnl == NETWORK_FUNNEL) && (curflock != network_flock)) - panic("thread_funnel_switch: network funnel not held"); - - if ((oldfnl == KERNEL_FUNNEL) && (curflock != kernel_flock)) - panic("thread_funnel_switch: kernel funnel not held"); - - if(oldfnl == NETWORK_FUNNEL) { - oldflock = network_flock; - newflock = kernel_flock; - } else { - oldflock = kernel_flock; - newflock = network_flock; - } - KERNEL_DEBUG(0x603242c | DBG_FUNC_NONE, oldflock, 1, 0, 0, 0); - thread_funnel_set(oldflock, FALSE); - KERNEL_DEBUG(0x6032428 | DBG_FUNC_NONE, newflock, 1, 0, 0, 0); - thread_funnel_set(newflock, TRUE); - KERNEL_DEBUG(0x6032434 | DBG_FUNC_NONE, newflock, 1, 0, 0, 0); - - return(TRUE); + return(0); } +#endif diff --git a/bsd/kern/bsd_stubs.c b/bsd/kern/bsd_stubs.c index 0597434fa..9a0c06054 100644 --- a/bsd/kern/bsd_stubs.c +++ b/bsd/kern/bsd_stubs.c @@ -28,12 +28,14 @@ #include <vm/vm_map.h> #include <sys/systm.h> #include <sys/conf.h> +#include <sys/proc_internal.h> #include <sys/buf.h> /* for SET */ #include <sys/user.h> /* Just to satisfy pstat command */ int dmmin, dmmax, dmtext; +vm_offset_t kmem_mb_alloc(vm_map_t mbmap, int size) { vm_offset_t addr; @@ -46,7 +48,13 @@ kmem_mb_alloc(vm_map_t mbmap, int size) } -pcb_synch() {} +/* + * XXX this function only exists to be exported and do nothing. 
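One pattern worth calling out from the root-mount changes earlier in this file: VFS entry points such as VFS_ROOT() now take an explicit vfs_context carrying the initiating process and its credential, instead of a bare struct proc, and mountlist has moved from a circle queue (cqh_first) to a tail queue (tqh_first). Restated from the hunk above, with vc_proc and vc_ucred as shown in this patch:

    struct vfs_context context;

    context.vc_proc = p;             /* who is asking                 */
    context.vc_ucred = p->p_ucred;   /* credential used for the call  */

    if (VFS_ROOT(mountlist.tqh_first, &rootvnode, &context))
            panic("bsd_init: cannot find root vnode");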
+ */ +void +pcb_synch(void) +{ +} struct proc * current_proc(void) @@ -54,10 +62,10 @@ current_proc(void) /* Never returns a NULL */ struct uthread * ut; struct proc *p; - thread_act_t thr_act = current_act(); + thread_t thr_act = current_thread(); ut = (struct uthread *)get_bsdthread_info(thr_act); - if (ut && (ut->uu_flag & P_VFORK) && ut->uu_proc) { + if (ut && (ut->uu_flag & UT_VFORK) && ut->uu_proc) { p = ut->uu_proc; if ((p->p_flag & P_INVFORK) == 0) panic("returning child proc not under vfork"); diff --git a/bsd/kern/init_sysent.c b/bsd/kern/init_sysent.c index 431063091..d0ea18c24 100644 --- a/bsd/kern/init_sysent.c +++ b/bsd/kern/init_sysent.c @@ -1,851 +1,463 @@ /* - * Copyright (c) 1995-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. + * @APPLE_LICENSE_HEADER_END@ * - * @APPLE_LICENSE_HEADER_END@ + * + * System call switch table. + * + * DO NOT EDIT-- this file is automatically generated. 
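The current_proc() change above also shows the uthread flag rename (P_VFORK became UT_VFORK, giving uthread flags their own namespace) and the vfork special case: a vfork child runs on its parent's thread, so the uthread carries an override proc pointer that takes precedence over the task's proc. A condensed sketch; the final fallback is presumed from the unchanged tail of the function, which this hunk does not show:

    struct proc *
    current_proc_sketch(void)                   /* hypothetical name */
    {
            struct uthread *ut;

            ut = (struct uthread *)get_bsdthread_info(current_thread());
            if (ut && (ut->uu_flag & UT_VFORK) && ut->uu_proc)
                    return ut->uu_proc;         /* vfork child */

            /* presumed fallback: the proc hung off the current task */
            return (struct proc *)get_bsdtask_info(current_task());
    }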
+ * created from syscalls.master */ #include <sys/appleapiopts.h> #include <sys/param.h> #include <sys/systm.h> -#include <sys/signal.h> -#include <sys/mount.h> - -/* serial or parallel system call */ -#define syss(fn,no) {no, 0, KERNEL_FUNNEL, fn} -#define sysp(fn,no) {no, 1, KERNEL_FUNNEL, fn} -#define sysnets(fn,no) {no, 0, NETWORK_FUNNEL, fn} -#define sysnetp(fn,no) {no, 1, NETWORK_FUNNEL, fn} -#define sysnofnl(fn,no) {no, 0, NO_FUNNEL, fn} - -/* - * definitions - */ -int nosys(); -int exit(); -int fork(); -int read(); -int write(); -int open(); -int close(); -int wait4(); -int link(); -int unlink(); -int chdir(); -int fchdir(); -int mknod(); -int chmod(); -int chown(); -int obreak(); -int getfsstat(); -#if COMPAT_GETFSSTAT -int ogetfsstat(); -#endif -int getpid(); -int mount(); -int unmount(); -int setuid(); -int getuid(); -int geteuid(); -int ptrace(); -int recvmsg(); -int sendmsg(); -int recvfrom(); -int accept(); -int getpeername(); -int getsockname(); -int access(); -int chflags(); -int fchflags(); -int sync(); -int kill(); -int getppid(); -int dup(); -int pipe(); -int getegid(); -int profil(); -int load_shared_file(); -int reset_shared_file(); -int new_system_shared_regions(); -int ktrace(); -int sigaction(); -int getgid(); -int sigprocmask(); -int getlogin(); -int setlogin(); -int acct(); -int sigpending(); -int sigaltstack(); -int ioctl(); -int reboot(); -int revoke(); -int symlink(); -int readlink(); -int execve(); -int umask(); -int chroot(); -int msync(); -int vfork(); -int sbrk(); -int sstk(); -int ovadvise(); -int munmap(); -int mprotect(); -int madvise(); -int mincore(); -int getgroups(); -int setgroups(); -int getpgrp(); -int setpgid(); -int setitimer(); -int swapon(); -int getitimer(); -int getdtablesize(); -int dup2(); -int fcntl(); -int select(); -int fsync(); -int setpriority(); -int socket(); -int connect(); -int getpriority(); +#include <sys/types.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> #ifdef __ppc__ -int osigreturn(); -#endif -int sigreturn(); -int bind(); -int setsockopt(); -int listen(); -int sigsuspend(); -#if TRACE -int vtrace(); +#define AC(name) (sizeof(struct name) / sizeof(uint64_t)) #else +#define AC(name) (sizeof(struct name) / sizeof(register_t)) #endif -int gettimeofday(); -#ifdef __ppc__ -int ppc_gettimeofday(); -#endif -int getrusage(); -int getsockopt(); -int readv(); -int writev(); -int settimeofday(); -int fchown(); -int fchmod(); -int rename(); -int flock(); -int mkfifo(); -int sendto(); -int shutdown(); -int socketpair(); -int mkdir(); -int rmdir(); -int utimes(); -int futimes(); -int adjtime(); -int setsid(); -int quotactl(); -int nfssvc(); -int statfs(); -int fstatfs(); -int getfh(); -int setgid(); -int setegid(); -int seteuid(); -int stat(); -int fstat(); -int lstat(); -int pathconf(); -int fpathconf(); -int getrlimit(); -int setrlimit(); -int getdirentries(); -int mmap(); -int nosys(); -int lseek(); -int truncate(); -int ftruncate(); -int __sysctl(); -int undelete(); -int setprivexec(); -int add_profil(); - -int kdebug_trace(); - -int mlock(); -int munlock(); -int minherit(); -int mlockall(); -int munlockall(); -#if COMPAT_43 -#define compat(name,n) syss(__CONCAT(o,name),n) -#define compatp(name,n) sysp(__CONCAT(o,name),n) -#define comaptnet(name,n) sysnets(__CONCAT(o,name),n) -#define comaptnetp(name,n) sysnetp(__CONCAT(o,name),n) - -int ocreat(); -int olseek(); -int ostat(); -int olstat(); -int ofstat(); -int ogetkerninfo(); -int osmmap(); -int ogetpagesize(); -int ommap(); -int owait(); -int ogethostname(); -int 
osethostname(); -int oaccept(); -int osend(); -int orecv(); -int osigvec(); -int osigblock(); -int osigsetmask(); -int osigstack(); -int orecvmsg(); -int osendmsg(); -int orecvfrom(); -int osetreuid(); -int osetregid(); -int otruncate(); -int oftruncate(); -int ogetpeername(); -int ogethostid(); -int osethostid(); -int ogetrlimit(); -int osetrlimit(); -int okillpg(); -int oquota(); -int ogetsockname(); -int ogetdomainname(); -int osetdomainname(); -int owait3(); -int ogetdirentries(); -#if NETAT -int ATsocket(); -int ATgetmsg(); -int ATputmsg(); -int ATPsndreq(); -int ATPsndrsp(); -int ATPgetreq(); -int ATPgetrsp(); -#endif /* NETAT */ - -/* Calls for supporting HFS Semantics */ - -int mkcomplex(); -int statv(); -int lstatv(); -int fstatv(); -int getattrlist(); -int setattrlist(); -int getdirentriesattr(); -int exchangedata(); -int checkuseraccess(); -int searchfs(); -int delete(); -int copyfile(); - -/* end of HFS calls */ - -#else /* COMPAT_43 */ -#define compat(n, name) syss(nosys,0) -#define compatp(n, name) sysp(nosys,0) -#define comaptnet(n, name) sysnets(nosys,0) -#define comaptnetp(n, name) sysnetp(nosys,0) -#endif /* COMPAT_43 */ - -int watchevent(); -int waitevent(); -int modwatch(); -int fsctl(); -int semsys(); -int msgsys(); -int shmsys(); -int semctl(); -int semget(); -int semop(); -int semconfig(); -int msgctl(); -int msgget(); -int msgsnd(); -int msgrcv(); -int shmat(); -int shmctl(); -int shmdt(); -int shmget(); -int shm_open(); -int shm_unlink(); -int sem_open(); -int sem_close(); -int sem_unlink(); -int sem_wait(); -int sem_trywait(); -int sem_post(); -int sem_getvalue(); -int sem_init(); -int sem_destroy(); - -int fmod_watch_enable(); -int fmod_watch(); - -int issetugid(); -int utrace(); -int pread(); -int pwrite(); -int getsid(); -int getpgid(); - -int __pthread_kill(); -int sigwait(); -int pthread_sigmask(); -int __disable_threadsignal(); - -int nfsclnt(); -int fhopen(); - -int aio_cancel(); -int aio_error(); -int aio_fsync(); -int aio_read(); -int aio_return(); -int aio_suspend(); -int aio_write(); -int lio_listio(); - -int kqueue(); -int kqueue_portset_np(); -int kqueue_from_portset_np(); -int kevent(); - -int audit(); -int auditon(); -int getauid(); -int setauid(); -int getaudit(); -int setaudit(); -int getaudit_addr(); -int setaudit_addr(); -int auditctl(); - -/* - * System call switch table. - */ - -/* - * N.B. - * The argument count numbers in this table are actually - * the number of UInt32 words that comprise the arguments - * not the number of arguments - * - * This value is not currently used on PPC but Intel Darwin - * does use it and will not work correctly if the values - * are wrong - */ -struct sysent sysent[] = { - syss(nosys,0), /* 0 = indir */ - syss(exit,1), /* 1 = exit */ - syss(fork,0), /* 2 = fork */ - sysp(read,3), /* 3 = read */ - sysp(write,3), /* 4 = write */ - syss(open,3), /* 5 = open */ - syss(close,1), /* 6 = close */ - syss(wait4, 4), /* 7 = wait4 */ - compat(creat,2), /* 8 = old creat */ - syss(link,2), /* 9 = link */ - syss(unlink,1), /* 10 = unlink */ - syss(nosys, 0), /* 11 was obsolete execv */ - syss(chdir,1), /* 12 = chdir */ - syss(fchdir,1), /* 13 = fchdir */ - syss(mknod,3), /* 14 = mknod */ - syss(chmod,2), /* 15 = chmod */ - syss(chown,3), /* 16 = chown; now 3 args */ - syss(obreak,1), /* 17 = old break */ +/* The casts are bogus but will do for now. 
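The regenerated table that follows replaces the old syss()/sysp() macro entries with explicit struct sysent initializers. Reading one generated entry field by field (the gloss on each field is an interpretation of the names, not taken from this file):

    { AC(read_args),        /* argument words: sizeof(struct read_args) divided by
                               sizeof(register_t), or uint64_t on ppc, per AC() above */
      _SYSCALL_CANCEL_PRE,  /* presumably: cancellation checked before the handler runs */
      NO_FUNNEL,            /* runs without a funnel; other entries say KERNEL_FUNNEL */
      (sy_call_t *)read,    /* the handler itself */
      munge_www,            /* argument munger for 32-bit user processes (three words) */
      munge_ddd,            /* argument munger for 64-bit user processes */
      _SYSCALL_RET_SSIZE_T  /* how the result is delivered back to user space */
    },                      /* 3 = read */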
*/ +__private_extern__ struct sysent sysent[] = { + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 0 = nosys indirect syscall */ + {AC(exit_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)exit, munge_w, munge_d, _SYSCALL_RET_NONE}, /* 1 = exit */ + {0, _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)fork, NULL, NULL, _SYSCALL_RET_INT_T}, /* 2 = fork */ + {AC(read_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)read, munge_www, munge_ddd, _SYSCALL_RET_SSIZE_T}, /* 3 = read */ + {AC(write_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)write, munge_www, munge_ddd, _SYSCALL_RET_SSIZE_T}, /* 4 = write */ + {AC(open_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)open, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 5 = open */ + {AC(close_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)close, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 6 = close */ + {AC(wait4_args), _SYSCALL_CANCEL_PRE, KERNEL_FUNNEL, (sy_call_t *)wait4, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 7 = wait4 */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 8 = nosys old creat */ + {AC(link_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)link, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 9 = link */ + {AC(unlink_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)unlink, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 10 = unlink */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 11 = nosys old execv */ + {AC(chdir_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)chdir, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 12 = chdir */ + {AC(fchdir_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)fchdir, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 13 = fchdir */ + {AC(mknod_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)mknod, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 14 = mknod */ + {AC(chmod_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)chmod, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 15 = chmod */ + {AC(chown_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)chown, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 16 = chown */ + {AC(obreak_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)obreak, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 17 = obreak old break */ #if COMPAT_GETFSSTAT - syss(ogetfsstat, 3), /* 18 = ogetfsstat */ -#else - syss(getfsstat, 3), /* 18 = getfsstat */ -#endif - compat(lseek,3), /* 19 = old lseek */ - sysp(getpid,0), /* 20 = getpid */ - syss(nosys, 0), /* 21 was obsolete mount */ - syss(nosys, 0), /* 22 was obsolete umount */ - syss(setuid,1), /* 23 = setuid */ - sysp(getuid,0), /* 24 = getuid */ - sysp(geteuid,0), /* 25 = geteuid */ - syss(ptrace,4), /* 26 = ptrace */ - sysnets(recvmsg,3), /* 27 = recvmsg */ - sysnets(sendmsg,3), /* 28 = sendmsg */ - sysnets(recvfrom,6), /* 29 = recvfrom */ - sysnets(accept,3), /* 30 = accept */ - sysnets(getpeername,3), /* 31 = getpeername */ - sysnets(getsockname,3), /* 32 = getsockname */ - syss(access,2), /* 33 = access */ - syss(chflags,2), /* 34 = chflags */ - syss(fchflags,2), /* 35 = fchflags */ - syss(sync,0), /* 36 = sync */ - syss(kill,2), /* 37 = kill */ - compat(stat,2), /* 38 = old stat */ - sysp(getppid,0), /* 39 = getppid */ - compat(lstat,2), /* 40 = old lstat */ - syss(dup,1), /* 41 = dup */ - syss(pipe,0), /* 42 = pipe */ - sysp(getegid,0), /* 43 = getegid */ - syss(profil,4), /* 44 = profil */ - syss(ktrace,4), /* 45 = ktrace */ - syss(sigaction,3), /* 
46 = sigaction */ - sysp(getgid,0), /* 47 = getgid */ - syss(sigprocmask,3), /* 48 = sigprocmask */ - syss(getlogin,2), /* 49 = getlogin */ - syss(setlogin,1), /* 50 = setlogin */ - syss(acct,1), /* 51 = turn acct off/on */ - syss(sigpending,1), /* 52 = sigpending */ - syss(sigaltstack,2), /* 53 = sigaltstack */ - syss(ioctl,3), /* 54 = ioctl */ - syss(reboot,2), /* 55 = reboot */ - syss(revoke,1), /* 56 = revoke */ - syss(symlink,2), /* 57 = symlink */ - syss(readlink,3), /* 58 = readlink */ - syss(execve,3), /* 59 = execve */ - syss(umask,1), /* 60 = umask */ - syss(chroot,1), /* 61 = chroot */ - compat(fstat,2), /* 62 = old fstat */ - syss(nosys,0), /* 63 = used internally, reserved */ - compat(getpagesize,0), /* 64 = old getpagesize */ - syss(msync,3), /* 65 = msync */ - syss(vfork,0), /* 66 = vfork */ - syss(nosys,0), /* 67 was obsolete vread */ - syss(nosys,0), /* 68 was obsolete vwrite */ - syss(sbrk,1), /* 69 = sbrk */ - syss(sstk,1), /* 70 = sstk */ - compat(smmap,6), /* 71 = old mmap */ - syss(ovadvise,1), /* 72 = old vadvise */ - sysnofnl(munmap,2), /* 73 = munmap */ - syss(mprotect,3), /* 74 = mprotect */ - syss(madvise,3), /* 75 = madvise */ - syss(nosys,0), /* 76 was obsolete vhangup */ - syss(nosys,0), /* 77 was obsolete vlimit */ - syss(mincore,3), /* 78 = mincore */ - sysp(getgroups,2), /* 79 = getgroups */ - sysp(setgroups,2), /* 80 = setgroups */ - sysp(getpgrp,0), /* 81 = getpgrp */ - sysp(setpgid,2), /* 82 = setpgid */ - syss(setitimer,3), /* 83 = setitimer */ - compat(wait,1), /* 84 = old wait */ - syss(swapon,1), /* 85 = swapon */ - syss(getitimer,2), /* 86 = getitimer */ - compat(gethostname,2), /* 87 = old gethostname */ - compat(sethostname,2), /* 88 = old sethostname */ - sysp(getdtablesize, 0), /* 89 getdtablesize */ - syss(dup2,2), /* 90 = dup2 */ - syss(nosys,0), /* 91 was obsolete getdopt */ - syss(fcntl,3), /* 92 = fcntl */ - syss(select,5), /* 93 = select */ - syss(nosys,0), /* 94 was obsolete setdopt */ - syss(fsync,1), /* 95 = fsync */ - sysp(setpriority,3), /* 96 = setpriority */ - sysnets(socket,3), /* 97 = socket */ - sysnets(connect,3), /* 98 = connect */ - comaptnet(accept,3), /* 99 = accept */ - sysp(getpriority,2), /* 100 = getpriority */ - comaptnet(send,4), /* 101 = old send */ - comaptnet(recv,4), /* 102 = old recv */ -#ifdef __ppc__ - syss(osigreturn,1), /* 103 = sigreturn ; compat for jaguar*/ + {AC(ogetfsstat_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)ogetfsstat, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 18 = ogetfsstat */ #else - syss(sigreturn,1), /* 103 = sigreturn */ + {AC(getfsstat_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)getfsstat, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 18 = getfsstat */ #endif - sysnets(bind,3), /* 104 = bind */ - sysnets(setsockopt,5), /* 105 = setsockopt */ - sysnets(listen,2), /* 106 = listen */ - syss(nosys,0), /* 107 was vtimes */ - compat(sigvec,3), /* 108 = sigvec */ - compat(sigblock,1), /* 109 = sigblock */ - compat(sigsetmask,1), /* 110 = sigsetmask */ - syss(sigsuspend,1), /* 111 = sigpause */ - compat(sigstack,2), /* 112 = sigstack */ - comaptnet(recvmsg,3), /* 113 = recvmsg */ - comaptnet(sendmsg,3), /* 114 = sendmsg */ - syss(nosys,0), /* 115 = old vtrace */ - -/* - * N.B. 
- * The argument count numbers in this table are actually - * the number of UInt32 words that comprise the arguments - * not the number of arguments - * - * This value is not currently used on PPC but Intel Darwin - * does use it and will not work correctly if the values - * are wrong - */ - + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 19 = nosys old lseek */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)getpid, NULL, NULL, _SYSCALL_RET_INT_T}, /* 20 = getpid */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 21 = nosys old mount */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 22 = nosys old umount */ + {AC(setuid_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)setuid, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 23 = setuid */ + {0, _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)getuid, NULL, NULL, _SYSCALL_RET_INT_T}, /* 24 = getuid */ + {0, _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)geteuid, NULL, NULL, _SYSCALL_RET_INT_T}, /* 25 = geteuid */ + {AC(ptrace_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)ptrace, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 26 = ptrace */ + {AC(recvmsg_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)recvmsg, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 27 = recvmsg */ + {AC(sendmsg_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)sendmsg, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 28 = sendmsg */ + {AC(recvfrom_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)recvfrom, munge_wwwwww, munge_dddddd, _SYSCALL_RET_INT_T}, /* 29 = recvfrom */ + {AC(accept_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)accept, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 30 = accept */ + {AC(getpeername_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)getpeername, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 31 = getpeername */ + {AC(getsockname_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)getsockname, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 32 = getsockname */ + {AC(access_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)access, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 33 = access */ + {AC(chflags_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)chflags, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 34 = chflags */ + {AC(fchflags_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)fchflags, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 35 = fchflags */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)sync, NULL, NULL, _SYSCALL_RET_INT_T}, /* 36 = sync */ + {AC(kill_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)kill, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 37 = kill */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 38 = nosys old stat */ + {0, _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)getppid, NULL, NULL, _SYSCALL_RET_INT_T}, /* 39 = getppid */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 40 = nosys old lstat */ + {AC(dup_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)dup, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 41 = dup */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)pipe, NULL, NULL, _SYSCALL_RET_INT_T}, /* 42 = pipe */ + {0, _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)getegid, NULL, NULL, _SYSCALL_RET_INT_T}, /* 43 = getegid */ + {AC(profil_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)profil, 
munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 44 = profil */ + {AC(ktrace_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)ktrace, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 45 = ktrace */ + {AC(sigaction_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)sigaction, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 46 = sigaction */ + {0, _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)getgid, NULL, NULL, _SYSCALL_RET_INT_T}, /* 47 = getgid */ + {AC(sigprocmask_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)sigprocmask, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 48 = sigprocmask */ + {AC(getlogin_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)getlogin, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 49 = getlogin */ + {AC(setlogin_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)setlogin, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 50 = setlogin */ + {AC(acct_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)acct, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 51 = acct */ + {AC(sigpending_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)sigpending, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 52 = sigpending */ + {AC(sigaltstack_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)sigaltstack, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 53 = sigaltstack */ + {AC(ioctl_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)ioctl, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 54 = ioctl */ + {AC(reboot_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)reboot, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 55 = reboot */ + {AC(revoke_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)revoke, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 56 = revoke */ + {AC(symlink_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)symlink, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 57 = symlink */ + {AC(readlink_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)readlink, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 58 = readlink */ + {AC(execve_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)execve, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 59 = execve */ + {AC(umask_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)umask, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 60 = umask */ + {AC(chroot_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)chroot, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 61 = chroot */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 62 = nosys old fstat */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 63 = nosys used internally , reserved */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 64 = nosys old getpagesize */ + {AC(msync_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)msync, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 65 = msync */ + {0, _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)vfork, NULL, NULL, _SYSCALL_RET_INT_T}, /* 66 = vfork */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 67 = nosys old vread */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 68 = nosys old vwrite */ + {AC(sbrk_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)sbrk, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 69 = sbrk */ + {AC(sstk_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)sstk, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 70 = sstk */ + {0, 
_SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 71 = nosys old mmap */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)ovadvise, NULL, NULL, _SYSCALL_RET_INT_T}, /* 72 = ovadvise old vadvise */ + {AC(munmap_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)munmap, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 73 = munmap */ + {AC(mprotect_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)mprotect, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 74 = mprotect */ + {AC(madvise_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)madvise, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 75 = madvise */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 76 = nosys old vhangup */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 77 = nosys old vlimit */ + {AC(mincore_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)mincore, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 78 = mincore */ + {AC(getgroups_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)getgroups, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 79 = getgroups */ + {AC(setgroups_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)setgroups, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 80 = setgroups */ + {0, _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)getpgrp, NULL, NULL, _SYSCALL_RET_INT_T}, /* 81 = getpgrp */ + {AC(setpgid_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)setpgid, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 82 = setpgid */ + {AC(setitimer_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)setitimer, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 83 = setitimer */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 84 = nosys old wait */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)swapon, NULL, NULL, _SYSCALL_RET_INT_T}, /* 85 = swapon */ + {AC(getitimer_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)getitimer, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 86 = getitimer */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 87 = nosys old gethostname */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 88 = nosys old sethostname */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)getdtablesize, NULL, NULL, _SYSCALL_RET_INT_T}, /* 89 = getdtablesize */ + {AC(dup2_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)dup2, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 90 = dup2 */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 91 = nosys old getdopt */ + {AC(fcntl_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)fcntl, munge_wws, munge_ddd, _SYSCALL_RET_INT_T}, /* 92 = fcntl */ + {AC(select_args), _SYSCALL_CANCEL_PRE, KERNEL_FUNNEL, (sy_call_t *)select, munge_wwwww, munge_ddddd, _SYSCALL_RET_INT_T}, /* 93 = select */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 94 = nosys old setdopt */ + {AC(fsync_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)fsync, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 95 = fsync */ + {AC(setpriority_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)setpriority, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 96 = setpriority */ + {AC(socket_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)socket, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 97 = socket */ + 
{AC(connect_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)connect, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 98 = connect */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 99 = nosys old accept */ + {AC(getpriority_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)getpriority, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 100 = getpriority */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 101 = nosys old send */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 102 = nosys old recv */ #ifdef __ppc__ - sysnofnl(ppc_gettimeofday,2), /* 116 = gettimeofday */ -#else - sysnofnl(gettimeofday,2), /* 116 = gettimeofday */ -#endif - sysp(getrusage,2), /* 117 = getrusage */ - sysnets(getsockopt,5), /* 118 = getsockopt */ - syss(nosys,0), /* 119 = old resuba */ - sysp(readv,3), /* 120 = readv */ - sysp(writev,3), /* 121 = writev */ - syss(settimeofday,2), /* 122 = settimeofday */ - syss(fchown,3), /* 123 = fchown */ - syss(fchmod,2), /* 124 = fchmod */ - comaptnet(recvfrom,6), /* 125 = recvfrom */ - compat(setreuid,2), /* 126 = setreuid */ - compat(setregid,2), /* 127 = setregid */ - syss(rename,2), /* 128 = rename */ - compat(truncate,2), /* 129 = old truncate */ - compat(ftruncate,2), /* 130 = ftruncate */ - syss(flock,2), /* 131 = flock */ - syss(mkfifo,2), /* 132 = mkfifo */ - sysnets(sendto,6), /* 133 = sendto */ - sysnets(shutdown,2), /* 134 = shutdown */ - sysnets(socketpair,4), /* 135 = socketpair */ - syss(mkdir,2), /* 136 = mkdir */ - syss(rmdir,1), /* 137 = rmdir */ - syss(utimes,2), /* 138 = utimes */ - syss(futimes,2), /* 139 = futimes */ - syss(adjtime,2), /* 140 = adjtime */ - comaptnet(getpeername,3),/* 141 = getpeername */ - compat(gethostid,0), /* 142 = old gethostid */ - sysp(nosys,0), /* 143 = old sethostid */ - compat(getrlimit,2), /* 144 = old getrlimit */ - compat(setrlimit,2), /* 145 = old setrlimit */ - compat(killpg,2), /* 146 = old killpg */ - syss(setsid,0), /* 147 = setsid */ - syss(nosys,0), /* 148 was setquota */ - syss(nosys,0), /* 149 was qquota */ - comaptnet(getsockname,3),/* 150 = getsockname */ - syss(getpgid,1), /* 151 = getpgid */ - sysp(setprivexec,1),/* 152 = setprivexec */ -#ifdef DOUBLE_ALIGN_PARAMS - syss(pread,6), /* 153 = pread */ - syss(pwrite,6), /* 154 = pwrite */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 103 = nosys old sigreturn */ #else - syss(pread,5), /* 153 = pread */ - syss(pwrite,5), /* 154 = pwrite */ + {AC(sigreturn_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL|UNSAFE_64BIT, (sy_call_t *)sigreturn, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 103 = sigreturn */ #endif - syss(nfssvc,2), /* 155 = nfs_svc */ - compat(getdirentries,4), /* 156 = old getdirentries */ - syss(statfs, 2), /* 157 = statfs */ - syss(fstatfs, 2), /* 158 = fstatfs */ - syss(unmount, 2), /* 159 = unmount */ - syss(nosys,0), /* 160 was async_daemon */ - syss(getfh,2), /* 161 = get file handle */ - compat(getdomainname,2), /* 162 = getdomainname */ - compat(setdomainname,2), /* 163 = setdomainname */ - syss(nosys,0), /* 164 */ -#if QUOTA - syss(quotactl, 4), /* 165 = quotactl */ -#else /* QUOTA */ - syss(nosys, 0), /* 165 = not configured */ -#endif /* QUOTA */ - syss(nosys,0), /* 166 was exportfs */ - syss(mount, 4), /* 167 = mount */ - syss(nosys,0), /* 168 was ustat */ - syss(nosys,0), /* 169 = nosys */ - syss(nosys,0), /* 170 was table */ - compat(wait3,3), /* 171 = old 
wait3 */ - syss(nosys,0), /* 172 was rpause */ - syss(nosys,0), /* 173 = nosys */ - syss(nosys,0), /* 174 was getdents */ - syss(nosys,0), /* 175 was gc_control */ - syss(add_profil,4), /* 176 = add_profil */ - syss(nosys,0), /* 177 */ - syss(nosys,0), /* 178 */ - syss(nosys,0), /* 179 */ - sysnofnl(kdebug_trace,6), /* 180 */ - syss(setgid,1), /* 181 */ - syss(setegid,1), /* 182 */ - syss(seteuid,1), /* 183 */ + {AC(bind_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)bind, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 104 = bind */ + {AC(setsockopt_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)setsockopt, munge_wwwww, munge_ddddd, _SYSCALL_RET_INT_T}, /* 105 = setsockopt */ + {AC(listen_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)listen, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 106 = listen */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 107 = nosys old vtimes */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 108 = nosys old sigvec */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 109 = nosys old sigblock */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 110 = nosys old sigsetmask */ + {AC(sigsuspend_args), _SYSCALL_CANCEL_PRE, KERNEL_FUNNEL, (sy_call_t *)sigsuspend, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 111 = sigsuspend */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 112 = nosys old sigstack */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 113 = nosys old recvmsg */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 114 = nosys old sendmsg */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 115 = nosys old vtrace */ #ifdef __ppc__ - syss(sigreturn, 2), /* 184 = nosys */ + {AC(ppc_gettimeofday_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)ppc_gettimeofday, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 116 = ppc_gettimeofday */ #else - syss(nosys,0), /* 184 = nosys */ + {AC(gettimeofday_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)gettimeofday, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 116 = gettimeofday */ #endif - syss(nosys,0), /* 185 = nosys */ - syss(nosys,0), /* 186 = nosys */ - syss(nosys,0), /* 187 = nosys */ - syss(stat,2), /* 188 = stat */ - syss(fstat,2), /* 189 = fstat */ - syss(lstat,2), /* 190 = lstat */ - syss(pathconf,2), /* 191 = pathconf */ - syss(fpathconf,2), /* 192 = fpathconf */ - -/* - * N.B. 
- * The argument count numbers in this table are actually - * the number of UInt32 words that comprise the arguments - * not the number of arguments - * - * This value is not currently used on PPC but Intel Darwin - * does use it and will not work correctly if the values - * are wrong - */ - -#if COMPAT_GETFSSTAT - syss(getfsstat,3), /* 193 = getfsstat */ + {AC(getrusage_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)getrusage, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 117 = getrusage */ + {AC(getsockopt_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)getsockopt, munge_wwwww, munge_ddddd, _SYSCALL_RET_INT_T}, /* 118 = getsockopt */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 119 = nosys old resuba */ + {AC(readv_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)readv, munge_www, munge_ddd, _SYSCALL_RET_SSIZE_T}, /* 120 = readv */ + {AC(writev_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)writev, munge_www, munge_ddd, _SYSCALL_RET_SSIZE_T}, /* 121 = writev */ + {AC(settimeofday_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)settimeofday, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 122 = settimeofday */ + {AC(fchown_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)fchown, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 123 = fchown */ + {AC(fchmod_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)fchmod, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 124 = fchmod */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 125 = nosys old recvfrom */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 126 = nosys old setreuid */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 127 = nosys old setregid */ + {AC(rename_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)rename, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 128 = rename */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 129 = nosys old truncate */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 130 = nosys old ftruncate */ + {AC(flock_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)flock, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 131 = flock */ + {AC(mkfifo_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)mkfifo, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 132 = mkfifo */ + {AC(sendto_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)sendto, munge_wwwwww, munge_dddddd, _SYSCALL_RET_INT_T}, /* 133 = sendto */ + {AC(shutdown_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)shutdown, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 134 = shutdown */ + {AC(socketpair_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)socketpair, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 135 = socketpair */ + {AC(mkdir_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)mkdir, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 136 = mkdir */ + {AC(rmdir_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)rmdir, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 137 = rmdir */ + {AC(utimes_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)utimes, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 138 = utimes */ + {AC(futimes_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)futimes, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 139 = futimes */ + {AC(adjtime_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)adjtime, munge_ww, munge_dd, 
_SYSCALL_RET_INT_T}, /* 140 = adjtime */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 141 = nosys old getpeername */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 142 = nosys old gethostid */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 143 = nosys old sethostid */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 144 = nosys old getrlimit */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 145 = nosys old setrlimit */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 146 = nosys old killpg */ + {0, _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)setsid, NULL, NULL, _SYSCALL_RET_INT_T}, /* 147 = setsid */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 148 = nosys old setquota */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 149 = nosys old qquota */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 150 = nosys old getsockname */ + {AC(getpgid_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)getpgid, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 151 = getpgid */ + {AC(setprivexec_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)setprivexec, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 152 = setprivexec */ + {AC(pread_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)pread, munge_wwwl, munge_dddd, _SYSCALL_RET_SSIZE_T}, /* 153 = pread */ + {AC(pwrite_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)pwrite, munge_wwwl, munge_dddd, _SYSCALL_RET_SSIZE_T}, /* 154 = pwrite */ +#if NFSSERVER + {AC(nfssvc_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)nfssvc, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 155 = nfssvc */ #else - syss(nosys,0), /* 193 is unused */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 155 = nosys */ #endif - syss(getrlimit,2), /* 194 = getrlimit */ - syss(setrlimit,2), /* 195 = setrlimit */ - syss(getdirentries,4), /* 196 = getdirentries */ -#ifdef DOUBLE_ALIGN_PARAMS - syss(mmap,8), /* 197 = mmap */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 156 = nosys old getdirentries */ + {AC(statfs_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)statfs, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 157 = statfs */ + {AC(fstatfs_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)fstatfs, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 158 = fstatfs */ + {AC(unmount_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)unmount, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 159 = unmount */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 160 = nosys old async_daemon */ +#if NFSCLIENT + {AC(getfh_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)getfh, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 161 = getfh */ #else - syss(mmap,7), /* 197 = mmap */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 161 = nosys */ #endif - syss(nosys,0), /* 198 = __syscall */ -#ifdef DOUBLE_ALIGN_PARAMS - syss(lseek,5), /* 199 = lseek */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 162 = nosys old getdomainname */ + {0, 
_SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 163 = nosys old setdomainname */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 164 = nosys */ + {AC(quotactl_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)quotactl, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 165 = quotactl */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 166 = nosys old exportfs */ + {AC(mount_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)mount, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 167 = mount */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 168 = nosys old ustat */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 169 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_NONE}, /* 170 = table old table */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 171 = nosys old wait3 */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 172 = nosys old rpause */ + {AC(waitid_args), _SYSCALL_CANCEL_PRE, KERNEL_FUNNEL, (sy_call_t *)waitid, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 173 = waitid */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 174 = nosys old getdents */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 175 = nosys old gc_control */ + {AC(add_profil_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)add_profil, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 176 = add_profil */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 177 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 178 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 179 = nosys */ + {AC(kdebug_trace_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)kdebug_trace, munge_wwwwww, munge_dddddd, _SYSCALL_RET_INT_T}, /* 180 = kdebug_trace */ + {AC(setgid_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)setgid, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 181 = setgid */ + {AC(setegid_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)setegid, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 182 = setegid */ + {AC(seteuid_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)seteuid, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 183 = seteuid */ +#ifdef __ppc__ + {AC(sigreturn_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)sigreturn, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 184 = sigreturn */ #else - syss(lseek,4), /* 199 = lseek */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 184 = nosys */ #endif -#ifdef DOUBLE_ALIGN_PARAMS - syss(truncate,4), /* 200 = truncate */ - syss(ftruncate,4), /* 201 = ftruncate */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 185 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 186 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 187 = nosys */ + {AC(stat_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)stat, munge_ww, munge_dd, 
_SYSCALL_RET_INT_T}, /* 188 = stat */ + {AC(fstat_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)fstat, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 189 = fstat */ + {AC(lstat_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)lstat, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 190 = lstat */ + {AC(pathconf_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)pathconf, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 191 = pathconf */ + {AC(fpathconf_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)fpathconf, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 192 = fpathconf */ +#if COMPAT_GETFSSTAT + {AC(getfsstat_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)getfsstat, munge_wsw, munge_ddd, _SYSCALL_RET_INT_T}, /* 193 = getfsstat */ #else - syss(truncate,3), /* 200 = truncate */ - syss(ftruncate,3), /* 201 = ftruncate */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 193 = nosys */ #endif - syss(__sysctl,6), /* 202 = __sysctl */ - sysp(mlock, 2), /* 203 = mlock */ - syss(munlock, 2), /* 204 = munlock */ - syss(undelete,1), /* 205 = undelete */ -#if NETAT - sysnets(ATsocket,1), /* 206 = ATsocket */ - sysnets(ATgetmsg,4), /* 207 = ATgetmsg*/ - sysnets(ATputmsg,4), /* 208 = ATputmsg*/ - sysnets(ATPsndreq,4), /* 209 = ATPsndreq*/ - sysnets(ATPsndrsp,4), /* 210 = ATPsndrsp*/ - sysnets(ATPgetreq,3), /* 211 = ATPgetreq*/ - sysnets(ATPgetrsp,2), /* 212 = ATPgetrsp*/ - syss(nosys,0), /* 213 = Reserved for AppleTalk */ - syss(kqueue_from_portset_np,1), /* 214 = kqueue_from_portset_np */ - syss(kqueue_portset_np,1), /* 215 = kqueue_portset_np */ + {AC(getrlimit_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)getrlimit, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 194 = getrlimit */ + {AC(setrlimit_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)setrlimit, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 195 = setrlimit */ + {AC(getdirentries_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)getdirentries, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 196 = getdirentries */ + {AC(mmap_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)mmap, munge_wwwwwl, munge_dddddd, _SYSCALL_RET_ADDR_T}, /* 197 = mmap */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 198 = nosys __syscall */ + {AC(lseek_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)lseek, munge_wlw, munge_ddd, _SYSCALL_RET_OFF_T}, /* 199 = lseek */ + {AC(truncate_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)truncate, munge_wl, munge_dd, _SYSCALL_RET_INT_T}, /* 200 = truncate */ + {AC(ftruncate_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)ftruncate, munge_wl, munge_dd, _SYSCALL_RET_INT_T}, /* 201 = ftruncate */ + {AC(__sysctl_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)__sysctl, munge_wwwwww, munge_dddddd, _SYSCALL_RET_INT_T}, /* 202 = __sysctl */ + {AC(mlock_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)mlock, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 203 = mlock */ + {AC(munlock_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)munlock, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 204 = munlock */ + {AC(undelete_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)undelete, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 205 = undelete */ +#ifdef __ppc__ + {AC(ATsocket_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)ATsocket, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 206 = ATsocket */ + {AC(ATgetmsg_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)ATgetmsg, 
munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 207 = ATgetmsg */ + {AC(ATputmsg_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)ATputmsg, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 208 = ATputmsg */ + {AC(ATPsndreq_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)ATPsndreq, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 209 = ATPsndreq */ + {AC(ATPsndrsp_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)ATPsndrsp, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 210 = ATPsndrsp */ + {AC(ATPgetreq_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)ATPgetreq, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 211 = ATPgetreq */ + {AC(ATPgetrsp_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)ATPgetrsp, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 212 = ATPgetrsp */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 213 = nosys Reserved for AppleTalk */ #else - syss(nosys,0), /* 206 = Reserved for AppleTalk */ - syss(nosys,0), /* 207 = Reserved for AppleTalk */ - syss(nosys,0), /* 208 = Reserved for AppleTalk */ - syss(nosys,0), /* 209 = Reserved for AppleTalk */ - syss(nosys,0), /* 210 = Reserved for AppleTalk */ - syss(nosys,0), /* 211 = Reserved for AppleTalk */ - syss(nosys,0), /* 212 = Reserved for AppleTalk */ - syss(nosys,0), /* 213 = Reserved for AppleTalk */ - syss(nosys,0), /* 214 = Reserved for AppleTalk */ - syss(nosys,0), /* 215 = Reserved for AppleTalk */ -#endif /* NETAT */ - -/* - * System Calls 216 - 230 are reserved for calls to support HFS/HFS Plus - * file system semantics. Currently, we only use 215-227. The rest is - * for future expansion in anticipation of new MacOS APIs for HFS Plus. - * These calls are not conditionalized because while they are specific - * to HFS semantics, they are not specific to the HFS filesystem. - * We expect all filesystems to recognize the call and report that it is - * not supported or to actually implement it. - */ - -/* - * N.B.
- * The argument count numbers in this table are actually - * the number of UInt32 words that comprise the arguments - * not the number of arguments - * - * This value is not currently used on PPC but Intel Darwin - * does use it and will not work correctly if the values - * are wrong - */ - - syss(nosys,3), /* 216 = HFS make complex file call (multiple forks) */ - syss(nosys,2), /* 217 = HFS statv extended stat call for HFS */ - syss(nosys,2), /* 218 = HFS lstatv extended lstat call for HFS */ - syss(nosys,2), /* 219 = HFS fstatv extended fstat call for HFS */ - syss(getattrlist,5), /* 220 = HFS getattrlist get attribute list call */ - syss(setattrlist,5), /* 221 = HFS setattrlist set attribute list */ - syss(getdirentriesattr,8), /* 222 = HFS getdirentriesattr get directory attributes */ - syss(exchangedata,3), /* 223 = HFS exchangedata exchange file contents */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_NONE}, /* 206 = ATsocket */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_NONE}, /* 207 = ATgetmsg */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_NONE}, /* 208 = ATputmsg */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_NONE}, /* 209 = ATPsndreq */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_NONE}, /* 210 = ATPsndrsp */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_NONE}, /* 211 = ATPgetreq */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_NONE}, /* 212 = ATPgetrsp */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 213 = nosys Reserved for AppleTalk */ +#endif /* __ppc__ */ + {AC(kqueue_from_portset_np_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)kqueue_from_portset_np, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 214 = kqueue_from_portset_np */ + {AC(kqueue_portset_np_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)kqueue_portset_np, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 215 = kqueue_portset_np */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_NONE}, /* 216 = mkcomplex soon to be obsolete */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_NONE}, /* 217 = statv soon to be obsolete */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_NONE}, /* 218 = lstatv soon to be obsolete */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_NONE}, /* 219 = fstatv soon to be obsolete */ + {AC(getattrlist_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)getattrlist, munge_wwwww, munge_ddddd, _SYSCALL_RET_INT_T}, /* 220 = getattrlist */ + {AC(setattrlist_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)setattrlist, munge_wwwww, munge_ddddd, _SYSCALL_RET_INT_T}, /* 221 = setattrlist */ + {AC(getdirentriesattr_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)getdirentriesattr, munge_wwwwwwww, munge_dddddddd, _SYSCALL_RET_INT_T}, /* 222 = getdirentriesattr */ + {AC(exchangedata_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)exchangedata, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 223 = exchangedata */ #ifdef __APPLE_API_OBSOLETE - syss(checkuseraccess,6),/* 224 = HFS checkuseraccess check
access to a file */ + {AC(checkuseraccess_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)checkuseraccess, munge_wwwwww, munge_dddddd, _SYSCALL_RET_INT_T}, /* 224 = checkuseraccess */ #else - syss(nosys,6),/* 224 = HFS checkuseraccess check access to a file */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 224 = nosys HFS checkuseraccess check access to a file */ #endif /* __APPLE_API_OBSOLETE */ - syss(searchfs,6), /* 225 = HFS searchfs to implement catalog searching */ - syss(delete,1), /* 226 = private delete (Carbon semantics) */ - syss(copyfile,6), /* 227 = copyfile - originally for AFP */ - syss(nosys,0), /* 228 */ - syss(nosys,0), /* 229 */ - syss(nosys,0), /* 230 */ - sysnets(watchevent,2), /* 231 */ - sysnets(waitevent,2), /* 232 */ - sysnets(modwatch,2), /* 233 */ - syss(nosys,0), /* 234 */ - syss(nosys,0), /* 235 */ - syss(nosys,0), /* 236 */ - syss(nosys,0), /* 237 */ - syss(nosys,0), /* 238 */ - syss(nosys,0), /* 239 */ - syss(nosys,0), /* 240 */ - syss(nosys,0), /* 241 */ - syss(fsctl,4), /* 242 = fsctl */ - syss(nosys,0), /* 243 */ - syss(nosys,0), /* 244 */ - syss(nosys,0), /* 245 */ - syss(nosys,0), /* 246 */ - syss(nfsclnt,2), /* 247 = nfsclnt*/ - syss(fhopen,2), /* 248 = fhopen */ - syss(nosys,0), /* 249 */ - syss(minherit,3), /* 250 = minherit */ - syss(semsys,5), /* 251 = semsys */ - syss(msgsys,6), /* 252 = msgsys */ - syss(shmsys,4), /* 253 = shmsys */ - syss(semctl,4), /* 254 = semctl */ - syss(semget,3), /* 255 = semget */ - syss(semop,3), /* 256 = semop */ - syss(semconfig,1), /* 257 = semconfig */ - syss(msgctl,3), /* 258 = msgctl */ - syss(msgget,2), /* 259 = msgget */ - syss(msgsnd,4), /* 260 = msgsnd */ - syss(msgrcv,5), /* 261 = msgrcv */ - syss(shmat,3), /* 262 = shmat */ - syss(shmctl,3), /* 263 = shmctl */ - syss(shmdt,1), /* 264 = shmdt */ - syss(shmget,3), /* 265 = shmget */ - syss(shm_open,3), /* 266 = shm_open */ - syss(shm_unlink,1), /* 267 = shm_unlink */ - syss(sem_open,4), /* 268 = sem_open */ - syss(sem_close,1), /* 269 = sem_close */ - syss(sem_unlink,1), /* 270 = sem_unlink */ - syss(sem_wait,1), /* 271 = sem_wait */ - syss(sem_trywait,1), /* 272 = sem_trywait */ - syss(sem_post,1), /* 273 = sem_post */ - syss(sem_getvalue,2), /* 274 = sem_getvalue */ - syss(sem_init,3), /* 275 = sem_init */ - syss(sem_destroy,1), /* 276 = sem_destroy */ - syss(nosys,0), /* 277 */ - syss(nosys,0), /* 278 */ - syss(nosys,0), /* 279 */ - syss(nosys,0), /* 280 */ - syss(nosys,0), /* 281 */ - syss(nosys,0), /* 282 */ - syss(nosys,0), /* 283 */ - syss(nosys,0), /* 284 */ - syss(nosys,0), /* 285 */ - syss(nosys,0), /* 286 */ - syss(nosys,0), /* 287 */ - syss(nosys,0), /* 288 */ - syss(fmod_watch_enable, 1), /* 289 = fmod_watching */ - syss(fmod_watch, 4), /* 290 = fmod_watch */ - syss(nosys,0), /* 291 */ - syss(nosys,0), /* 292 */ - syss(nosys,0), /* 293 */ - syss(nosys,0), /* 294 */ - syss(nosys,0), /* 295 */ - syss(load_shared_file,7), /* 296 = load_shared_file */ - syss(reset_shared_file,3), /* 297 = reset_shared_file */ - syss(new_system_shared_regions,0), /* 298 = new_system_shared_regions */ - syss(nosys,0), /* 299 */ - syss(nosys,0), /* 300 */ - syss(nosys,0), /* 301 */ - syss(nosys,0), /* 302 */ - syss(nosys,0), /* 303 */ - syss(nosys,0), /* 304 */ - syss(nosys,0), /* 305 */ - syss(nosys,0), /* 306 */ - syss(nosys,0), /* 307 */ - syss(nosys,0), /* 308 */ - syss(nosys,0), /* 309 */ - syss(getsid,1), /* 310 = getsid */ - syss(nosys,0), /* 311 */ - syss(nosys,0), /* 312 */ - sysnofnl(aio_fsync,1),
/* 313 = aio_fsync */ - sysnofnl(aio_return,1), /* 314 = aio_return */ - sysnofnl(aio_suspend,3), /* 315 = aio_suspend */ - sysnofnl(aio_cancel,2), /* 316 = aio_cancel */ - sysnofnl(aio_error,1), /* 317 = aio_error */ - sysnofnl(aio_read,1), /* 318 = aio_read */ - sysnofnl(aio_write,1), /* 319 = aio_write */ - sysnofnl(lio_listio,4), /* 320 = lio_listio */ - syss(nosys,0), /* 321 */ - syss(nosys,0), /* 322 */ - syss(nosys,0), /* 323 */ - syss(mlockall,1), /* 324 = mlockall*/ - syss(munlockall,1), /* 325 = munlockall*/ - syss(nosys,0), /* 326 */ - sysp(issetugid,0), /* 327 = issetugid */ - syss(__pthread_kill,2), /* 328 */ - syss(pthread_sigmask,3), /* 329 */ - syss(sigwait,2), /* 330 */ - syss(__disable_threadsignal,1), /* 331 */ - syss(nosys,0), /* 332 */ - syss(nosys,0), /* 333 */ - syss(nosys,0), /* 334 */ - syss(utrace,2), /* 335 = utrace */ - syss(nosys,0), /* 336 */ - syss(nosys,0), /* 337 */ - syss(nosys,0), /* 338 */ - syss(nosys,0), /* 339 */ - syss(nosys,0), /* 340 */ - syss(nosys,0), /* 341 */ - syss(nosys,0), /* 342 */ - syss(nosys,0), /* 343 */ - syss(nosys,0), /* 344 */ - syss(nosys,0), /* 345 */ - syss(nosys,0), /* 346 */ - syss(nosys,0), /* 347 */ - syss(nosys,0), /* 348 */ - syss(nosys,0), /* 349 */ - syss(audit,2), /* 350 */ - syss(auditon,3), /* 351 */ - syss(nosys,0), /* 352 */ - syss(getauid,1), /* 353 */ - syss(setauid,1), /* 354 */ - syss(getaudit,1), /* 355 */ - syss(setaudit,1), /* 356 */ - syss(getaudit_addr,2), /* 357 */ - syss(setaudit_addr,2), /* 358 */ - syss(auditctl,1), /* 359 */ - syss(nosys,0), /* 360 */ - syss(nosys,0), /* 361 */ - syss(kqueue,0), /* 362 = kqueue */ - syss(kevent,6), /* 363 = kevent */ - syss(nosys,0), /* 364 */ - syss(nosys,0), /* 365 */ - syss(nosys,0), /* 366 */ - syss(nosys,0), /* 367 */ - syss(nosys,0), /* 368 */ - syss(nosys,0) /* 369 */ - -/* - * N.B. 
- * The argument count numbers in this table are actually - * the number of UInt32 words that comprise the arguments - * not the number of arguments - * - * This value is not currently used on PPC but Intel Darwin - * does use it and will not work correctly if the values - * are wrong - */ - + {AC(searchfs_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)searchfs, munge_wwwwww, munge_dddddd, _SYSCALL_RET_INT_T}, /* 225 = searchfs */ + {AC(delete_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)delete, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 226 = delete private delete ( Carbon semantics ) */ + {AC(copyfile_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)copyfile, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 227 = copyfile */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 228 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 229 = nosys */ + {AC(poll_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)poll, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 230 = poll */ + {AC(watchevent_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)watchevent, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 231 = watchevent */ + {AC(waitevent_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)waitevent, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 232 = waitevent */ + {AC(modwatch_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL|UNSAFE_64BIT, (sy_call_t *)modwatch, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 233 = modwatch */ + {AC(getxattr_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)getxattr, munge_wwwwww, munge_dddddd, _SYSCALL_RET_SSIZE_T}, /* 234 = getxattr */ + {AC(fgetxattr_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)fgetxattr, munge_wwwwww, munge_dddddd, _SYSCALL_RET_SSIZE_T}, /* 235 = fgetxattr */ + {AC(setxattr_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)setxattr, munge_wwwwww, munge_dddddd, _SYSCALL_RET_INT_T}, /* 236 = setxattr */ + {AC(fsetxattr_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)fsetxattr, munge_wwwwww, munge_dddddd, _SYSCALL_RET_INT_T}, /* 237 = fsetxattr */ + {AC(removexattr_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)removexattr, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 238 = removexattr */ + {AC(fremovexattr_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)fremovexattr, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 239 = fremovexattr */ + {AC(listxattr_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)listxattr, munge_wwww, munge_dddd, _SYSCALL_RET_SSIZE_T}, /* 240 = listxattr */ + {AC(flistxattr_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)flistxattr, munge_wwww, munge_dddd, _SYSCALL_RET_SSIZE_T}, /* 241 = flistxattr */ + {AC(fsctl_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)fsctl, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 242 = fsctl */ + {AC(initgroups_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)initgroups, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 243 = initgroups */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 244 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 245 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 246 = nosys */ +#if NFSCLIENT + {AC(nfsclnt_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)nfsclnt, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 247 = 
nfsclnt */ + {AC(fhopen_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)fhopen, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 248 = fhopen */ +#else + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 247 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 248 = nosys */ +#endif + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 249 = nosys */ + {AC(minherit_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)minherit, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 250 = minherit */ + {AC(semsys_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)semsys, munge_wwwww, munge_ddddd, _SYSCALL_RET_INT_T}, /* 251 = semsys */ + {AC(msgsys_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)msgsys, munge_wwwww, munge_ddddd, _SYSCALL_RET_INT_T}, /* 252 = msgsys */ + {AC(shmsys_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)shmsys, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 253 = shmsys */ + {AC(semctl_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)semctl, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 254 = semctl */ + {AC(semget_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)semget, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 255 = semget */ + {AC(semop_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)semop, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 256 = semop */ + {AC(semconfig_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)semconfig, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 257 = semconfig */ + {AC(msgctl_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)msgctl, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 258 = msgctl */ + {AC(msgget_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)msgget, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 259 = msgget */ + {AC(msgsnd_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)msgsnd, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 260 = msgsnd */ + {AC(msgrcv_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)msgrcv, munge_wwwsw, munge_ddddd, _SYSCALL_RET_SSIZE_T}, /* 261 = msgrcv */ + {AC(shmat_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)shmat, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 262 = shmat */ + {AC(shmctl_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)shmctl, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 263 = shmctl */ + {AC(shmdt_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)shmdt, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 264 = shmdt */ + {AC(shmget_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)shmget, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 265 = shmget */ + {AC(shm_open_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)shm_open, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 266 = shm_open */ + {AC(shm_unlink_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)shm_unlink, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 267 = shm_unlink */ + {AC(sem_open_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)sem_open, munge_wwww, munge_dddd, _SYSCALL_RET_ADDR_T}, /* 268 = sem_open */ + {AC(sem_close_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)sem_close, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 269 = sem_close */ + {AC(sem_unlink_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)sem_unlink, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 270 = sem_unlink */ + {AC(sem_wait_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)sem_wait, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 271 = sem_wait */ + 
{AC(sem_trywait_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)sem_trywait, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 272 = sem_trywait */ + {AC(sem_post_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)sem_post, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 273 = sem_post */ + {AC(sem_getvalue_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)sem_getvalue, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 274 = sem_getvalue */ + {AC(sem_init_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)sem_init, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 275 = sem_init */ + {AC(sem_destroy_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)sem_destroy, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 276 = sem_destroy */ + {AC(open_extended_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)open_extended, munge_wwwwww, munge_dddddd, _SYSCALL_RET_INT_T}, /* 277 = open_extended */ + {AC(umask_extended_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)umask_extended, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 278 = umask_extended */ + {AC(stat_extended_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)stat_extended, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 279 = stat_extended */ + {AC(lstat_extended_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)lstat_extended, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 280 = lstat_extended */ + {AC(fstat_extended_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)fstat_extended, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 281 = fstat_extended */ + {AC(chmod_extended_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)chmod_extended, munge_wwwww, munge_ddddd, _SYSCALL_RET_INT_T}, /* 282 = chmod_extended */ + {AC(fchmod_extended_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)fchmod_extended, munge_wwwww, munge_ddddd, _SYSCALL_RET_INT_T}, /* 283 = fchmod_extended */ + {AC(access_extended_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)access_extended, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 284 = access_extended */ + {AC(settid_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)settid, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 285 = settid */ + {AC(gettid_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)gettid, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 286 = gettid */ + {AC(setsgroups_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)setsgroups, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 287 = setsgroups */ + {AC(getsgroups_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)getsgroups, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 288 = getsgroups */ + {AC(setwgroups_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)setwgroups, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 289 = setwgroups */ + {AC(getwgroups_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)getwgroups, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 290 = getwgroups */ + {AC(mkfifo_extended_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)mkfifo_extended, munge_wwwww, munge_ddddd, _SYSCALL_RET_INT_T}, /* 291 = mkfifo_extended */ + {AC(mkdir_extended_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)mkdir_extended, munge_wwwww, munge_ddddd, _SYSCALL_RET_INT_T}, /* 292 = mkdir_extended */ + {AC(identitysvc_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)identitysvc, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 293 = identitysvc */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 294 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, 
_SYSCALL_RET_INT_T}, /* 295 = nosys */ + {AC(load_shared_file_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL|UNSAFE_64BIT, (sy_call_t *)load_shared_file, munge_wwwwwww, munge_ddddddd, _SYSCALL_RET_INT_T}, /* 296 = load_shared_file */ + {AC(reset_shared_file_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL|UNSAFE_64BIT, (sy_call_t *)reset_shared_file, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 297 = reset_shared_file */ + {0, _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)new_system_shared_regions, NULL, NULL, _SYSCALL_RET_INT_T}, /* 298 = new_system_shared_regions */ + {AC(shared_region_map_file_np_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL|UNSAFE_64BIT, (sy_call_t *)shared_region_map_file_np, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 299 = shared_region_map_file_np */ + {AC(shared_region_make_private_np_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL|UNSAFE_64BIT, (sy_call_t *)shared_region_make_private_np, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 300 = shared_region_make_private_np */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 301 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 302 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 303 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 304 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 305 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 306 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 307 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 308 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 309 = nosys */ + {AC(getsid_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)getsid, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 310 = getsid */ + {AC(settid_with_pid_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)settid_with_pid, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 311 = settid_with_pid */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 312 = nosys */ + {AC(aio_fsync_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)aio_fsync, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 313 = aio_fsync */ + {AC(aio_return_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)aio_return, munge_w, munge_d, _SYSCALL_RET_SSIZE_T}, /* 314 = aio_return */ + {AC(aio_suspend_args), _SYSCALL_CANCEL_PRE, NO_FUNNEL, (sy_call_t *)aio_suspend, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 315 = aio_suspend */ + {AC(aio_cancel_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)aio_cancel, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 316 = aio_cancel */ + {AC(aio_error_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)aio_error, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 317 = aio_error */ + {AC(aio_read_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)aio_read, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 318 = aio_read */ + {AC(aio_write_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)aio_write, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 319 = aio_write */ + {AC(lio_listio_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)lio_listio, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T}, /* 320 = lio_listio */ 
+ {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 321 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 322 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 323 = nosys */ + {AC(mlockall_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)mlockall, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 324 = mlockall */ + {AC(munlockall_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)munlockall, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 325 = munlockall */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 326 = nosys */ + {0, _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)issetugid, NULL, NULL, _SYSCALL_RET_INT_T}, /* 327 = issetugid */ + {AC(__pthread_kill_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)__pthread_kill, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 328 = __pthread_kill */ + {AC(pthread_sigmask_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)pthread_sigmask, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 329 = pthread_sigmask */ + {AC(sigwait_args), _SYSCALL_CANCEL_PRE, KERNEL_FUNNEL, (sy_call_t *)sigwait, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 330 = sigwait */ + {AC(__disable_threadsignal_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)__disable_threadsignal, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 331 = __disable_threadsignal */ + {AC(__pthread_markcancel_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)__pthread_markcancel, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 332 = __pthread_markcancel */ + {AC(__pthread_canceled_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)__pthread_canceled, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 333 = __pthread_canceled */ + {AC(__semwait_signal_args), _SYSCALL_CANCEL_POST, NO_FUNNEL, (sy_call_t *)__semwait_signal, munge_wwwwww, munge_dddddd, _SYSCALL_RET_INT_T}, /* 334 = __semwait_signal */ + {AC(utrace_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)utrace, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 335 = utrace */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 336 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 337 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 338 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 339 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 340 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 341 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 342 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 343 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 344 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 345 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 346 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 347 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, 
_SYSCALL_RET_INT_T}, /* 348 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 349 = nosys */ + {AC(audit_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)audit, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 350 = audit */ + {AC(auditon_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)auditon, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 351 = auditon */ + {0, _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 352 = nosys */ + {AC(getauid_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)getauid, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 353 = getauid */ + {AC(setauid_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)setauid, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 354 = setauid */ + {AC(getaudit_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)getaudit, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 355 = getaudit */ + {AC(setaudit_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)setaudit, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 356 = setaudit */ + {AC(getaudit_addr_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)getaudit_addr, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 357 = getaudit_addr */ + {AC(setaudit_addr_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)setaudit_addr, munge_ww, munge_dd, _SYSCALL_RET_INT_T}, /* 358 = setaudit_addr */ + {AC(auditctl_args), _SYSCALL_CANCEL_NONE, KERNEL_FUNNEL, (sy_call_t *)auditctl, munge_w, munge_d, _SYSCALL_RET_INT_T}, /* 359 = auditctl */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 360 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 361 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)kqueue, NULL, NULL, _SYSCALL_RET_INT_T}, /* 362 = kqueue */ + {AC(kevent_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)kevent, munge_wwwwww, munge_dddddd, _SYSCALL_RET_INT_T}, /* 363 = kevent */ + {AC(lchown_args), _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)lchown, munge_www, munge_ddd, _SYSCALL_RET_INT_T}, /* 364 = lchown */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 365 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 366 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 367 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 368 = nosys */ + {0, _SYSCALL_CANCEL_NONE, NO_FUNNEL, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T}, /* 369 = nosys */ }; int nsysent = sizeof(sysent) / sizeof(sysent[0]); diff --git a/bsd/kern/kdebug.c b/bsd/kern/kdebug.c index afa4305d1..381b74fe2 100644 --- a/bsd/kern/kdebug.c +++ b/bsd/kern/kdebug.c @@ -1,7 +1,7 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* - * @APPLE_LICENSE_HEADER_START@ + * @Apple_LICENSE_HEADER_START@ * * The contents of this file constitute Original Code as defined in and * are subject to the Apple Public Source License Version 1.1 (the @@ -22,19 +22,20 @@ #include <machine/spl.h> +#include <sys/errno.h> +#include <sys/param.h> +#include <sys/proc_internal.h> +#include <sys/vm.h> +#include <sys/sysctl.h> +#include <sys/kdebug.h> +#include <sys/sysproto.h> + #define HZ 100 #include <mach/clock_types.h> #include <mach/mach_types.h> #include <mach/mach_time.h> #include <machine/machine_routines.h> -#include <sys/kdebug.h> -#include <sys/errno.h> -#include <sys/param.h> -#include <sys/proc.h> -#include <sys/vm.h> -#include <sys/sysctl.h> - #include <kern/thread.h> #include <kern/task.h> #include <vm/vm_kern.h> @@ -50,6 +51,13 @@ unsigned int kd_entropy_count = 0; unsigned int kd_entropy_indx = 0; unsigned int kd_entropy_buftomem = 0; + +#define SLOW_NOLOG 0x01 +#define SLOW_CHECKS 0x02 +#define SLOW_ENTROPY 0x04 + +unsigned int kdebug_slowcheck=SLOW_NOLOG; + /* kd_buf kd_buffer[kd_bufsize/sizeof(kd_buf)]; */ kd_buf * kd_bufptr; unsigned int kd_buftomem=0; @@ -59,7 +67,6 @@ kd_buf * kd_readlast; unsigned int nkdbufs = 8192; unsigned int kd_bufsize = 0; unsigned int kdebug_flags = 0; -unsigned int kdebug_nolog=1; unsigned int kdlog_beg=0; unsigned int kdlog_end=0; unsigned int kdlog_value1=0; @@ -68,7 +75,16 @@ unsigned int kdlog_value3=0; unsigned int kdlog_value4=0; unsigned long long kd_prev_timebase = 0LL; -decl_simple_lock_data(,kd_trace_lock); + +static lck_mtx_t * kd_trace_mtx; +static lck_grp_t * kd_trace_mtx_grp; +static lck_attr_t * kd_trace_mtx_attr; +static lck_grp_attr_t *kd_trace_mtx_grp_attr; + +static lck_spin_t * kd_trace_lock; +static lck_grp_t * kd_trace_lock_grp; +static lck_attr_t * kd_trace_lock_attr; +static lck_grp_attr_t *kd_trace_lock_grp_attr; kd_threadmap *kd_mapptr = 0; unsigned int kd_mapsize = 0; @@ -83,15 +99,6 @@ pid_t global_state_pid = -1; /* Used to control exclusive use of kd_buffer extern natural_t rtclock_decrementer_min; #endif /* ppc */ -struct kdebug_args { - int code; - int arg1; - int arg2; - int arg3; - int arg4; - int arg5; -}; - /* task to string structure */ struct tts { @@ -119,17 +126,18 @@ typedef void (*kd_chudhook_fn) (unsigned int debugid, unsigned int arg1, kd_chudhook_fn kdebug_chudhook = 0; /* pointer to CHUD toolkit function */ + /* Support syscall SYS_kdebug_trace */ kdebug_trace(p, uap, retval) struct proc *p; - struct kdebug_args *uap; + struct kdebug_trace_args *uap; register_t *retval; { - if (kdebug_nolog) - return(EINVAL); + if ( (kdebug_enable == 0) ) + return(EINVAL); - kernel_debug(uap->code, uap->arg1, uap->arg2, uap->arg3, uap->arg4, 0); - return(0); + kernel_debug(uap->code, uap->arg1, uap->arg2, uap->arg3, uap->arg4, 0); + return(0); } @@ -141,18 +149,20 @@ unsigned int debugid, arg1, arg2, arg3, arg4, arg5; struct proc *curproc; int s; unsigned long long now; - mach_timespec_t *tsp; + if (kdebug_enable & KDEBUG_ENABLE_CHUD) { - if (kdebug_chudhook) - kdebug_chudhook(debugid, arg1, arg2, arg3, arg4, arg5); + if (kdebug_chudhook) + kdebug_chudhook(debugid, arg1, arg2, arg3, arg4, arg5); - if (!((kdebug_enable & KDEBUG_ENABLE_ENTROPY) || - (kdebug_enable & KDEBUG_ENABLE_TRACE))) - return; + if ( !(kdebug_enable & (KDEBUG_ENABLE_ENTROPY | KDEBUG_ENABLE_TRACE))) + return; } - s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(kd_trace_lock); + + if (kdebug_slowcheck == 0) + goto record_trace; if (kdebug_enable & KDEBUG_ENABLE_ENTROPY) { @@ -166,16 +176,17 
@@ unsigned int debugid, arg1, arg2, arg3, arg4, arg5; { /* Disable entropy collection */ kdebug_enable &= ~KDEBUG_ENABLE_ENTROPY; + kdebug_slowcheck &= ~SLOW_ENTROPY; } } - if (kdebug_nolog) + if ( (kdebug_slowcheck & SLOW_NOLOG) ) { + lck_spin_unlock(kd_trace_lock); ml_set_interrupts_enabled(s); return; } - - usimple_lock(&kd_trace_lock); + if (kdebug_flags & KDBG_PIDCHECK) { /* If kdebug flag is not set for current proc, return */ @@ -183,7 +194,7 @@ unsigned int debugid, arg1, arg2, arg3, arg4, arg5; if ((curproc && !(curproc->p_flag & P_KDEBUG)) && ((debugid&0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE))) { - usimple_unlock(&kd_trace_lock); + lck_spin_unlock(kd_trace_lock); ml_set_interrupts_enabled(s); return; } @@ -195,7 +206,7 @@ unsigned int debugid, arg1, arg2, arg3, arg4, arg5; if ((curproc && (curproc->p_flag & P_KDEBUG)) && ((debugid&0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE))) { - usimple_unlock(&kd_trace_lock); + lck_spin_unlock(kd_trace_lock); ml_set_interrupts_enabled(s); return; } @@ -203,10 +214,10 @@ unsigned int debugid, arg1, arg2, arg3, arg4, arg5; if (kdebug_flags & KDBG_RANGECHECK) { - if ((debugid < kdlog_beg) || (debugid > kdlog_end) + if ((debugid < kdlog_beg) || (debugid >= kdlog_end) && (debugid >> 24 != DBG_TRACE)) { - usimple_unlock(&kd_trace_lock); + lck_spin_unlock(kd_trace_lock); ml_set_interrupts_enabled(s); return; } @@ -219,35 +230,35 @@ unsigned int debugid, arg1, arg2, arg3, arg4, arg5; (debugid & DBG_FUNC_MASK) != kdlog_value4 && (debugid >> 24 != DBG_TRACE)) { - usimple_unlock(&kd_trace_lock); + lck_spin_unlock(kd_trace_lock); ml_set_interrupts_enabled(s); return; } } + +record_trace: kd = kd_bufptr; kd->debugid = debugid; kd->arg1 = arg1; kd->arg2 = arg2; kd->arg3 = arg3; kd->arg4 = arg4; - kd->arg5 = (int)current_act(); - if (cpu_number()) - kd->arg5 |= KDBG_CPU_MASK; + kd->arg5 = (int)current_thread(); - now = kd->timestamp = mach_absolute_time(); + now = mach_absolute_time() & KDBG_TIMESTAMP_MASK; /* Watch for out of order timestamps */ if (now < kd_prev_timebase) { - kd->timestamp = ++kd_prev_timebase; + now = ++kd_prev_timebase & KDBG_TIMESTAMP_MASK; } else { /* Then just store the previous timestamp */ kd_prev_timebase = now; } - + kd->timestamp = now | (((uint64_t)cpu_number()) << KDBG_CPU_SHIFT); kd_bufptr++; @@ -255,10 +266,10 @@ unsigned int debugid, arg1, arg2, arg3, arg4, arg5; kd_bufptr = kd_buffer; if (kd_bufptr == kd_readlast) { if (kdebug_flags & KDBG_NOWRAP) - kdebug_nolog = 1; + kdebug_slowcheck |= SLOW_NOLOG; kdebug_flags |= KDBG_WRAPPED; } - usimple_unlock(&kd_trace_lock); + lck_spin_unlock(kd_trace_lock); ml_set_interrupts_enabled(s); } @@ -270,26 +281,27 @@ unsigned int debugid, arg1, arg2, arg3, arg4, arg5; struct proc *curproc; int s; unsigned long long now; - mach_timespec_t *tsp; if (kdebug_enable & KDEBUG_ENABLE_CHUD) { - if (kdebug_chudhook) - (void)kdebug_chudhook(debugid, arg1, arg2, arg3, arg4, arg5); + if (kdebug_chudhook) + (void)kdebug_chudhook(debugid, arg1, arg2, arg3, arg4, arg5); - if (!((kdebug_enable & KDEBUG_ENABLE_ENTROPY) || - (kdebug_enable & KDEBUG_ENABLE_TRACE))) - return; + if ( !(kdebug_enable & (KDEBUG_ENABLE_ENTROPY | KDEBUG_ENABLE_TRACE))) + return; } - s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(kd_trace_lock); - if (kdebug_nolog) + if (kdebug_slowcheck == 0) + goto record_trace1; + + if ( (kdebug_slowcheck & SLOW_NOLOG) ) { + lck_spin_unlock(kd_trace_lock); ml_set_interrupts_enabled(s); return; } - usimple_lock(&kd_trace_lock); if (kdebug_flags 
& KDBG_PIDCHECK) { /* If kdebug flag is not set for current proc, return */ @@ -297,7 +309,7 @@ unsigned int debugid, arg1, arg2, arg3, arg4, arg5; if ((curproc && !(curproc->p_flag & P_KDEBUG)) && ((debugid&0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE))) { - usimple_unlock(&kd_trace_lock); + lck_spin_unlock(kd_trace_lock); ml_set_interrupts_enabled(s); return; } @@ -309,7 +321,7 @@ unsigned int debugid, arg1, arg2, arg3, arg4, arg5; if ((curproc && (curproc->p_flag & P_KDEBUG)) && ((debugid&0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE))) { - usimple_unlock(&kd_trace_lock); + lck_spin_unlock(kd_trace_lock); ml_set_interrupts_enabled(s); return; } @@ -317,10 +329,10 @@ unsigned int debugid, arg1, arg2, arg3, arg4, arg5; if (kdebug_flags & KDBG_RANGECHECK) { - if ((debugid < kdlog_beg) || (debugid > kdlog_end) + if ((debugid < kdlog_beg) || (debugid >= kdlog_end) && (debugid >> 24 != DBG_TRACE)) { - usimple_unlock(&kd_trace_lock); + lck_spin_unlock(kd_trace_lock); ml_set_interrupts_enabled(s); return; } @@ -333,12 +345,13 @@ unsigned int debugid, arg1, arg2, arg3, arg4, arg5; (debugid & DBG_FUNC_MASK) != kdlog_value4 && (debugid >> 24 != DBG_TRACE)) { - usimple_unlock(&kd_trace_lock); + lck_spin_unlock(kd_trace_lock); ml_set_interrupts_enabled(s); return; } } +record_trace1: kd = kd_bufptr; kd->debugid = debugid; kd->arg1 = arg1; @@ -346,20 +359,21 @@ unsigned int debugid, arg1, arg2, arg3, arg4, arg5; kd->arg3 = arg3; kd->arg4 = arg4; kd->arg5 = arg5; - now = kd->timestamp = mach_absolute_time(); + + now = mach_absolute_time() & KDBG_TIMESTAMP_MASK; /* Watch for out of order timestamps */ if (now < kd_prev_timebase) { - /* timestamps are out of order -- adjust */ - kd->timestamp = ++kd_prev_timebase; + now = ++kd_prev_timebase & KDBG_TIMESTAMP_MASK; } else { /* Then just store the previous timestamp */ kd_prev_timebase = now; } + kd->timestamp = now | (((uint64_t)cpu_number()) << KDBG_CPU_SHIFT); kd_bufptr++; @@ -367,24 +381,65 @@ unsigned int debugid, arg1, arg2, arg3, arg4, arg5; kd_bufptr = kd_buffer; if (kd_bufptr == kd_readlast) { if (kdebug_flags & KDBG_NOWRAP) - kdebug_nolog = 1; + kdebug_slowcheck |= SLOW_NOLOG; kdebug_flags |= KDBG_WRAPPED; } - usimple_unlock(&kd_trace_lock); + lck_spin_unlock(kd_trace_lock); ml_set_interrupts_enabled(s); } +static void +kdbg_lock_init() +{ + + if (kdebug_flags & KDBG_LOCKINIT) + return; + /* + * allocate lock group attribute and group + */ + kd_trace_lock_grp_attr = lck_grp_attr_alloc_init(); + //lck_grp_attr_setstat(kd_trace_lock_grp_attr); + kd_trace_lock_grp = lck_grp_alloc_init("kdebug", kd_trace_lock_grp_attr); + + kd_trace_mtx_grp_attr = lck_grp_attr_alloc_init(); + //lck_grp_attr_setstat(kd_trace_mtx_grp_attr); + kd_trace_mtx_grp = lck_grp_alloc_init("kdebug", kd_trace_mtx_grp_attr); + + /* + * allocate the lock attribute + */ + kd_trace_lock_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(kd_trace_lock_attr); + + kd_trace_mtx_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(kd_trace_mtx_attr); + + + /* + * allocate and initialize spin lock and mutex + */ + kd_trace_lock = lck_spin_alloc_init(kd_trace_lock_grp, kd_trace_lock_attr); + kd_trace_mtx = lck_mtx_alloc_init(kd_trace_mtx_grp, kd_trace_mtx_attr); + + kdebug_flags |= KDBG_LOCKINIT; +} + + +int kdbg_bootstrap() { + kd_bufsize = nkdbufs * sizeof(kd_buf); + if (kmem_alloc(kernel_map, &kd_buftomem, (vm_size_t)kd_bufsize) == KERN_SUCCESS) - kd_buffer = (kd_buf *) kd_buftomem; - else kd_buffer= (kd_buf *) 0; + kd_buffer = (kd_buf *) kd_buftomem; 
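
kdbg_lock_init() above follows the standard xnu lock-KPI sequence: allocate a group attribute, a named lock group, a lock attribute, and finally the lock itself, all guarded by a once-only flag. The same boilerplate, condensed to a single spin lock (the KPI calls are the ones used in the hunk above; the subsystem name and guard flag are illustrative):

    #include <kern/locks.h>

    static lck_grp_attr_t *my_grp_attr;
    static lck_grp_t      *my_grp;
    static lck_attr_t     *my_attr;
    static lck_spin_t     *my_lock;
    static int             my_lock_inited;  /* plays the KDBG_LOCKINIT role */

    static void my_lock_init(void)
    {
        if (my_lock_inited)
            return;
        my_grp_attr = lck_grp_attr_alloc_init();            /* group attribute */
        my_grp      = lck_grp_alloc_init("my_subsys", my_grp_attr);
        my_attr     = lck_attr_alloc_init();                /* per-lock attribute */
        my_lock     = lck_spin_alloc_init(my_grp, my_attr); /* the lock itself */
        my_lock_inited = 1;
    }
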
+ else + kd_buffer= (kd_buf *) 0; kdebug_flags &= ~KDBG_WRAPPED; + if (kd_buffer) { - simple_lock_init(&kd_trace_lock); kdebug_flags |= (KDBG_INIT | KDBG_BUFINIT); kd_bufptr = kd_buffer; kd_buflast = &kd_bufptr[nkdbufs]; @@ -401,12 +456,22 @@ kdbg_bootstrap() kdbg_reinit() { - int x; + int s; int ret=0; - /* Disable trace collecting */ + /* + * Disable trace collecting + * First make sure we're not in + * the middle of cutting a trace + */ + s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(kd_trace_lock); + kdebug_enable &= ~KDEBUG_ENABLE_TRACE; - kdebug_nolog = 1; + kdebug_slowcheck |= SLOW_NOLOG; + + lck_spin_unlock(kd_trace_lock); + ml_set_interrupts_enabled(s); if ((kdebug_flags & KDBG_INIT) && (kdebug_flags & KDBG_BUFINIT) && kd_bufsize && kd_buffer) kmem_free(kernel_map, (vm_offset_t)kd_buffer, kd_bufsize); @@ -476,7 +541,8 @@ void kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *arg3, lo *arg4=dbg_parms[3]; } -kdbg_resolve_map(thread_act_t th_act, krt_t *t) +static void +kdbg_resolve_map(thread_t th_act, krt_t *t) { kd_threadmap *mapptr; @@ -565,11 +631,12 @@ void kdbg_mapinit() if (p->p_flag & P_WEXIT) continue; - if (task_reference_try(p->task)) { - tts_mapptr[i].task = p->task; + if (p->task) { + task_reference(p->task); + tts_mapptr[i].task = p->task; tts_mapptr[i].pid = p->p_pid; - (void)strncpy(&tts_mapptr[i].task_comm, p->p_comm, sizeof(tts_mapptr[i].task_comm) - 1); - i++; + (void)strncpy(&tts_mapptr[i].task_comm, p->p_comm, sizeof(tts_mapptr[i].task_comm) - 1); + i++; } } tts_count = i; @@ -594,14 +661,29 @@ void kdbg_mapinit() } } -kdbg_clear() +static void +kdbg_clear(void) { -int x; + int s; + + /* + * Clean up the trace buffer + * First make sure we're not in + * the middle of cutting a trace + */ + s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(kd_trace_lock); - /* Clean up the trace buffer */ - global_state_pid = -1; kdebug_enable &= ~KDEBUG_ENABLE_TRACE; - kdebug_nolog = 1; + kdebug_slowcheck = SLOW_NOLOG; + + if (kdebug_enable & KDEBUG_ENABLE_ENTROPY) + kdebug_slowcheck |= SLOW_ENTROPY; + + lck_spin_unlock(kd_trace_lock); + ml_set_interrupts_enabled(s); + + global_state_pid = -1; kdebug_flags &= ~KDBG_BUFINIT; kdebug_flags &= (unsigned int)~KDBG_CKTYPES; kdebug_flags &= ~(KDBG_NOWRAP | KDBG_RANGECHECK | KDBG_VALCHECK); @@ -638,6 +720,8 @@ kdbg_setpid(kd_regtype *kdr) { kdebug_flags |= KDBG_PIDCHECK; kdebug_flags &= ~KDBG_PIDEXCLUDE; + kdebug_slowcheck |= SLOW_CHECKS; + p->p_flag |= P_KDEBUG; } else /* turn off pid check for this pid value */ @@ -673,6 +757,8 @@ kdbg_setpidex(kd_regtype *kdr) { kdebug_flags |= KDBG_PIDEXCLUDE; kdebug_flags &= ~KDBG_PIDCHECK; + kdebug_slowcheck |= SLOW_CHECKS; + p->p_flag |= P_KDEBUG; } else /* turn off pid exclusion for this pid value */ @@ -703,7 +789,7 @@ kdbg_setrtcdec(kd_regtype *kdr) rtclock_decrementer_min = decval; #else else - ret = EOPNOTSUPP; + ret = ENOTSUP; #endif /* ppc */ return(ret); @@ -723,6 +809,7 @@ kdbg_setreg(kd_regtype * kdr) kdebug_flags &= (unsigned int)~KDBG_CKTYPES; kdebug_flags &= ~KDBG_VALCHECK; /* Turn off specific value check */ kdebug_flags |= (KDBG_RANGECHECK | KDBG_CLASSTYPE); + kdebug_slowcheck |= SLOW_CHECKS; break; case KDBG_SUBCLSTYPE : val_1 = (kdr->value1 & 0xff); @@ -733,6 +820,7 @@ kdbg_setreg(kd_regtype * kdr) kdebug_flags &= (unsigned int)~KDBG_CKTYPES; kdebug_flags &= ~KDBG_VALCHECK; /* Turn off specific value check */ kdebug_flags |= (KDBG_RANGECHECK | KDBG_SUBCLSTYPE); + kdebug_slowcheck |= SLOW_CHECKS; break; case KDBG_RANGETYPE : kdlog_beg = (kdr->value1); 
@@ -740,6 +828,7 @@ kdbg_setreg(kd_regtype * kdr) kdebug_flags &= (unsigned int)~KDBG_CKTYPES; kdebug_flags &= ~KDBG_VALCHECK; /* Turn off specific value check */ kdebug_flags |= (KDBG_RANGECHECK | KDBG_RANGETYPE); + kdebug_slowcheck |= SLOW_CHECKS; break; case KDBG_VALCHECK: kdlog_value1 = (kdr->value1); @@ -749,9 +838,16 @@ kdbg_setreg(kd_regtype * kdr) kdebug_flags &= (unsigned int)~KDBG_CKTYPES; kdebug_flags &= ~KDBG_RANGECHECK; /* Turn off range check */ kdebug_flags |= KDBG_VALCHECK; /* Turn on specific value check */ + kdebug_slowcheck |= SLOW_CHECKS; break; case KDBG_TYPENONE : kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + + if ( (kdebug_flags & (KDBG_RANGECHECK | KDBG_VALCHECK | KDBG_PIDCHECK | KDBG_PIDEXCLUDE)) ) + kdebug_slowcheck |= SLOW_CHECKS; + else + kdebug_slowcheck &= ~SLOW_CHECKS; + kdlog_beg = 0; kdlog_end = 0; break; @@ -805,8 +901,8 @@ kdbg_getreg(kd_regtype * kdr) } - -kdbg_readmap(kd_threadmap *buffer, size_t *number) +int +kdbg_readmap(user_addr_t buffer, size_t *number) { int avail = *number; int ret = 0; @@ -844,7 +940,8 @@ kdbg_readmap(kd_threadmap *buffer, size_t *number) return(ret); } -kdbg_getentropy (mach_timespec_t * buffer, size_t *number, int ms_timeout) +int +kdbg_getentropy (user_addr_t buffer, size_t *number, int ms_timeout) { int avail = *number; int ret = 0; @@ -878,11 +975,13 @@ kdbg_getentropy (mach_timespec_t * buffer, size_t *number, int ms_timeout) /* Enable entropy sampling */ kdebug_enable |= KDEBUG_ENABLE_ENTROPY; + kdebug_slowcheck |= SLOW_ENTROPY; ret = tsleep (kdbg_getentropy, PRIBIO | PCATCH, "kd_entropy", (ms_timeout/(1000/HZ))); /* Disable entropy sampling */ kdebug_enable &= ~KDEBUG_ENABLE_ENTROPY; + kdebug_slowcheck &= ~SLOW_ENTROPY; *number = 0; ret = 0; @@ -919,8 +1018,8 @@ void kdbg_control_chud(int val, void *fn) { if (val) { /* enable chudhook */ - kdebug_enable |= KDEBUG_ENABLE_CHUD; kdebug_chudhook = fn; + kdebug_enable |= KDEBUG_ENABLE_CHUD; } else { /* disable chudhook */ @@ -930,84 +1029,103 @@ void kdbg_control_chud(int val, void *fn) } -kdbg_control(name, namelen, where, sizep) -int *name; -u_int namelen; -char *where; -size_t *sizep; +kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) { -int ret=0; -int size=*sizep; -int max_entries; -unsigned int value = name[1]; -kd_regtype kd_Reg; -kbufinfo_t kd_bufinfo; - -pid_t curpid; -struct proc *p, *curproc; - - if (name[0] == KERN_KDGETBUF) { - /* - Does not alter the global_state_pid - This is a passive request. - */ - if (size < sizeof(kd_bufinfo.nkdbufs)) { - /* - There is not enough room to return even - the first element of the info structure. + int ret=0; + int size=*sizep; + int max_entries; + unsigned int value = name[1]; + kd_regtype kd_Reg; + kbufinfo_t kd_bufinfo; + pid_t curpid; + struct proc *p, *curproc; + + + kdbg_lock_init(); + lck_mtx_lock(kd_trace_mtx); + + if (name[0] == KERN_KDGETBUF) { + /* + * Does not alter the global_state_pid + * This is a passive request. */ - return(EINVAL); - } - - kd_bufinfo.nkdbufs = nkdbufs; - kd_bufinfo.nkdthreads = kd_mapsize / sizeof(kd_threadmap); - kd_bufinfo.nolog = kdebug_nolog; - kd_bufinfo.flags = kdebug_flags; - kd_bufinfo.bufid = global_state_pid; + if (size < sizeof(kd_bufinfo.nkdbufs)) { + /* + * There is not enough room to return even + * the first element of the info structure. 
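
Throughout these hunks the old kdebug_nolog flag becomes kdebug_slowcheck, a summary bitmask (SLOW_NOLOG, SLOW_CHECKS, SLOW_ENTROPY) kept in sync by kdbg_setreg(), kdbg_setpid(), and friends so that kernel_debug() can take a one-test fast path: when nothing needing per-event work is armed, the word is zero and the trace point jumps straight to record_trace. A reduced model of that gating (names shadow the patch; the filter and entropy details are elided):

    #define SLOW_NOLOG   0x01  /* tracing off, or buffer wrapped with NOWRAP */
    #define SLOW_CHECKS  0x02  /* some pid/range/value filter is armed       */
    #define SLOW_ENTROPY 0x04  /* entropy harvesting wants events            */

    static unsigned int slowcheck = SLOW_NOLOG;

    static void trace_event(unsigned int debugid)
    {
        (void)debugid;
        if (slowcheck == 0)
            goto record;          /* hot path: one load, one test */

        if (slowcheck & SLOW_NOLOG)
            return;               /* logging is off */
        if (slowcheck & SLOW_CHECKS) {
            /* ... run the pid / range / value filters, as above ... */
        }
    record:
        /* ... cut the trace record ... */ ;
    }
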
+ */ + lck_mtx_unlock(kd_trace_mtx); + + return(EINVAL); + } + kd_bufinfo.nkdbufs = nkdbufs; + kd_bufinfo.nkdthreads = kd_mapsize / sizeof(kd_threadmap); + + if ( (kdebug_slowcheck & SLOW_NOLOG) ) + kd_bufinfo.nolog = 1; + else + kd_bufinfo.nolog = 0; + kd_bufinfo.flags = kdebug_flags; + kd_bufinfo.bufid = global_state_pid; - if(size >= sizeof(kbufinfo_t)) { - /* Provide all the info we have */ - if(copyout (&kd_bufinfo, where, sizeof(kbufinfo_t))) - return(EINVAL); - } - else { - /* - For backwards compatibility, only provide - as much info as there is room for. - */ - if(copyout (&kd_bufinfo, where, size)) - return(EINVAL); - } - return(0); - } - else if (name[0] == KERN_KDGETENTROPY) { - if (kd_entropy_buffer) - return(EBUSY); - else - ret = kdbg_getentropy((mach_timespec_t *)where, sizep, value); - return (ret); - } - - if(curproc = current_proc()) - curpid = curproc->p_pid; - else - return (ESRCH); + if (size >= sizeof(kd_bufinfo)) { + /* + * Provide all the info we have + */ + if (copyout (&kd_bufinfo, where, sizeof(kd_bufinfo))) { + lck_mtx_unlock(kd_trace_mtx); + + return(EINVAL); + } + } + else { + /* + * For backwards compatibility, only provide + * as much info as there is room for. + */ + if (copyout (&kd_bufinfo, where, size)) { + lck_mtx_unlock(kd_trace_mtx); + + return(EINVAL); + } + } + lck_mtx_unlock(kd_trace_mtx); + return(0); + } else if (name[0] == KERN_KDGETENTROPY) { + if (kd_entropy_buffer) + ret = EBUSY; + else + ret = kdbg_getentropy(where, sizep, value); + lck_mtx_unlock(kd_trace_mtx); + + return (ret); + } + + if (curproc = current_proc()) + curpid = curproc->p_pid; + else { + lck_mtx_unlock(kd_trace_mtx); + + return (ESRCH); + } if (global_state_pid == -1) global_state_pid = curpid; - else if (global_state_pid != curpid) - { - if((p = pfind(global_state_pid)) == NULL) - { - /* The global pid no longer exists */ - global_state_pid = curpid; - } - else - { - /* The global pid exists, deny this request */ + else if (global_state_pid != curpid) { + if ((p = pfind(global_state_pid)) == NULL) { + /* + * The global pid no longer exists + */ + global_state_pid = curpid; + } else { + /* + * The global pid exists, deny this request + */ + lck_mtx_unlock(kd_trace_mtx); + return(EBUSY); - } - } + } + } switch(name[0]) { case KERN_KDEFLAGS: @@ -1027,17 +1145,15 @@ struct proc *p, *curproc; ret=EINVAL; break; } + kdebug_enable |= KDEBUG_ENABLE_TRACE; + kdebug_slowcheck &= ~SLOW_NOLOG; } - - if (value) - kdebug_enable |= KDEBUG_ENABLE_TRACE; else - kdebug_enable &= ~KDEBUG_ENABLE_TRACE; - - kdebug_nolog = (value)?0:1; - - if (kdebug_enable & KDEBUG_ENABLE_TRACE) - kdbg_mapinit(); + { + kdebug_enable &= ~KDEBUG_ENABLE_TRACE; + kdebug_slowcheck |= SLOW_NOLOG; + } + kdbg_mapinit(); break; case KERN_KDSETBUF: /* We allow a maximum buffer size of 25% of either ram or max mapped address, whichever is smaller */ @@ -1101,7 +1217,7 @@ struct proc *p, *curproc; ret = kdbg_setpidex(&kd_Reg); break; case KERN_KDTHRMAP: - ret = kdbg_readmap((kd_threadmap *)where, sizep); + ret = kdbg_readmap(where, sizep); break; case KERN_KDSETRTCDEC: if (size < sizeof(kd_regtype)) { @@ -1118,10 +1234,12 @@ struct proc *p, *curproc; default: ret= EINVAL; } + lck_mtx_unlock(kd_trace_mtx); + return(ret); } -kdbg_read(kd_buf * buffer, size_t *number) +kdbg_read(user_addr_t buffer, size_t *number) { int avail=*number; int count=0; @@ -1132,89 +1250,85 @@ unsigned int my_kdebug_flags; kd_buf * my_kd_bufptr; s = ml_set_interrupts_enabled(FALSE); - usimple_lock(&kd_trace_lock); + lck_spin_lock(kd_trace_lock); + 
my_kdebug_flags = kdebug_flags; my_kd_bufptr = kd_bufptr; - usimple_unlock(&kd_trace_lock); + + lck_spin_unlock(kd_trace_lock); ml_set_interrupts_enabled(s); count = avail/sizeof(kd_buf); + if (count) { if ((my_kdebug_flags & KDBG_BUFINIT) && kd_bufsize && kd_buffer) { if (count > nkdbufs) count = nkdbufs; - if (!(my_kdebug_flags & KDBG_WRAPPED) && (my_kd_bufptr > kd_readlast)) - { - copycount = my_kd_bufptr-kd_readlast; - if (copycount > count) - copycount = count; - - if (copyout(kd_readlast, buffer, copycount * sizeof(kd_buf))) - { - *number = 0; - return(EINVAL); - } - kd_readlast += copycount; - *number = copycount; - return(0); - } - else if (!(my_kdebug_flags & KDBG_WRAPPED) && (my_kd_bufptr == kd_readlast)) - { - *number = 0; - return(0); - } - else - { - if (my_kdebug_flags & KDBG_WRAPPED) - { - kd_readlast = my_kd_bufptr; + + if (!(my_kdebug_flags & KDBG_WRAPPED)) { + if (my_kd_bufptr == kd_readlast) { + *number = 0; + return(0); + } + if (my_kd_bufptr > kd_readlast) { + copycount = my_kd_bufptr - kd_readlast; + if (copycount > count) + copycount = count; + + if (copyout(kd_readlast, buffer, copycount * sizeof(kd_buf))) { + *number = 0; + return(EINVAL); + } + kd_readlast += copycount; + *number = copycount; + return(0); + } + } + if ( (my_kdebug_flags & KDBG_WRAPPED) ) { + /* Note that by setting kd_readlast equal to my_kd_bufptr, + * we now treat the kd_buffer read the same as if we weren't + * wrapped and my_kd_bufptr was less than kd_readlast. + */ + kd_readlast = my_kd_bufptr; kdebug_flags &= ~KDBG_WRAPPED; - } - - /* Note that by setting kd_readlast equal to my_kd_bufptr, - we now treat the kd_buffer read the same as if we weren't - wrapped and my_kd_bufptr was less than kd_readlast. - */ - - /* first copyout from readlast to end of kd_buffer */ - copycount = kd_buflast - kd_readlast; - if (copycount > count) - copycount = count; - if (copyout(kd_readlast, buffer, copycount * sizeof(kd_buf))) - { - *number = 0; + } + /* + * first copyout from readlast to end of kd_buffer + */ + copycount = kd_buflast - kd_readlast; + if (copycount > count) + copycount = count; + if (copyout(kd_readlast, buffer, copycount * sizeof(kd_buf))) { + *number = 0; return(EINVAL); - } - buffer += copycount; - count -= copycount; - totalcount = copycount; - kd_readlast += copycount; - if (kd_readlast == kd_buflast) - kd_readlast = kd_buffer; - if (count == 0) - { + } + buffer += (copycount * sizeof(kd_buf)); + count -= copycount; + totalcount = copycount; + kd_readlast += copycount; + + if (kd_readlast == kd_buflast) + kd_readlast = kd_buffer; + if (count == 0) { *number = totalcount; return(0); - } - - /* second copyout from top of kd_buffer to bufptr */ - copycount = my_kd_bufptr - kd_readlast; - if (copycount > count) - copycount = count; - if (copycount == 0) - { + } + /* second copyout from top of kd_buffer to bufptr */ + copycount = my_kd_bufptr - kd_readlast; + if (copycount > count) + copycount = count; + if (copycount == 0) { *number = totalcount; return(0); - } - if (copyout(kd_readlast, buffer, copycount * sizeof(kd_buf))) - { + } + if (copyout(kd_readlast, buffer, copycount * sizeof(kd_buf))) return(EINVAL); - } - kd_readlast += copycount; - totalcount += copycount; - *number = totalcount; - return(0); - } + + kd_readlast += copycount; + totalcount += copycount; + *number = totalcount; + return(0); + } /* end if KDBG_BUFINIT */ } /* end if count */ return (EINVAL); diff --git a/bsd/kern/kern_acct.c b/bsd/kern/kern_acct.c index 3654a9dc8..0b3168147 100644 --- a/bsd/kern/kern_acct.c +++ 
b/bsd/kern/kern_acct.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -67,10 +67,11 @@ #include <sys/param.h> -#include <sys/proc.h> -#include <sys/mount.h> -#include <sys/vnode.h> -#include <sys/file.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> +#include <sys/mount_internal.h> +#include <sys/vnode_internal.h> +#include <sys/file_internal.h> #include <sys/syslog.h> #include <sys/kernel.h> #include <sys/namei.h> @@ -79,6 +80,8 @@ #include <sys/resourcevar.h> #include <sys/ioctl.h> #include <sys/tty.h> +#include <sys/sysproto.h> +#include <machine/spl.h> /* * The routines implemented in this file are described in: @@ -96,15 +99,23 @@ * The former's operation is described in Leffler, et al., and the latter * was provided by UCB with the 4.4BSD-Lite release */ -comp_t encode_comp_t __P((u_long, u_long)); -void acctwatch __P((void *)); -void acctwatch_funnel __P((void *)); +comp_t encode_comp_t(u_long, u_long); +void acctwatch(void *); +void acctwatch_funnel(void *); /* - * Accounting vnode pointer, and saved vnode pointer. + * Accounting vnode pointer, and suspended accounting vnode pointer. States + * are as follows: + * + * acctp suspend_acctp state + * ------------- ------------ ------------------------------ + * NULL NULL Accounting disabled + * !NULL NULL Accounting enabled + * NULL !NULL Accounting enabled, but suspended + * !NULL !NULL <not allowed> */ struct vnode *acctp; -struct vnode *savacctp; +struct vnode *suspend_acctp; /* * Values associated with enabling and disabling accounting @@ -117,32 +128,32 @@ int acctchkfreq = 15; /* frequency (in seconds) to check space */ * Accounting system call. Written based on the specification and * previous implementation done by Mark Tinguely. */ -struct acct_args { - char *path; -}; -acct(p, uap, retval) - struct proc *p; - struct acct_args *uap; - int *retval; +int +acct(struct proc *p, struct acct_args *uap, __unused int *retval) { struct nameidata nd; int error; + struct vfs_context context; + + context.vc_proc = p; + context.vc_ucred = kauth_cred_get(); /* Make sure that the caller is root. */ - if (error = suser(p->p_ucred, &p->p_acflag)) + if ((error = suser(kauth_cred_get(), &p->p_acflag))) return (error); /* * If accounting is to be started to a file, open that file for * writing and make sure it's a 'normal'. */ - if (uap->path != NULL) { - NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, p); - if (error = vn_open(&nd, FWRITE, 0)) + if (uap->path != USER_ADDR_NULL) { + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, &context); + if ((error = vn_open(&nd, FWRITE, 0))) return (error); - VOP_UNLOCK(nd.ni_vp, 0, p); + vnode_put(nd.ni_vp); + if (nd.ni_vp->v_type != VREG) { - vn_close(nd.ni_vp, FWRITE, p->p_ucred, p); + vn_close(nd.ni_vp, FWRITE, kauth_cred_get(), p); return (EACCES); } } @@ -151,13 +162,14 @@ acct(p, uap, retval) * If accounting was previously enabled, kill the old space-watcher, * close the file, and (if no new file was specified, leave). */ - if (acctp != NULLVP || savacctp != NULLVP) { + if (acctp != NULLVP || suspend_acctp != NULLVP) { untimeout(acctwatch_funnel, NULL); - error = vn_close((acctp != NULLVP ? acctp : savacctp), FWRITE, - p->p_ucred, p); - acctp = savacctp = NULLVP; + error = vn_close((acctp != NULLVP ? 
acctp : suspend_acctp), FWRITE, + kauth_cred_get(), p); + + acctp = suspend_acctp = NULLVP; } - if (uap->path == NULL) + if (uap->path == USER_ADDR_NULL) return (error); /* @@ -175,13 +187,15 @@ acct(p, uap, retval) * and are enumerated below. (They're also noted in the system * "acct.h" header file.) */ +int acct_process(p) struct proc *p; { - struct acct acct; + struct acct an_acct; struct rusage *r; struct timeval ut, st, tmp; - int s, t; + int t; + int error; struct vnode *vp; /* If accounting isn't enabled, don't bother */ @@ -194,20 +208,18 @@ acct_process(p) */ /* (1) The name of the command that ran */ - bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm); + bcopy(p->p_comm, an_acct.ac_comm, sizeof an_acct.ac_comm); /* (2) The amount of user and system time that was used */ calcru(p, &ut, &st, NULL); - acct.ac_utime = encode_comp_t(ut.tv_sec, ut.tv_usec); - acct.ac_stime = encode_comp_t(st.tv_sec, st.tv_usec); + an_acct.ac_utime = encode_comp_t(ut.tv_sec, ut.tv_usec); + an_acct.ac_stime = encode_comp_t(st.tv_sec, st.tv_usec); /* (3) The elapsed time the command ran (and its starting time) */ - acct.ac_btime = p->p_stats->p_start.tv_sec; - s = splclock(); - tmp = time; - splx(s); + an_acct.ac_btime = p->p_stats->p_start.tv_sec; + microtime(&tmp); timevalsub(&tmp, &p->p_stats->p_start); - acct.ac_etime = encode_comp_t(tmp.tv_sec, tmp.tv_usec); + an_acct.ac_etime = encode_comp_t(tmp.tv_sec, tmp.tv_usec); /* (4) The average amount of memory used */ r = &p->p_stats->p_ru; @@ -215,33 +227,36 @@ acct_process(p) timevaladd(&tmp, &st); t = tmp.tv_sec * hz + tmp.tv_usec / tick; if (t) - acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t; + an_acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t; else - acct.ac_mem = 0; + an_acct.ac_mem = 0; /* (5) The number of disk I/O operations done */ - acct.ac_io = encode_comp_t(r->ru_inblock + r->ru_oublock, 0); + an_acct.ac_io = encode_comp_t(r->ru_inblock + r->ru_oublock, 0); /* (6) The UID and GID of the process */ - acct.ac_uid = p->p_cred->p_ruid; - acct.ac_gid = p->p_cred->p_rgid; + an_acct.ac_uid = p->p_ucred->cr_ruid; + an_acct.ac_gid = p->p_ucred->cr_rgid; /* (7) The terminal from which the process was started */ if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp) - acct.ac_tty = p->p_pgrp->pg_session->s_ttyp->t_dev; + an_acct.ac_tty = p->p_pgrp->pg_session->s_ttyp->t_dev; else - acct.ac_tty = NODEV; + an_acct.ac_tty = NODEV; /* (8) The boolean flags that tell how the process terminated, etc. */ - acct.ac_flag = p->p_acflag; + an_acct.ac_flag = p->p_acflag; /* * Now, just write the accounting information to the file.
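
encode_comp_t(), used throughout acct_process() above, packs a seconds/microseconds pair into the traditional BSD comp_t: a 13-bit fraction in 1/AHZ-second units with a 3-bit base-8 exponent. A sketch of the classic encoding (constants follow the 4.4BSD convention cited earlier in this file; the xnu version may differ in detail):

    typedef unsigned short comp_t;

    #define AHZ      64    /* comp_t units per second (traditional value) */
    #define MANTSIZE 13    /* 13-bit mantissa                             */
    #define EXPSIZE  3     /* base-8 exponent fits in 3 bits              */
    #define MAXFRACT ((1 << MANTSIZE) - 1)

    static comp_t encode_comp_t_sketch(unsigned long sec, unsigned long usec)
    {
        int exp = 0;
        unsigned long t = sec * AHZ + usec / (1000000 / AHZ);

        while (t > MAXFRACT) {   /* shift the mantissa in base-8 steps */
            t >>= EXPSIZE;
            exp++;
        }
        return (comp_t)((exp << MANTSIZE) | t);
    }
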
*/ - VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); - return (vn_rdwr(UIO_WRITE, vp, (caddr_t)&acct, sizeof (acct), - (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, p->p_ucred, - (int *)0, p)); + if ((error = vnode_getwithref(vp)) == 0) { + error = vn_rdwr(UIO_WRITE, vp, (caddr_t)&an_acct, sizeof (an_acct), + (off_t)0, UIO_SYSSPACE32, IO_APPEND|IO_UNIT, p->p_ucred, + (int *)0, p); + vnode_put(vp); + } + return (error); } /* @@ -301,32 +316,48 @@ acctwatch_funnel(a) */ /* ARGSUSED */ void -acctwatch(a) - void *a; +acctwatch(__unused void *a) { - struct statfs sb; - - if (savacctp != NULLVP) { - if (savacctp->v_type == VBAD) { - (void) vn_close(savacctp, FWRITE, NOCRED, NULL); - savacctp = NULLVP; + struct vfs_context context; + struct vfs_attr va; + + VFSATTR_INIT(&va); + VFSATTR_WANTED(&va, f_blocks); + VFSATTR_WANTED(&va, f_bavail); + context.vc_proc = current_proc(); + context.vc_ucred = kauth_cred_get(); + + if (suspend_acctp != NULLVP) { + /* + * Resuming accounting when accounting is suspended, and the + * filesystem containing the suspended accounting file goes + * below a low watermark + */ + if (suspend_acctp->v_type == VBAD) { + (void) vn_close(suspend_acctp, FWRITE, NOCRED, NULL); + suspend_acctp = NULLVP; return; } - (void)VFS_STATFS(savacctp->v_mount, &sb, (struct proc *)0); - if (sb.f_bavail > acctresume * sb.f_blocks / 100) { - acctp = savacctp; - savacctp = NULLVP; + (void)vfs_getattr(suspend_acctp->v_mount, &va, &context); + if (va.f_bavail > acctresume * va.f_blocks / 100) { + acctp = suspend_acctp; + suspend_acctp = NULLVP; log(LOG_NOTICE, "Accounting resumed\n"); } } else if (acctp != NULLVP) { + /* + * Suspending accounting when accounting is currently active, + * and the filesystem containing the active accounting file + * goes over a high watermark + */ if (acctp->v_type == VBAD) { (void) vn_close(acctp, FWRITE, NOCRED, NULL); acctp = NULLVP; return; } - (void)VFS_STATFS(acctp->v_mount, &sb, (struct proc *)0); - if (sb.f_bavail <= acctsuspend * sb.f_blocks / 100) { - savacctp = acctp; + (void)vfs_getattr(acctp->v_mount, &va, &context); + if (va.f_bavail <= acctsuspend * va.f_blocks / 100) { + suspend_acctp = acctp; acctp = NULLVP; log(LOG_NOTICE, "Accounting suspended\n"); } diff --git a/bsd/kern/kern_aio.c b/bsd/kern/kern_aio.c index f618a08eb..386774f05 100644 --- a/bsd/kern/kern_aio.c +++ b/bsd/kern/kern_aio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -37,25 +37,31 @@ */ #include <sys/systm.h> -#include <sys/buf.h> #include <sys/fcntl.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/filedesc.h> #include <sys/kernel.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/malloc.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> #include <sys/param.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/sysctl.h> #include <sys/unistd.h> #include <sys/user.h> #include <sys/aio_kern.h> +#include <sys/sysproto.h> #include <machine/limits.h> + +#include <mach/mach_types.h> +#include <kern/kern_types.h> #include <kern/zalloc.h> #include <kern/task.h> +#include <kern/sched_prim.h> + +#include <vm/vm_map.h> #include <sys/kdebug.h> #define AIO_work_queued 1 @@ -130,8 +136,8 @@ typedef struct aio_anchor_cb aio_anchor_cb; /* * async IO locking macros used to protect critical sections.
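
The rewritten acctwatch() above keeps two watermarks apart so accounting does not flap: it suspends when free space on the accounting filesystem drops to acctsuspend percent of f_blocks and resumes only after it climbs past acctresume percent, with vfs_getattr() now supplying the f_blocks/f_bavail pair that VFS_STATFS() used to. Its hysteresis core, reduced to one poll step (names hypothetical):

    #include <stdint.h>

    enum acct_state { ACCT_ACTIVE, ACCT_SUSPENDED };

    /* One poll step; resume_pct must exceed suspend_pct for hysteresis. */
    static enum acct_state
    acct_watch_step(enum acct_state st, uint64_t f_bavail,
                    uint64_t f_blocks, int suspend_pct, int resume_pct)
    {
        if (st == ACCT_ACTIVE && f_bavail <= suspend_pct * f_blocks / 100)
            return ACCT_SUSPENDED;   /* disk nearly full: stop writing  */
        if (st == ACCT_SUSPENDED && f_bavail > resume_pct * f_blocks / 100)
            return ACCT_ACTIVE;      /* space recovered: resume writing */
        return st;
    }
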
*/ -#define AIO_LOCK usimple_lock( &aio_lock ) -#define AIO_UNLOCK usimple_unlock( &aio_lock ) +#define AIO_LOCK lck_mtx_lock(aio_lock) +#define AIO_UNLOCK lck_mtx_unlock(aio_lock) /* @@ -146,45 +152,44 @@ static aio_workq_entry * aio_get_some_work( void ); static boolean_t aio_last_group_io( aio_workq_entry *entryp ); static void aio_mark_requests( aio_workq_entry *entryp ); static int aio_queue_async_request( struct proc *procp, - struct aiocb *aiocbp, + user_addr_t aiocbp, int kindOfIO ); static int aio_validate( aio_workq_entry *entryp ); static void aio_work_thread( void ); static int do_aio_cancel( struct proc *p, int fd, - struct aiocb *aiocbp, + user_addr_t aiocbp, boolean_t wait_for_completion, boolean_t disable_notification ); static void do_aio_completion( aio_workq_entry *entryp ); static int do_aio_fsync( aio_workq_entry *entryp ); static int do_aio_read( aio_workq_entry *entryp ); static int do_aio_write( aio_workq_entry *entryp ); +static void do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp ); static boolean_t is_already_queued( struct proc *procp, - struct aiocb *aiocbp ); + user_addr_t aiocbp ); static int lio_create_async_entry( struct proc *procp, - struct aiocb *aiocbp, - struct sigevent *sigp, + user_addr_t aiocbp, + user_addr_t sigp, long group_tag, aio_workq_entry **entrypp ); static int lio_create_sync_entry( struct proc *procp, - struct aiocb *aiocbp, + user_addr_t aiocbp, long group_tag, aio_workq_entry **entrypp ); + /* * EXTERNAL PROTOTYPES */ /* in ...bsd/kern/sys_generic.c */ -extern struct file* holdfp( struct filedesc* fdp, int fd, int flag ); -extern int dofileread( struct proc *p, struct file *fp, int fd, - void *buf, size_t nbyte, off_t offset, - int flags, int *retval ); -extern int dofilewrite( struct proc *p, struct file *fp, int fd, - const void *buf, size_t nbyte, off_t offset, - int flags, int *retval ); -extern vm_map_t vm_map_switch( vm_map_t map ); - +extern int dofileread( struct proc *p, struct fileproc *fp, int fd, + user_addr_t bufp, user_size_t nbyte, + off_t offset, int flags, user_ssize_t *retval ); +extern int dofilewrite( struct proc *p, struct fileproc *fp, int fd, + user_addr_t bufp, user_size_t nbyte, off_t offset, + int flags, user_ssize_t *retval ); /* * aio external global variables. @@ -198,55 +203,13 @@ extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */ * aio static variables. 
*/ static aio_anchor_cb aio_anchor; -static simple_lock_data_t aio_lock; +static lck_mtx_t * aio_lock; +static lck_grp_t * aio_lock_grp; +static lck_attr_t * aio_lock_attr; +static lck_grp_attr_t * aio_lock_grp_attr; static struct zone *aio_workq_zonep; -/* - * syscall input parameters - */ -#ifndef _SYS_SYSPROTO_H_ - -struct aio_cancel_args { - int fd; - struct aiocb *aiocbp; -}; - -struct aio_error_args { - struct aiocb *aiocbp; -}; - -struct aio_fsync_args { - int op; - struct aiocb *aiocbp; -}; - -struct aio_read_args { - struct aiocb *aiocbp; -}; - -struct aio_return_args { - struct aiocb *aiocbp; -}; - -struct aio_suspend_args { - struct aiocb *const *aiocblist; - int nent; - const struct timespec *timeoutp; -}; - -struct aio_write_args { - struct aiocb *aiocbp; -}; - -struct lio_listio_args { - int mode; - struct aiocb *const *aiocblist; - int nent; - struct sigevent *sigp; -}; - -#endif /* _SYS_SYSPROTO_H_ */ /* @@ -260,9 +223,8 @@ struct lio_listio_args { int aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval ) { - struct aiocb my_aiocb; + struct user_aiocb my_aiocb; int result; - boolean_t funnel_state; KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START, (int)p, (int)uap->aiocbp, 0, 0, 0 ); @@ -277,8 +239,16 @@ aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval ) } *retval = -1; - if ( uap->aiocbp != NULL ) { - result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) ); + if ( uap->aiocbp != USER_ADDR_NULL ) { + if ( !IS_64BIT_PROCESS(p) ) { + struct aiocb aiocb32; + + result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) ); + if ( result == 0 ) + do_munge_aiocb( &aiocb32, &my_aiocb ); + } else + result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) ); + if ( result != 0 ) { result = EAGAIN; goto ExitRoutine; @@ -293,11 +263,7 @@ aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval ) goto ExitRoutine; } } - - /* current BSD code assumes funnel lock is held */ - funnel_state = thread_funnel_set( kernel_flock, TRUE ); result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE ); - (void) thread_funnel_set( kernel_flock, funnel_state ); if ( result != -1 ) { *retval = result; @@ -319,7 +285,6 @@ ExitRoutine: /* * _aio_close - internal function used to clean up async IO requests for * a file descriptor that is closing. - * NOTE - kernel funnel lock is held when we get called. * THIS MAY BLOCK. */ @@ -339,7 +304,7 @@ _aio_close( struct proc *p, int fd ) (int)p, fd, 0, 0, 0 ); /* cancel all async IO requests on our todo queues for this file descriptor */ - error = do_aio_cancel( p, fd, NULL, TRUE, FALSE ); + error = do_aio_cancel( p, fd, 0, TRUE, FALSE ); if ( error == AIO_NOTCANCELED ) { /* * AIO_NOTCANCELED is returned when we find an aio request for this process @@ -450,7 +415,8 @@ aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval ) (int)p, (int)uap->aiocbp, uap->op, 0, 0 ); *retval = 0; - if ( uap->op == O_SYNC ) + /* 0 := O_SYNC for binary backward compatibility with Panther */ + if (uap->op == O_SYNC || uap->op == 0) fsync_kind = AIO_FSYNC; #if 0 // we don't support fdatasync() call yet else if ( uap->op == O_DSYNC ) @@ -511,7 +477,7 @@ aio_read( struct proc *p, struct aio_read_args *uap, int *retval ) */ int -aio_return( struct proc *p, struct aio_return_args *uap, register_t *retval ) +aio_return( struct proc *p, struct aio_return_args *uap, user_ssize_t *retval ) { aio_workq_entry *entryp; int error; @@ -596,7 +562,6 @@ ExitRoutine: * a process that is going away due to exec(). 
We cancel any async IOs * we can and wait for those already active. We also disable signaling * for cancelled or active aio requests that complete. - NOTE - kernel funnel lock is held when we get called. * This routine MAY block! */ @@ -622,7 +587,6 @@ _aio_exec( struct proc *p ) * a process that is terminating (via exit() or exec() ). We cancel any async IOs * we can and wait for those already active. We also disable signaling * for cancelled or active aio requests that complete. This routine MAY block! - * NOTE - kernel funnel lock is held when we get called. */ __private_extern__ void @@ -646,7 +610,7 @@ _aio_exit( struct proc *p ) * cancel async IO requests on the todo work queue and wait for those * already active to complete. */ - error = do_aio_cancel( p, 0, NULL, TRUE, TRUE ); + error = do_aio_cancel( p, 0, 0, TRUE, TRUE ); if ( error == AIO_NOTCANCELED ) { /* * AIO_NOTCANCELED is returned when we find an aio request for this process @@ -696,7 +660,6 @@ _aio_exit( struct proc *p ) } AIO_UNLOCK; -ExitRoutine: KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END, (int)p, 0, 0, 0, 0 ); @@ -718,11 +681,10 @@ ExitRoutine: * were already complete. * WARNING - do not dereference aiocbp in this routine, it may point to user * land data that has not been copied in (when called from aio_cancel() ) - * NOTE - kernel funnel lock is held when we get called. */ static int -do_aio_cancel( struct proc *p, int fd, struct aiocb *aiocbp, +do_aio_cancel( struct proc *p, int fd, user_addr_t aiocbp, boolean_t wait_for_completion, boolean_t disable_notification ) { aio_workq_entry *entryp; @@ -738,9 +700,9 @@ do_aio_cancel( struct proc *p, int fd, struct aiocb *aiocbp, next_entryp = TAILQ_NEXT( entryp, aio_workq_link ); if ( p == entryp->procp ) { - if ( (aiocbp == NULL && fd == 0) || - (aiocbp != NULL && entryp->uaiocbp == aiocbp) || - (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) { + if ( (aiocbp == USER_ADDR_NULL && fd == 0) || + (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) || + (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) { /* we found a match so we remove the entry from the */ /* todo work queue and place it on the done queue */ TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link ); @@ -776,7 +738,7 @@ do_aio_cancel( struct proc *p, int fd, struct aiocb *aiocbp, else AIO_UNLOCK; - if ( aiocbp != NULL ) { + if ( aiocbp != USER_ADDR_NULL ) { return( result ); } @@ -801,9 +763,9 @@ do_aio_cancel( struct proc *p, int fd, struct aiocb *aiocbp, next_entryp = TAILQ_NEXT( entryp, aio_workq_link ); if ( p == entryp->procp ) { - if ( (aiocbp == NULL && fd == 0) || - (aiocbp != NULL && entryp->uaiocbp == aiocbp) || - (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) { + if ( (aiocbp == USER_ADDR_NULL && fd == 0) || + (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) || + (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) { + /* we found a match so we remove the entry from the */ /* todo work queue and place it on the done queue */ TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link ); @@ -820,7 +782,7 @@ do_aio_cancel( struct proc *p, int fd, struct aiocb *aiocbp, TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link ); aio_anchor.aio_done_count++; p->aio_done_count++; - if ( aiocbp != NULL ) { + if ( aiocbp != USER_ADDR_NULL ) { AIO_UNLOCK; return( result ); } @@ -834,9 +796,9 @@ do_aio_cancel( struct proc *p, int fd, struct aiocb *aiocbp, * return AIO_NOTCANCELED result.
*/ TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) { - if ( (aiocbp == NULL && fd == 0) || - (aiocbp != NULL && entryp->uaiocbp == aiocbp) || - (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) { + if ( (aiocbp == USER_ADDR_NULL && fd == 0) || + (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) || + (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) { result = AIO_NOTCANCELED; KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE, @@ -846,7 +808,7 @@ do_aio_cancel( struct proc *p, int fd, struct aiocb *aiocbp, entryp->flags |= AIO_WAITING; /* flag for special completion processing */ if ( disable_notification ) entryp->flags |= AIO_DISABLE; /* flag for special completion processing */ - if ( aiocbp != NULL ) { + if ( aiocbp != USER_ADDR_NULL ) { AIO_UNLOCK; return( result ); } @@ -860,15 +822,15 @@ do_aio_cancel( struct proc *p, int fd, struct aiocb *aiocbp, */ if ( result == -1 ) { TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) { - if ( (aiocbp == NULL && fd == 0) || - (aiocbp != NULL && entryp->uaiocbp == aiocbp) || - (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) { + if ( (aiocbp == USER_ADDR_NULL && fd == 0) || + (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) || + (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) { result = AIO_ALLDONE; KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE, (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 ); - if ( aiocbp != NULL ) { + if ( aiocbp != USER_ADDR_NULL ) { AIO_UNLOCK; return( result ); } @@ -898,10 +860,9 @@ aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval ) int error; int i, count; uint64_t abstime; - struct timespec ts; - struct timeval tv; + struct user_timespec ts; aio_workq_entry *entryp; - struct aiocb * *aiocbpp; + user_addr_t *aiocbpp; KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START, (int)p, uap->nent, 0, 0, 0 ); @@ -919,13 +880,23 @@ aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval ) goto ExitThisRoutine; } - if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) { + if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) { error = EINVAL; goto ExitThisRoutine; } - if ( uap->timeoutp != NULL ) { - error = copyin( (void *)uap->timeoutp, &ts, sizeof(ts) ); + if ( uap->timeoutp != USER_ADDR_NULL ) { + if ( proc_is64bit(p) ) { + error = copyin( uap->timeoutp, &ts, sizeof(ts) ); + } + else { + struct timespec temp; + error = copyin( uap->timeoutp, &temp, sizeof(temp) ); + if ( error == 0 ) { + ts.tv_sec = temp.tv_sec; + ts.tv_nsec = temp.tv_nsec; + } + } if ( error != 0 ) { error = EAGAIN; goto ExitThisRoutine; @@ -941,30 +912,44 @@ aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval ) clock_absolutetime_interval_to_deadline( abstime, &abstime ); } - MALLOC( aiocbpp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK ); + /* we reserve enough space for largest possible pointer size */ + MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK ); if ( aiocbpp == NULL ) { error = EAGAIN; goto ExitThisRoutine; } - /* check list of aio requests to see if any have completed */ - for ( i = 0; i < uap->nent; i++ ) { - struct aiocb *aiocbp; + /* copyin our aiocb pointers from list */ + error = copyin( uap->aiocblist, aiocbpp, + proc_is64bit(p) ? 
(uap->nent * sizeof(user_addr_t)) + : (uap->nent * sizeof(uintptr_t)) ); + if ( error != 0 ) { + error = EAGAIN; + goto ExitThisRoutine; + } - /* copyin in aiocb pointer from list */ - error = copyin( (void *)(uap->aiocblist + i), (aiocbpp + i), sizeof(aiocbp) ); - if ( error != 0 ) { - error = EAGAIN; - goto ExitThisRoutine; + /* we depend on a list of user_addr_t's so we need to munge and expand */ + /* when these pointers came from a 32-bit process */ + if ( !proc_is64bit(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) { + /* position to the last entry and work back from there */ + uintptr_t *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1); + user_addr_t *my_addrp = aiocbpp + (uap->nent - 1); + for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) { + *my_addrp = (user_addr_t) (*my_ptrp); } + } + /* check list of aio requests to see if any have completed */ + AIO_LOCK; + for ( i = 0; i < uap->nent; i++ ) { + user_addr_t aiocbp; + /* NULL elements are legal so check for 'em */ aiocbp = *(aiocbpp + i); - if ( aiocbp == NULL ) + if ( aiocbp == USER_ADDR_NULL ) continue; - + /* return immediately if any aio request in the list is done */ - AIO_LOCK; TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) { if ( entryp->uaiocbp == aiocbp ) { *retval = 0; @@ -973,7 +958,6 @@ aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval ) goto ExitThisRoutine; } } - AIO_UNLOCK; } /* for ( ; i < uap->nent; ) */ KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE, @@ -983,19 +967,15 @@ aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval ) * wait for an async IO to complete or a signal fires or timeout expires. * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal * interrupts us. If an async IO completes before a signal fires or our - * timeout expires, we get a wakeup call from aio_work_thread(). We do not - * use tsleep() here in order to avoid getting kernel funnel lock. + * timeout expires, we get a wakeup call from aio_work_thread(). */ - assert_wait( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE ); - if ( abstime > 0 ) { - thread_set_timer_deadline( abstime ); - } + assert_wait_deadline( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE, abstime ); + AIO_UNLOCK; + error = thread_block( THREAD_CONTINUE_NULL ); + if ( error == THREAD_AWAKENED ) { /* got our wakeup call from aio_work_thread() */ - if ( abstime > 0 ) { - thread_cancel_timer(); - } *retval = 0; error = 0; } @@ -1005,9 +985,6 @@ aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval ) } else { /* we were interrupted */ - if ( abstime > 0 ) { - thread_cancel_timer(); - } error = EINTR; } @@ -1066,11 +1043,13 @@ lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval ) int result; long group_tag; aio_workq_entry * *entryp_listp; + user_addr_t *aiocbpp; KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START, (int)p, uap->nent, uap->mode, 0, 0 ); entryp_listp = NULL; + aiocbpp = NULL; call_result = -1; *retval = -1; if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) { @@ -1095,27 +1074,48 @@ lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval ) * allocate a list of aio_workq_entry pointers that we will use to queue * up all our requests at once while holding our lock. 
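
aio_suspend() above, and lio_listio() just below, copy the caller's pointer array in at the caller's own pointer width and then widen 32-bit entries to user_addr_t in place; walking from the last element backward is what makes the in-place expansion safe, since every 64-bit store lands at or beyond the 32-bit slot it reads. The same loop, standalone (assuming 32-bit entries packed at the front of a 64-bit-sized buffer):

    #include <stdint.h>

    static void widen_ptr_list(uint64_t *buf, int n)
    {
        if (n <= 0)
            return;
        /* read and write back-to-front so no source entry is
         * overwritten before it has been widened */
        const uint32_t *src = (const uint32_t *)buf + (n - 1);
        uint64_t       *dst = buf + (n - 1);

        for (int i = 0; i < n; i++, src--, dst--)
            *dst = (uint64_t)*src;
    }
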
*/ - MALLOC( entryp_listp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK ); + MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK ); if ( entryp_listp == NULL ) { call_result = EAGAIN; goto ExitRoutine; } + /* we reserve enough space for largest possible pointer size */ + MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK ); + if ( aiocbpp == NULL ) { + call_result = EAGAIN; + goto ExitRoutine; + } + + /* copyin our aiocb pointers from list */ + result = copyin( uap->aiocblist, aiocbpp, + IS_64BIT_PROCESS(p) ? (uap->nent * sizeof(user_addr_t)) + : (uap->nent * sizeof(uintptr_t)) ); + if ( result != 0 ) { + call_result = EAGAIN; + goto ExitRoutine; + } + + /* we depend on a list of user_addr_t's so we need to munge and expand */ + /* when these pointers came from a 32-bit process */ + if ( !IS_64BIT_PROCESS(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) { + /* position to the last entry and work back from there */ + uintptr_t *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1); + user_addr_t *my_addrp = aiocbpp + (uap->nent - 1); + for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) { + *my_addrp = (user_addr_t) (*my_ptrp); + } + } + /* process list of aio requests */ for ( i = 0; i < uap->nent; i++ ) { - struct aiocb *my_aiocbp; + user_addr_t my_aiocbp; *(entryp_listp + i) = NULL; + my_aiocbp = *(aiocbpp + i); - /* copyin in aiocb pointer from list */ - result = copyin( (void *)(uap->aiocblist + i), &my_aiocbp, sizeof(my_aiocbp) ); - if ( result != 0 ) { - call_result = EAGAIN; - continue; - } - /* NULL elements are legal so check for 'em */ - if ( my_aiocbp == NULL ) + if ( my_aiocbp == USER_ADDR_NULL ) continue; if ( uap->mode == LIO_NOWAIT ) @@ -1150,7 +1150,8 @@ lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval ) my_map = entryp->aio_map; entryp->aio_map = VM_MAP_NULL; - result = EAGAIN; + if ( call_result == -1 ) + call_result = EAGAIN; AIO_UNLOCK; aio_free_request( entryp, my_map ); AIO_LOCK; @@ -1170,11 +1171,11 @@ lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval ) aio_anchor.lio_sync_workq_count++; } } - AIO_UNLOCK; - if ( uap->mode == LIO_NOWAIT ) + if ( uap->mode == LIO_NOWAIT ) { /* caller does not want to wait so we'll fire off a worker thread and return */ - wakeup_one( &aio_anchor.aio_async_workq ); + wakeup_one( (caddr_t) &aio_anchor.aio_async_workq ); + } else { aio_workq_entry *entryp; int error; @@ -1182,18 +1183,14 @@ lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval ) /* * mode is LIO_WAIT - handle the IO requests now. 
*/ - AIO_LOCK; entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq ); while ( entryp != NULL ) { if ( p == entryp->procp && group_tag == entryp->group_tag ) { - boolean_t funnel_state; TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link ); aio_anchor.lio_sync_workq_count--; AIO_UNLOCK; - // file system IO code path requires kernel funnel lock - funnel_state = thread_funnel_set( kernel_flock, TRUE ); if ( (entryp->flags & AIO_READ) != 0 ) { error = do_aio_read( entryp ); } @@ -1211,7 +1208,6 @@ lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval ) entryp->errorval = error; if ( error != 0 && call_result == -1 ) call_result = EIO; - (void) thread_funnel_set( kernel_flock, funnel_state ); AIO_LOCK; /* we're done with the IO request so move it on the done queue */ @@ -1227,8 +1223,8 @@ lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval ) entryp = TAILQ_NEXT( entryp, aio_workq_link ); } /* while ( entryp != NULL ) */ - AIO_UNLOCK; } /* uap->mode == LIO_WAIT */ + AIO_UNLOCK; /* call_result == -1 means we had no trouble queueing up requests */ if ( call_result == -1 ) { @@ -1239,6 +1235,8 @@ lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval ) ExitRoutine: if ( entryp_listp != NULL ) FREE( entryp_listp, M_TEMP ); + if ( aiocbpp != NULL ) + FREE( aiocbpp, M_TEMP ); KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END, (int)p, call_result, 0, 0, 0 ); @@ -1258,30 +1256,31 @@ static void aio_work_thread( void ) { aio_workq_entry *entryp; - struct uthread *uthread = (struct uthread *)get_bsdthread_info(current_act()); for( ;; ) { + AIO_LOCK; entryp = aio_get_some_work(); if ( entryp == NULL ) { /* * aio worker threads wait for some work to get queued up * by aio_queue_async_request. Once some work gets queued * it will wake up one of these worker threads just before - * returning to our caller in user land. We do not use - * tsleep() here in order to avoid getting kernel funnel lock. + * returning to our caller in user land. */ assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT ); - thread_block( THREAD_CONTINUE_NULL ); + AIO_UNLOCK; - KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_wake)) | DBG_FUNC_NONE, - 0, 0, 0, 0, 0 ); + thread_block( (thread_continue_t)aio_work_thread ); + /* NOT REACHED */ } else { int error; - boolean_t funnel_state; vm_map_t currentmap; vm_map_t oldmap = VM_MAP_NULL; task_t oldaiotask = TASK_NULL; + struct uthread *uthreadp = NULL; + + AIO_UNLOCK; KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START, (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 ); @@ -1290,12 +1289,11 @@ aio_work_thread( void ) * Assume the target's address space identity for the duration * of the IO. 
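
The reworked worker loop above sleeps with the Mach assert_wait()/thread_block() pair, and the ordering is the point: register on the event first, drop AIO_LOCK second, block third, while aio_queue_async_request() issues its wakeup_one() with the same lock still held, so no wakeup can slip between registration and blocking. A condensed producer/consumer shape of that protocol (the patch restarts its worker through a thread_block continuation rather than this plain loop; queue and lock names are illustrative, and their initialization is assumed):

    #include <sys/queue.h>
    #include <kern/locks.h>

    struct work_item { TAILQ_ENTRY(work_item) link; };
    static TAILQ_HEAD(, work_item) workq;   /* assume TAILQ_INIT'd */
    static lck_mtx_t *work_mtx;             /* assume allocated    */

    static struct work_item *get_work(void)
    {
        struct work_item *w;

        lck_mtx_lock(work_mtx);
        while ((w = TAILQ_FIRST(&workq)) == NULL) {
            assert_wait((event_t)&workq, THREAD_UNINT); /* 1: register  */
            lck_mtx_unlock(work_mtx);                   /* 2: drop lock */
            thread_block(THREAD_CONTINUE_NULL);         /* 3: sleep     */
            lck_mtx_lock(work_mtx);
        }
        TAILQ_REMOVE(&workq, w, link);
        lck_mtx_unlock(work_mtx);
        return w;
    }

    static void put_work(struct work_item *w)
    {
        lck_mtx_lock(work_mtx);
        TAILQ_INSERT_TAIL(&workq, w, link);
        wakeup_one((caddr_t)&workq);   /* lock still held: cannot race the
                                        * consumer's assert_wait above */
        lck_mtx_unlock(work_mtx);
    }
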
*/ - funnel_state = thread_funnel_set( kernel_flock, TRUE ); - currentmap = get_task_map( (current_proc())->task ); if ( currentmap != entryp->aio_map ) { - oldaiotask = uthread->uu_aio_task; - uthread->uu_aio_task = entryp->procp->task; + uthreadp = (struct uthread *) get_bsdthread_info(current_thread()); + oldaiotask = uthreadp->uu_aio_task; + uthreadp->uu_aio_task = entryp->procp->task; oldmap = vm_map_switch( entryp->aio_map ); } @@ -1316,7 +1314,7 @@ aio_work_thread( void ) entryp->errorval = error; if ( currentmap != entryp->aio_map ) { (void) vm_map_switch( oldmap ); - uthread->uu_aio_task = oldaiotask; + uthreadp->uu_aio_task = oldaiotask; } /* we're done with the IO request so pop it off the active queue and */ @@ -1344,7 +1342,6 @@ aio_work_thread( void ) } do_aio_completion( entryp ); - (void) thread_funnel_set( kernel_flock, funnel_state ); KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END, (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval, @@ -1374,16 +1371,15 @@ aio_work_thread( void ) * aio_get_some_work - get the next async IO request that is ready to be executed. * aio_fsync complicates matters a bit since we cannot do the fsync until all async * IO requests at the time the aio_fsync call came in have completed. + * NOTE - AIO_LOCK must be held by caller */ static aio_workq_entry * aio_get_some_work( void ) { aio_workq_entry *entryp; - int skip_count = 0; /* pop some work off the work queue and add to our active queue */ - AIO_LOCK; for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq ); entryp != NULL; entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) { @@ -1408,7 +1404,6 @@ aio_get_some_work( void ) aio_anchor.aio_active_count++; entryp->procp->aio_active_count++; } - AIO_UNLOCK; return( entryp ); @@ -1427,7 +1422,7 @@ aio_delay_fsync_request( aio_workq_entry *entryp ) aio_workq_entry *my_entryp; TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) { - if ( my_entryp->fsyncp != NULL && + if ( my_entryp->fsyncp != USER_ADDR_NULL && entryp->uaiocbp == my_entryp->fsyncp && entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) { return( TRUE ); @@ -1447,7 +1442,7 @@ aio_delay_fsync_request( aio_workq_entry *entryp ) */ static int -aio_queue_async_request( struct proc *procp, struct aiocb *aiocbp, int kindOfIO ) +aio_queue_async_request( struct proc *procp, user_addr_t aiocbp, int kindOfIO ) { aio_workq_entry *entryp; int result; @@ -1464,7 +1459,16 @@ aio_queue_async_request( struct proc *procp, struct aiocb *aiocbp, int kindOfIO entryp->uaiocbp = aiocbp; entryp->flags |= kindOfIO; entryp->aio_map = VM_MAP_NULL; - result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) ); + + if ( !IS_64BIT_PROCESS(procp) ) { + struct aiocb aiocb32; + + result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) ); + if ( result == 0 ) + do_munge_aiocb( &aiocb32, &entryp->aiocb ); + } else + result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) ); + if ( result != 0 ) { result = EAGAIN; goto error_exit; @@ -1510,13 +1514,12 @@ aio_queue_async_request( struct proc *procp, struct aiocb *aiocbp, int kindOfIO TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link ); aio_anchor.aio_async_workq_count++; - AIO_UNLOCK; + wakeup_one( (caddr_t) &aio_anchor.aio_async_workq ); + AIO_UNLOCK; KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE, (int)procp, (int)aiocbp, 0, 0, 0 ); - - wakeup_one( &aio_anchor.aio_async_workq ); - + return( 0 ); error_exit: @@ -1542,8 +1545,8 @@ error_exit: */ static int 
-lio_create_async_entry( struct proc *procp, struct aiocb *aiocbp, - struct sigevent *sigp, long group_tag, +lio_create_async_entry( struct proc *procp, user_addr_t aiocbp, + user_addr_t sigp, long group_tag, aio_workq_entry **entrypp ) { aio_workq_entry *entryp; @@ -1562,7 +1565,16 @@ lio_create_async_entry( struct proc *procp, struct aiocb *aiocbp, entryp->flags |= AIO_LIO; entryp->group_tag = group_tag; entryp->aio_map = VM_MAP_NULL; - result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) ); + + if ( !IS_64BIT_PROCESS(procp) ) { + struct aiocb aiocb32; + + result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) ); + if ( result == 0 ) + do_munge_aiocb( &aiocb32, &entryp->aiocb ); + } else + result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) ); + if ( result != 0 ) { result = EAGAIN; goto error_exit; @@ -1577,8 +1589,32 @@ lio_create_async_entry( struct proc *procp, struct aiocb *aiocbp, /* use sigevent passed in to lio_listio for each of our calls, but only */ /* do completion notification after the last request completes. */ - if ( sigp != NULL ) { - result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) ); + if ( sigp != USER_ADDR_NULL ) { + if ( !IS_64BIT_PROCESS(procp) ) { + struct sigevent sigevent32; + + result = copyin( sigp, &sigevent32, sizeof(sigevent32) ); + if ( result == 0 ) { + /* also need to munge aio_sigevent since it contains pointers */ + /* special case here. since we do not know if sigev_value is an */ + /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */ + /* means if we send this info back to user space we need to remember */ + /* sigev_value was not expanded for the 32-bit case. */ + /* NOTE - this does NOT affect us since we don't support sigev_value */ + /* yet in the aio context. 
*/ + //LP64 + entryp->aiocb.aio_sigevent.sigev_notify = sigevent32.sigev_notify; + entryp->aiocb.aio_sigevent.sigev_signo = sigevent32.sigev_signo; + entryp->aiocb.aio_sigevent.sigev_value.size_equivalent.sival_int = + sigevent32.sigev_value.sival_int; + entryp->aiocb.aio_sigevent.sigev_notify_function = + CAST_USER_ADDR_T(sigevent32.sigev_notify_function); + entryp->aiocb.aio_sigevent.sigev_notify_attributes = + CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes); + } + } else + result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) ); + if ( result != 0 ) { result = EAGAIN; goto error_exit; @@ -1599,7 +1635,7 @@ lio_create_async_entry( struct proc *procp, struct aiocb *aiocbp, error_exit: if ( entryp != NULL ) - zfree( aio_workq_zonep, (vm_offset_t) entryp ); + zfree( aio_workq_zonep, entryp ); return( result ); @@ -1645,7 +1681,7 @@ aio_mark_requests( aio_workq_entry *entryp ) */ static int -lio_create_sync_entry( struct proc *procp, struct aiocb *aiocbp, +lio_create_sync_entry( struct proc *procp, user_addr_t aiocbp, long group_tag, aio_workq_entry **entrypp ) { aio_workq_entry *entryp; @@ -1664,7 +1700,16 @@ lio_create_sync_entry( struct proc *procp, struct aiocb *aiocbp, entryp->flags |= AIO_LIO; entryp->group_tag = group_tag; entryp->aio_map = VM_MAP_NULL; - result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) ); + + if ( !IS_64BIT_PROCESS(procp) ) { + struct aiocb aiocb32; + + result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) ); + if ( result == 0 ) + do_munge_aiocb( &aiocb32, &entryp->aiocb ); + } else + result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) ); + if ( result != 0 ) { result = EAGAIN; goto error_exit; @@ -1687,7 +1732,7 @@ lio_create_sync_entry( struct proc *procp, struct aiocb *aiocbp, error_exit: if ( entryp != NULL ) - zfree( aio_workq_zonep, (vm_offset_t) entryp ); + zfree( aio_workq_zonep, entryp ); return( result ); @@ -1709,7 +1754,7 @@ aio_free_request( aio_workq_entry *entryp, vm_map_t the_map ) vm_map_deallocate( the_map ); } - zfree( aio_workq_zonep, (vm_offset_t) entryp ); + zfree( aio_workq_zonep, entryp ); return( 0 ); @@ -1722,8 +1767,7 @@ aio_free_request( aio_workq_entry *entryp, vm_map_t the_map ) static int aio_validate( aio_workq_entry *entryp ) { - boolean_t funnel_state; - struct file *fp; + struct fileproc *fp; int flag; int result; @@ -1746,10 +1790,10 @@ aio_validate( aio_workq_entry *entryp ) } if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) { - if ( entryp->aiocb.aio_offset < 0 || - entryp->aiocb.aio_nbytes < 0 || - entryp->aiocb.aio_nbytes > INT_MAX || - entryp->aiocb.aio_buf == NULL ) + // LP64todo - does max value for aio_nbytes need to grow? + if ( entryp->aiocb.aio_nbytes > INT_MAX || + entryp->aiocb.aio_buf == USER_ADDR_NULL || + entryp->aiocb.aio_offset < 0 ) return( EINVAL ); } @@ -1769,27 +1813,29 @@ aio_validate( aio_workq_entry *entryp ) return (EINVAL); /* validate the file descriptor and that the file was opened - * for the appropriate read / write access. This section requires - * kernel funnel lock. + * for the appropriate read / write access. 
*/ - funnel_state = thread_funnel_set( kernel_flock, TRUE ); + proc_fdlock(entryp->procp); - result = fdgetf( entryp->procp, entryp->aiocb.aio_fildes, &fp ); + result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp , 1); if ( result == 0 ) { - if ( (fp->f_flag & flag) == 0 ) { + if ( (fp->f_fglob->fg_flag & flag) == 0 ) { /* we don't have read or write access */ result = EBADF; } - else if ( fp->f_type != DTYPE_VNODE ) { + else if ( fp->f_fglob->fg_type != DTYPE_VNODE ) { /* this is not a file */ result = ESPIPE; - } + } else + fp->f_flags |= FP_AIOISSUED; + + fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp , 1); } else { result = EBADF; } - (void) thread_funnel_set( kernel_flock, funnel_state ); + proc_fdunlock(entryp->procp); return( result ); @@ -1807,7 +1853,6 @@ static int aio_get_process_count( struct proc *procp ) { aio_workq_entry *entryp; - int error; int count; /* begin with count of completed async IO requests for this process */ @@ -1898,15 +1943,15 @@ do_aio_completion( aio_workq_entry *entryp ) AIO_LOCK; active_requests = aio_active_requests_for_process( entryp->procp ); - AIO_UNLOCK; + //AIO_UNLOCK; if ( active_requests < 1 ) { /* no active aio requests for this process, continue exiting */ + wakeup_one( (caddr_t) &entryp->procp->AIO_CLEANUP_SLEEP_CHAN ); KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE, (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 ); - - wakeup_one( &entryp->procp->AIO_CLEANUP_SLEEP_CHAN ); } + AIO_UNLOCK; return; } @@ -1920,10 +1965,12 @@ do_aio_completion( aio_workq_entry *entryp ) * call wakeup for them. If we do mark them we should unmark them after * the aio_suspend wakes up. */ + AIO_LOCK; + wakeup_one( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN ); + AIO_UNLOCK; + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE, (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 ); - - wakeup_one( &entryp->procp->AIO_SUSPEND_SLEEP_CHAN ); return; @@ -1971,20 +2018,27 @@ aio_last_group_io( aio_workq_entry *entryp ) static int do_aio_read( aio_workq_entry *entryp ) { - struct file *fp; + struct fileproc *fp; int error; - fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FREAD ); + if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) ) + return(error); + if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) { + fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0); + return(EBADF); + } if ( fp != NULL ) { error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes, - (void *)entryp->aiocb.aio_buf, + entryp->aiocb.aio_buf, entryp->aiocb.aio_nbytes, entryp->aiocb.aio_offset, FOF_OFFSET, &entryp->returnval ); - frele( fp ); + fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0); } - else + else { + fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0); error = EBADF; + } return( error ); @@ -1997,20 +2051,28 @@ do_aio_read( aio_workq_entry *entryp ) static int do_aio_write( aio_workq_entry *entryp ) { - struct file *fp; + struct fileproc *fp; int error; - fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FWRITE ); + if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) ) + return(error); + if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) { + fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0); + return(EBADF); + } if ( fp != NULL ) { error = dofilewrite( entryp->procp, fp, entryp->aiocb.aio_fildes, - (const void *)entryp->aiocb.aio_buf, + entryp->aiocb.aio_buf, entryp->aiocb.aio_nbytes, entryp->aiocb.aio_offset, 
FOF_OFFSET, &entryp->returnval ); - frele( fp ); + + fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0); } - else + else { + fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0); error = EBADF; + } return( error ); @@ -2038,21 +2100,32 @@ aio_active_requests_for_process( struct proc *procp ) static int do_aio_fsync( aio_workq_entry *entryp ) { - register struct vnode *vp; - struct file *fp; - int error; + struct vfs_context context; + struct vnode *vp; + struct fileproc *fp; + int error; /* * NOTE - we will not support AIO_DSYNC until fdatasync() is supported. * AIO_DSYNC is caught before we queue up a request and flagged as an error. * The following was shamelessly extracted from fsync() implementation. */ - error = getvnode( entryp->procp, entryp->aiocb.aio_fildes, &fp ); + + error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp); if ( error == 0 ) { - vp = (struct vnode *)fp->f_data; - vn_lock( vp, LK_EXCLUSIVE | LK_RETRY, entryp->procp ); - error = VOP_FSYNC( vp, fp->f_cred, MNT_WAIT, entryp->procp ); - VOP_UNLOCK( vp, 0, entryp->procp ); + if ( (error = vnode_getwithref(vp)) ) { + fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0); + entryp->returnval = -1; + return(error); + } + context.vc_proc = entryp->procp; + context.vc_ucred = fp->f_fglob->fg_cred; + + error = VNOP_FSYNC( vp, MNT_WAIT, &context); + + (void)vnode_put(vp); + + fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0); } if ( error != 0 ) entryp->returnval = -1; @@ -2071,7 +2144,7 @@ do_aio_fsync( aio_workq_entry *entryp ) static boolean_t is_already_queued( struct proc *procp, - struct aiocb *aiocbp ) + user_addr_t aiocbp ) { aio_workq_entry *entryp; boolean_t result; @@ -2124,7 +2197,13 @@ aio_init( void ) { int i; - simple_lock_init( &aio_lock ); + aio_lock_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(aio_lock_grp_attr); + aio_lock_grp = lck_grp_alloc_init("aio", aio_lock_grp_attr); + aio_lock_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(aio_lock_attr); + + aio_lock = lck_mtx_alloc_init(aio_lock_grp, aio_lock_attr); AIO_LOCK; TAILQ_INIT( &aio_anchor.aio_async_workq ); @@ -2173,5 +2252,39 @@ _aio_create_worker_threads( int num ) task_t get_aiotask(void) { - return ((struct uthread *)get_bsdthread_info(current_act()))->uu_aio_task; + return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task; +} + + +/* + * In the case of an aiocb from a + * 32-bit process we need to expand some longs and pointers to the correct + * sizes in order to let downstream code always work on the same type of + * aiocb (in our case that is a user_aiocb) + */ +static void +do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp ) +{ + the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes; + the_user_aiocbp->aio_offset = my_aiocbp->aio_offset; + the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf); + the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes; + the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio; + the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode; + + /* special case here. since we do not know if sigev_value is an */ + /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */ + /* means if we send this info back to user space we need to remember */ + /* sigev_value was not expanded for the 32-bit case. */ + /* NOTE - this does NOT affect us since we don't support sigev_value */ + /* yet in the aio context. 
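To make the pointer-widening concrete before the munge body below: a 32-bit caller's aiocb carries 32-bit pointers and sizes, while the kernel-side user_aiocb carries user_addr_t / user_size_t fields. A minimal sketch of the expansion follows, with hypothetical layouts (the real structures live in the aio headers and contain more fields, including aio_sigevent):

#include <stdint.h>

typedef uint64_t user_addr_t;           /* 64-bit-safe user pointer */
typedef uint64_t user_size_t;

struct aiocb32_sketch {                 /* as copied in from a 32-bit process */
        int      aio_fildes;
        int64_t  aio_offset;
        uint32_t aio_buf;               /* pointer is only 32 bits wide */
        uint32_t aio_nbytes;
        int      aio_reqprio;
        int      aio_lio_opcode;
};

struct user_aiocb_sketch {              /* what downstream code always sees */
        int         aio_fildes;
        int64_t     aio_offset;
        user_addr_t aio_buf;            /* widened, as CAST_USER_ADDR_T does */
        user_size_t aio_nbytes;
        int         aio_reqprio;
        int         aio_lio_opcode;
};

static void
munge_sketch(const struct aiocb32_sketch *in, struct user_aiocb_sketch *out)
{
        out->aio_fildes     = in->aio_fildes;
        out->aio_offset     = in->aio_offset;
        out->aio_buf        = (user_addr_t)in->aio_buf;   /* zero-extend */
        out->aio_nbytes     = (user_size_t)in->aio_nbytes;
        out->aio_reqprio    = in->aio_reqprio;
        out->aio_lio_opcode = in->aio_lio_opcode;
}

sigev_value is the one field this cannot widen safely, for exactly the reason the comment gives: it is a union of int and pointer, and the kernel cannot know which interpretation the caller intended.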
*/ + //LP64 + the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify; + the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo; + the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int = + my_aiocbp->aio_sigevent.sigev_value.sival_int; + the_user_aiocbp->aio_sigevent.sigev_notify_function = + CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function); + the_user_aiocbp->aio_sigevent.sigev_notify_attributes = + CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes); } diff --git a/bsd/kern/kern_audit.c b/bsd/kern/kern_audit.c index 2fa7b1d50..131047494 100644 --- a/bsd/kern/kern_audit.c +++ b/bsd/kern/kern_audit.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -19,49 +19,52 @@ * * @APPLE_LICENSE_HEADER_END@ */ - #include <sys/param.h> -#include <sys/file.h> #include <sys/fcntl.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/namei.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/queue.h> #include <sys/systm.h> #include <sys/time.h> #include <sys/ucred.h> #include <sys/uio.h> #include <sys/unistd.h> -#include <sys/vnode.h> +#include <sys/file_internal.h> +#include <sys/vnode_internal.h> #include <sys/user.h> #include <sys/syscall.h> #include <sys/malloc.h> #include <sys/un.h> -#include <netinet/in.h> -#include <sys/socketvar.h> -#include <sys/protosw.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> +#include <sys/vfs_context.h> #include <sys/domain.h> -#include <sys/mount.h> -#include <net/route.h> -#include <netinet/in_pcb.h> +#include <sys/protosw.h> +#include <sys/socketvar.h> #include <bsm/audit.h> #include <bsm/audit_kevents.h> #include <bsm/audit_klib.h> #include <bsm/audit_kernel.h> -#include <mach/message.h> -#include <mach/port.h> +#include <mach/host_priv.h> #include <mach/host_special_ports.h> -#include <mach/audit_triggers.h> +#include <mach/audit_triggers_server.h> +#include <kern/host.h> +#include <kern/kalloc.h> +#include <kern/zalloc.h> #include <kern/lock.h> #include <kern/wait_queue.h> -#include <kern/zalloc.h> -#include <kern/kalloc.h> +#include <kern/sched_prim.h> + +#include <net/route.h> -#include <audit.h> +#include <netinet/in.h> +#include <netinet/in_pcb.h> #ifdef AUDIT @@ -73,8 +76,10 @@ */ /* #define AUDIT_EXCESSIVELY_VERBOSE */ #ifdef AUDIT_EXCESSIVELY_VERBOSE +#define AUDIT_PRINTF_ONLY #define AUDIT_PRINTF(x) printf x #else +#define AUDIT_PRINTF_ONLY __unused #define AUDIT_PRINTF(X) #endif @@ -108,9 +113,9 @@ static mutex_t *audit_mtx; * not yet in the queue, which is needed to estimate the total * size of the combined set of records outstanding in the system. */ -static TAILQ_HEAD(, kaudit_record) audit_q; -static int audit_q_len; -static int audit_pre_q_len; +static TAILQ_HEAD(, kaudit_record) audit_q; +static size_t audit_q_len; +static size_t audit_pre_q_len; static wait_queue_t audit_wait_queue; static zone_t audit_zone; @@ -123,6 +128,12 @@ static zone_t audit_zone; static int audit_worker_event; #define AUDIT_WORKER_EVENT ((event_t)&audit_worker_event) +/* + * The audit worker thread (which is lazy started when we first + * rotate the audit log. 
+ */ +static thread_t audit_worker_thread = THREAD_NULL; + /* * When an audit log is rotated, the actual rotation must be performed * by the audit worker thread, as it may have outstanding writes on the @@ -140,9 +151,9 @@ static int audit_worker_event; static int audit_replacement_event; #define AUDIT_REPLACEMENT_EVENT ((event_t)&audit_replacement_event) -static int audit_replacement_flag; +static int audit_replacement_flag; static struct vnode *audit_replacement_vp; -static struct ucred *audit_replacement_cred; +static kauth_cred_t audit_replacement_cred; /* * Wait queue for auditing threads that cannot commit the audit @@ -157,8 +168,8 @@ static struct au_qctrl audit_qctrl; /* * Flags to use on audit files when opening and closing. */ -const static int audit_open_flags = FWRITE | O_APPEND; -const static int audit_close_flags = FWRITE | O_APPEND; +static const int audit_open_flags = FWRITE | O_APPEND; +static const int audit_close_flags = FWRITE | O_APPEND; /* * Global audit statistiscs. @@ -203,54 +214,61 @@ static void audit_free(struct kaudit_record *ar) { if (ar->k_ar.ar_arg_upath1 != NULL) { - kfree((vm_offset_t)ar->k_ar.ar_arg_upath1, MAXPATHLEN); + kfree(ar->k_ar.ar_arg_upath1, MAXPATHLEN); } if (ar->k_ar.ar_arg_upath2 != NULL) { - kfree((vm_offset_t)ar->k_ar.ar_arg_upath2, MAXPATHLEN); + kfree(ar->k_ar.ar_arg_upath2, MAXPATHLEN); + } if (ar->k_ar.ar_arg_kpath1 != NULL) { - kfree((vm_offset_t)ar->k_ar.ar_arg_kpath1, MAXPATHLEN); + kfree(ar->k_ar.ar_arg_kpath1, MAXPATHLEN); + } if (ar->k_ar.ar_arg_kpath2 != NULL) { - kfree((vm_offset_t)ar->k_ar.ar_arg_kpath2, MAXPATHLEN); + kfree(ar->k_ar.ar_arg_kpath2, MAXPATHLEN); + } if (ar->k_ar.ar_arg_text != NULL) { - kfree((vm_offset_t)ar->k_ar.ar_arg_text, MAXPATHLEN); + kfree(ar->k_ar.ar_arg_text, MAXPATHLEN); + } if (ar->k_udata != NULL) { - kfree((vm_offset_t)ar->k_udata, (vm_size_t)ar->k_ulen); + kfree(ar->k_udata, ar->k_ulen); + } - zfree(audit_zone, (vm_offset_t)ar); + zfree(audit_zone, ar); } static int -audit_write(struct vnode *vp, struct kaudit_record *ar, struct ucred *cred, +audit_write(struct vnode *vp, struct kaudit_record *ar, kauth_cred_t cred, struct proc *p) { - struct statfs *mnt_stat = &vp->v_mount->mnt_stat; + struct vfsstatfs *mnt_stat = &vp->v_mount->mnt_vfsstat; int ret; struct au_record *bsm; - struct vattr vattr; + /* KVV maybe we should take a context as a param to audit_write? */ + struct vfs_context context; + off_t file_size; mach_port_t audit_port; - /* + /* * First, gather statistics on the audit log file and file system * so that we know how we're doing on space. In both cases, * if we're unable to perform the operation, we drop the record * and return. However, this is arguably an assertion failure. */ - ret = VFS_STATFS(vp->v_mount, mnt_stat, p); - if (ret) - goto out; - - ret = VOP_GETATTR(vp, &vattr, cred, p); + context.vc_proc = p; + context.vc_ucred = cred; + ret = vfs_update_vfsstat(vp->v_mount, &context); if (ret) goto out; /* update the global stats struct */ - audit_fstat.af_currsz = vattr.va_size; - + if ((ret = vnode_size(vp, &file_size, &context)) != 0) + goto out; + audit_fstat.af_currsz = file_size; + /* * Send a message to the audit daemon when disk space is getting * low. 
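The low-space check itself sits in unchanged context that this hunk does not show. Roughly what it computes, sketched under the assumptions that aq_minfree is a percentage of total blocks and that the daemon is notified via an AUDIT_TRIGGER_LOW_SPACE trigger (both the helper and its exact arithmetic are reconstructions, not lines from this patch):

static void
low_space_check_sketch(struct vfsstatfs *mnt_stat, mach_port_t audit_port,
    int minfree_pct)
{
        uint64_t temp;

        if (minfree_pct > 0) {
                /* number of blocks that must remain free */
                temp = (mnt_stat->f_blocks * (uint64_t)minfree_pct) / 100;
                if (mnt_stat->f_bfree < temp)
                        (void)audit_triggers(audit_port,
                            AUDIT_TRIGGER_LOW_SPACE);
        }
}

The hunk just below widens temp from long to uint64_t, which keeps this kind of block arithmetic safe on large volumes.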
@@ -262,7 +280,7 @@ audit_write(struct vnode *vp, struct kaudit_record *ar, struct ucred *cred, printf("Cannot get audit control port\n"); if (audit_port != MACH_PORT_NULL) { - long temp; + uint64_t temp; /* * If we fall below percent free blocks, then trigger the @@ -290,7 +308,7 @@ audit_write(struct vnode *vp, struct kaudit_record *ar, struct ucred *cred, */ if ((audit_fstat.af_filesz != 0) && (audit_file_rotate_wait == 0) && - (vattr.va_size >= audit_fstat.af_filesz)) { + (file_size >= audit_fstat.af_filesz)) { audit_file_rotate_wait = 1; ret = audit_triggers(audit_port, AUDIT_TRIGGER_FILE_FULL); @@ -334,10 +352,15 @@ audit_write(struct vnode *vp, struct kaudit_record *ar, struct ucred *cred, * we ignore errors. */ if (ar->k_ar_commit & AR_COMMIT_USER) { - ret = vn_rdwr(UIO_WRITE, vp, (void *)ar->k_udata, ar->k_ulen, - (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, cred, NULL, p); - if (ret) + if (vnode_getwithref(vp) == 0) { + ret = vn_rdwr(UIO_WRITE, vp, (void *)ar->k_udata, ar->k_ulen, + (off_t)0, UIO_SYSSPACE32, IO_APPEND|IO_UNIT, cred, NULL, p); + vnode_put(vp); + if (ret) + goto out; + } else { goto out; + } } /* @@ -371,9 +394,11 @@ audit_write(struct vnode *vp, struct kaudit_record *ar, struct ucred *cred, * done before this function is called. This function will then * take the BSM record as a parameter. */ - ret = (vn_rdwr(UIO_WRITE, vp, (void *)bsm->data, bsm->len, - (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, cred, NULL, p)); - + if ((ret = vnode_getwithref(vp)) == 0) { + ret = (vn_rdwr(UIO_WRITE, vp, (void *)bsm->data, bsm->len, + (off_t)0, UIO_SYSSPACE32, IO_APPEND|IO_UNIT, cred, NULL, p)); + vnode_put(vp); + } kau_free(bsm); out: @@ -385,9 +410,7 @@ out: */ if (audit_in_failure && audit_q_len == 0 && audit_pre_q_len == 0) { - VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p); - (void)VOP_FSYNC(vp, cred, MNT_WAIT, p); - VOP_UNLOCK(vp, 0, p); + (void)VNOP_FSYNC(vp, MNT_WAIT, &context); panic("Audit store overflow; record queue drained."); } @@ -395,14 +418,14 @@ out: } static void -audit_worker() +audit_worker(void) { int do_replacement_signal, error, release_funnel; TAILQ_HEAD(, kaudit_record) ar_worklist; - struct kaudit_record *ar, *ar_start, *ar_stop; + struct kaudit_record *ar; struct vnode *audit_vp, *old_vp; - - struct ucred *audit_cred, *old_cred; + kauth_cred_t audit_cred; + kauth_cred_t old_cred; struct proc *audit_p; AUDIT_PRINTF(("audit_worker starting\n")); @@ -456,8 +479,8 @@ audit_worker() AUDIT_PRINTF(("Closing old audit file\n")); vn_close(old_vp, audit_close_flags, old_cred, audit_p); - crfree(old_cred); - old_cred = NULL; + kauth_cred_rele(old_cred); + old_cred = NOCRED; old_vp = NULL; AUDIT_PRINTF(("Audit file closed\n")); } @@ -492,7 +515,8 @@ audit_worker() AUDIT_PRINTF(("audit_worker waiting\n")); ret = wait_queue_assert_wait(audit_wait_queue, AUDIT_WORKER_EVENT, - THREAD_UNINT); + THREAD_UNINT, + 0); mutex_unlock(audit_mtx); assert(ret == THREAD_WAITING); @@ -501,6 +525,7 @@ audit_worker() AUDIT_PRINTF(("audit_worker woken up\n")); AUDIT_PRINTF(("audit_worker: new vp = %p; value of flag %d\n", audit_replacement_vp, audit_replacement_flag)); + mutex_lock(audit_mtx); continue; } @@ -561,7 +586,6 @@ audit_worker() TAILQ_INSERT_TAIL(&ar_worklist, ar, k_q); } - mutex_unlock(audit_mtx); release_funnel = 0; while ((ar = TAILQ_FIRST(&ar_worklist))) { @@ -575,17 +599,16 @@ audit_worker() thread_funnel_set(kernel_flock, TRUE); release_funnel = 1; } - VOP_LEASE(audit_vp, audit_p, audit_cred, - LEASE_WRITE); error = audit_write(audit_vp, ar, audit_cred, audit_p); - if (error 
&& audit_panic_on_write_fail) + if (error && audit_panic_on_write_fail) { panic("audit_worker: write error %d\n", error); - else if (error) + } else if (error) { printf("audit_worker: write error %d\n", error); } + } audit_free(ar); } if (release_funnel) @@ -623,7 +646,7 @@ audit_init(void) audit_qctrl.aq_bufsz = AQ_BUFSZ; audit_qctrl.aq_minfree = AU_FS_MINFREE; - audit_mtx = mutex_alloc(ETAP_NO_TRACE); + audit_mtx = mutex_alloc(0); audit_wait_queue = wait_queue_alloc(SYNC_POLICY_FIFO); audit_zone = zinit(sizeof(struct kaudit_record), AQ_HIWATER*sizeof(struct kaudit_record), @@ -632,12 +655,10 @@ audit_init(void) /* Initialize the BSM audit subsystem. */ kau_init(); - - kernel_thread(kernel_task, audit_worker); } static void -audit_rotate_vnode(struct ucred *cred, struct vnode *vp) +audit_rotate_vnode(kauth_cred_t cred, struct vnode *vp) { int ret; @@ -652,7 +673,8 @@ audit_rotate_vnode(struct ucred *cred, struct vnode *vp) "flag\n")); ret = wait_queue_assert_wait(audit_wait_queue, AUDIT_REPLACEMENT_EVENT, - THREAD_UNINT); + THREAD_UNINT, + 0); mutex_unlock(audit_mtx); assert(ret == THREAD_WAITING); @@ -668,10 +690,16 @@ audit_rotate_vnode(struct ucred *cred, struct vnode *vp) audit_replacement_vp = vp; /* - * Wake up the audit worker to perform the exchange once we - * release the mutex. + * Start or wake up the audit worker to perform the exchange. + * It will have to wait until we release the mutex. */ - wait_queue_wakeup_one(audit_wait_queue, AUDIT_WORKER_EVENT, THREAD_AWAKENED); + if (audit_worker_thread == THREAD_NULL) + audit_worker_thread = kernel_thread(kernel_task, + audit_worker); + else + wait_queue_wakeup_one(audit_wait_queue, + AUDIT_WORKER_EVENT, + THREAD_AWAKENED); /* * Wait for the audit_worker to broadcast that a replacement has @@ -682,7 +710,8 @@ audit_rotate_vnode(struct ucred *cred, struct vnode *vp) "replacement\n")); ret = wait_queue_assert_wait(audit_wait_queue, AUDIT_REPLACEMENT_EVENT, - THREAD_UNINT); + THREAD_UNINT, + 0); mutex_unlock(audit_mtx); assert(ret == THREAD_WAITING); @@ -706,7 +735,7 @@ audit_shutdown(void) static __inline__ struct uthread * curuthread(void) { - return (get_bsdthread_info(current_act())); + return (get_bsdthread_info(current_thread())); } static __inline__ struct kaudit_record * @@ -727,25 +756,20 @@ currecord(void) * work, since we pre-select only based on the AUE_audit event type, * not the event type submitted as part of the user audit data. */ -struct audit_args { - void * record; - int length; -}; /* ARGSUSED */ int -audit(struct proc *p, struct audit_args *uap, register_t *retval) +audit(struct proc *p, struct audit_args *uap, __unused register_t *retval) { - register struct pcred *pc = p->p_cred; int error; void * rec; struct kaudit_record *ar; struct uthread *uthr; - error = suser(pc->pc_ucred, &p->p_acflag); + error = suser(kauth_cred_get(), &p->p_acflag); if (error) return (error); - if ((uap->length <= 0) || (uap->length > audit_qctrl.aq_bufsz)) + if ((uap->length <= 0) || (uap->length > (int)audit_qctrl.aq_bufsz)) return (EINVAL); ar = currecord(); @@ -756,7 +780,7 @@ audit(struct proc *p, struct audit_args *uap, register_t *retval) if (ar == NULL) { uthr = curuthread(); if (uthr == NULL) /* can this happen? */ - return (ENOTSUP); + return (ENOTSUP); /* This is not very efficient; we're required to allocate * a complete kernel audit record just so the user record @@ -796,35 +820,29 @@ free_out: /* audit_syscall_exit() will free the audit record on the thread * even if we allocated it above. 
*/ - kfree((vm_offset_t)rec, (vm_size_t)uap->length); + kfree(rec, uap->length); return (error); } /* * System call to manipulate auditing. */ -struct auditon_args { - int cmd; - void * data; - int length; -}; /* ARGSUSED */ int -auditon(struct proc *p, struct auditon_args *uap, register_t *retval) +auditon(struct proc *p, __unused struct auditon_args *uap, __unused register_t *retval) { - register struct pcred *pc = p->p_cred; int ret; int len; union auditon_udata udata; struct proc *tp; AUDIT_ARG(cmd, uap->cmd); - ret = suser(pc->pc_ucred, &p->p_acflag); + ret = suser(kauth_cred_get(), &p->p_acflag); if (ret) return (ret); len = uap->length; - if ((len <= 0) || (len > sizeof(union auditon_udata))) + if ((len <= 0) || (len > (int)sizeof(union auditon_udata))) return (EINVAL); memset((void *)&udata, 0, sizeof(udata)); @@ -850,7 +868,7 @@ auditon(struct proc *p, struct auditon_args *uap, register_t *retval) return (ret); AUDIT_ARG(auditon, &udata); break; - } +} /* XXX Need to implement these commands by accessing the global * values associated with the commands. @@ -865,9 +883,9 @@ auditon(struct proc *p, struct auditon_args *uap, register_t *retval) case A_SETPOLICY: if (udata.au_policy & ~(AUDIT_CNT|AUDIT_AHLT)) return (EINVAL); - /* +/* * XXX - Need to wake up waiters if the policy relaxes? - */ + */ audit_fail_stop = ((udata.au_policy & AUDIT_CNT) == 0); audit_panic_on_write_fail = (udata.au_policy & AUDIT_AHLT); break; @@ -940,16 +958,16 @@ auditon(struct proc *p, struct auditon_args *uap, register_t *retval) if ((tp = pfind(udata.au_aupinfo.ap_pid)) == NULL) return (EINVAL); - udata.au_aupinfo.ap_auid = tp->p_au->ai_auid; + udata.au_aupinfo.ap_auid = tp->p_ucred->cr_au.ai_auid; udata.au_aupinfo.ap_mask.am_success = - tp->p_au->ai_mask.am_success; + tp->p_ucred->cr_au.ai_mask.am_success; udata.au_aupinfo.ap_mask.am_failure = - tp->p_au->ai_mask.am_failure; + tp->p_ucred->cr_au.ai_mask.am_failure; udata.au_aupinfo.ap_termid.machine = - tp->p_au->ai_termid.machine; + tp->p_ucred->cr_au.ai_termid.machine; udata.au_aupinfo.ap_termid.port = - tp->p_au->ai_termid.port; - udata.au_aupinfo.ap_asid = tp->p_au->ai_asid; + tp->p_ucred->cr_au.ai_termid.port; + udata.au_aupinfo.ap_asid = tp->p_ucred->cr_au.ai_asid; break; case A_SETPMASK: if (udata.au_aupinfo.ap_pid < 1) @@ -957,10 +975,49 @@ auditon(struct proc *p, struct auditon_args *uap, register_t *retval) if ((tp = pfind(udata.au_aupinfo.ap_pid)) == NULL) return (EINVAL); - tp->p_au->ai_mask.am_success = - udata.au_aupinfo.ap_mask.am_success; - tp->p_au->ai_mask.am_failure = - udata.au_aupinfo.ap_mask.am_failure; + /* + * we are modifying the audit info in a credential so we need a new + * credential (or take another reference on an existing credential that + * matches our new one). We must do this because the audit info in the + * credential is used as part of our hash key. Get current credential + * in the target process and take a reference while we muck with it. + */ + for (;;) { + kauth_cred_t my_cred, my_new_cred; + struct auditinfo temp_auditinfo; + + my_cred = kauth_cred_proc_ref(tp); + /* + * set the credential with new info. If there is no change we get back + * the same credential we passed in. 
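This copy/compare/swap idiom is repeated essentially verbatim three times in this file (A_SETPMASK here, then setauid and setaudit below). Distilled into a single hypothetical helper, the pattern the loop implements is:

static void
proc_set_auditinfo_sketch(struct proc *p, struct auditinfo *new_info)
{
        for (;;) {
                kauth_cred_t my_cred, my_new_cred;

                /* take a reference on the current credential */
                my_cred = kauth_cred_proc_ref(p);

                /* returns the same cred if nothing actually changed */
                my_new_cred = kauth_cred_setauditinfo(my_cred, new_info);

                if (my_cred != my_new_cred) {
                        proc_lock(p);
                        if (p->p_ucred != my_cred) {
                                /* another thread swapped the cred while we
                                 * worked on our copy; drop both and retry */
                                proc_unlock(p);
                                kauth_cred_rele(my_cred);
                                kauth_cred_rele(my_new_cred);
                                continue;
                        }
                        p->p_ucred = my_new_cred;
                        proc_unlock(p);
                }
                /* drop the reference taken above */
                kauth_cred_rele(my_cred);
                break;
        }
}

The retry is what makes the update safe without holding the proc lock across kauth_cred_setauditinfo(): the swap only commits if p_ucred is still the credential the modified copy was derived from.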
+ */ + temp_auditinfo = my_cred->cr_au; + temp_auditinfo.ai_mask.am_success = + udata.au_aupinfo.ap_mask.am_success; + temp_auditinfo.ai_mask.am_failure = + udata.au_aupinfo.ap_mask.am_failure; + my_new_cred = kauth_cred_setauditinfo(my_cred, &temp_auditinfo); + + if (my_cred != my_new_cred) { + proc_lock(tp); + /* need to protect for a race where another thread also changed + * the credential after we took our reference. If p_ucred has + * changed then we should restart this again with the new cred. + */ + if (tp->p_ucred != my_cred) { + proc_unlock(tp); + kauth_cred_rele(my_cred); + kauth_cred_rele(my_new_cred); + /* try again */ + continue; + } + tp->p_ucred = my_new_cred; + proc_unlock(tp); + } + /* drop our extra reference */ + kauth_cred_rele(my_cred); + break; + } break; case A_SETFSIZE: if ((udata.au_fstat.af_filesz != 0) && @@ -979,9 +1036,9 @@ auditon(struct proc *p, struct auditon_args *uap, register_t *retval) return (ENOSYS); break; case A_SETKAUDIT: - return (ENOSYS); + return (ENOSYS); break; - } +} /* Copy data back to userspace for the GET comands */ switch (uap->cmd) { case A_GETPOLICY: @@ -1009,47 +1066,82 @@ auditon(struct proc *p, struct auditon_args *uap, register_t *retval) * System calls to manage the user audit information. * XXXAUDIT May need to lock the proc structure. */ -struct getauid_args { - au_id_t *auid; -}; /* ARGSUSED */ int -getauid(struct proc *p, struct getauid_args *uap, register_t *retval) +getauid(struct proc *p, struct getauid_args *uap, __unused register_t *retval) { - register struct pcred *pc = p->p_cred; int error; - error = copyout((void *)&p->p_au->ai_auid, (void *)uap->auid, - sizeof(*uap->auid)); + error = copyout((void *)&kauth_cred_get()->cr_au.ai_auid, + uap->auid, sizeof(au_id_t)); if (error) return (error); return (0); } -struct setauid_args { - au_id_t *auid; -}; /* ARGSUSED */ int -setauid(struct proc *p, struct setauid_args *uap, register_t *retval) +setauid(struct proc *p, struct setauid_args *uap, __unused register_t *retval) { - register struct pcred *pc = p->p_cred; int error; + au_id_t temp_au_id; - error = suser(pc->pc_ucred, &p->p_acflag); + error = suser(kauth_cred_get(), &p->p_acflag); if (error) return (error); - error = copyin((void *)uap->auid, (void *)&p->p_au->ai_auid, - sizeof(p->p_au->ai_auid)); + error = copyin(uap->auid, + (void *)&temp_au_id, + sizeof(au_id_t)); if (error) return (error); + /* + * we are modifying the audit info in a credential so we need a new + * credential (or take another reference on an existing credential that + * matches our new one). We must do this because the audit info in the + * credential is used as part of our hash key. Get current credential + * in the target process and take a reference while we muck with it. + */ + for (;;) { + kauth_cred_t my_cred, my_new_cred; + struct auditinfo temp_auditinfo; + + my_cred = kauth_cred_proc_ref(p); + /* + * set the credential with new info. If there is no change we get back + * the same credential we passed in. + */ + temp_auditinfo = my_cred->cr_au; + temp_auditinfo.ai_auid = temp_au_id; + my_new_cred = kauth_cred_setauditinfo(my_cred, &temp_auditinfo); + + if (my_cred != my_new_cred) { + proc_lock(p); + /* need to protect for a race where another thread also changed + * the credential after we took our reference. If p_ucred has + * changed then we should restart this again with the new cred. 
+ */ + if (p->p_ucred != my_cred) { + proc_unlock(p); + kauth_cred_rele(my_cred); + kauth_cred_rele(my_new_cred); + /* try again */ + continue; + } + p->p_ucred = my_new_cred; + proc_unlock(p); + } + /* drop our extra reference */ + kauth_cred_rele(my_cred); + break; + } + /* propagate the change from the process to Mach task */ set_security_token(p); - audit_arg_auid(p->p_au->ai_auid); + audit_arg_auid(kauth_cred_get()->cr_au.ai_auid); return (0); } @@ -1060,80 +1152,106 @@ setauid(struct proc *p, struct setauid_args *uap, register_t *retval) * filtered out - but the rest of the information is * returned. */ -struct getaudit_args { - struct auditinfo *auditinfo; -}; /* ARGSUSED */ int -getaudit(struct proc *p, struct getaudit_args *uap, register_t *retval) +getaudit(struct proc *p, struct getaudit_args *uap, __unused register_t *retval) { - register struct pcred *pc = p->p_cred; - struct auditinfo ai = *p->p_au; + struct auditinfo ai; int error; + ai = kauth_cred_get()->cr_au; + /* only superuser gets to see the real mask */ - error = suser(pc->pc_ucred, &p->p_acflag); + error = suser(kauth_cred_get(), &p->p_acflag); if (error) { ai.ai_mask.am_success = ~0; ai.ai_mask.am_failure = ~0; } - error = copyout((void *)&ai, (void *)uap->auditinfo, sizeof(ai)); + error = copyout(&ai, uap->auditinfo, sizeof(ai)); if (error) return (error); return (0); } -struct setaudit_args { - struct auditinfo *auditinfo; -}; /* ARGSUSED */ int -setaudit(struct proc *p, struct setaudit_args *uap, register_t *retval) +setaudit(struct proc *p, struct setaudit_args *uap, __unused register_t *retval) { - register struct pcred *pc = p->p_cred; int error; + struct auditinfo temp_auditinfo; - error = suser(pc->pc_ucred, &p->p_acflag); + error = suser(kauth_cred_get(), &p->p_acflag); if (error) return (error); - error = copyin((void *)uap->auditinfo, (void *)p->p_au, - sizeof(*p->p_au)); + + error = copyin(uap->auditinfo, + (void *)&temp_auditinfo, + sizeof(temp_auditinfo)); if (error) return (error); + /* + * we are modifying the audit info in a credential so we need a new + * credential (or take another reference on an existing credential that + * matches our new one). We must do this because the audit info in the + * credential is used as part of our hash key. Get current credential + * in the target process and take a reference while we muck with it. + */ + for (;;) { + kauth_cred_t my_cred, my_new_cred; + + my_cred = kauth_cred_proc_ref(p); + /* + * set the credential with new info. If there is no change we get back + * the same credential we passed in. + */ + my_new_cred = kauth_cred_setauditinfo(my_cred, &temp_auditinfo); + + if (my_cred != my_new_cred) { + proc_lock(p); + /* need to protect for a race where another thread also changed + * the credential after we took our reference. If p_ucred has + * changed then we should restart this again with the new cred. 
+ */ + if (p->p_ucred != my_cred) { + proc_unlock(p); + kauth_cred_rele(my_cred); + kauth_cred_rele(my_new_cred); + /* try again */ + continue; + } + p->p_ucred = my_new_cred; + proc_unlock(p); + } + /* drop our extra reference */ + kauth_cred_rele(my_cred); + break; + } + /* propagate the change from the process to Mach task */ set_security_token(p); - audit_arg_auditinfo(p->p_au); + audit_arg_auditinfo(&p->p_ucred->cr_au); return (0); } -struct getaudit_addr_args { - struct auditinfo_addr *auditinfo_addr; - int length; -}; /* ARGSUSED */ int -getaudit_addr(struct proc *p, struct getaudit_addr_args *uap, register_t *retval) +getaudit_addr(struct proc *p, __unused struct getaudit_addr_args *uap, __unused register_t *retval) { return (ENOSYS); } -struct setaudit_addr_args { - struct auditinfo_addr *auditinfo_addr; - int length; -}; /* ARGSUSED */ int -setaudit_addr(struct proc *p, struct setaudit_addr_args *uap, register_t *retval) +setaudit_addr(struct proc *p, __unused struct setaudit_addr_args *uap, __unused register_t *retval) { - register struct pcred *pc = p->p_cred; int error; - error = suser(pc->pc_ucred, &p->p_acflag); + error = suser(kauth_cred_get(), &p->p_acflag); if (error) return (error); return (ENOSYS); @@ -1143,20 +1261,20 @@ setaudit_addr(struct proc *p, struct setaudit_addr_args *uap, register_t *retval * Syscall to manage audit files. * */ -struct auditctl_args { - char *path; -}; /* ARGSUSED */ int -auditctl(struct proc *p, struct auditctl_args *uap) +auditctl(struct proc *p, struct auditctl_args *uap, __unused register_t *retval) { - struct kaudit_record *ar; struct nameidata nd; - struct ucred *cred; + kauth_cred_t cred; struct vnode *vp; - int error, flags, ret; + int error, flags; + struct vfs_context context; + + context.vc_proc = p; + context.vc_ucred = kauth_cred_get(); - error = suser(p->p_ucred, &p->p_acflag); + error = suser(kauth_cred_get(), &p->p_acflag); if (error) return (error); @@ -1168,26 +1286,31 @@ auditctl(struct proc *p, struct auditctl_args *uap) * validity checks, and grab another reference to the current * credential. */ - if (uap->path != NULL) { + if (uap->path != 0) { NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, - UIO_USERSPACE, uap->path, p); + (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), + uap->path, &context); flags = audit_open_flags; error = vn_open(&nd, flags, 0); if (error) goto out; - VOP_UNLOCK(nd.ni_vp, 0, p); vp = nd.ni_vp; if (vp->v_type != VREG) { - vn_close(vp, audit_close_flags, p->p_ucred, p); + vn_close(vp, audit_close_flags, kauth_cred_get(), p); + vnode_put(vp); error = EINVAL; goto out; } - cred = p->p_ucred; - crhold(cred); + cred = kauth_cred_get_with_ref(); audit_suspended = 0; } - + /* + * a vp and cred of NULL is valid at this point + * and indicates we're to turn off auditing... + */ audit_rotate_vnode(cred, vp); + if (vp) + vnode_put(vp); out: return (error); } @@ -1200,7 +1323,7 @@ out: * MPSAFE */ struct kaudit_record * -audit_new(int event, struct proc *p, struct uthread *uthread) +audit_new(int event, struct proc *p, __unused struct uthread *uthread) { struct kaudit_record *ar; int no_record; @@ -1248,14 +1371,14 @@ audit_new(int event, struct proc *p, struct uthread *uthread) /* Export the subject credential. 
*/ cru2x(p->p_ucred, &ar->k_ar.ar_subj_cred); - ar->k_ar.ar_subj_ruid = p->p_cred->p_ruid; - ar->k_ar.ar_subj_rgid = p->p_cred->p_rgid; + ar->k_ar.ar_subj_ruid = p->p_ucred->cr_ruid; + ar->k_ar.ar_subj_rgid = p->p_ucred->cr_rgid; ar->k_ar.ar_subj_egid = p->p_ucred->cr_groups[0]; - ar->k_ar.ar_subj_auid = p->p_au->ai_auid; - ar->k_ar.ar_subj_asid = p->p_au->ai_asid; + ar->k_ar.ar_subj_auid = p->p_ucred->cr_au.ai_auid; + ar->k_ar.ar_subj_asid = p->p_ucred->cr_au.ai_asid; ar->k_ar.ar_subj_pid = p->p_pid; - ar->k_ar.ar_subj_amask = p->p_au->ai_mask; - ar->k_ar.ar_subj_term = p->p_au->ai_termid; + ar->k_ar.ar_subj_amask = p->p_ucred->cr_au.ai_mask; + ar->k_ar.ar_subj_term = p->p_ucred->cr_au.ai_termid; bcopy(p->p_comm, ar->k_ar.ar_subj_comm, MAXCOMLEN); return (ar); @@ -1268,7 +1391,6 @@ audit_new(int event, struct proc *p, struct uthread *uthread) void audit_abort(struct kaudit_record *ar) { - mutex_lock(audit_mtx); audit_pre_q_len--; mutex_unlock(audit_mtx); @@ -1326,7 +1448,7 @@ audit_commit(struct kaudit_record *ar, int error, int retval) if (au_preselect(ar->k_ar.ar_event, aumask, sorf) != 0) ar->k_ar_commit |= AR_COMMIT_KERNEL; - if (ar->k_ar_commit & (AR_COMMIT_USER | AR_COMMIT_KERNEL) == 0) { + if ((ar->k_ar_commit & (AR_COMMIT_USER | AR_COMMIT_KERNEL)) == 0) { mutex_lock(audit_mtx); audit_pre_q_len--; mutex_unlock(audit_mtx); @@ -1348,7 +1470,6 @@ audit_commit(struct kaudit_record *ar, int error, int retval) nanotime(&ar->k_ar.ar_endtime); mutex_lock(audit_mtx); - /* * Note: it could be that some records initiated while audit was * enabled should still be committed? @@ -1359,7 +1480,7 @@ audit_commit(struct kaudit_record *ar, int error, int retval) audit_free(ar); return; } - + /* * Constrain the number of committed audit records based on * the configurable parameter. @@ -1368,7 +1489,8 @@ audit_commit(struct kaudit_record *ar, int error, int retval) ret = wait_queue_assert_wait(audit_wait_queue, AUDIT_COMMIT_EVENT, - THREAD_UNINT); + THREAD_UNINT, + 0); mutex_unlock(audit_mtx); assert(ret == THREAD_WAITING); @@ -1405,39 +1527,41 @@ audit_syscall_enter(unsigned short code, struct proc *proc, /* Check which audit mask to use; either the kernel non-attributable * event mask or the process audit mask. */ - if (proc->p_au->ai_auid == AU_DEFAUDITID) + if (proc->p_ucred->cr_au.ai_auid == AU_DEFAUDITID) aumask = &audit_nae_mask; else - aumask = &proc->p_au->ai_mask; - + aumask = &proc->p_ucred->cr_au.ai_mask; + /* * Allocate an audit record, if preselection allows it, and store * in the BSD thread for later use. */ if (au_preselect(audit_event, aumask, - AU_PRS_FAILURE | AU_PRS_SUCCESS)) { + AU_PRS_FAILURE | AU_PRS_SUCCESS)) { /* * If we're out of space and need to suspend unprivileged * processes, do that here rather than trying to allocate * another audit record. 
*/ if (audit_in_failure && - suser(proc->p_ucred, &proc->p_acflag) != 0) { + suser(kauth_cred_get(), &proc->p_acflag) != 0) { int ret; + assert(audit_worker_thread != THREAD_NULL); ret = wait_queue_assert_wait(audit_wait_queue, - AUDIT_FAILURE_EVENT, THREAD_UNINT); + AUDIT_FAILURE_EVENT, THREAD_UNINT, 0); assert(ret == THREAD_WAITING); (void)thread_block(THREAD_CONTINUE_NULL); panic("audit_failing_stop: thread continued"); } - uthread->uu_ar = audit_new(audit_event, proc, uthread); - } else - uthread->uu_ar = NULL; -} + uthread->uu_ar = audit_new(audit_event, proc, uthread); + } else { + uthread->uu_ar = NULL; + } + } void -audit_syscall_exit(int error, struct proc *proc, struct uthread *uthread) +audit_syscall_exit(int error, AUDIT_PRINTF_ONLY struct proc *proc, struct uthread *uthread) { int retval; @@ -1455,8 +1579,9 @@ audit_syscall_exit(int error, struct proc *proc, struct uthread *uthread) retval = uthread->uu_rval[0]; audit_commit(uthread->uu_ar, error, retval); - if (uthread->uu_ar != NULL) + if (uthread->uu_ar != NULL) { AUDIT_PRINTF(("audit record committed by pid %d\n", proc->p_pid)); + } uthread->uu_ar = NULL; } @@ -1488,10 +1613,10 @@ audit_mach_syscall_enter(unsigned short audit_event) /* Check which audit mask to use; either the kernel non-attributable * event mask or the process audit mask. */ - if (proc->p_au->ai_auid == AU_DEFAUDITID) + if (proc->p_ucred->cr_au.ai_auid == AU_DEFAUDITID) aumask = &audit_nae_mask; else - aumask = &proc->p_au->ai_mask; + aumask = &proc->p_ucred->cr_au.ai_mask; /* * Allocate an audit record, if desired, and store in the BSD @@ -1526,7 +1651,7 @@ audit_mach_syscall_exit(int retval, struct uthread *uthread) * record for this event. */ void -audit_arg_addr(void * addr) +audit_arg_addr(user_addr_t addr) { struct kaudit_record *ar; @@ -1534,12 +1659,12 @@ audit_arg_addr(void * addr) if (ar == NULL) return; - ar->k_ar.ar_arg_addr = addr; + ar->k_ar.ar_arg_addr = CAST_DOWN(void *, addr); /* XXX */ ar->k_ar.ar_valid_arg |= ARG_ADDR; } void -audit_arg_len(int len) +audit_arg_len(user_size_t len) { struct kaudit_record *ar; @@ -1547,7 +1672,7 @@ audit_arg_len(int len) if (ar == NULL) return; - ar->k_ar.ar_arg_len = len; + ar->k_ar.ar_arg_len = CAST_DOWN(int, len); /* XXX */ ar->k_ar.ar_valid_arg |= ARG_LEN; } @@ -1610,9 +1735,9 @@ audit_arg_uid(uid_t uid, uid_t euid, uid_t ruid, uid_t suid) } void -audit_arg_groupset(gid_t *gidset, u_int gidset_size) +audit_arg_groupset(const gid_t *gidset, u_int gidset_size) { - int i; + uint i; struct kaudit_record *ar; ar = currecord(); @@ -1626,7 +1751,7 @@ audit_arg_groupset(gid_t *gidset, u_int gidset_size) } void -audit_arg_login(char *login) +audit_arg_login(const char *login) { struct kaudit_record *ar; @@ -1647,7 +1772,7 @@ audit_arg_login(char *login) } void -audit_arg_ctlname(int *name, int namelen) +audit_arg_ctlname(const int *name, int namelen) { struct kaudit_record *ar; @@ -1730,7 +1855,6 @@ void audit_arg_pid(pid_t pid) { struct kaudit_record *ar; - struct proc *p; ar = currecord(); if (ar == NULL) @@ -1738,7 +1862,6 @@ audit_arg_pid(pid_t pid) ar->k_ar.ar_arg_pid = pid; ar->k_ar.ar_valid_arg |= ARG_PID; - } void @@ -1750,15 +1873,13 @@ audit_arg_process(struct proc *p) if ((ar == NULL) || (p == NULL)) return; - /* XXX May need to lock the credentials structures */ - ar->k_ar.ar_arg_auid = p->p_au->ai_auid; + ar->k_ar.ar_arg_auid = p->p_ucred->cr_au.ai_auid; ar->k_ar.ar_arg_euid = p->p_ucred->cr_uid; ar->k_ar.ar_arg_egid = p->p_ucred->cr_groups[0]; - ar->k_ar.ar_arg_ruid = p->p_cred->p_ruid; - 
ar->k_ar.ar_arg_rgid = p->p_cred->p_rgid; - ar->k_ar.ar_arg_asid = p->p_au->ai_asid; - - ar->k_ar.ar_arg_termid = p->p_au->ai_termid; + ar->k_ar.ar_arg_ruid = p->p_ucred->cr_ruid; + ar->k_ar.ar_arg_rgid = p->p_ucred->cr_rgid; + ar->k_ar.ar_arg_asid = p->p_ucred->cr_au.ai_asid; + ar->k_ar.ar_arg_termid = p->p_ucred->cr_au.ai_termid; ar->k_ar.ar_valid_arg |= ARG_AUID | ARG_EUID | ARG_EGID | ARG_RUID | ARG_RGID | ARG_ASID | ARG_TERMID | ARG_PROCESS; @@ -1832,7 +1953,7 @@ audit_arg_auid(uid_t auid) } void -audit_arg_auditinfo(struct auditinfo *au_info) +audit_arg_auditinfo(const struct auditinfo *au_info) { struct kaudit_record *ar; @@ -1850,7 +1971,7 @@ audit_arg_auditinfo(struct auditinfo *au_info) } void -audit_arg_text(char *text) +audit_arg_text(const char *text) { struct kaudit_record *ar; @@ -1900,7 +2021,7 @@ audit_arg_svipc_cmd(int cmd) } void -audit_arg_svipc_perm(struct ipc_perm *perm) +audit_arg_svipc_perm(const struct ipc_perm *perm) { struct kaudit_record *ar; @@ -1955,7 +2076,7 @@ audit_arg_posix_ipc_perm(uid_t uid, gid_t gid, mode_t mode) } void -audit_arg_auditon(union auditon_udata *udata) +audit_arg_auditon(const union auditon_udata *udata) { struct kaudit_record *ar; @@ -1963,32 +2084,32 @@ audit_arg_auditon(union auditon_udata *udata) if (ar == NULL) return; - bcopy((void *)udata, &ar->k_ar.ar_arg_auditon, + bcopy((const void *)udata, &ar->k_ar.ar_arg_auditon, sizeof(ar->k_ar.ar_arg_auditon)); ar->k_ar.ar_valid_arg |= ARG_AUDITON; } -/* +/* * Audit information about a file, either the file's vnode info, or its * socket address info. */ void -audit_arg_file(struct proc *p, struct file *fp) +audit_arg_file(__unused struct proc *p, const struct fileproc *fp) { struct kaudit_record *ar; struct socket *so; struct inpcb *pcb; - if (fp->f_type == DTYPE_VNODE) { - audit_arg_vnpath((struct vnode *)fp->f_data, ARG_VNODE1); + if (fp->f_fglob->fg_type == DTYPE_VNODE) { + audit_arg_vnpath_withref((struct vnode *)fp->f_fglob->fg_data, ARG_VNODE1); return; } - if (fp->f_type == DTYPE_SOCKET) { + if (fp->f_fglob->fg_type == DTYPE_SOCKET) { ar = currecord(); if (ar == NULL) return; - so = (struct socket *)fp->f_data; + so = (struct socket *)fp->f_fglob->fg_data; if (INP_CHECK_SOCKAF(so, PF_INET)) { if (so->so_pcb == NULL) return; @@ -2013,51 +2134,6 @@ audit_arg_file(struct proc *p, struct file *fp) } -/* - * Initialize the audit information for the a process, presumably the first - * process in the system. - * XXX It is not clear what the initial values should be for session ID, - * terminal ID etc. - */ -void -audit_proc_init(struct proc *p) -{ - MALLOC_ZONE(p->p_au, struct auditinfo *, sizeof(*p->p_au), - M_SUBPROC, M_WAITOK); - - bzero((void *)p->p_au, sizeof(*p->p_au)); - - p->p_au->ai_auid = AU_DEFAUDITID; -} - -/* - * Copy the audit info from the parent process to the child process when - * a fork takes place. - * XXX Need to check for failure from the memory allocation, in here - * as well as in any functions that use the process auditing info. - */ -void -audit_proc_fork(struct proc *parent, struct proc *child) -{ - /* Always set up the audit information pointer as this function - * should only be called when the proc is new. If proc structures - * are ever cached and reused, then this behavior will leak memory. - */ - MALLOC_ZONE(child->p_au, struct auditinfo *, sizeof(*child->p_au), - M_SUBPROC, M_WAITOK); - - bcopy(parent->p_au, child->p_au, sizeof(*child->p_au)); -} - -/* - * Free the auditing structure for the process. 
- */ -void -audit_proc_free(struct proc *p) -{ - FREE_ZONE((void *)p->p_au, sizeof(*p->p_au), M_SUBPROC); - p->p_au = NULL; -} /* * Store a path as given by the user process for auditing into the audit @@ -2074,7 +2150,7 @@ audit_arg_upath(struct proc *p, char *upath, u_int64_t flags) if (p == NULL || upath == NULL) return; /* nothing to do! */ - if (flags & (ARG_UPATH1 | ARG_UPATH2) == 0) + if ((flags & (ARG_UPATH1 | ARG_UPATH2)) == 0) return; ar = currecord(); @@ -2101,9 +2177,9 @@ audit_arg_upath(struct proc *p, char *upath, u_int64_t flags) ar->k_ar.ar_valid_arg |= ARG_UPATH1; else ar->k_ar.ar_valid_arg |= ARG_UPATH2; - } else { - kfree((vm_offset_t)*pathp, MAXPATHLEN); - *pathp = NULL; + } else { + kfree(*pathp, MAXPATHLEN); + *pathp = NULL; } } @@ -2112,7 +2188,7 @@ audit_arg_upath(struct proc *p, char *upath, u_int64_t flags) * record. * * It is assumed that the caller will hold any vnode locks necessary to - * perform a VOP_GETATTR() on the passed vnode. + * perform a VNOP_GETATTR() on the passed vnode. * * XXX: The attr code is very similar to vfs_vnops.c:vn_stat(), but * always provides access to the generation number as we need that @@ -2125,12 +2201,13 @@ void audit_arg_vnpath(struct vnode *vp, u_int64_t flags) { struct kaudit_record *ar; - struct vattr vattr; + struct vnode_attr va; int error; int len; char **pathp; struct vnode_au_info *vnp; struct proc *p; + struct vfs_context context; if (vp == NULL) return; @@ -2139,7 +2216,7 @@ audit_arg_vnpath(struct vnode *vp, u_int64_t flags) if (ar == NULL) /* This will be the case for unaudited system calls */ return; - if (flags & (ARG_VNODE1 | ARG_VNODE2) == 0) + if ((flags & (ARG_VNODE1 | ARG_VNODE2)) == 0) return; p = current_proc(); @@ -2170,32 +2247,40 @@ audit_arg_vnpath(struct vnode *vp, u_int64_t flags) */ len = MAXPATHLEN; if (vn_getpath(vp, *pathp, &len) == 0) { - if (flags & ARG_VNODE1) - ar->k_ar.ar_valid_arg |= ARG_KPATH1; - else - ar->k_ar.ar_valid_arg |= ARG_KPATH2; + if (flags & ARG_VNODE1) + ar->k_ar.ar_valid_arg |= ARG_KPATH1; + else + ar->k_ar.ar_valid_arg |= ARG_KPATH2; } else { - kfree((vm_offset_t)*pathp, MAXPATHLEN); + kfree(*pathp, MAXPATHLEN); *pathp = NULL; } - /* - * XXX: We'd assert the vnode lock here, only Darwin doesn't - * appear to have vnode locking assertions. - */ - error = VOP_GETATTR(vp, &vattr, p->p_ucred, p); + context.vc_proc = p; + context.vc_ucred = kauth_cred_get(); + + VATTR_INIT(&va); + VATTR_WANTED(&va, va_mode); + VATTR_WANTED(&va, va_uid); + VATTR_WANTED(&va, va_gid); + VATTR_WANTED(&va, va_rdev); + VATTR_WANTED(&va, va_fsid); + VATTR_WANTED(&va, va_fileid); + VATTR_WANTED(&va, va_gen); + error = vnode_getattr(vp, &va, &context); if (error) { /* XXX: How to handle this case? */ return; } - vnp->vn_mode = vattr.va_mode; - vnp->vn_uid = vattr.va_uid; - vnp->vn_gid = vattr.va_gid; - vnp->vn_dev = vattr.va_rdev; - vnp->vn_fsid = vattr.va_fsid; - vnp->vn_fileid = vattr.va_fileid; - vnp->vn_gen = vattr.va_gen; + /* XXX do we want to fall back here when these aren't supported? 
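One plausible answer to that XXX, sketched here as an assumption rather than anything this patch does: the new vnode_attr API lets callers test which attributes the filesystem actually returned via VATTR_IS_SUPPORTED(), so unsupported fields could be given sentinels instead of being read uninitialized:

static void
vn_au_fill_sketch(struct vnode_au_info *vnp, struct vnode_attr *vap)
{
        /* only trust fields the filesystem reported back;
         * the sentinel values here are illustrative choices */
        vnp->vn_mode = VATTR_IS_SUPPORTED(vap, va_mode) ? vap->va_mode : 0;
        vnp->vn_uid  = VATTR_IS_SUPPORTED(vap, va_uid)  ? vap->va_uid : (uid_t)-1;
        vnp->vn_gid  = VATTR_IS_SUPPORTED(vap, va_gid)  ? vap->va_gid : (gid_t)-1;
        vnp->vn_gen  = VATTR_IS_SUPPORTED(vap, va_gen)  ? vap->va_gen : 0;
}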
*/ + vnp->vn_mode = va.va_mode; + vnp->vn_uid = va.va_uid; + vnp->vn_gid = va.va_gid; + vnp->vn_dev = va.va_rdev; + vnp->vn_fsid = va.va_fsid; + vnp->vn_fileid = (u_long)va.va_fileid; + vnp->vn_gen = va.va_gen; if (flags & ARG_VNODE1) ar->k_ar.ar_valid_arg |= ARG_VNODE1; else @@ -2204,7 +2289,16 @@ audit_arg_vnpath(struct vnode *vp, u_int64_t flags) } void -audit_arg_mach_port1(mach_port_t port) +audit_arg_vnpath_withref(struct vnode *vp, u_int64_t flags) +{ + if (vp == NULL || vnode_getwithref(vp)) + return; + audit_arg_vnpath(vp, flags); + (void)vnode_put(vp); +} + +void +audit_arg_mach_port1(mach_port_name_t port) { struct kaudit_record *ar; @@ -2217,7 +2311,7 @@ audit_arg_mach_port1(mach_port_t port) } void -audit_arg_mach_port2(mach_port_t port) +audit_arg_mach_port2(mach_port_name_t port) { struct kaudit_record *ar; @@ -2237,15 +2331,16 @@ audit_arg_mach_port2(mach_port_t port) void audit_sysclose(struct proc *p, int fd) { - struct file *fp; + struct fileproc *fp; + struct vnode *vp; audit_arg_fd(fd); - if (getvnode(p, fd, &fp) != 0) + if (fp_getfvp(p, fd, &fp, &vp) != 0) return; - audit_arg_vnpath((struct vnode *)fp->f_data, ARG_VNODE1); - + audit_arg_vnpath_withref((struct vnode *)fp->f_fglob->fg_data, ARG_VNODE1); + file_drop(fd); } #else /* !AUDIT */ @@ -2316,22 +2411,4 @@ auditctl(struct proc *p, struct auditctl_args *uap, register_t *retval) return (ENOSYS); } -void -audit_proc_init(struct proc *p) -{ - -} - -void -audit_proc_fork(struct proc *parent, struct proc *child) -{ - -} - -void -audit_proc_free(struct proc *p) -{ - -} - #endif /* AUDIT */ diff --git a/bsd/kern/kern_authorization.c b/bsd/kern/kern_authorization.c new file mode 100644 index 000000000..b5dbe6706 --- /dev/null +++ b/bsd/kern/kern_authorization.c @@ -0,0 +1,1014 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +/* + * Centralized authorisation framework. + */ + +#include <sys/appleapiopts.h> +#include <sys/param.h> /* XXX trim includes */ +#include <sys/acct.h> +#include <sys/systm.h> +#include <sys/ucred.h> +#include <sys/proc_internal.h> +#include <sys/timeb.h> +#include <sys/times.h> +#include <sys/malloc.h> +#include <sys/vnode_internal.h> +#include <sys/kauth.h> +#include <sys/stat.h> + +#include <bsm/audit_kernel.h> + +#include <sys/mount.h> +#include <sys/sysproto.h> +#include <mach/message.h> +#include <mach/host_security.h> + +#include <kern/locks.h> + + +/* + * Authorization scopes. 
+ */ + +lck_grp_t *kauth_lck_grp; +static lck_mtx_t *kauth_scope_mtx; +#define KAUTH_SCOPELOCK() lck_mtx_lock(kauth_scope_mtx); +#define KAUTH_SCOPEUNLOCK() lck_mtx_unlock(kauth_scope_mtx); + +/* + * We support listeners for scopes that have not been registered yet. + * If a listener comes in for a scope that is not active we hang the listener + * off our kauth_dangling_listeners list and once the scope becomes active we + * remove it from kauth_dangling_listeners and add it to the active scope. + */ +struct kauth_listener { + TAILQ_ENTRY(kauth_listener) kl_link; + const char * kl_identifier; + kauth_scope_callback_t kl_callback; + void * kl_idata; +}; + +/* XXX - kauth_todo - there is a race if a scope listener is removed while we + * we are in the kauth_authorize_action code path. We intentionally do not take + * a scope lock in order to get the best possible performance. we will fix this + * post Tiger. + * Until the race is fixed our kext clients are responsible for all active + * requests that may be in their callback code or on the way to their callback + * code before they free kauth_listener.kl_callback or kauth_listener.kl_idata. + * We keep copies of these in our kauth_local_listener in an attempt to limit + * our expose to unlisten race. + */ +struct kauth_local_listener { + kauth_listener_t kll_listenerp; + kauth_scope_callback_t kll_callback; + void * kll_idata; +}; +typedef struct kauth_local_listener *kauth_local_listener_t; + +static TAILQ_HEAD(,kauth_listener) kauth_dangling_listeners; + +/* + * Scope listeners need to be reworked to be dynamic. + * We intentionally used a static table to avoid locking issues with linked + * lists. The listeners may be called quite often. + * XXX - kauth_todo + */ +#define KAUTH_SCOPE_MAX_LISTENERS 15 + +struct kauth_scope { + TAILQ_ENTRY(kauth_scope) ks_link; + volatile struct kauth_local_listener ks_listeners[KAUTH_SCOPE_MAX_LISTENERS]; + const char * ks_identifier; + kauth_scope_callback_t ks_callback; + void * ks_idata; + u_int ks_flags; +}; + +/* values for kauth_scope.ks_flags */ +#define KS_F_HAS_LISTENERS (1 << 0) + +static TAILQ_HEAD(,kauth_scope) kauth_scopes; + +static int kauth_add_callback_to_scope(kauth_scope_t sp, kauth_listener_t klp); +static void kauth_scope_init(void); +static kauth_scope_t kauth_alloc_scope(const char *identifier, kauth_scope_callback_t callback, void *idata); +static kauth_listener_t kauth_alloc_listener(const char *identifier, kauth_scope_callback_t callback, void *idata); +#if 0 +static int kauth_scope_valid(kauth_scope_t scope); +#endif + +kauth_scope_t kauth_scope_process; +static int kauth_authorize_process_callback(kauth_cred_t _credential, void *_idata, kauth_action_t _action, + uintptr_t arg0, uintptr_t arg1, __unused uintptr_t arg2, __unused uintptr_t arg3); +kauth_scope_t kauth_scope_generic; +static int kauth_authorize_generic_callback(kauth_cred_t _credential, void *_idata, kauth_action_t _action, + uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3); +kauth_scope_t kauth_scope_fileop; + +extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int); +extern char * get_pathbuff(void); +extern void release_pathbuff(char *path); + +/* + * Initialization. 
+ */
+void
+kauth_init(void)
+{
+	lck_grp_attr_t	*grp_attributes;
+
+	TAILQ_INIT(&kauth_scopes);
+	TAILQ_INIT(&kauth_dangling_listeners);
+
+	/* set up our lock group */
+	grp_attributes = lck_grp_attr_alloc_init();
+	kauth_lck_grp = lck_grp_alloc_init("kauth", grp_attributes);
+	lck_grp_attr_free(grp_attributes);
+
+	/* bring up kauth subsystem components */
+	kauth_cred_init();
+	kauth_identity_init();
+	kauth_groups_init();
+	kauth_scope_init();
+	kauth_resolver_init();
+
+	/* can't alloc locks after this */
+	lck_grp_free(kauth_lck_grp);
+	kauth_lck_grp = NULL;
+}
+
+static void
+kauth_scope_init(void)
+{
+	kauth_scope_mtx = lck_mtx_alloc_init(kauth_lck_grp, 0 /*LCK_ATTR_NULL*/);
+	kauth_scope_process = kauth_register_scope(KAUTH_SCOPE_PROCESS, kauth_authorize_process_callback, NULL);
+	kauth_scope_generic = kauth_register_scope(KAUTH_SCOPE_GENERIC, kauth_authorize_generic_callback, NULL);
+	kauth_scope_fileop = kauth_register_scope(KAUTH_SCOPE_FILEOP, NULL, NULL);
+}
+
+/*
+ * Scope registration.
+ */
+
+static kauth_scope_t
+kauth_alloc_scope(const char *identifier, kauth_scope_callback_t callback, void *idata)
+{
+	kauth_scope_t	sp;
+
+	/*
+	 * Allocate and populate the scope structure.
+	 */
+	MALLOC(sp, kauth_scope_t, sizeof(*sp), M_KAUTH, M_WAITOK);
+	if (sp == NULL)
+		return(NULL);
+	bzero(&sp->ks_listeners, sizeof(sp->ks_listeners));
+	sp->ks_flags = 0;
+	sp->ks_identifier = identifier;
+	sp->ks_idata = idata;
+	sp->ks_callback = callback;
+	return(sp);
+}
+
+static kauth_listener_t
+kauth_alloc_listener(const char *identifier, kauth_scope_callback_t callback, void *idata)
+{
+	kauth_listener_t	lsp;
+
+	/*
+	 * Allocate and populate the listener structure.
+	 */
+	MALLOC(lsp, kauth_listener_t, sizeof(*lsp), M_KAUTH, M_WAITOK);
+	if (lsp == NULL)
+		return(NULL);
+	lsp->kl_identifier = identifier;
+	lsp->kl_idata = idata;
+	lsp->kl_callback = callback;
+	return(lsp);
+}
+
+kauth_scope_t
+kauth_register_scope(const char *identifier, kauth_scope_callback_t callback, void *idata)
+{
+	kauth_scope_t	sp, tsp;
+	kauth_listener_t klp;
+
+	if ((sp = kauth_alloc_scope(identifier, callback, idata)) == NULL)
+		return(NULL);
+
+	/*
+	 * Lock the list and insert.
+	 */
+	KAUTH_SCOPELOCK();
+	TAILQ_FOREACH(tsp, &kauth_scopes, ks_link) {
+		/* duplicate! */
+		if (strcmp(tsp->ks_identifier, identifier) == 0) {
+			KAUTH_SCOPEUNLOCK();
+			FREE(sp, M_KAUTH);
+			return(NULL);
+		}
+	}
+	TAILQ_INSERT_TAIL(&kauth_scopes, sp, ks_link);
+
+	/*
+	 * Look for listeners waiting for this scope, move them to the active scope
+	 * listener table.
+	 * Note that we have to restart the scan every time we remove an entry
+	 * from the list, since we can't remove the current item from the list.
+	 */
+restart:
+	TAILQ_FOREACH(klp, &kauth_dangling_listeners, kl_link) {
+		if (strcmp(klp->kl_identifier, sp->ks_identifier) == 0) {
+			/* found a match on the dangling listener list.  Add it to
+			 * the active scope.
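+			 * (kauth_add_callback_to_scope() fails with ENOSPC once the
+			 * scope's fixed table of KAUTH_SCOPE_MAX_LISTENERS listeners
+			 * is full; in that case the listener stays on the dangling
+			 * list.)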
+ */ + if (kauth_add_callback_to_scope(sp, klp) == 0) { + TAILQ_REMOVE(&kauth_dangling_listeners, klp, kl_link); + } + else { +#if 0 + printf("%s - failed to add listener to scope \"%s\" \n", __FUNCTION__, sp->ks_identifier); +#endif + break; + } + goto restart; + } + } + + KAUTH_SCOPEUNLOCK(); + return(sp); +} + + + +void +kauth_deregister_scope(kauth_scope_t scope) +{ + int i; + + KAUTH_SCOPELOCK(); + + TAILQ_REMOVE(&kauth_scopes, scope, ks_link); + + /* relocate listeners back to the waiting list */ + for (i = 0; i < KAUTH_SCOPE_MAX_LISTENERS; i++) { + if (scope->ks_listeners[i].kll_listenerp != NULL) { + TAILQ_INSERT_TAIL(&kauth_dangling_listeners, scope->ks_listeners[i].kll_listenerp, kl_link); + scope->ks_listeners[i].kll_listenerp = NULL; + /* + * XXX - kauth_todo - WARNING, do not clear kll_callback or + * kll_idata here. they are part of our scope unlisten race hack + */ + } + } + KAUTH_SCOPEUNLOCK(); + FREE(scope, M_KAUTH); + + return; +} + +kauth_listener_t +kauth_listen_scope(const char *identifier, kauth_scope_callback_t callback, void *idata) +{ + kauth_listener_t klp; + kauth_scope_t sp; + + if ((klp = kauth_alloc_listener(identifier, callback, idata)) == NULL) + return(NULL); + + /* + * Lock the scope list and check to see whether this scope already exists. + */ + KAUTH_SCOPELOCK(); + TAILQ_FOREACH(sp, &kauth_scopes, ks_link) { + if (strcmp(sp->ks_identifier, identifier) == 0) { + /* scope exists, add it to scope listener table */ + if (kauth_add_callback_to_scope(sp, klp) == 0) { + KAUTH_SCOPEUNLOCK(); + return(klp); + } + /* table already full */ + KAUTH_SCOPEUNLOCK(); + FREE(klp, M_KAUTH); + return(NULL); + } + } + + /* scope doesn't exist, put on waiting list. */ + TAILQ_INSERT_TAIL(&kauth_dangling_listeners, klp, kl_link); + + KAUTH_SCOPEUNLOCK(); + + return(klp); +} + +void +kauth_unlisten_scope(kauth_listener_t listener) +{ + kauth_scope_t sp; + kauth_listener_t klp; + int i, listener_count, do_free; + + KAUTH_SCOPELOCK(); + + /* search the active scope for this listener */ + TAILQ_FOREACH(sp, &kauth_scopes, ks_link) { + do_free = 0; + if ((sp->ks_flags & KS_F_HAS_LISTENERS) != 0) { + listener_count = 0; + for (i = 0; i < KAUTH_SCOPE_MAX_LISTENERS; i++) { + if (sp->ks_listeners[i].kll_listenerp == listener) { + sp->ks_listeners[i].kll_listenerp = NULL; + do_free = 1; + /* + * XXX - kauth_todo - WARNING, do not clear kll_callback or + * kll_idata here. they are part of our scope unlisten race hack + */ + } + else if (sp->ks_listeners[i].kll_listenerp != NULL) { + listener_count++; + } + } + if (do_free) { + if (listener_count == 0) { + sp->ks_flags &= ~KS_F_HAS_LISTENERS; + } + KAUTH_SCOPEUNLOCK(); + FREE(listener, M_KAUTH); + return; + } + } + } + + /* if not active, check the dangling list */ + TAILQ_FOREACH(klp, &kauth_dangling_listeners, kl_link) { + if (klp == listener) { + TAILQ_REMOVE(&kauth_dangling_listeners, klp, kl_link); + KAUTH_SCOPEUNLOCK(); + FREE(listener, M_KAUTH); + return; + } + } + + KAUTH_SCOPEUNLOCK(); + return; +} + +/* + * Authorization requests. 
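+ *
+ * Results combine as implemented in kauth_authorize_action() below: the
+ * scope's primary callback is consulted first, then every listener; any
+ * KAUTH_RESULT_DENY wins, a listener's answer also replaces an earlier
+ * KAUTH_RESULT_DEFER, and unless the combined result is an explicit
+ * KAUTH_RESULT_ALLOW the request fails with EPERM.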
+ */
+int
+kauth_authorize_action(kauth_scope_t scope, kauth_cred_t credential, kauth_action_t action,
+    uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
+{
+	int result, ret, i;
+
+	/* ask the scope */
+	if (scope->ks_callback != NULL)
+		result = scope->ks_callback(credential, scope->ks_idata, action, arg0, arg1, arg2, arg3);
+	else
+		result = KAUTH_RESULT_DEFER;
+
+	/* check with listeners */
+	if ((scope->ks_flags & KS_F_HAS_LISTENERS) != 0) {
+		for (i = 0; i < KAUTH_SCOPE_MAX_LISTENERS; i++) {
+			/* XXX - kauth_todo - there is a race here if a listener is removed - we will fix this post Tiger.
+			 * Until the race is fixed our kext clients are responsible for all active requests that may
+			 * be in their callbacks or on the way to their callbacks before they free kl_callback or kl_idata.
+			 * We keep copies of these in our kauth_local_listener in an attempt to limit our exposure to
+			 * the unlisten race.
+			 */
+			if (scope->ks_listeners[i].kll_listenerp == NULL ||
+			    scope->ks_listeners[i].kll_callback == NULL)
+				continue;
+
+			ret = scope->ks_listeners[i].kll_callback(
+					credential, scope->ks_listeners[i].kll_idata,
+					action, arg0, arg1, arg2, arg3);
+			if ((ret == KAUTH_RESULT_DENY) ||
+			    (result == KAUTH_RESULT_DEFER))
+				result = ret;
+		}
+	}
+
+	/* we need an explicit allow, or the auth fails */
+	/* XXX need a mechanism for auth failure to be signalled vs. denial */
+	return(result == KAUTH_RESULT_ALLOW ? 0 : EPERM);
+}
+
+/*
+ * Default authorization handlers.
+ */
+int
+kauth_authorize_allow(__unused kauth_cred_t credential, __unused void *idata, __unused kauth_action_t action,
+    __unused uintptr_t arg0, __unused uintptr_t arg1, __unused uintptr_t arg2, __unused uintptr_t arg3)
+{
+
+	return(KAUTH_RESULT_ALLOW);
+}
+
+#if 0
+/*
+ * Debugging support.
+ */
+static int
+kauth_scope_valid(kauth_scope_t scope)
+{
+	kauth_scope_t	sp;
+
+	KAUTH_SCOPELOCK();
+	TAILQ_FOREACH(sp, &kauth_scopes, ks_link) {
+		if (sp == scope)
+			break;
+	}
+	KAUTH_SCOPEUNLOCK();
+	return((sp == NULL) ? 0 : 1);
+}
+#endif
+
+/*
+ * Process authorization scope.
+ */
+
+int
+kauth_authorize_process(kauth_cred_t credential, kauth_action_t action, struct proc *process, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
+{
+	return(kauth_authorize_action(kauth_scope_process, credential, action, (uintptr_t)process, arg1, arg2, arg3));
+}
+
+static int
+kauth_authorize_process_callback(kauth_cred_t credential, __unused void *idata, kauth_action_t action,
+    uintptr_t arg0, uintptr_t arg1, __unused uintptr_t arg2, __unused uintptr_t arg3)
+{
+	switch(action) {
+	case KAUTH_PROCESS_CANSIGNAL:
+		panic("KAUTH_PROCESS_CANSIGNAL not implemented");
+		/* XXX credential wrong here */
+		/* arg0 - process to signal
+		 * arg1 - signal to send the process
+		 */
+		if (cansignal(current_proc(), credential, (struct proc *)arg0, (int)arg1))
+			return(KAUTH_RESULT_ALLOW);
+		break;
+	case KAUTH_PROCESS_CANTRACE:
+		/* current_proc() - process that will do the tracing
+		 * arg0 - process to be traced
+		 * arg1 - pointer to int - reason (errno) for denial
+		 */
+		if (cantrace(current_proc(), credential, (proc_t)arg0, (int *)arg1))
+			return(KAUTH_RESULT_ALLOW);
+		break;
+	}
+
+	/* no explicit result, so defer to others in the chain */
+	return(KAUTH_RESULT_DEFER);
+}
+
+/*
+ * File system operation authorization scope.  This is really only a notification
+ * of the file system operation, not an authorization check.  Thus the result is
+ * not relevant.
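+ *
+ * A hypothetical call site (sketch only; for OPEN/CLOSE/EXEC the path that
+ * listeners see is derived from the vnode inside kauth_authorize_fileop(),
+ * so the caller's arg1 is replaced):
+ *
+ *	kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN,
+ *	    (uintptr_t)vp, 0);
+ *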
+ * arguments passed to KAUTH_FILEOP_OPEN listeners
+ *		arg0 is pointer to vnode (vnode *) for given user path.
+ *		arg1 is pointer to path (char *) passed in to open.
+ * arguments passed to KAUTH_FILEOP_CLOSE listeners
+ *		arg0 is pointer to vnode (vnode *) for file to be closed.
+ *		arg1 is pointer to path (char *) of file to be closed.
+ *		arg2 is close flags.
+ * arguments passed to KAUTH_FILEOP_RENAME listeners
+ *		arg0 is pointer to "from" path (char *).
+ *		arg1 is pointer to "to" path (char *).
+ * arguments passed to KAUTH_FILEOP_EXCHANGE listeners
+ *		arg0 is pointer to file 1 path (char *).
+ *		arg1 is pointer to file 2 path (char *).
+ * arguments passed to KAUTH_FILEOP_EXEC listeners
+ *		arg0 is pointer to vnode (vnode *) for executable.
+ *		arg1 is pointer to path (char *) to executable.
+ */
+
+int
+kauth_authorize_fileop_has_listeners(void)
+{
+	/*
+	 * return 1 if we have any listeners for the fileop scope
+	 * otherwise return 0
+	 */
+	if ((kauth_scope_fileop->ks_flags & KS_F_HAS_LISTENERS) != 0) {
+		return(1);
+	}
+	return (0);
+}
+
+int
+kauth_authorize_fileop(kauth_cred_t credential, kauth_action_t action, uintptr_t arg0, uintptr_t arg1)
+{
+	char		*namep = NULL;
+	int		name_len;
+	uintptr_t	arg2 = 0;
+
+	/* we do not have a primary handler for the fileop scope so bail out if
+	 * there are no listeners.
+	 */
+	if ((kauth_scope_fileop->ks_flags & KS_F_HAS_LISTENERS) == 0) {
+		return(0);
+	}
+
+	if (action == KAUTH_FILEOP_OPEN || action == KAUTH_FILEOP_CLOSE || action == KAUTH_FILEOP_EXEC) {
+		/* get path to the given vnode as a convenience to our listeners.
+		 */
+		namep = get_pathbuff();
+		name_len = MAXPATHLEN;
+		if (vn_getpath((vnode_t)arg0, namep, &name_len) != 0) {
+			release_pathbuff(namep);
+			return(0);
+		}
+		if (action == KAUTH_FILEOP_CLOSE) {
+			arg2 = arg1;  /* close has some flags that come in via arg1 */
+		}
+		arg1 = (uintptr_t)namep;
+	}
+	kauth_authorize_action(kauth_scope_fileop, credential, action, arg0, arg1, arg2, 0);
+
+	if (namep != NULL) {
+		release_pathbuff(namep);
+	}
+
+	return(0);
+}
+
+/*
+ * Generic authorization scope.
+ */
+
+int
+kauth_authorize_generic(kauth_cred_t credential, kauth_action_t action)
+{
+	if (credential == NULL)
+		panic("auth against NULL credential");
+
+	return(kauth_authorize_action(kauth_scope_generic, credential, action, 0, 0, 0, 0));
+
+}
+
+static int
+kauth_authorize_generic_callback(kauth_cred_t credential, __unused void *idata, kauth_action_t action,
+    __unused uintptr_t arg0, __unused uintptr_t arg1, __unused uintptr_t arg2, __unused uintptr_t arg3)
+{
+	switch(action) {
+	case KAUTH_GENERIC_ISSUSER:
+		/* XXX == 0 ? */
+		return((kauth_cred_getuid(credential) == 0) ?
+		    KAUTH_RESULT_ALLOW : KAUTH_RESULT_DENY);
+		break;
+	}
+
+	/* no explicit result, so defer to others in the chain */
+	return(KAUTH_RESULT_DEFER);
+}
+
+/*
+ * ACL evaluator.
+ *
+ * Determines whether the credential has the requested rights for an object secured by the supplied
+ * ACL.
+ *
+ * Evaluation proceeds from the top down, with access denied if any ACE denies any of the requested
+ * rights, or granted if all of the requested rights are satisfied by the ACEs so far.
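+ *
+ * Worked example (hypothetical rights): with ae_requested = READ|WRITE,
+ *	ACE 1: allow READ	-> ae_residual becomes WRITE
+ *	ACE 2: deny  WRITE	-> a requested right is denied; result is DENY
+ * Had ACE 2 instead been "allow WRITE", ae_residual would reach zero and
+ * the result would be ALLOW; if the entries are exhausted first, the
+ * evaluator returns KAUTH_RESULT_DEFER.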
+ */ +int +kauth_acl_evaluate(kauth_cred_t cred, kauth_acl_eval_t eval) +{ + int applies, error, i; + kauth_ace_t ace; + guid_t guid; + uint32_t rights; + int wkguid; + + /* always allowed to do nothing */ + if (eval->ae_requested == 0) { + eval->ae_result = KAUTH_RESULT_ALLOW; + return(0); + } + + eval->ae_residual = eval->ae_requested; + + /* + * Get our guid for comparison purposes. + */ + if ((error = kauth_cred_getguid(cred, &guid)) != 0) { + eval->ae_result = KAUTH_RESULT_DENY; + KAUTH_DEBUG(" ACL - can't get credential GUID (%d), ACL denied", error); + return(error); + } + + KAUTH_DEBUG(" ACL - %d entries, initial residual %x", eval->ae_count, eval->ae_residual); + for (i = 0, ace = eval->ae_acl; i < eval->ae_count; i++, ace++) { + + /* + * Skip inherit-only entries. + */ + if (ace->ace_flags & KAUTH_ACE_ONLY_INHERIT) + continue; + + /* + * Expand generic rights, if appropriate. + */ + rights = ace->ace_rights; + if (rights & KAUTH_ACE_GENERIC_ALL) + rights |= eval->ae_exp_gall; + if (rights & KAUTH_ACE_GENERIC_READ) + rights |= eval->ae_exp_gread; + if (rights & KAUTH_ACE_GENERIC_WRITE) + rights |= eval->ae_exp_gwrite; + if (rights & KAUTH_ACE_GENERIC_EXECUTE) + rights |= eval->ae_exp_gexec; + + /* + * Determine whether this entry applies to the current request. This + * saves us checking the GUID if the entry has nothing to do with what + * we're currently doing. + */ + switch(ace->ace_flags & KAUTH_ACE_KINDMASK) { + case KAUTH_ACE_PERMIT: + if (!(eval->ae_residual & rights)) + continue; + break; + case KAUTH_ACE_DENY: + if (!(eval->ae_requested & rights)) + continue; + break; + default: + /* we don't recognise this ACE, skip it */ + continue; + } + + /* + * Verify whether this entry applies to the credential. + */ + wkguid = kauth_wellknown_guid(&ace->ace_applicable); + switch(wkguid) { + case KAUTH_WKG_OWNER: + applies = eval->ae_options & KAUTH_AEVAL_IS_OWNER; + break; + case KAUTH_WKG_GROUP: + applies = eval->ae_options & KAUTH_AEVAL_IN_GROUP; + break; + /* we short-circuit these here rather than wasting time calling the group membership code */ + case KAUTH_WKG_EVERYBODY: + applies = 1; + break; + case KAUTH_WKG_NOBODY: + applies = 0; + break; + + default: + /* check to see whether it's exactly us, or a group we are a member of */ + applies = kauth_guid_equal(&guid, &ace->ace_applicable); + KAUTH_DEBUG(" ACL - ACE applicable " K_UUID_FMT " caller " K_UUID_FMT " %smatched", + K_UUID_ARG(ace->ace_applicable), K_UUID_ARG(guid), applies ? "" : "not "); + + if (!applies) { + error = kauth_cred_ismember_guid(cred, &ace->ace_applicable, &applies); + /* + * If we can't resolve group membership, we have to limit misbehaviour. + * If the ACE is an 'allow' ACE, assume the cred is not a member (avoid + * granting excess access). If the ACE is a 'deny' ACE, assume the cred + * is a member (avoid failing to deny). + */ + if (error != 0) { + KAUTH_DEBUG(" ACL[%d] - can't get membership, making pessimistic assumption", i); + switch(ace->ace_flags & KAUTH_ACE_KINDMASK) { + case KAUTH_ACE_PERMIT: + applies = 0; + break; + case KAUTH_ACE_DENY: + applies = 1; + break; + } + } else { + KAUTH_DEBUG(" ACL - %s group member", applies ? "is" : "not"); + } + } else { + KAUTH_DEBUG(" ACL - entry matches caller"); + } + } + if (!applies) + continue; + + /* + * Apply ACE to outstanding rights. 
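+		 * A permit entry clears the rights it grants from ae_residual;
+		 * a deny entry fails the whole request if it covers any right
+		 * that was asked for (see the two cases below).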
+ */ + switch(ace->ace_flags & KAUTH_ACE_KINDMASK) { + case KAUTH_ACE_PERMIT: + /* satisfy any rights that this ACE grants */ + eval->ae_residual = eval->ae_residual & ~rights; + KAUTH_DEBUG(" ACL[%d] - rights %x leave residual %x", i, rights, eval->ae_residual); + /* all rights satisfied? */ + if (eval->ae_residual == 0) { + eval->ae_result = KAUTH_RESULT_ALLOW; + return(0); + } + break; + case KAUTH_ACE_DENY: + /* deny the request if any of the requested rights is denied */ + if (eval->ae_requested & rights) { + KAUTH_DEBUG(" ACL[%d] - denying based on %x", i, rights); + eval->ae_result = KAUTH_RESULT_DENY; + return(0); + } + break; + default: + KAUTH_DEBUG(" ACL - unknown entry kind %d", ace->ace_flags & KAUTH_ACE_KINDMASK); + break; + } + } + /* if not permitted, defer to other modes of authorisation */ + eval->ae_result = KAUTH_RESULT_DEFER; + return(0); +} + +/* + * Perform ACL inheritance and umask-ACL handling. + * + * Entries are inherited from the ACL on dvp. A caller-supplied + * ACL is in initial, and the result is output into product. + * If the process has a umask ACL and one is not supplied, we use + * the umask ACL. + * If isdir is set, the resultant ACL is for a directory, otherwise it is for a file. + */ +int +kauth_acl_inherit(vnode_t dvp, kauth_acl_t initial, kauth_acl_t *product, int isdir, vfs_context_t ctx) +{ + int entries, error, index; + unsigned int i; + struct vnode_attr dva; + kauth_acl_t inherit, result; + + /* + * Fetch the ACL from the directory. This should never fail. Note that we don't + * manage inheritance when the remote server is doing authorization; we just + * want to compose the umask-ACL and any initial ACL. + */ + inherit = NULL; + if ((dvp != NULL) && !vfs_authopaque(vnode_mount(dvp))) { + VATTR_INIT(&dva); + VATTR_WANTED(&dva, va_acl); + if ((error = vnode_getattr(dvp, &dva, ctx)) != 0) { + KAUTH_DEBUG(" ERROR - could not get parent directory ACL for inheritance"); + return(error); + } + if (VATTR_IS_SUPPORTED(&dva, va_acl)) + inherit = dva.va_acl; + } + + /* + * Compute the number of entries in the result ACL by scanning the input lists. + */ + entries = 0; + if (inherit != NULL) { + for (i = 0; i < inherit->acl_entrycount; i++) { + if (inherit->acl_ace[i].ace_flags & (isdir ? KAUTH_ACE_DIRECTORY_INHERIT : KAUTH_ACE_FILE_INHERIT)) + entries++; + } + } + + if (initial == NULL) { + /* XXX 3634665 TODO: fetch umask ACL from the process, set in initial */ + } + + if (initial != NULL) { + entries += initial->acl_entrycount; + } + + /* + * If there is no initial ACL, and no inheritable entries, the + * object should have no ACL at all. + * Note that this differs from the case where the initial ACL + * is empty, in which case the object must also have an empty ACL. + */ + if ((entries == 0) && (initial == NULL)) { + *product = NULL; + error = 0; + goto out; + } + + /* + * Allocate the result buffer. + */ + if ((result = kauth_acl_alloc(entries)) == NULL) { + KAUTH_DEBUG(" ERROR - could not allocate %d-entry result buffer for inherited ACL"); + error = ENOMEM; + goto out; + } + + /* + * Composition is simply: + * - initial + * - inherited + */ + index = 0; + if (initial != NULL) { + for (i = 0; i < initial->acl_entrycount; i++) + result->acl_ace[index++] = initial->acl_ace[i]; + KAUTH_DEBUG(" INHERIT - applied %d initial entries", index); + } + if (inherit != NULL) { + for (i = 0; i < inherit->acl_entrycount; i++) { + /* inherit onto this object? */ + if (inherit->acl_ace[i].ace_flags & (isdir ? 
KAUTH_ACE_DIRECTORY_INHERIT : KAUTH_ACE_FILE_INHERIT)) { + result->acl_ace[index] = inherit->acl_ace[i]; + result->acl_ace[index].ace_flags |= KAUTH_ACE_INHERITED; + /* don't re-inherit? */ + if (result->acl_ace[index].ace_flags & KAUTH_ACE_LIMIT_INHERIT) + result->acl_ace[index].ace_flags &= + ~(KAUTH_ACE_DIRECTORY_INHERIT | KAUTH_ACE_FILE_INHERIT | KAUTH_ACE_LIMIT_INHERIT); + index++; + } + } + } + result->acl_entrycount = index; + *product = result; + KAUTH_DEBUG(" INHERIT - product ACL has %d entries", index); + error = 0; +out: + if (inherit != NULL) + kauth_acl_free(inherit); + return(error); +} + +/* + * Optimistically copy in a kauth_filesec structure + * Parameters: xsecurity user space kauth_filesec_t + * xsecdstpp pointer to kauth_filesec_t + * + * Returns: 0 on success, EINVAL or EFAULT depending on failure mode. + * Modifies: xsecdestpp, which contains a pointer to an allocated + * and copied-in kauth_filesec_t + */ + +int +kauth_copyinfilesec(user_addr_t xsecurity, kauth_filesec_t *xsecdestpp) +{ + user_addr_t uaddr, known_bound; + int error; + kauth_filesec_t fsec; + u_int32_t count; + size_t copysize; + + error = 0; + fsec = NULL; + + /* + * Make a guess at the size of the filesec. We start with the base + * pointer, and look at how much room is left on the page, clipped + * to a sensible upper bound. If it turns out this isn't enough, + * we'll size based on the actual ACL contents and come back again. + * + * The upper bound must be less than KAUTH_ACL_MAX_ENTRIES. The + * value here is fairly arbitrary. It's ok to have a zero count. + */ + known_bound = xsecurity + sizeof(struct kauth_filesec); + uaddr = mach_vm_round_page(known_bound); + count = (uaddr - known_bound) / sizeof(struct kauth_ace); + if (count > 32) + count = 32; +restart: + if ((fsec = kauth_filesec_alloc(count)) == NULL) { + error = ENOMEM; + goto out; + } + copysize = KAUTH_FILESEC_SIZE(count); + if ((error = copyin(xsecurity, (caddr_t)fsec, copysize)) != 0) + goto out; + + /* validate the filesec header */ + if (fsec->fsec_magic != KAUTH_FILESEC_MAGIC) { + error = EINVAL; + goto out; + } + + /* + * Is there an ACL payload, and is it too big? + */ + if ((fsec->fsec_entrycount != KAUTH_FILESEC_NOACL) && + (fsec->fsec_entrycount > count)) { + if (fsec->fsec_entrycount > KAUTH_ACL_MAX_ENTRIES) { + error = EINVAL; + goto out; + } + count = fsec->fsec_entrycount; + kauth_filesec_free(fsec); + goto restart; + } + +out: + if (error) { + if (fsec) + kauth_filesec_free(fsec); + } else { + *xsecdestpp = fsec; + } + return(error); +} + +/* + * Allocate a filesec structure. + */ +kauth_filesec_t +kauth_filesec_alloc(int count) +{ + kauth_filesec_t fsp; + + /* if the caller hasn't given us a valid size hint, assume the worst */ + if ((count < 0) || (count > KAUTH_ACL_MAX_ENTRIES)) + return(NULL); + + MALLOC(fsp, kauth_filesec_t, KAUTH_FILESEC_SIZE(count), M_KAUTH, M_WAITOK); + if (fsp != NULL) { + fsp->fsec_magic = KAUTH_FILESEC_MAGIC; + fsp->fsec_owner = kauth_null_guid; + fsp->fsec_group = kauth_null_guid; + fsp->fsec_entrycount = KAUTH_FILESEC_NOACL; + fsp->fsec_flags = 0; + } + return(fsp); +} + +void +kauth_filesec_free(kauth_filesec_t fsp) +{ +#ifdef KAUTH_DEBUG_ENABLE + if (fsp == KAUTH_FILESEC_NONE) + panic("freeing KAUTH_FILESEC_NONE"); + if (fsp == KAUTH_FILESEC_WANTED) + panic("freeing KAUTH_FILESEC_WANTED"); +#endif + FREE(fsp, M_KAUTH); +} + + +/* + * Allocate an ACL buffer. 
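+ *
+ * A minimal usage sketch (hypothetical caller; 'template_ace' is a
+ * placeholder, not part of this change):
+ *
+ *	kauth_acl_t acl = kauth_acl_alloc(1);
+ *	if (acl != NULL) {
+ *		acl->acl_ace[0] = template_ace;
+ *		acl->acl_entrycount = 1;	/* starts at 0; caller fills it in */
+ *	}
+ *
+ * The buffer is released with kauth_acl_free() when no longer needed.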
+ */ +kauth_acl_t +kauth_acl_alloc(int count) +{ + kauth_acl_t aclp; + + /* if the caller hasn't given us a valid size hint, assume the worst */ + if ((count < 0) || (count > KAUTH_ACL_MAX_ENTRIES)) + return(NULL); + + MALLOC(aclp, kauth_acl_t, KAUTH_ACL_SIZE(count), M_KAUTH, M_WAITOK); + if (aclp != NULL) { + aclp->acl_entrycount = 0; + aclp->acl_flags = 0; + } + return(aclp); +} + +void +kauth_acl_free(kauth_acl_t aclp) +{ + FREE(aclp, M_KAUTH); +} + + +/* + * WARNING - caller must hold KAUTH_SCOPELOCK + */ +static int kauth_add_callback_to_scope(kauth_scope_t sp, kauth_listener_t klp) +{ + int i; + + for (i = 0; i < KAUTH_SCOPE_MAX_LISTENERS; i++) { + if (sp->ks_listeners[i].kll_listenerp == NULL) { + sp->ks_listeners[i].kll_callback = klp->kl_callback; + sp->ks_listeners[i].kll_idata = klp->kl_idata; + sp->ks_listeners[i].kll_listenerp = klp; + sp->ks_flags |= KS_F_HAS_LISTENERS; + return(0); + } + } + return(ENOSPC); +} diff --git a/bsd/kern/kern_bsm_audit.c b/bsd/kern/kern_bsm_audit.c index 44367bf9d..b4ddb4064 100644 --- a/bsd/kern/kern_bsm_audit.c +++ b/bsd/kern/kern_bsm_audit.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -19,9 +19,8 @@ * * @APPLE_LICENSE_HEADER_END@ */ - #include <sys/types.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/ipc.h> #include <sys/sem.h> #include <sys/socketvar.h> @@ -30,6 +29,7 @@ #include <sys/fcntl.h> #include <sys/user.h> +#include <sys/ipc.h> #include <bsm/audit.h> #include <bsm/audit_record.h> #include <bsm/audit_kernel.h> @@ -41,6 +41,7 @@ #include <netinet/ip.h> #include <kern/lock.h> +#include <kern/kalloc.h> /* The number of BSM records allocated. */ static int bsm_rec_count = 0; @@ -58,6 +59,8 @@ LIST_HEAD(, au_record) bsm_free_q; */ static mutex_t *bsm_audit_mutex; +static void audit_sys_auditon(struct audit_record *ar, struct au_record *rec); + /* * Initialize the BSM auditing subsystem. */ @@ -66,7 +69,7 @@ kau_init(void) { printf("BSM auditing present\n"); LIST_INIT(&bsm_free_q); - bsm_audit_mutex = mutex_alloc(ETAP_NO_TRACE); + bsm_audit_mutex = mutex_alloc(0); au_evclassmap_init(); } @@ -111,7 +114,7 @@ kau_open(void) } rec->data = (u_char *)kalloc(MAX_AUDIT_RECORD_SIZE * sizeof(u_char)); if((rec->data) == NULL) { - kfree((vm_offset_t)rec, (vm_size_t)sizeof(*rec)); + kfree(rec, sizeof(*rec)); return NULL; } mutex_lock(bsm_audit_mutex); @@ -153,7 +156,8 @@ int kau_write(struct au_record *rec, struct au_token *tok) * Close out the audit record by adding the header token, identifying * any missing tokens. Write out the tokens to the record memory. 
*/ -int kau_close(struct au_record *rec, struct timespec *ctime, short event) +int +kau_close(struct au_record *rec, struct timespec *ctime, short event) { u_char *dptr; size_t tot_rec_size; @@ -183,6 +187,8 @@ int kau_close(struct au_record *rec, struct timespec *ctime, short event) dptr += cur->len; } } + + return(retval); } /* @@ -196,7 +202,7 @@ void kau_free(struct au_record *rec) /* Free the token list */ while ((tok = TAILQ_FIRST(&rec->token_q))) { TAILQ_REMOVE(&rec->token_q, tok, tokens); - kfree((vm_offset_t)tok, sizeof(*tok) + tok->len); + kfree(tok, sizeof(*tok) + tok->len); } rec->used = 0; @@ -246,7 +252,7 @@ void kau_free(struct au_record *rec) kau_write(rec, tok); \ } \ } while (0) - + #define KPATH1_VNODE1_TOKENS \ do { \ if (ar->ar_valid_arg & ARG_KPATH1) { \ @@ -307,13 +313,13 @@ void kau_free(struct au_record *rec) * auditon() system call. * */ -void +static void audit_sys_auditon(struct audit_record *ar, struct au_record *rec) { struct au_token *tok; switch (ar->ar_arg_cmd) { - case A_SETPOLICY: + case A_SETPOLICY: if (sizeof(ar->ar_arg_auditon.au_flags) > 4) tok = au_to_arg64(1, "policy", ar->ar_arg_auditon.au_flags); @@ -322,7 +328,7 @@ audit_sys_auditon(struct audit_record *ar, struct au_record *rec) ar->ar_arg_auditon.au_flags); kau_write(rec, tok); break; - case A_SETKMASK: + case A_SETKMASK: tok = au_to_arg32(2, "setkmask:as_success", ar->ar_arg_auditon.au_mask.am_success); kau_write(rec, tok); @@ -330,7 +336,7 @@ audit_sys_auditon(struct audit_record *ar, struct au_record *rec) ar->ar_arg_auditon.au_mask.am_failure); kau_write(rec, tok); break; - case A_SETQCTRL: + case A_SETQCTRL: tok = au_to_arg32(3, "setqctrl:aq_hiwater", ar->ar_arg_auditon.au_qctrl.aq_hiwater); kau_write(rec, tok); @@ -347,7 +353,7 @@ audit_sys_auditon(struct audit_record *ar, struct au_record *rec) ar->ar_arg_auditon.au_qctrl.aq_minfree); kau_write(rec, tok); break; - case A_SETUMASK: + case A_SETUMASK: tok = au_to_arg32(3, "setumask:as_success", ar->ar_arg_auditon.au_auinfo.ai_mask.am_success); kau_write(rec, tok); @@ -355,7 +361,7 @@ audit_sys_auditon(struct audit_record *ar, struct au_record *rec) ar->ar_arg_auditon.au_auinfo.ai_mask.am_failure); kau_write(rec, tok); break; - case A_SETSMASK: + case A_SETSMASK: tok = au_to_arg32(3, "setsmask:as_success", ar->ar_arg_auditon.au_auinfo.ai_mask.am_success); kau_write(rec, tok); @@ -363,7 +369,7 @@ audit_sys_auditon(struct audit_record *ar, struct au_record *rec) ar->ar_arg_auditon.au_auinfo.ai_mask.am_failure); kau_write(rec, tok); break; - case A_SETCOND: + case A_SETCOND: if (sizeof(ar->ar_arg_auditon.au_cond) > 4) tok = au_to_arg64(3, "setcond", ar->ar_arg_auditon.au_cond); @@ -372,7 +378,7 @@ audit_sys_auditon(struct audit_record *ar, struct au_record *rec) ar->ar_arg_auditon.au_cond); kau_write(rec, tok); break; - case A_SETCLASS: + case A_SETCLASS: tok = au_to_arg32(2, "setclass:ec_event", ar->ar_arg_auditon.au_evclass.ec_number); kau_write(rec, tok); @@ -380,7 +386,7 @@ audit_sys_auditon(struct audit_record *ar, struct au_record *rec) ar->ar_arg_auditon.au_evclass.ec_class); kau_write(rec, tok); break; - case A_SETPMASK: + case A_SETPMASK: tok = au_to_arg32(2, "setpmask:as_success", ar->ar_arg_auditon.au_aupinfo.ap_mask.am_success); kau_write(rec, tok); @@ -388,7 +394,7 @@ audit_sys_auditon(struct audit_record *ar, struct au_record *rec) ar->ar_arg_auditon.au_aupinfo.ap_mask.am_failure); kau_write(rec, tok); break; - case A_SETFSIZE: + case A_SETFSIZE: tok = au_to_arg32(2, "setfsize:filesize", ar->ar_arg_auditon.au_fstat.af_filesz); 
kau_write(rec, tok); @@ -608,6 +614,7 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) break; case AUE_CHOWN: + case AUE_LCHOWN: tok = au_to_arg32(2, "new file uid", ar->ar_arg_uid); kau_write(rec, tok); tok = au_to_arg32(3, "new file gid", ar->ar_arg_gid); @@ -729,7 +736,7 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) kau_write(rec, tok); UPATH1_KPATH1_VNODE1_TOKENS; break; - + case AUE_MKDIR: tok = au_to_arg32(2, "mode", ar->ar_arg_mode); kau_write(rec, tok); @@ -750,9 +757,9 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) case AUE_MLOCK: case AUE_MUNLOCK: case AUE_MINHERIT: - tok = au_to_arg32(1, "addr", (u_int32_t)ar->ar_arg_addr); + tok = au_to_arg32(1, "addr", (u_int32_t)ar->ar_arg_addr); /* LP64todo */ kau_write(rec, tok); - tok = au_to_arg32(2, "len", ar->ar_arg_len); + tok = au_to_arg32(2, "len", ar->ar_arg_len); /* LP64todo */ kau_write(rec, tok); if (ar->ar_event == AUE_MMAP) FD_KPATH1_VNODE1_TOKENS; @@ -829,7 +836,7 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) case AUE_PTRACE: tok = au_to_arg32(1, "request", ar->ar_arg_cmd); kau_write(rec, tok); - tok = au_to_arg32(3, "addr", (u_int32_t)ar->ar_arg_addr); + tok = au_to_arg32(3, "addr", (u_int32_t)ar->ar_arg_addr); /* LP64todo */ kau_write(rec, tok); tok = au_to_arg32(4, "data", ar->ar_arg_value); kau_write(rec, tok); @@ -886,7 +893,7 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) if (ar->ar_valid_arg & ARG_GROUPSET) { for(ctr = 0; ctr < ar->ar_arg_groups.gidset_size; ctr++) { - tok = au_to_arg32(1, "setgroups", ar->ar_arg_groups.gidset[ctr]); + tok = au_to_arg32(1, "setgroups", ar->ar_arg_groups.gidset[ctr]); kau_write(rec, tok); } } @@ -1140,7 +1147,7 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) * */ int -bsm_rec_verify(void *rec) +bsm_rec_verify(void* rec) { char c = *(char *)rec; /* diff --git a/bsd/kern/kern_bsm_klib.c b/bsd/kern/kern_bsm_klib.c index b3e33f193..1aacd0dd0 100644 --- a/bsd/kern/kern_bsm_klib.c +++ b/bsd/kern/kern_bsm_klib.c @@ -20,8 +20,10 @@ * @APPLE_LICENSE_HEADER_END@ */ +#include <sys/systm.h> #include <sys/types.h> -#include <sys/vnode.h> +#include <sys/proc_internal.h> +#include <sys/vnode_internal.h> #include <sys/fcntl.h> #include <sys/filedesc.h> #include <sys/sem.h> @@ -352,7 +354,7 @@ au_event_t sys_au_event[] = { AUE_NULL, /* 295 */ AUE_LOADSHFILE, /* 296 = load_shared_file */ AUE_RESETSHFILE, /* 297 = reset_shared_file */ - AUE_NEWSYSTEMSHREG, /* 298 = new_system_shared_regions */ + AUE_NEWSYSTEMSHREG, /* 298 = new_system_shared_regions */ AUE_NULL, /* 299 */ AUE_NULL, /* 300 */ AUE_NULL, /* 301 */ @@ -418,7 +420,7 @@ au_event_t sys_au_event[] = { AUE_NULL, /* 361 */ AUE_NULL, /* 362 = kqueue */ AUE_NULL, /* 363 = kevent */ - AUE_NULL, /* 364 */ + AUE_LCHOWN, /* 364 = lchown */ AUE_NULL, /* 365 */ AUE_NULL, /* 366 */ AUE_NULL, /* 367 */ @@ -459,12 +461,12 @@ au_class_t au_event_class(au_event_t event) return (AU_NULL); } -/* + /* * Insert a event to class mapping. If the event already exists in the * mapping, then replace the mapping with the new one. * XXX There is currently no constraints placed on the number of mappings. * May want to either limit to a number, or in terms of memory usage. 
- */ + */ void au_evclassmap_insert(au_event_t event, au_class_t class) { struct evclass_list *evcl; @@ -478,14 +480,13 @@ void au_evclassmap_insert(au_event_t event, au_class_t class) return; } } - kmem_alloc(kernel_map, &evc, sizeof(*evc)); + kmem_alloc(kernel_map, (vm_offset_t *)&evc, sizeof(*evc)); if (evc == NULL) { return; } evc->event = event; evc->class = class; LIST_INSERT_HEAD(&evcl->head, evc, entry); - } void au_evclassmap_init() @@ -499,7 +500,7 @@ void au_evclassmap_init() for (i = 0; i < nsys_au_event; i++) { if (sys_au_event[i] != AUE_NULL) { au_evclassmap_insert(sys_au_event[i], AU_NULL); - } + } } /* Add the Mach system call events */ au_evclassmap_insert(AUE_TASKFORPID, AU_NULL); @@ -508,27 +509,26 @@ void au_evclassmap_init() au_evclassmap_insert(AUE_SWAPOFF, AU_NULL); au_evclassmap_insert(AUE_MAPFD, AU_NULL); au_evclassmap_insert(AUE_INITPROCESS, AU_NULL); - + /* Add the specific open events to the mapping. */ au_evclassmap_insert(AUE_OPEN_R, AU_FREAD); - au_evclassmap_insert(AUE_OPEN_RC, AU_FREAD|AU_FCREATE); - au_evclassmap_insert(AUE_OPEN_RTC, AU_FREAD|AU_FCREATE|AU_FDELETE); - au_evclassmap_insert(AUE_OPEN_RT, AU_FREAD|AU_FDELETE); - au_evclassmap_insert(AUE_OPEN_RW, AU_FREAD|AU_FWRITE); - au_evclassmap_insert(AUE_OPEN_RWC, AU_FREAD|AU_FWRITE|AU_FCREATE); - au_evclassmap_insert(AUE_OPEN_RWTC, AU_FREAD|AU_FWRITE|AU_FCREATE|AU_FDELETE); - au_evclassmap_insert(AUE_OPEN_RWT, AU_FREAD|AU_FWRITE|AU_FDELETE); - au_evclassmap_insert(AUE_OPEN_W, AU_FWRITE); - au_evclassmap_insert(AUE_OPEN_WC, AU_FWRITE|AU_FCREATE); - au_evclassmap_insert(AUE_OPEN_WTC, AU_FWRITE|AU_FCREATE|AU_FDELETE); - au_evclassmap_insert(AUE_OPEN_WT, AU_FWRITE|AU_FDELETE); + au_evclassmap_insert(AUE_OPEN_RC, AU_FREAD|AU_FCREATE); + au_evclassmap_insert(AUE_OPEN_RTC, AU_FREAD|AU_FCREATE|AU_FDELETE); + au_evclassmap_insert(AUE_OPEN_RT, AU_FREAD|AU_FDELETE); + au_evclassmap_insert(AUE_OPEN_RW, AU_FREAD|AU_FWRITE); + au_evclassmap_insert(AUE_OPEN_RWC, AU_FREAD|AU_FWRITE|AU_FCREATE); + au_evclassmap_insert(AUE_OPEN_RWTC, AU_FREAD|AU_FWRITE|AU_FCREATE|AU_FDELETE); + au_evclassmap_insert(AUE_OPEN_RWT, AU_FREAD|AU_FWRITE|AU_FDELETE); + au_evclassmap_insert(AUE_OPEN_W, AU_FWRITE); + au_evclassmap_insert(AUE_OPEN_WC, AU_FWRITE|AU_FCREATE); + au_evclassmap_insert(AUE_OPEN_WTC, AU_FWRITE|AU_FCREATE|AU_FDELETE); + au_evclassmap_insert(AUE_OPEN_WT, AU_FWRITE|AU_FDELETE); } -/* + /* * Check whether an event is aditable by comparing the mask of classes this * event is part of against the given mask. - * - */ + */ int au_preselect(au_event_t event, au_mask_t *mask_p, int sorf) { au_class_t effmask = 0; @@ -538,10 +538,10 @@ int au_preselect(au_event_t event, au_mask_t *mask_p, int sorf) return (-1); ae_class = au_event_class(event); - /* + /* * Perform the actual check of the masks against the event. */ - if (sorf & AU_PRS_SUCCESS) { + if(sorf & AU_PRS_SUCCESS) { effmask |= (mask_p->am_success & ae_class); } @@ -580,6 +580,7 @@ au_event_t ctlname_to_sysctlevent(int name[], uint64_t valid_arg) { case KERN_SAVED_IDS: case KERN_NETBOOT: case KERN_SYMFILE: + case KERN_SHREG_PRIVATIZABLE: return AUE_SYSCTL_NONADMIN; /* only treat the sets as admin */ @@ -656,13 +657,13 @@ au_event_t flags_and_error_to_openevent(int oflags, int error) { default: aevent = AUE_OPEN; break; - } +} - /* +/* * Convert chatty errors to better matching events. * Failures to find a file are really just attribute * events - so recast them as such. 
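+ * (For example, an open for read that fails with ENOENT is recorded as
+ * plain AUE_OPEN rather than AUE_OPEN_R; see the switch below.)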
- */ +*/ switch (aevent) { case AUE_OPEN_R: case AUE_OPEN_RT: @@ -672,12 +673,12 @@ au_event_t flags_and_error_to_openevent(int oflags, int error) { case AUE_OPEN_WT: if (error == ENOENT) aevent = AUE_OPEN; - } +} return aevent; } /* Convert a MSGCTL command to a specific event. */ -int msgctl_to_event(int cmd) +au_event_t msgctl_to_event(int cmd) { switch (cmd) { case IPC_RMID: @@ -693,7 +694,7 @@ int msgctl_to_event(int cmd) } /* Convert a SEMCTL command to a specific event. */ -int semctl_to_event(int cmd) +au_event_t semctl_to_event(int cmd) { switch (cmd) { case GETALL: @@ -829,12 +830,9 @@ int canon_path(struct proc *p, char *path, char *cpath) cpath[0] = '\0'; return (ret); } - /* The length returned by vn_getpath() is two greater than the - * number of characters in the string. - */ if (len < MAXPATHLEN) - cpath[len-2] = '/'; - strncpy(cpath + len-1, bufp, MAXPATHLEN - len); + cpath[len-1] = '/'; + strncpy(cpath + len, bufp, MAXPATHLEN - len); } else { strncpy(cpath, bufp, MAXPATHLEN); } diff --git a/bsd/kern/kern_bsm_token.c b/bsd/kern/kern_bsm_token.c index cceb7c6df..7be61356e 100644 --- a/bsd/kern/kern_bsm_token.c +++ b/bsd/kern/kern_bsm_token.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -23,13 +23,18 @@ #include <sys/types.h> #include <sys/un.h> #include <sys/event.h> +#include <sys/ucred.h> +#include <sys/ipc.h> #include <bsm/audit.h> #include <bsm/audit_record.h> #include <bsm/audit_klib.h> #include <bsm/audit_kernel.h> #include <kern/clock.h> +#include <kern/kalloc.h> + +#include <string.h> #define GET_TOKEN_AREA(tok, dptr, length) \ do {\ @@ -38,8 +43,8 @@ {\ tok->len = length;\ dptr = tok->t_data = (u_char *)&tok[1];\ - memset(dptr, 0, length);\ - }\ + memset(dptr, 0, length);\ + }\ }while(0) @@ -131,7 +136,7 @@ token_t *au_to_arg(char n, char *text, u_int32_t v) * node ID 8 bytes * device 4 bytes/8 bytes (32-bit/64-bit) */ -token_t *au_to_attr32(struct vattr *attr) +token_t *au_to_attr32(__unused struct vnode_attr *attr) { return NULL; } @@ -180,16 +185,17 @@ token_t *kau_to_attr32(struct vnode_au_info *vni) return t; } -token_t *au_to_attr64(struct vattr *attr) +token_t *au_to_attr64(__unused struct vnode_attr *attr) { + return NULL; } - -token_t *kau_to_attr64(struct vnode_au_info *vni) + +token_t *kau_to_attr64(__unused struct vnode_au_info *vni) { return NULL; } -token_t *au_to_attr(struct vattr *attr) +token_t *au_to_attr(struct vnode_attr *attr) { return au_to_attr32(attr); @@ -519,7 +525,7 @@ token_t *au_to_opaque(char *data, u_int16_t bytes) * file name len 2 bytes * file pathname N bytes + 1 terminating NULL byte */ -token_t *kau_to_file(char *file, struct timeval *tv) +token_t *kau_to_file(const char *file, const struct timeval *tv) { token_t *t; u_char *dptr; @@ -666,12 +672,17 @@ token_t *au_to_process32(au_id_t auid, uid_t euid, gid_t egid, return t; } -token_t *au_to_process64(au_id_t auid, uid_t euid, gid_t egid, - uid_t ruid, gid_t rgid, pid_t pid, - au_asid_t sid, au_tid_t *tid) +token_t *au_to_process64(__unused au_id_t auid, + __unused uid_t euid, + __unused gid_t egid, + __unused uid_t ruid, + __unused gid_t rgid, + __unused pid_t pid, + __unused au_asid_t sid, + __unused au_tid_t *tid) { - return NULL; -} + return NULL; + } token_t *au_to_process(au_id_t auid, uid_t euid, gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, @@ -730,13 +741,19 @@ token_t *au_to_process32_ex(au_id_t auid, uid_t euid, gid_t egid, 
return t; } -token_t *au_to_process64_ex(au_id_t auid, uid_t euid, gid_t egid, - uid_t ruid, gid_t rgid, pid_t pid, - au_asid_t sid, au_tid_addr_t *tid) +token_t *au_to_process64_ex( + __unused au_id_t auid, + __unused uid_t euid, + __unused gid_t egid, + __unused uid_t ruid, + __unused gid_t rgid, + __unused pid_t pid, + __unused au_asid_t sid, + __unused au_tid_addr_t *tid) { return NULL; } - + token_t *au_to_process_ex(au_id_t auid, uid_t euid, gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, au_asid_t sid, au_tid_addr_t *tid) @@ -820,7 +837,7 @@ token_t *au_to_seq(long audit_count) * remote port 2 bytes * remote Internet address 4 bytes */ -token_t *au_to_socket(struct socket *so) +token_t *au_to_socket(__unused struct socket *so) { return NULL; } @@ -865,14 +882,20 @@ token_t *kau_to_socket(struct socket_au_info *soi) * address type/length 4 bytes * remote Internet address 4 bytes/16 bytes (IPv4/IPv6 address) */ -token_t *au_to_socket_ex_32(u_int16_t lp, u_int16_t rp, - struct sockaddr *la, struct sockaddr *ra) +token_t *au_to_socket_ex_32( + __unused u_int16_t lp, + __unused u_int16_t rp, + __unused struct sockaddr *la, + __unused struct sockaddr *ra) { return NULL; } -token_t *au_to_socket_ex_128(u_int16_t lp, u_int16_t rp, - struct sockaddr *la, struct sockaddr *ra) +token_t *au_to_socket_ex_128( + __unused u_int16_t lp, + __unused u_int16_t rp, + __unused struct sockaddr *la, + __unused struct sockaddr *ra) { return NULL; } @@ -1019,13 +1042,19 @@ token_t *au_to_subject32(au_id_t auid, uid_t euid, gid_t egid, return t; } -token_t *au_to_subject64(au_id_t auid, uid_t euid, gid_t egid, - uid_t ruid, gid_t rgid, pid_t pid, - au_asid_t sid, au_tid_t *tid) +token_t *au_to_subject64( + __unused au_id_t auid, + __unused uid_t euid, + __unused gid_t egid, + __unused uid_t ruid, + __unused gid_t rgid, + __unused pid_t pid, + __unused au_asid_t sid, + __unused au_tid_t *tid) { - return NULL; -} - + return NULL; + } + token_t *au_to_subject(au_id_t auid, uid_t euid, gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, au_asid_t sid, au_tid_t *tid) @@ -1083,9 +1112,15 @@ token_t *au_to_subject32_ex(au_id_t auid, uid_t euid, return t; } -token_t *au_to_subject64_ex(au_id_t auid, uid_t euid, - gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, - au_asid_t sid, au_tid_addr_t *tid) +token_t *au_to_subject64_ex( + __unused au_id_t auid, + __unused uid_t euid, + __unused gid_t egid, + __unused uid_t ruid, + __unused gid_t rgid, + __unused pid_t pid, + __unused au_asid_t sid, + __unused au_tid_addr_t *tid) { return NULL; } @@ -1211,7 +1246,7 @@ token_t *au_to_exec_env(const char **env) * seconds of time 4 bytes/8 bytes (32-bit/64-bit value) * milliseconds of time 4 bytes/8 bytes (32-bit/64-bit value) */ -token_t *kau_to_header32(struct timespec *ctime, int rec_size, +token_t *kau_to_header32(const struct timespec *ctime, int rec_size, au_event_t e_type, au_emod_t e_mod) { token_t *t; @@ -1236,13 +1271,16 @@ token_t *kau_to_header32(struct timespec *ctime, int rec_size, return t; } -token_t *kau_to_header64(struct timespec *ctime, int rec_size, - au_event_t e_type, au_emod_t e_mod) +token_t *kau_to_header64( + __unused const struct timespec *ctime, + __unused int rec_size, + __unused au_event_t e_type, + __unused au_emod_t e_mod) { return NULL; } - -token_t *kau_to_header(struct timespec *ctime, int rec_size, + +token_t *kau_to_header(const struct timespec *ctime, int rec_size, au_event_t e_type, au_emod_t e_mod) { return kau_to_header32(ctime, rec_size, e_type, e_mod); diff --git a/bsd/kern/kern_clock.c 
b/bsd/kern/kern_clock.c index 8e34ca9e2..76c5353de 100644 --- a/bsd/kern/kern_clock.c +++ b/bsd/kern/kern_clock.c @@ -71,8 +71,9 @@ #include <sys/resourcevar.h> #include <sys/kernel.h> #include <sys/resource.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/vm.h> +#include <sys/sysctl.h> #ifdef GPROF #include <sys/gmon.h> @@ -85,6 +86,14 @@ #include <kern/thread_call.h> +void bsd_uprofil(struct time_value *syst, user_addr_t pc); +void get_procrustime(time_value_t *tv); +int sysctl_clockrate(user_addr_t where, size_t *sizep); +int tvtohz(struct timeval *tv); +extern void psignal_sigprof(struct proc *); +extern void psignal_vtalarm(struct proc *); +extern void psignal_xcpu(struct proc *); + /* * Clock handling routines. * @@ -107,13 +116,21 @@ * we run through the statistics gathering routine as well. */ +int hz = 100; /* GET RID OF THIS !!! */ +int tick = (1000000 / 100); /* GET RID OF THIS !!! */ + int bsd_hardclockinit = 0; /*ARGSUSED*/ void -bsd_hardclock(usermode, pc, numticks) - boolean_t usermode; - caddr_t pc; - int numticks; +bsd_hardclock( + boolean_t usermode, +#ifdef GPROF + caddr_t pc, +#else + __unused caddr_t pc, +#endif + int numticks + ) { register struct proc *p; register thread_t thread; @@ -123,17 +140,11 @@ bsd_hardclock(usermode, pc, numticks) if (!bsd_hardclockinit) return; - /* - * Increment the time-of-day. - */ - microtime(&tv); - time = tv; - if (bsd_hardclockinit < 0) { return; } - thread = current_act(); + thread = current_thread(); /* * Charge the time out based on the mode the cpu is in. * Here again we fudge for the lack of proper interval timers @@ -141,7 +152,7 @@ bsd_hardclock(usermode, pc, numticks) * one tick. */ p = (struct proc *)current_proc(); - if (p && ((p->p_flag & P_WEXIT) == NULL)) { + if (p && ((p->p_flag & P_WEXIT) == 0)) { if (usermode) { if (p->p_stats && p->p_stats->p_prof.pr_scale) { p->p_flag |= P_OWEUPC; @@ -156,7 +167,6 @@ bsd_hardclock(usermode, pc, numticks) if (p->p_stats && timerisset(&p->p_stats->p_timer[ITIMER_VIRTUAL].it_value) && !itimerdecr(&p->p_stats->p_timer[ITIMER_VIRTUAL], nusecs)) { - extern void psignal_vtalarm(struct proc *); /* does psignal(p, SIGVTALRM) in a thread context */ thread_call_func((thread_call_func_t)psignal_vtalarm, p, FALSE); @@ -179,7 +189,6 @@ bsd_hardclock(usermode, pc, numticks) thread_read_times(thread, &user_time, &sys_time); if ((sys_time.seconds + user_time.seconds + 1) > p->p_limit->pl_rlimit[RLIMIT_CPU].rlim_cur) { - extern void psignal_xcpu(struct proc *); /* does psignal(p, SIGXCPU) in a thread context */ thread_call_func((thread_call_func_t)psignal_xcpu, p, FALSE); @@ -191,7 +200,6 @@ bsd_hardclock(usermode, pc, numticks) } if (timerisset(&p->p_stats->p_timer[ITIMER_PROF].it_value) && !itimerdecr(&p->p_stats->p_timer[ITIMER_PROF], nusecs)) { - extern void psignal_sigprof(struct proc *); /* does psignal(p, SIGPROF) in a thread context */ thread_call_func((thread_call_func_t)psignal_sigprof, p, FALSE); @@ -213,8 +221,15 @@ bsd_hardclock(usermode, pc, numticks) /*ARGSUSED*/ void gatherstats( - boolean_t usermode, - caddr_t pc) +#ifdef GPROF + boolean_t usermode, + caddr_t pc +#else + __unused boolean_t usermode, + __unused caddr_t pc +#endif + ) + { #ifdef GPROF if (!usermode) { @@ -269,12 +284,46 @@ untimeout( } +/* + * Set a timeout. 
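+ *
+ * Hypothetical usage sketch ('my_callback' and 'my_arg' are placeholders):
+ *
+ *	struct timespec ts = { 1, 0 };		// fire in one second
+ *	bsd_timeout(my_callback, my_arg, &ts);
+ *	...
+ *	bsd_untimeout(my_callback, my_arg);	// cancel if still pending
+ *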
+ * + * fcn: function to call + * param: parameter to pass to function + * ts: timeout interval, in timespec + */ +void +bsd_timeout( + timeout_fcn_t fcn, + void *param, + struct timespec *ts) +{ + uint64_t deadline = 0; + + if (ts && (ts->tv_sec || ts->tv_nsec)) { + nanoseconds_to_absolutetime((uint64_t)ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec, &deadline ); + clock_absolutetime_interval_to_deadline( deadline, &deadline ); + } + thread_call_func_delayed((thread_call_func_t)fcn, param, deadline); +} + +/* + * Cancel a timeout. + */ +void +bsd_untimeout( + register timeout_fcn_t fcn, + register void *param) +{ + thread_call_func_cancel((thread_call_func_t)fcn, param, FALSE); +} + /* * Compute number of hz until specified time. * Used to compute third argument to timeout() from an * absolute time. */ +int hzto(tv) struct timeval *tv; { @@ -309,9 +358,7 @@ hzto(tv) * Return information about system clocks. */ int -sysctl_clockrate(where, sizep) - register char *where; - size_t *sizep; +sysctl_clockrate(user_addr_t where, size_t *sizep) { struct clockinfo clkinfo; @@ -322,7 +369,7 @@ sysctl_clockrate(where, sizep) clkinfo.tick = tick; clkinfo.profhz = hz; clkinfo.stathz = hz; - return sysctl_rdstruct(where, sizep, NULL, &clkinfo, sizeof(clkinfo)); + return sysctl_rdstruct(where, sizep, USER_ADDR_NULL, &clkinfo, sizeof(clkinfo)); } @@ -330,8 +377,7 @@ sysctl_clockrate(where, sizep) * Compute number of ticks in the specified amount of time. */ int -tvtohz(tv) - struct timeval *tv; +tvtohz(struct timeval *tv) { register unsigned long ticks; register long sec, usec; @@ -412,7 +458,7 @@ stopprofclock(p) } void -bsd_uprofil(struct time_value *syst, unsigned int pc) +bsd_uprofil(struct time_value *syst, user_addr_t pc) { struct proc *p = current_proc(); int ticks; diff --git a/bsd/kern/kern_control.c b/bsd/kern/kern_control.c index d57e83851..955bbd375 100644 --- a/bsd/kern/kern_control.c +++ b/bsd/kern/kern_control.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -19,12 +19,12 @@ * * @APPLE_LICENSE_HEADER_END@ */ -/* Copyright (C) 1999 Apple Computer, Inc. */ /* - * NKE management domain - allows control connections to - * an NKE and to read/write data. + * Kernel Control domain - allows control connections to + * and to read/write data. * + * Vincent Lubet, 040506 * Christophe Allie, 010928 * Justin C. 
Walker, 990319 */ @@ -39,7 +39,6 @@ #include <sys/domain.h> #include <sys/malloc.h> #include <sys/mbuf.h> -#include <net/kext_net.h> #include <sys/sys_domain.h> #include <sys/kern_event.h> #include <sys/kern_control.h> @@ -50,7 +49,6 @@ #include <kern/thread.h> - /* * Definitions and vars for we support */ @@ -59,390 +57,842 @@ #define CTL_RECVSIZE (8 * 1024) /* default buffer size */ /* - internal structure maintained for each register controller -*/ -struct ctl -{ - TAILQ_ENTRY(ctl) next; /* controller chain */ - struct socket *skt; /* current controlling socket */ + * Definitions and vars for we support + */ - /* controller information provided when registering */ - u_int32_t id; /* unique nke identifier, provided by DTS */ - u_int32_t unit; /* unit number for use by the nke */ - void *userdata; /* for private use by nke */ - - /* misc communication information */ - u_int32_t flags; /* support flags */ - u_int32_t recvbufsize; /* request more than the default buffer size */ - u_int32_t sendbufsize; /* request more than the default buffer size */ - - /* Dispatch functions */ - int (*connect)(kern_ctl_ref, void *); /* Make contact */ - void (*disconnect)(kern_ctl_ref, void *); /* Break contact */ - int (*write) (kern_ctl_ref, void *, struct mbuf *); /* Send data to nke */ - int (*set)(kern_ctl_ref, void *, int, void *, size_t ); /* set ctl configuration */ - int (*get)(kern_ctl_ref, void *, int, void *, size_t *); /* get ctl configuration */ +static u_int32_t ctl_last_id = 0; +static u_int32_t ctl_max = 256; +static u_int32_t ctl_maxunit = 65536; +static lck_grp_attr_t *ctl_lck_grp_attr = 0; +static lck_attr_t *ctl_lck_attr = 0; +static lck_grp_t *ctl_lck_grp = 0; +static lck_mtx_t *ctl_mtx; + +/* + * internal structure maintained for each register controller + */ + +struct ctl_cb; + +struct kctl +{ + TAILQ_ENTRY(kctl) next; /* controller chain */ + + /* controller information provided when registering */ + char name[MAX_KCTL_NAME]; /* unique nke identifier, provided by DTS */ + u_int32_t id; + u_int32_t reg_unit; + + /* misc communication information */ + u_int32_t flags; /* support flags */ + u_int32_t recvbufsize; /* request more than the default buffer size */ + u_int32_t sendbufsize; /* request more than the default buffer size */ + + /* Dispatch functions */ + ctl_connect_func connect; /* Make contact */ + ctl_disconnect_func disconnect; /* Break contact */ + ctl_send_func send; /* Send data to nke */ + ctl_setopt_func setopt; /* set kctl configuration */ + ctl_getopt_func getopt; /* get kctl configuration */ + + TAILQ_HEAD(, ctl_cb) kcb_head; + u_int32_t lastunit; }; +struct ctl_cb { + TAILQ_ENTRY(ctl_cb) next; /* controller chain */ + lck_mtx_t *mtx; + struct socket *so; /* controlling socket */ + struct kctl *kctl; /* back pointer to controller */ + u_int32_t unit; + void *userdata; +}; /* all the controllers are chained */ -TAILQ_HEAD(, ctl) ctl_head; - -int ctl_attach(struct socket *, int, struct proc *); -int ctl_connect(struct socket *, struct sockaddr *, struct proc *); -int ctl_disconnect(struct socket *); -int ctl_ioctl(struct socket *so, u_long cmd, caddr_t data, +TAILQ_HEAD(, kctl) ctl_head; + +static int ctl_attach(struct socket *, int, struct proc *); +static int ctl_detach(struct socket *); +static int ctl_sofreelastref(struct socket *so); +static int ctl_connect(struct socket *, struct sockaddr *, struct proc *); +static int ctl_disconnect(struct socket *); +static int ctl_ioctl(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct proc *p); -int 
ctl_send(struct socket *, int, struct mbuf *, +static int ctl_send(struct socket *, int, struct mbuf *, struct sockaddr *, struct mbuf *, struct proc *); -int ctl_ctloutput(struct socket *, struct sockopt *); +static int ctl_ctloutput(struct socket *, struct sockopt *); +static int ctl_peeraddr(struct socket *so, struct sockaddr **nam); + +static struct kctl *ctl_find_by_id(u_int32_t); +static struct kctl *ctl_find_by_name(const char *); +static struct kctl *ctl_find_by_id_unit(u_int32_t id, u_int32_t unit); -struct ctl *ctl_find(u_int32_t, u_int32_t unit); -void ctl_post_msg(u_long event_code, u_int32_t id, u_int32_t unit); +static struct ctl_cb *kcb_find(struct kctl *, u_int32_t unit); +static void ctl_post_msg(u_long event_code, u_int32_t id); +static int ctl_lock(struct socket *, int, int); +static int ctl_unlock(struct socket *, int, int); +static lck_mtx_t * ctl_getlock(struct socket *, int); -struct pr_usrreqs ctl_usrreqs = +static struct pr_usrreqs ctl_usrreqs = { pru_abort_notsupp, pru_accept_notsupp, ctl_attach, pru_bind_notsupp, - ctl_connect, pru_connect2_notsupp, ctl_ioctl, pru_detach_notsupp, - ctl_disconnect, pru_listen_notsupp, pru_peeraddr_notsupp, + ctl_connect, pru_connect2_notsupp, ctl_ioctl, ctl_detach, + ctl_disconnect, pru_listen_notsupp, ctl_peeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, ctl_send, pru_sense_null, pru_shutdown_notsupp, pru_sockaddr_notsupp, - sosend, soreceive, sopoll + sosend, soreceive, pru_sopoll_notsupp +}; + +static struct protosw kctlswk_dgram = +{ + SOCK_DGRAM, &systemdomain, SYSPROTO_CONTROL, + PR_ATOMIC|PR_CONNREQUIRED|PR_PCBLOCK, + NULL, NULL, NULL, ctl_ctloutput, + NULL, NULL, + NULL, NULL, NULL, NULL, &ctl_usrreqs, + ctl_lock, ctl_unlock, ctl_getlock, { 0, 0 } , 0, { 0 } }; -struct protosw ctlsw = +static struct protosw kctlswk_stream = { - SOCK_DGRAM, &systemdomain, SYSPROTO_CONTROL, PR_ATOMIC|PR_CONNREQUIRED, + SOCK_STREAM, &systemdomain, SYSPROTO_CONTROL, + PR_CONNREQUIRED|PR_PCBLOCK, NULL, NULL, NULL, ctl_ctloutput, NULL, NULL, - NULL, NULL, NULL, NULL, &ctl_usrreqs + NULL, NULL, NULL, NULL, &ctl_usrreqs, + ctl_lock, ctl_unlock, ctl_getlock, { 0, 0 } , 0, { 0 } }; + /* - * Install the protosw's for the NKE manager. + * Install the protosw's for the Kernel Control manager. 
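+ * Two flavours are registered on the system domain below: a SOCK_DGRAM
+ * protosw and a SOCK_STREAM protosw, both using SYSPROTO_CONTROL.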
*/ -int +__private_extern__ int kern_control_init(void) { - int retval; - - retval = net_add_proto(&ctlsw, &systemdomain); - if (retval) { - log(LOG_WARNING, "Can't install Kernel Controller Manager (%d)\n", retval); - return retval; - } + int error = 0; + + ctl_lck_grp_attr = lck_grp_attr_alloc_init(); + if (ctl_lck_grp_attr == 0) { + printf(": lck_grp_attr_alloc_init failed\n"); + error = ENOMEM; + goto done; + } + lck_grp_attr_setdefault(ctl_lck_grp_attr); + + ctl_lck_grp = lck_grp_alloc_init("Kernel Control Protocol", ctl_lck_grp_attr); + if (ctl_lck_grp == 0) { + printf("kern_control_init: lck_grp_alloc_init failed\n"); + error = ENOMEM; + goto done; + } + + ctl_lck_attr = lck_attr_alloc_init(); + if (ctl_lck_attr == 0) { + printf("kern_control_init: lck_attr_alloc_init failed\n"); + error = ENOMEM; + goto done; + } + lck_attr_setdefault(ctl_lck_attr); + + ctl_mtx = lck_mtx_alloc_init(ctl_lck_grp, ctl_lck_attr); + if (ctl_mtx == 0) { + printf("kern_control_init: lck_mtx_alloc_init failed\n"); + error = ENOMEM; + goto done; + } + TAILQ_INIT(&ctl_head); + + error = net_add_proto(&kctlswk_dgram, &systemdomain); + if (error) { + log(LOG_WARNING, "kern_control_init: net_add_proto dgram failed (%d)\n", error); + } + error = net_add_proto(&kctlswk_stream, &systemdomain); + if (error) { + log(LOG_WARNING, "kern_control_init: net_add_proto stream failed (%d)\n", error); + } + + done: + if (error != 0) { + if (ctl_mtx) { + lck_mtx_free(ctl_mtx, ctl_lck_grp); + ctl_mtx = 0; + } + if (ctl_lck_grp) { + lck_grp_free(ctl_lck_grp); + ctl_lck_grp = 0; + } + if (ctl_lck_grp_attr) { + lck_grp_attr_free(ctl_lck_grp_attr); + ctl_lck_grp_attr = 0; + } + if (ctl_lck_attr) { + lck_attr_free(ctl_lck_attr); + ctl_lck_attr = 0; + } + } + return error; +} - TAILQ_INIT(&ctl_head); - - return(KERN_SUCCESS); +static void +kcb_delete(struct ctl_cb *kcb) +{ + if (kcb != 0) { + if (kcb->mtx != 0) + lck_mtx_free(kcb->mtx, ctl_lck_grp); + FREE(kcb, M_TEMP); + } } /* * Kernel Controller user-request functions + * attach function must exist and succeed + * detach not necessary + * we need a pcb for the per socket mutex */ -int -ctl_attach (struct socket *so, int proto, struct proc *p) +static int +ctl_attach(__unused struct socket *so, __unused int proto, __unused struct proc *p) { - /* - * attach function must exist and succeed - * detach not necessary since we use - * connect/disconnect to handle so_pcb - */ + int error = 0; + struct ctl_cb *kcb = 0; + + MALLOC(kcb, struct ctl_cb *, sizeof(struct ctl_cb), M_TEMP, M_WAITOK); + if (kcb == NULL) { + error = ENOMEM; + goto quit; + } + bzero(kcb, sizeof(struct ctl_cb)); + + kcb->mtx = lck_mtx_alloc_init(ctl_lck_grp, ctl_lck_attr); + if (kcb->mtx == NULL) { + error = ENOMEM; + goto quit; + } + kcb->so = so; + so->so_pcb = (caddr_t)kcb; + +quit: + if (error != 0) { + kcb_delete(kcb); + kcb = 0; + } + return error; +} + +static int +ctl_sofreelastref(struct socket *so) +{ + struct ctl_cb *kcb = (struct ctl_cb *)so->so_pcb; + + so->so_pcb = 0; + + if (kcb != 0) { + struct kctl *kctl; + if ((kctl = kcb->kctl) != 0) { + lck_mtx_lock(ctl_mtx); + TAILQ_REMOVE(&kctl->kcb_head, kcb, next); + lck_mtx_lock(ctl_mtx); + } + kcb_delete(kcb); + } + return 0; +} + +static int +ctl_detach(struct socket *so) +{ + struct ctl_cb *kcb = (struct ctl_cb *)so->so_pcb; + if (kcb == 0) + return 0; + + soisdisconnected(so); + so->so_flags |= SOF_PCBCLEARING; return 0; } -int -ctl_connect(struct socket *so, struct sockaddr *nam, struct proc *p) + +static int +ctl_connect(struct socket *so, struct 
sockaddr *nam, __unused struct proc *p) { - struct ctl *ctl; - int error = 0; - struct sockaddr_ctl *sa = (struct sockaddr_ctl *)nam; + struct kctl *kctl; + int error = 0; + struct sockaddr_ctl sa; + struct ctl_cb *kcb = (struct ctl_cb *)so->so_pcb; + + if (kcb == 0) + panic("ctl_connect so_pcb null\n"); + + if (nam->sa_len != sizeof(struct sockaddr_ctl)) + return(EINVAL); + + bcopy(nam, &sa, sizeof(struct sockaddr_ctl)); + + lck_mtx_lock(ctl_mtx); + kctl = ctl_find_by_id_unit(sa.sc_id, sa.sc_unit); + if (kctl == NULL) { + lck_mtx_unlock(ctl_mtx); + return ENOENT; + } - ctl = ctl_find(sa->sc_id, sa->sc_unit); - if (ctl == NULL) - return(EADDRNOTAVAIL); + if (((kctl->flags & CTL_FLAG_REG_SOCK_STREAM) && (so->so_type != SOCK_STREAM)) || + (!(kctl->flags & CTL_FLAG_REG_SOCK_STREAM) && (so->so_type != SOCK_DGRAM))) { + lck_mtx_unlock(ctl_mtx); + return EPROTOTYPE; + } - if (ctl->flags & CTL_FLAG_PRIVILEGED) { - if (p == 0) + if (kctl->flags & CTL_FLAG_PRIVILEGED) { + if (p == 0) { + lck_mtx_unlock(ctl_mtx); return(EINVAL); - if (error = suser(p->p_ucred, &p->p_acflag)) + } + if ((error = proc_suser(p))) { + lck_mtx_unlock(ctl_mtx); return error; + } + } + + if ((kctl->flags & CTL_FLAG_REG_ID_UNIT) || sa.sc_unit != 0) { + if (kcb_find(kctl, sa.sc_unit) != NULL) { + lck_mtx_unlock(ctl_mtx); + return EBUSY; + } + } else { + u_int32_t unit = kctl->lastunit + 1; + + while (1) { + if (unit == ctl_maxunit) + unit = 1; + if (kcb_find(kctl, unit) == NULL) { + kctl->lastunit = sa.sc_unit = unit; + break; + } + if (unit++ == kctl->lastunit) { + lck_mtx_unlock(ctl_mtx); + return EBUSY; + } + } } - if (ctl->skt != NULL) - return(EBUSY); + kcb->unit = sa.sc_unit; + kcb->kctl = kctl; + TAILQ_INSERT_TAIL(&kctl->kcb_head, kcb, next); + lck_mtx_unlock(ctl_mtx); - error = soreserve(so, - ctl->sendbufsize ? ctl->sendbufsize : CTL_SENDSIZE, - ctl->recvbufsize ? 
ctl->recvbufsize : CTL_RECVSIZE); + error = soreserve(so, kctl->sendbufsize, kctl->recvbufsize); if (error) - return error; + goto done; + soisconnecting(so); - ctl->skt = so; + socket_unlock(so, 0); + error = (*kctl->connect)(kctl, &sa, &kcb->userdata); + socket_lock(so, 0); + if (error) + goto done; - if (ctl->connect) - error = (*ctl->connect)(ctl, ctl->userdata); + soisconnected(so); + +done: if (error) { - ctl->skt = NULL; - return error; + soisdisconnected(so); + lck_mtx_lock(ctl_mtx); + kcb->kctl = 0; + kcb->unit = 0; + TAILQ_REMOVE(&kctl->kcb_head, kcb, next); + lck_mtx_unlock(ctl_mtx); } - - so->so_pcb = (caddr_t)ctl; - soisconnected(so); - return error; } -int +static int ctl_disconnect(struct socket *so) { - struct ctl *ctl; - - if ((ctl = (struct ctl *)so->so_pcb)) - { - if (ctl->disconnect) - (*ctl->disconnect)(ctl, ctl->userdata); - ctl->skt = NULL; - so->so_pcb = NULL; + struct ctl_cb *kcb = (struct ctl_cb *)so->so_pcb; + + if ((kcb = (struct ctl_cb *)so->so_pcb)) { + struct kctl *kctl = kcb->kctl; + + if (kctl && kctl->disconnect) { + socket_unlock(so, 0); + (*kctl->disconnect)(kctl, kcb->unit, kcb->userdata); + socket_lock(so, 0); + } + lck_mtx_lock(ctl_mtx); + kcb->kctl = 0; + kcb->unit = 0; + TAILQ_REMOVE(&kctl->kcb_head, kcb, next); soisdisconnected(so); + lck_mtx_unlock(ctl_mtx); } return 0; } -int -ctl_send(struct socket *so, int flags, struct mbuf *m, - struct sockaddr *addr, struct mbuf *control, - struct proc *p) +static int +ctl_peeraddr(struct socket *so, struct sockaddr **nam) { - struct ctl *ctl = (struct ctl *)so->so_pcb; - int error = 0; - - if (ctl == NULL) - return(ENOTCONN); - - if (ctl->write) - error = (*ctl->write)(ctl, ctl->userdata, m); - - return error; + struct ctl_cb *kcb = (struct ctl_cb *)so->so_pcb; + struct kctl *kctl; + struct sockaddr_ctl sc; + + if (kcb == NULL) /* sanity check */ + return(ENOTCONN); + + if ((kctl = kcb->kctl) == NULL) + return(EINVAL); + + bzero(&sc, sizeof(struct sockaddr_ctl)); + sc.sc_len = sizeof(struct sockaddr_ctl); + sc.sc_family = AF_SYSTEM; + sc.ss_sysaddr = AF_SYS_CONTROL; + sc.sc_id = kctl->id; + sc.sc_unit = kcb->unit; + + *nam = dup_sockaddr((struct sockaddr *)&sc, 1); + + return 0; } -int -ctl_enqueuembuf(void *ctlref, struct mbuf *m, u_int32_t flags) +static int +ctl_send(struct socket *so, int flags, struct mbuf *m, + __unused struct sockaddr *addr, __unused struct mbuf *control, + __unused struct proc *p) { - struct ctl *ctl = (struct ctl *)ctlref; - struct socket *so = (struct socket *)ctl->skt; - - if (ctl == NULL) /* sanity check */ - return(EINVAL); - - if (so == NULL) - return(ENOTCONN); - - if (sbspace(&so->so_rcv) < m->m_pkthdr.len) - return(ENOBUFS); - - sbappend(&so->so_rcv, m); - if ((flags & CTL_DATA_NOWAKEUP) == 0) - sorwakeup(so); - return 0; + int error = 0; + struct ctl_cb *kcb = (struct ctl_cb *)so->so_pcb; + struct kctl *kctl; + + if (kcb == NULL) /* sanity check */ + return(ENOTCONN); + + if ((kctl = kcb->kctl) == NULL) + return(EINVAL); + + if (kctl->send) { + socket_unlock(so, 0); + error = (*kctl->send)(kctl, kcb->unit, kcb->userdata, m, flags); + socket_lock(so, 0); + } + return error; } -int -ctl_enqueuedata(void *ctlref, void *data, size_t len, u_int32_t flags) +errno_t +ctl_enqueuembuf(void *kctlref, u_int32_t unit, struct mbuf *m, u_int32_t flags) { - struct ctl *ctl = (struct ctl *)ctlref; - struct socket *so = (struct socket *)ctl->skt; - struct mbuf *m; - - if (ctl == NULL) /* sanity check */ - return(EINVAL); - - if (so == NULL) - return(ENOTCONN); - - if (len > MCLBYTES) - 
return(EMSGSIZE); + struct ctl_cb *kcb; + struct socket *so; + errno_t error = 0; + struct kctl *kctl = (struct kctl *)kctlref; + + if (kctl == NULL) + return EINVAL; + + kcb = kcb_find(kctl, unit); + if (kcb == NULL) + return EINVAL; + + so = (struct socket *)kcb->so; + if (so == NULL) + return EINVAL; + + socket_lock(so, 1); + if (sbspace(&so->so_rcv) < m->m_pkthdr.len) { + error = ENOBUFS; + goto bye; + } + if ((flags & CTL_DATA_EOR)) + m->m_flags |= M_EOR; + if (sbappend(&so->so_rcv, m) && (flags & CTL_DATA_NOWAKEUP) == 0) + sorwakeup(so); +bye: + socket_unlock(so, 1); + return error; +} - if (sbspace(&so->so_rcv) < len) - return(ENOBUFS); - - if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) - return (ENOBUFS); - - if (len > MHLEN) { - MCLGET(m, M_NOWAIT); - if (!(m->m_flags & M_EXT)) { - m_freem(m); - return(ENOBUFS); - } - } +errno_t +ctl_enqueuedata(void *kctlref, u_int32_t unit, void *data, size_t len, u_int32_t flags) +{ + struct ctl_cb *kcb; + struct socket *so; + struct mbuf *m; + errno_t error = 0; + struct kctl *kctl = (struct kctl *)kctlref; + unsigned int num_needed; + struct mbuf *n; + size_t curlen = 0; + + if (kctlref == NULL) + return EINVAL; + + kcb = kcb_find(kctl, unit); + if (kcb == NULL) + return EINVAL; + + so = (struct socket *)kcb->so; + if (so == NULL) + return EINVAL; + + socket_lock(so, 1); + if ((size_t)sbspace(&so->so_rcv) < len) { + error = ENOBUFS; + goto bye; + } + + num_needed = 1; + m = m_allocpacket_internal(&num_needed, len, NULL, M_NOWAIT, 1, 0); + if (m == NULL) { + printf("ctl_enqueuedata: m_allocpacket_internal(%lu) failed\n", len); + error = ENOBUFS; + goto bye; + } + + for (n = m; n != NULL; n = n->m_next) { + size_t mlen = mbuf_maxlen(n); + + if (mlen + curlen > len) + mlen = len - curlen; + n->m_len = mlen; + bcopy((char *)data + curlen, n->m_data, mlen); + curlen += mlen; + } + mbuf_pkthdr_setlen(m, curlen); + + if ((flags & CTL_DATA_EOR)) + m->m_flags |= M_EOR; + if (sbappend(&so->so_rcv, m) && (flags & CTL_DATA_NOWAKEUP) == 0) + sorwakeup(so); +bye: + socket_unlock(so, 1); + return error; +} - bcopy(data, mtod(m, void *), len); - m->m_pkthdr.len = m->m_len = len; - sbappend(&so->so_rcv, m); - if ((flags & CTL_DATA_NOWAKEUP) == 0) - sorwakeup(so); - return 0; +errno_t +ctl_getenqueuespace(kern_ctl_ref kctlref, u_int32_t unit, size_t *space) +{ + struct ctl_cb *kcb; + struct kctl *kctl = (struct kctl *)kctlref; + struct socket *so; + + if (kctlref == NULL || space == NULL) + return EINVAL; + + kcb = kcb_find(kctl, unit); + if (kcb == NULL) + return EINVAL; + + so = (struct socket *)kcb->so; + if (so == NULL) + return EINVAL; + + socket_lock(so, 1); + *space = sbspace(&so->so_rcv); + socket_unlock(so, 1); + + return 0; } -int +static int ctl_ctloutput(struct socket *so, struct sockopt *sopt) { - struct ctl *ctl = (struct ctl *)so->so_pcb; - int error = 0, s; - void *data; - size_t len; + struct ctl_cb *kcb = (struct ctl_cb *)so->so_pcb; + struct kctl *kctl; + int error = 0; + void *data; + size_t len; + + if (sopt->sopt_level != SYSPROTO_CONTROL) { + return(EINVAL); + } + + if (kcb == NULL) /* sanity check */ + return(ENOTCONN); + + if ((kctl = kcb->kctl) == NULL) + return(EINVAL); + + switch (sopt->sopt_dir) { + case SOPT_SET: + if (kctl->setopt == NULL) + return(ENOTSUP); + MALLOC(data, void *, sopt->sopt_valsize, M_TEMP, M_WAITOK); + if (data == NULL) + return(ENOMEM); + error = sooptcopyin(sopt, data, sopt->sopt_valsize, sopt->sopt_valsize); + if (error == 0) { + socket_unlock(so, 0); + error = (*kctl->setopt)(kcb->kctl, kcb->unit, 
kcb->userdata, sopt->sopt_name, + data, sopt->sopt_valsize); + socket_lock(so, 0); + } + FREE(data, M_TEMP); + break; + + case SOPT_GET: + if (kctl->getopt == NULL) + return(ENOTSUP); + data = NULL; + if (sopt->sopt_valsize && sopt->sopt_val) { + MALLOC(data, void *, sopt->sopt_valsize, M_TEMP, M_WAITOK); + if (data == NULL) + return(ENOMEM); + } + len = sopt->sopt_valsize; + socket_unlock(so, 0); + error = (*kctl->getopt)(kcb->kctl, kcb->unit, kcb->userdata, sopt->sopt_name, + data, &len); + socket_lock(so, 0); + if (error == 0) { + if (data != NULL) + error = sooptcopyout(sopt, data, len); + else + sopt->sopt_valsize = len; + } + if (data != NULL) + FREE(data, M_TEMP); + break; + } + return error; +} - if (sopt->sopt_level != SYSPROTO_CONTROL) { - return(EINVAL); - } +static int +ctl_ioctl(__unused struct socket *so, u_long cmd, caddr_t data, + __unused struct ifnet *ifp, __unused struct proc *p) +{ + int error = ENOTSUP; + + switch (cmd) { + /* get the number of controllers */ + case CTLIOCGCOUNT: { + struct kctl *kctl; + int n = 0; + + lck_mtx_lock(ctl_mtx); + TAILQ_FOREACH(kctl, &ctl_head, next) + n++; + lck_mtx_unlock(ctl_mtx); + + *(u_int32_t *)data = n; + error = 0; + break; + } + case CTLIOCGINFO: { + struct ctl_info *ctl_info = (struct ctl_info *)data; + struct kctl *kctl = 0; + size_t name_len = strlen(ctl_info->ctl_name); + + if (name_len == 0 || name_len + 1 > MAX_KCTL_NAME) { + error = EINVAL; + break; + } + lck_mtx_lock(ctl_mtx); + kctl = ctl_find_by_name(ctl_info->ctl_name); + lck_mtx_unlock(ctl_mtx); + if (kctl == 0) { + error = ENOENT; + break; + } + ctl_info->ctl_id = kctl->id; + error = 0; + break; + } + + /* add controls to get list of NKEs */ + + } + + return error; +} - if (ctl == NULL) - return(ENOTCONN); - - switch (sopt->sopt_dir) { - case SOPT_SET: - if (ctl->set == NULL) - return(ENOTSUP); - MALLOC(data, void *, sopt->sopt_valsize, M_TEMP, M_WAITOK); - if (data == NULL) - return(ENOMEM); - error = sooptcopyin(sopt, data, sopt->sopt_valsize, sopt->sopt_valsize); - if (error == 0) - error = (*ctl->set)(ctl, ctl->userdata, sopt->sopt_name, data, sopt->sopt_valsize); - FREE(data, M_TEMP); - break; - - case SOPT_GET: - if (ctl->get == NULL) - return(ENOTSUP); - data = NULL; - if (sopt->sopt_valsize && sopt->sopt_val) { - MALLOC(data, void *, sopt->sopt_valsize, M_TEMP, M_WAITOK); - if (data == NULL) - return(ENOMEM); - } - len = sopt->sopt_valsize; - error = (*ctl->get)(ctl, ctl->userdata, sopt->sopt_name, data, &len); - if (error == 0) { - if (data != NULL) - error = sooptcopyout(sopt, data, len); - else - sopt->sopt_valsize = len; - } - if (data != NULL) - FREE(data, M_TEMP); - break; - } - return error; +/* + * Register/unregister a NKE + */ +errno_t +ctl_register(struct kern_ctl_reg *userkctl, kern_ctl_ref *kctlref) +{ + struct kctl *kctl = 0; + u_int32_t id = -1; + u_int32_t n; + size_t name_len; + + if (userkctl == NULL) /* sanity check */ + return(EINVAL); + if (userkctl->ctl_connect == NULL) + return(EINVAL); + name_len = strlen(userkctl->ctl_name); + if (name_len == 0 || name_len + 1 > MAX_KCTL_NAME) + return(EINVAL); + + MALLOC(kctl, struct kctl *, sizeof(*kctl), M_TEMP, M_WAITOK); + if (kctl == NULL) + return(ENOMEM); + bzero((char *)kctl, sizeof(*kctl)); + + lck_mtx_lock(ctl_mtx); + + if ((userkctl->ctl_flags & CTL_FLAG_REG_ID_UNIT) == 0) { + if (ctl_find_by_name(userkctl->ctl_name) != NULL) { + lck_mtx_unlock(ctl_mtx); + FREE(kctl, M_TEMP); + return(EEXIST); + } + for (n = 0, id = ctl_last_id + 1; n < ctl_max; id++, n++) { + if (id == 0) { + n--; + 
continue;
+            }
+            if (ctl_find_by_id(id) == 0)
+                break;
+        }
+        if (id == ctl_max) {
+            lck_mtx_unlock(ctl_mtx);
+            FREE(kctl, M_TEMP);
+            return(ENOBUFS);
+        }
+        userkctl->ctl_id = id;
+        kctl->id = id;
+        kctl->reg_unit = -1;
+    } else {
+        if (ctl_find_by_id_unit(userkctl->ctl_id, userkctl->ctl_unit) != NULL) {
+            lck_mtx_unlock(ctl_mtx);
+            FREE(kctl, M_TEMP);
+            return(EEXIST);
+        }
+        kctl->id = userkctl->ctl_id;
+        kctl->reg_unit = userkctl->ctl_unit;
+    }
+    strcpy(kctl->name, userkctl->ctl_name);
+    kctl->flags = userkctl->ctl_flags;
+
+    /* Let the caller know the default send and receive sizes */
+    if (userkctl->ctl_sendsize == 0)
+        userkctl->ctl_sendsize = CTL_SENDSIZE;
+    kctl->sendbufsize = userkctl->ctl_sendsize;
+
+    if (userkctl->ctl_recvsize == 0)
+        userkctl->ctl_recvsize = CTL_RECVSIZE;
+    kctl->recvbufsize = userkctl->ctl_recvsize;
+
+    kctl->connect = userkctl->ctl_connect;
+    kctl->disconnect = userkctl->ctl_disconnect;
+    kctl->send = userkctl->ctl_send;
+    kctl->setopt = userkctl->ctl_setopt;
+    kctl->getopt = userkctl->ctl_getopt;
+
+    TAILQ_INIT(&kctl->kcb_head);
+
+    TAILQ_INSERT_TAIL(&ctl_head, kctl, next);
+    ctl_max++;
+
+    lck_mtx_unlock(ctl_mtx);
+
+    *kctlref = kctl;
+
+    ctl_post_msg(KEV_CTL_REGISTERED, kctl->id);
+    return(0);
 }

-int ctl_ioctl(struct socket *so, u_long cmd, caddr_t data,
-    struct ifnet *ifp, struct proc *p)
-{
-    int error = ENOTSUP, s, n;
-    struct ctl *ctl = (struct ctl *)so->so_pcb;
-
-    switch (cmd) {
-    /* get the number of controllers */
-    case CTLIOCGCOUNT:
-        n = 0;
-        TAILQ_FOREACH(ctl, &ctl_head, next)
-            n++;
-        *(u_int32_t *)data = n;
-        error = 0;
-        break;
-
+errno_t
+ctl_deregister(void *kctlref)
+{
+    struct kctl *kctl;

-    /* add controls to get list of NKEs */
+    if (kctlref == NULL) /* sanity check */
+        return(EINVAL);

+    lck_mtx_lock(ctl_mtx);
+    TAILQ_FOREACH(kctl, &ctl_head, next) {
+        if (kctl == (struct kctl *)kctlref)
+            break;
     }
+    if (kctl != (struct kctl *)kctlref) {
+        lck_mtx_unlock(ctl_mtx);
+        return EINVAL;
+    }
+    if (!TAILQ_EMPTY(&kctl->kcb_head)) {
+        lck_mtx_unlock(ctl_mtx);
+        return EBUSY;
+    }
+
+    TAILQ_REMOVE(&ctl_head, kctl, next);
+    ctl_max--;
+
+    lck_mtx_unlock(ctl_mtx);

-    return error;
+    ctl_post_msg(KEV_CTL_DEREGISTERED, kctl->id);
+    FREE(kctl, M_TEMP);
+    return(0);
 }

 /*
- * Register/unregister a NKE
+ * Must be called with global lock taken
  */
-int
-ctl_register(struct kern_ctl_reg *userctl, void *userdata, kern_ctl_ref *ctlref)
+static struct kctl *
+ctl_find_by_id(u_int32_t id)
 {
-    struct ctl *ctl;
+    struct kctl *kctl;

-    if (userctl == NULL) /* sanity check */
-        return(EINVAL);
-
-    ctl = ctl_find(userctl->ctl_id, userctl->ctl_unit);
-    if (ctl != NULL)
-        return(EEXIST);
-
-    MALLOC(ctl, struct ctl *, sizeof(*ctl), M_TEMP, M_WAITOK);
-    if (ctl == NULL)
-        return(ENOMEM);
-
-    bzero((char *)ctl, sizeof(*ctl));
-
-    ctl->id = userctl->ctl_id;
-    ctl->unit = userctl->ctl_unit;
-    ctl->flags = userctl->ctl_flags;
-    ctl->sendbufsize = userctl->ctl_sendsize;
-    ctl->recvbufsize = userctl->ctl_recvsize;
-    ctl->userdata = userdata;
-    ctl->connect = userctl->ctl_connect;
-    ctl->disconnect = userctl->ctl_disconnect;
-    ctl->write = userctl->ctl_write;
-    ctl->set = userctl->ctl_set;
-    ctl->get = userctl->ctl_get;
-
-    TAILQ_INSERT_TAIL(&ctl_head, ctl, next);
-
-    *ctlref = ctl;
+    TAILQ_FOREACH(kctl, &ctl_head, next)
+        if (kctl->id == id)
+            return kctl;

-    ctl_post_msg(KEV_CTL_REGISTERED, ctl->id, ctl->unit);
-    return(0);
+    return NULL;
 }
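
From the kext side, the registration KPI above is driven through ctl_register()/ctl_deregister(). A minimal sketch, assuming hypothetical names and a trivial echo policy; ctl_connect is the one callback that ctl_register() requires:

    /* kext-side sketch: register a control, echo datagrams back to the client */
    #include <sys/systm.h>
    #include <sys/kern_control.h>
    #include <sys/kpi_mbuf.h>

    static kern_ctl_ref example_ctlref;

    static errno_t
    example_connect(kern_ctl_ref ref, struct sockaddr_ctl *sac, void **unitinfo)
    {
        *unitinfo = NULL;           /* no per-unit state in this sketch */
        return 0;
    }

    static errno_t
    example_send(kern_ctl_ref ref, u_int32_t unit, void *unitinfo,
        mbuf_t m, int flags)
    {
        /* queue the client's own datagram back onto its receive buffer */
        errno_t error = ctl_enqueuembuf(ref, unit, m, 0);
        if (error)
            mbuf_freem(m);          /* ctl_enqueuembuf() does not consume on failure */
        return error;
    }

    errno_t
    example_start(void)
    {
        struct kern_ctl_reg reg;

        bzero(&reg, sizeof(reg));
        strcpy(reg.ctl_name, "com.example.kctl");
        reg.ctl_connect = example_connect;      /* mandatory */
        reg.ctl_send = example_send;
        /* zero ctl_sendsize/ctl_recvsize pick up CTL_SENDSIZE/CTL_RECVSIZE */
        return ctl_register(&reg, &example_ctlref);
    }
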
-int
-ctl_deregister(void *ctlref)
+/*
+ * Must be called with global ctl_mtx lock taken
+ */
+static struct kctl *
+ctl_find_by_name(const char *name)
 {
-    struct ctl *ctl = (struct ctl *)ctlref;
-    struct socket *so;
+    struct kctl *kctl;

-    if (ctl == NULL) /* sanity check */
-        return(EINVAL);
+    TAILQ_FOREACH(kctl, &ctl_head, next)
+        if (strcmp(kctl->name, name) == 0)
+            return kctl;

-    TAILQ_REMOVE(&ctl_head, ctl, next);
+    return NULL;
+}

-    if (ctl->skt) {
-        ctl->skt->so_pcb = 0;
-        soisdisconnected(ctl->skt);
+/*
+ * Must be called with global ctl_mtx lock taken
+ *
+ */
+static struct kctl *
+ctl_find_by_id_unit(u_int32_t id, u_int32_t unit)
+{
+    struct kctl *kctl;
+
+    TAILQ_FOREACH(kctl, &ctl_head, next) {
+        if (kctl->id == id && (kctl->flags & CTL_FLAG_REG_ID_UNIT) == 0)
+            return kctl;
+        else if (kctl->id == id && kctl->reg_unit == unit)
+            return kctl;
     }
-
-    ctl_post_msg(KEV_CTL_DEREGISTERED, ctl->id, ctl->unit);
-    FREE(ctl, M_TEMP);
-    return(0);
+    return NULL;
 }

 /*
- * Locate a NKE
+ * Must be called with kernel controller lock taken
  */
-struct ctl *
-ctl_find(u_int32_t id, u_int32_t unit)
+static struct ctl_cb *
+kcb_find(struct kctl *kctl, u_int32_t unit)
 {
-    struct ctl *ctl;
+    struct ctl_cb *kcb;

-    TAILQ_FOREACH(ctl, &ctl_head, next)
-        if ((ctl->id == id) && (ctl->unit == unit))
-            return ctl;
+    TAILQ_FOREACH(kcb, &kctl->kcb_head, next)
+        if (kcb->unit == unit)
+            return kcb;

     return NULL;
 }

-void ctl_post_msg(u_long event_code, u_int32_t id, u_int32_t unit)
+/*
+ * Must be called without lock
+ */
+static void
+ctl_post_msg(u_long event_code, u_int32_t id)
 {
     struct ctl_event_data ctl_ev_data;
     struct kev_msg ev_msg;
@@ -456,7 +906,6 @@ void ctl_post_msg(u_long event_code, u_int32_t id, u_int32_t unit)
     /* common nke subclass data */
     bzero(&ctl_ev_data, sizeof(ctl_ev_data));
     ctl_ev_data.ctl_id = id;
-    ctl_ev_data.ctl_unit = unit;

     ev_msg.dv[0].data_ptr = &ctl_ev_data;
     ev_msg.dv[0].data_length = sizeof(ctl_ev_data);
@@ -465,3 +914,83 @@ void ctl_post_msg(u_long event_code, u_int32_t id, u_int32_t unit)
     kev_post_msg(&ev_msg);
 }

+static int
+ctl_lock(struct socket *so, int refcount, int lr)
+{
+    int lr_saved;
+#ifdef __ppc__
+    if (lr == 0) {
+        __asm__ volatile("mflr %0" : "=r" (lr_saved));
+    }
+    else lr_saved = lr;
+#endif
+
+    if (so->so_pcb) {
+        lck_mtx_lock(((struct ctl_cb *)so->so_pcb)->mtx);
+    } else {
+        panic("ctl_lock: so=%x NO PCB!
lr=%x\n", so, lr_saved); + lck_mtx_lock(so->so_proto->pr_domain->dom_mtx); + } + + if (so->so_usecount < 0) + panic("ctl_lock: so=%x so_pcb=%x lr=%x ref=%x\n", + so, so->so_pcb, lr_saved, so->so_usecount); + + if (refcount) + so->so_usecount++; + so->reserved3 = (void *)lr_saved; + return (0); +} + +static int +ctl_unlock(struct socket *so, int refcount, int lr) +{ + int lr_saved; + lck_mtx_t * mutex_held; + +#ifdef __ppc__ + if (lr == 0) { + __asm__ volatile("mflr %0" : "=r" (lr_saved)); + } + else lr_saved = lr; +#endif + +#ifdef MORE_KCTLLOCK_DEBUG + printf("ctl_unlock: so=%x sopcb=%x lock=%x ref=%x lr=%x\n", + so, so->so_pcb, ((struct ctl_cb *)so->so_pcb)->mtx, so->so_usecount, lr_saved); +#endif + if (refcount) + so->so_usecount--; + + if (so->so_usecount < 0) + panic("ctl_unlock: so=%x usecount=%x\n", so, so->so_usecount); + if (so->so_pcb == NULL) { + panic("ctl_unlock: so=%x NO PCB usecount=%x lr=%x\n", so, so->so_usecount, lr_saved); + mutex_held = so->so_proto->pr_domain->dom_mtx; + } else { + mutex_held = ((struct ctl_cb *)so->so_pcb)->mtx; + } + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); + lck_mtx_unlock(mutex_held); + so->reserved4 = (void *)lr_saved; + + if (so->so_usecount == 0) + ctl_sofreelastref(so); + + return (0); +} + +static lck_mtx_t * +ctl_getlock(struct socket *so, __unused int locktype) +{ + struct ctl_cb *kcb = (struct ctl_cb *)so->so_pcb; + + if (so->so_pcb) { + if (so->so_usecount < 0) + panic("ctl_getlock: so=%x usecount=%x\n", so, so->so_usecount); + return(kcb->mtx); + } else { + panic("ctl_getlock: so=%x NULL so_pcb\n", so); + return (so->so_proto->pr_domain->dom_mtx); + } +} diff --git a/bsd/kern/kern_core.c b/bsd/kern/kern_core.c index fe156fcbf..d17444fd6 100644 --- a/bsd/kern/kern_core.c +++ b/bsd/kern/kern_core.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -35,13 +35,13 @@ #include <sys/signalvar.h> #include <sys/resourcevar.h> #include <sys/namei.h> -#include <sys/vnode.h> -#include <sys/proc.h> +#include <sys/vnode_internal.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/timeb.h> #include <sys/times.h> -#include <sys/buf.h> #include <sys/acct.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/uio.h> #include <sys/kernel.h> #include <sys/stat.h> @@ -51,6 +51,11 @@ #include <mach/vm_statistics.h> #include <vm/vm_kern.h> +#include <vm/vm_protos.h> /* last */ +#include <vm/vm_map.h> /* current_map() */ +#include <mach/mach_vm.h> /* mach_vm_region_recurse() */ +#include <mach/task.h> /* task_suspend() */ +#include <kern/task.h> /* get_task_numacts() */ typedef struct { int flavor; /* the number for this flavor */ @@ -93,23 +98,26 @@ typedef struct { } tir_t; /* XXX should be static */ -void collectth_state(thread_act_t th_act, tir_t *t); +void collectth_state(thread_t th_act, void *tirp); /* XXX not in a Mach header anywhere */ -kern_return_t thread_getstatus(register thread_act_t act, int flavor, +kern_return_t thread_getstatus(register thread_t act, int flavor, thread_state_t tstate, mach_msg_type_number_t *count); +void task_act_iterate_wth_args(task_t, void(*)(thread_t, void *), void *); -__private_extern__ do_coredump = 1; /* default: dump cores */ -__private_extern__ sugid_coredump = 0; /* deafult: but not on SGUID binaries */ +__private_extern__ int do_coredump = 1; /* default: dump cores */ +__private_extern__ int sugid_coredump = 0; /* default: but not SGUID binaries */ void -collectth_state(thread_act_t th_act, tir_t *t) +collectth_state(thread_t th_act, void *tirp) { vm_offset_t header; int hoffset, i ; mythread_state_flavor_t *flavors; struct thread_command *tc; + tir_t *t = (tir_t *)tirp; + /* * Fill in thread command structure. */ @@ -140,9 +148,6 @@ collectth_state(thread_act_t th_act, tir_t *t) t->hoffset = hoffset; } -extern boolean_t coredumpok(vm_map_t map, vm_offset_t va); /* temp fix */ -extern task_t current_task(void); /* XXX */ - /* * Create a core image on the file "core". 
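
The do_coredump / sugid_coredump gates above are runtime-tunable; a small user-space probe, assuming the conventional kern.coredump and kern.sugid_coredump sysctl bindings for these two variables:

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/sysctl.h>

    int
    main(void)
    {
        int coredump, sugid;
        size_t len = sizeof(int);

        /* read the "dump at all" gate */
        if (sysctlbyname("kern.coredump", &coredump, &len, NULL, 0) != 0)
            return 1;
        len = sizeof(int);
        /* read the "dump SUID/SGID binaries" gate */
        if (sysctlbyname("kern.sugid_coredump", &sugid, &len, NULL, 0) != 0)
            return 1;
        printf("core dumps: %s, for set-id binaries: %s\n",
            coredump ? "on" : "off", sugid ? "on" : "off");
        return 0;
    }
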
*/ @@ -151,19 +156,17 @@ int coredump(struct proc *p) { int error=0; - register struct pcred *pcred = p->p_cred; - register struct ucred *cred = pcred->pc_ucred; - struct nameidata nd; - struct vattr vattr; + kauth_cred_t cred = kauth_cred_get(); + struct vnode_attr va; + struct vfs_context context; vm_map_t map; int thread_count, segment_count; int command_size, header_size, tstate_size; - int hoffset, foffset, vmoffset; + int hoffset; + off_t foffset; + vm_map_offset_t vmoffset; vm_offset_t header; - struct machine_slot *ms; - struct mach_header *mh; - struct segment_command *sc; - vm_size_t size; + vm_map_size_t vmsize; vm_prot_t prot; vm_prot_t maxprot; vm_inherit_t inherit; @@ -180,15 +183,26 @@ coredump(struct proc *p) int vbrcount=0; tir_t tir1; struct vnode * vp; + struct mach_header *mh; + struct mach_header_64 *mh64; + int is_64 = 0; + size_t mach_header_sz = sizeof(struct mach_header); + size_t segment_command_sz = sizeof(struct segment_command); if (do_coredump == 0 || /* Not dumping at all */ ( (sugid_coredump == 0) && /* Not dumping SUID/SGID binaries */ - ( (pcred->p_svuid != pcred->p_ruid) || - (pcred->p_svgid != pcred->p_rgid)))) { + ( (cred->cr_svuid != cred->cr_ruid) || + (cred->cr_svgid != cred->cr_rgid)))) { return (EFAULT); } + if (IS_64BIT_PROCESS(p)) { + is_64 = 1; + mach_header_sz = sizeof(struct mach_header_64); + segment_command_sz = sizeof(struct segment_command_64); + } + task = current_task(); map = current_map(); mapsize = get_vmmap_size(map); @@ -198,30 +212,31 @@ coredump(struct proc *p) (void) task_suspend(task); /* create name according to sysctl'able format string */ - name = proc_core_name(p->p_comm, p->p_ucred->cr_uid, p->p_pid); + name = proc_core_name(p->p_comm, kauth_cred_getuid(cred), p->p_pid); /* if name creation fails, fall back to historical behaviour... */ if (name == NULL) { - sprintf(core_name, "/cores/core.%d", p->p_pid); + sprintf(core_name, "/cores/core.%d", p->p_pid); name = core_name; } + context.vc_proc = p; + context.vc_ucred = cred; - NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, p); - if((error = vn_open(&nd, O_CREAT | FWRITE | O_NOFOLLOW, S_IRUSR )) != 0) - return (error); - vp = nd.ni_vp; - + if ((error = vnode_open(name, (O_CREAT | FWRITE | O_NOFOLLOW), S_IRUSR, 0, &vp, &context))) + return (error); + + VATTR_INIT(&va); + VATTR_WANTED(&va, va_nlink); /* Don't dump to non-regular files or files with links. */ if (vp->v_type != VREG || - VOP_GETATTR(vp, &vattr, cred, p) || vattr.va_nlink != 1) { + vnode_getattr(vp, &va, &context) || va.va_nlink != 1) { error = EFAULT; goto out; } - VATTR_NULL(&vattr); - vattr.va_size = 0; - VOP_LEASE(vp, p, cred, LEASE_WRITE); - VOP_SETATTR(vp, &vattr, cred, p); + VATTR_INIT(&va); /* better to do it here than waste more stack in vnode_setsize */ + VATTR_SET(&va, va_data_size, 0); + vnode_setattr(vp, &va, &context); p->p_acflag |= ACORE; /* @@ -238,45 +253,71 @@ coredump(struct proc *p) tstate_size += sizeof(mythread_state_flavor_t) + (flavors[i].count * sizeof(int)); - command_size = segment_count*sizeof(struct segment_command) + + command_size = segment_count * segment_command_sz + thread_count*sizeof(struct thread_command) + tstate_size*thread_count; - header_size = command_size + sizeof(struct mach_header); + header_size = command_size + mach_header_sz; - (void) kmem_alloc_wired(kernel_map, + (void) kmem_alloc(kernel_map, (vm_offset_t *)&header, (vm_size_t)header_size); /* * Set up Mach-O header. 
*/ - mh = (struct mach_header *) header; - ms = &machine_slot[cpu_number()]; - mh->magic = MH_MAGIC; - mh->cputype = ms->cpu_type; - mh->cpusubtype = ms->cpu_subtype; - mh->filetype = MH_CORE; - mh->ncmds = segment_count + thread_count; - mh->sizeofcmds = command_size; - - hoffset = sizeof(struct mach_header); /* offset into header */ - foffset = round_page_32(header_size); /* offset into file */ - vmoffset = VM_MIN_ADDRESS; /* offset into VM */ + if (is_64) { + mh64 = (struct mach_header_64 *)header; + mh64->magic = MH_MAGIC_64; + mh64->cputype = cpu_type(); + mh64->cpusubtype = cpu_subtype(); + mh64->filetype = MH_CORE; + mh64->ncmds = segment_count + thread_count; + mh64->sizeofcmds = command_size; + mh64->reserved = 0; /* 8 byte alignment */ + } else { + mh = (struct mach_header *)header; + mh->magic = MH_MAGIC; + mh->cputype = cpu_type(); + mh->cpusubtype = cpu_subtype(); + mh->filetype = MH_CORE; + mh->ncmds = segment_count + thread_count; + mh->sizeofcmds = command_size; + } + + hoffset = mach_header_sz; /* offset into header */ + foffset = round_page(header_size); /* offset into file */ + vmoffset = MACH_VM_MIN_ADDRESS; /* offset into VM */ + /* * We use to check for an error, here, now we try and get * as much as we can */ - while (segment_count > 0){ + while (segment_count > 0) { + struct segment_command *sc; + struct segment_command_64 *sc64; + /* * Get region information for next region. */ while (1) { vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64; - if((kret = vm_region_recurse_64(map, - &vmoffset, &size, &nesting_depth, - &vbr, &vbrcount)) != KERN_SUCCESS) { + if((kret = mach_vm_region_recurse(map, + &vmoffset, &vmsize, &nesting_depth, + (vm_region_recurse_info_t)&vbr, + &vbrcount)) != KERN_SUCCESS) { + break; + } + /* + * If we get a valid mapping back, but we're dumping + * a 32 bit process, and it's over the allowable + * address space of a 32 bit process, it's the same + * as if mach_vm_region_recurse() failed. + */ + if (!(is_64) && + (vmoffset + vmsize > VM_MAX_ADDRESS)) { + kret = KERN_INVALID_ADDRESS; break; } if(vbr.is_submap) { @@ -295,26 +336,41 @@ coredump(struct proc *p) /* * Fill in segment command structure. */ - sc = (struct segment_command *) (header + hoffset); - sc->cmd = LC_SEGMENT; - sc->cmdsize = sizeof(struct segment_command); - /* segment name is zerod by kmem_alloc */ - sc->segname[0] = 0; - sc->vmaddr = vmoffset; - sc->vmsize = size; - sc->fileoff = foffset; - sc->filesize = size; - sc->maxprot = maxprot; - sc->initprot = prot; - sc->nsects = 0; + if (is_64) { + sc64 = (struct segment_command_64 *)(header + hoffset); + sc64->cmd = LC_SEGMENT_64; + sc64->cmdsize = sizeof(struct segment_command_64); + /* segment name is zeroed by kmem_alloc */ + sc64->segname[0] = 0; + sc64->vmaddr = vmoffset; + sc64->vmsize = vmsize; + sc64->fileoff = foffset; + sc64->filesize = vmsize; + sc64->maxprot = maxprot; + sc64->initprot = prot; + sc64->nsects = 0; + } else { + sc = (struct segment_command *) (header + hoffset); + sc->cmd = LC_SEGMENT; + sc->cmdsize = sizeof(struct segment_command); + /* segment name is zeroed by kmem_alloc */ + sc->segname[0] = 0; + sc->vmaddr = CAST_DOWN(vm_offset_t,vmoffset); + sc->vmsize = CAST_DOWN(vm_size_t,vmsize); + sc->fileoff = CAST_DOWN(uint32_t,foffset); + sc->filesize = CAST_DOWN(uint32_t,vmsize); + sc->maxprot = maxprot; + sc->initprot = prot; + sc->nsects = 0; + } /* * Write segment out. Try as hard as possible to * get read access to the data. 
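
The write loop just below caps each vn_rdwr_64() transfer at INT_MAX to stay inside its 2 GB limit. The same chunking pattern in a self-contained user-space form (note that the source pointer and the file offset advance together):

    #include <limits.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* write len bytes from buf at file offset off, at most INT_MAX per call */
    static int
    chunked_pwrite(int fd, const char *buf, size_t len, off_t off)
    {
        while (len > 0) {
            size_t chunk = (len > INT_MAX) ? INT_MAX : len;
            ssize_t done = pwrite(fd, buf, chunk, off);

            if (done <= 0)
                return -1;
            buf += done;    /* advance source and destination in step */
            off += done;
            len -= (size_t)done;
        }
        return 0;
    }
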
*/ if ((prot & VM_PROT_READ) == 0) { - vm_protect(map, vmoffset, size, FALSE, - prot|VM_PROT_READ); + mach_vm_protect(map, vmoffset, vmsize, FALSE, + prot|VM_PROT_READ); } /* * Only actually perform write if we can read. @@ -324,16 +380,42 @@ coredump(struct proc *p) if ((maxprot & VM_PROT_READ) == VM_PROT_READ && vbr.user_tag != VM_MEMORY_IOKIT && coredumpok(map,vmoffset)) { - error = vn_rdwr(UIO_WRITE, vp, (caddr_t)vmoffset, size, foffset, - UIO_USERSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *) 0, p); + vm_map_size_t tmp_vmsize = vmsize; + off_t xfer_foffset = foffset; + + //LP64todo - works around vn_rdwr_64() 2G limit + while (tmp_vmsize > 0) { + vm_map_size_t xfer_vmsize = tmp_vmsize; + if (xfer_vmsize > INT_MAX) + xfer_vmsize = INT_MAX; + error = vn_rdwr_64(UIO_WRITE, vp, + vmoffset, xfer_vmsize, xfer_foffset, + (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), + IO_NODELOCKED|IO_UNIT, cred, (int *) 0, p); + tmp_vmsize -= xfer_vmsize; + xfer_foffset += xfer_vmsize; + } } - hoffset += sizeof(struct segment_command); - foffset += size; - vmoffset += size; + hoffset += segment_command_sz; + foffset += vmsize; + vmoffset += vmsize; segment_count--; } + /* + * If there are remaining segments which have not been written + * out because break in the loop above, then they were not counted + * because they exceed the real address space of the executable + * type: remove them from the header's count. This is OK, since + * we are allowed to have a sparse area following the segments. + */ + if (is_64) { + mh64->ncmds -= segment_count; + } else { + mh->ncmds -= segment_count; + } + tir1.header = header; tir1.hoffset = hoffset; tir1.flavors = flavors; @@ -342,15 +424,15 @@ coredump(struct proc *p) /* * Write out the Mach header at the beginning of the - * file. + * file. OK to use a 32 bit write for this. */ error = vn_rdwr(UIO_WRITE, vp, (caddr_t)header, header_size, (off_t)0, - UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *) 0, p); + UIO_SYSSPACE32, IO_NODELOCKED|IO_UNIT, cred, (int *) 0, p); kmem_free(kernel_map, header, header_size); out: - VOP_UNLOCK(vp, 0, p); - error1 = vn_close(vp, FWRITE, cred, p); + error1 = vnode_close(vp, FWRITE, &context); if (error == 0) error = error1; + return (error); } diff --git a/bsd/kern/kern_credential.c b/bsd/kern/kern_credential.c new file mode 100644 index 000000000..0a917310f --- /dev/null +++ b/bsd/kern/kern_credential.c @@ -0,0 +1,2268 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +/* + * Kernel Authorization framework: Management of process/thread credentials and identity information. 
+ */ + + +#include <sys/param.h> /* XXX trim includes */ +#include <sys/acct.h> +#include <sys/systm.h> +#include <sys/ucred.h> +#include <sys/proc_internal.h> +#include <sys/user.h> +#include <sys/timeb.h> +#include <sys/times.h> +#include <sys/malloc.h> +#include <sys/kauth.h> +#include <sys/kernel.h> + +#include <bsm/audit_kernel.h> + +#include <sys/mount.h> +#include <sys/sysproto.h> +#include <mach/message.h> +#include <mach/host_security.h> + +#include <libkern/OSAtomic.h> + +#include <kern/task.h> +#include <kern/lock.h> +#ifdef MACH_ASSERT +# undef MACH_ASSERT +#endif +#define MACH_ASSERT 1 /* XXX so bogus */ +#include <kern/assert.h> + +#define CRED_DIAGNOSTIC 1 + +# define NULLCRED_CHECK(_c) do {if (((_c) == NOCRED) || ((_c) == FSCRED)) panic("bad credential %p", _c);} while(0) + +/* + * Interface to external identity resolver. + * + * The architecture of the interface is simple; the external resolver calls in to + * get work, then calls back with completed work. It also calls us to let us know + * that it's (re)started, so that we can resubmit work if it times out. + */ + +static lck_mtx_t *kauth_resolver_mtx; +#define KAUTH_RESOLVER_LOCK() lck_mtx_lock(kauth_resolver_mtx); +#define KAUTH_RESOLVER_UNLOCK() lck_mtx_unlock(kauth_resolver_mtx); + +static volatile pid_t kauth_resolver_identity; +static int kauth_resolver_registered; +static uint32_t kauth_resolver_sequence; + +struct kauth_resolver_work { + TAILQ_ENTRY(kauth_resolver_work) kr_link; + struct kauth_identity_extlookup kr_work; + uint32_t kr_seqno; + int kr_refs; + int kr_flags; +#define KAUTH_REQUEST_UNSUBMITTED (1<<0) +#define KAUTH_REQUEST_SUBMITTED (1<<1) +#define KAUTH_REQUEST_DONE (1<<2) + int kr_result; +}; + +TAILQ_HEAD(kauth_resolver_unsubmitted_head, kauth_resolver_work) kauth_resolver_unsubmitted; +TAILQ_HEAD(kauth_resolver_submitted_head, kauth_resolver_work) kauth_resolver_submitted; +TAILQ_HEAD(kauth_resolver_done_head, kauth_resolver_work) kauth_resolver_done; + +static int kauth_resolver_submit(struct kauth_identity_extlookup *lkp); +static int kauth_resolver_complete(user_addr_t message); +static int kauth_resolver_getwork(user_addr_t message); + +#define KAUTH_CRED_PRIMES_COUNT 7 +static const int kauth_cred_primes[KAUTH_CRED_PRIMES_COUNT] = {97, 241, 397, 743, 1499, 3989, 7499}; +static int kauth_cred_primes_index = 0; +static int kauth_cred_table_size = 0; + +TAILQ_HEAD(kauth_cred_entry_head, ucred); +static struct kauth_cred_entry_head * kauth_cred_table_anchor = NULL; + +#define KAUTH_CRED_HASH_DEBUG 0 + +static int kauth_cred_add(kauth_cred_t new_cred); +static void kauth_cred_remove(kauth_cred_t cred); +static inline u_long kauth_cred_hash(const uint8_t *datap, int data_len, u_long start_key); +static u_long kauth_cred_get_hashkey(kauth_cred_t cred); +static kauth_cred_t kauth_cred_update(kauth_cred_t old_cred, kauth_cred_t new_cred, boolean_t retain_auditinfo); + +#if KAUTH_CRED_HASH_DEBUG +static int kauth_cred_count = 0; +static void kauth_cred_hash_print(void); +static void kauth_cred_print(kauth_cred_t cred); +#endif + +void +kauth_resolver_init(void) +{ + TAILQ_INIT(&kauth_resolver_unsubmitted); + TAILQ_INIT(&kauth_resolver_submitted); + TAILQ_INIT(&kauth_resolver_done); + kauth_resolver_sequence = 31337; + kauth_resolver_mtx = lck_mtx_alloc_init(kauth_lck_grp, 0/*LCK_ATTR_NULL*/); +} + +/* + * Allocate a work queue entry, submit the work and wait for completion. + * + * XXX do we want an 'interruptible' flag vs. always being interruptible? 
+ */ +static int +kauth_resolver_submit(struct kauth_identity_extlookup *lkp) +{ + struct kauth_resolver_work *workp, *killp; + struct timespec ts; + int error, shouldfree; + + /* no point actually blocking if the resolver isn't up yet */ + if (kauth_resolver_identity == 0) { + /* + * We've already waited an initial 30 seconds with no result. + * Sleep on a stack address so no one wakes us before timeout; + * we sleep a half a second in case we are a high priority + * process, so that memberd doesn't starve while we are in a + * tight loop between user and kernel, eating all the CPU. + */ + error = tsleep(&ts, PZERO | PCATCH, "kr_submit", hz/2); + if (kauth_resolver_identity == 0) { + /* + * if things haven't changed while we were asleep, + * tell the caller we couldn't get an authoritative + * answer. + */ + return(EWOULDBLOCK); + } + } + + MALLOC(workp, struct kauth_resolver_work *, sizeof(*workp), M_KAUTH, M_WAITOK); + if (workp == NULL) + return(ENOMEM); + + workp->kr_work = *lkp; + workp->kr_refs = 1; + workp->kr_flags = KAUTH_REQUEST_UNSUBMITTED; + workp->kr_result = 0; + + /* + * We insert the request onto the unsubmitted queue, the call in from the + * resolver will it to the submitted thread when appropriate. + */ + KAUTH_RESOLVER_LOCK(); + workp->kr_seqno = workp->kr_work.el_seqno = kauth_resolver_sequence++; + workp->kr_work.el_result = KAUTH_EXTLOOKUP_INPROG; + + /* XXX as an optimisation, we could check the queue for identical items and coalesce */ + TAILQ_INSERT_TAIL(&kauth_resolver_unsubmitted, workp, kr_link); + + wakeup_one((caddr_t)&kauth_resolver_unsubmitted); + for (;;) { + /* we could compute a better timeout here */ + ts.tv_sec = 30; + ts.tv_nsec = 0; + error = msleep(workp, kauth_resolver_mtx, PCATCH, "kr_submit", &ts); + /* request has been completed? */ + if ((error == 0) && (workp->kr_flags & KAUTH_REQUEST_DONE)) + break; + /* woken because the resolver has died? */ + if (kauth_resolver_identity == 0) { + error = EIO; + break; + } + /* an error? */ + if (error != 0) + break; + } + /* if the request was processed, copy the result */ + if (error == 0) + *lkp = workp->kr_work; + + /* + * If the request timed out and was never collected, the resolver is dead and + * probably not coming back anytime soon. In this case we revert to no-resolver + * behaviour, and punt all the other sleeping requests to clear the backlog. 
+ */ + if ((error == EWOULDBLOCK) && (workp->kr_flags & KAUTH_REQUEST_UNSUBMITTED)) { + KAUTH_DEBUG("RESOLVER - request timed out without being collected for processing, resolver dead"); + kauth_resolver_identity = 0; + /* kill all the other requestes that are waiting as well */ + TAILQ_FOREACH(killp, &kauth_resolver_submitted, kr_link) + wakeup(killp); + TAILQ_FOREACH(killp, &kauth_resolver_unsubmitted, kr_link) + wakeup(killp); + } + + /* drop our reference on the work item, and note whether we should free it or not */ + if (--workp->kr_refs <= 0) { + /* work out which list we have to remove it from */ + if (workp->kr_flags & KAUTH_REQUEST_DONE) { + TAILQ_REMOVE(&kauth_resolver_done, workp, kr_link); + } else if (workp->kr_flags & KAUTH_REQUEST_SUBMITTED) { + TAILQ_REMOVE(&kauth_resolver_submitted, workp, kr_link); + } else if (workp->kr_flags & KAUTH_REQUEST_UNSUBMITTED) { + TAILQ_REMOVE(&kauth_resolver_unsubmitted, workp, kr_link); + } else { + KAUTH_DEBUG("RESOLVER - completed request has no valid queue"); + } + shouldfree = 1; + } else { + /* someone else still has a reference on this request */ + shouldfree = 0; + } + /* collect request result */ + if (error == 0) + error = workp->kr_result; + KAUTH_RESOLVER_UNLOCK(); + /* + * If we dropped the last reference, free the request. + */ + if (shouldfree) + FREE(workp, M_KAUTH); + + KAUTH_DEBUG("RESOLVER - returning %d", error); + return(error); +} + +/* + * System call interface for the external identity resolver. + */ +int +identitysvc(__unused struct proc *p, struct identitysvc_args *uap, __unused register_t *retval) +{ + int opcode = uap->opcode; + user_addr_t message = uap->message; + struct kauth_resolver_work *workp; + int error; + pid_t new_id; + + /* + * New server registering itself. + */ + if (opcode == KAUTH_EXTLOOKUP_REGISTER) { + new_id = current_proc()->p_pid; + if ((error = kauth_authorize_generic(kauth_cred_get(), KAUTH_GENERIC_ISSUSER)) != 0) { + KAUTH_DEBUG("RESOLVER - pid %d refused permission to become identity resolver", new_id); + return(error); + } + KAUTH_RESOLVER_LOCK(); + if (kauth_resolver_identity != new_id) { + KAUTH_DEBUG("RESOLVER - new resolver %d taking over from old %d", new_id, kauth_resolver_identity); + /* + * We have a new server, so assume that all the old requests have been lost. + */ + while ((workp = TAILQ_LAST(&kauth_resolver_submitted, kauth_resolver_submitted_head)) != NULL) { + TAILQ_REMOVE(&kauth_resolver_submitted, workp, kr_link); + workp->kr_flags &= ~KAUTH_REQUEST_SUBMITTED; + workp->kr_flags |= KAUTH_REQUEST_UNSUBMITTED; + TAILQ_INSERT_HEAD(&kauth_resolver_unsubmitted, workp, kr_link); + } + kauth_resolver_identity = new_id; + kauth_resolver_registered = 1; + wakeup(&kauth_resolver_unsubmitted); + } + KAUTH_RESOLVER_UNLOCK(); + return(0); + } + + /* + * Beyond this point, we must be the resolver process. + */ + if (current_proc()->p_pid != kauth_resolver_identity) { + KAUTH_DEBUG("RESOLVER - call from bogus resolver %d\n", current_proc()->p_pid); + return(EPERM); + } + + /* + * Got a result returning? + */ + if (opcode & KAUTH_EXTLOOKUP_RESULT) { + if ((error = kauth_resolver_complete(message)) != 0) + return(error); + } + + /* + * Caller wants to take more work? + */ + if (opcode & KAUTH_EXTLOOKUP_WORKER) { + if ((error = kauth_resolver_getwork(message)) != 0) + return(error); + } + + return(0); +} + +/* + * Get work for a caller. 
+ */ +static int +kauth_resolver_getwork(user_addr_t message) +{ + struct kauth_resolver_work *workp; + int error; + + KAUTH_RESOLVER_LOCK(); + error = 0; + while ((workp = TAILQ_FIRST(&kauth_resolver_unsubmitted)) == NULL) { + error = msleep(&kauth_resolver_unsubmitted, kauth_resolver_mtx, PCATCH, "GRGetWork", 0); + if (error != 0) + break; + } + if (workp != NULL) { + if ((error = copyout(&workp->kr_work, message, sizeof(workp->kr_work))) != 0) { + KAUTH_DEBUG("RESOLVER - error submitting work to resolve"); + goto out; + } + TAILQ_REMOVE(&kauth_resolver_unsubmitted, workp, kr_link); + workp->kr_flags &= ~KAUTH_REQUEST_UNSUBMITTED; + workp->kr_flags |= KAUTH_REQUEST_SUBMITTED; + TAILQ_INSERT_TAIL(&kauth_resolver_submitted, workp, kr_link); + } + +out: + KAUTH_RESOLVER_UNLOCK(); + return(error); +} + +/* + * Return a result from userspace. + */ +static int +kauth_resolver_complete(user_addr_t message) +{ + struct kauth_identity_extlookup extl; + struct kauth_resolver_work *workp; + int error, result; + + if ((error = copyin(message, &extl, sizeof(extl))) != 0) { + KAUTH_DEBUG("RESOLVER - error getting completed work\n"); + return(error); + } + + KAUTH_RESOLVER_LOCK(); + + error = 0; + result = 0; + switch (extl.el_result) { + case KAUTH_EXTLOOKUP_INPROG: + { + static int once = 0; + + /* XXX this should go away once memberd is updated */ + if (!once) { + printf("kauth_resolver: memberd is not setting valid result codes (assuming always successful)\n"); + once = 1; + } + } + /* FALLTHROUGH */ + case KAUTH_EXTLOOKUP_SUCCESS: + break; + + case KAUTH_EXTLOOKUP_FATAL: + /* fatal error means the resolver is dead */ + KAUTH_DEBUG("RESOLVER - resolver %d died, waiting for a new one", kauth_resolver_identity); + kauth_resolver_identity = 0; + /* XXX should we terminate all outstanding requests? */ + error = EIO; + break; + case KAUTH_EXTLOOKUP_BADRQ: + KAUTH_DEBUG("RESOLVER - resolver reported invalid request %d", extl.el_seqno); + result = EINVAL; + break; + case KAUTH_EXTLOOKUP_FAILURE: + KAUTH_DEBUG("RESOLVER - resolver reported transient failure for request %d", extl.el_seqno); + result = EIO; + break; + default: + KAUTH_DEBUG("RESOLVER - resolver returned unexpected status %d", extl.el_result); + result = EIO; + break; + } + + /* + * In the case of a fatal error, we assume that the resolver will restart + * quickly and re-collect all of the outstanding requests. Thus, we don't + * complete the request which returned the fatal error status. + */ + if (extl.el_result != KAUTH_EXTLOOKUP_FATAL) { + /* scan our list for this request */ + TAILQ_FOREACH(workp, &kauth_resolver_submitted, kr_link) { + /* found it? */ + if (workp->kr_seqno == extl.el_seqno) { + /* copy result */ + workp->kr_work = extl; + /* move onto completed list and wake up requester(s) */ + TAILQ_REMOVE(&kauth_resolver_submitted, workp, kr_link); + workp->kr_flags &= ~KAUTH_REQUEST_SUBMITTED; + workp->kr_flags |= KAUTH_REQUEST_DONE; + workp->kr_result = result; + TAILQ_INSERT_TAIL(&kauth_resolver_done, workp, kr_link); + wakeup(workp); + break; + } + } + } + /* + * Note that it's OK for us not to find anything; if the request has + * timed out the work record will be gone. + */ + KAUTH_RESOLVER_UNLOCK(); + + return(error); +} + + +/* + * Identity cache. 
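
Taken together, identitysvc(), kauth_resolver_getwork() and kauth_resolver_complete() imply a user-space resolver daemon shaped roughly like the sketch below. This is a hedged sketch: the syscall is issued directly because no libc wrapper is assumed, and the actual lookup logic is elided.

    #include <unistd.h>
    #include <sys/syscall.h>
    #include <sys/kauth.h>

    static void
    resolver_loop(void)
    {
        struct kauth_identity_extlookup work;

        /* claim the resolver role (KAUTH_EXTLOOKUP_REGISTER path above) */
        if (syscall(SYS_identitysvc, KAUTH_EXTLOOKUP_REGISTER, NULL) != 0)
            return;

        /* block in kauth_resolver_getwork() for the first request */
        if (syscall(SYS_identitysvc, KAUTH_EXTLOOKUP_WORKER, &work) != 0)
            return;

        for (;;) {
            /* ... fill in the translations asked for in work.el_flags ... */
            work.el_result = KAUTH_EXTLOOKUP_SUCCESS;

            /* return the answer and collect the next request in one call */
            if (syscall(SYS_identitysvc,
                KAUTH_EXTLOOKUP_RESULT | KAUTH_EXTLOOKUP_WORKER, &work) != 0)
                break;
        }
    }
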
+ */ + +struct kauth_identity { + TAILQ_ENTRY(kauth_identity) ki_link; + int ki_valid; +#define KI_VALID_UID (1<<0) /* UID and GID are mutually exclusive */ +#define KI_VALID_GID (1<<1) +#define KI_VALID_GUID (1<<2) +#define KI_VALID_NTSID (1<<3) + uid_t ki_uid; + gid_t ki_gid; + guid_t ki_guid; + ntsid_t ki_ntsid; + /* + * Expiry times are the earliest time at which we will disregard the cached state and go to + * userland. Before then if the valid bit is set, we will return the cached value. If it's + * not set, we will not go to userland to resolve, just assume that there is no answer + * available. + */ + time_t ki_guid_expiry; + time_t ki_ntsid_expiry; +}; + +static TAILQ_HEAD(kauth_identity_head, kauth_identity) kauth_identities; +#define KAUTH_IDENTITY_CACHEMAX 100 /* XXX sizing? */ +static int kauth_identity_count; + +static lck_mtx_t *kauth_identity_mtx; +#define KAUTH_IDENTITY_LOCK() lck_mtx_lock(kauth_identity_mtx); +#define KAUTH_IDENTITY_UNLOCK() lck_mtx_unlock(kauth_identity_mtx); + + +static struct kauth_identity *kauth_identity_alloc(uid_t uid, gid_t gid, guid_t *guidp, time_t guid_expiry, + ntsid_t *ntsidp, time_t ntsid_expiry); +static void kauth_identity_register(struct kauth_identity *kip); +static void kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_identity *kip); +static void kauth_identity_lru(struct kauth_identity *kip); +static int kauth_identity_guid_expired(struct kauth_identity *kip); +static int kauth_identity_ntsid_expired(struct kauth_identity *kip); +static int kauth_identity_find_uid(uid_t uid, struct kauth_identity *kir); +static int kauth_identity_find_gid(gid_t gid, struct kauth_identity *kir); +static int kauth_identity_find_guid(guid_t *guidp, struct kauth_identity *kir); +static int kauth_identity_find_ntsid(ntsid_t *ntsid, struct kauth_identity *kir); + +void +kauth_identity_init(void) +{ + TAILQ_INIT(&kauth_identities); + kauth_identity_mtx = lck_mtx_alloc_init(kauth_lck_grp, 0/*LCK_ATTR_NULL*/); +} + +static int +kauth_identity_resolve(__unused struct kauth_identity_extlookup *el) +{ + return(kauth_resolver_submit(el)); +} + +static struct kauth_identity * +kauth_identity_alloc(uid_t uid, gid_t gid, guid_t *guidp, time_t guid_expiry, ntsid_t *ntsidp, time_t ntsid_expiry) +{ + struct kauth_identity *kip; + + /* get and fill in a new identity */ + MALLOC(kip, struct kauth_identity *, sizeof(*kip), M_KAUTH, M_WAITOK | M_ZERO); + if (kip != NULL) { + if (gid != KAUTH_GID_NONE) { + kip->ki_gid = gid; + kip->ki_valid = KI_VALID_GID; + } + if (uid != KAUTH_UID_NONE) { + if (kip->ki_valid & KI_VALID_GID) + panic("can't allocate kauth identity with both uid and gid"); + kip->ki_uid = uid; + kip->ki_valid = KI_VALID_UID; + } + if (guidp != NULL) { + kip->ki_guid = *guidp; + kip->ki_valid |= KI_VALID_GUID; + } + kip->ki_guid_expiry = guid_expiry; + if (ntsidp != NULL) { + kip->ki_ntsid = *ntsidp; + kip->ki_valid |= KI_VALID_NTSID; + } + kip->ki_ntsid_expiry = ntsid_expiry; + } + return(kip); +} + +/* + * Register an association between identity tokens. + */ +static void +kauth_identity_register(struct kauth_identity *kip) +{ + struct kauth_identity *ip; + + /* + * We search the cache for the UID listed in the incoming association. If we + * already have an entry, the new information is merged. 
+ */ + ip = NULL; + KAUTH_IDENTITY_LOCK(); + if (kip->ki_valid & KI_VALID_UID) { + if (kip->ki_valid & KI_VALID_GID) + panic("kauth_identity: can't insert record with both UID and GID as key"); + TAILQ_FOREACH(ip, &kauth_identities, ki_link) + if ((ip->ki_valid & KI_VALID_UID) && (ip->ki_uid == kip->ki_uid)) + break; + } else if (kip->ki_valid & KI_VALID_GID) { + TAILQ_FOREACH(ip, &kauth_identities, ki_link) + if ((ip->ki_valid & KI_VALID_GID) && (ip->ki_gid == kip->ki_gid)) + break; + } else { + panic("kauth_identity: can't insert record without UID or GID as key"); + } + + if (ip != NULL) { + /* we already have an entry, merge/overwrite */ + if (kip->ki_valid & KI_VALID_GUID) { + ip->ki_guid = kip->ki_guid; + ip->ki_valid |= KI_VALID_GUID; + } + ip->ki_guid_expiry = kip->ki_guid_expiry; + if (kip->ki_valid & KI_VALID_NTSID) { + ip->ki_ntsid = kip->ki_ntsid; + ip->ki_valid |= KI_VALID_NTSID; + } + ip->ki_ntsid_expiry = kip->ki_ntsid_expiry; + /* and discard the incoming identity */ + FREE(kip, M_KAUTH); + ip = NULL; + } else { + /* don't have any information on this identity, so just add it */ + TAILQ_INSERT_HEAD(&kauth_identities, kip, ki_link); + if (++kauth_identity_count > KAUTH_IDENTITY_CACHEMAX) { + ip = TAILQ_LAST(&kauth_identities, kauth_identity_head); + TAILQ_REMOVE(&kauth_identities, ip, ki_link); + kauth_identity_count--; + } + } + KAUTH_IDENTITY_UNLOCK(); + /* have to drop lock before freeing expired entry */ + if (ip != NULL) + FREE(ip, M_KAUTH); +} + +/* + * Given a lookup result, add any associations that we don't + * currently have. + */ +static void +kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_identity *rkip) +{ + struct timeval tv; + struct kauth_identity *kip; + + microuptime(&tv); + + /* user identity? */ + if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_UID) { + KAUTH_IDENTITY_LOCK(); + TAILQ_FOREACH(kip, &kauth_identities, ki_link) { + /* matching record */ + if ((kip->ki_valid & KI_VALID_UID) && (kip->ki_uid == elp->el_uid)) { + if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_UGUID) { + kip->ki_guid = elp->el_uguid; + kip->ki_valid |= KI_VALID_GUID; + } + kip->ki_guid_expiry = tv.tv_sec + elp->el_uguid_valid; + if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_USID) { + kip->ki_ntsid = elp->el_usid; + kip->ki_valid |= KI_VALID_NTSID; + } + kip->ki_ntsid_expiry = tv.tv_sec + elp->el_usid_valid; + kauth_identity_lru(kip); + if (rkip != NULL) + *rkip = *kip; + KAUTH_DEBUG("CACHE - refreshed %d is " K_UUID_FMT, kip->ki_uid, K_UUID_ARG(kip->ki_guid)); + break; + } + } + KAUTH_IDENTITY_UNLOCK(); + /* not found in cache, add new record */ + if (kip == NULL) { + kip = kauth_identity_alloc(elp->el_uid, KAUTH_GID_NONE, + (elp->el_flags & KAUTH_EXTLOOKUP_VALID_UGUID) ? &elp->el_uguid : NULL, + tv.tv_sec + elp->el_uguid_valid, + (elp->el_flags & KAUTH_EXTLOOKUP_VALID_USID) ? &elp->el_usid : NULL, + tv.tv_sec + elp->el_usid_valid); + if (kip != NULL) { + if (rkip != NULL) + *rkip = *kip; + KAUTH_DEBUG("CACHE - learned %d is " K_UUID_FMT, kip->ki_uid, K_UUID_ARG(kip->ki_guid)); + kauth_identity_register(kip); + } + } + } + + /* group identity? 
*/
+    if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GID) {
+        KAUTH_IDENTITY_LOCK();
+        TAILQ_FOREACH(kip, &kauth_identities, ki_link) {
+            /* matching record */
+            if ((kip->ki_valid & KI_VALID_GID) && (kip->ki_gid == elp->el_gid)) {
+                if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GGUID) {
+                    kip->ki_guid = elp->el_gguid;
+                    kip->ki_valid |= KI_VALID_GUID;
+                }
+                kip->ki_guid_expiry = tv.tv_sec + elp->el_gguid_valid;
+                if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GSID) {
+                    kip->ki_ntsid = elp->el_gsid;
+                    kip->ki_valid |= KI_VALID_NTSID;
+                }
+                kip->ki_ntsid_expiry = tv.tv_sec + elp->el_gsid_valid;
+                kauth_identity_lru(kip);
+                if (rkip != NULL)
+                    *rkip = *kip;
+                KAUTH_DEBUG("CACHE - refreshed %d is " K_UUID_FMT, kip->ki_uid, K_UUID_ARG(kip->ki_guid));
+                break;
+            }
+        }
+        KAUTH_IDENTITY_UNLOCK();
+        /* not found in cache, add new record */
+        if (kip == NULL) {
+            kip = kauth_identity_alloc(KAUTH_UID_NONE, elp->el_gid,
+                (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GGUID) ? &elp->el_gguid : NULL,
+                tv.tv_sec + elp->el_gguid_valid,
+                (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GSID) ? &elp->el_gsid : NULL,
+                tv.tv_sec + elp->el_gsid_valid);
+            if (kip != NULL) {
+                if (rkip != NULL)
+                    *rkip = *kip;
+                KAUTH_DEBUG("CACHE - learned %d is " K_UUID_FMT, kip->ki_uid, K_UUID_ARG(kip->ki_guid));
+                kauth_identity_register(kip);
+            }
+        }
+    }
+
+}
+
+/*
+ * Promote the entry to the head of the LRU, assumes the cache is locked.
+ *
+ * This is called even if the entry has expired; typically an expired entry
+ * that's been looked up is about to be revalidated, and having it closer to
+ * the head of the LRU means finding it quickly again when the revalidation
+ * comes through.
+ */
+static void
+kauth_identity_lru(struct kauth_identity *kip)
+{
+    if (kip != TAILQ_FIRST(&kauth_identities)) {
+        TAILQ_REMOVE(&kauth_identities, kip, ki_link);
+        TAILQ_INSERT_HEAD(&kauth_identities, kip, ki_link);
+    }
+}
+
+/*
+ * Handle lazy expiration of translations.
+ */
+static int
+kauth_identity_guid_expired(struct kauth_identity *kip)
+{
+    struct timeval tv;
+
+    microuptime(&tv);
+    KAUTH_DEBUG("CACHE - GUID expires @ %d now %d", kip->ki_guid_expiry, tv.tv_sec);
+    return((kip->ki_guid_expiry <= tv.tv_sec) ? 1 : 0);
+}
+
+static int
+kauth_identity_ntsid_expired(struct kauth_identity *kip)
+{
+    struct timeval tv;
+
+    microuptime(&tv);
+    KAUTH_DEBUG("CACHE - NTSID expires @ %d now %d", kip->ki_ntsid_expiry, tv.tv_sec);
+    return((kip->ki_ntsid_expiry <= tv.tv_sec) ? 1 : 0);
+}
+
+/*
+ * Search for an entry by UID.  Returns a copy of the entry, ENOENT if no valid
+ * association exists for the UID.
+ */
+static int
+kauth_identity_find_uid(uid_t uid, struct kauth_identity *kir)
+{
+    struct kauth_identity *kip;
+
+    KAUTH_IDENTITY_LOCK();
+    TAILQ_FOREACH(kip, &kauth_identities, ki_link) {
+        if ((kip->ki_valid & KI_VALID_UID) && (uid == kip->ki_uid)) {
+            kauth_identity_lru(kip);
+            *kir = *kip;
+            break;
+        }
+    }
+    KAUTH_IDENTITY_UNLOCK();
+    return((kip == NULL) ? ENOENT : 0);
+}
+
+
+/*
+ * Search for an entry by GID.  Returns a copy of the entry, ENOENT if no valid
+ * association exists for the GID.
+ */
+static int
+kauth_identity_find_gid(gid_t gid, struct kauth_identity *kir)
+{
+    struct kauth_identity *kip;
+
+    KAUTH_IDENTITY_LOCK();
+    TAILQ_FOREACH(kip, &kauth_identities, ki_link) {
+        if ((kip->ki_valid & KI_VALID_GID) && (gid == kip->ki_gid)) {
+            kauth_identity_lru(kip);
+            *kir = *kip;
+            break;
+        }
+    }
+    KAUTH_IDENTITY_UNLOCK();
+    return((kip == NULL) ? ENOENT : 0);
+}
+
+
+/*
+ * Search for an entry by GUID.
Returns a copy of the entry, ENOENT if no valid + * association exists for the GUID. Note that the association may be expired, + * in which case the caller may elect to call out to userland to revalidate. + */ +static int +kauth_identity_find_guid(guid_t *guidp, struct kauth_identity *kir) +{ + struct kauth_identity *kip; + + KAUTH_IDENTITY_LOCK(); + TAILQ_FOREACH(kip, &kauth_identities, ki_link) { + if ((kip->ki_valid & KI_VALID_GUID) && (kauth_guid_equal(guidp, &kip->ki_guid))) { + kauth_identity_lru(kip); + *kir = *kip; + break; + } + } + KAUTH_IDENTITY_UNLOCK(); + return((kip == NULL) ? ENOENT : 0); +} + +/* + * Search for an entry by NT Security ID. Returns a copy of the entry, ENOENT if no valid + * association exists for the SID. Note that the association may be expired, + * in which case the caller may elect to call out to userland to revalidate. + */ +static int +kauth_identity_find_ntsid(ntsid_t *ntsid, struct kauth_identity *kir) +{ + struct kauth_identity *kip; + + KAUTH_IDENTITY_LOCK(); + TAILQ_FOREACH(kip, &kauth_identities, ki_link) { + if ((kip->ki_valid & KI_VALID_NTSID) && (kauth_ntsid_equal(ntsid, &kip->ki_ntsid))) { + kauth_identity_lru(kip); + *kir = *kip; + break; + } + } + KAUTH_IDENTITY_UNLOCK(); + return((kip == NULL) ? ENOENT : 0); +} + +/* + * GUID handling. + */ +guid_t kauth_null_guid; + +int +kauth_guid_equal(guid_t *guid1, guid_t *guid2) +{ + return(!bcmp(guid1, guid2, sizeof(*guid1))); +} + +/* + * Look for well-known GUIDs. + */ +int +kauth_wellknown_guid(guid_t *guid) +{ + static char fingerprint[] = {0xab, 0xcd, 0xef, 0xab, 0xcd, 0xef, 0xab, 0xcd, 0xef, 0xab, 0xcd, 0xef}; + int code; + /* + * All WKGs begin with the same 12 bytes. + */ + if (!bcmp((void *)guid, fingerprint, 12)) { + /* + * The final 4 bytes are our code. + */ + code = *(u_int32_t *)&guid->g_guid[12]; + switch(code) { + case 0x0000000c: + return(KAUTH_WKG_EVERYBODY); + case 0xfffffffe: + return(KAUTH_WKG_NOBODY); + case 0x0000000a: + return(KAUTH_WKG_OWNER); + case 0x00000010: + return(KAUTH_WKG_GROUP); + } + } + return(KAUTH_WKG_NOT); +} + + +/* + * NT Security Identifier handling. + */ +int +kauth_ntsid_equal(ntsid_t *sid1, ntsid_t *sid2) +{ + /* check sizes for equality, also sanity-check size while we're at it */ + if ((KAUTH_NTSID_SIZE(sid1) == KAUTH_NTSID_SIZE(sid2)) && + (KAUTH_NTSID_SIZE(sid1) <= sizeof(*sid1)) && + !bcmp(sid1, sid2, KAUTH_NTSID_SIZE(sid1))) + return(1); + return(0); +} + +/* + * Identity KPI + * + * We support four tokens representing identity: + * - Credential reference + * - UID + * - GUID + * - NT security identifier + * + * Of these, the UID is the ubiquitous identifier; cross-referencing should + * be done using it. + */ + +static int kauth_cred_cache_lookup(int from, int to, void *src, void *dst); + +/* + * Fetch UID from credential. + */ +uid_t +kauth_cred_getuid(kauth_cred_t cred) +{ + NULLCRED_CHECK(cred); + return(cred->cr_uid); +} + +/* + * Fetch GID from credential. + */ +uid_t +kauth_cred_getgid(kauth_cred_t cred) +{ + NULLCRED_CHECK(cred); + return(cred->cr_gid); +} + +/* + * Fetch UID from GUID. + */ +int +kauth_cred_guid2uid(guid_t *guidp, uid_t *uidp) +{ + return(kauth_cred_cache_lookup(KI_VALID_GUID, KI_VALID_UID, guidp, uidp)); +} + +/* + * Fetch GID from GUID. + */ +int +kauth_cred_guid2gid(guid_t *guidp, gid_t *gidp) +{ + return(kauth_cred_cache_lookup(KI_VALID_GUID, KI_VALID_GID, guidp, gidp)); +} + +/* + * Fetch UID from NT SID. 
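
As kauth_wellknown_guid() above decodes it, a well-known GUID is the 12-byte fingerprint followed by a 32-bit code in native byte order; a kernel-side sketch building the "everybody" GUID by hand:

    #include <sys/types.h>
    #include <string.h>
    #include <sys/kauth.h>

    static guid_t
    make_everybody_guid(void)
    {
        static const unsigned char fingerprint[12] =
            {0xab, 0xcd, 0xef, 0xab, 0xcd, 0xef, 0xab, 0xcd, 0xef, 0xab, 0xcd, 0xef};
        guid_t g;

        memcpy(g.g_guid, fingerprint, sizeof(fingerprint));
        /* trailing code, read natively by kauth_wellknown_guid() above */
        *(u_int32_t *)&g.g_guid[12] = 0x0000000c;   /* KAUTH_WKG_EVERYBODY */
        return g;
    }
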
+ */ +int +kauth_cred_ntsid2uid(ntsid_t *sidp, uid_t *uidp) +{ + return(kauth_cred_cache_lookup(KI_VALID_NTSID, KI_VALID_UID, sidp, uidp)); +} + +/* + * Fetch GID from NT SID. + */ +int +kauth_cred_ntsid2gid(ntsid_t *sidp, gid_t *gidp) +{ + return(kauth_cred_cache_lookup(KI_VALID_NTSID, KI_VALID_GID, sidp, gidp)); +} + +/* + * Fetch GUID from NT SID. + */ +int +kauth_cred_ntsid2guid(ntsid_t *sidp, guid_t *guidp) +{ + return(kauth_cred_cache_lookup(KI_VALID_NTSID, KI_VALID_GUID, sidp, guidp)); +} + +/* + * Fetch GUID from UID. + */ +int +kauth_cred_uid2guid(uid_t uid, guid_t *guidp) +{ + return(kauth_cred_cache_lookup(KI_VALID_UID, KI_VALID_GUID, &uid, guidp)); +} + +/* + * Fetch user GUID from credential. + */ +int +kauth_cred_getguid(kauth_cred_t cred, guid_t *guidp) +{ + NULLCRED_CHECK(cred); + return(kauth_cred_uid2guid(kauth_cred_getuid(cred), guidp)); +} + +/* + * Fetch GUID from GID. + */ +int +kauth_cred_gid2guid(gid_t gid, guid_t *guidp) +{ + return(kauth_cred_cache_lookup(KI_VALID_GID, KI_VALID_GUID, &gid, guidp)); +} + +/* + * Fetch NT SID from UID. + */ +int +kauth_cred_uid2ntsid(uid_t uid, ntsid_t *sidp) +{ + return(kauth_cred_cache_lookup(KI_VALID_UID, KI_VALID_NTSID, &uid, sidp)); +} + +/* + * Fetch NT SID from credential. + */ +int +kauth_cred_getntsid(kauth_cred_t cred, ntsid_t *sidp) +{ + NULLCRED_CHECK(cred); + return(kauth_cred_uid2ntsid(kauth_cred_getuid(cred), sidp)); +} + +/* + * Fetch NT SID from GID. + */ +int +kauth_cred_gid2ntsid(gid_t gid, ntsid_t *sidp) +{ + return(kauth_cred_cache_lookup(KI_VALID_GID, KI_VALID_NTSID, &gid, sidp)); +} + +/* + * Fetch NT SID from GUID. + */ +int +kauth_cred_guid2ntsid(guid_t *guidp, ntsid_t *sidp) +{ + return(kauth_cred_cache_lookup(KI_VALID_GUID, KI_VALID_NTSID, guidp, sidp)); +} + + + +/* + * Lookup a translation in the cache. + */ +static int +kauth_cred_cache_lookup(int from, int to, void *src, void *dst) +{ + struct kauth_identity ki; + struct kauth_identity_extlookup el; + int error; + int (* expired)(struct kauth_identity *kip); + + KAUTH_DEBUG("CACHE - translate %d to %d", from, to); + + /* + * Look for an existing cache entry for this association. + * If the entry has not expired, return the cached information. + */ + ki.ki_valid = 0; + switch(from) { + case KI_VALID_UID: + error = kauth_identity_find_uid(*(uid_t *)src, &ki); + break; + case KI_VALID_GID: + error = kauth_identity_find_gid(*(gid_t *)src, &ki); + break; + case KI_VALID_GUID: + error = kauth_identity_find_guid((guid_t *)src, &ki); + break; + case KI_VALID_NTSID: + error = kauth_identity_find_ntsid((ntsid_t *)src, &ki); + break; + default: + return(EINVAL); + } + /* lookup failure or error */ + if (error != 0) { + /* any other error is fatal */ + if (error != ENOENT) { + KAUTH_DEBUG("CACHE - cache search error %d", error); + return(error); + } + } else { + /* do we have a translation? */ + if (ki.ki_valid & to) { + /* found a valid cached entry, check expiry */ + switch(to) { + case KI_VALID_GUID: + expired = kauth_identity_guid_expired; + break; + case KI_VALID_NTSID: + expired = kauth_identity_ntsid_expired; + break; + default: + switch(from) { + case KI_VALID_GUID: + expired = kauth_identity_guid_expired; + break; + case KI_VALID_NTSID: + expired = kauth_identity_ntsid_expired; + break; + default: + expired = NULL; + } + } + KAUTH_DEBUG("CACHE - found matching entry with valid %d", ki.ki_valid); + /* + * If no expiry function, or not expired, we have found + * a hit. 
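+			 *
+			 * (Editorial note) "Expired" is the same lazy check the
+			 * helpers above perform against the monotonic clock,
+			 * roughly (entry_expiry is a hypothetical field name):
+			 *
+			 *	microuptime(&tv);
+			 *	stale = (entry_expiry <= tv.tv_sec);
+			 *
+			 * so a stale entry is only noticed when next looked up.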
+			 */
+			if (!expired) {
+				KAUTH_DEBUG("CACHE - no expiry function");
+				goto found;
+			}
+			if (!expired(&ki)) {
+				KAUTH_DEBUG("CACHE - entry valid, unexpired");
+				goto found;
+			}
+			/*
+			 * We leave ki_valid set here; it contains a translation but the TTL has
+			 * expired.  If we can't get a result from the resolver, we will
+			 * use it as a better-than-nothing alternative.
+			 */
+			KAUTH_DEBUG("CACHE - expired entry found");
+		}
+	}
+
+	/*
+	 * Call the resolver.  We ask for as much data as we can get.
+	 */
+	switch(from) {
+	case KI_VALID_UID:
+		el.el_flags = KAUTH_EXTLOOKUP_VALID_UID;
+		el.el_uid = *(uid_t *)src;
+		break;
+	case KI_VALID_GID:
+		el.el_flags = KAUTH_EXTLOOKUP_VALID_GID;
+		el.el_gid = *(gid_t *)src;
+		break;
+	case KI_VALID_GUID:
+		el.el_flags = KAUTH_EXTLOOKUP_VALID_UGUID | KAUTH_EXTLOOKUP_VALID_GGUID;
+		el.el_uguid = *(guid_t *)src;
+		el.el_gguid = *(guid_t *)src;
+		break;
+	case KI_VALID_NTSID:
+		el.el_flags = KAUTH_EXTLOOKUP_VALID_USID | KAUTH_EXTLOOKUP_VALID_GSID;
+		el.el_usid = *(ntsid_t *)src;
+		el.el_gsid = *(ntsid_t *)src;
+		break;
+	default:
+		return(EINVAL);
+	}
+	/*
+	 * Here we ask for everything all at once, to avoid having to work
+	 * out what we really want now, or might want soon.
+	 *
+	 * Asking for SID translations when we don't know we need them right
+	 * now is going to cause excess work to be done if we're connected
+	 * to a network that thinks it can translate them.  This list needs
+	 * to get smaller/smarter.
+	 */
+	el.el_flags |= KAUTH_EXTLOOKUP_WANT_UID | KAUTH_EXTLOOKUP_WANT_GID |
+	    KAUTH_EXTLOOKUP_WANT_UGUID | KAUTH_EXTLOOKUP_WANT_GGUID |
+	    KAUTH_EXTLOOKUP_WANT_USID | KAUTH_EXTLOOKUP_WANT_GSID;
+	KAUTH_DEBUG("CACHE - calling resolver for %x", el.el_flags);
+	error = kauth_identity_resolve(&el);
+	KAUTH_DEBUG("CACHE - resolver returned %d", error);
+	/* was the lookup successful? */
+	if (error == 0) {
+		/*
+		 * Save the results from the lookup - the response may carry
+		 * other useful information, even if we didn't get a GUID.
+		 */
+		kauth_identity_updatecache(&el, &ki);
+	}
+	/*
+	 * Check to see if we have a valid result.
+	 */
+	if (!error && !(ki.ki_valid & to))
+		error = ENOENT;
+	if (error)
+		return(error);
+found:
+	switch(to) {
+	case KI_VALID_UID:
+		*(uid_t *)dst = ki.ki_uid;
+		break;
+	case KI_VALID_GID:
+		*(gid_t *)dst = ki.ki_gid;
+		break;
+	case KI_VALID_GUID:
+		*(guid_t *)dst = ki.ki_guid;
+		break;
+	case KI_VALID_NTSID:
+		*(ntsid_t *)dst = ki.ki_ntsid;
+		break;
+	default:
+		return(EINVAL);
+	}
+	KAUTH_DEBUG("CACHE - returned successfully");
+	return(0);
+}
+
+
+/*
+ * Group membership cache.
+ *
+ * XXX the linked-list implementation here needs to be optimized.
+ */
+
+struct kauth_group_membership {
+	TAILQ_ENTRY(kauth_group_membership) gm_link;
+	uid_t	gm_uid;		/* the identity whose membership we're recording */
+	gid_t	gm_gid;		/* group of which they are a member */
+	time_t	gm_expiry;	/* TTL for the membership */
+	int	gm_flags;
+#define KAUTH_GROUP_ISMEMBER	(1<<0)
+};
+
+TAILQ_HEAD(kauth_groups_head, kauth_group_membership) kauth_groups;
+#define KAUTH_GROUPS_CACHEMAX		100	/* XXX sizing? 
*/ +static int kauth_groups_count; + +static lck_mtx_t *kauth_groups_mtx; +#define KAUTH_GROUPS_LOCK() lck_mtx_lock(kauth_groups_mtx); +#define KAUTH_GROUPS_UNLOCK() lck_mtx_unlock(kauth_groups_mtx); + +static int kauth_groups_expired(struct kauth_group_membership *gm); +static void kauth_groups_lru(struct kauth_group_membership *gm); +static void kauth_groups_updatecache(struct kauth_identity_extlookup *el); + +void +kauth_groups_init(void) +{ + TAILQ_INIT(&kauth_groups); + kauth_groups_mtx = lck_mtx_alloc_init(kauth_lck_grp, 0/*LCK_ATTR_NULL*/); +} + +static int +kauth_groups_expired(struct kauth_group_membership *gm) +{ + struct timeval tv; + + microuptime(&tv); + return((gm->gm_expiry <= tv.tv_sec) ? 1 : 0); +} + +static void +kauth_groups_lru(struct kauth_group_membership *gm) +{ + if (gm != TAILQ_FIRST(&kauth_groups)) { + TAILQ_REMOVE(&kauth_groups, gm, gm_link); + TAILQ_INSERT_HEAD(&kauth_groups, gm, gm_link); + } +} + +static void +kauth_groups_updatecache(struct kauth_identity_extlookup *el) +{ + struct kauth_group_membership *gm; + struct timeval tv; + + /* need a valid response if we are to cache anything */ + if ((el->el_flags & + (KAUTH_EXTLOOKUP_VALID_UID | KAUTH_EXTLOOKUP_VALID_GID | KAUTH_EXTLOOKUP_VALID_MEMBERSHIP)) != + (KAUTH_EXTLOOKUP_VALID_UID | KAUTH_EXTLOOKUP_VALID_GID | KAUTH_EXTLOOKUP_VALID_MEMBERSHIP)) + return; + + microuptime(&tv); + + /* search for an existing record for this association before inserting */ + KAUTH_GROUPS_LOCK(); + TAILQ_FOREACH(gm, &kauth_groups, gm_link) { + if ((el->el_uid == gm->gm_uid) && + (el->el_gid == gm->gm_gid)) { + if (el->el_flags & KAUTH_EXTLOOKUP_ISMEMBER) { + gm->gm_flags |= KAUTH_GROUP_ISMEMBER; + } else { + gm->gm_flags &= ~KAUTH_GROUP_ISMEMBER; + } + gm->gm_expiry = el->el_member_valid + tv.tv_sec; + kauth_groups_lru(gm); + break; + } + } + KAUTH_GROUPS_UNLOCK(); + + /* if we found an entry to update, stop here */ + if (gm != NULL) + return; + + /* allocate a new record */ + MALLOC(gm, struct kauth_group_membership *, sizeof(*gm), M_KAUTH, M_WAITOK); + if (gm != NULL) { + gm->gm_uid = el->el_uid; + gm->gm_gid = el->el_gid; + if (el->el_flags & KAUTH_EXTLOOKUP_ISMEMBER) { + gm->gm_flags |= KAUTH_GROUP_ISMEMBER; + } else { + gm->gm_flags &= ~KAUTH_GROUP_ISMEMBER; + } + gm->gm_expiry = el->el_member_valid + tv.tv_sec; + } + + /* + * Insert the new entry. Note that it's possible to race ourselves here + * and end up with duplicate entries in the list. Wasteful, but harmless + * since the first into the list will never be looked up, and thus will + * eventually just fall off the end. + */ + KAUTH_GROUPS_LOCK(); + TAILQ_INSERT_HEAD(&kauth_groups, gm, gm_link); + if (kauth_groups_count++ > KAUTH_GROUPS_CACHEMAX) { + gm = TAILQ_LAST(&kauth_groups, kauth_groups_head); + TAILQ_REMOVE(&kauth_groups, gm, gm_link); + kauth_groups_count--; + } else { + gm = NULL; + } + KAUTH_GROUPS_UNLOCK(); + + /* free expired cache entry */ + if (gm != NULL) + FREE(gm, M_KAUTH); +} + +/* + * Group membership KPI + */ +/* + * This function guarantees not to modify resultp when returning an error. + */ +int +kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) +{ + struct kauth_group_membership *gm; + struct kauth_identity_extlookup el; + int i, error; + + /* + * Check the per-credential list of override groups. + * + * We can conditionalise this on cred->cr_gmuid == KAUTH_UID_NONE since + * the cache should be used for that case. 
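+	 *
+	 * (Editorial usage sketch, hypothetical caller):
+	 *
+	 *	int ismember;
+	 *
+	 *	if (kauth_cred_ismember_gid(cred, gid, &ismember) == 0 && ismember)
+	 *		... treat cred as a member of gid ...
+	 *
+	 * On error, resultp is left untouched, per the guarantee above.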
+ */ + for (i = 0; i < cred->cr_ngroups; i++) { + if (gid == cred->cr_groups[i]) { + *resultp = 1; + return(0); + } + } + + /* + * If we don't have a UID for group membership checks, the in-cred list + * was authoritative and we can stop here. + */ + if (cred->cr_gmuid == KAUTH_UID_NONE) { + *resultp = 0; + return(0); + } + + + /* + * If the resolver hasn't checked in yet, we are early in the boot phase and + * the local group list is complete and authoritative. + */ + if (!kauth_resolver_registered) { + *resultp = 0; + return(0); + } + + /* TODO: */ + /* XXX check supplementary groups */ + /* XXX check whiteout groups */ + /* XXX nesting of supplementary/whiteout groups? */ + + /* + * Check the group cache. + */ + KAUTH_GROUPS_LOCK(); + TAILQ_FOREACH(gm, &kauth_groups, gm_link) { + if ((gm->gm_uid == cred->cr_gmuid) && (gm->gm_gid == gid) && !kauth_groups_expired(gm)) { + kauth_groups_lru(gm); + break; + } + } + + /* did we find a membership entry? */ + if (gm != NULL) + *resultp = (gm->gm_flags & KAUTH_GROUP_ISMEMBER) ? 1 : 0; + KAUTH_GROUPS_UNLOCK(); + + /* if we did, we can return now */ + if (gm != NULL) + return(0); + + /* nothing in the cache, need to go to userland */ + el.el_flags = KAUTH_EXTLOOKUP_VALID_UID | KAUTH_EXTLOOKUP_VALID_GID | KAUTH_EXTLOOKUP_WANT_MEMBERSHIP; + el.el_uid = cred->cr_gmuid; + el.el_gid = gid; + error = kauth_identity_resolve(&el); + if (error != 0) + return(error); + /* save the results from the lookup */ + kauth_groups_updatecache(&el); + + /* if we successfully ascertained membership, report */ + if (el.el_flags & KAUTH_EXTLOOKUP_VALID_MEMBERSHIP) { + *resultp = (el.el_flags & KAUTH_EXTLOOKUP_ISMEMBER) ? 1 : 0; + return(0); + } + + return(ENOENT); +} + +/* + * Determine whether the supplied credential is a member of the + * group nominated by GUID. + */ +int +kauth_cred_ismember_guid(kauth_cred_t cred, guid_t *guidp, int *resultp) +{ + gid_t gid; + int error, wkg; + + error = 0; + wkg = kauth_wellknown_guid(guidp); + switch(wkg) { + case KAUTH_WKG_NOBODY: + *resultp = 0; + break; + case KAUTH_WKG_EVERYBODY: + *resultp = 1; + break; + default: + /* translate guid to gid */ + if ((error = kauth_cred_guid2gid(guidp, &gid)) != 0) { + /* + * If we have no guid -> gid translation, it's not a group and + * thus the cred can't be a member. + */ + if (error == ENOENT) { + *resultp = 0; + error = 0; + } + } else { + error = kauth_cred_ismember_gid(cred, gid, resultp); + } + } + return(error); +} + +/* + * Fast replacement for issuser() + */ +int +kauth_cred_issuser(kauth_cred_t cred) +{ + return(cred->cr_uid == 0); +} + +/* + * Credential KPI + */ + +/* lock protecting credential hash table */ +static lck_mtx_t *kauth_cred_hash_mtx; +#define KAUTH_CRED_HASH_LOCK() lck_mtx_lock(kauth_cred_hash_mtx); +#define KAUTH_CRED_HASH_UNLOCK() lck_mtx_unlock(kauth_cred_hash_mtx); + +void +kauth_cred_init(void) +{ + int i; + + kauth_cred_hash_mtx = lck_mtx_alloc_init(kauth_lck_grp, 0/*LCK_ATTR_NULL*/); + kauth_cred_table_size = kauth_cred_primes[kauth_cred_primes_index]; + + /*allocate credential hash table */ + MALLOC(kauth_cred_table_anchor, struct kauth_cred_entry_head *, + (sizeof(struct kauth_cred_entry_head) * kauth_cred_table_size), + M_KAUTH, M_WAITOK | M_ZERO); + for (i = 0; i < kauth_cred_table_size; i++) { + TAILQ_INIT(&kauth_cred_table_anchor[i]); + } +} + +/* + * Return the current thread's effective UID. + */ +uid_t +kauth_getuid(void) +{ + return(kauth_cred_get()->cr_uid); +} + +/* + * Return the current thread's real UID. 
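+ *
+ * (Editorial note) Like kauth_getuid() above, this goes through
+ * kauth_cred_get() and reads the field directly, e.g. a hypothetical
+ * caller wanting the invoking user rather than the effective one:
+ *
+ *	uid_t ruid = kauth_getruid();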
+ */ +uid_t +kauth_getruid(void) +{ + return(kauth_cred_get()->cr_ruid); +} + +/* + * Return the current thread's effective GID. + */ +gid_t +kauth_getgid(void) +{ + return(kauth_cred_get()->cr_groups[0]); +} + +/* + * Return the current thread's real GID. + */ +gid_t +kauth_getrgid(void) +{ + return(kauth_cred_get()->cr_rgid); +} + +/* + * Returns a pointer to the current thread's credential, does not take a + * reference (so the caller must not do anything that would let the thread's + * credential change while using the returned value). + */ +kauth_cred_t +kauth_cred_get(void) +{ + struct proc *p; + struct uthread *uthread; + + uthread = get_bsdthread_info(current_thread()); + /* sanity */ + if (uthread == NULL) + panic("thread wants credential but has no BSD thread info"); + /* + * We can lazy-bind credentials to threads, as long as their processes have them. + * If we later inline this function, the code in this block should probably be + * called out in a function. + */ + if (uthread->uu_ucred == NOCRED) { + if ((p = (proc_t) get_bsdtask_info(get_threadtask(current_thread()))) == NULL) + panic("thread wants credential but has no BSD process"); + proc_lock(p); + kauth_cred_ref(uthread->uu_ucred = p->p_ucred); + proc_unlock(p); + } + return(uthread->uu_ucred); +} + +/* + * Returns a pointer to the current thread's credential, takes a reference. + */ +kauth_cred_t +kauth_cred_get_with_ref(void) +{ + struct proc *procp; + struct uthread *uthread; + + uthread = get_bsdthread_info(current_thread()); + /* sanity checks */ + if (uthread == NULL) + panic("%s - thread wants credential but has no BSD thread info", __FUNCTION__); + if ((procp = (proc_t) get_bsdtask_info(get_threadtask(current_thread()))) == NULL) + panic("%s - thread wants credential but has no BSD process", __FUNCTION__); + + /* + * We can lazy-bind credentials to threads, as long as their processes have them. + * If we later inline this function, the code in this block should probably be + * called out in a function. + */ + proc_lock(procp); + if (uthread->uu_ucred == NOCRED) { + /* take reference for new cred in thread */ + kauth_cred_ref(uthread->uu_ucred = proc_ucred(procp)); + } + /* take a reference for our caller */ + kauth_cred_ref(uthread->uu_ucred); + proc_unlock(procp); + return(uthread->uu_ucred); +} + +/* + * Returns a pointer to the given process's credential, takes a reference. + */ +kauth_cred_t +kauth_cred_proc_ref(proc_t procp) +{ + kauth_cred_t cred; + + proc_lock(procp); + cred = proc_ucred(procp); + kauth_cred_ref(cred); + proc_unlock(procp); + return(cred); +} + +/* + * Allocates a new credential. + */ +kauth_cred_t +kauth_cred_alloc(void) +{ + kauth_cred_t newcred; + + MALLOC(newcred, kauth_cred_t, sizeof(*newcred), M_KAUTH, M_WAITOK | M_ZERO); + if (newcred != 0) { + newcred->cr_ref = 1; + /* must do this, or cred has same group membership as uid 0 */ + newcred->cr_gmuid = KAUTH_UID_NONE; +#if CRED_DIAGNOSTIC + } else { + panic("kauth_cred_alloc: couldn't allocate credential"); +#endif + } + +#if KAUTH_CRED_HASH_DEBUG + kauth_cred_count++; +#endif + + return(newcred); +} + +/* + * Looks to see if we already have a known credential and if found bumps the + * reference count and returns it. If there are no credentials that match + * the given credential then we allocate a new credential. + * + * Note that the gmuid is hard-defaulted to the UID specified. Since we maintain + * this field, we can't expect callers to know how it needs to be set. Callers + * should be prepared for this field to be overwritten. 
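+ *
+ * (Editorial usage sketch, hypothetical template values):
+ *
+ *	struct ucred template;
+ *
+ *	bzero(&template, sizeof(template));
+ *	template.cr_uid = template.cr_ruid = template.cr_svuid = 501;
+ *	template.cr_ngroups = 1;
+ *	template.cr_groups[0] = 20;
+ *	shared = kauth_cred_create(&template);
+ *
+ * The returned credential is the shared, reference-counted instance; the
+ * template itself is never linked into the hash table.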
+ */
+kauth_cred_t
+kauth_cred_create(kauth_cred_t cred)
+{
+	kauth_cred_t	found_cred, new_cred = NULL;
+
+	cred->cr_gmuid = cred->cr_uid;
+
+	for (;;) {
+		KAUTH_CRED_HASH_LOCK();
+		found_cred = kauth_cred_find(cred);
+		if (found_cred != NULL) {
+			/* found an existing credential so we'll bump reference count and return */
+			kauth_cred_ref(found_cred);
+			KAUTH_CRED_HASH_UNLOCK();
+			return(found_cred);
+		}
+		KAUTH_CRED_HASH_UNLOCK();
+
+		/* no existing credential found.  create one and add it to our hash table */
+		new_cred = kauth_cred_alloc();
+		if (new_cred != NULL) {
+			int		err;
+			new_cred->cr_uid = cred->cr_uid;
+			new_cred->cr_ruid = cred->cr_ruid;
+			new_cred->cr_svuid = cred->cr_svuid;
+			new_cred->cr_rgid = cred->cr_rgid;
+			new_cred->cr_svgid = cred->cr_svgid;
+			new_cred->cr_gmuid = cred->cr_gmuid;
+			new_cred->cr_ngroups = cred->cr_ngroups;
+			bcopy(&cred->cr_groups[0], &new_cred->cr_groups[0], sizeof(new_cred->cr_groups));
+			KAUTH_CRED_HASH_LOCK();
+			err = kauth_cred_add(new_cred);
+			KAUTH_CRED_HASH_UNLOCK();
+
+			/* retry if kauth_cred_add returns a non-zero value */
+			if (err == 0)
+				break;
+			FREE(new_cred, M_KAUTH);
+			new_cred = NULL;
+		}
+	}
+
+	return(new_cred);
+}
+
+/*
+ * Update the given credential using the uid argument.  The given uid is used
+ * to set the effective user ID, real user ID, and saved user ID.  We only
+ * allocate a new credential when the given uid actually results in changes to
+ * the existing credential.
+ */
+kauth_cred_t
+kauth_cred_setuid(kauth_cred_t cred, uid_t uid)
+{
+	struct ucred	temp_cred;
+
+	NULLCRED_CHECK(cred);
+
+	/* don't need to do anything if the effective, real and saved user IDs are
+	 * already the same as the user ID passed in
+	 */
+	if (cred->cr_uid == uid && cred->cr_ruid == uid && cred->cr_svuid == uid) {
+		/* no change needed */
+		return(cred);
+	}
+
+	/* look up in cred hash table to see if we have a matching credential
+	 * with new values.
+	 */
+	bcopy(cred, &temp_cred, sizeof(temp_cred));
+	temp_cred.cr_uid = uid;
+	temp_cred.cr_ruid = uid;
+	temp_cred.cr_svuid = uid;
+	temp_cred.cr_gmuid = uid;
+
+	return(kauth_cred_update(cred, &temp_cred, TRUE));
+}
+
+/*
+ * Update the given credential using the euid argument.  The given uid is used
+ * to set the effective user ID.  We only allocate a new credential when the
+ * given uid actually results in changes to the existing credential.
+ */
+kauth_cred_t
+kauth_cred_seteuid(kauth_cred_t cred, uid_t euid)
+{
+	struct ucred	temp_cred;
+
+	NULLCRED_CHECK(cred);
+
+	/* don't need to do anything if the given effective user ID is already the
+	 * same as the effective user ID in the credential.
+	 */
+	if (cred->cr_uid == euid) {
+		/* no change needed */
+		return(cred);
+	}
+
+	/* look up in cred hash table to see if we have a matching credential
+	 * with new values.
+	 */
+	bcopy(cred, &temp_cred, sizeof(temp_cred));
+	temp_cred.cr_uid = euid;
+
+	return(kauth_cred_update(cred, &temp_cred, TRUE));
+}
+
+/*
+ * Update the given credential using the gid argument.  The given gid is used
+ * to set the effective group ID, real group ID, and saved group ID.  We only
+ * allocate a new credential when the given gid actually results in changes to
+ * the existing credential.
+ */
+kauth_cred_t
+kauth_cred_setgid(kauth_cred_t cred, gid_t gid)
+{
+	struct ucred	temp_cred;
+
+	NULLCRED_CHECK(cred);
+
+	/* don't need to do anything if the given group ID is already the
+	 * same as the group ID in the credential.
+	 */
+	if (cred->cr_groups[0] == gid && cred->cr_rgid == gid && cred->cr_svgid == gid) {
+		/* no change needed */
+		return(cred);
+	}
+
+	/* look up in cred hash table to see if we have a matching credential
+	 * with new values.
+	 */
+	bcopy(cred, &temp_cred, sizeof(temp_cred));
+	temp_cred.cr_groups[0] = gid;
+	temp_cred.cr_rgid = gid;
+	temp_cred.cr_svgid = gid;
+
+	return(kauth_cred_update(cred, &temp_cred, TRUE));
+}
+
+/*
+ * Update the given credential using the egid argument.  The given gid is used
+ * to set the effective group ID.  We only allocate a new credential when the
+ * given gid actually results in changes to the existing credential.
+ */
+kauth_cred_t
+kauth_cred_setegid(kauth_cred_t cred, gid_t egid)
+{
+	struct ucred	temp_cred;
+
+	NULLCRED_CHECK(cred);
+
+	/* don't need to do anything if the given group ID is already the
+	 * same as the group ID in the credential.
+	 */
+	if (cred->cr_groups[0] == egid) {
+		/* no change needed */
+		return(cred);
+	}
+
+	/* look up in cred hash table to see if we have a matching credential
+	 * with new values.
+	 */
+	bcopy(cred, &temp_cred, sizeof(temp_cred));
+	temp_cred.cr_groups[0] = egid;
+
+	return(kauth_cred_update(cred, &temp_cred, TRUE));
+}
+
+/*
+ * Update the given credential with the given groups.  We only allocate a new
+ * credential when the given groups actually result in changes to the existing
+ * credential.
+ * The gmuid argument supplies a new uid (or KAUTH_UID_NONE to opt out)
+ * which will be used for group membership checking.
+ */
+kauth_cred_t
+kauth_cred_setgroups(kauth_cred_t cred, gid_t *groups, int groupcount, uid_t gmuid)
+{
+	int		i;
+	struct ucred	temp_cred;
+
+	NULLCRED_CHECK(cred);
+
+	/* don't need to do anything if the given list of groups does not change.
+	 */
+	if ((cred->cr_gmuid == gmuid) && (cred->cr_ngroups == groupcount)) {
+		for (i = 0; i < groupcount; i++) {
+			if (cred->cr_groups[i] != groups[i])
+				break;
+		}
+		if (i == groupcount) {
+			/* no change needed */
+			return(cred);
+		}
+	}
+
+	/* look up in cred hash table to see if we have a matching credential
+	 * with new values.
+	 */
+	bcopy(cred, &temp_cred, sizeof(temp_cred));
+	temp_cred.cr_ngroups = groupcount;
+	bcopy(groups, temp_cred.cr_groups, sizeof(temp_cred.cr_groups));
+	temp_cred.cr_gmuid = gmuid;
+
+	return(kauth_cred_update(cred, &temp_cred, TRUE));
+}
+
+/*
+ * Update the given credential using the uid and gid arguments.  The given uid
+ * is used to set the effective user ID, real user ID, and saved user ID.
+ * The given gid is used to set the effective group ID, real group ID, and
+ * saved group ID.
+ * We only allocate a new credential when the given uid and gid actually result
+ * in changes to the existing credential.
+ */
+kauth_cred_t
+kauth_cred_setuidgid(kauth_cred_t cred, uid_t uid, gid_t gid)
+{
+	struct ucred	temp_cred;
+
+	NULLCRED_CHECK(cred);
+
+	/* don't need to do anything if the effective, real and saved user and
+	 * group IDs are already the same as the IDs passed in
+	 */
+	if (cred->cr_uid == uid && cred->cr_ruid == uid && cred->cr_svuid == uid &&
+	    cred->cr_groups[0] == gid && cred->cr_rgid == gid && cred->cr_svgid == gid) {
+		/* no change needed */
+		return(cred);
+	}
+
+	/* look up in cred hash table to see if we have a matching credential
+	 * with new values.
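+	 *
+	 * (Editorial note) temp_cred is a stack-local model only; it is never
+	 * reference-counted or inserted.  kauth_cred_update() below either
+	 * finds an equivalent shared credential or duplicates the model.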
+	 */
+	bzero(&temp_cred, sizeof(temp_cred));
+	temp_cred.cr_uid = uid;
+	temp_cred.cr_ruid = uid;
+	temp_cred.cr_svuid = uid;
+	temp_cred.cr_gmuid = uid;
+	temp_cred.cr_ngroups = 1;
+	temp_cred.cr_groups[0] = gid;
+	temp_cred.cr_rgid = gid;
+	temp_cred.cr_svgid = gid;
+
+	return(kauth_cred_update(cred, &temp_cred, TRUE));
+}
+
+/*
+ * Update the given credential using the uid and gid arguments.  The given uid
+ * is used to set the saved user ID.  The given gid is used to set the
+ * saved group ID.
+ * We only allocate a new credential when the given uid and gid actually result
+ * in changes to the existing credential.
+ */
+kauth_cred_t
+kauth_cred_setsvuidgid(kauth_cred_t cred, uid_t uid, gid_t gid)
+{
+	struct ucred	temp_cred;
+
+	NULLCRED_CHECK(cred);
+
+	/* don't need to do anything if the saved user and group IDs are
+	 * already the same as the IDs passed in
+	 */
+	if (cred->cr_svuid == uid && cred->cr_svgid == gid) {
+		/* no change needed */
+		return(cred);
+	}
+
+	/* look up in cred hash table to see if we have a matching credential
+	 * with new values.
+	 */
+	bcopy(cred, &temp_cred, sizeof(temp_cred));
+	temp_cred.cr_svuid = uid;
+	temp_cred.cr_svgid = gid;
+
+	return(kauth_cred_update(cred, &temp_cred, TRUE));
+}
+
+/*
+ * Update the given credential using the given auditinfo_t.
+ * We only allocate a new credential when the given auditinfo_t actually results
+ * in changes to the existing credential.
+ */
+kauth_cred_t
+kauth_cred_setauditinfo(kauth_cred_t cred, auditinfo_t *auditinfo_p)
+{
+	struct ucred	temp_cred;
+
+	NULLCRED_CHECK(cred);
+
+	/* don't need to do anything if the audit info is already the same as the
+	 * audit info in the credential passed in
+	 */
+	if (bcmp(&cred->cr_au, auditinfo_p, sizeof(cred->cr_au)) == 0) {
+		/* no change needed */
+		return(cred);
+	}
+
+	/* look up in cred hash table to see if we have a matching credential
+	 * with new values.
+	 */
+	bcopy(cred, &temp_cred, sizeof(temp_cred));
+	bcopy(auditinfo_p, &temp_cred.cr_au, sizeof(temp_cred.cr_au));
+
+	return(kauth_cred_update(cred, &temp_cred, FALSE));
+}
+
+/*
+ * Add a reference to the passed credential.
+ */
+void
+kauth_cred_ref(kauth_cred_t cred)
+{
+	int		old_value;
+
+	NULLCRED_CHECK(cred);
+
+	old_value = OSAddAtomic(1, &cred->cr_ref);
+
+	if (old_value < 1)
+		panic("kauth_cred_ref: trying to take a reference on a cred with no references");
+
+	return;
+}
+
+/*
+ * Drop a reference from the passed credential, potentially destroying it.
+ */
+void
+kauth_cred_rele(kauth_cred_t cred)
+{
+	int		old_value;
+
+	NULLCRED_CHECK(cred);
+
+	KAUTH_CRED_HASH_LOCK();
+	old_value = OSAddAtomic(-1, &cred->cr_ref);
+
+#if DIAGNOSTIC
+	if (old_value == 0)
+		panic("kauth_cred_rele: dropping a reference on a cred with no references");
+#endif
+
+	if (old_value < 3) {
+		/* the last remaining reference is the one held by the
+		 * credential hash table, so remove the entry */
+		kauth_cred_remove(cred);
+	}
+	KAUTH_CRED_HASH_UNLOCK();
+}
+
+/*
+ * Duplicate a credential.
+ * NOTE - caller should call kauth_cred_add after any credential changes are made.
+ */
+kauth_cred_t
+kauth_cred_dup(kauth_cred_t cred)
+{
+	kauth_cred_t newcred;
+
+#if CRED_DIAGNOSTIC
+	if (cred == NOCRED || cred == FSCRED)
+		panic("kauth_cred_dup: bad credential");
+#endif
+	newcred = kauth_cred_alloc();
+	if (newcred != NULL) {
+		bcopy(cred, newcred, sizeof(*newcred));
+		newcred->cr_ref = 1;
+	}
+	return(newcred);
+}
+
+/*
+ * Returns a credential based on the passed credential but which
+ * reflects the real rather than effective UID and GID.
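+ *
+ * (Editorial usage sketch) A hypothetical caller doing an access(2)-style
+ * check against the real IDs:
+ *
+ *	real_cred = kauth_cred_copy_real(cred);
+ *	... evaluate the check with real_cred ...
+ *	kauth_cred_rele(real_cred);
+ *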
+ * NOTE - we do NOT decrement cred reference count on passed in credential
+ */
+kauth_cred_t
+kauth_cred_copy_real(kauth_cred_t cred)
+{
+	kauth_cred_t newcred = NULL, found_cred;
+	struct ucred temp_cred;
+
+	/* if the credential is already 'real', just take a reference */
+	if ((cred->cr_ruid == cred->cr_uid) &&
+	    (cred->cr_rgid == cred->cr_gid)) {
+		kauth_cred_ref(cred);
+		return(cred);
+	}
+
+	/* look up in cred hash table to see if we have a matching credential
+	 * with new values.
+	 */
+	bcopy(cred, &temp_cred, sizeof(temp_cred));
+	temp_cred.cr_uid = cred->cr_ruid;
+	temp_cred.cr_groups[0] = cred->cr_rgid;
+	/* if the cred is not opted out, make sure we are using the r/euid for group checks */
+	if (temp_cred.cr_gmuid != KAUTH_UID_NONE)
+		temp_cred.cr_gmuid = cred->cr_ruid;
+
+	for (;;) {
+		int		err;
+
+		KAUTH_CRED_HASH_LOCK();
+		found_cred = kauth_cred_find(&temp_cred);
+		if (found_cred == cred) {
+			/* same cred so just bail */
+			KAUTH_CRED_HASH_UNLOCK();
+			return(cred);
+		}
+		if (found_cred != NULL) {
+			/* found a match; bump its reference count and return it
+			 * (the passed-in credential keeps its reference).
+			 */
+			kauth_cred_ref(found_cred);
+			KAUTH_CRED_HASH_UNLOCK();
+			return(found_cred);
+		}
+
+		/* must allocate a new credential, copy in old credential data and update
+		 * with real user and group IDs.
+		 */
+		newcred = kauth_cred_dup(&temp_cred);
+		err = kauth_cred_add(newcred);
+		KAUTH_CRED_HASH_UNLOCK();
+
+		/* retry if kauth_cred_add returns a non-zero value */
+		if (err == 0)
+			break;
+		FREE(newcred, M_KAUTH);
+		newcred = NULL;
+	}
+
+	return(newcred);
+}
+
+/*
+ * Common code to update a credential.  model_cred is a temporary,
+ * non-reference-counted credential used only for comparison and modeling
+ * purposes.  old_cred is a live reference-counted credential that we intend
+ * to update using model_cred as our model.
+ */
+static kauth_cred_t kauth_cred_update(kauth_cred_t old_cred, kauth_cred_t model_cred, boolean_t retain_auditinfo)
+{
+	kauth_cred_t found_cred, new_cred = NULL;
+
+	/* make sure we carry the auditinfo forward to the new credential unless
+	 * we are actually updating the auditinfo.
+	 */
+	if (retain_auditinfo)
+		bcopy(&old_cred->cr_au, &model_cred->cr_au, sizeof(model_cred->cr_au));
+
+	for (;;) {
+		int		err;
+
+		KAUTH_CRED_HASH_LOCK();
+		found_cred = kauth_cred_find(model_cred);
+		if (found_cred == old_cred) {
+			/* same cred so just bail */
+			KAUTH_CRED_HASH_UNLOCK();
+			return(old_cred);
+		}
+		if (found_cred != NULL) {
+			/* found a match so we bump reference count on new one and decrement
+			 * reference count on the old one.
+			 */
+			kauth_cred_ref(found_cred);
+			KAUTH_CRED_HASH_UNLOCK();
+			kauth_cred_rele(old_cred);
+			return(found_cred);
+		}
+
+		/* must allocate a new credential using the model.  also
+		 * adds the new credential to the credential hash table.
+		 */
+		new_cred = kauth_cred_dup(model_cred);
+		err = kauth_cred_add(new_cred);
+		KAUTH_CRED_HASH_UNLOCK();
+
+		/* retry if kauth_cred_add returns a non-zero value */
+		if (err == 0)
+			break;
+		FREE(new_cred, M_KAUTH);
+		new_cred = NULL;
+	}
+
+	kauth_cred_rele(old_cred);
+	return(new_cred);
+}
+
+/*
+ * Add the given credential to our credential hash table and take an additional
+ * reference to account for our use of the credential in the hash table.
+ * NOTE - expects caller to hold KAUTH_CRED_HASH_LOCK!
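+ *
+ * (Editorial usage sketch - the retry pattern used by the callers above):
+ *
+ *	KAUTH_CRED_HASH_LOCK();
+ *	err = kauth_cred_add(new_cred);
+ *	KAUTH_CRED_HASH_UNLOCK();
+ *	if (err != 0)
+ *		FREE(new_cred, M_KAUTH);	(lost the race; retry the lookup)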
+ */
+static int kauth_cred_add(kauth_cred_t new_cred)
+{
+	u_long		hash_key;
+
+	hash_key = kauth_cred_get_hashkey(new_cred);
+	hash_key %= kauth_cred_table_size;
+
+	/* race fix - there is a window where another matching credential
+	 * could have been inserted between the time this one was created and we
+	 * got the hash lock.  If we find a match return an error and have the
+	 * caller retry.
+	 */
+	if (kauth_cred_find(new_cred) != NULL) {
+		return(-1);
+	}
+
+	/* take a reference for our use in credential hash table */
+	kauth_cred_ref(new_cred);
+
+	/* insert the credential into the hash table */
+	TAILQ_INSERT_HEAD(&kauth_cred_table_anchor[hash_key], new_cred, cr_link);
+
+	return(0);
+}
+
+/*
+ * Remove the given credential from our credential hash table.
+ * NOTE - expects caller to hold KAUTH_CRED_HASH_LOCK!
+ */
+static void kauth_cred_remove(kauth_cred_t cred)
+{
+	u_long		hash_key;
+	kauth_cred_t	found_cred;
+
+	hash_key = kauth_cred_get_hashkey(cred);
+	hash_key %= kauth_cred_table_size;
+
+	/* avoid race */
+	if (cred->cr_ref < 1)
+		panic("cred reference underflow");
+	if (cred->cr_ref > 1)
+		return;		/* someone else got a ref */
+
+	/* find cred in the credential hash table */
+	TAILQ_FOREACH(found_cred, &kauth_cred_table_anchor[hash_key], cr_link) {
+		if (found_cred == cred) {
+			/* found a match, remove it from the hash table */
+			TAILQ_REMOVE(&kauth_cred_table_anchor[hash_key], found_cred, cr_link);
+			FREE(cred, M_KAUTH);
+#if KAUTH_CRED_HASH_DEBUG
+			kauth_cred_count--;
+#endif
+			return;
+		}
+	}
+
+	/* did not find a match.  This should not happen! */
+	printf("%s - %d - %s - did not find a match \n", __FILE__, __LINE__, __FUNCTION__);
+	return;
+}
+
+/*
+ * Using the given credential data, look for a match in our credential hash
+ * table.
+ * NOTE - expects caller to hold KAUTH_CRED_HASH_LOCK!
+ */
+kauth_cred_t kauth_cred_find(kauth_cred_t cred)
+{
+	u_long		hash_key;
+	kauth_cred_t	found_cred;
+
+#if KAUTH_CRED_HASH_DEBUG
+	static int	test_count = 0;
+
+	test_count++;
+	if ((test_count % 200) == 0) {
+		kauth_cred_hash_print();
+	}
+#endif
+
+	hash_key = kauth_cred_get_hashkey(cred);
+	hash_key %= kauth_cred_table_size;
+
+	/* find cred in the credential hash table */
+	TAILQ_FOREACH(found_cred, &kauth_cred_table_anchor[hash_key], cr_link) {
+		if (bcmp(&found_cred->cr_uid, &cred->cr_uid, (sizeof(struct ucred) - offsetof(struct ucred, cr_uid))) == 0) {
+			/* found a match */
+			return(found_cred);
+		}
+	}
+	/* no match found */
+	return(NULL);
+}
+
+/*
+ * Generates a hash key using data that makes up a credential.  Based on ElfHash.
+ */
+static u_long kauth_cred_get_hashkey(kauth_cred_t cred)
+{
+	u_long	hash_key = 0;
+
+	hash_key = kauth_cred_hash((uint8_t *)&cred->cr_uid,
+			(sizeof(struct ucred) - offsetof(struct ucred, cr_uid)),
+			hash_key);
+	return(hash_key);
+}
+
+/*
+ * Generates a hash key using data that makes up a credential.  Based on ElfHash.
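+ *
+ * (Editorial note) This is the classic ELF/PJW hash - shift each byte in,
+ * then fold any bits that reach the top nibble back into the low bits:
+ *
+ *	h = (h << 4) + byte;
+ *	g = h & 0xF0000000;
+ *	if (g)
+ *		h ^= g >> 24;
+ *	h &= ~g;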
+ */ +static inline u_long kauth_cred_hash(const uint8_t *datap, int data_len, u_long start_key) +{ + u_long hash_key = start_key; + u_long temp; + + while (data_len > 0) { + hash_key = (hash_key << 4) + *datap++; + temp = hash_key & 0xF0000000; + if (temp) { + hash_key ^= temp >> 24; + } + hash_key &= ~temp; + data_len--; + } + return(hash_key); +} + +#if KAUTH_CRED_HASH_DEBUG +static void kauth_cred_hash_print(void) +{ + int i, j; + kauth_cred_t found_cred; + + printf("\n\t kauth credential hash table statistics - current cred count %d \n", kauth_cred_count); + /* count slot hits, misses, collisions, and max depth */ + for (i = 0; i < kauth_cred_table_size; i++) { + printf("[%02d] ", i); + j = 0; + TAILQ_FOREACH(found_cred, &kauth_cred_table_anchor[i], cr_link) { + if (j > 0) { + printf("---- "); + } + j++; + kauth_cred_print(found_cred); + printf("\n"); + } + if (j == 0) { + printf("NOCRED \n"); + } + } +} + + +static void kauth_cred_print(kauth_cred_t cred) +{ + int i; + + printf("0x%02X - refs %d uids %d %d %d ", cred, cred->cr_ref, cred->cr_uid, cred->cr_ruid, cred->cr_svuid); + printf("group count %d gids ", cred->cr_ngroups); + for (i = 0; i < NGROUPS; i++) { + printf("%d ", cred->cr_groups[i]); + } + printf("%d %d %d ", cred->cr_rgid, cred->cr_svgid, cred->cr_gmuid); + printf("auditinfo %d %d %d %d %d %d ", + cred->cr_au.ai_auid, cred->cr_au.ai_mask.am_success, cred->cr_au.ai_mask.am_failure, + cred->cr_au.ai_termid.port, cred->cr_au.ai_termid.machine, cred->cr_au.ai_asid); + +} +#endif diff --git a/bsd/kern/kern_descrip.c b/bsd/kern/kern_descrip.c index d72fb8932..b0c759539 100644 --- a/bsd/kern/kern_descrip.c +++ b/bsd/kern/kern_descrip.c @@ -64,61 +64,145 @@ #include <sys/systm.h> #include <sys/filedesc.h> #include <sys/kernel.h> -#include <sys/vnode.h> -#include <sys/proc.h> -#include <sys/file.h> +#include <sys/vnode_internal.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> +#include <sys/file_internal.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <sys/fcntl.h> #include <sys/malloc.h> +#include <sys/mman.h> #include <sys/syslog.h> #include <sys/unistd.h> #include <sys/resourcevar.h> #include <sys/aio_kern.h> +#include <sys/ev.h> +#include <kern/lock.h> #include <bsm/audit_kernel.h> -#include <sys/mount.h> - +#include <sys/mount_internal.h> +#include <sys/kdebug.h> +#include <sys/sysproto.h> +#include <sys/pipe.h> +#include <kern/kern_types.h> +#include <kern/kalloc.h> + +struct psemnode; +struct pshmnode; + +int fdopen(dev_t dev, int mode, int type, struct proc *p); +int ogetdtablesize(struct proc *p, void *uap, register_t *retval); +int finishdup(struct proc * p, struct filedesc *fdp, int old, int new, register_t *retval); + +int closef(struct fileglob *fg, struct proc *p); +int falloc_locked(struct proc *p, struct fileproc **resultfp, int *resultfd, int locked); +void fddrop(struct proc *p, int fd); +int fdgetf_noref(struct proc *p, int fd, struct fileproc **resultfp); +void fg_drop(struct fileproc * fp); +void fg_free(struct fileglob *fg); +void fg_ref(struct fileproc * fp); +int fp_getfpshm(struct proc *p, int fd, struct fileproc **resultfp, struct pshmnode **resultpshm); + +static int closef_finish(struct fileproc *fp, struct fileglob *fg, struct proc *p); + +extern void file_lock_init(void); +extern int is_suser(void); +extern int kqueue_stat(struct fileproc *fp, struct stat *st, struct proc *p); +extern int soo_stat(struct socket *so, struct stat *ub); +extern int vn_path_package_check(vnode_t vp, 
char *path, int pathlen, int *component); + +extern kauth_scope_t kauth_scope_fileop; + +#define f_flag f_fglob->fg_flag +#define f_type f_fglob->fg_type +#define f_msgcount f_fglob->fg_msgcount +#define f_cred f_fglob->fg_cred +#define f_ops f_fglob->fg_ops +#define f_offset f_fglob->fg_offset +#define f_data f_fglob->fg_data /* * Descriptor management. */ struct filelist filehead; /* head of list of open files */ +struct fmsglist fmsghead; /* head of list of open files */ +struct fmsglist fmsg_ithead; /* head of list of open files */ int nfiles; /* actual number of open files */ -static int frele_internal(struct file *); + +lck_grp_attr_t * file_lck_grp_attr; +lck_grp_t * file_lck_grp; +lck_attr_t * file_lck_attr; + +lck_mtx_t * uipc_lock; +lck_mtx_t * file_iterate_lcok; +lck_mtx_t * file_flist_lock; + + +void +file_lock_init(void) +{ + + /* allocate file lock group attribute and group */ + file_lck_grp_attr= lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(file_lck_grp_attr); + + file_lck_grp = lck_grp_alloc_init("file", file_lck_grp_attr); + + /* Allocate file lock attribute */ + file_lck_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(file_lck_attr); + + uipc_lock = lck_mtx_alloc_init(file_lck_grp, file_lck_attr); + file_iterate_lcok = lck_mtx_alloc_init(file_lck_grp, file_lck_attr); + file_flist_lock = lck_mtx_alloc_init(file_lck_grp, file_lck_attr); + + + +} + + +void +proc_fdlock(struct proc *p) +{ + lck_mtx_lock(&p->p_fdmlock); +} + +void +proc_fdunlock(struct proc *p) +{ + lck_mtx_unlock(&p->p_fdmlock); +} /* * System calls on descriptors. */ -/* ARGSUSED */ + int -getdtablesize(p, uap, retval) - struct proc *p; - void *uap; - register_t *retval; +getdtablesize(struct proc *p, __unused struct getdtablesize_args *uap, register_t *retval) { + proc_fdlock(p); *retval = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles); + proc_fdunlock(p); + return (0); } -/* ARGSUSED */ int -ogetdtablesize(p, uap, retval) - struct proc *p; - void *uap; - register_t *retval; +ogetdtablesize(struct proc *p, __unused void *uap, register_t *retval) { + proc_fdlock(p); *retval = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, NOFILE); + proc_fdunlock(p); + return (0); } -static __inline__ -void _fdrelse(fdp, fd) - register struct filedesc *fdp; - register int fd; +static __inline__ void +_fdrelse(struct filedesc *fdp, int fd) { if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; @@ -128,6 +212,7 @@ void _fdrelse(fdp, fd) #endif fdp->fd_ofiles[fd] = NULL; fdp->fd_ofileflags[fd] = 0; + while ((fd = fdp->fd_lastfile) > 0 && fdp->fd_ofiles[fd] == NULL && !(fdp->fd_ofileflags[fd] & UF_RESERVED)) @@ -137,9 +222,6 @@ void _fdrelse(fdp, fd) /* * Duplicate a file descriptor. */ -struct dup_args { - u_int fd; -}; /* ARGSUSED */ int dup(p, uap, retval) @@ -150,23 +232,28 @@ dup(p, uap, retval) register struct filedesc *fdp = p->p_fd; register int old = uap->fd; int new, error; + struct fileproc *fp; - if ((u_int)old >= fdp->fd_nfiles || - fdp->fd_ofiles[old] == NULL || - (fdp->fd_ofileflags[old] & UF_RESERVED)) - return (EBADF); - if (error = fdalloc(p, 0, &new)) + proc_fdlock(p); + if ( (error = fp_lookup(p, old, &fp, 1)) ) { + proc_fdunlock(p); + return(error); + } + if ( (error = fdalloc(p, 0, &new)) ) { + fp_drop(p, old, fp, 1); + proc_fdunlock(p); return (error); - return (finishdup(fdp, old, new, retval)); + } + error = finishdup(p, fdp, old, new, retval); + fp_drop(p, old, fp, 1); + proc_fdunlock(p); + + return (error); } /* * Duplicate a file descriptor to a particular value. 
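  *
  * (Editorial note) dup2(from, to) must atomically take over "to": if that
  * descriptor is already open it is closed via close_internal() below, and
  * "to" is then bound to the same fileglob as "from".  The classic
  * redirection idiom relies on this, e.g.:
  *
  *	dup2(logfd, STDOUT_FILENO);	(hypothetical logfd)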
*/ -struct dup2_args { - u_int from; - u_int to; -}; /* ARGSUSED */ int dup2(p, uap, retval) @@ -177,224 +264,308 @@ dup2(p, uap, retval) register struct filedesc *fdp = p->p_fd; register int old = uap->from, new = uap->to; int i, error; + struct fileproc *fp; - if ((u_int)old >= fdp->fd_nfiles || - fdp->fd_ofiles[old] == NULL || - (fdp->fd_ofileflags[old] & UF_RESERVED) || - (u_int)new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || - (u_int)new >= maxfiles) + proc_fdlock(p); + + if ( (error = fp_lookup(p, old, &fp, 1)) ) { + proc_fdunlock(p); + return(error); + } + if (new < 0 || + new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || + new >= maxfiles) { + fp_drop(p, old, fp, 1); + proc_fdunlock(p); return (EBADF); + } if (old == new) { + fp_drop(p, old, fp, 1); *retval = new; + proc_fdunlock(p); return (0); } - if ((u_int)new >= fdp->fd_nfiles) { - if (error = fdalloc(p, new, &i)) + if (new < 0 || new >= fdp->fd_nfiles) { + if ( (error = fdalloc(p, new, &i)) ) { + fp_drop(p, old, fp, 1); + proc_fdunlock(p); return (error); + } if (new != i) { _fdrelse(fdp, i); goto closeit; } } else { - struct file **fpp; + struct fileproc **fpp; char flags; closeit: - if ((flags = fdp->fd_ofileflags[new]) & UF_RESERVED) + flags = fdp->fd_ofileflags[new]; + if ((flags & (UF_RESERVED | UF_CLOSING)) == UF_RESERVED) { + fp_drop(p, old, fp, 1); + proc_fdunlock(p); return (EBADF); - fdp->fd_ofileflags[new] = (flags & ~UF_MAPPED) | UF_RESERVED; + } + /* * dup2() must succeed even if the close has an error. */ if (*(fpp = &fdp->fd_ofiles[new])) { - struct file *fp = *fpp; + struct fileproc *nfp = *fpp; + close_internal(p, new, nfp, (CLOSEINT_LOCKED | CLOSEINT_WAITONCLOSE | CLOSEINT_NOFDRELSE | CLOSEINT_NOFDNOREF)); *fpp = NULL; - (void) closef(fp, p); } } - return (finishdup(fdp, old, new, retval)); + error = finishdup(p, fdp, old, new, retval); + fp_drop(p, old, fp, 1); + proc_fdunlock(p); + + return(error); } /* * The file control system call. */ -struct fcntl_args { - int fd; - int cmd; - int arg; -}; -/* ARGSUSED */ int fcntl(p, uap, retval) struct proc *p; - register struct fcntl_args *uap; + struct fcntl_args *uap; register_t *retval; { int fd = uap->fd; - register struct filedesc *fdp = p->p_fd; - register struct file *fp; - register char *pop; - struct vnode *vp, *devvp; + struct filedesc *fdp = p->p_fd; + struct fileproc *fp; + char *pop; + struct vnode *vp; int i, tmp, error, error2, flg = F_POSIX; struct flock fl; - fstore_t alloc_struct; /* structure for allocate command */ - u_int32_t alloc_flags = 0; - off_t offset; /* used for F_SETSIZE */ + struct vfs_context context; + off_t offset; int newmin; - struct radvisory ra_struct; - fbootstraptransfer_t fbt_struct; /* for F_READBOOTSTRAP and F_WRITEBOOTSTRAP */ - struct log2phys l2p_struct; /* structure for allocate command */ - daddr_t lbn, bn; + daddr64_t lbn, bn; int devBlockSize = 0; + unsigned int fflag; + user_addr_t argp; AUDIT_ARG(fd, uap->fd); AUDIT_ARG(cmd, uap->cmd); - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL || - (fdp->fd_ofileflags[fd] & UF_RESERVED)) - return (EBADF); + + proc_fdlock(p); + if ( (error = fp_lookup(p, fd, &fp, 1)) ) { + proc_fdunlock(p); + return(error); + } + context.vc_proc = p; + context.vc_ucred = fp->f_cred; + if (proc_is64bit(p)) { + argp = uap->arg; + } + else { + /* since the arg parameter is defined as a long but may be either + * a long or a pointer we must take care to handle sign extension + * issues. Our sys call munger will sign extend a long when we are + * called from a 32-bit process. 
Since we can never have an address + * greater than 32-bits from a 32-bit process we lop off the top + * 32-bits to avoid getting the wrong address + */ + argp = CAST_USER_ADDR_T(uap->arg); + } + pop = &fdp->fd_ofileflags[fd]; switch (uap->cmd) { case F_DUPFD: - newmin = (long)uap->arg; + newmin = CAST_DOWN(int, uap->arg); if ((u_int)newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || - (u_int)newmin >= maxfiles) - return (EINVAL); - if (error = fdalloc(p, newmin, &i)) - return (error); - return (finishdup(fdp, fd, i, retval)); + newmin >= maxfiles) { + error = EINVAL; + goto out; + } + if ( (error = fdalloc(p, newmin, &i)) ) + goto out; + error = finishdup(p, fdp, fd, i, retval); + goto out; case F_GETFD: *retval = (*pop & UF_EXCLOSE)? 1 : 0; - return (0); + error = 0; + goto out; case F_SETFD: *pop = (*pop &~ UF_EXCLOSE) | - ((long)(uap->arg) & 1)? UF_EXCLOSE : 0; - return (0); + (uap->arg & 1)? UF_EXCLOSE : 0; + error = 0; + goto out; case F_GETFL: *retval = OFLAGS(fp->f_flag); - return (0); + error = 0; + goto out; case F_SETFL: fp->f_flag &= ~FCNTLFLAGS; - fp->f_flag |= FFLAGS((long)uap->arg) & FCNTLFLAGS; + tmp = CAST_DOWN(int, uap->arg); + fp->f_flag |= FFLAGS(tmp) & FCNTLFLAGS; tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p); if (error) - return (error); + goto out; tmp = fp->f_flag & FASYNC; error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, p); if (!error) - return (0); + goto out; fp->f_flag &= ~FNONBLOCK; tmp = 0; (void)fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p); - return (error); + goto out; case F_GETOWN: if (fp->f_type == DTYPE_SOCKET) { *retval = ((struct socket *)fp->f_data)->so_pgid; - return (0); + error = 0; + goto out; } error = fo_ioctl(fp, (int)TIOCGPGRP, (caddr_t)retval, p); *retval = -*retval; - return (error); + goto out; case F_SETOWN: + tmp = CAST_DOWN(pid_t, uap->arg); if (fp->f_type == DTYPE_SOCKET) { - ((struct socket *)fp->f_data)->so_pgid = - (long)uap->arg; - return (0); + ((struct socket *)fp->f_data)->so_pgid = tmp; + error =0; + goto out; } - if ((long)uap->arg <= 0) { - uap->arg = (int)(-(long)(uap->arg)); + if (fp->f_type == DTYPE_PIPE) { + error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, p); + goto out; + } + + if (tmp <= 0) { + tmp = -tmp; } else { - struct proc *p1 = pfind((long)uap->arg); - if (p1 == 0) - return (ESRCH); - uap->arg = (int)p1->p_pgrp->pg_id; + struct proc *p1 = pfind(tmp); + if (p1 == 0) { + error = ESRCH; + goto out; + } + tmp = (int)p1->p_pgrp->pg_id; } - return (fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&uap->arg, p)); + error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, p); + goto out; case F_SETLKW: flg |= F_WAIT; /* Fall into F_SETLK */ case F_SETLK: - if (fp->f_type != DTYPE_VNODE) - return (EBADF); + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } vp = (struct vnode *)fp->f_data; + fflag = fp->f_flag; + offset = fp->f_offset; + proc_fdunlock(p); + /* Copy in the lock structure */ - error = copyin((caddr_t)uap->arg, (caddr_t)&fl, sizeof (fl)); - if (error) - break; + error = copyin(argp, (caddr_t)&fl, sizeof (fl)); + if (error) { + goto outdrop; + } + if ( (error = vnode_getwithref(vp)) ) { + goto outdrop; + } if (fl.l_whence == SEEK_CUR) - fl.l_start += fp->f_offset; + fl.l_start += offset; + switch (fl.l_type) { case F_RDLCK: - if ((fp->f_flag & FREAD) != 0) { - p->p_flag |= P_ADVLOCK; - error = VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &fl, flg); - } else + if ((fflag & FREAD) == 0) { + (void)vnode_put(vp); error = EBADF; - break; + goto outdrop; + } + p->p_flag |= P_ADVLOCK; + error = 
VNOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &fl, flg, &context); + (void)vnode_put(vp); + goto outdrop; case F_WRLCK: - if ((fp->f_flag & FWRITE) != 0) { - p->p_flag |= P_ADVLOCK; - error = VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &fl, flg); - } else + if ((fflag & FWRITE) == 0) { + (void)vnode_put(vp); error = EBADF; - break; + goto outdrop; + } + p->p_flag |= P_ADVLOCK; + error = VNOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &fl, flg, &context); + (void)vnode_put(vp); + goto outdrop; case F_UNLCK: - error = VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &fl, F_POSIX); - break; + error = VNOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &fl, + F_POSIX, &context); + (void)vnode_put(vp); + goto outdrop; default: + (void)vnode_put(vp); error = EINVAL; - break; + goto outdrop; } - break; case F_GETLK: - if (fp->f_type != DTYPE_VNODE) - return (EBADF); + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } vp = (struct vnode *)fp->f_data; + offset = fp->f_offset; + proc_fdunlock(p); + /* Copy in the lock structure */ - error = copyin((caddr_t)uap->arg, (caddr_t)&fl, sizeof (fl)); + error = copyin(argp, (caddr_t)&fl, sizeof (fl)); if (error) - break; - if (fl.l_whence == SEEK_CUR) - fl.l_start += fp->f_offset; - error = VOP_ADVLOCK(vp, (caddr_t)p, F_GETLK, &fl, F_POSIX); - if (error) - break; - error = copyout((caddr_t)&fl, (caddr_t)uap->arg, sizeof (fl)); - break; + goto outdrop; + + if ( (error = vnode_getwithref(vp)) == 0 ) { + if (fl.l_whence == SEEK_CUR) + fl.l_start += offset; + + error = VNOP_ADVLOCK(vp, (caddr_t)p, F_GETLK, &fl, F_POSIX, &context); + + (void)vnode_put(vp); + + if (error == 0) + error = copyout((caddr_t)&fl, argp, sizeof (fl)); + } + goto outdrop; + + case F_PREALLOCATE: { + fstore_t alloc_struct; /* structure for allocate command */ + u_int32_t alloc_flags = 0; + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } - case F_PREALLOCATE: - if (fp->f_type != DTYPE_VNODE) - return (EBADF); vp = (struct vnode *)fp->f_data; + proc_fdunlock(p); /* make sure that we have write permission */ if ((fp->f_flag & FWRITE) == 0) { error = EBADF; - break; + goto outdrop; } - error = copyin((caddr_t)uap->arg, (caddr_t)&alloc_struct, - sizeof (alloc_struct)); + error = copyin(argp, (caddr_t)&alloc_struct, sizeof (alloc_struct)); if (error) - break; + goto outdrop; /* now set the space allocated to 0 */ alloc_struct.fst_bytesalloc = 0; @@ -421,53 +592,55 @@ fcntl(p, uap, retval) switch (alloc_struct.fst_posmode) { case F_PEOFPOSMODE: - if (alloc_struct.fst_offset == 0) - alloc_flags |= ALLOCATEFROMPEOF; - else + if (alloc_struct.fst_offset != 0) { error = EINVAL; + goto outdrop; + } + + alloc_flags |= ALLOCATEFROMPEOF; break; case F_VOLPOSMODE: - if (alloc_struct.fst_offset > 0) - alloc_flags |= ALLOCATEFROMVOL; - else + if (alloc_struct.fst_offset <= 0) { error = EINVAL; + goto outdrop; + } + + alloc_flags |= ALLOCATEFROMVOL; break; - default: + default: { error = EINVAL; - break; + goto outdrop; + } } + if ( (error = vnode_getwithref(vp)) == 0 ) { + /* + * call allocate to get the space + */ + error = VNOP_ALLOCATE(vp,alloc_struct.fst_length,alloc_flags, + &alloc_struct.fst_bytesalloc, alloc_struct.fst_offset, + &context); + (void)vnode_put(vp); - if (error) - break; + error2 = copyout((caddr_t)&alloc_struct, argp, sizeof (alloc_struct)); - /* lock the vnode and call allocate to get the space */ - error = vn_lock(vp, LK_EXCLUSIVE|LK_RETRY, p); - if (error) - break; - error = VOP_ALLOCATE(vp,alloc_struct.fst_length,alloc_flags, - &alloc_struct.fst_bytesalloc, alloc_struct.fst_offset, - fp->f_cred, 
p); - VOP_UNLOCK(vp, 0, p); - - if (error2 = copyout((caddr_t)&alloc_struct, - (caddr_t)uap->arg, - sizeof (alloc_struct))) { - if (!error) + if (error == 0) error = error2; } - break; + goto outdrop; + } case F_SETSIZE: - if (fp->f_type != DTYPE_VNODE) - return (EBADF); - vp = (struct vnode *)fp->f_data; + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + proc_fdunlock(p); - error = copyin((caddr_t)uap->arg, (caddr_t)&offset, - sizeof (off_t)); + error = copyin(argp, (caddr_t)&offset, sizeof (off_t)); if (error) - break; + goto outdrop; /* * Make sure that we are root. Growing a file @@ -477,390 +650,613 @@ fcntl(p, uap, retval) if (!is_suser()) { error = EACCES; - break; + goto outdrop; } + vp = (struct vnode *)fp->f_data; - /* lock the vnode and call allocate to get the space */ - error = vn_lock(vp, LK_EXCLUSIVE|LK_RETRY, p); - if (error) - break; - error = VOP_TRUNCATE(vp,offset,IO_NOZEROFILL,fp->f_cred,p); - VOP_UNLOCK(vp,0,p); - break; + if ( (error = vnode_getwithref(vp)) == 0 ) { + /* + * set the file size + */ + error = vnode_setsize(vp, offset, IO_NOZEROFILL, &context); + + (void)vnode_put(vp); + } + goto outdrop; case F_RDAHEAD: - if (fp->f_type != DTYPE_VNODE) - return (EBADF); + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } vp = (struct vnode *)fp->f_data; + proc_fdunlock(p); - simple_lock(&vp->v_interlock); - if (uap->arg) - vp->v_flag &= ~VRAOFF; - else - vp->v_flag |= VRAOFF; - simple_unlock(&vp->v_interlock); - error = 0; - break; + if ( (error = vnode_getwithref(vp)) == 0) { + if (uap->arg) + vnode_clearnoreadahead(vp); + else + vnode_setnoreadahead(vp); + + (void)vnode_put(vp); + } + goto outdrop; case F_NOCACHE: - if (fp->f_type != DTYPE_VNODE) - return (EBADF); + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } vp = (struct vnode *)fp->f_data; + proc_fdunlock(p); - simple_lock(&vp->v_interlock); - if (uap->arg) - vp->v_flag |= VNOCACHE_DATA; - else - vp->v_flag &= ~VNOCACHE_DATA; - simple_unlock(&vp->v_interlock); - error = 0; - break; + if ( (error = vnode_getwithref(vp)) == 0 ) { + if (uap->arg) + vnode_setnocache(vp); + else + vnode_clearnocache(vp); - case F_RDADVISE: - if (fp->f_type != DTYPE_VNODE) - return (EBADF); - vp = (struct vnode *)fp->f_data; + (void)vnode_put(vp); + } + goto outdrop; - if (error = copyin((caddr_t)uap->arg, - (caddr_t)&ra_struct, sizeof (ra_struct))) - break; - error = VOP_IOCTL(vp, 1, (caddr_t)&ra_struct, 0, fp->f_cred, p); - break; + case F_RDADVISE: { + struct radvisory ra_struct; - case F_CHKCLEAN: - /* - * used by regression test to determine if - * all the dirty pages (via write) have been cleaned - * after a call to 'fsysnc'. 
- */ - if (fp->f_type != DTYPE_VNODE) - return (EBADF); + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } vp = (struct vnode *)fp->f_data; + proc_fdunlock(p); - error = VOP_IOCTL(vp, 5, 0, 0, fp->f_cred, p); - break; + if ( (error = copyin(argp, (caddr_t)&ra_struct, sizeof (ra_struct))) ) + goto outdrop; + if ( (error = vnode_getwithref(vp)) == 0 ) { + error = VNOP_IOCTL(vp, F_RDADVISE, (caddr_t)&ra_struct, 0, &context); + + (void)vnode_put(vp); + } + goto outdrop; + } case F_READBOOTSTRAP: - case F_WRITEBOOTSTRAP: - if (fp->f_type != DTYPE_VNODE) - return (EBADF); + case F_WRITEBOOTSTRAP: { + fbootstraptransfer_t fbt_struct; + user_fbootstraptransfer_t user_fbt_struct; + int sizeof_struct; + caddr_t boot_structp; + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } vp = (struct vnode *)fp->f_data; + proc_fdunlock(p); - error = copyin((caddr_t)uap->arg, (caddr_t)&fbt_struct, - sizeof (fbt_struct)); + if (IS_64BIT_PROCESS(p)) { + sizeof_struct = sizeof(user_fbt_struct); + boot_structp = (caddr_t) &user_fbt_struct; + } + else { + sizeof_struct = sizeof(fbt_struct); + boot_structp = (caddr_t) &fbt_struct; + } + error = copyin(argp, boot_structp, sizeof_struct); if (error) - break; - + goto outdrop; + if ( (error = vnode_getwithref(vp)) ) { + goto outdrop; + } if (uap->cmd == F_WRITEBOOTSTRAP) { - /* - * Make sure that we are root. Updating the - * bootstrap on a disk could be a security hole - */ + /* + * Make sure that we are root. Updating the + * bootstrap on a disk could be a security hole + */ if (!is_suser()) { + (void)vnode_put(vp); error = EACCES; - break; + goto outdrop; } } - - if (vp->v_tag != VT_HFS) /* XXX */ + if (strcmp(vnode_mount(vp)->mnt_vfsstat.f_fstypename, "hfs") != 0) { error = EINVAL; - else { - /* lock the vnode and call VOP_IOCTL to handle the I/O */ - error = vn_lock(vp, LK_EXCLUSIVE|LK_RETRY, p); - if (error) - break; - error = VOP_IOCTL(vp, (uap->cmd == F_WRITEBOOTSTRAP) ? 
3 : 2, - (caddr_t)&fbt_struct, 0, fp->f_cred, p); - VOP_UNLOCK(vp,0,p); + } else { + /* + * call vnop_ioctl to handle the I/O + */ + error = VNOP_IOCTL(vp, uap->cmd, boot_structp, 0, &context); } - break; + (void)vnode_put(vp); + goto outdrop; + } + case F_LOG2PHYS: { + struct log2phys l2p_struct; /* structure for allocate command */ - case F_LOG2PHYS: - if (fp->f_type != DTYPE_VNODE) - return (EBADF); + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } vp = (struct vnode *)fp->f_data; + proc_fdunlock(p); + if ( (error = vnode_getwithref(vp)) ) { + goto outdrop; + } + error = VNOP_OFFTOBLK(vp, fp->f_offset, &lbn); + if (error) { + (void)vnode_put(vp); + goto outdrop; + } + error = VNOP_BLKTOOFF(vp, lbn, &offset); + if (error) { + (void)vnode_put(vp); + goto outdrop; + } + devBlockSize = vfs_devblocksize(vnode_mount(vp)); + + error = VNOP_BLOCKMAP(vp, offset, devBlockSize, &bn, NULL, NULL, 0, &context); + + (void)vnode_put(vp); - error = vn_lock(vp, LK_EXCLUSIVE|LK_RETRY, p); - if (error) - break; - error = VOP_OFFTOBLK(vp, fp->f_offset, &lbn); - if (error) - break; - error = VOP_BLKTOOFF(vp, lbn, &offset); - if (error) - break; - error = VOP_BMAP(vp, lbn, &devvp, &bn, 0); - VOP_DEVBLOCKSIZE(devvp, &devBlockSize); - VOP_UNLOCK(vp, 0, p); if (!error) { l2p_struct.l2p_flags = 0; /* for now */ l2p_struct.l2p_contigbytes = 0; /* for now */ l2p_struct.l2p_devoffset = bn * devBlockSize; l2p_struct.l2p_devoffset += fp->f_offset - offset; - error = copyout((caddr_t)&l2p_struct, - (caddr_t)uap->arg, - sizeof (l2p_struct)); + error = copyout((caddr_t)&l2p_struct, argp, sizeof (l2p_struct)); + } + goto outdrop; } - break; - case F_GETPATH: { - char *pathbuf; - int len; - extern int vn_getpath(struct vnode *vp, char *pathbuf, int *len); + char *pathbufp; + int pathlen; - if (fp->f_type != DTYPE_VNODE) - return (EBADF); + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } vp = (struct vnode *)fp->f_data; + proc_fdunlock(p); - len = MAXPATHLEN; - MALLOC(pathbuf, char *, len, M_TEMP, M_WAITOK); + pathlen = MAXPATHLEN; + MALLOC(pathbufp, char *, pathlen, M_TEMP, M_WAITOK); + if (pathbufp == NULL) { + error = ENOMEM; + goto outdrop; + } + if ( (error = vnode_getwithref(vp)) == 0 ) { + error = vn_getpath(vp, pathbufp, &pathlen); + (void)vnode_put(vp); - error = vn_lock(vp, LK_EXCLUSIVE|LK_RETRY, p); - if (error) { - FREE(pathbuf, M_TEMP); - break; - } - error = vn_getpath(vp, pathbuf, &len); - if (error == 0) - error = copyout((caddr_t)pathbuf, (caddr_t)uap->arg, len); - VOP_UNLOCK(vp, 0, p); - FREE(pathbuf, M_TEMP); - break; + if (error == 0) + error = copyout((caddr_t)pathbufp, argp, pathlen); + } + FREE(pathbufp, M_TEMP); + goto outdrop; } - case F_FULLFSYNC: { - if (fp->f_type != DTYPE_VNODE) - return (EBADF); + case F_PATHPKG_CHECK: { + char *pathbufp; + size_t pathlen; + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } vp = (struct vnode *)fp->f_data; + proc_fdunlock(p); - error = vn_lock(vp, LK_EXCLUSIVE|LK_RETRY, p); - if (error) - break; + pathlen = MAXPATHLEN; + pathbufp = kalloc(MAXPATHLEN); + + if ( (error = copyinstr(argp, pathbufp, MAXPATHLEN, &pathlen)) == 0 ) { + if ( (error = vnode_getwithref(vp)) == 0 ) { + error = vn_path_package_check(vp, pathbufp, pathlen, retval); + + (void)vnode_put(vp); + } + } + kfree(pathbufp, MAXPATHLEN); + goto outdrop; + } + + case F_CHKCLEAN: // used by regression tests to see if all dirty pages got cleaned by fsync() + case F_FULLFSYNC: // fsync + flush the journal + DKIOCSYNCHRONIZECACHE + case F_FREEZE_FS: // 
freeze all other fs operations for the fs of this fd + case F_THAW_FS: { // thaw all frozen fs operations for the fs of this fd + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + vp = (struct vnode *)fp->f_data; + proc_fdunlock(p); - error = VOP_IOCTL(vp, 6, (caddr_t)NULL, 0, fp->f_cred, p); - VOP_UNLOCK(vp, 0, p); + if ( (error = vnode_getwithref(vp)) == 0 ) { + error = VNOP_IOCTL(vp, uap->cmd, (caddr_t)NULL, 0, &context); + + (void)vnode_put(vp); + } break; } default: - return (EINVAL); + if (uap->cmd < FCNTL_FS_SPECIFIC_BASE) { + error = EINVAL; + goto out; + } + + // if it's a fs-specific fcntl() then just pass it through + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + vp = (struct vnode *)fp->f_data; + proc_fdunlock(p); + + if ( (error = vnode_getwithref(vp)) == 0 ) { + error = VNOP_IOCTL(vp, uap->cmd, CAST_DOWN(caddr_t, argp), 0, &context); + + (void)vnode_put(vp); + } + break; + } - /* - * Fall thru to here for all vnode operations. - * We audit the path after the call to avoid - * triggering file table state changes during - * the audit pathname allocation. - */ - AUDIT_ARG(vnpath, vp, ARG_VNODE1); - return error; +outdrop: + AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1); + fp_drop(p, fd, fp, 0); + return(error); +out: + fp_drop(p, fd, fp, 1); + proc_fdunlock(p); + return(error); } /* * Common code for dup, dup2, and fcntl(F_DUPFD). */ int -finishdup(fdp, old, new, retval) - register struct filedesc *fdp; - register int old, new; - register_t *retval; +finishdup(struct proc * p, struct filedesc *fdp, int old, int new, register_t *retval) { - register struct file *fp; + struct fileproc *nfp; + struct fileproc *ofp; - if ((fp = fdp->fd_ofiles[old]) == NULL || + if ((ofp = fdp->fd_ofiles[old]) == NULL || (fdp->fd_ofileflags[old] & UF_RESERVED)) { _fdrelse(fdp, new); return (EBADF); } - fdp->fd_ofiles[new] = fp; + fg_ref(ofp); + proc_fdunlock(p); + + MALLOC_ZONE(nfp, struct fileproc *, sizeof(struct fileproc), M_FILEPROC, M_WAITOK); + bzero(nfp, sizeof(struct fileproc)); + + proc_fdlock(p); + nfp->f_flags = ofp->f_flags; + nfp->f_fglob = ofp->f_fglob; + nfp->f_iocount = 0; + + fdp->fd_ofiles[new] = nfp; fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE; - (void)fref(fp); if (new > fdp->fd_lastfile) fdp->fd_lastfile = new; *retval = new; return (0); } -/* - * Close a file descriptor. - */ -struct close_args { - int fd; -}; -/* ARGSUSED */ + int -close(p, uap, retval) - struct proc *p; - struct close_args *uap; - register_t *retval; +close(struct proc *p, struct close_args *uap, __unused register_t *retval) { + struct fileproc *fp; int fd = uap->fd; - register struct filedesc *fdp = p->p_fd; - register struct file *fp; + int error =0; AUDIT_SYSCLOSE(p, fd); - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL || - (fdp->fd_ofileflags[fd] & UF_RESERVED)) - return (EBADF); + + proc_fdlock(p); + + if ( (error = fp_lookup(p,fd,&fp, 1)) ) { + proc_fdunlock(p); + return(error); + } + + error = close_internal(p, fd, fp, CLOSEINT_LOCKED | CLOSEINT_WAITONCLOSE); + + proc_fdunlock(p); + + return(error); +} + + +/* + * Close a file descriptor. 
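+ * + * The work is done by close_internal() below; its flags word + * selects the behavior: CLOSEINT_LOCKED (caller already holds + * proc_fdlock), CLOSEINT_WAITONCLOSE (sleep out a close already + * in progress on this fp), CLOSEINT_NOFDRELSE (leave the fd slot + * reserved instead of calling _fdrelse), and CLOSEINT_NOFDNOREF + * (caller holds no fd reference, so close takes its own iocount).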
+ */ +int +close_internal(struct proc *p, int fd, struct fileproc *fp, int flags) +{ + struct filedesc *fdp = p->p_fd; + int error =0; + int locked = flags & CLOSEINT_LOCKED; + int waitonclose = flags & CLOSEINT_WAITONCLOSE; + int norelse = flags & CLOSEINT_NOFDRELSE; + int nofdref = flags & CLOSEINT_NOFDNOREF; + int slpstate = PRIBIO; + + if (!locked) + proc_fdlock(p); /* Keep people from using the filedesc while we are closing it */ fdp->fd_ofileflags[fd] |= UF_RESERVED; - - /* cancel all async IO requests that can be cancelled. */ - _aio_close( p, fd ); - if (fd < fdp->fd_knlistsize) + fdp->fd_ofileflags[fd] |= UF_CLOSING; + + + if ((waitonclose && ((fp->f_flags & FP_CLOSING) == FP_CLOSING))) { + if (nofdref == 0) + fp_drop(p, fd, fp, 1); + fp->f_flags |= FP_WAITCLOSE; + if (!locked) + slpstate |= PDROP; + msleep(&fp->f_flags, &p->p_fdmlock, slpstate, "close wait",0) ; + return(EBADF); + } + + fp->f_flags |= FP_CLOSING; + if (nofdref) + fp->f_iocount++; + + if ( (fp->f_flags & FP_AIOISSUED) || kauth_authorize_fileop_has_listeners() ) { + + proc_fdunlock(p); + + if ( (fp->f_type == DTYPE_VNODE) && kauth_authorize_fileop_has_listeners() ) { + /* + * call out to allow 3rd party notification of close. + * Ignore result of kauth_authorize_fileop call. + */ + if (vnode_getwithref((vnode_t)fp->f_data) == 0) { + u_int fileop_flags = 0; + if ((fp->f_flags & FP_WRITTEN) != 0) + fileop_flags |= KAUTH_FILEOP_CLOSE_MODIFIED; + kauth_authorize_fileop(fp->f_fglob->fg_cred, KAUTH_FILEOP_CLOSE, + (uintptr_t)fp->f_data, (uintptr_t)fileop_flags); + vnode_put((vnode_t)fp->f_data); + } + } + if (fp->f_flags & FP_AIOISSUED) + /* + * cancel all async IO requests that can be cancelled. + */ + _aio_close( p, fd ); + + proc_fdlock(p); + } + + if (fd < fdp->fd_knlistsize) knote_fdclose(p, fd); - _fdrelse(fdp, fd); - return (closef(fp, p)); + if (fp->f_flags & FP_WAITEVENT) + (void)waitevent_close(p, fp); + + if ((fp->f_flags & FP_INCHRREAD) == 0) + fileproc_drain(p, fp); + if (norelse == 0) + _fdrelse(fdp, fd); + error = closef_locked(fp, fp->f_fglob, p); + if ((fp->f_flags & FP_WAITCLOSE) == FP_WAITCLOSE) + wakeup(&fp->f_flags); + fp->f_flags &= ~(FP_WAITCLOSE | FP_CLOSING); + + if (!locked) + proc_fdunlock(p); + + FREE_ZONE(fp, sizeof *fp, M_FILEPROC); + return(error); } /* * Return status information about a file descriptor. + * + * XXX switch on node type is bogus; need a stat in struct fileops instead. 
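+ * (for instance, a hypothetical int (*fo_stat)(struct fileproc *, + * struct stat *, struct proc *) member would let each descriptor + * type supply its own stat method in place of the DTYPE_* switch + * in fstat1() below)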
*/ -struct fstat_args { - int fd; - struct stat *sb; -}; -/* ARGSUSED */ -int -fstat(p, uap, retval) - struct proc *p; - register struct fstat_args *uap; - register_t *retval; +static int +fstat1(struct proc *p, int fd, user_addr_t ub, user_addr_t xsecurity, user_addr_t xsecurity_size) { - int fd = uap->fd; - register struct filedesc *fdp = p->p_fd; - register struct file *fp; - struct stat ub; - int error; + struct fileproc *fp; + struct stat sb; + struct user_stat user_sb; + int error, my_size; + int funnel_state; + short type; + caddr_t data; + kauth_filesec_t fsec; + ssize_t xsecurity_bufsize; + int entrycount; + struct vfs_context context; - AUDIT_ARG(fd, uap->fd); - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL || - (fdp->fd_ofileflags[fd] & UF_RESERVED)) - return (EBADF); - switch (fp->f_type) { + + AUDIT_ARG(fd, fd); + + if ((error = fp_lookup(p, fd, &fp, 0)) != 0) + return(error); + type = fp->f_type; + data = fp->f_data; + fsec = KAUTH_FILESEC_NONE; + + switch (type) { case DTYPE_VNODE: - error = vn_stat((struct vnode *)fp->f_data, &ub, p); - if (error == 0) { - AUDIT_ARG(vnpath, (struct vnode *)fp->f_data, ARG_VNODE1); + context.vc_proc = current_proc(); + context.vc_ucred = kauth_cred_get(); + if ((error = vnode_getwithref((vnode_t)data)) == 0) { + /* + * If the caller has the file open, and is not requesting extended security, + * we are going to let them get the basic stat information. + */ + if (xsecurity == USER_ADDR_NULL) { + error = vn_stat_noauth((vnode_t)data, &sb, NULL, &context); + } else { + error = vn_stat((vnode_t)data, &sb, &fsec, &context); + } + + AUDIT_ARG(vnpath, (struct vnode *)data, ARG_VNODE1); + (void)vnode_put((vnode_t)data); } break; case DTYPE_SOCKET: - error = soo_stat((struct socket *)fp->f_data, &ub); + error = soo_stat((struct socket *)data, &sb); + break; + + case DTYPE_PIPE: + error = pipe_stat((void *)data, &sb); break; case DTYPE_PSXSHM: - error = pshm_stat((void *)fp->f_data, &ub); + error = pshm_stat((void *)data, &sb); break; case DTYPE_KQUEUE: - error = kqueue_stat(fp, &ub, p); - break; + funnel_state = thread_funnel_set(kernel_flock, TRUE); + error = kqueue_stat(fp, &sb, p); + thread_funnel_set(kernel_flock, funnel_state); + break; default: - panic("fstat"); - /*NOTREACHED*/ + error = EBADF; + goto out; + } + /* Zap spare fields */ + sb.st_lspare = 0; + sb.st_qspare[0] = 0LL; + sb.st_qspare[1] = 0LL; + if (error == 0) { + caddr_t sbp; + if (IS_64BIT_PROCESS(current_proc())) { + munge_stat(&sb, &user_sb); + my_size = sizeof(user_sb); + sbp = (caddr_t)&user_sb; + } + else { + my_size = sizeof(sb); + sbp = (caddr_t)&sb; + } + error = copyout(sbp, ub, my_size); } - if (error == 0) - error = copyout((caddr_t)&ub, (caddr_t)uap->sb, - sizeof (ub)); - return (error); -} - -#if COMPAT_43 -/* - * Return status information about a file descriptor. - */ -struct ofstat_args { - int fd; - struct ostat *sb; -}; -/* ARGSUSED */ -ofstat(p, uap, retval) - struct proc *p; - register struct ofstat_args *uap; - register_t *retval; -{ - int fd = uap->fd; - register struct filedesc *fdp = p->p_fd; - register struct file *fp; - struct stat ub; - struct ostat oub; - int error; - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL || - (fdp->fd_ofileflags[fd] & UF_RESERVED)) - return (EBADF); - switch (fp->f_type) { + /* caller wants extended security information? */ + if (xsecurity != USER_ADDR_NULL) { - case DTYPE_VNODE: - error = vn_stat((struct vnode *)fp->f_data, &ub, p); - break; + /* did we get any? 
*/ + if (fsec == KAUTH_FILESEC_NONE) { + if (susize(xsecurity_size, 0) != 0) { + error = EFAULT; + goto out; + } + } else { + /* find the user buffer size */ + xsecurity_bufsize = fusize(xsecurity_size); - case DTYPE_SOCKET: - error = soo_stat((struct socket *)fp->f_data, &ub); - break; + /* copy out the actual data size */ + if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) { + error = EFAULT; + goto out; + } - default: - panic("ofstat"); - /*NOTREACHED*/ + /* if the caller supplied enough room, copy out to it */ + if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) + error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec)); + } } - cvtstat(&ub, &oub); - if (error == 0) - error = copyout((caddr_t)&oub, (caddr_t)uap->sb, - sizeof (oub)); +out: + fp_drop(p, fd, fp, 0); + if (fsec != NULL) + kauth_filesec_free(fsec); return (error); } -#endif /* COMPAT_43 */ + +int +fstat_extended(struct proc *p, struct fstat_extended_args *uap, __unused register_t *retval) +{ + return(fstat1(p, uap->fd, uap->ub, uap->xsecurity, uap->xsecurity_size)); +} + +int +fstat(struct proc *p, register struct fstat_args *uap, __unused register_t *retval) +{ + return(fstat1(p, uap->fd, uap->ub, 0, 0)); +} /* * Return pathconf information about a file descriptor. */ -struct fpathconf_args { - int fd; - int name; -}; -/* ARGSUSED */ +int fpathconf(p, uap, retval) struct proc *p; register struct fpathconf_args *uap; register_t *retval; { int fd = uap->fd; - struct filedesc *fdp = p->p_fd; - struct file *fp; + struct fileproc *fp; struct vnode *vp; + struct vfs_context context; + int error = 0; + short type; + caddr_t data; + AUDIT_ARG(fd, uap->fd); - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL || - (fdp->fd_ofileflags[fd] & UF_RESERVED)) - return (EBADF); - switch (fp->f_type) { + if ( (error = fp_lookup(p, fd, &fp, 0)) ) + return(error); + type = fp->f_type; + data = fp->f_data; + + switch (type) { case DTYPE_SOCKET: - if (uap->name != _PC_PIPE_BUF) - return (EINVAL); + if (uap->name != _PC_PIPE_BUF) { + error = EINVAL; + goto out; + } *retval = PIPE_BUF; - return (0); + error = 0; + goto out; + + case DTYPE_PIPE: + *retval = PIPE_BUF; + error = 0; + goto out; case DTYPE_VNODE: - vp = (struct vnode *)fp->f_data; - AUDIT_ARG(vnpath, vp, ARG_VNODE1); + vp = (struct vnode *)data; + + if ( (error = vnode_getwithref(vp)) == 0) { + AUDIT_ARG(vnpath, vp, ARG_VNODE1); + + context.vc_proc = p; + context.vc_ucred = kauth_cred_get(); + + error = vn_pathconf(vp, uap->name, retval, &context); + + (void)vnode_put(vp); + } + goto out; - return (VOP_PATHCONF(vp, uap->name, retval)); + case DTYPE_PSXSHM: + case DTYPE_KQUEUE: + error = EINVAL; + goto out; default: - panic("fpathconf"); + panic("fpathconf (unrecognized - %d)", type); } /*NOTREACHED*/ +out: + fp_drop(p, fd, fp, 0); + return(error); } /* @@ -876,8 +1272,8 @@ fdalloc(p, want, result) { register struct filedesc *fdp = p->p_fd; register int i; - int lim, last, nfiles, oldnfiles; - struct file **newofiles, **ofiles; + int lim, last, numfiles, oldnfiles; + struct fileproc **newofiles, **ofiles; char *newofileflags, *ofileflags; /* @@ -911,19 +1307,24 @@ fdalloc(p, want, result) if (fdp->fd_nfiles >= lim) return (EMFILE); if (fdp->fd_nfiles < NDEXTENT) - nfiles = NDEXTENT; + numfiles = NDEXTENT; else - nfiles = 2 * fdp->fd_nfiles; + numfiles = 2 * fdp->fd_nfiles; /* Enforce lim */ - if (nfiles > lim) - nfiles = lim; - MALLOC_ZONE(newofiles, struct file **, - nfiles * OFILESIZE, M_OFILETABL, M_WAITOK); - if (fdp->fd_nfiles >= nfiles) 
{ - FREE_ZONE(newofiles, nfiles * OFILESIZE, M_OFILETABL); + if (numfiles > lim) + numfiles = lim; + proc_fdunlock(p); + MALLOC_ZONE(newofiles, struct fileproc **, + numfiles * OFILESIZE, M_OFILETABL, M_WAITOK); + proc_fdlock(p); + if (newofiles == NULL) { + return (ENOMEM); + } + if (fdp->fd_nfiles >= numfiles) { + FREE_ZONE(newofiles, numfiles * OFILESIZE, M_OFILETABL); continue; } - newofileflags = (char *) &newofiles[nfiles]; + newofileflags = (char *) &newofiles[numfiles]; /* * Copy the existing ofile and ofileflags arrays * and zero the new portion of each array. @@ -932,90 +1333,517 @@ fdalloc(p, want, result) (void) memcpy(newofiles, fdp->fd_ofiles, oldnfiles * sizeof *fdp->fd_ofiles); (void) memset(&newofiles[oldnfiles], 0, - (nfiles - oldnfiles) * sizeof *fdp->fd_ofiles); + (numfiles - oldnfiles) * sizeof *fdp->fd_ofiles); (void) memcpy(newofileflags, fdp->fd_ofileflags, oldnfiles * sizeof *fdp->fd_ofileflags); (void) memset(&newofileflags[oldnfiles], 0, - (nfiles - oldnfiles) * + (numfiles - oldnfiles) * sizeof *fdp->fd_ofileflags); ofiles = fdp->fd_ofiles; fdp->fd_ofiles = newofiles; fdp->fd_ofileflags = newofileflags; - fdp->fd_nfiles = nfiles; + fdp->fd_nfiles = numfiles; FREE_ZONE(ofiles, oldnfiles * OFILESIZE, M_OFILETABL); fdexpand++; } } -/* - * Check to see whether n user file descriptors - * are available to the process p. - */ +/* + * Check to see whether n user file descriptors + * are available to the process p. + */ +int +fdavail(p, n) + struct proc *p; + int n; +{ + struct filedesc *fdp = p->p_fd; + struct fileproc **fpp; + char *flags; + int i, lim; + + lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles); + if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) + return (1); + fpp = &fdp->fd_ofiles[fdp->fd_freefile]; + flags = &fdp->fd_ofileflags[fdp->fd_freefile]; + for (i = fdp->fd_nfiles - fdp->fd_freefile; --i >= 0; fpp++, flags++) + if (*fpp == NULL && !(*flags & UF_RESERVED) && --n <= 0) + return (1); + return (0); +} + +void +fdrelse(p, fd) + struct proc *p; + int fd; +{ + _fdrelse(p->p_fd, fd); +} + +void +fddrop(p, fd) + struct proc *p; + int fd; +{ + struct filedesc *fdp = p->p_fd; + struct fileproc *fp; + + if (fd < fdp->fd_freefile) + fdp->fd_freefile = fd; +#if DIAGNOSTIC + if (fd > fdp->fd_lastfile) + panic("fdrelse: fd_lastfile inconsistent"); +#endif + fp = fdp->fd_ofiles[fd]; + fdp->fd_ofiles[fd] = NULL; + fdp->fd_ofileflags[fd] = 0; + + while ((fd = fdp->fd_lastfile) > 0 && + fdp->fd_ofiles[fd] == NULL && + !(fdp->fd_ofileflags[fd] & UF_RESERVED)) + fdp->fd_lastfile--; + FREE_ZONE(fp, sizeof *fp, M_FILEPROC); +} + + +int +fdgetf_noref(p, fd, resultfp) + struct proc *p; + int fd; + struct fileproc **resultfp; +{ + struct filedesc *fdp = p->p_fd; + struct fileproc *fp; + + if (fd < 0 || fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL || + (fdp->fd_ofileflags[fd] & UF_RESERVED)) { + return (EBADF); + } + if (resultfp) + *resultfp = fp; + return (0); +} + + +/* should be called only when proc_fdlock is held */ +void +fp_setflags(proc_t p, struct fileproc * fp, int flags) +{ + proc_fdlock(p); + fp->f_flags |= flags; + proc_fdunlock(p); +} + +void +fp_clearflags(proc_t p, struct fileproc * fp, int flags) +{ + + proc_fdlock(p); + if (fp) + fp->f_flags &= ~flags; + proc_fdunlock(p); +} + +int +fp_getfvp(p, fd, resultfp, resultvp) + struct proc *p; + int fd; + struct fileproc **resultfp; + struct vnode **resultvp; +{ + struct filedesc *fdp = p->p_fd; + struct fileproc *fp; + + proc_fdlock(p); + if (fd < 0 || fd >= fdp->fd_nfiles || + 
(fp = fdp->fd_ofiles[fd]) == NULL || + (fdp->fd_ofileflags[fd] & UF_RESERVED)) { + proc_fdunlock(p); + return (EBADF); + } + if (fp->f_type != DTYPE_VNODE) { + proc_fdunlock(p); + return(ENOTSUP); + } + fp->f_iocount++; + + if (resultfp) + *resultfp = fp; + if (resultvp) + *resultvp = (struct vnode *)fp->f_data; + proc_fdunlock(p); + + return (0); +} + + +/* + * Returns: EBADF The file descriptor is invalid + * EOPNOTSUPP The file descriptor is not a socket + * 0 Success + * + * Notes: EOPNOTSUPP should probably be ENOTSOCK; this function is only + * ever called from accept1(). + */ +int +fp_getfsock(p, fd, resultfp, results) + struct proc *p; + int fd; + struct fileproc **resultfp; + struct socket **results; +{ + struct filedesc *fdp = p->p_fd; + struct fileproc *fp; + + proc_fdlock(p); + if (fd < 0 || fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL || + (fdp->fd_ofileflags[fd] & UF_RESERVED)) { + proc_fdunlock(p); + return (EBADF); + } + if (fp->f_type != DTYPE_SOCKET) { + proc_fdunlock(p); + return(EOPNOTSUPP); + } + fp->f_iocount++; + + if (resultfp) + *resultfp = fp; + if (results) + *results = (struct socket *)fp->f_data; + proc_fdunlock(p); + + return (0); +} + + +int +fp_getfkq(p, fd, resultfp, resultkq) + struct proc *p; + int fd; + struct fileproc **resultfp; + struct kqueue **resultkq; +{ + struct filedesc *fdp = p->p_fd; + struct fileproc *fp; + + proc_fdlock(p); + if ( fd < 0 || fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL || + (fdp->fd_ofileflags[fd] & UF_RESERVED)) { + proc_fdunlock(p); + return (EBADF); + } + if (fp->f_type != DTYPE_KQUEUE) { + proc_fdunlock(p); + return(EBADF); + } + fp->f_iocount++; + + if (resultfp) + *resultfp = fp; + if (resultkq) + *resultkq = (struct kqueue *)fp->f_data; + proc_fdunlock(p); + + return (0); +} + +int +fp_getfpshm(p, fd, resultfp, resultpshm) + struct proc *p; + int fd; + struct fileproc **resultfp; + struct pshmnode **resultpshm; +{ + struct filedesc *fdp = p->p_fd; + struct fileproc *fp; + + proc_fdlock(p); + if (fd < 0 || fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL || + (fdp->fd_ofileflags[fd] & UF_RESERVED)) { + proc_fdunlock(p); + return (EBADF); + } + if (fp->f_type != DTYPE_PSXSHM) { + + proc_fdunlock(p); + return(EBADF); + } + fp->f_iocount++; + + if (resultfp) + *resultfp = fp; + if (resultpshm) + *resultpshm = (struct pshmnode *)fp->f_data; + proc_fdunlock(p); + + return (0); +} + + +int +fp_getfpsem(p, fd, resultfp, resultpsem) + struct proc *p; + int fd; + struct fileproc **resultfp; + struct psemnode **resultpsem; +{ + struct filedesc *fdp = p->p_fd; + struct fileproc *fp; + + proc_fdlock(p); + if (fd < 0 || fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL || + (fdp->fd_ofileflags[fd] & UF_RESERVED)) { + proc_fdunlock(p); + return (EBADF); + } + if (fp->f_type != DTYPE_PSXSEM) { + proc_fdunlock(p); + return(EBADF); + } + fp->f_iocount++; + + if (resultfp) + *resultfp = fp; + if (resultpsem) + *resultpsem = (struct psemnode *)fp->f_data; + proc_fdunlock(p); + + return (0); +} +int +fp_lookup(p, fd, resultfp, locked) + struct proc *p; + int fd; + struct fileproc **resultfp; + int locked; +{ + struct filedesc *fdp = p->p_fd; + struct fileproc *fp; + + if (!locked) + proc_fdlock(p); + if (fd < 0 || fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL || + (fdp->fd_ofileflags[fd] & UF_RESERVED)) { + if (!locked) + proc_fdunlock(p); + return (EBADF); + } + fp->f_iocount++; + + if (resultfp) + *resultfp = fp; + if (!locked) + proc_fdunlock(p); + + return (0); +} + +int 
+fp_drop_written(proc_t p, int fd, struct fileproc *fp) +{ + int error; + + proc_fdlock(p); + + fp->f_flags |= FP_WRITTEN; + + error = fp_drop(p, fd, fp, 1); + + proc_fdunlock(p); + + return (error); +} + + +int +fp_drop_event(proc_t p, int fd, struct fileproc *fp) +{ + int error; + + proc_fdlock(p); + + fp->f_flags |= FP_WAITEVENT; + + error = fp_drop(p, fd, fp, 1); + + proc_fdunlock(p); + + return (error); +} + int -fdavail(p, n) +fp_drop(p, fd, fp, locked) struct proc *p; - register int n; + int fd; + struct fileproc *fp; + int locked; { - register struct filedesc *fdp = p->p_fd; - register struct file **fpp; - register char *flags; - register int i, lim; + struct filedesc *fdp = p->p_fd; - lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles); - if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) - return (1); - fpp = &fdp->fd_ofiles[fdp->fd_freefile]; - flags = &fdp->fd_ofileflags[fdp->fd_freefile]; - for (i = fdp->fd_nfiles - fdp->fd_freefile; --i >= 0; fpp++, flags++) - if (*fpp == NULL && !(*flags & UF_RESERVED) && --n <= 0) - return (1); + if (!locked) + proc_fdlock(p); + if ((fp == FILEPROC_NULL) && (fd < 0 || fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL || + ((fdp->fd_ofileflags[fd] & UF_RESERVED) && + !(fdp->fd_ofileflags[fd] & UF_CLOSING)))) { + if (!locked) + proc_fdunlock(p); + return (EBADF); + } + fp->f_iocount--; + + if (p->p_fpdrainwait && fp->f_iocount == 0) { + p->p_fpdrainwait = 0; + wakeup(&p->p_fpdrainwait); + } + if (!locked) + proc_fdunlock(p); + return (0); } -void -fdrelse(p, fd) - struct proc *p; - int fd; +int +file_vnode(int fd, struct vnode **vpp) { - _fdrelse(p->p_fd, fd); + struct proc * p = current_proc(); + struct fileproc *fp; + int error; + + proc_fdlock(p); + if ( (error = fp_lookup(p, fd, &fp, 1)) ) { + proc_fdunlock(p); + return(error); + } + if (fp->f_type != DTYPE_VNODE) { + fp_drop(p, fd, fp,1); + proc_fdunlock(p); + return(EINVAL); + } + *vpp = (struct vnode *)fp->f_data; + proc_fdunlock(p); + + return(0); } + int -fdgetf(p, fd, resultfp) - register struct proc *p; - register int fd; - struct file **resultfp; +file_socket(int fd, struct socket **sp) { - register struct filedesc *fdp = p->p_fd; - struct file *fp; + struct proc * p = current_proc(); + struct fileproc *fp; + int error; + + proc_fdlock(p); + if ( (error = fp_lookup(p, fd, &fp, 1)) ) { + proc_fdunlock(p); + return(error); + } + if (fp->f_type != DTYPE_SOCKET) { + fp_drop(p, fd, fp,1); + proc_fdunlock(p); + return(ENOTSOCK); + } + *sp = (struct socket *)fp->f_data; + proc_fdunlock(p); - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL || - (fdp->fd_ofileflags[fd] & UF_RESERVED)) + return(0); +} + +int +file_flags(int fd, int * flags) +{ + + struct proc * p = current_proc(); + struct fileproc *fp; + int error; + + proc_fdlock(p); + if ( (error = fp_lookup(p, fd, &fp, 1)) ) { + proc_fdunlock(p); + return(error); + } + *flags = (int)fp->f_flag; + fp_drop(p, fd, fp,1); + proc_fdunlock(p); + + return(0); +} + + +int +file_drop(int fd) +{ + struct fileproc *fp; + struct proc *p = current_proc(); + + proc_fdlock(p); + if (fd < 0 || fd >= p->p_fd->fd_nfiles || + (fp = p->p_fd->fd_ofiles[fd]) == NULL || + ((p->p_fd->fd_ofileflags[fd] & UF_RESERVED) && + !(p->p_fd->fd_ofileflags[fd] & UF_CLOSING))) { + proc_fdunlock(p); return (EBADF); + } + fp->f_iocount --; + + if (p->p_fpdrainwait && fp->f_iocount == 0) { + p->p_fpdrainwait = 0; + wakeup(&p->p_fpdrainwait); + } + proc_fdunlock(p); + return(0); + - if (resultfp) - *resultfp = fp; - return (0); } 
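/*
 * A minimal usage sketch (illustration only, not part of this patch) of
 * the file_vnode()/file_drop() pairing added above: file_vnode() leaves
 * an iocount on the fd's fileproc, which is what holds off a concurrent
 * close (fileproc_drain()) until file_drop() releases it.  The consumer
 * "example_fd_fsync" is hypothetical; VNOP_FSYNC and MNT_WAIT are
 * assumed from elsewhere in xnu.
 */
static int
example_fd_fsync(struct proc *p, int fd)
{
	struct vfs_context context;
	struct vnode *vp;
	int error;

	if ((error = file_vnode(fd, &vp)))
		return (error);
	/* the fileproc iocount does not pin the vnode; take a vnode iocount too */
	if ((error = vnode_getwithref(vp)) == 0) {
		context.vc_proc = p;
		context.vc_ucred = kauth_cred_get();
		error = VNOP_FSYNC(vp, MNT_WAIT, &context);
		(void)vnode_put(vp);
	}
	(void)file_drop(fd);
	return (error);
}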
+int +falloc(p, resultfp, resultfd) + struct proc *p; + struct fileproc **resultfp; + int *resultfd; +{ + int error; + + proc_fdlock(p); + error = falloc_locked(p, resultfp, resultfd, 1); + proc_fdunlock(p); + + return(error); +} /* * Create a new open file structure and allocate * a file descriptor for the process that refers to it. */ int -falloc(p, resultfp, resultfd) - register struct proc *p; - struct file **resultfp; +falloc_locked(p, resultfp, resultfd, locked) + struct proc *p; + struct fileproc **resultfp; int *resultfd; + int locked; { - register struct file *fp, *fq; - int error, i; - - if (error = fdalloc(p, 0, &i)) + struct fileproc *fp, *fq; + struct fileglob *fg; + int error, nfd; + + if (!locked) + proc_fdlock(p); + if ( (error = fdalloc(p, 0, &nfd)) ) { + if (!locked) + proc_fdunlock(p); return (error); + } if (nfiles >= maxfiles) { + if (!locked) + proc_fdunlock(p); tablefull("file"); return (ENFILE); } @@ -1025,22 +1853,43 @@ falloc(p, resultfp, resultfd) * of open files at that point, otherwise put it at the front of * the list of open files. */ + proc_fdunlock(p); + + MALLOC_ZONE(fp, struct fileproc *, sizeof(struct fileproc), M_FILEPROC, M_WAITOK); + MALLOC_ZONE(fg, struct fileglob *, sizeof(struct fileglob), M_FILEGLOB, M_WAITOK); + bzero(fp, sizeof(struct fileproc)); + bzero(fg, sizeof(struct fileglob)); + lck_mtx_init(&fg->fg_lock, file_lck_grp, file_lck_attr); + + fp->f_iocount = 1; + fg->fg_count = 1; + fp->f_fglob = fg; + + proc_fdlock(p); + + fp->f_cred = kauth_cred_proc_ref(p); + + lck_mtx_lock(file_flist_lock); + nfiles++; - MALLOC_ZONE(fp, struct file *, sizeof(struct file), M_FILE, M_WAITOK); - bzero(fp, sizeof(struct file)); - p->p_fd->fd_ofiles[i] = fp; - fp->f_count = 1; - fp->f_cred = p->p_ucred; - crhold(fp->f_cred); + + if ( (fq = p->p_fd->fd_ofiles[0]) ) { + LIST_INSERT_AFTER(fq->f_fglob, fg, f_list); + } else { + LIST_INSERT_HEAD(&filehead, fg, f_list); + } + lck_mtx_unlock(file_flist_lock); + + p->p_fd->fd_ofiles[nfd] = fp; + + if (!locked) + proc_fdunlock(p); + + if (resultfp) *resultfp = fp; if (resultfd) - *resultfd = i; - if (fq = p->p_fd->fd_ofiles[0]) { - LIST_INSERT_AFTER(fq, fp, f_list); - } else { - LIST_INSERT_HEAD(&filehead, fp, f_list); - } + *resultfd = nfd; + return (0); } @@ -1048,38 +1897,42 @@ falloc(p, resultfp, resultfd) * Free a file structure.
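 * (what is freed here is the shared fileglob; the per-process * fileproc that referenced it is freed separately by its caller)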
*/ void -ffree(fp) - register struct file *fp; +fg_free(fg) + struct fileglob *fg; { - register struct file *fq; - struct ucred *cred; + kauth_cred_t cred; + + lck_mtx_lock(file_flist_lock); + LIST_REMOVE(fg, f_list); + nfiles--; + lck_mtx_unlock(file_flist_lock); - LIST_REMOVE(fp, f_list); - cred = fp->f_cred; + cred = fg->fg_cred; if (cred != NOCRED) { - fp->f_cred = NOCRED; - crfree(cred); + fg->fg_cred = NOCRED; + kauth_cred_rele(cred); } + lck_mtx_destroy(&fg->fg_lock, file_lck_grp); - nfiles--; - memset(fp, 0xff, sizeof *fp); - fp->f_count = (short)0xffff; - - FREE_ZONE(fp, sizeof *fp, M_FILE); + FREE_ZONE(fg, sizeof *fg, M_FILEGLOB); } void fdexec(p) struct proc *p; { - register struct filedesc *fdp = p->p_fd; - register int i = fdp->fd_lastfile; - register struct file **fpp = &fdp->fd_ofiles[i]; - register char *flags = &fdp->fd_ofileflags[i]; + struct filedesc *fdp = p->p_fd; + int i = fdp->fd_lastfile; + struct fileproc **fpp = &fdp->fd_ofiles[i]; + char *flags = &fdp->fd_ofileflags[i]; + int funnel_state; + + funnel_state = thread_funnel_set(kernel_flock, FALSE); + proc_fdlock(p); while (i >= 0) { if ((*flags & (UF_RESERVED|UF_EXCLOSE)) == UF_EXCLOSE) { - register struct file *fp = *fpp; + struct fileproc *fp = *fpp; if (i < fdp->fd_knlistsize) knote_fdclose(p, i); @@ -1087,13 +1940,14 @@ fdexec(p) *fpp = NULL; *flags = 0; if (i == fdp->fd_lastfile && i > 0) fdp->fd_lastfile--; - closef(fp, p); + closef_locked(fp, fp->f_fglob, p); + FREE_ZONE(fp, sizeof *fp, M_FILEPROC); } - else - *flags &= ~UF_MAPPED; i--; fpp--; flags--; } + proc_fdunlock(p); + thread_funnel_set(kernel_flock, funnel_state); } /* @@ -1103,15 +1957,74 @@ struct filedesc * fdcopy(p) struct proc *p; { - register struct filedesc *newfdp, *fdp = p->p_fd; - register int i; + struct filedesc *newfdp, *fdp = p->p_fd; + int i; + struct fileproc *ofp, *fp; + vnode_t v_dir; MALLOC_ZONE(newfdp, struct filedesc *, sizeof *newfdp, M_FILEDESC, M_WAITOK); + if (newfdp == NULL) + return(NULL); + + proc_fdlock(p); + + /* + * the FD_CHROOT flag will be inherited via this copy + */ (void) memcpy(newfdp, fdp, sizeof *newfdp); - VREF(newfdp->fd_cdir); - if (newfdp->fd_rdir) - VREF(newfdp->fd_rdir); + + /* + * for both fd_cdir and fd_rdir make sure we get + * a valid reference... if we can't, then + * set the pointer(s) to NULL in the child... this + * will keep us from using a non-referenced vp + * and allows us to do the vnode_rele only on + * a properly referenced vp + */ + if ( (v_dir = newfdp->fd_cdir) ) { + if (vnode_getwithref(v_dir) == 0) { + if ( (vnode_ref(v_dir)) ) + newfdp->fd_cdir = NULL; + vnode_put(v_dir); + } else + newfdp->fd_cdir = NULL; + } + if (newfdp->fd_cdir == NULL && fdp->fd_cdir) { + /* + * we couldn't get a new reference on + * the current working directory being + * inherited... we might as well drop + * our reference from the parent also + * since the vnode has gone DEAD making + * it useless... by dropping it we'll + * be that much closer to recycling it + */ + vnode_rele(fdp->fd_cdir); + fdp->fd_cdir = NULL; + } + + if ( (v_dir = newfdp->fd_rdir) ) { + if (vnode_getwithref(v_dir) == 0) { + if ( (vnode_ref(v_dir)) ) + newfdp->fd_rdir = NULL; + vnode_put(v_dir); + } else + newfdp->fd_rdir = NULL; + } + if (newfdp->fd_rdir == NULL && fdp->fd_rdir) { + /* + * we couldn't get a new reference on + * the root directory being + * inherited... we might as well drop + * our reference from the parent also + * since the vnode has gone DEAD making + * it useless...
by dropping it we'll + * be that much closer to recycling it + */ + vnode_rele(fdp->fd_rdir); + fdp->fd_rdir = NULL; + } newfdp->fd_refcnt = 1; /* @@ -1132,13 +2045,27 @@ fdcopy(p) while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2) i /= 2; } - MALLOC_ZONE(newfdp->fd_ofiles, struct file **, + proc_fdunlock(p); + + MALLOC_ZONE(newfdp->fd_ofiles, struct fileproc **, i * OFILESIZE, M_OFILETABL, M_WAITOK); + if (newfdp->fd_ofiles == NULL) { + if (newfdp->fd_cdir) + vnode_rele(newfdp->fd_cdir); + if (newfdp->fd_rdir) + vnode_rele(newfdp->fd_rdir); + + FREE_ZONE(newfdp, sizeof *newfdp, M_FILEDESC); + return(NULL); + } + proc_fdlock(p); + newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i]; newfdp->fd_nfiles = i; + if (fdp->fd_nfiles > 0) { - register struct file **fpp; - register char *flags; + struct fileproc **fpp; + char *flags; (void) memcpy(newfdp->fd_ofiles, fdp->fd_ofiles, i * sizeof *fdp->fd_ofiles); @@ -1164,12 +2091,19 @@ fdcopy(p) newfdp->fd_knhash = NULL; newfdp->fd_knhashmask = 0; } - fpp = newfdp->fd_ofiles; flags = newfdp->fd_ofileflags; + for (i = newfdp->fd_lastfile; i-- >= 0; fpp++, flags++) - if (*fpp != NULL && !(*flags & UF_RESERVED)) { - (void)fref(*fpp); + if ((ofp = *fpp) != NULL && !(*flags & UF_RESERVED)) { + MALLOC_ZONE(fp, struct fileproc *, sizeof(struct fileproc), M_FILEPROC, M_WAITOK); + bzero(fp, sizeof(struct fileproc)); + fp->f_flags = ofp->f_flags; + //fp->f_iocount = ofp->f_iocount; + fp->f_iocount = 0; + fp->f_fglob = ofp->f_fglob; + (void)fg_ref(fp); + *fpp = fp; } else { *fpp = NULL; *flags = 0; @@ -1177,6 +2111,7 @@ fdcopy(p) } else (void) memset(newfdp->fd_ofiles, 0, i * OFILESIZE); + proc_fdunlock(p); return (newfdp); } @@ -1188,63 +2123,59 @@ fdfree(p) struct proc *p; { struct filedesc *fdp; - struct file *fp; + struct fileproc *fp; int i; - struct vnode *tvp; + + proc_fdlock(p); /* Certain daemons might not have file descriptors */ - if ((fdp = p->p_fd) == NULL) - return; + fdp = p->p_fd; - if (--fdp->fd_refcnt > 0) + if ((fdp == NULL) || (--fdp->fd_refcnt > 0)) { + proc_fdunlock(p); return; + } + if (fdp->fd_refcnt == 0xffff) + panic("fdfree: bad fd_refcnt"); /* Last reference: the structure can't change out from under us */ - if (fdp->fd_nfiles > 0) { - for (i = fdp->fd_lastfile; i >= 0; i--) -#if 1 /* WORKAROUND */ - /* - * Merlot: need to remove the bogus f_data check - * from the following "if" statement. It's there - * because of the network/kernel funnel race on a - * close of a socket vs. fdfree on exit. See - * Radar rdar://problem/3365650 for details, but - * the sort version is the commment before the "if" - * above is wrong under certain circumstances. - * - * We have to do this twice, in case knote_fdclose() - * results in a block. - * - * This works because an fdfree() will set all fields - * in the struct file to -1. - */ - if ((fp = fdp->fd_ofiles[i]) != NULL && - fp->f_data != (caddr_t)-1) { - if (i < fdp->fd_knlistsize) - knote_fdclose(p, i); - if (fp->f_data != (caddr_t)-1) - (void) closef(fp, p); - } -#else /* !WORKAROUND */ + + if (fdp->fd_nfiles > 0 && fdp->fd_ofiles) { + for (i = fdp->fd_lastfile; i >= 0; i--) { if ((fp = fdp->fd_ofiles[i]) != NULL) { + + if (fdp->fd_ofileflags[i] & UF_RESERVED) + panic("fdfree: found fp with UF_RESERVED\n"); + + /* closef drops the iocount ...
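+ * so, for a descriptor parked in a character-device read + * (FP_INCHRREAD), take the extra iocount that fileproc_drain() + * will drop as the one the close holds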
*/ + if ((fp->f_flags & FP_INCHRREAD) != 0) + fp->f_iocount++; + fdp->fd_ofiles[i] = NULL; + fdp->fd_ofileflags[i] |= UF_RESERVED; + if (i < fdp->fd_knlistsize) knote_fdclose(p, i); - (void) closef(fp, p); + if (fp->f_flags & FP_WAITEVENT) + (void)waitevent_close(p, fp); + (void) closef_locked(fp, fp->f_fglob, p); + FREE_ZONE(fp, sizeof *fp, M_FILEPROC); } -#endif /* !WORKAROUND */ - FREE_ZONE(fdp->fd_ofiles, - fdp->fd_nfiles * OFILESIZE, M_OFILETABL); - } + } + FREE_ZONE(fdp->fd_ofiles, fdp->fd_nfiles * OFILESIZE, M_OFILETABL); + fdp->fd_ofiles = NULL; + fdp->fd_nfiles = 0; + } - tvp = fdp->fd_cdir; - fdp->fd_cdir = NULL; - vrele(tvp); + proc_fdunlock(p); + + if (fdp->fd_cdir) + vnode_rele(fdp->fd_cdir); + if (fdp->fd_rdir) + vnode_rele(fdp->fd_rdir); - if (fdp->fd_rdir) { - tvp = fdp->fd_rdir; - fdp->fd_rdir = NULL; - vrele(tvp); - } + proc_fdlock(p); + p->p_fd = NULL; + proc_fdunlock(p); if (fdp->fd_knlist) FREE(fdp->fd_knlist, M_KQUEUE); @@ -1252,39 +2183,60 @@ fdfree(p) FREE(fdp->fd_knhash, M_KQUEUE); FREE_ZONE(fdp, sizeof *fdp, M_FILEDESC); - - // XXXdbg - { - void clean_up_fmod_watch(struct proc *p); - clean_up_fmod_watch(p); - } } static int -closef_finish(fp, p) - register struct file *fp; - register struct proc *p; +closef_finish(fp, fg, p) + struct fileproc *fp; + struct fileglob *fg; + struct proc *p; { struct vnode *vp; struct flock lf; int error; + struct vfs_context context; - if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) { + if ((fg->fg_flag & FHASLOCK) && fg->fg_type == DTYPE_VNODE) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; - vp = (struct vnode *)fp->f_data; - (void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); + vp = (struct vnode *)fg->fg_data; + context.vc_proc = p; + context.vc_ucred = fg->fg_cred; + + (void) VNOP_ADVLOCK(vp, (caddr_t)fg, F_UNLCK, &lf, F_FLOCK, &context); } - if (fp->f_ops) - error = fo_close(fp, p); + if (fg->fg_ops) + error = fo_close(fg, p); else error = 0; - ffree(fp); + + if (((fp != (struct fileproc *)0) && ((fp->f_flags & FP_INCHRREAD) != 0))) { + proc_fdlock(p); + if ( ((fp->f_flags & FP_INCHRREAD) != 0) ) { + fileproc_drain(p, fp); + } + proc_fdunlock(p); + } + fg_free(fg); + return (error); } +int +closef(fg, p) + struct fileglob *fg; + struct proc *p; +{ + int error; + + proc_fdlock(p); + error = closef_locked((struct fileproc *)0, fg, p); + proc_fdunlock(p); + + return(error); +} /* * Internal form of close. * Decrement reference count on file structure. @@ -1292,16 +2244,19 @@ closef_finish(fp, p) * that was being passed in a message. */ int -closef(fp, p) - register struct file *fp; - register struct proc *p; +closef_locked(fp, fg, p) + struct fileproc *fp; + struct fileglob *fg; + struct proc *p; { struct vnode *vp; struct flock lf; + struct vfs_context context; int error; - if (fp == NULL) + if (fg == NULL) { return (0); + } /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. This is handled by setting @@ -1310,70 +2265,140 @@ closef(fp, p) * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor. 
*/ - if (p && (p->p_flag & P_ADVLOCK) && fp->f_type == DTYPE_VNODE) { + if (p && (p->p_flag & P_ADVLOCK) && fg->fg_type == DTYPE_VNODE) { + proc_fdunlock(p); + lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; - vp = (struct vnode *)fp->f_data; - (void) VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_POSIX); + vp = (struct vnode *)fg->fg_data; + + if ( (error = vnode_getwithref(vp)) == 0 ) { + context.vc_proc = p; + context.vc_ucred = fg->fg_cred; + (void) VNOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_POSIX, &context); + + (void)vnode_put(vp); + } + proc_fdlock(p); } - if (frele_internal(fp) > 0) + lck_mtx_lock(&fg->fg_lock); + fg->fg_count--; + + if (fg->fg_count > 0) { + lck_mtx_unlock(&fg->fg_lock); return (0); - return(closef_finish(fp, p)); + } + if (fg->fg_count != 0) + panic("fg: being freed with bad fg_count (%d)", fg->fg_count); + + if (fp && (fp->f_flags & FP_WRITTEN)) + fg->fg_flag |= FWASWRITTEN; + + fg->fg_lflags |= FG_TERM; + lck_mtx_unlock(&fg->fg_lock); + + proc_fdunlock(p); + error = closef_finish(fp, fg, p); + proc_fdlock(p); + + return(error); +} + + +extern int selwait; +void +fileproc_drain(struct proc *p, struct fileproc * fp) +{ + fp->f_iocount--; /* (the one the close holds) */ + + while (fp->f_iocount) { + if (((fp->f_flags & FP_INSELECT)== FP_INSELECT)) { + wait_queue_wakeup_all((wait_queue_t)fp->f_waddr, &selwait, THREAD_INTERRUPTED); + } else { + if (fp->f_fglob->fg_ops->fo_drain) { + (*fp->f_fglob->fg_ops->fo_drain)(fp, p); + } + } + p->p_fpdrainwait = 1; + + msleep(&p->p_fpdrainwait, &p->p_fdmlock, PRIBIO, "fpdrain",0); + + //panic("successful wait after drain\n"); + } +} + +int +fp_free(struct proc * p, int fd, struct fileproc * fp) +{ + proc_fdlock(p); + fdrelse(p, fd); + proc_fdunlock(p); + + fg_free(fp->f_fglob); + FREE_ZONE(fp, sizeof *fp, M_FILEPROC); } + /* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
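 * e.g. flock(fd, LOCK_EX | LOCK_NB) becomes VNOP_ADVLOCK(vp, * (caddr_t)fp->f_fglob, F_SETLK, &lf, F_FLOCK, &context) with * lf.l_type = F_WRLCK and no F_WAIT.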
*/ -struct flock_args { - int fd; - int how; -}; -/* ARGSUSED */ int -flock(p, uap, retval) - struct proc *p; - register struct flock_args *uap; - register_t *retval; +flock(struct proc *p, register struct flock_args *uap, __unused register_t *retval) { int fd = uap->fd; int how = uap->how; - register struct filedesc *fdp = p->p_fd; - register struct file *fp; + struct fileproc *fp; struct vnode *vp; struct flock lf; + struct vfs_context context; + int error=0; AUDIT_ARG(fd, uap->fd); - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL || - (fdp->fd_ofileflags[fd] & UF_RESERVED)) - return (EBADF); - if (fp->f_type != DTYPE_VNODE) - return (EOPNOTSUPP); - vp = (struct vnode *)fp->f_data; + if ( (error = fp_getfvp(p, fd, &fp, &vp)) ) { + return(error); + } + if ( (error = vnode_getwithref(vp)) ) { + goto out1; + } AUDIT_ARG(vnpath, vp, ARG_VNODE1); + + context.vc_proc = p; + context.vc_ucred = fp->f_cred; + lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (how & LOCK_UN) { lf.l_type = F_UNLCK; fp->f_flag &= ~FHASLOCK; - return (VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK)); + error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, &context); + goto out; } if (how & LOCK_EX) lf.l_type = F_WRLCK; else if (how & LOCK_SH) lf.l_type = F_RDLCK; - else - return (EBADF); + else { + error = EBADF; + goto out; + } fp->f_flag |= FHASLOCK; - if (how & LOCK_NB) - return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK)); - return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK|F_WAIT)); + if (how & LOCK_NB) { + error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, F_FLOCK, &context); + goto out; + } + error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, F_FLOCK|F_WAIT, &context); +out: + (void)vnode_put(vp); +out1: + fp_drop(p, fd, fp, 0); + return(error); + } /* @@ -1384,12 +2409,8 @@ flock(p, uap, retval) * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. */ -/* ARGSUSED */ int -fdopen(dev, mode, type, p) - dev_t dev; - int mode, type; - struct proc *p; +fdopen(dev_t dev, __unused int mode, __unused int type, struct proc *p) { /* @@ -1397,7 +2418,7 @@ fdopen(dev, mode, type, p) * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the - * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN + * actions in dupfdopen below. Other callers of vn_open or vnop_open * will simply report the error. */ p->p_dupfd = minor(dev); @@ -1414,8 +2435,9 @@ dupfdopen(fdp, indx, dfd, mode, error) int mode; int error; { - register struct file *wfp; - struct file *fp; + struct fileproc *wfp; + struct fileproc *fp; + struct proc * p = current_proc(); /* * If the to-be-dup'd fd number is greater than the allowed number @@ -1424,12 +2446,16 @@ dupfdopen(fdp, indx, dfd, mode, error) * falloc could allocate an already closed to-be-dup'd descriptor * as the new descriptor. */ + proc_fdlock(p); + fp = fdp->fd_ofiles[indx]; - if ((u_int)dfd >= fdp->fd_nfiles || + if (dfd < 0 || dfd >= fdp->fd_nfiles || (wfp = fdp->fd_ofiles[dfd]) == NULL || wfp == fp || - (fdp->fd_ofileflags[dfd] & UF_RESERVED)) - return (EBADF); + (fdp->fd_ofileflags[dfd] & UF_RESERVED)) { + proc_fdunlock(p); + return (EBADF); + } /* * There are two cases of interest here. 
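 * (ENODEV: dup the descriptor being opened in place over indx, * sharing wfp's fileglob; ENXIO: steal the fileglob from dfd and * release dfd)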
* @@ -1448,13 +2474,21 @@ dupfdopen(fdp, indx, dfd, mode, error) * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ - if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) + if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) { + proc_fdunlock(p); return (EACCES); - (void)fref(wfp); + } if (indx > fdp->fd_lastfile) - fdp->fd_lastfile = indx;; - fdp->fd_ofiles[indx] = wfp; + fdp->fd_lastfile = indx; + (void)fg_ref(wfp); + + if (fp->f_fglob) + fg_free(fp->f_fglob); + fp->f_fglob = wfp->f_fglob; + fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; + + proc_fdunlock(p); return (0); case ENXIO: @@ -1462,72 +2496,157 @@ dupfdopen(fdp, indx, dfd, mode, error) * Steal away the file pointer from dfd, and stuff it into indx. */ if (indx > fdp->fd_lastfile) - fdp->fd_lastfile = indx;; - fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; + fdp->fd_lastfile = indx; + + if (fp->f_fglob) + fg_free(fp->f_fglob); + fp->f_fglob = wfp->f_fglob; + fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; _fdrelse(fdp, dfd); + + proc_fdunlock(p); + + FREE_ZONE(wfp, sizeof *fp, M_FILEPROC); + return (0); default: + proc_fdunlock(p); return (error); } /* NOTREACHED */ } -/* Reference manipulation routines for the file structure */ - -int -fref(struct file *fp) +void +fg_ref(struct fileproc * fp) { - if (fp->f_count == (short)0xffff) - return (-1); - if (++fp->f_count <= 0) - panic("fref: f_count"); - return ((int)fp->f_count); + struct fileglob *fg; + + fg = fp->f_fglob; + + lck_mtx_lock(&fg->fg_lock); + fg->fg_count++; + lck_mtx_unlock(&fg->fg_lock); } -static int -frele_internal(struct file *fp) +void +fg_drop(struct fileproc * fp) { - if (fp->f_count == (short)0xffff) - panic("frele: stale"); - if (--fp->f_count < 0) - panic("frele: count < 0"); - return ((int)fp->f_count); + struct fileglob *fg; + + fg = fp->f_fglob; + lck_mtx_lock(&fg->fg_lock); + fg->fg_count--; + lck_mtx_unlock(&fg->fg_lock); } -int -frele(struct file *fp) +void +fg_insertuipc(struct fileglob * fg) { - int count; - funnel_t * fnl; - extern int disable_funnel; +int insertque = 0; - fnl = thread_funnel_get(); - /* - * If the funnels are merged then atleast a funnel should be held - * else frele should come in with kernel funnel only - */ - if (!disable_funnel && (fnl != kernel_flock)) { - panic("frele: kernel funnel not held"); + lck_mtx_lock(&fg->fg_lock); + + while (fg->fg_lflags & FG_RMMSGQ) { + fg->fg_lflags |= FG_WRMMSGQ; + msleep(&fg->fg_lflags, &fg->fg_lock, 0, "fg_insertuipc", 0); + } - } else if (fnl == THR_FUNNEL_NULL) { - panic("frele: no funnel held"); + fg->fg_count++; + fg->fg_msgcount++; + if (fg->fg_msgcount == 1) { + fg->fg_lflags |= FG_INSMSGQ; + insertque=1; } + lck_mtx_unlock(&fg->fg_lock); + + if (insertque) { + lck_mtx_lock(uipc_lock); + LIST_INSERT_HEAD(&fmsghead, fg, f_msglist); + lck_mtx_unlock(uipc_lock); + lck_mtx_lock(&fg->fg_lock); + fg->fg_lflags &= ~FG_INSMSGQ; + if (fg->fg_lflags & FG_WINSMSGQ) { + fg->fg_lflags &= ~FG_WINSMSGQ; + wakeup(&fg->fg_lflags); + } + lck_mtx_unlock(&fg->fg_lock); + } + +} - if ((count = frele_internal(fp)) == 0) { - /* some one closed the fd while we were blocked */ - (void)closef_finish(fp, current_proc()); +void +fg_removeuipc(struct fileglob * fg) +{ +int removeque = 0; + + lck_mtx_lock(&fg->fg_lock); + while (fg->fg_lflags & FG_INSMSGQ) { + fg->fg_lflags |= FG_WINSMSGQ; + msleep(&fg->fg_lflags, &fg->fg_lock, 0, "fg_removeuipc", 0); + } + fg->fg_msgcount--; + if (fg->fg_msgcount == 0) { + fg->fg_lflags |= FG_RMMSGQ; + 
removeque=1; } - return(count); + lck_mtx_unlock(&fg->fg_lock); + + if (removeque) { + lck_mtx_lock(uipc_lock); + LIST_REMOVE(fg, f_msglist); + lck_mtx_unlock(uipc_lock); + lck_mtx_lock(&fg->fg_lock); + fg->fg_lflags &= ~FG_RMMSGQ; + if (fg->fg_lflags & FG_WRMMSGQ) { + fg->fg_lflags &= ~FG_WRMMSGQ; + wakeup(&fg->fg_lflags); + } + lck_mtx_unlock(&fg->fg_lock); + } +} + + +int +fo_read(struct fileproc *fp, struct uio *uio, kauth_cred_t cred, int flags, struct proc *p) +{ + return ((*fp->f_ops->fo_read)(fp, uio, cred, flags, p)); +} + +int +fo_write(struct fileproc *fp, struct uio *uio, kauth_cred_t cred, int flags, struct proc *p) +{ + return((*fp->f_ops->fo_write)(fp, uio, cred, flags, p)); +} + +int +fo_ioctl(struct fileproc *fp, u_long com, caddr_t data, struct proc *p) +{ +int error; + + proc_fdunlock(p); + error = (*fp->f_ops->fo_ioctl)(fp, com, data, p); + proc_fdlock(p); + return(error); +} + +int +fo_select(struct fileproc *fp, int which, void *wql, struct proc *p) +{ + return((*fp->f_ops->fo_select)(fp, which, wql, p)); +} + +int +fo_close(struct fileglob *fg, struct proc *p) +{ + return((*fg->fg_ops->fo_close)(fg, p)); } int -fcount(struct file *fp) +fo_kqfilter(struct fileproc *fp, struct knote *kn, struct proc *p) { - if (fp->f_count == (short)0xffff) - panic("fcount: stale"); - return ((int)fp->f_count); + return ((*fp->f_ops->fo_kqfilter)(fp, kn, p)); } diff --git a/bsd/kern/kern_event.c b/bsd/kern/kern_event.c index 55c2fab03..1bf948822 100644 --- a/bsd/kern/kern_event.c +++ b/bsd/kern/kern_event.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -48,15 +48,17 @@ /* * @(#)kern_event.c 1.0 (3/31/2000) */ +#include <stdint.h> #include <sys/param.h> #include <sys/systm.h> #include <sys/filedesc.h> #include <sys/kernel.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/malloc.h> #include <sys/unistd.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/fcntl.h> #include <sys/select.h> #include <sys/queue.h> @@ -68,26 +70,43 @@ #include <sys/stat.h> #include <sys/sysctl.h> #include <sys/uio.h> - +#include <sys/sysproto.h> +#include <sys/user.h> +#include <string.h> + +#include <kern/lock.h> +#include <kern/clock.h> +#include <kern/thread_call.h> +#include <kern/sched_prim.h> #include <kern/zalloc.h> +#include <kern/assert.h> + +#include <libkern/libkern.h> + +extern void unix_syscall_return(int); MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); -static int kqueue_scan(struct file *fp, int maxevents, - struct kevent *ulistp, const struct timespec *timeout, - register_t *retval, struct proc *p); -static void kqueue_wakeup(struct kqueue *kq); +static inline void kqlock(struct kqueue *kq); +static inline void kqunlock(struct kqueue *kq); + +static int kqlock2knoteuse(struct kqueue *kq, struct knote *kn); +static int kqlock2knoteusewait(struct kqueue *kq, struct knote *kn); +static int kqlock2knotedrop(struct kqueue *kq, struct knote *kn); +static int knoteuse2kqlock(struct kqueue *kq, struct knote *kn); -static int kqueue_read __P((struct file *fp, struct uio *uio, - struct ucred *cred, int flags, struct proc *p)); -static int kqueue_write __P((struct file *fp, struct uio *uio, - struct ucred *cred, int flags, struct proc *p)); -static int kqueue_ioctl __P((struct file *fp, u_long com, caddr_t data, - struct proc *p)); -static int kqueue_select __P((struct file 
*fp, int which, void *wql, - struct proc *p)); -static int kqueue_close __P((struct file *fp, struct proc *p)); -static int kqueue_kqfilter __P((struct file *fp, struct knote *kn, struct proc *p)); +static void kqueue_wakeup(struct kqueue *kq); +static int kqueue_read(struct fileproc *fp, struct uio *uio, + kauth_cred_t cred, int flags, struct proc *p); +static int kqueue_write(struct fileproc *fp, struct uio *uio, + kauth_cred_t cred, int flags, struct proc *p); +static int kqueue_ioctl(struct fileproc *fp, u_long com, caddr_t data, + struct proc *p); +static int kqueue_select(struct fileproc *fp, int which, void *wql, + struct proc *p); +static int kqueue_close(struct fileglob *fp, struct proc *p); +static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn, struct proc *p); +extern int kqueue_stat(struct fileproc *fp, struct stat *st, struct proc *p); static struct fileops kqueueops = { kqueue_read, @@ -95,15 +114,28 @@ static struct fileops kqueueops = { kqueue_ioctl, kqueue_select, kqueue_close, - kqueue_kqfilter + kqueue_kqfilter, + 0 }; -static void knote_fdpattach(struct knote *kn, struct filedesc *fdp); +static int kevent_copyin(user_addr_t *addrp, struct kevent *kevp, struct proc *p); +static int kevent_copyout(struct kevent *kevp, user_addr_t *addrp, struct proc *p); + +static int kevent_callback(struct kqueue *kq, struct kevent *kevp, void *data); +static void kevent_continue(struct kqueue *kq, void *data, int error); +static void kevent_scan_continue(void *contp, wait_result_t wait_result); +static int kevent_process(struct kqueue *kq, kevent_callback_t callback, + void *data, int *countp, struct proc *p); +static void knote_put(struct knote *kn); +static int knote_fdpattach(struct knote *kn, struct filedesc *fdp, struct proc *p); static void knote_drop(struct knote *kn, struct proc *p); +static void knote_activate(struct knote *kn); +static void knote_deactivate(struct knote *kn); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); static struct knote *knote_alloc(void); static void knote_free(struct knote *kn); +extern void knote_init(void); static int filt_fileattach(struct knote *kn); static struct filterops file_filtops = @@ -115,7 +147,7 @@ static struct filterops kqread_filtops = { 1, NULL, filt_kqdetach, filt_kqueue }; /* - * JMM - placeholder for not-yet-implemented filters + * placeholder for not-yet-implemented filters */ static int filt_badattach(struct knote *kn); static struct filterops bad_filtops = @@ -132,9 +164,10 @@ extern struct filterops fs_filtops; extern struct filterops sig_filtops; -#if 0 -/* JMM - We don't implement these now */ -static void filt_timerexpire(void *knx); + +/* Timer filter */ +static int filt_timercompute(struct knote *kn, uint64_t *abs_time); +static void filt_timerexpire(void *knx, void *param1); static int filt_timerattach(struct knote *kn); static void filt_timerdetach(struct knote *kn); static int filt_timer(struct knote *kn, long hint); @@ -142,20 +175,21 @@ static int filt_timer(struct knote *kn, long hint); static struct filterops timer_filtops = { 0, filt_timerattach, filt_timerdetach, filt_timer }; -static int kq_ncallouts = 0; -static int kq_calloutmax = (4 * 1024); +/* to avoid arming timers that fire quicker than we can handle */ +static uint64_t filt_timerfloor = 0; -SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, - &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); -#endif /* 0 */ +static lck_mtx_t _filt_timerlock; +static void filt_timerlock(void); 
+static void filt_timerunlock(void); -static zone_t knote_zone; +/* + * Sentinel marker for a thread scanning through the list of + * active knotes. + */ +static struct filterops threadmarker_filtops = + { 0, filt_badattach, 0, 0 }; -#define KNOTE_ACTIVATE(kn) do { \ - kn->kn_status |= KN_ACTIVE; \ - if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ - knote_enqueue(kn); \ -} while(0) +static zone_t knote_zone; #define KN_HASHSIZE 64 /* XXX should be tunable */ #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) @@ -178,15 +212,155 @@ static struct filterops *sysfilt_ops[] = { &file_filtops, /* EVFILT_VNODE */ &proc_filtops, /* EVFILT_PROC */ &sig_filtops, /* EVFILT_SIGNAL */ -#if 0 &timer_filtops, /* EVFILT_TIMER */ -#else - &bad_filtops, /* EVFILT_TIMER */ -#endif &bad_filtops, /* EVFILT_MACHPORT */ - &fs_filtops /* EVFILT_FS */ + &fs_filtops /* EVFILT_FS */ }; +/* + * kqueue/note lock attributes and implementations + * + * kqueues have locks, while knotes have use counts. + * Most of the knote state is guarded by the object lock; + * the knote "inuse" count and status use the kqueue lock. + */ +lck_grp_attr_t * kq_lck_grp_attr; +lck_grp_t * kq_lck_grp; +lck_attr_t * kq_lck_attr; + +static inline void +kqlock(struct kqueue *kq) +{ + lck_spin_lock(&kq->kq_lock); +} + +static inline void +kqunlock(struct kqueue *kq) +{ + lck_spin_unlock(&kq->kq_lock); +} + +/* + * Convert a kq lock to a knote use reference. + * + * If the knote is being dropped, we can't get + * a use reference, so just return with it + * still locked. + * + * - kq locked at entry + * - unlock on exit if we get the use reference + */ +static int +kqlock2knoteuse(struct kqueue *kq, struct knote *kn) +{ + if (kn->kn_status & KN_DROPPING) + return 0; + kn->kn_inuse++; + kqunlock(kq); + return 1; + } + +/* + * Convert a kq lock to a knote use reference, + * waiting out a concurrent drop. + * + * If the knote is being dropped, we wait for + * the drop to complete before returning failure. + * + * - kq locked at entry + * - kq always unlocked on exit + */ +static int +kqlock2knoteusewait(struct kqueue *kq, struct knote *kn) +{ + if (!kqlock2knoteuse(kq, kn)) { + kn->kn_status |= KN_DROPWAIT; + assert_wait(&kn->kn_status, THREAD_UNINT); + kqunlock(kq); + thread_block(THREAD_CONTINUE_NULL); + return 0; + } + return 1; + } + +/* + * Convert from a knote use reference back to kq lock. + * + * Drop a use reference and wake any waiters if + * this is the last one. + * + * The exit return indicates if the knote is + * still alive - but the kqueue lock is taken + * unconditionally. + */ +static int +knoteuse2kqlock(struct kqueue *kq, struct knote *kn) +{ + kqlock(kq); + if ((--kn->kn_inuse == 0) && + (kn->kn_status & KN_USEWAIT)) { + kn->kn_status &= ~KN_USEWAIT; + thread_wakeup(&kn->kn_inuse); + } + return ((kn->kn_status & KN_DROPPING) == 0); + } + +/* + * Convert a kq lock to a knote drop reference. + * + * If the knote is in use, wait for the use count + * to subside. We first mark our intention to drop + * it - keeping other users from "piling on." + * If we are too late, we have to wait for the + * other drop to complete. + * + * - kq locked at entry + * - always unlocked on exit. + * - caller can't hold any locks that would prevent + * the other dropper from completing.
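+ * - returns 1 if the caller now owns the drop of the + * knote, 0 if another thread was already dropping + * it and the knote must not be referenced again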
+ */ +static int +kqlock2knotedrop(struct kqueue *kq, struct knote *kn) +{ + + if ((kn->kn_status & KN_DROPPING) == 0) { + kn->kn_status |= KN_DROPPING; + if (kn->kn_inuse > 0) { + kn->kn_status |= KN_USEWAIT; + assert_wait(&kn->kn_inuse, THREAD_UNINT); + kqunlock(kq); + thread_block(THREAD_CONTINUE_NULL); + } else + kqunlock(kq); + return 1; + } else { + kn->kn_status |= KN_DROPWAIT; + assert_wait(&kn->kn_status, THREAD_UNINT); + kqunlock(kq); + thread_block(THREAD_CONTINUE_NULL); + return 0; + } +} + +/* + * Release a knote use count reference. + */ +static void +knote_put(struct knote *kn) +{ + struct kqueue *kq = kn->kn_kq; + + kqlock(kq); + if ((--kn->kn_inuse == 0) && + (kn->kn_status & KN_USEWAIT)) { + kn->kn_status &= ~KN_USEWAIT; + thread_wakeup(&kn->kn_inuse); + } + kqunlock(kq); + } + + + static int filt_fileattach(struct knote *kn) { @@ -194,20 +368,27 @@ filt_fileattach(struct knote *kn) return (fo_kqfilter(kn->kn_fp, kn, current_proc())); } +#define f_flag f_fglob->fg_flag +#define f_type f_fglob->fg_type +#define f_msgcount f_fglob->fg_msgcount +#define f_cred f_fglob->fg_cred +#define f_ops f_fglob->fg_ops +#define f_offset f_fglob->fg_offset +#define f_data f_fglob->fg_data + static void filt_kqdetach(struct knote *kn) { struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; - if (kq->kq_state & KQ_SEL) - return; - + kqlock(kq); KNOTE_DETACH(&kq->kq_sel.si_note, kn); + kqunlock(kq); } /*ARGSUSED*/ static int -filt_kqueue(struct knote *kn, long hint) +filt_kqueue(struct knote *kn, __unused long hint) { struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; @@ -219,21 +400,23 @@ static int filt_procattach(struct knote *kn) { struct proc *p; + int funnel_state; + + funnel_state = thread_funnel_set(kernel_flock, TRUE); p = pfind(kn->kn_id); - if (p == NULL) + if (p == NULL) { + thread_funnel_set(kernel_flock, funnel_state); return (ESRCH); - if (! PRISON_CHECK(current_proc(), p)) - return (EACCES); + } - kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * internal flag indicating registration done by kernel */ if (kn->kn_flags & EV_FLAG1) { - kn->kn_data = kn->kn_sdata; /* ppid */ + kn->kn_data = (int)kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; kn->kn_flags &= ~EV_FLAG1; } @@ -241,6 +424,8 @@ filt_procattach(struct knote *kn) /* XXX lock the proc here while adding to the list? */ KNOTE_ATTACH(&p->p_klist, kn); + thread_funnel_set(kernel_flock, funnel_state); + return (0); } @@ -255,19 +440,25 @@ filt_procattach(struct knote *kn) static void filt_procdetach(struct knote *kn) { - struct proc *p = kn->kn_ptr.p_proc; + struct proc *p; + int funnel_state; - if (kn->kn_status & KN_DETACHED) - return; + funnel_state = thread_funnel_set(kernel_flock, TRUE); + p = pfind(kn->kn_id); + + if (p != (struct proc *)NULL) + KNOTE_DETACH(&p->p_klist, kn); - /* XXX locking? this might modify another process. */ - KNOTE_DETACH(&p->p_klist, kn); + thread_funnel_set(kernel_flock, funnel_state); } static int filt_proc(struct knote *kn, long hint) { u_int event; + int funnel_state; + + funnel_state = thread_funnel_set(kernel_flock, TRUE); /* * mask off extra data @@ -284,8 +475,8 @@ filt_proc(struct knote *kn, long hint) * process is gone, so flag the event as finished. 
*/ if (event == NOTE_EXIT) { - kn->kn_status |= KN_DETACHED; kn->kn_flags |= (EV_EOF | EV_ONESHOT); + thread_funnel_set(kernel_flock, funnel_state); return (1); } @@ -307,240 +498,654 @@ filt_proc(struct knote *kn, long hint) kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata; /* preserve udata */ - error = kqueue_register(kn->kn_kq, &kev, NULL); + error = kevent_register(kn->kn_kq, &kev, NULL); if (error) kn->kn_fflags |= NOTE_TRACKERR; } + event = kn->kn_fflags; + thread_funnel_set(kernel_flock, funnel_state); - return (kn->kn_fflags != 0); + return (event != 0); } -#if 0 +/* + * filt_timercompute - compute absolute timeout + * + * The saved-data field in the knote contains the + * time value. The saved filter-flags indicates + * the unit of measurement. + * + * If the timeout is not absolute, adjust it for + * the current time. + */ +static int +filt_timercompute(struct knote *kn, uint64_t *abs_time) +{ + uint64_t multiplier; + uint64_t raw; + + switch (kn->kn_sfflags & (NOTE_SECONDS|NOTE_USECONDS|NOTE_NSECONDS)) { + case NOTE_SECONDS: + multiplier = NSEC_PER_SEC; + break; + case NOTE_USECONDS: + multiplier = NSEC_PER_USEC; + break; + case NOTE_NSECONDS: + multiplier = 1; + break; + case 0: /* milliseconds (default) */ + multiplier = NSEC_PER_SEC / 1000; + break; + default: + return EINVAL; + } + nanoseconds_to_absolutetime((uint64_t)kn->kn_sdata * multiplier, &raw); + if (raw <= filt_timerfloor) { + *abs_time = 0; + return 0; + } + if ((kn->kn_sfflags & NOTE_ABSOLUTE) == NOTE_ABSOLUTE) { + uint32_t seconds, nanoseconds; + uint64_t now; + + clock_get_calendar_nanotime(&seconds, &nanoseconds); + nanoseconds_to_absolutetime((uint64_t)seconds * NSEC_PER_SEC + nanoseconds, + &now); + if (now >= raw + filt_timerfloor) { + *abs_time = 0; + return 0; + } + raw -= now; + } + clock_absolutetime_interval_to_deadline(raw, abs_time); + return 0; +} + +/* + * filt_timerexpire - the timer callout routine + * + * Just propagate the timer event into the knote + * filter routine (by going through the knote + * synchronization point). Pass a hint to + * indicate this is a real event, not just a + * query from above. + */ static void -filt_timerexpire(void *knx) +filt_timerexpire(void *knx, __unused void *spare) { + struct klist timer_list; struct knote *kn = knx; - struct callout *calloutp; - struct timeval tv; - int tticks; - - kn->kn_data++; - KNOTE_ACTIVATE(kn); - - if ((kn->kn_flags & EV_ONESHOT) == 0) { - tv.tv_sec = kn->kn_sdata / 1000; - tv.tv_usec = (kn->kn_sdata % 1000) * 1000; - tticks = tvtohz(&tv); - calloutp = (struct callout *)kn->kn_hook; - callout_reset(calloutp, tticks, filt_timerexpire, kn); - } + + /* no "object" for timers, so fake a list */ + SLIST_INIT(&timer_list); + SLIST_INSERT_HEAD(&timer_list, kn, kn_selnext); + KNOTE(&timer_list, 1); } /* - * data contains amount of time to sleep, in milliseconds + * data contains amount of time to sleep, in milliseconds, + * or a pointer to a timespec structure. 
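+ *
+ * (Illustrative, not part of the original: from user space this filter
+ * is typically armed as
+ *
+ *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, NOTE_SECONDS, 5, 0);
+ *
+ * where the fflags unit is decoded by filt_timercompute() above.)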
*/ static int filt_timerattach(struct knote *kn) { - struct callout *calloutp; - struct timeval tv; - int tticks; + thread_call_t callout; + uint64_t deadline; + int error; - if (kq_ncallouts >= kq_calloutmax) - return (ENOMEM); - kq_ncallouts++; + error = filt_timercompute(kn, &deadline); + if (error) + return (error); - tv.tv_sec = kn->kn_sdata / 1000; - tv.tv_usec = (kn->kn_sdata % 1000) * 1000; - tticks = tvtohz(&tv); + if (deadline) { + callout = thread_call_allocate(filt_timerexpire, kn); + if (NULL == callout) + return (ENOMEM); + } else { + /* handle as immediate */ + kn->kn_sdata = 0; + callout = NULL; + } - kn->kn_flags |= EV_CLEAR; /* automatically set */ - MALLOC(calloutp, struct callout *, sizeof(*calloutp), - M_KQUEUE, M_WAITOK); - callout_init(calloutp); - callout_reset(calloutp, tticks, filt_timerexpire, kn); - kn->kn_hook = (caddr_t)calloutp; + filt_timerlock(); + kn->kn_hook = (caddr_t)callout; + /* absolute=EV_ONESHOT */ + if (kn->kn_sfflags & NOTE_ABSOLUTE) + kn->kn_flags |= EV_ONESHOT; + + if (deadline) { + /* all others - if not faking immediate */ + kn->kn_flags |= EV_CLEAR; + thread_call_enter_delayed(callout, deadline); + kn->kn_hookid = 0; + } else { + /* fake immediate */ + kn->kn_hookid = 1; + } + filt_timerunlock(); return (0); } static void filt_timerdetach(struct knote *kn) { - struct callout *calloutp; - - calloutp = (struct callout *)kn->kn_hook; - callout_stop(calloutp); - FREE(calloutp, M_KQUEUE); - kq_ncallouts--; + thread_call_t callout; + + filt_timerlock(); + callout = (thread_call_t)kn->kn_hook; + if (callout != NULL) { + boolean_t cancelled; + + /* cancel the callout if we can */ + cancelled = thread_call_cancel(callout); + if (cancelled) { + /* got it, just free it */ + kn->kn_hook = NULL; + filt_timerunlock(); + thread_call_free(callout); + return; + } + /* we have to wait for the expire routine. 
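+	 * The expire side observes kn_hookid < 0, frees the callout,
+	 * and wakes us via thread_wakeup(&kn->kn_hook) when it is
+	 * finished - see the hint-handling path in filt_timer() below.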
+	 */
+	kn->kn_hookid = -1;	/* we are detaching */
+	assert_wait(&kn->kn_hook, THREAD_UNINT);
+	filt_timerunlock();
+	thread_block(THREAD_CONTINUE_NULL);
+	assert(kn->kn_hook == NULL);
+	return;
+	}
+	/* nothing to do */
+	filt_timerunlock();
 }

+
+
 static int
-filt_timer(struct knote *kn, long hint)
+filt_timer(struct knote *kn, __unused long hint)
 {
+	int result;
+
+	if (hint) {
+		/* real timer pop */
+		thread_call_t callout;
+		boolean_t detaching;
+
+		filt_timerlock();
+
+		kn->kn_data++;
+
+		detaching = (kn->kn_hookid < 0);
+		callout = (thread_call_t)kn->kn_hook;

-	return (kn->kn_data != 0);
+		if (!detaching && (kn->kn_flags & EV_ONESHOT) == 0) {
+			uint64_t deadline;
+			int error;
+
+			/* user input data may have changed - deal */
+			error = filt_timercompute(kn, &deadline);
+			if (error) {
+				kn->kn_flags |= EV_ERROR;
+				kn->kn_data = error;
+			} else if (deadline == 0) {
+				/* revert to fake immediate */
+				kn->kn_flags &= ~EV_CLEAR;
+				kn->kn_sdata = 0;
+				kn->kn_hookid = 1;
+			} else {
+				/* keep the callout and re-arm */
+				thread_call_enter_delayed(callout, deadline);
+				filt_timerunlock();
+				return 1;
+			}
+		}
+		kn->kn_hook = NULL;
+		filt_timerunlock();
+		thread_call_free(callout);
+
+		/* if someone is waiting for timer to pop */
+		if (detaching)
+			thread_wakeup(&kn->kn_hook);
+
+		return 1;
+	}
+
+	/* user-query */
+	filt_timerlock();
+
+	/* change fake timer to real if needed */
+	while (kn->kn_hookid > 0 && kn->kn_sdata > 0) {
+		int error;
+
+		/* update the fake timer (make real) */
+		kn->kn_hookid = 0;
+		kn->kn_data = 0;
+		filt_timerunlock();
+		error = filt_timerattach(kn);
+		filt_timerlock();
+		if (error) {
+			kn->kn_flags |= EV_ERROR;
+			kn->kn_data = error;
+			filt_timerunlock();
+			return 1;
+		}
+	}
+
+	/* if still fake, pretend it fired */
+	if (kn->kn_hookid > 0)
+		kn->kn_data = 1;
+
+	result = (kn->kn_data != 0);
+	filt_timerunlock();
+	return result;
+}
+
+static void
+filt_timerlock(void)
+{
+	lck_mtx_lock(&_filt_timerlock);
+}
+
+static void
+filt_timerunlock(void)
+{
+	lck_mtx_unlock(&_filt_timerlock);
 }
-#endif /* 0 */

 /*
  * JMM - placeholder for not-yet-implemented filters
  */
 static int
-filt_badattach(struct knote *kn)
+filt_badattach(__unused struct knote *kn)
 {
-	return(EOPNOTSUPP);
+	return(ENOTSUP);
 }

-#ifndef _SYS_SYSPROTO_H_
-struct kqueue_args {
-	int dummy;
-};
-#endif
-int
-kqueue(struct proc *p, struct kqueue_args *uap, register_t *retval)
+struct kqueue *
+kqueue_alloc(struct proc *p)
+{
+	struct filedesc *fdp = p->p_fd;
+	struct kqueue *kq;
+
+	MALLOC_ZONE(kq, struct kqueue *, sizeof(struct kqueue), M_KQUEUE, M_WAITOK);
+	if (kq != NULL) {
+		bzero(kq, sizeof(struct kqueue));
+		lck_spin_init(&kq->kq_lock, kq_lck_grp, kq_lck_attr);
+		TAILQ_INIT(&kq->kq_head);
+		TAILQ_INIT(&kq->kq_inprocess);
+		kq->kq_fdp = fdp;
+	}
+
+	if (fdp->fd_knlistsize < 0) {
+		proc_fdlock(p);
+		if (fdp->fd_knlistsize < 0)
+			fdp->fd_knlistsize = 0;	/* this process has had a kq */
+		proc_fdunlock(p);
+	}
+
+	return kq;
+}
+
+
+/*
+ * kqueue_dealloc - detach all knotes from a kqueue and free it
+ *
+ * 	We walk each list looking for knotes referencing this
+ *	kqueue.  If we find one, we try to drop it.  But if we
+ *	fail to get a drop reference, the attempt blocks until
+ *	the concurrent drop completes; we can then safely restart
+ *	the scan, in the assurance that the list will eventually
+ *	not contain any more references to this kqueue (either
+ *	we dropped them all, or someone else did).
+ *
+ *	Assumes no new events are being added to the kqueue.
+ *	Nothing locked on entry or exit.
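+ *
+ *	(Aside, added for clarity: the proc_fdlock is dropped around
+ *	each detach/drop below because knote_drop() retakes it, and
+ *	because kqlock2knotedrop() may block waiting on the other
+ *	dropper - see its comment above.)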
+ */ +void +kqueue_dealloc(struct kqueue *kq, struct proc *p) { struct filedesc *fdp = p->p_fd; + struct knote *kn; + int i; + + proc_fdlock(p); + for (i = 0; i < fdp->fd_knlistsize; i++) { + kn = SLIST_FIRST(&fdp->fd_knlist[i]); + while (kn != NULL) { + if (kq == kn->kn_kq) { + kqlock(kq); + proc_fdunlock(p); + /* drop it ourselves or wait */ + if (kqlock2knotedrop(kq, kn)) { + kn->kn_fop->f_detach(kn); + knote_drop(kn, p); + } + proc_fdlock(p); + /* start over at beginning of list */ + kn = SLIST_FIRST(&fdp->fd_knlist[i]); + continue; + } + kn = SLIST_NEXT(kn, kn_link); + } + } + if (fdp->fd_knhashmask != 0) { + for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) { + kn = SLIST_FIRST(&fdp->fd_knhash[i]); + while (kn != NULL) { + if (kq == kn->kn_kq) { + kqlock(kq); + proc_fdunlock(p); + /* drop it ourselves or wait */ + if (kqlock2knotedrop(kq, kn)) { + kn->kn_fop->f_detach(kn); + knote_drop(kn, p); + } + proc_fdlock(p); + /* start over at beginning of list */ + kn = SLIST_FIRST(&fdp->fd_knhash[i]); + continue; + } + kn = SLIST_NEXT(kn, kn_link); + } + } + } + proc_fdunlock(p); + lck_spin_destroy(&kq->kq_lock, kq_lck_grp); + FREE_ZONE(kq, sizeof(struct kqueue), M_KQUEUE); +} + +int +kqueue(struct proc *p, __unused struct kqueue_args *uap, register_t *retval) +{ struct kqueue *kq; - struct file *fp; + struct fileproc *fp; int fd, error; error = falloc(p, &fp, &fd); - if (error) + if (error) { return (error); + } + + kq = kqueue_alloc(p); + if (kq == NULL) { + fp_free(p, fd, fp); + return (ENOMEM); + } + fp->f_flag = FREAD | FWRITE; fp->f_type = DTYPE_KQUEUE; fp->f_ops = &kqueueops; - kq = (struct kqueue *)_MALLOC(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO); - TAILQ_INIT(&kq->kq_head); fp->f_data = (caddr_t)kq; + + proc_fdlock(p); + *fdflags(p, fd) &= ~UF_RESERVED; + fp_drop(p, fd, fp, 1); + proc_fdunlock(p); + *retval = fd; - if (fdp->fd_knlistsize < 0) - fdp->fd_knlistsize = 0; /* this process has a kq */ - kq->kq_fdp = fdp; return (error); } -#ifndef _SYS_SYSPROTO_H_ -struct kqueue_portset_np_args { - int fd; -}; -#endif int -kqueue_portset_np(struct proc *p, struct kqueue_portset_np_args *uap, register_t *retval) +kqueue_portset_np(__unused struct proc *p, + __unused struct kqueue_portset_np_args *uap, + __unused register_t *retval) { /* JMM - Placeholder for now */ - return (EOPNOTSUPP); + return (ENOTSUP); } -#ifndef _SYS_SYSPROTO_H_ -struct kqueue_from_portset_np_args { - int fd; -}; -#endif int -kqueue_from_portset_np(struct proc *p, struct kqueue_from_portset_np_args *uap, register_t *retval) +kqueue_from_portset_np(__unused struct proc *p, + __unused struct kqueue_from_portset_np_args *uap, + __unused register_t *retval) { /* JMM - Placeholder for now */ - return (EOPNOTSUPP); + return (ENOTSUP); } -#if !0 -/* JMM - We don't implement this yet */ -#define fhold(fp) -#define fdrop(fp, p) -#endif /* !0 */ - -#ifndef _SYS_SYSPROTO_H_ -struct kevent_args { - int fd; - const struct kevent *changelist; - int nchanges; - struct kevent *eventlist; - int nevents; - const struct timespec *timeout; -}; -#endif -int -kevent(struct proc *p, struct kevent_args *uap, register_t *retval) +static int +kevent_copyin(user_addr_t *addrp, struct kevent *kevp, struct proc *p) { - struct filedesc* fdp = p->p_fd; - struct kqueue *kq; - struct file *fp = NULL; - struct timespec ts; - int i, nerrors, error; + int advance; + int error; - if (uap->timeout != NULL) { - error = copyin((caddr_t)uap->timeout, (caddr_t)&ts, sizeof(ts)); + if (IS_64BIT_PROCESS(p)) { + struct user_kevent kev64; + + advance = 
sizeof(kev64);
+		error = copyin(*addrp, (caddr_t)&kev64, advance);
 		if (error)
-			goto done;
-		uap->timeout = &ts;
+			return error;
+		kevp->ident = CAST_DOWN(uintptr_t, kev64.ident);
+		kevp->filter = kev64.filter;
+		kevp->flags = kev64.flags;
+		kevp->fflags = kev64.fflags;
+		kevp->data = CAST_DOWN(intptr_t, kev64.data);
+		kevp->udata = kev64.udata;
+	} else {
+		/*
+		 * compensate for legacy in-kernel kevent layout
+		 * where the udata field is already 64-bit.
+		 */
+		advance = sizeof(*kevp) + sizeof(void *) - sizeof(user_addr_t);
+		error = copyin(*addrp, (caddr_t)kevp, advance);
 	}
+	if (!error)
+		*addrp += advance;
+	return error;
+}

-	if (((u_int)uap->fd) >= fdp->fd_nfiles ||
-	    (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
-	    (fp->f_type != DTYPE_KQUEUE))
-		return (EBADF);
+static int
+kevent_copyout(struct kevent *kevp, user_addr_t *addrp, struct proc *p)
+{
+	int advance;
+	int error;
+
+	if (IS_64BIT_PROCESS(p)) {
+		struct user_kevent kev64;
+
+		kev64.ident = (uint64_t) kevp->ident;
+		kev64.filter = kevp->filter;
+		kev64.flags = kevp->flags;
+		kev64.fflags = kevp->fflags;
+		kev64.data = (int64_t) kevp->data;
+		kev64.udata = kevp->udata;
+		advance = sizeof(kev64);
+		error = copyout((caddr_t)&kev64, *addrp, advance);
+	} else {
+		/*
+		 * compensate for legacy in-kernel kevent layout
+		 * where the udata field is already 64-bit.
+		 */
+		advance = sizeof(*kevp) + sizeof(void *) - sizeof(user_addr_t);
+		error = copyout((caddr_t)kevp, *addrp, advance);
+	}
+	if (!error)
+		*addrp += advance;
+	return error;
+}

-	fhold(fp);
+/*
+ * kevent_continue - continue a kevent syscall after blocking
+ *
+ *	assume we inherit a use count on the kq fileglob.
+ */

-	kq = (struct kqueue *)fp->f_data;
-	nerrors = 0;
+static void
+kevent_continue(__unused struct kqueue *kq, void *data, int error)
+{
+	struct _kevent *cont_args;
+	struct fileproc *fp;
+	register_t *retval;
+	int noutputs;
+	int fd;
+	struct proc *p = current_proc();
+
+	cont_args = (struct _kevent *)data;
+	noutputs = cont_args->eventout;
+	retval = cont_args->retval;
+	fd = cont_args->fd;
+	fp = cont_args->fp;
+
+	fp_drop(p, fd, fp, 0);
+
+	/* don't restart after signals... */
+	if (error == ERESTART)
+		error = EINTR;
+	else if (error == EWOULDBLOCK)
+		error = 0;
+	if (error == 0)
+		*retval = noutputs;
+	unix_syscall_return(error);
+}

-	while (uap->nchanges > 0) {
-		int i;
-		int n = uap->nchanges > KQ_NEVENTS ? KQ_NEVENTS : uap->nchanges;
-		struct kevent kq_kev[n];
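(Illustrative aside - not part of the patch.  The rewritten syscall below
is driven from user space in the usual way; failures of individual change
requests come back in-band as EV_ERROR entries rather than failing the
whole call.  "sock_fd" here is a hypothetical descriptor:)

	struct kevent change, event;
	int kq = kqueue();

	EV_SET(&change, sock_fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &change, 1, &event, 1, NULL) > 0 &&
	    (event.flags & EV_ERROR))
		errno = event.data;	/* per-change failure reported in-band */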
+/*
+ * kevent - [syscall] register and wait for kernel events
+ *
+ */

-		error = copyin((caddr_t)uap->changelist, (caddr_t)kq_kev,
-		    n * sizeof(struct kevent));
+int
+kevent(struct proc *p, struct kevent_args *uap, register_t *retval)
+{
+	user_addr_t changelist = uap->changelist;
+	user_addr_t ueventlist = uap->eventlist;
+	int nchanges = uap->nchanges;
+	int nevents = uap->nevents;
+	int fd = uap->fd;
+
+	struct _kevent *cont_args;
+	uthread_t ut;
+	struct kqueue *kq;
+	struct fileproc *fp;
+	struct kevent kev;
+	int error, noutputs;
+	struct timeval atv;
+
+	/* convert timeout to absolute - if we have one */
+	if (uap->timeout != USER_ADDR_NULL) {
+		struct timeval rtv;
+		if ( IS_64BIT_PROCESS(p) ) {
+			struct user_timespec ts;
+			error = copyin( uap->timeout, &ts, sizeof(ts) );
+			if ((ts.tv_sec & 0xFFFFFFFF00000000ull) != 0)
+				error = EINVAL;
+			else
+				TIMESPEC_TO_TIMEVAL(&rtv, &ts);
+		} else {
+			struct timespec ts;
+			error = copyin( uap->timeout, &ts, sizeof(ts) );
+			TIMESPEC_TO_TIMEVAL(&rtv, &ts);
+		}
 		if (error)
-			goto done;
-		for (i = 0; i < n; i++) {
-			struct kevent *kevp = &kq_kev[i];
+			return error;
+		if (itimerfix(&rtv))
+			return EINVAL;
+		getmicrouptime(&atv);
+		timevaladd(&atv, &rtv);
+	} else {
+		atv.tv_sec = 0;
+		atv.tv_usec = 0;
+	}

-			kevp->flags &= ~EV_SYSFLAGS;
-			error = kqueue_register(kq, kevp, p);
-			if (error) {
-				if (uap->nevents != 0) {
-					kevp->flags = EV_ERROR;
-					kevp->data = error;
-					(void) copyout((caddr_t)kevp,
-					    (caddr_t)uap->eventlist,
-					    sizeof(*kevp));
-					uap->eventlist++;
-					uap->nevents--;
-					nerrors++;
-				} else {
-					goto done;
-				}
-			}
+	/* get a usecount for the kq itself */
+	if ((error = fp_getfkq(p, fd, &fp, &kq)) != 0)
+		return(error);
+
+	/* register all the change requests the user provided... */
+	noutputs = 0;
+	while (nchanges > 0) {
+		error = kevent_copyin(&changelist, &kev, p);
+		if (error)
+			break;
+
+		kev.flags &= ~EV_SYSFLAGS;
+		error = kevent_register(kq, &kev, p);
+		if (error) {
+			if (nevents == 0)
+				break;
+			kev.flags = EV_ERROR;
+			kev.data = error;
+			(void) kevent_copyout(&kev, &ueventlist, p);
+			nevents--;
+			noutputs++;
 		}
-		uap->nchanges -= n;
-		uap->changelist += n;
-	}
-	if (nerrors) {
-		*retval = nerrors;
-		error = 0;
-		goto done;
+		nchanges--;
 	}

-	error = kqueue_scan(fp, uap->nevents, uap->eventlist, uap->timeout, retval, p);
-done:
-	if (fp != NULL)
-		fdrop(fp, p);
-	return (error);
+	/* store the continuation/completion data in the uthread */
+	ut = (uthread_t)get_bsdthread_info(current_thread());
+	cont_args = (struct _kevent *)&ut->uu_state.ss_kevent;
+	cont_args->fp = fp;
+	cont_args->fd = fd;
+	cont_args->retval = retval;
+	cont_args->eventlist = ueventlist;
+	cont_args->eventcount = nevents;
+	cont_args->eventout = noutputs;
+
+	if (nevents > 0 && noutputs == 0 && error == 0)
+		error = kevent_scan(kq, kevent_callback,
+				    kevent_continue, cont_args,
+				    &atv, p);
+	kevent_continue(kq, cont_args, error);
+	/* NOTREACHED */
+	return error;
+}
+
+
+/*
+ * kevent_callback - callback for each individual event
+ *
+ *	called with nothing locked
+ *	caller holds a reference on the kqueue
+ */
+
+static int
+kevent_callback(__unused struct kqueue *kq, struct kevent *kevp, void *data)
+{
+	struct _kevent *cont_args;
+	int error;
+
+	cont_args = (struct _kevent *)data;
+	assert(cont_args->eventout < cont_args->eventcount);
+
+	/*
+	 * Copy out the appropriate amount of event data for this user.
+ */ + error = kevent_copyout(kevp, &cont_args->eventlist, current_proc()); + + /* + * If there isn't space for additional events, return + * a harmless error to stop the processing here + */ + if (error == 0 && ++cont_args->eventout == cont_args->eventcount) + error = EWOULDBLOCK; + return error; } +/* + * kevent_register - add a new event to a kqueue + * + * Creates a mapping between the event source and + * the kqueue via a knote data structure. + * + * Because many/most the event sources are file + * descriptor related, the knote is linked off + * the filedescriptor table for quick access. + * + * called with nothing locked + * caller holds a reference on the kqueue + */ + int -kqueue_register(struct kqueue *kq, struct kevent *kev, struct proc *p) +kevent_register(struct kqueue *kq, struct kevent *kev, struct proc *p) { struct filedesc *fdp = kq->kq_fdp; struct filterops *fops; - struct file *fp = NULL; + struct fileproc *fp = NULL; struct knote *kn = NULL; - int s, error = 0; + int error = 0; if (kev->filter < 0) { if (kev->filter + EVFILT_SYSCOUNT < 0) @@ -556,20 +1161,22 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct proc *p) return (EINVAL); } - if (fops->f_isfd) { - /* validate descriptor */ - if ((u_int)kev->ident >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[kev->ident]) == NULL) - return (EBADF); - fhold(fp); + /* this iocount needs to be dropped if it is not registered */ + if (fops->f_isfd && (error = fp_lookup(p, kev->ident, &fp, 0)) != 0) + return(error); - if (kev->ident < fdp->fd_knlistsize) { + restart: + proc_fdlock(p); + if (fops->f_isfd) { + /* fd-based knotes are linked off the fd table */ + if (kev->ident < (u_int)fdp->fd_knlistsize) { SLIST_FOREACH(kn, &fdp->fd_knlist[kev->ident], kn_link) if (kq == kn->kn_kq && kev->filter == kn->kn_filter) break; } } else { + /* hash non-fd knotes here too */ if (fdp->fd_knhashmask != 0) { struct klist *list; @@ -583,329 +1190,497 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct proc *p) } } - if (kn == NULL && ((kev->flags & EV_ADD) == 0)) { - error = ENOENT; - goto done; + /* + * kn now contains the matching knote, or NULL if no match + */ + if (kn == NULL) { + if ((kev->flags & (EV_ADD|EV_DELETE)) == EV_ADD) { + kn = knote_alloc(); + if (kn == NULL) { + proc_fdunlock(p); + error = ENOMEM; + goto done; + } + kn->kn_fp = fp; + kn->kn_kq = kq; + kn->kn_tq = &kq->kq_head; + kn->kn_fop = fops; + kn->kn_sfflags = kev->fflags; + kn->kn_sdata = kev->data; + kev->fflags = 0; + kev->data = 0; + kn->kn_kevent = *kev; + kn->kn_inuse = 1; /* for f_attach() */ + kn->kn_status = 0; + + /* before anyone can find it */ + if (kev->flags & EV_DISABLE) + kn->kn_status |= KN_DISABLED; + + error = knote_fdpattach(kn, fdp, p); + proc_fdunlock(p); + + if (error) { + knote_free(kn); + goto done; + } + + /* + * apply reference count to knote structure, and + * do not release it at the end of this routine. + */ + fp = NULL; + + /* + * If the attach fails here, we can drop it knowing + * that nobody else has a reference to the knote. 
+ */ + if ((error = fops->f_attach(kn)) != 0) { + knote_drop(kn, p); + goto done; + } + } else { + proc_fdunlock(p); + error = ENOENT; + goto done; + } + } else { + /* existing knote - get kqueue lock */ + kqlock(kq); + proc_fdunlock(p); + + if (kev->flags & EV_DELETE) { + knote_dequeue(kn); + kn->kn_status |= KN_DISABLED; + if (kqlock2knotedrop(kq, kn)) { + kn->kn_fop->f_detach(kn); + knote_drop(kn, p); + } + goto done; + } + + /* update status flags for existing knote */ + if (kev->flags & EV_DISABLE) { + knote_dequeue(kn); + kn->kn_status |= KN_DISABLED; + } else if (kev->flags & EV_ENABLE) { + kn->kn_status &= ~KN_DISABLED; + if (kn->kn_status & KN_ACTIVE) + knote_enqueue(kn); + } + + /* + * If somebody is in the middle of dropping this + * knote - go find/insert a new one. But we have + * wait for this one to go away first. + */ + if (!kqlock2knoteusewait(kq, kn)) + /* kqueue unlocked */ + goto restart; + + /* + * The user may change some filter values after the + * initial EV_ADD, but doing so will not reset any + * filter which have already been triggered. + */ + kn->kn_sfflags = kev->fflags; + kn->kn_sdata = kev->data; + kn->kn_kevent.udata = kev->udata; + } + + /* still have use ref on knote */ + if (kn->kn_fop->f_event(kn, 0)) { + if (knoteuse2kqlock(kq, kn)) + knote_activate(kn); + kqunlock(kq); + } else { + knote_put(kn); + } + +done: + if (fp != NULL) + fp_drop(p, kev->ident, fp, 0); + return (error); +} + +/* + * kevent_process - process the triggered events in a kqueue + * + * Walk the queued knotes and validate that they are + * really still triggered events by calling the filter + * routines (if necessary). Hold a use reference on + * the knote to avoid it being detached. For each event + * that is still considered triggered, invoke the + * callback routine provided. + * + * caller holds a reference on the kqueue. + * kqueue locked on entry and exit - but may be dropped + */ + +static int +kevent_process(struct kqueue *kq, + kevent_callback_t callback, + void *data, + int *countp, + struct proc *p) +{ + struct knote *kn; + struct kevent kev; + int nevents; + int error; + + restart: + if (kq->kq_count == 0) { + *countp = 0; + return 0; + } + + /* if someone else is processing the queue, wait */ + if (!TAILQ_EMPTY(&kq->kq_inprocess)) { + assert_wait(&kq->kq_inprocess, THREAD_UNINT); + kq->kq_state |= KQ_PROCWAIT; + kqunlock(kq); + thread_block(THREAD_CONTINUE_NULL); + kqlock(kq); + goto restart; } - /* - * kn now contains the matching knote, or NULL if no match - */ - if (kev->flags & EV_ADD) { + error = 0; + nevents = 0; + while (error == 0 && + (kn = TAILQ_FIRST(&kq->kq_head)) != NULL) { - if (kn == NULL) { - kn = knote_alloc(); - if (kn == NULL) { - error = ENOMEM; - goto done; - } - kn->kn_fp = fp; - kn->kn_kq = kq; - kn->kn_fop = fops; + /* + * move knote to the processed queue. + * this is also protected by the kq lock. + */ + assert(kn->kn_tq == &kq->kq_head); + TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); + kn->kn_tq = &kq->kq_inprocess; + TAILQ_INSERT_TAIL(&kq->kq_inprocess, kn, kn_tqe); - /* - * apply reference count to knote structure, and - * do not release it at the end of this routine. - */ - fp = NULL; + /* + * Non-EV_ONESHOT events must be re-validated. + * + * Convert our lock to a use-count and call the event's + * filter routine to update. + * + * If the event is dropping (or no longer valid), we + * already have it off the active queue, so just + * finish the job of deactivating it. 
+ */ + if ((kn->kn_flags & EV_ONESHOT) == 0) { + int result; - kn->kn_sfflags = kev->fflags; - kn->kn_sdata = kev->data; - kev->fflags = 0; - kev->data = 0; - kn->kn_kevent = *kev; + if (kqlock2knoteuse(kq, kn)) { + + /* call the filter with just a ref */ + result = kn->kn_fop->f_event(kn, 0); - knote_fdpattach(kn, fdp); - if ((error = fops->f_attach(kn)) != 0) { + if (!knoteuse2kqlock(kq, kn) || result == 0) { + knote_deactivate(kn); + continue; + } + } else { + knote_deactivate(kn); + continue; + } + } + + /* + * Got a valid triggered knote with the kqueue + * still locked. Snapshot the data, and determine + * how to dispatch the knote for future events. + */ + kev = kn->kn_kevent; + + /* now what happens to it? */ + if (kn->kn_flags & EV_ONESHOT) { + knote_deactivate(kn); + if (kqlock2knotedrop(kq, kn)) { + kn->kn_fop->f_detach(kn); knote_drop(kn, p); - goto done; } + } else if (kn->kn_flags & EV_CLEAR) { + knote_deactivate(kn); + kn->kn_data = 0; + kn->kn_fflags = 0; + kqunlock(kq); } else { /* - * The user may change some filter values after the - * initial EV_ADD, but doing so will not reset any - * filter which have already been triggered. + * leave on in-process queue. We'll + * move all the remaining ones back + * the kq queue and wakeup any + * waiters when we are done. */ - kn->kn_sfflags = kev->fflags; - kn->kn_sdata = kev->data; - kn->kn_kevent.udata = kev->udata; + kqunlock(kq); } - s = splhigh(); - if (kn->kn_fop->f_event(kn, 0)) - KNOTE_ACTIVATE(kn); - splx(s); + /* callback to handle each event as we find it */ + error = (callback)(kq, &kev, data); + nevents++; - } else if (kev->flags & EV_DELETE) { - kn->kn_fop->f_detach(kn); - knote_drop(kn, p); - goto done; + kqlock(kq); } - if ((kev->flags & EV_DISABLE) && - ((kn->kn_status & KN_DISABLED) == 0)) { - s = splhigh(); - kn->kn_status |= KN_DISABLED; - splx(s); + /* + * With the kqueue still locked, move any knotes + * remaining on the in-process queue back to the + * kq's queue and wake up any waiters. 
+ */ + while ((kn = TAILQ_FIRST(&kq->kq_inprocess)) != NULL) { + assert(kn->kn_tq == &kq->kq_inprocess); + TAILQ_REMOVE(&kq->kq_inprocess, kn, kn_tqe); + kn->kn_tq = &kq->kq_head; + TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); } - - if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) { - s = splhigh(); - kn->kn_status &= ~KN_DISABLED; - if ((kn->kn_status & KN_ACTIVE) && - ((kn->kn_status & KN_QUEUED) == 0)) - knote_enqueue(kn); - splx(s); + if (kq->kq_state & KQ_PROCWAIT) { + kq->kq_state &= ~KQ_PROCWAIT; + thread_wakeup(&kq->kq_inprocess); } -done: - if (fp != NULL) - fdrop(fp, p); - return (error); + *countp = nevents; + return error; } -static int -kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp, - const struct timespec *tsp, register_t *retval, struct proc *p) + +static void +kevent_scan_continue(void *data, wait_result_t wait_result) { - struct kqueue *kq = (struct kqueue *)fp->f_data; - struct timeval atv, rtv, ttv; - int s, count, timeout, error = 0; - struct knote marker; - - count = maxevents; - if (count == 0) - goto done; - - if (tsp != NULL) { - TIMESPEC_TO_TIMEVAL(&atv, tsp); - if (itimerfix(&atv)) { - error = EINVAL; - goto done; + uthread_t ut = (uthread_t)get_bsdthread_info(current_thread()); + struct _kevent_scan * cont_args = &ut->uu_state.ss_kevent_scan; + struct kqueue *kq = (struct kqueue *)data; + int error; + int count; + + /* convert the (previous) wait_result to a proper error */ + switch (wait_result) { + case THREAD_AWAKENED: + kqlock(kq); + error = kevent_process(kq, cont_args->call, cont_args, &count, current_proc()); + if (error == 0 && count == 0) { + assert_wait_deadline(kq, THREAD_ABORTSAFE, cont_args->deadline); + kq->kq_state |= KQ_SLEEP; + kqunlock(kq); + thread_block_parameter(kevent_scan_continue, kq); + /* NOTREACHED */ } - if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) - timeout = -1; - else - timeout = atv.tv_sec > 24 * 60 * 60 ? - 24 * 60 * 60 * hz : tvtohz(&atv); - getmicrouptime(&rtv); - timevaladd(&atv, &rtv); - } else { - atv.tv_sec = 0; - atv.tv_usec = 0; - timeout = 0; + kqunlock(kq); + break; + case THREAD_TIMED_OUT: + error = EWOULDBLOCK; + break; + case THREAD_INTERRUPTED: + error = EINTR; + break; + default: + panic("kevent_scan_cont() - invalid wait_result (%d)", wait_result); + error = 0; } - goto start; + + /* call the continuation with the results */ + assert(cont_args->cont != NULL); + (cont_args->cont)(kq, cont_args->data, error); +} -retry: - if (atv.tv_sec || atv.tv_usec) { - getmicrouptime(&rtv); - if (timevalcmp(&rtv, &atv, >=)) - goto done; - ttv = atv; - timevalsub(&ttv, &rtv); - timeout = ttv.tv_sec > 24 * 60 * 60 ? - 24 * 60 * 60 * hz : tvtohz(&ttv); - } -start: - s = splhigh(); - if (kq->kq_count == 0) { - if (timeout < 0) { - error = EWOULDBLOCK; - } else { - kq->kq_state |= KQ_SLEEP; - error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout); - } - splx(s); - if (error == 0) - goto retry; - /* don't restart after signals... */ - if (error == ERESTART) - error = EINTR; - else if (error == EWOULDBLOCK) - error = 0; - goto done; - } +/* + * kevent_scan - scan and wait for events in a kqueue + * + * Process the triggered events in a kqueue. + * + * If there are no events triggered arrange to + * wait for them. If the caller provided a + * continuation routine, then kevent_scan will + * also. + * + * The callback routine must be valid. + * The caller must hold a use-count reference on the kq. 
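+ *
+ *	(Aside, added for clarity: when a continuation is supplied, the
+ *	blocking path below does not return here - thread_block_parameter()
+ *	resumes in kevent_scan_continue() above on wakeup, which re-runs
+ *	kevent_process() and hands the result to the saved continuation.)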
+ */ - /* JMM - This marker trick doesn't work with multiple threads */ - TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe); - while (count) { - int maxkev = (count > KQ_NEVENTS) ? KQ_NEVENTS : count; - struct kevent kq_kev[maxkev]; - struct kevent *kevp = kq_kev; - struct knote *kn; - int nkev = 0; - - while (nkev < maxkev) { - kn = TAILQ_FIRST(&kq->kq_head); - TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); - if (kn == &marker) { - if (count == maxevents) - goto retry; - break; - } else if (kn->kn_status & KN_DISABLED) { - kn->kn_status &= ~KN_QUEUED; - kq->kq_count--; - continue; - } else if ((kn->kn_flags & EV_ONESHOT) == 0 && - kn->kn_fop->f_event(kn, 0) == 0) { - kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); - kq->kq_count--; - continue; - } +int +kevent_scan(struct kqueue *kq, + kevent_callback_t callback, + kevent_continue_t continuation, + void *data, + struct timeval *atvp, + struct proc *p) +{ + thread_continue_t cont = THREAD_CONTINUE_NULL; + uint64_t deadline; + int error; + int first; - *kevp = kn->kn_kevent; - kevp++; - nkev++; - count--; + assert(callback != NULL); - if (kn->kn_flags & EV_ONESHOT) { - kn->kn_status &= ~KN_QUEUED; - kq->kq_count--; - splx(s); - kn->kn_fop->f_detach(kn); - knote_drop(kn, p); - s = splhigh(); - } else if (kn->kn_flags & EV_CLEAR) { - kn->kn_data = 0; - kn->kn_fflags = 0; - kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); - kq->kq_count--; + first = 1; + for (;;) { + wait_result_t wait_result; + int count; + + /* + * Make a pass through the kq to find events already + * triggered. + */ + kqlock(kq); + error = kevent_process(kq, callback, data, &count, p); + if (error || count) + break; /* lock still held */ + + /* looks like we have to consider blocking */ + if (first) { + first = 0; + /* convert the timeout to a deadline once */ + if (atvp->tv_sec || atvp->tv_usec) { + uint32_t seconds, nanoseconds; + uint64_t now; + + clock_get_uptime(&now); + nanoseconds_to_absolutetime((uint64_t)atvp->tv_sec * NSEC_PER_SEC + + atvp->tv_usec * NSEC_PER_USEC, + &deadline); + if (now >= deadline) { + /* non-blocking call */ + error = EWOULDBLOCK; + break; /* lock still held */ + } + deadline -= now; + clock_absolutetime_interval_to_deadline(deadline, &deadline); } else { - TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); + deadline = 0; /* block forever */ + } + + if (continuation) { + uthread_t ut = (uthread_t)get_bsdthread_info(current_thread()); + struct _kevent_scan *cont_args = &ut->uu_state.ss_kevent_scan; + + cont_args->call = callback; + cont_args->cont = continuation; + cont_args->deadline = deadline; + cont_args->data = data; + cont = kevent_scan_continue; } } - splx(s); - error = copyout((caddr_t)kq_kev, (caddr_t)ulistp, - sizeof(struct kevent) * nkev); - if (kn == &marker) - goto done; - ulistp += nkev; - s = splhigh(); - if (error) - break; + + /* go ahead and wait */ + assert_wait_deadline(kq, THREAD_ABORTSAFE, deadline); + kq->kq_state |= KQ_SLEEP; + kqunlock(kq); + wait_result = thread_block_parameter(cont, kq); + /* NOTREACHED if (continuation != NULL) */ + + switch (wait_result) { + case THREAD_AWAKENED: + continue; + case THREAD_TIMED_OUT: + return EWOULDBLOCK; + case THREAD_INTERRUPTED: + return EINTR; + default: + panic("kevent_scan - bad wait_result (%d)", + wait_result); + error = 0; + } } - TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe); - splx(s); -done: - *retval = maxevents - count; - return (error); + kqunlock(kq); + return error; } + /* * XXX * This could be expanded to call kqueue_scan, if desired. 
 */
 /*ARGSUSED*/
 static int
-kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred,
-	int flags, struct proc *p)
+kqueue_read(__unused struct fileproc *fp,
+	    __unused struct uio *uio,
+	    __unused kauth_cred_t cred,
+	    __unused int flags,
+	    __unused struct proc *p)
 {
 	return (ENXIO);
 }

 /*ARGSUSED*/
 static int
-kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred,
-	int flags, struct proc *p)
+kqueue_write(__unused struct fileproc *fp,
+	     __unused struct uio *uio,
+	     __unused kauth_cred_t cred,
+	     __unused int flags,
+	     __unused struct proc *p)
 {
 	return (ENXIO);
 }

 /*ARGSUSED*/
 static int
-kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct proc *p)
+kqueue_ioctl(__unused struct fileproc *fp,
+	     __unused u_long com,
+	     __unused caddr_t data,
+	     __unused struct proc *p)
 {
 	return (ENOTTY);
 }

 /*ARGSUSED*/
 static int
-kqueue_select(struct file *fp, int which, void *wql, struct proc *p)
+kqueue_select(struct fileproc *fp, int which, void *wql, struct proc *p)
 {
 	struct kqueue *kq = (struct kqueue *)fp->f_data;
 	int retnum = 0;
-	int s = splnet();

 	if (which == FREAD) {
+		kqlock(kq);
 		if (kq->kq_count) {
 			retnum = 1;
 		} else {
-		        selrecord(p, &kq->kq_sel, wql);
+			selrecord(p, &kq->kq_sel, wql);
 			kq->kq_state |= KQ_SEL;
 		}
+		kqunlock(kq);
 	}
-	splx(s);
 	return (retnum);
 }

+/*
+ * kqueue_close - release the kqueue when the last file
+ * reference on it goes away
+ */
 /*ARGSUSED*/
 static int
-kqueue_close(struct file *fp, struct proc *p)
+kqueue_close(struct fileglob *fg, struct proc *p)
 {
-	struct kqueue *kq = (struct kqueue *)fp->f_data;
-	struct filedesc *fdp = p->p_fd;
-	struct knote **knp, *kn, *kn0;
-	int i;
-
-	for (i = 0; i < fdp->fd_knlistsize; i++) {
-		knp = &SLIST_FIRST(&fdp->fd_knlist[i]);
-		kn = *knp;
-		while (kn != NULL) {
-			kn0 = SLIST_NEXT(kn, kn_link);
-			if (kq == kn->kn_kq) {
-				kn->kn_fop->f_detach(kn);
-				fdrop(kn->kn_fp, p);
-				knote_free(kn);
-				*knp = kn0;
-			} else {
-				knp = &SLIST_NEXT(kn, kn_link);
-			}
-			kn = kn0;
-		}
-	}
-	if (fdp->fd_knhashmask != 0) {
-		for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
-			knp = &SLIST_FIRST(&fdp->fd_knhash[i]);
-			kn = *knp;
-			while (kn != NULL) {
-				kn0 = SLIST_NEXT(kn, kn_link);
-				if (kq == kn->kn_kq) {
-					kn->kn_fop->f_detach(kn);
-					/* XXX non-fd release of kn->kn_ptr */
-					knote_free(kn);
-					*knp = kn0;
-				} else {
-					knp = &SLIST_NEXT(kn, kn_link);
-				}
-				kn = kn0;
-			}
-		}
-	}
-	_FREE(kq, M_KQUEUE);
-	fp->f_data = NULL;
+	struct kqueue *kq = (struct kqueue *)fg->fg_data;

+	kqueue_dealloc(kq, p);
+	fg->fg_data = NULL;
 	return (0);
 }

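(Illustrative aside - not part of the patch.  The kqfilter entry point
below is what lets one kqueue watch another from user space:)

	int inner = kqueue(), outer = kqueue();
	struct kevent kev;

	/* "outer" becomes readable when "inner" has pending events */
	EV_SET(&kev, inner, EVFILT_READ, EV_ADD, 0, 0, NULL);
	kevent(outer, &kev, 1, NULL, 0, NULL);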
 /*ARGSUSED*/
+/*
+ * The caller has taken a use-count reference on this kqueue and will donate it
+ * to the kqueue we are being added to.  This keeps the kqueue from closing until
+ * that relationship is torn down.
+ */
 static int
-kqueue_kqfilter(struct file *fp, struct knote *kn, struct proc *p)
+kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused struct proc *p)
 {
 	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;

-	if (kn->kn_filter != EVFILT_READ || (kq->kq_state & KQ_SEL))
+	if (kn->kn_filter != EVFILT_READ)
 		return (1);

 	kn->kn_fop = &kqread_filtops;
+	kqlock(kq);
 	KNOTE_ATTACH(&kq->kq_sel.si_note, kn);
+	kqunlock(kq);
 	return (0);
 }

 /*ARGSUSED*/
 int
-kqueue_stat(struct file *fp, struct stat *st, struct proc *p)
+kqueue_stat(struct fileproc *fp, struct stat *st, __unused struct proc *p)
 {
 	struct kqueue *kq = (struct kqueue *)fp->f_data;

@@ -916,19 +1691,22 @@ kqueue_stat(struct file *fp, struct stat *st, struct proc *p)
 	return (0);
 }

+/*
+ * Called with the kqueue locked
+ */
 static void
 kqueue_wakeup(struct kqueue *kq)
 {

 	if (kq->kq_state & KQ_SLEEP) {
 		kq->kq_state &= ~KQ_SLEEP;
-		wakeup(kq);
+		thread_wakeup(kq);
 	}
 	if (kq->kq_state & KQ_SEL) {
-		// kq->kq_state &= ~KQ_SEL;	/* remove for now */
+		kq->kq_state &= ~KQ_SEL;
 		selwakeup(&kq->kq_sel);
-	} else
-		KNOTE(&kq->kq_sel.si_note, 0);
+	}
+	KNOTE(&kq->kq_sel.si_note, 0);
 }

 void
@@ -937,21 +1715,46 @@ klist_init(struct klist *list)
 	SLIST_INIT(list);
 }

+
 /*
- * walk down a list of knotes, activating them if their event has triggered.
+ * Query/Post each knote in the object's list
+ *
+ *	The object lock protects the list.  It is assumed
+ *	that the filter/event routine for the object can
+ *	determine that the object is already locked (via
+ *	the hint) and not deadlock itself.
+ *
+ *	The object lock should also hold off pending
+ *	detach/drop operations.  But we'll prevent it here
+ *	too - just in case.
 */
 void
 knote(struct klist *list, long hint)
 {
 	struct knote *kn;

-	SLIST_FOREACH(kn, list, kn_selnext)
-		if (kn->kn_fop->f_event(kn, hint))
-			KNOTE_ACTIVATE(kn);
+	SLIST_FOREACH(kn, list, kn_selnext) {
+		struct kqueue *kq = kn->kn_kq;
+
+		kqlock(kq);
+		if (kqlock2knoteuse(kq, kn)) {
+			int result;
+
+			/* call the event with only a use count */
+			result = kn->kn_fop->f_event(kn, hint);
+
+			/* if it's not going away and triggered */
+			if (knoteuse2kqlock(kq, kn) && result)
+				knote_activate(kn);
+			/* lock held again */
+		}
+		kqunlock(kq);
+	}
 }

 /*
  * attach a knote to the specified list.  Return true if this is the first entry.
+ * The list is protected by whatever lock the object it is associated with uses.
  */
 int
 knote_attach(struct klist *list, struct knote *kn)
@@ -963,6 +1766,7 @@ knote_attach(struct klist *list, struct knote *kn)

 /*
  * detach a knote from the specified list.  Return true if that was the last entry.
+ * The list is protected by whatever lock the object it is associated with uses.
  */
 int
 knote_detach(struct klist *list, struct knote *kn)
@@ -972,67 +1776,88 @@ knote_detach(struct klist *list, struct knote *kn)
 }

 /*
- * remove all knotes from a specified klist
+ * remove all knotes referencing a specified fd
+ *
+ * Essentially an inlined knote_remove & knote_drop
+ * when we know for sure that the thing is a file
+ *
+ * Entered with the proc_fd lock already held.
+ * It returns the same way, but may drop it temporarily.
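+ *
+ * (Called, for instance, from the descriptor-close path, so that no
+ * knote can outlive the file it refers to.)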
*/ void -knote_remove(struct proc *p, struct klist *list) +knote_fdclose(struct proc *p, int fd) { + struct filedesc *fdp = p->p_fd; + struct klist *list; struct knote *kn; + list = &fdp->fd_knlist[fd]; while ((kn = SLIST_FIRST(list)) != NULL) { - kn->kn_fop->f_detach(kn); - knote_drop(kn, p); - } -} + struct kqueue *kq = kn->kn_kq; -/* - * remove all knotes referencing a specified fd - */ -void -knote_fdclose(struct proc *p, int fd) -{ - struct filedesc *fdp = p->p_fd; - struct klist *list = &fdp->fd_knlist[fd]; + kqlock(kq); + proc_fdunlock(p); + + /* + * Convert the lock to a drop ref. + * If we get it, go ahead and drop it. + * Otherwise, we waited for it to + * be dropped by the other guy, so + * it is safe to move on in the list. + */ + if (kqlock2knotedrop(kq, kn)) { + kn->kn_fop->f_detach(kn); + knote_drop(kn, p); + } + + proc_fdlock(p); - knote_remove(p, list); + /* the fd tables may have changed - start over */ + list = &fdp->fd_knlist[fd]; + } } -static void -knote_fdpattach(struct knote *kn, struct filedesc *fdp) +/* proc_fdlock held on entry (and exit) */ +static int +knote_fdpattach(struct knote *kn, struct filedesc *fdp, __unused struct proc *p) { - struct klist *list; - int size; + struct klist *list = NULL; if (! kn->kn_fop->f_isfd) { if (fdp->fd_knhashmask == 0) fdp->fd_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, &fdp->fd_knhashmask); list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; - goto done; - } - - if (fdp->fd_knlistsize <= kn->kn_id) { - size = fdp->fd_knlistsize; - while (size <= kn->kn_id) - size += KQEXTENT; - MALLOC(list, struct klist *, - size * sizeof(struct klist *), M_KQUEUE, M_WAITOK); - bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list, - fdp->fd_knlistsize * sizeof(struct klist *)); - bzero((caddr_t)list + - fdp->fd_knlistsize * sizeof(struct klist *), - (size - fdp->fd_knlistsize) * sizeof(struct klist *)); - if (fdp->fd_knlist != NULL) + } else { + if ((u_int)fdp->fd_knlistsize <= kn->kn_id) { + u_int size = 0; + + /* have to grow the fd_knlist */ + size = fdp->fd_knlistsize; + while (size <= kn->kn_id) + size += KQEXTENT; + MALLOC(list, struct klist *, + size * sizeof(struct klist *), M_KQUEUE, M_WAITOK); + if (list == NULL) + return (ENOMEM); + + bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list, + fdp->fd_knlistsize * sizeof(struct klist *)); + bzero((caddr_t)list + + fdp->fd_knlistsize * sizeof(struct klist *), + (size - fdp->fd_knlistsize) * sizeof(struct klist *)); FREE(fdp->fd_knlist, M_KQUEUE); - fdp->fd_knlistsize = size; - fdp->fd_knlist = list; + fdp->fd_knlist = list; + fdp->fd_knlistsize = size; + } + list = &fdp->fd_knlist[kn->kn_id]; } - list = &fdp->fd_knlist[kn->kn_id]; -done: SLIST_INSERT_HEAD(list, kn, kn_link); - kn->kn_status = 0; + return (0); } + + /* * should be called at spl == 0, since we don't want to hold spl * while calling fdrop and free. 
@@ -1041,55 +1866,97 @@ static void knote_drop(struct knote *kn, struct proc *p) { struct filedesc *fdp = p->p_fd; + struct kqueue *kq = kn->kn_kq; struct klist *list; + proc_fdlock(p); if (kn->kn_fop->f_isfd) list = &fdp->fd_knlist[kn->kn_id]; else list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; SLIST_REMOVE(list, kn, knote, kn_link); - if (kn->kn_status & KN_QUEUED) - knote_dequeue(kn); + kqlock(kq); + knote_dequeue(kn); + if (kn->kn_status & KN_DROPWAIT) + thread_wakeup(&kn->kn_status); + kqunlock(kq); + proc_fdunlock(p); + if (kn->kn_fop->f_isfd) - fdrop(kn->kn_fp, p); + fp_drop(p, kn->kn_id, kn->kn_fp, 0); + knote_free(kn); } +/* called with kqueue lock held */ +static void +knote_activate(struct knote *kn) +{ + struct kqueue *kq = kn->kn_kq; + + kn->kn_status |= KN_ACTIVE; + knote_enqueue(kn); + kqueue_wakeup(kq); + } + +/* called with kqueue lock held */ +static void +knote_deactivate(struct knote *kn) +{ + kn->kn_status &= ~KN_ACTIVE; + knote_dequeue(kn); +} +/* called with kqueue lock held */ static void knote_enqueue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; - int s = splhigh(); - KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); + if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) { + struct kqtailq *tq = kn->kn_tq; - TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); - kn->kn_status |= KN_QUEUED; - kq->kq_count++; - splx(s); - kqueue_wakeup(kq); + TAILQ_INSERT_TAIL(tq, kn, kn_tqe); + kn->kn_status |= KN_QUEUED; + kq->kq_count++; + } } +/* called with kqueue lock held */ static void knote_dequeue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; - int s = splhigh(); - KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); + assert((kn->kn_status & KN_DISABLED) == 0); + if ((kn->kn_status & KN_QUEUED) == KN_QUEUED) { + struct kqtailq *tq = kn->kn_tq; - TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); - kn->kn_status &= ~KN_QUEUED; - kq->kq_count--; - splx(s); + TAILQ_REMOVE(tq, kn, kn_tqe); + kn->kn_tq = &kq->kq_head; + kn->kn_status &= ~KN_QUEUED; + kq->kq_count--; + } } void knote_init(void) { knote_zone = zinit(sizeof(struct knote), 8192*sizeof(struct knote), 8192, "knote zone"); + + /* allocate kq lock group attribute and group */ + kq_lck_grp_attr= lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(kq_lck_grp_attr); + + kq_lck_grp = lck_grp_alloc_init("kqueue", kq_lck_grp_attr); + + /* Allocate kq lock attribute */ + kq_lck_attr = lck_attr_alloc_init(); + lck_attr_setdefault(kq_lck_attr); + + /* Initialize the timer filter lock */ + lck_mtx_init(&_filt_timerlock, kq_lck_grp, kq_lck_attr); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL) @@ -1102,7 +1969,7 @@ knote_alloc(void) static void knote_free(struct knote *kn) { - zfree(knote_zone, (vm_offset_t)kn); + zfree(knote_zone, kn); } #include <sys/param.h> @@ -1116,8 +1983,17 @@ knote_free(struct knote *kn) #include <sys/syslog.h> -int raw_usrreq(); -struct pr_usrreqs event_usrreqs; +static int kev_attach(struct socket *so, int proto, struct proc *p); +static int kev_detach(struct socket *so); +static int kev_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct proc *p); + +struct pr_usrreqs event_usrreqs = { + pru_abort_notsupp, pru_accept_notsupp, kev_attach, pru_bind_notsupp, pru_connect_notsupp, + pru_connect2_notsupp, kev_control, kev_detach, pru_disconnect_notsupp, + pru_listen_notsupp, pru_peeraddr_notsupp, pru_rcvd_notsupp, pru_rcvoob_notsupp, + pru_send_notsupp, pru_sense_null, pru_shutdown_notsupp, pru_sockaddr_notsupp, + pru_sosend_notsupp, 
soreceive, pru_sopoll_notsupp +}; struct protosw eventsw[] = { { @@ -1125,7 +2001,14 @@ struct protosw eventsw[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, &event_usrreqs +#if __APPLE__ + 0, +#endif + &event_usrreqs, + 0, 0, 0, +#if __APPLE__ + {0, 0}, 0, {0} +#endif } }; @@ -1133,7 +2016,12 @@ static struct kern_event_head kern_event_head; static u_long static_event_id = 0; +struct domain *sysdom = &systemdomain; +static lck_grp_t *evt_mtx_grp; +static lck_attr_t *evt_mtx_attr; +static lck_grp_attr_t *evt_mtx_grp_attr; +lck_mtx_t *evt_mutex; /* * Install the protosw's for the NKE manager. Invoked at * extension load time @@ -1143,14 +2031,32 @@ kern_event_init(void) { int retval; - if ((retval = net_add_proto(eventsw, &systemdomain)) == 0) - return(KERN_SUCCESS); + if ((retval = net_add_proto(eventsw, &systemdomain)) != 0) { + log(LOG_WARNING, "Can't install kernel events protocol (%d)\n", retval); + return(retval); + } - log(LOG_WARNING, "Can't install kernel events protocol (%d)\n", retval); - return(retval); + /* + * allocate lock group attribute and group for kern event + */ + evt_mtx_grp_attr = lck_grp_attr_alloc_init(); + + evt_mtx_grp = lck_grp_alloc_init("eventlist", evt_mtx_grp_attr); + + /* + * allocate the lock attribute for mutexes + */ + evt_mtx_attr = lck_attr_alloc_init(); + lck_attr_setdefault(evt_mtx_attr); + evt_mutex = lck_mtx_alloc_init(evt_mtx_grp, evt_mtx_attr); + if (evt_mutex == NULL) + return (ENOMEM); + + return(KERN_SUCCESS); } -int kev_attach(struct socket *so, int proto, struct proc *p) +static int +kev_attach(struct socket *so, __unused int proto, __unused struct proc *p) { int error; struct kern_event_pcb *ev_pcb; @@ -1159,7 +2065,7 @@ int kev_attach(struct socket *so, int proto, struct proc *p) if (error) return error; - ev_pcb = _MALLOC(sizeof(struct kern_event_pcb), M_PCB, M_WAITOK); + MALLOC(ev_pcb, struct kern_event_pcb *, sizeof(struct kern_event_pcb), M_PCB, M_WAITOK); if (ev_pcb == 0) return ENOBUFS; @@ -1167,25 +2073,69 @@ int kev_attach(struct socket *so, int proto, struct proc *p) ev_pcb->vendor_code_filter = 0xffffffff; so->so_pcb = (caddr_t) ev_pcb; + lck_mtx_lock(evt_mutex); LIST_INSERT_HEAD(&kern_event_head, ev_pcb, ev_link); + lck_mtx_unlock(evt_mutex); return 0; } -int kev_detach(struct socket *so) +static int +kev_detach(struct socket *so) { struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb; if (ev_pcb != 0) { - LIST_REMOVE(ev_pcb, ev_link); - FREE(ev_pcb, M_PCB); - so->so_pcb = 0; + lck_mtx_lock(evt_mutex); + LIST_REMOVE(ev_pcb, ev_link); + lck_mtx_unlock(evt_mutex); + FREE(ev_pcb, M_PCB); + so->so_pcb = 0; + so->so_flags |= SOF_PCBCLEARING; } return 0; } +/* + * For now, kev_vender_code and mbuf_tags use the same + * mechanism. 
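+ *
+ * (Illustrative: a kext might do
+ *
+ *	u_long code;
+ *	if (kev_vendor_code_find("com.example.driver", &code) == 0)
+ *		event_msg.vendor_code = code;
+ *
+ * so vendor codes and mbuf tag ids are drawn from one namespace.)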
+ */ +extern errno_t mbuf_tag_id_find_internal(const char *string, u_long *out_id, + int create); + +errno_t kev_vendor_code_find( + const char *string, + u_long *out_vender_code) +{ + if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) { + return EINVAL; + } + return mbuf_tag_id_find_internal(string, out_vender_code, 1); +} + +extern void mbuf_tag_id_first_last(u_long *first, u_long *last); + +errno_t kev_msg_post(struct kev_msg *event_msg) +{ + u_long min_vendor, max_vendor; + + mbuf_tag_id_first_last(&min_vendor, &max_vendor); + + if (event_msg == NULL) + return EINVAL; + + /* Limit third parties to posting events for registered vendor codes only */ + if (event_msg->vendor_code < min_vendor || + event_msg->vendor_code > max_vendor) + { + return EINVAL; + } + + return kev_post_msg(event_msg); +} + int kev_post_msg(struct kev_msg *event_msg) { @@ -1193,9 +2143,21 @@ int kev_post_msg(struct kev_msg *event_msg) struct kern_event_pcb *ev_pcb; struct kern_event_msg *ev; char *tmp; - int total_size; + unsigned long total_size; int i; + /* Verify the message is small enough to fit in one mbuf w/o cluster */ + total_size = KEV_MSG_HEADER_SIZE; + + for (i = 0; i < 5; i++) { + if (event_msg->dv[i].data_length == 0) + break; + total_size += event_msg->dv[i].data_length; + } + + if (total_size > MLEN) { + return EMSGSIZE; + } m = m_get(M_DONTWAIT, MT_DATA); if (m == 0) @@ -1215,7 +2177,6 @@ int kev_post_msg(struct kev_msg *event_msg) tmp += event_msg->dv[i].data_length; } - ev->id = ++static_event_id; ev->total_size = total_size; ev->vendor_code = event_msg->vendor_code; @@ -1224,6 +2185,7 @@ int kev_post_msg(struct kev_msg *event_msg) ev->event_code = event_msg->event_code; m->m_len = total_size; + lck_mtx_lock(evt_mutex); for (ev_pcb = LIST_FIRST(&kern_event_head); ev_pcb; ev_pcb = LIST_NEXT(ev_pcb, ev_link)) { @@ -1245,67 +2207,69 @@ int kev_post_msg(struct kev_msg *event_msg) m2 = m_copym(m, 0, m->m_len, M_NOWAIT); if (m2 == 0) { m_free(m); + lck_mtx_unlock(evt_mutex); return ENOBUFS; } - - sbappendrecord(&ev_pcb->ev_socket->so_rcv, m2); - sorwakeup(ev_pcb->ev_socket); + socket_lock(ev_pcb->ev_socket, 1); + if (sbappendrecord(&ev_pcb->ev_socket->so_rcv, m2)) + sorwakeup(ev_pcb->ev_socket); + socket_unlock(ev_pcb->ev_socket, 1); } - m_free(m); + lck_mtx_unlock(evt_mutex); return 0; } - -int kev_control(so, cmd, data, ifp, p) - struct socket *so; - u_long cmd; - caddr_t data; - register struct ifnet *ifp; - struct proc *p; +static int +kev_control(struct socket *so, + u_long cmd, + caddr_t data, + __unused struct ifnet *ifp, + __unused struct proc *p) { - struct kev_request *kev_req = (struct kev_request *) data; - int stat = 0; - struct kern_event_pcb *ev_pcb; - u_long *id_value = (u_long *) data; - - - switch (cmd) { - - case SIOCGKEVID: - *id_value = static_event_id; - break; - - case SIOCSKEVFILT: - ev_pcb = (struct kern_event_pcb *) so->so_pcb; - ev_pcb->vendor_code_filter = kev_req->vendor_code; - ev_pcb->class_filter = kev_req->kev_class; - ev_pcb->subclass_filter = kev_req->kev_subclass; - break; - - case SIOCGKEVFILT: - ev_pcb = (struct kern_event_pcb *) so->so_pcb; - kev_req->vendor_code = ev_pcb->vendor_code_filter; - kev_req->kev_class = ev_pcb->class_filter; - kev_req->kev_subclass = ev_pcb->subclass_filter; - break; - - default: - return EOPNOTSUPP; - } - - return 0; + struct kev_request *kev_req = (struct kev_request *) data; + struct kern_event_pcb *ev_pcb; + struct kev_vendor_code *kev_vendor; + u_long *id_value = (u_long *) data; + + + switch (cmd) { + + case SIOCGKEVID: + 
*id_value = static_event_id; + break; + + case SIOCSKEVFILT: + ev_pcb = (struct kern_event_pcb *) so->so_pcb; + ev_pcb->vendor_code_filter = kev_req->vendor_code; + ev_pcb->class_filter = kev_req->kev_class; + ev_pcb->subclass_filter = kev_req->kev_subclass; + break; + + case SIOCGKEVFILT: + ev_pcb = (struct kern_event_pcb *) so->so_pcb; + kev_req->vendor_code = ev_pcb->vendor_code_filter; + kev_req->kev_class = ev_pcb->class_filter; + kev_req->kev_subclass = ev_pcb->subclass_filter; + break; + + case SIOCGKEVVENDOR: + kev_vendor = (struct kev_vendor_code*)data; + + /* Make sure string is NULL terminated */ + kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN-1] = 0; + + return mbuf_tag_id_find_internal(kev_vendor->vendor_string, + &kev_vendor->vendor_code, 0); + + default: + return ENOTSUP; + } + + return 0; } -struct pr_usrreqs event_usrreqs = { - pru_abort_notsupp, pru_accept_notsupp, kev_attach, pru_bind_notsupp, pru_connect_notsupp, - pru_connect2_notsupp, kev_control, kev_detach, pru_disconnect_notsupp, - pru_listen_notsupp, pru_peeraddr_notsupp, pru_rcvd_notsupp, pru_rcvoob_notsupp, - pru_send_notsupp, pru_sense_null, pru_shutdown_notsupp, pru_sockaddr_notsupp, - pru_sosend_notsupp, soreceive, sopoll -}; - diff --git a/bsd/kern/kern_exec.c b/bsd/kern/kern_exec.c index 191e3f396..e3ed77f2c 100644 --- a/bsd/kern/kern_exec.c +++ b/bsd/kern/kern_exec.c @@ -74,37 +74,53 @@ #include <sys/systm.h> #include <sys/filedesc.h> #include <sys/kernel.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/user.h> -#include <sys/buf.h> #include <sys/socketvar.h> #include <sys/malloc.h> #include <sys/namei.h> -#include <sys/mount.h> -#include <sys/vnode.h> -#include <sys/file.h> +#include <sys/mount_internal.h> +#include <sys/vnode_internal.h> +#include <sys/file_internal.h> #include <sys/stat.h> -#include <sys/uio.h> +#include <sys/uio_internal.h> #include <sys/acct.h> #include <sys/exec.h> #include <sys/kdebug.h> #include <sys/signal.h> #include <sys/aio_kern.h> +#include <sys/sysproto.h> +#include <sys/shm_internal.h> /* shmexec() */ +#include <sys/ubc_internal.h> /* ubc_map() */ #include <bsm/audit_kernel.h> +#include <mach/mach_types.h> +#include <mach/task.h> +#include <mach/thread_act.h> +#include <mach/vm_map.h> +#include <mach/mach_vm.h> #include <mach/vm_param.h> #include <vm/vm_map.h> - -extern vm_map_t vm_map_switch(vm_map_t map); /* XXX */ - #include <vm/vm_kern.h> +#include <vm/vm_pager.h> +#include <vm/vm_kern.h> +#include <vm/task_working_set.h> #include <vm/vm_shared_memory_server.h> +/* + * Mach things for which prototypes are unavailable from Mach headers + */ +void ipc_task_reset( + task_t task); + +extern struct savearea *get_user_regs(thread_t); + + #include <kern/thread.h> #include <kern/task.h> - #include <kern/ast.h> #include <kern/mach_loader.h> #include <mach-o/fat.h> @@ -112,152 +128,723 @@ extern vm_map_t vm_map_switch(vm_map_t map); /* XXX */ #include <machine/vmparam.h> #if KTRACE #include <sys/ktrace.h> -#include <sys/ubc.h> #endif +#include <sys/imgact.h> + + +/* + * SIZE_MAXPTR The maximum size of a user space pointer, in bytes + * SIZE_IMG_STRSPACE The available string space, minus two pointers; we + * define it interms of the maximum, since we don't + * know the pointer size going in, until after we've + * parsed the executable image. 
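+ *
+ *			(Worked out: with 8-byte pointers assumed, the
+ *			reserve is 2 * SIZE_MAXPTR = 16 bytes, so
+ *			SIZE_IMG_STRSPACE comes to NCARGS - 16 bytes of
+ *			usable string space.)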
+ */ +#define SIZE_MAXPTR 8 /* 64 bits */ +#define SIZE_IMG_STRSPACE (NCARGS - 2 * SIZE_MAXPTR) int app_profile = 0; extern vm_map_t bsd_pageable_map; +extern struct fileops vnops; #define ROUND_PTR(type, addr) \ (type *)( ( (unsigned)(addr) + 16 - 1) \ & ~(16 - 1) ) +struct image_params; /* Forward */ +static int exec_copyout_strings(struct image_params *imgp, user_addr_t *stackp); static int load_return_to_errno(load_return_t lrtn); -int execve(struct proc *p, struct execve_args *uap, register_t *retval); -static int execargs_alloc(vm_offset_t *addrp); -static int execargs_free(vm_offset_t addr); +static int execargs_alloc(struct image_params *imgp); +static int execargs_free(struct image_params *imgp); +static int exec_check_permissions(struct image_params *imgp); +static int exec_extract_strings(struct image_params *imgp); +static int exec_handle_sugid(struct image_params *imgp); static int sugid_scripts = 0; SYSCTL_INT (_kern, OID_AUTO, sugid_scripts, CTLFLAG_RW, &sugid_scripts, 0, ""); +static kern_return_t create_unix_stack(vm_map_t map, user_addr_t user_stack, + int customstack, struct proc *p); +static int copyoutptr(user_addr_t ua, user_addr_t ptr, int ptr_size); + +/* XXX forward; should be in headers, but can't be for one reason or another */ +extern int grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype); +extern void vfork_return(thread_t th_act, + struct proc * p, + struct proc *p2, + register_t *retval); -int -execv(p, args, retval) - struct proc *p; - void *args; - int *retval; -{ - ((struct execve_args *)args)->envp = NULL; - return (execve(p, args, retval)); -} extern char classichandler[32]; -extern long classichandler_fsid; +extern uint32_t classichandler_fsid; extern long classichandler_fileid; + /* - * Helper routine to get rid of a loop in execve. Given a pointer to - * something for the arg list (which might be in kernel space or in user - * space), copy it into the kernel buffer at the currentWritePt. This code - * does the proper thing to get the data transferred. - * bytesWritten, currentWritePt, and bytesLeft are kept up-to-date. + * exec_add_string + * + * Add the requested string to the string space area. + * + * Parameters; struct image_params * image parameter block + * user_addr_t string to add to strings area + * uio_seg segment where string is located + * + * Returns: 0 Success + * !0 Failure errno from copyinstr() + * + * Implicit returns: + * (imgp->ip_strendp) updated location of next add, if any + * (imgp->ip_strspace) updated byte count of space remaining */ - -static int copyArgument(char *argument, int pointerInKernel, - int *bytesWritten,char **currentWritePt, - int *bytesLeft){ +static int +exec_add_string(struct image_params *imgp, user_addr_t str, /*uio_seg*/int seg) +{ int error = 0; + do { size_t len = 0; - if (*bytesLeft <= 0) { + if (imgp->ip_strspace <= 0) { error = E2BIG; break; } - if (pointerInKernel == UIO_SYSSPACE) { - error = copystr(argument, *currentWritePt, (unsigned)*bytesLeft, &len); + if (IS_UIO_SYS_SPACE(seg)) { + char *kstr = CAST_DOWN(char *,str); /* SAFE */ + error = copystr(kstr, imgp->ip_strendp, imgp->ip_strspace, &len); } else { - /* - * pointer in kernel == UIO_USERSPACE - * Copy in from user space. 
-/* ARGSUSED */
-int
-execve(p, uap, retval)
- register struct proc *p;
- register struct execve_args *uap;
- register_t *retval;
+/*
+ * exec_save_path
+ *
+ * To support new app package launching for Mac OS X, the dyld needs the
+ * first argument to execve() stored on the user stack.
+ *
+ * Save the executable path name at the top of the strings area and set
+ * the argument vector pointer to the location following that to indicate
+ * the start of the argument and environment tuples, setting the remaining
+ * string space count to the size of the string area minus the path length
+ * and a reserve for two pointers.
+ *
+ * Parameters: struct image_params * image parameter block
+ * char * path used to invoke program
+ * uio_seg segment where path is located
+ *
+ * Returns: int 0 Success
+ * !0 Failure: error number
+ * Implicit returns:
+ * (imgp->ip_strings) saved path
+ * (imgp->ip_strspace) space remaining in ip_strings
+ * (imgp->ip_argv) beginning of argument list
+ * (imgp->ip_strendp) start of remaining copy area
+ *
+ * Note: We have to do this before the initial namei() since, if the
+ * path contains symbolic links, namei() will overwrite the
+ * original path buffer contents. If the last symbolic link
+ * resolved was a relative pathname, we would lose the original
+ * "path", which could be an absolute pathname. This might be
+ * unacceptable for dyld.
+ */
+static int
+exec_save_path(struct image_params *imgp, user_addr_t path, /*uio_seg*/int seg)
 {
- register struct ucred *cred = p->p_ucred;
- register struct filedesc *fdp = p->p_fd;
- int nc;
- char *cp;
- int na, ne, ucp, ap, cc;
- unsigned len;
- int executingInterpreter=0;
-
- int executingClassic=0;
- char binaryWithClassicName[sizeof(p->p_comm)] = {0};
- char *execnamep;
- struct vnode *vp;
- struct vattr vattr;
- struct vattr origvattr;
- vm_offset_t execargs;
- struct nameidata nd;
- struct ps_strings ps;
-#define SHSIZE 512
- /* Argument(s) to an interpreter. If we're executing a shell
- * script, the name (#!/bin/csh) is allowed to be followed by
- * arguments. cfarg holds these arguments.
+ int error;
+ size_t len;
+ char *kpath = CAST_DOWN(char *,path); /* SAFE */
+
+ imgp->ip_strendp = imgp->ip_strings;
+ imgp->ip_strspace = SIZE_IMG_STRSPACE;
+
+ len = MIN(MAXPATHLEN, imgp->ip_strspace);
+
+ switch( seg) {
+ case UIO_USERSPACE32:
+ case UIO_USERSPACE64: /* Same for copyin()... */
+ error = copyinstr(path, imgp->ip_strings, len, &len);
+ break;
+ case UIO_SYSSPACE32:
+ error = copystr(kpath, imgp->ip_strings, len, &len);
+ break;
+ default:
+ error = EFAULT;
+ break;
+ }
+
+ if (!error) {
+ imgp->ip_strendp += len;
+ imgp->ip_strspace -= len;
+ imgp->ip_argv = imgp->ip_strendp;
+ }
+
+ return(error);
+}
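/*
 * Editor's aside -- not part of the patch.  A user-space sketch of the
 * buffer layout exec_save_path() leaves behind: the invocation path sits at
 * the base of the strings buffer, and the argv/env strings are appended
 * after it.  The demo_* names are hypothetical, not image_params fields.
 */
#include <stdio.h>
#include <string.h>

#define DEMO_STRSPACE 4096	/* stands in for SIZE_IMG_STRSPACE */

int
main(void)
{
	char demo_strings[DEMO_STRSPACE];	/* like ip_strings */
	const char *path = "/usr/bin/true";	/* path handed to execve() */
	size_t len = strlen(path) + 1;	/* copyinstr-style count incl. NUL */

	memcpy(demo_strings, path, len);

	/* like ip_strendp and ip_argv: both point just past the saved path */
	char *demo_argv = demo_strings + len;
	printf("argv strings start at offset %zu; %zu bytes remain\n",
	    (size_t)(demo_argv - demo_strings), (size_t)DEMO_STRSPACE - len);
	return 0;
}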
+
+
+
+/*
+ * exec_shell_imgact
+ *
+ * Image activator for interpreter scripts. If the image begins with the
+ * characters "#!", then it is an interpreter script. Verify that we are
+ * not already executing in Classic mode, and that the length of the script
+ * line indicating the interpreter is not in excess of the maximum allowed
+ * size. If this is the case, then break out the arguments, if any, which
+ * are separated by white space, and copy them into the argument save area
+ * as if they were provided on the command line before all other arguments.
+ * The line ends when we encounter a comment character ('#') or newline.
+ *
+ * Parameters: struct image_params * image parameter block
+ *
+ * Returns: -1 not an interpreter (keep looking)
+ * -3 Success: interpreter: relookup
+ * >0 Failure: interpreter: error number
+ *
+ * A return value other than -1 indicates subsequent image activators should
+ * not be given the opportunity to attempt to activate the image.
+ */
+static int
+exec_shell_imgact(struct image_params *imgp)
+{
+ char *vdata = imgp->ip_vdata;
+ char *ihp;
+ char *line_endp;
+ char *interp;
+
+ /*
+ * Make sure it's a shell script. If we've already redirected
+ * from an interpreted file once, don't do it again.
+ *
+ * Note: We disallow Classic, since the expectation is that we
+ * may run a Classic interpreter, but not interpret a Classic
+ * image. This is consistent with historical behaviour.
+ */
- char cfarg[SHSIZE];
- boolean_t is_fat;
- kern_return_t ret;
- struct mach_header *mach_header;
- struct fat_header *fat_header;
- struct fat_arch fat_arch;
- load_return_t lret;
- load_result_t load_result;
+ if (vdata[0] != '#' ||
+ vdata[1] != '!' ||
+ (imgp->ip_flags & IMGPF_INTERPRET) != 0) {
+ return (-1);
+ }
+
+
+ imgp->ip_flags |= IMGPF_INTERPRET;
+
+ /* Check to see if SUGID scripts are permitted. If they aren't then
+ * clear the SUGID bits.
+ * imgp->ip_vattr is known to be valid.
+ */
+ if (sugid_scripts == 0) {
+ imgp->ip_origvattr->va_mode &= ~(VSUID | VSGID);
+ }
+
+ /* Find the nominal end of the interpreter line */
+ for( ihp = &vdata[2]; *ihp != '\n' && *ihp != '#'; ihp++) {
+ if (ihp >= &vdata[IMG_SHSIZE])
+ return (ENOEXEC);
+ }
+
+ line_endp = ihp;
+ ihp = &vdata[2];
+ /* Skip over leading spaces - until the interpreter name */
+ while ( ihp < line_endp && ((*ihp == ' ') || (*ihp == '\t')))
+ ihp++;
+
+ /*
+ * Find the last non-whitespace character before the end of line or
+ * the beginning of a comment; this is our new end of line.
+ */
+ for (;line_endp > ihp && ((*line_endp == ' ') || (*line_endp == '\t')); line_endp--)
+ continue;
+
+ /* Empty? */
+ if (line_endp == ihp)
+ return (ENOEXEC);
+
+ /* copy the interpreter name */
+ interp = imgp->ip_interp_name;
+ while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t'))
+ *interp++ = *ihp++;
+ *interp = '\0';
+
+ exec_save_path(imgp, CAST_USER_ADDR_T(imgp->ip_interp_name),
+ UIO_SYSSPACE32);
+
+ ihp = &vdata[2];
+ while (ihp < line_endp) {
+ /* Skip leading whitespace before each argument */
+ while ((*ihp == ' ') || (*ihp == '\t'))
+ ihp++;
+
+ if (ihp >= line_endp)
+ break;
+
+ /* We have an argument; copy it */
+ while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) {
+ *imgp->ip_strendp++ = *ihp++;
+ imgp->ip_strspace--;
+ }
+ *imgp->ip_strendp++ = 0;
+ imgp->ip_strspace--;
+ imgp->ip_argc++;
+ }
+
+ return (-3);
+}
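/*
 * Editor's aside -- not part of the patch.  A stand-alone harness showing
 * the "#!" parse performed above: skip "#!", cut the line at a newline or
 * comment character, take the first token as the interpreter and the rest
 * as arguments.  This is a simplified analogue, not the kernel code.
 */
#include <stdio.h>
#include <string.h>

int
main(void)
{
	char line[] = "#!/bin/sh -x  # run traced";
	char *p = line + 2;	/* skip the "#!" */
	char *end = strpbrk(p, "\n#");	/* line ends at newline or comment */

	if (end != NULL)
		*end = '\0';

	/* First token is the interpreter; the rest become arguments */
	char *tok = strtok(p, " \t");
	printf("interpreter: %s\n", tok);	/* prints /bin/sh */
	while ((tok = strtok(NULL, " \t")) != NULL)
		printf("argument: %s\n", tok);	/* prints -x */
	return 0;
}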
+
+
+
+/*
+ * exec_fat_imgact
+ *
+ * Image activator for fat 1.0 binaries. If the binary is fat, then we
+ * need to select an image from it internally, and make that the image
+ * we are going to attempt to execute. At present, this consists of
+ * reloading the first page for the image with a first page from the
+ * offset location indicated by the fat header.
+ *
+ * Important: This image activator is byte order neutral.
+ *
+ * Note: If we find an encapsulated binary, we make no assertions
+ * about its validity; instead, we leave that up to a rescan
+ * for an activator to claim it, and, if it is claimed by one,
+ * that activator is responsible for determining validity.
+ */
+static int
+exec_fat_imgact(struct image_params *imgp)
+{
+ struct proc *p = vfs_context_proc(imgp->ip_vfs_context);
+ kauth_cred_t cred = p->p_ucred;
+ struct fat_header *fat_header = (struct fat_header *)imgp->ip_vdata;
+ struct fat_arch fat_arch;
+ int resid, error;
+ load_return_t lret;
+
+ /* Make sure it's a fat binary */
+ if ((fat_header->magic != FAT_MAGIC) &&
+ (fat_header->magic != FAT_CIGAM)) {
+ error = -1;
+ goto bad;
+ }
+
+ /* Look up our preferred architecture in the fat file. */
+ lret = fatfile_getarch_affinity(imgp->ip_vp,
+ (vm_offset_t)fat_header,
+ &fat_arch,
+ (p->p_flag & P_AFFINITY));
+ if (lret != LOAD_SUCCESS) {
+ error = load_return_to_errno(lret);
+ goto bad;
+ }
+
+ /* Read the Mach-O header out of it */
+ error = vn_rdwr(UIO_READ, imgp->ip_vp, imgp->ip_vdata,
+ PAGE_SIZE, fat_arch.offset,
+ UIO_SYSSPACE32, (IO_UNIT|IO_NODELOCKED),
+ cred, &resid, p);
+ if (error) {
+ goto bad;
+ }
+
+ /* Did we read a complete header? */
+ if (resid) {
+ error = EBADEXEC;
+ goto bad;
+ }
+
+ /* Success. Indicate we have identified an encapsulated binary */
+ error = -2;
+ imgp->ip_arch_offset = (user_size_t)fat_arch.offset;
+ imgp->ip_arch_size = (user_size_t)fat_arch.size;
+
+bad:
+ return (error);
+}
+
+/*
+ * exec_mach_imgact
+ *
+ * Image activator for mach-o 1.0 binaries.
+ *
+ * Important: This image activator is NOT byte order neutral.
+ */
+static int
+exec_mach_imgact(struct image_params *imgp)
+{
+ struct mach_header *mach_header = (struct mach_header *)imgp->ip_vdata;
+ kauth_cred_t cred = vfs_context_ucred(imgp->ip_vfs_context);
+ struct proc *p = vfs_context_proc(imgp->ip_vfs_context);
+ int error = 0;
+ int vfexec = 0;
+ task_t task;
+ task_t new_task;
+ thread_t thread;
 struct uthread *uthread;
- vm_map_t old_map;
+ vm_map_t old_map = VM_MAP_NULL;
 vm_map_t map;
- int i;
 boolean_t clean_regions = FALSE;
- shared_region_mapping_t shared_region = NULL;
 shared_region_mapping_t initial_region = NULL;
+ load_return_t lret;
+ load_result_t load_result;
+
+ /*
+ * make sure it's a Mach-O 1.0 or Mach-O 2.0 binary; the difference
+ * is a reserved field on the end, so for the most part, we can
+ * treat them as if they were identical.
+ */
+ if ((mach_header->magic != MH_MAGIC) &&
+ (mach_header->magic != MH_MAGIC_64)) {
+ error = -1;
+ goto bad;
+ }
+
+ task = current_task();
+ thread = current_thread();
+ uthread = get_bsdthread_info(thread);
+
+ if (uthread->uu_flag & UT_VFORK)
+ vfexec = 1; /* Mark in exec */
+
+ if ((mach_header->cputype & CPU_ARCH_ABI64) == CPU_ARCH_ABI64)
+ imgp->ip_flags |= IMGPF_IS_64BIT;
+
+ if (!grade_binary(mach_header->cputype, mach_header->cpusubtype)) {
+ error = EBADARCH;
+ goto bad;
+ }
+
+ /*
+ * Copy in arguments/environment from the old process, if the
+ * vector is non-NULL (i.e. exec is not being called from
+ * load_init_program(), as a special case, at system startup).
+ */
+ if (imgp->ip_user_argv != 0LL) {
+ error = exec_extract_strings(imgp);
+ if (error)
+ goto bad;
+ }
+
+ /*
+ * Hack for binary compatibility; put three NULs on the end of the
+ * string area, and round it up to the next word boundary. This
+ * ensures padding with NULs to the boundary.
+ */
+ imgp->ip_strendp[0] = 0;
+ imgp->ip_strendp[1] = 0;
+ imgp->ip_strendp[2] = 0;
+ imgp->ip_strendp += (((imgp->ip_strendp - imgp->ip_strings) + NBPW-1) & ~(NBPW-1));
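/*
 * Editor's worked example -- not part of the patch.  With NBPW == 4 and
 * (ip_strendp - ip_strings) == 13 after the three NULs, the expression
 * above computes (13 + 4 - 1) & ~(4 - 1) == 16 & ~3 == 16, advancing the
 * string area to the next word boundary; a count that is already a
 * multiple of 4, say 16, maps to (16 + 3) & ~3 == 16 and is unchanged.
 */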
+
+
+ if (vfexec) {
+ kern_return_t result;
+
+ result = task_create_internal(task, FALSE, &new_task);
+ if (result != KERN_SUCCESS)
+ printf("execve: task_create failed. Code: 0x%x\n", result);
+ p->task = new_task;
+ set_bsdtask_info(new_task, p);
+ if (p->p_nice != 0)
+ resetpriority(p);
+ map = get_task_map(new_task);
+ result = thread_create(new_task, &imgp->ip_vfork_thread);
+ if (result != KERN_SUCCESS)
+ printf("execve: thread_create failed. Code: 0x%x\n", result);
+ /* reset local idea of task, thread, uthread */
+ task = new_task;
+ thread = imgp->ip_vfork_thread;
+ uthread = get_bsdthread_info(thread);
+ } else {
+ map = VM_MAP_NULL;
+ }
+
+ /*
+ * We set these flags here; this is OK, since if we fail after
+ * this point, we have already destroyed the parent process anyway.
+ */
+ if (imgp->ip_flags & IMGPF_IS_64BIT) {
+ task_set_64bit(task, TRUE);
+ p->p_flag |= P_LP64;
+ } else {
+ task_set_64bit(task, FALSE);
+ p->p_flag &= ~P_LP64;
+ }
+
+ /*
+ * Load the Mach-O file.
+ */
+/* LP64 - remove following "if" statement after osfmk/vm/task_working_set.c */
+if((imgp->ip_flags & IMGPF_IS_64BIT) == 0)
+ if(imgp->ip_tws_cache_name) {
+ tws_handle_startup_file(task, kauth_cred_getuid(cred),
+ imgp->ip_tws_cache_name, imgp->ip_vp, &clean_regions);
+ }
+
+ vm_get_shared_region(task, &initial_region);
+
+
+ /*
+ * NOTE: An error after this point indicates we have potentially
+ * destroyed or overwritten some process state while attempting an
+ * execve() following a vfork(), which is an unrecoverable condition.
+ */
+
+ /*
+ * We reset the task to 64-bit (or not) here. It may have picked up
+ * a new map, and we need that to reflect its true 64-bit nature.
+ */
+ task_set_64bit(task,
+ ((imgp->ip_flags & IMGPF_IS_64BIT) == IMGPF_IS_64BIT));
+
+ /*
+ * Actually load the image file we previously decided to load.
+ */
+ lret = load_machfile(imgp, mach_header, thread, map, clean_regions, &load_result);
+
+ if (lret != LOAD_SUCCESS) {
+ error = load_return_to_errno(lret);
+ goto badtoolate;
+ }
+
+ /* load_machfile() maps the vnode */
+ (void)ubc_map(imgp->ip_vp, PROT_EXEC);
+
+ /*
+ * deal with set[ug]id.
+ */
+ error = exec_handle_sugid(imgp);
+
+ KNOTE(&p->p_klist, NOTE_EXEC);
+
+ if (!vfexec && (p->p_flag & P_TRACED))
+ psignal(p, SIGTRAP);
+
+ if (error) {
+ goto badtoolate;
+ }
+ vnode_put(imgp->ip_vp);
+ imgp->ip_vp = NULL;
+
+ if (load_result.unixproc &&
+ create_unix_stack(get_task_map(task),
+ load_result.user_stack, load_result.customstack, p)) {
+ error = load_return_to_errno(LOAD_NOSPACE);
+ goto badtoolate;
+ }
+
+ if (vfexec) {
+ uthread->uu_ar0 = (void *)get_user_regs(thread);
+ old_map = vm_map_switch(get_task_map(task));
+ }
+
+ if (load_result.unixproc) {
+ user_addr_t ap;
+
+ /*
+ * Copy the strings area out into the new process address
+ * space.
+ */
+ ap = p->user_stack;
+ error = exec_copyout_strings(imgp, &ap);
+ if (error) {
+ if (vfexec)
+ vm_map_switch(old_map);
+ goto badtoolate;
+ }
+ /* Set the stack */
+ thread_setuserstack(thread, ap);
+ }
+
+ if (load_result.dynlinker) {
+ uint64_t ap;
+
+ /* Adjust the stack */
+ if (imgp->ip_flags & IMGPF_IS_64BIT) {
+ ap = thread_adjuserstack(thread, -8);
+ (void)copyoutptr(load_result.mach_header, ap, 8);
+ } else {
+ ap = thread_adjuserstack(thread, -4);
+ (void)suword(ap, load_result.mach_header);
+ }
+ }
+
+ if (vfexec) {
+ vm_map_switch(old_map);
+ }
+ /* Set the entry point */
+ thread_setentrypoint(thread, load_result.entry_point);
+
+ /* Stop profiling */
+ stopprofclock(p);
+
+ /*
+ * Reset signal state.
+ */
+ execsigs(p, thread);
+
+ /*
+ * Close file descriptors
+ * which specify close-on-exec.
+ */
+ fdexec(p);
+
+ /*
+ * need to cancel async IO requests that can be cancelled and wait for those
+ * already active. MAY BLOCK!
+ */
+ _aio_exec( p );
+
+ /* FIXME: Till vmspace inherit is fixed: */
+ if (!vfexec && p->vm_shm)
+ shmexec(p);
+ /* Clean up the semaphores */
+ semexit(p);
+
+ /*
+ * Remember file name for accounting.
+ */
+ p->p_acflag &= ~AFORK;
+ /* If the translated name isn't NULL, then we want to use
+ * that translated name as the name we show as the "real" name.
+ * Otherwise, use the name passed into exec.
+ */
+ if (0 != imgp->ip_p_comm[0]) {
+ bcopy((caddr_t)imgp->ip_p_comm, (caddr_t)p->p_comm,
+ sizeof(p->p_comm));
+ } else {
+ if (imgp->ip_ndp->ni_cnd.cn_namelen > MAXCOMLEN)
+ imgp->ip_ndp->ni_cnd.cn_namelen = MAXCOMLEN;
+ bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_comm,
+ (unsigned)imgp->ip_ndp->ni_cnd.cn_namelen);
+ p->p_comm[imgp->ip_ndp->ni_cnd.cn_namelen] = '\0';
+ }
-
- union {
- /* #! and name of interpreter */
- char ex_shell[SHSIZE];
- /* Mach-O executable */
- struct mach_header mach_header;
- /* Fat executable */
- struct fat_header fat_header;
- char pad[512];
- } exdata;
+ {
+ /* This is for kdebug */
+ long dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4;
+
+ /* Collect the pathname for tracing */
+ kdbg_trace_string(p, &dbg_arg1, &dbg_arg2, &dbg_arg3, &dbg_arg4);
+
+
+
+ if (vfexec)
+ {
+ KERNEL_DEBUG_CONSTANT1((TRACEDBG_CODE(DBG_TRACE_DATA, 2)) | DBG_FUNC_NONE,
+ p->p_pid ,0,0,0, (unsigned int)thread);
+ KERNEL_DEBUG_CONSTANT1((TRACEDBG_CODE(DBG_TRACE_STRING, 2)) | DBG_FUNC_NONE,
+ dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, (unsigned int)thread);
+ }
+ else
+ {
+ KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_DATA, 2)) | DBG_FUNC_NONE,
+ p->p_pid ,0,0,0,0);
+ KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_STRING, 2)) | DBG_FUNC_NONE,
+ dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, 0);
+ }
+ }
+
+ p->p_flag &= ~P_CLASSIC;
+
+ /*
+ * mark as execed, wakeup the process that vforked (if any) and tell
+ * it that it now has its own resources back
+ */
+ p->p_flag |= P_EXEC;
+ if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
+ p->p_flag &= ~P_PPWAIT;
+ wakeup((caddr_t)p->p_pptr);
+ }
+
+ if (vfexec && (p->p_flag & P_TRACED)) {
+ psignal_vfork(p, new_task, thread, SIGTRAP);
+ }
+
+badtoolate:
+ if (vfexec) {
+ task_deallocate(new_task);
+ thread_deallocate(thread);
+ if (error)
+ error = 0;
+ }
+
+bad:
+ return(error);
+}
+
+
+
+
+/*
+ * Our image activator table; this is the table of the image types we are
+ * capable of loading. We list them in order of preference to ensure the
+ * fastest image load speed.
+ *
+ * XXX hardcoded, for now; should use linker sets
+ */
+struct execsw {
+ int (*ex_imgact)(struct image_params *);
+ const char *ex_name;
+} execsw[] = {
+ { exec_mach_imgact, "Mach-o Binary" },
+ { exec_fat_imgact, "Fat Binary" },
+ { exec_shell_imgact, "Interpreter Script" },
+ { NULL, NULL}
+};
+
+
+/*
+ * TODO: Dynamic linker header address on stack is copied via suword()
+ */
+/* ARGSUSED */
+int
+execve(struct proc *p, struct execve_args *uap, register_t *retval)
+{
+ kauth_cred_t cred = p->p_ucred;
+ struct image_params image_params, *imgp;
+ struct vnode_attr va;
+ struct vnode_attr origva;
+ struct nameidata nd;
+ struct uthread *uthread;
+ int i;
 int resid, error;
- char *savedpath;
- int savedpathlen = 0;
- vm_offset_t *execargsp;
- char *cpnospace;
 task_t task;
- task_t new_task;
- thread_act_t thr_act;
 int numthreads;
 int vfexec=0;
- unsigned long arch_offset =0;
- unsigned long arch_size = 0;
- char *ws_cache_name = NULL; /* used for pre-heat */
+ int once = 1; /* save SUGID-ness for interpreted files */
+ char alt_p_comm[sizeof(p->p_comm)] = {0}; /* for Classic */
+ int is_64 = IS_64BIT_PROCESS(p);
+ int seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32);
+ struct vfs_context context;
+
+ context.vc_proc = p;
+ context.vc_ucred = p->p_ucred; /* XXX must NOT be kauth_cred_get() */
+
+
+ imgp = &image_params;
+
+ /* Initialize the common data in the image_params structure */
+ bzero(imgp, sizeof(*imgp));
+ imgp->ip_user_fname = uap->fname;
+ imgp->ip_user_argv = uap->argp;
+ imgp->ip_user_envv = uap->envp;
+ imgp->ip_vattr = &va;
+ imgp->ip_origvattr = &origva;
+ imgp->ip_vfs_context = &context;
+ imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT : IMGPF_NONE);
+ imgp->ip_tws_cache_name = NULL;
+ imgp->ip_p_comm = alt_p_comm; /* for Classic */
 
- /*
+ /*
 * XXXAUDIT: Currently, we only audit the pathname of the binary.
 * There may also be poor interaction with dyld.
 */
 
- cfarg[0] = '\0'; /* initialize to null value. */
 task = current_task();
- thr_act = current_act();
- uthread = get_bsdthread_info(thr_act);
+ uthread = get_bsdthread_info(current_thread());
 
- if (uthread->uu_flag & P_VFORK) {
+ if (uthread->uu_flag & UT_VFORK) {
 vfexec = 1; /* Mark in exec */
 } else {
 if (task != kernel_task) {
@@ -265,543 +852,556 @@ execve(p, uap, retval)
 if (numthreads <= 0 )
 return(EINVAL);
 if (numthreads > 1) {
- return(EOPNOTSUPP);
+ return(ENOTSUP);
 }
 }
 }
 
- error = execargs_alloc(&execargs);
+ error = execargs_alloc(imgp);
 if (error)
 return(error);
-
- savedpath = (char *)execargs;
-
- /*
- * To support new app package launching for Mac OS X, the dyld
- * needs the first argument to execve() stored on the user stack.
- * Copyin the "path" at the begining of the "execargs" buffer
- * allocated above.
- *
- * We have to do this before namei() because in case of
- * symbolic links, namei() would overwrite the original "path".
- * In case the last symbolic link resolved was a relative pathname
- * we would lose the original "path", which could be an
- * absolute pathname. This might be unacceptable for dyld.
- */
- /* XXX We could optimize to avoid copyinstr in the namei() */
-
+
 /*
 * XXXAUDIT: Note: the double copyin introduces an audit
 * race. To correct this race, we must use a single
- * copyin().
+ * copyin(), e.g. by passing a flag to namei to indicate an
+ * external path buffer is being used.
 */
-
- error = copyinstr(uap->fname, savedpath,
- MAXPATHLEN, (size_t *)&savedpathlen);
+ error = exec_save_path(imgp, uap->fname, seg);
 if (error) {
- execargs_free(execargs);
+ execargs_free(imgp);
 return(error);
 }
+
 /*
- * copyinstr will put in savedpathlen, the count of
- * characters (including NULL) in the path.
 * No app profiles under chroot
 */
-
- if((fdp->fd_rdir == NULLVP) && (app_profile != 0)) {
+ if((p->p_fd->fd_rdir == NULLVP) && (app_profile != 0)) {
 /* grab the name of the file out of its path */
 /* we will need this for lookup within the */
 /* name file */
- ws_cache_name = savedpath + savedpathlen;
- while (ws_cache_name[0] != '/') {
- if(ws_cache_name == savedpath) {
- ws_cache_name--;
+ /* Scan backwards for the first '/' or start of string */
+ imgp->ip_tws_cache_name = imgp->ip_strendp;
+ while (imgp->ip_tws_cache_name[0] != '/') {
+ if(imgp->ip_tws_cache_name == imgp->ip_strings) {
+ imgp->ip_tws_cache_name--;
 break;
 }
- ws_cache_name--;
+ imgp->ip_tws_cache_name--;
 }
- ws_cache_name++;
+ imgp->ip_tws_cache_name++;
 }
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1,
+ seg, uap->fname, imgp->ip_vfs_context);
 
- /* Save the name aside for future use */
- execargsp = (vm_offset_t *)((char *)(execargs) + savedpathlen);
-
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | SAVENAME | AUDITVNPATH1,
- UIO_USERSPACE, uap->fname, p);
+again:
 error = namei(&nd);
 if (error)
- goto bad1;
- vp = nd.ni_vp;
- VOP_LEASE(vp, p, p->p_ucred, LEASE_READ);
-
- if ((error = VOP_GETATTR(vp, &origvattr, p->p_ucred, p)))
 goto bad;
+ imgp->ip_ndp = &nd; /* successful namei(); call nameidone() later */
+ imgp->ip_vp = nd.ni_vp; /* if set, need to vnode_put() at some point */
 
- /* Check mount point */
- if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
- error = EACCES;
+ error = exec_check_permissions(imgp);
+ if (error)
 goto bad;
- }
 
- if ((vp->v_mount->mnt_flag & MNT_NOSUID) || (p->p_flag & P_TRACED))
- origvattr.va_mode &= ~(VSUID | VSGID);
-
- *(&vattr) = *(&origvattr);
+ /* Copy; avoid invocation of an interpreter overwriting the original */
+ if (once) {
+ once = 0;
+ origva = va;
+ }
 
-again:
- error = check_exec_access(p, vp, &vattr);
+ error = vn_rdwr(UIO_READ, imgp->ip_vp, imgp->ip_vdata, PAGE_SIZE, 0,
+ UIO_SYSSPACE32, IO_NODELOCKED, cred, &resid, p);
 if (error)
 goto bad;
+
+encapsulated_binary:
+ error = -1;
+ for(i = 0; error == -1 && execsw[i].ex_imgact != NULL; i++) {
 
- /*
- * Read in first few bytes of file for segment sizes, magic number:
- * 407 = plain executable
- * 410 = RO text
- * 413 = demand paged RO text
- * Also an ASCII line beginning with #! is
- * the file name of a ``shell'' and arguments may be prepended
- * to the argument list if given here.
- *
- * SHELL NAMES ARE LIMITED IN LENGTH.
- *
- * ONLY ONE ARGUMENT MAY BE PASSED TO THE SHELL FROM
- * THE ASCII LINE.
- */
+ error = (*execsw[i].ex_imgact)(imgp);
 
- exdata.ex_shell[0] = '\0'; /* for zero length files */
+ switch (error) {
+ /* case -1: not claimed: continue */
+ case -2: /* Encapsulated binary */
+ goto encapsulated_binary;
 
- error = vn_rdwr(UIO_READ, vp, (caddr_t)&exdata, sizeof (exdata), 0,
- UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p);
+ case -3: /* Interpreter */
+ vnode_put(imgp->ip_vp);
+ imgp->ip_vp = NULL; /* already put */
+ nd.ni_cnd.cn_nameiop = LOOKUP;
+ nd.ni_cnd.cn_flags = (nd.ni_cnd.cn_flags & HASBUF) |
+ (FOLLOW | LOCKLEAF);
 
- if (error)
- goto bad;
-#ifndef lint
- if (resid > sizeof(exdata) - min(sizeof(exdata.mach_header),
- sizeof(exdata.fat_header))
- && exdata.ex_shell[0] != '#') {
+ nd.ni_segflg = UIO_SYSSPACE32;
+ nd.ni_dirp = CAST_USER_ADDR_T(imgp->ip_interp_name);
+ goto again;
+
+ default:
+ break;
+ }
+ }
+
+ /* call out to allow 3rd party notification of exec.
+ * Ignore result of kauth_authorize_fileop call.
+ */
+ if (error == 0 && kauth_authorize_fileop_has_listeners()) {
+ kauth_authorize_fileop(vfs_context_ucred(&context), KAUTH_FILEOP_EXEC,
+ (uintptr_t)nd.ni_vp, 0);
+ }
+
+ /* Image not claimed by any activator? */
+ if (error == -1)
 error = ENOEXEC;
- goto bad;
+
+bad:
+ if (imgp->ip_ndp)
+ nameidone(imgp->ip_ndp);
+ if (imgp->ip_vp)
+ vnode_put(imgp->ip_vp);
+ if (imgp->ip_strings)
+ execargs_free(imgp);
+ if (!error && vfexec) {
+ vfork_return(current_thread(), p->p_pptr, p, retval);
+ (void)thread_resume(imgp->ip_vfork_thread);
+ return(0);
 }
-#endif /* lint */
- mach_header = &exdata.mach_header;
- fat_header = &exdata.fat_header;
- if ((mach_header->magic == MH_CIGAM) &&
- (classichandler[0] == 0)) {
- error = EBADARCH;
- goto bad;
- } else if ((mach_header->magic == MH_MAGIC) ||
- (mach_header->magic == MH_CIGAM)) {
- is_fat = FALSE;
- } else if ((fat_header->magic == FAT_MAGIC) ||
- (fat_header->magic == FAT_CIGAM)) {
- is_fat = TRUE;
+ return(error);
+}
+
+
+static int
+copyinptr(user_addr_t froma, user_addr_t *toptr, int ptr_size)
+{
+ int error;
+
+ if (ptr_size == 4) {
+ /* 64 bit value containing 32 bit address */
+ unsigned int i;
+
+ error = copyin(froma, &i, 4);
+ *toptr = CAST_USER_ADDR_T(i); /* SAFE */
 } else {
- /* If we've already redirected once from an interpreted file
- * to an interpreter, don't permit the second time.
- */
- if (exdata.ex_shell[0] != '#' ||
- exdata.ex_shell[1] != '!' ||
- executingInterpreter) {
- error = ENOEXEC;
- goto bad;
- }
- if (executingClassic == 1) {
- error = EBADARCH;
- goto bad;
- }
+ error = copyin(froma, toptr, 8);
+ }
+ return (error);
+}
 
- /* Check to see if SUGID scripts are permitted. If they aren't then
- * clear the SUGID bits.
- */
- if (sugid_scripts == 0) {
- origvattr.va_mode &= ~(VSUID | VSGID);
- }
-
- cp = &exdata.ex_shell[2]; /* skip "#!" */
- while (cp < &exdata.ex_shell[SHSIZE]) {
- if (*cp == '\t') /* convert all tabs to spaces */
- *cp = ' ';
- else if (*cp == '\n' || *cp == '#') {
- *cp = '\0'; /* trunc the line at nl or comment */
-
- /* go back and remove the spaces before the /n or # */
- /* todo: do we have to do this if we fix the passing of args to shells ?
*/ - if ( cp != &exdata.ex_shell[2] ) { - do { - if ( *(cp-1) != ' ') - break; - *(--cp) = '\0'; - } while ( cp != &exdata.ex_shell[2] ); - } - break; - } - cp++; - } - if (*cp != '\0') { - error = ENOEXEC; - goto bad; - } - cp = &exdata.ex_shell[2]; - while (*cp == ' ') - cp++; - execnamep = cp; - while (*cp && *cp != ' ') - cp++; - cfarg[0] = '\0'; - cpnospace = cp; - if (*cp) { - *cp++ = '\0'; - while (*cp == ' ') - cp++; - if (*cp) - bcopy((caddr_t)cp, (caddr_t)cfarg, SHSIZE); - } - /* - * Support for new app package launching for Mac OS X. - * We are about to retry the execve() by changing the path to the - * interpreter name. Need to re-initialize the savedpath and - * savedpathlen. +1 for NULL. - */ - savedpathlen = (cpnospace - execnamep + 1); - error = copystr(execnamep, savedpath, - savedpathlen, (size_t *)&savedpathlen); - if (error) - goto bad; +static int +copyoutptr(user_addr_t ua, user_addr_t ptr, int ptr_size) +{ + int error; - /* Save the name aside for future use */ - execargsp = (vm_offset_t *)((char *)(execargs) + savedpathlen); - - executingInterpreter= 1; - vput(vp); - nd.ni_cnd.cn_nameiop = LOOKUP; - nd.ni_cnd.cn_flags = (nd.ni_cnd.cn_flags & HASBUF) | - (FOLLOW | LOCKLEAF | SAVENAME); - nd.ni_segflg = UIO_SYSSPACE; - nd.ni_dirp = execnamep; - if ((error = namei(&nd))) - goto bad1; - vp = nd.ni_vp; - VOP_LEASE(vp, p, cred, LEASE_READ); - if ((error = VOP_GETATTR(vp, &vattr, p->p_ucred, p))) - goto bad; - goto again; + if (ptr_size == 4) { + /* 64 bit value containing 32 bit address */ + unsigned int i = CAST_DOWN(unsigned int,ua); /* SAFE */ + + error = copyout(&i, ptr, 4); + } else { + error = copyout(&ua, ptr, 8); } + return (error); +} + + +/* + * exec_copyout_strings + * + * Copy out the strings segment to user space. The strings segment is put + * on a preinitialized stack frame. + * + * Parameters: struct image_params * the image parameter block + * int * a pointer to the stack offset variable + * + * Returns: 0 Success + * !0 Faiure: errno + * + * Implicit returns: + * (*stackp) The stack offset, modified + * + * Note: The strings segment layout is backward, from the beginning + * of the top of the stack to consume the minimal amount of + * space possible; the returned stack pointer points to the + * end of the area consumed (stacks grow upward). + * + * argc is an int; arg[i] are pointers; env[i] are pointers; + * exec_path is a pointer; the 0's are (void *)NULL's + * + * The stack frame layout is: + * + * +-------------+ + * sp-> | argc | + * +-------------+ + * | arg[0] | + * +-------------+ + * : + * : + * +-------------+ + * | arg[argc-1] | + * +-------------+ + * | 0 | + * +-------------+ + * | env[0] | + * +-------------+ + * : + * : + * +-------------+ + * | env[n] | + * +-------------+ + * | 0 | + * +-------------+ + * | exec_path | In MacOS X PR2 Beaker2E the path passed to exec() is + * +-------------+ passed on the stack just after the trailing 0 of the + * | 0 | the envp[] array as a pointer to a string. + * +-------------+ + * | PATH AREA | + * +-------------+ + * | STRING AREA | + * : + * : + * | | <- p->user_stack + * +-------------+ + * + * Although technically a part of the STRING AREA, we treat the PATH AREA as + * a separate entity. This allows us to align the beginning of the PATH AREA + * to a pointer boundary so that the exec_path, env[i], and argv[i] pointers + * which preceed it on the stack are properly aligned. 
+
+
+/*
+ * exec_copyout_strings
+ *
+ * Copy out the strings segment to user space. The strings segment is put
+ * on a preinitialized stack frame.
+ *
+ * Parameters: struct image_params * the image parameter block
+ * int * a pointer to the stack offset variable
+ *
+ * Returns: 0 Success
+ * !0 Failure: errno
+ *
+ * Implicit returns:
+ * (*stackp) The stack offset, modified
+ *
+ * Note: The strings segment layout is backward, from the beginning
+ * of the top of the stack to consume the minimal amount of
+ * space possible; the returned stack pointer points to the
+ * end of the area consumed (stacks grow upward).
+ *
+ * argc is an int; arg[i] are pointers; env[i] are pointers;
+ * exec_path is a pointer; the 0's are (void *)NULL's
+ *
+ * The stack frame layout is:
+ *
+ * +-------------+
+ * sp-> | argc |
+ * +-------------+
+ * | arg[0] |
+ * +-------------+
+ * :
+ * :
+ * +-------------+
+ * | arg[argc-1] |
+ * +-------------+
+ * | 0 |
+ * +-------------+
+ * | env[0] |
+ * +-------------+
+ * :
+ * :
+ * +-------------+
+ * | env[n] |
+ * +-------------+
+ * | 0 |
+ * +-------------+
+ * | exec_path | In MacOS X PR2 Beaker2E the path passed to exec() is
+ * +-------------+ passed on the stack just after the trailing 0 of
+ * | 0 | the envp[] array as a pointer to a string.
+ * +-------------+
+ * | PATH AREA |
+ * +-------------+
+ * | STRING AREA |
+ * :
+ * :
+ * | | <- p->user_stack
+ * +-------------+
+ *
+ * Although technically a part of the STRING AREA, we treat the PATH AREA as
+ * a separate entity. This allows us to align the beginning of the PATH AREA
+ * to a pointer boundary so that the exec_path, env[i], and argv[i] pointers
+ * which precede it on the stack are properly aligned.
+ *
+ * TODO: argc copied with suword(), which takes a 64 bit address
+ */
+static int
+exec_copyout_strings(struct image_params *imgp, user_addr_t *stackp)
+{
+ struct proc *p = vfs_context_proc(imgp->ip_vfs_context);
+ int ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4;
+ char *argv = imgp->ip_argv; /* modifiable copy of argv */
+ user_addr_t string_area; /* *argv[], *env[] */
+ user_addr_t path_area; /* package launch path */
+ user_addr_t ptr_area; /* argv[], env[], exec_path */
+ user_addr_t stack;
+ int stringc = imgp->ip_argc + imgp->ip_envc;
+ int len;
+ int error;
+ int strspace;
+
+ stack = *stackp;
+
+ /*
+ * Set up pointers to the beginning of the string area, the beginning
+ * of the path area, and the beginning of the pointer area (actually,
+ * the location of argc, an int, which may be smaller than a pointer,
+ * but we use ptr_size worth of space for it, for alignment).
+ */
+ string_area = stack - (((imgp->ip_strendp - imgp->ip_strings) + ptr_size-1) & ~(ptr_size-1)) - ptr_size;
+ path_area = string_area - (((imgp->ip_argv - imgp->ip_strings) + ptr_size-1) & ~(ptr_size-1));
+ ptr_area = path_area - ((imgp->ip_argc + imgp->ip_envc + 4) * ptr_size) - ptr_size /*argc*/;
+
+ /* Return the initial stack address: the location of argc */
+ *stackp = ptr_area;
+
+ /*
+ * Record the size of the arguments area so that sysctl_procargs()
+ * can return the argument area without having to parse the arguments.
+ */
+ p->p_argc = imgp->ip_argc;
+ p->p_argslen = (int)(stack - path_area);
+
+
+ /*
+ * Support for new app package launching for Mac OS X allocates
+ * the "path" at the beginning of the imgp->ip_strings buffer.
+ * copy it just before the string area.
+ */
+ len = 0;
+ error = copyoutstr(imgp->ip_strings, path_area,
+ (unsigned)(imgp->ip_argv - imgp->ip_strings),
+ (size_t *)&len);
+ if (error)
+ goto bad;
+
+
+ /* Save a NULL pointer below it */
+ (void)copyoutptr(0LL, path_area - ptr_size, ptr_size);
+
+ /* Save the pointer to "path" just below it */
+ (void)copyoutptr(path_area, path_area - 2*ptr_size, ptr_size);
+
+ /*
+ * ptr_size for 2 NULLs, one each after arg[argc-1] and env[n];
+ * ptr_size for argc;
+ * skip over saved path, ptr_size for pointer to path,
+ * and ptr_size for the NULL after pointer to path.
+ */
+
+ /* argc (int32, stored in a ptr_size area) */
+ (void)suword(ptr_area, imgp->ip_argc);
+ ptr_area += sizeof(int);
+ /* pad to ptr_size, if 64 bit image, to ensure user stack alignment */
+ if (imgp->ip_flags & IMGPF_IS_64BIT) {
+ (void)suword(ptr_area, 0); /* int, not long: ignored */
+ ptr_area += sizeof(int);
+ }
+
+
+ /*
+ * We use (string_area - path_area) here rather than the more
+ * intuitive (imgp->ip_argv - imgp->ip_strings) because we are
+ * interested in the length of the PATH_AREA in user space,
+ * rather than the actual length of the execution path, since
+ * it includes alignment padding of the PATH_AREA + STRING_AREA
+ * to a ptr_size boundary.
+ */
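/*
 * Editor's worked example -- not part of the patch; the numbers are
 * illustrative only.  For a 32-bit image (ptr_size == 4) with argc == 2,
 * envc == 1, a 12-byte saved path, and 40 bytes of argv/env strings:
 * (ip_strendp - ip_strings) == 52, so string_area = stack - 56 (52 rounded
 * to 52, plus ptr_size); (ip_argv - ip_strings) == 12, so path_area =
 * string_area - 12; and ptr_area = path_area - (2 + 1 + 4) * 4 - 4 =
 * path_area - 32, which is where argc lands and what *stackp returns.
 */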
*/ + strspace = SIZE_IMG_STRSPACE - (string_area - path_area); + for (;;) { + if (stringc == imgp->ip_envc) { + /* argv[n] = NULL */ + (void)copyoutptr(0LL, ptr_area, ptr_size); + ptr_area += ptr_size; + } + if (--stringc < 0) + break; + + /* pointer: argv[n]/env[n] */ + (void)copyoutptr(string_area, ptr_area, ptr_size); + /* string : argv[n][]/env[n][] */ + do { + if (strspace <= 0) { + error = E2BIG; + break; + } + error = copyoutstr(argv, string_area, + (unsigned)strspace, + (size_t *)&len); + string_area += len; + argv += len; + strspace -= len; + } while (error == ENAMETOOLONG); + if (error == EFAULT || error == E2BIG) + break; /* bad stack - user's problem */ + ptr_area += ptr_size; + } + /* env[n] = NULL */ + (void)copyoutptr(0LL, ptr_area, ptr_size); + +bad: + return(error); +} + + +/* + * exec_extract_strings + * + * Copy arguments and environment from user space into work area; we may + * have already copied some early arguments into the work area, and if + * so, any arguments opied in are appended to those already there. + * + * Parameters: struct image_params * the image parameter block + * + * Returns: 0 Success + * !0 Failure: errno + * + * Implicit returns; + * (imgp->ip_argc) Count of arguments, updated + * (imgp->ip_envc) Count of environment strings, updated + * + * + * Notes: The argument and environment vectors are user space pointers + * to arrays of user space pointers. + */ +static int +exec_extract_strings(struct image_params *imgp) +{ + int error = 0; + struct proc *p = vfs_context_proc(imgp->ip_vfs_context); + int seg = (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32); + int ptr_size = (imgp->ip_flags & IMGPF_WAS_64BIT) ? 8 : 4; + user_addr_t argv = imgp->ip_user_argv; + user_addr_t envv = imgp->ip_user_envv; + + /* Now, get rest of arguments */ /* - * If we have a fat file, find "our" executable. + * If we are running an interpreter, replace the av[0] that was + * passed to execve() with the fully qualified path name that was + * passed to execve() for interpreters which do not use the PATH + * to locate their script arguments. */ - if (is_fat) { - /* - * Look up our architecture in the fat file. - */ - lret = fatfile_getarch_affinity(vp,(vm_offset_t)fat_header, &fat_arch, - (p->p_flag & P_AFFINITY)); - if (lret != LOAD_SUCCESS) { - error = load_return_to_errno(lret); + if((imgp->ip_flags & IMGPF_INTERPRET) != 0 && argv != 0LL) { + user_addr_t arg; + + error = copyinptr(argv, &arg, ptr_size); + if (error) goto bad; + if (arg != 0LL && arg != (user_addr_t)-1) { + argv += ptr_size; + error = exec_add_string(imgp, imgp->ip_user_fname, seg); + if (error) + goto bad; + imgp->ip_argc++; } - /* Read the Mach-O header out of it */ - error = vn_rdwr(UIO_READ, vp, (caddr_t)&exdata.mach_header, - sizeof (exdata.mach_header), - fat_arch.offset, - UIO_SYSSPACE, (IO_UNIT|IO_NODELOCKED), cred, &resid, p); + } - if (error) { + while (argv != 0LL) { + user_addr_t arg; + + error = copyinptr(argv, &arg, ptr_size); + if (error) goto bad; - } - /* Did we read a complete header? */ - if (resid) { - error = EBADEXEC; + argv += ptr_size; + if (arg == 0LL) { + break; + } else if (arg == (user_addr_t)-1) { + /* Um... why would it be -1? */ + error = EFAULT; goto bad; } + /* + * av[n...] 
+
+
+/*
+ * exec_extract_strings
+ *
+ * Copy arguments and environment from user space into work area; we may
+ * have already copied some early arguments into the work area, and if
+ * so, any arguments copied in are appended to those already there.
+ *
+ * Parameters: struct image_params * the image parameter block
+ *
+ * Returns: 0 Success
+ * !0 Failure: errno
+ *
+ * Implicit returns:
+ * (imgp->ip_argc) Count of arguments, updated
+ * (imgp->ip_envc) Count of environment strings, updated
+ *
+ *
+ * Notes: The argument and environment vectors are user space pointers
+ * to arrays of user space pointers.
+ */
+static int
+exec_extract_strings(struct image_params *imgp)
+{
+ int error = 0;
+ struct proc *p = vfs_context_proc(imgp->ip_vfs_context);
+ int seg = (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32);
+ int ptr_size = (imgp->ip_flags & IMGPF_WAS_64BIT) ? 8 : 4;
+ user_addr_t argv = imgp->ip_user_argv;
+ user_addr_t envv = imgp->ip_user_envv;
+
+ /* Now, get rest of arguments */
 
 /*
- * If we have a fat file, find "our" executable.
+ * If we are running an interpreter, replace the av[0] that was
+ * passed to execve() with the fully qualified path name that was
+ * passed to execve() for interpreters which do not use the PATH
+ * to locate their script arguments.
 */
- if (is_fat) {
- /*
- * Look up our architecture in the fat file.
- */
- lret = fatfile_getarch_affinity(vp,(vm_offset_t)fat_header, &fat_arch,
- (p->p_flag & P_AFFINITY));
- if (lret != LOAD_SUCCESS) {
- error = load_return_to_errno(lret);
+ if((imgp->ip_flags & IMGPF_INTERPRET) != 0 && argv != 0LL) {
+ user_addr_t arg;
+
+ error = copyinptr(argv, &arg, ptr_size);
+ if (error)
 goto bad;
+ if (arg != 0LL && arg != (user_addr_t)-1) {
+ argv += ptr_size;
+ error = exec_add_string(imgp, imgp->ip_user_fname, seg);
+ if (error)
+ goto bad;
+ imgp->ip_argc++;
 }
- /* Read the Mach-O header out of it */
- error = vn_rdwr(UIO_READ, vp, (caddr_t)&exdata.mach_header,
- sizeof (exdata.mach_header),
- fat_arch.offset,
- UIO_SYSSPACE, (IO_UNIT|IO_NODELOCKED), cred, &resid, p);
+ }
 
- if (error) {
+ while (argv != 0LL) {
+ user_addr_t arg;
+
+ error = copyinptr(argv, &arg, ptr_size);
+ if (error)
 goto bad;
- }
 
- /* Did we read a complete header? */
- if (resid) {
- error = EBADEXEC;
+ argv += ptr_size;
+ if (arg == 0LL) {
+ break;
+ } else if (arg == (user_addr_t)-1) {
+ /* Um... why would it be -1? */
+ error = EFAULT;
 goto bad;
 }
+ /*
+ * av[n...] = arg[n]
+ */
+ error = exec_add_string(imgp, arg, seg);
+ if (error)
+ goto bad;
+ imgp->ip_argc++;
+ }
+
+ /* Now, get the environment */
+ while (envv != 0LL) {
+ user_addr_t env;
+
+ error = copyinptr(envv, &env, ptr_size);
+ if (error)
+ goto bad;
 
- /* Is what we found a Mach-O executable */
- if ((mach_header->magic != MH_MAGIC) &&
- (mach_header->magic != MH_CIGAM)) {
- error = ENOEXEC;
+ envv += ptr_size;
+ if (env == 0LL) {
+ break;
+ } else if (env == (user_addr_t)-1) {
+ error = EFAULT;
 goto bad;
 }
-
- arch_offset = fat_arch.offset;
- arch_size = fat_arch.size;
- } else {
 /*
- * Load the Mach-O file.
- */
- arch_offset = 0;
- arch_size = (u_long)vattr.va_size;
- }
-
- if ( ! check_cpu_subtype(mach_header->cpusubtype) ) {
- error = EBADARCH;
- goto bad;
- }
-
- if (mach_header->magic == MH_CIGAM) {
-
- int classicBinaryLen = nd.ni_cnd.cn_namelen;
- if (classicBinaryLen > MAXCOMLEN)
- classicBinaryLen = MAXCOMLEN;
- bcopy((caddr_t)nd.ni_cnd.cn_nameptr,
- (caddr_t)binaryWithClassicName,
- (unsigned)classicBinaryLen);
- binaryWithClassicName[classicBinaryLen] = '\0';
- executingClassic = 1;
-
- vput(vp); /* cleanup? */
- nd.ni_cnd.cn_nameiop = LOOKUP;
-
- nd.ni_cnd.cn_flags = (nd.ni_cnd.cn_flags & HASBUF) |
- /* (FOLLOW | LOCKLEAF | SAVENAME) */
- (LOCKLEAF | SAVENAME);
- nd.ni_segflg = UIO_SYSSPACE;
-
- nd.ni_dirp = classichandler;
- if ((error = namei(&nd)) != 0) {
- error = EBADARCH;
- goto bad1;
- }
- vp = nd.ni_vp;
-
- VOP_LEASE(vp,p,cred,LEASE_READ);
- if ((error = VOP_GETATTR(vp,&vattr,p->p_ucred,p))) {
+ * av[n...] = env[n]
+ */
+ error = exec_add_string(imgp, env, seg);
+ if (error)
 goto bad;
- }
- goto again;
+ imgp->ip_envc++;
 }
 
+bad:
+ return error;
+}
 
- if (uap->argp != NULL) {
- /* geez -- why would argp ever be NULL, and why would we proceed? */
-
- /* First, handle any argument massaging */
- if (executingInterpreter && executingClassic) {
- error = copyArgument(classichandler,UIO_SYSSPACE,&nc,&cp,&cc);
- na++;
- if (error) goto bad;
-
- /* Now name the interpreter. */
- error = copyArgument(savedpath,UIO_SYSSPACE,&nc,&cp,&cc);
- na++;
- if (error) goto bad;
- /*
- * if we're running an interpreter, as we'd be passing the
- * command line executable as an argument to the interpreter already.
- * Doing "execve("myShellScript","bogusName",arg1,arg2,...)
- * probably shouldn't ever let bogusName be seen by the shell
- * script.
- */
-
- if (cfarg[0]) {
- error = copyArgument(cfarg,UIO_SYSSPACE,&nc,&cp,&cc);
- na++;
- if (error) goto bad;
- }
-
- char* originalExecutable = uap->fname;
- error = copyArgument(originalExecutable,UIO_USERSPACE,&nc,&cp,&cc);
- na++;
- /* remove argv[0] b/c we've already placed it at */
- /* this point */
- uap->argp++;
- if (error) goto bad;
-
- /* and continue with rest of the arguments. */
- } else if (executingClassic) {
- error = copyArgument(classichandler,UIO_SYSSPACE,&nc,&cp,&cc);
- na++;
- if (error) goto bad;
-
- char* originalExecutable = uap->fname;
- error = copyArgument(originalExecutable,UIO_USERSPACE,&nc,&cp,&cc);
- if (error) goto bad;
- uap->argp++;
- na++;
-
- /* and rest of arguments continue as before. */
- } else if (executingInterpreter) {
- char *actualExecutable = nd.ni_cnd.cn_nameptr;
- error = copyArgument(actualExecutable,UIO_SYSSPACE,&nc,&cp,&cc);
- na++;
- /* remove argv[0] b/c we just placed it in the arg list. */
- uap->argp++;
- if (error) goto bad;
- /* Copy the argument in the interpreter first line if there
- * was one.
- */
- if (cfarg[0]) {
- error = copyArgument(cfarg,UIO_SYSSPACE,&nc,&cp,&cc);
- na++;
- if (error) goto bad;
- }
-
- /* copy the name of the file being interpreted, gotten from
- * the structures passed in to execve.
- */
- error = copyArgument(uap->fname,UIO_USERSPACE,&nc,&cp,&cc);
- na++;
- }
- /* Now, get rest of arguments */
- while (uap->argp != NULL) {
- char* userArgument = (char*)fuword((caddr_t) uap->argp);
- uap->argp++;
- if (userArgument == NULL) {
- break;
- } else if ((int)userArgument == -1) {
- /* Um... why would it be -1? */
- error = EFAULT;
- goto bad;
- }
- error = copyArgument(userArgument, UIO_USERSPACE,&nc,&cp,&cc);
- if (error) goto bad;
- na++;
- }
- /* Now, get the environment */
- while (uap->envp != NULL) {
- char *userEnv = (char*) fuword((caddr_t) uap->envp);
- uap->envp++;
- if (userEnv == NULL) {
- break;
- } else if ((int)userEnv == -1) {
- error = EFAULT;
- goto bad;
- }
- error = copyArgument(userEnv,UIO_USERSPACE,&nc,&cp,&cc);
- if (error) goto bad;
- na++;
- ne++;
- }
- }
+#define unix_stack_size(p) (p->p_rlimit[RLIMIT_STACK].rlim_cur)
 
- /* make sure there are nulls are the end!! */
- {
- int cnt = 3;
- char *mp = cp;
+static int
+exec_check_permissions(struct image_params *imgp)
+{
+ struct vnode *vp = imgp->ip_vp;
+ struct vnode_attr *vap = imgp->ip_vattr;
+ struct proc *p = vfs_context_proc(imgp->ip_vfs_context);
+ int error;
+ kauth_action_t action;
 
- while ( cnt-- )
- *mp++ = '\0';
- }
+ /* Only allow execution of regular files */
+ if (!vnode_isreg(vp))
+ return (EACCES);
+
+ /* Get the file attributes that we will be using here and elsewhere */
+ VATTR_INIT(vap);
+ VATTR_WANTED(vap, va_uid);
+ VATTR_WANTED(vap, va_gid);
+ VATTR_WANTED(vap, va_mode);
+ VATTR_WANTED(vap, va_fsid);
+ VATTR_WANTED(vap, va_fileid);
+ VATTR_WANTED(vap, va_data_size);
+ if ((error = vnode_getattr(vp, vap, imgp->ip_vfs_context)) != 0)
+ return (error);
 
- /* and round up count of bytes written to next word. */
- nc = (nc + NBPW-1) & ~(NBPW-1);
+ /*
+ * Ensure that at least one execute bit is on - otherwise root
+ * will always succeed, and we don't want that to happen unless
+ * the file really is executable.
+ */
+ if ((vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)
+ return (EACCES);
 
- if (vattr.va_fsid == classichandler_fsid &&
- vattr.va_fileid == classichandler_fileid) {
- executingClassic = 1;
- }
+ /* Disallow zero length files */
+ if (vap->va_data_size == 0)
+ return (ENOEXEC);
 
- if (vfexec) {
- kern_return_t result;
+ imgp->ip_arch_offset = (user_size_t)0;
+ imgp->ip_arch_size = vap->va_data_size;
 
- result = task_create_internal(task, FALSE, &new_task);
- if (result != KERN_SUCCESS)
- printf("execve: task_create failed. Code: 0x%x\n", result);
- p->task = new_task;
- set_bsdtask_info(new_task, p);
- if (p->p_nice != 0)
- resetpriority(p);
- task = new_task;
- map = get_task_map(new_task);
- result = thread_create(new_task, &thr_act);
- if (result != KERN_SUCCESS)
- printf("execve: thread_create failed. Code: 0x%x\n", result);
- uthread = get_bsdthread_info(thr_act);
- } else {
- map = VM_MAP_NULL;
- }
+ /* Disable setuid-ness for traced programs or if MNT_NOSUID */
+ if ((vp->v_mount->mnt_flag & MNT_NOSUID) || (p->p_flag & P_TRACED))
+ vap->va_mode &= ~(VSUID | VSGID);
+
+ /* Check for execute permission */
+ action = KAUTH_VNODE_EXECUTE;
+ /* Traced images must also be readable */
+ if (p->p_flag & P_TRACED)
+ action |= KAUTH_VNODE_READ_DATA;
+ if ((error = vnode_authorize(vp, NULL, action, imgp->ip_vfs_context)) != 0)
+ return (error);
 
- /*
- * Load the Mach-O file.
- */
- VOP_UNLOCK(vp, 0, p); /* XXX */
- if(ws_cache_name) {
- tws_handle_startup_file(task, cred->cr_uid,
- ws_cache_name, vp, &clean_regions);
- }
+ /* Don't let it run if anyone had it open for writing */
+ if (vp->v_writecount)
+ return (ETXTBSY);
 
- vm_get_shared_region(task, &initial_region);
- int parentIsClassic = (p->p_flag & P_CLASSIC);
- struct vnode *rootDir = p->p_fd->fd_rdir;
-
- if ((parentIsClassic && !executingClassic) ||
- (!parentIsClassic && executingClassic)) {
- shared_region = lookup_default_shared_region(
- (int)rootDir,
- (executingClassic ?
- CPU_TYPE_POWERPC :
- machine_slot[cpu_number()].cpu_type));
- if (shared_region == NULL) {
- shared_region_mapping_t old_region;
- shared_region_mapping_t new_region;
- vm_get_shared_region(current_task(), &old_region);
- /* grrrr... this sets current_task(), not task
- * -- they're different (usually)
- */
- shared_file_boot_time_init(
- (int)rootDir,
- (executingClassic ?
- CPU_TYPE_POWERPC :
- machine_slot[cpu_number()].cpu_type));
- if ( current_task() != task ) {
- vm_get_shared_region(current_task(),&new_region);
- vm_set_shared_region(task,new_region);
- vm_set_shared_region(current_task(),old_region);
- }
- } else {
- vm_set_shared_region(task, shared_region);
- }
- shared_region_mapping_dealloc(initial_region);
- }
-
- lret = load_machfile(vp, mach_header, arch_offset,
- arch_size, &load_result, thr_act, map, clean_regions);
- if (lret != LOAD_SUCCESS) {
- error = load_return_to_errno(lret);
- vrele(vp);
- vp = NULL;
- goto badtoolate;
- }
+ /* XXX May want to indicate to underlying FS that vnode is open */
+
+ return (error);
+}
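/*
 * Editor's aside -- not part of the patch.  The core of the permission
 * test above in stand-alone form: a regular, non-empty file must carry at
 * least one execute bit, since an all-powerful caller (root) would
 * otherwise pass any authorization check.  demo_check_exec() is a
 * hypothetical user-space analogue, not the kernel routine.
 */
#include <stdio.h>
#include <sys/stat.h>

static int
demo_check_exec(const char *path)
{
	struct stat st;

	if (stat(path, &st) != 0)
		return -1;	/* cannot stat */
	if (!S_ISREG(st.st_mode))
		return 0;	/* only regular files may be executed */
	if ((st.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)
		return 0;	/* no execute bit anywhere */
	if (st.st_size == 0)
		return 0;	/* zero-length files are rejected */
	return 1;
}

int
main(void)
{
	printf("/bin/sh executable? %d\n", demo_check_exec("/bin/sh"));
	return 0;
}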
- /* load_machfile() maps the vnode */
- ubc_map(vp);
+
+/*
+ * exec_handle_sugid
+ *
+ * Initially clear the P_SUGID in the process flags; if an SUGID process is
+ * exec'ing a non-SUGID image, then this is the point of no return.
+ *
+ * If the image being activated is SUGID, then replace the credential with a
+ * copy, disable tracing (unless the tracing process is root), reset the
+ * mach task port to revoke it, and set the P_SUGID bit.
+ *
+ * If the saved user and group ID will be changing, then make sure it happens
+ * to a new credential, rather than a shared one.
+ *
+ * Set the security token (this is probably obsolete, given that the token
+ * should not technically be separate from the credential itself).
+ *
+ * Parameters: struct image_params * the image parameter block
+ *
+ * Returns: int 0 Success
+ * !0 Failure: errno
+ *
+ * Implicit returns:
+ * <process credential> Potentially modified/replaced
+ * <task port> Potentially revoked
+ * <process flags> P_SUGID bit potentially modified
+ * <security token> Potentially modified
+ */
+static int
+exec_handle_sugid(struct image_params *imgp)
+{
+ kauth_cred_t cred = vfs_context_ucred(imgp->ip_vfs_context);
+ struct proc *p = vfs_context_proc(imgp->ip_vfs_context);
+ int i;
+ int error = 0;
+ static struct vnode *dev_null = NULLVP;
-
- /*
- * deal with set[ug]id.
- */
 p->p_flag &= ~P_SUGID;
- if (((origvattr.va_mode & VSUID) != 0 &&
- p->p_ucred->cr_uid != origvattr.va_uid)
- || (origvattr.va_mode & VSGID) != 0 &&
- p->p_ucred->cr_gid != origvattr.va_gid) {
- p->p_ucred = crcopy(cred);
+
+ if (((imgp->ip_origvattr->va_mode & VSUID) != 0 &&
+ kauth_cred_getuid(cred) != imgp->ip_origvattr->va_uid) ||
+ ((imgp->ip_origvattr->va_mode & VSGID) != 0 &&
+ cred->cr_gid != imgp->ip_origvattr->va_gid)) {
 #if KTRACE
 /*
 * If process is being ktraced, turn off - unless
@@ -811,16 +1411,18 @@
 struct vnode *tvp = p->p_tracep;
 p->p_tracep = NULL;
 p->p_traceflag = 0;
-
- if (UBCINFOEXISTS(tvp))
- ubc_rele(tvp);
- vrele(tvp);
+ vnode_rele(tvp);
 }
 #endif
- if (origvattr.va_mode & VSUID)
- p->p_ucred->cr_uid = origvattr.va_uid;
- if (origvattr.va_mode & VSGID)
- p->p_ucred->cr_gid = origvattr.va_gid;
+ /*
+ * Replace the credential with a copy of itself if euid or egid change.
+ */
+ if (imgp->ip_origvattr->va_mode & VSUID) {
+ p->p_ucred = kauth_cred_seteuid(p->p_ucred, imgp->ip_origvattr->va_uid);
+ }
+ if (imgp->ip_origvattr->va_mode & VSGID) {
+ p->p_ucred = kauth_cred_setegid(p->p_ucred, imgp->ip_origvattr->va_gid);
+ }
 
 /*
 * Have mach reset the task port. We don't want
@@ -828,10 +1430,30 @@
 * exec to be able to access/control the task
 * after.
 */
- ipc_task_reset(task);
+ if (current_task() == p->task)
+ ipc_task_reset(p->task);
 
 p->p_flag |= P_SUGID;
 
+ /* Cache the vnode for /dev/null the first time around */
+ if (dev_null == NULLVP) {
+ struct nameidata nd1;
+
+ NDINIT(&nd1, LOOKUP, FOLLOW, UIO_SYSSPACE32,
+ CAST_USER_ADDR_T("/dev/null"),
+ imgp->ip_vfs_context);
+
+ if ((error = vn_open(&nd1, FREAD, 0)) == 0) {
+ dev_null = nd1.ni_vp;
+ /*
+ * vn_open returns with both a use_count
+ * and an io_count on the found vnode;
+ * drop the io_count, but keep the use_count
+ */
+ vnode_put(nd1.ni_vp);
+ }
+ }
+
 /* Radar 2261856; setuid security hole fix */
 /* Patch from OpenBSD: A. Ramesh */
 /*
@@ -841,307 +1463,77 @@
 * descriptors in this range which has implied meaning
 * to libc.
 */
- for (i = 0; i < 3; i++) {
- extern struct fileops vnops;
- struct nameidata nd1;
- struct file *fp;
- int indx;
+ if (dev_null != NULLVP) {
+ for (i = 0; i < 3; i++) {
+ struct fileproc *fp;
+ int indx;
+
+ if (p->p_fd->fd_ofiles[i] != NULL)
+ continue;
 
- if (p->p_fd->fd_ofiles[i] == NULL) {
 if ((error = falloc(p, &fp, &indx)) != 0)
 continue;
- NDINIT(&nd1, LOOKUP, FOLLOW, UIO_SYSSPACE,
- "/dev/null", p);
- if ((error = vn_open(&nd1, FREAD, 0)) != 0) {
- ffree(fp);
- p->p_fd->fd_ofiles[indx] = NULL;
+
+ if ((error = vnode_ref_ext(dev_null, FREAD)) != 0) {
+ fp_free(p, indx, fp);
 break;
 }
- fp->f_flag = FREAD;
- fp->f_type = DTYPE_VNODE;
- fp->f_ops = &vnops;
- fp->f_data = (caddr_t)nd1.ni_vp;
- VOP_UNLOCK(nd1.ni_vp, 0, p);
- }
- }
- }
- p->p_cred->p_svuid = p->p_ucred->cr_uid;
- p->p_cred->p_svgid = p->p_ucred->cr_gid;
- set_security_token(p);
-
- KNOTE(&p->p_klist, NOTE_EXEC);
-
- if (!vfexec && (p->p_flag & P_TRACED))
- psignal(p, SIGTRAP);
-
- if (error) {
- vrele(vp);
- vp = NULL;
- goto badtoolate;
- }
- VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY, p); /* XXX */
- vput(vp);
- vp = NULL;
-
- if (load_result.unixproc &&
- create_unix_stack(get_task_map(task),
- load_result.user_stack, load_result.customstack, p)) {
- error = load_return_to_errno(LOAD_NOSPACE);
- goto badtoolate;
- }
-
- if (vfexec) {
- uthread->uu_ar0 = (void *)get_user_regs(thr_act);
- }
-
- /*
- * Copy back arglist if necessary.
- */
-
-
- ucp = (int)p->user_stack;
- if (vfexec) {
- old_map = vm_map_switch(get_task_map(task));
- }
- if (load_result.unixproc) {
- int pathptr;
-
- ucp = ucp - nc - NBPW; /* begining of the STRING AREA */
-
- /*
- * Support for new app package launching for Mac OS X allocates
- * the "path" at the begining of the execargs buffer.
- * copy it just before the string area.
- */
- len = 0;
- pathptr = ucp - ((savedpathlen + NBPW-1) & ~(NBPW-1));
- error = copyoutstr(savedpath, (caddr_t)pathptr,
- (unsigned)savedpathlen, (size_t *)&len);
- savedpathlen = (savedpathlen + NBPW-1) & ~(NBPW-1);
-
- if (error) {
- if (vfexec)
- vm_map_switch(old_map);
- goto badtoolate;
- }
-
- /*
- * Record the size of the arguments area so that
- * sysctl_procargs() can return the argument area without having
- * to parse the arguments.
- */
- p->p_argslen = (int)p->user_stack - pathptr;
- p->p_argc = na - ne; /* save argc for sysctl_procargs() */
-
- /* Save a NULL pointer below it */
- (void) suword((caddr_t)(pathptr - NBPW), 0);
- /* Save the pointer to "path" just below it */
- (void) suword((caddr_t)(pathptr - 2*NBPW), pathptr);
-
- /*
- * na includes arg[] and env[].
- * NBPW for 2 NULL one each ofter arg[argc -1] and env[n]
- * NBPW for argc
- * skip over saved path, NBPW for pointer to path,
- * and NBPW for the NULL after pointer to path.
- */
- ap = ucp - na*NBPW - 3*NBPW - savedpathlen - 2*NBPW;
-#if defined(ppc)
- thread_setuserstack(thr_act, ap); /* Set the stack */
-#else
- uthread->uu_ar0[SP] = ap;
-#endif
- (void) suword((caddr_t)ap, na-ne); /* argc */
- nc = 0;
- cc = 0;
-
- cp = (char *) execargsp;
- cc = NCARGS - savedpathlen - 2*NBPW;
- ps.ps_argvstr = (char *)ucp; /* first argv string */
- ps.ps_nargvstr = na - ne; /* argc */
- for (;;) {
- ap += NBPW;
- if (na == ne) {
- (void) suword((caddr_t)ap, 0);
- ap += NBPW;
- ps.ps_envstr = (char *)ucp;
- ps.ps_nenvstr = ne;
+ fp->f_fglob->fg_flag = FREAD;
+ fp->f_fglob->fg_type = DTYPE_VNODE;
+ fp->f_fglob->fg_ops = &vnops;
+ fp->f_fglob->fg_data = (caddr_t)dev_null;
+
+ proc_fdlock(p);
+ *fdflags(p, indx) &= ~UF_RESERVED;
+ fp_drop(p, indx, fp, 1);
+ proc_fdunlock(p);
 }
- if (--na < 0)
- break;
- (void) suword((caddr_t)ap, ucp);
- do {
- error = copyoutstr(cp, (caddr_t)ucp,
- (unsigned)cc, (size_t *)&len);
- ucp += len;
- cp += len;
- nc += len;
- cc -= len;
- } while (error == ENAMETOOLONG);
- if (error == EFAULT)
- break; /* bad stack - user's problem */
+ /*
+ * for now we need to drop the reference immediately
+ * since we don't have any mechanism in place to
+ * release it before starting to unmount "/dev"
+ * during a reboot/shutdown
+ */
+ vnode_rele(dev_null);
+ dev_null = NULLVP;
 }
- (void) suword((caddr_t)ap, 0);
- }
-
- if (load_result.dynlinker) {
-#if defined(ppc)
- ap = thread_adjuserstack(thr_act, -4); /* Adjust the stack */
-#else
- ap = uthread->uu_ar0[SP] -= 4;
-#endif
- (void) suword((caddr_t)ap, load_result.mach_header);
- }
-
- if (vfexec) {
- vm_map_switch(old_map);
- }
-#if defined(ppc)
- thread_setentrypoint(thr_act, load_result.entry_point); /* Set the entry point */
-#elif defined(i386)
- uthread->uu_ar0[PC] = load_result.entry_point;
-#else
-#error architecture not implemented!
-#endif
-
- /* Stop profiling */
- stopprofclock(p);
-
- /*
- * Reset signal state.
- */
- execsigs(p, thr_act);
-
- /*
- * Close file descriptors
- * which specify close-on-exec.
- */
- fdexec(p);
-
- /*
- * need to cancel async IO requests that can be cancelled and wait for those
- * already active. MAY BLOCK!
- */
- _aio_exec( p );
-
- /* FIXME: Till vmspace inherit is fixed: */
- if (!vfexec && p->vm_shm)
- shmexec(p);
- /* Clean up the semaphores */
- semexit(p);
-
- /*
- * Remember file name for accounting.
- */
- p->p_acflag &= ~AFORK;
- /* If the translated name isn't NULL, then we want to use
- * that translated name as the name we show as the "real" name.
- * Otherwise, use the name passed into exec.
- */
- if (0 != binaryWithClassicName[0]) {
- bcopy((caddr_t)binaryWithClassicName, (caddr_t)p->p_comm,
- sizeof(binaryWithClassicName));
- } else {
- if (nd.ni_cnd.cn_namelen > MAXCOMLEN)
- nd.ni_cnd.cn_namelen = MAXCOMLEN;
- bcopy((caddr_t)nd.ni_cnd.cn_nameptr, (caddr_t)p->p_comm,
- (unsigned)nd.ni_cnd.cn_namelen);
- p->p_comm[nd.ni_cnd.cn_namelen] = '\0';
- }
-
- {
- /* This is for kdebug */
- long dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4;
-
- /* Collect the pathname for tracing */
- kdbg_trace_string(p, &dbg_arg1, &dbg_arg2, &dbg_arg3, &dbg_arg4);
-
-
-
- if (vfexec)
- {
- KERNEL_DEBUG_CONSTANT1((TRACEDBG_CODE(DBG_TRACE_DATA, 2)) | DBG_FUNC_NONE,
- p->p_pid ,0,0,0, (unsigned int)thr_act);
- KERNEL_DEBUG_CONSTANT1((TRACEDBG_CODE(DBG_TRACE_STRING, 2)) | DBG_FUNC_NONE,
- dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, (unsigned int)thr_act);
- }
- else
- {
- KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_DATA, 2)) | DBG_FUNC_NONE,
- p->p_pid ,0,0,0,0);
- KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_STRING, 2)) | DBG_FUNC_NONE,
- dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, 0);
- }
- }
-
- if (executingClassic)
- p->p_flag |= P_CLASSIC | P_AFFINITY;
- else
- p->p_flag &= ~P_CLASSIC;
-
 /*
- * mark as execed, wakeup the process that vforked (if any) and tell
- * it that it now has it's own resources back
+ * Implement the semantic where the effective user and group become
+ * the saved user and group in exec'ed programs.
*/ - p->p_flag |= P_EXEC; - if (p->p_pptr && (p->p_flag & P_PPWAIT)) { - p->p_flag &= ~P_PPWAIT; - wakeup((caddr_t)p->p_pptr); - } - - if (vfexec && (p->p_flag & P_TRACED)) { - psignal_vfork(p, new_task, thr_act, SIGTRAP); - } + p->p_ucred = kauth_cred_setsvuidgid(p->p_ucred, kauth_cred_getuid(p->p_ucred), p->p_ucred->cr_gid); + + /* XXX Obsolete; security token should not be separate from cred */ + set_security_token(p); -badtoolate: - if (vfexec) { - task_deallocate(new_task); - act_deallocate(thr_act); - if (error) - error = 0; - } -bad: - FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); - if (vp) - vput(vp); -bad1: - if (execargs) - execargs_free(execargs); - if (!error && vfexec) { - vfork_return(current_act(), p->p_pptr, p, retval); - (void) thread_resume(thr_act); - return(0); - } return(error); } - -#define unix_stack_size(p) (p->p_rlimit[RLIMIT_STACK].rlim_cur) - -kern_return_t -create_unix_stack(map, user_stack, customstack, p) - vm_map_t map; - vm_offset_t user_stack; - int customstack; - struct proc *p; +static kern_return_t +create_unix_stack(vm_map_t map, user_addr_t user_stack, int customstack, + struct proc *p) { - vm_size_t size; - vm_offset_t addr; + mach_vm_size_t size; + mach_vm_offset_t addr; - p->user_stack = (caddr_t)user_stack; + p->user_stack = user_stack; if (!customstack) { - size = round_page_64(unix_stack_size(p)); - addr = trunc_page_32(user_stack - size); - return (vm_allocate(map, &addr, size, - VM_MAKE_TAG(VM_MEMORY_STACK) | FALSE)); + size = mach_vm_round_page(unix_stack_size(p)); + addr = mach_vm_trunc_page(user_stack - size); + return (mach_vm_allocate(map, &addr, size, + VM_MAKE_TAG(VM_MEMORY_STACK) | + VM_FLAGS_FIXED)); } else return(KERN_SUCCESS); } #include <sys/reboot.h> -char init_program_name[128] = "/sbin/mach_init\0"; +static char init_program_name[128] = "/sbin/launchd"; +static const char * other_init = "/sbin/mach_init"; char init_args[128] = ""; @@ -1150,15 +1542,12 @@ int init_attempts = 0; void -load_init_program(p) - struct proc *p; +load_init_program(struct proc *p) { vm_offset_t init_addr; - int *old_ap; char *argv[3]; - int error; - register_t retval[2]; - struct uthread * ut; + int error; + register_t retval[2]; error = 0; @@ -1174,7 +1563,6 @@ load_init_program(p) if (error && ((boothowto & RB_INITNAME) == 0) && (init_attempts == 1)) { - static char other_init[] = "/etc/mach_init"; printf("Load of %s, errno %d, trying %s\n", init_program_name, error, other_init); error = 0; @@ -1198,11 +1586,12 @@ load_init_program(p) init_addr = VM_MIN_ADDRESS; (void) vm_allocate(current_map(), &init_addr, - PAGE_SIZE, TRUE); + PAGE_SIZE, VM_FLAGS_ANYWHERE); if (init_addr == 0) init_addr++; + (void) copyout((caddr_t) init_program_name, - (caddr_t) (init_addr), + CAST_USER_ADDR_T(init_addr), (unsigned) sizeof(init_program_name)+1); argv[0] = (char *) init_addr; @@ -1216,7 +1605,7 @@ load_init_program(p) */ (void) copyout((caddr_t) init_args, - (caddr_t) (init_addr), + CAST_USER_ADDR_T(init_addr), (unsigned) sizeof(init_args)); argv[1] = (char *) init_addr; @@ -1234,16 +1623,16 @@ load_init_program(p) */ (void) copyout((caddr_t) argv, - (caddr_t) (init_addr), + CAST_USER_ADDR_T(init_addr), (unsigned) sizeof(argv)); /* * Set up argument block for fake call to execve. 
*/ - init_exec_args.fname = argv[0]; - init_exec_args.argp = (char **) init_addr; - init_exec_args.envp = 0; + init_exec_args.fname = CAST_USER_ADDR_T(argv[0]); + init_exec_args.argp = CAST_USER_ADDR_T((char **)init_addr); + init_exec_args.envp = CAST_USER_ADDR_T(0); /* So that mach_init task * is set with uid,gid 0 token @@ -1284,31 +1673,6 @@ load_return_to_errno(load_return_t lrtn) } } -/* - * exec_check_access() - */ -int -check_exec_access(p, vp, vap) - struct proc *p; - struct vnode *vp; - struct vattr *vap; -{ - int flag; - int error; - - if (error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p)) - return (error); - flag = p->p_flag; - if (flag & P_TRACED) { - if (error = VOP_ACCESS(vp, VREAD, p->p_ucred, p)) - return (error); - } - if (vp->v_type != VREG || - (vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) - return (EACCES); - return (0); -} - #include <mach/mach_types.h> #include <mach/vm_prot.h> #include <mach/semaphore.h> @@ -1318,9 +1682,12 @@ check_exec_access(p, vp, vap) extern semaphore_t execve_semaphore; +/* + * The block of memory used by the execve arguments. At the same time, + * we allocate a page so that we can read in the first page of the image. + */ static int -execargs_alloc(addrp) - vm_offset_t *addrp; +execargs_alloc(struct image_params *imgp) { kern_return_t kret; @@ -1337,7 +1704,8 @@ execargs_alloc(addrp) return (EINTR); } - kret = kmem_alloc_pageable(bsd_pageable_map, addrp, NCARGS); + kret = kmem_alloc_pageable(bsd_pageable_map, (vm_offset_t *)&imgp->ip_strings, NCARGS + PAGE_SIZE); + imgp->ip_vdata = imgp->ip_strings + NCARGS; if (kret != KERN_SUCCESS) { semaphore_signal(execve_semaphore); return (ENOMEM); @@ -1346,12 +1714,12 @@ execargs_alloc(addrp) } static int -execargs_free(addr) - vm_offset_t addr; +execargs_free(struct image_params *imgp) { kern_return_t kret; - kmem_free(bsd_pageable_map, addr, NCARGS); + kmem_free(bsd_pageable_map, (vm_offset_t)imgp->ip_strings, NCARGS + PAGE_SIZE); + imgp->ip_strings = NULL; kret = semaphore_signal(execve_semaphore); switch (kret) { diff --git a/bsd/kern/kern_exit.c b/bsd/kern/kern_exit.c index 157ec5f05..cb515a0dd 100644 --- a/bsd/kern/kern_exit.c +++ b/bsd/kern/kern_exit.c @@ -68,53 +68,93 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/ioctl.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/tty.h> #include <sys/time.h> #include <sys/resource.h> #include <sys/kernel.h> -#include <sys/buf.h> #include <sys/wait.h> -#include <sys/file.h> -#include <sys/vnode.h> +#include <sys/file_internal.h> +#include <sys/vnode_internal.h> #include <sys/syslog.h> #include <sys/malloc.h> #include <sys/resourcevar.h> #include <sys/ptrace.h> #include <sys/user.h> #include <sys/aio_kern.h> +#include <sys/sysproto.h> +#include <sys/signalvar.h> +#include <sys/filedesc.h> /* fdfree */ +#include <sys/shm_internal.h> /* shmexit */ +#include <sys/acct.h> /* acct_process */ +#include <machine/spl.h> #include <bsm/audit_kernel.h> #include <bsm/audit_kevents.h> #include <mach/mach_types.h> + +#include <kern/kern_types.h> +#include <kern/kalloc.h> +#include <kern/task.h> #include <kern/thread.h> -#include <kern/thread_act.h> #include <kern/sched_prim.h> #include <kern/assert.h> #if KTRACE #include <sys/ktrace.h> -#include <sys/ubc.h> #endif +#include <mach/mach_types.h> +#include <mach/task.h> +#include <mach/thread_act.h> +#include <mach/mach_traps.h> /* init_process */ + extern char init_task_failure_data[]; -int exit1 __P((struct proc *, int, int *)); +int exit1(struct proc *, int, 
int *); void proc_prepareexit(struct proc *p); -int vfork_exit(struct proc *p, int rv); +void vfork_exit(struct proc *p, int rv); void vproc_exit(struct proc *p); +__private_extern__ void munge_rusage(struct rusage *a_rusage_p, struct user_rusage *a_user_rusage_p); + +/* + * Things which should have prototypes in headers, but don't + */ +void unix_syscall_return(int); +void *get_bsduthreadarg(thread_t); +void proc_exit(struct proc *p); +int wait1continue(int result); +int waitidcontinue(int result); +int *get_bsduthreadrval(thread_t); +kern_return_t sys_perf_notify(struct task *task, exception_data_t code, + mach_msg_type_number_t codeCnt); + +/* + * NOTE: Source and target may *NOT* overlap! + * XXX Should share code with bsd/dev/ppc/unix_signal.c + */ +static void +siginfo_64to32(user_siginfo_t *in, siginfo_t *out) +{ + out->si_signo = in->si_signo; + out->si_errno = in->si_errno; + out->si_code = in->si_code; + out->si_pid = in->si_pid; + out->si_uid = in->si_uid; + out->si_status = in->si_status; + out->si_addr = CAST_DOWN(void *,in->si_addr); + /* following cast works for sival_int because of padding */ + out->si_value.sival_ptr = CAST_DOWN(void *,in->si_value.sival_ptr); + out->si_band = in->si_band; /* range reduction */ + out->pad[0] = in->pad[0]; /* mcontext.ss.r1 */ +} /* * exit -- * Death of process. */ -struct exit_args { - int rval; -}; void -exit(p, uap, retval) - struct proc *p; - struct exit_args *uap; - int *retval; +exit(struct proc *p, struct exit_args *uap, int *retval) { exit1(p, W_EXITCODE(uap->rval, 0), retval); @@ -133,15 +173,11 @@ exit(p, uap, retval) * status and rusage for wait(). Check for child processes and orphan them. */ int -exit1(p, rv, retval) - register struct proc *p; - int rv; - int * retval; +exit1(struct proc *p, int rv, int *retval) { - register struct proc *q, *nq; - thread_act_t self = current_act(); + thread_t self = current_thread(); struct task *task = p->task; - register int i,s; + register int s; struct uthread *ut; /* @@ -151,13 +187,11 @@ exit1(p, rv, retval) */ ut = get_bsdthread_info(self); - if (ut->uu_flag & P_VFORK) { - if (!vfork_exit(p, rv)) { + if (ut->uu_flag & UT_VFORK) { + vfork_exit(p, rv); vfork_return(self, p->p_pptr, p , retval); unix_syscall_return(0); /* NOT REACHED */ - } - return(EINVAL); } AUDIT_SYSCALL_EXIT(0, p, ut); /* Exit is always successfull */ signal_lock(p); @@ -198,14 +232,14 @@ exit1(p, rv, retval) void proc_prepareexit(struct proc *p) { - int s; struct uthread *ut; exception_data_t code[EXCEPTION_CODE_MAX]; - thread_act_t self = current_act(); + thread_t self = current_thread(); - code[0] = 0xFF000001; /* Set terminate code */ - code[1] = p->p_pid; /* Pass out the pid */ - (void)sys_perf_notify(p->task, &code, 2); /* Notify the perf server */ + code[0] = (exception_data_t)0xFF000001; /* Set terminate code */ + code[1] = (exception_data_t)p->p_pid; /* Pass out the pid */ + /* Notify the perf server */ + (void)sys_perf_notify(p->task, (exception_data_t)&code, 2); /* * Remove proc from allproc queue and from pidhash chain. @@ -225,7 +259,7 @@ proc_prepareexit(struct proc *p) * P_PPWAIT is set; we will wakeup the parent below. 
*/ p->p_flag &= ~(P_TRACED | P_PPWAIT); - p->p_sigignore = ~0; + p->p_sigignore = ~(sigcantmask); p->p_siglist = 0; ut = get_bsdthread_info(self); ut->uu_siglist = 0; @@ -237,7 +271,7 @@ proc_exit(struct proc *p) { register struct proc *q, *nq, *pp; struct task *task = p->task; - register int i,s; + register int s; boolean_t funnel_state; /* This can happen if thread_terminate of the single thread @@ -252,6 +286,8 @@ proc_exit(struct proc *p) proc_prepareexit(p); } + p->p_lflag |= P_LPEXIT; + /* XXX Zombie allocation may fail, in which case stats get lost */ MALLOC_ZONE(p->p_ru, struct rusage *, sizeof (*p->p_ru), M_ZOMBIE, M_WAITOK); @@ -278,6 +314,7 @@ proc_exit(struct proc *p) if (sp->s_ttyvp) { struct vnode *ttyvp; + struct vfs_context context; /* * Controlling process. @@ -293,13 +330,16 @@ proc_exit(struct proc *p) * The tty could have been revoked * if we blocked. */ + context.vc_proc = p; + context.vc_ucred = p->p_ucred; if (sp->s_ttyvp) - VOP_REVOKE(sp->s_ttyvp, REVOKEALL); + VNOP_REVOKE(sp->s_ttyvp, REVOKEALL, &context); } ttyvp = sp->s_ttyvp; sp->s_ttyvp = NULL; - if (ttyvp) - vrele(ttyvp); + if (ttyvp) { + vnode_rele(ttyvp); + } /* * s_ttyp is not zero'd; we use this to indicate * that the session once had a controlling terminal. @@ -317,22 +357,15 @@ proc_exit(struct proc *p) /* * release trace file */ - p->p_traceflag = 0; /* don't trace the vrele() */ + p->p_traceflag = 0; /* don't trace the vnode_put() */ if (p->p_tracep) { struct vnode *tvp = p->p_tracep; p->p_tracep = NULL; - - if (UBCINFOEXISTS(tvp)) - ubc_rele(tvp); - vrele(tvp); + vnode_rele(tvp); } #endif - q = p->p_children.lh_first; - if (q) /* only need this if any child is S_ZOMB */ - wakeup((caddr_t) initproc); - for (; q != 0; q = nq) { - nq = q->p_sibling.le_next; + while (q = p->p_children.lh_first) { proc_reparent(q, initproc); /* * Traced processes are killed @@ -348,9 +381,9 @@ proc_exit(struct proc *p) * the first thread in the task. So any attempts to kill * the process would result into a deadlock on q->sigwait. */ - thread_resume((thread_act_t)q->sigwait_thread); + thread_resume((thread_t)q->sigwait_thread); clear_wait(q->sigwait_thread, THREAD_INTERRUPTED); - threadsignal((thread_act_t)q->sigwait_thread, SIGKILL, 0); + threadsignal((thread_t)q->sigwait_thread, SIGKILL, 0); } psignal(q, SIGKILL); } @@ -358,14 +391,16 @@ proc_exit(struct proc *p) /* * Save exit status and final rusage info, adding in child rusage - * info and self times. + * info and self times. If we were unable to allocate a zombie + * structure, this information is lost. 
*/ - *p->p_ru = p->p_stats->p_ru; + if (p->p_ru != NULL) { + *p->p_ru = p->p_stats->p_ru; - timerclear(&p->p_ru->ru_utime); - timerclear(&p->p_ru->ru_stime); + timerclear(&p->p_ru->ru_utime); + timerclear(&p->p_ru->ru_stime); - if (task) { + if (task) { task_basic_info_data_t tinfo; task_thread_times_info_data_t ttimesinfo; int task_info_stuff, task_ttimes_stuff; @@ -373,7 +408,7 @@ proc_exit(struct proc *p) task_info_stuff = TASK_BASIC_INFO_COUNT; task_info(task, TASK_BASIC_INFO, - &tinfo, &task_info_stuff); + (task_info_t)&tinfo, &task_info_stuff); p->p_ru->ru_utime.tv_sec = tinfo.user_time.seconds; p->p_ru->ru_utime.tv_usec = tinfo.user_time.microseconds; p->p_ru->ru_stime.tv_sec = tinfo.system_time.seconds; @@ -381,7 +416,7 @@ proc_exit(struct proc *p) task_ttimes_stuff = TASK_THREAD_TIMES_INFO_COUNT; task_info(task, TASK_THREAD_TIMES_INFO, - &ttimesinfo, &task_ttimes_stuff); + (task_info_t)&ttimesinfo, &task_ttimes_stuff); ut.tv_sec = ttimesinfo.user_time.seconds; ut.tv_usec = ttimesinfo.user_time.microseconds; @@ -389,9 +424,10 @@ proc_exit(struct proc *p) st.tv_usec = ttimesinfo.system_time.microseconds; timeradd(&ut,&p->p_ru->ru_utime,&p->p_ru->ru_utime); timeradd(&st,&p->p_ru->ru_stime,&p->p_ru->ru_stime); - } + } - ruadd(p->p_ru, &p->p_stats->p_cru); + ruadd(p->p_ru, &p->p_stats->p_cru); + } /* * Free up profiling buffers. @@ -405,7 +441,7 @@ proc_exit(struct proc *p) for (; p1 != NULL; p1 = pn) { pn = p1->pr_next; - kfree((vm_offset_t)p1, sizeof *p1); + kfree(p1, sizeof *p1); } } @@ -422,9 +458,6 @@ proc_exit(struct proc *p) FREE_ZONE(p->p_limit, sizeof *p->p_limit, M_SUBPROC); p->p_limit = NULL; - /* Free the auditing info */ - audit_proc_free(p); - /* * Finish up by terminating the task * and halt this thread (only if a @@ -440,18 +473,20 @@ proc_exit(struct proc *p) * Notify parent that we're gone. */ if (p->p_pptr->p_flag & P_NOCLDWAIT) { - struct proc * pp = p->p_pptr; + struct proc *opp = p->p_pptr; /* * Add child resource usage to parent before giving - * zombie to init + * zombie to init. If we were unable to allocate a + * zombie structure, this information is lost. 
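+	 * (For reference, ruadd() folds one rusage into another, roughly:
+	 *
+	 *	timeradd(&ru->ru_utime, &ru2->ru_utime, &ru->ru_utime);
+	 *	timeradd(&ru->ru_stime, &ru2->ru_stime, &ru->ru_stime);
+	 *	if (ru->ru_maxrss < ru2->ru_maxrss)
+	 *		ru->ru_maxrss = ru2->ru_maxrss;
+	 *
+	 * with the remaining integer counters summed field by field.)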
*/ - ruadd(&p->p_pptr->p_stats->p_cru, p->p_ru); + if (p->p_ru != NULL) + ruadd(&p->p_pptr->p_stats->p_cru, p->p_ru); proc_reparent(p, initproc); /* If there are no more children wakeup parent */ - if (LIST_EMPTY(&pp->p_children)) - wakeup((caddr_t)pp); + if (LIST_EMPTY(&opp->p_children)) + wakeup((caddr_t)opp); } /* should be fine as parent proc would be initproc */ pp = p->p_pptr; @@ -459,14 +494,13 @@ proc_exit(struct proc *p) pp->si_pid = p->p_pid; pp->si_status = p->p_xstat; pp->si_code = CLD_EXITED; - pp->si_uid = p->p_cred->p_ruid; + pp->si_uid = p->p_ucred->cr_ruid; } - psignal(pp, SIGCHLD); - - /* mark as a zombie */ p->p_stat = SZOMB; + psignal(pp, SIGCHLD); + /* and now wakeup the parent */ wakeup((caddr_t)p->p_pptr); @@ -474,73 +508,98 @@ proc_exit(struct proc *p) } -struct wait4_args { - int pid; - int *status; - int options; - struct rusage *rusage; -}; - -#if COMPAT_43 -int -owait(p, uap, retval) - struct proc *p; - void *uap; - int *retval; -{ - struct wait4_args *a; - - a = (struct wait4_args *)get_bsduthreadarg(current_act()); - - a->options = 0; - a->rusage = NULL; - a->pid = WAIT_ANY; - a->status = NULL; - return (wait1(p, a, retval, 1)); -} - -int -wait4(p, uap, retval) - struct proc *p; - struct wait4_args *uap; - int *retval; +/* + * reap_child_process + * + * Description: Given a process from which all status information needed + * has already been extracted, if the process is a ptrace + * attach process, detach it and give it back to its real + * parent, else recover all resources remaining associated + * with it. + * + * Parameters: struct proc *parent Parent of process being reaped + * struct proc *child Process to reap + * + * Returns: 0 Process was not reaped because it + * came from an attach + * 1 Process was reaped + */ +static int +reap_child_process(struct proc *parent, struct proc *child) { - return (wait1(p, uap, retval, 0)); -} + struct proc *trace_parent; /* Traced parent process, if tracing */ + struct vnode *tvp; /* Traced vnode pointer, if used */ -struct owait3_args { - int *status; - int options; - struct rusage *rusage; -}; + /* + * If we got the child via a ptrace 'attach', + * we need to give it back to the old parent. + */ + if (child->p_oppid && (trace_parent = pfind(child->p_oppid))) { + child->p_oppid = 0; + proc_reparent(child, trace_parent); + if (trace_parent != initproc) { + trace_parent->si_pid = child->p_pid; + trace_parent->si_status = child->p_xstat; + trace_parent->si_code = CLD_CONTINUED; + trace_parent->si_uid = child->p_ucred->cr_ruid; + } + psignal(trace_parent, SIGCHLD); + wakeup((caddr_t)trace_parent); + return (0); + } + child->p_xstat = 0; + if (child->p_ru) { + ruadd(&parent->p_stats->p_cru, child->p_ru); + FREE_ZONE(child->p_ru, sizeof *child->p_ru, M_ZOMBIE); + child->p_ru = NULL; + } else { + printf("Warning : lost p_ru for %s\n", child->p_comm); + } -int -owait3(p, uap, retval) - struct proc *p; - struct owait3_args *uap; - int *retval; -{ - struct wait4_args *a; + /* + * Decrement the count of procs running with this uid. + */ + (void)chgproccnt(child->p_ucred->cr_ruid, -1); - a = (struct wait4_args *)get_bsduthreadarg(current_act()); + /* + * Free up credentials. 
+ */ + if (child->p_ucred != NOCRED) { + kauth_cred_t ucr = child->p_ucred; + child->p_ucred = NOCRED; + kauth_cred_rele(ucr); + } - a->rusage = uap->rusage; - a->options = uap->options; - a->status = uap->status; - a->pid = WAIT_ANY; + /* + * Release reference to text vnode + */ + tvp = child->p_textvp; + child->p_textvp = NULL; + if (tvp) { + vnode_rele(tvp); + } + /* + * Finally finished with old proc entry. + * Unlink it from its process group and free it. + */ + leavepgrp(child); + LIST_REMOVE(child, p_list); /* off zombproc */ + LIST_REMOVE(child, p_sibling); + child->p_flag &= ~P_WAITING; - return (wait1(p, a, retval, 1)); + lck_mtx_destroy(&child->p_mlock, proc_lck_grp); + lck_mtx_destroy(&child->p_fdmlock, proc_lck_grp); + FREE_ZONE(child, sizeof *child, M_PROC); + nprocs--; + return (1); } -#else -#define wait1 wait4 -#endif int -wait1continue(result) +wait1continue(int result) { void *vt; - thread_act_t thread; + thread_t thread; int *retval; struct proc *p; @@ -548,27 +607,19 @@ wait1continue(result) return(result); p = current_proc(); - thread = current_act(); - vt = (void *)get_bsduthreadarg(thread); - retval = (int *)get_bsduthreadrval(thread); - return(wait1((struct proc *)p, (struct wait4_args *)vt, retval, 0)); + thread = current_thread(); + vt = get_bsduthreadarg(thread); + retval = get_bsduthreadrval(thread); + return(wait4((struct proc *)p, (struct wait4_args *)vt, retval)); } int -wait1(q, uap, retval, compat) - register struct proc *q; - register struct wait4_args *uap; - register_t *retval; -#if COMPAT_43 - int compat; -#endif +wait4(struct proc *q, struct wait4_args *uap, register_t *retval) { register int nfound; - register struct proc *p, *t; + register struct proc *p; int status, error; - struct vnode *tvp; -retry: if (uap->pid == 0) uap->pid = -q->p_pgid; @@ -580,6 +631,9 @@ loop: p->p_pgid != -(uap->pid)) continue; nfound++; + + /* XXX This is racy because we don't get the lock!!!! */ + if (p->p_flag & P_WAITING) { (void)tsleep(&p->p_stat, PWAIT, "waitcoll", 0); goto loop; @@ -588,113 +642,59 @@ loop: if (p->p_stat == SZOMB) { retval[0] = p->p_pid; -#if COMPAT_43 - if (compat) - retval[1] = p->p_xstat; - else -#endif if (uap->status) { status = p->p_xstat; /* convert to int */ - if (error = copyout((caddr_t)&status, - (caddr_t)uap->status, - sizeof(status))) { + error = copyout((caddr_t)&status, + uap->status, + sizeof(status)); + if (error) { p->p_flag &= ~P_WAITING; wakeup(&p->p_stat); return (error); } } - if (uap->rusage && - (error = copyout((caddr_t)p->p_ru, - (caddr_t)uap->rusage, - sizeof (struct rusage)))) { - p->p_flag &= ~P_WAITING; - wakeup(&p->p_stat); - return (error); - } - /* - * If we got the child via a ptrace 'attach', - * we need to give it back to the old parent. 
- */ - if (p->p_oppid && (t = pfind(p->p_oppid))) { - p->p_oppid = 0; - proc_reparent(p, t); - if (t != initproc) { - t->si_pid = p->p_pid; - t->si_status = p->p_xstat; - t->si_code = CLD_CONTINUED; - t->si_uid = p->p_cred->p_ruid; + if (uap->rusage) { + if (p->p_ru == NULL) { + error = ENOMEM; + } else { + if (IS_64BIT_PROCESS(q)) { + struct user_rusage my_rusage; + munge_rusage(p->p_ru, &my_rusage); + error = copyout((caddr_t)&my_rusage, + uap->rusage, + sizeof (my_rusage)); + } + else { + error = copyout((caddr_t)p->p_ru, + uap->rusage, + sizeof (struct rusage)); + } } - psignal(t, SIGCHLD); - wakeup((caddr_t)t); - p->p_flag &= ~P_WAITING; - wakeup(&p->p_stat); - return (0); - } - p->p_xstat = 0; - if (p->p_ru) { - ruadd(&q->p_stats->p_cru, p->p_ru); - FREE_ZONE(p->p_ru, sizeof *p->p_ru, M_ZOMBIE); - p->p_ru = NULL; - } else { - printf("Warning : lost p_ru for %s\n", p->p_comm); - } - - /* - * Decrement the count of procs running with this uid. - */ - (void)chgproccnt(p->p_cred->p_ruid, -1); - - /* - * Free up credentials. - */ - if (--p->p_cred->p_refcnt == 0) { - struct ucred *ucr = p->p_ucred; - struct pcred *pcr; - - if (ucr != NOCRED) { - p->p_ucred = NOCRED; - crfree(ucr); + /* information unavailable? */ + if (error) { + p->p_flag &= ~P_WAITING; + wakeup(&p->p_stat); + return (error); } - pcr = p->p_cred; - p->p_cred = NULL; - FREE_ZONE(pcr, sizeof *pcr, M_SUBPROC); } - /* - * Release reference to text vnode - */ - tvp = p->p_textvp; - p->p_textvp = NULL; - if (tvp) - vrele(tvp); + /* Clean up */ + if (!reap_child_process(q, p)) + p->p_flag &= ~P_WAITING; - /* - * Finally finished with old proc entry. - * Unlink it from its process group and free it. - */ - leavepgrp(p); - LIST_REMOVE(p, p_list); /* off zombproc */ - LIST_REMOVE(p, p_sibling); - p->p_flag &= ~P_WAITING; - FREE_ZONE(p, sizeof *p, M_PROC); - nprocs--; + /* Wake other wait'ers, if any */ wakeup(&p->p_stat); + return (0); } if (p->p_stat == SSTOP && (p->p_flag & P_WAITED) == 0 && (p->p_flag & P_TRACED || uap->options & WUNTRACED)) { p->p_flag |= P_WAITED; retval[0] = p->p_pid; -#if COMPAT_43 - if (compat) { - retval[1] = W_STOPCODE(p->p_xstat); - error = 0; - } else -#endif if (uap->status) { status = W_STOPCODE(p->p_xstat); error = copyout((caddr_t)&status, - (caddr_t)uap->status, + uap->status, sizeof(status)); } else error = 0; @@ -713,7 +713,262 @@ loop: return (0); } - if (error = tsleep0((caddr_t)q, PWAIT | PCATCH, "wait", 0, wait1continue)) + if ((error = tsleep0((caddr_t)q, PWAIT | PCATCH, "wait", 0, wait1continue))) + return (error); + + goto loop; +} + + +int +waitidcontinue(int result) +{ + void *vt; + thread_t thread; + int *retval; + struct proc *p; + + if (result) + return(result); + + p = current_proc(); + thread = current_thread(); + vt = get_bsduthreadarg(thread); + retval = get_bsduthreadrval(thread); + return(waitid((struct proc *)p, (struct waitid_args *)vt, retval)); +} + +/* + * Description: Suspend the calling thread until one child of the process + * containing the calling thread changes state. 
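+ *
+ * Note:	A minimal user-space sketch (the pid value and the
+ *		handle_status() consumer are hypothetical):
+ *
+ *			siginfo_t si;
+ *
+ *			if (waitid(P_PID, pid, &si, WEXITED) == 0)
+ *				handle_status(si.si_status);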
+ *
+ * Parameters:	uap->idtype	one of P_PID, P_PGID, P_ALL
+ *		uap->id		pid_t or gid_t or ignored
+ *		uap->infop	Address of siginfo_t struct in
+ *				user space into which to return status
+ *		uap->options	flag values
+ *
+ * Returns:	0			Success
+ *		!0			Error returning status to user space
+ */
+int
+waitid(struct proc *q, struct waitid_args *uap, register_t *retval)
+{
+	user_siginfo_t	collect64;	/* siginfo data to return to caller */
+
+	register int nfound;
+	register struct proc *p;
+	int error;
+
+loop:
+	nfound = 0;
+	for (p = q->p_children.lh_first; p != 0; p = p->p_sibling.le_next) {
+		switch(uap->idtype) {
+		case P_PID:	/* child with process ID equal to... */
+			if (p->p_pid != (pid_t)uap->id)
+				continue;
+			break;
+		case P_PGID:	/* child with process group ID equal to... */
+			if (p->p_pgid != (pid_t)uap->id)
+				continue;
+			break;
+		case P_ALL:	/* any child */
+			break;
+		}
+
+		/* XXX This is racy because we don't get the lock!!!! */
+
+		/*
+		 * Wait collision; go to sleep and restart; used to maintain
+		 * the single return for waited process guarantee.
+		 */
+		if (p->p_flag & P_WAITING) {
+			(void)tsleep(&p->p_stat, PWAIT, "waitidcoll", 0);
+			goto loop;
+		}
+		p->p_flag |= P_WAITING;		/* mark busy */
+
+		nfound++;
+
+		/*
+		 * Types of processes we are interested in
+		 *
+		 * XXX Don't know what to do for WCONTINUED?!?
+		 */
+		switch(p->p_stat) {
+		case SZOMB:		/* Exited */
+			if (!(uap->options & WEXITED))
+				break;
+
+			/* Collect "siginfo" information for caller */
+			collect64.si_signo = 0;
+			collect64.si_code = 0;
+			collect64.si_errno = 0;
+			collect64.si_pid = 0;
+			collect64.si_uid = 0;
+			collect64.si_addr = 0;
+			collect64.si_status = p->p_xstat;
+			collect64.si_band = 0;
+
+			if (IS_64BIT_PROCESS(p)) {
+				error = copyout((caddr_t)&collect64,
+					uap->infop,
+					sizeof(collect64));
+			} else {
+				siginfo_t collect;
+				siginfo_64to32(&collect64,&collect);
+				error = copyout((caddr_t)&collect,
+					uap->infop,
+					sizeof(collect));
+			}
+			/* information unavailable? */
+			if (error) {
+				p->p_flag &= ~P_WAITING;
+				wakeup(&p->p_stat);
+				return (error);
+			}
+
+			/* Prevent other processes from waiting for this event? */
+			if (!(uap->options & WNOWAIT)) {
+				/* Clean up */
+				if (!reap_child_process(q, p))
+					p->p_flag &= ~P_WAITING;
+
+				/* Wake other wait'ers, if any */
+				wakeup(&p->p_stat);
+			}
+
+			return (0);
+
+		case SSTOP:		/* Stopped */
+			/*
+			 * If we are not interested in stopped processes, then
+			 * ignore this one.
+			 */
+			if (!(uap->options & WSTOPPED))
+				break;
+
+			/*
+			 * If someone has already waited for it, we lost a race
+			 * to be the one to return status.
+			 */
+			if ((p->p_flag & P_WAITED) != 0)
+				break;
+
+			/*
+			 * If this is not a traced process, and they haven't
+			 * indicated an interest in untraced processes, then
+			 * ignore this one.
+			 */
+			if (!(p->p_flag & P_TRACED) && !(uap->options & WUNTRACED))
+				break;
+
+			/* Collect "siginfo" information for caller */
+			collect64.si_signo = 0;
+			collect64.si_code = 0;
+			collect64.si_errno = 0;
+			collect64.si_pid = 0;
+			collect64.si_uid = 0;
+			collect64.si_addr = 0;
+			collect64.si_status = p->p_xstat;
+			collect64.si_band = 0;
+
+			if (IS_64BIT_PROCESS(p)) {
+				error = copyout((caddr_t)&collect64,
+					uap->infop,
+					sizeof(collect64));
+			} else {
+				siginfo_t collect;
+				siginfo_64to32(&collect64,&collect);
+				error = copyout((caddr_t)&collect,
+					uap->infop,
+					sizeof(collect));
+			}
+			/* information unavailable? */
+			if (error) {
+				p->p_flag &= ~P_WAITING;
+				wakeup(&p->p_stat);
+				return (error);
+			}
+
+			/* Prevent other processes from waiting for this event? */
+			if (!(uap->options & WNOWAIT)) {
+				p->p_flag |= P_WAITED;
+			}
+
+			p->p_flag &= ~P_WAITING;
+			wakeup(&p->p_stat);
+			return (0);
+
+		default:		/* All others */
+			/* ...meaning Continued */
+			if (!(uap->options & WCONTINUED))
+				break;
+
+			/*
+			 * If the flag isn't set, then this process has not
+			 * been stopped and continued, or the status has
+			 * already been reaped by another caller of waitid().
+			 */
+			if ((p->p_flag & P_CONTINUED) == 0)
+				break;
+
+			/* Collect "siginfo" information for caller */
+			collect64.si_signo = 0;
+			collect64.si_code = 0;
+			collect64.si_errno = 0;
+			collect64.si_pid = 0;
+			collect64.si_uid = 0;
+			collect64.si_addr = 0;
+			collect64.si_status = p->p_xstat;
+			collect64.si_band = 0;
+
+			if (IS_64BIT_PROCESS(p)) {
+				error = copyout((caddr_t)&collect64,
+					uap->infop,
+					sizeof(collect64));
+			} else {
+				siginfo_t collect;
+				siginfo_64to32(&collect64,&collect);
+				error = copyout((caddr_t)&collect,
+					uap->infop,
+					sizeof(collect));
+			}
+			/* information unavailable? */
+			if (error) {
+				p->p_flag &= ~P_WAITING;
+				wakeup(&p->p_stat);
+				return (error);
+			}
+
+			/* Prevent other processes from waiting for this event? */
+			if (!(uap->options & WNOWAIT)) {
+				p->p_flag &= ~P_CONTINUED;
+			}
+
+			p->p_flag &= ~P_WAITING;
+			wakeup(&p->p_stat);
+			return (0);
+
+			break;
+		}
+
+
+		/* Not a process we are interested in; go on to next child */
+		p->p_flag &= ~P_WAITING;
+		wakeup(&p->p_stat);
+	}
+
+	/* No child processes that could possibly satisfy the request? */
+	if (nfound == 0)
+		return (ECHILD);
+
+	if (uap->options & WNOHANG) {
+		retval[0] = 0;
+		return (0);
+	}
+
+	if ((error = tsleep0((caddr_t)q, PWAIT | PCATCH, "waitid", 0, waitidcontinue)))
 		return (error);
 
 	goto loop;
@@ -723,9 +978,7 @@ loop:
 * make process 'parent' the new parent of process 'child'.
 */
 void
-proc_reparent(child, parent)
-	register struct proc *child;
-	register struct proc *parent;
+proc_reparent(struct proc *child, struct proc *parent)
 {
 
 	if (child->p_pptr == parent)
@@ -734,6 +987,9 @@
 	LIST_REMOVE(child, p_sibling);
 	LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
 	child->p_pptr = parent;
+
+	if (initproc == parent && child->p_stat == SZOMB)
+		psignal(initproc, SIGCHLD);
 }
 
 /*
@@ -742,12 +998,12 @@
 * gunned down by kill(-1, 0).
 */
 kern_return_t
-init_process(void)
+init_process(__unused struct init_process_args *args)
 {
 	register struct proc *p = current_proc();
 
 	AUDIT_MACH_SYSCALL_ENTER(AUE_INITPROCESS);
-	if (suser(p->p_ucred, &p->p_acflag)) {
+	if (suser(kauth_cred_get(), &p->p_acflag)) {
 		AUDIT_MACH_SYSCALL_EXIT(KERN_NO_ACCESS);
 		return(KERN_NO_ACCESS);
 	}
@@ -769,16 +1025,6 @@
 	return(KERN_SUCCESS);
 }
 
-void
-process_terminate_self(void)
-{
-	struct proc *p = current_proc();
-
-	if (p != NULL) {
-		exit1(p, W_EXITCODE(0, SIGKILL), (int *)NULL);
-		/*NOTREACHED*/
-	}
-}
 
 /*
 * Exit: deallocate address space and other resources, change proc state
@@ -786,31 +1032,57 @@
 * status and rusage for wait(). Check for child processes and orphan them.
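 *
 * (User-space sketch of the vfork(2) idiom this path serves -- the
 * child borrows the parent's address space, so it may only _exit()
 * or execve():
 *
 *	if (vfork() == 0) {
 *		execl("/bin/true", "true", (char *)0);
 *		_exit(127);		-- reached only if the exec failed
 *	}
 *
 * vfork_exit() below is the kernel side of that _exit() case.)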
*/ -int -vfork_exit(p, rv) - struct proc *p; - int rv; +void +vfork_exit(struct proc *p, int rv) { - register struct proc *q, *nq; - thread_act_t self = current_act(); + thread_t self = current_thread(); +#ifdef FIXME struct task *task = p->task; - register int i,s; +#endif + register int s; struct uthread *ut; exception_data_t code[EXCEPTION_CODE_MAX]; - ut = get_bsdthread_info(self); - if (p->exit_thread) { - return(1); - } - p->exit_thread = self; - + /* + * If a thread in this task has already + * called exit(), then halt any others + * right here. + */ + + ut = get_bsdthread_info(self); +#ifdef FIXME + signal_lock(p); + while (p->exit_thread != self) { + if (sig_try_locked(p) <= 0) { + if (get_threadtask(self) != task) { + signal_unlock(p); + return; + } + signal_unlock(p); + thread_terminate(self); + thread_funnel_set(kernel_flock, FALSE); + thread_exception_return(); + /* NOTREACHED */ + } + sig_lock_to_exit(p); + } + signal_unlock(p); + if (p->p_pid == 1) { + printf("pid 1 exited (signal %d, exit %d)", + WTERMSIG(rv), WEXITSTATUS(rv)); +panic("init died\nState at Last Exception:\n\n%s", init_task_failure_data); + } +#endif /* FIXME */ + s = splsched(); p->p_flag |= P_WEXIT; + p->p_lflag |= P_LPEXIT; splx(s); - code[0] = 0xFF000001; /* Set terminate code */ - code[1] = p->p_pid; /* Pass out the pid */ - (void)sys_perf_notify(p->task, &code, 2); /* Notify the perf server */ + code[0] = (exception_data_t)0xFF000001; /* Set terminate code */ + code[1] = (exception_data_t)p->p_pid; /* Pass out the pid */ + /* Notify the perf server */ + (void)sys_perf_notify(p->task, (exception_data_t)&code, 2); /* * Remove proc from allproc queue and from pidhash chain. @@ -835,17 +1107,17 @@ vfork_exit(p, rv) p->p_xstat = rv; vproc_exit(p); - return(0); } void vproc_exit(struct proc *p) { register struct proc *q, *nq, *pp; +#ifdef FIXME struct task *task = p->task; - register int i,s; - boolean_t funnel_state; +#endif + /* XXX Zombie allocation may fail, in which case stats get lost */ MALLOC_ZONE(p->p_ru, struct rusage *, sizeof (*p->p_ru), M_ZOMBIE, M_WAITOK); @@ -860,6 +1132,7 @@ vproc_exit(struct proc *p) if (sp->s_ttyvp) { struct vnode *ttyvp; + struct vfs_context context; /* * Controlling process. @@ -875,13 +1148,16 @@ vproc_exit(struct proc *p) * The tty could have been revoked * if we blocked. */ + context.vc_proc = p; + context.vc_ucred = p->p_ucred; if (sp->s_ttyvp) - VOP_REVOKE(sp->s_ttyvp, REVOKEALL); + VNOP_REVOKE(sp->s_ttyvp, REVOKEALL, &context); } ttyvp = sp->s_ttyvp; sp->s_ttyvp = NULL; - if (ttyvp) - vrele(ttyvp); + if (ttyvp) { + vnode_rele(ttyvp); + } /* * s_ttyp is not zero'd; we use this to indicate * that the session once had a controlling terminal. @@ -898,22 +1174,15 @@ vproc_exit(struct proc *p) /* * release trace file */ - p->p_traceflag = 0; /* don't trace the vrele() */ + p->p_traceflag = 0; /* don't trace the vnode_rele() */ if (p->p_tracep) { struct vnode *tvp = p->p_tracep; p->p_tracep = NULL; - - if (UBCINFOEXISTS(tvp)) - ubc_rele(tvp); - vrele(tvp); + vnode_rele(tvp); } #endif - q = p->p_children.lh_first; - if (q) /* only need this if any child is S_ZOMB */ - wakeup((caddr_t) initproc); - for (; q != 0; q = nq) { - nq = q->p_sibling.le_next; + while (q = p->p_children.lh_first) { proc_reparent(q, initproc); /* * Traced processes are killed @@ -929,9 +1198,9 @@ vproc_exit(struct proc *p) * the first thread in the task. So any attempts to kill * the process would result into a deadlock on q->sigwait. 
 */
-		thread_resume((thread_act_t)q->sigwait_thread);
+		thread_resume((thread_t)q->sigwait_thread);
 		clear_wait(q->sigwait_thread, THREAD_INTERRUPTED);
-		threadsignal((thread_act_t)q->sigwait_thread, SIGKILL, 0);
+		threadsignal((thread_t)q->sigwait_thread, SIGKILL, 0);
 	}
 	psignal(q, SIGKILL);
 }
@@ -939,15 +1208,16 @@ vproc_exit(struct proc *p)
 
 	/*
 	 * Save exit status and final rusage info, adding in child rusage
-	 * info and self times.
+	 * info and self times. If we were unable to allocate a zombie
+	 * structure, this information is lost.
 	 */
-	*p->p_ru = p->p_stats->p_ru;
-
-	timerclear(&p->p_ru->ru_utime);
-	timerclear(&p->p_ru->ru_stime);
+	if (p->p_ru != NULL) {
+	    *p->p_ru = p->p_stats->p_ru;
+	    timerclear(&p->p_ru->ru_utime);
+	    timerclear(&p->p_ru->ru_stime);
 
 #ifdef FIXME
-	if (task) {
+	    if (task) {
 		task_basic_info_data_t tinfo;
 		task_thread_times_info_data_t ttimesinfo;
 		int task_info_stuff, task_ttimes_stuff;
@@ -970,11 +1240,12 @@ vproc_exit(struct proc *p)
 		st.tv_sec = ttimesinfo.system_time.seconds;
 		st.tv_usec = ttimesinfo.system_time.microseconds;
 		timeradd(&ut,&p->p_ru->ru_utime,&p->p_ru->ru_utime);
-		timeradd(&st,&p->p_ru->ru_stime,&p->p_ru->ru_stime);
-	}
+		timeradd(&st,&p->p_ru->ru_stime,&p->p_ru->ru_stime);
+	    }
 #endif /* FIXME */
 
-	ruadd(p->p_ru, &p->p_stats->p_cru);
+	    ruadd(p->p_ru, &p->p_stats->p_cru);
+	}
 
 	/*
 	 * Free up profiling buffers.
@@ -988,7 +1259,7 @@ vproc_exit(struct proc *p)
 
 		for (; p1 != NULL; p1 = pn) {
 			pn = p1->pr_next;
-			kfree((vm_offset_t)p1, sizeof *p1);
+			kfree(p1, sizeof *p1);
 		}
 	}
 
@@ -1020,13 +1291,48 @@ vproc_exit(struct proc *p)
 		pp->si_pid = p->p_pid;
 		pp->si_status = p->p_xstat;
 		pp->si_code = CLD_EXITED;
-		pp->si_uid = p->p_cred->p_ruid;
+		pp->si_uid = p->p_ucred->cr_ruid;
 	}
-	psignal(p->p_pptr, SIGCHLD);
-
 	/* mark as a zombie */
 	p->p_stat = SZOMB;
 
+	psignal(p->p_pptr, SIGCHLD);
+
 	/* and now wakeup the parent */
 	wakeup((caddr_t)p->p_pptr);
 }
+
+
+/*
+ * munge_rusage
+ *	LP64 support - long is 64 bits if we are dealing with a 64 bit user
+ *	process. We munge the kernel (32 bit) version of rusage into the
+ *	64 bit version.
+ */
+__private_extern__ void
+munge_rusage(struct rusage *a_rusage_p, struct user_rusage *a_user_rusage_p)
+{
+	/* timeval changes size, so utime and stime need special handling */
+	a_user_rusage_p->ru_utime.tv_sec = a_rusage_p->ru_utime.tv_sec;
+	a_user_rusage_p->ru_utime.tv_usec = a_rusage_p->ru_utime.tv_usec;
+	a_user_rusage_p->ru_stime.tv_sec = a_rusage_p->ru_stime.tv_sec;
+	a_user_rusage_p->ru_stime.tv_usec = a_rusage_p->ru_stime.tv_usec;
+	/*
+	 * everything else can be a direct assign, since there is no loss
+	 * of precision implied going 32->64.
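+	 *
+	 * (For example, a field such as ru_maxrss, declared long, simply
+	 * widens from 32 to 64 bits; the direct assignment sign-extends
+	 * and preserves the value exactly.)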
+ */ + a_user_rusage_p->ru_maxrss = a_rusage_p->ru_maxrss; + a_user_rusage_p->ru_ixrss = a_rusage_p->ru_ixrss; + a_user_rusage_p->ru_idrss = a_rusage_p->ru_idrss; + a_user_rusage_p->ru_isrss = a_rusage_p->ru_isrss; + a_user_rusage_p->ru_minflt = a_rusage_p->ru_minflt; + a_user_rusage_p->ru_majflt = a_rusage_p->ru_majflt; + a_user_rusage_p->ru_nswap = a_rusage_p->ru_nswap; + a_user_rusage_p->ru_inblock = a_rusage_p->ru_inblock; + a_user_rusage_p->ru_oublock = a_rusage_p->ru_oublock; + a_user_rusage_p->ru_msgsnd = a_rusage_p->ru_msgsnd; + a_user_rusage_p->ru_msgrcv = a_rusage_p->ru_msgrcv; + a_user_rusage_p->ru_nsignals = a_rusage_p->ru_nsignals; + a_user_rusage_p->ru_nvcsw = a_rusage_p->ru_nvcsw; + a_user_rusage_p->ru_nivcsw = a_rusage_p->ru_nivcsw; +} diff --git a/bsd/kern/kern_fork.c b/bsd/kern/kern_fork.c index 40a2275c1..a993e3356 100644 --- a/bsd/kern/kern_fork.c +++ b/bsd/kern/kern_fork.c @@ -66,29 +66,33 @@ #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/malloc.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/user.h> #include <sys/resourcevar.h> -#include <sys/vnode.h> -#include <sys/file.h> +#include <sys/vnode_internal.h> +#include <sys/file_internal.h> #include <sys/acct.h> -#include <sys/wait.h> - -#include <bsm/audit_kernel.h> - #if KTRACE #include <sys/ktrace.h> -#include <sys/ubc.h> #endif +#include <bsm/audit_kernel.h> + #include <mach/mach_types.h> +#include <kern/kern_types.h> +#include <kern/kalloc.h> #include <kern/mach_param.h> +#include <kern/task.h> +#include <kern/zalloc.h> #include <machine/spl.h> -thread_act_t cloneproc(struct proc *, int); +#include <vm/vm_protos.h> // for vm_map_commpage64 + +thread_t cloneproc(struct proc *, int); struct proc * forkproc(struct proc *, int); -thread_act_t procdup(); +thread_t procdup(struct proc *child, struct proc *parent); #define DOFORK 0x1 /* fork() system call */ #define DOVFORK 0x2 /* vfork() system call */ @@ -98,10 +102,7 @@ static int fork1(struct proc *, long, register_t *); * fork system call. */ int -fork(p, uap, retval) - struct proc *p; - void *uap; - register_t *retval; +fork(struct proc *p, __unused void *uap, register_t *retval) { return (fork1(p, (long)DOFORK, retval)); } @@ -110,18 +111,15 @@ fork(p, uap, retval) * vfork system call */ int -vfork(p, uap, retval) - struct proc *p; - void *uap; - register_t *retval; +vfork(struct proc *p, void *uap, register_t *retval) { register struct proc * newproc; register uid_t uid; - thread_act_t cur_act = (thread_act_t)current_act(); + thread_t cur_act = (thread_t)current_thread(); int count; task_t t; uthread_t ut; - + /* * Although process entries are dynamically created, we still keep * a global limit on the maximum number we will create. Don't allow @@ -129,7 +127,7 @@ vfork(p, uap, retval) * exceed the limit. The variable nprocs is the current number of * processes, maxproc is the limit. 
 */
-	uid = p->p_cred->p_ruid;
+	uid = kauth_cred_get()->cr_ruid;
 	if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) {
 		tablefull("proc");
 		retval[1] = 0;
@@ -147,7 +145,7 @@
 	}
 
 	ut = (struct uthread *)get_bsdthread_info(cur_act);
-	if (ut->uu_flag & P_VFORK) {
+	if (ut->uu_flag & UT_VFORK) {
 		printf("vfork called recursively by %s\n", p->p_comm);
 		(void)chgproccnt(uid, -1);
 		return (EINVAL);
@@ -172,14 +170,20 @@
 	newproc->p_flag |= P_INVFORK;
 	newproc->p_vforkact = cur_act;
 
-	ut->uu_flag |= P_VFORK;
+	ut->uu_flag |= UT_VFORK;
 	ut->uu_proc = newproc;
 	ut->uu_userstate = (void *)act_thread_csave();
 	ut->uu_vforkmask = ut->uu_sigmask;
 
+	/* temporarily drop thread-set-id state */
+	if (ut->uu_flag & UT_SETUID) {
+		ut->uu_flag |= UT_WASSETUID;
+		ut->uu_flag &= ~UT_SETUID;
+	}
+
 	thread_set_child(cur_act, newproc->p_pid);
 
-	newproc->p_stats->p_start = time;
+	microtime(&newproc->p_stats->p_start);
 	newproc->p_acflag = AFORK;
 
 	/*
@@ -202,38 +206,35 @@
 * Return to parent vfork thread()
 */
 void
-vfork_return(th_act, p, p2, retval)
-	thread_act_t th_act;
-	struct proc * p;
-	struct proc *p2;
-	register_t *retval;
+vfork_return(__unused thread_t th_act, struct proc *p, struct proc *p2,
+	register_t *retval)
 {
-	long flags;
-	register uid_t uid;
-	int s, count;
-	task_t t;
+	thread_t cur_act = (thread_t)current_thread();
 	uthread_t ut;
 
-	ut = (struct uthread *)get_bsdthread_info(th_act);
+	ut = (struct uthread *)get_bsdthread_info(cur_act);
 
 	act_thread_catt(ut->uu_userstate);
 
 	/* Make sure only one at this time */
-	if (p) {
-		p->p_vforkcnt--;
-		if (p->p_vforkcnt <0)
-			panic("vfork cnt is -ve");
-		if (p->p_vforkcnt <=0)
-			p->p_flag &= ~P_VFORK;
-	}
+	p->p_vforkcnt--;
+	if (p->p_vforkcnt <0)
+		panic("vfork cnt is -ve");
+	if (p->p_vforkcnt <=0)
+		p->p_flag &= ~P_VFORK;
 	ut->uu_userstate = 0;
-	ut->uu_flag &= ~P_VFORK;
+	ut->uu_flag &= ~UT_VFORK;
+	/* restore thread-set-id state */
+	if (ut->uu_flag & UT_WASSETUID) {
+		ut->uu_flag |= UT_SETUID;
+		ut->uu_flag &= ~UT_WASSETUID;
+	}
 	ut->uu_proc = 0;
 	ut->uu_sigmask = ut->uu_vforkmask;
 	p2->p_flag &= ~P_INVFORK;
 	p2->p_vforkact = (void *)0;
 
-	thread_set_parent(th_act, p2->p_pid);
+	thread_set_parent(cur_act, p2->p_pid);
 
 	if (retval) {
 		retval[0] = p2->p_pid;
@@ -243,16 +244,12 @@
 	return;
 }
 
-thread_act_t
-procdup(
-	struct proc *child,
-	struct proc *parent)
+thread_t
+procdup(struct proc *child, struct proc *parent)
 {
-	thread_act_t thread;
+	thread_t thread;
 	task_t task;
 	kern_return_t result;
-	pmap_t pmap;
-	extern task_t kernel_task;
 
 	if (parent->task == kernel_task)
 		result = task_create_internal(TASK_NULL, FALSE, &task);
@@ -263,6 +260,18 @@
 	child->task = task;
 	/* task->proc = child; */
 	set_bsdtask_info(task, child);
+	if (parent->p_flag & P_LP64) {
+		task_set_64bit(task, TRUE);
+		child->p_flag |= P_LP64;
+#ifdef __PPC__
+		/* LP64todo - clean up this hacked mapping of commpage */
+		pmap_map_sharedpage(task, get_map_pmap(get_task_map(task)));
+		vm_map_commpage64(get_task_map(task));
+#endif /* __PPC__ */
+	} else {
+		task_set_64bit(task, FALSE);
+		child->p_flag &= ~P_LP64;
+	}
 	if (child->p_nice != 0)
 		resetpriority(child);
 
@@ -282,9 +291,9 @@
 {
 	register struct proc *p2;
 	register uid_t uid;
-	thread_act_t newth;
-	int s, count;
-	task_t t;
+	thread_t newth;
+	int count;
+	task_t t;
 
 	/*
	 * Although process entries are dynamically created, we still keep
@@ -293,7 +302,7 @@
	 * exceed the limit.
The variable nprocs is the current number of * processes, maxproc is the limit. */ - uid = p1->p_cred->p_ruid; + uid = kauth_cred_get()->cr_ruid; if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) { tablefull("proc"); retval[1] = 0; @@ -321,9 +330,7 @@ fork1(p1, flags, retval) thread_set_child(newth, p2->p_pid); - s = splhigh(); - p2->p_stats->p_start = time; - splx(s); + microtime(&p2->p_stats->p_start); p2->p_acflag = AFORK; /* @@ -339,10 +346,10 @@ fork1(p1, flags, retval) (void) thread_resume(newth); /* drop the extra references we got during the creation */ - if (t = (task_t)get_threadtask(newth)) { + if ((t = (task_t)get_threadtask(newth)) != NULL) { task_deallocate(t); } - act_deallocate(newth); + thread_deallocate(newth); KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid); @@ -364,13 +371,13 @@ fork1(p1, flags, retval) * lock set. fork() code needs to explicity remove this lock * before signals can be delivered */ -thread_act_t +thread_t cloneproc(p1, lock) register struct proc *p1; register int lock; { register struct proc *p2; - thread_act_t th; + thread_t th; p2 = (struct proc *)forkproc(p1,lock); @@ -399,17 +406,20 @@ forkproc(p1, lock) { register struct proc *p2, *newproc; static int nextpid = 0, pidchecked = 0; - thread_t th; /* Allocate new proc. */ MALLOC_ZONE(newproc, struct proc *, sizeof *newproc, M_PROC, M_WAITOK); - MALLOC_ZONE(newproc->p_cred, struct pcred *, - sizeof *newproc->p_cred, M_SUBPROC, M_WAITOK); + if (newproc == NULL) + panic("forkproc: M_PROC zone exhausted"); MALLOC_ZONE(newproc->p_stats, struct pstats *, sizeof *newproc->p_stats, M_SUBPROC, M_WAITOK); + if (newproc->p_stats == NULL) + panic("forkproc: M_SUBPROC zone exhausted (p_stats)"); MALLOC_ZONE(newproc->p_sigacts, struct sigacts *, sizeof *newproc->p_sigacts, M_SUBPROC, M_WAITOK); + if (newproc->p_sigacts == NULL) + panic("forkproc: M_SUBPROC zone exhausted (p_sigacts)"); /* * Find an unused process ID. We remember a range of unused IDs @@ -464,9 +474,9 @@ again: nprocs++; p2 = newproc; p2->p_stat = SIDL; + p2->p_shutdownstate = 0; p2->p_pid = nextpid; - p2->p_shutdownstate = 0; /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, @@ -479,34 +489,35 @@ again: p2->vm_shm = (void *)NULL; /* Make sure it is zero */ /* - * Copy the audit info. - */ - audit_proc_fork(p1, p2); - - /* + * Some flags are inherited from the parent. * Duplicate sub-structures as needed. * Increase reference counts on shared objects. * The p_stats and p_sigacts substructs are set in vm_fork. */ - p2->p_flag = P_INMEM; - p2->p_flag |= (p1->p_flag & P_CLASSIC); // copy from parent - p2->p_flag |= (p1->p_flag & P_AFFINITY); // copy from parent + p2->p_flag = (p1->p_flag & (P_LP64 | P_CLASSIC | P_AFFINITY)); if (p1->p_flag & P_PROFIL) startprofclock(p2); - bcopy(p1->p_cred, p2->p_cred, sizeof(*p2->p_cred)); - p2->p_cred->p_refcnt = 1; - crhold(p1->p_ucred); - lockinit(&p2->p_cred->pc_lock, PLOCK, "proc cred", 0, 0); + /* + * Note that if the current thread has an assumed identity, this + * credential will be granted to the new process. 
+ */ + p2->p_ucred = kauth_cred_get_with_ref(); + + lck_mtx_init(&p2->p_mlock, proc_lck_grp, proc_lck_attr); + lck_mtx_init(&p2->p_fdmlock, proc_lck_grp, proc_lck_attr); klist_init(&p2->p_klist); /* bump references to the text vnode */ p2->p_textvp = p1->p_textvp; - if (p2->p_textvp) - VREF(p2->p_textvp); - + if (p2->p_textvp) { + vnode_rele(p2->p_textvp); + } + /* XXX may fail to copy descriptors to child */ p2->p_fd = fdcopy(p1); + if (p1->vm_shm) { - shmfork(p1,p2); + /* XXX may fail to attach shm to child */ + (void)shmfork(p1,p2); } /* * If p_limit is still copy-on-write, bump refcnt, @@ -528,6 +539,8 @@ again: ((caddr_t)&p2->p_stats->pstat_endcopy - (caddr_t)&p2->p_stats->pstat_startcopy)); + bzero(&p2->p_stats->user_p_prof, sizeof(struct user_uprof)); + if (p1->p_sigacts != NULL) (void)memcpy(p2->p_sigacts, p1->p_sigacts, sizeof *p2->p_sigacts); @@ -553,6 +566,7 @@ again: p2->user_stack = p1->user_stack; p2->p_vforkcnt = 0; p2->p_vforkact = 0; + p2->p_lflag = 0; TAILQ_INIT(&p2->p_uthlist); TAILQ_INIT(&p2->aio_activeq); TAILQ_INIT(&p2->aio_doneq); @@ -567,9 +581,7 @@ again: if (p1->p_traceflag&KTRFAC_INHERIT) { p2->p_traceflag = p1->p_traceflag; if ((p2->p_tracep = p1->p_tracep) != NULL) { - if (UBCINFOEXISTS(p2->p_tracep)) - ubc_hold(p2->p_tracep); - VREF(p2->p_tracep); + vnode_ref(p2->p_tracep); } } #endif @@ -577,30 +589,41 @@ again: } +void +proc_lock(proc_t p) +{ + lck_mtx_lock(&p->p_mlock); +} + +void +proc_unlock(proc_t p) +{ + lck_mtx_unlock(&p->p_mlock); +} + #include <kern/zalloc.h> struct zone *uthread_zone; int uthread_zone_inited = 0; void -uthread_zone_init() +uthread_zone_init(void) { if (!uthread_zone_inited) { uthread_zone = zinit(sizeof(struct uthread), - THREAD_MAX * sizeof(struct uthread), - THREAD_CHUNK * sizeof(struct uthread), - "uthreads"); + THREAD_MAX * sizeof(struct uthread), + THREAD_CHUNK * sizeof(struct uthread), + "uthreads"); uthread_zone_inited = 1; } } void * -uthread_alloc(task_t task, thread_act_t thr_act ) +uthread_alloc(task_t task, thread_t thr_act ) { struct proc *p; struct uthread *uth, *uth_parent; void *ut; - extern task_t kernel_task; boolean_t funnel_state; if (!uthread_zone_inited) @@ -609,22 +632,44 @@ uthread_alloc(task_t task, thread_act_t thr_act ) ut = (void *)zalloc(uthread_zone); bzero(ut, sizeof(struct uthread)); - if (task != kernel_task) { - uth = (struct uthread *)ut; - p = (struct proc *) get_bsdtask_info(task); + p = (struct proc *) get_bsdtask_info(task); + uth = (struct uthread *)ut; + /* + * Thread inherits credential from the creating thread, if both + * are in the same task. + * + * If the creating thread has no credential or is from another + * task we can leave the new thread credential NULL. If it needs + * one later, it will be lazily assigned from the task's process. 
+ */ + uth_parent = (struct uthread *)get_bsdthread_info(current_thread()); + if ((task == current_task()) && + (uth_parent != NULL) && + (uth_parent->uu_ucred != NOCRED)) { + uth->uu_ucred = uth_parent->uu_ucred; + kauth_cred_ref(uth->uu_ucred); + /* the credential we just inherited is an assumed credential */ + if (uth_parent->uu_flag & UT_SETUID) + uth->uu_flag |= UT_SETUID; + } else { + uth->uu_ucred = NOCRED; + } + + if (task != kernel_task) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); - uth_parent = (struct uthread *)get_bsdthread_info(current_act()); if (uth_parent) { - if (uth_parent->uu_flag & USAS_OLDMASK) + if (uth_parent->uu_flag & UT_SAS_OLDMASK) uth->uu_sigmask = uth_parent->uu_oldmask; else uth->uu_sigmask = uth_parent->uu_sigmask; } uth->uu_act = thr_act; //signal_lock(p); - if (p) + if (p) { TAILQ_INSERT_TAIL(&p->p_uthlist, uth, uu_list); + } //signal_unlock(p); (void)thread_funnel_set(kernel_flock, funnel_state); } @@ -634,16 +679,12 @@ uthread_alloc(task_t task, thread_act_t thr_act ) void -uthread_free(task_t task, thread_t act, void *uthread, void * bsd_info) +uthread_free(task_t task, void *uthread, void * bsd_info) { struct _select *sel; struct uthread *uth = (struct uthread *)uthread; struct proc * p = (struct proc *)bsd_info; - extern task_t kernel_task; - int size; boolean_t funnel_state; - struct nlminfo *nlmp; - struct proc * vproc; /* * Per-thread audit state should never last beyond system @@ -653,40 +694,31 @@ uthread_free(task_t task, thread_t act, void *uthread, void * bsd_info) */ assert(uth->uu_ar == NULL); - sel = &uth->uu_state.ss_select; + sel = &uth->uu_select; /* cleanup the select bit space */ if (sel->nbytes) { FREE(sel->ibits, M_TEMP); FREE(sel->obits, M_TEMP); } - if (sel->allocsize && uth->uu_wqsub){ - kfree(uth->uu_wqsub, sel->allocsize); - sel->count = sel->nfcount = 0; + if (sel->allocsize && sel->wqset){ + kfree(sel->wqset, sel->allocsize); + sel->count = 0; sel->allocsize = 0; - uth->uu_wqsub = 0; + sel->wqset = 0; sel->wql = 0; } - if ((nlmp = uth->uu_nlminfo)) { - uth->uu_nlminfo = 0; - FREE(nlmp, M_LOCKF); - } - - if ((task != kernel_task) ) { - int vfork_exit(struct proc *, int); + if (uth->uu_ucred != NOCRED) + kauth_cred_rele(uth->uu_ucred); + if ((task != kernel_task) && p) { funnel_state = thread_funnel_set(kernel_flock, TRUE); - if (p) - TAILQ_REMOVE(&p->p_uthlist, uth, uu_list); - if ((uth->uu_flag & P_VFORK) && (vproc = uth->uu_proc) - && (vproc->p_flag & P_INVFORK)) { - if (!vfork_exit(vproc, W_EXITCODE(0, SIGKILL))) - vfork_return(act, p, vproc, NULL); - - } + //signal_lock(p); + TAILQ_REMOVE(&p->p_uthlist, uth, uu_list); + //signal_unlock(p); (void)thread_funnel_set(kernel_flock, funnel_state); } /* and free the uthread itself */ - zfree(uthread_zone, (vm_offset_t)uthread); + zfree(uthread_zone, uthread); } diff --git a/bsd/kern/kern_ktrace.c b/bsd/kern/kern_ktrace.c index 4234c2fbc..c77a03c90 100644 --- a/bsd/kern/kern_ktrace.c +++ b/bsd/kern/kern_ktrace.c @@ -60,27 +60,28 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/types.h> -#include <sys/proc.h> -#include <sys/file.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> +#include <sys/file_internal.h> #include <sys/namei.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #if KTRACE #include <sys/ktrace.h> #endif #include <sys/malloc.h> #include <sys/syslog.h> -#include <sys/ubc.h> +#include <sys/sysproto.h> +#include <sys/uio_internal.h> #include <bsm/audit_kernel.h> #if KTRACE -static struct ktr_header *ktrgetheader __P((int 
type)); -static void ktrwrite __P((struct vnode *, struct ktr_header *, - struct uio *, int)); -static int ktrcanset __P((struct proc *,struct proc *)); -static int ktrsetchildren __P((struct proc *,struct proc *, - int, int, struct vnode *)); -static int ktrops __P((struct proc *,struct proc *,int,int,struct vnode *)); +static struct ktr_header *ktrgetheader(int type); +static void ktrwrite(struct vnode *, struct ktr_header *, struct uio *); +static int ktrcanset(struct proc *,struct proc *); +static int ktrsetchildren(struct proc *,struct proc *, + int, int, struct vnode *); +static int ktrops(struct proc *,struct proc *,int,int,struct vnode *); static struct ktr_header * @@ -92,27 +93,28 @@ ktrgetheader(type) MALLOC(kth, struct ktr_header *, sizeof (struct ktr_header), M_KTRACE, M_WAITOK); - kth->ktr_type = type; - microtime(&kth->ktr_time); - kth->ktr_pid = p->p_pid; - bcopy(p->p_comm, kth->ktr_comm, MAXCOMLEN); + if (kth != NULL) { + kth->ktr_type = type; + microtime(&kth->ktr_time); + kth->ktr_pid = p->p_pid; + bcopy(p->p_comm, kth->ktr_comm, MAXCOMLEN); + } return (kth); } #endif void -ktrsyscall(p, code, narg, args, funnel_type) +ktrsyscall(p, code, narg, args) struct proc *p; int code, narg; - register_t args[]; - int funnel_type; + u_int64_t args[]; { #if KTRACE struct vnode *vp; struct ktr_header *kth; struct ktr_syscall *ktp; register int len; - register_t *argp; + u_int64_t *argp; int i; if (!KTRPOINT(p, KTR_SYSCALL)) @@ -120,10 +122,18 @@ ktrsyscall(p, code, narg, args, funnel_type) vp = p->p_tracep; len = __offsetof(struct ktr_syscall, ktr_args) + - (narg * sizeof(register_t)); + (narg * sizeof(u_int64_t)); p->p_traceflag |= KTRFAC_ACTIVE; kth = ktrgetheader(KTR_SYSCALL); + if (kth == NULL) { + p->p_traceflag &= ~KTRFAC_ACTIVE; + return; + } MALLOC(ktp, struct ktr_syscall *, len, M_KTRACE, M_WAITOK); + if (ktp == NULL) { + FREE(kth, M_KTRACE); + return; + } ktp->ktr_code = code; ktp->ktr_narg = narg; argp = &ktp->ktr_args[0]; @@ -131,7 +141,7 @@ ktrsyscall(p, code, narg, args, funnel_type) *argp++ = args[i]; kth->ktr_buf = (caddr_t)ktp; kth->ktr_len = len; - ktrwrite(vp, kth, NULL, funnel_type); + ktrwrite(vp, kth, NULL); FREE(ktp, M_KTRACE); FREE(kth, M_KTRACE); p->p_traceflag &= ~KTRFAC_ACTIVE; @@ -141,11 +151,10 @@ ktrsyscall(p, code, narg, args, funnel_type) } void -ktrsysret(p, code, error, retval, funnel_type) +ktrsysret(p, code, error, retval) struct proc *p; int code, error; register_t retval; - int funnel_type; { #if KTRACE struct vnode *vp; @@ -158,6 +167,10 @@ ktrsysret(p, code, error, retval, funnel_type) vp = p->p_tracep; p->p_traceflag |= KTRFAC_ACTIVE; kth = ktrgetheader(KTR_SYSRET); + if (kth == NULL) { + p->p_traceflag &= ~KTRFAC_ACTIVE; + return; + } ktp.ktr_code = code; ktp.ktr_error = error; ktp.ktr_retval = retval; /* what about val2 ? 
*/ @@ -165,7 +178,7 @@ ktrsysret(p, code, error, retval, funnel_type) kth->ktr_buf = (caddr_t)&ktp; kth->ktr_len = sizeof(struct ktr_sysret); - ktrwrite(vp, kth, NULL, funnel_type); + ktrwrite(vp, kth, NULL); FREE(kth, M_KTRACE); p->p_traceflag &= ~KTRFAC_ACTIVE; #else @@ -184,22 +197,25 @@ ktrnamei(vp, path) p->p_traceflag |= KTRFAC_ACTIVE; kth = ktrgetheader(KTR_NAMEI); + if (kth == NULL) { + p->p_traceflag &= ~KTRFAC_ACTIVE; + return; + } kth->ktr_len = strlen(path); kth->ktr_buf = path; - ktrwrite(vp, kth, NULL, KERNEL_FUNNEL); + ktrwrite(vp, kth, NULL); FREE(kth, M_KTRACE); p->p_traceflag &= ~KTRFAC_ACTIVE; } void -ktrgenio(vp, fd, rw, uio, error, funnel_type) +ktrgenio(vp, fd, rw, uio, error) struct vnode *vp; int fd; enum uio_rw rw; struct uio *uio; int error; - int funnel_type; { struct ktr_header *kth; struct ktr_genio ktg; @@ -210,6 +226,10 @@ ktrgenio(vp, fd, rw, uio, error, funnel_type) p->p_traceflag |= KTRFAC_ACTIVE; kth = ktrgetheader(KTR_GENIO); + if (kth == NULL) { + p->p_traceflag &= ~KTRFAC_ACTIVE; + return; + } ktg.ktr_fd = fd; ktg.ktr_rw = rw; kth->ktr_buf = (caddr_t)&ktg; @@ -217,19 +237,18 @@ ktrgenio(vp, fd, rw, uio, error, funnel_type) uio->uio_offset = 0; uio->uio_rw = UIO_WRITE; - ktrwrite(vp, kth, uio, funnel_type); + ktrwrite(vp, kth, uio); FREE(kth, M_KTRACE); p->p_traceflag &= ~KTRFAC_ACTIVE; } void -ktrpsig(vp, sig, action, mask, code, funnel_type) +ktrpsig(vp, sig, action, mask, code) struct vnode *vp; int sig; sig_t action; sigset_t *mask; int code; - int funnel_type; { struct ktr_header *kth; struct ktr_psig kp; @@ -237,6 +256,10 @@ ktrpsig(vp, sig, action, mask, code, funnel_type) p->p_traceflag |= KTRFAC_ACTIVE; kth = ktrgetheader(KTR_PSIG); + if (kth == NULL) { + p->p_traceflag &= ~KTRFAC_ACTIVE; + return; + } kp.signo = (char)sig; kp.action = action; kp.mask = *mask; @@ -244,16 +267,15 @@ ktrpsig(vp, sig, action, mask, code, funnel_type) kth->ktr_buf = (caddr_t)&kp; kth->ktr_len = sizeof (struct ktr_psig); - ktrwrite(vp, kth, NULL, funnel_type); + ktrwrite(vp, kth, NULL); FREE(kth, M_KTRACE); p->p_traceflag &= ~KTRFAC_ACTIVE; } void -ktrcsw(vp, out, user, funnel_type) +ktrcsw(vp, out, user) struct vnode *vp; int out, user; - int funnel_type; { struct ktr_header *kth; struct ktr_csw kc; @@ -261,12 +283,16 @@ ktrcsw(vp, out, user, funnel_type) p->p_traceflag |= KTRFAC_ACTIVE; kth = ktrgetheader(KTR_CSW); + if (kth == NULL) { + p->p_traceflag &= ~KTRFAC_ACTIVE; + return; + } kc.out = out; kc.user = user; kth->ktr_buf = (caddr_t)&kc; kth->ktr_len = sizeof (struct ktr_csw); - ktrwrite(vp, kth, NULL, funnel_type); + ktrwrite(vp, kth, NULL); FREE(kth, M_KTRACE); p->p_traceflag &= ~KTRFAC_ACTIVE; } @@ -277,18 +303,9 @@ ktrcsw(vp, out, user, funnel_type) /* * ktrace system call */ -struct ktrace_args { - char *fname; - int ops; - int facs; - int pid; -}; /* ARGSUSED */ int -ktrace(curp, uap, retval) - struct proc *curp; - register struct ktrace_args *uap; - register_t *retval; +ktrace(struct proc *curp, register struct ktrace_args *uap, __unused register_t *retval) { #if KTRACE register struct vnode *vp = NULL; @@ -300,25 +317,33 @@ ktrace(curp, uap, retval) int ret = 0; int error = 0; struct nameidata nd; + struct vfs_context context; AUDIT_ARG(cmd, uap->ops); AUDIT_ARG(pid, uap->pid); AUDIT_ARG(value, uap->facs); + + context.vc_proc = curp; + context.vc_ucred = kauth_cred_get(); + curp->p_traceflag |= KTRFAC_ACTIVE; if (ops != KTROP_CLEAR) { /* * an operation which requires a file argument. 
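 *
 * (User-space sketch, with a hypothetical trace file and pid:
 *
 *	ktrace("/tmp/out.ktrace", KTROP_SET, KTRFAC_SYSCALL, pid);
 *
 * names the file opened below and enables syscall tracing for pid.)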
*/ - NDINIT(&nd, LOOKUP, (NOFOLLOW|LOCKLEAF), UIO_USERSPACE, uap->fname, curp); + NDINIT(&nd, LOOKUP, (NOFOLLOW|LOCKLEAF), UIO_USERSPACE, + uap->fname, &context); error = vn_open(&nd, FREAD|FWRITE|O_NOFOLLOW, 0); if (error) { curp->p_traceflag &= ~KTRFAC_ACTIVE; return (error); } vp = nd.ni_vp; - VOP_UNLOCK(vp, 0, curp); + if (vp->v_type != VREG) { - (void) vn_close(vp, FREAD|FWRITE, curp->p_ucred, curp); + (void) vn_close(vp, FREAD|FWRITE, kauth_cred_get(), curp); + (void) vnode_put(vp); + curp->p_traceflag &= ~KTRFAC_ACTIVE; return (EACCES); } @@ -335,10 +360,7 @@ ktrace(curp, uap, retval) p->p_traceflag = 0; if (tvp != NULL) { p->p_tracep = NULL; - - VOP_CLOSE(vp, FREAD|FWRITE, curp->p_ucred, curp); - ubc_rele(tvp); - vrele(tvp); + vnode_rele(tvp); } } else error = EPERM; @@ -390,8 +412,10 @@ ktrace(curp, uap, retval) if (!ret) error = EPERM; done: - if (vp != NULL) - (void) vn_close(vp, FWRITE, curp->p_ucred, curp); + if (vp != NULL) { + (void) vn_close(vp, FWRITE, kauth_cred_get(), curp); + (void) vnode_put(vp); + } curp->p_traceflag &= ~KTRFAC_ACTIVE; return (error); #else @@ -402,17 +426,10 @@ done: /* * utrace system call */ -struct utrace_args { - const void * addr; - size_t len; -}; /* ARGSUSED */ int -utrace(curp, uap, retval) - struct proc *curp; - register struct utrace_args *uap; - register_t *retval; +utrace(__unused struct proc *curp, register struct utrace_args *uap, __unused register_t *retval) { #if KTRACE struct ktr_header *kth; @@ -425,11 +442,19 @@ utrace(curp, uap, retval) return (EINVAL); p->p_traceflag |= KTRFAC_ACTIVE; kth = ktrgetheader(KTR_USER); + if (kth == NULL) { + p->p_traceflag &= ~KTRFAC_ACTIVE; + return(ENOMEM); + } MALLOC(cp, caddr_t, uap->len, M_KTRACE, M_WAITOK); - if (!copyin((caddr_t)uap->addr, cp, uap->len)) { + if (cp == NULL) { + FREE(kth, M_KTRACE); + return(ENOMEM); + } + if (copyin(uap->addr, cp, uap->len) == 0) { kth->ktr_buf = cp; kth->ktr_len = uap->len; - ktrwrite(p->p_tracep, kth, NULL, KERNEL_FUNNEL); + ktrwrite(p->p_tracep, kth, NULL); } FREE(kth, M_KTRACE); FREE(cp, M_KTRACE); @@ -454,24 +479,19 @@ ktrops(curp, p, ops, facs, vp) return (0); if (ops == KTROP_SET) { if (p->p_tracep != vp) { - /* - * if trace file already in use, relinquish - */ tvp = p->p_tracep; - - if (UBCINFOEXISTS(vp)) - ubc_hold(vp); - VREF(vp); - + vnode_ref(vp); p->p_tracep = vp; + if (tvp != NULL) { - VOP_CLOSE(tvp, FREAD|FWRITE, p->p_ucred, p); - ubc_rele(tvp); - vrele(tvp); + /* + * if trace file already in use, relinquish + */ + vnode_rele(tvp); } } p->p_traceflag |= facs; - if (curp->p_ucred->cr_uid == 0) + if (!suser(kauth_cred_get(), NULL)) p->p_traceflag |= KTRFAC_ROOT; } else { /* KTROP_CLEAR */ @@ -481,10 +501,7 @@ ktrops(curp, p, ops, facs, vp) p->p_traceflag = 0; if (tvp != NULL) { p->p_tracep = NULL; - - VOP_CLOSE(tvp, FREAD|FWRITE, p->p_ucred, p); - ubc_rele(tvp); - vrele(tvp); + vnode_rele(tvp); } } } @@ -525,118 +542,49 @@ ktrsetchildren(curp, top, ops, facs, vp) } static void -ktrwrite(vp, kth, uio, funnel_type) - struct vnode *vp; - register struct ktr_header *kth; - struct uio *uio; +ktrwrite(struct vnode *vp, struct ktr_header *kth, struct uio *uio) { - struct uio auio; - struct iovec aiov[2]; + uio_t auio; register struct proc *p = current_proc(); /* XXX */ + struct vfs_context context; int error; + char uio_buf[ UIO_SIZEOF(2) ]; if (vp == NULL) return; - if (funnel_type == -1) { - funnel_t *f = thread_funnel_get(); - if(f == THR_FUNNEL_NULL) - funnel_type = NO_FUNNEL; - else if (f == (funnel_t *)network_flock) - funnel_type = NETWORK_FUNNEL; 
- else if (f == (funnel_t *)kernel_flock) - funnel_type = KERNEL_FUNNEL; - } - - switch (funnel_type) { - case KERNEL_FUNNEL: - /* Nothing more to do */ - break; - case NETWORK_FUNNEL: - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - break; - case NO_FUNNEL: - (void) thread_funnel_set(kernel_flock, TRUE); - break; - default: - panic("Invalid funnel (%)", funnel_type); - } - auio.uio_iov = &aiov[0]; - auio.uio_offset = 0; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_WRITE; - aiov[0].iov_base = (caddr_t)kth; - aiov[0].iov_len = sizeof(struct ktr_header); - auio.uio_resid = sizeof(struct ktr_header); - auio.uio_iovcnt = 1; - auio.uio_procp = current_proc(); + auio = uio_createwithbuffer(2, 0, UIO_SYSSPACE, UIO_WRITE, + &uio_buf[0], sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(kth), sizeof(struct ktr_header)); + context.vc_proc = p; + context.vc_ucred = kauth_cred_get(); + if (kth->ktr_len > 0) { - auio.uio_iovcnt++; - aiov[1].iov_base = kth->ktr_buf; - aiov[1].iov_len = kth->ktr_len; - auio.uio_resid += kth->ktr_len; + uio_addiov(auio, CAST_USER_ADDR_T(kth->ktr_buf), kth->ktr_len); if (uio != NULL) - kth->ktr_len += uio->uio_resid; + kth->ktr_len += uio_resid(uio); } - error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - if (error) - goto bad; - (void)VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); - error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, p->p_ucred); - if (error == 0 && uio != NULL) { - (void)VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); - error = VOP_WRITE(vp, uio, IO_UNIT | IO_APPEND, p->p_ucred); - } - VOP_UNLOCK(vp, 0, p); - if (!error) { - switch (funnel_type) { - case KERNEL_FUNNEL: - /* Nothing more to do */ - break; - case NETWORK_FUNNEL: - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - /* switch funnel to NETWORK_FUNNEL */ - break; - case NO_FUNNEL: - (void) thread_funnel_set(kernel_flock, FALSE); - break; - default: - panic("Invalid funnel (%)", funnel_type); + if ((error = vnode_getwithref(vp)) == 0) { + error = VNOP_WRITE(vp, auio, IO_UNIT | IO_APPEND, &context); + if (error == 0 && uio != NULL) { + error = VNOP_WRITE(vp, uio, IO_UNIT | IO_APPEND, &context); } - return; + vnode_put(vp); } - -bad: - /* - * If error encountered, give up tracing on this vnode. - */ - log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n", - error); - LIST_FOREACH(p, &allproc, p_list) { - if (p->p_tracep == vp) { - p->p_tracep = NULL; - p->p_traceflag = 0; - - VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p); - ubc_rele(vp); - vrele(vp); + if (error) { + /* + * If error encountered, give up tracing on this vnode. + */ + log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n", + error); + LIST_FOREACH(p, &allproc, p_list) { + if (p->p_tracep == vp) { + p->p_tracep = NULL; + p->p_traceflag = 0; + vnode_rele(vp); + } } } - - switch (funnel_type) { - case KERNEL_FUNNEL: - /* Nothing more to do */ - break; - case NETWORK_FUNNEL: - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - /* switch funnel to NETWORK_FUNNEL */ - break; - case NO_FUNNEL: - (void) thread_funnel_set(kernel_flock, FALSE); - break; - default: - panic("Invalid funnel (%)", funnel_type); - } } /* @@ -649,21 +597,23 @@ bad: * TODO: check groups. use caller effective gid. 
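
The rewritten ktrwrite() above is the clearest example of the new uio API: instead of filling in a struct uio and an iovec array by hand, it creates a uio_t backed by a stack buffer sized with UIO_SIZEOF() and appends up to two iovecs. In isolation the pattern looks like this (a sketch reusing the names from the hunk above):

    char uio_buf[ UIO_SIZEOF(2) ];      /* stack room for two iovecs */
    uio_t auio;
    int error;

    auio = uio_createwithbuffer(2, 0, UIO_SYSSPACE, UIO_WRITE,
                                &uio_buf[0], sizeof(uio_buf));
    uio_addiov(auio, CAST_USER_ADDR_T(kth), sizeof(struct ktr_header));
    if (kth->ktr_len > 0)
        uio_addiov(auio, CAST_USER_ADDR_T(kth->ktr_buf), kth->ktr_len);

    if ((error = vnode_getwithref(vp)) == 0) {
        error = VNOP_WRITE(vp, auio, IO_UNIT | IO_APPEND, &context);
        vnode_put(vp);
    }

The funnel bookkeeping that bracketed the old implementation disappears entirely; serialization is now the vnode layer's concern.
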
*/ static int -ktrcanset(callp, targetp) - struct proc *callp, *targetp; +ktrcanset(__unused struct proc *callp, struct proc *targetp) { - register struct pcred *caller = callp->p_cred; - register struct pcred *target = targetp->p_cred; + kauth_cred_t caller = kauth_cred_get(); + kauth_cred_t target = targetp->p_ucred; /* XXX */ +#if 0 + /* PRISON_CHECK was defined to 1 always .... */ if (!PRISON_CHECK(callp, targetp)) return (0); - if ((caller->pc_ucred->cr_uid == target->p_ruid && - target->p_ruid == target->p_svuid && - caller->p_rgid == target->p_rgid && /* XXX */ - target->p_rgid == target->p_svgid && +#endif + if ((kauth_cred_getuid(caller) == target->cr_ruid && + target->cr_ruid == target->cr_svuid && + caller->cr_rgid == target->cr_rgid && /* XXX */ + target->cr_rgid == target->cr_svgid && (targetp->p_traceflag & KTRFAC_ROOT) == 0 && (targetp->p_flag & P_SUGID) == 0) || - caller->pc_ucred->cr_uid == 0) + !suser(caller, NULL)) return (1); return (0); diff --git a/bsd/kern/kern_lock.c b/bsd/kern/kern_lock.c index 898924500..c69140fda 100644 --- a/bsd/kern/kern_lock.c +++ b/bsd/kern/kern_lock.c @@ -60,7 +60,7 @@ */ #include <sys/param.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/lock.h> #include <kern/cpu_number.h> #include <kern/thread.h> @@ -91,11 +91,9 @@ int lock_wait_time = 100; if (lock_wait_time > 0) { \ int i; \ \ - simple_unlock(&lkp->lk_interlock); \ for (i = lock_wait_time; i > 0; i--) \ if (!(wanted)) \ break; \ - simple_lock(&lkp->lk_interlock); \ } \ if (!(wanted)) \ break; @@ -117,10 +115,8 @@ int lock_wait_time = 100; PAUSE(lkp, wanted); \ for (error = 0; wanted; ) { \ (lkp)->lk_waitcount++; \ - simple_unlock(&(lkp)->lk_interlock); \ error = tsleep((void *)lkp, (lkp)->lk_prio, \ (lkp)->lk_wmesg, (lkp)->lk_timo); \ - simple_lock(&(lkp)->lk_interlock); \ (lkp)->lk_waitcount--; \ if (error) \ break; \ @@ -137,13 +133,12 @@ void lockinit(lkp, prio, wmesg, timo, flags) struct lock__bsd__ *lkp; int prio; - char *wmesg; + const char *wmesg; int timo; int flags; { bzero(lkp, sizeof(struct lock__bsd__)); - simple_lock_init(&lkp->lk_interlock); lkp->lk_flags = flags & LK_EXTFLG_MASK; lkp->lk_prio = prio; lkp->lk_timo = timo; @@ -161,12 +156,10 @@ lockstatus(lkp) { int lock_type = 0; - simple_lock(&lkp->lk_interlock); if (lkp->lk_exclusivecount != 0) lock_type = LK_EXCLUSIVE; else if (lkp->lk_sharecount != 0) lock_type = LK_SHARED; - simple_unlock(&lkp->lk_interlock); return (lock_type); } @@ -181,7 +174,7 @@ int lockmgr(lkp, flags, interlkp, p) struct lock__bsd__ *lkp; u_int flags; - simple_lock_t interlkp; + void * interlkp; struct proc *p; { int error; @@ -189,14 +182,11 @@ lockmgr(lkp, flags, interlkp, p) int extflags; void *self; - error = 0; self = current_act(); + error = 0; self = current_thread(); if (p) pid = p->p_pid; else pid = LK_KERNPROC; - simple_lock(&lkp->lk_interlock); - if (flags & LK_INTERLOCK) - simple_unlock(interlkp); extflags = (flags | lkp->lk_flags) & LK_EXTFLG_MASK; #if 0 /* @@ -429,13 +419,11 @@ lockmgr(lkp, flags, interlkp, p) (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE)) || lkp->lk_sharecount != 0 || lkp->lk_waitcount != 0); ) { lkp->lk_flags |= LK_WAITDRAIN; - simple_unlock(&lkp->lk_interlock); if (error = tsleep((void *)&lkp->lk_flags, lkp->lk_prio, lkp->lk_wmesg, lkp->lk_timo)) return (error); if ((extflags) & LK_SLEEPFAIL) return (ENOLCK); - simple_lock(&lkp->lk_interlock); } lkp->lk_flags |= LK_DRAINING | LK_HAVE_EXCL; lkp->lk_lockholder = pid; @@ -445,7 +433,6 @@ lockmgr(lkp, flags, interlkp, p) break; default: - 
simple_unlock(&lkp->lk_interlock); panic("lockmgr: unknown locktype request %d", flags & LK_TYPE_MASK); /* NOTREACHED */ @@ -456,7 +443,6 @@ lockmgr(lkp, flags, interlkp, p) lkp->lk_flags &= ~LK_WAITDRAIN; wakeup((void *)&lkp->lk_flags); } - simple_unlock(&lkp->lk_interlock); return (error); } @@ -464,6 +450,7 @@ lockmgr(lkp, flags, interlkp, p) * Print out information about state of a lock. Used by VOP_PRINT * routines to display ststus about contained locks. */ +void lockmgr_printinfo(lkp) struct lock__bsd__ *lkp; { diff --git a/bsd/ufs/ufs/ufs_lockf.c b/bsd/kern/kern_lockf.c similarity index 61% rename from bsd/ufs/ufs/ufs_lockf.c rename to bsd/kern/kern_lockf.c index 4f4a71933..1ef3470ce 100644 --- a/bsd/ufs/ufs/ufs_lockf.c +++ b/bsd/kern/kern_lockf.c @@ -1,25 +1,3 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. @@ -35,10 +13,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. @@ -55,53 +29,229 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)ufs_lockf.c 8.4 (Berkeley) 10/26/94 + * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94 */ +#include <sys/cdefs.h> #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> -#include <sys/file.h> +#include <sys/lock.h> +#include <sys/mount.h> #include <sys/proc.h> +#include <sys/unistd.h> #include <sys/vnode.h> +#include <sys/vnode_internal.h> +#include <sys/vnode_if.h> #include <sys/malloc.h> #include <sys/fcntl.h> -#include <sys/quota.h> - -#include <ufs/ufs/lockf.h> -#include <ufs/ufs/quota.h> -#include <ufs/ufs/inode.h> -#include <ufs/ufs/ufs_extern.h> +#include <sys/lockf.h> +#if DEAD_CODE /* * This variable controls the maximum number of processes that will * be checked in doing deadlock detection. 
*/ -int maxlockdepth = MAXDEPTH; +static int maxlockdepth = MAXDEPTH; +#endif /* DEAD_CODE */ #ifdef LOCKF_DEBUG -#include <vm/vm.h> #include <sys/sysctl.h> -int lockf_debug = 0; -struct ctldebug debug4 = { "lockf_debug", &lockf_debug }; + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> + + +static int lockf_debug = 2; +SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, ""); #endif +MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures"); + #define NOLOCKF (struct lockf *)0 #define SELF 0x1 #define OTHERS 0x2 +#define OFF_MAX 0x7fffffffffffffffULL /* max off_t */ +static int lf_clearlock(struct lockf *); +static int lf_findoverlap(struct lockf *, + struct lockf *, int, struct lockf ***, struct lockf **); +static struct lockf * + lf_getblock(struct lockf *); +static int lf_getlock(struct lockf *, struct flock *); +static int lf_setlock(struct lockf *); +static void lf_split(struct lockf *, struct lockf *); +static void lf_wakelock(struct lockf *); /* - * Set a byte-range lock. + * Advisory record locking support */ int +lf_advlock(ap) + struct vnop_advlock_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + vfs_context_t a_context; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct flock *fl = ap->a_fl; + vfs_context_t context = ap->a_context; + struct lockf *lock; + off_t start, end, oadd; + u_quad_t size; + int error; + struct lockf **head = &vp->v_lockf; + + /* XXX HFS may need a !vnode_isreg(vp) EISDIR error here */ + + /* + * Avoid the common case of unlocking when inode has no locks. + */ + if (*head == (struct lockf *)0) { + if (ap->a_op != F_SETLK) { + fl->l_type = F_UNLCK; +#ifdef LOCKF_DEBUG + printf("lf_advlock: unlock without lock\n"); +#endif /* LOCKF_DEBUG */ + return (0); + } + } + + /* + * Convert the flock structure into a start and end. + */ + switch (fl->l_whence) { + + case SEEK_SET: + case SEEK_CUR: + /* + * Caller is responsible for adding any necessary offset + * when SEEK_CUR is used. 
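
lf_advlock() first normalizes the user's struct flock into an absolute byte range [start, end] (end == -1 meaning end-of-file), checking each step against OFF_MAX, as the code just below shows. A worked example of the arithmetic for a hypothetical SEEK_SET request locking 100 bytes at offset 4096:

    /* fl->l_whence == SEEK_SET, fl->l_start == 4096, fl->l_len == 100 */
    off_t start = fl->l_start;            /* 4096 */
    off_t oadd  = fl->l_len - 1;          /* 99 */
    off_t end;

    if (oadd > (off_t)(OFF_MAX - start))  /* would start + oadd overflow? */
        return (EOVERFLOW);
    end = start + oadd;                   /* 4195: last byte the lock covers */

    /* fl->l_len == 0 would instead mean "lock to end of file": end = -1 */
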
+ */ + start = fl->l_start; + break; + + case SEEK_END: + + if ((error = vnode_size(vp, &size, context))) +{ +#ifdef LOCKF_DEBUG + printf("lf_advlock: vnode_getattr failed: %d\n", error); +#endif /* LOCKF_DEBUG */ + return (error); +} + + if (size > OFF_MAX || + (fl->l_start > 0 && size > OFF_MAX - fl->l_start)) + return (EOVERFLOW); + start = size + fl->l_start; + break; + + default: +#ifdef LOCKF_DEBUG + printf("lf_advlock: unknown whence %d\n", fl->l_whence); +#endif /* LOCKF_DEBUG */ + return (EINVAL); + } + if (start < 0) +{ +#ifdef LOCKF_DEBUG + printf("lf_advlock: start < 0 (%qd)\n", start); +#endif /* LOCKF_DEBUG */ + return (EINVAL); +} + if (fl->l_len < 0) { + if (start == 0) +{ +#ifdef LOCKF_DEBUG + printf("lf_advlock: len < 0 & start == 0\n"); +#endif /* LOCKF_DEBUG */ + return (EINVAL); +} + end = start - 1; + start += fl->l_len; + if (start < 0) +{ +#ifdef LOCKF_DEBUG + printf("lf_advlock: start < 0 (%qd)\n", start); +#endif /* LOCKF_DEBUG */ + return (EINVAL); +} + } else if (fl->l_len == 0) + end = -1; + else { + oadd = fl->l_len - 1; + if (oadd > (off_t)(OFF_MAX - start)) +{ +#ifdef LOCKF_DEBUG + printf("lf_advlock: overflow\n"); +#endif /* LOCKF_DEBUG */ + return (EOVERFLOW); +} + end = start + oadd; + } + /* + * Create the lockf structure + */ + MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); + lock->lf_start = start; + lock->lf_end = end; + lock->lf_id = ap->a_id; + lock->lf_vnode = vp; + lock->lf_type = fl->l_type; + lock->lf_head = head; + lock->lf_next = (struct lockf *)0; + TAILQ_INIT(&lock->lf_blkhd); + lock->lf_flags = ap->a_flags; + + lck_mtx_lock(&vp->v_lock); /* protect the lockf list */ + /* + * Do the requested operation. + */ + switch(ap->a_op) { + case F_SETLK: + error = lf_setlock(lock); + break; + + case F_UNLCK: + error = lf_clearlock(lock); + FREE(lock, M_LOCKF); + break; + + case F_GETLK: + error = lf_getlock(lock, fl); + FREE(lock, M_LOCKF); + break; + + default: + FREE(lock, M_LOCKF); + error = EINVAL; + break; + } + lck_mtx_unlock(&vp->v_lock); /* done maniplulating the list */ + +#ifdef LOCKF_DEBUG + printf("lf_advlock: normal exit: %d\n", error); +#endif /* LOCKF_DEBUG */ + return (error); +} + +/* + * Set a byte-range lock. + */ +static int lf_setlock(lock) - register struct lockf *lock; + struct lockf *lock; { - register struct lockf *block; - struct inode *ip = lock->lf_inode; + struct lockf *block; + struct lockf **head = lock->lf_head; struct lockf **prev, *overlap, *ltmp; static char lockstr[] = "lockf"; int ovcase, priority, needtolink, error; + struct vnode *vp = lock->lf_vnode; #ifdef LOCKF_DEBUG if (lockf_debug & 1) @@ -118,7 +268,7 @@ lf_setlock(lock) /* * Scan lock list for this file looking for locks that would block us. */ - while (block = lf_getblock(lock)) { + while ((block = lf_getblock(lock))) { /* * Free the structure and return if nonblocking. */ @@ -126,6 +276,10 @@ lf_setlock(lock) FREE(lock, M_LOCKF); return (EAGAIN); } +#if DEAD_CODE +/* + * XXX This is dead code on MacOS X; it shouldn't be. + */ /* * We are blocked. Since flock style locks cover * the whole file, there is no chance for deadlock. 
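
Note the locking model introduced above: the per-vnode advisory lock list (vp->v_lockf) is serialized by the vnode's own mutex rather than by funnel-wide exclusion, so every list manipulation is bracketed like this (taken from lf_advlock() above):

    lck_mtx_lock(&vp->v_lock);        /* protect the lockf list */
    switch (ap->a_op) {
    case F_SETLK:
        error = lf_setlock(lock);     /* may sleep; see msleep() below */
        break;
    case F_UNLCK:
        error = lf_clearlock(lock);
        FREE(lock, M_LOCKF);
        break;
    case F_GETLK:
        error = lf_getlock(lock, fl);
        FREE(lock, M_LOCKF);
        break;
    default:
        FREE(lock, M_LOCKF);
        error = EINVAL;
        break;
    }
    lck_mtx_unlock(&vp->v_lock);      /* done manipulating the list */
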
@@ -138,27 +292,35 @@ lf_setlock(lock) */ if ((lock->lf_flags & F_POSIX) && (block->lf_flags & F_POSIX)) { - register struct proc *wproc; - register struct lockf *waitblock; + struct proc *wproc; + struct thread *td; + struct lockf *waitblock; int i = 0; /* The block is waiting on something */ + /* XXXKSE this is not complete under threads */ wproc = (struct proc *)block->lf_id; - while (wproc->p_wchan && - (wproc->p_wmesg == lockstr) && - (i++ < maxlockdepth)) { - waitblock = (struct lockf *)wproc->p_wchan; - /* Get the owner of the blocking lock */ - waitblock = waitblock->lf_next; - if ((waitblock->lf_flags & F_POSIX) == 0) - break; - wproc = (struct proc *)waitblock->lf_id; - if (wproc == (struct proc *)lock->lf_id) { - _FREE(lock, M_LOCKF); - return (EDEADLK); + mtx_lock_spin(&sched_lock); + FOREACH_THREAD_IN_PROC(wproc, td) { + while (td->td_wchan && + (td->td_wmesg == lockstr) && + (i++ < maxlockdepth)) { + waitblock = (struct lockf *)td->td_wchan; + /* Get the owner of the blocking lock */ + waitblock = waitblock->lf_next; + if ((waitblock->lf_flags & F_POSIX) == 0) + break; + wproc = (struct proc *)waitblock->lf_id; + if (wproc == (struct proc *)lock->lf_id) { + mtx_unlock_spin(&sched_lock); + FREE(lock, M_LOCKF); + return (EDEADLK); + } } } + mtx_unlock_spin(&sched_lock); } +#endif /* DEAD_CODE */ /* * For flock type locks, we must first remove * any shared locks that we hold before we sleep @@ -182,21 +344,23 @@ lf_setlock(lock) lf_printlist("lf_setlock", block); } #endif /* LOCKF_DEBUG */ - if (error = tsleep((caddr_t)lock, priority, lockstr, 0)) { + error = msleep(lock, &vp->v_lock, priority, lockstr, 0); + if (error) { /* XXX */ /* - * We may have been awakened by a signal (in - * which case we must remove ourselves from the - * blocked list) and/or by another process - * releasing a lock (in which case we have already - * been removed from the blocked list and our + * We may have been awakened by a signal and/or by a + * debugger continuing us (in which cases we must remove + * ourselves from the blocked list) and/or by another + * process releasing a lock (in which case we have + * already been removed from the blocked list and our * lf_next field set to NOLOCKF). */ - if (lock->lf_next) - TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, - lf_block); - _FREE(lock, M_LOCKF); + if (lock->lf_next) { + TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block); + lock->lf_next = NOLOCKF; + } + FREE(lock, M_LOCKF); return (error); - } + } /* XXX */ } /* * No blocks!! Add the lock. Note that we will @@ -206,11 +370,12 @@ lf_setlock(lock) * Skip over locks owned by other processes. * Handle any locks that overlap and are owned by ourselves. */ - prev = &ip->i_lockf; - block = ip->i_lockf; + prev = head; + block = *head; needtolink = 1; for (;;) { - if (ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap)) + ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap); + if (ovcase) block = overlap->lf_next; /* * Six cases: @@ -247,7 +412,7 @@ lf_setlock(lock) * Check for common starting point and different types. 
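
The tsleep() in lf_setlock() becomes msleep() above; msleep() atomically drops the mutex passed to it (here the vnode's v_lock) for the duration of the sleep and retakes it before returning, which is what keeps the blocked-lock queue consistent. A sketch of the wait path, reconstructed around the hunk above:

    /* queue ourselves behind the blocking lock, then wait to be woken */
    lock->lf_next = block;
    TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);

    error = msleep(lock, &vp->v_lock, priority, lockstr, 0);
    if (error) {
        /* a signal or debugger woke us: unhook ourselves if still queued */
        if (lock->lf_next) {
            TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
            lock->lf_next = NOLOCKF;
        }
        FREE(lock, M_LOCKF);
        return (error);
    }
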
*/ if (overlap->lf_type == lock->lf_type) { - _FREE(lock, M_LOCKF); + FREE(lock, M_LOCKF); lock = overlap; /* for debug output below */ break; } @@ -269,11 +434,13 @@ lf_setlock(lock) overlap->lf_type == F_WRLCK) { lf_wakelock(overlap); } else { - while (ltmp = overlap->lf_blkhd.tqh_first) { + while (!TAILQ_EMPTY(&overlap->lf_blkhd)) { + ltmp = TAILQ_FIRST(&overlap->lf_blkhd); TAILQ_REMOVE(&overlap->lf_blkhd, ltmp, lf_block); TAILQ_INSERT_TAIL(&lock->lf_blkhd, ltmp, lf_block); + ltmp->lf_next = lock; } } /* @@ -286,7 +453,7 @@ lf_setlock(lock) needtolink = 0; } else *prev = overlap->lf_next; - _FREE(overlap, M_LOCKF); + FREE(overlap, M_LOCKF); continue; case 4: /* overlap starts before lock */ @@ -330,12 +497,12 @@ lf_setlock(lock) * Generally, find the lock (or an overlap to that lock) * and remove it (or shrink it), then wakeup anyone we can. */ -int +static int lf_clearlock(unlock) - register struct lockf *unlock; + struct lockf *unlock; { - struct inode *ip = unlock->lf_inode; - register struct lockf *lf = ip->i_lockf; + struct lockf **head = unlock->lf_head; + struct lockf *lf = *head; struct lockf *overlap, **prev; int ovcase; @@ -347,8 +514,8 @@ lf_clearlock(unlock) if (lockf_debug & 1) lf_print("lf_clearlock", unlock); #endif /* LOCKF_DEBUG */ - prev = &ip->i_lockf; - while (ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) { + prev = head; + while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap))) { /* * Wakeup the list of locks to be retried. */ @@ -373,7 +540,7 @@ lf_clearlock(unlock) case 3: /* lock contains overlap */ *prev = overlap->lf_next; lf = overlap->lf_next; - _FREE(overlap, M_LOCKF); + FREE(overlap, M_LOCKF); continue; case 4: /* overlap starts before lock */ @@ -399,19 +566,19 @@ lf_clearlock(unlock) * Check whether there is a blocking lock, * and if so return its process identifier. */ -int +static int lf_getlock(lock, fl) - register struct lockf *lock; - register struct flock *fl; + struct lockf *lock; + struct flock *fl; { - register struct lockf *block; + struct lockf *block; #ifdef LOCKF_DEBUG if (lockf_debug & 1) lf_print("lf_getlock", lock); #endif /* LOCKF_DEBUG */ - if (block = lf_getblock(lock)) { + if ((block = lf_getblock(lock))) { fl->l_type = block->lf_type; fl->l_whence = SEEK_SET; fl->l_start = block->lf_start; @@ -420,7 +587,7 @@ lf_getlock(lock, fl) else fl->l_len = block->lf_end - block->lf_start + 1; if (block->lf_flags & F_POSIX) - fl->l_pid = ((struct proc *)(block->lf_id))->p_pid; + fl->l_pid = proc_pid((struct proc *)(block->lf_id)); else fl->l_pid = -1; } else { @@ -433,15 +600,15 @@ lf_getlock(lock, fl) * Walk the list of locks for an inode and * return the first blocking lock. */ -struct lockf * +static struct lockf * lf_getblock(lock) - register struct lockf *lock; + struct lockf *lock; { - struct lockf **prev, *overlap, *lf = lock->lf_inode->i_lockf; + struct lockf **prev, *overlap, *lf = *(lock->lf_head); int ovcase; - prev = &lock->lf_inode->i_lockf; - while (ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap)) { + prev = lock->lf_head; + while ((ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap))) { /* * We've found an overlap, see if it blocks us */ @@ -457,15 +624,15 @@ lf_getblock(lock) } /* - * Walk the list of locks for an inode to + * Walk the list of locks to * find an overlapping lock (if any). * * NOTE: this returns only the FIRST overlapping lock. There * may be more than one. 
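
Several helpers in this file trade raw tqh_first/tqe_next pointer walking for the TAILQ macros. The drain idiom, as lf_wakelock() below ends up using it, removes the head on every iteration rather than iterating over a list that is being mutated:

    struct lockf *wakelock;

    while (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
        wakelock = TAILQ_FIRST(&listhead->lf_blkhd);
        TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
        wakelock->lf_next = NOLOCKF;   /* no longer waiting on anyone */
        wakeup(wakelock);              /* wakes the msleep() in lf_setlock() */
    }
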
*/ -int +static int lf_findoverlap(lf, lock, type, prev, overlap) - register struct lockf *lf; + struct lockf *lf; struct lockf *lock; int type; struct lockf ***prev; @@ -573,12 +740,12 @@ lf_findoverlap(lf, lock, type, prev, overlap) * Split a lock and a contained region into * two or three locks as necessary. */ -void +static void lf_split(lock1, lock2) - register struct lockf *lock1; - register struct lockf *lock2; + struct lockf *lock1; + struct lockf *lock2; { - register struct lockf *splitlock; + struct lockf *splitlock; #ifdef LOCKF_DEBUG if (lockf_debug & 2) { @@ -605,7 +772,7 @@ lf_split(lock1, lock2) * the encompassing lock */ MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK); - bcopy((caddr_t)lock1, (caddr_t)splitlock, sizeof *splitlock); + bcopy(lock1, splitlock, sizeof *splitlock); splitlock->lf_start = lock2->lf_end + 1; TAILQ_INIT(&splitlock->lf_blkhd); lock1->lf_end = lock2->lf_start - 1; @@ -620,20 +787,21 @@ lf_split(lock1, lock2) /* * Wakeup a blocklist */ -void +static void lf_wakelock(listhead) struct lockf *listhead; { - register struct lockf *wakelock; + struct lockf *wakelock; - while (wakelock = listhead->lf_blkhd.tqh_first) { + while (!TAILQ_EMPTY(&listhead->lf_blkhd)) { + wakelock = TAILQ_FIRST(&listhead->lf_blkhd); TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block); wakelock->lf_next = NOLOCKF; #ifdef LOCKF_DEBUG if (lockf_debug & 2) lf_print("lf_wakelock: awakening", wakelock); #endif /* LOCKF_DEBUG */ - wakeup((caddr_t)wakelock); + wakeup(wakelock); } } @@ -641,65 +809,74 @@ lf_wakelock(listhead) /* * Print out a lock. */ +void lf_print(tag, lock) char *tag; - register struct lockf *lock; + struct lockf *lock; { - - printf("%s: lock 0x%lx for ", tag, lock); + + printf("%s: lock %p for ", tag, (void *)lock); if (lock->lf_flags & F_POSIX) - printf("proc %d", ((struct proc *)(lock->lf_id))->p_pid); + printf("proc %ld", (long)((struct proc *)lock->lf_id)->p_pid); + else + printf("id %p", (void *)lock->lf_id); + if (lock->lf_vnode != 0) + printf(" in vno 0x%08x, %s, start %jd, end %jd", + lock->lf_vnode, + lock->lf_type == F_RDLCK ? "shared" : + lock->lf_type == F_WRLCK ? "exclusive" : + lock->lf_type == F_UNLCK ? "unlock" : "unknown", + (intmax_t)lock->lf_start, (intmax_t)lock->lf_end); else - printf("id 0x%x", lock->lf_id); - printf(" in ino %d on dev <%d, %d>, %s, start %d, end %d", - lock->lf_inode->i_number, - major(lock->lf_inode->i_dev), - minor(lock->lf_inode->i_dev), - lock->lf_type == F_RDLCK ? "shared" : - lock->lf_type == F_WRLCK ? "exclusive" : - lock->lf_type == F_UNLCK ? "unlock" : - "unknown", lock->lf_start, lock->lf_end); - if (lock->lf_blkhd.tqh_first) - printf(" block 0x%x\n", lock->lf_blkhd.tqh_first); + printf(" %s, start %jd, end %jd", + lock->lf_type == F_RDLCK ? "shared" : + lock->lf_type == F_WRLCK ? "exclusive" : + lock->lf_type == F_UNLCK ? 
"unlock" : "unknown", + (intmax_t)lock->lf_start, (intmax_t)lock->lf_end); + if (!TAILQ_EMPTY(&lock->lf_blkhd)) + printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd)); else printf("\n"); } +void lf_printlist(tag, lock) char *tag; struct lockf *lock; { - register struct lockf *lf, *blk; - - printf("%s: Lock list for ino %d on dev <%d, %d>:\n", - tag, lock->lf_inode->i_number, - major(lock->lf_inode->i_dev), - minor(lock->lf_inode->i_dev)); - for (lf = lock->lf_inode->i_lockf; lf; lf = lf->lf_next) { - printf("\tlock 0x%lx for ", lf); + struct lockf *lf, *blk; + + if (lock->lf_vnode == 0) + return; + + printf("%s: Lock list for vno 0x%08x:\n", + tag, lock->lf_vnode); + for (lf = lock->lf_vnode->v_lockf; lf; lf = lf->lf_next) { + printf("\tlock %p for ",(void *)lf); if (lf->lf_flags & F_POSIX) - printf("proc %d", ((struct proc *)(lf->lf_id))->p_pid); + printf("proc %ld", + (long)((struct proc *)lf->lf_id)->p_pid); else - printf("id 0x%x", lf->lf_id); - printf(", %s, start %d, end %d", - lf->lf_type == F_RDLCK ? "shared" : - lf->lf_type == F_WRLCK ? "exclusive" : - lf->lf_type == F_UNLCK ? "unlock" : - "unknown", lf->lf_start, lf->lf_end); - for (blk = lf->lf_blkhd.tqh_first; blk; - blk = blk->lf_block.tqe_next) { - printf("\n\t\tlock request 0x%lx for ", blk); + printf("id %p", (void *)lf->lf_id); + printf(", %s, start %jd, end %jd", + lf->lf_type == F_RDLCK ? "shared" : + lf->lf_type == F_WRLCK ? "exclusive" : + lf->lf_type == F_UNLCK ? "unlock" : + "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end); + TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) { + printf("\n\t\tlock request %p for ", (void *)blk); if (blk->lf_flags & F_POSIX) - printf("proc %d", - ((struct proc *)(blk->lf_id))->p_pid); + printf("proc %ld", + (long)((struct proc *)blk->lf_id)->p_pid); else - printf("id 0x%x", blk->lf_id); - printf(", %s, start %d, end %d", - blk->lf_type == F_RDLCK ? "shared" : - blk->lf_type == F_WRLCK ? "exclusive" : - blk->lf_type == F_UNLCK ? "unlock" : - "unknown", blk->lf_start, blk->lf_end); - if (blk->lf_blkhd.tqh_first) + printf("id %p", (void *)blk->lf_id); + printf(", %s, start %jd, end %jd", + blk->lf_type == F_RDLCK ? "shared" : + blk->lf_type == F_WRLCK ? "exclusive" : + blk->lf_type == F_UNLCK ? "unlock" : + "unknown", (intmax_t)blk->lf_start, + (intmax_t)blk->lf_end); + if (!TAILQ_EMPTY(&blk->lf_blkhd)) panic("lf_printlist: bad list"); } printf("\n"); diff --git a/bsd/kern/kern_malloc.c b/bsd/kern/kern_malloc.c index 3f1a92d27..5ae60405a 100644 --- a/bsd/kern/kern_malloc.c +++ b/bsd/kern/kern_malloc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -71,14 +71,16 @@ #include <sys/event.h> #include <sys/eventvar.h> -#include <sys/proc.h> -#include <sys/mount.h> -#include <sys/vnode.h> +#include <sys/proc_internal.h> +#include <sys/mount_internal.h> +#include <sys/vnode_internal.h> +#include <sys/ubc_internal.h> #include <sys/namei.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/filedesc.h> #include <sys/tty.h> #include <sys/quota.h> +#include <sys/uio_internal.h> #include <ufs/ufs/inode.h> @@ -92,7 +94,6 @@ #include <nfs/nfsproto.h> #include <nfs/nfsnode.h> #include <nfs/nfsmount.h> -#include <nfs/nqnfs.h> #include <vfs/vfs_journal.h> @@ -133,7 +134,7 @@ struct kmzones { SOS(ucred), KMZ_CREATEZONE, /* 16 M_CRED */ SOS(pgrp), KMZ_CREATEZONE, /* 17 M_PGRP */ SOS(session), KMZ_CREATEZONE, /* 18 M_SESSION */ - SOS(iovec), KMZ_LOOKUPZONE, /* 19 M_IOV */ + SOS(iovec_32), KMZ_LOOKUPZONE, /* 19 M_IOV32 */ SOS(mount), KMZ_CREATEZONE, /* 20 M_MOUNT */ 0, KMZ_MALLOC, /* 21 M_FHANDLE */ SOS(nfsreq), KMZ_CREATEZONE, /* 22 M_NFSREQ */ @@ -152,17 +153,17 @@ struct kmzones { 0, KMZ_MALLOC, /* 35 M_VMPVENT */ 0, KMZ_MALLOC, /* 36 M_VMPAGER */ 0, KMZ_MALLOC, /* 37 M_VMPGDATA */ - SOS(file), KMZ_CREATEZONE, /* 38 M_FILE */ + SOS(fileproc), KMZ_CREATEZONE, /* 38 M_FILEPROC */ SOS(filedesc), KMZ_CREATEZONE, /* 39 M_FILEDESC */ SOX(lockf), KMZ_CREATEZONE, /* 40 M_LOCKF */ SOS(proc), KMZ_CREATEZONE, /* 41 M_PROC */ - SOS(pcred), KMZ_CREATEZONE, /* 42 M_SUBPROC */ + SOS(pstats), KMZ_CREATEZONE, /* 42 M_SUBPROC */ 0, KMZ_MALLOC, /* 43 M_SEGMENT */ M_FFSNODE, KMZ_SHAREZONE, /* 44 M_LFSNODE */ SOS(inode), KMZ_CREATEZONE, /* 45 M_FFSNODE */ M_FFSNODE, KMZ_SHAREZONE, /* 46 M_MFSNODE */ - SOS(nqlease), KMZ_CREATEZONE, /* 47 M_NQLEASE */ - SOS(nqm), KMZ_CREATEZONE, /* 48 M_NQMHOST */ + 0, KMZ_MALLOC, /* 47 M_NQLEASE */ + 0, KMZ_MALLOC, /* 48 M_NQMHOST */ 0, KMZ_MALLOC, /* 49 M_NETADDR */ SOX(nfssvc_sock), KMZ_CREATEZONE, /* 50 M_NFSSVC */ @@ -215,6 +216,14 @@ struct kmzones { SOS(transaction), KMZ_CREATEZONE, /* 92 M_JNL_TR */ SOS(specinfo), KMZ_CREATEZONE, /* 93 M_SPECINFO */ SOS(kqueue), KMZ_CREATEZONE, /* 94 M_KQUEUE */ + SOS(directoryhint), KMZ_CREATEZONE, /* 95 M_HFSDIRHINT */ + SOS(cl_readahead), KMZ_CREATEZONE, /* 96 M_CLRDAHEAD */ + SOS(cl_writebehind),KMZ_CREATEZONE, /* 97 M_CLWRBEHIND */ + SOS(iovec_64), KMZ_LOOKUPZONE, /* 98 M_IOV64 */ + SOS(fileglob), KMZ_CREATEZONE, /* 99 M_FILEGLOB */ + 0, KMZ_MALLOC, /* 100 M_KAUTH */ + 0, KMZ_MALLOC, /* 101 M_DUMMYNET */ + SOS(unsafe_fsnode),KMZ_CREATEZONE, /* 102 M_UNSAFEFS */ #undef SOS #undef SOX }; @@ -283,7 +292,8 @@ struct _mhead { #define ZEROSIZETOKEN (void *)0xFADEDFAD -void *_MALLOC( +void * +_MALLOC( size_t size, int type, int flags) @@ -317,7 +327,8 @@ void *_MALLOC( return (mem->hdr.dat); } -void _FREE( +void +_FREE( void *addr, int type) { @@ -332,10 +343,11 @@ void _FREE( return; /* correct (convenient bsd kernel legacy) */ hdr = addr; hdr--; - kfree((vm_offset_t)hdr, hdr->mlen); + kfree(hdr, hdr->mlen); } -void *_MALLOC_ZONE( +void * +_MALLOC_ZONE( size_t size, int type, int flags) @@ -348,7 +360,7 @@ void *_MALLOC_ZONE( kmz = &kmzones[type]; if (kmz->kz_zalloczone == KMZ_MALLOC) - panic("_malloc_zone ZONE"); + panic("_malloc_zone ZONE: type = %d", type); /* XXX */ if (kmz->kz_elemsize == -1) @@ -370,7 +382,8 @@ void *_MALLOC_ZONE( return (elem); } -void _FREE_ZONE( +void +_FREE_ZONE( void *elem, size_t size, int type) @@ -389,7 +402,7 @@ void _FREE_ZONE( panic("FREE_SIZE XXX"); /* XXX */ if (size == kmz->kz_elemsize) - 
zfree(kmz->kz_zalloczone, (vm_offset_t)elem); + zfree(kmz->kz_zalloczone, elem); else - kfree((vm_offset_t)elem, size); + kfree(elem, size); } diff --git a/bsd/kern/kern_mib.c b/bsd/kern/kern_mib.c index 35b5ee7bf..11d967a42 100644 --- a/bsd/kern/kern_mib.c +++ b/bsd/kern/kern_mib.c @@ -64,7 +64,7 @@ #include <sys/kernel.h> #include <sys/systm.h> #include <sys/sysctl.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/unistd.h> #if defined(SMP) @@ -76,10 +76,9 @@ #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/proc.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/vnode.h> #include <sys/unistd.h> -#include <sys/buf.h> #include <sys/ioctl.h> #include <sys/namei.h> #include <sys/tty.h> @@ -96,7 +95,7 @@ extern vm_map_t bsd_pageable_map; -#include <sys/mount.h> +#include <sys/mount_internal.h> #include <sys/kdebug.h> #include <IOKit/IOPlatformExpert.h> @@ -105,6 +104,7 @@ extern vm_map_t bsd_pageable_map; #include <machine/machine_routines.h> #include <machine/cpu_capabilities.h> +static int cputype, cpusubtype, cputhreadtype; SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0, "Sysctl internal magic"); @@ -132,6 +132,14 @@ SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW, 0, */ #define CTLHW_RETQUAD (1 << 31) +#define CTLHW_LOCAL (1 << 30) + +#define HW_LOCAL_CPUTHREADTYPE (1 | CTLHW_LOCAL) +#define HW_LOCAL_PHYSICALCPU (2 | CTLHW_LOCAL) +#define HW_LOCAL_PHYSICALCPUMAX (3 | CTLHW_LOCAL) +#define HW_LOCAL_LOGICALCPU (4 | CTLHW_LOCAL) +#define HW_LOCAL_LOGICALCPUMAX (5 | CTLHW_LOCAL) + /* * Supporting some variables requires us to do "real" work. We @@ -146,6 +154,9 @@ sysctl_hw_generic SYSCTL_HANDLER_ARGS ml_cpu_info_t cpu_info; int val, doquad; long long qval; + host_basic_info_data_t hinfo; + kern_return_t kret; + int count = HOST_BASIC_INFO_COUNT; /* * Test and mask off the 'return quad' flag. @@ -156,6 +167,9 @@ sysctl_hw_generic SYSCTL_HANDLER_ARGS ml_cpu_get_info(&cpu_info); +#define BSD_HOST 1 + kret = host_info(BSD_HOST, HOST_BASIC_INFO, &hinfo, &count); + /* * Handle various OIDs. 
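
sysctl_hw_generic() now issues the host_info() query once, up front, instead of duplicating the call (and its local variables) inside every case arm; HW_NCPU, HW_AVAILCPU, and the new HW_LOCAL_* cases all read fields from the same host_basic_info_data_t. The query pattern, as used above:

    host_basic_info_data_t hinfo;
    kern_return_t kret;
    int count = HOST_BASIC_INFO_COUNT;
#define BSD_HOST 1

    kret = host_info(BSD_HOST, HOST_BASIC_INFO, &hinfo, &count);
    if (kret == KERN_SUCCESS)
        return (SYSCTL_RETURN(req, hinfo.max_cpus));   /* the HW_NCPU arm */
    return (EINVAL);
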
* @@ -164,32 +178,40 @@ sysctl_hw_generic SYSCTL_HANDLER_ARGS */ switch (arg2) { case HW_NCPU: - { - host_basic_info_data_t hinfo; - kern_return_t kret; - int count = HOST_BASIC_INFO_COUNT; -#define BSD_HOST 1 - - kret = host_info(BSD_HOST, HOST_BASIC_INFO, &hinfo, &count); - if (kret == KERN_SUCCESS) { - return(SYSCTL_RETURN(req, hinfo.max_cpus)); - } else { - return(EINVAL); - } + if (kret == KERN_SUCCESS) { + return(SYSCTL_RETURN(req, hinfo.max_cpus)); + } else { + return(EINVAL); } case HW_AVAILCPU: - { - host_basic_info_data_t hinfo; - kern_return_t kret; - int count = HOST_BASIC_INFO_COUNT; -#define BSD_HOST 1 - - kret = host_info(BSD_HOST, HOST_BASIC_INFO, &hinfo, &count); - if (kret == KERN_SUCCESS) { - return(SYSCTL_RETURN(req, hinfo.avail_cpus)); - } else { - return(EINVAL); - } + if (kret == KERN_SUCCESS) { + return(SYSCTL_RETURN(req, hinfo.avail_cpus)); + } else { + return(EINVAL); + } + case HW_LOCAL_PHYSICALCPU: + if (kret == KERN_SUCCESS) { + return(SYSCTL_RETURN(req, hinfo.physical_cpu)); + } else { + return(EINVAL); + } + case HW_LOCAL_PHYSICALCPUMAX: + if (kret == KERN_SUCCESS) { + return(SYSCTL_RETURN(req, hinfo.physical_cpu_max)); + } else { + return(EINVAL); + } + case HW_LOCAL_LOGICALCPU: + if (kret == KERN_SUCCESS) { + return(SYSCTL_RETURN(req, hinfo.logical_cpu)); + } else { + return(EINVAL); + } + case HW_LOCAL_LOGICALCPUMAX: + if (kret == KERN_SUCCESS) { + return(SYSCTL_RETURN(req, hinfo.logical_cpu_max)); + } else { + return(EINVAL); } case HW_CACHELINE: val = cpu_info.cache_line_size; @@ -268,11 +290,15 @@ sysctl_hw_generic SYSCTL_HANDLER_ARGS /* * hw.* MIB variables. */ -SYSCTL_PROC (_hw, HW_NCPU, ncpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_NCPU, sysctl_hw_generic, "I", ""); +SYSCTL_PROC (_hw, HW_NCPU, ncpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_NCPU, sysctl_hw_generic, "I", ""); SYSCTL_PROC (_hw, HW_AVAILCPU, activecpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_AVAILCPU, sysctl_hw_generic, "I", ""); +SYSCTL_PROC (_hw, OID_AUTO, physicalcpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_LOCAL_PHYSICALCPU, sysctl_hw_generic, "I", ""); +SYSCTL_PROC (_hw, OID_AUTO, physicalcpu_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_LOCAL_PHYSICALCPUMAX, sysctl_hw_generic, "I", ""); +SYSCTL_PROC (_hw, OID_AUTO, logicalcpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_LOCAL_LOGICALCPU, sysctl_hw_generic, "I", ""); +SYSCTL_PROC (_hw, OID_AUTO, logicalcpu_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_LOCAL_LOGICALCPUMAX, sysctl_hw_generic, "I", ""); SYSCTL_INT (_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD | CTLFLAG_KERN, NULL, BYTE_ORDER, ""); -SYSCTL_INT (_hw, OID_AUTO, cputype, CTLFLAG_RD | CTLFLAG_KERN, &machine_slot[0].cpu_type, 0, ""); -SYSCTL_INT (_hw, OID_AUTO, cpusubtype, CTLFLAG_RD | CTLFLAG_KERN, &machine_slot[0].cpu_subtype, 0, ""); +SYSCTL_INT (_hw, OID_AUTO, cputype, CTLFLAG_RD | CTLFLAG_KERN, &cputype, 0, ""); +SYSCTL_INT (_hw, OID_AUTO, cpusubtype, CTLFLAG_RD | CTLFLAG_KERN, &cpusubtype, 0, ""); SYSCTL_INT2QUAD(_hw, OID_AUTO, pagesize, CTLFLAG_RD | CTLFLAG_KERN, &page_size, ""); SYSCTL_QUAD (_hw, OID_AUTO, busfrequency, CTLFLAG_RD | CTLFLAG_KERN, &gPEClockFrequencyInfo.bus_frequency_hz, ""); SYSCTL_QUAD (_hw, OID_AUTO, busfrequency_min, CTLFLAG_RD | CTLFLAG_KERN, &gPEClockFrequencyInfo.bus_frequency_min_hz, ""); @@ -339,7 +365,10 @@ SYSCTL_PROC(_hw, HW_L3SETTINGS, l3settings, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG void sysctl_mib_init(void) { - + cputype = cpu_type(); + cpusubtype = cpu_subtype(); + cputhreadtype = cpu_threadtype(); 
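
Just below, sysctl_mib_init() registers hw.cputhreadtype only when the processor actually reports a thread type. The OID is declared with CTLFLAG_NOAUTO so it is skipped by the automatic boot-time registration pass, then registered explicitly:

    if (cputhreadtype != CPU_THREADTYPE_NONE) {
        static SYSCTL_INT(_hw, OID_AUTO, cputhreadtype,
            CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN,
            &cputhreadtype, 0, "");
        sysctl_register_oid(&sysctl__hw_cputhreadtype);
    }
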
+ /* * Populate the optional portion of the hw.* MIB. * @@ -347,6 +376,12 @@ sysctl_mib_init(void) * that actually directly relate to the functions in * question. */ + + if (cputhreadtype != CPU_THREADTYPE_NONE) { + static SYSCTL_INT(_hw, OID_AUTO, cputhreadtype, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, &cputhreadtype, 0, ""); + sysctl_register_oid(&sysctl__hw_cputhreadtype); + } + #ifdef __ppc__ { static int altivec_flag = -1; diff --git a/bsd/kern/kern_mman.c b/bsd/kern/kern_mman.c index e234d8955..f91cbf079 100644 --- a/bsd/kern/kern_mman.c +++ b/bsd/kern/kern_mman.c @@ -68,79 +68,55 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/filedesc.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/resourcevar.h> -#include <sys/buf.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/acct.h> #include <sys/wait.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/vadvise.h> #include <sys/trace.h> #include <sys/mman.h> #include <sys/conf.h> #include <sys/stat.h> #include <sys/ubc.h> +#include <sys/sysproto.h> #include <bsm/audit_kernel.h> #include <bsm/audit_kevents.h> #include <mach/mach_types.h> +#include <mach/mach_traps.h> +#include <mach/vm_sync.h> +#include <mach/vm_behavior.h> +#include <mach/vm_inherit.h> +#include <mach/vm_statistics.h> +#include <mach/mach_vm.h> +#include <mach/vm_map.h> +#include <mach/host_priv.h> #include <kern/cpu_number.h> +#include <kern/host.h> #include <vm/vm_map.h> #include <vm/vm_kern.h> #include <vm/vm_pager.h> -#include <mach/vm_sync.h> -#include <mach/vm_behavior.h> -#include <mach/vm_inherit.h> -#include <mach/vm_statistics.h> - -struct sbrk_args { - int incr; -}; - -/* ARGSUSED */ int -sbrk(p, uap, retval) - struct proc *p; - struct sbrk_args *uap; - register_t *retval; +sbrk(__unused struct proc *p, __unused struct sbrk_args *uap, __unused register_t *retval) { /* Not yet implemented */ - return (EOPNOTSUPP); + return (ENOTSUP); } -struct sstk_args { - int incr; -} *uap; - -/* ARGSUSED */ int -sstk(p, uap, retval) - struct proc *p; - struct sstk_args *uap; - register_t *retval; +sstk(__unused struct proc *p, __unused struct sstk_args *uap, __unused register_t *retval) { /* Not yet implemented */ - return (EOPNOTSUPP); + return (ENOTSUP); } -#if COMPAT_43 -/* ARGSUSED */ -int -ogetpagesize(p, uap, retval) - struct proc *p; - void *uap; - register_t *retval; -{ - - *retval = PAGE_SIZE; - return (0); -} -#endif /* COMPAT_43 */ struct osmmap_args { caddr_t addr; @@ -152,80 +128,68 @@ struct osmmap_args { }; int -osmmap(curp, uap, retval) - struct proc *curp; - register struct osmmap_args *uap; - register_t *retval; +osmmap( + struct proc *curp, + register struct osmmap_args *uap, + register_t *retval) { -struct mmap_args { - caddr_t addr; - size_t len; - int prot; - int flags; - int fd; -#ifdef DOUBLE_ALIGN_PARAMS - long pad; -#endif - off_t pos; -} newargs; + struct mmap_args newargs; + user_addr_t addr; + int ret; if ((uap->share == MAP_SHARED )|| (uap->share == MAP_PRIVATE )) { - newargs.addr = uap->addr; - newargs.len = (size_t)uap->len; + newargs.addr = CAST_USER_ADDR_T(uap->addr); + newargs.len = CAST_USER_ADDR_T(uap->len); newargs.prot = uap->prot; newargs.flags = uap->share; newargs.fd = uap->fd; newargs.pos = (off_t)uap->pos; - return(mmap(curp,&newargs, retval)); + ret = mmap(curp, &newargs, &addr); + if (ret == 0) + *retval = CAST_DOWN(register_t, addr); } else - return(EINVAL); + ret = EINVAL; + return ret; } -struct mmap_args { - caddr_t addr; - 
size_t len; - int prot; - int flags; - int fd; -#ifdef DOUBLE_ALIGN_PARAMS - long pad; -#endif - off_t pos; -}; + int -mmap(p, uap, retval) - struct proc *p; - struct mmap_args *uap; - register_t *retval; +mmap(struct proc *p, struct mmap_args *uap, user_addr_t *retval) { /* * Map in special device (must be SHARED) or file */ - struct file *fp; + struct fileproc *fp; register struct vnode *vp; int flags; int prot; int err=0; vm_map_t user_map; kern_return_t result; - vm_offset_t user_addr; - vm_size_t user_size; - vm_offset_t pageoff; + mach_vm_offset_t user_addr; + mach_vm_size_t user_size; + vm_object_offset_t pageoff; vm_object_offset_t file_pos; - boolean_t find_space, docow; + int alloc_flags; + boolean_t docow; vm_prot_t maxprot; void *handle; vm_pager_t pager; int mapanon=0; + int fpref=0; + int error =0; + int fd = uap->fd; - user_addr = (vm_offset_t)uap->addr; - user_size = (vm_size_t) uap->len; - AUDIT_ARG(addr, (void *)user_addr); - AUDIT_ARG(len, (int) user_size); + user_addr = (mach_vm_offset_t)uap->addr; + user_size = (mach_vm_size_t) uap->len; + + AUDIT_ARG(addr, user_addr); + AUDIT_ARG(len, user_size); AUDIT_ARG(fd, uap->fd); prot = (uap->prot & VM_PROT_ALL); flags = uap->flags; + vp = NULLVP; /* * The vm code does not have prototypes & compiler doesn't do the' @@ -237,21 +201,20 @@ mmap(p, uap, retval) /* make sure mapping fits into numeric range etc */ if ((file_pos + user_size > (vm_object_offset_t)-PAGE_SIZE_64) || - ((ssize_t) uap->len < 0 )|| - ((flags & MAP_ANON) && uap->fd != -1)) + ((flags & MAP_ANON) && fd != -1)) return (EINVAL); /* * Align the file position to a page boundary, * and save its page offset component. */ - pageoff = ((vm_offset_t)file_pos & PAGE_MASK); + pageoff = (file_pos & PAGE_MASK); file_pos -= (vm_object_offset_t)pageoff; /* Adjust size for rounding (on both ends). */ user_size += pageoff; /* low end... */ - user_size = (vm_size_t) round_page_32(user_size); /* hi end */ + user_size = mach_vm_round_page(user_size); /* hi end */ /* @@ -267,13 +230,6 @@ mmap(p, uap, retval) user_addr -= pageoff; if (user_addr & PAGE_MASK) return (EINVAL); - /* Address range must be all in user VM space. */ - if (VM_MAX_ADDRESS > 0 && (user_addr + user_size > VM_MAX_ADDRESS)) - return (EINVAL); - if (VM_MIN_ADDRESS > 0 && user_addr < VM_MIN_ADDRESS) - return (EINVAL); - if (user_addr + user_size < user_addr) - return (EINVAL); } #ifdef notyet /* DO not have apis to get this info, need to wait till then*/ @@ -285,8 +241,8 @@ mmap(p, uap, retval) * There should really be a pmap call to determine a reasonable * location. */ - else if (addr < round_page_32(p->p_vmspace->vm_daddr + MAXDSIZ)) - addr = round_page_32(p->p_vmspace->vm_daddr + MAXDSIZ); + else if (addr < mach_vm_round_page(p->p_vmspace->vm_daddr + MAXDSIZ)) + addr = mach_vm_round_page(p->p_vmspace->vm_daddr + MAXDSIZ); #endif @@ -300,37 +256,61 @@ mmap(p, uap, retval) file_pos = 0; mapanon = 1; } else { + struct vnode_attr va; + struct vfs_context context; /* * Mapping file, get fp for validation. Obtain vnode and make * sure it is of appropriate type. 
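
In the file-backed branch below, the old fdgetf() lookup becomes an explicit reference-counting protocol: fp_lookup() takes a reference on the fileproc that must be paired with fp_drop() on every exit path (hence the fpref flag and the bad: label), and the vnode is pinned with a vnode_getwithref()/vnode_put() iocount for the duration of the mapping setup. The skeleton of that bracket, with the error paths compressed:

    struct fileproc *fp;
    struct vnode *vp;
    int error;

    if ((error = fp_lookup(p, fd, &fp, 0)))
        return (error);                 /* no reference taken on failure */

    vp = (struct vnode *)fp->f_fglob->fg_data;
    if ((error = vnode_getwithref(vp)) == 0) {
        /* vp holds an iocount here; safe to use across blocking calls */
        (void) vnode_put(vp);
    }
    fp_drop(p, fd, fp, 0);              /* always pairs with fp_lookup() */
    return (error);
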
*/ - err = fdgetf(p, uap->fd, &fp); + err = fp_lookup(p, fd, &fp, 0); if (err) return(err); - if(fp->f_type == DTYPE_PSXSHM) { - uap->addr = (caddr_t)user_addr; - uap->len = user_size; + fpref = 1; + if(fp->f_fglob->fg_type == DTYPE_PSXSHM) { + uap->addr = (user_addr_t)user_addr; + uap->len = (user_size_t)user_size; uap->prot = prot; uap->flags = flags; uap->pos = file_pos; - return(pshm_mmap(p, uap, retval, fp , pageoff)); + error = pshm_mmap(p, uap, retval, fp, (off_t)pageoff); + goto bad; } - if (fp->f_type != DTYPE_VNODE) - return(EINVAL); - vp = (struct vnode *)fp->f_data; - - if (vp->v_type != VREG && vp->v_type != VCHR) - return (EINVAL); + if (fp->f_fglob->fg_type != DTYPE_VNODE) { + error = EINVAL; + goto bad; + } + vp = (struct vnode *)fp->f_fglob->fg_data; + error = vnode_getwithref(vp); + if(error != 0) + goto bad; + + if (vp->v_type != VREG && vp->v_type != VCHR) { + (void)vnode_put(vp); + error = EINVAL; + goto bad; + } AUDIT_ARG(vnpath, vp, ARG_VNODE1); + + /* conformance change - mmap needs to update access time for mapped + * files + */ + VATTR_INIT(&va); + nanotime(&va.va_access_time); + VATTR_SET_ACTIVE(&va, va_access_time); + context.vc_proc = p; + context.vc_ucred = kauth_cred_get(); + vnode_setattr(vp, &va, &context); /* * XXX hack to handle use of /dev/zero to map anon memory (ala * SunOS). */ if (vp->v_type == VCHR || vp->v_type == VSTR) { - return(ENODEV); + (void)vnode_put(vp); + error = ENODEV; + goto bad; } else { /* * Ensure that file and memory protections are @@ -342,10 +322,13 @@ mmap(p, uap, retval) * proc does a setuid? */ maxprot = VM_PROT_EXECUTE; /* ??? */ - if (fp->f_flag & FREAD) + if (fp->f_fglob->fg_flag & FREAD) maxprot |= VM_PROT_READ; - else if (prot & PROT_READ) - return (EACCES); + else if (prot & PROT_READ) { + (void)vnode_put(vp); + error = EACCES; + goto bad; + } /* * If we are sharing potential changes (either via * MAP_SHARED or via the implicit sharing of character @@ -355,19 +338,30 @@ mmap(p, uap, retval) */ if ((flags & MAP_SHARED) != 0) { - if ((fp->f_flag & FWRITE) != 0) { - struct vattr va; - if ((err = - VOP_GETATTR(vp, &va, - p->p_ucred, p))) - return (err); - if ((va.va_flags & - (IMMUTABLE|APPEND)) == 0) - maxprot |= VM_PROT_WRITE; - else if (prot & PROT_WRITE) - return (EPERM); - } else if ((prot & PROT_WRITE) != 0) - return (EACCES); + if ((fp->f_fglob->fg_flag & FWRITE) != 0) { + /* + * check for write access + * + * Note that we already made this check when granting FWRITE + * against the file, so it seems redundant here. + */ + error = vnode_authorize(vp, NULL, KAUTH_VNODE_CHECKIMMUTABLE, &context); + + /* if not granted for any reason, but we wanted it, bad */ + if ((prot & PROT_WRITE) && (error != 0)) { + vnode_put(vp); + goto bad; + } + + /* if writable, remember */ + if (error == 0) + maxprot |= VM_PROT_WRITE; + + } else if ((prot & PROT_WRITE) != 0) { + (void)vnode_put(vp); + error = EACCES; + goto bad; + } } else maxprot |= VM_PROT_WRITE; @@ -375,42 +369,56 @@ mmap(p, uap, retval) } } - if (user_size == 0) - return(0); + if (user_size == 0) { + if (!mapanon) + (void)vnode_put(vp); + error = 0; + goto bad; + } /* * We bend a little - round the start and end addresses * to the nearest page boundary. 
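
The rounding above is easy to misread, so a concrete example helps. With 4KB pages (PAGE_MASK == 0xfff), a hypothetical request at file offset 0x1234 for 0x100 bytes becomes:

    pageoff   = (file_pos & PAGE_MASK);           /* 0x234 */
    file_pos -= (vm_object_offset_t)pageoff;      /* 0x1000: page-aligned start */

    user_size += pageoff;                         /* 0x334: cover the low fragment */
    user_size  = mach_vm_round_page(user_size);   /* 0x1000: one whole page */

On success the caller gets back user_addr + pageoff, i.e. the address of byte 0x1234 itself within the mapped page.
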
*/ - user_size = round_page_32(user_size); + user_size = mach_vm_round_page(user_size); - if (file_pos & PAGE_MASK_64) - return (EINVAL); + if (file_pos & PAGE_MASK_64) { + if (!mapanon) + (void)vnode_put(vp); + error = EINVAL; + goto bad; + } user_map = current_map(); if ((flags & MAP_FIXED) == 0) { - find_space = TRUE; - user_addr = round_page_32(user_addr); + alloc_flags = VM_FLAGS_ANYWHERE; + user_addr = mach_vm_round_page(user_addr); } else { - if (user_addr != trunc_page_32(user_addr)) - return (EINVAL); - find_space = FALSE; - (void) vm_deallocate(user_map, user_addr, user_size); + if (user_addr != mach_vm_trunc_page(user_addr)) { + if (!mapanon) + (void)vnode_put(vp); + error = EINVAL; + goto bad; + } + /* + * mmap(MAP_FIXED) will replace any existing mappings in the + * specified range, if the new mapping is successful. + * If we just deallocate the specified address range here, + * another thread might jump in and allocate memory in that + * range before we get a chance to establish the new mapping, + * and we won't have a chance to restore the old mappings. + * So we use VM_FLAGS_OVERWRITE to let Mach VM know that it + * has to deallocate the existing mappings and establish the + * new ones atomically. + */ + alloc_flags = VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE; } /* * Lookup/allocate object. */ - if (flags & MAP_ANON) { - /* - * Unnamed anonymous regions always start at 0. - */ - if (handle == 0) - file_pos = 0; - } - if (handle == NULL) { pager = NULL; #ifdef notyet @@ -423,23 +431,22 @@ mmap(p, uap, retval) maxprot |= VM_PROT_EXECUTE; #endif #endif - result = vm_allocate(user_map, &user_addr, user_size, find_space); + result = mach_vm_map(user_map, &user_addr, user_size, 0, + alloc_flags, IPC_PORT_NULL, 0, + FALSE, prot, maxprot, + (flags & MAP_SHARED) ? VM_INHERIT_SHARE : + VM_INHERIT_DEFAULT); if (result != KERN_SUCCESS) goto out; - - result = vm_protect(user_map, user_addr, user_size, TRUE, maxprot); - if (result != KERN_SUCCESS) - goto out; - result = vm_protect(user_map, user_addr, user_size, FALSE, prot); - if (result != KERN_SUCCESS) - goto out; - } else { UBCINFOCHECK("mmap", vp); pager = (vm_pager_t)ubc_getpager(vp); - if (pager == NULL) - return (ENOMEM); + if (pager == NULL) { + (void)vnode_put(vp); + error = ENOMEM; + goto bad; + } /* * Set credentials: @@ -465,80 +472,63 @@ mmap(p, uap, retval) #endif #endif /* notyet */ - result = vm_map_64(user_map, &user_addr, user_size, - 0, find_space, pager, file_pos, docow, - prot, maxprot, - VM_INHERIT_DEFAULT); + result = mach_vm_map(user_map, &user_addr, user_size, + 0, alloc_flags, (ipc_port_t)pager, file_pos, + docow, prot, maxprot, + (flags & MAP_SHARED) ? 
VM_INHERIT_SHARE : + VM_INHERIT_DEFAULT); - if (result != KERN_SUCCESS) + if (result != KERN_SUCCESS) { + (void)vnode_put(vp); goto out; + } - ubc_map(vp); + (void)ubc_map(vp,(prot & ( PROT_EXEC | PROT_READ | PROT_WRITE | PROT_EXEC))); } - if (flags & MAP_SHARED) { - result = vm_inherit(user_map, user_addr, user_size, - VM_INHERIT_SHARE); - if (result != KERN_SUCCESS) { - (void) vm_deallocate(user_map, user_addr, user_size); - goto out; - } - } + if (!mapanon) + (void)vnode_put(vp); out: switch (result) { case KERN_SUCCESS: - if (!mapanon) - *fdflags(p, uap->fd) |= UF_MAPPED; - *retval = (register_t)(user_addr + pageoff); - return (0); + *retval = user_addr + pageoff; + error = 0; + break; case KERN_INVALID_ADDRESS: case KERN_NO_SPACE: - return (ENOMEM); + error = ENOMEM; + break; case KERN_PROTECTION_FAILURE: - return (EACCES); + error = EACCES; + break; default: - return (EINVAL); + error = EINVAL; + break; } - /*NOTREACHED*/ +bad: + if (fpref) + fp_drop(p, fd, fp, 0); + return(error); } -struct msync_args { - caddr_t addr; - int len; - int flags; -}; int -msync(p, uap, retval) - struct proc *p; - struct msync_args *uap; - register_t *retval; +msync(__unused struct proc *p, struct msync_args *uap, __unused register_t *retval) { - vm_offset_t addr; - vm_size_t size, pageoff; + mach_vm_offset_t addr; + mach_vm_size_t size; int flags; vm_map_t user_map; int rv; vm_sync_t sync_flags=0; - addr = (vm_offset_t) uap->addr; - pageoff = (addr & PAGE_MASK); - addr -= pageoff; - size = uap->len; - size = (vm_size_t) round_page_32(size); - flags = uap->flags; - - if (addr + size < addr) - return(EINVAL); - - user_map = current_map(); - - if ((flags & (MS_ASYNC|MS_SYNC)) == (MS_ASYNC|MS_SYNC)) - return (EINVAL); - - if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) - return (EINVAL); + addr = (mach_vm_offset_t) uap->addr; + size = (mach_vm_size_t)uap->len; + if (addr & PAGE_MASK_64) { + /* UNIX SPEC: user address is not page-aligned, return EINVAL */ + return EINVAL; + } if (size == 0) { /* * We cannot support this properly without maintaining @@ -550,6 +540,12 @@ msync(p, uap, retval) return (EINVAL); /* XXX breaks posix apps */ } + flags = uap->flags; + /* disallow contradictory flags */ + if ((flags & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC) || + (flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) + return (EINVAL); + if (flags & MS_KILLPAGES) sync_flags |= VM_SYNC_KILLPAGES; if (flags & MS_DEACTIVATE) @@ -563,111 +559,88 @@ msync(p, uap, retval) else sync_flags |= VM_SYNC_SYNCHRONOUS; } - rv = vm_msync(user_map, addr, size, sync_flags); + + sync_flags |= VM_SYNC_CONTIGUOUS; /* complain if holes */ + + user_map = current_map(); + rv = mach_vm_msync(user_map, addr, size, sync_flags); switch (rv) { case KERN_SUCCESS: break; - case KERN_INVALID_ADDRESS: - return (EINVAL); /* Sun returns ENOMEM? 
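
The msync() conversion settles the old "Sun returns ENOMEM?" question visible above: by passing VM_SYNC_CONTIGUOUS, mach_vm_msync() reports a hole in the requested region as KERN_INVALID_ADDRESS, which the syscall now maps to ENOMEM as the standard requires (see the hunk just below). The flag translation feeding that call:

    vm_sync_t sync_flags = 0;

    if (flags & MS_KILLPAGES)
        sync_flags |= VM_SYNC_KILLPAGES;
    if (flags & MS_DEACTIVATE)
        sync_flags |= VM_SYNC_DEACTIVATE;
    if (flags & MS_INVALIDATE)
        sync_flags |= VM_SYNC_INVALIDATE;
    if (!(flags & (MS_KILLPAGES | MS_DEACTIVATE))) {
        if (flags & MS_ASYNC)
            sync_flags |= VM_SYNC_ASYNCHRONOUS;
        else
            sync_flags |= VM_SYNC_SYNCHRONOUS;
    }
    sync_flags |= VM_SYNC_CONTIGUOUS;   /* report holes, rather than skip them */
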
*/ + case KERN_INVALID_ADDRESS: /* hole in region being sync'ed */ + return (ENOMEM); case KERN_FAILURE: return (EIO); default: return (EINVAL); } - return (0); } int -mremap() +mremap(void) { /* Not yet implemented */ - return (EOPNOTSUPP); + return (ENOTSUP); } -struct munmap_args { - caddr_t addr; - int len; -}; int -munmap(p, uap, retval) - struct proc *p; - struct munmap_args *uap; - register_t *retval; - +munmap(__unused struct proc *p, struct munmap_args *uap, __unused register_t *retval) { - vm_offset_t user_addr; - vm_size_t user_size, pageoff; + mach_vm_offset_t user_addr; + mach_vm_size_t user_size; kern_return_t result; - user_addr = (vm_offset_t) uap->addr; - user_size = (vm_size_t) uap->len; + user_addr = (mach_vm_offset_t) uap->addr; + user_size = (mach_vm_size_t) uap->len; - AUDIT_ARG(addr, (void *)user_addr); - AUDIT_ARG(len, (int) user_size); + AUDIT_ARG(addr, user_addr); + AUDIT_ARG(len, user_size); - pageoff = (user_addr & PAGE_MASK); + if (user_addr & PAGE_MASK_64) { + /* UNIX SPEC: user address is not page-aligned, return EINVAL */ + return EINVAL; + } - user_addr -= pageoff; - user_size += pageoff; - user_size = round_page_32(user_size); if (user_addr + user_size < user_addr) return(EINVAL); - if (user_size == 0) - return (0); - - /* Address range must be all in user VM space. */ - if (VM_MAX_ADDRESS > 0 && (user_addr + user_size > VM_MAX_ADDRESS)) - return (EINVAL); - if (VM_MIN_ADDRESS > 0 && user_addr < VM_MIN_ADDRESS) - return (EINVAL); - + if (user_size == 0) { + /* UNIX SPEC: size is 0, return EINVAL */ + return EINVAL; + } - result = vm_deallocate(current_map(), user_addr, user_size); + result = mach_vm_deallocate(current_map(), user_addr, user_size); if (result != KERN_SUCCESS) { return(EINVAL); } return(0); } -void -munmapfd(p, fd) - struct proc *p; - int fd; -{ - /* - * XXX should vm_deallocate any regions mapped to this file - */ - *fdflags(p, fd) &= ~UF_MAPPED; -} - -struct mprotect_args { - caddr_t addr; - int len; - int prot; -}; int -mprotect(p, uap, retval) - struct proc *p; - struct mprotect_args *uap; - register_t *retval; +mprotect(__unused struct proc *p, struct mprotect_args *uap, __unused register_t *retval) { register vm_prot_t prot; - vm_offset_t user_addr; - vm_size_t user_size, pageoff; + mach_vm_offset_t user_addr; + mach_vm_size_t user_size; kern_return_t result; vm_map_t user_map; AUDIT_ARG(addr, uap->addr); AUDIT_ARG(len, uap->len); AUDIT_ARG(value, uap->prot); - user_addr = (vm_offset_t) uap->addr; - user_size = (vm_size_t) uap->len; + + user_addr = (mach_vm_offset_t) uap->addr; + user_size = (mach_vm_size_t) uap->len; prot = (vm_prot_t)(uap->prot & VM_PROT_ALL); + if (user_addr & PAGE_MASK_64) { + /* UNIX SPEC: user address is not page-aligned, return EINVAL */ + return EINVAL; + } + #ifdef notyet /* Hmm .. 
*/ #if defined(VM_PROT_READ_IS_EXEC) @@ -676,41 +649,28 @@ mprotect(p, uap, retval) #endif #endif /* notyet */ - pageoff = (user_addr & PAGE_MASK); - user_addr -= pageoff; - user_size += pageoff; - user_size = round_page_32(user_size); - if (user_addr + user_size < user_addr) - return(EINVAL); - user_map = current_map(); - result = vm_map_protect(user_map, user_addr, user_addr+user_size, prot, - FALSE); + result = mach_vm_protect(user_map, user_addr, user_size, + FALSE, prot); switch (result) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); + case KERN_INVALID_ADDRESS: + /* UNIX SPEC: for an invalid address range, return ENOMEM */ + return ENOMEM; } return (EINVAL); } -struct minherit_args { - void *addr; - size_t len; - int inherit; -}; - int -minherit(p, uap, retval) - struct proc *p; - struct minherit_args *uap; - register_t *retval; +minherit(__unused struct proc *p, struct minherit_args *uap, __unused register_t *retval) { - vm_offset_t addr; - vm_size_t size, pageoff; + mach_vm_offset_t addr; + mach_vm_size_t size; register vm_inherit_t inherit; vm_map_t user_map; kern_return_t result; @@ -718,19 +678,13 @@ minherit(p, uap, retval) AUDIT_ARG(addr, uap->addr); AUDIT_ARG(len, uap->len); AUDIT_ARG(value, uap->inherit); - addr = (vm_offset_t)uap->addr; - size = uap->len; - inherit = uap->inherit; - pageoff = (addr & PAGE_MASK); - addr -= pageoff; - size += pageoff; - size = (vm_size_t) round_page_32(size); - if (addr + size < addr) - return(EINVAL); + addr = (mach_vm_offset_t)uap->addr; + size = (mach_vm_size_t)uap->len; + inherit = uap->inherit; user_map = current_map(); - result = vm_inherit(user_map, addr, size, + result = mach_vm_inherit(user_map, addr, size, inherit); switch (result) { case KERN_SUCCESS: @@ -741,45 +695,19 @@ minherit(p, uap, retval) return (EINVAL); } -struct madvise_args { - caddr_t addr; - int len; - int behav; -}; -/* ARGSUSED */ int -madvise(p, uap, retval) - struct proc *p; - struct madvise_args *uap; - register_t *retval; +madvise(__unused struct proc *p, struct madvise_args *uap, __unused register_t *retval) { vm_map_t user_map; - vm_offset_t start, end; + mach_vm_offset_t start; + mach_vm_size_t size; vm_behavior_t new_behavior; kern_return_t result; - /* - * Check for illegal addresses. Watch out for address wrap... Note - * that VM_*_ADDRESS are not constants due to casts (argh). - */ - if (VM_MAX_ADDRESS > 0 && - ((vm_offset_t) uap->addr + uap->len) > VM_MAX_ADDRESS) - return (ENOMEM); - if (VM_MIN_ADDRESS > 0 && uap->addr < VM_MIN_ADDRESS) - return (ENOMEM); - - if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) - return (ENOMEM); - /* * Since this routine is only advisory, we default to conservative * behavior. 
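
madvise() below drops its hand-rolled address-range checks and lets mach_vm_behavior_set() validate the range instead, mapping KERN_INVALID_ADDRESS to ENOMEM. The advice-to-behavior translation is a plain table; only the MADV_RANDOM arm survives in the hunk below, but the remaining arms follow the same pattern (the usual mapping, for reference):

    switch (uap->behav) {
    case MADV_RANDOM:
        new_behavior = VM_BEHAVIOR_RANDOM;
        break;
    case MADV_SEQUENTIAL:
        new_behavior = VM_BEHAVIOR_SEQUENTIAL;
        break;
    case MADV_NORMAL:
        new_behavior = VM_BEHAVIOR_DEFAULT;
        break;
    case MADV_WILLNEED:
        new_behavior = VM_BEHAVIOR_WILLNEED;
        break;
    case MADV_DONTNEED:
        new_behavior = VM_BEHAVIOR_DONTNEED;
        break;
    default:
        return (EINVAL);
    }
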
*/ - start = trunc_page_32((vm_offset_t) uap->addr); - end = round_page_32((vm_offset_t) uap->addr + uap->len); - - user_map = current_map(); - switch (uap->behav) { case MADV_RANDOM: new_behavior = VM_BEHAVIOR_RANDOM; @@ -800,33 +728,28 @@ madvise(p, uap, retval) return(EINVAL); } - result = vm_behavior_set(user_map, start, end, new_behavior); + start = (mach_vm_offset_t) uap->addr; + size = (mach_vm_size_t) uap->len; + + user_map = current_map(); + + result = mach_vm_behavior_set(user_map, start, size, new_behavior); switch (result) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: - return (EINVAL); + return (ENOMEM); } return (EINVAL); } -struct mincore_args { - const void *addr; - size_t len; - char *vec; -}; -/* ARGSUSED */ int -mincore(p, uap, retval) - struct proc *p; - struct mincore_args *uap; - register_t *retval; +mincore(__unused struct proc *p, struct mincore_args *uap, __unused register_t *retval) { - vm_offset_t addr, first_addr; - vm_offset_t end; + mach_vm_offset_t addr, first_addr, end; vm_map_t map; - char *vec; + user_addr_t vec; int error; int vecindex, lastvecindex; int mincoreinfo=0; @@ -834,17 +757,17 @@ mincore(p, uap, retval) kern_return_t ret; int numref; + char c; + map = current_map(); /* * Make sure that the addresses presented are valid for user * mode. */ - first_addr = addr = trunc_page_32((vm_offset_t) uap->addr); - end = addr + (vm_size_t)round_page_32(uap->len); + first_addr = addr = mach_vm_trunc_page(uap->addr); + end = addr + mach_vm_round_page(uap->len); - if (VM_MAX_ADDRESS > 0 && end > VM_MAX_ADDRESS) - return (EINVAL); if (end < addr) return (EINVAL); @@ -861,7 +784,7 @@ mincore(p, uap, retval) * up the pages elsewhere. */ lastvecindex = -1; - for(addr; addr < end; addr += PAGE_SIZE) { + for( ; addr < end; addr += PAGE_SIZE ) { pqueryinfo = 0; ret = vm_map_page_query(map, addr, &pqueryinfo, &numref); if (ret != KERN_SUCCESS) @@ -885,7 +808,8 @@ mincore(p, uap, retval) * the byte vector is zeroed for those skipped entries. 
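 *
 * Sketch of the resulting user-visible contract (illustrative only; one
 * status byte per page of the request):
 *
 *	char vec[4];
 *	if (mincore(base, 4 * PAGE_SIZE, vec) == 0) {
 *		int resident = vec[0] & MINCORE_INCORE;	// first page in core?
 *	}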
 */
	while((lastvecindex + 1) < vecindex) {
-		error = subyte( vec + lastvecindex, 0);
+		c = 0;
+		error = copyout(&c, vec + lastvecindex, 1);
		if (error) {
			return (EFAULT);
		}
@@ -895,7 +819,8 @@ mincore(p, uap, retval)
		/*
		 * Pass the page information to the user
		 */
-		error = subyte( vec + vecindex, mincoreinfo);
+		c = (char)mincoreinfo;
+		error = copyout(&c, vec + vecindex, 1);
		if (error) {
			return (EFAULT);
		}
@@ -908,7 +833,8 @@ mincore(p, uap, retval)
	 */
	vecindex = (end - first_addr) >> PAGE_SHIFT;
	while((lastvecindex + 1) < vecindex) {
-		error = subyte( vec + lastvecindex, 0);
+		c = 0;
+		error = copyout(&c, vec + lastvecindex, 1);
		if (error) {
			return (EFAULT);
		}
@@ -918,36 +844,31 @@ mincore(p, uap, retval)
 	return (0);
 }

-struct mlock_args {
-	caddr_t	addr;
-	size_t	len;
-};
-
 int
-mlock(p, uap, retval)
-	struct proc *p;
-	struct mlock_args *uap;
-	register_t *retval;
+mlock(__unused struct proc *p, struct mlock_args *uap, __unused register_t *retval)
 {
 	vm_map_t user_map;
-	vm_offset_t addr;
-	vm_size_t size, pageoff;
-	int error;
+	vm_map_offset_t addr;
+	vm_map_size_t size, pageoff;
 	kern_return_t	result;

 	AUDIT_ARG(addr, uap->addr);
 	AUDIT_ARG(len, uap->len);
-	addr = (vm_offset_t) uap->addr;
-	size = uap->len;
-	pageoff = (addr & PAGE_MASK);
-	addr -= pageoff;
-	size += pageoff;
-	size = (vm_size_t) round_page_32(size);
+	addr = (vm_map_offset_t) uap->addr;
+	size = (vm_map_size_t)uap->len;

 	/* disable wrap around */
 	if (addr + size < addr)
 		return (EINVAL);
+
+	if (size == 0)
+		return (0);
+
+	pageoff = (addr & PAGE_MASK);
+	addr -= pageoff;
+	size = vm_map_round_page(size+pageoff);
+
 #ifdef notyet
 /* Hmm.. What am I going to do with this? */
 	if (atop(size) + cnt.v_wire_count > vm_page_max_wired)
@@ -957,7 +878,7 @@ mlock(p, uap, retval)
 	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
 		return (ENOMEM);
 #else
-	error = suser(p->p_ucred, &p->p_acflag);
+	error = suser(kauth_cred_get(), &p->p_acflag);
 	if (error)
 		return (error);
 #endif
@@ -965,45 +886,30 @@ mlock(p, uap, retval)

 	user_map = current_map();

-	/* vm_wire */
-	result = vm_map_wire(user_map, addr, (vm_offset_t)(addr+size), VM_PROT_NONE, TRUE);
+	/* have to call vm_map_wire directly to pass "I don't know" protections */
+	result = vm_map_wire(user_map, addr, addr+size, VM_PROT_NONE, TRUE);
 	return (result == KERN_SUCCESS ? 0 : ENOMEM);
 }

-struct munlock_args {
-	caddr_t	addr;
-	size_t	len;
-};
 int
-munlock(p, uap, retval)
-	struct proc *p;
-	struct munlock_args *uap;
-	register_t *retval;
+munlock(__unused struct proc *p, struct munlock_args *uap, __unused register_t *retval)
 {
-	vm_offset_t addr;
-	vm_size_t size, pageoff;
-	int error;
+	mach_vm_offset_t addr;
+	mach_vm_size_t size;
 	vm_map_t user_map;
 	kern_return_t	result;

 	AUDIT_ARG(addr, uap->addr);
-	AUDIT_ARG(len, uap->len);
-	addr = (vm_offset_t) uap->addr;
-	size = uap->len;
+	AUDIT_ARG(len, uap->len);

-	pageoff = (addr & PAGE_MASK);
-	addr -= pageoff;
-	size += pageoff;
-	size = (vm_size_t) round_page_32(size);
+	addr = (mach_vm_offset_t) uap->addr;
+	size = (mach_vm_size_t)uap->len;

-	/* disable wrap around */
-	if (addr + size < addr)
-		return (EINVAL);
 #ifdef notyet
/* Hmm.. What am I going to do with this?
*/ #ifndef pmap_wired_count - error = suser(p->p_ucred, &p->p_acflag); + error = suser(kauth_cred_get(), &p->p_acflag); if (error) return (error); #endif @@ -1011,46 +917,28 @@ munlock(p, uap, retval) user_map = current_map(); - /* vm_wire */ - result = vm_wire(host_priv_self(), user_map, addr, size, VM_PROT_NONE); + /* JMM - need to remove all wirings by spec - this just removes one */ + result = mach_vm_wire(host_priv_self(), user_map, addr, size, VM_PROT_NONE); return (result == KERN_SUCCESS ? 0 : ENOMEM); } -struct mlockall_args { - int how; -}; - int -mlockall(p, uap) - struct proc *p; - struct mlockall_args *uap; +mlockall(__unused struct proc *p, __unused struct mlockall_args *uap, __unused register_t *retval) { return (ENOSYS); } -struct munlockall_args { - int how; -}; - int -munlockall(p, uap) - struct proc *p; - struct munlockall_args *uap; +munlockall(__unused struct proc *p, __unused struct munlockall_args *uap, __unused register_t *retval) { return(ENOSYS); } /* BEGIN DEFUNCT */ -struct obreak_args { - char *nsiz; -}; int -obreak(p, uap, retval) - struct proc *p; - struct obreak_args *uap; - register_t *retval; +obreak(__unused struct proc *p, __unused struct obreak_args *uap, __unused register_t *retval) { /* Not implemented, obsolete */ return (ENOMEM); @@ -1059,38 +947,32 @@ obreak(p, uap, retval) int both; int -ovadvise() +ovadvise(__unused struct proc *p, __unused struct ovadvise_args *uap, __unused register_t *retval) { #ifdef lint both = 0; #endif + return( 0 ); } /* END DEFUNCT */ -/* CDY need to fix interface to allow user to map above 32 bits */ /* USV: No! need to obsolete map_fd()! mmap() already supports 64 bits */ kern_return_t -map_fd( - int fd, - vm_offset_t offset, - vm_offset_t *va, - boolean_t findspace, - vm_size_t size) +map_fd(struct map_fd_args *args) { + int fd = args->fd; + vm_offset_t offset = args->offset; + vm_offset_t *va = args->va; + boolean_t findspace = args->findspace; + vm_size_t size = args->size; kern_return_t ret; - boolean_t funnel_state; AUDIT_MACH_SYSCALL_ENTER(AUE_MAPFD); - AUDIT_ARG(addr, va); + AUDIT_ARG(addr, CAST_DOWN(user_addr_t, va)); AUDIT_ARG(fd, fd); - funnel_state = thread_funnel_set(kernel_flock, TRUE); - - ret = map_fd_funneled( fd, (vm_object_offset_t)offset, - va, findspace, size); - - (void) thread_funnel_set(kernel_flock, FALSE); + ret = map_fd_funneled( fd, (vm_object_offset_t)offset, va, findspace, size); AUDIT_MACH_SYSCALL_EXIT(ret); return ret; @@ -1105,114 +987,153 @@ map_fd_funneled( vm_size_t size) { kern_return_t result; - struct file *fp; + struct fileproc *fp; struct vnode *vp; void * pager; vm_offset_t map_addr=0; vm_size_t map_size; - vm_map_copy_t tmp; int err=0; vm_map_t my_map; struct proc *p =(struct proc *)current_proc(); + struct vnode_attr vattr; + struct vfs_context context; /* * Find the inode; verify that it's a regular file. 
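 *
 * (Note on the error handling introduced below: fp_lookup() takes a
 * fileproc reference and vnode_getwithref() an iocount on the vnode, so
 * every failure path must unwind both, i.e.
 *
 *	(void)vnode_put(vp);	// once the vnode is held
 *	err = ...;
 *	goto bad;		// 'bad:' drops the fileproc via fp_drop()
 *
 * which is the pattern each exit in this function now follows.)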
*/ - err = fdgetf(p, fd, &fp); + err = fp_lookup(p, fd, &fp, 0); if (err) return(err); - if (fp->f_type != DTYPE_VNODE) - return(KERN_INVALID_ARGUMENT); + if (fp->f_fglob->fg_type != DTYPE_VNODE){ + err = KERN_INVALID_ARGUMENT; + goto bad; + } - if (!(fp->f_flag & FREAD)) - return (KERN_PROTECTION_FAILURE); + if (!(fp->f_fglob->fg_flag & FREAD)) { + err = KERN_PROTECTION_FAILURE; + goto bad; + } - vp = (struct vnode *)fp->f_data; + vp = (struct vnode *)fp->f_fglob->fg_data; + err = vnode_getwithref(vp); + if(err != 0) + goto bad; - if (vp->v_type != VREG) - return (KERN_INVALID_ARGUMENT); + if (vp->v_type != VREG) { + (void)vnode_put(vp); + err = KERN_INVALID_ARGUMENT; + goto bad; + } AUDIT_ARG(vnpath, vp, ARG_VNODE1); + /* conformance change - mmap needs to update access time for mapped + * files + */ + VATTR_INIT(&vattr); + nanotime(&vattr.va_access_time); + VATTR_SET_ACTIVE(&vattr, va_access_time); + context.vc_proc = p; + context.vc_ucred = kauth_cred_get(); + vnode_setattr(vp, &vattr, &context); + if (offset & PAGE_MASK_64) { printf("map_fd: file offset not page aligned(%d : %s)\n",p->p_pid, p->p_comm); - return (KERN_INVALID_ARGUMENT); + (void)vnode_put(vp); + err = KERN_INVALID_ARGUMENT; + goto bad; } - map_size = round_page_32(size); + map_size = round_page(size); /* * Allow user to map in a zero length file. */ - if (size == 0) - return (KERN_SUCCESS); + if (size == 0) { + (void)vnode_put(vp); + err = KERN_SUCCESS; + goto bad; + } /* * Map in the file. */ UBCINFOCHECK("map_fd_funneled", vp); pager = (void *) ubc_getpager(vp); - if (pager == NULL) - return (KERN_FAILURE); + if (pager == NULL) { + (void)vnode_put(vp); + err = KERN_FAILURE; + goto bad; + } my_map = current_map(); result = vm_map_64( my_map, - &map_addr, map_size, (vm_offset_t)0, TRUE, - pager, offset, TRUE, + &map_addr, map_size, (vm_offset_t)0, + VM_FLAGS_ANYWHERE, pager, offset, TRUE, VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT); - if (result != KERN_SUCCESS) - return (result); + if (result != KERN_SUCCESS) { + (void)vnode_put(vp); + err = result; + goto bad; + } if (!findspace) { vm_offset_t dst_addr; vm_map_copy_t tmp; - if (copyin(va, &dst_addr, sizeof (dst_addr)) || + if (copyin(CAST_USER_ADDR_T(va), &dst_addr, sizeof (dst_addr)) || trunc_page_32(dst_addr) != dst_addr) { (void) vm_map_remove( my_map, map_addr, map_addr + map_size, VM_MAP_NO_FLAGS); - return (KERN_INVALID_ADDRESS); + (void)vnode_put(vp); + err = KERN_INVALID_ADDRESS; + goto bad; } - result = vm_map_copyin( - my_map, - map_addr, map_size, TRUE, - &tmp); + result = vm_map_copyin(my_map, (vm_map_address_t)map_addr, + (vm_map_size_t)map_size, TRUE, &tmp); if (result != KERN_SUCCESS) { - (void) vm_map_remove( - my_map, - map_addr, map_addr + map_size, + (void) vm_map_remove(my_map, vm_map_trunc_page(map_addr), + vm_map_round_page(map_addr + map_size), VM_MAP_NO_FLAGS); - return (result); + (void)vnode_put(vp); + err = result; + goto bad; } - result = vm_map_copy_overwrite( - my_map, - dst_addr, tmp, FALSE); + result = vm_map_copy_overwrite(my_map, + (vm_map_address_t)dst_addr, tmp, FALSE); if (result != KERN_SUCCESS) { vm_map_copy_discard(tmp); - return (result); + (void)vnode_put(vp); + err = result; + goto bad; } } else { - if (copyout(&map_addr, va, sizeof (map_addr))) { - (void) vm_map_remove( - my_map, - map_addr, map_addr + map_size, + if (copyout(&map_addr, CAST_USER_ADDR_T(va), sizeof (map_addr))) { + (void) vm_map_remove(my_map, vm_map_trunc_page(map_addr), + vm_map_round_page(map_addr + map_size), VM_MAP_NO_FLAGS); - return 
(KERN_INVALID_ADDRESS); + (void)vnode_put(vp); + err = KERN_INVALID_ADDRESS; + goto bad; } } ubc_setcred(vp, current_proc()); - ubc_map(vp); - - return (KERN_SUCCESS); + (void)ubc_map(vp, (PROT_READ | PROT_WRITE | PROT_EXEC)); + (void)vnode_put(vp); + err = 0; +bad: + fp_drop(p, fd, fp, 0); + return (err); } + diff --git a/bsd/kern/kern_newsysctl.c b/bsd/kern/kern_newsysctl.c index 064678311..63524f2c4 100644 --- a/bsd/kern/kern_newsysctl.c +++ b/bsd/kern/kern_newsysctl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -66,7 +66,7 @@ #include <sys/kernel.h> #include <sys/sysctl.h> #include <sys/malloc.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/systm.h> #include <bsm/audit_kernel.h> @@ -307,15 +307,15 @@ sysctl_sysctl_name SYSCTL_HANDLER_ARGS int error = 0; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children, *lsp2; - char buf[10]; + char tempbuf[10]; while (namelen) { if (!lsp) { - snprintf(buf,sizeof(buf),"%d",*name); + snprintf(tempbuf,sizeof(tempbuf),"%d",*name); if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) - error = SYSCTL_OUT(req, buf, strlen(buf)); + error = SYSCTL_OUT(req, tempbuf, strlen(tempbuf)); if (error) return (error); namelen--; @@ -497,8 +497,7 @@ sysctl_sysctl_name2oid SYSCTL_HANDLER_ARGS if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */ return (ENAMETOOLONG); - p = _MALLOC(req->newlen+1, M_TEMP, M_WAITOK); - + MALLOC(p, char *,req->newlen+1, M_TEMP, M_WAITOK); if (!p) return ENOMEM; @@ -737,14 +736,13 @@ static int sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l) { size_t i = 0; - int error = 0; if (req->oldptr) { i = l; if (i > req->oldlen - req->oldidx) i = req->oldlen - req->oldidx; if (i > 0) - bcopy((void*)p, (char *)req->oldptr + req->oldidx, i); + bcopy((void*)p, CAST_DOWN(char *, (req->oldptr + req->oldidx)), i); } req->oldidx += l; if (req->oldptr && i != l) @@ -759,7 +757,7 @@ sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l) return 0; if (req->newlen - req->newidx < l) return (EINVAL); - bcopy((char *)req->newptr + req->newidx, p, l); + bcopy(CAST_DOWN(char *, (req->newptr + req->newidx)), p, l); req->newidx += l; return (0); } @@ -779,10 +777,10 @@ kernel_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldle if (oldlenp) req.oldlen = *oldlenp; if (old) - req.oldptr= old; + req.oldptr = CAST_USER_ADDR_T(old); if (newlen) { req.newlen = newlen; - req.newptr = new; + req.newptr = CAST_USER_ADDR_T(new); } req.oldfunc = sysctl_old_kernel; req.newfunc = sysctl_new_kernel; @@ -806,7 +804,7 @@ kernel_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldle /* unlock memory if required */ if (req.lock == 2) - vsunlock(req.oldptr, req.oldlen, B_WRITE); + vsunlock(req.oldptr, (user_size_t)req.oldlen, B_WRITE); memlock.sl_lock = 0; @@ -845,8 +843,7 @@ sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) if (i > req->oldlen - req->oldidx) i = req->oldlen - req->oldidx; if (i > 0) - error = copyout((void*)p, (char *)req->oldptr + req->oldidx, - i); + error = copyout((void*)p, (req->oldptr + req->oldidx), i); } req->oldidx += l; if (error) @@ -865,7 +862,7 @@ sysctl_new_user(struct sysctl_req *req, void *p, size_t l) return 0; if (req->newlen - req->newidx < l) return (EINVAL); - error = copyin((char *)req->newptr + req->newidx, p, l); + error = copyin((req->newptr + req->newidx), 
p, l); req->newidx += l; return (error); } @@ -934,13 +931,6 @@ found: return EINVAL; } - /* - * Switch to the NETWORK funnel for CTL_NET and KERN_IPC sysctls - */ - - if (((name[0] == CTL_NET) || ((name[0] == CTL_KERN) && - (name[1] == KERN_IPC)))) - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { i = (oid->oid_handler) (oid, @@ -952,14 +942,6 @@ found: req); } - /* - * Switch back to the KERNEL funnel, if necessary - */ - - if (((name[0] == CTL_NET) || ((name[0] == CTL_KERN) && - (name[1] == KERN_IPC)))) - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return (i); } @@ -984,17 +966,17 @@ new_sysctl(struct proc *p, struct sysctl_args *uap) if (uap->namelen > CTL_MAXNAME || uap->namelen < 2) return (EINVAL); - error = copyin(uap->name, &name, uap->namelen * sizeof(int)); + error = copyin(CAST_USER_ADDR_T(uap->name), &name, uap->namelen * sizeof(int)); if (error) return (error); error = userland_sysctl(p, name, uap->namelen, - uap->old, uap->oldlenp, 0, - uap->new, uap->newlen, &j); + CAST_USER_ADDR_T(uap->old), uap->oldlenp, 0, + CAST_USER_ADDR_T(uap->new), uap->newlen, &j); if (error && error != ENOMEM) return (error); if (uap->oldlenp) { - i = copyout(&j, uap->oldlenp, sizeof(j)); + i = copyout(&j, CAST_USER_ADDR_T(uap->oldlenp), sizeof(j)); if (i) return (i); } @@ -1006,7 +988,9 @@ new_sysctl(struct proc *p, struct sysctl_args *uap) * must be in kernel space. */ int -userland_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval) +userland_sysctl(struct proc *p, int *name, u_int namelen, user_addr_t oldp, + size_t *oldlenp, int inkernel, user_addr_t newp, size_t newlen, + size_t *retval) { int error = 0; struct sysctl_req req, req2; @@ -1019,19 +1003,19 @@ userland_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *old if (inkernel) { req.oldlen = *oldlenp; } else { - error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp)); + error = copyin(CAST_USER_ADDR_T(oldlenp), &req.oldlen, sizeof(*oldlenp)); if (error) return (error); } } - if (old) { - req.oldptr= old; + if (oldp) { + req.oldptr = oldp; } if (newlen) { req.newlen = newlen; - req.newptr = new; + req.newptr = newp; } req.oldfunc = sysctl_old_user; diff --git a/bsd/kern/kern_panicinfo.c b/bsd/kern/kern_panicinfo.c index 83f753872..9ad8549f1 100644 --- a/bsd/kern/kern_panicinfo.c +++ b/bsd/kern/kern_panicinfo.c @@ -23,210 +23,159 @@ #include <sys/param.h> #include <sys/fcntl.h> #include <sys/malloc.h> -#include <sys/namei.h> #include <sys/proc.h> -#include <sys/stat.h> #include <sys/sysctl.h> #include <sys/vnode.h> #include <sys/vm.h> +#include <sys/systm.h> +#include <mach/mach_types.h> #include <mach/kern_return.h> +#include <kern/kern_types.h> +#include <vm/vm_kern.h> -/* prototypes not exported by osfmk. */ -extern void kmem_free(vm_map_t, vm_offset_t, vm_size_t); -extern kern_return_t kmem_alloc_wired(vm_map_t, vm_offset_t *, vm_size_t); +/* prototypes not exported by osfmk/console. 
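+ *
+ * These back the KERN_PANICINFO handler below; the expected call pattern
+ * is roughly (sketch):
+ *
+ *	panic_dialog_get_image(&ptr, &size);	// query the active image
+ *	panic_dialog_set_image(newptr, newlen);	// install a replacement;
+ *						// a NULL pointer restores
+ *						// the default image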
*/ +extern void panic_dialog_test( void ); +extern int panic_dialog_set_image( const unsigned char * ptr, unsigned int size ); +extern void panic_dialog_get_image( unsigned char ** ptr, unsigned int * size ); -/* Globals */ -static off_t imagesizelimit = (4 * 4096); +/* make the compiler happy */ +extern int sysctl_dopanicinfo(int *, u_int, user_addr_t, size_t *, user_addr_t, size_t, struct proc *); -/* Information about the current panic image */ -static int image_bits = 32; /* Bitdepth */ -static char *image_pathname = NULL; /* path to it */ -static size_t image_pathlen = 0; /* and the length of the pathname */ +#define PANIC_IMAGE_SIZE_LIMIT (32 * 4096) /* 128K - Maximum amount of memory consumed for the panic UI */ +#define KERN_PANICINFO_TEST (KERN_PANICINFO_IMAGE+2) /* Allow the panic UI to be tested by root without causing a panic */ -static vm_offset_t image_ptr = NULL; /* the image itself */ -static off_t image_size = 0; /* and the imagesize */ - - -__private_extern__ void -get_panicimage(vm_offset_t *imageptr, vm_size_t *imagesize, int *imagebits) -{ - *imageptr = image_ptr; - *imagesize = image_size; - *imagebits = image_bits; -} - -static int -panicimage_from_file( - char *imname, - off_t sizelimit, - vm_offset_t *image, - off_t *filesize, - struct proc *p) -{ - int error = 0; - int error1 = 0; - int aresid; - struct nameidata nd; - struct vattr vattr; - struct vnode * vp; - kern_return_t kret; - struct pcred *pcred = p->p_cred; - struct ucred *cred = pcred->pc_ucred; - vm_offset_t iobuf; - - /* Open the file */ - NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, imname, p); - error = vn_open(&nd, FREAD, S_IRUSR); - if (error) - return (error); - vp = nd.ni_vp; - - if (vp->v_type != VREG) { - error = EFAULT; - goto out; - } - - /* get the file size */ - error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) - goto out; - - /* validate the file size */ - if (vattr.va_size > sizelimit) { - error = EFBIG; - goto out; - } - - /* allocate kernel wired memory */ - kret = kmem_alloc_wired(kernel_map, &iobuf, - (vm_size_t)vattr.va_size); - if (kret != KERN_SUCCESS) { - switch (kret) { - default: - error = EINVAL; - break; - case KERN_NO_SPACE: - case KERN_RESOURCE_SHORTAGE: - error = ENOMEM; - break; - case KERN_PROTECTION_FAILURE: - error = EPERM; - break; - } - goto out; - } - - /* read the file in the kernel buffer */ - error = vn_rdwr(UIO_READ, vp, (caddr_t)iobuf, (int)vattr.va_size, - (off_t)0, UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, - cred, &aresid, p); - if (error) { - (void)kmem_free(kernel_map, iobuf, (vm_size_t)vattr.va_size); - goto out; - } - - /* - * return the image to the caller - * freeing this memory is callers responsibility - */ - *image = iobuf; - *filesize = (off_t)vattr.va_size; - -out: - VOP_UNLOCK(vp, 0, p); - error1 = vn_close(vp, FREAD, cred, p); - if (error == 0) - error = error1; - return (error); -} +/* Local data */ +static int image_size_limit = PANIC_IMAGE_SIZE_LIMIT; __private_extern__ int sysctl_dopanicinfo(name, namelen, oldp, oldlenp, newp, newlen, p) int *name; u_int namelen; - void *oldp; + user_addr_t oldp; size_t *oldlenp; - void *newp; + user_addr_t newp; size_t newlen; struct proc *p; { int error = 0; - int bitdepth = 32; /* default is 32 bits */ - char *imname; + vm_offset_t newimage = (vm_offset_t )NULL; + kern_return_t kret; + unsigned char * prev_image_ptr; + unsigned int prev_image_size; + /* all sysctl names at this level are terminal */ if (namelen != 1) return (ENOTDIR); /* overloaded */ + if ( (error = proc_suser(p)) ) /* must be super user to 
muck with image */ + return (error); + switch (name[0]) { default: - return (EOPNOTSUPP); + return (ENOTSUP); + + case KERN_PANICINFO_TEST: + + panic_dialog_test(); + return (0); + case KERN_PANICINFO_MAXSIZE: - if (newp != NULL && (error = suser(p->p_ucred, &p->p_acflag))) - return (error); - error = sysctl_quad(oldp, oldlenp, newp, newlen, &imagesizelimit); + + /* return the image size limits */ + + newlen = 0; + newp = USER_ADDR_NULL; + + error = sysctl_int(oldp, oldlenp, newp, newlen, &image_size_limit); + return (error); - case KERN_PANICINFO_IMAGE16: - bitdepth = 16; - /* and fall through */ - case KERN_PANICINFO_IMAGE32: - /* allocate a buffer for the image pathname */ - MALLOC_ZONE(imname, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); - - if (!newp) { - bcopy(image_pathname, imname, image_pathlen); - imname[image_pathlen] = '\0'; - } else - imname[0] = '\0'; - error = sysctl_string(oldp, oldlenp, newp, newlen, - imname, MAXPATHLEN); - if (newp && !error) { - char *tmpstr, *oldstr; - off_t filesize = 0; - size_t len; - vm_offset_t image; - vm_offset_t oimage = NULL; - vm_size_t osize = 0; /* covariable: quiet compiler */ - - len = strlen(imname); - oldstr = image_pathname; - - error = panicimage_from_file(imname, imagesizelimit, - &image, &filesize, p); - if (error) + case KERN_PANICINFO_IMAGE: + + /* If we have a new image, allocate wired kernel memory and copy it in from user space */ + if ( newp != USER_ADDR_NULL ) { + + /* check the length of the incoming image before allocating space for it. */ + if ( newlen > (size_t)image_size_limit ) + return (ENOMEM); + + /* allocate some kernel wired memory for the new image */ + kret = kmem_alloc(kernel_map, &newimage, (vm_size_t)round_page_32(newlen)); + + if (kret != KERN_SUCCESS) { + switch (kret) { + default: + error = EINVAL; + break; + case KERN_NO_SPACE: + case KERN_RESOURCE_SHORTAGE: + error = ENOMEM; + break; + case KERN_PROTECTION_FAILURE: + error = EPERM; + break; + } + + return (error); + } + + /* copy the image in from user space */ + if ( (error = copyin(newp, (char *) newimage, newlen)) ) goto errout; - /* release the old image */ - if (image_ptr) { - oimage = image_ptr; - osize = image_size; + } else { /* setup to make the default image active */ + + newimage = (vm_offset_t )NULL; + newlen = 0; + } + + /* get the current image location and size */ + panic_dialog_get_image( &prev_image_ptr, &prev_image_size ); + + /* did the caller request a copy of the previous image ? */ + if ( oldp != USER_ADDR_NULL ) { + if ( *oldlenp < prev_image_size ) { + error = ERANGE; + goto errout; } - /* remember the new one */ - image_ptr = image; - image_bits = bitdepth; /* new bith depth */ - image_size = filesize; /* new imagesize */ + /* copy the image to user space or zero the size if the default image is active */ + if ( prev_image_ptr != NULL ) { + if ( (error = copyout( prev_image_ptr, oldp, prev_image_size )) ) + goto errout; - if (oimage) - kmem_free(kernel_map, oimage, osize); + *oldlenp = prev_image_size; + } + else /* tell the user that the default image is active */ + *oldlenp = 0; + } - /* save the new name */ - MALLOC(tmpstr, char *, len+1, M_TEMP, M_WAITOK); - bcopy(imname, tmpstr, len); - tmpstr[len] = '\0'; + /* Make the new image active, or reactivate the default image. + But, handle the special case of asking for the current image + without changing the current image. 
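+	 *
+	 * From user space that special case is just a read-only query; a
+	 * hedged sketch (mib constants assumed from <sys/sysctl.h>):
+	 *
+	 *	int mib[3] = { CTL_KERN, KERN_PANICINFO, KERN_PANICINFO_IMAGE };
+	 *	size_t len = sizeof(buf);
+	 *	sysctl(mib, 3, buf, &len, NULL, 0);	// oldp set, newp NULL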
+ */ - image_pathname = tmpstr; /* new pathname */ - image_pathlen = len; /* new pathname length */ + if ( !(oldp && newp == USER_ADDR_NULL) ) { + if ( (error = panic_dialog_set_image( (unsigned char *) newimage, newlen )) ) + goto errout; - /* free the old name */ - FREE(oldstr, M_TEMP); + /* free the wired memory used by the previous image */ + if ( prev_image_ptr != NULL ) { + (void)kmem_free(kernel_map, (vm_offset_t) prev_image_ptr, (vm_size_t)round_page_32(prev_image_size)); + printf("Panic UI memory freed (%d)\n", round_page_32(prev_image_size)); + } } + + return (0); + errout: - FREE_ZONE(imname, MAXPATHLEN, M_NAMEI); + if ( newimage != (vm_offset_t )NULL ) + (void)kmem_free(kernel_map, newimage, (vm_size_t)round_page_32(newlen)); + return (error); } } diff --git a/bsd/kern/kern_pcsamples.c b/bsd/kern/kern_pcsamples.c index 7d7ab169f..f231dd6cb 100644 --- a/bsd/kern/kern_pcsamples.c +++ b/bsd/kern/kern_pcsamples.c @@ -23,15 +23,17 @@ #include <sys/kdebug.h> #include <sys/errno.h> #include <sys/param.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/vm.h> #include <sys/sysctl.h> +#include <sys/systm.h> #include <vm/vm_kern.h> +#include <machine/machine_routines.h> -unsigned int pc_buftomem = 0; -u_long * pc_buffer = 0; /* buffer that holds each pc */ -u_long * pc_bufptr = 0; -u_long * pc_buflast = 0; +vm_offset_t pc_buftomem = 0; +unsigned int * pc_buffer = 0; /* buffer that holds each pc */ +unsigned int * pc_bufptr = 0; +unsigned int * pc_buflast = 0; unsigned int npcbufs = 8192; /* number of pc entries in buffer */ unsigned int pc_bufsize = 0; unsigned int pcsample_flags = 0; @@ -43,16 +45,26 @@ boolean_t pc_trace_frameworks = FALSE; char pcsample_comm[MAXCOMLEN + 1]; /* Set the default framework boundaries */ -u_long pcsample_beg = 0; -u_long pcsample_end = 0; +unsigned int pcsample_beg = 0; +unsigned int pcsample_end = 0; static pid_t global_state_pid = -1; /* Used to control exclusive use of pc_buffer */ extern int pc_trace_buf[]; extern int pc_trace_cnt; +void add_pcbuffer(void); +int branch_tracing_enabled(void); +int disable_branch_tracing(void); +int enable_branch_tracing(void); +int pcsamples_bootstrap(void); +void pcsamples_clear(void); +int pcsamples_control(int *name, u_int namelen, user_addr_t where, size_t *sizep); +int pcsamples_read(user_addr_t buffer, size_t *number); +int pcsamples_reinit(void); + int -enable_branch_tracing() +enable_branch_tracing(void) { #ifndef i386 struct proc *p; @@ -74,24 +86,24 @@ enable_branch_tracing() } int -disable_branch_tracing() +disable_branch_tracing(void) { - struct proc *p; - switch (pc_sample_pid) { + struct proc *p; + switch (pc_sample_pid) { case -1: - pc_trace_frameworks = FALSE; - break; - case 0: - break; - default: - p = pfind(pc_sample_pid); - if (p) { - p->p_flag &= ~P_BTRACE; - } - break; -} - clr_be_bit(); - return 1; + pc_trace_frameworks = FALSE; + break; + case 0: + break; + default: + p = pfind(pc_sample_pid); + if (p) { + p->p_flag &= ~P_BTRACE; + } + break; + } + clr_be_bit(); + return 1; } /* @@ -99,7 +111,7 @@ disable_branch_tracing() * is called from context_switch in the scheduler */ int -branch_tracing_enabled() +branch_tracing_enabled(void) { struct proc *p = current_proc(); if (TRUE == pc_trace_frameworks) return TRUE; @@ -111,12 +123,10 @@ branch_tracing_enabled() void -add_pcbuffer() +add_pcbuffer(void) { int i; - u_long pc; - struct proc *curproc; - extern unsigned int kdebug_flags; + unsigned int pc; if (!pcsample_enable) return; @@ -134,7 +144,7 @@ add_pcbuffer() } /* Then the 
sample is in our range */ - *pc_bufptr = (u_long)pc; + *pc_bufptr = pc; pc_bufptr++; } } @@ -149,7 +159,8 @@ add_pcbuffer() return; } -pcsamples_bootstrap() +int +pcsamples_bootstrap(void) { if (!disable_branch_tracing()) return(ENOTSUP); @@ -157,9 +168,9 @@ pcsamples_bootstrap() pc_bufsize = npcbufs * sizeof(* pc_buffer); if (kmem_alloc(kernel_map, &pc_buftomem, (vm_size_t)pc_bufsize) == KERN_SUCCESS) - pc_buffer = (u_long *) pc_buftomem; + pc_buffer = (unsigned int *) pc_buftomem; else - pc_buffer= (u_long *) 0; + pc_buffer = NULL; if (pc_buffer) { pc_bufptr = pc_buffer; @@ -173,12 +184,12 @@ pcsamples_bootstrap() } -pcsamples_reinit() +int +pcsamples_reinit(void) { -int x; -int ret=0; + int ret=0; - pcsample_enable = 0; + pcsample_enable = 0; if (pc_bufsize && pc_buffer) kmem_free(kernel_map, (vm_offset_t)pc_buffer, pc_bufsize); @@ -187,16 +198,17 @@ int ret=0; return(ret); } -pcsamples_clear() +void +pcsamples_clear(void) { - /* Clean up the sample buffer, set defaults */ - global_state_pid = -1; + /* Clean up the sample buffer, set defaults */ + global_state_pid = -1; pcsample_enable = 0; if(pc_bufsize && pc_buffer) kmem_free(kernel_map, (vm_offset_t)pc_buffer, pc_bufsize); - pc_buffer = (u_long *)0; - pc_bufptr = (u_long *)0; - pc_buflast = (u_long *)0; + pc_buffer = NULL; + pc_bufptr = NULL; + pc_buflast = NULL; pc_bufsize = 0; pcsample_beg= 0; pcsample_end= 0; @@ -204,27 +216,24 @@ pcsamples_clear() (void)disable_branch_tracing(); pc_sample_pid = 0; pc_trace_frameworks = FALSE; - } -pcsamples_control(name, namelen, where, sizep) -int *name; -u_int namelen; -char *where; -size_t *sizep; +int +pcsamples_control(int *name, __unused u_int namelen, user_addr_t where, size_t *sizep) { -int ret=0; -int size=*sizep; -unsigned int value = name[1]; -pcinfo_t pc_bufinfo; -pid_t *pidcheck; - -pid_t curpid; -struct proc *p, *curproc; - - if (name[0] != PCSAMPLE_GETNUMBUF) - { - if(curproc = current_proc()) + int ret=0; + size_t size=*sizep; + int value = name[1]; + pcinfo_t pc_bufinfo; + pid_t *pidcheck; + + pid_t curpid; + struct proc *p, *curproc; + + if (name[0] != PCSAMPLE_GETNUMBUF) + { + curproc = current_proc(); + if (curproc) curpid = curproc->p_pid; else return (ESRCH); @@ -243,29 +252,29 @@ struct proc *p, *curproc; /* The global pid exists, deny this request */ return(EBUSY); } - } - } + } + } switch(name[0]) { - case PCSAMPLE_DISABLE: /* used to disable */ + case PCSAMPLE_DISABLE: /* used to disable */ pcsample_enable=0; break; - case PCSAMPLE_SETNUMBUF: - /* The buffer size is bounded by a min and max number of samples */ - if (value < pc_trace_cnt) { - ret=EINVAL; + case PCSAMPLE_SETNUMBUF: + /* The buffer size is bounded by a min and max number of samples */ + if (value < pc_trace_cnt) { + ret=EINVAL; break; } if (value <= MAX_PCSAMPLES) - /* npcbufs = value & ~(PC_TRACE_CNT-1); */ - npcbufs = value; + /* npcbufs = value & ~(PC_TRACE_CNT-1); */ + npcbufs = value; else - npcbufs = MAX_PCSAMPLES; + npcbufs = MAX_PCSAMPLES; break; - case PCSAMPLE_GETNUMBUF: - if(size < sizeof(pcinfo_t)) { - ret=EINVAL; + case PCSAMPLE_GETNUMBUF: + if (size < sizeof(pc_bufinfo)) { + ret=EINVAL; break; } pc_bufinfo.npcbufs = npcbufs; @@ -278,13 +287,13 @@ struct proc *p, *curproc; ret=EINVAL; } break; - case PCSAMPLE_SETUP: + case PCSAMPLE_SETUP: ret=pcsamples_reinit(); break; - case PCSAMPLE_REMOVE: + case PCSAMPLE_REMOVE: pcsamples_clear(); break; - case PCSAMPLE_READBUF: + case PCSAMPLE_READBUF: /* A nonzero value says enable and wait on the buffer */ /* A zero value says read up the buffer 
immediately */ if (value == 0) @@ -333,13 +342,13 @@ struct proc *p, *curproc; } break; - case PCSAMPLE_SETREG: - if (size < sizeof(pcinfo_t)) + case PCSAMPLE_SETREG: + if (size < sizeof(pc_bufinfo)) { ret = EINVAL; break; } - if (copyin(where, &pc_bufinfo, sizeof(pcinfo_t))) + if (copyin(where, &pc_bufinfo, sizeof(pc_bufinfo))) { ret = EINVAL; break; @@ -348,25 +357,25 @@ struct proc *p, *curproc; pcsample_beg = pc_bufinfo.pcsample_beg; pcsample_end = pc_bufinfo.pcsample_end; break; - case PCSAMPLE_COMM: - if (!(sizeof(pcsample_comm) > size)) - { - ret = EINVAL; - break; - } - bzero((void *)pcsample_comm, sizeof(pcsample_comm)); - if (copyin(where, pcsample_comm, size)) - { - ret = EINVAL; + case PCSAMPLE_COMM: + if (!(sizeof(pcsample_comm) > size)) + { + ret = EINVAL; + break; + } + bzero((void *)pcsample_comm, sizeof(pcsample_comm)); + if (copyin(where, pcsample_comm, size)) + { + ret = EINVAL; break; - } + } /* Check for command name or pid */ - if (pcsample_comm[0] != '\0') - { - ret= EOPNOTSUPP; + if (pcsample_comm[0] != '\0') + { + ret= ENOTSUP; break; - } + } else { if (size != (2 * sizeof(pid_t))) @@ -381,8 +390,8 @@ struct proc *p, *curproc; } } break; - default: - ret= EOPNOTSUPP; + default: + ret= ENOTSUP; break; } return(ret); @@ -396,13 +405,13 @@ struct proc *p, *curproc; to fill the buffer and throw the rest away. This buffer never wraps. */ -pcsamples_read(u_long *buffer, size_t *number) +int +pcsamples_read(user_addr_t buffer, size_t *number) { -int count=0; -int ret=0; -int copycount; + size_t count=0; + size_t copycount; - count = (*number)/sizeof(u_long); + count = (*number)/sizeof(* pc_buffer); if (count && pc_bufsize && pc_buffer) { @@ -418,7 +427,7 @@ int copycount; copycount = count; /* We actually have data to send up */ - if(copyout(pc_buffer, buffer, copycount * sizeof(u_long))) + if(copyout(pc_buffer, buffer, copycount * sizeof(* pc_buffer))) { *number = 0; return(EINVAL); diff --git a/bsd/kern/kern_physio.c b/bsd/kern/kern_physio.c index c4f2415f5..2b6ba9062 100644 --- a/bsd/kern/kern_physio.c +++ b/bsd/kern/kern_physio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -66,23 +66,28 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/buf.h> +#include <sys/buf_internal.h> #include <sys/conf.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/uio_internal.h> int physio(strategy, bp, dev, flags, minphys, uio, blocksize) void (*strategy)(); - struct buf *bp; + buf_t bp; dev_t dev; int flags; u_int (*minphys)(); struct uio *uio; int blocksize; { - struct iovec *iovp; struct proc *p = current_proc(); - int error, done, i, nobuf, s, todo; + int error, i, nobuf, todo, iosize; +#if LP64KERN + int64_t done; +#else + int done; +#endif error = 0; flags &= B_READ | B_WRITE; @@ -95,64 +100,56 @@ physio(strategy, bp, dev, flags, minphys, uio, blocksize) * we're doing a read, that's a *write* to user-space. */ for (i = 0; i < uio->uio_iovcnt; i++) { - if(uio->uio_segflg != UIO_SYSSPACE) { - if (!useracc(uio->uio_iov[i].iov_base, - uio->uio_iov[i].iov_len, + if(UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) { + if (!useracc(uio_iov_base_at(uio, i), + uio_iov_len_at(uio, i), (flags == B_READ) ? B_WRITE : B_READ)) return (EFAULT); } } /* Make sure we have a buffer, creating one if necessary. 
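 *
 * Sketch of the buffer life cycle this function now follows (all of
 * these buf_* calls appear below; pairing shown for illustration):
 *
 *	bp = buf_alloc((vnode_t)0);		// only if the caller passed none
 *	while (buf_acquire(bp, 0, 0, 0) == EAGAIN)
 *		;				// retry until acquired
 *	(*strategy)(bp);			// then buf_biowait(bp)
 *	buf_drop(bp);				// or buf_free(bp) if allocated here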
*/ if (nobuf = (bp == NULL)) { -// bp = getphysbuf(); - panic("physio: null buf pointer\n"); + bp = buf_alloc((vnode_t)0); } - /* [raise the processor priority level to splbio;] */ - s = splbio(); - /* [while the buffer is marked busy] */ - while (bp->b_flags & B_BUSY) { - /* [mark the buffer wanted] */ - bp->b_flags |= B_WANTED; - /* [wait until the buffer is available] */ - tsleep((caddr_t)bp, PRIBIO+1, "physbuf", 0); + while (((error = (int)buf_acquire(bp, 0, 0, 0)) == EAGAIN)); + + if (error) { + if (nobuf) + buf_free(bp); + return (error); } - /* Mark it busy, so nobody else will use it. */ - bp->b_flags |= B_BUSY; - - /* [lower the priority level] */ - splx(s); - /* [set up the fixed part of the buffer for a transfer] */ bp->b_dev = dev; - bp->b_error = 0; bp->b_proc = p; + buf_seterror(bp, 0); /* - * [while there are data to transfer and no I/O error] + * [while there is data to transfer and no I/O error] * Note that I/O errors are handled with a 'goto' at the bottom * of the 'while' loop. */ for (i = 0; i < uio->uio_iovcnt; i++) { - iovp = &uio->uio_iov[i]; - while (iovp->iov_len > 0) { + while (uio_iov_len_at(uio, i) > 0) { /* * [mark the buffer busy for physical I/O] * (i.e. set B_PHYS (because it's an I/O to user * memory, and B_RAW, because B_RAW is to be * "Set by physio for raw transfers.", in addition - * to the "busy" and read/write flag.) + * to the read/write flag.) */ - s = splbio(); - bp->b_flags = B_BUSY | B_PHYS | B_RAW | flags; - splx(s); + buf_setflags(bp, B_PHYS | B_RAW | flags); + + if ( (iosize = uio_iov_len_at(uio, i)) > MAXPHYSIO_WIRED) + iosize = MAXPHYSIO_WIRED; /* [set up the buffer for a maximum-sized transfer] */ - bp->b_blkno = uio->uio_offset / blocksize; - bp->b_bcount = iovp->iov_len; - bp->b_data = iovp->iov_base; + buf_setblkno(bp, uio->uio_offset / blocksize); + buf_setcount(bp, iosize); + // LP64todo - fix this! + buf_setdataptr(bp, CAST_DOWN(caddr_t, uio_iov_base_at(uio, i))); /* * [call minphys to bound the tranfer size] @@ -160,65 +157,42 @@ physio(strategy, bp, dev, flags, minphys, uio, blocksize) * for later comparison. */ (*minphys)(bp); - todo = bp->b_bcount; + todo = buf_count(bp); /* * [lock the part of the user address space involved * in the transfer] - * Beware vmapbuf(); it clobbers b_data and - * saves it in b_saveaddr. However, vunmapbuf() - * restores it. */ - if(uio->uio_segflg != UIO_SYSSPACE) - vslock(bp->b_data, todo); + if(UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) + vslock(CAST_USER_ADDR_T(buf_dataptr(bp)), + (user_size_t)todo); -#if 0 - vmapbuf(bp, todo); -#endif /* 0 */ /* [call strategy to start the transfer] */ (*strategy)(bp); - /* - * Note that the raise/wait/lower/get error - * steps below would be done by biowait(), but - * we want to unlock the address space before - * we lower the priority. - * - * [raise the priority level to splbio] - */ - s = splbio(); /* [wait for the transfer to complete] */ - while ((bp->b_flags & B_DONE) == 0) - tsleep((caddr_t) bp, PRIBIO + 1, "physio", 0); + error = (int)buf_biowait(bp); /* * [unlock the part of the address space previously * locked] */ -#if 0 - vunmapbuf(bp, todo); -#endif /* 0 */ - if(uio->uio_segflg != UIO_SYSSPACE) - vsunlock(bp->b_data, todo); - - /* remember error value (save a splbio/splx pair) */ - if (bp->b_flags & B_ERROR) - error = (bp->b_error ? 
bp->b_error : EIO); - - /* [lower the priority level] */ - splx(s); + if(UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) + vsunlock(CAST_USER_ADDR_T(buf_dataptr(bp)), + (user_size_t)todo, + (flags & B_READ)); /* * [deduct the transfer size from the total number * of data to transfer] */ - done = bp->b_bcount - bp->b_resid; - iovp->iov_len -= done; - iovp->iov_base += done; - uio->uio_offset += done; - uio->uio_resid -= done; + done = buf_count(bp) - buf_resid(bp); + uio_iov_len_add_at(uio, -done, i); + uio_iov_base_add_at(uio, done, i); + uio->uio_offset += done; + uio_setresid(uio, (uio_resid(uio) - done)); /* * Now, check for an error. @@ -235,25 +209,14 @@ done: * Remember if somebody wants it, so we can wake them up below. * Also, if we had to steal it, give it back. */ - s = splbio(); - bp->b_flags &= ~(B_BUSY | B_PHYS | B_RAW); -#if 0 - if (nobuf) - putphysbuf(bp); + buf_clearflags(bp, B_PHYS | B_RAW); + if (nobuf) + buf_free(bp); else -#endif /* 0 */ - { - /* - * [if another process is waiting for the raw I/O buffer, - * wake up processes waiting to do physical I/O; - */ - if (bp->b_flags & B_WANTED) { - bp->b_flags &= ~B_WANTED; - wakeup(bp); + { + buf_drop(bp); } - } - splx(s); return (error); } @@ -272,8 +235,8 @@ minphys(bp) struct buf *bp; { - bp->b_bcount = min(MAXPHYS, bp->b_bcount); - return bp->b_bcount; + buf_setcount(bp, min(MAXPHYS, buf_count(bp))); + return buf_count(bp); } /* diff --git a/bsd/kern/kern_proc.c b/bsd/kern/kern_proc.c index 2a4636217..89a60a915 100644 --- a/bsd/kern/kern_proc.c +++ b/bsd/kern/kern_proc.c @@ -67,11 +67,10 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> -#include <sys/proc.h> -#include <sys/buf.h> +#include <sys/proc_internal.h> #include <sys/acct.h> #include <sys/wait.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <ufs/ufs/quota.h> #include <sys/uio.h> #include <sys/malloc.h> @@ -80,6 +79,7 @@ #include <sys/tty.h> #include <sys/signalvar.h> #include <sys/syslog.h> +#include <sys/kernel_types.h> /* * Structure associated with user cacheing. @@ -102,6 +102,7 @@ struct pgrphashhead *pgrphashtbl; u_long pgrphash; struct proclist allproc; struct proclist zombproc; +extern struct tty cons; /* Name to give to core files */ __private_extern__ char corefilename[MAXPATHLEN+1] = {"/cores/core.%P"}; @@ -154,6 +155,8 @@ chgproccnt(uid, diff) panic("chgproccnt: lost user"); } MALLOC_ZONE(uip, struct uidinfo *, sizeof(*uip), M_PROC, M_WAITOK); + if (uip == NULL) + panic("chgproccnt: M_PROC zone depleted"); LIST_INSERT_HEAD(uipp, uip, ui_hash); uip->ui_uid = uid; uip->ui_proccnt = diff; @@ -177,7 +180,7 @@ inferior(p) * Is p an inferior of t ? 
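 *
 * i.e. walk p's parent chain via p_pptr; p is an inferior of t iff t is
 * reached before the chain terminates at the pid 0 sentinel.  Roughly:
 *
 *	for (; p != t; p = p->p_pptr)
 *		if (p->p_pid == 0)
 *			return (0);
 *	return (1);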
*/ int -isinferior(struct proc *p, register struct proc *t) +isinferior(struct proc *p, struct proc *t) { /* if p==t they are not inferior */ @@ -189,6 +192,186 @@ isinferior(struct proc *p, register struct proc *t) return (1); } +int +proc_isinferior(int pid1, int pid2) +{ + proc_t p; + proc_t t; + + if (((p = pfind(pid1)) != (struct proc *)0 ) && ((t = pfind(pid2)) != (struct proc *)0)) + return (isinferior(p, t)); + return(0); +} + +proc_t +proc_find(int pid) +{ + return(pfind(pid)); +} + +int +proc_rele(__unused proc_t p) +{ + return(0); +} + +proc_t +proc_self() +{ + return(current_proc()); +} + + +int +proc_pid(proc_t p) +{ + return(p->p_pid); +} + +int +proc_ppid(proc_t p) +{ + if (p->p_pptr != (struct proc *)0) + return(p->p_pptr->p_pid); + return(0); +} + +int +proc_selfpid(void) +{ + struct proc *p = current_proc(); + return(p->p_pid); +} + + +int +proc_selfppid(void) +{ + struct proc *p = current_proc(); + if (p->p_pptr) + return(p->p_pptr->p_pid); + else + return(0); +} + +void +proc_name(int pid, char * buf, int size) +{ + struct proc *p; + + if ((p = pfind(pid))!= (struct proc *)0) { + strncpy(buf, &p->p_comm[0], size); + buf[size-1] = 0; + } +} + +void +proc_selfname(char * buf, int size) +{ + struct proc *p; + + if ((p = current_proc())!= (struct proc *)0) { + strncpy(buf, &p->p_comm[0], size); + buf[size-1] = 0; + } +} + +void +proc_signal(int pid, int signum) +{ + proc_t p; + + if ((p = pfind(pid))!= (struct proc *)0) { + psignal(p, signum); + } +} + +int +proc_issignal(int pid, sigset_t mask) +{ + proc_t p; + + if ((p = pfind(pid))!= (struct proc *)0) { + return(proc_pendingsignals(p, mask)); + } + return(0); +} + +int +proc_noremotehang(proc_t p) +{ + int retval = 0; + + if (p) + retval = p->p_flag & P_NOREMOTEHANG; + return(retval? 1: 0); + +} + +int +proc_exiting(proc_t p) +{ + int retval = 0; + + if (p) + retval = p->p_flag & P_WEXIT; + return(retval? 1: 0); +} + + +int +proc_forcequota(proc_t p) +{ + int retval = 0; + + if (p) + retval = p->p_flag & P_FORCEQUOTA; + return(retval? 1: 0); + +} + +int +proc_tbe(proc_t p) +{ + int retval = 0; + + if (p) + retval = p->p_flag & P_TBE; + return(retval? 
1: 0); + +} + +int +proc_suser(proc_t p) +{ + return(suser(p->p_ucred, NULL)); + +} + +kauth_cred_t +proc_ucred(proc_t p) +{ + return(p->p_ucred); +} + + +int +proc_is64bit(proc_t p) +{ + return(IS_64BIT_PROCESS(p)); +} + +/* LP64todo - figure out how to identify 64-bit processes if NULL procp */ +int +IS_64BIT_PROCESS(proc_t p) +{ + if (p && (p->p_flag & P_LP64)) + return(1); + else + return(0); +} + + /* * Locate a process by number */ @@ -267,6 +450,8 @@ enterpgrp(p, pgid, mksess) #endif MALLOC_ZONE(pgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, M_WAITOK); + if (pgrp == NULL) + panic("enterpgrp: M_PGRP zone depleted"); if ((np = pfind(savepid)) == NULL || np != p) { FREE_ZONE(pgrp, sizeof(struct pgrp), M_PGRP); return (ESRCH); @@ -279,6 +464,8 @@ enterpgrp(p, pgid, mksess) */ MALLOC_ZONE(sess, struct session *, sizeof(struct session), M_SESSION, M_WAITOK); + if (sess == NULL) + panic("enterpgrp: M_SESSION zone depleted"); sess->s_leader = p; sess->s_sid = p->p_pid; sess->s_count = 1; @@ -341,13 +528,21 @@ void pgdelete(pgrp) register struct pgrp *pgrp; { + struct tty * ttyp; + int removettypgrp = 0; + ttyp = pgrp->pg_session->s_ttyp; if (pgrp->pg_session->s_ttyp != NULL && - pgrp->pg_session->s_ttyp->t_pgrp == pgrp) + pgrp->pg_session->s_ttyp->t_pgrp == pgrp) { pgrp->pg_session->s_ttyp->t_pgrp = NULL; + removettypgrp = 1; + } LIST_REMOVE(pgrp, pg_hash); - if (--pgrp->pg_session->s_count == 0) + if (--pgrp->pg_session->s_count == 0) { + if (removettypgrp && (ttyp == &cons) && (ttyp->t_session == pgrp->pg_session)) + ttyp->t_session = 0; FREE_ZONE(pgrp->pg_session, sizeof(struct session), M_SESSION); + } FREE_ZONE(pgrp, sizeof *pgrp, M_PGRP); } @@ -400,7 +595,7 @@ fixjobc(struct proc *p, struct pgrp *pgrp, int entering) hispgrp->pg_jobc++; else if (--hispgrp->pg_jobc == 0) orphanpg(hispgrp); -} + } } /* @@ -427,15 +622,17 @@ orphanpg(struct pgrp *pg) } #ifdef DEBUG +void pgrpdump(void); /* forward declare here (called from debugger) */ + void -pgrpdump() +pgrpdump(void) { - register struct pgrp *pgrp; - register struct proc *p; - register i; + struct pgrp *pgrp; + struct proc *p; + u_long i; for (i = 0; i <= pgrphash; i++) { - if (pgrp = pgrphashtbl[i].lh_first) { + if ((pgrp = pgrphashtbl[i].lh_first) != NULL) { printf("\tindx %d\n", i); for (; pgrp != 0; pgrp = pgrp->pg_hash.le_next) { printf("\tpgrp 0x%08x, pgid %d, sess %p, sesscnt %d, mem %p\n", diff --git a/bsd/kern/kern_prot.c b/bsd/kern/kern_prot.c index db25475d6..1a963663b 100644 --- a/bsd/kern/kern_prot.c +++ b/bsd/kern/kern_prot.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -68,31 +68,34 @@ #include <sys/acct.h> #include <sys/systm.h> #include <sys/ucred.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/user.h> +#include <sys/kauth.h> #include <sys/timeb.h> #include <sys/times.h> #include <sys/malloc.h> #include <bsm/audit_kernel.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> +#include <sys/sysproto.h> #include <mach/message.h> #include <mach/host_security.h> #include <kern/host.h> +int groupmember(gid_t gid, kauth_cred_t cred); +int is_suser(void); +int is_suser1(void); + +extern int prepare_profile_database(int user); + /* * setprivexec: (dis)allow this process to hold * task, thread, or execption ports of processes about to exec. 
*/ -struct setprivexec_args { - int flag; -}; int -setprivexec(p, uap, retval) - struct proc *p; - register struct setprivexec_args *uap; - register_t *retval; +setprivexec(struct proc *p, struct setprivexec_args *uap, register_t *retval) { AUDIT_ARG(value, uap->flag); *retval = p->p_debugger; @@ -101,24 +104,17 @@ setprivexec(p, uap, retval) } /* ARGSUSED */ -getpid(p, uap, retval) - struct proc *p; - void *uap; - register_t *retval; +int +getpid(struct proc *p, __unused struct getpid_args *uap, register_t *retval) { *retval = p->p_pid; -#if COMPAT_43 - retval[1] = p->p_pptr->p_pid; -#endif return (0); } /* ARGSUSED */ -getppid(p, uap, retval) - struct proc *p; - void *uap; - register_t *retval; +int +getppid(struct proc *p, __unused struct getppid_args *uap, register_t *retval) { *retval = p->p_pptr->p_pid; @@ -126,10 +122,8 @@ getppid(p, uap, retval) } /* Get process group ID; note that POSIX getpgrp takes no parameter */ -getpgrp(p, uap, retval) - struct proc *p; - void *uap; - register_t *retval; +int +getpgrp(struct proc *p, __unused struct getpgrp_args *uap, register_t *retval) { *retval = p->p_pgrp->pg_id; @@ -137,15 +131,8 @@ getpgrp(p, uap, retval) } /* Get an arbitary pid's process group id */ -struct getpgid_args { - pid_t pid; -}; - int -getpgid(p, uap, retval) - struct proc *p; - struct getpgid_args *uap; - register_t *retval; +getpgid(struct proc *p, struct getpgid_args *uap, register_t *retval) { struct proc *pt; @@ -163,15 +150,9 @@ found: /* * Get an arbitary pid's session id. */ -struct getsid_args { - pid_t pid; -}; int -getsid(p, uap, retval) - struct proc *p; - struct getsid_args *uap; - register_t *retval; +getsid(struct proc *p, struct getsid_args *uap, register_t *retval) { struct proc *pt; @@ -187,41 +168,54 @@ found: } /* ARGSUSED */ -getuid(p, uap, retval) - struct proc *p; - void *uap; - register_t *retval; +int +getuid(__unused struct proc *p, __unused struct getuid_args *uap, register_t *retval) { - *retval = p->p_cred->p_ruid; -#if COMPAT_43 - retval[1] = p->p_ucred->cr_uid; -#endif + *retval = kauth_getruid(); return (0); } /* ARGSUSED */ -geteuid(p, uap, retval) - struct proc *p; - void *uap; - register_t *retval; +int +geteuid(__unused struct proc *p, __unused struct geteuid_args *uap, register_t *retval) +{ + + *retval = kauth_getuid(); + return (0); +} + +/* + * Return the per-thread override identity. + */ +int +gettid(__unused struct proc *p, struct gettid_args *uap, register_t *retval) { + struct uthread *uthread = get_bsdthread_info(current_thread()); + int error; - *retval = p->p_ucred->cr_uid; + /* + * If this thread is not running with an override identity, we can't + * return one to the caller, so return an error instead. + */ + if (!(uthread->uu_flag & UT_SETUID)) + return (ESRCH); + + if ((error = suword(uap->uidp, uthread->uu_ucred->cr_ruid))) + return (error); + if ((error = suword(uap->gidp, uthread->uu_ucred->cr_rgid))) + return (error); + + *retval = 0; return (0); } /* ARGSUSED */ -getgid(p, uap, retval) - struct proc *p; - void *uap; - register_t *retval; +int +getgid(__unused struct proc *p, __unused struct getgid_args *uap, register_t *retval) { - *retval = p->p_cred->p_rgid; -#if COMPAT_43 - retval[1] = p->p_ucred->cr_groups[0]; -#endif + *retval = kauth_getrgid(); return (0); } @@ -231,52 +225,70 @@ getgid(p, uap, retval) * correctly in a library function. 
 */
/* ARGSUSED */
-getegid(p, uap, retval)
-	struct proc *p;
-	void *uap;
-	register_t *retval;
+int
+getegid(struct proc *p, __unused struct getegid_args *uap, register_t *retval)
 {

-	*retval = p->p_ucred->cr_groups[0];
+	*retval = kauth_getgid();
 	return (0);
 }

-struct getgroups_args {
-	u_int	gidsetsize;
-	gid_t	*gidset;
-};
-getgroups(p, uap, retval)
-	struct proc *p;
-	register struct getgroups_args *uap;
-	register_t *retval;
+int
+getgroups(__unused struct proc *p, struct getgroups_args *uap, register_t *retval)
 {
-	register struct pcred *pc = p->p_cred;
-	register u_int ngrp;
+	register int ngrp;
 	int error;
+	kauth_cred_t cred;
+
+	/* grab reference while we muck around with the credential */
+	cred = kauth_cred_get_with_ref();

 	if ((ngrp = uap->gidsetsize) == 0) {
-		*retval = pc->pc_ucred->cr_ngroups;
+		*retval = cred->cr_ngroups;
+		kauth_cred_rele(cred);
 		return (0);
 	}
-	if (ngrp < pc->pc_ucred->cr_ngroups)
+	if (ngrp < cred->cr_ngroups) {
+		kauth_cred_rele(cred);
 		return (EINVAL);
-	pcred_readlock(p);
-	ngrp = pc->pc_ucred->cr_ngroups;
-	if (error = copyout((caddr_t)pc->pc_ucred->cr_groups,
-	    (caddr_t)uap->gidset, ngrp * sizeof(gid_t))) {
-		pcred_unlock(p);
+	}
+	ngrp = cred->cr_ngroups;
+	if ((error = copyout((caddr_t)cred->cr_groups,
+				uap->gidset,
+				ngrp * sizeof(gid_t)))) {
+		kauth_cred_rele(cred);
 		return (error);
 	}
-	pcred_unlock(p);
+	kauth_cred_rele(cred);
 	*retval = ngrp;
 	return (0);
 }

+/*
+ * Return the per-thread/per-process supplementary groups list.
+ */
+#warning XXX implement
+int
+getsgroups(__unused struct proc *p, __unused struct getsgroups_args *uap, __unused register_t *retval)
+{
+	/* XXX implement */
+	return(ENOTSUP);
+}
+
+/*
+ * Return the per-thread/per-process whiteout groups list.
+ */
+#warning XXX implement
+int
+getwgroups(__unused struct proc *p, __unused struct getwgroups_args *uap, __unused register_t *retval)
+{
+	/* XXX implement */
+	return(ENOTSUP);
+}
+
 /* ARGSUSED */
-setsid(p, uap, retval)
-	register struct proc *p;
-	void *uap;
-	register_t *retval;
+int
+setsid(struct proc *p, __unused struct setsid_args *uap, register_t *retval)
 {

 	if (p->p_pgid == p->p_pid || pgfind(p->p_pid) || p->p_flag & P_INVFORK) {
@@ -297,19 +309,14 @@ setsid(p, uap, retval)
 *	if a child
 *		pid must be in same session (EPERM)
 *		pid can't have done an exec (EACCES)
+ *	if pgid is negative, return EINVAL (as per SUS spec)
 *	if pgid != pid
 *		there must exist some pid in same session having pgid (EPERM)
 *		pid must not be session leader (EPERM)
 */
-struct setpgid_args {
-	int	pid;
-	int	pgid;
-};
 /* ARGSUSED */
-setpgid(curp, uap, retval)
-	struct proc *curp;
-	register struct setpgid_args *uap;
-	register_t *retval;
+int
+setpgid(struct proc *curp, register struct setpgid_args *uap, __unused register_t *retval)
 {
 	register struct proc *targp;		/* target process */
 	register struct pgrp *pgrp;		/* target pgrp */
@@ -325,6 +332,8 @@ setpgid(curp, uap, retval)
 		targp = curp;
 	if (SESS_LEADER(targp))
 		return (EPERM);
+	if (uap->pgid < 0)
+		return(EINVAL);
 	if (uap->pgid == 0)
 		uap->pgid = targp->p_pid;
 	else if (uap->pgid != targp->p_pid)
@@ -334,13 +343,8 @@ setpgid(curp, uap, retval)
 	return (enterpgrp(targp, uap->pgid, 0));
 }

-struct issetugid_args {
-	int dummy;
-};
-issetugid(p, uap, retval)
-	struct proc *p;
-	struct issetugid_args *uap;
-	register_t *retval;
+int
+issetugid(struct proc *p, __unused struct issetugid_args *uap, register_t *retval)
 {
 	/*
 	 * Note: OpenBSD sets a P_SUGIDEXEC flag set at execve() time,
@@ -355,23 +359,18 @@ issetugid(p, uap, retval)
 	return (0);
 }

-struct
setuid_args { - uid_t uid; -}; /* ARGSUSED */ -setuid(p, uap, retval) - struct proc *p; - struct setuid_args *uap; - register_t *retval; +int +setuid(struct proc *p, struct setuid_args *uap, __unused register_t *retval) { - register struct pcred *pc = p->p_cred; register uid_t uid; int error; + kauth_cred_t my_cred, my_new_cred; uid = uap->uid; AUDIT_ARG(uid, uid, 0, 0, 0); - if (uid != pc->p_ruid && - (error = suser(pc->pc_ucred, &p->p_acflag))) + if (uid != p->p_ucred->cr_ruid && + (error = suser(p->p_ucred, &p->p_acflag))) return (error); /* * Everything's okay, do it. @@ -381,239 +380,447 @@ setuid(p, uap, retval) /* prepare app access profile files */ prepare_profile_database(uap->uid); - pcred_writelock(p); - (void)chgproccnt(pc->p_ruid, -1); + (void)chgproccnt(kauth_getruid(), -1); (void)chgproccnt(uid, 1); - pc->pc_ucred = crcopy(pc->pc_ucred); - pc->pc_ucred->cr_uid = uid; - pc->p_ruid = uid; - pc->p_svuid = uid; - pcred_unlock(p); + + /* get current credential and take a reference while we muck with it */ + for (;;) { + my_cred = kauth_cred_proc_ref(p); + + /* + * set the credential with new info. If there is no change we get back + * the same credential we passed in. + */ + my_new_cred = kauth_cred_setuid(my_cred, uid); + if (my_cred != my_new_cred) { + proc_lock(p); + /* need to protect for a race where another thread also changed + * the credential after we took our reference. If p_ucred has + * changed then we should restart this again with the new cred. + */ + if (p->p_ucred != my_cred) { + proc_unlock(p); + kauth_cred_rele(my_cred); + kauth_cred_rele(my_new_cred); + /* try again */ + continue; + } + p->p_ucred = my_new_cred; + p->p_flag |= P_SUGID; + proc_unlock(p); + } + /* drop our extra reference */ + kauth_cred_rele(my_cred); + break; + } + set_security_token(p); - p->p_flag |= P_SUGID; return (0); } -struct seteuid_args { - uid_t euid; -}; /* ARGSUSED */ -seteuid(p, uap, retval) - struct proc *p; - struct seteuid_args *uap; - register_t *retval; +int +seteuid(struct proc *p, struct seteuid_args *uap, __unused register_t *retval) { - register struct pcred *pc = p->p_cred; register uid_t euid; int error; + kauth_cred_t my_cred, my_new_cred; euid = uap->euid; AUDIT_ARG(uid, 0, euid, 0, 0); - if (euid != pc->p_ruid && euid != pc->p_svuid && - (error = suser(pc->pc_ucred, &p->p_acflag))) + if (euid != p->p_ucred->cr_ruid && euid != p->p_ucred->cr_svuid && + (error = suser(p->p_ucred, &p->p_acflag))) return (error); /* * Everything's okay, do it. Copy credentials so other references do - * not see our changes. + * not see our changes. get current credential and take a reference + * while we muck with it */ - pcred_writelock(p); - pc->pc_ucred = crcopy(pc->pc_ucred); - pc->pc_ucred->cr_uid = euid; - pcred_unlock(p); + for (;;) { + my_cred = kauth_cred_proc_ref(p); + + /* + * set the credential with new info. If there is no change we get back + * the same credential we passed in. + */ + my_new_cred = kauth_cred_seteuid(p->p_ucred, euid); + + if (my_cred != my_new_cred) { + proc_lock(p); + /* need to protect for a race where another thread also changed + * the credential after we took our reference. If p_ucred has + * changed then we should restart this again with the new cred. 
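+			 *
+			 * In outline, every setter in this file uses the same
+			 * optimistic retry shape (sketch only; reference drops
+			 * elided):
+			 *
+			 *	for (;;) {
+			 *		my_cred = kauth_cred_proc_ref(p);
+			 *		my_new_cred = kauth_cred_set*(my_cred, ...);
+			 *		proc_lock(p);
+			 *		if (p->p_ucred == my_cred) {	// no race
+			 *			p->p_ucred = my_new_cred;
+			 *			proc_unlock(p);
+			 *			break;
+			 *		}
+			 *		proc_unlock(p);			// raced: retry
+			 *	}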
+ */ + if (p->p_ucred != my_cred) { + proc_unlock(p); + kauth_cred_rele(my_cred); + kauth_cred_rele(my_new_cred); + /* try again */ + continue; + } + p->p_ucred = my_new_cred; + p->p_flag |= P_SUGID; + proc_unlock(p); + } + /* drop our extra reference */ + kauth_cred_rele(my_cred); + break; + } + set_security_token(p); - p->p_flag |= P_SUGID; return (0); } -struct setgid_args { - gid_t gid; -}; /* ARGSUSED */ -setgid(p, uap, retval) - struct proc *p; - struct setgid_args *uap; - register_t *retval; +int +setgid(struct proc *p, struct setgid_args *uap, __unused register_t *retval) { - register struct pcred *pc = p->p_cred; register gid_t gid; int error; + kauth_cred_t my_cred, my_new_cred; gid = uap->gid; AUDIT_ARG(gid, gid, 0, 0, 0); - if (gid != pc->p_rgid && (error = suser(pc->pc_ucred, &p->p_acflag))) + if (gid != p->p_ucred->cr_rgid && (error = suser(p->p_ucred, &p->p_acflag))) return (error); - pcred_writelock(p); - pc->pc_ucred = crcopy(pc->pc_ucred); - pc->pc_ucred->cr_groups[0] = gid; - pc->p_rgid = gid; - pc->p_svgid = gid; /* ??? */ - pcred_unlock(p); + + /* get current credential and take a reference while we muck with it */ + for (;;) { + my_cred = kauth_cred_proc_ref(p); + + /* + * set the credential with new info. If there is no change we get back + * the same credential we passed in. + */ + my_new_cred = kauth_cred_setgid(p->p_ucred, gid); + if (my_cred != my_new_cred) { + proc_lock(p); + /* need to protect for a race where another thread also changed + * the credential after we took our reference. If p_ucred has + * changed then we should restart this again with the new cred. + */ + if (p->p_ucred != my_cred) { + proc_unlock(p); + kauth_cred_rele(my_cred); + kauth_cred_rele(my_new_cred); + /* try again */ + continue; + } + p->p_ucred = my_new_cred; + p->p_flag |= P_SUGID; + proc_unlock(p); + } + /* drop our extra reference */ + kauth_cred_rele(my_cred); + break; + } + set_security_token(p); - p->p_flag |= P_SUGID; return (0); } -struct setegid_args { - gid_t egid; -}; /* ARGSUSED */ -setegid(p, uap, retval) - struct proc *p; - struct setegid_args *uap; - register_t *retval; +int +setegid(struct proc *p, struct setegid_args *uap, __unused register_t *retval) { - register struct pcred *pc = p->p_cred; register gid_t egid; int error; + kauth_cred_t my_cred, my_new_cred; egid = uap->egid; AUDIT_ARG(gid, 0, egid, 0, 0); - if (egid != pc->p_rgid && egid != pc->p_svgid && - (error = suser(pc->pc_ucred, &p->p_acflag))) + if (egid != p->p_ucred->cr_rgid && egid != p->p_ucred->cr_svgid && + (error = suser(p->p_ucred, &p->p_acflag))) return (error); - pcred_writelock(p); - pc->pc_ucred = crcopy(pc->pc_ucred); - pc->pc_ucred->cr_groups[0] = egid; - pcred_unlock(p); + + /* get current credential and take a reference while we muck with it */ + for (;;) { + my_cred = kauth_cred_proc_ref(p); + + /* + * set the credential with new info. If there is no change we get back + * the same credential we passed in. + */ + my_new_cred = kauth_cred_setegid(p->p_ucred, egid); + if (my_cred != my_new_cred) { + proc_lock(p); + /* need to protect for a race where another thread also changed + * the credential after we took our reference. If p_ucred has + * changed then we should restart this again with the new cred. 
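Every set*id path in this change repeats the same optimistic update: take an extra reference on the process credential, derive a new credential from it, and install the new one only if p_ucred has not been swapped underneath us in the meantime, retrying otherwise. A condensed sketch of that pattern, using only the kauth calls visible in this diff (the function name is hypothetical, and kauth_cred_setuid stands in for whichever setter applies):

	/* Sketch only: the compare-and-retry credential update used above. */
	static void
	update_proc_cred(struct proc *p, uid_t uid)
	{
		kauth_cred_t my_cred, my_new_cred;

		for (;;) {
			my_cred = kauth_cred_proc_ref(p);		/* extra ref while we work */
			my_new_cred = kauth_cred_setuid(my_cred, uid);	/* may return my_cred unchanged */
			if (my_cred != my_new_cred) {
				proc_lock(p);
				if (p->p_ucred != my_cred) {
					/* another thread won the race; retry */
					proc_unlock(p);
					kauth_cred_rele(my_cred);
					kauth_cred_rele(my_new_cred);
					continue;
				}
				p->p_ucred = my_new_cred;
				p->p_flag |= P_SUGID;
				proc_unlock(p);
			}
			kauth_cred_rele(my_cred);			/* drop the extra reference */
			break;
		}
	}
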
+ */ + if (p->p_ucred != my_cred) { + proc_unlock(p); + kauth_cred_rele(my_cred); + kauth_cred_rele(my_new_cred); + /* try again */ + continue; + } + p->p_ucred = my_new_cred; + p->p_flag |= P_SUGID; + proc_unlock(p); + } + /* drop our extra reference */ + kauth_cred_rele(my_cred); + break; + } + set_security_token(p); - p->p_flag |= P_SUGID; return (0); } -struct setgroups_args{ - u_int gidsetsize; - gid_t *gidset; -}; +/* + * Set the per-thread override identity. The first parameter can be the + * current real UID, KAUTH_UID_NONE, or, if the caller is privileged, it + * can be any UID. If it is KAUTH_UID_NONE, then as a special case, this + * means "revert to the per process credential"; otherwise, if permitted, + * it changes the effective, real, and saved UIDs and GIDs for the current + * thread to the requested UID and single GID, and clears all other GIDs. + */ +int +settid(struct proc *p, struct settid_args *uap, __unused register_t *retval) +{ + kauth_cred_t uc; + struct uthread *uthread = get_bsdthread_info(current_thread()); + register uid_t uid; + register gid_t gid; + + uid = uap->uid; + gid = uap->gid; + AUDIT_ARG(uid, uid, gid, gid, 0); + + if (suser(p->p_ucred, &p->p_acflag) != 0) { + return (EPERM); + } + + if (uid == KAUTH_UID_NONE) { + + /* must already be assuming another identity in order to revert back */ + if ((uthread->uu_flag & UT_SETUID) == 0) + return (EPERM); + + /* revert to delayed binding of process credential */ + uc = kauth_cred_proc_ref(p); + kauth_cred_rele(uthread->uu_ucred); + uthread->uu_ucred = uc; + uthread->uu_flag &= ~UT_SETUID; + } else { + kauth_cred_t my_cred, my_new_cred; + + /* cannot already be assuming another identity */ + if ((uthread->uu_flag & UT_SETUID) != 0) { + return (EPERM); + } + + /* + * get a new credential instance from the old one if it changes; otherwise + * kauth_cred_setuidgid returns the same credential. We take an extra + * reference on the current credential while we muck with it here. + */ + kauth_cred_ref(uthread->uu_ucred); + my_cred = uthread->uu_ucred; + my_new_cred = kauth_cred_setuidgid(my_cred, uid, gid); + if (my_cred != my_new_cred) + uthread->uu_ucred = my_new_cred; + uthread->uu_flag |= UT_SETUID; + + /* drop our extra reference */ + kauth_cred_rele(my_cred); + } + /* + * XXX should potentially set per thread security token (there is + * XXX none). + * XXX it is unclear whether P_SUGID should be set at this point; + * XXX in theory, it is being deprecated. + */ + return (0); +} + +/* + * Set the per-thread override identity. Use this system call for a thread to + * assume the identity of another process or to revert to the normal identity + * of the current process. + * When the "assume" argument is non-zero the current thread will assume the + * identity of the process represented by the pid argument. + * When the assume argument is zero we revert to our normal identity. + */ +int +settid_with_pid(struct proc *p, struct settid_with_pid_args *uap, __unused register_t *retval) +{ + proc_t target_proc; + struct uthread *uthread = get_bsdthread_info(current_thread()); + kauth_cred_t my_cred, my_target_cred, my_new_cred; + + AUDIT_ARG(pid, uap->pid); + AUDIT_ARG(value, uap->assume); + + if (suser(p->p_ucred, &p->p_acflag) != 0) { + return (EPERM); + } + + /* + * XXX should potentially set per thread security token (there is + * XXX none). + * XXX it is unclear whether P_SUGID should be set at this point; + * XXX in theory, it is being deprecated. 
+ */ + + /* + * assume argument tells us to assume the identity of the process with the + * id passed in the pid argument. + */ + if (uap->assume != 0) { + /* can't do this if we have already assumed an identity */ + if ((uthread->uu_flag & UT_SETUID) != 0) + return (EPERM); + + target_proc = pfind(uap->pid); + /* can't assume the identity of the kernel process */ + if (target_proc == NULL || target_proc == kernproc) { + return (ESRCH); + } + + /* + * take a reference on the credential used in our target process then use + * it as the identity for our current thread. + */ + kauth_cred_ref(uthread->uu_ucred); + my_cred = uthread->uu_ucred; + my_target_cred = kauth_cred_proc_ref(target_proc); + my_new_cred = kauth_cred_setuidgid(my_cred, my_target_cred->cr_uid, my_target_cred->cr_gid); + if (my_cred != my_new_cred) + uthread->uu_ucred = my_new_cred; + + uthread->uu_flag |= UT_SETUID; + + /* drop our extra references */ + kauth_cred_rele(my_cred); + kauth_cred_rele(my_target_cred); + + return (0); + } + + /* we are reverting back to normal mode of operation where delayed binding + * of the process credential sets the credential in the thread (uu_ucred) + */ + if ((uthread->uu_flag & UT_SETUID) == 0) + return (EPERM); + + /* revert to delayed binding of process credential */ + my_new_cred = kauth_cred_proc_ref(p); + kauth_cred_rele(uthread->uu_ucred); + uthread->uu_ucred = my_new_cred; + uthread->uu_flag &= ~UT_SETUID; + + return (0); +} /* ARGSUSED */ -setgroups(p, uap, retval) - struct proc *p; - struct setgroups_args *uap; - register_t *retval; +static int +setgroups1(struct proc *p, u_int gidsetsize, user_addr_t gidset, uid_t gmuid, __unused register_t *retval) { - register struct pcred *pc = p->p_cred; - struct ucred *new, *old; register u_int ngrp; - int error; + gid_t newgroups[NGROUPS] = { 0 }; + int error; + kauth_cred_t my_cred, my_new_cred; - if (error = suser(pc->pc_ucred, &p->p_acflag)) + if ((error = suser(p->p_ucred, &p->p_acflag))) return (error); - ngrp = uap->gidsetsize; + ngrp = gidsetsize; if (ngrp > NGROUPS) return (EINVAL); - new = crget(); - + if ( ngrp < 1 ) { ngrp = 1; } else { - error = copyin((caddr_t)uap->gidset, - (caddr_t)new->cr_groups, ngrp * sizeof(gid_t)); + error = copyin(gidset, + (caddr_t)newgroups, ngrp * sizeof(gid_t)); if (error) { - crfree(new); return (error); } } - new->cr_ngroups = ngrp; - AUDIT_ARG(groupset, new->cr_groups, ngrp); - pcred_writelock(p); - old = pc->pc_ucred; - new->cr_uid = old->cr_uid; - pc->pc_ucred = new; - pcred_unlock(p); + + /* get current credential and take a reference while we muck with it */ + for (;;) { + my_cred = kauth_cred_proc_ref(p); + + /* + * set the credential with new info. If there is no change we get back + * the same credential we passed in. + */ + my_new_cred = kauth_cred_setgroups(p->p_ucred, &newgroups[0], ngrp, gmuid); + if (my_cred != my_new_cred) { + proc_lock(p); + /* need to protect for a race where another thread also changed + * the credential after we took our reference. If p_ucred has + * changed then we should restart this again with the new cred. 
+ */ + if (p->p_ucred != my_cred) { + proc_unlock(p); + kauth_cred_rele(my_cred); + kauth_cred_rele(my_new_cred); + /* try again */ + continue; + } + p->p_ucred = my_new_cred; + p->p_flag |= P_SUGID; + proc_unlock(p); + } + /* drop our extra reference */ + kauth_cred_rele(my_cred); + break; + } + + AUDIT_ARG(groupset, p->p_ucred->cr_groups, ngrp); set_security_token(p); - p->p_flag |= P_SUGID; - if (old != NOCRED) - crfree(old); + return (0); } -#if COMPAT_43 -struct osetreuid_args{ - int ruid; - int euid; -}; -/* ARGSUSED */ -osetreuid(p, uap, retval) - register struct proc *p; - struct osetreuid_args *uap; - register_t *retval; +int +initgroups(struct proc *p, struct initgroups_args *uap, __unused register_t *retval) { - struct seteuid_args seuidargs; - struct setuid_args suidargs; + return(setgroups1(p, uap->gidsetsize, uap->gidset, uap->gmuid, retval)); +} - /* - * There are five cases, and we attempt to emulate them in - * the following fashion: - * -1, -1: return 0. This is correct emulation. - * -1, N: call seteuid(N). This is correct emulation. - * N, -1: if we called setuid(N), our euid would be changed - * to N as well. the theory is that we don't want to - * revoke root access yet, so we call seteuid(N) - * instead. This is incorrect emulation, but often - * suffices enough for binary compatibility. - * N, N: call setuid(N). This is correct emulation. - * N, M: call setuid(N). This is close to correct emulation. - */ - if (uap->ruid == (uid_t)-1) { - if (uap->euid == (uid_t)-1) - return (0); /* -1, -1 */ - seuidargs.euid = uap->euid; /* -1, N */ - return (seteuid(p, &seuidargs, retval)); - } - if (uap->euid == (uid_t)-1) { - seuidargs.euid = uap->ruid; /* N, -1 */ - return (seteuid(p, &seuidargs, retval)); - } - suidargs.uid = uap->ruid; /* N, N and N, M */ - return (setuid(p, &suidargs, retval)); +int +setgroups(struct proc *p, struct setgroups_args *uap, __unused register_t *retval) +{ + return(setgroups1(p, uap->gidsetsize, uap->gidset, KAUTH_UID_NONE, retval)); } -struct osetregid_args { - int rgid; - int egid; -}; -/* ARGSUSED */ -osetregid(p, uap, retval) - register struct proc *p; - struct osetregid_args *uap; - register_t *retval; +/* + * Set the per-thread/per-process supplementary groups list. + */ +#warning XXX implement +int +setsgroups(__unused struct proc *p, __unused struct setsgroups_args *uap, __unused register_t *retval) { - struct setegid_args segidargs; - struct setgid_args sgidargs; + return(ENOTSUP); +} - /* - * There are five cases, described above in osetreuid() - */ - if (uap->rgid == (gid_t)-1) { - if (uap->egid == (gid_t)-1) - return (0); /* -1, -1 */ - segidargs.egid = uap->egid; /* -1, N */ - return (setegid(p, &segidargs, retval)); - } - if (uap->egid == (gid_t)-1) { - segidargs.egid = uap->rgid; /* N, -1 */ - return (setegid(p, &segidargs, retval)); - } - sgidargs.gid = uap->rgid; /* N, N and N, M */ - return (setgid(p, &sgidargs, retval)); +/* + * Set the per-thread/per-process whiteout groups list. + */ +#warning XXX implement +int +setwgroups(__unused struct proc *p, __unused struct setwgroups_args *uap, __unused register_t *retval) +{ + return(ENOTSUP); } -#endif /* COMPAT_43 */ /* * Check if gid is a member of the group set. 
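groupmember() survives below only as a compatibility shim over the kauth interface; a caller migrating to kauth_cred_ismember_gid() directly would look roughly like this (a sketch reusing the exact call from the new groupmember() body):

	int is_member = 0;

	/* a zero return means the membership lookup itself succeeded */
	if (kauth_cred_ismember_gid(cred, gid, &is_member) == 0 && is_member) {
		/* gid is in cred's group set */
	}
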
+ * + * XXX This interface is going away */ -groupmember(gid, cred) - gid_t gid; - register struct ucred *cred; +int +groupmember(gid_t gid, kauth_cred_t cred) { - register gid_t *gp; - gid_t *egp; + int is_member; - egp = &(cred->cr_groups[cred->cr_ngroups]); - for (gp = cred->cr_groups; gp < egp; gp++) - if (*gp == gid) - return (1); + if (kauth_cred_ismember_gid(cred, gid, &is_member) == 0 && is_member) + return (1); return (0); } @@ -622,16 +829,17 @@ groupmember(gid, cred) * privilege; if so, and we have accounting info, set the flag * indicating use of super-powers. * Returns 0 or error. + * + * XXX This interface is going away */ -suser(cred, acflag) - struct ucred *cred; - u_short *acflag; +int +suser(kauth_cred_t cred, u_short *acflag) { #if DIAGNOSTIC if (cred == NOCRED || cred == FSCRED) panic("suser"); #endif - if (cred->cr_uid == 0) { + if (kauth_cred_getuid(cred) == 0) { if (acflag) *acflag |= ASU; return (0); @@ -659,148 +867,40 @@ is_suser1(void) return (0); return (suser(p->p_ucred, &p->p_acflag) == 0 || - p->p_cred->p_ruid == 0 || p->p_cred->p_svuid == 0); -} - -/* - * Allocate a zeroed cred structure. - */ -struct ucred * -crget() -{ - register struct ucred *cr; - - MALLOC_ZONE(cr, struct ucred *, sizeof(*cr), M_CRED, M_WAITOK); - bzero((caddr_t)cr, sizeof(*cr)); - cr->cr_ref = 1; - return (cr); -} - -/* - * Free a cred structure. - * Throws away space when ref count gets to 0. - */ -void -crfree(cr) - struct ucred *cr; -{ -#if DIAGNOSTIC - if (cr == NOCRED || cr == FSCRED) - panic("crfree"); -#endif - if (--cr->cr_ref == 0) - FREE_ZONE((caddr_t)cr, sizeof *cr, M_CRED); -} - -/* - * Copy cred structure to a new one and free the old one. - */ -struct ucred * -crcopy(cr) - struct ucred *cr; -{ - struct ucred *newcr; - -#if DIAGNOSTIC - if (cr == NOCRED || cr == FSCRED) - panic("crcopy"); -#endif - if (cr->cr_ref == 1) - return (cr); - newcr = crget(); - *newcr = *cr; - crfree(cr); - newcr->cr_ref = 1; - return (newcr); -} - -/* - * Dup cred struct to a new held one. - */ -struct ucred * -crdup(cr) - struct ucred *cr; -{ - struct ucred *newcr; - -#if DIAGNOSTIC - if (cr == NOCRED || cr == FSCRED) - panic("crdup"); -#endif - newcr = crget(); - *newcr = *cr; - newcr->cr_ref = 1; - return (newcr); -} - -/* - * compare two cred structs - */ -int -crcmp(cr1, cr2) - struct ucred *cr1; - struct ucred *cr2; -{ - int i; - - if (cr1 == cr2) - return 0; - if (cr1 == NOCRED || cr1 == FSCRED || - cr2 == NOCRED || cr2 == FSCRED) - return 1; - if (cr1->cr_uid != cr2->cr_uid) - return 1; - if (cr1->cr_ngroups != cr2->cr_ngroups) - return 1; - // XXX assumes groups will always be listed in some order - for (i=0; i < cr1->cr_ngroups; i++) - if (cr1->cr_groups[i] != cr2->cr_groups[i]) - return 1; - return (0); + p->p_ucred->cr_ruid == 0 || p->p_ucred->cr_svuid == 0); } /* * Get login name, if available. */ -struct getlogin_args { - char *namebuf; - u_int namelen; -}; /* ARGSUSED */ -getlogin(p, uap, retval) - struct proc *p; - struct getlogin_args *uap; - register_t *retval; +int +getlogin(struct proc *p, struct getlogin_args *uap, __unused register_t *retval) { if (uap->namelen > sizeof (p->p_pgrp->pg_session->s_login)) uap->namelen = sizeof (p->p_pgrp->pg_session->s_login); return (copyout((caddr_t) p->p_pgrp->pg_session->s_login, - (caddr_t)uap->namebuf, uap->namelen)); + uap->namebuf, uap->namelen)); } /* * Set login name. 
*/ -struct setlogin_args { - char *namebuf; -}; /* ARGSUSED */ -setlogin(p, uap, retval) - struct proc *p; - struct setlogin_args *uap; - register_t *retval; +int +setlogin(struct proc *p, struct setlogin_args *uap, __unused register_t *retval) { int error; int dummy=0; - if (error = suser(p->p_ucred, &p->p_acflag)) + if ((error = suser(p->p_ucred, &p->p_acflag))) return (error); - error = copyinstr((caddr_t) uap->namebuf, + error = copyinstr(uap->namebuf, (caddr_t) p->p_pgrp->pg_session->s_login, sizeof (p->p_pgrp->pg_session->s_login) - 1, (size_t *)&dummy); - if(!error) + if (!error) AUDIT_ARG(text, p->p_pgrp->pg_session->s_login); else if (error == ENAMETOOLONG) error = EINVAL; @@ -809,14 +909,37 @@ setlogin(p, uap, retval) /* Set the security token of the task with current euid and egid */ -kern_return_t +/* + * XXX This needs to change to give the task a reference and/or an opaque + * XXX identifier. + */ +int set_security_token(struct proc * p) { security_token_t sec_token; audit_token_t audit_token; - sec_token.val[0] = p->p_ucred->cr_uid; - sec_token.val[1] = p->p_ucred->cr_gid; + /* + * Don't allow a vfork child to override the parent's token settings + * (since they share a task). Instead, the child will just have to + * suffer along using the parent's token until the exec(). It's all + * undefined behavior anyway, right? + */ + if (p->task == current_task()) { + uthread_t uthread; + uthread = (uthread_t)get_bsdthread_info(current_thread()); + if (uthread->uu_flag & UT_VFORK) + return (1); + } + + /* XXX mach_init doesn't have a p_ucred when it calls this function */ + if (p->p_ucred != NOCRED && p->p_ucred != FSCRED) { + sec_token.val[0] = kauth_cred_getuid(p->p_ucred); + sec_token.val[1] = p->p_ucred->cr_gid; + } else { + sec_token.val[0] = 0; + sec_token.val[1] = 0; + } /* * The current layout of the Mach audit token explicitly @@ -827,36 +950,36 @@ set_security_token(struct proc * p) * the user of the trailer from future representation * changes. */ - audit_token.val[0] = p->p_au->ai_auid; + audit_token.val[0] = p->p_ucred->cr_au.ai_auid; audit_token.val[1] = p->p_ucred->cr_uid; - audit_token.val[2] = p->p_ucred->cr_gid; - audit_token.val[3] = p->p_cred->p_ruid; - audit_token.val[4] = p->p_cred->p_rgid; + audit_token.val[2] = p->p_ucred->cr_gid; + audit_token.val[3] = p->p_ucred->cr_ruid; + audit_token.val[4] = p->p_ucred->cr_rgid; audit_token.val[5] = p->p_pid; - audit_token.val[6] = p->p_au->ai_asid; - audit_token.val[7] = p->p_au->ai_termid.port; + audit_token.val[6] = p->p_ucred->cr_au.ai_asid; + audit_token.val[7] = p->p_ucred->cr_au.ai_termid.port; - return host_security_set_task_token(host_security_self(), + return (host_security_set_task_token(host_security_self(), p->task, sec_token, audit_token, (sec_token.val[0]) ? HOST_PRIV_NULL : - host_priv_self()); + host_priv_self()) != KERN_SUCCESS); } /* - * Fill in a struct xucred based on a struct ucred. + * Fill in a struct xucred based on a kauth_cred_t. */ __private_extern__ void -cru2x(struct ucred *cr, struct xucred *xcr) +cru2x(kauth_cred_t cr, struct xucred *xcr) { bzero(xcr, sizeof(*xcr)); xcr->cr_version = XUCRED_VERSION; - xcr->cr_uid = cr->cr_uid; + xcr->cr_uid = kauth_cred_getuid(cr); xcr->cr_ngroups = cr->cr_ngroups; bcopy(cr->cr_groups, xcr->cr_groups, sizeof(xcr->cr_groups)); } diff --git a/bsd/kern/kern_resource.c b/bsd/kern/kern_resource.c index 99b821e5e..6ce5a3874 100644 --- a/bsd/kern/kern_resource.c +++ b/bsd/kern/kern_resource.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004 Apple Computer, Inc. 
All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -64,25 +64,29 @@ #include <sys/systm.h> #include <sys/sysctl.h> #include <sys/kernel.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/resourcevar.h> #include <sys/malloc.h> -#include <sys/proc.h> -#include <sys/mount.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> +#include <machine/spl.h> + +#include <sys/mount_internal.h> +#include <sys/sysproto.h> #include <bsm/audit_kernel.h> -#include <machine/spl.h> #include <machine/vmparam.h> #include <mach/mach_types.h> #include <mach/time_value.h> #include <mach/task_info.h> +#include <mach/vm_map.h> #include <vm/vm_map.h> -int donice __P((struct proc *curp, struct proc *chgp, int n)); -int dosetrlimit __P((struct proc *p, u_int which, struct rlimit *limp)); +int donice(struct proc *curp, struct proc *chgp, int n); +int dosetrlimit(struct proc *p, u_int which, struct rlimit *limp); rlim_t maxdmap = MAXDSIZ; /* XXX */ rlim_t maxsmap = MAXSSIZ; /* XXX */ @@ -106,19 +110,15 @@ SYSCTL_INT( _kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, /* * Resource controls and accounting. */ -struct getpriority_args { - int which; - int who; -}; int -getpriority(curp, uap, retval) - struct proc *curp; - register struct getpriority_args *uap; - register_t *retval; +getpriority(struct proc *curp, struct getpriority_args *uap, register_t *retval) { register struct proc *p; register int low = PRIO_MAX + 1; + if (uap->who < 0) + return (EINVAL); + switch (uap->which) { case PRIO_PROCESS: @@ -147,9 +147,9 @@ getpriority(curp, uap, retval) case PRIO_USER: if (uap->who == 0) - uap->who = curp->p_ucred->cr_uid; + uap->who = kauth_cred_getuid(kauth_cred_get()); for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) - if (p->p_ucred->cr_uid == uap->who && + if (kauth_cred_getuid(p->p_ucred) == uap->who && p->p_nice < low) low = p->p_nice; break; @@ -163,17 +163,9 @@ getpriority(curp, uap, retval) return (0); } -struct setpriority_args { - int which; - int who; - int prio; -}; /* ARGSUSED */ int -setpriority(curp, uap, retval) - struct proc *curp; - register struct setpriority_args *uap; - register_t *retval; +setpriority(struct proc *curp, struct setpriority_args *uap, __unused register_t *retval) { register struct proc *p; int found = 0, error = 0; @@ -182,6 +174,9 @@ setpriority(curp, uap, retval) AUDIT_ARG(owner, uap->who, 0); AUDIT_ARG(value, uap->prio); + if (uap->who < 0) + return (EINVAL); + switch (uap->which) { case PRIO_PROCESS: @@ -212,9 +207,9 @@ setpriority(curp, uap, retval) case PRIO_USER: if (uap->who == 0) - uap->who = curp->p_ucred->cr_uid; + uap->who = kauth_cred_getuid(kauth_cred_get()); for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) - if (p->p_ucred->cr_uid == uap->who) { + if (kauth_cred_getuid(p->p_ucred) == uap->who) { error = donice(curp, p, uap->prio); found++; } @@ -233,89 +228,33 @@ donice(curp, chgp, n) register struct proc *curp, *chgp; register int n; { - register struct pcred *pcred = curp->p_cred; + kauth_cred_t ucred = curp->p_ucred; - if (pcred->pc_ucred->cr_uid && pcred->p_ruid && - pcred->pc_ucred->cr_uid != chgp->p_ucred->cr_uid && - pcred->p_ruid != chgp->p_ucred->cr_uid) + if (suser(ucred, NULL) && ucred->cr_ruid && + kauth_cred_getuid(ucred) != kauth_cred_getuid(chgp->p_ucred) && + ucred->cr_ruid != kauth_cred_getuid(chgp->p_ucred)) return (EPERM); if (n > PRIO_MAX) n = PRIO_MAX; if (n < PRIO_MIN) n = PRIO_MIN; - if (n < chgp->p_nice && 
suser(pcred->pc_ucred, &curp->p_acflag)) + if (n < chgp->p_nice && suser(ucred, &curp->p_acflag)) return (EACCES); chgp->p_nice = n; (void)resetpriority(chgp); return (0); } -#if COMPAT_43 -struct osetrlimit_args { - u_int which; - struct ogetrlimit * rlp; -}; -/* ARGSUSED */ -int -osetrlimit(p, uap, retval) - struct proc *p; - struct osetrlimit_args *uap; - register_t *retval; -{ - struct orlimit olim; - struct rlimit lim; - int error; - - if (error = copyin((caddr_t)uap->rlp, (caddr_t)&olim, - sizeof (struct orlimit))) - return (error); - lim.rlim_cur = olim.rlim_cur; - lim.rlim_max = olim.rlim_max; - return (dosetrlimit(p, uap->which, &lim)); -} -struct ogetrlimit_args { - u_int which; - struct ogetrlimit * rlp; -}; /* ARGSUSED */ int -ogetrlimit(p, uap, retval) - struct proc *p; - struct ogetrlimit_args *uap; - register_t *retval; -{ - struct orlimit olim; - - if (uap->which >= RLIM_NLIMITS) - return (EINVAL); - olim.rlim_cur = p->p_rlimit[uap->which].rlim_cur; - if (olim.rlim_cur == -1) - olim.rlim_cur = 0x7fffffff; - olim.rlim_max = p->p_rlimit[uap->which].rlim_max; - if (olim.rlim_max == -1) - olim.rlim_max = 0x7fffffff; - return (copyout((caddr_t)&olim, (caddr_t)uap->rlp, - sizeof(olim))); -} -#endif /* COMPAT_43 */ - -struct setrlimit_args { - u_int which; - struct rlimit * rlp; -}; -/* ARGSUSED */ -int -setrlimit(p, uap, retval) - struct proc *p; - register struct setrlimit_args *uap; - register_t *retval; +setrlimit(struct proc *p, register struct setrlimit_args *uap, __unused register_t *retval) { struct rlimit alim; int error; - if (error = copyin((caddr_t)uap->rlp, (caddr_t)&alim, - sizeof (struct rlimit))) + if ((error = copyin(uap->rlp, (caddr_t)&alim, + sizeof (struct rlimit)))) return (error); return (dosetrlimit(p, uap->which, &alim)); } @@ -327,7 +266,6 @@ dosetrlimit(p, which, limp) struct rlimit *limp; { register struct rlimit *alimp; - extern rlim_t maxdmap, maxsmap; int error; if (which >= RLIM_NLIMITS) @@ -335,7 +273,7 @@ dosetrlimit(p, which, limp) alimp = &p->p_rlimit[which]; if (limp->rlim_cur > alimp->rlim_max || limp->rlim_max > alimp->rlim_max) - if (error = suser(p->p_ucred, &p->p_acflag)) + if ((error = suser(kauth_cred_get(), &p->p_acflag))) return (error); if (limp->rlim_cur > limp->rlim_max) limp->rlim_cur = limp->rlim_max; @@ -366,9 +304,8 @@ dosetrlimit(p, which, limp) * up make more accessible, if going down make inaccessible. 
*/ if (limp->rlim_cur != alimp->rlim_cur) { - vm_offset_t addr; - vm_size_t size; - vm_prot_t prot; + user_addr_t addr; + user_size_t size; if (limp->rlim_cur > alimp->rlim_cur) { /* grow stack */ @@ -377,13 +314,14 @@ dosetrlimit(p, which, limp) #if STACK_GROWTH_UP /* go to top of current stack */ - addr = trunc_page((unsigned int)(p->user_stack + alimp->rlim_cur)); + addr = p->user_stack + alimp->rlim_cur; #else STACK_GROWTH_UP - addr = trunc_page_32((unsigned int)(p->user_stack - alimp->rlim_cur)); + addr = p->user_stack - alimp->rlim_cur; addr -= size; #endif /* STACK_GROWTH_UP */ - if (vm_allocate(current_map(), - &addr, size, FALSE) != KERN_SUCCESS) + if (mach_vm_allocate(current_map(), + &addr, size, + VM_FLAGS_FIXED) != KERN_SUCCESS) return(EINVAL); } else { /* shrink stack */ @@ -434,22 +372,14 @@ dosetrlimit(p, which, limp) return (0); } -struct getrlimit_args { - u_int which; - struct rlimit * rlp; -}; /* ARGSUSED */ int -getrlimit(p, uap, retval) - struct proc *p; - register struct getrlimit_args *uap; - register_t *retval; +getrlimit(struct proc *p, register struct getrlimit_args *uap, __unused register_t *retval) { - if (uap->which >= RLIM_NLIMITS) return (EINVAL); return (copyout((caddr_t)&p->p_rlimit[uap->which], - (caddr_t)uap->rlp, sizeof (struct rlimit))); + uap->rlp, sizeof (struct rlimit))); } /* @@ -500,24 +430,22 @@ calcru(p, up, sp, ip) } } -struct getrusage_args { - int who; - struct rusage * rusage; -}; +__private_extern__ void munge_rusage(struct rusage *a_rusage_p, struct user_rusage *a_user_rusage_p); + /* ARGSUSED */ int -getrusage(p, uap, retval) - register struct proc *p; - register struct getrusage_args *uap; - register_t *retval; +getrusage(register struct proc *p, register struct getrusage_args *uap, __unused register_t *retval) { struct rusage *rup, rubuf; + struct user_rusage rubuf64; + size_t retsize = sizeof(rubuf); /* default: 32 bits */ + caddr_t retbuf = (caddr_t)&rubuf; /* default: 32 bits */ switch (uap->who) { - case RUSAGE_SELF: rup = &p->p_stats->p_ru; calcru(p, &rup->ru_utime, &rup->ru_stime, NULL); + // LP64todo: proc struct should have 64 bit version of struct rubuf = *rup; break; @@ -529,8 +457,12 @@ getrusage(p, uap, retval) default: return (EINVAL); } - return (copyout((caddr_t)&rubuf, (caddr_t)uap->rusage, - sizeof (struct rusage))); + if (IS_64BIT_PROCESS(p)) { + retsize = sizeof(rubuf64); + retbuf = (caddr_t)&rubuf64; + munge_rusage(&rubuf, &rubuf64); + } + return (copyout(retbuf, uap->rusage, retsize)); } void @@ -562,6 +494,8 @@ limcopy(lim) MALLOC_ZONE(copy, struct plimit *, sizeof(struct plimit), M_SUBPROC, M_WAITOK); + if (copy == NULL) + panic("limcopy"); bcopy(lim->pl_rlimit, copy->pl_rlimit, sizeof(struct rlimit) * RLIM_NLIMITS); copy->p_lflags = 0; diff --git a/bsd/kern/kern_shutdown.c b/bsd/kern/kern_shutdown.c index f8568f9be..7c8eb53b7 100644 --- a/bsd/kern/kern_shutdown.c +++ b/bsd/kern/kern_shutdown.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -30,13 +30,12 @@ #include <sys/systm.h> #include <sys/kernel.h> #include <sys/vm.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/user.h> -#include <sys/buf.h> #include <sys/reboot.h> #include <sys/conf.h> -#include <sys/vnode.h> -#include <sys/file.h> +#include <sys/vnode_internal.h> +#include <sys/file_internal.h> #include <sys/clist.h> #include <sys/callout.h> #include <sys/mbuf.h> @@ -55,7 +54,9 @@ #include <vm/vm_kern.h> #include <mach/vm_param.h> #include <sys/filedesc.h> +#include <mach/host_priv.h> #include <mach/host_reboot.h> + #include <bsm/audit_kernel.h> int waittime = -1; @@ -70,6 +71,7 @@ boot(paniced, howto, command) struct proc *p = current_proc(); /* XXX */ int hostboot_option=0; int funnel_state; + struct proc *launchd_proc; static void proc_shutdown(); extern void md_prepare_for_shutdown(int paniced, int howto, char * command); @@ -96,24 +98,28 @@ boot(paniced, howto, command) sync(p, (void *)NULL, (int *)NULL); - /* Release vnodes from the VM object cache */ - ubc_unmountall(); + /* + * Now that all processes have been terminated and the system is sync'ed up, + * suspend launchd + */ - IOSleep( 1 * 1000 ); + launchd_proc = pfind(1); + if (launchd_proc && p != launchd_proc) { + task_suspend(launchd_proc->task); + } /* * Unmount filesystems */ - if (panicstr == 0) - vfs_unmountall(); + vfs_unmountall(); /* Wait for the buffer cache to clean remaining dirty buffers */ - for (iter = 0; iter < 20; iter++) { + for (iter = 0; iter < 100; iter++) { nbusy = count_busy_buffers(); if (nbusy == 0) break; printf("%d ", nbusy); - IOSleep( 4 * nbusy ); + IOSleep( 1 * nbusy ); } if (nbusy) printf("giving up\n"); @@ -135,6 +141,16 @@ boot(paniced, howto, command) if (paniced == RB_PANIC) hostboot_option = HOST_REBOOT_HALT; + /* + * if we're going to power down due to a halt, + * give the disks a chance to finish getting + * the track cache flushed to the media... + * unfortunately, some of our earlier drives + * don't properly hold off on returning + * from the track flush command (issued by + * the unmounts) until it's actually fully + * committed. + */ if (hostboot_option == HOST_REBOOT_HALT) IOSleep( 1 * 1000 ); @@ -161,6 +177,7 @@ proc_shutdown() struct proc *p, *self; struct vnode **cdirp, **rdirp, *vp; int restart, i, TERM_catch; + int delayterm = 0; /* * Kill as many procs as we can. (Except ourself...) 
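The proc_shutdown() hunks that follow rework the kill sequence into multiple passes; as a reading aid, the post-patch flow condenses to the outline below (summary only, naming just the calls visible in these hunks):

	/*
	 * 1. psignal(pfind(1), SIGTERM)  - ask init/launchd to stop launching
	 * 2. SIGTERM sweep over non-system procs that catch it; wait up to 30s
	 * 3. SIGKILL sweep over whatever remains; wait briefly
	 * 4. forced exit1() on any survivor
	 * 5. repeat once with delayterm = 1 so P_LDELAYTERM procs go last
	 */
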
@@ -168,11 +185,13 @@ proc_shutdown() self = (struct proc *)current_proc(); /* - * Suspend /etc/init + * Signal the init with SIGTERM so that it does not launch + * new processes */ p = pfind(1); - if (p && p != self) - task_suspend(p->task); /* stop init */ + if (p && p != self) { + psignal(p, SIGTERM); + } printf("Killing all processes "); @@ -181,15 +200,19 @@ proc_shutdown() */ sigterm_loop: for (p = allproc.lh_first; p; p = p->p_list.le_next) { - if (((p->p_flag&P_SYSTEM) == 0) && (p->p_pptr->p_pid != 0) && (p != self) && (p->p_shutdownstate == 0)) { + if (((p->p_flag&P_SYSTEM) == 0) && (p->p_pptr->p_pid != 0) && (p != self) && (p->p_stat != SZOMB) && (p->p_shutdownstate == 0)) { + + if ((delayterm == 0) && ((p->p_lflag& P_LDELAYTERM) == P_LDELAYTERM)) { + continue; + } if (p->p_sigcatch & sigmask(SIGTERM)) { - p->p_shutdownstate = 1; + p->p_shutdownstate = 1; psignal(p, SIGTERM); goto sigterm_loop; - } } } + } /* * now wait for up to 30 seconds to allow those procs catching SIGTERM * to digest it @@ -201,23 +224,26 @@ sigterm_loop: * and then check to see if the tasks that were sent a * SIGTERM have exited */ - IOSleep(100); + IOSleep(100); TERM_catch = 0; - for (p = allproc.lh_first; p; p = p->p_list.le_next) { - if (p->p_shutdownstate == 1) - TERM_catch++; + for (p = allproc.lh_first; p; p = p->p_list.le_next) { + if (p->p_shutdownstate == 1) { + TERM_catch++; + } } if (TERM_catch == 0) break; } if (TERM_catch) { - /* * log the names of the unresponsive tasks */ + for (p = allproc.lh_first; p; p = p->p_list.le_next) { - if (p->p_shutdownstate == 1) + if (p->p_shutdownstate == 1) { printf("%s[%d]: didn't act on SIGTERM\n", p->p_comm, p->p_pid); + } } IOSleep(1000 * 5); } @@ -227,10 +253,13 @@ sigterm_loop: */ sigkill_loop: for (p = allproc.lh_first; p; p = p->p_list.le_next) { - if (((p->p_flag&P_SYSTEM) == 0) && (p->p_pptr->p_pid != 0) && (p != self) && (p->p_shutdownstate != 2)) { - psignal(p, SIGKILL); + if (((p->p_flag&P_SYSTEM) == 0) && (p->p_pptr->p_pid != 0) && (p != self) && (p->p_stat != SZOMB) && (p->p_shutdownstate != 2)) { + + if ((delayterm == 0) && ((p->p_lflag& P_LDELAYTERM) == P_LDELAYTERM)) { + continue; + } + psignal(p, SIGKILL); p->p_shutdownstate = 2; - goto sigkill_loop; } } @@ -241,7 +270,7 @@ sigkill_loop: IOSleep(200); /* double the time from 100 to 200 for NFS requests in particular */ for (p = allproc.lh_first; p; p = p->p_list.le_next) { - if (p->p_shutdownstate == 2) + if (p->p_shutdownstate == 2) break; } if (!p) @@ -253,7 +282,8 @@ sigkill_loop: */ p = allproc.lh_first; while (p) { - if ((p->p_flag&P_SYSTEM) || (p->p_pptr->p_pid == 0) || (p == self)) { + if ((p->p_flag&P_SYSTEM) || (!delayterm && ((p->p_lflag& P_LDELAYTERM))) + || (p->p_pptr->p_pid == 0) || (p == self)) { p = p->p_list.le_next; } else { @@ -264,12 +294,11 @@ sigkill_loop: * understand the sig_lock. This needs to be fixed. * XXX */ - if (p->exit_thread) { /* someone already doing it */ - /* give him a chance */ - thread_block(THREAD_CONTINUE_NULL); - } - else { - p->exit_thread = current_act(); + if (p->exit_thread) { /* someone already doing it */ + /* give it a chance */ + thread_block(THREAD_CONTINUE_NULL); + } else { + p->exit_thread = current_thread(); printf("."); exit1(p, 1, (int *)NULL); } @@ -277,28 +306,13 @@ sigkill_loop: } } printf("\n"); - /* - * Forcibly free resources of what's left. - */ -#ifdef notyet - p = allproc.lh_first; - while (p) { - /* - * Close open files and release open-file table. - * This may block! 
- */ - /* panics on reboot due to "zfree: non-allocated memory in collectable zone" message */ - fdfree(p); - p = p->p_list.le_next; + + /* Now start the termination of processes that are marked for delayed termination */ + if (delayterm == 0) { + delayterm = 1; + goto sigterm_loop; } -#endif /* notyet */ - /* Wait for the reaper thread to run, and clean up what we have done - * before we proceed with the hardcore shutdown. This reduces the race - * between kill_tasks and the reaper thread. - */ - /* thread_wakeup(&reaper_queue); */ - /* IOSleep( 1 * 1000); */ printf("continuing\n"); } diff --git a/bsd/kern/kern_sig.c b/bsd/kern/kern_sig.c index 6c593326e..6313188a9 100644 --- a/bsd/kern/kern_sig.c +++ b/bsd/kern/kern_sig.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -63,15 +63,13 @@ #define SIGPROP /* include signal properties table */ #include <sys/param.h> #include <sys/resourcevar.h> -#include <sys/namei.h> -#include <sys/vnode.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/systm.h> #include <sys/timeb.h> #include <sys/times.h> -#include <sys/buf.h> #include <sys/acct.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/kernel.h> #include <sys/wait.h> #include <sys/signalvar.h> @@ -82,34 +80,62 @@ #include <sys/stat.h> #include <sys/lock.h> #include <sys/kdebug.h> + #include <sys/mount.h> +#include <sys/sysproto.h> #include <bsm/audit_kernel.h> +#include <machine/spl.h> + #include <kern/cpu_number.h> #include <sys/vm.h> #include <sys/user.h> /* for coredump */ #include <kern/ast.h> /* for APC support */ +#include <kern/lock.h> +#include <kern/task.h> /* extern void *get_bsdtask_info(task_t); */ #include <kern/thread.h> #include <kern/sched_prim.h> #include <kern/thread_call.h> #include <mach/exception.h> +#include <mach/task.h> +#include <mach/thread_act.h> + +/* + * Missing prototypes that Mach should export + * + * +++ + */ +extern int thread_enable_fpe(thread_t act, int onoff); +extern void unix_syscall_return(int error); +extern thread_t port_name_to_thread(mach_port_name_t port_name); +extern kern_return_t check_actforsig(task_t task, thread_t thread, int setast); +extern kern_return_t get_signalact(task_t , thread_t *, int); +extern boolean_t thread_should_abort(thread_t); +extern unsigned int get_useraddr(void); + +/* + * --- + */ extern void doexception(int exc, int code, int sub); -void stop __P((struct proc *p)); -int cansignal __P((struct proc *, struct pcred *, struct proc *, int)); -int killpg1 __P((struct proc *, int, int, int)); -void sigexit_locked __P((struct proc *, int)); -int setsigvec __P((struct proc *, int, struct __sigaction *)); -void exit1 __P((struct proc *, int, int *)); -int signal_lock __P((struct proc *)); -int signal_unlock __P((struct proc *)); -void signal_setast __P((thread_act_t)); -void psignal_lock __P((struct proc *, int, int)); -void psignal_uthread __P((thread_act_t, int)); +void stop(struct proc *p); +int cansignal(struct proc *, kauth_cred_t, struct proc *, int); +int killpg1(struct proc *, int, int, int); +void sigexit_locked(struct proc *, int); +int setsigvec(struct proc *, int, struct __user_sigaction *); +void exit1(struct proc *, int, int *); +void psignal_uthread(thread_t, int); kern_return_t do_bsdexception(int, int, int); +void __posix_sem_syscall_return(kern_return_t); + +/* implementations in osfmk/kern/sync_sema.c. 
We do not want port.h in this scope, so void * them */ kern_return_t semaphore_timedwait_signal_trap_internal(void *, void *,time_t, int32_t, void (*)(int)); kern_return_t semaphore_timedwait_trap_internal(void *, time_t, int32_t, void (*)(int)); kern_return_t semaphore_wait_signal_trap_internal(void *, void *, void (*)(int)); kern_return_t semaphore_wait_trap_internal(void *, void (*)(int)); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); @@ -118,8 +144,52 @@ static int filt_signal(struct knote *kn, long hint); struct filterops sig_filtops = { 0, filt_sigattach, filt_sigdetach, filt_signal }; + +/* + * NOTE: Source and target may *NOT* overlap! (target is smaller) + */ +static void +sigaltstack_64to32(struct user_sigaltstack *in, struct sigaltstack *out) +{ + out->ss_sp = CAST_DOWN(void *,in->ss_sp); + out->ss_size = in->ss_size; + out->ss_flags = in->ss_flags; +} + +/* + * NOTE: Source and target are permitted to overlap! (source is smaller); + * this works because we copy fields in order from the end of the struct to + * the beginning. + */ +static void +sigaltstack_32to64(struct sigaltstack *in, struct user_sigaltstack *out) +{ + out->ss_flags = in->ss_flags; + out->ss_size = in->ss_size; + out->ss_sp = CAST_USER_ADDR_T(in->ss_sp); +} + +static void +sigaction_64to32(struct user_sigaction *in, struct sigaction *out) +{ + /* This assumes 32 bit __sa_handler is of type sig_t */ + out->__sigaction_u.__sa_handler = CAST_DOWN(sig_t,in->__sigaction_u.__sa_handler); + out->sa_mask = in->sa_mask; + out->sa_flags = in->sa_flags; +} + +static void +__sigaction_32to64(struct __sigaction *in, struct __user_sigaction *out) +{ + out->__sigaction_u.__sa_handler = CAST_USER_ADDR_T(in->__sigaction_u.__sa_handler); + out->sa_tramp = CAST_USER_ADDR_T(in->sa_tramp); + out->sa_mask = in->sa_mask; + out->sa_flags = in->sa_flags; +} + + #if SIGNAL_DEBUG -void ram_printf __P((int)); +void ram_printf(int); int ram_debug=0; unsigned int rdebug_proc=0; void @@ -155,8 +225,7 @@ int error = 0; #endif /* DIAGNOSTIC */ siglock_retry: - /* TBD: check p last arg */ - error = lockmgr(&p->signal_lock, LK_EXCLUSIVE, 0, (struct proc *)p); + error = lockmgr((struct lock__bsd__ *)&p->signal_lock[0], LK_EXCLUSIVE, 0, (struct proc *)0); if (error == EINTR) goto siglock_retry; return(error); @@ -186,23 +255,23 @@ signal_unlock(struct proc *p) #endif /* DIAGNOSTIC */ /* TBD: check p last arg */ - return(lockmgr(&p->signal_lock, LK_RELEASE, (simple_lock_t)0, (struct proc *)p)); + return(lockmgr((struct lock__bsd__ *)&p->signal_lock[0], LK_RELEASE, (simple_lock_t)0, (struct proc *)0)); } void signal_setast(sig_actthread) -thread_act_t sig_actthread; +thread_t sig_actthread; { act_set_astbsd(sig_actthread); } /* - * Can process p, with pcred pc, send the signal signum to process q? + * Can process p, with ucred uc, send the signal signum to process q? 
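The sigaltstack_32to64() helper added earlier in this hunk depends on copy order for its overlap guarantee; a minimal standalone illustration of the same trick follows, with hypothetical struct layouts (the real helpers use sigaltstack/user_sigaltstack and CAST_USER_ADDR_T, and share the kernel's assumption that aliasing the two layouts at one base address is acceptable):

	#include <stdint.h>

	struct stack32 { uint32_t sp; uint32_t size; int32_t flags; };	/* hypothetical */
	struct stack64 { uint64_t sp; uint64_t size; int32_t flags; };	/* hypothetical */

	/*
	 * Widening copy that tolerates in and out sharing a base address:
	 * fields are copied from the end of the struct toward the beginning,
	 * so each store only clobbers source bytes that were already read.
	 */
	static void
	widen32to64(struct stack32 *in, struct stack64 *out)
	{
		out->flags = in->flags;		/* reads 8..11, writes 16..19 */
		out->size = in->size;		/* reads 4..7, writes 8..15 */
		out->sp = (uint64_t)in->sp;	/* reads 0..3, writes 0..7 */
	}
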
*/ int -cansignal(p, pc, q, signum) +cansignal(p, uc, q, signum) struct proc *p; - struct pcred *pc; + kauth_cred_t uc; struct proc *q; int signum; { @@ -210,7 +279,7 @@ cansignal(p, pc, q, signum) if (p == q) return(1); - if (pc->pc_ucred->cr_uid == 0) + if (!suser(uc, NULL)) return (1); /* root can always signal */ if (signum == SIGCONT && q->p_session == p->p_session) @@ -233,10 +302,10 @@ cansignal(p, pc, q, signum) case SIGHUP: case SIGUSR1: case SIGUSR2: - if (pc->p_ruid == q->p_cred->p_ruid || - pc->pc_ucred->cr_uid == q->p_cred->p_ruid || - pc->p_ruid == q->p_ucred->cr_uid || - pc->pc_ucred->cr_uid == q->p_ucred->cr_uid) + if (uc->cr_ruid == q->p_ucred->cr_ruid || + kauth_cred_getuid(uc) == q->p_ucred->cr_ruid || + uc->cr_ruid == kauth_cred_getuid(q->p_ucred) || + kauth_cred_getuid(uc) == kauth_cred_getuid(q->p_ucred)) return (1); } return (0); @@ -246,34 +315,27 @@ cansignal(p, pc, q, signum) * because the P_SUGID test exists, this has extra tests which * could be removed. */ - if (pc->p_ruid == q->p_cred->p_ruid || - pc->p_ruid == q->p_cred->p_svuid || - pc->pc_ucred->cr_uid == q->p_cred->p_ruid || - pc->pc_ucred->cr_uid == q->p_cred->p_svuid || - pc->p_ruid == q->p_ucred->cr_uid || - pc->pc_ucred->cr_uid == q->p_ucred->cr_uid) + if (uc->cr_ruid == q->p_ucred->cr_ruid || + uc->cr_ruid == q->p_ucred->cr_svuid || + kauth_cred_getuid(uc) == q->p_ucred->cr_ruid || + kauth_cred_getuid(uc) == q->p_ucred->cr_svuid || + uc->cr_ruid == kauth_cred_getuid(q->p_ucred) || + kauth_cred_getuid(uc) == kauth_cred_getuid(q->p_ucred)) return (1); return (0); } -struct sigaction_args { - int signum; - struct __sigaction *nsa; - struct sigaction *osa; -}; /* ARGSUSED */ int -sigaction(p, uap, retval) - struct proc *p; - register struct sigaction_args *uap; - register_t *retval; +sigaction(struct proc *p, register struct sigaction_args *uap, __unused register_t *retval) { - struct sigaction vec; - struct __sigaction __vec; + struct user_sigaction vec; + struct __user_sigaction __vec; - register struct sigaction *sa; + struct user_sigaction *sa = &vec; register struct sigacts *ps = p->p_sigacts; + register int signum; int bit, error=0; @@ -281,7 +343,7 @@ sigaction(p, uap, retval) if (signum <= 0 || signum >= NSIG || signum == SIGKILL || signum == SIGSTOP) return (EINVAL); - sa = &vec; + if (uap->osa) { sa->sa_handler = ps->ps_sigact[signum]; sa->sa_mask = ps->ps_catchmask[signum]; @@ -301,13 +363,26 @@ sigaction(p, uap, retval) sa->sa_flags |= SA_NOCLDSTOP; if ((signum == SIGCHLD) && (p->p_flag & P_NOCLDWAIT)) sa->sa_flags |= SA_NOCLDWAIT; - if (error = copyout((caddr_t)sa, (caddr_t)uap->osa, - sizeof (vec))) + + if (IS_64BIT_PROCESS(p)) { + error = copyout(sa, uap->osa, sizeof(struct user_sigaction)); + } else { + struct sigaction vec32; + sigaction_64to32(sa, &vec32); + error = copyout(&vec32, uap->osa, sizeof(struct sigaction)); + } + if (error) return (error); } if (uap->nsa) { - if (error = copyin((caddr_t)uap->nsa, (caddr_t)&__vec, - sizeof (__vec))) + if (IS_64BIT_PROCESS(p)) { + error = copyin(uap->nsa, &__vec, sizeof(struct __user_sigaction)); + } else { + struct __sigaction __vec32; + error = copyin(uap->nsa, &__vec32, sizeof(struct __sigaction)); + __sigaction_32to64(&__vec32, &__vec); + } + if (error) return (error); error = setsigvec(p, signum, &__vec); } @@ -319,7 +394,7 @@ int clear_procsiglist(struct proc *p, int bit) { struct uthread * uth; - thread_act_t thact; + thread_t thact; signal_lock(p); @@ -342,11 +417,12 @@ clear_procsiglist(struct proc *p, int bit) return(0); } -int + 
+static int unblock_procsigmask(struct proc *p, int bit) { struct uthread * uth; - thread_act_t thact; + thread_t thact; signal_lock(p); if ((p->p_flag & P_INVFORK) && p->p_vforkact) { @@ -368,11 +444,11 @@ unblock_procsigmask(struct proc *p, int bit) } -int +static int block_procsigmask(struct proc *p, int bit) { struct uthread * uth; - thread_act_t thact; + thread_t thact; signal_lock(p); if ((p->p_flag & P_INVFORK) && p->p_vforkact) { @@ -392,11 +468,12 @@ block_procsigmask(struct proc *p, int bit) signal_unlock(p); return(0); } + int set_procsigmask(struct proc *p, int bit) { struct uthread * uth; - thread_act_t thact; + thread_t thact; signal_lock(p); if ((p->p_flag & P_INVFORK) && p->p_vforkact) { @@ -417,11 +494,9 @@ set_procsigmask(struct proc *p, int bit) return(0); } +/* XXX should be static? */ int -setsigvec(p, signum, sa) - register struct proc *p; - int signum; - register struct __sigaction *sa; +setsigvec(struct proc *p, int signum, struct __user_sigaction *sa) { register struct sigacts *ps = p->p_sigacts; register int bit; @@ -434,7 +509,7 @@ setsigvec(p, signum, sa) * Change setting atomically. */ ps->ps_sigact[signum] = sa->sa_handler; - ps->ps_trampact[signum] = (sig_t) sa->sa_tramp; + ps->ps_trampact[signum] = sa->sa_tramp; ps->ps_catchmask[signum] = sa->sa_mask &~ sigcantmask; if (sa->sa_flags & SA_SIGINFO) ps->ps_siginfo |= bit; @@ -478,9 +553,9 @@ setsigvec(p, signum, sa) #ifdef __ppc__ if (signum == SIGFPE) { if (sa->sa_handler == SIG_DFL || sa->sa_handler == SIG_IGN) - thread_enable_fpe(current_act(), 0); + thread_enable_fpe(current_thread(), 0); else - thread_enable_fpe(current_act(), 1); + thread_enable_fpe(current_thread(), 1); } #endif /* __ppc__ */ /* @@ -527,7 +602,7 @@ siginit(p) void execsigs(p, thr_act) register struct proc *p; - register thread_act_t thr_act; + register thread_t thr_act; { register struct sigacts *ps = p->p_sigacts; register int nc, mask; @@ -560,7 +635,7 @@ execsigs(p, thr_act) */ ps->ps_sigstk.ss_flags = SA_DISABLE; ps->ps_sigstk.ss_size = 0; - ps->ps_sigstk.ss_sp = 0; + ps->ps_sigstk.ss_sp = USER_ADDR_NULL; ps->ps_flags = 0; } @@ -570,47 +645,39 @@ execsigs(p, thr_act) * and return old mask as return value; * the library stub does the rest. 
*/ -struct sigprocmask_args { - int how; - sigset_t *mask; - sigset_t * omask; -}; int -sigprocmask(p, uap, retval) - register struct proc *p; - struct sigprocmask_args *uap; - register_t *retval; +sigprocmask(register struct proc *p, struct sigprocmask_args *uap, __unused register_t *retval) { int error = 0; sigset_t oldmask, nmask; - sigset_t * omask = uap->omask; + user_addr_t omask = uap->omask; struct uthread *ut; - ut = (struct uthread *)get_bsdthread_info(current_act()); + ut = (struct uthread *)get_bsdthread_info(current_thread()); oldmask = ut->uu_sigmask; - if (uap->mask == (sigset_t *)0) { + if (uap->mask == USER_ADDR_NULL) { /* just want old mask */ goto out; } - error = copyin((caddr_t)uap->mask, &nmask, sizeof(sigset_t)); + error = copyin(uap->mask, &nmask, sizeof(sigset_t)); if (error) goto out; switch (uap->how) { case SIG_BLOCK: block_procsigmask(p, (nmask & ~sigcantmask)); - signal_setast(current_act()); + signal_setast(current_thread()); break; case SIG_UNBLOCK: unblock_procsigmask(p, (nmask & ~sigcantmask)); - signal_setast(current_act()); + signal_setast(current_thread()); break; case SIG_SETMASK: set_procsigmask(p, (nmask & ~sigcantmask)); - signal_setast(current_act()); + signal_setast(current_thread()); break; default: @@ -618,24 +685,18 @@ sigprocmask(p, uap, retval) break; } out: - if (!error && omask) + if (!error && omask != USER_ADDR_NULL) copyout(&oldmask, omask, sizeof(sigset_t)); return (error); } -struct sigpending_args { - struct sigvec *osv; -}; int -sigpending(p, uap, retval) - struct proc *p; - register struct sigpending_args *uap; - register_t *retval; +sigpending(__unused struct proc *p, register struct sigpending_args *uap, __unused register_t *retval) { struct uthread *ut; sigset_t pendlist; - ut = (struct uthread *)get_bsdthread_info(current_act()); + ut = (struct uthread *)get_bsdthread_info(current_thread()); pendlist = ut->uu_siglist; if (uap->osv) @@ -643,95 +704,6 @@ sigpending(p, uap, retval) return(0); } -#if COMPAT_43 -/* - * Generalized interface signal handler, 4.3-compatible. 
- */ -struct osigvec_args { - int signum; - struct sigvec *nsv; - struct sigvec *osv; -}; -/* ARGSUSED */ -int -osigvec(p, uap, retval) - struct proc *p; - register struct osigvec_args *uap; - register_t *retval; -{ - struct sigvec __vec; - struct sigvec vec; - register struct sigacts *ps = p->p_sigacts; - register struct sigvec *sv; - register int signum; - int bit, error=0; - -#if 0 - signum = uap->signum; - if (signum <= 0 || signum >= NSIG || - signum == SIGKILL || signum == SIGSTOP) - return (EINVAL); - sv = &vec; - if (uap->osv) { - *(sig_t *)&sv->sv_handler = ps->ps_sigact[signum]; - sv->sv_mask = ps->ps_catchmask[signum]; - bit = sigmask(signum); - sv->sv_flags = 0; - if ((ps->ps_sigonstack & bit) != 0) - sv->sv_flags |= SV_ONSTACK; - if ((ps->ps_sigintr & bit) != 0) - sv->sv_flags |= SV_INTERRUPT; - if (p->p_flag & P_NOCLDSTOP) - sv->sv_flags |= SA_NOCLDSTOP; - if (error = copyout((caddr_t)sv, (caddr_t)uap->osv, - sizeof (vec))) - return (error); - } - if (uap->nsv) { - if (error = copyin((caddr_t)uap->nsv, (caddr_t)sv, - sizeof (vec))) - return (error); - sv->sv_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ - error = setsigvec(p, signum, (struct sigaction *)sv); - } -#else -error = ENOSYS; -#endif - return (error); -} - -struct osigblock_args { - int mask; -}; -int -osigblock(p, uap, retval) - register struct proc *p; - struct osigblock_args *uap; - register_t *retval; -{ - struct uthread * uth = get_bsdthread_info(current_act()); - - *retval = uth->uu_sigmask; - uth->uu_sigmask |= (uap->mask & ~sigcantmask); - return (0); -} - -struct osigsetmask_args { - int mask; -}; -int -osigsetmask(p, uap, retval) - struct proc *p; - struct osigsetmask_args *uap; - register_t *retval; -{ - struct uthread * uth = get_bsdthread_info(current_act()); - - *retval = uth->uu_sigmask; - uth->uu_sigmask = (uap->mask & ~sigcantmask); - return (0); -} -#endif /* COMPAT_43 */ /* * Suspend process until signal, providing mask to be set @@ -739,28 +711,19 @@ osigsetmask(p, uap, retval) * libc stub passes mask, not pointer, to save a copyin. */ -int -sigcontinue(error) +static int +sigcontinue(__unused int error) { - struct uthread *ut = get_bsdthread_info(current_act()); +// struct uthread *ut = get_bsdthread_info(current_thread()); unix_syscall_return(EINTR); } -struct sigsuspend_args { - sigset_t mask; -}; - -/* ARGSUSED */ int -sigsuspend(p, uap, retval) - register struct proc *p; - struct sigsuspend_args *uap; - register_t *retval; +sigsuspend(register struct proc *p, struct sigsuspend_args *uap, __unused register_t *retval) { - register struct sigacts *ps = p->p_sigacts; struct uthread *ut; - ut = (struct uthread *)get_bsdthread_info(current_act()); + ut = (struct uthread *)get_bsdthread_info(current_thread()); /* * When returning from sigpause, we want @@ -770,56 +733,184 @@ sigsuspend(p, uap, retval) * to indicate this. */ ut->uu_oldmask = ut->uu_sigmask; - ut->uu_flag |= USAS_OLDMASK; + ut->uu_flag |= UT_SAS_OLDMASK; ut->uu_sigmask = (uap->mask & ~sigcantmask); (void) tsleep0((caddr_t) p, PPAUSE|PCATCH, "pause", 0, sigcontinue); /* always return EINTR rather than ERESTART... 
*/ return (EINTR); } -struct __disable_thsignal_args { - int value; -}; int -__disable_threadsignal(p, uap, retval) - struct proc *p; - register struct __disable_thsignal_args *uap; - register_t *retval; +__disable_threadsignal(struct proc *p, + __unused register struct __disable_threadsignal_args *uap, + __unused register_t *retval) { struct uthread *uth; - uth = (struct uthread *)get_bsdthread_info(current_act()); + uth = (struct uthread *)get_bsdthread_info(current_thread()); /* No longer valid to have any signal delivered */ signal_lock(p); - uth->uu_flag |= UNO_SIGMASK; + uth->uu_flag |= UT_NO_SIGMASK; signal_unlock(p); return(0); } -struct pthread_kill_args { - void * thread_port; - int sig; -}; -int -__pthread_kill(p, uap, retval) +int +__pthread_markcancel(p, uap, retval) struct proc *p; - register struct pthread_kill_args *uap; + register struct __pthread_markcancel_args *uap; register_t *retval; { thread_act_t target_act; int error = 0; - int signum = uap->sig; struct uthread *uth; - target_act = (thread_act_t)port_name_to_act(uap->thread_port); + target_act = (thread_act_t)port_name_to_thread(uap->thread_port); if (target_act == THR_ACT_NULL) return (ESRCH); + + uth = (struct uthread *)get_bsdthread_info(target_act); + + /* if the thread is in vfork do not cancel */ + if ((uth->uu_flag & (P_VFORK | UT_CANCEL | UT_CANCELED )) == 0) { + uth->uu_flag |= (UT_CANCEL | UT_NO_SIGMASK); + if (((uth->uu_flag & UT_NOTCANCELPT) == 0) + && ((uth->uu_flag & UT_CANCELDISABLE) == 0)) + thread_abort_safely(target_act); + } + + thread_deallocate(target_act); + return (error); +} + +/* if action == 0: return the cancellation state and, + * if marked for cancellation, mark the thread canceled + * if action == 1: enable cancel handling + * if action == 2: disable cancel handling + */ +int +__pthread_canceled(p, uap, retval) + struct proc *p; + register struct __pthread_canceled_args *uap; + register_t *retval; +{ + thread_act_t thr_act; + struct uthread *uth; + int action = uap->action; + + thr_act = current_act(); + uth = (struct uthread *)get_bsdthread_info(thr_act); + + switch (action) { + case 1: + uth->uu_flag &= ~UT_CANCELDISABLE; + return(0); + case 2: + uth->uu_flag |= UT_CANCELDISABLE; + return(0); + case 0: + default: + /* if the thread is in vfork do not cancel */ + if((uth->uu_flag & ( UT_CANCELDISABLE | UT_CANCEL | UT_CANCELED)) == UT_CANCEL) { + uth->uu_flag &= ~UT_CANCEL; + uth->uu_flag |= (UT_CANCELED | UT_NO_SIGMASK); + return(0); + } + return(EINVAL); + } + return(EINVAL); +} + +void +__posix_sem_syscall_return(kern_return_t kern_result) +{ + int error = 0; + + if (kern_result == KERN_SUCCESS) + error = 0; + else if (kern_result == KERN_ABORTED) + error = EINTR; + else if (kern_result == KERN_OPERATION_TIMED_OUT) + error = ETIMEDOUT; + else + error = EINVAL; + unix_syscall_return(error); + /* does not return */ +} + + +int +__semwait_signal(p, uap, retval) + struct proc *p; + register struct __semwait_signal_args *uap; + register_t *retval; +{ + + kern_return_t kern_result; + mach_timespec_t then; + struct timespec now; + + if(uap->timeout) { + + if (uap->relative) { + then.tv_sec = uap->tv_sec; + then.tv_nsec = uap->tv_nsec; + } else { + nanotime(&now); + then.tv_sec = uap->tv_sec - now.tv_sec; + then.tv_nsec = uap->tv_nsec - now.tv_nsec; + if (then.tv_nsec < 0) { + then.tv_nsec += NSEC_PER_SEC; + then.tv_sec--; + } + } + + if (uap->mutex_sem == (void *)NULL) + kern_result = semaphore_timedwait_trap_internal(uap->cond_sem, then.tv_sec, then.tv_nsec, 
__posix_sem_syscall_return); + else + kern_result = semaphore_timedwait_signal_trap_internal(uap->cond_sem, uap->mutex_sem, then.tv_sec, then.tv_nsec, __posix_sem_syscall_return); + + } else { + + if (uap->mutex_sem == (void *)NULL) + kern_result = semaphore_wait_trap_internal(uap->cond_sem, __posix_sem_syscall_return); + else + + kern_result = semaphore_wait_signal_trap_internal(uap->cond_sem, uap->mutex_sem, __posix_sem_syscall_return); + } + +out: + if (kern_result == KERN_SUCCESS) + return(0); + else if (kern_result == KERN_ABORTED) + return(EINTR); + else if (kern_result == KERN_OPERATION_TIMED_OUT) + return(ETIMEDOUT); + else + return(EINVAL); +} + + +int +__pthread_kill(__unused struct proc *p, + register struct __pthread_kill_args *uap, + __unused register_t *retval) +{ + thread_t target_act; + int error = 0; + int signum = uap->sig; + struct uthread *uth; + + target_act = (thread_t)port_name_to_thread(uap->thread_port); + + if (target_act == THREAD_NULL) + return (ESRCH); if ((u_int)signum >= NSIG) { error = EINVAL; goto out; @@ -827,7 +918,7 @@ __pthread_kill(p, uap, retval) uth = (struct uthread *)get_bsdthread_info(target_act); - if (uth->uu_flag & UNO_SIGMASK) { + if (uth->uu_flag & UT_NO_SIGMASK) { error = ESRCH; goto out; } @@ -835,39 +926,32 @@ __pthread_kill(p, uap, retval) if (signum) psignal_uthread(target_act, signum); out: - act_deallocate(target_act); + thread_deallocate(target_act); return (error); } -struct pthread_sigmask_args { - int how; - const sigset_t *set; - sigset_t * oset; -}; int -pthread_sigmask(p, uap, retval) - register struct proc *p; - register struct pthread_sigmask_args *uap; - register_t *retval; +pthread_sigmask(__unused register struct proc *p, + register struct pthread_sigmask_args *uap, + __unused register_t *retval) { - int how = uap->how; - const sigset_t *set = uap->set; - sigset_t * oset = uap->oset; - const sigset_t nset; + user_addr_t set = uap->set; + user_addr_t oset = uap->oset; + sigset_t nset; int error = 0; struct uthread *ut; sigset_t oldset; - ut = (struct uthread *)get_bsdthread_info(current_act()); + ut = (struct uthread *)get_bsdthread_info(current_thread()); oldset = ut->uu_sigmask; - if (set == (sigset_t *) 0) { + if (set == USER_ADDR_NULL) { /* need only old mask */ goto out; } - error = copyin((caddr_t)set, (caddr_t)&nset, sizeof(sigset_t)); + error = copyin(set, &nset, sizeof(sigset_t)); if (error) goto out; @@ -878,12 +962,12 @@ pthread_sigmask(p, uap, retval) case SIG_UNBLOCK: ut->uu_sigmask &= ~(nset); - signal_setast(current_act()); + signal_setast(current_thread()); break; case SIG_SETMASK: ut->uu_sigmask = (nset & ~sigcantmask); - signal_setast(current_act()); + signal_setast(current_thread()); break; default: @@ -891,40 +975,30 @@ pthread_sigmask(p, uap, retval) } out: - if (!error && oset) - copyout((caddr_t)&oldset, (caddr_t)oset, sizeof(sigset_t)); + if (!error && oset != USER_ADDR_NULL) + copyout(&oldset, oset, sizeof(sigset_t)); return(error); } -struct sigwait_args { - const sigset_t *set; - int *sig; -}; - int -sigwait(p, uap, retval) - register struct proc *p; - register struct sigwait_args *uap; - register_t *retval; +sigwait(register struct proc *p, register struct sigwait_args *uap, __unused register_t *retval) { - register struct sigacts *ps = p->p_sigacts; struct uthread *ut; struct uthread *uth; - thread_act_t thact; int error = 0; sigset_t mask; sigset_t siglist; sigset_t sigw=0; int signum; - ut = (struct uthread *)get_bsdthread_info(current_act()); + ut = (struct uthread 
*)get_bsdthread_info(current_thread()); - if (uap->set == (const sigset_t *)0) + if (uap->set == USER_ADDR_NULL) return(EINVAL); - error = copyin((caddr_t)uap->set, (caddr_t)&mask, sizeof(sigset_t)); + error = copyin(uap->set, &mask, sizeof(sigset_t)); if (error) return(error); @@ -939,7 +1013,7 @@ sigwait(p, uap, retval) return(EINVAL); } else { TAILQ_FOREACH(uth, &p->p_uthlist, uu_list) { - if (sigw = uth->uu_siglist & siglist) { + if ( (sigw = uth->uu_siglist & siglist) ) { break; } } @@ -957,7 +1031,7 @@ sigwait(p, uap, retval) * to indicate this. */ ut->uu_oldmask = ut->uu_sigmask; - ut->uu_flag |= USAS_OLDMASK; + ut->uu_flag |= UT_SAS_OLDMASK; if (siglist == (sigset_t)0) return(EINVAL); /* SIGKILL and SIGSTOP are not maskable as well */ @@ -972,7 +1046,7 @@ sigwait(p, uap, retval) sigw = (ut->uu_sigwait & siglist); ut->uu_sigmask = ut->uu_oldmask; ut->uu_oldmask = 0; - ut->uu_flag &= ~USAS_OLDMASK; + ut->uu_flag &= ~UT_SAS_OLDMASK; sigwait1: ut->uu_sigwait = 0; if (!error) { @@ -980,7 +1054,7 @@ sigwait1: if (!signum) panic("sigwait with no signal wakeup"); ut->uu_siglist &= ~(sigmask(signum)); - if (uap->sig) + if (uap->sig != USER_ADDR_NULL) error = copyout(&signum, uap->sig, sizeof(int)); } @@ -988,65 +1062,38 @@ sigwait1: } -#if COMPAT_43 -struct osigstack_args { - struct sigstack *nss; - struct sigstack *oss; -}; - -/* ARGSUSED */ -int -osigstack(p, uap, retval) - struct proc *p; - register struct osigstack_args *uap; - register_t *retval; -{ - struct sigstack ss; - struct sigacts *psp; - int error = 0; - - psp = p->p_sigacts; - ss.ss_sp = psp->ps_sigstk.ss_sp; - ss.ss_onstack = psp->ps_sigstk.ss_flags & SA_ONSTACK; - if (uap->oss && (error = copyout((caddr_t)&ss, - (caddr_t)uap->oss, sizeof (struct sigstack)))) - return (error); - if (uap->nss && (error = copyin((caddr_t)uap->nss, - (caddr_t)&ss, sizeof (ss))) == 0) { - psp->ps_sigstk.ss_sp = ss.ss_sp; - psp->ps_sigstk.ss_size = 0; - psp->ps_sigstk.ss_flags |= ss.ss_onstack & SA_ONSTACK; - psp->ps_flags |= SAS_ALTSTACK; - } - return (error); -} -#endif /* COMPAT_43 */ -struct sigaltstack_args { - struct sigaltstack *nss; - struct sigaltstack *oss; -}; -/* ARGSUSED */ int -sigaltstack(p, uap, retval) - struct proc *p; - register struct sigaltstack_args *uap; - register_t *retval; +sigaltstack(struct proc *p, register struct sigaltstack_args *uap, __unused register_t *retval) { struct sigacts *psp; - struct sigaltstack ss; + struct user_sigaltstack ss; int error; psp = p->p_sigacts; if ((psp->ps_flags & SAS_ALTSTACK) == 0) psp->ps_sigstk.ss_flags |= SA_DISABLE; - if (uap->oss && (error = copyout((caddr_t)&psp->ps_sigstk, - (caddr_t)uap->oss, sizeof (struct sigaltstack)))) - return (error); - if (uap->nss == 0) + if (uap->oss) { + if (IS_64BIT_PROCESS(p)) { + error = copyout(&psp->ps_sigstk, uap->oss, sizeof(struct user_sigaltstack)); + } else { + struct sigaltstack ss32; + sigaltstack_64to32(&psp->ps_sigstk, &ss32); + error = copyout(&ss32, uap->oss, sizeof(struct sigaltstack)); + } + if (error) + return (error); + } + if (uap->nss == USER_ADDR_NULL) return (0); - if (error = copyin((caddr_t)uap->nss, (caddr_t)&ss, - sizeof (ss))) + if (IS_64BIT_PROCESS(p)) { + error = copyin(uap->nss, &ss, sizeof(struct user_sigaltstack)); + } else { + struct sigaltstack ss32; + error = copyin(uap->nss, &ss32, sizeof(struct sigaltstack)); + sigaltstack_32to64(&ss32,&ss); + } + if (error) return (error); if ((ss.ss_flags & ~SA_DISABLE) != 0) { return(EINVAL); @@ -1068,22 +1115,15 @@ sigaltstack(p, uap, retval) return (0); } -struct kill_args { - 
int pid; - int signum; -}; -/* ARGSUSED */ int -kill(cp, uap, retval) - register struct proc *cp; - register struct kill_args *uap; - register_t *retval; +kill(struct proc *cp, struct kill_args *uap, __unused register_t *retval) { register struct proc *p; - register struct pcred *pc = cp->p_cred; + kauth_cred_t uc = kauth_cred_get(); + + AUDIT_ARG(pid, uap->pid); + AUDIT_ARG(signum, uap->signum); - AUDIT_ARG(pid, uap->pid); - AUDIT_ARG(signum, uap->signum); if ((u_int)uap->signum >= NSIG) return (EINVAL); if (uap->pid > 0) { @@ -1099,7 +1139,7 @@ kill(cp, uap, retval) return (ESRCH); } AUDIT_ARG(process, p); - if (!cansignal(cp, pc, p, uap->signum)) + if (!cansignal(cp, uc, p, uap->signum)) return (EPERM); if (uap->signum) psignal(p, uap->signum); @@ -1116,26 +1156,6 @@ kill(cp, uap, retval) /* NOTREACHED */ } -#if COMPAT_43 -struct okillpg_args { - int pgid; - int signum; -}; -/* ARGSUSED */ -int -okillpg(p, uap, retval) - struct proc *p; - register struct okillpg_args *uap; - register_t *retval; -{ - - AUDIT_ARG(pid, uap->pgid); - AUDIT_ARG(signum, uap->signum); - if ((u_int)uap->signum >= NSIG) - return (EINVAL); - return (killpg1(p, uap->signum, uap->pgid, 0)); -} -#endif /* COMPAT_43 */ /* * Common code for kill process group/broadcast kill. @@ -1147,7 +1167,7 @@ killpg1(cp, signum, pgid, all) int signum, pgid, all; { register struct proc *p; - register struct pcred *pc = cp->p_cred; + kauth_cred_t uc = cp->p_ucred; struct pgrp *pgrp; int nfound = 0; @@ -1157,7 +1177,7 @@ killpg1(cp, signum, pgid, all) */ for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || - p == cp || !cansignal(cp, pc, p, signum)) + p == cp || !cansignal(cp, uc, p, signum)) continue; nfound++; if (signum) @@ -1178,7 +1198,7 @@ killpg1(cp, signum, pgid, all) p = p->p_pglist.le_next) { if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p->p_stat == SZOMB || - !cansignal(cp, pc, p, signum)) + !cansignal(cp, uc, p, signum)) continue; nfound++; if (signum) @@ -1244,10 +1264,7 @@ tty_pgsignal(pgrp, signum) * Send a signal caused by a trap to a specific thread. 
*/ void -threadsignal(sig_actthread, signum, code) - register thread_act_t sig_actthread; - register int signum; - u_long code; +threadsignal(thread_t sig_actthread, int signum, u_long code) { register struct uthread *uth; register struct task * sig_task; @@ -1264,7 +1281,7 @@ threadsignal(sig_actthread, signum, code) p = (struct proc *)(get_bsdtask_info(sig_task)); uth = get_bsdthread_info(sig_actthread); - if (uth && (uth->uu_flag & P_VFORK)) + if (uth && (uth->uu_flag & UT_VFORK)) p = uth->uu_proc; if (!(p->p_flag & P_TRACED) && (p->p_sigignore & mask)) @@ -1287,18 +1304,11 @@ psignal(p, signum) } void -psignal_vfork(p, new_task, thr_act, signum) - register struct proc *p; - task_t new_task; - thread_act_t thr_act; - register int signum; +psignal_vfork(struct proc *p, task_t new_task, thread_t thr_act, int signum) { - int withlock = 1; - int pend = 0; - register int s, prop; + register int prop; register sig_t action; int mask; - kern_return_t kret; struct uthread *uth; if ((u_int)signum >= NSIG || signum == 0) @@ -1312,7 +1322,7 @@ psignal_vfork(p, new_task, thr_act, signum) } #endif /* SIGNAL_DEBUG */ - if ((new_task == TASK_NULL) || (thr_act == (thread_act_t)NULL) || is_kerneltask(new_task)) + if ((new_task == TASK_NULL) || (thr_act == (thread_t)NULL) || is_kerneltask(new_task)) return; @@ -1394,13 +1404,13 @@ psigout: signal_unlock(p); } -thread_act_t +static thread_t get_signalthread(struct proc *p, int signum) { struct uthread *uth; - thread_act_t thr_act; + thread_t thr_act; sigset_t mask = sigmask(signum); - thread_act_t sig_thread_act; + thread_t sig_thread_act; struct task * sig_task = p->task; kern_return_t kret; @@ -1410,11 +1420,11 @@ get_signalthread(struct proc *p, int signum) if (kret == KERN_SUCCESS) return(sig_thread_act); else - return(THR_ACT_NULL); + return(THREAD_NULL); } TAILQ_FOREACH(uth, &p->p_uthlist, uu_list) { - if(((uth->uu_flag & UNO_SIGMASK)== 0) && + if(((uth->uu_flag & UT_NO_SIGMASK)== 0) && (((uth->uu_sigmask & mask) == 0) || (uth->uu_sigwait & mask))) { if (check_actforsig(p->task, uth->uu_act, 1) == KERN_SUCCESS) return(uth->uu_act); @@ -1424,7 +1434,7 @@ get_signalthread(struct proc *p, int signum) return(thr_act); } - return(THR_ACT_NULL); + return(THREAD_NULL); } /* @@ -1448,11 +1458,11 @@ psignal_lock(p, signum, withlock) { register int s, prop; register sig_t action; - thread_act_t sig_thread_act; + thread_t sig_thread_act; register task_t sig_task; int mask; struct uthread *uth; - kern_return_t kret; + boolean_t funnel_state = FALSE; int sw_funnel = 0; if ((u_int)signum >= NSIG || signum == 0) @@ -1466,9 +1476,9 @@ psignal_lock(p, signum, withlock) } #endif /* SIGNAL_DEBUG */ - if (thread_funnel_get() == (funnel_t *)network_flock) { + if (thread_funnel_get() == (funnel_t *)0) { sw_funnel = 1; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + funnel_state = thread_funnel_set(kernel_flock, TRUE); } /* * We will need the task pointer later. 
Grab it now to @@ -1477,7 +1487,7 @@ psignal_lock(p, signum, withlock) */ if (((sig_task = p->task) == TASK_NULL) || is_kerneltask(sig_task)) { if (sw_funnel) - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + thread_funnel_set(kernel_flock, funnel_state); return; } @@ -1492,7 +1502,7 @@ psignal_lock(p, signum, withlock) */ if (ISSET(p->p_flag, P_REBOOT)) { if (sw_funnel) - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + thread_funnel_set(kernel_flock, funnel_state); return; } @@ -1515,11 +1525,11 @@ psignal_lock(p, signum, withlock) /* If successful return with ast set */ sig_thread_act = get_signalthread(p, signum); - if (sig_thread_act == THR_ACT_NULL) { + if (sig_thread_act == THREAD_NULL) { /* XXXX FIXME - /* if it is sigkill, may be we should - * inject a thread to terminate - */ + * if it is sigkill, may be we should + * inject a thread to terminate + */ #if SIGNAL_DEBUG ram_printf(1); #endif /* SIGNAL_DEBUG */ @@ -1543,12 +1553,13 @@ psignal_lock(p, signum, withlock) */ if (p->p_sigignore & mask) goto psigout; + /* sigwait takes precedence */ if (uth->uu_sigwait & mask) - action = SIG_WAIT; - if (uth->uu_sigmask & mask) - action = SIG_HOLD; + action = KERN_SIG_WAIT; + else if (uth->uu_sigmask & mask) + action = KERN_SIG_HOLD; else if (p->p_sigcatch & mask) - action = SIG_CATCH; + action = KERN_SIG_CATCH; else action = SIG_DFL; } @@ -1583,7 +1594,7 @@ psignal_lock(p, signum, withlock) * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. */ - if (action == SIG_HOLD && ((prop & SA_CONT) == 0 || p->p_stat != SSTOP)) { + if (action == KERN_SIG_HOLD && ((prop & SA_CONT) == 0 || p->p_stat != SSTOP)) { goto psigout; } /* @@ -1608,14 +1619,16 @@ psignal_lock(p, signum, withlock) goto psigout; } - if (action == SIG_WAIT) { + if (action == KERN_SIG_WAIT) { uth->uu_sigwait = mask; uth->uu_siglist &= ~mask; p->p_siglist &= ~mask; wakeup(&uth->uu_sigwait); /* if it is SIGCONT resume whole process */ - if (prop & SA_CONT) + if (prop & SA_CONT) { + p->p_flag |= P_CONTINUED; (void) task_resume(sig_task); + } goto psigout; } @@ -1630,6 +1643,7 @@ psignal_lock(p, signum, withlock) p->p_flag &= ~P_TTYSLEEP; wakeup(&p->p_siglist); } else { + p->p_flag |= P_CONTINUED; (void) task_resume(sig_task); } p->p_stat = SRUN; @@ -1669,7 +1683,7 @@ psignal_lock(p, signum, withlock) pp->si_pid = p->p_pid; pp->si_status = p->p_xstat; pp->si_code = CLD_STOPPED; - pp->si_uid = p->p_cred->p_ruid; + pp->si_uid = p->p_ucred->cr_ruid; psignal(pp, SIGCHLD); } } @@ -1706,6 +1720,7 @@ psignal_lock(p, signum, withlock) p->p_flag &= ~P_TTYSLEEP; wakeup(&p->p_siglist); } else { + p->p_flag |= P_CONTINUED; (void) task_resume(sig_task); } uth->uu_siglist &= ~mask; @@ -1745,20 +1760,20 @@ psigout: if (withlock) signal_unlock(p); if (sw_funnel) - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + thread_funnel_set(kernel_flock, funnel_state); } /* psignal_lock(p, signum, withlock ) */ void psignal_uthread(thr_act, signum) - thread_act_t thr_act; + thread_t thr_act; int signum; { struct proc *p; - register int s, prop; + register int prop; register sig_t action; - thread_act_t sig_thread_act; + thread_t sig_thread_act; register task_t sig_task; int mask; struct uthread *uth; @@ -1836,12 +1851,13 @@ psignal_uthread(thr_act, signum) */ if (p->p_sigignore & mask) goto puthout; + /* sigwait takes precedence */ if (uth->uu_sigwait & mask) - action = SIG_WAIT; - if (uth->uu_sigmask & mask) - action = SIG_HOLD; + action = KERN_SIG_WAIT; + else if 
(uth->uu_sigmask & mask) + action = KERN_SIG_HOLD; else if (p->p_sigcatch & mask) - action = SIG_CATCH; + action = KERN_SIG_CATCH; else action = SIG_DFL; } @@ -1875,7 +1891,7 @@ psignal_uthread(thr_act, signum) * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. */ - if (action == SIG_HOLD && ((prop & SA_CONT) == 0 || p->p_stat != SSTOP)) + if (action == KERN_SIG_HOLD && ((prop & SA_CONT) == 0 || p->p_stat != SSTOP)) goto puthout; /* @@ -1900,14 +1916,16 @@ psignal_uthread(thr_act, signum) goto puthout; } - if (action == SIG_WAIT) { + if (action == KERN_SIG_WAIT) { uth->uu_sigwait = mask; uth->uu_siglist &= ~mask; p->p_siglist &= ~mask; wakeup(&uth->uu_sigwait); /* if it is SIGCONT resume whole process */ - if (prop & SA_CONT) + if (prop & SA_CONT) { + p->p_flag |= P_CONTINUED; (void) task_resume(sig_task); + } goto puthout; } @@ -1917,8 +1935,10 @@ psignal_uthread(thr_act, signum) * Wake up the thread, but don't un-suspend it * (except for SIGCONT). */ - if (prop & SA_CONT) + if (prop & SA_CONT) { + p->p_flag |= P_CONTINUED; (void) task_resume(sig_task); + } goto psurun; } else { /* Default action - varies */ @@ -1952,7 +1972,7 @@ psignal_uthread(thr_act, signum) pp->si_pid = p->p_pid; pp->si_status = p->p_xstat; pp->si_code = CLD_STOPPED; - pp->si_uid = p->p_cred->p_ruid; + pp->si_uid = p->p_ucred->cr_ruid; psignal(pp, SIGCHLD); } stop(p); @@ -1990,6 +2010,7 @@ psignal_uthread(thr_act, signum) p->p_flag &= ~P_TTYSLEEP; wakeup(&p->p_siglist); } else { + p->p_flag |= P_CONTINUED; (void) task_resume(sig_task); } uth->uu_siglist &= ~mask; @@ -2031,20 +2052,18 @@ puthout: __inline__ void -sig_lock_to_exit( - struct proc *p) +sig_lock_to_exit(struct proc *p) { - thread_t self = current_act(); + thread_t self = current_thread(); p->exit_thread = self; (void) task_suspend(p->task); } __inline__ int -sig_try_locked( - struct proc *p) +sig_try_locked(struct proc *p) { - thread_t self = current_act(); + thread_t self = current_thread(); while (p->sigwait || p->exit_thread) { if (p->exit_thread) { @@ -2090,14 +2109,11 @@ issignal(p) register struct proc *p; { register int signum, mask, prop, sigbits; - task_t task = p->task; - thread_act_t cur_act; - int s; + thread_t cur_act; struct uthread * ut; - kern_return_t kret; struct proc *pp; - cur_act = current_act(); + cur_act = current_thread(); #if SIGNAL_DEBUG if(rdebug_proc && (p == rdebug_proc)) { @@ -2138,7 +2154,6 @@ issignal(p) continue; } if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) { - register int hold; register task_t task; /* * If traced, always stop, and stay @@ -2151,7 +2166,7 @@ issignal(p) p->sigwait = TRUE; p->sigwait_thread = cur_act; p->p_stat = SSTOP; - p->p_flag &= ~P_WAITED; + p->p_flag &= ~(P_WAITED|P_CONTINUED); ut->uu_siglist &= ~mask; /* clear the old signal */ p->p_siglist &= ~mask; /* clear the old signal */ signal_unlock(p); @@ -2162,7 +2177,7 @@ issignal(p) pp->si_pid = p->p_pid; pp->si_status = p->p_xstat; pp->si_code = CLD_TRAPPED; - pp->si_uid = p->p_cred->p_ruid; + pp->si_uid = p->p_ucred->cr_ruid; psignal(pp, SIGCHLD); /* * XXX Have to really stop for debuggers; @@ -2176,7 +2191,7 @@ issignal(p) p->sigwait = TRUE; p->sigwait_thread = cur_act; p->p_stat = SSTOP; - p->p_flag &= ~P_WAITED; + p->p_flag &= ~(P_WAITED|P_CONTINUED); ut->uu_siglist &= ~mask; /* clear the old signal */ p->p_siglist &= ~mask; /* clear the old signal */ @@ -2203,7 +2218,7 @@ issignal(p) * clear it, since sig_lock_to_exit will * wait. 
*/ - clear_wait(current_act(), THREAD_INTERRUPTED); + clear_wait(current_thread(), THREAD_INTERRUPTED); sig_lock_to_exit(p); /* * Since this thread will be resumed @@ -2220,7 +2235,7 @@ issignal(p) /* * We may have to quit */ - if (thread_should_abort(current_act())) { + if (thread_should_abort(current_thread())) { signal_unlock(p); return(0); } @@ -2287,7 +2302,7 @@ issignal(p) pp->si_pid = p->p_pid; pp->si_status = p->p_xstat; pp->si_code = CLD_STOPPED; - pp->si_uid = p->p_cred->p_ruid; + pp->si_uid = p->p_ucred->cr_ruid; psignal(pp, SIGCHLD); } } @@ -2339,14 +2354,12 @@ CURSIG(p) register struct proc *p; { register int signum, mask, prop, sigbits; - task_t task = p->task; - thread_act_t cur_act; - int s; + thread_t cur_act; struct uthread * ut; int retnum = 0; - cur_act = current_act(); + cur_act = current_thread(); ut = get_bsdthread_info(cur_act); @@ -2469,9 +2482,9 @@ stop(p) register struct proc *p; { p->p_stat = SSTOP; - p->p_flag &= ~P_WAITED; + p->p_flag &= ~(P_WAITED|P_CONTINUED); if (p->p_pptr->p_stat != SSTOP) - wakeup((caddr_t)p->p_pptr); + wakeup((caddr_t)p->p_pptr); (void) task_suspend(p->task); /*XXX*/ } @@ -2480,12 +2493,11 @@ stop(p) * from the current set of pending signals. */ void -postsig(signum) - register int signum; +postsig(int signum) { - register struct proc *p = current_proc(); - register struct sigacts *ps = p->p_sigacts; - register sig_t action; + struct proc *p = current_proc(); + struct sigacts *ps = p->p_sigacts; + user_addr_t catcher; u_long code; int mask, returnmask; struct uthread * ut; @@ -2509,20 +2521,21 @@ postsig(signum) return; } - ut = (struct uthread *)get_bsdthread_info(current_act()); + ut = (struct uthread *)get_bsdthread_info(current_thread()); mask = sigmask(signum); ut->uu_siglist &= ~mask; p->p_siglist &= ~mask; - action = ps->ps_sigact[signum]; + catcher = ps->ps_sigact[signum]; #if KTRACE + //LP64: catcher argument is a 64 bit user space handler address if (KTRPOINT(p, KTR_PSIG)) ktrpsig(p->p_tracep, - signum, action, ut->uu_flag & USAS_OLDMASK ? - &ut->uu_oldmask : &ut->uu_sigmask, 0, -1); + signum, CAST_DOWN(void *,catcher), ut->uu_flag & UT_SAS_OLDMASK ? + &ut->uu_oldmask : &ut->uu_sigmask, 0); #endif - if (action == SIG_DFL) { + if (catcher == SIG_DFL) { /* - * Default action, where the default is to kill + * Default catcher, where the default is to kill * the process. (Other cases were ignored above.) */ /* called with signal_lock() held */ @@ -2534,7 +2547,7 @@ postsig(signum) * If we get here, the signal must be caught. */ #if DIAGNOSTIC - if (action == SIG_IGN || (ut->uu_sigmask & mask)) + if (catcher == SIG_IGN || (ut->uu_sigmask & mask)) log(LOG_WARNING, "postsig: processing masked or ignored signal\n"); #endif @@ -2547,9 +2560,9 @@ postsig(signum) * mask from before the sigpause is what we want * restored after the signal processing is completed. 
*/ - if (ut->uu_flag & USAS_OLDMASK) { + if (ut->uu_flag & UT_SAS_OLDMASK) { returnmask = ut->uu_oldmask; - ut->uu_flag &= ~USAS_OLDMASK; + ut->uu_flag &= ~UT_SAS_OLDMASK; ut->uu_oldmask = 0; } else returnmask = ut->uu_sigmask; @@ -2566,7 +2579,7 @@ postsig(signum) #ifdef __ppc__ /* Needs to disable to run in user mode */ if (signum == SIGFPE) { - thread_enable_fpe(current_act(), 0); + thread_enable_fpe(current_thread(), 0); } #endif /* __ppc__ */ @@ -2577,7 +2590,7 @@ postsig(signum) ps->ps_code = 0; } p->p_stats->p_ru.ru_nsignals++; - sendsig(p, action, signum, returnmask, code); + sendsig(p, catcher, signum, returnmask, code); } signal_unlock(p); } @@ -2601,10 +2614,12 @@ sigexit_locked(p, signum) p->p_acflag |= AXSIG; if (sigprop[signum] & SA_CORE) { p->p_sigacts->ps_sig = signum; + signal_unlock(p); if (coredump(p) == 0) signum |= WCOREFLAG; - } - signal_unlock(p); + } else + signal_unlock(p); + exit1(p, W_EXITCODE(0, signum), (int *)NULL); /* NOTREACHED */ } @@ -2645,21 +2660,22 @@ filt_signal(struct knote *kn, long hint) if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; - if (kn->kn_id == hint) + if (kn->kn_id == (unsigned int)hint) kn->kn_data++; } return (kn->kn_data != 0); } + void -bsd_ast(thread_act_t thr_act) +bsd_ast(thread_t thr_act) { struct proc *p = current_proc(); struct uthread *ut = get_bsdthread_info(thr_act); int signum; - unsigned int pc; + user_addr_t pc; boolean_t funnel_state; - static bsd_init_done = 0; + static int bsd_init_done = 0; if (p == NULL) return; @@ -2672,13 +2688,11 @@ bsd_ast(thread_act_t thr_act) p->p_flag &= ~P_OWEUPC; } - if (CHECK_SIGNALS(p, current_act(), ut)) { - while (signum = issignal(p)) + if (CHECK_SIGNALS(p, current_thread(), ut)) { + while ( (signum = issignal(p)) ) postsig(signum); } if (!bsd_init_done) { - extern void bsdinit_task(void); - bsd_init_done = 1; bsdinit_task(); } @@ -2743,6 +2757,7 @@ task_t task; } } + kern_return_t do_bsdexception( int exc, @@ -2750,10 +2765,63 @@ do_bsdexception( int sub) { exception_data_type_t codes[EXCEPTION_CODE_MAX]; - extern kern_return_t bsd_exception(int, exception_data_type_t codes[], int); codes[0] = code; codes[1] = sub; return(bsd_exception(exc, codes, 2)); } +int +proc_pendingsignals(struct proc *p, sigset_t mask) +{ + struct uthread * uth; + thread_t th; + sigset_t bits = 0; + int error; + + /* If the process is in proc exit return no signal info */ + if (p->p_lflag & P_LPEXIT) + return(0); + + /* duplicate the signal lock code to enable recursion; as exit + * holds the lock too long. All this code is being reworked + * this is just a workaround for regressions till new code + * arrives. 
+ */ +ppend_retry: + error = lockmgr((struct lock__bsd__ *)&p->signal_lock[0], (LK_EXCLUSIVE | LK_CANRECURSE), 0, (struct proc *)0); + if (error == EINTR) + goto ppend_retry; + + if ((p->p_flag & P_INVFORK) && p->p_vforkact) { + th = p->p_vforkact; + uth = (struct uthread *)get_bsdthread_info(th); + if (uth) { + bits = (((uth->uu_siglist & ~uth->uu_sigmask) & ~p->p_sigignore) & mask); + } + goto out; + } + + bits = 0; + TAILQ_FOREACH(uth, &p->p_uthlist, uu_list) { + bits |= (((uth->uu_siglist & ~uth->uu_sigmask) & ~p->p_sigignore) & mask); + } +out: + signal_unlock(p); + return(bits); +} + +int +thread_issignal(proc_t p, thread_t th, sigset_t mask) +{ + struct uthread * uth; + sigset_t bits=0; + + + uth = (struct uthread *)get_bsdthread_info(th); + if (uth) { + bits = (((uth->uu_siglist & ~uth->uu_sigmask) & ~p->p_sigignore) & mask); + } + return(bits); +} + diff --git a/bsd/kern/kern_subr.c b/bsd/kern/kern_subr.c index cc5b4382e..40e9f4c35 100644 --- a/bsd/kern/kern_subr.c +++ b/bsd/kern/kern_subr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -62,10 +62,12 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/malloc.h> #include <sys/queue.h> #include <vm/pmap.h> +#include <sys/uio_internal.h> +#include <kern/kalloc.h> #include <kdebug.h> @@ -73,137 +75,224 @@ #define DBG_UIO_COPYOUT 16 #define DBG_UIO_COPYIN 17 +#if DEBUG +#include <kern/simple_lock.h> + +static int uio_t_count = 0; +#endif /* DEBUG */ + int uiomove(cp, n, uio) register caddr_t cp; register int n; - register struct uio *uio; + register uio_t uio; { return uiomove64((addr64_t)((unsigned int)cp), n, uio); } + // LP64todo - fix this! 'n' should be int64_t? 
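+/*
+ * Illustrative usage sketch (names 'mydev_read', 'mybuf', and 'MYCHUNK' are
+ * hypothetical, not part of this change): a typical character-device read
+ * routine loops on uio_resid() and lets uiomove() handle the per-iovec
+ * bookkeeping and the user/kernel copy direction.
+ *
+ *	int mydev_read(dev_t dev, struct uio *uio, int ioflag)
+ *	{
+ *		int error = 0;
+ *
+ *		while (uio_resid(uio) > 0 && error == 0) {
+ *			// copy at most MYCHUNK bytes per pass
+ *			int cnt = uio_resid(uio) > MYCHUNK ? MYCHUNK : (int)uio_resid(uio);
+ *			error = uiomove(mybuf, cnt, uio);
+ *		}
+ *		return (error);
+ *	}
+ */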
int -uiomove64(addr64_t cp, int n, struct uio *uio) +uiomove64(addr64_t cp, int n, register struct uio *uio) { - register struct iovec *iov; - u_int cnt; +#if LP64KERN + register uint64_t acnt; +#else + register u_int acnt; +#endif int error = 0; #if DIAGNOSTIC if (uio->uio_rw != UIO_READ && uio->uio_rw != UIO_WRITE) panic("uiomove: mode"); - if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc()) - panic("uiomove proc"); #endif - while (n > 0 && uio->uio_resid) { - iov = uio->uio_iov; - cnt = iov->iov_len; - if (cnt == 0) { - uio->uio_iov++; +#if LP64_DEBUG + if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) { + panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__); + } +#endif /* LP64_DEBUG */ + + while (n > 0 && uio_resid(uio)) { + acnt = uio_iov_len(uio); + if (acnt == 0) { + uio_next_iov(uio); uio->uio_iovcnt--; continue; } - if (cnt > n) - cnt = n; + if (n > 0 && acnt > (uint64_t)n) + acnt = n; + switch (uio->uio_segflg) { + case UIO_USERSPACE64: + case UIO_USERISPACE64: + // LP64 - 3rd argument in debug code is 64 bit, expected to be 32 bit + if (uio->uio_rw == UIO_READ) + { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_START, + (int)cp, (int)uio->uio_iovs.iov64p->iov_base, acnt, 0,0); + + error = copyout( CAST_DOWN(caddr_t, cp), uio->uio_iovs.iov64p->iov_base, acnt ); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_END, + (int)cp, (int)uio->uio_iovs.iov64p->iov_base, acnt, 0,0); + } + else + { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_START, + (int)uio->uio_iovs.iov64p->iov_base, (int)cp, acnt, 0,0); + + error = copyin(uio->uio_iovs.iov64p->iov_base, CAST_DOWN(caddr_t, cp), acnt); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_END, + (int)uio->uio_iovs.iov64p->iov_base, (int)cp, acnt, 0,0); + } + if (error) + return (error); + break; + + case UIO_USERSPACE32: + case UIO_USERISPACE32: case UIO_USERSPACE: case UIO_USERISPACE: if (uio->uio_rw == UIO_READ) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_START, - (int)cp, (int)iov->iov_base, cnt, 0,0); + (int)cp, (int)uio->uio_iovs.iov32p->iov_base, acnt, 0,0); - error = copyout( CAST_DOWN(caddr_t, cp), iov->iov_base, cnt ); + error = copyout( CAST_DOWN(caddr_t, cp), CAST_USER_ADDR_T(uio->uio_iovs.iov32p->iov_base), acnt ); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_END, - (int)cp, (int)iov->iov_base, cnt, 0,0); + (int)cp, (int)uio->uio_iovs.iov32p->iov_base, acnt, 0,0); } else { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_START, - (int)iov->iov_base, (int)cp, cnt, 0,0); + (int)uio->uio_iovs.iov32p->iov_base, (int)cp, acnt, 0,0); - error = copyin(iov->iov_base, CAST_DOWN(caddr_t, cp), cnt); + error = copyin(CAST_USER_ADDR_T(uio->uio_iovs.iov32p->iov_base), CAST_DOWN(caddr_t, cp), acnt); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_END, - (int)iov->iov_base, (int)cp, cnt, 0,0); + (int)uio->uio_iovs.iov32p->iov_base, (int)cp, acnt, 0,0); } if (error) return (error); break; + case UIO_SYSSPACE32: case UIO_SYSSPACE: if (uio->uio_rw == UIO_READ) - error = copywithin(CAST_DOWN(caddr_t, cp), iov->iov_base, - cnt); + error = copywithin(CAST_DOWN(caddr_t, cp), (caddr_t)uio->uio_iovs.iov32p->iov_base, + acnt); else - error = copywithin(iov->iov_base, CAST_DOWN(caddr_t, cp), - cnt); + error = copywithin((caddr_t)uio->uio_iovs.iov32p->iov_base, CAST_DOWN(caddr_t, cp), + acnt); break; + case UIO_PHYS_USERSPACE64: + if (uio->uio_rw == UIO_READ) + { + 
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_START, + (int)cp, (int)uio->uio_iovs.iov64p->iov_base, acnt, 1,0); + + error = copypv((addr64_t)cp, uio->uio_iovs.iov64p->iov_base, acnt, cppvPsrc | cppvNoRefSrc); + if (error) /* Copy physical to virtual */ + error = EFAULT; + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_END, + (int)cp, (int)uio->uio_iovs.iov64p->iov_base, acnt, 1,0); + } + else + { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_START, + (int)uio->uio_iovs.iov64p->iov_base, (int)cp, acnt, 1,0); + + error = copypv(uio->uio_iovs.iov64p->iov_base, (addr64_t)cp, acnt, cppvPsnk | cppvNoRefSrc | cppvNoModSnk); + if (error) /* Copy virtual to physical */ + error = EFAULT; + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_END, + (int)uio->uio_iovs.iov64p->iov_base, (int)cp, acnt, 1,0); + } + if (error) + return (error); + break; + + case UIO_PHYS_USERSPACE32: case UIO_PHYS_USERSPACE: if (uio->uio_rw == UIO_READ) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_START, - (int)cp, (int)iov->iov_base, cnt, 1,0); + (int)cp, (int)uio->uio_iovs.iov32p->iov_base, acnt, 1,0); - if (error = copypv((addr64_t)cp, (addr64_t)((unsigned int)iov->iov_base), cnt, cppvPsrc | cppvNoRefSrc)) /* Copy physical to virtual */ + error = copypv((addr64_t)cp, (addr64_t)uio->uio_iovs.iov32p->iov_base, acnt, cppvPsrc | cppvNoRefSrc); + if (error) /* Copy physical to virtual */ error = EFAULT; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_END, - (int)cp, (int)iov->iov_base, cnt, 1,0); + (int)cp, (int)uio->uio_iovs.iov32p->iov_base, acnt, 1,0); } else { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_START, - (int)iov->iov_base, (int)cp, cnt, 1,0); + (int)uio->uio_iovs.iov32p->iov_base, (int)cp, acnt, 1,0); - if (error = copypv((addr64_t)((unsigned int)iov->iov_base), (addr64_t)cp, cnt, cppvPsnk | cppvNoRefSrc | cppvNoModSnk)) /* Copy virtual to physical */ + error = copypv((addr64_t)uio->uio_iovs.iov32p->iov_base, (addr64_t)cp, acnt, cppvPsnk | cppvNoRefSrc | cppvNoModSnk); + if (error) /* Copy virtual to physical */ error = EFAULT; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_END, - (int)iov->iov_base, (int)cp, cnt, 1,0); + (int)uio->uio_iovs.iov32p->iov_base, (int)cp, acnt, 1,0); } if (error) return (error); break; + case UIO_PHYS_SYSSPACE32: case UIO_PHYS_SYSSPACE: if (uio->uio_rw == UIO_READ) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_START, - (int)cp, (int)iov->iov_base, cnt, 2,0); + (int)cp, (int)uio->uio_iovs.iov32p->iov_base, acnt, 2,0); - if (error = copypv((addr64_t)cp, (addr64_t)((unsigned int)iov->iov_base), cnt, cppvKmap | cppvPsrc | cppvNoRefSrc)) /* Copy physical to virtual */ + error = copypv((addr64_t)cp, uio->uio_iovs.iov32p->iov_base, acnt, cppvKmap | cppvPsrc | cppvNoRefSrc); + if (error) /* Copy physical to virtual */ error = EFAULT; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_END, - (int)cp, (int)iov->iov_base, cnt, 2,0); + (int)cp, (int)uio->uio_iovs.iov32p->iov_base, acnt, 2,0); } else { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_START, - (int)iov->iov_base, (int)cp, cnt, 2,0); + (int)uio->uio_iovs.iov32p->iov_base, (int)cp, acnt, 2,0); - if (error = copypv((addr64_t)((unsigned int)iov->iov_base), (addr64_t)cp, cnt, cppvKmap | cppvPsnk | cppvNoRefSrc | cppvNoModSnk)) /* Copy virtual to physical */ + error = copypv(uio->uio_iovs.iov32p->iov_base, (addr64_t)cp, acnt, cppvKmap | cppvPsnk | 
cppvNoRefSrc | cppvNoModSnk); + if (error) /* Copy virtual to physical */ error = EFAULT; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_END, - (int)iov->iov_base, (int)cp, cnt, 2,0); + (int)uio->uio_iovs.iov32p->iov_base, (int)cp, acnt, 2,0); } if (error) return (error); break; + + default: + break; } - iov->iov_base += cnt; - iov->iov_len -= cnt; - uio->uio_resid -= cnt; - uio->uio_offset += cnt; - cp += cnt; - n -= cnt; + uio_iov_base_add(uio, acnt); +#if LP64KERN + uio_iov_len_add(uio, -((int64_t)acnt)); + uio_setresid(uio, (uio_resid(uio) - ((int64_t)acnt))); +#else + uio_iov_len_add(uio, -((int)acnt)); + uio_setresid(uio, (uio_resid(uio) - ((int)acnt))); +#endif + uio->uio_offset += acnt; + cp += acnt; + n -= acnt; } return (error); } @@ -216,38 +305,46 @@ ureadc(c, uio) register int c; register struct uio *uio; { - register struct iovec *iov; - - if (uio->uio_resid <= 0) + if (uio_resid(uio) <= 0) panic("ureadc: non-positive resid"); again: if (uio->uio_iovcnt == 0) panic("ureadc: non-positive iovcnt"); - iov = uio->uio_iov; - if (iov->iov_len <= 0) { + if (uio_iov_len(uio) <= 0) { uio->uio_iovcnt--; - uio->uio_iov++; + uio_next_iov(uio); goto again; } switch (uio->uio_segflg) { + case UIO_USERSPACE32: case UIO_USERSPACE: - if (subyte(iov->iov_base, c) < 0) + if (subyte(CAST_USER_ADDR_T(uio->uio_iovs.iov32p->iov_base), c) < 0) return (EFAULT); break; + case UIO_USERSPACE64: + if (subyte((user_addr_t)uio->uio_iovs.iov64p->iov_base, c) < 0) + return (EFAULT); + break; + + case UIO_SYSSPACE32: case UIO_SYSSPACE: - *iov->iov_base = c; + *((caddr_t)uio->uio_iovs.iov32p->iov_base) = c; break; + case UIO_USERISPACE32: case UIO_USERISPACE: - if (suibyte(iov->iov_base, c) < 0) + if (suibyte(CAST_USER_ADDR_T(uio->uio_iovs.iov32p->iov_base), c) < 0) return (EFAULT); break; + + default: + break; } - iov->iov_base++; - iov->iov_len--; - uio->uio_resid--; + uio_iov_base_add(uio, 1); + uio_iov_len_add(uio, -1); + uio_setresid(uio, (uio_resid(uio) - 1)); uio->uio_offset++; return (0); } @@ -257,36 +354,43 @@ again: /* * Get next character written in by user from uio. 
*/ +int uwritec(uio) - struct uio *uio; + uio_t uio; { - register struct iovec *iov; - register int c; + register int c = 0; - if (uio->uio_resid <= 0) + if (uio_resid(uio) <= 0) return (-1); again: if (uio->uio_iovcnt <= 0) panic("uwritec: non-positive iovcnt"); - iov = uio->uio_iov; - if (iov->iov_len == 0) { - uio->uio_iov++; + + if (uio_iov_len(uio) == 0) { + uio_next_iov(uio); if (--uio->uio_iovcnt == 0) return (-1); goto again; } switch (uio->uio_segflg) { + case UIO_USERSPACE32: case UIO_USERSPACE: - c = fubyte(iov->iov_base); + c = fubyte(CAST_USER_ADDR_T(uio->uio_iovs.iov32p->iov_base)); break; + case UIO_USERSPACE64: + c = fubyte((user_addr_t)uio->uio_iovs.iov64p->iov_base); + break; + + case UIO_SYSSPACE32: case UIO_SYSSPACE: - c = *iov->iov_base & 0377; + c = *((caddr_t)uio->uio_iovs.iov32p->iov_base) & 0377; break; + case UIO_USERISPACE32: case UIO_USERISPACE: - c = fuibyte(iov->iov_base); + c = fuibyte(CAST_USER_ADDR_T(uio->uio_iovs.iov32p->iov_base)); break; default: @@ -296,9 +400,9 @@ again: } if (c < 0) return (-1); - iov->iov_base++; - iov->iov_len--; - uio->uio_resid--; + uio_iov_base_add(uio, 1); + uio_iov_len_add(uio, -1); + uio_setresid(uio, (uio_resid(uio) - 1)); uio->uio_offset++; return (c); } @@ -322,10 +426,806 @@ hashinit(elements, type, hashmask) continue; hashsize >>= 1; MALLOC(hashtbl, struct generic *, - (u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); - bzero(hashtbl, (u_long)hashsize * sizeof(*hashtbl)); - for (i = 0; i < hashsize; i++) - LIST_INIT(&hashtbl[i]); - *hashmask = hashsize - 1; + (u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK|M_ZERO); + if (hashtbl != NULL) { + for (i = 0; i < hashsize; i++) + LIST_INIT(&hashtbl[i]); + *hashmask = hashsize - 1; + } return (hashtbl); } + +/* + * uio_resid - return the residual IO value for the given uio_t + */ +user_ssize_t uio_resid( uio_t a_uio ) +{ +#if DEBUG + if (a_uio == NULL) { + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); + } +/* if (IS_VALID_UIO_SEGFLG(a_uio->uio_segflg) == 0) { */ +/* panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__); */ +/* } */ +#endif /* DEBUG */ + + /* return 0 if there are no active iovecs */ + if (a_uio == NULL) { + return( 0 ); + } + + if (UIO_IS_64_BIT_SPACE(a_uio)) { +#if 1 // LP64todo - remove this temp workaround once we go live with uio KPI + return( (user_ssize_t)a_uio->uio_resid ); +#else + return( a_uio->uio_resid_64 ); +#endif + } + return( (user_ssize_t)a_uio->uio_resid ); +} + +/* + * uio_setresid - set the residual IO value for the given uio_t + */ +void uio_setresid( uio_t a_uio, user_ssize_t a_value ) +{ +#if DEBUG + if (a_uio == NULL) { + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); + } +/* if (IS_VALID_UIO_SEGFLG(a_uio->uio_segflg) == 0) { */ +/* panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__); */ +/* } */ +#endif /* DEBUG */ + + if (a_uio == NULL) { + return; + } + + if (UIO_IS_64_BIT_SPACE(a_uio)) { +#if 1 // LP64todo - remove this temp workaround once we go live with uio KPI + a_uio->uio_resid = (int)a_value; +#else + a_uio->uio_resid_64 = a_value; +#endif + } + else { + a_uio->uio_resid = (int)a_value; + } + return; +} + +#if 0 // obsolete +/* + * uio_proc_t - return the proc_t for the given uio_t + * WARNING - This call is going away. Find another way to get the proc_t!! 
+ */ +__private_extern__ proc_t uio_proc_t( uio_t a_uio ) +{ +#if LP64_DEBUG + if (a_uio == NULL) { + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); + } +#endif /* LP64_DEBUG */ + + /* return NULL if there is no uio_t */ + if (a_uio == NULL) { + return( NULL ); + } + return( a_uio->uio_procp ); +} + +/* + * uio_setproc_t - set the proc_t for the given uio_t + * WARNING - This call is going away. + */ +__private_extern__ void uio_setproc_t( uio_t a_uio, proc_t a_proc_t ) +{ + if (a_uio == NULL) { +#if LP64_DEBUG + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); +#endif /* LP64_DEBUG */ + return; + } + + a_uio->uio_procp = a_proc_t; + return; +} +#endif // obsolete + +/* + * uio_curriovbase - return the base address of the current iovec associated + * with the given uio_t. May return 0. + */ +user_addr_t uio_curriovbase( uio_t a_uio ) +{ +#if LP64_DEBUG + if (a_uio == NULL) { + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); + } +#endif /* LP64_DEBUG */ + + if (a_uio == NULL || a_uio->uio_iovcnt < 1) { + return(0); + } + + if (UIO_IS_64_BIT_SPACE(a_uio)) { + return(a_uio->uio_iovs.uiovp->iov_base); + } + return((user_addr_t)((uintptr_t)a_uio->uio_iovs.kiovp->iov_base)); + +} + +/* + * uio_curriovlen - return the length value of the current iovec associated + * with the given uio_t. + */ +user_size_t uio_curriovlen( uio_t a_uio ) +{ +#if LP64_DEBUG + if (a_uio == NULL) { + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); + } +#endif /* LP64_DEBUG */ + + if (a_uio == NULL || a_uio->uio_iovcnt < 1) { + return(0); + } + + if (UIO_IS_64_BIT_SPACE(a_uio)) { + return(a_uio->uio_iovs.uiovp->iov_len); + } + return((user_size_t)a_uio->uio_iovs.kiovp->iov_len); +} + +/* + * uio_setcurriovlen - set the length value of the current iovec associated + * with the given uio_t.
+ */ +__private_extern__ void uio_setcurriovlen( uio_t a_uio, user_size_t a_value ) +{ +#if LP64_DEBUG + if (a_uio == NULL) { + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); + } +#endif /* LP64_DEBUG */ + + if (a_uio == NULL) { + return; + } + + if (UIO_IS_64_BIT_SPACE(a_uio)) { + a_uio->uio_iovs.uiovp->iov_len = a_value; + } + else { +#if LP64_DEBUG + if (a_value > 0xFFFFFFFFull) { + panic("%s :%d - invalid a_value\n", __FILE__, __LINE__); + } +#endif /* LP64_DEBUG */ + a_uio->uio_iovs.kiovp->iov_len = (size_t)a_value; + } + return; +} + +/* + * uio_iovcnt - return count of active iovecs for the given uio_t + */ +int uio_iovcnt( uio_t a_uio ) +{ +#if LP64_DEBUG + if (a_uio == NULL) { + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); + } +#endif /* LP64_DEBUG */ + + if (a_uio == NULL) { + return(0); + } + + return( a_uio->uio_iovcnt ); +} + +/* + * uio_offset - return the current offset value for the given uio_t + */ +off_t uio_offset( uio_t a_uio ) +{ +#if LP64_DEBUG + if (a_uio == NULL) { + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); + } +#endif /* LP64_DEBUG */ + + if (a_uio == NULL) { + return(0); + } + return( a_uio->uio_offset ); +} + +/* + * uio_setoffset - set the current offset value for the given uio_t + */ +void uio_setoffset( uio_t a_uio, off_t a_offset ) +{ +#if LP64_DEBUG + if (a_uio == NULL) { + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); + } +#endif /* LP64_DEBUG */ + + if (a_uio == NULL) { + return; + } + a_uio->uio_offset = a_offset; + return; +} + +/* + * uio_rw - return the read / write flag for the given uio_t + */ +int uio_rw( uio_t a_uio ) +{ +#if LP64_DEBUG + if (a_uio == NULL) { + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); + } +#endif /* LP64_DEBUG */ + + if (a_uio == NULL) { + return(-1); + } + return( a_uio->uio_rw ); +} + +/* + * uio_setrw - set the read / write flag for the given uio_t + */ +void uio_setrw( uio_t a_uio, int a_value ) +{ + if (a_uio == NULL) { +#if LP64_DEBUG + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); +#endif /* LP64_DEBUG */ + return; + } + +#if LP64_DEBUG + if (!(a_value == UIO_READ || a_value == UIO_WRITE)) { + panic("%s :%d - invalid a_value\n", __FILE__, __LINE__); + } +#endif /* LP64_DEBUG */ + + if (a_value == UIO_READ || a_value == UIO_WRITE) { + a_uio->uio_rw = a_value; + } + return; +} + +/* + * uio_isuserspace - return non zero value if the address space + * flag is for a user address space (could be 32 or 64 bit). + */ +int uio_isuserspace( uio_t a_uio ) +{ + if (a_uio == NULL) { +#if LP64_DEBUG + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); +#endif /* LP64_DEBUG */ + return(0); + } + + if (UIO_SEG_IS_USER_SPACE(a_uio->uio_segflg)) { + return( 1 ); + } + return( 0 ); +} + + +/* + * uio_create - create an uio_t. + * Space is allocated to hold up to a_iovcount number of iovecs. The uio_t + * is not fully initialized until all iovecs are added using uio_addiov calls. + * a_iovcount is the maximum number of iovecs you may add. 
+ */ +uio_t uio_create( int a_iovcount, /* number of iovecs */ + off_t a_offset, /* current offset */ + int a_spacetype, /* type of address space */ + int a_iodirection ) /* read or write flag */ +{ + void * my_buf_p; + int my_size; + uio_t my_uio; + + my_size = sizeof(struct uio) + (sizeof(struct user_iovec) * a_iovcount); + my_buf_p = kalloc(my_size); + my_uio = uio_createwithbuffer( a_iovcount, + a_offset, + a_spacetype, + a_iodirection, + my_buf_p, + my_size ); + if (my_uio != 0) { + /* leave a note that we allocated this uio_t */ + my_uio->uio_flags |= UIO_FLAGS_WE_ALLOCED; +#if DEBUG + hw_atomic_add(&uio_t_count, 1); +#endif + } + + return( my_uio ); +} + + +/* + * uio_createwithbuffer - create a uio_t. + * Create a uio_t using the given buffer. The uio_t + * is not fully initialized until all iovecs are added using uio_addiov calls. + * a_iovcount is the maximum number of iovecs you may add. + * This call may fail if the given buffer is not large enough. + */ +__private_extern__ uio_t + uio_createwithbuffer( int a_iovcount, /* number of iovecs */ + off_t a_offset, /* current offset */ + int a_spacetype, /* type of address space */ + int a_iodirection, /* read or write flag */ + void *a_buf_p, /* pointer to a uio_t buffer */ + int a_buffer_size ) /* size of uio_t buffer */ +{ + uio_t my_uio = (uio_t) a_buf_p; + int my_size; + + my_size = sizeof(struct uio) + (sizeof(struct user_iovec) * a_iovcount); + if (a_buffer_size < my_size) { +#if DEBUG + panic("%s :%d - a_buffer_size is too small\n", __FILE__, __LINE__); +#endif /* DEBUG */ + return( NULL ); + } + my_size = a_buffer_size; + +#if DEBUG + if (my_uio == 0) { + panic("%s :%d - could not allocate uio_t\n", __FILE__, __LINE__); + } + if (!IS_VALID_UIO_SEGFLG(a_spacetype)) { + panic("%s :%d - invalid address space type\n", __FILE__, __LINE__); + } + if (!(a_iodirection == UIO_READ || a_iodirection == UIO_WRITE)) { + panic("%s :%d - invalid IO direction flag\n", __FILE__, __LINE__); + } + if (a_iovcount > UIO_MAXIOV) { + panic("%s :%d - invalid a_iovcount\n", __FILE__, __LINE__); + } +#endif /* DEBUG */ + + bzero(my_uio, my_size); + my_uio->uio_size = my_size; + + /* we use uio_segflg to indicate if the uio_t is the new format or */ + /* old (pre LP64 support) legacy format */ + switch (a_spacetype) { + case UIO_USERSPACE: + my_uio->uio_segflg = UIO_USERSPACE32; + break; + case UIO_SYSSPACE: + my_uio->uio_segflg = UIO_SYSSPACE32; + break; + case UIO_PHYS_USERSPACE: + my_uio->uio_segflg = UIO_PHYS_USERSPACE32; + break; + case UIO_PHYS_SYSSPACE: + my_uio->uio_segflg = UIO_PHYS_SYSSPACE32; + break; + default: + my_uio->uio_segflg = a_spacetype; + break; + } + + if (a_iovcount > 0) { + my_uio->uio_iovs.uiovp = (struct user_iovec *) + (((uint8_t *)my_uio) + sizeof(struct uio)); + } + else { + my_uio->uio_iovs.uiovp = NULL; + } + + my_uio->uio_max_iovs = a_iovcount; + my_uio->uio_offset = a_offset; + my_uio->uio_rw = a_iodirection; + my_uio->uio_flags = UIO_FLAGS_INITED; + + return( my_uio ); +} + +/* + * uio_spacetype - return the address space type for the given uio_t + */ +int uio_spacetype( uio_t a_uio ) +{ + if (a_uio == NULL) { +#if LP64_DEBUG + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); +#endif /* LP64_DEBUG */ + return(-1); + } + + return( a_uio->uio_segflg ); +} + +/* + * uio_iovsaddr - get the address of the iovec array for the given uio_t. + * This returns the location of the iovecs within the uio.
+ * NOTE - for compatibility mode we just return the current value in uio_iovs + * which will increase as the IO is completed and is NOT embedded within the + * uio, it is a separate array of one or more iovecs. + */ +struct user_iovec * uio_iovsaddr( uio_t a_uio ) +{ + struct user_iovec * my_addr; + + if (a_uio == NULL) { + return(NULL); + } + + if (a_uio->uio_segflg == UIO_USERSPACE || a_uio->uio_segflg == UIO_SYSSPACE) { + /* we need this for compatibility mode. */ + my_addr = (struct user_iovec *) a_uio->uio_iovs.iovp; + } + else { + my_addr = (struct user_iovec *) (((uint8_t *)a_uio) + sizeof(struct uio)); + } + return(my_addr); +} + +/* + * uio_reset - reset a uio_t. + * Reset the given uio_t to initial values. The uio_t is not fully initialized + * until all iovecs are added using uio_addiov calls. + * The a_iovcount value passed to uio_create is the maximum number of + * iovecs you may add. + */ +void uio_reset( uio_t a_uio, + off_t a_offset, /* current offset */ + int a_spacetype, /* type of address space */ + int a_iodirection ) /* read or write flag */ +{ + vm_size_t my_size; + int my_max_iovs; + u_int32_t my_old_flags; + +#if LP64_DEBUG + if (a_uio == NULL) { + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); + } + if (!IS_VALID_UIO_SEGFLG(a_spacetype)) { + panic("%s :%d - invalid address space type\n", __FILE__, __LINE__); + } + if (!(a_iodirection == UIO_READ || a_iodirection == UIO_WRITE)) { + panic("%s :%d - invalid IO direction flag\n", __FILE__, __LINE__); + } +#endif /* LP64_DEBUG */ + + if (a_uio == NULL) { + return; + } + + my_size = a_uio->uio_size; + my_old_flags = a_uio->uio_flags; + my_max_iovs = a_uio->uio_max_iovs; + bzero(a_uio, my_size); + a_uio->uio_size = my_size; + a_uio->uio_segflg = a_spacetype; + if (my_max_iovs > 0) { + a_uio->uio_iovs.uiovp = (struct user_iovec *) + (((uint8_t *)a_uio) + sizeof(struct uio)); + } + else { + a_uio->uio_iovs.uiovp = NULL; + } + a_uio->uio_max_iovs = my_max_iovs; + a_uio->uio_offset = a_offset; + a_uio->uio_rw = a_iodirection; + a_uio->uio_flags = my_old_flags; + + return; +} + +/* + * uio_free - free a uio_t allocated via uio_create. This also frees all + * associated iovecs. + */ +void uio_free( uio_t a_uio ) +{ +#if DEBUG + if (a_uio == NULL) { + panic("%s :%d - passing NULL uio_t\n", __FILE__, __LINE__); + } +#endif /* DEBUG */ + + if (a_uio != NULL && (a_uio->uio_flags & UIO_FLAGS_WE_ALLOCED) != 0) { +#if DEBUG + if ((int)(hw_atomic_sub(&uio_t_count, 1)) < 0) { + panic("%s :%d - uio_t_count has gone negative\n", __FILE__, __LINE__); + } +#endif + kfree(a_uio, a_uio->uio_size); + } + + +} + +/* + * uio_addiov - add an iovec to the given uio_t. You may call this up to + * the a_iovcount number that was passed to uio_create. This call will + * increment the residual IO count as iovecs are added to the uio_t. + * Returns 0 if the add was successful, else non-zero.
+ */ +int uio_addiov( uio_t a_uio, user_addr_t a_baseaddr, user_size_t a_length ) +{ + int i; + + if (a_uio == NULL) { +#if DEBUG + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); +#endif /* LP64_DEBUG */ + return(-1); + } + + if (UIO_IS_64_BIT_SPACE(a_uio)) { + for ( i = 0; i < a_uio->uio_max_iovs; i++ ) { + if (a_uio->uio_iovs.uiovp[i].iov_len == 0 && a_uio->uio_iovs.uiovp[i].iov_base == 0) { + a_uio->uio_iovs.uiovp[i].iov_len = a_length; + a_uio->uio_iovs.uiovp[i].iov_base = a_baseaddr; + a_uio->uio_iovcnt++; +#if 1 // LP64todo - remove this temp workaround once we go live with uio KPI + a_uio->uio_resid += a_length; +#else + a_uio->uio_resid_64 += a_length; +#endif + return( 0 ); + } + } + } + else { + for ( i = 0; i < a_uio->uio_max_iovs; i++ ) { + if (a_uio->uio_iovs.kiovp[i].iov_len == 0 && a_uio->uio_iovs.kiovp[i].iov_base == 0) { + a_uio->uio_iovs.kiovp[i].iov_len = (u_int32_t)a_length; + a_uio->uio_iovs.kiovp[i].iov_base = (u_int32_t)((uintptr_t)a_baseaddr); + a_uio->uio_iovcnt++; + a_uio->uio_resid += a_length; + return( 0 ); + } + } + } + + return( -1 ); +} + +/* + * uio_getiov - get iovec data associated with the given uio_t. Use + * a_index to iterate over each iovec (0 to (uio_iovcnt(uio_t) - 1)). + * a_baseaddr_p and a_length_p may be NULL. + * returns -1 when a_index is >= uio_t.uio_iovcnt or invalid uio_t. + * returns 0 when data is returned. + */ +int uio_getiov( uio_t a_uio, + int a_index, + user_addr_t * a_baseaddr_p, + user_size_t * a_length_p ) +{ + if (a_uio == NULL) { +#if DEBUG + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); +#endif /* DEBUG */ + return(-1); + } + if ( a_index < 0 || a_index >= a_uio->uio_iovcnt) { + return(-1); + } + + if (UIO_IS_64_BIT_SPACE(a_uio)) { + if (a_baseaddr_p != NULL) { + *a_baseaddr_p = a_uio->uio_iovs.uiovp[a_index].iov_base; + } + if (a_length_p != NULL) { + *a_length_p = a_uio->uio_iovs.uiovp[a_index].iov_len; + } + } + else { + if (a_baseaddr_p != NULL) { + *a_baseaddr_p = a_uio->uio_iovs.kiovp[a_index].iov_base; + } + if (a_length_p != NULL) { + *a_length_p = a_uio->uio_iovs.kiovp[a_index].iov_len; + } + } + + return( 0 ); +} + +/* + * uio_calculateresid - runs through all iovecs associated with this + * uio_t and calculates (and sets) the residual IO count. + */ +__private_extern__ void uio_calculateresid( uio_t a_uio ) +{ + int i; + + if (a_uio == NULL) { +#if LP64_DEBUG + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); +#endif /* LP64_DEBUG */ + return; + } + + a_uio->uio_iovcnt = 0; + if (UIO_IS_64_BIT_SPACE(a_uio)) { +#if 1 // LP64todo - remove this temp workaround once we go live with uio KPI + a_uio->uio_resid = 0; +#else + a_uio->uio_resid_64 = 0; +#endif + for ( i = 0; i < a_uio->uio_max_iovs; i++ ) { + if (a_uio->uio_iovs.uiovp[i].iov_len != 0 && a_uio->uio_iovs.uiovp[i].iov_base != 0) { + a_uio->uio_iovcnt++; +#if 1 // LP64todo - remove this temp workaround once we go live with uio KPI + a_uio->uio_resid += a_uio->uio_iovs.uiovp[i].iov_len; +#else + a_uio->uio_resid_64 += a_uio->uio_iovs.uiovp[i].iov_len; +#endif + } + } + } + else { + a_uio->uio_resid = 0; + for ( i = 0; i < a_uio->uio_max_iovs; i++ ) { + if (a_uio->uio_iovs.kiovp[i].iov_len != 0 && a_uio->uio_iovs.kiovp[i].iov_base != 0) { + a_uio->uio_iovcnt++; + a_uio->uio_resid += a_uio->uio_iovs.kiovp[i].iov_len; + } + } + } + return; +} + +/* + * uio_update - update the given uio_t for a_count of completed IO. 
+ * This call decrements the current iovec length and residual IO value + * and increments the current iovec base address and offset value. + * If the current iovec length is 0 then advance to the next + * iovec (if any). + */ +void uio_update( uio_t a_uio, user_size_t a_count ) +{ +#if LP64_DEBUG + if (a_uio == NULL) { + panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); + } + if (UIO_IS_32_BIT_SPACE(a_uio) && a_count > 0xFFFFFFFFull) { + panic("%s :%d - invalid count value \n", __FILE__, __LINE__); + } +#endif /* LP64_DEBUG */ + + if (a_uio == NULL || a_uio->uio_iovcnt < 1) { + return; + } + + if (UIO_IS_64_BIT_SPACE(a_uio)) { + if (a_count > a_uio->uio_iovs.uiovp->iov_len) { + a_uio->uio_iovs.uiovp->iov_base += a_uio->uio_iovs.uiovp->iov_len; + a_uio->uio_iovs.uiovp->iov_len = 0; + } + else { + a_uio->uio_iovs.uiovp->iov_base += a_count; + a_uio->uio_iovs.uiovp->iov_len -= a_count; + } +#if 1 // LP64todo - remove this temp workaround once we go live with uio KPI + if (a_uio->uio_resid < 0) { + a_uio->uio_resid = 0; + } + if (a_count > (user_size_t)a_uio->uio_resid) { + a_uio->uio_offset += a_uio->uio_resid; + a_uio->uio_resid = 0; + } + else { + a_uio->uio_offset += a_count; + a_uio->uio_resid -= a_count; + } +#else + if (a_uio->uio_resid_64 < 0) { + a_uio->uio_resid_64 = 0; + } + if (a_count > (user_size_t)a_uio->uio_resid_64) { + a_uio->uio_offset += a_uio->uio_resid_64; + a_uio->uio_resid_64 = 0; + } + else { + a_uio->uio_offset += a_count; + a_uio->uio_resid_64 -= a_count; + } +#endif // LP64todo + + /* advance to next iovec if current one is totally consumed */ + while (a_uio->uio_iovcnt > 0 && a_uio->uio_iovs.uiovp->iov_len == 0) { + a_uio->uio_iovcnt--; + if (a_uio->uio_iovcnt > 0) { + a_uio->uio_iovs.uiovp++; + } + } + } + else { + if (a_count > a_uio->uio_iovs.kiovp->iov_len) { + a_uio->uio_iovs.kiovp->iov_base += a_uio->uio_iovs.kiovp->iov_len; + a_uio->uio_iovs.kiovp->iov_len = 0; + } + else { + a_uio->uio_iovs.kiovp->iov_base += a_count; + a_uio->uio_iovs.kiovp->iov_len -= a_count; + } + if (a_uio->uio_resid < 0) { + a_uio->uio_resid = 0; + } + if (a_count > (user_size_t)a_uio->uio_resid) { + a_uio->uio_offset += a_uio->uio_resid; + a_uio->uio_resid = 0; + } + else { + a_uio->uio_offset += a_count; + a_uio->uio_resid -= a_count; + } + + /* advance to next iovec if current one is totally consumed */ + while (a_uio->uio_iovcnt > 0 && a_uio->uio_iovs.kiovp->iov_len == 0) { + a_uio->uio_iovcnt--; + if (a_uio->uio_iovcnt > 0) { + a_uio->uio_iovs.kiovp++; + } + } + } + return; +} + + +/* + * uio_duplicate - allocate a new uio and make a copy of the given uio_t. + * may return NULL. 
+ */ +uio_t uio_duplicate( uio_t a_uio ) +{ + uio_t my_uio; + int i; + + if (a_uio == NULL) { + return(NULL); + } + + my_uio = (uio_t) kalloc(a_uio->uio_size); + if (my_uio == 0) { + panic("%s :%d - allocation failed\n", __FILE__, __LINE__); + } + + bcopy((void *)a_uio, (void *)my_uio, a_uio->uio_size); + /* need to set our iovec pointer to point to first active iovec */ + if (my_uio->uio_max_iovs > 0) { + my_uio->uio_iovs.uiovp = (struct user_iovec *) + (((uint8_t *)my_uio) + sizeof(struct uio)); + + /* advance to first nonzero iovec */ + if (my_uio->uio_iovcnt > 0) { + for ( i = 0; i < my_uio->uio_max_iovs; i++ ) { + if (UIO_IS_64_BIT_SPACE(a_uio)) { + if (my_uio->uio_iovs.uiovp->iov_len != 0) { + break; + } + my_uio->uio_iovs.uiovp++; + } + else { + if (my_uio->uio_iovs.kiovp->iov_len != 0) { + break; + } + my_uio->uio_iovs.kiovp++; + } + } + } + } + + return(my_uio); +} + diff --git a/bsd/kern/kern_symfile.c b/bsd/kern/kern_symfile.c index 9091fca23..ed56bd1cd 100644 --- a/bsd/kern/kern_symfile.c +++ b/bsd/kern/kern_symfile.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -23,17 +23,22 @@ * * File: bsd/kern/kern_symfile.c * - * This file contains creates a dummy symbol file for mach_kernel based on - * the symbol table information passed by the SecondaryLoader/PlatformExpert. - * This allows us to correctly link other executables (drivers, etc) against the - * the kernel in cases where the kernel image on the root device does not match - * the live kernel. This can occur during net-booting where the actual kernel - * image is obtained from the network via tftp rather than the root - * device. + * This file creates a dummy symbol file for mach_kernel + * based on the symbol table information passed by the + * SecondaryLoader/PlatformExpert. This allows us to correctly + * link other executables (drivers, etc) against the kernel in + * cases where the kernel image on the root device does not match + * the live kernel. This can occur during net-booting where the + * actual kernel image is obtained from the network via tftp rather + * than the root device. * - * If a symbol table is available, then the file /mach.sym will be created - * containing a Mach Header and a LC_SYMTAB load command followed by the - * the symbol table data for mach_kernel. + * If a symbol table is available, then the file /mach.sym will be + * created containing a Mach Header and a LC_SYMTAB load command + * followed by the symbol table data for mach_kernel. + * + * NOTE: This file supports only 32 bit kernels at the present time; + * adding support for 64 bit kernels is possible, but is not + * necessary at this time.
* * HISTORY * @@ -47,21 +52,25 @@ #include <sys/signalvar.h> #include <sys/resourcevar.h> #include <sys/namei.h> -#include <sys/vnode.h> -#include <sys/proc.h> +#include <sys/vnode_internal.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/timeb.h> #include <sys/times.h> -#include <sys/buf.h> #include <sys/acct.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/uio.h> #include <sys/kernel.h> #include <sys/stat.h> +#include <sys/disk.h> +#include <sys/conf.h> #include <mach-o/loader.h> #include <mach-o/nlist.h> +#include <kern/kalloc.h> #include <vm/vm_kern.h> +#include <pexpert/pexpert.h> extern unsigned char rootdevice[]; extern struct mach_header _mh_execute_header; @@ -73,15 +82,15 @@ extern int IODTGetLoaderInfo(char *key, void **infoAddr, int *infoSize); extern void IODTFreeLoaderInfo(char *key, void *infoAddr, int infoSize); /* - * + * Can only operate against currently running 32 bit mach_kernel */ -static int output_kernel_symbols(struct proc *p) +static int +output_kernel_symbols(struct proc *p) { struct vnode *vp; - struct pcred *pcred = p->p_cred; - struct ucred *cred = pcred->pc_ucred; - struct nameidata nd; - struct vattr vattr; + kauth_cred_t cred = p->p_ucred; /* XXX */ + struct vnode_attr va; + struct vfs_context context; struct load_command *cmd; struct mach_header *orig_mh, *mh; struct segment_command *orig_ds, *orig_ts, *orig_le, *sg; @@ -90,9 +99,9 @@ static int output_kernel_symbols(struct proc *p) struct nlist *sym; vm_size_t orig_mhsize, orig_st_size; vm_offset_t header; - vm_size_t header_size; + vm_size_t header_size = 0; /* out: protected by header */ int error, error1; - int i, j; + unsigned int i, j; caddr_t addr; vm_offset_t offset; int rc_mh, rc_sc; @@ -117,28 +126,29 @@ static int output_kernel_symbols(struct proc *p) IODTFreeLoaderInfo("Kernel-__SYMTAB", (void *)orig_st, round_page_32(orig_st_size)); - if (pcred->p_svuid != pcred->p_ruid || pcred->p_svgid != pcred->p_rgid) + if (cred->cr_svuid != cred->cr_ruid || cred->cr_svgid != cred->cr_rgid) goto out; // Check to see if the root is 'e' or 'n', is this a test for network? if (rootdevice[0] == 'e' && rootdevice[1] == 'n') goto out; - NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "mach.sym", p); - if((error = vn_open(&nd, O_CREAT | FWRITE, S_IRUSR | S_IRGRP | S_IROTH))) goto out; + context.vc_proc = p; + context.vc_ucred = cred; + + if ((error = vnode_open("mach.sym", (O_CREAT | FWRITE), (S_IRUSR | S_IRGRP | S_IROTH), 0, &vp, &context))) + goto out; - vp = nd.ni_vp; - /* Don't dump to non-regular files or files with links. 
*/ error = EFAULT; - if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred, p) - || vattr.va_nlink != 1) + VATTR_INIT(&va); + VATTR_WANTED(&va, va_nlink); + if ((vp->v_type != VREG) || vnode_getattr(vp, &va, &context) || (va.va_nlink != 1)) goto out; - VATTR_NULL(&vattr); - vattr.va_size = 0; - VOP_LEASE(vp, p, cred, LEASE_WRITE); - VOP_SETATTR(vp, &vattr, cred, p); + VATTR_INIT(&va); /* better to do it here than waste more stack in vnode_getsize */ + VATTR_SET(&va, va_data_size, 0); + vnode_setattr(vp, &va, &context); p->p_acflag |= ACORE; // If the file type is MH_EXECUTE then this must be a kernel @@ -149,14 +159,14 @@ static int output_kernel_symbols(struct proc *p) cmd = (struct load_command *) &orig_mh[1]; for (i = 0; i < orig_mh->ncmds; i++) { if (cmd->cmd == LC_SEGMENT) { - struct segment_command *sg = (struct segment_command *) cmd; + struct segment_command *orig_sg = (struct segment_command *) cmd; - if (!strcmp(SEG_TEXT, sg->segname)) - orig_ts = sg; - else if (!strcmp(SEG_DATA, sg->segname)) - orig_ds = sg; - else if (!strcmp(SEG_LINKEDIT, sg->segname)) - orig_le = sg; + if (!strcmp(SEG_TEXT, orig_sg->segname)) + orig_ts = orig_sg; + else if (!strcmp(SEG_DATA, orig_sg->segname)) + orig_ds = orig_sg; + else if (!strcmp(SEG_LINKEDIT, orig_sg->segname)) + orig_le = orig_sg; } else if (cmd->cmd == LC_SYMTAB) orig_st = (struct symtab_command *) cmd; @@ -183,7 +193,7 @@ static int output_kernel_symbols(struct proc *p) + orig_ds->cmdsize + sizeof(struct symtab_command); - (void) kmem_alloc_wired(kernel_map, + (void) kmem_alloc(kernel_map, (vm_offset_t *) &header, (vm_size_t) header_size); if (header) @@ -204,7 +214,7 @@ static int output_kernel_symbols(struct proc *p) mh->flags = orig_mh->flags; // Initialise the current file offset and addr - offset = round_page_32(header_size); + offset = round_page(header_size); addr = (caddr_t) const_text->addr; // Load address of __TEXT,__const /* @@ -217,7 +227,7 @@ static int output_kernel_symbols(struct proc *p) sg->vmaddr = (unsigned long) addr; sg->vmsize = const_text->size; sg->fileoff = 0; - sg->filesize = const_text->size + round_page_32(header_size); + sg->filesize = const_text->size + round_page(header_size); sg->maxprot = 0; sg->initprot = 0; sg->flags = 0; @@ -234,7 +244,7 @@ static int output_kernel_symbols(struct proc *p) const_text = se; } } - offset = round_page_32((vm_address_t) offset); + offset = round_page(offset); // Now copy of the __DATA segment load command, the image need // not be stored to disk nobody needs it, yet! @@ -255,7 +265,7 @@ static int output_kernel_symbols(struct proc *p) se->offset = offset; se->nreloc = 0; } - offset = round_page_32(offset); + offset = round_page(offset); /* @@ -285,7 +295,7 @@ static int output_kernel_symbols(struct proc *p) * Write out the load commands at the beginning of the file. 
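 *	Given the offsets computed above, the resulting /mach.sym is
 *	laid out approximately as:
 *
 *	  [0, round_page(header_size))    mach_header + LC_SEGMENT + LC_SYMTAB
 *	  [round_page(header_size), ...)  the __TEXT,__const section data
 *	  [st->symoff, ...)               nlist array, then the string table
 *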
*/ error = vn_rdwr(UIO_WRITE, vp, (caddr_t) mh, header_size, (off_t) 0, - UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *) 0, p); + UIO_SYSSPACE32, IO_NODELOCKED|IO_UNIT, cred, (int *) 0, p); if (error) goto out; @@ -294,7 +304,7 @@ */ error = vn_rdwr(UIO_WRITE, vp, (caddr_t) const_text->addr, const_text->size, const_text->offset, - UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *) 0, p); + UIO_SYSSPACE32, IO_NODELOCKED|IO_UNIT, cred, (int *) 0, p); if (error) goto out; @@ -304,17 +314,13 @@ offset = st->nsyms * sizeof(struct nlist) + st->strsize; // symtab size error = vn_rdwr(UIO_WRITE, vp, (caddr_t) orig_le->vmaddr, offset, st->symoff, - UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *) 0, p); - if (error) - goto out; - + UIO_SYSSPACE32, IO_NODELOCKED|IO_UNIT, cred, (int *) 0, p); out: if (header) kmem_free(kernel_map, header, header_size); if (vp) { - VOP_UNLOCK(vp, 0, p); - error1 = vn_close(vp, FWRITE, cred, p); + error1 = vnode_close(vp, FWRITE, &context); if (!error) error = error1; } @@ -334,3 +340,4 @@ int get_kernel_symfile(struct proc *p, char **symfile) return error_code; } + diff --git a/bsd/kern/kern_synch.c b/bsd/kern/kern_synch.c index 97b35818c..9f33c4ce1 100644 --- a/bsd/kern/kern_synch.c +++ b/bsd/kern/kern_synch.c @@ -28,12 +28,11 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/user.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/vnode.h> #include <sys/kernel.h> -#include <sys/buf.h> #include <machine/spl.h> @@ -48,6 +47,8 @@ #include <kern/task.h> #include <mach/time_value.h> +#include <kern/lock.h> + #if KTRACE #include <sys/uio.h> @@ -55,19 +56,22 @@ #endif static void -_sleep_continue(void) +_sleep_continue( + void *parameter, + wait_result_t wresult) { - register struct proc *p; - register thread_t self = current_act(); + register struct proc *p = current_proc(); + register thread_t self = current_thread(); struct uthread * ut; int sig, catch; int error = 0; + int dropmutex; ut = get_bsdthread_info(self); - catch = ut->uu_pri & PCATCH; - p = current_proc(); + catch = ut->uu_pri & PCATCH; + dropmutex = ut->uu_pri & PDROP; - switch (get_thread_waitresult(self)) { + switch (wresult) { case THREAD_TIMED_OUT: error = EWOULDBLOCK; break; @@ -94,7 +98,10 @@ _sleep_continue(void) if (thread_should_abort(self)) { error = EINTR; } - } + } else if( (ut->uu_flag & ( UT_CANCELDISABLE | UT_CANCEL | UT_CANCELED)) == UT_CANCEL) { + /* due to thread cancel */ + error = EINTR; + } } else error = EINTR; break; @@ -103,13 +110,12 @@ _sleep_continue(void) if (error == EINTR || error == ERESTART) act_set_astbsd(self); - if (ut->uu_timo) - thread_cancel_timer(); - #if KTRACE if (KTRPOINT(p, KTR_CSW)) - ktrcsw(p->p_tracep, 0, 0, -1); + ktrcsw(p->p_tracep, 0, 0); #endif + if (ut->uu_mtx && !dropmutex) + lck_mtx_lock(ut->uu_mtx); unix_syscall_return((*ut->uu_continuation)(error)); } @@ -126,104 +132,116 @@ _sleep_continue(void) * Callers of this routine must be prepared for * premature return, and check that the reason for * sleeping has gone away. + * + * If msleep was the entry point, then we have a mutex to deal with + * + * The mutex is unlocked before the caller is blocked, and + * relocked before msleep returns unless the priority includes the PDROP + * flag... if PDROP is specified, _sleep returns with the mutex unlocked + * regardless of whether it actually blocked or not.
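+ *
+ * A typical msleep() caller therefore looks like this (hypothetical
+ * queue and mutex names; a sketch of the contract above, not code
+ * from this file):
+ *
+ *	lck_mtx_lock(q_mtx);
+ *	while (q_head == NULL)
+ *		msleep(&q_head, q_mtx, PZERO, "qwait", NULL);
+ *	... q_mtx is held again here, so the predicate is stable ...
+ *	lck_mtx_unlock(q_mtx);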
*/ static int _sleep( caddr_t chan, - int pri, - char *wmsg, + int pri, + const char *wmsg, u_int64_t abstime, - int (*continuation)(int)) + int (*continuation)(int), + lck_mtx_t *mtx) { register struct proc *p; - register thread_t self = current_act(); + register thread_t self = current_thread(); struct uthread * ut; int sig, catch = pri & PCATCH; - int sigttblock = pri & PTTYBLOCK; + int dropmutex = pri & PDROP; int wait_result; int error = 0; - spl_t s; - - s = splhigh(); ut = get_bsdthread_info(self); - + p = current_proc(); #if KTRACE if (KTRPOINT(p, KTR_CSW)) - ktrcsw(p->p_tracep, 1, 0, -1); + ktrcsw(p->p_tracep, 1, 0); #endif p->p_priority = pri & PRIMASK; - - if (chan != NULL) - assert_wait_prim(chan, NULL, abstime, - (catch) ? THREAD_ABORTSAFE : THREAD_UNINT); - else - if (abstime != 0) - thread_set_timer_deadline(abstime); - - /* - * We start our timeout - * before calling CURSIG, as we could stop there, and a wakeup - * or a SIGCONT (or both) could occur while we were stopped. - * A SIGCONT would cause us to be marked as SSLEEP - * without resuming us, thus we must be ready for sleep - * when CURSIG is called. If the wakeup happens while we're - * stopped, p->p_wchan will be 0 upon return from CURSIG. - */ - if (catch) { - if (SHOULDissignal(p,ut)) { - if (sig = CURSIG(p)) { - if (clear_wait(self, THREAD_INTERRUPTED) == KERN_FAILURE) - goto block; - /* if SIGTTOU or SIGTTIN then block till SIGCONT */ - if (sigttblock && ((sig == SIGTTOU) || (sig == SIGTTIN))) { - p->p_flag |= P_TTYSLEEP; - /* reset signal bits */ - clear_procsiglist(p, sig); - assert_wait(&p->p_siglist, THREAD_ABORTSAFE); - /* assert wait can block and SIGCONT should be checked */ - if (p->p_flag & P_TTYSLEEP) - thread_block(THREAD_CONTINUE_NULL); - /* return with success */ - error = 0; + p->p_stats->p_ru.ru_nvcsw++; + + if (mtx != NULL && chan != NULL && (thread_continue_t)continuation == THREAD_CONTINUE_NULL) { + + if (abstime) + wait_result = lck_mtx_sleep_deadline(mtx, (dropmutex) ? LCK_SLEEP_UNLOCK : 0, + chan, (catch) ? THREAD_ABORTSAFE : THREAD_UNINT, abstime); + else + wait_result = lck_mtx_sleep(mtx, (dropmutex) ? LCK_SLEEP_UNLOCK : 0, + chan, (catch) ? THREAD_ABORTSAFE : THREAD_UNINT); + } + else { + if (chan != NULL) + assert_wait_deadline(chan, (catch) ? 
THREAD_ABORTSAFE : THREAD_UNINT, abstime); + if (mtx) + lck_mtx_unlock(mtx); + if (catch) { + if (SHOULDissignal(p,ut)) { + if (sig = CURSIG(p)) { + if (clear_wait(self, THREAD_INTERRUPTED) == KERN_FAILURE) + goto block; + /* if SIGTTOU or SIGTTIN then block till SIGCONT */ + if ((pri & PTTYBLOCK) && ((sig == SIGTTOU) || (sig == SIGTTIN))) { + p->p_flag |= P_TTYSLEEP; + /* reset signal bits */ + clear_procsiglist(p, sig); + assert_wait(&p->p_siglist, THREAD_ABORTSAFE); + /* assert wait can block and SIGCONT should be checked */ + if (p->p_flag & P_TTYSLEEP) { + thread_block(THREAD_CONTINUE_NULL); + + if (mtx && !dropmutex) + lck_mtx_lock(mtx); + } + + /* return with success */ + error = 0; + goto out; + } + if (p->p_sigacts->ps_sigintr & sigmask(sig)) + error = EINTR; + else + error = ERESTART; + if (mtx && !dropmutex) + lck_mtx_lock(mtx); goto out; } - if (p->p_sigacts->ps_sigintr & sigmask(sig)) - error = EINTR; - else - error = ERESTART; + } + if (thread_should_abort(self)) { + if (clear_wait(self, THREAD_INTERRUPTED) == KERN_FAILURE) + goto block; + error = EINTR; + + if (mtx && !dropmutex) + lck_mtx_lock(mtx); goto out; } - } - if (thread_should_abort(self)) { - if (clear_wait(self, THREAD_INTERRUPTED) == KERN_FAILURE) - goto block; - error = EINTR; - goto out; - } - if (get_thread_waitresult(self) != THREAD_WAITING) { - /*already happened */ - goto out; - } - } + } -block: - splx(s); - p->p_stats->p_ru.ru_nvcsw++; +block: + if ((thread_continue_t)continuation != THREAD_CONTINUE_NULL) { + ut->uu_continuation = continuation; + ut->uu_pri = pri; + ut->uu_timo = abstime? 1: 0; + ut->uu_mtx = mtx; + (void) thread_block(_sleep_continue); + /* NOTREACHED */ + } + + wait_result = thread_block(THREAD_CONTINUE_NULL); - if ((thread_continue_t)continuation != THREAD_CONTINUE_NULL ) { - ut->uu_continuation = continuation; - ut->uu_pri = pri; - ut->uu_timo = abstime? 
1: 0; - (void) thread_block(_sleep_continue); - /* NOTREACHED */ + if (mtx && !dropmutex) + lck_mtx_lock(mtx); } - wait_result = thread_block(THREAD_CONTINUE_NULL); - switch (wait_result) { case THREAD_TIMED_OUT: error = EWOULDBLOCK; @@ -241,7 +259,7 @@ block: if (catch) { if (thread_should_abort(self)) { error = EINTR; - } else if (SHOULDissignal(p,ut)) { + } else if (SHOULDissignal(p, ut)) { if (sig = CURSIG(p)) { if (p->p_sigacts->ps_sigintr & sigmask(sig)) error = EINTR; @@ -259,12 +277,10 @@ block: out: if (error == EINTR || error == ERESTART) act_set_astbsd(self); - if (abstime) - thread_cancel_timer(); - (void) splx(s); + #if KTRACE if (KTRPOINT(p, KTR_CSW)) - ktrcsw(p->p_tracep, 0, 0, -1); + ktrcsw(p->p_tracep, 0, 0); #endif return (error); } @@ -274,28 +290,74 @@ sleep( void *chan, int pri) { - return _sleep((caddr_t)chan, pri, (char *)NULL, 0, (int (*)(int))0); + return _sleep((caddr_t)chan, pri, (char *)NULL, 0, (int (*)(int))0, (lck_mtx_t *)0); +} + +int +msleep0( + void *chan, + lck_mtx_t *mtx, + int pri, + const char *wmsg, + int timo, + int (*continuation)(int)) +{ + u_int64_t abstime = 0; + + if (timo) + clock_interval_to_deadline(timo, NSEC_PER_SEC / hz, &abstime); + + return _sleep((caddr_t)chan, pri, wmsg, abstime, continuation, mtx); +} + +int +msleep( + void *chan, + lck_mtx_t *mtx, + int pri, + const char *wmsg, + struct timespec *ts) +{ + u_int64_t abstime = 0; + + if (ts && (ts->tv_sec || ts->tv_nsec)) { + nanoseconds_to_absolutetime((uint64_t)ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec, &abstime ); + clock_absolutetime_interval_to_deadline( abstime, &abstime ); + } + + return _sleep((caddr_t)chan, pri, wmsg, abstime, (int (*)(int))0, mtx); +} + +int +msleep1( + void *chan, + lck_mtx_t *mtx, + int pri, + const char *wmsg, + u_int64_t abstime) +{ + return _sleep((caddr_t)chan, pri, wmsg, abstime, (int (*)(int))0, mtx); } int tsleep( - void *chan, + void *chan, int pri, - char *wmsg, + const char *wmsg, int timo) { u_int64_t abstime = 0; if (timo) clock_interval_to_deadline(timo, NSEC_PER_SEC / hz, &abstime); - return _sleep((caddr_t)chan, pri, wmsg, abstime, (int (*)(int))0); + return _sleep((caddr_t)chan, pri, wmsg, abstime, (int (*)(int))0, (lck_mtx_t *)0); } int tsleep0( - void *chan, + void *chan, int pri, - char *wmsg, + const char *wmsg, int timo, int (*continuation)(int)) { @@ -303,18 +365,18 @@ tsleep0( if (timo) clock_interval_to_deadline(timo, NSEC_PER_SEC / hz, &abstime); - return _sleep((caddr_t)chan, pri, wmsg, abstime, continuation); + return _sleep((caddr_t)chan, pri, wmsg, abstime, continuation, (lck_mtx_t *)0); } int tsleep1( void *chan, - int pri, - char *wmsg, + int pri, + const char *wmsg, u_int64_t abstime, - int (*continuation)(int)) + int (*continuation)(int)) { - return _sleep((caddr_t)chan, pri, wmsg, abstime, continuation); + return _sleep((caddr_t)chan, pri, wmsg, abstime, continuation, (lck_mtx_t *)0); } /* @@ -366,10 +428,11 @@ static fixpt_t cexp[3] = { void compute_averunnable( - register int nrun) + void *arg) { - register int i; + unsigned int nrun = *(unsigned int *)arg; struct loadavg *avg = &averunnable; + register int i; for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index afff95618..dd0736fc9 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -66,9 +66,10 @@ #include <sys/systm.h> #include <sys/kernel.h> #include <sys/malloc.h> -#include <sys/proc.h> -#include <sys/file.h> -#include <sys/vnode.h> +#include <sys/proc_internal.h> +#include 
<sys/kauth.h> +#include <sys/file_internal.h> +#include <sys/vnode_internal.h> #include <sys/unistd.h> #include <sys/buf.h> #include <sys/ioctl.h> @@ -86,19 +87,24 @@ #include <mach/mach_types.h> #include <mach/vm_param.h> #include <kern/task.h> +#include <kern/lock.h> #include <vm/vm_kern.h> +#include <vm/vm_map.h> #include <mach/host_info.h> extern vm_map_t bsd_pageable_map; -#include <sys/mount.h> +#include <sys/mount_internal.h> #include <sys/kdebug.h> +#include <sys/sysproto.h> #include <IOKit/IOPlatformExpert.h> #include <pexpert/pexpert.h> #include <machine/machine_routines.h> +#include <vm/vm_protos.h> + sysctlfn kern_sysctl; #ifdef DEBUG sysctlfn debug_sysctl; @@ -112,68 +118,154 @@ extern int aio_max_requests_per_process; extern int aio_worker_threads; extern int maxprocperuid; extern int maxfilesperproc; +extern int lowpri_IO_window_msecs; +extern int lowpri_IO_delay_msecs; - +static void +fill_eproc(struct proc *p, struct eproc *ep); +static void +fill_externproc(struct proc *p, struct extern_proc *exp); +static void +fill_user_eproc(struct proc *p, struct user_eproc *ep); +static void +fill_user_proc(struct proc *p, struct user_kinfo_proc *kp); +static void +fill_user_externproc(struct proc *p, struct user_extern_proc *exp); +extern int +kdbg_control(int *name, u_int namelen, user_addr_t where, size_t * sizep); int -userland_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t - *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval); - +kdebug_ops(int *name, u_int namelen, user_addr_t where, size_t *sizep, struct proc *p); +#if NFSCLIENT +extern int +netboot_root(void); +#endif +int +pcsamples_ops(int *name, u_int namelen, user_addr_t where, size_t *sizep, + struct proc *p); +__private_extern__ kern_return_t +reset_vmobjectcache(unsigned int val1, unsigned int val2); +extern int +resize_namecache(u_int newsize); static int -sysctl_aiomax( void *oldp, size_t *oldlenp, void *newp, size_t newlen ); +sysctl_aiomax(user_addr_t oldp, size_t *oldlenp, user_addr_t newp, size_t newlen); static int -sysctl_aioprocmax( void *oldp, size_t *oldlenp, void *newp, size_t newlen ); +sysctl_aioprocmax(user_addr_t oldp, size_t *oldlenp, user_addr_t newp, size_t newlen); static int -sysctl_aiothreads( void *oldp, size_t *oldlenp, void *newp, size_t newlen ); +sysctl_aiothreads(user_addr_t oldp, size_t *oldlenp, user_addr_t newp, size_t newlen); +extern int +sysctl_clockrate(user_addr_t where, size_t *sizep); +int +sysctl_doproc(int *name, u_int namelen, user_addr_t where, size_t *sizep); +int +sysctl_doprof(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen); +int +sysctl_file(user_addr_t where, size_t *sizep); static void fill_proc(struct proc *p, struct kinfo_proc *kp); static int -sysctl_maxfilesperproc( void *oldp, size_t *oldlenp, void *newp, size_t newlen ); +sysctl_maxfilesperproc(user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen); static int -sysctl_maxprocperuid( void *oldp, size_t *oldlenp, void *newp, size_t newlen ); +sysctl_maxprocperuid(user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen); static int -sysctl_maxproc( void *oldp, size_t *oldlenp, void *newp, size_t newlen ); +sysctl_maxproc(user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen); +int +sysctl_procargs(int *name, u_int namelen, user_addr_t where, + size_t *sizep, struct proc *cur_proc); static int -sysctl_procargs2( int *name, u_int namelen, char *where, size_t *sizep, struct proc *cur_proc); 
+sysctl_procargs2(int *name, u_int namelen, user_addr_t where, size_t *sizep, + struct proc *cur_proc); static int -sysctl_procargsx( int *name, u_int namelen, char *where, size_t *sizep, struct proc *cur_proc, int argc_yes); +sysctl_procargsx(int *name, u_int namelen, user_addr_t where, size_t *sizep, + struct proc *cur_proc, int argc_yes); +int +sysctl_struct(user_addr_t oldp, size_t *oldlenp, user_addr_t newp, + size_t newlen, void *sp, int len); +extern int +sysctl_vnode(user_addr_t where, size_t *sizep); /* * temporary location for vm_sysctl. This should be machine independant */ + +extern uint32_t mach_factor[3]; + +static void +loadavg32to64(struct loadavg *la32, struct user_loadavg *la64) +{ + la64->ldavg[0] = la32->ldavg[0]; + la64->ldavg[1] = la32->ldavg[1]; + la64->ldavg[2] = la32->ldavg[2]; + la64->fscale = (user_long_t)la32->fscale; +} + int -vm_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; +vm_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, __unused struct proc *p) { - extern uint32_t mach_factor[3]; struct loadavg loadinfo; switch (name[0]) { case VM_LOADAVG: - return (sysctl_struct(oldp, oldlenp, newp, newlen, + if (proc_is64bit(p)) { + struct user_loadavg loadinfo64; + loadavg32to64(&averunnable, &loadinfo64); + return (sysctl_struct(oldp, oldlenp, newp, newlen, + &loadinfo64, sizeof(loadinfo64))); + } else { + return (sysctl_struct(oldp, oldlenp, newp, newlen, &averunnable, sizeof(struct loadavg))); + } case VM_MACHFACTOR: loadinfo.ldavg[0] = mach_factor[0]; loadinfo.ldavg[1] = mach_factor[1]; loadinfo.ldavg[2] = mach_factor[2]; loadinfo.fscale = LSCALE; - return (sysctl_struct(oldp, oldlenp, newp, newlen, + if (proc_is64bit(p)) { + struct user_loadavg loadinfo64; + loadavg32to64(&loadinfo, &loadinfo64); + return (sysctl_struct(oldp, oldlenp, newp, newlen, + &loadinfo64, sizeof(loadinfo64))); + } else { + return (sysctl_struct(oldp, oldlenp, newp, newlen, &loadinfo, sizeof(struct loadavg))); + } + case VM_SWAPUSAGE: { + int error; + uint64_t swap_total; + uint64_t swap_avail; + uint32_t swap_pagesize; + boolean_t swap_encrypted; + struct xsw_usage xsu; + + error = macx_swapinfo(&swap_total, + &swap_avail, + &swap_pagesize, + &swap_encrypted); + if (error) + return error; + + xsu.xsu_total = swap_total; + xsu.xsu_avail = swap_avail; + xsu.xsu_used = swap_total - swap_avail; + xsu.xsu_pagesize = swap_pagesize; + xsu.xsu_encrypted = swap_encrypted; + return sysctl_struct(oldp, oldlenp, newp, newlen, + &xsu, sizeof (struct xsw_usage)); + } case VM_METER: - return (EOPNOTSUPP); + return (ENOTSUP); case VM_MAXID: - return (EOPNOTSUPP); + return (ENOTSUP); default: - return (EOPNOTSUPP); + return (ENOTSUP); } /* NOTREACHED */ - return (EOPNOTSUPP); + return (ENOTSUP); } /* @@ -185,23 +277,12 @@ static struct sysctl_lock { int sl_locked; } memlock; -struct __sysctl_args { - int *name; - u_int namelen; - void *old; - size_t *oldlenp; - void *new; - size_t newlen; -}; int -__sysctl(p, uap, retval) - struct proc *p; - register struct __sysctl_args *uap; - register_t *retval; +__sysctl(struct proc *p, struct __sysctl_args *uap, __unused register_t *retval) { int error, dolock = 1; - size_t savelen, oldlen = 0; - sysctlfn *fn; + size_t savelen = 0, oldlen = 0, newlen; + sysctlfn *fnp = NULL; int name[CTL_MAXNAME]; int i; int error1; @@ -211,51 +292,71 @@ __sysctl(p, uap, retval) */ if (uap->namelen > 
CTL_MAXNAME || uap->namelen < 2) return (EINVAL); - if (error = - copyin(uap->name, &name, uap->namelen * sizeof(int))) + error = copyin(uap->name, &name[0], uap->namelen * sizeof(int)); + if (error) return (error); - + AUDIT_ARG(ctlname, name, uap->namelen); + if (proc_is64bit(p)) { + /* uap->newlen is a size_t value which grows to 64 bits + * when coming from a 64-bit process. since it's doubtful we'll + * have a sysctl newp buffer greater than 4GB we shrink it to size_t + */ + newlen = CAST_DOWN(size_t, uap->newlen); + } + else { + newlen = uap->newlen; + } + /* CTL_UNSPEC is used to get oid to AUTO_OID */ - if (uap->new != NULL - && ((name[0] == CTL_KERN - && !(name[1] == KERN_IPC || name[1] == KERN_PANICINFO)) - || (name[0] == CTL_HW) - || (name[0] == CTL_VM) - || (name[0] == CTL_VFS)) - && (error = suser(p->p_ucred, &p->p_acflag))) + if (uap->new != USER_ADDR_NULL + && ((name[0] == CTL_KERN + && !(name[1] == KERN_IPC || name[1] == KERN_PANICINFO || name[1] == KERN_PROCDELAYTERM || + name[1] == KERN_PROC_LOW_PRI_IO)) + || (name[0] == CTL_HW) + || (name[0] == CTL_VM) + || (name[0] == CTL_VFS)) + && (error = suser(kauth_cred_get(), &p->p_acflag))) return (error); switch (name[0]) { case CTL_KERN: - fn = kern_sysctl; + fnp = kern_sysctl; if ((name[1] != KERN_VNODE) && (name[1] != KERN_FILE) && (name[1] != KERN_PROC)) dolock = 0; break; case CTL_VM: - fn = vm_sysctl; + fnp = vm_sysctl; break; case CTL_VFS: - fn = vfs_sysctl; + fnp = vfs_sysctl; break; #ifdef DEBUG case CTL_DEBUG: - fn = debug_sysctl; + fnp = debug_sysctl; break; #endif default: - fn = 0; + fnp = NULL; } - if (uap->oldlenp && - (error = copyin(uap->oldlenp, &oldlen, sizeof(oldlen)))) - return (error); + if (uap->oldlenp != USER_ADDR_NULL) { + uint64_t oldlen64 = fuulong(uap->oldlenp); + + oldlen = CAST_DOWN(size_t, oldlen64); + /* + * If more than 4G, clamp to 4G - useracc() below will catch + * with an EFAULT, if it's actually necessary. + */ + if (oldlen64 > 0x00000000ffffffffULL) + oldlen = 0xffffffffUL; + } - if (uap->old != NULL) { - if (!useracc(uap->old, oldlen, B_WRITE)) + if (uap->old != USER_ADDR_NULL) { + if (!useracc(uap->old, (user_size_t)oldlen, B_WRITE)) return (EFAULT); /* The pc sampling mechanism does not need to take this lock */ @@ -269,7 +370,8 @@ __sysctl(p, uap, retval) memlock.sl_lock = 1; } - if (dolock && oldlen && (error = vslock(uap->old, oldlen))) { + if (dolock && oldlen && + (error = vslock(uap->old, (user_size_t)oldlen))) { if ((name[1] != KERN_PCSAMPLES) && (! 
((name[1] == KERN_KDEBUG) && (name[2] == KERN_KDGETENTROPY)))) { memlock.sl_lock = 0; @@ -283,20 +385,22 @@ __sysctl(p, uap, retval) savelen = oldlen; } - if (fn) - error = (*fn)(name + 1, uap->namelen - 1, uap->old, - &oldlen, uap->new, uap->newlen, p); + if (fnp) { + error = (*fnp)(name + 1, uap->namelen - 1, uap->old, + &oldlen, uap->new, newlen, p); + } else - error = EOPNOTSUPP; + error = ENOTSUP; - if ( (name[0] != CTL_VFS) && (error == EOPNOTSUPP)) - error = userland_sysctl(p, name, uap->namelen, - uap->old, uap->oldlenp, 0, - uap->new, uap->newlen, &oldlen); + if ( (name[0] != CTL_VFS) && (error == ENOTSUP)) { + size_t tmp = oldlen; + error = userland_sysctl(p, name, uap->namelen, uap->old, &tmp, + 1, uap->new, newlen, &oldlen); + } - if (uap->old != NULL) { + if (uap->old != USER_ADDR_NULL) { if (dolock && savelen) { - error1 = vsunlock(uap->old, savelen, B_WRITE); + error1 = vsunlock(uap->old, (user_size_t)savelen, B_WRITE); if (!error && error1) error = error1; } @@ -311,8 +415,8 @@ __sysctl(p, uap, retval) if ((error) && (error != ENOMEM)) return (error); - if (uap->oldlenp) { - i = copyout(&oldlen, uap->oldlenp, sizeof(oldlen)); + if (uap->oldlenp != USER_ADDR_NULL) { + i = suulong(uap->oldlenp, oldlen); if (i) return i; } @@ -323,19 +427,14 @@ __sysctl(p, uap, retval) /* * Attributes stored in the kernel. */ -extern char hostname[MAXHOSTNAMELEN]; /* defined in bsd/kern/init_main.c */ -extern int hostnamelen; -extern char domainname[MAXHOSTNAMELEN]; -extern int domainnamelen; extern char classichandler[32]; -extern long classichandler_fsid; +extern uint32_t classichandler_fsid; extern long classichandler_fileid; __private_extern__ char corefilename[MAXPATHLEN+1]; -__private_extern__ do_coredump; -__private_extern__ sugid_coredump; +__private_extern__ int do_coredump; +__private_extern__ int sugid_coredump; -extern long hostid; #ifdef INSECURE int securelevel = -1; #else @@ -343,21 +442,21 @@ int securelevel; #endif static int -sysctl_affinity(name, namelen, oldBuf, oldSize, newBuf, newSize, cur_proc) - int *name; - u_int namelen; - char *oldBuf; - size_t *oldSize; - char *newBuf; - size_t newSize; - struct proc *cur_proc; +sysctl_affinity( + int *name, + u_int namelen, + user_addr_t oldBuf, + size_t *oldSize, + user_addr_t newBuf, + __unused size_t newSize, + struct proc *cur_proc) { if (namelen < 1) - return (EOPNOTSUPP); + return (ENOTSUP); if (name[0] == 0 && 1 == namelen) { return sysctl_rdint(oldBuf, oldSize, newBuf, - (cur_proc->p_flag & P_AFFINITY) ? 1 : 0); + (cur_proc->p_flag & P_AFFINITY) ? 
1 : 0); } else if (name[0] == 1 && 2 == namelen) { if (name[1] == 0) { cur_proc->p_flag &= ~P_AFFINITY; @@ -366,123 +465,125 @@ sysctl_affinity(name, namelen, oldBuf, oldSize, newBuf, newSize, cur_proc) } return 0; } - return (EOPNOTSUPP); + return (ENOTSUP); } static int -sysctl_classic(name, namelen, oldBuf, oldSize, newBuf, newSize, cur_proc) - int *name; - u_int namelen; - char *oldBuf; - size_t *oldSize; - char *newBuf; - size_t newSize; - struct proc *cur_proc; +sysctl_classic( + int *name, + u_int namelen, + user_addr_t oldBuf, + size_t *oldSize, + user_addr_t newBuf, + __unused size_t newSize, + struct proc *cur_proc) { - int newVal; - int err; struct proc *p; if (namelen != 1) - return (EOPNOTSUPP); + return (ENOTSUP); p = pfind(name[0]); if (p == NULL) return (EINVAL); - if ((p->p_ucred->cr_uid != cur_proc->p_ucred->cr_uid) - && suser(cur_proc->p_ucred, &cur_proc->p_acflag)) + if ((kauth_cred_getuid(p->p_ucred) != kauth_cred_getuid(kauth_cred_get())) + && suser(kauth_cred_get(), &cur_proc->p_acflag)) return (EPERM); return sysctl_rdint(oldBuf, oldSize, newBuf, - (p->p_flag & P_CLASSIC) ? 1 : 0); + (p->p_flag & P_CLASSIC) ? 1 : 0); } static int -sysctl_classichandler(name, namelen, oldBuf, oldSize, newBuf, newSize, p) - int *name; - u_int namelen; - char *oldBuf; - size_t *oldSize; - char *newBuf; - size_t newSize; - struct proc *p; +sysctl_classichandler( + __unused int *name, + __unused u_int namelen, + user_addr_t oldBuf, + size_t *oldSize, + user_addr_t newBuf, + size_t newSize, + struct proc *p) { int error; - int len; + size_t len; struct nameidata nd; - struct vattr vattr; + struct vnode_attr va; char handler[sizeof(classichandler)]; - - if ((error = suser(p->p_ucred, &p->p_acflag))) - return (error); - len = strlen(classichandler) + 1; - if (oldBuf && *oldSize < len) - return (ENOMEM); - if (newBuf && newSize >= sizeof(classichandler)) - return (ENAMETOOLONG); - *oldSize = len - 1; + struct vfs_context context; + + context.vc_proc = p; + context.vc_ucred = kauth_cred_get(); + + if (oldSize) { + len = strlen(classichandler) + 1; + if (oldBuf) { + if (*oldSize < len) + return (ENOMEM); + error = copyout(classichandler, oldBuf, len); + if (error) + return (error); + } + *oldSize = len - 1; + } if (newBuf) { + error = suser(context.vc_ucred, &p->p_acflag); + if (error) + return (error); + if (newSize >= sizeof(classichandler)) + return (ENAMETOOLONG); error = copyin(newBuf, handler, newSize); if (error) return (error); handler[newSize] = 0; - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, - handler, p); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE32, + CAST_USER_ADDR_T(handler), &context); error = namei(&nd); if (error) return (error); + nameidone(&nd); + /* Check mount point */ if ((nd.ni_vp->v_mount->mnt_flag & MNT_NOEXEC) || (nd.ni_vp->v_type != VREG)) { - vput(nd.ni_vp); + vnode_put(nd.ni_vp); return (EACCES); } - error = VOP_GETATTR(nd.ni_vp, &vattr, p->p_ucred, p); + + VATTR_INIT(&va); + VATTR_WANTED(&va, va_fsid); + VATTR_WANTED(&va, va_fileid); + error = vnode_getattr(nd.ni_vp, &va, &context); if (error) { - vput(nd.ni_vp); + vnode_put(nd.ni_vp); return (error); } - classichandler_fsid = vattr.va_fsid; - classichandler_fileid = vattr.va_fileid; - vput(nd.ni_vp); - } - if (oldBuf) { - error = copyout(classichandler, oldBuf, len); - if (error) - return (error); - } - if (newBuf) { + vnode_put(nd.ni_vp); + + classichandler_fsid = va.va_fsid; + classichandler_fileid = (u_long)va.va_fileid; strcpy(classichandler, handler); } - return (error); + return 0; } 
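The vnode_attr handling above is the kauth-era idiom this patch applies throughout: declare which attributes you want, issue a single vnode_getattr() under a vfs_context, then read back only what you asked for. A condensed sketch of that read pattern (error handling trimmed; assumes a usable vnode vp and proc p, as in the code above):

	struct vnode_attr va;
	struct vfs_context context;

	context.vc_proc = p;
	context.vc_ucred = kauth_cred_get();

	VATTR_INIT(&va);			/* clear the request mask */
	VATTR_WANTED(&va, va_fsid);		/* request only what is needed */
	VATTR_WANTED(&va, va_fileid);
	if (vnode_getattr(vp, &va, &context) == 0) {
		/* va.va_fsid and va.va_fileid are now valid */
	}

The write side mirrors it with VATTR_SET(), as in the va_data_size truncation in kern_symfile.c earlier in this patch.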
extern int get_kernel_symfile( struct proc *, char **); -extern int sysctl_dopanicinfo(int *, u_int, void *, size_t *, - void *, size_t, struct proc *); +__private_extern__ int +sysctl_dopanicinfo(int *, u_int, user_addr_t, size_t *, user_addr_t, + size_t, struct proc *); /* * kernel related system variables. */ int -kern_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; +kern_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, struct proc *p) { int error, level, inthostid, tmp; unsigned int oldval=0; char *str; - extern char ostype[], osrelease[], version[]; - extern int netboot_root(); - /* all sysctl names not listed below are terminal at this level */ if (namelen != 1 && !(name[0] == KERN_PROC @@ -495,7 +596,8 @@ kern_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) || name[0] == KERN_SYSV || name[0] == KERN_AFFINITY || name[0] == KERN_CLASSIC - || name[0] == KERN_PANICINFO) + || name[0] == KERN_PANICINFO + || name[0] == KERN_POSIX) ) return (ENOTDIR); /* overloaded */ @@ -528,14 +630,14 @@ kern_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) case KERN_SECURELVL: level = securelevel; if ((error = sysctl_int(oldp, oldlenp, newp, newlen, &level)) || - newp == NULL) + newp == USER_ADDR_NULL) return (error); if (level < securelevel && p->p_pid != 1) return (EPERM); securelevel = level; return (0); case KERN_HOSTNAME: - error = sysctl_string(oldp, oldlenp, newp, newlen, + error = sysctl_trstring(oldp, oldlenp, newp, newlen, hostname, sizeof(hostname)); if (newp && !error) hostnamelen = newlen; @@ -554,8 +656,15 @@ kern_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) case KERN_CLOCKRATE: return (sysctl_clockrate(oldp, oldlenp)); case KERN_BOOTTIME: - return (sysctl_rdstruct(oldp, oldlenp, newp, &boottime, + { + struct timeval t; + + t.tv_sec = boottime_sec(); + t.tv_usec = 0; + + return (sysctl_rdstruct(oldp, oldlenp, newp, &t, sizeof(struct timeval))); + } case KERN_VNODE: return (sysctl_vnode(oldp, oldlenp)); case KERN_PROC: @@ -594,8 +703,10 @@ kern_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) if ( error ) return error; return (sysctl_rdstring(oldp, oldlenp, newp, str)); +#if NFSCLIENT case KERN_NETBOOT: return (sysctl_rdint(oldp, oldlenp, newp, netboot_root())); +#endif case KERN_PANICINFO: return(sysctl_dopanicinfo(name + 1, namelen - 1, oldp, oldlenp, newp, newlen, p)); @@ -614,6 +725,10 @@ kern_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) return( sysctl_aioprocmax( oldp, oldlenp, newp, newlen ) ); case KERN_AIOTHREADS: return( sysctl_aiothreads( oldp, oldlenp, newp, newlen ) ); + case KERN_USRSTACK: + return (sysctl_rdint(oldp, oldlenp, newp, (uintptr_t)p->user_stack)); + case KERN_USRSTACK64: + return (sysctl_rdquad(oldp, oldlenp, newp, p->user_stack)); case KERN_COREFILE: error = sysctl_string(oldp, oldlenp, newp, newlen, corefilename, sizeof(corefilename)); @@ -621,7 +736,7 @@ kern_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) case KERN_COREDUMP: tmp = do_coredump; error = sysctl_int(oldp, oldlenp, newp, newlen, &do_coredump); - if (!error && (do_coredump < 0) || (do_coredump > 1)) { + if (!error && ((do_coredump < 0) || (do_coredump > 1))) { do_coredump = tmp; error = EINVAL; } @@ -629,13 +744,112 @@ kern_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) case KERN_SUGID_COREDUMP: tmp = sugid_coredump; error = sysctl_int(oldp, oldlenp, newp, newlen, &sugid_coredump); - if 
(!error && (sugid_coredump < 0) || (sugid_coredump > 1)) { + if (!error && ((sugid_coredump < 0) || (sugid_coredump > 1))) { sugid_coredump = tmp; error = EINVAL; } return (error); + case KERN_PROCDELAYTERM: + { + int old_value, new_value; + + error = 0; + if (oldp && *oldlenp < sizeof(int)) + return (ENOMEM); + if ( newp && newlen != sizeof(int) ) + return(EINVAL); + *oldlenp = sizeof(int); + old_value = (p->p_lflag & P_LDELAYTERM)? 1: 0; + if (oldp && (error = copyout( &old_value, oldp, sizeof(int)))) + return(error); + if (error == 0 && newp ) + error = copyin( newp, &new_value, sizeof(int) ); + if (error == 0 && newp) { + if (new_value) + p->p_lflag |= P_LDELAYTERM; + else + p->p_lflag &= ~P_LDELAYTERM; + } + return(error); + } + case KERN_PROC_LOW_PRI_IO: + { + int old_value, new_value; + + error = 0; + if (oldp && *oldlenp < sizeof(int)) + return (ENOMEM); + if ( newp && newlen != sizeof(int) ) + return(EINVAL); + *oldlenp = sizeof(int); + + old_value = (p->p_lflag & P_LLOW_PRI_IO)? 0x01: 0; + if (p->p_lflag & P_LBACKGROUND_IO) + old_value |= 0x02; + + if (oldp && (error = copyout( &old_value, oldp, sizeof(int)))) + return(error); + if (error == 0 && newp ) + error = copyin( newp, &new_value, sizeof(int) ); + if (error == 0 && newp) { + if (new_value & 0x01) + p->p_lflag |= P_LLOW_PRI_IO; + else if (new_value & 0x02) + p->p_lflag |= P_LBACKGROUND_IO; + else if (new_value == 0) + p->p_lflag &= ~(P_LLOW_PRI_IO | P_LBACKGROUND_IO); + } + return(error); + } + case KERN_LOW_PRI_WINDOW: + { + int old_value, new_value; + + error = 0; + if (oldp && *oldlenp < sizeof(old_value) ) + return (ENOMEM); + if ( newp && newlen != sizeof(new_value) ) + return(EINVAL); + *oldlenp = sizeof(old_value); + + old_value = lowpri_IO_window_msecs; + + if (oldp && (error = copyout( &old_value, oldp, *oldlenp))) + return(error); + if (error == 0 && newp ) + error = copyin( newp, &new_value, sizeof(new_value) ); + if (error == 0 && newp) { + lowpri_IO_window_msecs = new_value; + } + return(error); + } + case KERN_LOW_PRI_DELAY: + { + int old_value, new_value; + + error = 0; + if (oldp && *oldlenp < sizeof(old_value) ) + return (ENOMEM); + if ( newp && newlen != sizeof(new_value) ) + return(EINVAL); + *oldlenp = sizeof(old_value); + + old_value = lowpri_IO_delay_msecs; + + if (oldp && (error = copyout( &old_value, oldp, *oldlenp))) + return(error); + if (error == 0 && newp ) + error = copyin( newp, &new_value, sizeof(new_value) ); + if (error == 0 && newp) { + lowpri_IO_delay_msecs = new_value; + } + return(error); + } + case KERN_SHREG_PRIVATIZABLE: + /* this kernel does implement shared_region_make_private_np() */ + return (sysctl_rdint(oldp, oldlenp, newp, 1)); default: - return (EOPNOTSUPP); + return (ENOTSUP); } /* NOTREACHED */ } @@ -659,14 +873,8 @@ static struct ctldebug *debugvars[CTL_DEBUG_MAXID] = { &debug15, &debug16, &debug17, &debug18, &debug19, }; int -debug_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; +debug_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, struct proc *p) { struct ctldebug *cdp; @@ -675,14 +883,14 @@ debug_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) return (ENOTDIR); /* overloaded */ cdp = debugvars[name[0]]; if (cdp->debugname == 0) - return (EOPNOTSUPP); + return (ENOTSUP); switch (name[1]) { case CTL_DEBUG_NAME: return (sysctl_rdstring(oldp, oldlenp, newp, cdp->debugname)); case CTL_DEBUG_VALUE: return
(sysctl_int(oldp, oldlenp, newp, newlen, cdp->debugvar)); default: - return (EOPNOTSUPP); + return (ENOTSUP); } /* NOTREACHED */ } @@ -693,15 +901,13 @@ debug_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) * for an integer-valued sysctl function. */ int -sysctl_int(oldp, oldlenp, newp, newlen, valp) - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - int *valp; +sysctl_int(user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, int *valp) { int error = 0; + if (oldp != USER_ADDR_NULL && oldlenp == NULL) + return (EFAULT); if (oldp && *oldlenp < sizeof(int)) return (ENOMEM); if (newp && newlen != sizeof(int)) @@ -720,14 +926,12 @@ sysctl_int(oldp, oldlenp, newp, newlen, valp) * As above, but read-only. */ int -sysctl_rdint(oldp, oldlenp, newp, val) - void *oldp; - size_t *oldlenp; - void *newp; - int val; +sysctl_rdint(user_addr_t oldp, size_t *oldlenp, user_addr_t newp, int val) { int error = 0; + if (oldp != USER_ADDR_NULL && oldlenp == NULL) + return (EFAULT); if (oldp && *oldlenp < sizeof(int)) return (ENOMEM); if (newp) @@ -743,15 +947,13 @@ sysctl_rdint(oldp, oldlenp, newp, val) * for an quad(64bit)-valued sysctl function. */ int -sysctl_quad(oldp, oldlenp, newp, newlen, valp) - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - quad_t *valp; +sysctl_quad(user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, quad_t *valp) { int error = 0; + if (oldp != USER_ADDR_NULL && oldlenp == NULL) + return (EFAULT); if (oldp && *oldlenp < sizeof(quad_t)) return (ENOMEM); if (newp && newlen != sizeof(quad_t)) @@ -776,13 +978,58 @@ sysctl_rdquad(oldp, oldlenp, newp, val) { int error = 0; + if (oldp != USER_ADDR_NULL && oldlenp == NULL) + return (EFAULT); if (oldp && *oldlenp < sizeof(quad_t)) return (ENOMEM); if (newp) return (EPERM); *oldlenp = sizeof(quad_t); if (oldp) - error = copyout((caddr_t)&val, oldp, sizeof(quad_t)); + error = copyout((caddr_t)&val, CAST_USER_ADDR_T(oldp), sizeof(quad_t)); + return (error); +} + +/* + * Validate parameters and get old / set new parameters + * for a string-valued sysctl function. Unlike sysctl_string, if you + * give it a too small (but larger than 0 bytes) buffer, instead of + * returning ENOMEM, it truncates the returned string to the buffer + * size. This preserves the semantics of some library routines + * implemented via sysctl, which truncate their returned data, rather + * than simply returning an error. The returned string is always NUL + * terminated. + */ +int +sysctl_trstring(user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, char *str, int maxlen) +{ + int len, copylen, error = 0; + + if (oldp != USER_ADDR_NULL && oldlenp == NULL) + return (EFAULT); + copylen = len = strlen(str) + 1; + if (oldp && (len < 0 || *oldlenp < 1)) + return (ENOMEM); + if (oldp && (*oldlenp < (size_t)len)) + copylen = *oldlenp + 1; + if (newp && (maxlen < 0 || newlen >= (size_t)maxlen)) + return (EINVAL); + *oldlenp = copylen - 1; /* deal with NULL strings correctly */ + if (oldp) { + error = copyout(str, oldp, copylen); + if (!error) { + unsigned char c = 0; + /* NUL terminate */ + oldp += *oldlenp; + error = copyout((void *)&c, oldp, sizeof(char)); + } + } + if (error == 0 && newp) { + error = copyin(newp, str, newlen); + str[newlen] = 0; + AUDIT_ARG(text, (char *)str); + } return (error); } @@ -791,20 +1038,17 @@ sysctl_rdquad(oldp, oldlenp, newp, val) * for a string-valued sysctl function. 
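 *	Unlike sysctl_trstring() above, this routine fails rather than
 *	truncates: with str = "Darwin" and a caller-supplied *oldlenp
 *	of 3, sysctl_string() returns ENOMEM, while sysctl_trstring()
 *	copies out "Dar" plus a terminating NUL and sets *oldlenp to 3.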
*/ int -sysctl_string(oldp, oldlenp, newp, newlen, str, maxlen) - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - char *str; - int maxlen; +sysctl_string(user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, char *str, int maxlen) { int len, error = 0; + if (oldp != USER_ADDR_NULL && oldlenp == NULL) + return (EFAULT); len = strlen(str) + 1; - if (oldp && *oldlenp < len) + if (oldp && (len < 0 || *oldlenp < (size_t)len)) return (ENOMEM); - if (newp && newlen >= maxlen) + if (newp && (maxlen < 0 || newlen >= (size_t)maxlen)) return (EINVAL); *oldlenp = len -1; /* deal with NULL strings correctly */ if (oldp) { @@ -822,16 +1066,15 @@ sysctl_string(oldp, oldlenp, newp, newlen, str, maxlen) * As above, but read-only. */ int -sysctl_rdstring(oldp, oldlenp, newp, str) - void *oldp; - size_t *oldlenp; - void *newp; - char *str; +sysctl_rdstring(user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, char *str) { int len, error = 0; + if (oldp != USER_ADDR_NULL && oldlenp == NULL) + return (EFAULT); len = strlen(str) + 1; - if (oldp && *oldlenp < len) + if (oldp && *oldlenp < (size_t)len) return (ENOMEM); if (newp) return (EPERM); @@ -846,19 +1089,16 @@ sysctl_rdstring(oldp, oldlenp, newp, str) * for a structure oriented sysctl function. */ int -sysctl_struct(oldp, oldlenp, newp, newlen, sp, len) - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - void *sp; - int len; +sysctl_struct(user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, void *sp, int len) { int error = 0; - if (oldp && *oldlenp < len) + if (oldp != USER_ADDR_NULL && oldlenp == NULL) + return (EFAULT); + if (oldp && (len < 0 || *oldlenp < (size_t)len)) return (ENOMEM); - if (newp && newlen > len) + if (newp && (len < 0 || newlen > (size_t)len)) return (EINVAL); if (oldp) { *oldlenp = len; @@ -874,15 +1114,14 @@ sysctl_struct(oldp, oldlenp, newp, newlen, sp, len) * for a structure oriented sysctl function. */ int -sysctl_rdstruct(oldp, oldlenp, newp, sp, len) - void *oldp; - size_t *oldlenp; - void *newp, *sp; - int len; +sysctl_rdstruct(user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, void *sp, int len) { int error = 0; - if (oldp && *oldlenp < len) + if (oldp != USER_ADDR_NULL && oldlenp == NULL) + return (EFAULT); + if (oldp && (len < 0 || *oldlenp < (size_t)len)) return (ENOMEM); if (newp) return (EPERM); @@ -896,31 +1135,31 @@ sysctl_rdstruct(oldp, oldlenp, newp, sp, len) * Get file structures. 
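 *	(KERN_FILE now exports struct extern_file snapshots assembled
 *	from each fileglob rather than raw struct file images; the
 *	filehead list head is still copied out first.)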
*/ int -sysctl_file(where, sizep) - char *where; - size_t *sizep; +sysctl_file(user_addr_t where, size_t *sizep) { int buflen, error; - struct file *fp; - char *start = where; + struct fileglob *fg; + user_addr_t start = where; + struct extern_file nef; buflen = *sizep; - if (where == NULL) { + if (where == USER_ADDR_NULL) { /* * overestimate by 10 files */ - *sizep = sizeof(filehead) + (nfiles + 10) * sizeof(struct file); + *sizep = sizeof(filehead) + (nfiles + 10) * sizeof(struct extern_file); return (0); } /* * first copyout filehead */ - if (buflen < sizeof(filehead)) { + if (buflen < 0 || (size_t)buflen < sizeof(filehead)) { *sizep = 0; return (0); } - if (error = copyout((caddr_t)&filehead, where, sizeof(filehead))) + error = copyout((caddr_t)&filehead, where, sizeof(filehead)); + if (error) return (error); buflen -= sizeof(filehead); where += sizeof(filehead); @@ -928,17 +1167,28 @@ sysctl_file(where, sizep) /* * followed by an array of file structures */ - for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) { - if (buflen < sizeof(struct file)) { + for (fg = filehead.lh_first; fg != 0; fg = fg->f_list.le_next) { + if (buflen < 0 || (size_t)buflen < sizeof(struct extern_file)) { *sizep = where - start; return (ENOMEM); } - if (error = copyout((caddr_t)fp, where, sizeof (struct file))) + nef.f_list.le_next = (struct extern_file *)fg->f_list.le_next; + nef.f_list.le_prev = (struct extern_file **)fg->f_list.le_prev; + nef.f_flag = (fg->fg_flag & FMASK); + nef.f_type = fg->fg_type; + nef.f_count = fg->fg_count; + nef.f_msgcount = fg->fg_msgcount; + nef.f_cred = fg->fg_cred; + nef.f_ops = fg->fg_ops; + nef.f_offset = fg->fg_offset; + nef.f_data = fg->fg_data; + error = copyout((caddr_t)&nef, where, sizeof (struct extern_file)); + if (error) return (error); - buflen -= sizeof(struct file); - where += sizeof(struct file); + buflen -= sizeof(struct extern_file); + where += sizeof(struct extern_file); } - *sizep = where - start; + *sizep = where - start; return (0); } @@ -948,24 +1198,33 @@ sysctl_file(where, sizep) #define KERN_PROCSLOP (5 * sizeof (struct kinfo_proc)) int -sysctl_doproc(name, namelen, where, sizep) - int *name; - u_int namelen; - char *where; - size_t *sizep; +sysctl_doproc(int *name, u_int namelen, user_addr_t where, size_t *sizep) { - register struct proc *p; - register struct kinfo_proc *dp = (struct kinfo_proc *)where; - register int needed = 0; - int buflen = where != NULL ? *sizep : 0; + struct proc *p; + user_addr_t dp = where; + size_t needed = 0; + int buflen = where != USER_ADDR_NULL ? 
*sizep : 0; int doingzomb; - struct kinfo_proc kproc; int error = 0; + boolean_t is_64_bit = FALSE; + struct kinfo_proc kproc; + struct user_kinfo_proc user_kproc; + int sizeof_kproc; + caddr_t kprocp; if (namelen != 2 && !(namelen == 1 && name[0] == KERN_PROC_ALL)) return (EINVAL); p = allproc.lh_first; doingzomb = 0; + is_64_bit = proc_is64bit(current_proc()); + if (is_64_bit) { + sizeof_kproc = sizeof(user_kproc); + kprocp = (caddr_t) &user_kproc; + } + else { + sizeof_kproc = sizeof(kproc); + kprocp = (caddr_t) &kproc; + } again: for (; p != 0; p = p->p_list.le_next) { /* @@ -1001,34 +1260,39 @@ again: case KERN_PROC_UID: if ((p->p_ucred == NULL) || - (p->p_ucred->cr_uid != (uid_t)name[1])) + (kauth_cred_getuid(p->p_ucred) != (uid_t)name[1])) continue; break; case KERN_PROC_RUID: if ((p->p_ucred == NULL) || - (p->p_cred->p_ruid != (uid_t)name[1])) + (p->p_ucred->cr_ruid != (uid_t)name[1])) continue; break; } - if (buflen >= sizeof(struct kinfo_proc)) { - bzero(&kproc, sizeof(struct kinfo_proc)); - fill_proc(p, &kproc); - if (error = copyout((caddr_t)&kproc, &dp->kp_proc, - sizeof(struct kinfo_proc))) + if (buflen >= sizeof_kproc) { + bzero(kprocp, sizeof_kproc); + if (is_64_bit) { + fill_user_proc(p, (struct user_kinfo_proc *) kprocp); + } + else { + fill_proc(p, (struct kinfo_proc *) kprocp); + } + error = copyout(kprocp, dp, sizeof_kproc); + if (error) return (error); - dp++; - buflen -= sizeof(struct kinfo_proc); + dp += sizeof_kproc; + buflen -= sizeof_kproc; } - needed += sizeof(struct kinfo_proc); + needed += sizeof_kproc; } if (doingzomb == 0) { p = zombproc.lh_first; doingzomb++; goto again; } - if (where != NULL) { - *sizep = (caddr_t)dp - where; + if (where != USER_ADDR_NULL) { + *sizep = dp - where; if (needed > *sizep) return (ENOMEM); } else { @@ -1061,10 +1325,23 @@ fill_eproc(p, ep) ep->e_jobc = 0; } ep->e_ppid = (p->p_pptr) ? p->p_pptr->p_pid : 0; - if (p->p_cred) { - ep->e_pcred = *p->p_cred; - if (p->p_ucred) - ep->e_ucred = *p->p_ucred; + /* Pre-zero the fake historical pcred */ + bzero(&ep->e_pcred, sizeof(struct _pcred)); + if (p->p_ucred) { + /* XXX not ref-counted */ + + /* A fake historical pcred */ + ep->e_pcred.p_ruid = p->p_ucred->cr_ruid; + ep->e_pcred.p_svuid = p->p_ucred->cr_svuid; + ep->e_pcred.p_rgid = p->p_ucred->cr_rgid; + ep->e_pcred.p_svgid = p->p_ucred->cr_svgid; + + /* A fake historical *kauth_cred_t */ + ep->e_ucred.cr_ref = p->p_ucred->cr_ref; + ep->e_ucred.cr_uid = kauth_cred_getuid(p->p_ucred); + ep->e_ucred.cr_ngroups = p->p_ucred->cr_ngroups; + bcopy(p->p_ucred->cr_groups, ep->e_ucred.cr_groups, NGROUPS*sizeof(gid_t)); + } if (p->p_stat == SIDL || p->p_stat == SZOMB) { ep->e_vm.vm_tsize = 0; @@ -1089,6 +1366,72 @@ fill_eproc(p, ep) ep->e_xccount = ep->e_xswrss = 0; } +/* + * Fill in an LP64 version of eproc structure for the specified process. + */ +static void +fill_user_eproc(register struct proc *p, register struct user_eproc *ep) +{ + register struct tty *tp; + struct session *sessionp = NULL; + + ep->e_paddr = CAST_USER_ADDR_T(p); + if (p->p_pgrp) { + sessionp = p->p_pgrp->pg_session; + ep->e_sess = CAST_USER_ADDR_T(sessionp); + ep->e_pgid = p->p_pgrp->pg_id; + ep->e_jobc = p->p_pgrp->pg_jobc; + if (sessionp) { + if (sessionp->s_ttyvp) + ep->e_flag = EPROC_CTTY; + } + } else { + ep->e_sess = USER_ADDR_NULL; + ep->e_pgid = 0; + ep->e_jobc = 0; + } + ep->e_ppid = (p->p_pptr) ? 
p->p_pptr->p_pid : 0; + /* Pre-zero the fake historical pcred */ + bzero(&ep->e_pcred, sizeof(ep->e_pcred)); + if (p->p_ucred) { + /* XXX not ref-counted */ + + /* A fake historical pcred */ + ep->e_pcred.p_ruid = p->p_ucred->cr_ruid; + ep->e_pcred.p_svuid = p->p_ucred->cr_svuid; + ep->e_pcred.p_rgid = p->p_ucred->cr_rgid; + ep->e_pcred.p_svgid = p->p_ucred->cr_svgid; + + /* A fake historical *kauth_cred_t */ + ep->e_ucred.cr_ref = p->p_ucred->cr_ref; + ep->e_ucred.cr_uid = kauth_cred_getuid(p->p_ucred); + ep->e_ucred.cr_ngroups = p->p_ucred->cr_ngroups; + bcopy(p->p_ucred->cr_groups, ep->e_ucred.cr_groups, NGROUPS*sizeof(gid_t)); + + } + if (p->p_stat == SIDL || p->p_stat == SZOMB) { + ep->e_vm.vm_tsize = 0; + ep->e_vm.vm_dsize = 0; + ep->e_vm.vm_ssize = 0; + } + ep->e_vm.vm_rssize = 0; + + if ((p->p_flag & P_CONTROLT) && (sessionp) && + (tp = sessionp->s_ttyp)) { + ep->e_tdev = tp->t_dev; + ep->e_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; + ep->e_tsess = CAST_USER_ADDR_T(tp->t_session); + } else + ep->e_tdev = NODEV; + + if (SESS_LEADER(p)) + ep->e_flag |= EPROC_SLEADER; + if (p->p_wmesg) + strncpy(ep->e_wmesg, p->p_wmesg, WMESGLEN); + ep->e_xsize = ep->e_xrssize = 0; + ep->e_xccount = ep->e_xswrss = 0; +} + /* * Fill in an eproc structure for the specified process. */ @@ -1108,7 +1451,7 @@ fill_externproc(p, exp) exp->p_oppid = p->p_oppid ; exp->p_dupfd = p->p_dupfd ; /* Mach related */ - exp->user_stack = p->user_stack ; + exp->user_stack = CAST_DOWN(caddr_t, p->user_stack); exp->exit_thread = p->exit_thread ; exp->p_debugger = p->p_debugger ; exp->sigwait = p->sigwait ; @@ -1142,7 +1485,67 @@ fill_externproc(p, exp) exp->p_addr = NULL; exp->p_xstat = p->p_xstat ; exp->p_acflag = p->p_acflag ; - exp->p_ru = p->p_ru ; + exp->p_ru = p->p_ru ; /* XXX may be NULL */ +} + +/* + * Fill in an LP64 version of extern_proc structure for the specified process. 
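+ * Kernel pointers are published through CAST_USER_ADDR_T(), which
+ * widens them into the 64-bit user_addr_t slots of the user_*
+ * structures, e.g.:
+ *
+ *	exp->p_wchan = CAST_USER_ADDR_T(p->p_wchan);
+ *
+ * so a 64-bit consumer sees the field offsets it compiled against.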
+ */ +static void +fill_user_externproc(register struct proc *p, register struct user_extern_proc *exp) +{ + exp->p_forw = exp->p_back = USER_ADDR_NULL; + if (p->p_stats) { + exp->p_starttime.tv_sec = p->p_stats->p_start.tv_sec; + exp->p_starttime.tv_usec = p->p_stats->p_start.tv_usec; + } + exp->p_vmspace = USER_ADDR_NULL; + exp->p_sigacts = CAST_USER_ADDR_T(p->p_sigacts); + exp->p_flag = p->p_flag; + exp->p_stat = p->p_stat ; + exp->p_pid = p->p_pid ; + exp->p_oppid = p->p_oppid ; + exp->p_dupfd = p->p_dupfd ; + /* Mach related */ + exp->user_stack = p->user_stack; + exp->exit_thread = CAST_USER_ADDR_T(p->exit_thread); + exp->p_debugger = p->p_debugger ; + exp->sigwait = p->sigwait ; + /* scheduling */ + exp->p_estcpu = p->p_estcpu ; + exp->p_cpticks = p->p_cpticks ; + exp->p_pctcpu = p->p_pctcpu ; + exp->p_wchan = CAST_USER_ADDR_T(p->p_wchan); + exp->p_wmesg = CAST_USER_ADDR_T(p->p_wmesg); + exp->p_swtime = p->p_swtime ; + exp->p_slptime = p->p_slptime ; + exp->p_realtimer.it_interval.tv_sec = p->p_realtimer.it_interval.tv_sec; + exp->p_realtimer.it_interval.tv_usec = p->p_realtimer.it_interval.tv_usec; + exp->p_realtimer.it_value.tv_sec = p->p_realtimer.it_value.tv_sec; + exp->p_realtimer.it_value.tv_usec = p->p_realtimer.it_value.tv_usec; + exp->p_rtime.tv_sec = p->p_rtime.tv_sec; + exp->p_rtime.tv_usec = p->p_rtime.tv_usec; + exp->p_uticks = p->p_uticks ; + exp->p_sticks = p->p_sticks ; + exp->p_iticks = p->p_iticks ; + exp->p_traceflag = p->p_traceflag ; + exp->p_tracep = CAST_USER_ADDR_T(p->p_tracep); + exp->p_siglist = 0 ; /* No longer relevant */ + exp->p_textvp = CAST_USER_ADDR_T(p->p_textvp); + exp->p_holdcnt = 0 ; + exp->p_sigmask = 0 ; /* no longer available */ + exp->p_sigignore = p->p_sigignore ; + exp->p_sigcatch = p->p_sigcatch ; + exp->p_priority = p->p_priority ; + exp->p_usrpri = p->p_usrpri ; + exp->p_nice = p->p_nice ; + bcopy(&p->p_comm, &exp->p_comm,MAXCOMLEN); + exp->p_comm[MAXCOMLEN] = '\0'; + exp->p_pgrp = CAST_USER_ADDR_T(p->p_pgrp); + exp->p_addr = USER_ADDR_NULL; + exp->p_xstat = p->p_xstat ; + exp->p_acflag = p->p_acflag ; + exp->p_ru = CAST_USER_ADDR_T(p->p_ru); /* XXX may be NULL */ } static void @@ -1154,20 +1557,21 @@ fill_proc(p, kp) fill_eproc(p, &kp->kp_eproc); } +static void +fill_user_proc(register struct proc *p, register struct user_kinfo_proc *kp) +{ + fill_user_externproc(p, &kp->kp_proc); + fill_user_eproc(p, &kp->kp_eproc); +} + int -kdebug_ops(name, namelen, where, sizep, p) -int *name; -u_int namelen; -char *where; -size_t *sizep; -struct proc *p; +kdebug_ops(int *name, u_int namelen, user_addr_t where, + size_t *sizep, struct proc *p) { - int size=*sizep; int ret=0; - extern int kdbg_control(int *name, u_int namelen, - char * where,size_t * sizep); - if (ret = suser(p->p_ucred, &p->p_acflag)) + ret = suser(kauth_cred_get(), &p->p_acflag); + if (ret) return(ret); switch(name[0]) { @@ -1189,25 +1593,23 @@ struct proc *p; ret = kdbg_control(name, namelen, where, sizep); break; default: - ret= EOPNOTSUPP; + ret= ENOTSUP; break; } return(ret); } +extern int pcsamples_control(int *name, u_int namelen, user_addr_t where, + size_t * sizep); + int -pcsamples_ops(name, namelen, where, sizep, p) -int *name; -u_int namelen; -char *where; -size_t *sizep; -struct proc *p; +pcsamples_ops(int *name, u_int namelen, user_addr_t where, + size_t *sizep, struct proc *p) { int ret=0; - extern int pcsamples_control(int *name, u_int namelen, - char * where,size_t * sizep); - if (ret = suser(p->p_ucred, &p->p_acflag)) + ret = suser(kauth_cred_get(), &p->p_acflag);
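+	/* pc sampling, like kdebug above, is restricted to the superuser */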
+ if (ret) return(ret); switch(name[0]) { @@ -1222,7 +1624,7 @@ struct proc *p; ret = pcsamples_control(name, namelen, where, sizep); break; default: - ret= EOPNOTSUPP; + ret= ENOTSUP; break; } return(ret); @@ -1233,56 +1635,45 @@ struct proc *p; * user stack down through the saved exec_path, whichever is smaller. */ int -sysctl_procargs(name, namelen, where, sizep, cur_proc) - int *name; - u_int namelen; - char *where; - size_t *sizep; - struct proc *cur_proc; +sysctl_procargs(int *name, u_int namelen, user_addr_t where, + size_t *sizep, struct proc *cur_proc) { return sysctl_procargsx( name, namelen, where, sizep, cur_proc, 0); } static int -sysctl_procargs2(name, namelen, where, sizep, cur_proc) - int *name; - u_int namelen; - char *where; - size_t *sizep; - struct proc *cur_proc; +sysctl_procargs2(int *name, u_int namelen, user_addr_t where, + size_t *sizep, struct proc *cur_proc) { return sysctl_procargsx( name, namelen, where, sizep, cur_proc, 1); } static int -sysctl_procargsx(name, namelen, where, sizep, cur_proc, argc_yes) - int *name; - u_int namelen; - char *where; - size_t *sizep; - struct proc *cur_proc; - int argc_yes; +sysctl_procargsx(int *name, __unused u_int namelen, user_addr_t where, + size_t *sizep, struct proc *cur_proc, int argc_yes) { - register struct proc *p; - register int needed = 0; - int buflen = where != NULL ? *sizep : 0; + struct proc *p; + int buflen = where != USER_ADDR_NULL ? *sizep : 0; int error = 0; struct vm_map *proc_map; struct task * task; vm_map_copy_t tmp; - vm_offset_t arg_addr; - vm_size_t arg_size; + user_addr_t arg_addr; + size_t arg_size; caddr_t data; - unsigned size; + int size; vm_offset_t copy_start, copy_end; - int *ip; kern_return_t ret; int pid; if (argc_yes) - buflen -= NBPW; /* reserve first word to return argc */ + buflen -= sizeof(int); /* reserve first word to return argc */ - if ((buflen <= 0) || (buflen > ARG_MAX)) { + /* we only care about buflen when where (oldp from sysctl) is not NULL. */ + /* when where (oldp from sysctl) is NULL and sizep (oldlenp from sysctl */ + /* is not NULL then the caller wants us to return the length needed to */ + /* hold the data we would return */ + if (where != USER_ADDR_NULL && (buflen <= 0 || buflen > ARG_MAX)) { return(EINVAL); } arg_size = buflen; @@ -1291,8 +1682,6 @@ sysctl_procargsx(name, namelen, where, sizep, cur_proc, argc_yes) * Lookup process by pid */ pid = name[0]; - - restart: p = pfind(pid); if (p == NULL) { return(EINVAL); @@ -1311,10 +1700,35 @@ sysctl_procargsx(name, namelen, where, sizep, cur_proc, argc_yes) if (!p->user_stack) return(EINVAL); - if ((p->p_ucred->cr_uid != cur_proc->p_ucred->cr_uid) - && suser(cur_proc->p_ucred, &cur_proc->p_acflag)) + if (where == USER_ADDR_NULL) { + /* caller only wants to know length of proc args data */ + if (sizep == NULL) + return(EFAULT); + + size = p->p_argslen; + if (argc_yes) { + size += sizeof(int); + } + else { + /* + * old PROCARGS will return the executable's path and plus some + * extra space for work alignment and data tags + */ + size += PATH_MAX + (6 * sizeof(int)); + } + size += (size & (sizeof(int) - 1)) ? 
(sizeof(int) - (size & (sizeof(int) - 1))) : 0; + *sizep = size; + return (0); + } + + if ((kauth_cred_getuid(p->p_ucred) != kauth_cred_getuid(kauth_cred_get())) + && suser(kauth_cred_get(), &cur_proc->p_acflag)) return (EINVAL); - arg_addr = (vm_offset_t)(p->user_stack - arg_size); + + if ((u_int)arg_size > p->p_argslen) + arg_size = round_page(p->p_argslen); + + arg_addr = p->user_stack - arg_size; /* @@ -1327,30 +1741,32 @@ sysctl_procargsx(name, namelen, where, sizep, cur_proc, argc_yes) return(EINVAL); /* - * A regular task_reference call can block, causing the funnel - * to be dropped and allowing the proc/task to get freed. - * Instead, we issue a non-blocking attempt at the task reference, - * and look up the proc/task all over again if that fails. + * Once we have a task reference we can convert that into a + * map reference, which we will use in the calls below. The + * task/process may change its map after we take this reference + * (see execve), but the worst that will happen then is a return + * of stale info (which is always a possibility). */ - if (!task_reference_try(task)) { - mutex_pause(); - goto restart; - } + task_reference(task); + proc_map = get_task_map_reference(task); + task_deallocate(task); + if (proc_map == NULL) + return(EINVAL); - ret = kmem_alloc(kernel_map, &copy_start, round_page_32(arg_size)); + + ret = kmem_alloc(kernel_map, &copy_start, round_page(arg_size)); if (ret != KERN_SUCCESS) { - task_deallocate(task); + vm_map_deallocate(proc_map); return(ENOMEM); } - proc_map = get_task_map(task); - copy_end = round_page_32(copy_start + arg_size); + copy_end = round_page(copy_start + arg_size); - if( vm_map_copyin(proc_map, trunc_page(arg_addr), round_page_32(arg_size), - FALSE, &tmp) != KERN_SUCCESS) { - task_deallocate(task); + if( vm_map_copyin(proc_map, (vm_map_address_t)arg_addr, + (vm_map_size_t)arg_size, FALSE, &tmp) != KERN_SUCCESS) { + vm_map_deallocate(proc_map); kmem_free(kernel_map, copy_start, - round_page_32(arg_size)); + round_page(arg_size)); return (EIO); } @@ -1358,28 +1774,29 @@ sysctl_procargsx(name, namelen, where, sizep, cur_proc, argc_yes) * Now that we've done the copyin from the process' * map, we can release the reference to it. */ - task_deallocate(task); + vm_map_deallocate(proc_map); - if( vm_map_copy_overwrite(kernel_map, copy_start, - tmp, FALSE) != KERN_SUCCESS) { + if( vm_map_copy_overwrite(kernel_map, + (vm_map_address_t)copy_start, + tmp, FALSE) != KERN_SUCCESS) { kmem_free(kernel_map, copy_start, - round_page_32(arg_size)); + round_page(arg_size)); return (EIO); } - data = (caddr_t) (copy_end - arg_size); - - if (buflen > p->p_argslen) { - data = &data[buflen - p->p_argslen]; + if (arg_size > p->p_argslen) { + data = (caddr_t) (copy_end - p->p_argslen); size = p->p_argslen; } else { - size = buflen; + data = (caddr_t) (copy_end - arg_size); + size = arg_size; } if (argc_yes) { /* Put process's argc as the first word in the copyout buffer */ suword(where, p->p_argc); - error = copyout(data, where + NBPW, size); + error = copyout(data, (where + sizeof(int)), size); + size += sizeof(int); } else { error = copyout(data, where, size); @@ -1391,14 +1808,13 @@ sysctl_procargsx(name, namelen, where, sizep, cur_proc, argc_yes) * * Note: we keep all pointers&sizes aligned to word boundaries */ - - if ( (! error) && (buflen > p->p_argslen) ) + if ( (!
error) && (buflen > 0 && (u_int)buflen > p->p_argslen) ) { - int binPath_sz; + int binPath_sz, alignedBinPath_sz = 0; int extraSpaceNeeded, addThis; - char * placeHere; + user_addr_t placeHere; char * str = (char *) data; - unsigned int max_len = size; + int max_len = size; /* Some apps are really bad about messing up their stacks. So, we have to be extra careful about getting the length @@ -1413,31 +1829,32 @@ sysctl_procargsx(name, namelen, where, sizep, cur_proc, argc_yes) while ( (binPath_sz < max_len-1) && (*str++ != 0) ) binPath_sz++; + /* If we have a NUL terminator, copy it, too */ if (binPath_sz < max_len-1) binPath_sz += 1; /* Pre-Flight the space requirements */ /* Account for the padding that fills out binPath to the next word */ - binPath_sz += (binPath_sz & (NBPW-1)) ? (NBPW-(binPath_sz & (NBPW-1))) : 0; + alignedBinPath_sz += (binPath_sz & (sizeof(int)-1)) ? (sizeof(int)-(binPath_sz & (sizeof(int)-1))) : 0; placeHere = where + size; /* Account for the bytes needed to keep placeHere word aligned */ - addThis = ((unsigned long)placeHere & (NBPW-1)) ? (NBPW-((unsigned long)placeHere & (NBPW-1))) : 0; + addThis = (placeHere & (sizeof(int)-1)) ? (sizeof(int)-(placeHere & (sizeof(int)-1))) : 0; /* Add up all the space that is needed */ - extraSpaceNeeded = binPath_sz + addThis + (4 * NBPW); + extraSpaceNeeded = alignedBinPath_sz + addThis + binPath_sz + (4 * sizeof(int)); /* is there room to tack on argv[0]? */ - if ( (buflen & ~(NBPW-1)) >= ( p->p_argslen + extraSpaceNeeded )) + if ( (buflen & ~(sizeof(int)-1)) >= ( p->p_argslen + extraSpaceNeeded )) { placeHere += addThis; suword(placeHere, 0); - placeHere += NBPW; + placeHere += sizeof(int); suword(placeHere, 0xBFFF0000); - placeHere += NBPW; + placeHere += sizeof(int); suword(placeHere, 0); - placeHere += NBPW; + placeHere += sizeof(int); error = copyout(data, placeHere, binPath_sz); if ( ! error ) { @@ -1456,7 +1873,7 @@ sysctl_procargsx(name, namelen, where, sizep, cur_proc, argc_yes) return(error); } - if (where != NULL) + if (where != USER_ADDR_NULL) *sizep = size; return (0); } @@ -1469,7 +1886,7 @@ sysctl_procargsx(name, namelen, where, sizep, cur_proc, argc_yes) * limit. */ static int -sysctl_aiomax( void *oldp, size_t *oldlenp, void *newp, size_t newlen ) +sysctl_aiomax(user_addr_t oldp, size_t *oldlenp, user_addr_t newp, size_t newlen) { int error = 0; int new_value; @@ -1502,7 +1919,7 @@ sysctl_aiomax( void *oldp, size_t *oldlenp, void *newp, size_t newlen ) * limit. */ static int -sysctl_aioprocmax( void *oldp, size_t *oldlenp, void *newp, size_t newlen ) +sysctl_aioprocmax(user_addr_t oldp, size_t *oldlenp, user_addr_t newp, size_t newlen ) { int error = 0; int new_value = 0; @@ -1534,7 +1951,7 @@ sysctl_aioprocmax( void *oldp, size_t *oldlenp, void *newp, size_t newlen ) * We only allow an increase in the number of worker threads. */ static int -sysctl_aiothreads( void *oldp, size_t *oldlenp, void *newp, size_t newlen ) +sysctl_aiothreads(user_addr_t oldp, size_t *oldlenp, user_addr_t newp, size_t newlen) { int error = 0; int new_value; @@ -1568,20 +1985,21 @@ sysctl_aiothreads( void *oldp, size_t *oldlenp, void *newp, size_t newlen ) * Makes sure per UID limit is less than the system wide limit.
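A side note on the rounding idiom sysctl_procargsx uses above: reported sizes are padded up to the next sizeof(int) boundary. A self-contained sketch of the same computation (the function name is hypothetical, for illustration only):

#include <stddef.h>

/* Round len up to the next multiple of sizeof(int) (a power of two). */
static size_t
example_round_to_word(size_t len)
{
	size_t rem = len & (sizeof(int) - 1);

	/* Same result as ((len + sizeof(int) - 1) & ~(sizeof(int) - 1)) */
	return (rem ? len + (sizeof(int) - rem) : len);
}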
*/ static int -sysctl_maxprocperuid( void *oldp, size_t *oldlenp, void *newp, size_t newlen ) +sysctl_maxprocperuid(user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen) { int error = 0; int new_value; - if ( oldp != NULL && *oldlenp < sizeof(int) ) + if ( oldp != USER_ADDR_NULL && *oldlenp < sizeof(int) ) return (ENOMEM); - if ( newp != NULL && newlen != sizeof(int) ) + if ( newp != USER_ADDR_NULL && newlen != sizeof(int) ) return (EINVAL); *oldlenp = sizeof(int); - if ( oldp != NULL ) + if ( oldp != USER_ADDR_NULL ) error = copyout( &maxprocperuid, oldp, sizeof(int) ); - if ( error == 0 && newp != NULL ) { + if ( error == 0 && newp != USER_ADDR_NULL ) { error = copyin( newp, &new_value, sizeof(int) ); if ( error == 0 ) { AUDIT_ARG(value, new_value); @@ -1590,7 +2008,7 @@ sysctl_maxprocperuid( void *oldp, size_t *oldlenp, void *newp, size_t newlen ) else error = EINVAL; } - else + else error = EINVAL; } return( error ); @@ -1604,20 +2022,21 @@ sysctl_maxprocperuid( void *oldp, size_t *oldlenp, void *newp, size_t newlen ) * Makes sure per process limit is less than the system-wide limit. */ static int -sysctl_maxfilesperproc( void *oldp, size_t *oldlenp, void *newp, size_t newlen ) +sysctl_maxfilesperproc(user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen) { int error = 0; int new_value; - if ( oldp != NULL && *oldlenp < sizeof(int) ) + if ( oldp != USER_ADDR_NULL && *oldlenp < sizeof(int) ) return (ENOMEM); - if ( newp != NULL && newlen != sizeof(int) ) + if ( newp != USER_ADDR_NULL && newlen != sizeof(int) ) return (EINVAL); *oldlenp = sizeof(int); - if ( oldp != NULL ) + if ( oldp != USER_ADDR_NULL ) error = copyout( &maxfilesperproc, oldp, sizeof(int) ); - if ( error == 0 && newp != NULL ) { + if ( error == 0 && newp != USER_ADDR_NULL ) { error = copyin( newp, &new_value, sizeof(int) ); if ( error == 0 ) { AUDIT_ARG(value, new_value); @@ -1641,25 +2060,26 @@ sysctl_maxfilesperproc( void *oldp, size_t *oldlenp, void *newp, size_t newlen ) * limit set at kernel compilation. 
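The sysctl_aio* and sysctl_max* handlers above (and sysctl_maxproc below) all share one shape: validate the buffer sizes, copy out the current value, then copy in and range-check a proposed new one. A condensed sketch of that shape, with example_value and example_hard_max as hypothetical stand-ins for the tunable and its ceiling:

static int	example_value = 100;		/* hypothetical tunable */
static int	example_hard_max = 1000;	/* hypothetical ceiling */

static int
example_sysctl_int(user_addr_t oldp, size_t *oldlenp,
	user_addr_t newp, size_t newlen)
{
	int error = 0;
	int new_value;

	if (oldp != USER_ADDR_NULL && *oldlenp < sizeof(int))
		return (ENOMEM);
	if (newp != USER_ADDR_NULL && newlen != sizeof(int))
		return (EINVAL);
	*oldlenp = sizeof(int);
	if (oldp != USER_ADDR_NULL)
		error = copyout(&example_value, oldp, sizeof(int));
	if (error == 0 && newp != USER_ADDR_NULL) {
		error = copyin(newp, &new_value, sizeof(int));
		if (error == 0) {
			if (new_value > 0 && new_value <= example_hard_max)
				example_value = new_value;	/* accept */
			else
				error = EINVAL;			/* out of range */
		}
	}
	return (error);
}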
*/ static int -sysctl_maxproc( void *oldp, size_t *oldlenp, void *newp, size_t newlen ) +sysctl_maxproc(user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen ) { int error = 0; int new_value; - if ( oldp != NULL && *oldlenp < sizeof(int) ) + if ( oldp != USER_ADDR_NULL && *oldlenp < sizeof(int) ) return (ENOMEM); - if ( newp != NULL && newlen != sizeof(int) ) + if ( newp != USER_ADDR_NULL && newlen != sizeof(int) ) return (EINVAL); *oldlenp = sizeof(int); - if ( oldp != NULL ) + if ( oldp != USER_ADDR_NULL ) error = copyout( &maxproc, oldp, sizeof(int) ); - if ( error == 0 && newp != NULL ) { + if ( error == 0 && newp != USER_ADDR_NULL ) { error = copyin( newp, &new_value, sizeof(int) ); if ( error == 0 ) { AUDIT_ARG(value, new_value); if ( new_value <= hard_maxproc && new_value > 0 ) - maxproc = new_value; + maxproc = new_value; else error = EINVAL; } diff --git a/bsd/kern/kern_time.c b/bsd/kern/kern_time.c index 0a4d9e52e..07354b8b7 100644 --- a/bsd/kern/kern_time.c +++ b/bsd/kern/kern_time.c @@ -59,18 +59,38 @@ #include <sys/resourcevar.h> #include <sys/kernel.h> #include <sys/systm.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/vnode.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> +#include <sys/sysproto.h> +#include <sys/signalvar.h> #include <kern/clock.h> +#include <kern/thread_call.h> #define HZ 100 /* XXX */ -volatile struct timeval time; /* simple lock used to access timezone, tz structure */ -decl_simple_lock_data(, tz_slock); +lck_spin_t * tz_slock; +lck_grp_t * tz_slock_grp; +lck_attr_t * tz_slock_attr; +lck_grp_attr_t *tz_slock_grp_attr; + +static void setthetime( + struct timeval *tv); + +void time_zone_slock_init(void); + +int gettimeofday(struct proc *p, +#ifdef __ppc__ + struct ppc_gettimeofday_args *uap, +#else + struct gettimeofday_args *uap, +#endif + register_t *retval); + /* * Time of day and interval timer support. * @@ -79,177 +99,183 @@ decl_simple_lock_data(, tz_slock); * here provide support for adding and subtracting timeval structures * and decrementing interval timers, optionally reloading the interval * timers when they expire. + * + * XXX Y2038 bug because of clock_get_calendar_microtime() first argument */ -struct gettimeofday_args{ - struct timeval *tp; - struct timezone *tzp; -}; /* ARGSUSED */ int -gettimeofday(p, uap, retval) - struct proc *p; - register struct gettimeofday_args *uap; - register_t *retval; +gettimeofday(__unused struct proc *p, +#ifdef __ppc__ + register struct ppc_gettimeofday_args *uap, +#else + register struct gettimeofday_args *uap, +#endif + __unused register_t *retval) { struct timeval atv; int error = 0; - extern simple_lock_data_t tz_slock; struct timezone ltz; /* local copy */ /* NOTE THIS implementation is for non ppc architectures only */ if (uap->tp) { - clock_get_calendar_microtime(&atv.tv_sec, &atv.tv_usec); - if (error = copyout((caddr_t)&atv, (caddr_t)uap->tp, - sizeof (atv))) + clock_get_calendar_microtime((uint32_t *)&atv.tv_sec, &atv.tv_usec); + if (IS_64BIT_PROCESS(p)) { + struct user_timeval user_atv; + user_atv.tv_sec = atv.tv_sec; + user_atv.tv_usec = atv.tv_usec; + /* + * This cast is not necessary for PPC, but is + * mostly harmless. 
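The branch above is the pattern this patch applies to every syscall moving a struct with long-sized fields across the user boundary: a 64-bit process gets a widened user_* variant filled field by field, while a 32-bit process's native struct already matches its user ABI. A sketch of the idea, with example_user_timeval as an assumed stand-in for the real user_timeval layout (declared elsewhere in the kernel headers):

/* Assumed LP64 user-ABI shape, for illustration only. */
struct example_user_timeval {
	int64_t	tv_sec;
	int64_t	tv_usec;
};

static int
example_copyout_timeval(struct proc *p, struct timeval *atv, user_addr_t uaddr)
{
	if (IS_64BIT_PROCESS(p)) {
		struct example_user_timeval user_atv;

		user_atv.tv_sec = atv->tv_sec;		/* widen field by field */
		user_atv.tv_usec = atv->tv_usec;
		return (copyout(&user_atv, uaddr, sizeof(user_atv)));
	}
	/* 32-bit caller: layouts agree, copy the native struct */
	return (copyout(atv, uaddr, sizeof(*atv)));
}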
+ */ + error = copyout(&user_atv, CAST_USER_ADDR_T(uap->tp), sizeof(struct user_timeval)); + } else { + error = copyout(&atv, CAST_USER_ADDR_T(uap->tp), sizeof(struct timeval)); + } + if (error) return(error); } if (uap->tzp) { - usimple_lock(&tz_slock); + lck_spin_lock(tz_slock); ltz = tz; - usimple_unlock(&tz_slock); - error = copyout((caddr_t)&ltz, (caddr_t)uap->tzp, + lck_spin_unlock(tz_slock); + error = copyout((caddr_t)&ltz, CAST_USER_ADDR_T(uap->tzp), sizeof (tz)); } return(error); } -struct settimeofday_args { - struct timeval *tv; - struct timezone *tzp; -}; +/* + * XXX Y2038 bug because of setthetime() argument + */ /* ARGSUSED */ int -settimeofday(p, uap, retval) - struct proc *p; - struct settimeofday_args *uap; - register_t *retval; +settimeofday(struct proc *p, struct settimeofday_args *uap, __unused register_t *retval) { struct timeval atv; struct timezone atz; - int error, s; - extern simple_lock_data_t tz_slock; + int error; - if (error = suser(p->p_ucred, &p->p_acflag)) - return (error); - /* Verify all parameters before changing time. */ - if (uap->tv && (error = copyin((caddr_t)uap->tv, - (caddr_t)&atv, sizeof(atv)))) + if ((error = suser(kauth_cred_get(), &p->p_acflag))) return (error); - if (uap->tzp && (error = copyin((caddr_t)uap->tzp, - (caddr_t)&atz, sizeof(atz)))) + /* Verify all parameters before changing time */ + if (uap->tv) { + if (IS_64BIT_PROCESS(p)) { + struct user_timeval user_atv; + error = copyin(uap->tv, &user_atv, sizeof(struct user_timeval)); + atv.tv_sec = user_atv.tv_sec; + atv.tv_usec = user_atv.tv_usec; + } else { + error = copyin(uap->tv, &atv, sizeof(struct timeval)); + } + if (error) + return (error); + } + if (uap->tzp && (error = copyin(uap->tzp, (caddr_t)&atz, sizeof(atz)))) return (error); - if (uap->tv) + if (uap->tv) { + timevalfix(&atv); + if (atv.tv_sec < 0 || (atv.tv_sec == 0 && atv.tv_usec < 0)) + return (EPERM); setthetime(&atv); + } if (uap->tzp) { - usimple_lock(&tz_slock); + lck_spin_lock(tz_slock); tz = atz; - usimple_unlock(&tz_slock); + lck_spin_unlock(tz_slock); } return (0); } -setthetime(tv) - struct timeval *tv; +static void +setthetime( + struct timeval *tv) { - long delta = tv->tv_sec - time.tv_sec; - clock_set_calendar_microtime(tv->tv_sec, tv->tv_usec); - boottime.tv_sec += delta; -#if NFSCLIENT || NFSSERVER - lease_updatetime(delta); -#endif } -struct adjtime_args { - struct timeval *delta; - struct timeval *olddelta; -}; +/* + * XXX Y2038 bug because of clock_adjtime() first argument + */ /* ARGSUSED */ int -adjtime(p, uap, retval) - struct proc *p; - register struct adjtime_args *uap; - register_t *retval; +adjtime(struct proc *p, register struct adjtime_args *uap, __unused register_t *retval) { struct timeval atv; int error; - if (error = suser(p->p_ucred, &p->p_acflag)) + if ((error = suser(kauth_cred_get(), &p->p_acflag))) return (error); - if (error = copyin((caddr_t)uap->delta, - (caddr_t)&atv, sizeof (struct timeval))) + if (IS_64BIT_PROCESS(p)) { + struct user_timeval user_atv; + error = copyin(uap->delta, &user_atv, sizeof(struct user_timeval)); + atv.tv_sec = user_atv.tv_sec; + atv.tv_usec = user_atv.tv_usec; + } else { + error = copyin(uap->delta, &atv, sizeof(struct timeval)); + } + if (error) return (error); - /* - * Compute the total correction and the rate at which to apply it. - */ - clock_adjtime(&atv.tv_sec, &atv.tv_usec); + /* + * Compute the total correction and the rate at which to apply it.
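For reference, the user-space view of this syscall: adjtime(2) slews the clock gradually instead of stepping it, and optionally reports the correction still pending from an earlier call. A small illustrative program (not part of the patch):

#include <sys/time.h>
#include <stdio.h>

int
main(void)
{
	struct timeval delta = { 0, 250000 };	/* slew forward 0.25 s */
	struct timeval olddelta;

	if (adjtime(&delta, &olddelta) == -1) {
		perror("adjtime");	/* fails with EPERM without superuser */
		return (1);
	}
	printf("correction still pending from earlier calls: %ld.%06d s\n",
	    (long)olddelta.tv_sec, (int)olddelta.tv_usec);
	return (0);
}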
+ */ + clock_adjtime((int32_t *)&atv.tv_sec, &atv.tv_usec); if (uap->olddelta) { - (void) copyout((caddr_t)&atv, - (caddr_t)uap->olddelta, sizeof (struct timeval)); + if (IS_64BIT_PROCESS(p)) { + struct user_timeval user_atv; + user_atv.tv_sec = atv.tv_sec; + user_atv.tv_usec = atv.tv_usec; + error = copyout(&user_atv, uap->olddelta, sizeof(struct user_timeval)); + } else { + error = copyout(&atv, uap->olddelta, sizeof(struct timeval)); + } } return (0); } /* - * Initialze the time of day register. - * Trust the RTC except for the case where it is set before - * the UNIX epoch. In that case use the the UNIX epoch. - * The argument passed in is ignored. + * Verify the calendar value. If negative, + * reset to zero (the epoch). */ void -inittodr(base) - time_t base; +inittodr( + __unused time_t base) { struct timeval tv; /* * Assertion: * The calendar has already been - * set up from the battery clock. + * set up from the platform clock. * * The value returned by microtime() * is gotten from the calendar. */ microtime(&tv); - time = tv; - boottime.tv_sec = tv.tv_sec; - boottime.tv_usec = 0; - - /* - * If the RTC does not have acceptable value, i.e. time before - * the UNIX epoch, set it to the UNIX epoch - */ - if (tv.tv_sec < 0) { + if (tv.tv_sec < 0 || tv.tv_usec < 0) { printf ("WARNING: preposterous time in Real Time Clock"); - time.tv_sec = 0; /* the UNIX epoch */ - time.tv_usec = 0; - setthetime(&time); - boottime = time; + tv.tv_sec = 0; /* the UNIX epoch */ + tv.tv_usec = 0; + setthetime(&tv); printf(" -- CHECK AND RESET THE DATE!\n"); } - - return; } -void timevaladd( - struct timeval *t1, - struct timeval *t2); -void timevalsub( - struct timeval *t1, - struct timeval *t2); -void timevalfix( - struct timeval *t1); +time_t +boottime_sec(void) +{ + uint32_t sec, nanosec; + clock_get_boottime_nanotime(&sec, &nanosec); + return (sec); +} -uint64_t - tvtoabstime( - struct timeval *tvp); +uint64_t tvtoabstime(struct timeval *tvp); /* * Get value of an interval timer. The process virtual and @@ -271,16 +297,9 @@ uint64_t * absolute time when the timer should go off. 
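In user space this machinery is driven through getitimer(2)/setitimer(2); ITIMER_REAL decrements in real time and delivers SIGALRM, which is what the realitexpire thread call below implements. An illustrative program (not part of the patch):

#include <sys/time.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void
on_alarm(int signo)
{
	(void)signo;
	/* only async-signal-safe calls belong in a handler */
	write(STDOUT_FILENO, "tick\n", 5);
}

int
main(void)
{
	struct itimerval itv;

	signal(SIGALRM, on_alarm);
	itv.it_value.tv_sec = 1;	/* first expiry in 1 s */
	itv.it_value.tv_usec = 0;
	itv.it_interval.tv_sec = 1;	/* then reload every 1 s */
	itv.it_interval.tv_usec = 0;
	if (setitimer(ITIMER_REAL, &itv, NULL) == -1) {
		perror("setitimer");
		return (1);
	}
	for (;;)
		pause();	/* SIGALRM interrupts pause() each second */
}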
*/ -struct getitimer_args { - u_int which; - struct itimerval *itv; -}; /* ARGSUSED */ int -getitimer(p, uap, retval) - struct proc *p; - register struct getitimer_args *uap; - register_t *retval; +getitimer(struct proc *p, register struct getitimer_args *uap, __unused register_t *retval) { struct itimerval aitv; @@ -310,15 +329,18 @@ getitimer(p, uap, retval) else aitv = p->p_stats->p_timer[uap->which]; - return (copyout((caddr_t)&aitv, - (caddr_t)uap->itv, sizeof (struct itimerval))); + if (IS_64BIT_PROCESS(p)) { + struct user_itimerval user_itv; + user_itv.it_interval.tv_sec = aitv.it_interval.tv_sec; + user_itv.it_interval.tv_usec = aitv.it_interval.tv_usec; + user_itv.it_value.tv_sec = aitv.it_value.tv_sec; + user_itv.it_value.tv_usec = aitv.it_value.tv_usec; + return (copyout((caddr_t)&user_itv, uap->itv, sizeof (struct user_itimerval))); + } else { + return (copyout((caddr_t)&aitv, uap->itv, sizeof (struct itimerval))); + } } -struct setitimer_args { - u_int which; - struct itimerval *itv; - struct itimerval *oitv; -}; /* ARGSUSED */ int setitimer(p, uap, retval) @@ -327,28 +349,38 @@ setitimer(p, uap, retval) register_t *retval; { struct itimerval aitv; - register struct itimerval *itvp; + user_addr_t itvp; int error; if (uap->which > ITIMER_PROF) return (EINVAL); - if ((itvp = uap->itv) && - (error = copyin((caddr_t)itvp, - (caddr_t)&aitv, sizeof (struct itimerval)))) - return (error); - if ((uap->itv = uap->oitv) && (error = getitimer(p, uap, retval))) + if ((itvp = uap->itv)) { + if (IS_64BIT_PROCESS(p)) { + struct user_itimerval user_itv; + if ((error = copyin(itvp, (caddr_t)&user_itv, sizeof (struct user_itimerval)))) + return (error); + aitv.it_interval.tv_sec = user_itv.it_interval.tv_sec; + aitv.it_interval.tv_usec = user_itv.it_interval.tv_usec; + aitv.it_value.tv_sec = user_itv.it_value.tv_sec; + aitv.it_value.tv_usec = user_itv.it_value.tv_usec; + } else { + if ((error = copyin(itvp, (caddr_t)&aitv, sizeof (struct itimerval)))) + return (error); + } + } + if ((uap->itv = uap->oitv) && (error = getitimer(p, (struct getitimer_args *)uap, retval))) return (error); if (itvp == 0) return (0); if (itimerfix(&aitv.it_value) || itimerfix(&aitv.it_interval)) return (EINVAL); if (uap->which == ITIMER_REAL) { - thread_call_func_cancel(realitexpire, (void *)p->p_pid, FALSE); + thread_call_func_cancel((thread_call_func_t)realitexpire, (void *)p->p_pid, FALSE); if (timerisset(&aitv.it_value)) { microuptime(&p->p_rtime); timevaladd(&p->p_rtime, &aitv.it_value); thread_call_func_delayed( - realitexpire, (void *)p->p_pid, + (thread_call_func_t)realitexpire, (void *)p->p_pid, tvtoabstime(&p->p_rtime)); } else @@ -376,8 +408,9 @@ realitexpire( { register struct proc *p; struct timeval now; - boolean_t funnel_state = thread_funnel_set(kernel_flock, TRUE); + boolean_t funnel_state; + funnel_state = thread_funnel_set(kernel_flock, TRUE); p = pfind((pid_t)pid); if (p == NULL) { (void) thread_funnel_set(kernel_flock, FALSE); @@ -410,7 +443,7 @@ realitexpire( psignal(p, SIGALRM); - thread_call_func_delayed(realitexpire, pid, tvtoabstime(&p->p_rtime)); + thread_call_func_delayed((thread_call_func_t)realitexpire, pid, tvtoabstime(&p->p_rtime)); (void) thread_funnel_set(kernel_flock, FALSE); } @@ -527,14 +560,14 @@ void microtime( struct timeval *tvp) { - clock_get_calendar_microtime(&tvp->tv_sec, &tvp->tv_usec); + clock_get_calendar_microtime((uint32_t *)&tvp->tv_sec, &tvp->tv_usec); } void microuptime( struct timeval *tvp) { - clock_get_system_microtime(&tvp->tv_sec, &tvp->tv_usec); + 
clock_get_system_microtime((uint32_t *)&tvp->tv_sec, &tvp->tv_usec); } /* @@ -544,14 +577,14 @@ void nanotime( struct timespec *tsp) { - clock_get_calendar_nanotime((uint32_t *)&tsp->tv_sec, &tsp->tv_nsec); + clock_get_calendar_nanotime((uint32_t *)&tsp->tv_sec, (uint32_t *)&tsp->tv_nsec); } void nanouptime( struct timespec *tsp) { - clock_get_system_nanotime((uint32_t *)&tsp->tv_sec, &tsp->tv_nsec); + clock_get_system_nanotime((uint32_t *)&tsp->tv_sec, (uint32_t *)&tsp->tv_nsec); } uint64_t @@ -570,9 +603,17 @@ tvtoabstime( void time_zone_slock_init(void) { - extern simple_lock_data_t tz_slock; + /* allocate lock group attribute and group */ + tz_slock_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(tz_slock_grp_attr); - simple_lock_init(&tz_slock); + tz_slock_grp = lck_grp_alloc_init("tzlock", tz_slock_grp_attr); + /* Allocate lock attribute */ + tz_slock_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(tz_slock_attr); + /* Allocate the spin lock */ + tz_slock = lck_spin_alloc_init(tz_slock_grp, tz_slock_attr); } + diff --git a/bsd/kern/kern_xxx.c b/bsd/kern/kern_xxx.c index ebc0af446..470a220e8 100644 --- a/bsd/kern/kern_xxx.c +++ b/bsd/kern/kern_xxx.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -60,7 +60,8 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/reboot.h> #include <sys/vm.h> #include <sys/sysctl.h> @@ -68,142 +69,25 @@ #include <bsm/audit_kernel.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> +#include <sys/sysproto.h> -#if COMPAT_43 -/* ARGSUSED */ int -ogethostid(p, uap, retval) -struct proc *p; -void *uap; -register_t *retval; -{ - - *retval = hostid; - return 0; -} - -struct osethostid_args { - long hostid; -}; -/* ARGSUSED */ -int -osethostid(p, uap, retval) -struct proc *p; -register struct osethostid_args *uap; -register_t *retval; -{ - int error; - - if (error = suser(p->p_ucred, &p->p_acflag)) - return (error); - hostid = uap->hostid; - return (0); - -} - -struct ogethostname_args { - char *hostname; - u_int len; -}; -/* ARGSUSED */ -int -ogethostname(p, uap, retval) -struct proc *p; -register struct ogethostname_args *uap; -register_t *retval; -{ - int name; - - name = KERN_HOSTNAME; - - return (kern_sysctl(&name, 1, uap->hostname, &uap->len, 0, 0)); -} - -struct osethostname_args { - char *hostname; - u_int len; -}; -/* ARGSUSED */ -int -osethostname(p, uap, retval) -struct proc *p; -register struct osethostname_args *uap; -register_t *retval; -{ - int name; - int error; - - if (error = suser(p->p_ucred, &p->p_acflag)) - return (error); - - name = KERN_HOSTNAME; - return (kern_sysctl(&name, 1, 0, 0, uap->hostname, - uap->len)); -} - -struct ogetdomainname_args { - char *domainname; - int len; -}; -/* ARGSUSED */ -int -ogetdomainname(p, uap, retval) -struct proc *p; -register struct ogetdomainname_args *uap; -register_t *retval; -{ - int name; - - name = KERN_DOMAINNAME; - return (kern_sysctl(&name, 1, uap->domainname, - &uap->len, 0, 0)); -} - -struct osetdomainname_args { - char *domainname; - u_int len; -}; -/* ARGSUSED */ -int -osetdomainname(p, uap, retval) -struct proc *p; -register struct osetdomainname_args *uap; -register_t *retval; -{ - int name; - int error; - - if (error = suser(p->p_ucred, &p->p_acflag)) - return (error); - name = KERN_DOMAINNAME; - return 
(kern_sysctl(&name, 1, 0, 0, uap->domainname, - uap->len)); -} -#endif /* COMPAT_43 */ - -struct reboot_args { - int opt; - char *command; -}; - -reboot(p, uap, retval) -struct proc *p; -register struct reboot_args *uap; -register_t *retval; +reboot(struct proc *p, register struct reboot_args *uap, __unused register_t *retval) { char command[64]; int error; int dummy=0; AUDIT_ARG(cmd, uap->opt); + command[0] = '\0'; - if (error = suser(p->p_cred->pc_ucred, &p->p_acflag)) + if ((error = suser(kauth_cred_get(), &p->p_acflag))) return(error); if (uap->opt & RB_COMMAND) - error = copyinstr((void *)uap->command, + error = copyinstr(uap->command, (void *)command, sizeof(command), (size_t *)&dummy); if (!error) { SET(p->p_flag, P_REBOOT); /* No more signals for this proc */ diff --git a/bsd/kern/kpi_mbuf.c b/bsd/kern/kpi_mbuf.c new file mode 100644 index 000000000..15c290aab --- /dev/null +++ b/bsd/kern/kpi_mbuf.c @@ -0,0 +1,939 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#define __KPI__ +//#include <sys/kpi_interface.h> + +#include <sys/param.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <kern/debug.h> +#include <libkern/OSAtomic.h> +#include <kern/kalloc.h> +#include <string.h> + +void mbuf_tag_id_first_last(u_long *first, u_long *last); +errno_t mbuf_tag_id_find_internal(const char *string, u_long *out_id, int create); + +static const mbuf_flags_t mbuf_flags_mask = MBUF_EXT | MBUF_PKTHDR | MBUF_EOR | + MBUF_BCAST | MBUF_MCAST | MBUF_FRAG | MBUF_FIRSTFRAG | + MBUF_LASTFRAG | MBUF_PROMISC; + +void* mbuf_data(mbuf_t mbuf) +{ + return m_mtod(mbuf); +} + +void* mbuf_datastart(mbuf_t mbuf) +{ + if (mbuf->m_flags & M_EXT) + return mbuf->m_ext.ext_buf; + if (mbuf->m_flags & M_PKTHDR) + return mbuf->m_pktdat; + return mbuf->m_dat; +} + +errno_t mbuf_setdata(mbuf_t mbuf, void* data, size_t len) +{ + size_t start = (size_t)((char*)mbuf_datastart(mbuf)); + size_t maxlen = mbuf_maxlen(mbuf); + + if ((size_t)data < start || ((size_t)data) + len > start + maxlen) + return EINVAL; + mbuf->m_data = data; + mbuf->m_len = len; + + return 0; +} + +errno_t mbuf_align_32(mbuf_t mbuf, size_t len) +{ + if ((mbuf->m_flags & M_EXT) != 0 && m_mclhasreference(mbuf)) + return ENOTSUP; + mbuf->m_data = mbuf_datastart(mbuf); + mbuf->m_data += ((mbuf_trailingspace(mbuf) - len) &~ (sizeof(u_int32_t) - 1)); + + return 0; +} + +addr64_t mbuf_data_to_physical(void* ptr) +{ + return (addr64_t)mcl_to_paddr(ptr); +} + +errno_t mbuf_get(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf) +{ + /* Must set *mbuf to NULL in failure case */ + *mbuf = m_get(how, type); + + return (*mbuf == NULL) ? 
ENOMEM : 0; +} + +errno_t mbuf_gethdr(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf) +{ + /* Must set *mbuf to NULL in failure case */ + *mbuf = m_gethdr(how, type); + + return (*mbuf == NULL) ? ENOMEM : 0; +} + +extern struct mbuf * m_mbigget(struct mbuf *m, int nowait); + +errno_t mbuf_getcluster(mbuf_how_t how, mbuf_type_t type, size_t size, mbuf_t* mbuf) +{ + /* Must set *mbuf to NULL in failure case */ + errno_t error = 0; + int created = 0; + + if (mbuf == NULL) + return EINVAL; + if (*mbuf == NULL) { + *mbuf = m_get(how, type); + if (*mbuf == NULL) + return ENOMEM; + created = 1; + } + /* + * At the time this code was written, m_mclget and m_mbigget would always + * return the same value that was passed in to it. + */ + if (size == MCLBYTES) { + *mbuf = m_mclget(*mbuf, how); + } else if (size == NBPG) { + *mbuf = m_mbigget(*mbuf, how); + } else { + error = EINVAL; + goto out; + } + if (*mbuf == NULL || ((*mbuf)->m_flags & M_EXT) == 0) + error = ENOMEM; +out: + if (created && error != 0) { + error = ENOMEM; + mbuf_free(*mbuf); + *mbuf = NULL; + } + return error; +} + +errno_t mbuf_mclget(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf) +{ + /* Must set *mbuf to NULL in failure case */ + errno_t error = 0; + int created = 0; + if (mbuf == NULL) return EINVAL; + if (*mbuf == NULL) { + error = mbuf_get(how, type, mbuf); + if (error) + return error; + created = 1; + } + + /* + * At the time this code was written, m_mclget would always + * return the same value that was passed in to it. + */ + *mbuf = m_mclget(*mbuf, how); + + if (created && ((*mbuf)->m_flags & M_EXT) == 0) { + mbuf_free(*mbuf); + *mbuf = NULL; + } + if (*mbuf == NULL || ((*mbuf)->m_flags & M_EXT) == 0) + error = ENOMEM; + return error; +} + + +errno_t mbuf_getpacket(mbuf_how_t how, mbuf_t *mbuf) +{ + /* Must set *mbuf to NULL in failure case */ + errno_t error = 0; + + *mbuf = m_getpacket_how(how); + + if (*mbuf == NULL) { + if (how == MBUF_WAITOK) + error = ENOMEM; + else + error = EWOULDBLOCK; + } + + return error; +} + +mbuf_t mbuf_free(mbuf_t mbuf) +{ + return m_free(mbuf); +} + +void mbuf_freem(mbuf_t mbuf) +{ + m_freem(mbuf); +} + +int mbuf_freem_list(mbuf_t mbuf) +{ + return m_freem_list(mbuf); +} + +size_t mbuf_leadingspace(mbuf_t mbuf) +{ + return m_leadingspace(mbuf); +} + +size_t mbuf_trailingspace(mbuf_t mbuf) +{ + return m_trailingspace(mbuf); +} + +/* Manipulation */ +errno_t mbuf_copym(mbuf_t src, size_t offset, size_t len, + mbuf_how_t how, mbuf_t *new_mbuf) +{ + /* Must set *mbuf to NULL in failure case */ + *new_mbuf = m_copym(src, offset, len, how); + + return (*new_mbuf == NULL) ? ENOMEM : 0; +} + +errno_t mbuf_dup(mbuf_t src, mbuf_how_t how, mbuf_t *new_mbuf) +{ + /* Must set *new_mbuf to NULL in failure case */ + *new_mbuf = m_dup(src, how); + + return (*new_mbuf == NULL) ? ENOMEM : 0; +} + +errno_t mbuf_prepend(mbuf_t *orig, size_t len, mbuf_how_t how) +{ + /* Must set *orig to NULL in failure case */ + *orig = m_prepend_2(*orig, len, how); + + return (*orig == NULL) ? ENOMEM : 0; +} + +errno_t mbuf_split(mbuf_t src, size_t offset, + mbuf_how_t how, mbuf_t *new_mbuf) +{ + /* Must set *new_mbuf to NULL in failure case */ + *new_mbuf = m_split(src, offset, how); + + return (*new_mbuf == NULL) ? ENOMEM : 0; +} + +errno_t mbuf_pullup(mbuf_t *mbuf, size_t len) +{ + /* Must set *mbuf to NULL in failure case */ + *mbuf = m_pullup(*mbuf, len); + + return (*mbuf == NULL) ? 
ENOMEM : 0; +} + +errno_t mbuf_pulldown(mbuf_t src, size_t *offset, size_t len, mbuf_t *location) +{ + /* Must set *location to NULL in failure case */ + int new_offset; + *location = m_pulldown(src, *offset, len, &new_offset); + *offset = new_offset; + + return (*location == NULL) ? ENOMEM : 0; +} + +void mbuf_adj(mbuf_t mbuf, int len) +{ + m_adj(mbuf, len); +} + +errno_t mbuf_copydata(mbuf_t m, size_t off, size_t len, void* out_data) +{ + /* Copied m_copydata, added error handling (don't just panic) */ + int count; + + while (off > 0) { + if (m == 0) + return EINVAL; + if (off < (size_t)m->m_len) + break; + off -= m->m_len; + m = m->m_next; + } + while (len > 0) { + if (m == 0) + return EINVAL; + count = m->m_len - off > len ? len : m->m_len - off; + bcopy(mtod(m, caddr_t) + off, out_data, count); + len -= count; + out_data = ((char*)out_data) + count; + off = 0; + m = m->m_next; + } + + return 0; +} + +int mbuf_mclref(mbuf_t mbuf) +{ + return m_mclref(mbuf); +} + +int mbuf_mclunref(mbuf_t mbuf) +{ + return m_mclunref(mbuf); +} + +int mbuf_mclhasreference(mbuf_t mbuf) +{ + if ((mbuf->m_flags & M_EXT)) + return m_mclhasreference(mbuf); + else + return 0; +} + + +/* mbuf header */ +mbuf_t mbuf_next(mbuf_t mbuf) +{ + return mbuf->m_next; +} + +errno_t mbuf_setnext(mbuf_t mbuf, mbuf_t next) +{ + if (next && ((next)->m_nextpkt != NULL || + (next)->m_type == MT_FREE)) return EINVAL; + mbuf->m_next = next; + + return 0; +} + +mbuf_t mbuf_nextpkt(mbuf_t mbuf) +{ + return mbuf->m_nextpkt; +} + +void mbuf_setnextpkt(mbuf_t mbuf, mbuf_t nextpkt) +{ + mbuf->m_nextpkt = nextpkt; +} + +size_t mbuf_len(mbuf_t mbuf) +{ + return mbuf->m_len; +} + +void mbuf_setlen(mbuf_t mbuf, size_t len) +{ + mbuf->m_len = len; +} + +size_t mbuf_maxlen(mbuf_t mbuf) +{ + if (mbuf->m_flags & M_EXT) + return mbuf->m_ext.ext_size; + return &mbuf->m_dat[MLEN] - ((char*)mbuf_datastart(mbuf)); +} + +mbuf_type_t mbuf_type(mbuf_t mbuf) +{ + return mbuf->m_type; +} + +errno_t mbuf_settype(mbuf_t mbuf, mbuf_type_t new_type) +{ + if (new_type == MBUF_TYPE_FREE) return EINVAL; + + m_mchtype(mbuf, new_type); + + return 0; +} + +mbuf_flags_t mbuf_flags(mbuf_t mbuf) +{ + return mbuf->m_flags & mbuf_flags_mask; +} + +errno_t mbuf_setflags(mbuf_t mbuf, mbuf_flags_t flags) +{ + if ((flags & ~mbuf_flags_mask) != 0) return EINVAL; + mbuf->m_flags = flags | + (mbuf->m_flags & ~mbuf_flags_mask); + + return 0; +} + +errno_t mbuf_setflags_mask(mbuf_t mbuf, mbuf_flags_t flags, mbuf_flags_t mask) +{ + if (((flags | mask) & ~mbuf_flags_mask) != 0) return EINVAL; + + mbuf->m_flags = (flags & mask) | (mbuf->m_flags & ~mask); + + return 0; +} + +errno_t mbuf_copy_pkthdr(mbuf_t dest, mbuf_t src) +{ + if (((src)->m_flags & M_PKTHDR) == 0) + return EINVAL; + + m_copy_pkthdr(dest, src); + + return 0; +} + +size_t mbuf_pkthdr_len(mbuf_t mbuf) +{ + return mbuf->m_pkthdr.len; +} + +void mbuf_pkthdr_setlen(mbuf_t mbuf, size_t len) +{ + mbuf->m_pkthdr.len = len; +} + +ifnet_t mbuf_pkthdr_rcvif(mbuf_t mbuf) +{ + // If we reference count ifnets, we should take a reference here before returning + return mbuf->m_pkthdr.rcvif; +} + +errno_t mbuf_pkthdr_setrcvif(mbuf_t mbuf, ifnet_t ifnet) +{ + /* May want to walk ifnet list to determine if interface is valid */ + mbuf->m_pkthdr.rcvif = (struct ifnet*)ifnet; + return 0; +} + +void* mbuf_pkthdr_header(mbuf_t mbuf) +{ + return mbuf->m_pkthdr.header; +} + +void mbuf_pkthdr_setheader(mbuf_t mbuf, void *header) +{ + mbuf->m_pkthdr.header = (void*)header; +} + +/* mbuf aux data */ +errno_t mbuf_aux_add(mbuf_t mbuf, int 
family, mbuf_type_t type, mbuf_t *aux_mbuf) +{ + *aux_mbuf = m_aux_add(mbuf, family, type); + return (*aux_mbuf == NULL) ? ENOMEM : 0; +} + +mbuf_t mbuf_aux_find(mbuf_t mbuf, int family, mbuf_type_t type) +{ + return m_aux_find(mbuf, family, type); +} + +void mbuf_aux_delete(mbuf_t mbuf, mbuf_t aux) +{ + m_aux_delete(mbuf, aux); +} + +void +mbuf_inbound_modified(mbuf_t mbuf) +{ + /* Invalidate hardware generated checksum flags */ + mbuf->m_pkthdr.csum_flags = 0; +} + +extern void in_cksum_offset(struct mbuf* m, size_t ip_offset); +extern void in_delayed_cksum_offset(struct mbuf *m, int ip_offset); + +void +mbuf_outbound_finalize(mbuf_t mbuf, u_long protocol_family, size_t protocol_offset) +{ + if ((mbuf->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_IP)) == 0) + return; + + /* Generate the packet in software, client needs it */ + switch (protocol_family) { + case PF_INET: + if (mbuf->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + in_delayed_cksum_offset(mbuf, protocol_offset); + } + + if (mbuf->m_pkthdr.csum_flags & CSUM_DELAY_IP) { + in_cksum_offset(mbuf, protocol_offset); + } + + mbuf->m_pkthdr.csum_flags &= ~(CSUM_DELAY_DATA | CSUM_DELAY_IP); + break; + + default: + /* + * Not sure what to do here if anything. + * Hardware checksum code looked pretty IPv4 specific. + */ + if ((mbuf->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_IP)) != 0) + panic("mbuf_outbound_finalize - CSUM flags set for non-IPv4 packet (%d)!\n", protocol_family); + } +} + +errno_t +mbuf_set_vlan_tag( + mbuf_t mbuf, + u_int16_t vlan) +{ + mbuf->m_pkthdr.csum_flags |= CSUM_VLAN_TAG_VALID; + mbuf->m_pkthdr.vlan_tag = vlan; + + return 0; +} + +errno_t +mbuf_get_vlan_tag( + mbuf_t mbuf, + u_int16_t *vlan) +{ + if ((mbuf->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) + return ENXIO; // No vlan tag set + + *vlan = mbuf->m_pkthdr.vlan_tag; + + return 0; +} + +errno_t +mbuf_clear_vlan_tag( + mbuf_t mbuf) +{ + mbuf->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID; + mbuf->m_pkthdr.vlan_tag = 0; + + return 0; +} + +static const mbuf_csum_request_flags_t mbuf_valid_csum_request_flags = + MBUF_CSUM_REQ_IP | MBUF_CSUM_REQ_TCP | MBUF_CSUM_REQ_UDP | MBUF_CSUM_REQ_SUM16; + +errno_t +mbuf_set_csum_requested( + mbuf_t mbuf, + mbuf_csum_request_flags_t request, + u_int32_t value) +{ + request &= mbuf_valid_csum_request_flags; + mbuf->m_pkthdr.csum_flags = (mbuf->m_pkthdr.csum_flags & 0xffff0000) | request; + mbuf->m_pkthdr.csum_data = value; + + return 0; +} + +errno_t +mbuf_get_csum_requested( + mbuf_t mbuf, + mbuf_csum_request_flags_t *request, + u_int32_t *value) +{ + *request = mbuf->m_pkthdr.csum_flags; + *request &= mbuf_valid_csum_request_flags; + if (value != NULL) { + *value = mbuf->m_pkthdr.csum_data; + } + + return 0; +} + +errno_t +mbuf_clear_csum_requested( + mbuf_t mbuf) +{ + mbuf->m_pkthdr.csum_flags &= 0xffff0000; + mbuf->m_pkthdr.csum_data = 0; + + return 0; +} + +static const mbuf_csum_performed_flags_t mbuf_valid_csum_performed_flags = + MBUF_CSUM_DID_IP | MBUF_CSUM_IP_GOOD | MBUF_CSUM_DID_DATA | + MBUF_CSUM_PSEUDO_HDR | MBUF_CSUM_TCP_SUM16; + +errno_t +mbuf_set_csum_performed( + mbuf_t mbuf, + mbuf_csum_performed_flags_t performed, + u_int32_t value) +{ + performed &= mbuf_valid_csum_performed_flags; + mbuf->m_pkthdr.csum_flags = (mbuf->m_pkthdr.csum_flags & 0xffff0000) | performed; + mbuf->m_pkthdr.csum_data = value; + + return 0; +} + +errno_t +mbuf_get_csum_performed( + mbuf_t mbuf, + mbuf_csum_performed_flags_t *performed, + u_int32_t *value) +{ + *performed = mbuf->m_pkthdr.csum_flags & 
mbuf_valid_csum_performed_flags; + *value = mbuf->m_pkthdr.csum_data; + + return 0; +} + +errno_t +mbuf_clear_csum_performed( + mbuf_t mbuf) +{ + mbuf->m_pkthdr.csum_flags &= 0xffff0000; + mbuf->m_pkthdr.csum_data = 0; + + return 0; +} + +/* + * Mbuf tag KPIs + */ + +struct mbuf_tag_id_entry { + SLIST_ENTRY(mbuf_tag_id_entry) next; + mbuf_tag_id_t id; + char string[]; +}; + +#define MBUF_TAG_ID_ENTRY_SIZE(__str) \ + ((size_t)&(((struct mbuf_tag_id_entry*)0)->string[0]) + \ + strlen(__str) + 1) + +#define MTAG_FIRST_ID 1000 +static u_long mtag_id_next = MTAG_FIRST_ID; +static SLIST_HEAD(,mbuf_tag_id_entry) mtag_id_list = {NULL}; +static lck_mtx_t *mtag_id_lock = NULL; + +__private_extern__ void +mbuf_tag_id_first_last( + u_long *first, + u_long *last) +{ + *first = MTAG_FIRST_ID; + *last = mtag_id_next - 1; +} + +__private_extern__ errno_t +mbuf_tag_id_find_internal( + const char *string, + u_long *out_id, + int create) +{ + struct mbuf_tag_id_entry *entry = NULL; + + + *out_id = 0; + + if (string == NULL || out_id == NULL) { + return EINVAL; + } + + /* Don't bother allocating the lock if we're only doing a lookup */ + if (create == 0 && mtag_id_lock == NULL) + return ENOENT; + + /* Allocate lock if necessary */ + if (mtag_id_lock == NULL) { + lck_grp_attr_t *grp_attrib = NULL; + lck_attr_t *lck_attrb = NULL; + lck_grp_t *lck_group = NULL; + lck_mtx_t *new_lock = NULL; + + grp_attrib = lck_grp_attr_alloc_init(); + lck_grp_attr_setdefault(grp_attrib); + lck_group = lck_grp_alloc_init("mbuf_tag_allocate_id", grp_attrib); + lck_grp_attr_free(grp_attrib); + lck_attrb = lck_attr_alloc_init(); + lck_attr_setdefault(lck_attrb); + lck_attr_setdebug(lck_attrb); + new_lock = lck_mtx_alloc_init(lck_group, lck_attrb); + if (!OSCompareAndSwap((UInt32)0, (UInt32)new_lock, (UInt32*)&mtag_id_lock)) { + /* + * If the atomic swap fails, someone else has already + * done this work. We can free the stuff we allocated. + */ + lck_mtx_free(new_lock, lck_group); + lck_grp_free(lck_group); + } + lck_attr_free(lck_attrb); + } + + /* Look for an existing entry */ + lck_mtx_lock(mtag_id_lock); + SLIST_FOREACH(entry, &mtag_id_list, next) { + if (strcmp(string, entry->string) == 0) { + break; + } + } + + if (entry == NULL) { + if (create == 0) { + lck_mtx_unlock(mtag_id_lock); + return ENOENT; + } + + entry = kalloc(MBUF_TAG_ID_ENTRY_SIZE(string)); + if (entry == NULL) { + lck_mtx_unlock(mtag_id_lock); + return ENOMEM; + } + + strcpy(entry->string, string); + entry->id = mtag_id_next; + mtag_id_next++; + SLIST_INSERT_HEAD(&mtag_id_list, entry, next); + } + lck_mtx_unlock(mtag_id_lock); + + *out_id = entry->id; + + return 0; +} + +errno_t +mbuf_tag_id_find( + const char *string, + mbuf_tag_id_t *out_id) +{ + return mbuf_tag_id_find_internal(string, (u_long*)out_id, 1); +} + +errno_t +mbuf_tag_allocate( + mbuf_t mbuf, + mbuf_tag_id_t id, + mbuf_tag_type_t type, + size_t length, + mbuf_how_t how, + void** data_p) +{ + struct m_tag *tag; + + if (data_p != NULL) + *data_p = NULL; + + /* Sanity check parameters */ + if (mbuf == NULL || (mbuf->m_flags & M_PKTHDR) == 0 || id < MTAG_FIRST_ID || + id >= mtag_id_next || length < 1 || (length & 0xffff0000) != 0 || + data_p == NULL) { + return EINVAL; + } + + /* Make sure this mtag hasn't already been allocated */ + tag = m_tag_locate(mbuf, id, type, NULL); + if (tag != NULL) { + return EEXIST; + } + + /* Allocate an mtag */ + tag = m_tag_alloc(id, type, length, how); + if (tag == NULL) { + return how == M_WAITOK ? 
ENOMEM : EWOULDBLOCK; + } + + /* Attach the mtag and set *data_p */ + m_tag_prepend(mbuf, tag); + *data_p = tag + 1; + + return 0; +} + +errno_t +mbuf_tag_find( + mbuf_t mbuf, + mbuf_tag_id_t id, + mbuf_tag_type_t type, + size_t* length, + void** data_p) +{ + struct m_tag *tag; + + if (length != NULL) + *length = 0; + if (data_p != NULL) + *data_p = NULL; + + /* Sanity check parameters */ + if (mbuf == NULL || (mbuf->m_flags & M_PKTHDR) == 0 || id < MTAG_FIRST_ID || + id >= mtag_id_next || length == NULL || data_p == NULL) { + return EINVAL; + } + + /* Locate an mtag */ + tag = m_tag_locate(mbuf, id, type, NULL); + if (tag == NULL) { + return ENOENT; + } + + /* Copy out the pointer to the data and the length value */ + *length = tag->m_tag_len; + *data_p = tag + 1; + + return 0; +} + +void +mbuf_tag_free( + mbuf_t mbuf, + mbuf_tag_id_t id, + mbuf_tag_type_t type) +{ + struct m_tag *tag; + + if (mbuf == NULL || (mbuf->m_flags & M_PKTHDR) == 0 || id < MTAG_FIRST_ID || + id >= mtag_id_next) + return; + + tag = m_tag_locate(mbuf, id, type, NULL); + if (tag == NULL) { + return; + } + + m_tag_delete(mbuf, tag); + return; +} + +/* mbuf stats */ +void mbuf_stats(struct mbuf_stat *stats) +{ + stats->mbufs = mbstat.m_mbufs; + stats->clusters = mbstat.m_clusters; + stats->clfree = mbstat.m_clfree; + stats->drops = mbstat.m_drops; + stats->wait = mbstat.m_wait; + stats->drain = mbstat.m_drain; + __builtin_memcpy(stats->mtypes, mbstat.m_mtypes, sizeof(stats->mtypes)); + stats->mcfail = mbstat.m_mcfail; + stats->mpfail = mbstat.m_mpfail; + stats->msize = mbstat.m_msize; + stats->mclbytes = mbstat.m_mclbytes; + stats->minclsize = mbstat.m_minclsize; + stats->mlen = mbstat.m_mlen; + stats->mhlen = mbstat.m_mhlen; + stats->bigclusters = mbstat.m_bigclusters; + stats->bigclfree = mbstat.m_bigclfree; + stats->bigmclbytes = mbstat.m_bigmclbytes; +} + +errno_t +mbuf_allocpacket(mbuf_how_t how, size_t packetlen, unsigned int *maxchunks, mbuf_t *mbuf) +{ + errno_t error; + struct mbuf *m; + unsigned int numpkts = 1; + unsigned int numchunks = maxchunks ? *maxchunks : 0; + + if (packetlen == 0) { + error = EINVAL; + goto out; + } + m = m_allocpacket_internal(&numpkts, packetlen, maxchunks ?
&numchunks : NULL, how, 1, 0); + if (m == 0) { + if (maxchunks && *maxchunks && numchunks > *maxchunks) + error = ENOBUFS; + else + error = ENOMEM; + } else { + error = 0; + *mbuf = m; + } +out: + return error; +} + + +/* + * mbuf_copyback differs from m_copyback in a few ways: + * 1) mbuf_copyback will allocate clusters for new mbufs we append + * 2) mbuf_copyback will grow the last mbuf in the chain if possible + * 3) mbuf_copyback reports whether or not the operation succeeded + * 4) mbuf_copyback allows the caller to specify M_WAITOK or M_NOWAIT + */ +errno_t +mbuf_copyback( + mbuf_t m, + size_t off, + size_t len, + const void *data, + mbuf_how_t how) +{ + size_t mlen; + mbuf_t m_start = m; + mbuf_t n; + int totlen = 0; + errno_t result = 0; + const char *cp = data; + + if (m == NULL || len == 0 || data == NULL) + return EINVAL; + + while (off > (mlen = m->m_len)) { + off -= mlen; + totlen += mlen; + if (m->m_next == 0) { + n = m_getclr(how, m->m_type); + if (n == 0) { + result = ENOBUFS; + goto out; + } + n->m_len = MIN(MLEN, len + off); + m->m_next = n; + } + m = m->m_next; + } + + while (len > 0) { + mlen = MIN(m->m_len - off, len); + if (mlen < len && m->m_next == NULL && mbuf_trailingspace(m) > 0) { + size_t grow = MIN(mbuf_trailingspace(m), len - mlen); + mlen += grow; + m->m_len += grow; + } + bcopy(cp, off + (char*)mbuf_data(m), (unsigned)mlen); + cp += mlen; + len -= mlen; + mlen += off; + off = 0; + totlen += mlen; + if (len == 0) + break; + if (m->m_next == 0) { + n = m_get(how, m->m_type); + if (n == NULL) { + result = ENOBUFS; + goto out; + } + if (len > MINCLSIZE) { + /* cluster allocation failure is okay, we can grow chain */ + mbuf_mclget(how, m->m_type, &n); + } + n->m_len = MIN(mbuf_maxlen(n), len); + m->m_next = n; + } + m = m->m_next; + } + +out: + if ((m_start->m_flags & M_PKTHDR) && (m_start->m_pkthdr.len < totlen)) + m_start->m_pkthdr.len = totlen; + + return result; +} diff --git a/bsd/kern/kpi_socket.c b/bsd/kern/kpi_socket.c new file mode 100644 index 000000000..c2d295c27 --- /dev/null +++ b/bsd/kern/kpi_socket.c @@ -0,0 +1,772 @@ +/* + * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License.
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#define __KPI__ +#include <sys/kernel.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/errno.h> +#include <sys/malloc.h> +#include <sys/protosw.h> +#include <sys/domain.h> +#include <sys/mbuf.h> +#include <sys/fcntl.h> +#include <sys/filio.h> +#include <sys/uio_internal.h> +#include <kern/lock.h> + +extern void *memcpy(void *, const void *, size_t); +extern int soclose_locked(struct socket *so); + +errno_t sock_send_internal( + socket_t sock, + const struct msghdr *msg, + mbuf_t data, + int flags, + size_t *sentlen); + + + +errno_t +sock_accept( + socket_t sock, + struct sockaddr *from, + int fromlen, + int flags, + sock_upcall callback, + void* cookie, + socket_t *new_sock) +{ + struct sockaddr *sa; + struct socket *new_so; + lck_mtx_t *mutex_held; + int dosocklock; + errno_t error = 0; + + if (sock == NULL || new_sock == NULL) return EINVAL; + socket_lock(sock, 1); + if ((sock->so_options & SO_ACCEPTCONN) == 0) { + socket_unlock(sock, 1); + return EINVAL; + } + if ((flags & ~(MSG_DONTWAIT)) != 0) { + socket_unlock(sock, 1); + return ENOTSUP; + } + if (((flags & MSG_DONTWAIT) != 0 || (sock->so_state & SS_NBIO) != 0) && + sock->so_comp.tqh_first == NULL) { + socket_unlock(sock, 1); + return EWOULDBLOCK; + } + + if (sock->so_proto->pr_getlock != NULL) { + mutex_held = (*sock->so_proto->pr_getlock)(sock, 0); + dosocklock = 1; + } + else { + mutex_held = sock->so_proto->pr_domain->dom_mtx; + dosocklock = 0; + } + + while (TAILQ_EMPTY(&sock->so_comp) && sock->so_error == 0) { + if (sock->so_state & SS_CANTRCVMORE) { + sock->so_error = ECONNABORTED; + break; + } + error = msleep((caddr_t)&sock->so_timeo, mutex_held, PSOCK | PCATCH, "sock_accept", 0); + if (error) { + socket_unlock(sock, 1); + return (error); + } + } + if (sock->so_error) { + error = sock->so_error; + sock->so_error = 0; + socket_unlock(sock, 1); + return (error); + } + + new_so = TAILQ_FIRST(&sock->so_comp); + TAILQ_REMOVE(&sock->so_comp, new_so, so_list); + sock->so_qlen--; + socket_unlock(sock, 1); /* release the head */ + + if (dosocklock) { + lck_mtx_assert(new_so->so_proto->pr_getlock(new_so, 0), + LCK_MTX_ASSERT_NOTOWNED); + socket_lock(new_so, 1); + } + + new_so->so_state &= ~SS_COMP; + new_so->so_head = NULL; + soacceptlock(new_so, &sa, 0); + + if (callback) { + new_so->so_upcall = callback; + new_so->so_upcallarg = cookie; + new_so->so_rcv.sb_flags |= SB_UPCALL; + } + + if (sa && from) + { + if (fromlen > sa->sa_len) fromlen = sa->sa_len; + memcpy(from, sa, fromlen); + } + if (sa) FREE(sa, M_SONAME); + *new_sock = new_so; + if (dosocklock) + socket_unlock(new_so, 1); + return error; +} + +errno_t +sock_bind( + socket_t sock, + const struct sockaddr *to) +{ + if (sock == NULL || to == NULL) return EINVAL; + + return sobind(sock, (struct sockaddr*)to); +} + +errno_t +sock_connect( + socket_t sock, + const struct sockaddr *to, + int flags) +{ + int error = 0; + lck_mtx_t *mutex_held; + + if (sock == NULL || to == NULL) return EINVAL; + + socket_lock(sock, 1); + + if ((sock->so_state & SS_ISCONNECTING) && + ((sock->so_state & SS_NBIO) != 0 || + (flags & MSG_DONTWAIT) != 0)) { + socket_unlock(sock, 1); + return EALREADY; + } + error = soconnectlock(sock, (struct sockaddr*)to, 0); + if (!error) { + if ((sock->so_state & SS_ISCONNECTING) && + ((sock->so_state & SS_NBIO) != 0 || (flags & MSG_DONTWAIT) != 0)) { + socket_unlock(sock, 1); + return EINPROGRESS; + } + + if 
(sock->so_proto->pr_getlock != NULL) + mutex_held = (*sock->so_proto->pr_getlock)(sock, 0); + else + mutex_held = sock->so_proto->pr_domain->dom_mtx; + + while ((sock->so_state & SS_ISCONNECTING) && sock->so_error == 0) { + error = msleep((caddr_t)&sock->so_timeo, mutex_held, PSOCK | PCATCH, + "sock_connect", 0); + if (error) + break; + } + + if (error == 0) { + error = sock->so_error; + sock->so_error = 0; + } + } + else { + sock->so_state &= ~SS_ISCONNECTING; + } + socket_unlock(sock, 1); + return error; +} + +errno_t +sock_connectwait( + socket_t sock, + const struct timeval *tv) +{ + lck_mtx_t * mutex_held; + errno_t retval = 0; + struct timespec ts; + + socket_lock(sock, 1); + + // Check if we're already connected or if we've already errored out + if ((sock->so_state & SS_ISCONNECTING) == 0 || sock->so_error) { + if (sock->so_error) { + retval = sock->so_error; + sock->so_error = 0; + } + else { + if ((sock->so_state & SS_ISCONNECTED) != 0) + retval = 0; + else + retval = EINVAL; + } + goto done; + } + + // copied translation from timeval to hertz from SO_RCVTIMEO handling + if (tv->tv_sec < 0 || tv->tv_sec > SHRT_MAX / hz || + tv->tv_usec < 0 || tv->tv_usec >= 1000000) { + retval = EDOM; + goto done; + } + + ts.tv_sec = tv->tv_sec; + ts.tv_nsec = (tv->tv_usec * NSEC_PER_USEC); + if ( (ts.tv_sec + (ts.tv_nsec/NSEC_PER_SEC))/100 > SHRT_MAX) { + retval = EDOM; + goto done; + } + + if (sock->so_proto->pr_getlock != NULL) + mutex_held = (*sock->so_proto->pr_getlock)(sock, 0); + else + mutex_held = sock->so_proto->pr_domain->dom_mtx; + + msleep((caddr_t)&sock->so_timeo, mutex_held, PSOCK, "sock_connectwait", &ts); + + // Check if we're still waiting to connect + if ((sock->so_state & SS_ISCONNECTING) && sock->so_error == 0) { + retval = EINPROGRESS; + goto done; + } + + if (sock->so_error) { + retval = sock->so_error; + sock->so_error = 0; + } + +done: + socket_unlock(sock, 1); + return retval; +} + +errno_t +sock_nointerrupt( + socket_t sock, + int on) +{ + socket_lock(sock, 1); + + if (on) { + sock->so_rcv.sb_flags |= SB_NOINTR; // This isn't safe + sock->so_snd.sb_flags |= SB_NOINTR; // This isn't safe + } + else { + sock->so_rcv.sb_flags &= ~SB_NOINTR; // This isn't safe + sock->so_snd.sb_flags &= ~SB_NOINTR; // This isn't safe + } + + socket_unlock(sock, 1); + + return 0; +} + +errno_t +sock_getpeername( + socket_t sock, + struct sockaddr *peername, + int peernamelen) +{ + int error = 0; + struct sockaddr *sa = NULL; + + if (sock == NULL || peername == NULL || peernamelen < 0) return EINVAL; + socket_lock(sock, 1); + if ((sock->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { + socket_unlock(sock, 1); + return ENOTCONN; + } + error = sock->so_proto->pr_usrreqs->pru_peeraddr(sock, &sa); + if (!error) + { + if (peernamelen > sa->sa_len) peernamelen = sa->sa_len; + memcpy(peername, sa, peernamelen); + } + if (sa) FREE(sa, M_SONAME); + socket_unlock(sock, 1); + return error; +} + +errno_t +sock_getsockname( + socket_t sock, + struct sockaddr *sockname, + int socknamelen) +{ + int error = 0; + struct sockaddr *sa = NULL; + + if (sock == NULL || sockname == NULL || socknamelen < 0) return EINVAL; + socket_lock(sock, 1); + error = sock->so_proto->pr_usrreqs->pru_sockaddr(sock, &sa); + if (!error) + { + if (socknamelen > sa->sa_len) socknamelen = sa->sa_len; + memcpy(sockname, sa, socknamelen); + } + if (sa) FREE(sa, M_SONAME); + socket_unlock(sock, 1); + return error; +} + +errno_t +sock_getsockopt( + socket_t sock, + int level, + int optname, + void *optval, + int *optlen) +{ + int 
error = 0; + struct sockopt sopt; + + if (sock == NULL || optval == NULL || optlen == NULL) return EINVAL; + sopt.sopt_dir = SOPT_GET; + sopt.sopt_level = level; + sopt.sopt_name = optname; + sopt.sopt_val = CAST_USER_ADDR_T(optval); + sopt.sopt_valsize = *optlen; + sopt.sopt_p = NULL; + error = sogetopt(sock, &sopt); /* will lock socket */ + if (error == 0) *optlen = sopt.sopt_valsize; + return error; +} + +errno_t +sock_ioctl( + socket_t sock, + unsigned long request, + void *argp) +{ + return soioctl(sock, request, argp, NULL); /* will lock socket */ +} + +errno_t +sock_setsockopt( + socket_t sock, + int level, + int optname, + const void *optval, + int optlen) +{ + struct sockopt sopt; + + if (sock == NULL || optval == NULL) return EINVAL; + sopt.sopt_dir = SOPT_SET; + sopt.sopt_level = level; + sopt.sopt_name = optname; + sopt.sopt_val = CAST_USER_ADDR_T(optval); + sopt.sopt_valsize = optlen; + sopt.sopt_p = NULL; + return sosetopt(sock, &sopt); /* will lock socket */ +} + +errno_t +sock_listen( + socket_t sock, + int backlog) +{ + if (sock == NULL) return EINVAL; + return solisten(sock, backlog); /* will lock socket */ +} + +static errno_t +sock_receive_internal( + socket_t sock, + struct msghdr *msg, + mbuf_t *data, + int flags, + size_t *recvdlen) +{ + uio_t auio; + struct mbuf *control = NULL; + int error = 0; + int length = 0; + struct sockaddr *fromsa; + char uio_buf[ UIO_SIZEOF((msg != NULL) ? msg->msg_iovlen : 0) ]; + + if (sock == NULL) return EINVAL; + + auio = uio_createwithbuffer(((msg != NULL) ? msg->msg_iovlen : 0), + 0, UIO_SYSSPACE, UIO_READ, + &uio_buf[0], sizeof(uio_buf)); + if (msg && data == NULL) { + int i; + struct iovec_32 *tempp = (struct iovec_32 *) msg->msg_iov; + + for (i = 0; i < msg->msg_iovlen; i++) { + uio_addiov(auio, CAST_USER_ADDR_T((tempp + i)->iov_base), (tempp + i)->iov_len); + } + if (uio_resid(auio) < 0) return EINVAL; + } + else { + uio_setresid(auio, (uio_resid(auio) + *recvdlen)); + } + length = uio_resid(auio); + + if (recvdlen) + *recvdlen = 0; + + if (msg && msg->msg_control) { + if ((size_t)msg->msg_controllen < sizeof(struct cmsghdr)) return EINVAL; + if ((size_t)msg->msg_controllen > MLEN) return EINVAL; + control = m_get(M_NOWAIT, MT_CONTROL); + if (control == NULL) return ENOMEM; + memcpy(mtod(control, caddr_t), msg->msg_control, msg->msg_controllen); + control->m_len = msg->msg_controllen; + } + + /* let pru_soreceive handle the socket locking */ + error = sock->so_proto->pr_usrreqs->pru_soreceive(sock, &fromsa, auio, + data, control ? &control : NULL, &flags); + if (error) goto cleanup; + + if (recvdlen) + *recvdlen = length - uio_resid(auio); + if (msg) { + msg->msg_flags = flags; + + if (msg->msg_name) + { + int salen; + salen = msg->msg_namelen; + if (msg->msg_namelen > 0 && fromsa != 0) + { + salen = MIN(salen, fromsa->sa_len); + memcpy(msg->msg_name, fromsa, + msg->msg_namelen > fromsa->sa_len ? 
fromsa->sa_len : msg->msg_namelen); + } + } + + if (msg->msg_control) + { + struct mbuf* m = control; + u_char* ctlbuf = msg->msg_control; + int clen = msg->msg_controllen; + msg->msg_controllen = 0; + + while (m && clen > 0) + { + unsigned int tocopy; + if (clen >= m->m_len) + { + tocopy = m->m_len; + } + else + { + msg->msg_flags |= MSG_CTRUNC; + tocopy = clen; + } + memcpy(ctlbuf, mtod(m, caddr_t), tocopy); + ctlbuf += tocopy; + clen -= tocopy; + m = m->m_next; + } + msg->msg_controllen = (u_int32_t)ctlbuf - (u_int32_t)msg->msg_control; + } + } + +cleanup: + if (control) m_freem(control); + if (fromsa) FREE(fromsa, M_SONAME); + return error; +} + +errno_t +sock_receive( + socket_t sock, + struct msghdr *msg, + int flags, + size_t *recvdlen) +{ + if ((msg == NULL) || + (msg->msg_iovlen < 1) || + (msg->msg_iov[0].iov_len == 0) || + (msg->msg_iov[0].iov_base == NULL)) + return EINVAL; + return sock_receive_internal(sock, msg, NULL, flags, recvdlen); +} + +errno_t +sock_receivembuf( + socket_t sock, + struct msghdr *msg, + mbuf_t *data, + int flags, + size_t *recvlen) +{ + if (data == NULL || recvlen == 0 || *recvlen <= 0 || (msg && + (msg->msg_iov != NULL || msg->msg_iovlen != 0))) + return EINVAL; + return sock_receive_internal(sock, msg, data, flags, recvlen); +} + +errno_t +sock_send_internal( + socket_t sock, + const struct msghdr *msg, + mbuf_t data, + int flags, + size_t *sentlen) +{ + uio_t auio = NULL; + struct mbuf *control = NULL; + int error = 0; + int datalen = 0; + char uio_buf[ UIO_SIZEOF((msg != NULL ? msg->msg_iovlen : 1)) ]; + + if (sock == NULL) { + error = EINVAL; + goto errorout; + } + + if (data == 0 && msg != NULL) { + struct iovec_32 *tempp = (struct iovec_32 *) msg->msg_iov; + + auio = uio_createwithbuffer(msg->msg_iovlen, 0, UIO_SYSSPACE, UIO_WRITE, + &uio_buf[0], sizeof(uio_buf)); + if (tempp != NULL) + { + int i; + + for (i = 0; i < msg->msg_iovlen; i++) { + uio_addiov(auio, CAST_USER_ADDR_T((tempp + i)->iov_base), (tempp + i)->iov_len); + } + + if (uio_resid(auio) < 0) { + error = EINVAL; + goto errorout; + } + } + } + + if (sentlen) + *sentlen = 0; + + if (auio) + datalen = uio_resid(auio); + else + datalen = data->m_pkthdr.len; + + if (msg && msg->msg_control) + { + if ((size_t)msg->msg_controllen < sizeof(struct cmsghdr)) return EINVAL; + if ((size_t)msg->msg_controllen > MLEN) return EINVAL; + control = m_get(M_NOWAIT, MT_CONTROL); + if (control == NULL) { + error = ENOMEM; + goto errorout; + } + memcpy(mtod(control, caddr_t), msg->msg_control, msg->msg_controllen); + control->m_len = msg->msg_controllen; + } + + error = sock->so_proto->pr_usrreqs->pru_sosend(sock, msg ? (struct sockaddr*)msg->msg_name : 0, + auio, data, control, flags); + if (error == 0 && sentlen) { + if (auio) + *sentlen = datalen - uio_resid(auio); + else + *sentlen = datalen; + } + + return error; + +/* + * In cases where we detect an error before returning, we need to + * free the mbuf chain if there is one. sosend (and pru_sosend) will + * free the mbuf chain if they encounter an error. 
+ */ +errorout: + if (control) + m_freem(control); + if (data) + m_freem(data); + if (sentlen) + *sentlen = 0; + return error; +} + +errno_t +sock_send( + socket_t sock, + const struct msghdr *msg, + int flags, + size_t *sentlen) +{ + if (msg == NULL || msg->msg_iov == NULL || msg->msg_iovlen < 1) + return EINVAL; + return sock_send_internal(sock, msg, NULL, flags, sentlen); +} + +errno_t +sock_sendmbuf( + socket_t sock, + const struct msghdr *msg, + mbuf_t data, + int flags, + size_t *sentlen) +{ + if (data == NULL || (msg && + (msg->msg_iov != NULL || msg->msg_iovlen != 0))) { + if (data) + m_freem(data); + return EINVAL; + } + return sock_send_internal(sock, msg, data, flags, sentlen); +} + +errno_t +sock_shutdown( + socket_t sock, + int how) +{ + if (sock == NULL) return EINVAL; + return soshutdown(sock, how); +} + +typedef void (*so_upcall)(struct socket *sock, void* arg, int waitf); + +errno_t +sock_socket( + int domain, + int type, + int protocol, + sock_upcall callback, + void* context, + socket_t *new_so) +{ + int error = 0; + if (new_so == NULL) return EINVAL; + /* socreate will create an initial so_count */ + error = socreate(domain, new_so, type, protocol); + if (error == 0 && callback) + { + (*new_so)->so_rcv.sb_flags |= SB_UPCALL; + (*new_so)->so_upcall = (so_upcall)callback; + (*new_so)->so_upcallarg = context; + } + return error; +} + +void +sock_close( + socket_t sock) +{ + if (sock == NULL) return; + soclose(sock); +} + +/* Do we want this to be APPLE_PRIVATE API?: YES (LD 12/23/04)*/ +void +sock_retain( + socket_t sock) +{ + if (sock == NULL) return; + socket_lock(sock, 1); + sock->so_retaincnt++; + sock->so_usecount++; /* add extra reference for holding the socket */ + socket_unlock(sock, 1); +} + +/* Do we want this to be APPLE_PRIVATE API? */ +void +sock_release( + socket_t sock) +{ + if (sock == NULL) return; + socket_lock(sock, 1); + sock->so_retaincnt--; + if (sock->so_retaincnt < 0) + panic("sock_release: negative retain count for sock=%x cnt=%x\n", + sock, sock->so_retaincnt); + if ((sock->so_retaincnt == 0) && (sock->so_usecount == 2)) + soclose_locked(sock); /* close socket only if the FD is not holding it */ + else + sock->so_usecount--; /* remove extra reference holding the socket */ + socket_unlock(sock, 1); +} + +errno_t +sock_setpriv( + socket_t sock, + int on) +{ + if (sock == NULL) return EINVAL; + socket_lock(sock, 1); + if (on) + { + sock->so_state |= SS_PRIV; + } + else + { + sock->so_state &= ~SS_PRIV; + } + socket_unlock(sock, 1); + return 0; +} + +int +sock_isconnected( + socket_t sock) +{ + int retval; + socket_lock(sock, 1); + retval = (sock->so_state & SS_ISCONNECTED) != 0; + socket_unlock(sock, 1); + return (retval); +} + +int +sock_isnonblocking( + socket_t sock) +{ + int retval; + socket_lock(sock, 1); + retval = (sock->so_state & SS_NBIO) != 0; + socket_unlock(sock, 1); + return (retval); +} + +errno_t +sock_gettype( + socket_t sock, + int *outDomain, + int *outType, + int *outProtocol) +{ + socket_lock(sock, 1); + if (outDomain) + *outDomain = sock->so_proto->pr_domain->dom_family; + if (outType) + *outType = sock->so_type; + if (outProtocol) + *outProtocol = sock->so_proto->pr_protocol; + socket_unlock(sock, 1); + return 0; +} diff --git a/bsd/kern/kpi_socketfilter.c b/bsd/kern/kpi_socketfilter.c new file mode 100644 index 000000000..729f5fac1 --- /dev/null +++ b/bsd/kern/kpi_socketfilter.c @@ -0,0 +1,595 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. 
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+#include <sys/kpi_socketfilter.h>
+
+#include <sys/socket.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/protosw.h>
+#include <kern/locks.h>
+#include <net/kext_net.h>
+
+static struct socket_filter_list	sock_filter_head;
+static lck_mtx_t			*sock_filter_lock = 0;
+
+__private_extern__ void
+sflt_init(void)
+{
+	lck_grp_attr_t	*grp_attrib = 0;
+	lck_attr_t	*lck_attrib = 0;
+	lck_grp_t	*lck_group = 0;
+
+	TAILQ_INIT(&sock_filter_head);
+
+	/* Allocate the mutex used to protect the filter lists */
+	grp_attrib = lck_grp_attr_alloc_init();
+	lck_grp_attr_setdefault(grp_attrib);
+	lck_group = lck_grp_alloc_init("socket filter lock", grp_attrib);
+	lck_grp_attr_free(grp_attrib);
+	lck_attrib = lck_attr_alloc_init();
+	lck_attr_setdefault(lck_attrib);
+	lck_attr_setdebug(lck_attrib);
+	sock_filter_lock = lck_mtx_alloc_init(lck_group, lck_attrib);
+	lck_grp_free(lck_group);
+	lck_attr_free(lck_attrib);
+}
+
+__private_extern__ void
+sflt_initsock(
+	struct socket *so)
+{
+	struct protosw *proto = so->so_proto;
+	struct socket_filter *filter;
+
+	if (TAILQ_FIRST(&proto->pr_filter_head) != NULL) {
+		lck_mtx_lock(sock_filter_lock);
+		TAILQ_FOREACH(filter, &proto->pr_filter_head, sf_protosw_next) {
+			sflt_attach_private(so, filter, 0, 0);
+		}
+		lck_mtx_unlock(sock_filter_lock);
+	}
+}
+
+__private_extern__ void
+sflt_termsock(
+	struct socket *so)
+{
+	struct socket_filter_entry *filter;
+	struct socket_filter_entry *filter_next;
+
+	for (filter = so->so_filt; filter; filter = filter_next) {
+		filter_next = filter->sfe_next_onsocket;
+		sflt_detach_private(filter, 0);
+	}
+}
+
+__private_extern__ void
+sflt_use(
+	struct socket *so)
+{
+	so->so_filteruse++;
+}
+
+__private_extern__ void
+sflt_unuse(
+	struct socket *so)
+{
+	so->so_filteruse--;
+	if (so->so_filteruse == 0) {
+		struct socket_filter_entry *filter;
+		struct socket_filter_entry *next_filter;
+		// search for detaching filters
+		for (filter = so->so_filt; filter; filter = next_filter) {
+			next_filter = filter->sfe_next_onsocket;
+
+			if (filter->sfe_flags & SFEF_DETACHING) {
+				sflt_detach_private(filter, 0);
+			}
+		}
+	}
+}
+
+__private_extern__ void
+sflt_notify(
+	struct socket *so,
+	sflt_event_t event,
+	void *param)
+{
+	struct socket_filter_entry *filter;
+	int filtered = 0;
+
+	for (filter = so->so_filt; filter;
+		 filter = filter->sfe_next_onsocket) {
+		if (filter->sfe_filter->sf_filter.sf_notify) {
+			if (filtered == 0) {
+				filtered = 1;
+				sflt_use(so);
+				socket_unlock(so, 0);
+			}
+			filter->sfe_filter->sf_filter.sf_notify(
+				filter->sfe_cookie, so, event, param);
+		}
+	}
+
+	if (filtered != 0) {
+		socket_lock(so, 0);
+		sflt_unuse(so);
+	}
+}
+
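To make the intent of these entry points concrete, here is a minimal sketch of how a kext might register a global TCP/IPv4 filter against this KPI. The handle value, identifiers, and callback bodies are hypothetical; the required fields follow the validation performed by sflt_register() later in this file (a nonzero sf_handle, a non-NULL sf_name, and both sf_attach and sf_detach), and SFLT_GLOBAL places the filter on the protosw list that sflt_initsock() above walks for each new socket.

#include <sys/kpi_socketfilter.h>
#include <sys/socket.h>
#include <netinet/in.h>

/* Hypothetical handle; a real filter must choose its own unique nonzero value. */
#define EXAMPLE_SFLT_HANDLE	0x45584d50

static errno_t
example_attach(void **cookie, socket_t so)
{
	*cookie = NULL;		/* no per-socket state in this sketch */
	return 0;		/* a nonzero return would reject the attach */
}

static void
example_detach(void *cookie, socket_t so)
{
	/* free any per-socket state that example_attach allocated */
}

static errno_t
example_data_in(void *cookie, socket_t so, const struct sockaddr *from,
    mbuf_t *data, mbuf_t *control, sflt_data_flag_t flags)
{
	/* reached by way of sflt_data_in() below; 0 passes the data through */
	return 0;
}

static struct sflt_filter example_filter = {
	.sf_handle	= EXAMPLE_SFLT_HANDLE,	/* must be nonzero */
	.sf_flags	= SFLT_GLOBAL,		/* auto-attach to new sockets of the protocol */
	.sf_name	= "com.example.sflt",	/* must be non-NULL */
	.sf_attach	= example_attach,	/* required by sflt_register() */
	.sf_detach	= example_detach,	/* required by sflt_register() */
	.sf_data_in	= example_data_in,
};

/* Typically called from a kext start routine... */
static errno_t
example_filter_start(void)
{
	return sflt_register(&example_filter, AF_INET, SOCK_STREAM, IPPROTO_TCP);
}

/* ...and undone from the matching stop routine. */
static void
example_filter_stop(void)
{
	(void)sflt_unregister(EXAMPLE_SFLT_HANDLE);
}

Note how the implementation drops the socket lock around each callback (see sflt_notify() above and sflt_data_in() below), so a filter must not assume the socket is locked on entry.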
+__private_extern__ int
+sflt_data_in(
+	struct socket *so,
+	const struct sockaddr *from,
+	mbuf_t *data,
+	mbuf_t *control,
+	sflt_data_flag_t flags)
+{
+	struct socket_filter_entry *filter;
+	int filtered = 0;
+	int error = 0;
+
+	for (filter = so->so_filt; filter;
+		 filter = filter->sfe_next_onsocket) {
+		if (filter->sfe_filter->sf_filter.sf_data_in) {
+			if (filtered == 0) {
+				filtered = 1;
+				sflt_use(so);
+				socket_unlock(so, 0);
+			}
+			error = filter->sfe_filter->sf_filter.sf_data_in(
+				filter->sfe_cookie, so, from, data, control, flags);
+		}
+	}
+
+	if (filtered != 0) {
+		socket_lock(so, 0);
+		sflt_unuse(so);
+	}
+
+	return error;
+}
+
+/* sflt_attach_private
+ *
+ * Assumptions: If filter is not NULL, socket_filter_lock is held.
+ */
+
+__private_extern__ int
+sflt_attach_private(
+	struct socket *so,
+	struct socket_filter *filter,
+	sflt_handle handle,
+	int sock_locked)
+{
+	struct socket_filter_entry *entry = NULL;
+	int didlock = 0;
+	int error = 0;
+
+	if (filter == NULL) {
+		/* Find the filter by the handle */
+		lck_mtx_lock(sock_filter_lock);
+		didlock = 1;
+
+		TAILQ_FOREACH(filter, &sock_filter_head, sf_global_next) {
+			if (filter->sf_filter.sf_handle == handle)
+				break;
+		}
+	}
+
+	if (filter == NULL)
+		error = ENOENT;
+
+	if (error == 0) {
+		/* allocate the socket filter entry */
+		MALLOC(entry, struct socket_filter_entry *, sizeof(*entry), M_IFADDR, M_WAITOK);
+		if (entry == NULL) {
+			error = ENOMEM;
+		}
+	}
+
+	if (error == 0) {
+		/* Initialize the socket filter entry and call the attach function */
+		entry->sfe_filter = filter;
+		entry->sfe_socket = so;
+		entry->sfe_cookie = NULL;
+		if (entry->sfe_filter->sf_filter.sf_attach) {
+			filter->sf_usecount++;
+
+			if (sock_locked)
+				socket_unlock(so, 0);
+			error = entry->sfe_filter->sf_filter.sf_attach(&entry->sfe_cookie, so);
+			if (sock_locked)
+				socket_lock(so, 0);
+
+			filter->sf_usecount--;
+
+			/* If the attach function returns an error, this filter is not attached */
+			if (error) {
+				FREE(entry, M_IFADDR);
+				entry = NULL;
+			}
+		}
+	}
+
+	if (error == 0) {
+		/* Put the entry in the socket list */
+		entry->sfe_next_onsocket = so->so_filt;
+		so->so_filt = entry;
+
+		/* Put the entry in the filter list */
+		entry->sfe_next_onfilter = filter->sf_entry_head;
+		filter->sf_entry_head = entry;
+
+		/* Increment the socket's usecount */
+		so->so_usecount++;
+
+		/* Increment the parent filter's usecount */
+		filter->sf_usecount++;
+	}
+
+	if (didlock) {
+		lck_mtx_unlock(sock_filter_lock);
+	}
+
+	return error;
+}
+
+
+/* sflt_detach_private
+ *
+ * Assumptions: if you pass 0 in for the second parameter, you are holding the
+ * socket lock for the socket the entry is attached to. If you pass 1 in for
+ * the second parameter, it is assumed that the entry is not on the filter's
+ * list and the socket lock is not held.
+ */
+
+__private_extern__ void
+sflt_detach_private(
+	struct socket_filter_entry *entry,
+	int	filter_detached)
+{
+	struct socket *so = entry->sfe_socket;
+	struct socket_filter_entry **next_ptr;
+	int detached = 0;
+	int found = 0;
+
+	if (filter_detached) {
+		socket_lock(entry->sfe_socket, 0);
+	}
+
+	/*
+	 * Attempt to find the entry on the filter's list and
+	 * remove it. This prevents a filter detaching at the
+	 * same time from attempting to remove the same entry.
+ */ + lck_mtx_lock(sock_filter_lock); + if (!filter_detached) { + for (next_ptr = &entry->sfe_filter->sf_entry_head; *next_ptr; + next_ptr = &((*next_ptr)->sfe_next_onfilter)) { + if (*next_ptr == entry) { + found = 1; + *next_ptr = entry->sfe_next_onfilter; + break; + } + } + } + + if (!filter_detached && !found && (entry->sfe_flags & SFEF_DETACHING) == 0) { + lck_mtx_unlock(sock_filter_lock); + return; + } + + if (entry->sfe_socket->so_filteruse != 0) { + lck_mtx_unlock(sock_filter_lock); + entry->sfe_flags |= SFEF_DETACHING; + return; + } + + /* + * Check if we are removing the last attached filter and + * the parent filter is being unregistered. + */ + if (entry->sfe_socket->so_filteruse == 0) { + entry->sfe_filter->sf_usecount--; + if ((entry->sfe_filter->sf_usecount == 0) && + (entry->sfe_filter->sf_flags & SFF_DETACHING) != 0) + detached = 1; + } + lck_mtx_unlock(sock_filter_lock); + + /* Remove from the socket list */ + for (next_ptr = &entry->sfe_socket->so_filt; *next_ptr; + next_ptr = &((*next_ptr)->sfe_next_onsocket)) { + if (*next_ptr == entry) { + *next_ptr = entry->sfe_next_onsocket; + break; + } + } + + if (entry->sfe_filter->sf_filter.sf_detach) + entry->sfe_filter->sf_filter.sf_detach(entry->sfe_cookie, entry->sfe_socket); + + if (detached && entry->sfe_filter->sf_filter.sf_unregistered) { + entry->sfe_filter->sf_filter.sf_unregistered(entry->sfe_filter->sf_filter.sf_handle); + FREE(entry->sfe_filter, M_IFADDR); + } + + if (filter_detached) { + socket_unlock(entry->sfe_socket, 1); + } + else { + // We need some better way to decrement the usecount + so->so_usecount--; + } + FREE(entry, M_IFADDR); +} + +errno_t +sflt_attach( + socket_t socket, + sflt_handle handle) +{ + if (socket == NULL || handle == 0) + return EINVAL; + + return sflt_attach_private(socket, NULL, handle, 0); +} + +errno_t +sflt_detach( + socket_t socket, + sflt_handle handle) +{ + struct socket_filter_entry *filter; + errno_t result = 0; + + if (socket == NULL || handle == 0) + return EINVAL; + + socket_lock(socket, 1); + + for (filter = socket->so_filt; filter; + filter = filter->sfe_next_onsocket) { + if (filter->sfe_filter->sf_filter.sf_handle == handle) + break; + } + + if (filter != NULL) { + sflt_detach_private(filter, 0); + } + else { + result = ENOENT; + } + + socket_unlock(socket, 1); + + return result; +} + + +errno_t +sflt_register( + const struct sflt_filter *filter, + int domain, + int type, + int protocol) +{ + struct socket_filter *sock_filt = NULL; + struct socket_filter *match = NULL; + int error = 0; + struct protosw *pr = pffindproto(domain, protocol, type); + + if (pr == NULL) return ENOENT; + + if (filter->sf_attach == NULL || filter->sf_detach == NULL) return EINVAL; + if (filter->sf_handle == 0) return EINVAL; + if (filter->sf_name == NULL) return EINVAL; + + /* Allocate the socket filter */ + MALLOC(sock_filt, struct socket_filter*, sizeof(*sock_filt), M_IFADDR, M_WAITOK); + if (sock_filt == NULL) { + return ENOBUFS; + } + + bzero(sock_filt, sizeof(*sock_filt)); + sock_filt->sf_filter = *filter; + + lck_mtx_lock(sock_filter_lock); + /* Look for an existing entry */ + TAILQ_FOREACH(match, &sock_filter_head, sf_global_next) { + if (match->sf_filter.sf_handle == sock_filt->sf_filter.sf_handle) { + break; + } + } + + /* Add the entry only if there was no existing entry */ + if (match == NULL) { + TAILQ_INSERT_TAIL(&sock_filter_head, sock_filt, sf_global_next); + if ((sock_filt->sf_filter.sf_flags & SFLT_GLOBAL) != 0) { + TAILQ_INSERT_TAIL(&pr->pr_filter_head, sock_filt, 
sf_protosw_next); + sock_filt->sf_proto = pr; + } + } + lck_mtx_unlock(sock_filter_lock); + + if (match != NULL) { + FREE(sock_filt, M_IFADDR); + return EEXIST; + } + + return error; +} + +errno_t +sflt_unregister( + sflt_handle handle) +{ + struct socket_filter *filter; + struct socket_filter_entry *entry_head = NULL; + + /* Find the entry and remove it from the global and protosw lists */ + lck_mtx_lock(sock_filter_lock); + TAILQ_FOREACH(filter, &sock_filter_head, sf_global_next) { + if (filter->sf_filter.sf_handle == handle) + break; + } + + if (filter) { + TAILQ_REMOVE(&sock_filter_head, filter, sf_global_next); + if ((filter->sf_filter.sf_flags & SFLT_GLOBAL) != 0) { + TAILQ_REMOVE(&filter->sf_proto->pr_filter_head, filter, sf_protosw_next); + } + entry_head = filter->sf_entry_head; + filter->sf_entry_head = NULL; + filter->sf_flags |= SFF_DETACHING; + } + + lck_mtx_unlock(sock_filter_lock); + + if (filter == NULL) + return ENOENT; + + /* We need to detach the filter from any sockets it's attached to */ + if (entry_head == 0) { + if (filter->sf_filter.sf_unregistered) + filter->sf_filter.sf_unregistered(filter->sf_filter.sf_handle); + } else { + while (entry_head) { + struct socket_filter_entry *next_entry; + next_entry = entry_head->sfe_next_onfilter; + sflt_detach_private(entry_head, 1); + entry_head = next_entry; + } + } + + return 0; +} + +errno_t +sock_inject_data_in( + socket_t so, + const struct sockaddr* from, + mbuf_t data, + mbuf_t control, + sflt_data_flag_t flags) +{ + int error = 0; + if (so == NULL || data == NULL) return EINVAL; + + if (flags & sock_data_filt_flag_oob) { + return ENOTSUP; + } + + socket_lock(so, 1); + + if (from) { + if (sbappendaddr(&so->so_rcv, (struct sockaddr*)from, data, + control, NULL)) + sorwakeup(so); + goto done; + } + + if (control) { + if (sbappendcontrol(&so->so_rcv, data, control, NULL)) + sorwakeup(so); + goto done; + } + + if (flags & sock_data_filt_flag_record) { + if (control || from) { + error = EINVAL; + goto done; + } + if (sbappendrecord(&so->so_rcv, (struct mbuf*)data)) + sorwakeup(so); + goto done; + } + + if (sbappend(&so->so_rcv, data)) + sorwakeup(so); +done: + socket_unlock(so, 1); + return error; +} + +errno_t +sock_inject_data_out( + socket_t so, + const struct sockaddr* to, + mbuf_t data, + mbuf_t control, + sflt_data_flag_t flags) +{ + int sosendflags = 0; + if (flags & sock_data_filt_flag_oob) sosendflags = MSG_OOB; + return sosend(so, (const struct sockaddr*)to, NULL, + data, control, sosendflags); +} + +sockopt_dir +sockopt_direction( + sockopt_t sopt) +{ + return (sopt->sopt_dir == SOPT_GET) ? 
sockopt_get : sockopt_set;
+}
+
+int
+sockopt_level(
+	sockopt_t sopt)
+{
+	return sopt->sopt_level;
+}
+
+int
+sockopt_name(
+	sockopt_t sopt)
+{
+	return sopt->sopt_name;
+}
+
+size_t
+sockopt_valsize(
+	sockopt_t sopt)
+{
+	return sopt->sopt_valsize;
+}
+
+errno_t
+sockopt_copyin(
+	sockopt_t sopt,
+	void *data,
+	size_t len)
+{
+	return sooptcopyin(sopt, data, len, len);
+}
+
+errno_t
+sockopt_copyout(
+	sockopt_t sopt,
+	void *data,
+	size_t len)
+{
+	return sooptcopyout(sopt, data, len);
+}
diff --git a/bsd/kern/mach_fat.c b/bsd/kern/mach_fat.c
index 4d3c8e07b..408d2ecb2 100644
--- a/bsd/kern/mach_fat.c
+++ b/bsd/kern/mach_fat.c
@@ -40,7 +40,10 @@
 #include <kern/mach_loader.h>
 #include <architecture/byte_order.h>
 
-#define CPU_TYPE_NATIVE		(machine_slot[cpu_number()].cpu_type)
+/* XXX should be in common header */
+extern int grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype);
+
+#define CPU_TYPE_NATIVE		(cpu_type())
 #define CPU_TYPE_CLASSIC	CPU_TYPE_POWERPC
 
 /**********************************************************************
@@ -51,7 +54,9 @@
  *
  *	Args:	vp:		The vnode for the fat file.
  *		header:		A pointer to the fat file header.
- *		cpu_type:	The required cpu type.
+ *		req_cpu_type:	The required cpu type.
+ *		mask_bits:	Bits to mask from the sub-image type when
+ *				grading it vs. the req_cpu_type
  *		archret (out):	Pointer to fat_arch structure to hold
  *				the results.
  *
@@ -60,15 +65,19 @@
  **********************************************************************/
 static load_return_t
 fatfile_getarch2(
+#if 0
 	struct vnode	*vp,
+#else
+	__unused struct vnode	*vp,
+#endif
 	vm_offset_t	data_ptr,
-	cpu_type_t	cpu_type,
+	cpu_type_t	req_cpu_type,
+	cpu_type_t	mask_bits,
 	struct fat_arch	*archret)
 {
 	/* vm_pager_t	pager; */
 	vm_offset_t	addr;
 	vm_size_t	size;
-	kern_return_t	kret;
 	load_return_t	lret;
 	struct fat_arch	*arch;
 	struct fat_arch	*best_arch;
@@ -77,7 +86,9 @@ fatfile_getarch2(
 	int		nfat_arch;
 	int		end_of_archs;
 	struct fat_header	*header;
+#if 0
 	off_t filesize;
+#endif
 
 	/*
 	 * Get the pager for the file.
@@ -108,7 +119,7 @@ fatfile_getarch2(
 	 * Round size of fat_arch structures up to page boundary.
 	 */
 	size = round_page_32(end_of_archs);
-	if (size <= 0)
+	if (size == 0)
		return(LOAD_BADMACHO);
 
	/*
@@ -123,13 +134,14 @@ fatfile_getarch2(
		/*
		 *	Check to see if right cpu type.
		 */
-		if(NXSwapBigIntToHost(arch->cputype) != cpu_type)
+		if(((cpu_type_t)NXSwapBigIntToHost(arch->cputype) & ~mask_bits) != req_cpu_type)
			continue;
 
		/*
		 * 	Get the grade of the cpu subtype.
		 */
-		grade = grade_cpu_subtype(
+		grade = grade_binary(
+			NXSwapBigIntToHost(arch->cputype),
			NXSwapBigIntToHost(arch->cpusubtype));
 
		/*
@@ -187,10 +199,14 @@ fatfile_getarch_affinity(
		primary_type = CPU_TYPE_NATIVE;
		fallback_type = CPU_TYPE_CLASSIC;
	}
-	lret = fatfile_getarch2(vp, data_ptr, primary_type, archret);
+	/*
+	 * Ignore the architectural bits when determining if an image
+	 * in a fat file should be skipped or graded.
+	 */
+	lret = fatfile_getarch2(vp, data_ptr, primary_type, CPU_ARCH_MASK, archret);
 	if ((lret != 0) && handler) {
 		lret = fatfile_getarch2(vp, data_ptr, fallback_type,
-				archret);
+				0, archret);
 	}
 
 	return lret;
 }
@@ -215,6 +231,31 @@ fatfile_getarch(
 	vm_offset_t 	data_ptr,
 	struct fat_arch		*archret)
 {
-	return fatfile_getarch2(vp, data_ptr, CPU_TYPE_NATIVE, archret);
+	return fatfile_getarch2(vp, data_ptr, CPU_TYPE_NATIVE, 0, archret);
+}
+
+/**********************************************************************
+ * Routine:	fatfile_getarch_with_bits()
+ *
+ * Function:	Locate the architecture-dependent contents of a fat
+ *		file that match this CPU.
+ *
+ * Args:	vp:		The vnode for the fat file.
+ *		archbits:	Architecture specific feature bits
+ *		header:		A pointer to the fat file header.
+ *		archret (out):	Pointer to fat_arch structure to hold
+ *				the results.
+ *
+ * Returns:	KERN_SUCCESS:	Valid architecture found.
+ *		KERN_FAILURE:	No valid architecture found.
+ **********************************************************************/
+load_return_t
+fatfile_getarch_with_bits(
+	struct vnode		*vp,
+	integer_t		archbits,
+	vm_offset_t		data_ptr,
+	struct fat_arch		*archret)
+{
+	return fatfile_getarch2(vp, data_ptr, archbits | CPU_TYPE_NATIVE, 0, archret);
 }
diff --git a/bsd/kern/mach_header.c b/bsd/kern/mach_header.c
index 34ffecda5..9071eaa48 100644
--- a/bsd/kern/mach_header.c
+++ b/bsd/kern/mach_header.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
@@ -24,6 +24,14 @@
 *
 * Functions for accessing mach-o headers.
 *
+ * NOTE:	This file supports only 32 bit mach headers at the present
+ *		time; its primary use is by kld, and all externally
+ *		referenced routines at the present time operate against
+ *		the 32 bit mach header _mh_execute_header, which is the
+ *		header for the currently executing kernel. Adding support
+ *		for 64 bit kernels is possible, but is not necessary at the
+ *		present time.
+ *
 * HISTORY
 * 27-MAR-97  Umesh Vaishampayan (umeshv@NeXT.com)
 *	Added getsegdatafromheader();
@@ -35,26 +43,22 @@
 #if !defined(KERNEL_PRELOAD)
 #include <kern/mach_header.h>
+#include <string.h>		// from libsa
 
 extern struct mach_header _mh_execute_header;
 
-struct section *getsectbynamefromheader(
-	struct mach_header	*header,
-	char			*seg_name,
-	char			*sect_name);
-struct segment_command *getsegbynamefromheader(
-	struct mach_header	*header,
-	char			*seg_name);
-
 /*
 * return the last address (first avail)
+ *
+ * This routine operates against the currently executing kernel only
 */
-vm_offset_t getlastaddr(void)
+vm_offset_t
+getlastaddr(void)
 {
	struct segment_command	*sgp;
	vm_offset_t		last_addr = 0;
	struct mach_header *header = &_mh_execute_header;
-	int i;
+	unsigned long i;
 
	sgp = (struct segment_command *)
		((char *)header + sizeof(struct mach_header));
@@ -69,10 +73,12 @@ vm_offset_t getlastaddr(void)
 }
 
 #if	FIXME  /* [ */
+/*
+ * This routine operates against the currently executing kernel only
+ */
 struct mach_header **
 getmachheaders(void)
 {
-    extern struct mach_header _mh_execute_header;
     struct mach_header **tl;
     tl = (struct mach_header **)malloc(2*sizeof(struct mach_header *));
     tl[0] = &_mh_execute_header;
@@ -86,12 +92,14 @@ getmachheaders(void)
 * named segment if it exists in the mach header passed to it. Also it returns
 * the size of the section data indirectly through the pointer size. Otherwise
 * it returns zero for the pointer and the size.
+ *
+ * This routine can operate against any 32 bit mach header.
 */
 void *
 getsectdatafromheader(
	struct mach_header *mhp,
-	char *segname,
-	char *sectname,
+	const char *segname,
+	const char *sectname,
	int *size)
 {
	const struct section *sp;
@@ -112,11 +120,13 @@ getsectdatafromheader(
 * if it exists in the mach header passed to it. Also it returns
 * the size of the segment data indirectly through the pointer size.
 * Otherwise it returns zero for the pointer and the size.
+ *
+ * This routine can operate against any 32 bit mach header.
 */
 void *
 getsegdatafromheader(
-	struct mach_header *mhp,
-	char *segname,
+	struct mach_header *mhp,
+	const char *segname,
	int *size)
 {
	const struct segment_command *sc;
@@ -136,16 +146,18 @@
 * This routine returns the section structure for the named section in the
 * named segment for the mach_header pointer passed to it if it exists.
 * Otherwise it returns zero.
+ *
+ * This routine can operate against any 32 bit mach header.
 */
 struct section *
 getsectbynamefromheader(
	struct mach_header *mhp,
-	char *segname,
-	char *sectname)
+	const char *segname,
+	const char *sectname)
 {
	struct segment_command *sgp;
	struct section *sp;
-	long i, j;
+	unsigned long i, j;
 
	sgp = (struct segment_command *)
		((char *)mhp + sizeof(struct mach_header));
@@ -170,12 +182,16 @@ getsectbynamefromheader(
	return((struct section *)0);
 }
 
-struct segment_command *getsegbynamefromheader(
+/*
+ * This routine can operate against any 32 bit mach header.
+ */
+struct segment_command *
+getsegbynamefromheader(
	struct mach_header	*header,
-	char		*seg_name)
+	const char	*seg_name)
 {
	struct segment_command *sgp;
-	int i;
+	unsigned long i;
 
	sgp = (struct segment_command *)
		((char *)header + sizeof(struct mach_header));
@@ -221,7 +237,9 @@ static struct {
		4,		// align
		0,		// reloff
		0,		// nreloc
-		0		// flags
+		0,		// flags
+		0,		// reserved1
+		0		// reserved2
	}
 };
 
@@ -232,16 +250,25 @@
 static vm_offset_t getsizeofmacho(struct mach_header *header);
 
 /*
 * Return the first segment_command in the header.
+ *
+ * This routine operates against the currently executing kernel only
 */
-struct segment_command *firstseg(void)
+struct segment_command *
+firstseg(void)
 {
	return firstsegfromheader(&_mh_execute_header);
 }
 
-struct segment_command *firstsegfromheader(struct mach_header *header)
+/*
+ * This routine can operate against any 32 bit mach header, and returns a
+ * pointer to a 32 bit segment_command structure from the file prefixed by
+ * the header it is passed as its argument.
+ */
+struct segment_command *
+firstsegfromheader(struct mach_header *header)
 {
	struct segment_command *sgp;
-	int i;
+	unsigned long i;
 
	sgp = (struct segment_command *)
		((char *)header + sizeof(struct mach_header));
@@ -253,7 +280,14 @@ struct segment_command *firstsegfromheader(struct mach_header *header)
	return (struct segment_command *)0;
 }
 
-struct segment_command *nextseg(struct segment_command *sgp)
+/*
+ * This routine operates against a 32 bit mach segment_command structure
+ * pointer from the currently executing kernel only, to obtain the
+ * sequentially next segment_command structure in the currently executing
+ * kernel
+ */
+struct segment_command *
+nextseg(struct segment_command *sgp)
 {
	struct segment_command *this;
 
@@ -269,12 +303,18 @@ struct segment_command *nextseg(struct segment_command *sgp)
	return this;
 }
 
-struct segment_command *nextsegfromheader(
+/*
+ * This routine operates against any 32 bit mach segment_command structure
+ * pointer and the provided 32 bit header, to obtain the sequentially next
+ * segment_command structure in that header.
+ */
+struct segment_command *
+nextsegfromheader(
	struct mach_header	*header,
	struct segment_command	*seg)
 {
	struct segment_command *sgp;
-	int	i;
+	unsigned long	i;
 
	sgp = (struct segment_command *)
		((char *)header + sizeof(struct mach_header));
@@ -299,9 +339,11 @@ struct segment_command *nextsegfromheader(
 
 
 /*
- * Return the address of the named Mach-O segment, or NULL.
+ * Return the address of the named Mach-O segment from the currently
+ * executing 32 bit kernel, or NULL.
 */
-struct segment_command *getsegbyname(char *seg_name)
+struct segment_command *
+getsegbyname(const char *seg_name)
 {
	struct segment_command *this;
 
@@ -319,42 +361,60 @@ struct segment_command *getsegbyname(char *seg_name)
 
 /*
 * This routine returns a pointer to the section structure of the named
- * section in the named segment if it exist in the mach executable it is
- * linked into. Otherwise it returns zero.
+ * section in the named segment if it exists in the currently executing
+ * kernel, which it is presumed to be linked into. Otherwise it returns NULL.
 */
 struct section *
 getsectbyname(
-    char *segname,
-    char *sectname)
+    const char *segname,
+    const char *sectname)
 {
	return(getsectbynamefromheader(
		(struct mach_header *)&_mh_execute_header, segname, sectname));
 }
 
-struct section *firstsect(struct segment_command *sgp)
+/*
+ * This routine can operate against any 32 bit segment_command structure to
+ * return the first 32 bit section immediately following that structure. If
+ * there are no sections associated with the segment_command structure, it
+ * returns NULL.
+ */
+struct section *
+firstsect(struct segment_command *sgp)
 {
-	struct section *sp;
-
	if (!sgp || sgp->nsects == 0)
		return (struct section *)0;
 
	return (struct section *)(sgp+1);
 }
 
-struct section *nextsect(struct segment_command *sgp, struct section *sp)
+/*
+ * This routine can operate against any 32 bit segment_command structure and
+ * 32 bit section to return the next consecutive 32 bit section immediately
+ * following the 32 bit section provided. If there are no sections following
+ * the provided section, it returns NULL.
+ */
+struct section *
+nextsect(struct segment_command *sgp, struct section *sp)
 {
	struct section *fsp = firstsect(sgp);
 
-	if (sp - fsp >= sgp->nsects-1)
+	if (((unsigned long)(sp - fsp) + 1) >= sgp->nsects)
		return (struct section *)0;
 
	return sp+1;
 }
 
-static struct fvmfile_command *fvmfilefromheader(struct mach_header *header)
+/*
+ * This routine can operate against any 32 bit mach header to return the
+ * first occurring 32 bit fvmfile_command section. If one is not present,
+ * it returns NULL.
+ */
+static struct fvmfile_command *
+fvmfilefromheader(struct mach_header *header)
 {
	struct fvmfile_command *fvp;
-	int i;
+	unsigned long i;
 
	fvp = (struct fvmfile_command *)
		((char *)header + sizeof(struct mach_header));
@@ -368,8 +428,11 @@ static struct fvmfile_command *fvmfilefromheader(struct mach_header *header)
 
 /*
 * Create a fake USER seg if a fvmfile_command is present.
+ *
+ * This routine operates against the currently executing kernel only
 */
-struct segment_command *getfakefvmseg(void)
+struct segment_command *
+getfakefvmseg(void)
 {
	struct segment_command *sgp = getsegbyname("__USER");
	struct fvmfile_command *fvp = fvmfilefromheader(&_mh_execute_header);
@@ -396,16 +459,20 @@ struct segment_command *getfakefvmseg(void)
	printf("fake fvm seg __USER/\"%s\" at 0x%x, size 0x%x\n",
		sp->sectname, sp->addr, sp->size);
 #endif	/* DEBUG */
+
+	return sgp;
 }
 
 /*
 * Figure out the size of the data associated with a
 * loaded mach_header.
+ *
+ * This routine can operate against any 32 bit mach header.
 */
-static vm_offset_t getsizeofmacho(struct mach_header *header)
+static vm_offset_t
+getsizeofmacho(struct mach_header *header)
 {
	struct segment_command	*sgp;
-	struct section		*sp;
	vm_offset_t		last_addr;
 
	last_addr = 0;
diff --git a/bsd/kern/mach_header.h b/bsd/kern/mach_header.h
index 1e4cbeaba..ff667a6f4 100644
--- a/bsd/kern/mach_header.h
+++ b/bsd/kern/mach_header.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
@@ -24,6 +24,11 @@
 *
 * Definitions for accessing mach-o headers.
 *
+ * NOTE:	The functions prototyped by this header only operate against
+ *		32 bit mach headers.  Many of these functions imply the
+ *		currently running kernel, and cannot be used against mach
+ *		headers other than that of the currently running kernel.
+ *
 * HISTORY
 * 29-Jan-92  Mike DeMoney (mike@next.com)
 *	Made into machine independent form from machdep/m68k/mach_header.h.
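Since the NOTE above stresses that these accessors operate against 32 bit mach headers, usually the one for the running kernel, a short consumer makes the intended iteration pattern concrete. This is a minimal sketch assuming kernel context, where printf and the prototypes from kern/mach_header.h are visible; dump_kernel_segments() itself is hypothetical.

#include <mach-o/loader.h>
#include <kern/mach_header.h>

extern struct mach_header _mh_execute_header;

static void
dump_kernel_segments(void)
{
	struct segment_command *sgp;
	struct section *sp;

	/* firstsegfromheader()/nextsegfromheader() walk the segment load commands... */
	for (sgp = firstsegfromheader(&_mh_execute_header); sgp != NULL;
	    sgp = nextsegfromheader(&_mh_execute_header, sgp)) {
		/* segname/sectname are 16-byte fields with no guaranteed NUL */
		printf("segment %.16s: addr 0x%x size 0x%x\n",
		    sgp->segname, sgp->vmaddr, sgp->vmsize);

		/* ...and firstsect()/nextsect() walk the sections inside one segment. */
		for (sp = firstsect(sgp); sp != NULL; sp = nextsect(sgp, sp))
			printf("\tsection %.16s\n", sp->sectname);
	}
}

The same pairing works against any 32 bit mach header passed in, but the convenience wrappers (firstseg(), getsegbyname(), getsectbyname()) are hardwired to _mh_execute_header, as their comments note.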
@@ -46,17 +51,17 @@ struct segment_command *nextseg(struct segment_command *sgp); struct segment_command *nextsegfromheader( struct mach_header *header, struct segment_command *seg); -struct segment_command *getsegbyname(char *seg_name); +struct segment_command *getsegbyname(const char *seg_name); struct segment_command *getsegbynamefromheader( struct mach_header *header, - char *seg_name); -void *getsegdatafromheader(struct mach_header *, char *, int *); -struct section *getsectbyname(char *seg_name, char *sect_name); + const char *seg_name); +void *getsegdatafromheader(struct mach_header *, const char *, int *); +struct section *getsectbyname(const char *seg_name, const char *sect_name); struct section *getsectbynamefromheader( struct mach_header *header, - char *seg_name, - char *sect_name); -void *getsectdatafromheader(struct mach_header *, char *, char *, int *); + const char *seg_name, + const char *sect_name); +void *getsectdatafromheader(struct mach_header *, const char *, const char *, int *); struct section *firstsect(struct segment_command *sgp); struct section *nextsect(struct segment_command *sgp, struct section *sp); struct fvmlib_command *fvmlib(void); diff --git a/bsd/kern/mach_loader.c b/bsd/kern/mach_loader.c index 8a988f82d..a12aa7682 100644 --- a/bsd/kern/mach_loader.c +++ b/bsd/kern/mach_loader.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -30,129 +30,205 @@ * 21-Jul-88 Avadis Tevanian, Jr. (avie) at NeXT * Started. */ + #include <sys/param.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/uio.h> #include <sys/namei.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/stat.h> #include <sys/malloc.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> #include <sys/fcntl.h> -#include <sys/ubc.h> +#include <sys/ubc_internal.h> +#include <sys/imgact.h> #include <mach/mach_types.h> +#include <mach/vm_map.h> /* vm_allocate() */ +#include <mach/mach_vm.h> /* mach_vm_allocate() */ +#include <mach/vm_statistics.h> +#include <mach/shared_memory_server.h> +#include <mach/task.h> +#include <mach/thread_act.h> + +#include <machine/vmparam.h> +#include <kern/kern_types.h> +#include <kern/cpu_number.h> #include <kern/mach_loader.h> +#include <kern/kalloc.h> #include <kern/task.h> +#include <kern/thread.h> #include <mach-o/fat.h> #include <mach-o/loader.h> -#include <kern/cpu_number.h> - +#include <vm/pmap.h> #include <vm/vm_map.h> #include <vm/vm_kern.h> #include <vm/vm_pager.h> #include <vm/vnode_pager.h> -#include <mach/vm_statistics.h> - -#include <mach/shared_memory_server.h> #include <vm/vm_shared_memory_server.h> +#include <vm/vm_protos.h> -#include <machine/vmparam.h> +/* + * XXX vm/pmap.h should not treat these prototypes as MACH_KERNEL_PRIVATE + * when KERNEL is defined. + */ +extern pmap_t pmap_create(vm_map_size_t size); +extern void pmap_switch(pmap_t); +extern void pmap_map_sharedpage(task_t task, pmap_t pmap); + +/* + * XXX kern/thread.h should not treat these prototypes as MACH_KERNEL_PRIVATE + * when KERNEL is defined. 
+ */ +extern kern_return_t thread_setstatus(thread_t thread, int flavor, + thread_state_t tstate, + mach_msg_type_number_t count); + +extern kern_return_t thread_state_initialize(thread_t thread); + + +/* XXX should have prototypes in a shared header file */ +extern int grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype); +extern int get_map_nentries(vm_map_t); +extern kern_return_t thread_userstack(thread_t, int, thread_state_t, + unsigned int, mach_vm_offset_t *, int *); +extern kern_return_t thread_entrypoint(thread_t, int, thread_state_t, + unsigned int, mach_vm_offset_t *); + + +/* An empty load_result_t */ +static load_result_t load_result_null = { + MACH_VM_MIN_ADDRESS, + MACH_VM_MIN_ADDRESS, + MACH_VM_MIN_ADDRESS, + 0, + 0, + 0, + 0 +}; /* * Prototypes of static functions. */ -static -load_return_t +static load_return_t parse_machfile( struct vnode *vp, - vm_map_t map, - thread_act_t thr_act, + vm_map_t map, + thread_t thr_act, struct mach_header *header, - unsigned long file_offset, - unsigned long macho_size, - int depth, - load_result_t *result, - boolean_t clean_regions -), + off_t file_offset, + off_t macho_size, + boolean_t shared_regions, + boolean_t clean_regions, + int depth, + load_result_t *result +); + +static load_return_t load_segment( struct segment_command *scp, void * pager, - unsigned long pager_offset, - unsigned long macho_size, - unsigned long end_of_file, + off_t pager_offset, + off_t macho_size, + off_t end_of_file, vm_map_t map, load_result_t *result -), +); + +static load_return_t +load_segment_64( + struct segment_command_64 *scp64, + void *pager, + off_t pager_offset, + off_t macho_size, + off_t end_of_file, + vm_map_t map, + load_result_t *result +); + +static load_return_t load_unixthread( struct thread_command *tcp, - thread_act_t thr_act, + thread_t thr_act, load_result_t *result -), +); + +static load_return_t load_thread( struct thread_command *tcp, - thread_act_t thr_act, + thread_t thr_act, load_result_t *result -), +); + +static load_return_t load_threadstate( thread_t thread, unsigned long *ts, unsigned long total_size -), +); + +static load_return_t load_threadstack( thread_t thread, unsigned long *ts, unsigned long total_size, - vm_offset_t *user_stack, + mach_vm_offset_t *user_stack, int *customstack -), +); + +static load_return_t load_threadentry( thread_t thread, unsigned long *ts, unsigned long total_size, - vm_offset_t *entry_point -), + mach_vm_offset_t *entry_point +); + +static load_return_t load_dylinker( struct dylinker_command *lcp, + integer_t archbits, vm_map_t map, - thread_act_t thr_act, + thread_t thr_act, int depth, load_result_t *result, boolean_t clean_regions -), +); + +static load_return_t get_macho_vnode( char *path, + integer_t archbits, struct mach_header *mach_header, - unsigned long *file_offset, - unsigned long *macho_size, + off_t *file_offset, + off_t *macho_size, struct vnode **vpp ); load_return_t load_machfile( - struct vnode *vp, + struct image_params *imgp, struct mach_header *header, - unsigned long file_offset, - unsigned long macho_size, - load_result_t *result, - thread_act_t thr_act, + thread_t thr_act, vm_map_t new_map, - boolean_t clean_regions + boolean_t clean_regions, + load_result_t *result ) { - pmap_t pmap; + struct vnode *vp = imgp->ip_vp; + off_t file_offset = imgp->ip_arch_offset; + off_t macho_size = imgp->ip_arch_size; + + pmap_t pmap = 0; /* protected by create_map */ vm_map_t map; vm_map_t old_map; load_result_t myresult; - kern_return_t kret; load_return_t lret; boolean_t 
create_map = TRUE;
-#ifndef i386
-	extern pmap_t pmap_create(vm_size_t size);	/* XXX */
-#endif
 
	if (new_map != VM_MAP_NULL) {
		create_map = FALSE;
@@ -164,7 +240,7 @@ load_machfile(
		pmap = get_task_pmap(current_task());
		pmap_reference(pmap);
 #else
-		pmap = pmap_create((vm_size_t) 0);
+		pmap = pmap_create((vm_map_size_t) 0);
 #endif
		map = vm_map_create(pmap,
				get_map_min(old_map),
@@ -176,10 +252,11 @@ load_machfile(
	if (!result)
		result = &myresult;
 
-	*result = (load_result_t) { 0 };
+	*result = load_result_null;
 
	lret = parse_machfile(vp, map, thr_act, header, file_offset, macho_size,
-			     0, result, clean_regions);
+			((imgp->ip_flags & IMGPF_IS_64BIT) == 0), /* shared regions? */
+			clean_regions, 0, result);
 
	if (lret != LOAD_SUCCESS) {
		if (create_map) {
@@ -213,27 +290,38 @@ load_machfile(
 
 int	dylink_test = 1;
 
+/*
+ * The file size of a mach-o file is limited to 32 bits; this is because
+ * that is the limit on the kalloc() of enough bytes for a mach_header and
+ * the contents of its sizeofcmds, which is currently constrained to 32
+ * bits in the file format itself.  We read into the kernel buffer the
+ * commands section, and then parse it in order to parse the mach-o file
+ * format load_command segment(s).  We are only interested in a subset of
+ * the total set of possible commands.
+ */
 static
 load_return_t
 parse_machfile(
-	struct vnode 	*vp,
+	struct vnode 		*vp,
	vm_map_t		map,
-	thread_act_t		thr_act,
+	thread_t		thr_act,
	struct mach_header	*header,
-	unsigned long		file_offset,
-	unsigned long		macho_size,
+	off_t			file_offset,
+	off_t			macho_size,
+	boolean_t		shared_regions,
+	boolean_t		clean_regions,
	int			depth,
-	load_result_t		*result,
-	boolean_t		clean_regions
+	load_result_t		*result
 )
 {
-	struct machine_slot	*ms;
	uint32_t		ncmds;
-	struct load_command	*lcp, *next;
+	struct load_command	*lcp;
	struct dylinker_command	*dlp = 0;
+	integer_t		dlarchbits = 0;
	void *			pager;
	load_return_t		ret = LOAD_SUCCESS;
-	vm_offset_t		addr, kl_addr;
+	caddr_t			addr;
+	void *			kl_addr;
	vm_size_t		size,kl_size;
	size_t			offset;
	size_t			oldoffset;	/* for overflow check */
@@ -242,6 +330,13 @@ parse_machfile(
	int error;
	int resid=0;
	task_t task;
+	size_t			mach_header_sz = sizeof(struct mach_header);
+	boolean_t		abi64;
+
+	if (header->magic == MH_MAGIC_64 ||
+	    header->magic == MH_CIGAM_64) {
+		mach_header_sz = sizeof(struct mach_header_64);
+	}
 
	/*
	 *	Break infinite recursion
@@ -256,11 +351,12 @@ parse_machfile(
	/*
	 *	Check to see if right machine type.
	 */
-	ms = &machine_slot[cpu_number()];
-	if ((header->cputype != ms->cpu_type) ||
-	    !check_cpu_subtype(header->cpusubtype))
+	if (((cpu_type_t)(header->cputype & ~CPU_ARCH_MASK) != cpu_type()) ||
+	    !grade_binary(header->cputype, header->cpusubtype))
		return(LOAD_BADARCH);
+
+	abi64 = ((header->cputype & CPU_ARCH_ABI64) == CPU_ARCH_ABI64);
 
	switch (header->filetype) {
 
	case MH_OBJECT:
@@ -295,13 +391,13 @@ parse_machfile(
	 *	Map portion that must be accessible directly into
	 *	kernel's map.
	 */
-	if ((sizeof (struct mach_header) + header->sizeofcmds) > macho_size)
+	if ((mach_header_sz + header->sizeofcmds) > macho_size)
		return(LOAD_BADMACHO);
 
	/*
	 *	Round size of Mach-O commands up to page boundary.
*/ - size = round_page_32(sizeof (struct mach_header) + header->sizeofcmds); + size = round_page(mach_header_sz + header->sizeofcmds); if (size <= 0) return(LOAD_BADMACHO); @@ -311,17 +407,18 @@ parse_machfile( addr = 0; kl_size = size; kl_addr = kalloc(size); - addr = kl_addr; + addr = (caddr_t)kl_addr; if (addr == NULL) return(LOAD_NOSPACE); - if(error = vn_rdwr(UIO_READ, vp, (caddr_t)addr, size, file_offset, - UIO_SYSSPACE, 0, p->p_ucred, &resid, p)) { + error = vn_rdwr(UIO_READ, vp, addr, size, file_offset, + UIO_SYSSPACE32, 0, kauth_cred_get(), &resid, p); + if (error) { if (kl_addr ) kfree(kl_addr, kl_size); return(LOAD_IOERROR); } - /* ubc_map(vp); */ /* NOT HERE */ + /* (void)ubc_map(vp, PROT_EXEC); */ /* NOT HERE */ /* * Scan through the commands, processing each one as necessary. @@ -333,7 +430,7 @@ parse_machfile( * run off the end of the reserved section by incrementing * the offset too far, so we are implicitly fail-safe. */ - offset = sizeof(struct mach_header); + offset = mach_header_sz; ncmds = header->ncmds; while (ncmds--) { /* @@ -353,8 +450,8 @@ parse_machfile( */ if (oldoffset > offset || lcp->cmdsize < sizeof(struct load_command) || - offset > header->sizeofcmds + sizeof(struct mach_header)) { - ret = LOAD_BADMACHO; + offset > header->sizeofcmds + mach_header_sz) { + ret = LOAD_BADMACHO; break; } @@ -363,41 +460,59 @@ parse_machfile( * intervention is required. */ switch(lcp->cmd) { + case LC_SEGMENT_64: + if (pass != 1) + break; + ret = load_segment_64( + (struct segment_command_64 *)lcp, + pager, + file_offset, + macho_size, + ubc_getsize(vp), + map, + result); + break; case LC_SEGMENT: if (pass != 1) break; ret = load_segment( (struct segment_command *) lcp, - pager, file_offset, + pager, + file_offset, macho_size, - (unsigned long)ubc_getsize(vp), + ubc_getsize(vp), map, result); break; case LC_THREAD: if (pass != 2) break; - ret = load_thread((struct thread_command *)lcp, thr_act, + ret = load_thread((struct thread_command *)lcp, + thr_act, result); break; case LC_UNIXTHREAD: if (pass != 2) break; ret = load_unixthread( - (struct thread_command *) lcp, thr_act, + (struct thread_command *) lcp, + thr_act, result); break; case LC_LOAD_DYLINKER: if (pass != 2) break; - if ((depth == 1) && (dlp == 0)) + if ((depth == 1) && (dlp == 0)) { dlp = (struct dylinker_command *)lcp; - else + dlarchbits = (header->cputype & CPU_ARCH_MASK); + } else { ret = LOAD_FAILURE; + } break; default: /* Other commands are ignored by the kernel */ ret = LOAD_SUCCESS; + break; } if (ret != LOAD_SUCCESS) break; @@ -405,8 +520,10 @@ parse_machfile( if (ret != LOAD_SUCCESS) break; } - if ((ret == LOAD_SUCCESS) && (depth == 1)) { - vm_offset_t addr; + if (ret == LOAD_SUCCESS) { + + if (shared_regions) { + vm_offset_t vmaddr; shared_region_mapping_t shared_region; struct shared_region_task_mappings map_info; shared_region_mapping_t next; @@ -454,25 +571,24 @@ RedoLookup: } } - if (dylink_test) { p->p_flag |= P_NOSHLIB; /* no shlibs in use */ - addr = map_info.client_base; + vmaddr = map_info.client_base; if(clean_regions) { - vm_map(map, &addr, map_info.text_size, - 0, SHARED_LIB_ALIAS, + vm_map(map, &vmaddr, map_info.text_size, + 0, SHARED_LIB_ALIAS|VM_FLAGS_FIXED, map_info.text_region, 0, FALSE, VM_PROT_READ, VM_PROT_READ, VM_INHERIT_SHARE); } else { - vm_map(map, &addr, map_info.text_size, 0, + vm_map(map, &vmaddr, map_info.text_size, 0, (VM_MEMORY_SHARED_PMAP << 24) - | SHARED_LIB_ALIAS, + | SHARED_LIB_ALIAS | VM_FLAGS_FIXED, map_info.text_region, 0, FALSE, VM_PROT_READ, VM_PROT_READ, 
VM_INHERIT_SHARE); } - addr = map_info.client_base + map_info.text_size; - vm_map(map, &addr, map_info.data_size, - 0, SHARED_LIB_ALIAS, + vmaddr = map_info.client_base + map_info.text_size; + vm_map(map, &vmaddr, map_info.data_size, + 0, SHARED_LIB_ALIAS | VM_FLAGS_FIXED, map_info.data_region, 0, TRUE, VM_PROT_READ, VM_PROT_READ, VM_INHERIT_SHARE); @@ -497,27 +613,36 @@ RedoLookup: &(map_info.system), &(map_info.flags), &next); - addr = map_info.client_base; - vm_map(map, &addr, map_info.text_size, - 0, SHARED_LIB_ALIAS, + vmaddr = map_info.client_base; + vm_map(map, &vmaddr, map_info.text_size, + 0, SHARED_LIB_ALIAS | VM_FLAGS_FIXED, map_info.text_region, 0, FALSE, VM_PROT_READ, VM_PROT_READ, VM_INHERIT_SHARE); } } - if (dlp != 0) { - ret = load_dylinker(dlp, map, thr_act, - depth, result, clean_regions); - } + } + if (dlp != 0) + ret = load_dylinker(dlp, dlarchbits, map, thr_act, depth, result, clean_regions); + + if(depth == 1) { + if (result->thread_count == 0) + ret = LOAD_FAILURE; +#ifdef __ppc__ + else if ( abi64 ) { + /* Map in 64-bit commpage */ + /* LP64todo - make this clean */ + pmap_map_sharedpage(current_task(), get_map_pmap(map)); + vm_map_commpage64(map); + } +#endif + } } if (kl_addr ) kfree(kl_addr, kl_size); - if ((ret == LOAD_SUCCESS) && (depth == 1) && - (result->thread_count == 0)) - ret = LOAD_FAILURE; if (ret == LOAD_SUCCESS) - ubc_map(vp); + (void)ubc_map(vp, PROT_EXEC); return(ret); } @@ -527,9 +652,9 @@ load_return_t load_segment( struct segment_command *scp, void * pager, - unsigned long pager_offset, - unsigned long macho_size, - unsigned long end_of_file, + off_t pager_offset, + off_t macho_size, + __unused off_t end_of_file, vm_map_t map, load_result_t *result ) @@ -537,7 +662,6 @@ load_segment( kern_return_t ret; vm_offset_t map_addr, map_offset; vm_size_t map_size, seg_size, delta_size; - caddr_t tmp; vm_prot_t initprot; vm_prot_t maxprot; @@ -548,15 +672,15 @@ load_segment( if (scp->fileoff + scp->filesize > macho_size) return (LOAD_BADMACHO); - seg_size = round_page_32(scp->vmsize); + seg_size = round_page(scp->vmsize); if (seg_size == 0) return(KERN_SUCCESS); /* * Round sizes to page size. */ - map_size = round_page_32(scp->filesize); - map_addr = trunc_page_32(scp->vmaddr); + map_size = round_page(scp->filesize); + map_addr = trunc_page(scp->vmaddr); map_offset = pager_offset + scp->fileoff; @@ -567,8 +691,8 @@ load_segment( * Map a copy of the file into the address space. 
*/ ret = vm_map(map, - &map_addr, map_size, (vm_offset_t)0, FALSE, - pager, map_offset, TRUE, + &map_addr, map_size, (vm_offset_t)0, + VM_FLAGS_FIXED, pager, map_offset, TRUE, initprot, maxprot, VM_INHERIT_DEFAULT); if (ret != KERN_SUCCESS) @@ -583,7 +707,7 @@ load_segment( if (delta_size > 0) { vm_offset_t tmp; - ret = vm_allocate(kernel_map, &tmp, delta_size, TRUE); + ret = vm_allocate(kernel_map, &tmp, delta_size, VM_FLAGS_ANYWHERE); if (ret != KERN_SUCCESS) return(LOAD_RESOURCE); @@ -608,7 +732,7 @@ load_segment( if (delta_size > 0) { vm_offset_t tmp = map_addr + map_size; - ret = vm_allocate(map, &tmp, delta_size, FALSE); + ret = vm_allocate(map, &tmp, delta_size, VM_FLAGS_FIXED); if (ret != KERN_SUCCESS) return(LOAD_NOSPACE); } @@ -634,49 +758,110 @@ load_segment( static load_return_t -load_unixthread( - struct thread_command *tcp, - thread_act_t thread, +load_segment_64( + struct segment_command_64 *scp64, + void * pager, + off_t pager_offset, + off_t macho_size, + __unused off_t end_of_file, + vm_map_t map, load_result_t *result ) { - load_return_t ret; - int customstack =0; + kern_return_t ret; + mach_vm_offset_t map_addr, map_offset; + mach_vm_size_t map_size, seg_size, delta_size; + vm_prot_t initprot; + vm_prot_t maxprot; - if (result->thread_count != 0) + /* + * Make sure what we get from the file is really ours (as specified + * by macho_size). + */ + if (scp64->fileoff + scp64->filesize > (uint64_t)macho_size) + return (LOAD_BADMACHO); + + seg_size = round_page_64(scp64->vmsize); + if (seg_size == 0) + return(KERN_SUCCESS); + + /* + * Round sizes to page size. + */ + map_size = round_page_64(scp64->filesize); /* limited to 32 bits */ + map_addr = round_page_64(scp64->vmaddr); + + map_offset = pager_offset + scp64->fileoff; /* limited to 32 bits */ + + if (map_size > 0) { + initprot = (scp64->initprot) & VM_PROT_ALL; + maxprot = (scp64->maxprot) & VM_PROT_ALL; + /* + * Map a copy of the file into the address space. + */ + ret = mach_vm_map(map, + &map_addr, map_size, (mach_vm_offset_t)0, + VM_FLAGS_FIXED, pager, map_offset, TRUE, + initprot, maxprot, + VM_INHERIT_DEFAULT); + if (ret != KERN_SUCCESS) + return(LOAD_NOSPACE); + + /* + * If the file didn't end on a page boundary, + * we need to zero the leftover. + */ + delta_size = map_size - scp64->filesize; +#if FIXME + if (delta_size > 0) { + mach_vm_offset_t tmp; + + ret = vm_allocate(kernel_map, &tmp, delta_size, VM_FLAGS_ANYWHERE); + if (ret != KERN_SUCCESS) + return(LOAD_RESOURCE); + + if (copyout(tmp, map_addr + scp64->filesize, + delta_size)) { + (void) vm_deallocate( + kernel_map, tmp, delta_size); return (LOAD_FAILURE); + } - ret = load_threadstack(thread, - (unsigned long *)(((vm_offset_t)tcp) + - sizeof(struct thread_command)), - tcp->cmdsize - sizeof(struct thread_command), - &result->user_stack, - &customstack); - if (ret != LOAD_SUCCESS) - return(ret); + (void) vm_deallocate(kernel_map, tmp, delta_size); + } +#endif /* FIXME */ + } - if (customstack) - result->customstack = 1; - else - result->customstack = 0; - ret = load_threadentry(thread, - (unsigned long *)(((vm_offset_t)tcp) + - sizeof(struct thread_command)), - tcp->cmdsize - sizeof(struct thread_command), - &result->entry_point); - if (ret != LOAD_SUCCESS) - return(ret); + /* + * If the virtual size of the segment is greater + * than the size from the file, we need to allocate + * zero fill memory for the rest. 
+ */ + delta_size = seg_size - map_size; + if (delta_size > 0) { + mach_vm_offset_t tmp = map_addr + map_size; - ret = load_threadstate(thread, - (unsigned long *)(((vm_offset_t)tcp) + - sizeof(struct thread_command)), - tcp->cmdsize - sizeof(struct thread_command)); - if (ret != LOAD_SUCCESS) - return (ret); + ret = mach_vm_allocate(map, &tmp, delta_size, VM_FLAGS_FIXED); + if (ret != KERN_SUCCESS) + return(LOAD_NOSPACE); + } - result->unixproc = TRUE; - result->thread_count++; + /* + * Set protection values. (Note: ignore errors!) + */ + if (scp64->maxprot != VM_PROT_DEFAULT) { + (void) mach_vm_protect(map, + map_addr, seg_size, + TRUE, scp64->maxprot); + } + if (scp64->initprot != VM_PROT_DEFAULT) { + (void) mach_vm_protect(map, + map_addr, seg_size, + FALSE, scp64->initprot); + } + if ( (scp64->fileoff == 0) && (scp64->filesize != 0) ) + result->mach_header = map_addr; return(LOAD_SUCCESS); } @@ -684,7 +869,7 @@ static load_return_t load_thread( struct thread_command *tcp, - thread_act_t thread, + thread_t thread, load_result_t *result ) { @@ -700,7 +885,7 @@ load_thread( kret = thread_create(task, &thread); if (kret != KERN_SUCCESS) return(LOAD_RESOURCE); - act_deallocate(thread); + thread_deallocate(thread); } lret = load_threadstate(thread, @@ -746,6 +931,54 @@ load_thread( return(LOAD_SUCCESS); } +static +load_return_t +load_unixthread( + struct thread_command *tcp, + thread_t thread, + load_result_t *result +) +{ + load_return_t ret; + int customstack =0; + + if (result->thread_count != 0) + return (LOAD_FAILURE); + + ret = load_threadstack(thread, + (unsigned long *)(((vm_offset_t)tcp) + + sizeof(struct thread_command)), + tcp->cmdsize - sizeof(struct thread_command), + &result->user_stack, + &customstack); + if (ret != LOAD_SUCCESS) + return(ret); + + if (customstack) + result->customstack = 1; + else + result->customstack = 0; + ret = load_threadentry(thread, + (unsigned long *)(((vm_offset_t)tcp) + + sizeof(struct thread_command)), + tcp->cmdsize - sizeof(struct thread_command), + &result->entry_point); + if (ret != LOAD_SUCCESS) + return(ret); + + ret = load_threadstate(thread, + (unsigned long *)(((vm_offset_t)tcp) + + sizeof(struct thread_command)), + tcp->cmdsize - sizeof(struct thread_command)); + if (ret != LOAD_SUCCESS) + return (ret); + + result->unixproc = TRUE; + result->thread_count++; + + return(LOAD_SUCCESS); +} + static load_return_t load_threadstate( @@ -757,18 +990,29 @@ load_threadstate( kern_return_t ret; unsigned long size; int flavor; + unsigned long thread_size; + ret = thread_state_initialize( thread ); + if (ret != KERN_SUCCESS) + return(LOAD_FAILURE); + /* - * Set the thread state. + * Set the new thread state; iterate through the state flavors in + * the mach-o file. */ - while (total_size > 0) { flavor = *ts++; size = *ts++; - total_size -= (size+2)*sizeof(unsigned long); - if (total_size < 0) + thread_size = (size+2)*sizeof(unsigned long); + if (thread_size > total_size) return(LOAD_BADMACHO); - ret = thread_setstatus(thread, flavor, ts, size); + total_size -= thread_size; + /* + * Third argument is a kernel space pointer; it gets cast + * to the appropriate type in machine_thread_set_state() + * based on the value of flavor. 
+ */ + ret = thread_setstatus(thread, flavor, (thread_state_t)ts, size); if (ret != KERN_SUCCESS) return(LOAD_FAILURE); ts += size; /* ts is a (unsigned long *) */ @@ -782,23 +1026,29 @@ load_threadstack( thread_t thread, unsigned long *ts, unsigned long total_size, - vm_offset_t *user_stack, + user_addr_t *user_stack, int *customstack ) { kern_return_t ret; unsigned long size; int flavor; + unsigned long stack_size; while (total_size > 0) { flavor = *ts++; size = *ts++; - total_size -= (size+2)*sizeof(unsigned long); - if (total_size < 0) + stack_size = (size+2)*sizeof(unsigned long); + if (stack_size > total_size) return(LOAD_BADMACHO); - *user_stack = USRSTACK; - ret = thread_userstack(thread, flavor, ts, size, - user_stack, customstack); + total_size -= stack_size; + + /* + * Third argument is a kernel space pointer; it gets cast + * to the appropriate type in thread_userstack() based on + * the value of flavor. + */ + ret = thread_userstack(thread, flavor, (thread_state_t)ts, size, user_stack, customstack); if (ret != KERN_SUCCESS) return(LOAD_FAILURE); ts += size; /* ts is a (unsigned long *) */ @@ -812,24 +1062,31 @@ load_threadentry( thread_t thread, unsigned long *ts, unsigned long total_size, - vm_offset_t *entry_point + mach_vm_offset_t *entry_point ) { kern_return_t ret; unsigned long size; int flavor; + unsigned long entry_size; /* * Set the thread state. */ - *entry_point = 0; + *entry_point = MACH_VM_MIN_ADDRESS; while (total_size > 0) { flavor = *ts++; size = *ts++; - total_size -= (size+2)*sizeof(unsigned long); - if (total_size < 0) + entry_size = (size+2)*sizeof(unsigned long); + if (entry_size > total_size) return(LOAD_BADMACHO); - ret = thread_entrypoint(thread, flavor, ts, size, entry_point); + total_size -= entry_size; + /* + * Third argument is a kernel space pointer; it gets cast + * to the appropriate type in thread_entrypoint() based on + * the value of flavor. + */ + ret = thread_entrypoint(thread, flavor, (thread_state_t)ts, size, entry_point); if (ret != KERN_SUCCESS) return(LOAD_FAILURE); ts += size; /* ts is a (unsigned long *) */ @@ -842,8 +1099,9 @@ static load_return_t load_dylinker( struct dylinker_command *lcp, + integer_t archbits, vm_map_t map, - thread_act_t thr_act, + thread_t thr_act, int depth, load_result_t *result, boolean_t clean_regions @@ -853,15 +1111,14 @@ load_dylinker( char *p; struct vnode *vp; struct mach_header header; - unsigned long file_offset; - unsigned long macho_size; + off_t file_offset; + off_t macho_size; vm_map_t copy_map; load_result_t myresult; kern_return_t ret; vm_map_copy_t tmp; - vm_offset_t dyl_start, map_addr; - vm_size_t dyl_length; - extern pmap_t pmap_create(vm_size_t size); /* XXX */ + mach_vm_offset_t dyl_start, map_addr; + mach_vm_size_t dyl_length; name = (char *)lcp + lcp->name.offset; /* @@ -873,35 +1130,39 @@ load_dylinker( return(LOAD_BADMACHO); } while (*p++); - ret = get_macho_vnode(name, &header, &file_offset, &macho_size, &vp); + ret = get_macho_vnode(name, archbits, &header, &file_offset, &macho_size, &vp); if (ret) return (ret); - myresult = (load_result_t) { 0 }; - /* * Load the Mach-O. + * Use a temporary map to do the work. 
*/ - - copy_map = vm_map_create(pmap_create(macho_size), - get_map_min(map), get_map_max( map), TRUE); + copy_map = vm_map_create(pmap_create(vm_map_round_page(macho_size)), + get_map_min(map), get_map_max(map), TRUE); + if (VM_MAP_NULL == copy_map) { + ret = LOAD_RESOURCE; + goto out; + } + + myresult = load_result_null; ret = parse_machfile(vp, copy_map, thr_act, &header, file_offset, macho_size, - depth, &myresult, clean_regions); + FALSE, clean_regions, depth, &myresult); if (ret) goto out; if (get_map_nentries(copy_map) > 0) { - dyl_start = get_map_start(copy_map); - dyl_length = get_map_end(copy_map) - dyl_start; + dyl_start = mach_get_vm_start(copy_map); + dyl_length = mach_get_vm_end(copy_map) - dyl_start; map_addr = dyl_start; - ret = vm_allocate(map, &map_addr, dyl_length, FALSE); + ret = mach_vm_allocate(map, &map_addr, dyl_length, VM_FLAGS_FIXED); if (ret != KERN_SUCCESS) { - ret = vm_allocate(map, &map_addr, dyl_length, TRUE); + ret = mach_vm_allocate(map, &map_addr, dyl_length, VM_FLAGS_ANYWHERE); } if (ret != KERN_SUCCESS) { @@ -909,24 +1170,29 @@ load_dylinker( goto out; } - ret = vm_map_copyin(copy_map, dyl_start, dyl_length, TRUE, - &tmp); + ret = vm_map_copyin(copy_map, + (vm_map_address_t)dyl_start, + (vm_map_size_t)dyl_length, + TRUE, &tmp); if (ret != KERN_SUCCESS) { (void) vm_map_remove(map, - map_addr, - map_addr + dyl_length, - VM_MAP_NO_FLAGS); + vm_map_trunc_page(map_addr), + vm_map_round_page(map_addr + dyl_length), + VM_MAP_NO_FLAGS); goto out; } - ret = vm_map_copy_overwrite(map, map_addr, tmp, FALSE); + ret = vm_map_copy_overwrite(map, + (vm_map_address_t)map_addr, + tmp, FALSE); if (ret != KERN_SUCCESS) { - vm_map_copy_discard(tmp); - (void) vm_map_remove(map, - map_addr, - map_addr + dyl_length, - VM_MAP_NO_FLAGS); - goto out; } + vm_map_copy_discard(tmp); + (void) vm_map_remove(map, + vm_map_trunc_page(map_addr), + vm_map_round_page(map_addr + dyl_length), + VM_MAP_NO_FLAGS); + goto out; + } if (map_addr != dyl_start) myresult.entry_point += (map_addr - dyl_start); @@ -936,28 +1202,35 @@ load_dylinker( if (ret == LOAD_SUCCESS) { result->dynlinker = TRUE; result->entry_point = myresult.entry_point; - ubc_map(vp); + (void)ubc_map(vp, PROT_EXEC); } out: vm_map_deallocate(copy_map); - vrele(vp); + vnode_put(vp); return (ret); } +/* + * This routine exists to support the load_dylinker(). + * + * This routine has its own, separate, understanding of the FAT file format, + * which is terrifically unfortunate. 
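+ *
+ * For reference, the on-disk layout it re-parses (all fields
+ * big-endian):
+ *
+ *	struct fat_header { uint32_t magic; uint32_t nfat_arch; };
+ *	struct fat_arch   { cpu_type_t cputype; cpu_subtype_t cpusubtype;
+ *			    uint32_t offset; uint32_t size; uint32_t align; };
+ *
+ * fatfile_getarch_with_bits() walks the fat_arch array for the
+ * slice whose CPU_ARCH_* bits match archbits before the Mach-O
+ * header is read.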
+ */ static load_return_t get_macho_vnode( char *path, + integer_t archbits, struct mach_header *mach_header, - unsigned long *file_offset, - unsigned long *macho_size, + off_t *file_offset, + off_t *macho_size, struct vnode **vpp ) { struct vnode *vp; - struct vattr attr, *atp; + struct vfs_context context; struct nameidata nid, *ndp; struct proc *p = current_proc(); /* XXXX */ boolean_t is_fat; @@ -970,23 +1243,25 @@ get_macho_vnode( char pad[512]; } header; off_t fsize = (off_t)0; - struct ucred *cred = p->p_ucred; + struct ucred *cred = kauth_cred_get(); int err2; + context.vc_proc = p; + context.vc_ucred = cred; + ndp = &nid; - atp = &attr; /* init the namei data to point the file user's program name */ - NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, p); + NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE32, CAST_USER_ADDR_T(path), &context); - if (error = namei(ndp)) { + if ((error = namei(ndp)) != 0) { if (error == ENOENT) error = LOAD_ENOENT; else error = LOAD_FAILURE; return(error); } - + nameidone(ndp); vp = ndp->ni_vp; /* check for regular file */ @@ -995,8 +1270,8 @@ get_macho_vnode( goto bad1; } - /* get attributes */ - if (error = VOP_GETATTR(vp, &attr, cred, p)) { + /* get size */ + if ((error = vnode_size(vp, &fsize, &context)) != 0) { error = LOAD_FAILURE; goto bad1; } @@ -1007,39 +1282,26 @@ get_macho_vnode( goto bad1; } - if ((vp->v_mount->mnt_flag & MNT_NOSUID) || (p->p_flag & P_TRACED)) - atp->va_mode &= ~(VSUID | VSGID); - - /* check access. for root we have to see if any exec bit on */ - if (error = VOP_ACCESS(vp, VEXEC, cred, p)) { + /* check access */ + if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_EXECUTE, &context)) != 0) { error = LOAD_PROTECT; goto bad1; } - if ((atp->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { - error = LOAD_PROTECT; - goto bad1; - } - - /* hold the vnode for the IO */ - if (UBCINFOEXISTS(vp) && !ubc_hold(vp)) { - error = LOAD_ENOENT; - goto bad1; - } /* try to open it */ - if (error = VOP_OPEN(vp, FREAD, cred, p)) { + if ((error = VNOP_OPEN(vp, FREAD, &context)) != 0) { error = LOAD_PROTECT; - ubc_rele(vp); goto bad1; } - if(error = vn_rdwr(UIO_READ, vp, (caddr_t)&header, sizeof(header), 0, - UIO_SYSSPACE, IO_NODELOCKED, cred, &resid, p)) { + if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)&header, sizeof(header), 0, + UIO_SYSSPACE32, IO_NODELOCKED, cred, &resid, p)) != 0) { error = LOAD_IOERROR; goto bad2; } - if (header.mach_header.magic == MH_MAGIC) + if (header.mach_header.magic == MH_MAGIC || + header.mach_header.magic == MH_MAGIC_64) is_fat = FALSE; else if (header.fat_header.magic == FAT_MAGIC || header.fat_header.magic == FAT_CIGAM) @@ -1051,21 +1313,22 @@ get_macho_vnode( if (is_fat) { /* Look up our architecture in the fat file. */ - error = fatfile_getarch(vp, (vm_offset_t)(&header.fat_header), &fat_arch); + error = fatfile_getarch_with_bits(vp, archbits, (vm_offset_t)(&header.fat_header), &fat_arch); if (error != LOAD_SUCCESS) goto bad2; /* Read the Mach-O header out of it */ error = vn_rdwr(UIO_READ, vp, (caddr_t)&header.mach_header, sizeof(header.mach_header), fat_arch.offset, - UIO_SYSSPACE, IO_NODELOCKED, cred, &resid, p); + UIO_SYSSPACE32, IO_NODELOCKED, cred, &resid, p); if (error) { error = LOAD_IOERROR; goto bad2; } /* Is this really a Mach-O? 
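+	 * (MH_MAGIC_64 is now accepted as well, since a 64-bit slice may
+	 * be selected as the dynamic linker.)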
*/ - if (header.mach_header.magic != MH_MAGIC) { + if (header.mach_header.magic != MH_MAGIC && + header.mach_header.magic != MH_MAGIC_64) { error = LOAD_BADMACHO; goto bad2; } @@ -1073,28 +1336,36 @@ get_macho_vnode( *file_offset = fat_arch.offset; *macho_size = fsize = fat_arch.size; } else { + /* + * Force get_macho_vnode() to fail if the architecture bits + * do not match the expected architecture bits. This in + * turn causes load_dylinker() to fail for the same reason, + * so it ensures the dynamic linker and the binary are in + * lock-step. This is potentially bad, if we ever add to + * the CPU_ARCH_* bits any bits that are desirable but not + * required, since the dynamic linker might work, but we will + * refuse to load it because of this check. + */ + if ((cpu_type_t)(header.mach_header.cputype & CPU_ARCH_MASK) != archbits) + return(LOAD_BADARCH); *file_offset = 0; - *macho_size = fsize = attr.va_size; + *macho_size = fsize; } *mach_header = header.mach_header; *vpp = vp; - if (UBCISVALID(vp)) - ubc_setsize(vp, fsize); /* XXX why? */ + + ubc_setsize(vp, fsize); - VOP_UNLOCK(vp, 0, p); - ubc_rele(vp); return (error); bad2: - VOP_UNLOCK(vp, 0, p); - err2 = VOP_CLOSE(vp, FREAD, cred, p); - ubc_rele(vp); - vrele(vp); + err2 = VNOP_CLOSE(vp, FREAD, &context); + vnode_put(vp); return (error); bad1: - vput(vp); + vnode_put(vp); return(error); } diff --git a/bsd/kern/mach_loader.h b/bsd/kern/mach_loader.h index 939445e02..75713b125 100644 --- a/bsd/kern/mach_loader.h +++ b/bsd/kern/mach_loader.h @@ -39,11 +39,15 @@ typedef int load_return_t; +/* + * Structure describing the result from calling load_machfile(), if that + * function returns LOAD_SUCCESS. + */ typedef struct _load_result { - vm_offset_t mach_header; - vm_offset_t entry_point; - vm_offset_t user_stack; - int thread_count; + user_addr_t mach_header; + user_addr_t entry_point; + user_addr_t user_stack; + int thread_count; unsigned int /* boolean_t */ unixproc :1, dynlinker :1, @@ -51,15 +55,14 @@ typedef struct _load_result { :0; } load_result_t; +struct image_params; load_return_t load_machfile( - struct vnode *vp, + struct image_params *imgp, struct mach_header *header, - unsigned long file_offset, - unsigned long macho_size, - load_result_t *result, - thread_act_t thr_act, + thread_t thr_act, vm_map_t map, - boolean_t clean_regions); + boolean_t clean_regions, + load_result_t *result); #define LOAD_SUCCESS 0 #define LOAD_BADARCH 1 /* CPU type/subtype not found */ diff --git a/bsd/kern/mach_process.c b/bsd/kern/mach_process.c index 8c0567ea1..caa043027 100644 --- a/bsd/kern/mach_process.c +++ b/bsd/kern/mach_process.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -65,7 +65,8 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/errno.h> #include <sys/ptrace.h> #include <sys/uio.h> @@ -73,7 +74,8 @@ #include <sys/sysctl.h> #include <sys/wait.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> +#include <sys/sysproto.h> #include <bsm/audit_kernel.h> @@ -81,22 +83,23 @@ #include <kern/thread.h> #include <mach/machine/thread_status.h> + /* Macros to clear/set/test flags. 
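+ * For example, SET(t->p_flag, P_TRACED) below marks the target as
+ * traced, and ISSET(p->p_flag, P_TRACED) tests the same bit.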
*/ #define SET(t, f) (t) |= (f) #define CLR(t, f) (t) &= ~(f) #define ISSET(t, f) ((t) & (f)) -void psignal_lock __P((struct proc *, int, int)); +extern thread_t port_name_to_thread(mach_port_name_t port_name); +extern kern_return_t thread_getstatus(thread_t thread, int flavor, thread_state_t tstate, mach_msg_type_number_t *count); +extern thread_t get_firstthread(task_t); + +#if defined (ppc) +extern kern_return_t thread_setstatus(thread_t thread, int flavor, thread_state_t tstate, mach_msg_type_number_t count); +#endif /* * sys-trace system call. */ -struct ptrace_args { - int req; - pid_t pid; - caddr_t addr; - int data; -}; int ptrace(p, uap, retval) @@ -105,15 +108,10 @@ ptrace(p, uap, retval) register_t *retval; { struct proc *t = current_proc(); /* target process */ - vm_offset_t start_addr, end_addr, - kern_addr, offset; - vm_size_t size; task_t task; - thread_t thread; - thread_act_t th_act; + thread_t th_act; struct uthread *ut; int *locr0; - int error = 0; #if defined(ppc) struct ppc_thread_state64 statep; #elif defined(i386) @@ -129,14 +127,14 @@ ptrace(p, uap, retval) AUDIT_ARG(addr, uap->addr); AUDIT_ARG(value, uap->data); - if (uap->req == PT_DENY_ATTACH) { - if (ISSET(p->p_flag, P_TRACED)) { - exit1(p, W_EXITCODE(ENOTSUP, 0), retval); - /* drop funnel before we return */ - thread_funnel_set(kernel_flock, FALSE); - thread_exception_return(); - /* NOTREACHED */ - } + if (uap->req == PT_DENY_ATTACH) { + if (ISSET(p->p_flag, P_TRACED)) { + exit1(p, W_EXITCODE(ENOTSUP, 0), retval); + /* drop funnel before we return */ + thread_funnel_set(kernel_flock, FALSE); + thread_exception_return(); + /* NOTREACHED */ + } SET(p->p_flag, P_NOATTACH); return(0); @@ -173,7 +171,6 @@ ptrace(p, uap, retval) if ((t = pfind(uap->pid)) == NULL) return (ESRCH); - AUDIT_ARG(process, t); /* We do not want ptrace to do anything with kernel, init @@ -188,52 +185,35 @@ ptrace(p, uap, retval) tr_sigexc = 1; } if (uap->req == PT_ATTACH) { - - /* - * You can't attach to a process if: - * (1) it's the process that's doing the attaching, - */ - if (t->p_pid == p->p_pid) - return (EINVAL); - - /* - * (2) it's already being traced, or - */ - if (ISSET(t->p_flag, P_TRACED)) - return (EBUSY); - - /* - * (3) it's not owned by you, or is set-id on exec - * (unless you're root). 
- */ - if ((t->p_cred->p_ruid != p->p_cred->p_ruid || - ISSET(t->p_flag, P_SUGID)) && - (error = suser(p->p_ucred, &p->p_acflag)) != 0) - return (error); - - if ((p->p_flag & P_TRACED) && isinferior(p, t)) - return(EPERM); - - if (ISSET(t->p_flag, P_NOATTACH)) { - psignal(p, SIGSEGV); - return (EBUSY); + int err; + + if ( kauth_authorize_process(proc_ucred(p), KAUTH_PROCESS_CANTRACE, + t, (uintptr_t)&err, 0, 0) == 0 ) { + /* it's OK to attach */ + SET(t->p_flag, P_TRACED); + if (tr_sigexc) + SET(t->p_flag, P_SIGEXC); + + t->p_oppid = t->p_pptr->p_pid; + if (t->p_pptr != p) + proc_reparent(t, p); + + if (get_task_userstop(task) == 0 ) { + t->p_xstat = 0; + psignal(t, SIGSTOP); + } else { + t->p_xstat = SIGSTOP; + task_resume(task); + } + return(0); } - SET(t->p_flag, P_TRACED); - if (tr_sigexc) - SET(t->p_flag, P_SIGEXC); - - t->p_oppid = t->p_pptr->p_pid; - if (t->p_pptr != p) - proc_reparent(t, p); - - if (get_task_userstop(task) == 0 ) { - t->p_xstat = 0; - psignal(t, SIGSTOP); - } else { - t->p_xstat = SIGSTOP; - task_resume(task); + else { + /* not allowed to attach, proper error code returned by kauth_authorize_process */ + if (ISSET(t->p_flag, P_NOATTACH)) { + psignal(p, SIGSEGV); + } + return (err); } - return(0); } /* @@ -284,8 +264,8 @@ ptrace(p, uap, retval) case PT_STEP: /* single step the child */ case PT_CONTINUE: /* continue the child */ - th_act = (thread_act_t)get_firstthread(task); - if (th_act == THR_ACT_NULL) + th_act = (thread_t)get_firstthread(task); + if (th_act == THREAD_NULL) goto errorLabel; ut = (uthread_t)get_bsdthread_info(th_act); locr0 = ut->uu_ar0; @@ -296,13 +276,13 @@ ptrace(p, uap, retval) } #elif defined(ppc) state_count = PPC_THREAD_STATE64_COUNT; - if (thread_getstatus(th_act, PPC_THREAD_STATE64, &statep, &state_count) != KERN_SUCCESS) { + if (thread_getstatus(th_act, PPC_THREAD_STATE64, (thread_state_t)&statep, (mach_msg_type_number_t *)&state_count) != KERN_SUCCESS) { goto errorLabel; } #else #error architecture not supported #endif - if ((int)uap->addr != 1) { + if (uap->addr != (user_addr_t)1) { #if defined(i386) locr0[PC] = (int)uap->addr; #elif defined(ppc) @@ -310,18 +290,18 @@ ptrace(p, uap, retval) if (!ALIGNED((int)uap->addr, sizeof(int))) return (ERESTART); - statep.srr0 = (uint64_t)((uint32_t)uap->addr); + statep.srr0 = uap->addr; state_count = PPC_THREAD_STATE64_COUNT; - if (thread_setstatus(th_act, PPC_THREAD_STATE64, &statep, &state_count) != KERN_SUCCESS) { + if (thread_setstatus(th_act, PPC_THREAD_STATE64, (thread_state_t)&statep, state_count) != KERN_SUCCESS) { goto errorLabel; } #undef ALIGNED #else #error architecture not implemented! 
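+	/*
+	 * (addr == 1 is the historical ptrace convention for "resume
+	 * from wherever the target stopped" rather than at an explicit
+	 * PC, which is why it is special-cased above.)
+	 */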
#endif - } /* (int)uap->addr != 1 */ + } /* uap->addr != (user_addr_t)1 */ - if ((unsigned)uap->data < 0 || (unsigned)uap->data >= NSIG) + if ((unsigned)uap->data >= NSIG) goto errorLabel; if (uap->data != 0) { @@ -329,7 +309,7 @@ ptrace(p, uap, retval) } #if defined(ppc) state_count = PPC_THREAD_STATE64_COUNT; - if (thread_getstatus(th_act, PPC_THREAD_STATE64, &statep, &state_count) != KERN_SUCCESS) { + if (thread_getstatus(th_act, PPC_THREAD_STATE64, (thread_state_t)&statep, (mach_msg_type_number_t *)&state_count) != KERN_SUCCESS) { goto errorLabel; } #endif @@ -354,7 +334,7 @@ ptrace(p, uap, retval) } #if defined (ppc) state_count = PPC_THREAD_STATE64_COUNT; - if (thread_setstatus(th_act, PPC_THREAD_STATE64, &statep, &state_count) != KERN_SUCCESS) { + if (thread_setstatus(th_act, PPC_THREAD_STATE64, (thread_state_t)&statep, state_count) != KERN_SUCCESS) { goto errorLabel; } #endif @@ -369,19 +349,17 @@ ptrace(p, uap, retval) break; case PT_THUPDATE: { - thread_act_t target_act; - if ((unsigned)uap->data >= NSIG) goto errorLabel; - th_act = (thread_act_t)port_name_to_act((void *)uap->addr); - if (th_act == THR_ACT_NULL) + th_act = port_name_to_thread(CAST_DOWN(mach_port_name_t, uap->addr)); + if (th_act == THREAD_NULL) return (ESRCH); ut = (uthread_t)get_bsdthread_info(th_act); if (uap->data) ut->uu_siglist |= sigmask(uap->data); t->p_xstat = uap->data; t->p_stat = SRUN; - act_deallocate(th_act); + thread_deallocate(th_act); return(0); } break; @@ -393,3 +371,51 @@ errorLabel: return(0); } + +/* + * determine if one process (cur_procp) can trace another process (traced_procp). + */ + +int +cantrace(proc_t cur_procp, kauth_cred_t creds, proc_t traced_procp, int *errp) +{ + int my_err; + /* + * You can't trace a process if: + * (1) it's the process that's doing the tracing, + */ + if (traced_procp->p_pid == cur_procp->p_pid) { + *errp = EINVAL; + return (0); + } + + /* + * (2) it's already being traced, or + */ + if (ISSET(traced_procp->p_flag, P_TRACED)) { + *errp = EBUSY; + return (0); + } + + /* + * (3) it's not owned by you, or is set-id on exec + * (unless you're root). + */ + if ((creds->cr_ruid != proc_ucred(traced_procp)->cr_ruid || + ISSET(traced_procp->p_flag, P_SUGID)) && + (my_err = suser(creds, &cur_procp->p_acflag)) != 0) { + *errp = my_err; + return (0); + } + + if ((cur_procp->p_flag & P_TRACED) && isinferior(cur_procp, traced_procp)) { + *errp = EPERM; + return (0); + } + + if (ISSET(traced_procp->p_flag, P_NOATTACH)) { + *errp = EBUSY; + return (0); + } + return(1); +} diff --git a/bsd/kern/makesyscalls.sh b/bsd/kern/makesyscalls.sh new file mode 100755 index 000000000..d186f1690 --- /dev/null +++ b/bsd/kern/makesyscalls.sh @@ -0,0 +1,694 @@ +#! /bin/sh - +# @(#)makesyscalls.sh 8.1 (Berkeley) 6/10/93 +# $FreeBSD: src/sys/kern/makesyscalls.sh,v 1.60 2003/04/01 01:12:24 jeff Exp $ +# +# Copyright (c) 2004 Apple Computer, Inc. All rights reserved. +# +# @APPLE_LICENSE_HEADER_START@ +# +# The contents of this file constitute Original Code as defined in and +# are subject to the Apple Public Source License Version 1.1 (the +# "License"). You may not use this file except in compliance with the +# License. Please obtain a copy of the License at +# http://www.apple.com/publicsource and read it before using this file. 
+# +# This Original Code and all software distributed under the License are +# distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +# EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +# INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +# License for the specific language governing rights and limitations +# under the License. +# +# @APPLE_LICENSE_HEADER_END@ +# + +set -e + +# output files: +syscallnamesfile="syscalls.c" +sysprotofile="../sys/sysproto.h" +sysproto_h=_SYS_SYSPROTO_H_ +syshdrfile="../sys/syscall.h" +syscall_h=_SYS_SYSCALL_H_ +syscalltablefile="init_sysent.c" +syscallprefix="SYS_" +switchname="sysent" +namesname="syscallnames" + +# tmp files: +syslegal="sysent.syslegal.$$" +sysent="sysent.switch.$$" +sysinc="sysinc.switch.$$" +sysarg="sysarg.switch.$$" +sysprotoend="sysprotoend.$$" +syscallnamestempfile="syscallnamesfile.$$" +syshdrtempfile="syshdrtempfile.$$" + +trap "rm $syslegal $sysent $sysinc $sysarg $sysprotoend $syscallnamestempfile $syshdrtempfile" 0 + +touch $syslegal $sysent $sysinc $sysarg $sysprotoend $syscallnamestempfile $syshdrtempfile + +case $# in + 0) echo "usage: $0 input-file <config-file>" 1>&2 + exit 1 + ;; +esac + +if [ -n "$2" -a -f "$2" ]; then + . $2 +fi + +sed -e ' +s/\$//g +:join + /\\$/{a\ + + N + s/\\\n// + b join + } +2,${ + /^#/!s/\([{}()*,]\)/ \1 /g +} +' < $1 | awk " + BEGIN { + syslegal = \"$syslegal\" + sysprotofile = \"$sysprotofile\" + sysprotoend = \"$sysprotoend\" + sysproto_h = \"$sysproto_h\" + syscall_h = \"$syscall_h\" + sysent = \"$sysent\" + syscalltablefile = \"$syscalltablefile\" + sysinc = \"$sysinc\" + sysarg = \"$sysarg\" + syscallnamesfile = \"$syscallnamesfile\" + syscallnamestempfile = \"$syscallnamestempfile\" + syshdrfile = \"$syshdrfile\" + syshdrtempfile = \"$syshdrtempfile\" + syscallprefix = \"$syscallprefix\" + switchname = \"$switchname\" + namesname = \"$namesname\" + infile = \"$1\" + "' + + printf "/*\n" > syslegal + printf " * Copyright (c) 2004 Apple Computer, Inc. All rights reserved.\n" > syslegal + printf " * \n" > syslegal + printf " * @APPLE_LICENSE_HEADER_START@ \n" > syslegal + printf " * \n" > syslegal + printf " * The contents of this file constitute Original Code as defined in and \n" > syslegal + printf " * are subject to the Apple Public Source License Version 1.1 (the \n" > syslegal + printf " * \"License\"). You may not use this file except in compliance with the \n" > syslegal + printf " * License. Please obtain a copy of the License at \n" > syslegal + printf " * http://www.apple.com/publicsource and read it before using this file. \n" > syslegal + printf " * \n" > syslegal + printf " * This Original Code and all software distributed under the License are \n" > syslegal + printf " * distributed on an \"AS IS\" basis, WITHOUT WARRANTY OF ANY KIND, EITHER \n" > syslegal + printf " * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, \n" > syslegal + printf " * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, \n" > syslegal + printf " * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the \n" > syslegal + printf " * License for the specific language governing rights and limitations \n" > syslegal + printf " * under the License. 
\n" > syslegal + printf " * \n" > syslegal + printf " * @APPLE_LICENSE_HEADER_END@ \n" > syslegal + printf " * \n" > syslegal + printf " * \n" > syslegal + printf " * System call switch table.\n *\n" > syslegal + printf " * DO NOT EDIT-- this file is automatically generated.\n" > syslegal + printf " * created from %s\n */\n\n", infile > syslegal + } + NR == 1 { + printf "\n/* The casts are bogus but will do for now. */\n" > sysent + printf "__private_extern__ struct sysent %s[] = {\n",switchname > sysent + + printf "#ifndef %s\n", sysproto_h > sysarg + printf "#define\t%s\n\n", sysproto_h > sysarg + printf "#ifndef %s\n", syscall_h > syshdrtempfile + printf "#define\t%s\n\n", syscall_h > syshdrtempfile + printf "#include <sys/appleapiopts.h>\n" > syshdrtempfile + printf "#ifdef __APPLE_API_PRIVATE\n" > syshdrtempfile + printf "#include <sys/appleapiopts.h>\n" > sysarg + printf "#include <sys/cdefs.h>\n" > sysarg + printf "#include <sys/mount_internal.h>\n" > sysarg + printf "#include <sys/types.h>\n" > sysarg + printf "#include <sys/sem_internal.h>\n" > sysarg + printf "#include <sys/semaphore.h>\n" > sysarg + printf "#include <sys/wait.h>\n" > sysarg + printf "#include <mach/shared_memory_server.h>\n" > sysarg + printf "\n#ifdef KERNEL\n" > sysarg + printf "#ifdef __APPLE_API_PRIVATE\n" > sysarg + printf "#ifdef __ppc__\n" > sysarg + printf "#define\tPAD_(t)\t(sizeof(uint64_t) <= sizeof(t) \\\n " > sysarg + printf "\t\t? 0 : sizeof(uint64_t) - sizeof(t))\n" > sysarg + printf "#else\n" > sysarg + printf "#define\tPAD_(t)\t(sizeof(register_t) <= sizeof(t) \\\n " > sysarg + printf "\t\t? 0 : sizeof(register_t) - sizeof(t))\n" > sysarg + printf "#endif\n" > sysarg + printf "#if BYTE_ORDER == LITTLE_ENDIAN\n"> sysarg + printf "#define\tPADL_(t)\t0\n" > sysarg + printf "#define\tPADR_(t)\tPAD_(t)\n" > sysarg + printf "#else\n" > sysarg + printf "#define\tPADL_(t)\tPAD_(t)\n" > sysarg + printf "#define\tPADR_(t)\t0\n" > sysarg + printf "#endif\n" > sysarg + printf "\n__BEGIN_DECLS\n" > sysarg + printf "#ifndef __MUNGE_ONCE\n" > sysarg + printf "#define __MUNGE_ONCE\n" > sysarg + printf "#ifdef __ppc__\n" > sysarg + printf "void munge_w(const void *, void *); \n" > sysarg + printf "void munge_ww(const void *, void *); \n" > sysarg + printf "void munge_www(const void *, void *); \n" > sysarg + printf "void munge_wwww(const void *, void *); \n" > sysarg + printf "void munge_wwwww(const void *, void *); \n" > sysarg + printf "void munge_wwwwww(const void *, void *); \n" > sysarg + printf "void munge_wwwwwww(const void *, void *); \n" > sysarg + printf "void munge_wwwwwwww(const void *, void *); \n" > sysarg + printf "void munge_d(const void *, void *); \n" > sysarg + printf "void munge_dd(const void *, void *); \n" > sysarg + printf "void munge_ddd(const void *, void *); \n" > sysarg + printf "void munge_dddd(const void *, void *); \n" > sysarg + printf "void munge_ddddd(const void *, void *); \n" > sysarg + printf "void munge_dddddd(const void *, void *); \n" > sysarg + printf "void munge_ddddddd(const void *, void *); \n" > sysarg + printf "void munge_dddddddd(const void *, void *); \n" > sysarg + printf "void munge_wl(const void *, void *); \n" > sysarg + printf "void munge_wlw(const void *, void *); \n" > sysarg + printf "void munge_wwwl(const void *, void *); \n" > sysarg + printf "void munge_wwwwl(const void *, void *); \n" > sysarg + printf "void munge_wwwwwl(const void *, void *); \n" > sysarg + printf "void munge_wsw(const void *, void *); \n" > sysarg + printf "void munge_wws(const void *, 
void *); \n" > sysarg + printf "void munge_wwwsw(const void *, void *); \n" > sysarg + printf "#else \n" > sysarg + printf "#define munge_w NULL \n" > sysarg + printf "#define munge_ww NULL \n" > sysarg + printf "#define munge_www NULL \n" > sysarg + printf "#define munge_wwww NULL \n" > sysarg + printf "#define munge_wwwww NULL \n" > sysarg + printf "#define munge_wwwwww NULL \n" > sysarg + printf "#define munge_wwwwwww NULL \n" > sysarg + printf "#define munge_wwwwwwww NULL \n" > sysarg + printf "#define munge_d NULL \n" > sysarg + printf "#define munge_dd NULL \n" > sysarg + printf "#define munge_ddd NULL \n" > sysarg + printf "#define munge_dddd NULL \n" > sysarg + printf "#define munge_ddddd NULL \n" > sysarg + printf "#define munge_dddddd NULL \n" > sysarg + printf "#define munge_ddddddd NULL \n" > sysarg + printf "#define munge_dddddddd NULL \n" > sysarg + printf "#define munge_wl NULL \n" > sysarg + printf "#define munge_wlw NULL \n" > sysarg + printf "#define munge_wwwl NULL \n" > sysarg + printf "#define munge_wwwwl NULL \n" > sysarg + printf "#define munge_wwwwwl NULL \n" > sysarg + printf "#define munge_wsw NULL \n" > sysarg + printf "#define munge_wws NULL \n" > sysarg + printf "#define munge_wwwsw NULL \n" > sysarg + printf "#endif // __ppc__\n" > sysarg + printf "#endif /* !__MUNGE_ONCE */\n" > sysarg + + printf "\n" > sysarg + + printf "const char *%s[] = {\n", namesname > syscallnamestempfile + next + } + NF == 0 || $1 ~ /^;/ { + next + } + $1 ~ /^#[ ]*include/ { + print > sysinc + next + } + $1 ~ /^#[ ]*if/ { + print > sysent + print > sysarg + print > syscallnamestempfile + print > syshdrtempfile + print > sysprotoend + savesyscall = syscall + next + } + $1 ~ /^#[ ]*else/ { + print > sysent + print > sysarg + print > syscallnamestempfile + print > syshdrtempfile + print > sysprotoend + syscall = savesyscall + next + } + $1 ~ /^#/ { + print > sysent + print > sysarg + print > syscallnamestempfile + print > syshdrtempfile + print > sysprotoend + next + } + syscall != $1 { + printf "%s: line %d: syscall number out of sync at %d\n", + infile, NR, syscall + printf "line is:\n" + print + exit 1 + } + function align_comment(linesize, location, thefile) { + printf(" ") > thefile + while (linesize < location) { + printf(" ") > thefile + linesize++ + } + } + function parserr(was, wanted) { + printf "%s: line %d: unexpected %s (expected %s)\n", + infile, NR, was, wanted + exit 1 + } + + function parseline() { + funcname = "" + current_field = 5 + args_start = 0 + args_end = 0 + comments_start = 0 + comments_end = 0 + argc = 0 + argssize = "0" + additional_comments = " " + + # find start and end of call name and arguments + if ($current_field != "{") + parserr($current_field, "{") + args_start = current_field + current_field++ + while (current_field <= NF) { + if ($current_field == "}") { + args_end = current_field + break + } + current_field++ + } + if (args_end == 0) { + printf "%s: line %d: invalid call name and arguments\n", + infile, NR + exit 1 + } + + # find start and end of optional comments + current_field++ + if (current_field < NF && $current_field == "{") { + comments_start = current_field + while (current_field <= NF) { + if ($current_field == "}") { + comments_end = current_field + break + } + current_field++ + } + if (comments_end == 0) { + printf "%s: line %d: invalid comments \n", + infile, NR + exit 1 + } + } + + if ($args_end != "}") + parserr($args_end, "}") + args_end-- + if ($args_end != ";") + parserr($args_end, ";") + args_end-- + if ($args_end != ")") + 
parserr($args_end, ")") + args_end-- + + # extract additional comments + if (comments_start != 0) { + current_field = comments_start + 1 + while (current_field < comments_end) { + additional_comments = additional_comments $current_field " " + current_field++ + } + } + + # get function return type + current_field = args_start + 1 + returntype = $current_field + + # get function name and set up to get arguments + current_field++ + funcname = $current_field + argalias = funcname "_args" + current_field++ # bump past function name + + if ($current_field != "(") + parserr($current_field, "(") + current_field++ + + if (current_field == args_end) { + if ($current_field != "void") + parserr($current_field, "argument definition") + return + } + + # extract argument types and names + while (current_field <= args_end) { + argc++ + argtype[argc]="" + ext_argtype[argc]="" + oldf="" + while (current_field < args_end && $(current_field + 1) != ",") { + if (argtype[argc] != "" && oldf != "*") { + argtype[argc] = argtype[argc] " "; + } + argtype[argc] = argtype[argc] $current_field; + ext_argtype[argc] = argtype[argc]; + oldf = $current_field; + current_field++ + } + if (argtype[argc] == "") + parserr($current_field, "argument definition") + argname[argc] = $current_field; + current_field += 2; # skip name, and any comma + } + if (argc > 8) { + printf "%s: line %d: too many arguments!\n", infile, NR + exit 1 + } + if (argc != 0) + argssize = "AC(" argalias ")" + } + + { + add_sysent_entry = 1 + add_sysnames_entry = 1 + add_sysheader_entry = 1 + add_sysproto_entry = 1 + add_64bit_unsafe = 0 + add_64bit_fakesafe = 0 + add_cancel_enable = "0" + + if ($2 == "NONE") { + add_cancel_enable = "_SYSCALL_CANCEL_NONE" + } + else if ($2 == "PRE") { + add_cancel_enable = "_SYSCALL_CANCEL_PRE" + } + else if ($2 == "POST") { + add_cancel_enable = "_SYSCALL_CANCEL_POST" + } + else { + printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $2 + exit 1 + + } + + if ($3 == "KERN") { + my_funnel = "KERNEL_FUNNEL" + } + else if ($3 == "NONE") { + my_funnel = "NO_FUNNEL" + } + else { + printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $3 + exit 1 + } + + if ($4 != "ALL" && $4 != "UALL") { + files_keyword_OK = 0 + add_sysent_entry = 0 + add_sysnames_entry = 0 + add_sysheader_entry = 0 + add_sysproto_entry = 0 + + if (match($4, "[T]") != 0) { + add_sysent_entry = 1 + files_keyword_OK = 1 + } + if (match($4, "[N]") != 0) { + add_sysnames_entry = 1 + files_keyword_OK = 1 + } + if (match($4, "[H]") != 0) { + add_sysheader_entry = 1 + files_keyword_OK = 1 + } + if (match($4, "[P]") != 0) { + add_sysproto_entry = 1 + files_keyword_OK = 1 + } + if (match($4, "[U]") != 0) { + add_64bit_unsafe = 1 + } + if (match($4, "[F]") != 0) { + add_64bit_fakesafe = 1 + } + + if (files_keyword_OK == 0) { + printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $4 + exit 1 + } + } + else if ($4 == "UALL") { + add_64bit_unsafe = 1; + } + + + parseline() + + # output function argument structures to sysproto.h and build the + # name of the appropriate argument mungers + munge32 = "NULL" + munge64 = "NULL" + if (funcname != "nosys" || (syscall == 0 && funcname == "nosys")) { + if (argc != 0) { + if (add_sysproto_entry == 1) { + printf("struct %s {\n", argalias) > sysarg + } + munge32 = "munge_" + munge64 = "munge_" + for (i = 1; i <= argc; i++) { + # Build name of argument munger. + # We account for all sys call argument types here. + # This is where you add any new types. With LP64 support + # each argument consumes 64-bits. 
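+			# As an illustration (a hypothetical entry, not one
+			# from this file): lseek(int fd, off_t offset, int whence)
+			# comes out as munge_wlw for 32-bit callers (word,
+			# 64-bit long-long, word) and munge_ddd for 64-bit
+			# callers, whose arguments already arrive as 64-bit
+			# values.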
+ # see .../xnu/bsd/dev/ppc/munge.s for munge argument types. + if (argtype[i] == "long") { + if (add_64bit_unsafe == 0) + ext_argtype[i] = "user_long_t"; + munge32 = munge32 "s" + munge64 = munge64 "d" + } + else if (argtype[i] == "u_long") { + if (add_64bit_unsafe == 0) + ext_argtype[i] = "user_ulong_t"; + munge32 = munge32 "w" + munge64 = munge64 "d" + } + else if (argtype[i] == "size_t") { + if (add_64bit_unsafe == 0) + ext_argtype[i] = "user_size_t"; + munge32 = munge32 "w" + munge64 = munge64 "d" + } + else if (argtype[i] == "ssize_t") { + if (add_64bit_unsafe == 0) + ext_argtype[i] = "user_ssize_t"; + munge32 = munge32 "s" + munge64 = munge64 "d" + } + else if (argtype[i] == "user_ssize_t" || argtype[i] == "user_long_t") { + munge32 = munge32 "s" + munge64 = munge64 "d" + } + else if (argtype[i] == "user_addr_t" || argtype[i] == "user_size_t" || + argtype[i] == "user_ulong_t") { + munge32 = munge32 "w" + munge64 = munge64 "d" + } + else if (argtype[i] == "caddr_t" || argtype[i] == "semun_t" || + match(argtype[i], "[\*]") != 0) { + if (add_64bit_unsafe == 0) + ext_argtype[i] = "user_addr_t"; + munge32 = munge32 "w" + munge64 = munge64 "d" + } + else if (argtype[i] == "int" || argtype[i] == "u_int" || + argtype[i] == "uid_t" || argtype[i] == "pid_t" || + argtype[i] == "id_t" || argtype[i] == "idtype_t" || + argtype[i] == "socklen_t" || argtype[i] == "uint32_t" || argtype[i] == "int32_t" || + argtype[i] == "sigset_t" || argtype[i] == "gid_t" || + argtype[i] == "semconfig_ctl_t" || argtype[i] == "mode_t" || argtype[i] == "key_t" || argtype[i] == "time_t") { + munge32 = munge32 "w" + munge64 = munge64 "d" + } + else if (argtype[i] == "off_t" || argtype[i] == "int64_t" || argtype[i] == "uint64_t") { + munge32 = munge32 "l" + munge64 = munge64 "d" + } + else { + printf "%s: line %d: invalid type \"%s\" \n", + infile, NR, argtype[i] + printf "You need to add \"%s\" into the type checking code. \n", + argtype[i] + exit 1 + } + if (add_sysproto_entry == 1) { + printf("\tchar %s_l_[PADL_(%s)]; " \ + "%s %s; char %s_r_[PADR_(%s)];\n", + argname[i], ext_argtype[i], + ext_argtype[i], argname[i], + argname[i], ext_argtype[i]) > sysarg + } + } + if (add_sysproto_entry == 1) { + printf("};\n") > sysarg + } + } + else if (add_sysproto_entry == 1) { + printf("struct %s {\n\tregister_t dummy;\n};\n", argalias) > sysarg + } + } + + # output to init_sysent.c + tempname = funcname + if (add_sysent_entry == 0) { + argssize = "0" + munge32 = "NULL" + munge64 = "NULL" + munge_ret = "_SYSCALL_RET_NONE" + tempname = "nosys" + } + else { + # figure out which return value type to munge + if (returntype == "user_addr_t") { + munge_ret = "_SYSCALL_RET_ADDR_T" + } + else if (returntype == "user_ssize_t") { + munge_ret = "_SYSCALL_RET_SSIZE_T" + } + else if (returntype == "user_size_t") { + munge_ret = "_SYSCALL_RET_SIZE_T" + } + else if (returntype == "int") { + munge_ret = "_SYSCALL_RET_INT_T" + } + else if (returntype == "u_int") { + munge_ret = "_SYSCALL_RET_UINT_T" + } + else if (returntype == "off_t") { + munge_ret = "_SYSCALL_RET_OFF_T" + } + else if (returntype == "void") { + munge_ret = "_SYSCALL_RET_NONE" + } + else { + printf "%s: line %d: invalid return type \"%s\" \n", + infile, NR, returntype + printf "You need to add \"%s\" into the return type checking code. 
\n", + returntype + exit 1 + } + } + + if (add_64bit_unsafe == 1 && add_64bit_fakesafe == 0) + my_funnel = my_funnel "|UNSAFE_64BIT"; + + printf("\t{%s, %s, %s, \(sy_call_t *\)%s, %s, %s, %s},", + argssize, add_cancel_enable, my_funnel, tempname, munge32, munge64, munge_ret) > sysent + linesize = length(argssize) + length(add_cancel_enable) + length(my_funnel) + length(tempname) + \ + length(munge32) + length(munge64) + length(munge_ret) + 28 + align_comment(linesize, 88, sysent) + printf("/* %d = %s%s*/\n", syscall, funcname, additional_comments) > sysent + + # output to syscalls.c + if (add_sysnames_entry == 1) { + tempname = funcname + if (funcname == "nosys") { + if (syscall == 0) + tempname = "syscall" + else + tempname = "#" syscall + } + printf("\t\"%s\", ", tempname) > syscallnamestempfile + linesize = length(tempname) + 8 + align_comment(linesize, 25, syscallnamestempfile) + if (substr(tempname,1,1) == "#") { + printf("/* %d =%s*/\n", syscall, additional_comments) > syscallnamestempfile + } + else { + printf("/* %d = %s%s*/\n", syscall, tempname, additional_comments) > syscallnamestempfile + } + } + + # output to syscalls.h + if (add_sysheader_entry == 1) { + tempname = funcname + if (syscall == 0) { + tempname = "syscall" + } + if (tempname != "nosys") { + printf("#define\t%s%s", syscallprefix, tempname) > syshdrtempfile + linesize = length(syscallprefix) + length(tempname) + 12 + align_comment(linesize, 30, syshdrtempfile) + printf("%d\n", syscall) > syshdrtempfile + # special case for gettimeofday on ppc - cctools project uses old name + if (tempname == "ppc_gettimeofday") { + printf("#define\t%s%s", syscallprefix, "gettimeofday") > syshdrtempfile + linesize = length(syscallprefix) + length(tempname) + 12 + align_comment(linesize, 30, syshdrtempfile) + printf("%d\n", syscall) > syshdrtempfile + } + } + else { + printf("\t\t\t/* %d %s*/\n", syscall, additional_comments) > syshdrtempfile + } + } + + # output function prototypes to sysproto.h + if (add_sysproto_entry == 1) { + if (funcname =="exit") { + printf("void %s(struct proc *, struct %s *, int *);\n", + funcname, argalias) > sysprotoend + } + else if (funcname != "nosys" || (syscall == 0 && funcname == "nosys")) { + printf("int %s(struct proc *, struct %s *, %s *);\n", + funcname, argalias, returntype) > sysprotoend + } + } + + syscall++ + next + } + + END { + printf "#ifdef __ppc__\n" > sysinc + printf "#define AC(name) (sizeof(struct name) / sizeof(uint64_t))\n" > sysinc + printf "#else\n" > sysinc + printf "#define AC(name) (sizeof(struct name) / sizeof(register_t))\n" > sysinc + printf "#endif\n" > sysinc + printf "\n" > sysinc + + printf("\n__END_DECLS\n") > sysprotoend + printf("#undef PAD_\n") > sysprotoend + printf("#undef PADL_\n") > sysprotoend + printf("#undef PADR_\n") > sysprotoend + printf "\n#endif /* __APPLE_API_PRIVATE */\n" > sysprotoend + printf "#endif /* KERNEL */\n" > sysprotoend + printf("\n#endif /* !%s */\n", sysproto_h) > sysprotoend + + printf("};\n") > sysent + printf("int nsysent = sizeof(sysent) / sizeof(sysent[0]);\n") > sysent + + printf("};\n") > syscallnamestempfile + printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall) \ + > syshdrtempfile + printf("\n#endif /* __APPLE_API_PRIVATE */\n") > syshdrtempfile + printf("#endif /* !%s */\n", syscall_h) > syshdrtempfile + } ' + +cat $syslegal $sysinc $sysent > $syscalltablefile +cat $syslegal $sysarg $sysprotoend > $sysprotofile +cat $syslegal $syscallnamestempfile > $syscallnamesfile +cat $syslegal $syshdrtempfile > $syshdrfile diff 
--git a/bsd/kern/netboot.c b/bsd/kern/netboot.c index c2bc3b3a3..2555875b7 100644 --- a/bsd/kern/netboot.c +++ b/bsd/kern/netboot.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2001-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -30,11 +30,11 @@ #include <sys/kernel.h> #include <sys/conf.h> #include <sys/ioctl.h> -#include <sys/proc.h> -#include <sys/mount.h> +#include <sys/proc_internal.h> +#include <sys/mount_internal.h> #include <sys/mbuf.h> #include <sys/filedesc.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/malloc.h> #include <sys/socket.h> #include <sys/socketvar.h> @@ -46,9 +46,11 @@ #include <netinet/in.h> #include <netinet/if_ether.h> #include <netinet/dhcp_options.h> -#include <kern/kalloc.h> #include <pexpert/pexpert.h> +#include <kern/kern_types.h> +#include <kern/kalloc.h> + //#include <libkern/libkern.h> extern struct filedesc filedesc0; @@ -250,6 +252,7 @@ static __inline__ boolean_t parse_netboot_path(char * path, struct in_addr * iaddr_p, char * * host, char * * mount_dir, char * * image_path) { + static char tmp[MAX_IPv4_STR_LEN]; /* Danger - not thread safe */ char * start; char * colon; @@ -283,7 +286,7 @@ parse_netboot_path(char * path, struct in_addr * iaddr_p, char * * host, (void)find_colon(start); *image_path = start; } - *host = inet_ntoa(*iaddr_p); + *host = inet_ntop(AF_INET, iaddr_p, tmp, sizeof(tmp)); return (TRUE); } @@ -353,6 +356,8 @@ netboot_info_init(struct in_addr iaddr) char * vndevice = NULL; MALLOC_ZONE(vndevice, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (vndevice == NULL) + panic("netboot_info_init: M_NAMEI zone exhausted"); if (PE_parse_boot_arg("vndevice", vndevice) == TRUE) { use_hdix = FALSE; } @@ -366,6 +371,8 @@ netboot_info_init(struct in_addr iaddr) /* check for a booter-specified path then a NetBoot path */ MALLOC_ZONE(root_path, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (root_path == NULL) + panic("netboot_info_init: M_NAMEI zone exhausted"); if (PE_parse_boot_arg("rp", root_path) == TRUE || PE_parse_boot_arg("rootpath", root_path) == TRUE || get_root_path(root_path) == TRUE) { @@ -431,15 +438,15 @@ netboot_info_free(struct netboot_info * * info_p) if (info) { if (info->mount_point) { - kfree((vm_offset_t)info->mount_point, info->mount_point_length); + kfree(info->mount_point, info->mount_point_length); } if (info->server_name) { - kfree((vm_offset_t)info->server_name, info->server_name_length); + kfree(info->server_name, info->server_name_length); } if (info->image_path) { - kfree((vm_offset_t)info->image_path, info->image_path_length); + kfree(info->image_path, info->image_path_length); } - kfree((vm_offset_t)info, sizeof(*info)); + kfree(info, sizeof(*info)); } *info_p = NULL; return; @@ -617,13 +624,14 @@ find_interface(void) struct ifnet * ifp = NULL; if (rootdevice[0]) { - ifp = ifunit(rootdevice); + ifp = ifunit(rootdevice); } if (ifp == NULL) { - TAILQ_FOREACH(ifp, &ifnet, if_link) - if ((ifp->if_flags & - (IFF_LOOPBACK|IFF_POINTOPOINT)) == 0) - break; + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) + if ((ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) == 0) + break; + ifnet_head_done(); } return (ifp); } @@ -643,7 +651,6 @@ netboot_mountroot(void) bzero(&ifr, sizeof(ifr)); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); /* find the interface */ ifp = find_interface(); @@ -701,7 +708,6 @@ netboot_mountroot(void) } soclose(so); - thread_funnel_switch(NETWORK_FUNNEL, 
KERNEL_FUNNEL); S_netboot_info_p = netboot_info_init(iaddr); switch (S_netboot_info_p->image_type) { @@ -760,7 +766,6 @@ failed: if (so != NULL) { soclose(so); } - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); return (error); } @@ -799,20 +804,24 @@ netboot_setup(struct proc * p) if (error == 0 && rootvnode != NULL) { struct vnode *tvp; struct vnode *newdp; + struct vfs_context context; + + context.vc_proc = p; + context.vc_ucred = proc_ucred(p); /* XXX kauth_cred_get() ??? proxy */ /* Get the vnode for '/'. Set fdp->fd_fd.fd_cdir to reference it. */ - if (VFS_ROOT(mountlist.cqh_last, &newdp)) + if (VFS_ROOT(mountlist.tqh_last, &newdp, &context)) panic("netboot_setup: cannot find root vnode"); - VREF(newdp); + vnode_ref(newdp); + vnode_put(newdp); tvp = rootvnode; - vrele(tvp); + vnode_rele(tvp); filedesc0.fd_cdir = newdp; rootvnode = newdp; - simple_lock(&mountlist_slock); - CIRCLEQ_REMOVE(&mountlist, CIRCLEQ_FIRST(&mountlist), mnt_list); - simple_unlock(&mountlist_slock); - VOP_UNLOCK(rootvnode, 0, p); - mountlist.cqh_first->mnt_flag |= MNT_ROOTFS; + mount_list_lock(); + TAILQ_REMOVE(&mountlist, TAILQ_FIRST(&mountlist), mnt_list); + mount_list_unlock(); + mountlist.tqh_first->mnt_flag |= MNT_ROOTFS; } done: netboot_info_free(&S_netboot_info_p); diff --git a/bsd/kern/posix_sem.c b/bsd/kern/posix_sem.c index e6ab89ff6..cf17502e0 100644 --- a/bsd/kern/posix_sem.c +++ b/bsd/kern/posix_sem.c @@ -39,11 +39,11 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/filedesc.h> #include <sys/stat.h> -#include <sys/buf.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/vnode.h> @@ -51,6 +51,7 @@ #include <sys/tty.h> #include <sys/malloc.h> #include <sys/semaphore.h> +#include <sys/sysproto.h> #include <bsm/audit_kernel.h> @@ -58,10 +59,23 @@ #include <mach/vm_prot.h> #include <mach/semaphore.h> #include <mach/sync_policy.h> +#include <mach/task.h> +#include <kern/kern_types.h> #include <kern/task.h> #include <kern/clock.h> #include <mach/kern_return.h> +#if KTRACE +#include <sys/ktrace.h> +#endif + +#define f_flag f_fglob->fg_flag +#define f_type f_fglob->fg_type +#define f_msgcount f_fglob->fg_msgcount +#define f_cred f_fglob->fg_cred +#define f_ops f_fglob->fg_ops +#define f_offset f_fglob->fg_offset +#define f_data f_fglob->fg_data #define PSEMNAMLEN 31 /* maximum name segment length we bother with */ struct pseminfo { @@ -71,7 +85,7 @@ struct pseminfo { uid_t psem_uid; gid_t psem_gid; char psem_name[PSEMNAMLEN + 1]; /* segment name */ - void * psem_semobject; + semaphore_t psem_semobject; struct proc * sem_proc; }; #define PSEMINFO_NULL (struct pseminfo *)0 @@ -123,25 +137,58 @@ struct psemnode { LIST_HEAD(psemhashhead, psemcache) *psemhashtbl; /* Hash Table */ u_long psemhash; /* size of hash table - 1 */ long psemnument; /* number of cache entries allocated */ +long posix_sem_max = 10000; /* tunable for max POSIX semaphores */ + /* 10000 limits to ~1M of memory */ +SYSCTL_NODE(_kern, KERN_POSIX, posix, CTLFLAG_RW, 0, "Posix"); +SYSCTL_NODE(_kern_posix, OID_AUTO, sem, CTLFLAG_RW, 0, "Semaphores"); +SYSCTL_INT (_kern_posix_sem, OID_AUTO, max, CTLFLAG_RW, &posix_sem_max, 0, "max"); + struct psemstats psemstats; /* cache effectiveness statistics */ -static int psem_cache_search __P((struct pseminfo **, - struct psemname *, struct psemcache **)); +static int psem_access(struct pseminfo *pinfo, int mode, 
kauth_cred_t cred); +static int psem_cache_search(struct pseminfo **, + struct psemname *, struct psemcache **); +static int psem_delete(struct pseminfo * pinfo); -static int psem_read __P((struct file *fp, struct uio *uio, - struct ucred *cred, int flags, struct proc *p)); -static int psem_write __P((struct file *fp, struct uio *uio, - struct ucred *cred, int flags, struct proc *p)); -static int psem_ioctl __P((struct file *fp, u_long com, - caddr_t data, struct proc *p)); -static int psem_select __P((struct file *fp, int which, void *wql, - struct proc *p)); -static int psem_closefile __P((struct file *fp, struct proc *p)); +static int psem_read (struct fileproc *fp, struct uio *uio, + kauth_cred_t cred, int flags, struct proc *p); +static int psem_write (struct fileproc *fp, struct uio *uio, + kauth_cred_t cred, int flags, struct proc *p); +static int psem_ioctl (struct fileproc *fp, u_long com, + caddr_t data, struct proc *p); +static int psem_select (struct fileproc *fp, int which, void *wql, struct proc *p); +static int psem_closefile (struct fileglob *fp, struct proc *p); -static int psem_kqfilter __P((struct file *fp, struct knote *kn, struct proc *p)); +static int psem_kqfilter (struct fileproc *fp, struct knote *kn, struct proc *p); struct fileops psemops = - { psem_read, psem_write, psem_ioctl, psem_select, psem_closefile, psem_kqfilter }; + { psem_read, psem_write, psem_ioctl, psem_select, psem_closefile, psem_kqfilter, 0 }; + + +static lck_grp_t *psx_sem_subsys_lck_grp; +static lck_grp_attr_t *psx_sem_subsys_lck_grp_attr; +static lck_attr_t *psx_sem_subsys_lck_attr; +static lck_mtx_t psx_sem_subsys_mutex; + +#define PSEM_SUBSYS_LOCK() lck_mtx_lock(& psx_sem_subsys_mutex) +#define PSEM_SUBSYS_UNLOCK() lck_mtx_unlock(& psx_sem_subsys_mutex) + + +static int psem_cache_add(struct pseminfo *psemp, struct psemname *pnp, struct psemcache *pcp); +/* Initialize the mutex governing access to the posix sem subsystem */ +__private_extern__ void +psem_lock_init( void ) +{ + + psx_sem_subsys_lck_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(psx_sem_subsys_lck_grp_attr); + + psx_sem_subsys_lck_grp = lck_grp_alloc_init("posix shared memory", psx_sem_subsys_lck_grp_attr); + + psx_sem_subsys_lck_attr = lck_attr_alloc_init(); + /* lck_attr_setdebug(psx_sem_subsys_lck_attr); */ + lck_mtx_init(& psx_sem_subsys_mutex, psx_sem_subsys_lck_grp, psx_sem_subsys_lck_attr); +} /* * Lookup an entry in the cache @@ -159,8 +206,8 @@ psem_cache_search(psemp, pnp, pcache) struct psemname *pnp; struct psemcache **pcache; { - register struct psemcache *pcp, *nnp; - register struct psemhashhead *pcpp; + struct psemcache *pcp, *nnp; + struct psemhashhead *pcpp; if (pnp->psem_namelen > PSEMNAMLEN) { psemstats.longnames++; @@ -201,12 +248,9 @@ psem_cache_search(psemp, pnp, pcache) * Add an entry to the cache. */ static int -psem_cache_add(psemp, pnp) - struct pseminfo *psemp; - struct psemname *pnp; +psem_cache_add(struct pseminfo *psemp, struct psemname *pnp, struct psemcache *pcp) { - register struct psemcache *pcp; - register struct psemhashhead *pcpp; + struct psemhashhead *pcpp; struct pseminfo *dpinfo; struct psemcache *dpcp; @@ -215,20 +259,14 @@ psem_cache_add(psemp, pnp) panic("cache_enter: name too long"); #endif - /* - * We allocate a new entry if we are less than the maximum - * allowed and the one at the front of the LRU list is in use. - * Otherwise we use the one at the front of the LRU list. 
- */ - pcp = (struct psemcache *)_MALLOC(sizeof(struct psemcache), M_SHM, M_WAITOK); + /* if the entry has already been added by some one else return */ if (psem_cache_search(&dpinfo, pnp, &dpcp) == -1) { - _FREE(pcp, M_SHM); return(EEXIST); } + if (psemnument >= posix_sem_max) + return(ENOSPC); psemnument++; - - bzero(pcp, sizeof(struct psemcache)); /* * Fill in cache info, if vp is NULL this is a "negative" cache entry. * For negative entries, we have to record whether it is a whiteout. @@ -241,7 +279,7 @@ psem_cache_add(psemp, pnp) pcpp = PSEMHASH(pnp); #if DIAGNOSTIC { - register struct psemcache *p; + struct psemcache *p; for (p = pcpp->lh_first; p != 0; p = p->psem_hash.le_next) if (p == pcp) @@ -256,14 +294,13 @@ psem_cache_add(psemp, pnp) * Name cache initialization, from vfs_init() when we are booting */ void -psem_cache_init() +psem_cache_init(void) { psemhashtbl = hashinit(desiredvnodes, M_SHM, &psemhash); } static void -psem_cache_delete(pcp) - struct psemcache *pcp; +psem_cache_delete(struct psemcache *pcp) { #if DIAGNOSTIC if (pcp->psem_hash.le_prev == 0) @@ -276,6 +313,7 @@ psem_cache_delete(pcp) psemnument--; } +#if NOT_USED /* * Invalidate a all entries to particular vnode. * @@ -284,40 +322,29 @@ psem_cache_delete(pcp) * need to ditch the entire cache, to avoid confusion. No valid vnode will * ever have (v_id == 0). */ -void +static void psem_cache_purge(void) { struct psemcache *pcp; struct psemhashhead *pcpp; for (pcpp = &psemhashtbl[psemhash]; pcpp >= psemhashtbl; pcpp--) { - while (pcp = pcpp->lh_first) + while ( (pcp = pcpp->lh_first) ) psem_cache_delete(pcp); } } - -struct sem_open_args { - const char *name; - int oflag; - int mode; - int value; -}; +#endif /* NOT_USED */ int -sem_open(p, uap, retval) - struct proc *p; - register struct sem_open_args *uap; - register_t *retval; +sem_open(struct proc *p, struct sem_open_args *uap, user_addr_t *retval) { - register struct filedesc *fdp = p->p_fd; - register struct file *fp; - register struct vnode *vp; - int i; - struct file *nfp; - int type, indx, error; + struct fileproc *fp; + size_t i; + struct fileproc *nfp; + int indx, error; struct psemname nd; struct pseminfo *pinfo; - extern struct fileops psemops; + struct psemcache *pcp; char * pnbuf; char * nameptr; char * cp; @@ -334,18 +361,20 @@ sem_open(p, uap, retval) AUDIT_ARG(fflags, uap->oflag); AUDIT_ARG(mode, uap->mode); AUDIT_ARG(value, uap->value); + pinfo = PSEMINFO_NULL; - MALLOC_ZONE(pnbuf, caddr_t, - MAXPATHLEN, M_NAMEI, M_WAITOK); + MALLOC_ZONE(pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (pnbuf == NULL) + return(ENOSPC); + pathlen = MAXPATHLEN; - error = copyinstr((void *)uap->name, pnbuf, - MAXPATHLEN, &pathlen); + error = copyinstr(uap->name, pnbuf, MAXPATHLEN, &pathlen); if (error) { goto bad; } AUDIT_ARG(text, pnbuf); - if (pathlen > PSEMNAMLEN) { + if ( (pathlen > PSEMNAMLEN) ) { error = ENAMETOOLONG; goto bad; } @@ -374,9 +403,16 @@ sem_open(p, uap, retval) nd.psem_hash += (unsigned char)*cp * i; } +#if KTRACE + if (KTRPOINT(p, KTR_NAMEI)) + ktrnamei(p->p_tracep, nameptr); +#endif + + PSEM_SUBSYS_LOCK(); error = psem_cache_search(&pinfo, &nd, &pcache); if (error == ENOENT) { + PSEM_SUBSYS_UNLOCK(); error = EINVAL; goto bad; @@ -387,10 +423,12 @@ sem_open(p, uap, retval) incache = 1; fmode = FFLAGS(uap->oflag); - if (error = falloc(p, &nfp, &indx)) { + PSEM_SUBSYS_UNLOCK(); + error = falloc(p, &nfp, &indx); + if (error) goto bad; - } + PSEM_SUBSYS_LOCK(); fp = nfp; cmode &= ALLPERMS; @@ -401,7 +439,8 @@ sem_open(p, uap, retval) } #endif 
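+		/*
+		 * The name already exists and O_CREAT|O_EXCL was requested,
+		 * so sem_open() fails with EEXIST just below, matching the
+		 * POSIX O_EXCL rule.
+		 */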
AUDIT_ARG(posix_ipc_perm, pinfo->psem_uid, - pinfo->psem_gid, pinfo->psem_mode); + pinfo->psem_gid, pinfo->psem_mode); + PSEM_SUBSYS_UNLOCK(); error = EEXIST; goto bad1; } @@ -410,58 +449,96 @@ sem_open(p, uap, retval) fmode &= ~O_CREAT; } - if (fmode & O_CREAT) { + if ( (fmode & O_CREAT) ) { if((value < 0) && (value > SEM_VALUE_MAX)) { + PSEM_SUBSYS_UNLOCK(); error = EINVAL; goto bad1; } - pinfo = (struct pseminfo *)_MALLOC(sizeof(struct pseminfo), M_SHM, M_WAITOK); - bzero(pinfo, sizeof(struct pseminfo)); + PSEM_SUBSYS_UNLOCK(); + MALLOC(pinfo, struct pseminfo *, sizeof(struct pseminfo), M_SHM, M_WAITOK|M_ZERO); + if (pinfo == NULL) { + error = ENOSPC; + goto bad1; + } + PSEM_SUBSYS_LOCK(); + pinfo_alloc = 1; pinfo->psem_flags = PSEM_DEFINED | PSEM_INCREATE; pinfo->psem_usecount = 1; pinfo->psem_mode = cmode; - pinfo->psem_uid = p->p_ucred->cr_uid; - pinfo->psem_gid = p->p_ucred->cr_gid; + pinfo->psem_uid = kauth_cred_getuid(kauth_cred_get()); + pinfo->psem_gid = kauth_cred_get()->cr_gid; + PSEM_SUBSYS_UNLOCK(); kret = semaphore_create(kernel_task, &pinfo->psem_semobject, SYNC_POLICY_FIFO, value); if(kret != KERN_SUCCESS) goto bad3; + PSEM_SUBSYS_LOCK(); pinfo->psem_flags &= ~PSEM_DEFINED; pinfo->psem_flags |= PSEM_ALLOCATED; pinfo->sem_proc = p; } else { /* semaphore should exist as it is without O_CREAT */ if (!incache) { + PSEM_SUBSYS_UNLOCK(); error = ENOENT; goto bad1; } if( pinfo->psem_flags & PSEM_INDELETE) { + PSEM_SUBSYS_UNLOCK(); error = ENOENT; goto bad1; } AUDIT_ARG(posix_ipc_perm, pinfo->psem_uid, - pinfo->psem_gid, pinfo->psem_mode); - if (error = psem_access(pinfo, fmode, p->p_ucred, p)) + pinfo->psem_gid, pinfo->psem_mode); + if ( (error = psem_access(pinfo, fmode, kauth_cred_get())) ) { + PSEM_SUBSYS_UNLOCK(); goto bad1; + } } - pnode = (struct psemnode *)_MALLOC(sizeof(struct psemnode), M_SHM, M_WAITOK); - bzero(pnode, sizeof(struct psemnode)); + PSEM_SUBSYS_UNLOCK(); + MALLOC(pnode, struct psemnode *, sizeof(struct psemnode), M_SHM, M_WAITOK|M_ZERO); + if (pnode == NULL) { + error = ENOSPC; + goto bad1; + } + if (!incache) { + /* + * We allocate a new entry if we are less than the maximum + * allowed and the one at the front of the LRU list is in use. + * Otherwise we use the one at the front of the LRU list. 
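+	 *
+	 * (Here the entry is simply allocated up front while the subsystem
+	 * lock is dropped; if psem_cache_add() then finds the name already
+	 * present, the caller frees the unused entry.)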
+ */ + MALLOC(pcp, struct psemcache *, sizeof(struct psemcache), M_SHM, M_WAITOK|M_ZERO); + if (pcp == NULL) { + error = ENOMEM; + goto bad2; + } + } + PSEM_SUBSYS_LOCK(); if (!incache) { - if (error = psem_cache_add(pinfo, &nd)) { - goto bad2; + if ( (error = psem_cache_add(pinfo, &nd, pcp)) ) { + PSEM_SUBSYS_UNLOCK(); + FREE(pcp, M_SHM); + goto bad2; } } pinfo->psem_flags &= ~PSEM_INCREATE; pinfo->psem_usecount++; pnode->pinfo = pinfo; + PSEM_SUBSYS_UNLOCK(); + + proc_fdlock(p); fp->f_flag = fmode & FMASK; fp->f_type = DTYPE_PSXSEM; fp->f_ops = &psemops; fp->f_data = (caddr_t)pnode; *fdflags(p, indx) &= ~UF_RESERVED; - *retval = indx; + fp_drop(p, indx, fp, 1); + proc_fdunlock(p); + + *retval = CAST_USER_ADDR_T(indx); FREE_ZONE(pnbuf, MAXPATHLEN, M_NAMEI); return (0); @@ -476,36 +553,33 @@ bad3: } goto bad1; bad2: - _FREE(pnode, M_SHM); - if (pinfo_alloc) - _FREE(pinfo, M_SHM); + FREE(pnode, M_SHM); bad1: - fdrelse(p, indx); - ffree(nfp); + if (pinfo_alloc) + FREE(pinfo, M_SHM); + fp_free(p, indx, nfp); bad: FREE_ZONE(pnbuf, MAXPATHLEN, M_NAMEI); return (error); } -int -psem_access(pinfo, mode, cred, p) - struct pseminfo *pinfo; - int mode; - struct ucred *cred; - struct proc *p; +/* + * XXX This code is repeated in several places + */ +static int +psem_access(struct pseminfo *pinfo, int mode, kauth_cred_t cred) { mode_t mask; - register gid_t *gp; - int i, error; + int is_member; /* Otherwise, user id 0 always gets access. */ - if (cred->cr_uid == 0) + if (!suser(cred, NULL)) return (0); mask = 0; /* Otherwise, check the owner. */ - if (cred->cr_uid == pinfo->psem_uid) { + if (kauth_cred_getuid(cred) == pinfo->psem_uid) { if (mode & FREAD) mask |= S_IRUSR; if (mode & FWRITE) @@ -514,14 +588,13 @@ psem_access(pinfo, mode, cred, p) } /* Otherwise, check the groups. */ - for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) - if (pinfo->psem_gid == *gp) { - if (mode & FREAD) - mask |= S_IRGRP; - if (mode & FWRITE) - mask |= S_IWGRP; - return ((pinfo->psem_mode & mask) == mask ? 0 : EACCES); - } + if (kauth_cred_ismember_gid(cred, pinfo->psem_gid, &is_member) == 0 && is_member) { + if (mode & FREAD) + mask |= S_IRGRP; + if (mode & FWRITE) + mask |= S_IWGRP; + return ((pinfo->psem_mode & mask) == mask ? 0 : EACCES); + } /* Otherwise, check everyone else. */ if (mode & FREAD) @@ -531,40 +604,28 @@ psem_access(pinfo, mode, cred, p) return ((pinfo->psem_mode & mask) == mask ? 
0 : EACCES); } -struct sem_unlink_args { - const char *name; -}; - int -sem_unlink(p, uap, retval) - struct proc *p; - register struct sem_unlink_args *uap; - register_t *retval; +sem_unlink(__unused struct proc *p, struct sem_unlink_args *uap, __unused register_t *retval) { - register struct filedesc *fdp = p->p_fd; - register struct file *fp; - int flags, i; + size_t i; int error=0; struct psemname nd; struct pseminfo *pinfo; - extern struct fileops psemops; char * pnbuf; char * nameptr; char * cp; size_t pathlen, plen; - int fmode, cmode ; int incache = 0; - struct psemnode * pnode = PSEMNODE_NULL; struct psemcache *pcache = PSEMCACHE_NULL; - kern_return_t kret; pinfo = PSEMINFO_NULL; - MALLOC_ZONE(pnbuf, caddr_t, - MAXPATHLEN, M_NAMEI, M_WAITOK); + MALLOC_ZONE(pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (pnbuf == NULL) { + return(ENOSPC); /* XXX non-standard */ + } pathlen = MAXPATHLEN; - error = copyinstr((void *)uap->name, pnbuf, - MAXPATHLEN, &pathlen); + error = copyinstr(uap->name, pnbuf, MAXPATHLEN, &pathlen); if (error) { goto bad; } @@ -599,29 +660,37 @@ sem_unlink(p, uap, retval) nd.psem_hash += (unsigned char)*cp * i; } + PSEM_SUBSYS_LOCK(); error = psem_cache_search(&pinfo, &nd, &pcache); if (error == ENOENT) { + PSEM_SUBSYS_UNLOCK(); error = EINVAL; goto bad; } if (!error) { + PSEM_SUBSYS_UNLOCK(); error = EINVAL; goto bad; } else incache = 1; - if (error = psem_access(pinfo, pinfo->psem_mode, p->p_ucred, p)) + if ( (error = psem_access(pinfo, pinfo->psem_mode, kauth_cred_get())) ) { + PSEM_SUBSYS_UNLOCK(); goto bad; + } if ((pinfo->psem_flags & (PSEM_DEFINED | PSEM_ALLOCATED))==0) { + PSEM_SUBSYS_UNLOCK(); return (EINVAL); } - if (pinfo->psem_flags & PSEM_INDELETE) { + if ( (pinfo->psem_flags & PSEM_INDELETE) ) { + PSEM_SUBSYS_UNLOCK(); error = 0; goto bad; } + AUDIT_ARG(posix_ipc_perm, pinfo->psem_uid, pinfo->psem_gid, pinfo->psem_mode); @@ -630,122 +699,127 @@ sem_unlink(p, uap, retval) if (!pinfo->psem_usecount) { psem_delete(pinfo); - _FREE(pinfo,M_SHM); + FREE(pinfo,M_SHM); } else pinfo->psem_flags |= PSEM_REMOVED; psem_cache_delete(pcache); - _FREE(pcache, M_SHM); + PSEM_SUBSYS_UNLOCK(); + FREE(pcache, M_SHM); error = 0; bad: FREE_ZONE(pnbuf, MAXPATHLEN, M_NAMEI); return (error); } -struct sem_close_args { - sem_t *sem; -}; - int -sem_close(p, uap, retval) - struct proc *p; - struct sem_close_args *uap; - register_t *retval; +sem_close(struct proc *p, struct sem_close_args *uap, __unused register_t *retval) { - int fd = (int)uap->sem; - register struct filedesc *fdp = p->p_fd; - register struct file *fp; + int fd = CAST_DOWN(int,uap->sem); + struct fileproc *fp; int error = 0; AUDIT_ARG(fd, fd); /* XXX This seems wrong; uap->sem is a pointer */ - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL || - (fdp->fd_ofileflags[fd] & UF_RESERVED)) - return (EBADF); - fdrelse(p, fd); - if( error = closef(fp, p)) + + proc_fdlock(p); + error = fp_lookup(p,fd, &fp, 1); + if (error) { + proc_fdunlock(p); return(error); - return(0); + } + fdrelse(p, fd); + error = closef_locked(fp, fp->f_fglob, p); + FREE_ZONE(fp, sizeof *fp, M_FILEPROC); + proc_fdunlock(p); + return(error); } -struct sem_wait_args { - sem_t *sem; -}; - int -sem_wait(p, uap, retval) - struct proc *p; - struct sem_wait_args *uap; - register_t *retval; +sem_wait(struct proc *p, struct sem_wait_args *uap, __unused register_t *retval) { - int fd = (int)uap->sem; - register struct filedesc *fdp = p->p_fd; - struct file *fp; + int fd = CAST_DOWN(int,uap->sem); + struct fileproc *fp; struct 
pseminfo * pinfo; struct psemnode * pnode ; kern_return_t kret; int error; - if (error = fdgetf(p, (int)uap->sem, &fp)) + error = fp_getfpsem(p, fd, &fp, &pnode); + if (error) return (error); - if (fp->f_type != DTYPE_PSXSEM) - return(EBADF); - if (((pnode = (struct psemnode *)fp->f_data)) == PSEMNODE_NULL ) - return(EINVAL); - if ((pinfo = pnode->pinfo) == PSEMINFO_NULL) - return(EINVAL); + if (((pnode = (struct psemnode *)fp->f_data)) == PSEMNODE_NULL ) { + error = EINVAL; + goto out; + } + PSEM_SUBSYS_LOCK(); + if ((pinfo = pnode->pinfo) == PSEMINFO_NULL) { + PSEM_SUBSYS_UNLOCK(); + error = EINVAL; + goto out; + } if ((pinfo->psem_flags & (PSEM_DEFINED | PSEM_ALLOCATED)) != PSEM_ALLOCATED) { - return(EINVAL); + PSEM_SUBSYS_UNLOCK(); + error = EINVAL; + goto out; } + PSEM_SUBSYS_UNLOCK(); kret = semaphore_wait(pinfo->psem_semobject); switch (kret) { case KERN_INVALID_ADDRESS: case KERN_PROTECTION_FAILURE: - return (EACCES); + error = EACCES; + break; case KERN_ABORTED: case KERN_OPERATION_TIMED_OUT: - return (EINTR); + error = EINTR; + break; case KERN_SUCCESS: - return(0); + error = 0; + break; default: - return (EINVAL); + error = EINVAL; + break; } -} +out: + fp_drop(p, fd, fp, 0); + return(error); -struct sem_trywait_args { - sem_t *sem; -}; +} int -sem_trywait(p, uap, retval) - struct proc *p; - struct sem_trywait_args *uap; - register_t *retval; +sem_trywait(struct proc *p, struct sem_trywait_args *uap, __unused register_t *retval) { - int fd = (int)uap->sem; - register struct filedesc *fdp = p->p_fd; - struct file *fp; + int fd = CAST_DOWN(int,uap->sem); + struct fileproc *fp; struct pseminfo * pinfo; struct psemnode * pnode ; kern_return_t kret; mach_timespec_t wait_time; int error; - if (error = fdgetf(p, (int)uap->sem, &fp)) + error = fp_getfpsem(p, fd, &fp, &pnode); + if (error) return (error); - if (fp->f_type != DTYPE_PSXSEM) - return(EBADF); - if (((pnode = (struct psemnode *)fp->f_data)) == PSEMNODE_NULL ) - return(EINVAL); - if ((pinfo = pnode->pinfo) == PSEMINFO_NULL) - return(EINVAL); + if (((pnode = (struct psemnode *)fp->f_data)) == PSEMNODE_NULL ) { + error = EINVAL; + goto out; + } + PSEM_SUBSYS_LOCK(); + if ((pinfo = pnode->pinfo) == PSEMINFO_NULL) { + PSEM_SUBSYS_UNLOCK(); + error = EINVAL; + goto out; + } if ((pinfo->psem_flags & (PSEM_DEFINED | PSEM_ALLOCATED)) != PSEM_ALLOCATED) { - return(EINVAL); + PSEM_SUBSYS_UNLOCK(); + error = EINVAL; + goto out; } + PSEM_SUBSYS_UNLOCK(); wait_time.tv_sec = 0; wait_time.tv_nsec = 0; @@ -753,121 +827,112 @@ sem_trywait(p, uap, retval) switch (kret) { case KERN_INVALID_ADDRESS: case KERN_PROTECTION_FAILURE: - return (EINVAL); + error = EINVAL; + break; case KERN_ABORTED: - return (EINTR); + error = EINTR; + break; case KERN_OPERATION_TIMED_OUT: - return (EAGAIN); + error = EAGAIN; + break; case KERN_SUCCESS: - return(0); + error = 0; + break; default: - return (EINVAL); + error = EINVAL; + break; } +out: + fp_drop(p, fd, fp, 0); + return(error); } -struct sem_post_args { - sem_t *sem; -}; - int -sem_post(p, uap, retval) - struct proc *p; - struct sem_post_args *uap; - register_t *retval; +sem_post(struct proc *p, struct sem_post_args *uap, __unused register_t *retval) { - int fd = (int)uap->sem; - register struct filedesc *fdp = p->p_fd; - struct file *fp; + int fd = CAST_DOWN(int,uap->sem); + struct fileproc *fp; struct pseminfo * pinfo; struct psemnode * pnode ; kern_return_t kret; int error; - if (error = fdgetf(p, (int)uap->sem, &fp)) + error = fp_getfpsem(p, fd, &fp, &pnode); + if (error) return (error); - if 
(fp->f_type != DTYPE_PSXSEM) - return(EBADF); - if (((pnode = (struct psemnode *)fp->f_data)) == PSEMNODE_NULL ) - return(EINVAL); - if ((pinfo = pnode->pinfo) == PSEMINFO_NULL) - return(EINVAL); + if (((pnode = (struct psemnode *)fp->f_data)) == PSEMNODE_NULL ) { + error = EINVAL; + goto out; + } + PSEM_SUBSYS_LOCK(); + if ((pinfo = pnode->pinfo) == PSEMINFO_NULL) { + PSEM_SUBSYS_UNLOCK(); + error = EINVAL; + goto out; + } if ((pinfo->psem_flags & (PSEM_DEFINED | PSEM_ALLOCATED)) != PSEM_ALLOCATED) { - return(EINVAL); + PSEM_SUBSYS_UNLOCK(); + error = EINVAL; + goto out; } + PSEM_SUBSYS_UNLOCK(); kret = semaphore_signal(pinfo->psem_semobject); switch (kret) { case KERN_INVALID_ADDRESS: case KERN_PROTECTION_FAILURE: - return (EINVAL); + error = EINVAL; + break; case KERN_ABORTED: case KERN_OPERATION_TIMED_OUT: - return (EINTR); + error = EINTR; + break; case KERN_SUCCESS: - return(0); + error = 0; + break; default: - return (EINVAL); + error = EINVAL; + break; } +out: + fp_drop(p, fd, fp, 0); + return(error); } -struct sem_init_args { - sem_t *sem; - int phsared; - unsigned int value; -}; - int -sem_init(p, uap, retval) - struct proc *p; - struct sem_init_args *uap; - register_t *retval; +sem_init(__unused struct proc *p, __unused struct sem_init_args *uap, __unused register_t *retval) { return(ENOSYS); } -struct sem_destroy_args { - sem_t *sem; -}; - int -sem_destroy(p, uap, retval) - struct proc *p; - struct sem_destroy_args *uap; - register_t *retval; +sem_destroy(__unused struct proc *p, __unused struct sem_destroy_args *uap, __unused register_t *retval) { return(ENOSYS); } -struct sem_getvalue_args { - sem_t *sem; - int * sval; -}; - int -sem_getvalue(p, uap, retval) - struct proc *p; - struct sem_getvalue_args *uap; - register_t *retval; +sem_getvalue(__unused struct proc *p, __unused struct sem_getvalue_args *uap, __unused register_t *retval) { return(ENOSYS); } static int -psem_close(pnode, flags, cred, p) - register struct psemnode *pnode; - int flags; - struct ucred *cred; - struct proc *p; +psem_close(struct psemnode *pnode, __unused int flags, + __unused kauth_cred_t cred, __unused struct proc *p) { int error=0; - kern_return_t kret; register struct pseminfo *pinfo; - if ((pinfo = pnode->pinfo) == PSEMINFO_NULL) + PSEM_SUBSYS_LOCK(); + if ((pinfo = pnode->pinfo) == PSEMINFO_NULL){ + PSEM_SUBSYS_UNLOCK(); return(EINVAL); + } if ((pinfo->psem_flags & PSEM_ALLOCATED) != PSEM_ALLOCATED) { + PSEM_SUBSYS_UNLOCK(); return(EINVAL); } #if DIAGNOSTIC @@ -878,24 +943,33 @@ psem_close(pnode, flags, cred, p) pinfo->psem_usecount--; if ((pinfo->psem_flags & PSEM_REMOVED) && !pinfo->psem_usecount) { + PSEM_SUBSYS_UNLOCK(); + /* lock dropped as only semaphore is destroyed here */ error = psem_delete(pinfo); - _FREE(pinfo,M_SHM); + FREE(pinfo,M_SHM); + } else { + PSEM_SUBSYS_UNLOCK(); } - _FREE(pnode, M_SHM); + /* subsystem lock is dropped when we get here */ + FREE(pnode, M_SHM); return (error); } static int -psem_closefile(fp, p) - struct file *fp; +psem_closefile(fg, p) + struct fileglob *fg; struct proc *p; { + int error; - return (psem_close(((struct psemnode *)fp->f_data), fp->f_flag, - fp->f_cred, p)); + /* Not locked as psem_close is called only from here and is locked properly */ + error = psem_close(((struct psemnode *)fg->fg_data), fg->fg_flag, + fg->fg_cred, p); + + return(error); } -int +static int psem_delete(struct pseminfo * pinfo) { kern_return_t kret; @@ -917,53 +991,39 @@ psem_delete(struct pseminfo * pinfo) } static int -psem_read(fp, uio, cred, flags, p) - struct file *fp; 
- struct uio *uio; - struct ucred *cred; - int flags; - struct proc *p; +psem_read(__unused struct fileproc *fp, __unused struct uio *uio, + __unused kauth_cred_t cred, __unused int flags, + __unused struct proc *p) { - return(EOPNOTSUPP); + return(ENOTSUP); } static int -psem_write(fp, uio, cred, flags, p) - struct file *fp; - struct uio *uio; - struct ucred *cred; - int flags; - struct proc *p; +psem_write(__unused struct fileproc *fp, __unused struct uio *uio, + __unused kauth_cred_t cred, __unused int flags, + __unused struct proc *p) { - return(EOPNOTSUPP); + return(ENOTSUP); } static int -psem_ioctl(fp, com, data, p) - struct file *fp; - u_long com; - caddr_t data; - struct proc *p; +psem_ioctl(__unused struct fileproc *fp, __unused u_long com, + __unused caddr_t data, __unused struct proc *p) { - return(EOPNOTSUPP); + return(ENOTSUP); } static int -psem_select(fp, which, wql, p) - struct file *fp; - int which; - void *wql; - struct proc *p; +psem_select(__unused struct fileproc *fp, __unused int which, + __unused void *wql, __unused struct proc *p) { - return(EOPNOTSUPP); + return(ENOTSUP); } static int -psem_kqfilter(fp, kn, p) - struct file *fp; - struct knote *kn; - struct proc *p; +psem_kqfilter(__unused struct fileproc *fp, __unused struct knote *kn, + __unused struct proc *p) { - return (EOPNOTSUPP); + return (ENOTSUP); } diff --git a/bsd/kern/posix_shm.c b/bsd/kern/posix_shm.c index 49e035a40..f44264c37 100644 --- a/bsd/kern/posix_shm.c +++ b/bsd/kern/posix_shm.c @@ -39,11 +39,11 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/filedesc.h> #include <sys/stat.h> -#include <sys/buf.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/vnode.h> @@ -51,16 +51,34 @@ #include <sys/tty.h> #include <sys/malloc.h> #include <sys/mman.h> +#include <sys/stat.h> +#include <sys/sysproto.h> #include <bsm/audit_kernel.h> #include <mach/mach_types.h> +#include <mach/mach_vm.h> +#include <mach/vm_map.h> #include <mach/vm_prot.h> #include <mach/vm_inherit.h> #include <mach/kern_return.h> #include <mach/memory_object_control.h> +#include <vm/vm_map.h> +#include <vm/vm_protos.h> +#include <vm/vm_shared_memory_server.h> + +#if KTRACE +#include <sys/ktrace.h> +#endif +#define f_flag f_fglob->fg_flag +#define f_type f_fglob->fg_type +#define f_msgcount f_fglob->fg_msgcount +#define f_cred f_fglob->fg_cred +#define f_ops f_fglob->fg_ops +#define f_offset f_fglob->fg_offset +#define f_data f_fglob->fg_data #define PSHMNAMLEN 31 /* maximum name segment length we bother with */ struct pshminfo { @@ -113,8 +131,8 @@ struct pshmname { }; struct pshmnode { - off_t mapp_addr; - size_t map_size; + off_t mapp_addr; + user_size_t map_size; struct pshminfo *pinfo; unsigned int pshm_usecount; #if DIAGNOSTIC @@ -127,25 +145,59 @@ struct pshmnode { #define PSHMHASH(pnp) \ (&pshmhashtbl[(pnp)->pshm_hash & pshmhash]) + LIST_HEAD(pshmhashhead, pshmcache) *pshmhashtbl; /* Hash Table */ u_long pshmhash; /* size of hash table - 1 */ long pshmnument; /* number of cache entries allocated */ struct pshmstats pshmstats; /* cache effectiveness statistics */ -static int pshm_read __P((struct file *fp, struct uio *uio, - struct ucred *cred, int flags, struct proc *p)); -static int pshm_write __P((struct file *fp, struct uio *uio, - struct ucred *cred, int flags, struct proc *p)); -static int pshm_ioctl __P((struct file *fp, u_long com, - caddr_t 
data, struct proc *p)); -static int pshm_select __P((struct file *fp, int which, void *wql, - struct proc *p)); -static int pshm_closefile __P((struct file *fp, struct proc *p)); - -static int pshm_kqfilter __P((struct file *fp, struct knote *kn, struct proc *p)); +static int pshm_read (struct fileproc *fp, struct uio *uio, + kauth_cred_t cred, int flags, struct proc *p); +static int pshm_write (struct fileproc *fp, struct uio *uio, + kauth_cred_t cred, int flags, struct proc *p); +static int pshm_ioctl (struct fileproc *fp, u_long com, + caddr_t data, struct proc *p); +static int pshm_select (struct fileproc *fp, int which, void *wql, struct proc *p); +static int pshm_close(struct pshmnode *pnode); +static int pshm_closefile (struct fileglob *fg, struct proc *p); + +static int pshm_kqfilter(struct fileproc *fp, struct knote *kn, struct proc *p); + +int pshm_access(struct pshminfo *pinfo, int mode, kauth_cred_t cred, struct proc *p); +static int pshm_cache_add(struct pshminfo *pshmp, struct pshmname *pnp, struct pshmcache *pcp); +static void pshm_cache_delete(struct pshmcache *pcp); +#if NOT_USED +static void pshm_cache_purge(void); +#endif /* NOT_USED */ +static int pshm_cache_search(struct pshminfo **pshmp, struct pshmname *pnp, + struct pshmcache **pcache); struct fileops pshmops = - { pshm_read, pshm_write, pshm_ioctl, pshm_select, pshm_closefile, pshm_kqfilter }; + { pshm_read, pshm_write, pshm_ioctl, pshm_select, pshm_closefile, pshm_kqfilter, 0 }; + +static lck_grp_t *psx_shm_subsys_lck_grp; +static lck_grp_attr_t *psx_shm_subsys_lck_grp_attr; +static lck_attr_t *psx_shm_subsys_lck_attr; +static lck_mtx_t psx_shm_subsys_mutex; + +#define PSHM_SUBSYS_LOCK() lck_mtx_lock(& psx_shm_subsys_mutex) +#define PSHM_SUBSYS_UNLOCK() lck_mtx_unlock(& psx_shm_subsys_mutex) + + +/* Initialize the mutex governing access to the posix shm subsystem */ +__private_extern__ void +pshm_lock_init( void ) +{ + + psx_shm_subsys_lck_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(psx_shm_subsys_lck_grp_attr); + + psx_shm_subsys_lck_grp = lck_grp_alloc_init("posix shared memory", psx_shm_subsys_lck_grp_attr); + + psx_shm_subsys_lck_attr = lck_attr_alloc_init(); + /* lck_attr_setdebug(psx_shm_subsys_lck_attr); */ + lck_mtx_init(& psx_shm_subsys_mutex, psx_shm_subsys_lck_grp, psx_shm_subsys_lck_attr); +} /* * Lookup an entry in the cache @@ -157,14 +209,12 @@ struct fileops pshmops = * fails, a status of zero is returned. */ -int -pshm_cache_search(pshmp, pnp, pcache) - struct pshminfo **pshmp; - struct pshmname *pnp; - struct pshmcache **pcache; +static int +pshm_cache_search(struct pshminfo **pshmp, struct pshmname *pnp, + struct pshmcache **pcache) { - register struct pshmcache *pcp, *nnp; - register struct pshmhashhead *pcpp; + struct pshmcache *pcp, *nnp; + struct pshmhashhead *pcpp; if (pnp->pshm_namelen > PSHMNAMLEN) { pshmstats.longnames++; @@ -203,14 +253,12 @@ pshm_cache_search(pshmp, pnp, pcache) /* * Add an entry to the cache. + * XXX should be static? 
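+ *
+ * With this change the entry is allocated by the caller and passed in
+ * as pcp, so this routine no longer blocks in MALLOC while the
+ * subsystem lock is held; it returns EEXIST when a racing thread has
+ * already inserted the same name.  Illustrative call shape (mirroring
+ * shm_open below):
+ *
+ *	MALLOC(pcp, struct pshmcache *, sizeof(struct pshmcache),
+ *	    M_SHM, M_WAITOK|M_ZERO);
+ *	PSHM_SUBSYS_LOCK();
+ *	error = pshm_cache_add(pinfo, &nd, pcp);	(EEXIST if raced)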
*/ -int -pshm_cache_add(pshmp, pnp) - struct pshminfo *pshmp; - struct pshmname *pnp; +static int +pshm_cache_add(struct pshminfo *pshmp, struct pshmname *pnp, struct pshmcache *pcp) { - register struct pshmcache *pcp; - register struct pshmhashhead *pcpp; + struct pshmhashhead *pcpp; struct pshminfo *dpinfo; struct pshmcache *dpcp; @@ -219,20 +267,13 @@ pshm_cache_add(pshmp, pnp) panic("cache_enter: name too long"); #endif - /* - * We allocate a new entry if we are less than the maximum - * allowed and the one at the front of the LRU list is in use. - * Otherwise we use the one at the front of the LRU list. - */ - pcp = (struct pshmcache *)_MALLOC(sizeof(struct pshmcache), M_SHM, M_WAITOK); + /* if the entry has already been added by someone else return */ if (pshm_cache_search(&dpinfo, pnp, &dpcp) == -1) { - _FREE(pcp, M_SHM); return(EEXIST); } pshmnument++; - bzero(pcp, sizeof(struct pshmcache)); /* * Fill in cache info, if vp is NULL this is a "negative" cache entry. * For negative entries, we have to record whether it is a whiteout. @@ -245,7 +286,7 @@ pshm_cache_add(pshmp, pnp) pcpp = PSHMHASH(pnp); #if DIAGNOSTIC { - register struct pshmcache *p; + struct pshmcache *p; for (p = pcpp->lh_first; p != 0; p = p->pshm_hash.le_next) if (p == pcp) @@ -260,11 +301,12 @@ * Name cache initialization, from vfs_init() when we are booting */ void -pshm_cache_init() +pshm_cache_init(void) { pshmhashtbl = hashinit(desiredvnodes, M_SHM, &pshmhash); } +#if NOT_USED /* * Invalidate all entries to a particular vnode. * * We actually just increment the v_id, that will do it. The entries will * be purged by lookup as they get found. If the v_id wraps around, we * need to ditch the entire cache, to avoid confusion. No valid vnode will * ever have (v_id == 0). */ -void +static void pshm_cache_purge(void) { struct pshmcache *pcp; struct pshmhashhead *pcpp; for (pcpp = &pshmhashtbl[pshmhash]; pcpp >= pshmhashtbl; pcpp--) { - while (pcp = pcpp->lh_first) + while ( (pcp = pcpp->lh_first) ) pshm_cache_delete(pcp); } } +#endif /* NOT_USED */ -pshm_cache_delete(pcp) - struct pshmcache *pcp; +static void +pshm_cache_delete(struct pshmcache *pcp) { #if DIAGNOSTIC if (pcp->pshm_hash.le_prev == 0) @@ -300,27 +343,15 @@ } -struct shm_open_args { - const char *name; - int oflag; - int mode; -}; - int -shm_open(p, uap, retval) - struct proc *p; - register struct shm_open_args *uap; - register_t *retval; +shm_open(struct proc *p, struct shm_open_args *uap, register_t *retval) { - register struct filedesc *fdp = p->p_fd; - register struct file *fp; - register struct vnode *vp; - int i; - struct file *nfp; - int type, indx, error; + struct fileproc *fp; + size_t i; + struct fileproc *nfp; + int indx, error; struct pshmname nd; struct pshminfo *pinfo; - extern struct fileops pshmops; char * pnbuf; char * nameptr; char * cp; @@ -330,17 +361,21 @@ shm_open(p, uap, retval) int incache = 0; struct pshmnode * pnode = PSHMNODE_NULL; struct pshmcache * pcache = PSHMCACHE_NULL; + struct pshmcache *pcp; int pinfo_alloc=0; AUDIT_ARG(fflags, uap->oflag); AUDIT_ARG(mode, uap->mode); + pinfo = PSHMINFO_NULL; - MALLOC_ZONE(pnbuf, caddr_t, - MAXPATHLEN, M_NAMEI, M_WAITOK); + MALLOC_ZONE(pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (pnbuf == NULL) { + return(ENOSPC); + } + pathlen = MAXPATHLEN; - error = copyinstr((void *)uap->name, (void *)pnbuf, - MAXPATHLEN, &pathlen); + error = copyinstr(uap->name, (void *)pnbuf, MAXPATHLEN, &pathlen); if (error) { goto bad; } @@ -375,9 +410,16 @@ shm_open(p, uap, retval) nd.pshm_hash += (unsigned char)*cp * i; } +#if KTRACE + if (KTRPOINT(p,
KTR_NAMEI)) + ktrnamei(p->p_tracep, nameptr); +#endif + + PSHM_SUBSYS_LOCK(); error = pshm_cache_search(&pinfo, &nd, &pcache); if (error == ENOENT) { + PSHM_SUBSYS_UNLOCK(); error = EINVAL; goto bad; @@ -388,12 +430,22 @@ incache = 1; fmode = FFLAGS(uap->oflag); if ((fmode & (FREAD | FWRITE))==0) { + PSHM_SUBSYS_UNLOCK(); error = EINVAL; goto bad; } - if (error = falloc(p, &nfp, &indx)) + /* + * XXXXXXXXXX TBD XXXXXXXXXX + * There is a race that existed with the funnels as well. + * Need to be fixed later + */ + PSHM_SUBSYS_UNLOCK(); + error = falloc(p, &nfp, &indx); + if (error ) goto bad; + PSHM_SUBSYS_LOCK(); + fp = nfp; cmode &= ALLPERMS; @@ -409,43 +461,57 @@ } #endif error = EEXIST; + PSHM_SUBSYS_UNLOCK(); goto bad1; } if (!incache) { + PSHM_SUBSYS_UNLOCK(); /* create a new one */ - pinfo = (struct pshminfo *)_MALLOC(sizeof(struct pshminfo), M_SHM, M_WAITOK); - bzero(pinfo, sizeof(struct pshminfo)); - pinfo_alloc = 1; + MALLOC(pinfo, struct pshminfo *, sizeof(struct pshminfo), M_SHM, M_WAITOK|M_ZERO); + if (pinfo == NULL) { + error = ENOSPC; + goto bad1; + } + PSHM_SUBSYS_LOCK(); + pinfo_alloc = 1; pinfo->pshm_flags = PSHM_DEFINED | PSHM_INCREATE; - pinfo->pshm_usecount = 1; + pinfo->pshm_usecount = 1; /* existence reference */ pinfo->pshm_mode = cmode; - pinfo->pshm_uid = p->p_ucred->cr_uid; - pinfo->pshm_gid = p->p_ucred->cr_gid; + pinfo->pshm_uid = kauth_cred_getuid(kauth_cred_get()); + pinfo->pshm_gid = kauth_cred_get()->cr_gid; } else { /* already exists */ if( pinfo->pshm_flags & PSHM_INDELETE) { + PSHM_SUBSYS_UNLOCK(); error = ENOENT; goto bad1; } - AUDIT_ARG(posix_ipc_perm, pinfo->pshm_uid, - pinfo->pshm_gid, pinfo->pshm_mode); - if (error = pshm_access(pinfo, fmode, p->p_ucred, p)) + AUDIT_ARG(posix_ipc_perm, pinfo->pshm_uid, + pinfo->pshm_gid, pinfo->pshm_mode); + if ( (error = pshm_access(pinfo, fmode, kauth_cred_get(), p)) ) { + PSHM_SUBSYS_UNLOCK(); goto bad1; + } } } else { if (!incache) { /* O_CREAT is not set and the shm object does not exist */ + PSHM_SUBSYS_UNLOCK(); error = ENOENT; goto bad1; } if( pinfo->pshm_flags & PSHM_INDELETE) { + PSHM_SUBSYS_UNLOCK(); error = ENOENT; goto bad1; } - if (error = pshm_access(pinfo, fmode, p->p_ucred, p)) + if ( (error = pshm_access(pinfo, fmode, kauth_cred_get(), p)) ) { + PSHM_SUBSYS_UNLOCK(); goto bad1; + } } if (fmode & O_TRUNC) { + PSHM_SUBSYS_UNLOCK(); error = EINVAL; goto bad2; } @@ -455,54 +521,74 @@ if (fmode & FREAD) pinfo->pshm_readcount++; #endif - pnode = (struct pshmnode *)_MALLOC(sizeof(struct pshmnode), M_SHM, M_WAITOK); - bzero(pnode, sizeof(struct pshmnode)); + PSHM_SUBSYS_UNLOCK(); + MALLOC(pnode, struct pshmnode *, sizeof(struct pshmnode), M_SHM, M_WAITOK|M_ZERO); + if (pnode == NULL) { + error = ENOSPC; + goto bad2; + } + if (!incache) { + /* + * We allocate a new entry if we are less than the maximum + * allowed and the one at the front of the LRU list is in use. + * Otherwise we use the one at the front of the LRU list.
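+	 *
+	 * As in sem_open(), allocation happens with the lock dropped;
+	 * after relocking, pshm_cache_add() re-searches the hash chain
+	 * and returns EEXIST if another thread added the same name first,
+	 * in which case the pre-allocated entry is simply freed:
+	 *
+	 *	PSHM_SUBSYS_LOCK();
+	 *	if ( (error = pshm_cache_add(pinfo, &nd, pcp)) ) {
+	 *		PSHM_SUBSYS_UNLOCK();
+	 *		FREE(pcp, M_SHM);
+	 *		goto bad3;
+	 *	}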
+ */ + MALLOC(pcp, struct pshmcache *, sizeof(struct pshmcache), M_SHM, M_WAITOK|M_ZERO); + if (pcp == NULL) { + error = ENOSPC; + goto bad2; + } + + } + PSHM_SUBSYS_LOCK(); if (!incache) { - if (error = pshm_cache_add(pinfo, &nd)) { - goto bad3; + if ( (error = pshm_cache_add(pinfo, &nd, pcp)) ) { + PSHM_SUBSYS_UNLOCK(); + FREE(pcp, M_SHM); + goto bad3; } } pinfo->pshm_flags &= ~PSHM_INCREATE; - pinfo->pshm_usecount++; + pinfo->pshm_usecount++; /* extra reference for the new fd */ pnode->pinfo = pinfo; + + PSHM_SUBSYS_UNLOCK(); + proc_fdlock(p); fp->f_flag = fmode & FMASK; fp->f_type = DTYPE_PSXSHM; fp->f_ops = &pshmops; fp->f_data = (caddr_t)pnode; *fdflags(p, indx) &= ~UF_RESERVED; + fp_drop(p, indx, fp, 1); + proc_fdunlock(p); + *retval = indx; FREE_ZONE(pnbuf, MAXPATHLEN, M_NAMEI); return (0); bad3: - _FREE(pnode, M_SHM); + FREE(pnode, M_SHM); bad2: if (pinfo_alloc) - _FREE(pinfo, M_SHM); + FREE(pinfo, M_SHM); bad1: - fdrelse(p, indx); - ffree(nfp); + fp_free(p, indx, fp); bad: FREE_ZONE(pnbuf, MAXPATHLEN, M_NAMEI); return (error); } -/* ARGSUSED */ int -pshm_truncate(p, fp, fd, length, retval) - struct proc *p; - struct file *fp; - int fd; - off_t length; - register_t *retval; +pshm_truncate(__unused struct proc *p, struct fileproc *fp, __unused int fd, + off_t length, __unused register_t *retval) { struct pshminfo * pinfo; struct pshmnode * pnode ; kern_return_t kret; vm_offset_t user_addr; - void * mem_object; + mem_entry_name_port_t mem_object; vm_size_t size; if (fp->f_type != DTYPE_PSXSHM) { @@ -513,15 +599,20 @@ pshm_truncate(p, fp, fd, length, retval) if (((pnode = (struct pshmnode *)fp->f_data)) == PSHMNODE_NULL ) return(EINVAL); - if ((pinfo = pnode->pinfo) == PSHMINFO_NULL) + PSHM_SUBSYS_LOCK(); + if ((pinfo = pnode->pinfo) == PSHMINFO_NULL) { + PSHM_SUBSYS_UNLOCK(); return(EINVAL); + } if ((pinfo->pshm_flags & (PSHM_DEFINED | PSHM_ALLOCATED)) != PSHM_DEFINED) { + PSHM_SUBSYS_UNLOCK(); return(EINVAL); } + PSHM_SUBSYS_UNLOCK(); size = round_page_64(length); - kret = vm_allocate(current_map(), &user_addr, size, TRUE); + kret = vm_allocate(current_map(), &user_addr, size, VM_FLAGS_ANYWHERE); if (kret != KERN_SUCCESS) goto out; @@ -533,10 +624,12 @@ pshm_truncate(p, fp, fd, length, retval) vm_deallocate(current_map(), user_addr, size); + PSHM_SUBSYS_LOCK(); pinfo->pshm_flags &= ~PSHM_DEFINED; pinfo->pshm_flags = PSHM_ALLOCATED; - pinfo->pshm_memobject = mem_object; + pinfo->pshm_memobject = (void *)mem_object; pinfo->pshm_length = size; + PSHM_SUBSYS_UNLOCK(); return(0); out: @@ -553,39 +646,44 @@ out: } int -pshm_stat(pnode, sb) -struct pshmnode *pnode; -struct stat *sb; +pshm_stat(struct pshmnode *pnode, struct stat *sb) { struct pshminfo *pinfo; - if ((pinfo = pnode->pinfo) == PSHMINFO_NULL) + PSHM_SUBSYS_LOCK(); + if ((pinfo = pnode->pinfo) == PSHMINFO_NULL){ + PSHM_SUBSYS_UNLOCK(); return(EINVAL); + } bzero(sb, sizeof(struct stat)); sb->st_mode = pinfo->pshm_mode; sb->st_uid = pinfo->pshm_uid; sb->st_gid = pinfo->pshm_gid; sb->st_size = pinfo->pshm_length; + PSHM_SUBSYS_UNLOCK(); return(0); } +/* + * This is called only from shm_open which holds pshm_lock(); + * XXX This code is repeated many times + */ int -pshm_access(struct pshminfo *pinfo, int mode, struct ucred *cred, struct proc *p) +pshm_access(struct pshminfo *pinfo, int mode, kauth_cred_t cred, __unused struct proc *p) { mode_t mask; - register gid_t *gp; - int i, error; + int is_member; /* Otherwise, user id 0 always gets access. 
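 *
 * Under the kauth conversion, suser() returns 0 for a privileged
 * credential, kauth_cred_getuid() replaces direct cr_uid access, and
 * the old open-coded scan of cr_groups becomes a membership query;
 * sketch (as used below):
 *
 *	if (kauth_cred_ismember_gid(cred, pinfo->pshm_gid,
 *	    &is_member) == 0 && is_member) {
 *		... grant S_IRGRP/S_IWGRP according to mode ...
 *	}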
*/ - if (cred->cr_uid == 0) + if (!suser(cred, NULL)) return (0); mask = 0; /* Otherwise, check the owner. */ - if (cred->cr_uid == pinfo->pshm_uid) { + if (kauth_cred_getuid(cred) == pinfo->pshm_uid) { if (mode & FREAD) mask |= S_IRUSR; if (mode & FWRITE) @@ -594,14 +692,13 @@ pshm_access(struct pshminfo *pinfo, int mode, struct ucred *cred, struct proc *p } /* Otherwise, check the groups. */ - for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) - if (pinfo->pshm_gid == *gp) { - if (mode & FREAD) - mask |= S_IRGRP; - if (mode & FWRITE) - mask |= S_IWGRP; - return ((pinfo->pshm_mode & mask) == mask ? 0 : EACCES); - } + if (kauth_cred_ismember_gid(cred, pinfo->pshm_gid, &is_member) == 0 && is_member) { + if (mode & FREAD) + mask |= S_IRGRP; + if (mode & FWRITE) + mask |= S_IWGRP; + return ((pinfo->pshm_mode & mask) == mask ? 0 : EACCES); + } /* Otherwise, check everyone else. */ if (mode & FREAD) @@ -611,29 +708,18 @@ pshm_access(struct pshminfo *pinfo, int mode, struct ucred *cred, struct proc *p return ((pinfo->pshm_mode & mask) == mask ? 0 : EACCES); } -struct mmap_args { - caddr_t addr; - size_t len; - int prot; - int flags; - int fd; -#ifdef DOUBLE_ALIGN_PARAMS - long pad; -#endif - off_t pos; -}; - int -pshm_mmap(struct proc *p, struct mmap_args *uap, register_t *retval, struct file *fp, vm_size_t pageoff) +pshm_mmap(struct proc *p, struct mmap_args *uap, user_addr_t *retval, struct fileproc *fp, off_t pageoff) { - vm_offset_t user_addr = (vm_offset_t)uap->addr; - vm_size_t user_size = (vm_size_t)uap->len ; + mach_vm_offset_t user_addr = (mach_vm_offset_t)uap->addr; + mach_vm_size_t user_size = (mach_vm_size_t)uap->len ; int prot = uap->prot; int flags = uap->flags; vm_object_offset_t file_pos = (vm_object_offset_t)uap->pos; int fd = uap->fd; vm_map_t user_map; - boolean_t find_space,docow; + int alloc_flags; + boolean_t docow; kern_return_t kret; struct pshminfo * pinfo; struct pshmnode * pnode; @@ -653,57 +739,71 @@ pshm_mmap(struct proc *p, struct mmap_args *uap, register_t *retval, struct file if (((pnode = (struct pshmnode *)fp->f_data)) == PSHMNODE_NULL ) return(EINVAL); - if ((pinfo = pnode->pinfo) == PSHMINFO_NULL) + PSHM_SUBSYS_LOCK(); + if ((pinfo = pnode->pinfo) == PSHMINFO_NULL) { + PSHM_SUBSYS_UNLOCK(); return(EINVAL); + } if ((pinfo->pshm_flags & PSHM_ALLOCATED) != PSHM_ALLOCATED) { + PSHM_SUBSYS_UNLOCK(); return(EINVAL); } - if (user_size > pinfo->pshm_length) { + if ((off_t)user_size > pinfo->pshm_length) { + PSHM_SUBSYS_UNLOCK(); return(EINVAL); } - if ((off_t)user_size + file_pos > pinfo->pshm_length) { + if ((off_t)(user_size + file_pos) > pinfo->pshm_length) { + PSHM_SUBSYS_UNLOCK(); return(EINVAL); } if ((mem_object = pinfo->pshm_memobject) == NULL) { + PSHM_SUBSYS_UNLOCK(); return(EINVAL); } - + + PSHM_SUBSYS_UNLOCK(); user_map = current_map(); if ((flags & MAP_FIXED) == 0) { - find_space = TRUE; - user_addr = round_page_32(user_addr); + alloc_flags = VM_FLAGS_ANYWHERE; + user_addr = mach_vm_round_page(user_addr); } else { - if (user_addr != trunc_page_32(user_addr)) + if (user_addr != mach_vm_trunc_page(user_addr)) return (EINVAL); - find_space = FALSE; - (void) vm_deallocate(user_map, user_addr, user_size); + /* + * We do not get rid of the existing mappings here because + * it wouldn't be atomic (see comment in mmap()). We let + * Mach VM know that we want it to replace any existing + * mapping with the new one. 
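+	 *
+	 * Sketch of the atomic replacement as issued below; the
+	 * VM_FLAGS_OVERWRITE flag replaces the old (non-atomic)
+	 * vm_deallocate()/vm_map_64() pair:
+	 *
+	 *	alloc_flags = VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE;
+	 *	kret = mach_vm_map(user_map, &user_addr, user_size, 0,
+	 *	    alloc_flags, pinfo->pshm_memobject, file_pos, docow,
+	 *	    prot, VM_PROT_DEFAULT, VM_INHERIT_SHARE);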
+ */ + alloc_flags = VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE; } docow = FALSE; - kret = vm_map_64(user_map, &user_addr, user_size, - 0, find_space, pinfo->pshm_memobject, file_pos, docow, + kret = mach_vm_map(user_map, &user_addr, user_size, + 0, alloc_flags, pinfo->pshm_memobject, file_pos, docow, prot, VM_PROT_DEFAULT, - VM_INHERIT_DEFAULT); - + VM_INHERIT_SHARE); if (kret != KERN_SUCCESS) goto out; - kret = vm_inherit(user_map, user_addr, user_size, + /* LP64todo - this should be superfluous at this point */ + kret = mach_vm_inherit(user_map, user_addr, user_size, VM_INHERIT_SHARE); if (kret != KERN_SUCCESS) { - (void) vm_deallocate(user_map, user_addr, user_size); + (void) mach_vm_deallocate(user_map, user_addr, user_size); goto out; } + PSHM_SUBSYS_LOCK(); pnode->mapp_addr = user_addr; pnode->map_size = user_size; pinfo->pshm_flags |= (PSHM_MAPPED | PSHM_INUSE); + PSHM_SUBSYS_UNLOCK(); out: switch (kret) { case KERN_SUCCESS: - *fdflags(p, fd) |= UF_MAPPED; - *retval = (register_t)(user_addr + pageoff); + *retval = (user_addr + pageoff); return (0); case KERN_INVALID_ADDRESS: case KERN_NO_SPACE: @@ -716,40 +816,29 @@ out: } -struct shm_unlink_args { - const char *name; -}; - int -shm_unlink(p, uap, retval) - struct proc *p; - register struct shm_unlink_args *uap; - register_t *retval; +shm_unlink(__unused struct proc *p, struct shm_unlink_args *uap, + __unused register_t *retval) { - register struct filedesc *fdp = p->p_fd; - register struct file *fp; - int flags, i; + size_t i; int error=0; struct pshmname nd; struct pshminfo *pinfo; - extern struct fileops pshmops; char * pnbuf; char * nameptr; char * cp; size_t pathlen, plen; - int fmode, cmode ; int incache = 0; - struct pshmnode * pnode = PSHMNODE_NULL; struct pshmcache *pcache = PSHMCACHE_NULL; - kern_return_t kret; pinfo = PSHMINFO_NULL; - MALLOC_ZONE(pnbuf, caddr_t, - MAXPATHLEN, M_NAMEI, M_WAITOK); + MALLOC_ZONE(pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (pnbuf == NULL) { + return(ENOSPC); /* XXX non-standard */ + } pathlen = MAXPATHLEN; - error = copyinstr((void *)uap->name, (void *)pnbuf, - MAXPATHLEN, &pathlen); + error = copyinstr(uap->name, (void *)pnbuf, MAXPATHLEN, &pathlen); if (error) { goto bad; } @@ -784,65 +873,72 @@ shm_unlink(p, uap, retval) nd.pshm_hash += (unsigned char)*cp * i; } + PSHM_SUBSYS_LOCK(); error = pshm_cache_search(&pinfo, &nd, &pcache); if (error == ENOENT) { + PSHM_SUBSYS_UNLOCK(); error = EINVAL; goto bad; } if (!error) { + PSHM_SUBSYS_UNLOCK(); error = EINVAL; goto bad; } else incache = 1; if ((pinfo->pshm_flags & (PSHM_DEFINED | PSHM_ALLOCATED))==0) { + PSHM_SUBSYS_UNLOCK(); return (EINVAL); } if (pinfo->pshm_flags & PSHM_INDELETE) { + PSHM_SUBSYS_UNLOCK(); error = 0; goto bad; } - if (pinfo->pshm_memobject == NULL) { - error = EINVAL; - goto bad; - } - AUDIT_ARG(posix_ipc_perm, pinfo->pshm_uid, pinfo->pshm_gid, pinfo->pshm_mode); + + /* + * JMM - How should permissions be checked? + */ + pinfo->pshm_flags |= PSHM_INDELETE; - pinfo->pshm_usecount--; - kret = mach_destroy_memory_entry(pinfo->pshm_memobject); pshm_cache_delete(pcache); - _FREE(pcache, M_SHM); pinfo->pshm_flags |= PSHM_REMOVED; + /* release the existence reference */ + if (!--pinfo->pshm_usecount) { + PSHM_SUBSYS_UNLOCK(); + /* + * If this is the last reference going away on the object, + * then we need to destroy the backing object. The name + * has an implied but uncounted reference on the object, + * once it's created, since it's used as a rendezvous, and + * therefore may be subsequently reopened.
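+	 *
+	 * pshm_usecount is this existence reference plus one reference
+	 * per open descriptor; whichever of shm_unlink() or pshm_close()
+	 * drops the count to zero tears the object down:
+	 *
+	 *	if (!--pinfo->pshm_usecount) {
+	 *		mach_memory_entry_port_release(pinfo->pshm_memobject);
+	 *		FREE(pinfo, M_SHM);
+	 *	}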
+ */ + if (pinfo->pshm_memobject != NULL) + mach_memory_entry_port_release(pinfo->pshm_memobject); + PSHM_SUBSYS_LOCK(); + FREE(pinfo,M_SHM); + } + PSHM_SUBSYS_UNLOCK(); + FREE(pcache, M_SHM); error = 0; bad: FREE_ZONE(pnbuf, MAXPATHLEN, M_NAMEI); return (error); -out: - switch (kret) { - case KERN_INVALID_ADDRESS: - case KERN_PROTECTION_FAILURE: - return (EACCES); - default: - return (EINVAL); - } } -int -pshm_close(pnode, flags, cred, p) - register struct pshmnode *pnode; - int flags; - struct ucred *cred; - struct proc *p; +/* already called locked */ +static int +pshm_close(struct pshmnode *pnode) { int error=0; - kern_return_t kret; - register struct pshminfo *pinfo; + struct pshminfo *pinfo; if ((pinfo = pnode->pinfo) == PSHMINFO_NULL) return(EINVAL); @@ -855,71 +951,68 @@ pshm_close(pnode, flags, cred, p) kprintf("negative usecount in pshm_close\n"); } #endif /* DIAGNOSTIC */ - pinfo->pshm_usecount--; + pinfo->pshm_usecount--; /* release this fd's reference */ if ((pinfo->pshm_flags & PSHM_REMOVED) && !pinfo->pshm_usecount) { - _FREE(pinfo,M_SHM); - } - _FREE(pnode, M_SHM); + PSHM_SUBSYS_UNLOCK(); + /* + * If this is the last reference going away on the object, + * then we need to destroy the backing object. + */ + if (pinfo->pshm_memobject != NULL) + mach_memory_entry_port_release(pinfo->pshm_memobject); + PSHM_SUBSYS_LOCK(); + FREE(pinfo,M_SHM); + } + FREE(pnode, M_SHM); return (error); } +/* struct proc passed to match prototype for struct fileops */ static int -pshm_closefile(fp, p) - struct file *fp; - struct proc *p; +pshm_closefile(struct fileglob *fg, __unused struct proc *p) { - return (pshm_close(((struct pshmnode *)fp->f_data), fp->f_flag, - fp->f_cred, p)); + int error; + + PSHM_SUBSYS_LOCK(); + error = pshm_close(((struct pshmnode *)fg->fg_data)); + PSHM_SUBSYS_UNLOCK(); + return(error); } static int -pshm_read(fp, uio, cred, flags, p) - struct file *fp; - struct uio *uio; - struct ucred *cred; - int flags; - struct proc *p; +pshm_read(__unused struct fileproc *fp, __unused struct uio *uio, + __unused kauth_cred_t cred, __unused int flags, + __unused struct proc *p) { - return(EOPNOTSUPP); + return(ENOTSUP); } static int -pshm_write(fp, uio, cred, flags, p) - struct file *fp; - struct uio *uio; - struct ucred *cred; - int flags; - struct proc *p; +pshm_write(__unused struct fileproc *fp, __unused struct uio *uio, + __unused kauth_cred_t cred, __unused int flags, + __unused struct proc *p) { - return(EOPNOTSUPP); + return(ENOTSUP); } static int -pshm_ioctl(fp, com, data, p) - struct file *fp; - u_long com; - caddr_t data; - struct proc *p; +pshm_ioctl(__unused struct fileproc *fp, __unused u_long com, + __unused caddr_t data, __unused struct proc *p) { - return(EOPNOTSUPP); + return(ENOTSUP); } static int -pshm_select(fp, which, wql, p) - struct file *fp; - int which; - void *wql; - struct proc *p; +pshm_select(__unused struct fileproc *fp, __unused int which, __unused void *wql, + __unused struct proc *p) { - return(EOPNOTSUPP); + return(ENOTSUP); } static int -pshm_kqfilter(fp, kn, p) - struct file *fp; - struct knote *kn; - struct proc *p; +pshm_kqfilter(__unused struct fileproc *fp, __unused struct knote *kn, + __unused struct proc *p) { - return(EOPNOTSUPP); + return(ENOTSUP); } diff --git a/bsd/kern/qsort.c b/bsd/kern/qsort.c index 7eeb4e408..9ac15c01b 100644 --- a/bsd/kern/qsort.c +++ b/bsd/kern/qsort.c @@ -60,8 +60,8 @@ #include <sys/types.h> //#include <stdlib.h> -static inline char *med3 __P((char *, char *, char *, int (*)())); -static inline void swapfunc 
__P((char *, char *, int, int)); +static inline char *med3(char *, char *, char *, int (*)()); +static inline void swapfunc(char *, char *, int, int); #define min(a, b) (a) < (b) ? a : b diff --git a/bsd/kern/spl.c b/bsd/kern/spl.c index 4ab15677d..52768d76b 100644 --- a/bsd/kern/spl.c +++ b/bsd/kern/spl.c @@ -27,8 +27,6 @@ unsigned sploff( void) { - if(thread_funnel_get() == THR_FUNNEL_NULL) - panic("%s not under funnel", "sploff()"); return(0); } @@ -36,8 +34,6 @@ unsigned splhigh( void) { - if(thread_funnel_get() == THR_FUNNEL_NULL) - panic("%s not under funnel", "splhigh()"); return(0); } @@ -45,8 +41,6 @@ unsigned splsched( void) { - if(thread_funnel_get() == THR_FUNNEL_NULL) - panic("%s not under funnel", "splsched()"); return(0); } @@ -54,8 +48,6 @@ unsigned splclock ( void) { - if(thread_funnel_get() == THR_FUNNEL_NULL) - panic("%s not under funnel", "splclock()"); return(0); } @@ -63,8 +55,6 @@ unsigned splpower ( void) { - if(thread_funnel_get() == THR_FUNNEL_NULL) - panic("%s not under funnel", "splpower()"); return(0); } @@ -72,8 +62,6 @@ unsigned splvm( void) { - if(thread_funnel_get() == THR_FUNNEL_NULL) - panic("%s not under funnel", "splvm()"); return(0); } @@ -81,8 +69,6 @@ unsigned splbio ( void) { - if(thread_funnel_get() == THR_FUNNEL_NULL) - panic("%s not under funnel", "splbio()"); return(0); } @@ -90,16 +76,12 @@ unsigned splimp( void) { - if(thread_funnel_get() == THR_FUNNEL_NULL) - panic("%s not under funnel", "splimp()"); return(0); } unsigned spltty(void) { - if(thread_funnel_get() == THR_FUNNEL_NULL) - panic("%s not under funnel", "spltty()"); return(0); } @@ -107,55 +89,42 @@ unsigned splnet( void) { - if(thread_funnel_get() == THR_FUNNEL_NULL) - panic("%s not under funnel", "splnet()"); return(0); } unsigned splsoftclock(void) { - if(thread_funnel_get() == THR_FUNNEL_NULL) - panic("%s not under funnel", "splsoftclock()"); return(0); } void spllo(void) { - if(thread_funnel_get() == THR_FUNNEL_NULL) - panic("%s not under funnel", "spllo()"); return; } void spl0(void) { - if(thread_funnel_get() == THR_FUNNEL_NULL) - panic("%s not under funnel", "spl0()"); return; } void spln(unsigned t) { - if(thread_funnel_get() == THR_FUNNEL_NULL) - panic("%s not under funnel", "spln()"); return; } void splx(unsigned l) { - if(thread_funnel_get() == THR_FUNNEL_NULL) - panic("%s not under funnel", "splx()"); return; } void splon(unsigned l) { - if(thread_funnel_get() == THR_FUNNEL_NULL) - panic("%s not under funnel", "splon()"); return; } + diff --git a/bsd/kern/subr_log.c b/bsd/kern/subr_log.c index 2bab82bac..73dc5cc51 100644 --- a/bsd/kern/subr_log.c +++ b/bsd/kern/subr_log.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * * @@ -61,14 +61,16 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/vnode.h> #include <sys/ioctl.h> #include <sys/msgbuf.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/errno.h> #include <sys/select.h> +#include <sys/kernel.h> #include <kern/thread.h> +#include <sys/lock.h> #define LOG_RDPRI (PZERO + 1) @@ -86,16 +88,21 @@ int log_open; /* also used in log() */ struct msgbuf temp_msgbuf; struct msgbuf *msgbufp; static int _logentrypend = 0; +static int log_inited = 0; +void bsd_log_lock(void); +/* the following two are implemented in osfmk/kern/printf.c */ +extern void bsd_log_unlock(void); +extern void bsd_log_init(void); /* * Serialize log access. Note that the log can be written at interrupt level, * so any log manipulations that can be done from, or affect, another processor * at interrupt level must be guarded with a spin lock. */ -decl_simple_lock_data(,log_lock); /* stop races dead in their tracks */ -#define LOG_LOCK() simple_lock(&log_lock) -#define LOG_UNLOCK() simple_unlock(&log_lock) -#define LOG_LOCK_INIT() simple_lock_init(&log_lock) + +#define LOG_LOCK() bsd_log_lock() +#define LOG_UNLOCK() bsd_log_unlock() + /*ARGSUSED*/ logopen(dev, flags, mode, p) @@ -137,9 +144,7 @@ logclose(dev, flag) LOG_LOCK(); log_open = 0; selwakeup(&logsoftc.sc_selp); - oldpri = splhigh(); selthreadclear(&logsoftc.sc_selp); - splx(oldpri); LOG_UNLOCK(); return (0); } @@ -154,42 +159,57 @@ logread(dev, uio, flag) register long l; register int s; int error = 0; + char localbuff[MSG_BSIZE]; + int copybytes; - s = splhigh(); + LOG_LOCK(); while (msgbufp->msg_bufr == msgbufp->msg_bufx) { if (flag & IO_NDELAY) { - splx(s); - return (EWOULDBLOCK); + error = EWOULDBLOCK; + goto out; } if (logsoftc.sc_state & LOG_NBIO) { - splx(s); - return (EWOULDBLOCK); + error = EWOULDBLOCK; + goto out; } logsoftc.sc_state |= LOG_RDWAIT; + LOG_UNLOCK(); + /* + * If the wakeup is missed the lightning bolt will wake this up + * if there are any new characters.
If that doesn't do it + * then wait for 5 sec and reevaluate + */ if (error = tsleep((caddr_t)msgbufp, LOG_RDPRI | PCATCH, - "klog", 0)) { - splx(s); - return (error); + "klog", 5 * hz)) { + /* if it times out; ignore */ + if (error != EWOULDBLOCK) + return (error); } + LOG_LOCK(); } - splx(s); logsoftc.sc_state &= ~LOG_RDWAIT; - while (uio->uio_resid > 0) { + + while (uio_resid(uio) > 0) { l = msgbufp->msg_bufx - msgbufp->msg_bufr; if (l < 0) l = MSG_BSIZE - msgbufp->msg_bufr; - l = min(l, uio->uio_resid); + l = min(l, uio_resid(uio)); if (l == 0) break; - error = uiomove((caddr_t)&msgbufp->msg_bufc[msgbufp->msg_bufr], + bcopy(&msgbufp->msg_bufc[msgbufp->msg_bufr], &localbuff[0], l); + LOG_UNLOCK(); + error = uiomove((caddr_t)&localbuff[0], (int)l, uio); + LOG_LOCK(); if (error) break; msgbufp->msg_bufr += l; if (msgbufp->msg_bufr < 0 || msgbufp->msg_bufr >= MSG_BSIZE) msgbufp->msg_bufr = 0; } +out: + LOG_UNLOCK(); return (error); } @@ -201,19 +221,19 @@ logselect(dev, rw, wql, p) void * wql; struct proc *p; { - int s = splhigh(); switch (rw) { case FREAD: + LOG_LOCK(); if (msgbufp->msg_bufr != msgbufp->msg_bufx) { - splx(s); + LOG_UNLOCK(); return (1); } selrecord(p, &logsoftc.sc_selp, wql); + LOG_UNLOCK(); break; } - splx(s); return (0); } @@ -224,24 +244,26 @@ logwakeup() int pgid; boolean_t funnel_state; - if (!log_open) + LOG_LOCK(); + if (!log_open) { + LOG_UNLOCK(); return; - funnel_state = thread_funnel_set(kernel_flock, TRUE); + } selwakeup(&logsoftc.sc_selp); if (logsoftc.sc_state & LOG_ASYNC) { - LOG_LOCK(); pgid = logsoftc.sc_pgid; LOG_UNLOCK(); if (pgid < 0) gsignal(-pgid, SIGIO); else if (p = pfind(pgid)) psignal(p, SIGIO); + LOG_LOCK(); } if (logsoftc.sc_state & LOG_RDWAIT) { wakeup((caddr_t)msgbufp); logsoftc.sc_state &= ~LOG_RDWAIT; } - (void) thread_funnel_set(kernel_flock, funnel_state); + LOG_UNLOCK(); } void @@ -262,13 +284,12 @@ logioctl(dev, com, data, flag) long l; int s; + LOG_LOCK(); switch (com) { /* return number of characters immediately available */ case FIONREAD: - s = splhigh(); l = msgbufp->msg_bufx - msgbufp->msg_bufr; - splx(s); if (l < 0) l += MSG_BSIZE; *(off_t *)data = l; @@ -289,28 +310,28 @@ logioctl(dev, com, data, flag) break; case TIOCSPGRP: - LOG_LOCK(); logsoftc.sc_pgid = *(int *)data; - LOG_UNLOCK(); break; case TIOCGPGRP: - LOG_LOCK(); *(int *)data = logsoftc.sc_pgid; - LOG_UNLOCK(); break; default: + LOG_UNLOCK(); return (-1); } + LOG_UNLOCK(); return (0); } void -log_init() +bsd_log_init() { - msgbufp = &temp_msgbuf; - LOG_LOCK_INIT(); + if (!log_inited) { + msgbufp = &temp_msgbuf; + log_inited = 1; + } } void @@ -318,8 +339,10 @@ log_putc(char c) { register struct msgbuf *mbp; - if (msgbufp == NULL) - msgbufp =&temp_msgbuf; + if (!log_inited) { + panic("bsd log is not inited"); + } + LOG_LOCK(); mbp = msgbufp; if (mbp-> msg_magic != MSG_MAGIC) { @@ -334,4 +357,6 @@ log_putc(char c) _logentrypend = 1; if (mbp->msg_bufx < 0 || mbp->msg_bufx >= MSG_BSIZE) mbp->msg_bufx = 0; + LOG_UNLOCK(); } + diff --git a/bsd/kern/subr_prf.c b/bsd/kern/subr_prf.c index 8bcf0e3ce..3b77fb394 100644 --- a/bsd/kern/subr_prf.c +++ b/bsd/kern/subr_prf.c @@ -77,14 +77,13 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/buf.h> #include <sys/conf.h> #include <sys/reboot.h> #include <sys/msgbuf.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/ioctl.h> #include <sys/tty.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/tprintf.h> #include <sys/syslog.h> #include <stdarg.h> @@ -127,8 +126,6 @@ extern int 
__doprnt(const char *fmt, static void puts(const char *s, int flags, struct tty *ttyp); static void printn(u_long n, int b, int flags, struct tty *ttyp, int zf, int fld_size); -/* MP printf stuff */ -decl_simple_lock_data(,printf_lock) #if NCPUS > 1 boolean_t new_printf_cpu_number; /* do we need to output who we are */ #endif @@ -299,7 +296,6 @@ int prf(const char *fmt, va_list ap, int flags, struct tty *ttyp) int cpun = cpu_number(); if(ttyp == 0) { - simple_lock(&printf_lock); } else TTY_LOCK(ttyp); @@ -317,7 +313,6 @@ int prf(const char *fmt, va_list ap, int flags, struct tty *ttyp) #if NCPUS > 1 if(ttyp == 0) { - simple_unlock(&printf_lock); } else TTY_UNLOCK(ttyp); #endif diff --git a/bsd/kern/subr_prof.c b/bsd/kern/subr_prof.c index 4ffac789b..9b3791f0a 100644 --- a/bsd/kern/subr_prof.c +++ b/bsd/kern/subr_prof.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -58,13 +58,20 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/user.h> #include <machine/spl.h> +#include <machine/machine_routines.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> +#include <sys/sysproto.h> +#include <mach/mach_types.h> +#include <kern/kern_types.h> #include <kern/cpu_number.h> +#include <kern/kalloc.h> + +extern boolean_t ml_set_interrupts_enabled(boolean_t enable); #ifdef GPROF #include <sys/malloc.h> @@ -72,18 +79,25 @@ #include <kern/mach_header.h> #include <machine/profile.h> -decl_simple_lock_data(,mcount_lock); +lck_spin_t * mcount_lock; +lck_grp_t * mcount_lock_grp; +lck_attr_t * mcount_lock_attr; /* * Froms is actually a bunch of unsigned shorts indexing tos */ struct gmonparam _gmonparam = { GMON_PROF_OFF }; -kmstartup() +/* + * This code uses 32 bit mach object segment information from the currently + * running kernel. + */ +void +kmstartup(void) { char *cp; u_long fromssize, tossize; - struct segment_command *sgp; + struct segment_command *sgp; /* 32 bit mach object file segment */ struct gmonparam *p = &_gmonparam; sgp = getsegbyname("__TEXT"); @@ -120,20 +134,20 @@ kmstartup() p->kcount = (u_short *)cp; cp += p->kcountsize; p->froms = (u_short *)cp; - simple_lock_init(&mcount_lock); + + mcount_lock_grp = lck_grp_alloc_init("MCOUNT", LCK_GRP_ATTR_NULL); + mcount_lock_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(mcount_lock_attr); + mcount_lock = lck_spin_alloc_init(mcount_lock_grp, mcount_lock_attr); + } /* * Return kernel profiling information. 
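 *
 * Reachable from user space through the kern.prof sysctl tree; a
 * hypothetical caller (illustrative only, not part of this change)
 * turns the kernel profile clock on by setting GPROF_STATE:
 *
 *	int mib[3] = { CTL_KERN, KERN_PROF, GPROF_STATE };
 *	int state = GMON_PROF_ON;
 *	sysctl(mib, 3, NULL, NULL, &state, sizeof(state));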
*/ int -sysctl_doprof(name, namelen, oldp, oldlenp, newp, newlen) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; +sysctl_doprof(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen) { struct gmonparam *gp = &_gmonparam; int error; @@ -153,18 +167,18 @@ sysctl_doprof(name, namelen, oldp, oldlenp, newp, newlen) startprofclock(kernproc); return (0); case GPROF_COUNT: - return (sysctl_struct(oldp, oldlenp, newp, newlen, - gp->kcount, gp->kcountsize)); + return (sysctl_struct(oldp, oldlenp, newp, newlen, + gp->kcount, gp->kcountsize)); case GPROF_FROMS: return (sysctl_struct(oldp, oldlenp, newp, newlen, - gp->froms, gp->fromssize)); + gp->froms, gp->fromssize)); case GPROF_TOS: return (sysctl_struct(oldp, oldlenp, newp, newlen, - gp->tos, gp->tossize)); + gp->tos, gp->tossize)); case GPROF_GMONPARAM: return (sysctl_rdstruct(oldp, oldlenp, newp, gp, sizeof *gp)); default: - return (EOPNOTSUPP); + return (ENOTSUP); } /* NOTREACHED */ } @@ -191,7 +205,7 @@ mcount( if (p->state != GMON_PROF_ON) return; - usimple_lock(&mcount_lock); + lck_spin_lock(mcount_lock); /* * check that frompcindex is a reasonable pc value. @@ -274,36 +288,26 @@ mcount( } done: - usimple_unlock(&mcount_lock); + lck_spin_unlock(mcount_lock); return; overflow: p->state = GMON_PROF_ERROR; - usimple_unlock(&mcount_lock); + lck_spin_unlock(mcount_lock); printf("mcount: tos overflow\n"); return; } #endif /* GPROF */ -#define PROFILE_LOCK(x) simple_lock(x) -#define PROFILE_UNLOCK(x) simple_unlock(x) +#define PROFILE_LOCK(x) +#define PROFILE_UNLOCK(x) -struct profil_args { - short *bufbase; - u_int bufsize; - u_int pcoffset; - u_int pcscale; -}; int -profil(p, uap, retval) - struct proc *p; - register struct profil_args *uap; - register_t *retval; +profil(struct proc *p, register struct profil_args *uap, __unused register_t *retval) { - register struct uprof *upp = &p->p_stats->p_prof; - struct uprof *upc, *nupc; - int s; + struct uprof *upp = &p->p_stats->p_prof; + int s; if (uap->pcscale > (1 << 16)) return (EINVAL); @@ -313,53 +317,95 @@ profil(p, uap, retval) } /* Block profile interrupts while changing state. 
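 *
 * PROFILE_LOCK()/PROFILE_UNLOCK() compile to nothing in this
 * revision; mutual exclusion here relies on disabling interrupts
 * around the state change:
 *
 *	s = ml_set_interrupts_enabled(FALSE);
 *	... update p_prof (or user_p_prof for 64-bit processes) ...
 *	ml_set_interrupts_enabled(s);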
*/ - s = ml_set_interrupts_enabled(FALSE); - PROFILE_LOCK(&upp->pr_lock); - upp->pr_base = (caddr_t)uap->bufbase; - upp->pr_size = uap->bufsize; - upp->pr_off = uap->pcoffset; - upp->pr_scale = uap->pcscale; - - /* remove buffers previously allocated with add_profil() */ - for (upc = upp->pr_next; upc; upc = nupc) { - nupc = upc->pr_next; - kfree(upc, sizeof (struct uprof)); + s = ml_set_interrupts_enabled(FALSE); + + if (proc_is64bit(p)) { + struct user_uprof *user_upp = &p->p_stats->user_p_prof; + struct user_uprof *upc, *nupc; + + PROFILE_LOCK(&user_upp->pr_lock); + user_upp->pr_base = uap->bufbase; + user_upp->pr_size = uap->bufsize; + user_upp->pr_off = uap->pcoffset; + user_upp->pr_scale = uap->pcscale; + upp->pr_base = NULL; + upp->pr_size = 0; + upp->pr_scale = 0; + + /* remove buffers previously allocated with add_profil() */ + for (upc = user_upp->pr_next; upc; upc = nupc) { + nupc = upc->pr_next; + kfree(upc, sizeof (*upc)); + } + user_upp->pr_next = 0; + PROFILE_UNLOCK(&user_upp->pr_lock); + } + else { + struct uprof *upc, *nupc; + + PROFILE_LOCK(&upp->pr_lock); + upp->pr_base = CAST_DOWN(caddr_t, uap->bufbase); + upp->pr_size = uap->bufsize; + upp->pr_off = uap->pcoffset; + upp->pr_scale = uap->pcscale; + + /* remove buffers previously allocated with add_profil() */ + for (upc = upp->pr_next; upc; upc = nupc) { + nupc = upc->pr_next; + kfree(upc, sizeof (struct uprof)); + } + upp->pr_next = 0; + PROFILE_UNLOCK(&upp->pr_lock); } - upp->pr_next = 0; - PROFILE_UNLOCK(&upp->pr_lock); startprofclock(p); ml_set_interrupts_enabled(s); return(0); } -struct add_profile_args { - short *bufbase; - u_int bufsize; - u_int pcoffset; - u_int pcscale; -}; int -add_profil(p, uap, retval) - struct proc *p; - register struct add_profile_args *uap; - register_t *retval; +add_profil(struct proc *p, register struct add_profil_args *uap, __unused register_t *retval) { struct uprof *upp = &p->p_stats->p_prof, *upc; + struct user_uprof *user_upp = NULL, *user_upc; int s; + boolean_t is64bit = proc_is64bit(p); - if (upp->pr_scale == 0) - return (0); - s = ml_set_interrupts_enabled(FALSE); - upc = (struct uprof *) kalloc(sizeof (struct uprof)); - upc->pr_base = (caddr_t)uap->bufbase; - upc->pr_size = uap->bufsize; - upc->pr_off = uap->pcoffset; - upc->pr_scale = uap->pcscale; - PROFILE_LOCK(&upp->pr_lock); - upc->pr_next = upp->pr_next; - upp->pr_next = upc; - PROFILE_UNLOCK(&upp->pr_lock); + if (is64bit) { + user_upp = &p->p_stats->user_p_prof; + if (user_upp->pr_scale == 0) + return (0); + } + else { + if (upp->pr_scale == 0) + return (0); + } + + s = ml_set_interrupts_enabled(FALSE); + + if (is64bit) { + user_upc = (struct user_uprof *) kalloc(sizeof (struct user_uprof)); + user_upc->pr_base = uap->bufbase; + user_upc->pr_size = uap->bufsize; + user_upc->pr_off = uap->pcoffset; + user_upc->pr_scale = uap->pcscale; + PROFILE_LOCK(&user_upp->pr_lock); + user_upc->pr_next = user_upp->pr_next; + user_upp->pr_next = user_upc; + PROFILE_UNLOCK(&user_upp->pr_lock); + } + else { + upc = (struct uprof *) kalloc(sizeof (struct uprof)); + upc->pr_base = CAST_DOWN(caddr_t, uap->bufbase); + upc->pr_size = uap->bufsize; + upc->pr_off = uap->pcoffset; + upc->pr_scale = uap->pcscale; + PROFILE_LOCK(&upp->pr_lock); + upc->pr_next = upp->pr_next; + upp->pr_next = upc; + PROFILE_UNLOCK(&upp->pr_lock); + } + ml_set_interrupts_enabled(s); return(0); } @@ -390,11 +436,9 @@ add_profil(p, uap, retval) void addupc_task(p, pc, ticks) register struct proc *p; - register u_long pc; + user_addr_t pc; u_int ticks; { - register 
struct uprof *prof; - register short *cell; register u_int off; u_short count; @@ -402,19 +446,44 @@ addupc_task(p, pc, ticks) if ((p->p_flag & P_PROFIL) == 0 || ticks == 0) return; - for (prof = &p->p_stats->p_prof; prof; prof = prof->pr_next) { - off = PC_TO_INDEX(pc,prof); - cell = (short *)(prof->pr_base + off); - if (cell >= (short *)prof->pr_base && - cell < (short*)(prof->pr_size + (int) prof->pr_base)) { - if (copyin((caddr_t)cell, (caddr_t) &count, sizeof(count)) == 0) { - count += ticks; - if(copyout((caddr_t) &count, (caddr_t)cell, sizeof(count)) == 0) - return; - } - p->p_stats->p_prof.pr_scale = 0; - stopprofclock(p); - break; - } + if (proc_is64bit(p)) { + struct user_uprof *prof; + user_addr_t cell; + + for (prof = &p->p_stats->user_p_prof; prof; prof = prof->pr_next) { + off = PC_TO_INDEX(pc, prof); + cell = (prof->pr_base + off); + if (cell >= prof->pr_base && + cell < (prof->pr_size + prof->pr_base)) { + if (copyin(cell, (caddr_t) &count, sizeof(count)) == 0) { + count += ticks; + if(copyout((caddr_t) &count, cell, sizeof(count)) == 0) + return; + } + p->p_stats->user_p_prof.pr_scale = 0; + stopprofclock(p); + break; + } + } + } + else { + struct uprof *prof; + short *cell; + + for (prof = &p->p_stats->p_prof; prof; prof = prof->pr_next) { + off = PC_TO_INDEX(CAST_DOWN(uint, pc),prof); + cell = (short *)(prof->pr_base + off); + if (cell >= (short *)prof->pr_base && + cell < (short*)(prof->pr_size + (int) prof->pr_base)) { + if (copyin(CAST_USER_ADDR_T(cell), (caddr_t) &count, sizeof(count)) == 0) { + count += ticks; + if(copyout((caddr_t) &count, CAST_USER_ADDR_T(cell), sizeof(count)) == 0) + return; + } + p->p_stats->p_prof.pr_scale = 0; + stopprofclock(p); + break; + } + } } } diff --git a/bsd/kern/subr_xxx.c b/bsd/kern/subr_xxx.c index 503401a6c..393c07142 100644 --- a/bsd/kern/subr_xxx.c +++ b/bsd/kern/subr_xxx.c @@ -58,8 +58,7 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/conf.h> -#include <sys/buf.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/vnode.h> #include <sys/uio.h> @@ -114,11 +113,13 @@ enosys() /* * Return error for operation not supported * on a specific object or file type. + * + * XXX Name of this routine is wrong. 
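 *
 * The routine keeps its historical name but now returns ENOTSUP
 * rather than EOPNOTSUPP; Darwin defines the two as distinct errno
 * values, and this patch standardizes the fileops stubs on ENOTSUP.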
*/ int eopnotsupp() { - return (EOPNOTSUPP); + return (ENOTSUP); } /* diff --git a/bsd/kern/sys_domain.c b/bsd/kern/sys_domain.c index ab3f62847..244d2ed3c 100644 --- a/bsd/kern/sys_domain.c +++ b/bsd/kern/sys_domain.c @@ -33,11 +33,10 @@ /* domain init function */ -void systemdomain_init(); - +void systemdomain_init(void); struct domain systemdomain = - { PF_SYSTEM, "system", systemdomain_init, 0, 0, 0}; + { PF_SYSTEM, "system", systemdomain_init, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, { 0, 0} }; void systemdomain_init() diff --git a/bsd/kern/sys_generic.c b/bsd/kern/sys_generic.c index 15fdf111f..72169c03d 100644 --- a/bsd/kern/sys_generic.c +++ b/bsd/kern/sys_generic.c @@ -64,27 +64,41 @@ #include <sys/systm.h> #include <sys/filedesc.h> #include <sys/ioctl.h> -#include <sys/file.h> -#include <sys/proc.h> +#include <sys/file_internal.h> +#include <sys/proc_internal.h> #include <sys/socketvar.h> +#if KTRACE +#include <sys/uio_internal.h> +#else #include <sys/uio.h> +#endif #include <sys/kernel.h> #include <sys/stat.h> #include <sys/malloc.h> +#include <sys/sysproto.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> #include <sys/protosw.h> #include <sys/ev.h> #include <sys/user.h> #include <sys/kdebug.h> +#include <sys/poll.h> +#include <sys/event.h> +#include <sys/eventvar.h> + +#include <mach/mach_types.h> +#include <kern/kern_types.h> #include <kern/assert.h> -#include <kern/thread_act.h> +#include <kern/kalloc.h> +#include <kern/thread.h> +#include <kern/clock.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/errno.h> #include <sys/syscall.h> +#include <sys/pipe.h> #include <bsm/audit_kernel.h> @@ -106,259 +120,325 @@ #include <netinet/tcp_debug.h> /* for wait queue based select */ #include <kern/wait_queue.h> +#include <kern/kalloc.h> #if KTRACE #include <sys/ktrace.h> #endif -#include <sys/vnode.h> - +#include <sys/vnode_internal.h> + +int rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval); +int wr_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval); +extern void *get_bsduthreadarg(thread_t); +extern int *get_bsduthreadrval(thread_t); + +__private_extern__ int dofileread(struct proc *p, struct fileproc *fp, int fd, + user_addr_t bufp, user_size_t nbyte, + off_t offset, int flags, user_ssize_t *retval); +__private_extern__ int dofilewrite(struct proc *p, struct fileproc *fp, int fd, + user_addr_t bufp, user_size_t nbyte, + off_t offset, int flags, user_ssize_t *retval); +__private_extern__ int preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_vnode); +__private_extern__ void donefileread(struct proc *p, struct fileproc *fp_ret, int fd); -__private_extern__ struct file* -holdfp(fdp, fd, flag) - struct filedesc* fdp; - int fd, flag; -{ - struct file* fp; - - if (((u_int)fd) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL || - (fp->f_flag & flag) == 0) { - return (NULL); - } - if (fref(fp) == -1) - return (NULL); - return (fp); -} +#if NETAT +extern int appletalk_inited; +#endif /* NETAT */ +#define f_flag f_fglob->fg_flag +#define f_type f_fglob->fg_type +#define f_msgcount f_fglob->fg_msgcount +#define f_cred f_fglob->fg_cred +#define f_ops f_fglob->fg_ops +#define f_offset f_fglob->fg_offset +#define f_data f_fglob->fg_data /* * Read system call. 
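 *
 * The old holdfp() helper is replaced by a preparefileread()/
 * donefileread() bracket that takes and drops a fileproc reference
 * under proc_fdlock(); every caller now has the shape:
 *
 *	if ( (error = preparefileread(p, &fp, fd, 0)) )
 *		return (error);
 *	error = dofileread(p, fp, fd, ...);
 *	donefileread(p, fp, fd);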
*/ -#ifndef _SYS_SYSPROTO_H_ -struct read_args { - int fd; - char *cbuf; - u_int nbyte; -}; -#endif int read(p, uap, retval) struct proc *p; register struct read_args *uap; - register_t *retval; + user_ssize_t *retval; { - register struct file *fp; + struct fileproc *fp; int error; + int fd = uap->fd; + + if ( (error = preparefileread(p, &fp, fd, 0)) ) + return (error); - if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL) - return (EBADF); error = dofileread(p, fp, uap->fd, uap->cbuf, uap->nbyte, - (off_t)-1, 0, retval); - frele(fp); - return(error); + (off_t)-1, 0, retval); + + donefileread(p, fp, fd); + + return (error); } /* * Pread system call */ -#ifndef _SYS_SYSPROTO_H_ -struct pread_args { - int fd; - void *buf; - size_t nbyte; -#ifdef DOUBLE_ALIGN_PARAMS - int pad; -#endif - off_t offset; -}; -#endif int pread(p, uap, retval) struct proc *p; register struct pread_args *uap; - int *retval; + user_ssize_t *retval; { - register struct file *fp; + struct fileproc *fp; + int fd = uap->fd; int error; - if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL) - return (EBADF); - if (fp->f_type != DTYPE_VNODE) { - error = ESPIPE; - } else { - error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, - uap->offset, FOF_OFFSET, retval); - } - frele(fp); + if ( (error = preparefileread(p, &fp, fd, 1)) ) + return (error); + + error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, + uap->offset, FOF_OFFSET, retval); + donefileread(p, fp, fd); + if (!error) KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE), uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0); - return(error); + return (error); } /* * Code common for read and pread */ + +void +donefileread(struct proc *p, struct fileproc *fp, int fd) +{ + proc_fdlock(p); + + fp->f_flags &= ~FP_INCHRREAD; + + fp_drop(p, fd, fp, 1); + proc_fdunlock(p); +} + +int +preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pread) +{ + vnode_t vp; + int error; + struct fileproc *fp; + + proc_fdlock(p); + + error = fp_lookup(p, fd, &fp, 1); + + if (error) { + proc_fdunlock(p); + return (error); + } + if ((fp->f_flag & FREAD) == 0) { + error = EBADF; + goto out; + } + if (check_for_pread && (fp->f_type != DTYPE_VNODE)) { + error = ESPIPE; + goto out; + } + if (fp->f_type == DTYPE_VNODE) { + vp = (struct vnode *)fp->f_fglob->fg_data; + + if (vp->v_type == VCHR) + fp->f_flags |= FP_INCHRREAD; + } + + *fp_ret = fp; + + proc_fdunlock(p); + return (0); + +out: + fp_drop(p, fd, fp, 1); + proc_fdunlock(p); + return (error); +} + + __private_extern__ int -dofileread(p, fp, fd, buf, nbyte, offset, flags, retval) +dofileread(p, fp, fd, bufp, nbyte, offset, flags, retval) struct proc *p; - struct file *fp; + struct fileproc *fp; int fd, flags; - void *buf; - size_t nbyte; + user_addr_t bufp; + user_size_t nbyte; off_t offset; - int *retval; + user_ssize_t *retval; { - struct uio auio; - struct iovec aiov; - long cnt, error = 0; + uio_t auio; + user_ssize_t bytecnt; + long error = 0; + char uio_buf[ UIO_SIZEOF(1) ]; #if KTRACE - struct iovec ktriov; - struct uio ktruio; + uio_t ktruio = NULL; + char ktr_uio_buf[ UIO_SIZEOF(1) ]; int didktr = 0; #endif - aiov.iov_base = (caddr_t)buf; - aiov.iov_len = nbyte; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_offset = offset; + // LP64todo - do we want to raise this? 
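+	/*
+	 * [editor's aside, not part of the original patch] A minimal sketch
+	 * of the stack-buffer uio pattern used below, with names taken from
+	 * this hunk: UIO_SIZEOF(1) is sized for a uio carrying one iovec
+	 * (UIO_USERSPACE64 is chosen instead for a 64-bit process), so the
+	 * common single-buffer read costs no allocation:
+	 *
+	 *   char uio_buf[ UIO_SIZEOF(1) ];
+	 *   uio_t auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32,
+	 *                                     UIO_READ, &uio_buf[0], sizeof(uio_buf));
+	 *   uio_addiov(auio, bufp, nbyte);
+	 *   error = fo_read(fp, auio, fp->f_cred, flags, p);
+	 *   bytecnt = nbyte - uio_resid(auio);   // bytes actually transferred
+	 */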
 	if (nbyte > INT_MAX)
 		return (EINVAL);
-	auio.uio_resid = nbyte;
-	auio.uio_rw = UIO_READ;
-	auio.uio_segflg = UIO_USERSPACE;
-	auio.uio_procp = p;
+
+	if (IS_64BIT_PROCESS(p)) {
+		auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
+					  &uio_buf[0], sizeof(uio_buf));
+	} else {
+		auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
+					  &uio_buf[0], sizeof(uio_buf));
+	}
+	uio_addiov(auio, bufp, nbyte);
+
 #if KTRACE
 	/*
 	 * if tracing, save a copy of iovec
 	 */
 	if (KTRPOINT(p, KTR_GENIO)) {
-		ktriov = aiov;
-		ktruio = auio;
 		didktr = 1;
+
+		if (IS_64BIT_PROCESS(p)) {
+			ktruio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
+						  &ktr_uio_buf[0], sizeof(ktr_uio_buf));
+		} else {
+			ktruio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
+						  &ktr_uio_buf[0], sizeof(ktr_uio_buf));
+		}
+		uio_addiov(ktruio, bufp, nbyte);
 	}
 #endif
-	cnt = nbyte;
+	bytecnt = nbyte;
 
-	if ((error = fo_read(fp, &auio, fp->f_cred, flags, p))) {
-		if (auio.uio_resid != cnt && (error == ERESTART ||
+	if ((error = fo_read(fp, auio, fp->f_cred, flags, p))) {
+		if (uio_resid(auio) != bytecnt && (error == ERESTART ||
			error == EINTR || error == EWOULDBLOCK))
			error = 0;
 	}
-	cnt -= auio.uio_resid;
+	bytecnt -= uio_resid(auio);
 #if KTRACE
 	if (didktr && error == 0) {
-		ktruio.uio_iov = &ktriov;
-		ktruio.uio_resid = cnt;
-		ktrgenio(p->p_tracep, fd, UIO_READ, &ktruio, error,
-		    KERNEL_FUNNEL);
+		uio_setresid(ktruio, bytecnt);
+		ktrgenio(p->p_tracep, fd, UIO_READ, ktruio, error);
 	}
 #endif
-	*retval = cnt;
+
+	*retval = bytecnt;
+
 	return (error);
 }
 
 /*
  * Scatter read system call.
  */
-#ifndef _SYS_SYSPROTO_H_
-struct readv_args {
-	int fd;
-	struct iovec *iovp;
-	u_int iovcnt;
-};
-#endif
 int
 readv(p, uap, retval)
 	struct proc *p;
 	register struct readv_args *uap;
-	int *retval;
+	user_ssize_t *retval;
 {
-	struct uio auio;
-	register struct iovec *iov;
+	uio_t auio = NULL;
 	int error;
-	struct iovec aiov[UIO_SMALLIOV];
-
-	if (uap->iovcnt > UIO_SMALLIOV) {
-		if (uap->iovcnt > UIO_MAXIOV)
-			return (EINVAL);
-		if ((iov = (struct iovec *)
-			kalloc(sizeof(struct iovec) * (uap->iovcnt))) == 0)
-			return (ENOMEM);
-	} else
-		iov = aiov;
-	auio.uio_iov = iov;
-	auio.uio_iovcnt = uap->iovcnt;
-	auio.uio_rw = UIO_READ;
-	error = copyin((caddr_t)uap->iovp, (caddr_t)iov,
-	    uap->iovcnt * sizeof (struct iovec));
-	if (!error)
-		error = rwuio(p, uap->fd, &auio, UIO_READ, retval);
-	if (uap->iovcnt > UIO_SMALLIOV)
-		kfree(iov, sizeof(struct iovec)*uap->iovcnt);
+	int size_of_iovec;
+	struct user_iovec *iovp;
+
+	/* Verify range before calling uio_create() */
+	if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
+		return (EINVAL);
+
+	/* allocate a uio large enough to hold the number of iovecs passed */
+	auio = uio_create(uap->iovcnt, 0,
+			  (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
+			  UIO_READ);
+
+	/* get location of iovecs within the uio.  then copyin the iovecs from
+	 * user space.
+	 */
+	iovp = uio_iovsaddr(auio);
+	if (iovp == NULL) {
+		error = ENOMEM;
+		goto ExitThisRoutine;
+	}
+	size_of_iovec = (IS_64BIT_PROCESS(p) ?
sizeof(struct user_iovec) : sizeof(struct iovec)); + error = copyin(uap->iovp, (caddr_t)iovp, (uap->iovcnt * size_of_iovec)); + if (error) { + goto ExitThisRoutine; + } + + /* finalize uio_t for use and do the IO + */ + uio_calculateresid(auio); + error = rd_uio(p, uap->fd, auio, retval); + +ExitThisRoutine: + if (auio != NULL) { + uio_free(auio); + } return (error); } /* * Write system call */ -#ifndef _SYS_SYSPROTO_H_ -struct write_args { - int fd; - char *cbuf; - u_int nbyte; -}; -#endif int write(p, uap, retval) struct proc *p; register struct write_args *uap; - int *retval; + user_ssize_t *retval; { - register struct file *fp; + struct fileproc *fp; int error; + int fd = uap->fd; - if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL) - return (EBADF); - error = dofilewrite(p, fp, uap->fd, uap->cbuf, uap->nbyte, + error = fp_lookup(p,fd,&fp,0); + if (error) + return(error); + if ((fp->f_flag & FWRITE) == 0) { + error = EBADF; + } else { + error = dofilewrite(p, fp, uap->fd, uap->cbuf, uap->nbyte, (off_t)-1, 0, retval); - frele(fp); + } + if (error == 0) + fp_drop_written(p, fd, fp); + else + fp_drop(p, fd, fp, 0); return(error); } /* - * Pwrite system call + * pwrite system call */ -#ifndef _SYS_SYSPROTO_H_ -struct pwrite_args { - int fd; - const void *buf; - size_t nbyte; -#ifdef DOUBLE_ALIGN_PARAMS - int pad; -#endif - off_t offset; -}; -#endif int pwrite(p, uap, retval) struct proc *p; register struct pwrite_args *uap; - int *retval; + user_ssize_t *retval; { - register struct file *fp; + struct fileproc *fp; int error; + int fd = uap->fd; + + error = fp_lookup(p,fd,&fp,0); + if (error) + return(error); - if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL) - return (EBADF); - if (fp->f_type != DTYPE_VNODE) { - error = ESPIPE; - } else { - error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, - uap->offset, FOF_OFFSET, retval); + if ((fp->f_flag & FWRITE) == 0) { + error = EBADF; + } else { + if (fp->f_type != DTYPE_VNODE) { + error = ESPIPE; + } else { + error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, + uap->offset, FOF_OFFSET, retval); + } } - frele(fp); + if (error == 0) + fp_drop_written(p, fd, fp); + else + fp_drop(p, fd, fp, 0); if (!error) KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE), @@ -368,231 +448,295 @@ pwrite(p, uap, retval) } __private_extern__ int -dofilewrite(p, fp, fd, buf, nbyte, offset, flags, retval) +dofilewrite(p, fp, fd, bufp, nbyte, offset, flags, retval) struct proc *p; - struct file *fp; + struct fileproc *fp; int fd, flags; - const void *buf; - size_t nbyte; + user_addr_t bufp; + user_size_t nbyte; off_t offset; - int *retval; + user_ssize_t *retval; { - struct uio auio; - struct iovec aiov; - long cnt, error = 0; + uio_t auio; + long error = 0; + user_ssize_t bytecnt; + char uio_buf[ UIO_SIZEOF(1) ]; #if KTRACE - struct iovec ktriov; - struct uio ktruio; + uio_t ktruio; int didktr = 0; + char ktr_uio_buf[ UIO_SIZEOF(1) ]; #endif - - aiov.iov_base = (void *)(uintptr_t)buf; - aiov.iov_len = nbyte; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_offset = offset; + + // LP64todo - do we want to raise this? 
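+	/*
+	 * [editor's aside, not part of the original patch] A hedged sketch of
+	 * the KTRACE shadow-uio pattern used below: fo_write() consumes auio
+	 * as it transfers data, so the tracer is given its own untouched uio
+	 * built over the same user buffer, with only the final resid patched
+	 * in afterwards:
+	 *
+	 *   ktruio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
+	 *                                 &ktr_uio_buf[0], sizeof(ktr_uio_buf));
+	 *   uio_addiov(ktruio, bufp, nbyte);   // shadow copy for the tracer
+	 *   ...
+	 *   uio_setresid(ktruio, bytecnt);     // bytes actually written
+	 *   ktrgenio(p->p_tracep, fd, UIO_WRITE, ktruio, error);
+	 */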
 	if (nbyte > INT_MAX)
 		return (EINVAL);
-	auio.uio_resid = nbyte;
-	auio.uio_rw = UIO_WRITE;
-	auio.uio_segflg = UIO_USERSPACE;
-	auio.uio_procp = p;
+
+	if (IS_64BIT_PROCESS(p)) {
+		auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
+					  &uio_buf[0], sizeof(uio_buf));
+	} else {
+		auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
+					  &uio_buf[0], sizeof(uio_buf));
+	}
+	uio_addiov(auio, bufp, nbyte);
+
 #if KTRACE
 	/*
 	 * if tracing, save a copy of iovec and uio
 	 */
 	if (KTRPOINT(p, KTR_GENIO)) {
-		ktriov = aiov;
-		ktruio = auio;
 		didktr = 1;
+
+		if (IS_64BIT_PROCESS(p)) {
+			ktruio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
+						  &ktr_uio_buf[0], sizeof(ktr_uio_buf));
+		} else {
+			ktruio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
+						  &ktr_uio_buf[0], sizeof(ktr_uio_buf));
+		}
+		uio_addiov(ktruio, bufp, nbyte);
 	}
 #endif
-	cnt = nbyte;
-	if (fp->f_type == DTYPE_VNODE)
-		bwillwrite();
-	if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) {
-		if (auio.uio_resid != cnt && (error == ERESTART ||
+	bytecnt = nbyte;
+	if ((error = fo_write(fp, auio, fp->f_cred, flags, p))) {
+		if (uio_resid(auio) != bytecnt && (error == ERESTART ||
			error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* The socket layer handles SIGPIPE */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET)
			psignal(p, SIGPIPE);
 	}
-	cnt -= auio.uio_resid;
+	bytecnt -= uio_resid(auio);
 #if KTRACE
 	if (didktr && error == 0) {
-		ktruio.uio_iov = &ktriov;
-		ktruio.uio_resid = cnt;
-		ktrgenio(p->p_tracep, fd, UIO_WRITE, &ktruio, error,
-		    KERNEL_FUNNEL);
+		uio_setresid(ktruio, bytecnt);
+		ktrgenio(p->p_tracep, fd, UIO_WRITE, ktruio, error);
 	}
 #endif
-	*retval = cnt;
+	*retval = bytecnt;
+
 	return (error);
 }
 
 /*
  * Gather write system call
  */
-#ifndef _SYS_SYSPROTO_H_
-struct writev_args {
-	int fd;
-	struct iovec *iovp;
-	u_int iovcnt;
-};
-#endif
 int
 writev(p, uap, retval)
 	struct proc *p;
 	register struct writev_args *uap;
-	int *retval;
+	user_ssize_t *retval;
 {
-	struct uio auio;
-	register struct iovec *iov;
+	uio_t auio = NULL;
 	int error;
-	struct iovec aiov[UIO_SMALLIOV];
-
-	if (uap->iovcnt > UIO_SMALLIOV) {
-		if (uap->iovcnt > UIO_MAXIOV)
-			return (EINVAL);
-		if ((iov = (struct iovec *)
-			kalloc(sizeof(struct iovec) * (uap->iovcnt))) == 0)
-			return (ENOMEM);
-	} else
-		iov = aiov;
-	auio.uio_iov = iov;
-	auio.uio_iovcnt = uap->iovcnt;
-	auio.uio_rw = UIO_WRITE;
-	error = copyin((caddr_t)uap->iovp, (caddr_t)iov,
-	    uap->iovcnt * sizeof (struct iovec));
-	if (!error)
-		error = rwuio(p, uap->fd, &auio, UIO_WRITE, retval);
-	if (uap->iovcnt > UIO_SMALLIOV)
-		kfree(iov, sizeof(struct iovec)*uap->iovcnt);
+	int size_of_iovec;
+	struct user_iovec *iovp;
+
+	/* Verify range before calling uio_create() */
+	if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
+		return (EINVAL);
+
+	/* allocate a uio large enough to hold the number of iovecs passed */
+	auio = uio_create(uap->iovcnt, 0,
+			  (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
+			  UIO_WRITE);
+
+	/* get location of iovecs within the uio.  then copyin the iovecs from
+	 * user space.
+	 */
+	iovp = uio_iovsaddr(auio);
+	if (iovp == NULL) {
+		error = ENOMEM;
+		goto ExitThisRoutine;
+	}
+	size_of_iovec = (IS_64BIT_PROCESS(p) ?
sizeof(struct user_iovec) : sizeof(struct iovec)); + error = copyin(uap->iovp, (caddr_t)iovp, (uap->iovcnt * size_of_iovec)); + if (error) { + goto ExitThisRoutine; + } + + /* finalize uio_t for use and do the IO + */ + uio_calculateresid(auio); + error = wr_uio(p, uap->fd, auio, retval); + +ExitThisRoutine: + if (auio != NULL) { + uio_free(auio); + } return (error); } + int -rwuio(p, fdes, uio, rw, retval) +wr_uio(p, fdes, uio, retval) struct proc *p; int fdes; - register struct uio *uio; - enum uio_rw rw; - int *retval; + register uio_t uio; + user_ssize_t *retval; { - struct file *fp; - register struct iovec *iov; - int i, count, flag, error; + struct fileproc *fp; + int error; + user_ssize_t count; #if KTRACE - struct iovec *ktriov; + struct iovec_64 *ktriov = NULL; struct uio ktruio; int didktr = 0; u_int iovlen; #endif - if (error = fdgetf(p, fdes, &fp)) - return (error); + error = fp_lookup(p,fdes,&fp,0); + if (error) + return(error); - if ((fp->f_flag&(rw==UIO_READ ? FREAD : FWRITE)) == 0) { - return(EBADF); + if ((fp->f_flag & FWRITE) == 0) { + error = EBADF; + goto out; } - uio->uio_resid = 0; - uio->uio_segflg = UIO_USERSPACE; - uio->uio_procp = p; - iov = uio->uio_iov; - for (i = 0; i < uio->uio_iovcnt; i++) { - if (iov->iov_len < 0) { - return(EINVAL); + count = uio_resid(uio); +#if KTRACE + /* + * if tracing, save a copy of iovec + */ + if (KTRPOINT(p, KTR_GENIO)) { + iovlen = uio->uio_iovcnt * + (IS_64BIT_PROCESS(p) ? sizeof (struct iovec_64) : sizeof (struct iovec_32)); + MALLOC(ktriov, struct iovec_64 *, iovlen, M_TEMP, M_WAITOK); + if (ktriov != NULL) { + bcopy((caddr_t)uio->uio_iovs.iov64p, (caddr_t)ktriov, iovlen); + ktruio = *uio; + didktr = 1; } - uio->uio_resid += iov->iov_len; - if (uio->uio_resid < 0) { - return(EINVAL); + } +#endif + error = fo_write(fp, uio, fp->f_cred, 0, p); + if (error) { + if (uio_resid(uio) != count && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + /* The socket layer handles SIGPIPE */ + if (error == EPIPE && fp->f_type != DTYPE_SOCKET) + psignal(p, SIGPIPE); + } + *retval = count - uio_resid(uio); + +#if KTRACE + if (didktr) { + if (error == 0) { + ktruio.uio_iovs.iov64p = ktriov; + uio_setresid(&ktruio, *retval); + ktrgenio(p->p_tracep, fdes, UIO_WRITE, &ktruio, error); } - iov++; + FREE(ktriov, M_TEMP); } - count = uio->uio_resid; +#endif + +out: + if ( (error == 0) ) + fp_drop_written(p, fdes, fp); + else + fp_drop(p, fdes, fp, 0); + return(error); +} + + +int +rd_uio(p, fdes, uio, retval) + struct proc *p; + int fdes; + register uio_t uio; + user_ssize_t *retval; +{ + struct fileproc *fp; + int error; + user_ssize_t count; +#if KTRACE + struct iovec_64 *ktriov = NULL; + struct uio ktruio; + int didktr = 0; + u_int iovlen; +#endif + + if ( (error = preparefileread(p, &fp, fdes, 0)) ) + return (error); + + count = uio_resid(uio); #if KTRACE /* * if tracing, save a copy of iovec */ if (KTRPOINT(p, KTR_GENIO)) { - iovlen = uio->uio_iovcnt * sizeof (struct iovec); - MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); - bcopy((caddr_t)uio->uio_iov, (caddr_t)ktriov, iovlen); - ktruio = *uio; - didktr = 1; + iovlen = uio->uio_iovcnt * + (IS_64BIT_PROCESS(p) ? 
sizeof (struct iovec_64) : sizeof (struct iovec_32)); + MALLOC(ktriov, struct iovec_64 *, iovlen, M_TEMP, M_WAITOK); + if (ktriov != NULL) { + bcopy((caddr_t)uio->uio_iovs.iov64p, (caddr_t)ktriov, iovlen); + ktruio = *uio; + didktr = 1; + } } #endif + error = fo_read(fp, uio, fp->f_cred, 0, p); - if (rw == UIO_READ) { - if (error = fo_read(fp, uio, fp->f_cred, 0, p)) - if (uio->uio_resid != count && (error == ERESTART || - error == EINTR || error == EWOULDBLOCK)) - error = 0; - } else { - if (fp->f_type == DTYPE_VNODE) - bwillwrite(); - if (error = fo_write(fp, uio, fp->f_cred, 0, p)) { - if (uio->uio_resid != count && (error == ERESTART || - error == EINTR || error == EWOULDBLOCK)) - error = 0; - /* The socket layer handles SIGPIPE */ - if (error == EPIPE && fp->f_type != DTYPE_SOCKET) - psignal(p, SIGPIPE); - } + if (error) { + if (uio_resid(uio) != count && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; } - - *retval = count - uio->uio_resid; + *retval = count - uio_resid(uio); #if KTRACE if (didktr) { if (error == 0) { - ktruio.uio_iov = ktriov; - ktruio.uio_resid = *retval; - ktrgenio(p->p_tracep, fdes, rw, &ktruio, error, - KERNEL_FUNNEL); + ktruio.uio_iovs.iov64p = ktriov; + uio_setresid(&ktruio, *retval); + ktrgenio(p->p_tracep, fdes, UIO_READ, &ktruio, error); } FREE(ktriov, M_TEMP); } #endif + donefileread(p, fp, fdes); - return(error); + return (error); } /* * Ioctl system call + * */ -#ifndef _SYS_SYSPROTO_H_ -struct ioctl_args { - int fd; - u_long com; - caddr_t data; -}; -#endif int -ioctl(p, uap, retval) - struct proc *p; - register struct ioctl_args *uap; - register_t *retval; +ioctl(struct proc *p, register struct ioctl_args *uap, __unused register_t *retval) { - struct file *fp; + struct fileproc *fp; register u_long com; - register int error; + int error = 0; register u_int size; - caddr_t data, memp; + caddr_t datap, memp; + boolean_t is64bit; int tmp; #define STK_PARAMS 128 char stkbuf[STK_PARAMS]; + int fd = uap->fd; AUDIT_ARG(fd, uap->fd); - AUDIT_ARG(cmd, uap->com); /* XXX cmd is int, uap->com is long */ + AUDIT_ARG(cmd, CAST_DOWN(int, uap->com)); /* LP64todo: uap->com is a user-land long */ AUDIT_ARG(addr, uap->data); - if (error = fdgetf(p, uap->fd, &fp)) - return (error); + + is64bit = proc_is64bit(p); + + proc_fdlock(p); + error = fp_lookup(p,fd,&fp,1); + if (error) { + proc_fdunlock(p); + return(error); + } AUDIT_ARG(file, p, fp); - if ((fp->f_flag & (FREAD | FWRITE)) == 0) - return (EBADF); + + if ((fp->f_flag & (FREAD | FWRITE)) == 0) { + error = EBADF; + goto out; + } #if NETAT /* @@ -600,14 +744,16 @@ ioctl(p, uap, retval) * while implementing an ATioctl system call */ { - extern int appletalk_inited; - if (appletalk_inited && ((uap->com & 0x0000FFFF) == 0xff99)) { + u_long fixed_command; #ifdef APPLETALK_DEBUG kprintf("ioctl: special AppleTalk \n"); #endif - error = fo_ioctl(fp, uap->com, uap->data, p); - return(error); + datap = &stkbuf[0]; + *(user_addr_t *)datap = uap->data; + fixed_command = _IOW(0, 0xff99, uap->data); + error = fo_ioctl(fp, fixed_command, datap, p); + goto out; } } @@ -617,10 +763,12 @@ ioctl(p, uap, retval) switch (com = uap->com) { case FIONCLEX: *fdflags(p, uap->fd) &= ~UF_EXCLOSE; - return (0); + error =0; + goto out; case FIOCLEX: *fdflags(p, uap->fd) |= UF_EXCLOSE; - return (0); + error =0; + goto out; } /* @@ -628,38 +776,62 @@ ioctl(p, uap, retval) * copied to/from the user's address space. 
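+ *
+ * [editor's aside, not part of the original patch] Sketch of the sizing
+ * policy implemented below, using only names visible in this hunk:
+ * parameter blocks up to STK_PARAMS (128) bytes are staged in stkbuf on
+ * the kernel stack, anything larger is kalloc'd with the fd table lock
+ * dropped around the allocation:
+ *
+ *   size = IOCPARM_LEN(com);             // length encoded in the command
+ *   if (size > IOCPARM_MAX)  -> ENOTTY
+ *   datap = (size > sizeof(stkbuf)) ? kalloc(size) : &stkbuf[0];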
 */
 	size = IOCPARM_LEN(com);
-	if (size > IOCPARM_MAX)
-		return (ENOTTY);
+	if (size > IOCPARM_MAX) {
+		error = ENOTTY;
+		goto out;
+	}
 	memp = NULL;
 	if (size > sizeof (stkbuf)) {
-		if ((memp = (caddr_t)kalloc(size)) == 0)
-			return(ENOMEM);
-		data = memp;
+		proc_fdunlock(p);
+		if ((memp = (caddr_t)kalloc(size)) == 0) {
+			proc_fdlock(p);
+			error = ENOMEM;
+			goto out;
+		}
+		proc_fdlock(p);
+		datap = memp;
 	} else
-		data = stkbuf;
+		datap = &stkbuf[0];
 	if (com&IOC_IN) {
 		if (size) {
-			error = copyin(uap->data, data, (u_int)size);
+			proc_fdunlock(p);
+			error = copyin(uap->data, datap, size);
 			if (error) {
 				if (memp)
 					kfree(memp, size);
-				return (error);
+				proc_fdlock(p);
+				goto out;
 			}
-		} else
-			*(caddr_t *)data = uap->data;
+			proc_fdlock(p);
+		} else {
+			/* XXX - IOC_IN and no size? we should probably return an error here!! */
+			if (is64bit) {
+				*(user_addr_t *)datap = uap->data;
+			}
+			else {
+				*(uint32_t *)datap = (uint32_t)uap->data;
+			}
+		}
 	} else if ((com&IOC_OUT) && size)
 		/*
 		 * Zero the buffer so the user always
 		 * gets back something deterministic.
 		 */
-		bzero(data, size);
-	else if (com&IOC_VOID)
-		*(caddr_t *)data = uap->data;
+		bzero(datap, size);
+	else if (com&IOC_VOID) {
+		/* XXX - this is odd since IOC_VOID means no parameters */
+		if (is64bit) {
+			*(user_addr_t *)datap = uap->data;
+		}
+		else {
+			*(uint32_t *)datap = (uint32_t)uap->data;
+		}
+	}
 
 	switch (com) {
 
 	case FIONBIO:
-		if (tmp = *(int *)data)
+		if ( (tmp = *(int *)datap) )
 			fp->f_flag |= FNONBLOCK;
 		else
 			fp->f_flag &= ~FNONBLOCK;
@@ -667,7 +839,7 @@ ioctl(p, uap, retval)
 		break;
 
 	case FIOASYNC:
-		if (tmp = *(int *)data)
+		if ( (tmp = *(int *)datap) )
 			fp->f_flag |= FASYNC;
 		else
 			fp->f_flag &= ~FASYNC;
@@ -675,12 +847,16 @@ ioctl(p, uap, retval)
 		break;
 
 	case FIOSETOWN:
-		tmp = *(int *)data;
+		tmp = *(int *)datap;
 		if (fp->f_type == DTYPE_SOCKET) {
 			((struct socket *)fp->f_data)->so_pgid = tmp;
 			error = 0;
 			break;
 		}
+		if (fp->f_type == DTYPE_PIPE) {
+			error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, p);
+			break;
+		}
 		if (tmp <= 0) {
 			tmp = -tmp;
 		} else {
@@ -697,26 +873,31 @@ ioctl(p, uap, retval)
 	case FIOGETOWN:
 		if (fp->f_type == DTYPE_SOCKET) {
 			error = 0;
-			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
+			*(int *)datap = ((struct socket *)fp->f_data)->so_pgid;
 			break;
 		}
-		error = fo_ioctl(fp, TIOCGPGRP, data, p);
-		*(int *)data = -*(int *)data;
+		error = fo_ioctl(fp, TIOCGPGRP, datap, p);
+		*(int *)datap = -*(int *)datap;
 		break;
 
 	default:
-		error = fo_ioctl(fp, com, data, p);
+		error = fo_ioctl(fp, com, datap, p);
 		/*
 		 * Copy any data to user, size was
 		 * already set and checked above.
 		 */
 		if (error == 0 && (com&IOC_OUT) && size)
-			error = copyout(data, uap->data, (u_int)size);
+			error = copyout(datap, uap->data, (u_int)size);
 		break;
 	}
+	proc_fdunlock(p);
 	if (memp)
 		kfree(memp, size);
-	return (error);
+	proc_fdlock(p);
+out:
+	fp_drop(p, fd, fp, 1);
+	proc_fdunlock(p);
+	return(error);
 }
 
 int	selwait, nselcoll;
@@ -725,42 +906,29 @@ int	selwait, nselcoll;
 extern int selcontinue(int error);
 extern int selprocess(int error, int sel_pass);
 static int selscan(struct proc *p, struct _select * sel,
-			int nfd, register_t *retval, int sel_pass);
+			int nfd, register_t *retval, int sel_pass, wait_queue_sub_t wqsub);
 static int selcount(struct proc *p, u_int32_t *ibits, u_int32_t *obits,
-			int nfd, int * count, int * nfcount);
+			int nfd, int * count);
+static int seldrop(struct proc *p, u_int32_t *ibits, int nfd);
 extern uint64_t	tvtoabstime(struct timeval	*tvp);
 
 /*
 * Select system call.
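+ *
+ * [editor's aside, not part of the original patch] The per-thread wait
+ * queue set used below is carved from one contiguous kalloc block; a
+ * sketch of the layout arithmetic exactly as this hunk computes it:
+ *
+ *   size = SIZEOF_WAITQUEUE_SET + (count * SIZEOF_WAITQUEUE_LINK);
+ *   sel->wqset = (wait_queue_set_t)kalloc(size);
+ *   sel->wql = (char *)sel->wqset + SIZEOF_WAITQUEUE_SET;
+ *   // the link for scan slot nc: sel->wql + nc * SIZEOF_WAITQUEUE_LINK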
*/ -#ifndef _SYS_SYSPROTO_H_ -struct select_args { - int nd; - u_int32_t *in; - u_int32_t *ou; - u_int32_t *ex; - struct timeval *tv; -}; -#endif int -select(p, uap, retval) - register struct proc *p; - register struct select_args *uap; - register_t *retval; +select(struct proc *p, struct select_args *uap, register_t *retval) { int error = 0; u_int ni, nw, size; - thread_act_t th_act; + thread_t th_act; struct uthread *uth; struct _select *sel; int needzerofill = 1; - int kfcount =0; - int nfcount = 0; int count = 0; - th_act = current_act(); + th_act = current_thread(); uth = get_bsdthread_info(th_act); - sel = &uth->uu_state.ss_select; + sel = &uth->uu_select; retval = (int *)get_bsduthreadrval(th_act); *retval = 0; @@ -780,10 +948,10 @@ select(p, uap, retval) */ if (sel->nbytes == 0) { sel->nbytes = 3 * ni; - MALLOC(sel->ibits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK); - MALLOC(sel->obits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK); - bzero((caddr_t)sel->ibits, sel->nbytes); - bzero((caddr_t)sel->obits, sel->nbytes); + MALLOC(sel->ibits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK | M_ZERO); + MALLOC(sel->obits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK | M_ZERO); + if ((sel->ibits == NULL) || (sel->obits == NULL)) + panic("select out of memory"); needzerofill = 0; } @@ -795,10 +963,10 @@ select(p, uap, retval) sel->nbytes = (3 * ni); FREE(sel->ibits, M_TEMP); FREE(sel->obits, M_TEMP); - MALLOC(sel->ibits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK); - MALLOC(sel->obits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK); - bzero((caddr_t)sel->ibits, sel->nbytes); - bzero((caddr_t)sel->obits, sel->nbytes); + MALLOC(sel->ibits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK | M_ZERO); + MALLOC(sel->obits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK | M_ZERO); + if ((sel->ibits == NULL) || (sel->obits == NULL)) + panic("select out of memory"); needzerofill = 0; } @@ -812,7 +980,7 @@ select(p, uap, retval) */ #define getbits(name, x) \ do { \ - if (uap->name && (error = copyin((caddr_t)uap->name, \ + if (uap->name && (error = copyin(uap->name, \ (caddr_t)&sel->ibits[(x) * nw], ni))) \ goto continuation; \ } while (0) @@ -824,8 +992,15 @@ select(p, uap, retval) if (uap->tv) { struct timeval atv; - - error = copyin((caddr_t)uap->tv, (caddr_t)&atv, sizeof (atv)); + if (IS_64BIT_PROCESS(p)) { + struct user_timeval atv64; + error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64)); + /* Loses resolution - assume timeout < 68 years */ + atv.tv_sec = atv64.tv_sec; + atv.tv_usec = atv64.tv_usec; + } else { + error = copyin(uap->tv, (caddr_t)&atv, sizeof(atv)); + } if (error) goto continuation; if (itimerfix(&atv)) { @@ -839,36 +1014,33 @@ select(p, uap, retval) else sel->abstime = 0; - sel->nfcount = 0; - if (error = selcount(p, sel->ibits, sel->obits, uap->nd, &count, &nfcount)) { + if ( (error = selcount(p, sel->ibits, sel->obits, uap->nd, &count)) ) { goto continuation; } - sel->nfcount = nfcount; sel->count = count; - size = SIZEOF_WAITQUEUE_SUB + (count * SIZEOF_WAITQUEUE_LINK); + size = SIZEOF_WAITQUEUE_SET + (count * SIZEOF_WAITQUEUE_LINK); if (sel->allocsize) { - if (uth->uu_wqsub == 0) + if (sel->wqset == 0) panic("select: wql memory smashed"); /* needed for the select now */ if (size > sel->allocsize) { - kfree(uth->uu_wqsub, sel->allocsize); + kfree(sel->wqset, sel->allocsize); sel->allocsize = size; - uth->uu_wqsub = (wait_queue_sub_t)kalloc(sel->allocsize); - if (uth->uu_wqsub == (wait_queue_sub_t)NULL) + sel->wqset = (wait_queue_set_t)kalloc(size); + if (sel->wqset == (wait_queue_set_t)NULL) 
panic("failed to allocate memory for waitqueue\n"); - sel->wql = (char *)uth->uu_wqsub + SIZEOF_WAITQUEUE_SUB; } } else { sel->count = count; sel->allocsize = size; - uth->uu_wqsub = (wait_queue_sub_t)kalloc(sel->allocsize); - if (uth->uu_wqsub == (wait_queue_sub_t)NULL) + sel->wqset = (wait_queue_set_t)kalloc(sel->allocsize); + if (sel->wqset == (wait_queue_set_t)NULL) panic("failed to allocate memory for waitqueue\n"); - sel->wql = (char *)uth->uu_wqsub + SIZEOF_WAITQUEUE_SUB; } - bzero(uth->uu_wqsub, size); - wait_queue_sub_init(uth->uu_wqsub, (SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST)); + bzero(sel->wqset, size); + sel->wql = (char *)sel->wqset + SIZEOF_WAITQUEUE_SET; + wait_queue_set_init(sel->wqset, (SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST)); continuation: return selprocess(error, SEL_FIRSTPASS); @@ -881,11 +1053,11 @@ selcontinue(int error) } int -selprocess(error, sel_pass) +selprocess(int error, int sel_pass) { int ncoll; u_int ni, nw; - thread_act_t th_act; + thread_t th_act; struct uthread *uth; struct proc *p; struct select_args *uap; @@ -898,11 +1070,11 @@ selprocess(error, sel_pass) wait_result_t wait_result; p = current_proc(); - th_act = current_act(); + th_act = current_thread(); uap = (struct select_args *)get_bsduthreadarg(th_act); retval = (int *)get_bsduthreadrval(th_act); uth = get_bsdthread_info(th_act); - sel = &uth->uu_state.ss_select; + sel = &uth->uu_select; /* if it is first pass wait queue is not setup yet */ if ((error != 0) && (sel_pass == SEL_FIRSTPASS)) @@ -919,9 +1091,9 @@ retry: /* skip scans if the select is just for timeouts */ if (sel->count) { if (sel_pass == SEL_FIRSTPASS) - wait_queue_sub_clearrefs(uth->uu_wqsub); + wait_queue_sub_clearrefs(sel->wqset); - error = selscan(p, sel, uap->nd, retval, sel_pass); + error = selscan(p, sel, uap->nd, retval, sel_pass, sel->wqset); if (error || *retval) { goto done; } @@ -974,12 +1146,12 @@ retry: panic("selprocess: 2nd pass assertwaiting"); /* Wait Queue Subordinate has waitqueue as first element */ - wait_result = wait_queue_assert_wait((wait_queue_t)uth->uu_wqsub, - &selwait, THREAD_ABORTSAFE); + wait_result = wait_queue_assert_wait((wait_queue_t)sel->wqset, + &selwait, THREAD_ABORTSAFE, sel->abstime); if (wait_result != THREAD_AWAKENED) { /* there are no preposted events */ - error = tsleep1(NULL, PSOCK | PCATCH, - "select", sel->abstime, selcontinue); + error = tsleep1(NULL, PSOCK | PCATCH, + "select", 0, selcontinue); } else { prepost = 1; error = 0; @@ -992,8 +1164,10 @@ retry: goto retry; } done: - if (unwind) - wait_subqueue_unlink_all(uth->uu_wqsub); + if (unwind) { + wait_subqueue_unlink_all(sel->wqset); + seldrop(p, sel->ibits, uap->nd); + } p->p_flag &= ~P_SELECT; /* select is not restarted after signals... 
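+ *
+ * [editor's aside, not part of the original patch] The unwind step above
+ * pairs with selcount(): selcount() takes an f_iocount reference on every
+ * fd named in the input bit vectors, and seldrop() walks the same bits to
+ * release them, as implemented later in this file:
+ *
+ *   fp->f_iocount--;
+ *   fp->f_flags &= ~FP_INSELECT;
+ *   if (p->p_fpdrainwait && fp->f_iocount == 0) {
+ *           p->p_fpdrainwait = 0;
+ *           wakeup(&p->p_fpdrainwait);
+ *   }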
*/ if (error == ERESTART) @@ -1005,8 +1179,8 @@ done: #define putbits(name, x) \ do { \ - if (uap->name && (error2 = copyout((caddr_t)&sel->obits[(x) * nw], \ - (caddr_t)uap->name, ni))) \ + if (uap->name && (error2 = \ + copyout((caddr_t)&sel->obits[(x) * nw], uap->name, ni))) \ error = error2; \ } while (0) @@ -1022,17 +1196,18 @@ done: } static int -selscan(p, sel, nfd, retval, sel_pass) +selscan(p, sel, nfd, retval, sel_pass, wqsub) struct proc *p; struct _select *sel; int nfd; register_t *retval; int sel_pass; + wait_queue_sub_t wqsub; { register struct filedesc *fdp = p->p_fd; register int msk, i, j, fd; register u_int32_t bits; - struct file *fp; + struct fileproc *fp; int n = 0; int nc = 0; static int flag[3] = { FREAD, FWRITE, 0 }; @@ -1040,10 +1215,7 @@ selscan(p, sel, nfd, retval, sel_pass) u_int nw; u_int32_t *ibits, *obits; char * wql; - int nfunnel = 0; - int count, nfcount; char * wql_ptr; - struct vnode *vp; /* * Problems when reboot; due to MacOSX signal probs @@ -1053,89 +1225,42 @@ selscan(p, sel, nfd, retval, sel_pass) *retval=0; return(EIO); } - ibits = sel->ibits; obits = sel->obits; wql = sel->wql; - count = sel->count; - nfcount = sel->nfcount; - - if (nfcount > count) - panic("selcount count<nfcount"); - nw = howmany(nfd, NFDBITS); nc = 0; - if ( nfcount < count) { - /* some or all in kernel funnel */ - for (msk = 0; msk < 3; msk++) { - iptr = (u_int32_t *)&ibits[msk * nw]; - optr = (u_int32_t *)&obits[msk * nw]; - for (i = 0; i < nfd; i += NFDBITS) { - bits = iptr[i/NFDBITS]; - while ((j = ffs(bits)) && (fd = i + --j) < nfd) { - bits &= ~(1 << j); - fp = fdp->fd_ofiles[fd]; - if (fp == NULL || - (fdp->fd_ofileflags[fd] & UF_RESERVED)) { - return(EBADF); - } - if (sel_pass == SEL_SECONDPASS) - wql_ptr = (char *)0; - else - wql_ptr = (wql+ nc * SIZEOF_WAITQUEUE_LINK); - /* - * Merlot: need to remove the bogus f_data check - * from the following "if" statement. It's there - * because of various problems stemming from - * races due to the split-funnels and lack of real - * referencing on sockets... 
- */ - if (fp->f_ops && (fp->f_type != DTYPE_SOCKET) - && (fp->f_data != (caddr_t)-1) - && !(fp->f_type == DTYPE_VNODE - && (vp = (struct vnode *)fp->f_data) - && vp->v_type == VFIFO) - && fo_select(fp, flag[msk], wql_ptr, p)) { - optr[fd/NFDBITS] |= (1 << (fd % NFDBITS)); - n++; - } - nc++; - } - } - } - } - - if (nfcount) { - /* socket file descriptors for scan */ - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + proc_fdlock(p); - nc = 0; + if (sel->count) { for (msk = 0; msk < 3; msk++) { iptr = (u_int32_t *)&ibits[msk * nw]; optr = (u_int32_t *)&obits[msk * nw]; + for (i = 0; i < nfd; i += NFDBITS) { bits = iptr[i/NFDBITS]; + while ((j = ffs(bits)) && (fd = i + --j) < nfd) { bits &= ~(1 << j); fp = fdp->fd_ofiles[fd]; + if (fp == NULL || (fdp->fd_ofileflags[fd] & UF_RESERVED)) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + proc_fdunlock(p); return(EBADF); } - if (sel_pass == SEL_SECONDPASS) + if (sel_pass == SEL_SECONDPASS) { wql_ptr = (char *)0; - else - wql_ptr = (wql+ nc * SIZEOF_WAITQUEUE_LINK); - if (fp->f_ops - && (fp->f_type == DTYPE_SOCKET - || (fp->f_type == DTYPE_VNODE - && (vp = (struct vnode *)fp->f_data) - && vp != (struct vnode *)-1 - && vp->v_type == VFIFO)) - && fo_select(fp, flag[msk], wql_ptr, p)) { + fp->f_flags &= ~FP_INSELECT; + fp->f_waddr = (void *)0; + } else { + wql_ptr = (wql + nc * SIZEOF_WAITQUEUE_LINK); + fp->f_flags |= FP_INSELECT; + fp->f_waddr = (void *)wqsub; + } + if (fp->f_ops && fo_select(fp, flag[msk], wql_ptr, p)) { optr[fd/NFDBITS] |= (1 << (fd % NFDBITS)); n++; } @@ -1143,43 +1268,227 @@ selscan(p, sel, nfd, retval, sel_pass) } } } - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); } - + proc_fdunlock(p); *retval = n; return (0); } -/*ARGSUSED*/ +static int poll_callback(struct kqueue *, struct kevent *, void *); + +struct poll_continue_args { + user_addr_t pca_fds; + u_int pca_nfds; + u_int pca_rfds; +}; + int -seltrue(dev, flag, p) - dev_t dev; - int flag; - struct proc *p; +poll(struct proc *p, struct poll_args *uap, register_t *retval) { + struct poll_continue_args *cont; + struct pollfd *fds; + struct kqueue *kq; + struct timeval atv; + int ncoll, error = 0; + u_int nfds = uap->nfds; + u_int rfds = 0; + u_int i; + size_t ni; - return (1); -} + /* + * This is kinda bogus. We have fd limits, but that is not + * really related to the size of the pollfd array. Make sure + * we let the process use at least FD_SETSIZE entries and at + * least enough for the current limits. We want to be reasonably + * safe, but not overly restrictive. 
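+ *
+ * [editor's aside, not part of the original patch] A worked example of
+ * the bound below, assuming this era's Darwin limits (FD_SETSIZE 1024,
+ * OPEN_MAX 10240) and RLIMIT_NOFILE at 256: nfds == 1024 passes (it does
+ * not exceed FD_SETSIZE), nfds == 1025 fails with EINVAL (over both the
+ * rlimit and FD_SETSIZE), and anything over OPEN_MAX always fails.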
+ */ + if (nfds > OPEN_MAX || + (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE)) + return (EINVAL); -static int -selcount(p, ibits, obits, nfd, count, nfcount) - struct proc *p; - u_int32_t *ibits, *obits; - int nfd; - int *count; - int *nfcount; -{ - register struct filedesc *fdp = p->p_fd; - register int msk, i, j, fd; - register u_int32_t bits; - struct file *fp; + kq = kqueue_alloc(p); + if (kq == NULL) + return (EAGAIN); + + ni = nfds * sizeof(struct pollfd) + sizeof(struct poll_continue_args); + MALLOC(cont, struct poll_continue_args *, ni, M_TEMP, M_WAITOK); + if (NULL == cont) { + error = EAGAIN; + goto out; + } + + fds = (struct pollfd *)&cont[1]; + error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd)); + if (error) + goto out; + + if (uap->timeout != -1) { + struct timeval rtv; + + atv.tv_sec = uap->timeout / 1000; + atv.tv_usec = (uap->timeout % 1000) * 1000; + if (itimerfix(&atv)) { + error = EINVAL; + goto out; + } + getmicrouptime(&rtv); + timevaladd(&atv, &rtv); + } else { + atv.tv_sec = 0; + atv.tv_usec = 0; + } + + /* JMM - all this P_SELECT stuff is bogus */ + ncoll = nselcoll; + p->p_flag |= P_SELECT; + + for (i = 0; i < nfds; i++) { + short events = fds[i].events; + struct kevent kev; + int kerror = 0; + + /* per spec, ignore fd values below zero */ + if (fds[i].fd < 0) { + fds[i].revents = 0; + continue; + } + + /* convert the poll event into a kqueue kevent */ + kev.ident = fds[i].fd; + kev.flags = EV_ADD | EV_ONESHOT | EV_POLL; + kev.fflags = NOTE_LOWAT; + kev.data = 1; /* efficiency be damned: any data should trigger */ + kev.udata = CAST_USER_ADDR_T(&fds[i]); + + /* Handle input events */ + if (events & ( POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND )) { + kev.filter = EVFILT_READ; + if (!(events & ( POLLIN | POLLRDNORM ))) + kev.flags |= EV_OOBAND; + kerror = kevent_register(kq, &kev, p); + } + + /* Handle output events */ + if (kerror == 0 && + events & ( POLLOUT | POLLWRNORM | POLLWRBAND )) { + kev.filter = EVFILT_WRITE; + kerror = kevent_register(kq, &kev, p); + } + + /* Handle BSD extension vnode events */ + if (kerror == 0 && + events & ( POLLEXTEND | POLLATTRIB | POLLNLINK | POLLWRITE )) { + kev.filter = EVFILT_VNODE; + kev.fflags = 0; + if (events & POLLEXTEND) + kev.fflags |= NOTE_EXTEND; + if (events & POLLATTRIB) + kev.fflags |= NOTE_ATTRIB; + if (events & POLLNLINK) + kev.fflags |= NOTE_LINK; + if (events & POLLWRITE) + kev.fflags |= NOTE_WRITE; + kerror = kevent_register(kq, &kev, p); + } + + if (kerror != 0) { + fds[i].revents = POLLNVAL; + rfds++; + } else + fds[i].revents = 0; + } + + /* Did we have any trouble registering? */ + if (rfds > 0) + goto done; + + /* scan for, and possibly wait for, the kevents to trigger */ + cont->pca_fds = uap->fds; + cont->pca_nfds = nfds; + cont->pca_rfds = rfds; + error = kevent_scan(kq, poll_callback, NULL, cont, &atv, p); + rfds = cont->pca_rfds; + + done: + p->p_flag &= ~P_SELECT; + /* poll is not restarted after signals... 
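+ *
+ * [editor's aside, not part of the original patch] Recap of the
+ * poll-to-kqueue translation above, using only names from this hunk:
+ * each pollfd becomes up to three EV_ADD|EV_ONESHOT|EV_POLL
+ * registrations --
+ *
+ *   POLLIN|POLLRDNORM|POLLPRI|POLLRDBAND      -> EVFILT_READ
+ *       (EV_OOBAND added when only the priority bits were asked for)
+ *   POLLOUT|POLLWRNORM|POLLWRBAND             -> EVFILT_WRITE
+ *   POLLEXTEND|POLLATTRIB|POLLNLINK|POLLWRITE -> EVFILT_VNODE + NOTE_* fflags
+ *
+ * and poll_callback() folds triggered kevents back into revents.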
*/ + if (error == ERESTART) + error = EINTR; + if (error == EWOULDBLOCK) + error = 0; + if (error == 0) { + error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd)); + *retval = rfds; + } + + out: + if (NULL != cont) + FREE(cont, M_TEMP); + + kqueue_dealloc(kq, p); + return (error); +} + +static int +poll_callback(__unused struct kqueue *kq, struct kevent *kevp, void *data) +{ + struct poll_continue_args *cont = (struct poll_continue_args *)data; + struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata); + + /* convert the results back into revents */ + if (kevp->flags & EV_EOF) + fds->revents |= POLLHUP; + if (kevp->flags & EV_ERROR) + fds->revents |= POLLERR; + cont->pca_rfds++; + + switch (kevp->filter) { + case EVFILT_READ: + if (kevp->data != 0) + fds->revents |= (fds->events & ( POLLIN | POLLRDNORM )); + if (kevp->flags & EV_OOBAND) + fds->revents |= (fds->events & ( POLLPRI | POLLRDBAND )); + break; + + case EVFILT_WRITE: + if (!(fds->revents & POLLHUP)) + fds->revents |= (fds->events & ( POLLOUT | POLLWRNORM | POLLWRBAND )); + break; + + case EVFILT_PROC: + if (kevp->fflags & NOTE_EXTEND) + fds->revents |= (fds->events & POLLEXTEND); + if (kevp->fflags & NOTE_ATTRIB) + fds->revents |= (fds->events & POLLATTRIB); + if (kevp->fflags & NOTE_LINK) + fds->revents |= (fds->events & POLLNLINK); + if (kevp->fflags & NOTE_WRITE) + fds->revents |= (fds->events & POLLWRITE); + break; + } + return 0; +} + +int +seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p) +{ + + return (1); +} + +static int +selcount(struct proc *p, u_int32_t *ibits, __unused u_int32_t *obits, + int nfd, int *count) +{ + register struct filedesc *fdp = p->p_fd; + register int msk, i, j, fd; + register u_int32_t bits; + struct fileproc *fp; int n = 0; - int nc = 0; - int nfc = 0; - static int flag[3] = { FREAD, FWRITE, 0 }; - u_int32_t *iptr, *fptr, *fbits; + u_int32_t *iptr; u_int nw; - struct vnode *vp; + int error=0; + int dropcount; /* * Problems when reboot; due to MacOSX signal probs @@ -1187,13 +1496,11 @@ selcount(p, ibits, obits, nfd, count, nfcount) */ if (fdp == NULL) { *count=0; - *nfcount=0; return(EIO); } - nw = howmany(nfd, NFDBITS); - + proc_fdlock(p); for (msk = 0; msk < 3; msk++) { iptr = (u_int32_t *)&ibits[msk * nw]; for (i = 0; i < nfd; i += NFDBITS) { @@ -1204,20 +1511,102 @@ selcount(p, ibits, obits, nfd, count, nfcount) if (fp == NULL || (fdp->fd_ofileflags[fd] & UF_RESERVED)) { *count=0; - *nfcount=0; - return(EBADF); + error = EBADF; + goto bad; } - if (fp->f_type == DTYPE_SOCKET || - (fp->f_type == DTYPE_VNODE - && (vp = (struct vnode *)fp->f_data) - && vp->v_type == VFIFO)) - nfc++; + fp->f_iocount++; n++; } } } + proc_fdunlock(p); + *count = n; - *nfcount = nfc; + return (0); +bad: + dropcount = 0; + + if (n== 0) + goto out; + /* undo the iocounts */ + for (msk = 0; msk < 3; msk++) { + iptr = (u_int32_t *)&ibits[msk * nw]; + for (i = 0; i < nfd; i += NFDBITS) { + bits = iptr[i/NFDBITS]; + while ((j = ffs(bits)) && (fd = i + --j) < nfd) { + bits &= ~(1 << j); + fp = fdp->fd_ofiles[fd]; + if (dropcount >= n) + goto out; + fp->f_iocount--; + + if (p->p_fpdrainwait && fp->f_iocount == 0) { + p->p_fpdrainwait = 0; + wakeup(&p->p_fpdrainwait); + } + dropcount++; + } + } + } +out: + proc_fdunlock(p); + return(error); +} + +static int +seldrop(p, ibits, nfd) + struct proc *p; + u_int32_t *ibits; + int nfd; +{ + register struct filedesc *fdp = p->p_fd; + register int msk, i, j, fd; + register u_int32_t bits; + struct fileproc *fp; + int n = 0; + u_int32_t *iptr; + u_int 
nw; + + /* + * Problems when reboot; due to MacOSX signal probs + * in Beaker1C ; verify that the p->p_fd is valid + */ + if (fdp == NULL) { + return(EIO); + } + + nw = howmany(nfd, NFDBITS); + + + proc_fdlock(p); + for (msk = 0; msk < 3; msk++) { + iptr = (u_int32_t *)&ibits[msk * nw]; + for (i = 0; i < nfd; i += NFDBITS) { + bits = iptr[i/NFDBITS]; + while ((j = ffs(bits)) && (fd = i + --j) < nfd) { + bits &= ~(1 << j); + fp = fdp->fd_ofiles[fd]; + if (fp == NULL +#if 0 + /* if you are here then it is being closed */ + || (fdp->fd_ofileflags[fd] & UF_RESERVED) +#endif + ) { + proc_fdunlock(p); + return(EBADF); + } + n++; + fp->f_iocount--; + fp->f_flags &= ~FP_INSELECT; + + if (p->p_fpdrainwait && fp->f_iocount == 0) { + p->p_fpdrainwait = 0; + wakeup(&p->p_fpdrainwait); + } + } + } + } + proc_fdunlock(p); return (0); } @@ -1225,12 +1614,9 @@ selcount(p, ibits, obits, nfd, count, nfcount) * Record a select request. */ void -selrecord(selector, sip, p_wql) - struct proc *selector; - struct selinfo *sip; - void * p_wql; +selrecord(__unused struct proc *selector, struct selinfo *sip, void * p_wql) { - thread_act_t cur_act = current_act(); + thread_t cur_act = current_thread(); struct uthread * ut = get_bsdthread_info(cur_act); /* need to look at collisions */ @@ -1256,8 +1642,9 @@ selrecord(selector, sip, p_wql) sip->si_flags &= ~SI_COLL; sip->si_flags |= SI_RECORDED; - if (!wait_queue_member(&sip->si_wait_queue, ut->uu_wqsub)) - wait_queue_link_noalloc(&sip->si_wait_queue, ut->uu_wqsub, (wait_queue_link_t)p_wql); + if (!wait_queue_member(&sip->si_wait_queue, ut->uu_select.wqset)) + wait_queue_link_noalloc(&sip->si_wait_queue, ut->uu_select.wqset, + (wait_queue_link_t)p_wql); return; } @@ -1304,243 +1691,367 @@ selthreadclear(sip) } -extern struct eventqelt *evprocdeque(struct proc *p, struct eventqelt *eqp); + + +#define DBG_EVENT 0x10 + +#define DBG_POST 0x10 +#define DBG_WATCH 0x11 +#define DBG_WAIT 0x12 +#define DBG_MOD 0x13 +#define DBG_EWAKEUP 0x14 +#define DBG_ENQUEUE 0x15 +#define DBG_DEQUEUE 0x16 + +#define DBG_MISC_POST MISCDBG_CODE(DBG_EVENT,DBG_POST) +#define DBG_MISC_WATCH MISCDBG_CODE(DBG_EVENT,DBG_WATCH) +#define DBG_MISC_WAIT MISCDBG_CODE(DBG_EVENT,DBG_WAIT) +#define DBG_MISC_MOD MISCDBG_CODE(DBG_EVENT,DBG_MOD) +#define DBG_MISC_EWAKEUP MISCDBG_CODE(DBG_EVENT,DBG_EWAKEUP) +#define DBG_MISC_ENQUEUE MISCDBG_CODE(DBG_EVENT,DBG_ENQUEUE) +#define DBG_MISC_DEQUEUE MISCDBG_CODE(DBG_EVENT,DBG_DEQUEUE) + + +#define EVPROCDEQUE(p, evq) do { \ + proc_lock(p); \ + if (evq->ee_flags & EV_QUEUED) { \ + TAILQ_REMOVE(&p->p_evlist, evq, ee_plist); \ + evq->ee_flags &= ~EV_QUEUED; \ + } \ + proc_unlock(p); \ +} while (0); + /* * called upon socket close. deque and free all events for - * the socket + * the socket... socket must be locked by caller. 
 */
 void
 evsofree(struct socket *sp)
 {
-	struct eventqelt *eqp, *next;
+	struct eventqelt *evq, *next;
+	proc_t 	p;
+
+	if (sp == NULL)
+		return;
 
-	if (sp == NULL) return;
+	for (evq = sp->so_evlist.tqh_first; evq != NULL; evq = next) {
+		next = evq->ee_slist.tqe_next;
+		p = evq->ee_proc;
 
-	for (eqp = sp->so_evlist.tqh_first; eqp != NULL; eqp = next) {
-		next = eqp->ee_slist.tqe_next;
-		evprocdeque(eqp->ee_proc, eqp); // remove from proc q if there
-		TAILQ_REMOVE(&sp->so_evlist, eqp, ee_slist); // remove from socket q
-		FREE(eqp, M_TEMP);
-	}
+		if (evq->ee_flags & EV_QUEUED) {
+			EVPROCDEQUE(p, evq);
+		}
+		TAILQ_REMOVE(&sp->so_evlist, evq, ee_slist); // remove from socket q
+		FREE(evq, M_TEMP);
+	}
 }
 
-#define DBG_EVENT 	0x10
+/*
+ * called upon pipe close. dequeue and free all events for
+ * the pipe... pipe must be locked by caller
+ */
+void
+evpipefree(struct pipe *cpipe)
+{
+	struct eventqelt *evq, *next;
+	proc_t 	p;
 
-#define DBG_POST 0x10
-#define DBG_WATCH 0x11
-#define DBG_WAIT 0x12
-#define DBG_MOD 0x13
-#define DBG_EWAKEUP 0x14
-#define DBG_ENQUEUE 0x15
-#define DBG_DEQUEUE 0x16
+	for (evq = cpipe->pipe_evlist.tqh_first; evq != NULL; evq = next) {
+		next = evq->ee_slist.tqe_next;
+		p = evq->ee_proc;
 
-#define DBG_MISC_POST MISCDBG_CODE(DBG_EVENT,DBG_POST)
-#define DBG_MISC_WATCH MISCDBG_CODE(DBG_EVENT,DBG_WATCH)
-#define DBG_MISC_WAIT MISCDBG_CODE(DBG_EVENT,DBG_WAIT)
-#define DBG_MISC_MOD MISCDBG_CODE(DBG_EVENT,DBG_MOD)
-#define DBG_MISC_EWAKEUP MISCDBG_CODE(DBG_EVENT,DBG_EWAKEUP)
-#define DBG_MISC_ENQUEUE MISCDBG_CODE(DBG_EVENT,DBG_ENQUEUE)
-#define DBG_MISC_DEQUEUE MISCDBG_CODE(DBG_EVENT,DBG_DEQUEUE)
+		EVPROCDEQUE(p, evq);
+
+		TAILQ_REMOVE(&cpipe->pipe_evlist, evq, ee_slist); // remove from pipe q
+		FREE(evq, M_TEMP);
+	}
+}
 
 /*
- * enque this event if it's not already queued. wakeup
-   the proc if we do queue this event to it.
+ * enqueue this event if it's not already queued. wakeup
+ * the proc if we do queue this event to it...
+ * entered with proc lock held...
we drop it before + * doing the wakeup and return in that state */ -void -evprocenque(struct eventqelt *eqp) +static void +evprocenque(struct eventqelt *evq) { - struct proc *p; - - assert(eqp); - KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_START, eqp, eqp->ee_flags, eqp->ee_eventmask,0,0); - if (eqp->ee_flags & EV_QUEUED) { - KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0); - return; - } - eqp->ee_flags |= EV_QUEUED; - eqp->ee_eventmask = 0; // disarm - p = eqp->ee_proc; - TAILQ_INSERT_TAIL(&p->p_evlist, eqp, ee_plist); - KERNEL_DEBUG(DBG_MISC_EWAKEUP,0,0,0,eqp,0); - wakeup(&p->p_evlist); - KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0); + proc_t p; + + assert(evq); + p = evq->ee_proc; + + KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_START, evq, evq->ee_flags, evq->ee_eventmask,0,0); + + proc_lock(p); + + if (evq->ee_flags & EV_QUEUED) { + proc_unlock(p); + + KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0); + return; + } + evq->ee_flags |= EV_QUEUED; + + TAILQ_INSERT_TAIL(&p->p_evlist, evq, ee_plist); + + proc_unlock(p); + + wakeup(&p->p_evlist); + + KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0); } + /* - * given either a sockbuf or a socket run down the - * event list and queue ready events found + * pipe lock must be taken by the caller */ void -postevent(struct socket *sp, struct sockbuf *sb, int event) +postpipeevent(struct pipe *pipep, int event) { - int mask; - struct eventqelt *evq; - register struct tcpcb *tp; - - if (sb) sp = sb->sb_so; - if (!sp || sp->so_evlist.tqh_first == NULL) return; - - KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, event,0,0,0,0); - - for (evq = sp->so_evlist.tqh_first; - evq != NULL; evq = evq->ee_slist.tqe_next) { - - mask = 0; - - /* ready for reading: - - byte cnt >= receive low water mark - - read-half of conn closed - - conn pending for listening sock - - socket error pending - - ready for writing - - byte cnt avail >= send low water mark - - write half of conn closed - - socket error pending - - non-blocking conn completed successfully - - exception pending - - out of band data - - sock at out of band mark - - */ - switch (event & EV_DMASK) { - - case EV_RWBYTES: - case EV_OOB: - case EV_RWBYTES|EV_OOB: - if (event & EV_OOB) { - if ((evq->ee_eventmask & EV_EX)) { - if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK))) { - mask |= EV_EX|EV_OOB; - } - } - } - if (event & EV_RWBYTES) { - if ((evq->ee_eventmask & EV_RE) && soreadable(sp)) { - if ((sp->so_type == SOCK_STREAM) && (sp->so_error == ECONNREFUSED) || - (sp->so_error == ECONNRESET)) { - if ((sp->so_pcb == 0) || - !(tp = sototcpcb(sp)) || - (tp->t_state == TCPS_CLOSED)) { - mask |= EV_RE|EV_RESET; - break; - } - } - if (sp->so_state & SS_CANTRCVMORE) { - mask |= EV_RE|EV_FIN; - evq->ee_req.er_rcnt = sp->so_rcv.sb_cc; - break; - } - mask |= EV_RE; - evq->ee_req.er_rcnt = sp->so_rcv.sb_cc; - } - - if ((evq->ee_eventmask & EV_WR) && sowriteable(sp)) { - if ((sp->so_type == SOCK_STREAM) &&(sp->so_error == ECONNREFUSED) || - (sp->so_error == ECONNRESET)) { - if ((sp->so_pcb == 0) || - !(tp = sototcpcb(sp)) || - (tp->t_state == TCPS_CLOSED)) { - mask |= EV_WR|EV_RESET; - break; - } - } - mask |= EV_WR; - evq->ee_req.er_wcnt = sbspace(&sp->so_snd); - } - } - break; - - case EV_RCONN: - if ((evq->ee_eventmask & EV_RE)) { - evq->ee_req.er_rcnt = sp->so_qlen + 1; // incl this one - mask |= EV_RE|EV_RCONN; - } - break; - - case EV_WCONN: - if ((evq->ee_eventmask & EV_WR)) { - mask |= EV_WR|EV_WCONN; - } - break; - - case EV_RCLOSED: - if ((evq->ee_eventmask & EV_RE)) { - mask |= 
EV_RE|EV_RCLOSED; - } - break; - - case EV_WCLOSED: - if ((evq->ee_eventmask & EV_WR)) { - mask |= EV_WR|EV_WCLOSED; - } - break; - - case EV_FIN: - if (evq->ee_eventmask & EV_RE) { - mask |= EV_RE|EV_FIN; - } - break; - - case EV_RESET: - case EV_TIMEOUT: - if (evq->ee_eventmask & EV_RE) { - mask |= EV_RE | event; - } - if (evq->ee_eventmask & EV_WR) { - mask |= EV_WR | event; - } - break; - - default: - return; - } /* switch */ - - if (mask) { - evq->ee_req.er_eventbits |= mask; - KERNEL_DEBUG(DBG_MISC_POST, evq, evq->ee_req.er_eventbits, mask,0,0); - evprocenque(evq); - } - } - KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, 0,0,0,0,0); + int mask; + struct eventqelt *evq; + + if (pipep == NULL) + return; + KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, event,0,0,1,0); + + for (evq = pipep->pipe_evlist.tqh_first; + evq != NULL; evq = evq->ee_slist.tqe_next) { + + if (evq->ee_eventmask == 0) + continue; + mask = 0; + + switch (event & (EV_RWBYTES | EV_RCLOSED | EV_WCLOSED)) { + + case EV_RWBYTES: + if ((evq->ee_eventmask & EV_RE) && pipep->pipe_buffer.cnt) { + mask |= EV_RE; + evq->ee_req.er_rcnt = pipep->pipe_buffer.cnt; + } + if ((evq->ee_eventmask & EV_WR) && + (pipep->pipe_buffer.size - pipep->pipe_buffer.cnt) >= PIPE_BUF) { + + if (pipep->pipe_state & PIPE_EOF) { + mask |= EV_WR|EV_RESET; + break; + } + mask |= EV_WR; + evq->ee_req.er_wcnt = pipep->pipe_buffer.size - pipep->pipe_buffer.cnt; + } + break; + + case EV_WCLOSED: + case EV_RCLOSED: + if ((evq->ee_eventmask & EV_RE)) { + mask |= EV_RE|EV_RCLOSED; + } + if ((evq->ee_eventmask & EV_WR)) { + mask |= EV_WR|EV_WCLOSED; + } + break; + + default: + return; + } + if (mask) { + /* + * disarm... postevents are nops until this event is 'read' via + * waitevent and then re-armed via modwatch + */ + evq->ee_eventmask = 0; + + /* + * since events are disarmed until after the waitevent + * the ee_req.er_xxxx fields can't change once we've + * inserted this event into the proc queue... + * therefore, the waitevent will see a 'consistent' + * snapshot of the event, even though it won't hold + * the pipe lock, and we're updating the event outside + * of the proc lock, which it will hold + */ + evq->ee_req.er_eventbits |= mask; + + KERNEL_DEBUG(DBG_MISC_POST, evq, evq->ee_req.er_eventbits, mask, 1,0); + + evprocenque(evq); + } + } + KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, 0,0,0,1,0); } + /* - * remove and return the first event (eqp=NULL) or a specific - * event, or return NULL if no events found + * given either a sockbuf or a socket run down the + * event list and queue ready events found... 
+ * the socket must be locked by the caller */ -struct eventqelt * -evprocdeque(struct proc *p, struct eventqelt *eqp) +void +postevent(struct socket *sp, struct sockbuf *sb, int event) { - - KERNEL_DEBUG(DBG_MISC_DEQUEUE|DBG_FUNC_START,p,eqp,0,0,0); - - if (eqp && ((eqp->ee_flags & EV_QUEUED) == NULL)) { - KERNEL_DEBUG(DBG_MISC_DEQUEUE|DBG_FUNC_END,0,0,0,0,0); - return(NULL); - } - if (p->p_evlist.tqh_first == NULL) { - KERNEL_DEBUG(DBG_MISC_DEQUEUE|DBG_FUNC_END,0,0,0,0,0); - return(NULL); - } - if (eqp == NULL) { // remove first - eqp = p->p_evlist.tqh_first; - } - TAILQ_REMOVE(&p->p_evlist, eqp, ee_plist); - eqp->ee_flags &= ~EV_QUEUED; - KERNEL_DEBUG(DBG_MISC_DEQUEUE|DBG_FUNC_END,eqp,0,0,0,0); - return(eqp); + int mask; + struct eventqelt *evq; + struct tcpcb *tp; + + if (sb) + sp = sb->sb_so; + if (sp == NULL) + return; + + KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, (int)sp, event, 0, 0, 0); + + for (evq = sp->so_evlist.tqh_first; + evq != NULL; evq = evq->ee_slist.tqe_next) { + + if (evq->ee_eventmask == 0) + continue; + mask = 0; + + /* ready for reading: + - byte cnt >= receive low water mark + - read-half of conn closed + - conn pending for listening sock + - socket error pending + + ready for writing + - byte cnt avail >= send low water mark + - write half of conn closed + - socket error pending + - non-blocking conn completed successfully + + exception pending + - out of band data + - sock at out of band mark + */ + + switch (event & EV_DMASK) { + + case EV_OOB: + if ((evq->ee_eventmask & EV_EX)) { + if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK))) + mask |= EV_EX|EV_OOB; + } + break; + + case EV_RWBYTES|EV_OOB: + if ((evq->ee_eventmask & EV_EX)) { + if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK))) + mask |= EV_EX|EV_OOB; + } + /* + * fall into the next case + */ + case EV_RWBYTES: + if ((evq->ee_eventmask & EV_RE) && soreadable(sp)) { + if (sp->so_error) { + if ((sp->so_type == SOCK_STREAM) && ((sp->so_error == ECONNREFUSED) || (sp->so_error == ECONNRESET))) { + if ((sp->so_pcb == 0) || (((struct inpcb *)sp->so_pcb)->inp_state == INPCB_STATE_DEAD) || !(tp = sototcpcb(sp)) || + (tp->t_state == TCPS_CLOSED)) { + mask |= EV_RE|EV_RESET; + break; + } + } + } + mask |= EV_RE; + evq->ee_req.er_rcnt = sp->so_rcv.sb_cc; + + if (sp->so_state & SS_CANTRCVMORE) { + mask |= EV_FIN; + break; + } + } + if ((evq->ee_eventmask & EV_WR) && sowriteable(sp)) { + if (sp->so_error) { + if ((sp->so_type == SOCK_STREAM) && ((sp->so_error == ECONNREFUSED) || (sp->so_error == ECONNRESET))) { + if ((sp->so_pcb == 0) || (((struct inpcb *)sp->so_pcb)->inp_state == INPCB_STATE_DEAD) || !(tp = sototcpcb(sp)) || + (tp->t_state == TCPS_CLOSED)) { + mask |= EV_WR|EV_RESET; + break; + } + } + } + mask |= EV_WR; + evq->ee_req.er_wcnt = sbspace(&sp->so_snd); + } + break; + + case EV_RCONN: + if ((evq->ee_eventmask & EV_RE)) { + mask |= EV_RE|EV_RCONN; + evq->ee_req.er_rcnt = sp->so_qlen + 1; // incl this one + } + break; + + case EV_WCONN: + if ((evq->ee_eventmask & EV_WR)) { + mask |= EV_WR|EV_WCONN; + } + break; + + case EV_RCLOSED: + if ((evq->ee_eventmask & EV_RE)) { + mask |= EV_RE|EV_RCLOSED; + } + break; + + case EV_WCLOSED: + if ((evq->ee_eventmask & EV_WR)) { + mask |= EV_WR|EV_WCLOSED; + } + break; + + case EV_FIN: + if (evq->ee_eventmask & EV_RE) { + mask |= EV_RE|EV_FIN; + } + break; + + case EV_RESET: + case EV_TIMEOUT: + if (evq->ee_eventmask & EV_RE) { + mask |= EV_RE | event; + } + if (evq->ee_eventmask & EV_WR) { + mask |= EV_WR | event; + } + break; + + default: + 
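+			/*
+			 * [editor's aside, not part of the original patch]
+			 * note this path returns rather than breaks: an event
+			 * code outside the cases above abandons the scan for
+			 * the whole socket, which is equivalent here since the
+			 * same code would fail for every watcher on the list.
+			 */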
KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, -1, 0, 0, 0); + return; + } /* switch */ + + KERNEL_DEBUG(DBG_MISC_POST, (int)evq, evq->ee_eventmask, evq->ee_req.er_eventbits, mask, 0); + + if (mask) { + /* + * disarm... postevents are nops until this event is 'read' via + * waitevent and then re-armed via modwatch + */ + evq->ee_eventmask = 0; + + /* + * since events are disarmed until after the waitevent + * the ee_req.er_xxxx fields can't change once we've + * inserted this event into the proc queue... + * since waitevent can't see this event until we + * enqueue it, waitevent will see a 'consistent' + * snapshot of the event, even though it won't hold + * the socket lock, and we're updating the event outside + * of the proc lock, which it will hold + */ + evq->ee_req.er_eventbits |= mask; + + evprocenque(evq); + } + } + KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, 0, 0, 0, 0); } -struct evwatch_args { - struct eventreq *u_req; - int u_eventmask; -}; - /* * watchevent system call. user passes us an event to watch @@ -1552,78 +2063,99 @@ struct evwatch_args { * should this prevent duplicate events on same socket? */ int -watchevent(p, uap, retval) - struct proc *p; - struct evwatch_args *uap; - register_t *retval; +watchevent(proc_t p, struct watchevent_args *uap, __unused int *retval) { - struct eventqelt *eqp = (struct eventqelt *)0; - struct eventqelt *np; - struct eventreq *erp; - struct file *fp; - struct socket *sp; - int error; - - KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_START, 0,0,0,0,0); - - // get a qelt and fill with users req - MALLOC(eqp, struct eventqelt *, sizeof(struct eventqelt), M_TEMP, M_WAITOK); - if (!eqp) panic("can't MALLOC eqp"); - erp = &eqp->ee_req; - // get users request pkt - if (error = copyin((caddr_t)uap->u_req, (caddr_t)erp, - sizeof(struct eventreq))) { - FREE(eqp, M_TEMP); - KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0); - return(error); - } - KERNEL_DEBUG(DBG_MISC_WATCH, erp->er_handle,uap->u_eventmask,eqp,0,0); - // validate, freeing qelt if errors - error = 0; - if (erp->er_type != EV_FD) { - error = EINVAL; - } else if (erp->er_handle < 0) { - error = EBADF; - } else if (erp->er_handle > p->p_fd->fd_nfiles) { - error = EBADF; - } else if ((fp = *fdfile(p, erp->er_handle)) == NULL) { - error = EBADF; - } else if (fp->f_type != DTYPE_SOCKET) { - error = EINVAL; - } - if (error) { - FREE(eqp,M_TEMP); - KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0); - return(error); - } - - erp->er_rcnt = erp->er_wcnt = erp->er_eventbits = 0; - eqp->ee_proc = p; - eqp->ee_eventmask = uap->u_eventmask & EV_MASK; - eqp->ee_flags = 0; - - sp = (struct socket *)fp->f_data; - assert(sp != NULL); - - // only allow one watch per file per proc - for (np = sp->so_evlist.tqh_first; np != NULL; np = np->ee_slist.tqe_next) { - if (np->ee_proc == p) { - FREE(eqp,M_TEMP); - KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0); - return(EINVAL); - } - } - - TAILQ_INSERT_TAIL(&sp->so_evlist, eqp, ee_slist); - postevent(sp, 0, EV_RWBYTES); // catch existing events - KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, 0,0,0,0,0); - return(0); + struct eventqelt *evq = (struct eventqelt *)0; + struct eventqelt *np = NULL; + struct eventreq *erp; + struct fileproc *fp = NULL; + int error; + + KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_START, 0,0,0,0,0); + + // get a qelt and fill with users req + MALLOC(evq, struct eventqelt *, sizeof(struct eventqelt), M_TEMP, M_WAITOK); + + if (evq == NULL) + panic("can't MALLOC evq"); + erp = &evq->ee_req; + + // get users request pkt + if 
( (error = copyin(CAST_USER_ADDR_T(uap->u_req), (caddr_t)erp, + sizeof(struct eventreq))) ) { + FREE(evq, M_TEMP); + + KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0); + return(error); + } + KERNEL_DEBUG(DBG_MISC_WATCH, erp->er_handle,uap->u_eventmask,evq,0,0); + + // validate, freeing qelt if errors + error = 0; + proc_fdlock(p); + + if (erp->er_type != EV_FD) { + error = EINVAL; + } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) { + error = EBADF; + } else if (fp->f_type == DTYPE_SOCKET) { + socket_lock((struct socket *)fp->f_data, 1); + np = ((struct socket *)fp->f_data)->so_evlist.tqh_first; + } else if (fp->f_type == DTYPE_PIPE) { + PIPE_LOCK((struct pipe *)fp->f_data); + np = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first; + } else { + fp_drop(p, erp->er_handle, fp, 1); + error = EINVAL; + } + proc_fdunlock(p); + + if (error) { + FREE(evq, M_TEMP); + + KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0); + return(error); + } + + /* + * only allow one watch per file per proc + */ + for ( ; np != NULL; np = np->ee_slist.tqe_next) { + if (np->ee_proc == p) { + if (fp->f_type == DTYPE_SOCKET) + socket_unlock((struct socket *)fp->f_data, 1); + else + PIPE_UNLOCK((struct pipe *)fp->f_data); + fp_drop(p, erp->er_handle, fp, 0); + FREE(evq, M_TEMP); + + KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0); + return(EINVAL); + } + } + erp->er_ecnt = erp->er_rcnt = erp->er_wcnt = erp->er_eventbits = 0; + evq->ee_proc = p; + evq->ee_eventmask = uap->u_eventmask & EV_MASK; + evq->ee_flags = 0; + + if (fp->f_type == DTYPE_SOCKET) { + TAILQ_INSERT_TAIL(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist); + postevent((struct socket *)fp->f_data, 0, EV_RWBYTES); // catch existing events + + socket_unlock((struct socket *)fp->f_data, 1); + } else { + TAILQ_INSERT_TAIL(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist); + postpipeevent((struct pipe *)fp->f_data, EV_RWBYTES); + + PIPE_UNLOCK((struct pipe *)fp->f_data); + } + fp_drop_event(p, erp->er_handle, fp); + + KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, 0,0,0,0,0); + return(0); } -struct evwait_args { - struct eventreq *u_req; - struct timeval *tv; -}; + /* * waitevent system call. @@ -1632,57 +2164,71 @@ struct evwait_args { * or poll mode (tv=NULL); */ int -waitevent(p, uap, retval) - struct proc *p; - struct evwait_args *uap; - register_t *retval; +waitevent(proc_t p, struct waitevent_args *uap, int *retval) { - int error = 0; - struct eventqelt *eqp; + int error = 0; + struct eventqelt *evq; + struct eventreq er; uint64_t abstime, interval; if (uap->tv) { struct timeval atv; - error = copyin((caddr_t)uap->tv, (caddr_t)&atv, sizeof (atv)); + error = copyin(CAST_USER_ADDR_T(uap->tv), (caddr_t)&atv, sizeof (atv)); if (error) return(error); if (itimerfix(&atv)) { error = EINVAL; return(error); } - interval = tvtoabstime(&atv); - } - else - abstime = interval = 0; + } else + interval = 0; KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_START, 0,0,0,0,0); + proc_lock(p); retry: - if ((eqp = evprocdeque(p,NULL)) != NULL) { - error = copyout((caddr_t)&eqp->ee_req, - (caddr_t)uap->u_req, sizeof(struct eventreq)); - KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error, - eqp->ee_req.er_handle,eqp->ee_req.er_eventbits,eqp,0); + if ((evq = p->p_evlist.tqh_first) != NULL) { + /* + * found one... 
make a local copy while it's still on the queue + * to prevent it from changing while in the midst of copying + * don't want to hold the proc lock across a copyout because + * it might block on a page fault at the target in user space + */ + bcopy((caddr_t)&evq->ee_req, (caddr_t)&er, sizeof (struct eventreq)); + + TAILQ_REMOVE(&p->p_evlist, evq, ee_plist); + + evq->ee_flags &= ~EV_QUEUED; + proc_unlock(p); + + error = copyout((caddr_t)&er, CAST_USER_ADDR_T(uap->u_req), sizeof(struct eventreq)); + + KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error, + evq->ee_req.er_handle,evq->ee_req.er_eventbits,evq,0); return (error); } else { if (uap->tv && interval == 0) { + proc_unlock(p); *retval = 1; // poll failed - KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,0,0,0,0); + KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,0,0,0,0); return (error); } - if (interval != 0) clock_absolutetime_interval_to_deadline(interval, &abstime); + else + abstime = 0; KERNEL_DEBUG(DBG_MISC_WAIT, 1,&p->p_evlist,0,0,0); - error = tsleep1(&p->p_evlist, PSOCK | PCATCH, - "waitevent", abstime, (int (*)(int))0); + + error = msleep1(&p->p_evlist, &p->p_mlock, (PSOCK | PCATCH), "waitevent", abstime); + KERNEL_DEBUG(DBG_MISC_WAIT, 2,&p->p_evlist,0,0,0); + if (error == 0) goto retry; if (error == ERESTART) @@ -1692,16 +2238,12 @@ retry: error = 0; } } + proc_unlock(p); KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, 0,0,0,0,0); - return (error); } -struct modwatch_args { - struct eventreq *u_req; - int u_eventmask; -}; /* * modwatch system call. user passes in event to modify. @@ -1709,87 +2251,202 @@ struct modwatch_args { * it needed. */ int -modwatch(p, uap, retval) - struct proc *p; - struct modwatch_args *uap; - register_t *retval; +modwatch(proc_t p, struct modwatch_args *uap, __unused int *retval) { - struct eventreq er; - struct eventreq *erp = &er; - struct eventqelt *evq; - int error; - struct file *fp; - struct socket *sp; - int flag; - - KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_START, 0,0,0,0,0); - - // get users request pkt - if (error = copyin((caddr_t)uap->u_req, (caddr_t)erp, - sizeof(struct eventreq))) return(error); - - if (erp->er_type != EV_FD) return(EINVAL); - if (erp->er_handle < 0) return(EBADF); - if (erp->er_handle > p->p_fd->fd_nfiles) return(EBADF); - if ((fp = *fdfile(p, erp->er_handle)) == NULL) - return(EBADF); - if (fp->f_type != DTYPE_SOCKET) return(EINVAL); // for now must be sock - sp = (struct socket *)fp->f_data; - - /* soo_close sets f_data to 0 before switching funnel */ - if (sp == (struct socket *)0) - return(EBADF); - - // locate event if possible - for (evq = sp->so_evlist.tqh_first; - evq != NULL; evq = evq->ee_slist.tqe_next) { - if (evq->ee_proc == p) break; - } - - if (evq == NULL) { - KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, EINVAL,0,0,0,0); - return(EINVAL); - } - KERNEL_DEBUG(DBG_MISC_MOD, erp->er_handle,uap->u_eventmask,evq,0,0); - - if (uap->u_eventmask == EV_RM) { - evprocdeque(p, evq); - TAILQ_REMOVE(&sp->so_evlist, evq, ee_slist); - FREE(evq, M_TEMP); - KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, 0,0,0,0,0); - return(0); - } - - switch (uap->u_eventmask & EV_MASK) { + struct eventreq er; + struct eventreq *erp = &er; + struct eventqelt *evq; + int error; + struct fileproc *fp; + int flag; + + KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_START, 0,0,0,0,0); + + /* + * get user's request pkt + */ + if ((error = copyin(CAST_USER_ADDR_T(uap->u_req), (caddr_t)erp, + sizeof(struct eventreq)))) { + KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0); + return(error); + } + proc_fdlock(p); + + if 
(erp->er_type != EV_FD) { + error = EINVAL; + } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) { + error = EBADF; + } else if (fp->f_type == DTYPE_SOCKET) { + socket_lock((struct socket *)fp->f_data, 1); + evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first; + } else if (fp->f_type == DTYPE_PIPE) { + PIPE_LOCK((struct pipe *)fp->f_data); + evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first; + } else { + fp_drop(p, erp->er_handle, fp, 1); + error = EINVAL; + } + + if (error) { + proc_fdunlock(p); + KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0); + return(error); + } + + if ((uap->u_eventmask == EV_RM) && (fp->f_flags & FP_WAITEVENT)) { + fp->f_flags &= ~FP_WAITEVENT; + } + proc_fdunlock(p); + + // locate event if possible + for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) { + if (evq->ee_proc == p) + break; + } + if (evq == NULL) { + if (fp->f_type == DTYPE_SOCKET) + socket_unlock((struct socket *)fp->f_data, 1); + else + PIPE_UNLOCK((struct pipe *)fp->f_data); + fp_drop(p, erp->er_handle, fp, 0); + KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, EINVAL,0,0,0,0); + return(EINVAL); + } + KERNEL_DEBUG(DBG_MISC_MOD, erp->er_handle,uap->u_eventmask,evq,0,0); + + if (uap->u_eventmask == EV_RM) { + EVPROCDEQUE(p, evq); + + if (fp->f_type == DTYPE_SOCKET) { + TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist); + socket_unlock((struct socket *)fp->f_data, 1); + } else { + TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist); + PIPE_UNLOCK((struct pipe *)fp->f_data); + } + fp_drop(p, erp->er_handle, fp, 0); + FREE(evq, M_TEMP); + KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, 0,0,0,0,0); + return(0); + } + switch (uap->u_eventmask & EV_MASK) { - case 0: - flag = 0; - break; - - case EV_RE: - case EV_WR: - case EV_RE|EV_WR: - flag = EV_RWBYTES; - break; - - case EV_EX: - flag = EV_OOB; - break; - - case EV_EX|EV_RE: - case EV_EX|EV_WR: - case EV_EX|EV_RE|EV_WR: - flag = EV_OOB|EV_RWBYTES; - break; - - default: - return(EINVAL); - } - - evq->ee_eventmask = uap->u_eventmask & EV_MASK; - evprocdeque(p, evq); - evq->ee_req.er_eventbits = 0; - postevent(sp, 0, flag); - KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, evq->ee_req.er_handle,evq->ee_eventmask,sp,flag,0); - return(0); + case 0: + flag = 0; + break; + + case EV_RE: + case EV_WR: + case EV_RE|EV_WR: + flag = EV_RWBYTES; + break; + + case EV_EX: + flag = EV_OOB; + break; + + case EV_EX|EV_RE: + case EV_EX|EV_WR: + case EV_EX|EV_RE|EV_WR: + flag = EV_OOB|EV_RWBYTES; + break; + + default: + if (fp->f_type == DTYPE_SOCKET) + socket_unlock((struct socket *)fp->f_data, 1); + else + PIPE_UNLOCK((struct pipe *)fp->f_data); + fp_drop(p, erp->er_handle, fp, 0); + KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0); + return(EINVAL); + } + /* + * since we're holding the socket/pipe lock, the event + * cannot go from the unqueued state to the queued state + * however, it can go from the queued state to the unqueued state + * since that direction is protected by the proc_lock... + * so do a quick check for EV_QUEUED w/o holding the proc lock + * since by far the common case will be NOT EV_QUEUED, this saves + * us taking the proc_lock the majority of the time + */ + if (evq->ee_flags & EV_QUEUED) { + /* + * EVPROCDEQUE will recheck the state after it grabs the proc_lock + */ + EVPROCDEQUE(p, evq); + } + /* + * while the event is off the proc queue and + * we're holding the socket/pipe lock + * it's safe to update these fields... 
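+ *
+ * illustrative sketch, not part of the original change: user code
+ * reaches this point by re-arming a descriptor it previously
+ * registered with watchevent(), e.g.
+ *
+ *	struct eventreq er;
+ *
+ *	er.er_type   = EV_FD;
+ *	er.er_handle = fd;		/* fd passed to watchevent() earlier */
+ *	modwatch(&er, EV_RE | EV_WR);	/* re-arm for read/write events */
+ *
+ * passing EV_RM instead takes the removal path handled above.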
+ */ + evq->ee_req.er_eventbits = 0; + evq->ee_eventmask = uap->u_eventmask & EV_MASK; + + if (fp->f_type == DTYPE_SOCKET) { + postevent((struct socket *)fp->f_data, 0, flag); + socket_unlock((struct socket *)fp->f_data, 1); + } + else { + postpipeevent((struct pipe *)fp->f_data, flag); + PIPE_UNLOCK((struct pipe *)fp->f_data); + } + fp_drop(p, erp->er_handle, fp, 0); + KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, evq->ee_req.er_handle,evq->ee_eventmask,fp->f_data,flag,0); + return(0); } + +/* this routine is called from the close of fd with proc_fdlock held */ +int +waitevent_close(struct proc *p, struct fileproc *fp) +{ + struct eventqelt *evq; + + + fp->f_flags &= ~FP_WAITEVENT; + + if (fp->f_type == DTYPE_SOCKET) { + socket_lock((struct socket *)fp->f_data, 1); + evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first; + } + else if (fp->f_type == DTYPE_PIPE) { + PIPE_LOCK((struct pipe *)fp->f_data); + evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first; + } + else { + return(EINVAL); + } + proc_fdunlock(p); + + + // locate event if possible + for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) { + if (evq->ee_proc == p) + break; + } + if (evq == NULL) { + if (fp->f_type == DTYPE_SOCKET) + socket_unlock((struct socket *)fp->f_data, 1); + else + PIPE_UNLOCK((struct pipe *)fp->f_data); + + proc_fdlock(p); + + return(EINVAL); + } + EVPROCDEQUE(p, evq); + + if (fp->f_type == DTYPE_SOCKET) { + TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist); + socket_unlock((struct socket *)fp->f_data, 1); + } else { + TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist); + PIPE_UNLOCK((struct pipe *)fp->f_data); + } + FREE(evq, M_TEMP); + + proc_fdlock(p); + + return(0); +} + diff --git a/bsd/kern/sys_pipe.c b/bsd/kern/sys_pipe.c new file mode 100644 index 000000000..2fb396aa0 --- /dev/null +++ b/bsd/kern/sys_pipe.c @@ -0,0 +1,1646 @@ +/* + * Copyright (c) 1996 John S. Dyson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * 4. Modifications may be freely made to this file if the above conditions + * are met. + */ +/* + * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. 
Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +/* + * This file contains a high-performance replacement for the socket-based + * pipes scheme originally used in FreeBSD/4.4Lite. It does not support + * all features of sockets, but does do everything that pipes normally + * do. + */ + +/* + * This code has two modes of operation, a small write mode and a large + * write mode. The small write mode acts like conventional pipes with + * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the + * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT + * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and + * the receiving process can copy it directly from the pages in the sending + * process. + * + * If the sending process receives a signal, it is possible that it will + * go away, and certainly its address space can change, because control + * is returned back to the user-mode side. In that case, the pipe code + * arranges to copy the buffer supplied by the user process, to a pageable + * kernel buffer, and the receiving process will grab the data from the + * pageable kernel buffer. Since signals don't happen all that often, + * the copy operation is normally eliminated. + * + * The constant PIPE_MINDIRECT is chosen to make sure that buffering will + * happen for small transfers so that the system will not spend all of + * its time context switching. + * + * In order to limit the resource use of pipes, two sysctls exist: + * + * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable + * address space available to us in pipe_map. Whenever the amount in use + * exceeds half of this value, all new pipes will be created with size + * SMALL_PIPE_SIZE, rather than PIPE_SIZE. Big pipe creation will be limited + * as well. This value is loader tunable only. + * + * kern.ipc.maxpipekvawired - This value limits the amount of memory that may + * be wired in order to facilitate direct copies using page flipping. + * Whenever this value is exceeded, pipes will fall back to using regular + * copies. This value is sysctl controllable at all times. + * + * These values are autotuned in subr_param.c. + * + * Memory usage may be monitored through the sysctls + * kern.ipc.pipes, kern.ipc.pipekva and kern.ipc.pipekvawired. + * + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/vnode.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> +#include <sys/file_internal.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> +#include <sys/syslog.h> +#include <sys/unistd.h> +#include <sys/resourcevar.h> +#include <sys/aio_kern.h> +#include <sys/signalvar.h> +#include <sys/pipe.h> +#include <sys/sysproto.h> + +#include <bsm/audit_kernel.h> + +#include <sys/kdebug.h> + +#include <kern/zalloc.h> +#include <vm/vm_kern.h> +#include <libkern/OSAtomic.h> + +#define f_flag f_fglob->fg_flag +#define f_type f_fglob->fg_type +#define f_msgcount f_fglob->fg_msgcount +#define f_cred f_fglob->fg_cred +#define f_ops f_fglob->fg_ops +#define f_offset f_fglob->fg_offset +#define f_data f_fglob->fg_data +/* + * Use this define if you want to disable *fancy* VM things. Expect an + * approx 30% decrease in transfer rate. This could be useful for + * NetBSD or OpenBSD. 
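+ *
+ * (illustrative aside, not in the original change: with the direct
+ * mode compiled in, a sufficiently large write on a blocking
+ * descriptor, at least PIPE_MINDIRECT bytes, is wired and mapped
+ * rather than copied,
+ *
+ *	char buf[64 * 1024];
+ *	write(pfd[1], buf, sizeof buf);	/* page-flipped to the reader */
+ *
+ * while with PIPE_NODIRECT defined the same write always lands in
+ * the kernel pipe buffer.)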
+ * + * this needs to be ported to X and the performance measured + * before committing to supporting it + */ +#define PIPE_NODIRECT 1 + +#ifndef PIPE_NODIRECT + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/uma.h> + +#endif + + +/* + * interfaces to the outside world + */ +static int pipe_read(struct fileproc *fp, struct uio *uio, + kauth_cred_t cred, int flags, struct proc *p); + +static int pipe_write(struct fileproc *fp, struct uio *uio, + kauth_cred_t cred, int flags, struct proc *p); + +static int pipe_close(struct fileglob *fg, struct proc *p); + +static int pipe_select(struct fileproc *fp, int which, void * wql, struct proc *p); + +static int pipe_kqfilter(struct fileproc *fp, struct knote *kn, struct proc *p); + +static int pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data, struct proc *p); + + +struct fileops pipeops = + { pipe_read, + pipe_write, + pipe_ioctl, + pipe_select, + pipe_close, + pipe_kqfilter, + 0 }; + + +static void filt_pipedetach(struct knote *kn); +static int filt_piperead(struct knote *kn, long hint); +static int filt_pipewrite(struct knote *kn, long hint); + +static struct filterops pipe_rfiltops = + { 1, NULL, filt_pipedetach, filt_piperead }; +static struct filterops pipe_wfiltops = + { 1, NULL, filt_pipedetach, filt_pipewrite }; + +/* + * Default pipe buffer size(s), this can be kind-of large now because pipe + * space is pageable. The pipe code will try to maintain locality of + * reference for performance reasons, so small amounts of outstanding I/O + * will not wipe the cache. + */ +#define MINPIPESIZE (PIPE_SIZE/3) + +/* + * Limit the number of "big" pipes + */ +#define LIMITBIGPIPES 32 +static int nbigpipe; + +static int amountpipes; +static int amountpipekva; + +#ifndef PIPE_NODIRECT +static int amountpipekvawired; +#endif +int maxpipekva = 1024 * 1024 * 16; + +#if PIPE_SYSCTLS +SYSCTL_DECL(_kern_ipc); + +SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RD, + &maxpipekva, 0, "Pipe KVA limit"); +SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW, + &maxpipekvawired, 0, "Pipe KVA wired limit"); +SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD, + &amountpipes, 0, "Current # of pipes"); +SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD, + &nbigpipe, 0, "Current # of big pipes"); +SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD, + &amountpipekva, 0, "Pipe KVA usage"); +SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD, + &amountpipekvawired, 0, "Pipe wired KVA usage"); +#endif + +void pipeinit(void *dummy __unused); +static void pipeclose(struct pipe *cpipe); +static void pipe_free_kmem(struct pipe *cpipe); +static int pipe_create(struct pipe **cpipep); +static void pipeselwakeup(struct pipe *cpipe, struct pipe *spipe); +static __inline int pipelock(struct pipe *cpipe, int catch); +static __inline void pipeunlock(struct pipe *cpipe); + +#ifndef PIPE_NODIRECT +static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio); +static void pipe_destroy_write_buffer(struct pipe *wpipe); +static int pipe_direct_write(struct pipe *wpipe, struct uio *uio); +static void pipe_clone_write_buffer(struct pipe *wpipe); +#endif + +extern int postpipeevent(struct pipe *, int); +extern void evpipefree(struct pipe *cpipe); + + +static int pipespace(struct pipe *cpipe, int size); + +static lck_grp_t *pipe_mtx_grp; +static lck_attr_t *pipe_mtx_attr; +static lck_grp_attr_t 
*pipe_mtx_grp_attr; + +static zone_t pipe_zone; + +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); + +void +pipeinit(void *dummy __unused) +{ + pipe_zone = (zone_t)zinit(sizeof(struct pipe), 8192 * sizeof(struct pipe), 4096, "pipe zone"); + + /* + * allocate lock group attribute and group for pipe mutexes + */ + pipe_mtx_grp_attr = lck_grp_attr_alloc_init(); + //lck_grp_attr_setstat(pipe_mtx_grp_attr); + pipe_mtx_grp = lck_grp_alloc_init("pipe", pipe_mtx_grp_attr); + + /* + * allocate the lock attribute for pipe mutexes + */ + pipe_mtx_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(pipe_mtx_attr); +} + + + +/* + * The pipe system call for the DTYPE_PIPE type of pipes + */ + +/* ARGSUSED */ +int +pipe(struct proc *p, __unused struct pipe_args *uap, register_t *retval) +{ + struct fileproc *rf, *wf; + struct pipe *rpipe, *wpipe; + lck_mtx_t *pmtx; + int fd, error; + + if ((pmtx = lck_mtx_alloc_init(pipe_mtx_grp, pipe_mtx_attr)) == NULL) + return (ENOMEM); + + rpipe = wpipe = NULL; + if (pipe_create(&rpipe) || pipe_create(&wpipe)) { + error = ENFILE; + goto freepipes; + } + /* + * allocate the space for the normal I/O direction up + * front... we'll delay the allocation for the other + * direction until a write actually occurs (most + * likely it won't)... + * + * Reduce to 1/4th pipe size if we're over our global max. + */ + if (amountpipekva > maxpipekva / 2) + error = pipespace(rpipe, SMALL_PIPE_SIZE); + else + error = pipespace(rpipe, PIPE_SIZE); + if (error) + goto freepipes; + +#ifndef PIPE_NODIRECT + rpipe->pipe_state |= PIPE_DIRECTOK; + wpipe->pipe_state |= PIPE_DIRECTOK; +#endif + TAILQ_INIT(&rpipe->pipe_evlist); + TAILQ_INIT(&wpipe->pipe_evlist); + + error = falloc(p, &rf, &fd); + if (error) { + goto freepipes; + } + retval[0] = fd; + + /* + * for now we'll create half-duplex + * pipes... this is what we've always + * supported.. + */ + rf->f_flag = FREAD; + rf->f_type = DTYPE_PIPE; + rf->f_data = (caddr_t)rpipe; + rf->f_ops = &pipeops; + + error = falloc(p, &wf, &fd); + if (error) { + fp_free(p, retval[0], rf); + goto freepipes; + } + wf->f_flag = FWRITE; + wf->f_type = DTYPE_PIPE; + wf->f_data = (caddr_t)wpipe; + wf->f_ops = &pipeops; + + retval[1] = fd; +#ifdef MAC + /* + * XXXXXXXX SHOULD NOT HOLD FILE_LOCK() XXXXXXXXXXXX + * + * struct pipe represents a pipe endpoint. The MAC label is shared + * between the connected endpoints. As a result mac_init_pipe() and + * mac_create_pipe() should only be called on one of the endpoints + * after they have been connected. 
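+ *
+ * illustrative note, not part of the original change: the
+ * td->td_ucred reference below is FreeBSD residue (XNU has no
+ * struct thread "td"), and the whole block is compiled out here.
+ * A real port would presumably substitute the caller's kauth
+ * credential, perhaps along the lines of
+ *
+ *	mac_create_pipe(kauth_cred_get(), rpipe);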
+ */ + mac_init_pipe(rpipe); + mac_create_pipe(td->td_ucred, rpipe); +#endif + proc_fdlock(p); + *fdflags(p, retval[0]) &= ~UF_RESERVED; + *fdflags(p, retval[1]) &= ~UF_RESERVED; + fp_drop(p, retval[0], rf, 1); + fp_drop(p, retval[1], wf, 1); + proc_fdunlock(p); + + rpipe->pipe_peer = wpipe; + wpipe->pipe_peer = rpipe; + + rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx; + + return (0); + +freepipes: + pipeclose(rpipe); + pipeclose(wpipe); + lck_mtx_free(pmtx, pipe_mtx_grp); + + return (error); +} + + +int +pipe_stat(struct pipe *cpipe, struct stat *ub) +{ +#ifdef MAC + int error; +#endif + struct timeval now; + + if (cpipe == NULL) + return (EBADF); +#ifdef MAC + PIPE_LOCK(cpipe); + error = mac_check_pipe_stat(active_cred, cpipe); + PIPE_UNLOCK(cpipe); + if (error) + return (error); +#endif + if (cpipe->pipe_buffer.buffer == 0) { + /* + * must be stat'ing the write fd + */ + cpipe = cpipe->pipe_peer; + + if (cpipe == NULL) + return (EBADF); + } + bzero(ub, sizeof(*ub)); + ub->st_mode = S_IFIFO | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; + ub->st_blksize = cpipe->pipe_buffer.size; + ub->st_size = cpipe->pipe_buffer.cnt; + ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize; + ub->st_nlink = 1; + + ub->st_uid = kauth_getuid(); + ub->st_gid = kauth_getgid(); + + microtime(&now); + ub->st_atimespec.tv_sec = now.tv_sec; + ub->st_atimespec.tv_nsec = now.tv_usec * 1000; + + ub->st_mtimespec.tv_sec = now.tv_sec; + ub->st_mtimespec.tv_nsec = now.tv_usec * 1000; + + ub->st_ctimespec.tv_sec = now.tv_sec; + ub->st_ctimespec.tv_nsec = now.tv_usec * 1000; + + /* + * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen, st_uid, st_gid. + * XXX (st_dev, st_ino) should be unique. + */ + return (0); +} + + +/* + * Allocate kva for pipe circular buffer, the space is pageable + * This routine will 'realloc' the size of a pipe safely, if it fails + * it will retain the old buffer. + * If it fails it will return ENOMEM. + */ +static int +pipespace(struct pipe *cpipe, int size) +{ + vm_offset_t buffer; + + size = round_page(size); + + if (kmem_alloc(kernel_map, &buffer, size) != KERN_SUCCESS) + return(ENOMEM); + + /* free old resources if we're resizing */ + pipe_free_kmem(cpipe); + cpipe->pipe_buffer.buffer = (caddr_t)buffer; + cpipe->pipe_buffer.size = size; + cpipe->pipe_buffer.in = 0; + cpipe->pipe_buffer.out = 0; + cpipe->pipe_buffer.cnt = 0; + + OSAddAtomic(1, (SInt32 *)&amountpipes); + OSAddAtomic(cpipe->pipe_buffer.size, (SInt32 *)&amountpipekva); + + return (0); +} + +/* + * initialize and allocate VM and memory for pipe + */ +static int +pipe_create(struct pipe **cpipep) +{ + struct pipe *cpipe; + + cpipe = (struct pipe *)zalloc(pipe_zone); + + if ((*cpipep = cpipe) == NULL) + return (ENOMEM); + + /* + * protect so pipespace or pipeclose don't follow a junk pointer + * if pipespace() fails. + */ + bzero(cpipe, sizeof *cpipe); + + return (0); +} + + +/* + * lock a pipe for I/O, blocking other access + */ +static __inline int +pipelock(cpipe, catch) + struct pipe *cpipe; + int catch; +{ + int error; + + while (cpipe->pipe_state & PIPE_LOCKFL) { + cpipe->pipe_state |= PIPE_LWANT; + + error = msleep(cpipe, PIPE_MTX(cpipe), catch ? 
(PRIBIO | PCATCH) : PRIBIO, + "pipelk", 0); + if (error != 0) + return (error); + } + cpipe->pipe_state |= PIPE_LOCKFL; + + return (0); +} + +/* + * unlock a pipe I/O lock + */ +static __inline void +pipeunlock(cpipe) + struct pipe *cpipe; +{ + + cpipe->pipe_state &= ~PIPE_LOCKFL; + + if (cpipe->pipe_state & PIPE_LWANT) { + cpipe->pipe_state &= ~PIPE_LWANT; + wakeup(cpipe); + } +} + +static void +pipeselwakeup(cpipe, spipe) + struct pipe *cpipe; + struct pipe *spipe; +{ + + if (cpipe->pipe_state & PIPE_SEL) { + cpipe->pipe_state &= ~PIPE_SEL; + selwakeup(&cpipe->pipe_sel); + } + if (cpipe->pipe_state & PIPE_KNOTE) + KNOTE(&cpipe->pipe_sel.si_note, 1); + + postpipeevent(cpipe, EV_RWBYTES); + + if (spipe && (spipe->pipe_state & PIPE_ASYNC) && spipe->pipe_pgid) { + struct proc *p; + + if (spipe->pipe_pgid < 0) + gsignal(-spipe->pipe_pgid, SIGIO); + else if ((p = pfind(spipe->pipe_pgid)) != (struct proc *)0) + psignal(p, SIGIO); + } +} + +/* ARGSUSED */ +static int +pipe_read(struct fileproc *fp, struct uio *uio, __unused kauth_cred_t active_cred, __unused int flags, __unused struct proc *p) +{ + struct pipe *rpipe = (struct pipe *)fp->f_data; + int error; + int nread = 0; + u_int size; + + PIPE_LOCK(rpipe); + ++rpipe->pipe_busy; + + error = pipelock(rpipe, 1); + if (error) + goto unlocked_error; + +#ifdef MAC + error = mac_check_pipe_read(active_cred, rpipe); + if (error) + goto locked_error; +#endif + + while (uio_resid(uio)) { + /* + * normal pipe buffer receive + */ + if (rpipe->pipe_buffer.cnt > 0) { + size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; + if (size > rpipe->pipe_buffer.cnt) + size = rpipe->pipe_buffer.cnt; + // LP64todo - fix this! + if (size > (u_int) uio_resid(uio)) + size = (u_int) uio_resid(uio); + + PIPE_UNLOCK(rpipe); + error = uiomove( + &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], + size, uio); + PIPE_LOCK(rpipe); + if (error) + break; + + rpipe->pipe_buffer.out += size; + if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) + rpipe->pipe_buffer.out = 0; + + rpipe->pipe_buffer.cnt -= size; + + /* + * If there is no more to read in the pipe, reset + * its pointers to the beginning. This improves + * cache hit stats. + */ + if (rpipe->pipe_buffer.cnt == 0) { + rpipe->pipe_buffer.in = 0; + rpipe->pipe_buffer.out = 0; + } + nread += size; +#ifndef PIPE_NODIRECT + /* + * Direct copy, bypassing a kernel buffer. + */ + } else if ((size = rpipe->pipe_map.cnt) && + (rpipe->pipe_state & PIPE_DIRECTW)) { + caddr_t va; + // LP64todo - fix this! + if (size > (u_int) uio_resid(uio)) + size = (u_int) uio_resid(uio); + + va = (caddr_t) rpipe->pipe_map.kva + + rpipe->pipe_map.pos; + PIPE_UNLOCK(rpipe); + error = uiomove(va, size, uio); + PIPE_LOCK(rpipe); + if (error) + break; + nread += size; + rpipe->pipe_map.pos += size; + rpipe->pipe_map.cnt -= size; + if (rpipe->pipe_map.cnt == 0) { + rpipe->pipe_state &= ~PIPE_DIRECTW; + wakeup(rpipe); + } +#endif + } else { + /* + * detect EOF condition + * read returns 0 on EOF, no need to set error + */ + if (rpipe->pipe_state & PIPE_EOF) + break; + + /* + * If the "write-side" has been blocked, wake it up now. + */ + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + + /* + * Break if some data was read. + */ + if (nread > 0) + break; + + /* + * Unlock the pipe buffer for our remaining processing. + * We will either break out with an error or we will + * sleep and relock to loop. + */ + pipeunlock(rpipe); + + /* + * Handle non-blocking mode operation or + * wait for more data. 
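+ *
+ * illustrative example, not part of the original change: a
+ * reader that has set O_NONBLOCK takes the EAGAIN branch below
+ * rather than sleeping in "piperd":
+ *
+ *	fcntl(pfd[0], F_SETFL, O_NONBLOCK);
+ *	if (read(pfd[0], buf, sizeof buf) == -1 && errno == EAGAIN)
+ *		;	/* pipe empty, but the writer is still open */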
+ */ + if (fp->f_flag & FNONBLOCK) { + error = EAGAIN; + } else { + rpipe->pipe_state |= PIPE_WANTR; + + error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "piperd", 0); + + if (error == 0) + error = pipelock(rpipe, 1); + } + if (error) + goto unlocked_error; + } + } +#ifdef MAC +locked_error: +#endif + pipeunlock(rpipe); + +unlocked_error: + --rpipe->pipe_busy; + + /* + * PIPE_WANT processing only makes sense if pipe_busy is 0. + */ + if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { + rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); + wakeup(rpipe); + } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { + /* + * Handle write blocking hysteresis. + */ + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + } + + if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) + pipeselwakeup(rpipe, rpipe->pipe_peer); + + PIPE_UNLOCK(rpipe); + + return (error); +} + + + +#ifndef PIPE_NODIRECT +/* + * Map the sending processes' buffer into kernel space and wire it. + * This is similar to a physical write operation. + */ +static int +pipe_build_write_buffer(wpipe, uio) + struct pipe *wpipe; + struct uio *uio; +{ + pmap_t pmap; + u_int size; + int i, j; + vm_offset_t addr, endaddr; + + + size = (u_int) uio->uio_iov->iov_len; + if (size > wpipe->pipe_buffer.size) + size = wpipe->pipe_buffer.size; + + pmap = vmspace_pmap(curproc->p_vmspace); + endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size); + addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base); + for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) { + /* + * vm_fault_quick() can sleep. Consequently, + * vm_page_lock_queue() and vm_page_unlock_queue() + * should not be performed outside of this loop. + */ + race: + if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) { + vm_page_lock_queues(); + for (j = 0; j < i; j++) + vm_page_unhold(wpipe->pipe_map.ms[j]); + vm_page_unlock_queues(); + return (EFAULT); + } + wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr, + VM_PROT_READ); + if (wpipe->pipe_map.ms[i] == NULL) + goto race; + } + +/* + * set up the control block + */ + wpipe->pipe_map.npages = i; + wpipe->pipe_map.pos = + ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; + wpipe->pipe_map.cnt = size; + +/* + * and map the buffer + */ + if (wpipe->pipe_map.kva == 0) { + /* + * We need to allocate space for an extra page because the + * address range might (will) span pages at times. 
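+ *
+ * worked example (illustrative): an 8192-byte buffer starting at
+ * user address 0x1800 occupies the three 4K pages 0x1000-0x1fff,
+ * 0x2000-0x2fff and 0x3000-0x3fff, so mapping pipe_buffer.size
+ * bytes of user data can need one page more of kva than the
+ * buffer size itself.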
+ */ + wpipe->pipe_map.kva = kmem_alloc_nofault(kernel_map, + wpipe->pipe_buffer.size + PAGE_SIZE); + atomic_add_int(&amountpipekvawired, + wpipe->pipe_buffer.size + PAGE_SIZE); + } + pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms, + wpipe->pipe_map.npages); + +/* + * and update the uio data + */ + + uio->uio_iov->iov_len -= size; + uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size; + if (uio->uio_iov->iov_len == 0) + uio->uio_iov++; + uio_setresid(uio, (uio_resid(uio) - size)); + uio->uio_offset += size; + return (0); +} + +/* + * unmap and unwire the process buffer + */ +static void +pipe_destroy_write_buffer(wpipe) + struct pipe *wpipe; +{ + int i; + + if (wpipe->pipe_map.kva) { + pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages); + + if (amountpipekvawired > maxpipekvawired / 2) { + /* Conserve address space */ + vm_offset_t kva = wpipe->pipe_map.kva; + wpipe->pipe_map.kva = 0; + kmem_free(kernel_map, kva, + wpipe->pipe_buffer.size + PAGE_SIZE); + atomic_subtract_int(&amountpipekvawired, + wpipe->pipe_buffer.size + PAGE_SIZE); + } + } + vm_page_lock_queues(); + for (i = 0; i < wpipe->pipe_map.npages; i++) { + vm_page_unhold(wpipe->pipe_map.ms[i]); + } + vm_page_unlock_queues(); + wpipe->pipe_map.npages = 0; +} + +/* + * In the case of a signal, the writing process might go away. This + * code copies the data into the circular buffer so that the source + * pages can be freed without loss of data. + */ +static void +pipe_clone_write_buffer(wpipe) + struct pipe *wpipe; +{ + int size; + int pos; + + size = wpipe->pipe_map.cnt; + pos = wpipe->pipe_map.pos; + + wpipe->pipe_buffer.in = size; + wpipe->pipe_buffer.out = 0; + wpipe->pipe_buffer.cnt = size; + wpipe->pipe_state &= ~PIPE_DIRECTW; + + PIPE_UNLOCK(wpipe); + bcopy((caddr_t) wpipe->pipe_map.kva + pos, + wpipe->pipe_buffer.buffer, size); + pipe_destroy_write_buffer(wpipe); + PIPE_LOCK(wpipe); +} + +/* + * This implements the pipe buffer write mechanism. Note that only + * a direct write OR a normal pipe write can be pending at any given time. + * If there are any characters in the pipe buffer, the direct write will + * be deferred until the receiving process grabs all of the bytes from + * the pipe buffer. Then the direct mapping write is set-up. 
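+ *
+ * summary of the handshake implemented below (illustrative only):
+ *
+ *	writer: sleep while PIPE_DIRECTW is set or bytes remain
+ *		in the regular pipe buffer
+ *	writer: set PIPE_DIRECTW, wire the source pages, wake reader
+ *	reader: copy out of pipe_map, then clear PIPE_DIRECTW
+ *	writer: unwire (or clone, if interrupted) the buffer, return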
+ */ +static int +pipe_direct_write(wpipe, uio) + struct pipe *wpipe; + struct uio *uio; +{ + int error; + +retry: + while (wpipe->pipe_state & PIPE_DIRECTW) { + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + wpipe->pipe_state |= PIPE_WANTW; + error = msleep(wpipe, PIPE_MTX(wpipe), + PRIBIO | PCATCH, "pipdww", 0); + if (error) + goto error1; + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + goto error1; + } + } + wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ + if (wpipe->pipe_buffer.cnt > 0) { + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + + wpipe->pipe_state |= PIPE_WANTW; + error = msleep(wpipe, PIPE_MTX(wpipe), + PRIBIO | PCATCH, "pipdwc", 0); + if (error) + goto error1; + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + goto error1; + } + goto retry; + } + + wpipe->pipe_state |= PIPE_DIRECTW; + + pipelock(wpipe, 0); + PIPE_UNLOCK(wpipe); + error = pipe_build_write_buffer(wpipe, uio); + PIPE_LOCK(wpipe); + pipeunlock(wpipe); + if (error) { + wpipe->pipe_state &= ~PIPE_DIRECTW; + goto error1; + } + + error = 0; + while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { + if (wpipe->pipe_state & PIPE_EOF) { + pipelock(wpipe, 0); + PIPE_UNLOCK(wpipe); + pipe_destroy_write_buffer(wpipe); + PIPE_LOCK(wpipe); + pipeselwakeup(wpipe, wpipe); + pipeunlock(wpipe); + error = EPIPE; + goto error1; + } + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + pipeselwakeup(wpipe, wpipe); + error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, + "pipdwt", 0); + } + + pipelock(wpipe,0); + if (wpipe->pipe_state & PIPE_DIRECTW) { + /* + * this bit of trickery substitutes a kernel buffer for + * the process that might be going away. + */ + pipe_clone_write_buffer(wpipe); + } else { + PIPE_UNLOCK(wpipe); + pipe_destroy_write_buffer(wpipe); + PIPE_LOCK(wpipe); + } + pipeunlock(wpipe); + return (error); + +error1: + wakeup(wpipe); + return (error); +} +#endif + + + +static int +pipe_write(struct fileproc *fp, struct uio *uio, __unused kauth_cred_t active_cred, __unused int flags, __unused struct proc *p) +{ + int error = 0; + int orig_resid; + int pipe_size; + struct pipe *wpipe, *rpipe; + + rpipe = (struct pipe *)fp->f_data; + + PIPE_LOCK(rpipe); + wpipe = rpipe->pipe_peer; + + /* + * detect loss of pipe read side, issue SIGPIPE if lost. + */ + if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF)) { + PIPE_UNLOCK(rpipe); + return (EPIPE); + } +#ifdef MAC + error = mac_check_pipe_write(active_cred, wpipe); + if (error) { + PIPE_UNLOCK(rpipe); + return (error); + } +#endif + ++wpipe->pipe_busy; + + pipe_size = 0; + + if (wpipe->pipe_buffer.buffer == 0) { + /* + * need to allocate some storage... we delay the allocation + * until the first write on fd[0] to avoid allocating storage for both + * 'pipe ends'... most pipes are half-duplex with the writes targeting + * fd[1], so allocating space for both ends is a waste... + * + * Reduce to 1/4th pipe size if we're over our global max. + */ + if (amountpipekva > maxpipekva / 2) + pipe_size = SMALL_PIPE_SIZE; + else + pipe_size = PIPE_SIZE; + } + + /* + * If it is advantageous to resize the pipe buffer, do + * so. 
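+ *
+ * concretely (illustrative note): per the test below, a write
+ * larger than PIPE_SIZE into an empty pipe still at its initial
+ * size is upgraded to BIG_PIPE_SIZE, provided pipe kva use is
+ * under half of maxpipekva and fewer than LIMITBIGPIPES big
+ * pipes exist.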
+ */ + if ((uio_resid(uio) > PIPE_SIZE) && + (wpipe->pipe_buffer.size <= PIPE_SIZE) && + (amountpipekva < maxpipekva / 2) && + (nbigpipe < LIMITBIGPIPES) && +#ifndef PIPE_NODIRECT + (wpipe->pipe_state & PIPE_DIRECTW) == 0 && +#endif + (wpipe->pipe_buffer.cnt == 0)) { + + pipe_size = BIG_PIPE_SIZE; + + } + if (pipe_size) { + /* + * need to do initial allocation or resizing of pipe + */ + if ((error = pipelock(wpipe, 1)) == 0) { + PIPE_UNLOCK(wpipe); + if (pipespace(wpipe, pipe_size) == 0) + OSAddAtomic(1, (SInt32 *)&nbigpipe); + PIPE_LOCK(wpipe); + pipeunlock(wpipe); + + if (wpipe->pipe_buffer.buffer == 0) { + /* + * initial allocation failed + */ + error = ENOMEM; + } + } + if (error) { + /* + * If an error occurred unbusy and return, waking up any pending + * readers. + */ + --wpipe->pipe_busy; + if ((wpipe->pipe_busy == 0) && + (wpipe->pipe_state & PIPE_WANT)) { + wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); + wakeup(wpipe); + } + PIPE_UNLOCK(rpipe); + return(error); + } + } + // LP64todo - fix this! + orig_resid = uio_resid(uio); + + while (uio_resid(uio)) { + int space; + +#ifndef PIPE_NODIRECT + /* + * If the transfer is large, we can gain performance if + * we do process-to-process copies directly. + * If the write is non-blocking, we don't use the + * direct write mechanism. + * + * The direct write mechanism will detect the reader going + * away on us. + */ + if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) && + (fp->f_flag & FNONBLOCK) == 0 && + amountpipekvawired + uio->uio_resid < maxpipekvawired) { + error = pipe_direct_write(wpipe, uio); + if (error) + break; + continue; + } + + /* + * Pipe buffered writes cannot be coincidental with + * direct writes. We wait until the currently executing + * direct write is completed before we start filling the + * pipe buffer. We break out if a signal occurs or the + * reader goes away. + */ + retrywrite: + while (wpipe->pipe_state & PIPE_DIRECTW) { + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipbww", 0); + + if (wpipe->pipe_state & PIPE_EOF) + break; + if (error) + break; + } +#else + retrywrite: +#endif + space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; + + /* + * Writes of size <= PIPE_BUF must be atomic. + */ + if ((space < uio_resid(uio)) && (orig_resid <= PIPE_BUF)) + space = 0; + + if (space > 0) { + + if ((error = pipelock(wpipe,1)) == 0) { + int size; /* Transfer size */ + int segsize; /* first segment to transfer */ + + if (wpipe->pipe_state & PIPE_EOF) { + pipeunlock(wpipe); + error = EPIPE; + break; + } +#ifndef PIPE_NODIRECT + /* + * It is possible for a direct write to + * slip in on us... handle it here... + */ + if (wpipe->pipe_state & PIPE_DIRECTW) { + pipeunlock(wpipe); + goto retrywrite; + } +#endif + /* + * If a process blocked in pipelock, our + * value for space might be bad... the mutex + * is dropped while we're blocked + */ + if (space > (int)(wpipe->pipe_buffer.size - + wpipe->pipe_buffer.cnt)) { + pipeunlock(wpipe); + goto retrywrite; + } + + /* + * Transfer size is minimum of uio transfer + * and free space in pipe buffer. + */ + // LP64todo - fix this! + if (space > uio_resid(uio)) + size = uio_resid(uio); + else + size = space; + /* + * First segment to transfer is minimum of + * transfer size and contiguous space in + * pipe buffer. If first segment to transfer + * is less than the transfer size, we've got + * a wraparound in the buffer. 
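+ *
+ * worked example (illustrative): with a 16384-byte buffer,
+ * .in == 16000 and 1000 bytes to transfer, segsize is 384; the
+ * second uiomove below then copies the remaining 616 bytes to
+ * buffer[0] and .in wraps around to 616.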
+ */ + segsize = wpipe->pipe_buffer.size - + wpipe->pipe_buffer.in; + if (segsize > size) + segsize = size; + + /* Transfer first segment */ + + PIPE_UNLOCK(rpipe); + error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], + segsize, uio); + PIPE_LOCK(rpipe); + + if (error == 0 && segsize < size) { + /* + * Transfer remaining part now, to + * support atomic writes. Wraparound + * happened. + */ + if (wpipe->pipe_buffer.in + segsize != + wpipe->pipe_buffer.size) + panic("Expected pipe buffer " + "wraparound disappeared"); + + PIPE_UNLOCK(rpipe); + error = uiomove( + &wpipe->pipe_buffer.buffer[0], + size - segsize, uio); + PIPE_LOCK(rpipe); + } + if (error == 0) { + wpipe->pipe_buffer.in += size; + if (wpipe->pipe_buffer.in >= + wpipe->pipe_buffer.size) { + if (wpipe->pipe_buffer.in != + size - segsize + + wpipe->pipe_buffer.size) + panic("Expected " + "wraparound bad"); + wpipe->pipe_buffer.in = size - + segsize; + } + + wpipe->pipe_buffer.cnt += size; + if (wpipe->pipe_buffer.cnt > + wpipe->pipe_buffer.size) + panic("Pipe buffer overflow"); + + } + pipeunlock(wpipe); + } + if (error) + break; + + } else { + /* + * If the "read-side" has been blocked, wake it up now. + */ + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + /* + * don't block on non-blocking I/O + * we'll do the pipeselwakeup on the way out + */ + if (fp->f_flag & FNONBLOCK) { + error = EAGAIN; + break; + } + /* + * We have no more space and have something to offer, + * wake up select/poll. + */ + pipeselwakeup(wpipe, wpipe); + + wpipe->pipe_state |= PIPE_WANTW; + + error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipewr", 0); + + if (error != 0) + break; + /* + * If read side wants to go away, we just issue a signal + * to ourselves. + */ + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + break; + } + } + } + --wpipe->pipe_busy; + + if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { + wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); + wakeup(wpipe); + } + if (wpipe->pipe_buffer.cnt > 0) { + /* + * If there are any characters in the buffer, we wake up + * the reader if it was blocked waiting for data. + */ + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + /* + * wake up thread blocked in select/poll or post the notification + */ + pipeselwakeup(wpipe, wpipe); + } + PIPE_UNLOCK(rpipe); + + return (error); +} + +/* + * we implement a very minimal set of ioctls for compatibility with sockets. 
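+ *
+ * illustrative example, not part of the original change:
+ *
+ *	int avail;
+ *
+ *	ioctl(pfd[0], FIONREAD, &avail);  /* bytes readable right now */
+ *
+ * mirrors the socket behaviour; requests not handled in the
+ * switch below fall through to ENOTTY.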
+ */ +/* ARGSUSED 3 */ +static int +pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data, __unused struct proc *p) +{ + struct pipe *mpipe = (struct pipe *)fp->f_data; +#ifdef MAC + int error; +#endif + + PIPE_LOCK(mpipe); + +#ifdef MAC + error = mac_check_pipe_ioctl(active_cred, mpipe, cmd, data); + if (error) { + PIPE_UNLOCK(mpipe); + + return (error); + } +#endif + + switch (cmd) { + + case FIONBIO: + PIPE_UNLOCK(mpipe); + return (0); + + case FIOASYNC: + if (*(int *)data) { + mpipe->pipe_state |= PIPE_ASYNC; + } else { + mpipe->pipe_state &= ~PIPE_ASYNC; + } + PIPE_UNLOCK(mpipe); + return (0); + + case FIONREAD: +#ifndef PIPE_NODIRECT + if (mpipe->pipe_state & PIPE_DIRECTW) + *(int *)data = mpipe->pipe_map.cnt; + else +#endif + *(int *)data = mpipe->pipe_buffer.cnt; + PIPE_UNLOCK(mpipe); + return (0); + + case TIOCSPGRP: + mpipe->pipe_pgid = *(int *)data; + + PIPE_UNLOCK(mpipe); + return (0); + + case TIOCGPGRP: + *(int *)data = mpipe->pipe_pgid; + + PIPE_UNLOCK(mpipe); + return (0); + + } + PIPE_UNLOCK(mpipe); + return (ENOTTY); +} + + +static int +pipe_select(struct fileproc *fp, int which, void *wql, struct proc *p) +{ + struct pipe *rpipe = (struct pipe *)fp->f_data; + struct pipe *wpipe; + int retnum = 0; + + if (rpipe == NULL || rpipe == (struct pipe *)-1) + return (retnum); + + PIPE_LOCK(rpipe); + + wpipe = rpipe->pipe_peer; + + switch (which) { + + case FREAD: + if ((rpipe->pipe_state & PIPE_DIRECTW) || + (rpipe->pipe_buffer.cnt > 0) || + (rpipe->pipe_state & PIPE_EOF)) { + + retnum = 1; + } else { + rpipe->pipe_state |= PIPE_SEL; + selrecord(p, &rpipe->pipe_sel, wql); + } + break; + + case FWRITE: + if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) || + (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && + (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) { + + retnum = 1; + } else { + wpipe->pipe_state |= PIPE_SEL; + selrecord(p, &wpipe->pipe_sel, wql); + } + break; + case 0: + rpipe->pipe_state |= PIPE_SEL; + selrecord(p, &rpipe->pipe_sel, wql); + break; + } + PIPE_UNLOCK(rpipe); + + return (retnum); +} + + +/* ARGSUSED 1 */ +static int +pipe_close(struct fileglob *fg, __unused struct proc *p) +{ + struct pipe *cpipe; + + proc_fdlock(p); + cpipe = (struct pipe *)fg->fg_data; + fg->fg_data = NULL; + proc_fdunlock(p); + + if (cpipe) + pipeclose(cpipe); + + return (0); +} + +static void +pipe_free_kmem(struct pipe *cpipe) +{ + + if (cpipe->pipe_buffer.buffer != NULL) { + if (cpipe->pipe_buffer.size > PIPE_SIZE) + OSAddAtomic(-1, (SInt32 *)&nbigpipe); + OSAddAtomic(cpipe->pipe_buffer.size, (SInt32 *)&amountpipekva); + OSAddAtomic(-1, (SInt32 *)&amountpipes); + + kmem_free(kernel_map, (vm_offset_t)cpipe->pipe_buffer.buffer, + cpipe->pipe_buffer.size); + cpipe->pipe_buffer.buffer = NULL; + } +#ifndef PIPE_NODIRECT + if (cpipe->pipe_map.kva != 0) { + atomic_subtract_int(&amountpipekvawired, + cpipe->pipe_buffer.size + PAGE_SIZE); + kmem_free(kernel_map, + cpipe->pipe_map.kva, + cpipe->pipe_buffer.size + PAGE_SIZE); + cpipe->pipe_map.cnt = 0; + cpipe->pipe_map.kva = 0; + cpipe->pipe_map.pos = 0; + cpipe->pipe_map.npages = 0; + } +#endif +} + +/* + * shutdown the pipe + */ +static void +pipeclose(struct pipe *cpipe) +{ + struct pipe *ppipe; + + if (cpipe == NULL) + return; + + /* partially created pipes won't have a valid mutex. */ + if (PIPE_MTX(cpipe) != NULL) + PIPE_LOCK(cpipe); + + pipeselwakeup(cpipe, cpipe); + + /* + * If the other side is blocked, wake it up saying that + * we want to close it down. 
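+ *
+ * illustrative note: the loop below raises PIPE_WANT | PIPE_EOF,
+ * wakes any sleeper, and then msleeps in "pipecl" until
+ * pipe_busy drains to zero, so teardown cannot race an
+ * in-flight read or write.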
+ */ + while (cpipe->pipe_busy) { + cpipe->pipe_state |= PIPE_WANT | PIPE_EOF; + + wakeup(cpipe); + + msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); + } + +#ifdef MAC + if (cpipe->pipe_label != NULL && cpipe->pipe_peer == NULL) + mac_destroy_pipe(cpipe); +#endif + + /* + * Disconnect from peer + */ + if ((ppipe = cpipe->pipe_peer) != NULL) { + + ppipe->pipe_state |= PIPE_EOF; + + pipeselwakeup(ppipe, ppipe); + wakeup(ppipe); + + if (cpipe->pipe_state & PIPE_KNOTE) + KNOTE(&ppipe->pipe_sel.si_note, 1); + + postpipeevent(ppipe, EV_RCLOSED); + + ppipe->pipe_peer = NULL; + } + evpipefree(cpipe); + + /* + * free resources + */ + if (PIPE_MTX(cpipe) != NULL) { + if (ppipe != NULL) { + /* + * since the mutex is shared and the peer is still + * alive, we need to release the mutex, not free it + */ + PIPE_UNLOCK(cpipe); + } else { + /* + * peer is gone, so we're the sole party left with + * interest in this mutex... we can just free it + */ + lck_mtx_free(PIPE_MTX(cpipe), pipe_mtx_grp); + } + } + pipe_free_kmem(cpipe); + + zfree(pipe_zone, cpipe); +} + + +/*ARGSUSED*/ +static int +pipe_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused struct proc *p) +{ + struct pipe *cpipe; + + cpipe = (struct pipe *)kn->kn_fp->f_data; + + PIPE_LOCK(cpipe); + + switch (kn->kn_filter) { + case EVFILT_READ: + kn->kn_fop = &pipe_rfiltops; + break; + case EVFILT_WRITE: + kn->kn_fop = &pipe_wfiltops; + + if (cpipe->pipe_peer == NULL) { + /* + * other end of pipe has been closed + */ + PIPE_UNLOCK(cpipe); + return (EPIPE); + } + cpipe = cpipe->pipe_peer; + break; + default: + PIPE_UNLOCK(cpipe); + return (1); + } + + if (KNOTE_ATTACH(&cpipe->pipe_sel.si_note, kn)) + cpipe->pipe_state |= PIPE_KNOTE; + + PIPE_UNLOCK(cpipe); + return (0); +} + +static void +filt_pipedetach(struct knote *kn) +{ + struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data; + + PIPE_LOCK(cpipe); + + if (kn->kn_filter == EVFILT_WRITE) { + if (cpipe->pipe_peer == NULL) { + PIPE_UNLOCK(cpipe); + return; + } + cpipe = cpipe->pipe_peer; + } + if (cpipe->pipe_state & PIPE_KNOTE) { + if (KNOTE_DETACH(&cpipe->pipe_sel.si_note, kn)) + cpipe->pipe_state &= ~PIPE_KNOTE; + } + PIPE_UNLOCK(cpipe); +} + +/*ARGSUSED*/ +static int +filt_piperead(struct knote *kn, long hint) +{ + struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; + struct pipe *wpipe; + int retval; + + /* + * if hint == 0, then we've been called from the kevent + * world directly and do not currently hold the pipe mutex... + * if hint == 1, we're being called back via the KNOTE post + * we made in pipeselwakeup, and we already hold the mutex... + */ + if (hint == 0) + PIPE_LOCK(rpipe); + + wpipe = rpipe->pipe_peer; + kn->kn_data = rpipe->pipe_buffer.cnt; + +#ifndef PIPE_NODIRECT + if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) + kn->kn_data = rpipe->pipe_map.cnt; +#endif + if ((rpipe->pipe_state & PIPE_EOF) || + (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { + kn->kn_flags |= EV_EOF; + retval = 1; + } else + retval = (kn->kn_sfflags & NOTE_LOWAT) ? + (kn->kn_data >= kn->kn_sdata) : (kn->kn_data > 0); + + if (hint == 0) + PIPE_UNLOCK(rpipe); + + return (retval); +} + +/*ARGSUSED*/ +static int +filt_pipewrite(struct knote *kn, long hint) +{ + struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; + struct pipe *wpipe; + + /* + * if hint == 0, then we've been called from the kevent + * world directly and do not currently hold the pipe mutex... 
+ * if hint == 1, we're being called back via the KNOTE post + * we made in pipeselwakeup, and we already hold the mutex... + */ + if (hint == 0) + PIPE_LOCK(rpipe); + + wpipe = rpipe->pipe_peer; + + if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { + kn->kn_data = 0; + kn->kn_flags |= EV_EOF; + + if (hint == 0) + PIPE_UNLOCK(rpipe); + return (1); + } + kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; + +#ifndef PIPE_NODIRECT + if (wpipe->pipe_state & PIPE_DIRECTW) + kn->kn_data = 0; +#endif + if (hint == 0) + PIPE_UNLOCK(rpipe); + + return (kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ? + kn->kn_sdata : PIPE_BUF)); +} diff --git a/bsd/kern/sys_socket.c b/bsd/kern/sys_socket.c index 973f4870f..1e7b7d3c0 100644 --- a/bsd/kern/sys_socket.c +++ b/bsd/kern/sys_socket.c @@ -56,7 +56,7 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/event.h> #include <sys/protosw.h> #include <sys/socket.h> @@ -66,141 +66,106 @@ #include <sys/stat.h> #include <sys/uio.h> #include <sys/filedesc.h> +#include <sys/kauth.h> +#include <sys/signalvar.h> #include <net/if.h> #include <net/route.h> -int soo_read __P((struct file *fp, struct uio *uio, - struct ucred *cred, int flags, struct proc *p)); -int soo_write __P((struct file *fp, struct uio *uio, - struct ucred *cred, int flags, struct proc *p)); -int soo_close __P((struct file *fp, struct proc *p)); - -int soo_select __P((struct file *fp, int which, void * wql, struct proc *p)); - -int soo_kqfilter __P((struct file *fp, struct knote *kn, struct proc *p)); +/* + * File operations on sockets. + */ +int soo_read(struct fileproc *fp, struct uio *uio, kauth_cred_t cred, + int flags, struct proc *p); +int soo_write(struct fileproc *fp, struct uio *uio, kauth_cred_t cred, + int flags, struct proc *p); +int soo_close(struct fileglob *fp, struct proc *p); +int soo_ioctl(struct fileproc *fp, u_long cmd, caddr_t data, struct proc *p); +int soo_stat(struct socket *so, struct stat *ub); +int soo_select(struct fileproc *fp, int which, void * wql, struct proc *p); +int soo_kqfilter(struct fileproc *fp, struct knote *kn, struct proc *p); +int soo_drain(struct fileproc *fp, struct proc *p); struct fileops socketops = - { soo_read, soo_write, soo_ioctl, soo_select, soo_close, soo_kqfilter }; + { soo_read, soo_write, soo_ioctl, soo_select, soo_close, soo_kqfilter, soo_drain }; /* ARGSUSED */ int -soo_read(fp, uio, cred, flags, p) - struct file *fp; - struct uio *uio; - struct ucred *cred; - int flags; - struct proc *p; +soo_read( + struct fileproc *fp, + struct uio *uio, + __unused kauth_cred_t cred, + __unused int flags, + __unused struct proc *p) { struct socket *so; - struct kextcb *kp; int stat; - int (*fsoreceive) __P((struct socket *so, + int (*fsoreceive)(struct socket *so2, struct sockaddr **paddr, - struct uio *uio, struct mbuf **mp0, - struct mbuf **controlp, int *flagsp)); + struct uio *uio2, struct mbuf **mp0, + struct mbuf **controlp, int *flagsp); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - if ((so = (struct socket *)fp->f_data) == NULL) { + if ((so = (struct socket *)fp->f_fglob->fg_data) == NULL) { /* This is not a valid open file descriptor */ - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return (EBADF); + return(EBADF); } - +//###LD will have to change fsoreceive = so->so_proto->pr_usrreqs->pru_soreceive; - if (fsoreceive != soreceive) - { kp = sotokextcb(so); - while (kp) - { if (kp->e_soif && kp->e_soif->sf_soreceive) - 
(*kp->e_soif->sf_soreceive)(so, 0, &uio, - 0, 0, 0, kp); - kp = kp->e_next; - } - - } stat = (*fsoreceive)(so, 0, uio, 0, 0, 0); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); return stat; } /* ARGSUSED */ int -soo_write(fp, uio, cred, flags, p) - struct file *fp; - struct uio *uio; - struct ucred *cred; - int flags; - struct proc *p; +soo_write( + struct fileproc *fp, + struct uio *uio, + __unused kauth_cred_t cred, + __unused int flags, + struct proc *procp) { struct socket *so; - int (*fsosend) __P((struct socket *so, struct sockaddr *addr, - struct uio *uio, struct mbuf *top, - struct mbuf *control, int flags)); - struct kextcb *kp; + int (*fsosend)(struct socket *so2, struct sockaddr *addr, + struct uio *uio2, struct mbuf *top, + struct mbuf *control, int flags2); int stat; - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - - if ((so = (struct socket *)fp->f_data) == NULL) { - /* This is not a valid open file descriptor */ - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return (EBADF); - } + if ((so = (struct socket *)fp->f_fglob->fg_data) == NULL) { + /* This is not a valid open file descriptor */ + return (EBADF); + } fsosend = so->so_proto->pr_usrreqs->pru_sosend; - if (fsosend != sosend) - { kp = sotokextcb(so); - while (kp) - { if (kp->e_soif && kp->e_soif->sf_sosend) - (*kp->e_soif->sf_sosend)(so, 0, &uio, - 0, 0, 0, kp); - kp = kp->e_next; - } - } stat = (*fsosend)(so, 0, uio, 0, 0, 0); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - /* Generation of SIGPIPE can be controlled per socket */ - if (stat == EPIPE && uio->uio_procp && !(so->so_flags & SOF_NOSIGPIPE)) - psignal(uio->uio_procp, SIGPIPE); + /* Generation of SIGPIPE can be controlled per socket */ + if (stat == EPIPE && procp && !(so->so_flags & SOF_NOSIGPIPE)) + psignal(procp, SIGPIPE); - return stat; + return stat; } -int -soo_ioctl(fp, cmd, data, p) - struct file *fp; - u_long cmd; - register caddr_t data; - struct proc *p; +__private_extern__ int +soioctl( + struct socket *so, + u_long cmd, + caddr_t data, + struct proc *p) { - register struct socket *so; struct sockopt sopt; - struct kextcb *kp; int error = 0; + int dropsockref = -1; - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - - if ((so = (struct socket *)fp->f_data) == NULL) { - /* This is not a valid open file descriptor */ - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return (EBADF); - } - kp = sotokextcb(so); - sopt.sopt_level = cmd; - sopt.sopt_name = (int)data; - sopt.sopt_p = p; + socket_lock(so, 1); - while (kp) - { if (kp->e_soif && kp->e_soif->sf_socontrol) - (*kp->e_soif->sf_socontrol)(so, &sopt, kp); - kp = kp->e_next; - } + sopt.sopt_level = cmd; + sopt.sopt_name = (int)data; + sopt.sopt_p = p; switch (cmd) { @@ -210,8 +175,7 @@ soo_ioctl(fp, cmd, data, p) else so->so_state &= ~SS_NBIO; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return (0); + goto out; case FIOASYNC: if (*(int *)data) { @@ -223,28 +187,23 @@ soo_ioctl(fp, cmd, data, p) so->so_rcv.sb_flags &= ~SB_ASYNC; so->so_snd.sb_flags &= ~SB_ASYNC; } - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return (0); + goto out; case FIONREAD: *(int *)data = so->so_rcv.sb_cc; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return (0); + goto out; case SIOCSPGRP: so->so_pgid = *(int *)data; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return (0); + goto out; case SIOCGPGRP: *(int *)data = so->so_pgid; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return (0); + goto out; case SIOCATMARK: *(int 
*)data = (so->so_state&SS_RCVATMARK) != 0; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return (0); + goto out; case SIOCSETOT: { /* @@ -256,18 +215,14 @@ soo_ioctl(fp, cmd, data, p) /* let's make sure it's either -1 or a valid file descriptor */ if (cloned_fd != -1) { - struct file *cloned_fp; - error = getsock(p->p_fd, cloned_fd, &cloned_fp); + error = file_socket(cloned_fd, &cloned_so); if (error) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return (error); + goto out; } - - cloned_so = (struct socket *)cloned_fp->f_data; + dropsockref = cloned_fd; } /* Always set socket non-blocking for OT */ - fp->f_flag |= FNONBLOCK; so->so_state |= SS_NBIO; so->so_options |= SO_DONTTRUNC | SO_WANTMORE; so->so_flags |= SOF_NOSIGPIPE; @@ -284,15 +239,13 @@ soo_ioctl(fp, cmd, data, p) if (cloned_so->so_snd.sb_hiwat > 0) { if (sbreserve(&so->so_snd, cloned_so->so_snd.sb_hiwat) == 0) { error = ENOBUFS; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return (error); + goto out; } } if (cloned_so->so_rcv.sb_hiwat > 0) { if (sbreserve(&so->so_rcv, cloned_so->so_rcv.sb_hiwat) == 0) { error = ENOBUFS; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return (error); + goto out; } } @@ -304,7 +257,7 @@ soo_ioctl(fp, cmd, data, p) (cloned_so->so_rcv.sb_lowat > so->so_rcv.sb_hiwat) ? so->so_rcv.sb_hiwat : cloned_so->so_rcv.sb_lowat; - /* SO_SNDTIMEO, SO_RCVTIMEO */ + /* SO_SNDTIMEO, SO_RCVTIMEO */ so->so_snd.sb_timeo = cloned_so->so_snd.sb_timeo; so->so_rcv.sb_timeo = cloned_so->so_rcv.sb_timeo; } @@ -314,8 +267,7 @@ soo_ioctl(fp, cmd, data, p) if (error == EOPNOTSUPP) error = 0; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return (error); + goto out; } } /* @@ -324,36 +276,64 @@ soo_ioctl(fp, cmd, data, p) * different entry since a socket's unnecessary */ if (IOCGROUP(cmd) == 'i') - error = ifioctl(so, cmd, data, p); + error = ifioctllocked(so, cmd, data, p); else if (IOCGROUP(cmd) == 'r') error = rtioctl(cmd, data, p); else error = (*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, 0, p); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); +out: + if (dropsockref != -1) + file_drop(dropsockref); + socket_unlock(so, 1); + + return error; +} + +int +soo_ioctl(fp, cmd, data, p) + struct fileproc *fp; + u_long cmd; + register caddr_t data; + struct proc *p; +{ + register struct socket *so; + int error; + + + if ((so = (struct socket *)fp->f_fglob->fg_data) == NULL) { + /* This is not a valid open file descriptor */ + return (EBADF); + } + + error = soioctl(so, cmd, data, p); + + if (error == 0 && cmd == SIOCSETOT) + fp->f_fglob->fg_flag |= FNONBLOCK; + return error; } int soo_select(fp, which, wql, p) - struct file *fp; + struct fileproc *fp; int which; void * wql; struct proc *p; { - register struct socket *so = (struct socket *)fp->f_data; - register int s = splnet(); + register struct socket *so = (struct socket *)fp->f_fglob->fg_data; int retnum=0; - if (so == NULL || so == (struct socket*)-1) goto done; + if (so == NULL || so == (struct socket*)-1) + return (0); + socket_lock(so, 1); switch (which) { case FREAD: so->so_rcv.sb_flags |= SB_SEL; if (soreadable(so)) { - splx(s); retnum = 1; so->so_rcv.sb_flags &= ~SB_SEL; goto done; @@ -364,7 +344,6 @@ soo_select(fp, which, wql, p) case FWRITE: so->so_snd.sb_flags |= SB_SEL; if (sowriteable(so)) { - splx(s); retnum = 1; so->so_snd.sb_flags &= ~SB_SEL; goto done; @@ -375,7 +354,6 @@ soo_select(fp, which, wql, p) case 0: so->so_rcv.sb_flags |= SB_SEL; if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) 
{ - splx(s); retnum = 1; so->so_rcv.sb_flags &= ~SB_SEL; goto done; @@ -383,8 +361,9 @@ soo_select(fp, which, wql, p) selrecord(p, &so->so_rcv.sb_sel, wql); break; } - splx(s); + done: + socket_unlock(so, 1); return (retnum); } @@ -396,36 +375,49 @@ soo_stat(so, ub) { int stat; - /* - * DANGER: by the time we get the network funnel the socket - * may have been closed - */ - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); bzero((caddr_t)ub, sizeof (*ub)); + socket_lock(so, 1); ub->st_mode = S_IFSOCK; stat = (*so->so_proto->pr_usrreqs->pru_sense)(so, ub); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + socket_unlock(so, 1); return stat; } /* ARGSUSED */ int -soo_close(fp, p) - struct file *fp; - struct proc *p; +soo_close(struct fileglob *fg, __unused proc_t p) { int error = 0; struct socket *sp; - sp = (struct socket *)fp->f_data; - fp->f_data = NULL; + sp = (struct socket *)fg->fg_data; + fg->fg_data = NULL; - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); if (sp) error = soclose(sp); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); return (error); } + +int +soo_drain(struct fileproc *fp, __unused struct proc *p) +{ + int error = 0; + struct socket *so = (struct socket *)fp->f_fglob->fg_data; + + if (so) { + socket_lock(so, 1); + so->so_state |= SS_DRAINING; + + wakeup((caddr_t)&so->so_timeo); + sorwakeup(so); + sowwakeup(so); + + socket_unlock(so, 1); + } + + return error; +} + diff --git a/bsd/kern/syscalls.c b/bsd/kern/syscalls.c index f2495c800..e75391978 100644 --- a/bsd/kern/syscalls.c +++ b/bsd/kern/syscalls.c @@ -1,403 +1,448 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. + * @APPLE_LICENSE_HEADER_END@ * - * @APPLE_LICENSE_HEADER_END@ + * + * System call switch table. 
+ * + * DO NOT EDIT-- this file is automatically generated. + * created from syscalls.master */ -/* Copyright (c) 1992,1995-1999 Apple Computer, Inc. All rights resereved. */ -char *syscallnames[] = { - "syscall", /* 0 = syscall */ - "exit", /* 1 = exit */ - "fork", /* 2 = fork */ - "read", /* 3 = read */ - "write", /* 4 = write */ - "open", /* 5 = open */ - "close", /* 6 = close */ - "wait4", /* 7 = wait4 */ - "obs_creat", /* 8 = old creat */ - "link", /* 9 = link */ - "unlink", /* 10 = unlink */ - "obs_execv", /* 11 = obsolete execv */ - "chdir", /* 12 = chdir */ - "fchdir", /* 13 = fchdir */ - "mknod", /* 14 = mknod */ - "chmod", /* 15 = chmod */ - "chown", /* 16 = chown */ - "obs_break", /* 17 = obsolete break */ - "obs_getfsstat", /* 18 = obsolete getfsstat */ - "old_lseek", /* 19 = old lseek */ - "getpid", /* 20 = getpid */ - "obs_mount", /* 21 = obsolete mount */ - "obs_unmount", /* 22 = obsolete unmount */ - "setuid", /* 23 = setuid */ - "getuid", /* 24 = getuid */ - "geteuid", /* 25 = geteuid */ - "ptrace", /* 26 = ptrace */ - "recvmsg", /* 27 = recvmsg */ - "sendmsg", /* 28 = sendmsg */ - "recvfrom", /* 29 = recvfrom */ - "accept", /* 30 = accept */ - "getpeername", /* 31 = getpeername */ - "getsockname", /* 32 = getsockname */ - "access", /* 33 = access */ - "chflags", /* 34 = chflags */ - "fchflags", /* 35 = fchflags */ - "sync", /* 36 = sync */ - "kill", /* 37 = kill */ - "obs_stat", /* 38 = old stat */ - "getppid", /* 39 = getppid */ - "obs_lstat", /* 40 = old lstat */ - "dup", /* 41 = dup */ - "pipe", /* 42 = pipe */ - "getegid", /* 43 = getegid */ - "profil", /* 44 = profil */ - "ktrace", /* 45 = ktrace */ - "sigaction", /* 46 = sigaction */ - "getgid", /* 47 = getgid */ - "sigprocmask", /* 48 = sigprocmask */ - "getlogin", /* 49 = getlogin */ - "setlogin", /* 50 = setlogin */ - "acct", /* 51 = acct */ - "sigpending", /* 52 = sigpending */ - "sigaltstack", /* 53 = sigaltstack */ - "ioctl", /* 54 = ioctl */ - "reboot", /* 55 = reboot */ - "revoke", /* 56 = revoke */ - "symlink", /* 57 = symlink */ - "readlink", /* 58 = readlink */ - "execve", /* 59 = execve */ - "umask", /* 60 = umask */ - "chroot", /* 61 = chroot */ - "obs_fstat", /* 62 = old fstat */ - "#63", /* 63 = reserved */ - "obs_getpagesize", /* 64 = old getpagesize */ - "msync", /* 65 = msync */ - "vfork", /* 66 = vfork */ - "obs_vread", /* 67 = obsolete vread */ - "obs_vwrite", /* 68 = obsolete vwrite */ - "sbrk", /* 69 = sbrk */ - "sstk", /* 70 = sstk */ - "obs_mmap", /* 71 = old mmap */ - "obs_vadvise", /* 72 = obsolete vadvise */ - "munmap", /* 73 = munmap */ - "mprotect", /* 74 = mprotect */ - "madvise", /* 75 = madvise */ - "#76", /* 76 = obsolete vhangup */ - "#77", /* 77 = obsolete vlimit */ - "mincore", /* 78 = mincore */ - "getgroups", /* 79 = getgroups */ - "setgroups", /* 80 = setgroups */ - "getpgrp", /* 81 = getpgrp */ - "setpgid", /* 82 = setpgid */ - "setitimer", /* 83 = setitimer */ - "old_wait", /* 84 = old wait */ - "obs_swapon", /* 85 = swapon */ - "getitimer", /* 86 = getitimer */ - "obs_gethostname", /* 87 = old gethostname */ - "obs_sethostname", /* 88 = old sethostname */ - "getdtablesize", /* 89 = getdtablesize */ - "dup2", /* 90 = dup2 */ - "#91", /* 91 = getdopt */ - "fcntl", /* 92 = fcntl */ - "select", /* 93 = select */ - "#94", /* 94 = setdopt */ - "fsync", /* 95 = fsync */ - "setpriority", /* 96 = setpriority */ - "socket", /* 97 = socket */ - "connect", /* 98 = connect */ - "obs_accept", /* 99 = old accept */ - "getpriority", /* 100 = getpriority */ - "old_send", /* 101 = old send */ 
- "old_recv", /* 102 = old recv */ +const char *syscallnames[] = { + "syscall", /* 0 = syscall indirect syscall */ + "exit", /* 1 = exit */ + "fork", /* 2 = fork */ + "read", /* 3 = read */ + "write", /* 4 = write */ + "open", /* 5 = open */ + "close", /* 6 = close */ + "wait4", /* 7 = wait4 */ + "#8", /* 8 = old creat */ + "link", /* 9 = link */ + "unlink", /* 10 = unlink */ + "#11", /* 11 = old execv */ + "chdir", /* 12 = chdir */ + "fchdir", /* 13 = fchdir */ + "mknod", /* 14 = mknod */ + "chmod", /* 15 = chmod */ + "chown", /* 16 = chown */ + "obreak", /* 17 = obreak old break */ +#if COMPAT_GETFSSTAT + "ogetfsstat", /* 18 = ogetfsstat */ +#else + "getfsstat", /* 18 = getfsstat */ +#endif + "#19", /* 19 = old lseek */ + "getpid", /* 20 = getpid */ + "#21", /* 21 = old mount */ + "#22", /* 22 = old umount */ + "setuid", /* 23 = setuid */ + "getuid", /* 24 = getuid */ + "geteuid", /* 25 = geteuid */ + "ptrace", /* 26 = ptrace */ + "recvmsg", /* 27 = recvmsg */ + "sendmsg", /* 28 = sendmsg */ + "recvfrom", /* 29 = recvfrom */ + "accept", /* 30 = accept */ + "getpeername", /* 31 = getpeername */ + "getsockname", /* 32 = getsockname */ + "access", /* 33 = access */ + "chflags", /* 34 = chflags */ + "fchflags", /* 35 = fchflags */ + "sync", /* 36 = sync */ + "kill", /* 37 = kill */ + "#38", /* 38 = old stat */ + "getppid", /* 39 = getppid */ + "#40", /* 40 = old lstat */ + "dup", /* 41 = dup */ + "pipe", /* 42 = pipe */ + "getegid", /* 43 = getegid */ + "profil", /* 44 = profil */ + "ktrace", /* 45 = ktrace */ + "sigaction", /* 46 = sigaction */ + "getgid", /* 47 = getgid */ + "sigprocmask", /* 48 = sigprocmask */ + "getlogin", /* 49 = getlogin */ + "setlogin", /* 50 = setlogin */ + "acct", /* 51 = acct */ + "sigpending", /* 52 = sigpending */ + "sigaltstack", /* 53 = sigaltstack */ + "ioctl", /* 54 = ioctl */ + "reboot", /* 55 = reboot */ + "revoke", /* 56 = revoke */ + "symlink", /* 57 = symlink */ + "readlink", /* 58 = readlink */ + "execve", /* 59 = execve */ + "umask", /* 60 = umask */ + "chroot", /* 61 = chroot */ + "#62", /* 62 = old fstat */ + "#63", /* 63 = used internally , reserved */ + "#64", /* 64 = old getpagesize */ + "msync", /* 65 = msync */ + "vfork", /* 66 = vfork */ + "#67", /* 67 = old vread */ + "#68", /* 68 = old vwrite */ + "sbrk", /* 69 = sbrk */ + "sstk", /* 70 = sstk */ + "#71", /* 71 = old mmap */ + "ovadvise", /* 72 = ovadvise old vadvise */ + "munmap", /* 73 = munmap */ + "mprotect", /* 74 = mprotect */ + "madvise", /* 75 = madvise */ + "#76", /* 76 = old vhangup */ + "#77", /* 77 = old vlimit */ + "mincore", /* 78 = mincore */ + "getgroups", /* 79 = getgroups */ + "setgroups", /* 80 = setgroups */ + "getpgrp", /* 81 = getpgrp */ + "setpgid", /* 82 = setpgid */ + "setitimer", /* 83 = setitimer */ + "#84", /* 84 = old wait */ + "swapon", /* 85 = swapon */ + "getitimer", /* 86 = getitimer */ + "#87", /* 87 = old gethostname */ + "#88", /* 88 = old sethostname */ + "getdtablesize", /* 89 = getdtablesize */ + "dup2", /* 90 = dup2 */ + "#91", /* 91 = old getdopt */ + "fcntl", /* 92 = fcntl */ + "select", /* 93 = select */ + "#94", /* 94 = old setdopt */ + "fsync", /* 95 = fsync */ + "setpriority", /* 96 = setpriority */ + "socket", /* 97 = socket */ + "connect", /* 98 = connect */ + "#99", /* 99 = old accept */ + "getpriority", /* 100 = getpriority */ + "#101", /* 101 = old send */ + "#102", /* 102 = old recv */ #ifdef __ppc__ - "osigreturn", /* 103 = sigreturn */ + "#103", /* 103 = old sigreturn */ #else - "sigreturn", /* 103 = sigreturn */ + "sigreturn", /* 103 = 
sigreturn */ #endif - "bind", /* 104 = bind */ - "setsockopt", /* 105 = setsockopt */ - "listen", /* 106 = listen */ - "#107", /* 107 = obsolete vtimes */ - "obs_sigvec", /* 108 = old sigvec */ - "obs_sigblock", /* 109 = old sigblock */ - "obs_sigsetmask", /* 110 = old sigsetmask */ - "sigsuspend", /* 111 = sigsuspend */ - "obs_sigstack", /* 112 = old sigstack */ - "obs_recvmsg", /* 113 = old recvmsg */ - "obs_sendmsg", /* 114 = old sendmsg */ - "#115", /* 115 = obsolete vtrace */ - "gettimeofday", /* 116 = gettimeofday */ - "getrusage", /* 117 = getrusage */ - "getsockopt", /* 118 = getsockopt */ - "#119", /* 119 = nosys */ - "readv", /* 120 = readv */ - "writev", /* 121 = writev */ - "settimeofday", /* 122 = settimeofday */ - "fchown", /* 123 = fchown */ - "fchmod", /* 124 = fchmod */ - "obs_recvfrom", /* 125 = old recvfrom */ - "obs_setreuid", /* 126 = old setreuid */ - "obs_setregid", /* 127 = old setregid */ - "rename", /* 128 = rename */ - "obs_truncate", /* 129 = old truncate */ - "obs_ftruncate", /* 130 = old ftruncate */ - "flock", /* 131 = flock */ - "mkfifo", /* 132 = mkfifo */ - "sendto", /* 133 = sendto */ - "shutdown", /* 134 = shutdown */ - "socketpair", /* 135 = socketpair */ - "mkdir", /* 136 = mkdir */ - "rmdir", /* 137 = rmdir */ - "utimes", /* 138 = utimes */ - "futimes", /* 139 = futimes */ - "adjtime", /* 140 = adjtime */ - "obs_getpeername", /* 141 = old getpeername */ - "obs_gethostid", /* 142 = old gethostid */ - "#143", /* 143 = old sethostid */ - "obs_getrlimit", /* 144 = old getrlimit */ - "obs_setrlimit", /* 145 = old setrlimit */ - "obs_killpg", /* 146 = old killpg */ - "setsid", /* 147 = setsid */ - "#148", /* 148 = obsolete setquota */ - "#149", /* 149 = obsolete qquota */ - "obs_getsockname", /* 150 = old getsockname */ - "getpgid", /* 151 = getpgid */ - "setprivexec", /* 152 = setprivexec */ - "pread", /* 153 = pread */ - "pwrite", /* 154 = pwrite */ - "nfssvc", /* 155 = nfssvc */ - "getdirentries", /* 156 =getdirentries */ - "statfs", /* 157 = statfs */ - "fstatfs", /* 158 = fstatfs */ - "unmount", /* 159 = unmount */ - "#160", /* 160 = obsolete async_daemon */ - "getfh", /* 161 = getfh */ - "obs_getdomainname",/* 162 = old getdomainname */ - "obs_setdomainname",/* 163 = old setdomainname */ - "#164", /* 164 */ - "quotactl", /* 165 = quotactl */ - "#166", /* 166 = obsolete exportfs */ - "mount", /* 167 = mount */ - "#168", /* 168 = obsolete ustat */ - "#169", /* 169 = nosys */ - "#170", /* 170 = obsolete table */ - "obs_wait3", /* 171 = old wait3 */ - "#172", /* 172 = obsolete rpause */ - "#173", /* 173 = nosys */ - "#174", /* 174 = obsolete getdents */ - "#175", /* 175 = nosys */ - "add_profil", /* 176 = add_profil */ /* NeXT */ - "#177", /* 177 = nosys */ - "#178", /* 178 = nosys */ - "#179", /* 179 = nosys */ - "kdebug_trace", /* 180 = kdebug_trace */ - "setgid", /* 181 = setgid */ - "setegid", /* 182 = setegid */ - "seteuid", /* 183 = seteuid */ + "bind", /* 104 = bind */ + "setsockopt", /* 105 = setsockopt */ + "listen", /* 106 = listen */ + "#107", /* 107 = old vtimes */ + "#108", /* 108 = old sigvec */ + "#109", /* 109 = old sigblock */ + "#110", /* 110 = old sigsetmask */ + "sigsuspend", /* 111 = sigsuspend */ + "#112", /* 112 = old sigstack */ + "#113", /* 113 = old recvmsg */ + "#114", /* 114 = old sendmsg */ + "#115", /* 115 = old vtrace */ #ifdef __ppc__ - "sigreturn", /* 184 = sigreturn */ + "ppc_gettimeofday", /* 116 = ppc_gettimeofday */ +#else + "gettimeofday", /* 116 = gettimeofday */ +#endif + "getrusage", /* 117 = getrusage */ + 
"getsockopt", /* 118 = getsockopt */ + "#119", /* 119 = old resuba */ + "readv", /* 120 = readv */ + "writev", /* 121 = writev */ + "settimeofday", /* 122 = settimeofday */ + "fchown", /* 123 = fchown */ + "fchmod", /* 124 = fchmod */ + "#125", /* 125 = old recvfrom */ + "#126", /* 126 = old setreuid */ + "#127", /* 127 = old setregid */ + "rename", /* 128 = rename */ + "#129", /* 129 = old truncate */ + "#130", /* 130 = old ftruncate */ + "flock", /* 131 = flock */ + "mkfifo", /* 132 = mkfifo */ + "sendto", /* 133 = sendto */ + "shutdown", /* 134 = shutdown */ + "socketpair", /* 135 = socketpair */ + "mkdir", /* 136 = mkdir */ + "rmdir", /* 137 = rmdir */ + "utimes", /* 138 = utimes */ + "futimes", /* 139 = futimes */ + "adjtime", /* 140 = adjtime */ + "#141", /* 141 = old getpeername */ + "#142", /* 142 = old gethostid */ + "#143", /* 143 = old sethostid */ + "#144", /* 144 = old getrlimit */ + "#145", /* 145 = old setrlimit */ + "#146", /* 146 = old killpg */ + "setsid", /* 147 = setsid */ + "#148", /* 148 = old setquota */ + "#149", /* 149 = old qquota */ + "#150", /* 150 = old getsockname */ + "getpgid", /* 151 = getpgid */ + "setprivexec", /* 152 = setprivexec */ + "pread", /* 153 = pread */ + "pwrite", /* 154 = pwrite */ +#if NFSSERVER + "nfssvc", /* 155 = nfssvc */ +#else + "#155", /* 155 = */ +#endif + "#156", /* 156 = old getdirentries */ + "statfs", /* 157 = statfs */ + "fstatfs", /* 158 = fstatfs */ + "unmount", /* 159 = unmount */ + "#160", /* 160 = old async_daemon */ +#if NFSCLIENT + "getfh", /* 161 = getfh */ +#else + "#161", /* 161 = */ +#endif + "#162", /* 162 = old getdomainname */ + "#163", /* 163 = old setdomainname */ + "#164", /* 164 = */ + "quotactl", /* 165 = quotactl */ + "#166", /* 166 = old exportfs */ + "mount", /* 167 = mount */ + "#168", /* 168 = old ustat */ + "#169", /* 169 = */ + "table", /* 170 = table old table */ + "#171", /* 171 = old wait3 */ + "#172", /* 172 = old rpause */ + "waitid", /* 173 = waitid */ + "#174", /* 174 = old getdents */ + "#175", /* 175 = old gc_control */ + "add_profil", /* 176 = add_profil */ + "#177", /* 177 = */ + "#178", /* 178 = */ + "#179", /* 179 = */ + "kdebug_trace", /* 180 = kdebug_trace */ + "setgid", /* 181 = setgid */ + "setegid", /* 182 = setegid */ + "seteuid", /* 183 = seteuid */ +#ifdef __ppc__ + "sigreturn", /* 184 = sigreturn */ +#else + "#184", /* 184 = */ +#endif + "#185", /* 185 = */ + "#186", /* 186 = */ + "#187", /* 187 = */ + "stat", /* 188 = stat */ + "fstat", /* 189 = fstat */ + "lstat", /* 190 = lstat */ + "pathconf", /* 191 = pathconf */ + "fpathconf", /* 192 = fpathconf */ +#if COMPAT_GETFSSTAT + "getfsstat", /* 193 = getfsstat */ +#else + "#193", /* 193 = */ +#endif + "getrlimit", /* 194 = getrlimit */ + "setrlimit", /* 195 = setrlimit */ + "getdirentries", /* 196 = getdirentries */ + "mmap", /* 197 = mmap */ + "#198", /* 198 = __syscall */ + "lseek", /* 199 = lseek */ + "truncate", /* 200 = truncate */ + "ftruncate", /* 201 = ftruncate */ + "__sysctl", /* 202 = __sysctl */ + "mlock", /* 203 = mlock */ + "munlock", /* 204 = munlock */ + "undelete", /* 205 = undelete */ +#ifdef __ppc__ + "ATsocket", /* 206 = ATsocket */ + "ATgetmsg", /* 207 = ATgetmsg */ + "ATputmsg", /* 208 = ATputmsg */ + "ATPsndreq", /* 209 = ATPsndreq */ + "ATPsndrsp", /* 210 = ATPsndrsp */ + "ATPgetreq", /* 211 = ATPgetreq */ + "ATPgetrsp", /* 212 = ATPgetrsp */ + "#213", /* 213 = Reserved for AppleTalk */ +#else + "ATsocket", /* 206 = ATsocket */ + "ATgetmsg", /* 207 = ATgetmsg */ + "ATputmsg", /* 208 = ATputmsg */ + 
"ATPsndreq", /* 209 = ATPsndreq */ + "ATPsndrsp", /* 210 = ATPsndrsp */ + "ATPgetreq", /* 211 = ATPgetreq */ + "ATPgetrsp", /* 212 = ATPgetrsp */ + "#213", /* 213 = Reserved for AppleTalk */ +#endif /* __ppc__ */ + "kqueue_from_portset_np", /* 214 = kqueue_from_portset_np */ + "kqueue_portset_np", /* 215 = kqueue_portset_np */ + "mkcomplex", /* 216 = mkcomplex soon to be obsolete */ + "statv", /* 217 = statv soon to be obsolete */ + "lstatv", /* 218 = lstatv soon to be obsolete */ + "fstatv", /* 219 = fstatv soon to be obsolete */ + "getattrlist", /* 220 = getattrlist */ + "setattrlist", /* 221 = setattrlist */ + "getdirentriesattr", /* 222 = getdirentriesattr */ + "exchangedata", /* 223 = exchangedata */ +#ifdef __APPLE_API_OBSOLETE + "checkuseraccess", /* 224 = checkuseraccess */ +#else + "#224", /* 224 = HFS checkuseraccess check access to a file */ +#endif /* __APPLE_API_OBSOLETE */ + "searchfs", /* 225 = searchfs */ + "delete", /* 226 = delete private delete ( Carbon semantics ) */ + "copyfile", /* 227 = copyfile */ + "#228", /* 228 = */ + "#229", /* 229 = */ + "poll", /* 230 = poll */ + "watchevent", /* 231 = watchevent */ + "waitevent", /* 232 = waitevent */ + "modwatch", /* 233 = modwatch */ + "getxattr", /* 234 = getxattr */ + "fgetxattr", /* 235 = fgetxattr */ + "setxattr", /* 236 = setxattr */ + "fsetxattr", /* 237 = fsetxattr */ + "removexattr", /* 238 = removexattr */ + "fremovexattr", /* 239 = fremovexattr */ + "listxattr", /* 240 = listxattr */ + "flistxattr", /* 241 = flistxattr */ + "fsctl", /* 242 = fsctl */ + "initgroups", /* 243 = initgroups */ + "#244", /* 244 = */ + "#245", /* 245 = */ + "#246", /* 246 = */ +#if NFSCLIENT + "nfsclnt", /* 247 = nfsclnt */ + "fhopen", /* 248 = fhopen */ #else - "#184", /* 184 = nosys */ + "#247", /* 247 = */ + "#248", /* 248 = */ #endif - "#185", /* 185 = nosys */ - "#186", /* 186 = nosys */ - "#187", /* 187 = nosys */ - "stat", /* 188 = stat */ - "fstat", /* 189 = fstat */ - "lstat", /* 190 = lstat */ - "pathconf", /* 191 = pathconf */ - "fpathconf", /* 192 = fpathconf */ - "obs_getfsstat", /* 193 = old getfsstat */ - "getrlimit", /* 194 = getrlimit */ - "setrlimit", /* 195 = setrlimit */ - "getdirentries", /* 196 = getdirentries */ - "mmap", /* 197 = mmap */ - "#198", /* 198 = __syscall */ - "lseek", /* 199 = lseek */ - "truncate", /* 200 = truncate */ - "ftruncate", /* 201 = ftruncate */ - "__sysctl", /* 202 = __sysctl */ - "mlock", /* 203 = mlock */ - "munlock", /* 204 = munlock */ - "undelete", /* 205 = undelete */ - "ATsocket", /* 206 = ATsocket */ - "ATgetmsg", /* 207 = ATgetmsg */ - "ATputmsg", /* 208 = ATputmsg */ - "ATPsndreq", /* 209 = ATPsndreq */ - "ATPsndrsp", /* 210 = ATPsndrsp */ - "ATPgetreq", /* 211 = ATPgetreq */ - "ATPgetrsp", /* 212 = ATPgetrsp */ - "#213", /* 213 = Reserved for AppleTalk */ - "kqueue_from_portset_np", /* 214 = kqueue_from_portset_np */ - "kqueue_portset_np", /* 215 = kqueue_portset_np */ - "#216", /* 216 = Reserved */ - "#217", /* 217 = Reserved */ - "#218", /* 218 = Reserved */ - "#219", /* 219 = Reserved */ - "getattrlist", /* 220 = getattrlist */ - "setattrlist", /* 221 = setattrlist */ - "getdirentriesattr", /* 222 = getdirentriesattr */ - "exchangedata", /* 223 = exchangedata */ - "checkuseraccess", /* 224 - checkuseraccess */ - "searchfs", /* 225 = searchfs */ - "delete", /* 226 = private delete call */ - "copyfile", /* 227 = copyfile */ - "#228", /* 228 = nosys */ - "#229", /* 229 = nosys */ - "#230", /* 230 = reserved for AFS */ - "watchevent", /* 231 = watchevent */ - "waitevent", /* 232 = 
waitevent */ - "modwatch", /* 233 = modwatch */ - "#234", /* 234 = nosys */ - "#235", /* 235 = nosys */ - "#236", /* 236 = nosys */ - "#237", /* 237 = nosys */ - "#238", /* 238 = nosys */ - "#239", /* 239 = nosys */ - "#240", /* 240 = nosys */ - "#241", /* 241 = nosys */ - "fsctl", /* 242 = fsctl */ - "#243", /* 243 = nosys */ - "#244", /* 244 = nosys */ - "#245", /* 245 = nosys */ - "#246", /* 246 = nosys */ - "nfsclnt", /* 247 = nfsclnt */ - "fhopen", /* 248 = fhopen */ - "#249", /* 249 = nosys */ - "minherit", /* 250 = minherit */ - "semsys", /* 251 = semsys */ - "msgsys", /* 252 = msgsys */ - "shmsys", /* 253 = shmsys */ - "semctl", /* 254 = semctl */ - "semget", /* 255 = semget */ - "semop", /* 256 = semop */ - "semconfig", /* 257 = semconfig */ - "msgctl", /* 258 = msgctl */ - "msgget", /* 259 = msgget */ - "msgsnd", /* 260 = msgsnd */ - "msgrcv", /* 261 = msgrcv */ - "shmat", /* 262 = shmat */ - "shmctl", /* 263 = shmctl */ - "shmdt", /* 264 = shmdt */ - "shmget", /* 265 = shmget */ - "shm_open", /* 266 = shm_open */ - "shm_unlink", /* 267 = shm_unlink */ - "sem_open", /* 268 = sem_open */ - "sem_close", /* 269 = sem_close */ - "sem_unlink", /* 270 = sem_unlink */ - "sem_wait", /* 271 = sem_wait */ - "sem_trywait", /* 272 = sem_trywait */ - "sem_post", /* 273 = sem_post */ - "sem_getvalue", /* 274 = sem_getvalue */ - "sem_init", /* 275 = sem_init */ - "sem_destroy", /* 276 = sem_destroy */ - "#277", /* 277 = nosys */ - "#278", /* 278 = nosys */ - "#279", /* 279 = nosys */ - "#280", /* 280 = nosys */ - "#281", /* 281 = nosys */ - "#282", /* 282 = nosys */ - "#283", /* 283 = nosys */ - "#284", /* 284 = nosys */ - "#285", /* 285 = nosys */ - "#286", /* 286 = nosys */ - "#287", /* 287 = nosys */ - "#288", /* 288 = nosys */ - "#289", /* 289 = nosys */ - "#290", /* 290 = nosys */ - "#291", /* 291 = nosys */ - "#292", /* 292 = nosys */ - "#293", /* 293 = nosys */ - "#294", /* 294 = nosys */ - "#295", /* 295 = nosys */ - "load_shared_file", /* 296 = load_shared_file */ - "reset_shared_file", /* 297 = reset_shared_file */ - "new_system_shared_regions", /* 298 = new_system_shared_regions */ - "#299", /* 299 = nosys */ - "#300", /* 300 = modnext */ - "#301", /* 301 = modstat */ - "#302", /* 302 = modfnext */ - "#303", /* 303 = modfind */ - "#304", /* 304 = kldload */ - "#305", /* 305 = kldunload */ - "#306", /* 306 = kldfind */ - "#307", /* 307 = kldnext */ - "#308", /* 308 = kldstat */ - "#309", /* 309 = kldfirstmod */ - "getsid", /* 310 = getsid */ - "#311", /* 311 = setresuid */ - "#312", /* 312 = setresgid */ - "aio_fsync", /* 313 = aio_fsync */ - "aio_return", /* 314 = aio_return */ - "aio_suspend", /* 315 = aio_suspend */ - "aio_cancel", /* 316 = aio_cancel */ - "aio_error", /* 317 = aio_error */ - "aio_read", /* 318 = aio_read */ - "aio_write", /* 319 = aio_write */ - "lio_listio", /* 320 = lio_listio */ - "#321", /* 321 = yield */ - "#322", /* 322 = thr_sleep */ - "#323", /* 323 = thr_wakeup */ - "mlockall", /* 324 = mlockall */ - "munlockall", /* 325 = munlockall */ - "#326", /* 326 */ - "issetugid", /* 327 = issetugid */ - "__pthread_kill", /* 328 = __pthread_kill */ - "pthread_sigmask", /* 329 = pthread_sigmask */ - "sigwait", /* 330 = sigwait */ - "#331", /* 331 */ - "#332", /* 332 */ - "#333", /* 333 */ - "#334", /* 334 */ - "utrace", /* 335 = utrace */ - "#336", /* 336 */ - "#337", /* 337 */ - "#338", /* 338 */ - "#339", /* 339 */ - "#340", /* 340 = TBD sigprocmask */ - "#341", /* 341 = TBD sigsuspend */ - "#342", /* 342 = TBD sigaction */ - "#343", /* 343 = TBD sigpending */ - 
"#344", /* 344 = TBD sigreturn */ - "#345", /* 345 = TBD sigtimedwait */ - "#346", /* 346 = TBD sigwaitinfo */ - "#347", /* 347 */ - "#348", /* 348 */ - "#349" /* 349 */ - "audit", /* 350 */ - "auditon", /* 351 */ - "#352", /* 352 */ - "getauid", /* 353 */ - "setauid", /* 354 */ - "getaudit", /* 355 */ - "setaudit", /* 356 */ - "getaudit_addr", /* 357 */ - "setaudit_addr", /* 358 */ - "auditctl", /* 359 */ - "#360", /* 360 */ - "#361", /* 361 */ - "kqueue", /* 362 = kqueue */ - "kevent", /* 363 = kevent */ - "#364", /* 364 */ - "#365", /* 365 */ - "#366", /* 366 */ - "#367", /* 367 */ - "#368", /* 368 */ - "#369" /* 369 */ + "#249", /* 249 = */ + "minherit", /* 250 = minherit */ + "semsys", /* 251 = semsys */ + "msgsys", /* 252 = msgsys */ + "shmsys", /* 253 = shmsys */ + "semctl", /* 254 = semctl */ + "semget", /* 255 = semget */ + "semop", /* 256 = semop */ + "semconfig", /* 257 = semconfig */ + "msgctl", /* 258 = msgctl */ + "msgget", /* 259 = msgget */ + "msgsnd", /* 260 = msgsnd */ + "msgrcv", /* 261 = msgrcv */ + "shmat", /* 262 = shmat */ + "shmctl", /* 263 = shmctl */ + "shmdt", /* 264 = shmdt */ + "shmget", /* 265 = shmget */ + "shm_open", /* 266 = shm_open */ + "shm_unlink", /* 267 = shm_unlink */ + "sem_open", /* 268 = sem_open */ + "sem_close", /* 269 = sem_close */ + "sem_unlink", /* 270 = sem_unlink */ + "sem_wait", /* 271 = sem_wait */ + "sem_trywait", /* 272 = sem_trywait */ + "sem_post", /* 273 = sem_post */ + "sem_getvalue", /* 274 = sem_getvalue */ + "sem_init", /* 275 = sem_init */ + "sem_destroy", /* 276 = sem_destroy */ + "open_extended", /* 277 = open_extended */ + "umask_extended", /* 278 = umask_extended */ + "stat_extended", /* 279 = stat_extended */ + "lstat_extended", /* 280 = lstat_extended */ + "fstat_extended", /* 281 = fstat_extended */ + "chmod_extended", /* 282 = chmod_extended */ + "fchmod_extended", /* 283 = fchmod_extended */ + "access_extended", /* 284 = access_extended */ + "settid", /* 285 = settid */ + "gettid", /* 286 = gettid */ + "setsgroups", /* 287 = setsgroups */ + "getsgroups", /* 288 = getsgroups */ + "setwgroups", /* 289 = setwgroups */ + "getwgroups", /* 290 = getwgroups */ + "mkfifo_extended", /* 291 = mkfifo_extended */ + "mkdir_extended", /* 292 = mkdir_extended */ + "identitysvc", /* 293 = identitysvc */ + "#294", /* 294 = */ + "#295", /* 295 = */ + "load_shared_file", /* 296 = load_shared_file */ + "reset_shared_file", /* 297 = reset_shared_file */ + "new_system_shared_regions", /* 298 = new_system_shared_regions */ + "shared_region_map_file_np", /* 299 = shared_region_map_file_np */ + "shared_region_make_private_np", /* 300 = shared_region_make_private_np */ + "#301", /* 301 = */ + "#302", /* 302 = */ + "#303", /* 303 = */ + "#304", /* 304 = */ + "#305", /* 305 = */ + "#306", /* 306 = */ + "#307", /* 307 = */ + "#308", /* 308 = */ + "#309", /* 309 = */ + "getsid", /* 310 = getsid */ + "settid_with_pid", /* 311 = settid_with_pid */ + "#312", /* 312 = */ + "aio_fsync", /* 313 = aio_fsync */ + "aio_return", /* 314 = aio_return */ + "aio_suspend", /* 315 = aio_suspend */ + "aio_cancel", /* 316 = aio_cancel */ + "aio_error", /* 317 = aio_error */ + "aio_read", /* 318 = aio_read */ + "aio_write", /* 319 = aio_write */ + "lio_listio", /* 320 = lio_listio */ + "#321", /* 321 = */ + "#322", /* 322 = */ + "#323", /* 323 = */ + "mlockall", /* 324 = mlockall */ + "munlockall", /* 325 = munlockall */ + "#326", /* 326 = */ + "issetugid", /* 327 = issetugid */ + "__pthread_kill", /* 328 = __pthread_kill */ + "pthread_sigmask", /* 329 = 
pthread_sigmask */
+ "sigwait", /* 330 = sigwait */
+ "__disable_threadsignal", /* 331 = __disable_threadsignal */
+ "__pthread_markcancel", /* 332 = __pthread_markcancel */
+ "__pthread_canceled", /* 333 = __pthread_canceled */
+ "__semwait_signal", /* 334 = __semwait_signal */
+ "utrace", /* 335 = utrace */
+ "#336", /* 336 = */
+ "#337", /* 337 = */
+ "#338", /* 338 = */
+ "#339", /* 339 = */
+ "#340", /* 340 = */
+ "#341", /* 341 = */
+ "#342", /* 342 = */
+ "#343", /* 343 = */
+ "#344", /* 344 = */
+ "#345", /* 345 = */
+ "#346", /* 346 = */
+ "#347", /* 347 = */
+ "#348", /* 348 = */
+ "#349", /* 349 = */
+ "audit", /* 350 = audit */
+ "auditon", /* 351 = auditon */
+ "#352", /* 352 = */
+ "getauid", /* 353 = getauid */
+ "setauid", /* 354 = setauid */
+ "getaudit", /* 355 = getaudit */
+ "setaudit", /* 356 = setaudit */
+ "getaudit_addr", /* 357 = getaudit_addr */
+ "setaudit_addr", /* 358 = setaudit_addr */
+ "auditctl", /* 359 = auditctl */
+ "#360", /* 360 = */
+ "#361", /* 361 = */
+ "kqueue", /* 362 = kqueue */
+ "kevent", /* 363 = kevent */
+ "lchown", /* 364 = lchown */
+ "#365", /* 365 = */
+ "#366", /* 366 = */
+ "#367", /* 367 = */
+ "#368", /* 368 = */
+ "#369", /* 369 = */
};
diff --git a/bsd/kern/syscalls.master b/bsd/kern/syscalls.master
new file mode 100644
index 000000000..a3e6e7d59
--- /dev/null
+++ b/bsd/kern/syscalls.master
@@ -0,0 +1,474 @@
+; derived from: FreeBSD @(#)syscalls.master 8.2 (Berkeley) 1/13/94
+;
+; System call name/number master file.
+; This file is processed by .../xnu/bsd/kern/makesyscalls.sh and creates:
+; .../xnu/bsd/kern/init_sysent.c
+; .../xnu/bsd/kern/syscalls.c
+; .../xnu/bsd/sys/syscall.h
+; .../xnu/bsd/sys/sysproto.h
+
+; Columns -> | Number | Cancel | Funnel | Files | { Name and Args } | { Comments }
+; Number: system call number, must be in order
+; Cancel: type of thread cancel - "PRE", "POST" or "NONE"
+; Funnel: type of funnel - "KERN" or "NONE"
+; Files: which files to generate - "ALL" or any combo of:
+; "T" for syscall table (in init_sysent.c)
+; "N" for syscall names (in syscalls.c)
+; "H" for syscall headers (in syscall.h)
+; "P" for syscall prototypes (in sysproto.h)
+; Comments: additional comments about the system call copied to output files
+
+; #ifdef's, #include's, #if's etc. are copied to all output files.
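+;
+; As a concrete illustration of the legend above, consider the row for
+; syscall 3 further down in this file:
+;
+;   3  PRE  NONE  ALL  { user_ssize_t read(int fd, user_addr_t cbuf, user_size_t nbyte); }
+;
+; A minimal sketch of what makesyscalls.sh emits for that one row is shown
+; below. The exact generated layout (argument-struct padding macros, the
+; sysent field order) is simplified here, so treat this as illustrative
+; rather than verbatim generator output:
+;
+;   /* bsd/sys/syscall.h ("H") -- the syscall number */
+;   #define SYS_read        3
+;
+;   /* bsd/kern/syscalls.c ("N") -- the name-table entry */
+;   "read",         /* 3 = read */
+;
+;   /* bsd/sys/sysproto.h ("P") -- argument struct and in-kernel prototype
+;    * (the real file pads each member for copyin alignment) */
+;   struct read_args {
+;           int fd;
+;           user_addr_t cbuf;
+;           user_size_t nbyte;
+;   };
+;   int read(struct proc *p, struct read_args *uap, user_ssize_t *retval);
+;
+;   /* bsd/kern/init_sysent.c ("T") -- a sysent[] slot binding the handler
+;    * with its cancel type (PRE) and funnel (NONE); field layout elided */
+;   { /* ... */ (sy_call_t *)read /* ... */ },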
+ +#include <sys/appleapiopts.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> + +0 NONE NONE ALL { int nosys(void); } { indirect syscall } +1 NONE KERN ALL { void exit(int rval); } +2 NONE KERN ALL { int fork(void); } +3 PRE NONE ALL { user_ssize_t read(int fd, user_addr_t cbuf, user_size_t nbyte); } +4 PRE NONE ALL { user_ssize_t write(int fd, user_addr_t cbuf, user_size_t nbyte); } +5 PRE NONE ALL { int open(user_addr_t path, int flags, int mode); } +6 PRE NONE ALL { int close(int fd); } +7 PRE KERN ALL { int wait4(int pid, user_addr_t status, int options, user_addr_t rusage); } +8 NONE NONE ALL { int nosys(void); } { old creat } +9 NONE NONE ALL { int link(user_addr_t path, user_addr_t link); } +10 NONE NONE ALL { int unlink(user_addr_t path); } +11 NONE NONE ALL { int nosys(void); } { old execv } +12 NONE NONE ALL { int chdir(user_addr_t path); } +13 NONE NONE ALL { int fchdir(int fd); } +14 NONE NONE ALL { int mknod(user_addr_t path, int mode, int dev); } +15 NONE NONE ALL { int chmod(user_addr_t path, int mode); } +16 NONE NONE ALL { int chown(user_addr_t path, int uid, int gid); } +17 NONE NONE UALL { int obreak(char *nsize); } { old break } + +#if COMPAT_GETFSSTAT +18 NONE NONE ALL { int ogetfsstat(user_addr_t buf, int bufsize, int flags); } +#else +18 NONE NONE ALL { int getfsstat(user_addr_t buf, int bufsize, int flags); } +#endif + +19 NONE NONE ALL { int nosys(void); } { old lseek } +20 NONE NONE ALL { int getpid(void); } +21 NONE NONE ALL { int nosys(void); } { old mount } +22 NONE NONE ALL { int nosys(void); } { old umount } +23 NONE KERN ALL { int setuid(uid_t uid); } +24 NONE KERN ALL { int getuid(void); } +25 NONE KERN ALL { int geteuid(void); } +26 NONE KERN ALL { int ptrace(int req, pid_t pid, caddr_t addr, int data); } +27 PRE NONE ALL { int recvmsg(int s, struct msghdr *msg, int flags); } +28 PRE NONE ALL { int sendmsg(int s, caddr_t msg, int flags); } +29 PRE NONE ALL { int recvfrom(int s, void *buf, size_t len, int flags, struct sockaddr *from, int *fromlenaddr); } +30 PRE NONE ALL { int accept(int s, caddr_t name, socklen_t *anamelen); } +31 NONE NONE ALL { int getpeername(int fdes, caddr_t asa, socklen_t *alen); } +32 NONE NONE ALL { int getsockname(int fdes, caddr_t asa, socklen_t *alen); } +33 NONE NONE ALL { int access(user_addr_t path, int flags); } +34 NONE NONE ALL { int chflags(char *path, int flags); } +35 NONE NONE ALL { int fchflags(int fd, int flags); } +36 NONE NONE ALL { int sync(void); } +37 NONE KERN ALL { int kill(int pid, int signum); } +38 NONE NONE ALL { int nosys(void); } { old stat } +39 NONE KERN ALL { int getppid(void); } +40 NONE NONE ALL { int nosys(void); } { old lstat } +41 NONE NONE ALL { int dup(u_int fd); } +42 NONE NONE ALL { int pipe(void); } +43 NONE KERN ALL { int getegid(void); } +44 NONE KERN ALL { int profil(short *bufbase, size_t bufsize, u_long pcoffset, u_int pcscale); } +45 NONE KERN ALL { int ktrace(const char *fname, int ops, int facs, int pid); } +46 NONE KERN ALL { int sigaction(int signum, struct __sigaction *nsa, struct sigaction *osa); } +47 NONE KERN ALL { int getgid(void); } +48 NONE KERN ALL { int sigprocmask(int how, user_addr_t mask, user_addr_t omask); } +49 NONE KERN ALL { int getlogin(char *namebuf, u_int namelen); } +50 NONE KERN ALL { int setlogin(char *namebuf); } +51 NONE KERN ALL { int acct(char *path); } +52 NONE KERN ALL { int sigpending(struct sigvec *osv); } +53 NONE KERN ALL { int sigaltstack(struct sigaltstack *nss, 
struct sigaltstack *oss); } +54 NONE NONE ALL { int ioctl(int fd, u_long com, caddr_t data); } +55 NONE KERN ALL { int reboot(int opt, char *command); } +56 NONE NONE ALL { int revoke(char *path); } +57 NONE NONE ALL { int symlink(char *path, char *link); } +58 NONE NONE ALL { int readlink(char *path, char *buf, int count); } +59 NONE KERN ALL { int execve(char *fname, char **argp, char **envp); } +60 NONE KERN ALL { int umask(int newmask); } +61 NONE KERN ALL { int chroot(user_addr_t path); } +62 NONE NONE ALL { int nosys(void); } { old fstat } +63 NONE NONE ALL { int nosys(void); } { used internally, reserved } +64 NONE NONE ALL { int nosys(void); } { old getpagesize } +65 PRE NONE ALL { int msync(caddr_t addr, size_t len, int flags); } +66 NONE KERN ALL { int vfork(void); } +67 NONE NONE ALL { int nosys(void); } { old vread } +68 NONE NONE ALL { int nosys(void); } { old vwrite } +69 NONE NONE ALL { int sbrk(int incr); } +70 NONE NONE ALL { int sstk(int incr); } +71 NONE NONE ALL { int nosys(void); } { old mmap } +72 NONE NONE ALL { int ovadvise(void); } { old vadvise } +73 NONE NONE ALL { int munmap(caddr_t addr, size_t len); } +74 NONE NONE ALL { int mprotect(caddr_t addr, size_t len, int prot); } +75 NONE NONE ALL { int madvise(caddr_t addr, size_t len, int behav); } +76 NONE NONE ALL { int nosys(void); } { old vhangup } +77 NONE NONE ALL { int nosys(void); } { old vlimit } +78 NONE NONE ALL { int mincore(user_addr_t addr, user_size_t len, user_addr_t vec); } +79 NONE KERN ALL { int getgroups(u_int gidsetsize, gid_t *gidset); } +80 NONE KERN ALL { int setgroups(u_int gidsetsize, gid_t *gidset); } +81 NONE KERN ALL { int getpgrp(void); } +82 NONE KERN ALL { int setpgid(int pid, int pgid); } +83 NONE KERN ALL { int setitimer(u_int which, struct itimerval *itv, struct itimerval *oitv); } +84 NONE NONE ALL { int nosys(void); } { old wait } +85 NONE NONE ALL { int swapon(void); } +86 NONE KERN ALL { int getitimer(u_int which, struct itimerval *itv); } +87 NONE NONE ALL { int nosys(void); } { old gethostname } +88 NONE NONE ALL { int nosys(void); } { old sethostname } +89 NONE NONE ALL { int getdtablesize(void); } +90 NONE NONE ALL { int dup2(u_int from, u_int to); } +91 NONE NONE ALL { int nosys(void); } { old getdopt } +92 PRE NONE ALL { int fcntl(int fd, int cmd, long arg); } +93 PRE KERN ALL { int select(int nd, u_int32_t *in, u_int32_t *ou, u_int32_t *ex, struct timeval *tv); } +94 NONE NONE ALL { int nosys(void); } { old setdopt } +95 PRE NONE ALL { int fsync(int fd); } +96 NONE KERN ALL { int setpriority(int which, int who, int prio); } +97 NONE NONE ALL { int socket(int domain, int type, int protocol); } +98 PRE NONE ALL { int connect(int s, caddr_t name, socklen_t namelen); } +99 NONE NONE ALL { int nosys(void); } { old accept } +100 NONE KERN ALL { int getpriority(int which, int who); } +101 NONE NONE ALL { int nosys(void); } { old send } +102 NONE NONE ALL { int nosys(void); } { old recv } + +#ifdef __ppc__ +103 NONE NONE ALL { int nosys(void); } { old sigreturn } +#else +103 NONE KERN UALL { int sigreturn(struct sigcontext *sigcntxp); } +#endif + +104 NONE NONE ALL { int bind(int s, caddr_t name, socklen_t namelen); } +105 NONE NONE ALL { int setsockopt(int s, int level, int name, caddr_t val, socklen_t valsize); } +106 NONE NONE ALL { int listen(int s, int backlog); } +107 NONE NONE ALL { int nosys(void); } { old vtimes } +108 NONE NONE ALL { int nosys(void); } { old sigvec } +109 NONE NONE ALL { int nosys(void); } { old sigblock } +110 NONE NONE ALL { int nosys(void); } { old 
sigsetmask } +111 PRE KERN ALL { int sigsuspend(sigset_t mask); } +112 NONE NONE ALL { int nosys(void); } { old sigstack } +113 NONE NONE ALL { int nosys(void); } { old recvmsg } +114 NONE NONE ALL { int nosys(void); } { old sendmsg } +115 NONE NONE ALL { int nosys(void); } { old vtrace } + +#ifdef __ppc__ +116 NONE NONE ALL { int ppc_gettimeofday(struct timeval *tp, struct timezone *tzp); } +#else +116 NONE NONE ALL { int gettimeofday(struct timeval *tp, struct timezone *tzp); } +#endif + +117 NONE KERN ALL { int getrusage(int who, struct rusage *rusage); } +118 NONE NONE ALL { int getsockopt(int s, int level, int name, caddr_t val, socklen_t *avalsize); } +119 NONE NONE ALL { int nosys(void); } { old resuba } +120 PRE NONE ALL { user_ssize_t readv(int fd, struct iovec *iovp, u_int iovcnt); } +121 PRE NONE ALL { user_ssize_t writev(int fd, struct iovec *iovp, u_int iovcnt); } +122 NONE KERN ALL { int settimeofday(struct timeval *tv, struct timezone *tzp); } +123 NONE NONE ALL { int fchown(int fd, int uid, int gid); } +124 NONE NONE ALL { int fchmod(int fd, int mode); } +125 NONE NONE ALL { int nosys(void); } { old recvfrom } +126 NONE NONE ALL { int nosys(void); } { old setreuid } +127 NONE NONE ALL { int nosys(void); } { old setregid } +128 NONE NONE ALL { int rename(char *from, char *to); } +129 NONE NONE ALL { int nosys(void); } { old truncate } +130 NONE NONE ALL { int nosys(void); } { old ftruncate } +131 NONE NONE ALL { int flock(int fd, int how); } +132 NONE NONE ALL { int mkfifo(user_addr_t path, int mode); } +133 PRE NONE ALL { int sendto(int s, caddr_t buf, size_t len, int flags, caddr_t to, socklen_t tolen); } +134 NONE NONE ALL { int shutdown(int s, int how); } +135 NONE NONE ALL { int socketpair(int domain, int type, int protocol, int *rsv); } +136 NONE NONE ALL { int mkdir(user_addr_t path, int mode); } +137 NONE NONE ALL { int rmdir(char *path); } +138 NONE NONE ALL { int utimes(char *path, struct timeval *tptr); } +139 NONE NONE ALL { int futimes(int fd, struct timeval *tptr); } +140 NONE KERN ALL { int adjtime(struct timeval *delta, struct timeval *olddelta); } +141 NONE NONE ALL { int nosys(void); } { old getpeername } +142 NONE NONE ALL { int nosys(void); } { old gethostid } +143 NONE NONE ALL { int nosys(void); } { old sethostid } +144 NONE NONE ALL { int nosys(void); } { old getrlimit } +145 NONE NONE ALL { int nosys(void); } { old setrlimit } +146 NONE NONE ALL { int nosys(void); } { old killpg } +147 NONE KERN ALL { int setsid(void); } +148 NONE NONE ALL { int nosys(void); } { old setquota } +149 NONE NONE ALL { int nosys(void); } { old qquota } +150 NONE NONE ALL { int nosys(void); } { old getsockname } +151 NONE KERN ALL { int getpgid(pid_t pid); } +152 NONE KERN ALL { int setprivexec(int flag); } +153 PRE NONE ALL { user_ssize_t pread(int fd, user_addr_t buf, user_size_t nbyte, off_t offset); } +154 PRE NONE ALL { user_ssize_t pwrite(int fd, user_addr_t buf, user_size_t nbyte, off_t offset); } + +#if NFSSERVER +155 NONE KERN ALL { int nfssvc(int flag, caddr_t argp); } +#else +155 NONE NONE ALL { int nosys(void); } +#endif + +156 NONE NONE ALL { int nosys(void); } { old getdirentries } +157 NONE NONE ALL { int statfs(char *path, struct statfs *buf); } +158 NONE NONE ALL { int fstatfs(int fd, struct statfs *buf); } +159 NONE NONE ALL { int unmount(user_addr_t path, int flags); } +160 NONE NONE ALL { int nosys(void); } { old async_daemon } + +#if NFSCLIENT +161 NONE KERN ALL { int getfh(char *fname, fhandle_t *fhp); } +#else +161 NONE NONE ALL { int nosys(void); } 
+#endif + +162 NONE NONE ALL { int nosys(void); } { old getdomainname } +163 NONE NONE ALL { int nosys(void); } { old setdomainname } +164 NONE NONE ALL { int nosys(void); } +165 NONE KERN ALL { int quotactl(char *path, int cmd, int uid, caddr_t arg); } +166 NONE NONE ALL { int nosys(void); } { old exportfs } +167 NONE NONE ALL { int mount(char *type, char *path, int flags, caddr_t data); } +168 NONE NONE ALL { int nosys(void); } { old ustat } +169 NONE NONE ALL { int nosys(void); } +170 NONE NONE HN { int table(void); } { old table } +171 NONE NONE ALL { int nosys(void); } { old wait3 } +172 NONE NONE ALL { int nosys(void); } { old rpause } +173 PRE KERN ALL { int waitid(idtype_t idtype, id_t id, siginfo_t *infop, int options); } +174 NONE NONE ALL { int nosys(void); } { old getdents } +175 NONE NONE ALL { int nosys(void); } { old gc_control } +176 NONE KERN ALL { int add_profil(short *bufbase, size_t bufsize, u_long pcoffset, u_int pcscale); } +177 NONE NONE ALL { int nosys(void); } +178 NONE NONE ALL { int nosys(void); } +179 NONE NONE ALL { int nosys(void); } +180 NONE NONE UALL { int kdebug_trace(int code, int arg1, int arg2, int arg3, int arg4, int arg5); } +181 NONE KERN ALL { int setgid(gid_t gid); } +182 NONE KERN ALL { int setegid(gid_t egid); } +183 NONE KERN ALL { int seteuid(uid_t euid); } + +#ifdef __ppc__ +184 NONE KERN ALL { int sigreturn(struct ucontext *uctx, int infostyle); } +#else +184 NONE NONE ALL { int nosys(void); } +#endif + +185 NONE NONE ALL { int nosys(void); } +186 NONE NONE ALL { int nosys(void); } +187 NONE NONE ALL { int nosys(void); } +188 NONE NONE ALL { int stat(user_addr_t path, user_addr_t ub); } +189 NONE NONE ALL { int fstat(int fd, user_addr_t ub); } +190 NONE NONE ALL { int lstat(user_addr_t path, user_addr_t ub); } +191 NONE NONE ALL { int pathconf(char *path, int name); } +192 NONE NONE ALL { int fpathconf(int fd, int name); } + +#if COMPAT_GETFSSTAT +193 NONE NONE ALL { int getfsstat(user_addr_t buf, user_long_t bufsize, int flags); } +#else +193 NONE NONE ALL { int nosys(void); } +#endif + +194 NONE KERN ALL { int getrlimit(u_int which, struct rlimit *rlp); } +195 NONE KERN ALL { int setrlimit(u_int which, struct rlimit *rlp); } +196 NONE NONE ALL { int getdirentries(int fd, char *buf, u_int count, long *basep); } +197 NONE NONE ALL { user_addr_t mmap(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos); } +198 NONE NONE ALL { int nosys(void); } { __syscall } +199 NONE NONE ALL { off_t lseek(int fd, off_t offset, int whence); } +200 NONE NONE ALL { int truncate(char *path, off_t length); } +201 NONE NONE ALL { int ftruncate(int fd, off_t length); } +202 NONE KERN ALL { int __sysctl(int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen); } +203 NONE NONE ALL { int mlock(caddr_t addr, size_t len); } +204 NONE NONE ALL { int munlock(caddr_t addr, size_t len); } +205 NONE NONE ALL { int undelete(user_addr_t path); } + +#ifdef __ppc__ +206 NONE NONE ALL { int ATsocket(int proto); } +207 NONE NONE UALL { int ATgetmsg(int fd, void *ctlptr, void *datptr, int *flags); } +208 NONE NONE UALL { int ATputmsg(int fd, void *ctlptr, void *datptr, int flags); } +209 NONE NONE UALL { int ATPsndreq(int fd, unsigned char *buf, int len, int nowait); } +210 NONE NONE UALL { int ATPsndrsp(int fd, unsigned char *respbuff, int resplen, int datalen); } +211 NONE NONE UALL { int ATPgetreq(int fd, unsigned char *buf, int buflen); } +212 NONE NONE UALL { int ATPgetrsp(int fd, unsigned char *bdsp); } +213 NONE NONE ALL { int 
nosys(void); } { Reserved for AppleTalk }
+#else
+206 NONE NONE HN { int ATsocket(int proto); }
+207 NONE NONE UHN { int ATgetmsg(int fd, void *ctlptr, void *datptr, int *flags); }
+208 NONE NONE UHN { int ATputmsg(int fd, void *ctlptr, void *datptr, int flags); }
+209 NONE NONE UHN { int ATPsndreq(int fd, unsigned char *buf, int len, int nowait); }
+210 NONE NONE UHN { int ATPsndrsp(int fd, unsigned char *respbuff, int resplen, int datalen); }
+211 NONE NONE UHN { int ATPgetreq(int fd, unsigned char *buf, int buflen); }
+212 NONE NONE UHN { int ATPgetrsp(int fd, unsigned char *bdsp); }
+213 NONE NONE ALL { int nosys(void); } { Reserved for AppleTalk }
+#endif /* __ppc__ */
+
+214 NONE KERN ALL { int kqueue_from_portset_np(int portset); }
+215 NONE KERN ALL { int kqueue_portset_np(int fd); }
+
+; System Calls 216 - 230 are reserved for calls to support HFS/HFS Plus
+; file system semantics. Currently, we only use 215-227. The rest is
+; for future expansion in anticipation of new MacOS APIs for HFS Plus.
+; These calls are not conditionalized because while they are specific
+; to HFS semantics, they are not specific to the HFS filesystem.
+; We expect all filesystems to recognize the call and report that it is
+; not supported or to actually implement it.
+216 NONE NONE UHN { int mkcomplex(const char *path, mode_t mode, u_long type); } { soon to be obsolete }
+217 NONE NONE UHN { int statv(const char *path, struct vstat *vsb); } { soon to be obsolete }
+218 NONE NONE UHN { int lstatv(const char *path, struct vstat *vsb); } { soon to be obsolete }
+219 NONE NONE UHN { int fstatv(int fd, struct vstat *vsb); } { soon to be obsolete }
+220 NONE NONE ALL { int getattrlist(const char *path, struct attrlist *alist, void *attributeBuffer, size_t bufferSize, u_long options); }
+221 NONE NONE ALL { int setattrlist(const char *path, struct attrlist *alist, void *attributeBuffer, size_t bufferSize, u_long options); }
+222 NONE NONE ALL { int getdirentriesattr(int fd, struct attrlist *alist, void *buffer, size_t buffersize, u_long *count, u_long *basep, u_long *newstate, u_long options); }
+223 NONE NONE ALL { int exchangedata(const char *path1, const char *path2, u_long options); }
+
+#ifdef __APPLE_API_OBSOLETE
+224 NONE NONE UALL { int checkuseraccess(const char *path, uid_t userid, gid_t *groups, int ngroups, int accessrequired, u_long options); }
+#else
+224 NONE NONE ALL { int nosys(void); } { HFS checkuseraccess check access to a file }
+#endif /* __APPLE_API_OBSOLETE */
+225 NONE KERN ALL { int searchfs(const char *path, struct fssearchblock *searchblock, u_long *nummatches, u_long scriptcode, u_long options, struct searchstate *state); }
+226 NONE NONE ALL { int delete(user_addr_t path); } { private delete (Carbon semantics) }
+227 NONE NONE ALL { int copyfile(char *from, char *to, int mode, int flags); }
+228 NONE NONE ALL { int nosys(void); }
+229 NONE NONE ALL { int nosys(void); }
+230 PRE NONE ALL { int poll(struct pollfd *fds, u_int nfds, int timeout); }
+231 NONE NONE UALL { int watchevent(struct eventreq *u_req, int u_eventmask); }
+232 NONE NONE UALL { int waitevent(struct eventreq *u_req, struct timeval *tv); }
+233 NONE NONE UALL { int modwatch(struct eventreq *u_req, int u_eventmask); }
+234 NONE NONE ALL { user_ssize_t getxattr(user_addr_t path, user_addr_t attrname, user_addr_t value, size_t size, uint32_t position, int options); }
+235 NONE NONE ALL { user_ssize_t fgetxattr(int fd, user_addr_t attrname, user_addr_t value, size_t size, uint32_t position, int options); }
+236 NONE 
NONE ALL { int setxattr(user_addr_t path, user_addr_t attrname, user_addr_t value, size_t size, uint32_t position, int options); } +237 NONE NONE ALL { int fsetxattr(int fd, user_addr_t attrname, user_addr_t value, size_t size, uint32_t position, int options); } +238 NONE NONE ALL { int removexattr(user_addr_t path, user_addr_t attrname, int options); } +239 NONE NONE ALL { int fremovexattr(int fd, user_addr_t attrname, int options); } +240 NONE NONE ALL { user_ssize_t listxattr(user_addr_t path, user_addr_t namebuf, size_t bufsize, int options); } +241 NONE NONE ALL { user_ssize_t flistxattr(int fd, user_addr_t namebuf, size_t bufsize, int options); } +242 NONE KERN ALL { int fsctl(const char *path, u_long cmd, caddr_t data, u_long options); } +243 NONE KERN ALL { int initgroups(u_int gidsetsize, gid_t *gidset, int gmuid); } +244 NONE NONE ALL { int nosys(void); } +245 NONE NONE ALL { int nosys(void); } +246 NONE NONE ALL { int nosys(void); } + +#if NFSCLIENT +247 NONE KERN ALL { int nfsclnt(int flag, caddr_t argp); } +248 NONE KERN ALL { int fhopen(const struct fhandle *u_fhp, int flags); } +#else +247 NONE NONE ALL { int nosys(void); } +248 NONE NONE ALL { int nosys(void); } +#endif + +249 NONE NONE ALL { int nosys(void); } +250 NONE NONE ALL { int minherit(void *addr, size_t len, int inherit); } +251 NONE NONE ALL { int semsys(u_int which, int a2, int a3, int a4, int a5); } +252 NONE NONE ALL { int msgsys(u_int which, int a2, int a3, int a4, int a5); } +253 NONE NONE ALL { int shmsys(u_int which, int a2, int a3, int a4); } +254 NONE NONE ALL { int semctl(int semid, int semnum, int cmd, semun_t arg); } +255 NONE NONE ALL { int semget(key_t key, int nsems, int semflg); } +256 NONE NONE ALL { int semop(int semid, struct sembuf *sops, int nsops); } +257 NONE NONE ALL { int semconfig(semconfig_ctl_t flag); } +258 NONE NONE ALL { int msgctl(int msqid, int cmd, struct msqid_ds *buf); } +259 NONE NONE ALL { int msgget(key_t key, int msgflg); } +260 PRE NONE ALL { int msgsnd(int msqid, void *msgp, size_t msgsz, int msgflg); } +261 PRE NONE ALL { user_ssize_t msgrcv(int msqid, void *msgp, size_t msgsz, long msgtyp, int msgflg); } +262 NONE NONE ALL { int shmat(int shmid, void *shmaddr, int shmflg); } +263 NONE NONE ALL { int shmctl(int shmid, int cmd, struct shmid_ds *buf); } +264 NONE NONE ALL { int shmdt(void *shmaddr); } +265 NONE NONE ALL { int shmget(key_t key, size_t size, int shmflg); } +266 NONE NONE ALL { int shm_open(const char *name, int oflag, int mode); } +267 NONE NONE ALL { int shm_unlink(const char *name); } +268 NONE NONE ALL { user_addr_t sem_open(const char *name, int oflag, int mode, int value); } +269 NONE NONE ALL { int sem_close(sem_t *sem); } +270 NONE NONE ALL { int sem_unlink(const char *name); } +271 PRE NONE ALL { int sem_wait(sem_t *sem); } +272 NONE NONE ALL { int sem_trywait(sem_t *sem); } +273 NONE NONE ALL { int sem_post(sem_t *sem); } +274 NONE NONE ALL { int sem_getvalue(sem_t *sem, int *sval); } +275 NONE NONE ALL { int sem_init(sem_t *sem, int phsared, u_int value); } +276 NONE NONE ALL { int sem_destroy(sem_t *sem); } +277 NONE NONE ALL { int open_extended(user_addr_t path, int flags, uid_t uid, gid_t gid, int mode, user_addr_t xsecurity); } +278 NONE KERN ALL { int umask_extended(int newmask, user_addr_t xsecurity); } +279 NONE NONE ALL { int stat_extended(user_addr_t path, user_addr_t ub, user_addr_t xsecurity, user_addr_t xsecurity_size); } +280 NONE NONE ALL { int lstat_extended(user_addr_t path, user_addr_t ub, user_addr_t xsecurity, user_addr_t 
xsecurity_size); } +281 NONE NONE ALL { int fstat_extended(int fd, user_addr_t ub, user_addr_t xsecurity, user_addr_t xsecurity_size); } +282 NONE NONE ALL { int chmod_extended(user_addr_t path, uid_t uid, gid_t gid, int mode, user_addr_t xsecurity); } +283 NONE NONE ALL { int fchmod_extended(int fd, uid_t uid, gid_t gid, int mode, user_addr_t xsecurity); } +284 NONE NONE ALL { int access_extended(user_addr_t entries, size_t size, user_addr_t results, uid_t uid); } +285 NONE NONE ALL { int settid(uid_t uid, gid_t gid); } +286 NONE NONE ALL { int gettid(uid_t *uidp, gid_t *gidp); } +287 NONE NONE ALL { int setsgroups(int setlen, user_addr_t guidset); } +288 NONE NONE ALL { int getsgroups(user_addr_t setlen, user_addr_t guidset); } +289 NONE NONE ALL { int setwgroups(int setlen, user_addr_t guidset); } +290 NONE NONE ALL { int getwgroups(user_addr_t setlen, user_addr_t guidset); } +291 NONE NONE ALL { int mkfifo_extended(user_addr_t path, uid_t uid, gid_t gid, int mode, user_addr_t xsecurity); } +292 NONE NONE ALL { int mkdir_extended(user_addr_t path, uid_t uid, gid_t gid, int mode, user_addr_t xsecurity); } +293 NONE NONE ALL { int identitysvc(int opcode, user_addr_t message); } +294 NONE NONE ALL { int nosys(void); } +295 NONE NONE ALL { int nosys(void); } +296 NONE KERN UALL { int load_shared_file(char *filename, caddr_t mfa, u_long mfs, caddr_t *ba, int map_cnt, sf_mapping_t *mappings, int *flags); } +297 NONE KERN UALL { int reset_shared_file(caddr_t *ba, int map_cnt, sf_mapping_t *mappings); } +298 NONE KERN ALL { int new_system_shared_regions(void); } +299 NONE KERN UALL { int shared_region_map_file_np(int fd, uint32_t mappingCount, user_addr_t mappings, user_addr_t slide_p); } +300 NONE KERN UALL { int shared_region_make_private_np(uint32_t rangeCount, user_addr_t ranges); } +301 NONE NONE ALL { int nosys(void); } +302 NONE NONE ALL { int nosys(void); } +303 NONE NONE ALL { int nosys(void); } +304 NONE NONE ALL { int nosys(void); } +305 NONE NONE ALL { int nosys(void); } +306 NONE NONE ALL { int nosys(void); } +307 NONE NONE ALL { int nosys(void); } +308 NONE NONE ALL { int nosys(void); } +309 NONE NONE ALL { int nosys(void); } +310 NONE KERN ALL { int getsid(pid_t pid); } +311 NONE NONE ALL { int settid_with_pid(pid_t pid, int assume); } +312 NONE NONE ALL { int nosys(void); } +313 NONE NONE ALL { int aio_fsync(int op, user_addr_t aiocbp); } +314 NONE NONE ALL { user_ssize_t aio_return(user_addr_t aiocbp); } +315 PRE NONE ALL { int aio_suspend(user_addr_t aiocblist, int nent, user_addr_t timeoutp); } +316 NONE NONE ALL { int aio_cancel(int fd, user_addr_t aiocbp); } +317 NONE NONE ALL { int aio_error(user_addr_t aiocbp); } +318 NONE NONE ALL { int aio_read(user_addr_t aiocbp); } +319 NONE NONE ALL { int aio_write(user_addr_t aiocbp); } +320 NONE NONE ALL { int lio_listio(int mode, user_addr_t aiocblist, int nent, user_addr_t sigp); } +321 NONE NONE ALL { int nosys(void); } +322 NONE NONE ALL { int nosys(void); } +323 NONE NONE ALL { int nosys(void); } +324 NONE NONE ALL { int mlockall(int how); } +325 NONE NONE ALL { int munlockall(int how); } +326 NONE NONE ALL { int nosys(void); } +327 NONE KERN ALL { int issetugid(void); } +328 NONE KERN ALL { int __pthread_kill(int thread_port, int sig); } +329 NONE KERN ALL { int pthread_sigmask(int how, user_addr_t set, user_addr_t oset); } +330 PRE KERN ALL { int sigwait(user_addr_t set, user_addr_t sig); } +331 NONE KERN ALL { int __disable_threadsignal(int value); } +332 NONE NONE ALL { int __pthread_markcancel(int thread_port); } +333 
NONE NONE ALL { int __pthread_canceled(int action); } +334 POST NONE ALL { int __semwait_signal(int cond_sem, int mutex_sem, int timeout, int relative, time_t tv_sec, int32_t tv_nsec); } +335 NONE KERN ALL { int utrace(const void *addr, size_t len); } +336 NONE NONE ALL { int nosys(void); } +337 NONE NONE ALL { int nosys(void); } +338 NONE NONE ALL { int nosys(void); } +339 NONE NONE ALL { int nosys(void); } +340 NONE NONE ALL { int nosys(void); } +341 NONE NONE ALL { int nosys(void); } +342 NONE NONE ALL { int nosys(void); } +343 NONE NONE ALL { int nosys(void); } +344 NONE NONE ALL { int nosys(void); } +345 NONE NONE ALL { int nosys(void); } +346 NONE NONE ALL { int nosys(void); } +347 NONE NONE ALL { int nosys(void); } +348 NONE NONE ALL { int nosys(void); } +349 NONE NONE ALL { int nosys(void); } +350 NONE KERN ALL { int audit(void *record, int length); } +351 NONE KERN ALL { int auditon(int cmd, void *data, int length); } +352 NONE KERN ALL { int nosys(void); } +353 NONE KERN ALL { int getauid(au_id_t *auid); } +354 NONE KERN ALL { int setauid(au_id_t *auid); } +355 NONE KERN ALL { int getaudit(struct auditinfo *auditinfo); } +356 NONE KERN ALL { int setaudit(struct auditinfo *auditinfo); } +357 NONE KERN ALL { int getaudit_addr(struct auditinfo_addr *auditinfo_addr, int length); } +358 NONE KERN ALL { int setaudit_addr(struct auditinfo_addr *auditinfo_addr, int length); } +359 NONE KERN ALL { int auditctl(char *path); } +360 NONE NONE ALL { int nosys(void); } +361 NONE NONE ALL { int nosys(void); } +362 NONE NONE ALL { int kqueue(void); } +363 NONE NONE ALL { int kevent(int fd, const struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout); } +364 NONE NONE ALL { int lchown(user_addr_t path, uid_t owner, gid_t group); } +365 NONE NONE ALL { int nosys(void); } +366 NONE NONE ALL { int nosys(void); } +367 NONE NONE ALL { int nosys(void); } +368 NONE NONE ALL { int nosys(void); } +369 NONE NONE ALL { int nosys(void); } diff --git a/bsd/kern/sysctl_init.c b/bsd/kern/sysctl_init.c index 2fc67f072..e50013d38 100644 --- a/bsd/kern/sysctl_init.c +++ b/bsd/kern/sysctl_init.c @@ -21,18 +21,19 @@ */ #include <sys/param.h> -#include <sys/buf.h> #include <sys/kernel.h> #include <sys/sysctl.h> extern struct sysctl_oid sysctl__debug_bpf_bufsize; extern struct sysctl_oid sysctl__debug_bpf_maxbufsize; +extern struct sysctl_oid sysctl__debug_bpf_maxdevices; +extern struct sysctl_oid sysctl__debug_iokit; #if TUN extern struct sysctl_oid sysctl__debug_if_tun_debug; #endif -#if COMPAT_43 +#if COMPAT_43_TTY #ifndef NeXT extern struct sysctl_oid sysctl__debug_ttydebug; #endif @@ -42,6 +43,10 @@ extern struct sysctl_oid sysctl__hw_machine; extern struct sysctl_oid sysctl__hw_model; extern struct sysctl_oid sysctl__hw_ncpu; extern struct sysctl_oid sysctl__hw_activecpu; +extern struct sysctl_oid sysctl__hw_physicalcpu; +extern struct sysctl_oid sysctl__hw_physicalcpu_max; +extern struct sysctl_oid sysctl__hw_logicalcpu; +extern struct sysctl_oid sysctl__hw_logicalcpu_max; extern struct sysctl_oid sysctl__hw_byteorder; extern struct sysctl_oid sysctl__hw_cputype; extern struct sysctl_oid sysctl__hw_cpusubtype; @@ -84,6 +89,10 @@ extern struct sysctl_oid sysctl__kern_sysv_shmmin; extern struct sysctl_oid sysctl__kern_sysv_shmmni; extern struct sysctl_oid sysctl__kern_sysv_shmseg; extern struct sysctl_oid sysctl__kern_sysv_shmall; +extern struct sysctl_oid sysctl__kern_sysv_ipcs; +extern struct sysctl_oid sysctl__kern_sysv_ipcs_shm; +extern struct 
sysctl_oid sysctl__kern_sysv_ipcs_sem; +extern struct sysctl_oid sysctl__kern_sysv_ipcs_msg; extern struct sysctl_oid sysctl__kern_sysv_semmni; extern struct sysctl_oid sysctl__kern_sysv_semmns; @@ -93,12 +102,16 @@ extern struct sysctl_oid sysctl__kern_sysv_semume; extern struct sysctl_oid sysctl__kern_dummy; extern struct sysctl_oid sysctl__kern_ipc_maxsockbuf; +extern struct sysctl_oid sysctl__kern_ipc_mbstat; extern struct sysctl_oid sysctl__kern_ipc_nmbclusters; extern struct sysctl_oid sysctl__kern_ipc_sockbuf_waste_factor; extern struct sysctl_oid sysctl__kern_ipc_somaxconn; extern struct sysctl_oid sysctl__kern_ipc_sosendminchain; extern struct sysctl_oid sysctl__kern_ipc_sorecvmincopy; extern struct sysctl_oid sysctl__kern_ipc_maxsockets; +extern struct sysctl_oid sysctl__kern_posix; +extern struct sysctl_oid sysctl__kern_posix_sem; +extern struct sysctl_oid sysctl__kern_posix_sem_max; extern struct sysctl_oid sysctl__kern_sugid_scripts; extern struct sysctl_oid sysctl__net_inet_icmp_icmplim; extern struct sysctl_oid sysctl__net_inet_icmp_maskrepl; @@ -126,6 +139,7 @@ extern struct sysctl_oid sysctl__net_inet_ip_subnets_are_local; extern struct sysctl_oid sysctl__net_inet_ip_keepfaith; extern struct sysctl_oid sysctl__net_inet_ip_maxfragpackets; extern struct sysctl_oid sysctl__net_inet_ip_maxfragsperpacket; +extern struct sysctl_oid sysctl__net_inet_ip_maxfrags; extern struct sysctl_oid sysctl__net_inet_ip_check_interface; extern struct sysctl_oid sysctl__net_inet_ip_check_route_selfref; extern struct sysctl_oid sysctl__net_inet_ip_use_route_genid; @@ -134,17 +148,39 @@ extern struct sysctl_oid sysctl__net_inet_ip_gifttl; #endif #if DUMMYNET -extern struct sysctl_oid sysctl__net_inet_ip_dummynet_calls; -extern struct sysctl_oid sysctl__net_inet_ip_dummynet_debug; -extern struct sysctl_oid sysctl__net_inet_ip_dummynet_idle; +extern struct sysctl_oid sysctl__net_inet_ip_dummynet_hash_size; +extern struct sysctl_oid sysctl__net_inet_ip_dummynet_curr_time; +extern struct sysctl_oid sysctl__net_inet_ip_dummynet_ready_heap; +extern struct sysctl_oid sysctl__net_inet_ip_dummynet_extract_heap; +extern struct sysctl_oid sysctl__net_inet_ip_dummynet_searches; +extern struct sysctl_oid sysctl__net_inet_ip_dummynet_search_steps; +extern struct sysctl_oid sysctl__net_inet_ip_dummynet_expire; +extern struct sysctl_oid sysctl__net_inet_ip_dummynet_max_chain_len; +extern struct sysctl_oid sysctl__net_inet_ip_dummynet_red_lookup_depth; +extern struct sysctl_oid sysctl__net_inet_ip_dummynet_red_avg_pkt_size; +extern struct sysctl_oid sysctl__net_inet_ip_dummynet_red_max_pkt_size; extern struct sysctl_oid sysctl__net_inet_ip_dummynet; #endif #if IPFIREWALL && !IPFIREWALL_KEXT +extern struct sysctl_oid sysctl__net_inet_ip_fw_enable; extern struct sysctl_oid sysctl__net_inet_ip_fw_debug; extern struct sysctl_oid sysctl__net_inet_ip_fw_verbose; extern struct sysctl_oid sysctl__net_inet_ip_fw_verbose_limit; extern struct sysctl_oid sysctl__net_inet_ip_fw_one_pass; +extern struct sysctl_oid sysctl__net_inet_ip_fw_autoinc_step; +extern struct sysctl_oid sysctl__net_inet_ip_fw_dyn_buckets; +extern struct sysctl_oid sysctl__net_inet_ip_fw_curr_dyn_buckets; +extern struct sysctl_oid sysctl__net_inet_ip_fw_dyn_count; +extern struct sysctl_oid sysctl__net_inet_ip_fw_dyn_max; +extern struct sysctl_oid sysctl__net_inet_ip_fw_static_count; +extern struct sysctl_oid sysctl__net_inet_ip_fw_dyn_ack_lifetime; +extern struct sysctl_oid sysctl__net_inet_ip_fw_dyn_syn_lifetime; +extern struct sysctl_oid 
sysctl__net_inet_ip_fw_dyn_fin_lifetime; +extern struct sysctl_oid sysctl__net_inet_ip_fw_dyn_rst_lifetime; +extern struct sysctl_oid sysctl__net_inet_ip_fw_dyn_udp_lifetime; +extern struct sysctl_oid sysctl__net_inet_ip_fw_dyn_short_lifetime; +extern struct sysctl_oid sysctl__net_inet_ip_fw_dyn_keepalive; extern struct sysctl_oid sysctl__net_inet_ip_fw; #endif @@ -152,6 +188,7 @@ extern struct sysctl_oid sysctl__net_inet_ip_linklocal; extern struct sysctl_oid sysctl__net_inet_ip_linklocal_stat; extern struct sysctl_oid sysctl__net_inet_ip_linklocal_in; extern struct sysctl_oid sysctl__net_inet_ip_linklocal_in_allowbadttl; +extern struct sysctl_oid sysctl__net_inet_ip_maxchainsent; extern struct sysctl_oid sysctl__net_inet_raw_maxdgram; extern struct sysctl_oid sysctl__net_inet_raw_recvspace; @@ -166,6 +203,7 @@ extern struct sysctl_oid sysctl__net_inet_tcp_keepinit; extern struct sysctl_oid sysctl__net_inet_tcp_keepintvl; extern struct sysctl_oid sysctl__net_inet_tcp_mssdflt; extern struct sysctl_oid sysctl__net_inet_tcp_minmss; +extern struct sysctl_oid sysctl__net_inet_tcp_minmssoverload; extern struct sysctl_oid sysctl__net_inet_tcp_recvspace; extern struct sysctl_oid sysctl__net_inet_tcp_sendspace; extern struct sysctl_oid sysctl__net_inet_tcp_slowlink_wsize; @@ -175,6 +213,7 @@ extern struct sysctl_oid sysctl__net_inet_tcp_path_mtu_discovery; extern struct sysctl_oid sysctl__net_inet_tcp_slowstart_flightsize; extern struct sysctl_oid sysctl__net_inet_tcp_local_slowstart_flightsize; extern struct sysctl_oid sysctl__net_inet_tcp_newreno; +extern struct sysctl_oid sysctl__net_inet_tcp_packetchain; extern struct sysctl_oid sysctl__net_inet_tcp_tcbhashsize; extern struct sysctl_oid sysctl__net_inet_tcp_do_tcpdrain; extern struct sysctl_oid sysctl__net_inet_tcp_icmp_may_rst; @@ -199,6 +238,7 @@ extern struct sysctl_oid sysctl__net_inet_udp_checksum; extern struct sysctl_oid sysctl__net_inet_udp_maxdgram; extern struct sysctl_oid sysctl__net_inet_udp_recvspace; extern struct sysctl_oid sysctl__net_inet_udp_blackhole; +extern struct sysctl_oid sysctl__net_inet_udp_pcbcount; #if NETAT extern struct sysctl_oid sysctl__net_appletalk_debug; @@ -221,7 +261,7 @@ extern struct sysctl_oid sysctl__net_link_ether_inet_maxtries; extern struct sysctl_oid sysctl__net_link_ether_inet_proxyall; extern struct sysctl_oid sysctl__net_link_ether_inet_prune_intvl; extern struct sysctl_oid sysctl__net_link_ether_inet_useloopback; -extern struct sysctl_oid sysctl__net_link_ether_inet_log_arp_wrong_iface; +extern struct sysctl_oid sysctl__net_link_ether_inet_log_arp_warnings; extern struct sysctl_oid sysctl__net_link_ether_inet_apple_hwcksum_tx; extern struct sysctl_oid sysctl__net_link_ether_inet_apple_hwcksum_rx; @@ -229,6 +269,7 @@ extern struct sysctl_oid sysctl__net_link_ether_inet_apple_hwcksum_rx; extern struct sysctl_oid sysctl__net_link_generic_system_ifcount; extern struct sysctl_oid sysctl__net_link_generic; extern struct sysctl_oid sysctl__net_link_generic_ifdata; +extern struct sysctl_oid sysctl__net_link_generic_ifalldata; extern struct sysctl_oid sysctl__net_link_generic_system; #endif @@ -259,10 +300,12 @@ extern struct sysctl_oid sysctl__vfs_nfs_diskless_rootpath; extern struct sysctl_oid sysctl__vfs_nfs_diskless_swappath; extern struct sysctl_oid sysctl__vfs_nfs_nfsstats; #endif +#if NFSCLIENT extern struct sysctl_oid sysctl__vfs_generic_nfs_client_initialdowndelay; extern struct sysctl_oid sysctl__vfs_generic_nfs_client_nextdowndelay; extern struct sysctl_oid sysctl__vfs_generic_nfs_client; 
extern struct sysctl_oid sysctl__vfs_generic_nfs; +#endif extern struct sysctl_oid sysctl__vfs_generic; extern struct sysctl_oid sysctl__vfs_generic_vfsidlist; @@ -339,6 +382,7 @@ extern struct sysctl_oid sysctl__net_inet6_ip6_forwarding; extern struct sysctl_oid sysctl__net_inet6_ip6_redirect; extern struct sysctl_oid sysctl__net_inet6_ip6_hlim; extern struct sysctl_oid sysctl__net_inet6_ip6_maxfragpackets; +extern struct sysctl_oid sysctl__net_inet6_ip6_maxfrags; extern struct sysctl_oid sysctl__net_inet6_ip6_accept_rtadv; extern struct sysctl_oid sysctl__net_inet6_ip6_keepfaith; extern struct sysctl_oid sysctl__net_inet6_ip6_log_interval; @@ -354,6 +398,7 @@ extern struct sysctl_oid sysctl__net_inet6_ip6_use_tempaddr; extern struct sysctl_oid sysctl__net_inet6_ip6_v6only; extern struct sysctl_oid sysctl__net_inet6_ip6_auto_linklocal; extern struct sysctl_oid sysctl__net_inet6_ip6_rip6stats; +extern struct sysctl_oid sysctl__net_inet6_ip6_mrt6stat; extern struct sysctl_oid sysctl__net_inet6_ip6_rtexpire; extern struct sysctl_oid sysctl__net_inet6_ip6_rtminexpire; extern struct sysctl_oid sysctl__net_inet6_ip6_rtmaxcache; @@ -421,6 +466,7 @@ extern struct sysctl_oid sysctl__net_key_esp_keymin; extern struct sysctl_oid sysctl__net_key_esp_auth; extern struct sysctl_oid sysctl__net_key_ah_keymin; extern struct sysctl_oid sysctl__net_key_natt_keepalive_interval; +extern struct sysctl_oid sysctl__net_key_pfkeystat; #endif @@ -434,41 +480,58 @@ struct sysctl_oid *newsysctl_list[] = &sysctl__vfs, &sysctl__sysctl, &sysctl__debug_bpf_bufsize, - &sysctl__debug_bpf_maxbufsize + &sysctl__debug_bpf_maxbufsize, + &sysctl__debug_bpf_maxdevices, + &sysctl__debug_iokit #if TUN ,&sysctl__debug_if_tun_debug #endif -#if COMPAT_43 +#if COMPAT_43_TTY #ifndef NeXT ,&sysctl__debug_ttydebug #endif #endif + ,&sysctl__kern_posix + ,&sysctl__kern_posix_sem + ,&sysctl__kern_posix_sem_max + ,&sysctl__kern_sysv_shmmax ,&sysctl__kern_sysv_shmmin ,&sysctl__kern_sysv_shmmni ,&sysctl__kern_sysv_shmseg ,&sysctl__kern_sysv_shmall + ,&sysctl__kern_sysv_ipcs + ,&sysctl__kern_sysv_ipcs_shm + ,&sysctl__kern_sysv_ipcs_sem + ,&sysctl__kern_sysv_ipcs_msg ,&sysctl__kern_sysv_semmni ,&sysctl__kern_sysv_semmns ,&sysctl__kern_sysv_semmnu ,&sysctl__kern_sysv_semmsl ,&sysctl__kern_sysv_semume ,&sysctl__kern_dummy + ,&sysctl__kern_ipc_maxsockbuf + ,&sysctl__kern_ipc_mbstat ,&sysctl__kern_ipc_nmbclusters ,&sysctl__kern_ipc_sockbuf_waste_factor ,&sysctl__kern_ipc_somaxconn ,&sysctl__kern_ipc_sosendminchain ,&sysctl__kern_ipc_sorecvmincopy ,&sysctl__kern_ipc_maxsockets + ,&sysctl__kern_sugid_scripts ,&sysctl__hw_machine ,&sysctl__hw_model ,&sysctl__hw_ncpu ,&sysctl__hw_activecpu + ,&sysctl__hw_physicalcpu + ,&sysctl__hw_physicalcpu_max + ,&sysctl__hw_logicalcpu + ,&sysctl__hw_logicalcpu_max ,&sysctl__hw_byteorder ,&sysctl__hw_cputype ,&sysctl__hw_cpusubtype @@ -529,6 +592,7 @@ struct sysctl_oid *newsysctl_list[] = ,&sysctl__net_inet_ip_keepfaith ,&sysctl__net_inet_ip_maxfragpackets ,&sysctl__net_inet_ip_maxfragsperpacket + ,&sysctl__net_inet_ip_maxfrags ,&sysctl__net_inet_ip_check_interface ,&sysctl__net_inet_ip_check_route_selfref ,&sysctl__net_inet_ip_use_route_genid @@ -536,23 +600,46 @@ struct sysctl_oid *newsysctl_list[] = ,&sysctl__net_inet_ip_gifttl #endif #if DUMMYNET - ,&sysctl__net_inet_ip_dummynet_calls - ,&sysctl__net_inet_ip_dummynet_debug - ,&sysctl__net_inet_ip_dummynet_idle + ,&sysctl__net_inet_ip_dummynet_hash_size + ,&sysctl__net_inet_ip_dummynet_curr_time + ,&sysctl__net_inet_ip_dummynet_ready_heap + 
,&sysctl__net_inet_ip_dummynet_extract_heap + ,&sysctl__net_inet_ip_dummynet_searches + ,&sysctl__net_inet_ip_dummynet_search_steps + ,&sysctl__net_inet_ip_dummynet_expire + ,&sysctl__net_inet_ip_dummynet_max_chain_len + ,&sysctl__net_inet_ip_dummynet_red_lookup_depth + ,&sysctl__net_inet_ip_dummynet_red_avg_pkt_size + ,&sysctl__net_inet_ip_dummynet_red_max_pkt_size ,&sysctl__net_inet_ip_dummynet #endif #if IPFIREWALL && !IPFIREWALL_KEXT + ,&sysctl__net_inet_ip_fw_enable ,&sysctl__net_inet_ip_fw_debug ,&sysctl__net_inet_ip_fw_verbose ,&sysctl__net_inet_ip_fw_verbose_limit ,&sysctl__net_inet_ip_fw_one_pass + ,&sysctl__net_inet_ip_fw_autoinc_step + ,&sysctl__net_inet_ip_fw_dyn_buckets + ,&sysctl__net_inet_ip_fw_curr_dyn_buckets + ,&sysctl__net_inet_ip_fw_dyn_count + ,&sysctl__net_inet_ip_fw_dyn_max + ,&sysctl__net_inet_ip_fw_static_count + ,&sysctl__net_inet_ip_fw_dyn_ack_lifetime + ,&sysctl__net_inet_ip_fw_dyn_syn_lifetime + ,&sysctl__net_inet_ip_fw_dyn_fin_lifetime + ,&sysctl__net_inet_ip_fw_dyn_rst_lifetime + ,&sysctl__net_inet_ip_fw_dyn_udp_lifetime + ,&sysctl__net_inet_ip_fw_dyn_short_lifetime + ,&sysctl__net_inet_ip_fw_dyn_keepalive ,&sysctl__net_inet_ip_fw #endif ,&sysctl__net_inet_ip_linklocal ,&sysctl__net_inet_ip_linklocal_stat ,&sysctl__net_inet_ip_linklocal_in ,&sysctl__net_inet_ip_linklocal_in_allowbadttl + ,&sysctl__net_inet_ip_maxchainsent ,&sysctl__net_inet_raw_maxdgram ,&sysctl__net_inet_raw_recvspace ,&sysctl__net_inet_tcp_always_keepalive @@ -566,6 +653,7 @@ struct sysctl_oid *newsysctl_list[] = ,&sysctl__net_inet_tcp_keepintvl ,&sysctl__net_inet_tcp_mssdflt ,&sysctl__net_inet_tcp_minmss + ,&sysctl__net_inet_tcp_minmssoverload ,&sysctl__net_inet_tcp_recvspace ,&sysctl__net_inet_tcp_sendspace ,&sysctl__net_inet_tcp_slowlink_wsize @@ -575,6 +663,7 @@ struct sysctl_oid *newsysctl_list[] = ,&sysctl__net_inet_tcp_slowstart_flightsize ,&sysctl__net_inet_tcp_local_slowstart_flightsize ,&sysctl__net_inet_tcp_newreno + ,&sysctl__net_inet_tcp_packetchain ,&sysctl__net_inet_tcp_tcbhashsize ,&sysctl__net_inet_tcp_do_tcpdrain ,&sysctl__net_inet_tcp_icmp_may_rst @@ -599,6 +688,7 @@ struct sysctl_oid *newsysctl_list[] = ,&sysctl__net_inet_udp_maxdgram ,&sysctl__net_inet_udp_recvspace ,&sysctl__net_inet_udp_blackhole + ,&sysctl__net_inet_udp_pcbcount #if NETAT ,&sysctl__net_appletalk_debug @@ -622,13 +712,14 @@ struct sysctl_oid *newsysctl_list[] = ,&sysctl__net_link_ether_inet_proxyall ,&sysctl__net_link_ether_inet_prune_intvl ,&sysctl__net_link_ether_inet_useloopback - ,&sysctl__net_link_ether_inet_log_arp_wrong_iface + ,&sysctl__net_link_ether_inet_log_arp_warnings ,&sysctl__net_link_ether_inet_apple_hwcksum_tx ,&sysctl__net_link_ether_inet_apple_hwcksum_rx #if NETMIBS ,&sysctl__net_link_generic_system_ifcount ,&sysctl__net_link_generic ,&sysctl__net_link_generic_ifdata + ,&sysctl__net_link_generic_ifalldata ,&sysctl__net_link_generic_system #endif @@ -664,10 +755,12 @@ struct sysctl_oid *newsysctl_list[] = ,&sysctl__vfs_generic_vfsidlist ,&sysctl__vfs_generic_ctlbyfsid ,&sysctl__vfs_generic_noremotehang +#if NFSCLIENT ,&sysctl__vfs_generic_nfs ,&sysctl__vfs_generic_nfs_client ,&sysctl__vfs_generic_nfs_client_initialdowndelay ,&sysctl__vfs_generic_nfs_client_nextdowndelay +#endif ,&sysctl__kern_ipc ,&sysctl__kern_sysv ,&sysctl__net_inet @@ -721,6 +814,7 @@ struct sysctl_oid *newsysctl_list[] = ,&sysctl__net_inet6_ip6_redirect ,&sysctl__net_inet6_ip6_hlim ,&sysctl__net_inet6_ip6_maxfragpackets + ,&sysctl__net_inet6_ip6_maxfrags ,&sysctl__net_inet6_ip6_accept_rtadv 
,&sysctl__net_inet6_ip6_keepfaith ,&sysctl__net_inet6_ip6_log_interval @@ -736,6 +830,7 @@ struct sysctl_oid *newsysctl_list[] = ,&sysctl__net_inet6_ip6_v6only ,&sysctl__net_inet6_ip6_auto_linklocal ,&sysctl__net_inet6_ip6_rip6stats + ,&sysctl__net_inet6_ip6_mrt6stat ,&sysctl__net_inet6_ip6_rtexpire ,&sysctl__net_inet6_ip6_rtminexpire ,&sysctl__net_inet6_ip6_rtmaxcache @@ -788,6 +883,7 @@ struct sysctl_oid *newsysctl_list[] = ,&sysctl__net_key_esp_auth ,&sysctl__net_key_ah_keymin ,&sysctl__net_key_natt_keepalive_interval + ,&sysctl__net_key_pfkeystat ,&sysctl__net_inet_ipsec ,&sysctl__net_inet_ipsec_stats ,&sysctl__net_inet_ipsec_def_policy diff --git a/bsd/kern/sysv_ipc.c b/bsd/kern/sysv_ipc.c index e39505b70..0a13e17fd 100644 --- a/bsd/kern/sysv_ipc.c +++ b/bsd/kern/sysv_ipc.c @@ -55,6 +55,7 @@ #include <sys/param.h> #include <sys/ipc.h> #include <sys/ucred.h> +#include <sys/kauth.h> /* @@ -65,23 +66,22 @@ */ int -ipcperm(cred, perm, mode) - struct ucred *cred; - struct ipc_perm *perm; - int mode; +ipcperm(kauth_cred_t cred, struct ipc_perm *perm, int mode) { if (!suser(cred, (u_short *)NULL)) return (0); /* Check for user match. */ - if (cred->cr_uid != perm->cuid && cred->cr_uid != perm->uid) { + if (kauth_cred_getuid(cred) != perm->cuid && kauth_cred_getuid(cred) != perm->uid) { + int is_member; + if (mode & IPC_M) return (EPERM); /* Check for group match. */ mode >>= 3; - if (!groupmember(perm->gid, cred) && - !groupmember(perm->cgid, cred)) + if ((kauth_cred_ismember_gid(cred, perm->gid, &is_member) || !is_member) && + (kauth_cred_ismember_gid(cred, perm->cgid, &is_member) || !is_member)) /* Check for `other' match. */ mode >>= 3; } @@ -90,70 +90,3 @@ ipcperm(cred, perm, mode) return (0); return ((mode & perm->mode) == mode ? 0 : EACCES); } - - - -/* - * SYSVMSG stubs - */ - -int -msgsys(p, uap) - struct proc *p; - /* XXX actually varargs. */ -#if 0 - struct msgsys_args *uap; -#else - void *uap; -#endif -{ - return(EOPNOTSUPP); -}; - -int -msgctl(p, uap) - struct proc *p; -#if 0 - register struct msgctl_args *uap; -#else - void *uap; -#endif -{ - return(EOPNOTSUPP); -}; - -int -msgget(p, uap) - struct proc *p; -#if 0 - register struct msgget_args *uap; -#else - void *uap; -#endif -{ - return(EOPNOTSUPP); -}; - -int -msgsnd(p, uap) - struct proc *p; -#if 0 - register struct msgsnd_args *uap; -#else - void *uap; -#endif -{ - return(EOPNOTSUPP); -}; - -int -msgrcv(p, uap) - struct proc *p; -#if 0 - register struct msgrcv_args *uap; -#else - void *uap; -#endif -{ - return(EOPNOTSUPP); -}; diff --git a/bsd/kern/sysv_msg.c b/bsd/kern/sysv_msg.c index 4226476d2..757edc883 100644 --- a/bsd/kern/sysv_msg.c +++ b/bsd/kern/sysv_msg.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -40,31 +40,29 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/sysproto.h> #include <sys/kernel.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/msg.h> -#include <sys/sysent.h> +#include <sys/malloc.h> +#include <mach/mach_types.h> #include <bsm/audit_kernel.h> -static void msginit __P((void *)); -SYSINIT(sysv_msg, SI_SUB_SYSV_MSG, SI_ORDER_FIRST, msginit, NULL) +#include <sys/filedesc.h> +#include <sys/file_internal.h> +#include <sys/sysctl.h> +#include <sys/sysproto.h> +#include <sys/ipcs.h> + +static void msginit(void *); #define MSG_DEBUG #undef MSG_DEBUG_OK -#ifndef _SYS_SYSPROTO_H_ -struct msgctl_args; -int msgctl __P((struct proc *p, struct msgctl_args *uap)); -struct msgget_args; -int msgget __P((struct proc *p, struct msgget_args *uap)); -struct msgsnd_args; -int msgsnd __P((struct proc *p, struct msgsnd_args *uap)); -struct msgrcv_args; -int msgrcv __P((struct proc *p, struct msgrcv_args *uap)); -#endif -static void msg_freehdr __P((struct msg *msghdr)); +static void msg_freehdr(struct msg *msghdr); + +typedef int sy_call_t(struct proc *, void *, int *); /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *msgcalls[] = { @@ -72,20 +70,117 @@ static sy_call_t *msgcalls[] = { (sy_call_t *)msgsnd, (sy_call_t *)msgrcv }; -static int nfree_msgmaps; /* # of free map entries */ -static short free_msgmaps; /* head of linked list of free map entries */ -static struct msg *free_msghdrs; /* list of free msg headers */ -char *msgpool; /* MSGMAX byte long msg buffer pool */ -struct msgmap *msgmaps; /* MSGSEG msgmap structures */ -struct msg *msghdrs; /* MSGTQL msg headers */ -struct msqid_ds *msqids; /* MSGMNI msqid_ds struct's */ - -void -msginit(dummy) - void *dummy; +static int nfree_msgmaps; /* # of free map entries */ +static short free_msgmaps; /* free map entries list head */ +static struct msg *free_msghdrs; /* list of free msg headers */ +char *msgpool; /* MSGMAX byte long msg buffer pool */ +struct msgmap *msgmaps; /* MSGSEG msgmap structures */ +struct msg *msghdrs; /* MSGTQL msg headers */ +struct user_msqid_ds *msqids; /* MSGMNI user_msqid_ds struct's */ + +static lck_grp_t *sysv_msg_subsys_lck_grp; +static lck_grp_attr_t *sysv_msg_subsys_lck_grp_attr; +static lck_attr_t *sysv_msg_subsys_lck_attr; +static lck_mtx_t sysv_msg_subsys_mutex; + +#define SYSV_MSG_SUBSYS_LOCK() lck_mtx_lock(&sysv_msg_subsys_mutex) +#define SYSV_MSG_SUBSYS_UNLOCK() lck_mtx_unlock(&sysv_msg_subsys_mutex) + +void sysv_msg_lock_init(void); + + +#ifdef __APPLE_API_PRIVATE +struct msginfo msginfo = { + MSGMAX, /* = (MSGSSZ*MSGSEG) : max chars in a message */ + MSGMNI, /* = 40 : max message queue identifiers */ + MSGMNB, /* = 2048 : max chars in a queue */ + MSGTQL, /* = 40 : max messages in system */ + MSGSSZ, /* = 8 : size of a message segment (2^N long) */ + MSGSEG /* = 2048 : number of message segments */ +}; +#endif /* __APPLE_API_PRIVATE */ + +/* Initialize the mutex governing access to the SysV msg subsystem */ +__private_extern__ void +sysv_msg_lock_init( void ) +{ + sysv_msg_subsys_lck_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(sysv_msg_subsys_lck_grp_attr); + + sysv_msg_subsys_lck_grp = lck_grp_alloc_init("sysv_msg_subsys_lock", sysv_msg_subsys_lck_grp_attr); + + sysv_msg_subsys_lck_attr = lck_attr_alloc_init(); + /* lck_attr_setdebug(sysv_msg_subsys_lck_attr); */ + lck_mtx_init(&sysv_msg_subsys_mutex, sysv_msg_subsys_lck_grp, sysv_msg_subsys_lck_attr); +} + 
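The mutex initialized above serializes the whole SysV message subsystem, and the idiom that recurs through the rewritten msgctl()/msgget()/msgsnd()/msgrcv() hunks below is: take SYSV_MSG_SUBSYS_LOCK(), validate the queue, drop the lock around any copyin()/copyout() (user copies can fault and block), retake the lock, and revalidate before touching the queue again. A minimal sketch of that idiom, reusing the macros and types this patch introduces; copy_in_revalidate() itself is illustrative and not part of the patch:

static int
copy_in_revalidate(user_addr_t uaddr, void *kbuf, size_t len,
                   struct user_msqid_ds *msqptr, int msqid)
{
	int eval;

	SYSV_MSG_SUBSYS_UNLOCK();	/* never hold the mutex across a user fault */
	eval = copyin(uaddr, kbuf, len);
	SYSV_MSG_SUBSYS_LOCK();
	if (eval != 0)
		return (eval);
	/* the queue may have been removed while we slept; revalidate */
	if (msqptr->msg_qbytes == 0 ||
	    msqptr->msg_perm.seq != IPCID_TO_SEQ(msqid))
		return (EIDRM);		/* EINVAL on systems without EIDRM */
	return (0);
}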
+static __inline__ user_time_t +sysv_msgtime(void) +{ + struct timeval tv; + microtime(&tv); + return (tv.tv_sec); +} + +/* + * NOTE: Source and target may *NOT* overlap! (target is smaller) + */ +static void +msqid_ds_64to32(struct user_msqid_ds *in, struct msqid_ds *out) +{ + out->msg_perm = in->msg_perm; + out->msg_qnum = in->msg_qnum; + out->msg_cbytes = in->msg_cbytes; /* for ipcs */ + out->msg_qbytes = in->msg_qbytes; + out->msg_lspid = in->msg_lspid; + out->msg_lrpid = in->msg_lrpid; + out->msg_stime = in->msg_stime; /* XXX loss of range */ + out->msg_rtime = in->msg_rtime; /* XXX loss of range */ + out->msg_ctime = in->msg_ctime; /* XXX loss of range */ +} + +/* + * NOTE: Source and target are permitted to overlap! (source is smaller); + * this works because we copy fields in order from the end of the struct to + * the beginning. + */ +static void +msqid_ds_32to64(struct msqid_ds *in, struct user_msqid_ds *out) +{ + out->msg_ctime = in->msg_ctime; + out->msg_rtime = in->msg_rtime; + out->msg_stime = in->msg_stime; + out->msg_lrpid = in->msg_lrpid; + out->msg_lspid = in->msg_lspid; + out->msg_qbytes = in->msg_qbytes; + out->msg_cbytes = in->msg_cbytes; /* for ipcs */ + out->msg_qnum = in->msg_qnum; + out->msg_perm = in->msg_perm; +} + +/* This routine assumes the system is locked prior to calling this routine */ +void +msginit(__unused void *dummy) { + static int initted = 0; register int i; + /* Lazy initialization on first system call; we don't have SYSINIT(). */ + if (initted) + return; + initted = 1; + + msgpool = (char *)_MALLOC(msginfo.msgmax, M_SHM, M_WAITOK); + MALLOC(msgmaps, struct msgmap *, + sizeof(struct msgmap) * msginfo.msgseg, + M_SHM, M_WAITOK); + MALLOC(msghdrs, struct msg *, + sizeof(struct msg) * msginfo.msgtql, + M_SHM, M_WAITOK); + MALLOC(msqids, struct user_msqid_ds *, + sizeof(struct user_msqid_ds) * msginfo.msgmni, + M_SHM, M_WAITOK); + /* * msginfo.msgssz should be a power of two for efficiency reasons. * It is also pretty silly if msginfo.msgssz is less than 8 @@ -140,28 +235,17 @@ msginit(dummy) /* * Entry point for all MSG calls */ -int -msgsys(p, uap) - struct proc *p; /* XXX actually varargs.
*/ - struct msgsys_args /* { - u_int which; - int a2; - int a3; - int a4; - int a5; - int a6; - } */ *uap; +int +msgsys(struct proc *p, struct msgsys_args *uap, register_t *retval) { - if (uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0])) return (EINVAL); - return ((*msgcalls[uap->which])(p, &uap->a2)); + return ((*msgcalls[uap->which])(p, &uap->a2, retval)); } static void -msg_freehdr(msghdr) - struct msg *msghdr; +msg_freehdr(struct msg *msghdr) { while (msghdr->msg_ts > 0) { short next; @@ -183,29 +267,23 @@ msg_freehdr(msghdr) free_msghdrs = msghdr; } -#ifndef _SYS_SYSPROTO_H_ -struct msgctl_args { - int msqid; - int cmd; - struct msqid_ds *buf; -}; -#endif - int -msgctl(p, uap) - struct proc *p; - register struct msgctl_args *uap; +msgctl(struct proc *p, struct msgctl_args *uap, register_t *retval) { int msqid = uap->msqid; int cmd = uap->cmd; - struct msqid_ds *user_msqptr = uap->buf; - struct ucred *cred = p->p_ucred; + kauth_cred_t cred = kauth_cred_get(); int rval, eval; - struct msqid_ds msqbuf; - register struct msqid_ds *msqptr; + struct user_msqid_ds msqbuf; + struct user_msqid_ds *msqptr; + struct user_msqid_ds umsds; + + SYSV_MSG_SUBSYS_LOCK(); + + msginit( 0); #ifdef MSG_DEBUG_OK - printf("call to msgctl(%d, %d, 0x%x)\n", msqid, cmd, user_msqptr); + printf("call to msgctl(%d, %d, 0x%qx)\n", msqid, cmd, uap->buf); #endif AUDIT_ARG(svipc_cmd, cmd); @@ -217,7 +295,8 @@ msgctl(p, uap) printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, msginfo.msgmni); #endif - return(EINVAL); + eval = EINVAL; + goto msgctlout; } msqptr = &msqids[msqid]; @@ -226,13 +305,15 @@ msgctl(p, uap) #ifdef MSG_DEBUG_OK printf("no such msqid\n"); #endif - return(EINVAL); + eval = EINVAL; + goto msgctlout; } if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { #ifdef MSG_DEBUG_OK printf("wrong sequence number\n"); #endif - return(EINVAL); + eval = EINVAL; + goto msgctlout; } eval = 0; @@ -244,7 +325,8 @@ msgctl(p, uap) { struct msg *msghdr; if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_M))) - return(eval); + goto msgctlout; + /* Free the message headers */ msghdr = msqptr->msg_first; while (msghdr != NULL) { @@ -272,15 +354,31 @@ msgctl(p, uap) case IPC_SET: if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_M))) + goto msgctlout; + + SYSV_MSG_SUBSYS_UNLOCK(); + + if (IS_64BIT_PROCESS(p)) { + eval = copyin(uap->buf, &msqbuf, sizeof(struct user_msqid_ds)); + } else { + eval = copyin(uap->buf, &msqbuf, sizeof(struct msqid_ds)); + /* convert in place; ugly, but safe */ + msqid_ds_32to64((struct msqid_ds *)&msqbuf, &msqbuf); + } + if (eval) return(eval); - if ((eval = copyin(user_msqptr, &msqbuf, sizeof(msqbuf))) != 0) - return(eval); + + SYSV_MSG_SUBSYS_LOCK(); + if (msqbuf.msg_qbytes > msqptr->msg_qbytes) { eval = suser(cred, &p->p_acflag); if (eval) - return(eval); + goto msgctlout; } - if (msqbuf.msg_qbytes > msginfo.msgmnb) { + + + /* compare (msglen_t) value against restrict (int) value */ + if (msqbuf.msg_qbytes > (msglen_t)msginfo.msgmnb) { #ifdef MSG_DEBUG_OK printf("can't increase msg_qbytes beyond %d (truncating)\n", msginfo.msgmnb); @@ -291,14 +389,15 @@ msgctl(p, uap) #ifdef MSG_DEBUG_OK printf("can't reduce msg_qbytes to 0\n"); #endif - return(EINVAL); /* non-standard errno! 
*/ + eval = EINVAL; + goto msgctlout; } msqptr->msg_perm.uid = msqbuf.msg_perm.uid; /* change the owner */ msqptr->msg_perm.gid = msqbuf.msg_perm.gid; /* change the owner */ msqptr->msg_perm.mode = (msqptr->msg_perm.mode & ~0777) | (msqbuf.msg_perm.mode & 0777); msqptr->msg_qbytes = msqbuf.msg_qbytes; - msqptr->msg_ctime = time_second; + msqptr->msg_ctime = sysv_msgtime(); break; case IPC_STAT: @@ -306,41 +405,48 @@ msgctl(p, uap) #ifdef MSG_DEBUG_OK printf("requester doesn't have read access\n"); #endif - return(eval); + goto msgctlout; + } + + bcopy(msqptr, &umsds, sizeof(struct user_msqid_ds)); + + SYSV_MSG_SUBSYS_UNLOCK(); + if (IS_64BIT_PROCESS(p)) { + eval = copyout(&umsds, uap->buf, sizeof(struct user_msqid_ds)); + } else { + struct msqid_ds msqid_ds32; + msqid_ds_64to32(&umsds, &msqid_ds32); + eval = copyout(&msqid_ds32, uap->buf, sizeof(struct msqid_ds)); } - eval = copyout((caddr_t)msqptr, user_msqptr, - sizeof(struct msqid_ds)); + SYSV_MSG_SUBSYS_LOCK(); break; default: #ifdef MSG_DEBUG_OK printf("invalid command %d\n", cmd); #endif - return(EINVAL); + eval = EINVAL; + goto msgctlout; } if (eval == 0) - p->p_retval[0] = rval; + *retval = rval; +msgctlout: + SYSV_MSG_SUBSYS_UNLOCK(); return(eval); } -#ifndef _SYS_SYSPROTO_H_ -struct msgget_args { - key_t key; - int msgflg; -}; -#endif - int -msgget(p, uap) - struct proc *p; - register struct msgget_args *uap; +msgget(__unused struct proc *p, struct msgget_args *uap, register_t *retval) { int msqid, eval; int key = uap->key; int msgflg = uap->msgflg; - struct ucred *cred = p->p_ucred; - register struct msqid_ds *msqptr = NULL; + kauth_cred_t cred = kauth_cred_get(); + struct user_msqid_ds *msqptr = NULL; + + SYSV_MSG_SUBSYS_LOCK(); + msginit( 0); #ifdef MSG_DEBUG_OK printf("msgget(0x%x, 0%o)\n", key, msgflg); @@ -361,29 +467,30 @@ msgget(p, uap) #ifdef MSG_DEBUG_OK printf("not exclusive\n"); #endif - return(EEXIST); + eval = EEXIST; + goto msggetout; } if ((eval = ipcperm(cred, &msqptr->msg_perm, msgflg & 0700 ))) { #ifdef MSG_DEBUG_OK printf("requester doesn't have 0%o access\n", msgflg & 0700); #endif - return(eval); + goto msggetout; } goto found; } } #ifdef MSG_DEBUG_OK - printf("need to allocate the msqid_ds\n"); + printf("need to allocate the user_msqid_ds\n"); #endif if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { /* - * Look for an unallocated and unlocked msqid_ds. - * msqid_ds's can be locked by msgsnd or msgrcv while - * they are copying the message in/out. We can't - * re-use the entry until they release it. + * Look for an unallocated and unlocked user_msqid_ds. + * user_msqid_ds's can be locked by msgsnd or msgrcv + * while they are copying the message in/out. We + * can't re-use the entry until they release it. 
*/ msqptr = &msqids[msqid]; if (msqptr->msg_qbytes == 0 && @@ -392,16 +499,17 @@ msgget(p, uap) } if (msqid == msginfo.msgmni) { #ifdef MSG_DEBUG_OK - printf("no more msqid_ds's available\n"); + printf("no more user_msqid_ds's available\n"); #endif - return(ENOSPC); + eval = ENOSPC; + goto msggetout; } #ifdef MSG_DEBUG_OK printf("msqid %d is available\n", msqid); #endif msqptr->msg_perm.key = key; - msqptr->msg_perm.cuid = cred->cr_uid; - msqptr->msg_perm.uid = cred->cr_uid; + msqptr->msg_perm.cuid = kauth_cred_getuid(cred); + msqptr->msg_perm.uid = kauth_cred_getuid(cred); msqptr->msg_perm.cgid = cred->cr_gid; msqptr->msg_perm.gid = cred->cr_gid; msqptr->msg_perm.mode = (msgflg & 0777); @@ -416,47 +524,45 @@ msgget(p, uap) msqptr->msg_lrpid = 0; msqptr->msg_stime = 0; msqptr->msg_rtime = 0; - msqptr->msg_ctime = time_second; + msqptr->msg_ctime = sysv_msgtime(); } else { #ifdef MSG_DEBUG_OK printf("didn't find it and wasn't asked to create it\n"); #endif - return(ENOENT); + eval = ENOENT; + goto msggetout; } found: /* Construct the unique msqid */ - p->p_retval[0] = IXSEQ_TO_IPCID(msqid, msqptr->msg_perm); - AUDIT_ARG(svipc_id, p->p_retval[0]); - return(0); + *retval = IXSEQ_TO_IPCID(msqid, msqptr->msg_perm); + AUDIT_ARG(svipc_id, *retval); + eval = 0; +msggetout: + SYSV_MSG_SUBSYS_UNLOCK(); + return(eval); } -#ifndef _SYS_SYSPROTO_H_ -struct msgsnd_args { - int msqid; - void *msgp; - size_t msgsz; - int msgflg; -}; -#endif int -msgsnd(p, uap) - struct proc *p; - register struct msgsnd_args *uap; +msgsnd(struct proc *p, struct msgsnd_args *uap, register_t *retval) { int msqid = uap->msqid; - void *user_msgp = uap->msgp; - size_t msgsz = uap->msgsz; + user_addr_t user_msgp = uap->msgp; + size_t msgsz = (size_t)uap->msgsz; /* limit to 4G */ int msgflg = uap->msgflg; int segs_needed, eval; - struct ucred *cred = p->p_ucred; - register struct msqid_ds *msqptr; - register struct msg *msghdr; + struct user_msqid_ds *msqptr; + struct msg *msghdr; short next; + user_long_t msgtype; + + + SYSV_MSG_SUBSYS_LOCK(); + msginit( 0); #ifdef MSG_DEBUG_OK - printf("call to msgsnd(%d, 0x%x, %d, %d)\n", msqid, user_msgp, msgsz, + printf("call to msgsnd(%d, 0x%qx, %d, %d)\n", msqid, user_msgp, msgsz, msgflg); #endif @@ -468,7 +574,8 @@ msgsnd(p, uap) printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, msginfo.msgmni); #endif - return(EINVAL); + eval = EINVAL; + goto msgsndout; } msqptr = &msqids[msqid]; @@ -476,20 +583,22 @@ msgsnd(p, uap) #ifdef MSG_DEBUG_OK printf("no such message queue id\n"); #endif - return(EINVAL); + eval = EINVAL; + goto msgsndout; } if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { #ifdef MSG_DEBUG_OK printf("wrong sequence number\n"); #endif - return(EINVAL); + eval = EINVAL; + goto msgsndout; } - if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_W))) { + if ((eval = ipcperm(kauth_cred_get(), &msqptr->msg_perm, IPC_W))) { #ifdef MSG_DEBUG_OK printf("requester doesn't have write access\n"); #endif - return(eval); + goto msgsndout; } segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz; @@ -509,7 +618,8 @@ msgsnd(p, uap) #ifdef MSG_DEBUG_OK printf("msgsz > msqptr->msg_qbytes\n"); #endif - return(EINVAL); + eval = EINVAL; + goto msgsndout; } if (msqptr->msg_perm.mode & MSG_LOCKED) { @@ -544,19 +654,20 @@ msgsnd(p, uap) #ifdef MSG_DEBUG_OK printf("need more resources but caller doesn't want to wait\n"); #endif - return(EAGAIN); + eval = EAGAIN; + goto msgsndout; } if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) { #ifdef MSG_DEBUG_OK - printf("we don't own the 
msqid_ds\n"); + printf("we don't own the user_msqid_ds\n"); #endif we_own_it = 0; } else { /* Force later arrivals to wait for our request */ #ifdef MSG_DEBUG_OK - printf("we own the msqid_ds\n"); + printf("we own the user_msqid_ds\n"); #endif msqptr->msg_perm.mode |= MSG_LOCKED; we_own_it = 1; @@ -564,7 +675,7 @@ msgsnd(p, uap) #ifdef MSG_DEBUG_OK printf("goodnight\n"); #endif - eval = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH, + eval = msleep((caddr_t)msqptr, &sysv_msg_subsys_mutex, (PZERO - 4) | PCATCH, "msgwait", 0); #ifdef MSG_DEBUG_OK printf("good morning, eval=%d\n", eval); @@ -575,7 +686,8 @@ msgsnd(p, uap) #ifdef MSG_DEBUG_OK printf("msgsnd: interrupted system call\n"); #endif - return(EINTR); + eval = EINTR; + goto msgsndout; } /* @@ -588,12 +700,14 @@ msgsnd(p, uap) #endif /* The SVID says to return EIDRM. */ #ifdef EIDRM - return(EIDRM); + eval = EIDRM; #else /* Unfortunately, BSD doesn't define that code yet! */ - return(EINVAL); + eval = EINVAL; #endif + goto msgsndout; + } } else { @@ -619,12 +733,12 @@ msgsnd(p, uap) panic("no more msghdrs"); /* - * Re-lock the msqid_ds in case we page-fault when copying in the - * message + * Re-lock the user_msqid_ds in case we page-fault when copying in + * the message */ if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) - panic("msqid_ds is already locked"); + panic("user_msqid_ds is already locked"); msqptr->msg_perm.mode |= MSG_LOCKED; /* @@ -661,25 +775,36 @@ msgsnd(p, uap) } /* - * Copy in the message type + * Copy in the message type. For a 64 bit process, this is 64 bits, + * but we only ever use the low 32 bits, so the cast is OK. */ + if (IS_64BIT_PROCESS(p)) { + SYSV_MSG_SUBSYS_UNLOCK(); + eval = copyin(user_msgp, &msgtype, sizeof(msgtype)); + SYSV_MSG_SUBSYS_LOCK(); + msghdr->msg_type = CAST_DOWN(long,msgtype); + user_msgp = user_msgp + sizeof(msgtype); /* ptr math */ + } else { + SYSV_MSG_SUBSYS_UNLOCK(); + eval = copyin(user_msgp, &msghdr->msg_type, sizeof(long)); + SYSV_MSG_SUBSYS_LOCK(); + user_msgp = user_msgp + sizeof(long); /* ptr math */ + } - if ((eval = copyin(user_msgp, &msghdr->msg_type, - sizeof(msghdr->msg_type))) != 0) { + if (eval != 0) { #ifdef MSG_DEBUG_OK printf("error %d copying the message type\n", eval); #endif msg_freehdr(msghdr); msqptr->msg_perm.mode &= ~MSG_LOCKED; wakeup((caddr_t)msqptr); - return(eval); + goto msgsndout; } - user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type); + /* * Validate the message type */ - if (msghdr->msg_type < 1) { msg_freehdr(msghdr); msqptr->msg_perm.mode &= ~MSG_LOCKED; @@ -687,17 +812,18 @@ msgsnd(p, uap) #ifdef MSG_DEBUG_OK printf("mtype (%d) < 1\n", msghdr->msg_type); #endif - return(EINVAL); + eval = EINVAL; + goto msgsndout; } /* * Copy in the message body */ - next = msghdr->msg_spot; while (msgsz > 0) { size_t tlen; - if (msgsz > msginfo.msgssz) + /* compare input (size_t) value against restrict (int) value */ + if (msgsz > (size_t)msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz; @@ -705,31 +831,36 @@ msgsnd(p, uap) panic("next too low #2"); if (next >= msginfo.msgseg) panic("next out of range #2"); - if ((eval = copyin(user_msgp, &msgpool[next * msginfo.msgssz], - tlen)) != 0) { + + SYSV_MSG_SUBSYS_UNLOCK(); + eval = copyin(user_msgp, &msgpool[next * msginfo.msgssz], tlen); + SYSV_MSG_SUBSYS_LOCK(); + + if (eval != 0) { #ifdef MSG_DEBUG_OK printf("error %d copying in message segment\n", eval); #endif msg_freehdr(msghdr); msqptr->msg_perm.mode &= ~MSG_LOCKED; wakeup((caddr_t)msqptr); - return(eval); + + goto msgsndout; } msgsz -= tlen; - user_msgp = 
(char *)user_msgp + tlen; + user_msgp = user_msgp + tlen; /* ptr math */ next = msgmaps[next].next; } if (next != -1) panic("didn't use all the msg segments"); /* - * We've got the message. Unlock the msqid_ds. + * We've got the message. Unlock the user_msqid_ds. */ msqptr->msg_perm.mode &= ~MSG_LOCKED; /* - * Make sure that the msqid_ds is still allocated. + * Make sure that the user_msqid_ds is still allocated. */ if (msqptr->msg_qbytes == 0) { @@ -737,11 +868,12 @@ msgsnd(p, uap) wakeup((caddr_t)msqptr); /* The SVID says to return EIDRM. */ #ifdef EIDRM - return(EIDRM); + eval = EIDRM; #else /* Unfortunately, BSD doesn't define that code yet! */ - return(EINVAL); + eval = EINVAL; #endif + goto msgsndout; } /* @@ -760,42 +892,39 @@ msgsnd(p, uap) msqptr->msg_cbytes += msghdr->msg_ts; msqptr->msg_qnum++; msqptr->msg_lspid = p->p_pid; - msqptr->msg_stime = time_second; + msqptr->msg_stime = sysv_msgtime(); wakeup((caddr_t)msqptr); - p->p_retval[0] = 0; - return(0); + *retval = 0; + eval = 0; + +msgsndout: + SYSV_MSG_SUBSYS_UNLOCK(); + return(eval); } -#ifndef _SYS_SYSPROTO_H_ -struct msgrcv_args { - int msqid; - void *msgp; - size_t msgsz; - long msgtyp; - int msgflg; -}; -#endif int -msgrcv(p, uap) - struct proc *p; - register struct msgrcv_args *uap; +msgrcv(struct proc *p, struct msgrcv_args *uap, user_ssize_t *retval) { int msqid = uap->msqid; - void *user_msgp = uap->msgp; - size_t msgsz = uap->msgsz; - long msgtyp = uap->msgtyp; + user_addr_t user_msgp = uap->msgp; + size_t msgsz = (size_t)uap->msgsz; /* limit to 4G */ + long msgtyp = (long)uap->msgtyp; /* limit to 32 bits */ int msgflg = uap->msgflg; size_t len; - struct ucred *cred = p->p_ucred; - register struct msqid_ds *msqptr; - register struct msg *msghdr; + struct user_msqid_ds *msqptr; + struct msg *msghdr; int eval; short next; + user_long_t msgtype; + long msg_type_long; + + SYSV_MSG_SUBSYS_LOCK(); + msginit( 0); #ifdef MSG_DEBUG_OK - printf("call to msgrcv(%d, 0x%x, %d, %ld, %d)\n", msqid, user_msgp, + printf("call to msgrcv(%d, 0x%qx, %d, %ld, %d)\n", msqid, user_msgp, msgsz, msgtyp, msgflg); #endif @@ -807,7 +936,8 @@ msgrcv(p, uap) printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, msginfo.msgmni); #endif - return(EINVAL); + eval = EINVAL; + goto msgrcvout; } msqptr = &msqids[msqid]; @@ -815,20 +945,22 @@ msgrcv(p, uap) #ifdef MSG_DEBUG_OK printf("no such message queue id\n"); #endif - return(EINVAL); + eval = EINVAL; + goto msgrcvout; } if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { #ifdef MSG_DEBUG_OK printf("wrong sequence number\n"); #endif - return(EINVAL); + eval = EINVAL; + goto msgrcvout; } - if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_R))) { + if ((eval = ipcperm(kauth_cred_get(), &msqptr->msg_perm, IPC_R))) { #ifdef MSG_DEBUG_OK printf("requester doesn't have read access\n"); #endif - return(eval); + goto msgrcvout; } msghdr = NULL; @@ -842,7 +974,8 @@ msgrcv(p, uap) printf("first message on the queue is too big (want %d, got %d)\n", msgsz, msghdr->msg_ts); #endif - return(E2BIG); + eval = E2BIG; + goto msgrcvout; } if (msqptr->msg_first == msqptr->msg_last) { msqptr->msg_first = NULL; @@ -881,7 +1014,8 @@ msgrcv(p, uap) printf("requested message on the queue is too big (want %d, got %d)\n", msgsz, msghdr->msg_ts); #endif - return(E2BIG); + eval = E2BIG; + goto msgrcvout; } *prev = msghdr->msg_next; if (msghdr == msqptr->msg_last) { @@ -928,11 +1062,12 @@ msgrcv(p, uap) #endif /* The SVID says to return ENOMSG. 
*/ #ifdef ENOMSG - return(ENOMSG); + eval = ENOMSG; #else /* Unfortunately, BSD doesn't define that code yet! */ - return(EAGAIN); + eval = EAGAIN; #endif + goto msgrcvout; } /* @@ -942,7 +1077,7 @@ msgrcv(p, uap) #ifdef MSG_DEBUG_OK printf("msgrcv: goodnight\n"); #endif - eval = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH, "msgwait", + eval = msleep((caddr_t)msqptr, &sysv_msg_subsys_mutex, (PZERO - 4) | PCATCH, "msgwait", 0); #ifdef MSG_DEBUG_OK printf("msgrcv: good morning (eval=%d)\n", eval); @@ -952,7 +1087,8 @@ msgrcv(p, uap) #ifdef MSG_DEBUG_OK printf("msgsnd: interrupted system call\n"); #endif - return(EINTR); + eval = EINTR; + goto msgrcvout; } /* @@ -966,11 +1102,12 @@ msgrcv(p, uap) #endif /* The SVID says to return EIDRM. */ #ifdef EIDRM - return(EIDRM); + eval = EIDRM; #else /* Unfortunately, BSD doesn't define that code yet! */ - return(EINVAL); + eval = EINVAL; #endif + goto msgrcvout; } } @@ -983,7 +1120,7 @@ msgrcv(p, uap) msqptr->msg_cbytes -= msghdr->msg_ts; msqptr->msg_qnum--; msqptr->msg_lrpid = p->p_pid; - msqptr->msg_rtime = time_second; + msqptr->msg_rtime = sysv_msgtime(); /* * Make msgsz the actual amount that we'll be returning. @@ -1002,17 +1139,34 @@ msgrcv(p, uap) * Return the type to the user. */ - eval = copyout((caddr_t)&(msghdr->msg_type), user_msgp, - sizeof(msghdr->msg_type)); + /* + * Copy out the message type. For a 64 bit process, this is 64 bits, + * but we only ever use the low 32 bits, so the cast is OK. + */ + if (IS_64BIT_PROCESS(p)) { + msgtype = msghdr->msg_type; + SYSV_MSG_SUBSYS_UNLOCK(); + eval = copyout(&msgtype, user_msgp, sizeof(msgtype)); + SYSV_MSG_SUBSYS_LOCK(); + user_msgp = user_msgp + sizeof(msgtype); /* ptr math */ + } else { + msg_type_long = msghdr->msg_type; + SYSV_MSG_SUBSYS_UNLOCK(); + eval = copyout(&msg_type_long, user_msgp, sizeof(long)); + SYSV_MSG_SUBSYS_LOCK(); + user_msgp = user_msgp + sizeof(long); /* ptr math */ + } + if (eval != 0) { #ifdef MSG_DEBUG_OK printf("error (%d) copying out message type\n", eval); #endif msg_freehdr(msghdr); wakeup((caddr_t)msqptr); - return(eval); + + goto msgrcvout; } - user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type); + /* * Return the segments to the user @@ -1022,7 +1176,8 @@ msgrcv(p, uap) for (len = 0; len < msgsz; len += msginfo.msgssz) { size_t tlen; - if (msgsz > msginfo.msgssz) + /* compare input (size_t) value against restrict (int) value */ + if (msgsz > (size_t)msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz; @@ -1030,8 +1185,10 @@ msgrcv(p, uap) panic("next too low #3"); if (next >= msginfo.msgseg) panic("next out of range #3"); - eval = copyout((caddr_t)&msgpool[next * msginfo.msgssz], + SYSV_MSG_SUBSYS_UNLOCK(); + eval = copyout(&msgpool[next * msginfo.msgssz], user_msgp, tlen); + SYSV_MSG_SUBSYS_LOCK(); if (eval != 0) { #ifdef MSG_DEBUG_OK printf("error (%d) copying out message segment\n", @@ -1039,9 +1196,9 @@ msgrcv(p, uap) #endif msg_freehdr(msghdr); wakeup((caddr_t)msqptr); - return(eval); + goto msgrcvout; } - user_msgp = (char *)user_msgp + tlen; + user_msgp = user_msgp + tlen; /* ptr math */ next = msgmaps[next].next; } @@ -1051,6 +1208,121 @@ msgrcv(p, uap) msg_freehdr(msghdr); wakeup((caddr_t)msqptr); - p->p_retval[0] = msgsz; - return(0); + *retval = msgsz; + eval = 0; +msgrcvout: + SYSV_MSG_SUBSYS_UNLOCK(); + return(eval); +} + +static int +IPCS_msg_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, + __unused int arg2, struct sysctl_req *req) +{ + int error; + int cursor; + union { + struct IPCS_command u32; + struct 
user_IPCS_command u64; + } ipcs; + struct msqid_ds msqid_ds32; /* post conversion, 32 bit version */ + void *msqid_dsp; + size_t ipcs_sz = sizeof(struct user_IPCS_command); + size_t msqid_ds_sz = sizeof(struct user_msqid_ds); + struct proc *p = current_proc(); + + if (!IS_64BIT_PROCESS(p)) { + ipcs_sz = sizeof(struct IPCS_command); + msqid_ds_sz = sizeof(struct msqid_ds); + } + + /* Copy in the command structure */ + if ((error = SYSCTL_IN(req, &ipcs, ipcs_sz)) != 0) { + return(error); + } + + if (!IS_64BIT_PROCESS(p)) /* convert in place */ + ipcs.u64.ipcs_data = CAST_USER_ADDR_T(ipcs.u32.ipcs_data); + + /* Let us version this interface... */ + if (ipcs.u64.ipcs_magic != IPCS_MAGIC) { + return(EINVAL); + } + + SYSV_MSG_SUBSYS_LOCK(); + + switch(ipcs.u64.ipcs_op) { + case IPCS_MSG_CONF: /* Obtain global configuration data */ + if (ipcs.u64.ipcs_datalen != sizeof(struct msginfo)) { + error = ERANGE; + break; + } + if (ipcs.u64.ipcs_cursor != 0) { /* fwd. compat. */ + error = EINVAL; + break; + } + SYSV_MSG_SUBSYS_UNLOCK(); + error = copyout(&msginfo, ipcs.u64.ipcs_data, ipcs.u64.ipcs_datalen); + SYSV_MSG_SUBSYS_LOCK(); + break; + + case IPCS_MSG_ITER: /* Iterate over existing segments */ + /* Not done up top so we can set limits via sysctl (later) */ + msginit( 0); + + cursor = ipcs.u64.ipcs_cursor; + if (cursor < 0 || cursor >= msginfo.msgmni) { + error = ERANGE; + break; + } + if (ipcs.u64.ipcs_datalen != (int)msqid_ds_sz) { + error = ENOMEM; + break; + } + for( ; cursor < msginfo.msgmni; cursor++) { + if (msqids[cursor].msg_qbytes != 0) /* allocated */ + break; + continue; + } + if (cursor == msginfo.msgmni) { + error = ENOENT; + break; + } + + msqid_dsp = &msqids[cursor]; /* default: 64 bit */ + + /* + * If necessary, convert the 64 bit kernel segment + * descriptor to a 32 bit user one. + */ + if (!IS_64BIT_PROCESS(p)) { + msqid_ds_64to32(msqid_dsp, &msqid_ds32); + msqid_dsp = &msqid_ds32; + } + SYSV_MSG_SUBSYS_UNLOCK(); + error = copyout(msqid_dsp, ipcs.u64.ipcs_data, ipcs.u64.ipcs_datalen); + if (!error) { + /* update cursor */ + ipcs.u64.ipcs_cursor = cursor + 1; + + if (!IS_64BIT_PROCESS(p)) /* convert in place */ + ipcs.u32.ipcs_data = CAST_DOWN(void *,ipcs.u64.ipcs_data); + error = SYSCTL_OUT(req, &ipcs, ipcs_sz); + } + SYSV_MSG_SUBSYS_LOCK(); + break; + + default: + error = EINVAL; + break; + } + + SYSV_MSG_SUBSYS_UNLOCK(); + return(error); } + +SYSCTL_DECL(_kern_sysv_ipcs); +SYSCTL_PROC(_kern_sysv_ipcs, OID_AUTO, msg, CTLFLAG_RW|CTLFLAG_ANYBODY, + 0, 0, IPCS_msg_sysctl, + "S,IPCS_msg_command", + "ipcs msg command interface"); diff --git a/bsd/kern/sysv_sem.c b/bsd/kern/sysv_sem.c index 6764d816a..8f7b26537 100644 --- a/bsd/kern/sysv_sem.c +++ b/bsd/kern/sysv_sem.c @@ -33,49 +33,27 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> -#include <sys/proc.h> -#include <sys/sem.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> +#include <sys/sem_internal.h> #include <sys/malloc.h> +#include <mach/mach_types.h> + #include <sys/filedesc.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/sysctl.h> +#include <sys/ipcs.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> #include <bsm/audit_kernel.h> -#include <mach/mach_types.h> - -/*#include <sys/sysproto.h>*/ -/*#include <sys/sysent.h>*/ /* Uncomment this line to see the debugging output */ /* #define SEM_DEBUG */ -/* Macros to deal with the semaphore subsystem lock. The lock currently uses - * the semlock_holder static variable as a mutex. 
NULL means no lock, any - * value other than NULL means locked. semlock_holder is used because it was - * present in the code before the Darwin port, and for no other reason. - * When the time comes to relax the funnel requirements of the kernel only - * these macros should need to be changed. A spin lock would work well. - */ -/* Aquire the lock */ -#define SUBSYSTEM_LOCK_AQUIRE(p) { sysv_sem_aquiring_threads++; \ - while (semlock_holder != NULL) \ - (void) tsleep((caddr_t)&semlock_holder, (PZERO - 4), "sysvsem", 0); \ - semlock_holder = p; \ - sysv_sem_aquiring_threads--; } - -/* Release the lock */ -#define SUBSYSTEM_LOCK_RELEASE { semlock_holder = NULL; wakeup((caddr_t)&semlock_holder); } - -/* Release the lock and return a value */ -#define UNLOCK_AND_RETURN(ret) { SUBSYSTEM_LOCK_RELEASE; return(ret); } +#define M_SYSVSEM M_TEMP -#define M_SYSVSEM M_SUBPROC - -#if 0 -static void seminit __P((void *)); -SYSINIT(sysv_sem, SI_SUB_SYSV_SEM, SI_ORDER_FIRST, seminit, NULL) -#endif 0 /* Hard system limits to avoid resource starvation / DOS attacks. * These are not needed if we can make the semaphore pages swappable. @@ -110,27 +88,11 @@ struct seminfo seminfo = { SEMAEM /* adjust on exit max value */ }; -/* A counter so the module unload code knows when there are no more processes using - * the sysv_sem code */ -static long sysv_sem_sleeping_threads = 0; -static long sysv_sem_aquiring_threads = 0; - -struct semctl_args; -int semctl __P((struct proc *p, struct semctl_args *uap, int *)); -struct semget_args; -int semget __P((struct proc *p, struct semget_args *uap, int *)); -struct semop_args; -int semop __P((struct proc *p, struct semop_args *uap, int *)); -struct semconfig_args; -int semconfig __P((struct proc *p, struct semconfig_args *uap, int *)); - -static struct sem_undo *semu_alloc __P((struct proc *p)); -static int semundo_adjust __P((struct proc *p, struct sem_undo **supptr, - int semid, int semnum, int adjval)); -static void semundo_clear __P((int semid, int semnum)); - -typedef int sy_call_t __P((struct proc *, void *, int *)); +static struct sem_undo *semu_alloc(struct proc *p); +static int semundo_adjust(struct proc *p, struct sem_undo **supptr, + int semid, int semnum, int adjval); +static void semundo_clear(int semid, int semnum); /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *semcalls[] = { @@ -138,45 +100,97 @@ static sy_call_t *semcalls[] = { (sy_call_t *)semop, (sy_call_t *)semconfig }; -static int semtot = 0; /* # of used semaphores */ -struct semid_ds *sema = NULL; /* semaphore id pool */ -struct sem *sem = NULL; /* semaphore pool */ -static struct sem_undo *semu_list = NULL; /* list of active undo structures */ -struct sem_undo *semu = NULL; /* semaphore undo pool */ +static int semtot = 0; /* # of used semaphores */ +struct user_semid_ds *sema = NULL; /* semaphore id pool */ +struct sem *sem_pool = NULL; /* semaphore pool */ +static struct sem_undo *semu_list = NULL; /* active undo structures */ +struct sem_undo *semu = NULL; /* semaphore undo pool */ -static struct proc *semlock_holder = NULL; -/* seminit no longer needed. 
The data structures are grown dynamically */ -void -seminit() +void sysv_sem_lock_init(void); +static lck_grp_t *sysv_sem_subsys_lck_grp; +static lck_grp_attr_t *sysv_sem_subsys_lck_grp_attr; +static lck_attr_t *sysv_sem_subsys_lck_attr; +static lck_mtx_t sysv_sem_subsys_mutex; + +#define SYSV_SEM_SUBSYS_LOCK() lck_mtx_lock(&sysv_sem_subsys_mutex) +#define SYSV_SEM_SUBSYS_UNLOCK() lck_mtx_unlock(&sysv_sem_subsys_mutex) + + +__private_extern__ void +sysv_sem_lock_init( void ) +{ + + sysv_sem_subsys_lck_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(sysv_sem_subsys_lck_grp_attr); + + sysv_sem_subsys_lck_grp = lck_grp_alloc_init("sysv_sem_subsys_lock", sysv_sem_subsys_lck_grp_attr); + + sysv_sem_subsys_lck_attr = lck_attr_alloc_init(); + lck_attr_setdebug(sysv_sem_subsys_lck_attr); + lck_mtx_init(&sysv_sem_subsys_mutex, sysv_sem_subsys_lck_grp, sysv_sem_subsys_lck_attr); +} + +static __inline__ user_time_t +sysv_semtime(void) +{ + struct timeval tv; + microtime(&tv); + return (tv.tv_sec); +} + +/* + * XXX conversion of internal user_time_t to external time_t loses + * XXX precision; not an issue for us now, since we are only ever + * XXX setting 32 bits worth of time into it. + * + * pad field contents are not moved correspondingly; contents will be lost + * + * NOTE: Source and target may *NOT* overlap! (target is smaller) + */ +static void +semid_ds_64to32(struct user_semid_ds *in, struct semid_ds *out) { + out->sem_perm = in->sem_perm; + out->sem_base = (__int32_t)in->sem_base; + out->sem_nsems = in->sem_nsems; + out->sem_otime = in->sem_otime; /* XXX loses precision */ + out->sem_ctime = in->sem_ctime; /* XXX loses precision */ } +/* + * pad field contents are not moved correspondingly; contents will be lost + * + * NOTE: Source and target are permitted to overlap! (source is smaller); + * this works because we copy fields in order from the end of the struct to + * the beginning. + * + * XXX use CAST_USER_ADDR_T() for lack of a CAST_USER_TIME_T(); net effect + * XXX is the same. + */ +static void +semid_ds_32to64(struct semid_ds *in, struct user_semid_ds *out) +{ + out->sem_ctime = in->sem_ctime; + out->sem_otime = in->sem_otime; + out->sem_nsems = in->sem_nsems; + out->sem_base = (void *)in->sem_base; + out->sem_perm = in->sem_perm; +} + + /* * Entry point for all SEM calls * * In Darwin this is no longer the entry point. It will be removed after * the code has been tested better. */ -struct semsys_args { - u_int which; - int a2; - int a3; - int a4; - int a5; -}; +/* XXX actually varargs. */ int -semsys(p, uap, retval) - struct proc *p; - /* XXX actually varargs. */ - struct semsys_args *uap; - register_t *retval; +semsys(struct proc *p, struct semsys_args *uap, register_t *retval) { /* The individual calls handle the locking now */ - /*while (semlock_holder != NULL && semlock_holder != p) - (void) tsleep((caddr_t)&semlock_holder, (PZERO - 4), "semsys", 0); - */ if (uap->which >= sizeof(semcalls)/sizeof(semcalls[0])) return (EINVAL);
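A detail of the converters above that is easy to miss: msqid_ds_32to64() and semid_ds_32to64() are deliberately safe for overlapping source and target, which is what lets msgctl()'s IPC_SET path convert the copied-in structure in place ("ugly, but safe"). Because every field's destination offset is greater than or equal to its source offset, writing fields from the end of the struct back to the beginning means each store only clobbers bytes that have already been read. A self-contained illustration with invented stand-in types (kernels are built with -fno-strict-aliasing, and this demo leans on the same latitude):

#include <assert.h>
#include <stdint.h>

struct narrow { int32_t a; int32_t b; int32_t c; };	/* stand-in "32-bit" layout */
struct wide   { int64_t a; int64_t b; int64_t c; };	/* stand-in "64-bit" layout */

static void
narrow_to_wide(struct narrow *in, struct wide *out)
{
	/* back to front, exactly like the *_32to64() converters above */
	out->c = in->c;
	out->b = in->b;
	out->a = in->a;
}

int main(void)
{
	union { struct narrow n; struct wide w; } buf;

	buf.n = (struct narrow){ 1, 2, 3 };
	narrow_to_wide(&buf.n, &buf.w);		/* source and target overlap */
	assert(buf.w.a == 1 && buf.w.b == 2 && buf.w.c == 3);
	return 0;
}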
@@ -198,27 +212,18 @@ semsys(p, uap, retval) * in /dev/kmem. */ -#ifndef _SYS_SYSPROTO_H_ -struct semconfig_args { - semconfig_ctl_t flag; -}; -#endif - int -semconfig(p, uap, retval) - struct proc *p; - struct semconfig_args *uap; - register_t *retval; +semconfig(__unused struct proc *p, struct semconfig_args *uap, register_t *retval) { int eval = 0; switch (uap->flag) { case SEM_CONFIG_FREEZE: - SUBSYSTEM_LOCK_AQUIRE(p); + SYSV_SEM_SUBSYS_LOCK(); break; case SEM_CONFIG_THAW: - SUBSYSTEM_LOCK_RELEASE; + SYSV_SEM_SUBSYS_UNLOCK(); break; default: @@ -232,19 +237,26 @@ semconfig(p, uap, retval) return(eval); } -/* Expand the semu array to the given capacity. If the expansion fails +/* + * Expand the semu array to the given capacity. If the expansion fails * return 0, otherwise return 1. * * Assumes we already have the subsystem lock. */ static int -grow_semu_array(newSize) - int newSize; +grow_semu_array(int newSize) { - register int i, j; + register int i; register struct sem_undo *newSemu; + static boolean_t grow_semu_array_in_progress = FALSE; + + while (grow_semu_array_in_progress) { + msleep(&grow_semu_array_in_progress, &sysv_sem_subsys_mutex, + PPAUSE, "grow_semu_array", NULL); + } + if (newSize <= seminfo.semmnu) - return 0; + return 1; if (newSize > limitseminfo.semmnu) /* enforce hard limit */ { #ifdef SEM_DEBUG @@ -259,8 +271,13 @@ grow_semu_array(newSize) #ifdef SEM_DEBUG printf("growing semu[] from %d to %d\n", seminfo.semmnu, newSize); #endif - MALLOC(newSemu, struct sem_undo*, sizeof(struct sem_undo)*newSize, + grow_semu_array_in_progress = TRUE; + SYSV_SEM_SUBSYS_UNLOCK(); + MALLOC(newSemu, struct sem_undo*, sizeof(struct sem_undo) * newSize, M_SYSVSEM, M_WAITOK); + SYSV_SEM_SUBSYS_LOCK(); + grow_semu_array_in_progress = FALSE; + wakeup((caddr_t) &grow_semu_array_in_progress); if (NULL == newSemu) { #ifdef SEM_DEBUG @@ -273,8 +290,6 @@ grow_semu_array(newSize) for (i = 0; i < seminfo.semmnu; i++) { newSemu[i] = semu[i]; - for(j = 0; j < SEMUME; j++) /* Is this really needed? */ - newSemu[i].un_ent[j] = semu[i].un_ent[j]; } for (i = seminfo.semmnu; i < newSize; i++) { @@ -300,10 +315,9 @@ grow_semu_array(newSize) * Assumes we already have the subsystem lock. */ static int -grow_sema_array(newSize) - int newSize; +grow_sema_array(int newSize) { - register struct semid_ds *newSema; + register struct user_semid_ds *newSema; register int i; if (newSize <= seminfo.semmni) @@ -322,7 +336,7 @@ #ifdef SEM_DEBUG printf("growing sema[] from %d to %d\n", seminfo.semmni, newSize); #endif - MALLOC(newSema, struct semid_ds*, sizeof(struct semid_ds)*newSize, + MALLOC(newSema, struct user_semid_ds *, sizeof(struct user_semid_ds) * newSize, M_SYSVSEM, M_WAITOK); if (NULL == newSema) { @@ -342,7 +356,7 @@ * this with the existing code, so we wake up the * process and let it do a lot of work to determine the * semaphore set is really not available yet, and then - * sleep on the correct, reallocated semid_ds pointer. + * sleep on the correct, reallocated user_semid_ds pointer. */ if (sema[i].sem_perm.mode & SEM_ALLOC) wakeup((caddr_t)&sema[i]); @@ -350,7 +364,7 @@ for (i = seminfo.semmni; i < newSize; i++) { - newSema[i].sem_base = 0; + newSema[i].sem_base = NULL; newSema[i].sem_perm.mode = 0; } @@ -367,38 +381,38 @@ } /* - * Expand the sem array to the given capacity. If the expansion fails + * Expand the sem_pool array to the given capacity. If the expansion fails * we return 0 (fail), otherwise we return 1 (success).
* * Assumes we already hold the subsystem lock. */ static int -grow_sem_array(newSize) - int newSize; +grow_sem_pool(int new_pool_size) { - register struct sem *newSem = NULL; - register int i; + struct sem *new_sem_pool = NULL; + struct sem *sem_free; + int i; - if (newSize < semtot) + if (new_pool_size < semtot) return 0; - if (newSize > limitseminfo.semmns) /* enforce hard limit */ - { + /* enforce hard limit */ + if (new_pool_size > limitseminfo.semmns) { #ifdef SEM_DEBUG printf("semaphore hard limit of %d reached, requested %d\n", - limitseminfo.semmns, newSize); + limitseminfo.semmns, new_pool_size); #endif return 0; } - newSize = (newSize/SEMMNS_INC + 1) * SEMMNS_INC; - newSize = newSize > limitseminfo.semmns ? limitseminfo.semmns : newSize; + + new_pool_size = (new_pool_size/SEMMNS_INC + 1) * SEMMNS_INC; + new_pool_size = new_pool_size > limitseminfo.semmns ? limitseminfo.semmns : new_pool_size; #ifdef SEM_DEBUG - printf("growing sem array from %d to %d\n", seminfo.semmns, newSize); + printf("growing sem_pool array from %d to %d\n", seminfo.semmns, new_pool_size); #endif - MALLOC(newSem, struct sem*, sizeof(struct sem)*newSize, + MALLOC(new_sem_pool, struct sem *, sizeof(struct sem) * new_pool_size, M_SYSVSEM, M_WAITOK); - if (NULL == newSem) - { + if (NULL == new_sem_pool) { #ifdef SEM_DEBUG printf("allocation failed. no changes made.\n"); #endif @@ -406,26 +420,24 @@ grow_sem_array(newSize) } /* We have our new memory, now copy the old contents over */ - if (sem) + if (sem_pool) for(i = 0; i < seminfo.semmns; i++) - newSem[i] = sem[i]; + new_sem_pool[i] = sem_pool[i]; /* Update our id structures to point to the new semaphores */ - for(i = 0; i < seminfo.semmni; i++) + for(i = 0; i < seminfo.semmni; i++) { if (sema[i].sem_perm.mode & SEM_ALLOC) /* ID in use */ - { - if (newSem > sem) - sema[i].sem_base += newSem - sem; - else - sema[i].sem_base -= sem - newSem; - } + sema[i].sem_base += (new_sem_pool - sem_pool); + } + + sem_free = sem_pool; + sem_pool = new_sem_pool; /* clean up the old array */ - if (sem) - FREE(sem, M_SYSVSEM); + if (sem_free != NULL) + FREE(sem_free, M_SYSVSEM); - sem = newSem; - seminfo.semmns = newSize; + seminfo.semmns = new_pool_size; #ifdef SEM_DEBUG printf("expansion complete\n"); #endif @@ -440,8 +452,7 @@ grow_sem_array(newSize) */ static struct sem_undo * -semu_alloc(p) - struct proc *p; +semu_alloc(struct proc *p) { register int i; register struct sem_undo *suptr; @@ -466,6 +477,7 @@ semu_alloc(p) suptr->un_next = semu_list; semu_list = suptr; suptr->un_cnt = 0; + suptr->un_ent = NULL; suptr->un_proc = p; return(suptr); } @@ -515,16 +527,12 @@ semu_alloc(p) * * Assumes we already hold the subsystem lock. */ - static int -semundo_adjust(p, supptr, semid, semnum, adjval) - register struct proc *p; - struct sem_undo **supptr; - int semid, semnum; - int adjval; +semundo_adjust(struct proc *p, struct sem_undo **supptr, int semid, + int semnum, int adjval) { register struct sem_undo *suptr; - register struct undo *sunptr; + register struct undo *sueptr, **suepptr, *new_sueptr; int i; /* Look for and remember the sem_undo if the caller doesn't provide @@ -553,31 +561,75 @@ semundo_adjust(p, supptr, semid, semnum, adjval) * Look for the requested entry and adjust it (delete if adjval becomes * 0). 
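+ * If no matching entry exists and adjval is non-zero, a new entry is + * allocated; since the allocation can block, the subsystem lock is dropped + * around it and the lookup below is retried in case another thread got + * there first.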
*/ - sunptr = &suptr->un_ent[0]; - for (i = 0; i < suptr->un_cnt; i++, sunptr++) { - if (sunptr->un_id != semid || sunptr->un_num != semnum) + new_sueptr = NULL; +lookup: + for (i = 0, suepptr = &suptr->un_ent, sueptr = suptr->un_ent; + i < suptr->un_cnt; + i++, suepptr = &sueptr->une_next, sueptr = sueptr->une_next) { + if (sueptr->une_id != semid || sueptr->une_num != semnum) continue; if (adjval == 0) - sunptr->un_adjval = 0; + sueptr->une_adjval = 0; else - sunptr->un_adjval += adjval; - if (sunptr->un_adjval == 0) { + sueptr->une_adjval += adjval; + if (sueptr->une_adjval == 0) { suptr->un_cnt--; - if (i < suptr->un_cnt) - suptr->un_ent[i] = - suptr->un_ent[suptr->un_cnt]; + *suepptr = sueptr->une_next; + FREE(sueptr, M_SYSVSEM); + sueptr = NULL; + } + if (new_sueptr != NULL) { + /* + * We lost the race: free the "undo" entry we allocated + * and use the one that won. + */ + FREE(new_sueptr, M_SYSVSEM); + new_sueptr = NULL; } return(0); } /* Didn't find the right entry - create it */ - if (adjval == 0) + if (adjval == 0) { + if (new_sueptr != NULL) { + FREE(new_sueptr, M_SYSVSEM); + new_sueptr = NULL; + } return(0); - if (suptr->un_cnt != limitseminfo.semume) { - sunptr = &suptr->un_ent[suptr->un_cnt]; + } + + if (new_sueptr != NULL) { + /* + * Use the new "undo" entry we allocated in the previous pass + */ + new_sueptr->une_next = suptr->un_ent; + suptr->un_ent = new_sueptr; suptr->un_cnt++; - sunptr->un_adjval = adjval; - sunptr->un_id = semid; sunptr->un_num = semnum; + new_sueptr->une_adjval = adjval; + new_sueptr->une_id = semid; + new_sueptr->une_num = semnum; + return 0; + } + + if (suptr->un_cnt != limitseminfo.semume) { + SYSV_SEM_SUBSYS_UNLOCK(); + /* + * Unlocking opens the door to race conditions. Someone else + * could be trying to allocate the same thing at this point, + * so we'll have to check if we lost the race. + */ + MALLOC(new_sueptr, struct undo *, sizeof (struct undo), + M_SYSVSEM, M_WAITOK); + SYSV_SEM_SUBSYS_LOCK(); + if (new_sueptr == NULL) { + return ENOMEM; + } + /* + * There might be other threads doing the same thing for this + * process, so check again if an "undo" entry exists for that + * semaphore. + */ + goto lookup; } else return(EINVAL); return(0); @@ -586,94 +638,96 @@ semundo_adjust(p, supptr, semid, semnum, adjval) /* Assumes we already hold the subsystem lock. */ static void -semundo_clear(semid, semnum) - int semid, semnum; +semundo_clear(int semid, int semnum) { - register struct sem_undo *suptr; + struct sem_undo *suptr; for (suptr = semu_list; suptr != NULL; suptr = suptr->un_next) { - register struct undo *sunptr = &suptr->un_ent[0]; - register int i = 0; + struct undo *sueptr; + struct undo **suepptr; + int i = 0; + sueptr = suptr->un_ent; + suepptr = &suptr->un_ent; while (i < suptr->un_cnt) { - if (sunptr->un_id == semid) { - if (semnum == -1 || sunptr->un_num == semnum) { + if (sueptr->une_id == semid) { + if (semnum == -1 || sueptr->une_num == semnum) { suptr->un_cnt--; - if (i < suptr->un_cnt) { - suptr->un_ent[i] = - suptr->un_ent[suptr->un_cnt]; - continue; - } + *suepptr = sueptr->une_next; + FREE(sueptr, M_SYSVSEM); + sueptr = *suepptr; + continue; } if (semnum != -1) break; } - i++, sunptr++; + i++; + suepptr = &sueptr->une_next; + sueptr = sueptr->une_next; } } } /* - * Note that the user-mode half of this passes a union, not a pointer + * Note that the user-mode half of this passes a union coerced to a + * user_addr_t. 
The union contains either an int or a pointer, and + * so we have to coerce it back, depending on whether the calling + * process is 64 bit or not. The coercion works for the 'val' element + * because the alignment is the same in user and kernel space. */ -#ifndef _SYS_SYSPROTO_H_ -struct semctl_args { - int semid; - int semnum; - int cmd; - union semun arg; -}; -#endif - int -semctl(p, uap, retval) - struct proc *p; - register struct semctl_args *uap; - register_t *retval; +semctl(struct proc *p, struct semctl_args *uap, register_t *retval) { int semid = uap->semid; int semnum = uap->semnum; int cmd = uap->cmd; - union semun arg = uap->arg; - union semun real_arg; - struct ucred *cred = p->p_ucred; + user_semun_t user_arg = (user_semun_t)uap->arg; + kauth_cred_t cred = kauth_cred_get(); int i, rval, eval; - struct semid_ds sbuf; - register struct semid_ds *semaptr; + struct user_semid_ds sbuf; + struct user_semid_ds *semaptr; + struct user_semid_ds uds; + AUDIT_ARG(svipc_cmd, cmd); AUDIT_ARG(svipc_id, semid); - SUBSYSTEM_LOCK_AQUIRE(p); + + SYSV_SEM_SUBSYS_LOCK(); + #ifdef SEM_DEBUG - printf("call to semctl(%d, %d, %d, 0x%x)\n", semid, semnum, cmd, arg); + printf("call to semctl(%d, %d, %d, 0x%qx)\n", semid, semnum, cmd, user_arg); #endif semid = IPCID_TO_IX(semid); - if (semid < 0 || semid >= seminfo.semmni) -{ + + if (semid < 0 || semid >= seminfo.semmni) { #ifdef SEM_DEBUG printf("Invalid semid\n"); #endif - UNLOCK_AND_RETURN(EINVAL); -} + eval = EINVAL; + goto semctlout; + } semaptr = &sema[semid]; if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 || - semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) - UNLOCK_AND_RETURN(EINVAL); + semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) { + eval = EINVAL; + goto semctlout; + } eval = 0; rval = 0; switch (cmd) { case IPC_RMID: - if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_M))) - UNLOCK_AND_RETURN(eval); - semaptr->sem_perm.cuid = cred->cr_uid; - semaptr->sem_perm.uid = cred->cr_uid; + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_M))) + goto semctlout; + + semaptr->sem_perm.cuid = kauth_cred_getuid(cred); + semaptr->sem_perm.uid = kauth_cred_getuid(cred); semtot -= semaptr->sem_nsems; - for (i = semaptr->sem_base - sem; i < semtot; i++) - sem[i] = sem[i + semaptr->sem_nsems]; + for (i = semaptr->sem_base - sem_pool; i < semtot; i++) + sem_pool[i] = sem_pool[i + semaptr->sem_nsems]; for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].sem_perm.mode & SEM_ALLOC) && sema[i].sem_base > semaptr->sem_base) @@ -686,60 +740,84 @@ semctl(p, uap, retval) case IPC_SET: if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_M))) - UNLOCK_AND_RETURN(eval); - /*if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) - UNLOCK_AND_RETURN(eval);*/ - if ((eval = copyin(arg.buf, (caddr_t)&sbuf, - sizeof(sbuf))) != 0) - UNLOCK_AND_RETURN(eval); + goto semctlout; + + SYSV_SEM_SUBSYS_UNLOCK(); + + if (IS_64BIT_PROCESS(p)) { + eval = copyin(user_arg.buf, &sbuf, sizeof(struct user_semid_ds)); + } else { + eval = copyin(user_arg.buf, &sbuf, sizeof(struct semid_ds)); + /* convert in place; ugly, but safe */ + semid_ds_32to64((struct semid_ds *)&sbuf, &sbuf); + } + + if (eval != 0) + return(eval); + + SYSV_SEM_SUBSYS_LOCK(); + semaptr->sem_perm.uid = sbuf.sem_perm.uid; semaptr->sem_perm.gid = sbuf.sem_perm.gid; semaptr->sem_perm.mode = (semaptr->sem_perm.mode & ~0777) | (sbuf.sem_perm.mode & 0777); - semaptr->sem_ctime = time_second; + semaptr->sem_ctime = sysv_semtime(); break; case IPC_STAT: if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) -
UNLOCK_AND_RETURN(eval); - /*if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) - UNLOCK_AND_RETURN(eval);*/ - eval = copyout((caddr_t)semaptr, arg.buf, - sizeof(struct semid_ds)); + goto semctlout; + bcopy(semaptr, &uds, sizeof(struct user_semid_ds)); + SYSV_SEM_SUBSYS_UNLOCK(); + if (IS_64BIT_PROCESS(p)) { + eval = copyout(&uds, user_arg.buf, sizeof(struct user_semid_ds)); + } else { + struct semid_ds semid_ds32; + semid_ds_64to32(&uds, &semid_ds32); + eval = copyout(&semid_ds32, user_arg.buf, sizeof(struct semid_ds)); + } + SYSV_SEM_SUBSYS_LOCK(); break; case GETNCNT: if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) - UNLOCK_AND_RETURN(eval); - if (semnum < 0 || semnum >= semaptr->sem_nsems) - UNLOCK_AND_RETURN(EINVAL); + goto semctlout; + if (semnum < 0 || semnum >= semaptr->sem_nsems) { + eval = EINVAL; + goto semctlout; + } rval = semaptr->sem_base[semnum].semncnt; break; case GETPID: if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) - UNLOCK_AND_RETURN(eval); - if (semnum < 0 || semnum >= semaptr->sem_nsems) - UNLOCK_AND_RETURN(EINVAL); + goto semctlout; + if (semnum < 0 || semnum >= semaptr->sem_nsems) { + eval = EINVAL; + goto semctlout; + } rval = semaptr->sem_base[semnum].sempid; break; case GETVAL: if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) - UNLOCK_AND_RETURN(eval); - if (semnum < 0 || semnum >= semaptr->sem_nsems) - UNLOCK_AND_RETURN(EINVAL); + goto semctlout; + if (semnum < 0 || semnum >= semaptr->sem_nsems) { + eval = EINVAL; + goto semctlout; + } rval = semaptr->sem_base[semnum].semval; break; case GETALL: if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) - UNLOCK_AND_RETURN(eval); - /*if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) - UNLOCK_AND_RETURN(eval);*/ + goto semctlout; +/* XXXXXXXXXXXXXXXX TBD XXXXXXXXXXXXXXXX */ for (i = 0; i < semaptr->sem_nsems; i++) { + /* XXX could be done in one go... */ eval = copyout((caddr_t)&semaptr->sem_base[i].semval, - &arg.array[i], sizeof(arg.array[0])); + user_arg.array + (i * sizeof(unsigned short)), + sizeof(unsigned short)); if (eval != 0) break; } @@ -747,9 +825,11 @@ semctl(p, uap, retval) case GETZCNT: if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) - UNLOCK_AND_RETURN(eval); - if (semnum < 0 || semnum >= semaptr->sem_nsems) - UNLOCK_AND_RETURN(EINVAL); + goto semctlout; + if (semnum < 0 || semnum >= semaptr->sem_nsems) { + eval = EINVAL; + goto semctlout; + } rval = semaptr->sem_base[semnum].semzcnt; break; @@ -759,36 +839,35 @@ semctl(p, uap, retval) #ifdef SEM_DEBUG printf("Invalid credentials for write\n"); #endif - UNLOCK_AND_RETURN(eval); + goto semctlout; } if (semnum < 0 || semnum >= semaptr->sem_nsems) { #ifdef SEM_DEBUG printf("Invalid number out of range for set\n"); #endif - UNLOCK_AND_RETURN(EINVAL); + eval = EINVAL; + goto semctlout; } - /*if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) - { -#ifdef SEM_DEBUG - printf("Error during value copyin\n"); -#endif - UNLOCK_AND_RETURN(eval); - }*/ - semaptr->sem_base[semnum].semval = arg.val; + /* + * Cast down a pointer instead of using 'val' member directly + * to avoid introducing endianness and a pad field into the + * header file. Ugly, but it works.
+ */ + semaptr->sem_base[semnum].semval = CAST_DOWN(int,user_arg.buf); semundo_clear(semid, semnum); wakeup((caddr_t)semaptr); break; case SETALL: if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W))) - UNLOCK_AND_RETURN(eval); - /*if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) - UNLOCK_AND_RETURN(eval);*/ + goto semctlout; +/*** XXXXXXXXXXXX TBD ********/ for (i = 0; i < semaptr->sem_nsems; i++) { - eval = copyin(&arg.array[i], + /* XXX could be done in one go... */ + eval = copyin(user_arg.array + (i * sizeof(unsigned short)), (caddr_t)&semaptr->sem_base[i].semval, - sizeof(arg.array[0])); + sizeof(unsigned short)); if (eval != 0) break; } @@ -797,41 +876,36 @@ semctl(p, uap, retval) break; default: - UNLOCK_AND_RETURN(EINVAL); + eval = EINVAL; + goto semctlout; } if (eval == 0) *retval = rval; - UNLOCK_AND_RETURN(eval); +semctlout: + SYSV_SEM_SUBSYS_UNLOCK(); + return(eval); } -#ifndef _SYS_SYSPROTO_H_ -struct semget_args { - key_t key; - int nsems; - int semflg; -}; -#endif - int -semget(p, uap, retval) - struct proc *p; - register struct semget_args *uap; - register_t *retval; +semget(__unused struct proc *p, struct semget_args *uap, register_t *retval) { int semid, eval; int key = uap->key; int nsems = uap->nsems; int semflg = uap->semflg; - struct ucred *cred = p->p_ucred; + kauth_cred_t cred = kauth_cred_get(); - SUBSYSTEM_LOCK_AQUIRE(p); #ifdef SEM_DEBUG if (key != IPC_PRIVATE) printf("semget(0x%x, %d, 0%o)\n", key, nsems, semflg); else printf("semget(IPC_PRIVATE, %d, 0%o)\n", nsems, semflg); #endif + + + SYSV_SEM_SUBSYS_LOCK(); + if (key != IPC_PRIVATE) { for (semid = 0; semid < seminfo.semmni; semid++) { @@ -845,18 +919,20 @@ semget(p, uap, retval) #endif if ((eval = ipcperm(cred, &sema[semid].sem_perm, semflg & 0700))) - UNLOCK_AND_RETURN(eval); - if (nsems > 0 && sema[semid].sem_nsems < nsems) { + goto semgetout; + if (nsems < 0 || sema[semid].sem_nsems < nsems) { #ifdef SEM_DEBUG printf("too small\n"); #endif - UNLOCK_AND_RETURN(EINVAL); + eval = EINVAL; + goto semgetout; } if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) { #ifdef SEM_DEBUG printf("not exclusive\n"); #endif - UNLOCK_AND_RETURN(EEXIST); + eval = EEXIST; + goto semgetout; } goto found; } @@ -871,19 +947,20 @@ semget(p, uap, retval) printf("nsems out of range (0<%d<=%d)\n", nsems, seminfo.semmsl); #endif - UNLOCK_AND_RETURN(EINVAL); + eval = EINVAL; + goto semgetout; } if (nsems > seminfo.semmns - semtot) { #ifdef SEM_DEBUG printf("not enough semaphores left (need %d, got %d)\n", nsems, seminfo.semmns - semtot); #endif - if (!grow_sem_array(semtot + nsems)) - { + if (!grow_sem_pool(semtot + nsems)) { #ifdef SEM_DEBUG printf("failed to grow the sem array\n"); #endif - UNLOCK_AND_RETURN(ENOSPC); + eval = ENOSPC; + goto semgetout; } } for (semid = 0; semid < seminfo.semmni; semid++) { @@ -899,15 +976,16 @@ semget(p, uap, retval) #ifdef SEM_DEBUG printf("failed to grow sema array\n"); #endif - UNLOCK_AND_RETURN(ENOSPC); + eval = ENOSPC; + goto semgetout; } } #ifdef SEM_DEBUG printf("semid %d is available\n", semid); #endif sema[semid].sem_perm.key = key; - sema[semid].sem_perm.cuid = cred->cr_uid; - sema[semid].sem_perm.uid = cred->cr_uid; + sema[semid].sem_perm.cuid = kauth_cred_getuid(cred); + sema[semid].sem_perm.uid = kauth_cred_getuid(cred); sema[semid].sem_perm.cgid = cred->cr_gid; sema[semid].sem_perm.gid = cred->cr_gid; sema[semid].sem_perm.mode = (semflg & 0777) | SEM_ALLOC; @@ -915,20 +993,21 @@ semget(p, uap, retval) (sema[semid].sem_perm.seq + 1) & 0x7fff; sema[semid].sem_nsems = nsems; 
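+ /* + * Finish initializing the new set: stamp its times, carve its + * semaphores out of the global sem_pool, and zero them. + */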
sema[semid].sem_otime = 0; - sema[semid].sem_ctime = time_second; - sema[semid].sem_base = &sem[semtot]; + sema[semid].sem_ctime = sysv_semtime(); + sema[semid].sem_base = &sem_pool[semtot]; semtot += nsems; bzero(sema[semid].sem_base, sizeof(sema[semid].sem_base[0])*nsems); #ifdef SEM_DEBUG printf("sembase = 0x%x, next = 0x%x\n", sema[semid].sem_base, - &sem[semtot]); + &sem_pool[semtot]); #endif } else { #ifdef SEM_DEBUG printf("didn't find it and wasn't asked to create it\n"); #endif - UNLOCK_AND_RETURN(ENOENT); + eval = ENOENT; + goto semgetout; } found: @@ -937,72 +1016,73 @@ found: #ifdef SEM_DEBUG printf("semget is done, returning %d\n", *retval); #endif - SUBSYSTEM_LOCK_RELEASE; - return(0); -} + eval = 0; -#ifndef _SYS_SYSPROTO_H_ -struct semop_args { - int semid; - struct sembuf *sops; - int nsops; -}; -#endif +semgetout: + SYSV_SEM_SUBSYS_UNLOCK(); + return(eval); +} int -semop(p, uap, retval) - struct proc *p; - register struct semop_args *uap; - register_t *retval; +semop(struct proc *p, struct semop_args *uap, register_t *retval) { int semid = uap->semid; int nsops = uap->nsops; struct sembuf sops[MAX_SOPS]; - register struct semid_ds *semaptr; - register struct sembuf *sopptr; - register struct sem *semptr; + register struct user_semid_ds *semaptr; + register struct sembuf *sopptr = NULL; /* protected by 'semptr' */ + register struct sem *semptr = NULL; /* protected by 'if' */ struct sem_undo *suptr = NULL; - struct ucred *cred = p->p_ucred; int i, j, eval; int do_wakeup, do_undos; AUDIT_ARG(svipc_id, uap->semid); - SUBSYSTEM_LOCK_AQUIRE(p); + + SYSV_SEM_SUBSYS_LOCK(); + #ifdef SEM_DEBUG printf("call to semop(%d, 0x%x, %d)\n", semid, sops, nsops); #endif semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ - if (semid < 0 || semid >= seminfo.semmni) - UNLOCK_AND_RETURN(EINVAL); + if (semid < 0 || semid >= seminfo.semmni) { + eval = EINVAL; + goto semopout; + } semaptr = &sema[semid]; - if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0) - UNLOCK_AND_RETURN(EINVAL); - if (semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) - UNLOCK_AND_RETURN(EINVAL); + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0) { + eval = EINVAL; + goto semopout; + } + if (semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) { + eval = EINVAL; + goto semopout; + } - if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W))) { + if ((eval = ipcperm(kauth_cred_get(), &semaptr->sem_perm, IPC_W))) { #ifdef SEM_DEBUG printf("eval = %d from ipaccess\n", eval); #endif - UNLOCK_AND_RETURN(eval); + goto semopout; } if (nsops < 0 || nsops > MAX_SOPS) { #ifdef SEM_DEBUG printf("too many sops (max=%d, nsops=%d)\n", MAX_SOPS, nsops); #endif - UNLOCK_AND_RETURN(E2BIG); + eval = E2BIG; + goto semopout; } - if ((eval = copyin(uap->sops, &sops, nsops * sizeof(sops[0]))) != 0) { + /* OK for LP64, since sizeof(struct sembuf) is currently invariant */ + if ((eval = copyin(uap->sops, &sops, nsops * sizeof(struct sembuf))) != 0) { #ifdef SEM_DEBUG printf("eval = %d from copyin(%08x, %08x, %ld)\n", eval, - uap->sops, &sops, nsops * sizeof(sops[0])); + uap->sops, &sops, nsops * sizeof(struct sembuf)); #endif - UNLOCK_AND_RETURN(eval); + goto semopout; } /* @@ -1022,8 +1102,10 @@ semop(p, uap, retval) for (i = 0; i < nsops; i++) { sopptr = &sops[i]; - if (sopptr->sem_num >= semaptr->sem_nsems) - UNLOCK_AND_RETURN(EFBIG); + if (sopptr->sem_num >= semaptr->sem_nsems) { + eval = EFBIG; + goto semopout; + } semptr = &semaptr->sem_base[sopptr->sem_num]; @@ -1084,8 +1166,10 @@ semop(p, uap, retval) * If the request that we couldn't 
satisfy has the * NOWAIT flag set then return with EAGAIN. */ - if (sopptr->sem_flg & IPC_NOWAIT) - UNLOCK_AND_RETURN(EAGAIN); + if (sopptr->sem_flg & IPC_NOWAIT) { + eval = EAGAIN; + goto semopout; + } if (sopptr->sem_op == 0) semptr->semzcnt++; @@ -1100,23 +1184,23 @@ semop(p, uap, retval) * waiting for. We will get the lock back after we * wake up. */ - SUBSYSTEM_LOCK_RELEASE; - sysv_sem_sleeping_threads++; - eval = tsleep((caddr_t)semaptr, (PZERO - 4) | PCATCH, + eval = msleep((caddr_t)semaptr, &sysv_sem_subsys_mutex , (PZERO - 4) | PCATCH, "semwait", 0); - sysv_sem_sleeping_threads--; #ifdef SEM_DEBUG printf("semop: good morning (eval=%d)!\n", eval); #endif - /* There is no need to get the lock if we are just - * going to return without performing more semaphore - * operations. - */ - if (eval != 0) - return(EINTR); + /* we need the lock here due to mods on semptr */ + if (eval != 0) { + if (sopptr->sem_op == 0) + semptr->semzcnt--; + else + semptr->semncnt--; + + eval = EINTR; + goto semopout; + } - SUBSYSTEM_LOCK_AQUIRE(p); /* Get it back */ suptr = NULL; /* sem_undo may have been reallocated */ semaptr = &sema[semid]; /* sema may have been reallocated */ @@ -1132,11 +1216,16 @@ semop(p, uap, retval) semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) { /* The man page says to return EIDRM. */ /* Unfortunately, BSD doesn't define that code! */ + if (sopptr->sem_op == 0) + semptr->semzcnt--; + else + semptr->semncnt--; #ifdef EIDRM - UNLOCK_AND_RETURN(EIDRM); + eval = EIDRM; #else - UNLOCK_AND_RETURN(EINVAL); + eval = EINVAL; #endif + goto semopout; } /* @@ -1201,7 +1290,7 @@ done: #ifdef SEM_DEBUG printf("eval = %d from semundo_adjust\n", eval); #endif - UNLOCK_AND_RETURN(eval); + goto semopout; } /* loop through the sops */ } /* if (do_undos) */ @@ -1212,16 +1301,6 @@ done: semptr->sempid = p->p_pid; } - /* Do a wakeup if any semaphore was up'd. - * we will release our lock on the semaphore subsystem before - * we wakeup other processes to prevent a little thrashing. - * Note that this is fine because we are done using the - * semaphore structures at this point in time. We only use - * a local variable pointer value, and the retval - * parameter. - * Note 2: Future use of sem_wakeup may reqiure the lock. - */ - SUBSYSTEM_LOCK_RELEASE; if (do_wakeup) { #ifdef SEM_DEBUG printf("semop: doing wakeup\n"); @@ -1239,7 +1318,10 @@ done: printf("semop: done\n"); #endif *retval = 0; - return(0); + eval = 0; +semopout: + SYSV_SEM_SUBSYS_UNLOCK(); + return(eval); } /* @@ -1247,8 +1329,7 @@ done: * semaphores. */ void -semexit(p) - struct proc *p; +semexit(struct proc *p) { register struct sem_undo *suptr; register struct sem_undo **supptr; @@ -1258,10 +1339,11 @@ semexit(p) * anything to undo, but we need the lock to prevent * dynamic memory race conditions. */ - SUBSYSTEM_LOCK_AQUIRE(p); - if (!sem) + SYSV_SEM_SUBSYS_LOCK(); + + if (!sem_pool) { - SUBSYSTEM_LOCK_RELEASE; + SYSV_SEM_SUBSYS_UNLOCK(); return; } did_something = 0; @@ -1289,13 +1371,17 @@ semexit(p) * If there are any active undo elements then process them. 
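+ * Entries are consumed from the head of the list: each adjustment is + * applied to its semaphore and the entry is freed before moving on.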
*/ if (suptr->un_cnt > 0) { - int ix; + while (suptr->un_ent != NULL) { + struct undo *sueptr; + int semid; + int semnum; + int adjval; + struct user_semid_ds *semaptr; - for (ix = 0; ix < suptr->un_cnt; ix++) { - int semid = suptr->un_ent[ix].un_id; - int semnum = suptr->un_ent[ix].un_num; - int adjval = suptr->un_ent[ix].un_adjval; - struct semid_ds *semaptr; + sueptr = suptr->un_ent; + semid = sueptr->une_id; + semnum = sueptr->une_num; + adjval = sueptr->une_adjval; semaptr = &sema[semid]; if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0) @@ -1305,10 +1391,11 @@ semexit(p) #ifdef SEM_DEBUG printf("semexit: %08x id=%d num=%d(adj=%d) ; sem=%d\n", - suptr->un_proc, suptr->un_ent[ix].un_id, - suptr->un_ent[ix].un_num, - suptr->un_ent[ix].un_adjval, - semaptr->sem_base[semnum].semval); + suptr->un_proc, + semid, + semnum, + adjval, + semaptr->sem_base[semnum].semval); #endif if (adjval < 0) { @@ -1336,6 +1423,10 @@ semexit(p) #ifdef SEM_DEBUG printf("semexit: back from wakeup\n"); #endif + suptr->un_cnt--; + suptr->un_ent = sueptr->une_next; + FREE(sueptr, M_SYSVSEM); + sueptr = NULL; } } @@ -1368,32 +1459,36 @@ unlock: * same leaky semaphore problem. */ - SUBSYSTEM_LOCK_RELEASE; + SYSV_SEM_SUBSYS_UNLOCK(); } + + /* (struct sysctl_oid *oidp, void *arg1, int arg2, \ struct sysctl_req *req) */ static int -sysctl_seminfo SYSCTL_HANDLER_ARGS +sysctl_seminfo(__unused struct sysctl_oid *oidp, void *arg1, + __unused int arg2, struct sysctl_req *req) { int error = 0; error = SYSCTL_OUT(req, arg1, sizeof(int)); - if (error || !req->newptr) + if (error || req->newptr == USER_ADDR_NULL) return(error); - SUBSYSTEM_LOCK_AQUIRE(current_proc()); + SYSV_SEM_SUBSYS_LOCK(); + /* Set the values only if shared memory is not initialised */ - if ((sem == (struct sem *) 0) && - (sema == (struct semid_ds *) 0) && - (semu == (struct semid_ds *) 0) && - (semu_list == (struct sem_undo *) 0)) { - if (error = SYSCTL_IN(req, arg1, sizeof(int))) { + if ((sem_pool == NULL) && + (sema == NULL) && + (semu == NULL) && + (semu_list == NULL)) { + if ((error = SYSCTL_IN(req, arg1, sizeof(int)))) { goto out; } } else error = EINVAL; out: - SUBSYSTEM_LOCK_RELEASE; + SYSV_SEM_SUBSYS_UNLOCK(); return(error); } @@ -1416,3 +1511,103 @@ SYSCTL_PROC(_kern_sysv, KSYSV_SEMUNE, semume, CTLTYPE_INT | CTLFLAG_RW, &limitseminfo.semume, 0, &sysctl_seminfo ,"I","semume"); +static int +IPCS_sem_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, + __unused int arg2, struct sysctl_req *req) +{ + int error; + int cursor; + union { + struct IPCS_command u32; + struct user_IPCS_command u64; + } ipcs; + struct semid_ds semid_ds32; /* post conversion, 32 bit version */ + void *semid_dsp; + size_t ipcs_sz = sizeof(struct user_IPCS_command); + size_t semid_ds_sz = sizeof(struct user_semid_ds); + struct proc *p = current_proc(); + + /* Copy in the command structure */ + if ((error = SYSCTL_IN(req, &ipcs, ipcs_sz)) != 0) { + return(error); + } + + if (!IS_64BIT_PROCESS(p)) { + ipcs_sz = sizeof(struct IPCS_command); + semid_ds_sz = sizeof(struct semid_ds); + } + + /* Let us version this interface... */ + if (ipcs.u64.ipcs_magic != IPCS_MAGIC) { + return(EINVAL); + } + + SYSV_SEM_SUBSYS_LOCK(); + switch(ipcs.u64.ipcs_op) { + case IPCS_SEM_CONF: /* Obtain global configuration data */ + if (ipcs.u64.ipcs_datalen != sizeof(struct seminfo)) { + error = ERANGE; + break; + } + if (ipcs.u64.ipcs_cursor != 0) { /* fwd. compat. 
*/ + error = EINVAL; + break; + } + SYSV_SEM_SUBSYS_UNLOCK(); + error = copyout(&seminfo, ipcs.u64.ipcs_data, ipcs.u64.ipcs_datalen); + SYSV_SEM_SUBSYS_LOCK(); + break; + + case IPCS_SEM_ITER: /* Iterate over existing semaphore sets */ + cursor = ipcs.u64.ipcs_cursor; + if (cursor < 0 || cursor >= seminfo.semmni) { + error = ERANGE; + break; + } + if (ipcs.u64.ipcs_datalen != (int)semid_ds_sz ) { + error = EINVAL; + break; + } + for( ; cursor < seminfo.semmni; cursor++) { + if (sema[cursor].sem_perm.mode & SEM_ALLOC) + break; + continue; + } + if (cursor == seminfo.semmni) { + error = ENOENT; + break; + } + + semid_dsp = &sema[cursor]; /* default: 64 bit */ + + /* + * If necessary, convert the 64 bit kernel semaphore + * set descriptor to a 32 bit user one. + */ + if (!IS_64BIT_PROCESS(p)) { + semid_ds_64to32(semid_dsp, &semid_ds32); + semid_dsp = &semid_ds32; + } + SYSV_SEM_SUBSYS_UNLOCK(); + error = copyout(semid_dsp, ipcs.u64.ipcs_data, ipcs.u64.ipcs_datalen); + if (!error) { + /* update cursor */ + ipcs.u64.ipcs_cursor = cursor + 1; + error = SYSCTL_OUT(req, &ipcs, ipcs_sz); + } + SYSV_SEM_SUBSYS_LOCK(); + break; + + default: + error = EINVAL; + break; + } + SYSV_SEM_SUBSYS_UNLOCK(); + return(error); +} + +SYSCTL_DECL(_kern_sysv_ipcs); +SYSCTL_PROC(_kern_sysv_ipcs, OID_AUTO, sem, CTLFLAG_RW|CTLFLAG_ANYBODY, + 0, 0, IPCS_sem_sysctl, + "S,IPCS_sem_command", + "ipcs sem command interface"); diff --git a/bsd/kern/sysv_shm.c b/bsd/kern/sysv_shm.c index 7b2eff349..c626909e0 100644 --- a/bsd/kern/sysv_shm.c +++ b/bsd/kern/sysv_shm.c @@ -56,39 +56,49 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> -#include <sys/shm.h> -#include <sys/proc.h> +#include <sys/shm_internal.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/malloc.h> #include <sys/mman.h> #include <sys/stat.h> #include <sys/sysctl.h> +#include <sys/ipcs.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> #include <bsm/audit_kernel.h> #include <mach/mach_types.h> #include <mach/vm_inherit.h> +#include <mach/vm_map.h> + +#include <mach/mach_vm.h> + #include <vm/vm_map.h> +#include <vm/vm_shared_memory_server.h> +#include <vm/vm_protos.h> -struct shmat_args; -extern int shmat __P((struct proc *p, struct shmat_args *uap, int *retval)); -struct shmctl_args; -extern int shmctl __P((struct proc *p, struct shmctl_args *uap, int *retval)); -struct shmdt_args; -extern int shmdt __P((struct proc *p, struct shmdt_args *uap, int *retval)); -struct shmget_args; -extern int shmget __P((struct proc *p, struct shmget_args *uap, int *retval)); +#include <kern/locks.h> +static void shminit(void *); #if 0 -static void shminit __P((void *)); SYSINIT(sysv_shm, SI_SUB_SYSV_SHM, SI_ORDER_FIRST, shminit, NULL) #endif 0 -struct oshmctl_args; -static int oshmctl __P((struct proc *p, struct oshmctl_args *uap, int * retval)); -static int shmget_allocate_segment __P((struct proc *p, struct shmget_args *uap, int mode, int * retval)); -static int shmget_existing __P((struct proc *p, struct shmget_args *uap, int mode, int segnum, int * retval)); +static lck_grp_t *sysv_shm_subsys_lck_grp; +static lck_grp_attr_t *sysv_shm_subsys_lck_grp_attr; +static lck_attr_t *sysv_shm_subsys_lck_attr; +static lck_mtx_t sysv_shm_subsys_mutex; -typedef int sy_call_t __P((struct proc *, void *, int *)); +#define SYSV_SHM_SUBSYS_LOCK() lck_mtx_lock(&sysv_shm_subsys_mutex) +#define SYSV_SHM_SUBSYS_UNLOCK() lck_mtx_unlock(&sysv_shm_subsys_mutex) + +static int oshmctl(void *p, void *uap, void *retval); +static int shmget_allocate_segment(struct
proc *p, struct shmget_args *uap, int mode, int * retval); +static int shmget_existing(struct shmget_args *uap, int mode, int segnum, int * retval); +static void shmid_ds_64to32(struct user_shmid_ds *in, struct shmid_ds *out); +static void shmid_ds_32to64(struct shmid_ds *in, struct user_shmid_ds *out); /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *shmcalls[] = { @@ -103,23 +113,22 @@ static sy_call_t *shmcalls[] = { #define SHMSEG_WANTED 0x1000 static int shm_last_free, shm_nused, shm_committed; -struct shmid_ds *shmsegs; +struct user_shmid_ds *shmsegs; /* 64 bit version */ static int shm_inited = 0; struct shm_handle { - /* vm_offset_t kva; */ - void * shm_object; + void * shm_object; /* vm_offset_t kva; */ }; struct shmmap_state { - vm_offset_t va; - int shmid; + mach_vm_address_t va; /* user address */ + int shmid; /* segment id */ }; -static void shm_deallocate_segment __P((struct shmid_ds *)); -static int shm_find_segment_by_key __P((key_t)); -static struct shmid_ds *shm_find_segment_by_shmid __P((int)); -static int shm_delete_mapping __P((struct proc *, struct shmmap_state *, int)); +static void shm_deallocate_segment(struct user_shmid_ds *); +static int shm_find_segment_by_key(key_t); +static struct user_shmid_ds *shm_find_segment_by_shmid(int); +static int shm_delete_mapping(struct proc *, struct shmmap_state *, int); #ifdef __APPLE_API_PRIVATE struct shminfo shminfo = { @@ -131,9 +140,58 @@ struct shminfo shminfo = { }; #endif /* __APPLE_API_PRIVATE */ +void sysv_shm_lock_init(void); + +static __inline__ time_t +sysv_shmtime(void) +{ + struct timeval tv; + microtime(&tv); + return (tv.tv_sec); +} + +/* + * This conversion is safe, since if we are converting for a 32 bit process, + * then the value of (struct shmid_ds)->shm_segsz will never exceed 4G. + * + * NOTE: Source and target may *NOT* overlap! (target is smaller) + */ +static void +shmid_ds_64to32(struct user_shmid_ds *in, struct shmid_ds *out) +{ + out->shm_perm = in->shm_perm; + out->shm_segsz = (size_t)in->shm_segsz; + out->shm_lpid = in->shm_lpid; + out->shm_cpid = in->shm_cpid; + out->shm_nattch = in->shm_nattch; + out->shm_atime = in->shm_atime; + out->shm_dtime = in->shm_dtime; + out->shm_ctime = in->shm_ctime; + out->shm_internal = CAST_DOWN(void *,in->shm_internal); +} + +/* + * NOTE: Source and target are permitted to overlap! (source is smaller); + * this works because we copy fields in order from the end of the struct to + * the beginning.
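+ * (Each widened field lands at an offset no lower than the field it is + * read from, so the back-to-front copy never overwrites input that has + * not yet been read. shmctl()'s IPC_SET path relies on this to widen a + * 32 bit caller's buffer in place after copyin().)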
+ */ +static void +shmid_ds_32to64(struct shmid_ds *in, struct user_shmid_ds *out) +{ + out->shm_internal = CAST_USER_ADDR_T(in->shm_internal); + out->shm_ctime = in->shm_ctime; + out->shm_dtime = in->shm_dtime; + out->shm_atime = in->shm_atime; + out->shm_nattch = in->shm_nattch; + out->shm_cpid = in->shm_cpid; + out->shm_lpid = in->shm_lpid; + out->shm_segsz = (user_size_t)in->shm_segsz; + out->shm_perm = in->shm_perm; +} + + static int -shm_find_segment_by_key(key) - key_t key; +shm_find_segment_by_key(key_t key) { int i; @@ -144,12 +202,11 @@ shm_find_segment_by_key(key) return -1; } -static struct shmid_ds * -shm_find_segment_by_shmid(shmid) - int shmid; +static struct user_shmid_ds * +shm_find_segment_by_shmid(int shmid) { int segnum; - struct shmid_ds *shmseg; + struct user_shmid_ds *shmseg; segnum = IPCID_TO_IX(shmid); if (segnum < 0 || segnum >= shminfo.shmmni) @@ -163,44 +220,40 @@ shm_find_segment_by_shmid(shmid) } static void -shm_deallocate_segment(shmseg) - struct shmid_ds *shmseg; +shm_deallocate_segment(struct user_shmid_ds *shmseg) { struct shm_handle *shm_handle; - struct shmmap_state *shmmap_s=NULL; - size_t size; - char * ptr; + mach_vm_size_t size; - shm_handle = shmseg->shm_internal; - size = round_page_32(shmseg->shm_segsz); - mach_destroy_memory_entry(shm_handle->shm_object); + shm_handle = CAST_DOWN(void *,shmseg->shm_internal); /* tunnel */ + size = mach_vm_round_page(shmseg->shm_segsz); + mach_memory_entry_port_release(shm_handle->shm_object); + shm_handle->shm_object = NULL; FREE((caddr_t)shm_handle, M_SHM); - shmseg->shm_internal = NULL; + shmseg->shm_internal = USER_ADDR_NULL; /* tunnel */ shm_committed -= btoc(size); shm_nused--; shmseg->shm_perm.mode = SHMSEG_FREE; } static int -shm_delete_mapping(p, shmmap_s, deallocate) - struct proc *p; - struct shmmap_state *shmmap_s; - int deallocate; +shm_delete_mapping(__unused struct proc *p, struct shmmap_state *shmmap_s, + int deallocate) { - struct shmid_ds *shmseg; + struct user_shmid_ds *shmseg; int segnum, result; - size_t size; + mach_vm_size_t size; segnum = IPCID_TO_IX(shmmap_s->shmid); shmseg = &shmsegs[segnum]; - size = round_page_32(shmseg->shm_segsz); + size = mach_vm_round_page(shmseg->shm_segsz); /* XXX done for us? 
*/ if (deallocate) { - result = vm_deallocate(current_map(), shmmap_s->va, size); + result = mach_vm_deallocate(current_map(), shmmap_s->va, size); if (result != KERN_SUCCESS) return EINVAL; } shmmap_s->shmid = -1; - shmseg->shm_dtime = time_second; + shmseg->shm_dtime = sysv_shmtime(); if ((--shmseg->shm_nattch <= 0) && (shmseg->shm_perm.mode & SHMSEG_REMOVED)) { shm_deallocate_segment(shmseg); @@ -209,224 +262,205 @@ shm_delete_mapping(p, shmmap_s, deallocate) return 0; } -struct shmdt_args { - void *shmaddr; -}; - int -shmdt(p, uap, retval) - struct proc *p; - struct shmdt_args *uap; - register_t *retval; +shmdt(struct proc *p, struct shmdt_args *uap, register_t *retval) { struct shmmap_state *shmmap_s; int i; + int shmdtret = 0; + + // LP64todo - fix this + AUDIT_ARG(svipc_addr, CAST_DOWN(void *,uap->shmaddr)); + + SYSV_SHM_SUBSYS_LOCK(); - AUDIT_ARG(svipc_addr, uap->shmaddr); - if (!shm_inited) - return(EINVAL); + if (!shm_inited) { + shmdtret = EINVAL; + goto shmdt_out; + } shmmap_s = (struct shmmap_state *)p->vm_shm; - if (shmmap_s == NULL) - return EINVAL; + if (shmmap_s == NULL) { + shmdtret = EINVAL; + goto shmdt_out; + } + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) if (shmmap_s->shmid != -1 && - shmmap_s->va == (vm_offset_t)uap->shmaddr) + shmmap_s->va == (mach_vm_offset_t)uap->shmaddr) break; - if (i == shminfo.shmseg) - return EINVAL; - return shm_delete_mapping(p, shmmap_s, 1); + if (i == shminfo.shmseg) { + shmdtret = EINVAL; + goto shmdt_out; + } + i = shm_delete_mapping(p, shmmap_s, 1); + + if (i == 0) + *retval = 0; + shmdtret = i; +shmdt_out: + SYSV_SHM_SUBSYS_UNLOCK(); + return shmdtret; } -#ifndef _SYS_SYSPROTO_H_ -struct shmat_args { - int shmid; - void *shmaddr; - int shmflg; -}; -#endif - int -shmat(p, uap, retval) - struct proc *p; - struct shmat_args *uap; - register_t *retval; +shmat(struct proc *p, struct shmat_args *uap, register_t *retval) { int error, i, flags; - struct ucred *cred = p->p_ucred; - struct shmid_ds *shmseg; - struct shmmap_state *shmmap_s = NULL; - struct shm_handle *shm_handle; - vm_offset_t attach_va; - vm_prot_t prot; - vm_size_t size; - kern_return_t rv; + struct user_shmid_ds *shmseg; + struct shmmap_state *shmmap_s = NULL; + struct shm_handle *shm_handle; + mach_vm_address_t attach_va; /* attach address in/out */ + mach_vm_size_t map_size; /* size of map entry */ + vm_prot_t prot; + size_t size; + kern_return_t rv; + int shmat_ret = 0; AUDIT_ARG(svipc_id, uap->shmid); - AUDIT_ARG(svipc_addr, uap->shmaddr); - if (!shm_inited) - return(EINVAL); + // LP64todo - fix this + AUDIT_ARG(svipc_addr, CAST_DOWN(void *,uap->shmaddr)); + + SYSV_SHM_SUBSYS_LOCK(); + + if (!shm_inited) { + shmat_ret = EINVAL; + goto shmat_out; + } + shmmap_s = (struct shmmap_state *)p->vm_shm; + if (shmmap_s == NULL) { size = shminfo.shmseg * sizeof(struct shmmap_state); - shmmap_s = (struct shmmap_state *)_MALLOC(size, M_SHM, M_WAITOK); + MALLOC(shmmap_s, struct shmmap_state *, size, M_SHM, M_WAITOK); + if (shmmap_s == NULL) { + shmat_ret = ENOMEM; + goto shmat_out; + } for (i = 0; i < shminfo.shmseg; i++) shmmap_s[i].shmid = -1; p->vm_shm = (caddr_t)shmmap_s; } shmseg = shm_find_segment_by_shmid(uap->shmid); - if (shmseg == NULL) - return EINVAL; + if (shmseg == NULL) { + shmat_ret = EINVAL; + goto shmat_out; + } AUDIT_ARG(svipc_perm, &shmseg->shm_perm); - error = ipcperm(cred, &shmseg->shm_perm, + error = ipcperm(kauth_cred_get(), &shmseg->shm_perm, (uap->shmflg & SHM_RDONLY) ? 
IPC_R : IPC_R|IPC_W); - if (error) - return error; + if (error) { + shmat_ret = error; + goto shmat_out; + } + for (i = 0; i < shminfo.shmseg; i++) { if (shmmap_s->shmid == -1) break; shmmap_s++; } - if (i >= shminfo.shmseg) - return EMFILE; - size = round_page_32(shmseg->shm_segsz); + if (i >= shminfo.shmseg) { + shmat_ret = EMFILE; + goto shmat_out; + } + + map_size = mach_vm_round_page(shmseg->shm_segsz); prot = VM_PROT_READ; if ((uap->shmflg & SHM_RDONLY) == 0) prot |= VM_PROT_WRITE; flags = MAP_ANON | MAP_SHARED; - if (uap->shmaddr) { + if (uap->shmaddr) flags |= MAP_FIXED; - if (uap->shmflg & SHM_RND) - attach_va = (vm_offset_t)uap->shmaddr & ~(SHMLBA-1); - else if (((vm_offset_t)uap->shmaddr & (SHMLBA-1)) == 0) - attach_va = (vm_offset_t)uap->shmaddr; - else - return EINVAL; - } else { - attach_va = round_page_32((unsigned int)uap->shmaddr); - } - - shm_handle = shmseg->shm_internal; - rv = vm_map(current_map(), &attach_va, size, 0, (flags & MAP_FIXED)? FALSE: TRUE, - shm_handle->shm_object, 0, FALSE, prot, prot, VM_INHERIT_DEFAULT); + + attach_va = (mach_vm_address_t)uap->shmaddr; + if (uap->shmflg & SHM_RND) + attach_va &= ~(SHMLBA-1); + else if ((attach_va & (SHMLBA-1)) != 0) { + shmat_ret = EINVAL; + goto shmat_out; + } + + shm_handle = CAST_DOWN(void *, shmseg->shm_internal); /* tunnel */ + + rv = mach_vm_map(current_map(), /* process map */ + &attach_va, /* attach address */ + map_size, /* segment size */ + (mach_vm_offset_t)0, /* alignment mask */ + (flags & MAP_FIXED)? VM_FLAGS_FIXED: VM_FLAGS_ANYWHERE, + shm_handle->shm_object, + (mach_vm_offset_t)0, + FALSE, + prot, + prot, + VM_INHERIT_DEFAULT); if (rv != KERN_SUCCESS) goto out; + + rv = mach_vm_inherit(current_map(), attach_va, map_size, VM_INHERIT_SHARE); if (rv != KERN_SUCCESS) { - (void) vm_deallocate(current_map(), attach_va, size); + (void)mach_vm_deallocate(current_map(), attach_va, map_size); goto out; } shmmap_s->va = attach_va; shmmap_s->shmid = uap->shmid; shmseg->shm_lpid = p->p_pid; - shmseg->shm_atime = time_second; + shmseg->shm_atime = sysv_shmtime(); shmseg->shm_nattch++; - *retval = attach_va; - return( 0); + *retval = attach_va; /* XXX return -1 on error */ + shmat_ret = 0; + goto shmat_out; out: switch (rv) { case KERN_INVALID_ADDRESS: case KERN_NO_SPACE: - return (ENOMEM); + shmat_ret = ENOMEM; + break; case KERN_PROTECTION_FAILURE: - return (EACCES); + shmat_ret = EACCES; + break; default: - return (EINVAL); + shmat_ret = EINVAL; } - +shmat_out: + SYSV_SHM_SUBSYS_UNLOCK(); + return shmat_ret; } -struct oshmid_ds { - struct ipc_perm shm_perm; /* operation perms */ - int shm_segsz; /* size of segment (bytes) */ - ushort shm_cpid; /* pid, creator */ - ushort shm_lpid; /* pid, last operation */ - short shm_nattch; /* no.
of current attaches */ - time_t shm_atime; /* last attach time */ - time_t shm_dtime; /* last detach time */ - time_t shm_ctime; /* last change time */ - void *shm_handle; /* internal handle for shm segment */ -}; - -struct oshmctl_args { - int shmid; - int cmd; - struct oshmid_ds *ubuf; -}; - static int -oshmctl(p, uap, retval) - struct proc *p; - struct oshmctl_args *uap; - register_t *retval; +oshmctl(__unused void *p, __unused void *uap, __unused void *retval) { -#ifdef COMPAT_43 - int error; - struct ucred *cred = p->p_ucred; - struct shmid_ds *shmseg; - struct oshmid_ds outbuf; - - if (!shm_inited) - return(EINVAL); - shmseg = shm_find_segment_by_shmid(uap->shmid); - if (shmseg == NULL) - return EINVAL; - switch (uap->cmd) { - case IPC_STAT: - error = ipcperm(cred, &shmseg->shm_perm, IPC_R); - if (error) - return error; - outbuf.shm_perm = shmseg->shm_perm; - outbuf.shm_segsz = shmseg->shm_segsz; - outbuf.shm_cpid = shmseg->shm_cpid; - outbuf.shm_lpid = shmseg->shm_lpid; - outbuf.shm_nattch = shmseg->shm_nattch; - outbuf.shm_atime = shmseg->shm_atime; - outbuf.shm_dtime = shmseg->shm_dtime; - outbuf.shm_ctime = shmseg->shm_ctime; - outbuf.shm_handle = shmseg->shm_internal; - error = copyout((caddr_t)&outbuf, uap->ubuf, sizeof(outbuf)); - if (error) - return error; - break; - default: - /* XXX casting to (sy_call_t *) is bogus, as usual. */ - return ((sy_call_t *)shmctl)(p, uap, retval); - } - return 0; -#else return EINVAL; -#endif } -#ifndef _SYS_SYSPROTO_H_ -struct shmctl_args { - int shmid; - int cmd; - struct shmid_ds *buf; -}; -#endif - int -shmctl(p, uap, retval) - struct proc *p; - struct shmctl_args *uap; - register_t *retval; +shmctl(__unused struct proc *p, struct shmctl_args *uap, register_t *retval) { int error; - struct ucred *cred = p->p_ucred; - struct shmid_ds inbuf; - struct shmid_ds *shmseg; + kauth_cred_t cred = kauth_cred_get(); + struct user_shmid_ds inbuf; + struct user_shmid_ds *shmseg; + size_t shmid_ds_sz = sizeof(struct user_shmid_ds); + + int shmctl_ret = 0; AUDIT_ARG(svipc_cmd, uap->cmd); AUDIT_ARG(svipc_id, uap->shmid); - if (!shm_inited) - return(EINVAL); + + SYSV_SHM_SUBSYS_LOCK(); + + if (!shm_inited) { + shmctl_ret = EINVAL; + goto shmctl_out; + } + + if (!IS_64BIT_PROCESS(p)) + shmid_ds_sz = sizeof(struct shmid_ds); + shmseg = shm_find_segment_by_shmid(uap->shmid); - if (shmseg == NULL) - return EINVAL; + if (shmseg == NULL) { + shmctl_ret = EINVAL; + goto shmctl_out; + } + /* XXAUDIT: This is the perms BEFORE any change by this call. This * may not be what is desired. 
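+ * (For IPC_SET, for example, the audit record carries the mode bits as + * they were before the update below.)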
*/ @@ -435,30 +469,53 @@ shmctl(p, uap, retval) switch (uap->cmd) { case IPC_STAT: error = ipcperm(cred, &shmseg->shm_perm, IPC_R); - if (error) - return error; - error = copyout((caddr_t)shmseg, uap->buf, sizeof(inbuf)); - if (error) - return error; + if (error) { + shmctl_ret = error; + goto shmctl_out; + } + + if (IS_64BIT_PROCESS(p)) { + error = copyout(shmseg, uap->buf, sizeof(struct user_shmid_ds)); + } else { + struct shmid_ds shmid_ds32; + shmid_ds_64to32(shmseg, &shmid_ds32); + error = copyout(&shmid_ds32, uap->buf, sizeof(struct shmid_ds)); + } + if (error) { + shmctl_ret = error; + goto shmctl_out; + } break; case IPC_SET: error = ipcperm(cred, &shmseg->shm_perm, IPC_M); - if (error) - return error; - error = copyin(uap->buf, (caddr_t)&inbuf, sizeof(inbuf)); - if (error) - return error; + if (error) { + shmctl_ret = error; + goto shmctl_out; + } + if (IS_64BIT_PROCESS(p)) { + error = copyin(uap->buf, &inbuf, sizeof(struct user_shmid_ds)); + } else { + error = copyin(uap->buf, &inbuf, sizeof(struct shmid_ds)); + /* convert in place; ugly, but safe */ + shmid_ds_32to64((struct shmid_ds *)&inbuf, &inbuf); + } + if (error) { + shmctl_ret = error; + goto shmctl_out; + } shmseg->shm_perm.uid = inbuf.shm_perm.uid; shmseg->shm_perm.gid = inbuf.shm_perm.gid; shmseg->shm_perm.mode = (shmseg->shm_perm.mode & ~ACCESSPERMS) | (inbuf.shm_perm.mode & ACCESSPERMS); - shmseg->shm_ctime = time_second; + shmseg->shm_ctime = sysv_shmtime(); break; case IPC_RMID: error = ipcperm(cred, &shmseg->shm_perm, IPC_M); - if (error) - return error; + if (error) { + shmctl_ret = error; + goto shmctl_out; + } shmseg->shm_perm.key = IPC_PRIVATE; shmseg->shm_perm.mode |= SHMSEG_REMOVED; if (shmseg->shm_nattch <= 0) { @@ -471,29 +528,20 @@ shmctl(p, uap, retval) case SHM_UNLOCK: #endif default: - return EINVAL; + shmctl_ret = EINVAL; + goto shmctl_out; } - return 0; + *retval = 0; + shmctl_ret = 0; +shmctl_out: + SYSV_SHM_SUBSYS_UNLOCK(); + return shmctl_ret; } -#ifndef _SYS_SYSPROTO_H_ -struct shmget_args { - key_t key; - size_t size; - int shmflg; -}; -#endif - static int -shmget_existing(p, uap, mode, segnum, retval) - struct proc *p; - struct shmget_args *uap; - int mode; - int segnum; - int *retval; +shmget_existing(struct shmget_args *uap, int mode, int segnum, int *retval) { - struct shmid_ds *shmseg; - struct ucred *cred = p->p_ucred; + struct user_shmid_ds *shmseg; int error; shmseg = &shmsegs[segnum]; @@ -509,7 +557,7 @@ shmget_existing(p, uap, mode, segnum, retval) return error; return EAGAIN; } - error = ipcperm(cred, &shmseg->shm_perm, mode); + error = ipcperm(kauth_cred_get(), &shmseg->shm_perm, mode); if (error) return error; if (uap->size && uap->size > shmseg->shm_segsz) @@ -521,25 +569,23 @@ shmget_existing(p, uap, mode, segnum, retval) } static int -shmget_allocate_segment(p, uap, mode, retval) - struct proc *p; - struct shmget_args *uap; - int mode; - int * retval; +shmget_allocate_segment(struct proc *p, struct shmget_args *uap, int mode, + int *retval) { int i, segnum, shmid, size; - struct ucred *cred = p->p_ucred; - struct shmid_ds *shmseg; + kauth_cred_t cred = kauth_cred_get(); + struct user_shmid_ds *shmseg; struct shm_handle *shm_handle; kern_return_t kret; vm_offset_t user_addr; void * mem_object; - if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax) + if (uap->size < (user_size_t)shminfo.shmmin || + uap->size > (user_size_t)shminfo.shmmax) return EINVAL; if (shm_nused >= shminfo.shmmni) /* any shmids left? 
*/ return ENOSPC; - size = round_page_32(uap->size); + size = mach_vm_round_page(uap->size); if (shm_committed + btoc(size) > shminfo.shmall) return ENOMEM; if (shm_last_free < 0) { @@ -558,25 +604,33 @@ shmget_allocate_segment(p, uap, mode, retval) * In case we sleep in malloc(), mark the segment present but deleted * so that noone else tries to create the same key. */ - kret = vm_allocate(current_map(), &user_addr, size, TRUE); + kret = vm_allocate(current_map(), &user_addr, size, VM_FLAGS_ANYWHERE); if (kret != KERN_SUCCESS) goto out; - kret = mach_make_memory_entry (current_map(), &size, - user_addr, VM_PROT_DEFAULT, &mem_object, 0); + kret = mach_make_memory_entry (current_map(), &size, user_addr, + VM_PROT_DEFAULT, (mem_entry_name_port_t *)&mem_object, 0); if (kret != KERN_SUCCESS) goto out; + + vm_deallocate(current_map(), user_addr, size); + shmseg->shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED; shmseg->shm_perm.key = uap->key; shmseg->shm_perm.seq = (shmseg->shm_perm.seq + 1) & 0x7fff; - shm_handle = (struct shm_handle *) - _MALLOC(sizeof(struct shm_handle), M_SHM, M_WAITOK); + MALLOC(shm_handle, struct shm_handle *, sizeof(struct shm_handle), M_SHM, M_WAITOK); + if (shm_handle == NULL) { + kret = KERN_NO_SPACE; + mach_memory_entry_port_release(mem_object); + mem_object = NULL; + goto out; + } shm_handle->shm_object = mem_object; shmid = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm); - shmseg->shm_internal = shm_handle; - shmseg->shm_perm.cuid = shmseg->shm_perm.uid = cred->cr_uid; + shmseg->shm_internal = CAST_USER_ADDR_T(shm_handle); /* tunnel */ + shmseg->shm_perm.cuid = shmseg->shm_perm.uid = kauth_cred_getuid(cred); shmseg->shm_perm.cgid = shmseg->shm_perm.gid = cred->cr_gid; shmseg->shm_perm.mode = (shmseg->shm_perm.mode & SHMSEG_WANTED) | (mode & ACCESSPERMS) | SHMSEG_ALLOCATED; @@ -584,7 +638,7 @@ shmget_allocate_segment(p, uap, mode, retval) shmseg->shm_cpid = p->p_pid; shmseg->shm_lpid = shmseg->shm_nattch = 0; shmseg->shm_atime = shmseg->shm_dtime = 0; - shmseg->shm_ctime = time_second; + shmseg->shm_ctime = sysv_shmtime(); shm_committed += btoc(size); shm_nused++; AUDIT_ARG(svipc_perm, &shmseg->shm_perm); @@ -613,89 +667,107 @@ out: } int -shmget(p, uap, retval) - struct proc *p; - struct shmget_args *uap; - register_t *retval; +shmget(struct proc *p, struct shmget_args *uap, register_t *retval) { int segnum, mode, error; - + int shmget_ret = 0; + /* Auditing is actually done in shmget_allocate_segment() */ - if (!shm_inited) - return(EINVAL); + + SYSV_SHM_SUBSYS_LOCK(); + + if (!shm_inited) { + shmget_ret = EINVAL; + goto shmget_out; + } mode = uap->shmflg & ACCESSPERMS; if (uap->key != IPC_PRIVATE) { again: segnum = shm_find_segment_by_key(uap->key); if (segnum >= 0) { - error = shmget_existing(p, uap, mode, segnum, retval); + error = shmget_existing(uap, mode, segnum, retval); if (error == EAGAIN) goto again; - return(error); + shmget_ret = error; + goto shmget_out; + } + if ((uap->shmflg & IPC_CREAT) == 0) { + shmget_ret = ENOENT; + goto shmget_out; } - if ((uap->shmflg & IPC_CREAT) == 0) - return ENOENT; } - return( shmget_allocate_segment(p, uap, mode, retval));; + shmget_ret = shmget_allocate_segment(p, uap, mode, retval); +shmget_out: + SYSV_SHM_SUBSYS_UNLOCK(); + return shmget_ret; /*NOTREACHED*/ } -struct shmsys_args { - u_int which; - int a2; - int a3; - int a4; -}; +/* XXX actually varargs. */ int -shmsys(p, uap, retval) - struct proc *p; - /* XXX actually varargs. 
*/ - struct shmsys_args *uap; - register_t *retval; +shmsys(struct proc *p, struct shmsys_args *uap, register_t *retval) { - if (!shm_inited) - return(EINVAL); + /* The routine that we are dispatching already does this */ if (uap->which >= sizeof(shmcalls)/sizeof(shmcalls[0])) return EINVAL; return ((*shmcalls[uap->which])(p, &uap->a2, retval)); } -void -shmfork(p1, p2) - struct proc *p1, *p2; +/* + * Return 0 on success, 1 on failure. + */ +int +shmfork(struct proc *p1, struct proc *p2) { struct shmmap_state *shmmap_s; size_t size; int i; + int shmfork_ret = 0; - if (!shm_inited) - return; + SYSV_SHM_SUBSYS_LOCK(); + + if (!shm_inited) { + shmfork_ret = 0; + goto shmfork_out; + } + size = shminfo.shmseg * sizeof(struct shmmap_state); - shmmap_s = (struct shmmap_state *)_MALLOC(size, M_SHM, M_WAITOK); - bcopy((caddr_t)p1->vm_shm, (caddr_t)shmmap_s, size); - p2->vm_shm = (caddr_t)shmmap_s; - for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) - if (shmmap_s->shmid != -1) - shmsegs[IPCID_TO_IX(shmmap_s->shmid)].shm_nattch++; + MALLOC(shmmap_s, struct shmmap_state *, size, M_SHM, M_WAITOK); + if (shmmap_s != NULL) { + bcopy((caddr_t)p1->vm_shm, (caddr_t)shmmap_s, size); + p2->vm_shm = (caddr_t)shmmap_s; + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) + if (shmmap_s->shmid != -1) + shmsegs[IPCID_TO_IX(shmmap_s->shmid)].shm_nattch++; + shmfork_ret = 0; + goto shmfork_out; + } + + shmfork_ret = 1; /* failed to copy to child - ENOMEM */ +shmfork_out: + SYSV_SHM_SUBSYS_UNLOCK(); + return shmfork_ret; } void -shmexit(p) - struct proc *p; +shmexit(struct proc *p) { struct shmmap_state *shmmap_s; int i; shmmap_s = (struct shmmap_state *)p->vm_shm; + + SYSV_SHM_SUBSYS_LOCK(); for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) if (shmmap_s->shmid != -1) shm_delete_mapping(p, shmmap_s, 1); FREE((caddr_t)p->vm_shm, M_SHM); p->vm_shm = NULL; + SYSV_SHM_SUBSYS_UNLOCK(); } /* @@ -705,32 +777,42 @@ shmexit(p) * need to do to keep the System V shared memory subsystem sane. */ __private_extern__ void -shmexec(p) - struct proc *p; +shmexec(struct proc *p) { struct shmmap_state *shmmap_s; int i; shmmap_s = (struct shmmap_state *)p->vm_shm; + SYSV_SHM_SUBSYS_LOCK(); for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) if (shmmap_s->shmid != -1) shm_delete_mapping(p, shmmap_s, 0); FREE((caddr_t)p->vm_shm, M_SHM); p->vm_shm = NULL; + SYSV_SHM_SUBSYS_UNLOCK(); } void -shminit(dummy) - void *dummy; +shminit(__unused void *dummy) { int i; int s; if (!shm_inited) { - s = sizeof(struct shmid_ds) * shminfo.shmmni; + /* + * we store internally 64 bit, since if we didn't, we would + * be unable to represent a segment size in excess of 32 bits + * with the (struct shmid_ds)->shm_segsz field; also, POSIX + * dictates this field be a size_t, which is 64 bits when + * running 64 bit binaries.
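+ * Narrowing to the ILP32 shmid_ds layout happens only at the user/kernel + * boundary, via shmid_ds_64to32() at copyout time.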
+ */ + s = sizeof(struct user_shmid_ds) * shminfo.shmmni; - MALLOC(shmsegs, struct shmid_ds *, s, - M_SHM, M_WAITOK); + MALLOC(shmsegs, struct user_shmid_ds *, s, M_SHM, M_WAITOK); + if (shmsegs == NULL) { + /* XXX fail safely: leave shared memory uninited */ + return; + } for (i = 0; i < shminfo.shmmni; i++) { shmsegs[i].shm_perm.mode = SHMSEG_FREE; shmsegs[i].shm_perm.seq = 0; @@ -741,56 +823,193 @@ shminit(dummy) shm_inited = 1; } } +/* Initialize the mutex governing access to the SysV shm subsystem */ +__private_extern__ void +sysv_shm_lock_init( void ) +{ + + sysv_shm_subsys_lck_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(sysv_shm_subsys_lck_grp_attr); + + sysv_shm_subsys_lck_grp = lck_grp_alloc_init("sysv_shm_subsys_lock", sysv_shm_subsys_lck_grp_attr); + + sysv_shm_subsys_lck_attr = lck_attr_alloc_init(); + /* lck_attr_setdebug(sysv_shm_subsys_lck_attr); */ + lck_mtx_init(&sysv_shm_subsys_mutex, sysv_shm_subsys_lck_grp, sysv_shm_subsys_lck_attr); +} /* (struct sysctl_oid *oidp, void *arg1, int arg2, \ struct sysctl_req *req) */ static int -sysctl_shminfo SYSCTL_HANDLER_ARGS +sysctl_shminfo(__unused struct sysctl_oid *oidp, void *arg1, + __unused int arg2, struct sysctl_req *req) { int error = 0; + int sysctl_shminfo_ret = 0; - error = SYSCTL_OUT(req, arg1, sizeof(int)); - if (error || !req->newptr) + error = SYSCTL_OUT(req, arg1, sizeof(user_ssize_t)); + if (error || req->newptr == USER_ADDR_NULL) return(error); + SYSV_SHM_SUBSYS_LOCK(); /* Set the values only if shared memory is not initialised */ if (!shm_inited) { - if (error = SYSCTL_IN(req, arg1, sizeof(int))) - return(error); + if ((error = SYSCTL_IN(req, arg1, sizeof(user_ssize_t))) + != 0) { + sysctl_shminfo_ret = error; + goto sysctl_shminfo_out; + } + if (arg1 == &shminfo.shmmax) { - if (shminfo.shmmax & PAGE_MASK) { - shminfo.shmmax = -1; - return(EINVAL); + if (shminfo.shmmax & PAGE_MASK_64) { + shminfo.shmmax = (user_ssize_t)-1; + sysctl_shminfo_ret = EINVAL; + goto sysctl_shminfo_out; } } /* Initialize only when all values are set */ - if ((shminfo.shmmax != -1) && - (shminfo.shmmin != -1) && - (shminfo.shmmni != -1) && - (shminfo.shmseg != -1) && - (shminfo.shmall != -1)) { + if ((shminfo.shmmax != (user_ssize_t)-1) && + (shminfo.shmmin != (user_ssize_t)-1) && + (shminfo.shmmni != (user_ssize_t)-1) && + (shminfo.shmseg != (user_ssize_t)-1) && + (shminfo.shmall != (user_ssize_t)-1)) { shminit(NULL); } } - return(0); + sysctl_shminfo_ret = 0; +sysctl_shminfo_out: + SYSV_SHM_SUBSYS_UNLOCK(); + return sysctl_shminfo_ret; +} + +static int +IPCS_shm_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, + __unused int arg2, struct sysctl_req *req) +{ + int error; + int cursor; + union { + struct IPCS_command u32; + struct user_IPCS_command u64; + } ipcs; + struct shmid_ds shmid_ds32; /* post conversion, 32 bit version */ + void *shmid_dsp; + size_t ipcs_sz = sizeof(struct user_IPCS_command); + size_t shmid_ds_sz = sizeof(struct user_shmid_ds); + struct proc *p = current_proc(); + + int ipcs__shminfo_ret = 0; + + SYSV_SHM_SUBSYS_LOCK(); + + if (!shm_inited) { + error = EINVAL; + goto ipcs_shm_sysctl_out; + } + + if (!IS_64BIT_PROCESS(p)) { + ipcs_sz = sizeof(struct IPCS_command); + shmid_ds_sz = sizeof(struct shmid_ds); + } + + /* Copy in the command structure */ + if ((error = SYSCTL_IN(req, &ipcs, ipcs_sz)) != 0) { + goto ipcs_shm_sysctl_out; + } + + if (!IS_64BIT_PROCESS(p)) /* convert in place */ + ipcs.u64.ipcs_data = CAST_USER_ADDR_T(ipcs.u32.ipcs_data); + + /* Let us version this interface... 
*/ + if (ipcs.u64.ipcs_magic != IPCS_MAGIC) { + error = EINVAL; + goto ipcs_shm_sysctl_out; + } + + switch(ipcs.u64.ipcs_op) { + case IPCS_SHM_CONF: /* Obtain global configuration data */ + if (ipcs.u64.ipcs_datalen != sizeof(struct shminfo)) { + if (ipcs.u64.ipcs_cursor != 0) { /* fwd. compat. */ + error = ENOMEM; + break; + } + error = ERANGE; + break; + } + error = copyout(&shminfo, ipcs.u64.ipcs_data, ipcs.u64.ipcs_datalen); + break; + + case IPCS_SHM_ITER: /* Iterate over existing segments */ + cursor = ipcs.u64.ipcs_cursor; + if (cursor < 0 || cursor >= shminfo.shmmni) { + error = ERANGE; + break; + } + if (ipcs.u64.ipcs_datalen != (int)shmid_ds_sz) { + error = ENOMEM; + break; + } + for( ; cursor < shminfo.shmmni; cursor++) { + if (shmsegs[cursor].shm_perm.mode & SHMSEG_ALLOCATED) + break; + continue; + } + if (cursor == shminfo.shmmni) { + error = ENOENT; + break; + } + + shmid_dsp = &shmsegs[cursor]; /* default: 64 bit */ + + /* + * If necessary, convert the 64 bit kernel segment + * descriptor to a 32 bit user one. + */ + if (!IS_64BIT_PROCESS(p)) { + shmid_ds_64to32(shmid_dsp, &shmid_ds32); + shmid_dsp = &shmid_ds32; + } + error = copyout(shmid_dsp, ipcs.u64.ipcs_data, ipcs.u64.ipcs_datalen); + if (!error) { + /* update cursor */ + ipcs.u64.ipcs_cursor = cursor + 1; + + if (!IS_64BIT_PROCESS(p)) /* convert in place */ + ipcs.u32.ipcs_data = CAST_DOWN(void *,ipcs.u64.ipcs_data); + error = SYSCTL_OUT(req, &ipcs, ipcs_sz); + } + break; + + default: + error = EINVAL; + break; + } +ipcs_shm_sysctl_out: + SYSV_SHM_SUBSYS_UNLOCK(); + return(error); } SYSCTL_NODE(_kern, KERN_SYSV, sysv, CTLFLAG_RW, 0, "SYSV"); -SYSCTL_PROC(_kern_sysv, KSYSV_SHMMAX, shmmax, CTLTYPE_INT | CTLFLAG_RW, - &shminfo.shmmax, 0, &sysctl_shminfo ,"I","shmmax"); +SYSCTL_PROC(_kern_sysv, KSYSV_SHMMAX, shmmax, CTLTYPE_QUAD | CTLFLAG_RW, + &shminfo.shmmax, 0, &sysctl_shminfo ,"Q","shmmax"); -SYSCTL_PROC(_kern_sysv, KSYSV_SHMMIN, shmmin, CTLTYPE_INT | CTLFLAG_RW, - &shminfo.shmmin, 0, &sysctl_shminfo ,"I","shmmin"); +SYSCTL_PROC(_kern_sysv, KSYSV_SHMMIN, shmmin, CTLTYPE_QUAD | CTLFLAG_RW, + &shminfo.shmmin, 0, &sysctl_shminfo ,"Q","shmmin"); -SYSCTL_PROC(_kern_sysv, KSYSV_SHMMNI, shmmni, CTLTYPE_INT | CTLFLAG_RW, - &shminfo.shmmni, 0, &sysctl_shminfo ,"I","shmmni"); +SYSCTL_PROC(_kern_sysv, KSYSV_SHMMNI, shmmni, CTLTYPE_QUAD | CTLFLAG_RW, + &shminfo.shmmni, 0, &sysctl_shminfo ,"Q","shmmni"); -SYSCTL_PROC(_kern_sysv, KSYSV_SHMSEG, shmseg, CTLTYPE_INT | CTLFLAG_RW, - &shminfo.shmseg, 0, &sysctl_shminfo ,"I","shmseg"); +SYSCTL_PROC(_kern_sysv, KSYSV_SHMSEG, shmseg, CTLTYPE_QUAD | CTLFLAG_RW, + &shminfo.shmseg, 0, &sysctl_shminfo ,"Q","shmseg"); -SYSCTL_PROC(_kern_sysv, KSYSV_SHMALL, shmall, CTLTYPE_INT | CTLFLAG_RW, - &shminfo.shmall, 0, &sysctl_shminfo ,"I","shmall"); +SYSCTL_PROC(_kern_sysv, KSYSV_SHMALL, shmall, CTLTYPE_QUAD | CTLFLAG_RW, + &shminfo.shmall, 0, &sysctl_shminfo ,"Q","shmall"); +SYSCTL_NODE(_kern_sysv, OID_AUTO, ipcs, CTLFLAG_RW, 0, "SYSVIPCS"); +SYSCTL_PROC(_kern_sysv_ipcs, OID_AUTO, shm, CTLFLAG_RW|CTLFLAG_ANYBODY, + 0, 0, IPCS_shm_sysctl, + "S,IPCS_shm_command", + "ipcs shm command interface"); diff --git a/bsd/kern/tty.c b/bsd/kern/tty.c index dee8138ff..824160f87 100644 --- a/bsd/kern/tty.c +++ b/bsd/kern/tty.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -98,8 +98,9 @@ #include <sys/systm.h> #undef TTYDEFCHARS #include <sys/ioctl.h> -#include <sys/proc.h> -#include <sys/file.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> +#include <sys/file_internal.h> #include <sys/conf.h> #include <sys/dkstat.h> #include <sys/uio.h> @@ -132,20 +133,22 @@ #include <machdep/machine/pmap.h> #endif /* 0 ] */ #endif /* !NeXT */ +#include <sys/resource.h> /* averunnable */ #ifndef NeXT -static int proc_compare __P((struct proc *p1, struct proc *p2)); +static int proc_compare(struct proc *p1, struct proc *p2); #endif /* NeXT */ -static int ttnread __P((struct tty *tp)); -static void ttyecho __P((int c, struct tty *tp)); -static int ttyoutput __P((int c, register struct tty *tp)); -static void ttypend __P((struct tty *tp)); -static void ttyretype __P((struct tty *tp)); -static void ttyrub __P((int c, struct tty *tp)); -static void ttyrubo __P((struct tty *tp, int cnt)); -static void ttystop __P((struct tty *tp, int rw)); -static void ttyunblock __P((struct tty *tp)); -static int ttywflush __P((struct tty *tp)); +static int ttnread(struct tty *tp); +static void ttyecho(int c, struct tty *tp); +static int ttyoutput(int c, register struct tty *tp); +static void ttypend(struct tty *tp); +static void ttyretype(struct tty *tp); +static void ttyrub(int c, struct tty *tp); +static void ttyrubo(struct tty *tp, int count); +static void ttystop(struct tty *tp, int rw); +static void ttyunblock(struct tty *tp); +static int ttywflush(struct tty *tp); +static int proc_compare(struct proc *p1, struct proc *p2); /* * Table with character classes and parity. The 8th bit indicates parity, @@ -236,6 +239,37 @@ static u_char const char_type[] = { #undef MAX_INPUT /* XXX wrong in <sys/syslimits.h> */ #define MAX_INPUT TTYHOG +static void +termios32to64(struct termios *in, struct user_termios *out) +{ + out->c_iflag = (user_tcflag_t)in->c_iflag; + out->c_oflag = (user_tcflag_t)in->c_oflag; + out->c_cflag = (user_tcflag_t)in->c_cflag; + out->c_lflag = (user_tcflag_t)in->c_lflag; + + /* bcopy is OK, since this type is ILP32/LP64 size invariant */ + bcopy(in->c_cc, out->c_cc, sizeof(in->c_cc)); + + out->c_ispeed = (user_speed_t)in->c_ispeed; + out->c_ospeed = (user_speed_t)in->c_ospeed; +} + +static void +termios64to32(struct user_termios *in, struct termios *out) +{ + out->c_iflag = (tcflag_t)in->c_iflag; + out->c_oflag = (tcflag_t)in->c_oflag; + out->c_cflag = (tcflag_t)in->c_cflag; + out->c_lflag = (tcflag_t)in->c_lflag; + + /* bcopy is OK, since this type is ILP32/LP64 size invariant */ + bcopy(in->c_cc, out->c_cc, sizeof(in->c_cc)); + + out->c_ispeed = (speed_t)in->c_ispeed; + out->c_ospeed = (speed_t)in->c_ospeed; +} + + /* * Initial open of tty, or (re)entry to standard tty line discipline. */ @@ -778,41 +812,31 @@ ttyoutput(c, tp) */ /* ARGSUSED */ int -#ifndef NeXT -ttioctl(tp, cmd, data, flag) - register struct tty *tp; - int cmd, flag; - void *data; -#else -ttioctl(tp, cmd, data, flag, p) - register struct tty *tp; - u_long cmd; - caddr_t data; - int flag; - struct proc *p; -#endif +ttioctl(register struct tty *tp, + u_long cmd, caddr_t data, int flag, + struct proc *p) { -#ifndef NeXT - register struct proc *p = curproc; /* XXX */ -#endif int s, error; struct uthread *ut; - ut = (struct uthread *)get_bsdthread_info(current_act()); + ut = (struct uthread *)get_bsdthread_info(current_thread()); /* If the ioctl involves modification, hang if in the background. 
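 * ("hang" is the standard job control behaviour: a background process
 * issuing one of the modifying ioctls below is signalled with SIGTTOU and
 * sleeps until it is moved to the foreground -- editorial note)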
*/ switch (cmd) { case TIOCFLUSH: case TIOCSETA: + case TIOCSETA_64: case TIOCSETD: case TIOCSETAF: + case TIOCSETAF_64: case TIOCSETAW: + case TIOCSETAW_64: #ifdef notdef case TIOCSPGRP: #endif case TIOCSTAT: case TIOCSTI: case TIOCSWINSZ: -#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#if COMPAT_43_TTY || defined(COMPAT_SUNOS) case TIOCLBIC: case TIOCLBIS: case TIOCLSET: @@ -884,7 +908,7 @@ ttioctl(tp, cmd, data, flag, p) return (EBUSY); } #if defined(NeXT) || !defined(UCONSOLE) - if ( (error = suser(p->p_ucred, &p->p_acflag)) ) + if ( (error = suser(kauth_cred_get(), &p->p_acflag)) ) return (error); #endif constty = tp; @@ -907,10 +931,13 @@ ttioctl(tp, cmd, data, flag, p) if (error) return (error); break; - case TIOCGETA: { /* get termios struct */ - struct termios *t = (struct termios *)data; - - bcopy(&tp->t_termios, t, sizeof(struct termios)); + case TIOCGETA: /* get termios struct */ + case TIOCGETA_64: { /* get termios struct */ + if (IS_64BIT_PROCESS(p)) { + termios32to64(&tp->t_termios, (struct user_termios *)data); + } else { + bcopy(&tp->t_termios, data, sizeof(struct termios)); + } break; } case TIOCGETD: /* get line discipline */ @@ -940,20 +967,29 @@ ttioctl(tp, cmd, data, flag, p) *(int *)data = tp->t_outq.c_cc; break; case TIOCSETA: /* set termios struct */ + case TIOCSETA_64: case TIOCSETAW: /* drain output, set */ - case TIOCSETAF: { /* drn out, fls in, set */ + case TIOCSETAW_64: + case TIOCSETAF: /* drn out, fls in, set */ + case TIOCSETAF_64: { /* drn out, fls in, set */ register struct termios *t = (struct termios *)data; + struct termios lcl_termios; + if (IS_64BIT_PROCESS(p)) { + termios64to32((struct user_termios *)data, &lcl_termios); + t = &lcl_termios; + } if (t->c_ispeed < 0 || t->c_ospeed < 0) return (EINVAL); s = spltty(); - if (cmd == TIOCSETAW || cmd == TIOCSETAF) { + if (cmd == TIOCSETAW || cmd == TIOCSETAF || + cmd == TIOCSETAW_64 || cmd == TIOCSETAF_64) { error = ttywait(tp); if (error) { splx(s); return (error); } - if (cmd == TIOCSETAF) + if (cmd == TIOCSETAF || cmd == TIOCSETAF_64) ttyflush(tp, FREAD); } if (!ISSET(t->c_cflag, CIGNORE)) { @@ -990,7 +1026,7 @@ ttioctl(tp, cmd, data, flag, p) ttsetwater(tp); } if (ISSET(t->c_lflag, ICANON) != ISSET(tp->t_lflag, ICANON) && - cmd != TIOCSETAF) { + cmd != TIOCSETAF && cmd != TIOCSETAF_64) { if (ISSET(t->c_lflag, ICANON)) SET(tp->t_lflag, PENDIN); else { @@ -1045,9 +1081,8 @@ ttioctl(tp, cmd, data, flag, p) case TIOCSETD: { /* set line discipline */ register int t = *(int *)data; dev_t device = tp->t_dev; - extern int nlinesw; - if ((u_int)t >= nlinesw) + if (t >= nlinesw) return (ENXIO); if (t != tp->t_line) { s = spltty(); @@ -1074,9 +1109,9 @@ ttioctl(tp, cmd, data, flag, p) splx(s); break; case TIOCSTI: /* simulate terminal input */ - if (p->p_ucred->cr_uid && (flag & FREAD) == 0) + if (suser(kauth_cred_get(), NULL) && (flag & FREAD) == 0) return (EPERM); - if (p->p_ucred->cr_uid && !isctty(p, tp)) + if (suser(kauth_cred_get(), NULL) && !isctty(p, tp)) return (EACCES); s = spltty(); (*linesw[tp->t_line].l_rint)(*(u_char *)data, tp); @@ -1132,7 +1167,7 @@ ttioctl(tp, cmd, data, flag, p) } break; case TIOCSDRAINWAIT: - error = suser(p->p_ucred, &p->p_acflag); + error = suser(kauth_cred_get(), &p->p_acflag); if (error) return (error); tp->t_timeout = *(int *)data * hz; @@ -1143,14 +1178,14 @@ ttioctl(tp, cmd, data, flag, p) *(int *)data = tp->t_timeout / hz; break; default: -#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#if COMPAT_43_TTY || defined(COMPAT_SUNOS) #ifdef NeXT return (ttcompat(tp, 
cmd, data, flag, p)); #else return (ttcompat(tp, cmd, data, flag)); #endif /* NeXT */ #else - return (-1); + return (ENOTTY); #endif } @@ -1589,7 +1624,7 @@ ttread(tp, uio, flag) funnel_state = thread_funnel_set(kernel_flock, TRUE); - ut = (struct uthread *)get_bsdthread_info(current_act()); + ut = (struct uthread *)get_bsdthread_info(current_thread()); loop: s = spltty(); @@ -1654,7 +1689,6 @@ loop: int m = cc[VMIN]; long t = cc[VTIME]; struct timeval etime, timecopy; - int x; /* * Check each of the four combinations. @@ -1683,9 +1717,7 @@ loop: goto sleep; if (qp->c_cc >= m) goto read; - x = splclock(); - timecopy = time; - splx(x); + microuptime(&timecopy); if (!has_etime) { /* first character, start timer */ has_etime = 1; @@ -1714,9 +1746,7 @@ loop: } else { /* m == 0 */ if (qp->c_cc > 0) goto read; - x = splclock(); - timecopy = time; - splx(x); + microuptime(&timecopy); if (!has_etime) { has_etime = 1; @@ -1789,7 +1819,7 @@ read: char ibuf[IBUFSIZ]; int icc; - icc = min(uio->uio_resid, IBUFSIZ); + icc = min(uio_resid(uio), IBUFSIZ); icc = q_to_b(qp, ibuf, icc); if (icc <= 0) { if (first) @@ -1808,7 +1838,7 @@ read: #endif if (error) break; - if (uio->uio_resid == 0) + if (uio_resid(uio) == 0) break; first = 0; } @@ -1857,7 +1887,7 @@ slowcase: ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL) snpinc((struct snoop *)tp->t_sc, (char)c); #endif - if (uio->uio_resid == 0) + if (uio_resid(uio) == 0) break; /* * In canonical mode check for a "break character" @@ -1895,10 +1925,11 @@ ttycheckoutq(tp, wait) register struct tty *tp; int wait; { - int hiwat, s, oldsig; + int hiwat, s; + sigset_t oldsig; struct uthread *ut; - ut = (struct uthread *)get_bsdthread_info(current_act()); + ut = (struct uthread *)get_bsdthread_info(current_thread()); hiwat = tp->t_hiwat; s = spltty(); @@ -1931,23 +1962,24 @@ ttwrite(tp, uio, flag) register char *cp = NULL; register int cc, ce; register struct proc *p; - int i, hiwat, cnt, error, s; + int i, hiwat, count, error, s; char obuf[OBUFSIZ]; boolean_t funnel_state; struct uthread *ut; funnel_state = thread_funnel_set(kernel_flock, TRUE); - ut = (struct uthread *)get_bsdthread_info(current_act()); + ut = (struct uthread *)get_bsdthread_info(current_thread()); hiwat = tp->t_hiwat; - cnt = uio->uio_resid; + // LP64todo - fix this! + count = uio_resid(uio); error = 0; cc = 0; loop: s = spltty(); if (ISSET(tp->t_state, TS_ZOMBIE)) { splx(s); - if (uio->uio_resid == cnt) + if (uio_resid(uio) == count) error = EIO; goto out; } @@ -1988,9 +2020,9 @@ loop: * output translation. Keep track of high water mark, sleep on * overflow awaiting device aid in acquiring new space. */ - while (uio->uio_resid > 0 || cc > 0) { + while (uio_resid(uio) > 0 || cc > 0) { if (ISSET(tp->t_lflag, FLUSHO)) { - uio->uio_resid = 0; + uio_setresid(uio, 0); thread_funnel_set(kernel_flock, funnel_state); return (0); } @@ -2001,7 +2033,7 @@ loop: * leftover from last time. */ if (cc == 0) { - cc = min(uio->uio_resid, OBUFSIZ); + cc = min(uio_resid(uio), OBUFSIZ); cp = obuf; error = uiomove(cp, cc, uio); if (error) { @@ -2027,7 +2059,7 @@ loop: ce = cc; else { ce = cc - scanc((u_int)cc, (u_char *)cp, - (u_char *)char_type, CCLASSMASK); + char_type, CCLASSMASK); /* * If ce is zero, then we're processing * a special character through ttyoutput. @@ -2105,7 +2137,7 @@ out: * offset and iov pointers have moved forward, but it doesn't matter * (the call will either return short or restart with a new uio). 
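 */

/*
 * [Editorial sketch -- not part of the patch.]  The ttread() changes above
 * replace the old splclock()/time sampling with microuptime() inside the
 * classic four-way VMIN/VTIME logic.  For reference, the knobs being
 * honoured are set from user space like this (assumes a valid tty
 * descriptor):
 */
#include <termios.h>

static int
set_min_time(int fd, unsigned char vmin, unsigned char vtime)
{
	struct termios t;

	if (tcgetattr(fd, &t) == -1)
		return -1;
	t.c_lflag &= ~(ICANON | ECHO);	/* non-canonical reads */
	t.c_cc[VMIN] = vmin;	/* wake after this many bytes ... */
	t.c_cc[VTIME] = vtime;	/* ... or this many 0.1s ticks of silence */
	return tcsetattr(fd, TCSANOW, &t);
}

/*
 * (end of editorial sketch; the patch resumes below)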
*/ - uio->uio_resid += cc; + uio_setresid(uio, (uio_resid(uio) + cc)); thread_funnel_set(kernel_flock, funnel_state); return (error); @@ -2134,9 +2166,9 @@ ovhiwat: } if (flag & IO_NDELAY) { splx(s); - uio->uio_resid += cc; + uio_setresid(uio, (uio_resid(uio) + cc)); thread_funnel_set(kernel_flock, funnel_state); - return (uio->uio_resid == cnt ? EWOULDBLOCK : 0); + return (uio_resid(uio) == count ? EWOULDBLOCK : 0); } SET(tp->t_state, TS_SO_OLOWAT); error = ttysleep(tp, TSA_OLOWAT(tp), TTOPRI | PCATCH, "ttywri", @@ -2242,15 +2274,13 @@ ttyrub(c, tp) } /* - * Back over cnt characters, erasing them. + * Back over count characters, erasing them. */ static void -ttyrubo(tp, cnt) - register struct tty *tp; - int cnt; +ttyrubo(struct tty *tp, int count) { - while (cnt-- > 0) { + while (count-- > 0) { (void)ttyoutput('\b', tp); (void)ttyoutput(' ', tp); (void)ttyoutput('\b', tp); @@ -2395,10 +2425,10 @@ ttspeedtab(speed, table) * */ void -ttsetwater(tp) - struct tty *tp; +ttsetwater(struct tty *tp) { - register int cps, x; + int cps; + unsigned int x; #define CLAMP(x, h, l) ((x) > h ? h : ((x) < l) ? l : (x)) @@ -2413,17 +2443,107 @@ ttsetwater(tp) /* NeXT ttyinfo has been converted to the MACH kernel */ #include <mach/thread_info.h> +/* XXX Should be in Mach header <kern/thread.h>, but doesn't work */ +extern kern_return_t thread_info_internal(thread_t thread, + thread_flavor_t flavor, + thread_info_t thread_info_out, + mach_msg_type_number_t *thread_info_count); + /* * Report on state of foreground process group. */ void -ttyinfo(tp) - register struct tty *tp; +ttyinfo(struct tty *tp) { - /* NOT IMPLEMENTED FOR MACH */ + int load; + thread_t thread; + uthread_t uthread; + struct proc *p; + struct proc *pick; + const char *state; + struct timeval utime; + struct timeval stime; + thread_basic_info_data_t basic_info; + mach_msg_type_number_t mmtn = THREAD_BASIC_INFO_COUNT; + + if (ttycheckoutq(tp,0) == 0) + return; + + /* Print load average. */ + load = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT; + ttyprintf(tp, "load: %d.%02d ", load / 100, load % 100); + + /* + * On return following a ttyprintf(), we set tp->t_rocount to 0 so + * that pending input will be retyped on BS. + */ + if (tp->t_session == NULL) { + ttyprintf(tp, "not a controlling terminal\n"); + tp->t_rocount = 0; + return; +} + if (tp->t_pgrp == NULL) { + ttyprintf(tp, "no foreground process group\n"); + tp->t_rocount = 0; + return; + } + /* first process in process group */ + if ((p = tp->t_pgrp->pg_members.lh_first) == NULL) { + ttyprintf(tp, "empty foreground process group\n"); + tp->t_rocount = 0; + return; + } + + /* + * Pick the most interesting process and copy some of its + * state for printing later. 
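 */

/*
 * [Editorial sketch -- not part of the patch.]  The "load:" line that
 * ttyinfo() now prints comes from a fixed point value scaled by FSCALE
 * (1 << FSHIFT, with FSHIFT == 11 in the BSD headers here).  The rounding
 * arithmetic, stand-alone:
 */
#include <stdio.h>

#define FSHIFT	11
#define FSCALE	(1 << FSHIFT)

int main(void)
{
	long ldavg = 2867;	/* example raw ldavg[0]: 2867/2048 = 1.39... */
	int load = (ldavg * 100 + FSCALE / 2) >> FSHIFT;

	printf("load: %d.%02d\n", load / 100, load % 100);	/* "load: 1.40" */
	return 0;
}

/*
 * (end of editorial sketch; the original comment continues below)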
+ */ + for (pick = NULL; p != NULL; p = p->p_pglist.le_next) { + if (proc_compare(pick, p)) + pick = p; + } + + if (TAILQ_EMPTY(&pick->p_uthlist) || + (uthread = TAILQ_FIRST(&pick->p_uthlist)) == NULL || + (thread = uthread->uu_act) == NULL || + (thread_info_internal(thread, THREAD_BASIC_INFO, (thread_info_t)&basic_info, &mmtn) != KERN_SUCCESS)) { + ttyprintf(tp, "foreground process without thread\n"); + tp->t_rocount = 0; + return; + } + + switch(basic_info.run_state) { + case TH_STATE_RUNNING: + state = "running"; + break; + case TH_STATE_STOPPED: + state = "stopped"; + break; + case TH_STATE_WAITING: + state = "waiting"; + break; + case TH_STATE_UNINTERRUPTIBLE: + state = "uninterruptible"; + break; + case TH_STATE_HALTED: + state = "halted"; + break; + default: + state = "unknown"; + break; + } + calcru(pick, &utime, &stime, NULL); + + /* Print command, pid, state, utime, and stime */ + ttyprintf(tp, " cmd: %s %d %s %ld.%02ldu %ld.%02lds\n", + pick->p_comm, + pick->p_pid, + state, + (long)utime.tv_sec, utime.tv_usec / 10000, + (long)stime.tv_sec, stime.tv_usec / 10000); + tp->t_rocount = 0; } -#ifndef NeXT /* * Returns 1 if p2 is "better" than p1 * @@ -2433,8 +2553,7 @@ ttyinfo(tp) * 2) Runnable processes are favored over anything else. The runner * with the highest cpu utilization is picked (p_estcpu). Ties are * broken by picking the highest pid. - * 3) The sleeper with the shortest sleep time is next. With ties, - * we pick out just "short-term" sleepers (P_SINTR == 0). + * 3) The sleeper with the shortest sleep time is next. * 4) Further ties are broken by picking the highest pid. */ #define ISRUN(p) (((p)->p_stat == SRUN) || ((p)->p_stat == SIDL)) @@ -2486,16 +2605,8 @@ proc_compare(p1, p2) return (0); if (p1->p_slptime > p2->p_slptime) return (1); - /* - * favor one sleeping in a non-interruptible sleep - */ - if (p1->p_flag & P_SINTR && (p2->p_flag & P_SINTR) == 0) - return (1); - if (p2->p_flag & P_SINTR && (p1->p_flag & P_SINTR) == 0) - return (0); return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ } -#endif /* NeXT */ /* * Output char to tty; console putchar style. @@ -2527,11 +2638,7 @@ tputchar(c, tp) * at the start of the call. */ int -ttysleep(tp, chan, pri, wmesg, timo) - struct tty *tp; - void *chan; - int pri, timo; - char *wmesg; +ttysleep(struct tty *tp, void *chan, int pri, const char *wmesg, int timo) { int error; int gen; @@ -2548,17 +2655,18 @@ ttysleep(tp, chan, pri, wmesg, timo) * Allocate a tty structure and its associated buffers. 
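 * (editorial note, not part of the patch: the rewrite below passes
 * M_WAITOK|M_ZERO, so the buffer comes back zero filled and the old
 * explicit bzero() goes away; the new NULL check keeps the clalloc()
 * calls from running against a failed allocation)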
*/ struct tty * -ttymalloc() +ttymalloc(void) { struct tty *tp; - MALLOC(tp, struct tty *, sizeof(struct tty), M_TTYS, M_WAITOK); - bzero(tp, sizeof *tp); - /* XXX: default to TTYCLSIZE(1024) chars for now */ - clalloc(&tp->t_rawq, TTYCLSIZE, 1); - clalloc(&tp->t_canq, TTYCLSIZE, 1); - /* output queue doesn't need quoting */ - clalloc(&tp->t_outq, TTYCLSIZE, 0); + MALLOC(tp, struct tty *, sizeof(struct tty), M_TTYS, M_WAITOK|M_ZERO); + if (tp != NULL) { + /* XXX: default to TTYCLSIZE(1024) chars for now */ + clalloc(&tp->t_rawq, TTYCLSIZE, 1); + clalloc(&tp->t_canq, TTYCLSIZE, 1); + /* output queue doesn't need quoting */ + clalloc(&tp->t_outq, TTYCLSIZE, 0); + } return(tp); } @@ -2591,8 +2699,7 @@ ttymalloc() { struct tty *tp; - tp = _MALLOC(sizeof *tp, M_TTYS, M_WAITOK); - bzero(tp, sizeof *tp); + MALLOC(tp, struct tty *, sizeof *tp, M_TTYS, M_WAITOK|M_ZERO); return (tp); } #endif diff --git a/bsd/kern/tty_compat.c b/bsd/kern/tty_compat.c index cf6e818b1..346ac762b 100644 --- a/bsd/kern/tty_compat.c +++ b/bsd/kern/tty_compat.c @@ -62,30 +62,30 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/ioctl.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/tty.h> #include <sys/termios.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/conf.h> #include <sys/kernel.h> #include <sys/sysctl.h> #include <sys/syslog.h> -/* NeXT Move define down here cause COMPAT_43 not valid earlier */ -#if COMPAT_43 || defined(COMPAT_SUNOS) +/* NeXT Move define down here cause COMPAT_43_TTY not valid earlier */ +#if COMPAT_43_TTY || defined(COMPAT_SUNOS) -static int ttcompatgetflags __P((struct tty *tp)); -static void ttcompatsetflags __P((struct tty *tp, struct termios *t)); -static void ttcompatsetlflags __P((struct tty *tp, struct termios *t)); -static int ttcompatspeedtab __P((int speed, struct speedtab *table)); - - -static int ttydebug = 0; - -#ifndef NeXT -SYSCTL_INT(_debug, OID_AUTO, ttydebug, CTLFLAG_RW, &ttydebug, 0, ""); -#endif +static int ttcompatgetflags(struct tty *tp); +static void ttcompatsetflags(struct tty *tp, struct termios *t); +static void ttcompatsetlflags(struct tty *tp, struct termios *t); +static int ttcompatspeedtab(int speed, struct speedtab *table); +/* + * These two tables encode baud rate to speed code and speed code to + * baud rate information. They are a mapping between the <sys/termios.h> + * baud rate constants and the <sys/ttydev.h> baud rate constants. We + * cannot use those constants directly here because they occupy the same + * name space. + */ static struct speedtab compatspeeds[] = { #define MAX_SPEED 17 { 115200, 17 }, @@ -113,10 +113,30 @@ static int compatspcodes[] = { 1800, 2400, 4800, 9600, 19200, 38400, 57600, 115200, }; +/* + * ttcompatspeedtab + * + * Description: Given a baud rate value as an integer, and a speed table, + * convert the baud rate to a speed code, according to the + * contents of the table. This effectively changes termios.h + * baud rate values into ttydev.h baud rate codes. + * + * Parameters: int speed Baud rate, as an integer + * struct speedtab *table Baud rate table to speed code table + * + * Returns: 1 B50 speed code; returned if we can + * not find an answer in the table. + * 0 If a 0 was requested in order to + * trigger a hangup (250ms of line + * silence, per Bell 103C standard). + * * A speed code matching the requested + * baud rate (potentially rounded down, + * if there is no exact match). + * + * Notes: This function is used for TIOCGETP, TIOCSETP, and TIOCSETN. 
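 */

/*
 * [Editorial sketch -- not part of the patch.]  A stand-alone version of
 * the lookup the function below performs, for checking the table
 * semantics (descending {baud, code} pairs terminated by -1, nearest
 * lower entry wins):
 */
#include <stdio.h>

struct speedtab { int sp_speed; int sp_code; };

static int
compatspeed(int speed, const struct speedtab *table)
{
	if (speed == 0)
		return 0;			/* hangup */
	for (; table->sp_speed != -1; table++)
		if (table->sp_speed <= speed)	/* nearest lower match */
			return table->sp_code;
	return 1;				/* B50-style minimum */
}

int main(void)
{
	/* a slice of the compatspeeds table above, same ordering */
	static const struct speedtab t[] = {
		{ 9600, 13 }, { 4800, 12 }, { 2400, 11 }, { -1, -1 },
	};

	printf("%d\n", compatspeed(9600, t));	/* prints 13 */
	return 0;
}

/*
 * (end of editorial sketch; the patch resumes below)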
+ */
 static int
-ttcompatspeedtab(speed, table)
-	int speed;
-	register struct speedtab *table;
+ttcompatspeedtab(int speed, struct speedtab *table)
 {
 	if (speed == 0)
 		return (0); /* hangup */
@@ -126,25 +146,67 @@ ttcompatspeedtab(speed, table)
 	return (1); /* 50, min and not hangup */
 }

-#ifndef NeXT
-int
-ttsetcompat(tp, com, data, term)
-	register struct tty *tp;
-	int *com;
-	caddr_t data;
-	struct termios *term;
-#else
+/*
+ * ttsetcompat
+ *
+ * Description:	Convert backward compatibility set command arguments as
+ *		follows:
+ *
+ *		TIOCSETP -> TIOCSETAF
+ *		TIOCSETN -> TIOCSETA
+ *		TIOCSETC -> TIOCSETA
+ *		TIOCSLTC -> TIOCSETA
+ *		TIOCLBIS -> TIOCSETA
+ *		TIOCLBIC -> TIOCSETA
+ *		TIOCLSET -> TIOCSETA
+ *
+ *		The converted command argument and potentially modified 'term'
+ *		argument are returned to the caller, which will then call
+ *		ttioctl(), if this function returns successfully.
+ *
+ * Parameters:	struct tty *tp		The tty on which the operation is
+ *					being performed.
+ *		u_long *com		A pointer to the terminal input/output
+ *					command being requested; its contents
+ *					will be modified per the table above,
+ *					on a non-error return.
+ *		caddr_t data		Command specific parameter data; this
+ *					data is read but not modified.
+ *		struct termios *term	A local stack termios structure from
+ *					ttcompat(), whose contents are to be
+ *					modified based on *com and *data.
+ *
+ * Returns:	EINVAL			An input speed or output speed is
+ *					outside the allowable range for a
+ *					TIOCSETP or TIOCSETN command.
+ *		0			All other cases return 0.
+ *
+ * Notes:	This function may modify the contents of the tp->t_flags
+ *		field in a successful call to TIOCSETP, TIOCSETN, TIOCLBIS,
+ *		TIOCLBIC, or TIOCLSET.
+ *
+ *		All other tp fields will remain unmodified, since the struct
+ *		termios is a local stack copy from ttcompat(), and not the
+ *		real thing.  A subsequent call to ttioctl() in ttcompat(),
+ *		however, may result in further changes.
+ */
 __private_extern__ int
-ttsetcompat(tp, com, data, term)
-	register struct tty *tp;
-	u_long *com;
-	caddr_t data;
-	struct termios *term;
-#endif /* !NeXT */
+ttsetcompat(struct tty *tp, u_long *com, caddr_t data, struct termios *term)
 {
 	switch (*com) {
 	case TIOCSETP:
-	case TIOCSETN: {
+		/*
+		 * Wait for all characters queued for output to drain, then
+		 * discard all characters queued for input, and then set
		 * the input and output speeds and device flags, per the
+		 * contents of the struct sgttyb that 'data' points to.
+		 */
+	case TIOCSETN:
+		/*
+		 * Same as TIOCSETP, but the output is not drained, and any
+		 * pending input is not discarded.
+		 */
+	    {
 		register struct sgttyb *sg = (struct sgttyb *)data;
 		int speed;
@@ -167,7 +229,12 @@ ttsetcompat(tp, com, data, term)
 		*com = (*com == TIOCSETP) ? TIOCSETAF : TIOCSETA;
 		break;
 	}
-	case TIOCSETC: {
+	case TIOCSETC:
+		/*
+		 * Set the terminal control characters per the contents of
+		 * the struct tchars that 'data' points to.
+		 */
+	    {
 		struct tchars *tc = (struct tchars *)data;
 		register cc_t *cc;
@@ -183,7 +250,12 @@ ttsetcompat(tp, com, data, term)
 		*com = TIOCSETA;
 		break;
 	}
-	case TIOCSLTC: {
+	case TIOCSLTC:
+		/*
+		 * Set the terminal control characters per the contents of
+		 * the struct ltchars that 'data' points to.
+		 */
+	    {
 		struct ltchars *ltc = (struct ltchars *)data;
 		register cc_t *cc;
@@ -198,8 +270,23 @@ ttsetcompat(tp, com, data, term)
 		break;
 	}
 	case TIOCLBIS:
+		/*
+		 * Set the bits in the terminal state local flags word
+		 * (16 bits) for the terminal to the current bits OR
+		 * those in the 16 bit value pointed to by 'data'.
+		 */
 	case TIOCLBIC:
+		/*
+		 * Clear the bits in the terminal state local flags word
+		 * for the terminal to the current bits AND those bits NOT
+		 * in the 16 bit value pointed to by 'data'.
+		 */
 	case TIOCLSET:
+		/*
+		 * Set the terminal state local flags word to exactly those
+		 * bits that correspond to the 16 bit value pointed to by
+		 * 'data'.
+		 */
 		if (*com == TIOCLSET)
 			tp->t_flags = (tp->t_flags&0xffff) | *(int *)data<<16;
 		else {
@@ -217,23 +304,44 @@ ttsetcompat(tp, com, data, term)
 	return 0;
 }

+/*
+ * ttcompat
+ *
+ * Description:	For 'set' commands, convert the command and arguments as
+ *		necessary, and call ttioctl(), returning the result as
+ *		our result; for 'get' commands, obtain the requested data
+ *		from the appropriate source, and return it in the expected
+ *		format.  If the command is not recognized, return ENOTTY.
+ *
+ * Parameters:	struct tty *tp		The tty on which the operation is
+ *					being performed.
+ *		u_long com		The terminal input/output command
+ *					being requested.
+ *		caddr_t data		The pointer to the user data argument
+ *					provided with the command.
+ *		int flag		The file open flags (e.g. FREAD).
+ *		struct proc *p		The current process pointer for the
+ *					operation.
+ *
+ * Returns:	0			Most 'get' operations can't fail, and
+ *					therefore return this.
+ *		ENOTTY			TIOCGSID may return this when you
+ *					attempt to get the session ID for a
+ *					terminal with no associated session,
+ *					or for which there is a session, but
+ *					no session leader; it is also returned
+ *					if the command cannot be handled at
+ *					this layer.
+ *		*			Any value returned by ttioctl(), if a
+ *					set command is requested.
+ *
+ * Notes:	The process pointer may be a proxy on whose behalf we are
+ *		operating, so it is not safe to simply use current_proc()
+ *		instead.
+ */
 /*ARGSUSED*/
-#ifndef NeXT
-int
-ttcompat(tp, com, data, flag)
-	register struct tty *tp;
-	int com;
-	caddr_t data;
-	int flag;
-#else
 __private_extern__ int
-ttcompat(tp, com, data, flag, p)
-	register struct tty *tp;
-	u_long com;
-	caddr_t data;
-	int flag;
-	struct proc *p;
-#endif /* !NeXT */
+ttcompat(struct tty *tp, u_long com, caddr_t data, int flag, struct proc *p)
 {
 	switch (com) {
 	case TIOCSETP:
@@ -242,20 +350,26 @@ ttcompat(tp, com, data, flag, p)
 	case TIOCSLTC:
 	case TIOCLBIS:
 	case TIOCLBIC:
-	case TIOCLSET: {
+	case TIOCLSET:
+		/*
+		 * See ttsetcompat() for a full description of these command
+		 * values and their meanings.
+		 */
+	    {
 		struct termios term;
 		int error;

 		term = tp->t_termios;
 		if ((error = ttsetcompat(tp, &com, data, &term)) != 0)
 			return error;
-#ifdef NeXT
 		return ttioctl(tp, com, (caddr_t) &term, flag, p);
-#else
-		return ttioctl(tp, com, &term, flag);
-#endif
 	}
-	case TIOCGETP: {
+	case TIOCGETP:
+		/*
+		 * Get the current input and output speeds, and device
+		 * flags, into the structure pointed to by 'data'.
+		 */
+	    {
 		register struct sgttyb *sg = (struct sgttyb *)data;
 		register cc_t *cc = tp->t_cc;
@@ -269,7 +383,12 @@ ttcompat(tp, com, data, flag, p)
 		sg->sg_flags = tp->t_flags = ttcompatgetflags(tp);
 		break;
 	}
-	case TIOCGETC: {
+	case TIOCGETC:
+		/*
+		 * Get the terminal control characters into the struct
+		 * tchars that 'data' points to.
+		 */
+	    {
 		struct tchars *tc = (struct tchars *)data;
 		register cc_t *cc = tp->t_cc;
@@ -281,7 +400,12 @@ ttcompat(tp, com, data, flag, p)
 		tc->t_brkc = cc[VEOL];
 		break;
 	}
-	case TIOCGLTC: {
+	case TIOCGLTC:
+		/*
+		 * Get the terminal control characters into the struct
+		 * ltchars that 'data' points to.
+		 */
+	    {
 		struct ltchars *ltc = (struct ltchars *)data;
 		register cc_t *cc = tp->t_cc;
@@ -294,33 +418,30 @@ ttcompat(tp, com, data, flag, p)
 		break;
 	}
 	case TIOCLGET:
+		/*
+		 * Get the terminal state local flags word into the 16 bit
+		 * value pointed to by 'data'.
+		 */
 		tp->t_flags =
 		 (ttcompatgetflags(tp) & 0xffff0000UL)
 		   | (tp->t_flags & 0xffff);
 		*(int *)data = tp->t_flags>>16;
-#ifndef NeXT
-		if (ttydebug)
-			printf("CLGET: returning %x\n", *(int *)data);
-#endif
 		break;

 	case OTIOCGETD:
+		/*
+		 * Get the current line discipline into the int pointed to
+		 * by 'data'.
+		 */
 		*(int *)data = tp->t_line ? tp->t_line : 2;
 		break;

-#ifndef NeXT
-	case OTIOCSETD: {
-		int ldisczero = 0;
-
-		return (ttioctl(tp, TIOCSETD,
-			*(int *)data == 2 ? (caddr_t)&ldisczero : data, flag));
-	}
-
-	case OTIOCCONS:
-		*(int *)data = 1;
-		return (ttioctl(tp, TIOCCONS, data, flag));
-#else
-	case OTIOCSETD: {
+	case OTIOCSETD:
+		/*
+		 * Set the current line discipline based on the value of the
+		 * int pointed to by 'data'.
+		 */
+	    {
 		int ldisczero = 0;

 		return (ttioctl(tp, TIOCSETD,
@@ -328,10 +449,16 @@ ttcompat(tp, com, data, flag, p)
 	}

 	case OTIOCCONS:
+		/*
+		 * Become the console device.
+		 */
 		*(int *)data = 1;
 		return (ttioctl(tp, TIOCCONS, data, flag, p));

 	case TIOCGSID:
+		/*
+		 * Get the current session ID (controlling process' PID).
+		 */
 		if (tp->t_session == NULL)
 			return ENOTTY;
@@ -340,23 +467,44 @@ ttcompat(tp, com, data, flag, p)
 		*(int *) data =  tp->t_session->s_leader->p_pid;
 		break;
-#endif /* NeXT */

 	default:
-		return (-1);
+		/*
+		 * This ioctl is not handled at this layer.
+		 */
+		return (ENOTTY);
 	}
+
+	/*
+	 * Successful 'get' operation.
+	 */
 	return (0);
 }

+/*
+ * ttcompatgetflags
+ *
+ * Description:	Get the terminal state local flags, device flags, and current
+ *		speed code for the device (all 32 bits are returned).
+ *
+ * Parameters:	struct tty *tp		The tty on which the operation is
+ *					being performed.
+ *
+ * Returns:	*			Integer value corresponding to the
+ *					current terminal state local flags
+ *					word.
+ *
+ * Notes:	Caller is responsible for breaking these bits back out into
+ *		separate 16 bit fields, if that's what was actually desired.
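 *
 *		(editorial aside, not part of the patch: these compat flags
 *		are what a pre-termios program sees through the old gtty
 *		style interface, e.g.
 *
 *			struct sgttyb sg;
 *			if (ioctl(0, TIOCGETP, &sg) == 0)
 *				printf("flags: %#x\n", (unsigned)sg.sg_flags);
 *
 *		with the low 16 bits carrying RAW, CBREAK, ECHO, TANDEM and
 *		friends, assuming the compat ioctls are compiled in)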
+ */
 static int
-ttcompatgetflags(tp)
-	register struct tty *tp;
+ttcompatgetflags(struct tty *tp)
 {
 	register tcflag_t iflag = tp->t_iflag;
 	register tcflag_t lflag = tp->t_lflag;
 	register tcflag_t oflag = tp->t_oflag;
 	register tcflag_t cflag = tp->t_cflag;
-	register flags = 0;
+	register int flags = 0;

 	if (iflag&IXOFF)
 		flags |= TANDEM;
@@ -380,12 +528,12 @@ ttcompatgetflags(tp)

 	if ((lflag&ICANON) == 0) {	/* fudge */
 		if (iflag&(INPCK|ISTRIP|IXON) || lflag&(IEXTEN|ISIG)
-		    || cflag&(CSIZE|PARENB) != CS8)
+		    || (cflag&(CSIZE|PARENB)) != CS8)
 			flags |= CBREAK;
 		else
 			flags |= RAW;
 	}
-	if (!(flags&RAW) && !(oflag&OPOST) && cflag&(CSIZE|PARENB) == CS8)
+	if (!(flags&RAW) && !(oflag&OPOST) && (cflag&(CSIZE|PARENB)) == CS8)
 		flags |= LITOUT;
 	if (cflag&MDMBUF)
 		flags |= MDMBUF;
@@ -404,19 +552,27 @@ ttcompatgetflags(tp)
 	if ((iflag&IXANY) == 0)
 		flags |= DECCTQ;
 	flags |= lflag&(ECHO|TOSTOP|FLUSHO|PENDIN|NOFLSH);
-#ifndef NeXT
-	if (ttydebug)
-		printf("getflags: %x\n", flags);
-#endif
 	return (flags);
 }

+/*
+ * ttcompatsetflags
+ *
+ * Description:	Given a set of compatibility flags, convert the compatibility
+ *		flags in the terminal flags fields into canonical flags in the
+ *		provided termios struct.
+ *
+ * Parameters:	struct tty *tp		The tty on which the operation is
+ *					being performed.
+ *		struct termios *t	The termios structure into which to
+ *					return the converted flags.
+ *
+ * Returns:	void (implicit: *t, modified)
+ */
 static void
-ttcompatsetflags(tp, t)
-	register struct tty *tp;
-	register struct termios *t;
+ttcompatsetflags(struct tty *tp, struct termios *t)
 {
-	register flags = tp->t_flags;
+	register int flags = tp->t_flags;
 	register tcflag_t iflag = t->c_iflag;
 	register tcflag_t oflag = t->c_oflag;
 	register tcflag_t lflag = t->c_lflag;
@@ -490,12 +646,24 @@ ttcompatsetflags(tp, t)
 	t->c_cflag = cflag;
 }

+/*
+ * ttcompatsetlflags
+ *
+ * Description:	Given a set of compatibility terminal state local flags,
+ *		convert the compatibility flags in the terminal flags
+ *		fields into canonical flags in the provided termios struct.
+ *
+ * Parameters:	struct tty *tp		The tty on which the operation is
+ *					being performed.
+ *		struct termios *t	The termios structure into which to
+ *					return the converted local flags.
+ * + * Returns: void (implicit: *t, modified) + */ static void -ttcompatsetlflags(tp, t) - register struct tty *tp; - register struct termios *t; +ttcompatsetlflags(struct tty *tp, struct termios *t) { - register flags = tp->t_flags; + register int flags = tp->t_flags; register tcflag_t iflag = t->c_iflag; register tcflag_t oflag = t->c_oflag; register tcflag_t lflag = t->c_lflag; @@ -567,4 +735,4 @@ ttcompatsetlflags(tp, t) t->c_lflag = lflag; t->c_cflag = cflag; } -#endif /* COMPAT_43 || COMPAT_SUNOS */ +#endif /* COMPAT_43_TTY || COMPAT_SUNOS */ diff --git a/bsd/kern/tty_conf.c b/bsd/kern/tty_conf.c index bb4a8c9c0..5bf3647b1 100644 --- a/bsd/kern/tty_conf.c +++ b/bsd/kern/tty_conf.c @@ -72,40 +72,38 @@ #ifndef NeXT static l_open_t l_noopen; static l_close_t l_noclose; -static l_ioctl_t l_nullioctl; static l_rint_t l_norint; -static l_start_t l_nostart; #else /* NeXT */ -#define l_noopen ((int (*) __P((dev_t, struct tty *)))enodev) -#define l_noclose ((int (*) __P((struct tty *, int flags)))enodev) -#define l_noread ((int (*) __P((struct tty *, struct uio *, int)))enodev) -#define l_nowrite l_noread -#define l_norint ((int (*) __P((int c, struct tty *)))enodev) -#define l_nostart ((int (*) __P((struct tty *)))enodev) -static int -l_nullioctl(struct tty *tp, u_long cmd, caddr_t data, int flag, struct proc *p); +#define l_noopen ((l_open_t *) &enodev) +#define l_noclose ((l_close_t *) &enodev) +#define l_noread ((l_read_t *) &enodev) +#define l_nowrite ((l_write_t *) &enodev) +#define l_norint ((l_rint_t *) &enodev) #endif /* !NeXT */ +static l_ioctl_t l_noioctl; +static l_start_t l_nostart; + /* * XXX it probably doesn't matter what the entries other than the l_open - * entry are here. The l_nullioctl and ttymodem entries still look fishy. + * entry are here. The l_noioctl and ttymodem entries still look fishy. * Reconsider the removal of nullmodem anyway. It was too much like * ttymodem, but a completely null version might be useful. */ #define NODISC(n) \ { l_noopen, l_noclose, l_noread, l_nowrite, \ - l_nullioctl, l_norint, l_nostart, ttymodem } + l_noioctl, l_norint, l_nostart, ttymodem } struct linesw linesw[MAXLDISC] = { /* 0- termios */ { ttyopen, ttylclose, ttread, ttwrite, - l_nullioctl, ttyinput, ttstart, ttymodem }, + l_noioctl, ttyinput, ttwwakeup, ttymodem }, NODISC(1), /* 1- defunct */ /* 2- NTTYDISC */ -#ifdef COMPAT_43 +#if COMPAT_43_TTY { ttyopen, ttylclose, ttread, ttwrite, - l_nullioctl, ttyinput, ttstart, ttymodem }, + l_noioctl, ttyinput, ttwwakeup, ttymodem }, #else NODISC(2), #endif @@ -215,14 +213,6 @@ l_norint(c, tp) return (ENODEV); } - -static int -l_nostart(tp) - struct tty *tp; -{ - - return (ENODEV); -} #endif /* !NeXT */ /* @@ -230,13 +220,13 @@ l_nostart(tp) * discipline specific ioctl command. */ static int -l_nullioctl(tp, cmd, data, flags, p) - struct tty *tp; - u_long cmd; - caddr_t data; - int flags; - struct proc *p; +l_noioctl(__unused struct tty *tp, __unused u_long cmd, __unused caddr_t data, + __unused int flags, __unused struct proc *p) { - return (-1); + return ENOTTY; } + +static void +l_nostart(__unused struct tty *tp) + { } diff --git a/bsd/kern/tty_pty.c b/bsd/kern/tty_pty.c index 7003a4202..5ec154c67 100644 --- a/bsd/kern/tty_pty.c +++ b/bsd/kern/tty_pty.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -64,10 +64,11 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/ioctl.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/tty.h> #include <sys/conf.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/uio.h> #include <sys/kernel.h> #include <sys/vnode.h> @@ -78,23 +79,24 @@ #define FREE_BSDSTATIC static #else -#include <machine/spl.h> - #define FREE_BSDSTATIC __private_extern__ #define d_devtotty_t struct tty ** #ifdef d_stop_t #undef d_stop_t #endif -typedef void d_stop_t __P((struct tty *tp, int rw)); +typedef void d_stop_t(struct tty *tp, int rw); #endif /* NeXT */ +/* XXX function should be removed??? */ +int pty_init(int n_ptys); + #ifdef notyet -static void ptyattach __P((int n)); +static void ptyattach(int n); #endif -static void ptsstart __P((struct tty *tp)); -static void ptcwakeup __P((struct tty *tp, int flag)); +static void ptsstart(struct tty *tp); +static void ptcwakeup(struct tty *tp, int flag); FREE_BSDSTATIC d_open_t ptsopen; FREE_BSDSTATIC d_close_t ptsclose; @@ -204,7 +206,8 @@ ptyattach(n) #endif #ifndef DEVFS -int pty_init() +int +pty_init(__unused int n_ptys) { return 0; } @@ -212,7 +215,8 @@ int pty_init() #include <miscfs/devfs/devfs.h> #define START_CHAR 'p' #define HEX_BASE 16 -int pty_init(int n_ptys) +int +pty_init(int n_ptys) { int i; int j; @@ -238,23 +242,24 @@ int pty_init(int n_ptys) /*ARGSUSED*/ FREE_BSDSTATIC int -ptsopen(dev, flag, devtype, p) - dev_t dev; - int flag, devtype; - struct proc *p; +ptsopen(dev_t dev, int flag, __unused int devtype, __unused struct proc *p) { register struct tty *tp; int error; + boolean_t funnel_state; + funnel_state = thread_funnel_set(kernel_flock, TRUE); #ifndef NeXT tp = &pt_tty[minor(dev)]; #else /* - * You will see this sourt of code coming up in diffs later both + * You will see this sort of code coming up in diffs later both * the ttymalloc and the tp indirection. */ - if (minor(dev) >= npty) - return (ENXIO); + if (minor(dev) >= npty) { + error = ENXIO; + goto out; + } if (!pt_tty[minor(dev)]) { tp = pt_tty[minor(dev)] = ttymalloc(); } else @@ -268,8 +273,10 @@ ptsopen(dev, flag, devtype, p) tp->t_cflag = TTYDEF_CFLAG; tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED; ttsetwater(tp); /* would be done in xxparam() */ - } else if (tp->t_state&TS_XCLUDE && p->p_ucred->cr_uid != 0) - return (EBUSY); + } else if (tp->t_state&TS_XCLUDE && suser(kauth_cred_get(), NULL)) { + error = EBUSY; + goto out; + } if (tp->t_oproc) /* Ctrlr still around. 
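 * (editorial note, not part of the patch: from here on every pty entry
 * point in this file gains the same funnel bracket -- take the kernel
 * funnel on entry, route all failure paths through a single out: label,
 * and restore the saved funnel state before returning:
 *
 *	boolean_t funnel_state = thread_funnel_set(kernel_flock, TRUE);
 *	...
 *	if (some_failure) { error = EIO; goto out; }
 *	...
 *   out:
 *	(void) thread_funnel_set(kernel_flock, funnel_state);
 *	return (error);
 *
 * the diffs below are mostly mechanical applications of this pattern)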
*/ (void)(*linesw[tp->t_line].l_modem)(tp, 1); while ((tp->t_state & TS_CARR_ON) == 0) { @@ -278,27 +285,31 @@ ptsopen(dev, flag, devtype, p) error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH, "ptsopn", 0); if (error) - return (error); + goto out; } error = (*linesw[tp->t_line].l_open)(dev, tp); if (error == 0) ptcwakeup(tp, FREAD|FWRITE); +out: + (void) thread_funnel_set(kernel_flock, funnel_state); return (error); } FREE_BSDSTATIC int -ptsclose(dev, flag, mode, p) - dev_t dev; - int flag, mode; - struct proc *p; +ptsclose(dev_t dev, int flag, __unused int mode, __unused proc_t p) { register struct tty *tp; int err; + boolean_t funnel_state; + + funnel_state = thread_funnel_set(kernel_flock, TRUE); tp = pt_tty[minor(dev)]; err = (*linesw[tp->t_line].l_close)(tp, flag); ptsstop(tp, FREAD|FWRITE); (void) ttyclose(tp); + + (void) thread_funnel_set(kernel_flock, funnel_state); return (err); } @@ -317,21 +328,27 @@ ptsread(dev, uio, flag) register struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; int error = 0; struct uthread *ut; + boolean_t funnel_state; - ut = (struct uthread *)get_bsdthread_info(current_act()); + funnel_state = thread_funnel_set(kernel_flock, TRUE); + + + ut = (struct uthread *)get_bsdthread_info(current_thread()); again: if (pti->pt_flags & PF_REMOTE) { while (isbackground(p, tp)) { if ((p->p_sigignore & sigmask(SIGTTIN)) || (ut->uu_sigmask & sigmask(SIGTTIN)) || p->p_pgrp->pg_jobc == 0 || - p->p_flag & P_PPWAIT) - return (EIO); + p->p_flag & P_PPWAIT) { + error = EIO; + goto out; + } pgsignal(p->p_pgrp, SIGTTIN, 1); error = ttysleep(tp, &lbolt, TTIPRI | PCATCH | PTTYBLOCK, "ptsbg", 0); if (error) - return (error); + goto out; } if (tp->t_canq.c_cc == 0) { if (flag & IO_NDELAY) @@ -339,22 +356,31 @@ again: error = ttysleep(tp, TSA_PTS_READ(tp), TTIPRI | PCATCH, "ptsin", 0); if (error) - return (error); + goto out; goto again; } - while (tp->t_canq.c_cc > 1 && uio->uio_resid > 0) - if (ureadc(getc(&tp->t_canq), uio) < 0) { - error = EFAULT; + while (tp->t_canq.c_cc > 1 && uio_resid(uio) > 0) { + int cc; + char buf[BUFSIZ]; + + cc = min(uio_resid(uio), BUFSIZ); + // Don't copy the very last byte + cc = min(cc, tp->t_canq.c_cc - 1); + cc = q_to_b(&tp->t_canq, buf, cc); + error = uiomove(buf, cc, uio); + if (error) break; - } + } if (tp->t_canq.c_cc == 1) (void) getc(&tp->t_canq); if (tp->t_canq.c_cc) - return (error); + goto out; } else if (tp->t_oproc) error = (*linesw[tp->t_line].l_read)(tp, uio, flag); ptcwakeup(tp, FWRITE); +out: + (void) thread_funnel_set(kernel_flock, funnel_state); return (error); } @@ -370,11 +396,19 @@ ptswrite(dev, uio, flag) int flag; { register struct tty *tp; + int error; + boolean_t funnel_state; + + funnel_state = thread_funnel_set(kernel_flock, TRUE); tp = pt_tty[minor(dev)]; if (tp->t_oproc == 0) - return (EIO); - return ((*linesw[tp->t_line].l_write)(tp, uio, flag)); + error = EIO; + else + error = (*linesw[tp->t_line].l_write)(tp, uio, flag); + + (void) thread_funnel_set(kernel_flock, funnel_state); + return (error); } /* @@ -386,14 +420,20 @@ ptsstart(tp) struct tty *tp; { register struct pt_ioctl *pti = &pt_ioctl[minor(tp->t_dev)]; + boolean_t funnel_state; + + funnel_state = thread_funnel_set(kernel_flock, TRUE); if (tp->t_state & TS_TTSTOP) - return; + goto out; if (pti->pt_flags & PF_STOPPED) { pti->pt_flags &= ~PF_STOPPED; pti->pt_send = TIOCPKT_START; } ptcwakeup(tp, FREAD); +out: + (void) thread_funnel_set(kernel_flock, funnel_state); + return; } static void @@ -402,6 +442,9 @@ ptcwakeup(tp, flag) int flag; { struct pt_ioctl *pti = 
&pt_ioctl[minor(tp->t_dev)]; + boolean_t funnel_state; + + funnel_state = thread_funnel_set(kernel_flock, TRUE); if (flag & FREAD) { selwakeup(&pti->pt_selr); @@ -411,25 +454,31 @@ ptcwakeup(tp, flag) selwakeup(&pti->pt_selw); wakeup(TSA_PTC_WRITE(tp)); } + (void) thread_funnel_set(kernel_flock, funnel_state); } FREE_BSDSTATIC int -ptcopen(dev, flag, devtype, p) - dev_t dev; - int flag, devtype; - struct proc *p; +ptcopen(dev_t dev, __unused int flag, __unused int devtype, __unused proc_t p) { register struct tty *tp; struct pt_ioctl *pti; + int error = 0; + boolean_t funnel_state; - if (minor(dev) >= npty) - return (ENXIO); + funnel_state = thread_funnel_set(kernel_flock, TRUE); + + if (minor(dev) >= npty) { + error = ENXIO; + goto out; + } if(!pt_tty[minor(dev)]) { tp = pt_tty[minor(dev)] = ttymalloc(); } else tp = pt_tty[minor(dev)]; - if (tp->t_oproc) - return (EIO); + if (tp->t_oproc) { + error = EIO; + goto out; + } tp->t_oproc = ptsstart; #ifdef sun4c tp->t_stop = ptsstop; @@ -440,17 +489,18 @@ ptcopen(dev, flag, devtype, p) pti->pt_flags = 0; pti->pt_send = 0; pti->pt_ucntl = 0; - return (0); +out: + (void) thread_funnel_set(kernel_flock, funnel_state); + return (error); } FREE_BSDSTATIC int -ptcclose(dev, flags, fmt, p) - dev_t dev; - int flags; - int fmt; - struct proc *p; +ptcclose(dev_t dev, __unused int flags, __unused int fmt, __unused proc_t p) { register struct tty *tp; + boolean_t funnel_state; + + funnel_state = thread_funnel_set(kernel_flock, TRUE); tp = pt_tty[minor(dev)]; (void)(*linesw[tp->t_line].l_modem)(tp, 0); @@ -470,6 +520,8 @@ ptcclose(dev, flags, fmt, p) } tp->t_oproc = 0; /* mark closed */ + + (void) thread_funnel_set(kernel_flock, funnel_state); return (0); } @@ -483,6 +535,9 @@ ptcread(dev, uio, flag) struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; char buf[BUFSIZ]; int error = 0, cc; + boolean_t funnel_state; + + funnel_state = thread_funnel_set(kernel_flock, TRUE); /* * We want to block until the slave @@ -495,43 +550,48 @@ ptcread(dev, uio, flag) if (pti->pt_flags&PF_PKT && pti->pt_send) { error = ureadc((int)pti->pt_send, uio); if (error) - return (error); + goto out; if (pti->pt_send & TIOCPKT_IOCTL) { - cc = min(uio->uio_resid, + cc = min(uio_resid(uio), sizeof(tp->t_termios)); uiomove((caddr_t)&tp->t_termios, cc, uio); } pti->pt_send = 0; - return (0); + goto out; } if (pti->pt_flags&PF_UCNTL && pti->pt_ucntl) { error = ureadc((int)pti->pt_ucntl, uio); if (error) - return (error); + goto out; pti->pt_ucntl = 0; - return (0); + goto out; } if (tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0) break; } if ((tp->t_state & TS_CONNECTED) == 0) - return (0); /* EOF */ - if (flag & IO_NDELAY) - return (EWOULDBLOCK); + goto out; /* EOF */ + if (flag & IO_NDELAY) { + error = EWOULDBLOCK; + goto out; + } error = tsleep(TSA_PTC_READ(tp), TTIPRI | PCATCH, "ptcin", 0); if (error) - return (error); + goto out; } if (pti->pt_flags & (PF_PKT|PF_UCNTL)) error = ureadc(0, uio); - while (uio->uio_resid > 0 && error == 0) { - cc = q_to_b(&tp->t_outq, buf, min(uio->uio_resid, BUFSIZ)); + while (uio_resid(uio) > 0 && error == 0) { + cc = q_to_b(&tp->t_outq, buf, min(uio_resid(uio), BUFSIZ)); if (cc <= 0) break; error = uiomove(buf, cc, uio); } - ttwwakeup(tp); + (*linesw[tp->t_line].l_start)(tp); + +out: + (void) thread_funnel_set(kernel_flock, funnel_state); return (error); } @@ -542,6 +602,9 @@ ptsstop(tp, flush) { struct pt_ioctl *pti = &pt_ioctl[minor(tp->t_dev)]; int flag; + boolean_t funnel_state; + + funnel_state = thread_funnel_set(kernel_flock, TRUE); /* note: 
FLUSHREAD and FLUSHWRITE already ok */ if (flush == 0) { @@ -557,6 +620,8 @@ ptsstop(tp, flush) if (flush & FWRITE) flag |= FREAD; ptcwakeup(tp, flag); + + (void) thread_funnel_set(kernel_flock, funnel_state); } FREE_BSDSTATIC int @@ -568,30 +633,35 @@ ptcselect(dev, rw, wql, p) { register struct tty *tp = pt_tty[minor(dev)]; struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; - int s; + int retval = 0; + boolean_t funnel_state; - if ((tp->t_state & TS_CONNECTED) == 0) - return (1); + funnel_state = thread_funnel_set(kernel_flock, TRUE); + + if ((tp->t_state & TS_CONNECTED) == 0) { + retval = 1; + goto out; + } switch (rw) { case FREAD: /* * Need to block timeouts (ttrstart). */ - s = spltty(); if ((tp->t_state&TS_ISOPEN) && tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0) { - splx(s); - return (1); + retval = 1; + goto out; } - splx(s); /* FALLTHROUGH */ case 0: /* exceptional */ if ((tp->t_state&TS_ISOPEN) && ((pti->pt_flags&PF_PKT && pti->pt_send) || - (pti->pt_flags&PF_UCNTL && pti->pt_ucntl))) - return (1); + (pti->pt_flags&PF_UCNTL && pti->pt_ucntl))) { + retval = 1; + goto out; + } selrecord(p, &pti->pt_selr, wql); break; @@ -599,20 +669,28 @@ ptcselect(dev, rw, wql, p) case FWRITE: if (tp->t_state&TS_ISOPEN) { if (pti->pt_flags & PF_REMOTE) { - if (tp->t_canq.c_cc == 0) - return (1); + if (tp->t_canq.c_cc == 0) { + retval = 1; + goto out; + } } else { - if (tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG-2) - return (1); - if (tp->t_canq.c_cc == 0 && (tp->t_iflag&ICANON)) - return (1); + if (tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG-2) { + retval = 1; + goto out; + } + if (tp->t_canq.c_cc == 0 && (tp->t_iflag&ICANON)) { + retval = 1; + goto out; + } } } selrecord(p, &pti->pt_selw, wql); break; } - return (0); +out: + (void) thread_funnel_set(kernel_flock, funnel_state); + return (retval); } FREE_BSDSTATIC int @@ -625,9 +703,12 @@ ptcwrite(dev, uio, flag) register u_char *cp = NULL; register int cc = 0; u_char locbuf[BUFSIZ]; - int cnt = 0; + int wcnt = 0; struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; int error = 0; + boolean_t funnel_state; + + funnel_state = thread_funnel_set(kernel_flock, TRUE); again: if ((tp->t_state&TS_ISOPEN) == 0) @@ -635,20 +716,21 @@ again: if (pti->pt_flags & PF_REMOTE) { if (tp->t_canq.c_cc) goto block; - while ((uio->uio_resid > 0 || cc > 0) && + while ((uio_resid(uio) > 0 || cc > 0) && tp->t_canq.c_cc < TTYHOG - 1) { if (cc == 0) { - cc = min(uio->uio_resid, BUFSIZ); + cc = min(uio_resid(uio), BUFSIZ); cc = min(cc, TTYHOG - 1 - tp->t_canq.c_cc); cp = locbuf; error = uiomove((caddr_t)cp, cc, uio); if (error) - return (error); + goto out; /* check again for safety */ if ((tp->t_state & TS_ISOPEN) == 0) { /* adjust as usual */ - uio->uio_resid += cc; - return (EIO); + uio_setresid(uio, (uio_resid(uio) + cc)); + error = EIO; + goto out; } } if (cc > 0) { @@ -666,24 +748,25 @@ again: } } /* adjust for data copied in but not written */ - uio->uio_resid += cc; + uio_setresid(uio, (uio_resid(uio) + cc)); (void) putc(0, &tp->t_canq); ttwakeup(tp); wakeup(TSA_PTS_READ(tp)); - return (0); + goto out; } - while (uio->uio_resid > 0 || cc > 0) { + while (uio_resid(uio) > 0 || cc > 0) { if (cc == 0) { - cc = min(uio->uio_resid, BUFSIZ); + cc = min(uio_resid(uio), BUFSIZ); cp = locbuf; error = uiomove((caddr_t)cp, cc, uio); if (error) - return (error); + goto out; /* check again for safety */ if ((tp->t_state & TS_ISOPEN) == 0) { /* adjust for data copied in but not written */ - uio->uio_resid += cc; - return (EIO); + uio_setresid(uio, (uio_resid(uio) + cc)); + error = EIO; + 
goto out; } } while (cc > 0) { @@ -693,12 +776,14 @@ again: goto block; } (*linesw[tp->t_line].l_rint)(*cp++, tp); - cnt++; + wcnt++; cc--; } cc = 0; } - return (0); +out: + (void) thread_funnel_set(kernel_flock, funnel_state); + return (error); block: /* * Come here to wait for slave to open, for space @@ -706,21 +791,22 @@ block: */ if ((tp->t_state & TS_CONNECTED) == 0) { /* adjust for data copied in but not written */ - uio->uio_resid += cc; - return (EIO); + uio_setresid(uio, (uio_resid(uio) + cc)); + error = EIO; + goto out; } if (flag & IO_NDELAY) { /* adjust for data copied in but not written */ - uio->uio_resid += cc; - if (cnt == 0) - return (EWOULDBLOCK); - return (0); + uio_setresid(uio, (uio_resid(uio) + cc)); + if (wcnt == 0) + error = EWOULDBLOCK; + goto out; } error = tsleep(TSA_PTC_WRITE(tp), TTOPRI | PCATCH, "ptcout", 0); if (error) { /* adjust for data copied in but not written */ - uio->uio_resid += cc; - return (error); + uio_setresid(uio, (uio_resid(uio) + cc)); + goto out; } goto again; } @@ -759,7 +845,10 @@ ptyioctl(dev, cmd, data, flag, p) register struct tty *tp = pt_tty[minor(dev)]; register struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; register u_char *cc = tp->t_cc; - int stop, error; + int stop, error = 0; + boolean_t funnel_state; + + funnel_state = thread_funnel_set(kernel_flock, TRUE); /* * IF CONTROLLER STTY THEN MUST FLUSH TO PREVENT A HANG. @@ -785,7 +874,7 @@ ptyioctl(dev, cmd, data, flag, p) } tp->t_lflag &= ~EXTPROC; } - return(0); + goto out; } else #ifndef NeXT if (cdevsw[major(dev)]->d_open == ptcopen) @@ -800,25 +889,29 @@ ptyioctl(dev, cmd, data, flag, p) * in that case, tp must be the controlling terminal. */ *(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : 0; - return (0); + goto out; case TIOCPKT: if (*(int *)data) { - if (pti->pt_flags & PF_UCNTL) - return (EINVAL); + if (pti->pt_flags & PF_UCNTL) { + error = EINVAL; + goto out; + } pti->pt_flags |= PF_PKT; } else pti->pt_flags &= ~PF_PKT; - return (0); + goto out; case TIOCUCNTL: if (*(int *)data) { - if (pti->pt_flags & PF_PKT) - return (EINVAL); + if (pti->pt_flags & PF_PKT) { + error = EINVAL; + goto out; + } pti->pt_flags |= PF_UCNTL; } else pti->pt_flags &= ~PF_UCNTL; - return (0); + goto out; case TIOCREMOTE: if (*(int *)data) @@ -826,9 +919,9 @@ ptyioctl(dev, cmd, data, flag, p) else pti->pt_flags &= ~PF_REMOTE; ttyflush(tp, FREAD|FWRITE); - return (0); + goto out; -#ifdef COMPAT_43 +#if COMPAT_43_TTY case TIOCSETP: case TIOCSETN: #endif @@ -841,30 +934,33 @@ ptyioctl(dev, cmd, data, flag, p) case TIOCSIG: if (*(unsigned int *)data >= NSIG || - *(unsigned int *)data == 0) - return(EINVAL); + *(unsigned int *)data == 0) { + error = EINVAL; + goto out; + } if ((tp->t_lflag&NOFLSH) == 0) ttyflush(tp, FREAD|FWRITE); pgsignal(tp->t_pgrp, *(unsigned int *)data, 1); if ((*(unsigned int *)data == SIGINFO) && ((tp->t_lflag&NOKERNINFO) == 0)) ttyinfo(tp); - return(0); + goto out; } error = (*linesw[tp->t_line].l_ioctl)(tp, cmd, data, flag, p); - if (error < 0) - error = ttioctl(tp, cmd, data, flag, p); - if (error < 0) { - if (pti->pt_flags & PF_UCNTL && - (cmd & ~0xff) == UIOCCMD(0)) { + if (error == ENOTTY) { + error = ttioctl(tp, cmd, data, flag, p); + if (error == ENOTTY + && pti->pt_flags & PF_UCNTL && (cmd & ~0xff) == UIOCCMD(0)) { + /* Process the UIOCMD ioctl group */ if (cmd & 0xff) { pti->pt_ucntl = (u_char)cmd; ptcwakeup(tp, FREAD); } - return (0); + error = 0; + goto out; } - error = ENOTTY; } + /* * If external processing and packet mode send ioctl packet. 
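 */

/*
 * [Editorial sketch -- not part of the patch.]  TIOCPKT mode, which the
 * ptyioctl() code above maintains, prefixes every master-side read with
 * one control byte (TIOCPKT_DATA == 0 for plain data; a nonzero byte is
 * a status report with no payload).  A typical consumer, assuming an
 * open master descriptor:
 */
#include <string.h>
#include <unistd.h>

static ssize_t
read_packet(int master_fd, char *buf, size_t len, int *ctrl)
{
	ssize_t n = read(master_fd, buf, len);

	if (n <= 0)
		return n;
	*ctrl = (unsigned char)buf[0];	/* TIOCPKT_DATA, _FLUSHREAD, ... */
	memmove(buf, buf + 1, (size_t)(n - 1));	/* strip the control byte */
	return n - 1;				/* payload length */
}

/*
 * (end of editorial sketch; the patch resumes below)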
*/ @@ -873,11 +969,11 @@ ptyioctl(dev, cmd, data, flag, p) case TIOCSETA: case TIOCSETAW: case TIOCSETAF: -#ifdef COMPAT_43 +#if COMPAT_43_TTY case TIOCSETP: case TIOCSETN: #endif -#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#if COMPAT_43_TTY || defined(COMPAT_SUNOS) case TIOCSETC: case TIOCSLTC: case TIOCLBIS: @@ -907,6 +1003,8 @@ ptyioctl(dev, cmd, data, flag, p) ptcwakeup(tp, FREAD); } } +out: + (void) thread_funnel_set(kernel_flock, funnel_state); return (error); } diff --git a/bsd/kern/tty_subr.c b/bsd/kern/tty_subr.c index 9bc116f30..1fcda738d 100644 --- a/bsd/kern/tty_subr.c +++ b/bsd/kern/tty_subr.c @@ -56,7 +56,6 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/buf.h> #include <sys/ioctl.h> #include <sys/tty.h> #include <sys/malloc.h> @@ -89,7 +88,7 @@ * Initialize clists. */ void -cinit() +cinit(void) { } @@ -346,10 +345,7 @@ out: * clrbit(cp, off + len); */ void -clrbits(cp, off, len) - u_char *cp; - int off; - int len; +clrbits(u_char *cp, int off, int len) { int sby, sbi, eby, ebi; register int i; @@ -385,13 +381,10 @@ clrbits(cp, off, len) * Return number of bytes not transfered. */ int -b_to_q(cp, count, clp) - u_char *cp; - int count; - struct clist *clp; +b_to_q(const u_char *cp, int count, struct clist *clp) { - register int cc; - register u_char *p = cp; + int cc; + const u_char *p = cp; int s; if (count <= 0) diff --git a/bsd/kern/tty_tb.c b/bsd/kern/tty_tb.c index c04197e24..d4b8e4d62 100644 --- a/bsd/kern/tty_tb.c +++ b/bsd/kern/tty_tb.c @@ -38,7 +38,7 @@ #include <sys/tablet.h> #include <sys/tty.h> #if NeXT -#include <sys/proc.h> +#include <sys/proc_internal.h> #endif /* diff --git a/bsd/kern/tty_tty.c b/bsd/kern/tty_tty.c index 359d71096..8489c7766 100644 --- a/bsd/kern/tty_tty.c +++ b/bsd/kern/tty_tty.c @@ -62,10 +62,10 @@ #include <sys/systm.h> #include <sys/conf.h> #include <sys/ioctl.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/tty.h> -#include <sys/vnode.h> -#include <sys/file.h> +#include <sys/vnode_internal.h> +#include <sys/file_internal.h> #ifndef NeXT #include <sys/kernel.h> #ifdef DEVFS @@ -78,6 +78,18 @@ static d_write_t cttywrite; static d_ioctl_t cttyioctl; static d_select_t cttyselect; +#endif /* !NeXT */ + +/* Forward declarations for cdevsw[] entry */ +/* XXX we should consider making these static */ +int cttyopen(dev_t dev, int flag, int mode, struct proc *p); +int cttyread(dev_t dev, struct uio *uio, int flag); +int cttywrite(dev_t dev, struct uio *uio, int flag); +int cttyioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p); +int cttyselect(dev_t dev, int flag, void* wql, struct proc *p); + +#ifndef NeXT + #define CDEV_MAJOR 1 /* Don't make static, fdesc_vnops uses this. */ struct cdevsw ctty_cdevsw = @@ -91,80 +103,59 @@ struct cdevsw ctty_cdevsw = /*ARGSUSED*/ int -cttyopen(dev, flag, mode, p) - dev_t dev; - int flag, mode; - struct proc *p; +cttyopen(__unused dev_t dev, int flag, __unused int mode, struct proc *p) { struct vnode *ttyvp = cttyvp(p); + struct vfs_context context; int error; if (ttyvp == NULL) return (ENXIO); -#ifndef NeXT - VOP_LOCK(ttyvp); -#else - /* - * This is the only place that NeXT Guarding has been used for - * VOP_.*LOCK style calls. Note all of the other diffs should - * use the three paramater lock/unlock. 
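 *
 * (editorial note, not part of the patch: this guarded-lock comment and
 * the vn_lock()/VOP_UNLOCK() bracket below are being deleted outright;
 * the replacement VNOP_*() calls take a struct vfs_context -- process
 * plus credential -- and do their own locking internally, which is the
 * pattern for every ctty entry point that follows)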
- */ - vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, p); -#endif -#ifdef PARANOID - /* - * Since group is tty and mode is 620 on most terminal lines - * and since sessions protect terminals from processes outside - * your session, this check is probably no longer necessary. - * Since it inhibits setuid root programs that later switch - * to another user from accessing /dev/tty, we have decided - * to delete this test. (mckusick 5/93) - */ - error = VOP_ACCESS(ttyvp, - (flag&FREAD ? VREAD : 0) | (flag&FWRITE ? VWRITE : 0), p->p_ucred, p); - if (!error) -#endif /* PARANOID */ - error = VOP_OPEN(ttyvp, flag, NOCRED, p); - VOP_UNLOCK(ttyvp, 0, p); + context.vc_proc = p; + context.vc_ucred = p->p_ucred; + error = VNOP_OPEN(ttyvp, flag, &context); + return (error); } /*ARGSUSED*/ int -cttyread(dev, uio, flag) - dev_t dev; - struct uio *uio; - int flag; +cttyread(__unused dev_t dev, struct uio *uio, int flag) { - struct proc *p = uio->uio_procp; - register struct vnode *ttyvp = cttyvp(uio->uio_procp); + struct proc *p = current_proc(); + register struct vnode *ttyvp = cttyvp(p); + struct vfs_context context; int error; if (ttyvp == NULL) return (EIO); - vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, p); - error = VOP_READ(ttyvp, uio, flag, NOCRED); - VOP_UNLOCK(ttyvp, 0, p); + + context.vc_proc = p; + context.vc_ucred = NOCRED; + + error = VNOP_READ(ttyvp, uio, flag, &context); + return (error); } /*ARGSUSED*/ int -cttywrite(dev, uio, flag) - dev_t dev; - struct uio *uio; - int flag; +cttywrite(__unused dev_t dev, struct uio *uio, int flag) { - struct proc *p = uio->uio_procp; - register struct vnode *ttyvp = cttyvp(uio->uio_procp); + struct proc *p = current_proc(); + register struct vnode *ttyvp = cttyvp(p); + struct vfs_context context; int error; if (ttyvp == NULL) return (EIO); - vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, p); - error = VOP_WRITE(ttyvp, uio, flag, NOCRED); - VOP_UNLOCK(ttyvp, 0, p); + + context.vc_proc = p; + context.vc_ucred = NOCRED; + + error = VNOP_WRITE(ttyvp, uio, flag, &context); + return (error); } @@ -179,15 +170,12 @@ cttyioctl(dev, cmd, addr, flag, p) struct proc *p; #else int -cttyioctl(dev, cmd, addr, flag, p) - dev_t dev; - u_long cmd; - caddr_t addr; - int flag; - struct proc *p; +cttyioctl(__unused dev_t dev, u_long cmd, caddr_t addr, int flag, + struct proc *p) #endif /* !NeXT */ { struct vnode *ttyvp = cttyvp(p); + struct vfs_context context; if (ttyvp == NULL) return (EIO); @@ -200,22 +188,25 @@ cttyioctl(dev, cmd, addr, flag, p) } else return (EINVAL); } - return (VOP_IOCTL(ttyvp, cmd, addr, flag, NOCRED, p)); + context.vc_proc = p; + context.vc_ucred = NOCRED; + + return (VNOP_IOCTL(ttyvp, cmd, addr, flag, &context)); } /*ARGSUSED*/ int -cttyselect(dev, flag, wql, p) - dev_t dev; - int flag; - void * wql; - struct proc *p; +cttyselect(__unused dev_t dev, int flag, void* wql, struct proc *p) { struct vnode *ttyvp = cttyvp(p); + struct vfs_context context; + + context.vc_proc = p; + context.vc_ucred = NOCRED; if (ttyvp == NULL) return (1); /* try operation to get EOF/failure */ - return (VOP_SELECT(ttyvp, flag, FREAD|FWRITE, NOCRED, wql, p)); + return (VNOP_SELECT(ttyvp, flag, FREAD|FWRITE, wql, &context)); } #ifndef NeXT diff --git a/bsd/kern/ubc_subr.c b/bsd/kern/ubc_subr.c index 2f85056f6..1660c5b7a 100644 --- a/bsd/kern/ubc_subr.c +++ b/bsd/kern/ubc_subr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -37,18 +37,25 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/lock.h> -#include <sys/ubc.h> -#include <sys/mount.h> -#include <sys/vnode.h> -#include <sys/ubc.h> +#include <sys/mman.h> +#include <sys/mount_internal.h> +#include <sys/vnode_internal.h> +#include <sys/ubc_internal.h> #include <sys/ucred.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/buf.h> #include <mach/mach_types.h> #include <mach/memory_object_types.h> +#include <mach/memory_object_control.h> +#include <mach/vm_map.h> +#include <mach/upl.h> +#include <kern/kern_types.h> #include <kern/zalloc.h> +#include <vm/vm_kern.h> +#include <vm/vm_protos.h> /* last */ #if DIAGNOSTIC #if defined(assert) @@ -60,79 +67,12 @@ #include <kern/assert.h> #endif /* DIAGNOSTIC */ -struct zone *ubc_info_zone; - -/* lock for changes to struct UBC */ -static __inline__ void -ubc_lock(struct vnode *vp) -{ - /* For now, just use the v_interlock */ - simple_lock(&vp->v_interlock); -} - -/* unlock */ -static __inline__ void -ubc_unlock(struct vnode *vp) -{ - /* For now, just use the v_interlock */ - simple_unlock(&vp->v_interlock); -} - -/* - * Serialize the requests to the VM - * Returns: - * 0 - Failure - * 1 - Sucessful in acquiring the lock - * 2 - Sucessful in acquiring the lock recursively - * do not call ubc_unbusy() - * [This is strange, but saves 4 bytes in struct ubc_info] - */ -static int -ubc_busy(struct vnode *vp) -{ - register struct ubc_info *uip; - - if (!UBCINFOEXISTS(vp)) - return (0); - - uip = vp->v_ubcinfo; - - while (ISSET(uip->ui_flags, UI_BUSY)) { - - if (uip->ui_owner == (void *)current_act()) - return (2); - - SET(uip->ui_flags, UI_WANTED); - (void) tsleep((caddr_t)&vp->v_ubcinfo, PINOD, "ubcbusy", 0); - - if (!UBCINFOEXISTS(vp)) - return (0); - } - uip->ui_owner = (void *)current_act(); - - SET(uip->ui_flags, UI_BUSY); - - return (1); -} +int ubc_info_init_internal(struct vnode *vp, int withfsize, off_t filesize); +int ubc_umcallback(vnode_t, void *); +int ubc_isinuse_locked(vnode_t, int, int); +int ubc_msync_internal(vnode_t, off_t, off_t, off_t *, int, int *); -static void -ubc_unbusy(struct vnode *vp) -{ - register struct ubc_info *uip; - - if (!UBCINFOEXISTS(vp)) { - wakeup((caddr_t)&vp->v_ubcinfo); - return; - } - uip = vp->v_ubcinfo; - CLR(uip->ui_flags, UI_BUSY); - uip->ui_owner = (void *)NULL; - - if (ISSET(uip->ui_flags, UI_WANTED)) { - CLR(uip->ui_flags, UI_WANTED); - wakeup((caddr_t)&vp->v_ubcinfo); - } -} +struct zone *ubc_info_zone; /* * Initialization of the zone for Unified Buffer Cache. @@ -153,50 +93,35 @@ ubc_init() */ int ubc_info_init(struct vnode *vp) +{ + return(ubc_info_init_internal(vp, 0, 0)); +} +int +ubc_info_init_withsize(struct vnode *vp, off_t filesize) +{ + return(ubc_info_init_internal(vp, 1, filesize)); +} + +int +ubc_info_init_internal(struct vnode *vp, int withfsize, off_t filesize) { register struct ubc_info *uip; void * pager; - struct vattr vattr; struct proc *p = current_proc(); int error = 0; kern_return_t kret; memory_object_control_t control; - if (!UBCISVALID(vp)) - return (EINVAL); + uip = vp->v_ubcinfo; - ubc_lock(vp); - if (ISSET(vp->v_flag, VUINIT)) { - /* - * other thread is already doing this - * wait till done - */ - while (ISSET(vp->v_flag, VUINIT)) { - SET(vp->v_flag, VUWANT); /* XXX overloaded! 
*/ - ubc_unlock(vp); - (void) tsleep((caddr_t)vp, PINOD, "ubcinfo", 0); - ubc_lock(vp); - } - ubc_unlock(vp); - return (0); - } else { - SET(vp->v_flag, VUINIT); - } + if (uip == UBC_INFO_NULL) { - uip = vp->v_ubcinfo; - if ((uip == UBC_INFO_NULL) || (uip == UBC_NOINFO)) { - ubc_unlock(vp); uip = (struct ubc_info *) zalloc(ubc_info_zone); - uip->ui_pager = MEMORY_OBJECT_NULL; - uip->ui_control = MEMORY_OBJECT_CONTROL_NULL; - uip->ui_flags = UI_INITED; + bzero((char *)uip, sizeof(struct ubc_info)); + uip->ui_vnode = vp; + uip->ui_flags = UI_INITED; uip->ui_ucred = NOCRED; - uip->ui_refcount = 1; - uip->ui_size = 0; - uip->ui_mapped = 0; - uip->ui_owner = (void *)NULL; - ubc_lock(vp); } #if DIAGNOSTIC else @@ -206,21 +131,17 @@ ubc_info_init(struct vnode *vp) assert(uip->ui_flags != UI_NONE); assert(uip->ui_vnode == vp); -#if 0 - if(ISSET(uip->ui_flags, UI_HASPAGER)) - goto done; -#endif /* 0 */ - /* now set this ubc_info in the vnode */ vp->v_ubcinfo = uip; - SET(uip->ui_flags, UI_HASPAGER); - ubc_unlock(vp); + pager = (void *)vnode_pager_setup(vp, uip->ui_pager); assert(pager); - ubc_setpager(vp, pager); + + SET(uip->ui_flags, UI_HASPAGER); + uip->ui_pager = pager; /* - * Note: We can not use VOP_GETATTR() to get accurate + * Note: We can not use VNOP_GETATTR() to get accurate * value of ui_size. Thanks to NFS. * nfs_getattr() can call vinvalbuf() and in this case * ubc_info is not set up to deal with that. @@ -244,25 +165,24 @@ ubc_info_init(struct vnode *vp) assert(control); uip->ui_control = control; /* cache the value of the mo control */ SET(uip->ui_flags, UI_HASOBJREF); /* with a named reference */ +#if 0 /* create a pager reference on the vnode */ error = vnode_pager_vget(vp); if (error) panic("ubc_info_init: vnode_pager_vget error = %d", error); - - /* initialize the size */ - error = VOP_GETATTR(vp, &vattr, p->p_ucred, p); - - ubc_lock(vp); - uip->ui_size = (error ? 
0: vattr.va_size); - -done: - CLR(vp->v_flag, VUINIT); - if (ISSET(vp->v_flag, VUWANT)) { - CLR(vp->v_flag, VUWANT); - ubc_unlock(vp); - wakeup((caddr_t)vp); - } else - ubc_unlock(vp); +#endif + if (withfsize == 0) { + struct vfs_context context; + /* initialize the size */ + context.vc_proc = p; + context.vc_ucred = kauth_cred_get(); + error = vnode_size(vp, &uip->ui_size, &context); + if (error) + uip->ui_size = 0; + } else { + uip->ui_size = filesize; + } + vp->v_lflag |= VNAMED_UBC; return (error); } @@ -271,16 +191,18 @@ done: static void ubc_info_free(struct ubc_info *uip) { - struct ucred *credp; + kauth_cred_t credp; credp = uip->ui_ucred; if (credp != NOCRED) { uip->ui_ucred = NOCRED; - crfree(credp); + kauth_cred_rele(credp); } if (uip->ui_control != MEMORY_OBJECT_CONTROL_NULL) memory_object_control_deallocate(uip->ui_control); + + cluster_release(uip); zfree(ubc_info_zone, (vm_offset_t)uip); return; @@ -289,20 +211,7 @@ ubc_info_free(struct ubc_info *uip) void ubc_info_deallocate(struct ubc_info *uip) { - - assert(uip->ui_refcount > 0); - - if (uip->ui_refcount-- == 1) { - struct vnode *vp; - - vp = uip->ui_vnode; - if (ISSET(uip->ui_flags, UI_WANTED)) { - CLR(uip->ui_flags, UI_WANTED); - wakeup((caddr_t)&vp->v_ubcinfo); - } - - ubc_info_free(uip); - } + ubc_info_free(uip); } /* @@ -321,9 +230,6 @@ ubc_setsize(struct vnode *vp, off_t nsize) if (nsize < (off_t)0) return (0); - if (UBCINVALID(vp)) - return (0); - if (!UBCINFOEXISTS(vp)) return (0); @@ -357,7 +263,7 @@ ubc_setsize(struct vnode *vp, off_t nsize) /* invalidate last page and old contents beyond nsize */ kret = memory_object_lock_request(control, (memory_object_offset_t)lastpg, - (memory_object_size_t)(olastpgend - lastpg), + (memory_object_size_t)(olastpgend - lastpg), NULL, NULL, MEMORY_OBJECT_RETURN_NONE, MEMORY_OBJECT_DATA_FLUSH, VM_PROT_NO_CHANGE); if (kret != KERN_SUCCESS) @@ -369,7 +275,7 @@ ubc_setsize(struct vnode *vp, off_t nsize) /* flush the last page */ kret = memory_object_lock_request(control, (memory_object_offset_t)lastpg, - PAGE_SIZE_64, + PAGE_SIZE_64, NULL, NULL, MEMORY_OBJECT_RETURN_DIRTY, FALSE, VM_PROT_NO_CHANGE); @@ -377,7 +283,7 @@ ubc_setsize(struct vnode *vp, off_t nsize) /* invalidate last page and old contents beyond nsize */ kret = memory_object_lock_request(control, (memory_object_offset_t)lastpg, - (memory_object_size_t)(olastpgend - lastpg), + (memory_object_size_t)(olastpgend - lastpg), NULL, NULL, MEMORY_OBJECT_RETURN_NONE, MEMORY_OBJECT_DATA_FLUSH, VM_PROT_NO_CHANGE); if (kret != KERN_SUCCESS) @@ -394,141 +300,50 @@ ubc_setsize(struct vnode *vp, off_t nsize) off_t ubc_getsize(struct vnode *vp) { - return (vp->v_ubcinfo->ui_size); -} - -/* - * Caller indicate that the object corresponding to the vnode - * can not be cached in object cache. Make it so. 
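 *
 * (The size initialization above uses the same context-based calls;
 * in sketch form:
 *
 *	struct vfs_context context;
 *	context.vc_proc = current_proc();
 *	context.vc_ucred = kauth_cred_get();
 *	error = vnode_size(vp, &uip->ui_size, &context);
 *
 * with ui_size forced to 0 when the lookup fails.)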
- * returns 1 on success, 0 on failure - */ -int -ubc_uncache(struct vnode *vp) -{ - kern_return_t kret; - struct ubc_info *uip; - int recursed; - memory_object_control_t control; - memory_object_perf_info_data_t perf; - - if (!UBCINFOEXISTS(vp)) - return (0); - - if ((recursed = ubc_busy(vp)) == 0) - return (0); - - uip = vp->v_ubcinfo; - - assert(uip != UBC_INFO_NULL); - - /* - * AGE it so that vfree() can make sure that it - * would get recycled soon after the last reference is gone - * This will insure that .nfs turds would not linger + /* people depend on the side effect of this working this way + * as they call this for directory */ - vagevp(vp); - - /* set the "do not cache" bit */ - SET(uip->ui_flags, UI_DONTCACHE); - - control = uip->ui_control; - assert(control); - - perf.cluster_size = PAGE_SIZE; /* XXX use real cluster_size. */ - perf.may_cache = FALSE; - kret = memory_object_change_attributes(control, - MEMORY_OBJECT_PERFORMANCE_INFO, - (memory_object_info_t) &perf, - MEMORY_OBJECT_PERF_INFO_COUNT); - - if (kret != KERN_SUCCESS) { - printf("ubc_uncache: memory_object_change_attributes_named " - "kret = %d", kret); - if (recursed == 1) - ubc_unbusy(vp); - return (0); - } - - ubc_release_named(vp); - - if (recursed == 1) - ubc_unbusy(vp); - return (1); + if (!UBCINFOEXISTS(vp)) + return ((off_t)0); + return (vp->v_ubcinfo->ui_size); } /* - * call ubc_clean() and ubc_uncache() on all the vnodes + * call ubc_sync_range(vp, 0, EOF, UBC_PUSHALL) on all the vnodes * for this mount point. * returns 1 on success, 0 on failure */ + __private_extern__ int ubc_umount(struct mount *mp) { - struct proc *p = current_proc(); - struct vnode *vp, *nvp; - int ret = 1; - -loop: - simple_lock(&mntvnode_slock); - for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { - if (vp->v_mount != mp) { - simple_unlock(&mntvnode_slock); - goto loop; - } - nvp = vp->v_mntvnodes.le_next; - simple_unlock(&mntvnode_slock); - if (UBCINFOEXISTS(vp)) { - - /* - * Must get a valid reference on the vnode - * before callig UBC functions - */ - if (vget(vp, 0, p)) { - ret = 0; - simple_lock(&mntvnode_slock); - continue; /* move on to the next vnode */ - } - ret &= ubc_clean(vp, 0); /* do not invalidate */ - ret &= ubc_uncache(vp); - vrele(vp); - } - simple_lock(&mntvnode_slock); - } - simple_unlock(&mntvnode_slock); - return (ret); + vnode_iterate(mp, 0, ubc_umcallback, 0); + return(0); } -/* - * Call ubc_unmount() for all filesystems. - * The list is traversed in reverse order - * of mounting to avoid dependencies. - */ -__private_extern__ void -ubc_unmountall() +static int +ubc_umcallback(vnode_t vp, __unused void * args) { - struct mount *mp, *nmp; - /* - * Since this only runs when rebooting, it is not interlocked. 
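 *
 * The replacement relies on vnode_iterate(), which takes over all of
 * the mount list locking and vnode holding; a caller only supplies a
 * callback of this shape (sketch; my_callback is a placeholder name):
 *
 *	static int
 *	my_callback(vnode_t vp, void *arg)
 *	{
 *		...
 *		return (VNODE_RETURNED);
 *	}
 *
 *	vnode_iterate(mp, 0, my_callback, NULL);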
- */ - for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { - nmp = mp->mnt_list.cqe_prev; - (void) ubc_umount(mp); + if (UBCINFOEXISTS(vp)) { + + cluster_push(vp, 0); + + (void) ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL, UBC_PUSHALL); } + return (VNODE_RETURNED); } + + /* Get the credentials */ -struct ucred * +kauth_cred_t ubc_getcred(struct vnode *vp) { - struct ubc_info *uip; - - uip = vp->v_ubcinfo; + if (UBCINFOEXISTS(vp)) + return (vp->v_ubcinfo->ui_ucred); - if (UBCINVALID(vp)) - return (NOCRED); - - return (uip->ui_ucred); + return (NOCRED); } /* @@ -540,18 +355,20 @@ int ubc_setcred(struct vnode *vp, struct proc *p) { struct ubc_info *uip; - struct ucred *credp; - - uip = vp->v_ubcinfo; + kauth_cred_t credp; - if (UBCINVALID(vp)) + if ( !UBCINFOEXISTS(vp)) return (0); + vnode_lock(vp); + + uip = vp->v_ubcinfo; credp = uip->ui_ucred; + if (credp == NOCRED) { - crhold(p->p_ucred); - uip->ui_ucred = p->p_ucred; + uip->ui_ucred = kauth_cred_proc_ref(p); } + vnode_unlock(vp); return (1); } @@ -560,14 +377,10 @@ ubc_setcred(struct vnode *vp, struct proc *p) __private_extern__ memory_object_t ubc_getpager(struct vnode *vp) { - struct ubc_info *uip; - - uip = vp->v_ubcinfo; - - if (UBCINVALID(vp)) - return (0); + if (UBCINFOEXISTS(vp)) + return (vp->v_ubcinfo->ui_pager); - return (uip->ui_pager); + return (0); } /* @@ -579,458 +392,217 @@ ubc_getpager(struct vnode *vp) */ memory_object_control_t -ubc_getobject(struct vnode *vp, int flags) +ubc_getobject(struct vnode *vp, __unused int flags) { - struct ubc_info *uip; - int recursed; - memory_object_control_t control; + if (UBCINFOEXISTS(vp)) + return((vp->v_ubcinfo->ui_control)); - if (UBCINVALID(vp)) - return (0); - - if (flags & UBC_FOR_PAGEOUT) - return(vp->v_ubcinfo->ui_control); - - if ((recursed = ubc_busy(vp)) == 0) - return (0); - - uip = vp->v_ubcinfo; - control = uip->ui_control; - - if ((flags & UBC_HOLDOBJECT) && (!ISSET(uip->ui_flags, UI_HASOBJREF))) { - - /* - * Take a temporary reference on the ubc info so that it won't go - * away during our recovery attempt. 
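 *
 * A note on credentials in this file: a ubc_info now holds at most
 * one kauth credential reference, taken and dropped as a pair
 * (sketch):
 *
 *	uip->ui_ucred = kauth_cred_proc_ref(p);    taken in ubc_setcred
 *	kauth_cred_rele(uip->ui_ucred);            dropped in ubc_info_free
 *
 * replacing the old crhold()/crfree() calls on struct ucred.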
- */ - ubc_lock(vp); - uip->ui_refcount++; - ubc_unlock(vp); - if (memory_object_recover_named(control, TRUE) == KERN_SUCCESS) { - SET(uip->ui_flags, UI_HASOBJREF); - } else { - control = MEMORY_OBJECT_CONTROL_NULL; - } - if (recursed == 1) - ubc_unbusy(vp); - ubc_info_deallocate(uip); - - } else { - if (recursed == 1) - ubc_unbusy(vp); - } - - return (control); + return (0); } -/* Set the pager */ -int -ubc_setpager(struct vnode *vp, memory_object_t pager) -{ - struct ubc_info *uip; - - uip = vp->v_ubcinfo; - - if (UBCINVALID(vp)) - return (0); - - uip->ui_pager = pager; - return (1); -} - -int -ubc_setflags(struct vnode * vp, int flags) -{ - struct ubc_info *uip; - - if (UBCINVALID(vp)) - return (0); - - uip = vp->v_ubcinfo; - - SET(uip->ui_flags, flags); - - return (1); -} - -int -ubc_clearflags(struct vnode * vp, int flags) -{ - struct ubc_info *uip; - - if (UBCINVALID(vp)) - return (0); - - uip = vp->v_ubcinfo; - - CLR(uip->ui_flags, flags); - - return (1); -} - - -int -ubc_issetflags(struct vnode * vp, int flags) -{ - struct ubc_info *uip; - - if (UBCINVALID(vp)) - return (0); - - uip = vp->v_ubcinfo; - - return (ISSET(uip->ui_flags, flags)); -} off_t -ubc_blktooff(struct vnode *vp, daddr_t blkno) +ubc_blktooff(vnode_t vp, daddr64_t blkno) { off_t file_offset; int error; - if (UBCINVALID(vp)) - return ((off_t)-1); + if (UBCINVALID(vp)) + return ((off_t)-1); - error = VOP_BLKTOOFF(vp, blkno, &file_offset); + error = VNOP_BLKTOOFF(vp, blkno, &file_offset); if (error) file_offset = -1; return (file_offset); } -daddr_t -ubc_offtoblk(struct vnode *vp, off_t offset) +daddr64_t +ubc_offtoblk(vnode_t vp, off_t offset) { - daddr_t blkno; + daddr64_t blkno; int error = 0; - if (UBCINVALID(vp)) { - return ((daddr_t)-1); - } + if (UBCINVALID(vp)) + return ((daddr64_t)-1); - error = VOP_OFFTOBLK(vp, offset, &blkno); + error = VNOP_OFFTOBLK(vp, offset, &blkno); if (error) blkno = -1; return (blkno); } -/* - * Cause the file data in VM to be pushed out to the storage - * it also causes all currently valid pages to be released - * returns 1 on success, 0 on failure - */ int -ubc_clean(struct vnode *vp, int invalidate) +ubc_pages_resident(vnode_t vp) { - off_t size; - struct ubc_info *uip; - memory_object_control_t control; - kern_return_t kret; - int flags = 0; - - if (UBCINVALID(vp)) + kern_return_t kret; + boolean_t has_pages_resident; + + if ( !UBCINFOEXISTS(vp)) return (0); - - if (!UBCINFOEXISTS(vp)) + + kret = memory_object_pages_resident(vp->v_ubcinfo->ui_control, &has_pages_resident); + + if (kret != KERN_SUCCESS) return (0); + + if (has_pages_resident == TRUE) + return (1); + + return (0); +} - /* - * if invalidate was requested, write dirty data and then discard - * the resident pages - */ - if (invalidate) - flags = (MEMORY_OBJECT_DATA_FLUSH | MEMORY_OBJECT_DATA_NO_CHANGE); - - uip = vp->v_ubcinfo; - size = uip->ui_size; /* call ubc_getsize() ??? */ - - control = uip->ui_control; - assert(control); - - cluster_release(vp); - vp->v_clen = 0; - - /* Write the dirty data in the file and discard cached pages */ - kret = memory_object_lock_request(control, - (memory_object_offset_t)0, - (memory_object_size_t)round_page_64(size), - MEMORY_OBJECT_RETURN_ALL, flags, - VM_PROT_NO_CHANGE); - - if (kret != KERN_SUCCESS) - printf("ubc_clean: clean failed (error = %d)\n", kret); - return ((kret == KERN_SUCCESS) ? 
1 : 0); -} /* - * Cause the file data in VM to be pushed out to the storage - * currently valid pages are NOT invalidated - * returns 1 on success, 0 on failure + * This interface will eventually be deprecated + * + * clean and/or invalidate a range in the memory object that backs this + * vnode. The start offset is truncated to the page boundary and the + * size is adjusted to include the last page in the range. + * + * returns 1 for success, 0 for failure */ int -ubc_pushdirty(struct vnode *vp) +ubc_sync_range(vnode_t vp, off_t beg_off, off_t end_off, int flags) { - off_t size; - struct ubc_info *uip; - memory_object_control_t control; - kern_return_t kret; - - if (UBCINVALID(vp)) - return (0); - - if (!UBCINFOEXISTS(vp)) - return (0); - - uip = vp->v_ubcinfo; - size = uip->ui_size; /* call ubc_getsize() ??? */ - - control = uip->ui_control; - assert(control); - - vp->v_flag &= ~VHASDIRTY; - vp->v_clen = 0; - - /* Write the dirty data in the file and discard cached pages */ - kret = memory_object_lock_request(control, - (memory_object_offset_t)0, - (memory_object_size_t)round_page_64(size), - MEMORY_OBJECT_RETURN_DIRTY, FALSE, - VM_PROT_NO_CHANGE); - - if (kret != KERN_SUCCESS) - printf("ubc_pushdirty: flush failed (error = %d)\n", kret); - - return ((kret == KERN_SUCCESS) ? 1 : 0); + return (ubc_msync_internal(vp, beg_off, end_off, NULL, flags, NULL)); } + /* - * Cause the file data in VM to be pushed out to the storage - * currently valid pages are NOT invalidated - * returns 1 on success, 0 on failure + * clean and/or invalidate a range in the memory object that backs this + * vnode. The start offset is truncated to the page boundary and the + * size is adjusted to include the last page in the range. + * if a */ -int -ubc_pushdirty_range(struct vnode *vp, off_t offset, off_t size) +errno_t +ubc_msync(vnode_t vp, off_t beg_off, off_t end_off, off_t *resid_off, int flags) { - struct ubc_info *uip; - memory_object_control_t control; - kern_return_t kret; - - if (UBCINVALID(vp)) - return (0); - - if (!UBCINFOEXISTS(vp)) - return (0); - - uip = vp->v_ubcinfo; + int retval; + int io_errno = 0; + + if (resid_off) + *resid_off = beg_off; - control = uip->ui_control; - assert(control); + retval = ubc_msync_internal(vp, beg_off, end_off, resid_off, flags, &io_errno); - /* Write any dirty pages in the requested range of the file: */ - kret = memory_object_lock_request(control, - (memory_object_offset_t)offset, - (memory_object_size_t)round_page_64(size), - MEMORY_OBJECT_RETURN_DIRTY, FALSE, - VM_PROT_NO_CHANGE); + if (retval == 0 && io_errno == 0) + return (EINVAL); + return (io_errno); +} - if (kret != KERN_SUCCESS) - printf("ubc_pushdirty_range: flush failed (error = %d)\n", kret); - return ((kret == KERN_SUCCESS) ? 1 : 0); -} /* - * Make sure the vm object does not vanish - * returns 1 if the hold count was incremented - * returns 0 if the hold count was not incremented - * This return value should be used to balance - * ubc_hold() and ubc_rele(). + * clean and/or invalidate a range in the memory object that backs this + * vnode. The start offset is truncated to the page boundary and the + * size is adjusted to include the last page in the range. 
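 *
 * A typical caller pushes every dirty page of a file and waits for
 * the I/O, e.g. (sketch):
 *
 *	error = ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL,
 *	                  UBC_PUSHDIRTY | UBC_SYNC);
 *
 * UBC_PUSHDIRTY returns dirty pages to the pager, UBC_PUSHALL returns
 * dirty and precious pages, UBC_INVALIDATE discards resident pages,
 * and UBC_SYNC makes the request wait for the I/O to complete.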
*/ -int -ubc_hold(struct vnode *vp) +static int +ubc_msync_internal(vnode_t vp, off_t beg_off, off_t end_off, off_t *resid_off, int flags, int *io_errno) { - struct ubc_info *uip; - int recursed; - memory_object_control_t object; - -retry: - - if (UBCINVALID(vp)) - return (0); + memory_object_size_t tsize; + kern_return_t kret; + int request_flags = 0; + int flush_flags = MEMORY_OBJECT_RETURN_NONE; + + if ( !UBCINFOEXISTS(vp)) + return (0); + if (end_off <= beg_off) + return (0); + if ((flags & (UBC_INVALIDATE | UBC_PUSHDIRTY | UBC_PUSHALL)) == 0) + return (0); + + if (flags & UBC_INVALIDATE) + /* + * discard the resident pages + */ + request_flags = (MEMORY_OBJECT_DATA_FLUSH | MEMORY_OBJECT_DATA_NO_CHANGE); - ubc_lock(vp); - if (ISSET(vp->v_flag, VUINIT)) { - /* - * other thread is not done initializing this - * yet, wait till it's done and try again + if (flags & UBC_SYNC) + /* + * wait for all the I/O to complete before returning */ - while (ISSET(vp->v_flag, VUINIT)) { - SET(vp->v_flag, VUWANT); /* XXX overloaded! */ - ubc_unlock(vp); - (void) tsleep((caddr_t)vp, PINOD, "ubchold", 0); - ubc_lock(vp); - } - ubc_unlock(vp); - goto retry; - } - ubc_unlock(vp); + request_flags |= MEMORY_OBJECT_IO_SYNC; - if ((recursed = ubc_busy(vp)) == 0) { - /* must be invalid or dying vnode */ - assert(UBCINVALID(vp) || - ((vp->v_flag & VXLOCK) || (vp->v_flag & VTERMINATE))); - return (0); - } + if (flags & UBC_PUSHDIRTY) + /* + * we only return the dirty pages in the range + */ + flush_flags = MEMORY_OBJECT_RETURN_DIRTY; - uip = vp->v_ubcinfo; - assert(uip->ui_control != MEMORY_OBJECT_CONTROL_NULL); - - ubc_lock(vp); - uip->ui_refcount++; - ubc_unlock(vp); - - if (!ISSET(uip->ui_flags, UI_HASOBJREF)) { - if (memory_object_recover_named(uip->ui_control, TRUE) - != KERN_SUCCESS) { - if (recursed == 1) - ubc_unbusy(vp); - ubc_info_deallocate(uip); - return (0); - } - SET(uip->ui_flags, UI_HASOBJREF); - } - if (recursed == 1) - ubc_unbusy(vp); + if (flags & UBC_PUSHALL) + /* + * then return all the interesting pages in the range (both dirty and precious) + * to the pager + */ + flush_flags = MEMORY_OBJECT_RETURN_ALL; - assert(uip->ui_refcount > 0); + beg_off = trunc_page_64(beg_off); + end_off = round_page_64(end_off); + tsize = (memory_object_size_t)end_off - beg_off; - return (1); + /* flush and/or invalidate pages in the range requested */ + kret = memory_object_lock_request(vp->v_ubcinfo->ui_control, + beg_off, tsize, resid_off, io_errno, + flush_flags, request_flags, VM_PROT_NO_CHANGE); + + return ((kret == KERN_SUCCESS) ? 1 : 0); } -/* - * Drop the holdcount. - * release the reference on the vm object if the this is "uncached" - * ubc_info. - */ -void -ubc_rele(struct vnode *vp) -{ - struct ubc_info *uip; - - if (UBCINVALID(vp)) - return; - - if (!UBCINFOEXISTS(vp)) { - /* nothing more to do for a dying vnode */ - if ((vp->v_flag & VXLOCK) || (vp->v_flag & VTERMINATE)) - return; - panic("ubc_rele: can not"); - } - - uip = vp->v_ubcinfo; - - if (uip->ui_refcount == 1) - panic("ubc_rele: ui_refcount"); - - --uip->ui_refcount; - - if ((uip->ui_refcount == 1) - && ISSET(uip->ui_flags, UI_DONTCACHE)) - (void) ubc_release_named(vp); - - return; -} /* * The vnode is mapped explicitly, mark it so. 
*/ -__private_extern__ void -ubc_map(struct vnode *vp) +__private_extern__ int +ubc_map(vnode_t vp, int flags) { struct ubc_info *uip; + int error = 0; + int need_ref = 0; + struct vfs_context context; - if (UBCINVALID(vp)) - return; - - if (!UBCINFOEXISTS(vp)) - return; - - ubc_lock(vp); - uip = vp->v_ubcinfo; - - SET(uip->ui_flags, UI_WASMAPPED); - uip->ui_mapped = 1; - ubc_unlock(vp); + if (vnode_getwithref(vp)) + return (0); - return; -} + if (UBCINFOEXISTS(vp)) { + context.vc_proc = current_proc(); + context.vc_ucred = kauth_cred_get(); -/* - * Release the memory object reference on the vnode - * only if it is not in use - * Return 1 if the reference was released, 0 otherwise. - */ -int -ubc_release_named(struct vnode *vp) -{ - struct ubc_info *uip; - int recursed; - memory_object_control_t control; - kern_return_t kret = KERN_FAILURE; + error = VNOP_MMAP(vp, flags, &context); - if (UBCINVALID(vp)) - return (0); + if (error != EPERM) + error = 0; - if ((recursed = ubc_busy(vp)) == 0) - return (0); - uip = vp->v_ubcinfo; + if (error == 0) { + vnode_lock(vp); + + uip = vp->v_ubcinfo; - /* can not release held or mapped vnodes */ - if (ISSET(uip->ui_flags, UI_HASOBJREF) && - (uip->ui_refcount == 1) && !uip->ui_mapped) { - control = uip->ui_control; - assert(control); + if ( !ISSET(uip->ui_flags, UI_ISMAPPED)) + need_ref = 1; + SET(uip->ui_flags, (UI_WASMAPPED | UI_ISMAPPED)); - // XXXdbg - if (vp->v_flag & VDELETED) { - ubc_setsize(vp, (off_t)0); + vnode_unlock(vp); + + if (need_ref) + vnode_ref(vp); } - - CLR(uip->ui_flags, UI_HASOBJREF); - kret = memory_object_release_name(control, - MEMORY_OBJECT_RESPECT_CACHE); } + vnode_put(vp); - if (recursed == 1) - ubc_unbusy(vp); - return ((kret != KERN_SUCCESS) ? 0 : 1); -} - -/* - * This function used to called by extensions directly. Some may - * still exist with this behavior. In those cases, we will do the - * release as part of reclaiming or cleaning the vnode. We don't - * need anything explicit - so just stub this out until those callers - * get cleaned up. - */ -int -ubc_release( - struct vnode *vp) -{ - return 0; + return (error); } /* * destroy the named reference for a given vnode */ __private_extern__ int -ubc_destroy_named( - struct vnode *vp) +ubc_destroy_named(struct vnode *vp) { memory_object_control_t control; - struct proc *p; struct ubc_info *uip; kern_return_t kret; @@ -1046,10 +618,6 @@ ubc_destroy_named( uip = vp->v_ubcinfo; - /* can not destroy held vnodes */ - if (uip->ui_refcount > 1) - return (0); - /* * Terminate the memory object. * memory_object_destroy() will result in @@ -1060,6 +628,9 @@ ubc_destroy_named( control = ubc_getobject(vp, UBC_HOLDOBJECT); if (control != MEMORY_OBJECT_CONTROL_NULL) { + /* + * XXXXX - should we hold the vnode lock here? + */ if (ISSET(vp->v_flag, VTERMINATE)) panic("ubc_destroy_named: already teminating"); SET(vp->v_flag, VTERMINATE); @@ -1074,115 +645,83 @@ ubc_destroy_named( * wait for vnode_pager_no_senders() to clear * VTERMINATE */ - while (ISSET(vp->v_flag, VTERMINATE)) { - SET(vp->v_flag, VTERMWANT); - (void)tsleep((caddr_t)&vp->v_ubcinfo, + vnode_lock(vp); + while (ISSET(vp->v_lflag, VNAMED_UBC)) { + (void)msleep((caddr_t)&vp->v_lflag, &vp->v_lock, PINOD, "ubc_destroy_named", 0); } + vnode_unlock(vp); } return (1); } /* - * Invalidate a range in the memory object that backs this - * vnode. The offset is truncated to the page boundary and the - * size is adjusted to include the last page in the range. 
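 *
 * (Former callers of ubc_invalidate() get the same effect from the
 * new interface, roughly:
 *
 *	ubc_msync(vp, offset, offset + size, NULL, UBC_INVALIDATE);
 *
 * since ubc_msync_internal() does the page-boundary truncation and
 * rounding itself.)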
+ * Find out whether a vnode is in use by UBC + * Returns 1 if file is in use by UBC, 0 if not */ int -ubc_invalidate(struct vnode *vp, off_t offset, size_t size) +ubc_isinuse(struct vnode *vp, int busycount) { - struct ubc_info *uip; - memory_object_control_t control; - kern_return_t kret; - off_t toff; - size_t tsize; - - if (UBCINVALID(vp)) + if ( !UBCINFOEXISTS(vp)) return (0); - - if (!UBCINFOEXISTS(vp)) - return (0); - - toff = trunc_page_64(offset); - tsize = (size_t)(round_page_64(offset+size) - toff); - uip = vp->v_ubcinfo; - control = uip->ui_control; - assert(control); - - /* invalidate pages in the range requested */ - kret = memory_object_lock_request(control, - (memory_object_offset_t)toff, - (memory_object_size_t)tsize, - MEMORY_OBJECT_RETURN_NONE, - (MEMORY_OBJECT_DATA_NO_CHANGE| MEMORY_OBJECT_DATA_FLUSH), - VM_PROT_NO_CHANGE); - if (kret != KERN_SUCCESS) - printf("ubc_invalidate: invalidate failed (error = %d)\n", kret); - - return ((kret == KERN_SUCCESS) ? 1 : 0); + return(ubc_isinuse_locked(vp, busycount, 0)); } -/* - * Find out whether a vnode is in use by UBC - * Returns 1 if file is in use by UBC, 0 if not - */ + int -ubc_isinuse(struct vnode *vp, int busycount) +ubc_isinuse_locked(struct vnode *vp, int busycount, int locked) { - if (!UBCINFOEXISTS(vp)) - return (0); + int retval = 0; - if (busycount == 0) { - printf("ubc_isinuse: called without a valid reference" - ": v_tag = %d\v", vp->v_tag); - vprint("ubc_isinuse", vp); - return (0); - } - if (vp->v_usecount > busycount+1) - return (1); + if (!locked) + vnode_lock(vp); - if ((vp->v_usecount == busycount+1) - && (vp->v_ubcinfo->ui_mapped == 1)) - return (1); - else - return (0); + if ((vp->v_usecount - vp->v_kusecount) > busycount) + retval = 1; + + if (!locked) + vnode_unlock(vp); + return (retval); } + /* - * The backdoor routine to clear the ui_mapped. * MUST only be called by the VM - * - * Note that this routine is not called under funnel. There are numerous - * things about the calling sequence that make this work on SMP. - * Any code change in those paths can break this. 
- * */ __private_extern__ void ubc_unmap(struct vnode *vp) { + struct vfs_context context; struct ubc_info *uip; - boolean_t funnel_state; - - if (UBCINVALID(vp)) - return; + int need_rele = 0; - if (!UBCINFOEXISTS(vp)) - return; + if (vnode_getwithref(vp)) + return; - ubc_lock(vp); - uip = vp->v_ubcinfo; - uip->ui_mapped = 0; - if ((uip->ui_refcount > 1) || !ISSET(uip->ui_flags, UI_DONTCACHE)) { - ubc_unlock(vp); - return; - } - ubc_unlock(vp); + if (UBCINFOEXISTS(vp)) { + vnode_lock(vp); - funnel_state = thread_funnel_set(kernel_flock, TRUE); - (void) ubc_release_named(vp); - (void) thread_funnel_set(kernel_flock, funnel_state); + uip = vp->v_ubcinfo; + if (ISSET(uip->ui_flags, UI_ISMAPPED)) { + CLR(uip->ui_flags, UI_ISMAPPED); + need_rele = 1; + } + vnode_unlock(vp); + + if (need_rele) { + context.vc_proc = current_proc(); + context.vc_ucred = kauth_cred_get(); + (void)VNOP_MNOMAP(vp, &context); + + vnode_rele(vp); + } + } + /* + * the drop of the vnode ref will cleanup + */ + vnode_put(vp); } kern_return_t @@ -1254,7 +793,6 @@ ubc_create_upl( memory_object_control_t control; int count; int ubcflags; - off_t file_offset; kern_return_t kr; if (bufsize & 0xfff) @@ -1378,3 +916,46 @@ ubc_upl_pageinfo( { return (UPL_GET_INTERNAL_PAGE_LIST(upl)); } + +/************* UBC APIS **************/ + +int +UBCINFOMISSING(struct vnode * vp) +{ + return((vp) && ((vp)->v_type == VREG) && ((vp)->v_ubcinfo == UBC_INFO_NULL)); +} + +int +UBCINFORECLAIMED(struct vnode * vp) +{ + return((vp) && ((vp)->v_type == VREG) && ((vp)->v_ubcinfo == UBC_INFO_NULL)); +} + + +int +UBCINFOEXISTS(struct vnode * vp) +{ + return((vp) && ((vp)->v_type == VREG) && ((vp)->v_ubcinfo != UBC_INFO_NULL)); +} +int +UBCISVALID(struct vnode * vp) +{ + return((vp) && ((vp)->v_type == VREG) && !((vp)->v_flag & VSYSTEM)); +} +int +UBCINVALID(struct vnode * vp) +{ + return(((vp) == NULL) || ((vp) && ((vp)->v_type != VREG)) + || ((vp) && ((vp)->v_flag & VSYSTEM))); +} +int +UBCINFOCHECK(const char * fun, struct vnode * vp) +{ + if ((vp) && ((vp)->v_type == VREG) && + ((vp)->v_ubcinfo == UBC_INFO_NULL)) { + panic("%s: lost ubc_info", (fun)); + return(1); + } else + return(0); +} + diff --git a/bsd/kern/uipc_domain.c b/bsd/kern/uipc_domain.c index efb0a3cd6..9be151def 100644 --- a/bsd/kern/uipc_domain.c +++ b/bsd/kern/uipc_domain.c @@ -64,13 +64,13 @@ #include <sys/time.h> #include <sys/kernel.h> #include <sys/systm.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/sysctl.h> #include <sys/syslog.h> #include <sys/queue.h> -void pffasttimo __P((void *)); -void pfslowtimo __P((void *)); +void pffasttimo(void *); +void pfslowtimo(void *); /* * Add/delete 'domain': Link structure into system list, @@ -78,11 +78,21 @@ void pfslowtimo __P((void *)); * To delete, just remove from the list (dom_refs must be zero) */ +lck_grp_t *domain_proto_mtx_grp; +lck_attr_t *domain_proto_mtx_attr; +static lck_grp_attr_t *domain_proto_mtx_grp_attr; +lck_mtx_t *domain_proto_mtx; +extern int do_reclaim; void init_domain(register struct domain *dp) { struct protosw *pr; + if ((dp->dom_mtx = lck_mtx_alloc_init(domain_proto_mtx_grp, domain_proto_mtx_attr)) == NULL) { + printf("init_domain: can't init domain mtx for domain=%s\n", dp->dom_name); + return; /* we have a problem... 
*/ + } + if (dp->dom_init) (*dp->dom_init)(); @@ -109,6 +119,7 @@ void init_domain(register struct domain *dp) void concat_domain(struct domain *dp) { + lck_mtx_assert(domain_proto_mtx, LCK_MTX_ASSERT_OWNED); dp->dom_next = domains; domains = dp; } @@ -116,33 +127,30 @@ void concat_domain(struct domain *dp) void net_add_domain(register struct domain *dp) { register struct protosw *pr; - register int s; - extern int splhigh(void); - extern int splx(int); kprintf("Adding domain %s (family %d)\n", dp->dom_name, dp->dom_family); /* First, link in the domain */ - s = splhigh(); + lck_mtx_lock(domain_proto_mtx); concat_domain(dp); init_domain(dp); + lck_mtx_unlock(domain_proto_mtx); - splx(s); } int net_del_domain(register struct domain *dp) { register struct domain *dp1, *dp2; - register int s, retval = 0; - extern int splhigh(void); - extern int splx(int); + register int retval = 0; + + lck_mtx_lock(domain_proto_mtx); - if (dp->dom_refs) + if (dp->dom_refs) { + lck_mtx_unlock(domain_proto_mtx); return(EBUSY); - - s = splhigh(); + } for (dp2 = NULL, dp1 = domains; dp1; dp2 = dp1, dp1 = dp1->dom_next) { if (dp == dp1) @@ -155,27 +163,24 @@ net_del_domain(register struct domain *dp) domains = dp1->dom_next; } else retval = EPFNOSUPPORT; - splx(s); + lck_mtx_unlock(domain_proto_mtx); return(retval); } /* * net_add_proto - link a protosw into a domain's protosw chain + * + * note: protocols must use their own domain lock before calling net_add_proto */ int net_add_proto(register struct protosw *pp, register struct domain *dp) { register struct protosw *pp1, *pp2; - register int s; - extern int splhigh(void); - extern int splx(int); - s = splhigh(); for (pp2 = NULL, pp1 = dp->dom_protosw; pp1; pp1 = pp1->pr_next) { if (pp1->pr_type == pp->pr_type && pp1->pr_protocol == pp->pr_protocol) { - splx(s); return(EEXIST); } pp2 = pp1; @@ -185,13 +190,12 @@ net_add_proto(register struct protosw *pp, else pp2->pr_next = pp; pp->pr_next = NULL; - TAILQ_INIT(&pp->pr_sfilter); + TAILQ_INIT(&pp->pr_filter_head); if (pp->pr_init) (*pp->pr_init)(); /* Make sure pr_init isn't called again!! */ pp->pr_init = 0; - splx(s); return(0); } @@ -199,17 +203,15 @@ net_add_proto(register struct protosw *pp, * net_del_proto - remove a protosw from a domain's protosw chain. * Search the protosw chain for the element with matching data. * Then unlink and return. 
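 *
 * With the splhigh()/splx() protection gone, a protocol module is
 * expected to provide the serialization itself, e.g. (sketch):
 *
 *	lck_mtx_lock(dp->dom_mtx);
 *	net_del_proto(pp->pr_type, pp->pr_protocol, dp);
 *	lck_mtx_unlock(dp->dom_mtx);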
+ * + * note: protocols must use their own domain lock before calling net_del_proto */ int net_del_proto(register int type, register int protocol, register struct domain *dp) { register struct protosw *pp1, *pp2; - int s; - extern int splhigh(void); - extern int splx(int); - s = splhigh(); for (pp2 = NULL, pp1 = dp->dom_protosw; pp1; pp1 = pp1->pr_next) { if (pp1->pr_type == type && pp1->pr_protocol == protocol) @@ -217,14 +219,12 @@ net_del_proto(register int type, pp2 = pp1; } if (pp1 == NULL) { - splx(s); return(ENXIO); } if (pp2) pp2->pr_next = pp1->pr_next; else dp->dom_protosw = pp1->pr_next; - splx(s); return(0); } @@ -255,11 +255,30 @@ domaininit() extern struct domain keydomain; #endif + /* + * allocate lock group attribute and group for domain mutexes + */ + domain_proto_mtx_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setdefault(domain_proto_mtx_grp_attr); + + domain_proto_mtx_grp = lck_grp_alloc_init("domain", domain_proto_mtx_grp_attr); + + /* + * allocate the lock attribute for per domain mutexes + */ + domain_proto_mtx_attr = lck_attr_alloc_init(); + lck_attr_setdefault(domain_proto_mtx_attr); + + if ((domain_proto_mtx = lck_mtx_alloc_init(domain_proto_mtx_grp, domain_proto_mtx_attr)) == NULL) { + printf("domaininit: can't init domain mtx for domain list\n"); + return; /* we have a problem... */ + } /* * Add all the static domains to the domains list */ - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + lck_mtx_lock(domain_proto_mtx); + concat_domain(&localdomain); concat_domain(&routedomain); concat_domain(&inetdomain); @@ -293,9 +312,9 @@ domaininit() for (dp = domains; dp; dp = dp->dom_next) init_domain(dp); + lck_mtx_unlock(domain_proto_mtx); timeout(pffasttimo, NULL, 1); timeout(pfslowtimo, NULL, 1); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); } struct protosw * @@ -305,14 +324,20 @@ pffindtype(family, type) register struct domain *dp; register struct protosw *pr; + lck_mtx_assert(domain_proto_mtx, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(domain_proto_mtx); for (dp = domains; dp; dp = dp->dom_next) if (dp->dom_family == family) goto found; + lck_mtx_unlock(domain_proto_mtx); return (0); found: for (pr = dp->dom_protosw; pr; pr = pr->pr_next) - if (pr->pr_type && pr->pr_type == type) + if (pr->pr_type && pr->pr_type == type) { + lck_mtx_unlock(domain_proto_mtx); return (pr); + } + lck_mtx_unlock(domain_proto_mtx); return (0); } @@ -320,18 +345,35 @@ struct domain * pffinddomain(int pf) { struct domain *dp; + lck_mtx_assert(domain_proto_mtx, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(domain_proto_mtx); dp = domains; while (dp) - { if (dp->dom_family == pf) + { if (dp->dom_family == pf) { + lck_mtx_unlock(domain_proto_mtx); return(dp); + } dp = dp->dom_next; } + lck_mtx_unlock(domain_proto_mtx); return(NULL); } struct protosw * pffindproto(family, protocol, type) int family, protocol, type; +{ + register struct protosw *pr; + lck_mtx_assert(domain_proto_mtx, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(domain_proto_mtx); + pr = pffindproto_locked(family, protocol, type); + lck_mtx_unlock(domain_proto_mtx); + return (pr); +} + +struct protosw * +pffindproto_locked(family, protocol, type) + int family, protocol, type; { register struct domain *dp; register struct protosw *pr; @@ -356,18 +398,12 @@ found: } int -net_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; +net_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, + 
user_addr_t newp, size_t newlen, struct proc *p) { register struct domain *dp; register struct protosw *pr; - int family, protocol; + int family, protocol, error; /* * All sysctl names at this level are nonterminal; @@ -381,15 +417,21 @@ net_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) if (family == 0) return (0); + lck_mtx_lock(domain_proto_mtx); for (dp = domains; dp; dp = dp->dom_next) if (dp->dom_family == family) goto found; + lck_mtx_unlock(domain_proto_mtx); return (ENOPROTOOPT); found: for (pr = dp->dom_protosw; pr; pr = pr->pr_next) - if (pr->pr_protocol == protocol && pr->pr_sysctl) - return ((*pr->pr_sysctl)(name + 2, namelen - 2, - oldp, oldlenp, newp, newlen)); + if (pr->pr_protocol == protocol && pr->pr_sysctl) { + error = (*pr->pr_sysctl)(name + 2, namelen - 2, + oldp, oldlenp, newp, newlen); + lck_mtx_unlock(domain_proto_mtx); + return (error); + } + lck_mtx_unlock(domain_proto_mtx); return (ENOPROTOOPT); } @@ -412,10 +454,13 @@ pfctlinput2(cmd, sa, ctlparam) if (!sa) return; + + lck_mtx_lock(domain_proto_mtx); for (dp = domains; dp; dp = dp->dom_next) for (pr = dp->dom_protosw; pr; pr = pr->pr_next) if (pr->pr_ctlinput) (*pr->pr_ctlinput)(cmd, sa, ctlparam); + lck_mtx_unlock(domain_proto_mtx); } void @@ -424,17 +469,19 @@ pfslowtimo(arg) { register struct domain *dp; register struct protosw *pr; - boolean_t funnel_state; - - funnel_state = thread_funnel_set(network_flock, TRUE); - for (dp = domains; dp; dp = dp->dom_next) - for (pr = dp->dom_protosw; pr; pr = pr->pr_next) + lck_mtx_lock(domain_proto_mtx); + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr; pr = pr->pr_next) { if (pr->pr_slowtimo) (*pr->pr_slowtimo)(); + if (do_reclaim && pr->pr_drain) + (*pr->pr_drain)(); + } + do_reclaim = 0; + lck_mtx_unlock(domain_proto_mtx); timeout(pfslowtimo, NULL, hz/2); - (void) thread_funnel_set(network_flock, FALSE); } void @@ -443,15 +490,12 @@ pffasttimo(arg) { register struct domain *dp; register struct protosw *pr; - boolean_t funnel_state; - - funnel_state = thread_funnel_set(network_flock, TRUE); + lck_mtx_lock(domain_proto_mtx); for (dp = domains; dp; dp = dp->dom_next) for (pr = dp->dom_protosw; pr; pr = pr->pr_next) if (pr->pr_fasttimo) (*pr->pr_fasttimo)(); + lck_mtx_unlock(domain_proto_mtx); timeout(pffasttimo, NULL, hz/5); - - (void) thread_funnel_set(network_flock, FALSE); } diff --git a/bsd/kern/uipc_mbuf.c b/bsd/kern/uipc_mbuf.c index 270534767..84100312a 100644 --- a/bsd/kern/uipc_mbuf.c +++ b/bsd/kern/uipc_mbuf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -70,10 +70,10 @@ #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/kernel.h> +#include <sys/sysctl.h> #include <sys/syslog.h> #include <sys/protosw.h> #include <sys/domain.h> -#include <net/netisr.h> #include <kern/queue.h> #include <kern/kern_types.h> @@ -81,6 +81,9 @@ #include <IOKit/IOMapper.h> +extern vm_offset_t kmem_mb_alloc(vm_map_t , int ); +extern boolean_t PE_parse_boot_arg(const char *, void *); + #define _MCLREF(p) (++mclrefcnt[mtocl(p)]) #define _MCLUNREF(p) (--mclrefcnt[mtocl(p)] == 0) #define _M_CLEAR_PKTHDR(mbuf_ptr) (mbuf_ptr)->m_pkthdr.rcvif = NULL; \ @@ -89,24 +92,28 @@ (mbuf_ptr)->m_pkthdr.csum_flags = 0; \ (mbuf_ptr)->m_pkthdr.csum_data = 0; \ (mbuf_ptr)->m_pkthdr.aux = (struct mbuf*)NULL; \ - (mbuf_ptr)->m_pkthdr.reserved_1 = 0; \ (mbuf_ptr)->m_pkthdr.vlan_tag = 0; \ - (mbuf_ptr)->m_pkthdr.reserved2 = NULL; + (mbuf_ptr)->m_pkthdr.socket_id = 0; \ + SLIST_INIT(&(mbuf_ptr)->m_pkthdr.tags); -extern pmap_t kernel_pmap; /* The kernel's pmap */ /* kernel translater */ extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); -decl_simple_lock_data(, mbuf_slock); +lck_mtx_t * mbuf_mlock; +lck_grp_t * mbuf_mlock_grp; +lck_grp_attr_t * mbuf_mlock_grp_attr; +lck_attr_t * mbuf_mlock_attr; +extern lck_mtx_t *domain_proto_mtx; + struct mbuf *mfree; /* mbuf free list */ struct mbuf *mfreelater; /* mbuf deallocation list */ extern vm_map_t mb_map; /* special map */ int m_want; /* sleepers on mbufs */ -extern int nmbclusters; /* max number of mapped clusters */ short *mclrefcnt; /* mapped cluster reference counts */ int *mcl_paddr; static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */ union mcluster *mclfree; /* mapped cluster free list */ +union mbigcluster *mbigfree; /* mapped cluster free list */ int max_linkhdr; /* largest link-level header */ int max_protohdr; /* largest protocol header */ int max_hdr; /* largest link+protocol header */ @@ -116,18 +123,32 @@ union mcluster *mbutl; /* first mapped cluster address */ union mcluster *embutl; /* ending virtual address of mclusters */ static int nclpp; /* # clusters per physical page */ -static char mbfail[] = "mbuf not mapped"; -static int m_howmany(); +static int m_howmany(int, size_t ); +void m_reclaim(void); +static int m_clalloc(const int , const int, const size_t, int); +int do_reclaim = 0; + +#define MF_NOWAIT 0x1 +#define MF_BIG 0x2 /* The number of cluster mbufs that are allocated, to start. 
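 *
 * (mbinit() below also replaces MBUF_LOCKINIT() with an lck_mtx; the
 * four-step initialization it uses is, in sketch form:
 *
 *	grp_attr = lck_grp_attr_alloc_init();
 *	grp = lck_grp_alloc_init("mbuf", grp_attr);
 *	attr = lck_attr_alloc_init();
 *	mtx = lck_mtx_alloc_init(grp, attr);
 *
 * the same pattern used for the per-domain mutexes earlier in this
 * change.)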
*/ #define MINCL max(16, 2) static int mbuf_expand_thread_wakeup = 0; static int mbuf_expand_mcl = 0; +static int mbuf_expand_big = 0; static int mbuf_expand_thread_initialized = 0; static void mbuf_expand_thread_init(void); +static void mbuf_expand_thread(void); +static int m_expand(int ); +static caddr_t m_bigalloc(int ); +static void m_bigfree(caddr_t , u_int , caddr_t ); +static struct mbuf * m_mbigget(struct mbuf *, int ); +void mbinit(void); +static void m_range_check(void *addr); + #if 0 static int mfree_munge = 0; @@ -176,25 +197,39 @@ munge_mbuf(struct mbuf *m) } -void -mbinit() +static void +m_range_check(void *addr) { - int s,m; + if (addr && (addr < (void *)mbutl || addr >= (void *)embutl)) + panic("mbuf address out of range 0x%x", addr); +} + +__private_extern__ void +mbinit(void) +{ + int m; int initmcl = 32; - int mcl_pages; + int mcl_pages; if (nclpp) return; nclpp = round_page_32(MCLBYTES) / MCLBYTES; /* see mbufgc() */ if (nclpp < 1) nclpp = 1; - MBUF_LOCKINIT(); -// NETISR_LOCKINIT(); + mbuf_mlock_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setdefault(mbuf_mlock_grp_attr); + + mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr); + mbuf_mlock_attr = lck_attr_alloc_init(); + lck_attr_setdefault(mbuf_mlock_attr); + + mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr); - mbstat.m_msize = MSIZE; - mbstat.m_mclbytes = MCLBYTES; - mbstat.m_minclsize = MINCLSIZE; - mbstat.m_mlen = MLEN; - mbstat.m_mhlen = MHLEN; + mbstat.m_msize = MSIZE; + mbstat.m_mclbytes = MCLBYTES; + mbstat.m_minclsize = MINCLSIZE; + mbstat.m_mlen = MLEN; + mbstat.m_mhlen = MHLEN; + mbstat.m_bigmclbytes = NBPG; if (nmbclusters == 0) nmbclusters = NMBCLUSTERS; @@ -205,20 +240,20 @@ mbinit() for (m = 0; m < nmbclusters; m++) mclrefcnt[m] = -1; - /* Calculate the number of pages assigned to the cluster pool */ - mcl_pages = nmbclusters/(PAGE_SIZE/CLBYTES); + /* Calculate the number of pages assigned to the cluster pool */ + mcl_pages = nmbclusters/(NBPG/CLBYTES); MALLOC(mcl_paddr, int *, mcl_pages * sizeof(int), M_TEMP, M_WAITOK); if (mcl_paddr == 0) panic("mbinit1"); - /* Register with the I/O Bus mapper */ - mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages); + /* Register with the I/O Bus mapper */ + mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages); bzero((char *)mcl_paddr, mcl_pages * sizeof(int)); embutl = (union mcluster *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)); PE_parse_boot_arg("initmcl", &initmcl); - if (m_clalloc(max(PAGE_SIZE/CLBYTES, 1) * initmcl, M_WAIT) == 0) + if (m_clalloc(max(NBPG/CLBYTES, 1) * initmcl, M_WAIT, MCLBYTES, 0) == 0) goto bad; MBUF_UNLOCK(); @@ -232,108 +267,156 @@ bad: /* * Allocate some number of mbuf clusters * and place on cluster free list. + * Take the mbuf lock (if not already locked) and do not release it */ /* ARGSUSED */ -m_clalloc(ncl, nowait) - register int ncl; - int nowait; +static int +m_clalloc( + const int num, + const int nowait, + const size_t bufsize, + int locked) { - register union mcluster *mcl; - register int i; - vm_size_t size; - static char doing_alloc; + int i; + vm_size_t size = 0; + int numpages = 0; + vm_offset_t page = 0; + if (locked == 0) + MBUF_LOCK(); /* * Honor the caller's wish to block or not block. * We have a way to grow the pool asynchronously, * by kicking the dlil_input_thread. 
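 *
 * The reworked allocator takes an explicit buffer size and a "locked"
 * flag, so callers look like (sketch):
 *
 *	(void)m_clalloc(1, nowait, MCLBYTES, 0);	2K clusters
 *	(void)m_clalloc(1, nowait, NBPG, 0);		4K big clusters
 *
 * Either way it returns with the mbuf lock held, and the caller drops
 * it with MBUF_UNLOCK() once it is done with the free lists.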
*/ - if ((i = m_howmany()) <= 0) + i = m_howmany(num, bufsize); + if (i == 0 || nowait == M_DONTWAIT) goto out; - if ((nowait == M_DONTWAIT)) - goto out; + MBUF_UNLOCK(); + size = round_page_32(i * bufsize); + page = kmem_mb_alloc(mb_map, size); - if (ncl < i) - ncl = i; - size = round_page_32(ncl * MCLBYTES); - mcl = (union mcluster *)kmem_mb_alloc(mb_map, size); - - if (mcl == 0 && ncl > 1) { - size = round_page_32(MCLBYTES); /* Try for 1 if failed */ - mcl = (union mcluster *)kmem_mb_alloc(mb_map, size); + if (page == 0) { + size = NBPG; /* Try for 1 if failed */ + page = kmem_mb_alloc(mb_map, size); } + MBUF_LOCK(); - if (mcl) { - MBUF_LOCK(); - ncl = size / MCLBYTES; - for (i = 0; i < ncl; i++) { - if (++mclrefcnt[mtocl(mcl)] != 0) - panic("m_clalloc already there"); - if (((int)mcl & PAGE_MASK) == 0) { - ppnum_t offset = ((char *)mcl - (char *)mbutl)/PAGE_SIZE; - ppnum_t new_page = pmap_find_phys(kernel_pmap, (vm_address_t) mcl); - - /* - * In the case of no mapper being available - * the following code nops and returns the - * input page, if there is a mapper the I/O - * page appropriate is returned. - */ - new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page); - mcl_paddr[offset] = new_page << 12; - } - - mcl->mcl_next = mclfree; - mclfree = mcl++; + if (page) { + numpages = size / NBPG; + for (i = 0; i < numpages; i++, page += NBPG) { + if (((int)page & PGOFSET) == 0) { + ppnum_t offset = ((char *)page - (char *)mbutl)/NBPG; + ppnum_t new_page = pmap_find_phys(kernel_pmap, (vm_address_t) page); + + /* + * In the case of no mapper being available + * the following code nops and returns the + * input page, if there is a mapper the I/O + * page appropriate is returned. + */ + new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page); + mcl_paddr[offset] = new_page << 12; + } + if (bufsize == MCLBYTES) { + union mcluster *mcl = (union mcluster *)page; + + if (++mclrefcnt[mtocl(mcl)] != 0) + panic("m_clalloc already there"); + mcl->mcl_next = mclfree; + mclfree = mcl++; + if (++mclrefcnt[mtocl(mcl)] != 0) + panic("m_clalloc already there"); + mcl->mcl_next = mclfree; + mclfree = mcl++; + } else { + union mbigcluster *mbc = (union mbigcluster *)page; + + if (++mclrefcnt[mtocl(mbc)] != 0) + panic("m_clalloc already there"); + if (++mclrefcnt[mtocl(mbc) + 1] != 0) + panic("m_clalloc already there"); + + mbc->mbc_next = mbigfree; + mbigfree = mbc; + } + } + if (bufsize == MCLBYTES) { + int numcl = numpages << 1; + mbstat.m_clfree += numcl; + mbstat.m_clusters += numcl; + return (numcl); + } else { + mbstat.m_bigclfree += numpages; + mbstat.m_bigclusters += numpages; + return (numpages); } - mbstat.m_clfree += ncl; - mbstat.m_clusters += ncl; - return (ncl); } /* else ... */ out: - MBUF_LOCK(); - /* - * When non-blocking we kick the dlil thread if we havve to grow the + * When non-blocking we kick a thread if we havve to grow the * pool or if the number of free clusters is less than requested. 
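 *
 * The wakeup bookkeeping is a simple high-water mark (sketch):
 *
 *	i += mbstat.m_clusters;		total wanted from now on
 *	if (i > mbuf_expand_mcl) {
 *		mbuf_expand_mcl = i;
 *		wakeup((caddr_t)&mbuf_expand_thread_wakeup);
 *	}
 *
 * so concurrent callers only ever raise the target, and the expansion
 * thread is woken at most once per increase.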
*/ - if ((nowait == M_DONTWAIT) && (i > 0 || ncl >= mbstat.m_clfree)) { - mbuf_expand_mcl = 1; - if (mbuf_expand_thread_initialized) - wakeup((caddr_t)&mbuf_expand_thread_wakeup); + if (bufsize == MCLBYTES) { + if (i > 0) { + /* Remember total number of clusters needed at this time */ + i += mbstat.m_clusters; + if (i > mbuf_expand_mcl) { + mbuf_expand_mcl = i; + if (mbuf_expand_thread_initialized) + wakeup((caddr_t)&mbuf_expand_thread_wakeup); + } + } + + if (mbstat.m_clfree >= num) + return 1; + } else { + if (i > 0) { + /* Remember total number of 4KB clusters needed at this time */ + i += mbstat.m_bigclusters; + if (i > mbuf_expand_big) { + mbuf_expand_big = i; + if (mbuf_expand_thread_initialized) + wakeup((caddr_t)&mbuf_expand_thread_wakeup); + } + } + + if (mbstat.m_bigclfree >= num) + return 1; } - - if (mbstat.m_clfree >= ncl) - return 1; - return 0; } /* * Add more free mbufs by cutting up a cluster. */ -m_expand(canwait) - int canwait; +static int +m_expand(int canwait) { - register caddr_t mcl; + caddr_t mcl; - if (mbstat.m_clfree < (mbstat.m_clusters >> 4)) - /* 1/16th of the total number of cluster mbufs allocated is - reserved for large packets. The number reserved must - always be < 1/2, or future allocation will be prevented. - */ - return 0; + if (mbstat.m_clfree < (mbstat.m_clusters >> 4)) { + /* + * 1/16th of the total number of cluster mbufs allocated is + * reserved for large packets. The number reserved must + * always be < 1/2, or future allocation will be prevented. + */ + (void)m_clalloc(1, canwait, MCLBYTES, 0); + MBUF_UNLOCK(); + if (mbstat.m_clfree < (mbstat.m_clusters >> 4)) + return 0; + } MCLALLOC(mcl, canwait); if (mcl) { - register struct mbuf *m = (struct mbuf *)mcl; - register int i = NMBPCL; + struct mbuf *m = (struct mbuf *)mcl; + int i = NMBPCL; MBUF_LOCK(); mbstat.m_mtypes[MT_FREE] += i; mbstat.m_mbufs += i; while (i--) { - _MFREE_MUNGE(m); + _MFREE_MUNGE(m); m->m_type = MT_FREE; m->m_next = mfree; mfree = m++; @@ -352,14 +435,12 @@ m_expand(canwait) * then re-attempt to allocate an mbuf. 
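 *
 * Note that m_reclaim() below no longer walks the protocol drain
 * routines itself; it only sets do_reclaim, and pfslowtimo() performs
 * the drains under the domain mutex. In sketch form:
 *
 *	m_reclaim():   do_reclaim = 1;
 *	pfslowtimo():  if (do_reclaim && pr->pr_drain)
 *	                       (*pr->pr_drain)();
 *
 * which avoids sleeping for memory while holding the wrong locks.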
*/ struct mbuf * -m_retry(canwait, type) - int canwait, type; +m_retry( + int canwait, + int type) { - register struct mbuf *m; - int wait, s; - funnel_t * fnl; - int fnl_switch = 0; - boolean_t funnel_state; + struct mbuf *m; + int wait; for (;;) { (void) m_expand(canwait); @@ -369,12 +450,13 @@ m_retry(canwait, type) (m)->m_type = (type); (m)->m_data = (m)->m_dat; (m)->m_flags = 0; + (m)->m_len = 0; } if (m || canwait == M_DONTWAIT) break; MBUF_LOCK(); wait = m_want++; - mbuf_expand_mcl = 1; + mbuf_expand_mcl++; if (wait == 0) mbstat.m_drain++; else @@ -384,25 +466,13 @@ m_retry(canwait, type) if (mbuf_expand_thread_initialized) wakeup((caddr_t)&mbuf_expand_thread_wakeup); - /* - * Need to be inside network funnel for m_reclaim because it calls into the - * socket domains and tsleep end-up calling splhigh - */ - fnl = thread_funnel_get(); - if (wait == 0 && fnl == network_flock) { + if (wait == 0) { m_reclaim(); - } else if (fnl != THR_FUNNEL_NULL) { - /* Sleep with a small timeout as insurance */ - (void) tsleep((caddr_t)&mfree, PZERO-1, "m_retry", hz); } else { - /* We are called from a non-BSD context: use mach primitives */ - u_int64_t abstime = 0; - - assert_wait((event_t)&mfree, THREAD_UNINT); - clock_interval_to_deadline(hz, NSEC_PER_SEC / hz, &abstime); - thread_set_timer_deadline(abstime); - if (thread_block(THREAD_CONTINUE_NULL) != THREAD_TIMED_OUT) - thread_cancel_timer(); + struct timespec ts; + ts.tv_sec = 1; + ts.tv_nsec = 0; + (void) msleep((caddr_t)&mfree, 0, (PZERO-1) | PDROP, "m_retry", &ts); } } if (m == 0) @@ -414,12 +484,14 @@ m_retry(canwait, type) * As above; retry an MGETHDR. */ struct mbuf * -m_retryhdr(canwait, type) - int canwait, type; +m_retryhdr( + int canwait, + int type) { - register struct mbuf *m; + struct mbuf *m; - if (m = m_retry(canwait, type)) { + if ((m = m_retry(canwait, type))) { + m->m_next = m->m_nextpkt = 0; m->m_flags |= M_PKTHDR; m->m_data = m->m_pktdat; _M_CLEAR_PKTHDR(m); @@ -427,15 +499,10 @@ m_retryhdr(canwait, type) return (m); } -m_reclaim() +void +m_reclaim(void) { - register struct domain *dp; - register struct protosw *pr; - - for (dp = domains; dp; dp = dp->dom_next) - for (pr = dp->dom_protosw; pr; pr = pr->pr_next) - if (pr->pr_drain) - (*pr->pr_drain)(); + do_reclaim = 1; /* drain is performed in pfslowtimo(), to avoid deadlocks */ mbstat.m_drain++; } @@ -445,10 +512,15 @@ m_reclaim() * for critical paths. 
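 *
 * Each allocator now calls m_range_check() on the free-list heads on
 * entry and exit; the check itself is just (sketch):
 *
 *	if (addr && (addr < (void *)mbutl || addr >= (void *)embutl))
 *		panic("mbuf address out of range 0x%x", addr);
 *
 * so free-list corruption is caught as early as possible.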
*/ struct mbuf * -m_get(nowait, type) - int nowait, type; +m_get( + int nowait, + int type) { - register struct mbuf *m; + struct mbuf *m; + + m_range_check(mfree); + m_range_check(mclfree); + m_range_check(mbigfree); _MINTGET(m, type); if (m) { @@ -456,17 +528,29 @@ m_get(nowait, type) m->m_type = type; m->m_data = m->m_dat; m->m_flags = 0; + m->m_len = 0; } else (m) = m_retry(nowait, type); + m_range_check(mfree); + m_range_check(mclfree); + m_range_check(mbigfree); + + return (m); } struct mbuf * -m_gethdr(nowait, type) - int nowait, type; +m_gethdr( + int nowait, + int type) { - register struct mbuf *m; + struct mbuf *m; + + m_range_check(mfree); + m_range_check(mclfree); + m_range_check(mbigfree); + _MINTGET(m, type); if (m) { @@ -474,18 +558,25 @@ m_gethdr(nowait, type) m->m_type = type; m->m_data = m->m_pktdat; m->m_flags = M_PKTHDR; + m->m_len = 0; _M_CLEAR_PKTHDR(m) } else m = m_retryhdr(nowait, type); + m_range_check(mfree); + m_range_check(mclfree); + m_range_check(mbigfree); + + return m; } struct mbuf * -m_getclr(nowait, type) - int nowait, type; +m_getclr( + int nowait, + int type) { - register struct mbuf *m; + struct mbuf *m; MGET(m, nowait, type); if (m == 0) @@ -495,11 +586,15 @@ m_getclr(nowait, type) } struct mbuf * -m_free(m) - struct mbuf *m; +m_free( + struct mbuf *m) { struct mbuf *n = m->m_next; - int i, s; + int i; + + m_range_check(m); + m_range_check(mfree); + m_range_check(mclfree); if (m->m_type == MT_FREE) panic("freeing free mbuf"); @@ -509,6 +604,8 @@ m_free(m) { m_freem(m->m_pkthdr.aux); } + if ((m->m_flags & M_PKTHDR) != 0) + m_tag_delete_chain(m, NULL); MBUF_LOCK(); if ((m->m_flags & M_EXT)) @@ -517,6 +614,9 @@ m_free(m) remque((queue_t)&m->m_ext.ext_refs); } else if (m->m_ext.ext_free == NULL) { union mcluster *mcl= (union mcluster *)m->m_ext.ext_buf; + + m_range_check(mcl); + if (_MCLUNREF(mcl)) { mcl->mcl_next = mclfree; mclfree = mcl; @@ -537,7 +637,7 @@ m_free(m) } mbstat.m_mtypes[m->m_type]--; (void) _MCLUNREF(m); - _MFREE_MUNGE(m); + _MFREE_MUNGE(m); m->m_type = MT_FREE; mbstat.m_mtypes[m->m_type]++; m->m_flags = 0; @@ -553,9 +653,9 @@ m_free(m) /* m_mclget() add an mbuf cluster to a normal mbuf */ struct mbuf * -m_mclget(m, nowait) - struct mbuf *m; - int nowait; +m_mclget( + struct mbuf *m, + int nowait) { MCLALLOC(m->m_ext.ext_buf, nowait); if (m->m_ext.ext_buf) { @@ -572,12 +672,12 @@ m_mclget(m, nowait) /* m_mclalloc() allocate an mbuf cluster */ caddr_t -m_mclalloc( nowait) - int nowait; +m_mclalloc( + int nowait) { caddr_t p; - (void)m_clalloc(1, nowait); + (void)m_clalloc(1, nowait, MCLBYTES, 0); if ((p = (caddr_t)mclfree)) { ++mclrefcnt[mtocl(p)]; mbstat.m_clfree--; @@ -587,16 +687,19 @@ m_mclalloc( nowait) } MBUF_UNLOCK(); - return p; + return p; } /* m_mclfree() releases a reference to a cluster allocated by MCLALLOC, * freeing the cluster if the reference count has reached 0. 
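
/*
 * Editor's illustration (not part of the patch): the rule stated above -- a
 * cluster goes back on the free list only when its last reference is dropped
 * -- modeled in user space; names are local to this sketch.
 */
#include <stdio.h>

static int cluster_refcnt = 2;          /* two mbufs share one cluster */
static int clusters_free = 0;

static void
cluster_unref(void)
{
        if (--cluster_refcnt == 0)
                clusters_free++;        /* last reference: reclaim it */
}

int
main(void)
{
        cluster_unref();
        printf("free=%d\n", clusters_free);     /* free=0, still shared */
        cluster_unref();
        printf("free=%d\n", clusters_free);     /* free=1, last ref gone */
        return 0;
}
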
*/ void -m_mclfree(p) - caddr_t p; +m_mclfree( + caddr_t p) { MBUF_LOCK(); + + m_range_check(p); + if (--mclrefcnt[mtocl(p)] == 0) { ((union mcluster *)(p))->mcl_next = mclfree; mclfree = (union mcluster *)(p); @@ -607,153 +710,450 @@ m_mclfree(p) /* mcl_hasreference() checks if a cluster of an mbuf is referenced by another mbuf */ int -m_mclhasreference(m) - struct mbuf *m; +m_mclhasreference( + struct mbuf *m) { return (m->m_ext.ext_refs.forward != &(m->m_ext.ext_refs)); } +__private_extern__ caddr_t +m_bigalloc(int nowait) +{ + caddr_t p; + + (void)m_clalloc(1, nowait, NBPG, 0); + if ((p = (caddr_t)mbigfree)) { + if (mclrefcnt[mtocl(p)] != mclrefcnt[mtocl(p) + 1]) + panic("m_bigalloc mclrefcnt %x mismatch %d != %d", + p, mclrefcnt[mtocl(p)], mclrefcnt[mtocl(p) + 1]); + if (mclrefcnt[mtocl(p)] || mclrefcnt[mtocl(p) + 1]) + panic("m_bigalloc mclrefcnt %x not null %d != %d", + p, mclrefcnt[mtocl(p)], mclrefcnt[mtocl(p) + 1]); + ++mclrefcnt[mtocl(p)]; + ++mclrefcnt[mtocl(p) + 1]; + mbstat.m_bigclfree--; + mbigfree = ((union mbigcluster *)p)->mbc_next; + } else { + mbstat.m_drops++; + } + MBUF_UNLOCK(); + return p; +} + +__private_extern__ void +m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg) +{ + m_range_check(p); + + if (mclrefcnt[mtocl(p)] != mclrefcnt[mtocl(p) + 1]) + panic("m_bigfree mclrefcnt %x mismatch %d != %d", + p, mclrefcnt[mtocl(p)], mclrefcnt[mtocl(p) + 1]); + --mclrefcnt[mtocl(p)]; + --mclrefcnt[mtocl(p) + 1]; + if (mclrefcnt[mtocl(p)] == 0) { + ((union mbigcluster *)(p))->mbc_next = mbigfree; + mbigfree = (union mbigcluster *)(p); + mbstat.m_bigclfree++; + } +} + +/* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */ +__private_extern__ struct mbuf * +m_mbigget(struct mbuf *m, int nowait) +{ + m->m_ext.ext_buf = m_bigalloc(nowait); + if (m->m_ext.ext_buf) { + m->m_data = m->m_ext.ext_buf; + m->m_flags |= M_EXT; + m->m_ext.ext_size = NBPG; + m->m_ext.ext_free = m_bigfree; + m->m_ext.ext_arg = 0; + m->m_ext.ext_refs.forward = m->m_ext.ext_refs.backward = + &m->m_ext.ext_refs; + } + + return m; +} + + /* */ void -m_copy_pkthdr(to, from) - struct mbuf *to, *from; +m_copy_pkthdr( + struct mbuf *to, + struct mbuf *from) { to->m_pkthdr = from->m_pkthdr; from->m_pkthdr.aux = (struct mbuf *)NULL; + SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */ to->m_flags = from->m_flags & M_COPYFLAGS; to->m_data = (to)->m_pktdat; } -/* Best effort to get a mbuf cluster + pkthdr under one lock. - * If we don't have them avail, just bail out and use the regular - * path. - * Used by drivers to allocated packets on receive ring. +/* + * "Move" mbuf pkthdr from "from" to "to". + * "from" must have M_PKTHDR set, and "to" must be empty. */ -struct mbuf * -m_getpacket(void) +#ifndef __APPLE__ +void +m_move_pkthdr(struct mbuf *to, struct mbuf *from) { - struct mbuf *m; - m_clalloc(1, M_DONTWAIT); /* takes the MBUF_LOCK, but doesn't release it... 
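
/*
 * Editor's illustration (not part of the patch): a 4 KB big cluster covers
 * two 2 KB cluster slots, which is why m_bigalloc()/m_bigfree() above bump
 * and drop two adjacent mclrefcnt[] entries and panic if they ever differ.
 * User-space model; names are local to this sketch.
 */
#include <assert.h>
#include <stdio.h>

#define NSLOTS  8
static int refcnt[NSLOTS];              /* one count per 2 KB slot */

static void
big_ref(int slot)                       /* slot: first of the adjacent pair */
{
        assert(refcnt[slot] == refcnt[slot + 1]);       /* must stay paired */
        refcnt[slot]++;
        refcnt[slot + 1]++;
}

static void
big_unref(int slot)
{
        assert(refcnt[slot] == refcnt[slot + 1]);
        refcnt[slot]--;
        refcnt[slot + 1]--;
}

int
main(void)
{
        big_ref(2);
        printf("%d %d\n", refcnt[2], refcnt[3]);        /* 1 1 */
        big_unref(2);
        printf("%d %d\n", refcnt[2], refcnt[3]);        /* 0 0 */
        return 0;
}
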
*/ - if ((mfree != 0) && (mclfree != 0)) { /* mbuf + cluster are available */ - m = mfree; - mfree = m->m_next; - MCHECK(m); - ++mclrefcnt[mtocl(m)]; - mbstat.m_mtypes[MT_FREE]--; - mbstat.m_mtypes[MT_DATA]++; - m->m_ext.ext_buf = (caddr_t)mclfree; /* get the cluster */ - ++mclrefcnt[mtocl(m->m_ext.ext_buf)]; - mbstat.m_clfree--; - mclfree = ((union mcluster *)(m->m_ext.ext_buf))->mcl_next; - - m->m_next = m->m_nextpkt = 0; - m->m_type = MT_DATA; - m->m_data = m->m_ext.ext_buf; - m->m_flags = M_PKTHDR | M_EXT; - _M_CLEAR_PKTHDR(m) - m->m_ext.ext_free = 0; - m->m_ext.ext_size = MCLBYTES; - m->m_ext.ext_refs.forward = m->m_ext.ext_refs.backward = - &m->m_ext.ext_refs; - MBUF_UNLOCK(); - } - else { /* slow path: either mbuf or cluster need to be allocated anyway */ - MBUF_UNLOCK(); + KASSERT((to->m_flags & M_EXT) == 0, ("m_move_pkthdr: to has cluster")); - MGETHDR(m, M_WAITOK, MT_DATA ); - - if ( m == 0 ) - return (NULL); - - MCLGET( m, M_WAITOK ); - if ( ( m->m_flags & M_EXT ) == 0 ) - { - m_free(m); m = 0; - } - } - return (m); + to->m_flags = from->m_flags & M_COPYFLAGS; + to->m_data = to->m_pktdat; + to->m_pkthdr = from->m_pkthdr; /* especially tags */ + SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */ + from->m_flags &= ~M_PKTHDR; } +#endif +/* + * Duplicate "from"'s mbuf pkthdr in "to". + * "from" must have M_PKTHDR set, and "to" must be empty. + * In particular, this does a deep copy of the packet tags. + */ +#ifndef __APPLE__ +int +m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how) +{ + to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); + if ((to->m_flags & M_EXT) == 0) + to->m_data = to->m_pktdat; + to->m_pkthdr = from->m_pkthdr; + SLIST_INIT(&to->m_pkthdr.tags); + return (m_tag_copy_chain(to, from, how)); +} +#endif /* * return a list of mbuf hdrs that point to clusters... - * try for num_needed, if this can't be met, return whatever + * try for num_needed, if wantall is not set, return whatever * number was available... set up the first num_with_pkthdrs * with mbuf hdrs configured as packet headers... these are * chained on the m_nextpkt field... any packets requested beyond * this are chained onto the last packet header's m_next field. + * The size of the cluster is controlled by the parameter bufsize. */ -struct mbuf * -m_getpackets(int num_needed, int num_with_pkthdrs, int how) +__private_extern__ struct mbuf * +m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, int how, int wantall, size_t bufsize) { struct mbuf *m; struct mbuf **np, *top; - + unsigned int num, needed = *num_needed; + + if (bufsize != MCLBYTES && bufsize != NBPG) + return 0; + top = NULL; np = &top; + + (void)m_clalloc(needed, how, bufsize, 0); /* takes the MBUF_LOCK, but doesn't release it... 
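
/*
 * Editor's illustration (not part of the patch): how a driver might call
 * m_getpackets_internal() under the contract documented above.  The function
 * and constants are the patch's; "fill_rx_ring" and the count of 32 are
 * hypothetical.
 */
static struct mbuf *
fill_rx_ring(void)
{
        unsigned int want = 32;         /* ask for 32 packets */
        struct mbuf *list;

        /* wantall == 0: accept a short list rather than failing outright */
        list = m_getpackets_internal(&want, 32, M_DONTWAIT, 0, MCLBYTES);

        /* 'want' now holds the number actually built; the packets are
         * chained through m_nextpkt, each with one 2 KB cluster attached */
        return (list);
}
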
*/ + + for (num = 0; num < needed; num++) { + m_range_check(mfree); + m_range_check(mclfree); + m_range_check(mbigfree); + + if (mfree && ((bufsize == NBPG && mbigfree) || (bufsize == MCLBYTES && mclfree))) { + /* mbuf + cluster are available */ + m = mfree; + MCHECK(m); + mfree = m->m_next; + ++mclrefcnt[mtocl(m)]; + mbstat.m_mtypes[MT_FREE]--; + mbstat.m_mtypes[MT_DATA]++; + if (bufsize == NBPG) { + m->m_ext.ext_buf = (caddr_t)mbigfree; /* get the big cluster */ + ++mclrefcnt[mtocl(m->m_ext.ext_buf)]; + ++mclrefcnt[mtocl(m->m_ext.ext_buf) + 1]; + mbstat.m_bigclfree--; + mbigfree = ((union mbigcluster *)(m->m_ext.ext_buf))->mbc_next; + m->m_ext.ext_free = m_bigfree; + m->m_ext.ext_size = NBPG; + } else { + m->m_ext.ext_buf = (caddr_t)mclfree; /* get the cluster */ + ++mclrefcnt[mtocl(m->m_ext.ext_buf)]; + mbstat.m_clfree--; + mclfree = ((union mcluster *)(m->m_ext.ext_buf))->mcl_next; + m->m_ext.ext_free = 0; + m->m_ext.ext_size = MCLBYTES; + } + m->m_ext.ext_arg = 0; + m->m_ext.ext_refs.forward = m->m_ext.ext_refs.backward = &m->m_ext.ext_refs; + m->m_next = m->m_nextpkt = 0; + m->m_type = MT_DATA; + m->m_data = m->m_ext.ext_buf; + m->m_len = 0; - m_clalloc(num_needed, how); /* takes the MBUF_LOCK, but doesn't release it... */ + if (num_with_pkthdrs == 0) + m->m_flags = M_EXT; + else { + m->m_flags = M_PKTHDR | M_EXT; + _M_CLEAR_PKTHDR(m); + + num_with_pkthdrs--; + } + } else { + MBUF_UNLOCK(); + + if (num_with_pkthdrs == 0) { + MGET(m, how, MT_DATA ); + } else { + MGETHDR(m, how, MT_DATA); + + num_with_pkthdrs--; + } + if (m == 0) + goto fail; + + if (bufsize == NBPG) + m = m_mbigget(m, how); + else + m = m_mclget(m, how); + if ((m->m_flags & M_EXT) == 0) { + m_free(m); + goto fail; + } + MBUF_LOCK(); + } + *np = m; + + if (num_with_pkthdrs) + np = &m->m_nextpkt; + else + np = &m->m_next; + } + MBUF_UNLOCK(); + + *num_needed = num; + return (top); +fail: + if (wantall && top) { + m_freem(top); + return 0; + } + return top; +} - while (num_needed--) { - if (mfree && mclfree) { /* mbuf + cluster are available */ - m = mfree; - MCHECK(m); - mfree = m->m_next; - ++mclrefcnt[mtocl(m)]; - mbstat.m_mtypes[MT_FREE]--; - mbstat.m_mtypes[MT_DATA]++; - m->m_ext.ext_buf = (caddr_t)mclfree; /* get the cluster */ - ++mclrefcnt[mtocl(m->m_ext.ext_buf)]; - mbstat.m_clfree--; - mclfree = ((union mcluster *)(m->m_ext.ext_buf))->mcl_next; - m->m_next = m->m_nextpkt = 0; - m->m_type = MT_DATA; - m->m_data = m->m_ext.ext_buf; - m->m_ext.ext_free = 0; - m->m_ext.ext_size = MCLBYTES; - m->m_ext.ext_refs.forward = m->m_ext.ext_refs.backward = &m->m_ext.ext_refs; +/* + * Return list of mbuf linked by m_nextpkt + * Try for num_needed, and if wantall is not set, return whatever + * number were available + * The size of each mbuf in the list is controlled by the parameter packetlen. + * Each mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf in + * the chain is called a segment. + * If maxsegments is not null and the value pointed to is not null, this specify + * the maximum number of segments for a chain of mbufs. + * If maxsegments is zero or the value pointed to is zero the + * caller does not have any restriction on the number of segments. + * The actual number of segments of a mbuf chain is return in the value pointed + * to by maxsegments. + * When possible the allocation is done under a single lock. 
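
/*
 * Editor's illustration (not part of the patch): the segment-count rule
 * described above, modeled in user space.  The constants are stand-ins
 * chosen only for the example (MHLEN-, MINCLSIZE-, MCLBYTES- and NBPG-like
 * values), not the kernel's.
 */
#include <stdio.h>

#define SK_MHLEN        100     /* data bytes in a header mbuf (illustrative) */
#define SK_MINCLSIZE    212     /* break-even point for taking a cluster */
#define SK_MCLBYTES     2048    /* small cluster */
#define SK_NBPG         4096    /* big (page-sized) cluster */

static unsigned int
chunks_for(unsigned int packetlen)
{
        if (packetlen <= SK_MHLEN)
                return 1;                               /* one mbuf */
        if (packetlen <= SK_MINCLSIZE)
                return 2;                               /* two chained mbufs */
        if (packetlen > SK_MCLBYTES)
                return (packetlen - 1) / SK_NBPG + 1;   /* 4 KB clusters */
        return (packetlen - 1) / SK_MCLBYTES + 1;       /* one 2 KB cluster */
}

int
main(void)
{
        /* prints: 1 2 1 3 */
        printf("%u %u %u %u\n", chunks_for(80), chunks_for(200),
            chunks_for(1500), chunks_for(9000));
        return 0;
}
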
+ */ - if (num_with_pkthdrs == 0) - m->m_flags = M_EXT; - else { - m->m_flags = M_PKTHDR | M_EXT; - _M_CLEAR_PKTHDR(m); +__private_extern__ struct mbuf * +m_allocpacket_internal(unsigned int *num_needed, size_t packetlen, unsigned int * maxsegments, + int how, int wantall, size_t wantsize) +{ + struct mbuf **np, *top; + size_t bufsize; + unsigned int num; + unsigned int numchunks = 0; - num_with_pkthdrs--; + top = NULL; + np = ⊤ + + if (wantsize == 0) { + if (packetlen <= MINCLSIZE) + bufsize = packetlen; + else if (packetlen > MCLBYTES) + bufsize = NBPG; + else + bufsize = MCLBYTES; + } else if (wantsize == MCLBYTES || wantsize == NBPG) + bufsize = wantsize; + else + return 0; + + if (bufsize <= MHLEN) { + numchunks = 1; + } else if (bufsize <= MINCLSIZE) { + if (maxsegments != NULL && *maxsegments == 1) { + bufsize = MCLBYTES; + numchunks = 1; + } else { + numchunks = 2; } + } else if (bufsize == NBPG) { + numchunks = ((packetlen - 1) >> PGSHIFT) + 1; + } else { + numchunks = ((packetlen - 1) >> MCLSHIFT) + 1; + } + if (maxsegments != NULL) { + if (*maxsegments && numchunks > *maxsegments) { + *maxsegments = numchunks; + return 0; + } + *maxsegments = numchunks; + } + /* m_clalloc takes the MBUF_LOCK, but do not release it */ + (void)m_clalloc(numchunks, how, (bufsize == NBPG) ? NBPG : MCLBYTES, 0); + for (num = 0; num < *num_needed; num++) { + struct mbuf **nm, *pkt = 0; + size_t len; + + nm = &pkt; + + m_range_check(mfree); + m_range_check(mclfree); + m_range_check(mbigfree); + + for (len = 0; len < packetlen; ) { + struct mbuf *m = NULL; + + if (wantsize == 0 && packetlen > MINCLSIZE) { + if (packetlen - len > MCLBYTES) + bufsize = NBPG; + else + bufsize = MCLBYTES; + } + len += bufsize; + + if (mfree && ((bufsize == NBPG && mbigfree) || (bufsize == MCLBYTES && mclfree))) { + /* mbuf + cluster are available */ + m = mfree; + MCHECK(m); + mfree = m->m_next; + ++mclrefcnt[mtocl(m)]; + mbstat.m_mtypes[MT_FREE]--; + mbstat.m_mtypes[MT_DATA]++; + if (bufsize == NBPG) { + m->m_ext.ext_buf = (caddr_t)mbigfree; /* get the big cluster */ + ++mclrefcnt[mtocl(m->m_ext.ext_buf)]; + ++mclrefcnt[mtocl(m->m_ext.ext_buf) + 1]; + mbstat.m_bigclfree--; + mbigfree = ((union mbigcluster *)(m->m_ext.ext_buf))->mbc_next; + m->m_ext.ext_free = m_bigfree; + m->m_ext.ext_size = NBPG; + } else { + m->m_ext.ext_buf = (caddr_t)mclfree; /* get the cluster */ + ++mclrefcnt[mtocl(m->m_ext.ext_buf)]; + mbstat.m_clfree--; + mclfree = ((union mcluster *)(m->m_ext.ext_buf))->mcl_next; + m->m_ext.ext_free = 0; + m->m_ext.ext_size = MCLBYTES; + } + m->m_ext.ext_arg = 0; + m->m_ext.ext_refs.forward = m->m_ext.ext_refs.backward = &m->m_ext.ext_refs; + m->m_next = m->m_nextpkt = 0; + m->m_type = MT_DATA; + m->m_data = m->m_ext.ext_buf; + m->m_len = 0; + + if (pkt == 0) { + pkt = m; + m->m_flags = M_PKTHDR | M_EXT; + _M_CLEAR_PKTHDR(m); + } else { + m->m_flags = M_EXT; + } + } else { + MBUF_UNLOCK(); + + if (pkt == 0) { + MGETHDR(m, how, MT_DATA); + } else { + MGET(m, how, MT_DATA ); + } + if (m == 0) { + m_freem(pkt); + goto fail; + } + if (bufsize <= MINCLSIZE) { + if (bufsize > MHLEN) { + MGET(m->m_next, how, MT_DATA); + if (m->m_next == 0) { + m_free(m); + m_freem(pkt); + goto fail; + } + } + } else { + if (bufsize == NBPG) + m = m_mbigget(m, how); + else + m = m_mclget(m, how); + if ((m->m_flags & M_EXT) == 0) { + m_free(m); + m_freem(pkt); + goto fail; + } + } + MBUF_LOCK(); + } + *nm = m; + nm = &m->m_next; + } + *np = pkt; + np = &pkt->m_nextpkt; + } + MBUF_UNLOCK(); + *num_needed = num; + + return top; +fail: + if 
(wantall && top) { + m_freem(top); + return 0; + } + return top; +} - } else { - MBUF_UNLOCK(); +/* Best effort to get a mbuf cluster + pkthdr under one lock. + * If we don't have them avail, just bail out and use the regular + * path. + * Used by drivers to allocate packets on receive ring. + */ +__private_extern__ struct mbuf * +m_getpacket_how(int how) +{ + unsigned int num_needed = 1; + + return m_getpackets_internal(&num_needed, 1, how, 1, MCLBYTES); +} - if (num_with_pkthdrs == 0) { - MGET(m, how, MT_DATA ); - } else { - MGETHDR(m, how, MT_DATA); +/* Best effort to get a mbuf cluster + pkthdr under one lock. + * If we don't have them avail, just bail out and use the regular + * path. + * Used by drivers to allocate packets on receive ring. + */ +struct mbuf * +m_getpacket(void) +{ + unsigned int num_needed = 1; - num_with_pkthdrs--; - } - if (m == 0) - return(top); - - MCLGET(m, how); - if ((m->m_flags & M_EXT) == 0) { - m_free(m); - return(top); - } - MBUF_LOCK(); - } - *np = m; + return m_getpackets_internal(&num_needed, 1, M_WAITOK, 1, MCLBYTES); +} - if (num_with_pkthdrs) - np = &m->m_nextpkt; - else - np = &m->m_next; - } - MBUF_UNLOCK(); - return (top); +/* + * return a list of mbuf hdrs that point to clusters... + * try for num_needed, if this can't be met, return whatever + * number was available... set up the first num_with_pkthdrs + * with mbuf hdrs configured as packet headers... these are + * chained on the m_nextpkt field... any packets requested beyond + * this are chained onto the last packet header's m_next field. + */ +struct mbuf * +m_getpackets(int num_needed, int num_with_pkthdrs, int how) +{ + unsigned int n = num_needed; + + return m_getpackets_internal(&n, num_with_pkthdrs, how, 0, MCLBYTES); } @@ -773,7 +1173,11 @@ m_getpackethdrs(int num_needed, int how) MBUF_LOCK(); while (num_needed--) { - if (m = mfree) { /* mbufs are available */ + m_range_check(mfree); + m_range_check(mclfree); + m_range_check(mbigfree); + + if ((m = mfree)) { /* mbufs are available */ MCHECK(m); mfree = m->m_next; ++mclrefcnt[mtocl(m)]; @@ -782,20 +1186,18 @@ m_getpackethdrs(int num_needed, int how) m->m_next = m->m_nextpkt = 0; m->m_type = MT_DATA; - m->m_flags = M_PKTHDR; + m->m_flags = M_PKTHDR; + m->m_len = 0; m->m_data = m->m_pktdat; _M_CLEAR_PKTHDR(m); } else { - MBUF_UNLOCK(); - - m = m_retryhdr(how, MT_DATA); - - if (m == 0) - return(top); - - MBUF_LOCK(); + MBUF_UNLOCK(); + m = m_retryhdr(how, MT_DATA); + if (m == 0) + return(top); + MBUF_LOCK(); } *np = m; np = &m->m_nextpkt; @@ -810,8 +1212,8 @@ m_getpackethdrs(int num_needed, int how) * returns the count of mbufs/packets freed. Used by the drivers. 
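
/*
 * Editor's illustration (not part of the patch): m_freem_list(), documented
 * above, walks a two-level structure -- packets chained through m_nextpkt,
 * segments within a packet chained through m_next -- and returns how many
 * mbufs it freed.  User-space model of the traversal; names are local to
 * this sketch.
 */
#include <stdio.h>
#include <stdlib.h>

struct node {
        struct node *next;      /* next segment of this packet (m_next) */
        struct node *nextpkt;   /* next packet in the list (m_nextpkt) */
};

static int
free_list(struct node *m)
{
        int count = 0;

        while (m) {
                struct node *pkt = m->nextpkt;  /* save before freeing */

                while (m) {                     /* free every segment */
                        struct node *n = m->next;
                        free(m);
                        count++;
                        m = n;
                }
                m = pkt;                        /* on to the next packet */
        }
        return count;
}

int
main(void)
{
        /* two packets: the first has two segments, the second has one */
        struct node *seg2 = calloc(1, sizeof(struct node));
        struct node *pkt2 = calloc(1, sizeof(struct node));
        struct node *pkt1 = calloc(1, sizeof(struct node));

        pkt1->next = seg2;
        pkt1->nextpkt = pkt2;
        printf("%d\n", free_list(pkt1));        /* prints 3 */
        return 0;
}
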
*/ int -m_freem_list(m) - struct mbuf *m; +m_freem_list( + struct mbuf *m) { struct mbuf *nextpkt; int i, count=0; @@ -830,6 +1232,12 @@ m_freem_list(m) struct mbuf *n; + m_range_check(m); + m_range_check(mfree); + m_range_check(mclfree); + m_range_check(mbigfree); + + /* Free the aux data if there is any */ if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.aux) { /* @@ -845,6 +1253,13 @@ m_freem_list(m) m = nextpkt->m_pkthdr.aux; nextpkt->m_pkthdr.aux = NULL; } + + if ((m->m_flags & M_PKTHDR) != 0 && !SLIST_EMPTY(&m->m_pkthdr.tags)) { + /* A quick (albeit inefficient) expedient */ + MBUF_UNLOCK(); + m_tag_delete_chain(m, NULL); + MBUF_LOCK(); + } n = m->m_next; @@ -858,6 +1273,9 @@ m_freem_list(m) remque((queue_t)&m->m_ext.ext_refs); } else if (m->m_ext.ext_free == NULL) { union mcluster *mcl= (union mcluster *)m->m_ext.ext_buf; + + m_range_check(mcl); + if (_MCLUNREF(mcl)) { mcl->mcl_next = mclfree; mclfree = mcl; @@ -881,20 +1299,20 @@ m_freem_list(m) } m = nextpkt; /* bump m with saved nextpkt if any */ } - if (i = m_want) - m_want = 0; + if ((i = m_want)) + m_want = 0; MBUF_UNLOCK(); if (i) - wakeup((caddr_t)&mfree); + wakeup((caddr_t)&mfree); return (count); } void -m_freem(m) - register struct mbuf *m; +m_freem( + struct mbuf *m) { while (m) m = m_free(m); @@ -907,8 +1325,9 @@ m_freem(m) * Compute the amount of space available * before the current start of data in an mbuf. */ -m_leadingspace(m) -register struct mbuf *m; +int +m_leadingspace( + struct mbuf *m) { if (m->m_flags & M_EXT) { if (MCLHASREFERENCE(m)) @@ -924,8 +1343,9 @@ register struct mbuf *m; * Compute the amount of space available * after the end of data in an mbuf. */ -m_trailingspace(m) -register struct mbuf *m; +int +m_trailingspace( + struct mbuf *m) { if (m->m_flags & M_EXT) { if (MCLHASREFERENCE(m)) @@ -943,9 +1363,10 @@ register struct mbuf *m; * Does not adjust packet header length. 
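
/*
 * Editor's illustration (not part of the patch): m_leadingspace() and
 * m_trailingspace() above reduce to simple window arithmetic -- the data
 * occupies [data, data + len) inside the buffer [buf, buf + size).
 * User-space model; names are local to this sketch.
 */
#include <stdio.h>

struct window {
        char *buf;              /* start of storage (m_dat or ext_buf) */
        unsigned int size;      /* total storage size */
        char *data;             /* current start of data (m_data) */
        unsigned int len;       /* current data length (m_len) */
};

static long
leadingspace(const struct window *w)
{
        return w->data - w->buf;
}

static long
trailingspace(const struct window *w)
{
        return (w->buf + w->size) - (w->data + w->len);
}

int
main(void)
{
        static char buf[2048];
        struct window w = { buf, sizeof(buf), buf + 128, 1000 };

        printf("%ld %ld\n", leadingspace(&w), trailingspace(&w)); /* 128 920 */
        return 0;
}
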
*/ struct mbuf * -m_prepend(m, len, how) - register struct mbuf *m; - int len, how; +m_prepend( + struct mbuf *m, + int len, + int how) { struct mbuf *mn; @@ -973,9 +1394,10 @@ m_prepend(m, len, how) * */ struct mbuf * -m_prepend_2(m, len, how) - register struct mbuf *m; - int len, how; +m_prepend_2( + struct mbuf *m, + int len, + int how) { if (M_LEADINGSPACE(m) >= len) { m->m_data -= len; @@ -996,13 +1418,14 @@ m_prepend_2(m, len, how) int MCFail; struct mbuf * -m_copym(m, off0, len, wait) - register struct mbuf *m; - int off0, wait; - register int len; +m_copym( + struct mbuf *m, + int off0, + int len, + int wait) { - register struct mbuf *n, **np; - register int off = off0; + struct mbuf *n, **np; + int off = off0; struct mbuf *top; int copyhdr = 0; @@ -1023,20 +1446,24 @@ m_copym(m, off0, len, wait) MBUF_LOCK(); while (len > 0) { + m_range_check(mfree); + m_range_check(mclfree); + m_range_check(mbigfree); + if (m == 0) { if (len != M_COPYALL) panic("m_copym"); break; } - if (n = mfree) { - MCHECK(n); - ++mclrefcnt[mtocl(n)]; + if ((n = mfree)) { + MCHECK(n); + ++mclrefcnt[mtocl(n)]; mbstat.m_mtypes[MT_FREE]--; mbstat.m_mtypes[m->m_type]++; mfree = n->m_next; n->m_next = n->m_nextpkt = 0; n->m_type = m->m_type; - n->m_data = n->m_dat; + n->m_data = n->m_dat; n->m_flags = 0; } else { MBUF_UNLOCK(); @@ -1105,15 +1532,16 @@ nospace: * rescan the entire mbuf list (normally hung off of the socket) */ struct mbuf * -m_copym_with_hdrs(m, off0, len, wait, m_last, m_off) - register struct mbuf *m; - int off0, wait; - register int len; - struct mbuf **m_last; - int *m_off; +m_copym_with_hdrs( + struct mbuf *m, + int off0, + int len, + int wait, + struct mbuf **m_last, + int *m_off) { - register struct mbuf *n, **np; - register int off = off0; + struct mbuf *n, **np = 0; + int off = off0; struct mbuf *top = 0; int copyhdr = 0; int type; @@ -1130,9 +1558,14 @@ m_copym_with_hdrs(m, off0, len, wait, m_last, m_off) m = m->m_next; } } + MBUF_LOCK(); while (len > 0) { + m_range_check(mfree); + m_range_check(mclfree); + m_range_check(mbigfree); + if (top == 0) type = MT_HEADER; else { @@ -1140,7 +1573,7 @@ m_copym_with_hdrs(m, off0, len, wait, m_last, m_off) panic("m_gethdr_and_copym"); type = m->m_type; } - if (n = mfree) { + if ((n = mfree)) { MCHECK(n); ++mclrefcnt[mtocl(n)]; mbstat.m_mtypes[MT_FREE]--; @@ -1223,13 +1656,13 @@ nospace: * Copy data from an mbuf chain starting "off" bytes from the beginning, * continuing for "len" bytes, into the indicated buffer. */ -void m_copydata(m, off, len, cp) - register struct mbuf *m; - register int off; - register int len; - caddr_t cp; +void m_copydata( + struct mbuf *m, + int off, + int len, + caddr_t cp) { - register unsigned count; + unsigned count; if (off < 0 || len < 0) panic("m_copydata"); @@ -1258,8 +1691,8 @@ void m_copydata(m, off, len, cp) * Both chains must be of the same type (e.g. MT_DATA). * Any m_pkthdr is not updated. 
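
/*
 * Editor's illustration (not part of the patch): m_copym() and m_copydata()
 * above share one idea -- walk the chain to skip "off" bytes, then copy
 * "len" bytes segment by segment.  User-space model over an array of
 * segments; names are local to this sketch.
 */
#include <stdio.h>
#include <string.h>

struct seg {
        const char *p;
        int len;
};

static void
copy_region(const struct seg *s, int nseg, int off, int len, char *out)
{
        int i = 0;

        while (i < nseg && off >= s[i].len)     /* skip whole segments */
                off -= s[i++].len;

        while (len > 0 && i < nseg) {           /* copy piecewise */
                int n = s[i].len - off;

                if (n > len)
                        n = len;
                memcpy(out, s[i].p + off, n);
                out += n;
                len -= n;
                off = 0;                /* later segments: from the start */
                i++;
        }
}

int
main(void)
{
        struct seg segs[2] = { { "hello ", 6 }, { "world", 5 } };
        char out[8] = { 0 };

        copy_region(segs, 2, 4, 4, out);
        printf("%s\n", out);                    /* prints "o wo" */
        return 0;
}
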
*/ -void m_cat(m, n) - register struct mbuf *m, *n; +void m_cat( + struct mbuf *m, struct mbuf *n) { while (m->m_next) m = m->m_next; @@ -1279,13 +1712,13 @@ void m_cat(m, n) } void -m_adj(mp, req_len) - struct mbuf *mp; - int req_len; +m_adj( + struct mbuf *mp, + int req_len) { - register int len = req_len; - register struct mbuf *m; - register count; + int len = req_len; + struct mbuf *m; + int count; if ((m = mp) == NULL) return; @@ -1348,7 +1781,7 @@ m_adj(mp, req_len) } count -= m->m_len; } - while (m = m->m_next) + while ((m = m->m_next)) m->m_len = 0; } } @@ -1364,12 +1797,12 @@ m_adj(mp, req_len) int MPFail; struct mbuf * -m_pullup(n, len) - register struct mbuf *n; - int len; +m_pullup( + struct mbuf *n, + int len) { - register struct mbuf *m; - register int count; + struct mbuf *m; + int count; int space; /* @@ -1428,11 +1861,12 @@ bad: * attempts to restore the chain to its original state. */ struct mbuf * -m_split(m0, len0, wait) - register struct mbuf *m0; - int len0, wait; +m_split( + struct mbuf *m0, + int len0, + int wait) { - register struct mbuf *m, *n; + struct mbuf *m, *n; unsigned len = len0, remain; for (m = m0; m && len > m->m_len; m = m->m_next) @@ -1491,16 +1925,17 @@ extpacket: * Routine to copy from device local memory into mbufs. */ struct mbuf * -m_devget(buf, totlen, off0, ifp, copy) - char *buf; - int totlen, off0; - struct ifnet *ifp; - void (*copy)(); +m_devget( + char *buf, + int totlen, + int off0, + struct ifnet *ifp, + void (*copy)(const void *, void *, size_t)) { - register struct mbuf *m; + struct mbuf *m; struct mbuf *top = 0, **mp = ⊤ - register int off = off0, len; - register char *cp; + int off = off0, len; + char *cp; char *epkt; cp = buf; @@ -1571,35 +2006,61 @@ m_devget(buf, totlen, off0, ifp, copy) * Ensure hysteresis between hi/lo. */ static int -m_howmany() +m_howmany(int num, size_t bufsize) { - register int i; - - /* Under minimum */ - if (mbstat.m_clusters < MINCL) - return (MINCL - mbstat.m_clusters); - /* Too few (free < 1/2 total) and not over maximum */ - if (mbstat.m_clusters < nmbclusters && - (i = ((mbstat.m_clusters >> 1) - mbstat.m_clfree)) > 0) - return i; - return 0; + int i = 0; + + /* Bail if we've maxed out the mbuf memory map */ + if (mbstat.m_clusters + (mbstat.m_bigclusters << 1) < nmbclusters) { + int j = 0; + + if (bufsize == MCLBYTES) { + /* Under minimum */ + if (mbstat.m_clusters < MINCL) + return (MINCL - mbstat.m_clusters); + /* Too few (free < 1/2 total) and not over maximum */ + if (mbstat.m_clusters < (nmbclusters >> 1)) { + if (num >= mbstat.m_clfree) + i = num - mbstat.m_clfree; + if (((mbstat.m_clusters + num) >> 1) > mbstat.m_clfree) + j = ((mbstat.m_clusters + num) >> 1) - mbstat.m_clfree; + i = max(i, j); + if (i + mbstat.m_clusters >= (nmbclusters >> 1)) + i = (nmbclusters >> 1) - mbstat.m_clusters; + } + } else { + /* Under minimum */ + if (mbstat.m_bigclusters < MINCL) + return (MINCL - mbstat.m_bigclusters); + /* Too few (free < 1/2 total) and not over maximum */ + if (mbstat.m_bigclusters < (nmbclusters >> 2)) { + if (num >= mbstat.m_bigclfree) + i = num - mbstat.m_bigclfree; + if (((mbstat.m_bigclusters + num) >> 1) > mbstat.m_bigclfree) + j = ((mbstat.m_bigclusters + num) >> 1) - mbstat.m_bigclfree; + i = max(i, j); + if (i + mbstat.m_bigclusters >= (nmbclusters >> 2)) + i = (nmbclusters >> 2) - mbstat.m_bigclusters; + } + } + } + return i; } - /* * Copy data from a buffer back into the indicated mbuf chain, * starting "off" bytes from the beginning, extending the mbuf * chain if necessary. 
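
/*
 * Editor's illustration (not part of the patch): the 2 KB-cluster branch of
 * m_howmany() above keeps roughly half of the (current + requested) clusters
 * free and caps the pool at half of nmbclusters.  User-space model of that
 * hysteresis; the names and the MINCL stand-in are local to this sketch.
 */
#include <stdio.h>

#define SK_MINCL        64      /* illustrative floor */

static int
grow_by(int num, int clusters, int clfree, int nmbclusters)
{
        int i = 0, j = 0;

        if (clusters < SK_MINCL)                /* below the floor: top up */
                return SK_MINCL - clusters;
        if (clusters >= (nmbclusters >> 1))     /* at the cap: no growth */
                return 0;
        if (num >= clfree)                      /* not enough free right now */
                i = num - clfree;
        if (((clusters + num) >> 1) > clfree)   /* keep about half free */
                j = ((clusters + num) >> 1) - clfree;
        if (j > i)
                i = j;
        if (i + clusters >= (nmbclusters >> 1)) /* clamp at the cap */
                i = (nmbclusters >> 1) - clusters;
        return i;
}

int
main(void)
{
        /* 512 clusters, 10 free, 32 wanted, map allows 2048: grow by 262 */
        printf("%d\n", grow_by(32, 512, 10, 2048));
        return 0;
}
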
*/ void -m_copyback(m0, off, len, cp) - struct mbuf *m0; - register int off; - register int len; - caddr_t cp; +m_copyback( + struct mbuf *m0, + int off, + int len, + caddr_t cp) { - register int mlen; - register struct mbuf *m = m0, *n; + int mlen; + struct mbuf *m = m0, *n; int totlen = 0; if (m0 == 0) @@ -1640,16 +2101,16 @@ out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) } -char *mcl_to_paddr(register char *addr) { - register int base_phys; +char *mcl_to_paddr(char *addr) { + int base_phys; if (addr < (char *)mbutl || addr >= (char *)embutl) return (0); - base_phys = mcl_paddr[(addr - (char *)mbutl) >> PAGE_SHIFT]; + base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT]; if (base_phys == 0) return (0); - return ((char *)((int)base_phys | ((int)addr & PAGE_MASK))); + return ((char *)((int)base_phys | ((int)addr & PGOFSET))); } /* @@ -1663,8 +2124,9 @@ char *mcl_to_paddr(register char *addr) { int MDFail; struct mbuf * -m_dup(register struct mbuf *m, int how) -{ register struct mbuf *n, **np; +m_dup(struct mbuf *m, int how) +{ + struct mbuf *n, **np; struct mbuf *top; int copyhdr = 0; @@ -1684,16 +2146,16 @@ m_dup(register struct mbuf *m, int how) { if ((n = m_gethdr(how, m->m_type)) == NULL) return(NULL); n->m_len = m->m_len; - n->m_flags |= (m->m_flags & M_COPYFLAGS); - n->m_pkthdr.len = m->m_pkthdr.len; - n->m_pkthdr.rcvif = m->m_pkthdr.rcvif; - n->m_pkthdr.header = NULL; - n->m_pkthdr.csum_flags = 0; - n->m_pkthdr.csum_data = 0; - n->m_pkthdr.aux = NULL; - n->m_pkthdr.vlan_tag = 0; - n->m_pkthdr.reserved_1 = 0; - n->m_pkthdr.reserved2 = 0; + n->m_flags |= (m->m_flags & M_COPYFLAGS); + n->m_pkthdr.len = m->m_pkthdr.len; + n->m_pkthdr.rcvif = m->m_pkthdr.rcvif; + n->m_pkthdr.header = NULL; + n->m_pkthdr.csum_flags = 0; + n->m_pkthdr.csum_data = 0; + n->m_pkthdr.aux = NULL; + n->m_pkthdr.vlan_tag = 0; + n->m_pkthdr.socket_id = 0; + SLIST_INIT(&n->m_pkthdr.tags); bcopy(m->m_data, n->m_data, m->m_pkthdr.len); return(n); } @@ -1805,29 +2267,54 @@ void m_mcheck(struct mbuf *m) panic("mget MCHECK: m_type=%x m=%x", m->m_type, m); } -void +static void mbuf_expand_thread(void) { - while (1) { - int expand_mcl; - MBUF_LOCK(); - expand_mcl = mbuf_expand_mcl; - mbuf_expand_mcl = 0; - MBUF_UNLOCK(); - if (expand_mcl) { - caddr_t p; - MCLALLOC(p, M_WAIT); - if (p) MCLFREE(p); + while (1) { + MBUF_LOCK(); + if (mbuf_expand_mcl) { + int n; + + /* Adjust to the current number of cluster in use */ + n = mbuf_expand_mcl - (mbstat.m_clusters - mbstat.m_clfree); + mbuf_expand_mcl = 0; + + if (n > 0) + (void)m_clalloc(n, M_WAIT, MCLBYTES, 1); + } + if (mbuf_expand_big) { + int n; + + /* Adjust to the current number of 4 KB cluster in use */ + n = mbuf_expand_big - (mbstat.m_bigclusters - mbstat.m_bigclfree); + mbuf_expand_big = 0; + + if (n > 0) + (void)m_clalloc(n, M_WAIT, NBPG, 1); } - assert_wait(&mbuf_expand_thread_wakeup, THREAD_UNINT); - (void) thread_block(mbuf_expand_thread); - } + MBUF_UNLOCK(); + /* + * Because we can run out of memory before filling the mbuf map, we + * should not allocate more clusters than they are mbufs -- otherwise + * we could have a large number of useless clusters allocated. 
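
/*
 * Editor's illustration (not part of the patch): mcl_to_paddr() above splits
 * a cluster address into page number and page offset, maps the page through
 * a per-page physical-address table, and recombines.  User-space model with
 * a 4 KB page and a page-aligned base; names are local to this sketch.
 */
#include <stdio.h>

#define SK_PGSHIFT      12
#define SK_PGOFSET      ((1UL << SK_PGSHIFT) - 1)

static unsigned long
v2p(unsigned long vaddr, unsigned long base, const unsigned long *page_phys)
{
        unsigned long off = vaddr - base;       /* base is page aligned */
        unsigned long phys_page = page_phys[off >> SK_PGSHIFT];

        if (phys_page == 0)                     /* page not backed */
                return 0;
        return phys_page | (off & SK_PGOFSET);
}

int
main(void)
{
        unsigned long table[4] = { 0x10000, 0x38000, 0, 0x7c000 };

        /* second page, offset 0x123 -> 0x38123 */
        printf("0x%lx\n", v2p(0x4000 + 0x1123, 0x4000, table));
        return 0;
}
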
+ */ + while (mbstat.m_mbufs < mbstat.m_bigclusters + mbstat.m_clusters) { + if (m_expand(M_WAIT) == 0) + break; + } + + assert_wait(&mbuf_expand_thread_wakeup, THREAD_UNINT); + (void) thread_block((thread_continue_t)mbuf_expand_thread); + } } -void +static void mbuf_expand_thread_init(void) { mbuf_expand_thread_initialized++; mbuf_expand_thread(); } +SYSCTL_DECL(_kern_ipc); +SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RW, &mbstat, mbstat, ""); + diff --git a/bsd/kern/uipc_mbuf2.c b/bsd/kern/uipc_mbuf2.c index ef742b340..a8c8652b2 100644 --- a/bsd/kern/uipc_mbuf2.c +++ b/bsd/kern/uipc_mbuf2.c @@ -90,7 +90,7 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/malloc.h> #include <sys/mbuf.h> #if defined(PULLDOWN_STAT) && defined(INET6) @@ -279,17 +279,9 @@ m_pulldown(m, off, len, offp) if ((n->m_flags & M_EXT) == 0) sharedcluster = 0; else { -#ifdef __bsdi__ - if (n->m_ext.ext_func) -#else if (n->m_ext.ext_free) -#endif sharedcluster = 1; -#ifdef __NetBSD__ - else if (MCLISREFERENCED(n)) -#else - else if (mclrefcnt[mtocl(n->m_ext.ext_buf)] > 1) -#endif + else if (m_mclhasreference(n)) sharedcluster = 1; else sharedcluster = 0; @@ -440,3 +432,180 @@ m_aux_delete(m, victim) n = next; } } + +/* Get a packet tag structure along with specified data following. */ +struct m_tag * +m_tag_alloc(u_int32_t id, u_int16_t type, int len, int wait) +{ + struct m_tag *t; + + if (len < 0) + return NULL; +#ifndef __APPLE__ + t = malloc(len + sizeof(struct m_tag), M_PACKET_TAGS, wait); +#else + /*MALLOC(t, struct m_tag *, len + sizeof(struct m_tag), M_TEMP, M_WAITOK);*/ + if (len + sizeof(struct m_tag) <= MLEN) { + struct mbuf *m = m_get(wait, MT_TAG); + if (m == NULL) + return NULL; + t = (struct m_tag *) m->m_dat; + } else if (len + sizeof(struct m_tag) <= MCLBYTES) { + MCLALLOC((caddr_t)t, wait); + } else + t = NULL; +#endif + if (t == NULL) + return NULL; + t->m_tag_type = type; + t->m_tag_len = len; + t->m_tag_id = id; + return t; +} + + +/* Free a packet tag. */ +void +m_tag_free(struct m_tag *t) +{ +#ifndef __APPLE__ + free(t, M_PACKET_TAGS); +#else + /* FREE(t, M_TEMP); */ + if (t == NULL) + return; + if (t->m_tag_len <= MLEN) { + struct mbuf * m = m_dtom(t); + m_free(m); + } else { + MCLFREE((caddr_t)t); + } +#endif +} + +/* Prepend a packet tag. */ +void +m_tag_prepend(struct mbuf *m, struct m_tag *t) +{ + KASSERT(m && t, ("m_tag_prepend: null argument, m %p t %p", m, t)); + SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); +} + +/* Unlink a packet tag. */ +void +m_tag_unlink(struct mbuf *m, struct m_tag *t) +{ + KASSERT(m && t, ("m_tag_unlink: null argument, m %p t %p", m, t)); + SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link); +} + +/* Unlink and free a packet tag. */ +void +m_tag_delete(struct mbuf *m, struct m_tag *t) +{ + KASSERT(m && t, ("m_tag_delete: null argument, m %p t %p", m, t)); + m_tag_unlink(m, t); + m_tag_free(t); +} + +/* Unlink and free a packet tag chain, starting from given tag. */ +void +m_tag_delete_chain(struct mbuf *m, struct m_tag *t) +{ + struct m_tag *p, *q; + + KASSERT(m, ("m_tag_delete_chain: null mbuf")); + if (t != NULL) + p = t; + else + p = SLIST_FIRST(&m->m_pkthdr.tags); + if (p == NULL) + return; + while ((q = SLIST_NEXT(p, m_tag_link)) != NULL) + m_tag_delete(m, q); + m_tag_delete(m, p); +} + +/* Find a tag, starting from a given position. 
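
/*
 * Editor's illustration (not part of the patch): typical use of the packet
 * tag API introduced above -- allocate a tag with trailing data, prepend it
 * to a packet, locate it again later with m_tag_locate() (defined just
 * below).  The id/type values and "tag_example" are hypothetical; the
 * payload lives immediately after the m_tag header, as m_tag_copy()'s
 * bcopy(t + 1, ...) shows.
 */
static void
tag_example(struct mbuf *m)
{
        struct m_tag *t;

        t = m_tag_alloc(0x12345678 /* id */, 1 /* type */,
            sizeof(int), M_DONTWAIT);
        if (t == NULL)
                return;
        *(int *)(t + 1) = 42;           /* payload follows the header */
        m_tag_prepend(m, t);

        t = m_tag_locate(m, 0x12345678, 1, NULL);
        if (t != NULL)
                m_tag_delete(m, t);     /* unlink and free */
}
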
*/ +struct m_tag * +m_tag_locate(struct mbuf *m, u_int32_t id, u_int16_t type, struct m_tag *t) +{ + struct m_tag *p; + + KASSERT(m, ("m_tag_find: null mbuf")); + if (t == NULL) + p = SLIST_FIRST(&m->m_pkthdr.tags); + else + p = SLIST_NEXT(t, m_tag_link); + while (p != NULL) { + if (p->m_tag_id == id && p->m_tag_type == type) + return p; + p = SLIST_NEXT(p, m_tag_link); + } + return NULL; +} + +/* Copy a single tag. */ +struct m_tag * +m_tag_copy(struct m_tag *t, int how) +{ + struct m_tag *p; + + KASSERT(t, ("m_tag_copy: null tag")); + p = m_tag_alloc(t->m_tag_type, t->m_tag_id, t->m_tag_len, how); + if (p == NULL) + return (NULL); + bcopy(t + 1, p + 1, t->m_tag_len); /* Copy the data */ + return p; +} + +/* + * Copy two tag chains. The destination mbuf (to) loses any attached + * tags even if the operation fails. This should not be a problem, as + * m_tag_copy_chain() is typically called with a newly-allocated + * destination mbuf. + */ +int +m_tag_copy_chain(struct mbuf *to, struct mbuf *from, int how) +{ + struct m_tag *p, *t, *tprev = NULL; + + KASSERT(to && from, + ("m_tag_copy: null argument, to %p from %p", to, from)); + m_tag_delete_chain(to, NULL); + SLIST_FOREACH(p, &from->m_pkthdr.tags, m_tag_link) { + t = m_tag_copy(p, how); + if (t == NULL) { + m_tag_delete_chain(to, NULL); + return 0; + } + if (tprev == NULL) + SLIST_INSERT_HEAD(&to->m_pkthdr.tags, t, m_tag_link); + else { + SLIST_INSERT_AFTER(tprev, t, m_tag_link); + tprev = t; + } + } + return 1; +} + +/* Initialize tags on an mbuf. */ +void +m_tag_init(struct mbuf *m) +{ + SLIST_INIT(&m->m_pkthdr.tags); +} + +/* Get first tag in chain. */ +struct m_tag * +m_tag_first(struct mbuf *m) +{ + return SLIST_FIRST(&m->m_pkthdr.tags); +} + +/* Get next tag in chain. */ +struct m_tag * +m_tag_next(__unused struct mbuf *m, struct m_tag *t) +{ + return SLIST_NEXT(t, m_tag_link); +} diff --git a/bsd/kern/uipc_proto.c b/bsd/kern/uipc_proto.c index 1d31b684a..6fd419ddd 100644 --- a/bsd/kern/uipc_proto.c +++ b/bsd/kern/uipc_proto.c @@ -75,22 +75,29 @@ extern struct domain localdomain; /* or at least forward */ static struct protosw localsw[] = { { SOCK_STREAM, &localdomain, 0, PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS, - 0, 0, 0, 0, + 0, 0, 0, uipc_ctloutput, 0, 0, 0, 0, 0, - 0, &uipc_usrreqs + 0, + &uipc_usrreqs, + 0, 0, 0 + }, { SOCK_DGRAM, &localdomain, 0, PR_ATOMIC|PR_ADDR|PR_RIGHTS, - 0, 0, 0, 0, + 0, 0, 0, uipc_ctloutput, 0, 0, 0, 0, 0, - 0, &uipc_usrreqs + 0, + &uipc_usrreqs, + 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, raw_ctlinput, 0, 0, - raw_init, 0, 0, 0, - 0, &raw_usrreqs + 0, 0, 0, 0, + 0, + &raw_usrreqs, + 0, 0, 0 } }; diff --git a/bsd/kern/uipc_socket.c b/bsd/kern/uipc_socket.c index ebeeec818..2018446b4 100644 --- a/bsd/kern/uipc_socket.c +++ b/bsd/kern/uipc_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -60,8 +60,9 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/filedesc.h> -#include <sys/proc.h> -#include <sys/file.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> +#include <sys/file_internal.h> #include <sys/fcntl.h> #include <sys/malloc.h> #include <sys/mbuf.h> @@ -82,6 +83,7 @@ #include <netinet/in.h> #include <netinet/in_pcb.h> #include <kern/zalloc.h> +#include <kern/locks.h> #include <machine/limits.h> int so_cache_hw = 0; @@ -96,6 +98,11 @@ struct zone *so_cache_zone; extern int get_inpcb_str_size(); extern int get_tcp_str_size(); +static lck_grp_t *so_cache_mtx_grp; +static lck_attr_t *so_cache_mtx_attr; +static lck_grp_attr_t *so_cache_mtx_grp_attr; +lck_mtx_t *so_cache_mtx; + #include <machine/limits.h> static void filt_sordetach(struct knote *kn); @@ -111,6 +118,7 @@ static struct filterops soread_filtops = static struct filterops sowrite_filtops = { 1, NULL, filt_sowdetach, filt_sowrite }; +#define EVEN_MORE_LOCKING_DEBUG 0 int socket_debug = 0; int socket_zone = M_SOCKET; so_gen_t so_gencnt; /* generation count for sockets */ @@ -128,6 +136,7 @@ MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES) + SYSCTL_DECL(_kern_ipc); static int somaxconn = SOMAXCONN; @@ -144,8 +153,6 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW, &sorecvmincopy, 0, ""); void so_cache_timer(); -struct mbuf *m_getpackets(int, int, int); - /* * Socket operation routines. @@ -156,20 +163,54 @@ struct mbuf *m_getpackets(int, int, int); */ #ifdef __APPLE__ + +vm_size_t so_cache_zone_element_size; + +static int sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list, int *resid); + + void socketinit() { vm_size_t str_size; + if (so_cache_init_done) { + printf("socketinit: already called...\n"); + return; + } + + /* + * allocate lock group attribute and group for socket cache mutex + */ + so_cache_mtx_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setdefault(so_cache_mtx_grp_attr); + + so_cache_mtx_grp = lck_grp_alloc_init("so_cache", so_cache_mtx_grp_attr); + + /* + * allocate the lock attribute for socket cache mutex + */ + so_cache_mtx_attr = lck_attr_alloc_init(); + lck_attr_setdefault(so_cache_mtx_attr); + so_cache_init_done = 1; - timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz)); + so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr); /* cached sockets mutex */ + + if (so_cache_mtx == NULL) + return; /* we're hosed... 
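
/*
 * Editor's illustration (not part of the patch): the lck_* sequence used by
 * socketinit() above is the general recipe for creating a mutex with the new
 * lock API -- group attribute, group, lock attribute, then the mutex itself.
 * The "example" names are hypothetical.
 */
static lck_grp_attr_t *example_grp_attr;
static lck_grp_t *example_grp;
static lck_attr_t *example_attr;
static lck_mtx_t *example_mtx;

static void
example_lock_init(void)
{
        example_grp_attr = lck_grp_attr_alloc_init();
        lck_grp_attr_setdefault(example_grp_attr);

        example_grp = lck_grp_alloc_init("example", example_grp_attr);

        example_attr = lck_attr_alloc_init();
        lck_attr_setdefault(example_attr);

        example_mtx = lck_mtx_alloc_init(example_grp, example_attr);
}
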
*/ + str_size = (vm_size_t)( sizeof(struct socket) + 4 + get_inpcb_str_size() + 4 + get_tcp_str_size()); so_cache_zone = zinit (str_size, 120000*str_size, 8192, "socache zone"); #if TEMPDEBUG - kprintf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size); + printf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size); #endif + timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz)); + + so_cache_zone_element_size = str_size; + + sflt_init(); } @@ -179,11 +220,11 @@ int waitok; { caddr_t temp; - int s; register u_long offset; - s = splnet(); + lck_mtx_lock(so_cache_mtx); + if (cached_sock_count) { cached_sock_count--; *so = socket_cache_head; @@ -195,7 +236,8 @@ int waitok; socket_cache_head->cache_prev = 0; else socket_cache_tail = 0; - splx(s); + + lck_mtx_unlock(so_cache_mtx); temp = (*so)->so_saved_pcb; bzero((caddr_t)*so, sizeof(struct socket)); @@ -204,13 +246,16 @@ int waitok; cached_sock_count); #endif (*so)->so_saved_pcb = temp; + (*so)->cached_in_sock_layer = 1; + } else { #if TEMPDEBUG kprintf("Allocating cached sock %x from memory\n", *so); #endif - splx(s); + lck_mtx_unlock(so_cache_mtx); + if (waitok) *so = (struct socket *) zalloc(so_cache_zone); else @@ -255,17 +300,16 @@ int waitok; void cached_sock_free(so) struct socket *so; { - int s; + lck_mtx_lock(so_cache_mtx); - s = splnet(); if (++cached_sock_count > MAX_CACHED_SOCKETS) { --cached_sock_count; - splx(s); + lck_mtx_unlock(so_cache_mtx); #if TEMPDEBUG kprintf("Freeing overflowed cached socket %x\n", so); #endif - zfree(so_cache_zone, (vm_offset_t) so); + zfree(so_cache_zone, so); } else { #if TEMPDEBUG @@ -283,7 +327,7 @@ struct socket *so; so->cache_timestamp = so_cache_time; socket_cache_head = so; - splx(s); + lck_mtx_unlock(so_cache_mtx); } #if TEMPDEBUG @@ -297,44 +341,38 @@ struct socket *so; void so_cache_timer() { register struct socket *p; - register int s; register int n_freed = 0; - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); - ++so_cache_time; + lck_mtx_lock(so_cache_mtx); - s = splnet(); + ++so_cache_time; - while (p = socket_cache_tail) + while ( (p = socket_cache_tail) ) { if ((so_cache_time - p->cache_timestamp) < SO_CACHE_TIME_LIMIT) break; so_cache_timeouts++; - if (socket_cache_tail = p->cache_prev) + if ( (socket_cache_tail = p->cache_prev) ) p->cache_prev->cache_next = 0; if (--cached_sock_count == 0) socket_cache_head = 0; - splx(s); - zfree(so_cache_zone, (vm_offset_t) p); + zfree(so_cache_zone, p); - splnet(); if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) { so_cache_max_freed++; break; } } - splx(s); + lck_mtx_unlock(so_cache_mtx); timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz)); - (void) thread_funnel_set(network_flock, FALSE); } #endif /* __APPLE__ */ @@ -358,12 +396,12 @@ soalloc(waitok, dom, type) cached_sock_alloc(&so, waitok); else { - so = _MALLOC_ZONE(sizeof(*so), socket_zone, M_WAITOK); + MALLOC_ZONE(so, struct socket *, sizeof(*so), socket_zone, M_WAITOK); if (so) bzero(so, sizeof *so); } /* XXX race condition for reentrant kernel */ - +//###LD Atomic add for so_gencnt if (so) { so->so_gencnt = ++so_gencnt; so->so_zone = socket_zone; @@ -415,23 +453,22 @@ socreate(dom, aso, type, proto) #ifdef __APPLE__ if (p != 0) { - if (p->p_ucred->cr_uid == 0) + so->so_uid = kauth_cred_getuid(kauth_cred_get()); + if (!suser(kauth_cred_get(),NULL)) so->so_state = SS_PRIV; - - so->so_uid = p->p_ucred->cr_uid; } #else - so->so_cred = p->p_ucred; - crhold(so->so_cred); + so->so_cred = kauth_cred_get_with_ref(); #endif so->so_proto = 
prp; #ifdef __APPLE__ so->so_rcv.sb_flags |= SB_RECV; /* XXX */ - if (prp->pr_sfilter.tqh_first) - error = sfilter_init(so); - if (error == 0) + so->so_rcv.sb_so = so->so_snd.sb_so = so; #endif - error = (*prp->pr_usrreqs->pru_attach)(so, proto, p); + +//### Attachement will create the per pcb lock if necessary and increase refcount + + error = (*prp->pr_usrreqs->pru_attach)(so, proto, p); if (error) { /* * Warning: @@ -439,13 +476,16 @@ socreate(dom, aso, type, proto) * so protocol attachment handler must be coded carefuly */ so->so_state |= SS_NOFDREF; - sofree(so); + sofreelastref(so, 1); return (error); } + so->so_usecount++; #ifdef __APPLE__ prp->pr_domain->dom_refs++; - so->so_rcv.sb_so = so->so_snd.sb_so = so; TAILQ_INIT(&so->so_evlist); + + /* Attach socket filters for this protocol */ + sflt_initsock(so); #if TCPDEBUG if (tcpconsdebug == 2) so->so_options |= SO_DEBUG; @@ -463,29 +503,40 @@ sobind(so, nam) { struct proc *p = current_proc(); - int error; - struct kextcb *kp; - int s = splnet(); + int error = 0; + struct socket_filter_entry *filter; + int filtered = 0; - error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p); - if (error == 0) { - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_sobind) { - error = (*kp->e_soif->sf_sobind)(so, nam, kp); - if (error) { - if (error == EJUSTRETURN) { - error = 0; - break; - } - splx(s); - return(error); - } + socket_lock(so, 1); + + /* Socket filter */ + error = 0; + for (filter = so->so_filt; filter && (error == 0); + filter = filter->sfe_next_onsocket) { + if (filter->sfe_filter->sf_filter.sf_bind) { + if (filtered == 0) { + filtered = 1; + sflt_use(so); + socket_unlock(so, 0); } - kp = kp->e_next; + error = filter->sfe_filter->sf_filter.sf_bind( + filter->sfe_cookie, so, nam); } } - splx(s); + if (filtered != 0) { + socket_lock(so, 0); + sflt_unuse(so); + } + /* End socket filter */ + + if (error == 0) + error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p); + + socket_unlock(so, 1); + + if (error == EJUSTRETURN) + error = 0; + return (error); } @@ -513,13 +564,17 @@ sodealloc(so) FREE(so->so_accf, M_ACCF); } #endif /* INET */ - crfree(so->so_cred); + kauth_cred_rele(so->so_cred); zfreei(so->so_zone, so); #else if (so->cached_in_sock_layer == 1) cached_sock_free(so); - else - _FREE_ZONE(so, sizeof(*so), so->so_zone); + else { + if (so->cached_in_sock_layer == -1) + panic("sodealloc: double dealloc: so=%x\n", so); + so->cached_in_sock_layer = -1; + FREE_ZONE(so, sizeof(*so), so->so_zone); + } #endif /* __APPLE__ */ } @@ -529,64 +584,65 @@ solisten(so, backlog) int backlog; { - struct kextcb *kp; struct proc *p = current_proc(); - int s, error; + int error; - s = splnet(); - error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p); + socket_lock(so, 1); + + { + struct socket_filter_entry *filter; + int filtered = 0; + error = 0; + for (filter = so->so_filt; filter && (error == 0); + filter = filter->sfe_next_onsocket) { + if (filter->sfe_filter->sf_filter.sf_listen) { + if (filtered == 0) { + filtered = 1; + sflt_use(so); + socket_unlock(so, 0); + } + error = filter->sfe_filter->sf_filter.sf_listen( + filter->sfe_cookie, so); + } + } + if (filtered != 0) { + socket_lock(so, 0); + sflt_unuse(so); + } + } + + if (error == 0) { + error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p); + } + if (error) { - splx(s); + socket_unlock(so, 1); + if (error == EJUSTRETURN) + error = 0; return (error); } - if (TAILQ_EMPTY(&so->so_comp)) + + if (TAILQ_EMPTY(&so->so_comp)) so->so_options |= SO_ACCEPTCONN; if (backlog < 0 
|| backlog > somaxconn) backlog = somaxconn; so->so_qlimit = backlog; - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_solisten) { - error = (*kp->e_soif->sf_solisten)(so, kp); - if (error) { - if (error == EJUSTRETURN) { - error = 0; - break; - } - splx(s); - return(error); - } - } - kp = kp->e_next; - } - splx(s); + socket_unlock(so, 1); return (0); } - void -sofree(so) +sofreelastref(so, dealloc) register struct socket *so; + int dealloc; { int error; - struct kextcb *kp; struct socket *head = so->so_head; - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_sofree) { - error = (*kp->e_soif->sf_sofree)(so, kp); - if (error) { - selthreadclear(&so->so_snd.sb_sel); - selthreadclear(&so->so_rcv.sb_sel); - return; /* void fn */ - } - } - kp = kp->e_next; - } + /*### Assume socket is locked */ - if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) { + if ((!(so->so_flags & SOF_PCBCLEARING)) || ((so->so_state & SS_NOFDREF) == 0)) { #ifdef __APPLE__ selthreadclear(&so->so_snd.sb_sel); selthreadclear(&so->so_rcv.sb_sel); @@ -594,6 +650,7 @@ sofree(so) return; } if (head != NULL) { + socket_lock(head, 1); if (so->so_state & SS_INCOMP) { TAILQ_REMOVE(&head->so_incomp, so, so_list); head->so_incqlen--; @@ -608,6 +665,7 @@ sofree(so) selthreadclear(&so->so_snd.sb_sel); selthreadclear(&so->so_rcv.sb_sel); #endif + socket_unlock(head, 1); return; } else { panic("sofree: not queued"); @@ -615,14 +673,20 @@ sofree(so) head->so_qlen--; so->so_state &= ~SS_INCOMP; so->so_head = NULL; + socket_unlock(head, 1); } #ifdef __APPLE__ selthreadclear(&so->so_snd.sb_sel); sbrelease(&so->so_snd); #endif sorflush(so); - sfilter_term(so); - sodealloc(so); + + /* 3932268: disable upcall */ + so->so_rcv.sb_flags &= ~SB_UPCALL; + so->so_snd.sb_flags &= ~SB_UPCALL; + + if (dealloc) + sodealloc(so); } /* @@ -631,52 +695,69 @@ sofree(so) * Free socket when disconnect complete. */ int -soclose(so) +soclose_locked(so) register struct socket *so; { - int s = splnet(); /* conservative */ int error = 0; - struct kextcb *kp; + lck_mtx_t * mutex_held; + struct timespec ts; -#ifndef __APPLE__ - funsetown(so->so_sigio); -#endif - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_soclose) { - error = (*kp->e_soif->sf_soclose)(so, kp); - if (error) { - splx(s); - return((error == EJUSTRETURN) ? 0 : error); - } - } - kp = kp->e_next; + if (so->so_usecount == 0) { + panic("soclose: so=%x refcount=0\n", so); } - if (so->so_options & SO_ACCEPTCONN) { - struct socket *sp, *sonext; - - sp = TAILQ_FIRST(&so->so_incomp); - for (; sp != NULL; sp = sonext) { - sonext = TAILQ_NEXT(sp, so_list); - (void) soabort(sp); - } - for (sp = TAILQ_FIRST(&so->so_comp); sp != NULL; sp = sonext) { - sonext = TAILQ_NEXT(sp, so_list); - /* Dequeue from so_comp since sofree() won't do it */ - TAILQ_REMOVE(&so->so_comp, sp, so_list); - so->so_qlen--; - sp->so_state &= ~SS_COMP; - sp->so_head = NULL; - (void) soabort(sp); - } - - } - if (so->so_pcb == 0) + sflt_notify(so, sock_evt_closing, NULL); + + if ((so->so_options & SO_ACCEPTCONN)) { + struct socket *sp; + + /* We do not want new connection to be added to the connection queues */ + so->so_options &= ~SO_ACCEPTCONN; + + while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { + /* A bit tricky here. We need to keep + * a lock if it's a protocol global lock + * but we want the head, not the socket locked + * in the case of per-socket lock... 
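
/*
 * Editor's illustration (not part of the patch): the ordering the comment
 * above describes, pulled out as a shape.  With per-socket locks
 * (pr_getlock != NULL), lock the queued socket sp before dropping the
 * listener so, abort sp, then reacquire in the same order.  "abort_queued"
 * is a hypothetical name.
 */
static void
abort_queued(struct socket *so, struct socket *sp)
{
        if (so->so_proto->pr_getlock != NULL) {
                socket_lock(sp, 1);     /* take sp first */
                socket_unlock(so, 0);   /* then drop the listener */
        }
        (void) soabort(sp);
        if (so->so_proto->pr_getlock != NULL) {
                socket_lock(so, 0);     /* reacquire the listener */
                socket_unlock(sp, 1);   /* finally release sp */
        }
}
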
+ */ + if (so->so_proto->pr_getlock != NULL) + socket_lock(sp, 1); + if (so->so_proto->pr_getlock != NULL) + socket_unlock(so, 0); + (void) soabort(sp); + if (so->so_proto->pr_getlock != NULL) + socket_lock(so, 0); + if (so->so_proto->pr_getlock != NULL) + socket_unlock(sp, 1); + } + + while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { + if (so->so_proto->pr_getlock != NULL) + socket_lock(sp, 1); + + /* Dequeue from so_comp since sofree() won't do it */ + TAILQ_REMOVE(&so->so_comp, sp, so_list); + so->so_qlen--; + sp->so_state &= ~SS_COMP; + sp->so_head = NULL; + + if (so->so_proto->pr_getlock != NULL) + socket_unlock(so, 0); + (void) soabort(sp); + if (so->so_proto->pr_getlock != NULL) + socket_lock(so, 0); + if (so->so_proto->pr_getlock != NULL) + socket_unlock(sp, 1); + } + } + if (so->so_pcb == 0) { + /* 3915887: mark the socket as ready for dealloc */ + so->so_flags |= SOF_PCBCLEARING; goto discard; + } if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { - error = sodisconnect(so); + error = sodisconnectlocked(so); if (error) goto drop; } @@ -684,20 +765,34 @@ soclose(so) if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) goto drop; + if (so->so_proto->pr_getlock != NULL) + mutex_held = (*so->so_proto->pr_getlock)(so, 0); + else + mutex_held = so->so_proto->pr_domain->dom_mtx; while (so->so_state & SS_ISCONNECTED) { - error = tsleep((caddr_t)&so->so_timeo, - PSOCK | PCATCH, "soclos", so->so_linger); - if (error) + ts.tv_sec = (so->so_linger/100); + ts.tv_nsec = (so->so_linger % 100) * NSEC_PER_USEC * 1000 * 10; + error = msleep((caddr_t)&so->so_timeo, mutex_held, + PSOCK | PCATCH, "soclos", &ts); + if (error) { + /* It's OK when the time fires, don't report an error */ + if (error == EWOULDBLOCK) + error = 0; break; + } } } } drop: - if (so->so_pcb) { + if (so->so_usecount == 0) + panic("soclose: usecount is zero so=%x\n", so); + if (so->so_pcb && !(so->so_flags & SOF_PCBCLEARING)) { int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); if (error == 0) error = error2; } + if (so->so_usecount <= 0) + panic("soclose: usecount is zero so=%x\n", so); discard: if (so->so_pcb && so->so_state & SS_NOFDREF) panic("soclose: NOFDREF"); @@ -706,20 +801,49 @@ discard: so->so_proto->pr_domain->dom_refs--; evsofree(so); #endif + so->so_usecount--; sofree(so); - splx(s); return (error); } +int +soclose(so) + register struct socket *so; +{ + int error = 0; + socket_lock(so, 1); + if (so->so_retaincnt == 0) + error = soclose_locked(so); + else { /* if the FD is going away, but socket is retained in kernel remove its reference */ + so->so_usecount--; + if (so->so_usecount < 2) + panic("soclose: retaincnt non null and so=%x usecount=%x\n", so->so_usecount); + } + socket_unlock(so, 1); + return (error); +} + + /* * Must be called at splnet... 
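
/*
 * Editor's illustration (not part of the patch): the SO_LINGER wait above
 * converts so_linger, expressed here in hundredths of a second, into a
 * timespec for msleep().  Worked user-space example:
 */
#include <stdio.h>

#define SK_NSEC_PER_USEC        1000ULL

int
main(void)
{
        unsigned int linger = 250;      /* 2.50 seconds */
        unsigned long sec = linger / 100;
        unsigned long long nsec =
            (linger % 100) * SK_NSEC_PER_USEC * 1000 * 10;

        /* prints: 2 s + 500000000 ns */
        printf("%lu s + %llu ns\n", sec, nsec);
        return 0;
}
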
*/ +//#### Should already be locked int soabort(so) struct socket *so; { int error; +#ifdef MORE_LOCKING_DEBUG + lck_mtx_t * mutex_held; + + if (so->so_proto->pr_getlock != NULL) + mutex_held = (*so->so_proto->pr_getlock)(so, 0); + else + mutex_held = so->so_proto->pr_domain->dom_mtx; + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); +#endif + error = (*so->so_proto->pr_usrreqs->pru_abort)(so); if (error) { sofree(so); @@ -729,55 +853,48 @@ soabort(so) } int -soaccept(so, nam) +soacceptlock(so, nam, dolock) register struct socket *so; struct sockaddr **nam; + int dolock; { - int s = splnet(); int error; - struct kextcb *kp; + + if (dolock) socket_lock(so, 1); if ((so->so_state & SS_NOFDREF) == 0) panic("soaccept: !NOFDREF"); so->so_state &= ~SS_NOFDREF; error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); - if (error == 0) { - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_soaccept) { - error = (*kp->e_soif->sf_soaccept)(so, nam, kp); - if (error) { - if (error == EJUSTRETURN) { - error = 0; - break; - } - splx(s); - return(error); - } - } - kp = kp->e_next; - } - } - - splx(s); + if (dolock) socket_unlock(so, 1); return (error); } +int +soaccept(so, nam) + register struct socket *so; + struct sockaddr **nam; +{ + return (soacceptlock(so, nam, 1)); +} int -soconnect(so, nam) +soconnectlock(so, nam, dolock) register struct socket *so; struct sockaddr *nam; + int dolock; { int s; int error; struct proc *p = current_proc(); - struct kextcb *kp; - if (so->so_options & SO_ACCEPTCONN) + if (dolock) socket_lock(so, 1); + + if (so->so_options & SO_ACCEPTCONN) { + if (dolock) socket_unlock(so, 1); return (EOPNOTSUPP); - s = splnet(); + } /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. @@ -786,72 +903,77 @@ soconnect(so, nam) */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || - (error = sodisconnect(so)))) + (error = sodisconnectlocked(so)))) error = EISCONN; else { - /* - * Run connect filter before calling protocol: - * - non-blocking connect returns before completion; - * - allows filters to modify address. 
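
/*
 * Editor's illustration (not part of the patch): sodisconnect() below pairs
 * a *locked variant with a thin locking wrapper, and soacceptlock()/
 * soconnectlock() fold the same idea into a dolock flag.  The two-function
 * form of the pattern, with "sofrob" as a hypothetical operation name:
 */
static int
sofrob_locked(struct socket *so)
{
        /* caller holds the socket lock; the real work goes here */
        return (0);
}

static int
sofrob(struct socket *so)
{
        int error;

        socket_lock(so, 1);
        error = sofrob_locked(so);
        socket_unlock(so, 1);
        return (error);
}
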
- */ - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_soconnect) { - error = (*kp->e_soif->sf_soconnect)(so, nam, kp); - if (error) { - if (error == EJUSTRETURN) { - error = 0; - } - splx(s); - return(error); - } - } - kp = kp->e_next; - } + /* + * Run connect filter before calling protocol: + * - non-blocking connect returns before completion; + */ + { + struct socket_filter_entry *filter; + int filtered = 0; + error = 0; + for (filter = so->so_filt; filter && (error == 0); + filter = filter->sfe_next_onsocket) { + if (filter->sfe_filter->sf_filter.sf_connect_out) { + if (filtered == 0) { + filtered = 1; + sflt_use(so); + socket_unlock(so, 0); + } + error = filter->sfe_filter->sf_filter.sf_connect_out( + filter->sfe_cookie, so, nam); + } + } + if (filtered != 0) { + socket_lock(so, 0); + sflt_unuse(so); + } + } + if (error) { + if (error == EJUSTRETURN) + error = 0; + if (dolock) socket_unlock(so, 1); + return error; + } + error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p); } - splx(s); + if (dolock) socket_unlock(so, 1); return (error); } +int +soconnect(so, nam) + register struct socket *so; + struct sockaddr *nam; +{ + return (soconnectlock(so, nam, 1)); +} + int soconnect2(so1, so2) register struct socket *so1; struct socket *so2; { - int s = splnet(); int error; - struct kextcb *kp; +//####### Assumes so1 is already locked / + + socket_lock(so2, 1); error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); - if (error == 0) { - kp = sotokextcb(so1); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_soconnect2) { - error = (*kp->e_soif->sf_soconnect2)(so1, so2, kp); - if (error) { - if (error == EJUSTRETURN) { - return 0; - break; - } - splx(s); - return(error); - } - } - kp = kp->e_next; - } - } - splx(s); + + socket_unlock(so2, 1); return (error); } + int -sodisconnect(so) +sodisconnectlocked(so) register struct socket *so; { - int s = splnet(); int error; - struct kextcb *kp; if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; @@ -861,31 +983,102 @@ sodisconnect(so) error = EALREADY; goto bad; } + error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); + if (error == 0) { - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_sodisconnect) { - error = (*kp->e_soif->sf_sodisconnect)(so, kp); - if (error) { - if (error == EJUSTRETURN) { - error = 0; - break; - } - splx(s); - return(error); - } - } - kp = kp->e_next; - } + sflt_notify(so, sock_evt_disconnected, NULL); } bad: - splx(s); return (error); } +//### Locking version +int +sodisconnect(so) + register struct socket *so; +{ + int error; + + socket_lock(so, 1); + error = sodisconnectlocked(so); + socket_unlock(so, 1); + return(error); +} #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_DONTWAIT : M_WAIT) + +/* + * sosendcheck will lock the socket buffer if it isn't locked and + * verify that there is space for the data being inserted. + */ + +static int +sosendcheck( + struct socket *so, + struct sockaddr *addr, + long resid, + long clen, + long atomic, + int flags, + int *sblocked) +{ + int error = 0; + long space; + +restart: + if (*sblocked == 0) { + error = sblock(&so->so_snd, SBLOCKWAIT(flags)); + if (error) + return error; + *sblocked = 1; + } + + if (so->so_state & SS_CANTSENDMORE) + return EPIPE; + + if (so->so_error) { + error = so->so_error; + so->so_error = 0; + return error; + } + + if ((so->so_state & SS_ISCONNECTED) == 0) { + /* + * `sendto' and `sendmsg' is allowed on a connection- + * based socket if it supports implied connect. 
+ * Return ENOTCONN if not connected and no address is + * supplied. + */ + if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && + (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { + if ((so->so_state & SS_ISCONFIRMING) == 0 && + !(resid == 0 && clen != 0)) + return ENOTCONN; + } else if (addr == 0 && !(flags&MSG_HOLD)) + return (so->so_proto->pr_flags & PR_CONNREQUIRED) ? ENOTCONN : EDESTADDRREQ; + } + space = sbspace(&so->so_snd); + if (flags & MSG_OOB) + space += 1024; + if ((atomic && resid > so->so_snd.sb_hiwat) || + clen > so->so_snd.sb_hiwat) + return EMSGSIZE; + if (space < resid + clen && + (atomic || space < so->so_snd.sb_lowat || space < clen)) { + if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) + return EWOULDBLOCK; + sbunlock(&so->so_snd, 1); + error = sbwait(&so->so_snd); + if (error) { + return error; + } + goto restart; + } + + return 0; +} + /* * Send on a socket. * If send must go all at once and message is larger than @@ -920,13 +1113,14 @@ sosend(so, addr, uio, top, control, flags) struct mbuf **mp; register struct mbuf *m, *freelist = NULL; register long space, len, resid; - int clen = 0, error, s, dontroute, mlen, sendflags; + int clen = 0, error, dontroute, mlen, sendflags; int atomic = sosendallatonce(so) || top; + int sblocked = 0; struct proc *p = current_proc(); - struct kextcb *kp; if (uio) - resid = uio->uio_resid; + // LP64todo - fix this! + resid = uio_resid(uio); else resid = top->m_pkthdr.len; @@ -937,6 +1131,8 @@ sosend(so, addr, uio, top, control, flags) so->so_snd.sb_lowat, so->so_snd.sb_hiwat); + socket_lock(so, 1); + /* * In theory resid should be unsigned. * However, space must be signed, as it might be less than 0 @@ -947,8 +1143,9 @@ sosend(so, addr, uio, top, control, flags) * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. */ - if (resid < 0 || so->so_type == SOCK_STREAM && (flags & MSG_EOR)) { + if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; + socket_unlock(so, 1); goto out; } @@ -959,161 +1156,138 @@ sosend(so, addr, uio, top, control, flags) p->p_stats->p_ru.ru_msgsnd++; if (control) clen = control->m_len; -#define snderr(errno) { error = errno; splx(s); goto release; } -restart: - error = sblock(&so->so_snd, SBLOCKWAIT(flags)); - if (error) - goto out; do { - s = splnet(); - if (so->so_state & SS_CANTSENDMORE) - snderr(EPIPE); - if (so->so_error) { - error = so->so_error; - so->so_error = 0; - splx(s); - goto release; - } - if ((so->so_state & SS_ISCONNECTED) == 0) { - /* - * `sendto' and `sendmsg' is allowed on a connection- - * based socket if it supports implied connect. - * Return ENOTCONN if not connected and no address is - * supplied. - */ - if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && - (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { - if ((so->so_state & SS_ISCONFIRMING) == 0 && - !(resid == 0 && clen != 0)) - snderr(ENOTCONN); - } else if (addr == 0 && !(flags&MSG_HOLD)) - snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ? 
- ENOTCONN : EDESTADDRREQ); - } - space = sbspace(&so->so_snd); - if (flags & MSG_OOB) - space += 1024; - if ((atomic && resid > so->so_snd.sb_hiwat) || - clen > so->so_snd.sb_hiwat) - snderr(EMSGSIZE); - if (space < resid + clen && - (atomic || space < so->so_snd.sb_lowat || space < clen)) { - if (so->so_state & SS_NBIO) - snderr(EWOULDBLOCK); - sbunlock(&so->so_snd); - error = sbwait(&so->so_snd); - splx(s); - if (error) + error = sosendcheck(so, addr, resid, clen, atomic, flags, &sblocked); + if (error) { + if (sblocked) + goto release; + else { + socket_unlock(so, 1); goto out; - goto restart; + } } - splx(s); mp = ⊤ - space -= clen; + space = sbspace(&so->so_snd) - clen + ((flags & MSG_OOB) ? 1024 : 0); do { - if (uio == NULL) { - /* - * Data is prepackaged in "top". - */ - resid = 0; - if (flags & MSG_EOR) - top->m_flags |= M_EOR; - } else { - boolean_t dropped_funnel = FALSE; - int chainlength; - int bytes_to_copy; - - bytes_to_copy = min(resid, space); - - if (sosendminchain > 0) { - if (bytes_to_copy >= sosendminchain) { - dropped_funnel = TRUE; - (void)thread_funnel_set(network_flock, FALSE); - } - chainlength = 0; - } else - chainlength = sosendmaxchain; - - do { - - if (bytes_to_copy >= MINCLSIZE) { - /* - * try to maintain a local cache of mbuf clusters needed to complete this write - * the list is further limited to the number that are currently needed to fill the socket - * this mechanism allows a large number of mbufs/clusters to be grabbed under a single - * mbuf lock... if we can't get any clusters, than fall back to trying for mbufs - * if we fail early (or miscalcluate the number needed) make sure to release any clusters - * we haven't yet consumed. - */ - if ((m = freelist) == NULL) { - int num_needed; - int hdrs_needed = 0; - - if (top == 0) - hdrs_needed = 1; - num_needed = bytes_to_copy / MCLBYTES; - - if ((bytes_to_copy - (num_needed * MCLBYTES)) >= MINCLSIZE) - num_needed++; - - if ((freelist = m_getpackets(num_needed, hdrs_needed, M_WAIT)) == NULL) - goto getpackets_failed; - m = freelist; - } - freelist = m->m_next; - m->m_next = NULL; - - mlen = MCLBYTES; - len = min(mlen, bytes_to_copy); - } else { -getpackets_failed: - if (top == 0) { - MGETHDR(m, M_WAIT, MT_DATA); - mlen = MHLEN; - m->m_pkthdr.len = 0; - m->m_pkthdr.rcvif = (struct ifnet *)0; - } else { - MGET(m, M_WAIT, MT_DATA); - mlen = MLEN; - } - len = min(mlen, bytes_to_copy); - /* - * For datagram protocols, leave room - * for protocol headers in first mbuf. - */ - if (atomic && top == 0 && len < mlen) - MH_ALIGN(m, len); - } - chainlength += len; - space -= len; - - error = uiomove(mtod(m, caddr_t), (int)len, uio); - - resid = uio->uio_resid; - - m->m_len = len; - *mp = m; - top->m_pkthdr.len += len; - if (error) - break; - mp = &m->m_next; - if (resid <= 0) { + if (uio == NULL) { + /* + * Data is prepackaged in "top". + */ + resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; - break; - } - bytes_to_copy = min(resid, space); - - } while (space > 0 && (chainlength < sosendmaxchain || atomic || resid < MINCLSIZE)); + } else { + int chainlength; + int bytes_to_copy; + + bytes_to_copy = min(resid, space); + + if (sosendminchain > 0) { + chainlength = 0; + } else + chainlength = sosendmaxchain; + + socket_unlock(so, 0); + + do { + int num_needed; + int hdrs_needed = (top == 0) ? 
1 : 0; + + /* + * try to maintain a local cache of mbuf clusters needed to complete this write + * the list is further limited to the number that are currently needed to fill the socket + * this mechanism allows a large number of mbufs/clusters to be grabbed under a single + * mbuf lock... if we can't get any clusters, than fall back to trying for mbufs + * if we fail early (or miscalcluate the number needed) make sure to release any clusters + * we haven't yet consumed. + */ + if (freelist == NULL && bytes_to_copy > MCLBYTES) { + num_needed = bytes_to_copy / NBPG; + + if ((bytes_to_copy - (num_needed * NBPG)) >= MINCLSIZE) + num_needed++; + + freelist = m_getpackets_internal(&num_needed, hdrs_needed, M_WAIT, 0, NBPG); + /* Fall back to cluster size if allocation failed */ + } + + if (freelist == NULL && bytes_to_copy > MINCLSIZE) { + num_needed = bytes_to_copy / MCLBYTES; + + if ((bytes_to_copy - (num_needed * MCLBYTES)) >= MINCLSIZE) + num_needed++; + + freelist = m_getpackets_internal(&num_needed, hdrs_needed, M_WAIT, 0, MCLBYTES); + /* Fall back to a single mbuf if allocation failed */ + } + + if (freelist == NULL) { + if (top == 0) + MGETHDR(freelist, M_WAIT, MT_DATA); + else + MGET(freelist, M_WAIT, MT_DATA); + + if (freelist == NULL) { + error = ENOBUFS; + socket_lock(so, 0); + if (sblocked) { + goto release; + } else { + socket_unlock(so, 1); + goto out; + } + } + /* + * For datagram protocols, leave room + * for protocol headers in first mbuf. + */ + if (atomic && top == 0 && bytes_to_copy < MHLEN) + MH_ALIGN(freelist, bytes_to_copy); + } + m = freelist; + freelist = m->m_next; + m->m_next = NULL; + + if ((m->m_flags & M_EXT)) + mlen = m->m_ext.ext_size; + else if ((m->m_flags & M_PKTHDR)) + mlen = MHLEN - m_leadingspace(m); + else + mlen = MLEN; + len = min(mlen, bytes_to_copy); + + chainlength += len; + + space -= len; - if (dropped_funnel == TRUE) - (void)thread_funnel_set(network_flock, TRUE); - if (error) - goto release; - } + error = uiomove(mtod(m, caddr_t), (int)len, uio); + + // LP64todo - fix this! + resid = uio_resid(uio); + + m->m_len = len; + *mp = m; + top->m_pkthdr.len += len; + if (error) + break; + mp = &m->m_next; + if (resid <= 0) { + if (flags & MSG_EOR) + top->m_flags |= M_EOR; + break; + } + bytes_to_copy = min(resid, space); + + } while (space > 0 && (chainlength < sosendmaxchain || atomic || resid < MINCLSIZE)); + + socket_lock(so, 0); + + if (error) + goto release; + } if (flags & (MSG_HOLD|MSG_SEND)) { /* Enqueue for later, go away if HOLD */ @@ -1138,7 +1312,6 @@ getpackets_failed: } if (dontroute) so->so_options |= SO_DONTROUTE; - s = splnet(); /* XXX */ /* Compute flags here, for pru_send and NKEs */ sendflags = (flags & MSG_OOB) ? PRUS_OOB : /* @@ -1152,32 +1325,84 @@ getpackets_failed: PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME */ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; - kp = sotokextcb(so); - while (kp) - { if (kp->e_soif && kp->e_soif->sf_sosend) { - error = (*kp->e_soif->sf_sosend)(so, &addr, - &uio, &top, - &control, - &sendflags, - kp); - if (error) { - splx(s); + + /* + * Socket filter processing + */ + { + struct socket_filter_entry *filter; + int filtered; + + filtered = 0; + error = 0; + for (filter = so->so_filt; filter && (error == 0); + filter = filter->sfe_next_onsocket) { + if (filter->sfe_filter->sf_filter.sf_data_out) { + int so_flags = 0; + if (filtered == 0) { + filtered = 1; + /* + * We don't let sbunlock unlock the socket because + * we don't want it to decrement the usecount. 
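+				 * (sbunlock(&so->so_snd, 1) releases only the sockbuf
+				 * lock; a zero "keeplocked" argument would also drop a
+				 * socket reference and the mutex, per sbunlock() in
+				 * uipc_socket2.c.)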
+ */ + sbunlock(&so->so_snd, 1); + sblocked = 0; + socket_unlock(so, 0); + so_flags = (sendflags & MSG_OOB) ? sock_data_filt_flag_oob : 0; + } + error = filter->sfe_filter->sf_filter.sf_data_out( + filter->sfe_cookie, so, addr, &top, &control, so_flags); + } + } + + if (filtered) { + /* + * At this point, we've run at least one filter. + * The socket is unlocked as is the socket buffer. + */ + socket_lock(so, 0); if (error == EJUSTRETURN) { - sbunlock(&so->so_snd); + error = 0; + clen = 0; + control = 0; + top = 0; + socket_unlock(so, 1); + goto out; + } + else if (error) { + socket_unlock(so, 1); + goto out; + } - if (freelist) - m_freem_list(freelist); - return(0); + + /* Verify our state again, this will lock the socket buffer */ + error = sosendcheck(so, addr, top->m_pkthdr.len, + control ? control->m_pkthdr.len : 0, + atomic, flags, &sblocked); + if (error) { + if (sblocked) { + /* sbunlock at release will unlock the socket */ + goto release; + } + else { + socket_unlock(so, 1); + goto out; + } } - goto release; } } - kp = kp->e_next; - } - - error = (*so->so_proto->pr_usrreqs->pru_send)(so, - sendflags, top, addr, control, p); - splx(s); + /* + * End Socket filter processing + */ + + if (error == EJUSTRETURN) { + /* A socket filter handled this data */ + error = 0; + } + else { + error = (*so->so_proto->pr_usrreqs->pru_send)(so, + sendflags, top, addr, control, p); + } #ifdef __APPLE__ if (flags & MSG_SEND) so->so_temp = NULL; @@ -1194,7 +1419,7 @@ getpackets_failed: } while (resid); release: - sbunlock(&so->so_snd); + sbunlock(&so->so_snd, 0); /* will unlock socket */ out: if (top) m_freem(top); @@ -1238,13 +1463,13 @@ soreceive(so, psa, uio, mp0, controlp, flagsp) struct mbuf **controlp; int *flagsp; { - register struct mbuf *m, **mp, *ml; - register int flags, len, error, s, offset; + register struct mbuf *m, **mp, *ml = NULL; + register int flags, len, error, offset; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; - int orig_resid = uio->uio_resid; - struct kextcb *kp; + // LP64todo - fix this! + int orig_resid = uio_resid(uio); volatile struct mbuf *free_list; volatile int delayed_copy_len; int can_delay; @@ -1252,27 +1477,20 @@ soreceive(so, psa, uio, mp0, controlp, flagsp) struct proc *p = current_proc(); + // LP64todo - fix this! KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, - uio->uio_resid, + uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat); - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_soreceive) { - error = (*kp->e_soif->sf_soreceive)(so, psa, &uio, - mp0, controlp, - flagsp, kp); - if (error) { - KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,0,0,0,0); - return((error == EJUSTRETURN) ? 0 : error); - } - } - kp = kp->e_next; - } + socket_lock(so, 1); +#ifdef MORE_LOCKING_DEBUG + if (so->so_usecount == 1) + panic("soreceive: so=%x no other reference on socket\n", so); +#endif mp = mp0; if (psa) *psa = 0; @@ -1293,17 +1511,21 @@ soreceive(so, psa, uio, mp0, controlp, flagsp) (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) { m = m_get(M_WAIT, MT_DATA); if (m == NULL) { + socket_unlock(so, 1); KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, ENOBUFS,0,0,0,0); return (ENOBUFS); } error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); if (error) goto bad; + socket_unlock(so, 0); do { + // LP64todo - fix this! 
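+			// The socket was unlocked above: uiomove() copies to user
+			// space and may fault or block; the lock is retaken below.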
error = uiomove(mtod(m, caddr_t), - (int) min(uio->uio_resid, m->m_len), uio); + (int) min(uio_resid(uio), m->m_len), uio); m = m_free(m); - } while (uio->uio_resid && error == 0 && m); + } while (uio_resid(uio) && error == 0 && m); + socket_lock(so, 0); bad: if (m) m_freem(m); @@ -1319,7 +1541,8 @@ bad: goto nooob; } else if (error == 0 && flagsp) *flagsp |= MSG_OOB; - } + } + socket_unlock(so, 1); KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,0,0,0,0); #endif return (error); @@ -1327,19 +1550,23 @@ bad: nooob: if (mp) *mp = (struct mbuf *)0; - if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) + if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) (*pr->pr_usrreqs->pru_rcvd)(so, 0); free_list = (struct mbuf *)0; delayed_copy_len = 0; restart: +#ifdef MORE_LOCKING_DEBUG + if (so->so_usecount <= 1) + printf("soreceive: sblock so=%x ref=%d on socket\n", so, so->so_usecount); +#endif error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); if (error) { + socket_unlock(so, 1); KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,0,0,0,0); return (error); } - s = splnet(); m = so->so_rcv.sb_mb; /* @@ -1354,9 +1581,9 @@ restart: * a short count if a timeout or signal occurs after we start. */ if (m == 0 || (((flags & MSG_DONTWAIT) == 0 && - so->so_rcv.sb_cc < uio->uio_resid) && + so->so_rcv.sb_cc < uio_resid(uio)) && (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || - ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && + ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) && m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) { KASSERT(m != 0 || !so->so_rcv.sb_cc, ("receive 1")); @@ -1384,21 +1611,27 @@ restart: error = ENOTCONN; goto release; } - if (uio->uio_resid == 0) + if (uio_resid(uio) == 0) goto release; - if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { + if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { error = EWOULDBLOCK; goto release; } - sbunlock(&so->so_rcv); + sbunlock(&so->so_rcv, 1); +#ifdef EVEN_MORE_LOCKING_DEBUG if (socket_debug) printf("Waiting for socket data\n"); +#endif error = sbwait(&so->so_rcv); +#ifdef EVEN_MORE_LOCKING_DEBUG if (socket_debug) printf("SORECEIVE - sbwait returned %d\n", error); - splx(s); +#endif + if (so->so_usecount < 1) + panic("soreceive: after 2nd sblock so=%x ref=%d on socket\n", so, so->so_usecount); if (error) { + socket_unlock(so, 1); KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,0,0,0,0); return (error); } @@ -1434,6 +1667,8 @@ dontblock: m = m->m_next; } else { sbfree(&so->so_rcv, m); + if (m->m_next == 0 && so->so_rcv.sb_cc != 0) + panic("soreceive: about to create invalid socketbuf"); MFREE(m, so->so_rcv.sb_mb); m = so->so_rcv.sb_mb; } @@ -1448,9 +1683,14 @@ dontblock: if (controlp) { if (pr->pr_domain->dom_externalize && mtod(m, struct cmsghdr *)->cmsg_type == - SCM_RIGHTS) + SCM_RIGHTS) { + socket_unlock(so, 0); /* release socket lock: see 3903171 */ error = (*pr->pr_domain->dom_externalize)(m); + socket_lock(so, 0); + } *controlp = m; + if (m->m_next == 0 && so->so_rcv.sb_cc != 0) + panic("soreceive: so->so_rcv.sb_mb->m_next == 0 && so->so_rcv.sb_cc != 0"); so->so_rcv.sb_mb = m->m_next; m->m_next = 0; m = so->so_rcv.sb_mb; @@ -1474,15 +1714,14 @@ dontblock: moff = 0; offset = 0; - if (!(flags & MSG_PEEK) && uio->uio_resid > sorecvmincopy) + if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) can_delay = 1; else can_delay = 0; need_event = 0; - - while (m && (uio->uio_resid - delayed_copy_len) > 0 && error == 0) { + while (m && (uio_resid(uio) - delayed_copy_len) > 
0 && error == 0) { if (m->m_type == MT_OOBDATA) { if (type != MT_OOBDATA) break; @@ -1509,7 +1748,8 @@ dontblock: } #endif so->so_state &= ~SS_RCVATMARK; - len = uio->uio_resid - delayed_copy_len; + // LP64todo - fix this! + len = uio_resid(uio) - delayed_copy_len; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) @@ -1534,13 +1774,11 @@ dontblock: */ delayed_copy_len += len; } else { - splx(s); if (delayed_copy_len) { - error = sodelayed_copy(uio, &free_list, &delayed_copy_len); + error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len); if (error) { - s = splnet(); goto release; } if (m != so->so_rcv.sb_mb) { @@ -1556,14 +1794,15 @@ dontblock: break; } } + socket_unlock(so, 0); error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); + socket_lock(so, 0); - s = splnet(); if (error) goto release; } } else - uio->uio_resid -= len; + uio_setresid(uio, (uio_resid(uio) - len)); if (len == m->m_len - moff) { if (m->m_flags & M_EOR) @@ -1574,6 +1813,7 @@ dontblock: } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); + m->m_nextpkt = NULL; if (mp) { *mp = m; @@ -1581,7 +1821,6 @@ dontblock: so->so_rcv.sb_mb = m = m->m_next; *mp = (struct mbuf *)0; } else { - m->m_nextpkt = 0; if (free_list == NULL) free_list = m; else @@ -1622,7 +1861,7 @@ dontblock: break; } } - if (flags & MSG_EOR) + if (flags & MSG_EOR) break; /* * If the MSG_WAITALL or MSG_WAITSTREAM flag is set (for non-atomic socket), @@ -1631,12 +1870,12 @@ dontblock: * with a short count but without error. * Keep sockbuf locked against other readers. */ - while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == 0 && (uio->uio_resid - delayed_copy_len) > 0 && + while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == 0 && (uio_resid(uio) - delayed_copy_len) > 0 && !sosendallatonce(so) && !nextrecord) { if (so->so_error || so->so_state & SS_CANTRCVMORE) goto release; - if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) + if (pr->pr_flags & PR_WANTRCVD && so->so_pcb && (((struct inpcb *)so->so_pcb)->inp_state != INPCB_STATE_DEAD)) (*pr->pr_usrreqs->pru_rcvd)(so, flags); if (sbwait(&so->so_rcv)) { error = 0; @@ -1657,7 +1896,7 @@ dontblock: */ if (delayed_copy_len > sorecvmincopy && (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) { - error = sodelayed_copy(uio, &free_list, &delayed_copy_len); + error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len); if (error) goto release; @@ -1668,6 +1907,10 @@ dontblock: } } } +#ifdef MORE_LOCKING_DEBUG + if (so->so_usecount <= 1) + panic("soreceive: after big while so=%x ref=%d on socket\n", so, so->so_usecount); +#endif if (m && pr->pr_flags & PR_ATOMIC) { #ifdef __APPLE__ @@ -1693,7 +1936,7 @@ dontblock: flags |= MSG_HAVEMORE; if (delayed_copy_len) { - error = sodelayed_copy(uio, &free_list, &delayed_copy_len); + error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len); if (error) goto release; @@ -1705,28 +1948,31 @@ dontblock: if (need_event) postevent(so, 0, EV_OOB); #endif - if (orig_resid == uio->uio_resid && orig_resid && + if (orig_resid == uio_resid(uio) && orig_resid && (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { - sbunlock(&so->so_rcv); - splx(s); + sbunlock(&so->so_rcv, 1); goto restart; } if (flagsp) *flagsp |= flags; release: +#ifdef MORE_LOCKING_DEBUG + if (so->so_usecount <= 1) + panic("soreceive: release so=%x ref=%d on socket\n", so, so->so_usecount); +#endif if (delayed_copy_len) { - error = sodelayed_copy(uio, &free_list, &delayed_copy_len); + error = sodelayed_copy(so, uio, &free_list, 
&delayed_copy_len); } if (free_list) { m_freem_list((struct mbuf *)free_list); } - sbunlock(&so->so_rcv); - splx(s); + sbunlock(&so->so_rcv, 0); /* will unlock socket */ + // LP64todo - fix this! KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, - uio->uio_resid, + uio_resid(uio), so->so_rcv.sb_cc, 0, error); @@ -1735,19 +1981,15 @@ release: } -int sodelayed_copy(struct uio *uio, struct mbuf **free_list, int *resid) +static int sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list, int *resid) { int error = 0; - boolean_t dropped_funnel = FALSE; struct mbuf *m; m = *free_list; - if (*resid >= sorecvmincopy) { - dropped_funnel = TRUE; + socket_unlock(so, 0); - (void)thread_funnel_set(network_flock, FALSE); - } while (m && error == 0) { error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio); @@ -1759,8 +2001,7 @@ int sodelayed_copy(struct uio *uio, struct mbuf **free_list, int *resid) *free_list = (struct mbuf *)NULL; *resid = 0; - if (dropped_funnel == TRUE) - (void)thread_funnel_set(network_flock, TRUE); + socket_lock(so, 0); return (error); } @@ -1772,22 +2013,11 @@ soshutdown(so, how) register int how; { register struct protosw *pr = so->so_proto; - struct kextcb *kp; int ret; - - KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, 0,0,0,0,0); - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_soshutdown) { - ret = (*kp->e_soif->sf_soshutdown)(so, how, kp); - if (ret) { - KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0,0,0,0,0); - return((ret == EJUSTRETURN) ? 0 : ret); - } - } - kp = kp->e_next; - } + socket_lock(so, 1); + + sflt_notify(so, sock_evt_shutdown, &how); if (how != SHUT_WR) { sorflush(so); @@ -1797,10 +2027,12 @@ soshutdown(so, how) ret = ((*pr->pr_usrreqs->pru_shutdown)(so)); postevent(so, 0, EV_WCLOSED); KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0,0,0,0,0); + socket_unlock(so, 1); return(ret); } KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0,0,0,0,0); + socket_unlock(so, 1); return (0); } @@ -1810,37 +2042,36 @@ sorflush(so) { register struct sockbuf *sb = &so->so_rcv; register struct protosw *pr = so->so_proto; - register int s, error; struct sockbuf asb; - struct kextcb *kp; - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_sorflush) { - if ((*kp->e_soif->sf_sorflush)(so, kp)) - return; - } - kp = kp->e_next; - } +#ifdef MORE_LOCKING_DEBUG + lck_mtx_t * mutex_held; + + if (so->so_proto->pr_getlock != NULL) + mutex_held = (*so->so_proto->pr_getlock)(so, 0); + else + mutex_held = so->so_proto->pr_domain->dom_mtx; + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); +#endif + + sflt_notify(so, sock_evt_flush_read, NULL); sb->sb_flags |= SB_NOINTR; (void) sblock(sb, M_WAIT); - s = splimp(); socantrcvmore(so); - sbunlock(sb); + sbunlock(sb, 1); #ifdef __APPLE__ selthreadclear(&sb->sb_sel); #endif asb = *sb; bzero((caddr_t)sb, sizeof (*sb)); + sb->sb_so = so; /* reestablish link to socket */ if (asb.sb_flags & SB_KNOTE) { sb->sb_sel.si_note = asb.sb_sel.si_note; sb->sb_flags = SB_KNOTE; } - splx(s); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) (*pr->pr_domain->dom_dispose)(asb.sb_mb); - sbrelease(&asb); } @@ -1874,7 +2105,7 @@ sooptcopyin(sopt, buf, len, minlen) if (sopt->sopt_p != 0) return (copyin(sopt->sopt_val, buf, valsize)); - bcopy(sopt->sopt_val, buf, valsize); + bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize); return 0; } @@ -1887,36 +2118,60 @@ sosetopt(so, sopt) struct linger l; struct timeval tv; short val; - struct kextcb *kp; + + socket_lock(so, 1); if (sopt->sopt_dir 
!= SOPT_SET) { sopt->sopt_dir = SOPT_SET; } - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_socontrol) { - error = (*kp->e_soif->sf_socontrol)(so, sopt, kp); - if (error) - return((error == EJUSTRETURN) ? 0 : error); + { + struct socket_filter_entry *filter; + int filtered = 0; + error = 0; + for (filter = so->so_filt; filter && (error == 0); + filter = filter->sfe_next_onsocket) { + if (filter->sfe_filter->sf_filter.sf_setoption) { + if (filtered == 0) { + filtered = 1; + sflt_use(so); + socket_unlock(so, 0); + } + error = filter->sfe_filter->sf_filter.sf_setoption( + filter->sfe_cookie, so, sopt); + } + } + + if (filtered != 0) { + socket_lock(so, 0); + sflt_unuse(so); + + if (error) { + if (error == EJUSTRETURN) + error = 0; + goto bad; + } } - kp = kp->e_next; } error = 0; if (sopt->sopt_level != SOL_SOCKET) { - if (so->so_proto && so->so_proto->pr_ctloutput) - return ((*so->so_proto->pr_ctloutput) - (so, sopt)); + if (so->so_proto && so->so_proto->pr_ctloutput) { + error = (*so->so_proto->pr_ctloutput) + (so, sopt); + socket_unlock(so, 1); + return (error); + } error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { case SO_LINGER: + case SO_LINGER_SEC: error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) goto bad; - so->so_linger = l.l_linger; + so->so_linger = (sopt->sopt_name == SO_LINGER) ? l.l_linger : l.l_linger * hz; if (l.l_onoff) so->so_options |= SO_LINGER; else @@ -2000,29 +2255,18 @@ sosetopt(so, sopt) if (error) goto bad; - /* assert(hz > 0); */ - if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz || + if (tv.tv_sec < 0 || tv.tv_sec > LONG_MAX || tv.tv_usec < 0 || tv.tv_usec >= 1000000) { error = EDOM; goto bad; } - /* assert(tick > 0); */ - /* assert(ULONG_MAX - SHRT_MAX >= 1000000); */ - { - long tmp = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick; - if (tmp > SHRT_MAX) { - error = EDOM; - goto bad; - } - val = tmp; - } - + switch (sopt->sopt_name) { case SO_SNDTIMEO: - so->so_snd.sb_timeo = val; + so->so_snd.sb_timeo = tv; break; case SO_RCVTIMEO: - so->so_rcv.sb_timeo = val; + so->so_rcv.sb_timeo = tv; break; } break; @@ -2030,14 +2274,13 @@ sosetopt(so, sopt) case SO_NKE: { struct so_nke nke; - struct NFDescriptor *nf1, *nf2 = NULL; error = sooptcopyin(sopt, &nke, sizeof nke, sizeof nke); if (error) goto bad; - error = nke_insert(so, &nke); + error = sflt_attach_private(so, NULL, nke.nke_handle, 1); break; } @@ -2075,6 +2318,7 @@ sosetopt(so, sopt) } } bad: + socket_unlock(so, 1); return (error); } @@ -2101,11 +2345,11 @@ sooptcopyout(sopt, buf, len) */ valsize = min(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; - if (sopt->sopt_val != 0) { + if (sopt->sopt_val != USER_ADDR_NULL) { if (sopt->sopt_p != 0) error = copyout(buf, sopt->sopt_val, valsize); else - bcopy(buf, sopt->sopt_val, valsize); + bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize); } return error; } @@ -2118,35 +2362,60 @@ sogetopt(so, sopt) int error, optval; struct linger l; struct timeval tv; - struct mbuf *m; - struct kextcb *kp; if (sopt->sopt_dir != SOPT_GET) { sopt->sopt_dir = SOPT_GET; } - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_socontrol) { - error = (*kp->e_soif->sf_socontrol)(so, sopt, kp); - if (error) - return((error == EJUSTRETURN) ? 
0 : error); + socket_lock(so, 1); + + { + struct socket_filter_entry *filter; + int filtered = 0; + error = 0; + for (filter = so->so_filt; filter && (error == 0); + filter = filter->sfe_next_onsocket) { + if (filter->sfe_filter->sf_filter.sf_getoption) { + if (filtered == 0) { + filtered = 1; + sflt_use(so); + socket_unlock(so, 0); + } + error = filter->sfe_filter->sf_filter.sf_getoption( + filter->sfe_cookie, so, sopt); + } + } + if (filtered != 0) { + socket_lock(so, 0); + sflt_unuse(so); + + if (error) { + if (error == EJUSTRETURN) + error = 0; + socket_unlock(so, 1); + return error; + } } - kp = kp->e_next; } error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto && so->so_proto->pr_ctloutput) { - return ((*so->so_proto->pr_ctloutput) - (so, sopt)); - } else + error = (*so->so_proto->pr_ctloutput) + (so, sopt); + socket_unlock(so, 1); + return (error); + } else { + socket_unlock(so, 1); return (ENOPROTOOPT); + } } else { switch (sopt->sopt_name) { case SO_LINGER: + case SO_LINGER_SEC: l.l_onoff = so->so_options & SO_LINGER; - l.l_linger = so->so_linger; + l.l_linger = (sopt->sopt_name == SO_LINGER) ? so->so_linger : + so->so_linger / hz; error = sooptcopyout(sopt, &l, sizeof l); break; @@ -2183,25 +2452,19 @@ integer: m1 = so->so_rcv.sb_mb; if (so->so_proto->pr_flags & PR_ATOMIC) { -#if 0 - kprintf("SKT CC: %d\n", so->so_rcv.sb_cc); -#endif while (m1) { if (m1->m_type == MT_DATA) pkt_total += m1->m_len; -#if 0 - kprintf("CNT: %d/%d\n", m1->m_len, pkt_total); -#endif m1 = m1->m_next; } optval = pkt_total; } else optval = so->so_rcv.sb_cc; -#if 0 - kprintf("RTN: %d\n", optval); -#endif goto integer; } + case SO_NWRITE: + optval = so->so_snd.sb_cc; + goto integer; #endif case SO_ERROR: optval = so->so_error; @@ -2226,90 +2489,29 @@ integer: case SO_SNDTIMEO: case SO_RCVTIMEO: - optval = (sopt->sopt_name == SO_SNDTIMEO ? + tv = (sopt->sopt_name == SO_SNDTIMEO ? so->so_snd.sb_timeo : so->so_rcv.sb_timeo); - tv.tv_sec = optval / hz; - tv.tv_usec = (optval % hz) * tick; error = sooptcopyout(sopt, &tv, sizeof tv); break; - case SO_NOSIGPIPE: - optval = (so->so_flags & SOF_NOSIGPIPE); - goto integer; + case SO_NOSIGPIPE: + optval = (so->so_flags & SOF_NOSIGPIPE); + goto integer; case SO_NOADDRERR: - optval = (so->so_flags & SOF_NOADDRAVAIL); - goto integer; + optval = (so->so_flags & SOF_NOADDRAVAIL); + goto integer; default: error = ENOPROTOOPT; break; } + socket_unlock(so, 1); return (error); } } -#ifdef __APPLE__ -/* - * Network filter support - */ -/* Run the list of filters, creating extension control blocks */ -sfilter_init(register struct socket *so) -{ struct kextcb *kp, **kpp; - struct protosw *prp; - struct NFDescriptor *nfp; - - prp = so->so_proto; - nfp = prp->pr_sfilter.tqh_first; /* non-null */ - kpp = &so->so_ext; - kp = NULL; - while (nfp) - { MALLOC(kp, struct kextcb *, sizeof(*kp), - M_TEMP, M_WAITOK); - if (kp == NULL) - return(ENOBUFS); /* so_free will clean up */ - *kpp = kp; - kpp = &kp->e_next; - kp->e_next = NULL; - kp->e_fcb = NULL; - kp->e_nfd = nfp; - kp->e_soif = nfp->nf_soif; - kp->e_sout = nfp->nf_soutil; - /* - * Ignore return value for create - * Everyone gets a chance at startup - */ - if (kp->e_soif && kp->e_soif->sf_socreate) - (*kp->e_soif->sf_socreate)(so, prp, kp); - nfp = nfp->nf_next.tqe_next; - } - return(0); -} - -/* - * Run the list of filters, freeing extension control blocks - * Assumes the soif/soutil blocks have been handled. 
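- *
- * (With the kextcb NKE framework removed, this role passes to
- *  sflt_initsock() and sflt_termsock().)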
- */ -sfilter_term(struct socket *so) -{ struct kextcb *kp, *kp1; - - kp = so->so_ext; - while (kp) - { kp1 = kp->e_next; - /* - * Ignore return code on termination; everyone must - * get terminated. - */ - if (kp->e_soif && kp->e_soif->sf_sofree) - kp->e_soif->sf_sofree(so, kp); - FREE(kp, M_TEMP); - kp = kp1; - } - return(0); -} -#endif __APPLE__ - /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */ int soopt_getm(struct sockopt *sopt, struct mbuf **mp) @@ -2366,22 +2568,21 @@ soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; - if (sopt->sopt_val == NULL) + if (sopt->sopt_val == USER_ADDR_NULL) return 0; while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_p != NULL) { int error; - error = copyin(sopt->sopt_val, mtod(m, char *), - m->m_len); + error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len); if (error != 0) { m_freem(m0); return(error); } } else - bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); + bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), mtod(m, char *), m->m_len); sopt->sopt_valsize -= m->m_len; - (caddr_t)sopt->sopt_val += m->m_len; + sopt->sopt_val += m->m_len; m = m->m_next; } if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ @@ -2396,22 +2597,21 @@ soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) struct mbuf *m0 = m; size_t valsize = 0; - if (sopt->sopt_val == NULL) + if (sopt->sopt_val == USER_ADDR_NULL) return 0; while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_p != NULL) { int error; - error = copyout(mtod(m, char *), sopt->sopt_val, - m->m_len); + error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len); if (error != 0) { m_freem(m0); return(error); } } else - bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); + bcopy(mtod(m, char *), CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len); sopt->sopt_valsize -= m->m_len; - (caddr_t)sopt->sopt_val += m->m_len; + sopt->sopt_val += m->m_len; valsize += m->m_len; m = m->m_next; } @@ -2429,16 +2629,7 @@ sohasoutofband(so) register struct socket *so; { struct proc *p; - struct kextcb *kp; - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_sohasoutofband) { - if ((*kp->e_soif->sf_sohasoutofband)(so, kp)) - return; - } - kp = kp->e_next; - } if (so->so_pgid < 0) gsignal(-so->so_pgid, SIGURG); else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0) @@ -2447,11 +2638,12 @@ sohasoutofband(so) } int -sopoll(struct socket *so, int events, struct ucred *cred, void * wql) +sopoll(struct socket *so, int events, __unused kauth_cred_t cred, void * wql) { struct proc *p = current_proc(); int revents = 0; - int s = splnet(); + + socket_lock(so, 1); if (events & (POLLIN | POLLRDNORM)) if (soreadable(so)) @@ -2479,17 +2671,18 @@ sopoll(struct socket *so, int events, struct ucred *cred, void * wql) } } - splx(s); + socket_unlock(so, 1); return (revents); } +int soo_kqfilter(struct fileproc *fp, struct knote *kn, struct proc *p); int -soo_kqfilter(struct file *fp, struct knote *kn, struct proc *p) +soo_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused struct proc *p) { - struct socket *so = (struct socket *)kn->kn_fp->f_data; + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; struct sockbuf *sb; - int s; + socket_lock(so, 1); switch (kn->kn_filter) { case EVFILT_READ: @@ -2504,81 +2697,127 @@ soo_kqfilter(struct file *fp, struct knote *kn, struct proc *p) sb = &so->so_snd; break; default: + socket_unlock(so, 1); return (1); } - if (sb->sb_sel.si_flags & SI_INITED) - return (1); - - s = splnet(); if 
(KNOTE_ATTACH(&sb->sb_sel.si_note, kn)) sb->sb_flags |= SB_KNOTE; - splx(s); + socket_unlock(so, 1); return (0); } static void filt_sordetach(struct knote *kn) { - struct socket *so = (struct socket *)kn->kn_fp->f_data; - int s = splnet(); + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; - if (so->so_rcv.sb_flags & SB_KNOTE && - !(so->so_rcv.sb_sel.si_flags & SI_INITED)) + socket_lock(so, 1); + if (so->so_rcv.sb_flags & SB_KNOTE) if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) so->so_rcv.sb_flags &= ~SB_KNOTE; - splx(s); + socket_unlock(so, 1); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { - struct socket *so = (struct socket *)kn->kn_fp->f_data; + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; - kn->kn_data = so->so_rcv.sb_cc; - if (so->so_state & SS_CANTRCVMORE) { - kn->kn_flags |= EV_EOF; - kn->kn_fflags = so->so_error; - return (1); + if ((hint & SO_FILT_HINT_LOCKED) == 0) + socket_lock(so, 1); + + if (so->so_oobmark) { + if (kn->kn_flags & EV_OOBAND) { + kn->kn_data = so->so_rcv.sb_cc - so->so_oobmark; + if ((hint & SO_FILT_HINT_LOCKED) == 0) + socket_unlock(so, 1); + return (1); + } + kn->kn_data = so->so_oobmark; + kn->kn_flags |= EV_OOBAND; + } else { + kn->kn_data = so->so_rcv.sb_cc; + if (so->so_state & SS_CANTRCVMORE) { + kn->kn_flags |= EV_EOF; + kn->kn_fflags = so->so_error; + if ((hint & SO_FILT_HINT_LOCKED) == 0) + socket_unlock(so, 1); + return (1); + } } - if (so->so_error) /* temporary udp error */ + + if (so->so_state & SS_RCVATMARK) { + if (kn->kn_flags & EV_OOBAND) { + if ((hint & SO_FILT_HINT_LOCKED) == 0) + socket_unlock(so, 1); + return (1); + } + kn->kn_flags |= EV_OOBAND; + } else if (kn->kn_flags & EV_OOBAND) { + kn->kn_data = 0; + if ((hint & SO_FILT_HINT_LOCKED) == 0) + socket_unlock(so, 1); + return (0); + } + + if (so->so_error) { /* temporary udp error */ + if ((hint & SO_FILT_HINT_LOCKED) == 0) + socket_unlock(so, 1); return (1); - if (kn->kn_sfflags & NOTE_LOWAT) - return (kn->kn_data >= kn->kn_sdata); - return (kn->kn_data >= so->so_rcv.sb_lowat); + } + + if ((hint & SO_FILT_HINT_LOCKED) == 0) + socket_unlock(so, 1); + + return( kn->kn_flags & EV_OOBAND || + kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ? 
+ kn->kn_sdata : so->so_rcv.sb_lowat)); } static void filt_sowdetach(struct knote *kn) { - struct socket *so = (struct socket *)kn->kn_fp->f_data; - int s = splnet(); + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + socket_lock(so, 1); - if(so->so_snd.sb_flags & SB_KNOTE && - !(so->so_snd.sb_sel.si_flags & SI_INITED)) + if(so->so_snd.sb_flags & SB_KNOTE) if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) so->so_snd.sb_flags &= ~SB_KNOTE; - splx(s); + socket_unlock(so, 1); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { - struct socket *so = (struct socket *)kn->kn_fp->f_data; + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + + if ((hint & SO_FILT_HINT_LOCKED) == 0) + socket_lock(so, 1); kn->kn_data = sbspace(&so->so_snd); if (so->so_state & SS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; + if ((hint & SO_FILT_HINT_LOCKED) == 0) + socket_unlock(so, 1); return (1); } - if (so->so_error) /* temporary udp error */ + if (so->so_error) { /* temporary udp error */ + if ((hint & SO_FILT_HINT_LOCKED) == 0) + socket_unlock(so, 1); return (1); + } if (((so->so_state & SS_ISCONNECTED) == 0) && - (so->so_proto->pr_flags & PR_CONNREQUIRED)) + (so->so_proto->pr_flags & PR_CONNREQUIRED)) { + if ((hint & SO_FILT_HINT_LOCKED) == 0) + socket_unlock(so, 1); return (0); + } + if ((hint & SO_FILT_HINT_LOCKED) == 0) + socket_unlock(so, 1); if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); return (kn->kn_data >= so->so_snd.sb_lowat); @@ -2588,9 +2827,123 @@ filt_sowrite(struct knote *kn, long hint) static int filt_solisten(struct knote *kn, long hint) { - struct socket *so = (struct socket *)kn->kn_fp->f_data; + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + int isempty; + if ((hint & SO_FILT_HINT_LOCKED) == 0) + socket_lock(so, 1); kn->kn_data = so->so_qlen; - return (! TAILQ_EMPTY(&so->so_comp)); + isempty = ! 
TAILQ_EMPTY(&so->so_comp); + if ((hint & SO_FILT_HINT_LOCKED) == 0) + socket_unlock(so, 1); + return (isempty); } + +int +socket_lock(so, refcount) + struct socket *so; + int refcount; +{ + int error = 0, lr, lr_saved; +#ifdef __ppc__ + __asm__ volatile("mflr %0" : "=r" (lr)); + lr_saved = lr; +#endif + + if (so->so_proto->pr_lock) { + error = (*so->so_proto->pr_lock)(so, refcount, lr_saved); + } + else { +#ifdef MORE_LOCKING_DEBUG + lck_mtx_assert(so->so_proto->pr_domain->dom_mtx, LCK_MTX_ASSERT_NOTOWNED); +#endif + lck_mtx_lock(so->so_proto->pr_domain->dom_mtx); + if (refcount) + so->so_usecount++; + so->reserved3 = (void*)lr_saved; /* save caller for refcount going to zero */ + } + + return(error); + +} + +int +socket_unlock(so, refcount) + struct socket *so; + int refcount; +{ + int error = 0, lr, lr_saved; + lck_mtx_t * mutex_held; + +#ifdef __ppc__ +__asm__ volatile("mflr %0" : "=r" (lr)); + lr_saved = lr; +#endif + + + + if (so->so_proto == NULL) + panic("socket_unlock null so_proto so=%x\n", so); + + if (so && so->so_proto->pr_unlock) + error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved); + else { + mutex_held = so->so_proto->pr_domain->dom_mtx; +#ifdef MORE_LOCKING_DEBUG + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); +#endif + if (refcount) { + if (so->so_usecount <= 0) + panic("socket_unlock: bad refcount so=%x value=%d\n", so, so->so_usecount); + so->so_usecount--; + if (so->so_usecount == 0) { + sofreelastref(so, 1); + } + else + so->reserved4 = (void*)lr_saved; /* save caller */ + } + lck_mtx_unlock(mutex_held); + } + + return(error); +} +//### Called with socket locked, will unlock socket +void +sofree(so) + struct socket *so; +{ + + int lr, lr_saved; + lck_mtx_t * mutex_held; +#ifdef __ppc__ + __asm__ volatile("mflr %0" : "=r" (lr)); + lr_saved = lr; +#endif + if (so->so_proto->pr_getlock != NULL) + mutex_held = (*so->so_proto->pr_getlock)(so, 0); + else + mutex_held = so->so_proto->pr_domain->dom_mtx; + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); + + /* Remove the filters */ + sflt_termsock(so); + + sofreelastref(so, 0); +} + +void +soreference(so) + struct socket *so; +{ + socket_lock(so, 1); /* locks & take one reference on socket */ + socket_unlock(so, 0); /* unlock only */ +} + +void +sodereference(so) + struct socket *so; +{ + socket_lock(so, 0); + socket_unlock(so, 1); +} diff --git a/bsd/kern/uipc_socket2.c b/bsd/kern/uipc_socket2.c index 175824a8a..2fb59d20f 100644 --- a/bsd/kern/uipc_socket2.c +++ b/bsd/kern/uipc_socket2.c @@ -61,7 +61,8 @@ #include <sys/systm.h> #include <sys/domain.h> #include <sys/kernel.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/protosw.h> @@ -71,7 +72,10 @@ #include <sys/signalvar.h> #include <sys/sysctl.h> #include <sys/ev.h> - +#include <kern/locks.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_pcb.h> #include <sys/kdebug.h> #define DBG_FNC_SBDROP NETDBG_CODE(DBG_NETSOCK, 4) @@ -115,7 +119,6 @@ static u_long sb_efficiency = 8; /* parameter for sbreserve() */ * the kernel, the wakeups done here will sometimes * cause software-interrupt process scheduling. 
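 *
 * (The spl calls and funnel are gone in this revision; these routines
 * assume the caller holds the socket's mutex, as the lck_mtx_assert
 * checks below verify.  A minimal sketch of a hypothetical caller:
 *
 *	socket_lock(so, 1);	// take the mutex and one reference
 *	soisconnected(so);	// notifies filters, wakes any waiters
 *	socket_unlock(so, 1);	// drop the reference, release the mutex
 * )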
*/ - void soisconnecting(so) register struct socket *so; @@ -123,6 +126,8 @@ soisconnecting(so) so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; + + sflt_notify(so, sock_evt_connecting, NULL); } void @@ -130,30 +135,27 @@ soisconnected(so) struct socket *so; { struct socket *head = so->so_head; - struct kextcb *kp; - - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_soisconnected) { - if ((*kp->e_soif->sf_soisconnected)(so, kp)) - return; - } - kp = kp->e_next; - } so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; + + sflt_notify(so, sock_evt_connected, NULL); + if (head && (so->so_state & SS_INCOMP)) { - postevent(head,0,EV_RCONN); + if (head->so_proto->pr_getlock != NULL) + socket_lock(head, 1); + postevent(head, 0, EV_RCONN); TAILQ_REMOVE(&head->so_incomp, so, so_list); head->so_incqlen--; so->so_state &= ~SS_INCOMP; TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); so->so_state |= SS_COMP; sorwakeup(head); - wakeup_one(&head->so_timeo); + wakeup_one((caddr_t)&head->so_timeo); + if (head->so_proto->pr_getlock != NULL) + socket_unlock(head, 1); } else { - postevent(so,0,EV_WCONN); + postevent(so, 0, EV_WCONN); wakeup((caddr_t)&so->so_timeo); sorwakeup(so); sowwakeup(so); @@ -164,19 +166,9 @@ void soisdisconnecting(so) register struct socket *so; { - register struct kextcb *kp; - - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_soisdisconnecting) { - if ((*kp->e_soif->sf_soisdisconnecting)(so, kp)) - return; - } - kp = kp->e_next; - } - so->so_state &= ~SS_ISCONNECTING; so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); + sflt_notify(so, sock_evt_disconnecting, NULL); wakeup((caddr_t)&so->so_timeo); sowwakeup(so); sorwakeup(so); @@ -186,19 +178,9 @@ void soisdisconnected(so) register struct socket *so; { - register struct kextcb *kp; - - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_soisdisconnected) { - if ((*kp->e_soif->sf_soisdisconnected)(so, kp)) - return; - } - kp = kp->e_next; - } - so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); + sflt_notify(so, sock_evt_disconnected, NULL); wakeup((caddr_t)&so->so_timeo); sowwakeup(so); sorwakeup(so); @@ -218,7 +200,7 @@ struct socket * sodropablereq(head) register struct socket *head; { - register struct socket *so; + struct socket *so, *sonext = NULL; unsigned int i, j, qlen; static int rnd; static struct timeval old_runtime; @@ -234,18 +216,27 @@ sodropablereq(head) so = TAILQ_FIRST(&head->so_incomp); if (!so) - return (so); + return (NULL); qlen = head->so_incqlen; if (++cur_cnt > qlen || old_cnt > qlen) { rnd = (314159 * rnd + 66329) & 0xffff; j = ((qlen + 1) * rnd) >> 16; - - while (j-- && so) - so = TAILQ_NEXT(so, so_list); +//###LD To clean up + while (j-- && so) { +// if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) != WNT_STOPUSING) { + socket_lock(so, 1); + sonext = TAILQ_NEXT(so, so_list); +// in_pcb_check_state(so->so_pcb, WNT_RELEASE, 0); + socket_unlock(so, 1); + so = sonext; + } } - return (so); +// if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) == WNT_STOPUSING) +// return (NULL); +// else + return (so); } /* @@ -256,14 +247,20 @@ sodropablereq(head) * data structure of the original socket, and return this. * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED. 
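 *
 * (Now split: sonewconn_internal() does the work, and the exported
 * sonewconn() wrapper below runs each sf_connect_in socket filter
 * first, refusing the connection if any filter returns an error.)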
*/ -struct socket * -sonewconn(head, connstatus) +static struct socket * +sonewconn_internal(head, connstatus) register struct socket *head; int connstatus; { int error = 0; register struct socket *so; - register struct kextcb *kp; + lck_mtx_t *mutex_held; + + if (head->so_proto->pr_getlock != NULL) + mutex_held = (*head->so_proto->pr_getlock)(head, 0); + else + mutex_held = head->so_proto->pr_domain->dom_mtx; + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); if (head->so_qlen > 3 * head->so_qlimit / 2) return ((struct socket *)0); @@ -285,36 +282,25 @@ sonewconn(head, connstatus) so->so_timeo = head->so_timeo; so->so_pgid = head->so_pgid; so->so_uid = head->so_uid; + so->so_usecount = 1; - /* Attach socket filters for this protocol */ - if (so->so_proto->pr_sfilter.tqh_first) - error = sfilter_init(so); - if (error != 0) { + if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { + sflt_termsock(so); sodealloc(so); return ((struct socket *)0); } - /* Call socket filters' sonewconn1 function if set */ - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_sonewconn) { - error = (int)(*kp->e_soif->sf_sonewconn)(so, connstatus, kp); - if (error == EJUSTRETURN) { - return so; - } else if (error != 0) { - sodealloc(so); - return NULL; - } - } - kp = kp->e_next; - } - - if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) || - (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { - sfilter_term(so); + /* + * Must be done with head unlocked to avoid deadlock with pcb list + */ + socket_unlock(head, 0); + if (((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL) != 0) || error) { + sflt_termsock(so); sodealloc(so); + socket_lock(head, 0); return ((struct socket *)0); } + socket_lock(head, 0); #ifdef __APPLE__ so->so_proto->pr_domain->dom_refs++; #endif @@ -328,18 +314,57 @@ sonewconn(head, connstatus) head->so_incqlen++; } head->so_qlen++; - if (connstatus) { - sorwakeup(head); - wakeup((caddr_t)&head->so_timeo); - so->so_state |= connstatus; - } #ifdef __APPLE__ so->so_rcv.sb_so = so->so_snd.sb_so = so; TAILQ_INIT(&so->so_evlist); + + /* Attach socket filters for this protocol */ + sflt_initsock(so); #endif + if (connstatus) { + so->so_state |= connstatus; + sorwakeup(head); + wakeup((caddr_t)&head->so_timeo); + } return (so); } + +struct socket * +sonewconn( + struct socket *head, + int connstatus, + const struct sockaddr *from) +{ + int error = 0; + struct socket_filter_entry *filter; + int filtered = 0; + + error = 0; + for (filter = head->so_filt; filter && (error == 0); + filter = filter->sfe_next_onsocket) { + if (filter->sfe_filter->sf_filter.sf_connect_in) { + if (filtered == 0) { + filtered = 1; + sflt_use(head); + socket_unlock(head, 0); + } + error = filter->sfe_filter->sf_filter.sf_connect_in( + filter->sfe_cookie, head, from); + } + } + if (filtered != 0) { + socket_lock(head, 0); + sflt_unuse(head); + } + + if (error) { + return NULL; + } + + return sonewconn_internal(head, connstatus); +} + /* * Socantsendmore indicates that no more data will be sent on the * socket; it would normally be applied to a socket when the user @@ -354,19 +379,8 @@ void socantsendmore(so) struct socket *so; { - register struct kextcb *kp; - - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_socantsendmore) { - if ((*kp->e_soif->sf_socantsendmore)(so, kp)) - return; - } - kp = kp->e_next; - } - - so->so_state |= SS_CANTSENDMORE; + sflt_notify(so, sock_evt_cantsendmore, NULL); sowwakeup(so); } @@ -374,19 +388,8 @@ void socantrcvmore(so) 
struct socket *so; { - register struct kextcb *kp; - - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_socantrcvmore) { - if ((*kp->e_soif->sf_socantrcvmore)(so, kp)) - return; - } - kp = kp->e_next; - } - - so->so_state |= SS_CANTRCVMORE; + sflt_notify(so, sock_evt_cantrecvmore, NULL); sorwakeup(so); } @@ -397,11 +400,42 @@ int sbwait(sb) struct sockbuf *sb; { + int error = 0, lr, lr_saved; + struct socket *so = sb->sb_so; + lck_mtx_t *mutex_held; + struct timespec ts; + +#ifdef __ppc__ + __asm__ volatile("mflr %0" : "=r" (lr)); + lr_saved = lr; +#endif + + + if (so->so_proto->pr_getlock != NULL) + mutex_held = (*so->so_proto->pr_getlock)(so, 0); + else + mutex_held = so->so_proto->pr_domain->dom_mtx; sb->sb_flags |= SB_WAIT; - return (tsleep((caddr_t)&sb->sb_cc, - (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", - sb->sb_timeo)); + + if (so->so_usecount < 1) + panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount); + ts.tv_sec = sb->sb_timeo.tv_sec; + ts.tv_nsec = sb->sb_timeo.tv_usec * 1000; + error = msleep((caddr_t)&sb->sb_cc, mutex_held, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", + &ts); + + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); + + if (so->so_usecount < 1) + panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount); + + if ((so->so_state & SS_DRAINING)) { + error = EBADF; + } + + return (error); } /* @@ -412,14 +446,31 @@ int sb_lock(sb) register struct sockbuf *sb; { - int error; + struct socket *so = sb->sb_so; + lck_mtx_t * mutex_held; + int error = 0, lr, lr_saved; + +#ifdef __ppc__ + __asm__ volatile("mflr %0" : "=r" (lr)); + lr_saved = lr; +#endif + + if (so == NULL) + panic("sb_lock: null so back pointer sb=%x\n", sb); while (sb->sb_flags & SB_LOCK) { sb->sb_flags |= SB_WANT; - error = tsleep((caddr_t)&sb->sb_flags, - (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, - "sblock", 0); - if (error) + if (so->so_proto->pr_getlock != NULL) + mutex_held = (*so->so_proto->pr_getlock)(so, 0); + else + mutex_held = so->so_proto->pr_domain->dom_mtx; + if (so->so_usecount < 1) + panic("sb_lock: so=%x refcount=%d\n", so, so->so_usecount); + error = msleep((caddr_t)&sb->sb_flags, mutex_held, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sblock", 0); + if (so->so_usecount < 1) + panic("sb_lock: 2 so=%x refcount=%d\n", so, so->so_usecount); + if (error) return (error); } sb->sb_flags |= SB_LOCK; @@ -437,8 +488,6 @@ sowakeup(so, sb) register struct sockbuf *sb; { struct proc *p = current_proc(); - /* We clear the flag before calling selwakeup. 
*/ - /* BSD calls selwakeup then sets the flag */ sb->sb_flags &= ~SB_SEL; selwakeup(&sb->sb_sel); if (sb->sb_flags & SB_WAIT) { @@ -451,11 +500,14 @@ sowakeup(so, sb) else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0) psignal(p, SIGIO); } - if (sb->sb_flags & SB_UPCALL) + if (sb->sb_flags & SB_KNOTE) { + KNOTE(&sb->sb_sel.si_note, SO_FILT_HINT_LOCKED); + } + if (sb->sb_flags & SB_UPCALL) { + socket_unlock(so, 0); (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT); - if (sb->sb_flags & SB_KNOTE && - !(sb->sb_sel.si_flags & SI_INITED)) - KNOTE(&sb->sb_sel.si_note, 0); + socket_lock(so, 0); + } } /* @@ -495,16 +547,6 @@ soreserve(so, sndcc, rcvcc) register struct socket *so; u_long sndcc, rcvcc; { - register struct kextcb *kp; - - kp = sotokextcb(so); - while (kp) { - if (kp->e_soif && kp->e_soif->sf_soreserve) { - if ((*kp->e_soif->sf_soreserve)(so, sndcc, rcvcc, kp)) - return; - } - kp = kp->e_next; - } if (sbreserve(&so->so_snd, sndcc) == 0) goto bad; @@ -591,44 +633,55 @@ sbrelease(sb) * the mbuf chain is recorded in sb. Empty mbufs are * discarded and mbufs are compacted where possible. */ -void +int sbappend(sb, m) struct sockbuf *sb; struct mbuf *m; { - struct kextcb *kp; - register struct mbuf *n; + register struct mbuf *n, *sb_first; + int result = 0; + int error = 0; KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_START), sb, m->m_len, 0, 0, 0); if (m == 0) - return; - kp = sotokextcb(sbtoso(sb)); - while (kp) { - if (kp->e_sout && kp->e_sout->su_sbappend) { - if ((*kp->e_sout->su_sbappend)(sb, m, kp)) { - KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, kp, 0, 0); - return; - } - } - kp = kp->e_next; - } - n = sb->sb_mb; + return 0; + sb_first = n = sb->sb_mb; if (n) { while (n->m_nextpkt) n = n->m_nextpkt; do { if (n->m_flags & M_EOR) { - sbappendrecord(sb, m); /* XXXXXX!!!! */ + result = sbappendrecord(sb, m); /* XXXXXX!!!! 
*/ KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0); - return; + return result; } } while (n->m_next && (n = n->m_next)); } - sbcompress(sb, m, n); + + if ((sb->sb_flags & SB_RECV) != 0) { + error = sflt_data_in(sb->sb_so, NULL, &m, NULL, 0); + if (error) { + /* no data was appended, caller should not call sowakeup */ + return 0; + } + } + + /* 3962537 - sflt_data_in may drop the lock, need to validate state again */ + if (sb_first != sb->sb_mb) { + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + } + } + + result = sbcompress(sb, m, n); KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0); + + return result; } #ifdef SOCKBUF_DEBUG @@ -639,6 +692,17 @@ sbcheck(sb) register struct mbuf *m; register struct mbuf *n = 0; register u_long len = 0, mbcnt = 0; + lck_mtx_t *mutex_held; + + if (sb->sb_so->so_proto->pr_getlock != NULL) + mutex_held = (*sb->sb_so->so_proto->pr_getlock)(sb->sb_so, 0); + else + mutex_held = sb->sb_so->so_proto->pr_domain->dom_mtx; + + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); + + if (sbchecking == 0) + return; for (m = sb->sb_mb; m; m = n) { n = m->m_nextpkt; @@ -649,18 +713,10 @@ sbcheck(sb) mbcnt += m->m_ext.ext_size; } } -#ifndef __APPLE__ if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { - printf("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc, + panic("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc, mbcnt, sb->sb_mbcnt); - panic("sbcheck"); } -#else - if (len != sb->sb_cc) - printf("sbcheck len %ld != sb_cc %ld\n", len, sb->sb_cc); - if (mbcnt != sb->sb_mbcnt) - printf("sbcheck mbcnt %ld != sb_mbcnt %ld\n", mbcnt, sb->sb_mbcnt); -#endif } #endif @@ -668,24 +724,24 @@ sbcheck(sb) * As above, except the mbuf chain * begins a new record. */ -void +int sbappendrecord(sb, m0) register struct sockbuf *sb; register struct mbuf *m0; { register struct mbuf *m; - register struct kextcb *kp; + int result = 0; if (m0 == 0) - return; - - kp = sotokextcb(sbtoso(sb)); - while (kp) - { if (kp->e_sout && kp->e_sout->su_sbappendrecord) - { if ((*kp->e_sout->su_sbappendrecord)(sb, m0, kp)) - return; + return 0; + + if ((sb->sb_flags & SB_RECV) != 0) { + int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL, sock_data_filt_flag_record); + if (error != 0) { + if (error != EJUSTRETURN) + m_freem(m0); + return 0; } - kp = kp->e_next; } m = sb->sb_mb; @@ -707,7 +763,7 @@ sbappendrecord(sb, m0) m0->m_flags &= ~M_EOR; m->m_flags |= M_EOR; } - sbcompress(sb, m, m0); + return sbcompress(sb, m, m0); } /* @@ -715,25 +771,27 @@ sbappendrecord(sb, m0) * is inserted at the beginning of the sockbuf, * but after any other OOB data. 
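 *
 * (Like sbappend() and sbappendrecord(), this now returns nonzero on
 * success and 0 when a receive filter consumed the chain or nothing
 * could be queued, so a hypothetical caller would look like:
 *
 *	if (sbinsertoob(&so->so_rcv, m0) == 0)
 *		return;		// nothing appended: skip sorwakeup()
 *	sorwakeup(so);
 * )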
*/ -void +int sbinsertoob(sb, m0) - register struct sockbuf *sb; - register struct mbuf *m0; + struct sockbuf *sb; + struct mbuf *m0; { - register struct mbuf *m; - register struct mbuf **mp; - register struct kextcb *kp; + struct mbuf *m; + struct mbuf **mp; if (m0 == 0) - return; - - kp = sotokextcb(sbtoso(sb)); - while (kp) - { if (kp->e_sout && kp->e_sout->su_sbinsertoob) - { if ((*kp->e_sout->su_sbinsertoob)(sb, m0, kp)) - return; + return 0; + + if ((sb->sb_flags & SB_RECV) != 0) { + int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL, + sock_data_filt_flag_oob); + + if (error) { + if (error != EJUSTRETURN) { + m_freem(m0); + } + return 0; } - kp = kp->e_next; } for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) { @@ -764,7 +822,7 @@ sbinsertoob(sb, m0) m0->m_flags &= ~M_EOR; m->m_flags |= M_EOR; } - sbcompress(sb, m, m0); + return sbcompress(sb, m, m0); } /* @@ -773,28 +831,18 @@ sbinsertoob(sb, m0) * m0 must include a packet header with total length. * Returns 0 if no space in sockbuf or insufficient mbufs. */ -int -sbappendaddr(sb, asa, m0, control) +static int +sbappendaddr_internal(sb, asa, m0, control) register struct sockbuf *sb; struct sockaddr *asa; struct mbuf *m0, *control; { register struct mbuf *m, *n; int space = asa->sa_len; - register struct kextcb *kp; if (m0 && (m0->m_flags & M_PKTHDR) == 0) panic("sbappendaddr"); - kp = sotokextcb(sbtoso(sb)); - while (kp) - { if (kp->e_sout && kp->e_sout->su_sbappendaddr) - { if ((*kp->e_sout->su_sbappendaddr)(sb, asa, m0, control, kp)) - return 0; - } - kp = kp->e_next; - } - if (m0) space += m0->m_pkthdr.len; for (n = control; n; n = n->m_next) { @@ -830,26 +878,55 @@ sbappendaddr(sb, asa, m0, control) } int -sbappendcontrol(sb, m0, control) +sbappendaddr( + struct sockbuf* sb, + struct sockaddr* asa, + struct mbuf *m0, + struct mbuf *control, + int *error_out) +{ + int result = 0; + + if (error_out) *error_out = 0; + + if (m0 && (m0->m_flags & M_PKTHDR) == 0) + panic("sbappendaddrorfree"); + + /* Call socket data in filters */ + if ((sb->sb_flags & SB_RECV) != 0) { + int error; + error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0); + if (error) { + if (error != EJUSTRETURN) { + if (m0) m_freem(m0); + if (control) m_freem(control); + if (error_out) *error_out = error; + } + return 0; + } + } + + result = sbappendaddr_internal(sb, asa, m0, control); + if (result == 0) { + if (m0) m_freem(m0); + if (control) m_freem(control); + if (error_out) *error_out = ENOBUFS; + } + + return result; +} + +static int +sbappendcontrol_internal(sb, m0, control) struct sockbuf *sb; struct mbuf *control, *m0; { register struct mbuf *m, *n; int space = 0; - register struct kextcb *kp; if (control == 0) panic("sbappendcontrol"); - kp = sotokextcb(sbtoso(sb)); - while (kp) - { if (kp->e_sout && kp->e_sout->su_sbappendcontrol) - { if ((*kp->e_sout->su_sbappendcontrol)(sb, m0, control, kp)) - return 0; - } - kp = kp->e_next; - } - for (m = control; ; m = m->m_next) { space += m->m_len; if (m->m_next == 0) @@ -874,12 +951,46 @@ sbappendcontrol(sb, m0, control) return (1); } +int +sbappendcontrol( + struct sockbuf *sb, + struct mbuf *m0, + struct mbuf *control, + int *error_out) +{ + int result = 0; + + if (error_out) *error_out = 0; + + if (sb->sb_flags & SB_RECV) { + int error; + error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0); + if (error) { + if (error != EJUSTRETURN) { + if (m0) m_freem(m0); + if (control) m_freem(control); + if (error_out) *error_out = error; + } + return 0; + } + } + + result = sbappendcontrol_internal(sb, m0, 
control); + if (result == 0) { + if (m0) m_freem(m0); + if (control) m_freem(control); + if (error_out) *error_out = ENOBUFS; + } + + return result; +} + /* * Compress mbuf chain m into the socket * buffer sb following mbuf n. If n * is null, the buffer is presumed empty. */ -void +static int sbcompress(sb, m, n) register struct sockbuf *sb; register struct mbuf *m, *n; @@ -927,6 +1038,7 @@ sbcompress(sb, m, n) printf("semi-panic: sbcompress\n"); } postevent(0,sb, EV_RWBYTES); + return 1; } /* @@ -937,17 +1049,8 @@ void sbflush(sb) register struct sockbuf *sb; { - register struct kextcb *kp; - - kp = sotokextcb(sbtoso(sb)); - while (kp) { - if (kp->e_sout && kp->e_sout->su_sbflush) { - if ((*kp->e_sout->su_sbflush)(sb, kp)) - return; - } - kp = kp->e_next; - } - + if (sb->sb_so == NULL) + panic ("sbflush sb->sb_so already null sb=%x\n", sb); (void)sblock(sb, M_WAIT); while (sb->sb_mbcnt) { /* @@ -958,12 +1061,12 @@ sbflush(sb) break; sbdrop(sb, (int)sb->sb_cc); } - if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt) - panic("sbflush: cc %ld || mb %p || mbcnt %ld", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt); - - sbunlock(sb); + if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_so == NULL) + panic("sbflush: cc %ld || mb %p || mbcnt %ld sb_so=%x", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt, sb->sb_so); postevent(0, sb, EV_RWBYTES); + sbunlock(sb, 1); /* keep socket locked */ + } /* @@ -984,20 +1087,9 @@ sbdrop(sb, len) { register struct mbuf *m, *free_list, *ml; struct mbuf *next, *last; - register struct kextcb *kp; KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_START), sb, len, 0, 0, 0); - kp = sotokextcb(sbtoso(sb)); - while (kp) { - if (kp->e_sout && kp->e_sout->su_sbdrop) { - if ((*kp->e_sout->su_sbdrop)(sb, len, kp)) { - KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_END), sb, len, kp, 0, 0); - return; - } - } - kp = kp->e_next; - } next = (m = sb->sb_mb) ? 
m->m_nextpkt : 0; free_list = last = m; ml = (struct mbuf *)0; @@ -1065,16 +1157,6 @@ sbdroprecord(sb) register struct sockbuf *sb; { register struct mbuf *m, *mn; - register struct kextcb *kp; - - kp = sotokextcb(sbtoso(sb)); - while (kp) { - if (kp->e_sout && kp->e_sout->su_sbdroprecord) { - if ((*kp->e_sout->su_sbdroprecord)(sb, kp)) - return; - } - kp = kp->e_next; - } m = sb->sb_mb; if (m) { @@ -1266,8 +1348,9 @@ int pru_soreceive(struct socket *so, } -int pru_sopoll_notsupp(struct socket *so, int events, - struct ucred *cred) +int +pru_sopoll_notsupp(__unused struct socket *so, __unused int events, + __unused kauth_cred_t cred, __unused void *wql) { return EOPNOTSUPP; } @@ -1365,13 +1448,40 @@ sblock(struct sockbuf *sb, int wf) /* release lock on sockbuf sb */ void -sbunlock(struct sockbuf *sb) +sbunlock(struct sockbuf *sb, int keeplocked) { + struct socket *so = sb->sb_so; + int lr, lr_saved; + lck_mtx_t *mutex_held; + +#ifdef __ppc__ + __asm__ volatile("mflr %0" : "=r" (lr)); + lr_saved = lr; +#endif sb->sb_flags &= ~SB_LOCK; + + if (so->so_proto->pr_getlock != NULL) + mutex_held = (*so->so_proto->pr_getlock)(so, 0); + else + mutex_held = so->so_proto->pr_domain->dom_mtx; + + if (keeplocked == 0) + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); + if (sb->sb_flags & SB_WANT) { sb->sb_flags &= ~SB_WANT; + if (so->so_usecount < 0) + panic("sbunlock: b4 wakeup so=%x ref=%d lr=%x sb_flags=%x\n", sb->sb_so, so->so_usecount, lr_saved, sb->sb_flags); + wakeup((caddr_t)&(sb)->sb_flags); } + if (keeplocked == 0) { /* unlock on exit */ + so->so_usecount--; + if (so->so_usecount < 0) + panic("sbunlock: unlock on exit so=%x ref=%d lr=%x sb_flags=%x\n", so, so->so_usecount, lr_saved, sb->sb_flags); + so->reserved4 = lr_saved; + lck_mtx_unlock(mutex_held); + } } void @@ -1424,8 +1534,12 @@ sotoxsocket(struct socket *so, struct xsocket *xso) xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = so->so_pcb; - xso->xso_protocol = so->so_proto->pr_protocol; - xso->xso_family = so->so_proto->pr_domain->dom_family; + if (so->so_proto) { + xso->xso_protocol = so->so_proto->pr_protocol; + xso->xso_family = so->so_proto->pr_domain->dom_family; + } + else + xso->xso_protocol = xso->xso_family = 0; xso->so_qlen = so->so_qlen; xso->so_incqlen = so->so_incqlen; xso->so_qlimit = so->so_qlimit; @@ -1453,7 +1567,9 @@ sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) xsb->sb_mbmax = sb->sb_mbmax; xsb->sb_lowat = sb->sb_lowat; xsb->sb_flags = sb->sb_flags; - xsb->sb_timeo = sb->sb_timeo; + xsb->sb_timeo = (u_long)(sb->sb_timeo.tv_sec * hz) + sb->sb_timeo.tv_usec / tick; + if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0) + xsb->sb_timeo = 1; } /* diff --git a/bsd/kern/uipc_syscalls.c b/bsd/kern/uipc_syscalls.c index 5f35cd283..6fba04eae 100644 --- a/bsd/kern/uipc_syscalls.c +++ b/bsd/kern/uipc_syscalls.c @@ -62,23 +62,34 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/filedesc.h> -#include <sys/proc.h> -#include <sys/file.h> -#include <sys/buf.h> +#include <sys/proc_internal.h> +#include <sys/file_internal.h> #include <sys/malloc.h> #include <sys/mbuf.h> +#include <kern/lock.h> +#include <sys/domain.h> #include <sys/protosw.h> +#include <sys/signalvar.h> #include <sys/socket.h> #include <sys/socketvar.h> #if KTRACE #include <sys/ktrace.h> #endif #include <sys/kernel.h> +#include <sys/uio_internal.h> #include <bsm/audit_kernel.h> #include <sys/kdebug.h> - +#include <sys/sysproto.h> + +#define f_flag f_fglob->fg_flag +#define f_type f_fglob->fg_type +#define f_msgcount
f_fglob->fg_msgcount +#define f_cred f_fglob->fg_cred +#define f_ops f_fglob->fg_ops +#define f_offset f_fglob->fg_offset +#define f_data f_fglob->fg_data #if KDEBUG #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0) @@ -94,35 +105,13 @@ #endif -struct getsockname_args { - int fdes; - caddr_t asa; - socklen_t *alen; -}; -struct getsockopt_args { - int s; - int level; - int name; - caddr_t val; - socklen_t *avalsize; -} ; - -struct accept_args { - int s; - caddr_t name; - socklen_t *anamelen; -}; - -struct getpeername_args { - int fdes; - caddr_t asa; - socklen_t *alen; -}; +#define HACK_FOR_4056224 1 +#if HACK_FOR_4056224 +static pid_t last_pid_4056224 = 0; +#endif /* HACK_FOR_4056224 */ -/* ARGSUSED */ - #if SENDFILE static void sf_buf_init(void *arg); SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL) @@ -136,187 +125,225 @@ static struct sf_buf *sf_bufs; static int sf_buf_alloc_want; #endif -static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags, register_t *retval)); -static int recvit __P((struct proc *p, int s, struct msghdr *mp, - caddr_t namelenp, register_t *retval)); +static int sendit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop, + int flags, register_t *retval); +static int recvit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop, + user_addr_t namelenp, register_t *retval); -static int accept1 __P((struct proc *p, struct accept_args *uap, register_t *retval, int compat)); -static int getsockname1 __P((struct proc *p, struct getsockname_args *uap, - register_t *retval, int compat)); -static int getpeername1 __P((struct proc *p, struct getpeername_args *uap, - register_t *retval, int compat)); +static int accept1(struct proc *p, struct accept_args *uap, register_t *retval, int compat); +static int getsockname1(struct proc *p, struct getsockname_args *uap, + register_t *retval, int compat); +static int getpeername1(struct proc *p, struct getpeername_args *uap, + register_t *retval, int compat); + + +#if COMPAT_43_SOCKET +struct orecvmsg_args { + int s; + struct omsghdr *msg; + int flags; +}; +struct osendmsg_args { + int s; + caddr_t msg; + int flags; +}; +struct osend_args { + int s; + caddr_t buf; + int len; + int flags; +}; +struct orecv_args { + int s; + caddr_t buf; + int len; + int flags; +}; + +int oaccept(struct proc *p, struct accept_args *uap, register_t *retval); +int ogetpeername(struct proc *p, struct getpeername_args *uap, register_t *retval); +int ogetsockname(struct proc *p, struct getsockname_args *uap, register_t *retval); +int orecv(struct proc *p, struct orecv_args *uap, register_t *retval); +int orecvfrom(struct proc *p, struct recvfrom_args *uap, register_t *retval); +int orecvmsg(struct proc *p, struct orecvmsg_args *uap, register_t *retval); +int osend(struct proc *p, struct osend_args *uap, register_t *retval); +int osendmsg(struct proc *p, struct osendmsg_args *uap, register_t *retval); +#endif // COMPAT_43_SOCKET /* * System call interface to the socket abstraction. 
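 *
 * (The syscalls below share one descriptor lifecycle: falloc()
 * reserves an fd that userland cannot see yet, the fileproc is
 * initialized, and only then is UF_RESERVED cleared under
 * proc_fdlock. A minimal sketch of that sequence, using the
 * falloc()/fp_drop() interfaces from this patch; the wrapper
 * itself is hypothetical:)
 */
#if 0 /* illustrative sketch only */
static int
fd_publish_socket(struct proc *p, struct socket *so, int *out_fd)
{
	struct fileproc *fp;
	int fd, error;

	error = falloc(p, &fp, &fd);	/* fd reserved, not yet visible */
	if (error)
		return (error);
	fp->f_flag = FREAD|FWRITE;
	fp->f_type = DTYPE_SOCKET;
	fp->f_ops = &socketops;
	fp->f_data = (caddr_t)so;

	proc_fdlock(p);
	*fdflags(p, fd) &= ~UF_RESERVED;	/* publish to the process */
	fp_drop(p, fd, fp, 1);			/* drop falloc's use count */
	proc_fdunlock(p);
	*out_fd = fd;
	return (0);
}
#endif
/*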
*/ -#if COMPAT_43 || defined(COMPAT_SUNOS) -#define COMPAT_OLDSOCK -#endif extern struct fileops socketops; -struct socket_args { - int domain; - int type; - int protocol; -}; int socket(p, uap, retval) struct proc *p; register struct socket_args *uap; register_t *retval; { - struct filedesc *fdp = p->p_fd; struct socket *so; - struct file *fp; + struct fileproc *fp; int fd, error; AUDIT_ARG(socket, uap->domain, uap->type, uap->protocol); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - error = falloc(p, &fp, &fd); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - if (error) + error = falloc(p, &fp, &fd); + if (error) { return (error); + } fp->f_flag = FREAD|FWRITE; fp->f_type = DTYPE_SOCKET; fp->f_ops = &socketops; - if (error = socreate(uap->domain, &so, uap->type, - uap->protocol)) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - fdrelse(p, fd); - ffree(fp); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + + error = socreate(uap->domain, &so, uap->type, uap->protocol); + if (error) { + fp_free(p, fd, fp); } else { fp->f_data = (caddr_t)so; + + proc_fdlock(p); *fdflags(p, fd) &= ~UF_RESERVED; + + fp_drop(p, fd, fp, 1); + proc_fdunlock(p); + *retval = fd; } return (error); } -struct bind_args { - int s; - caddr_t name; - socklen_t namelen; -}; - /* ARGSUSED */ int -bind(p, uap, retval) - struct proc *p; - register struct bind_args *uap; - register_t *retval; +bind(struct proc *p, struct bind_args *uap, __unused register_t *retval) { - struct file *fp; struct sockaddr *sa; + struct socket *so; int error; AUDIT_ARG(fd, uap->s); - error = getsock(p->p_fd, uap->s, &fp); + error = file_socket(uap->s, &so); if (error) return (error); error = getsockaddr(&sa, uap->name, uap->namelen); - if (error) - return (error); + if (error) + goto out; AUDIT_ARG(sockaddr, p, sa); - if (fp->f_data != NULL) - error = sobind((struct socket *)fp->f_data, sa); + if (so != NULL) + error = sobind(so, sa); else error = EBADF; FREE(sa, M_SONAME); +out: + file_drop(uap->s); return (error); } -struct listen_args { - int s; - int backlog; -}; - - int -listen(p, uap, retval) - struct proc *p; - register struct listen_args *uap; - register_t *retval; +listen(__unused struct proc *p, register struct listen_args *uap, + __unused register_t *retval) { - struct file *fp; int error; + struct socket *so; AUDIT_ARG(fd, uap->s); - error = getsock(p->p_fd, uap->s, &fp); + error = file_socket(uap->s, &so); if (error) return (error); - if (fp->f_data != NULL) - return (solisten((struct socket *)fp->f_data, uap->backlog)); + if (so != NULL) + error = solisten(so, uap->backlog); else - return (EBADF); + error = EBADF; + file_drop(uap->s); + return (error); } -#ifndef COMPAT_OLDSOCK +#if !COMPAT_43_SOCKET #define accept1 accept #endif int -accept1(p, uap, retval, compat) - struct proc *p; - register struct accept_args *uap; - register_t *retval; - int compat; +accept1(struct proc *p, struct accept_args *uap, register_t *retval, int compat) { - struct file *fp; + struct fileproc *fp; struct sockaddr *sa; - u_int namelen; - int error, s; - struct socket *head, *so; - int fd; + socklen_t namelen; + int error; + struct socket *head, *so = NULL; + lck_mtx_t *mutex_held; + int fd = uap->s; + int newfd; short fflag; /* type must match fp->f_flag */ - int tmpfd; + int dosocklock = 0; AUDIT_ARG(fd, uap->s); if (uap->name) { - error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen, - sizeof (namelen)); + error = copyin(uap->anamelen, (caddr_t)&namelen, + sizeof(socklen_t)); if (error) return (error); } - error
= getsock(p->p_fd, uap->s, &fp); - if (error) + error = fp_getfsock(p, fd, &fp, &head); + if (error) { + if (error == EOPNOTSUPP) + error = ENOTSOCK; return (error); - s = splnet(); - head = (struct socket *)fp->f_data; + } if (head == NULL) { - splx(s); - return (EBADF); + error = EBADF; + goto out; } + + socket_lock(head, 1); + + if (head->so_proto->pr_getlock != NULL) { + mutex_held = (*head->so_proto->pr_getlock)(head, 0); + dosocklock = 1; + } + else { + mutex_held = head->so_proto->pr_domain->dom_mtx; + dosocklock = 0; + } + + if ((head->so_options & SO_ACCEPTCONN) == 0) { - splx(s); - return (EINVAL); + socket_unlock(head, 1); + error = EINVAL; + goto out; } if ((head->so_state & SS_NBIO) && head->so_comp.tqh_first == NULL) { - splx(s); - return (EWOULDBLOCK); + socket_unlock(head, 1); + error = EWOULDBLOCK; + goto out; } while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { if (head->so_state & SS_CANTRCVMORE) { head->so_error = ECONNABORTED; break; } - error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH, + if (head->so_usecount < 1) + panic("accept1: head=%x refcount=%d\n", head, head->so_usecount); + error = msleep((caddr_t)&head->so_timeo, mutex_held, PSOCK | PCATCH, "accept", 0); + if (head->so_usecount < 1) + panic("accept1: 2 head=%x refcount=%d\n", head, head->so_usecount); + if ((head->so_state & SS_DRAINING)) { + error = ECONNABORTED; + } if (error) { - splx(s); - return (error); + socket_unlock(head, 1); + goto out; } } if (head->so_error) { error = head->so_error; head->so_error = 0; - splx(s); - return (error); + socket_unlock(head, 1); + goto out; } @@ -327,14 +354,14 @@ accept1(p, uap, retval, compat) * block allowing another process to accept the connection * instead. */ + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); so = TAILQ_FIRST(&head->so_comp); TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; - + socket_unlock(head, 0); /* unlock head to avoid deadlock with select, keep a ref on head */ fflag = fp->f_flag; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - error = falloc(p, &fp, &fd); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + proc_fdlock(p); + error = falloc_locked(p, &fp, &newfd, 1); if (error) { /* * Probably ran out of file descriptors. Put the @@ -342,114 +369,130 @@ accept1(p, uap, retval, compat) * do another wakeup so some other process might * have a chance at it. 
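 *
 * (The wait loop above depends on msleep() atomically dropping
 * mutex_held while blocked and retaking it before returning, which
 * is what replaces the old tsleep()/splnet() pairing. Below is the
 * lock-selection idiom separated from accept's bookkeeping; the
 * helper is hypothetical, the field accesses match the patch:)
 */
#if 0 /* illustrative sketch only */
static int
so_wait_event(struct socket *so, caddr_t chan, const char *wmesg)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	/* msleep releases mutex_held while asleep, reacquires on wakeup */
	return msleep(chan, mutex_held, PSOCK | PCATCH, wmesg, 0);
}
#endif
/*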
*/ + proc_fdunlock(p); + socket_lock(head, 0); TAILQ_INSERT_HEAD(&head->so_comp, so, so_list); head->so_qlen++; - wakeup_one(&head->so_timeo); - splx(s); - return (error); - } else { - *fdflags(p, fd) &= ~UF_RESERVED; - *retval = fd; - } - - so->so_state &= ~SS_COMP; - so->so_head = NULL; + wakeup_one((caddr_t)&head->so_timeo); + socket_unlock(head, 1); + goto out; + } + *fdflags(p, newfd) &= ~UF_RESERVED; + *retval = newfd; fp->f_type = DTYPE_SOCKET; fp->f_flag = fflag; fp->f_ops = &socketops; fp->f_data = (caddr_t)so; + fp_drop(p, newfd, fp, 1); + proc_fdunlock(p); + socket_lock(head, 0); + if (dosocklock) + socket_lock(so, 1); + so->so_state &= ~SS_COMP; + so->so_head = NULL; sa = 0; - (void) soaccept(so, &sa); + (void) soacceptlock(so, &sa, 0); + socket_unlock(head, 1); if (sa == 0) { namelen = 0; if (uap->name) goto gotnoname; - return 0; + if (dosocklock) + socket_unlock(so, 1); + error = 0; + goto out; } AUDIT_ARG(sockaddr, p, sa); if (uap->name) { /* check sa_len before it is destroyed */ if (namelen > sa->sa_len) namelen = sa->sa_len; -#ifdef COMPAT_OLDSOCK +#if COMPAT_43_SOCKET if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif - error = copyout(sa, (caddr_t)uap->name, (u_int)namelen); + error = copyout(sa, uap->name, namelen); if (!error) gotnoname: - error = copyout((caddr_t)&namelen, - (caddr_t)uap->anamelen, sizeof (*uap->anamelen)); + error = copyout((caddr_t)&namelen, uap->anamelen, + sizeof(socklen_t)); } FREE(sa, M_SONAME); - splx(s); + if (dosocklock) + socket_unlock(so, 1); +out: + file_drop(fd); return (error); } int -accept(p, uap, retval) - struct proc *p; - struct accept_args *uap; - register_t *retval; +accept(struct proc *p, struct accept_args *uap, register_t *retval) { return (accept1(p, uap, retval, 0)); } -#ifdef COMPAT_OLDSOCK +#if COMPAT_43_SOCKET int -oaccept(p, uap, retval) - struct proc *p; - struct accept_args *uap; - register_t *retval; +oaccept(struct proc *p, struct accept_args *uap, register_t *retval) { return (accept1(p, uap, retval, 1)); } -#endif /* COMPAT_OLDSOCK */ +#endif /* COMPAT_43_SOCKET */ -struct connect_args { - int s; - caddr_t name; - socklen_t namelen; -}; /* ARGSUSED */ int -connect(p, uap, retval) - struct proc *p; - register struct connect_args *uap; - register_t *retval; +connect(struct proc *p, struct connect_args *uap, __unused register_t *retval) { - struct file *fp; - register struct socket *so; + struct socket *so; struct sockaddr *sa; - int error, s; + lck_mtx_t *mutex_held; + int error; + int fd = uap->s; AUDIT_ARG(fd, uap->s); - error = getsock(p->p_fd, uap->s, &fp); + error = file_socket( fd, &so); if (error) return (error); - so = (struct socket *)fp->f_data; - if (so == NULL) - return (EBADF); - if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) - return (EALREADY); + if (so == NULL) { + error = EBADF; + goto out; + } + + socket_lock(so, 1); + + if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { + socket_unlock(so, 1); + error = EALREADY; + goto out; + } error = getsockaddr(&sa, uap->name, uap->namelen); - if (error) - return (error); + if (error) { + socket_unlock(so, 1); + goto out; + } AUDIT_ARG(sockaddr, p, sa); - error = soconnect(so, sa); + error = soconnectlock(so, sa, 0); if (error) goto bad; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { FREE(sa, M_SONAME); - return (EINPROGRESS); + socket_unlock(so, 1); + error = EINPROGRESS; + goto out; } - s = splnet(); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { - error = 
tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, + if (so->so_proto->pr_getlock != NULL) + mutex_held = (*so->so_proto->pr_getlock)(so, 0); + else + mutex_held = so->so_proto->pr_domain->dom_mtx; + error = msleep((caddr_t)&so->so_timeo, mutex_held, PSOCK | PCATCH, "connec", 0); + if ((so->so_state & SS_DRAINING)) { + error = ECONNABORTED; + } if (error) break; } @@ -457,29 +500,21 @@ connect(p, uap, retval) error = so->so_error; so->so_error = 0; } - splx(s); bad: so->so_state &= ~SS_ISCONNECTING; + socket_unlock(so, 1); FREE(sa, M_SONAME); if (error == ERESTART) error = EINTR; +out: + file_drop(fd); return (error); } -struct socketpair_args { - int domain; - int type; - int protocol; - int *rsv; -}; int -socketpair(p, uap, retval) - struct proc *p; - register struct socketpair_args *uap; - register_t *retval; +socketpair(struct proc *p, struct socketpair_args *uap, __unused register_t *retval) { - register struct filedesc *fdp = p->p_fd; - struct file *fp1, *fp2; + struct fileproc *fp1, *fp2; struct socket *so1, *so2; int fd, error, sv[2]; @@ -490,57 +525,59 @@ socketpair(p, uap, retval) error = socreate(uap->domain, &so2, uap->type, uap->protocol); if (error) goto free1; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + error = falloc(p, &fp1, &fd); - if (error) + if (error) { goto free2; - sv[0] = fd; + } fp1->f_flag = FREAD|FWRITE; fp1->f_type = DTYPE_SOCKET; fp1->f_ops = &socketops; fp1->f_data = (caddr_t)so1; + sv[0] = fd; + error = falloc(p, &fp2, &fd); - if (error) + if (error) { goto free3; + } fp2->f_flag = FREAD|FWRITE; fp2->f_type = DTYPE_SOCKET; fp2->f_ops = &socketops; fp2->f_data = (caddr_t)so2; sv[1] = fd; - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + error = soconnect2(so1, so2); if (error) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); goto free4; } - if (uap->type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ error = soconnect2(so2, so1); if (error) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); goto free4; } } + + proc_fdlock(p); *fdflags(p, sv[0]) &= ~UF_RESERVED; *fdflags(p, sv[1]) &= ~UF_RESERVED; - error = copyout((caddr_t)sv, (caddr_t)uap->rsv, - 2 * sizeof (int)); + fp_drop(p, sv[0], fp1, 1); + fp_drop(p, sv[1], fp2, 1); + proc_fdunlock(p); + + error = copyout((caddr_t)sv, uap->rsv, 2 * sizeof(int)); #if 0 /* old pipe(2) syscall compatibility, unused these days */ retval[0] = sv[0]; /* XXX ??? */ retval[1] = sv[1]; /* XXX ???
*/ #endif /* 0 */ return (error); free4: - fdrelse(p, sv[1]); - ffree(fp2); + fp_free(p, sv[1], fp2); free3: - fdrelse(p, sv[0]); - ffree(fp1); + fp_free(p, sv[0], fp1); free2: - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); (void)soclose(so2); free1: (void)soclose(so1); @@ -548,68 +585,41 @@ free1: } static int -sendit(p, s, mp, flags, retsize) - register struct proc *p; - int s; - register struct msghdr *mp; - int flags; - register_t *retsize; +sendit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop, + int flags, register_t *retval) { - struct file *fp; - struct uio auio; - register struct iovec *iov; - register int i; struct mbuf *control; struct sockaddr *to; - int len, error; + int error; struct socket *so; + user_ssize_t len; #if KTRACE - struct iovec *ktriov = NULL; - struct uio ktruio; + uio_t ktruio = NULL; #endif KERNEL_DEBUG(DBG_FNC_SENDIT | DBG_FUNC_START, 0,0,0,0,0); - if (error = getsock(p->p_fd, s, &fp)) + error = file_socket(s, &so); + if (error ) { KERNEL_DEBUG(DBG_FNC_SENDIT | DBG_FUNC_END, error,0,0,0,0); return (error); } - - auio.uio_iov = mp->msg_iov; - auio.uio_iovcnt = mp->msg_iovlen; - auio.uio_segflg = UIO_USERSPACE; - auio.uio_rw = UIO_WRITE; - auio.uio_procp = p; - auio.uio_offset = 0; /* XXX */ - auio.uio_resid = 0; - iov = mp->msg_iov; - for (i = 0; i < mp->msg_iovlen; i++, iov++) { - if (iov->iov_len < 0) - { - KERNEL_DEBUG(DBG_FNC_SENDIT | DBG_FUNC_END, EINVAL,0,0,0,0); - return (EINVAL); - } - - if ((auio.uio_resid += iov->iov_len) < 0) - { - KERNEL_DEBUG(DBG_FNC_SENDIT | DBG_FUNC_END, EINVAL,0,0,0,0); - return (EINVAL); - } - } + if (mp->msg_name) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error) { KERNEL_DEBUG(DBG_FNC_SENDIT | DBG_FUNC_END, error,0,0,0,0); - return (error); + goto out; } AUDIT_ARG(sockaddr, p, to); - } else + } else { to = 0; + } if (mp->msg_control) { - if (mp->msg_controllen < sizeof(struct cmsghdr) -#ifdef COMPAT_OLDSOCK - && mp->msg_flags != MSG_COMPAT + if (mp->msg_controllen < ((socklen_t)sizeof(struct cmsghdr)) +#if COMPAT_43_SOCKET + && !(mp->msg_flags & MSG_COMPAT) #endif ) { error = EINVAL; @@ -619,8 +629,8 @@ sendit(p, s, mp, flags, retsize) mp->msg_controllen, MT_CONTROL); if (error) goto bad; -#ifdef COMPAT_OLDSOCK - if (mp->msg_flags == MSG_COMPAT) { +#if COMPAT_43_SOCKET + if (mp->msg_flags & MSG_COMPAT) { register struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_WAIT); @@ -635,27 +645,24 @@ sendit(p, s, mp, flags, retsize) } } #endif - } else + } else { control = 0; + } #if KTRACE - if (KTRPOINT(p, KTR_GENIO)) { - int iovlen = auio.uio_iovcnt * sizeof (struct iovec); - - MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); - bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); - ktruio = auio; - } + if (KTRPOINT(p, KTR_GENIO)) { + ktruio = uio_duplicate(uiop); + } #endif - len = auio.uio_resid; - so = (struct socket *)fp->f_data; + + len = uio_resid(uiop); if (so == NULL) error = EBADF; else - error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control, + error = so->so_proto->pr_usrreqs->pru_sosend(so, to, uiop, 0, control, flags); if (error) { - if (auio.uio_resid != len && (error == ERESTART || + if (uio_resid(uiop) != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ @@ -663,278 +670,287 @@ sendit(p, s, mp, flags, retsize) psignal(p, SIGPIPE); } if (error == 0) - *retsize = len - auio.uio_resid; + *retval = (int)(len - uio_resid(uiop)); +bad: #if KTRACE - if (ktriov != NULL) 
{ + if (ktruio != NULL) { if (error == 0) { - ktruio.uio_iov = ktriov; - ktruio.uio_resid = retsize[0]; - ktrgenio(p->p_tracep, s, UIO_WRITE, &ktruio, error, -1); + uio_setresid(ktruio, retval[0]); + ktrgenio(p->p_tracep, s, UIO_WRITE, ktruio, error); } - FREE(ktriov, M_TEMP); + uio_free(ktruio); } #endif -bad: if (to) FREE(to, M_SONAME); KERNEL_DEBUG(DBG_FNC_SENDIT | DBG_FUNC_END, error,0,0,0,0); +out: + file_drop(s); return (error); } -struct sendto_args { - int s; - caddr_t buf; - size_t len; - int flags; - caddr_t to; - int tolen; -}; - int -sendto(p, uap, retval) - struct proc *p; - register struct sendto_args /* { - int s; - caddr_t buf; - size_t len; - int flags; - caddr_t to; - int tolen; - } */ *uap; - register_t *retval; - +sendto(struct proc *p, struct sendto_args *uap, register_t *retval) { - struct msghdr msg; - struct iovec aiov; - int stat; + struct user_msghdr msg; + int error; + uio_t auio = NULL; KERNEL_DEBUG(DBG_FNC_SENDTO | DBG_FUNC_START, 0,0,0,0,0); AUDIT_ARG(fd, uap->s); + auio = uio_create(1, 0, + (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), + UIO_WRITE); + if (auio == NULL) { + return (ENOMEM); + } + uio_addiov(auio, uap->buf, uap->len); + msg.msg_name = uap->to; msg.msg_namelen = uap->tolen; - msg.msg_iov = &aiov; - msg.msg_iovlen = 1; + /* no need to set up msg_iov. sendit uses uio_t we send it */ + msg.msg_iov = 0; + msg.msg_iovlen = 0; msg.msg_control = 0; -#ifdef COMPAT_OLDSOCK msg.msg_flags = 0; -#endif - aiov.iov_base = uap->buf; - aiov.iov_len = uap->len; - stat = sendit(p, uap->s, &msg, uap->flags, retval); - KERNEL_DEBUG(DBG_FNC_SENDTO | DBG_FUNC_END, stat, *retval,0,0,0); - return(stat); -} -#ifdef COMPAT_OLDSOCK -struct osend_args { - int s; - caddr_t buf; - int len; - int flags; -}; + error = sendit(p, uap->s, &msg, auio, uap->flags, retval); + + if (auio != NULL) { + uio_free(auio); + } + +#if HACK_FOR_4056224 + /* + * Radar 4056224 + * Temporary workaround to let send() and recv() work over a pipe for binary compatibility + * This will be removed in the release following Tiger + */ + if (error == ENOTSOCK) { + struct fileproc *fp; + + if (fp_lookup(p, uap->s, &fp, 0) == 0) { + (void) fp_drop(p, uap->s, fp,0); + + if (fp->f_type == DTYPE_PIPE) { + struct write_args write_uap; + user_ssize_t write_retval; + + if (p->p_pid > last_pid_4056224) { + last_pid_4056224 = p->p_pid; + + printf("%s[%d] uses send/recv on a pipe\n", + p->p_comm, p->p_pid); + } + + bzero(&write_uap, sizeof(struct write_args)); + write_uap.fd = uap->s; + write_uap.cbuf = uap->buf; + write_uap.nbyte = uap->len; + + error = write(p, &write_uap, &write_retval); + *retval = (int)write_retval; + } + } + } +#endif /* HACK_FOR_4056224 */ -int -osend(p, uap, retval) - struct proc *p; - register struct osend_args /* { - int s; - caddr_t buf; - int len; - int flags; - } */ *uap; - register_t *retval; + KERNEL_DEBUG(DBG_FNC_SENDTO | DBG_FUNC_END, error, *retval,0,0,0); + + return(error); +} +#if COMPAT_43_SOCKET +int +osend(__unused struct proc *p, + __unused struct osend_args *uap, + __unused register_t *retval) { - struct msghdr msg; - struct iovec aiov; - - msg.msg_name = 0; - msg.msg_namelen = 0; - msg.msg_iov = &aiov; - msg.msg_iovlen = 1; - aiov.iov_base = uap->buf; - aiov.iov_len = uap->len; - msg.msg_control = 0; - msg.msg_flags = 0; - return (sendit(p, uap->s, &msg, uap->flags, retval)); + /* these are no longer supported and in fact + * there is no way to call it directly. 
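 *
 * (The replacement path for these stubs is sendmsg() below, which
 * first widens a 32-bit msghdr into a user_msghdr before building
 * a uio. A minimal sketch of that widening step, assuming the
 * CAST_USER_ADDR_T macro used throughout this patch; the helper
 * is hypothetical:)
 */
#if 0 /* illustrative sketch only */
static void
msghdr_to_user_msghdr(const struct msghdr *msg, struct user_msghdr *umsg)
{
	umsg->msg_name       = CAST_USER_ADDR_T(msg->msg_name);
	umsg->msg_namelen    = msg->msg_namelen;
	umsg->msg_iov        = CAST_USER_ADDR_T(msg->msg_iov);
	umsg->msg_iovlen     = msg->msg_iovlen;
	umsg->msg_control    = CAST_USER_ADDR_T(msg->msg_control);
	umsg->msg_controllen = msg->msg_controllen;
	umsg->msg_flags      = msg->msg_flags;
}
#endif
/*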
+ * LP64todo - remove this once we're sure there are no clients + */ + return (ENOTSUP); } -struct osendmsg_args { - int s; - caddr_t msg; - int flags; -}; int -osendmsg(p, uap, retval) - struct proc *p; - register struct osendmsg_args /* { - int s; - caddr_t msg; - int flags; - } */ *uap; - register_t *retval; - +osendmsg(__unused struct proc *p, + __unused struct osendmsg_args *uap, + __unused register_t *retval) { - struct msghdr msg; - struct iovec aiov[UIO_SMALLIOV], *iov; - int error; - - error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr)); - if (error) - return (error); - if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { - if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) - return (EMSGSIZE); - MALLOC(iov, struct iovec *, - sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, - M_WAITOK); - } else - iov = aiov; - error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, - (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); - if (error) - goto done; - msg.msg_flags = MSG_COMPAT; - msg.msg_iov = iov; - error = sendit(p, uap->s, &msg, uap->flags, retval); -done: - if (iov != aiov) - FREE(iov, M_IOV); - return (error); + /* these are no longer supported and in fact + * there is no way to call it directly. + * LP64todo - remove this once we're sure there are no clients + */ + return (ENOTSUP); } #endif -struct sendmsg_args { - int s; - caddr_t msg; - int flags; -}; int -sendmsg(p, uap, retval) - struct proc *p; - register struct sendmsg_args *uap; - register_t *retval; +sendmsg(struct proc *p, register struct sendmsg_args *uap, register_t *retval) { struct msghdr msg; - struct iovec aiov[UIO_SMALLIOV], *iov; + struct user_msghdr user_msg; + caddr_t msghdrp; + int size_of_msghdr; int error; + int size_of_iovec; + uio_t auio = NULL; + struct user_iovec *iovp; KERNEL_DEBUG(DBG_FNC_SENDMSG | DBG_FUNC_START, 0,0,0,0,0); AUDIT_ARG(fd, uap->s); - if (error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg))) + if (IS_64BIT_PROCESS(p)) { + msghdrp = (caddr_t) &user_msg; + size_of_msghdr = sizeof(user_msg); + size_of_iovec = sizeof(struct user_iovec); + } + else { + msghdrp = (caddr_t) &msg; + size_of_msghdr = sizeof(msg); + size_of_iovec = sizeof(struct iovec); + } + error = copyin(uap->msg, msghdrp, size_of_msghdr); + if (error) { KERNEL_DEBUG(DBG_FNC_SENDMSG | DBG_FUNC_END, error,0,0,0,0); return (error); } - - if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { - if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { - KERNEL_DEBUG(DBG_FNC_SENDMSG | DBG_FUNC_END, EMSGSIZE,0,0,0,0); - return (EMSGSIZE); - } - MALLOC(iov, struct iovec *, - sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, - M_WAITOK); - } else - iov = aiov; - if (msg.msg_iovlen && - (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, - (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))) - goto done; - msg.msg_iov = iov; -#ifdef COMPAT_OLDSOCK - msg.msg_flags = 0; + + /* only need to copy if user process is not 64-bit */ + if (!IS_64BIT_PROCESS(p)) { + user_msg.msg_flags = msg.msg_flags; + user_msg.msg_controllen = msg.msg_controllen; + user_msg.msg_control = CAST_USER_ADDR_T(msg.msg_control); + user_msg.msg_iovlen = msg.msg_iovlen; + user_msg.msg_iov = CAST_USER_ADDR_T(msg.msg_iov); + user_msg.msg_namelen = msg.msg_namelen; + user_msg.msg_name = CAST_USER_ADDR_T(msg.msg_name); + } + + if (user_msg.msg_iovlen <= 0 || user_msg.msg_iovlen > UIO_MAXIOV) { + KERNEL_DEBUG(DBG_FNC_SENDMSG | DBG_FUNC_END, EMSGSIZE,0,0,0,0); + return (EMSGSIZE); + } + + /* allocate a uio large enough to hold the number of iovecs passed */ + auio = 
uio_create(user_msg.msg_iovlen, 0, + (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), + UIO_WRITE); + if (auio == NULL) { + error = ENOBUFS; + goto done; + } + + if (user_msg.msg_iovlen) { + /* get location of iovecs within the uio. then copyin the iovecs from + * user space. + */ + iovp = uio_iovsaddr(auio); + if (iovp == NULL) { + error = ENOBUFS; + goto done; + } + error = copyin(user_msg.msg_iov, (caddr_t)iovp, (user_msg.msg_iovlen * size_of_iovec)); + if (error) + goto done; + user_msg.msg_iov = CAST_USER_ADDR_T(iovp); + + /* finish setup of uio_t */ + uio_calculateresid(auio); + } + else { + user_msg.msg_iov = 0; + } + +#if COMPAT_43_SOCKET + user_msg.msg_flags = 0; #endif - error = sendit(p, uap->s, &msg, uap->flags, retval); + error = sendit(p, uap->s, &user_msg, auio, uap->flags, retval); done: - if (iov != aiov) - FREE(iov, M_IOV); + if (auio != NULL) { + uio_free(auio); + } KERNEL_DEBUG(DBG_FNC_SENDMSG | DBG_FUNC_END, error,0,0,0,0); + return (error); } static int -recvit(p, s, mp, namelenp, retval) +recvit(p, s, mp, uiop, namelenp, retval) register struct proc *p; int s; - register struct msghdr *mp; - caddr_t namelenp; + register struct user_msghdr *mp; + uio_t uiop; + user_addr_t namelenp; register_t *retval; { - struct file *fp; - struct uio auio; - register struct iovec *iov; - register int i; int len, error; struct mbuf *m, *control = 0; - caddr_t ctlbuf; + user_addr_t ctlbuf; struct socket *so; struct sockaddr *fromsa = 0; + struct fileproc *fp; #if KTRACE - struct iovec *ktriov = NULL; - struct uio ktruio; + uio_t ktruio = NULL; #endif KERNEL_DEBUG(DBG_FNC_RECVIT | DBG_FUNC_START, 0,0,0,0,0); - if (error = getsock(p->p_fd, s, &fp)) - { + proc_fdlock(p); + if ( (error = fp_lookup(p, s, &fp, 1)) ) { KERNEL_DEBUG(DBG_FNC_RECVIT | DBG_FUNC_END, error,0,0,0,0); + proc_fdunlock(p); return (error); } + if (fp->f_type != DTYPE_SOCKET) { + fp_drop(p, s, fp,1); + proc_fdunlock(p); + return(ENOTSOCK); + } - auio.uio_iov = mp->msg_iov; - auio.uio_iovcnt = mp->msg_iovlen; - auio.uio_segflg = UIO_USERSPACE; - auio.uio_rw = UIO_READ; - auio.uio_procp = p; - auio.uio_offset = 0; /* XXX */ - auio.uio_resid = 0; - iov = mp->msg_iov; - for (i = 0; i < mp->msg_iovlen; i++, iov++) { - if ((auio.uio_resid += iov->iov_len) < 0) { - KERNEL_DEBUG(DBG_FNC_RECVIT | DBG_FUNC_END, EINVAL,0,0,0,0); - return (EINVAL); - } + so = (struct socket *)fp->f_data; + + proc_fdunlock(p); + if (uio_resid(uiop) < 0) { + KERNEL_DEBUG(DBG_FNC_RECVIT | DBG_FUNC_END, EINVAL,0,0,0,0); + error = EINVAL; + goto out1; } #if KTRACE if (KTRPOINT(p, KTR_GENIO)) { - int iovlen = auio.uio_iovcnt * sizeof (struct iovec); - - MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); - bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); - ktruio = auio; + ktruio = uio_duplicate(uiop); } #endif - len = auio.uio_resid; - so = (struct socket *)fp->f_data; + + len = uio_resid(uiop); if (so == NULL) error = EBADF; - else - error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio, + else { + error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, uiop, (struct mbuf **)0, mp->msg_control ? 
&control : (struct mbuf **)0, &mp->msg_flags); + } AUDIT_ARG(sockaddr, p, fromsa); if (error) { - if (auio.uio_resid != len && (error == ERESTART || + if (uio_resid(uiop) != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } #if KTRACE - if (ktriov != NULL) { + if (ktruio != NULL) { if (error == 0) { - ktruio.uio_iov = ktriov; - ktruio.uio_resid = len - auio.uio_resid; - ktrgenio(p->p_tracep, s, UIO_WRITE, &ktruio, error, -1); + uio_setresid(ktruio, len - uio_resid(uiop)); + ktrgenio(p->p_tracep, s, UIO_WRITE, ktruio, error); } - FREE(ktriov, M_TEMP); + uio_free(ktruio); } #endif if (error) goto out; - *retval = len - auio.uio_resid; + *retval = len - uio_resid(uiop); if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == 0) @@ -945,20 +961,19 @@ recvit(p, s, mp, namelenp, retval) #endif /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); -#ifdef COMPAT_OLDSOCK +#if COMPAT_43_SOCKET if (mp->msg_flags & MSG_COMPAT) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif - error = copyout(fromsa, - (caddr_t)mp->msg_name, (unsigned)len); + error = copyout(fromsa, mp->msg_name, (unsigned)len); if (error) goto out; } mp->msg_namelen = len; if (namelenp && (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) { -#ifdef COMPAT_OLDSOCK +#if COMPAT_43_SOCKET if (mp->msg_flags & MSG_COMPAT) error = 0; /* old recvfrom didn't check */ else @@ -967,7 +982,7 @@ recvit(p, s, mp, namelenp, retval) } } if (mp->msg_control) { -#ifdef COMPAT_OLDSOCK +#if COMPAT_43_SOCKET /* * We assume that old recvmsg calls won't receive access * rights and other control info, esp. as control info @@ -990,7 +1005,7 @@ recvit(p, s, mp, namelenp, retval) len = mp->msg_controllen; m = control; mp->msg_controllen = 0; - ctlbuf = (caddr_t) mp->msg_control; + ctlbuf = mp->msg_control; while (m && len > 0) { unsigned int tocopy; @@ -1002,8 +1017,8 @@ recvit(p, s, mp, namelenp, retval) tocopy = len; } - if (error = copyout((caddr_t)mtod(m, caddr_t), - ctlbuf, tocopy)) + error = copyout((caddr_t)mtod(m, caddr_t), ctlbuf, tocopy); + if (error) goto out; ctlbuf += tocopy; @@ -1018,19 +1033,12 @@ out: if (control) m_freem(control); KERNEL_DEBUG(DBG_FNC_RECVIT | DBG_FUNC_END, error,0,0,0,0); +out1: + fp_drop(p, s, fp, 0); return (error); } -struct recvfrom_args { - int s; - caddr_t buf; - size_t len; - int flags; - caddr_t from; - int *fromlenaddr; -}; - int recvfrom(p, uap, retval) struct proc *p; @@ -1044,37 +1052,83 @@ recvfrom(p, uap, retval) } */ *uap; register_t *retval; { - struct msghdr msg; - struct iovec aiov; + struct user_msghdr msg; int error; + uio_t auio = NULL; KERNEL_DEBUG(DBG_FNC_RECVFROM | DBG_FUNC_START, 0,0,0,0,0); AUDIT_ARG(fd, uap->s); if (uap->fromlenaddr) { - error = copyin((caddr_t)uap->fromlenaddr, + error = copyin(uap->fromlenaddr, (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen)); if (error) return (error); } else msg.msg_namelen = 0; msg.msg_name = uap->from; - msg.msg_iov = &aiov; - msg.msg_iovlen = 1; - aiov.iov_base = uap->buf; - aiov.iov_len = uap->len; + auio = uio_create(1, 0, + (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), + UIO_READ); + if (auio == NULL) { + return (ENOMEM); + } + + uio_addiov(auio, uap->buf, uap->len); + /* no need to set up msg_iov. 
recvit uses uio_t we send it */ + msg.msg_iov = 0; + msg.msg_iovlen = 0; msg.msg_control = 0; + msg.msg_controllen = 0; msg.msg_flags = uap->flags; + error = recvit(p, uap->s, &msg, auio, uap->fromlenaddr, retval); + if (auio != NULL) { + uio_free(auio); + } + +#if HACK_FOR_4056224 + /* + * Radar 4056224 + * Temporary workaround to let send() and recv() work over a pipe for binary compatibility + * This will be removed in the release following Tiger + */ + if (error == ENOTSOCK && proc_is64bit(p) == 0) { + struct fileproc *fp; + + if (fp_lookup(p, uap->s, &fp, 0) == 0) { + (void) fp_drop(p, uap->s, fp,0); + + if (fp->f_type == DTYPE_PIPE) { + struct read_args read_uap; + user_ssize_t read_retval; + + if (p->p_pid > last_pid_4056224) { + last_pid_4056224 = p->p_pid; + + printf("%s[%d] uses send/recv on a pipe\n", + p->p_comm, p->p_pid); + } + + bzero(&read_uap, sizeof(struct read_args)); + read_uap.fd = uap->s; + read_uap.cbuf = uap->buf; + read_uap.nbyte = uap->len; + + error = read(p, &read_uap, &read_retval); + *retval = (int)read_retval; + } + } + } +#endif /* HACK_FOR_4056224 */ + KERNEL_DEBUG(DBG_FNC_RECVFROM | DBG_FUNC_END, error,0,0,0,0); - return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr, retval)); + + return (error); } -#ifdef COMPAT_OLDSOCK +#if COMPAT_43_SOCKET int -orecvfrom(p, uap, retval) - struct proc *p; - struct recvfrom_args *uap; - register_t *retval; +orecvfrom(struct proc *p, struct recvfrom_args *uap, register_t *retval) { uap->flags |= MSG_COMPAT; @@ -1083,32 +1137,17 @@ orecvfrom(p, uap, retval) #endif -#ifdef COMPAT_OLDSOCK -struct orecv_args { - int s; - caddr_t buf; - int len; - int flags; -}; - +#if COMPAT_43_SOCKET int -orecv(p, uap, retval) - struct proc *p; - struct orecv_args *uap; - register_t *retval; +orecv(__unused struct proc *p, __unused struct orecv_args *uap, + __unused register_t *retval) { - struct msghdr msg; - struct iovec aiov; - - msg.msg_name = 0; - msg.msg_namelen = 0; - msg.msg_iov = &aiov; - msg.msg_iovlen = 1; - aiov.iov_base = uap->buf; - aiov.iov_len = uap->len; - msg.msg_control = 0; - msg.msg_flags = uap->flags; - return (recvit(p, uap->s, &msg, (caddr_t)0, retval)); + /* these are no longer supported and in fact + * there is no way to call it directly. + * LP64todo - remove this once we're sure there are no clients + */ + + return (ENOTSUP); } /* @@ -1116,58 +1155,20 @@ orecv(p, uap, retval) * overlays the new one, missing only the flags, and with the (old) access * rights where the control fields are now. */ -struct orecvmsg_args { - int s; - struct omsghdr *msg; - int flags; -}; - int -orecvmsg(p, uap, retval) - struct proc *p; - struct orecvmsg_args *uap; - register_t *retval; +orecvmsg(__unused struct proc *p, __unused struct orecvmsg_args *uap, + __unused register_t *retval) { - struct msghdr msg; - struct iovec aiov[UIO_SMALLIOV], *iov; - int error; + /* these are no longer supported and in fact + * there is no way to call it directly. 
+ * LP64todo - remove this once we're sure there are no clients + */ - error = copyin((caddr_t)uap->msg, (caddr_t)&msg, - sizeof (struct omsghdr)); - if (error) - return (error); - if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { - if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) - return (EMSGSIZE); - MALLOC(iov, struct iovec *, - sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, - M_WAITOK); - } else - iov = aiov; - msg.msg_flags = uap->flags | MSG_COMPAT; - error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, - (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); - if (error) - goto done; - msg.msg_iov = iov; - error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen, retval); + return (ENOTSUP); - if (msg.msg_controllen && error == 0) - error = copyout((caddr_t)&msg.msg_controllen, - (caddr_t)&uap->msg->msg_accrightslen, sizeof (int)); -done: - if (iov != aiov) - FREE(iov, M_IOV); - return (error); } #endif -struct recvmsg_args { - int s; - struct msghdr *msg; - int flags; -}; - int recvmsg(p, uap, retval) struct proc *p; @@ -1175,73 +1176,125 @@ recvmsg(p, uap, retval) register_t *retval; { struct msghdr msg; - struct iovec aiov[UIO_SMALLIOV], *uiov, *iov; + struct user_msghdr user_msg; + caddr_t msghdrp; + int size_of_msghdr; + user_addr_t uiov; register int error; + int size_of_iovec; + uio_t auio = NULL; + struct user_iovec *iovp; KERNEL_DEBUG(DBG_FNC_RECVMSG | DBG_FUNC_START, 0,0,0,0,0); AUDIT_ARG(fd, uap->s); - if (error = copyin((caddr_t)uap->msg, (caddr_t)&msg, - sizeof (msg))) + if (IS_64BIT_PROCESS(p)) { + msghdrp = (caddr_t) &user_msg; + size_of_msghdr = sizeof(user_msg); + size_of_iovec = sizeof(struct user_iovec); + } + else { + msghdrp = (caddr_t) &msg; + size_of_msghdr = sizeof(msg); + size_of_iovec = sizeof(struct iovec); + } + error = copyin(uap->msg, msghdrp, size_of_msghdr); + if (error) { KERNEL_DEBUG(DBG_FNC_RECVMSG | DBG_FUNC_END, error,0,0,0,0); return (error); } - if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { - if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { - KERNEL_DEBUG(DBG_FNC_RECVMSG | DBG_FUNC_END, EMSGSIZE,0,0,0,0); - return (EMSGSIZE); - } - MALLOC(iov, struct iovec *, - sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, - M_WAITOK); - } else - iov = aiov; -#ifdef COMPAT_OLDSOCK - msg.msg_flags = uap->flags &~ MSG_COMPAT; + /* only need to copy if user process is not 64-bit */ + if (!IS_64BIT_PROCESS(p)) { + user_msg.msg_flags = msg.msg_flags; + user_msg.msg_controllen = msg.msg_controllen; + user_msg.msg_control = CAST_USER_ADDR_T(msg.msg_control); + user_msg.msg_iovlen = msg.msg_iovlen; + user_msg.msg_iov = CAST_USER_ADDR_T(msg.msg_iov); + user_msg.msg_namelen = msg.msg_namelen; + user_msg.msg_name = CAST_USER_ADDR_T(msg.msg_name); + } + + if (user_msg.msg_iovlen <= 0 || user_msg.msg_iovlen > UIO_MAXIOV) { + KERNEL_DEBUG(DBG_FNC_RECVMSG | DBG_FUNC_END, EMSGSIZE,0,0,0,0); + return (EMSGSIZE); + } + +#if COMPAT_43_SOCKET + user_msg.msg_flags = uap->flags &~ MSG_COMPAT; #else - msg.msg_flags = uap->flags; + user_msg.msg_flags = uap->flags; #endif - uiov = msg.msg_iov; - msg.msg_iov = iov; - error = copyin((caddr_t)uiov, (caddr_t)iov, - (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + + /* allocate a uio large enough to hold the number of iovecs passed */ + auio = uio_create(user_msg.msg_iovlen, 0, + (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), + UIO_READ); + if (auio == NULL) { + error = ENOMEM; + goto done; + } + + /* get location of iovecs within the uio. then copyin the iovecs from + * user space. 
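 *
 * (The steps below, uio_create() sized for msg_iovlen, then
 * uio_iovsaddr() for the embedded iovec array, then copyin(), then
 * uio_calculateresid(), form one idiom shared by sendmsg() and
 * recvmsg(). A minimal sketch assuming the uio_* interfaces used
 * in this patch; the helper is hypothetical:)
 */
#if 0 /* illustrative sketch only */
static int
uio_from_user_iovs(struct proc *p, user_addr_t uiov, int iovlen,
    int rw, uio_t *uiop)
{
	uio_t auio;
	struct user_iovec *iovp;
	int size_of_iovec = IS_64BIT_PROCESS(p) ?
	    sizeof(struct user_iovec) : sizeof(struct iovec);
	int error;

	auio = uio_create(iovlen, 0,
	    (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), rw);
	if (auio == NULL)
		return (ENOMEM);
	iovp = uio_iovsaddr(auio);	/* iovec storage lives inside the uio */
	if (iovp == NULL) {
		uio_free(auio);
		return (ENOMEM);
	}
	error = copyin(uiov, (caddr_t)iovp, iovlen * size_of_iovec);
	if (error) {
		uio_free(auio);
		return (error);
	}
	uio_calculateresid(auio);	/* sum the copied-in lengths */
	*uiop = auio;
	return (0);
}
#endif
/*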
+ */ + iovp = uio_iovsaddr(auio); + if (iovp == NULL) { + error = ENOMEM; + goto done; + } + uiov = user_msg.msg_iov; + user_msg.msg_iov = CAST_USER_ADDR_T(iovp); + error = copyin(uiov, (caddr_t)iovp, (user_msg.msg_iovlen * size_of_iovec)); if (error) goto done; - error = recvit(p, uap->s, &msg, (caddr_t)0, retval); + + /* finish setup of uio_t */ + uio_calculateresid(auio); + + error = recvit(p, uap->s, &user_msg, auio, 0, retval); if (!error) { - msg.msg_iov = uiov; - error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg)); + user_msg.msg_iov = uiov; + /* only need to copy if user process is not 64-bit */ + if (!IS_64BIT_PROCESS(p)) { + // LP64todo - do all these change? if not, then no need to copy all of them! + msg.msg_flags = user_msg.msg_flags; + msg.msg_controllen = user_msg.msg_controllen; + msg.msg_control = CAST_DOWN(caddr_t, user_msg.msg_control); + msg.msg_iovlen = user_msg.msg_iovlen; + msg.msg_iov = (struct iovec *) CAST_DOWN(caddr_t, user_msg.msg_iov); + msg.msg_namelen = user_msg.msg_namelen; + msg.msg_name = CAST_DOWN(caddr_t, user_msg.msg_name); + } + error = copyout(msghdrp, uap->msg, size_of_msghdr); } done: - if (iov != aiov) - FREE(iov, M_IOV); + if (auio != NULL) { + uio_free(auio); + } KERNEL_DEBUG(DBG_FNC_RECVMSG | DBG_FUNC_END, error,0,0,0,0); return (error); } /* ARGSUSED */ -struct shutdown_args { - int s; - int how; -}; - int -shutdown(p, uap, retval) - struct proc *p; - struct shutdown_args *uap; - register_t *retval; +shutdown(__unused struct proc *p, struct shutdown_args *uap, __unused register_t *retval) { - struct file *fp; + struct socket * so; int error; AUDIT_ARG(fd, uap->s); - error = getsock(p->p_fd, uap->s, &fp); + error = file_socket(uap->s, &so); if (error) return (error); - if (fp->f_data == NULL) - return (EBADF); - return (soshutdown((struct socket *)fp->f_data, uap->how)); + if (so == NULL) { + error = EBADF; + goto out; + } + error = soshutdown((struct socket *)so, uap->how); +out: + file_drop(uap->s); + return(error); } @@ -1249,21 +1302,10 @@ shutdown(p, uap, retval) /* ARGSUSED */ -struct setsockopt_args { - int s; - int level; - int name; - caddr_t val; - socklen_t valsize; -}; - int -setsockopt(p, uap, retval) - struct proc *p; - struct setsockopt_args *uap; - register_t *retval; +setsockopt(struct proc *p, struct setsockopt_args *uap, __unused register_t *retval) { - struct file *fp; + struct socket * so; struct sockopt sopt; int error; @@ -1273,7 +1315,7 @@ setsockopt(p, uap, retval) if (uap->valsize < 0) return (EINVAL); - error = getsock(p->p_fd, uap->s, &fp); + error = file_socket(uap->s, &so); if (error) return (error); @@ -1284,33 +1326,37 @@ setsockopt(p, uap, retval) sopt.sopt_valsize = uap->valsize; sopt.sopt_p = p; - if (fp->f_data == NULL) - return (EBADF); - return (sosetopt((struct socket *)fp->f_data, &sopt)); + if (so == NULL) { + error = EINVAL; + goto out; + } + error = sosetopt(so, &sopt); +out: + file_drop(uap->s); + return(error); } int -getsockopt(p, uap, retval) - struct proc *p; - struct getsockopt_args *uap; - register_t *retval; +getsockopt(struct proc *p, struct getsockopt_args *uap, __unused register_t *retval) { - int valsize, error; - struct file *fp; - struct sockopt sopt; + int error; + socklen_t valsize; + struct sockopt sopt; + struct socket * so; - error = getsock(p->p_fd, uap->s, &fp); + error = file_socket(uap->s, &so); if (error) return (error); if (uap->val) { - error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize, - sizeof (valsize)); + error = copyin(uap->avalsize, (caddr_t)&valsize, 
sizeof (valsize)); if (error) - return (error); - if (valsize < 0) - return (EINVAL); + goto out; + if (valsize < 0) { + error = EINVAL; + goto out; + } } else valsize = 0; @@ -1321,79 +1367,17 @@ getsockopt(p, uap, retval) sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */ sopt.sopt_p = p; - if (fp->f_data == NULL) - return (EBADF); - error = sogetopt((struct socket *)fp->f_data, &sopt); + if (so == NULL) { + error = EBADF; + goto out; + } + error = sogetopt((struct socket *)so, &sopt); if (error == 0) { valsize = sopt.sopt_valsize; - error = copyout((caddr_t)&valsize, - (caddr_t)uap->avalsize, sizeof (valsize)); - } - return (error); -} - - - -struct pipe_args { - int dummy; -}; -/* ARGSUSED */ -int -pipe(p, uap, retval) - struct proc *p; - struct pipe_args *uap; - register_t *retval; -{ - struct file *rf, *wf; - struct socket *rso, *wso; - int fd, error; - - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - if (error = socreate(AF_UNIX, &rso, SOCK_STREAM, 0)) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return (error); - } - if (error = socreate(AF_UNIX, &wso, SOCK_STREAM, 0)) { - goto free1; + error = copyout((caddr_t)&valsize, uap->avalsize, sizeof (valsize)); } - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - error = falloc(p, &rf, &fd); - if (error) - goto free2; - retval[0] = fd; - rf->f_flag = FREAD; - rf->f_type = DTYPE_SOCKET; - rf->f_ops = &socketops; - rf->f_data = (caddr_t)rso; - if (error = falloc(p, &wf, &fd)) - goto free3; - wf->f_flag = FWRITE; - wf->f_type = DTYPE_SOCKET; - wf->f_ops = &socketops; - wf->f_data = (caddr_t)wso; - retval[1] = fd; - - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - error = unp_connect2(wso, rso); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - if (error) - goto free4; - *fdflags(p, retval[0]) &= ~UF_RESERVED; - *fdflags(p, retval[1]) &= ~UF_RESERVED; - return (0); -free4: - fdrelse(p, retval[1]); - ffree(wf); -free3: - fdrelse(p, retval[0]); - ffree(rf); -free2: - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - (void)soclose(wso); -free1: - (void)soclose(rso); - - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); +out: + file_drop(uap->s); return (error); } @@ -1403,29 +1387,53 @@ free1: */ /* ARGSUSED */ static int -getsockname1(p, uap, retval, compat) - struct proc *p; - register struct getsockname_args *uap; - register_t *retval; - int compat; +getsockname1(__unused struct proc *p, struct getsockname_args *uap, __unused register_t *retval, + int compat) { - struct file *fp; - register struct socket *so; + struct socket *so; struct sockaddr *sa; - u_int len; + socklen_t len; int error; - error = getsock(p->p_fd, uap->fdes, &fp); + error = file_socket(uap->fdes, &so); if (error) return (error); - error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); + error = copyin(uap->alen, (caddr_t)&len, sizeof(socklen_t)); if (error) - return (error); - so = (struct socket *)fp->f_data; - if (so == NULL) - return (EBADF); + goto out; + if (so == NULL) { + error = EBADF; + goto out; + } sa = 0; + socket_lock(so, 1); error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa); + if (error == 0) + { + struct socket_filter_entry *filter; + int filtered = 0; + for (filter = so->so_filt; filter && error == 0; + filter = filter->sfe_next_onsocket) { + if (filter->sfe_filter->sf_filter.sf_getsockname) { + if (!filtered) { + filtered = 1; + sflt_use(so); + socket_unlock(so, 0); + } + error = filter->sfe_filter->sf_filter.sf_getsockname(filter->sfe_cookie, + so, &sa); + } + } + + 
if (error == EJUSTRETURN) + error = 0; + + if (filtered) { + socket_lock(so, 0); + sflt_unuse(so); + } + } + socket_unlock(so, 1); if (error) goto bad; if (sa == 0) { @@ -1434,73 +1442,97 @@ getsockname1(p, uap, retval, compat) } len = MIN(len, sa->sa_len); -#ifdef COMPAT_OLDSOCK +#if COMPAT_43_SOCKET if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif - error = copyout(sa, (caddr_t)uap->asa, (u_int)len); + error = copyout((caddr_t)sa, uap->asa, len); if (error == 0) gotnothing: - error = copyout((caddr_t)&len, (caddr_t)uap->alen, - sizeof (len)); + error = copyout((caddr_t)&len, uap->alen, sizeof(socklen_t)); bad: if (sa) FREE(sa, M_SONAME); +out: + file_drop(uap->fdes); return (error); } int -getsockname(p, uap, retval) - struct proc *p; - struct getsockname_args *uap; - register_t *retval; +getsockname(struct proc *p, struct getsockname_args *uap, register_t *retval) { - return (getsockname1(p, uap, retval, 0)); } -#ifdef COMPAT_OLDSOCK +#if COMPAT_43_SOCKET int -ogetsockname(p, uap, retval) - struct proc *p; - struct getsockname_args *uap; - register_t *retval; +ogetsockname(struct proc *p, struct getsockname_args *uap, register_t *retval) { - return (getsockname1(p, uap, retval, 1)); } -#endif /* COMPAT_OLDSOCK */ +#endif /* COMPAT_43_SOCKET */ /* * Get name of peer for connected socket. */ /* ARGSUSED */ int -getpeername1(p, uap, retval, compat) - struct proc *p; - register struct getpeername_args *uap; - register_t *retval; - int compat; +getpeername1(__unused struct proc *p, struct getpeername_args *uap, __unused register_t *retval, + int compat) { - struct file *fp; - register struct socket *so; + struct socket *so; struct sockaddr *sa; - u_int len; + socklen_t len; int error; - error = getsock(p->p_fd, uap->fdes, &fp); - if (error) - return (error); - so = (struct socket *)fp->f_data; - if (so == NULL) - return (EBADF); - if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) - return (ENOTCONN); - error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); + error = file_socket(uap->fdes, &so); if (error) return (error); + if (so == NULL) { + error = EBADF; + goto out; + } + + socket_lock(so, 1); + + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { + socket_unlock(so, 1); + error = ENOTCONN; + goto out; + } + error = copyin(uap->alen, (caddr_t)&len, sizeof(socklen_t)); + if (error) { + socket_unlock(so, 1); + goto out; + } sa = 0; error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa); + if (error == 0) + { + struct socket_filter_entry *filter; + int filtered = 0; + for (filter = so->so_filt; filter && error == 0; + filter = filter->sfe_next_onsocket) { + if (filter->sfe_filter->sf_filter.sf_getpeername) { + if (!filtered) { + filtered = 1; + sflt_use(so); + socket_unlock(so, 0); + } + error = filter->sfe_filter->sf_filter.sf_getpeername(filter->sfe_cookie, + so, &sa); + } + } + + if (error == EJUSTRETURN) + error = 0; + + if (filtered) { + socket_lock(so, 0); + sflt_unuse(so); + } + } + socket_unlock(so, 1); if (error) goto bad; if (sa == 0) { @@ -1508,48 +1540,43 @@ getpeername1(p, uap, retval, compat) goto gotnothing; } len = MIN(len, sa->sa_len); -#ifdef COMPAT_OLDSOCK +#if COMPAT_43_SOCKET if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif - error = copyout(sa, (caddr_t)uap->asa, (u_int)len); + error = copyout(sa, uap->asa, len); if (error) goto bad; gotnothing: - error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len)); + error = copyout((caddr_t)&len, uap->alen, sizeof(socklen_t)); bad: if (sa) 
FREE(sa, M_SONAME); +out: + file_drop(uap->fdes); return (error); } int -getpeername(p, uap, retval) - struct proc *p; - struct getpeername_args *uap; - register_t *retval; +getpeername(struct proc *p, struct getpeername_args *uap, register_t *retval) { return (getpeername1(p, uap, retval, 0)); } -#ifdef COMPAT_OLDSOCK +#if COMPAT_43_SOCKET int -ogetpeername(p, uap, retval) - struct proc *p; - struct ogetpeername_args *uap; - register_t *retval; +ogetpeername(struct proc *p, struct getpeername_args *uap, register_t *retval) { - /* XXX uap should have type `getpeername_args *' to begin with. */ - return (getpeername1(p, (struct getpeername_args *)uap, retval, 1)); + return (getpeername1(p, uap, retval, 1)); } -#endif /* COMPAT_OLDSOCK */ +#endif /* COMPAT_43_SOCKET */ int -sockargs(mp, buf, buflen, type) +sockargs(mp, data, buflen, type) struct mbuf **mp; - caddr_t buf; + user_addr_t data; int buflen, type; { register struct sockaddr *sa; @@ -1557,18 +1584,26 @@ sockargs(mp, buf, buflen, type) int error; if ((u_int)buflen > MLEN) { -#ifdef COMPAT_OLDSOCK +#if COMPAT_43_SOCKET if (type == MT_SONAME && (u_int)buflen <= 112) buflen = MLEN; /* unix domain compat. hack */ else #endif - return (EINVAL); + if ((u_int)buflen > MCLBYTES) + return (EINVAL); } m = m_get(M_WAIT, type); if (m == NULL) return (ENOBUFS); + if ((u_int)buflen > MLEN) { + MCLGET(m, M_WAIT); + if ((m->m_flags & M_EXT) == 0) { + m_free(m); + return ENOBUFS; + } + } m->m_len = buflen; - error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); + error = copyin(data, mtod(m, caddr_t), (u_int)buflen); if (error) (void) m_free(m); else { @@ -1576,7 +1611,7 @@ sockargs(mp, buf, buflen, type) if (type == MT_SONAME) { sa = mtod(m, struct sockaddr *); -#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN +#if COMPAT_43_SOCKET && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif @@ -1586,11 +1621,11 @@ sockargs(mp, buf, buflen, type) return (error); } +/* + * Given a user_addr_t of length len, allocate and fill out a *sa. 
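 *
 * (Callers own the returned sockaddr and must FREE it with
 * M_SONAME whether or not they end up using it; bind() earlier in
 * this file is the canonical caller. A minimal usage fragment
 * mirroring that caller, shown only as illustration:)
 */
#if 0 /* illustrative usage sketch only */
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error)
		return (error);
	error = sobind(so, sa);		/* use the copied-in, validated address */
	FREE(sa, M_SONAME);		/* caller frees on every path */
#endif
/*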
+ */ int -getsockaddr(namp, uaddr, len) - struct sockaddr **namp; - caddr_t uaddr; - size_t len; +getsockaddr(struct sockaddr **namp, user_addr_t uaddr, size_t len) { struct sockaddr *sa; int error; @@ -1602,11 +1637,14 @@ getsockaddr(namp, uaddr, len) return EINVAL; MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK); - error = copyin(uaddr, sa, len); + if (sa == NULL) { + return ENOMEM; + } + error = copyin(uaddr, (caddr_t)sa, len); if (error) { FREE(sa, M_SONAME); } else { -#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN +#if COMPAT_43_SOCKET && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif @@ -1616,23 +1654,6 @@ getsockaddr(namp, uaddr, len) return error; } -int -getsock(fdp, fdes, fpp) - struct filedesc *fdp; - int fdes; - struct file **fpp; -{ - register struct file *fp; - - if ((unsigned)fdes >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fdes]) == NULL || - (fdp->fd_ofileflags[fdes] & UF_RESERVED)) - return (EBADF); - if (fp->f_type != DTYPE_SOCKET) - return (ENOTSOCK); - *fpp = fp; - return (0); -} #if SENDFILE /* @@ -1647,9 +1668,11 @@ sf_buf_init(void *arg) int i; SLIST_INIT(&sf_freelist); - sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE); - sf_bufs = _MALLOC(nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT); - bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf)); + kmem_alloc_pageable(kernel_map, &sf_base, nsfbufs * PAGE_SIZE); + MALLOC(sf_bufs, struct sf_buf *, nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT|M_ZERO); + if (sf_bufs == NULL) + return; /* XXX silently fail leaving sf_bufs NULL */ + for (i = 0; i < nsfbufs; i++) { sf_bufs[i].kva = sf_base + i * PAGE_SIZE; SLIST_INSERT_HEAD(&sf_freelist, &sf_bufs[i], free_list); @@ -1663,15 +1686,12 @@ static struct sf_buf * sf_buf_alloc() { struct sf_buf *sf; - int s; - s = splimp(); while ((sf = SLIST_FIRST(&sf_freelist)) == NULL) { sf_buf_alloc_want = 1; tsleep(&sf_freelist, PVM, "sfbufa", 0); } SLIST_REMOVE_HEAD(&sf_freelist, free_list); - splx(s); sf->refcnt = 1; return (sf); } @@ -1699,7 +1719,6 @@ sf_buf_free(caddr_t addr, u_int size) { struct sf_buf *sf; struct vm_page *m; - int s; sf = dtosf(addr); if (sf->refcnt == 0) @@ -1708,7 +1727,6 @@ sf_buf_free(caddr_t addr, u_int size) if (sf->refcnt == 0) { pmap_qremove((vm_offset_t)addr, 1); m = sf->m; - s = splvm(); vm_page_unwire(m, 0); /* * Check for the object going away on us. This can @@ -1719,7 +1737,6 @@ sf_buf_free(caddr_t addr, u_int size) vm_page_lock_queues(); vm_page_free(m); vm_page_unlock_queues(); - splx(s); sf->m = NULL; SLIST_INSERT_HEAD(&sf_freelist, sf, free_list); if (sf_buf_alloc_want) { @@ -1742,8 +1759,7 @@ sf_buf_free(caddr_t addr, u_int size) int sendfile(struct proc *p, struct sendfile_args *uap) { - struct file *fp; - struct filedesc *fdp = p->p_fd; + struct fileproc *fp; struct vnode *vp; struct vm_object *obj; struct socket *so; @@ -1755,45 +1771,47 @@ sendfile(struct proc *p, struct sendfile_args *uap) off_t off, xfsize, sbytes = 0; int error = 0, s; + if (sf_bufs == NULL) { + /* Fail if initialization failed */ + return ENOSYS; + } + /* * Do argument checking. Must be a regular file in, stream * type and connected socket out, positive offset. 
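 *
 * (The checks below unwind through layered labels: done3 drops the
 * socket lock, done2 drops the socket's file reference, done1 drops
 * the vnode's file reference. A skeleton of that acquire/release
 * discipline, mirroring the control flow but hypothetical as a
 * standalone function:)
 */
#if 0 /* illustrative sketch only */
static int
sendfile_refs_sketch(struct proc *p, struct sendfile_args *uap)
{
	struct fileproc *fp;
	struct vnode *vp;
	struct socket *so;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp)))	/* ref on fd and vnode */
		goto done;
	if ((error = file_socket(uap->s, &so)))		/* ref on the socket fd */
		goto done1;
	socket_lock(so, 1);
	/* ... argument checks and the transfer loop run here ... */
	socket_unlock(so, 1);		/* "done3" in the real function */
	file_drop(uap->s);		/* "done2" */
done1:
	file_drop(uap->fd);		/* "done1" */
done:
	return (error);
}
#endif
/*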
*/ - if (((u_int)uap->fd) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[uap->fd]) == NULL || - (fp->f_flag & FREAD) == 0) { - error = EBADF; - goto done; - } - if (fp->f_type != DTYPE_VNODE) { - error = EINVAL; + if (error = fp_getfvp(p, uap->fd, &fp, &vp)) goto done; + if ((fp->f_flag & FREAD) == 0) { + error = EBADF; + goto done1; } - vp = (struct vnode *)fp->f_data; obj = vp->v_object; if (vp->v_type != VREG || obj == NULL) { error = EINVAL; - goto done; + goto done1; } - error = getsock(p->p_fd, uap->s, &fp); + error = file_socket(uap->s, &so); if (error) - goto done; - so = (struct socket *)fp->f_data; + goto done1; if (so == NULL) { error = EBADF; - goto done; + goto done2; } + + socket_lock(so, 1); + if (so->so_type != SOCK_STREAM) { error = EINVAL; - goto done; + goto done3; } if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; - goto done; + goto done3; } if (uap->offset < 0) { error = EINVAL; - goto done; + goto done3; } /* @@ -1801,9 +1819,9 @@ sendfile(struct proc *p, struct sendfile_args *uap) * any headers/trailers. */ if (uap->hdtr != NULL) { - error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); + error = copyin(CAST_USER_ADDR_T(uap->hdtr), &hdtr, sizeof(hdtr)); if (error) - goto done; + goto done3; /* * Send any headers. Wimp out and use writev(2). */ @@ -1813,7 +1831,7 @@ nuap.iovcnt = hdtr.hdr_cnt; error = writev(p, &nuap); if (error) - goto done; + goto done3; sbytes += p->p_retval[0]; } } @@ -1858,8 +1876,8 @@ retry_lookup: error = EPIPE; else error = EAGAIN; - sbunlock(&so->so_snd); - goto done; + sbunlock(&so->so_snd, 0); /* will release lock */ + goto done2; } /* * Attempt to look up the page. If the page doesn't exist or the @@ -1897,20 +1915,17 @@ retry_lookup: /* * Get the page from backing store. */ - bsize = vp->v_mount->mnt_stat.f_iosize; + bsize = vp->v_mount->mnt_vfsstat.f_iosize; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = 0; aiov.iov_len = MAXBSIZE; - auio.uio_resid = MAXBSIZE; auio.uio_offset = trunc_page(off); auio.uio_segflg = UIO_NOCOPY; auio.uio_rw = UIO_READ; - auio.uio_procp = p; - vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p); + uio_setresid(&auio, MAXBSIZE); error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16), p->p_ucred); - VOP_UNLOCK(vp, 0, p); vm_page_flag_clear(pg, PG_ZERO); vm_page_io_finish(pg); if (error) { @@ -1925,8 +1940,8 @@ retry_lookup: vm_page_lock_queues(); vm_page_free(pg); vm_page_unlock_queues(); - sbunlock(&so->so_snd); - goto done; + sbunlock(&so->so_snd, 0); /* will release socket lock */ + goto done2; } } else { if ((pg->flags & PG_BUSY) || pg->busy) { @@ -1937,10 +1952,8 @@ retry_lookup: */ vm_page_flag_set(pg, PG_WANTED); tsleep(pg, PVM, "sfpbsy", 0); - splx(s); goto retry_lookup; } - splx(s); } /* * Protect from having the page ripped out from beneath us. @@ -1958,6 +1971,11 @@ retry_lookup: * Get an mbuf header and set it up as having external storage. */ MGETHDR(m, M_WAIT, MT_DATA); + if (m == NULL) { + error = ENOBUFS; + sbunlock(&so->so_snd, 0); /* will release socket lock */ + goto done2; + } m->m_ext.ext_free = sf_buf_free; m->m_ext.ext_ref = sf_buf_ref; m->m_ext.ext_buf = (void *)sf->kva; @@ -1968,7 +1986,6 @@ retry_lookup: /* * Add the buffer to the socket buffer chain. */ - s = splnet(); retry_space: /* * Make sure that the socket is still able to take more data.
@@ -1989,9 +2006,8 @@ retry_space: so->so_error = 0; } m_freem(m); - sbunlock(&so->so_snd); - splx(s); - goto done; + sbunlock(&so->so_snd, 0); /* will release socket lock */ + goto done2; } /* * Wait for socket space to become available. We do this just @@ -2001,10 +2017,9 @@ retry_space: if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) { if (so->so_state & SS_NBIO) { m_freem(m); - sbunlock(&so->so_snd); - splx(s); + sbunlock(&so->so_snd, 0); /* will release socket lock */ error = EAGAIN; - goto done; + goto done2; } error = sbwait(&so->so_snd); /* @@ -2014,20 +2029,19 @@ retry_space: */ if (error) { m_freem(m); - sbunlock(&so->so_snd); - splx(s); - goto done; + sbunlock(&so->so_snd, 0); + goto done2; } goto retry_space; } error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p); splx(s); if (error) { - sbunlock(&so->so_snd); - goto done; + sbunlock(&so->so_snd, 0); /* will release socket lock */ + goto done2; } } - sbunlock(&so->so_snd); + sbunlock(&so->so_snd, 0); /* will release socket lock */ /* * Send trailers. Wimp out and use writev(2). @@ -2038,15 +2052,22 @@ retry_space: nuap.iovcnt = hdtr.trl_cnt; error = writev(p, &nuap); if (error) - goto done; + goto done2; sbytes += p->p_retval[0]; } - +done2: + file_drop(uap->s); +done1: + file_drop(uap->fd); done: if (uap->sbytes != NULL) { - copyout(&sbytes, uap->sbytes, sizeof(off_t)); + /* XXX this appears bogus for some early failure conditions */ + copyout(&sbytes, CAST_USER_ADDR_T(uap->sbytes), sizeof(off_t)); } return (error); +done3: + socket_unlock(so, 1); + goto done2; } #endif diff --git a/bsd/kern/uipc_usrreq.c b/bsd/kern/uipc_usrreq.c index dda829231..13f275e7e 100644 --- a/bsd/kern/uipc_usrreq.c +++ b/bsd/kern/uipc_usrreq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
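The retry_space block above is the stock socket-buffer backpressure loop; stripped of the sendfile-specific cleanup it reduces to the sketch below (a sketch only, assuming the caller already holds the socket lock and owns the send-buffer lock). The real code re-runs the SS_CANTSENDMORE and so_error checks after every wakeup, which is why it is written with a goto rather than a plain loop:

    /* Sketch: wait for room in so_snd, honoring non-blocking mode. */
    while (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
            if (so->so_state & SS_NBIO)
                    return (EAGAIN);        /* non-blocking: fail fast */
            error = sbwait(&so->so_snd);    /* sleep until space frees up */
            if (error)
                    return (error);         /* interrupted or timed out */
    }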
* * @APPLE_LICENSE_HEADER_START@ * @@ -60,12 +60,13 @@ #include <sys/domain.h> #include <sys/fcntl.h> #include <sys/malloc.h> /* XXX must be before <sys/file.h> */ -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/filedesc.h> #include <sys/lock.h> #include <sys/mbuf.h> #include <sys/namei.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> @@ -73,14 +74,23 @@ #include <sys/sysctl.h> #include <sys/un.h> #include <sys/unpcb.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> +#include <sys/kdebug.h> #include <kern/zalloc.h> +#include <kern/locks.h> +#define f_msgcount f_fglob->fg_msgcount +#define f_cred f_fglob->fg_cred +#define f_ops f_fglob->fg_ops +#define f_offset f_fglob->fg_offset +#define f_data f_fglob->fg_data struct zone *unp_zone; static unp_gen_t unp_gencnt; static u_int unp_count; +static lck_mtx_t *unp_mutex; +extern lck_mtx_t * uipc_lock; static struct unp_head unp_shead, unp_dhead; /* @@ -92,22 +102,24 @@ static struct unp_head unp_shead, unp_dhead; * need a proper out-of-band * lock pushdown */ -static struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; +static struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL, { 0 } }; static ino_t unp_ino; /* prototype for fake inode numbers */ -static int unp_attach __P((struct socket *)); -static void unp_detach __P((struct unpcb *)); -static int unp_bind __P((struct unpcb *,struct sockaddr *, struct proc *)); -static int unp_connect __P((struct socket *,struct sockaddr *, - struct proc *)); -static void unp_disconnect __P((struct unpcb *)); -static void unp_shutdown __P((struct unpcb *)); -static void unp_drop __P((struct unpcb *, int)); -static void unp_gc __P((void)); -static void unp_scan __P((struct mbuf *, void (*)(struct file *))); -static void unp_mark __P((struct file *)); -static void unp_discard __P((struct file *)); -static int unp_internalize __P((struct mbuf *, struct proc *)); +static int unp_attach(struct socket *); +static void unp_detach(struct unpcb *); +static int unp_bind(struct unpcb *,struct sockaddr *, struct proc *); +static int unp_connect(struct socket *,struct sockaddr *, struct proc *); +static void unp_disconnect(struct unpcb *); +static void unp_shutdown(struct unpcb *); +static void unp_drop(struct unpcb *, int); +static void unp_gc(void); +static void unp_scan(struct mbuf *, void (*)(struct fileglob *)); +static void unp_mark(struct fileglob *); +static void unp_discard(struct fileglob *); +static void unp_discard_fdlocked(struct fileglob *, struct proc *); +static int unp_internalize(struct mbuf *, struct proc *); +static int unp_listen(struct unpcb *, struct proc *); + static int uipc_abort(struct socket *so) @@ -117,6 +129,8 @@ uipc_abort(struct socket *so) if (unp == 0) return EINVAL; unp_drop(unp, ECONNABORTED); + unp_detach(unp); + sofree(so); return 0; } @@ -143,7 +157,7 @@ uipc_accept(struct socket *so, struct sockaddr **nam) } static int -uipc_attach(struct socket *so, int proto, struct proc *p) +uipc_attach(struct socket *so, __unused int proto, __unused struct proc *p) { struct unpcb *unp = sotounpcb(so); @@ -210,13 +224,13 @@ uipc_disconnect(struct socket *so) } static int -uipc_listen(struct socket *so, struct proc *p) +uipc_listen(struct socket *so, __unused struct proc *p) { struct unpcb *unp = sotounpcb(so); if (unp == 0 || unp->unp_vnode == 0) return EINVAL; - return 0; + return unp_listen(unp, p); } static int @@ -233,7 +247,7 @@ 
uipc_peeraddr(struct socket *so, struct sockaddr **nam) } static int -uipc_rcvd(struct socket *so, int flags) +uipc_rcvd(struct socket *so, __unused int flags) { struct unpcb *unp = sotounpcb(so); struct socket *so2; @@ -316,18 +330,18 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, from = (struct sockaddr *)unp->unp_addr; else from = &sun_noname; - if (sbappendaddr(&so2->so_rcv, from, m, control)) { + if (sbappendaddr(&so2->so_rcv, from, m, control, &error)) { sorwakeup(so2); - m = 0; - control = 0; - } else - error = ENOBUFS; + } + m = 0; + control = 0; if (nam) unp_disconnect(unp); break; } - case SOCK_STREAM: + case SOCK_STREAM: { + int didreceive = 0; #define rcv (&so2->so_rcv) #define snd (&so->so_snd) /* Connect if not connected yet. */ @@ -358,20 +372,22 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, * send buffer hiwater marks to maintain backpressure. * Wake up readers. */ - if (control) { - if (sbappendcontrol(rcv, m, control)) - control = 0; - } else - sbappend(rcv, m); + if ((control && sbappendcontrol(rcv, m, control, NULL)) || + sbappend(rcv, m)) { + didreceive = 1; + } snd->sb_mbmax -= rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt; unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt; snd->sb_hiwat -= rcv->sb_cc - unp->unp_conn->unp_cc; unp->unp_conn->unp_cc = rcv->sb_cc; - sorwakeup(so2); + if (didreceive) + sorwakeup(so2); m = 0; + control = 0; #undef snd #undef rcv + } break; default: @@ -387,6 +403,9 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, unp_shutdown(unp); } + if (control && error != 0) + unp_dispose(control); + release: if (control) m_freem(control); @@ -444,8 +463,43 @@ struct pr_usrreqs uipc_usrreqs = { uipc_connect2, pru_control_notsupp, uipc_detach, uipc_disconnect, uipc_listen, uipc_peeraddr, uipc_rcvd, pru_rcvoob_notsupp, uipc_send, uipc_sense, uipc_shutdown, uipc_sockaddr, - sosend, soreceive, sopoll + sosend, soreceive, pru_sopoll_notsupp }; + +int +uipc_ctloutput( + struct socket *so, + struct sockopt *sopt) +{ + struct unpcb *unp = sotounpcb(so); + int error; + + switch (sopt->sopt_dir) { + case SOPT_GET: + switch (sopt->sopt_name) { + case LOCAL_PEERCRED: + if (unp->unp_flags & UNP_HAVEPC) + error = sooptcopyout(sopt, &unp->unp_peercred, + sizeof(unp->unp_peercred)); + else { + if (so->so_type == SOCK_STREAM) + error = ENOTCONN; + else + error = EINVAL; + } + break; + default: + error = EOPNOTSUPP; + break; + } + break; + case SOPT_SET: + default: + error = EOPNOTSUPP; + break; + } + return (error); +} /* * Both send and receive buffers are allocated PIPSIZ bytes of buffering @@ -479,11 +533,10 @@ SYSCTL_DECL(_net_local); SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); static int -unp_attach(so) - struct socket *so; +unp_attach(struct socket *so) { - register struct unpcb *unp; - int error; + struct unpcb *unp; + int error = 0; if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { switch (so->so_type) { @@ -506,20 +559,22 @@ unp_attach(so) if (unp == NULL) return (ENOBUFS); bzero(unp, sizeof *unp); - unp->unp_gencnt = ++unp_gencnt; - unp_count++; + lck_mtx_lock(unp_mutex); LIST_INIT(&unp->unp_refs); unp->unp_socket = so; + unp->unp_gencnt = ++unp_gencnt; + unp_count++; LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? 
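User space reaches the LOCAL_PEERCRED case above through getsockopt(2) at level SOL_LOCAL; the kernel returns the credentials cached at connect() or listen() time. A minimal sketch, assuming the struct xucred definitions from <sys/ucred.h>:

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <sys/un.h>      /* SOL_LOCAL, LOCAL_PEERCRED */
    #include <sys/ucred.h>   /* struct xucred */
    #include <stdio.h>

    /* Print the effective uid of the peer of a connected
     * AF_LOCAL stream socket 'fd'. */
    int
    print_peer_cred(int fd)
    {
            struct xucred cr;
            socklen_t len = sizeof(cr);

            if (getsockopt(fd, SOL_LOCAL, LOCAL_PEERCRED, &cr, &len) == -1)
                    return (-1);
            printf("peer euid %u, ngroups %d\n",
                (unsigned)cr.cr_uid, (int)cr.cr_ngroups);
            return (0);
    }

For SOCK_STREAM the option fails with ENOTCONN until the socket is connected, matching the error handling above.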
&unp_dhead : &unp_shead, unp, unp_link); so->so_pcb = (caddr_t)unp; + lck_mtx_unlock(unp_mutex); return (0); } static void -unp_detach(unp) - register struct unpcb *unp; +unp_detach(struct unpcb *unp) { + lck_mtx_assert(unp_mutex, LCK_MTX_ASSERT_OWNED); LIST_REMOVE(unp, unp_link); unp->unp_gencnt = ++unp_gencnt; --unp_count; @@ -527,15 +582,14 @@ unp_detach(unp) struct vnode *tvp = unp->unp_vnode; unp->unp_vnode->v_socket = 0; unp->unp_vnode = 0; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - vrele(tvp); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + vnode_rele(tvp); /* drop the usecount */ } if (unp->unp_conn) unp_disconnect(unp); while (unp->unp_refs.lh_first) unp_drop(unp->unp_refs.lh_first, ECONNRESET); soisdisconnected(unp->unp_socket); + unp->unp_socket->so_flags |= SOF_PCBCLEARING; /* makes sure we're getting dealloced */ unp->unp_socket->so_pcb = 0; if (unp_rights) { /* @@ -550,22 +604,26 @@ unp_detach(unp) } if (unp->unp_addr) FREE(unp->unp_addr, M_SONAME); - zfree(unp_zone, (vm_offset_t)unp); + zfree(unp_zone, unp); } static int -unp_bind(unp, nam, p) - struct unpcb *unp; - struct sockaddr *nam; - struct proc *p; +unp_bind( + struct unpcb *unp, + struct sockaddr *nam, + struct proc *p) { struct sockaddr_un *soun = (struct sockaddr_un *)nam; - register struct vnode *vp; - struct vattr vattr; + struct vnode *vp, *dvp; + struct vnode_attr va; + struct vfs_context context; int error, namelen; struct nameidata nd; char buf[SOCK_MAXADDRLEN]; + context.vc_proc = p; + context.vc_ucred = p->p_ucred; /* XXX kauth_cred_get() ??? proxy */ + if (unp->unp_vnode != NULL) return (EINVAL); namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); @@ -573,81 +631,93 @@ unp_bind(unp, nam, p) return EINVAL; strncpy(buf, soun->sun_path, namelen); buf[namelen] = 0; /* null-terminate the string */ - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT, UIO_SYSSPACE, - buf, p); + NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT, UIO_SYSSPACE32, + CAST_USER_ADDR_T(buf), &context); /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ error = namei(&nd); if (error) { - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); return (error); } + dvp = nd.ni_dvp; vp = nd.ni_vp; + if (vp != NULL) { - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - if (nd.ni_dvp == vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); - vrele(vp); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + /* + * need to do this before the vnode_put of dvp + * since we may have to release an fs_nodelock + */ + nameidone(&nd); + + vnode_put(dvp); + vnode_put(vp); + return (EADDRINUSE); } - VATTR_NULL(&vattr); - vattr.va_type = VSOCK; - vattr.va_mode = (ACCESSPERMS & ~p->p_fd->fd_cmask); - VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); - error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); -#if 0 - /* In FreeBSD create leave s parent held ; not here */ - vput(nd.ni_dvp); -#endif + + /* authorize before creating */ + error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, &context); + + if (!error) { + VATTR_INIT(&va); + VATTR_SET(&va, va_type, VSOCK); + VATTR_SET(&va, va_mode, (ACCESSPERMS & ~p->p_fd->fd_cmask)); + + /* create the socket */ + error = vn_create(dvp, &vp, &nd.ni_cnd, &va, 0, &context); + } + + nameidone(&nd); + vnode_put(dvp); + if (error) { - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); return (error); } - vp = nd.ni_vp; + vnode_ref(vp); /* gain a longterm reference */ vp->v_socket = unp->unp_socket; unp->unp_vnode = vp; unp->unp_addr = 
(struct sockaddr_un *)dup_sockaddr(nam, 1); - VOP_UNLOCK(vp, 0, p); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + vnode_put(vp); /* drop the iocount */ + return (0); } static int -unp_connect(so, nam, p) - struct socket *so; - struct sockaddr *nam; - struct proc *p; +unp_connect( + struct socket *so, + struct sockaddr *nam, + struct proc *p) { - register struct sockaddr_un *soun = (struct sockaddr_un *)nam; - register struct vnode *vp; - register struct socket *so2, *so3; - struct unpcb *unp2, *unp3; + struct sockaddr_un *soun = (struct sockaddr_un *)nam; + struct vnode *vp; + struct socket *so2, *so3; + struct unpcb *unp, *unp2, *unp3; + struct vfs_context context; int error, len; struct nameidata nd; char buf[SOCK_MAXADDRLEN]; + context.vc_proc = p; + context.vc_ucred = p->p_ucred; /* XXX kauth_cred_get() ??? proxy */ + len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); if (len <= 0) return EINVAL; strncpy(buf, soun->sun_path, len); buf[len] = 0; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, p); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE32, CAST_USER_ADDR_T(buf), &context); error = namei(&nd); if (error) { - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); return (error); } + nameidone(&nd); vp = nd.ni_vp; if (vp->v_type != VSOCK) { error = ENOTSOCK; goto bad; } - error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p); + + error = vnode_authorize(vp, NULL, KAUTH_VNODE_WRITE_DATA, &context); if (error) goto bad; so2 = vp->v_socket; @@ -655,11 +725,14 @@ unp_connect(so, nam, p) error = ECONNREFUSED; goto bad; } + + /* make sure the socket can't go away while we're connecting */ + so2->so_usecount++; + if (so->so_type != so2->so_type) { error = EPROTOTYPE; goto bad; } - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); /* * Check if socket was connected while we were trying to @@ -668,40 +741,64 @@ unp_connect(so, nam, p) */ if ((so->so_state & SS_ISCONNECTED) != 0) { error = EISCONN; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); goto bad; } if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if ((so2->so_options & SO_ACCEPTCONN) == 0 || - (so3 = sonewconn(so2, 0)) == 0) { + (so3 = sonewconn(so2, 0, nam)) == 0) { error = ECONNREFUSED; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); goto bad; } + unp = sotounpcb(so); unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); if (unp2->unp_addr) unp3->unp_addr = (struct sockaddr_un *) dup_sockaddr((struct sockaddr *) unp2->unp_addr, 1); + + /* + * unp_peercred management: + * + * The connecter's (client's) credentials are copied + * from its process structure at the time of connect() + * (which is now). + */ + cru2x(p->p_ucred, &unp3->unp_peercred); + unp3->unp_flags |= UNP_HAVEPC; + /* + * The receiver's (server's) credentials are copied + * from the unp_peercred member of socket on which the + * former called listen(); unp_listen() cached that + * process's credentials at that time so we can use + * them now. 
+ */ + KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED, + ("unp_connect: listener without cached peercred")); + memcpy(&unp->unp_peercred, &unp2->unp_peercred, + sizeof(unp->unp_peercred)); + unp->unp_flags |= UNP_HAVEPC; + + so2->so_usecount--; /* drop reference taken on so2 */ so2 = so3; + so3->so_usecount++; /* make sure we keep it around */ } error = unp_connect2(so, so2); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); bad: - vput(vp); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + if (so2 != NULL) + so2->so_usecount--; /* release count on socket */ + vnode_put(vp); return (error); } int -unp_connect2(so, so2) - register struct socket *so; - register struct socket *so2; +unp_connect2( + struct socket *so, + struct socket *so2) { - register struct unpcb *unp = sotounpcb(so); - register struct unpcb *unp2; + struct unpcb *unp = sotounpcb(so); + struct unpcb *unp2; if (so2->so_type != so->so_type) return (EPROTOTYPE); @@ -720,6 +817,14 @@ unp_connect2(so, so2) break; case SOCK_STREAM: + /* This takes care of socketpair */ + if (!(unp->unp_flags & UNP_HAVEPC) && !(unp2->unp_flags & UNP_HAVEPC)) { + cru2x(kauth_cred_get(), &unp->unp_peercred); + unp->unp_flags |= UNP_HAVEPC; + + cru2x(kauth_cred_get(), &unp2->unp_peercred); + unp2->unp_flags |= UNP_HAVEPC; + } unp2->unp_conn = unp; soisconnected(so); soisconnected(so2); @@ -732,13 +837,13 @@ unp_connect2(so, so2) } static void -unp_disconnect(unp) - struct unpcb *unp; +unp_disconnect(struct unpcb *unp) { - register struct unpcb *unp2 = unp->unp_conn; + struct unpcb *unp2 = unp->unp_conn; if (unp2 == 0) return; + lck_mtx_assert(unp_mutex, LCK_MTX_ASSERT_OWNED); unp->unp_conn = 0; switch (unp->unp_socket->so_type) { @@ -757,8 +862,7 @@ unp_disconnect(unp) #ifdef notdef void -unp_abort(unp) - struct unpcb *unp; +unp_abort(struct unpcb *unp) { unp_detach(unp); @@ -774,21 +878,25 @@ unp_pcblist SYSCTL_HANDLER_ARGS struct xunpgen xug; struct unp_head *head; + lck_mtx_lock(unp_mutex); head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead); /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ - if (req->oldptr == 0) { + if (req->oldptr == USER_ADDR_NULL) { n = unp_count; req->oldidx = 2 * (sizeof xug) + (n + n/8) * sizeof(struct xunpcb); + lck_mtx_unlock(unp_mutex); return 0; } - if (req->newptr != 0) + if (req->newptr != USER_ADDR_NULL) { + lck_mtx_unlock(unp_mutex); return EPERM; + } /* * OK, now we're committed to doing something. @@ -801,18 +909,24 @@ unp_pcblist SYSCTL_HANDLER_ARGS xug.xug_gen = gencnt; xug.xug_sogen = so_gencnt; error = SYSCTL_OUT(req, &xug, sizeof xug); - if (error) + if (error) { + lck_mtx_unlock(unp_mutex); return error; + } /* * We are done if there is no pcb */ - if (n == 0) + if (n == 0) { + lck_mtx_unlock(unp_mutex); return 0; + } - unp_list = _MALLOC(n * sizeof *unp_list, M_TEMP, M_WAITOK); - if (unp_list == 0) + MALLOC(unp_list, struct unpcb **, n * sizeof *unp_list, M_TEMP, M_WAITOK); + if (unp_list == 0) { + lck_mtx_unlock(unp_mutex); return ENOMEM; + } for (unp = head->lh_first, i = 0; unp && i < n; unp = unp->unp_link.le_next) { @@ -827,7 +941,7 @@ unp_pcblist SYSCTL_HANDLER_ARGS if (unp->unp_gencnt <= gencnt) { struct xunpcb xu; xu.xu_len = sizeof xu; - xu.xu_unpp = unp; + xu.xu_unpp = (struct unpcb_compat *)unp; /* * XXX - need more locking here to protect against * connect/disconnect races for SMP. 
@@ -839,7 +953,7 @@ unp_pcblist SYSCTL_HANDLER_ARGS bcopy(unp->unp_conn->unp_addr, &xu.xu_caddr, unp->unp_conn->unp_addr->sun_len); - bcopy(unp, &xu.xu_unp, sizeof *unp); + bcopy(unp, &xu.xu_unp, sizeof(xu.xu_unp)); sotoxsocket(unp->unp_socket, &xu.xu_socket); error = SYSCTL_OUT(req, &xu, sizeof xu); } @@ -858,6 +972,7 @@ unp_pcblist SYSCTL_HANDLER_ARGS error = SYSCTL_OUT(req, &xug, sizeof xug); } FREE(unp_list, M_TEMP); + lck_mtx_unlock(unp_mutex); return error; } @@ -869,8 +984,7 @@ SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD, "List of active local stream sockets"); static void -unp_shutdown(unp) - struct unpcb *unp; +unp_shutdown(struct unpcb *unp) { struct socket *so; @@ -880,24 +994,14 @@ unp_shutdown(unp) } static void -unp_drop(unp, errno) - struct unpcb *unp; - int errno; +unp_drop( + struct unpcb *unp, + int errno) { struct socket *so = unp->unp_socket; so->so_error = errno; unp_disconnect(unp); - if (so->so_head) { - LIST_REMOVE(unp, unp_link); - unp->unp_gencnt = ++unp_gencnt; - unp_count--; - so->so_pcb = (caddr_t) 0; - if (unp->unp_addr) - FREE(unp->unp_addr, M_SONAME); - zfree(unp_zone, (vm_offset_t)unp); - sofree(so); - } } #ifdef notdef @@ -909,31 +1013,30 @@ unp_drain() #endif int -unp_externalize(rights) - struct mbuf *rights; +unp_externalize(struct mbuf *rights) { struct proc *p = current_proc(); /* XXX */ - register int i; - register struct cmsghdr *cm = mtod(rights, struct cmsghdr *); - register struct file **rp = (struct file **)(cm + 1); - register struct file *fp; + int i; + struct cmsghdr *cm = mtod(rights, struct cmsghdr *); + struct fileglob **rp = (struct fileglob **)(cm + 1); + struct fileproc *fp; + struct fileglob *fg; int newfds = (cm->cmsg_len - sizeof(*cm)) / sizeof (int); int f; - - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + proc_fdlock(p); /* * if the new FD's will not fit, then we free them all */ if (!fdavail(p, newfds)) { for (i = 0; i < newfds; i++) { - fp = *rp; - unp_discard(fp); + fg = *rp; + unp_discard_fdlocked(fg, p); *rp++ = 0; } + proc_fdunlock(p); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); return (EMSGSIZE); } /* @@ -945,15 +1048,19 @@ unp_externalize(rights) for (i = 0; i < newfds; i++) { if (fdalloc(p, 0, &f)) panic("unp_externalize"); - fp = *rp; + fg = *rp; + MALLOC_ZONE(fp, struct fileproc *, sizeof(struct fileproc), M_FILEPROC, M_WAITOK); + bzero(fp, sizeof(struct fileproc)); + fp->f_iocount = 0; + fp->f_fglob = fg; p->p_fd->fd_ofiles[f] = fp; + fg_removeuipc(fg); *fdflags(p, f) &= ~UF_RESERVED; - fp->f_msgcount--; unp_rights--; *(int *)rp++ = f; } + proc_fdunlock(p); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); return (0); } @@ -967,6 +1074,8 @@ unp_init(void) panic("unp_init"); LIST_INIT(&unp_dhead); LIST_INIT(&unp_shead); + + unp_mutex = localdomain.dom_mtx; } #ifndef MIN @@ -974,42 +1083,42 @@ unp_init(void) #endif static int -unp_internalize(control, p) - struct mbuf *control; - struct proc *p; +unp_internalize( + struct mbuf *control, + struct proc *p) { - register struct cmsghdr *cm = mtod(control, struct cmsghdr *); - register struct file **rp; - struct file *fp; + struct cmsghdr *cm = mtod(control, struct cmsghdr *); + struct fileglob **rp; + struct fileproc *fp; register int i, error; int oldfds; + int fdgetf_noref(proc_t, struct fileglob **, struct fileproc **); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET || cm->cmsg_len != control->m_len) { - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); 
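unp_internalize() and unp_externalize() are the two kernel halves of descriptor passing; the user-space side is ordinary SCM_RIGHTS ancillary data over an AF_LOCAL socket. A sketch of the sending half (the receiver mirrors it with recvmsg(2) and CMSG_DATA):

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <sys/uio.h>
    #include <string.h>

    /* Pass file descriptor 'fd' across connected AF_LOCAL socket 's'. */
    int
    send_fd(int s, int fd)
    {
            struct msghdr msg;
            struct iovec iov;
            struct cmsghdr *cm;
            char dummy = 0;
            union {
                    struct cmsghdr hdr;
                    char buf[CMSG_SPACE(sizeof(int))];
            } cmsgbuf;

            memset(&msg, 0, sizeof(msg));
            iov.iov_base = &dummy;                  /* send at least one byte */
            iov.iov_len = 1;
            msg.msg_iov = &iov;
            msg.msg_iovlen = 1;
            msg.msg_control = cmsgbuf.buf;
            msg.msg_controllen = sizeof(cmsgbuf.buf);

            cm = CMSG_FIRSTHDR(&msg);
            cm->cmsg_level = SOL_SOCKET;
            cm->cmsg_type = SCM_RIGHTS;             /* payload is an fd array */
            cm->cmsg_len = CMSG_LEN(sizeof(int));
            memcpy(CMSG_DATA(cm), &fd, sizeof(int));

            return (sendmsg(s, &msg, 0) == -1 ? -1 : 0);
    }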
return (EINVAL); } - oldfds = (cm->cmsg_len - sizeof (*cm)) / sizeof (int); - rp = (struct file **)(cm + 1); - for (i = 0; i < oldfds; i++) - if (error = fdgetf(p, *(int *)rp++, 0)) { - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - return (error); + proc_fdlock(p); + rp = (struct fileglob **)(cm + 1); + + for (i = 0; i < oldfds; i++) { + if (error = fdgetf_noref(p, *(int *)rp++, (struct fileglob **)0)) { + proc_fdunlock(p); + return (error); } + } + rp = (struct fileglob **)(cm + 1); - rp = (struct file **)(cm + 1); for (i = 0; i < oldfds; i++) { - (void) fdgetf(p, *(int *)rp, &fp); - *rp++ = fp; - fref(fp); - fp->f_msgcount++; + (void) fdgetf_noref(p, *(int *)rp, &fp); + fg_insertuipc(fp->f_fglob); + *rp++ = fp->f_fglob; unp_rights++; } + proc_fdunlock(p); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); return (0); } @@ -1018,68 +1127,85 @@ static int unp_defer, unp_gcing; static void unp_gc() { - register struct file *fp, *nextfp; + register struct fileglob *fg, *nextfg; register struct socket *so; - struct file **extra_ref, **fpp; + struct fileglob **extra_ref, **fpp; int nunref, i; - if (unp_gcing) + lck_mtx_lock(uipc_lock); + if (unp_gcing) { + lck_mtx_unlock(uipc_lock); return; + } unp_gcing = 1; unp_defer = 0; + lck_mtx_unlock(uipc_lock); /* * before going through all this, set all FDs to * be NOT defered and NOT externally accessible */ - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) - fp->f_flag &= ~(FMARK|FDEFER); + for (fg = fmsghead.lh_first; fg != 0; fg = fg->f_msglist.le_next) { + lck_mtx_lock(&fg->fg_lock); + fg->fg_flag &= ~(FMARK|FDEFER); + lck_mtx_unlock(&fg->fg_lock); + } do { - for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) { + for (fg = fmsghead.lh_first; fg != 0; fg = fg->f_msglist.le_next) { + lck_mtx_lock(&fg->fg_lock); /* * If the file is not open, skip it */ - if (fcount(fp) == 0) + if (fg->fg_count == 0) { + lck_mtx_unlock(&fg->fg_lock); continue; + } /* * If we already marked it as 'defer' in a * previous pass, then try process it this time * and un-mark it */ - if (fp->f_flag & FDEFER) { - fp->f_flag &= ~FDEFER; + if (fg->fg_flag & FDEFER) { + fg->fg_flag &= ~FDEFER; unp_defer--; } else { /* * if it's not defered, then check if it's * already marked.. if so skip it */ - if (fp->f_flag & FMARK) + if (fg->fg_flag & FMARK){ + lck_mtx_unlock(&fg->fg_lock); continue; + } /* * If all references are from messages * in transit, then skip it. it's not * externally accessible. */ - if (fcount(fp) == fp->f_msgcount) + if (fg->fg_count == fg->fg_msgcount) { + lck_mtx_unlock(&fg->fg_lock); continue; + } /* * If it got this far then it must be * externally accessible. */ - fp->f_flag |= FMARK; + fg->fg_flag |= FMARK; } /* * either it was defered, or it is externally * accessible and not already marked so. * Now check if it is possibly one of OUR sockets. */ - if (fp->f_type != DTYPE_SOCKET || - (so = (struct socket *)fp->f_data) == 0) + if (fg->fg_type != DTYPE_SOCKET || + (so = (struct socket *)fg->fg_data) == 0) { + lck_mtx_unlock(&fg->fg_lock); continue; + } if (so->so_proto->pr_domain != &localdomain || - (so->so_proto->pr_flags&PR_RIGHTS) == 0) + (so->so_proto->pr_flags&PR_RIGHTS) == 0) { + lck_mtx_unlock(&fg->fg_lock); continue; + } #ifdef notdef /* if this code is enabled need to run under network funnel */ if (so->so_rcv.sb_flags & SB_LOCK) { @@ -1105,6 +1231,7 @@ unp_gc() * as accessible too. 
*/ unp_scan(so->so_rcv.sb_mb, unp_mark); + lck_mtx_unlock(&fg->fg_lock); } } while (unp_defer); /* @@ -1146,83 +1273,94 @@ unp_gc() * * 91/09/19, bsy@cs.cmu.edu */ - extra_ref = _MALLOC(nfiles * sizeof(struct file *), M_FILE, M_WAITOK); - for (nunref = 0, fp = filehead.lh_first, fpp = extra_ref; fp != 0; - fp = nextfp) { - nextfp = fp->f_list.le_next; + extra_ref = _MALLOC(nfiles * sizeof(struct fileglob *), M_FILEGLOB, M_WAITOK); + for (nunref = 0, fg = fmsghead.lh_first, fpp = extra_ref; fg != 0; + fg = nextfg) { + lck_mtx_lock(&fg->fg_lock); + + nextfg = fg->f_msglist.le_next; /* * If it's not open, skip it */ - if (fcount(fp) == 0) + if (fg->fg_count == 0) { + lck_mtx_unlock(&fg->fg_lock); continue; + } /* * If all refs are from msgs, and it's not marked accessible * then it must be referenced from some unreachable cycle * of (shut-down) FDs, so include it in our * list of FDs to remove */ - if (fcount(fp) == fp->f_msgcount && !(fp->f_flag & FMARK)) { - *fpp++ = fp; + if (fg->fg_count == fg->fg_msgcount && !(fg->fg_flag & FMARK)) { + fg->fg_count++; + *fpp++ = fg; nunref++; - fref(fp); } + lck_mtx_unlock(&fg->fg_lock); } /* * for each FD on our hit list, do the following two things */ for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { - struct file *tfp = *fpp; - if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL) { - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - sorflush((struct socket *)(tfp->f_data)); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - } - } + struct fileglob *tfg; + tfg = *fpp; + if (tfg->fg_type == DTYPE_SOCKET && tfg->fg_data != NULL) { + sorflush((struct socket *)(tfg->fg_data)); + } + } for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) - closef(*fpp, (struct proc *) NULL); - FREE((caddr_t)extra_ref, M_FILE); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - + closef_locked((struct fileproc *)0, *fpp, (struct proc *) NULL); unp_gcing = 0; + FREE((caddr_t)extra_ref, M_FILEGLOB); + } void -unp_dispose(m) - struct mbuf *m; +unp_dispose(struct mbuf *m) { if (m) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); unp_scan(m, unp_discard); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); } } -/* should run under kernel funnel */ +static int +unp_listen( + struct unpcb *unp, + struct proc *p) +{ + + cru2x(p->p_ucred, &unp->unp_peercred); + unp->unp_flags |= UNP_HAVEPCCACHED; + return (0); +} + +/* should run under kernel funnel */ static void -unp_scan(m0, op) - register struct mbuf *m0; - void (*op) __P((struct file *)); +unp_scan( + struct mbuf *m0, + void (*op)(struct fileglob *)) { - register struct mbuf *m; - register struct file **rp; - register struct cmsghdr *cm; - register int i; + struct mbuf *m; + struct fileglob **rp; + struct cmsghdr *cm; + int i; int qfds; while (m0) { for (m = m0; m; m = m->m_next) if (m->m_type == MT_CONTROL && - m->m_len >= sizeof(*cm)) { + (size_t) m->m_len >= sizeof(*cm)) { cm = mtod(m, struct cmsghdr *); if (cm->cmsg_level != SOL_SOCKET || cm->cmsg_type != SCM_RIGHTS) continue; qfds = (cm->cmsg_len - sizeof *cm) - / sizeof (struct file *); - rp = (struct file **)(cm + 1); + / sizeof (struct fileglob *); + rp = (struct fileglob **)(cm + 1); for (i = 0; i < qfds; i++) (*op)(*rp++); break; /* XXX, but saves time */ @@ -1233,23 +1371,40 @@ unp_scan(m0, op) /* should run under kernel funnel */ static void -unp_mark(fp) - struct file *fp; +unp_mark(struct fileglob *fg) { + lck_mtx_lock(&fg->fg_lock); - if (fp->f_flag & FMARK) + if (fg->fg_flag & FMARK) { + lck_mtx_unlock(&fg->fg_lock); return; + } + 
fg->fg_flag |= (FMARK|FDEFER); + + lck_mtx_unlock(&fg->fg_lock); + unp_defer++; - fp->f_flag |= (FMARK|FDEFER); } /* should run under kernel funnel */ static void -unp_discard(fp) - struct file *fp; +unp_discard(fg) + struct fileglob *fg; { + struct proc *p = current_proc(); /* XXX */ + + proc_fdlock(p); + unp_discard_fdlocked(fg, p); + proc_fdunlock(p); +} +static void +unp_discard_fdlocked(fg, p) + struct fileglob *fg; + struct proc *p; +{ + + fg_removeuipc(fg); - fp->f_msgcount--; unp_rights--; - (void) closef(fp, (struct proc *)NULL); + (void) closef_locked((struct fileproc *)0, fg, p); } diff --git a/bsd/libkern/Makefile b/bsd/libkern/Makefile index 32aaccb2e..825806a41 100644 --- a/bsd/libkern/Makefile +++ b/bsd/libkern/Makefile @@ -11,9 +11,6 @@ include $(MakeInc_def) DATAFILES = \ libkern.h -INSTALL_MI_LIST = ${DATAFILES} - -INSTALL_MI_DIR = libkern EXPORT_MI_LIST = ${DATAFILES} diff --git a/bsd/libkern/crc32.c b/bsd/libkern/crc32.c new file mode 100644 index 000000000..d8f5e345d --- /dev/null +++ b/bsd/libkern/crc32.c @@ -0,0 +1,104 @@ +/*- + * COPYRIGHT (C) 1986 Gary S. Brown. You may use this program, or + * code or tables extracted from it, as desired without restriction. + * + * First, the polynomial itself and its table of feedback terms. The + * polynomial is + * X^32+X^26+X^23+X^22+X^16+X^12+X^11+X^10+X^8+X^7+X^5+X^4+X^2+X^1+X^0 + * + * Note that we take it "backwards" and put the highest-order term in + * the lowest-order bit. The X^32 term is "implied"; the LSB is the + * X^31 term, etc. The X^0 term (usually shown as "+1") results in + * the MSB being 1 + * + * Note that the usual hardware shift register implementation, which + * is what we're using (we're merely optimizing it by doing eight-bit + * chunks at a time) shifts bits into the lowest-order term. In our + * implementation, that means shifting towards the right. Why do we + * do it this way? Because the calculated CRC must be transmitted in + * order from highest-order term to lowest-order term. UARTs transmit + * characters in order from LSB to MSB. By storing the CRC this way + * we hand it to the UART in the order low-byte to high-byte; the UART + * sends each low-bit to hight-bit; and the result is transmission bit + * by bit from highest- to lowest-order term without requiring any bit + * shuffling on our part. Reception works similarly + * + * The feedback terms table consists of 256, 32-bit entries. Notes + * + * The table can be generated at runtime if desired; code to do so + * is shown later. It might not be obvious, but the feedback + * terms simply represent the results of eight shift/xor opera + * tions for all combinations of data and CRC register values + * + * The values must be right-shifted by eight bits by the "updcrc + * logic; the shift must be unsigned (bring in zeroes). On some + * hardware you could probably optimize the shift in assembler by + * using byte-swap instructions + * polynomial $edb88320 + * + * + * CRC32 code derived from work by Gary S. Brown. 
+ */ + +#include <sys/param.h> +#include <sys/systm.h> + +static uint32_t crc32_tab[] = { + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, + 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, + 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, + 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, + 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, + 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, + 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, + 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, + 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, + 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, + 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, + 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, + 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, + 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, + 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, + 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, + 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, + 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, + 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, + 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, + 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, + 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, + 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, + 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, + 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, + 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, + 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, + 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, + 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, + 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, + 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d +}; + +uint32_t +crc32(uint32_t crc, const void *buf, size_t size) +{ + const uint8_t *p; + + p = buf; + crc = crc ^ ~0U; + + while (size--) + crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8); + + return crc ^ ~0U; +} diff --git a/bsd/libkern/inet_ntoa.c b/bsd/libkern/inet_ntoa.c deleted file mode 100644 
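The header comment above notes that the feedback table can be generated at runtime, but the promised generator is not included in this file. For reference, the standard construction for the reflected polynomial 0xedb88320, which reproduces crc32_tab entry for entry (a sketch, not part of the patch):

    #include <stdint.h>

    /* Build the byte-wise CRC-32 feedback table: eight
     * shift/xor steps per input byte value. */
    static void
    crc32_make_table(uint32_t tab[256])
    {
            uint32_t c;
            int n, k;

            for (n = 0; n < 256; n++) {
                    c = (uint32_t)n;
                    for (k = 0; k < 8; k++)
                            c = (c & 1) ? (0xedb88320U ^ (c >> 1)) : (c >> 1);
                    tab[n] = c;
            }
    }

Callers of the new crc32() routine pass 0 as the initial crc; the pre- and post-conditioning with ~0U happens inside the function itself.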
index 0925e8a3c..000000000 --- a/bsd/libkern/inet_ntoa.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ - -/* - * Copyright 1994, 1995 Massachusetts Institute of Technology - * - * Permission to use, copy, modify, and distribute this software and - * its documentation for any purpose and without fee is hereby - * granted, provided that both the above copyright notice and this - * permission notice appear in all copies, that both the above - * copyright notice and this permission notice appear in all - * supporting documentation, and that the name of M.I.T. not be used - * in advertising or publicity pertaining to distribution of the - * software without specific, written prior permission. M.I.T. makes - * no representations about the suitability of this software for any - * purpose. It is provided "as is" without express or implied - * warranty. - * - * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS - * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, - * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT - * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <sys/param.h> -#include <sys/systm.h> - -#include <netinet/in.h> - -char * -inet_ntoa(struct in_addr ina) -{ - static char buf[4*sizeof "123"]; - unsigned char *ucp = (unsigned char *)&ina; - - sprintf(buf, "%d.%d.%d.%d", - ucp[0] & 0xff, - ucp[1] & 0xff, - ucp[2] & 0xff, - ucp[3] & 0xff); - return buf; -} - diff --git a/bsd/libkern/inet_ntop.c b/bsd/libkern/inet_ntop.c new file mode 100644 index 000000000..03d64504a --- /dev/null +++ b/bsd/libkern/inet_ntop.c @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. 
+ * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +/* + * Copyright 1994, 1995 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/systm.h> + +#include <netinet/in.h> + +static const char *hexchars = "0123456789abcdef"; + +static const char * +inet_ntop4(const struct in_addr *addr, char *buf, size_t len) +{ + const u_int8_t *ap = (const u_int8_t *)&addr->s_addr; + char tmp[MAX_IPv4_STR_LEN]; /* max length of ipv4 addr string */ + int fulllen; + + /* + * snprintf returns number of bytes printed (not including NULL) or + * number of bytes that would have been printed if more than would + * fit + */ + fulllen = snprintf(tmp, sizeof(tmp), "%d.%d.%d.%d", + ap[0], ap[1], ap[2], ap[3]); + if (fulllen >= (int)len) { + return NULL; + } + + bcopy(tmp, buf, fulllen + 1); + + return buf; +} + +static const char * +inet_ntop6(const struct in6_addr *addr, char *dst, size_t size) +{ + char hexa[8][5], tmp[MAX_IPv6_STR_LEN]; + int zr[8]; + size_t len; + int32_t i, j, k, skip; + uint8_t x8, hx8; + uint16_t x16; + struct in_addr a4; + + if (addr == NULL) return NULL; + + bzero(tmp, sizeof(tmp)); + + /* check for mapped or compat addresses */ + i = IN6_IS_ADDR_V4MAPPED(addr); + j = IN6_IS_ADDR_V4COMPAT(addr); + if ((i != 0) || (j != 0)) + { + char tmp2[16]; /* max length of ipv4 addr string */ + a4.s_addr = addr->__u6_addr.__u6_addr32[3]; + len = snprintf(tmp, sizeof(tmp), "::%s%s", (i != 0) ? 
"ffff:" : "", + inet_ntop4(&a4, tmp2, sizeof(tmp2))); + if (len >= size) return NULL; + bcopy(tmp, dst, len + 1); + return dst; + } + + k = 0; + for (i = 0; i < 16; i += 2) + { + j = 0; + skip = 1; + + bzero(hexa[k], 5); + + x8 = addr->__u6_addr.__u6_addr8[i]; + + hx8 = x8 >> 4; + if (hx8 != 0) + { + skip = 0; + hexa[k][j++] = hexchars[hx8]; + } + + hx8 = x8 & 0x0f; + if ((skip == 0) || ((skip == 1) && (hx8 != 0))) + { + skip = 0; + hexa[k][j++] = hexchars[hx8]; + } + + x8 = addr->__u6_addr.__u6_addr8[i + 1]; + + hx8 = x8 >> 4; + if ((skip == 0) || ((skip == 1) && (hx8 != 0))) + { + hexa[k][j++] = hexchars[hx8]; + } + + hx8 = x8 & 0x0f; + hexa[k][j++] = hexchars[hx8]; + + k++; + } + + /* find runs of zeros for :: convention */ + j = 0; + for (i = 7; i >= 0; i--) + { + zr[i] = j; + x16 = addr->__u6_addr.__u6_addr16[i]; + if (x16 == 0) j++; + else j = 0; + zr[i] = j; + } + + /* find longest run of zeros */ + k = -1; + j = 0; + for(i = 0; i < 8; i++) + { + if (zr[i] > j) + { + k = i; + j = zr[i]; + } + } + + for(i = 0; i < 8; i++) + { + if (i != k) zr[i] = 0; + } + + len = 0; + for (i = 0; i < 8; i++) + { + if (zr[i] != 0) + { + /* check for leading zero */ + if (i == 0) tmp[len++] = ':'; + tmp[len++] = ':'; + i += (zr[i] - 1); + continue; + } + for (j = 0; hexa[i][j] != '\0'; j++) tmp[len++] = hexa[i][j]; + if (i != 7) tmp[len++] = ':'; + } + + /* trailing NULL */ + len++; + + if (len > size) return NULL; + bcopy(tmp, dst, len); + return dst; +} + +const char * +inet_ntop(int af, const void *addr, char *buf, size_t len) +{ + if(af==AF_INET6) + return inet_ntop6(addr, buf, len); + if(af==AF_INET) + return inet_ntop4(addr, buf, len); + return NULL; +} diff --git a/bsd/libkern/libkern.h b/bsd/libkern/libkern.h index 16c005525..6eee3e08c 100644 --- a/bsd/libkern/libkern.h +++ b/bsd/libkern/libkern.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -58,8 +58,11 @@ #define _LIBKERN_LIBKERN_H_ #include <sys/appleapiopts.h> +#include <stdint.h> +#include <string.h> #include <sys/cdefs.h> #include <sys/types.h> +#include <mach/vm_param.h> #ifdef __APPLE_API_OBSOLETE /* BCD conversions. */ @@ -119,18 +122,39 @@ ulmin(u_long a, u_long b) } /* Prototypes for non-quad routines. 
*/ -int bcmp __P((const void *, const void *, size_t)); -int ffs __P((int)); -int locc __P((int, char *, u_int)); -u_long random __P((void)); -char *rindex __P((const char *, int)); -int scanc __P((u_int, u_char *, u_char *, int)); -int skpc __P((int, int, char *)); -char *strcat __P((char *, const char *)); -char *strcpy __P((char *, const char *)); -size_t strlen __P((const char *)); -char *strncpy __P((char *, const char *, size_t)); -long strtol __P((const char*, char **, int)); +extern int ffs(int); +extern int locc(int, char *, u_int); +extern u_long random(void); +extern char *rindex(const char *, int); +extern int scanc(u_int, u_char *, const u_char *, int); +extern int skpc(int, int, char *); +extern long strtol(const char*, char **, int); +extern u_long strtoul(const char *, char **, int); +extern quad_t strtoq(const char *, char **, int); +extern u_quad_t strtouq(const char *, char **, int); + +int snprintf(char *, size_t, const char *, ...); +int sprintf(char *bufp, const char *, ...); +int sscanf(const char *, char const *, ...); +void printf(const char *, ...); + +uint32_t crc32(uint32_t crc, const void *bufp, size_t len); + +int copystr(const void *kfaddr, void *kdaddr, size_t len, size_t *done); +int copyinstr(const user_addr_t uaddr, void *kaddr, size_t len, size_t *done); +int copyoutstr(const void *kaddr, user_addr_t udaddr, size_t len, size_t *done); +int copyin(const user_addr_t uaddr, void *kaddr, size_t len); +int copyout(const void *kaddr, user_addr_t udaddr, size_t len); + +int vsscanf(const char *, char const *, __darwin_va_list); +extern int vsnprintf(char *, size_t, const char *, __darwin_va_list); +extern int vsprintf(char *bufp, const char *, __darwin_va_list); + +extern void invalidate_icache(vm_offset_t, unsigned, int); +extern void flush_dcache(vm_offset_t, unsigned, int); +extern void invalidate_icache64(addr64_t, unsigned, int); +extern void flush_dcache64(addr64_t, unsigned, int); + __END_DECLS #endif /* _LIBKERN_LIBKERN_H_ */ diff --git a/bsd/libkern/scanc.c b/bsd/libkern/scanc.c index 5be4e6f2f..8eb93c384 100644 --- a/bsd/libkern/scanc.c +++ b/bsd/libkern/scanc.c @@ -57,10 +57,7 @@ #include <libkern/libkern.h> int -scanc(size, cp, table, mask0) - u_int size; - register u_char *cp, table[]; - int mask0; +scanc(u_int size, u_char *cp, const u_char table[], int mask0) { register u_char *end; register u_char mask; diff --git a/bsd/machine/Makefile b/bsd/machine/Makefile index 86558ef33..0304d6d01 100644 --- a/bsd/machine/Makefile +++ b/bsd/machine/Makefile @@ -9,17 +9,23 @@ include $(MakeInc_def) DATAFILES = \ - ansi.h byte_order.h cons.h cpu.h disklabel.h endian.h exec.h \ - label_t.h param.h proc.h profile.h psl.h ptrace.h reboot.h \ - reg.h setjmp.h signal.h spl.h table.h trap.h types.h unix_traps.h \ - ucontext.h user.h vmparam.h + byte_order.h endian.h \ + param.h profile.h \ + setjmp.h signal.h types.h\ + ucontext.h vmparam.h _types.h _limits.h +KERNELFILES = \ + byte_order.h endian.h \ + param.h profile.h \ + signal.h spl.h types.h \ + vmparam.h _types.h _limits.h INSTALL_MI_LIST = ${DATAFILES} +INSTALL_MI_LCL_LIST = ${DATAFILES} disklabel.h INSTALL_MI_DIR = machine -EXPORT_MI_LIST = ${DATAFILES} +EXPORT_MI_LIST = ${KERNELFILES} EXPORT_MI_DIR = machine diff --git a/bsd/machine/unix_traps.h b/bsd/machine/_limits.h similarity index 78% rename from bsd/machine/unix_traps.h rename to bsd/machine/_limits.h index fc94186c8..a5be9109a 100644 --- a/bsd/machine/unix_traps.h +++ b/bsd/machine/_limits.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, 
Inc. All rights reserved. + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -19,17 +19,15 @@ * * @APPLE_LICENSE_HEADER_END@ */ -#ifndef _BSD_MACHINE_UNIX_TRAPS_H_ -#define _BSD_MACHINE_UNIX_TRAPS_H_ +#ifndef _BSD_MACHINE__LIMITS_H_ +#define _BSD_MACHINE__LIMITS_H_ - -#if defined (__ppc__) -#include "ppc/unix_traps.h" +#if defined (__ppc__) || defined (__ppc64__) +#include "ppc/_limits.h" #elif defined (__i386__) -#include "i386/unix_traps.h" +#include "i386/_limits.h" #else #error architecture not supported #endif - -#endif /* _BSD_MACHINE_UNIX_TRAPS_H_ */ +#endif /* _BSD_MACHINE__LIMITS_H_ */ diff --git a/bsd/machine/table.h b/bsd/machine/_types.h similarity index 78% rename from bsd/machine/table.h rename to bsd/machine/_types.h index e71d1101c..8e6333b6f 100644 --- a/bsd/machine/table.h +++ b/bsd/machine/_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -19,17 +19,15 @@ * * @APPLE_LICENSE_HEADER_END@ */ -#ifndef _BSD_MACHINE_TABLE_H_ -#define _BSD_MACHINE_TABLE_H_ +#ifndef _BSD_MACHINE__TYPES_H_ +#define _BSD_MACHINE__TYPES_H_ - -#if defined (__ppc__) -#include "ppc/table.h" +#if defined (__ppc__) || defined (__ppc64__) +#include "ppc/_types.h" #elif defined (__i386__) -#include "i386/table.h" +#include "i386/_types.h" #else #error architecture not supported #endif - -#endif /* _BSD_MACHINE_TABLE_H_ */ +#endif /* _BSD_MACHINE__TYPES_H_ */ diff --git a/bsd/machine/cons.h b/bsd/machine/cons.h index c68a4af46..6d4b3d7cc 100644 --- a/bsd/machine/cons.h +++ b/bsd/machine/cons.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -23,7 +23,7 @@ #define _BSD_MACHINE_CONS_H_ -#if defined (__ppc__) +#if defined (__ppc__) || defined (__ppc64__) #include <dev/ppc/cons.h> #elif defined (__i386__) #include <dev/i386/cons.h> diff --git a/bsd/machine/disklabel.h b/bsd/machine/disklabel.h index fed73e500..8d1402213 100644 --- a/bsd/machine/disklabel.h +++ b/bsd/machine/disklabel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -23,7 +23,7 @@ #define _BSD_MACHINE_CPU_H_ -#if defined (__ppc__) +#if defined (__ppc__) || defined (__ppc64__) #include "ppc/disklabel.h" #elif defined (__i386__) #include "i386/disklabel.h" diff --git a/bsd/machine/endian.h b/bsd/machine/endian.h index 4aa1ad8f4..a6f870e5b 100644 --- a/bsd/machine/endian.h +++ b/bsd/machine/endian.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -26,7 +26,7 @@ #define _BSD_MACHINE_ENDIAN_H_ -#if defined (__ppc__) +#if defined (__ppc__) || defined(__ppc64__) #include "ppc/endian.h" #elif defined (__i386__) #include "i386/endian.h" diff --git a/bsd/machine/exec.h b/bsd/machine/exec.h index 979093289..cb3306c73 100644 --- a/bsd/machine/exec.h +++ b/bsd/machine/exec.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -26,7 +26,7 @@ #define _BSD_MACHINE_EXEC_H_ -#if defined (__ppc__) +#if defined (__ppc__) || defined (__ppc64__) #include "ppc/exec.h" #elif defined (__i386__) #include "i386/exec.h" diff --git a/bsd/machine/param.h b/bsd/machine/param.h index 4ee6af0fd..ab305ba73 100644 --- a/bsd/machine/param.h +++ b/bsd/machine/param.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -26,7 +26,7 @@ #define _BSD_MACHINE_PARAM_H_ -#if defined (__ppc__) +#if defined (__ppc__) || defined (__ppc64__) #include "ppc/param.h" #elif defined (__i386__) #include "i386/param.h" diff --git a/bsd/machine/profile.h b/bsd/machine/profile.h index 94c316745..847570beb 100644 --- a/bsd/machine/profile.h +++ b/bsd/machine/profile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -30,7 +30,7 @@ #define _BSD_MACHINE_PROFILE_H_ -#if defined (__ppc__) +#if defined (__ppc__) || defined (__ppc64__) #include "ppc/profile.h" #elif defined (__i386__) #include "i386/profile.h" diff --git a/bsd/machine/psl.h b/bsd/machine/psl.h index e9763e864..06c76e528 100644 --- a/bsd/machine/psl.h +++ b/bsd/machine/psl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -23,7 +23,7 @@ #define _BSD_MACHINE_PSL_H_ -#if defined (__ppc__) +#if defined (__ppc__) || defined (__ppc64__) #include "ppc/psl.h" #elif defined (__i386__) #include "i386/psl.h" diff --git a/bsd/machine/ptrace.h b/bsd/machine/ptrace.h index f2abe2b6a..8d14243a6 100644 --- a/bsd/machine/ptrace.h +++ b/bsd/machine/ptrace.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -26,7 +26,7 @@ #define _BSD_MACHINE_PTRACE_H_ -#if defined (__ppc__) +#if defined (__ppc__) || defined(__ppc64__) #include "ppc/ptrace.h" #elif defined (__i386__) #include "i386/ptrace.h" diff --git a/bsd/machine/reboot.h b/bsd/machine/reboot.h index 368fa0aad..7d0af116c 100644 --- a/bsd/machine/reboot.h +++ b/bsd/machine/reboot.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -23,7 +23,7 @@ #define _BSD_MACHINE_REBOOT_H_ -#if defined (__ppc__) +#if defined (__ppc__) || defined (__ppc64__) #include "ppc/reboot.h" #elif defined (__i386__) #include "i386/reboot.h" diff --git a/bsd/machine/reg.h b/bsd/machine/reg.h index add5145e2..7e18c5b53 100644 --- a/bsd/machine/reg.h +++ b/bsd/machine/reg.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -23,7 +23,7 @@ #define _BSD_MACHINE_REG_H_ -#if defined (__ppc__) +#if defined (__ppc__) || defined (__ppc64__) #include "ppc/reg.h" #elif defined (__i386__) #include "i386/reg.h" diff --git a/bsd/machine/setjmp.h b/bsd/machine/setjmp.h index c39a8ea77..c4bbf5dec 100644 --- a/bsd/machine/setjmp.h +++ b/bsd/machine/setjmp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. 
All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -20,17 +20,12 @@ * @APPLE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * The NEXTSTEP Software License Agreement specifies the terms - * and conditions for redistribution. - * - */ #ifndef _MACHINE_SETJMP_H_ #define _MACHINE_SETJMP_H_ -#if defined (__ppc__) +#if defined (__ppc__) || defined (__ppc64__) #include "ppc/setjmp.h" #elif defined (__i386__) #include "i386/setjmp.h" diff --git a/bsd/machine/signal.h b/bsd/machine/signal.h index b7c7300f1..6c926665e 100644 --- a/bsd/machine/signal.h +++ b/bsd/machine/signal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -23,7 +23,7 @@ #define _BSD_MACHINE_SIGNAL_H_ -#if defined (__ppc__) +#if defined (__ppc__) || defined (__ppc64__) #include "ppc/signal.h" #elif defined (__i386__) #include "i386/signal.h" diff --git a/bsd/machine/spl.h b/bsd/machine/spl.h index 89d75fad5..36ab465e9 100644 --- a/bsd/machine/spl.h +++ b/bsd/machine/spl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -22,14 +22,35 @@ #ifndef _BSD_MACHINE_SPL_H_ #define _BSD_MACHINE_SPL_H_ +#ifdef KERNEL +#ifndef __ASSEMBLER__ +/* + * Machine-dependent SPL definitions. + * + */ +typedef unsigned spl_t; + +extern unsigned int sploff(void); +extern unsigned int splhigh(void); +extern unsigned int splsched(void); +extern unsigned int splclock(void); +extern unsigned int splpower(void); +extern unsigned int splvm(void); +extern unsigned int splbio(void); +extern unsigned int splimp(void); +extern unsigned int spltty(void); +extern unsigned int splnet(void); +extern unsigned int splsoftclock(void); + +extern void spllo(void); +extern void splon(unsigned int level); +extern void splx(unsigned int level); +extern void spln(unsigned int level); +#define splstatclock() splhigh() + +#endif /* __ASSEMBLER__ */ -#if defined (__ppc__) -#include "ppc/spl.h" -#elif defined (__i386__) -#include "i386/spl.h" -#else -#error architecture not supported -#endif +#endif /* KERNEL */ #endif /* _BSD_MACHINE_SPL_H_ */ diff --git a/bsd/machine/types.h b/bsd/machine/types.h index f5ade7c2f..12053c52f 100644 --- a/bsd/machine/types.h +++ b/bsd/machine/types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -26,7 +26,7 @@ #define _BSD_MACHINE_TYPES_H_ -#if defined (__ppc__) +#if defined (__ppc__) || defined (__ppc64__) #include "ppc/types.h" #elif defined (__i386__) #include "i386/types.h" diff --git a/bsd/machine/ucontext.h b/bsd/machine/ucontext.h index 56cf8137d..fa9635508 100644 --- a/bsd/machine/ucontext.h +++ b/bsd/machine/ucontext.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved. 
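The spl.h rewrite above replaces the per-architecture includes with a single machine-independent set of interrupt priority level prototypes. A minimal kernel-side sketch of the conventional save/raise/restore bracket these declarations support, assuming a KERNEL build where the header is visible; the shared counter, header path, and function name are illustrative, not part of this patch:

    #include <machine/spl.h>

    static int pending_events;      /* illustrative state also touched at interrupt level */

    void
    example_enqueue_event(void)
    {
            spl_t s;

            s = splhigh();          /* raise the IPL, saving the previous level */
            pending_events++;       /* critical section: interrupt code cannot intervene */
            splx(s);                /* restore the saved interrupt priority level */
    }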
* * @APPLE_LICENSE_HEADER_START@ * @@ -22,7 +22,7 @@ #ifndef _MACHINE_UCONTEXT_H_ #define _MACHINE_UCONTEXT_H_ -#if defined (__ppc__) +#if defined (__ppc__) || defined (__ppc64__) #include "ppc/ucontext.h" #elif defined (__i386__) #include "i386/ucontext.h" diff --git a/bsd/machine/user.h b/bsd/machine/user.h deleted file mode 100644 index 4aaf1bbf0..000000000 --- a/bsd/machine/user.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -#ifndef _BSD_MACHINE_USER_H_ -#define _BSD_MACHINE_USER_H_ - - -#if defined (__ppc__) -#include "ppc/user.h" -#elif defined (__i386__) -#include "i386/user.h" -#else -#error architecture not supported -#endif - - -#endif /* _BSD_MACHINE_USER_H_ */ diff --git a/bsd/machine/vmparam.h b/bsd/machine/vmparam.h index d1375d280..ab232e8bb 100644 --- a/bsd/machine/vmparam.h +++ b/bsd/machine/vmparam.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -23,7 +23,7 @@ #define _BSD_MACHINE_VMPARAM_H_ -#if defined (__ppc__) +#if defined (__ppc__) || defined (__ppc64__) #include "ppc/vmparam.h" #elif defined (__i386__) #include "i386/vmparam.h" diff --git a/bsd/man/man2/Makefile b/bsd/man/man2/Makefile index 986739b08..3473b0190 100644 --- a/bsd/man/man2/Makefile +++ b/bsd/man/man2/Makefile @@ -9,12 +9,17 @@ include $(MakeInc_def) DATAFILES = \ __syscall.2 \ _exit.2 \ + aio_cancel.2 \ + aio_error.2 \ + aio_read.2 \ + aio_return.2 \ + aio_suspend.2 \ + aio_write.2 \ accept.2 \ access.2 \ acct.2 \ adjtime.2 \ bind.2 \ - brk.2 \ chdir.2 \ chflags.2 \ chmod.2 \ @@ -25,6 +30,7 @@ DATAFILES = \ dup.2 \ dup2.2 \ execve.2 \ + exchangedata.2 \ fchdir.2 \ fchflags.2 \ fchmod.2 \ @@ -35,11 +41,12 @@ DATAFILES = \ fpathconf.2 \ fstat.2 \ fstatfs.2 \ - fsctl.2 \ fsync.2 \ ftruncate.2 \ futimes.2 \ + getattrlist.2 \ getdirentries.2 \ + getdirentriesattr.2 \ getegid.2 \ geteuid.2 \ getfh.2 \ @@ -61,6 +68,7 @@ DATAFILES = \ getsockopt.2 \ gettimeofday.2 \ getuid.2 \ + getxattr.2 \ intro.2 \ ioctl.2 \ issetugid.2 \ @@ -70,6 +78,7 @@ DATAFILES = \ lchown.2 \ link.2 \ listen.2 \ + listxattr.2 \ lseek.2 \ lstat.2 \ madvise.2 \ @@ -89,6 +98,7 @@ DATAFILES = \ open.2 \ pathconf.2 \ pipe.2 \ + poll.2 \ posix_madvise.2 \ pread.2 \ profil.2 \ @@ -103,9 +113,10 @@ DATAFILES = \ recvfrom.2 \ recvmsg.2 \ rename.2 \ + removexattr.2 \ revoke.2 \ rmdir.2 \ - sbrk.2 \ + searchfs.2 \ select.2 \ semctl.2 \ semget.2 \ @@ -113,6 +124,7 @@ DATAFILES = \ send.2 \ sendmsg.2 \ sendto.2 \ + setattrlist.2 \ setegid.2 \ seteuid.2 \ setgid.2 \ @@ -127,6 +139,7 @@ DATAFILES = \ setsockopt.2 \ settimeofday.2 \ setuid.2 \ + setxattr.2 \ shmat.2 \ shmctl.2 \ shmdt.2 \ @@ -159,7 +172,12 @@ DATAFILES = \ write.2 \ writev.2 \ +# List of source/target hard link pairs for installed manual pages; source +# names may be repeated +MLINKS= kqueue.2 kevent.2 + INSTALL_MAN_LIST = ${DATAFILES} +INSTALL_MAN_LINKS = ${MLINKS} INSTALL_MAN_DIR = man2 diff --git a/bsd/man/man2/accept.2 b/bsd/man/man2/accept.2 index f62dbc3ba..3b5ec370f 100644 --- a/bsd/man/man2/accept.2 +++ b/bsd/man/man2/accept.2 @@ -43,7 +43,7 @@ .Fd #include <sys/types.h> .Fd #include <sys/socket.h> .Ft int -.Fn accept "int s" "struct sockaddr *addr" "int *addrlen" +.Fn accept "int s" "struct sockaddr *addr" "socklen_t *addrlen" .Sh DESCRIPTION The argument .Fa s diff --git a/bsd/man/man2/aio_cancel.2 b/bsd/man/man2/aio_cancel.2 new file mode 100644 index 000000000..a5f1392c6 --- /dev/null +++ b/bsd/man/man2/aio_cancel.2 @@ -0,0 +1,117 @@ +.\" Copyright (c) 1999 Softweyr LLC. +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY Softweyr LLC AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
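The accept.2 hunk above tracks the kernel's switch to socklen_t for address-length parameters. A minimal sketch of the calling convention under the corrected prototype, with an arbitrary port and abbreviated error handling:

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>
    #include <string.h>
    #include <unistd.h>

    int
    main(void)
    {
            struct sockaddr_in local, peer;
            socklen_t addrlen = sizeof(peer);       /* socklen_t, not int */
            int s, fd;

            s = socket(AF_INET, SOCK_STREAM, 0);
            if (s < 0)
                    return 1;
            memset(&local, 0, sizeof(local));
            local.sin_family = AF_INET;
            local.sin_port = htons(12345);          /* arbitrary example port */
            if (bind(s, (struct sockaddr *)&local, sizeof(local)) < 0 ||
                listen(s, 5) < 0)
                    return 1;
            fd = accept(s, (struct sockaddr *)&peer, &addrlen);
            if (fd >= 0)
                    close(fd);                      /* addrlen now holds the real length */
            close(s);
            return 0;
    }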
IN NO EVENT SHALL Softweyr LLC OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD: src/lib/libc/sys/aio_cancel.2,v 1.22 2003/01/13 10:37:11 tjr Exp $ +.\" +.Dd January 19, 2000 +.Dt AIO_CANCEL 2 +.Os +.Sh NAME +.Nm aio_cancel +.Nd cancel an outstanding asynchronous I/O operation (REALTIME) +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In aio.h +.Ft int +.Fn aio_cancel "int fildes" "struct aiocb * iocb" +.Sh DESCRIPTION +The +.Fn aio_cancel +system call cancels the outstanding asynchronous +I/O request for the file descriptor specified in +.Fa fildes . +If +.Fa iocb +is specified, only that specific asynchronous I/O request is cancelled. +.Pp +Normal asynchronous notification occurs for cancelled requests. +Requests complete with an error result of +.Er ECANCELED . +.Sh RESTRICTIONS +The +.Fn aio_cancel +system call does not cancel asynchronous I/O requests for raw disk devices. +The +.Fn aio_cancel +system call will always return +.Dv AIO_NOTCANCELED +for file descriptors associated with raw disk devices. +.Sh RETURN VALUES +The +.Fn aio_cancel +system call returns -1 to indicate an error, or one of the following: +.Bl -tag -width Dv +.It Bq Dv AIO_CANCELED +All outstanding requests meeting the criteria specified were cancelled. +.It Bq Dv AIO_NOTCANCELED +Some requests were not cancelled; the status of these requests should be +checked with +.Xr aio_error 2 . +.It Bq Dv AIO_ALLDONE +All of the requests meeting the criteria have finished. +.El +.Sh ERRORS +An error return from +.Fn aio_cancel +indicates: +.Bl -tag -width Er +.It Bq Er EBADF +The +.Fa fildes +argument +is an invalid file descriptor. +.El +.Sh SEE ALSO +.Xr aio_error 2 , +.Xr aio_read 2 , +.Xr aio_return 2 , +.Xr aio_suspend 2 , +.Xr aio_write 2 , +.Xr aio 4 +.Sh STANDARDS +The +.Fn aio_cancel +system call is expected to conform to the +.St -p1003.1 +standard. +.Sh HISTORY +The +.Fn aio_cancel +system call first appeared in +.Fx 3.0 . +The first functional implementation of +.Fn aio_cancel +appeared in +.Fx 4.0 . +.Sh AUTHORS +.An -nosplit +This +manual page was originally written by +.An Wes Peters Aq wes@softweyr.com . +.An Christopher M Sedore Aq cmsedore@maxwell.syr.edu +updated it when +.Fn aio_cancel +was implemented for +.Fx 4.0 . diff --git a/bsd/man/man2/aio_error.2 b/bsd/man/man2/aio_error.2 new file mode 100644 index 000000000..8c13ca3f5 --- /dev/null +++ b/bsd/man/man2/aio_error.2 @@ -0,0 +1,100 @@ +.\" Copyright (c) 1999 Softweyr LLC. +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution.
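A minimal userland sketch of the cancellation pattern described in the aio_cancel.2 page above; the descriptor and the reporting are illustrative. Passing a NULL iocb asks the kernel to cancel every outstanding request on the descriptor:

    #include <aio.h>
    #include <stdio.h>

    static void
    cancel_all(int fd)
    {
            switch (aio_cancel(fd, NULL)) {         /* NULL: all requests on fd */
            case AIO_CANCELED:
                    printf("all outstanding requests cancelled\n");
                    break;
            case AIO_NOTCANCELED:
                    printf("some requests in progress; poll with aio_error()\n");
                    break;
            case AIO_ALLDONE:
                    printf("all requests had already completed\n");
                    break;
            default:                                /* -1: e.g. EBADF */
                    perror("aio_cancel");
            }
    }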
+.\" +.\" THIS SOFTWARE IS PROVIDED BY Softweyr LLC AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL Softweyr LLC OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD: src/lib/libc/sys/aio_error.2,v 1.18 2003/01/13 10:37:11 tjr Exp $ +.\" +.Dd June 2, 1999 +.Dt AIO_ERROR 2 +.Os +.Sh NAME +.Nm aio_error +.Nd retrieve error status of asynchronous I/O operation (REALTIME) +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In aio.h +.Ft int +.Fn aio_error "const struct aiocb *iocb" +.Sh DESCRIPTION +The +.Fn aio_error +system call returns the error status of the asynchronous I/O request +associated with the structure pointed to by +.Fa iocb . +.Sh RETURN VALUES +If the asynchronous I/O request has completed successfully, +.Fn aio_error +returns 0. If the request has not yet completed, +.Er EINPROGRESS +is returned. If the request has completed unsuccessfully the error +status is returned as described in +.Xr read 2 , +.Xr write 2 , +or +.Xr fsync 2 +is returned. +On failure, +.Fn aio_error +returns +.Dv -1 +and sets +.Dv errno +to indicate the error condition. +.Sh ERRORS +The +.Fn aio_error +system call will fail if: +.Bl -tag -width Er +.It Bq Er EINVAL +The +.Fa iocb +argument +does not reference an outstanding asynchronous I/O request. +.El +.Sh SEE ALSO +.Xr aio_cancel 2 , +.Xr aio_read 2 , +.Xr aio_return 2 , +.Xr aio_suspend 2 , +.Xr aio_write 2 , +.Xr fsync 2 , +.Xr read 2 , +.Xr write 2 , +.Xr aio 4 +.Sh STANDARDS +The +.Fn aio_error +system call +is expected to conform to the +.St -p1003.1 +standard. +.Sh HISTORY +The +.Fn aio_error +system call first appeared in +.Fx 3.0 . +.Sh AUTHORS +This +manual page was written by +.An Wes Peters Aq wes@softweyr.com . diff --git a/bsd/man/man2/aio_read.2 b/bsd/man/man2/aio_read.2 new file mode 100644 index 000000000..e0ef5a537 --- /dev/null +++ b/bsd/man/man2/aio_read.2 @@ -0,0 +1,211 @@ +.\" Copyright (c) 1998 Terry Lambert +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD: src/lib/libc/sys/aio_read.2,v 1.19 2003/01/14 02:37:06 tjr Exp $ +.\" +.Dd November 17, 1998 +.Dt AIO_READ 2 +.Os +.Sh NAME +.Nm aio_read +.Nd asynchronous read from a file (REALTIME) +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In aio.h +.Ft int +.Fn aio_read "struct aiocb *iocb" +.Sh DESCRIPTION +The +.Fn aio_read +system call allows the calling process to read +.Fa iocb->aio_nbytes +from the descriptor +.Fa iocb->aio_fildes +beginning at the offset +.Fa iocb->aio_offset +into the buffer pointed to by +.Fa iocb->aio_buf . +The call returns immediately after the read request has +been enqueued to the descriptor; the read may or may not have +completed at the time the call returns. +.Pp +If +.Dv _POSIX_PRIORITIZED_IO +is defined, and the descriptor supports it, +then the enqueued operation is submitted at a priority equal to that +of the calling process minus +.Fa iocb->aio_reqprio . +.Pp +The +.Fa iocb->aio_lio_opcode +argument +is ignored by the +.Fn aio_read +system call. +.Pp +The +.Fa iocb +pointer may be subsequently used as an argument to +.Fn aio_return +and +.Fn aio_error +in order to determine return or error status for the enqueued operation +while it is in progress. +.Pp +If the request could not be enqueued (generally due to invalid arguments), +then the call returns without having enqueued the request. +.Pp +If the request is successfully enqueued, the value of +.Fa iocb->aio_offset +may be modified by the system while the request is in progress, so this value must +not be referenced after the request is enqueued. +.Sh RESTRICTIONS +The Asynchronous I/O Control Block structure pointed to by +.Fa iocb +and the buffer that the +.Fa iocb->aio_buf +member of that structure references must remain valid until the +operation has completed. For this reason, use of auto (stack) variables +for these objects is discouraged. +.Pp +The asynchronous I/O control buffer +.Fa iocb +should be zeroed before the +.Fn aio_read +call to avoid passing bogus context information to the kernel. +.Pp +Modifications of the Asynchronous I/O Control Block structure or the +buffer contents after the request has been enqueued, but before the +request has completed, are not allowed. +.Pp +If the file offset in +.Fa iocb->aio_offset +is past the offset maximum for +.Fa iocb->aio_fildes , +no I/O will occur. +.Sh RETURN VALUES +.Rv -std aio_read +.Sh DIAGNOSTICS +None. +.Sh ERRORS +The +.Fn aio_read +system call will fail if: +.Bl -tag -width Er +.It Bq Er EAGAIN +The request was not queued because of system resource limitations. +.It Bq Er ENOSYS +The +.Fn aio_read +system call is not supported. +.El +.Pp +The following conditions may be synchronously detected when the +.Fn aio_read +system call is made, or asynchronously, at any time thereafter.
If they +are detected at call time, +.Fn aio_read +returns -1 and sets +.Va errno +appropriately; otherwise the +.Fn aio_return +system call must be called, and will return -1, and +.Fn aio_error +must be called to determine the actual value that would have been +returned in +.Va errno . +.Pp +.Bl -tag -width Er +.It Bq Er EBADF +The +.Fa iocb->aio_fildes +argument +is invalid. +.It Bq Er EINVAL +The offset +.Fa iocb->aio_offset +is not valid, the priority specified by +.Fa iocb->aio_reqprio +is not a valid priority, or the number of bytes specified by +.Fa iocb->aio_nbytes +is not valid. +.It Bq Er EOVERFLOW +The file is a regular file, +.Fa iocb->aio_nbytes +is greater than zero, the starting offset in +.Fa iocb->aio_offset +is before the end of the file, but is at or beyond the +.Fa iocb->aio_fildes +offset maximum. +.El +.Pp +If the request is successfully enqueued, but subsequently cancelled +or an error occurs, the value returned by the +.Fn aio_return +system call is per the +.Xr read 2 +system call, and the value returned by the +.Fn aio_error +system call is either one of the error returns from the +.Xr read 2 +system call, or one of: +.Bl -tag -width Er +.It Bq Er EBADF +The +.Fa iocb->aio_fildes +argument +is invalid for reading. +.It Bq Er ECANCELED +The request was explicitly cancelled via a call to +.Fn aio_cancel . +.It Bq Er EINVAL +The offset +.Fa iocb->aio_offset +would be invalid. +.El +.Sh SEE ALSO +.Xr aio_cancel 2 , +.Xr aio_error 2 , +.Xr aio_return 2 , +.Xr aio_suspend 2 , +.Xr aio_write 2 , +.Xr aio 4 +.Sh STANDARDS +The +.Fn aio_read +system call is expected to conform to the +.St -p1003.1 +standard. +.Sh HISTORY +The +.Fn aio_read +system call first appeared in +.Fx 3.0 . +.Sh AUTHORS +This +manual page was written by +.An Terry Lambert Aq terry@whistle.com . +.Sh BUGS +Invalid information in +.Fa iocb->_aiocb_private +may confuse the kernel. diff --git a/bsd/man/man2/aio_return.2 b/bsd/man/man2/aio_return.2 new file mode 100644 index 000000000..8c4e28ff2 --- /dev/null +++ b/bsd/man/man2/aio_return.2 @@ -0,0 +1,103 @@ +.\" Copyright (c) 1999 Softweyr LLC. +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY Softweyr LLC AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL Softweyr LLC OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
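Tying the aio_read.2 page together, a minimal end-to-end sketch: the control block and buffer are static (the RESTRICTIONS section discourages stack storage), the block is zeroed as advised, and completion is awaited with aio_suspend(2). The input file is arbitrary and error handling is abbreviated:

    #include <aio.h>
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static char buf[4096];                  /* must stay valid until completion */
    static struct aiocb cb;

    int
    main(void)
    {
            const struct aiocb *list[1] = { &cb };
            int fd = open("/etc/hosts", O_RDONLY);

            if (fd < 0)
                    return 1;
            memset(&cb, 0, sizeof(cb));     /* avoid passing bogus context */
            cb.aio_fildes = fd;
            cb.aio_buf = buf;
            cb.aio_nbytes = sizeof(buf);
            cb.aio_offset = 0;
            if (aio_read(&cb) != 0)         /* enqueue; returns immediately */
                    return 1;
            while (aio_error(&cb) == EINPROGRESS)
                    aio_suspend(list, 1, NULL);     /* sleep until it completes */
            printf("read %zd bytes\n", aio_return(&cb));
            close(fd);
            return 0;
    }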
+.\" +.\" $FreeBSD: src/lib/libc/sys/aio_return.2,v 1.17 2003/01/13 10:37:11 tjr Exp $ +.\" +.Dd June 2, 1999 +.Dt AIO_RETURN 2 +.Os +.Sh NAME +.Nm aio_return +.Nd retrieve return status of asynchronous I/O operation (REALTIME) +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In aio.h +.Ft int +.Fn aio_return "struct aiocb *iocb" +.Sh DESCRIPTION +The +.Fn aio_return +system call returns the final status of the asynchronous I/O request +associated with the structure pointed to by +.Fa iocb . +.Pp +The +.Fn aio_return +system call +should only be called once, to obtain the final status of an asynchronous +I/O operation once +.Xr aio_error 2 +returns something other than +.Er EINPROGRESS . +.Sh RETURN VALUES +If the asynchronous I/O request has completed, the status is returned +as described in +.Xr read 2 , +.Xr write 2 , +or +.Xr fsync 2 . +On failure, +.Fn aio_return +returns +.Dv -1 +and sets +.Dv errno +to indicate the error condition. +.Sh ERRORS +The +.Fn aio_return +system call will fail if: +.Bl -tag -width Er +.It Bq Er EINVAL +The +.Fa iocb +argument +does not reference an outstanding asynchronous I/O request. +.El +.Sh SEE ALSO +.Xr aio_cancel 2 , +.Xr aio_error 2 , +.Xr aio_suspend 2 , +.Xr aio_write 2 , +.Xr fsync 2 , +.Xr read 2 , +.Xr write 2 , +.Xr aio 4 +.Sh STANDARDS +The +.Fn aio_return +system call +is expected to conform to the +.St -p1003.1 +standard. +.Sh HISTORY +The +.Fn aio_return +system call first appeared in +.Fx 3.0 . +.Sh AUTHORS +This +manual page was written by +.An Wes Peters Aq wes@softweyr.com . diff --git a/bsd/man/man2/aio_suspend.2 b/bsd/man/man2/aio_suspend.2 new file mode 100644 index 000000000..c0b85ce10 --- /dev/null +++ b/bsd/man/man2/aio_suspend.2 @@ -0,0 +1,113 @@ +.\" Copyright (c) 1999 Softweyr LLC. +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY Softweyr LLC AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL Softweyr LLC OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
+.\" +.\" $FreeBSD: src/lib/libc/sys/aio_suspend.2,v 1.19 2003/01/13 10:37:11 tjr Exp $ +.\" +.Dd June 2, 1999 +.Dt AIO_SUSPEND 2 +.Os +.Sh NAME +.Nm aio_suspend +.Nd suspend until asynchronous I/O operations or timeout complete (REALTIME) +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In aio.h +.Ft int +.Fn aio_suspend "const struct aiocb * const iocbs[]" "int niocb" "const struct timespec * timeout" +.Sh DESCRIPTION +The +.Fn aio_suspend +system call suspends the calling process until at least one of the +specified asynchronous I/O requests have completed, a signal is +delivered, or the +.Fa timeout +has passed. +.Pp +The +.Fa iocbs +argument +is an array of +.Fa niocb +pointers to asynchronous I/O requests. Array members containing +NULL will be silently ignored. +.Pp +If +.Fa timeout +is a non-nil pointer, it specifies a maximum interval to suspend. +If +.Fa timeout +is a nil pointer, the suspend blocks indefinitely. To effect a +poll, the +.Fa timeout +should point to a zero-value timespec structure. +.Sh RETURN VALUES +If one or more of the specified asynchronous I/O requests have +completed, +.Fn aio_suspend +returns 0. Otherwise it returns -1 and sets +.Va errno +to indicate the error, as enumerated below. +.Sh ERRORS +The +.Fn aio_suspend +system call will fail if: +.Bl -tag -width Er +.It Bq Er EAGAIN +the +.Fa timeout +expired before any I/O requests completed. +.It Bq Er EINVAL +The +.Fa iocbs +argument +contains more than +.Dv AIO_LISTIO_MAX +asynchronous I/O requests, or at least one of the requests is not +valid. +.It Bq Er EINTR +the suspend was interrupted by a signal. +.El +.Sh SEE ALSO +.Xr aio_cancel 2 , +.Xr aio_error 2 , +.Xr aio_return 2 , +.Xr aio_write 2 , +.Xr aio 4 +.Sh STANDARDS +The +.Fn aio_suspend +system call +is expected to conform to the +.St -p1003.1 +standard. +.Sh HISTORY +The +.Fn aio_suspend +system call first appeared in +.Fx 3.0 . +.Sh AUTHORS +This +manual page was written by +.An Wes Peters Aq wes@softweyr.com . diff --git a/bsd/man/man2/aio_write.2 b/bsd/man/man2/aio_write.2 new file mode 100644 index 000000000..097daaf4a --- /dev/null +++ b/bsd/man/man2/aio_write.2 @@ -0,0 +1,204 @@ +.\" Copyright (c) 1999 Softweyr LLC. +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY Softweyr LLC AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL Softweyr LLC OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
+.\" +.\" $FreeBSD: src/lib/libc/sys/aio_write.2,v 1.16 2003/01/13 10:37:11 tjr Exp $ +.\" +.Dd June 2, 1999 +.Dt AIO_WRITE 2 +.Os +.Sh NAME +.Nm aio_write +.Nd asynchronous write to a file (REALTIME) +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In aio.h +.Ft int +.Fn aio_write "struct aiocb *iocb" +.Sh DESCRIPTION +The +.Fn aio_write +system call allows the calling process to write +.Fa iocb->aio_nbytes +from the buffer pointed to by +.Fa iocb->aio_buf +to the descriptor +.Fa iocb->aio_fildes . +The call returns immediately after the write request has been enqueued +to the descriptor; the write may or may not have completed at the time +the call returns. If the request could not be enqueued, generally due +to invalid arguments, the call returns without having enqueued the +request. +.Pp +If +.Dv O_APPEND +is set for +.Fa iocb->aio_fildes , +.Fn aio_write +operations append to the file in the same order as the calls were +made. If +.Dv O_APPEND +is not set for the file descriptor, the write operation will occur at +the absolute position from the beginning of the file plus +.Fa iocb->aio_offset . +.Pp +If +.Dv _POSIX_PRIORITIZED_IO +is defined, and the descriptor supports it, then the enqueued +operation is submitted at a priority equal to that of the calling +process minus +.Fa iocb->aio_reqprio . +.Pp +The +.Fa iocb +pointer may be subsequently used as an argument to +.Fn aio_return +and +.Fn aio_error +in order to determine return or error status for the enqueued operation +while it is in progress. +.Pp +If the request is successfully enqueued, the value of +.Fa iocb->aio_offset +can be modified during the request as context, so this value must not +be referenced after the request is enqueued. +.Sh RESTRICTIONS +The Asynchronous I/O Control Block structure pointed to by +.Fa iocb +and the buffer that the +.Fa iocb->aio_buf +member of that structure references must remain valid until the +operation has completed. For this reason, use of auto (stack) variables +for these objects is discouraged. +.Pp +The asynchronous I/O control buffer +.Fa iocb +should be zeroed before the +.Fn aio_write +system call to avoid passing bogus context information to the kernel. +.Pp +Modifications of the Asynchronous I/O Control Block structure or the +buffer contents after the request has been enqueued, but before the +request has completed, are not allowed. +.Pp +If the file offset in +.Fa iocb->aio_offset +is past the offset maximum for +.Fa iocb->aio_fildes , +no I/O will occur. +.Sh RETURN VALUES +.Rv -std aio_write +.Sh ERRORS +The +.Fn aio_write +system call will fail if: +.Bl -tag -width Er +.It Bq Er EAGAIN +The request was not queued because of system resource limitations. +.It Bq Er ENOSYS +The +.Fn aio_write +system call is not supported. +.El +.Pp +The following conditions may be synchronously detected when the +.Fn aio_write +system call is made, or asynchronously, at any time thereafter. If they +are detected at call time, +.Fn aio_write +returns -1 and sets +.Va errno +appropriately; otherwise the +.Fn aio_return +system call must be called, and will return -1, and +.Fn aio_error +must be called to determine the actual value that would have been +returned in +.Va errno . +.Pp +.Bl -tag -width Er +.It Bq Er EBADF +The +.Fa iocb->aio_fildes +argument +is invalid, or is not opened for writing. 
+.It Bq Er EINVAL +The offset +.Fa iocb->aio_offset +is not valid, the priority specified by +.Fa iocb->aio_reqprio +is not a valid priority, or the number of bytes specified by +.Fa iocb->aio_nbytes +is not valid. +.El +.Pp +If the request is successfully enqueued, but subsequently canceled +or an error occurs, the value returned by the +.Fn aio_return +system call is per the +.Xr write 2 +system call, and the value returned by the +.Fn aio_error +system call is either one of the error returns from the +.Xr write 2 +system call, or one of: +.Bl -tag -width Er +.It Bq Er EBADF +The +.Fa iocb->aio_fildes +argument +is invalid for writing. +.It Bq Er ECANCELED +The request was explicitly canceled via a call to +.Fn aio_cancel . +.It Bq Er EINVAL +The offset +.Fa iocb->aio_offset +would be invalid. +.El +.Sh SEE ALSO +.Xr aio_cancel 2 , +.Xr aio_error 2 , +.Xr aio_return 2 , +.Xr aio_suspend 2 , +.Xr aio 4 +.Sh STANDARDS +The +.Fn aio_write +system call +is expected to conform to the +.St -p1003.1 +standard. +.Sh HISTORY +The +.Fn aio_write +system call first appeared in +.Fx 3.0 . +.Sh AUTHORS +This manual page was written by +.An Wes Peters Aq wes@softweyr.com . +.Sh BUGS +Invalid information in +.Fa iocb->_aiocb_private +may confuse the kernel. diff --git a/bsd/man/man2/bind.2 b/bsd/man/man2/bind.2 index 9c4404cdd..742a58858 100644 --- a/bsd/man/man2/bind.2 +++ b/bsd/man/man2/bind.2 @@ -43,7 +43,7 @@ .Fd #include <sys/types.h> .Fd #include <sys/socket.h> .Ft int -.Fn bind "int s" "const struct sockaddr *name" "int namelen" +.Fn bind "int s" "const struct sockaddr *name" "socklen_t namelen" .Sh DESCRIPTION .Fn Bind assigns a name to an unnamed socket. diff --git a/bsd/man/man2/brk.2 b/bsd/man/man2/brk.2 deleted file mode 100644 index f580c15f6..000000000 --- a/bsd/man/man2/brk.2 +++ /dev/null @@ -1,150 +0,0 @@ -.\" $NetBSD: brk.2,v 1.7 1995/02/27 12:31:57 cgd Exp $ -.\" -.\" Copyright (c) 1980, 1991, 1993 -.\" The Regents of the University of California. All rights reserved. -.\" -.\" Redistribution and use in source and binary forms, with or without -.\" modification, are permitted provided that the following conditions -.\" are met: -.\" 1. Redistributions of source code must retain the above copyright -.\" notice, this list of conditions and the following disclaimer. -.\" 2. Redistributions in binary form must reproduce the above copyright -.\" notice, this list of conditions and the following disclaimer in the -.\" documentation and/or other materials provided with the distribution. -.\" 3. All advertising materials mentioning features or use of this software -.\" must display the following acknowledgement: -.\" This product includes software developed by the University of -.\" California, Berkeley and its contributors. -.\" 4. Neither the name of the University nor the names of its contributors -.\" may be used to endorse or promote products derived from this software -.\" without specific prior written permission. -.\" -.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND -.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -.\" ARE DISCLAIMED. 
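A minimal sketch of the O_APPEND behaviour the aio_write.2 page above describes: on an append-mode descriptor the offset is ignored and enqueued writes land at end-of-file in call order. The log path is arbitrary, and completion handling and descriptor cleanup are omitted:

    #include <aio.h>
    #include <fcntl.h>
    #include <string.h>

    static struct aiocb wcb;                /* must outlive the request */
    static const char msg[] = "event logged\n";

    int
    log_event(void)
    {
            int fd = open("/tmp/example.log", O_WRONLY | O_CREAT | O_APPEND, 0644);

            if (fd < 0)
                    return -1;
            memset(&wcb, 0, sizeof(wcb));   /* zero the control block, as advised */
            wcb.aio_fildes = fd;
            wcb.aio_buf = (void *)msg;      /* aio_buf is not const-qualified */
            wcb.aio_nbytes = sizeof(msg) - 1;
            /* wcb.aio_offset is ignored: O_APPEND writes go to end-of-file */
            return aio_write(&wcb);         /* enqueue and return immediately */
    }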
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -.\" SUCH DAMAGE. -.\" -.\" @(#)brk.2 8.2 (Berkeley) 12/11/93 -.\" -.Dd December 11, 1993 -.Dt BRK 2 -.Os BSD 4 -.Sh NAME -.Nm brk , -.Nm sbrk -.Nd change data segment size -.Sh SYNOPSIS -.Fd #include <unistd.h> -.Ft char * -.Fn brk "const char *addr" -.Ft char * -.Fn sbrk "int incr" -.Sh DESCRIPTION -.Bf -symbolic -The brk and sbrk functions are historical curiosities -left over from earlier days before the advent of virtual memory management. -.Ef -The -.Fn brk -function -sets the break or lowest address -of a process's data segment (uninitialized data) to -.Fa addr -(immediately above bss). -Data addressing is restricted between -.Fa addr -and the lowest stack pointer to the stack segment. -Memory is allocated by -.Fa brk -in page size pieces; -if -.Fa addr -is not evenly divisible by the system page size, it is -increased to the next page boundary. -.Pp -.\" The -.\" .Nm sbrk -.\" function -.\" allocates chunks of -.\" .Fa incr -.\" bytes -.\" to the process's data space -.\" and returns an address pointer. -.\" The -.\" .Xr malloc 3 -.\" function utilizes -.\" .Nm sbrk . -.\" .Pp -The current value of the program break is reliably returned by -.Dq Li sbrk(0) -(see also -.Xr end 3 ) . -The -.Xr getrlimit 2 -system call may be used to determine -the maximum permissible size of the -.Em data -segment; -it will not be possible to set the break -beyond the -.Em rlim_max -value returned from a call to -.Xr getrlimit , -e.g. -.Dq qetext + rlp\(->rlim_max. -(see -.Xr end 3 -for the definition of -.Em etext ) . -.Sh RETURN VALUES -.Nm Brk -returns a pointer to the new end of memory if successful; -otherwise -1 with -.Va errno -set to indicate why the allocation failed. -The -.Nm sbrk -function returns a pointer to the base of the new storage if successful; -otherwise -1 with -.Va errno -set to indicate why the allocation failed. -.Sh ERRORS -.Xr Sbrk -will fail and no additional memory will be allocated if -one of the following are true: -.Bl -tag -width Er -.It Bq Er ENOMEM -The limit, as set by -.Xr setrlimit 2 , -was exceeded. -.It Bq Er ENOMEM -The maximum possible size of a data segment (compiled into the -system) was exceeded. -.It Bq Er ENOMEM -Insufficient space existed in the swap area -to support the expansion. -.El -.Sh SEE ALSO -.Xr execve 2 , -.Xr getrlimit 2 , -.Xr malloc 3 , -.Xr mmap 2 , -.Xr end 3 -.Sh BUGS -Setting the break may fail due to a temporary lack of -swap space. It is not possible to distinguish this -from a failure caused by exceeding the maximum size of -the data segment without consulting -.Xr getrlimit . -.Sh HISTORY -A -.Fn brk -function call appeared in -.At v7 . 
diff --git a/bsd/man/man2/chflags.2 b/bsd/man/man2/chflags.2 index 70cb5097b..66a036bbb 100644 --- a/bsd/man/man2/chflags.2 +++ b/bsd/man/man2/chflags.2 @@ -44,9 +44,9 @@ .Fd #include <sys/stat.h> .Fd #include <unistd.h> .Ft int -.Fn chflags "const char *path" "u_long flags" +.Fn chflags "const char *path" "u_int flags" .Ft int -.Fn fchflags "int fd" "u_long flags" +.Fn fchflags "int fd" "u_int flags" .Sh DESCRIPTION The file whose name is given by diff --git a/bsd/man/man2/chown.2 b/bsd/man/man2/chown.2 index 3ce057f3b..7ba416f38 100644 --- a/bsd/man/man2/chown.2 +++ b/bsd/man/man2/chown.2 @@ -1,6 +1,3 @@ -.\" $OpenBSD: chown.2,v 1.3 1997/01/26 05:10:33 downsj Exp $ -.\" $NetBSD: chown.2,v 1.10 1995/10/12 15:40:47 jtc Exp $ -.\" .\" Copyright (c) 1980, 1991, 1993, 1994 .\" The Regents of the University of California. All rights reserved. .\" @@ -34,29 +31,31 @@ .\" .\" @(#)chown.2 8.4 (Berkeley) 4/19/94 .\" -.Dd January 25, 1997 +.Dd April 19, 1994 .Dt CHOWN 2 .Os .Sh NAME .Nm chown , -.Nm fchown -.Nd change owner and group of a file or link +.Nm fchown , +.Nm lchown +.Nd change owner and group of a file .Sh SYNOPSIS -.Fd #include <sys/types.h> -.Fd #include <unistd.h> +.In unistd.h .Ft int .Fn chown "const char *path" "uid_t owner" "gid_t group" .Ft int .Fn fchown "int fd" "uid_t owner" "gid_t group" +.Ft int +.Fn lchown "const char *path" "uid_t owner" "gid_t group" .Sh DESCRIPTION -The owner ID and group ID of the file (or link) +The owner ID and group ID of the file named by .Fa path or referenced by .Fa fd is changed as specified by the arguments .Fa owner -and +and .Fa group . The owner of a file may change the .Fa group @@ -66,36 +65,49 @@ but the change .Fa owner capability is restricted to the super-user. .Pp -.Fn Chown +The +.Fn chown +system call clears the set-user-id and set-group-id bits on the file to prevent accidental or mischievous creation of -set-user-id and set-group-id programs. +set-user-id and set-group-id programs if not executed +by the super-user. +The +.Fn chown +system call +follows symbolic links to operate on the target of the link +rather than the link itself. .Pp -.Fn Fchown +The +.Fn fchown +system call is particularly useful when used in conjunction with the file locking primitives (see .Xr flock 2 ) . .Pp +The +.Fn lchown +system call is similar to +.Fn chown +but does not follow symbolic links. +.Pp One of the owner or group id's may be left unchanged by specifying it as -1. .Sh RETURN VALUES -Zero is returned if the operation was successful; --1 is returned if an error occurs, with a more specific -error code being placed in the global variable -.Va errno . +.Rv -std .Sh ERRORS -.Fn Chown -will fail and the file or link will be unchanged if: +The +.Fn chown +and +.Fn lchown +will fail and the file will be unchanged if: .Bl -tag -width Er .It Bq Er ENOTDIR A component of the path prefix is not a directory. .It Bq Er ENAMETOOLONG -A component of a pathname exceeded -.Dv {NAME_MAX} -characters, or an entire path name exceeded -.Dv {PATH_MAX} -characters. +A component of a pathname exceeded 255 characters, +or an entire path name exceeded 1023 characters. .It Bq Er ENOENT The named file does not exist. .It Bq Er EACCES @@ -107,20 +119,27 @@ The effective user ID is not the super-user. .It Bq Er EROFS The named file resides on a read-only file system. .It Bq Er EFAULT -.Fa Path +The +.Fa path +argument points outside the process's allocated address space. .It Bq Er EIO An I/O error occurred while reading from or writing to the file system. 
.El .Pp -.Fn Fchown -will fail if: +The +.Fn fchown +system call will fail if: .Bl -tag -width Er .It Bq Er EBADF +The .Fa fd +argument does not refer to a valid descriptor. .It Bq Er EINVAL +The .Fa fd +argument refers to a socket, not a file. .It Bq Er EPERM The effective user ID is not the super-user. @@ -130,24 +149,33 @@ The named file resides on a read-only file system. An I/O error occurred while reading from or writing to the file system. .El .Sh SEE ALSO -.Xr chown 8 , .Xr chgrp 1 , .Xr chmod 2 , -.Xr flock 2 +.Xr flock 2 , +.Xr chown 8 .Sh STANDARDS The .Fn chown -function is expected to conform to -.St -p1003.1-88 . +system call is expected to conform to +.St -p1003.1-90 . .Sh HISTORY The +.Fn chown +function appeared in +.At v7 . +The .Fn fchown -function call appeared in +system call appeared in .Bx 4.2 . .Pp The .Fn chown and .Fn fchown -functions were changed to follow symbolic links in +system calls were changed to follow symbolic links in .Bx 4.4 . +The +.Fn lchown +system call was added in +.Fx 3.0 +to compensate for the loss of functionality. diff --git a/bsd/man/man2/connect.2 b/bsd/man/man2/connect.2 index e06e59fc5..c778b8d3a 100644 --- a/bsd/man/man2/connect.2 +++ b/bsd/man/man2/connect.2 @@ -43,7 +43,7 @@ .Fd #include <sys/types.h> .Fd #include <sys/socket.h> .Ft int -.Fn connect "int s" "const struct sockaddr *name" "int namelen" +.Fn connect "int s" "const struct sockaddr *name" "socklen_t namelen" .Sh DESCRIPTION The parameter .Fa s @@ -71,8 +71,11 @@ multiple times to change their association. Datagram sockets may dissolve the association by connecting to an invalid address, such as a null address or an address with -the address family set to AF_UNPSEC (the error -EAFNOSUPPORT will be harmlessly returned). +the address family set to +.Dv AF_UNSPEC +(the error +.Dv EAFNOSUPPORT +will be harmlessly returned). .Sh RETURN VALUES If the connection or binding succeeds, 0 is returned. Otherwise a -1 is returned, and a more specific error @@ -119,6 +122,11 @@ for completion by selecting the socket for writing. The socket is non-blocking and a previous connection attempt has not yet been completed. +.It Bq Er EACCES +The destination address is a broadcast address and the +socket option +.Dv SO_BROADCAST +is not set. .El .Pp The following errors are specific to connecting names in the UNIX domain. diff --git a/bsd/man/man2/exchangedata.2 b/bsd/man/man2/exchangedata.2 new file mode 100644 index 000000000..cc2111ea4 --- /dev/null +++ b/bsd/man/man2/exchangedata.2 @@ -0,0 +1,190 @@ +.\" Copyright (c) 2003 Apple Computer, Inc. All rights reserved. +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. +.\" +.\" @(#)exchangedata.2 +. 
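A minimal sketch of the distinction the revised chown.2 page draws: applied to a symbolic link, chown(2) follows the link and changes its target, while the newly documented lchown(2) changes the link itself. The helper and its arguments are illustrative:

    #include <sys/types.h>
    #include <unistd.h>

    /* Give both a symlink and whatever it points at to a new owner. */
    int
    reown_link_and_target(const char *linkpath, uid_t owner, gid_t group)
    {
            if (chown(linkpath, owner, group) == -1)        /* follows the symlink */
                    return -1;
            return lchown(linkpath, owner, group);          /* the link itself */
    }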
+.Dd December 15, 2003 +.Dt EXCHANGEDATA 2 +.Os Darwin +.Sh NAME +.Nm exchangedata +.Nd atomically exchange data between two files +.Sh SYNOPSIS +.Fd #include <unistd.h> +.Ft int +.Fn exchangedata "const char * path1" "const char * path2" "unsigned long options" +. +.Sh DESCRIPTION +The +.Fn exchangedata +function swaps the contents of the files referenced by +.Fa path1 +and +.Fa path2 +in an atomic fashion. +That is, all concurrent processes will either see the pre-exchanged state or the +post-exchanged state; they can never see the files in an inconsistent state. +The data in all forks is swapped in this way. +The +.Fa options +parameter lets you control specific aspects of the function's behaviour. +.Pp +. +Open file descriptors follow the swapped data. +Thus, a descriptor that previously referenced +.Fa path1 +will now reference the data that's accessible via +.Fa path2 , +and vice versa. +.Pp +. +In general, the file attributes (metadata) are not exchanged. +Specifically, the object identifier attributes (that is, the +.Dv ATTR_CMN_OBJID +and +.Dv ATTR_CMN_OBJPERMANENTID +attributes as defined by the +.Xr getattrlist 2 +function) are not swapped. +An exception to this general rule is that the modification time attribute ( +.Dv ATTR_CMN_MODTIME +) is swapped. +.Pp +. +When combined, these features allow you to implement a 'safe save' function that +does not break references to the file (for example, aliases). +You first save the new contents to a temporary file and then +exchange the data of the original file and the temporary. +Programs that reference the file via an object identifier will continue to +reference the original file, but now it has the new data. +.Pp +. +.\" path1 and path2 parameters +. +The +.Fa path1 +and +.Fa path2 +parameters must both reference valid files. +All directories listed in the path names leading to these files must be +searchable. +You must have write access to the files. +.Pp +. +.\" options parameter +. +The +.Fa options +parameter is a bit set that controls the behaviour of +.Fn exchangedata . +The following option bits are defined. +. +.Bl -tag -width FSOPT_NOFOLLOW +. +.It FSOPT_NOFOLLOW +If this bit is set, +.Fn exchangedata +will not follow a symlink if it occurs as +the last component of +.Fa path1 +or +.Fa path2 . +. +.El +. +.Sh RETURN VALUES +Upon successful completion, a value of 0 is returned. +Otherwise, a value of -1 is returned and +.Va errno +is set to indicate the error. +. +.Sh COMPATIBILITY +Not all volumes support +.Fn exchangedata . +You can test whether a volume supports +.Fn exchangedata +by using +.Xr getattrlist 2 +to get the volume capabilities attribute +.Dv ATTR_VOL_CAPABILITIES , +and then testing the +.Dv VOL_CAP_INT_EXCHANGEDATA +flag. +.Pp +. +.Sh ERRORS +.Fn exchangedata +will fail if: +.Bl -tag -width Er +. +.It Bq Er ENOTSUP +The volume does not support +.Fn exchangedata . +. +.It Bq Er ENOTDIR +A component of the path prefix is not a directory. +. +.It Bq Er ENAMETOOLONG +A component of a path name exceeded +.Dv NAME_MAX +characters, or an entire path name exceeded +.Dv PATH_MAX +characters. +. +.It Bq Er ENOENT +Either file does not exist. +. +.It Bq Er EACCES +Search permission is denied for a component of the path prefix. +. +.It Bq Er ELOOP +Too many symbolic links were encountered in translating the pathname. +. +.It Bq Er EFAULT +.Fa path1 +or +.Fa path2 +points to an invalid address. +. +.It Bq Er EXDEV +.Fa path1 +and +.Fa path2 +are on different volumes (mounted file systems). +.
+.It Bq Er EINVAL +.Fa path1 +and +.Fa path2 +reference the same file. +. +.It Bq Er EINVAL +You try to exchange something other than a regular file (for example, a directory). +. +.It Bq Er EIO +An I/O error occurred while reading from or writing to the file system. +.El +.Pp +. +.Sh SEE ALSO +. +.Xr getattrlist 2 +. +.Sh HISTORY +A +.Fn exchangedata +function call appeared in Darwin 1.3.1 (Mac OS X version 10.0). +. diff --git a/bsd/man/man2/fcntl.2 b/bsd/man/man2/fcntl.2 index 1916e0fdc..12fd284f4 100644 --- a/bsd/man/man2/fcntl.2 +++ b/bsd/man/man2/fcntl.2 @@ -152,6 +152,19 @@ Get disk device information. Currently this only includes the disk device address that corresponds to the current file offset. +.It Dv F_FULLFSYNC +Does the same thing as +.Xr fsync 2 , +then asks the drive to +flush all buffered data to +the permanent storage device +.Fa ( arg +is ignored). +This is currently +only implemented on HFS filesystems and +the operation may take quite a while to +complete. Certain FireWire drives have +also been known to ignore this request. .El .Pp The flags for the diff --git a/bsd/man/man2/flock.2 b/bsd/man/man2/flock.2 index e38879e58..c74ac5b69 100644 --- a/bsd/man/man2/flock.2 +++ b/bsd/man/man2/flock.2 @@ -133,7 +133,7 @@ is an invalid descriptor. The argument .Fa fd refers to an object other than a file. -.It Bq Er EOPNOTSUPP +.It Bq Er ENOTSUP The referenced descriptor is not of the correct type. .El .Sh SEE ALSO diff --git a/bsd/man/man2/fsync.2 b/bsd/man/man2/fsync.2 index 7d72c2599..b75a4229d 100644 --- a/bsd/man/man2/fsync.2 +++ b/bsd/man/man2/fsync.2 @@ -51,10 +51,41 @@ to be moved to a permanent storage device. This normally results in all in-core modified copies of buffers for the associated file to be written to a disk. .Pp -.Fn Fsync -should be used by programs that require a file to be -in a known state, for example, in building a simple transaction -facility. +Note that while +.Fn fsync +will flush all data from the host +to the drive (i.e. the "permanent storage +device"), the +drive itself may not physically +write the data to the +platters for quite some time +and it may be written in an +out-of-order sequence. +.Pp +Specifically, if the drive loses power +or the OS crashes, +the application +may find that only some or none of its data was +written. The disk drive may also re-order +the data so that later writes +may be present while earlier writes are not. +.Pp +This is not a theoretical +edge case. This scenario is easily reproduced +with real-world workloads and drive +power failures. +.Pp +For applications that require tighter guarantees about +the integrity of their data, Mac OS X provides the +F_FULLFSYNC fcntl. The F_FULLFSYNC fcntl asks the +drive to flush all buffered data to permanent +storage. Applications such as databases that require +a strict ordering of writes should use F_FULLFSYNC to +ensure their data is written in the order they expect. +Please see +.Xr fcntl 2 +for more detail. +.Pp .Sh RETURN VALUES A 0 value is returned on success. A -1 value indicates an error. @@ -75,7 +106,8 @@ An I/O error occurred while reading from or writing to the file system. .Sh SEE ALSO .Xr sync 2 , .Xr sync 8 , -.Xr update 8 +.Xr update 8 , +.Xr fcntl 2 .Sh HISTORY The .Fn fsync diff --git a/bsd/man/man2/getattrlist.2 b/bsd/man/man2/getattrlist.2 new file mode 100644 index 000000000..e9bb5c33f --- /dev/null +++ b/bsd/man/man2/getattrlist.2 @@ -0,0 +1,1684 @@ +.\" Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
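The exchangedata.2 and fsync.2 additions above compose into the 'safe save' pattern the exchangedata page sketches in prose: write the new contents to a temporary, force them to permanent storage with F_FULLFSYNC, swap the two files atomically, then discard the temporary (which now holds the old data). A hedged illustration assuming both paths already exist on an HFS volume that supports both operations; names and modes are illustrative:

    #include <sys/types.h>
    #include <fcntl.h>
    #include <unistd.h>

    int
    safe_save(const char *path, const char *tmppath, const void *data, size_t len)
    {
            int fd = open(tmppath, O_WRONLY | O_CREAT | O_TRUNC, 0644);

            if (fd < 0)
                    return -1;
            if (write(fd, data, len) != (ssize_t)len ||
                fcntl(fd, F_FULLFSYNC, 0) == -1) {  /* flush the drive's cache too */
                    close(fd);
                    unlink(tmppath);
                    return -1;
            }
            close(fd);
            if (exchangedata(path, tmppath, 0) == -1) {     /* atomic content swap */
                    unlink(tmppath);
                    return -1;
            }
            return unlink(tmppath);                 /* temporary now holds old data */
    }

Because the swap preserves the original file's object identifier, aliases and other identifier-based references keep working while the file carries the new contents.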
+.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. +.\" +.\" @(#)getattrlist.2 +. +.Dd October 14, 2004 +.Dt GETATTRLIST 2 +.Os Darwin +.Sh NAME +.Nm getattrlist +.Nd get file system attributes +.Sh SYNOPSIS +.Fd #include <sys/attr.h> +.Fd #include <unistd.h> +.Ft int +.Fn getattrlist "const char* path" "struct attrlist * attrList" "void * attrBuf" "size_t attrBufSize" "unsigned long options" +. +.Sh DESCRIPTION +The +.Fn getattrlist +function returns attributes (that is, metadata) of file system objects. +You can think of +.Fn getattrlist +as a seriously enhanced version of +.Xr stat 2 . +The function returns attributes about the file system object specified by +.Fa path +in the buffer specified by +.Fa attrBuf +and +.Fa attrBufSize . +The +.Fa attrList +parameter determines what attributes are returned. +The +.Fa options +parameter lets you control specific aspects of the function's behaviour. +.Pp +. +The +.Fn getattrlist +function is only supported by certain volume format implementations. +For maximum compatibility, client programs should use high-level APIs +(such as the Carbon File Manager) to access file system attributes. +These high-level APIs include logic to emulate file system attributes +on volumes that don't support +.Fn getattrlist . +.Pp +. +Not all volumes support all attributes. +See the discussion of +.Dv ATTR_VOL_ATTRIBUTES +for a discussion of how to determine whether a particular volume supports a +particular attribute. +.Pp +Furthermore, you should only request the attributes that you need. +Some attributes are expensive to calculate on some volume formats. +For example, +.Dv ATTR_DIR_ENTRYCOUNT +is usually expensive to calculate on non-HFS [Plus] volumes. +If you don't need a particular attribute, you should not ask for it. +.Pp +. +.\" path parameter +. +The +.Fa path +parameter must reference a valid file system object. +Read, write or execute permission of the object itself is not required, but +all directories listed in the path name leading to the object must be +searchable. +.Pp +. +.\" attrList parameter +. +The +.Fa attrList +parameter is a pointer to an +.Vt attrlist +structure, as defined by +.Aq Pa sys/attr.h +(shown below). +It determines what attributes are returned by the function. +You are responsible for filling out all fields of this structure before calling the function. +.Bd -literal +typedef u_int32_t attrgroup_t; +.Pp +struct attrlist { + u_short bitmapcount; /* number of attr. 
bit sets in list */ + u_int16_t reserved; /* (to maintain 4-byte alignment) */ + attrgroup_t commonattr; /* common attribute group */ + attrgroup_t volattr; /* volume attribute group */ + attrgroup_t dirattr; /* directory attribute group */ + attrgroup_t fileattr; /* file attribute group */ + attrgroup_t forkattr; /* fork attribute group */ +}; +#define ATTR_BIT_MAP_COUNT 5 +.Ed +.Pp +. +.\" attrlist elements +. +The fields of the +.Vt attrlist +structure are defined as follows. +.Bl -tag -width XXXbitmapcount +. +.It bitmapcount +Number of attribute bit sets in the structure. +In current systems you must set this to +.Dv ATTR_BIT_MAP_COUNT . +. +.It reserved +Reserved. +You must set this to 0. +. +.It commonattr +A bit set that specifies the common attributes that you require. +Common attributes relate to all types of file system objects. +See below for a description of these attributes. +. +.It volattr +A bit set that specifies the volume attributes that you require. +Volume attributes relate to volumes (that is, mounted file systems). +See below for a description of these attributes. +If you request volume attributes, +.Fa path +must reference the root of a volume. +In addition, you can't request volume attributes if you also request +file or directory attributes. +. +.It dirattr +A bit set that specifies the directory attributes that you require. +See below for a description of these attributes. +. +.It fileattr +A bit set that specifies the file attributes that you require. +See below for a description of these attributes. +. +.It forkattr +A bit set that specifies the fork attributes that you require. +Fork attributes relate to the actual data in the file, +which can be held in multiple named contiguous ranges, or forks. +See below for a description of these attributes. +. +.El +.Pp +. +Unless otherwise noted in the lists below, attributes are read-only. +Attributes labelled as read/write can be set using +.Xr setattrlist 2 . +.Pp +. +.\" attrBuf and attrBufSize parameters +. +The +.Fa attrBuf +and +.Fa attrBufSize +parameters specify a buffer into which the function places attribute values. +The format of this buffer is sufficiently complex that its description +requires a separate section (see below). +The initial contents of this buffer are ignored. +.Pp +. +.\" option parameter +. +The +.Fa options +parameter is a bit set that controls the behaviour of +.Fn getattrlist . +The following option bits are defined. +. +.Bl -tag -width XXXbitmapcount +. +.It FSOPT_NOFOLLOW +If this bit is set, +.Fn getattrlist +will not follow a symlink if it occurs as +the last component of +.Fa path . +. +.El +. +.Sh ATTRIBUTE BUFFER +. +The data returned in the buffer described by +.Fa attrBuf +and +.Fa attrBufSize +is formatted as follows. +.Pp +. +.Bl -enum +. +.It +The first element of the buffer is a +.Vt unsigned long +that contains the overall length, in bytes, of the attributes returned. +This size includes the length field itself. +. +.It +Following the length field is a list of attributes. +Each attribute is represented by a field of its type, +where the type is given as part of the attribute description (below). +. +.It +The attributes are placed into the attribute buffer in the order +that they are described below. +. +.El +.Pp +. +If the attribute is of variable length, it is represented +in the list by an +.Vt attrreference +structure, as defined by +.Aq Pa sys/attr.h +(shown below). +. 
+.Bd -literal
+typedef struct attrreference {
+    long    attr_dataoffset;
+    size_t  attr_length;
+} attrreference_t;
+.Ed
+.Pp
+.
+This structure contains a 'pointer' to the variable length attribute data.
+The
+.Fa attr_length
+field is the length of the attribute data (in bytes).
+The
+.Fa attr_dataoffset
+field is the offset in bytes from the
+.Vt attrreference
+structure
+to the attribute data.
+This offset will always be a multiple of sizeof(unsigned long) bytes,
+so you can safely access common data types without fear of alignment
+exceptions.
+.Pp
+.
+The
+.Fn getattrlist
+function will silently truncate attribute data if
+.Fa attrBufSize
+is too small.
+The length field at the front of the attribute list always represents
+the length of the data actually copied into the attribute buffer.
+If the data is truncated, there is no easy way to determine the
+buffer size that's required to get all of the requested attributes.
+You should always pass an
+.Fa attrBufSize
+that is large enough to accommodate the known size of the attributes
+in the attribute list (including the leading length field).
+.Pp
+.
+Because the returned attributes are simply truncated if the buffer is
+too small, it's possible for a variable length attribute to reference
+data beyond the end of the attribute buffer.
+That is, if
+.Fa attrRef
+is a pointer to the
+.Vt attrreference_t ,
+the attribute data may start beyond the end of the attribute buffer,
+meaning that ((char *) attrRef) + attr_dataoffset is greater than
+((char *) attrBuf) + attrBufSize; indeed, the attribute data may even
+extend beyond the end of the attribute buffer, meaning that
+((char *) attrRef) + attr_dataoffset + attr_length is greater than
+((char *) attrBuf) + attrBufSize.
+If this happens you must increase the size of the buffer and call
+.Fn getattrlist
+again to get an accurate copy of the attribute.
+.
+.Sh COMMON ATTRIBUTES
+.
+Common attributes relate to all types of file system objects.
+The following common attributes are defined.
+.
+.Bl -tag -width ATTR_VOL_ALLOCATIONCLUMP
+.
+.It ATTR_CMN_NAME
+An
+.Vt attrreference
+structure containing the name of the file system object as a
+UTF-8 encoded, null terminated C string.
+The attribute data length will not be greater than
+.Dv NAME_MAX +
+1.
+.Pp
+.
+.It ATTR_CMN_DEVID
+A
+.Vt dev_t
+containing the device number of the device on which this
+file system object's volume is mounted.
+Equivalent to the
+.Fa st_dev
+field of the
+.Vt stat
+structure returned by
+.Xr stat 2 .
+.
+.It ATTR_CMN_FSID
+An
+.Vt fsid_t
+structure containing the file system identifier for the volume on which
+the file system object resides.
+Equivalent to the
+.Fa f_fsid
+field of the
+.Vt statfs
+structure returned by
+.Xr statfs 2 .
+.
+.Pp
+This value is not related to the file system ID from traditional Mac OS (for example,
+the
+.Fa filesystemID
+field of the
+.Vt FSVolumeInfo
+structure returned by Carbon's FSGetVolumeInfo() function).
+On current versions of Mac OS X that value is synthesised by the Carbon File Manager.
+.
+.It ATTR_CMN_OBJTYPE
+An
+.Vt fsobj_type_t
+that identifies the type of file system object.
+The values are taken from
+.Vt enum vtype
+in
+.Aq Pa sys/vnode.h .
+.
+.It ATTR_CMN_OBJTAG
+An
+.Vt fsobj_tag_t
+that identifies the type of file system containing the object.
+The values are taken from
+.Vt enum vtagtype
+in
+.Aq Pa sys/vnode.h .
+.
+.It ATTR_CMN_OBJID +An +.Vt fsobj_id_t +structure that uniquely identifies the file system object +within its volume. +The fid_generation field of this structure will be zero for all non-root callers +(effective UID not 0). +This identifier need not be persistent across an unmount/mount sequence. +.Pp +. +Some volume formats use well known values for the +.Fa fid_objno +field for the root directory (2) and the parent of root directory (1). +This is not a required behaviour of this attribute. +. +.It ATTR_CMN_OBJPERMANENTID +An +.Vt fsobj_id_t +structure that uniquely identifies the file system object +within its volume. +The fid_generation field of this structure will be zero for all non-root callers +(effective UID not 0). +This identifier should be persistent across an unmount/mount sequence. +.Pp +Some file systems (for example, original HFS) may need to modify the on-disk +structure to return a persistent identifier. +If such a file system is mounted read-only, an attempt to get this attribute +will fail with the error +.Dv EROFS . +. +.It ATTR_CMN_PAROBJID +An +.Vt fsobj_id_t +structure that identifies the parent directory of the file system object. +The fid_generation field of this structure will be zero for all non-root callers +(effective UID not 0). +Equivalent to the ATTR_CMN_OBJID attribute of the parent directory. +This identifier need not be persistent across an unmount/mount sequence. +.Pp +. +On a volume that supports hard links, a multiply linked file has no unique parent. +This attribute will return an unspecified parent. +.Pp +. +For some volume formats this attribute is very expensive to calculate. +. +.It ATTR_CMN_SCRIPT +(read/write) A +.Vt text_encoding_t +containing a text encoding hint for +the file system object's name. +It is included to facilitate the lossless round trip conversion of names between +Unicode and traditional Mac OS script encodings. +The values are defined in +.Aq Pa CarbonCore/TextCommon.h . +File systems that do not have an appropriate text encoding value should return +kTextEncodingMacUnicode. +See DTS Q&A 1173 "File Manager Text Encoding Hints". +. +.It ATTR_CMN_CRTIME +(read/write) A +.Vt timespec +structure containing the time that the file system object +was created. +. +.It ATTR_CMN_MODTIME +(read/write) A +.Vt timespec +structure containing the time that the file system object +was last modified. +Equivalent to the +.Fa st_mtimespec +field of the +.Vt stat +structure returned by +.Xr stat 2 . +. +.It ATTR_CMN_CHGTIME +(read/write) A +.Vt timespec +structure containing the time that the file system object's +attributes were last modified. +Equivalent to the +.Fa st_ctimespec +field of the +.Vt stat +structure returned by +.Xr stat 2 . +. +.It ATTR_CMN_ACCTIME +(read/write) A +.Vt timespec +structure containing the time that the file system object +was last accessed. +Equivalent to the +.Fa st_atimespec +field of the +.Vt stat +structure returned by +.Xr stat 2 . +. +.It ATTR_CMN_BKUPTIME +(read/write) A +.Vt timespec +structure containing the time that the file system object was +last backed up. +This value is for use by backup utilities. +The file system stores but does not interpret the value. +. +.It ATTR_CMN_FNDRINFO +(read/write) 32 bytes of data for use by the Finder. +Equivalent to the concatenation of a +.Vt FileInfo +structure and an +.Vt ExtendedFileInfo +structure +(or, for directories, a +.Vt FolderInfo +structure and an +.Vt ExtendedFolderInfo +structure). +These structures are defined in +.Aq Pa CarbonCore/Finder.h . 
+.Pp +This attribute is not byte swapped by the file system. +The value of multibyte fields on disk is always big endian. +When running on a little endian system (such as Darwin on x86), +you must byte swap any multibyte fields. +. +.It ATTR_CMN_OWNERID +(read/write) A +.Vt uid_t +containing the owner of the file system object. +Equivalent to the +.Fa st_uid +field of the +.Vt stat +structure returned by +.Xr stat 2 . +. +.It ATTR_CMN_GRPID +(read/write) A +.Vt gid_t +containing the group of the file system object. +Equivalent to the +.Fa st_gid +field of the +.Vt stat +structure returned by +.Xr stat 2 . +. +.It ATTR_CMN_ACCESSMASK +(read/write) A +.Vt mode_t +containing the access permissions of the file system object. +Equivalent to the +.Fa st_mode +field of the +.Vt stat +structure returned by +.Xr stat 2 . +. +.It ATTR_CMN_NAMEDATTRCOUNT +A +.Vt unsigned long +containing the number of named attributes of the file system object. +No built-in file systems on Mac OS X currently support named attributes. +. +.It ATTR_CMN_NAMEDATTRLIST +An +.Vt attrreference +structure containing a list of named attributes of the file system object. +No built-in file systems on Mac OS X currently support named attributes. +Because of this, the structure of this attribute's value is not yet defined. +. +.It ATTR_CMN_FLAGS +(read/write) A +.Vt unsigned long +containing file flags. +Equivalent to the +.Fa st_flags +field of the +.Vt stat +structure returned by +.Xr stat 2 . +For more information about these flags, see +.Xr chflags 2 . +.Pp +. +The order that attributes are placed into the attribute buffer +almost invariably matches the order of the attribute mask bit values. +The exception is +.Dv ATTR_CMN_FLAGS . +If its order was based on its bit position, it would be before +the +.Dv ATTR_CMN_NAMEDATTRCOUNT +/ +.Dv ATTR_CMN_NAMEDATTRLIST +pair, however, +it is placed in the buffer after them. +. +.It ATTR_CMN_USERACCESS +A +.Vt unsigned long +containing the effective permissions of the current user +(the calling process's effective UID) for this file system object. +You can test for read, write, and execute permission using +.Dv R_OK , +.Dv W_OK , +and +.Dv X_OK , +respectively. See +.Xr access 2 +for more details. +. +.El +. +.Sh VOLUME ATTRIBUTES +. +Volume attributes relate to volumes (that is, mounted file systems). +The following volume attributes are defined. +. +.Bl -tag -width ATTR_VOL_ALLOCATIONCLUMP +. +.It ATTR_VOL_INFO +For reasons that are not at all obvious, you must set +.Dv ATTR_VOL_INFO +in the +.Fa volattr +field if you request any other volume attributes. +This does not result in any attribute data being added to the attribute buffer. +. +.It ATTR_VOL_FSTYPE +A +.Vt unsigned long +containing the file system type. +Equivalent to the +.Fa f_type +field of the +.Vt statfs +structure returned by +.Xr statfs 2 . +Generally not a useful value. +. +.It ATTR_VOL_SIGNATURE +A +.Vt unsigned long +containing the volume signature word. +This value is unique within a given file system type and lets you +distinguish between different volume formats handled by the same file system. +See +.Aq Pa CarbonCore/Files.h +for more details. +. +.It ATTR_VOL_SIZE +An +.Vt off_t +containing the total size of the volume in bytes. +. +.It ATTR_VOL_SPACEFREE +An +.Vt off_t +containing the free space on the volume in bytes. +. +.It ATTR_VOL_SPACEAVAIL +An +.Vt off_t +containing the space, in bytes, on the volume available to non-privileged processes. 
+This is the free space minus the amount of space reserved by the system to prevent critical +disk exhaustion errors. +Non-privileged programs, like a disk management tool, should use this value to display the +space available to the user. +.Pp +.Dv ATTR_VOL_SPACEAVAIL +is to +.Dv ATTR_VOL_SPACEFREE +as +.Fa f_bavail +is to +.Fa f_bfree +in +.Xr statfs 2 . +. +.It ATTR_VOL_MINALLOCATION +An +.Vt off_t +containing the minimum allocation size on the volume in bytes. +If you create a file containing one byte, it will consume this much space. +. +.It ATTR_VOL_ALLOCATIONCLUMP +An +.Vt off_t +containing the allocation clump size on the volume, in bytes. +As a file is extended, the file system will attempt to allocate +this much space each time in order to reduce fragmentation. +. +.It ATTR_VOL_IOBLOCKSIZE +A +.Vt unsigned long +containing the optimal block size when reading or writing data. +Equivalent to the +.Fa f_iosize +field of the +.Vt statfs +structure returned by +.Xr statfs 2 . +. +.It ATTR_VOL_OBJCOUNT +A +.Vt unsigned long +containing the number of file system objects on the volume. +. +.It ATTR_VOL_FILECOUNT +A +.Vt unsigned long +containing the number of files on the volume. +. +.It ATTR_VOL_DIRCOUNT +A +.Vt unsigned long +containing the number of directories on the volume. +. +.It ATTR_VOL_MAXOBJCOUNT +A +.Vt unsigned long +containing the maximum number of file system objects that can be stored on the volume. +. +.It ATTR_VOL_MOUNTPOINT +An +.Vt attrreference +structure containing the path to the volume's mount point as a +UTF-8 encoded, null terminated C string. +The attribute data length will not be greater than +.Dv MAXPATHLEN . +Equivalent to the +.Fa f_mntonname +field of the +.Vt statfs +structure returned by +.Xr statfs 2 . +. +.It ATTR_VOL_NAME +(read/write) An +.Vt attrreference +structure containing the name of the volume as a +UTF-8 encoded, null terminated C string. +The attribute data length will not be greater than +.Dv NAME_MAX + +1. +.Pp +. +This attribute is only read/write if the +.Dv VOL_CAP_INT_VOL_RENAME +bit is set in the volume capabilities (see below). +.Pp +. +.It ATTR_VOL_MOUNTFLAGS +A +.Vt unsigned long +containing the volume mount flags. +This is a copy of the value passed to the +.Fa flags +parameter of +.Xr mount 2 +when the volume was mounted. +Equivalent to the +.Fa f_flags +field of the +.Vt statfs +structure returned by +.Xr statfs 2 . +. +.It ATTR_VOL_MOUNTEDDEVICE +An +.Vt attrreference +structure that returns the same value as the +.Fa f_mntfromname +field of the +.Vt statfs +structure returned by +.Xr statfs 2 . +For local volumes this is the path to the device on which the volume is mounted as a +UTF-8 encoded, null terminated C string. +For network volumes, this is a unique string that identifies the mount. +The attribute data length will not be greater than +.Dv MAXPATHLEN . +.Pp +. +.It ATTR_VOL_ENCODINGSUSED +An +.Vt unsigned long long +containing a bitmap of the text encodings used on this volume. +For more information about this, see the discussion of +.Fa encodingsBitmap +in DTS Technote 1150 "HFS Plus Volume Format". +. +.It ATTR_VOL_CAPABILITIES +A +.Vt vol_capabilities_attr_t +structure describing the optional features supported by this volume. +See below for a discussion of volume capabilities. +. +.It ATTR_VOL_ATTRIBUTES +A +.Vt vol_attributes_attr_t +structure describing the attributes supported by this volume. +This structure is discussed below, along with volume capabilities. +. +.El +. +.Sh DIRECTORY ATTRIBUTES +. 
+The following directory attributes are defined.
+.
+.Bl -tag -width ATTR_VOL_ALLOCATIONCLUMP
+.
+.It ATTR_DIR_LINKCOUNT
+A
+.Vt unsigned long
+containing the number of file system objects in the directory, including
+synthetic items such as "." and "..".
+For historical reasons, you cannot always rely on this value being accurate.
+.Pp
+If you're implementing a volume format on which this is hard to calculate,
+you should not support this attribute.
+While it's traditional to return a constant value of 1 in the
+.Fa st_nlink
+field of the
+.Vt stat
+structure as returned by
+.Xr stat 2 ,
+it's not necessary to do this here because there is a
+defined way to indicate that you do not support the attribute.
+.
+.It ATTR_DIR_ENTRYCOUNT
+A
+.Vt unsigned long
+containing the number of file system objects in the directory, not including
+any synthetic items.
+.
+.It ATTR_DIR_MOUNTSTATUS
+A
+.Vt unsigned long
+containing flags describing what's mounted on the directory.
+Currently the only flag defined is
+.Dv DIR_MNTSTATUS_MNTPOINT ,
+which indicates that there is a file system mounted on this directory.
+Due to a bug (r. 3502822), this flag is never set on current systems.
+.
+.El
+.
+.Sh FILE ATTRIBUTES
+.
+The following file attributes are defined.
+.
+.Bl -tag -width ATTR_VOL_ALLOCATIONCLUMP
+.
+.It ATTR_FILE_LINKCOUNT
+A
+.Vt unsigned long
+containing the number of hard links to this file.
+Equivalent to the
+.Fa st_nlink
+field of the
+.Vt stat
+structure returned by
+.Xr stat 2 .
+.
+.It ATTR_FILE_TOTALSIZE
+An
+.Vt off_t
+containing the total number of bytes in all forks of the file (the logical size).
+.
+.It ATTR_FILE_ALLOCSIZE
+An
+.Vt off_t
+containing a count of the bytes on disk used by all of the file's forks (the physical size).
+.
+.It ATTR_FILE_IOBLOCKSIZE
+A
+.Vt unsigned long
+containing the optimal block size when reading or writing this file's data.
+.
+.It ATTR_FILE_CLUMPSIZE
+A
+.Vt unsigned long
+containing the allocation clump size for this file, in bytes.
+As the file is extended, the file system will attempt to allocate
+this much space each time in order to reduce fragmentation.
+This value applies to the data fork.
+.
+.It ATTR_FILE_DEVTYPE
+(read/write) A
+.Vt unsigned long
+containing the device type for a special device file.
+Equivalent to the
+.Fa st_rdev
+field of the
+.Vt stat
+structure returned by
+.Xr stat 2 .
+.
+.It ATTR_FILE_FILETYPE
+A
+.Vt unsigned long
+whose value is reserved.
+Clients should ignore its value.
+New volume format implementations should not support this attribute.
+.
+.It ATTR_FILE_FORKCOUNT
+A
+.Vt unsigned long
+containing the number of forks in the file.
+No built-in file systems on Mac OS X currently support forks other
+than the data and resource fork.
+.
+.It ATTR_FILE_FORKLIST
+An
+.Vt attrreference
+structure containing a list of named forks of the file.
+No built-in file systems on Mac OS X currently support forks
+other than the data and resource fork.
+Because of this, the structure of this attribute's value is not yet defined.
+.
+.It ATTR_FILE_DATALENGTH
+An
+.Vt off_t
+containing the length of the data fork in bytes (the logical size).
+.
+.It ATTR_FILE_DATAALLOCSIZE
+An
+.Vt off_t
+containing a count of the bytes on disk used by the data fork (the physical size).
+.
+.It ATTR_FILE_DATAEXTENTS
+An
+.Vt extentrecord
+array for the data fork.
+The array contains eight
+.Vt diskextent
+structures which represent the first
+eight extents of the fork.
+.Pp
+This attribute exists for compatibility reasons.
+New clients should not use this attribute. +Rather, they should use the +.Dv F_LOG2PHYS +command in +.Xr fcntl 2 . +.Pp +. +In current implementations the value may not be entirely accurate for +a variety of reasons. +. +.It ATTR_FILE_RSRCLENGTH +An +.Vt off_t +containing the length of the resource fork in bytes (the logical size). +. +.It ATTR_FILE_RSRCALLOCSIZE +An +.Vt off_t +containing a count of the bytes on disk used by the resource fork (the physical size). +. +.It ATTR_FILE_RSRCEXTENTS +An +.Vt extentrecord +array for the resource fork. +The array contains eight +.Vt diskextent +structures which represent the first +eight extents of the fork. +.Pp +See also +.Dv ATTR_FILE_DATAEXTENTS . +. +.El +. +.Sh FORK ATTRIBUTES +. +Fork attributes relate to the actual data in the file, +which can be held in multiple named contiguous ranges, or forks. +The following fork attributes are defined. +. +.Bl -tag -width ATTR_VOL_ALLOCATIONCLUMP +. +.It ATTR_FORK_TOTALSIZE +An +.Vt off_t +containing the length of the fork in bytes (the logical size). +. +.It ATTR_FORK_ALLOCSIZE +An +.Vt off_t +containing a count of the bytes on disk used by the fork (the physical size). +. +.El +.Pp +. +Fork attributes are not properly implemented by any current Mac OS X +volume format implementation. +We strongly recommend that client programs do not request fork attributes. +If you are implementing a volume format, you should not support these attributes. +. +.Sh VOLUME CAPABILITIES +. +.\" vol_capabilities_attr_t +. +Not all volumes support all features. The +.Dv ATTR_VOL_CAPABILITIES +attribute returns a +.Vt vol_capabilities_attr_t +structure (shown below) that indicates which features are supported by the volume. +. +.Bd -literal +typedef u_int32_t vol_capabilities_set_t[4]; +.Pp +. +#define VOL_CAPABILITIES_FORMAT 0 +#define VOL_CAPABILITIES_INTERFACES 1 +#define VOL_CAPABILITIES_RESERVED1 2 +#define VOL_CAPABILITIES_RESERVED2 3 +.Pp +. +typedef struct vol_capabilities_attr { + vol_capabilities_set_t capabilities; + vol_capabilities_set_t valid; +} vol_capabilities_attr_t; +.Ed +.Pp +. +The structure contains two fields, +.Fa capabilities +and +.Fa valid . +Each consists of an array of four elements. +The arrays are indexed by the following values. +. +.Bl -tag -width VOL_CAP_FMT_PERSISTENTOBJECTIDS +. +.It VOL_CAPABILITIES_FORMAT +This element contains information about the volume format. +See +.Dv VOL_CAP_FMT_PERSISTENTOBJECTIDS +and so on, below. +. +.It VOL_CAPABILITIES_INTERFACES +This element contains information about which optional functions are +supported by the volume format implementation. +See +.Dv VOL_CAP_INT_SEARCHFS +and so on, below. +. +.It VOL_CAPABILITIES_RESERVED1 +Reserved. +A file system implementation should set this element to zero. +A client program should ignore this element. +. +.It VOL_CAPABILITIES_RESERVED2 +Reserved. +A file system implementation should set this element to zero. +A client program should ignore this element. +. +.El +.Pp +. +The +.Fa valid +field contains bit sets that indicate which flags are known to the volume format +implementation. +Each bit indicates whether the contents of the corresponding bit in the +.Fa capabilities +field is valid. +.Pp +. +The +.Fa capabilities +field contains bit sets that indicate whether a particular feature is implemented +by this volume format. +.Pp +. 
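+As an illustration, here is a sketch of one way a client might test a
+single capability bit, in this case
+.Dv VOL_CAP_INT_SEARCHFS
+(described below).
+This fragment is not part of the interface definition, and the helper
+function name VolSupportsSearchFS is hypothetical; the key point is that
+a bit in the
+.Fa capabilities
+field is trusted only after confirming that the same bit is set in the
+.Fa valid
+field.
+.Bd -literal
+#include <string.h>
+#include <sys/attr.h>
+#include <unistd.h>
+.Pp
+struct CapAttrBuf {
+    unsigned long           length;
+    vol_capabilities_attr_t caps;
+};
+typedef struct CapAttrBuf CapAttrBuf;
+.Pp
+// Returns 1 if the volume rooted at volPath supports searchfs(2),
+// 0 if it does not (or if the implementation does not say), and
+// -1 (with errno set) if the capabilities could not be fetched.
+// volPath must reference the root of the volume.
+static int VolSupportsSearchFS(const char *volPath)
+{
+    struct attrlist attrList;
+    CapAttrBuf      attrBuf;
+.Pp
+    memset(&attrList, 0, sizeof(attrList));
+    attrList.bitmapcount = ATTR_BIT_MAP_COUNT;
+    attrList.volattr     = ATTR_VOL_INFO | ATTR_VOL_CAPABILITIES;
+.Pp
+    if ( getattrlist(volPath, &attrList, &attrBuf,
+                     sizeof(attrBuf), 0) < 0 ) {
+        return -1;
+    }
+.Pp
+    // Trust a capabilities bit only if the corresponding valid
+    // bit is set.
+    if ( attrBuf.caps.valid[VOL_CAPABILITIES_INTERFACES]
+            & VOL_CAP_INT_SEARCHFS ) {
+        return ( attrBuf.caps.capabilities[VOL_CAPABILITIES_INTERFACES]
+            & VOL_CAP_INT_SEARCHFS ) != 0;
+    }
+    return 0;
+}
+.Ed
+.Pp
+.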
+The following bits are defined in the first element (indexed by +.Dv VOL_CAPABILITIES_FORMAT ) +of the +.Fa capabilities +and +.Fa valid +fields of the +.Vt vol_capabilities_attr_t +structure. +. +.Bl -tag -width VOL_CAP_FMT_PERSISTENTOBJECTIDS +. +.It VOL_CAP_FMT_PERSISTENTOBJECTIDS +If this bit is set the volume format supports persistent object identifiers +and can look up file system objects by their IDs. +See +.Dv ATTR_CMN_OBJPERMANENTID +for details about how to obtain these identifiers. +. +.It VOL_CAP_FMT_SYMBOLICLINKS +If this bit is set the volume format supports symbolic links. +. +.It VOL_CAP_FMT_HARDLINKS +If this bit is set the volume format supports hard links. +. +.It VOL_CAP_FMT_JOURNAL +If this bit is set the volume format supports a journal used to +speed recovery in case of unplanned restart (such as a power outage +or crash). +This does not necessarily mean the volume is actively using a journal. +.Pp +Introduced with Darwin 7.0 (Mac OS X version 10.3). +. +.It VOL_CAP_FMT_JOURNAL_ACTIVE +If this bit is set the volume is currently using a journal for +speedy recovery after an unplanned restart. +This bit can be set only if +.Dv VOL_CAP_FMT_JOURNAL +is also set. +.Pp +Introduced with Darwin 7.0 (Mac OS X version 10.3). +. +.It VOL_CAP_FMT_NO_ROOT_TIMES +If this bit is set the volume format does not store reliable times for +the root directory, so you should not depend on them to detect changes, +identify volumes across unmount/mount, and so on. +.Pp +Introduced with Darwin 7.0 (Mac OS X version 10.3). +. +.It VOL_CAP_FMT_SPARSE_FILES +If this bit is set the volume format supports sparse files, +that is, files which can have 'holes' that have never been written +to, and thus do not consume space on disk. +A sparse file may have an allocated size on disk that is less than its logical length (that is, +.Dv ATTR_FILE_ALLOCSIZE +< +.Dv ATTR_FILE_TOTALSIZE ). +. +.Pp +Introduced with Darwin 7.0 (Mac OS X version 10.3). +. +.It VOL_CAP_FMT_ZERO_RUNS +For security reasons, parts of a file (runs) that have never been +written to must appear to contain zeroes. +When this bit is set, the volume keeps track of allocated but unwritten +runs of a file so that it can substitute zeroes without actually +writing zeroes to the media. +This provides performance similar to sparse files, but not the space savings. +.Pp +Introduced with Darwin 7.0 (Mac OS X version 10.3). +. +.It VOL_CAP_FMT_CASE_SENSITIVE +If this bit is set the volume format treats upper and lower case +characters in file and directory names as different. +Otherwise an upper case character is equivalent to a lower case character, +and you can't have two names that differ solely in the case of +the characters. +.Pp +Introduced with Darwin 7.0 (Mac OS X version 10.3). +. +.It VOL_CAP_FMT_CASE_PRESERVING +If this bit is set the volume format preserves the case of +file and directory names. +Otherwise the volume may change the case of some characters +(typically making them all upper or all lower case). +A volume that sets +.Dv VOL_CAP_FMT_CASE_SENSITIVE +must also set +.Dv VOL_CAP_FMT_CASE_PRESERVING . +.Pp +Introduced with Darwin 7.0 (Mac OS X version 10.3). +. +.It VOL_CAP_FMT_FAST_STATFS +This bit is used as a hint to upper layers (specifically the Carbon File Manager) to +indicate that +.Xr statfs 2 +is fast enough that its results need not be cached by the caller. +A volume format implementation that caches the +.Xr statfs 2 +information in memory should set this bit. 
+An implementation that must always read from disk or always perform a network
+transaction to satisfy
+.Xr statfs 2
+should not set this bit.
+.Pp
+Introduced with Darwin 7.0 (Mac OS X version 10.3).
+.
+.It VOL_CAP_FMT_2TB_FILESIZE
+If this bit is set the volume format supports file sizes up to 2TB.
+This does not imply that files larger than 2TB are unsupported,
+nor does it imply that 2TB of space is currently available on the volume.
+.Pp
+Introduced with Darwin 8.0 (Mac OS X version 10.4).
+.
+.El
+.Pp
+.
+The following bits are defined in the second element (indexed by
+.Dv VOL_CAPABILITIES_INTERFACES )
+of the
+.Fa capabilities
+and
+.Fa valid
+fields of the
+.Vt vol_capabilities_attr_t
+structure.
+.
+.Bl -tag -width VOL_CAP_FMT_PERSISTENTOBJECTIDS
+.
+.It VOL_CAP_INT_SEARCHFS
+If this bit is set the volume format implementation supports
+.Xr searchfs 2 .
+.
+.It VOL_CAP_INT_ATTRLIST
+If this bit is set the volume format implementation supports
+.Fn getattrlist
+and
+.Xr setattrlist 2 .
+.
+.It VOL_CAP_INT_NFSEXPORT
+If this bit is set the volume format implementation allows this volume to be exported via NFS.
+.
+.It VOL_CAP_INT_READDIRATTR
+If this bit is set the volume format implementation supports
+.Xr getdirentriesattr 2 .
+.
+.It VOL_CAP_INT_EXCHANGEDATA
+If this bit is set the volume format implementation supports
+.Xr exchangedata 2 .
+.Pp
+Introduced with Darwin 7.0 (Mac OS X version 10.3).
+.
+.It VOL_CAP_INT_COPYFILE
+If this bit is set the volume format implementation supports the (private and undocumented)
+copyfile() function.
+.Pp
+Introduced with Darwin 7.0 (Mac OS X version 10.3).
+.
+.It VOL_CAP_INT_ALLOCATE
+If this bit is set the volume format implementation supports the
+.Dv F_PREALLOCATE
+selector of
+.Xr fcntl 2 .
+.Pp
+Introduced with Darwin 7.0 (Mac OS X version 10.3).
+.
+.It VOL_CAP_INT_VOL_RENAME
+If this bit is set the volume format implementation allows you to
+modify the volume name using
+.Xr setattrlist 2 .
+.Pp
+Introduced with Darwin 7.0 (Mac OS X version 10.3).
+.
+.It VOL_CAP_INT_ADVLOCK
+If this bit is set the volume format implementation supports
+advisory locking, that is, the
+.Dv F_GETLK ,
+.Dv F_SETLK ,
+and
+.Dv F_SETLKW
+selectors to
+.Xr fcntl 2 .
+.Pp
+Introduced with Darwin 7.0 (Mac OS X version 10.3).
+.
+.It VOL_CAP_INT_FLOCK
+If this bit is set the volume format implementation supports
+whole file locks.
+This includes
+.Xr flock 2
+and the
+.Dv O_EXLOCK
+and
+.Dv O_SHLOCK
+flags to
+.Xr open 2 .
+.Pp
+Introduced with Darwin 7.0 (Mac OS X version 10.3).
+.
+.El
+.Pp
+.
+.\" vol_attributes_attr_t
+.
+A volume can also report which attributes it supports.
+This information is returned by the
+.Dv ATTR_VOL_ATTRIBUTES
+attribute, which returns a
+.Vt vol_attributes_attr_t
+structure (shown below).
+.
+.Bd -literal
+typedef struct attribute_set {
+    attrgroup_t commonattr;  /* common attribute group */
+    attrgroup_t volattr;     /* volume attribute group */
+    attrgroup_t dirattr;     /* directory attribute group */
+    attrgroup_t fileattr;    /* file attribute group */
+    attrgroup_t forkattr;    /* fork attribute group */
+} attribute_set_t;
+.Pp
+.
+typedef struct vol_attributes_attr {
+    attribute_set_t validattr;
+    attribute_set_t nativeattr;
+} vol_attributes_attr_t;
+.Ed
+.Pp
+.
+The
+.Fa validattr
+field consists of a number of bit sets that indicate whether an attribute is
+supported by the volume format implementation.
+The
+.Fa nativeattr
+field is similar except that the bit sets indicate whether an attribute is supported
+natively by the volume format.
+An attribute is supported natively if the volume format implementation does not have to do
+any complex conversions to access the attribute.
+For example, a volume format might support persistent object identifiers, but
+doing so requires a complex table lookup that is not part of the core volume
+format.
+In that case, the
+.Dv ATTR_VOL_ATTRIBUTES
+attribute would return
+.Dv ATTR_CMN_OBJPERMANENTID
+set in the
+.Fa validattr
+field of the
+.Vt vol_attributes_attr_t ,
+but not in the
+.Fa nativeattr
+field.
+.
+.Sh RETURN VALUES
+Upon successful completion a value of 0 is returned.
+Otherwise, a value of -1 is returned and
+.Va errno
+is set to indicate the error.
+.
+.Sh COMPATIBILITY
+Not all volumes support
+.Fn getattrlist .
+The best way to test whether a volume supports this function is to
+simply call it and check the error result.
+.Fn getattrlist
+will return
+.Dv ENOTSUP
+if it is not supported on a particular volume.
+.Pp
+.
+The
+.Fn getattrlist
+function has been undocumented for more than two years.
+In that time a number of volume format implementations have been created without
+a proper specification for the behaviour of this routine.
+You may encounter volume format implementations with slightly different
+behaviour than what is described here.
+Your program is expected to be tolerant of this variant behaviour.
+.Pp
+.
+If you're implementing a volume format that supports
+.Fn getattrlist ,
+you should be careful to support the behaviour specified by this document.
+.
+.Sh ERRORS
+.Fn getattrlist
+will fail if:
+.Bl -tag -width Er
+.
+.It Bq Er ENOTSUP
+The volume does not support
+.Fn getattrlist .
+.
+.It Bq Er ENOTDIR
+A component of the path prefix is not a directory.
+.
+.It Bq Er ENAMETOOLONG
+A component of a path name exceeded
+.Dv NAME_MAX
+characters, or an entire path name exceeded
+.Dv PATH_MAX
+characters.
+.
+.It Bq Er ENOENT
+The file system object does not exist.
+.
+.It Bq Er EACCES
+Search permission is denied for a component of the path prefix.
+.
+.It Bq Er ELOOP
+Too many symbolic links were encountered in translating the pathname.
+.
+.It Bq Er EFAULT
+.Fa path ,
+.Fa attrList ,
+or
+.Fa attrBuf
+points to an invalid address.
+.
+.It Bq Er EINVAL
+The
+.Fa bitmapcount
+field of
+.Fa attrList
+is not
+.Dv ATTR_BIT_MAP_COUNT .
+.
+.It Bq Er EINVAL
+You requested an invalid attribute.
+.
+.It Bq Er EINVAL
+You requested an attribute that is not supported for this file system object.
+.
+.It Bq Er EINVAL
+You requested volume attributes and directory or file attributes.
+.
+.It Bq Er EINVAL
+You requested volume attributes but
+.Fa path
+does not reference the root of the volume.
+.
+.It Bq Er EROFS
+The volume is read-only but must be modified in order to return this attribute.
+.
+.It Bq Er EIO
+An I/O error occurred while reading from or writing to the file system.
+.El
+.Pp
+.
+.Sh CAVEATS
+.
+If you request any volume attributes, you must set
+.Dv ATTR_VOL_INFO
+in the
+.Fa volattr
+field, even though it generates no result in the attribute buffer.
+.Pp
+.
+The order that attributes are stored in the attribute buffer almost
+invariably matches the order of attribute mask bit values.
+For example,
+.Dv ATTR_CMN_NAME
+(0x00000001) comes before
+.Dv ATTR_CMN_DEVID
+(0x00000002) because its value is smaller.
+However, you can not rely on this ordering because there is one key exception: +.Dv ATTR_CMN_FLAGS +is placed after the +.Dv ATTR_CMN_NAMEDATTRCOUNT +/ +.Dv ATTR_CMN_NAMEDATTRLIST +pair, even though its bit position indicates that it should come before. +This is due to a bug in an early version of Mac OS X that can't be fixed for +binary compatibility reasons. +When ordering attributes, you should always use the order in which they +are described above. +.Pp +. +For more caveats, see also the compatibility notes above. +. +.Sh EXAMPLES +. +The following code prints the file type and creator of a file, +assuming that the volume supports the required attributes. +. +.Bd -literal +#include <assert.h> +#include <stdio.h> +#include <string.h> +#include <sys/attr.h> +#include <sys/errno.h> +#include <unistd.h> +#include <sys/vnode.h> +.Pp +. +typedef struct attrlist attrlist_t; +.Pp +. +struct FInfoAttrBuf { + unsigned long length; + fsobj_type_t objType; + char finderInfo[32]; +}; +typedef struct FInfoAttrBuf FInfoAttrBuf; +.Pp +. +static int FInfoDemo(const char *path) +{ + int err; + attrlist_t attrList; + FInfoAttrBuf attrBuf; +.Pp +. + memset(&attrList, 0, sizeof(attrList)); + attrList.bitmapcount = ATTR_BIT_MAP_COUNT; + attrList.commonattr = ATTR_CMN_OBJTYPE | ATTR_CMN_FNDRINFO; +.Pp + + err = getattrlist(path, &attrList, &attrBuf, sizeof(attrBuf), 0); + if (err != 0) { + err = errno; + } +.Pp + + if (err == 0) { + assert(attrBuf.length == sizeof(attrBuf)); +.Pp + + printf("Finder information for %s:\en", path); + switch (attrBuf.objType) { + case VREG: + printf("file type = '%.4s'\en", &attrBuf.finderInfo[0]); + printf("file creator = '%.4s'\en", &attrBuf.finderInfo[4]); + break; + case VDIR: + printf("directory\en"); + break; + default: + printf("other object type, %d\en", attrBuf.objType); + break; + } + } +.Pp +. + return err; +} +.Ed +.Pp +. +The following code is an alternative implementation that uses nested structures +to group the related attributes. +. +.Bd -literal +#include <assert.h> +#include <stdio.h> +#include <stddef.h> +#include <string.h> +#include <sys/attr.h> +#include <sys/errno.h> +#include <unistd.h> +#include <sys/vnode.h> +.Pp +. +typedef struct attrlist attrlist_t; +.Pp +. +struct FInfo2CommonAttrBuf { + fsobj_type_t objType; + char finderInfo[32]; +}; +typedef struct FInfo2CommonAttrBuf FInfo2CommonAttrBuf; +.Pp +. +struct FInfo2AttrBuf { + unsigned long length; + FInfo2CommonAttrBuf common; +}; +typedef struct FInfo2AttrBuf FInfo2AttrBuf; +.Pp +. +static int FInfo2Demo(const char *path) +{ + int err; + attrlist_t attrList; + FInfo2AttrBuf attrBuf; +.Pp +. + memset(&attrList, 0, sizeof(attrList)); + attrList.bitmapcount = ATTR_BIT_MAP_COUNT; + attrList.commonattr = ATTR_CMN_OBJTYPE | ATTR_CMN_FNDRINFO; +.Pp +. + err = getattrlist(path, &attrList, &attrBuf, sizeof(attrBuf), 0); + if (err != 0) { + err = errno; + } +.Pp +. + if (err == 0) { + assert(attrBuf.length == sizeof(attrBuf)); +.Pp +. + printf("Finder information for %s:\en", path); + switch (attrBuf.common.objType) { + case VREG: + printf( + "file type = '%.4s'\en", + &attrBuf.common.finderInfo[0] + ); + printf( + "file creator = '%.4s'\en", + &attrBuf.common.finderInfo[4] + ); + break; + case VDIR: + printf("directory\en"); + break; + default: + printf( + "other object type, %d\en", + attrBuf.common.objType + ); + break; + } + } +.Pp +. + return err; +} +.Ed +.Pp +. +The following example shows how to deal with variable length attributes. 
+It assumes that the volume specified by +.Fa path +supports the necessary attributes. +. +.Bd -literal +#include <assert.h> +#include <stdio.h> +#include <stddef.h> +#include <string.h> +#include <sys/attr.h> +#include <sys/errno.h> +#include <unistd.h> +#include <sys/vnode.h> +.Pp +. +typedef struct attrlist attrlist_t; +.Pp +. +struct VolAttrBuf { + unsigned long length; + unsigned long fileCount; + unsigned long dirCount; + attrreference_t mountPointRef; + attrreference_t volNameRef; + char mountPointSpace[MAXPATHLEN]; + char volNameSpace[MAXPATHLEN]; +}; +typedef struct VolAttrBuf VolAttrBuf; +.Pp +. +static int VolDemo(const char *path) +{ + int err; + attrlist_t attrList; + VolAttrBuf attrBuf; +.Pp +. + memset(&attrList, 0, sizeof(attrList)); + attrList.bitmapcount = ATTR_BIT_MAP_COUNT; + attrList.volattr = ATTR_VOL_INFO + | ATTR_VOL_FILECOUNT + | ATTR_VOL_DIRCOUNT + | ATTR_VOL_MOUNTPOINT + | ATTR_VOL_NAME; +.Pp + + err = getattrlist(path, &attrList, &attrBuf, sizeof(attrBuf), 0); + if (err != 0) { + err = errno; + } +.Pp + + if (err == 0) { + assert(attrBuf.length > offsetof(VolAttrBuf, mountPointSpace)); + assert(attrBuf.length <= sizeof(attrBuf)); +.Pp + + printf("Volume information for %s:\en", path); + printf("ATTR_VOL_FILECOUNT: %lu\en", attrBuf.fileCount); + printf("ATTR_VOL_DIRCOUNT: %lu\en", attrBuf.dirCount); + printf( + "ATTR_VOL_MOUNTPOINT: %.*s\en", + (int) attrBuf.mountPointRef.attr_length, + ( ((char *) &attrBuf.mountPointRef) + + attrBuf.mountPointRef.attr_dataoffset ) + ); + printf( + "ATTR_VOL_NAME: %.*s\en", + (int) attrBuf.volNameRef.attr_length, + ( ((char *) &attrBuf.volNameRef) + + attrBuf.volNameRef.attr_dataoffset ) + ); + } +.Pp +. + return err; +} +.Ed +.Pp +. +.Sh SEE ALSO +. +.Xr access 2 , +.Xr chflags 2 , +.Xr exchangedata 2 , +.Xr fcntl 2 , +.Xr getdirentriesattr 2 , +.Xr mount 2 , +.Xr searchfs 2 , +.Xr setattrlist 2 , +.Xr stat 2 , +.Xr statfs 2 +. +.Sh HISTORY +A +.Fn getattrlist +function call appeared in Darwin 1.3.1 (Mac OS X version 10.0). +. diff --git a/bsd/man/man2/getdirentriesattr.2 b/bsd/man/man2/getdirentriesattr.2 new file mode 100644 index 000000000..9c59e22ae --- /dev/null +++ b/bsd/man/man2/getdirentriesattr.2 @@ -0,0 +1,427 @@ +.\" Copyright (c) 2003 Apple Computer, Inc. All rights reserved. +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. +.\" +.\" @(#)getdirentriesattr.2 +. 
+.Dd December 15, 2003 +.Dt GETDIRENTRIESATTR 2 +.Os Darwin +.Sh NAME +.Nm getdirentriesattr +.Nd get file system attributes for multiple directory entries +.Sh SYNOPSIS +.Fd #include <sys/attr.h> +.Fd #include <unistd.h> +.Ft int +.Fn getdirentriesattr "int fd" "struct attrlist * attrList" "void * attrBuf" "size_t attrBufSize" "unsigned long * count" "unsigned long * basep" "unsigned long * newState" "unsigned long options" +. +. +.Sh DESCRIPTION +The +.Fn getdirentriesattr +function reads directory entries and returns their attributes (that is, metadata). +You can think of it as a combination of +.Xr getdirentries 2 +and +.Xr getattrlist 2 . +The function reads directory entries from the directory referenced by the +file descriptor +.Fa fd . +Attributes of those directory entries are placed into the buffer specified by +.Fa attrBuf +and +.Fa attrBufSize . +The +.Fa attrList +parameter determines what attributes are returned for each entry. +The +.Fa count +parameter contains the number of directory entries requested and returned. +The +.Fa basep +parameter returns the directory offset in a manner similar to +.Xr getdirentries 2 . +The +.Fa newState +parameter allows you to check whether the directory has been modified while +you were reading it. +The +.Fa options +parameter lets you control specific aspects of the function's behaviour. +.Pp +. +The +.Fn getdirentriesattr +function is only supported by certain volume format implementations. +For maximum compatibility, client programs should use high-level APIs +(such as the Carbon File Manager) to access file system attributes. +These high-level APIs include logic to emulate file system attributes +on volumes that don't support +.Fn getdirentriesattr . +.Pp +. +.\" fd parameter +. +The +.Fa fd +parameter must be a file descriptor that references a directory that you have opened for reading. +.Pp +. +.\" attrList parameter +. +The +.Fa attrList +parameter is a pointer to an +.Vt attrlist +structure. +You are responsible for filling out all fields of this structure before calling the function. +See the discussion of the +.Xr getattrlist 2 +function for a detailed description of this structure. +To get an attribute you must set the corresponding bit in the appropriate +.Vt attrgroup_t +field of the +.Vt attrlist +structure. +You must not request volume attributes. +.Pp +. +.\" attrBuf and attrBufSize parameters +. +The +.Fa attrBuf +and +.Fa attrBufSize +parameters specify a buffer into which the function places attribute values. +The attributes for any given directory entry are grouped together and +packed in exactly the same way as they are returned from +.Xr getattrlist 2 . +These groups are then placed into the buffer, one after another. +As each group starts with a leading +.Vt unsigned long +that contains the +overall length of the group, you can step from one group to the next +by simply adding this length to your pointer. +The sample code (below) shows how to do this. +The initial contents of this buffer are ignored. +.Pp +. +.\" count parameter +. +The +.Fa count +parameter points to a +.Vt unsigned long +variable. +You should initialise this variable to be the number of directory entries for which +you wish to get attributes. +On return, this variable contains the number of directory entries whose attributes +have been placed into the attribute buffer. +This may be smaller than the number that you requested. +.Pp +. 
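+For example, the following minimal sketch requests at most ten entries
+per call; it assumes that
+.Fa attrList ,
+.Fa attrBuf ,
+and the directory descriptor dirFD have already been set up, as in the
+EXAMPLES section below.
+.Bd -literal
+unsigned long count;
+unsigned long base;
+unsigned long newState;
+int           err;
+.Pp
+count = 10;    // in: the number of entries requested
+err = getdirentriesattr(dirFD, &attrList, attrBuf, sizeof(attrBuf),
+                        &count, &base, &newState, 0);
+// On success, count holds the number of entries actually returned,
+// which may be smaller than ten.
+.Ed
+.Pp
+.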
+.\" basep parameter
+The
+.Fa basep
+parameter returns the offset of the last directory entry read, in a
+manner identical to
+.Xr getdirentries 2 .
+You can use this value to reset a directory iteration to a known position
+using
+.Xr lseek 2 .
+The initial value of the variable is ignored.
+.Pp
+.
+.\" newState parameter
+.
+The
+.Fa newState
+parameter returns a value that changes if the directory has been modified.
+If you're iterating through the directory by making repeated calls to
+.Fn getdirentriesattr ,
+you can compare subsequent values of
+.Fa newState
+to determine whether the directory has been modified (and thus restart
+your iteration at the beginning).
+The initial value of the variable is ignored.
+.Pp
+.
+.\" options parameter
+.
+The
+.Fa options
+parameter is a bit set that controls the behaviour of
+.Fn getdirentriesattr .
+The following option bits are defined.
+.
+.Bl -tag -width FSOPT_NOINMEMUPDATE
+.
+.It FSOPT_NOINMEMUPDATE
+This tells
+.Fn getdirentriesattr
+to return the directory entries from disk rather than taking the extra step of
+looking at in-memory data structures, which may contain changes that haven't
+been flushed to disk.
+.Pp
+This option allowed particular clients on older systems to make specific
+performance optimizations.
+We currently recommend that clients not set this option and that file system
+implementations ignore it.
+.
+.El
+.Pp
+It is typical to ask for a combination of common, file, and directory
+attributes and then use the value of the
+.Dv ATTR_CMN_OBJTYPE
+attribute to parse the resulting attribute buffer.
+.
+.Sh RETURN VALUES
+Upon successful completion a value of 0 or 1 is returned.
+The value 0 indicates that the routine completed successfully.
+The value 1 indicates that the routine completed successfully and has
+returned the last entry in the directory.
+On error, a value of -1 is returned and
+.Va errno
+is set to indicate the error.
+.
+.Sh COMPATIBILITY
+Not all volumes support
+.Fn getdirentriesattr .
+You can test whether a volume supports
+.Fn getdirentriesattr
+by using
+.Xr getattrlist 2
+to get the volume capabilities attribute
+.Dv ATTR_VOL_CAPABILITIES ,
+and then testing the
+.Dv VOL_CAP_INT_READDIRATTR
+flag.
+.Pp
+.
+The
+.Fn getdirentriesattr
+function has been undocumented for more than two years.
+In that time a number of volume format implementations have been created without
+a proper specification for the behaviour of this routine.
+You may encounter volume format implementations with slightly different
+behaviour than what is described here.
+Your program is expected to be tolerant of this variant behaviour.
+.Pp
+.
+If you're implementing a volume format that supports
+.Fn getdirentriesattr ,
+you should be careful to support the behaviour specified by this document.
+.
+.Sh ERRORS
+.Fn getdirentriesattr
+will fail if:
+.Bl -tag -width Er
+.
+.It Bq Er ENOTSUP
+The volume does not support
+.Fn getdirentriesattr .
+.
+.It Bq Er EBADF
+.Fa fd
+is not a valid file descriptor for a directory open for reading.
+.
+.It Bq Er EFAULT
+.Fa attrList
+or
+.Fa attrBuf
+points to an invalid address.
+.
+.It Bq Er EINVAL
+The
+.Fa bitmapcount
+field of
+.Fa attrList
+is not
+.Dv ATTR_BIT_MAP_COUNT .
+.
+.It Bq Er EINVAL
+You requested an invalid attribute.
+.
+.It Bq Er EINVAL
+You requested volume attributes.
+.
+.It Bq Er EINVAL
+The
+.Fa options
+parameter contains an invalid flag.
+.
+.It Bq Er EIO
+An I/O error occurred while reading from or writing to the file system.
+.El
+.Pp
+.
+.Sh EXAMPLES
+.
+The following code lists the contents of a directory using
+.Fn getdirentriesattr .
+The listing includes the file type and creator for files.
+.
+.Bd -literal
+#include <assert.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/attr.h>
+#include <sys/errno.h>
+#include <unistd.h>
+#include <sys/vnode.h>
+#include <stdbool.h>
+#include <fcntl.h>
+.Pp
+.
+typedef struct attrlist attrlist_t;
+.Pp
+.
+struct FInfoAttrBuf {
+    unsigned long   length;
+    attrreference_t name;
+    fsobj_type_t    objType;
+    char            finderInfo[32];
+};
+typedef struct FInfoAttrBuf FInfoAttrBuf;
+.Pp
+.
+enum {
+    kEntriesPerCall = 10
+};
+.Pp
+.
+static int FInfoDemo(const char *dirPath)
+{
+    int             err;
+    int             junk;
+    int             dirFD;
+    attrlist_t      attrList;
+    unsigned long   index;
+    unsigned long   count;
+    unsigned long   junkBaseP;
+    bool            oldStateValid;
+    unsigned long   oldState;
+    unsigned long   newState;
+    bool            done;
+    FInfoAttrBuf *  thisEntry;
+    char            attrBuf[kEntriesPerCall * (sizeof(FInfoAttrBuf) + 64)];
+.Pp
+.
+    // attrBuf is big enough for kEntriesPerCall entries, assuming that
+    // the average name length is less than 64.
+.Pp
+.
+    memset(&attrList, 0, sizeof(attrList));
+    attrList.bitmapcount = ATTR_BIT_MAP_COUNT;
+    attrList.commonattr  = ATTR_CMN_NAME
+                         | ATTR_CMN_OBJTYPE
+                         | ATTR_CMN_FNDRINFO;
+.Pp
+
+    err = 0;
+    dirFD = open(dirPath, O_RDONLY, 0);
+    if (dirFD < 0) {
+        err = errno;
+    }
+    if (err == 0) {
+        oldStateValid = false;
+        done = false;
+        do {
+            count = kEntriesPerCall;
+.Pp
+            err = getdirentriesattr(
+                dirFD,
+                &attrList,
+                &attrBuf,
+                sizeof(attrBuf),
+                &count,
+                &junkBaseP,
+                &newState,
+                0
+            );
+            if (err < 0) {
+                err = errno;
+            } else {
+                done = err;
+                err = 0;
+            }
+.Pp
+            if (err == 0) {
+                if (oldStateValid) {
+                    if (newState != oldState) {
+                        printf("*** Directory has changed\en");
+                        oldState = newState;
+                    }
+                } else {
+                    oldState = newState;
+                    oldStateValid = true;
+                }
+.Pp
+                thisEntry = (FInfoAttrBuf *) attrBuf;
+.Pp
+                for (index = 0; index < count; index++) {
+                    switch (thisEntry->objType) {
+                        case VREG:
+                            printf(
+                                "'%4.4s' '%4.4s' ",
+                                &thisEntry->finderInfo[0],
+                                &thisEntry->finderInfo[4]
+                            );
+                            break;
+                        case VDIR:
+                            printf("directory ");
+                            break;
+                        default:
+                            printf(
+                                "objType = %-2d ",
+                                thisEntry->objType
+                            );
+                            break;
+                    }
+                    printf(
+                        "%s\en",
+                        ((char *) &thisEntry->name)
+                            + thisEntry->name.attr_dataoffset
+                    );
+.Pp
+                    // Advance to the next entry.
+.Pp
+                    thisEntry = (FInfoAttrBuf *)
+                        (((char *) thisEntry) + thisEntry->length);
+                }
+            }
+        } while ( err == 0 && ! done );
+    }
+.Pp
+    if (dirFD != -1) {
+        junk = close(dirFD);
+        assert(junk == 0);
+    }
+.Pp
+    return err;
+}
+.Ed
+.Pp
+.
+.Sh SEE ALSO
+.
+.Xr getattrlist 2 ,
+.Xr getdirentries 2 ,
+.Xr lseek 2
+.
+.Sh HISTORY
+A
+.Fn getdirentriesattr
+function call appeared in Darwin 1.3.1 (Mac OS X version 10.0).
+.
diff --git a/bsd/man/man2/getfsstat.2 b/bsd/man/man2/getfsstat.2
index 525f6a04f..cdc2e2586 100644
--- a/bsd/man/man2/getfsstat.2
+++ b/bsd/man/man2/getfsstat.2
@@ -44,7 +44,7 @@
 .Fd #include <sys/ucred.h>
 .Fd #include <sys/mount.h>
 .Ft int
-.Fn getfsstat "struct statfs *buf" "long bufsize" "int flags"
+.Fn getfsstat "struct statfs *buf" "int bufsize" "int flags"
 .Sh DESCRIPTION
 .Fn Getfsstat
 returns information about all mounted file systems.
@@ -95,21 +95,22 @@ is given as NULL,
 .Fn getfsstat
 returns just the number of mounted file systems.
 .Pp
-Normally
-.Fa flags
-should be specified as
-.Dv MNT_WAIT .
 If
 .Fa flags
 is set to
 .Dv MNT_NOWAIT ,
 .Fn getfsstat
-will return the information it has available without requesting
-an update from each file system.
-Thus, some of the information will be out of date, but +will directly return the information retained in the kernel +to avoid delays caused by waiting for updated information from +a file system that is perhaps temporarily unable to respond. +Some of the information returned may be out of date, however; if +.Fa flags +is set to +.Dv MNT_WAIT +instead, .Fn getfsstat -will not block waiting for information from a file system that is -unable to respond. +will request updated information from each mounted filesystem before +returning. .Sh RETURN VALUES Upon successful completion, the number of .Fa statfs diff --git a/bsd/man/man2/getpeername.2 b/bsd/man/man2/getpeername.2 index 6d5738014..3fe1c98d4 100644 --- a/bsd/man/man2/getpeername.2 +++ b/bsd/man/man2/getpeername.2 @@ -42,7 +42,7 @@ .Sh SYNOPSIS .Fd #include <sys/socket.h> .Ft int -.Fn getpeername "int s" "struct sockaddr *name" "int *namelen" +.Fn getpeername "int s" "struct sockaddr *name" "socklen_t *namelen" .Sh DESCRIPTION .Fn Getpeername returns the name of the peer connected to diff --git a/bsd/man/man2/getsockname.2 b/bsd/man/man2/getsockname.2 index 4582a3ea5..2d63acb38 100644 --- a/bsd/man/man2/getsockname.2 +++ b/bsd/man/man2/getsockname.2 @@ -42,7 +42,7 @@ .Sh SYNOPSIS .Fd #include <sys/socket.h> .Ft int -.Fn getsockname "int s" "struct sockaddr *name" "int *namelen" +.Fn getsockname "int s" "struct sockaddr *name" "socklen_t *namelen" .Sh DESCRIPTION .Fn Getsockname returns the current diff --git a/bsd/man/man2/getsockopt.2 b/bsd/man/man2/getsockopt.2 index b1ced804a..1f22a55da 100644 --- a/bsd/man/man2/getsockopt.2 +++ b/bsd/man/man2/getsockopt.2 @@ -44,9 +44,9 @@ .Fd #include <sys/types.h> .Fd #include <sys/socket.h> .Ft int -.Fn getsockopt "int s" "int level" "int optname" "void *optval" "int *optlen" +.Fn getsockopt "int s" "int level" "int optname" "void *optval" "socklen_t *optlen" .Ft int -.Fn setsockopt "int s" "int level" "int optname" "const void *optval" "int optlen" +.Fn setsockopt "int s" "int level" "int optname" "const void *optval" "socklen_t optlen" .Sh DESCRIPTION .Fn Getsockopt and @@ -295,8 +295,7 @@ receiving additional data, it returns with a short count or with the error .Er EWOULDBLOCK if no data were received. The struct timeval parameter must represent a -positive time interval less than SHRT_MAX * 10 milliseconds (5 minutes -and 28 seconds) otherwise +positive time interval otherwise .Fn setsockopt returns with the error .Er EDOM . diff --git a/bsd/man/man2/getxattr.2 b/bsd/man/man2/getxattr.2 new file mode 100644 index 000000000..8f04a9194 --- /dev/null +++ b/bsd/man/man2/getxattr.2 @@ -0,0 +1,165 @@ +.\" +.\" Copyright (c) 2004 Apple Computer, Inc. All rights reserved. +.\" +.\" @APPLE_LICENSE_HEADER_START@ +.\" +.\" This file contains Original Code and/or Modifications of Original Code +.\" as defined in and that are subject to the Apple Public Source License +.\" Version 2.0 (the 'License'). You may not use this file except in +.\" compliance with the License. Please obtain a copy of the License at +.\" http://www.opensource.apple.com/apsl/ and read it before using this +.\" file. +.\" +.\" The Original Code and all software distributed under the License are +.\" distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+.\" Please see the License for the specific language governing rights and
+.\" limitations under the License.
+.\"
+.\" @APPLE_LICENSE_HEADER_END@
+.\"
+.Dd Oct 19, 2004
+.Dt GETXATTR 2
+.Os "Mac OS X"
+.Sh NAME
+.Nm getxattr,
+.Nm fgetxattr
+.Nd get an extended attribute value
+.Sh SYNOPSIS
+.Fd #include <sys/xattr.h>
+.Ft ssize_t
+.Fn getxattr "const char *path" "const char *name" "void *value" "size_t size" "u_int32_t position" "int options"
+.Ft ssize_t
+.Fn fgetxattr "int fd" "const char *name" "void *value" "size_t size" "u_int32_t position" "int options"
+.Sh DESCRIPTION
+Extended attributes extend the basic attributes of files and
+directories in the file system. They are stored as name:data pairs
+associated with file system objects (files, directories, symlinks, etc).
+.Pp
+The
+.Fn getxattr
+function retrieves up to
+.Fa size
+bytes of data from the extended attribute identified by
+.Fa name
+associated with
+.Fa path
+into the pre-allocated buffer pointed to by
+.Fa value .
+The function returns the number of bytes of data retrieved.
+.Pp
+An extended attribute's
+.Fa name
+is a simple NULL-terminated UTF-8 string.
+.Fa position
+specifies an offset within the extended attribute. In the current
+implementation, this argument is only used with the resource fork attribute.
+For all other extended attributes, this parameter is reserved and should
+be zero.
+.Pp
+On success,
+.Fa value
+contains the data associated with
+.Fa name .
+When
+.Fa value
+is set to NULL,
+.Fn getxattr
+returns the current size of the named attribute. This facility can be used
+to determine the size of a buffer sufficiently large to hold the data
+currently associated with the attribute.
+.Pp
+.Fa options
+specifies options for retrieving extended attributes:
+.Pp
+.Bl -tag -width XATTR_NOFOLLOW
+.It Dv XATTR_NOFOLLOW
+do not follow symbolic links.
+.Fn getxattr
+normally returns information from the target of
+.Fa path
+if it is a symbolic link. With this option,
+.Fn getxattr
+will return extended attribute data from the symbolic link instead.
+.El
+.Pp
+.Fn fgetxattr
+is identical to
+.Fn getxattr ,
+except that it retrieves extended attribute data from the open file
+referenced by the file descriptor
+.Fa fd .
+.Sh RETURN VALUES
+On success, the size of the extended attribute data is returned. On
+failure, -1 is returned and the global variable
+.Va errno
+is set as follows.
+.Sh ERRORS
+.Bl -tag -width Er
+.It Bq Er ENOATTR
+The extended attribute does not exist.
+.It Bq Er ENOTSUP
+The file system does not support extended attributes or has the feature
+disabled.
+.It Bq Er ERANGE
+.Fa value
+(as indicated by
+.Fa size )
+is too small to hold the extended attribute data.
+.It Bq Er EPERM
+The named attribute is not permitted for this type of object.
+.It Bq Er EINVAL
+.Fa name
+is invalid or
+.Fa options
+has an unsupported bit set.
+.It Bq Er EISDIR
+.Fa path
+or
+.Fa fd
+does not refer to a regular file and the attribute in question is only
+applicable to files. Similar to EPERM.
+.It Bq Er ENOTDIR
+A component of
+.Fa path 's
+prefix is not a directory.
+.It Bq Er ENAMETOOLONG
+The length of
+.Fa name
+exceeds
+.Dv XATTR_MAXNAMELEN
+UTF-8 bytes, or a component of
+.Fa path
+exceeds
+.Dv NAME_MAX
+characters, or the entire
+.Fa path
+exceeds
+.Dv PATH_MAX
+characters.
+.It Bq Er EACCES
+Search permission is denied for a component of
+.Fa path
+or the attribute is not allowed to be read (e.g. an ACL prohibits reading
+the attributes of this file).
+.It Bq Er ELOOP
+Too many symbolic links were encountered in translating the pathname.
+.It Bq Er EFAULT
+.Fa path
+or
+.Fa name
+points to an invalid address.
+.It Bq Er EIO
+An I/O error occurred while reading from or writing to the file system.
+.El
+.Sh SEE ALSO
+.Xr setxattr 2 ,
+.Xr removexattr 2 ,
+.Xr listxattr 2
+.Sh HISTORY
+.Fn getxattr
+and
+.Fn fgetxattr
+first appeared in Mac OS X 10.4.
diff --git a/bsd/man/man2/intro.2 b/bsd/man/man2/intro.2
index e9a29acd5..addbe6aa3 100644
--- a/bsd/man/man2/intro.2
+++ b/bsd/man/man2/intro.2
@@ -270,13 +270,8 @@ system or no implementation for it exists.
 .It Er 44 ESOCKTNOSUPPORT Em "Socket type not supported" .
 The support for the socket type has not been configured into
 the system or no implementation for it exists.
-.It Er 45 EOPNOTSUPP Em "Operation not supported" .
+.It Er 45 ENOTSUP Em "Not supported" .
 The attempted operation is not supported for the type of object
 referenced.
-Usually this occurs when a file descriptor refers to a file or socket
-that cannot support this operation,
-for example, trying to
-.Em accept
-a connection on a datagram socket.
 .It Er 46 EPFNOSUPPORT Em "Protocol family not supported" .
 The protocol family has not been configured into
 the system or no implementation for it exists.
@@ -444,6 +439,28 @@ along an invalid or an incomplete sequence of bytes or the given wide character
 is invalid.
 .It Er 93 ENOATTR Em "Attribute not found" .
 The specified extended attribute does not exist.
+.It Er 94 EBADMSG Em "Bad message" .
+The message to be received is inappropriate for the operation being attempted.
+.It Er 95 EMULTIHOP Em "Reserved" .
+This error is reserved for future use.
+.It Er 96 ENODATA Em "No message available" .
+No message was available to be received by the requested operation.
+.It Er 97 ENOLINK Em "Reserved" .
+This error is reserved for future use.
+.It Er 98 ENOSR Em "No STREAM resources" .
+This error is reserved for future use.
+.It Er 99 ENOSTR Em "Not a STREAM" .
+This error is reserved for future use.
+.It Er 100 EPROTO Em "Protocol error" .
+Some protocol error occurred. This error is device-specific, but is
+generally not related to a hardware failure.
+.It Er 101 ETIME Em "STREAM ioctl() timeout" .
+This error is reserved for future use.
+.It Er 102 EOPNOTSUPP Em "Operation not supported on socket" .
+The attempted operation is not supported for the type of socket referenced;
+for example, trying to
+.Em accept
+a connection on a datagram socket.
 .El
 .Sh DEFINITIONS
 .Bl -tag -width Ds
diff --git a/bsd/man/man2/listxattr.2 b/bsd/man/man2/listxattr.2
new file mode 100644
index 000000000..b466439be
--- /dev/null
+++ b/bsd/man/man2/listxattr.2
@@ -0,0 +1,153 @@
+.\"
+.\" Copyright (c) 2004 Apple Computer, Inc. All rights reserved.
+.\"
+.\" @APPLE_LICENSE_HEADER_START@
+.\"
+.\" This file contains Original Code and/or Modifications of Original Code
+.\" as defined in and that are subject to the Apple Public Source License
+.\" Version 2.0 (the 'License'). You may not use this file except in
+.\" compliance with the License. Please obtain a copy of the License at
+.\" http://www.opensource.apple.com/apsl/ and read it before using this
+.\" file.
+.\" +.\" The Original Code and all software distributed under the License are +.\" distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. +.\" Please see the License for the specific language governing rights and +.\" limitations under the License. +.\" +.\" @APPLE_LICENSE_HEADER_END@ +.\" +.Dd Oct 19, 2004 +.Dt LISTXATTR 2 +.Os "Mac OS X" +.Sh NAME +.Nm listxattr, +.Nm flistxattr +.Nd list an extended attribute value +.Sh SYNOPSIS +.Fd #include <sys/xattr.h> +.Ft ssize_t +.Fn listxattr "const char *path" "char *namebuf" "size_t size" "int options" +.Ft ssize_t +.Fn flistxattr "int fd" "char *namebuf" "size_t size" "int options" +.Sh DESCRIPTION +Extended attributes extend the basic attributes associated with files and +directories in the file system. They are stored as name:data pairs associated +with file system objects (files, directories, symlinks, etc). +.Pp +.Fn listxattr +retrieves a list of names of extended attributes associated with the given +.Fa path +in the file system. +.Pp +.Fa namebuf +is a data buffer of +.Pa size +bytes for the names of the extended attributes associated with +.Fa path . +The extended attribute names are simple NULL-terminated UTF-8 strings and +are returned in arbitrary order. No extra padding is provided between +names in the buffer. The list will only include names of extended +attributes to which the calling process has access. The function returns +the size of the list of names. +.Pp +.Fa options +controls how the attribute list is generated: +.Pp +.Bl -tag -width XATTR_NOFOLLOW +.It Dv XATTR_NOFOLLOW +do not follow symbolic links. +.Fn listxattr +normally lists attributes of the target of +.Fa path +if it is a symbolic link. With this option, +.Fn listxattr +will list attributes of the link itself. +.El +.Pp +If +.Fa namebuf +is set to NULL, +the function returns the size of the list of extended attribute names. +This facility can be used to determine the size of a buffer sufficiently +large to hold the names of the attributes currently associated with +.Fa path . +.Pp +.Fn flistxattr +is identical to +.Fn listxattr , +except that it returns the list of extended attribute names associated +with the open file referenced by file descriptor +.Fa fd . +.Sh RETURN VALUES +On success, the size of the extended attribute name list is returned. If +no accessible extended attributes are associated with the given +.Fa path +or +.Fa fd , +the function returns zero. On failure, -1 is returned and the global +variable +.Va errno +is set as follows. +.Sh ERRORS +.Bl -tag -width Er +.It Bq Er ENOTSUP +The file system does not support extended attributes or has the feature +disabled. +.It Bq Er ERANGE +.Fa namebuf +(as indicated by +.Fa size ) +is too small to hold the list of names. +.It Bq Er EPERM +.Fa path +or +.Fa fd +refer to a file system object that does not support extended attributes. +For example, resource forks don't support extended attributes. +.\" If only EFTYPE was a POSIX error +.It Bq Er ENOTDIR +A component of +.Fa path 's +prefix is not a directory. +.It Bq Er ENAMETOOLONG +.Fa name +exceeds +.Dv XATTR_MAXNAMELEN +UTF-8 bytes, or a component of +.Fa path +exceeds +.Dv NAME_MAX +characters, or the entire +.Fa path +exceeds +.Dv PATH_MAX +characters. 
+.It Bq Er EACCES +Search permission is denied for a component of +.Fa path +or permission is denied to read the list of attributes from this file. +.It Bq Er ELOOP +Too many symbolic links were encountered resolving +.Fa path . +.It Bq Er EFAULT +.Fa path +points to an invalid address. +.It Bq Er EIO +An I/O error occurred. +.It Bq Er EINVAL +.Fa options +does not make sense. +.El +.Sh SEE ALSO +.Xr setxattr 2 , +.Xr getxattr 2 , +.Xr removexattr 2 +.Sh HISTORY +.Fn listxattr +and +.Fn flistxattr +first appeared in Mac OS X 10.4. diff --git a/bsd/man/man2/madvise.2 b/bsd/man/man2/madvise.2 index 9f5938267..1c85fce72 100644 --- a/bsd/man/man2/madvise.2 +++ b/bsd/man/man2/madvise.2 @@ -99,7 +99,7 @@ Indicates that the application will not need the information contained in this a .Fn madvise system call. .El - +.Pp The .Fn posix_madvise behaves same as diff --git a/bsd/man/man2/mkfifo.2 b/bsd/man/man2/mkfifo.2 index af5d7615f..7d843f2c5 100644 --- a/bsd/man/man2/mkfifo.2 +++ b/bsd/man/man2/mkfifo.2 @@ -66,7 +66,7 @@ indicates an error, and an error code is stored in .Fn Mkfifo will fail and no fifo will be created if: .Bl -tag -width Er -.It Bq Er EOPNOTSUPP +.It Bq Er ENOTSUP The kernel has not been configured to support fifo's. .It Bq Er ENOTDIR A component of the path prefix is not a directory. diff --git a/bsd/man/man2/poll.2 b/bsd/man/man2/poll.2 new file mode 100644 index 000000000..a91b73094 --- /dev/null +++ b/bsd/man/man2/poll.2 @@ -0,0 +1,198 @@ +.\" +.\" Copyright (c) 2005 Apple Computer, Inc. All rights reserved. +.\" +.\" @APPLE_LICENSE_HEADER_START@ +.\" +.\" This file contains Original Code and/or Modifications of Original Code +.\" as defined in and that are subject to the Apple Public Source License +.\" Version 2.0 (the 'License'). You may not use this file except in +.\" compliance with the License. Please obtain a copy of the License at +.\" http://www.opensource.apple.com/apsl/ and read it before using this +.\" file. +.\" +.\" The Original Code and all software distributed under the License are +.\" distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. +.\" Please see the License for the specific language governing rights and +.\" limitations under the License. +.\" +.\" @APPLE_LICENSE_HEADER_END@ +.\" +.\" +.\" Copyright (c) 1996 Charles M. Hannum. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. All advertising materials mentioning features or use of this software +.\" must display the following acknowledgement: +.\" This product includes software developed by Charles M. Hannum. +.\" 4. The name of the author may not be used to endorse or promote products +.\" derived from this software without specific prior written permission. 
+.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +.\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +.\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +.\" IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +.\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +.\" NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +.\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +.\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +.\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +.\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +.\" +.Dd February 27, 2005 +.Dt POLL 2 +.Os +.Sh NAME +.Nm poll +.Nd synchronous I/O multiplexing +.Sh SYNOPSIS +.In poll.h +.Ft int +.Fn poll "struct pollfd *fds" "nfds_t nfds" "int timeout" +.Sh DESCRIPTION +.Fn Poll +examines a set of file descriptors to see if some of them are ready for +I/O or if certain events have occurred on them. +The +.Fa fds +argument is a pointer to an array of pollfd structures as defined in +.Aq Pa poll.h +(shown below). The +.Fa nfds +argument determines the size of the +.Fa fds +array. +.Bd -literal +struct pollfd { + int fd; /* file descriptor */ + short events; /* events to look for */ + short revents; /* events returned */ +}; +.Ed +.Pp +The fields of +.Fa struct pollfd +are as follows: +.Bl -tag -width XXXrevents +.It fd +File descriptor to poll. +.It events +Events to poll for. (See below.) +.It revents +Events which may occur or have occurred. (See below.) +.El +.Pp +The event bitmasks in +.Fa events +and +.Fa revents +have the following bits: +.Bl -tag -width XXXPOLLWRNORM +.It POLLIN +Data other than high priority data may be read without blocking. +This is equivalent to ( POLLRDNORM | POLLRDBAND ). +.It POLLRDNORM +Normal data may be read without blocking. +.It POLLRDBAND +Priority data may be read without blocking. +.It POLLPRI +High priority data may be read without blocking. +.It POLLOUT +.It POLLWRNORM +Normal data may be written without blocking. +.It POLLWRBAND +Priority data may be written without blocking. +.It POLLERR +An exceptional condition has occurred on the device or socket. This +flag is output only, and ignored if present in the input +.Fa events +bitmask. +.It POLLHUP +The device or socket has been disconnected. This flag is output only, +and ignored if present in the input +.Fa events +bitmask. Note that +POLLHUP +and +POLLOUT +are mutually exclusive and should never be present in the +.Fa revents +bitmask at the same time. +.It POLLNVAL +The file descriptor is not open. This flag is output only, and ignored if present in the input +.Fa events +bitmask. +.El +.Pp +The distinction between normal, priority, and high-priority data is file type +or device specific. +.Pp +If +.Fa timeout +is greater than zero, it specifies a maximum interval to +wait for any file descriptor to become ready, in milliseconds. If +.Fa timeout +is zero, then +.Fn poll +will return without blocking. If the value of +.Fa timeout +is -1, the poll blocks indefinitely. +.Sh RETURN VALUES +.Fn Poll +returns the number of descriptors that are ready for I/O, or -1 if an +error occured. If the time limit expires, +.Fn poll +returns 0. +If +.Fn poll +returns with an error, +including one due to an interrupted call, +the +.Fa fds +array will be unmodified. 
+.Sh ERRORS +An error return from +.Fn poll +indicates: +.Bl -tag -width Er +.It Bq Er EFAULT +.Fa Fds +points outside the process's allocated address space. +.It Bq Er EINTR +A signal was delivered before the time limit expired and +before any of the selected events occurred. +.It Bq Er EINVAL +The +.Fa nfds +argument is greater than OPEN_MAX, or the +.Fa timeout +argument is less than -1. +.El +.Sh BUGS +The +.Fn poll +system call currently does not support devices. +.Sh SEE ALSO +.Xr accept 2 , +.Xr connect 2 , +.Xr kevent 2 , +.Xr read 2 , +.Xr recv 2 , +.Xr select 2 , +.Xr send 2 , +.Xr write 2 +.Sh HISTORY +The +.Fn poll +function call appeared in +.At V . + diff --git a/bsd/man/man2/posix_madvise.2 b/bsd/man/man2/posix_madvise.2 index d0f9ea997..c83f56ab9 100644 --- a/bsd/man/man2/posix_madvise.2 +++ b/bsd/man/man2/posix_madvise.2 @@ -1 +1 @@ -.so man2/madvise.2 +.so man2/madvise.2 \ No newline at end of file diff --git a/bsd/man/man2/ptrace.2 b/bsd/man/man2/ptrace.2 index 649458fec..5df4371b2 100644 --- a/bsd/man/man2/ptrace.2 +++ b/bsd/man/man2/ptrace.2 @@ -47,7 +47,7 @@ argument specifies the process ID of the traced process. can be: .Bl -tag -width 12n .It Dv PT_TRACE_ME -This request is the only one used by the traced process; it declares +This request is one of two used by the traced process; it declares that the process expects to be traced by its parent. All the other arguments are ignored. (If the parent process does not expect to trace the child, it will probably be rather confused by the results; once the @@ -65,70 +65,14 @@ such as it will stop before executing the first instruction of the new image. Also, any setuid or setgid bits on the executable being executed will be ignored. -.It Dv PT_READ_I , Dv PT_READ_D -These requests read a single -.Li int -of data from the traced process' address space. Traditionally, -.Fn ptrace -has allowed for machines with distinct address spaces for instruction -and data, which is why there are two requests: conceptually, -.Dv PT_READ_I -reads from the instruction space and -.Dv PT_READ_D -reads from the data space. In the current OpenBSD implementation, these -two requests are completely identical. The -.Fa addr -argument specifies the address (in the traced process' virtual address -space) at which the read is to be done. This address does not have to -meet any alignment constraints. The value read is returned as the -return value from -.Eo \& -.Fn ptrace -.Ec . -.It Dv PT_WRITE_I , Dv PT_WRITE_D -These requests parallel -.Dv PT_READ_I -and -.Dv PT_READ_D , -except that they write rather than read. The -.Fa data -argument supplies the value to be written. -.\" .It Dv PT_READ_U -.\" This request reads an -.\" .Li int -.\" from the traced process' user structure. The -.\" .Fa addr -.\" argument specifies the location of the int relative to the base of the -.\" user structure; it will usually be an integer value cast to -.\" .Li caddr_t -.\" either explicitly or via the presence of a prototype for -.\" .Eo \& -.\" .Fn ptrace -.\" .Ec . -.\" Unlike -.\" .Dv PT_READ_I -.\" and -.\" .Dv PT_READ_D , -.\" .Fa addr -.\" must be aligned on an -.\" .Li int -.\" boundary. The value read is returned as the return value from -.\" .Eo \& -.\" .Fn ptrace -.\" .Ec . -.\" .It Dv PT_WRITE_U -.\" This request writes an -.\" .Li int -.\" into the traced process' user structure. 
-.\" .Fa addr -.\" specifies the offset, just as for -.\" .Dv PT_READ_U , -.\" and -.\" .Fa data -.\" specifies the value to be written, just as for -.\" .Dv PT_WRITE_I -.\" and -.\" .Dv PT_WRITE_D . +.It Dv PT_DENY_ATTACH +This request is the other operation used by the traced process; it allows +a process that is not currently being traced to deny future traces by its +parent. All other arguments are ignored. If the process is currently +being traced, it will exit with the exit status of ENOTSUP; otherwise, +it sets a flag that denies future traces. An attempt by the parent to +trace a process which has set this flag will result in a segmentation violation +in the parent. .It Dv PT_CONTINUE The traced process continues execution. .Fa addr @@ -139,6 +83,10 @@ to indicate that execution is to pick up where it left off. .Fa data provides a signal number to be delivered to the traced process as it resumes execution, or 0 if no signal is to be sent. +.It Dv PT_STEP +The traced process continues execution for a single step. The +parameters are identical to those passed to +.Dv PT_CONTINUE. .It Dv PT_KILL The traced process terminates, as if .Dv PT_CONTINUE @@ -164,138 +112,6 @@ succeeds, the traced process is no longer traced and continues execution normally. .El .Pp -Additionally, machine-specific requests can exist. On the SPARC, these -are: -.Bl -tag -width 12n -.It Dv PT_GETREGS -This request reads the traced process' machine registers into the -.Dq Li "struct reg" -(defined in -.Aq Pa machine/reg.h ) -pointed to by -.Fa addr . -.It Dv PT_SETREGS -This request is the converse of -.Dv PT_GETREGS ; -it loads the traced process' machine registers from the -.Dq Li "struct reg" -(defined in -.Aq Pa machine/reg.h ) -pointed to by -.Fa addr . -.It Dv PT_GETFPREGS -This request reads the traced process' floating-point registers into -the -.Dq Li "struct fpreg" -(defined in -.Aq Pa machine/reg.h ) -pointed to by -.Fa addr . -.It Dv PT_SETFPREGS -This request is the converse of -.Dv PT_GETFPREGS ; -it loads the traced process' floating-point registers from the -.Dq Li "struct fpreg" -(defined in -.Aq Pa machine/reg.h ) -pointed to by -.Fa addr . -.\" .It Dv PT_SYSCALL -.\" This request is like -.\" .Dv PT_CONTINUE -.\" except that the process will stop next time it executes any system -.\" call. Information about the system call can be examined with -.\" .Dv PT_READ_U -.\" and potentially modified with -.\" .Dv PT_WRITE_U -.\" through the -.\" .Li u_kproc.kp_proc.p_md -.\" element of the user structure (see below). If the process is continued -.\" with another -.\" .Dv PT_SYSCALL -.\" request, it will stop again on exit from the syscall, at which point -.\" the return values can be examined and potentially changed. The -.\" .Li u_kproc.kp_proc.p_md -.\" element is of type -.\" .Dq Li "struct mdproc" , -.\" which should be declared by including -.\" .Aq Pa sys/param.h , -.\" .Aq Pa sys/user.h , -.\" and -.\" .Aq Pa machine/proc.h , -.\" and contains the following fields (among others): -.\" .Bl -item -compact -offset indent -.\" .It -.\" .Li syscall_num -.\" .It -.\" .Li syscall_nargs -.\" .It -.\" .Li syscall_args[8] -.\" .It -.\" .Li syscall_err -.\" .It -.\" .Li syscall_rv[2] -.\" .El -.\" When a process stops on entry to a syscall, -.\" .Li syscall_num -.\" holds the number of the syscall, -.\" .Li syscall_nargs -.\" holds the number of arguments it expects, and -.\" .Li syscall_args -.\" holds the arguments themselves. 
(Only the first -.\" .Li syscall_nargs -.\" elements of -.\" .Li syscall_args -.\" are guaranteed to be useful.) When a process stops on exit from a -.\" syscall, -.\" .Li syscall_num -.\" is -.\" .Eo \& -.\" .Li -1 -.\" .Ec , -.\" .Li syscall_err -.\" holds the error number -.\" .Po -.\" see -.\" .Xr errno 2 -.\" .Pc , -.\" or 0 if no error occurred, and -.\" .Li syscall_rv -.\" holds the return values. (If the syscall returns only one value, only -.\" .Li syscall_rv[0] -.\" is useful.) The tracing process can modify any of these with -.\" .Dv PT_WRITE_U ; -.\" only some modifications are useful. -.\" .Pp -.\" On entry to a syscall, -.\" .Li syscall_num -.\" can be changed, and the syscall actually performed will correspond to -.\" the new number (it is the responsibility of the tracing process to fill -.\" in -.\" .Li syscall_args -.\" appropriately for the new call, but there is no need to modify -.\" .Eo \& -.\" .Li syscall_nargs -.\" .Ec ). -.\" If the new syscall number is 0, no syscall is actually performed; -.\" instead, -.\" .Li syscall_err -.\" and -.\" .Li syscall_rv -.\" are passed back to the traced process directly (and therefore should be -.\" filled in). If the syscall number is otherwise out of range, a dummy -.\" syscall which simply produces an -.\" .Er ENOSYS -.\" error is effectively performed. -.\" .Pp -.\" On exit from a syscall, only -.\" .Li syscall_err -.\" and -.\" .Li syscall_rv -.\" can usefully be changed; they are set to the values returned by the -.\" syscall and will be passed back to the traced process by the normal -.\" syscall return mechanism. -.El .Sh ERRORS Some requests can cause .Fn ptrace @@ -318,22 +134,11 @@ on itself. The .Fa request was not one of the legal requests. -.\" .It -.\" The -.\" .Fa addr -.\" to -.\" .Dv PT_READ_U -.\" or -.\" .Dv PT_WRITE_U -.\" was not -.\" .Li int Ns \&-aligned. .It The signal number (in .Fa data ) to .Dv PT_CONTINUE -.\" or -.\" .Dv PT_SYSCALL was neither 0 nor a legal signal number. .It .Dv PT_GETREGS , @@ -371,27 +176,3 @@ on a process in violation of the requirements listed under above. .El .El -.Sh BUGS -On the SPARC, the PC is set to the provided PC value for -.Dv PT_CONTINUE -and similar calls, but the NPC is set willy-nilly to 4 greater than the -PC value. Using -.Dv PT_GETREGS -and -.Dv PT_SETREGS -to modify the PC, passing -.Li (caddr_t)1 -to -.Eo \& -.Fn ptrace -.Ec , -should be able to sidestep this. -.Pp -Single-stepping is not available. -.\" .Pp -.\" When using -.\" .Dv PT_SYSCALL , -.\" there is no easy way to tell whether the traced process stopped because -.\" it made a syscall or because a signal was sent at a moment that it just -.\" happened to have valid-looking garbage in its -.\" .Dq Li "struct mdproc" . diff --git a/bsd/man/man2/quotactl.2 b/bsd/man/man2/quotactl.2 index ea35c50b7..68b2e3c24 100644 --- a/bsd/man/man2/quotactl.2 +++ b/bsd/man/man2/quotactl.2 @@ -158,7 +158,7 @@ A .Fn quotactl call will fail if: .Bl -tag -width Er -.It Bq Er EOPNOTSUPP +.It Bq Er ENOTSUP The kernel has not been compiled with the .Dv QUOTA option. 
diff --git a/bsd/man/man2/recv.2 b/bsd/man/man2/recv.2 index 75ef209bf..5ceee5989 100644 --- a/bsd/man/man2/recv.2 +++ b/bsd/man/man2/recv.2 @@ -47,7 +47,7 @@ .Ft ssize_t .Fn recv "int s" "void *buf" "size_t len" "int flags" .Ft ssize_t -.Fn recvfrom "int s" "void *buf" "size_t len" "int flags" "struct sockaddr *from" "int *fromlen" +.Fn recvfrom "int s" "void *buf" "size_t len" "int flags" "struct sockaddr *from" "socklen_t *fromlen" .Ft ssize_t .Fn recvmsg "int s" "struct msghdr *msg" "int flags" .Sh DESCRIPTION @@ -147,13 +147,13 @@ This structure has the following form, as defined in .Pp .Bd -literal struct msghdr { - caddr_t msg_name; /* optional address */ - u_int msg_namelen; /* size of address */ - struct iovec *msg_iov; /* scatter/gather array */ - u_int msg_iovlen; /* # elements in msg_iov */ - caddr_t msg_control; /* ancillary data, see below */ - u_int msg_controllen; /* ancillary data buffer len */ - int msg_flags; /* flags on received message */ + caddr_t msg_name; /* optional address */ + socklen_t msg_namelen; /* size of address */ + struct iovec *msg_iov; /* scatter/gather array */ + u_int msg_iovlen; /* # elements in msg_iov */ + caddr_t msg_control; /* ancillary data, see below */ + socklen_t msg_controllen; /* ancillary data buffer len */ + int msg_flags; /* flags on received message */ }; .Ed .Pp diff --git a/bsd/man/man2/removexattr.2 b/bsd/man/man2/removexattr.2 new file mode 100644 index 000000000..acfa319d5 --- /dev/null +++ b/bsd/man/man2/removexattr.2 @@ -0,0 +1,135 @@ +.\" +.\" Copyright (c) 2004 Apple Computer, Inc. All rights reserved. +.\" +.\" @APPLE_LICENSE_HEADER_START@ +.\" +.\" This file contains Original Code and/or Modifications of Original Code +.\" as defined in and that are subject to the Apple Public Source License +.\" Version 2.0 (the 'License'). You may not use this file except in +.\" compliance with the License. Please obtain a copy of the License at +.\" http://www.opensource.apple.com/apsl/ and read it before using this +.\" file. +.\" +.\" The Original Code and all software distributed under the License are +.\" distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. +.\" Please see the License for the specific language governing rights and +.\" limitations under the License. +.\" +.\" @APPLE_LICENSE_HEADER_END@ +.\" +.Dd Oct 19, 2004 +.Dt REMOVEXATTR 2 +.Os "Mac OS X" +.Sh NAME +.Nm removexattr, +.Nm fremovexattr +.Nd remove an extended attribute value +.Sh SYNOPSIS +.Fd #include <sys/xattr.h> +.Ft int +.Fn removexattr "const char *path" "const char *name" "int options" +.Ft int +.Fn fremovexattr "int fd" "const char *name" "int options" +.Sh DESCRIPTION +Extended attributes extend the basic attributes associated with files and +directories in the file system. They are stored as name:data pairs +associated with file system objects (files, directories, symlinks, etc). +.Pp +.Fn Removexattr +deletes the extended attribute +.Fa name +associated with +.Fa path . +.Pp +An extended attribute's +.Fa name +is a simple NULL-terminated UTF-8 string. +.Fa Options +is a bit mask specifying various options: +.Pp +.Bl -tag -width XATTR_NOFOLLOW +.It Dv XATTR_NOFOLLOW +do not follow symbolic links. Normally, +.Fn removexattr +acts on the target of +.Fa path +if it is a symbolic link. 
+With this option,
+.Fn removexattr
+will act on the link itself.
+.El
+.Pp
+.Fn fremovexattr
+is identical to
+.Fn removexattr ,
+except that it removes an extended attribute from an open file referenced
+by file descriptor
+.Fa fd .
+.Sh RETURN VALUES
+On success, 0 is returned. On failure, -1 is returned and the global
+variable
+.Va errno
+is set as follows.
+.Sh ERRORS
+.Bl -tag -width Er
+.It Bq Er ENOATTR
+The specified extended attribute does not exist.
+.It Bq Er ENOTSUP
+The file system does not support extended attributes or has the feature
+disabled.
+.It Bq Er EROFS
+The file system is mounted read-only.
+.It Bq Er EPERM
+This type of object does not support extended attributes.
+.It Bq Er EINVAL
+.Fa name
+or
+.Fa options
+is invalid.
+.Fa name
+must be valid UTF-8 and
+.Fa options
+must make sense.
+.It Bq Er ENOTDIR
+A component of the
+.Fa path 's
+prefix is not a directory.
+.It Bq Er ENAMETOOLONG
+.Fa Name
+exceeded
+.Dv XATTR_MAXNAMELEN
+UTF-8 bytes, or a component of
+.Fa path
+exceeded
+.Dv NAME_MAX
+characters, or the entire
+.Fa path
+exceeded
+.Dv PATH_MAX
+characters.
+.It Bq Er EACCES
+Search permission is denied for a component of
+.Fa path
+or permission to remove the attribute is denied.
+.It Bq Er ELOOP
+Too many symbolic links were encountered in
+.Fa path .
+.It Bq Er EFAULT
+.Fa path
+or
+.Fa name
+points to an invalid address.
+.It Bq Er EIO
+An I/O error occurred while reading from or writing to the file system.
+.El
+.Sh SEE ALSO
+.Xr getxattr 2 ,
+.Xr setxattr 2 ,
+.Xr listxattr 2
+.Sh HISTORY
+.Fn removexattr
+and
+.Fn fremovexattr
+first appeared in Mac OS X 10.4.
diff --git a/bsd/man/man2/sbrk.2 b/bsd/man/man2/sbrk.2
deleted file mode 100644
index a3711a537..000000000
--- a/bsd/man/man2/sbrk.2
+++ /dev/null
@@ -1 +0,0 @@
-.so man2/brk.2
diff --git a/bsd/man/man2/searchfs.2 b/bsd/man/man2/searchfs.2
new file mode 100644
index 000000000..c3b602b4d
--- /dev/null
+++ b/bsd/man/man2/searchfs.2
@@ -0,0 +1,804 @@
+.\" Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
+.\"
+.\" The contents of this file constitute Original Code as defined in and
+.\" are subject to the Apple Public Source License Version 1.1 (the
+.\" "License"). You may not use this file except in compliance with the
+.\" License. Please obtain a copy of the License at
+.\" http://www.apple.com/publicsource and read it before using this file.
+.\"
+.\" This Original Code and all software distributed under the License are
+.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+.\" License for the specific language governing rights and limitations
+.\" under the License.
+.\"
+.\" @(#)searchfs.2
+.
+.Dd December 15, 2003
+.Dt SEARCHFS 2
+.Os Darwin
+.Sh NAME
+.Nm searchfs
+.Nd search a volume quickly
+.Sh SYNOPSIS
+.Fd #include <sys/attr.h>
+.Fd #include <unistd.h>
+.Ft int
+.Fn searchfs "const char * path" "struct fssearchblock * searchBlock" "unsigned long * numMatches" "unsigned long scriptCode" "unsigned long options" "struct searchstate * state"
+.
+.Sh DESCRIPTION
+The
+.Fn searchfs
+function searches the volume (that is, mounted file system) specified by
+.Fa path
+for file system objects matching the criteria specified by
+.Fa searchBlock ,
+.Fa scriptCode ,
+and
+.Fa options .
+The
+.Fa numMatches
+parameter returns the number of matching file system objects found.
+The function also returns attributes of those file system objects in a buffer
+specified by
+.Fa searchBlock .
+The
+.Fa state
+parameter allows you to search the volume using multiple calls to
+.Fn searchfs ,
+resuming the search where it left off.
+The routine will only return objects to which you have access (that is, you
+have execute permissions on the directories leading to this object from the root).
+.Pp
+.
+.\" path parameter
+.
+The
+.Fa path
+parameter must reference a valid file system object on the volume to be searched.
+Typically the path is to the volume's root directory.
+The entire volume is always searched.
+All directories listed in the path name leading to this object must be
+searchable.
+.Pp
+.
+.\" searchBlock parameter
+.
+The
+.Fa searchBlock
+parameter is a pointer to an
+.Vt fssearchblock
+structure, as defined by
+.Aq Pa sys/attr.h
+(shown below).
+You are responsible for filling out all fields of this structure before calling the function.
+.Bd -literal
+struct fssearchblock {
+    struct attrlist *   returnattrs;
+    void *              returnbuffer;
+    size_t              returnbuffersize;
+    unsigned long       maxmatches;
+    struct timeval      timelimit;
+    void *              searchparams1;
+    size_t              sizeofsearchparams1;
+    void *              searchparams2;
+    size_t              sizeofsearchparams2;
+    struct attrlist     searchattrs;
+};
+.Ed
+.Pp
+.
+For information about the
+.Vt attrlist
+structure, see the discussion of
+.Xr getattrlist 2 .
+.Pp
+.
+.\" searchBlock elements
+.
+The fields of the
+.Vt fssearchblock
+structure are defined as follows.
+.Bl -tag -width sizeofsearchparams1
+.
+.It returnattrs
+.Fn searchfs
+can return arbitrary attributes of the file system objects that it finds.
+This field must point to an
+.Vt attrlist
+structure that specifies the attributes that you want returned.
+To request an attribute you must set the corresponding bit in the appropriate
+.Vt attrgroup_t
+field of the
+.Vt attrlist
+structure.
+You are responsible for filling out all fields of this structure before calling the function.
+You must not request volume attributes.
+.
+.It returnbuffer
+.Fn searchfs
+places attributes of the matching file system objects into this returned attributes buffer.
+The attributes for any given object are grouped together and
+packed in exactly the same way as they would be returned from
+.Xr getdirentriesattr 2 .
+The initial contents of this buffer are ignored.
+.
+.It returnbuffersize
+Set this field to the size, in bytes, of the buffer pointed to by
+.Fa returnbuffer .
+.
+.It maxmatches
+Specifies the maximum number of matches that you want this call to
+.Fn searchfs
+to return.
+.
+.It timelimit
+Specifies the maximum time that you want this call to
+.Fn searchfs
+to run.
+.Pp
+.
+If you're implementing a volume format, you should impose your own internal
+limit on the duration of this call to prevent a malicious user program
+from monopolising kernel resources.
+.Pp
+.
+.It searchparams1
+Specifies the lower bound of the search criteria.
+This is discussed in detail below.
+You must place attribute values into the buffer in the same
+way as they would be returned by
+.Xr getattrlist 2 ,
+where the
+.Fa searchattrs
+field determines the exact layout of the attribute values.
+.
+.It sizeofsearchparams1
+Set this field to the size, in bytes, of the buffer pointed to by
+.Fa searchparams1 .
+.
+.It searchparams2
+Specifies the upper bound of the search criteria.
+This is discussed in detail below.
+You must place attribute values into the buffer in the same
+way as they would be returned by
+.Xr getattrlist 2 ,
+where the
+.Fa searchattrs
+field determines the exact layout of the attribute values.
+.
+.It sizeofsearchparams2
+Set this field to the size, in bytes, of the buffer pointed to by
+.Fa searchparams2 .
+.
+.It searchattrs
+Specifies the attributes that you want to use for your search criteria.
+You are responsible for filling out all fields of this structure before calling the function.
+To search for an attribute you must set the corresponding bit in the appropriate
+.Vt attrgroup_t
+field of the
+.Vt attrlist
+structure, and place the appropriate values into the
+.Fa searchparams1
+and
+.Fa searchparams2
+buffers.
+The attributes specified here determine the format of those buffers.
+This is discussed in detail below.
+.
+.El
+.Pp
+.
+.\" numMatches parameter
+.
+The
+.Fa numMatches
+parameter points to an
+.Vt unsigned long
+variable.
+The initial value of this variable is ignored.
+On return, this variable contains the number of matching file system objects found.
+This is always less than or equal to the
+.Fa maxmatches
+field of the
+.Fa searchBlock
+parameter.
+The attributes for the matching objects have been placed into the returned attributes buffer.
+.Pp
+.
+.\" scriptCode parameter
+.
+The
+.Fa scriptCode
+parameter is currently ignored.
+You should always pass in the value 0x08000103, which corresponds to the
+UTF-8 text encoding value defined by
+.Aq Pa CarbonCore/TextCommon.h .
+.Pp
+.
+.\" options parameter
+.
+The
+.Fa options
+parameter is a bit set that controls the behaviour of
+.Fn searchfs .
+The following option bits are defined.
+.
+.Bl -tag -width SRCHFS_MATCHPARTIALNAMES
+.
+.It SRCHFS_START
+If this bit is set,
+.Fn searchfs
+will ignore the
+.Fa state
+parameter and start a new search.
+Otherwise
+.Fn searchfs
+assumes that
+.Fa state
+is valid and attempts to resume a previous search based on that state.
+.
+.It SRCHFS_MATCHPARTIALNAMES
+If this bit is set,
+.Fn searchfs
+will consider substrings to be successful matches when evaluating the
+.Dv ATTR_CMN_NAME
+attribute.
+.
+.It SRCHFS_MATCHDIRS
+If this bit is set,
+.Fn searchfs
+will search for directories that match the search criteria.
+To get meaningful results you must specify either this bit or
+.Dv SRCHFS_MATCHFILES ,
+or both.
+.
+.It SRCHFS_MATCHFILES
+If this bit is set,
+.Fn searchfs
+will search for files that match the search criteria.
+To get meaningful results you must specify either this bit or
+.Dv SRCHFS_MATCHDIRS ,
+or both.
+.
+.It SRCHFS_SKIPLINKS
+If this bit is set,
+.Fn searchfs
+will only return one reference for a hard linked file, rather than a reference
+for each hard link to the file.
+.Pp
+This option is not recommended for general development.
+Its primary client is the
+.Xr quotacheck 8
+utility.
+.Pp
+.
+This option is privileged (the caller's effective UID must be 0) and cannot
+be used if you request the
+.Dv ATTR_CMN_NAME
+or
+.Dv ATTR_CMN_PAROBJID
+attributes.
+.Pp
+Introduced with Darwin 7.0 (Mac OS X version 10.3).
+.
+.It SRCHFS_SKIPINVISIBLE
+If this bit is set,
+.Fn searchfs
+will not match any invisible file system objects (that is, objects whose
+.Dv ATTR_CMN_FNDRINFO
+attribute has bit 6 set in the ninth byte) or any objects within
+invisible directories.
+.Pp
+Introduced with Darwin 7.0 (Mac OS X version 10.3).
+.
+.It SRCHFS_SKIPPACKAGES
+If this bit is set,
+.Fn searchfs
+will not match any file system objects that are inside a package.
+A package is defined as a directory whose extension matches one
+of the extensions that are configured into the kernel by Launch Services.
+.Pp
+Introduced with Darwin 7.0 (Mac OS X version 10.3).
+.
+.It SRCHFS_SKIPINAPPROPRIATE
+If this bit is set,
+.Fn searchfs
+will not match any file system objects that are within an inappropriate directory.
+The current list of inappropriate directories contains one item: /System.
+.Pp
+Introduced with Darwin 7.0 (Mac OS X version 10.3).
+.
+.It SRCHFS_NEGATEPARAMS
+If this bit is set,
+.Fn searchfs
+will return all the file system objects that do not match the search criteria.
+.Pp
+Introduced with Darwin 7.0 (Mac OS X version 10.3).
+.
+.El
+.Pp
+.
+.\" state parameter
+.
+The
+.Fa state
+parameter is a pointer to an opaque data structure that
+.Fn searchfs
+uses to maintain the state of a search between successive calls.
+In your first call to
+.Fn searchfs ,
+you specify the
+.Dv SRCHFS_START
+flag in the
+.Fa options
+parameter.
+This tells
+.Fn searchfs
+that the search state is invalid and that it should start a new search.
+When this call completes, it may have only returned partial results;
+in that case, it will have updated the structure pointed to by
+.Fa state .
+If you call
+.Fn searchfs
+again, this time without specifying the
+.Dv SRCHFS_START
+flag in the
+.Fa options
+parameter, it will resume the search where it left off, using the search state
+that it previously stored in the state structure.
+You do not need to explicitly dispose of this state.
+.Pp
+.
+The
+.Fn searchfs
+function returns significant errors in the following cases.
+.
+.Bl -bullet
+.
+.It
+If it has found as many objects as you requested in the
+.Fa maxmatches
+field of the
+.Fa searchBlock
+parameter, it will return
+.Dv EAGAIN .
+.
+.It
+If there is not enough space in the returned attributes buffer for the first match,
+it will return
+.Dv ENOBUFS .
+You should allocate a larger returned attributes buffer and try again.
+.Fa numMatches
+will be zero in this case.
+.
+.It
+If the timeout expires it will return
+.Dv EAGAIN .
+.
+.It
+If you attempt to resume a search (that is,
+.Dv SRCHFS_START
+is not specified in the
+.Fa options
+parameter) and the catalog has changed since the last search,
+the function will return
+.Dv EBUSY .
+You must start your search again from the beginning.
+.
+.El
+.Pp
+.
+If
+.Fn searchfs
+returns
+.Dv EAGAIN ,
+the value in
+.Fa numMatches
+may be greater than zero.
+This is known as a partial result.
+You should be sure to process these matches before calling
+.Fn searchfs
+again.
+.
+.Sh SEARCH CRITERIA
+.
+You specify the search criteria using a combination of the
+.Fa searchattrs ,
+.Fa searchparams1 ,
+.Fa sizeofsearchparams1 ,
+.Fa searchparams2 ,
+and
+.Fa sizeofsearchparams2
+fields of the
+.Fa searchBlock
+parameter, and various flags in the
+.Fa options
+parameter.
+The
+.Fa searchattrs
+field determines the attributes considered when comparing a file system object to
+the search criteria.
+You can specify that an attribute should be considered by setting the corresponding
+bit in the appropriate
+.Vt attrgroup_t
+field of the
+.Vt attrlist
+structure.
+See the discussion of
+.Xr getattrlist 2
+for a detailed description of this structure.
+.Pp
+.
+The
+.Fa searchparams1 ,
+.Fa sizeofsearchparams1 ,
+.Fa searchparams2 ,
+and
+.Fa sizeofsearchparams2
+fields specify the attribute values that must be matched.
+The format of each of these buffers is determined by the attributes that you're searching for.
+The values are packed in exactly the same way as they would be returned from +.Xr getattrlist 2 , +including the leading +.Vt unsigned long +length value. +.Pp +. +The attribute values in the first and second search buffers form a lower and upper bound for +the search, respectively. +These have different meanings depending on the type of attribute. +. +.Bl -bullet +. +.It +For string attributes (specifically +.Dv ATTR_CMN_NAME , +the object name), the value in the first search +buffer is significant and the value in the second search buffer is ignored. +The string comparison is either an exact match or a substring match depending on +the +.Dv SRCHFS_MATCHPARTIALNAMES +flag in the +.Fa options +parameter. +. +.It +For structured attributes (specifically +.Dv ATTR_CMN_FNDRINFO , +the Finder information), the value from the +file system object is masked (logical AND) with the value in the second search buffer and then +compared, byte for byte, against the value in the first search buffer. +If it is equal, the object is a match. +. +.It +For scalar attributes (all other attributes, for example, +.Dv ATTR_CMN_MODTIME , +the modification date), the values in the first and second search +buffers are literally a lower and upper bound. +An object matches the criteria if its value is greater than or equal to the value in +the first buffer and less than or equal to the value in the second. +. +.El +. +.Sh RETURN VALUES +Upon successful completion, a value of 0 is returned. +This means that the entire volume has been searched and all matches returned. +Otherwise, a value of -1 is returned and +.Va errno +is set to indicate the error. +.Pp +. +See the discussion of the +.Dv EAGAIN , +.Dv ENOBUFS , +and +.Dv EBUSY +error codes above. +. +.Sh COMPATIBILITY +Not all volumes support +.Fn searchfs . +You can test whether a volume supports +.Fn searchfs +by using +.Xr getattrlist 2 +to get the volume capabilities attribute +.Dv ATTR_VOL_CAPABILITIES , +and then testing the +.Dv VOL_CAP_INT_SEARCHFS +flag. +.Pp +. +The +.Fn searchfs +function has been undocumented for more than two years. +In that time a number of volume format implementations have been created without +a proper specification for the behaviour of this routine. +You may encounter volume format implementations with slightly different +behaviour than what is described here. +Your program is expected to be tolerant of this variant behaviour. +.Pp +. +If you're implementing a volume format that supports +.Fn searchfs , +you should be careful to support the behaviour specified by this document. +.Pp +. +A bug in systems prior to Darwin 7.0 (Mac OS X version 10.3) makes searching for the +.Dv ATTR_CMN_BKUPTIME +attribute tricky. +The bug causes the attribute to consume two items in the search attribute buffers, the +first in the proper place and the second between +.Dv ATTR_CMN_FNDRINFO +and +.Dv ATTR_CMN_OWNERID . +. +.Sh ERRORS +.Fn searchfs +will fail if: +.Bl -tag -width Er +. +.It Bq Er ENOTSUP +The volume does not support +.Fn searchfs . +. +.It Bq Er ENOTDIR +A component of the path prefix is not a directory. +. +.It Bq Er ENAMETOOLONG +A component of a path name exceeded +.Dv NAME_MAX +characters, or an entire path name exceeded +.Dv PATH_MAX +characters. +. +.It Bq Er ENOENT +The file system object does not exist. +. +.It Bq Er EACCES +Search permission is denied for a component of the path prefix. +. +.It Bq Er ELOOP +Too many symbolic links were encountered in translating the pathname. +. 
+.It Bq Er EFAULT +One of the pointer parameters points to an invalid address. +. +.It Bq Er EINVAL +The +.Fa options +parameter contains an invalid flag or sizeofsearchparams1/2 is greater than +SEARCHFS_MAX_SEARCHPARMS (see attr.h). +. +.It Bq Er EAGAIN +The search terminated with partial results, either because +.Fa numMatches +has hit the limit specified by +.Fa maxmatches +or because the timeout expired. +Process the matches returned so far and then call +.Fn searchfs +again to look for more. +.Pp +. +.It Bq Er ENOBUFS +The returned attributes buffer is too small for the first match. +You should allocate a larger returned attributes buffer and try again. +.Fa numMatches +will be zero in this case. +. +.It Bq Er EBUSY +The search could not be resumed because the volume has changed. +. +.It Bq Er EIO +An I/O error occurred while reading from or writing to the file system. +.El +.Pp +. +.Sh CAVEATS +Not all attributes can be searched for using +.Fn searchfs . +The list currently includes: +.Pp +. +.Bl -item -compact +.It +ATTR_CMN_NAME +.It +ATTR_CMN_OBJID +.It +ATTR_CMN_PAROBJID +.It +ATTR_CMN_CRTIME +.It +ATTR_CMN_MODTIME +.It +ATTR_CMN_CHGTIME +.It +ATTR_CMN_ACCTIME +.It +ATTR_CMN_BKUPTIME +.It +ATTR_CMN_FNDRINFO +.It +ATTR_CMN_BKUPTIME +.It +ATTR_CMN_OWNERID +.It +ATTR_CMN_GRPID +.It +ATTR_CMN_ACCESSMASK +.Pp +. +.It +ATTR_DIR_ENTRYCOUNT +.Pp +. +.It +ATTR_FILE_DATALENGTH +.It +ATTR_FILE_DATAALLOCSIZE +.It +ATTR_FILE_RSRCLENGTH +.It +ATTR_FILE_RSRCALLOCSIZE +.El +. +.Sh EXAMPLES +. +The following code searches a volume for files of the specified type and creator. +. +.Bd -literal +#include <assert.h> +#include <stdio.h> +#include <stddef.h> +#include <string.h> +#include <sys/attr.h> +#include <sys/errno.h> +#include <unistd.h> +.Pp +. +typedef struct attrlist attrlist_t; +typedef struct fssearchblock fssearchblock_t; +typedef struct searchstate searchstate_t; +.Pp +. +struct SearchAttrBuf { + unsigned long length; + char finderInfo[32]; +}; +typedef struct SearchAttrBuf SearchAttrBuf; +.Pp +. +struct ResultAttrBuf { + unsigned long length; + attrreference_t name; + fsobj_id_t parObjID; +}; +typedef struct ResultAttrBuf ResultAttrBuf; +.Pp +. +enum { + kMatchesPerCall = 16 +}; +.Pp +. +static int SearchFSDemo( + const char *volPath, + const char *type, + const char *creator +) +{ + int err; + fssearchblock_t searchBlock; + SearchAttrBuf lower; + SearchAttrBuf upper; + static const unsigned char kAllOnes[4] = { 0xFF, 0xFF, 0xFF, 0xFF }; + unsigned long matchCount; + unsigned long matchIndex; + unsigned long options; + searchstate_t state; + ResultAttrBuf * thisEntry; + attrlist_t returnAttrList; + char resultAttrBuf[ kMatchesPerCall + * (sizeof(ResultAttrBuf) + 64)]; +.Pp +. + // resultAttrBuf is big enough for kMatchesPerCall entries, + // assuming that the average name length is less than 64. +.Pp +. 
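+    // The code below proceeds in four steps: describe the search
+    // bounds (lower holds the Finder info bytes to match, upper the
+    // byte mask applied before comparing), limit each call to 100 ms
+    // and kMatchesPerCall matches, ask for each match's name and
+    // parent object ID, and loop while searchfs() returns EAGAIN,
+    // which indicates partial results.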
+    assert(strlen(type) == 4);
+    assert(strlen(creator) == 4);
+.Pp
+
+    memset(&searchBlock, 0, sizeof(searchBlock));
+    searchBlock.searchattrs.bitmapcount = ATTR_BIT_MAP_COUNT;
+    searchBlock.searchattrs.commonattr = ATTR_CMN_FNDRINFO;
+.Pp
+
+    memset(&lower, 0, sizeof(lower));
+    memset(&upper, 0, sizeof(upper));
+    lower.length = sizeof(lower);
+    upper.length = sizeof(upper);
+    memcpy(&lower.finderInfo[0], type, 4);
+    memcpy(&lower.finderInfo[4], creator, 4);
+    memcpy(&upper.finderInfo[0], kAllOnes, 4);
+    memcpy(&upper.finderInfo[4], kAllOnes, 4);
+    searchBlock.searchparams1 = &lower;
+    searchBlock.sizeofsearchparams1 = sizeof(lower);
+    searchBlock.searchparams2 = &upper;
+    searchBlock.sizeofsearchparams2 = sizeof(upper);
+.Pp
+
+    searchBlock.timelimit.tv_sec = 0;
+    searchBlock.timelimit.tv_usec = 100 * 1000;
+.Pp
+
+    searchBlock.maxmatches = kMatchesPerCall;
+.Pp
+
+    memset(&returnAttrList, 0, sizeof(returnAttrList));
+    returnAttrList.bitmapcount = ATTR_BIT_MAP_COUNT;
+    returnAttrList.commonattr  = ATTR_CMN_NAME | ATTR_CMN_PAROBJID;
+.Pp
+.
+    searchBlock.returnattrs = &returnAttrList;
+    searchBlock.returnbuffer = resultAttrBuf;
+    searchBlock.returnbuffersize = sizeof(resultAttrBuf);
+.Pp
+
+    options = SRCHFS_START | SRCHFS_MATCHFILES;
+.Pp
+
+    do {
+        err = searchfs(
+            volPath,
+            &searchBlock,
+            &matchCount,
+            0x08000103,
+            options,
+            &state
+        );
+        if (err != 0) {
+            err = errno;
+        }
+        if ( (err == 0) || (err == EAGAIN) ) {
+            thisEntry = (ResultAttrBuf *) resultAttrBuf;
+.Pp
+
+            for (matchIndex = 0; matchIndex < matchCount; matchIndex++) {
+                printf("%08x ", thisEntry->parObjID.fid_objno);
+                printf(
+                    "%s\en",
+                    ((char *) &thisEntry->name)
+                        + thisEntry->name.attr_dataoffset
+                );
+.
+                // Advance to the next entry.
+.
+                thisEntry = (ResultAttrBuf *)
+                    ((char *) thisEntry + thisEntry->length);
+            }
+        }
+.Pp
+
+        options &= ~SRCHFS_START;
+    } while (err == EAGAIN);
+.Pp
+
+    return err;
+}
+.Ed
+.
+.Sh SEE ALSO
+.
+.Xr getattrlist 2
+.
+.Sh HISTORY
+A
+.Fn searchfs
+function call appeared in Darwin 1.3.1 (Mac OS X version 10.0).
+.
diff --git a/bsd/man/man2/select.2 b/bsd/man/man2/select.2
index 39fd5d84b..f4c446606 100644
--- a/bsd/man/man2/select.2
+++ b/bsd/man/man2/select.2
@@ -50,6 +50,7 @@
 .Fn FD_SET fd &fdset
 .Fn FD_CLR fd &fdset
 .Fn FD_ISSET fd &fdset
+.Fn FD_COPY &fdset_orig &fdset_copy
 .Fn FD_ZERO &fdset
 .Sh DESCRIPTION
 .Fn Select
@@ -66,7 +67,9 @@ The first
 descriptors are checked in each set;
 i.e., the
 descriptors from 0 through
 .Fa nfds Ns No -1
-in the descriptor sets are examined.
+in the descriptor sets are examined. (Example: If you have set two file descriptors "4" and "17",
+.Fa nfds
+should not be "2", but rather "17 + 1" or "18".)
 On return,
 .Fn select
 replaces the given descriptor sets
@@ -97,6 +100,11 @@ is non-zero if
 is a member of
 .Fa fdset ,
 zero otherwise.
+.Fn FD_COPY &fdset_orig &fdset_copy
+replaces an already allocated
+.Fa &fdset_copy
+file descriptor set with a copy of
+.Fa &fdset_orig .
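+.Pp
+Because
+.Fn select
+overwrites its descriptor sets on return, a common pattern is to keep a
+master set and copy it before each call.
+The following sketch (illustrative only; it assumes
+.Va fd
+is an open descriptor and omits most error handling) uses
+.Fn FD_COPY
+for this:
+.Bd -literal
+fd_set master, ready;
+int nready;
+
+FD_ZERO(&master);
+FD_SET(fd, &master);
+for (;;) {
+        FD_COPY(&master, &ready);  /* preserve master across calls */
+        nready = select(fd + 1, &ready, NULL, NULL, NULL);
+        if (nready == -1)
+                break;             /* check errno */
+        if (FD_ISSET(fd, &ready))
+                ;                  /* fd is ready for reading */
+}
+.Ed
+.Pp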
The behavior of these macros is undefined if a descriptor value is less than zero or greater than or equal to .Dv FD_SETSIZE , diff --git a/bsd/man/man2/semctl.2 b/bsd/man/man2/semctl.2 index 2a7e8eb3b..f5d6e8c32 100644 --- a/bsd/man/man2/semctl.2 +++ b/bsd/man/man2/semctl.2 @@ -31,8 +31,6 @@ .Sh NAME .Nm semctl .Nd control operations on a semaphore set -.Sh LIBRARY -.Lb libc .Sh SYNOPSIS .In sys/types.h .In sys/ipc.h diff --git a/bsd/man/man2/semget.2 b/bsd/man/man2/semget.2 index 47ef04913..8705b29e1 100644 --- a/bsd/man/man2/semget.2 +++ b/bsd/man/man2/semget.2 @@ -31,8 +31,6 @@ .Sh NAME .Nm semget .Nd obtain a semaphore id -.Sh LIBRARY -.Lb libc .Sh SYNOPSIS .In sys/types.h .In sys/ipc.h diff --git a/bsd/man/man2/semop.2 b/bsd/man/man2/semop.2 index 94896e750..82701cd74 100644 --- a/bsd/man/man2/semop.2 +++ b/bsd/man/man2/semop.2 @@ -31,8 +31,6 @@ .Sh NAME .Nm semop .Nd atomic array of operations on a semaphore set -.Sh LIBRARY -.Lb libc .Sh SYNOPSIS .In sys/types.h .In sys/ipc.h diff --git a/bsd/man/man2/send.2 b/bsd/man/man2/send.2 index 2d9daca68..36bf6c6c8 100644 --- a/bsd/man/man2/send.2 +++ b/bsd/man/man2/send.2 @@ -47,7 +47,7 @@ .Ft ssize_t .Fn send "int s" "const void *msg" "size_t len" "int flags" .Ft ssize_t -.Fn sendto "int s" "const void *msg" "size_t len" "int flags" "const struct sockaddr *to" "int tolen" +.Fn sendto "int s" "const void *msg" "size_t len" "int flags" "const struct sockaddr *to" "socklen_t tolen" .Ft ssize_t .Fn sendmsg "int s" "const struct msghdr *msg" "int flags" .Sh DESCRIPTION diff --git a/bsd/man/man2/setattrlist.2 b/bsd/man/man2/setattrlist.2 new file mode 100644 index 000000000..d2fbb6b53 --- /dev/null +++ b/bsd/man/man2/setattrlist.2 @@ -0,0 +1,363 @@ +.\" Copyright (c) 2003 Apple Computer, Inc. All rights reserved. +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. +.\" +.\" @(#)setattrlist.2 +. +.Dd December 15, 2003 +.Dt SETATTRLIST 2 +.Os Darwin +.Sh NAME +.Nm setattrlist +.Nd set file system attributes +.Sh SYNOPSIS +.Fd #include <sys/attr.h> +.Fd #include <unistd.h> +.Ft int +.Fn setattrlist "const char* path" "struct attrlist * attrList" "void * attrBuf" "size_t attrBufSize" "unsigned long options" +. +.Sh DESCRIPTION +The +.Fn setattrlist +function sets attributes (that is, metadata) of file system objects. +It is the logical opposite of +.Xr getattrlist 2 . +The function sets attributes about the file system object specified by +.Fa path +from the values in the buffer specified by +.Fa attrBuf +and +.Fa attrBufSize . +The +.Fa attrList +parameter determines what attributes are set. +The +.Fa options +parameter lets you control specific aspects of the function's behaviour. +.Pp +. +The +.Fn setattrlist +function is only supported by certain volume format implementations. 
+For maximum compatibility, client programs should use high-level APIs +(such as the Carbon File Manager) to access file system attributes. +These high-level APIs include logic to emulate file system attributes +on volumes that don't support +.Fn setattrlist . +.Pp +. +.\" path parameter +. +The +.Fa path +parameter must reference a valid file system object. +All directories listed in the path name leading to the object +must be searchable. +You must own the file system object in order to set any of the +following attributes: +.Pp +. +.Bl -item -compact +.It +ATTR_CMN_GRPID +.It +ATTR_CMN_ACCESSMASK +.It +ATTR_CMN_FLAGS +.It +ATTR_CMN_CRTIME +.It +ATTR_CMN_MODTIME +.It +ATTR_CMN_CHGTIME +.It +ATTR_CMN_ACCTIME +.El +.Pp +. +You must be root (that is, your process's effective UID must be 0) in order to change the +.Dv ATTR_CMN_OWNERID +attribute. +Setting other attributes requires that you have write access to the object. +.Pp +. +.\" attrList parameter +. +The +.Fa attrList +parameter is a pointer to an +.Vt attrlist +structure. +You are responsible for filling out all fields of this structure before calling the function. +See the discussion of the +.Xr getattrlist 2 +function for a detailed description of this structure. +To set an attribute you must set the corresponding bit in the appropriate +.Vt attrgroup_t +field of the +.Vt attrlist +structure. +.Pp +. +.\" attrBuf and attrBufSize parameters +. +The +.Fa attrBuf +and +.Fa attrBufSize +parameters specify a buffer that contains the attribute values to set. +Attributes are packed in exactly the same way as they are returned from +.Xr getattrlist 2 +except that, when setting attributes, the buffer does not include the leading +.Vt unsigned long +length value. +.Pp +. +.\" option parameter +. +The +.Fa options +parameter is a bit set that controls the behaviour of +.Fn setattrlist . +The following option bits are defined. +. +.Bl -tag -width XXXbitmapcount +. +.It FSOPT_NOFOLLOW +If this bit is set, +.Fn setattrlist +will not follow a symlink if it occurs as +the last component of +.Fa path . +. +.El +. +.Sh RETURN VALUES +Upon successful completion a value of 0 is returned. +Otherwise, a value of -1 is returned and +.Va errno +is set to indicate the error. +. +.Sh COMPATIBILITY +Not all volumes support +.Fn setattrlist . +However, if a volume supports +.Xr getattrlist 2 , +it must also support +.Fn setattrlist . +See the documentation for +.Xr getattrlist 2 +for details on how to tell whether a volume supports it. +.Pp +. +The +.Fn setattrlist +function has been undocumented for more than two years. +In that time a number of volume format implementations have been created without +a proper specification for the behaviour of this routine. +You may encounter volume format implementations with slightly different +behaviour than what is described here. +Your program is expected to be tolerant of this variant behaviour. +.Pp +. +If you're implementing a volume format that supports +.Fn setattrlist , +you should be careful to support the behaviour specified by this document. +. +.Sh ERRORS +.Fn setattrlist +will fail if: +.Bl -tag -width Er +. +.It Bq Er ENOTSUP +The volume does not support +.Fn setattrlist . +. +.It Bq Er ENOTDIR +A component of the path prefix is not a directory. +. +.It Bq Er ENAMETOOLONG +A component of a path name exceeded +.Dv NAME_MAX +characters, or an entire path name exceeded +.Dv PATH_MAX +characters. +. +.It Bq Er ENOENT +The file system object does not exist. +. +.It Bq Er EROFS +The volume is read-only. +. 
+.It Bq Er EACCES
+Search permission is denied for a component of the path prefix.
+.
+.It Bq Er ELOOP
+Too many symbolic links were encountered in translating the pathname.
+.
+.It Bq Er EFAULT
+.Fa path ,
+.Fa attrList
+or
+.Em attrBuf
+points to an invalid address.
+.
+.It Bq Er EINVAL
+The
+.Fa bitmapcount
+field of
+.Fa attrList
+is not
+.Dv ATTR_BIT_MAP_COUNT .
+.
+.It Bq Er EINVAL
+You try to set an invalid attribute.
+.
+.It Bq Er EINVAL
+You try to set an attribute that is read-only.
+.
+.It Bq Er EINVAL
+You try to set volume attributes and directory or file attributes at the same time.
+.
+.It Bq Er EINVAL
+You try to set volume attributes but
+.Fa path
+does not reference the root of the volume.
+.
+.It Bq Er EPERM
+You try to set an attribute that can only be set by the owner.
+.
+.It Bq Er EACCES
+You try to set an attribute that's only settable if you have write permission,
+and you do not have write permission.
+.
+.It Bq Er EINVAL
+The buffer size you specified in
+.Fa attrBufSize
+is too small to hold all the attributes that you are trying to set.
+.
+.It Bq Er EIO
+An I/O error occurred while reading from or writing to the file system.
+.El
+.Pp
+.
+.Sh CAVEATS
+.
+If you try to set any volume attributes, you must set
+.Dv ATTR_VOL_INFO
+in the
+.Fa volattr
+field, even though it consumes no data from the attribute buffer.
+.Pp
+.
+For more caveats, see also the compatibility notes above.
+.
+.Sh EXAMPLES
+.
+The following code shows how to set the file type and creator of
+a file by getting the
+.Dv ATTR_CMN_FNDRINFO
+attribute using
+.Xr getattrlist 2 ,
+modifying the appropriate fields of the 32-byte Finder information structure,
+and then setting the attribute back using
+.Fn setattrlist .
+This assumes that the target volume supports the required attributes.
+.
+.Bd -literal
+#include <assert.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/attr.h>
+#include <sys/errno.h>
+#include <unistd.h>
+#include <sys/vnode.h>
+.Pp
+.
+typedef struct attrlist attrlist_t;
+.Pp
+.
+struct FInfoAttrBuf {
+    unsigned long   length;
+    fsobj_type_t    objType;
+    char            finderInfo[32];
+};
+typedef struct FInfoAttrBuf FInfoAttrBuf;
+.Pp
+.
+static int FInfoDemo(
+    const char *path,
+    const char *type,
+    const char *creator
+)
+{
+    int             err;
+    attrlist_t      attrList;
+    FInfoAttrBuf    attrBuf;
+.Pp
+
+    assert( strlen(type) == 4 );
+    assert( strlen(creator) == 4 );
+.Pp
+.
+    memset(&attrList, 0, sizeof(attrList));
+    attrList.bitmapcount = ATTR_BIT_MAP_COUNT;
+    attrList.commonattr  = ATTR_CMN_OBJTYPE | ATTR_CMN_FNDRINFO;
+.Pp
+
+    err = getattrlist(path, &attrList, &attrBuf, sizeof(attrBuf), 0);
+    if (err != 0) {
+        err = errno;
+    }
+.Pp
+
+    if ( (err == 0) && (attrBuf.objType != VREG) ) {
+        fprintf(stderr, "Not a standard file.\en");
+        err = EINVAL;
+    } else {
+        memcpy( &attrBuf.finderInfo[0], type,    4 );
+        memcpy( &attrBuf.finderInfo[4], creator, 4 );
+
+        attrList.commonattr = ATTR_CMN_FNDRINFO;
+        err = setattrlist(
+            path,
+            &attrList,
+            attrBuf.finderInfo,
+            sizeof(attrBuf.finderInfo),
+            0
+        );
+    }
+.Pp
+    return err;
+}
+.Ed
+.Pp
+.
+.Sh SEE ALSO
+.
+.Xr chflags 2 ,
+.Xr chmod 2 ,
+.Xr chown 2 ,
+.Xr getattrlist 2 ,
+.Xr getdirentriesattr 2 ,
+.Xr searchfs 2 ,
+.Xr utimes 2
+.
+.Sh HISTORY
+A
+.Fn setattrlist
+function call appeared in Darwin 1.3.1 (Mac OS X version 10.0).
+.
diff --git a/bsd/man/man2/setxattr.2 b/bsd/man/man2/setxattr.2
new file mode 100644
index 000000000..01b444355
--- /dev/null
+++ b/bsd/man/man2/setxattr.2
@@ -0,0 +1,175 @@
+.\"
+.\" Copyright (c) 2004 Apple Computer, Inc. All rights reserved.
+.\"
+.\" @APPLE_LICENSE_HEADER_START@
+.\"
+.\" This file contains Original Code and/or Modifications of Original Code
+.\" as defined in and that are subject to the Apple Public Source License
+.\" Version 2.0 (the 'License'). You may not use this file except in
+.\" compliance with the License. Please obtain a copy of the License at
+.\" http://www.opensource.apple.com/apsl/ and read it before using this
+.\" file.
+.\"
+.\" The Original Code and all software distributed under the License are
+.\" distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+.\" FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+.\" Please see the License for the specific language governing rights and
+.\" limitations under the License.
+.\"
+.\" @APPLE_LICENSE_HEADER_END@
+.\"
+.Dd Oct 19, 2004
+.Dt SETXATTR 2
+.Os "Mac OS X"
+.Sh NAME
+.Nm setxattr,
+.Nm fsetxattr
+.Nd set an extended attribute value
+.Sh SYNOPSIS
+.Fd #include <sys/xattr.h>
+.Ft int
+.Fn setxattr "const char *path" "const char *name" "void *value" "size_t size" "u_int32_t position" "int options"
+.Ft int
+.Fn fsetxattr "int fd" "const char *name" "void *value" "size_t size" "u_int32_t position" "int options"
+.Sh DESCRIPTION
+Extended attributes extend the basic attributes associated with files and
+directories in the file system. They are stored as name:data pairs
+associated with file system objects (files, directories, symlinks, etc).
+.Pp
+.Fn setxattr
+associates
+.Fa name
+and
+.Fa value
+together as an attribute of
+.Fa path .
+.Pp
+An extended attribute's
+.Fa name
+is a simple NULL-terminated UTF-8 string.
+.Fa Value
+is a pointer to a data buffer of
+.Fa size
+bytes containing textual or binary data to be associated with the
+extended attribute.
+.Fa Position
+specifies the offset within the extended attribute. In the current
+implementation, only the resource fork extended attribute makes use of
+this argument. For all others,
+.Fa position
+is reserved and should be
+set to zero.
+.Pp
+.Fa options
+controls how the attribute is set:
+.Pp
+.Bl -tag -width XATTR_NOFOLLOW
+.It Dv XATTR_NOFOLLOW
+do not follow symbolic links.
+.Fn setxattr
+normally sets attributes on the target of
+.Fa path
+if it is a symbolic link.
+With this option,
+.Fn setxattr
+will act on the link itself.
+.It Dv XATTR_CREATE
+fail if the named attribute already exists.
+.It Dv XATTR_REPLACE
+fail if the named attribute does not exist. Failure to specify
+.Dv XATTR_REPLACE
+or
+.Dv XATTR_CREATE
+allows creation and replacement.
+.El
+.Pp
+.Fn fsetxattr
+is identical to
+.Fn setxattr ,
+except that it sets an extended attribute on an open file referenced by
+file descriptor
+.Fa fd .
+.Sh RETURN VALUES
+On success, 0 is returned. On failure, -1 is returned and the global
+variable
+.Va errno
+is set as follows.
+.Sh ERRORS
+.Bl -tag -width Er
+.It Bq Er EEXIST
+.Fa options
+contains
+.Em XATTR_CREATE
+and the named attribute already exists.
+.It Bq Er ENOATTR
+.Fa options
+is set to
+.Em XATTR_REPLACE
+and the named attribute does not exist.
+.It Bq Er ENOTSUP
+The file system does not support extended attributes or has them disabled.
+.It Bq Er EROFS +The file system is mounted read-only. +.It Bq Er ERANGE +The data size of the attribute is out of range (some attributes have size +restrictions). +.It Bq Er EPERM +.\" EFTYPE could be more specific but isn't POSIX +Attributes cannot be associated with this type of object. For example, +attributes are not allowed for resource forks. +.It Bq Er EINVAL +.Fa name +or +.Fa options +is invalid. +.Fa name +must be valid UTF-8 and +.Fa options +must make sense. +.It Bq Er ENOTDIR +A component of +.Fa path +is not a directory. +.It Bq Er ENAMETOOLONG +.Fa name +exceeded +.Dv XATTR_MAXNAMELEN +UTF-8 bytes, or a component of +.Fa path +exceeded +.Dv NAME_MAX +characters, or the entire +.Fa path +exceeded +.Dv PATH_MAX +characters. +.It Bq Er EACCES +Search permission is denied for a component of +.Fa path +or permission to set the attribute is denied. +.It Bq Er ELOOP +Too many symbolic links were encountered resolving +.Fa path . +.It Bq Er EFAULT +.Fa path +or +.Fa name +points to an invalid address. +.It Bq Er EIO +An I/O error occurred while reading from or writing to the file system. +.It Bq Er E2BIG +The data size of the extended attribute is too large. +.It Bq Er ENOSPC +Not enough space left on the file system. +.El +.Sh SEE ALSO +.Xr getxattr 2 , +.Xr removexattr 2 , +.Xr listxattr 2 +.Sh HISTORY +.Fn setxattr +and +.Fn fsetxattr +first appeared in Mac OS X 10.4. diff --git a/bsd/man/man2/shmget.2 b/bsd/man/man2/shmget.2 index 55ad6fbdd..1d13c4960 100644 --- a/bsd/man/man2/shmget.2 +++ b/bsd/man/man2/shmget.2 @@ -46,7 +46,7 @@ .Fn shmget returns the shared memory identifier associated with the key .Fa key . - +.Pp A shared memory segment is created if either .Fa key is equal to IPC_PRIVATE, or @@ -54,7 +54,7 @@ is equal to IPC_PRIVATE, or does not have a shared memory segment identifier associated with it, and the IPC_CREAT bit is set in .Fa shmflg. - +.Pp If a new shared memory segment is created, the data structure associated with it (the .Va shmid_ds structure, see diff --git a/bsd/man/man2/shutdown.2 b/bsd/man/man2/shutdown.2 index 927e3bdf3..e6799e727 100644 --- a/bsd/man/man2/shutdown.2 +++ b/bsd/man/man2/shutdown.2 @@ -52,13 +52,19 @@ the socket associated with to be shut down. If .Fa how -is 0, further receives will be disallowed. +is +.Dv SHUT_RD , +further receives will be disallowed. If .Fa how -is 1, further sends will be disallowed. +is +.Dv SHUT_WR , +further sends will be disallowed. If .Fa how -is 2, further sends and receives will be disallowed. +is +.Dv SHUT_RDWR , +further sends and receives will be disallowed. .Sh DIAGNOSTICS A 0 is returned if the call succeeds, -1 if it fails. .Sh ERRORS diff --git a/bsd/man/man2/vfork.2 b/bsd/man/man2/vfork.2 index 06e201dfc..c8b1d3bbd 100644 --- a/bsd/man/man2/vfork.2 +++ b/bsd/man/man2/vfork.2 @@ -94,7 +94,7 @@ since buffered data would then be flushed twice.) .Xr fork 2 , .Xr execve 2 , .Xr sigaction 2 , -.Xr wait 2 , +.Xr wait 2 .Sh DIAGNOSTICS Same as for .Xr fork . diff --git a/bsd/man/man4/Makefile b/bsd/man/man4/Makefile index 031eb7e86..3cc1cc953 100644 --- a/bsd/man/man4/Makefile +++ b/bsd/man/man4/Makefile @@ -10,11 +10,13 @@ DATAFILES = \ arp.4 \ bpf.4 \ divert.4 \ + dummynet.4 \ faith.4 \ fd.4 \ gif.4 \ icmp.4 \ icmp6.4 \ + ifmib.4 \ inet.4 \ inet6.4 \ ip.4 \ diff --git a/bsd/man/man4/arp.4 b/bsd/man/man4/arp.4 index acdad7029..8cfc8d3a6 100644 --- a/bsd/man/man4/arp.4 +++ b/bsd/man/man4/arp.4 @@ -122,5 +122,4 @@ same Internet address. .%A Karels, M.J. 
 .%B "Trailer Encapsulations
 .%T RFC893
-.Re
-
+.Re
\ No newline at end of file
diff --git a/bsd/man/man4/bpf.4 b/bsd/man/man4/bpf.4
index b79476efc..17b9876c3 100644
--- a/bsd/man/man4/bpf.4
+++ b/bsd/man/man4/bpf.4
@@ -93,6 +93,10 @@ packet can be processed per write.
 Currently, only writes to Ethernets and
 .Tn SLIP
 links are supported.
+.Pp
+When the last minor device is opened, an additional minor device is
+created on demand. The maximum number of devices that can be created is
+controlled by the sysctl debug.bpf_maxdevices.
 .Sh IOCTLS
 The
 .Xr ioctl 2
diff --git a/bsd/man/man4/dummynet.4 b/bsd/man/man4/dummynet.4
new file mode 100644
index 000000000..fbd317bf4
--- /dev/null
+++ b/bsd/man/man4/dummynet.4
@@ -0,0 +1,64 @@
+.\"
+.\" $FreeBSD: /repoman/r/ncvs/src/share/man/man4/dummynet.4,v 1.4.2.12 2002/11/18 21:51:16 luigi Exp $
+.\"
+.Dd October 28, 2002
+.Dt DUMMYNET 4
+.Os Darwin
+.Sh NAME
+.Nm dummynet
+.Nd traffic shaper, bandwidth manager and delay emulator
+.Sh DESCRIPTION
+.Em dummynet
+is a system facility that permits the control of traffic
+going through the various network interfaces, by applying bandwidth
+and queue size limitations, implementing different scheduling and queue
+management policies, and emulating delays and losses.
+.Pp
+The user interface for
+.Em dummynet
+is implemented by the
+.Nm ipfw
+program, so the reader is referred to the
+.Xr ipfw 8
+manpage for a complete description of the capabilities of
+.Nm
+and how to use it.
+.Sh KERNEL OPTIONS
+The following options in the kernel configuration file are related to
+.Nm
+operation:
+.Bd -literal
+    IPFIREWALL		- enable ipfirewall (required for dummynet).
+    IPFIREWALL_VERBOSE	- enable firewall output.
+    IPFIREWALL_VERBOSE_LIMIT - limit firewall output.
+    DUMMYNET		- enable dummynet operation.
+    NMBCLUSTERS		- sets the number of network packet buffers
+    HZ			- sets the timer granularity
+.Ed
+.Pp
+Generally, the following options are required:
+.Bd -literal
+    options IPFIREWALL
+    options DUMMYNET
+    options HZ=1000	# strongly recommended
+.Ed
+.Pp
+additionally, one may want to increase the number
+of mbuf clusters (used to store network packets) according to the
+sum of the bandwidth-delay products and queue sizes of all configured
+pipes.
+.Sh SEE ALSO
+.Xr setsockopt 2 ,
+.Xr bridge 4 ,
+.Xr ip 4 ,
+.Xr ipfw 8 ,
+.Xr sysctl 8
+.Sh HISTORY
+.Nm
+was initially implemented as a testing tool for TCP congestion control
+by
+.An Luigi Rizzo Aq luigi@iet.unipi.it ,
+as described in the ACM Computer Communication Review, Jan. 1997 issue.
+It was later modified to work at the IP and bridging
+level, integrated with the IPFW packet filter, and extended to
+support multiple queueing and scheduling policies.
diff --git a/bsd/man/man4/icmp6.4 b/bsd/man/man4/icmp6.4
index fe6cc97e4..28b1325d1 100644
--- a/bsd/man/man4/icmp6.4
+++ b/bsd/man/man4/icmp6.4
@@ -1,32 +1,9 @@
-.\" Copyright (C) 1999 WIDE Project.
-.\" All rights reserved.
-.\"
-.\" Redistribution and use in source and binary forms, with or without
-.\" modification, are permitted provided that the following conditions
-.\" are met:
-.\" 1. Redistributions of source code must retain the above copyright
-.\" notice, this list of conditions and the following disclaimer.
-.\" 2. Redistributions in binary form must reproduce the above copyright
-.\" notice, this list of conditions and the following disclaimer in the
-.\" documentation and/or other materials provided with the distribution.
-.\" 3.
Neither the name of the project nor the names of its contributors -.\" may be used to endorse or promote products derived from this software -.\" without specific prior written permission. -.\" -.\" THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND -.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -.\" ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE -.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -.\" SUCH DAMAGE. +.\" $NetBSD: icmp6.4,v 1.13 2005/01/11 06:01:41 itojun Exp $ +.\" $KAME: icmp6.4,v 1.6 2004/12/27 05:30:56 itojun Exp $ +.\" $OpenBSD: icmp6.4,v 1.19 2004/12/23 20:33:03 jaredy Exp $ .\" .\" Copyright (c) 1986, 1991, 1993 -.\" The Regents of the University of California. All rights reserved. +.\" The Regents of the University of California. All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions @@ -36,11 +13,7 @@ .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. -.\" 3. All advertising materials mentioning features or use of this software -.\" must display the following acknowledgement: -.\" This product includes software developed by the University of -.\" California, Berkeley and its contributors. -.\" 4. Neither the name of the University nor the names of its contributors +.\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" @@ -55,52 +28,42 @@ .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. -.\" -.\" KAME $Id: icmp6.4,v 1.2 2002/04/17 00:18:23 lindak Exp $ -.\" $FreeBSD: src/share/man/man4/icmp6.4,v 1.1.2.7 2001/12/17 11:30:12 ru Exp $ -.\" -.Dd March 13, 2000 +.Dd December 20, 2004 .Dt ICMP6 4 .Os -.\" .Sh NAME .Nm icmp6 .Nd Internet Control Message Protocol for IPv6 -.\" .Sh SYNOPSIS -.In sys/types.h .In sys/socket.h .In netinet/in.h .In netinet/icmp6.h .Ft int -.Fn socket AF_INET6 SOCK_RAW proto -.\" +.Fn socket AF_INET6 SOCK_RAW IPPROTO_ICMPV6 .Sh DESCRIPTION -.Tn ICMPv6 -is the error and control message protocol used -by -.Tn IPv6 -and the Internet protocol family. +ICMPv6 is the error and control message protocol used by IPv6 and the +IPv6 protocol family (see +.Xr ip6 4 +and +.Xr inet6 4 ) . It may be accessed through a .Dq raw socket for network monitoring and diagnostic functions. +.Pp The .Fa proto -parameter to the socket call to create an -.Tn ICMPv6 -socket is obtained from -.Xr getprotobyname 3 , -or you can use -.Dv IPPROTO_ICMPV6 . 
-.Tn ICMPv6
-sockets are connectionless, and are normally used with the
+parameter to the
+.Xr socket 2
+call to create an ICMPv6 socket may be obtained from
+.Xr getprotobyname 3 .
+ICMPv6 sockets are connectionless, and are normally used with the
 .Xr sendto 2
 and
 .Xr recvfrom 2
 calls, though the
 .Xr connect 2
 call may also be used to fix the destination for future packets
-(in which case the
+(in which case
 .Xr read 2
 or
 .Xr recv 2
@@ -110,158 +73,183 @@ or
 .Xr send 2
 system calls may be used).
 .Pp
-Outgoing packets automatically have an
-.Tn IPv6
-header prepended to them
+Outgoing packets automatically have an IPv6 header prepended to them
 (based on the destination address).
-.Tn ICMPv6
-pseudo header checksum field
-.Pq Li icmp6_cksum
-will be filled automatically by the kernel.
-Incoming packets are received without the
-.Tn IPv6
-header nor IPv6 extension headers.
-Notice that this behavior is opposite from
-.Tn IPv4
-raw sockets and.
-.Tn ICMPv4
-sockets.
+Incoming packets on the socket are received with the IPv6 header and any
+extension headers removed.
+.Ss Types
+ICMPv6 messages are classified according to the type and code fields
+present in the ICMPv6 header.
+The abbreviations for the types and codes may be used in rules in
+.Xr pf.conf 5 .
+The following types are defined:
+.Bl -column x xxxxxxxxxxxx -offset indent
+.It Sy Num Ta Sy Abbrev. Ta Sy Description
+.It 1 Ta unreach Ta "Destination unreachable"
+.It 2 Ta toobig Ta "Packet too big"
+.It 3 Ta timex Ta "Time exceeded"
+.It 4 Ta paramprob Ta "Invalid IPv6 header"
+.It 128 Ta echoreq Ta "Echo service request"
+.It 129 Ta echorep Ta "Echo service reply"
+.It 130 Ta groupqry Ta "Group membership query"
+.It 130 Ta listqry Ta "Multicast listener query"
+.It 131 Ta grouprep Ta "Group membership report"
+.It 131 Ta listenrep Ta "Multicast listener report"
+.It 132 Ta groupterm Ta "Group membership termination"
+.It 132 Ta listendone Ta "Multicast listener done"
+.It 133 Ta routersol Ta "Router solicitation"
+.It 134 Ta routeradv Ta "Router advertisement"
+.It 135 Ta neighbrsol Ta "Neighbor solicitation"
+.It 136 Ta neighbradv Ta "Neighbor advertisement"
+.It 137 Ta redir Ta "Shorter route exists"
+.It 138 Ta routrrenum Ta "Route renumbering"
+.It 139 Ta fqdnreq Ta "FQDN query"
+.It 139 Ta niqry Ta "Node information query"
+.It 139 Ta wrureq Ta "Who-are-you request"
+.It 140 Ta fqdnrep Ta "FQDN reply"
+.It 140 Ta nirep Ta "Node information reply"
+.It 140 Ta wrurep Ta "Who-are-you reply"
+.It 200 Ta mtraceresp Ta "mtrace response"
+.It 201 Ta mtrace Ta "mtrace messages"
+.El
+.Pp
+The following codes are defined:
+.Bl -column x xxxxxxxxxxxx xxxxxxxx -offset indent
+.It Sy Num Ta Sy Abbrev. Ta Sy Type Ta
+.Sy Description
+.It 0 Ta noroute-unr Ta unreach Ta "No route to destination"
+.It 1 Ta admin-unr Ta unreach Ta "Administratively prohibited"
+.It 2 Ta beyond-unr Ta unreach Ta "Beyond scope of source address"
+.It 2 Ta notnbr-unr Ta unreach Ta "Not a neighbor (obsolete)"
+.It 3 Ta addr-unr Ta unreach Ta "Address unreachable"
+.It 4 Ta port-unr Ta unreach Ta "Port unreachable"
+.It 0 Ta transit Ta timex Ta "Time exceeded in transit"
+.It 1 Ta reassemb Ta timex Ta "Time exceeded in reassembly"
+.It 0 Ta badhead Ta paramprob Ta "Erroneous header field"
+.It 1 Ta nxthdr Ta paramprob Ta "Unrecognized next header"
+.It 2 Ta "" Ta redir Ta "Unrecognized option"
+.It 0 Ta redironlink Ta redir Ta "Redirection to on-link node"
+.It 1 Ta redirrouter Ta redir Ta "Redirection to better router"
+.El
+.Ss Headers
+All ICMPv6 messages are prefixed with an ICMPv6 header.
+This header corresponds to the
+.Vt icmp6_hdr
+structure and has the following definition:
+.Bd -literal -offset indent
+struct icmp6_hdr {
+	u_int8_t	icmp6_type;	/* type field */
+	u_int8_t	icmp6_code;	/* code field */
+	u_int16_t	icmp6_cksum;	/* checksum field */
+	union {
+		u_int32_t	icmp6_un_data32[1]; /* type-specific */
+		u_int16_t	icmp6_un_data16[2]; /* type-specific */
+		u_int8_t	icmp6_un_data8[4];  /* type-specific */
+	} icmp6_dataun;
+} __packed;
+
+#define icmp6_data32	icmp6_dataun.icmp6_un_data32
+#define icmp6_data16	icmp6_dataun.icmp6_un_data16
+#define icmp6_data8	icmp6_dataun.icmp6_un_data8
+#define icmp6_pptr	icmp6_data32[0]	/* parameter prob */
+#define icmp6_mtu	icmp6_data32[0]	/* packet too big */
+#define icmp6_id	icmp6_data16[0]	/* echo request/reply */
+#define icmp6_seq	icmp6_data16[1]	/* echo request/reply */
+#define icmp6_maxdelay	icmp6_data16[0]	/* mcast group membership */
+.Ed
 .Pp
-.Ss ICMPv6 type/code filter
-Each
-.Tn ICMPv6
-raw socket has an associated filter whose datatype is defined as
-.Li struct icmp6_filter ;
+.Va icmp6_type
+describes the type of the message.
+Suitable values are defined in
+.Aq Pa netinet/icmp6.h .
+.Va icmp6_code
+describes the sub-type of the message and depends on
+.Va icmp6_type .
+.Va icmp6_cksum
+contains the checksum for the message and is filled in by the
+kernel on outgoing messages.
+The other fields are used for type-specific purposes.
+.Ss Filters
+Because of the extra functionality of ICMPv6 in comparison to ICMPv4,
+a larger number of messages may be potentially received on an ICMPv6
+socket.
+Input filters may therefore be used to restrict input to a subset of the
+incoming ICMPv6 messages so only interesting messages are returned by the
+.Xr recv 2
+family of calls to an application.
 .Pp
-This structure, along with the macros and constants defined later in
-this section, are defined as a result of including the
-.Aq Li netinet/icmp6.h
-header.
+The
+.Vt icmp6_filter
+structure may be used to refine the input message set according to the
+ICMPv6 type.
+By default, all message types are allowed on newly created raw ICMPv6
+sockets.
+The following macros may be used to refine the input set:
+.Bl -tag -width Ds
+.It Fn "void ICMP6_FILTER_SETPASSALL" "struct icmp6_filter *filterp"
+Allow all incoming messages.
+.Va filterp
+is modified to allow all message types.
+.It Fn "void ICMP6_FILTER_SETBLOCKALL" "struct icmp6_filter *filterp"
+Ignore all incoming messages.
+.Va filterp
+is modified to ignore all message types.
+.It Fn "void ICMP6_FILTER_SETPASS" "int type" \
+    "struct icmp6_filter *filterp"
+Allow ICMPv6 messages with the given
+.Fa type .
+.Va filterp
+is modified to allow such messages.
+.It Fn "void ICMP6_FILTER_SETBLOCK" "int type" \
+    "struct icmp6_filter *filterp"
+Ignore ICMPv6 messages with the given
+.Fa type .
+.Va filterp
+is modified to ignore such messages.
+.It Fn "int ICMP6_FILTER_WILLPASS" "int type" \
+    "const struct icmp6_filter *filterp"
+Determine if the given filter will allow an ICMPv6 message of the given
+type.
+.It Fn "int ICMP6_FILTER_WILLBLOCK" "int type" \
+    "const struct icmp6_filter *filterp"
+Determine if the given filter will ignore an ICMPv6 message of the given
+type.
+.El
 .Pp
-The current filter is fetched and stored using
+The
 .Xr getsockopt 2
 and
 .Xr setsockopt 2
-with a level of
+calls may be used to obtain and install the filter on ICMPv6 sockets at
+option level
 .Dv IPPROTO_ICMPV6
-and an option name of
-.Dv ICMP6_FILTER .
-.Pp
-Six macros operate on an icmp6_filter structure:
-.\" is "Fn" legal for macros?
-.Bl -item -offset indent
-.It
-.Ft void
-.Fn ICMP6_FILTER_SETPASSALL "struct icmp6_filter *filterp"
-.It
-.Ft void
-.Fn ICMP6_FILTER_SETBLOCKALL "struct icmp6_filter *filterp"
-.It
-.Ft void
-.Fn ICMP6_FILTER_SETPASS "int type" "struct icmp6_filter *filterp"
-.It
-.Ft void
-.Fn ICMP6_FILTER_SETBLOCK "int type" "struct icmp6_filter *filterp"
-.It
-.Ft int
-.Fn ICMP6_FILTER_WILLPASS "int type" "const struct icmp6_filter *filterp"
-.It
-.Ft int
-.Fn ICMP6_FILTER_WILLBLOCK "int type" "const struct icmp6_filter *filterp"
-.El
-.Pp
-The first argument to the last four macros
-(an integer)
-is an
-.Tn ICMPv6
-message type, between 0 and 255.
-The pointer argument to all six
-macros is a pointer to a filter that is modified by the first four
-macros examined by the last two macros.
-.Pp
-The first two macros,
-.Dv SETPASSALL
-and
-.Dv SETBLOCKALL ,
-let us specify that
-all
-.Tn ICMPv6
-messages are passed to the application or that all
-.Tn ICMPv6
-messages are blocked from being passed to the application.
-.Pp
-The next two macros,
-.Dv SETPASS
-and
-.Dv SETBLOCK ,
-let us specify that
-messages of a given
-.Tn ICMPv6
-type should be passed to the application
-or not passed to the application
-(blocked).
-.Pp
-The final two macros,
-.Dv WILLPASS
-and
-.Dv WILLBLOCK ,
-return true or false
-depending whether the specified message type is passed to the
-application or blocked from being passed to the application by the
-filter pointed to by the second argument.
-.Pp
-When an
-.Tn ICMPv6
-raw socket is created, it will by default pass all
-.Tn ICMPv6
-message types to the application.
-.Pp
-For further discussions see RFC2292.
-.\"
-.Sh ERRORS
-A socket operation may fail with one of the following errors returned:
-.Bl -tag -width Er
-.It Bq Er EISCONN
-when trying to establish a connection on a socket which
-already has one, or when trying to send a datagram with the destination
-address specified and the socket is already connected;
-.It Bq Er ENOTCONN
-when trying to send a datagram, but
-no destination address is specified, and the socket hasn't been
-connected;
-.It Bq Er ENOBUFS
-when the system runs out of memory for
-an internal data structure;
-.It Bq Er EADDRNOTAVAIL
-when an attempt is made to create a
-socket with a network address for which no network interface exists.
-.El
-.\"
+and name
+.Dv ICMP6_FILTER
+with a pointer to the
+.Vt icmp6_filter
+structure as the option value.
 .Sh SEE ALSO
+.Xr getsockopt 2 ,
 .Xr recv 2 ,
 .Xr send 2 ,
+.Xr setsockopt 2 ,
+.Xr socket 2 ,
+.Xr getprotobyname 3 ,
 .Xr inet6 4 ,
-.Xr intro 4 ,
-.Xr ip6 4
+.Xr ip6 4 ,
+.Xr netintro 4
 .Rs
 .%A W.
Stevens .%A M. Thomas -.%R RFC -.%N 2292 +.%T Advanced Sockets API for IPv6 +.%N RFC 2292 .%D February 1998 -.%T "Advanced Sockets API for IPv6" .Re .Rs .%A A. Conta .%A S. Deering -.%R RFC -.%N 2463 +.%T "Internet Control Message Protocol (ICMPv6) for the Internet" \ + "Protocol Version 6 (IPv6) Specification" +.%N RFC 2463 .%D December 1998 -.%T "Internet Control Message Protocol (ICMPv6) for the Internet Protocol Version 6 (IPv6) Specification" .Re -.\" -.Sh HISTORY -The implementation is based on KAME stack -(which is descendant of WIDE hydrangea IPv6 stack kit). -.Pp -Part of the document was shamelessly copied from RFC2292. diff --git a/bsd/man/man4/ifmib.4 b/bsd/man/man4/ifmib.4 new file mode 100644 index 000000000..db51a73d2 --- /dev/null +++ b/bsd/man/man4/ifmib.4 @@ -0,0 +1,196 @@ +.\" Copyright 1996 Massachusetts Institute of Technology +.\" +.\" Permission to use, copy, modify, and distribute this software and +.\" its documentation for any purpose and without fee is hereby +.\" granted, provided that both the above copyright notice and this +.\" permission notice appear in all copies, that both the above +.\" copyright notice and this permission notice appear in all +.\" supporting documentation, and that the name of M.I.T. not be used +.\" in advertising or publicity pertaining to distribution of the +.\" software without specific, written prior permission. M.I.T. makes +.\" no representations about the suitability of this software for any +.\" purpose. It is provided "as is" without express or implied +.\" warranty. +.\" +.\" THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS +.\" ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, +.\" INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +.\" MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT +.\" SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +.\" SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +.\" LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +.\" USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +.\" ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +.\" OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +.\" OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD: /repoman/r/ncvs/src/share/man/man4/ifmib.4,v 1.20 2004/07/03 18:29:20 ru Exp $ +.\" +.Dd November 15, 1996 +.Dt IFMIB 4 +.Os +.Sh NAME +.Nm ifmib +.Nd Management Information Base for network interfaces +.Sh SYNOPSIS +.In sys/types.h +.In sys/socket.h +.In sys/sysctl.h +.In sys/time.h +.In net/if.h +.In net/if_mib.h +.Sh DESCRIPTION +The +.Nm +facility is an application of the +.Xr sysctl 3 +interface to provide management information about network interfaces +to client applications such as +.Xr netstat 1 , +.Xr slstat 8 , +and +.Tn SNMP +management agents. +This information is structured as a table, where +each row in the table represents a logical network interface (either a +hardware device or a software pseudo-device like +.Xr lo 4 ) . +There are two columns in the table, each containing a single +structure: one column contains generic information relevant to all +interfaces, and the other contains information specific to the +particular class of interface. +(Generally the latter will implement +the +.Tn SNMP +.Tn MIB +defined for that particular interface class, if one exists and can be +implemented in the kernel.) 
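+.Pp
+For example, the number of rows in the table can be read with a single
+.Xr sysctl 3
+call.
+The following fragment is a minimal, illustrative sketch; the manifest
+constants it uses are described in detail below:
+.Bd -literal -offset indent
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <net/if.h>
+#include <net/if_mib.h>
+
+/* Fetch net.link.generic.system.ifcount. */
+int
+get_ifcount(int *count)
+{
+	int name[5];
+	size_t len;
+
+	name[0] = CTL_NET;
+	name[1] = PF_LINK;
+	name[2] = NETLINK_GENERIC;
+	name[3] = IFMIB_SYSTEM;
+	name[4] = IFMIB_IFCOUNT;
+
+	len = sizeof(*count);
+	return sysctl(name, 5, count, &len, (void *)0, 0);
+}
+.Ed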
+.Pp +The +.Nm +facility is accessed via the +.Dq Li net.link.generic +branch of the +.Xr sysctl 3 +MIB. +The manifest constants for each level in the +.Xr sysctl 3 +.Ar name +are defined in +.In net/if_mib.h . +The index of the last row in the table is given by +.Dq Li net.link.generic.system.ifcount +(or, using the manifest constants, +.Dv CTL_NET , +.Dv PF_LINK , +.Dv NETLINK_GENERIC , +.Dv IFMIB_SYSTEM , +.Dv IFMIB_IFCOUNT ) . +A management application searching for a particular interface should +start with row 1 and continue through the table row-by-row until the +desired interface is found, or the interface count is reached. +Note that the table may be sparse, i.e., a given row may not exist, +indicated by an +.Va errno +of +.Er ENOENT . +Such an error should be ignored, and the next row should be checked. +.Pp +The generic interface information, common to all interfaces, +can be accessed via the following procedure: +.Bd -literal -offset indent +int +get_ifmib_general(int row, struct ifmibdata *ifmd) +{ + int name[6]; + size_t len; + + name[0] = CTL_NET; + name[1] = PF_LINK; + name[2] = NETLINK_GENERIC; + name[3] = IFMIB_IFDATA; + name[4] = row; + name[5] = IFDATA_GENERAL; + + len = sizeof(*ifmd); + + return sysctl(name, 6, ifmd, &len, (void *)0, 0); +} +.Ed +.Pp +The fields in +.Li struct ifmibdata +are as follows: +.Bl -tag -width "ifmd_snd_drops" +.It Li ifmd_name +.Pq Li "char []" +the name of the interface, including the unit number +.It Li ifmd_pcount +.Pq Li int +the number of promiscuous listeners +.It Li ifmd_flags +.Pq Li int +the interface's flags (defined in +.In net/if.h ) +.It Li ifmd_snd_len +.Pq Li int +the current instantaneous length of the send queue +.It Li ifmd_snd_drops +.Pq Li int +the number of packets dropped at this interface because the send queue +was full +.It Li ifmd_data +.Pq Li struct if_data +more information from a structure defined in +.In net/if.h +(see +.Xr if_data 9 ) +.El +.Pp +Class-specific information can be retrieved by examining the +.Dv IFDATA_LINKSPECIFIC +column instead. +Note that the form and length of the structure will +depend on the class of interface. +For +.Dv IFT_ETHER , +.Dv IFT_ISO88023 , +and +.Dv IFT_STARLAN +interfaces, the structure is called +.Dq Li struct ifmib_iso_8802_3 +(defined in +.In net/if_mib.h ) , +and implements a superset of the +.Tn "RFC 1650" +MIB for Ethernet-like networks. +.\" This will eventually be defined in an ethermib(4) page. +For +.Dv IFT_SLIP , +the structure is a +.Dq Li struct sl_softc +.Pq In net/if_slvar.h . +.Sh SEE ALSO +.Xr sysctl 3 , +.Xr intro 4 , +.Xr ifnet 9 +.\" .Xr ethermib 4 , +.Rs +.%T "Definitions of Managed Objects for the Ethernet-like Interface Types Using SMIv2" +.%A F. Kastenholz +.%D August 1994 +.%O RFC 1650 +.Re +.Sh BUGS +Many Ethernet-like interfaces do not yet support the Ethernet MIB; +the interfaces known to support it include +.Xr ed 4 +and +.Xr de 4 . +Regardless, all interfaces automatically support the generic MIB. +.Sh HISTORY +The +.Nm +interface first appeared in +.Fx 2.2 . diff --git a/bsd/man/man4/ip6.4 b/bsd/man/man4/ip6.4 index 3c2b1eb98..25df62c8e 100644 --- a/bsd/man/man4/ip6.4 +++ b/bsd/man/man4/ip6.4 @@ -1,31 +1,6 @@ -.\" $KAME: ip6.4,v 1.14 2001/02/26 09:31:39 itojun Exp $ -.\" -.\" Copyright (C) 1999 WIDE Project. -.\" All rights reserved. -.\" -.\" Redistribution and use in source and binary forms, with or without -.\" modification, are permitted provided that the following conditions -.\" are met: -.\" 1. 
Redistributions of source code must retain the above copyright -.\" notice, this list of conditions and the following disclaimer. -.\" 2. Redistributions in binary form must reproduce the above copyright -.\" notice, this list of conditions and the following disclaimer in the -.\" documentation and/or other materials provided with the distribution. -.\" 3. Neither the name of the project nor the names of its contributors -.\" may be used to endorse or promote products derived from this software -.\" without specific prior written permission. -.\" -.\" THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND -.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -.\" ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE -.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -.\" SUCH DAMAGE. +.\" $NetBSD: ip6.4,v 1.20 2005/01/11 06:01:41 itojun Exp $ +.\" $KAME: ip6.4,v 1.23 2005/01/11 05:56:25 itojun Exp $ +.\" $OpenBSD: ip6.4,v 1.21 2005/01/06 03:50:46 itojun Exp $ .\" .\" Copyright (c) 1983, 1991, 1993 .\" The Regents of the University of California. All rights reserved. @@ -38,11 +13,7 @@ .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. -.\" 3. All advertising materials mentioning features or use of this software -.\" must display the following acknowledgement: -.\" This product includes software developed by the University of -.\" California, Berkeley and its contributors. -.\" 4. Neither the name of the University nor the names of its contributors +.\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" @@ -57,651 +28,659 @@ .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. -.\" -.\" $FreeBSD: src/share/man/man4/ip6.4,v 1.1.2.8 2001/12/17 11:30:12 ru Exp $ -.\" -.Dd March 13, 2000 +.Dd December 29, 2004 .Dt IP6 4 .Os -.\" .Sh NAME .Nm ip6 -.Nd Internet Protocol version 6 (IPv6) -.\" +.Nd Internet Protocol version 6 (IPv6) network layer .Sh SYNOPSIS -.In sys/types.h .In sys/socket.h .In netinet/in.h .Ft int .Fn socket AF_INET6 SOCK_RAW proto -.\" .Sh DESCRIPTION -.Tn IPv6 -is the network layer protocol used by the Internet protocol version 6 family -.Pq Dv AF_INET6 . -Options may be set at the -.Tn IPv6 -level when using higher-level protocols that are based on -.Tn IPv6 -(such as -.Tn TCP +The IPv6 network layer is used by the IPv6 protocol family for +transporting data. +IPv6 packets contain an IPv6 header that is not provided as part of the +payload contents when passed to an application. 
+IPv6 header options affect the behavior of this protocol and may be used +by high-level protocols (such as the +.Xr tcp 4 and -.Tn UDP ) . -It may also be accessed through a -.Dq raw socket -when developing new protocols, or special-purpose applications. -.Pp -There are several -.Tn IPv6-level -.Xr setsockopt 2 Ns / Ns Xr getsockopt 2 -options. -They are separated into the basic IPv6 sockets API -(defined in RFC2553), -and the advanced API -(defined in RFC2292). -The basic API looks very similar to the API presented in -.Xr ip 4 . -Advanced API uses ancillary data and can handle more complex cases. +.Xr udp 4 +protocols) as well as directly by +.Dq raw sockets , +which process IPv6 messages at a lower-level and may be useful for +developing new protocols and special-purpose applications. +.Ss Header +All IPv6 packets begin with an IPv6 header. +When data received by the kernel are passed to the application, this +header is not included in buffer, even when raw sockets are being used. +Likewise, when data are sent to the kernel for transmit from the +application, the buffer is not examined for an IPv6 header: +the kernel always constructs the header. +To directly access IPv6 headers from received packets and specify them +as part of the buffer passed to the kernel, link-level access +.Po +.Xr bpf 4 , +for example +.Pc +must instead be utilized. .Pp -To specify some of socket options, certain privilege -(i.e. root privilege) is required. -.\" -.Ss Basic IPv6 sockets API -.Dv IPV6_UNICAST_HOPS -may be used to set the hoplimit field in the -.Tn IPv6 -header. -As symbol name suggests, the option controls hoplimit field on unicast packets. -If -1 is specified, the kernel will use a default value. -If a value of 0 to 255 is specified, the packet will have the specified -value as hoplimit. -Other values are considered invalid, and -.Er EINVAL -will be returned. -For example: +The header has the following definition: .Bd -literal -offset indent -int hlim = 60; /* max = 255 */ -setsockopt(s, IPPROTO_IPV6, IPV6_UNICAST_HOPS, &hlim, sizeof(hlim)); +struct ip6_hdr { + union { + struct ip6_hdrctl { + u_int32_t ip6_un1_flow; /* 20 bits of flow ID */ + u_int16_t ip6_un1_plen; /* payload length */ + u_int8_t ip6_un1_nxt; /* next header */ + u_int8_t ip6_un1_hlim; /* hop limit */ + } ip6_un1; + u_int8_t ip6_un2_vfc; /* version and class */ + } ip6_ctlun; + struct in6_addr ip6_src; /* source address */ + struct in6_addr ip6_dst; /* destination address */ +} __packed; + +#define ip6_vfc ip6_ctlun.ip6_un2_vfc +#define ip6_flow ip6_ctlun.ip6_un1.ip6_un1_flow +#define ip6_plen ip6_ctlun.ip6_un1.ip6_un1_plen +#define ip6_nxt ip6_ctlun.ip6_un1.ip6_un1_nxt +#define ip6_hlim ip6_ctlun.ip6_un1.ip6_un1_hlim +#define ip6_hops ip6_ctlun.ip6_un1.ip6_un1_hlim .Ed .Pp -.Tn IPv6 -multicasting is supported only on -.Dv AF_INET6 -sockets of type -.Dv SOCK_DGRAM +All fields are in network-byte order. +Any options specified (see +.Sx Options +below) must also be specified in network-byte order. +.Pp +.Va ip6_flow +specifies the flow ID. +.Va ip6_plen +specifies the payload length. +.Va ip6_nxt +specifies the type of the next header. +.Va ip6_hlim +specifies the hop limit. +.Pp +The top 4 bits of +.Va ip6_vfc +specify the class and the bottom 4 bits specify the version. +.Pp +.Va ip6_src and -.Dv SOCK_RAW, -and only on networks where the interface driver supports multicasting. +.Va ip6_dst +specify the source and destination addresses. 
.Pp -The -.Dv IPV6_MULTICAST_HOPS -option changes the hoplimit for outgoing multicast datagrams -in order to control the scope of the multicasts: +The IPv6 header may be followed by any number of extension headers that start +with the following generic definition: .Bd -literal -offset indent -unsigned int hlim; /* range: 0 to 255, default = 1 */ -setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, &hlim, sizeof(hlim)); +struct ip6_ext { + u_int8_t ip6e_nxt; + u_int8_t ip6e_len; +} __packed; .Ed -.Pp -Datagrams with a hoplimit of 1 are not forwarded beyond the local network. -Multicast datagrams with a hoplimit of 0 will not be transmitted on any network, -but may be delivered locally if the sending host belongs to the destination -group and if multicast loopback has not been disabled on the sending socket -(see below). -Multicast datagrams with hoplimit greater than 1 may be forwarded -to other networks if a multicast router is attached to the local network. -.Pp -For hosts with multiple interfaces, each multicast transmission is -sent from the primary network interface. -The -.Dv IPV6_MULTICAST_IF -option overrides the default for -subsequent transmissions from a given socket: -.Bd -literal -offset indent -unsigned int outif; -outif = if_nametoindex("ne0"); -setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_IF, &outif, sizeof(outif)); -.Ed -.Pp -where "outif" is an interface index of the desired interface, -or 0 to specify the default interface. -.Pp -If a multicast datagram is sent to a group to which the sending host itself -belongs (on the outgoing interface), a copy of the datagram is, by default, -looped back by the IPv6 layer for local delivery. -The -.Dv IPV6_MULTICAST_LOOP -option gives the sender explicit control -over whether or not subsequent datagrams are looped back: -.Bd -literal -offset indent -u_char loop; /* 0 = disable, 1 = enable (default) */ -setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, &loop, sizeof(loop)); -.Ed -.Pp -This option -improves performance for applications that may have no more than one -instance on a single host (such as a router daemon), by eliminating -the overhead of receiving their own transmissions. -It should generally not be used by applications for which there -may be more than one instance on a single host (such as a conferencing -program) or for which the sender does not belong to the destination -group (such as a time querying program). -.Pp -A multicast datagram sent with an initial hoplimit greater than 1 may be delivered -to the sending host on a different interface from that on which it was sent, -if the host belongs to the destination group on that other interface. -The loopback control option has no effect on such delivery. -.Pp +.Ss Options +IPv6 allows header options on packets to manipulate the behavior of the +protocol. +These options and other control requests are accessed with the +.Xr getsockopt 2 +and +.Xr setsockopt 2 +system calls at level +.Dv IPPROTO_IPV6 +and by using ancillary data in +.Xr recvmsg 2 +and +.Xr sendmsg 2 . +They can be used to access most of the fields in the IPv6 header and +extension headers. +.Pp +The following socket options are supported: +.Bl -tag -width Ds +.\" .It Dv IPV6_OPTIONS +.It Dv IPV6_UNICAST_HOPS Fa "int *" +Get or set the default hop limit header field for outgoing unicast +datagrams sent on this socket. +A value of \-1 resets to the default value. 
+.\" .It Dv IPV6_RECVOPTS Fa "int *" +.\" Get or set the status of whether all header options will be +.\" delivered along with the datagram when it is received. +.\" .It Dv IPV6_RECVRETOPTS Fa "int *" +.\" Get or set the status of whether header options will be delivered +.\" for reply. +.\" .It Dv IPV6_RECVDSTADDR Fa "int *" +.\" Get or set the status of whether datagrams are received with +.\" destination addresses. +.\" .It Dv IPV6_RETOPTS +.\" Get or set IPv6 options. +.It Dv IPV6_MULTICAST_IF Fa "u_int *" +Get or set the interface from which multicast packets will be sent. +For hosts with multiple interfaces, each multicast transmission is sent +from the primary network interface. +The interface is specified as its index as provided by +.Xr if_nametoindex 3 . +A value of zero specifies the default interface. +.It Dv IPV6_MULTICAST_HOPS Fa "int *" +Get or set the default hop limit header field for outgoing multicast +datagrams sent on this socket. +This option controls the scope of multicast datagram transmissions. +.Pp +Datagrams with a hop limit of 1 are not forwarded beyond the local +network. +Multicast datagrams with a hop limit of zero will not be transmitted on +any network but may be delivered locally if the sending host belongs to +the destination group and if multicast loopback (see below) has not been +disabled on the sending socket. +Multicast datagrams with a hop limit greater than 1 may be forwarded to +the other networks if a multicast router (such as +.Xr mrouted 8 ) +is attached to the local network. +.It Dv IPV6_MULTICAST_LOOP Fa "u_int *" +Get or set the status of whether multicast datagrams will be looped back +for local delivery when a multicast datagram is sent to a group to which +the sending host belongs. +.Pp +This option improves performance for applications that may have no more +than one instance on a single host (such as a router daemon) by +eliminating the overhead of receiving their own transmissions. +It should generally not be used by applications for which there may be +more than one instance on a single host (such as a conferencing program) +or for which the sender does not belong to the destination group +(such as a time-querying program). +.Pp +A multicast datagram sent with an initial hop limit greater than 1 may +be delivered to the sending host on a different interface from that on +which it was sent if the host belongs to the destination group on that +other interface. +The multicast loopback control option has no effect on such delivery. +.It Dv IPV6_JOIN_GROUP Fa "struct ipv6_mreq *" +Join a multicast group. A host must become a member of a multicast group before it can receive datagrams sent to the group. -To join a multicast group, use the -.Dv IPV6_JOIN_GROUP -option: -.Bd -literal -offset indent -struct ipv6_mreq mreq6; -setsockopt(s, IPPROTO_IPV6, IPV6_JOIN_GROUP, &mreq6, sizeof(mreq6)); +.Bd -literal +struct ipv6_mreq { + struct in6_addr ipv6mr_multiaddr; + unsigned int ipv6mr_interface; +}; .Ed .Pp -where -.Fa mreq6 -is the following structure: -.Bd -literal -offset indent -struct ipv6_mreq { - struct in6_addr ipv6mr_multiaddr; - u_int ipv6mr_interface; +.Va ipv6mr_interface +may be set to zeroes to choose the default multicast interface or to the +index of a particular multicast-capable interface if the host is +multihomed. +Membership is associated with a single interface; programs running on +multihomed hosts may need to join the same group on more than one +interface. 
+.Pp +If the multicast address is unspecified (i.e., all zeroes), messages +from all multicast addresses will be accepted by this group. +Note that setting to this value requires superuser privileges. +.It Dv IPV6_LEAVE_GROUP Fa "struct ipv6_mreq *" +Drop membership from the associated multicast group. +Memberships are automatically dropped when the socket is closed or when +the process exits. +.It Dv IPV6_PORTRANGE Fa "int *" +Get or set the allocation policy of ephemeral ports for when the kernel +automatically binds a local address to this socket. +The following values are available: +.Pp +.Bl -tag -width IPV6_PORTRANGE_DEFAULT -compact +.It Dv IPV6_PORTRANGE_DEFAULT +Use the regular range of non-reserved ports (varies, see +.Xr sysctl 8 ) . +.It Dv IPV6_PORTRANGE_HIGH +Use a high range (varies, see +.Xr sysctl 8 ) . +.It Dv IPV6_PORTRANGE_LOW +Use a low, reserved range (600\-1023). +.El +.It Dv IPV6_PKTINFO Fa "int *" +Get or set whether additional information about subsequent packets will +be provided as ancillary data along with the payload in subsequent +.Xr recvmsg 2 +calls. +The information is stored in the following structure in the ancillary +data returned: +.Bd -literal +struct in6_pktinfo { + struct in6_addr ipi6_addr; /* src/dst IPv6 address */ + unsigned int ipi6_ifindex; /* send/recv if index */ }; .Ed +.It Dv IPV6_HOPLIMIT Fa "int *" +Get or set whether the hop limit header field from subsequent packets +will be provided as ancillary data along with the payload in subsequent +.Xr recvmsg 2 +calls. +The value is stored as an +.Vt int +in the ancillary data returned. +.\" .It Dv IPV6_NEXTHOP Fa "int *" +.\" Get or set whether the address of the next hop for subsequent +.\" packets will be provided as ancillary data along with the payload in +.\" subsequent +.\" .Xr recvmsg 2 +.\" calls. +.\" The option is stored as a +.\" .Vt sockaddr +.\" structure in the ancillary data returned. +.\" .Pp +.\" This option requires superuser privileges. +.It Dv IPV6_HOPOPTS Fa "int *" +Get or set whether the hop-by-hop options from subsequent packets will be +provided as ancillary data along with the payload in subsequent +.Xr recvmsg 2 +calls. +The option is stored in the following structure in the ancillary data +returned: +.Bd -literal +struct ip6_hbh { + u_int8_t ip6h_nxt; /* next header */ + u_int8_t ip6h_len; /* length in units of 8 octets */ +/* followed by options */ +} __packed; +.Ed .Pp -.Dv ipv6mr_interface -should be 0 to choose the default multicast interface, or the -interface index of a particular multicast-capable interface if -the host is multihomed. -Membership is associated with a single interface; -programs running on multihomed hosts may need to -join the same group on more than one interface. +The +.Fn inet6_option_space +routine and family of routines may be used to manipulate this data. .Pp -To drop a membership, use: -.Bd -literal -offset indent -struct ipv6_mreq mreq6; -setsockopt(s, IPPROTO_IPV6, IPV6_LEAVE_GROUP, &mreq6, sizeof(mreq6)); +This option requires superuser privileges. +.It Dv IPV6_DSTOPTS Fa "int *" +Get or set whether the destination options from subsequent packets will +be provided as ancillary data along with the payload in subsequent +.Xr recvmsg 2 +calls. 
+The option is stored in the following structure in the ancillary data +returned: +.Bd -literal +struct ip6_dest { + u_int8_t ip6d_nxt; /* next header */ + u_int8_t ip6d_len; /* length in units of 8 octets */ +/* followed by options */ +} __packed; .Ed .Pp -where -.Fa mreq6 -contains the same values as used to add the membership. -Memberships are dropped when the socket is closed or the process exits. +The +.Fn inet6_option_space +routine and family of routines may be used to manipulate this data. .Pp -.Dv IPV6_PORTRANGE -controls how ephemeral ports are allocated for -.Dv SOCK_STREAM -and -.Dv SOCK_DGRAM -sockets. -For example, -.Bd -literal -offset indent -int range = IPV6_PORTRANGE_LOW; /* see <netinet/in.h> */ -setsockopt(s, IPPROTO_IPV6, IPV6_PORTRANGE, &range, sizeof(range)); +This option requires superuser privileges. +.It Dv IPV6_RTHDR Fa "int *" +Get or set whether the routing header from subsequent packets will be +provided as ancillary data along with the payload in subsequent +.Xr recvmsg 2 +calls. +The header is stored in the following structure in the ancillary data +returned: +.Bd -literal +struct ip6_rthdr { + u_int8_t ip6r_nxt; /* next header */ + u_int8_t ip6r_len; /* length in units of 8 octets */ + u_int8_t ip6r_type; /* routing type */ + u_int8_t ip6r_segleft; /* segments left */ +/* followed by routing-type-specific data */ +} __packed; .Ed .Pp -.Dv IPV6_V6ONLY -controls behavior of -.Dv AF_INET6 -wildcard listening socket. -The following example sets the option to 1: -.Bd -literal -offset indent -int on = 1; -setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)); -.Ed +The +.Fn inet6_option_space +routine and family of routines may be used to manipulate this data. +.Pp +This option requires superuser privileges. +.It Dv IPV6_PKTOPTIONS Fa "struct cmsghdr *" +Get or set all header options and extension headers at one time on the +last packet sent or received on the socket. +All options must fit within the size of an mbuf (see +.Xr mbuf 9 ) . +Options are specified as a series of +.Vt cmsghdr +structures followed by corresponding values. +.Va cmsg_level +is set to +.Dv IPPROTO_IPV6 , +.Va cmsg_type +to one of the other values in this list, and trailing data to the option +value. +When setting options, if the length +.Va optlen +to +.Xr setsockopt 2 +is zero, all header options will be reset to their default values. +Otherwise, the length should specify the size the series of control +messages consumes. .Pp -If set to 1, -.Dv AF_INET6 -wildcard listening socket will accept IPv6 traffic only. -If set to 0, it will accept IPv4 traffic as well, -as if it was from IPv4 mapped address like -.Li ::ffff:10.1.1.1 . -.\" RFC2553 defines the behavior when the variable is set to 0. -Note that if you set it this to 0, -IPv4 access control gets much more complicated. -For example, even if you have no listening -.Dv AF_INET -listening socket on port -.Li X , -you will end up accepting IPv4 traffic by -.Dv AF_INET6 -listening socket on the same port. -The default value for this flag is copied at socket instantiation time, -from -.Li net.inet6.ip6.v6only -.Xr sysctl 3 -variable. -The option affects -.Tn TCP -and -.Tn UDP -sockets only. -.\" -.Ss Advanced IPv6 sockets API -The advanced IPv6 sockets API lets userland programs specify or obtain -details about the IPv6 header and the IPv6 extension headers on packets. -The advanced API uses ancillary data for passing data from/to the kernel. 
+Instead of using +.Xr sendmsg 2 +to specify option values, the ancillary data used in these calls that +correspond to the desired header options may be directly specified as +the control message in the series of control messages provided as the +argument to +.Xr setsockopt 2 . +.It Dv IPV6_CHECKSUM Fa "int *" +Get or set the byte offset into a packet where the 16-bit checksum is +located. +When set, this byte offset is where incoming packets will be expected +to have checksums of their data stored and where outgoing packets will +have checksums of their data computed and stored by the kernel. +A value of \-1 specifies that no checksums will be checked on incoming +packets and that no checksums will be computed or stored on outgoing +packets. +The offset of the checksum for ICMPv6 sockets cannot be relocated or +turned off. +.It Dv IPV6_V6ONLY Fa "int *" +Get or set whether only IPv6 connections can be made to this socket. +For wildcard sockets, this can restrict connections to IPv6 only. +.\"With +.\".Ox +.\"IPv6 sockets are always IPv6-only, so the socket option is read-only +.\"(not modifiable). +.It Dv IPV6_FAITH Fa "int *" +Get or set the status of whether +.Xr faith 4 +connections can be made to this socket. +.It Dv IPV6_USE_MIN_MTU Fa "int *" +Get or set whether the minimal IPv6 maximum transmission unit (MTU) size +will be used to avoid fragmentation from occurring for subsequent +outgoing datagrams. +.It Dv IPV6_AUTH_LEVEL Fa "int *" +Get or set the +.Xr ipsec 4 +authentication level. +.It Dv IPV6_ESP_TRANS_LEVEL Fa "int *" +Get or set the ESP transport level. +.It Dv IPV6_ESP_NETWORK_LEVEL Fa "int *" +Get or set the ESP encapsulation level. +.It Dv IPV6_IPCOMP_LEVEL Fa "int *" +Get or set the +.Xr ipcomp 4 +level. +.El .Pp -There are -.Xr setsockopt 2 Ns / Ns Xr getsockopt 2 -options to get optional information on incoming packets. -They are +The .Dv IPV6_PKTINFO , +.\" .Dv IPV6_NEXTHOP , .Dv IPV6_HOPLIMIT , .Dv IPV6_HOPOPTS , .Dv IPV6_DSTOPTS , and -.Dv IPV6_RTHDR . -.Bd -literal -offset indent -int on = 1; - -setsockopt(fd, IPPROTO_IPV6, IPV6_PKTINFO, &on, sizeof(on)); -setsockopt(fd, IPPROTO_IPV6, IPV6_HOPLIMIT, &on, sizeof(on)); -setsockopt(fd, IPPROTO_IPV6, IPV6_HOPOPTS, &on, sizeof(on)); -setsockopt(fd, IPPROTO_IPV6, IPV6_DSTOPTS, &on, sizeof(on)); -setsockopt(fd, IPPROTO_IPV6, IPV6_RTHDR, &on, sizeof(on)); -.Ed -.Pp -When any of these options are enabled, the corresponding data is -returned as control information by -.Xr recvmsg 2 , -as one or more ancillary data objects. -.Pp -If -.Dv IPV6_PKTINFO -is enabled, the destination IPv6 address and the arriving interface index -will be available via -.Li struct in6_pktinfo -on ancillary data stream. -You can pick the structure by checking for an ancillary data item with -.Li cmsg_level -equals to -.Dv IPPROTO_IPV6 , -and -.Li cmsg_type -equals to -.Dv IPV6_PKTINFO . -.Pp -If -.Dv IPV6_HOPLIMIT -is enabled, hoplimit value on the packet will be made available to the -userland program. -Ancillary data stream will contain an integer data item with -.Li cmsg_level -equals to -.Dv IPPROTO_IPV6 , -and -.Li cmsg_type -equals to -.Dv IPV6_HOPLIMIT . -.Pp -.Xr inet6_option_space 3 -and friends will help you parse ancillary data items for -.Dv IPV6_HOPOPTS -and -.Dv IPV6_DSTOPTS . -Similarly, -.Xr inet6_rthdr_space 3 -and friends will help you parse ancillary data items for -.Dv IPV6_RTHDR . 
-.Pp -.Dv IPV6_HOPOPTS +.Dv IPV6_RTHDR +options will return ancillary data along with payload contents in subsequent +.Xr recvmsg 2 +calls with +.Va cmsg_level +set to +.Dv IPPROTO_IPV6 and -.Dv IPV6_DSTOPTS -may appear multiple times on an ancillary data stream -(note that the behavior is slightly different than the specification). -Other ancillary data item will appear no more than once. -.Pp -For outgoing direction, -you can pass ancillary data items with normal payload data, using -.Xr sendmsg 2 . -Ancillary data items will be parsed by the kernel, and used to construct -the IPv6 header and extension headers. -For the 5 -.Li cmsg_level -values listed above, ancillary data format is the same as inbound case. -Additionally, you can specify -.Dv IPV6_NEXTHOP -data object. -The -.Dv IPV6_NEXTHOP -ancillary data object specifies the next hop for the -datagram as a socket address structure. -In the -.Li cmsghdr -structure -containing this ancillary data, the -.Li cmsg_level -member will be -.Dv IPPROTO_IPV6 , -the -.Li cmsg_type -member will be -.Dv IPV6_NEXTHOP , -and the first byte of -.Li cmsg_data[] -will be the first byte of the socket address structure. -.Pp -If the socket address structure contains an IPv6 address (e.g., the -sin6_family member is -.Dv AF_INET6 ) , -then the node identified by that -address must be a neighbor of the sending host. -If that address -equals the destination IPv6 address of the datagram, then this is -equivalent to the existing -.Dv SO_DONTROUTE -socket option. -.Pp -For applications that do not, or unable to use -.Xr sendmsg 2 -or -.Xr recvmsg 2 , -.Dv IPV6_PKTOPTIONS -socket option is defined. -Setting the socket option specifies any of the optional output fields: -.Bd -literal -offset indent -setsockopt(fd, IPPROTO_IPV6, IPV6_PKTOPTIONS, &buf, len); -.Ed -.Pp -The fourth argument points to a buffer containing one or more -ancillary data objects, and the fifth argument is the total length of -all these objects. -The application fills in this buffer exactly as -if the buffer were being passed to +.Va cmsg_type +set to respective option name value (e.g., +.Dv IPV6_HOPTLIMIT ) . +These options may also be used directly as ancillary +.Va cmsg_type +values in .Xr sendmsg 2 -as control information. -.Pp -The options set by calling -.Xr setsockopt 2 -for -.Dv IPV6_PKTOPTIONS -are -called "sticky" options because once set they apply to all packets -sent on that socket. -The application can call -.Xr setsockopt 2 -again to -change all the sticky options, or it can call -.Xr setsockopt 2 -with a -length of 0 to remove all the sticky options for the socket. -.Pp -The corresponding receive option -.Bd -literal -offset indent -getsockopt(fd, IPPROTO_IPV6, IPV6_PKTOPTIONS, &buf, &len); -.Ed -.Pp -returns a buffer with one or more ancillary data objects for all the -optional receive information that the application has previously -specified that it wants to receive. -The fourth argument points to -the buffer that is filled in by the call. -The fifth argument is a -pointer to a value-result integer: when the function is called the -integer specifies the size of the buffer pointed to by the fourth -argument, and on return this integer contains the actual number of -bytes that were returned. -The application processes this buffer -exactly as if the buffer were returned by -.Xr recvmsg 2 -as control information. 
-.\" -.Ss Advanced API and TCP sockets -When using -.Xr getsockopt 2 -with the -.Dv IPV6_PKTOPTIONS -option and a -.Tn TCP -socket, only the options from the most recently received segment are -retained and returned to the caller, and only after the socket option -has been set. -.\" That is, -.\" .Tn TCP -.\" need not start saving a copy of the options until the application says -.\" to do so. -The application is not allowed to specify ancillary data in a call to +to set options on the packet being transmitted by the call. +The +.Va cmsg_level +value must be +.Dv IPPROTO_IPV6 . +For these options, the ancillary data object value format is the same +as the value returned as explained for each when received with +.Xr recvmsg 2 . +.Pp +Note that using .Xr sendmsg 2 -on a -.Tn TCP -socket, and none of the ancillary data that we -described above is ever returned as control information by -.Xr recvmsg 2 -on a -.Tn TCP -socket. -.\" -.Ss Conflict resolution -In some cases, there are multiple APIs defined for manipulating -a IPv6 header field. -A good example is the outgoing interface for multicast datagrams: -it can be manipulated by +to specify options on particular packets works only on UDP and raw sockets. +To manipulate header options for packets on TCP sockets, only the socket +options may be used. +.Pp +In some cases, there are multiple APIs defined for manipulating an IPv6 +header field. +A good example is the outgoing interface for multicast datagrams, which +can be set by the .Dv IPV6_MULTICAST_IF -in basic API, +socket option, through the .Dv IPV6_PKTINFO -in advanced API, and -.Li sin6_scope_id -field of the socket address passed to -.Xr sendto 2 . -.Pp -When conflicting options are given to the kernel, -the kernel will get the value in the following preference: -(1) options specified by using ancillary data, -(2) options specified by a sticky option of the advanced API, -(3) options specified by using the basic API, and lastly -(4) options specified by a socket address. -Note that the conflict resolution is undefined in the API specifcation -and implementation dependent. -.\" -.Ss "Raw IPv6 Sockets" -Raw -.Tn IPv6 -sockets are connectionless, and are normally used with the +option, and through the +.Va sin6_scope_id +field of the socket address passed to the +.Xr sendto 2 +system call. +.Pp +Resolving these conflicts is implementation dependent. +This implementation determines the value in the following way: +options specified by using ancillary data (i.e., +.Xr sendmsg 2 ) +are considered first, +options specified by using +.Dv IPV6_PKTOPTIONS +to set +.Dq sticky +options are considered second, +options specified by using the individual, basic, and direct socket +options (e.g., +.Dv IPV6_UNICAST_HOPS ) +are considered third, +and options specified in the socket address supplied to +.Xr sendto 2 +are the last choice. +.Ss Multicasting +IPv6 multicasting is supported only on +.Dv AF_INET6 +sockets of type +.Dv SOCK_DGRAM +and +.Dv SOCK_RAW , +and only on networks where the interface driver supports +multicasting. +Socket options (see above) that manipulate membership of +multicast groups and other multicast options include +.Dv IPV6_MULTICAST_IF , +.Dv IPV6_MULTICAST_HOPS , +.Dv IPV6_MULTICAST_LOOP , +.Dv IPV6_LEAVE_GROUP , +and +.Dv IPV6_JOIN_GROUP . 
+.Ss Raw Sockets +Raw IPv6 sockets are connectionless and are normally used with the .Xr sendto 2 and .Xr recvfrom 2 -calls, though the +calls, although the .Xr connect 2 -call may also be used to fix the destination for future -packets (in which case the -.Xr read 2 -or -.Xr recv 2 -and -.Xr write 2 -or +call may be used to fix the destination address for future outgoing +packets so that .Xr send 2 -system calls may be used). -.Pp -If -.Fa proto -is 0, the default protocol -.Dv IPPROTO_RAW -is used for outgoing packets, and only incoming packets destined -for that protocol are received. -If -.Fa proto -is non-zero, that protocol number will be used on outgoing packets -and to filter incoming packets. -.Pp -Outgoing packets automatically have an -.Tn IPv6 -header prepended to them (based on the destination address and the -protocol number the socket is created with). -Incoming packets are received without -.Tn IPv6 -header nor extension headers. -.Pp -All data sent via raw sockets MUST be in network byte order and all -data received via raw sockets will be in network byte order. -This differs from the IPv4 raw sockets, which did not specify a byte -ordering and typically used the host's byte order. -.Pp -Another difference from IPv4 raw sockets is that complete packets -(that is, IPv6 packets with extension headers) cannot be read or -written using the IPv6 raw sockets API. -Instead, ancillary data -objects are used to transfer the extension headers, as described above. -Should an application need access to the -complete IPv6 packet, some other technique, such as the datalink -interfaces, such as -.Xr bpf 4 , -must be used. -.Pp -All fields in the IPv6 header that an application might want to -change (i.e., everything other than the version number) can be -modified using ancillary data and/or socket options by the -application for output. -All fields in a received IPv6 header (other -than the version number and Next Header fields) and all extension -headers are also made available to the application as ancillary data -on input. -Hence there is no need for a socket option similar to the -IPv4 -.Dv IP_HDRINCL -socket option. -.Pp -When writing to a raw socket the kernel will automatically fragment -the packet if its size exceeds the path MTU, inserting the required -fragmentation headers. On input the kernel reassembles received -fragments, so the reader of a raw socket never sees any fragment -headers. +may instead be used and the +.Xr bind 2 +call may be used to fix the source address for future outgoing +packets instead of having the kernel choose a source address. .Pp -Most IPv4 implementations give special treatment to a raw socket -created with a third argument to +By using +.Xr connect 2 +or +.Xr bind 2 , +raw socket input is constrained to only packets with their +source address matching the socket destination address if +.Xr connect 2 +was used and to packets with their destination address +matching the socket source address if +.Xr bind 2 +was used. +.Pp +If the +.Ar proto +argument to .Xr socket 2 -of -.Dv IPPROTO_RAW , -whose value is normally 255. -We note that this value has no special meaning to -an IPv6 raw socket (and the IANA currently reserves the value of 255 -when used as a next-header field). -.\" Note: This feature was added to -.\" IPv4 in 1988 by Van Jacobson to support traceroute, allowing a -.\" complete IP header to be passed by the application, before the -.\" .Dv IP_HDRINCL -.\" socket option was added. 
-.Pp
-For ICMPv6 raw sockets,
-the kernel will calculate and insert the ICMPv6 checksum for
-since this checksum is mandatory.
+is zero, the default protocol
+.Pq Dv IPPROTO_RAW
+is used for outgoing packets.
+For incoming packets, protocols recognized by the kernel are
+.Sy not
+passed to the application socket (e.g.,
+.Xr tcp 4
+and
+.Xr udp 4 )
+except for some ICMPv6 messages.
+The ICMPv6 messages not passed to raw sockets include echo, timestamp,
+and address mask requests.
+If
+.Ar proto
+is non-zero, only packets with this protocol will be passed to the
+socket.
 .Pp
-For other raw IPv6 sockets (that is, for raw IPv6 sockets created
-with a third argument other than IPPROTO_ICMPV6), the application
-must set the new IPV6_CHECKSUM socket option to have the kernel (1)
-compute and store a psuedo header checksum for output,
-and (2) verify the received
-pseudo header checksum on input,
-discarding the packet if the checksum is in error.
-This option prevents applications from having to perform source
-address selection on the packets they send.
-The checksum will
-incorporate the IPv6 pseudo-header, defined in Section 8.1 of RFC2460.
-This new socket option also specifies an integer offset into
-the user data of where the checksum is located.
-.Bd -literal -offset indent
-int offset = 2;
-setsockopt(fd, IPPROTO_IPV6, IPV6_CHECKSUM, &offset, sizeof(offset));
+IPv6 fragments are also not passed to application sockets until
+they have been reassembled.
+If reception of all packets is desired, link-level access (such as
+.Xr bpf 4 )
+must be used instead.
+.Pp
+Outgoing packets automatically have an IPv6 header prepended to them
+(based on the destination address and the protocol number the socket
+was created with).
+Incoming packets are received by an application without the IPv6 header
+or any extension headers.
+.Pp
+Outgoing packets will be fragmented automatically by the kernel if they
+are too large.
+Incoming packets will be reassembled before being sent to the raw socket,
+so packet fragments or fragment headers will never be seen on a raw socket.
+.Sh EXAMPLES
+The following determines the hop limit on the next packet received:
+.Bd -literal
+struct iovec iov[2];
+u_char buf[BUFSIZ];
+struct cmsghdr *cm;
+struct msghdr m;
+int found, optval;
+u_char data[2048];
+struct sockaddr_in6 from;	/* holds the sender's address */
+
+/* Create socket. */
+
+(void)memset(&m, 0, sizeof(m));
+(void)memset(&iov, 0, sizeof(iov));
+
+iov[0].iov_base = data;		/* buffer for packet payload */
+iov[0].iov_len = sizeof(data);	/* expected packet length */
+
+m.msg_name = &from;		/* sockaddr_in6 of peer */
+m.msg_namelen = sizeof(from);
+m.msg_iov = iov;
+m.msg_iovlen = 1;
+m.msg_control = (caddr_t)buf;	/* buffer for control messages */
+m.msg_controllen = sizeof(buf);
+
+/*
+ * Enable the hop limit value from received packets to be
+ * returned along with the payload.
+ */
+optval = 1;
+if (setsockopt(s, IPPROTO_IPV6, IPV6_HOPLIMIT, &optval,
+    sizeof(optval)) == -1)
+	err(1, "setsockopt");
+
+found = 0;
+while (!found) {
+	if (recvmsg(s, &m, 0) == -1)
+		err(1, "recvmsg");
+	for (cm = CMSG_FIRSTHDR(&m); cm != NULL;
+	     cm = CMSG_NXTHDR(&m, cm)) {
+		if (cm->cmsg_level == IPPROTO_IPV6 &&
+		    cm->cmsg_type == IPV6_HOPLIMIT &&
+		    cm->cmsg_len == CMSG_LEN(sizeof(int))) {
+			found = 1;
+			(void)printf("hop limit: %d\en",
+			    *(int *)CMSG_DATA(cm));
+			break;
+		}
+	}
+}
 .Ed
-.Pp
-By default, this socket option is disabled. Setting the offset to -1
By disabled we mean (1) the kernel will -not calculate and store a checksum for outgoing packets, and (2) the -kernel will not verify a checksum for received packets. -.Pp -Note: Since the checksum is always calculated by the kernel for an -ICMPv6 socket, applications are not able to generate ICMPv6 packets -with incorrect checksums (presumably for testing purposes) using this -API. -.\" -.Sh ERRORS +.Sh DIAGNOSTICS A socket operation may fail with one of the following errors returned: -.Bl -tag -width Er +.Bl -tag -width EADDRNOTAVAILxx .It Bq Er EISCONN -when trying to establish a connection on a socket which already -has one, or when trying to send a datagram with the destination -address specified and the socket is already connected; +when trying to establish a connection on a socket which +already has one or when trying to send a datagram with the destination +address specified and the socket is already connected. .It Bq Er ENOTCONN -when trying to send a datagram, but no destination address is -specified, and the socket hasn't been connected; +when trying to send a datagram, but +no destination address is specified, and the socket hasn't been +connected. .It Bq Er ENOBUFS -when the system runs out of memory for an internal data structure; +when the system runs out of memory for +an internal data structure. .It Bq Er EADDRNOTAVAIL -when an attempt is made to create a socket with a network address -for which no network interface exists. +when an attempt is made to create a +socket with a network address for which no network interface +exists. .It Bq Er EACCES -when an attempt is made to create a raw IPv6 socket by a non-privileged process. +when an attempt is made to create +a raw IPv6 socket by a non-privileged process. .El .Pp -The following errors specific to -.Tn IPv6 -may occur: +The following errors specific to IPv6 may occur when setting or getting +header options: .Bl -tag -width EADDRNOTAVAILxx .It Bq Er EINVAL An unknown socket option name was given. .It Bq Er EINVAL -The ancillary data items were improperly formed, or option name was unknown. +An ancillary data object was improperly formed. .El -.\" .Sh SEE ALSO .Xr getsockopt 2 , .Xr recv 2 , .Xr send 2 , .Xr setsockopt 2 , -.Xr inet6_option_space 3 , -.Xr inet6_rthdr_space 3 , +.Xr socket 2 , +.\" .Xr inet6_option_space 3 , +.\" .Xr inet6_rthdr_space 3 , +.Xr if_nametoindex 3 , +.Xr bpf 4 , .Xr icmp6 4 , .Xr inet6 4 , -.Xr intro 4 +.Xr netintro 4 , +.Xr tcp 4 , +.Xr udp 4 .Rs .%A W. Stevens .%A M. Thomas -.%R RFC -.%N 2292 +.%T Advanced Sockets API for IPv6 +.%R RFC 2292 .%D February 1998 -.%T "Advanced Sockets API for IPv6" .Re .Rs .%A S. Deering .%A R. Hinden -.%R RFC -.%N 2460 +.%T Internet Protocol, Version 6 (IPv6) Specification +.%R RFC 2460 .%D December 1998 -.%T "Internet Protocol, Version 6 (IPv6) Specification" .Re .Rs .%A R. Gilligan .%A S. Thomson .%A J. Bound .%A W. Stevens -.%R RFC -.%N 2553 +.%T Basic Socket Interface Extensions for IPv6 +.%R RFC 2553 .%D March 1999 -.%T "Basic Socket Interface Extensions for IPv6" .Re -.\" +.Rs +.%A W. Stevens +.%A B. Fenner +.%A A. Rudoff +.%T UNIX Network Programming, third edition +.Re .Sh STANDARDS -Most of the socket options are defined in -RFC2292 and/or RFC2553. -.Pp +Most of the socket options are defined in RFC 2292 or RFC 2553. +The .Dv IPV6_V6ONLY -socket option is defined in draft-ietf-ipngwg-rfc2553bis-03. -.Dv IPV6_PORTRANGE -socket option -and -conflict resolution rule -are not defined in the RFCs and should be considered implementation dependent. 
-.\" -.Sh HISTORY -The implementation is based on KAME stack -(which is descendant of WIDE hydrangea IPv6 stack kit). -.Pp -Part of the document was shamelessly copied from RFC2553 and RFC2292. -.\" -.Sh BUGS +socket option is defined in RFC 3542. The -.Dv IPV6_NEXTHOP -object/option is not fully implemented as of writing this. +.Dv IPV6_PORTRANGE +socket option and the conflict resolution rule are not defined in the +RFCs and should be considered implementation dependent. diff --git a/bsd/man/man4/termios.4 b/bsd/man/man4/termios.4 index c3c5ee741..e7fcc5ba0 100644 --- a/bsd/man/man4/termios.4 +++ b/bsd/man/man4/termios.4 @@ -173,7 +173,7 @@ process is orphaned, the .Xr read 2 returns -1 with .Va errno set to -.Er Dv EIO +.Er EIO and no signal is sent. The default action of the .Dv SIGTTIN @@ -204,7 +204,7 @@ the .Xr write returns -1 with errno set to -.Er Dv EIO +.Er EIO and no signal is sent. .Pp Certain calls that set terminal parameters are treated in the same diff --git a/bsd/man/man4/unix.4 b/bsd/man/man4/unix.4 index db8816b6e..6597873ff 100644 --- a/bsd/man/man4/unix.4 +++ b/bsd/man/man4/unix.4 @@ -148,6 +148,35 @@ passed to a receiver. Descriptors that are awaiting delivery, or that are purposely not received, are automatically closed by the system when the destination socket is closed. +.Pp +The effective credentials (i.e., the user ID and group list) the of a +peer on a +.Dv SOCK_STREAM +socket may be obtained using the +.Dv LOCAL_PEERCRED +socket option. +This may be used by a server to obtain and verify the credentials of +its client, and vice versa by the client to verify the credentials +of the server. +These will arrive in the form of a filled in +.Ar struct xucred +(defined in +.Pa sys/ucred.h ) . +The credentials presented to the server (the +.Xr listen 2 +caller) are those of the client when it called +.Xr connect 2 ; +the credentials presented to the client (the +.Xr connect 2 +caller) are those of the server when it called +.Xr listen 2 . +This mechanism is reliable; there is no way for either party to influence +the credentials presented to its peer except by calling the appropriate +system call (e.g., +.Xr connect 2 +or +.Xr listen 2 ) +under different effective credentials. 
 .Sh SEE ALSO
 .Xr socket 2 ,
 .Xr intro 4
diff --git a/bsd/man/man5/types.5 b/bsd/man/man5/types.5
index 533fa7328..48b2372ef 100644
--- a/bsd/man/man5/types.5
+++ b/bsd/man/man5/types.5
@@ -66,8 +66,7 @@ typedef unsigned int u_int;
 typedef unsigned long u_long;
 typedef unsigned short ushort; /* Sys V compatibility */
 
-#include <machine/ansi.h>
-#if !defined(_ANSI_SOURCE) && !defined(_POSIX_SOURCE)
+#if !defined(_ANSI_SOURCE) && !defined(_POSIX_C_SOURCE)
 #include <machine/types.h>
 #endif
 
@@ -132,6 +131,7 @@ typedef struct fd_set {
 #define	FD_SET(n, p)	((p)->fds_bits[(n)/NFDBITS] |= (1 << ((n) % NFDBITS)))
 #define	FD_CLR(n, p)	((p)->fds_bits[(n)/NFDBITS] &= ~(1 << ((n) % NFDBITS)))
 #define	FD_ISSET(n, p)	((p)->fds_bits[(n)/NFDBITS] & (1 << ((n) % NFDBITS)))
+#define	FD_COPY(f, t)	bcopy(f, t, sizeof(*(f)))
 #define	FD_ZERO(p)	bzero((char *)(p), sizeof(*(p)))
 
 #endif /* !_POSIX_SOURCE */
diff --git a/bsd/man/man9/fetch.9 b/bsd/man/man9/fetch.9
index 5669d0078..24d3182e0 100644
--- a/bsd/man/man9/fetch.9
+++ b/bsd/man/man9/fetch.9
@@ -34,15 +34,17 @@
 .\"
 .\" $FreeBSD: src/share/man/man9/fetch.9,v 1.6.2.4 2001/12/17 11:30:18 ru Exp $
 .\"
-.Dd January 7, 1996
+.Dd December 16, 2004
 .Dt FETCH 9
 .Os
 .Sh NAME
 .Nm fetch ,
 .Nm fubyte ,
-.Nm fusword ,
-.Nm fuswintr ,
-.Nm fuword
+.Nm fuibyte ,
+.Nm fuword ,
+.Nm fuiword ,
+.Nm fulong ,
+.Nm fuulong
 .Nd fetch data from user-space
 .Sh SYNOPSIS
 .In sys/types.h
@@ -50,13 +52,17 @@
 .In sys/systm.h
 .In sys/resourcevar.h
 .Ft int
-.Fn fubyte "const void *base"
+.Fn fubyte "const user_addr_t addr"
 .Ft int
-.Fn fusword "void *base"
+.Fn fuibyte "const user_addr_t addr"
 .Ft int
-.Fn fuswintr "void *base"
-.Ft long
-.Fn fuword "const void *base"
+.Fn fuword "user_addr_t addr"
+.Ft int
+.Fn fuiword "user_addr_t addr"
+.Ft int64_t
+.Fn fulong "user_addr_t addr"
+.Ft uint64_t
+.Fn fuulong "user_addr_t addr"
 .Sh DESCRIPTION
 The
 .Nm
@@ -65,20 +71,27 @@ functions are designed to copy small amounts of data from user-space.
 The
 .Nm
 routines provide the following functionality:
-.Bl -tag -width "fuswintr()"
+.Bl -tag -width "fuiword()"
 .It Fn fubyte
 Fetches a byte of data from the user-space address
-.Pa base .
-.It Fn fusword
-Fetches a short word of data from the user-space address
-.Pa base .
-.It Fn fuswintr
-Fetches a short word of data from the user-space address
-.Pa base .
+.Pa addr .
+.It Fn fuibyte
+Fetches a byte of data from the user-space address
+.Pa addr .
 This function is safe to call during an interrupt context.
 .It Fn fuword
 Fetches a word of data from the user-space address
-.Pa base .
+.Pa addr .
+.It Fn fuiword
+Fetches a word of data from the user-space address
+.Pa addr .
+This function is safe to call during an interrupt context.
+.It Fn fulong
+Fetches a long word of data from the user-space address
+.Pa addr .
+.It Fn fuulong
+Fetches an unsigned long word of data from the user-space address
+.Pa addr .
 .El
 .Sh RETURN VALUES
 The
diff --git a/bsd/man/man9/store.9 b/bsd/man/man9/store.9
index 5ef2d664e..1092c2fce 100644
--- a/bsd/man/man9/store.9
+++ b/bsd/man/man9/store.9
@@ -34,15 +34,17 @@
 .\"
 .\" $FreeBSD: src/share/man/man9/store.9,v 1.7.2.4 2001/12/17 11:30:19 ru Exp $
 .\"
-.Dd January 7, 1996
+.Dd December 16, 2004
 .Dt STORE 9
 .Os
 .Sh NAME
 .Nm store ,
 .Nm subyte ,
-.Nm susword ,
-.Nm suswintr ,
-.Nm suword
+.Nm suibyte ,
+.Nm suword ,
+.Nm suiword ,
+.Nm sulong ,
+.Nm suulong
 .Nd store data to user-space
 .Sh SYNOPSIS
 .In sys/types.h
@@ -50,13 +52,17 @@
 .In sys/systm.h
 .In sys/resourcevar.h
 .Ft int
-.Fn subyte "void *base" "int byte"
+.Fn subyte "user_addr_t addr" "int byte"
 .Ft int
-.Fn susword "void *base" "int word"
+.Fn suibyte "user_addr_t addr" "int byte"
 .Ft int
-.Fn suswintr "void *base" "int word"
+.Fn suword "user_addr_t addr" "int word"
 .Ft int
-.Fn suword "void *base" "long word"
+.Fn suiword "user_addr_t addr" "int word"
+.Ft int
+.Fn sulong "user_addr_t addr" "int64_t longword"
+.Ft int
+.Fn suulong "user_addr_t addr" "uint64_t longword"
 .Sh DESCRIPTION
 The
 .Nm
@@ -65,20 +71,27 @@ functions are designed to copy small amounts of data to user-space.
 The
 .Nm
 routines provide the following functionality:
-.Bl -tag -width "suswintr()"
+.Bl -tag -width "suibyte()"
 .It Fn subyte
 Stores a byte of data to the user-space address
-.Pa base .
-.It Fn susword
-Stores a short word of data to the user-space address
-.Pa base .
-.It Fn suswintr
-Stores a short word of data to the user-space address
-.Pa base .
+.Pa addr .
+.It Fn suibyte
+Stores a byte of data to the user-space address
+.Pa addr .
 This function is safe to call during an interrupt context.
 .It Fn suword
 Stores a word of data to the user-space address
-.Pa base .
+.Pa addr .
+.It Fn suiword
+Stores a word of data to the user-space address
+.Pa addr .
+This function is safe to call during an interrupt context.
+.It Fn sulong
+Stores a long word of data to the user-space address
+.Pa addr .
+.It Fn suulong
+Stores an unsigned long word of data to the user-space address
+.Pa addr .
 .El
 .Sh RETURN VALUES
 The
diff --git a/bsd/miscfs/deadfs/dead_vnops.c b/bsd/miscfs/deadfs/dead_vnops.c
index aaca316c9..2e58fddac 100644
--- a/bsd/miscfs/deadfs/dead_vnops.c
+++ b/bsd/miscfs/deadfs/dead_vnops.c
@@ -58,7 +58,7 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/time.h>
-#include <sys/vnode.h>
+#include <sys/vnode_internal.h>
 #include <sys/errno.h>
 #include <sys/namei.h>
 #include <sys/buf.h>
@@ -67,104 +67,80 @@
 
 /*
  * Prototypes for dead operations on vnodes.
*/ -int dead_badop(), - dead_ebadf(); -int dead_lookup __P((struct vop_lookup_args *)); -#define dead_create ((int (*) __P((struct vop_create_args *)))dead_badop) -#define dead_mknod ((int (*) __P((struct vop_mknod_args *)))dead_badop) -int dead_open __P((struct vop_open_args *)); -#define dead_close ((int (*) __P((struct vop_close_args *)))nullop) -#define dead_access ((int (*) __P((struct vop_access_args *)))dead_ebadf) -#define dead_getattr ((int (*) __P((struct vop_getattr_args *)))dead_ebadf) -#define dead_setattr ((int (*) __P((struct vop_setattr_args *)))dead_ebadf) -int dead_read __P((struct vop_read_args *)); -int dead_write __P((struct vop_write_args *)); -int dead_ioctl __P((struct vop_ioctl_args *)); -int dead_select __P((struct vop_select_args *)); -#define dead_mmap ((int (*) __P((struct vop_mmap_args *)))dead_badop) -#define dead_fsync ((int (*) __P((struct vop_fsync_args *)))nullop) -#define dead_seek ((int (*) __P((struct vop_seek_args *)))nullop) -#define dead_remove ((int (*) __P((struct vop_remove_args *)))dead_badop) -#define dead_link ((int (*) __P((struct vop_link_args *)))dead_badop) -#define dead_rename ((int (*) __P((struct vop_rename_args *)))dead_badop) -#define dead_mkdir ((int (*) __P((struct vop_mkdir_args *)))dead_badop) -#define dead_rmdir ((int (*) __P((struct vop_rmdir_args *)))dead_badop) -#define dead_symlink ((int (*) __P((struct vop_symlink_args *)))dead_badop) -#define dead_readdir ((int (*) __P((struct vop_readdir_args *)))dead_ebadf) -#define dead_readlink ((int (*) __P((struct vop_readlink_args *)))dead_ebadf) -#define dead_abortop ((int (*) __P((struct vop_abortop_args *)))dead_badop) -#define dead_inactive ((int (*) __P((struct vop_inactive_args *)))nullop) -#define dead_reclaim ((int (*) __P((struct vop_reclaim_args *)))nullop) -int dead_lock __P((struct vop_lock_args *)); -#define dead_unlock ((int (*) __P((struct vop_unlock_args *)))nullop) -int dead_bmap __P((struct vop_bmap_args *)); -int dead_strategy __P((struct vop_strategy_args *)); -int dead_print __P((struct vop_print_args *)); -#define dead_islocked ((int (*) __P((struct vop_islocked_args *)))nullop) -#define dead_pathconf ((int (*) __P((struct vop_pathconf_args *)))dead_ebadf) -#define dead_advlock ((int (*) __P((struct vop_advlock_args *)))dead_ebadf) -#define dead_blkatoff ((int (*) __P((struct vop_blkatoff_args *)))dead_badop) -#define dead_valloc ((int (*) __P((struct vop_valloc_args *)))dead_badop) -#define dead_vfree ((int (*) __P((struct vop_vfree_args *)))dead_badop) -#define dead_truncate ((int (*) __P((struct vop_truncate_args *)))nullop) -#define dead_update ((int (*) __P((struct vop_update_args *)))nullop) -#define dead_bwrite ((int (*) __P((struct vop_bwrite_args *)))nullop) -int dead_pagein __P((struct vop_pagein_args *)); -int dead_pageout __P((struct vop_pageout_args *)); -int dead_blktooff __P((struct vop_blktooff_args *)); -int dead_offtoblk __P((struct vop_offtoblk_args *)); -int dead_cmap __P((struct vop_cmap_args *)); +int dead_badop(void *); +int dead_ebadf(void *); +int dead_lookup(struct vnop_lookup_args *); +#define dead_create (int (*)(struct vnop_create_args *))dead_badop +#define dead_mknod (int (*)(struct vnop_mknod_args *))dead_badop +int dead_open(struct vnop_open_args *); +#define dead_close (int (*)(struct vnop_close_args *))nullop +#define dead_access (int (*)(struct vnop_access_args *))dead_ebadf +#define dead_getattr (int (*)(struct vnop_getattr_args *))dead_ebadf +#define dead_setattr (int (*)(struct vnop_setattr_args *))dead_ebadf +int 
dead_read(struct vnop_read_args *);
+int	dead_write(struct vnop_write_args *);
+int	dead_ioctl(struct vnop_ioctl_args *);
+int	dead_select(struct vnop_select_args *);
+#define dead_mmap	(int (*)(struct vnop_mmap_args *))dead_badop
+#define dead_fsync	(int (*)(struct vnop_fsync_args *))nullop
+#define dead_remove	(int (*)(struct vnop_remove_args *))dead_badop
+#define dead_link	(int (*)(struct vnop_link_args *))dead_badop
+#define dead_rename	(int (*)(struct vnop_rename_args *))dead_badop
+#define dead_mkdir	(int (*)(struct vnop_mkdir_args *))dead_badop
+#define dead_rmdir	(int (*)(struct vnop_rmdir_args *))dead_badop
+#define dead_symlink	(int (*)(struct vnop_symlink_args *))dead_badop
+#define dead_readdir	(int (*)(struct vnop_readdir_args *))dead_ebadf
+#define dead_readlink	(int (*)(struct vnop_readlink_args *))dead_ebadf
+#define dead_inactive	(int (*)(struct vnop_inactive_args *))nullop
+#define dead_reclaim	(int (*)(struct vnop_reclaim_args *))nullop
+int	dead_strategy(struct vnop_strategy_args *);
+#define dead_pathconf	(int (*)(struct vnop_pathconf_args *))dead_ebadf
+#define dead_advlock	(int (*)(struct vnop_advlock_args *))dead_ebadf
+#define dead_bwrite	(int (*)(struct vnop_bwrite_args *))nullop
+int	dead_pagein(struct vnop_pagein_args *);
+int	dead_pageout(struct vnop_pageout_args *);
+int	dead_blktooff(struct vnop_blktooff_args *);
+int	dead_offtoblk(struct vnop_offtoblk_args *);
+int	dead_blockmap(struct vnop_blockmap_args *);
 
 #define VOPFUNC int (*)(void *)
 
 int (**dead_vnodeop_p)(void *);
 struct vnodeopv_entry_desc dead_vnodeop_entries[] = {
-	{ &vop_default_desc, (VOPFUNC)vn_default_error },
-	{ &vop_lookup_desc, (VOPFUNC)dead_lookup },	/* lookup */
-	{ &vop_create_desc, (VOPFUNC)dead_create },	/* create */
-	{ &vop_mknod_desc, (VOPFUNC)dead_mknod },	/* mknod */
-	{ &vop_open_desc, (VOPFUNC)dead_open },	/* open */
-	{ &vop_close_desc, (VOPFUNC)dead_close },	/* close */
-	{ &vop_access_desc, (VOPFUNC)dead_access },	/* access */
-	{ &vop_getattr_desc, (VOPFUNC)dead_getattr },	/* getattr */
-	{ &vop_setattr_desc, (VOPFUNC)dead_setattr },	/* setattr */
-	{ &vop_read_desc, (VOPFUNC)dead_read },	/* read */
-	{ &vop_write_desc, (VOPFUNC)dead_write },	/* write */
-	{ &vop_ioctl_desc, (VOPFUNC)dead_ioctl },	/* ioctl */
-	{ &vop_select_desc, (VOPFUNC)dead_select },	/* select */
-	{ &vop_mmap_desc, (VOPFUNC)dead_mmap },	/* mmap */
-	{ &vop_fsync_desc, (VOPFUNC)dead_fsync },	/* fsync */
-	{ &vop_seek_desc, (VOPFUNC)dead_seek },	/* seek */
-	{ &vop_remove_desc, (VOPFUNC)dead_remove },	/* remove */
-	{ &vop_link_desc, (VOPFUNC)dead_link },	/* link */
-	{ &vop_rename_desc, (VOPFUNC)dead_rename },	/* rename */
-	{ &vop_mkdir_desc, (VOPFUNC)dead_mkdir },	/* mkdir */
-	{ &vop_rmdir_desc, (VOPFUNC)dead_rmdir },	/* rmdir */
-	{ &vop_symlink_desc, (VOPFUNC)dead_symlink },	/* symlink */
-	{ &vop_readdir_desc, (VOPFUNC)dead_readdir },	/* readdir */
-	{ &vop_readlink_desc, (VOPFUNC)dead_readlink },	/* readlink */
-	{ &vop_abortop_desc, (VOPFUNC)dead_abortop },	/* abortop */
-	{ &vop_inactive_desc, (VOPFUNC)dead_inactive },	/* inactive */
-	{ &vop_reclaim_desc, (VOPFUNC)dead_reclaim },	/* reclaim */
-	{ &vop_lock_desc, (VOPFUNC)dead_lock },	/* lock */
-	{ &vop_unlock_desc, (VOPFUNC)dead_unlock },	/* unlock */
-	{ &vop_bmap_desc, (VOPFUNC)dead_bmap },	/* bmap */
-	{ &vop_strategy_desc, (VOPFUNC)dead_strategy },	/* strategy */
-	{ &vop_print_desc, (VOPFUNC)dead_print },	/* print */
-	{ &vop_islocked_desc, (VOPFUNC)dead_islocked },	/* islocked */
-	{ &vop_pathconf_desc, (VOPFUNC)dead_pathconf },	/*
pathconf */ - { &vop_advlock_desc, (VOPFUNC)dead_advlock }, /* advlock */ - { &vop_blkatoff_desc, (VOPFUNC)dead_blkatoff }, /* blkatoff */ - { &vop_valloc_desc, (VOPFUNC)dead_valloc }, /* valloc */ - { &vop_vfree_desc, (VOPFUNC)dead_vfree }, /* vfree */ - { &vop_truncate_desc, (VOPFUNC)dead_truncate }, /* truncate */ - { &vop_update_desc, (VOPFUNC)dead_update }, /* update */ - { &vop_bwrite_desc, (VOPFUNC)dead_bwrite }, /* bwrite */ - { &vop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ - { &vop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ - { &vop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ - { &vop_blktooff_desc, (VOPFUNC)dead_blktooff }, /* blktooff */ - { &vop_offtoblk_desc, (VOPFUNC)dead_offtoblk }, /* offtoblk */ - { &vop_cmap_desc, (VOPFUNC)dead_cmap }, /* cmap */ + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)dead_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)dead_create }, /* create */ + { &vnop_open_desc, (VOPFUNC)dead_open }, /* open */ + { &vnop_mknod_desc, (VOPFUNC)dead_mknod }, /* mknod */ + { &vnop_close_desc, (VOPFUNC)dead_close }, /* close */ + { &vnop_access_desc, (VOPFUNC)dead_access }, /* access */ + { &vnop_getattr_desc, (VOPFUNC)dead_getattr }, /* getattr */ + { &vnop_setattr_desc, (VOPFUNC)dead_setattr }, /* setattr */ + { &vnop_read_desc, (VOPFUNC)dead_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)dead_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)dead_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)dead_select }, /* select */ + { &vnop_mmap_desc, (VOPFUNC)dead_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)dead_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)dead_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)dead_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)dead_rename }, /* rename */ + { &vnop_mkdir_desc, (VOPFUNC)dead_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)dead_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)dead_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)dead_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)dead_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)dead_inactive }, /* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)dead_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)dead_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)dead_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)dead_advlock }, /* advlock */ + { &vnop_bwrite_desc, (VOPFUNC)dead_bwrite }, /* bwrite */ + { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ + { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ + { &vnop_blktooff_desc, (VOPFUNC)dead_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)dead_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (VOPFUNC)dead_blockmap }, /* blockmap */ { (struct vnodeop_desc*)NULL, (VOPFUNC)NULL } }; struct vnodeopv_desc dead_vnodeop_opv_desc = @@ -176,10 +152,11 @@ struct vnodeopv_desc dead_vnodeop_opv_desc = /* ARGSUSED */ int dead_lookup(ap) - struct vop_lookup_args /* { + struct vnop_lookup_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; + vfs_context_t a_context; } */ *ap; { @@ -193,11 +170,10 @@ dead_lookup(ap) /* ARGSUSED */ int dead_open(ap) - struct vop_open_args /* { + struct vnop_open_args /* { struct vnode *a_vp; int a_mode; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { @@ -210,11 +186,11 @@ 
dead_open(ap) /* ARGSUSED */ int dead_read(ap) - struct vop_read_args /* { + struct vnop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { @@ -234,11 +210,11 @@ dead_read(ap) /* ARGSUSED */ int dead_write(ap) - struct vop_write_args /* { + struct vnop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { @@ -253,29 +229,28 @@ dead_write(ap) /* ARGSUSED */ int dead_ioctl(ap) - struct vop_ioctl_args /* { + struct vnop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { if (!chkvnlock(ap->a_vp)) return (EBADF); - return (VCALL(ap->a_vp, VOFFSET(vop_ioctl), ap)); + return (VCALL(ap->a_vp, VOFFSET(vnop_ioctl), ap)); } /* ARGSUSED */ int dead_select(ap) - struct vop_select_args /* { + struct vnop_select_args /* { struct vnode *a_vp; int a_which; int a_fflags; - struct ucred *a_cred; + kauth_cred_t a_cred; void *a_wql; struct proc *a_p; } */ *ap; @@ -292,102 +267,48 @@ dead_select(ap) */ int dead_strategy(ap) - struct vop_strategy_args /* { + struct vnop_strategy_args /* { struct buf *a_bp; } */ *ap; { - if (ap->a_bp->b_vp == NULL || !chkvnlock(ap->a_bp->b_vp)) { - ap->a_bp->b_flags |= B_ERROR; - biodone(ap->a_bp); + if (buf_vnode(ap->a_bp) == NULL || !chkvnlock(buf_vnode(ap->a_bp))) { + buf_seterror(ap->a_bp, EIO); + buf_biodone(ap->a_bp); return (EIO); } - return (VOP_STRATEGY(ap->a_bp)); + return (VNOP_STRATEGY(ap->a_bp)); } /* * Wait until the vnode has finished changing state. */ int -dead_lock(ap) - struct vop_lock_args /* { - struct vnode *a_vp; - } */ *ap; -{ - - struct vnode *vp = ap->a_vp; - - /* - * Since we are not using the lock manager, we must clear - * the interlock here. - */ - if (ap->a_flags & LK_INTERLOCK) { - simple_unlock(&vp->v_interlock); - ap->a_flags &= ~LK_INTERLOCK; - } - if (!chkvnlock(ap->a_vp)) - return (0); - return (VCALL(ap->a_vp, VOFFSET(vop_lock), ap)); -} - -/* - * Wait until the vnode has finished changing state. - */ -int -dead_bmap(ap) - struct vop_bmap_args /* { - struct vnode *a_vp; - daddr_t a_bn; - struct vnode **a_vpp; - daddr_t *a_bnp; - int *a_runp; - } */ *ap; -{ - - if (!chkvnlock(ap->a_vp)) - return (EIO); - return (VOP_BMAP(ap->a_vp, ap->a_bn, ap->a_vpp, ap->a_bnp, ap->a_runp)); -} - -/* - * Wait until the vnode has finished changing state. - */ -int -dead_cmap(ap) - struct vop_cmap_args /* { +dead_blockmap(ap) + struct vnop_blockmap_args /* { struct vnode *a_vp; off_t a_foffset; size_t a_size; - daddr_t *a_bpn; + daddr64_t *a_bpn; size_t *a_run; void *a_poff; + int flags; + vfs_context_t a_context; } */ *ap; { if (!chkvnlock(ap->a_vp)) return (EIO); - return (VOP_CMAP(ap->a_vp, ap->a_foffset, ap->a_size, ap->a_bpn, ap->a_run, ap->a_poff)); -} - -/* - * Print out the contents of a dead vnode. 
- */ -/* ARGSUSED */ -int -dead_print(ap) - struct vop_print_args /* { - struct vnode *a_vp; - } */ *ap; -{ - - printf("tag VT_NON, dead vnode\n"); + return (VNOP_BLOCKMAP(ap->a_vp, ap->a_foffset, ap->a_size, ap->a_bpn, + ap->a_run, ap->a_poff, ap->a_flags, ap->a_context)); } /* * Empty vnode failed operation */ +/* ARGSUSED */ int -dead_ebadf() +dead_ebadf(void *dummy) { return (EBADF); @@ -396,8 +317,9 @@ dead_ebadf() /* * Empty vnode bad operation */ +/* ARGSUSED */ int -dead_badop() +dead_badop(void *dummy) { panic("dead_badop called"); @@ -407,8 +329,9 @@ dead_badop() /* * Empty vnode null operation */ +/* ARGSUSED */ int -dead_nullop() +dead_nullop(void *dummy) { return (0); @@ -419,26 +342,18 @@ dead_nullop() * in a state of change. */ int -chkvnlock(vp) - register struct vnode *vp; +chkvnlock(__unused vnode_t vp) { - int locked = 0; - - while (vp->v_flag & VXLOCK) { - vp->v_flag |= VXWANT; - sleep((caddr_t)vp, PINOD); - locked = 1; - } - return (locked); + return (0); } /* Blktooff */ int dead_blktooff(ap) - struct vop_blktooff_args /* { + struct vnop_blktooff_args /* { struct vnode *a_vp; - daddr_t a_lblkno; + daddr64_t a_lblkno; off_t *a_offset; } */ *ap; { @@ -451,15 +366,15 @@ dead_blktooff(ap) /* Blktooff */ int dead_offtoblk(ap) -struct vop_offtoblk_args /* { +struct vnop_offtoblk_args /* { struct vnode *a_vp; off_t a_offset; - daddr_t *a_lblkno; + daddr64_t *a_lblkno; } */ *ap; { if (!chkvnlock(ap->a_vp)) return (EIO); - *ap->a_lblkno = (daddr_t)-1; /* failure */ + *ap->a_lblkno = (daddr64_t)-1; /* failure */ return (0); } diff --git a/bsd/miscfs/devfs/devfs.h b/bsd/miscfs/devfs/devfs.h index c76d8544a..647e94785 100644 --- a/bsd/miscfs/devfs/devfs.h +++ b/bsd/miscfs/devfs/devfs.h @@ -52,7 +52,6 @@ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_UNSTABLE #define DEVFS_CHAR 0 #define DEVFS_BLOCK 1 @@ -73,9 +72,10 @@ __BEGIN_DECLS * Returns: * A handle to a device node if successful, NULL otherwise. */ -void * devfs_make_node __P((dev_t dev, int chrblk, uid_t uid, gid_t gid, - int perms, char *fmt, ...)); +void * devfs_make_node(dev_t dev, int chrblk, uid_t uid, gid_t gid, + int perms, const char *fmt, ...); +#ifdef BSD_KERNEL_PRIVATE /* * Function: devfs_make_link * @@ -85,7 +85,8 @@ void * devfs_make_node __P((dev_t dev, int chrblk, uid_t uid, gid_t gid, * Returns: * 0 if successful, -1 if failed */ -int devfs_link __P((void * handle, char *fmt, ...)); +int devfs_link(void * handle, char *fmt, ...); +#endif /* BSD_KERNEL_PRIVATE */ /* * Function: devfs_remove @@ -94,10 +95,9 @@ int devfs_link __P((void * handle, char *fmt, ...)); * Remove the device node returned by devfs_make_node() along with * any links created with devfs_make_link(). 
*/ -void devfs_remove __P((void * handle)); +void devfs_remove(void * handle); __END_DECLS -#endif /* __APPLE_API_UNSTABLE */ #ifdef __APPLE_API_PRIVATE /* XXX */ diff --git a/bsd/miscfs/devfs/devfs_proto.h b/bsd/miscfs/devfs/devfs_proto.h index 3683d6b57..77e9b1c2f 100644 --- a/bsd/miscfs/devfs/devfs_proto.h +++ b/bsd/miscfs/devfs/devfs_proto.h @@ -33,19 +33,14 @@ int dev_add_name(char * name, devnode_t * dirnode, devdirent_t * back, int dev_add_node(int entrytype, devnode_type_t * typeinfo, devnode_t * proto, devnode_t * *dn_pp, struct devfsmount *dvm); void devnode_free(devnode_t * dnp); -void devfs_dn_free(devnode_t * dnp); -int devfs_propogate(devdirent_t * parent,devdirent_t * child); int dev_dup_plane(struct devfsmount *devfs_mp_p); void devfs_free_plane(struct devfsmount *devfs_mp_p); -int dev_dup_entry(devnode_t * parent, devdirent_t * back, devdirent_t * *dnm_pp, - struct devfsmount *dvm); int dev_free_name(devdirent_t * dirent_p); -void dev_free_hier(devdirent_t * dirent_p); int devfs_dntovn(devnode_t * dnp, struct vnode **vn_pp, struct proc * p); int dev_add_entry(char *name, devnode_t * parent, int type, devnode_type_t * typeinfo, devnode_t * proto, struct devfsmount *dvm, devdirent_t * *nm_pp); -int devfs_mount(struct mount *mp, char *path, caddr_t data, - struct nameidata *ndp, struct proc *p); +int devfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, + vfs_context_t context); #endif /* __APPLE_API_PRIVATE */ #endif /* __DEVFS_DEVFS_PROTO_H__ */ diff --git a/bsd/miscfs/devfs/devfs_tree.c b/bsd/miscfs/devfs/devfs_tree.c index 11464c40a..ff61c3d5a 100644 --- a/bsd/miscfs/devfs/devfs_tree.c +++ b/bsd/miscfs/devfs/devfs_tree.c @@ -82,7 +82,7 @@ #include <sys/kernel.h> #include <sys/conf.h> #include <sys/malloc.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> #include <sys/proc.h> #include <sys/vnode.h> #include <stdarg.h> @@ -90,7 +90,18 @@ #include "devfs.h" #include "devfsdefs.h" -struct lock__bsd__ devfs_lock; /* the "big switch" */ +static void devfs_release_busy(devnode_t *); +static void dev_free_hier(devdirent_t *); +static int devfs_propogate(devdirent_t *, devdirent_t *); +static int dev_finddir(char *, devnode_t *, int, devnode_t **); +static int dev_dup_entry(devnode_t *, devdirent_t *, devdirent_t **, struct devfsmount *); + + +lck_grp_t * devfs_lck_grp; +lck_grp_attr_t * devfs_lck_grp_attr; +lck_attr_t * devfs_lck_attr; +lck_mtx_t devfs_mutex; + devdirent_t * dev_root = NULL; /* root of backing tree */ struct devfs_stats devfs_stats; /* hold stats */ @@ -116,20 +127,37 @@ static int devfs_ready = 0; int devfs_sinit(void) { - lockinit(&devfs_lock, PINOD, "devfs", 0, 0); - if (dev_add_entry("root", NULL, DEV_DIR, NULL, NULL, NULL, - &dev_root)) { + int error; + + devfs_lck_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(devfs_lck_grp_attr); + devfs_lck_grp = lck_grp_alloc_init("devfs_lock", devfs_lck_grp_attr); + + devfs_lck_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(devfs_lck_attr); + + lck_mtx_init(&devfs_mutex, devfs_lck_grp, devfs_lck_attr); + + DEVFS_LOCK(); + error = dev_add_entry("root", NULL, DEV_DIR, NULL, NULL, NULL, &dev_root); + DEVFS_UNLOCK(); + + if (error) { printf("devfs_sinit: dev_add_entry failed "); - return (EOPNOTSUPP); + return (ENOTSUP); } #ifdef HIDDEN_MOUNTPOINT MALLOC(devfs_hidden_mount, struct mount *, sizeof(struct mount), M_MOUNT, M_WAITOK); bzero(devfs_hidden_mount,sizeof(struct mount)); + mount_lock_init(devfs_hidden_mount); + TAILQ_INIT(&devfs_hidden_mount->mnt_vnodelist); + 
TAILQ_INIT(&devfs_hidden_mount->mnt_workerqueue); + TAILQ_INIT(&devfs_hidden_mount->mnt_newvnodes); - /* Initialize the default IO constraints */ - mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS; - mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32; + /* Initialize the default IO constraints */ + mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS; + mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32; devfs_mount(devfs_hidden_mount,"dummy",NULL,NULL,NULL); dev_root->de_dnp->dn_dvm @@ -146,13 +174,15 @@ devfs_sinit(void) \***********************************************************************/ -/***************************************************************\ -* Search down the linked list off a dir to find "name" * -* return the devnode_t * for that node. -\***************************************************************/ -/*proto*/ + +/*************************************************************** + * Search down the linked list off a dir to find "name" + * return the devnode_t * for that node. + * + * called with DEVFS_LOCK held + ***************************************************************/ devdirent_t * -dev_findname(devnode_t * dir,char *name) +dev_findname(devnode_t * dir, char *name) { devdirent_t * newfp; if (dir->dn_type != DEV_DIR) return 0;/*XXX*/ /* printf?*/ @@ -170,6 +200,7 @@ dev_findname(devnode_t * dir,char *name) } } newfp = dir->dn_typeinfo.Dir.dirlist; + while(newfp) { if(!(strcmp(name,newfp->de_name))) @@ -179,121 +210,16 @@ dev_findname(devnode_t * dir,char *name) return NULL; } -#if 0 -/***********************************************************************\ -* Given a starting node (0 for root) and a pathname, return the node * -* for the end item on the path. It MUST BE A DIRECTORY. If the 'CREATE' * -* option is true, then create any missing nodes in the path and create * -* and return the final node as well. * -* This is used to set up a directory, before making nodes in it.. * -* * -* Warning: This function is RECURSIVE. * -\***********************************************************************/ -int -dev_finddir(char * orig_path, /* find this dir (err if not dir) */ - devnode_t * dirnode, /* starting point */ - int create, /* create path? */ - devnode_t * * dn_pp) /* returned */ -{ - devdirent_t * dirent_p; - devnode_t * dnp = NULL; - char pathbuf[DEVMAXPATHSIZE]; - char *path; - char *name; - register char *cp; - int retval; - - - /***************************************\ - * If no parent directory is given * - * then start at the root of the tree * - \***************************************/ - if(!dirnode) dirnode = dev_root->de_dnp; - - /***************************************\ - * Sanity Checks * - \***************************************/ - if (dirnode->dn_type != DEV_DIR) return ENOTDIR; - if(strlen(orig_path) > (DEVMAXPATHSIZE - 1)) return ENAMETOOLONG; - - - path = pathbuf; - strcpy(path,orig_path); - - /***************************************\ - * always absolute, skip leading / * - * get rid of / or // or /// etc. * - \***************************************/ - while(*path == '/') path++; - - /***************************************\ - * If nothing left, then parent was it.. 
* - \***************************************/ - if ( *path == '\0' ) { - *dn_pp = dirnode; - return 0; - } - - /***************************************\ - * find the next segment of the name * - \***************************************/ - cp = name = path; - while((*cp != '/') && (*cp != 0)) { - cp++; - } - - /***********************************************\ - * Check to see if it's the last component * - \***********************************************/ - if(*cp) { - path = cp + 1; /* path refers to the rest */ - *cp = 0; /* name is now a separate string */ - if(!(*path)) { - path = (char *)0; /* was trailing slash */ - } - } else { - path = NULL; /* no more to do */ - } - - /***************************************\ - * Start scanning along the linked list * - \***************************************/ - dirent_p = dev_findname(dirnode,name); - if(dirent_p) { /* check it's a directory */ - dnp = dirent_p->de_dnp; - if(dnp->dn_type != DEV_DIR) return ENOTDIR; - } else { - /***************************************\ - * The required element does not exist * - * So we will add it if asked to. * - \***************************************/ - if(!create) return ENOENT; - - if((retval = dev_add_entry(name, dirnode, - DEV_DIR, NULL, NULL, NULL, - &dirent_p)) != 0) { - return retval; - } - dnp = dirent_p->de_dnp; - devfs_propogate(dirnode->dn_typeinfo.Dir.myname,dirent_p); - } - if(path != NULL) { /* decide whether to recurse more or return */ - return (dev_finddir(path,dnp,create,dn_pp)); - } else { - *dn_pp = dnp; - return 0; - } -} -#endif -/***********************************************************************\ -* Given a starting node (0 for root) and a pathname, return the node * -* for the end item on the path. It MUST BE A DIRECTORY. If the 'CREATE' * -* option is true, then create any missing nodes in the path and create * -* and return the final node as well. * -* This is used to set up a directory, before making nodes in it.. * -\***********************************************************************/ -/* proto */ -int +/*********************************************************************** + * Given a starting node (0 for root) and a pathname, return the node + * for the end item on the path. It MUST BE A DIRECTORY. If the 'CREATE' + * option is true, then create any missing nodes in the path and create + * and return the final node as well. + * This is used to set up a directory, before making nodes in it.. 
+ *
+ * called with DEVFS_LOCK held
+ ***********************************************************************/
+static int
 dev_finddir(char * path,
 	    devnode_t * dirnode,
 	    int create,
@@ -365,16 +291,17 @@ dev_finddir(char * path,
 }
 
-/***********************************************************************\
-* Add a new NAME element to the devfs *
-* If we're creating a root node, then dirname is NULL *
-* Basically this creates a new namespace entry for the device node *
-* *
-* Creates a name node, and links it to the supplied node *
-\***********************************************************************/
-/*proto*/
+/***********************************************************************
+ * Add a new NAME element to the devfs
+ * If we're creating a root node, then dirname is NULL
+ * Basically this creates a new namespace entry for the device node
+ *
+ * Creates a name node, and links it to the supplied node
+ *
+ * called with DEVFS_LOCK held
+ ***********************************************************************/
 int
-dev_add_name(char * name, devnode_t * dirnode, devdirent_t * back,
+dev_add_name(char * name, devnode_t * dirnode, __unused devdirent_t * back,
 	     devnode_t * dnp, devdirent_t * *dirent_pp)
 {
 	devdirent_t * dirent_p = NULL;
@@ -470,8 +397,6 @@ dev_add_name(char * name, devnode_t * dirnode, devdirent_t * back,
 	/*
 	 * Put it on the END of the linked list of directory entries
 	 */
-	int len;
-
 	dirent_p->de_parent = dirnode; /* null for root */
 	dirent_p->de_prevp = dirnode->dn_typeinfo.Dir.dirlast;
 	dirent_p->de_next = *(dirent_p->de_prevp); /* should be NULL */
@@ -488,21 +413,22 @@
 }
 
-/***********************************************************************\
-* Add a new element to the devfs plane. *
-* *
-* Creates a new dev_node to go with it if the prototype should not be *
-* reused. (Is a DIR, or we select SPLIT_DEVS at compile time) *
-* typeinfo gives us info to make our node if we don't have a prototype. *
-* If typeinfo is null and proto exists, then the typeinfo field of *
-* the proto is used intead in the CREATE case. *
-* note the 'links' count is 0 (except if a dir) *
-* but it is only cleared on a transition *
-* so this is ok till we link it to something *
-* Even in SPLIT_DEVS mode, *
-* if the node already exists on the wanted plane, just return it *
-\***********************************************************************/
-/*proto*/
+/***********************************************************************
+ * Add a new element to the devfs plane.
+ *
+ * Creates a new dev_node to go with it if the prototype should not be
+ * reused. (Is a DIR, or we select SPLIT_DEVS at compile time)
+ * typeinfo gives us info to make our node if we don't have a prototype.
+ * If typeinfo is null and proto exists, then the typeinfo field of
+ * the proto is used instead in the CREATE case.
+ * note the 'links' count is 0 (except if a dir) + * but it is only cleared on a transition + * so this is ok till we link it to something + * Even in SPLIT_DEVS mode, + * if the node already exists on the wanted plane, just return it + * + * called with DEVFS_LOCK held +***********************************************************************/ int dev_add_node(int entrytype, devnode_type_t * typeinfo, devnode_t * proto, devnode_t * *dn_pp, struct devfsmount *dvm) @@ -545,7 +471,7 @@ dev_add_node(int entrytype, devnode_type_t * typeinfo, devnode_t * proto, * If we have a proto, that means that we are duplicating some * other device, which can only happen if we are not at the back plane */ - if(proto) { + if (proto) { bcopy(proto, dnp, sizeof(devnode_t)); dnp->dn_links = 0; dnp->dn_linklist = NULL; @@ -562,8 +488,8 @@ dev_add_node(int entrytype, devnode_type_t * typeinfo, devnode_t * proto, /* * We have no prototype, so start off with a clean slate */ - tv = time; - bzero(dnp,sizeof(devnode_t)); + microtime(&tv); + bzero(dnp, sizeof(devnode_t)); dnp->dn_type = entrytype; dnp->dn_nextsibling = dnp; dnp->dn_prevsiblingp = &(dnp->dn_nextsibling); @@ -639,21 +565,29 @@ dev_add_node(int entrytype, devnode_type_t * typeinfo, devnode_t * proto, } -/*proto*/ +/*********************************************************************** + * called with DEVFS_LOCK held + **********************************************************************/ void devnode_free(devnode_t * dnp) { + if (dnp->dn_lflags & DN_BUSY) { + dnp->dn_lflags |= DN_DELETE; + return; + } if (dnp->dn_type == DEV_SLNK) { DEVFS_DECR_STRINGSPACE(dnp->dn_typeinfo.Slnk.namelen + 1); - FREE(dnp->dn_typeinfo.Slnk.name,M_DEVFSNODE); + FREE(dnp->dn_typeinfo.Slnk.name, M_DEVFSNODE); } - FREE(dnp, M_DEVFSNODE); DEVFS_DECR_NODES(); - return; + FREE(dnp, M_DEVFSNODE); } -/*proto*/ -void + +/*********************************************************************** + * called with DEVFS_LOCK held + **********************************************************************/ +static void devfs_dn_free(devnode_t * dnp) { if(--dnp->dn_links <= 0 ) /* can be -1 for initial free, on error */ @@ -666,16 +600,9 @@ devfs_dn_free(devnode_t * dnp) } if (dnp->dn_vn == NULL) { -#if 0 - printf("devfs_dn_free: free'ing %x\n", (unsigned int)dnp); -#endif devnode_free(dnp); /* no accesses/references */ } else { -#if 0 - printf("devfs_dn_free: marking %x for deletion\n", - (unsigned int)dnp); -#endif dnp->dn_delete = TRUE; } } @@ -686,20 +613,21 @@ devfs_dn_free(devnode_t * dnp) * Add or delete a chain of front nodes * \***********************************************************************/ -/***********************************************************************\ -* Given a directory backing node, and a child backing node, add the * -* appropriate front nodes to the front nodes of the directory to * -* represent the child node to the user * -* * -* on failure, front nodes will either be correct or not exist for each * -* front dir, however dirs completed will not be stripped of completed * -* frontnodes on failure of a later frontnode * -* * -* This allows a new node to be propogated through all mounted planes * -* * -\***********************************************************************/ -/*proto*/ -int + +/*********************************************************************** + * Given a directory backing node, and a child backing node, add the + * appropriate front nodes to the front nodes of the directory to + * represent the child node to the user + * + * on failure, 
front nodes will either be correct or not exist for each + * front dir, however dirs completed will not be stripped of completed + * frontnodes on failure of a later frontnode + * + * This allows a new node to be propogated through all mounted planes + * + * called with DEVFS_LOCK held + ***********************************************************************/ +static int devfs_propogate(devdirent_t * parent,devdirent_t * child) { int error; @@ -709,9 +637,9 @@ devfs_propogate(devdirent_t * parent,devdirent_t * child) devnode_t * adnp = parent->de_dnp; int type = child->de_dnp->dn_type; - /***********************************************\ - * Find the other instances of the parent node * - \***********************************************/ + /*********************************************** + * Find the other instances of the parent node + ***********************************************/ for (adnp = pdnp->dn_nextsibling; adnp != pdnp; adnp = adnp->dn_nextsibling) @@ -730,6 +658,7 @@ devfs_propogate(devdirent_t * parent,devdirent_t * child) return 0; /* for now always succeed */ } + /*********************************************************************** * remove all instances of this devicename [for backing nodes..] * note.. if there is another link to the node (non dir nodes only) @@ -745,20 +674,17 @@ devfs_remove(void *dirent_p) { devnode_t * dnp = ((devdirent_t *)dirent_p)->de_dnp; devnode_t * dnp2; - boolean_t funnel_state; boolean_t lastlink; - funnel_state = thread_funnel_set(kernel_flock, TRUE); + DEVFS_LOCK(); if (!devfs_ready) { printf("devfs_remove: not ready for devices!\n"); goto out; } - DEVFS_LOCK(0); - /* keep removing the next sibling till only we exist. */ - while((dnp2 = dnp->dn_nextsibling) != dnp) { + while ((dnp2 = dnp->dn_nextsibling) != dnp) { /* * Keep removing the next front node till no more exist @@ -767,7 +693,7 @@ devfs_remove(void *dirent_p) dnp->dn_nextsibling->dn_prevsiblingp = &(dnp->dn_nextsibling); dnp2->dn_nextsibling = dnp2; dnp2->dn_prevsiblingp = &(dnp2->dn_nextsibling); - if(dnp2->dn_linklist) { + if (dnp2->dn_linklist) { do { lastlink = (1 == dnp2->dn_links); dev_free_name(dnp2->dn_linklist); @@ -780,26 +706,28 @@ devfs_remove(void *dirent_p) * If we are not running in SPLIT_DEVS mode, then * THIS is what gets rid of the propogated nodes. */ - if(dnp->dn_linklist) { + if (dnp->dn_linklist) { do { lastlink = (1 == dnp->dn_links); dev_free_name(dnp->dn_linklist); } while (!lastlink); } - DEVFS_UNLOCK(0); out: - (void) thread_funnel_set(kernel_flock, funnel_state); + DEVFS_UNLOCK(); + return ; } + /*************************************************************** * duplicate the backing tree into a tree of nodes hung off the * mount point given as the argument. Do this by * calling dev_dup_entry which recurses all the way * up the tree.. 
+ * + * called with DEVFS_LOCK held **************************************************************/ -/*proto*/ int dev_dup_plane(struct devfsmount *devfs_mp_p) { @@ -807,40 +735,43 @@ dev_dup_plane(struct devfsmount *devfs_mp_p) int error = 0; if ((error = dev_dup_entry(NULL, dev_root, &new, devfs_mp_p))) - return error; + return error; devfs_mp_p->plane_root = new; return error; } -/***************************************************************\ -* Free a whole plane -\***************************************************************/ -/*proto*/ +/*************************************************************** + * Free a whole plane + * + * called with DEVFS_LOCK held + ***************************************************************/ void devfs_free_plane(struct devfsmount *devfs_mp_p) { devdirent_t * dirent_p; dirent_p = devfs_mp_p->plane_root; - if(dirent_p) { + if (dirent_p) { dev_free_hier(dirent_p); dev_free_name(dirent_p); } devfs_mp_p->plane_root = NULL; } -/***************************************************************\ -* Create and link in a new front element.. * -* Parent can be 0 for a root node * -* Not presently usable to make a symlink XXX * -* (Ok, symlinks don't propogate) -* recursively will create subnodes corresponding to equivalent * -* child nodes in the base level * -\***************************************************************/ -/*proto*/ -int + +/*************************************************************** + * Create and link in a new front element.. + * Parent can be 0 for a root node + * Not presently usable to make a symlink XXX + * (Ok, symlinks don't propogate) + * recursively will create subnodes corresponding to equivalent + * child nodes in the base level + * + * called with DEVFS_LOCK held + ***************************************************************/ +static int dev_dup_entry(devnode_t * parent, devdirent_t * back, devdirent_t * *dnm_pp, struct devfsmount *dvm) { @@ -890,13 +821,16 @@ dev_dup_entry(devnode_t * parent, devdirent_t * back, devdirent_t * *dnm_pp, return error; } -/***************************************************************\ -* Free a name node * -* remember that if there are other names pointing to the * -* dev_node then it may not get freed yet * -* can handle if there is no dnp * -\***************************************************************/ -/*proto*/ + +/*************************************************************** + * Free a name node + * remember that if there are other names pointing to the + * dev_node then it may not get freed yet + * can handle if there is no dnp + * + * called with DEVFS_LOCK held + ***************************************************************/ + int dev_free_name(devdirent_t * dirent_p) { @@ -952,19 +886,22 @@ dev_free_name(devdirent_t * dirent_p) } DEVFS_DECR_ENTRIES(); - FREE(dirent_p,M_DEVFSNAME); + FREE(dirent_p, M_DEVFSNAME); return 0; } -/***************************************************************\ -* Free a hierarchy starting at a directory node name * -* remember that if there are other names pointing to the * -* dev_node then it may not get freed yet * -* can handle if there is no dnp * -* leave the node itself allocated. 
-/***************************************************************\
-* Free a hierarchy starting at a directory node name		*
-* remember that if there are other names pointing to the	*
-* dev_node then it may not get freed yet			*
-* can handle if there is no dnp					*
-* leave the node itself allocated.				*
-\***************************************************************/
-/*proto*/
-void
+
+/***************************************************************
+ * Free a hierarchy starting at a directory node name
+ * remember that if there are other names pointing to the
+ * dev_node then it may not get freed yet
+ * can handle if there is no dnp
+ * leave the node itself allocated.
+ *
+ * called with DEVFS_LOCK held
+ ***************************************************************/
+
+static void
 dev_free_hier(devdirent_t * dirent_p)
 {
 	devnode_t * dnp = dirent_p->de_dnp;
@@ -981,60 +918,155 @@ dev_free_hier(devdirent_t * dirent_p)
 	}
 }
 
-/***************************************************************\
-* given a dev_node, find the appropriate vnode if one is already
-* associated, or get a new one and associate it with the dev_node
-\***************************************************************/
-/*proto*/
+
+/***************************************************************
+ * given a dev_node, find the appropriate vnode if one is already
+ * associated, or get a new one and associate it with the dev_node
+ *
+ * called with DEVFS_LOCK held
+ ***************************************************************/
 int
-devfs_dntovn(devnode_t * dnp, struct vnode **vn_pp, struct proc * p)
+devfs_dntovn(devnode_t * dnp, struct vnode **vn_pp, __unused struct proc * p)
 {
-	struct vnode *vn_p, *nvp;
+	struct vnode *vn_p;
 	int error = 0;
+	struct vnode_fsparam vfsp;
+	enum vtype vtype = 0;
+	int markroot = 0;
+
+retry:
 	*vn_pp = NULL;
 	vn_p = dnp->dn_vn;
+
+	dnp->dn_lflags |= DN_BUSY;
+
 	if (vn_p) { /* already has a vnode */
-		*vn_pp = vn_p;
-		return(vget(vn_p, LK_EXCLUSIVE, p));
+		uint32_t vid;
+
+		vid = vnode_vid(vn_p);
+
+		DEVFS_UNLOCK();
+
+		error = vnode_getwithvid(vn_p, vid);
+
+		DEVFS_LOCK();
+
+		if (dnp->dn_lflags & DN_DELETE) {
+			/*
+			 * our BUSY node got marked for
+			 * deletion while the DEVFS lock
+			 * was dropped...
			 */
+			if (error == 0) {
+				/*
+				 * vnode_getwithvid returned a valid ref
+				 * which we need to drop
+				 */
+				vnode_put(vn_p);
+			}
+			/*
+			 * set the error to EAGAIN
+			 * which will cause devfs_lookup
+			 * to retry this node
+			 */
+			error = EAGAIN;
+		}
+		if ( !error)
+			*vn_pp = vn_p;
+
+		devfs_release_busy(dnp);
+
+		return error;
+	}
+
+	if (dnp->dn_lflags & DN_CREATE) {
+		dnp->dn_lflags |= DN_CREATEWAIT;
+		msleep(&dnp->dn_lflags, &devfs_mutex, PRIBIO, 0, 0);
+		goto retry;
 	}
-	if (!(error = getnewvnode(VT_DEVFS, dnp->dn_dvm->mount,
-			*(dnp->dn_ops), &vn_p))) {
-		switch(dnp->dn_type) {
+
+	dnp->dn_lflags |= DN_CREATE;
+
+	switch (dnp->dn_type) {
 	case DEV_SLNK:
-		vn_p->v_type = VLNK;
+		vtype = VLNK;
 		break;
 	case DEV_DIR:
 		if (dnp->dn_typeinfo.Dir.parent == dnp) {
-			vn_p->v_flag |= VROOT;
+			markroot = 1;
 		}
-		vn_p->v_type = VDIR;
+		vtype = VDIR;
 		break;
 	case DEV_BDEV:
 	case DEV_CDEV:
-		vn_p->v_type
-			= (dnp->dn_type == DEV_BDEV) ? VBLK : VCHR;
-		if ((nvp = checkalias(vn_p, dnp->dn_typeinfo.dev,
-			dnp->dn_dvm->mount)) != NULL) {
-			vput(vn_p);
-			vn_p = nvp;
-		}
+		vtype = (dnp->dn_type == DEV_BDEV) ? VBLK : VCHR;
 		break;
+	}
+	vfsp.vnfs_mp = dnp->dn_dvm->mount;
+	vfsp.vnfs_vtype = vtype;
+	vfsp.vnfs_str = "devfs";
+	vfsp.vnfs_dvp = 0;
+	vfsp.vnfs_fsnode = dnp;
+	vfsp.vnfs_cnp = 0;
+	vfsp.vnfs_vops = *(dnp->dn_ops);
+
+	if (vtype == VBLK || vtype == VCHR)
+		vfsp.vnfs_rdev = dnp->dn_typeinfo.dev;
+	else
+		vfsp.vnfs_rdev = 0;
+	vfsp.vnfs_filesize = 0;
+	vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE;
+	/* Tag system files */
+	vfsp.vnfs_marksystem = 0;
+	vfsp.vnfs_markroot = markroot;
+
+	DEVFS_UNLOCK();
+
+	error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &vn_p);
+
+	DEVFS_LOCK();
+
+	if (error == 0) {
+		if ((dnp->dn_vn)) {
+			panic("devnode already has a vnode?");
+		} else {
+			dnp->dn_vn = vn_p;
+			*vn_pp = vn_p;
+			vnode_settag(vn_p, VT_DEVFS);
+		}
-		vn_p->v_mount = dnp->dn_dvm->mount;/* XXX Duplicated */
-		*vn_pp = vn_p;
-		vn_p->v_data = (void *)dnp;
-		dnp->dn_vn = vn_p;
-		error = vn_lock(vn_p, LK_EXCLUSIVE | LK_RETRY, p);
 	}
+
+	dnp->dn_lflags &= ~DN_CREATE;
+
+	if (dnp->dn_lflags & DN_CREATEWAIT) {
+		dnp->dn_lflags &= ~DN_CREATEWAIT;
+		wakeup(&dnp->dn_lflags);
+	}
+
+	devfs_release_busy(dnp);
+
 	return error;
 }
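The locking protocol devfs_dntovn follows around the dropped DEVFS lock is worth restating, since each flag carries its own job: DN_BUSY pins the devnode while the lock is down, DN_CREATE serializes concurrent creators (waiters sleep in msleep and are woken when the flag clears), and a DN_DELETE that arrives in the window is converted into EAGAIN so devfs_lookup retries. The sketch below models that control flow in userspace with pthreads. All names are illustrative; the mutex stands in for DEVFS_LOCK, a condition variable for msleep/wakeup, and a plain malloc for vnode_create. It deliberately omits the vnode-id revalidation done on the reuse path.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define DN_BUSY    0x01
#define DN_CREATE  0x02
#define DN_DELETE  0x04

static pthread_mutex_t devfs_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  create_cv  = PTHREAD_COND_INITIALIZER;

struct node {
    int   flags;       /* dn_lflags analogue */
    void *attached;    /* dn_vn analogue */
};

/* devfs_release_busy analogue: last one out honors a pending delete */
static void release_busy(struct node *n)
{
    n->flags &= ~DN_BUSY;
    if (n->flags & DN_DELETE)
        free(n);
}

/* devfs_dntovn-style attach; called with devfs_lock held.  Returns 0 and
 * *out on success, or EAGAIN if a delete raced in while the lock was down
 * (the caller then retries its lookup). */
static int attach(struct node *n, void **out)
{
retry:
    n->flags |= DN_BUSY;

    if (n->attached) {                  /* fast path: reuse existing object */
        *out = n->attached;
        release_busy(n);
        return 0;
    }
    if (n->flags & DN_CREATE) {         /* someone else is mid-create: wait */
        pthread_cond_wait(&create_cv, &devfs_lock);
        goto retry;
    }
    n->flags |= DN_CREATE;

    pthread_mutex_unlock(&devfs_lock);  /* expensive step runs unlocked */
    void *obj = malloc(1);              /* vnode_create stand-in */
    pthread_mutex_lock(&devfs_lock);

    if (n->flags & DN_DELETE) {         /* deletion raced with creation */
        free(obj);
        n->flags &= ~DN_CREATE;
        pthread_cond_broadcast(&create_cv);
        release_busy(n);
        return EAGAIN;
    }
    n->attached = obj;
    *out = obj;
    n->flags &= ~DN_CREATE;
    pthread_cond_broadcast(&create_cv);
    release_busy(n);
    return 0;
}

int main(void)
{
    struct node *n = calloc(1, sizeof(*n));
    void *v;

    pthread_mutex_lock(&devfs_lock);
    printf("first attach:  %d\n", attach(n, &v));   /* creates */
    printf("second attach: %d\n", attach(n, &v));   /* reuses */
    pthread_mutex_unlock(&devfs_lock);
    free(n->attached);
    free(n);
    return 0;
}

-/***********************************************************************\
-* add a whole device, with no prototype.. make name element and node	*
-* Used for adding the original device entries				*
-\***********************************************************************/
-/*proto*/
+
+/***********************************************************************
+ * called with DEVFS_LOCK held
+ ***********************************************************************/
+static void
+devfs_release_busy(devnode_t *dnp) {
+
+	dnp->dn_lflags &= ~DN_BUSY;
+
+	if (dnp->dn_lflags & DN_DELETE)
+		devnode_free(dnp);
+}
+
+/***********************************************************************
+ * add a whole device, with no prototype.. make name element and node
+ * Used for adding the original device entries
+ *
+ * called with DEVFS_LOCK held
+ ***********************************************************************/
 int
 dev_add_entry(char *name, devnode_t * parent, int type, devnode_type_t * typeinfo,
 	      devnode_t * proto, struct devfsmount *dvm, devdirent_t * *nm_pp)
@@ -1059,6 +1091,7 @@ dev_add_entry(char *name, devnode_t * parent, int type, devnode_type_t * typeinf
 	return error;
 }
 
+
 /*
  * Function: devfs_make_node
 *
@@ -1076,27 +1109,28 @@ dev_add_entry(char *name, devnode_t * parent, int type, devnode_type_t * typeinf
 */
 void *
 devfs_make_node(dev_t dev, int chrblk, uid_t uid,
-		gid_t gid, int perms, char *fmt, ...)
+		gid_t gid, int perms, const char *fmt, ...)
 {
 	devdirent_t *	new_dev = NULL;
 	devnode_t *	dnp;	/* devnode for parent directory */
 	devnode_type_t	typeinfo;
 
 	char *name, *path, buf[256]; /* XXX */
-	boolean_t funnel_state;
 	int i;
 	va_list ap;
 
-	funnel_state = thread_funnel_set(kernel_flock, TRUE);
+
+	DEVFS_LOCK();
 
 	if (!devfs_ready) {
 		printf("devfs_make_node: not ready for devices!\n");
 		goto out;
 	}
-
 	if (chrblk != DEVFS_CHAR && chrblk != DEVFS_BLOCK)
 		goto out;
 
+	DEVFS_UNLOCK();
+
 	va_start(ap, fmt);
 	vsnprintf(buf, sizeof(buf), fmt, ap);
 	va_end(ap);
@@ -1117,8 +1151,8 @@ devfs_make_node(dev_t dev, int chrblk, uid_t uid,
 		name = buf;
 		path = "/";
 	}
+	DEVFS_LOCK();
-	DEVFS_LOCK(0);
 	/* find/create directory path ie. 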
mkdir -p */ if (dev_finddir(path, NULL, CREATE, &dnp) == 0) { typeinfo.dev = dev; @@ -1131,10 +1165,9 @@ devfs_make_node(dev_t dev, int chrblk, uid_t uid, devfs_propogate(dnp->dn_typeinfo.Dir.myname, new_dev); } } - DEVFS_UNLOCK(0); - out: - (void) thread_funnel_set(kernel_flock, funnel_state); + DEVFS_UNLOCK(); + return new_dev; } @@ -1157,14 +1190,14 @@ devfs_make_link(void *original, char *fmt, ...) va_list ap; char *p, buf[256]; /* XXX */ int i; - boolean_t funnel_state; - funnel_state = thread_funnel_set(kernel_flock, TRUE); + DEVFS_LOCK(); if (!devfs_ready) { printf("devfs_make_link: not ready for devices!\n"); goto out; } + DEVFS_UNLOCK(); va_start(ap, fmt); vsnprintf(buf, sizeof(buf), fmt, ap); @@ -1172,28 +1205,31 @@ devfs_make_link(void *original, char *fmt, ...) p = NULL; - for(i=strlen(buf); i>0; i--) + for(i=strlen(buf); i>0; i--) { if(buf[i] == '/') { p=&buf[i]; buf[i]=0; break; } - DEVFS_LOCK(0); + } + DEVFS_LOCK(); + if (p) { - *p++ = '\0'; - if (dev_finddir(buf, NULL, CREATE, &dirnode) - || dev_add_name(p, dirnode, NULL, orig->de_dnp, &new_dev)) - goto fail; + *p++ = '\0'; + + if (dev_finddir(buf, NULL, CREATE, &dirnode) + || dev_add_name(p, dirnode, NULL, orig->de_dnp, &new_dev)) + goto fail; } else { - if (dev_finddir("", NULL, CREATE, &dirnode) - || dev_add_name(buf, dirnode, NULL, orig->de_dnp, &new_dev)) - goto fail; + if (dev_finddir("", NULL, CREATE, &dirnode) + || dev_add_name(buf, dirnode, NULL, orig->de_dnp, &new_dev)) + goto fail; } devfs_propogate(dirnode->dn_typeinfo.Dir.myname, new_dev); fail: - DEVFS_UNLOCK(0); out: - (void) thread_funnel_set(kernel_flock, funnel_state); + DEVFS_UNLOCK(); + return ((new_dev != NULL) ? 0 : -1); } diff --git a/bsd/miscfs/devfs/devfs_vfsops.c b/bsd/miscfs/devfs/devfs_vfsops.c index 4c6b4729b..c2148de5d 100644 --- a/bsd/miscfs/devfs/devfs_vfsops.c +++ b/bsd/miscfs/devfs/devfs_vfsops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -58,18 +58,19 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/proc.h> -#include <sys/mount.h> +#include <sys/kauth.h> +#include <sys/mount_internal.h> #include <sys/malloc.h> #include "devfs.h" #include "devfsdefs.h" -static int devfs_statfs( struct mount *mp, struct statfs *sbp, struct proc *p); +static int devfs_statfs( struct mount *mp, struct vfsstatfs *sbp, vfs_context_t context); +static int devfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t context); -static struct vfsconf * devfs_vfsp = 0; -static int kernel_mount = 0; +static struct vfstable * devfs_vfsp = 0; /*- @@ -83,10 +84,10 @@ static int kernel_mount = 0; static int devfs_init(struct vfsconf *vfsp) { - devfs_vfsp = vfsp; /* remember this for devfs_kernel_mount below */ + devfs_vfsp = (struct vfstable *)vfsp; /* remember this for devfs_kernel_mount below */ if (devfs_sinit()) - return (EOPNOTSUPP); + return (ENOTSUP); devfs_make_node(makedev(0, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0622, "console"); devfs_make_node(makedev(2, 0), DEVFS_CHAR, @@ -119,12 +120,10 @@ devfs_init(struct vfsconf *vfsp) */ /*proto*/ int -devfs_mount(struct mount *mp, char *path, caddr_t data, - struct nameidata *ndp, struct proc *p) +devfs_mount(struct mount *mp, __unused vnode_t devvp, __unused user_addr_t data, vfs_context_t context) { struct devfsmount *devfs_mp_p; /* devfs specific mount info */ int error; - size_t size; /*- * If they just want to update, we don't need to do anything. @@ -134,6 +133,9 @@ devfs_mount(struct mount *mp, char *path, caddr_t data, return 0; } + /* Advisory locking should be handled at the VFS layer */ + vfs_setlocklocal(mp); + /*- * Well, it's not an update, it's a real mount request. * Time to get dirty. @@ -151,19 +153,20 @@ devfs_mount(struct mount *mp, char *path, caddr_t data, * Fill out some fields */ mp->mnt_data = (qaddr_t)devfs_mp_p; - mp->mnt_stat.f_type = mp->mnt_vfc->vfc_typenum; - mp->mnt_stat.f_fsid.val[0] = (int32_t)(void *)devfs_mp_p; - mp->mnt_stat.f_fsid.val[1] = mp->mnt_stat.f_type; + mp->mnt_vfsstat.f_fsid.val[0] = (int32_t)(void *)devfs_mp_p; + mp->mnt_vfsstat.f_fsid.val[1] = vfs_typenum(mp); mp->mnt_flag |= MNT_LOCAL; - DEVFS_LOCK(p); + DEVFS_LOCK(); error = dev_dup_plane(devfs_mp_p); - DEVFS_UNLOCK(p); + DEVFS_UNLOCK(); + if (error) { mp->mnt_data = (qaddr_t)0; FREE((caddr_t)devfs_mp_p, M_DEVFSMNT); return (error); - } + } else + DEVFS_INCR_MOUNTS(); /*- * Copy in the name of the directory the filesystem @@ -172,22 +175,16 @@ devfs_mount(struct mount *mp, char *path, caddr_t data, * to be tidy. */ - if (!kernel_mount) { - copyinstr(path, (caddr_t)mp->mnt_stat.f_mntonname, - sizeof(mp->mnt_stat.f_mntonname)-1, &size); - bzero(mp->mnt_stat.f_mntonname + size, - sizeof(mp->mnt_stat.f_mntonname) - size); - } - bzero(mp->mnt_stat.f_mntfromname, MNAMELEN); - bcopy("devfs",mp->mnt_stat.f_mntfromname, 5); - DEVFS_INCR_MOUNTS(); - (void)devfs_statfs(mp, &mp->mnt_stat, p); + bzero(mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN); + bcopy("devfs",mp->mnt_vfsstat.f_mntfromname, 5); + (void)devfs_statfs(mp, &mp->mnt_vfsstat, context); + return 0; } static int -devfs_start(struct mount *mp, int flags, struct proc *p) +devfs_start(__unused struct mount *mp, __unused int flags, __unused vfs_context_t context) { return 0; } @@ -196,7 +193,7 @@ devfs_start(struct mount *mp, int flags, struct proc *p) * Unmount the filesystem described by mp. 
*/ static int -devfs_unmount( struct mount *mp, int mntflags, struct proc *p) +devfs_unmount( struct mount *mp, int mntflags, __unused vfs_context_t context) { struct devfsmount *devfs_mp_p = (struct devfsmount *)mp->mnt_data; int flags = 0; @@ -211,11 +208,13 @@ devfs_unmount( struct mount *mp, int mntflags, struct proc *p) if (error && !force) return error; - DEVFS_LOCK(p); + DEVFS_LOCK(); devfs_free_plane(devfs_mp_p); - DEVFS_UNLOCK(p); - FREE((caddr_t)devfs_mp_p, M_DEVFSMNT); + DEVFS_UNLOCK(); + DEVFS_DECR_MOUNTS(); + + FREE((caddr_t)devfs_mp_p, M_DEVFSMNT); mp->mnt_data = (qaddr_t)0; mp->mnt_flag &= ~MNT_LOCAL; @@ -224,32 +223,27 @@ devfs_unmount( struct mount *mp, int mntflags, struct proc *p) /* return the address of the root vnode in *vpp */ static int -devfs_root(struct mount *mp, struct vnode **vpp) +devfs_root(struct mount *mp, struct vnode **vpp, vfs_context_t context) { struct devfsmount *devfs_mp_p = (struct devfsmount *)(mp->mnt_data); int error; - error = devfs_dntovn(devfs_mp_p->plane_root->de_dnp,vpp, - current_proc()); - return error; -} + DEVFS_LOCK(); + error = devfs_dntovn(devfs_mp_p->plane_root->de_dnp, vpp, context->vc_proc); + DEVFS_UNLOCK(); -static int -devfs_quotactl(struct mount *mp, int cmds, uid_t uid, caddr_t arg, - struct proc *p) -{ - return EOPNOTSUPP; + return error; } static int -devfs_statfs( struct mount *mp, struct statfs *sbp, struct proc *p) +devfs_statfs( struct mount *mp, struct vfsstatfs *sbp, __unused vfs_context_t context) { struct devfsmount *devfs_mp_p = (struct devfsmount *)mp->mnt_data; /*- * Fill in the stat block. */ - sbp->f_type = mp->mnt_stat.f_type; + //sbp->f_type = mp->mnt_vfsstat.f_type; sbp->f_flags = 0; /* XXX */ sbp->f_bsize = 512; sbp->f_iosize = 512; @@ -263,33 +257,48 @@ devfs_statfs( struct mount *mp, struct statfs *sbp, struct proc *p) sbp->f_files = devfs_stats.nodes; sbp->f_ffree = 0; sbp->f_fsid.val[0] = (int32_t)(void *)devfs_mp_p; - sbp->f_fsid.val[1] = mp->mnt_stat.f_type; + sbp->f_fsid.val[1] = vfs_typenum(mp); - /*- - * Copy the mounted on and mounted from names into - * the passed in stat block, if it is not the one - * in the mount structure. 
- */ - if (sbp != &mp->mnt_stat) { - bcopy((caddr_t)mp->mnt_stat.f_mntonname, - (caddr_t)&sbp->f_mntonname[0], MNAMELEN); - bcopy((caddr_t)mp->mnt_stat.f_mntfromname, - (caddr_t)&sbp->f_mntfromname[0], MNAMELEN); + return 0; +} + +static int +devfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t context) +{ + VFSATTR_RETURN(fsap, f_objcount, devfs_stats.nodes); + VFSATTR_RETURN(fsap, f_maxobjcount, devfs_stats.nodes); + VFSATTR_RETURN(fsap, f_bsize, 512); + VFSATTR_RETURN(fsap, f_iosize, 512); + if (VFSATTR_IS_ACTIVE(fsap, f_blocks) || VFSATTR_IS_ACTIVE(fsap, f_bused)) { + fsap->f_blocks = (devfs_stats.mounts * sizeof(struct devfsmount) + + devfs_stats.nodes * sizeof(devnode_t) + + devfs_stats.entries * sizeof(devdirent_t) + + devfs_stats.stringspace + ) / fsap->f_bsize; + fsap->f_bused = fsap->f_blocks; + VFSATTR_SET_SUPPORTED(fsap, f_blocks); + VFSATTR_SET_SUPPORTED(fsap, f_bused); } + VFSATTR_RETURN(fsap, f_bfree, 0); + VFSATTR_RETURN(fsap, f_bavail, 0); + VFSATTR_RETURN(fsap, f_files, devfs_stats.nodes); + VFSATTR_RETURN(fsap, f_ffree, 0); + VFSATTR_RETURN(fsap, f_fssubtype, 0); + return 0; } static int -devfs_sync(struct mount *mp, int waitfor,struct ucred *cred,struct proc *p) +devfs_sync(__unused struct mount *mp, __unused int waitfor, __unused vfs_context_t context) { return (0); } static int -devfs_vget(struct mount *mp, void * ino,struct vnode **vpp) +devfs_vget(__unused struct mount *mp, __unused ino64_t ino, __unused struct vnode **vpp, __unused vfs_context_t context) { - return EOPNOTSUPP; + return ENOTSUP; } /************************************************************* @@ -298,30 +307,24 @@ devfs_vget(struct mount *mp, void * ino,struct vnode **vpp) */ static int -devfs_fhtovp (struct mount *mp, struct fid *fhp, struct mbuf *nam, - struct vnode **vpp, int *exflagsp, struct ucred **credanonp) +devfs_fhtovp (__unused struct mount *mp, __unused int fhlen, __unused unsigned char *fhp, __unused struct vnode **vpp, __unused vfs_context_t context) { return (EINVAL); } static int -devfs_vptofh (struct vnode *vp, struct fid *fhp) +devfs_vptofh (__unused struct vnode *vp, __unused int *fhlenp, __unused unsigned char *fhp, __unused vfs_context_t context) { return (EINVAL); } static int -devfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; +devfs_sysctl(__unused int *name, __unused u_int namelen, __unused user_addr_t oldp, + __unused size_t *oldlenp, __unused user_addr_t newp, + __unused size_t newlen, __unused vfs_context_t context) { - return (EOPNOTSUPP); + return (ENOTSUP); } #include <sys/namei.h> @@ -336,39 +339,47 @@ devfs_kernel_mount(char * mntname) { struct mount *mp; int error; - struct proc *procp; struct nameidata nd; struct vnode * vp; + struct vfs_context context; if (devfs_vfsp == NULL) { printf("devfs_kernel_mount: devfs_vfsp is NULL\n"); return (EINVAL); } - procp = current_proc(); + context.vc_proc = current_proc(); + context.vc_ucred = kauth_cred_get(); /* * Get vnode to be covered */ - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, - mntname, procp); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE32, + CAST_USER_ADDR_T(mntname), &context); if ((error = namei(&nd))) { printf("devfs_kernel_mount: failed to find directory '%s', %d", mntname, error); return (error); } + nameidone(&nd); vp = nd.ni_vp; - if ((error = vinvalbuf(vp, V_SAVE, procp->p_ucred, procp, 0, 0))) { - printf("devfs_kernel_mount: vinval failed: %d\n", error); 
- vput(vp); + + if ((error = VNOP_FSYNC(vp, MNT_WAIT, &context))) { + printf("devfs_kernel_mount: vnop_fsync failed: %d\n", error); + vnode_put(vp); return (error); } - if (vp->v_type != VDIR) { + if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) { + printf("devfs_kernel_mount: buf_invalidateblks failed: %d\n", error); + vnode_put(vp); + return (error); + } + if (vnode_isdir(vp) == 0) { printf("devfs_kernel_mount: '%s' is not a directory\n", mntname); - vput(vp); + vnode_put(vp); return (ENOTDIR); } - if (vp->v_mountedhere != NULL) { - vput(vp); + if ((vnode_mountedhere(vp))) { + vnode_put(vp); return (EBUSY); } @@ -379,44 +390,46 @@ devfs_kernel_mount(char * mntname) M_MOUNT, M_WAITOK); bzero((char *)mp, (u_long)sizeof(struct mount)); - /* Initialize the default IO constraints */ - mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS; - mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32; + /* Initialize the default IO constraints */ + mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS; + mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32; + + mount_lock_init(mp); + TAILQ_INIT(&mp->mnt_vnodelist); + TAILQ_INIT(&mp->mnt_workerqueue); + TAILQ_INIT(&mp->mnt_newvnodes); - lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0); - (void)vfs_busy(mp, LK_NOWAIT, 0, procp); - LIST_INIT(&mp->mnt_vnodelist); + (void)vfs_busy(mp, LK_NOWAIT); mp->mnt_op = devfs_vfsp->vfc_vfsops; - mp->mnt_vfc = devfs_vfsp; + mp->mnt_vtable = devfs_vfsp; devfs_vfsp->vfc_refcount++; + devfs_vfsp->vfc_threadsafe = TRUE; + devfs_vfsp->vfc_64bitready = TRUE; mp->mnt_flag = 0; mp->mnt_flag |= devfs_vfsp->vfc_flags & MNT_VISFLAGMASK; - strncpy(mp->mnt_stat.f_fstypename, devfs_vfsp->vfc_name, MFSNAMELEN); + strncpy(mp->mnt_vfsstat.f_fstypename, devfs_vfsp->vfc_name, MFSTYPENAMELEN); vp->v_mountedhere = mp; mp->mnt_vnodecovered = vp; - mp->mnt_stat.f_owner = procp->p_ucred->cr_uid; - (void) copystr(mntname, mp->mnt_stat.f_mntonname, MNAMELEN - 1, 0); + mp->mnt_vfsstat.f_owner = kauth_cred_getuid(kauth_cred_get()); + (void) copystr(mntname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN - 1, 0); + + error = devfs_mount(mp, NULL, NULL, &context); - kernel_mount = 1; - error = devfs_mount(mp, mntname, NULL, NULL, procp); - kernel_mount = 0; if (error) { printf("devfs_kernel_mount: mount %s failed: %d", mntname, error); - mp->mnt_vfc->vfc_refcount--; + mp->mnt_vtable->vfc_refcount--; - if (mp->mnt_kern_flag & MNTK_IO_XINFO) - FREE(mp->mnt_xinfo_ptr, M_TEMP); - vfs_unbusy(mp, procp); + vfs_unbusy(mp); + mount_lock_destroy(mp); FREE_ZONE(mp, sizeof (struct mount), M_MOUNT); - vput(vp); + vnode_put(vp); return (error); } - simple_lock(&mountlist_slock); - CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); - simple_unlock(&mountlist_slock); - VOP_UNLOCK(vp, 0, procp); - vfs_unbusy(mp, procp); + vnode_ref(vp); + vnode_put(vp); + vfs_unbusy(mp); + mount_list_add(mp); return (0); } @@ -425,12 +438,12 @@ struct vfsops devfs_vfsops = { devfs_start, devfs_unmount, devfs_root, - devfs_quotactl, - devfs_statfs, + NULL, /* quotactl */ + devfs_vfs_getattr, devfs_sync, devfs_vget, devfs_fhtovp, devfs_vptofh, devfs_init, - devfs_sysctl, + devfs_sysctl }; diff --git a/bsd/miscfs/devfs/devfs_vnops.c b/bsd/miscfs/devfs/devfs_vnops.c index 9e8c291fd..c74d145f2 100644 --- a/bsd/miscfs/devfs/devfs_vnops.c +++ b/bsd/miscfs/devfs/devfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -69,7 +69,6 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/buf.h> #include <sys/namei.h> #include <sys/kernel.h> #include <sys/fcntl.h> @@ -77,17 +76,23 @@ #include <sys/disklabel.h> #include <sys/lock.h> #include <sys/stat.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <sys/time.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <miscfs/specfs/specdev.h> #include <sys/dirent.h> #include <sys/vmmeter.h> #include <sys/vm.h> +#include <sys/uio_internal.h> #include "devfsdefs.h" +static int devfs_update(struct vnode *vp, struct timeval *access, + struct timeval *modify); + + /* * Convert a component of a pathname into a pointer to a locked node. * This is a very central and rather complicated routine. @@ -126,14 +131,17 @@ * NOTE: (LOOKUP | LOCKPARENT) currently returns the parent node unlocked. */ static int -devfs_lookup(struct vop_lookup_args *ap) - /*struct vop_lookup_args { +devfs_lookup(struct vnop_lookup_args *ap) + /*struct vnop_lookup_args { struct vnode * a_dvp; directory vnode ptr struct vnode ** a_vpp; where to put the result struct componentname * a_cnp; the name we want + vfs_context_t a_context; };*/ { struct componentname *cnp = ap->a_cnp; + vfs_context_t ctx = cnp->cn_context; + struct proc *p = vfs_context_proc(ctx); struct vnode *dir_vnode = ap->a_dvp; struct vnode **result_vnode = ap->a_vpp; devnode_t * dir_node; /* the directory we are searching */ @@ -141,69 +149,69 @@ devfs_lookup(struct vop_lookup_args *ap) devdirent_t * nodename; int flags = cnp->cn_flags; int op = cnp->cn_nameiop; /* LOOKUP, CREATE, RENAME, or DELETE */ - int lockparent = flags & LOCKPARENT; int wantparent = flags & (LOCKPARENT|WANTPARENT); int error = 0; - struct proc *p = cnp->cn_proc; char heldchar; /* the char at the end of the name componet */ +retry: + *result_vnode = NULL; /* safe not sorry */ /*XXX*/ - if (dir_vnode->v_usecount == 0) - printf("devfs_lookup: dir had no refs "); + //if (dir_vnode->v_usecount == 0) + //printf("devfs_lookup: dir had no refs "); dir_node = VTODN(dir_vnode); /* - * Check accessiblity of directory. + * Make sure that our node is a directory as well. */ if (dir_node->dn_type != DEV_DIR) { return (ENOTDIR); } - if ((error = VOP_ACCESS(dir_vnode, VEXEC, cnp->cn_cred, p)) != 0) { - return (error); - } - - /* temporarily terminate string component */ + DEVFS_LOCK(); + /* + * temporarily terminate string component + */ heldchar = cnp->cn_nameptr[cnp->cn_namelen]; cnp->cn_nameptr[cnp->cn_namelen] = '\0'; - DEVFS_LOCK(p); - nodename = dev_findname(dir_node,cnp->cn_nameptr); - if (nodename) { - /* entry exists */ - node = nodename->de_dnp; - node->dn_last_lookup = nodename; /* for unlink */ - /* Do potential vnode allocation here inside the lock - * to make sure that our device node has a non-NULL dn_vn - * associated with it. The device node might otherwise - * get deleted out from under us (see devfs_dn_free()). 
- */ - error = devfs_dntovn(node, result_vnode, p); - } - DEVFS_UNLOCK(p); - /* restore saved character */ + + nodename = dev_findname(dir_node, cnp->cn_nameptr); + /* + * restore saved character + */ cnp->cn_nameptr[cnp->cn_namelen] = heldchar; - if (error) - return (error); + if (nodename) { + /* entry exists */ + node = nodename->de_dnp; - if (!nodename) { /* no entry */ - /* If it doesn't exist and we're not the last component, + /* Do potential vnode allocation here inside the lock + * to make sure that our device node has a non-NULL dn_vn + * associated with it. The device node might otherwise + * get deleted out from under us (see devfs_dn_free()). + */ + error = devfs_dntovn(node, result_vnode, p); + } + DEVFS_UNLOCK(); + + if (error) { + if (error == EAGAIN) + goto retry; + return error; + } + if (!nodename) { + /* + * we haven't called devfs_dntovn if we get here + * we have not taken a reference on the node.. no + * vnode_put is necessary on these error returns + * + * If it doesn't exist and we're not the last component, * or we're at the last component, but we're not creating * or renaming, return ENOENT. */ if (!(flags & ISLASTCN) || !(op == CREATE || op == RENAME)) { return ENOENT; } - /* - * Access for write is interpreted as allowing - * creation of files in the directory. - */ - if ((error = VOP_ACCESS(dir_vnode, VWRITE, - cnp->cn_cred, p)) != 0) - { - return (error); - } /* * We return with the directory locked, so that * the parameters we set up above will still be @@ -211,17 +219,16 @@ devfs_lookup(struct vop_lookup_args *ap) * We return ni_vp == NULL to indicate that the entry * does not currently exist; we leave a pointer to * the (locked) directory vnode in namei_data->ni_dvp. - * The pathname buffer is saved so that the name - * can be obtained later. * * NB - if the directory is unlocked, then this * information cannot be used. */ - cnp->cn_flags |= SAVENAME; - if (!lockparent) - VOP_UNLOCK(dir_vnode, 0, p); return (EJUSTRETURN); } + /* + * from this point forward, we need to vnode_put the reference + * picked up in devfs_dntovn if we decide to return an error + */ /* * If deleting, and at end of pathname, return @@ -231,37 +238,20 @@ devfs_lookup(struct vop_lookup_args *ap) * on and lock the node, being careful with ".". */ if (op == DELETE && (flags & ISLASTCN)) { - /* - * Write access to directory required to delete files. - */ - if ((error = VOP_ACCESS(dir_vnode, VWRITE, - cnp->cn_cred, p)) != 0) - return (error); + /* * we are trying to delete '.'. What does this mean? XXX */ if (dir_node == node) { - VREF(dir_vnode); - *result_vnode = dir_vnode; - return (0); - } -#ifdef NOTYET - /* - * If directory is "sticky", then user must own - * the directory, or the file in it, else she - * may not delete it (unless she's root). This - * implements append-only directories. - */ - if ((dir_node->mode & ISVTX) && - cnp->cn_cred->cr_uid != 0 && - cnp->cn_cred->cr_uid != dir_node->uid && - cnp->cn_cred->cr_uid != node->uid) { - VOP_UNLOCK(*result_vnode, 0, p); - return (EPERM); + if (*result_vnode) { + vnode_put(*result_vnode); + *result_vnode = NULL; + } + if ( ((error = vnode_get(dir_vnode)) == 0) ) { + *result_vnode = dir_vnode; + } + return (error); } -#endif - if (!lockparent) - VOP_UNLOCK(dir_vnode, 0, p); return (0); } @@ -272,22 +262,15 @@ devfs_lookup(struct vop_lookup_args *ap) * regular file, or empty directory. */ if (op == RENAME && wantparent && (flags & ISLASTCN)) { - /* - * Are we allowed to change the holding directory? 
- */ - if ((error = VOP_ACCESS(dir_vnode, VWRITE, - cnp->cn_cred, p)) != 0) - return (error); + /* * Careful about locking second node. * This can only occur if the target is ".". */ - if (dir_node == node) - return (EISDIR); - /* hmm save the 'from' name (we need to delete it) */ - cnp->cn_flags |= SAVENAME; - if (!lockparent) - VOP_UNLOCK(dir_vnode, 0, p); + if (dir_node == node) { + error = EISDIR; + goto drop_ref; + } return (0); } @@ -311,294 +294,193 @@ devfs_lookup(struct vop_lookup_args *ap) * work if the file system has any hard links other than ".." * that point backwards in the directory structure. */ - if (flags & ISDOTDOT) { - VOP_UNLOCK(dir_vnode, 0, p); /* race to get the node */ - if (lockparent && (flags & ISLASTCN)) - vn_lock(dir_vnode, LK_EXCLUSIVE | LK_RETRY, p); - } else if (dir_node == node) { -#if 0 - /* - * this next statement is wrong: we already did a vget in - * devfs_dntovn(); DWS 4/16/1999 - */ - VREF(dir_vnode); /* we want ourself, ie "." */ -#endif + if ((flags & ISDOTDOT) == 0 && dir_node == node) { + if (*result_vnode) { + vnode_put(*result_vnode); + *result_vnode = NULL; + } + if ( (error = vnode_get(dir_vnode)) ) { + return (error); + } *result_vnode = dir_vnode; - } else { - if (!lockparent || (flags & ISLASTCN)) - VOP_UNLOCK(dir_vnode, 0, p); } - return (0); + +drop_ref: + if (*result_vnode) { + vnode_put(*result_vnode); + *result_vnode = NULL; + } + return (error); } static int -devfs_access(struct vop_access_args *ap) - /*struct vop_access_args { +devfs_getattr(struct vnop_getattr_args *ap) + /*struct vnop_getattr_args { struct vnode *a_vp; - int a_mode; - struct ucred *a_cred; + struct vnode_attr *a_vap; + kauth_cred_t a_cred; struct proc *a_p; } */ { - /* - * mode is filled with a combination of VREAD, VWRITE, - * and/or VEXEC bits turned on. In an octal number these - * are the Y in 0Y00. - */ struct vnode *vp = ap->a_vp; - int mode = ap->a_mode; - struct ucred *cred = ap->a_cred; + struct vnode_attr *vap = ap->a_vap; devnode_t * file_node; - gid_t *gp; - int i; - struct proc *p = ap->a_p; + struct timeval now; file_node = VTODN(vp); - /* - * if we are not running as a process, we are in the - * kernel and we DO have permission - */ - if (p == NULL) - return 0; - /* - * Access check is based on only one of owner, group, public. - * If not owner, then check group. If not a member of the - * group, then check public access. - */ - if (cred->cr_uid != file_node->dn_uid) - { - /* failing that.. try groups */ - mode >>= 3; - gp = cred->cr_groups; - for (i = 0; i < cred->cr_ngroups; i++, gp++) - { - if (file_node->dn_gid == *gp) - { - goto found; - } - } - /* failing that.. try general access */ - mode >>= 3; -found: - ; - } - if ((file_node->dn_mode & mode) == mode) - return (0); - /* - * Root gets to do anything. - * but only use suser prives as a last resort - * (Use of super powers is recorded in ap->a_p->p_acflag) - */ - if( suser(cred, &ap->a_p->p_acflag) == 0) /* XXX what if no proc? 
*/ - return 0; - return (EACCES); -} + DEVFS_LOCK(); -static int -devfs_getattr(struct vop_getattr_args *ap) - /*struct vop_getattr_args { - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct proc *a_p; - } */ -{ - struct vnode *vp = ap->a_vp; - struct vattr *vap = ap->a_vap; - devnode_t * file_node; - struct timeval tv; + microtime(&now); + dn_times(file_node, &now, &now, &now); + + VATTR_RETURN(vap, va_mode, file_node->dn_mode); - file_node = VTODN(vp); - tv = time; - dn_times(file_node, tv, tv); - vap->va_rdev = 0;/* default value only */ - vap->va_mode = file_node->dn_mode; switch (file_node->dn_type) { case DEV_DIR: - vap->va_rdev = (dev_t)file_node->dn_dvm; + VATTR_RETURN(vap, va_rdev, (dev_t)file_node->dn_dvm); vap->va_mode |= (S_IFDIR); break; case DEV_CDEV: - vap->va_rdev = file_node->dn_typeinfo.dev; + VATTR_RETURN(vap, va_rdev, file_node->dn_typeinfo.dev); vap->va_mode |= (S_IFCHR); break; case DEV_BDEV: - vap->va_rdev = file_node->dn_typeinfo.dev; + VATTR_RETURN(vap, va_rdev, file_node->dn_typeinfo.dev); vap->va_mode |= (S_IFBLK); break; case DEV_SLNK: + VATTR_RETURN(vap, va_rdev, 0); vap->va_mode |= (S_IFLNK); break; + default: + VATTR_RETURN(vap, va_rdev, 0); /* default value only */ } - vap->va_type = vp->v_type; - vap->va_nlink = file_node->dn_links; - vap->va_uid = file_node->dn_uid; - vap->va_gid = file_node->dn_gid; - vap->va_fsid = (int32_t)(void *)file_node->dn_dvm; - vap->va_fileid = (int32_t)(void *)file_node; - vap->va_size = file_node->dn_len; /* now a u_quad_t */ - /* this doesn't belong here */ + VATTR_RETURN(vap, va_type, vp->v_type); + VATTR_RETURN(vap, va_nlink, file_node->dn_links); + VATTR_RETURN(vap, va_uid, file_node->dn_uid); + VATTR_RETURN(vap, va_gid, file_node->dn_gid); + VATTR_RETURN(vap, va_fsid, (uintptr_t)file_node->dn_dvm); + VATTR_RETURN(vap, va_fileid, (uintptr_t)file_node); + VATTR_RETURN(vap, va_data_size, file_node->dn_len); + + /* return an override block size (advisory) */ if (vp->v_type == VBLK) - vap->va_blocksize = BLKDEV_IOSIZE; + VATTR_RETURN(vap, va_iosize, BLKDEV_IOSIZE); else if (vp->v_type == VCHR) - vap->va_blocksize = MAXPHYSIO; + VATTR_RETURN(vap, va_iosize, MAXPHYSIO); else - vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; + VATTR_RETURN(vap, va_iosize, vp->v_mount->mnt_vfsstat.f_iosize); /* if the time is bogus, set it to the boot time */ - if (file_node->dn_ctime.tv_sec == 0) - file_node->dn_ctime.tv_sec = boottime.tv_sec; + if (file_node->dn_ctime.tv_sec == 0) { + file_node->dn_ctime.tv_sec = boottime_sec(); + file_node->dn_ctime.tv_nsec = 0; + } if (file_node->dn_mtime.tv_sec == 0) - file_node->dn_mtime.tv_sec = boottime.tv_sec; + file_node->dn_mtime = file_node->dn_ctime; if (file_node->dn_atime.tv_sec == 0) - file_node->dn_atime.tv_sec = boottime.tv_sec; - vap->va_ctime = file_node->dn_ctime; - vap->va_mtime = file_node->dn_mtime; - vap->va_atime = file_node->dn_atime; - vap->va_gen = 0; - vap->va_flags = 0; - vap->va_bytes = file_node->dn_len; /* u_quad_t */ - vap->va_filerev = 0; /* XXX */ /* u_quad_t */ - vap->va_vaflags = 0; /* XXX */ + file_node->dn_atime = file_node->dn_ctime; + VATTR_RETURN(vap, va_change_time, file_node->dn_ctime); + VATTR_RETURN(vap, va_modify_time, file_node->dn_mtime); + VATTR_RETURN(vap, va_access_time, file_node->dn_atime); + VATTR_RETURN(vap, va_gen, 0); + VATTR_RETURN(vap, va_flags, 0); + VATTR_RETURN(vap, va_filerev, 0); + VATTR_RETURN(vap, va_acl, NULL); + + DEVFS_UNLOCK(); + return 0; } static int -devfs_setattr(struct vop_setattr_args *ap) - /*struct 
vop_setattr_args { - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct proc *a_p; - } */ +devfs_setattr(struct vnop_setattr_args *ap) + /*struct vnop_setattr_args { + struct vnode *a_vp; + struct vnode_attr *a_vap; + vfs_context_t a_context; + } */ { - struct vnode *vp = ap->a_vp; - struct vattr *vap = ap->a_vap; - struct ucred *cred = ap->a_cred; - struct proc *p = ap->a_p; - int error = 0; - gid_t *gp; - int i; - devnode_t * file_node; - struct timeval atimeval, mtimeval; - - if (vap->va_flags != VNOVAL) /* XXX needs to be implemented */ - return (EOPNOTSUPP); - - file_node = VTODN(vp); - - if ((vap->va_type != VNON) || - (vap->va_nlink != VNOVAL) || - (vap->va_fsid != VNOVAL) || - (vap->va_fileid != VNOVAL) || - (vap->va_blocksize != VNOVAL) || - (vap->va_rdev != VNOVAL) || - (vap->va_bytes != VNOVAL) || - (vap->va_gen != VNOVAL )) - { - return EINVAL; - } - - /* - * Go through the fields and update iff not VNOVAL. - */ - if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { - if (cred->cr_uid != file_node->dn_uid && - (error = suser(cred, &p->p_acflag)) && - ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || - (error = VOP_ACCESS(vp, VWRITE, cred, p)))) - return (error); - if (vap->va_atime.tv_sec != VNOVAL) - file_node->dn_flags |= DN_ACCESS; - if (vap->va_mtime.tv_sec != VNOVAL) - file_node->dn_flags |= DN_CHANGE | DN_UPDATE; - atimeval.tv_sec = vap->va_atime.tv_sec; - atimeval.tv_usec = vap->va_atime.tv_nsec / 1000; - mtimeval.tv_sec = vap->va_mtime.tv_sec; - mtimeval.tv_usec = vap->va_mtime.tv_nsec / 1000; - if (error = VOP_UPDATE(vp, &atimeval, &mtimeval, 1)) - return (error); - } - - /* - * Change the permissions.. must be root or owner to do this. - */ - if (vap->va_mode != (u_short)VNOVAL) { - if ((cred->cr_uid != file_node->dn_uid) - && (error = suser(cred, &p->p_acflag))) - return (error); - file_node->dn_mode &= ~07777; - file_node->dn_mode |= vap->va_mode & 07777; - } - - /* - * Change the owner.. must be root to do this. - */ - if (vap->va_uid != (uid_t)VNOVAL) { - if (error = suser(cred, &p->p_acflag)) - return (error); - file_node->dn_uid = vap->va_uid; - } - - /* - * Change the group.. must be root or owner to do this. - * If we are the owner, we must be in the target group too. - * don't use suser() unless you have to as it reports - * whether you needed suser powers or not. - */ - if (vap->va_gid != (gid_t)VNOVAL) { - if (cred->cr_uid == file_node->dn_uid){ - gp = cred->cr_groups; - for (i = 0; i < cred->cr_ngroups; i++, gp++) { - if (vap->va_gid == *gp) - goto cando; - } - } - /* - * we can't do it with normal privs, - * do we have an ace up our sleeve? - */ - if (error = suser(cred, &p->p_acflag)) - return (error); -cando: - file_node->dn_gid = vap->va_gid; - } -#if 0 - /* - * Copied from somewhere else - * but only kept as a marker and reminder of the fact that - * flags should be handled some day - */ - if (vap->va_flags != VNOVAL) { - if (error = suser(cred, &p->p_acflag)) - return error; - if (cred->cr_uid == 0) - ; - else { + struct vnode *vp = ap->a_vp; + struct vnode_attr *vap = ap->a_vap; + kauth_cred_t cred = vfs_context_ucred(ap->a_context); + struct proc *p = vfs_context_proc(ap->a_context); + int error = 0; + devnode_t * file_node; + struct timeval atimeval, mtimeval; + + file_node = VTODN(vp); + + DEVFS_LOCK(); + /* + * Go through the fields and update if set. 
+ */ + if (VATTR_IS_ACTIVE(vap, va_access_time) || VATTR_IS_ACTIVE(vap, va_modify_time)) { + + + if (VATTR_IS_ACTIVE(vap, va_access_time)) + file_node->dn_access = 1; + if (VATTR_IS_ACTIVE(vap, va_modify_time)) { + file_node->dn_change = 1; + file_node->dn_update = 1; } - } -#endif + atimeval.tv_sec = vap->va_access_time.tv_sec; + atimeval.tv_usec = vap->va_access_time.tv_nsec / 1000; + mtimeval.tv_sec = vap->va_modify_time.tv_sec; + mtimeval.tv_usec = vap->va_modify_time.tv_nsec / 1000; + + if ( (error = devfs_update(vp, &atimeval, &mtimeval)) ) + goto exit; + } + VATTR_SET_SUPPORTED(vap, va_access_time); + VATTR_SET_SUPPORTED(vap, va_change_time); + + /* + * Change the permissions. + */ + if (VATTR_IS_ACTIVE(vap, va_mode)) { + file_node->dn_mode &= ~07777; + file_node->dn_mode |= vap->va_mode & 07777; + } + VATTR_SET_SUPPORTED(vap, va_mode); + + /* + * Change the owner. + */ + if (VATTR_IS_ACTIVE(vap, va_uid)) + file_node->dn_uid = vap->va_uid; + VATTR_SET_SUPPORTED(vap, va_uid); + + /* + * Change the group. + */ + if (VATTR_IS_ACTIVE(vap, va_gid)) + file_node->dn_gid = vap->va_gid; + VATTR_SET_SUPPORTED(vap, va_gid); + exit: + DEVFS_UNLOCK(); + return error; } static int -devfs_read(struct vop_read_args *ap) - /*struct vop_read_args { +devfs_read(struct vnop_read_args *ap) + /* struct vnop_read_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */ { devnode_t * dn_p = VTODN(ap->a_vp); switch (ap->a_vp->v_type) { case VDIR: { - dn_p->dn_flags |= DN_ACCESS; - return VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, - NULL, NULL, NULL); + dn_p->dn_access = 1; + + return VNOP_READDIR(ap->a_vp, ap->a_uio, 0, NULL, NULL, ap->a_context); } default: { printf("devfs_read(): bad file type %d", ap->a_vp->v_type); @@ -610,79 +492,90 @@ devfs_read(struct vop_read_args *ap) } static int -devfs_close(ap) - struct vop_close_args /* { +devfs_close(struct vnop_close_args *ap) + /* struct vnop_close_args { struct vnode *a_vp; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; + vfs_context_t a_context; + } */ { struct vnode * vp = ap->a_vp; register devnode_t * dnp = VTODN(vp); + struct timeval now; - simple_lock(&vp->v_interlock); - if (vp->v_usecount > 1) - dn_times(dnp, time, time); - simple_unlock(&vp->v_interlock); + if (vnode_isinuse(vp, 1)) { + DEVFS_LOCK(); + microtime(&now); + dn_times(dnp, &now, &now, &now); + DEVFS_UNLOCK(); + } return (0); } static int -devfsspec_close(ap) - struct vop_close_args /* { +devfsspec_close(struct vnop_close_args *ap) + /* struct vnop_close_args { struct vnode *a_vp; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; + vfs_context_t a_context; + } */ { struct vnode * vp = ap->a_vp; register devnode_t * dnp = VTODN(vp); + struct timeval now; - simple_lock(&vp->v_interlock); - if (vp->v_usecount > 1) - dn_times(dnp, time, time); - simple_unlock(&vp->v_interlock); - return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap)); + if (vnode_isinuse(vp, 1)) { + DEVFS_LOCK(); + microtime(&now); + dn_times(dnp, &now, &now, &now); + DEVFS_UNLOCK(); + } + return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_close), ap)); } static int -devfsspec_read(struct vop_read_args *ap) - /*struct vop_read_args { +devfsspec_read(struct vnop_read_args *ap) + /* struct vnop_read_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + kauth_cred_t a_cred; } */ { - VTODN(ap->a_vp)->dn_flags |= DN_ACCESS; - return (VOCALL (spec_vnodeop_p, VOFFSET(vop_read), ap)); + register devnode_t 
* dnp = VTODN(ap->a_vp); + + dnp->dn_access = 1; + + return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_read), ap)); } static int -devfsspec_write(struct vop_write_args *ap) - /*struct vop_write_args { +devfsspec_write(struct vnop_write_args *ap) + /* struct vnop_write_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */ { - VTODN(ap->a_vp)->dn_flags |= DN_CHANGE | DN_UPDATE; - return (VOCALL (spec_vnodeop_p, VOFFSET(vop_write), ap)); + register devnode_t * dnp = VTODN(ap->a_vp); + + dnp->dn_change = 1; + dnp->dn_update = 1; + + return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_write), ap)); } /* * Write data to a file or directory. */ static int -devfs_write(struct vop_write_args *ap) - /*struct vop_write_args { +devfs_write(struct vnop_write_args *ap) + /* struct vnop_write_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + kauth_cred_t a_cred; } */ { switch (ap->a_vp->v_type) { @@ -696,8 +589,8 @@ devfs_write(struct vop_write_args *ap) } static int -devfs_remove(struct vop_remove_args *ap) - /*struct vop_remove_args { +devfs_remove(struct vnop_remove_args *ap) + /* struct vnop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; @@ -706,34 +599,29 @@ devfs_remove(struct vop_remove_args *ap) struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; + vfs_context_t ctx = cnp->cn_context; devnode_t * tp; devnode_t * tdp; devdirent_t * tnp; int doingdirectory = 0; int error = 0; - uid_t ouruid = cnp->cn_cred->cr_uid; - struct proc *p = cnp->cn_proc; + uid_t ouruid = kauth_cred_getuid(vfs_context_ucred(ctx)); /* - * Lock our directories and get our name pointers - * assume that the names are null terminated as they + * assume that the name is null terminated as they * are the end of the path. Get pointers to all our * devfs structures. */ tp = VTODN(vp); tdp = VTODN(dvp); - /* - * Assuming we are atomic, dev_lookup left this for us - */ - tnp = tp->dn_last_lookup; - /* - * Check we are doing legal things WRT the new flags - */ - if ((tp->dn_flags & (IMMUTABLE | APPEND)) - || (tdp->dn_flags & APPEND) /*XXX eh?*/ ) { - error = EPERM; - goto abort; + DEVFS_LOCK(); + + tnp = dev_findname(tdp, cnp->cn_nameptr); + + if (tnp == NULL) { + error = ENOENT; + goto abort; } /* @@ -754,21 +642,9 @@ devfs_remove(struct vop_remove_args *ap) /*********************************** * Start actually doing things.... * ***********************************/ - tdp->dn_flags |= DN_CHANGE | DN_UPDATE; + tdp->dn_change = 1; + tdp->dn_update = 1; - /* - * own the parent directory, or the destination of the rename, - * otherwise the destination may not be changed (except by - * root). This implements append-only directories. - * XXX shoudn't this be in generic code? - */ - if ((tdp->dn_mode & S_ISTXT) - && ouruid != 0 - && ouruid != tdp->dn_uid - && ouruid != tp->dn_uid ) { - error = EPERM; - goto abort; - } /* * Target must be empty if a directory and have no links * to it. 
Also, ensure source and target are compatible @@ -778,37 +654,32 @@ devfs_remove(struct vop_remove_args *ap) error = ENOTEMPTY; goto abort; } - DEVFS_LOCK(p); dev_free_name(tnp); - DEVFS_UNLOCK(p); - abort: - if (dvp == vp) - vrele(vp); - else - vput(vp); - vput(dvp); +abort: + DEVFS_UNLOCK(); + return (error); } /* */ static int -devfs_link(struct vop_link_args *ap) - /*struct vop_link_args { +devfs_link(struct vnop_link_args *ap) + /*struct vnop_link_args { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; + vfs_context_t a_context; } */ { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; - struct proc *p = cnp->cn_proc; devnode_t * fp; devnode_t * tdp; devdirent_t * tnp; int error = 0; - struct timeval tv; + struct timeval now; /* * First catch an arbitrary restriction for this FS @@ -828,71 +699,26 @@ devfs_link(struct vop_link_args *ap) fp = VTODN(vp); if (tdvp->v_mount != vp->v_mount) { - error = EXDEV; - VOP_ABORTOP(tdvp, cnp); - goto out2; - } - if (tdvp != vp && (error = vn_lock(vp, LK_EXCLUSIVE, p))) { - VOP_ABORTOP(tdvp, cnp); - goto out2; + return (EXDEV); } + DEVFS_LOCK(); - /* - * Check we are doing legal things WRT the new flags - */ - if (fp->dn_flags & (IMMUTABLE | APPEND)) { - VOP_ABORTOP(tdvp, cnp); - error = EPERM; - goto out1; - } /*********************************** * Start actually doing things.... * ***********************************/ - fp->dn_flags |= DN_CHANGE; - tv = time; - error = VOP_UPDATE(vp, &tv, &tv, 1); + fp->dn_change = 1; + + microtime(&now); + error = devfs_update(vp, &now, &now); + if (!error) { - DEVFS_LOCK(p); error = dev_add_name(cnp->cn_nameptr, tdp, NULL, fp, &tnp); - DEVFS_UNLOCK(p); } out1: - if (tdvp != vp) - VOP_UNLOCK(vp, 0, p); -out2: - vput(tdvp); - return (error); -} - -/* - * Check if source directory is in the path of the target directory. - * Target is supplied locked, source is unlocked. - * The target is always vput before returning. - */ -int -devfs_checkpath(source, target) - devnode_t *source, *target; -{ - int error = 0; - devnode_t * ntmp; - devnode_t * tmp; - struct vnode *vp; - - vp = target->dn_vn; - tmp = target; - - do { - if (tmp == source) { - error = EINVAL; - break; - } - ntmp = tmp; - } while ((tmp = tmp->dn_typeinfo.Dir.parent) != ntmp); + DEVFS_UNLOCK(); - if (vp != NULL) - vput(vp); - return (error); + return (error); } /* @@ -923,14 +749,15 @@ devfs_checkpath(source, target) * directory. */ static int -devfs_rename(struct vop_rename_args *ap) - /*struct vop_rename_args { +devfs_rename(struct vnop_rename_args *ap) + /*struct vnop_rename_args { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; + vfs_context_t a_context; } */ { struct vnode *tvp = ap->a_tvp; @@ -939,23 +766,22 @@ devfs_rename(struct vop_rename_args *ap) struct vnode *fdvp = ap->a_fdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; - struct proc *p = fcnp->cn_proc; devnode_t *fp, *fdp, *tp, *tdp; devdirent_t *fnp,*tnp; int doingdirectory = 0; int error = 0; - struct timeval tv; + struct timeval now; + DEVFS_LOCK(); /* * First catch an arbitrary restriction for this FS */ - if(tcnp->cn_namelen > DEVMAXNAMESIZE) { + if (tcnp->cn_namelen > DEVMAXNAMESIZE) { error = ENAMETOOLONG; - goto abortit; + goto out; } /* - * Lock our directories and get our name pointers * assume that the names are null terminated as they * are the end of the path. 
Get pointers to all our * devfs structures. @@ -963,47 +789,26 @@ devfs_rename(struct vop_rename_args *ap) tdp = VTODN(tdvp); fdp = VTODN(fdvp); fp = VTODN(fvp); - fnp = fp->dn_last_lookup; + + fnp = dev_findname(fdp, fcnp->cn_nameptr); + + if (fnp == NULL) { + error = ENOENT; + goto out; + } tp = NULL; tnp = NULL; + if (tvp) { - tp = VTODN(tvp); - tnp = tp->dn_last_lookup; - } - - /* - * trying to move it out of devfs? - * if we move a dir across mnt points. we need to fix all - * the mountpoint pointers! XXX - * so for now keep dirs within the same mount - */ - if ((fvp->v_mount != tdvp->v_mount) || - (tvp && (fvp->v_mount != tvp->v_mount))) { - error = EXDEV; -abortit: - VOP_ABORTOP(tdvp, tcnp); - if (tdvp == tvp) /* eh? */ - vrele(tdvp); - else - vput(tdvp); - if (tvp) - vput(tvp); - VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */ - vrele(fdvp); - vrele(fvp); - return (error); - } + tnp = dev_findname(tdp, tcnp->cn_nameptr); - /* - * Check we are doing legal things WRT the new flags - */ - if ((tp && (tp->dn_flags & (IMMUTABLE | APPEND))) - || (fp->dn_flags & (IMMUTABLE | APPEND)) - || (fdp->dn_flags & APPEND)) { - error = EPERM; - goto abortit; + if (tnp == NULL) { + error = ENOENT; + goto out; + } + tp = VTODN(tvp); } - + /* * Make sure that we don't try do something stupid */ @@ -1017,7 +822,7 @@ abortit: || (tcnp->cn_flags&ISDOTDOT) || (tdp == fp )) { error = EINVAL; - goto abortit; + goto out; } doingdirectory++; } @@ -1032,7 +837,6 @@ abortit: */ if (doingdirectory && (tdp != fdp)) { devnode_t * tmp, *ntmp; - error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc); tmp = tdp; do { if(tmp == fp) { @@ -1047,11 +851,11 @@ abortit: /*********************************** * Start actually doing things.... * ***********************************/ - fp->dn_flags |= DN_CHANGE; - tv = time; - if (error = VOP_UPDATE(fvp, &tv, &tv, 1)) { - VOP_UNLOCK(fvp, 0, p); - goto bad; + fp->dn_change = 1; + microtime(&now); + + if ( (error = devfs_update(fvp, &now, &now)) ) { + goto out; } /* * Check if just deleting a link name. @@ -1059,24 +863,14 @@ abortit: if (fvp == tvp) { if (fvp->v_type == VDIR) { error = EINVAL; - goto abortit; + goto out; } - /* Release destination completely. */ - VOP_ABORTOP(tdvp, tcnp); - vput(tdvp); - vput(tvp); - - /* Delete source. */ - VOP_ABORTOP(fdvp, fcnp); /*XXX*/ - vrele(fdvp); - vrele(fvp); dev_free_name(fnp); + + DEVFS_UNLOCK(); return 0; } - - vrele(fdvp); - /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before @@ -1088,29 +882,15 @@ abortit: * We could do that as well but won't */ if (tp) { - int ouruid = tcnp->cn_cred->cr_uid; - /* - * If the parent directory is "sticky", then the user must - * own the parent directory, or the destination of the rename, - * otherwise the destination may not be changed (except by - * root). This implements append-only directories. - * XXX shoudn't this be in generic code? - */ - if ((tdp->dn_mode & S_ISTXT) - && ouruid != 0 - && ouruid != tdp->dn_uid - && ouruid != tp->dn_uid ) { - error = EPERM; - goto bad; - } + int ouruid = kauth_cred_getuid(vfs_context_ucred(tcnp->cn_context)); /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). 
*/ if (( doingdirectory) && (tp->dn_links > 2)) { - error = ENOTEMPTY; - goto bad; + error = ENOTEMPTY; + goto bad; } dev_free_name(tnp); tp = NULL; @@ -1118,140 +898,112 @@ abortit: dev_add_name(tcnp->cn_nameptr,tdp,NULL,fp,&tnp); fnp->de_dnp = NULL; fp->dn_links--; /* one less link to it.. */ - dev_free_name(fnp); - fp->dn_links--; /* we added one earlier*/ - if (tdp) - vput(tdvp); - if (tp) - vput(fvp); - vrele(fvp); - return (error); + dev_free_name(fnp); bad: - if (tp) - vput(tvp); - vput(tdvp); + fp->dn_links--; /* we added one earlier*/ out: - if (vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p) == 0) { - fp->dn_links--; /* we added one earlier*/ - vput(fvp); - } else - vrele(fvp); + DEVFS_UNLOCK(); return (error); } static int -devfs_symlink(struct vop_symlink_args *ap) - /*struct vop_symlink_args { +devfs_symlink(struct vnop_symlink_args *ap) + /*struct vnop_symlink_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; + struct vnode_attr *a_vap; char *a_target; + vfs_context_t a_context; } */ { struct componentname * cnp = ap->a_cnp; - struct vnode *vp = NULL; + vfs_context_t ctx = cnp->cn_context; + struct proc *p = vfs_context_proc(ctx); int error = 0; devnode_t * dir_p; devnode_type_t typeinfo; devdirent_t * nm_p; devnode_t * dev_p; - struct vattr * vap = ap->a_vap; + struct vnode_attr * vap = ap->a_vap; struct vnode * * vpp = ap->a_vpp; - struct proc *p = cnp->cn_proc; - struct timeval tv; dir_p = VTODN(ap->a_dvp); typeinfo.Slnk.name = ap->a_target; typeinfo.Slnk.namelen = strlen(ap->a_target); - DEVFS_LOCK(p); + + DEVFS_LOCK(); error = dev_add_entry(cnp->cn_nameptr, dir_p, DEV_SLNK, &typeinfo, NULL, NULL, &nm_p); - DEVFS_UNLOCK(p); if (error) { goto failure; } - dev_p = nm_p->de_dnp; dev_p->dn_uid = dir_p->dn_uid; dev_p->dn_gid = dir_p->dn_gid; dev_p->dn_mode = vap->va_mode; dn_copy_times(dev_p, dir_p); + error = devfs_dntovn(dev_p, vpp, p); - if (error) - goto failure; - vp = *vpp; - vput(vp); failure: - if ((cnp->cn_flags & SAVESTART) == 0) { - char *tmp = cnp->cn_pnbuf; - cnp->cn_pnbuf = NULL; - cnp->cn_flags &= ~HASBUF; - FREE_ZONE(tmp, cnp->cn_pnlen, M_NAMEI); - } - vput(ap->a_dvp); + DEVFS_UNLOCK(); + return error; } /* * Mknod vnode call */ -/* ARGSUSED */ -int -devfs_mknod(ap) - struct vop_mknod_args /* { +static int +devfs_mknod(struct vnop_mknod_args *ap) + /* struct vnop_mknod_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; - } */ *ap; + struct vnode_attr *a_vap; + vfs_context_t a_context; + } */ { struct componentname * cnp = ap->a_cnp; + vfs_context_t ctx = cnp->cn_context; + struct proc *p = vfs_context_proc(ctx); devnode_t * dev_p; devdirent_t * devent; devnode_t * dir_p; /* devnode for parent directory */ struct vnode * dvp = ap->a_dvp; int error = 0; devnode_type_t typeinfo; - struct vattr * vap = ap->a_vap; + struct vnode_attr * vap = ap->a_vap; struct vnode ** vpp = ap->a_vpp; - struct proc * p = cnp->cn_proc; *vpp = NULL; - if (!vap->va_type == VBLK && !vap->va_type == VCHR) { - error = EINVAL; /* only support mknod of special files */ - goto failure; + if (!(vap->va_type == VBLK) && !(vap->va_type == VCHR)) { + return (EINVAL); /* only support mknod of special files */ } dir_p = VTODN(dvp); typeinfo.dev = vap->va_rdev; - DEVFS_LOCK(p); + + DEVFS_LOCK(); error = dev_add_entry(cnp->cn_nameptr, dir_p, (vap->va_type == VBLK) ? 
DEV_BDEV : DEV_CDEV, &typeinfo, NULL, NULL, &devent); - DEVFS_UNLOCK(p); if (error) { - goto failure; + goto failure; } dev_p = devent->de_dnp; error = devfs_dntovn(dev_p, vpp, p); if (error) - goto failure; - dev_p->dn_uid = cnp->cn_cred->cr_uid; - dev_p->dn_gid = dir_p->dn_gid; + goto failure; + dev_p->dn_uid = vap->va_uid; + dev_p->dn_gid = vap->va_gid; dev_p->dn_mode = vap->va_mode; + VATTR_SET_SUPPORTED(vap, va_uid); + VATTR_SET_SUPPORTED(vap, va_gid); + VATTR_SET_SUPPORTED(vap, va_mode); failure: - if (*vpp) { - vput(*vpp); - *vpp = 0; - } - if ((cnp->cn_flags & SAVESTART) == 0) { - char *tmp = cnp->cn_pnbuf; - cnp->cn_pnbuf = NULL; - cnp->cn_flags &= ~HASBUF; - FREE_ZONE(tmp, cnp->cn_pnlen, M_NAMEI); - } - vput(dvp); + DEVFS_UNLOCK(); + return (error); } @@ -1259,14 +1011,14 @@ failure: * Vnode op for readdir */ static int -devfs_readdir(struct vop_readdir_args *ap) - /*struct vop_readdir_args { +devfs_readdir(struct vnop_readdir_args *ap) + /*struct vnop_readdir_args { struct vnode *a_vp; struct uio *a_uio; - struct ucred *a_cred; - int *eofflag; - int *ncookies; - u_int **cookies; + int a_flags; + int *a_eofflag; + int *a_numdirent; + vfs_context_t a_context; } */ { struct vnode *vp = ap->a_vp; @@ -1279,21 +1031,25 @@ devfs_readdir(struct vop_readdir_args *ap) int reclen; int nodenumber; int startpos,pos; - struct proc * p = uio->uio_procp; + + if (ap->a_flags & (VNODE_READDIR_EXTENDED | VNODE_READDIR_REQSEEKOFF)) + return (EINVAL); /* set up refs to dir */ dir_node = VTODN(vp); - if(dir_node->dn_type != DEV_DIR) + if (dir_node->dn_type != DEV_DIR) return(ENOTDIR); - pos = 0; startpos = uio->uio_offset; - DEVFS_LOCK(p); + + DEVFS_LOCK(); + name_node = dir_node->dn_typeinfo.Dir.dirlist; nodenumber = 0; - dir_node->dn_flags |= DN_ACCESS; - while ((name_node || (nodenumber < 2)) && (uio->uio_resid > 0)) + dir_node->dn_access = 1; + + while ((name_node || (nodenumber < 2)) && (uio_resid(uio) > 0)) { switch(nodenumber) { @@ -1341,7 +1097,7 @@ devfs_readdir(struct vop_readdir_args *ap) if(pos >= startpos) /* made it to the offset yet? */ { - if (uio->uio_resid < reclen) /* will it fit? */ + if (uio_resid(uio) < reclen) /* will it fit? 
*/ break; strcpy( dirent.d_name,name); if ((error = uiomove ((caddr_t)&dirent, @@ -1353,7 +1109,7 @@ devfs_readdir(struct vop_readdir_args *ap) name_node = name_node->de_next; nodenumber++; } - DEVFS_UNLOCK(p); + DEVFS_UNLOCK(); uio->uio_offset = pos; return (error); @@ -1363,11 +1119,11 @@ devfs_readdir(struct vop_readdir_args *ap) /* */ static int -devfs_readlink(struct vop_readlink_args *ap) - /*struct vop_readlink_args { +devfs_readlink(struct vnop_readlink_args *ap) + /*struct vnop_readlink_args { struct vnode *a_vp; struct uio *a_uio; - struct ucred *a_cred; + vfs_context_t a_context; } */ { struct vnode *vp = ap->a_vp; @@ -1377,25 +1133,28 @@ devfs_readlink(struct vop_readlink_args *ap) /* set up refs to dir */ lnk_node = VTODN(vp); - if(lnk_node->dn_type != DEV_SLNK) - return(EINVAL); - if ((error = VOP_ACCESS(vp, VREAD, ap->a_cred, NULL)) != 0) { /* XXX */ - return error; + + if (lnk_node->dn_type != DEV_SLNK) { + error = EINVAL; + goto out; } error = uiomove(lnk_node->dn_typeinfo.Slnk.name, lnk_node->dn_typeinfo.Slnk.namelen, uio); +out: return error; } static int -devfs_reclaim(struct vop_reclaim_args *ap) - /*struct vop_reclaim_args { +devfs_reclaim(struct vnop_reclaim_args *ap) + /*struct vnop_reclaim_args { struct vnode *a_vp; } */ { struct vnode * vp = ap->a_vp; devnode_t * dnp = VTODN(vp); + DEVFS_LOCK(); + if (dnp) { /* * do the same as devfs_inactive in case it is not called @@ -1403,78 +1162,99 @@ devfs_reclaim(struct vop_reclaim_args *ap) */ dnp->dn_vn = NULL; vp->v_data = NULL; + if (dnp->dn_delete) { devnode_free(dnp); } } + DEVFS_UNLOCK(); + return(0); } + /* - * Print out the contents of a /devfs vnode. + * Get configurable pathname variables. */ static int -devfs_print(struct vop_print_args *ap) - /*struct vop_print_args { +devs_vnop_pathconf( + struct vnop_pathconf_args /* { struct vnode *a_vp; - } */ + int a_name; + int *a_retval; + vfs_context_t a_context; + } */ *ap) { + switch (ap->a_name) { + case _PC_LINK_MAX: + /* arbitrary limit matching HFS; devfs has no hard limit */ + *ap->a_retval = 32767; + break; + case _PC_NAME_MAX: + *ap->a_retval = DEVMAXNAMESIZE - 1; /* includes NUL */ + break; + case _PC_PATH_MAX: + *ap->a_retval = DEVMAXPATHSIZE - 1; /* XXX nonconformant */ + break; + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + break; + case _PC_NO_TRUNC: + *ap->a_retval = 0; + break; + case _PC_CASE_SENSITIVE: + *ap->a_retval = 1; + break; + case _PC_CASE_PRESERVING: + *ap->a_retval = 1; + break; + default: + return (EINVAL); + } return (0); } + + /**************************************************************************\ * pseudo ops * \**************************************************************************/ /* * - * struct vop_inactive_args { + * struct vnop_inactive_args { * struct vnode *a_vp; - * struct proc *a_p; + * vfs_context_t a_context; * } */ static int -devfs_inactive(struct vop_inactive_args *ap) +devfs_inactive(__unused struct vnop_inactive_args *ap) { - struct vnode * vp = ap->a_vp; - devnode_t * dnp = VTODN(vp); - - if (dnp) { - dnp->dn_vn = NULL; - vp->v_data = NULL; - if (dnp->dn_delete) { - devnode_free(dnp); - } - } - VOP_UNLOCK(vp, 0, ap->a_p); return (0); } -int -devfs_update(ap) - struct vop_update_args /* { - struct vnode *a_vp; - struct timeval *a_access; - struct timeval *a_modify; - int a_waitfor; - } */ *ap; +/* + * called with DEVFS_LOCK held + */ +static int +devfs_update(struct vnode *vp, struct timeval *access, struct timeval *modify) { - register struct fs *fs; - int error; devnode_t * ip; + struct timeval now; + + 
ip = VTODN(vp); + if (vp->v_mount->mnt_flag & MNT_RDONLY) { + ip->dn_access = 0; + ip->dn_change = 0; + ip->dn_update = 0; - ip = VTODN(ap->a_vp); - if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) { - ip->dn_flags &= - ~(DN_ACCESS | DN_CHANGE | DN_MODIFIED | DN_UPDATE); return (0); } - if ((ip->dn_flags & - (DN_ACCESS | DN_CHANGE | DN_MODIFIED | DN_UPDATE)) == 0) - return (0); - dn_times(ip, time, time); + microtime(&now); + dn_times(ip, access, modify, &now); + return (0); } @@ -1483,57 +1263,42 @@ devfs_update(ap) /* The following ops are used by directories and symlinks */ int (**devfs_vnodeop_p)(void *); static struct vnodeopv_entry_desc devfs_vnodeop_entries[] = { - { &vop_default_desc, (VOPFUNC)vn_default_error }, - { &vop_lookup_desc, (VOPFUNC)devfs_lookup }, /* lookup */ - { &vop_create_desc, (VOPFUNC)err_create }, /* create */ - { &vop_whiteout_desc, (VOPFUNC)err_whiteout }, /* whiteout */ - { &vop_mknod_desc, (VOPFUNC)devfs_mknod }, /* mknod */ - { &vop_open_desc, (VOPFUNC)nop_open }, /* open */ - { &vop_close_desc, (VOPFUNC)devfs_close }, /* close */ - { &vop_access_desc, (VOPFUNC)devfs_access }, /* access */ - { &vop_getattr_desc, (VOPFUNC)devfs_getattr }, /* getattr */ - { &vop_setattr_desc, (VOPFUNC)devfs_setattr }, /* setattr */ - { &vop_read_desc, (VOPFUNC)devfs_read }, /* read */ - { &vop_write_desc, (VOPFUNC)devfs_write }, /* write */ - { &vop_lease_desc, (VOPFUNC)nop_lease }, /* lease */ - { &vop_ioctl_desc, (VOPFUNC)err_ioctl }, /* ioctl */ - { &vop_select_desc, (VOPFUNC)err_select }, /* select */ - { &vop_revoke_desc, (VOPFUNC)err_revoke }, /* revoke */ - { &vop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ - { &vop_fsync_desc, (VOPFUNC)nop_fsync }, /* fsync */ - { &vop_seek_desc, (VOPFUNC)err_seek }, /* seek */ - { &vop_remove_desc, (VOPFUNC)devfs_remove }, /* remove */ - { &vop_link_desc, (VOPFUNC)devfs_link }, /* link */ - { &vop_rename_desc, (VOPFUNC)devfs_rename }, /* rename */ - { &vop_mkdir_desc, (VOPFUNC)err_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (VOPFUNC)err_rmdir }, /* rmdir */ - { &vop_symlink_desc, (VOPFUNC)devfs_symlink }, /* symlink */ - { &vop_readdir_desc, (VOPFUNC)devfs_readdir }, /* readdir */ - { &vop_readlink_desc, (VOPFUNC)devfs_readlink }, /* readlink */ - { &vop_abortop_desc, (VOPFUNC)nop_abortop }, /* abortop */ - { &vop_inactive_desc, (VOPFUNC)devfs_inactive }, /* inactive */ - { &vop_reclaim_desc, (VOPFUNC)devfs_reclaim }, /* reclaim */ - { &vop_lock_desc, (VOPFUNC)nop_lock }, /* lock */ - { &vop_unlock_desc, (VOPFUNC)nop_unlock }, /* unlock */ - { &vop_bmap_desc, (VOPFUNC)err_bmap }, /* bmap */ - { &vop_strategy_desc, (VOPFUNC)err_strategy }, /* strategy */ - { &vop_print_desc, (VOPFUNC)err_print }, /* print */ - { &vop_islocked_desc, (VOPFUNC)nop_islocked }, /* islocked */ - { &vop_pathconf_desc, (VOPFUNC)err_pathconf }, /* pathconf */ - { &vop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ - { &vop_blkatoff_desc, (VOPFUNC)err_blkatoff }, /* blkatoff */ - { &vop_valloc_desc, (VOPFUNC)err_valloc }, /* valloc */ - { &vop_reallocblks_desc, (VOPFUNC)err_reallocblks }, /* reallocblks */ - { &vop_vfree_desc, (VOPFUNC)err_vfree }, /* vfree */ - { &vop_truncate_desc, (VOPFUNC)err_truncate }, /* truncate */ - { &vop_update_desc, (VOPFUNC)devfs_update }, /* update */ - { &vop_bwrite_desc, (VOPFUNC)err_bwrite }, - { &vop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ - { &vop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ - { &vop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ - { &vop_blktooff_desc, (VOPFUNC)err_blktooff }, /* 
blktooff */ - { &vop_offtoblk_desc, (VOPFUNC)err_offtoblk }, /* offtoblk */ - { &vop_cmap_desc, (VOPFUNC)err_cmap }, /* cmap */ + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)devfs_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)err_create }, /* create */ + { &vnop_whiteout_desc, (VOPFUNC)err_whiteout }, /* whiteout */ + { &vnop_mknod_desc, (VOPFUNC)devfs_mknod }, /* mknod */ + { &vnop_open_desc, (VOPFUNC)nop_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)devfs_close }, /* close */ + { &vnop_getattr_desc, (VOPFUNC)devfs_getattr }, /* getattr */ + { &vnop_setattr_desc, (VOPFUNC)devfs_setattr }, /* setattr */ + { &vnop_read_desc, (VOPFUNC)devfs_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)devfs_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)err_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)err_select }, /* select */ + { &vnop_revoke_desc, (VOPFUNC)err_revoke }, /* revoke */ + { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)nop_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)devfs_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)devfs_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)devfs_rename }, /* rename */ + { &vnop_mkdir_desc, (VOPFUNC)err_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)err_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)devfs_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)devfs_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)devfs_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)devfs_inactive }, /* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)devfs_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)err_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)devs_vnop_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ + { &vnop_bwrite_desc, (VOPFUNC)err_bwrite }, + { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ + { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ + { &vnop_blktooff_desc, (VOPFUNC)err_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)err_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (VOPFUNC)err_blockmap }, /* blockmap */ { (struct vnodeop_desc*)NULL, (int(*)())NULL } }; struct vnodeopv_desc devfs_vnodeop_opv_desc = @@ -1542,57 +1307,42 @@ struct vnodeopv_desc devfs_vnodeop_opv_desc = /* The following ops are used by the device nodes */ int (**devfs_spec_vnodeop_p)(void *); static struct vnodeopv_entry_desc devfs_spec_vnodeop_entries[] = { - { &vop_default_desc, (VOPFUNC)vn_default_error }, - { &vop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */ - { &vop_create_desc, (VOPFUNC)spec_create }, /* create */ - { &vop_mknod_desc, (VOPFUNC)spec_mknod }, /* mknod */ - { &vop_open_desc, (VOPFUNC)spec_open }, /* open */ - { &vop_close_desc, (VOPFUNC)devfsspec_close }, /* close */ - { &vop_access_desc, (VOPFUNC)devfs_access }, /* access */ - { &vop_getattr_desc, (VOPFUNC)devfs_getattr }, /* getattr */ - { &vop_setattr_desc, (VOPFUNC)devfs_setattr }, /* setattr */ - { &vop_read_desc, (VOPFUNC)devfsspec_read }, /* read */ - { &vop_write_desc, (VOPFUNC)devfsspec_write }, /* write */ - { &vop_lease_desc, (VOPFUNC)spec_lease_check }, /* lease */ - { &vop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */ - { &vop_select_desc, (VOPFUNC)spec_select }, /* select */ - { &vop_revoke_desc, (VOPFUNC)spec_revoke }, /* revoke */ - { &vop_mmap_desc, (VOPFUNC)spec_mmap }, /* mmap 
*/ - { &vop_fsync_desc, (VOPFUNC)spec_fsync }, /* fsync */ - { &vop_seek_desc, (VOPFUNC)spec_seek }, /* seek */ - { &vop_remove_desc, (VOPFUNC)devfs_remove }, /* remove */ - { &vop_link_desc, (VOPFUNC)devfs_link }, /* link */ - { &vop_rename_desc, (VOPFUNC)spec_rename }, /* rename */ - { &vop_mkdir_desc, (VOPFUNC)spec_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (VOPFUNC)spec_rmdir }, /* rmdir */ - { &vop_symlink_desc, (VOPFUNC)spec_symlink }, /* symlink */ - { &vop_readdir_desc, (VOPFUNC)spec_readdir }, /* readdir */ - { &vop_readlink_desc, (VOPFUNC)spec_readlink }, /* readlink */ - { &vop_abortop_desc, (VOPFUNC)spec_abortop }, /* abortop */ - { &vop_inactive_desc, (VOPFUNC)devfs_inactive }, /* inactive */ - { &vop_reclaim_desc, (VOPFUNC)devfs_reclaim }, /* reclaim */ - { &vop_lock_desc, (VOPFUNC)nop_lock }, /* lock */ - { &vop_unlock_desc, (VOPFUNC)nop_unlock }, /* unlock */ - { &vop_bmap_desc, (VOPFUNC)spec_bmap }, /* bmap */ - { &vop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */ - { &vop_print_desc, (VOPFUNC)devfs_print }, /* print */ - { &vop_islocked_desc, (VOPFUNC)nop_islocked }, /* islocked */ - { &vop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */ - { &vop_advlock_desc, (VOPFUNC)spec_advlock }, /* advlock */ - { &vop_blkatoff_desc, (VOPFUNC)spec_blkatoff }, /* blkatoff */ - { &vop_valloc_desc, (VOPFUNC)spec_valloc }, /* valloc */ - { &vop_reallocblks_desc, (VOPFUNC)spec_reallocblks }, /* reallocblks */ - { &vop_vfree_desc, (VOPFUNC)nop_vfree }, /* vfree */ - { &vop_truncate_desc, (VOPFUNC)spec_truncate }, /* truncate */ - { &vop_update_desc, (VOPFUNC)devfs_update }, /* update */ - { &vop_bwrite_desc, (VOPFUNC)vn_bwrite }, - { &vop_devblocksize_desc, (VOPFUNC)spec_devblocksize }, /* devblocksize */ - { &vop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ - { &vop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ - { &vop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ - { &vop_blktooff_desc, (VOPFUNC)spec_blktooff }, /* blktooff */ - { &vop_blktooff_desc, (VOPFUNC)spec_offtoblk }, /* blkofftoblk */ - { &vop_cmap_desc, (VOPFUNC)spec_cmap }, /* cmap */ + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)spec_create }, /* create */ + { &vnop_mknod_desc, (VOPFUNC)spec_mknod }, /* mknod */ + { &vnop_open_desc, (VOPFUNC)spec_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)devfsspec_close }, /* close */ + { &vnop_getattr_desc, (VOPFUNC)devfs_getattr }, /* getattr */ + { &vnop_setattr_desc, (VOPFUNC)devfs_setattr }, /* setattr */ + { &vnop_read_desc, (VOPFUNC)devfsspec_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)devfsspec_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)spec_select }, /* select */ + { &vnop_revoke_desc, (VOPFUNC)spec_revoke }, /* revoke */ + { &vnop_mmap_desc, (VOPFUNC)spec_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)spec_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)devfs_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)devfs_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)spec_rename }, /* rename */ + { &vnop_mkdir_desc, (VOPFUNC)spec_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)spec_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)spec_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)spec_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)spec_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)devfs_inactive }, /* inactive */ + 
{ &vnop_reclaim_desc, (VOPFUNC)devfs_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)spec_advlock }, /* advlock */ + { &vnop_bwrite_desc, (VOPFUNC)vn_bwrite }, + { &vnop_devblocksize_desc, (VOPFUNC)spec_devblocksize }, /* devblocksize */ + { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ + { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ + { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff }, /* blktooff */ + { &vnop_blktooff_desc, (VOPFUNC)spec_offtoblk }, /* blkofftoblk */ + { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap }, /* blockmap */ { (struct vnodeop_desc*)NULL, (int(*)())NULL } }; struct vnodeopv_desc devfs_spec_vnodeop_opv_desc = diff --git a/bsd/miscfs/devfs/devfsdefs.h b/bsd/miscfs/devfs/devfsdefs.h index 6b00a76c5..b104d6927 100644 --- a/bsd/miscfs/devfs/devfsdefs.h +++ b/bsd/miscfs/devfs/devfsdefs.h @@ -101,12 +101,6 @@ union devnode_type { }Slnk; }; -#define DN_ACCESS 0x0001 /* Access time update request. */ -#define DN_CHANGE 0x0002 /* Inode change time update request. */ -#define DN_UPDATE 0x0004 /* Modification time update request. */ -#define DN_MODIFIED 0x0008 /* Inode has been modified. */ -#define DN_RENAME 0x0010 /* Inode is being renamed. */ - struct devnode { devfstype_t dn_type; @@ -123,13 +117,22 @@ struct devnode struct vnode * dn_vn; /* address of last vnode that represented us */ int dn_len; /* of any associated info (e.g. dir data) */ devdirent_t * dn_linklist;/* circular list of hardlinks to this node */ - devdirent_t * dn_last_lookup; /* name I was last looked up from */ devnode_t * dn_nextsibling; /* the list of equivalent nodes */ devnode_t * * dn_prevsiblingp;/* backpointer for the above */ devnode_type_t dn_typeinfo; int dn_delete; /* mark for deletion */ + int dn_change; + int dn_update; + int dn_access; + int dn_lflags; }; +#define DN_BUSY 0x01 +#define DN_DELETE 0x02 +#define DN_CREATE 0x04 +#define DN_CREATEWAIT 0x08 + + struct devdirent { /*-----------------------directory entry fields-------------*/ @@ -143,8 +146,8 @@ struct devdirent }; extern devdirent_t * dev_root; -extern struct lock__bsd__ devfs_lock; extern struct devfs_stats devfs_stats; +extern lck_mtx_t devfs_mutex; /* * Rules for front nodes: @@ -179,90 +182,80 @@ struct devfsmount #define VTODN(vp) ((devnode_t *)(vp)->v_data) -static __inline__ int -DEVFS_LOCK(struct proc * p) -{ - return (lockmgr(&devfs_lock, LK_EXCLUSIVE, NULL, p)); -} +#define DEVFS_LOCK() lck_mtx_lock(&devfs_mutex) + +#define DEVFS_UNLOCK() lck_mtx_unlock(&devfs_mutex) + -static __inline__ int -DEVFS_UNLOCK(struct proc * p) -{ - return (lockmgr(&devfs_lock, LK_RELEASE, NULL, p)); -} static __inline__ void DEVFS_INCR_ENTRIES() { - devfs_stats.entries++; + OSAddAtomic(1, &devfs_stats.entries); } static __inline__ void DEVFS_DECR_ENTRIES() { - devfs_stats.entries--; + OSAddAtomic(-1, &devfs_stats.entries); } static __inline__ void DEVFS_INCR_NODES() { - devfs_stats.nodes++; + OSAddAtomic(1, &devfs_stats.nodes); } static __inline__ void DEVFS_DECR_NODES() { - devfs_stats.nodes--; + OSAddAtomic(-1, &devfs_stats.nodes); } static __inline__ void DEVFS_INCR_MOUNTS() { - devfs_stats.mounts++; + OSAddAtomic(1, &devfs_stats.mounts); } static __inline__ void DEVFS_DECR_MOUNTS() { - devfs_stats.mounts--; + OSAddAtomic(-1, &devfs_stats.mounts); } static __inline__ void DEVFS_INCR_STRINGSPACE(int space) { - 
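DEVFS_LOCK/DEVFS_UNLOCK shrink from lockmgr calls that needed a proc pointer to plain lck_mtx operations, which is why every call site in this patch loses its argument. Roughly how such a mutex gets set up under the new lock KPI; the group and attribute names below are illustrative, not the ones devfs actually uses:

static lck_grp_attr_t  *my_lck_grp_attr;
static lck_grp_t       *my_lck_grp;
static lck_attr_t      *my_lck_attr;
lck_mtx_t               my_mutex;       /* analogous to devfs_mutex */

static void
my_lock_init(void)
{
        my_lck_grp_attr = lck_grp_attr_alloc_init();
        my_lck_grp = lck_grp_alloc_init("my-fs", my_lck_grp_attr);
        my_lck_attr = lck_attr_alloc_init();
        lck_mtx_init(&my_mutex, my_lck_grp, my_lck_attr);
}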
devfs_stats.stringspace += space; + OSAddAtomic(space, &devfs_stats.stringspace); } static __inline__ void DEVFS_DECR_STRINGSPACE(int space) { - devfs_stats.stringspace -= space; - if (devfs_stats.stringspace < 0) { - printf("DEVFS_DECR_STRINGSPACE: (%d - %d < 0)\n", - devfs_stats.stringspace + space, space); - devfs_stats.stringspace = 0; - } + OSAddAtomic(-space, &devfs_stats.stringspace); } static __inline__ void -dn_times(devnode_t * dnp, struct timeval t1, struct timeval t2) +dn_times(devnode_t * dnp, struct timeval *t1, struct timeval *t2, struct timeval *t3) { - if (dnp->dn_flags & (DN_ACCESS | DN_CHANGE | DN_UPDATE)) { - if (dnp->dn_flags & DN_ACCESS) { - dnp->dn_atime.tv_sec = t1.tv_sec; - dnp->dn_atime.tv_nsec = t1.tv_usec * 1000; + if (dnp->dn_access) { + dnp->dn_atime.tv_sec = t1->tv_sec; + dnp->dn_atime.tv_nsec = t1->tv_usec * 1000; + dnp->dn_access = 0; } - if (dnp->dn_flags & DN_UPDATE) { - dnp->dn_mtime.tv_sec = t2.tv_sec; - dnp->dn_mtime.tv_nsec = t2.tv_usec * 1000; + if (dnp->dn_update) { + dnp->dn_mtime.tv_sec = t2->tv_sec; + dnp->dn_mtime.tv_nsec = t2->tv_usec * 1000; + dnp->dn_update = 0; } - if (dnp->dn_flags & DN_CHANGE) { - dnp->dn_ctime.tv_sec = time.tv_sec; - dnp->dn_ctime.tv_nsec = time.tv_usec * 1000; + if (dnp->dn_change) { + dnp->dn_ctime.tv_sec = t3->tv_sec; + dnp->dn_ctime.tv_nsec = t3->tv_usec * 1000; + dnp->dn_change = 0; } - dnp->dn_flags &= ~(DN_ACCESS | DN_CHANGE | DN_UPDATE); - } - return; + + return; } static __inline__ void diff --git a/bsd/miscfs/fdesc/fdesc.h b/bsd/miscfs/fdesc/fdesc.h index 63a330513..b4141e2c1 100644 --- a/bsd/miscfs/fdesc/fdesc.h +++ b/bsd/miscfs/fdesc/fdesc.h @@ -88,18 +88,33 @@ typedef enum { struct fdescnode { LIST_ENTRY(fdescnode) fd_hash; /* Hash list */ struct vnode *fd_vnode; /* Back ptr to vnode */ - fdntype fd_type; /* Type of this node */ - unsigned fd_fd; /* Fd to be dup'ed */ - char *fd_link; /* Link to fd/n */ - int fd_ix; /* filesystem index */ + fdntype fd_type; /* Type of this node */ + unsigned fd_fd; /* Fd to be dup'ed */ + char *fd_link; /* Link to fd/n */ + int fd_ix; /* filesystem index */ }; #define VFSTOFDESC(mp) ((struct fdescmount *)((mp)->mnt_data)) #define VTOFDESC(vp) ((struct fdescnode *)(vp)->v_data) -extern int fdesc_init __P((struct vfsconf *)); -extern int fdesc_root __P((struct mount *, struct vnode **)); -extern int fdesc_allocvp __P((fdntype, int, struct mount *, struct vnode **)); +extern int fdesc_allocvp(fdntype, int, struct mount *, struct vnode **, enum vtype); +extern int fdesc_badop(void); +extern int fdesc_getattr(struct vnop_getattr_args *ap); +extern int fdesc_inactive(struct vnop_inactive_args *ap); +extern int fdesc_init(struct vfsconf *); +extern int fdesc_ioctl(struct vnop_ioctl_args *ap); +extern int fdesc_lookup(struct vnop_lookup_args *ap); +extern int fdesc_open(struct vnop_open_args *ap); +extern int fdesc_pathconf(struct vnop_pathconf_args *ap); +extern int fdesc_read(struct vnop_read_args *ap); +extern int fdesc_readdir(struct vnop_readdir_args *ap); +extern int fdesc_readlink(struct vnop_readlink_args *ap); +extern int fdesc_reclaim(struct vnop_reclaim_args *ap); +extern int fdesc_root(struct mount *, struct vnode **, vfs_context_t); +extern int fdesc_select(struct vnop_select_args *ap); +extern int fdesc_setattr(struct vnop_setattr_args *ap); +extern int fdesc_write(struct vnop_write_args *ap); + extern int (**fdesc_vnodeop_p)(void *); extern struct vfsops fdesc_vfsops; #endif /* KERNEL */ diff --git a/bsd/miscfs/fdesc/fdesc_vfsops.c b/bsd/miscfs/fdesc/fdesc_vfsops.c 
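With the devfs mutex no longer covering them, the statistics counters move to OSAddAtomic, which adds its (possibly negative) first argument atomically; that is why the DECR variants simply pass a negative delta and the old stringspace underflow check can go away. The shape of the pattern, with a made-up counter:

static SInt32 my_count;         /* hypothetical statistic */

static void
my_incr(void)
{
        (void)OSAddAtomic(1, &my_count);
}

static void
my_decr(void)
{
        (void)OSAddAtomic(-1, &my_count);
}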
index 53c3d75a9..b0173ec44 100644
--- a/bsd/miscfs/fdesc/fdesc_vfsops.c
+++ b/bsd/miscfs/fdesc/fdesc_vfsops.c
@@ -67,11 +67,11 @@
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/types.h>
-#include <sys/proc.h>
+#include <sys/proc_internal.h>
#include <sys/resourcevar.h>
#include <sys/filedesc.h>
#include <sys/vnode.h>
-#include <sys/mount.h>
+#include <sys/mount_internal.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <miscfs/fdesc/fdesc.h>
@@ -80,15 +80,9 @@
* Mount the per-process file descriptors (/dev/fd)
*/
int
-fdesc_mount(mp, path, data, ndp, p)
- struct mount *mp;
- char *path;
- caddr_t data;
- struct nameidata *ndp;
- struct proc *p;
+fdesc_mount(struct mount *mp, vnode_t devvp, __unused user_addr_t data, vfs_context_t context)
{
int error = 0;
- size_t size;
struct fdescmount *fmp;
struct vnode *rvp;
@@ -96,67 +90,68 @@ fdesc_mount(mp, path, data, ndp, p)
* Update is a no-op
*/
if (mp->mnt_flag & MNT_UPDATE)
- return (EOPNOTSUPP);
+ return (ENOTSUP);
- error = fdesc_allocvp(Froot, FD_ROOT, mp, &rvp);
+ error = fdesc_allocvp(Froot, FD_ROOT, mp, &rvp, VDIR);
if (error)
return (error);
MALLOC(fmp, struct fdescmount *, sizeof(struct fdescmount), M_UFSMNT, M_WAITOK); /* XXX */
- rvp->v_type = VDIR;
- rvp->v_flag |= VROOT;
+
+ vnode_setnoflush(rvp);
+ vnode_ref(rvp);
+ vnode_put(rvp);
+
fmp->f_root = rvp;
/* XXX -- don't mark as local to work around fts() problems */
/*mp->mnt_flag |= MNT_LOCAL;*/
mp->mnt_data = (qaddr_t) fmp;
vfs_getnewfsid(mp);
- (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size);
- bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size);
- bzero(mp->mnt_stat.f_mntfromname, MNAMELEN);
- bcopy("fdesc", mp->mnt_stat.f_mntfromname, sizeof("fdesc"));
+ bzero(mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN);
+ bcopy("fdesc", mp->mnt_vfsstat.f_mntfromname, sizeof("fdesc"));
return (0);
}

int
-fdesc_start(mp, flags, p)
+fdesc_start(mp, flags, context)
struct mount *mp;
int flags;
- struct proc *p;
+ vfs_context_t context;
{
return (0);
}

int
-fdesc_unmount(mp, mntflags, p)
+fdesc_unmount(mp, mntflags, context)
struct mount *mp;
int mntflags;
- struct proc *p;
+ vfs_context_t context;
{
int error;
int flags = 0;
int force = 0;
- struct vnode *rootvp = VFSTOFDESC(mp)->f_root;
+ struct vnode *rvp = VFSTOFDESC(mp)->f_root;

if (mntflags & MNT_FORCE) {
flags |= FORCECLOSE;
force = 1;
}
- if ( (rootvp->v_usecount > 1) && !force )
+ if ( vnode_isinuse(rvp, 1) && !force )
return (EBUSY);
- if ( (error = vflush(mp, rootvp, flags)) && !force )
+ if ( (error = vflush(mp, rvp, flags|SKIPSYSTEM)) && !force )
return (error);
/*
- * Release reference on underlying root vnode
+ * And mark for recycle; after we drop its reference it is blown away for future re-use
*/
- vrele(rootvp);
+ vnode_recycle(rvp);
/*
- * And blow it away for future re-use
+ * Release reference on underlying root vnode
*/
- vgone(rootvp);
+ vnode_rele(rvp);
/*
* Finally, throw away the fdescmount structure
*/
@@ -167,29 +162,29 @@ fdesc_unmount(mp, mntflags, p)
}

int
-fdesc_root(mp, vpp)
+fdesc_root(mp, vpp, context)
struct mount *mp;
struct vnode **vpp;
+ vfs_context_t context;
{
- struct proc *p = current_proc(); /* XXX */
struct vnode *vp;

/*
* Return locked reference to root.
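fdesc_unmount above shows the new teardown idiom: vnode_isinuse() replaces the raw v_usecount test, and the vrele/vgone pair becomes vnode_recycle() followed by vnode_rele(), so the root is reclaimed as soon as the mount-time reference drops. Condensed into a sketch for a hypothetical filesystem:

static int
myfs_unmount_root(struct mount *mp, struct vnode *rvp, int force)
{
        int error;

        if (vnode_isinuse(rvp, 1) && !force)    /* users besides our ref? */
                return (EBUSY);
        if ((error = vflush(mp, rvp, force ? FORCECLOSE : 0)) && !force)
                return (error);
        vnode_recycle(rvp);     /* reclaim once the last reference drops */
        vnode_rele(rvp);        /* drop the reference taken at mount time */
        return (0);
}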
*/ vp = VFSTOFDESC(mp)->f_root; - VREF(vp); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + vnode_get(vp); *vpp = vp; return (0); } int -fdesc_statfs(mp, sbp, p) +fdesc_statfs(mp, sbp, context) struct mount *mp; - struct statfs *sbp; - struct proc *p; + struct vfsstatfs *sbp; + vfs_context_t context; { + struct proc *p = vfs_context_proc(context); struct filedesc *fdp; int lim; int i; @@ -221,50 +216,94 @@ fdesc_statfs(mp, sbp, p) sbp->f_flags = 0; sbp->f_bsize = DEV_BSIZE; sbp->f_iosize = DEV_BSIZE; - sbp->f_blocks = 2; /* 1K to keep df happy */ + sbp->f_blocks = (uint64_t)2; /* 1K to keep df happy */ sbp->f_bfree = 0; sbp->f_bavail = 0; - sbp->f_files = lim + 1; /* Allow for "." */ - sbp->f_ffree = freefd; /* See comments above */ - if (sbp != &mp->mnt_stat) { - sbp->f_type = mp->mnt_vfc->vfc_typenum; - bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); - bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); - bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); - } + sbp->f_files = (uint64_t)((unsigned long)(lim + 1)); /* Allow for "." */ + sbp->f_ffree = (uint64_t)((unsigned long)freefd); /* See comments above */ + return (0); } +static int +fdesc_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t context) +{ + VFSATTR_RETURN(fsap, f_bsize, DEV_BSIZE); + VFSATTR_RETURN(fsap, f_iosize, DEV_BSIZE); + VFSATTR_RETURN(fsap, f_blocks, 2); + VFSATTR_RETURN(fsap, f_bfree, 0); + VFSATTR_RETURN(fsap, f_bavail, 0); + VFSATTR_RETURN(fsap, f_fssubtype, 0); + + if (VFSATTR_IS_ACTIVE(fsap, f_objcount) || + VFSATTR_IS_ACTIVE(fsap, f_maxobjcount) || + VFSATTR_IS_ACTIVE(fsap, f_files) || + VFSATTR_IS_ACTIVE(fsap, f_ffree)) + { + struct proc *p = vfs_context_proc(context); + struct filedesc *fdp; + int lim; + int i; + int last; + int freefd; + + /* + * Compute number of free file descriptors. + * [ Strange results will ensue if the open file + * limit is ever reduced below the current number + * of open files... ] + */ + lim = p->p_rlimit[RLIMIT_NOFILE].rlim_cur; + fdp = p->p_fd; + last = min(fdp->fd_nfiles, lim); + freefd = 0; + for (i = fdp->fd_freefile; i < last; i++) + if (fdp->fd_ofiles[i] == NULL && + !(fdp->fd_ofileflags[i] & UF_RESERVED)) + freefd++; + + /* + * Adjust for the fact that the fdesc array may not + * have been fully allocated yet. 
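A worked example of the free-descriptor arithmetic above, with made-up numbers: take rlim_cur = 256, fd_nfiles = 64, descriptors 0 through 2 open, and fd_freefile = 3. The loop scans slots 3 through 63 and finds 61 free; the adjustment that follows then credits the 256 - 64 = 192 slots that were never allocated, giving freefd = 253, while f_files reports lim + 1 = 257 to allow for ".".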
+ */ + if (fdp->fd_nfiles < lim) + freefd += (lim - fdp->fd_nfiles); + + VFSATTR_RETURN(fsap, f_objcount, lim+1); + VFSATTR_RETURN(fsap, f_maxobjcount, lim+1); + VFSATTR_RETURN(fsap, f_files, lim+1); + VFSATTR_RETURN(fsap, f_ffree, freefd); + } + + return 0; +} + int -fdesc_sync(mp, waitfor) +fdesc_sync(mp, waitfor, context) struct mount *mp; int waitfor; + vfs_context_t context; { return (0); } -#define fdesc_fhtovp ((int (*) __P((struct mount *, struct fid *, \ - struct mbuf *, struct vnode **, int *, struct ucred **)))eopnotsupp) -#define fdesc_quotactl ((int (*) __P((struct mount *, int, uid_t, caddr_t, \ - struct proc *)))eopnotsupp) -#define fdesc_sysctl ((int (*) __P((int *, u_int, void *, size_t *, void *, \ - size_t, struct proc *)))eopnotsupp) -#define fdesc_vget ((int (*) __P((struct mount *, void *, struct vnode **))) \ - eopnotsupp) -#define fdesc_vptofh ((int (*) __P((struct vnode *, struct fid *)))eopnotsupp) +#define fdesc_fhtovp (int (*) (mount_t, int, unsigned char *, vnode_t *, vfs_context_t))eopnotsupp +#define fdesc_sysctl (int (*) (int *, u_int, user_addr_t, size_t *, user_addr_t, size_t, vfs_context_t))eopnotsupp +#define fdesc_vget (int (*) (mount_t, ino64_t, vnode_t *, vfs_context_t))eopnotsupp +#define fdesc_vptofh (int (*) (vnode_t, int *, unsigned char *, vfs_context_t))eopnotsupp struct vfsops fdesc_vfsops = { fdesc_mount, fdesc_start, fdesc_unmount, fdesc_root, - fdesc_quotactl, - fdesc_statfs, + NULL, /* quotactl */ + fdesc_vfs_getattr, fdesc_sync, fdesc_vget, fdesc_fhtovp, fdesc_vptofh, fdesc_init, - fdesc_sysctl, + fdesc_sysctl }; diff --git a/bsd/miscfs/fdesc/fdesc_vnops.c b/bsd/miscfs/fdesc/fdesc_vnops.c index 3f11d10c6..185a74c61 100644 --- a/bsd/miscfs/fdesc/fdesc_vnops.c +++ b/bsd/miscfs/fdesc/fdesc_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
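fdesc_vfs_getattr above is the replacement for the per-mount statfs path: fill only the attributes you can, mark each with VFSATTR_RETURN, and leave the rest for the VFS layer to synthesize or reject. The minimal shape of such a handler, for a hypothetical filesystem:

static int
myfs_vfs_getattr(__unused mount_t mp, struct vfs_attr *fsap,
    __unused vfs_context_t ctx)
{
        VFSATTR_RETURN(fsap, f_bsize, DEV_BSIZE);
        VFSATTR_RETURN(fsap, f_iosize, DEV_BSIZE);
        /* compute expensive attributes only when the caller asked */
        if (VFSATTR_IS_ACTIVE(fsap, f_files))
                VFSATTR_RETURN(fsap, f_files, 1);       /* made-up value */
        return (0);
}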
* * @APPLE_LICENSE_HEADER_START@ * @@ -67,21 +67,25 @@ #include <sys/systm.h> #include <sys/types.h> #include <sys/time.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/kernel.h> /* boottime */ #include <sys/resourcevar.h> #include <sys/filedesc.h> -#include <sys/vnode.h> +#include <sys/kauth.h> +#include <sys/vnode_internal.h> #include <sys/malloc.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/stat.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> #include <sys/namei.h> -#include <sys/buf.h> #include <sys/dirent.h> #include <sys/ubc.h> +#include <sys/socketvar.h> +#include <sys/pipe.h> +#include <sys/uio_internal.h> #include <miscfs/fdesc/fdesc.h> #include <vfs/vfs_support.h> +#include <pexpert/pexpert.h> #define FDL_WANT 0x01 #define FDL_LOCKED 0x02 @@ -99,35 +103,54 @@ FD_STDIN, FD_STDOUT, FD_STDERR must be a sequence n, n+1, n+2 LIST_HEAD(fdhashhead, fdescnode) *fdhashtbl; u_long fdhash; +static int fdesc_attr(int fd, struct vnode_attr *vap, vfs_context_t a_context); + + /* * Initialise cache headers */ -fdesc_init(vfsp) - struct vfsconf *vfsp; +int +fdesc_init(__unused struct vfsconf *vfsp) { fdhashtbl = hashinit(NFDCACHE, M_CACHE, &fdhash); + + return( 0 ); } int -fdesc_allocvp(ftype, ix, mp, vpp) +fdesc_allocvp(ftype, ix, mp, vpp, vtype) fdntype ftype; int ix; struct mount *mp; struct vnode **vpp; + enum vtype vtype; { - struct proc *p = current_proc(); /* XXX */ struct fdhashhead *fc; struct fdescnode *fd; int error = 0; + int vid = 0; + struct vnode_fsparam vfsp; fc = FD_NHASH(ix); loop: for (fd = fc->lh_first; fd != 0; fd = fd->fd_hash.le_next) { - if (fd->fd_ix == ix && fd->fd_vnode->v_mount == mp) { - if (vget(fd->fd_vnode, 0, p)) + if (fd->fd_ix == ix && vnode_mount(fd->fd_vnode) == mp) { + /* + * doing a vnode_getwithvid isn't technically + * necessary since fdesc is an unsafe filesystem + * and we're running behind a funnel at this point + * however, vnode_get always succeeds, which isn't + * what we want if this vnode is in the process of + * being terminated + */ + vid = vnode_vid(fd->fd_vnode); + + if (vnode_getwithvid(fd->fd_vnode, vid)) goto loop; *vpp = fd->fd_vnode; + (*vpp)->v_type = vtype; + return (error); } } @@ -144,12 +167,29 @@ loop: fdcache_lock |= FDL_LOCKED; MALLOC(fd, void *, sizeof(struct fdescnode), M_TEMP, M_WAITOK); - error = getnewvnode(VT_FDESC, mp, fdesc_vnodeop_p, vpp); + + vfsp.vnfs_mp = mp; + vfsp.vnfs_vtype = vtype; + vfsp.vnfs_str = "fdesc"; + vfsp.vnfs_dvp = 0; + vfsp.vnfs_fsnode = fd; + vfsp.vnfs_cnp = 0; + vfsp.vnfs_vops = fdesc_vnodeop_p; + vfsp.vnfs_rdev = 0; + vfsp.vnfs_filesize = 0; + vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE; + vfsp.vnfs_marksystem = 0; + if (ftype == Froot) + vfsp.vnfs_markroot = 1; + else + vfsp.vnfs_markroot = 0; + + error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, vpp); if (error) { FREE(fd, M_TEMP); goto out; } - (*vpp)->v_data = fd; + (*vpp)->v_tag = VT_FDESC; fd->fd_vnode = *vpp; fd->fd_type = ftype; fd->fd_fd = -1; @@ -174,28 +214,30 @@ out: */ int fdesc_lookup(ap) - struct vop_lookup_args /* { + struct vnop_lookup_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; + vfs_context_t a_context; } */ *ap; { struct vnode **vpp = ap->a_vpp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; char *pname = cnp->cn_nameptr; - struct proc *p = cnp->cn_proc; - int nfiles = p->p_fd->fd_nfiles; - unsigned fd; + struct proc *p = vfs_context_proc(ap->a_context); + int numfiles = p->p_fd->fd_nfiles; + int 
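The hash-lookup loop in fdesc_allocvp revalidates cached vnodes by identity: snapshot vnode_vid(), then vnode_getwithvid() fails if the vnode was recycled in between, and the lookup is retried. The core of that pattern, with a hypothetical node type (this sketch assumes reclaim nulls out n_vnode, as devfs_reclaim does for dn_vn):

struct my_node {
        struct vnode *n_vnode;
};

static struct vnode *
my_cached_vnode(struct my_node *np)
{
        struct vnode *vp;
        int vid;

retry:
        if ((vp = np->n_vnode) == NULL)
                return (NULL);
        vid = vnode_vid(vp);                    /* snapshot identity */
        if (vnode_getwithvid(vp, vid))          /* recycled underneath us? */
                goto retry;
        return (vp);                            /* caller holds an iocount */
}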
fd; int error; struct vnode *fvp; char *ln; - VOP_UNLOCK(dvp, 0, p); if (cnp->cn_namelen == 1 && *pname == '.') { *vpp = dvp; - VREF(dvp); - vn_lock(dvp, LK_SHARED | LK_RETRY, p); + + if ( (error = vnode_get(dvp)) ) { + return(error); + } return (0); } @@ -203,21 +245,20 @@ fdesc_lookup(ap) default: case Flink: case Fdesc: + /* should never happen */ error = ENOTDIR; goto bad; case Froot: if (cnp->cn_namelen == 2 && bcmp(pname, "fd", 2) == 0) { - error = fdesc_allocvp(Fdevfd, FD_DEVFD, dvp->v_mount, &fvp); + error = fdesc_allocvp(Fdevfd, FD_DEVFD, dvp->v_mount, &fvp, VDIR); if (error) goto bad; *vpp = fvp; - fvp->v_type = VDIR; - vn_lock(fvp, LK_SHARED | LK_RETRY, p); return (0); } - ln = 0; + ln = NULL; switch (cnp->cn_namelen) { case 5: if (bcmp(pname, "stdin", 5) == 0) { @@ -238,13 +279,11 @@ fdesc_lookup(ap) } if (ln) { - error = fdesc_allocvp(Flink, fd, dvp->v_mount, &fvp); + error = fdesc_allocvp(Flink, fd, dvp->v_mount, &fvp, VLNK); if (error) goto bad; VTOFDESC(fvp)->fd_link = ln; *vpp = fvp; - fvp->v_type = VLNK; - vn_lock(fvp, LK_SHARED | LK_RETRY, p); return (0); } else { error = ENOENT; @@ -255,7 +294,7 @@ fdesc_lookup(ap) case Fdevfd: if (cnp->cn_namelen == 2 && bcmp(pname, "..", 2) == 0) { - if (error = fdesc_root(dvp->v_mount, vpp)) + if ((error = fdesc_root(dvp->v_mount, vpp, ap->a_context))) goto bad; return (0); } @@ -263,7 +302,7 @@ fdesc_lookup(ap) fd = 0; while (*pname >= '0' && *pname <= '9') { fd = 10 * fd + *pname++ - '0'; - if (fd >= nfiles) + if (fd >= numfiles) break; } @@ -272,38 +311,36 @@ fdesc_lookup(ap) goto bad; } - if (fd >= nfiles || + if (fd < 0 || fd >= numfiles || *fdfile(p, fd) == NULL || (*fdflags(p, fd) & UF_RESERVED)) { error = EBADF; goto bad; } - error = fdesc_allocvp(Fdesc, FD_DESC+fd, dvp->v_mount, &fvp); + error = fdesc_allocvp(Fdesc, FD_DESC+fd, dvp->v_mount, &fvp, VNON); if (error) goto bad; VTOFDESC(fvp)->fd_fd = fd; - vn_lock(fvp, LK_SHARED | LK_RETRY, p); *vpp = fvp; return (0); } bad:; - vn_lock(dvp, LK_SHARED | LK_RETRY, p); *vpp = NULL; return (error); } int fdesc_open(ap) - struct vop_open_args /* { + struct vnop_open_args /* { struct vnode *a_vp; int a_mode; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { struct vnode *vp = ap->a_vp; + struct proc *p = vfs_context_proc(ap->a_context); int error = 0; switch (VTOFDESC(vp)->fd_type) { @@ -314,9 +351,9 @@ fdesc_open(ap) * return ensures that the vnode for this device will be * released by vn_open. Open will detect this special error and * take the actions in dupfdopen. Other callers of vn_open or - * VOP_OPEN will simply report the error. + * vnop_open will simply report the error. 
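The Fdesc open path above never opens anything itself: it stashes the target descriptor in p_dupfd and returns ENODEV, and the caller of vn_open turns that special error into a dup of the descriptor via dupfdopen. The visible effect from user space, as a small test program (illustrative, not part of the patch):

#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
        int fd = open("/dev/fd/0", O_RDONLY);   /* behaves like dup(0) */

        if (fd < 0) {
                perror("open /dev/fd/0");
                return 1;
        }
        printf("fd %d is a duplicate of stdin\n", fd);
        return 0;
}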
*/ - ap->a_p->p_dupfd = VTOFDESC(vp)->fd_fd; /* XXX */ + p->p_dupfd = VTOFDESC(vp)->fd_fd; /* XXX */ error = ENODEV; break; @@ -326,123 +363,136 @@ fdesc_open(ap) } static int -fdesc_attr(fd, vap, cred, p) - int fd; - struct vattr *vap; - struct ucred *cred; - struct proc *p; +fdesc_attr(int fd, struct vnode_attr *vap, vfs_context_t a_context) { - struct file *fp; + struct fileproc *fp; + struct proc *p = vfs_context_proc(a_context); struct stat stb; int error; - if (error = fdgetf(p, fd, &fp)) + if ((error = fp_lookup(p, fd, &fp, 0))) return (error); - switch (fp->f_type) { + switch (fp->f_fglob->fg_type) { case DTYPE_VNODE: - error = VOP_GETATTR((struct vnode *) fp->f_data, vap, cred, p); + if(error = vnode_getwithref((struct vnode *) fp->f_fglob->fg_data)) { + break; + } + if ((error = vnode_authorize((struct vnode *)fp->f_fglob->fg_data, + NULL, + KAUTH_VNODE_READ_ATTRIBUTES | KAUTH_VNODE_READ_SECURITY, + a_context)) == 0) + error = vnode_getattr((struct vnode *)fp->f_fglob->fg_data, vap, a_context); if (error == 0 && vap->va_type == VDIR) { /* * directories can cause loops in the namespace, * so turn off the 'x' bits to avoid trouble. + * + * XXX ACLs break this, of course */ vap->va_mode &= ~((VEXEC)|(VEXEC>>3)|(VEXEC>>6)); } + (void)vnode_put((struct vnode *) fp->f_fglob->fg_data); break; case DTYPE_SOCKET: - error = soo_stat((struct socket *)fp->f_data, &stb); + case DTYPE_PIPE: + if (fp->f_fglob->fg_type == DTYPE_SOCKET) + error = soo_stat((struct socket *)fp->f_fglob->fg_data, &stb); + else + error = pipe_stat((struct socket *)fp->f_fglob->fg_data, &stb); + if (error == 0) { - vattr_null(vap); - vap->va_type = VSOCK; - vap->va_mode = stb.st_mode; - vap->va_nlink = stb.st_nlink; - vap->va_uid = stb.st_uid; - vap->va_gid = stb.st_gid; - vap->va_fsid = stb.st_dev; - vap->va_fileid = stb.st_ino; - vap->va_size = stb.st_size; - vap->va_blocksize = stb.st_blksize; - vap->va_atime = stb.st_atimespec; - vap->va_mtime = stb.st_mtimespec; - vap->va_ctime = stb.st_ctimespec; - vap->va_gen = stb.st_gen; - vap->va_flags = stb.st_flags; - vap->va_rdev = stb.st_rdev; - vap->va_bytes = stb.st_blocks * stb.st_blksize; + if (fp->f_fglob->fg_type == DTYPE_SOCKET) + VATTR_RETURN(vap, va_type, VSOCK); + else + VATTR_RETURN(vap, va_type, VFIFO); + + VATTR_RETURN(vap, va_mode, stb.st_mode); + VATTR_RETURN(vap, va_nlink, stb.st_nlink); + VATTR_RETURN(vap, va_uid, stb.st_uid); + VATTR_RETURN(vap, va_gid, stb.st_gid); + VATTR_RETURN(vap, va_fsid, stb.st_dev); + VATTR_RETURN(vap, va_fileid, stb.st_ino); + VATTR_RETURN(vap, va_data_size, stb.st_size); + VATTR_RETURN(vap, va_access_time, stb.st_atimespec); + VATTR_RETURN(vap, va_modify_time, stb.st_mtimespec); + VATTR_RETURN(vap, va_change_time, stb.st_ctimespec); + VATTR_RETURN(vap, va_gen, stb.st_gen); + VATTR_RETURN(vap, va_flags, stb.st_flags); + VATTR_RETURN(vap, va_rdev, stb.st_rdev); + VATTR_RETURN(vap, va_total_alloc, stb.st_blocks * stb.st_blksize); + VATTR_RETURN(vap, va_acl, NULL); } break; default: - return (EBADF); - break; + error = EBADF; } + fp_drop(p, fd, fp, 0); return (error); } int fdesc_getattr(ap) - struct vop_getattr_args /* { + struct vnop_getattr_args /* { struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct proc *a_p; + struct vnode_attr *a_vap; + vfs_context_t a_context; } */ *ap; { struct vnode *vp = ap->a_vp; - struct vattr *vap = ap->a_vap; + struct vnode_attr *vap = ap->a_vap; unsigned fd; int error = 0; + struct timespec ts; switch (VTOFDESC(vp)->fd_type) { case Froot: case Fdevfd: case Flink: - 
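To make the execute-bit stripping above concrete: VEXEC is octal 0100 (owner execute), so VEXEC | VEXEC>>3 | VEXEC>>6 forms the mask 0111 covering owner, group, and other execute. Clearing it turns a directory reported as 0755 into 0644, which keeps namespace lookups through /dev/fd from following the directory into a loop while leaving the read bits intact.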
bzero((caddr_t) vap, sizeof(*vap)); - vattr_null(vap); - vap->va_fileid = VTOFDESC(vp)->fd_ix; - - vap->va_uid = 0; - vap->va_gid = 0; - vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; - vap->va_blocksize = DEV_BSIZE; - vap->va_atime.tv_sec = boottime.tv_sec; - vap->va_atime.tv_nsec = 0; - vap->va_mtime = vap->va_atime; - vap->va_ctime = vap->va_mtime; - vap->va_gen = 0; - vap->va_flags = 0; - vap->va_rdev = 0; - vap->va_bytes = 0; + VATTR_RETURN(vap, va_fileid, VTOFDESC(vp)->fd_ix); + VATTR_RETURN(vap, va_uid, 0); + VATTR_RETURN(vap, va_gid, 0); + VATTR_RETURN(vap, va_fsid, vp->v_mount->mnt_vfsstat.f_fsid.val[0]); + VATTR_RETURN(vap, va_iosize, DEV_BSIZE); + ts.tv_sec = boottime_sec(); + ts.tv_nsec = 0; + VATTR_RETURN(vap, va_access_time, ts); + VATTR_RETURN(vap, va_modify_time, ts); + VATTR_RETURN(vap, va_change_time, ts); + VATTR_RETURN(vap, va_gen, 0); + VATTR_RETURN(vap, va_flags, 0); + VATTR_RETURN(vap, va_rdev, 0); + VATTR_RETURN(vap, va_acl, NULL); switch (VTOFDESC(vp)->fd_type) { case Flink: - vap->va_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH; - vap->va_type = VLNK; - vap->va_nlink = 1; - vap->va_size = strlen(VTOFDESC(vp)->fd_link); + VATTR_RETURN(vap, va_mode, S_IRUSR|S_IRGRP|S_IROTH); + VATTR_RETURN(vap, va_type, VLNK); /* not strictly required */ + VATTR_RETURN(vap, va_nlink, 1); + VATTR_RETURN(vap, va_data_size, strlen(VTOFDESC(vp)->fd_link)); break; default: - vap->va_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH; - vap->va_type = VDIR; - vap->va_nlink = 2; - vap->va_size = DEV_BSIZE; + VATTR_RETURN(vap, va_mode, S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH); + VATTR_RETURN(vap, va_type, VDIR); + VATTR_RETURN(vap, va_nlink, 2); + VATTR_RETURN(vap, va_data_size, DEV_BSIZE); break; } break; case Fdesc: fd = VTOFDESC(vp)->fd_fd; - error = fdesc_attr(fd, vap, ap->a_cred, ap->a_p); + error = fdesc_attr(fd, vap, ap->a_context); break; default: return (EBADF); break; } - + if (error == 0) { vp->v_type = vap->va_type; } @@ -452,16 +502,16 @@ fdesc_getattr(ap) int fdesc_setattr(ap) - struct vop_setattr_args /* { + struct vnop_setattr_args /* { struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct proc *a_p; + struct vnode_attr *a_vap; + vfs_context_t a_context; } */ *ap; { - struct file *fp; + struct fileproc *fp; unsigned fd; int error; + struct proc * p = vfs_context_proc(ap->a_context); /* * Can't mess with the root vnode @@ -475,27 +525,34 @@ fdesc_setattr(ap) } fd = VTOFDESC(ap->a_vp)->fd_fd; - if (error = fdgetf(ap->a_p, fd, &fp)) + if ((error = fp_lookup(vfs_context_proc(ap->a_context), fd, &fp, 0))) return (error); /* * Can setattr the underlying vnode, but not sockets! 
*/ - switch (fp->f_type) { + switch (fp->f_fglob->fg_type) { case DTYPE_VNODE: - error = VOP_SETATTR((struct vnode *) fp->f_data, ap->a_vap, ap->a_cred, ap->a_p); + { + if ((error = vnode_getwithref((struct vnode *) fp->f_fglob->fg_data)) != 0) + break; + error = vnode_setattr((struct vnode *) fp->f_fglob->fg_data, ap->a_vap, ap->a_context); + (void)vnode_put((struct vnode *) fp->f_fglob->fg_data); break; + } case DTYPE_SOCKET: + case DTYPE_PIPE: error = 0; break; default: - kprintf("fp->f_type = %d\n", fp->f_type); + kprintf("fp->f_fglob->fg_type = %d\n", fp->f_fglob->fg_type); error = EBADF; break; } + fp_drop(p, fd, fp, 0); return (error); } @@ -511,29 +568,29 @@ static struct dirtmp { { FD_STDIN, UIO_MX, 5, "stdin" }, { FD_STDOUT, UIO_MX, 6, "stdout" }, { FD_STDERR, UIO_MX, 6, "stderr" }, - { 0 } + { 0, 0, 0, "" } }; int fdesc_readdir(ap) - struct vop_readdir_args /* { + struct vnop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; - struct ucred *a_cred; + int a_flags; int *a_eofflag; - u_long *a_cookies; - int a_ncookies; + int *a_numdirent; + vfs_context_t a_context; } */ *ap; { struct uio *uio = ap->a_uio; - struct proc *p = uio->uio_procp; + struct proc *p = current_proc(); int i, error; /* * We don't allow exporting fdesc mounts, and currently local * requests do not need cookies. */ - if (ap->a_ncookies) + if (ap->a_flags & (VNODE_READDIR_EXTENDED | VNODE_READDIR_REQSEEKOFF)) return (EINVAL); switch (VTOFDESC(ap->a_vp)->fd_type) { @@ -553,7 +610,7 @@ fdesc_readdir(ap) i = uio->uio_offset / UIO_MX; error = 0; - while (uio->uio_resid > 0) { + while (uio_resid(uio) > 0) { dt = &rootent[i]; if (dt->d_fileno == 0) { /**eofflagp = 1;*/ @@ -590,7 +647,7 @@ fdesc_readdir(ap) i = uio->uio_offset / UIO_MX; error = 0; - while (uio->uio_resid > 0) { + while (uio_resid(uio) > 0) { if (i >= p->p_fd->fd_nfiles) break; @@ -620,10 +677,10 @@ fdesc_readdir(ap) int fdesc_readlink(ap) - struct vop_readlink_args /* { + struct vnop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { struct vnode *vp = ap->a_vp; @@ -636,70 +693,42 @@ fdesc_readlink(ap) char *ln = VTOFDESC(vp)->fd_link; error = uiomove(ln, strlen(ln), ap->a_uio); } else { - error = EOPNOTSUPP; + error = ENOTSUP; } return (error); } int -fdesc_read(ap) - struct vop_read_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; +fdesc_read(__unused struct vnop_read_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } int -fdesc_write(ap) - struct vop_write_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; +fdesc_write(__unused struct vnop_write_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } int -fdesc_ioctl(ap) - struct vop_ioctl_args /* { - struct vnode *a_vp; - int a_command; - caddr_t a_data; - int a_fflag; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; +fdesc_ioctl(__unused struct vnop_ioctl_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } int -fdesc_select(ap) - struct vop_select_args /* { - struct vnode *a_vp; - int a_which; - int a_fflags; - struct ucred *a_cred; - void *a_wql; - struct proc *a_p; - } */ *ap; +fdesc_select(__unused struct vnop_select_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } int fdesc_inactive(ap) - struct vop_inactive_args /* { + struct vnop_inactive_args /* { struct vnode *a_vp; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { struct vnode *vp = ap->a_vp; @@ -708,15 +737,15 @@ 
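Both fdesc_attr and fdesc_setattr now bracket their work with fp_lookup and fp_drop, which take and release a reference on the fileproc so the descriptor cannot be torn down mid-operation. The skeleton of that pairing, as a hypothetical helper:

static int
my_with_file(struct proc *p, int fd)
{
        struct fileproc *fp;
        int error;

        if ((error = fp_lookup(p, fd, &fp, 0)))
                return (error);         /* no reference held on failure */

        /* ... operate on fp->f_fglob here ... */

        fp_drop(p, fd, fp, 0);          /* balance on every exit path */
        return (0);
}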
fdesc_inactive(ap) * Clear out the v_type field to avoid * nasty things happening in vgone(). */ - VOP_UNLOCK(vp, 0, ap->a_p); vp->v_type = VNON; return (0); } int fdesc_reclaim(ap) - struct vop_reclaim_args /* { + struct vnop_reclaim_args /* { struct vnode *a_vp; + vfs_context_t a_context; } */ *ap; { struct vnode *vp = ap->a_vp; @@ -732,11 +761,13 @@ fdesc_reclaim(ap) /* * Return POSIX pathconf information applicable to special devices. */ +int fdesc_pathconf(ap) - struct vop_pathconf_args /* { + struct vnop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; + vfs_context_t a_context; } */ *ap; { @@ -765,29 +796,10 @@ fdesc_pathconf(ap) /* NOTREACHED */ } -/* - * Print out the contents of a /dev/fd vnode. - */ -/* ARGSUSED */ -int -fdesc_print(ap) - struct vop_print_args /* { - struct vnode *a_vp; - } */ *ap; -{ - - printf("tag VT_NON, fdesc vnode\n"); - return (0); -} /*void*/ int -fdesc_vfree(ap) - struct vop_vfree_args /* { - struct vnode *a_pvp; - ino_t a_ino; - int a_mode; - } */ *ap; +fdesc_vfree(__unused struct vnop_vfree_args *ap) { return (0); @@ -797,7 +809,7 @@ fdesc_vfree(ap) * /dev/fd "should never get here" operation */ int -fdesc_badop() +fdesc_badop(void) { return (ENOTSUP); @@ -806,93 +818,64 @@ fdesc_badop() #define VOPFUNC int (*)(void *) -#define fdesc_create ((int (*) __P((struct vop_create_args *)))eopnotsupp) -#define fdesc_mknod ((int (*) __P((struct vop_mknod_args *)))eopnotsupp) -#define fdesc_close ((int (*) __P((struct vop_close_args *)))nullop) -#define fdesc_access ((int (*) __P((struct vop_access_args *)))nullop) -#define fdesc_mmap ((int (*) __P((struct vop_mmap_args *)))eopnotsupp) -#define fdesc_revoke vop_revoke -#define fdesc_fsync ((int (*) __P((struct vop_fsync_args *)))nullop) -#define fdesc_seek ((int (*) __P((struct vop_seek_args *)))nullop) -#define fdesc_remove ((int (*) __P((struct vop_remove_args *)))eopnotsupp) -#define fdesc_link ((int (*) __P((struct vop_link_args *)))eopnotsupp) -#define fdesc_rename ((int (*) __P((struct vop_rename_args *)))eopnotsupp) -#define fdesc_mkdir ((int (*) __P((struct vop_mkdir_args *)))eopnotsupp) -#define fdesc_rmdir ((int (*) __P((struct vop_rmdir_args *)))eopnotsupp) -#define fdesc_symlink ((int (*) __P((struct vop_symlink_args *)))eopnotsupp) -#define fdesc_abortop ((int (*) __P((struct vop_abortop_args *)))nop_abortop) -#define fdesc_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock) -#define fdesc_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock) -#define fdesc_bmap ((int (*) __P((struct vop_bmap_args *)))fdesc_badop) -#define fdesc_strategy ((int (*) __P((struct vop_strategy_args *)))fdesc_badop) -#define fdesc_islocked \ - ((int (*) __P((struct vop_islocked_args *)))vop_noislocked) -#define fdesc_advlock ((int (*) __P((struct vop_advlock_args *)))eopnotsupp) -#define fdesc_blkatoff \ - ((int (*) __P((struct vop_blkatoff_args *)))eopnotsupp) -#define fdesc_valloc ((int(*) __P(( \ - struct vnode *pvp, \ - int mode, \ - struct ucred *cred, \ - struct vnode **vpp))) eopnotsupp) -#define fdesc_truncate \ - ((int (*) __P((struct vop_truncate_args *)))eopnotsupp) -#define fdesc_update ((int (*) __P((struct vop_update_args *)))eopnotsupp) -#define fdesc_bwrite ((int (*) __P((struct vop_bwrite_args *)))eopnotsupp) -#define fdesc_blktooff ((int (*) __P((struct vop_blktooff_args *)))eopnotsupp) -#define fdesc_offtoblk ((int (*) __P((struct vop_offtoblk_args *)))eopnotsupp) -#define fdesc_cmap ((int (*) __P((struct vop_cmap_args *)))eopnotsupp) +#define fdesc_create (int (*) 
(struct vnop_create_args *))eopnotsupp +#define fdesc_mknod (int (*) (struct vnop_mknod_args *))eopnotsupp +#define fdesc_close (int (*) (struct vnop_close_args *))nullop +#define fdesc_access (int (*) (struct vnop_access_args *))nullop +#define fdesc_mmap (int (*) (struct vnop_mmap_args *))eopnotsupp +#define fdesc_revoke nop_revoke +#define fdesc_fsync (int (*) (struct vnop_fsync_args *))nullop +#define fdesc_remove (int (*) (struct vnop_remove_args *))eopnotsupp +#define fdesc_link (int (*) (struct vnop_link_args *))eopnotsupp +#define fdesc_rename (int (*) (struct vnop_rename_args *))eopnotsupp +#define fdesc_mkdir (int (*) (struct vnop_mkdir_args *))eopnotsupp +#define fdesc_rmdir (int (*) (struct vnop_rmdir_args *))eopnotsupp +#define fdesc_symlink (int (*) (struct vnop_symlink_args *))eopnotsupp +#define fdesc_strategy (int (*) (struct vnop_strategy_args *))fdesc_badop +#define fdesc_advlock (int (*) (struct vnop_advlock_args *))eopnotsupp +#define fdesc_bwrite (int (*) (struct vnop_bwrite_args *))eopnotsupp +#define fdesc_blktooff (int (*) (struct vnop_blktooff_args *))eopnotsupp +#define fdesc_offtoblk (int (*) (struct vnop_offtoblk_args *))eopnotsupp +#define fdesc_blockmap (int (*) (struct vnop_blockmap_args *))eopnotsupp int (**fdesc_vnodeop_p)(void *); struct vnodeopv_entry_desc fdesc_vnodeop_entries[] = { - { &vop_default_desc, (VOPFUNC)vn_default_error }, - { &vop_lookup_desc, (VOPFUNC)fdesc_lookup }, /* lookup */ - { &vop_create_desc, (VOPFUNC)fdesc_create }, /* create */ - { &vop_mknod_desc, (VOPFUNC)fdesc_mknod }, /* mknod */ - { &vop_open_desc, (VOPFUNC)fdesc_open }, /* open */ - { &vop_close_desc, (VOPFUNC)fdesc_close }, /* close */ - { &vop_access_desc, (VOPFUNC)fdesc_access }, /* access */ - { &vop_getattr_desc, (VOPFUNC)fdesc_getattr }, /* getattr */ - { &vop_setattr_desc, (VOPFUNC)fdesc_setattr }, /* setattr */ - { &vop_read_desc, (VOPFUNC)fdesc_read }, /* read */ - { &vop_write_desc, (VOPFUNC)fdesc_write }, /* write */ - { &vop_ioctl_desc, (VOPFUNC)fdesc_ioctl }, /* ioctl */ - { &vop_select_desc, (VOPFUNC)fdesc_select }, /* select */ - { &vop_revoke_desc, (VOPFUNC)fdesc_revoke }, /* revoke */ - { &vop_mmap_desc, (VOPFUNC)fdesc_mmap }, /* mmap */ - { &vop_fsync_desc, (VOPFUNC)fdesc_fsync }, /* fsync */ - { &vop_seek_desc, (VOPFUNC)fdesc_seek }, /* seek */ - { &vop_remove_desc, (VOPFUNC)fdesc_remove }, /* remove */ - { &vop_link_desc, (VOPFUNC)fdesc_link }, /* link */ - { &vop_rename_desc, (VOPFUNC)fdesc_rename }, /* rename */ - { &vop_mkdir_desc, (VOPFUNC)fdesc_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (VOPFUNC)fdesc_rmdir }, /* rmdir */ - { &vop_symlink_desc, (VOPFUNC)fdesc_symlink }, /* symlink */ - { &vop_readdir_desc, (VOPFUNC)fdesc_readdir }, /* readdir */ - { &vop_readlink_desc, (VOPFUNC)fdesc_readlink },/* readlink */ - { &vop_abortop_desc, (VOPFUNC)fdesc_abortop }, /* abortop */ - { &vop_inactive_desc, (VOPFUNC)fdesc_inactive },/* inactive */ - { &vop_reclaim_desc, (VOPFUNC)fdesc_reclaim }, /* reclaim */ - { &vop_lock_desc, (VOPFUNC)fdesc_lock }, /* lock */ - { &vop_unlock_desc, (VOPFUNC)fdesc_unlock }, /* unlock */ - { &vop_bmap_desc, (VOPFUNC)fdesc_bmap }, /* bmap */ - { &vop_strategy_desc, (VOPFUNC)fdesc_strategy }, /* strategy */ - { &vop_print_desc, (VOPFUNC)fdesc_print }, /* print */ - { &vop_islocked_desc, (VOPFUNC)fdesc_islocked }, /* islocked */ - { &vop_pathconf_desc, (VOPFUNC)fdesc_pathconf }, /* pathconf */ - { &vop_advlock_desc, (VOPFUNC)fdesc_advlock }, /* advlock */ - { &vop_blkatoff_desc, (VOPFUNC)fdesc_blkatoff }, /* blkatoff */ - { 
&vop_valloc_desc, (VOPFUNC)fdesc_valloc }, /* valloc */ - { &vop_vfree_desc, (VOPFUNC)fdesc_vfree }, /* vfree */ - { &vop_truncate_desc, (VOPFUNC)fdesc_truncate }, /* truncate */ - { &vop_update_desc, (VOPFUNC)fdesc_update }, /* update */ - { &vop_bwrite_desc, (VOPFUNC)fdesc_bwrite }, /* bwrite */ - { &vop_pagein_desc, (VOPFUNC)err_pagein }, /* pagein */ - { &vop_pageout_desc, (VOPFUNC)err_pageout }, /* pageout */ - { &vop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ - { &vop_blktooff_desc, (VOPFUNC)fdesc_blktooff }, /* blktooff */ - { &vop_blktooff_desc, (VOPFUNC)fdesc_offtoblk }, /* offtoblk */ - { &vop_cmap_desc, (VOPFUNC)fdesc_cmap }, /* cmap */ + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)fdesc_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)fdesc_create }, /* create */ + { &vnop_mknod_desc, (VOPFUNC)fdesc_mknod }, /* mknod */ + { &vnop_open_desc, (VOPFUNC)fdesc_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)fdesc_close }, /* close */ + { &vnop_access_desc, (VOPFUNC)fdesc_access }, /* access */ + { &vnop_getattr_desc, (VOPFUNC)fdesc_getattr }, /* getattr */ + { &vnop_setattr_desc, (VOPFUNC)fdesc_setattr }, /* setattr */ + { &vnop_read_desc, (VOPFUNC)fdesc_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)fdesc_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)fdesc_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)fdesc_select }, /* select */ + { &vnop_revoke_desc, (VOPFUNC)fdesc_revoke }, /* revoke */ + { &vnop_mmap_desc, (VOPFUNC)fdesc_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)fdesc_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)fdesc_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)fdesc_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)fdesc_rename }, /* rename */ + { &vnop_mkdir_desc, (VOPFUNC)fdesc_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)fdesc_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)fdesc_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)fdesc_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)fdesc_readlink },/* readlink */ + { &vnop_inactive_desc, (VOPFUNC)fdesc_inactive },/* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)fdesc_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)fdesc_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)fdesc_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)fdesc_advlock }, /* advlock */ + { &vnop_bwrite_desc, (VOPFUNC)fdesc_bwrite }, /* bwrite */ + { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* pagein */ + { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* pageout */ + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ + { &vnop_blktooff_desc, (VOPFUNC)fdesc_blktooff }, /* blktooff */ + { &vnop_blktooff_desc, (VOPFUNC)fdesc_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (VOPFUNC)fdesc_blockmap }, /* blockmap */ { (struct vnodeop_desc*)NULL, (VOPFUNC)NULL } }; struct vnodeopv_desc fdesc_vnodeop_opv_desc = diff --git a/bsd/miscfs/fifofs/fifo.h b/bsd/miscfs/fifofs/fifo.h index 1d212b694..a083b5948 100644 --- a/bsd/miscfs/fifofs/fifo.h +++ b/bsd/miscfs/fifofs/fifo.h @@ -57,59 +57,66 @@ #ifndef __FIFOFS_FOFO_H__ #define __FIFOFS_FOFO_H__ -#include <sys/appleapiopts.h> +#ifdef BSD_KERNEL_PRIVATE + + +/* + * This structure is associated with the FIFO vnode and stores + * the state associated with the FIFO. 
+ */ +struct fifoinfo { + unsigned int fi_flags; + struct socket *fi_readsock; + struct socket *fi_writesock; + long fi_readers; + long fi_writers; + unsigned int fi_count; +}; + +#define FIFO_INCREATE 1 +#define FIFO_CREATEWAIT 2 +#define FIFO_CREATED 4 + -#ifdef __APPLE_API_PRIVATE /* * Prototypes for fifo operations on vnodes. */ -int fifo_ebadf(); +int fifo_ebadf(void *); + +int fifo_lookup (struct vnop_lookup_args *); +#define fifo_create (int (*) (struct vnop_create_args *))err_create +#define fifo_mknod (int (*) (struct vnop_mknod_args *))err_mknod +int fifo_open (struct vnop_open_args *); +int fifo_close (struct vnop_close_args *); +int fifo_close_internal (vnode_t, int, vfs_context_t, int); +#define fifo_access (int (*) (struct vnop_access_args *))fifo_ebadf +#define fifo_getattr (int (*) (struct vnop_getattr_args *))fifo_ebadf +#define fifo_setattr (int (*) (struct vnop_setattr_args *))fifo_ebadf +int fifo_read (struct vnop_read_args *); +int fifo_write (struct vnop_write_args *); +int fifo_ioctl (struct vnop_ioctl_args *); +int fifo_select (struct vnop_select_args *); +#define fifo_revoke nop_revoke +#define fifo_mmap (int (*) (struct vnop_mmap_args *))err_mmap +#define fifo_fsync (int (*) (struct vnop_fsync_args *))nullop +#define fifo_remove (int (*) (struct vnop_remove_args *))err_remove +#define fifo_link (int (*) (struct vnop_link_args *))err_link +#define fifo_rename (int (*) (struct vnop_rename_args *))err_rename +#define fifo_mkdir (int (*) (struct vnop_mkdir_args *))err_mkdir +#define fifo_rmdir (int (*) (struct vnop_rmdir_args *))err_rmdir +#define fifo_symlink (int (*) (struct vnop_symlink_args *))err_symlink +#define fifo_readdir (int (*) (struct vnop_readdir_args *))err_readdir +#define fifo_readlink (int (*) (struct vnop_readlink_args *))err_readlink +int fifo_inactive (struct vnop_inactive_args *); +#define fifo_reclaim (int (*) (struct vnop_reclaim_args *))nullop +#define fifo_strategy (int (*) (struct vnop_strategy_args *))err_strategy +int fifo_pathconf (struct vnop_pathconf_args *); +int fifo_advlock (struct vnop_advlock_args *); +#define fifo_valloc (int (*) (struct vnop_valloc_args *))err_valloc +#define fifo_vfree (int (*) (struct vnop_vfree_args *))err_vfree +#define fifo_bwrite (int (*) (struct vnop_bwrite_args *))nullop +#define fifo_blktooff (int (*) (struct vnop_blktooff_args *))err_blktooff -int fifo_lookup __P((struct vop_lookup_args *)); -#define fifo_create ((int (*) __P((struct vop_create_args *)))err_create) -#define fifo_mknod ((int (*) __P((struct vop_mknod_args *)))err_mknod) -int fifo_open __P((struct vop_open_args *)); -int fifo_close __P((struct vop_close_args *)); -#define fifo_access ((int (*) __P((struct vop_access_args *)))fifo_ebadf) -#define fifo_getattr ((int (*) __P((struct vop_getattr_args *)))fifo_ebadf) -#define fifo_setattr ((int (*) __P((struct vop_setattr_args *)))fifo_ebadf) -int fifo_read __P((struct vop_read_args *)); -int fifo_write __P((struct vop_write_args *)); -#define fifo_lease_check ((int (*) __P((struct vop_lease_args *)))nullop) -int fifo_ioctl __P((struct vop_ioctl_args *)); -int fifo_select __P((struct vop_select_args *)); -#define fifo_revoke vop_revoke -#define fifo_mmap ((int (*) __P((struct vop_mmap_args *)))err_mmap) -#define fifo_fsync ((int (*) __P((struct vop_fsync_args *)))nullop) -#define fifo_seek ((int (*) __P((struct vop_seek_args *)))err_seek) -#define fifo_remove ((int (*) __P((struct vop_remove_args *)))err_remove) -#define fifo_link ((int (*) __P((struct vop_link_args *)))err_link) -#define 
fifo_rename ((int (*) __P((struct vop_rename_args *)))err_rename) -#define fifo_mkdir ((int (*) __P((struct vop_mkdir_args *)))err_mkdir) -#define fifo_rmdir ((int (*) __P((struct vop_rmdir_args *)))err_rmdir) -#define fifo_symlink ((int (*) __P((struct vop_symlink_args *)))err_symlink) -#define fifo_readdir ((int (*) __P((struct vop_readdir_args *)))err_readdir) -#define fifo_readlink ((int (*) __P((struct vop_readlink_args *)))err_readlink) -#define fifo_abortop ((int (*) __P((struct vop_abortop_args *)))err_abortop) -int fifo_inactive __P((struct vop_inactive_args *)); -#define fifo_reclaim ((int (*) __P((struct vop_reclaim_args *)))nullop) -#define fifo_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock) -#define fifo_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock) -int fifo_bmap __P((struct vop_bmap_args *)); -#define fifo_strategy ((int (*) __P((struct vop_strategy_args *)))err_strategy) -int fifo_print __P((struct vop_print_args *)); -#define fifo_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked) -int fifo_pathconf __P((struct vop_pathconf_args *)); -int fifo_advlock __P((struct vop_advlock_args *)); -#define fifo_blkatoff ((int (*) __P((struct vop_blkatoff_args *)))err_blkatoff) -#define fifo_valloc ((int (*) __P((struct vop_valloc_args *)))err_valloc) -#define fifo_reallocblks \ - ((int (*) __P((struct vop_reallocblks_args *)))err_reallocblks) -#define fifo_vfree ((int (*) __P((struct vop_vfree_args *)))err_vfree) -#define fifo_truncate ((int (*) __P((struct vop_truncate_args *)))nullop) -#define fifo_update ((int (*) __P((struct vop_update_args *)))nullop) -#define fifo_bwrite ((int (*) __P((struct vop_bwrite_args *)))nullop) -#define fifo_blktooff ((int (*) __P((struct vop_blktooff_args *)))err_blktooff) +#endif /* BSD_KERNEL_PRIVATE */ -#endif /* __APPLE_API_PRIVATE */ #endif /* __FIFOFS_FOFO_H__ */ diff --git a/bsd/miscfs/fifofs/fifo_vnops.c b/bsd/miscfs/fifofs/fifo_vnops.c index c35fe724b..72358ae91 100644 --- a/bsd/miscfs/fifofs/fifo_vnops.c +++ b/bsd/miscfs/fifofs/fifo_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -59,82 +59,61 @@ #include <sys/proc.h> #include <sys/time.h> #include <sys/namei.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/stat.h> #include <sys/systm.h> #include <sys/ioctl.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/errno.h> #include <sys/malloc.h> #include <miscfs/fifofs/fifo.h> #include <vfs/vfs_support.h> -/* - * This structure is associated with the FIFO vnode and stores - * the state associated with the FIFO. 
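The fifoinfo state that used to live privately in fifo_vnops.c moves into fifo.h (the new struct shown earlier in this hunk), with explicit reader and writer counts on the shared per-vnode socket pair and the FIFO_INCREATE/FIFO_CREATEWAIT/FIFO_CREATED flags serializing its one-time creation. A condensed illustration of the accounting only, not the actual fifo_open logic:

static void
my_fifo_note_open(struct fifoinfo *fip, int reading)
{
        if (reading)
                fip->fi_readers++;      /* first reader unblocks writers */
        else
                fip->fi_writers++;      /* first writer unblocks readers */
}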
- */ -struct fifoinfo { - struct socket *fi_readsock; - struct socket *fi_writesock; - long fi_readers; - long fi_writers; -}; - #define VOPFUNC int (*)(void *) +extern int soo_ioctl(struct fileproc *fp, u_long cmd, caddr_t data, struct proc *p); +extern int soo_select(struct fileproc *fp, int which, void * wql, struct proc *p); + int (**fifo_vnodeop_p)(void *); struct vnodeopv_entry_desc fifo_vnodeop_entries[] = { - { &vop_default_desc, (VOPFUNC)vn_default_error }, - { &vop_lookup_desc, (VOPFUNC)fifo_lookup }, /* lookup */ - { &vop_create_desc, (VOPFUNC)err_create }, /* create */ - { &vop_mknod_desc, (VOPFUNC)err_mknod }, /* mknod */ - { &vop_open_desc, (VOPFUNC)fifo_open }, /* open */ - { &vop_close_desc, (VOPFUNC)fifo_close }, /* close */ - { &vop_access_desc, (VOPFUNC)fifo_access }, /* access */ - { &vop_getattr_desc, (VOPFUNC)fifo_getattr }, /* getattr */ - { &vop_setattr_desc, (VOPFUNC)fifo_setattr }, /* setattr */ - { &vop_read_desc, (VOPFUNC)fifo_read }, /* read */ - { &vop_write_desc, (VOPFUNC)fifo_write }, /* write */ - { &vop_lease_desc, (VOPFUNC)fifo_lease_check }, /* lease */ - { &vop_ioctl_desc, (VOPFUNC)fifo_ioctl }, /* ioctl */ - { &vop_select_desc, (VOPFUNC)fifo_select }, /* select */ - { &vop_revoke_desc, (VOPFUNC)fifo_revoke }, /* revoke */ - { &vop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ - { &vop_fsync_desc, (VOPFUNC)fifo_fsync }, /* fsync */ - { &vop_seek_desc, (VOPFUNC)err_seek }, /* seek */ - { &vop_remove_desc, (VOPFUNC)err_remove }, /* remove */ - { &vop_link_desc, (VOPFUNC)err_link }, /* link */ - { &vop_rename_desc, (VOPFUNC)err_rename }, /* rename */ - { &vop_mkdir_desc, (VOPFUNC)err_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (VOPFUNC)err_rmdir }, /* rmdir */ - { &vop_symlink_desc, (VOPFUNC)err_symlink }, /* symlink */ - { &vop_readdir_desc, (VOPFUNC)err_readdir }, /* readdir */ - { &vop_readlink_desc, (VOPFUNC)err_readlink }, /* readlink */ - { &vop_abortop_desc, (VOPFUNC)err_abortop }, /* abortop */ - { &vop_inactive_desc, (VOPFUNC)fifo_inactive }, /* inactive */ - { &vop_reclaim_desc, (VOPFUNC)fifo_reclaim }, /* reclaim */ - { &vop_lock_desc, (VOPFUNC)fifo_lock }, /* lock */ - { &vop_unlock_desc, (VOPFUNC)fifo_unlock }, /* unlock */ - { &vop_bmap_desc, (VOPFUNC)fifo_bmap }, /* bmap */ - { &vop_strategy_desc, (VOPFUNC)err_strategy }, /* strategy */ - { &vop_print_desc, (VOPFUNC)fifo_print }, /* print */ - { &vop_islocked_desc, (VOPFUNC)fifo_islocked }, /* islocked */ - { &vop_pathconf_desc, (VOPFUNC)fifo_pathconf }, /* pathconf */ - { &vop_advlock_desc, (VOPFUNC)fifo_advlock }, /* advlock */ - { &vop_blkatoff_desc, (VOPFUNC)err_blkatoff }, /* blkatoff */ - { &vop_valloc_desc, (VOPFUNC)err_valloc }, /* valloc */ - { &vop_vfree_desc, (VOPFUNC)err_vfree }, /* vfree */ - { &vop_truncate_desc, (VOPFUNC)fifo_truncate }, /* truncate */ - { &vop_update_desc, (VOPFUNC)fifo_update }, /* update */ - { &vop_bwrite_desc, (VOPFUNC)fifo_bwrite }, /* bwrite */ - { &vop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ - { &vop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ - { &vop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ - { &vop_blktooff_desc, (VOPFUNC)err_blktooff }, /* blktooff */ - { &vop_offtoblk_desc, (VOPFUNC)err_offtoblk }, /* offtoblk */ - { &vop_cmap_desc, (VOPFUNC)err_cmap }, /* cmap */ + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)fifo_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)err_create }, /* create */ + { &vnop_mknod_desc, (VOPFUNC)err_mknod }, /* mknod */ + { &vnop_open_desc, 
(VOPFUNC)fifo_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)fifo_close }, /* close */ + { &vnop_access_desc, (VOPFUNC)fifo_access }, /* access */ + { &vnop_getattr_desc, (VOPFUNC)fifo_getattr }, /* getattr */ + { &vnop_setattr_desc, (VOPFUNC)fifo_setattr }, /* setattr */ + { &vnop_read_desc, (VOPFUNC)fifo_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)fifo_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)fifo_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)fifo_select }, /* select */ + { &vnop_revoke_desc, (VOPFUNC)fifo_revoke }, /* revoke */ + { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)fifo_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)err_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)err_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)err_rename }, /* rename */ + { &vnop_mkdir_desc, (VOPFUNC)err_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)err_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)err_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)err_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)err_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)fifo_inactive }, /* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)fifo_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)err_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)fifo_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)fifo_advlock }, /* advlock */ + { &vnop_bwrite_desc, (VOPFUNC)fifo_bwrite }, /* bwrite */ + { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ + { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ + { &vnop_blktooff_desc, (VOPFUNC)err_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)err_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (VOPFUNC)err_blockmap }, /* blockmap */ { (struct vnodeop_desc*)NULL, (int(*)())NULL } }; struct vnodeopv_desc fifo_vnodeop_opv_desc = @@ -144,11 +123,13 @@ struct vnodeopv_desc fifo_vnodeop_opv_desc = * Trivial lookup routine that always fails. */ /* ARGSUSED */ +int fifo_lookup(ap) - struct vop_lookup_args /* { + struct vnop_lookup_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; + vfs_context_t a_context; } */ *ap; { @@ -161,58 +142,94 @@ fifo_lookup(ap) * to find an active instance of a fifo. 
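 *
 * (In outline, a sketch of the creation handshake added below, built on
 * the FIFO_INCREATE/FIFO_CREATEWAIT/FIFO_CREATED bits introduced in
 * fifo.h earlier in this patch; illustrative pseudocode, not a verbatim
 * quote of the function body:
 *
 *	vnode_lock(vp);
 *	while ((fip->fi_flags & FIFO_CREATED) == 0) {
 *		if (fip->fi_flags & FIFO_INCREATE) {	<- another opener won
 *			fip->fi_flags |= FIFO_CREATEWAIT;
 *			msleep(&fip->fi_flags, &vp->v_lock, ...);
 *		} else {				<- we build the pair
 *			fip->fi_flags |= FIFO_INCREATE;
 *			... socreate() twice, then soconnect2() ...
 *			fip->fi_flags |= FIFO_CREATED;
 *			wakeup(&fip->fi_flags);
 *		}
 *	}
 *
 * so exactly one opener wires up the socket pair while late arrivals
 * sleep until it is published.)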
*/ /* ARGSUSED */ +int fifo_open(ap) - struct vop_open_args /* { + struct vnop_open_args /* { struct vnode *a_vp; int a_mode; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { struct vnode *vp = ap->a_vp; struct fifoinfo *fip; - struct proc *p = ap->a_p; struct socket *rso, *wso; int error; - if ((fip = vp->v_fifoinfo) == NULL) { - MALLOC(fip, struct fifoinfo *, - sizeof(*fip), M_TEMP, M_WAITOK); - vp->v_fifoinfo = fip; - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - if (error = socreate(AF_LOCAL, &rso, SOCK_STREAM, 0)) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - vp->v_fifoinfo = NULL; - FREE(fip, M_TEMP); - return (error); - } - fip->fi_readsock = rso; - if (error = socreate(AF_LOCAL, &wso, SOCK_STREAM, 0)) { - (void)soclose(rso); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - vp->v_fifoinfo = NULL; - FREE(fip, M_TEMP); - return (error); - } - fip->fi_writesock = wso; - if (error = unp_connect2(wso, rso)) { - (void)soclose(wso); - (void)soclose(rso); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - vp->v_fifoinfo = NULL; - FREE(fip, M_TEMP); - return (error); + vnode_lock(vp); + +retry: + + fip = vp->v_fifoinfo; + + if (fip == (struct fifoinfo *)0) + panic("fifo_open with no fifoinfo"); + + if ((fip->fi_flags & FIFO_CREATED) == 0) { + if (fip->fi_flags & FIFO_INCREATE) { + fip->fi_flags |= FIFO_CREATEWAIT; + error = msleep(&fip->fi_flags, &vp->v_lock, PRIBIO | PCATCH, "fifocreatewait", 0); + if (error) { + vnode_unlock(vp); + return(error); + } + goto retry; + } else { + fip->fi_flags |= FIFO_INCREATE; + vnode_unlock(vp); + if ( (error = socreate(AF_LOCAL, &rso, SOCK_STREAM, 0)) ) { + goto bad1; + } + fip->fi_readsock = rso; + + if ( (error = socreate(AF_LOCAL, &wso, SOCK_STREAM, 0)) ) { + (void)soclose(rso); + goto bad1; + } + fip->fi_writesock = wso; + + if ( (error = soconnect2(wso, rso)) ) { + (void)soclose(wso); + (void)soclose(rso); + goto bad1; + } + fip->fi_readers = fip->fi_writers = 0; + + socket_lock(wso, 1); + wso->so_state |= SS_CANTRCVMORE; + wso->so_snd.sb_lowat = PIPE_BUF; +#if 0 + /* Because the whole unp layer is protected by a single mutex, + * doing this in two steps may actually cause problems, + * as it opens a window between the drop and the acquire. + */ + socket_unlock(wso, 1); + + socket_lock(rso, 1); +#endif + rso->so_state |= SS_CANTSENDMORE; + socket_unlock(wso, 1); + + vnode_lock(vp); + fip->fi_flags |= FIFO_CREATED; + fip->fi_flags &= ~FIFO_INCREATE; + + if ((fip->fi_flags & FIFO_CREATEWAIT)) { + fip->fi_flags &= ~FIFO_CREATEWAIT; + wakeup(&fip->fi_flags); + } + /* vnode lock is held to process further */ } - wso->so_state |= SS_CANTRCVMORE; - wso->so_snd.sb_lowat = PIPE_BUF; - rso->so_state |= SS_CANTSENDMORE; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - fip->fi_readers = fip->fi_writers = 0; } + + /* vnode is locked at this point */ + /* fifo is created already */ if (ap->a_mode & FREAD) { fip->fi_readers++; if (fip->fi_readers == 1) { + socket_lock(fip->fi_writesock, 1); fip->fi_writesock->so_state &= ~SS_CANTSENDMORE; + socket_unlock(fip->fi_writesock, 1); + if (fip->fi_writers > 0) wakeup((caddr_t)&fip->fi_writers); } @@ -220,17 +237,18 @@ fifo_open(ap) if (ap->a_mode & FWRITE) { fip->fi_writers++; if (fip->fi_writers == 1) { + socket_lock(fip->fi_readsock, 1); fip->fi_readsock->so_state &= ~SS_CANTRCVMORE; + socket_unlock(fip->fi_readsock, 1); + if (fip->fi_readers > 0) wakeup((caddr_t)&fip->fi_readers); } } if ((ap->a_mode & FREAD) && (ap->a_mode & O_NONBLOCK) == 0) { if
(fip->fi_writers == 0) { - VOP_UNLOCK(vp, 0, p); - error = tsleep((caddr_t)&fip->fi_readers, - PCATCH | PSOCK, "fifoor", 0); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + error = msleep((caddr_t)&fip->fi_readers, &vp->v_lock, + PCATCH | PSOCK, "fifoor", 0); if (error) goto bad; if (fip->fi_readers == 1) { @@ -242,15 +260,13 @@ fifo_open(ap) if (ap->a_mode & FWRITE) { if (ap->a_mode & O_NONBLOCK) { if (fip->fi_readers == 0) { - error = ENXIO; - goto bad; + error = ENXIO; + goto bad; } } else { if (fip->fi_readers == 0) { - VOP_UNLOCK(vp, 0, p); - error = tsleep((caddr_t)&fip->fi_writers, - PCATCH | PSOCK, "fifoow", 0); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + error = msleep((caddr_t)&fip->fi_writers, &vp->v_lock, + PCATCH | PSOCK, "fifoow", 0); if (error) goto bad; if (fip->fi_writers == 1) { @@ -260,39 +276,57 @@ fifo_open(ap) } } } + + vnode_unlock(vp); return (0); bad: - if (error) - VOP_CLOSE(vp, ap->a_mode, ap->a_cred, p); + fifo_close_internal(vp, ap->a_mode, ap->a_context, 1); + + vnode_unlock(vp); + return (error); +bad1: + vnode_lock(vp); + + fip->fi_flags &= ~FIFO_INCREATE; + + if ((fip->fi_flags & FIFO_CREATEWAIT)) { + fip->fi_flags &= ~FIFO_CREATEWAIT; + wakeup(&fip->fi_flags); + } + vnode_unlock(vp); + return (error); } /* * Vnode op for read */ -/* ARGSUSED */ +int fifo_read(ap) - struct vop_read_args /* { + struct vnop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { struct uio *uio = ap->a_uio; struct socket *rso = ap->a_vp->v_fifoinfo->fi_readsock; - struct proc *p = uio->uio_procp; int error, startresid; + int rflags; #if DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("fifo_read mode"); #endif - if (uio->uio_resid == 0) + if (uio_resid(uio) == 0) return (0); - if (ap->a_ioflag & IO_NDELAY) - rso->so_state |= SS_NBIO; - startresid = uio->uio_resid; + + rflags = (ap->a_ioflag & IO_NDELAY) ? MSG_NBIO : 0; + + // LP64todo - fix this! + startresid = uio_resid(uio); + /* fifo conformance - if we have a reader open on the fifo but no * writers then we need to make sure we do not block. We do that by * checking the receive buffer and if empty set error to EWOULDBLOCK. @@ -300,19 +334,15 @@ fifo_read(ap) */ error = 0; if (ap->a_vp->v_fifoinfo->fi_writers < 1) { - error = (rso->so_rcv.sb_cc == 0) ? EWOULDBLOCK : 0; + socket_lock(rso, 1); + error = (rso->so_rcv.sb_cc == 0) ? EWOULDBLOCK : 0; + socket_unlock(rso, 1); } /* skip soreceive to avoid blocking when we have no writers */ if (error != EWOULDBLOCK) { - VOP_UNLOCK(ap->a_vp, 0, p); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); error = soreceive(rso, (struct sockaddr **)0, uio, (struct mbuf **)0, - (struct mbuf **)0, (int *)0); - - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY, p); + (struct mbuf **)0, &rflags); } else { /* clear EWOULDBLOCK and return EOF (zero) */ @@ -321,102 +351,102 @@ fifo_read(ap) /* * Clear EOF indication after first such return.
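 *
 * (Seen from user space, a minimal sketch; the path /tmp/fifo_demo is
 * hypothetical:
 *
 *	int fd = open("/tmp/fifo_demo", O_RDONLY | O_NONBLOCK);
 *	char c;
 *	ssize_t n = read(fd, &c, 1);	<- no writer yet: n == 0, i.e. EOF
 *
 * Because SS_CANTRCVMORE is cleared again below when nothing was
 * transferred, the EOF indication is not sticky: a later read() can
 * still block for, or receive, data from a subsequent writer.)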
*/ - if (uio->uio_resid == startresid) + if (uio_resid(uio) == startresid) { + socket_lock(rso, 1); rso->so_state &= ~SS_CANTRCVMORE; - if (ap->a_ioflag & IO_NDELAY) - rso->so_state &= ~SS_NBIO; + socket_unlock(rso, 1); + } return (error); } /* * Vnode op for write */ -/* ARGSUSED */ +int fifo_write(ap) - struct vop_write_args /* { + struct vnop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { struct socket *wso = ap->a_vp->v_fifoinfo->fi_writesock; - struct proc *p = ap->a_uio->uio_procp; int error; #if DIAGNOSTIC if (ap->a_uio->uio_rw != UIO_WRITE) panic("fifo_write mode"); #endif - if (ap->a_ioflag & IO_NDELAY) - wso->so_state |= SS_NBIO; - VOP_UNLOCK(ap->a_vp, 0, p); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - error = sosend(wso, (struct sockaddr *)0, ap->a_uio, 0, (struct mbuf *)0, 0); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY, p); - if (ap->a_ioflag & IO_NDELAY) - wso->so_state &= ~SS_NBIO; + error = sosend(wso, (struct sockaddr *)0, ap->a_uio, 0, + (struct mbuf *)0, (ap->a_ioflag & IO_NDELAY) ? MSG_NBIO : 0); + return (error); } /* * Device ioctl operation. */ -/* ARGSUSED */ +int fifo_ioctl(ap) - struct vop_ioctl_args /* { + struct vnop_ioctl_args /* { struct vnode *a_vp; int a_command; caddr_t a_data; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { - struct file filetmp; + struct proc *p = vfs_context_proc(ap->a_context); + struct fileproc filetmp; + struct fileglob filefg; int error; if (ap->a_command == FIONBIO) return (0); + bzero(&filetmp, sizeof(struct fileproc)); + filetmp.f_fglob = &filefg; if (ap->a_fflag & FREAD) { - filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_readsock; - error = soo_ioctl(&filetmp, ap->a_command, ap->a_data, ap->a_p); + filetmp.f_fglob->fg_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_readsock; + error = soo_ioctl(&filetmp, ap->a_command, ap->a_data, p); if (error) return (error); } if (ap->a_fflag & FWRITE) { - filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_writesock; - error = soo_ioctl(&filetmp, ap->a_command, ap->a_data, ap->a_p); + filetmp.f_fglob->fg_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_writesock; + error = soo_ioctl(&filetmp, ap->a_command, ap->a_data, p); if (error) return (error); } return (0); } -/* ARGSUSED */ +int fifo_select(ap) - struct vop_select_args /* { + struct vnop_select_args /* { struct vnode *a_vp; int a_which; int a_fflags; - struct ucred *a_cred; void * a_wql; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { - struct file filetmp; + struct proc *p = vfs_context_proc(ap->a_context); + struct fileproc filetmp; + struct fileglob filefg; int ready; + bzero(&filetmp, sizeof(struct fileproc)); + filetmp.f_fglob = &filefg; if (ap->a_which & FREAD) { - filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_readsock; - ready = soo_select(&filetmp, ap->a_which, ap->a_wql, ap->a_p); + filetmp.f_fglob->fg_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_readsock; + ready = soo_select(&filetmp, ap->a_which, ap->a_wql, p); if (ready) return (ready); } if (ap->a_which & FWRITE) { - filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_writesock; - ready = soo_select(&filetmp, ap->a_which, ap->a_wql, ap->a_p); + filetmp.f_fglob->fg_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_writesock; + ready = soo_select(&filetmp, ap->a_which, ap->a_wql, p); if (ready) return (ready); } @@ -424,101 +454,95 @@ fifo_select(ap) } int -fifo_inactive(ap) - struct 
vop_inactive_args /* { - struct vnode *a_vp; - struct proc *a_p; - } */ *ap; +fifo_inactive(__unused struct vnop_inactive_args *ap) { - - VOP_UNLOCK(ap->a_vp, 0, ap->a_p); return (0); } -/* - * This is a noop, simply returning what one has been given. - */ -fifo_bmap(ap) - struct vop_bmap_args /* { - struct vnode *a_vp; - daddr_t a_bn; - struct vnode **a_vpp; - daddr_t *a_bnp; - int *a_runp; - } */ *ap; -{ - - if (ap->a_vpp != NULL) - *ap->a_vpp = ap->a_vp; - if (ap->a_bnp != NULL) - *ap->a_bnp = ap->a_bn; - if (ap->a_runp != NULL) - *ap->a_runp = 0; - return (0); -} /* * Device close routine */ -/* ARGSUSED */ +int fifo_close(ap) - struct vop_close_args /* { + struct vnop_close_args /* { struct vnode *a_vp; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp = ap->a_vp; + return fifo_close_internal(ap->a_vp, ap->a_fflag, ap->a_context, 0); +} + +int +fifo_close_internal(vnode_t vp, int fflag, __unused vfs_context_t context, int locked) +{ register struct fifoinfo *fip = vp->v_fifoinfo; int error1, error2; + struct socket *rso; + struct socket *wso; - if (ap->a_fflag & FREAD) { + if (!locked) + vnode_lock(vp); + + if ((fip->fi_flags & FIFO_CREATED) == 0) { + if (!locked) + vnode_unlock(vp); + return(0); + + } + + if (fflag & FREAD) { fip->fi_readers--; if (fip->fi_readers == 0){ - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + socket_lock(fip->fi_writesock, 1); socantsendmore(fip->fi_writesock); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + socket_unlock(fip->fi_writesock, 1); } } - if (ap->a_fflag & FWRITE) { + + if (fflag & FWRITE) { fip->fi_writers--; if (fip->fi_writers == 0) { - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + socket_lock(fip->fi_readsock, 1); socantrcvmore(fip->fi_readsock); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + socket_unlock(fip->fi_readsock, 1); } } - if (vp->v_usecount > 1) +#if 0 + if (vnode_isinuse_locked(vp, 0, 1)) { + if (!locked) + vnode_unlock(vp); + return (0); + } +#endif + + if (fip->fi_writers || fip->fi_readers) { + if (!locked) + vnode_unlock(vp); return (0); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - error1 = soclose(fip->fi_readsock); - error2 = soclose(fip->fi_writesock); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - vp->v_fifoinfo = NULL; - FREE(fip, M_TEMP); + } + + wso = fip->fi_writesock; + rso = fip->fi_readsock; + fip->fi_readsock = 0; + fip->fi_writesock = 0; + fip->fi_flags &= ~FIFO_CREATED; + if (!locked) + vnode_unlock(vp); + error1 = soclose(rso); + error2 = soclose(wso); + if (error1) return (error1); return (error2); } -/* - * Print out the contents of a fifo vnode. - */ -fifo_print(ap) - struct vop_print_args /* { - struct vnode *a_vp; - } */ *ap; -{ - - printf("tag VT_NON"); - fifo_printinfo(ap->a_vp); - printf("\n"); -} /* * Print out internal contents of a fifo vnode. */ +void fifo_printinfo(vp) struct vnode *vp; { @@ -531,11 +555,13 @@ fifo_printinfo(vp) /* * Return POSIX pathconf information applicable to fifo's. */ +int fifo_pathconf(ap) - struct vop_pathconf_args /* { + struct vnop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; + vfs_context_t a_context; } */ *ap; { @@ -558,7 +584,8 @@ fifo_pathconf(ap) /* * Fifo failed operation */ -fifo_ebadf() +int +fifo_ebadf(__unused void *dummy) { return (EBADF); @@ -567,17 +594,10 @@ fifo_ebadf() /* * Fifo advisory byte-level locks. 
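 *
 * (Byte-range locks are not supported on fifos; the op below now fails
 * with ENOTSUP. A hypothetical caller doing
 *
 *	struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET };
 *	fcntl(fifo_fd, F_SETLK, &fl);
 *
 * should therefore expect an error rather than a granted lock.)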
*/ -/* ARGSUSED */ -fifo_advlock(ap) - struct vop_advlock_args /* { - struct vnode *a_vp; - caddr_t a_id; - int a_op; - struct flock *a_fl; - int a_flags; - } */ *ap; +int +fifo_advlock(__unused struct vnop_advlock_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } diff --git a/bsd/miscfs/nullfs/null.h b/bsd/miscfs/nullfs/null.h index a4ccc0e36..734b1d772 100644 --- a/bsd/miscfs/nullfs/null.h +++ b/bsd/miscfs/nullfs/null.h @@ -75,6 +75,15 @@ struct null_mount { }; #ifdef KERNEL +/* LP64 version of null_args. all pointers + * grow when we're dealing with a 64-bit process. + * WARNING - keep in sync with null_args + */ +/* LP64todo - should this move? */ +struct user_null_args { + user_addr_t target; /* Target of loopback */ +}; + /* * A cache of vnode references */ @@ -84,13 +93,13 @@ struct null_node { struct vnode *null_vnode; /* Back pointer */ }; -extern int null_node_create __P((struct mount *mp, struct vnode *target, struct vnode **vpp)); +extern int null_node_create(struct mount *mp, struct vnode *target, struct vnode **vpp); #define MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data)) #define VTONULL(vp) ((struct null_node *)(vp)->v_data) #define NULLTOV(xp) ((xp)->null_vnode) #ifdef NULLFS_DIAGNOSTIC -extern struct vnode *null_checkvp __P((struct vnode *vp, char *fil, int lno)); +extern struct vnode *null_checkvp(struct vnode *vp, char *fil, int lno); #define NULLVPTOLOWERVP(vp) null_checkvp((vp), __FILE__, __LINE__) #else #define NULLVPTOLOWERVP(vp) (VTONULL(vp)->null_lowervp) diff --git a/bsd/miscfs/nullfs/null_subr.c b/bsd/miscfs/nullfs/null_subr.c index 133cdb932..cd7f6618e 100644 --- a/bsd/miscfs/nullfs/null_subr.c +++ b/bsd/miscfs/nullfs/null_subr.c @@ -66,7 +66,7 @@ #include <sys/time.h> #include <sys/types.h> #include <sys/vnode.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> #include <sys/namei.h> #include <sys/malloc.h> #include <sys/ubc.h> @@ -79,8 +79,8 @@ * Null layer cache: * Each cache entry holds a reference to the lower vnode * along with a pointer to the alias vnode. When an - * entry is added the lower vnode is VREF'd. When the - * alias is removed the lower vnode is vrele'd. + * entry is added the lower vnode is vnode_get'd. When the + * alias is removed the lower vnode is vnode_put'd. */ #define NULL_NHASH(vp) \ @@ -101,7 +101,7 @@ nullfs_init() } /* - * Return a VREF'ed alias for lower vnode if already exists, else 0. + * Return a vnode_get'ed alias for lower vnode if already exists, else 0. */ static struct vnode * null_node_find(mp, lowervp) @@ -117,19 +117,15 @@ null_node_find(mp, lowervp) * Find hash base, and then search the (two-way) linked * list looking for a null_node structure which is referencing * the lower vnode. If found, the increment the null_node - * reference count (but NOT the lower vnode's VREF counter). + * reference count (but NOT the lower vnode's vnode_get counter). */ hd = NULL_NHASH(lowervp); loop: for (a = hd->lh_first; a != 0; a = a->null_hash.le_next) { if (a->null_lowervp == lowervp && NULLTOV(a)->v_mount == mp) { vp = NULLTOV(a); - /* - * We need vget for the VXLOCK - * stuff, but we don't want to lock - * the lower node. 
- */ - if (vget(vp, 0, p)) { + + if (vnode_get(vp)) { printf ("null_node_find: vget failed.\n"); goto loop; }; @@ -182,7 +178,7 @@ null_node_alloc(mp, lowervp, vpp) }; if (vp->v_type == VREG) ubc_info_init(vp); - VREF(lowervp); /* Extra VREF will be vrele'd in null_node_create */ + vnode_get(lowervp); /* Extra vnode_get will be vnode_put'd in null_node_create */ hd = NULL_NHASH(lowervp); LIST_INSERT_HEAD(hd, xp, null_hash); return 0; @@ -210,7 +206,7 @@ null_node_create(mp, lowervp, newvpp) #ifdef NULLFS_DIAGNOSTIC vprint("null_node_create: exists", NULLTOV(ap)); #endif - /* VREF(aliasvp); --- done in null_node_find */ + /* vnode_get(aliasvp); --- done in null_node_find */ } else { int error; @@ -228,11 +224,11 @@ null_node_create(mp, lowervp, newvpp) return error; /* - * aliasvp is already VREF'd by getnewvnode() + * aliasvp is already vnode_get'd by getnewvnode() */ } - vrele(lowervp); + vnode_put(lowervp); #if DIAGNOSTIC if (lowervp->v_usecount < 1) { @@ -261,7 +257,7 @@ null_checkvp(vp, fil, lno) struct null_node *a = VTONULL(vp); #ifdef notyet /* - * Can't do this check because vop_reclaim runs + * Can't do this check because vnop_reclaim runs * with a funny vop vector. */ if (vp->v_op != null_vnodeop_p) { diff --git a/bsd/miscfs/nullfs/null_vfsops.c b/bsd/miscfs/nullfs/null_vfsops.c index 66f61af3d..d916c8cd8 100644 --- a/bsd/miscfs/nullfs/null_vfsops.c +++ b/bsd/miscfs/nullfs/null_vfsops.c @@ -68,10 +68,11 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <sys/time.h> #include <sys/types.h> #include <sys/vnode.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> #include <sys/namei.h> #include <sys/malloc.h> #include <miscfs/nullfs/null.h> @@ -79,16 +80,15 @@ /* * Mount null layer */ -int -nullfs_mount(mp, path, data, ndp, p) +static int +nullfs_mount(mp, devvp, data, context) struct mount *mp; - char *path; - caddr_t data; - struct nameidata *ndp; - struct proc *p; + vnode_t devvp; + user_addr_t data; + vfs_context_t context; { int error = 0; - struct null_args args; + struct user_null_args args; struct vnode *lowerrootvp, *vp; struct vnode *nullm_rootvp; struct null_mount *xmp; @@ -102,30 +102,38 @@ nullfs_mount(mp, path, data, ndp, p) * Update is a no-op */ if (mp->mnt_flag & MNT_UPDATE) { - return (EOPNOTSUPP); - /* return VFS_MOUNT(MOUNTTONULLMOUNT(mp)->nullm_vfs, path, data, ndp, p);*/ + return (ENOTSUP); + /* return VFS_MOUNT(MOUNTTONULLMOUNT(mp)->nullm_vfs, devvp, data, p);*/ } /* * Get argument */ - if (error = copyin(data, (caddr_t)&args, sizeof(struct null_args))) + if (vfs_context_is64bit(context)) { + error = copyin(data, (caddr_t)&args, sizeof (args)); + } + else { + struct null_args temp; + error = copyin(data, (caddr_t)&temp, sizeof (temp)); + args.target = CAST_USER_ADDR_T(temp.target); + } + if (error) return (error); /* * Find lower node */ NDINIT(ndp, LOOKUP, FOLLOW|WANTPARENT|LOCKLEAF, - UIO_USERSPACE, args.target, p); + UIO_USERSPACE, args.target, context); if (error = namei(ndp)) return (error); - + nameidone(ndp); /* * Sanity check on lower vnode */ lowerrootvp = ndp->ni_vp; - vrele(ndp->ni_dvp); + vnode_put(ndp->ni_dvp); ndp->ni_dvp = NULL; xmp = (struct null_mount *) _MALLOC(sizeof(struct null_mount), @@ -141,22 +149,18 @@ nullfs_mount(mp, path, data, ndp, p) * a reference on the root vnode. 
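 *
 * (Reference flow, in outline: null_node_alloc() takes an extra
 * vnode_get() on lowervp, and null_node_create() balances it with a
 * vnode_put(lowervp) once the alias is wired up, so the alias vnode is
 * what carries the long-term reference:
 *
 *	vnode_get(lowervp);	<- in null_node_alloc()
 *	...
 *	vnode_put(lowervp);	<- in null_node_create()
 * )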
*/ error = null_node_create(mp, lowerrootvp, &vp); - /* - * Unlock the node (either the lower or the alias) - */ - VOP_UNLOCK(vp, 0, p); /* * Make sure the node alias worked */ if (error) { - vrele(lowerrootvp); + vnode_put(lowerrootvp); FREE(xmp, M_UFSMNT); /* XXX */ return (error); } /* * Keep a held reference to the root vnode. - * It is vrele'd in nullfs_unmount. + * It is vnode_put'd in nullfs_unmount. */ nullm_rootvp = vp; nullm_rootvp->v_flag |= VROOT; @@ -166,14 +170,12 @@ nullfs_mount(mp, path, data, ndp, p) mp->mnt_data = (qaddr_t) xmp; vfs_getnewfsid(mp); - (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); - bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); - (void) copyinstr(args.target, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + (void) copyinstr(args.target, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, &size); - bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + bzero(mp->mnt_vfsstat.f_mntfromname + size, MAXPATHLEN - size); #ifdef NULLFS_DIAGNOSTIC printf("nullfs_mount: lower %s, alias at %s\n", - mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname); + mp->mnt_vfsstat.f_mntfromname, mp->mnt_vfsstat.f_mntonname); #endif return (0); } @@ -183,24 +185,24 @@ * on the underlying filesystem will have been called * when that filesystem was mounted. */ -int -nullfs_start(mp, flags, p) +static int +nullfs_start(mp, flags, context) struct mount *mp; int flags; - struct proc *p; + vfs_context_t context; { return (0); - /* return VFS_START(MOUNTTONULLMOUNT(mp)->nullm_vfs, flags, p); */ + /* return VFS_START(MOUNTTONULLMOUNT(mp)->nullm_vfs, flags, context); */ } /* * Free reference to null layer */ -int -nullfs_unmount(mp, mntflags, p) +static int +nullfs_unmount(mp, mntflags, context) struct mount *mp; int mntflags; - struct proc *p; + vfs_context_t context; { struct vnode *nullm_rootvp = MOUNTTONULLMOUNT(mp)->nullm_rootvp; int error; @@ -227,11 +229,11 @@ /* * Release reference on underlying root vnode */ - vrele(nullm_rootvp); + vnode_put(nullm_rootvp); /* * And blow it away for future re-use */ - vgone(nullm_rootvp); + vnode_reclaim(nullm_rootvp); /* * Finally, throw away the null_mount structure */ @@ -240,10 +242,11 @@ return 0; } -int -nullfs_root(mp, vpp) +static int +nullfs_root(mp, vpp, context) struct mount *mp; struct vnode **vpp; + vfs_context_t context; { struct proc *p = curproc; /* XXX */ struct vnode *vp; @@ -259,31 +262,30 @@ * Return locked reference to root.
*/ vp = MOUNTTONULLMOUNT(mp)->nullm_rootvp; - VREF(vp); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + vnode_get(vp); *vpp = vp; return 0; } -int -nullfs_quotactl(mp, cmd, uid, arg, p) +static int +nullfs_quotactl(mp, cmd, uid, datap, context) struct mount *mp; int cmd; uid_t uid; - caddr_t arg; - struct proc *p; + caddr_t datap; + vfs_context_t context; { - return VFS_QUOTACTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, uid, arg, p); + return VFS_QUOTACTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, uid, datap, context); } -int -nullfs_statfs(mp, sbp, p) +static int +nullfs_statfs(mp, sbp, context) struct mount *mp; - struct statfs *sbp; - struct proc *p; + struct vfsstatfs *sbp; + vfs_context_t context; { int error; - struct statfs mstat; + struct vfsstatfs mstat; #ifdef NULLFS_DIAGNOSTIC printf("nullfs_statfs(mp = %x, vp = %x->%x)\n", mp, @@ -294,12 +296,12 @@ nullfs_statfs(mp, sbp, p) bzero(&mstat, sizeof(mstat)); - error = VFS_STATFS(MOUNTTONULLMOUNT(mp)->nullm_vfs, &mstat, p); + error = VFS_STATFS(MOUNTTONULLMOUNT(mp)->nullm_vfs, &mstat, context); if (error) return (error); /* now copy across the "interesting" information and fake the rest */ - sbp->f_type = mstat.f_type; + //sbp->f_type = mstat.f_type; sbp->f_flags = mstat.f_flags; sbp->f_bsize = mstat.f_bsize; sbp->f_iosize = mstat.f_iosize; @@ -308,20 +310,12 @@ nullfs_statfs(mp, sbp, p) sbp->f_bavail = mstat.f_bavail; sbp->f_files = mstat.f_files; sbp->f_ffree = mstat.f_ffree; - if (sbp != &mp->mnt_stat) { - bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); - bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); - bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); - } return (0); } -int -nullfs_sync(mp, waitfor, cred, p) - struct mount *mp; - int waitfor; - struct ucred *cred; - struct proc *p; +static int +nullfs_sync(__unused struct mount *mp, __unused int waitfor, + __unused kauth_cred_t cred, __unused vfs_context_t context) { /* * XXX - Assumes no data cached at null layer. 
@@ -329,41 +323,42 @@ nullfs_sync(mp, waitfor, cred, p) return (0); } -int -nullfs_vget(mp, ino, vpp) +static int +nullfs_vget(mp, ino, vpp, context) struct mount *mp; - ino_t ino; + ino64_t ino; struct vnode **vpp; + vfs_context_t context; { - return VFS_VGET(MOUNTTONULLMOUNT(mp)->nullm_vfs, ino, vpp); + return VFS_VGET(MOUNTTONULLMOUNT(mp)->nullm_vfs, ino, vpp, context); } -int -nullfs_fhtovp(mp, fidp, nam, vpp, exflagsp, credanonp) +static int +nullfs_fhtovp(mp, fhlen, fhp, vpp, context) struct mount *mp; - struct fid *fidp; - struct mbuf *nam; + int fhlen; + unsigned char *fhp; struct vnode **vpp; - int *exflagsp; - struct ucred**credanonp; + vfs_context_t context; { - return VFS_FHTOVP(MOUNTTONULLMOUNT(mp)->nullm_vfs, fidp, nam, vpp, exflagsp,credanonp); + return VFS_FHTOVP(MOUNTTONULLMOUNT(mp)->nullm_vfs, fhlen, fhp, vpp, context); } -int -nullfs_vptofh(vp, fhp) +static int +nullfs_vptofh(vp, fhlenp, fhp, context) struct vnode *vp; - struct fid *fhp; + int *fhlenp; + unsigned char *fhp; + vfs_context_t context; { - return VFS_VPTOFH(NULLVPTOLOWERVP(vp), fhp); + return VFS_VPTOFH(NULLVPTOLOWERVP(vp), fhlenp, fhp, context); } -int nullfs_init __P((struct vfsconf *)); +int nullfs_init (struct vfsconf *); -#define nullfs_sysctl ((int (*) __P((int *, u_int, void *, size_t *, void *, \ - size_t, struct proc *)))eopnotsupp) +#define nullfs_sysctl (int (*) (int *, u_int, user_addr_t, size_t *, user_addr_t, size_t, proc_t))eopnotsupp struct vfsops null_vfsops = { nullfs_mount, @@ -377,5 +372,5 @@ struct vfsops null_vfsops = { nullfs_fhtovp, nullfs_vptofh, nullfs_init, - nullfs_sysctl, + nullfs_sysctl }; diff --git a/bsd/miscfs/nullfs/null_vnops.c b/bsd/miscfs/nullfs/null_vnops.c index fecb1278a..ae4e8db1b 100644 --- a/bsd/miscfs/nullfs/null_vnops.c +++ b/bsd/miscfs/nullfs/null_vnops.c @@ -113,12 +113,12 @@ * in the arguments and, if a vnode is return by the operation, * stacks a null-node on top of the returned vnode. * - * Although bypass handles most operations, vop_getattr, vop_lock, - * vop_unlock, vop_inactive, vop_reclaim, and vop_print are not + * Although bypass handles most operations, vnop_getattr, vnop_lock, + * vnop_unlock, vnop_inactive, vnop_reclaim, and vnop_print are not * bypassed. Vop_getattr must change the fsid being returned. - * Vop_lock and vop_unlock must handle any locking for the + * Vop_lock and vnop_unlock must handle any locking for the * current vnode as well as pass the lock request down. - * Vop_inactive and vop_reclaim are not bypassed so that + * Vop_inactive and vnop_reclaim are not bypassed so that * they can handle freeing null-layer specific data. Vop_print * is not bypassed to avoid excessive debugging information. * Also, certain vnode operations change the locking state within @@ -150,7 +150,7 @@ * "mount_null /usr/include /dev/layer/null". * Changing directory to /dev/layer/null will assign * the root null-node (which was created when the null layer was mounted). - * Now consider opening "sys". A vop_lookup would be + * Now consider opening "sys". A vnop_lookup would be * done on the root null-node. This operation would bypass through * to the lower layer which would return a vnode representing * the UFS "sys". 
Null_bypass then builds a null-node @@ -196,10 +196,11 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <sys/time.h> #include <sys/types.h> #include <sys/vnode.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> #include <sys/namei.h> #include <sys/malloc.h> #include <sys/buf.h> @@ -219,7 +220,7 @@ int null_bug_bypass = 0; /* for debugging: enables bypass printf'ing */ * As an exception to this, vnodes can be marked "unmapped" by setting * the Nth bit in operation's vdesc_flags. * - * Also, some BSD vnode operations have the side effect of vrele'ing + * Also, some BSD vnode operations have the side effect of vnode_put'ing * their arguments. With stacking, the reference counts are held * by the upper node, not the lower one, so we must handle these * side-effects here. This is not of concern in Sun-derived systems * @@ -227,7 +228,7 @@ int null_bug_bypass = 0; /* for debugging: enables bypass printf'ing */ * * This makes the following assumptions: * - only one returned vpp - * - no INOUT vpp's (Sun's vop_open has one of these) + * - no INOUT vpp's (Sun's vnop_open has one of these) * - the vnode operation vector of the first vnode should be used * to determine what implementation of the op should be invoked * - all mapped vnodes are of our vnode-type (NEEDSWORK: @@ -235,7 +236,7 @@ */ int null_bypass(ap) - struct vop_generic_args /* { + struct vnop_generic_args /* { struct vnodeop_desc *a_desc; <other random data follows, presumably> } */ *ap; @@ -285,11 +286,11 @@ null_bypass(ap) *(vps_p[i]) = NULLVPTOLOWERVP(*this_vp_p); /* * XXX - Several operations have the side effect - * of vrele'ing their vp's. We must account for + * of vnode_put'ing their vp's. We must account for * that. (This should go away in the future.) */ if (reles & 1) - VREF(*this_vp_p); + vnode_get(*this_vp_p); } } @@ -312,21 +313,21 @@ null_bypass(ap) if (old_vps[i]) { *(vps_p[i]) = old_vps[i]; if (reles & 1) - vrele(*(vps_p[i])); + vnode_put(*(vps_p[i])); } } /* * Map the possible out-going vpp * (Assumes that the lower layer always returns - * a VREF'ed vpp unless it gets an error.) + * a vnode_get'ed vpp unless it gets an error.) */ if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && !(descp->vdesc_flags & VDESC_NOMAP_VPP) && !error) { /* * XXX - even though some ops have vpp returned vp's, - * several ops actually vrele this before returning. + * several ops actually vnode_put this before returning. * We must avoid these ops. * (This should go away when these ops are regularized.) */ @@ -347,28 +348,21 @@ * if this layer is mounted read-only.
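 *
 * (Under the reworked VFS the body below shrinks to little more than a
 * bypass call; in outline:
 *
 *	error = null_bypass(ap);	<- maps a_dvp to the lower vnode and
 *					   runs the lower fs's vnop_lookup
 *	if (ap->a_dvp == *ap->a_vpp)	<- "." lookup needs no fixup
 *		return (error);
 *
 * The explicit EROFS tests and the vop_nolock/vop_nounlock bookkeeping
 * deleted below belonged to the old vnode locking protocol.)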
*/ null_lookup(ap) - struct vop_lookup_args /* { + struct vnop_lookup_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; + vfs_context_t a_context; } */ *ap; { struct componentname *cnp = ap->a_cnp; struct proc *p = cnp->cn_proc; int flags = cnp->cn_flags; - struct vop_lock_args lockargs; - struct vop_unlock_args unlockargs; struct vnode *dvp, *vp; int error; - if ((flags & ISLASTCN) && (ap->a_dvp->v_mount->mnt_flag & MNT_RDONLY) && - (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) - return (EROFS); error = null_bypass(ap); - if (error == EJUSTRETURN && (flags & ISLASTCN) && - (ap->a_dvp->v_mount->mnt_flag & MNT_RDONLY) && - (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME)) - error = EROFS; + /* * We must do the same locking and unlocking at this layer as * is done in the layers below us. We could figure this out @@ -381,43 +375,26 @@ null_lookup(ap) vp = *ap->a_vpp; if (dvp == vp) return (error); - if (!VOP_ISLOCKED(dvp)) { - unlockargs.a_vp = dvp; - unlockargs.a_flags = 0; - unlockargs.a_p = p; - vop_nounlock(&unlockargs); - } - if (vp != NULL && VOP_ISLOCKED(vp)) { - lockargs.a_vp = vp; - lockargs.a_flags = LK_SHARED; - lockargs.a_p = p; - vop_nolock(&lockargs); - } return (error); } /* - * Setattr call. Disallow write attempts if the layer is mounted read-only. + * Setattr call. */ int -null_setattr(ap) - struct vop_setattr_args /* { +null_setattr( + struct vnop_setattr_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; + struct vnode_attr *a_vap; + kauth_cred_t a_cred; struct proc *a_p; - } */ *ap; + } */ *ap) { struct vnode *vp = ap->a_vp; - struct vattr *vap = ap->a_vap; - - if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL || - vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || - vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) && - (vp->v_mount->mnt_flag & MNT_RDONLY)) - return (EROFS); - if (vap->va_size != VNOVAL) { + struct vnode_attr *vap = ap->a_vap; + + if (VATTR_IS_ACTIVE(vap, va_data_size)) { switch (vp->v_type) { case VDIR: return (EISDIR); @@ -429,12 +406,6 @@ null_setattr(ap) case VREG: case VLNK: default: - /* - * Disallow write attempts if the filesystem is - * mounted read-only. - */ - if (vp->v_mount->mnt_flag & MNT_RDONLY) - return (EROFS); + break; } } return (null_bypass(ap)); @@ -445,11 +416,10 @@ null_setattr(ap) */ int null_getattr(ap) - struct vop_getattr_args /* { + struct vnop_getattr_args /* { struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct proc *a_p; + struct vnode_attr *a_vap; + vfs_context_t a_context; } */ *ap; { int error; @@ -457,91 +427,31 @@ if (error = null_bypass(ap)) return (error); /* Requires that arguments be restored. */ - ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; + VATTR_RETURN(ap->a_vap, va_fsid, ap->a_vp->v_mount->mnt_vfsstat.f_fsid.val[0]); return (0); } int null_access(ap) - struct vop_access_args /* { + struct vnop_access_args /* { struct vnode *a_vp; - int a_mode; - struct ucred *a_cred; - struct proc *a_p; + int a_action; + vfs_context_t a_context; } */ *ap; { - struct vnode *vp = ap->a_vp; - mode_t mode = ap->a_mode; - - /* - * Disallow write attempts on read-only layers; - * unless the file is a socket, fifo, or a block or - * character device resident on the file system.
- */ - if (mode & VWRITE) { - switch (vp->v_type) { - case VDIR: - case VLNK: - case VREG: - if (vp->v_mount->mnt_flag & MNT_RDONLY) - return (EROFS); - break; - } - } - return (null_bypass(ap)); -} - -/* - * We need to process our own vnode lock and then clear the - * interlock flag as it applies only to our vnode, not the - * vnodes below us on the stack. - */ -int -null_lock(ap) - struct vop_lock_args /* { - struct vnode *a_vp; - int a_flags; - struct proc *a_p; - } */ *ap; -{ - - vop_nolock(ap); - if ((ap->a_flags & LK_TYPE_MASK) == LK_DRAIN) - return (0); - ap->a_flags &= ~LK_INTERLOCK; - return (null_bypass(ap)); -} - -/* - * We need to process our own vnode unlock and then clear the - * interlock flag as it applies only to our vnode, not the - * vnodes below us on the stack. - */ -int -null_unlock(ap) - struct vop_unlock_args /* { - struct vnode *a_vp; - int a_flags; - struct proc *a_p; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - - vop_nounlock(ap); - ap->a_flags &= ~LK_INTERLOCK; return (null_bypass(ap)); } int null_inactive(ap) - struct vop_inactive_args /* { + struct vnop_inactive_args /* { struct vnode *a_vp; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { /* * Do nothing (and _don't_ bypass). - * Wait to vrele lowervp until reclaim, + * Wait to vnode_put lowervp until reclaim, * so that until then our null_node is in the * cache and reusable. * @@ -551,15 +461,14 @@ null_inactive(ap) * like they do in the name lookup cache code. * That's too much work for now. */ - VOP_UNLOCK(ap->a_vp, 0, ap->a_p); return (0); } int null_reclaim(ap) - struct vop_reclaim_args /* { + struct vnop_reclaim_args /* { struct vnode *a_vp; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { struct vnode *vp = ap->a_vp; @@ -567,7 +476,7 @@ null_reclaim(ap) struct vnode *lowervp = xp->null_lowervp; /* - * Note: in vop_reclaim, vp->v_op == dead_vnodeop_p, + * Note: in vnop_reclaim, vp->v_op == dead_vnodeop_p, * so we can't call VOPs on ourself. */ /* After this assignment, this node will not be re-used. */ @@ -575,29 +484,18 @@ null_reclaim(ap) LIST_REMOVE(xp, null_hash); FREE(vp->v_data, M_TEMP); vp->v_data = NULL; - vrele (lowervp); - return (0); -} - -int -null_print(ap) - struct vop_print_args /* { - struct vnode *a_vp; - } */ *ap; -{ - register struct vnode *vp = ap->a_vp; - printf ("\ttag VT_NULLFS, vp=%x, lowervp=%x\n", vp, NULLVPTOLOWERVP(vp)); + vnode_put (lowervp); return (0); } /* - * XXX - vop_strategy must be hand coded because it has no + * XXX - vnop_strategy must be hand coded because it has no * vnode in its arguments. * This goes away with a merged VM/buffer cache. */ int null_strategy(ap) - struct vop_strategy_args /* { + struct vnop_strategy_args /* { struct buf *a_bp; } */ *ap; { @@ -605,24 +503,24 @@ int error; struct vnode *savedvp; - savedvp = bp->b_vp; - bp->b_vp = NULLVPTOLOWERVP(bp->b_vp); + savedvp = buf_vnode(bp); + buf_setvnode(bp, NULLVPTOLOWERVP(savedvp)); - error = VOP_STRATEGY(bp); + error = VNOP_STRATEGY(bp); - bp->b_vp = savedvp; + buf_setvnode(bp, savedvp); return (error); } /* - * XXX - like vop_strategy, vop_bwrite must be hand coded because it has no + * XXX - like vnop_strategy, vnop_bwrite must be hand coded because it has no * vnode in its arguments. * This goes away with a merged VM/buffer cache.
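 *
 * (The save/swap/restore pattern is the same as in null_strategy()
 * above, routed through the buf accessor KPI rather than touching b_vp
 * directly; in brief:
 *
 *	savedvp = buf_vnode(bp);
 *	buf_setvnode(bp, NULLVPTOLOWERVP(savedvp));
 *	error = VNOP_BWRITE(bp);
 *	buf_setvnode(bp, savedvp);
 * )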
*/ int null_bwrite(ap) - struct vop_bwrite_args /* { + struct vnop_bwrite_args /* { struct buf *a_bp; } */ *ap; { @@ -630,12 +528,12 @@ null_bwrite(ap) int error; struct vnode *savedvp; - savedvp = bp->b_vp; - bp->b_vp = NULLVPTOLOWERVP(bp->b_vp); + savedvp = buf_vnode(bp); + buf_setvnode(bp, NULLVPTOLOWERVP(savedvp)); - error = VOP_BWRITE(bp); + error = VNOP_BWRITE(bp); - bp->b_vp = savedvp; + buf_setvnode(bp, savedvp); return (error); } @@ -648,20 +546,17 @@ null_bwrite(ap) int (**null_vnodeop_p)(void *); struct vnodeopv_entry_desc null_vnodeop_entries[] = { - { &vop_default_desc, (VOPFUNC)null_bypass }, - - { &vop_lookup_desc, (VOPFUNC)null_lookup }, - { &vop_setattr_desc, (VOPFUNC)null_setattr }, - { &vop_getattr_desc, (VOPFUNC)null_getattr }, - { &vop_access_desc, (VOPFUNC)null_access }, - { &vop_lock_desc, (VOPFUNC)null_lock }, - { &vop_unlock_desc, (VOPFUNC)null_unlock }, - { &vop_inactive_desc, (VOPFUNC)null_inactive }, - { &vop_reclaim_desc, (VOPFUNC)null_reclaim }, - { &vop_print_desc, (VOPFUNC)null_print }, - - { &vop_strategy_desc, (VOPFUNC)null_strategy }, - { &vop_bwrite_desc, (VOPFUNC)null_bwrite }, + { &vnop_default_desc, (VOPFUNC)null_bypass }, + + { &vnop_lookup_desc, (VOPFUNC)null_lookup }, + { &vnop_setattr_desc, (VOPFUNC)null_setattr }, + { &vnop_getattr_desc, (VOPFUNC)null_getattr }, + { &vnop_access_desc, (VOPFUNC)null_access }, + { &vnop_inactive_desc, (VOPFUNC)null_inactive }, + { &vnop_reclaim_desc, (VOPFUNC)null_reclaim }, + + { &vnop_strategy_desc, (VOPFUNC)null_strategy }, + { &vnop_bwrite_desc, (VOPFUNC)null_bwrite }, { (struct vnodeop_desc*)NULL, (int(*)())NULL } }; diff --git a/bsd/miscfs/specfs/spec_lockf.c b/bsd/miscfs/specfs/spec_lockf.c deleted file mode 100644 index 105656291..000000000 --- a/bsd/miscfs/specfs/spec_lockf.c +++ /dev/null @@ -1,706 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1982, 1986, 1989, 1993 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Scooter Morris at Genentech Inc. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)spec_lockf.c 8.4 (Berkeley) 10/26/94 - */ - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/kernel.h> -#include <sys/file.h> -#include <sys/proc.h> -#include <sys/vnode.h> -#include <sys/malloc.h> -#include <sys/fcntl.h> -#include <sys/quota.h> - -#include <miscfs/specfs/lockf.h> -#include <miscfs/specfs/specdev.h> - -/* - * This variable controls the maximum number of processes that will - * be checked in doing deadlock detection. - */ -int spec_maxlockdepth = MAXDEPTH; - -#ifdef LOCKF_DEBUG -#include <vm/vm.h> -#include <sys/sysctl.h> -int lockf_debug = 0; -struct ctldebug debug4 = { "lockf_debug", &lockf_debug }; -#endif - -#define NOLOCKF (struct lockf *)0 -#define SELF 0x1 -#define OTHERS 0x2 - -/* - * Set a byte-range lock. - */ -int -spec_lf_setlock(lock) - register struct lockf *lock; -{ - register struct lockf *block; - struct specinfo *sip = lock->lf_specinfo; - struct lockf **prev, *overlap, *ltmp; - static char lockstr[] = "lockf"; - int ovcase, priority, needtolink, error; - -#ifdef LOCKF_DEBUG - if (lockf_debug & 1) - spec_lf_print("lf_setlock", lock); -#endif /* LOCKF_DEBUG */ - - /* - * Set the priority - */ - priority = PLOCK; - if (lock->lf_type == F_WRLCK) - priority += 4; - priority |= PCATCH; - /* - * Scan lock list for this file looking for locks that would block us. - */ - while (block = spec_lf_getblock(lock)) { - /* - * Free the structure and return if nonblocking. - */ - if ((lock->lf_flags & F_WAIT) == 0) { - FREE(lock, M_LOCKF); - return (EAGAIN); - } - /* - * We are blocked. Since flock style locks cover - * the whole file, there is no chance for deadlock. - * For byte-range locks we must check for deadlock. - * - * Deadlock detection is done by looking through the - * wait channels to see if there are any cycles that - * involve us. MAXDEPTH is set just to make sure we - * do not go off into neverland. 
- */ - if ((lock->lf_flags & F_POSIX) && - (block->lf_flags & F_POSIX)) { - register struct proc *wproc; - register struct lockf *waitblock; - int i = 0; - - /* The block is waiting on something */ - wproc = (struct proc *)block->lf_id; - while (wproc->p_wchan && - (wproc->p_wmesg == lockstr) && - (i++ < spec_maxlockdepth)) { - waitblock = (struct lockf *)wproc->p_wchan; - /* Get the owner of the blocking lock */ - waitblock = waitblock->lf_next; - if ((waitblock->lf_flags & F_POSIX) == 0) - break; - wproc = (struct proc *)waitblock->lf_id; - if (wproc == (struct proc *)lock->lf_id) { - _FREE(lock, M_LOCKF); - return (EDEADLK); - } - } - } - /* - * For flock type locks, we must first remove - * any shared locks that we hold before we sleep - * waiting for an exclusive lock. - */ - if ((lock->lf_flags & F_FLOCK) && - lock->lf_type == F_WRLCK) { - lock->lf_type = F_UNLCK; - (void) spec_lf_clearlock(lock); - lock->lf_type = F_WRLCK; - } - /* - * Add our lock to the blocked list and sleep until we're free. - * Remember who blocked us (for deadlock detection). - */ - lock->lf_next = block; - TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block); -#ifdef LOCKF_DEBUG - if (lockf_debug & 1) { - spec_lf_print("lf_setlock: blocking on", block); - spec_lf_printlist("lf_setlock", block); - } -#endif /* LOCKF_DEBUG */ - if (error = tsleep((caddr_t)lock, priority, lockstr, 0)) { - /* - * We may have been awakened by a signal (in - * which case we must remove ourselves from the - * blocked list) and/or by another process - * releasing a lock (in which case we have already - * been removed from the blocked list and our - * lf_next field set to NOLOCKF). - */ - if (lock->lf_next) - TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, - lf_block); - _FREE(lock, M_LOCKF); - return (error); - } - } - /* - * No blocks!! Add the lock. Note that we will - * downgrade or upgrade any overlapping locks this - * process already owns. - * - * Skip over locks owned by other processes. - * Handle any locks that overlap and are owned by ourselves. - */ - prev = &sip->si_lockf; - block = sip->si_lockf; - needtolink = 1; - for (;;) { - if (ovcase = spec_lf_findoverlap(block, lock, SELF, &prev, &overlap)) - block = overlap->lf_next; - /* - * Six cases: - * 0) no overlap - * 1) overlap == lock - * 2) overlap contains lock - * 3) lock contains overlap - * 4) overlap starts before lock - * 5) overlap ends after lock - */ - switch (ovcase) { - case 0: /* no overlap */ - if (needtolink) { - *prev = lock; - lock->lf_next = overlap; - } - break; - - case 1: /* overlap == lock */ - /* - * If downgrading lock, others may be - * able to acquire it. - */ - if (lock->lf_type == F_RDLCK && - overlap->lf_type == F_WRLCK) - spec_lf_wakelock(overlap); - overlap->lf_type = lock->lf_type; - FREE(lock, M_LOCKF); - lock = overlap; /* for debug output below */ - break; - - case 2: /* overlap contains lock */ - /* - * Check for common starting point and different types. - */ - if (overlap->lf_type == lock->lf_type) { - _FREE(lock, M_LOCKF); - lock = overlap; /* for debug output below */ - break; - } - if (overlap->lf_start == lock->lf_start) { - *prev = lock; - lock->lf_next = overlap; - overlap->lf_start = lock->lf_end + 1; - } else - spec_lf_split(overlap, lock); - spec_lf_wakelock(overlap); - break; - - case 3: /* lock contains overlap */ - /* - * If downgrading lock, others may be able to - * acquire it, otherwise take the list. 
- */ - if (lock->lf_type == F_RDLCK && - overlap->lf_type == F_WRLCK) { - spec_lf_wakelock(overlap); - } else { - while (ltmp = overlap->lf_blkhd.tqh_first) { - TAILQ_REMOVE(&overlap->lf_blkhd, ltmp, - lf_block); - TAILQ_INSERT_TAIL(&lock->lf_blkhd, - ltmp, lf_block); - } - } - /* - * Add the new lock if necessary and delete the overlap. - */ - if (needtolink) { - *prev = lock; - lock->lf_next = overlap->lf_next; - prev = &lock->lf_next; - needtolink = 0; - } else - *prev = overlap->lf_next; - _FREE(overlap, M_LOCKF); - continue; - - case 4: /* overlap starts before lock */ - /* - * Add lock after overlap on the list. - */ - lock->lf_next = overlap->lf_next; - overlap->lf_next = lock; - overlap->lf_end = lock->lf_start - 1; - prev = &lock->lf_next; - spec_lf_wakelock(overlap); - needtolink = 0; - continue; - - case 5: /* overlap ends after lock */ - /* - * Add the new lock before overlap. - */ - if (needtolink) { - *prev = lock; - lock->lf_next = overlap; - } - overlap->lf_start = lock->lf_end + 1; - spec_lf_wakelock(overlap); - break; - } - break; - } -#ifdef LOCKF_DEBUG - if (lockf_debug & 1) { - spec_lf_print("lf_setlock: got the lock", lock); - spec_lf_printlist("lf_setlock", lock); - } -#endif /* LOCKF_DEBUG */ - return (0); -} - -/* - * Remove a byte-range lock on an specinfo. - * - * Generally, find the lock (or an overlap to that lock) - * and remove it (or shrink it), then wakeup anyone we can. - */ -int -spec_lf_clearlock(unlock) - register struct lockf *unlock; -{ - struct specinfo *sip = unlock->lf_specinfo; - register struct lockf *lf = sip->si_lockf; - struct lockf *overlap, **prev; - int ovcase; - - if (lf == NOLOCKF) - return (0); -#ifdef LOCKF_DEBUG - if (unlock->lf_type != F_UNLCK) - panic("lf_clearlock: bad type"); - if (lockf_debug & 1) - spec_lf_print("lf_clearlock", unlock); -#endif /* LOCKF_DEBUG */ - prev = &sip->si_lockf; - while (ovcase = spec_lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) { - /* - * Wakeup the list of locks to be retried. - */ - spec_lf_wakelock(overlap); - - switch (ovcase) { - - case 1: /* overlap == lock */ - *prev = overlap->lf_next; - FREE(overlap, M_LOCKF); - break; - - case 2: /* overlap contains lock: split it */ - if (overlap->lf_start == unlock->lf_start) { - overlap->lf_start = unlock->lf_end + 1; - break; - } - spec_lf_split(overlap, unlock); - overlap->lf_next = unlock->lf_next; - break; - - case 3: /* lock contains overlap */ - *prev = overlap->lf_next; - lf = overlap->lf_next; - _FREE(overlap, M_LOCKF); - continue; - - case 4: /* overlap starts before lock */ - overlap->lf_end = unlock->lf_start - 1; - prev = &overlap->lf_next; - lf = overlap->lf_next; - continue; - - case 5: /* overlap ends after lock */ - overlap->lf_start = unlock->lf_end + 1; - break; - } - break; - } -#ifdef LOCKF_DEBUG - if (lockf_debug & 1) - spec_lf_printlist("lf_clearlock", unlock); -#endif /* LOCKF_DEBUG */ - return (0); -} - -/* - * Check whether there is a blocking lock, - * and if so return its process identifier. 
- */ -int -spec_lf_getlock(lock, fl) - register struct lockf *lock; - register struct flock *fl; -{ - register struct lockf *block; - -#ifdef LOCKF_DEBUG - if (lockf_debug & 1) - spec_lf_print("lf_getlock", lock); -#endif /* LOCKF_DEBUG */ - - if (block = spec_lf_getblock(lock)) { - fl->l_type = block->lf_type; - fl->l_whence = SEEK_SET; - fl->l_start = block->lf_start; - if (block->lf_end == -1) - fl->l_len = 0; - else - fl->l_len = block->lf_end - block->lf_start + 1; - if (block->lf_flags & F_POSIX) - fl->l_pid = ((struct proc *)(block->lf_id))->p_pid; - else - fl->l_pid = -1; - } else { - fl->l_type = F_UNLCK; - } - return (0); -} - -/* - * Walk the list of locks for an specinfo and - * return the first blocking lock. - */ -struct lockf * -spec_lf_getblock(lock) - register struct lockf *lock; -{ - struct lockf **prev, *overlap, *lf = lock->lf_specinfo->si_lockf; - int ovcase; - - prev = &lock->lf_specinfo->si_lockf; - while (ovcase = spec_lf_findoverlap(lf, lock, OTHERS, &prev, &overlap)) { - /* - * We've found an overlap, see if it blocks us - */ - if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) - return (overlap); - /* - * Nope, point to the next one on the list and - * see if it blocks us - */ - lf = overlap->lf_next; - } - return (NOLOCKF); -} - -/* - * Walk the list of locks for an specinfo to - * find an overlapping lock (if any). - * - * NOTE: this returns only the FIRST overlapping lock. There - * may be more than one. - */ -int -spec_lf_findoverlap(lf, lock, type, prev, overlap) - register struct lockf *lf; - struct lockf *lock; - int type; - struct lockf ***prev; - struct lockf **overlap; -{ - off_t start, end; - - *overlap = lf; - if (lf == NOLOCKF) - return (0); -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) - spec_lf_print("lf_findoverlap: looking for overlap in", lock); -#endif /* LOCKF_DEBUG */ - start = lock->lf_start; - end = lock->lf_end; - while (lf != NOLOCKF) { - if (((type & SELF) && lf->lf_id != lock->lf_id) || - ((type & OTHERS) && lf->lf_id == lock->lf_id)) { - *prev = &lf->lf_next; - *overlap = lf = lf->lf_next; - continue; - } -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) - spec_lf_print("\tchecking", lf); -#endif /* LOCKF_DEBUG */ - /* - * OK, check for overlap - * - * Six cases: - * 0) no overlap - * 1) overlap == lock - * 2) overlap contains lock - * 3) lock contains overlap - * 4) overlap starts before lock - * 5) overlap ends after lock - */ - if ((lf->lf_end != -1 && start > lf->lf_end) || - (end != -1 && lf->lf_start > end)) { - /* Case 0 */ -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) - printf("no overlap\n"); -#endif /* LOCKF_DEBUG */ - if ((type & SELF) && end != -1 && lf->lf_start > end) - return (0); - *prev = &lf->lf_next; - *overlap = lf = lf->lf_next; - continue; - } - if ((lf->lf_start == start) && (lf->lf_end == end)) { - /* Case 1 */ -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) - printf("overlap == lock\n"); -#endif /* LOCKF_DEBUG */ - return (1); - } - if ((lf->lf_start <= start) && - (end != -1) && - ((lf->lf_end >= end) || (lf->lf_end == -1))) { - /* Case 2 */ -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) - printf("overlap contains lock\n"); -#endif /* LOCKF_DEBUG */ - return (2); - } - if (start <= lf->lf_start && - (end == -1 || - (lf->lf_end != -1 && end >= lf->lf_end))) { - /* Case 3 */ -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) - printf("lock contains overlap\n"); -#endif /* LOCKF_DEBUG */ - return (3); - } - if ((lf->lf_start < start) && - ((lf->lf_end >= start) || (lf->lf_end == -1))) { - /* Case 4 */ -#ifdef LOCKF_DEBUG - if 
(lockf_debug & 2) - printf("overlap starts before lock\n"); -#endif /* LOCKF_DEBUG */ - return (4); - } - if ((lf->lf_start > start) && - (end != -1) && - ((lf->lf_end > end) || (lf->lf_end == -1))) { - /* Case 5 */ -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) - printf("overlap ends after lock\n"); -#endif /* LOCKF_DEBUG */ - return (5); - } - panic("lf_findoverlap: default"); - } - return (0); -} - -/* - * Split a lock and a contained region into - * two or three locks as necessary. - */ -void -spec_lf_split(lock1, lock2) - register struct lockf *lock1; - register struct lockf *lock2; -{ - register struct lockf *splitlock; - -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) { - spec_lf_print("lf_split", lock1); - spec_lf_print("splitting from", lock2); - } -#endif /* LOCKF_DEBUG */ - /* - * Check to see if splitting into only two pieces. - */ - if (lock1->lf_start == lock2->lf_start) { - lock1->lf_start = lock2->lf_end + 1; - lock2->lf_next = lock1; - return; - } - if (lock1->lf_end == lock2->lf_end) { - lock1->lf_end = lock2->lf_start - 1; - lock2->lf_next = lock1->lf_next; - lock1->lf_next = lock2; - return; - } - /* - * Make a new lock consisting of the last part of - * the encompassing lock - */ - MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK); - bcopy((caddr_t)lock1, (caddr_t)splitlock, sizeof *splitlock); - splitlock->lf_start = lock2->lf_end + 1; - TAILQ_INIT(&splitlock->lf_blkhd); - lock1->lf_end = lock2->lf_start - 1; - /* - * OK, now link it in - */ - splitlock->lf_next = lock1->lf_next; - lock2->lf_next = splitlock; - lock1->lf_next = lock2; -} - -/* - * Wakeup a blocklist - */ -void -spec_lf_wakelock(listhead) - struct lockf *listhead; -{ - register struct lockf *wakelock; - - while (wakelock = listhead->lf_blkhd.tqh_first) { - TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block); - wakelock->lf_next = NOLOCKF; -#ifdef LOCKF_DEBUG - if (lockf_debug & 2) - spec_lf_print("lf_wakelock: awakening", wakelock); -#endif /* LOCKF_DEBUG */ - wakeup((caddr_t)wakelock); - } -} - -#ifdef LOCKF_DEBUG -/* - * Print out a lock. - */ -spec_lf_print(tag, lock) - char *tag; - register struct lockf *lock; -{ - - printf("%s: lock 0x%lx for ", tag, lock); - if (lock->lf_flags & F_POSIX) - printf("proc %d", ((struct proc *)(lock->lf_id))->p_pid); - else - printf("id 0x%x", lock->lf_id); - printf(" on sip %d rdev <%d, %d>, %s, start %d, end %d", - lock->lf_specinfo, - major(lock->lf_specinfo->si_rdev), - minor(lock->lf_specinfo->si_rdev), - lock->lf_type == F_RDLCK ? "shared" : - lock->lf_type == F_WRLCK ? "exclusive" : - lock->lf_type == F_UNLCK ? "unlock" : - "unknown", lock->lf_start, lock->lf_end); - if (lock->lf_blkhd.tqh_first) - printf(" block 0x%x\n", lock->lf_blkhd.tqh_first); - else - printf("\n"); -} - -spec_lf_printlist(tag, lock) - char *tag; - struct lockf *lock; -{ - register struct lockf *lf, *blk; - - printf("%s: Lock list for sip %d on dev <%d, %d>:\n", - tag, lock->lf_specinfo, - major(lock->lf_specinfo->si_dev), - minor(lock->lf_specinfo->si_dev)); - for (lf = lock->lf_specinfo->si_lockf; lf; lf = lf->lf_next) { - printf("\tlock 0x%lx for ", lf); - if (lf->lf_flags & F_POSIX) - printf("proc %d", ((struct proc *)(lf->lf_id))->p_pid); - else - printf("id 0x%x", lf->lf_id); - printf(", %s, start %d, end %d", - lf->lf_type == F_RDLCK ? "shared" : - lf->lf_type == F_WRLCK ? "exclusive" : - lf->lf_type == F_UNLCK ?
"unlock" : - "unknown", lf->lf_start, lf->lf_end); - for (blk = lf->lf_blkhd.tqh_first; blk; - blk = blk->lf_block.tqe_next) { - printf("\n\t\tlock request 0x%lx for ", blk); - if (blk->lf_flags & F_POSIX) - printf("proc %d", - ((struct proc *)(blk->lf_id))->p_pid); - else - printf("id 0x%x", blk->lf_id); - printf(", %s, start %d, end %d", - blk->lf_type == F_RDLCK ? "shared" : - blk->lf_type == F_WRLCK ? "exclusive" : - blk->lf_type == F_UNLCK ? "unlock" : - "unknown", blk->lf_start, blk->lf_end); - if (blk->lf_blkhd.tqh_first) - panic("lf_printlist: bad list"); - } - printf("\n"); - } -} -#endif /* LOCKF_DEBUG */ diff --git a/bsd/miscfs/specfs/spec_vnops.c b/bsd/miscfs/specfs/spec_vnops.c index 314464b19..7cb75e4cd 100644 --- a/bsd/miscfs/specfs/spec_vnops.c +++ b/bsd/miscfs/specfs/spec_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -56,22 +56,24 @@ */ #include <sys/param.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/conf.h> -#include <sys/buf.h> -#include <sys/mount.h> +#include <sys/buf_internal.h> +#include <sys/mount_internal.h> #include <sys/namei.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/stat.h> #include <sys/errno.h> #include <sys/ioctl.h> #include <sys/file.h> +#include <sys/user.h> #include <sys/malloc.h> #include <sys/disk.h> +#include <sys/uio_internal.h> #include <miscfs/specfs/specdev.h> -#include <miscfs/specfs/lockf.h> #include <vfs/vfs_support.h> #include <sys/kdebug.h> @@ -91,70 +93,62 @@ char devcls[] = "devcls"; int (**spec_vnodeop_p)(void *); struct vnodeopv_entry_desc spec_vnodeop_entries[] = { - { &vop_default_desc, (VOPFUNC)vn_default_error }, - { &vop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */ - { &vop_create_desc, (VOPFUNC)err_create }, /* create */ - { &vop_mknod_desc, (VOPFUNC)err_mknod }, /* mknod */ - { &vop_open_desc, (VOPFUNC)spec_open }, /* open */ - { &vop_close_desc, (VOPFUNC)spec_close }, /* close */ - { &vop_access_desc, (VOPFUNC)spec_access }, /* access */ - { &vop_getattr_desc, (VOPFUNC)spec_getattr }, /* getattr */ - { &vop_setattr_desc, (VOPFUNC)spec_setattr }, /* setattr */ - { &vop_read_desc, (VOPFUNC)spec_read }, /* read */ - { &vop_write_desc, (VOPFUNC)spec_write }, /* write */ - { &vop_lease_desc, (VOPFUNC)nop_lease }, /* lease */ - { &vop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */ - { &vop_select_desc, (VOPFUNC)spec_select }, /* select */ - { &vop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */ - { &vop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ - { &vop_fsync_desc, (VOPFUNC)spec_fsync }, /* fsync */ - { &vop_seek_desc, (VOPFUNC)err_seek }, /* seek */ - { &vop_remove_desc, (VOPFUNC)err_remove }, /* remove */ - { &vop_link_desc, (VOPFUNC)err_link }, /* link */ - { &vop_rename_desc, (VOPFUNC)err_rename }, /* rename */ - { &vop_mkdir_desc, (VOPFUNC)err_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (VOPFUNC)err_rmdir }, /* rmdir */ - { &vop_symlink_desc, (VOPFUNC)err_symlink }, /* symlink */ - { &vop_readdir_desc, (VOPFUNC)err_readdir }, /* readdir */ - { &vop_readlink_desc, (VOPFUNC)err_readlink }, /* readlink */ - { &vop_abortop_desc, (VOPFUNC)err_abortop }, /* abortop */ - { &vop_inactive_desc, (VOPFUNC)nop_inactive }, /* inactive */ - { &vop_reclaim_desc, (VOPFUNC)nop_reclaim }, /* reclaim */ - { &vop_lock_desc, (VOPFUNC)nop_lock }, /* lock */ - { 
&vop_unlock_desc, (VOPFUNC)nop_unlock }, /* unlock */ - { &vop_bmap_desc, (VOPFUNC)spec_bmap }, /* bmap */ - { &vop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */ - { &vop_print_desc, (VOPFUNC)spec_print }, /* print */ - { &vop_islocked_desc, (VOPFUNC)nop_islocked }, /* islocked */ - { &vop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */ - { &vop_advlock_desc, (VOPFUNC)spec_advlock }, /* advlock */ - { &vop_blkatoff_desc, (VOPFUNC)err_blkatoff }, /* blkatoff */ - { &vop_valloc_desc, (VOPFUNC)err_valloc }, /* valloc */ - { &vop_vfree_desc, (VOPFUNC)err_vfree }, /* vfree */ - { &vop_truncate_desc, (VOPFUNC)nop_truncate }, /* truncate */ - { &vop_update_desc, (VOPFUNC)nop_update }, /* update */ - { &vop_bwrite_desc, (VOPFUNC)spec_bwrite }, /* bwrite */ - { &vop_devblocksize_desc, (VOPFUNC)spec_devblocksize }, /* devblocksize */ - { &vop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ - { &vop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ - { &vop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ - { &vop_blktooff_desc, (VOPFUNC)spec_blktooff }, /* blktooff */ - { &vop_offtoblk_desc, (VOPFUNC)spec_offtoblk }, /* offtoblk */ - { &vop_cmap_desc, (VOPFUNC)spec_cmap }, /* cmap */ + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)err_create }, /* create */ + { &vnop_mknod_desc, (VOPFUNC)err_mknod }, /* mknod */ + { &vnop_open_desc, (VOPFUNC)spec_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)spec_close }, /* close */ + { &vnop_access_desc, (VOPFUNC)spec_access }, /* access */ + { &vnop_getattr_desc, (VOPFUNC)spec_getattr }, /* getattr */ + { &vnop_setattr_desc, (VOPFUNC)spec_setattr }, /* setattr */ + { &vnop_read_desc, (VOPFUNC)spec_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)spec_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)spec_select }, /* select */ + { &vnop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */ + { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)spec_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)err_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)err_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)err_rename }, /* rename */ + { &vnop_mkdir_desc, (VOPFUNC)err_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)err_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)err_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)err_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)err_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)nop_inactive }, /* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ + { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite }, /* bwrite */ + { &vnop_devblocksize_desc, (VOPFUNC)spec_devblocksize }, /* devblocksize */ + { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ + { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ + { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap }, /* blockmap */ { (struct vnodeop_desc*)NULL, (int(*)())NULL } }; struct vnodeopv_desc spec_vnodeop_opv_desc = { 
&spec_vnodeop_p, spec_vnodeop_entries }; + +static void set_blocksize(vnode_t, dev_t); + + /* * Trivial lookup routine that always fails. */ int spec_lookup(ap) - struct vop_lookup_args /* { + struct vnop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; + vfs_context_t a_context; } */ *ap; { @@ -162,10 +156,10 @@ spec_lookup(ap) return (ENOTDIR); } -void +static void set_blocksize(struct vnode *vp, dev_t dev) { - int (*size)(); + int (*size)(dev_t); int rsize; if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) { @@ -187,10 +181,12 @@ set_fsblocksize(struct vnode *vp) dev_t dev = (dev_t)vp->v_rdev; int maj = major(dev); - if ((u_int)maj >= nblkdev) + if ((u_int)maj >= (u_int)nblkdev) return; + vnode_lock(vp); set_blocksize(vp, dev); + vnode_unlock(vp); } } @@ -199,17 +195,17 @@ set_fsblocksize(struct vnode *vp) /* * Open a special file. */ -/* ARGSUSED */ +int spec_open(ap) - struct vop_open_args /* { + struct vnop_open_args /* { struct vnode *a_vp; int a_mode; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { - struct proc *p = ap->a_p; - struct vnode *bvp, *vp = ap->a_vp; + struct proc *p = vfs_context_proc(ap->a_context); + kauth_cred_t cred = vfs_context_ucred(ap->a_context); + struct vnode *vp = ap->a_vp; dev_t bdev, dev = (dev_t)vp->v_rdev; int maj = major(dev); int error; @@ -223,9 +219,9 @@ spec_open(ap) switch (vp->v_type) { case VCHR: - if ((u_int)maj >= nchrdev) + if ((u_int)maj >= (u_int)nchrdev) return (ENXIO); - if (ap->a_cred != FSCRED && (ap->a_mode & FWRITE)) { + if (cred != FSCRED && (ap->a_mode & FWRITE)) { /* * When running in very secure mode, do not allow * opens for writing of any disk character devices. @@ -239,66 +235,77 @@ spec_open(ap) * currently mounted. */ if (securelevel >= 1) { - if ((bdev = chrtoblk(dev)) != NODEV && - vfinddev(bdev, VBLK, &bvp) && - bvp->v_usecount > 0 && - (error = vfs_mountedon(bvp))) + if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error)) return (error); if (iskmemdev(dev)) return (EPERM); } } - if (cdevsw[maj].d_type == D_TTY) + if (cdevsw[maj].d_type == D_TTY) { + vnode_lock(vp); vp->v_flag |= VISTTY; - VOP_UNLOCK(vp, 0, p); + vnode_unlock(vp); + } error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); return (error); case VBLK: - if ((u_int)maj >= nblkdev) + if ((u_int)maj >= (u_int)nblkdev) return (ENXIO); /* * When running in very secure mode, do not allow * opens for writing of any disk block devices. */ - if (securelevel >= 2 && ap->a_cred != FSCRED && + if (securelevel >= 2 && cred != FSCRED && (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK) return (EPERM); /* * Do not allow opens of block devices that are * currently mounted. */ - if (error = vfs_mountedon(vp)) + if ( (error = vfs_mountedon(vp)) ) return (error); error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p); if (!error) { u_int64_t blkcnt; u_int32_t blksize; + int setsize = 0; + u_int32_t size512 = 512; + + + if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) { + /* Switch to 512 byte sectors (temporarily) */ + if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) { + /* Get the number of 512 byte physical blocks. 
*/ + if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) { + setsize = 1; + } + } + /* If it doesn't set back, we can't recover */ + if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context)) + error = ENXIO; + } + + + vnode_lock(vp); set_blocksize(vp, dev); /* * Cache the size in bytes of the block device for later * use by spec_write(). */ - vp->v_specdevsize = (u_int64_t)0; /* Default: Can't get */ - if (!VOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, NOCRED, p)) { - /* Switch to 512 byte sectors (temporarily) */ - u_int32_t size512 = 512; - - if (!VOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, NOCRED, p)) { - /* Get the number of 512 byte physical blocks. */ - if (!VOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, NOCRED, p)) { + if (setsize) vp->v_specdevsize = blkcnt * (u_int64_t)size512; - } - } - /* If it doesn't set back, we can't recover */ - if (VOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, NOCRED, p)) - error = ENXIO; - } + else + vp->v_specdevsize = (u_int64_t)0; /* Default: Can't get */ + + vnode_unlock(vp); + } return(error); + default: + panic("spec_open type"); } return (0); } @@ -306,42 +313,39 @@ spec_open(ap) /* * Vnode op for read */ -/* ARGSUSED */ +int spec_read(ap) - struct vop_read_args /* { + struct vnop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct uio *uio = ap->a_uio; - struct proc *p = uio->uio_procp; struct buf *bp; - daddr_t bn, nextbn; + daddr64_t bn, nextbn; long bsize, bscale; int devBlockSize=0; - int n, on, majordev, (*ioctl)(); + int n, on; int error = 0; dev_t dev; #if DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("spec_read mode"); - if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc()) + if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) panic("spec_read proc"); #endif - if (uio->uio_resid == 0) + if (uio_resid(uio) == 0) return (0); switch (vp->v_type) { case VCHR: - VOP_UNLOCK(vp, 0, p); error = (*cdevsw[major(vp->v_rdev)].d_read) (vp->v_rdev, uio, ap->a_ioflag); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); return (error); case VBLK: @@ -361,55 +365,60 @@ spec_read(ap) do { on = uio->uio_offset % bsize; - bn = (uio->uio_offset / devBlockSize) &~ (bscale - 1); + bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1)); - if (vp->v_lastr + bscale == bn) { + if (vp->v_speclastr + bscale == bn) { nextbn = bn + bscale; - error = breadn(vp, bn, (int)bsize, &nextbn, + error = buf_breadn(vp, bn, (int)bsize, &nextbn, (int *)&bsize, 1, NOCRED, &bp); } else - error = bread(vp, bn, (int)bsize, NOCRED, &bp); + error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp); + + vnode_lock(vp); + vp->v_speclastr = bn; + vnode_unlock(vp); - vp->v_lastr = bn; - n = bsize - bp->b_resid; + n = bsize - buf_resid(bp); if ((on > n) || error) { if (!error) error = EINVAL; - brelse(bp); + buf_brelse(bp); return (error); } - n = min((unsigned)(n - on), uio->uio_resid); + // LP64todo - fix this! 
+ n = min((unsigned)(n - on), uio_resid(uio)); - error = uiomove((char *)bp->b_data + on, n, uio); + error = uiomove((char *)buf_dataptr(bp) + on, n, uio); if (n + on == bsize) - bp->b_flags |= B_AGE; - brelse(bp); - } while (error == 0 && uio->uio_resid > 0 && n != 0); + buf_markaged(bp); + buf_brelse(bp); + } while (error == 0 && uio_resid(uio) > 0 && n != 0); return (error); default: panic("spec_read type"); } /* NOTREACHED */ + + return (0); } /* * Vnode op for write */ -/* ARGSUSED */ +int spec_write(ap) - struct vop_write_args /* { + struct vnop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct uio *uio = ap->a_uio; - struct proc *p = uio->uio_procp; struct buf *bp; - daddr_t bn; + daddr64_t bn; int bsize, blkmask, bscale; register int io_sync; register int io_size; @@ -421,27 +430,26 @@ spec_write(ap) #if DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("spec_write mode"); - if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc()) + if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) panic("spec_write proc"); #endif switch (vp->v_type) { case VCHR: - VOP_UNLOCK(vp, 0, p); error = (*cdevsw[major(vp->v_rdev)].d_write) (vp->v_rdev, uio, ap->a_ioflag); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); return (error); case VBLK: - if (uio->uio_resid == 0) + if (uio_resid(uio) == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); io_sync = (ap->a_ioflag & IO_SYNC); - io_size = uio->uio_resid; + // LP64todo - fix this! + io_size = uio_resid(uio); dev = (vp->v_rdev); @@ -455,20 +463,21 @@ spec_write(ap) do { - bn = (uio->uio_offset / devBlockSize) &~ blkmask; + bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask); on = uio->uio_offset % bsize; - n = min((unsigned)(bsize - on), uio->uio_resid); + // LP64todo - fix this! + n = min((unsigned)(bsize - on), uio_resid(uio)); /* - * Use getblk() as an optimization IFF: + * Use buf_getblk() as an optimization IFF: * * 1) We are reading exactly a block on a block * aligned boundary * 2) We know the size of the device from spec_open * 3) The read doesn't span the end of the device * - * Otherwise, we fall back on bread(). + * Otherwise, we fall back on buf_bread(). 
*/ if (n == bsize && vp->v_specdevsize != (u_int64_t)0 && @@ -478,92 +487,95 @@ spec_write(ap) } if (n == bsize) - bp = getblk(vp, bn, bsize, 0, 0, BLK_WRITE); + bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE); else - error = bread(vp, bn, bsize, NOCRED, &bp); + error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp); /* Translate downstream error for upstream, if needed */ - if (!error) { - error = bp->b_error; - if (!error && (bp->b_flags & B_ERROR) != 0) { - error = EIO; - } - } + if (!error) + error = (int)buf_error(bp); if (error) { - brelse(bp); + buf_brelse(bp); return (error); } - n = min(n, bsize - bp->b_resid); - - error = uiomove((char *)bp->b_data + on, n, uio); + n = min(n, bsize - buf_resid(bp)); - bp->b_flags |= B_AGE; + error = uiomove((char *)buf_dataptr(bp) + on, n, uio); + if (error) { + buf_brelse(bp); + return (error); + } + buf_markaged(bp); if (io_sync) - bwrite(bp); + error = buf_bwrite(bp); else { if ((n + on) == bsize) - bawrite(bp); + error = buf_bawrite(bp); else - bdwrite(bp); + error = buf_bdwrite(bp); } - } while (error == 0 && uio->uio_resid > 0 && n != 0); + } while (error == 0 && uio_resid(uio) > 0 && n != 0); return (error); default: panic("spec_write type"); } /* NOTREACHED */ + + return (0); } /* * Device ioctl operation. */ -/* ARGSUSED */ +int spec_ioctl(ap) - struct vop_ioctl_args /* { + struct vnop_ioctl_args /* { struct vnode *a_vp; int a_command; caddr_t a_data; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { + proc_t p = vfs_context_proc(ap->a_context); dev_t dev = ap->a_vp->v_rdev; switch (ap->a_vp->v_type) { case VCHR: return ((*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, - ap->a_fflag, ap->a_p)); + ap->a_fflag, p)); case VBLK: - if (ap->a_command == 0 && (int)ap->a_data == B_TAPE) + if (ap->a_command == 0 && (int)ap->a_data == B_TAPE) { if (bdevsw[major(dev)].d_type == D_TAPE) return (0); else return (1); + } return ((*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, - ap->a_fflag, ap->a_p)); + ap->a_fflag, p)); default: panic("spec_ioctl"); /* NOTREACHED */ } + return (0); } -/* ARGSUSED */ +int spec_select(ap) - struct vop_select_args /* { + struct vnop_select_args /* { struct vnode *a_vp; int a_which; int a_fflags; - struct ucred *a_cred; void * a_wql; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { + proc_t p = vfs_context_proc(ap->a_context); register dev_t dev; switch (ap->a_vp->v_type) { @@ -573,249 +585,154 @@ spec_select(ap) case VCHR: dev = ap->a_vp->v_rdev; - return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, ap->a_p); + return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p); } } + /* * Synch buffers associated with a block device */ -/* ARGSUSED */ int -spec_fsync(ap) - struct vop_fsync_args /* { - struct vnode *a_vp; - struct ucred *a_cred; - int a_waitfor; - struct proc *a_p; - } */ *ap; +spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context) { - register struct vnode *vp = ap->a_vp; - register struct buf *bp; - struct buf *nbp; - int s; - if (vp->v_type == VCHR) return (0); /* * Flush all dirty buffers associated with a block device. */ -loop: - s = splbio(); - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - // XXXdbg - don't flush locked blocks. they may be journaled. 
- if ((bp->b_flags & B_BUSY) || (bp->b_flags & B_LOCKED)) - continue; - if ((bp->b_flags & B_DELWRI) == 0) - panic("spec_fsync: not dirty"); - bremfree(bp); - bp->b_flags |= B_BUSY; - splx(s); - bawrite(bp); - goto loop; - } - if (ap->a_waitfor == MNT_WAIT) { - while (vp->v_numoutput) { - vp->v_flag |= VBWAIT; - tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "spec_fsync", 0); - } -#if DIAGNOSTIC - if (vp->v_dirtyblkhd.lh_first) { - vprint("spec_fsync: dirty", vp); - splx(s); - goto loop; - } -#endif - } - splx(s); + buf_flushdirtyblks(vp, waitfor == MNT_WAIT, 0, (char *)"spec_fsync"); + return (0); } +int +spec_fsync(ap) + struct vnop_fsync_args /* { + struct vnode *a_vp; + int a_waitfor; + vfs_context_t a_context; + } */ *ap; +{ + return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context); +} + /* * Just call the device strategy routine */ +extern int hard_throttle_on_root; + + +#define LOWPRI_DELAY_MSECS 200 +#define LOWPRI_WINDOW_MSECS 200 + +int lowpri_IO_window_msecs = LOWPRI_WINDOW_MSECS; +int lowpri_IO_delay_msecs = LOWPRI_DELAY_MSECS; + +struct timeval last_normal_IO_timestamp; +struct timeval last_lowpri_IO_timestamp; +struct timeval lowpri_IO_window = { 0, LOWPRI_WINDOW_MSECS * 1000 }; + +int spec_strategy(ap) - struct vop_strategy_args /* { + struct vnop_strategy_args /* { struct buf *a_bp; } */ *ap; { - struct buf *bp; - extern int hard_throttle_on_root; + buf_t bp; + int bflags; + dev_t bdev; + proc_t p; + struct timeval elapsed; bp = ap->a_bp; + bdev = buf_device(bp); + bflags = buf_flags(bp); if (kdebug_enable) { - int code = 0; + int code = 0; - if (bp->b_flags & B_READ) - code |= DKIO_READ; - if (bp->b_flags & B_ASYNC) - code |= DKIO_ASYNC; + if (bflags & B_READ) + code |= DKIO_READ; + if (bflags & B_ASYNC) + code |= DKIO_ASYNC; - if (bp->b_flags & B_META) - code |= DKIO_META; - else if (bp->b_flags & (B_PGIN | B_PAGEOUT)) - code |= DKIO_PAGING; + if (bflags & B_META) + code |= DKIO_META; + else if (bflags & B_PAGEIO) + code |= DKIO_PAGING; - KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, - (unsigned int)bp, bp->b_dev, bp->b_blkno, bp->b_bcount, 0); + KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, + (unsigned int)bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0); } - if ((bp->b_flags & B_PGIN) && (bp->b_vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) - hard_throttle_on_root = 1; - - (*bdevsw[major(bp->b_dev)].d_strategy)(bp); - return (0); -} - -/* - * Advisory record locking support - */ -int -spec_advlock(ap) - struct vop_advlock_args /* { - struct vnode *a_vp; - caddr_t a_id; - int a_op; - struct flock *a_fl; - int a_flags; - } */ *ap; -{ - register struct flock *fl = ap->a_fl; - register struct lockf *lock; - off_t start, end; - int error; - - /* - * Avoid the common case of unlocking when inode has no locks. 
- */ - if (ap->a_vp->v_specinfo->si_lockf == (struct lockf *)0) { - if (ap->a_op != F_SETLK) { - fl->l_type = F_UNLCK; - return (0); + if (((bflags & (B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) && + (buf_vnode(bp)->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) + hard_throttle_on_root = 1; + + if ( lowpri_IO_delay_msecs && lowpri_IO_window_msecs ) { + p = current_proc(); + + if ( (p == NULL) || !(p->p_lflag & P_LLOW_PRI_IO)) { + if (!(p->p_lflag & P_LBACKGROUND_IO)) + microuptime(&last_normal_IO_timestamp); + } else { + microuptime(&last_lowpri_IO_timestamp); + + elapsed = last_lowpri_IO_timestamp; + timevalsub(&elapsed, &last_normal_IO_timestamp); + + lowpri_IO_window.tv_sec = lowpri_IO_window_msecs / 1000; + lowpri_IO_window.tv_usec = (lowpri_IO_window_msecs % 1000) * 1000; + + if (timevalcmp(&elapsed, &lowpri_IO_window, <)) { + struct uthread *ut; + + /* + * I'd really like to do the IOSleep here, but + * we may be holding all kinds of filesystem related locks + * and the pages for this I/O marked 'busy'... + * we don't want to cause a normal task to block on + * one of these locks while we're throttling a task marked + * for low priority I/O... we'll mark the uthread and + * do the delay just before we return from the system + * call that triggered this I/O or from vnode_pagein + */ + ut = get_bsdthread_info(current_thread()); + ut->uu_lowpri_delay = lowpri_IO_delay_msecs; + } } } - /* - * Convert the flock structure into a start and end. - */ - switch (fl->l_whence) { + (*bdevsw[major(bdev)].d_strategy)(bp); - case SEEK_SET: - case SEEK_CUR: - /* - * Caller is responsible for adding any necessary offset - * when SEEK_CUR is used. - */ - start = fl->l_start; - break; - - case SEEK_END: - start = ap->a_vp->v_specinfo->si_devsize + fl->l_start; - break; - - default: - return (EINVAL); - } - if (fl->l_len == 0) - end = -1; - else if (fl->l_len > 0) - end = start + fl->l_len - 1; - else { /* l_len is negative */ - end = start - 1; - start += fl->l_len; - } - if (start < 0) - return (EINVAL); - /* - * Create the lockf structure - */ - MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); - lock->lf_start = start; - lock->lf_end = end; - lock->lf_id = ap->a_id; - lock->lf_specinfo = ap->a_vp->v_specinfo; - lock->lf_type = fl->l_type; - lock->lf_next = (struct lockf *)0; - TAILQ_INIT(&lock->lf_blkhd); - lock->lf_flags = ap->a_flags; - /* - * Do the requested operation. - */ - switch(ap->a_op) { - case F_SETLK: - return (spec_lf_setlock(lock)); - - case F_UNLCK: - error = spec_lf_clearlock(lock); - FREE(lock, M_LOCKF); - return (error); - - case F_GETLK: - error = spec_lf_getlock(lock, fl); - FREE(lock, M_LOCKF); - return (error); - - default: - _FREE(lock, M_LOCKF); - return (EINVAL); - } - /* NOTREACHED */ + return (0); } -/* - * This is a noop, simply returning what one has been given. - */ -spec_bmap(ap) - struct vop_bmap_args /* { - struct vnode *a_vp; - daddr_t a_bn; - struct vnode **a_vpp; - daddr_t *a_bnp; - int *a_runp; - } */ *ap; -{ - - if (ap->a_vpp != NULL) - *ap->a_vpp = ap->a_vp; - if (ap->a_bnp != NULL) - *ap->a_bnp = ap->a_bn * (PAGE_SIZE / ap->a_vp->v_specsize); - if (ap->a_runp != NULL) - *ap->a_runp = (MAXPHYSIO / PAGE_SIZE) - 1; - return (0); -} /* * This is a noop, simply returning what one has been given. 
*/ -spec_cmap(ap) - struct vop_cmap_args /* { - struct vnode *a_vp; - off_t a_offset; - size_t a_size; - daddr_t *a_bpn; - size_t *a_run; - void *a_poff; - } */ *ap; +int +spec_blockmap(__unused struct vnop_blockmap_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } /* * Device close routine */ -/* ARGSUSED */ +int spec_close(ap) - struct vop_close_args /* { + struct vnop_close_args /* { struct vnode *a_vp; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { register struct vnode *vp = ap->a_vp; dev_t dev = vp->v_rdev; - int (*devclose) __P((dev_t, int, int, struct proc *)); + int (*devclose)(dev_t, int, int, struct proc *); int mode, error; + struct proc *p = vfs_context_proc(ap->a_context); switch (vp->v_type) { @@ -829,17 +746,15 @@ spec_close(ap) * if the reference count is 2 (this last descriptor * plus the session), release the reference from the session. */ - if (vcount(vp) == 2 && ap->a_p && - vp == ap->a_p->p_session->s_ttyvp) { - ap->a_p->p_session->s_ttyvp = NULL; - vrele(vp); + if (vcount(vp) == 2 && p && + vp == p->p_session->s_ttyvp) { + p->p_session->s_ttyvp = NULL; + vnode_rele(vp); } /* - * If the vnode is locked, then we are in the midst - * of forcibly closing the device, otherwise we only * close on last reference. */ - if (vcount(vp) > 1 && (vp->v_flag & VXLOCK) == 0) + if (vcount(vp) > 1) return (0); devclose = cdevsw[major(dev)].d_close; mode = S_IFCHR; @@ -852,33 +767,30 @@ spec_close(ap) * we must invalidate any in core blocks, so that * we can, for instance, change floppy disks. */ - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p); - error = vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 0, 0); - VOP_UNLOCK(vp, 0, ap->a_p); + if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context))) + return (error); + + error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0); if (error) return (error); /* - * We do not want to really close the device if it - * is still in use unless we are trying to close it - * forcibly. Since every use (buffer, vnode, swap, cmap) + * Since every use (buffer, vnode, swap, blockmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ - if (vcount(vp) > 1 && (vp->v_flag & VXLOCK) == 0) + if (vcount(vp) > 1) return (0); #else /* DEVFS_IMPLEMENTS_LOCKING */ /* - * We do not want to really close the device if it - * is still in use unless we are trying to close it - * forcibly. Since every use (buffer, vnode, swap, cmap) + * Since every use (buffer, vnode, swap, blockmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ - if (vcount(vp) > 1 && (vp->v_flag & VXLOCK) == 0) + if (vcount(vp) > 1) return (0); /* @@ -886,7 +798,10 @@ spec_close(ap) * we must invalidate any in core blocks, so that * we can, for instance, change floppy disks. */ - error = vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 0, 0); + if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context))) + return (error); + + error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0); if (error) return (error); #endif /* DEVFS_IMPLEMENTS_LOCKING */ @@ -898,30 +813,19 @@ spec_close(ap) panic("spec_close: not special"); } - return ((*devclose)(dev, ap->a_fflag, mode, ap->a_p)); -} - -/* - * Print out the contents of a special device vnode.
- */ -spec_print(ap) - struct vop_print_args /* { - struct vnode *a_vp; - } */ *ap; -{ - - printf("tag VT_NON, dev %d, %d\n", major(ap->a_vp->v_rdev), - minor(ap->a_vp->v_rdev)); + return ((*devclose)(dev, ap->a_fflag, mode, p)); } /* * Return POSIX pathconf information applicable to special devices. */ +int spec_pathconf(ap) - struct vop_pathconf_args /* { + struct vnop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; + vfs_context_t a_context; } */ *ap; { @@ -952,7 +856,7 @@ spec_pathconf(ap) int spec_devblocksize(ap) - struct vop_devblocksize_args /* { + struct vnop_devblocksize_args /* { struct vnode *a_vp; int *a_retval; } */ *ap; @@ -964,7 +868,8 @@ spec_devblocksize(ap) /* * Special device failed operation */ -spec_ebadf() +int +spec_ebadf(__unused void *dummy) { return (EBADF); @@ -973,6 +878,7 @@ spec_ebadf() /* * Special device bad operation */ +int spec_badop() { @@ -983,9 +889,9 @@ spec_badop() /* Blktooff derives file offset from logical block number */ int spec_blktooff(ap) - struct vop_blktooff_args /* { + struct vnop_blktooff_args /* { struct vnode *a_vp; - daddr_t a_lblkno; + daddr64_t a_lblkno; off_t *a_offset; } */ *ap; { @@ -994,42 +900,46 @@ spec_blktooff(ap) switch (vp->v_type) { case VCHR: *ap->a_offset = (off_t)-1; /* failure */ - return (EOPNOTSUPP); + return (ENOTSUP); case VBLK: printf("spec_blktooff: not implemented for VBLK\n"); *ap->a_offset = (off_t)-1; /* failure */ - return (EOPNOTSUPP); + return (ENOTSUP); default: panic("spec_blktooff type"); } /* NOTREACHED */ + + return (0); } /* Offtoblk derives logical block number from file offset */ int spec_offtoblk(ap) - struct vop_offtoblk_args /* { + struct vnop_offtoblk_args /* { struct vnode *a_vp; off_t a_offset; - daddr_t *a_lblkno; + daddr64_t *a_lblkno; } */ *ap; { register struct vnode *vp = ap->a_vp; switch (vp->v_type) { case VCHR: - *ap->a_lblkno = (daddr_t)-1; /* failure */ - return (EOPNOTSUPP); + *ap->a_lblkno = (daddr64_t)-1; /* failure */ + return (ENOTSUP); case VBLK: printf("spec_offtoblk: not implemented for VBLK\n"); - *ap->a_lblkno = (daddr_t)-1; /* failure */ - return (EOPNOTSUPP); + *ap->a_lblkno = (daddr64_t)-1; /* failure */ + return (ENOTSUP); default: panic("spec_offtoblk type"); } /* NOTREACHED */ + + return (0); } diff --git a/bsd/miscfs/specfs/specdev.h b/bsd/miscfs/specfs/specdev.h index b90c72a6e..61c340ac1 100644 --- a/bsd/miscfs/specfs/specdev.h +++ b/bsd/miscfs/specfs/specdev.h @@ -68,15 +68,14 @@ * special devices. It is allocated in checkalias and freed * in vgone. 
*/ -struct lockf; struct specinfo { struct vnode **si_hashchain; struct vnode *si_specnext; long si_flags; dev_t si_rdev; - daddr_t si_size; /* device block size in bytes */ - u_int64_t si_devsize; /* actual device size in bytes */ - struct lockf *si_lockf; /* head of advisory lock list */ + daddr_t si_size; /* device block size in bytes */ + daddr64_t si_lastr; /* last read blkno (read-ahead) */ + u_int64_t si_devsize; /* actual device size in bytes */ }; /* * Exported shorthand @@ -87,6 +86,7 @@ struct specinfo { #define v_specflags v_specinfo->si_flags #define v_specsize v_specinfo->si_size #define v_specdevsize v_specinfo->si_devsize +#define v_speclastr v_specinfo->si_lastr /* * Flags for specinfo @@ -116,56 +116,48 @@ struct flock; struct buf; struct uio; -int spec_ebadf(); +int spec_ebadf(void *); -int spec_lookup __P((struct vop_lookup_args *)); -#define spec_create ((int (*) __P((struct vop_access_args *)))err_create) -#define spec_mknod ((int (*) __P((struct vop_access_args *)))err_mknod) -int spec_open __P((struct vop_open_args *)); -int spec_close __P((struct vop_close_args *)); -#define spec_access ((int (*) __P((struct vop_access_args *)))spec_ebadf) -#define spec_getattr ((int (*) __P((struct vop_getattr_args *)))spec_ebadf) -#define spec_setattr ((int (*) __P((struct vop_setattr_args *)))spec_ebadf) -int spec_read __P((struct vop_read_args *)); -int spec_write __P((struct vop_write_args *)); -#define spec_lease_check ((int (*) __P((struct vop_access_args *)))nop_lease) -int spec_ioctl __P((struct vop_ioctl_args *)); -int spec_select __P((struct vop_select_args *)); -#define spec_revoke ((int (*) __P((struct vop_access_args *)))nop_revoke) -#define spec_mmap ((int (*) __P((struct vop_access_args *)))err_mmap) -int spec_fsync __P((struct vop_fsync_args *)); -#define spec_seek ((int (*) __P((struct vop_access_args *)))err_seek) -#define spec_remove ((int (*) __P((struct vop_access_args *)))err_remove) -#define spec_link ((int (*) __P((struct vop_access_args *)))err_link) -#define spec_rename ((int (*) __P((struct vop_access_args *)))err_rename) -#define spec_mkdir ((int (*) __P((struct vop_access_args *)))err_mkdir) -#define spec_rmdir ((int (*) __P((struct vop_access_args *)))err_rmdir) -#define spec_symlink ((int (*) __P((struct vop_access_args *)))err_symlink) -#define spec_readdir ((int (*) __P((struct vop_access_args *)))err_readdir) -#define spec_readlink ((int (*) __P((struct vop_access_args *)))err_readlink) -#define spec_abortop ((int (*) __P((struct vop_access_args *)))err_abortop) -#define spec_inactive ((int (*) __P((struct vop_access_args *)))nop_inactive) -#define spec_reclaim ((int (*) __P((struct vop_access_args *)))nop_reclaim) -#define spec_lock ((int (*) __P((struct vop_access_args *)))nop_lock) -#define spec_unlock ((int (*) __P((struct vop_access_args *)))nop_unlock) -int spec_bmap __P((struct vop_bmap_args *)); -int spec_strategy __P((struct vop_strategy_args *)); -int spec_print __P((struct vop_print_args *)); -#define spec_islocked ((int (*) __P((struct vop_access_args *)))nop_islocked) -int spec_pathconf __P((struct vop_pathconf_args *)); -int spec_advlock __P((struct vop_advlock_args *)); -#define spec_blkatoff ((int (*) __P((struct vop_access_args *)))err_blkatoff) -#define spec_valloc ((int (*) __P((struct vop_access_args *)))err_valloc) -#define spec_vfree ((int (*) __P((struct vop_access_args *)))err_vfree) -#define spec_truncate ((int (*) __P((struct vop_access_args *)))nop_truncate) -#define spec_update ((int (*) __P((struct vop_access_args 
*)))nop_update) -#define spec_reallocblks \ - ((int (*) __P((struct vop_reallocblks_args *)))err_reallocblks) -#define spec_bwrite ((int (*) __P((struct vop_bwrite_args *)))nop_bwrite) -int spec_devblocksize __P((struct vop_devblocksize_args *)); -int spec_blktooff __P((struct vop_blktooff_args *)); -int spec_offtoblk __P((struct vop_offtoblk_args *)); -int spec_cmap __P((struct vop_cmap_args *)); +int spec_lookup (struct vnop_lookup_args *); +#define spec_create (int (*) (struct vnop_access_args *))err_create +#define spec_mknod (int (*) (struct vnop_access_args *))err_mknod +int spec_open (struct vnop_open_args *); +int spec_close (struct vnop_close_args *); +#define spec_access (int (*) (struct vnop_access_args *))spec_ebadf +#define spec_getattr (int (*) (struct vnop_getattr_args *))spec_ebadf +#define spec_setattr (int (*) (struct vnop_setattr_args *))spec_ebadf +int spec_read (struct vnop_read_args *); +int spec_write (struct vnop_write_args *); +int spec_ioctl (struct vnop_ioctl_args *); +int spec_select (struct vnop_select_args *); +#define spec_revoke (int (*) (struct vnop_access_args *))nop_revoke +#define spec_mmap (int (*) (struct vnop_access_args *))err_mmap +int spec_fsync (struct vnop_fsync_args *); +int spec_fsync_internal (vnode_t, int, vfs_context_t); +#define spec_remove (int (*) (struct vnop_access_args *))err_remove +#define spec_link (int (*) (struct vnop_access_args *))err_link +#define spec_rename (int (*) (struct vnop_access_args *))err_rename +#define spec_mkdir (int (*) (struct vnop_access_args *))err_mkdir +#define spec_rmdir (int (*) (struct vnop_access_args *))err_rmdir +#define spec_symlink (int (*) (struct vnop_access_args *))err_symlink +#define spec_readdir (int (*) (struct vnop_access_args *))err_readdir +#define spec_readlink (int (*) (struct vnop_access_args *))err_readlink +#define spec_inactive (int (*) (struct vnop_access_args *))nop_inactive +#define spec_reclaim (int (*) (struct vnop_access_args *))nop_reclaim +#define spec_lock (int (*) (struct vnop_access_args *))nop_lock +#define spec_unlock (int (*)(struct vnop_access_args *))nop_unlock +int spec_strategy (struct vnop_strategy_args *); +#define spec_islocked (int (*) (struct vnop_access_args *))nop_islocked +int spec_pathconf (struct vnop_pathconf_args *); +#define spec_advlock (int (*) (struct vnop_access_args *))err_advlock +#define spec_blkatoff (int (*) (struct vnop_access_args *))err_blkatoff +#define spec_valloc (int (*) (struct vnop_access_args *))err_valloc +#define spec_vfree (int (*) (struct vnop_access_args *))err_vfree +#define spec_bwrite (int (*) (struct vnop_bwrite_args *))nop_bwrite +int spec_devblocksize (struct vnop_devblocksize_args *); +int spec_blktooff (struct vnop_blktooff_args *); +int spec_offtoblk (struct vnop_offtoblk_args *); +int spec_blockmap (struct vnop_blockmap_args *); #endif /* __APPLE_API_PRIVATE */ #endif /* _MISCFS_SPECFS_SPECDEV_H_ */ diff --git a/bsd/miscfs/synthfs/synthfs.h b/bsd/miscfs/synthfs/synthfs.h index 28a85f8b2..cb152a45e 100644 --- a/bsd/miscfs/synthfs/synthfs.h +++ b/bsd/miscfs/synthfs/synthfs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -97,7 +97,6 @@ struct synthfsnode struct synthfsnode *s_parent; struct vnode *s_vp; char *s_name; - struct lock__bsd__ s_lock; unsigned long s_nodeflags; /* Internal synthfs flags: IN_CHANGED, IN_MODIFIED, etc. 
*/ unsigned long s_pflags; /* File system flags: IMMUTABLE, etc. */ unsigned long s_nodeid; @@ -146,7 +145,10 @@ struct synthfsnode (sp)->s_modificationtime = *(t2); \ } \ if ((sp)->s_nodeflags & IN_CHANGE) { \ - (sp)->s_changetime = time; \ + struct timeval _tv; \ + \ + microtime(&_tv); \ + (sp)->s_changetime = _tv; \ }; \ (sp)->s_nodeflags &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); \ } \ @@ -182,54 +184,49 @@ struct synthfsnode extern int (**synthfs_vnodeop_p)(void *); __BEGIN_DECLS -int synthfs_mount __P((struct mount *, char *, caddr_t, struct nameidata *, struct proc *)); -int synthfs_start __P((struct mount *, int, struct proc *)); -int synthfs_unmount __P((struct mount *, int, struct proc *)); -int synthfs_root __P((struct mount *, struct vnode **)); -int synthfs_quotactl __P((struct mount *, int, uid_t, caddr_t, struct proc *)); -int synthfs_statfs __P((struct mount *, struct statfs *, struct proc *)); -int synthfs_sync __P((struct mount *, int, struct ucred *, struct proc *)); -int synthfs_vget __P((struct mount *, void *ino, struct vnode **)); -int synthfs_fhtovp __P((struct mount *, struct fid *, struct mbuf *, struct vnode **, int *, struct ucred **)); -int synthfs_vptofh __P((struct vnode *, struct fid *)); -int synthfs_init __P((struct vfsconf *)); -int synthfs_sysctl __P((int *, u_int, void *, size_t *, void *, size_t, struct proc *)); - -int synthfs_create __P((struct vop_create_args *)); -int synthfs_open __P((struct vop_open_args *)); -int synthfs_mmap __P((struct vop_mmap_args *)); -int synthfs_access __P((struct vop_access_args *)); -int synthfs_getattr __P((struct vop_getattr_args *)); -int synthfs_setattr __P((struct vop_setattr_args *)); -int synthfs_rename __P((struct vop_rename_args *)); -int synthfs_select __P((struct vop_select_args *)); -int synthfs_remove __P((struct vop_remove_args *)); -int synthfs_mkdir __P((struct vop_mkdir_args *)); -int synthfs_rmdir __P((struct vop_rmdir_args *)); -int synthfs_symlink __P((struct vop_symlink_args *)); -int synthfs_readlink __P((struct vop_readlink_args *)); -int synthfs_readdir __P((struct vop_readdir_args *)); -int synthfs_cached_lookup __P((struct vop_cachedlookup_args *)); -int synthfs_lookup __P((struct vop_cachedlookup_args *)); -int synthfs_pathconf __P((struct vop_pathconf_args *)); -int synthfs_update __P((struct vop_update_args *)); +int synthfs_mount (struct mount *, vnode_t, user_addr_t, vfs_context_t context); +int synthfs_start (struct mount *, int, vfs_context_t context); +int synthfs_unmount (struct mount *, int, vfs_context_t context); +int synthfs_root (struct mount *, struct vnode **, vfs_context_t context); +int synthfs_vfs_getattr (mount_t mp, struct vfs_attr *fsap, vfs_context_t context); +int synthfs_sync (struct mount *, int, vfs_context_t context); +int synthfs_vget (struct mount *, ino64_t ino, struct vnode **, vfs_context_t context); +int synthfs_fhtovp (struct mount *, int, unsigned char *, struct vnode **, vfs_context_t context); +int synthfs_vptofh (struct vnode *, int *, unsigned char *, vfs_context_t context); +int synthfs_init (struct vfsconf *); +int synthfs_sysctl (int *, u_int, user_addr_t, size_t *, user_addr_t, size_t, vfs_context_t context); + +int synthfs_create (struct vnop_create_args *); +int synthfs_open (struct vnop_open_args *); +int synthfs_mmap (struct vnop_mmap_args *); +int synthfs_getattr (struct vnop_getattr_args *); +int synthfs_setattr (struct vnop_setattr_args *); +int synthfs_rename (struct vnop_rename_args *); +int synthfs_select (struct vnop_select_args *); +int 
synthfs_remove (struct vnop_remove_args *); +int synthfs_mkdir (struct vnop_mkdir_args *); +int synthfs_rmdir (struct vnop_rmdir_args *); +int synthfs_symlink (struct vnop_symlink_args *); +int synthfs_readlink (struct vnop_readlink_args *); +int synthfs_readdir (struct vnop_readdir_args *); +int synthfs_cached_lookup (struct vnop_lookup_args *); +int synthfs_lookup (struct vnop_lookup_args *); +int synthfs_pathconf (struct vnop_pathconf_args *); -int synthfs_lock __P((struct vop_lock_args *)); -int synthfs_unlock __P((struct vop_unlock_args *)); -int synthfs_islocked __P((struct vop_islocked_args *)); - -int synthfs_inactive __P((struct vop_inactive_args*)); -int synthfs_reclaim __P((struct vop_reclaim_args*)); - -void synthfs_setupuio __P((struct iovec *iov, struct uio *uio, void *buffer, size_t bufsize, enum uio_seg space, enum uio_rw direction, struct proc *p)); -int synthfs_new_directory __P((struct mount *mp, struct vnode *dp, const char *name, unsigned long nodeid, mode_t mode, struct proc *p, struct vnode **vpp)); -int synthfs_new_symlink __P((struct mount *mp, struct vnode *dp, const char *name, unsigned long nodeid, char *targetstring, struct proc *p, struct vnode **vpp)); -long synthfs_adddirentry __P((u_int32_t fileno, u_int8_t type, const char *name, struct uio *uio)); -int synthfs_remove_entry __P((struct vnode *vp)); -int synthfs_remove_directory __P((struct vnode *vp)); -int synthfs_remove_symlink __P((struct vnode *vp)); -int synthfs_move_rename_entry __P((struct vnode *source_vp, struct vnode *newparent_vp, char *newname)); -int synthfs_derive_vnode_path __P((struct vnode *vp, char *vnpath, size_t pathbuffersize)); + +int synthfs_inactive (struct vnop_inactive_args*); +int synthfs_reclaim (struct vnop_reclaim_args*); + +void synthfs_setupuio (struct iovec *iov, struct uio *uio, void *buffer, size_t bufsize, enum uio_seg space, enum uio_rw direction, proc_t p); +int synthfs_new_directory (mount_t mp, vnode_t dp, const char *name, unsigned long nodeid, mode_t mode, proc_t p, vnode_t *vpp); +int synthfs_new_symlink (mount_t mp, vnode_t dp, const char *name, unsigned long nodeid, char *targetstring, proc_t p, vnode_t *vpp); +long synthfs_adddirentry (u_int32_t fileno, u_int8_t type, const char *name, struct uio *uio); +int synthfs_remove_entry (struct vnode *vp); +int synthfs_remove_directory (struct vnode *vp); +int synthfs_remove_symlink (struct vnode *vp); +int synthfs_move_rename_entry (struct vnode *source_vp, struct vnode *newparent_vp, char *newname); +int synthfs_derive_vnode_path (struct vnode *vp, char *vnpath, size_t pathbuffersize); +int synthfs_update(struct vnode *vp, struct timeval *access, struct timeval *modify, int waitfor); #endif /* __APPLE_API_PRIVATE */ #endif /* __SYNTHFS_H__ */ diff --git a/bsd/miscfs/synthfs/synthfs_util.c b/bsd/miscfs/synthfs/synthfs_util.c index 37ec7cde4..d28e6ec5d 100644 --- a/bsd/miscfs/synthfs/synthfs_util.c +++ b/bsd/miscfs/synthfs/synthfs_util.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -34,16 +34,16 @@ #include <sys/kernel.h> #include <sys/file.h> #include <sys/stat.h> -#include <sys/buf.h> #include <sys/proc.h> #include <sys/conf.h> -#include <sys/mount.h> -#include <sys/vnode.h> +#include <sys/mount_internal.h> +#include <sys/vnode_internal.h> #include <sys/malloc.h> #include <sys/dirent.h> #include <sys/namei.h> #include <sys/attr.h> #include <sys/time.h> +#include <sys/uio_internal.h> #include <sys/vm.h> #include <sys/errno.h> @@ -62,25 +62,6 @@ struct synthfs_direntry_head { #define PATHSEPARATOR '/' #define ROOTDIRID 2 -void synthfs_setupuio(struct iovec *iov, - struct uio *uio, - void *buffer, - size_t bufsize, - enum uio_seg space, - enum uio_rw direction, - struct proc *p) { - iov->iov_base = (char *)buffer; - iov->iov_len = bufsize; - - uio->uio_iov = iov; - uio->uio_iovcnt = 1; - uio->uio_offset = 0; - uio->uio_resid = bufsize; - uio->uio_segflg = space; - uio->uio_rw = direction; - uio->uio_procp = p; -} - static int synthfs_insertnode(struct synthfsnode *newnode_sp, struct synthfsnode *parent_sp) { struct timeval now; @@ -91,25 +72,25 @@ static int synthfs_insertnode(struct synthfsnode *newnode_sp, struct synthfsnode ++parent_sp->s_u.d.d_entrycount; newnode_sp->s_parent = parent_sp; - parent_sp->s_nodeflags |= IN_CHANGE | IN_MODIFIED; - now = time; - VOP_UPDATE(STOV(parent_sp), &now, &now, 0); + parent_sp->s_nodeflags |= IN_CHANGE | IN_MODIFIED; + microtime(&now); + synthfs_update(STOV(parent_sp), &now, &now, 0); return 0; } -static int synthfs_newnode(struct mount *mp, struct vnode *dp, const char *name, unsigned long nodeid, mode_t mode, struct proc *p, struct vnode **vpp) { +static int synthfs_newnode(mount_t mp, vnode_t dp, const char *name, unsigned long nodeid, + mode_t mode, __unused proc_t p, enum vtype vtype, vnode_t *vpp) { int result; struct synthfsnode *sp; struct vnode *vp; struct timeval now; char *nodename; + struct vnode_fsparam vfsp; - /* Allocate the synthfsnode now to avoid blocking between the call - to getnewvnode(), below, and the initialization of v_data: */ - MALLOC(sp, struct synthfsnode *, sizeof(struct synthfsnode), M_SYNTHFS, M_WAITOK); + MALLOC(sp, struct synthfsnode *, sizeof(struct synthfsnode), M_SYNTHFS, M_WAITOK); if (name == NULL) { MALLOC(nodename, char *, 1, M_TEMP, M_WAITOK); @@ -119,31 +100,12 @@ static int synthfs_newnode(struct mount *mp, struct vnode *dp, const char *name, strcpy(nodename, name); }; - /* - Note that getnewvnode() returns the vnode with a refcount of +1; - this routine returns the newly created vnode with this positive refcount. 
- */ - result = getnewvnode(VT_SYNTHFS, mp, synthfs_vnodeop_p, &vp); - if (result != 0) { - DBG_VOP(("getnewvnode failed with error code %d\n", result)); - FREE(nodename, M_TEMP); - FREE(sp, M_TEMP); - return result; - } - if (vp == NULL) { - DBG_VOP(("getnewvnode returned NULL without an error!\n")); - FREE(nodename, M_TEMP); - FREE(sp, M_TEMP); - return EINVAL; - } - /* Initialize the relevant synthfsnode fields: */ bzero(sp, sizeof(*sp)); - lockinit(&sp->s_lock, PINOD, "synthfsnode", 0, 0); sp->s_nodeid = nodeid; /* Initialize all times from a consistent snapshot of the clock: */ - now = time; + microtime(&now); sp->s_createtime = now; sp->s_accesstime = now; sp->s_modificationtime = now; @@ -151,11 +113,32 @@ static int synthfs_newnode(struct mount *mp, struct vnode *dp, const char *name, sp->s_name = nodename; sp->s_mode = mode; + + //bzero(&vfsp, sizeof(struct vnode_fsparam)); + vfsp.vnfs_mp = mp; + vfsp.vnfs_vtype = vtype; + vfsp.vnfs_str = "synthfs"; + vfsp.vnfs_dvp = 0; + vfsp.vnfs_fsnode = sp; + vfsp.vnfs_cnp = 0; + vfsp.vnfs_vops = synthfs_vnodeop_p; + vfsp.vnfs_rdev = 0; + vfsp.vnfs_filesize = 0; + vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE; + vfsp.vnfs_marksystem = 0; + vfsp.vnfs_markroot = 0; + + result = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &vp); + if (result != 0) { + DBG_VOP(("getnewvnode failed with error code %d\n", result)); + FREE(nodename, M_TEMP); + FREE(sp, M_TEMP); + return result; + } + vnode_ref(vp); + sp->s_vp = vp; - vp->v_data = sp; - vget(vp, LK_EXCLUSIVE, p); - /* If there's a parent directory, update its subnode structures to insert this new node: */ if (dp) { result = synthfs_insertnode(sp, VTOS(dp)); @@ -178,8 +161,8 @@ int synthfs_remove_entry(struct vnode *vp) { --psp->s_u.d.d_entrycount; psp->s_nodeflags |= IN_CHANGE | IN_MODIFIED; - now = time; - VOP_UPDATE(STOV(psp), &now, &now, 0); + microtime(&now); + synthfs_update(STOV(psp), &now, &now, 0); }; return 0; @@ -219,15 +202,13 @@ int synthfs_new_directory(struct mount *mp, struct vnode *dp, const char *name, struct vnode *vp; struct synthfsnode *sp; - result = synthfs_newnode(mp, dp, name, nodeid, mode, p, &vp); + result = synthfs_newnode(mp, dp, name, nodeid, mode, p, VDIR, &vp); if (result) { return result; }; sp = VTOS(vp); sp->s_linkcount = 2; - /* Initialize the relevant vnode fields: */ - vp->v_type = VDIR; if (dp) { ++VTOS(dp)->s_linkcount; /* Account for the [fictitious] ".." link */ }; @@ -251,6 +232,7 @@ int synthfs_remove_directory(struct vnode *vp) { if (psp && (sp->s_type == SYNTHFS_DIRECTORY) && (psp != sp)) { --psp->s_linkcount; /* account for the [fictitious] ".."
link now removed */ }; + vnode_rele(vp); /* Do the standard cleanup involved in pruning an entry from the filesystem: */ return synthfs_remove_entry(vp); /* Do whatever standard cleanup is required */ @@ -271,16 +253,13 @@ int synthfs_new_symlink( struct vnode *vp; struct synthfsnode *sp; - result = synthfs_newnode(mp, dp, name, nodeid, 0, p, &vp); + result = synthfs_newnode(mp, dp, name, nodeid, 0, p, VLNK, &vp); if (result) { return result; }; sp = VTOS(vp); sp->s_linkcount = 1; - /* Initialize the relevant vnode fields: */ - vp->v_type = VLNK; - /* Set up the symlink-specific fields: */ sp->s_type = SYNTHFS_SYMLINK; sp->s_u.s.s_length = strlen(targetstring); @@ -298,6 +277,7 @@ int synthfs_remove_symlink(struct vnode *vp) { struct synthfsnode *sp = VTOS(vp); FREE(sp->s_u.s.s_symlinktarget, M_TEMP); + vnode_rele(vp); /* Do the standard cleanup involved in pruning an entry from the filesystem: */ return synthfs_remove_entry(vp); /* Do whatever standard cleanup is required */ @@ -324,7 +304,7 @@ long synthfs_adddirentry(u_int32_t fileno, u_int8_t type, const char *name, stru direntry.d_type = type; direntry.d_namlen = namelength; - if (uio->uio_resid < direntry.d_reclen) { + if (uio_resid(uio) < direntry.d_reclen) { direntrylength = 0; } else { uiomove((caddr_t)(&direntry), sizeof(direntry), uio); diff --git a/bsd/miscfs/synthfs/synthfs_vfsops.c b/bsd/miscfs/synthfs/synthfs_vfsops.c index 530239f61..39e8d6a6c 100644 --- a/bsd/miscfs/synthfs/synthfs_vfsops.c +++ b/bsd/miscfs/synthfs/synthfs_vfsops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -31,13 +31,12 @@ #include <sys/systm.h> #include <sys/namei.h> #include <sys/filedesc.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/kernel.h> #include <mach/machine/vm_types.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/socket.h> -#include <sys/mount.h> -#include <sys/buf.h> +#include <sys/mount_internal.h> #include <sys/mbuf.h> #include <sys/file.h> #include <sys/disk.h> @@ -45,6 +44,7 @@ #include <sys/errno.h> #include <sys/malloc.h> #include <sys/attr.h> +#include <sys/uio_internal.h> #include <miscfs/specfs/specdev.h> @@ -59,8 +59,8 @@ struct vfsops synthfs_vfsops = { synthfs_start, synthfs_unmount, synthfs_root, - synthfs_quotactl, - synthfs_statfs, + NULL, /* quotactl */ + synthfs_vfs_getattr, synthfs_sync, synthfs_vget, synthfs_fhtovp, @@ -71,7 +71,7 @@ struct vfsops synthfs_vfsops = { #define ROOTMPMODE 0755 #define ROOTPLACEHOLDERMODE 0700 -static char synthfs_fs_name[MFSNAMELEN] = "synthfs"; +static char synthfs_fs_name[MFSTYPENAMELEN] = "synthfs"; static char synthfs_fake_mntfromname[] = "<synthfs>"; @@ -91,130 +91,14 @@ int vn_symlink(struct proc *p, char *path, char *link); #if LOADABLE_FS void synthfs_load(int loadArgument) { - struct vfsconf *newvfsconf = NULL; - int j; - int (***opv_desc_vector_p)() = NULL; - int (**opv_desc_vector)(); - struct vnodeopv_entry_desc *opve_descp; - int error = 0; - -#pragma unused(loadArgument) - - /* - * This routine is responsible for all the initialization that would - * ordinarily be done as part of the system startup; it calls synthfs_init - * to do the initialization that is strictly synthfs-specific. 
- */ - - DBG_VOP(("load_synthfs: starting ...\n")); - - MALLOC(newvfsconf, void *, sizeof(struct vfsconf), M_SYNTHFS, M_WAITOK); - DBG_VOP(("load_synthfs: Allocated new vfsconf list entry, newvfsconf = 0x%08lx.\n", (unsigned long)newvfsconf)); - bzero(newvfsconf, sizeof(struct vfsconf)); - - if (newvfsconf) { - DBG_VOP(("load_synthfs: filling in newly allocated vfsconf entry at 0x%08lX.\n", (long)newvfsconf)); - newvfsconf->vfc_vfsops = &synthfs_vfsops; - strncpy(&newvfsconf->vfc_name[0], synthfs_fs_name, MFSNAMELEN); - newvfsconf->vfc_typenum = maxvfsconf++; - newvfsconf->vfc_refcount = 0; - newvfsconf->vfc_flags = 0; - newvfsconf->vfc_mountroot = NULL; /* Can't mount root of file system [yet] */ - - newvfsconf->vfc_next = NULL; - - /* Based on vfs_op_init and ... */ - opv_desc_vector_p = synthfs_vnodeop_opv_desc.opv_desc_vector_p; - - DBG_VOP(("load_synthfs: Allocating and initializing VNode ops vector...\n")); - - /* - * Allocate and init the vector. - * Also handle backwards compatibility. - */ - - MALLOC(*opv_desc_vector_p, PFI *, vfs_opv_numops*sizeof(PFI), M_SYNTHFS, M_WAITOK); - bzero (*opv_desc_vector_p, vfs_opv_numops*sizeof(PFI)); - opv_desc_vector = *opv_desc_vector_p; - for (j=0; synthfs_vnodeop_opv_desc.opv_desc_ops[j].opve_op; j++) { - opve_descp = &(synthfs_vnodeop_opv_desc.opv_desc_ops[j]); - - /* - * Sanity check: is this operation listed - * in the list of operations? We check this - * by seeing if its offest is zero. Since - * the default routine should always be listed - * first, it should be the only one with a zero - * offset. Any other operation with a zero - * offset is probably not listed in - * vfs_op_descs, and so is probably an error. - * - * A panic here means the layer programmer - * has committed the all-too common bug - * of adding a new operation to the layer's - * list of vnode operations but - * not adding the operation to the system-wide - * list of supported operations. - */ - if (opve_descp->opve_op->vdesc_offset == 0 && - opve_descp->opve_op->vdesc_offset != VOFFSET(vop_default)) { - DBG_VOP(("load_synthfs: operation %s not listed in %s.\n", - opve_descp->opve_op->vdesc_name, - "vfs_op_descs")); - panic ("load_synthfs: bad operation"); - } - /* - * Fill in this entry. - */ - opv_desc_vector[opve_descp->opve_op->vdesc_offset] = - opve_descp->opve_impl; - } - - /* - * Finally, go back and replace unfilled routines - * with their default. (Sigh, an O(n^3) algorithm. I - * could make it better, but that'd be work, and n is small.) - */ - opv_desc_vector_p = synthfs_vnodeop_opv_desc.opv_desc_vector_p; - - /* - * Force every operations vector to have a default routine. 
- */ - opv_desc_vector = *opv_desc_vector_p; - if (opv_desc_vector[VOFFSET(vop_default)]==NULL) { - panic("load_vp;fs: operation vector without default routine."); - } - for (j = 0;j<vfs_opv_numops; j++) - if (opv_desc_vector[j] == NULL) - opv_desc_vector[j] = - opv_desc_vector[VOFFSET(vop_default)]; - - if (error = vfsconf_add(newvfsconf)) { - goto ErrExit; - }; - goto InitFS; - - -ErrExit: ; - if (opv_desc_vector_p && *opv_desc_vector_p) FREE(*opv_desc_vector_p, M_SYNTHFS); - - if (newvfsconf) FREE (newvfsconf, M_SYNTHFS); - goto StdExit; - - -InitFS: ; - DBG_VOP(("load_synthfs: calling synthfs_init()...\n")); - synthfs_init(newvfsconf); - }; - -StdExit: ; + /* Should use vfs_fsadd kpi */ } int synthfs_unload(void) { - DBG_VOP(("synthfs: Entering synthfs_unload...\n")); + /* should use fs_fsremove kpi */ return 0; } #endif @@ -227,7 +111,7 @@ int synthfs_unload(void) { * mount system call */ int -synthfs_mount_fs(struct mount *mp, char *path, caddr_t data, struct nameidata *ndp, struct proc *p) +synthfs_mount_fs(struct mount *mp, vnode_t devvp, __unused user_addr_t data, struct proc *p) { struct synthfs_mntdata *priv_mnt_data; int error; @@ -237,9 +121,8 @@ synthfs_mount_fs(struct mount *mp, char *path, caddr_t data, struct nameidata *n MALLOC(priv_mnt_data, struct synthfs_mntdata *, sizeof(struct synthfs_mntdata), M_SYNTHFS, M_WAITOK); DBG_VOP(("MALLOC succeeded...\n")); - strncpy(mp->mnt_stat.f_fstypename, synthfs_fs_name, sizeof(mp->mnt_stat.f_fstypename)); - (void) copyinstr(path, mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname) - 1, &size); - strncpy(mp->mnt_stat.f_mntfromname, synthfs_fake_mntfromname, sizeof(mp->mnt_stat.f_mntfromname)); + strncpy(mp->mnt_vfsstat.f_fstypename, synthfs_fs_name, sizeof(mp->mnt_vfsstat.f_fstypename)); + strncpy(mp->mnt_vfsstat.f_mntfromname, synthfs_fake_mntfromname, sizeof(mp->mnt_vfsstat.f_mntfromname)); priv_mnt_data->synthfs_mounteddev = (dev_t)0; priv_mnt_data->synthfs_nextid = FIRST_SYNTHFS_ID; priv_mnt_data->synthfs_filecount = 0; @@ -263,7 +146,7 @@ synthfs_mount_fs(struct mount *mp, char *path, caddr_t data, struct nameidata *n /* Drop the freshly acquired reference on the root, leaving v_usecount=1 to prevent the vnode from beeing freed: */ - vput(priv_mnt_data->synthfs_rootvp); + vnode_put(priv_mnt_data->synthfs_rootvp); return (0); } @@ -271,17 +154,15 @@ synthfs_mount_fs(struct mount *mp, char *path, caddr_t data, struct nameidata *n int -synthfs_mount(mp, path, data, ndp, p) +synthfs_mount(mp, devvp, data, context) register struct mount *mp; - char *path; - caddr_t data; - struct nameidata *ndp; - struct proc *p; + vnode_t devvp; + user_addr_t data; + vfs_context_t context; { size_t size; - (void) copyinstr(path, mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname) - 1, &size); - return (synthfs_mount_fs(mp, path, data, ndp, p)); + return (synthfs_mount_fs(mp, devvp, data, vfs_context_proc(context))); } @@ -301,10 +182,10 @@ synthfs_init(vfsp) } int -synthfs_start(mp, flags, p) +synthfs_start(mp, flags, context) struct mount * mp; int flags; -struct proc * p; +vfs_context_t context; { DBG_VOP(("synthfs_start called.\n")); return 0; @@ -314,38 +195,27 @@ struct proc * p; * Return the root of a filesystem. 
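 *
 * Under the new KPI the root is handed back holding an iocount from
 * vnode_get() instead of the old exclusive vnode lock. A caller would
 * balance it roughly like this (a sketch, assuming the VFS_ROOT entry
 * point that takes a vfs_context):
 *
 *	vnode_t rvp;
 *	if (VFS_ROOT(mp, &rvp, ctx) == 0) {
 *		... use rvp ...
 *		vnode_put(rvp);		drops the iocount, not a lock
 *	}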
*/ int -synthfs_root(mp, vpp) +synthfs_root(mp, vpp, context) struct mount *mp; struct vnode **vpp; + vfs_context_t context; { unsigned long root_nodeid = ROOT_DIRID; DBG_VOP(("synthfs_root called.\n")); *vpp = VFSTOSFS(mp)->synthfs_rootvp; - return vget(VFSTOSFS(mp)->synthfs_rootvp, LK_EXCLUSIVE | LK_RETRY, current_proc()); -} - -int -synthfs_quotactl(mp, cmds, uid, arg, p) -struct mount *mp; -int cmds; -uid_t uid; -caddr_t arg; -struct proc * p; -{ - DBG_VOP(("synthfs_quotactl called.\n")); - return (0); + return vnode_get(VFSTOSFS(mp)->synthfs_rootvp); } /* * unmount system call */ int -synthfs_unmount(mp, mntflags, p) +synthfs_unmount(mp, mntflags, context) struct mount *mp; int mntflags; - struct proc *p; + vfs_context_t context; { struct synthfs_mntdata *synth; struct vnode *root_vp; @@ -359,16 +229,13 @@ synthfs_unmount(mp, mntflags, p) if (retval && ((mntflags & MNT_FORCE) == 0)) goto Err_Exit; /* Free the root vnode. - Note that there's no need to vget() or vref() it before locking it here: the ref. count has been maintained at +1 ever since mount time. */ if (root_vp) { - retval = vn_lock(root_vp, LK_EXCLUSIVE | LK_RETRY, p); if ((mntflags & MNT_FORCE) == 0) { if (retval) goto Err_Exit; if (root_vp->v_usecount > 1) { DBG_VOP(("synthfs ERROR: root vnode = %x, usecount = %d\n", (int)root_vp, synth->synthfs_rootvp->v_usecount)); - VOP_UNLOCK(root_vp, 0, p); retval = EBUSY; goto Err_Exit; }; @@ -377,8 +244,10 @@ synthfs_unmount(mp, mntflags, p) synth->synthfs_rootvp = NULL; if (retval == 0) { - vput(root_vp); /* This drops synthfs's own refcount */ - vgone(root_vp); + vnode_get(root_vp); + vnode_rele(root_vp); + vnode_recycle(root_vp); + vnode_put(root_vp); /* This drops synthfs's own refcount */ }; }; @@ -398,24 +267,22 @@ Err_Exit: * Get file system statistics. */ int -synthfs_statfs(mp, sbp, p) - struct mount *mp; - register struct statfs *sbp; - struct proc *p; +synthfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t context) { - DBG_VOP(("synthfs_statfs called.\n")); - - sbp->f_bsize = 512; - sbp->f_iosize = 512; - sbp->f_blocks = 1024; // lies, darn lies and virtual file systems - sbp->f_bfree = 0; // Nope, can't write here! - sbp->f_bavail = 0; - sbp->f_files = VFSTOSFS(mp)->synthfs_filecount + VFSTOSFS(mp)->synthfs_dircount; - sbp->f_ffree = 0; - strncpy(sbp->f_mntonname, mp->mnt_stat.f_mntonname, sizeof(sbp->f_mntonname)); - strncpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname, sizeof(sbp->f_mntfromname)); - - return (0); + struct synthfs_mntdata *synthfs_mp = VFSTOSFS(mp); + DBG_VOP(("synthfs_vfs_getattr called.\n")); + + VFSATTR_RETURN(fsap, f_bsize, 512); + VFSATTR_RETURN(fsap, f_iosize, 512); + VFSATTR_RETURN(fsap, f_blocks, 1024); + VFSATTR_RETURN(fsap, f_bfree, 0); + VFSATTR_RETURN(fsap, f_bavail, 0); + VFSATTR_RETURN(fsap, f_bused, 1024); + VFSATTR_RETURN(fsap, f_files, synthfs_mp->synthfs_filecount + synthfs_mp->synthfs_dircount); + VFSATTR_RETURN(fsap, f_ffree, 0); + VFSATTR_RETURN(fsap, f_fssubtype, 0); + + return 0; } /* @@ -423,11 +290,10 @@ synthfs_statfs(mp, sbp, p) * structures, so don't do anything */ int -synthfs_sync(mp, waitfor, cred, p) +synthfs_sync(mp, waitfor, context) struct mount *mp; int waitfor; - struct ucred *cred; - struct proc *p; + vfs_context_t context; { // DBG_VOP(("synthfs_sync called\n")); return 0; @@ -436,12 +302,14 @@ synthfs_sync(mp, waitfor, cred, p) * Look up a synthfs node by node number. 
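 *
 * The rewritten loop below revalidates each candidate against its
 * capability id before taking an iocount; the pattern in isolation is:
 *
 *	int vid = vnode_vid(vp);		snapshot the id
 *	if (vnode_getwithvid(vp, vid) != 0)
 *		goto loop;			vp was being recycled; rescan
 *	*vpp = vp;				success: iocount now held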
*/ int -synthfs_vget(mp, ino, vpp) +synthfs_vget(mp, ino, vpp, context) struct mount *mp; - void *ino; + ino64_t ino; struct vnode **vpp; + vfs_context_t context; { struct vnode *vp; + int vid = 0; // DBG_VOP(("synthfs_vget called\n")); @@ -452,19 +320,25 @@ synthfs_vget(mp, ino, vpp) } loop: - simple_lock(&mntvnode_slock); - LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { - if (VTOS(vp)->s_nodeid == *((unsigned long *)ino)) { - if (vget(vp, LK_EXCLUSIVE, current_proc()) != 0) { - simple_unlock(&mntvnode_slock); - goto loop; - }; - simple_unlock(&mntvnode_slock); + TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { + if (VTOS(vp)->s_nodeid == (unsigned long)ino) { + /* + * doing a vnode_getwithvid isn't technically + * necessary since synthfs is an unsafe filesystem + * and we're running behind a funnel at this point + * however, vnode_get always succeeds, which isn't + * what we want if this vnode is in the process of + * being terminated + */ + vid = vnode_vid(vp); + + if (vnode_getwithvid(vp, vid) != 0) { + goto loop; + }; *vpp = vp; return 0; }; }; - simple_unlock(&mntvnode_slock); *vpp = NULL; return -1; } @@ -473,17 +347,11 @@ loop: * fast filesystem related variables. */ int -synthfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; +synthfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, vfs_context_t context) { DBG_VOP(("synthfs_sysctl called.\n")); - return (EOPNOTSUPP); + return (ENOTSUP); } /* @@ -491,16 +359,15 @@ synthfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) * */ int -synthfs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) +synthfs_fhtovp(mp, fhlen, fhp, vpp, context) register struct mount *mp; - struct fid *fhp; - struct mbuf *nam; + int fhlen; + unsigned char *fhp; struct vnode **vpp; - int *exflagsp; - struct ucred **credanonp; + vfs_context_t context; { DBG_VOP(("synthfs_fhtovp called.\n")); - return EOPNOTSUPP; + return ENOTSUP; } /* @@ -508,12 +375,14 @@ synthfs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) */ /* ARGSUSED */ int -synthfs_vptofh(vp, fhp) +synthfs_vptofh(vp, fhlenp, fhp, context) struct vnode *vp; - struct fid *fhp; + int *fhlenp; + unsigned char *fhp; + vfs_context_t context; { DBG_VOP(("synthfs_vptofh called.\n")); - return EOPNOTSUPP; + return ENOTSUP; } @@ -522,38 +391,42 @@ synthfs_vptofh(vp, fhp) int -vn_mkdir(struct proc *p, char *path, int mode) { +vn_mkdir(struct proc *p, char *path, int mode) +{ struct nameidata nd; struct vnode *vp; - struct vattr vattr; + struct vnode_attr va; + struct vfs_context context; int error; - NDINIT(&nd, CREATE, LOCKPARENT, UIO_SYSSPACE, path, p); - if (error = namei(&nd)) { + context.vc_proc = p; + context.vc_ucred = proc_ucred(p); /* XXX kauth_cred_get() ??? 
proxy */ + + NDINIT(&nd, CREATE, LOCKPARENT, UIO_SYSSPACE32, CAST_USER_ADDR_T(path), &context); + error = namei(&nd); + if (error) { DBG_VOP(("vn_mkdir: error from namei, error = %d.\n", error)); return (error); }; vp = nd.ni_vp; - if (vp != NULL) { - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - if (nd.ni_dvp == vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); - vrele(vp); + + if (vp == NULL) { + VATTR_INIT(&va); + VATTR_SET(&va, va_type, VDIR); + VATTR_SET(&va, va_mode, (mode & ACCESSPERMS) &~ p->p_fd->fd_cmask); + + error = vn_create(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &va, 0, &context); + if (error) + DBG_VOP(("vn_mkdir: error from vnop_mkdir (%d).\n", error)); + } else { DBG_VOP(("vn_mkdir: target already exists; returning EEXIST.\n")); - return (EEXIST); + error = EEXIST; } - VATTR_NULL(&vattr); - vattr.va_type = VDIR; - vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_fd->fd_cmask; - VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); - error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); - if (error) { - DBG_VOP(("vn_mkdir: error from VOP_MKDIR (%d).\n", error)); - } else { - vput(nd.ni_vp); - }; + vnode_put(nd.ni_dvp); + if (nd.ni_vp) + vnode_put(nd.ni_vp); + nameidone(&nd); + return (error); } @@ -562,25 +435,31 @@ vn_mkdir(struct proc *p, char *path, int mode) { int vn_symlink(struct proc *p, char *path, char *link) { struct nameidata nd; - struct vattr vattr; + struct vnode_attr va; + struct vfs_context context; int error; - NDINIT(&nd, CREATE, LOCKPARENT, UIO_SYSSPACE, link, p); - if (error = namei(&nd)) return error; - - if (nd.ni_vp) { - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - if (nd.ni_dvp == nd.ni_vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); - vrele(nd.ni_vp); - return EEXIST; - } - VATTR_NULL(&vattr); - vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask; - VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); - return VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path); + context.vc_proc = p; + context.vc_ucred = proc_ucred(p); /* XXX kauth_cred_get() ??? proxy */ + + NDINIT(&nd, CREATE, LOCKPARENT, UIO_SYSSPACE32, CAST_USER_ADDR_T(link), &context); + if ((error = namei(&nd))) return error; + + if (nd.ni_vp == NULL) { + VATTR_INIT(&va); + VATTR_SET(&va, va_type, VLNK); + VATTR_SET(&va, va_mode, ACCESSPERMS &~ p->p_fd->fd_cmask); + + error = VNOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &va, path, &context); + } else + error = EEXIST; + + vnode_put(nd.ni_dvp); + if (nd.ni_vp) + vnode_put(nd.ni_vp); + nameidone(&nd); + + return (error); } diff --git a/bsd/miscfs/synthfs/synthfs_vnops.c b/bsd/miscfs/synthfs/synthfs_vnops.c index eb723cb22..4f1110e77 100644 --- a/bsd/miscfs/synthfs/synthfs_vnops.c +++ b/bsd/miscfs/synthfs/synthfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -35,15 +35,16 @@ #include <sys/kernel.h> #include <sys/file.h> #include <sys/stat.h> -#include <sys/buf.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <sys/conf.h> -#include <sys/mount.h> -#include <sys/vnode.h> +#include <sys/mount_internal.h> +#include <sys/vnode_internal.h> #include <sys/malloc.h> #include <sys/dirent.h> #include <sys/namei.h> #include <sys/attr.h> +#include <sys/uio_internal.h> #include <sys/vm.h> #include <sys/errno.h> @@ -55,81 +56,61 @@ #if RWSUPPORT #error NOT PORTED FOR UBC -/* when porting to UBC, do not just replace - * vnode_uncache by ubc_uncache - there's more - * to it than that! 
- */ #include <sys/ubc.h> #endif -extern int groupmember(gid_t gid, struct ucred* cred); +static int synthfs_remove_internal(struct vnode *dvp, struct vnode *vp, + struct componentname *cnp, vfs_context_t context); + #define VOPFUNC int (*)(void *) /* Global vfs data structures for synthfs. */ int (**synthfs_vnodeop_p) (void *); struct vnodeopv_entry_desc synthfs_vnodeop_entries[] = { - {&vop_default_desc, (VOPFUNC)vn_default_error}, - {&vop_strategy_desc, (VOPFUNC)err_strategy}, /* strategy - not supported */ - {&vop_bwrite_desc, (VOPFUNC)err_bwrite}, /* bwrite - not supported */ - {&vop_lookup_desc, (VOPFUNC)synthfs_cached_lookup}, /* cached lookup */ - {&vop_create_desc, (VOPFUNC)synthfs_create}, /* create - DEBUGGER */ - {&vop_whiteout_desc, (VOPFUNC)err_whiteout}, /* whiteout - not supported */ - {&vop_mknod_desc, (VOPFUNC)err_mknod}, /* mknod - not supported */ - {&vop_mkcomplex_desc, (VOPFUNC)err_mkcomplex}, /* mkcomplex - not supported */ - {&vop_open_desc, (VOPFUNC)synthfs_open}, /* open - DEBUGGER */ - {&vop_close_desc, (VOPFUNC)nop_close}, /* close - NOP */ - {&vop_access_desc, (VOPFUNC)synthfs_access}, /* access */ - {&vop_getattr_desc, (VOPFUNC)synthfs_getattr}, /* getattr */ - {&vop_setattr_desc, (VOPFUNC)synthfs_setattr}, /* setattr */ - {&vop_getattrlist_desc, (VOPFUNC)err_getattrlist}, /* getattrlist - not supported */ - {&vop_setattrlist_desc, (VOPFUNC)err_setattrlist}, /* setattrlist - not supported */ - {&vop_read_desc, (VOPFUNC)err_read}, /* read - not supported */ - {&vop_write_desc, (VOPFUNC)err_write}, /* write - not supported */ - {&vop_lease_desc, (VOPFUNC)err_lease}, /* lease - not supported */ - {&vop_ioctl_desc, (VOPFUNC)err_ioctl}, /* ioctl - not supported */ - {&vop_select_desc, (VOPFUNC)synthfs_select}, /* select */ - {&vop_exchange_desc, (VOPFUNC)err_exchange}, /* exchange - not supported */ - {&vop_revoke_desc, (VOPFUNC)nop_revoke}, /* revoke - NOP */ - {&vop_mmap_desc, (VOPFUNC)synthfs_mmap}, /* mmap - DEBUGGER */ - {&vop_fsync_desc, (VOPFUNC)nop_fsync}, /* fsync - NOP */ - {&vop_seek_desc, (VOPFUNC)nop_seek}, /* seek - NOP */ - {&vop_remove_desc, (VOPFUNC)synthfs_remove}, /* remove */ - {&vop_link_desc, (VOPFUNC)err_link}, /* link - not supported */ - {&vop_rename_desc, (VOPFUNC)synthfs_rename}, /* rename */ - {&vop_mkdir_desc, (VOPFUNC)synthfs_mkdir}, /* mkdir */ - {&vop_rmdir_desc, (VOPFUNC)synthfs_rmdir}, /* rmdir */ - {&vop_symlink_desc, (VOPFUNC)synthfs_symlink}, /* symlink */ - {&vop_readdir_desc, (VOPFUNC)synthfs_readdir}, /* readdir */ - {&vop_readdirattr_desc, (VOPFUNC)err_readdirattr}, /* readdirattr - not supported */ - {&vop_readlink_desc, (VOPFUNC)synthfs_readlink}, /* readlink */ - {&vop_abortop_desc, (VOPFUNC)nop_abortop}, /* abortop - NOP */ - {&vop_inactive_desc, (VOPFUNC)synthfs_inactive}, /* inactive */ - {&vop_reclaim_desc, (VOPFUNC)synthfs_reclaim}, /* reclaim */ - {&vop_lock_desc, (VOPFUNC)synthfs_lock}, /* lock */ - {&vop_unlock_desc, (VOPFUNC)synthfs_unlock}, /* unlock */ - {&vop_bmap_desc, (VOPFUNC)err_bmap}, /* bmap - not supported */ - {&vop_print_desc, (VOPFUNC)err_print}, /* print - not supported */ - {&vop_islocked_desc, (VOPFUNC)synthfs_islocked}, /* islocked */ - {&vop_pathconf_desc, (VOPFUNC)synthfs_pathconf}, /* pathconf */ - {&vop_advlock_desc, (VOPFUNC)err_advlock}, /* advlock - not supported */ - {&vop_blkatoff_desc, (VOPFUNC)err_blkatoff}, /* blkatoff - not supported */ - {&vop_valloc_desc, (VOPFUNC)err_valloc}, /* valloc - not supported */ - {&vop_reallocblks_desc, (VOPFUNC)err_reallocblks}, /* reallocblks - not 
supported */ - {&vop_vfree_desc, (VOPFUNC)err_vfree}, /* vfree - not supported */ - {&vop_truncate_desc, (VOPFUNC)err_truncate}, /* truncate - not supported */ - {&vop_allocate_desc, (VOPFUNC)err_allocate}, /* allocate - not supported */ - {&vop_update_desc, (VOPFUNC)synthfs_update}, /* update */ - {&vop_pgrd_desc, (VOPFUNC)err_pgrd}, /* pgrd - not supported */ - {&vop_pgwr_desc, (VOPFUNC)err_pgwr}, /* pgwr - not supported */ - {&vop_pagein_desc, (VOPFUNC)err_pagein}, /* pagein - not supported */ - {&vop_pageout_desc, (VOPFUNC)err_pageout}, /* pageout - not supported */ - {&vop_devblocksize_desc, (VOPFUNC)err_devblocksize}, /* devblocksize - not supported */ - {&vop_searchfs_desc, (VOPFUNC)err_searchfs}, /* searchfs - not supported */ - {&vop_copyfile_desc, (VOPFUNC)err_copyfile}, /* copyfile - not supported */ - { &vop_blktooff_desc, (VOPFUNC)err_blktooff }, /* blktooff not supported */ - { &vop_offtoblk_desc, (VOPFUNC)err_offtoblk }, /* offtoblk not supported */ - { &vop_cmap_desc, (VOPFUNC)err_cmap }, /* cmap not supported */ + {&vnop_default_desc, (VOPFUNC)vn_default_error}, + {&vnop_strategy_desc, (VOPFUNC)err_strategy}, /* strategy - not supported */ + {&vnop_bwrite_desc, (VOPFUNC)err_bwrite}, /* bwrite - not supported */ + {&vnop_lookup_desc, (VOPFUNC)synthfs_cached_lookup}, /* cached lookup */ + {&vnop_create_desc, (VOPFUNC)synthfs_create}, /* create - DEBUGGER */ + {&vnop_whiteout_desc, (VOPFUNC)err_whiteout}, /* whiteout - not supported */ + {&vnop_mknod_desc, (VOPFUNC)err_mknod}, /* mknod - not supported */ + {&vnop_open_desc, (VOPFUNC)synthfs_open}, /* open - DEBUGGER */ + {&vnop_close_desc, (VOPFUNC)nop_close}, /* close - NOP */ + {&vnop_getattr_desc, (VOPFUNC)synthfs_getattr}, /* getattr */ + {&vnop_setattr_desc, (VOPFUNC)synthfs_setattr}, /* setattr */ + {&vnop_getattrlist_desc, (VOPFUNC)err_getattrlist}, /* getattrlist - not supported */ + {&vnop_setattrlist_desc, (VOPFUNC)err_setattrlist}, /* setattrlist - not supported */ + {&vnop_read_desc, (VOPFUNC)err_read}, /* read - not supported */ + {&vnop_write_desc, (VOPFUNC)err_write}, /* write - not supported */ + {&vnop_ioctl_desc, (VOPFUNC)err_ioctl}, /* ioctl - not supported */ + {&vnop_select_desc, (VOPFUNC)synthfs_select}, /* select */ + {&vnop_exchange_desc, (VOPFUNC)err_exchange}, /* exchange - not supported */ + {&vnop_revoke_desc, (VOPFUNC)nop_revoke}, /* revoke - NOP */ + {&vnop_mmap_desc, (VOPFUNC)synthfs_mmap}, /* mmap - DEBUGGER */ + {&vnop_fsync_desc, (VOPFUNC)nop_fsync}, /* fsync - NOP */ + {&vnop_remove_desc, (VOPFUNC)synthfs_remove}, /* remove */ + {&vnop_link_desc, (VOPFUNC)err_link}, /* link - not supported */ + {&vnop_rename_desc, (VOPFUNC)synthfs_rename}, /* rename */ + {&vnop_mkdir_desc, (VOPFUNC)synthfs_mkdir}, /* mkdir */ + {&vnop_rmdir_desc, (VOPFUNC)synthfs_rmdir}, /* rmdir */ + {&vnop_symlink_desc, (VOPFUNC)synthfs_symlink}, /* symlink */ + {&vnop_readdir_desc, (VOPFUNC)synthfs_readdir}, /* readdir */ + {&vnop_readdirattr_desc, (VOPFUNC)err_readdirattr}, /* readdirattr - not supported */ + {&vnop_readlink_desc, (VOPFUNC)synthfs_readlink}, /* readlink */ + {&vnop_inactive_desc, (VOPFUNC)synthfs_inactive}, /* inactive */ + {&vnop_reclaim_desc, (VOPFUNC)synthfs_reclaim}, /* reclaim */ + {&vnop_pathconf_desc, (VOPFUNC)synthfs_pathconf}, /* pathconf */ + {&vnop_advlock_desc, (VOPFUNC)err_advlock}, /* advlock - not supported */ + {&vnop_allocate_desc, (VOPFUNC)err_allocate}, /* allocate - not supported */ + {&vnop_pagein_desc, (VOPFUNC)err_pagein}, /* pagein - not supported */ + {&vnop_pageout_desc, 
(VOPFUNC)err_pageout}, /* pageout - not supported */ + {&vnop_devblocksize_desc, (VOPFUNC)err_devblocksize}, /* devblocksize - not supported */ + {&vnop_searchfs_desc, (VOPFUNC)err_searchfs}, /* searchfs - not supported */ + {&vnop_copyfile_desc, (VOPFUNC)err_copyfile}, /* copyfile - not supported */ + { &vnop_blktooff_desc, (VOPFUNC)err_blktooff }, /* blktooff not supported */ + { &vnop_offtoblk_desc, (VOPFUNC)err_offtoblk }, /* offtoblk not supported */ + { &vnop_blockmap_desc, (VOPFUNC)err_blockmap }, /* blockmap not supported */ {(struct vnodeop_desc *) NULL, (int (*) ()) NULL} }; @@ -147,11 +128,11 @@ struct vnodeopv_desc synthfs_vnodeop_opv_desc = #% create dvp L U U #% create vpp - L - # - vop_create { + vnop_create { IN WILLRELE struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; - IN struct vattr *vap; + IN struct vnode_attr *vap; We are responsible for freeing the namei buffer, it is done in hfs_makenode(), unless there is a previous error. @@ -160,11 +141,12 @@ struct vnodeopv_desc synthfs_vnodeop_opv_desc = int synthfs_create(ap) -struct vop_create_args /* { +struct vnop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; + struct vnode_attr *a_vap; + vfs_context_t a_context; } */ *ap; { #if DEBUG @@ -184,20 +166,18 @@ struct vop_create_args /* { * Open called. #% open vp L L L # - vop_open { + vnop_open { IN struct vnode *vp; IN int mode; - IN struct ucred *cred; - IN struct proc *p; + IN vfs_context_t a_context; */ int synthfs_open(ap) -struct vop_open_args /* { +struct vnop_open_args /* { struct vnode *a_vp; int a_mode; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { struct vnode *vp = ap->a_vp; @@ -225,10 +205,10 @@ struct vop_open_args /* { * NB Currently unsupported. # XXX - not used # - vop_mmap { + vnop_mmap { IN struct vnode *vp; IN int fflags; - IN struct ucred *cred; + IN kauth_cred_t cred; IN struct proc *p; */ @@ -236,186 +216,74 @@ struct vop_open_args /* { /* ARGSUSED */ int -synthfs_mmap(ap) -struct vop_mmap_args /* { - struct vnode *a_vp; - int a_fflags; - struct ucred *a_cred; - struct proc *a_p; -} */ *ap; +synthfs_mmap(__unused struct vnop_mmap_args *ap) { -#if DEBUG - struct vnode *vp = ap->a_vp; - char debugmsg[255]; - - sprintf(debugmsg, "synthfs_mmap: attempt to map '/%s' ?!", VTOS(vp)->s_name); - Debugger(debugmsg); -#endif - return EINVAL; } -/* -#% access vp L L L -# - vop_access { - IN struct vnode *vp; - IN int mode; - IN struct ucred *cred; - IN struct proc *p; - -*/ - -int -synthfs_access(ap) -struct vop_access_args /* { - struct vnode *a_vp; - int a_mode; - struct ucred *a_cred; - struct proc *a_p; -} */ *ap; -{ - struct vnode *vp = ap->a_vp; - mode_t mode = ap->a_mode; - struct ucred *cred = ap->a_cred; - struct synthfsnode *sp = VTOS(vp); - register gid_t *gp; - mode_t mask; - int retval = 0; - int i; - - /* - * Disallow write attempts on read-only file systems; - * unless the file is a socket, fifo, or a block or - * character device resident on the file system. - */ - if (mode & VWRITE) { - switch (vp->v_type) { - case VDIR: - case VLNK: - case VREG: - if (VTOVFS(vp)->mnt_flag & MNT_RDONLY) - return (EROFS); - break; - default: - break; - } - } - - /* If immutable bit set, nobody gets to write it. */ - if ((mode & VWRITE) && (sp->s_flags & IMMUTABLE)) - return (EPERM); - - /* Otherwise, user id 0 always gets access. 
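 *
 * (The remainder of this deleted routine is the classic owner/group/other
 * check. Condensed, with VREAD/VWRITE/VEXEC being the 0400/0200/0100
 * owner-triad bits, it amounts to roughly:
 *
 *	int shift = (cred->cr_uid == sp->s_uid) ? 0 :
 *	            groupmember(sp->s_gid, cred) ? 3 : 6;
 *	mode_t mask = mode >> shift;
 *	return ((sp->s_mode & mask) == mask) ? 0 : EACCES;
 *
 * Under the new KPI synthfs registers no access VNOP at all; permission
 * checking is expected to happen generically in the VFS/kauth layer.)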
*/ - if (ap->a_cred->cr_uid == 0) { - retval = 0; - goto Exit; - }; - - mask = 0; - - /* Otherwise, check the owner. */ - if (cred->cr_uid == sp->s_uid) { - if (mode & VEXEC) - mask |= S_IXUSR; - if (mode & VREAD) - mask |= S_IRUSR; - if (mode & VWRITE) - mask |= S_IWUSR; - retval = ((sp->s_mode & mask) == mask ? 0 : EACCES); - goto Exit; - } - - /* Otherwise, check the groups. */ - for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) - if (sp->s_gid == *gp) { - if (mode & VEXEC) - mask |= S_IXGRP; - if (mode & VREAD) - mask |= S_IRGRP; - if (mode & VWRITE) - mask |= S_IWGRP; - retval = ((sp->s_mode & mask) == mask ? 0 : EACCES); - goto Exit; - } - - /* Otherwise, check everyone else. */ - if (mode & VEXEC) - mask |= S_IXOTH; - if (mode & VREAD) - mask |= S_IROTH; - if (mode & VWRITE) - mask |= S_IWOTH; - retval = ((sp->s_mode & mask) == mask ? 0 : EACCES); - -Exit: - return (retval); -} - /* #% getattr vp = = = # - vop_getattr { + vnop_getattr { IN struct vnode *vp; - IN struct vattr *vap; - IN struct ucred *cred; - IN struct proc *p; + IN struct vnode_attr *vap; + IN vfs_context_t context; */ int synthfs_getattr(ap) -struct vop_getattr_args /* { +struct vnop_getattr_args /* { struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct proc *a_p; + struct vnode_attr *a_vap; + vfs_context_t a_context; } */ *ap; { - struct vnode *vp = ap->a_vp; - struct vattr *vap = ap->a_vap; - struct synthfsnode *sp = VTOS(vp); - struct synthfs_mntdata *smp = VTOSFS(vp); - - vap->va_type = vp->v_type; - vap->va_mode = sp->s_mode; - vap->va_nlink = sp->s_linkcount; - vap->va_uid = sp->s_uid; - vap->va_gid = sp->s_gid; - vap->va_fsid = VTOVFS(vp)->mnt_stat.f_fsid.val[0]; - vap->va_fileid = sp->s_nodeid; + struct vnode *vp = ap->a_vp; + struct vnode_attr *vap = ap->a_vap; + struct synthfsnode *sp = VTOS(vp); + + VATTR_RETURN(vap, va_type, vp->v_type); + VATTR_RETURN(vap, va_mode, sp->s_mode); + VATTR_RETURN(vap, va_nlink, sp->s_linkcount); + VATTR_RETURN(vap, va_uid, sp->s_uid); + VATTR_RETURN(vap, va_gid, sp->s_gid); + VATTR_RETURN(vap, va_fsid, VTOVFS(vp)->mnt_vfsstat.f_fsid.val[0]); + VATTR_RETURN(vap, va_fileid, sp->s_nodeid); switch (vp->v_type) { - case VDIR: - vap->va_size = (sp->s_u.d.d_entrycount + 2) * sizeof(struct dirent); + case VDIR: + VATTR_RETURN(vap, va_data_size, (sp->s_u.d.d_entrycount + 2) * sizeof(struct dirent)); break; - case VREG: - vap->va_size = sp->s_u.f.f_size; + case VREG: + VATTR_RETURN(vap, va_data_size, sp->s_u.f.f_size); break; - case VLNK: - vap->va_size = sp->s_u.s.s_length; + case VLNK: + VATTR_RETURN(vap, va_data_size, sp->s_u.s.s_length); break; - default: - vap->va_size = 0; + default: + VATTR_RETURN(vap, va_data_size, 0); }; - vap->va_blocksize = 512; - vap->va_atime.tv_sec = sp->s_accesstime.tv_sec; - vap->va_atime.tv_nsec = sp->s_accesstime.tv_usec * 1000; - vap->va_mtime.tv_sec = sp->s_modificationtime.tv_sec; - vap->va_mtime.tv_nsec = sp->s_modificationtime.tv_usec * 1000; - vap->va_ctime.tv_sec = sp->s_changetime.tv_sec; - vap->va_ctime.tv_nsec = sp->s_changetime.tv_usec * 1000; - vap->va_gen = sp->s_generation; - vap->va_flags = sp->s_flags; - vap->va_rdev = sp->s_rdev; - vap->va_bytes = vap->va_blocksize * ((vap->va_size + vap->va_blocksize - 1) / vap->va_blocksize); - vap->va_filerev = 0; - vap->va_vaflags = 0; - - return (0); + VATTR_RETURN(vap, va_iosize, 512); + vap->va_access_time.tv_sec = sp->s_accesstime.tv_sec; + vap->va_access_time.tv_nsec = sp->s_accesstime.tv_usec * 1000; + VATTR_SET_SUPPORTED(vap, va_access_time); + 
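	/*
	 * The timestamps are filled in long-hand because the synthfs fields
	 * are struct timeval while the va_*_time fields are struct timespec;
	 * for directly assignable fields, VATTR_RETURN(vap, f, v) is
	 * effectively just the two-step
	 *
	 *	vap->f = (v);
	 *	VATTR_SET_SUPPORTED(vap, f);
	 */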
vap->va_modify_time.tv_sec = sp->s_modificationtime.tv_sec; + vap->va_modify_time.tv_nsec = sp->s_modificationtime.tv_usec * 1000; + VATTR_SET_SUPPORTED(vap, va_modify_time); + vap->va_change_time.tv_sec = sp->s_changetime.tv_sec; + vap->va_change_time.tv_nsec = sp->s_changetime.tv_usec * 1000; + VATTR_SET_SUPPORTED(vap, va_change_time); + VATTR_RETURN(vap, va_gen, sp->s_generation); + VATTR_RETURN(vap, va_flags, sp->s_flags); + VATTR_RETURN(vap, va_rdev, sp->s_rdev); + VATTR_RETURN(vap, va_filerev, 0); + VATTR_RETURN(vap, va_acl, NULL); + + return (0); } @@ -424,20 +292,11 @@ struct vop_getattr_args /* { * Change the mode on a file or directory. * vnode vp must be locked on entry. */ -int synthfs_chmod(struct vnode *vp, int mode, struct ucred *cred, struct proc *p) +int synthfs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct proc *p) { struct synthfsnode *sp = VTOS(vp); int result; - if ((cred->cr_uid != sp->s_uid) && - (result = suser(cred, &p->p_acflag))) - return result; - if (cred->cr_uid) { - if (vp->v_type != VDIR && (mode & S_ISTXT)) - return EFTYPE; - if (!groupmember(sp->s_gid, cred) && (mode & S_ISGID)) - return (EPERM); - } sp->s_mode &= ~ALLPERMS; sp->s_mode |= (mode & ALLPERMS); sp->s_nodeflags |= IN_CHANGE; @@ -454,29 +313,11 @@ int synthfs_chmod(struct vnode *vp, int mode, struct ucred *cred, struct proc *p * Change the flags on a file or directory. * vnode vp must be locked on entry. */ -int synthfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred, struct proc *p) +int synthfs_chflags(struct vnode *vp, u_long flags, kauth_cred_t cred, struct proc *p) { struct synthfsnode *sp = VTOS(vp); - int result; - - if (cred->cr_uid != sp->s_uid && - (result = suser(cred, &p->p_acflag))) - return result; - if (cred->cr_uid == 0) { - if ((sp->s_flags & (SF_IMMUTABLE | SF_APPEND)) && - securelevel > 0) { - return EPERM; - }; - sp->s_flags = flags; - } else { - if (sp->s_flags & (SF_IMMUTABLE | SF_APPEND) || - (flags & UF_SETTABLE) != flags) { - return EPERM; - }; - sp->s_flags &= SF_SETTABLE; - sp->s_flags |= (flags & UF_SETTABLE); - } + sp->s_flags = flags; sp->s_nodeflags |= IN_CHANGE; return 0; @@ -488,26 +329,17 @@ int synthfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred, struct p * Perform chown operation on vnode vp; * vnode vp must be locked on entry. */ -int synthfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, struct proc *p) +int synthfs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred, struct proc *p) { struct synthfsnode *sp = VTOS(vp); uid_t ouid; gid_t ogid; int result = 0; + int is_member; if (uid == (uid_t)VNOVAL) uid = sp->s_uid; if (gid == (gid_t)VNOVAL) gid = sp->s_gid; - /* - * If we don't own the file, are trying to change the owner - * of the file, or are not a member of the target group, - * the caller must be superuser or the call fails. 
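 *
 * (This ownership/superuser test is dropped below: under the new model the
 * VFS layer is assumed to have already authorized the chown through kauth
 * before the filesystem sees it. The only policy left in synthfs_chown()
 * is clearing the setuid/setgid bits for non-superuser callers via
 * suser(cred, NULL).)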
- */ - if ((cred->cr_uid != sp->s_uid || uid != sp->s_uid || - (gid != sp->s_gid && !groupmember((gid_t)gid, cred))) && - (result = suser(cred, &p->p_acflag))) - return result; - ogid = sp->s_gid; ouid = sp->s_uid; @@ -515,8 +347,8 @@ int synthfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, st sp->s_uid = uid; if (ouid != uid || ogid != gid) sp->s_nodeflags |= IN_CHANGE; - if (ouid != uid && cred->cr_uid != 0) sp->s_mode &= ~S_ISUID; - if (ogid != gid && cred->cr_uid != 0) sp->s_mode &= ~S_ISGID; + if (ouid != uid && suser(cred, NULL)) sp->s_mode &= ~S_ISUID; + if (ogid != gid && suser(cred, NULL)) sp->s_mode &= ~S_ISGID; return 0; } @@ -527,143 +359,92 @@ int synthfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, st * Set attribute vnode op. called from several syscalls #% setattr vp L L L # - vop_setattr { + vnop_setattr { IN struct vnode *vp; - IN struct vattr *vap; - IN struct ucred *cred; - IN struct proc *p; - + IN struct vnode_attr *vap; + IN vfs_context_t context; */ int synthfs_setattr(ap) -struct vop_setattr_args /* { +struct vnop_setattr_args /* { struct vnode *a_vp; -struct vattr *a_vap; -struct ucred *a_cred; -struct proc *a_p; +struct vnode_attr *a_vap; +vfs_context_t a_context; } */ *ap; { - struct vnode *vp = ap->a_vp; - struct synthfsnode *sp = VTOS(vp); - struct vattr *vap = ap->a_vap; - struct ucred *cred = ap->a_cred; - struct proc *p = ap->a_p; - struct timeval atimeval, mtimeval; - int result; - - /* - * Check for unsettable attributes. - */ - if (((vap->va_type != VNON) && (vap->va_type != vp->v_type)) || - (vap->va_nlink != VNOVAL) || - (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || - (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || - ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { - result = EINVAL; - goto Err_Exit; - } - - if (vap->va_flags != VNOVAL) { - if (VTOVFS(vp)->mnt_flag & MNT_RDONLY) { - result = EROFS; - goto Err_Exit; - }; - if ((result = synthfs_chflags(vp, vap->va_flags, cred, p))) { - goto Err_Exit; - }; - if (vap->va_flags & (IMMUTABLE | APPEND)) { - result = 0; - goto Err_Exit; - }; - } - - if (sp->s_flags & (IMMUTABLE | APPEND)) { - result = EPERM; - goto Err_Exit; - }; + struct vnode *vp = ap->a_vp; + struct synthfsnode *sp = VTOS(vp); + struct vnode_attr *vap = ap->a_vap; + kauth_cred_t cred = vfs_context_ucred(ap->a_context); + struct proc *p = vfs_context_proc(ap->a_context); + struct timeval atimeval, mtimeval; + uid_t nuid; + gid_t ngid; + int result; + + result = 0; + + if (VATTR_IS_ACTIVE(vap, va_flags)) { + if ((result = synthfs_chflags(vp, vap->va_flags, cred, p))) { + goto Err_Exit; + } + } + VATTR_SET_SUPPORTED(vap, va_flags); + + nuid = (uid_t)ngid = (gid_t)VNOVAL; + if (VATTR_IS_ACTIVE(vap, va_uid)) + nuid = vap->va_uid; + if (VATTR_IS_ACTIVE(vap, va_gid)) + ngid = vap->va_gid; + if (nuid != (uid_t)VNOVAL || ngid != (gid_t)VNOVAL) { + if ((result = synthfs_chown(vp, nuid, ngid, cred, p))) { + goto Err_Exit; + } + } + VATTR_SET_SUPPORTED(vap, va_uid); + VATTR_SET_SUPPORTED(vap, va_gid); - /* - * Go through the fields and update iff not VNOVAL. 
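 *
 * (The VNOVAL-sentinel convention described here is replaced below by the
 * vnode_attr active-bit protocol; the per-field pattern is
 *
 *	if (VATTR_IS_ACTIVE(vap, va_mode)) {
 *		... apply vap->va_mode ...
 *	}
 *	VATTR_SET_SUPPORTED(vap, va_mode);
 *
 * so the VFS layer can tell exactly which of the requested attributes the
 * filesystem honored.)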
- */ - if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { - if (VTOVFS(vp)->mnt_flag & MNT_RDONLY) { - result = EROFS; - goto Err_Exit; - }; - if ((result = synthfs_chown(vp, vap->va_uid, vap->va_gid, cred, p))) { - goto Err_Exit; - }; - } - if (vap->va_size != VNOVAL) { - /* - * Disallow write attempts on read-only file systems; - * unless the file is a socket, fifo, or a block or - * character device resident on the file system. - */ - switch (vp->v_type) { - case VDIR: - result = EISDIR; - goto Err_Exit; - case VLNK: - case VREG: - if (VTOVFS(vp)->mnt_flag & MNT_RDONLY) { - result = EROFS; - goto Err_Exit; - }; - break; - default: - break; - } + if (VATTR_IS_ACTIVE(vap, va_data_size)) { #if RWSUPPORT - if ((result = VOP_TRUNCATE(vp, vap->va_size, 0, cred, p))) { - goto Err_Exit; - }; + if ((result = vnode_setsize(vp, vap->va_data_size, 0, ap->a_context))) { + goto Err_Exit; + }; + VATTR_SET_SUPPORTED(vap, va_data_size); #else - result = EINVAL; - goto Err_Exit; + result = EINVAL; + goto Err_Exit; #endif - } + } - sp = VTOS(vp); - if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { - if (VTOVFS(vp)->mnt_flag & MNT_RDONLY) { - result = EROFS; - goto Err_Exit; - }; - if (cred->cr_uid != sp->s_uid && - (result = suser(cred, &p->p_acflag)) && - ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || - (result = VOP_ACCESS(vp, VWRITE, cred, p)))) { - goto Err_Exit; - }; - if (vap->va_atime.tv_sec != VNOVAL) - sp->s_nodeflags |= IN_ACCESS; - if (vap->va_mtime.tv_sec != VNOVAL) - sp->s_nodeflags |= IN_CHANGE | IN_UPDATE; - atimeval.tv_sec = vap->va_atime.tv_sec; - atimeval.tv_usec = vap->va_atime.tv_nsec / 1000; - mtimeval.tv_sec = vap->va_mtime.tv_sec; - mtimeval.tv_usec = vap->va_mtime.tv_nsec / 1000; - if ((result = VOP_UPDATE(vp, &atimeval, &mtimeval, 1))) { - goto Err_Exit; - }; - } + sp = VTOS(vp); + if (VATTR_IS_ACTIVE(vap, va_access_time) || VATTR_IS_ACTIVE(vap, va_modify_time)) { + if (VATTR_IS_ACTIVE(vap, va_access_time)) { + sp->s_nodeflags |= IN_ACCESS; + atimeval.tv_sec = vap->va_access_time.tv_sec; + atimeval.tv_usec = vap->va_access_time.tv_nsec / 1000; + } + if (VATTR_IS_ACTIVE(vap, va_modify_time)) { + sp->s_nodeflags |= IN_CHANGE | IN_UPDATE; + mtimeval.tv_sec = vap->va_modify_time.tv_sec; + mtimeval.tv_usec = vap->va_modify_time.tv_nsec / 1000; + } + if ((result = synthfs_update(vp, &atimeval, &mtimeval, 1))) { + goto Err_Exit; + } + } + VATTR_SET_SUPPORTED(vap, va_access_time); + VATTR_SET_SUPPORTED(vap, va_modify_time); - result = 0; - if (vap->va_mode != (mode_t)VNOVAL) { - if (VTOVFS(vp)->mnt_flag & MNT_RDONLY) { - result = EROFS; - goto Err_Exit; - }; - result = synthfs_chmod(vp, (int)vap->va_mode, cred, p); - }; + if (VATTR_IS_ACTIVE(vap, va_mode)) + result = synthfs_chmod(vp, (int)vap->va_mode, cred, p); + VATTR_SET_SUPPORTED(vap, va_mode); -Err_Exit: ; + Err_Exit: - DBG_VOP(("synthfs_setattr: returning %d...\n", result)); + DBG_VOP(("synthfs_setattr: returning %d...\n", result)); - return (result); + return (result); } @@ -675,7 +456,7 @@ Err_Exit: ; #% rename targetPar_vp L U U #% rename target_vp X U U # - vop_rename { + vnop_rename { IN WILLRELE struct vnode *sourcePar_vp; IN WILLRELE struct vnode *source_vp; IN struct componentname *source_cnp; @@ -700,13 +481,14 @@ Err_Exit: ; int synthfs_rename(ap) -struct vop_rename_args /* { +struct vnop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; + vfs_context_t a_context; } */ *ap; { 
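	/*
	 * The cross-device (EXDEV), immutable/append and write-access checks
	 * that previously opened this routine are gone; under the VNOP
	 * interface those are presumably performed generically by the VFS
	 * layer before VNOP_RENAME reaches the filesystem, along with the
	 * old lock/unlock choreography.
	 */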
struct vnode *target_vp = ap->a_tvp; @@ -715,7 +497,6 @@ struct vop_rename_args /* { struct vnode *sourcePar_vp = ap->a_fdvp; struct componentname *target_cnp = ap->a_tcnp; struct componentname *source_cnp = ap->a_fcnp; - struct proc *p = source_cnp->cn_proc; struct synthfsnode *target_sp, *targetPar_sp, *source_sp, *sourcePar_sp; u_short doingdirectory = 0, oldparent = 0, newparent = 0; int retval = 0; @@ -730,35 +511,10 @@ struct vop_rename_args /* { DBG_ASSERT((ap->a_fdvp->v_type == VDIR) && (ap->a_tdvp->v_type == VDIR)); target_sp = targetPar_sp = source_sp = sourcePar_sp = NULL; - /* - * Check for cross-device rename. - */ - if ((source_vp->v_mount != targetPar_vp->v_mount) || - (target_vp && (source_vp->v_mount != target_vp->v_mount))) { - retval = EXDEV; - goto abortit; - } - - /* - * Check for access permissions - */ - if (target_vp && ((VTOS(target_vp)->s_pflags & (IMMUTABLE | APPEND)) || - (VTOS(targetPar_vp)->s_pflags & APPEND))) { - retval = EPERM; - goto abortit; - } - - if ((retval = vn_lock(source_vp, LK_EXCLUSIVE, p))) - goto abortit; sourcePar_sp = VTOS(sourcePar_vp); source_sp = VTOS(source_vp); oldparent = sourcePar_sp->s_nodeid; - if ((source_sp->s_pflags & (IMMUTABLE | APPEND)) || (sourcePar_sp->s_pflags & APPEND)) { - VOP_UNLOCK(source_vp, 0, p); - retval = EPERM; - goto abortit; - } /* * Be sure we are not renaming ".", "..", or an alias of ".". This @@ -771,7 +527,6 @@ struct vop_rename_args /* { || sourcePar_sp == source_sp || (source_cnp->cn_flags & ISDOTDOT) || (source_sp->s_nodeflags & IN_RENAME)) { - VOP_UNLOCK(source_vp, 0, p); retval = EINVAL; goto abortit; } @@ -785,11 +540,6 @@ struct vop_rename_args /* { target_sp = target_vp ? VTOS(target_vp) : NULL; newparent = targetPar_sp->s_nodeid; - retval = VOP_ACCESS(source_vp, VWRITE, target_cnp->cn_cred, target_cnp->cn_proc); - if (doingdirectory && (newparent != oldparent)) { - if (retval) /* write access check above */ - goto bad; - } /* * If the destination exists, then be sure its type (file or dir) @@ -797,35 +547,15 @@ struct vop_rename_args /* { * it is empty. Then delete the destination. */ if (target_vp) { - /* - * If the parent directory is "sticky", then the user must - * own the parent directory, or the destination of the rename, - * otherwise the destination may not be changed (except by - * root). This implements append-only directories. - */ - if ((targetPar_sp->s_mode & S_ISTXT) && target_cnp->cn_cred->cr_uid != 0 && - target_cnp->cn_cred->cr_uid != targetPar_sp->s_uid && - target_sp->s_uid != target_cnp->cn_cred->cr_uid) { - retval = EPERM; - goto bad; - } - /* - * VOP_REMOVE will vput targetPar_vp so we better bump - * its ref count and relockit, always set target_vp to - * NULL afterwards to indicate that were done with it. 
- */ - VREF(targetPar_vp); #if RWSUPPORT - if (target_vp->v_type == VREG) { - (void) vnode_uncache(target_vp); - }; + if (target_vp->v_type == VREG) { + (void) vnode_uncache(target_vp); + }; #endif - cache_purge(target_vp); + cache_purge(target_vp); - target_cnp->cn_flags &= ~SAVENAME; - retval = VOP_REMOVE(targetPar_vp, target_vp, target_cnp); - (void) vn_lock(targetPar_vp, LK_EXCLUSIVE | LK_RETRY, p); + retval = synthfs_remove_internal(targetPar_vp, target_vp, target_cnp, ap->a_context); target_vp = NULL; target_sp = NULL; @@ -834,17 +564,11 @@ struct vop_rename_args /* { }; - if (newparent != oldparent) - vn_lock(sourcePar_vp, LK_EXCLUSIVE | LK_RETRY, p); - /* remove the existing entry from the namei cache: */ if (source_vp->v_type == VREG) cache_purge(source_vp); retval = synthfs_move_rename_entry( source_vp, targetPar_vp, target_cnp->cn_nameptr); - if (newparent != oldparent) - VOP_UNLOCK(sourcePar_vp, 0, p); - if (retval) goto bad; source_sp->s_nodeflags &= ~IN_RENAME; @@ -857,55 +581,21 @@ struct vop_rename_args /* { */ targetPar_sp->s_nodeflags |= IN_UPDATE; sourcePar_sp->s_nodeflags |= IN_UPDATE; - tv = time; + + microtime(&tv); SYNTHFSTIMES(targetPar_sp, &tv, &tv); SYNTHFSTIMES(sourcePar_sp, &tv, &tv); - vput(targetPar_vp); - vrele(sourcePar_vp); - vput(source_vp); - return (retval); bad:; if (retval && doingdirectory) source_sp->s_nodeflags &= ~IN_RENAME; - if (targetPar_vp == target_vp) - vrele(targetPar_vp); - else - vput(targetPar_vp); - - if (target_vp) - vput(target_vp); - - vrele(sourcePar_vp); - - if (VOP_ISLOCKED(source_vp)) - vput(source_vp); - else - vrele(source_vp); - - return (retval); + return (retval); abortit:; - - VOP_ABORTOP(targetPar_vp, target_cnp); /* XXX, why not in NFS? */ - - if (targetPar_vp == target_vp) - vrele(targetPar_vp); - else - vput(targetPar_vp); - - if (target_vp) - vput(target_vp); - - VOP_ABORTOP(sourcePar_vp, source_cnp); /* XXX, why not in NFS? */ - - vrele(sourcePar_vp); - vrele(source_vp); - - return (retval); + return (retval); } @@ -916,11 +606,12 @@ abortit:; #% mkdir dvp L U U #% mkdir vpp - L - # - vop_mkdir { + vnop_mkdir { IN WILLRELE struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; - IN struct vattr *vap; + IN struct vnode_attr *vap; + IN vfs_context_t context; We are responsible for freeing the namei buffer, it is done in synthfs_makenode(), unless there is a previous error. 
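/*
 * A conversion that recurs in the hunks below: the proc and credential now
 * come from a vfs_context instead of riding in the componentname or being
 * passed explicitly. The accessor pattern (sketch) is:
 *
 *	struct proc *p = vfs_context_proc(cnp->cn_context);
 *	kauth_cred_t cred = vfs_context_ucred(cnp->cn_context);
 */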
@@ -929,11 +620,12 @@ abortit:; int synthfs_mkdir(ap) -struct vop_mkdir_args /* { +struct vnop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; + struct vnode_attr *a_vap; + vfs_context_t a_context; } */ *ap; { int retval; @@ -942,22 +634,20 @@ struct vop_mkdir_args /* { int mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode); struct vnode *vp = NULL; - *ap->a_vpp = NULL; + *ap->a_vpp = NULL; - retval = synthfs_new_directory(VTOVFS(dvp), dvp, cnp->cn_nameptr, VTOSFS(dvp)->synthfs_nextid++, mode, ap->a_cnp->cn_proc, &vp); - if (retval) goto Error_Exit; + retval = synthfs_new_directory(VTOVFS(dvp), dvp, cnp->cn_nameptr, VTOSFS(dvp)->synthfs_nextid++, mode, vfs_context_proc(cnp->cn_context), &vp); + if (retval) goto Error_Exit; - retval = VOP_SETATTR(vp, ap->a_vap, cnp->cn_cred, cnp->cn_proc); - if (retval != 0) goto Error_Exit; + *ap->a_vpp = vp; - *ap->a_vpp = vp; + retval = vnode_setattr(vp, ap->a_vap, ap->a_context); + if (retval != 0) goto Error_Exit; -Error_Exit:; - if (retval != 0) { - if (vp) synthfs_remove_directory(vp); - VOP_ABORTOP(dvp, cnp); - } - vput(dvp); + Error_Exit:; + if (retval != 0) { + if (vp) synthfs_remove_directory(vp); + } return retval; } @@ -969,37 +659,39 @@ Error_Exit:; #% remove dvp L U U #% remove vp L U U # - vop_remove { + vnop_remove { IN WILLRELE struct vnode *dvp; IN WILLRELE struct vnode *vp; IN struct componentname *cnp; - + IN vfs_context_t context; + */ int synthfs_remove(ap) -struct vop_remove_args /* { +struct vnop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; + vfs_context_t a_context; } */ *ap; { - struct vnode *vp = ap->a_vp; - struct vnode *dvp = ap->a_dvp; + return synthfs_remove_internal(ap->a_dvp, ap->a_vp, ap->a_cnp, ap->a_context); +} + +static int +synthfs_remove_internal(struct vnode *dvp, struct vnode *vp, + __unused struct componentname *cnp, + __unused vfs_context_t context) +{ struct synthfsnode *sp = VTOS(vp); - struct timeval tv; + struct timeval tv; int retval = 0; - if ((sp->s_flags & (IMMUTABLE | APPEND)) || - (VTOS(dvp)->s_flags & APPEND)) { - retval = EPERM; - goto out; - }; - /* This is sort of silly right now but someday it may make sense... */ if (sp->s_nodeflags & IN_MODIFIED) { - tv = time; - VOP_UPDATE(vp, &tv, &tv, 0); + microtime(&tv); + synthfs_update(vp, &tv, &tv, 0); }; /* remove the entry from the namei cache: */ @@ -1028,13 +720,6 @@ out: if (! 
retval) VTOS(dvp)->s_nodeflags |= IN_CHANGE | IN_UPDATE; - if (dvp == vp) { - vrele(vp); - } else { - vput(vp); - }; - - vput(dvp); return (retval); } @@ -1044,23 +729,24 @@ out: #% rmdir dvp L U U #% rmdir vp L U U # - vop_rmdir { + vnop_rmdir { IN WILLRELE struct vnode *dvp; IN WILLRELE struct vnode *vp; IN struct componentname *cnp; + IN vfs_context_t context; */ int synthfs_rmdir(ap) - struct vop_rmdir_args /* { + struct vnop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; + vfs_context_t a_context; } */ *ap; { - DBG_VOP(("synthfs_rmdir called\n")); - return synthfs_remove((struct vop_remove_args *)ap); + return synthfs_remove((struct vnop_remove_args *)ap); } @@ -1071,15 +757,15 @@ synthfs_rmdir(ap) * Locking policy: ignore */ int -synthfs_select(ap) -struct vop_select_args /* { +synthfs_select(__unused +struct vnop_select_args /* { struct vnode *a_vp; int a_which; int a_fflags; - struct ucred *a_cred; + kauth_cred_t a_cred; void *a_wql; struct proc *a_p; -} */ *ap; +} */ *ap) { DBG_VOP(("synthfs_select called\n")); @@ -1091,15 +777,15 @@ struct vop_select_args /* { #% symlink dvp L U U #% symlink vpp - U - # -# XXX - note that the return vnode has already been vrele'ed -# by the filesystem layer. To use it you must use vget, +# XXX - note that the return vnode has already been vnode_put'ed +# by the filesystem layer. To use it you must use vnode_get, # possibly with a further namei. # - vop_symlink { + vnop_symlink { IN WILLRELE struct vnode *dvp; OUT WILLRELE struct vnode **vpp; IN struct componentname *cnp; - IN struct vattr *vap; + IN struct vnode_attr *vap; IN char *target; We are responsible for freeing the namei buffer, it is done in synthfs_makenode(), unless there is @@ -1110,12 +796,13 @@ struct vop_select_args /* { int synthfs_symlink(ap) - struct vop_symlink_args /* { + struct vnop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; + struct vnode_attr *a_vap; char *a_target; + vfs_context_t a_context; } */ *ap; { struct vnode *dvp = ap->a_dvp; @@ -1125,17 +812,7 @@ synthfs_symlink(ap) *vpp = NULL; - retval = synthfs_new_symlink(VTOVFS(dvp), dvp, cnp->cn_nameptr, VTOSFS(dvp)->synthfs_nextid++, ap->a_target, ap->a_cnp->cn_proc, vpp); - if (retval) goto Error_Exit; - - VOP_UNLOCK(*vpp, 0, cnp->cn_proc); - -Error_Exit:; - - if (retval != 0) { - VOP_ABORTOP(dvp, cnp); - } - vput(dvp); + retval = synthfs_new_symlink(VTOVFS(dvp), dvp, cnp->cn_nameptr, VTOSFS(dvp)->synthfs_nextid++, ap->a_target, vfs_context_proc(cnp->cn_context), vpp); return (retval); } @@ -1146,18 +823,18 @@ Error_Exit:; # #% readlink vp L L L # - vop_readlink { + vnop_readlink { IN struct vnode *vp; INOUT struct uio *uio; - IN struct ucred *cred; + IN kauth_cred_t cred; */ int synthfs_readlink(ap) -struct vop_readlink_args /* { +struct vnop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { struct vnode *vp = ap->a_vp; @@ -1170,8 +847,9 @@ struct vop_readlink_args /* { return 0; }; - if (uio->uio_offset + uio->uio_resid <= sp->s_u.s.s_length) { - count = uio->uio_resid; + // LP64todo - fix this! 
+ if (uio->uio_offset + uio_resid(uio) <= sp->s_u.s.s_length) { + count = uio_resid(uio); } else { count = sp->s_u.s.s_length - uio->uio_offset; }; @@ -1186,27 +864,17 @@ struct vop_readlink_args /* { /* -#% readdir vp L L L -# -vop_readdir { - IN struct vnode *vp; - INOUT struct uio *uio; - IN struct ucred *cred; - INOUT int *eofflag; - OUT int *ncookies; - INOUT u_long **cookies; -*/ - - + * Read directory entries. + */ int synthfs_readdir(ap) -struct vop_readdir_args /* { - struct vnode *vp; - struct uio *uio; - struct ucred *cred; - int *eofflag; - int *ncookies; - u_long **cookies; +struct vnop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_flags; + int *a_eofflag; + int *a_numdirent; + vfs_context_t a_context; } */ *ap; { struct synthfsnode *sp = VTOS(ap->a_vp); @@ -1214,34 +882,30 @@ struct vop_readdir_args /* { off_t diroffset; /* Offset into simulated directory file */ struct synthfsnode *entry; - DBG_VOP(("\tuio_offset = %d, uio_resid = %d\n", (int) uio->uio_offset, uio->uio_resid)); + DBG_VOP(("\tuio_offset = %d, uio_resid = %lld\n", (int) uio->uio_offset, uio_resid(uio))); + + if (ap->a_flags & (VNODE_READDIR_EXTENDED | VNODE_READDIR_REQSEEKOFF)) + return (EINVAL); /* We assume it's all one big buffer... */ if (uio->uio_iovcnt > 1) { DBG_VOP(("\tuio->uio_iovcnt = %d?\n", uio->uio_iovcnt)); return EINVAL; }; - - /* - NFS cookies are not supported: - */ - if ((ap->a_cookies != NULL) || (ap->a_ncookies != NULL)) { - return EINVAL; - }; diroffset = 0; /* * We must synthesize . and .. */ - DBG_VOP(("\tstarting ... uio_offset = %d, uio_resid = %d\n", (int) uio->uio_offset, uio->uio_resid)); + DBG_VOP(("\tstarting ... uio_offset = %d, uio_resid = %lld\n", (int) uio->uio_offset, uio_resid(uio))); if (uio->uio_offset == diroffset) { DBG_VOP(("\tAdding .\n")); diroffset += synthfs_adddirentry(sp->s_nodeid, DT_DIR, ".", uio); - DBG_VOP(("\t after adding ., uio_offset = %d, uio_resid = %d\n", (int) uio->uio_offset, uio->uio_resid)); + DBG_VOP(("\t after adding ., uio_offset = %d, uio_resid = %lld\n", (int) uio->uio_offset, uio_resid(uio))); } - if ((uio->uio_resid > 0) && (diroffset > uio->uio_offset)) { + if ((uio_resid(uio) > 0) && (diroffset > uio->uio_offset)) { /* Oops - we skipped over a partial entry: at best, diroffset should've just matched uio->uio_offset */ return EINVAL; }; @@ -1254,9 +918,9 @@ struct vop_readdir_args /* { } else { diroffset += synthfs_adddirentry(sp->s_nodeid, DT_DIR, "..", uio); } - DBG_VOP(("\t after adding .., uio_offset = %d, uio_resid = %d\n", (int) uio->uio_offset, uio->uio_resid)); + DBG_VOP(("\t after adding .., uio_offset = %d, uio_resid = %lld\n", (int) uio->uio_offset, uio_resid(uio))); } - if ((uio->uio_resid > 0) && (diroffset > uio->uio_offset)) { + if ((uio_resid(uio) > 0) && (diroffset > uio->uio_offset)) { /* Oops - we skipped over a partial entry: at best, diroffset should've just matched uio->uio_offset */ return EINVAL; }; @@ -1267,7 +931,7 @@ struct vop_readdir_args /* { /* Return this entry */ diroffset += synthfs_adddirentry(entry->s_nodeid, VTTOIF(STOV(entry)->v_type), entry->s_name, uio); }; - if ((uio->uio_resid > 0) && (diroffset > uio->uio_offset)) { + if ((uio_resid(uio) > 0) && (diroffset > uio->uio_offset)) { /* Oops - we skipped over a partial entry: at best, diroffset should've just matched uio->uio_offset */ return EINVAL; }; @@ -1290,7 +954,7 @@ struct vop_readdir_args /* { int synthfs_cached_lookup(ap) - struct vop_cachedlookup_args /* { + struct vnop_lookup_args /* { struct vnode *a_dvp; struct vnode 
**a_vpp; struct componentname *a_cnp; @@ -1300,35 +964,16 @@ synthfs_cached_lookup(ap) struct componentname *cnp = ap->a_cnp; u_long nameiop = cnp->cn_nameiop; u_long flags = cnp->cn_flags; - boolean_t lockparent = (flags & LOCKPARENT); - struct proc *p = cnp->cn_proc; - struct ucred *cred = cnp->cn_cred; - struct vnode *target_vp = NULL; - u_int32_t target_vnode_id; /* Capability ID of the target vnode for .. unlock/relock handling check */ struct vnode **vpp = ap->a_vpp; int result = 0; DBG_VOP(("synthfs_cached_lookup called, name = %s, namelen = %ld\n", ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen)); - if (flags & LOCKPARENT) DBG_VOP(("\tLOCKPARENT is set\n")); +#if DEBUG if (flags & ISLASTCN) DBG_VOP(("\tISLASTCN is set\n")); +#endif *vpp = NULL; - if (dp->v_type != VDIR) { - result = ENOTDIR; - goto Err_Exit; - }; - - if ((flags & ISLASTCN) && - (VTOVFS(dp)->mnt_flag & MNT_RDONLY) && - ((nameiop == DELETE) || (nameiop == RENAME))) { - result = EROFS; - goto Err_Exit; - }; - - result = VOP_ACCESS(dp, VEXEC, cred, cnp->cn_proc); - if (result != 0) goto Err_Exit; - /* * Look up an entry in the namei cache */ @@ -1344,66 +989,21 @@ synthfs_cached_lookup(ap) /* An entry matching the parent vnode/name was found in the cache: */ - - target_vp = *vpp; - target_vnode_id = target_vp->v_id; - if (target_vp == dp) { - /* lookup on "." */ - VREF(target_vp); - result = 0; - } else if (flags & ISDOTDOT) { - /* - * Carefully now: trying to step from child to parent; - * must release lock on child before trying to lock parent - * vnode. - */ - VOP_UNLOCK(dp, 0, p); - result = vget(target_vp, LK_EXCLUSIVE, p); - if ((result == 0) && lockparent && (flags & ISLASTCN)) { - result = vn_lock(dp, LK_EXCLUSIVE, p); - } - } else { - result = vget(target_vp, LK_EXCLUSIVE, p); - if (!lockparent || (result != 0) || !(flags & ISLASTCN)) { - VOP_UNLOCK(dp, 0, p); - }; - }; - - /* - Check to make sure the target vnode ID didn't change while we - tried to lock it: - */ - if (result == 0) { - if (target_vnode_id == target_vp->v_id) { - return 0; /* THIS IS THE NORMAL EXIT PATH */ - }; - - /* The vnode ID didn't match anymore: we've got another vnode! 
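 *
 * (All of this hand-rolled capability-id revalidation disappears: under
 * the new VFS the name cache is assumed to return a vnode on which an
 * iocount has already been taken, so the v_id recheck lives inside
 * cache_lookup() rather than in each filesystem.)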
*/ - vput(target_vp); - /* Unlock the parent vnode in the cases where it should've been left locked: */ - if (lockparent && (dp != target_vp) && (flags & ISLASTCN)) { - VOP_UNLOCK(dp, 0, p); - }; - }; - - /* One last try for a successful lookup through the complete lookup path: */ - result = vn_lock(dp, LK_EXCLUSIVE, p); - if (result == 0) { - return synthfs_lookup(ap); - }; + return (0); Err_Exit:; - return result; + return result; } int synthfs_lookup(ap) - struct vop_cachedlookup_args /* { + struct vnop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; + vfs_context_t a_context; } */ *ap; { struct vnode *dp = ap->a_dvp; @@ -1413,8 +1013,9 @@ synthfs_lookup(ap) // char *nameptr = cnp->cn_nameptr; u_long flags = cnp->cn_flags; long namelen = cnp->cn_namelen; - struct proc *p = cnp->cn_proc; - struct ucred *cred = cnp->cn_cred; +// struct proc *p = cnp->cn_proc; + vfs_context_t ctx = cnp->cn_context; + kauth_cred_t cred = vfs_context_ucred(ctx); struct synthfsnode *entry; struct vnode *target_vp = NULL; int result = 0; @@ -1424,26 +1025,13 @@ synthfs_lookup(ap) struct vnode *starting_parent = dp; DBG_VOP(("synthfs_lookup called, name = %s, namelen = %ld\n", ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen)); +#if DEBUG if (flags & LOCKPARENT) DBG_VOP(("\tLOCKPARENT is set\n")); if (flags & ISLASTCN) DBG_VOP(("\tISLASTCN is set\n")); +#endif *ap->a_vpp = NULL; - if (dp->v_type != VDIR) { - result = ENOTDIR; - goto Err_Exit; - }; - - if ((flags & ISLASTCN) && - (VTOVFS(dp)->mnt_flag & MNT_RDONLY) && - ((nameiop == DELETE) || (nameiop == RENAME))) { - result = EROFS; - goto Err_Exit; - }; - - result = VOP_ACCESS(dp, VEXEC, cred, cnp->cn_proc); - if (result != 0) goto Err_Exit; - /* first check for "." and ".." */ if (cnp->cn_nameptr[0] == '.') { if (namelen == 1) { @@ -1454,7 +1042,7 @@ synthfs_lookup(ap) found = TRUE; target_vp = dp; - VREF(target_vp); + vnode_get(target_vp); result = 0; @@ -1472,18 +1060,10 @@ synthfs_lookup(ap) * Special case for ".." to prevent deadlock: * always release the parent vnode BEFORE trying to acquire * ITS parent. This avoids deadlocking with another lookup - * starting from the target_vp trying to vget() this directory. + * starting from the target_vp trying to vnode_get() this directory. */ - VOP_UNLOCK(dp, 0, p); - result = vget(target_vp, LK_EXCLUSIVE | LK_RETRY, p); - if (result != 0) { - vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p); - goto Err_Exit; - } - if ((flags & LOCKPARENT) && (flags & ISLASTCN)) { - result = vn_lock(dp, LK_EXCLUSIVE, p); - // vput(target_vp); /* XXX WHY WAS THIS HERE? */ - } + result = vnode_get(target_vp); + } else { target_vp = dp; /* dp is alread locked and ref'ed */ @@ -1501,9 +1081,8 @@ synthfs_lookup(ap) (*(entry->s_name + namelen) == (char)0)) { found = TRUE; target_vp = STOV(entry); - result = vget(target_vp, LK_EXCLUSIVE | LK_RETRY, p); /* vget is not really needed because refcount is always > 0... */ + result = vnode_getwithref(target_vp); /* refcount is always > 0 for any vnode in this list... 
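 *
 * vnode_getwithref() merely takes an iocount on a vnode already known to
 * be referenced; contrast vnode_getwithvid() in synthfs_vget(), which must
 * additionally guard against the vnode being recycled in the meantime.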
*/ if (result != 0) { - vrele(target_vp); goto Err_Exit; }; @@ -1517,11 +1096,6 @@ synthfs_lookup(ap) Std_Exit:; if (found) { if ((nameiop == DELETE) && (flags & ISLASTCN)) { - /* - * Deleting entries requires write access: - */ - result = VOP_ACCESS(dp, VWRITE, cred, p); - if (result != 0) goto Err_Exit; /* * If the parent directory is "sticky" then the user must own * the directory, or the file in it, to delete it, unless the * append-only directories */ if ((dsp->s_mode & S_ISVTX) && - (cred->cr_uid != 0) && - (cred->cr_uid != dsp->s_uid) && + suser(cred, NULL) && + (kauth_cred_getuid(cred) != dsp->s_uid) && (target_vp != NULL) && (target_vp->v_type != VLNK) && - (VTOS(target_vp)->s_uid != cred->cr_uid)) { - vput(target_vp); + (VTOS(target_vp)->s_uid != kauth_cred_getuid(cred))) { + vnode_put(target_vp); result = EPERM; goto Err_Exit; }; }; if ((nameiop == RENAME) && (flags & WANTPARENT) && (flags & ISLASTCN)) { - result = VOP_ACCESS(dp, VWRITE, cred, p); - if (result != 0) goto Err_Exit; if (isDot) { - vrele(target_vp); + vnode_put(target_vp); result = EISDIR; goto Err_Exit; }; @@ -1559,43 +1131,25 @@ Std_Exit:; ((nameiop == CREATE) || (nameiop == RENAME) || ((nameiop == DELETE) && (flags & DOWHITEOUT) && (flags & ISWHITEOUT)))) { - /* Write access is required to create entries in the directory: */ - result = VOP_ACCESS(dp, VWRITE, cred, p); - if (result != 0) goto Err_Exit; - - cnp->cn_flags |= SAVENAME; - + /* create a new entry */ result = EJUSTRETURN; } }; - /* XXX PPD Should we do something special in case LOCKLEAF isn't set? */ - if (found && !isDot && !isDotDot && (!(flags & LOCKPARENT) || !(flags & ISLASTCN))) { - VOP_UNLOCK(dp, 0, p); - }; - *ap->a_vpp = target_vp; Err_Exit:; DBG_VOP(("synthfs_lookup: result = %d.\n", result)); if (found) { if (target_vp) { - if (VOP_ISLOCKED(target_vp)) { - DBG_VOP(("synthfs_lookup: target_vp = 0x%08X (locked).\n", (u_long)target_vp)); - } else { - DBG_VOP(("synthfs_lookup: target_vp = 0x%08X (NOT locked?).\n", (u_long)target_vp)); - }; + DBG_VOP(("synthfs_lookup: target_vp = 0x%08X \n", (u_long)target_vp)); } else { DBG_VOP(("synthfs_lookup: found = true but target_vp = NULL?\n")); }; } else { DBG_VOP(("synthfs_lookup: target not found.\n")); }; - if (VOP_ISLOCKED(starting_parent)) { - DBG_VOP(("synthfs_lookup: dp = %08X; starting_parent = 0x%08X (LOCKED).\n", (u_long)dp, (u_long)starting_parent)); - } else { - DBG_VOP(("synthfs_lookup: dp = %08X; starting_parent = 0x%08X (UNLOCKED).\n", (u_long)dp, (u_long)starting_parent)); - }; + DBG_VOP(("synthfs_lookup: dp = %08X; starting_parent = 0x%08X .\n", (u_long)dp, (u_long)starting_parent)); return result; } @@ -1606,17 +1160,18 @@ Err_Exit:; #% pathconf vp L L L # - vop_pathconf { + vnop_pathconf { IN struct vnode *vp; IN int name; OUT register_t *retval; */ int synthfs_pathconf(ap) -struct vop_pathconf_args /* { +struct vnop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; + vfs_context_t a_context; } */ *ap; { DBG_VOP(("synthfs_pathconf called\n")); @@ -1657,40 +1212,29 @@ struct vop_pathconf_args /* { * time. If waitfor is set, then wait for the disk write of the node to * complete.
*/ -/* -#% update vp L L L - IN struct vnode *vp; - IN struct timeval *access; - IN struct timeval *modify; - IN int waitfor; -*/ int -synthfs_update(ap) - struct vop_update_args /* { - struct vnode *a_vp; - struct timeval *a_access; - struct timeval *a_modify; - int a_waitfor; - } */ *ap; +synthfs_update(struct vnode *vp, struct timeval *access, struct timeval *modify, __unused int waitfor) { - struct vnode *vp = ap->a_vp; struct synthfsnode *sp = VTOS(vp); + struct timeval tv; DBG_ASSERT(sp != NULL); - DBG_ASSERT(*((int*)&vp->v_interlock) == 0); if (((sp->s_nodeflags & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) != 0) && - !(VTOVFS(ap->a_vp)->mnt_flag & MNT_RDONLY)) { - if (sp->s_nodeflags & IN_ACCESS) sp->s_accesstime = *ap->a_access; - if (sp->s_nodeflags & IN_UPDATE) sp->s_modificationtime = *ap->a_modify; - if (sp->s_nodeflags & IN_CHANGE) sp->s_changetime = time; + !(VTOVFS(vp)->mnt_flag & MNT_RDONLY)) { + if (sp->s_nodeflags & IN_ACCESS) sp->s_accesstime = *access; + if (sp->s_nodeflags & IN_UPDATE) sp->s_modificationtime = *modify; + if (sp->s_nodeflags & IN_CHANGE) { + + microtime(&tv); + sp->s_changetime = tv; + } }; /* After the updates are finished, clear the flags */ sp->s_nodeflags &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); -// DBG_ASSERT(*((int*)&ap->a_vp->v_interlock) == 0); return 0; } @@ -1703,71 +1247,11 @@ synthfs_update(ap) ******************************************************************************************/ -/* -#% lock vp U L U -# - vop_lock { - IN struct vnode *vp; - IN int flags; - IN struct proc *p; -*/ - -int -synthfs_lock(ap) -struct vop_lock_args /* { - struct vnode *a_vp; - int a_flags; - struct proc *a_p; -} */ *ap; -{ - return lockmgr(&VTOS(ap->a_vp)->s_lock, ap->a_flags, &ap->a_vp->v_interlock, ap->a_p); -} - -/* - * Unlock an synthfsnode. -#% unlock vp L U L -# - vop_unlock { - IN struct vnode *vp; - IN int flags; - IN struct proc *p; - - */ -int -synthfs_unlock(ap) -struct vop_unlock_args /* { - struct vnode *a_vp; - int a_flags; - struct proc *a_p; -} */ *ap; -{ - return lockmgr(&VTOS(ap->a_vp)->s_lock, ap->a_flags | LK_RELEASE, &ap->a_vp->v_interlock, ap->a_p); -} - -/* - * Check for a locked synthfsnode. -#% islocked vp = = = -# - vop_islocked { - IN struct vnode *vp; - - */ -int -synthfs_islocked(ap) -struct vop_islocked_args /* { - struct vnode *a_vp; -} */ *ap; -{ - return lockstatus(&VTOS(ap->a_vp)->s_lock); -} - - - /* # #% inactive vp L U U # - vop_inactive { + vnop_inactive { IN struct vnode *vp; IN struct proc *p; @@ -1775,18 +1259,19 @@ struct vop_islocked_args /* { int synthfs_inactive(ap) -struct vop_inactive_args /* { +struct vnop_inactive_args /* { struct vnode *a_vp; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { struct vnode *vp = ap->a_vp; - struct proc *p = ap->a_p; struct synthfsnode *sp = VTOS(vp); struct timeval tv; +#if DEBUG if (vp->v_usecount != 0) DBG_VOP(("synthfs_inactive: bad usecount = %d\n", vp->v_usecount )); +#endif /* * Ignore nodes related to stale file handles. @@ -1796,18 +1281,17 @@ struct vop_inactive_args /* { /* This is sort of silly but might make sense in the future: */ if (sp->s_nodeflags & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) { - tv = time; - VOP_UPDATE(vp, &tv, &tv, 0); + microtime(&tv); + synthfs_update(vp, &tv, &tv, 0); } out: - VOP_UNLOCK(vp, 0, p); /* * If we are done with the inode, reclaim it * so that it can be reused immediately. 
*/ if (vp->v_type == VNON) { - vrecycle(vp, (struct slock *)0, p); + vnode_recycle(vp); }; return 0; @@ -1822,7 +1306,7 @@ out: */ int synthfs_reclaim(ap) - struct vop_reclaim_args /* { struct vnode *a_vp; struct proc *a_p; } */ *ap; + struct vnop_reclaim_args /* { struct vnode *a_vp; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct synthfsnode *sp = VTOS(vp); diff --git a/bsd/miscfs/union/union.h b/bsd/miscfs/union/union.h index 147df4c7f..475a6f7dd 100644 --- a/bsd/miscfs/union/union.h +++ b/bsd/miscfs/union/union.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -62,6 +62,7 @@ #define __UNION_UNION_H__ #include <sys/appleapiopts.h> +#include <sys/cdefs.h> #ifdef __APPLE_API_PRIVATE struct union_args { @@ -83,6 +84,24 @@ struct union_mount { }; #ifdef KERNEL +/* LP64 version of union_args. all pointers + * grow when we're dealing with a 64-bit process. + * WARNING - keep in sync with union_args + */ +/* LP64todo - should this move? */ + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_union_args { + user_addr_t target; /* Target of loopback */ + int mntflags; /* Options on the mount */ +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif /* * DEFDIRMODE is the mode bits used to create a shadow directory. @@ -120,29 +139,26 @@ struct union_node { #define UN_KLOCK 0x08 /* Keep upper node locked on vput */ #define UN_CACHED 0x10 /* In union cache */ -extern int union_allocvp __P((struct vnode **, struct mount *, +extern int union_allocvp(struct vnode **, struct mount *, struct vnode *, struct vnode *, struct componentname *, struct vnode *, - struct vnode *, int)); -extern int union_copyfile __P((struct vnode *, struct vnode *, - struct ucred *, struct proc *)); -extern int union_copyup __P((struct union_node *, int, struct ucred *, - struct proc *)); -extern int union_dowhiteout __P((struct union_node *, struct ucred *, - struct proc *)); -extern int union_mkshadow __P((struct union_mount *, struct vnode *, - struct componentname *, struct vnode **)); -extern int union_mkwhiteout __P((struct union_mount *, struct vnode *, - struct componentname *, char *)); -extern int union_vn_create __P((struct vnode **, struct union_node *, - struct proc *)); -extern int union_cn_close __P((struct vnode *, int, struct ucred *, - struct proc *)); -extern void union_removed_upper __P((struct union_node *un)); -extern struct vnode *union_lowervp __P((struct vnode *)); -extern void union_newlower __P((struct union_node *, struct vnode *)); -extern void union_newupper __P((struct union_node *, struct vnode *)); -extern void union_newsize __P((struct vnode *, off_t, off_t)); + struct vnode *, int); +extern int union_copyfile(struct vnode *, struct vnode *, + struct ucred *, struct proc *); +extern int union_copyup(struct union_node *, int, struct ucred *, + struct proc *); +extern int union_dowhiteout(struct union_node *, vfs_context_t); +extern int union_mkshadow(struct union_mount *, struct vnode *, + struct componentname *, struct vnode **); +extern int union_mkwhiteout(struct union_mount *, struct vnode *, + struct componentname *, char *); +extern int union_vn_create(struct vnode **, struct union_node *, struct proc *); +extern int union_cn_close(struct vnode *, int, struct ucred *, struct proc *); +extern void union_removed_upper(struct union_node *un); +extern struct vnode 
*union_lowervp(struct vnode *); +extern void union_newlower(struct union_node *, struct vnode *); +extern void union_newupper(struct union_node *, struct vnode *); +extern void union_newsize(struct vnode *, off_t, off_t); #define MOUNTTOUNIONMOUNT(mp) ((struct union_mount *)((mp)->mnt_data)) #define VTOUNION(vp) ((struct union_node *)(vp)->v_data) diff --git a/bsd/miscfs/union/union_subr.c b/bsd/miscfs/union/union_subr.c index 8f5ce12ac..6aa953ee6 100644 --- a/bsd/miscfs/union/union_subr.c +++ b/bsd/miscfs/union/union_subr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -61,18 +61,20 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/time.h> #include <sys/kernel.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/namei.h> #include <sys/malloc.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/queue.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> #include <sys/stat.h> #include <sys/ubc.h> +#include <sys/uio_internal.h> #include <miscfs/union/union.h> #if DIAGNOSTIC @@ -137,7 +139,7 @@ union_updatevp(un, uppervp, lowervp) int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp); int nhash = UNION_HASH(uppervp, lowervp); int docache = (lowervp != NULLVP || uppervp != NULLVP); - int lhash, hhash, uhash; + int lhash, uhash; /* * Ensure locking is ordered from lower to higher @@ -170,13 +172,13 @@ union_updatevp(un, uppervp, lowervp) if (un->un_lowervp != lowervp) { if (un->un_lowervp) { - vrele(un->un_lowervp); + vnode_put(un->un_lowervp); if (un->un_path) { _FREE(un->un_path, M_TEMP); un->un_path = 0; } if (un->un_dirvp) { - vrele(un->un_dirvp); + vnode_put(un->un_dirvp); un->un_dirvp = NULLVP; } } @@ -186,7 +188,7 @@ union_updatevp(un, uppervp, lowervp) if (un->un_uppervp != uppervp) { if (un->un_uppervp) - vrele(un->un_uppervp); + vnode_put(un->un_uppervp); un->un_uppervp = uppervp; un->un_uppersz = VNOVAL; @@ -255,8 +257,7 @@ union_newsize(vp, uppersz, lowersz) printf("union: %s size now %ld\n", uppersz != VNOVAL ? "upper" : "lower", (long) sz); #endif - if (UBCISVALID(vp)) - ubc_setsize(vp, sz); /* XXX check error */ + ubc_setsize(vp, sz); } } @@ -272,7 +273,7 @@ union_newsize(vp, uppersz, lowersz) * being mapped. either, but not both, can be nil. * if supplied, (uppervp) is locked. * the reference is either maintained in the new union_node - * object which is allocated, or they are vrele'd. + * object which is allocated, or they are vnode_put'd. * * all union_nodes are maintained on a singly-linked * list. new nodes are only allocated when they cannot @@ -286,7 +287,7 @@ union_newsize(vp, uppersz, lowersz) * vnode. this lock is only taken if we are going to * call getnewvnode, since the kernel itself is single-threaded. * - * if an entry is found on the list, then call vget() to + * if an entry is found on the list, then call vnode_get() to * take a reference. this is done because there may be * zero references to it and so it needs to be removed from * the vnode free list.
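A recurring pattern in the hunks above is the migration from the old BSD reference counting (VREF()/vget() to take references, vrele()/vput() to release them) to the new exported iocount KPI. The following is a minimal sketch of that idiom, assuming only the vnode_get()/vnode_put() calls already used throughout this patch; the helper name use_cached_vnode is hypothetical, not part of the source:

#include <sys/vnode.h>

/*
 * Hypothetical helper: revive a vnode found in a cache under the new KPI.
 * vnode_get() takes an iocount and fails if the vnode is being reclaimed;
 * vnode_put() drops the iocount again, after which the vnode may be
 * reclaimed or reused.
 */
static int
use_cached_vnode(vnode_t vp)
{
	int error;

	error = vnode_get(vp);		/* take an iocount */
	if (error)
		return (error);		/* vnode went away; caller should retry the lookup */

	/* ... operate on vp while the iocount pins it ... */

	vnode_put(vp);			/* drop the iocount */
	return (0);
}

Unlike the old vget()/vrele() pair, no vnode lock is taken or released here; the iocount alone keeps the vnode from being reclaimed, which is why the lock-juggling code removed in these hunks has no replacement.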
@@ -308,9 +309,11 @@ union_allocvp(vpp, mp, undvp, dvp, cnp, uppervp, lowervp, docache) struct vnode *xlowervp = NULLVP; struct union_mount *um = MOUNTTOUNIONMOUNT(mp); int hash; - int vflag; + int markroot; int try; struct union_node *unp; + struct vnode_fsparam vfsp; + enum vtype vtype; if (uppervp == NULLVP && lowervp == NULLVP) panic("union: unidentifiable allocation"); @@ -321,15 +324,15 @@ union_allocvp(vpp, mp, undvp, dvp, cnp, uppervp, lowervp, docache) } /* detect the root vnode (and aliases) */ - vflag = 0; + markroot = 0; if ((uppervp == um->um_uppervp) && ((lowervp == NULLVP) || lowervp == um->um_lowervp)) { if (lowervp == NULLVP) { lowervp = um->um_lowervp; if (lowervp != NULLVP) - VREF(lowervp); + vnode_get(lowervp); } - vflag = VROOT; + markroot = VROOT; } loop: @@ -366,8 +369,7 @@ loop: (un->un_uppervp == uppervp || un->un_uppervp == NULLVP) && (UNIONTOV(un)->v_mount == mp)) { - if (vget(UNIONTOV(un), 0, - cnp ? cnp->cn_proc : NULL)) { + if (vnode_get(UNIONTOV(un))) { union_list_unlock(hash); goto loop; } @@ -387,7 +389,7 @@ loop: * uppervp is locked, though un->un_uppervp * may not be. this doesn't break the locking * hierarchy since in the case that un->un_uppervp - * is not yet locked it will be vrele'd and replaced + * is not yet locked it will be vnode_put'd and replaced * with uppervp. */ @@ -407,7 +409,7 @@ loop: #endif } else { if (un->un_flags & UN_LOCKED) { - vrele(UNIONTOV(un)); + vnode_put(UNIONTOV(un)); un->un_flags |= UN_WANT; sleep((caddr_t) &un->un_flags, PINOD); goto loop; @@ -434,7 +436,7 @@ loop: if (uppervp != un->un_uppervp) { union_newupper(un, uppervp); } else if (uppervp) { - vrele(uppervp); + vnode_put(uppervp); } if (un->un_uppervp) { @@ -457,11 +459,11 @@ loop: bcopy(cnp->cn_nameptr, un->un_path, cnp->cn_namelen); un->un_path[cnp->cn_namelen] = '\0'; - VREF(dvp); + vnode_get(dvp); un->un_dirvp = dvp; } } else if (lowervp) { - vrele(lowervp); + vnode_put(lowervp); } *vpp = UNIONTOV(un); return (0); @@ -479,31 +481,38 @@ loop: } MALLOC(unp, void *, sizeof(struct union_node), M_TEMP, M_WAITOK); - error = getnewvnode(VT_UNION, mp, union_vnodeop_p, vpp); + + if (uppervp) + vtype = uppervp->v_type; + else + vtype = lowervp->v_type; + //bzero(&vfsp, sizeof(struct vnode_fsparam)); + vfsp.vnfs_mp = mp; + vfsp.vnfs_vtype = vtype; + vfsp.vnfs_str = "unionfs"; + vfsp.vnfs_dvp = dvp; + vfsp.vnfs_fsnode = unp; + vfsp.vnfs_cnp = cnp; + vfsp.vnfs_vops = union_vnodeop_p; + vfsp.vnfs_rdev = 0; + vfsp.vnfs_filesize = 0; + vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE; + vfsp.vnfs_marksystem = 0; + vfsp.vnfs_markroot = markroot; + + error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, vpp); if (error) { FREE(unp, M_TEMP); if (uppervp) { - if (dvp == uppervp) - vrele(uppervp); - else - vput(uppervp); + vnode_put(uppervp); } if (lowervp) - vrele(lowervp); + vnode_put(lowervp); goto out; } - (*vpp)->v_data = unp; - (*vpp)->v_flag |= vflag; - if (uppervp) - (*vpp)->v_type = uppervp->v_type; - else - (*vpp)->v_type = lowervp->v_type; - - if ((*vpp)->v_type == VREG) - ubc_info_init(*vpp); - + (*vpp)->v_tag = VT_UNION; un = VTOUNION(*vpp); un->un_vnode = *vpp; un->un_uppervp = uppervp; @@ -512,7 +521,7 @@ loop: un->un_lowersz = VNOVAL; un->un_pvp = undvp; if (undvp != NULLVP) - VREF(undvp); + vnode_get(undvp); un->un_dircache = 0; un->un_openl = 0; un->un_flags = UN_LOCKED; @@ -529,7 +538,7 @@ loop: un->un_path = _MALLOC(cnp->cn_namelen+1, M_TEMP, M_WAITOK); bcopy(cnp->cn_nameptr, un->un_path, cnp->cn_namelen); un->un_path[cnp->cn_namelen] = '\0'; - VREF(dvp); + 
vnode_get(dvp); un->un_dirvp = dvp; } else { un->un_hash = 0; @@ -543,7 +552,7 @@ loop: } if (xlowervp) - vrele(xlowervp); + vnode_put(xlowervp); out: if (docache) @@ -564,13 +573,13 @@ union_freevp(vp) } if (un->un_pvp != NULLVP) - vrele(un->un_pvp); + vnode_put(un->un_pvp); if (un->un_uppervp != NULLVP) - vrele(un->un_uppervp); + vnode_put(un->un_uppervp); if (un->un_lowervp != NULLVP) - vrele(un->un_lowervp); + vnode_put(un->un_lowervp); if (un->un_dirvp != NULLVP) - vrele(un->un_dirvp); + vnode_put(un->un_dirvp); if (un->un_path) _FREE(un->un_path, M_TEMP); @@ -586,15 +595,13 @@ union_freevp(vp) * and (tvp) are locked on entry and exit. */ int -union_copyfile(fvp, tvp, cred, p) - struct vnode *fvp; - struct vnode *tvp; - struct ucred *cred; - struct proc *p; +union_copyfile(struct vnode *fvp, struct vnode *tvp, kauth_cred_t cred, + struct proc *p) { - char *buf; + char *bufp; struct uio uio; - struct iovec iov; + struct iovec_32 iov; + struct vfs_context context; int error = 0; /* @@ -605,51 +612,50 @@ union_copyfile(fvp, tvp, cred, p) * give up at the first sign of trouble. */ - uio.uio_procp = p; + context.vc_proc = p; + context.vc_ucred = cred; + +#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ uio.uio_segflg = UIO_SYSSPACE; +#else + uio.uio_segflg = UIO_SYSSPACE32; +#endif uio.uio_offset = 0; - VOP_UNLOCK(fvp, 0, p); /* XXX */ - VOP_LEASE(fvp, p, cred, LEASE_READ); - vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p); /* XXX */ - VOP_UNLOCK(tvp, 0, p); /* XXX */ - VOP_LEASE(tvp, p, cred, LEASE_WRITE); - vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, p); /* XXX */ - - buf = _MALLOC(MAXPHYSIO, M_TEMP, M_WAITOK); + bufp = _MALLOC(MAXPHYSIO, M_TEMP, M_WAITOK); /* ugly loop follows... */ do { off_t offset = uio.uio_offset; - uio.uio_iov = &iov; + uio.uio_iovs.iov32p = &iov; uio.uio_iovcnt = 1; - iov.iov_base = buf; + iov.iov_base = (uintptr_t)bufp; iov.iov_len = MAXPHYSIO; - uio.uio_resid = iov.iov_len; + uio_setresid(&uio, iov.iov_len); uio.uio_rw = UIO_READ; - error = VOP_READ(fvp, &uio, 0, cred); + error = VNOP_READ(fvp, &uio, 0, &context); if (error == 0) { - uio.uio_iov = &iov; + uio.uio_iovs.iov32p = &iov; uio.uio_iovcnt = 1; - iov.iov_base = buf; - iov.iov_len = MAXPHYSIO - uio.uio_resid; + iov.iov_base = (uintptr_t)bufp; + iov.iov_len = MAXPHYSIO - uio_resid(&uio); uio.uio_offset = offset; uio.uio_rw = UIO_WRITE; - uio.uio_resid = iov.iov_len; + uio_setresid(&uio, iov.iov_len); - if (uio.uio_resid == 0) + if (uio_resid(&uio) == 0) break; do { - error = VOP_WRITE(tvp, &uio, 0, cred); - } while ((uio.uio_resid > 0) && (error == 0)); + error = VNOP_WRITE(tvp, &uio, 0, &context); + } while ((uio_resid(&uio) > 0) && (error == 0)); } } while (error == 0); - _FREE(buf, M_TEMP); + _FREE(bufp, M_TEMP); return (error); } @@ -658,19 +664,20 @@ union_copyfile(fvp, tvp, cred, p) * locked on exit. 
*/ int -union_copyup(un, docopy, cred, p) - struct union_node *un; - int docopy; - struct ucred *cred; - struct proc *p; +union_copyup(struct union_node *un, int docopy, kauth_cred_t cred, + struct proc *p) { int error; struct vnode *lvp, *uvp; + struct vfs_context context; error = union_vn_create(&uvp, un, p); if (error) return (error); + context.vc_proc = p; + context.vc_ucred = cred; + /* at this point, uppervp is locked */ union_newupper(un, uvp); un->un_flags |= UN_ULOCK; @@ -680,14 +687,12 @@ union_copyup(un, docopy, cred, p) if (docopy) { /* * XX - should not ignore errors - * from VOP_CLOSE + * from vnop_close */ - vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY, p); - error = VOP_OPEN(lvp, FREAD, cred, p); + error = VNOP_OPEN(lvp, FREAD, &context); if (error == 0) { error = union_copyfile(lvp, uvp, cred, p); - VOP_UNLOCK(lvp, 0, p); - (void) VOP_CLOSE(lvp, FREAD, cred, p); + (void) VNOP_CLOSE(lvp, FREAD, &context); } #ifdef UNION_DIAGNOSTIC if (error == 0) @@ -696,9 +701,7 @@ union_copyup(un, docopy, cred, p) } un->un_flags &= ~UN_ULOCK; - VOP_UNLOCK(uvp, 0, p); union_vn_close(uvp, FWRITE, cred, p); - vn_lock(uvp, LK_EXCLUSIVE | LK_RETRY, p); un->un_flags |= UN_ULOCK; /* @@ -713,8 +716,8 @@ union_copyup(un, docopy, cred, p) int i; for (i = 0; i < un->un_openl; i++) { - (void) VOP_CLOSE(lvp, FREAD, cred, p); - (void) VOP_OPEN(uvp, FREAD, cred, p); + (void) VNOP_CLOSE(lvp, FREAD, &context); + (void) VNOP_OPEN(uvp, FREAD, &context); } un->un_openl = 0; } @@ -741,10 +744,8 @@ union_relookup(um, dvp, vpp, cnp, cn, path, pathlen) * from or what it is being used for. This must duplicate * some of the work done by NDINIT, some of the work done * by namei, some of the work done by lookup and some of - * the work done by VOP_LOOKUP when given a CREATE flag. + * the work done by vnop_lookup when given a CREATE flag. * Conclusion: Horrible. - * - * The pathname buffer will be FREEed by VOP_MKDIR. */ cn->cn_namelen = pathlen; cn->cn_pnbuf = _MALLOC_ZONE(cn->cn_namelen+1, M_NAMEI, M_WAITOK); @@ -754,19 +755,22 @@ union_relookup(um, dvp, vpp, cnp, cn, path, pathlen) cn->cn_nameiop = CREATE; cn->cn_flags = (LOCKPARENT|HASBUF|SAVENAME|SAVESTART|ISLASTCN); +#ifdef XXX_HELP_ME cn->cn_proc = cnp->cn_proc; if (um->um_op == UNMNT_ABOVE) cn->cn_cred = cnp->cn_cred; else cn->cn_cred = um->um_cred; +#endif + cn->cn_context = cnp->cn_context; /* XXX !UNMNT_ABOVE case ??? */ cn->cn_nameptr = cn->cn_pnbuf; cn->cn_hash = cnp->cn_hash; cn->cn_consume = cnp->cn_consume; - VREF(dvp); + vnode_get(dvp); error = relookup(dvp, vpp, cn); if (!error) - vrele(dvp); + vnode_put(dvp); return (error); } @@ -791,8 +795,7 @@ union_mkshadow(um, dvp, cnp, vpp) struct vnode **vpp; { int error; - struct vattr va; - struct proc *p = cnp->cn_proc; + struct vnode_attr va; struct componentname cn; error = union_relookup(um, dvp, vpp, cnp, &cn, @@ -801,9 +804,7 @@ union_mkshadow(um, dvp, cnp, vpp) return (error); if (*vpp) { - VOP_ABORTOP(dvp, &cn); - VOP_UNLOCK(dvp, 0, p); - vrele(*vpp); + vnode_put(*vpp); *vpp = NULLVP; return (EEXIST); } @@ -815,15 +816,11 @@ union_mkshadow(um, dvp, cnp, vpp) * 777 modified by umask (ie mostly identical to the * mkdir syscall). 
(jsp, kb) */ + VATTR_INIT(&va); + VATTR_SET(&va, va_type, VDIR); + VATTR_SET(&va, va_mode, um->um_cmode); - VATTR_NULL(&va); - va.va_type = VDIR; - va.va_mode = um->um_cmode; - - /* VOP_LEASE: dvp is locked */ - VOP_LEASE(dvp, p, cn.cn_cred, LEASE_WRITE); - - error = VOP_MKDIR(dvp, vpp, &cn, &va); + error = vn_create(dvp, vpp, &cn, &va, 0, cnp->cn_context); return (error); } @@ -844,33 +841,22 @@ union_mkwhiteout(um, dvp, cnp, path) char *path; { int error; - struct vattr va; - struct proc *p = cnp->cn_proc; struct vnode *wvp; struct componentname cn; - VOP_UNLOCK(dvp, 0, p); error = union_relookup(um, dvp, &wvp, cnp, &cn, path, strlen(path)); if (error) { - vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, p); return (error); } - if (wvp) { - VOP_ABORTOP(dvp, &cn); - vrele(dvp); - vrele(wvp); + vnode_put(dvp); + vnode_put(wvp); return (EEXIST); } - /* VOP_LEASE: dvp is locked */ - VOP_LEASE(dvp, p, p->p_ucred, LEASE_WRITE); + error = VNOP_WHITEOUT(dvp, &cn, CREATE, cnp->cn_context); - error = VOP_WHITEOUT(dvp, &cn, CREATE); - if (error) - VOP_ABORTOP(dvp, &cn); - - vrele(dvp); + vnode_put(dvp); return (error); } @@ -890,9 +876,10 @@ union_vn_create(vpp, un, p) struct proc *p; { struct vnode *vp; - struct ucred *cred = p->p_ucred; - struct vattr vat; - struct vattr *vap = &vat; + kauth_cred_t cred = p->p_ucred; + struct vnode_attr vat; + struct vnode_attr *vap = &vat; + struct vfs_context context; int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL); int error; int cmode = UN_FILEMODE & ~p->p_fd->fd_cmask; @@ -901,6 +888,9 @@ union_vn_create(vpp, un, p) *vpp = NULLVP; + context.vc_proc = p; + context.vc_ucred = p->p_ucred; + /* * Build a new componentname structure (for the same * reasons outlined in union_mkshadow). @@ -917,24 +907,19 @@ union_vn_create(vpp, un, p) bcopy(un->un_path, cn.cn_pnbuf, cn.cn_namelen+1); cn.cn_nameiop = CREATE; cn.cn_flags = (LOCKPARENT|HASBUF|SAVENAME|SAVESTART|ISLASTCN); - cn.cn_proc = p; - cn.cn_cred = p->p_ucred; + cn.cn_context = &context; cn.cn_nameptr = cn.cn_pnbuf; cn.cn_hash = un->un_hash; cn.cn_consume = 0; - VREF(un->un_dirvp); + vnode_get(un->un_dirvp); if (error = relookup(un->un_dirvp, &vp, &cn)) return (error); - vrele(un->un_dirvp); + vnode_put(un->un_dirvp); if (vp) { - VOP_ABORTOP(un->un_dirvp, &cn); - if (un->un_dirvp == vp) - vrele(un->un_dirvp); - else - vput(un->un_dirvp); - vrele(vp); + vnode_put(un->un_dirvp); + vnode_put(vp); return (EEXIST); } @@ -946,37 +931,46 @@ union_vn_create(vpp, un, p) * it is unioned, will require access to the top *and* * bottom files. Access when not unioned will simply * require access to the top-level file. + * TODO: confirm choice of access permissions.
+ * decide on authorisation behaviour */ - VATTR_NULL(vap); - vap->va_type = VREG; - vap->va_mode = cmode; - VOP_LEASE(un->un_dirvp, p, cred, LEASE_WRITE); - if (error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap)) + + VATTR_INIT(vap); + VATTR_SET(vap, va_type, VREG); + VATTR_SET(vap, va_mode, cmode); + + if (error = vn_create(un->un_dirvp, &vp, &cn, vap, 0, &context)) return (error); - if (error = VOP_OPEN(vp, fmode, cred, p)) { - vput(vp); + if (error = VNOP_OPEN(vp, fmode, &context)) { + vnode_put(vp); return (error); } + vnode_lock(vp); if (++vp->v_writecount <= 0) panic("union: v_writecount"); + vnode_unlock(vp); *vpp = vp; return (0); } int -union_vn_close(vp, fmode, cred, p) - struct vnode *vp; - int fmode; - struct ucred *cred; - struct proc *p; +union_vn_close(struct vnode *vp, int fmode, kauth_cred_t cred, + struct proc *p) { + struct vfs_context context; + + context.vc_proc = p; + context.vc_ucred = cred; - if (fmode & FWRITE) + if (fmode & FWRITE) { + vnode_lock(vp); --vp->v_writecount; - return (VOP_CLOSE(vp, fmode, cred, p)); + vnode_unlock(vp); + } + return (VNOP_CLOSE(vp, fmode, &context)); } void @@ -993,7 +987,6 @@ union_removed_upper(un) if (un->un_flags & UN_ULOCK) { un->un_flags &= ~UN_ULOCK; - VOP_UNLOCK(un->un_uppervp, 0, p); } } @@ -1006,7 +999,7 @@ union_lowervp(vp) if ((un->un_lowervp != NULLVP) && (vp->v_type == un->un_lowervp->v_type)) { - if (vget(un->un_lowervp, 0, current_proc()) == 0) + if (vnode_get(un->un_lowervp) == 0) return (un->un_lowervp); } @@ -1019,17 +1012,16 @@ union_lowervp(vp) * during a remove/rmdir operation. */ int -union_dowhiteout(un, cred, p) - struct union_node *un; - struct ucred *cred; - struct proc *p; +union_dowhiteout(struct union_node *un, vfs_context_t ctx) { - struct vattr va; + struct vnode_attr va; if (un->un_lowervp != NULLVP) return (1); - if (VOP_GETATTR(un->un_uppervp, &va, cred, p) == 0 && + VATTR_INIT(&va); + VATTR_WANTED(&va, va_flags); + if (vnode_getattr(un->un_uppervp, &va, ctx) == 0 && (va.va_flags & OPAQUE)) return (1); @@ -1046,7 +1038,7 @@ union_dircache_r(vp, vppp, cntp) if (vp->v_op != union_vnodeop_p) { if (vppp) { - VREF(vp); + vnode_get(vp); *(*vppp)++ = vp; if (--(*cntp) == 0) panic("union: dircache table too small"); @@ -1069,27 +1061,26 @@ union_dircache(vp, p) struct vnode *vp; struct proc *p; { - int cnt; + int count; struct vnode *nvp; struct vnode **vpp; struct vnode **dircache; struct union_node *un; int error; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); dircache = VTOUNION(vp)->un_dircache; nvp = NULLVP; if (dircache == 0) { - cnt = 0; - union_dircache_r(vp, 0, &cnt); - cnt++; + count = 0; + union_dircache_r(vp, 0, &count); + count++; dircache = (struct vnode **) - _MALLOC(cnt * sizeof(struct vnode *), + _MALLOC(count * sizeof(struct vnode *), M_TEMP, M_WAITOK); vpp = dircache; - union_dircache_r(vp, &vpp, &cnt); + union_dircache_r(vp, &vpp, &count); *vpp = NULLVP; vpp = dircache + 1; } else { @@ -1103,8 +1094,7 @@ union_dircache(vp, p) if (*vpp == NULLVP) goto out; - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, p); - VREF(*vpp); + vnode_get(*vpp); error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, 0, *vpp, NULLVP, 0); if (error) goto out; @@ -1114,6 +1104,5 @@ union_dircache(vp, p) un->un_dircache = dircache; out: - VOP_UNLOCK(vp, 0, p); return (nvp); } diff --git a/bsd/miscfs/union/union_vfsops.c b/bsd/miscfs/union/union_vfsops.c index 779831f2d..959c201d2 100644 --- a/bsd/miscfs/union/union_vfsops.c +++ b/bsd/miscfs/union/union_vfsops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. 
All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -67,38 +67,36 @@ #include <sys/systm.h> #include <sys/time.h> #include <sys/types.h> -#include <sys/proc.h> -#include <sys/vnode.h> -#include <sys/mount.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> +#include <sys/vnode_internal.h> +#include <sys/mount_internal.h> #include <sys/namei.h> #include <sys/malloc.h> #include <sys/filedesc.h> #include <sys/queue.h> #include <miscfs/union/union.h> +static int union_itercallback(__unused vnode_t, void *); + /* * Mount union filesystem */ int -union_mount(mp, path, data, ndp, p) - struct mount *mp; - char *path; - caddr_t data; - struct nameidata *ndp; - struct proc *p; +union_mount(mount_t mp, __unused vnode_t devvp, user_addr_t data, vfs_context_t context) { + proc_t p = vfs_context_proc(context); int error = 0; - struct union_args args; + struct user_union_args args; struct vnode *lowerrootvp = NULLVP; struct vnode *upperrootvp = NULLVP; struct union_mount *um = 0; - struct ucred *cred = 0; - struct ucred *scred; - struct vattr va; + kauth_cred_t cred = NOCRED; char *cp; int len; u_int size; - + struct nameidata nd; + #ifdef UNION_DIAGNOSTIC printf("union_mount(mp = %x)\n", mp); #endif @@ -112,31 +110,42 @@ union_mount(mp, path, data, ndp, p) * 1. a way to convert between rdonly and rdwr mounts. * 2. support for nfs exports. */ - error = EOPNOTSUPP; + error = ENOTSUP; goto bad; } /* * Get argument */ - if (error = copyin(data, (caddr_t)&args, sizeof(struct union_args))) + if (vfs_context_is64bit(context)) { + error = copyin(data, (caddr_t)&args, sizeof(args)); + } + else { + struct union_args temp; + error = copyin(data, (caddr_t)&temp, sizeof (temp)); + args.target = CAST_USER_ADDR_T(temp.target); + args.mntflags = temp.mntflags; + } + if (error) goto bad; lowerrootvp = mp->mnt_vnodecovered; - VREF(lowerrootvp); + vnode_get(lowerrootvp); /* * Find upper node. */ - NDINIT(ndp, LOOKUP, FOLLOW|WANTPARENT, - UIO_USERSPACE, args.target, p); + NDINIT(&nd, LOOKUP, FOLLOW|WANTPARENT, + (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), + args.target, context); - if (error = namei(ndp)) + if ((error = namei(&nd))) goto bad; - upperrootvp = ndp->ni_vp; - vrele(ndp->ni_dvp); - ndp->ni_dvp = NULL; + nameidone(&nd); + upperrootvp = nd.ni_vp; + vnode_put(nd.ni_dvp); + nd.ni_dvp = NULL; if (upperrootvp->v_type != VDIR) { error = EINVAL; @@ -150,7 +159,7 @@ union_mount(mp, path, data, ndp, p) /* * Keep a held reference to the target vnodes. - * They are vrele'd in union_unmount. + * They are vnode_put'd in union_unmount. * * Depending on the _BELOW flag, the filesystems are * viewed in a different order. 
In effect, this is the @@ -170,7 +179,7 @@ union_mount(mp, path, data, ndp, p) break; case UNMNT_REPLACE: - vrele(lowerrootvp); + vnode_put(lowerrootvp); lowerrootvp = NULLVP; um->um_uppervp = upperrootvp; um->um_lowervp = lowerrootvp; @@ -186,13 +195,13 @@ union_mount(mp, path, data, ndp, p) * supports whiteout operations */ if ((mp->mnt_flag & MNT_RDONLY) == 0) { - error = VOP_WHITEOUT(um->um_uppervp, (struct componentname *) 0, LOOKUP); + error = VNOP_WHITEOUT(um->um_uppervp, (struct componentname *) 0, + LOOKUP, context); if (error) goto bad; } - um->um_cred = p->p_ucred; - crhold(um->um_cred); + um->um_cred = kauth_cred_get_with_ref(); um->um_cmode = UN_DIRMODE &~ p->p_fd->fd_cmask; /* @@ -223,9 +232,6 @@ union_mount(mp, path, data, ndp, p) mp->mnt_data = (qaddr_t) um; vfs_getnewfsid(mp); - (void) copyinstr(path, mp->mnt_stat.f_mntonname, - MNAMELEN - 1, (size_t *)&size); - bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); switch (um->um_op) { case UNMNT_ABOVE: @@ -239,9 +245,9 @@ union_mount(mp, path, data, ndp, p) break; } len = strlen(cp); - bcopy(cp, mp->mnt_stat.f_mntfromname, len); + bcopy(cp, mp->mnt_vfsstat.f_mntfromname, len); - cp = mp->mnt_stat.f_mntfromname + len; + cp = mp->mnt_vfsstat.f_mntfromname + len; len = MNAMELEN - len; (void) copyinstr(args.target, cp, len - 1, (size_t *)&size); @@ -249,7 +255,7 @@ union_mount(mp, path, data, ndp, p) #ifdef UNION_DIAGNOSTIC printf("union_mount: from %s, on %s\n", - mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname); + mp->mnt_vfsstat.f_mntfromname, mp->mnt_vfsstat.f_mntonname); #endif return (0); @@ -257,11 +263,11 @@ bad: if (um) _FREE(um, M_UFSMNT); if (cred != NOCRED) - crfree(cred); + kauth_cred_rele(cred); if (upperrootvp) - vrele(upperrootvp); + vnode_put(upperrootvp); if (lowerrootvp) - vrele(lowerrootvp); + vnode_put(lowerrootvp); return (error); } @@ -271,30 +277,35 @@ bad: * when that filesystem was mounted. */ int -union_start(mp, flags, p) - struct mount *mp; - int flags; - struct proc *p; +union_start(__unused struct mount *mp, __unused int flags, __unused vfs_context_t context) { return (0); } +static int +union_itercallback(__unused vnode_t vp, void *args) +{ + int num = *(int *)args; + + *(int *)args = num + 1; + return(VNODE_RETURNED); +} + + + /* * Free reference to union layer */ int -union_unmount(mp, mntflags, p) - struct mount *mp; - int mntflags; - struct proc *p; +union_unmount(mount_t mp, int mntflags, __unused vfs_context_t context) { struct union_mount *um = MOUNTTOUNIONMOUNT(mp); struct vnode *um_rootvp; int error; int freeing; int flags = 0; - struct ucred *cred; + kauth_cred_t cred; #ifdef UNION_DIAGNOSTIC printf("union_unmount(mp = %x)\n", mp); @@ -303,7 +314,7 @@ union_unmount(mp, mntflags, p) if (mntflags & MNT_FORCE) flags |= FORCECLOSE; - if (error = union_root(mp, &um_rootvp)) + if ((error = union_root(mp, &um_rootvp))) return (error); /* @@ -316,14 +327,9 @@ union_unmount(mp, mntflags, p) * in the filesystem. 
*/ for (freeing = 0; vflush(mp, um_rootvp, flags) != 0;) { - struct vnode *vp; - int n; + int n = 0; - /* count #vnodes held on mount list */ - for (n = 0, vp = mp->mnt_vnodelist.lh_first; - vp != NULLVP; - vp = vp->v_mntvnodes.le_next) - n++; + vnode_iterate(mp, VNODE_NOLOCK_INTERNAL, union_itercallback, &n); /* if this is unchanged then stop */ if (n == freeing) @@ -334,8 +340,8 @@ union_unmount(mp, mntflags, p) } /* At this point the root vnode should have a single reference */ - if (um_rootvp->v_usecount > 1) { - vput(um_rootvp); + if (vnode_isinuse(um_rootvp, 0)) { + vnode_put(um_rootvp); return (EBUSY); } @@ -346,21 +352,21 @@ union_unmount(mp, mntflags, p) * Discard references to upper and lower target vnodes. */ if (um->um_lowervp) - vrele(um->um_lowervp); - vrele(um->um_uppervp); + vnode_put(um->um_lowervp); + vnode_put(um->um_uppervp); cred = um->um_cred; if (cred != NOCRED) { um->um_cred = NOCRED; - crfree(cred); + kauth_cred_rele(cred); } /* * Release reference on underlying root vnode */ - vput(um_rootvp); + vnode_put(um_rootvp); /* * And blow it away for future re-use */ - vgone(um_rootvp); + vnode_reclaim(um_rootvp); /* * Finally, throw away the union_mount structure */ @@ -370,28 +376,17 @@ union_unmount(mp, mntflags, p) } int -union_root(mp, vpp) - struct mount *mp; - struct vnode **vpp; +union_root(mount_t mp, vnode_t *vpp, __unused vfs_context_t context) { - struct proc *p = current_proc(); /* XXX */ struct union_mount *um = MOUNTTOUNIONMOUNT(mp); int error; - int loselock; /* * Return locked reference to root. */ - VREF(um->um_uppervp); - if ((um->um_op == UNMNT_BELOW) && - VOP_ISLOCKED(um->um_uppervp)) { - loselock = 1; - } else { - vn_lock(um->um_uppervp, LK_EXCLUSIVE | LK_RETRY, p); - loselock = 0; - } + vnode_get(um->um_uppervp); if (um->um_lowervp) - VREF(um->um_lowervp); + vnode_get(um->um_lowervp); error = union_allocvp(vpp, mp, (struct vnode *) 0, (struct vnode *) 0, @@ -401,75 +396,85 @@ union_root(mp, vpp) 1); if (error) { - if (loselock) - vrele(um->um_uppervp); - else - vput(um->um_uppervp); + vnode_put(um->um_uppervp); if (um->um_lowervp) - vrele(um->um_lowervp); - } else { - if (loselock) - VTOUNION(*vpp)->un_flags &= ~UN_ULOCK; - } + vnode_put(um->um_lowervp); + } return (error); } -int -union_statfs(mp, sbp, p) - struct mount *mp; - struct statfs *sbp; - struct proc *p; +static int +union_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t context) { int error; struct union_mount *um = MOUNTTOUNIONMOUNT(mp); - struct statfs mstat; - int lbsize; + struct vfs_attr attr; + uint32_t lbsize = 0; #ifdef UNION_DIAGNOSTIC - printf("union_statfs(mp = %x, lvp = %x, uvp = %x)\n", mp, + printf("union_vfs_getattr(mp = %x, lvp = %x, uvp = %x)\n", mp, um->um_lowervp, um->um_uppervp); #endif - bzero(&mstat, sizeof(mstat)); - + /* Get values from lower file system (if any) */ if (um->um_lowervp) { - error = VFS_STATFS(um->um_lowervp->v_mount, &mstat, p); + VFSATTR_INIT(&attr); + VFSATTR_WANTED(&attr, f_bsize); + VFSATTR_WANTED(&attr, f_blocks); + VFSATTR_WANTED(&attr, f_bused); + VFSATTR_WANTED(&attr, f_files); + error = vfs_getattr(um->um_lowervp->v_mount, &attr, context); if (error) return (error); + + /* now copy across the "interesting" information and fake the rest */ + if (VFSATTR_IS_SUPPORTED(&attr, f_bsize)) + lbsize = attr.f_bsize; + else + lbsize = um->um_lowervp->v_mount->mnt_devblocksize; + fsap->f_blocks = VFSATTR_IS_SUPPORTED(&attr, f_blocks) ? attr.f_blocks : 0; + fsap->f_bused = VFSATTR_IS_SUPPORTED(&attr, f_bused) ? 
attr.f_bused : 0; + fsap->f_files = VFSATTR_IS_SUPPORTED(&attr, f_files) ? attr.f_files : 0; + } else { + fsap->f_blocks = 0; + fsap->f_bused = 0; + fsap->f_files = 0; } - /* now copy across the "interesting" information and fake the rest */ -#if 0 - sbp->f_type = mstat.f_type; - sbp->f_flags = mstat.f_flags; - sbp->f_bsize = mstat.f_bsize; - sbp->f_iosize = mstat.f_iosize; -#endif - lbsize = mstat.f_bsize; - sbp->f_blocks = mstat.f_blocks; - sbp->f_bfree = mstat.f_bfree; - sbp->f_bavail = mstat.f_bavail; - sbp->f_files = mstat.f_files; - sbp->f_ffree = mstat.f_ffree; - - error = VFS_STATFS(um->um_uppervp->v_mount, &mstat, p); + VFSATTR_INIT(&attr); + VFSATTR_WANTED(&attr, f_bsize); + VFSATTR_WANTED(&attr, f_blocks); + VFSATTR_WANTED(&attr, f_bfree); + VFSATTR_WANTED(&attr, f_bavail); + VFSATTR_WANTED(&attr, f_files); + VFSATTR_WANTED(&attr, f_ffree); + error = vfs_getattr(um->um_uppervp->v_mount, &attr, context); if (error) return (error); - sbp->f_flags = mstat.f_flags; - sbp->f_bsize = mstat.f_bsize; - sbp->f_iosize = mstat.f_iosize; + if (VFSATTR_IS_SUPPORTED(&attr, f_bsize)) { + fsap->f_bsize = attr.f_bsize; + VFSATTR_SET_SUPPORTED(fsap, f_bsize); + } + if (VFSATTR_IS_SUPPORTED(&attr, f_iosize)) { + fsap->f_iosize = attr.f_iosize; + VFSATTR_SET_SUPPORTED(fsap, f_iosize); + } /* * if the lower and upper blocksizes differ, then frig the * block counts so that the sizes reported by df make some * kind of sense. none of this makes sense though. */ - - if (mstat.f_bsize != lbsize) - sbp->f_blocks = sbp->f_blocks * lbsize / mstat.f_bsize; + if (VFSATTR_IS_SUPPORTED(&attr, f_bsize)) + fsap->f_bsize = attr.f_bsize; + else + fsap->f_bsize = um->um_uppervp->v_mount->mnt_devblocksize; + VFSATTR_RETURN(fsap, f_bsize, attr.f_bsize); + if (fsap->f_bsize != lbsize) + fsap->f_blocks = fsap->f_blocks * lbsize / attr.f_bsize; /* * The "total" fields count total resources in all layers, @@ -477,49 +482,52 @@ union_statfs(mp, sbp, p) * free in the upper layer (since only the upper layer * is writeable). */ - sbp->f_blocks += mstat.f_blocks; - sbp->f_bfree = mstat.f_bfree; - sbp->f_bavail = mstat.f_bavail; - sbp->f_files += mstat.f_files; - sbp->f_ffree = mstat.f_ffree; - - if (sbp != &mp->mnt_stat) { - sbp->f_type = mp->mnt_vfc->vfc_typenum; - bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); - bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); - bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); - } + if (VFSATTR_IS_SUPPORTED(&attr, f_blocks)) + fsap->f_blocks += attr.f_blocks; + if (VFSATTR_IS_SUPPORTED(&attr, f_bfree)) + fsap->f_bfree = attr.f_bfree; + if (VFSATTR_IS_SUPPORTED(&attr, f_bavail)) + fsap->f_bavail = attr.f_bavail; + if (VFSATTR_IS_SUPPORTED(&attr, f_bused)) + fsap->f_bused += attr.f_bused; + if (VFSATTR_IS_SUPPORTED(&attr, f_files)) + fsap->f_files += attr.f_files; + if (VFSATTR_IS_SUPPORTED(&attr, f_ffree)) + fsap->f_ffree = attr.f_ffree; + + VFSATTR_SET_SUPPORTED(fsap, f_bsize); + VFSATTR_SET_SUPPORTED(fsap, f_blocks); + VFSATTR_SET_SUPPORTED(fsap, f_bfree); + VFSATTR_SET_SUPPORTED(fsap, f_bavail); + VFSATTR_SET_SUPPORTED(fsap, f_bused); + VFSATTR_SET_SUPPORTED(fsap, f_files); + VFSATTR_SET_SUPPORTED(fsap, f_ffree); + return (0); } /* * XXX - Assumes no data cached at union layer. 
*/ -#define union_sync ((int (*) __P((struct mount *, int, struct ucred *, \ - struct proc *)))nullop) - -#define union_fhtovp ((int (*) __P((struct mount *, struct fid *, \ - struct mbuf *, struct vnode **, int *, struct ucred **)))eopnotsupp) -int union_init __P((struct vfsconf *)); -#define union_quotactl ((int (*) __P((struct mount *, int, uid_t, caddr_t, \ - struct proc *)))eopnotsupp) -#define union_sysctl ((int (*) __P((int *, u_int, void *, size_t *, void *, \ - size_t, struct proc *)))eopnotsupp) -#define union_vget ((int (*) __P((struct mount *, void *, struct vnode **))) \ - eopnotsupp) -#define union_vptofh ((int (*) __P((struct vnode *, struct fid *)))eopnotsupp) +#define union_sync (int (*) (mount_t, int, ucred_t, vfs_context_t))nullop + +#define union_fhtovp (int (*) (mount_t, int, unsigned char *, vnode_t *, vfs_context_t))eopnotsupp +int union_init (struct vfsconf *); +#define union_sysctl (int (*) (int *, u_int, user_addr_t, size_t *, user_addr_t, size_t, vfs_context_t))eopnotsupp +#define union_vget (int (*) (mount_t, ino64_t, vnode_t *, vfs_context_t))eopnotsupp +#define union_vptofh (int (*) (vnode_t, int *, unsigned char *, vfs_context_t))eopnotsupp struct vfsops union_vfsops = { union_mount, union_start, union_unmount, union_root, - union_quotactl, - union_statfs, + NULL, /* quotactl */ + union_vfs_getattr, union_sync, union_vget, union_fhtovp, union_vptofh, union_init, - union_sysctl, + union_sysctl }; diff --git a/bsd/miscfs/union/union_vnops.c b/bsd/miscfs/union/union_vnops.c index 5ff3134f9..4b1ca8ef1 100644 --- a/bsd/miscfs/union/union_vnops.c +++ b/bsd/miscfs/union/union_vnops.c @@ -62,20 +62,22 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <sys/file.h> #include <sys/time.h> #include <sys/stat.h> #include <sys/types.h> -#include <sys/vnode.h> -#include <sys/mount.h> +#include <sys/vnode_internal.h> +#include <sys/mount_internal.h> #include <sys/namei.h> #include <sys/malloc.h> -#include <sys/buf.h> +#include <sys/buf_internal.h> #include <sys/queue.h> #include <sys/lock.h> #include <miscfs/union/union.h> #include <vfs/vfs_support.h> #include <sys/ubc.h> +#include <sys/uio_internal.h> #define FIXUP(un, p) { \ if (((un)->un_flags & UN_ULOCK) == 0) { \ @@ -89,19 +91,15 @@ union_fixup(un, p) struct proc *p; { - vn_lock(un->un_uppervp, LK_EXCLUSIVE | LK_RETRY, p); un->un_flags |= UN_ULOCK; } static int -union_lookup1(udvp, dvpp, vpp, cnp) - struct vnode *udvp; - struct vnode **dvpp; - struct vnode **vpp; - struct componentname *cnp; +union_lookup1(struct vnode *udvp, struct vnode **dvpp, struct vnode **vpp, + struct componentname *cnp) { int error; - struct proc *p = cnp->cn_proc; + vfs_context_t ctx = cnp->cn_context; struct vnode *tdvp; struct vnode *dvp; struct mount *mp; @@ -124,26 +122,16 @@ union_lookup1(udvp, dvpp, vpp, cnp) */ tdvp = dvp; *dvpp = dvp = dvp->v_mount->mnt_vnodecovered; - vput(tdvp); - VREF(dvp); - vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, p); + vnode_put(tdvp); + vnode_get(dvp); } } - error = VOP_LOOKUP(dvp, &tdvp, cnp); + error = VNOP_LOOKUP(dvp, &tdvp, cnp, ctx); if (error) return (error); - /* - * The parent directory will have been unlocked, unless lookup - * found the last component. In which case, re-lock the node - * here to allow it to be unlocked again (phew) in union_lookup. 
- */ - if (dvp != tdvp && !(cnp->cn_flags & ISLASTCN)) - vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, p); - dvp = tdvp; - /* * Lastly check if the current node is a mount point in * which case walk up the mount hierarchy making sure not to @@ -151,18 +139,18 @@ union_lookup1(udvp, dvpp, vpp, cnp) */ while (dvp != udvp && (dvp->v_type == VDIR) && (mp = dvp->v_mountedhere)) { - if (vfs_busy(mp, LK_NOWAIT, 0, p)) { - vput(dvp); + if (vfs_busy(mp, LK_NOWAIT)) { + vnode_put(dvp); return(ENOENT); } - error = VFS_ROOT(mp, &tdvp); - vfs_unbusy(mp, p); + error = VFS_ROOT(mp, &tdvp, ctx); + vfs_unbusy(mp); if (error) { - vput(dvp); + vnode_put(dvp); return (error); } - vput(dvp); + vnode_put(dvp); dvp = tdvp; } @@ -171,13 +159,14 @@ union_lookup1(udvp, dvpp, vpp, cnp) } int -union_lookup(ap) - struct vop_lookup_args /* { +union_lookup( + struct vnop_lookup_args /* { struct vnodeop_desc *a_desc; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; - } */ *ap; + vfs_context_t a_context; + } */ *ap) { int error; int uerror, lerror; @@ -186,13 +175,13 @@ union_lookup(ap) struct vnode *dvp = ap->a_dvp; struct union_node *dun = VTOUNION(dvp); struct componentname *cnp = ap->a_cnp; - struct proc *p = cnp->cn_proc; + vfs_context_t ctx = cnp->cn_context; + struct proc *p = vfs_context_proc(ctx); int lockparent = cnp->cn_flags & LOCKPARENT; - int rdonly = cnp->cn_flags & RDONLY; struct union_mount *um = MOUNTTOUNIONMOUNT(dvp->v_mount); - struct ucred *saved_cred; + kauth_cred_t saved_cred; int iswhiteout; - struct vattr va; + struct vnode_attr va; #ifdef notyet if (cnp->cn_namelen == 3 && @@ -202,10 +191,8 @@ union_lookup(ap) dvp = *ap->a_vpp = LOWERVP(ap->a_dvp); if (dvp == NULLVP) return (ENOENT); - VREF(dvp); - vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, p); - if (!lockparent || !(cnp->cn_flags & ISLASTCN)) - VOP_UNLOCK(ap->a_dvp, 0, p); + vnode_get(dvp); + return (0); } #endif @@ -241,8 +228,9 @@ union_lookup(ap) if (cnp->cn_flags & ISWHITEOUT) { iswhiteout = 1; } else if (lowerdvp != NULLVP) { - lerror = VOP_GETATTR(upperdvp, &va, - cnp->cn_cred, cnp->cn_proc); + VATTR_INIT(&va); + VATTR_WANTED(&va, va_flags); + lerror = vnode_getattr(upperdvp, &va, ap->a_context); if (lerror == 0 && (va.va_flags & OPAQUE)) iswhiteout = 1; } @@ -254,15 +242,13 @@ union_lookup(ap) /* * in a similar way to the upper layer, do the lookup * in the lower layer. this time, if there is some - * component magic going on, then vput whatever we got + * component magic going on, then vnode_put whatever we got * back from the upper layer and return the lower vnode * instead. */ if (lowerdvp != NULLVP && !iswhiteout) { int nameiop; - vn_lock(lowerdvp, LK_EXCLUSIVE | LK_RETRY, p); - /* * Only do a LOOKUP on the bottom node, since * we won't be making changes to it anyway. 
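The iswhiteout test above replaces a full VOP_GETATTR of the upper directory with the selective vnode_attr pattern: the caller marks only the attributes it needs and the filesystem reports which ones it filled in. A small sketch of that pattern, assuming the VATTR_INIT/VATTR_WANTED/vnode_getattr KPI used in these hunks and the OPAQUE flag from sys/stat.h; the helper name dir_is_opaque is hypothetical:

#include <sys/stat.h>
#include <sys/vnode.h>

/*
 * Hypothetical helper: ask only for the BSD flags of a directory and
 * report whether it is marked opaque (i.e. it hides the layer below).
 */
static int
dir_is_opaque(vnode_t dvp, vfs_context_t ctx, int *opaque)
{
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);		/* no attributes requested yet */
	VATTR_WANTED(&va, va_flags);	/* request just the flags word */

	error = vnode_getattr(dvp, &va, ctx);
	if (error)
		return (error);

	/* only trust an attribute the filesystem actually returned */
	*opaque = VATTR_IS_SUPPORTED(&va, va_flags) &&
	    (va.va_flags & OPAQUE) != 0;
	return (0);
}

Requesting a single attribute lets a filesystem skip work it would otherwise do to populate an entire struct vattr, which is the motivation for the VATTR_WANTED bookkeeping that this patch threads through the union and synthfs code.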
@@ -270,24 +256,21 @@ union_lookup(ap) nameiop = cnp->cn_nameiop; cnp->cn_nameiop = LOOKUP; if (um->um_op == UNMNT_BELOW) { - saved_cred = cnp->cn_cred; - cnp->cn_cred = um->um_cred; + /* XXX BOGUS */ + saved_cred = cnp->cn_context->vc_ucred; + cnp->cn_context->vc_ucred = um->um_cred; + lerror = union_lookup1(um->um_lowervp, &lowerdvp, + &lowervp, cnp); + cnp->cn_context->vc_ucred = saved_cred; + } else { + lerror = union_lookup1(um->um_lowervp, &lowerdvp, + &lowervp, cnp); } - lerror = union_lookup1(um->um_lowervp, &lowerdvp, - &lowervp, cnp); - if (um->um_op == UNMNT_BELOW) - cnp->cn_cred = saved_cred; cnp->cn_nameiop = nameiop; - if (lowervp != lowerdvp) - VOP_UNLOCK(lowerdvp, 0, p); - if (cnp->cn_consume != 0) { if (uppervp != NULLVP) { - if (uppervp == upperdvp) - vrele(uppervp); - else - vput(uppervp); + vnode_put(uppervp); uppervp = NULLVP; } *ap->a_vpp = lowervp; @@ -300,8 +283,7 @@ union_lookup(ap) if ((cnp->cn_flags & ISDOTDOT) && dun->un_pvp != NULLVP) { lowervp = LOWERVP(dun->un_pvp); if (lowervp != NULLVP) { - VREF(lowervp); - vn_lock(lowervp, LK_EXCLUSIVE | LK_RETRY, p); + vnode_get(lowervp); lerror = 0; } } @@ -345,54 +327,46 @@ union_lookup(ap) if (uerror != 0 /* && (lerror == 0) */ ) { if (lowervp->v_type == VDIR) { /* case 2b. */ dun->un_flags &= ~UN_ULOCK; - VOP_UNLOCK(upperdvp, 0, p); uerror = union_mkshadow(um, upperdvp, cnp, &uppervp); - vn_lock(upperdvp, LK_EXCLUSIVE | LK_RETRY, p); dun->un_flags |= UN_ULOCK; if (uerror) { if (lowervp != NULLVP) { - vput(lowervp); + vnode_put(lowervp); lowervp = NULLVP; } return (uerror); } } } - - if (lowervp != NULLVP) - VOP_UNLOCK(lowervp, 0, p); - error = union_allocvp(ap->a_vpp, dvp->v_mount, dvp, upperdvp, cnp, uppervp, lowervp, 1); if (error) { if (uppervp != NULLVP) - vput(uppervp); + vnode_put(uppervp); if (lowervp != NULLVP) - vrele(lowervp); - } else { - if (*ap->a_vpp != dvp) - if (!lockparent || !(cnp->cn_flags & ISLASTCN)) - VOP_UNLOCK(dvp, 0, p); + vnode_put(lowervp); } return (error); } int -union_create(ap) - struct vop_create_args /* { +union_create( + struct vnop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; - } */ *ap; + struct vnode_attr *a_vap; + vfs_context_t a_context; + } */ *ap) { struct union_node *un = VTOUNION(ap->a_dvp); struct vnode *dvp = un->un_uppervp; struct componentname *cnp = ap->a_cnp; - struct proc *p = cnp->cn_proc; + vfs_context_t ctx = cnp->cn_context; + struct proc *p = vfs_context_proc(ctx); if (dvp != NULLVP) { int error; @@ -401,57 +375,59 @@ union_create(ap) FIXUP(un, p); - VREF(dvp); un->un_flags |= UN_KLOCK; mp = ap->a_dvp->v_mount; - vput(ap->a_dvp); - error = VOP_CREATE(dvp, &vp, cnp, ap->a_vap); + + /* note that this is a direct passthrough to the filesystem */ + error = VNOP_CREATE(dvp, &vp, cnp, ap->a_vap, ap->a_context); if (error) return (error); error = union_allocvp(ap->a_vpp, mp, NULLVP, NULLVP, cnp, vp, NULLVP, 1); if (error) - vput(vp); + vnode_put(vp); return (error); } - - vput(ap->a_dvp); return (EROFS); } int -union_whiteout(ap) - struct vop_whiteout_args /* { +union_whiteout( + struct vnop_whiteout_args /* { struct vnode *a_dvp; struct componentname *a_cnp; int a_flags; - } */ *ap; + vfs_context_t a_context; + } */ *ap) { struct union_node *un = VTOUNION(ap->a_dvp); struct componentname *cnp = ap->a_cnp; - struct proc *p = cnp->cn_proc; + vfs_context_t ctx = cnp->cn_context; + struct proc *p = vfs_context_proc(ctx); if (un->un_uppervp == NULLVP) - return (EOPNOTSUPP); + return (ENOTSUP); FIXUP(un, p); - 
return (VOP_WHITEOUT(un->un_uppervp, cnp, ap->a_flags)); + return (VNOP_WHITEOUT(un->un_uppervp, cnp, ap->a_flags, ap->a_context)); } int -union_mknod(ap) - struct vop_mknod_args /* { +union_mknod( + struct vnop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; - } */ *ap; + struct vnode_attr *a_vap; + vfs_context_t a_context; + } */ *ap) { struct union_node *un = VTOUNION(ap->a_dvp); struct vnode *dvp = un->un_uppervp; struct componentname *cnp = ap->a_cnp; - struct proc *p = cnp->cn_proc; + vfs_context_t ctx = cnp->cn_context; + struct proc *p = vfs_context_proc(ctx); if (dvp != NULLVP) { int error; @@ -460,11 +436,11 @@ union_mknod(ap) FIXUP(un, p); - VREF(dvp); un->un_flags |= UN_KLOCK; mp = ap->a_dvp->v_mount; - vput(ap->a_dvp); - error = VOP_MKNOD(dvp, &vp, cnp, ap->a_vap); + + /* note that this is a direct passthrough to the filesystem */ + error = VNOP_MKNOD(dvp, &vp, cnp, ap->a_vap, ap->a_context); if (error) return (error); @@ -472,30 +448,27 @@ union_mknod(ap) error = union_allocvp(ap->a_vpp, mp, NULLVP, NULLVP, cnp, vp, NULLVP, 1); if (error) - vput(vp); + vnode_put(vp); } return (error); } - - vput(ap->a_dvp); return (EROFS); } int -union_open(ap) - struct vop_open_args /* { +union_open( + struct vnop_open_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; int a_mode; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; + vfs_context_t a_context; + } */ *ap) { struct union_node *un = VTOUNION(ap->a_vp); struct vnode *tvp; int mode = ap->a_mode; - struct ucred *cred = ap->a_cred; - struct proc *p = ap->a_p; + kauth_cred_t cred = vfs_context_ucred(ap->a_context); + struct proc *p = vfs_context_proc(ap->a_context); int error; /* @@ -512,7 +485,7 @@ union_open(ap) if ((ap->a_mode & FWRITE) && (tvp->v_type == VREG)) { error = union_copyup(un, (mode&O_TRUNC) == 0, cred, p); if (error == 0) - error = VOP_OPEN(un->un_uppervp, mode, cred, p); + error = VNOP_OPEN(un->un_uppervp, mode, ap->a_context); return (error); } @@ -520,27 +493,25 @@ union_open(ap) * Just open the lower vnode */ un->un_openl++; - vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, p); - error = VOP_OPEN(tvp, mode, cred, p); - VOP_UNLOCK(tvp, 0, p); + + error = VNOP_OPEN(tvp, mode, ap->a_context); return (error); } FIXUP(un, p); - error = VOP_OPEN(tvp, mode, cred, p); + error = VNOP_OPEN(tvp, mode, ap->a_context); return (error); } int union_close(ap) - struct vop_close_args /* { + struct vnop_close_args /* { struct vnode *a_vp; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { struct union_node *un = VTOUNION(ap->a_vp); @@ -556,7 +527,7 @@ union_close(ap) } ap->a_vp = vp; - return (VCALL(vp, VOFFSET(vop_close), ap)); + return (VCALL(vp, VOFFSET(vnop_close), ap)); } /* @@ -568,39 +539,37 @@ union_close(ap) * the user caused an implicit file copy. 
*/ int -union_access(ap) - struct vop_access_args /* { +union_access( + struct vnop_access_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; - int a_mode; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; + int a_action; + vfs_context_t a_context; + } */ *ap) { struct union_node *un = VTOUNION(ap->a_vp); - struct proc *p = ap->a_p; + struct proc *p = vfs_context_proc(ap->a_context); int error = EACCES; struct vnode *vp; if ((vp = un->un_uppervp) != NULLVP) { FIXUP(un, p); ap->a_vp = vp; - return (VCALL(vp, VOFFSET(vop_access), ap)); + return (VCALL(vp, VOFFSET(vnop_access), ap)); } if ((vp = un->un_lowervp) != NULLVP) { - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); ap->a_vp = vp; - error = VCALL(vp, VOFFSET(vop_access), ap); + error = VCALL(vp, VOFFSET(vnop_access), ap); if (error == 0) { struct union_mount *um = MOUNTTOUNIONMOUNT(vp->v_mount); if (um->um_op == UNMNT_BELOW) { - ap->a_cred = um->um_cred; - error = VCALL(vp, VOFFSET(vop_access), ap); + /* XXX fix me */ + // ap->a_cred = um->um_cred; + error = VCALL(vp, VOFFSET(vnop_access), ap); } } - VOP_UNLOCK(vp, 0, p); if (error) return (error); } @@ -614,19 +583,18 @@ union_access(ap) */ int union_getattr(ap) - struct vop_getattr_args /* { + struct vnop_getattr_args /* { struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct proc *a_p; + struct vnode_attr *a_vap; + vfs_context_t a_context; } */ *ap; { int error; struct union_node *un = VTOUNION(ap->a_vp); struct vnode *vp = un->un_uppervp; - struct proc *p = ap->a_p; - struct vattr *vap; - struct vattr va; + struct proc *p = vfs_context_proc(ap->a_context); + struct vnode_attr *vap; + struct vnode_attr va; /* @@ -643,7 +611,7 @@ union_getattr(ap) vp = un->un_uppervp; if (vp != NULLVP) { /* - * It's not clear whether VOP_GETATTR is to be + * It's not clear whether vnop_getattr is to be * called with the vnode locked or not. stat() calls * it with (vp) locked, and fstat calls it with * (vp) unlocked. @@ -653,46 +621,49 @@ union_getattr(ap) if (un->un_flags & UN_LOCKED) FIXUP(un, p); - error = VOP_GETATTR(vp, vap, ap->a_cred, ap->a_p); + error = vnode_getattr(vp, vap, ap->a_context); if (error) return (error); - union_newsize(ap->a_vp, vap->va_size, VNOVAL); + union_newsize(ap->a_vp, vap->va_data_size, VNOVAL); } if (vp == NULLVP) { vp = un->un_lowervp; } else if (vp->v_type == VDIR) { vp = un->un_lowervp; + VATTR_INIT(&va); + /* all we want from the lower node is the link count */ + VATTR_WANTED(&va, va_nlink); vap = &va; } else { vp = NULLVP; } if (vp != NULLVP) { - error = VOP_GETATTR(vp, vap, ap->a_cred, ap->a_p); + error = vnode_getattr(vp, vap, ap->a_context); if (error) return (error); - union_newsize(ap->a_vp, VNOVAL, vap->va_size); + union_newsize(ap->a_vp, VNOVAL, vap->va_data_size); } if ((vap != ap->a_vap) && (vap->va_type == VDIR)) ap->a_vap->va_nlink += vap->va_nlink; - ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; + VATTR_RETURN(ap->a_vap, va_fsid, ap->a_vp->v_mount->mnt_vfsstat.f_fsid.val[0]); return (0); } int union_setattr(ap) - struct vop_setattr_args /* { + struct vnop_setattr_args /* { struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct proc *a_p; + struct vnode_attr *a_vap; + vfs_context_t a_context; } */ *ap; { struct union_node *un = VTOUNION(ap->a_vp); - struct proc *p = ap->a_p; + struct proc *p = vfs_context_proc(ap->a_context); + kauth_cred_t cred = vfs_context_ucred(ap->a_context); int error; /* @@ -700,11 +671,11 @@ union_setattr(ap) * by creating a zero length upper object. 
This is to * handle the case of open with O_TRUNC and O_CREAT. */ - if ((un->un_uppervp == NULLVP) && + if (VATTR_IS_ACTIVE(ap->a_vap, va_data_size) && + (un->un_uppervp == NULLVP) && /* assert(un->un_lowervp != NULLVP) */ (un->un_lowervp->v_type == VREG)) { - error = union_copyup(un, (ap->a_vap->va_size != 0), - ap->a_cred, ap->a_p); + error = union_copyup(un, (ap->a_vap->va_data_size != 0), cred, p); if (error) return (error); } @@ -715,10 +686,9 @@ union_setattr(ap) */ if (un->un_uppervp != NULLVP) { FIXUP(un, p); - error = VOP_SETATTR(un->un_uppervp, ap->a_vap, - ap->a_cred, ap->a_p); - if ((error == 0) && (ap->a_vap->va_size != VNOVAL)) - union_newsize(ap->a_vp, ap->a_vap->va_size, VNOVAL); + error = vnode_setattr(un->un_uppervp, ap->a_vap, ap->a_context); + if ((error == 0) && VATTR_IS_ACTIVE(ap->a_vap, va_data_size)) + union_newsize(ap->a_vp, ap->a_vap->va_data_size, VNOVAL); } else { error = EROFS; } @@ -728,25 +698,21 @@ union_setattr(ap) int union_read(ap) - struct vop_read_args /* { + struct vnop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { int error; - struct proc *p = ap->a_uio->uio_procp; + struct proc *p = vfs_context_proc(ap->a_context); struct vnode *vp = OTHERVP(ap->a_vp); int dolock = (vp == LOWERVP(ap->a_vp)); - if (dolock) - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - else + if (!dolock) FIXUP(VTOUNION(ap->a_vp), p); - error = VOP_READ(vp, ap->a_uio, ap->a_ioflag, ap->a_cred); - if (dolock) - VOP_UNLOCK(vp, 0, p); + error = VNOP_READ(vp, ap->a_uio, ap->a_ioflag, ap->a_context); /* * XXX @@ -772,24 +738,24 @@ union_read(ap) int union_write(ap) - struct vop_read_args /* { + struct vnop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { int error; struct vnode *vp; struct union_node *un = VTOUNION(ap->a_vp); - struct proc *p = ap->a_uio->uio_procp; + struct proc *p = vfs_context_proc(ap->a_context); vp = UPPERVP(ap->a_vp); if (vp == NULLVP) panic("union: missing upper layer in write"); FIXUP(un, p); - error = VOP_WRITE(vp, ap->a_uio, ap->a_ioflag, ap->a_cred); + error = VNOP_WRITE(vp, ap->a_uio, ap->a_ioflag, ap->a_context); /* * the size of the underlying object may be changed by the @@ -805,142 +771,109 @@ union_write(ap) return (error); } -union_lease(ap) - struct vop_lease_args /* { - struct vnode *a_vp; - struct proc *a_p; - struct ucred *a_cred; - int a_flag; - } */ *ap; -{ - register struct vnode *ovp = OTHERVP(ap->a_vp); - - ap->a_vp = ovp; - return (VCALL(ovp, VOFFSET(vop_lease), ap)); -} int union_ioctl(ap) - struct vop_ioctl_args /* { + struct vnop_ioctl_args /* { struct vnode *a_vp; int a_command; caddr_t a_data; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { register struct vnode *ovp = OTHERVP(ap->a_vp); ap->a_vp = ovp; - return (VCALL(ovp, VOFFSET(vop_ioctl), ap)); + return (VCALL(ovp, VOFFSET(vnop_ioctl), ap)); } int union_select(ap) - struct vop_select_args /* { + struct vnop_select_args /* { struct vnode *a_vp; int a_which; int a_fflags; - struct ucred *a_cred; void * a_wql; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { register struct vnode *ovp = OTHERVP(ap->a_vp); ap->a_vp = ovp; - return (VCALL(ovp, VOFFSET(vop_select), ap)); + return (VCALL(ovp, VOFFSET(vnop_select), ap)); } int union_revoke(ap) - struct vop_revoke_args /* { + struct vnop_revoke_args /* { struct vnode *a_vp; int a_flags; - struct proc *a_p; + vfs_context_t 
a_context; } */ *ap; { struct vnode *vp = ap->a_vp; if (UPPERVP(vp)) - VOP_REVOKE(UPPERVP(vp), ap->a_flags); + VNOP_REVOKE(UPPERVP(vp), ap->a_flags, ap->a_context); if (LOWERVP(vp)) - VOP_REVOKE(LOWERVP(vp), ap->a_flags); - vgone(vp); + VNOP_REVOKE(LOWERVP(vp), ap->a_flags, ap->a_context); + vnode_reclaim(vp); } int union_mmap(ap) - struct vop_mmap_args /* { + struct vnop_mmap_args /* { struct vnode *a_vp; int a_fflags; - struct ucred *a_cred; + kauth_cred_t a_cred; struct proc *a_p; } */ *ap; { register struct vnode *ovp = OTHERVP(ap->a_vp); ap->a_vp = ovp; - return (VCALL(ovp, VOFFSET(vop_mmap), ap)); + return (VCALL(ovp, VOFFSET(vnop_mmap), ap)); } int -union_fsync(ap) - struct vop_fsync_args /* { +union_fsync( + struct vnop_fsync_args /* { struct vnode *a_vp; - struct ucred *a_cred; int a_waitfor; - struct proc *a_p; - } */ *ap; + vfs_context_t a_context; + } */ *ap) { int error = 0; - struct proc *p = ap->a_p; + struct proc *p = vfs_context_proc(ap->a_context); struct vnode *targetvp = OTHERVP(ap->a_vp); if (targetvp != NULLVP) { int dolock = (targetvp == LOWERVP(ap->a_vp)); - if (dolock) - vn_lock(targetvp, LK_EXCLUSIVE | LK_RETRY, p); - else + if (!dolock) FIXUP(VTOUNION(ap->a_vp), p); - error = VOP_FSYNC(targetvp, ap->a_cred, ap->a_waitfor, p); - if (dolock) - VOP_UNLOCK(targetvp, 0, p); + error = VNOP_FSYNC(targetvp, ap->a_waitfor, ap->a_context); } return (error); } int -union_seek(ap) - struct vop_seek_args /* { - struct vnode *a_vp; - off_t a_oldoff; - off_t a_newoff; - struct ucred *a_cred; - } */ *ap; -{ - register struct vnode *ovp = OTHERVP(ap->a_vp); - - ap->a_vp = ovp; - return (VCALL(ovp, VOFFSET(vop_seek), ap)); -} - -int -union_remove(ap) - struct vop_remove_args /* { +union_remove( + struct vnop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; - } */ *ap; + vfs_context_t a_context; + } */ *ap) { int error; struct union_node *dun = VTOUNION(ap->a_dvp); struct union_node *un = VTOUNION(ap->a_vp); struct componentname *cnp = ap->a_cnp; - struct proc *p = cnp->cn_proc; + vfs_context_t ctx = cnp->cn_context; + struct proc *p = vfs_context_proc(ctx); if (dun->un_uppervp == NULLVP) panic("union remove: null upper vnode"); @@ -950,17 +883,13 @@ union_remove(ap) struct vnode *vp = un->un_uppervp; FIXUP(dun, p); - VREF(dvp); dun->un_flags |= UN_KLOCK; - vput(ap->a_dvp); FIXUP(un, p); - VREF(vp); un->un_flags |= UN_KLOCK; - vput(ap->a_vp); - if (union_dowhiteout(un, cnp->cn_cred, cnp->cn_proc)) + if (union_dowhiteout(un, cnp->cn_context)) cnp->cn_flags |= DOWHITEOUT; - error = VOP_REMOVE(dvp, vp, cnp); + error = VNOP_REMOVE(dvp, vp, cnp, 0, ap->a_context); if (!error) union_removed_upper(un); } else { @@ -968,24 +897,24 @@ union_remove(ap) error = union_mkwhiteout( MOUNTTOUNIONMOUNT(UNIONTOV(dun)->v_mount), dun->un_uppervp, ap->a_cnp, un->un_path); - vput(ap->a_dvp); - vput(ap->a_vp); } return (error); } int -union_link(ap) - struct vop_link_args /* { +union_link( + struct vnop_link_args /* { struct vnode *a_vp; struct vnode *a_tdvp; struct componentname *a_cnp; - } */ *ap; + vfs_context_t a_context; + } */ *ap) { int error = 0; struct componentname *cnp = ap->a_cnp; - struct proc *p = cnp->cn_proc; + vfs_context_t ctx = cnp->cn_context; + struct proc *p = vfs_context_proc(ctx); struct union_node *un; struct vnode *vp; struct vnode *tdvp; @@ -997,48 +926,41 @@ union_link(ap) } else { struct union_node *tun = VTOUNION(ap->a_vp); if (tun->un_uppervp == NULLVP) { - vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY, p); if (un->un_uppervp == 
tun->un_dirvp) { un->un_flags &= ~UN_ULOCK; - VOP_UNLOCK(un->un_uppervp, 0, p); } - error = union_copyup(tun, 1, cnp->cn_cred, p); + error = union_copyup(tun, 1, vfs_context_ucred(ctx), p); if (un->un_uppervp == tun->un_dirvp) { - vn_lock(un->un_uppervp, - LK_EXCLUSIVE | LK_RETRY, p); un->un_flags |= UN_ULOCK; } - VOP_UNLOCK(ap->a_vp, 0, p); } vp = tun->un_uppervp; } - tdvp = un->un_uppervp; if (tdvp == NULLVP) error = EROFS; if (error) { - vput(ap->a_tdvp); return (error); } FIXUP(un, p); - VREF(tdvp); + vnode_get(tdvp); un->un_flags |= UN_KLOCK; - vput(ap->a_tdvp); - return (VOP_LINK(vp, tdvp, cnp)); + return (VNOP_LINK(vp, tdvp, cnp, ap->a_context)); } int union_rename(ap) - struct vop_rename_args /* { + struct vnop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; + vfs_context_t a_context; } */ *ap; { int error; @@ -1062,8 +984,7 @@ union_rename(ap) } fdvp = un->un_uppervp; - VREF(fdvp); - vrele(ap->a_fdvp); + vnode_get(fdvp); } if (fvp->v_op == union_vnodeop_p) { /* always true */ @@ -1078,8 +999,7 @@ union_rename(ap) ap->a_fcnp->cn_flags |= DOWHITEOUT; fvp = un->un_uppervp; - VREF(fvp); - vrele(ap->a_fvp); + vnode_get(fvp); } if (tdvp->v_op == union_vnodeop_p) { @@ -1096,9 +1016,8 @@ union_rename(ap) } tdvp = un->un_uppervp; - VREF(tdvp); + vnode_get(tdvp); un->un_flags |= UN_KLOCK; - vput(ap->a_tdvp); } if (tvp != NULLVP && tvp->v_op == union_vnodeop_p) { @@ -1106,77 +1025,69 @@ union_rename(ap) tvp = un->un_uppervp; if (tvp != NULLVP) { - VREF(tvp); + vnode_get(tvp); un->un_flags |= UN_KLOCK; } - vput(ap->a_tvp); } - return (VOP_RENAME(fdvp, fvp, ap->a_fcnp, tdvp, tvp, ap->a_tcnp)); + return (VNOP_RENAME(fdvp, fvp, ap->a_fcnp, tdvp, tvp, ap->a_tcnp, ap->a_context)); bad: - vrele(fdvp); - vrele(fvp); - vput(tdvp); - if (tvp != NULLVP) - vput(tvp); - return (error); } int -union_mkdir(ap) - struct vop_mkdir_args /* { +union_mkdir( + struct vnop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; - } */ *ap; + struct vnode_attr *a_vap; + vfs_context_t a_context; + } */ *ap) { struct union_node *un = VTOUNION(ap->a_dvp); struct vnode *dvp = un->un_uppervp; struct componentname *cnp = ap->a_cnp; - struct proc *p = cnp->cn_proc; + vfs_context_t ctx = cnp->cn_context; + struct proc *p = vfs_context_proc(ctx); if (dvp != NULLVP) { int error; struct vnode *vp; FIXUP(un, p); - VREF(dvp); un->un_flags |= UN_KLOCK; - VOP_UNLOCK(ap->a_dvp, 0, p); - error = VOP_MKDIR(dvp, &vp, cnp, ap->a_vap); - if (error) { - vrele(ap->a_dvp); + + /* note that this is a direct fallthrough to the filesystem */ + error = VNOP_MKDIR(dvp, &vp, cnp, ap->a_vap, ap->a_context); + if (error) return (error); - } error = union_allocvp(ap->a_vpp, ap->a_dvp->v_mount, ap->a_dvp, NULLVP, cnp, vp, NULLVP, 1); - vrele(ap->a_dvp); if (error) - vput(vp); + vnode_put(vp); return (error); } - - vput(ap->a_dvp); return (EROFS); } int -union_rmdir(ap) - struct vop_rmdir_args /* { +union_rmdir( + struct vnop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; - } */ *ap; + vfs_context_t a_context; + } */ *ap) { int error; struct union_node *dun = VTOUNION(ap->a_dvp); struct union_node *un = VTOUNION(ap->a_vp); struct componentname *cnp = ap->a_cnp; - struct proc *p = cnp->cn_proc; + vfs_context_t ctx = cnp->cn_context; + struct proc *p = vfs_context_proc(ctx); if (dun->un_uppervp == NULLVP) panic("union rmdir: null upper vnode"); 
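/*
 * Illustrative sketch only, assembled from the remove/rmdir hunks of
 * this patch (it is not additional patch content): the explicit
 * cred/proc pair disappears and each consumer derives what it needs
 * from the single vfs_context_t carried in the componentname or in
 * the argument block.
 *
 *	old:	struct proc *p = cnp->cn_proc;
 *		if (union_dowhiteout(un, cnp->cn_cred, cnp->cn_proc))
 *			cnp->cn_flags |= DOWHITEOUT;
 *		error = VOP_REMOVE(dvp, vp, cnp);
 *
 *	new:	vfs_context_t ctx = cnp->cn_context;
 *		struct proc *p = vfs_context_proc(ctx);
 *		if (union_dowhiteout(un, cnp->cn_context))
 *			cnp->cn_flags |= DOWHITEOUT;
 *		error = VNOP_REMOVE(dvp, vp, cnp, 0, ap->a_context);
 */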
@@ -1186,17 +1097,15 @@ union_rmdir(ap) struct vnode *vp = un->un_uppervp; FIXUP(dun, p); - VREF(dvp); + vnode_get(dvp); dun->un_flags |= UN_KLOCK; - vput(ap->a_dvp); FIXUP(un, p); - VREF(vp); + vnode_get(vp); un->un_flags |= UN_KLOCK; - vput(ap->a_vp); - if (union_dowhiteout(un, cnp->cn_cred, cnp->cn_proc)) + if (union_dowhiteout(un, cnp->cn_context)) cnp->cn_flags |= DOWHITEOUT; - error = VOP_RMDIR(dvp, vp, ap->a_cnp); + error = VNOP_RMDIR(dvp, vp, ap->a_cnp, ap->a_context); if (!error) union_removed_upper(un); } else { @@ -1204,43 +1113,38 @@ union_rmdir(ap) error = union_mkwhiteout( MOUNTTOUNIONMOUNT(UNIONTOV(dun)->v_mount), dun->un_uppervp, ap->a_cnp, un->un_path); - vput(ap->a_dvp); - vput(ap->a_vp); } - return (error); } int -union_symlink(ap) - struct vop_symlink_args /* { +union_symlink( + struct vnop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; + struct vnode_attr *a_vap; char *a_target; - } */ *ap; + vfs_context_t a_context; + } */ *ap) { struct union_node *un = VTOUNION(ap->a_dvp); struct vnode *dvp = un->un_uppervp; struct componentname *cnp = ap->a_cnp; - struct proc *p = cnp->cn_proc; + vfs_context_t ctx = cnp->cn_context; + struct proc *p = vfs_context_proc(ctx); if (dvp != NULLVP) { int error; struct vnode *vp; - struct mount *mp = ap->a_dvp->v_mount; FIXUP(un, p); - VREF(dvp); un->un_flags |= UN_KLOCK; - vput(ap->a_dvp); - error = VOP_SYMLINK(dvp, &vp, cnp, ap->a_vap, ap->a_target); + + error = VNOP_SYMLINK(dvp, &vp, cnp, ap->a_vap, ap->a_target, ap->a_context); *ap->a_vpp = NULLVP; return (error); } - - vput(ap->a_dvp); return (EROFS); } @@ -1253,98 +1157,67 @@ union_symlink(ap) */ int union_readdir(ap) - struct vop_readdir_args /* { + struct vnop_readdir_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct uio *a_uio; - struct ucred *a_cred; + int a_flags; int *a_eofflag; - u_long *a_cookies; - int a_ncookies; + int *a_numdirent; + vfs_context_t a_context; } */ *ap; { struct union_node *un = VTOUNION(ap->a_vp); struct vnode *uvp = un->un_uppervp; - struct proc *p = ap->a_uio->uio_procp; + struct proc *p = vfs_context_proc(ap->a_context); + + if (ap->a_flags & (VNODE_READDIR_EXTENDED | VNODE_READDIR_REQSEEKOFF)) + return (EINVAL); if (uvp == NULLVP) return (0); FIXUP(un, p); ap->a_vp = uvp; - return (VCALL(uvp, VOFFSET(vop_readdir), ap)); + return (VCALL(uvp, VOFFSET(vnop_readdir), ap)); } int union_readlink(ap) - struct vop_readlink_args /* { + struct vnop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { int error; struct uio *uio = ap->a_uio; - struct proc *p = uio->uio_procp; + struct proc *p = vfs_context_proc(ap->a_context); struct vnode *vp = OTHERVP(ap->a_vp); int dolock = (vp == LOWERVP(ap->a_vp)); - if (dolock) - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - else - FIXUP(VTOUNION(ap->a_vp), p); + if (!dolock) + FIXUP(VTOUNION(ap->a_vp), p); ap->a_vp = vp; - error = VCALL(vp, VOFFSET(vop_readlink), ap); - if (dolock) - VOP_UNLOCK(vp, 0, p); - - return (error); -} - -int -union_abortop(ap) - struct vop_abortop_args /* { - struct vnode *a_dvp; - struct componentname *a_cnp; - } */ *ap; -{ - int error; - struct componentname *cnp = ap->a_cnp; - struct proc *p = cnp->cn_proc; - struct vnode *vp = OTHERVP(ap->a_dvp); - struct union_node *un = VTOUNION(ap->a_dvp); - int islocked = un->un_flags & UN_LOCKED; - int dolock = (vp == LOWERVP(ap->a_dvp)); - - if (islocked) { - if (dolock) - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, 
p); - else - FIXUP(VTOUNION(ap->a_dvp), p); - } - ap->a_dvp = vp; - error = VCALL(vp, VOFFSET(vop_abortop), ap); - if (islocked && dolock) - VOP_UNLOCK(vp, 0, p); + error = VCALL(vp, VOFFSET(vnop_readlink), ap); return (error); } int -union_inactive(ap) - struct vop_inactive_args /* { +union_inactive( + struct vnop_inactive_args /* { struct vnode *a_vp; - struct proc *a_p; - } */ *ap; + vfs_context_t a_context; + } */ *ap) { struct vnode *vp = ap->a_vp; - struct proc *p = ap->a_p; struct union_node *un = VTOUNION(vp); struct vnode **vpp; /* * Do nothing (and _don't_ bypass). - * Wait to vrele lowervp until reclaim, + * Wait to vnode_put lowervp until reclaim, * so that until then our union_node is in the * cache and reusable. * @@ -1357,23 +1230,22 @@ union_inactive(ap) if (un->un_dircache != 0) { for (vpp = un->un_dircache; *vpp != NULLVP; vpp++) - vrele(*vpp); + vnode_put(*vpp); _FREE(un->un_dircache, M_TEMP); un->un_dircache = 0; } - VOP_UNLOCK(vp, 0, p); - if ((un->un_flags & UN_CACHED) == 0) - vgone(vp); + vnode_recycle(vp); return (0); } int union_reclaim(ap) - struct vop_reclaim_args /* { + struct vnop_reclaim_args /* { struct vnode *a_vp; + vfs_context_t a_context; } */ *ap; { @@ -1383,155 +1255,15 @@ union_reclaim(ap) } int -union_lock(ap) - struct vop_lock_args *ap; -{ - struct vnode *vp = ap->a_vp; - struct proc *p = ap->a_p; - int flags = ap->a_flags; - struct union_node *un; - int error; - - - vop_nolock(ap); - /* - * Need to do real lockmgr-style locking here. - * in the mean time, draining won't work quite right, - * which could lead to a few race conditions. - * the following test was here, but is not quite right, we - * still need to take the lock: - if ((flags & LK_TYPE_MASK) == LK_DRAIN) - return (0); - */ - flags &= ~LK_INTERLOCK; - -start: - un = VTOUNION(vp); - - if (un->un_uppervp != NULLVP) { - if (((un->un_flags & UN_ULOCK) == 0) && - (vp->v_usecount != 0)) { - error = vn_lock(un->un_uppervp, flags, p); - if (error) - return (error); - un->un_flags |= UN_ULOCK; - } -#if DIAGNOSTIC - if (un->un_flags & UN_KLOCK) { - vprint("union: dangling klock", vp); - panic("union: dangling upper lock (%lx)", vp); - } -#endif - } - - if (un->un_flags & UN_LOCKED) { -#if DIAGNOSTIC - if (current_proc() && un->un_pid == current_proc()->p_pid && - un->un_pid > -1 && current_proc()->p_pid > -1) - panic("union: locking against myself"); -#endif - un->un_flags |= UN_WANT; - tsleep((caddr_t)&un->un_flags, PINOD, "unionlk2", 0); - goto start; - } - -#if DIAGNOSTIC - if (current_proc()) - un->un_pid = current_proc()->p_pid; - else - un->un_pid = -1; -#endif - - un->un_flags |= UN_LOCKED; - return (0); -} - -/* - * When operations want to vput() a union node yet retain a lock on - * the upper vnode (say, to do some further operations like link(), - * mkdir(), ...), they set UN_KLOCK on the union node, then call - * vput() which calls VOP_UNLOCK() and comes here. union_unlock() - * unlocks the union node (leaving the upper vnode alone), clears the - * KLOCK flag, and then returns to vput(). The caller then does whatever - * is left to do with the upper vnode, and ensures that it gets unlocked. - * - * If UN_KLOCK isn't set, then the upper vnode is unlocked here. 
- */ -int -union_unlock(ap) - struct vop_unlock_args /* { - struct vnode *a_vp; - int a_flags; - struct proc *a_p; - } */ *ap; -{ - struct union_node *un = VTOUNION(ap->a_vp); - struct proc *p = ap->a_p; - -#if DIAGNOSTIC - if ((un->un_flags & UN_LOCKED) == 0) - panic("union: unlock unlocked node"); - if (current_proc() && un->un_pid != current_proc()->p_pid && - current_proc()->p_pid > -1 && un->un_pid > -1) - panic("union: unlocking other process's union node"); -#endif - - un->un_flags &= ~UN_LOCKED; - - if ((un->un_flags & (UN_ULOCK|UN_KLOCK)) == UN_ULOCK) - VOP_UNLOCK(un->un_uppervp, 0, p); - - un->un_flags &= ~(UN_ULOCK|UN_KLOCK); - - if (un->un_flags & UN_WANT) { - un->un_flags &= ~UN_WANT; - wakeup((caddr_t) &un->un_flags); - } - -#if DIAGNOSTIC - un->un_pid = 0; -#endif - vop_nounlock(ap); - - return (0); -} - -int -union_bmap(ap) - struct vop_bmap_args /* { - struct vnode *a_vp; - daddr_t a_bn; - struct vnode **a_vpp; - daddr_t *a_bnp; - int *a_runp; - } */ *ap; -{ - int error; - struct proc *p = current_proc(); /* XXX */ - struct vnode *vp = OTHERVP(ap->a_vp); - int dolock = (vp == LOWERVP(ap->a_vp)); - - if (dolock) - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - else - FIXUP(VTOUNION(ap->a_vp), p); - ap->a_vp = vp; - error = VCALL(vp, VOFFSET(vop_bmap), ap); - if (dolock) - VOP_UNLOCK(vp, 0, p); - - return (error); -} - -int -union_cmap(ap) - struct vop_cmap_args /* { +union_blockmap(ap) + struct vnop_blockmap_args /* { struct vnode *a_vp; off_t a_offset; size_t a_size; - daddr_t *a_bpn; + daddr64_t *a_bpn; size_t *a_run; void *a_poff; + int a_flags; } */ *ap; { int error; @@ -1539,52 +1271,21 @@ union_cmap(ap) struct vnode *vp = OTHERVP(ap->a_vp); int dolock = (vp == LOWERVP(ap->a_vp)); - if (dolock) - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - else - FIXUP(VTOUNION(ap->a_vp), p); + if (!dolock) + FIXUP(VTOUNION(ap->a_vp), p); ap->a_vp = vp; - error = VCALL(vp, VOFFSET(vop_cmap), ap); - if (dolock) - VOP_UNLOCK(vp, 0, p); + error = VCALL(vp, VOFFSET(vnop_blockmap), ap); return (error); } -int -union_print(ap) - struct vop_print_args /* { - struct vnode *a_vp; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - - printf("\ttag VT_UNION, vp=%x, uppervp=%x, lowervp=%x\n", - vp, UPPERVP(vp), LOWERVP(vp)); - if (UPPERVP(vp) != NULLVP) - vprint("union: upper", UPPERVP(vp)); - if (LOWERVP(vp) != NULLVP) - vprint("union: lower", LOWERVP(vp)); - - return (0); -} - -int -union_islocked(ap) - struct vop_islocked_args /* { - struct vnode *a_vp; - } */ *ap; -{ - - return ((VTOUNION(ap->a_vp)->un_flags & UN_LOCKED) ? 
1 : 0); -} - int union_pathconf(ap) - struct vop_pathconf_args /* { + struct vnop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; + vfs_context_t a_context; } */ *ap; { int error; @@ -1592,43 +1293,40 @@ union_pathconf(ap) struct vnode *vp = OTHERVP(ap->a_vp); int dolock = (vp == LOWERVP(ap->a_vp)); - if (dolock) - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - else - FIXUP(VTOUNION(ap->a_vp), p); + if (!dolock) + FIXUP(VTOUNION(ap->a_vp), p); ap->a_vp = vp; - error = VCALL(vp, VOFFSET(vop_pathconf), ap); - if (dolock) - VOP_UNLOCK(vp, 0, p); + error = VCALL(vp, VOFFSET(vnop_pathconf), ap); return (error); } int union_advlock(ap) - struct vop_advlock_args /* { + struct vnop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; + vfs_context_t a_context; } */ *ap; { register struct vnode *ovp = OTHERVP(ap->a_vp); ap->a_vp = ovp; - return (VCALL(ovp, VOFFSET(vop_advlock), ap)); + return (VCALL(ovp, VOFFSET(vnop_advlock), ap)); } /* - * XXX - vop_strategy must be hand coded because it has no + * XXX - vnop_strategy must be hand coded because it has no * vnode in its arguments. * This goes away with a merged VM/buffer cache. */ int union_strategy(ap) - struct vop_strategy_args /* { + struct vnop_strategy_args /* { struct buf *a_bp; } */ *ap; { @@ -1636,41 +1334,41 @@ union_strategy(ap) int error; struct vnode *savedvp; - savedvp = bp->b_vp; - bp->b_vp = OTHERVP(bp->b_vp); + savedvp = buf_vnode(bp); + buf_setvnode(bp, OTHERVP(savedvp)); #if DIAGNOSTIC - if (bp->b_vp == NULLVP) + if (buf_vnode(bp) == NULLVP) panic("union_strategy: nil vp"); - if (((bp->b_flags & B_READ) == 0) && - (bp->b_vp == LOWERVP(savedvp))) + if (((buf_flags(bp) & B_READ) == 0) && + (buf_vnode(bp) == LOWERVP(savedvp))) panic("union_strategy: writing to lowervp"); #endif - error = VOP_STRATEGY(bp); - bp->b_vp = savedvp; + error = VNOP_STRATEGY(bp); + buf_setvnode(bp, savedvp); return (error); } /* Pagein */ +int union_pagein(ap) - struct vop_pagein_args /* { + struct vnop_pagein_args /* { struct vnode *a_vp, upl_t a_pl, vm_offset_t a_pl_offset, off_t a_f_offset, size_t a_size, - struct ucred *a_cred, int a_flags + vfs_context_t a_context; } */ *ap; { int error; - struct proc *p = current_proc(); struct vnode *vp = OTHERVP(ap->a_vp); - error = VOP_PAGEIN(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, - ap->a_size, ap->a_cred,ap->a_flags); + error = VNOP_PAGEIN(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, + ap->a_size, ap->a_flags, ap->a_context); /* * XXX @@ -1695,15 +1393,16 @@ union_pagein(ap) } /* Pageout */ +int union_pageout(ap) - struct vop_pageout_args /* { + struct vnop_pageout_args /* { struct vnode *a_vp, upl_t a_pl, vm_offset_t a_pl_offset, off_t a_f_offset, size_t a_size, - struct ucred *a_cred, int a_flags + vfs_context_t a_context; } */ *ap; { int error; @@ -1714,8 +1413,8 @@ union_pageout(ap) if (vp == NULLVP) panic("union: missing upper layer in pageout"); - error = VOP_PAGEOUT(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, - ap->a_size, ap->a_cred,ap->a_flags); + error = VNOP_PAGEOUT(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, + ap->a_size, ap->a_flags, ap->a_context); /* * the size of the underlying object may be changed by the @@ -1734,16 +1433,16 @@ union_pageout(ap) /* Blktooff derives file offset for the given logical block number */ int union_blktooff(ap) - struct vop_blktooff_args /* { + struct vnop_blktooff_args /* { struct vnode *a_vp; - daddr_t a_lblkno; + daddr64_t a_lblkno; off_t *a_offset; } */ *ap; { int error; struct vnode *vp = 
OTHERVP(ap->a_vp); - error = VOP_BLKTOOFF(vp, ap->a_lblkno, ap->a_offset); + error = VNOP_BLKTOOFF(vp, ap->a_lblkno, ap->a_offset); return(error); } @@ -1751,16 +1450,16 @@ union_blktooff(ap) /* offtoblk derives file offset for the given logical block number */ int union_offtoblk(ap) - struct vop_offtoblk_args /* { + struct vnop_offtoblk_args /* { struct vnode *a_vp; off_t a_offset; - daddr_t *a_lblkno; + daddr64_t *a_lblkno; } */ *ap; { int error; struct vnode *vp = OTHERVP(ap->a_vp); - error = VOP_OFFTOBLK(vp, ap->a_offset, ap->a_lblkno); + error = VNOP_OFFTOBLK(vp, ap->a_offset, ap->a_lblkno); return(error); } @@ -1772,58 +1471,45 @@ union_offtoblk(ap) */ int (**union_vnodeop_p)(void *); struct vnodeopv_entry_desc union_vnodeop_entries[] = { - { &vop_default_desc, (VOPFUNC)vn_default_error }, - { &vop_lookup_desc, (VOPFUNC)union_lookup }, /* lookup */ - { &vop_create_desc, (VOPFUNC)union_create }, /* create */ - { &vop_whiteout_desc, (VOPFUNC)union_whiteout }, /* whiteout */ - { &vop_mknod_desc, (VOPFUNC)union_mknod }, /* mknod */ - { &vop_open_desc, (VOPFUNC)union_open }, /* open */ - { &vop_close_desc, (VOPFUNC)union_close }, /* close */ - { &vop_access_desc, (VOPFUNC)union_access }, /* access */ - { &vop_getattr_desc, (VOPFUNC)union_getattr }, /* getattr */ - { &vop_setattr_desc, (VOPFUNC)union_setattr }, /* setattr */ - { &vop_read_desc, (VOPFUNC)union_read }, /* read */ - { &vop_write_desc, (VOPFUNC)union_write }, /* write */ - { &vop_lease_desc, (VOPFUNC)union_lease }, /* lease */ - { &vop_ioctl_desc, (VOPFUNC)union_ioctl }, /* ioctl */ - { &vop_select_desc, (VOPFUNC)union_select }, /* select */ - { &vop_revoke_desc, (VOPFUNC)union_revoke }, /* revoke */ - { &vop_mmap_desc, (VOPFUNC)union_mmap }, /* mmap */ - { &vop_fsync_desc, (VOPFUNC)union_fsync }, /* fsync */ - { &vop_seek_desc, (VOPFUNC)union_seek }, /* seek */ - { &vop_remove_desc, (VOPFUNC)union_remove }, /* remove */ - { &vop_link_desc, (VOPFUNC)union_link }, /* link */ - { &vop_rename_desc, (VOPFUNC)union_rename }, /* rename */ - { &vop_mkdir_desc, (VOPFUNC)union_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (VOPFUNC)union_rmdir }, /* rmdir */ - { &vop_symlink_desc, (VOPFUNC)union_symlink }, /* symlink */ - { &vop_readdir_desc, (VOPFUNC)union_readdir }, /* readdir */ - { &vop_readlink_desc, (VOPFUNC)union_readlink }, /* readlink */ - { &vop_abortop_desc, (VOPFUNC)union_abortop }, /* abortop */ - { &vop_inactive_desc, (VOPFUNC)union_inactive }, /* inactive */ - { &vop_reclaim_desc, (VOPFUNC)union_reclaim }, /* reclaim */ - { &vop_lock_desc, (VOPFUNC)union_lock }, /* lock */ - { &vop_unlock_desc, (VOPFUNC)union_unlock }, /* unlock */ - { &vop_bmap_desc, (VOPFUNC)union_bmap }, /* bmap */ - { &vop_strategy_desc, (VOPFUNC)union_strategy }, /* strategy */ - { &vop_print_desc, (VOPFUNC)union_print }, /* print */ - { &vop_islocked_desc, (VOPFUNC)union_islocked }, /* islocked */ - { &vop_pathconf_desc, (VOPFUNC)union_pathconf }, /* pathconf */ - { &vop_advlock_desc, (VOPFUNC)union_advlock }, /* advlock */ + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)union_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)union_create }, /* create */ + { &vnop_whiteout_desc, (VOPFUNC)union_whiteout }, /* whiteout */ + { &vnop_mknod_desc, (VOPFUNC)union_mknod }, /* mknod */ + { &vnop_open_desc, (VOPFUNC)union_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)union_close }, /* close */ + { &vnop_access_desc, (VOPFUNC)union_access }, /* access */ + { &vnop_getattr_desc, (VOPFUNC)union_getattr }, /* getattr */ + 
{ &vnop_setattr_desc, (VOPFUNC)union_setattr }, /* setattr */ + { &vnop_read_desc, (VOPFUNC)union_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)union_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)union_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)union_select }, /* select */ + { &vnop_revoke_desc, (VOPFUNC)union_revoke }, /* revoke */ + { &vnop_mmap_desc, (VOPFUNC)union_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)union_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)union_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)union_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)union_rename }, /* rename */ + { &vnop_mkdir_desc, (VOPFUNC)union_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)union_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)union_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)union_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)union_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)union_inactive }, /* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)union_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)union_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)union_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)union_advlock }, /* advlock */ #ifdef notdef - { &vop_blkatoff_desc, (VOPFUNC)union_blkatoff }, /* blkatoff */ - { &vop_valloc_desc, (VOPFUNC)union_valloc }, /* valloc */ - { &vop_vfree_desc, (VOPFUNC)union_vfree }, /* vfree */ - { &vop_truncate_desc, (VOPFUNC)union_truncate }, /* truncate */ - { &vop_update_desc, (VOPFUNC)union_update }, /* update */ - { &vop_bwrite_desc, (VOPFUNC)union_bwrite }, /* bwrite */ + { &vnop_bwrite_desc, (VOPFUNC)union_bwrite }, /* bwrite */ #endif - { &vop_pagein_desc, (VOPFUNC)union_pagein }, /* Pagein */ - { &vop_pageout_desc, (VOPFUNC)union_pageout }, /* Pageout */ - { &vop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ - { &vop_blktooff_desc, (VOPFUNC)union_blktooff }, /* blktooff */ - { &vop_offtoblk_desc, (VOPFUNC)union_offtoblk }, /* offtoblk */ - { &vop_cmap_desc, (VOPFUNC)union_cmap }, /* cmap */ + { &vnop_pagein_desc, (VOPFUNC)union_pagein }, /* Pagein */ + { &vnop_pageout_desc, (VOPFUNC)union_pageout }, /* Pageout */ + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ + { &vnop_blktooff_desc, (VOPFUNC)union_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)union_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (VOPFUNC)union_blockmap }, /* blockmap */ { (struct vnodeop_desc*)NULL, (int(*)())NULL } }; struct vnodeopv_desc union_vnodeop_opv_desc = diff --git a/bsd/miscfs/volfs/volfs.h b/bsd/miscfs/volfs/volfs.h index 939f555ec..0b083ee7d 100644 --- a/bsd/miscfs/volfs/volfs.h +++ b/bsd/miscfs/volfs/volfs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -19,10 +19,7 @@ * * @APPLE_LICENSE_HEADER_END@ */ -/* Copyright (c) 1998, Apple Computer, Inc. All rights reserved. 
*/ -/* - * Header file for volfs - */ + #ifndef __VOLFS_VOLFS_H__ #define __VOLFS_VOLFS_H__ @@ -32,7 +29,6 @@ struct volfs_mntdata { struct vnode *volfs_rootvp; - LIST_HEAD(volfs_fsvnodelist, vnode) volfs_fsvnodes; }; /* @@ -46,9 +42,9 @@ struct volfs_mntdata struct volfs_vndata { int vnode_type; - struct lock__bsd__ lock; unsigned int nodeID; /* the dev entry of a file system */ struct mount * fs_mount; + fsid_t fs_fsid; }; #define MAXVLFSNAMLEN 24 /* max length is really 10, pad to 24 since @@ -61,140 +57,11 @@ struct volfs_vndata #define MAXPLCENTRIES 250 #define PLCHASHSIZE 128 -extern int (**volfs_vnodeop_p)(void *); -__BEGIN_DECLS - -int volfs_mount __P((struct mount *, char *, caddr_t, struct nameidata *, - struct proc *)); -int volfs_start __P((struct mount *, int, struct proc *)); -int volfs_unmount __P((struct mount *, int, struct proc *)); -int volfs_root __P((struct mount *, struct vnode **)); -int volfs_quotactl __P((struct mount *, int, uid_t, caddr_t, - struct proc *)); -int volfs_statfs __P((struct mount *, struct statfs *, struct proc *)); -int volfs_sync __P((struct mount *, int, struct ucred *, struct proc *)); -int volfs_vget __P((struct mount *, void *ino_t, struct vnode **)); -int volfs_fhtovp __P((struct mount *, struct fid *, struct mbuf *, - struct vnode **, int *, struct ucred **)); -int volfs_vptofh __P((struct vnode *, struct fid *)); -int volfs_init __P((struct vfsconf *)); -int volfs_sysctl __P((int *, u_int, void *, size_t *, void *, size_t, - struct proc *)); - -int volfs_reclaim __P((struct vop_reclaim_args*)); -int volfs_access __P((struct vop_access_args *)); -int volfs_getattr __P((struct vop_getattr_args *)); -int volfs_select __P((struct vop_select_args *)); -int volfs_rmdir __P((struct vop_rmdir_args *)); -int volfs_readdir __P((struct vop_readdir_args *)); -int volfs_lock __P((struct vop_lock_args *)); -int volfs_unlock __P((struct vop_unlock_args *)); -int volfs_islocked __P((struct vop_islocked_args *)); -int volfs_pathconf __P((struct vop_pathconf_args *)); -int volfs_lookup __P((struct vop_lookup_args *)); -__END_DECLS #define VTOVL(VP) ((struct volfs_vndata *)((VP)->v_data)) #define PRINTIT kprintf -#if VOLFS_DEBUG - #define DBG_VOP_TEST_LOCKS 1 - #define DBG_FUNC_NAME(FSTR) static char *funcname = FSTR - #define DBG_PRINT_FUNC_NAME() PRINTIT("%s\n", funcname); - #define DBG_VOP_PRINT_FUNCNAME() PRINTIT("%s: ", funcname); - #define DBG_VOP_PRINT_CPN_INFO(CN) PRINTIT("name: %s",(CN)->cn_nameptr); - #define DBG_VOP(STR) PRINTIT STR; - #define DBG_VOP_PRINT_VNODE_INFO(VP) { if ((VP)) \ - { if ((VP)->v_tag == VT_NON) \ - PRINTIT("\tfs:%s id: %d v: 0x%x ", VTOVL(VP)->fs_mount->mnt_stat.f_fstypename, VTOVL(VP)->nodeID, (u_int)(VP)); \ - else PRINTIT("\t%s v: 0x%x ", (VP)->v_mount->mnt_stat.f_fstypename, (u_int)(VP)); \ - } else { PRINTIT("*** NULL NODE ***"); } } - -#else /* VOLFS_DEBUG */ - #define DBG_VOP_TEST_LOCKS 0 - #define DBG_FUNC_NAME(FSTR) - #define DBG_PRINT_FUNC_NAME() - #define DBG_VOP_PRINT_FUNCNAME() - #define DBG_VOP_PRINT_CPN_INFO(CN) - #define DBG_VOP(A) - #define DBG_VOP_PRINT_VNODE_INFO(VP) -#endif /* VOLFS_DEBUG */ - - -#if DBG_VOP_TEST_LOCKS - -#define VOPDBG_IGNORE 0 -#define VOPDBG_LOCKED 1 -#define VOPDBG_UNLOCKED -1 -#define VOPDBG_LOCKNOTNIL 2 -#define VOPDBG_SAME 3 - -#define VOPDBG_ZERO 0 -#define VOPDBG_POS 1 - - -#define MAXDBGLOCKS 15 - -typedef struct VopDbgStoreRec { - short id; - struct vnode *vp; - short inState; - short outState; - short errState; - int inValue; - int outValue; - } VopDbgStoreRec; - - -/* This sets 
up the test for the lock state of vnodes. The entry paramaters are: - * I = index of paramater - * VP = pointer to a vnode - * ENTRYSTATE = the inState of the lock - * EXITSTATE = the outState of the lock - * ERRORSTATE = the error state of the lock - * It initializes the structure, does some preliminary validity checks, but does nothing - * if the instate is set to be ignored. - */ - -#define DBG_VOP_LOCKS_DECL(I) VopDbgStoreRec VopDbgStore[I];short numOfLockSlots=I -#define DBG_VOP_LOCKS_INIT(I,VP,ENTRYSTATE,EXITSTATE,ERRORSTATE,CHECKFLAG) \ - if (I >= numOfLockSlots) { \ - PRINTIT("%s: DBG_VOP_LOCKS_INIT: Entry #%d greater than allocated slots!\n", funcname, I); \ - }; \ - VopDbgStore[I].id = I; \ - VopDbgStore[I].vp = (VP); \ - VopDbgStore[I].inState = ENTRYSTATE; \ - VopDbgStore[I].outState = EXITSTATE; \ - VopDbgStore[I].errState = ERRORSTATE; \ - VopDbgStore[I].inValue = 0; \ - VopDbgStore[I].outValue = 0; \ - if ((VopDbgStore[I].inState != VOPDBG_IGNORE)) { \ - if ((VP) == NULL) \ - PRINTIT ("%s: DBG_VOP_LOCK on start: Null vnode ptr\n", funcname); \ - else \ - VopDbgStore[I].inValue = lockstatus (&((struct volfs_vndata *)((VP)->v_data))->lock); \ - } \ - if ((VP) != NULL) \ - { \ - if (CHECKFLAG==VOPDBG_POS && (VP)->v_usecount <= 0) \ - PRINTIT("%s: BAD USECOUNT OF %d !!!!\n", funcname, (VP)->v_usecount); \ - else if ((VP)->v_usecount < 0) \ - PRINTIT("%s: BAD USECOUNT OF %d !!!!\n", funcname, (VP)->v_usecount); \ - } -#define DBG_VOP_UPDATE_VP(I, VP) \ - VopDbgStore[I].vp = (VP); - - -#define DBG_VOP_LOCKS_TEST(status) DbgVopTest (numOfLockSlots, status, VopDbgStore, funcname); - -#else /*DBG_VOP_TEST_LOCKS */ -#define DBG_VOP_LOCKS_DECL(A) -#define DBG_VOP_LOCKS_INIT(A,B,C,D,E,F) -#define DBG_VOP_LOCKS_TEST(a) -#define DBG_VOP_UPDATE_VP(I, VP) - -#endif /* DBG_VOP_TEST_LOCKS */ #endif /* __APPLE_API_PRIVATE */ #endif /* __VOLFS_VOLFS_H__ */ diff --git a/bsd/miscfs/volfs/volfs_vfsops.c b/bsd/miscfs/volfs/volfs_vfsops.c index d92ee4c62..6cdd7f2ed 100644 --- a/bsd/miscfs/volfs/volfs_vfsops.c +++ b/bsd/miscfs/volfs/volfs_vfsops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -19,13 +19,6 @@ * * @APPLE_LICENSE_HEADER_END@ */ -/* Copyright (c) 1998 Apple Computer, Inc. All Rights Reserved */ -/* - * Change History: - * - * 29-May-1998 Pat Dirks Changed to cache pointer to root vnode until unmount. 
- * - */ #include <sys/param.h> #include <sys/systm.h> @@ -35,8 +28,7 @@ #include <mach/machine/vm_types.h> #include <sys/vnode.h> #include <sys/socket.h> -#include <sys/mount.h> -#include <sys/buf.h> +#include <sys/mount_internal.h> #include <sys/mbuf.h> #include <sys/file.h> #include <sys/disk.h> @@ -48,13 +40,27 @@ #include <miscfs/specfs/specdev.h> #include "volfs.h" +static int volfs_mount(struct mount *, vnode_t , user_addr_t, vfs_context_t); +static int volfs_start(struct mount *, int, vfs_context_t); +static int volfs_unmount(struct mount *, int, vfs_context_t); +static int volfs_root(struct mount *, struct vnode **, vfs_context_t); +static int volfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t context); +static int volfs_sync(struct mount *, int, vfs_context_t); +static int volfs_vget(struct mount *, ino64_t, struct vnode **, vfs_context_t); +static int volfs_fhtovp(struct mount *, int, unsigned char *, struct vnode **, vfs_context_t); +static int volfs_vptofh(struct vnode *, int *, unsigned char *, vfs_context_t); +static int volfs_init(struct vfsconf *); +static int volfs_sysctl(int *, u_int, user_addr_t, size_t *, user_addr_t, size_t, vfs_context_t); +void volfs_load(int loadArgument); + + struct vfsops volfs_vfsops = { volfs_mount, volfs_start, volfs_unmount, volfs_root, - volfs_quotactl, - volfs_statfs, + NULL, /* quotactl */ + volfs_vfs_getattr, volfs_sync, volfs_vget, volfs_fhtovp, @@ -63,17 +69,19 @@ struct vfsops volfs_vfsops = { volfs_sysctl }; -static char volfs_fs_name[MFSNAMELEN] = "volfs"; +// static char volfs_fs_name[MFSNAMELEN] = "volfs"; extern struct vnodeopv_desc volfs_vnodeop_opv_desc; +extern int (**volfs_vnodeop_p)(void *); + /* The following refer to kernel global variables used in the loading/initialization: */ -extern int maxvfsslots; /* Total number of slots in the system's vfsconf table */ -extern int maxvfsconf; /* The highest fs type number [old-style ID] in use [dispite its name] */ extern int vfs_opv_numops; /* The total number of defined vnode operations */ extern int kdp_flag; void -volfs_load(int loadArgument) { +volfs_load(__unused int loadArgument) +{ +#if 0 struct vfsconf *vfsconflistentry; int entriesRemaining; struct vfsconf *newvfsconf = NULL; @@ -82,9 +90,7 @@ volfs_load(int loadArgument) { int (***opv_desc_vector_p)(); int (**opv_desc_vector)(); struct vnodeopv_entry_desc *opve_descp; - -#pragma unused(loadArgument) - + /* * This routine is responsible for all the initialization that would * ordinarily be done as part of the system startup; it calls volfs_init @@ -99,7 +105,6 @@ volfs_load(int loadArgument) { This becomes irrelevant when volfs is compiled into the list. */ - DBG_VOP(("load_volfs: Scanning vfsconf list...\n")); vfsconflistentry = vfsconf; for (entriesRemaining = maxvfsslots; entriesRemaining > 0; --entriesRemaining) { if (vfsconflistentry->vfc_vfsops != NULL) { @@ -123,8 +128,7 @@ volfs_load(int loadArgument) { }; if (newvfsconf) { - DBG_VOP(("load_volfs: filling in vfsconf entry at 0x%08lX; lastentry = 0x%08lX.\n", (long)newvfsconf, (long)lastentry)); - newvfsconf->vfc_vfsops = &volfs_vfsops; + newvfsconf->vfc_vfsops = &volfs_vfsops; strncpy(&newvfsconf->vfc_name[0], "volfs", MFSNAMELEN); newvfsconf->vfc_typenum = maxvfsconf++; newvfsconf->vfc_refcount = 0; @@ -141,8 +145,6 @@ volfs_load(int loadArgument) { /* Based on vfs_op_init and ... 
*/ opv_desc_vector_p = volfs_vnodeop_opv_desc.opv_desc_vector_p; - DBG_VOP(("load_volfs: Allocating and initializing VNode ops vector...\n")); - /* * Allocate and init the vector. * Also handle backwards compatibility. @@ -173,10 +175,7 @@ volfs_load(int loadArgument) { * list of supported operations. */ if (opve_descp->opve_op->vdesc_offset == 0 && - opve_descp->opve_op->vdesc_offset != VOFFSET(vop_default)) { - DBG_VOP(("load_volfs: operation %s not listed in %s.\n", - opve_descp->opve_op->vdesc_name, - "vfs_op_descs")); + opve_descp->opve_op->vdesc_offset != VOFFSET(vnop_default)) { panic ("load_volfs: bad operation"); } /* @@ -197,17 +196,19 @@ volfs_load(int loadArgument) { * Force every operations vector to have a default routine. */ opv_desc_vector = *opv_desc_vector_p; - if (opv_desc_vector[VOFFSET(vop_default)]==NULL) { + if (opv_desc_vector[VOFFSET(vnop_default)]==NULL) { panic("load_vp;fs: operation vector without default routine."); } for (j = 0;j<vfs_opv_numops; j++) if (opv_desc_vector[j] == NULL) opv_desc_vector[j] = - opv_desc_vector[VOFFSET(vop_default)]; + opv_desc_vector[VOFFSET(vnop_default)]; - DBG_VOP(("load_volfs: calling volfs_init()...\n")); - volfs_init(newvfsconf); + volfs_init(newvfsconf); }; +#else + panic("volfs load not ported"); +#endif } /* @@ -215,66 +216,70 @@ volfs_load(int loadArgument) { * * mount system call */ -int -volfs_mount(mp, path, data, ndp, p) - register struct mount *mp; - char *path; - caddr_t data; - struct nameidata *ndp; - struct proc *p; +static int +volfs_mount(struct mount *mp, __unused vnode_t devvp, __unused user_addr_t data, __unused vfs_context_t context) { struct volfs_mntdata *priv_mnt_data; - struct vnode *root_vp; - struct volfs_vndata *priv_vn_data; - int error; - size_t size; + struct vnode *root_vp; + struct volfs_vndata *priv_vn_data; + int error; + struct vnode_fsparam vfsp; - DBG_VOP(("volfs_mount called\n")); MALLOC(priv_mnt_data, struct volfs_mntdata *, sizeof(struct volfs_mntdata), M_VOLFSMNT, M_WAITOK); - DBG_VOP(("MALLOC succeeded\n")); - LIST_INIT(&priv_mnt_data->volfs_fsvnodes); - DBG_VOP(("LIST_INIT succeeded\n")); mp->mnt_data = (void *)priv_mnt_data; - strcpy(mp->mnt_stat.f_fstypename, "volfs"); - (void) copyinstr(path, mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname) - 1, &size); - strcpy(mp->mnt_stat.f_mntfromname, "<volfs>"); + strcpy(mp->mnt_vfsstat.f_fstypename, "volfs"); + strcpy(mp->mnt_vfsstat.f_mntfromname, "<volfs>"); /* Set up the root vnode for fast reference in the future. Note that the root is maintained unlocked but with a pos. ref count until unmount. 
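 *
 * (In the replacement code below the vnode comes from vnode_create()
 * with a filled-in struct vnode_fsparam -- vnfs_markroot set and
 * caching disabled via VNFS_NOCACHE | VNFS_CANTCACHE -- instead of
 * getnewvnode(); the long-lived reference is then taken explicitly
 * with vnode_ref(), the create-time iocount dropped with vnode_put(),
 * and the mount assigned a fresh fsid through vfs_getnewfsid().)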
*/ - MALLOC(priv_vn_data, struct volfs_vndata *, sizeof(struct volfs_vndata), M_VOLFSNODE, M_WAITOK); - error = getnewvnode(VT_VOLFS, mp, volfs_vnodeop_p, &root_vp); - if (error != 0) - { + MALLOC(priv_vn_data, struct volfs_vndata *, sizeof(struct volfs_vndata), M_VOLFSNODE, M_WAITOK); + + priv_vn_data->vnode_type = VOLFS_ROOT; + priv_vn_data->nodeID = ROOT_DIRID; + priv_vn_data->fs_mount = mp; + priv_vn_data->fs_fsid = mp->mnt_vfsstat.f_fsid; + + vfsp.vnfs_mp = mp; + vfsp.vnfs_vtype = VDIR; + vfsp.vnfs_str = "volfs"; + vfsp.vnfs_dvp = 0; + vfsp.vnfs_fsnode = priv_vn_data; + vfsp.vnfs_cnp = 0; + vfsp.vnfs_vops = volfs_vnodeop_p; + vfsp.vnfs_rdev = 0; + vfsp.vnfs_filesize = 0; + vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE; + vfsp.vnfs_marksystem = 0; + vfsp.vnfs_markroot = 1; + + error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &root_vp); + if (error != 0) { FREE(priv_mnt_data, M_VOLFSMNT); FREE(priv_vn_data, M_VOLFSNODE); - DBG_VOP(("getnewvnode failed with error code %d\n", error)); return(error); } - root_vp->v_type = VDIR; - root_vp->v_flag |= VROOT; - lockinit(&priv_vn_data->lock, PINOD, "volfsnode", 0, 0); - priv_vn_data->vnode_type = VOLFS_ROOT; - priv_vn_data->nodeID = 0; - priv_vn_data->fs_mount = mp; - root_vp->v_data = priv_vn_data; - - priv_mnt_data->volfs_rootvp = root_vp; + vnode_ref(root_vp); + vnode_put(root_vp); + + /* obtain a new fsid for the mount point */ + vfs_getnewfsid(mp); + + vnode_settag(root_vp, VT_VOLFS); - mp->mnt_flag &= ~MNT_RDONLY; + priv_mnt_data->volfs_rootvp = root_vp; + mp->mnt_flag &= ~MNT_RDONLY; + + mp->mnt_vtable->vfc_threadsafe = TRUE; - return (0); + return (0); } -int -volfs_start(mp, flags, p) -struct mount * mp; -int flags; -struct proc * p; +static int +volfs_start(__unused struct mount * mp, __unused int flags, __unused vfs_context_t context) { - DBG_VOP(("volfs_start called\n")); return (0); } @@ -282,58 +287,33 @@ struct proc * p; * Return the root of a filesystem. For volfs the root vnode is a directory * containing the list of all filesystems volfs can work with. */ -int -volfs_root(mp, vpp) - struct mount *mp; - struct vnode **vpp; +static int +volfs_root(struct mount *mp, struct vnode **vpp, __unused vfs_context_t context) { struct volfs_mntdata *priv_data; - // struct volfs_vndata *priv_vn_data; - // int error; - DBG_VOP(("volfs_root called\n")); priv_data = (struct volfs_mntdata *)mp->mnt_data; - if (priv_data->volfs_rootvp) { - vref(priv_data->volfs_rootvp); - VOP_LOCK(priv_data->volfs_rootvp, LK_EXCLUSIVE, current_proc()); + if (priv_data->volfs_rootvp) { + vnode_get(priv_data->volfs_rootvp); *vpp = priv_data->volfs_rootvp; } else { panic("volfs: root vnode missing!"); - }; - - DBG_VOP(("volfs_root returned with ")); - DBG_VOP_PRINT_VNODE_INFO(*vpp);DBG_VOP(("\n")); - - return(0); -} + }; -int -volfs_quotactl(mp, cmds, uid, arg, p) -struct mount *mp; -int cmds; -uid_t uid; -caddr_t arg; -struct proc * p; -{ - DBG_VOP(("volfs_quotactl called\n")); - return (0); + return(0); } /* * unmount system call */ -int -volfs_unmount(mp, mntflags, p) - struct mount *mp; - int mntflags; - struct proc *p; +static int +volfs_unmount(struct mount *mp, __unused int mntflags, __unused vfs_context_t context) { struct volfs_mntdata *priv_data; struct vnode *root_vp; int retval; - DBG_VOP(("volfs_unmount called\n")); priv_data = (struct volfs_mntdata *)mp->mnt_data; root_vp = priv_data->volfs_rootvp; @@ -344,23 +324,17 @@ volfs_unmount(mp, mntflags, p) Note that there's no need to vget() or vref() it before locking it here: the ref. 
count has been maintained at +1 ever since mount time. */ if (root_vp) { - retval = vn_lock(root_vp, LK_EXCLUSIVE, p); - if (retval) goto Err_Exit; - if (root_vp->v_usecount > 1) { - DBG_VOP(("VOLFS ERROR: root vnode = %x, usecount = %d\n", (int)root_vp, priv_data->volfs_rootvp->v_usecount)); - VOP_UNLOCK(root_vp, 0, p); - retval = EBUSY; + if (vnode_isinuse(root_vp, 1)) { + retval = EBUSY; goto Err_Exit; }; priv_data->volfs_rootvp = NULL; - vput(root_vp); /* This drops volfs's own refcount */ - vgone(root_vp); + vnode_rele(root_vp); /* This drops volfs's own refcount */ + vnode_reclaim(root_vp); }; /* All vnodes should be gone, and no errors, clean up the last */ - /* XXX DBG_ASSERT(mp->mnt_vnodelist.lh_first == NULL); */ - /* XXX DBG_ASSERT(retval == 0); */ mp->mnt_data = NULL; FREE(priv_data, M_VOLFSMNT); @@ -373,122 +347,78 @@ Err_Exit: /* * Get file system statistics. */ -int -volfs_statfs(mp, sbp, p) - struct mount *mp; - register struct statfs *sbp; - struct proc *p; +static int +volfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t context) { - DBG_VOP(("volfs_statfs called\n")); - sbp->f_bsize = 512; - sbp->f_iosize = 512; - sbp->f_blocks = 1024; // lies, darn lies and virtual file systems - sbp->f_bfree = 0; // Nope, can't write here! - sbp->f_bavail = 0; - sbp->f_files = 0; // Hmmm...maybe later - sbp->f_ffree = 0; - return (0); + VFSATTR_RETURN(fsap, f_bsize, 512); + VFSATTR_RETURN(fsap, f_iosize, 512); + VFSATTR_RETURN(fsap, f_blocks, 1024); + VFSATTR_RETURN(fsap, f_bfree, 0); + VFSATTR_RETURN(fsap, f_bavail, 0); + VFSATTR_RETURN(fsap, f_bused, 1024); + VFSATTR_RETURN(fsap, f_files, 0); + VFSATTR_RETURN(fsap, f_ffree, 0); + VFSATTR_RETURN(fsap, f_fssubtype, 0); + return 0; } /* * volfs doesn't have any data and you can't write into any of the volfs * structures, so don't do anything */ -int -volfs_sync(mp, waitfor, cred, p) - struct mount *mp; - int waitfor; - struct ucred *cred; - struct proc *p; +static int +volfs_sync(__unused struct mount *mp, __unused int waitfor, __unused vfs_context_t context) { -// DBG_VOP(("volfs_sync called\n")); - - /* Release a few entries from the permissions cache to keep them from getting stale. - * Since sync is called at least every 30 seconds or so, releasing 1/20 of the cache - * every time through should free all entries in no less than 10 minutes, which should - * be adequate to prevent pid-wrapping from mis-associating PLC entries: - */ - volfs_PLC_reclaim_entries(MAXPLCENTRIES / 20); - return 0; } + /* - * Look up a FFS dinode number to find its incore vnode, otherwise read it - * in from disk. If it is in core, wait for the lock bit to clear, then - * return the inode locked. Detection and handling of mount points must be - * done by the calling routine. + * */ -int -volfs_vget(mp, ino, vpp) - struct mount *mp; - void *ino; - struct vnode **vpp; +static int +volfs_vget(__unused struct mount *mp, __unused ino64_t ino, + __unused struct vnode **vpp, __unused vfs_context_t context) { -// DBG_VOP(("volfs_vget called\n")); - return(0); + return(ENOTSUP); } + /* * File handle to vnode - * - * Have to be really careful about stale file handles: - * - check that the inode number is valid - * - call ffs_vget() to get the locked inode - * - check for an unallocated inode (i_mode == 0) - * - check that the given client host has export rights and return - * those rights via. 
exflagsp and credanonp */ -int -volfs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) - register struct mount *mp; - struct fid *fhp; - struct mbuf *nam; - struct vnode **vpp; - int *exflagsp; - struct ucred **credanonp; +static int +volfs_fhtovp(__unused struct mount *mp, __unused int fhlen, + __unused unsigned char *fhp, __unused struct vnode **vpp, + __unused vfs_context_t context) { - DBG_VOP(("volfs_fhtovp called\n")); - return(0); + return(ENOTSUP); } + /* * Vnode pointer to File handle */ -/* ARGSUSED */ -int -volfs_vptofh(vp, fhp) - struct vnode *vp; - struct fid *fhp; +static int +volfs_vptofh(__unused struct vnode *vp, __unused int *fhlenp, __unused unsigned char *fhp, __unused vfs_context_t context) { - DBG_VOP(("volfs_vptofh called\n")); - return(0); + return(ENOTSUP); } + /* * Initialize the filesystem */ -int -volfs_init(vfsp) - struct vfsconf *vfsp; -{ - DBG_VOP(("volfs_init called\n")); - - volfs_PLChashinit(); - +static int +volfs_init(__unused struct vfsconf *vfsp) +{ return (0); } /* * fast filesystem related variables. */ -int -volfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; +static int +volfs_sysctl(__unused int *name, __unused u_int namelen, __unused user_addr_t oldp, + __unused size_t *oldlenp, __unused user_addr_t newp, __unused size_t newlen, + __unused vfs_context_t context) { - DBG_VOP(("volfs_sysctl called\n")); - return (EOPNOTSUPP); + return (ENOTSUP); } diff --git a/bsd/miscfs/volfs/volfs_vnops.c b/bsd/miscfs/volfs/volfs_vnops.c index 9c0980f75..d875957b1 100644 --- a/bsd/miscfs/volfs/volfs_vnops.c +++ b/bsd/miscfs/volfs/volfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -19,22 +19,6 @@ * * @APPLE_LICENSE_HEADER_END@ */ -/* - * Copyright (c) 1998-1999 Apple Computer, Inc. All Rights Reserved. - * - * Modification History: - * - * 2/10/2000 Clark Warner Added copyfile - * 5/24/1999 Don Brady Fixed security hole in get_fsvnode. - * 11/18/1998 Don Brady Special case 2 to mean the root of a file system. - * 9/28/1998 Umesh Vaishampayan Use the default vnode ops. Cleanup - * header includes. - * 11/12/1998 Scott Roberts validfsnode only checks to see if the volfs mount flag is set - * 8/5/1998 Don Brady fix validfsnode logic to handle a "bad" VFS_GET - * 7/5/1998 Don Brady In volfs_reclaim set vp->v_data to NULL after private data is free (VFS expects a NULL). - * 4/5/1998 Don Brady Changed lockstatus calls to VOP_ISLOCKED (radar #2231108); - * 3/25/1998 Pat Dirks Added include for sys/attr.h, which is no longer included indirectly. - */ #include <mach/mach_types.h> @@ -45,22 +29,25 @@ #include <sys/file.h> #include <sys/filedesc.h> #include <sys/stat.h> -#include <sys/buf.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> /* for p_fd */ +#include <sys/kauth.h> #include <sys/conf.h> -#include <sys/mount.h> -#include <sys/vnode.h> +#include <sys/mount_internal.h> +#include <sys/vnode_internal.h> #include <sys/malloc.h> #include <sys/dirent.h> #include <sys/namei.h> #include <sys/attr.h> #include <sys/kdebug.h> #include <sys/queue.h> +#include <sys/uio_internal.h> #include <sys/vm.h> #include <sys/errno.h> #include <vfs/vfs_support.h> +#include <kern/locks.h> + #include "volfs.h" /* @@ -92,72 +79,74 @@ * a similar mechanism. 
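 *
 * (With the vnop conversion the handlers below become static to this
 * file -- their prototypes follow this comment -- and readdir now
 * collects entries through a callback, volfs_readdir_callback, which
 * is handed one mount_t at a time and keeps its cursor in the small
 * volfs_rdstruct declared alongside it.)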
*/ +static int volfs_reclaim (struct vnop_reclaim_args*); +static int volfs_getattr (struct vnop_getattr_args *); +static int volfs_select (struct vnop_select_args *); +static int volfs_rmdir (struct vnop_rmdir_args *); +static int volfs_readdir (struct vnop_readdir_args *); +static int volfs_pathconf (struct vnop_pathconf_args *); +static int volfs_lookup (struct vnop_lookup_args *); + +static int volfs_readdir_callback(mount_t, void *); +static int get_filevnode(struct mount *parent_fs, u_int id, vnode_t *ret_vnode, vfs_context_t context); +static int get_fsvnode(struct mount *our_mount, int id, vnode_t *ret_vnode); + +/* for the call back function in volfs_readdir */ +struct volfs_rdstruct { + int validindex; + vnode_t vp; + int rec_offset; + struct uio * uio; +}; + #define VOPFUNC int (*)(void *) /* Global vfs data structures for volfs. */ int (**volfs_vnodeop_p) (void *); struct vnodeopv_entry_desc volfs_vnodeop_entries[] = { - {&vop_default_desc, (VOPFUNC)vn_default_error}, - {&vop_strategy_desc, (VOPFUNC)err_strategy}, /* strategy */ - {&vop_bwrite_desc, (VOPFUNC)err_bwrite}, /* bwrite */ - {&vop_lookup_desc, (VOPFUNC)volfs_lookup}, /* lookup */ - {&vop_create_desc, (VOPFUNC)err_create}, /* create */ - {&vop_whiteout_desc, (VOPFUNC)err_whiteout}, /* whiteout */ - {&vop_mknod_desc, (VOPFUNC)err_mknod}, /* mknod */ - {&vop_mkcomplex_desc, (VOPFUNC)err_mkcomplex}, /* mkcomplex */ - {&vop_open_desc, (VOPFUNC)nop_open}, /* open */ - {&vop_close_desc, (VOPFUNC)nop_close}, /* close */ - {&vop_access_desc, (VOPFUNC)volfs_access}, /* access */ - {&vop_getattr_desc, (VOPFUNC)volfs_getattr}, /* getattr */ - {&vop_setattr_desc, (VOPFUNC)err_setattr}, /* setattr */ - {&vop_getattrlist_desc, (VOPFUNC)err_getattrlist}, /* getattrlist */ - {&vop_setattrlist_desc, (VOPFUNC)err_setattrlist}, /* setattrlist */ - {&vop_read_desc, (VOPFUNC)err_read}, /* read */ - {&vop_write_desc, (VOPFUNC)err_write}, /* write */ - {&vop_lease_desc, (VOPFUNC)err_lease}, /* lease */ - {&vop_ioctl_desc, (VOPFUNC)err_ioctl}, /* ioctl */ - {&vop_select_desc, (VOPFUNC)volfs_select}, /* select */ - {&vop_exchange_desc, (VOPFUNC)err_exchange}, /* exchange */ - {&vop_revoke_desc, (VOPFUNC)nop_revoke}, /* revoke */ - {&vop_mmap_desc, (VOPFUNC)err_mmap}, /* mmap */ - {&vop_fsync_desc, (VOPFUNC)err_fsync}, /* fsync */ - {&vop_seek_desc, (VOPFUNC)nop_seek}, /* seek */ - {&vop_remove_desc, (VOPFUNC)err_remove}, /* remove */ - {&vop_link_desc, (VOPFUNC)err_link}, /* link */ - {&vop_rename_desc, (VOPFUNC)err_rename}, /* rename */ - {&vop_mkdir_desc, (VOPFUNC)err_mkdir}, /* mkdir */ - {&vop_rmdir_desc, (VOPFUNC)volfs_rmdir}, /* rmdir */ - {&vop_symlink_desc, (VOPFUNC)err_symlink}, /* symlink */ - {&vop_readdir_desc, (VOPFUNC)volfs_readdir}, /* readdir */ - {&vop_readdirattr_desc, (VOPFUNC)err_readdirattr}, /* readdirattr */ - {&vop_readlink_desc, (VOPFUNC)err_readlink}, /* readlink */ - {&vop_abortop_desc, (VOPFUNC)err_abortop}, /* abortop */ - {&vop_inactive_desc, (VOPFUNC)err_inactive}, /* inactive */ - {&vop_reclaim_desc, (VOPFUNC)volfs_reclaim}, /* reclaim */ - {&vop_lock_desc, (VOPFUNC)volfs_lock}, /* lock */ - {&vop_unlock_desc, (VOPFUNC)volfs_unlock}, /* unlock */ - {&vop_bmap_desc, (VOPFUNC)err_bmap}, /* bmap */ - {&vop_print_desc, (VOPFUNC)err_print}, /* print */ - {&vop_islocked_desc, (VOPFUNC)volfs_islocked}, /* islocked */ - {&vop_pathconf_desc, (VOPFUNC)volfs_pathconf}, /* pathconf */ - {&vop_advlock_desc, (VOPFUNC)err_advlock}, /* advlock */ - {&vop_blkatoff_desc, (VOPFUNC)err_blkatoff}, /* blkatoff */ - 
{&vop_valloc_desc, (VOPFUNC)err_valloc}, /* valloc */ - {&vop_reallocblks_desc, (VOPFUNC)err_reallocblks}, /* reallocblks */ - {&vop_vfree_desc, (VOPFUNC)err_vfree}, /* vfree */ - {&vop_truncate_desc, (VOPFUNC)err_truncate}, /* truncate */ - {&vop_allocate_desc, (VOPFUNC)err_allocate}, /* allocate */ - {&vop_update_desc, (VOPFUNC)err_update}, /* update */ - {&vop_pgrd_desc, (VOPFUNC)err_pgrd}, /* pgrd */ - {&vop_pgwr_desc, (VOPFUNC)err_pgwr}, /* pgwr */ - {&vop_pagein_desc, (VOPFUNC)err_pagein}, /* pagein */ - {&vop_pageout_desc, (VOPFUNC)err_pageout}, /* pageout */ - {&vop_devblocksize_desc, (VOPFUNC)err_devblocksize}, /* devblocksize */ - {&vop_searchfs_desc, (VOPFUNC)err_searchfs}, /* searchfs */ - {&vop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ - {&vop_blktooff_desc, (VOPFUNC)err_blktooff}, /* blktooff */ - {&vop_offtoblk_desc, (VOPFUNC)err_offtoblk }, /* offtoblk */ - {&vop_cmap_desc, (VOPFUNC)err_cmap }, /* cmap */ + {&vnop_default_desc, (VOPFUNC)vn_default_error}, + {&vnop_strategy_desc, (VOPFUNC)err_strategy}, /* strategy */ + {&vnop_bwrite_desc, (VOPFUNC)err_bwrite}, /* bwrite */ + {&vnop_lookup_desc, (VOPFUNC)volfs_lookup}, /* lookup */ + {&vnop_create_desc, (VOPFUNC)err_create}, /* create */ + {&vnop_whiteout_desc, (VOPFUNC)err_whiteout}, /* whiteout */ + {&vnop_mknod_desc, (VOPFUNC)err_mknod}, /* mknod */ + {&vnop_open_desc, (VOPFUNC)nop_open}, /* open */ + {&vnop_close_desc, (VOPFUNC)nop_close}, /* close */ + {&vnop_getattr_desc, (VOPFUNC)volfs_getattr}, /* getattr */ + {&vnop_setattr_desc, (VOPFUNC)err_setattr}, /* setattr */ + {&vnop_getattrlist_desc, (VOPFUNC)err_getattrlist}, /* getattrlist */ + {&vnop_setattrlist_desc, (VOPFUNC)err_setattrlist}, /* setattrlist */ + {&vnop_read_desc, (VOPFUNC)err_read}, /* read */ + {&vnop_write_desc, (VOPFUNC)err_write}, /* write */ + {&vnop_ioctl_desc, (VOPFUNC)err_ioctl}, /* ioctl */ + {&vnop_select_desc, (VOPFUNC)volfs_select}, /* select */ + {&vnop_exchange_desc, (VOPFUNC)err_exchange}, /* exchange */ + {&vnop_revoke_desc, (VOPFUNC)nop_revoke}, /* revoke */ + {&vnop_mmap_desc, (VOPFUNC)err_mmap}, /* mmap */ + {&vnop_fsync_desc, (VOPFUNC)err_fsync}, /* fsync */ + {&vnop_remove_desc, (VOPFUNC)err_remove}, /* remove */ + {&vnop_link_desc, (VOPFUNC)err_link}, /* link */ + {&vnop_rename_desc, (VOPFUNC)err_rename}, /* rename */ + {&vnop_mkdir_desc, (VOPFUNC)err_mkdir}, /* mkdir */ + {&vnop_rmdir_desc, (VOPFUNC)volfs_rmdir}, /* rmdir */ + {&vnop_symlink_desc, (VOPFUNC)err_symlink}, /* symlink */ + {&vnop_readdir_desc, (VOPFUNC)volfs_readdir}, /* readdir */ + {&vnop_readdirattr_desc, (VOPFUNC)err_readdirattr}, /* readdirattr */ + {&vnop_readlink_desc, (VOPFUNC)err_readlink}, /* readlink */ + {&vnop_inactive_desc, (VOPFUNC)err_inactive}, /* inactive */ + {&vnop_reclaim_desc, (VOPFUNC)volfs_reclaim}, /* reclaim */ + {&vnop_pathconf_desc, (VOPFUNC)volfs_pathconf}, /* pathconf */ + {&vnop_advlock_desc, (VOPFUNC)err_advlock}, /* advlock */ + {&vnop_allocate_desc, (VOPFUNC)err_allocate}, /* allocate */ + {&vnop_pagein_desc, (VOPFUNC)err_pagein}, /* pagein */ + {&vnop_pageout_desc, (VOPFUNC)err_pageout}, /* pageout */ + {&vnop_devblocksize_desc, (VOPFUNC)err_devblocksize}, /* devblocksize */ + {&vnop_searchfs_desc, (VOPFUNC)err_searchfs}, /* searchfs */ + {&vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ + {&vnop_blktooff_desc, (VOPFUNC)err_blktooff}, /* blktooff */ + {&vnop_offtoblk_desc, (VOPFUNC)err_offtoblk }, /* offtoblk */ + {&vnop_blockmap_desc, (VOPFUNC)err_blockmap }, /* blockmap */ {(struct vnodeop_desc *) NULL, 
(int (*) ()) NULL} }; @@ -168,7 +157,6 @@ struct vnodeopv_entry_desc volfs_vnodeop_entries[] = { struct vnodeopv_desc volfs_vnodeop_opv_desc = {&volfs_vnodeop_p, volfs_vnodeop_entries}; -static char gDot[] = "."; static char gDotDot[] = ".."; struct finfo { @@ -180,321 +168,119 @@ struct finfoattrbuf { struct finfo fi; }; -static int validfsnode(struct mount *fsnode); - -struct volfs_PLCEntry -{ - LIST_ENTRY(volfs_PLCEntry) vplc_hash_link; /* entry's hash chain */ - TAILQ_ENTRY(volfs_PLCEntry) vplc_lru_link; /* entry's LRU chain link */ - int32_t vplc_fsid; - u_int vplc_item_id; - uid_t vplc_uid; - pid_t vplc_pid; -}; - -#define VOLFSPLCHASH(fsid, inum) ((((unsigned long)fsid) + (unsigned long)(inum)) & volfs_PLCHashMask) - -static struct slock volfs_PLChashtable_slock; -static TAILQ_HEAD(volfs_PLCLRUListHead, volfs_PLCEntry) volfs_PLCLRUList; -static TAILQ_HEAD(volfs_PLCFreeListHead, volfs_PLCEntry) volfs_PLCFreeList; -static LIST_HEAD(, volfs_PLCEntry) *volfs_PLCHashTable; -static u_long volfs_PLCHashMask; /* size of hash table - 1 */ -static u_long volfs_PLCEntryCount; -#if DBG_VOP_TEST_LOCKS -static void DbgVopTest (int max, int error, VopDbgStoreRec *VopDbgStore, char *funcname); -#endif /* DBG_VOP_TEST_LOCKS */ - - -/* - * volfs_PLChashinit - */ -__private_extern__ void -volfs_PLChashinit(void) -{ - int i; - - TAILQ_INIT(&volfs_PLCLRUList); - TAILQ_INIT(&volfs_PLCFreeList); - simple_lock_init(&volfs_PLChashtable_slock); -#if MAXPLCENTRIES - volfs_PLCHashTable = hashinit(PLCHASHSIZE, M_TEMP, &volfs_PLCHashMask); - - for (i = 0; i < PLCHASHSIZE; ++i) { - LIST_INIT(&volfs_PLCHashTable[i]); - }; -#endif - volfs_PLCEntryCount = 0; -} - - - -__private_extern__ void -volfs_PLC_reclaim_entries(int entrycount) -{ -#if MAXPLCENTRIES - int i; - struct volfs_PLCEntry *reclaim_target; - - simple_lock(&volfs_PLChashtable_slock); - - for (i = entrycount; i > 0; --i) { - if (TAILQ_EMPTY(&volfs_PLCLRUList)) break; - - /* Pick the next entry to be recycled and free it: */ - reclaim_target = TAILQ_FIRST(&volfs_PLCLRUList); - TAILQ_REMOVE(&volfs_PLCLRUList, reclaim_target, vplc_lru_link); - LIST_REMOVE(reclaim_target, vplc_hash_link); - TAILQ_INSERT_TAIL(&volfs_PLCFreeList, reclaim_target, vplc_lru_link); - }; - - simple_unlock(&volfs_PLChashtable_slock); -#endif -} - - - -#if MAXPLCENTRIES -/* - * volfs_PLCLookup - * - * Look up a PLC entry in the hash - */ -static int -volfs_PLCLookup(int32_t fsid, u_int target_id, uid_t uid, pid_t pid) -{ - struct volfs_PLCEntry *hash_entry; - int result = 0; - - simple_lock(&volfs_PLChashtable_slock); - LIST_FOREACH(hash_entry, &volfs_PLCHashTable[VOLFSPLCHASH(fsid, target_id)], vplc_hash_link) { - if ((hash_entry->vplc_item_id == target_id) && - (hash_entry->vplc_pid == pid) && - (hash_entry->vplc_uid == uid) && - (hash_entry->vplc_fsid == fsid)) { - result = 1; -#if 0 - if (hash_entry != TAILQ_LAST(&volfs_PLCLRUList, volfs_PLCLRUListHead)) { - TAILQ_REMOVE(&volfs_PLCLRUList, hash_entry, vplc_lru_link); - TAILQ_INSERT_TAIL(&volfs_PLCLRUList, hash_entry, vplc_lru_link); - }; -#endif - break; - }; - }; - simple_unlock(&volfs_PLChashtable_slock); - return result; -} - - -static void -volfs_PLCEnter(int32_t fsid, u_int target_id, uid_t uid, pid_t pid) -{ - struct volfs_PLCEntry *new_entry; - - simple_lock(&volfs_PLChashtable_slock); - if (!TAILQ_EMPTY(&volfs_PLCFreeList)) { - new_entry = TAILQ_FIRST(&volfs_PLCFreeList); - TAILQ_REMOVE(&volfs_PLCFreeList, new_entry, vplc_lru_link); - } else { - /* - * Allocate up to the predetermined maximum number of new entries: - * 
[must be done now to avoid blocking in MALLOC() with volfs_PLChashtable_slock held locked] - */ - if (volfs_PLCEntryCount < MAXPLCENTRIES) { - simple_unlock(&volfs_PLChashtable_slock); - new_entry = MALLOC(new_entry, struct volfs_PLCEntry *, sizeof(struct volfs_PLCEntry), M_TEMP, M_WAITOK); - simple_lock(&volfs_PLChashtable_slock); - ++volfs_PLCEntryCount; - } else { - new_entry = TAILQ_FIRST(&volfs_PLCLRUList); - TAILQ_REMOVE(&volfs_PLCLRUList, new_entry, vplc_lru_link); - LIST_REMOVE(new_entry, vplc_hash_link); - }; - }; - - new_entry->vplc_fsid = fsid; - new_entry->vplc_item_id = target_id; - new_entry->vplc_uid = uid; - new_entry->vplc_pid = pid; - - /* Link the new entry on the hash list for the fsid/target_id as well as the tail of the LRU list: */ - LIST_INSERT_HEAD(&volfs_PLCHashTable[VOLFSPLCHASH(fsid, target_id)], new_entry, vplc_hash_link); - TAILQ_INSERT_TAIL(&volfs_PLCLRUList, new_entry, vplc_lru_link); - simple_unlock(&volfs_PLChashtable_slock); -} -#endif +static int volfs_getattr_callback(mount_t, void *); /* * volfs_reclaim - Reclaim a vnode so that it can be used for other purposes. - * - * Locking policy: ignored */ -int +static int volfs_reclaim(ap) - struct vop_reclaim_args /* { struct vnode *a_vp; struct proc *a_p; } */ *ap; + struct vnop_reclaim_args /* { struct vnode *a_vp; vfs_context_t a_context; } */ *ap; { - struct vnode *vp = ap->a_vp; - void *data = vp->v_data; - - DBG_FUNC_NAME("volfs_reclaim"); - DBG_VOP_LOCKS_DECL(1); - DBG_VOP_PRINT_FUNCNAME();DBG_VOP_PRINT_VNODE_INFO(ap->a_vp);DBG_VOP(("\n")); - - DBG_VOP_LOCKS_INIT(0, vp, VOPDBG_UNLOCKED, VOPDBG_IGNORE, VOPDBG_IGNORE, VOPDBG_ZERO); + struct vnode *vp = ap->a_vp; + void *data = vp->v_data; vp->v_data = NULL; - FREE(data, M_VOLFSNODE); + FREE(data, M_VOLFSNODE); - DBG_VOP_LOCKS_TEST(0); - return (0); + return (0); } -/* - * volfs_access - same access policy for all vnodes and all users (file/directory vnodes - * for the actual file systems are handled by actual file system) - * - * Locking policy: a_vp locked on input and output - */ -int -volfs_access(ap) - struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct - ucred *a_cred; struct proc *a_p; } */ *ap; -{ - int ret_err; - DBG_FUNC_NAME("volfs_access"); - DBG_VOP_LOCKS_DECL(1); - DBG_VOP_PRINT_FUNCNAME();DBG_VOP_PRINT_VNODE_INFO(ap->a_vp);DBG_VOP(("\n")); - - DBG_VOP_LOCKS_INIT(0,ap->a_vp, VOPDBG_LOCKED, VOPDBG_LOCKED, VOPDBG_LOCKED, VOPDBG_POS); +struct volfsgetattr_struct{ + int numMounts; + vnode_t a_vp; +}; - /* - * We don't need to check credentials! 
FS is read-only for everyone - */ - if ((ap->a_mode & ~(VREAD | VEXEC)) == 0) - ret_err = 0; - else - ret_err = EACCES; +static int +volfs_getattr_callback(mount_t mp, void * arg) +{ + struct volfsgetattr_struct *vstrp = (struct volfsgetattr_struct *)arg; - DBG_VOP_LOCKS_TEST(ret_err); - return (ret_err); + if (mp != vnode_mount(vstrp->a_vp) && validfsnode(mp)) + vstrp->numMounts++; + return(VFS_RETURNED); } /* * volfs_getattr - fill in the attributes for this vnode - * - * Locking policy: don't change anything */ -int +static int volfs_getattr(ap) - struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; - struct ucred *a_cred; struct proc *a_p; } */ *ap; + struct vnop_getattr_args /* { struct vnode *a_vp; struct vnode_attr *a_vap; + vfs_context_t a_context; } */ *ap; { struct volfs_vndata *priv_data; - struct vnode *a_vp; - struct vattr *a_vap; + struct vnode *a_vp; + struct vnode_attr *a_vap; int numMounts = 0; - DBG_FUNC_NAME("volfs_getattr"); - DBG_VOP_LOCKS_DECL(1); - DBG_VOP_PRINT_FUNCNAME();DBG_VOP_PRINT_VNODE_INFO(ap->a_vp);DBG_VOP(("\n")); - - DBG_VOP_LOCKS_INIT(0,ap->a_vp, VOPDBG_SAME, VOPDBG_SAME, VOPDBG_SAME, VOPDBG_POS); + struct volfsgetattr_struct vstr; + struct timespec ts; a_vp = ap->a_vp; a_vap = ap->a_vap; priv_data = a_vp->v_data; - a_vap->va_type = VDIR; - a_vap->va_mode = 0444; /* Yup, hard - coded to read - only */ - a_vap->va_nlink = 2; - a_vap->va_uid = 0; /* Always owned by root */ - a_vap->va_gid = 0; /* Always part of group 0 */ - a_vap->va_fsid = (int) a_vp->v_mount->mnt_stat.f_fsid.val[0]; - a_vap->va_fileid = priv_data->nodeID; + VATTR_RETURN(a_vap, va_type, VDIR); + VATTR_RETURN(a_vap, va_mode, 0555); + VATTR_RETURN(a_vap, va_nlink, 2); + VATTR_RETURN(a_vap, va_uid, 0); + VATTR_RETURN(a_vap, va_gid, 0); + VATTR_RETURN(a_vap, va_fsid, (int) a_vp->v_mount->mnt_vfsstat.f_fsid.val[0]); + VATTR_RETURN(a_vap, va_fileid, (uint64_t)((u_long)priv_data->nodeID)); + VATTR_RETURN(a_vap, va_acl, NULL); /* * If it's the root vnode calculate its size based on the number of eligible * file systems */ - if (priv_data->vnode_type == VOLFS_ROOT) - { - register struct mount *mp, *nmp; + if (priv_data->vnode_type == VOLFS_ROOT) { + vstr.numMounts = 0; + vstr.a_vp = a_vp; - simple_lock(&mountlist_slock); - for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { - if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, ap->a_p)) { - nmp = mp->mnt_list.cqe_next; - continue; - } - - if (mp != a_vp->v_mount && validfsnode(mp)) - numMounts++; + vfs_iterate(LK_NOWAIT, volfs_getattr_callback, (void *)&vstr); - simple_lock(&mountlist_slock); - nmp = mp->mnt_list.cqe_next; - vfs_unbusy(mp, ap->a_p); - } - simple_unlock(&mountlist_slock); + numMounts = vstr.numMounts; - DBG_VOP(("found %d file systems that volfs can support\n", numMounts)); - a_vap->va_size = (numMounts + 2) * VLFSDIRENTLEN; - } - else - { - a_vap->va_size = 2 * VLFSDIRENTLEN; - } - DBG_VOP(("va_size = %d, VLFSDIRENTLEN = %ld\n", (int) a_vap->va_size, VLFSDIRENTLEN)); - a_vap->va_blocksize = 512; - - a_vap->va_atime.tv_sec = boottime.tv_sec; - a_vap->va_atime.tv_nsec = 0; - - a_vap->va_mtime.tv_sec = boottime.tv_sec; - a_vap->va_mtime.tv_nsec = 0; + VATTR_RETURN(a_vap, va_data_size, (numMounts + 2) * VLFSDIRENTLEN); + } else { + VATTR_RETURN(a_vap, va_data_size, 2 * VLFSDIRENTLEN); + } - a_vap->va_ctime.tv_sec = boottime.tv_sec; - a_vap->va_ctime.tv_nsec = 0; + VATTR_RETURN(a_vap, va_iosize, 512); + ts.tv_sec = boottime_sec(); + ts.tv_nsec = 0; + VATTR_RETURN(a_vap, va_access_time, ts); + 
VATTR_RETURN(a_vap, va_modify_time, ts);
+	VATTR_RETURN(a_vap, va_change_time, ts);
 
-	a_vap->va_gen = 0;
-	a_vap->va_flags = 0;
-	a_vap->va_rdev = 0;
-	a_vap->va_bytes = a_vap->va_size;
-	a_vap->va_filerev = 0;
-	a_vap->va_vaflags = 0;
+	VATTR_RETURN(a_vap, va_gen, 0);
+	VATTR_RETURN(a_vap, va_flags, 0);
+	VATTR_RETURN(a_vap, va_rdev, 0);
+	VATTR_RETURN(a_vap, va_filerev, 0);
 
-	DBG_VOP_LOCKS_TEST(0);
 	return (0);
 }
 
 /*
  * volfs_select - just say OK.  Only possible op is readdir
- *
- * Locking policy: ignore
  */
-int
-volfs_select(ap)
-	struct vop_select_args /* { struct vnode *a_vp; int  a_which; int
-	 * a_fflags; struct ucred *a_cred; void * a_wql; struct
-	 proc *a_p; } */ *ap;
+static int
+volfs_select(__unused struct vnop_select_args *ap)
 {
-	DBG_VOP(("volfs_select called\n"));
-
-	return (1);
+	return (1);
 }
 
 /*
  * volfs_rmdir - not possible to remove directories in volfs
- *
- * Locking policy: a_dvp & a_vp - locked on entry, unlocked on exit
  */
-int
+static int
 volfs_rmdir(ap)
-	struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp;
-	 struct componentname *a_cnp; } */ *ap;
+	struct vnop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp;
+	 struct componentname *a_cnp; vfs_context_t a_context; } */ *ap;
 {
-	DBG_VOP(("volfs_rmdir called\n"));
 	if (ap->a_dvp == ap->a_vp) {
 		(void) nop_rmdir(ap);
 		return (EINVAL);
@@ -502,6 +288,31 @@ volfs_rmdir(ap)
 	return (err_rmdir(ap));
 }
 
+
+
+static int
+volfs_readdir_callback(mount_t mp, void * v)
+{
+	struct volfs_rdstruct * vcsp = (struct volfs_rdstruct *)v;
+	struct dirent local_dir;
+	int error;
+
+	if ((mp != vnode_mount(vcsp->vp)) && validfsnode(mp))
+		vcsp->validindex++;
+
+	if (vcsp->rec_offset == vcsp->validindex)
+	{
+		local_dir.d_fileno = mp->mnt_vfsstat.f_fsid.val[0];
+		local_dir.d_type = DT_DIR;
+		local_dir.d_reclen = VLFSDIRENTLEN;
+		local_dir.d_namlen = sprintf(&local_dir.d_name[0], "%d", mp->mnt_vfsstat.f_fsid.val[0]);
+		error = uiomove((char *) &local_dir, VLFSDIRENTLEN, vcsp->uio);
+		vcsp->rec_offset++;
+	}
+
+	return(VFS_RETURNED);
+}
+
 /*
  * volfs_readdir - Get directory entries
  *
@@ -511,14 +322,12 @@ volfs_rmdir(ap)
  * equivalent of the f_fsid.val[0] from their mount structure (typically
  * the device id of the volume).  The maximum length for a name, then is
  * 10 characters.
- *
- * Locking policy: a_vp locked on entry and exit
  */
-int
+static int
 volfs_readdir(ap)
-	struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio;
-	 * struct ucred *a_cred; int *a_eofflag; int
-	 *ncookies; u_long **a_cookies; } */ *ap;
+	struct vnop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio;
+	 * int *a_eofflag; int
+	 *ncookies; u_long **a_cookies; vfs_context_t a_context; } */ *ap;
 {
 	struct volfs_vndata *priv_data;
 	register struct uio *uio = ap->a_uio;
@@ -529,52 +338,42 @@ volfs_readdir(ap)
 	int i;
 	int starting_resid;
 	off_t off;
-	DBG_FUNC_NAME("volfs_readdir");
-	DBG_VOP_LOCKS_DECL(1);
-
-	DBG_VOP_LOCKS_INIT(0,ap->a_vp, VOPDBG_LOCKED, VOPDBG_LOCKED, VOPDBG_LOCKED, VOPDBG_POS);
-	DBG_VOP_PRINT_FUNCNAME();DBG_VOP_PRINT_VNODE_INFO(ap->a_vp);DBG_VOP(("\n"));
-
-	DBG_VOP(("\tuio_offset = %d, uio_resid = %d\n", (int) uio->uio_offset, uio->uio_resid));
-	/* We assume it's all one big buffer... */
-	if (uio->uio_iovcnt > 1)
-		DBG_VOP(("\tuio->uio_iovcnt = %d?\n", uio->uio_iovcnt));
-
+	struct volfs_rdstruct vcs;
+
 	off = uio->uio_offset;
 	priv_data = ap->a_vp->v_data;
-	starting_resid = uio->uio_resid;
-	count = uio->uio_resid;
+	// LP64todo - fix this!
+	starting_resid = count = uio_resid(uio);
 
 	/* Make sure we don't return partial entries. */
 	count -= (uio->uio_offset + count) & (VLFSDIRENTLEN - 1);
-	if (count <= 0)
-	{
-		DBG_VOP(("volfs_readdir: Not enough buffer to read in entries\n"));
-		DBG_VOP_LOCKS_TEST(EINVAL);
-		return (EINVAL);
-	}
+	if (count <= 0) {
+		return (EINVAL);
	}
 
 	/*
 	 * Make sure we're starting on a directory boundary
 	 */
-	if (off & (VLFSDIRENTLEN - 1))
-	{
-		DBG_VOP_LOCKS_TEST(EINVAL);
-		return (EINVAL);
-	}
+	if (off & (VLFSDIRENTLEN - 1)) {
+		return (EINVAL);
	}
 	rec_offset = off / VLFSDIRENTLEN;
-	lost = uio->uio_resid - count;
-	uio->uio_resid = count;
-	uio->uio_iov->iov_len = count;
+	// LP64todo - fix this!
+	lost = uio_resid(uio) - count;
+	uio_setresid(uio, count);
+	uio_iov_len_set(uio, count);
+#if LP64_DEBUG
+	if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
+		panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
+	}
+#endif /* LP64_DEBUG */
 
 	local_dir.d_reclen = VLFSDIRENTLEN;
 	/*
 	 * We must synthesize . and ..
 	 */
-	DBG_VOP(("\tstarting ... uio_offset = %d, uio_resid = %d\n",
-		(int) uio->uio_offset, uio->uio_resid));
+
 	if (rec_offset == 0)
 	{
-		DBG_VOP(("\tAdding .\n"));
 		/*
 		 * Synthesize .
 		 */
@@ -585,13 +384,10 @@ volfs_readdir(ap)
 		for (i = 1; i < MAXVLFSNAMLEN; i++)
 			local_dir.d_name[i] = 0;
 		error = uiomove((char *) &local_dir, VLFSDIRENTLEN, uio);
-		DBG_VOP(("\t after adding ., uio_offset = %d, uio_resid = %d\n",
-			(int) uio->uio_offset, uio->uio_resid));
 		rec_offset++;
 	}
 
 	if (rec_offset == 1)
 	{
-		DBG_VOP(("\tAdding ..\n"));
 		/*
 		 * Synthesize ..
 		 * We only have two levels in the volfs hierarchy.  Root's
@@ -607,8 +403,6 @@ volfs_readdir(ap)
 			local_dir.d_name[i] = 0;
 		error = uiomove((char *) &local_dir, VLFSDIRENTLEN, uio);
 		rec_offset++;
-		DBG_VOP(("\t after adding .., uio_offset = %d, uio_resid = %d\n",
-			(int) uio->uio_offset, uio->uio_resid));
 	}
 
 	/*
@@ -619,58 +413,26 @@ volfs_readdir(ap)
 	if (priv_data->vnode_type == VOLFS_FSNODE)
 	{
 		*ap->a_eofflag = 1;	/* we got all the way to the end */
-		DBG_VOP_LOCKS_TEST(error);
 		return (error);
 	}
 
 	if (rec_offset > 1) {
-		register struct mount *mp, *nmp;
-		int validnodeindex;
-		struct proc *p = uio->uio_procp;
-
-		validnodeindex = 1;	/* we always have "." and ".." */
-
-		simple_lock(&mountlist_slock);
-		for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
-			if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
-				nmp = mp->mnt_list.cqe_next;
-				continue;
-			}
-
-			if (mp != ap->a_vp->v_mount && validfsnode(mp))
-				validnodeindex++;
-
-			if (rec_offset == validnodeindex)
-			{
-				local_dir.d_fileno = mp->mnt_stat.f_fsid.val[0];
-				local_dir.d_type = DT_DIR;
-				local_dir.d_reclen = VLFSDIRENTLEN;
-				DBG_VOP(("\tAdding dir entry %d for offset %d\n", mp->mnt_stat.f_fsid.val[0], rec_offset));
-				local_dir.d_namlen = sprintf(&local_dir.d_name[0], "%d", mp->mnt_stat.f_fsid.val[0]);
-				error = uiomove((char *) &local_dir, VLFSDIRENTLEN, uio);
-				DBG_VOP(("\t after adding entry '%s', uio_offset = %d, uio_resid = %d\n",
-					&local_dir.d_name[0], (int) uio->uio_offset, uio->uio_resid));
-				rec_offset++;
-			}
-
-			simple_lock(&mountlist_slock);
-			nmp = mp->mnt_list.cqe_next;
-			vfs_unbusy(mp, p);
-		}
-		simple_unlock(&mountlist_slock);
+		vcs.validindex = 1;	/* we always have "." and ".." 
*/ + vcs.rec_offset = rec_offset; + vcs.vp = ap->a_vp; + vcs.uio = uio; + + + vfs_iterate(0, volfs_readdir_callback, &vcs); - if (mp == (void *) &mountlist) + //if (mp == (void *) &mountlist) *ap->a_eofflag = 1; /* we got all the way to the end */ } + uio_setresid(uio, (uio_resid(uio) + lost)); - uio->uio_resid += lost; - if (starting_resid == uio->uio_resid) + if (starting_resid == uio_resid(uio)) uio->uio_offset = 0; - DBG_VOP(("\tExiting, uio_offset = %d, uio_resid = %d, ap->a_eofflag = %d\n", - (int) uio->uio_offset, uio->uio_resid, *ap->a_eofflag)); - - DBG_VOP_LOCKS_TEST(error); return (error); } @@ -680,7 +442,7 @@ volfs_readdir(ap) * * This can cause context switching, so caller should be lock safe */ -static int +int validfsnode(struct mount *fsnode) { @@ -695,109 +457,14 @@ validfsnode(struct mount *fsnode) return 0; } -/* - * volfs_lock - Lock an inode. - * If its already locked, set the WANT bit and sleep. - * - * Locking policy: handled by lockmgr - */ -int -volfs_lock(ap) - struct vop_lock_args /* { struct vnode *a_vp; int a_flags; struct - proc *a_p; } */ *ap; -{ - int retval; - struct volfs_vndata *priv_data; - DBG_FUNC_NAME("volfs_lock"); - DBG_VOP_LOCKS_DECL(1); -#if 0 - KERNEL_DEBUG((FSDBG_CODE(DBG_FSVN, 0)) | DBG_FUNC_START, - (unsigned int)ap->a_vp, (unsigned int)ap->a_flags, (unsigned int)ap->a_p, 0, 0); -#endif - DBG_VOP_PRINT_FUNCNAME();DBG_VOP_PRINT_VNODE_INFO(ap->a_vp);DBG_VOP(("\n")); - - DBG_VOP_LOCKS_INIT(0,ap->a_vp, VOPDBG_UNLOCKED, VOPDBG_LOCKED, VOPDBG_UNLOCKED, VOPDBG_ZERO); - - priv_data = (struct volfs_vndata *) ap->a_vp->v_data; - retval = lockmgr(&priv_data->lock, ap->a_flags, &ap->a_vp->v_interlock, ap->a_p); - DBG_VOP_LOCKS_TEST(retval); -#if 0 - KERNEL_DEBUG((FSDBG_CODE(DBG_FSVN, 0)) | DBG_FUNC_END, - (unsigned int)ap->a_vp, (unsigned int)ap->a_flags, (unsigned int)ap->a_p, retval, 0); -#endif - return (retval); -} - -/* - * volfs_unlock - Unlock an inode. - * - * Locking policy: handled by lockmgr - */ -int -volfs_unlock(ap) - struct vop_unlock_args /* { struct vnode *a_vp; int a_flags; struct - proc *a_p; } */ *ap; -{ - int retval; - struct volfs_vndata *priv_data; - DBG_FUNC_NAME("volfs_unlock"); - DBG_VOP_LOCKS_DECL(1); -#if 0 - KERNEL_DEBUG((FSDBG_CODE(DBG_FSVN, 4)) | DBG_FUNC_START, - (unsigned int)ap->a_vp, (unsigned int)ap->a_flags, (unsigned int)ap->a_p, 0, 0); -#endif - DBG_VOP_PRINT_FUNCNAME();DBG_VOP_PRINT_VNODE_INFO(ap->a_vp);DBG_VOP(("\n")); - - DBG_VOP_LOCKS_INIT(0,ap->a_vp, VOPDBG_LOCKED, VOPDBG_UNLOCKED, VOPDBG_LOCKED, VOPDBG_ZERO); - - priv_data = (struct volfs_vndata *) ap->a_vp->v_data; - retval = lockmgr(&priv_data->lock, ap->a_flags | LK_RELEASE, - &ap->a_vp->v_interlock, ap->a_p); - - DBG_VOP_LOCKS_TEST(retval); -#if 0 - KERNEL_DEBUG((FSDBG_CODE(DBG_FSVN, 4)) | DBG_FUNC_END, - (unsigned int)ap->a_vp, (unsigned int)ap->a_flags, (unsigned int)ap->a_p, retval, 0); -#endif - return (retval); -} - -/* - * volfs_islocked - Check for a locked inode. 
- * - * Locking policy: ignore - */ -int -volfs_islocked(ap) - struct vop_islocked_args /* { struct vnode *a_vp; } */ *ap; -{ - int retval; - struct volfs_vndata *priv_data; - - DBG_FUNC_NAME("volfs_islocked"); - DBG_VOP_LOCKS_DECL(1); - //DBG_VOP_PRINT_FUNCNAME();DBG_VOP(("\n")); - - DBG_VOP_LOCKS_INIT(0,ap->a_vp, VOPDBG_IGNORE, VOPDBG_IGNORE, VOPDBG_IGNORE, VOPDBG_ZERO); - priv_data = (struct volfs_vndata *) ap->a_vp->v_data; - retval = lockstatus(&priv_data->lock); - - DBG_VOP_LOCKS_TEST(retval); - return (retval); -} - /* * volfs_pathconf - Return POSIX pathconf information applicable to ufs filesystems. - * - * Locking policy: a_vp locked on input and output */ -int +static int volfs_pathconf(ap) - struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int - *a_retval; } */ *ap; + struct vnop_pathconf_args /* { struct vnode *a_vp; int a_name; int + *a_retval; vfs_context_t a_context; } */ *ap; { - DBG_VOP(("volfs_pathconf called\n")); - switch (ap->a_name) { case _PC_LINK_MAX: @@ -824,65 +491,31 @@ volfs_pathconf(ap) /* NOTREACHED */ } - -/* - * Call VOP_GETATTRLIST on a given vnode - */ -static int -vp_getattrlist(struct vnode *vp, struct attrlist alist, void *attrbufptr, size_t bufsize, unsigned long options, struct proc *p) { - struct iovec iov; - struct uio bufuio; - - iov.iov_base = (char *)attrbufptr; - iov.iov_len = bufsize; - - bufuio.uio_iov = &iov; - bufuio.uio_iovcnt = 1; - bufuio.uio_offset = 0; - bufuio.uio_resid = iov.iov_len; - bufuio.uio_segflg = UIO_SYSSPACE; - bufuio.uio_rw = UIO_READ; - bufuio.uio_procp = p; - - return VOP_GETATTRLIST(vp, &alist, &bufuio, p->p_ucred, p); -} - /* * get_parentvp() - internal routine that tries to lookup the parent of vpp. - * On success, *vpp is the parent vp and is returned locked and the original child - * is left unlocked. On failure, the original child will be locked upon return. + * On success, *vpp is the parent vp and is returned with a reference. */ static int -get_parentvp(struct vnode **vpp, struct mount *mp, struct proc *p) +get_parentvp(struct vnode **vpp, struct mount *mp, vfs_context_t context) { int result; - struct attrlist alist; - struct finfoattrbuf finfobuf; + struct vnode_attr va; struct vnode *child_vp = *vpp; - - alist.bitmapcount = 5; - alist.reserved = 0; - alist.commonattr = ATTR_CMN_PAROBJID; - alist.volattr = 0; - alist.dirattr = 0; - alist.fileattr = 0; - alist.forkattr = 0; - result = vp_getattrlist(child_vp, alist, &finfobuf, sizeof(finfobuf), 0, p); - if (result) - return result; - - /* Release the child vnode before trying to acquire its parent - to avoid vnode deadlock problems with parsing code - coming top-down through the directory hierarchy: */ - VOP_UNLOCK(child_vp, 0, p); + VATTR_INIT(&va); + VATTR_WANTED(&va, va_parentid); + result = vnode_getattr(child_vp, &va, context); + if (result) { + return result; + } + /* Shift attention to the parent directory vnode: */ - result = VFS_VGET(mp, &finfobuf.fi.parID.fid_objno, vpp); - if (result) { - /* Make sure child_vp is still locked on exit: */ - vn_lock(child_vp, LK_EXCLUSIVE | LK_RETRY, p); + result = VFS_VGET(mp, (ino64_t)va.va_parentid, vpp, context); + + if (result == 0 && child_vp->v_parent != *vpp) { + vnode_update_identity(child_vp, *vpp, NULL, 0, 0, VNODE_UPDATE_PARENT); } - + return result; } @@ -891,58 +524,50 @@ get_parentvp(struct vnode **vpp, struct mount *mp, struct proc *p) * Look up the parent directory of a given vnode. 
 */
 static int
-lookup_parent(u_int id, struct vnode *child_vp, struct vnode **parent_vp, struct proc *p)
+lookup_parent(vnode_t child_vp, vnode_t *parent_vpp, int is_authorized, vfs_context_t context)
 {
-	struct nameidata nd;
-	struct componentname *cnp = &nd.ni_cnd;
-	struct filedesc *fdp = p->p_fd;
+	struct componentname cn;
+	vnode_t new_vp;
 	int error;
 
-	*parent_vp = NULL;
-
-	/*
-	 * Special case lookups for root's parent directory,
-	 * recognized by its special id of "1":
-	 */
-	if (id != 1) {
-		VREF(child_vp);
-		nd.ni_startdir = child_vp;
-		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, (caddr_t)&gDotDot, p);
-	} else {
-		struct vnode *root_vp;
-
-		error = VFS_ROOT(child_vp->v_mount, &root_vp);
-		if (error) return error;
-		VOP_UNLOCK(root_vp, 0, p);	/* Hold on to the reference */
-		nd.ni_startdir = root_vp;
-		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, (caddr_t)&gDot, p);
-	};
-	nd.ni_cnd.cn_cred = nd.ni_cnd.cn_proc->p_ucred;
-
-	/* Since we can't hit any symlinks, use the source path string directly: */
-	cnp->cn_pnbuf = nd.ni_dirp;
-	nd.ni_pathlen = strlen(cnp->cn_pnbuf);
-	cnp->cn_pnlen = nd.ni_pathlen + 1;
-	cnp->cn_flags |= (HASBUF | SAVENAME);
-
-	nd.ni_loopcnt = 0;
-
-	if ((nd.ni_rootdir = fdp->fd_rdir) == NULL) nd.ni_rootdir = rootvnode;
-	cnp->cn_nameptr = cnp->cn_pnbuf;
-	if (error = lookup(&nd)) {
-		cnp->cn_pnbuf = NULL;
-		return (error);
-	}
-	/*
-	 * Check for symbolic link
-	 */
-	if (cnp->cn_flags & ISSYMLINK) return ENOENT;
-	if (nd.ni_vp == child_vp) return ELOOP;
+	*parent_vpp = NULLVP;
 
-	*parent_vp = nd.ni_vp;
-	return 0;
-}
+	if (is_authorized == 0) {
+		error = vnode_authorize(child_vp, NULL, KAUTH_VNODE_SEARCH, context);
+		if (error != 0) {
+			return (error);
+		}
+	}
+	new_vp = child_vp->v_parent;
+	if (new_vp != NULLVP) {
+		if ( (error = vnode_getwithref(new_vp)) == 0 )
+			*parent_vpp = new_vp;
+		return (error);
	}
+	bzero(&cn, sizeof(cn));
+	cn.cn_nameiop = LOOKUP;
+	cn.cn_context = context;
+	cn.cn_pnbuf = CAST_DOWN(caddr_t, &gDotDot);
+	cn.cn_pnlen = strlen(cn.cn_pnbuf);
+	cn.cn_nameptr = cn.cn_pnbuf;
+	cn.cn_namelen = cn.cn_pnlen;
+	cn.cn_flags = (FOLLOW | LOCKLEAF | ISLASTCN | ISDOTDOT);
+
+	error = VNOP_LOOKUP(child_vp, &new_vp, &cn, context);
+	if (error != 0) {
+		return(error);
	}
+	if (new_vp == child_vp) {
+		vnode_put(new_vp);
+		return ELOOP;
	}
+	if (child_vp->v_parent == NULLVP) {
+		vnode_update_identity(child_vp, new_vp, NULL, 0, 0, VNODE_UPDATE_PARENT);
	}
+	*parent_vpp = new_vp;
+	return 0;
+}
*/ - result = get_parentvp(&vp, mp, p); - if (result) goto err_exit; - - /* At this point, targetvp is unlocked (but still referenced), and - vp is the parent directory vnode, held locked */ - }; - + dp_authorized = 0; -#if MAXPLCENTRIES - if (volfs_PLCLookup(mp->mnt_stat.f_fsid.val[0], id, p->p_ucred->cr_uid, p->p_pid)) goto lookup_success; -#endif - /* Keep going up until either the process's root or the process's working directory is hit, - either one of which are potential valid starting points for a full pathname: */ - target_id = id; - while (vp != NULL && (!((vp->v_flag & VROOT) || /* Hit "/" */ - (vp == fdp->fd_cdir) || /* Hit process's working directory */ - (vp == fdp->fd_rdir)))) { /* Hit process chroot()-ed root */ - - /* At this point, vp is some directory node and it's always locked */ - /* Unlock the starting directory for namei(), retaining a reference... */ - VOP_UNLOCK(vp, 0, p); - - if (result = lookup_parent(target_id, vp, &parent_vp, p)) { - /* - * If the lookup fails with EACCES and the targetvp is a directory, - * we should try again using get_parentvp(). Without this check, - * directories that you can navigate to but not traverse will - * disappear when clicked in the Finder. - */ - if (result == EACCES && vp == targetvp && vp->v_type == VDIR && (vp->v_flag & VROOT) == 0) { - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - parent_vp = vp; - if (get_parentvp(&parent_vp, mp, p)) { - /* on error, vp is still locked... unlock for lookup_err_exit path */ - VOP_UNLOCK(vp, 0, p); - } else { - /* on success, vp is returned unlocked, parent_vp is returned locked */ - result = 0; + /* get the parent directory. */ + if ((vp->v_flag & VROOT) == 0 && vp != fdp->fd_cdir && vp != fdp->fd_rdir) { + if (vp->v_parent == NULLVP || (vp->v_flag & VISHARDLINK) || (vnode_getwithref(vp->v_parent) != 0)) { + if (vp->v_type == VDIR) { + result = lookup_parent(vp, &parent_vp, dp_authorized, context); + + /* + * If the lookup fails with EACCES and the vp is a directory, + * we should try again but bypass authorization check. Without this + * workaround directories that you can navigate to but not traverse will + * disappear when clicked in the Finder. + */ + if (result == EACCES && (vp->v_flag & VROOT) == 0) { + dp_authorized = 1; /* bypass auth check */ + if (lookup_parent(vp, &parent_vp, dp_authorized, context) == 0) { + result = 0; + } + dp_authorized = 0; /* force us to authorize */ } - }; - if (result) goto lookup_err_exit; - }; - - if (vp != targetvp) { - vrele(vp); /* Completely done with that vp now... 
*/ - }; - - vp = parent_vp; - target_id = 0; /* It's unknown at this point */ - - if (((result = VOP_ACCESS(vp, VEXEC, p->p_ucred, p)) != 0) && - ((result = VOP_ACCESS(vp, VREAD, p->p_ucred, p)) != 0)) { - VOP_UNLOCK(vp, 0, p); - goto lookup_err_exit; - }; - }; + vp = parent_vp; + } + else { + /* + * this is not a directory so we must get parent object ID + */ + result = get_parentvp(&vp, mp, context); + parent_vp = vp; + } + if (result != 0) + goto err_exit; + } + else { + /* + * we where able to get a reference on v_parent + */ + parent_vp = vp = vp->v_parent; + } + } -#if MAXPLCENTRIES - volfs_PLCEnter(mp->mnt_stat.f_fsid.val[0], id, p->p_ucred->cr_uid, p->p_pid); -#endif + /* + * Keep going up until either the process's root or the process's working + * directory is hit, either one of which are potential valid starting points + * for a full pathname + */ + while (vp != NULLVP) { -lookup_success: - /* Success: the caller has complete access to the initial vnode: */ - result = 0; - - if (vp && vp != targetvp) VOP_UNLOCK(vp, 0, p); - -lookup_err_exit: - if (vp && vp != targetvp) { - vrele(vp); - vn_lock(targetvp, LK_EXCLUSIVE | LK_RETRY, p); - if (vp_id != targetvp->v_id || targetvp->v_type == VBAD) { - result = EAGAIN; /* vnode was recycled */ + result = reverse_lookup(vp, &parent_vp, fdp, context, &dp_authorized); + if (result == 0) { + /* + * we're done and we have access + */ + break; } - }; + if (vp != parent_vp) { + /* + * we where able to walk up the parent chain so now we don't need + * vp any longer + */ + vnode_put(vp); + vp = parent_vp; + } + /* + * we have a referenced vp at this point... if dp_authorized == 1, than + * it's been authorized for search, but v_parent was NULL... + * if dp_authorized == 0, than we need to do the authorization check + * before looking up the parent + */ + if ((vp->v_flag & VROOT) != 0 || + vp == fdp->fd_cdir || vp == fdp->fd_rdir) { + /* + * we're already at the termination point, which implies that + * the authorization check in the cache failed (otherwise we + * would have returned 'done' from "reverse_lookup"... 
so, + * do the authorization and bail + */ + result = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, context); + goto lookup_exit; + } + result = lookup_parent(vp, &parent_vp, dp_authorized, context); + if (result != 0) { + goto lookup_exit; + } + if (vp != parent_vp) { + /* + * got the parent so now we don't need vp any longer + */ + vnode_put(vp); + vp = parent_vp; + } + } /* while loop */ + + /* + * Success: the caller has complete access to the initial vnode + */ + result = 0; + +lookup_exit: + if (vp != NULLVP && vp != targetvp) { + vnode_put(vp); + } err_exit: -#if 0 - KERNEL_DEBUG((FSDBG_CODE(DBG_FSVN, 12)) | DBG_FUNC_END, - (unsigned int)targetvp, (unsigned int)mp, (unsigned int)p, result, 0); -#endif return result; }; @@ -1060,33 +698,21 @@ err_exit: * id of filesystem to lookup and pointer to vnode pointer to fill in */ static int -get_fsvnode(our_mount, id, ret_vnode) - struct mount *our_mount; - int id; - struct vnode **ret_vnode; +get_fsvnode(struct mount *our_mount, int id, vnode_t *ret_vnode) { - register struct mount *mp; struct mount *cur_mount; + fsid_t cur_fsid; struct vnode *cur_vnode; struct volfs_vndata *cur_privdata; int retval; - - //DBG_VOP(("volfs: get_fsvnode called\n")); + struct vnode_fsparam vfsp; + int vid = 0; /* * OK, first look up the matching mount on the list of mounted file systems */ - cur_mount = NULL; - simple_lock(&mountlist_slock); - for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = mp->mnt_list.cqe_next) - { - if (validfsnode(mp) && mp->mnt_stat.f_fsid.val[0] == id) - { - cur_mount = mp; - break; - } - } - simple_unlock(&mountlist_slock); + /* the following will return the mount point with vfs_busy held */ + cur_mount = mount_lookupby_volfsid(id, 1); if (cur_mount == NULL) { /* @@ -1100,34 +726,36 @@ get_fsvnode(our_mount, id, ret_vnode) return ENOENT; }; + cur_fsid = cur_mount->mnt_vfsstat.f_fsid; + /* * Now search the list attached to the mount structure to * see if this vnode is already floating around */ search_vnodelist: - cur_vnode = our_mount->mnt_vnodelist.lh_first; - while (cur_vnode != NULL) - { + mount_lock(our_mount); + TAILQ_FOREACH(cur_vnode, &our_mount->mnt_vnodelist, v_mntvnodes) { cur_privdata = (struct volfs_vndata *) cur_vnode->v_data; - if (cur_privdata->nodeID == id) + if (cur_privdata->nodeID == (unsigned int)id) { if (cur_privdata->fs_mount != cur_mount) { - DBG_VOP(("volfs get_fsvnode: Updating fs_mount for vnode 0x%08lX (id = %d) from 0x%08lX to 0x%08lX...\n", - (unsigned long)cur_vnode, - cur_privdata->nodeID, - (unsigned long)cur_privdata->fs_mount, - (unsigned long)cur_mount)); cur_privdata->fs_mount = cur_mount; + cur_privdata->fs_fsid = cur_fsid; }; break; } - cur_vnode = cur_vnode->v_mntvnodes.le_next; - } + } + mount_unlock(our_mount); - //DBG_VOP(("\tfinal cur_mount: 0x%x\n",cur_mount)); if (cur_vnode) { - /* If vget returns an error, cur_vnode will not be what we think it is, try again */ - if (vget(cur_vnode, LK_EXCLUSIVE, current_proc()) != 0) { + vid = vnode_vid(cur_vnode); + + /* + * use vnode_getwithvid since it will wait for a vnode currently being + * terminated... 
if it returns an error, cur_vnode will not be what we + * think it is, try again + */ + if (vnode_getwithvid(cur_vnode, vid) != 0) { goto search_vnodelist; }; } @@ -1135,27 +763,40 @@ search_vnodelist: { MALLOC(cur_privdata, struct volfs_vndata *, sizeof(struct volfs_vndata), M_VOLFSNODE, M_WAITOK); - retval = getnewvnode(VT_VOLFS, our_mount, volfs_vnodeop_p, &cur_vnode); - if (retval != 0) { - FREE(cur_privdata, M_VOLFSNODE); - return retval; - }; - + cur_privdata->vnode_type = VOLFS_FSNODE; cur_privdata->nodeID = id; cur_privdata->fs_mount = cur_mount; - lockinit(&cur_privdata->lock, PINOD, "volfsnode", 0, 0); - lockmgr(&cur_privdata->lock, LK_EXCLUSIVE, (struct slock *)0, current_proc()); - cur_vnode->v_data = cur_privdata; - cur_vnode->v_type = VDIR; - DBG_VOP(("get_fsvnode returned with new node of ")); - DBG_VOP_PRINT_VNODE_INFO(cur_vnode);DBG_VOP(("\n")); + cur_privdata->fs_fsid = cur_fsid; + + vfsp.vnfs_mp = our_mount; + vfsp.vnfs_vtype = VDIR; + vfsp.vnfs_str = "volfs"; + vfsp.vnfs_dvp = 0; + vfsp.vnfs_fsnode = cur_privdata; + vfsp.vnfs_cnp = 0; + vfsp.vnfs_vops = volfs_vnodeop_p; + vfsp.vnfs_rdev = 0; + vfsp.vnfs_filesize = 0; + vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE; + vfsp.vnfs_marksystem = 0; + vfsp.vnfs_markroot = 0; + + retval = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &cur_vnode); + if (retval != 0) { + FREE(cur_privdata, M_VOLFSNODE); + goto out; + }; + cur_vnode->v_tag = VT_VOLFS; + } *ret_vnode = cur_vnode; - - return (0); + retval = 0; +out: + vfs_unbusy(cur_mount); + return (retval); } @@ -1166,11 +807,7 @@ search_vnodelist: * to a vnode pointer */ static int -get_filevnode(parent_fs, id, ret_vnode, p) - struct mount *parent_fs; - u_int id; - struct vnode **ret_vnode; - struct proc *p; +get_filevnode(struct mount *parent_fs, u_int id, vnode_t *ret_vnode, vfs_context_t context) { int retval; @@ -1179,18 +816,18 @@ again: * Special case 2 to mean the root of a file system */ if (id == 2) - retval = VFS_ROOT(parent_fs, ret_vnode); + retval = VFS_ROOT(parent_fs, ret_vnode, context); else - retval = VFS_VGET(parent_fs, &id, ret_vnode); + retval = VFS_VGET(parent_fs, (ino64_t)id, ret_vnode, context); if (retval) goto error; - retval = verify_fullpathaccess(id, *ret_vnode, p); + retval = verify_fullpathaccess(*ret_vnode, context); if (retval) { /* An error was encountered verifying that the caller has, in fact, got access all the way from "/" or their working directory to the specified item... */ - vput(*ret_vnode); + vnode_put(*ret_vnode); *ret_vnode = NULL; /* vnode was recycled during access verification. 
*/ if (retval == EAGAIN) { @@ -1203,382 +840,140 @@ error: } -int -volfs_lookup(ap) - struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode - **a_vpp; struct componentname *a_cnp; } */ *ap; +static int +volfs_lookup(struct vnop_lookup_args *ap) { - struct volfs_vndata *priv_data; - char *cnp; - long namelen; - struct mount *parent_fs; - int unlocked_parent = 0, isdot_or_dotdot = 0; - int ret_err = ENOENT; - DBG_FUNC_NAME("volfs_lookup"); - DBG_VOP_LOCKS_DECL(2); + struct volfs_vndata *priv_data; + char *nameptr; + long namelen; + struct mount *parent_fs; + vnode_t vp; + int isdot_or_dotdot = 0; + int ret_err = ENOENT; + char firstchar; + int ret_val; #if 0 KERNEL_DEBUG((FSDBG_CODE(DBG_FSVN, 8)) | DBG_FUNC_START, (unsigned int)ap->a_dvp, (unsigned int)ap->a_cnp, (unsigned int)p, 0, 0); #endif - - DBG_VOP(("volfs_lookup called, name = %s, namelen = %ld\n", ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen)); - - DBG_VOP_LOCKS_INIT(0,ap->a_dvp, VOPDBG_LOCKED, VOPDBG_IGNORE, VOPDBG_IGNORE, VOPDBG_POS); - DBG_VOP_LOCKS_INIT(1,*ap->a_vpp, VOPDBG_IGNORE, VOPDBG_LOCKED, VOPDBG_IGNORE, VOPDBG_POS); - DBG_VOP_PRINT_FUNCNAME();DBG_VOP(("\n")); - DBG_VOP(("\t"));DBG_VOP_PRINT_CPN_INFO(ap->a_cnp);DBG_VOP(("\n")); - if (ap->a_cnp->cn_flags & LOCKPARENT) - DBG_VOP(("\tLOCKPARENT is set\n")); - if (ap->a_cnp->cn_flags & ISLASTCN) - { - DBG_VOP(("\tISLASTCN is set\n")); - if (ap->a_cnp->cn_nameiop == DELETE || ap->a_cnp->cn_nameiop == RENAME) /* XXX PPD Shouldn't we check for CREATE, too? */ - { - ret_err = EROFS; - goto Err_Exit; - } - } priv_data = ap->a_dvp->v_data; - cnp = ap->a_cnp->cn_nameptr; + nameptr = ap->a_cnp->cn_nameptr; namelen = ap->a_cnp->cn_namelen; - -#if VOLFS_DEBUG - switch (priv_data->vnode_type) { - case VOLFS_ROOT: - DBG_VOP(("\tparent directory (vnode 0x%08lX) vnode_type is VOLFS_ROOT.\n", (unsigned long)ap->a_dvp)); - break; - - case VOLFS_FSNODE: - DBG_VOP(("\tparent directory (vnode 0x%08lX) vnode_type is VOLFS_FSNODE, nodeID = %d, fs_mount = 0x%08lX.\n", - (unsigned long)ap->a_dvp, - priv_data->nodeID, - (unsigned long)priv_data->fs_mount)); - - default: - DBG_VOP(("\tparent directory (vnode 0x%08lX) has unknown vnode_type (%d), nodeID = %d.\n", - (unsigned long)ap->a_dvp, - priv_data->vnode_type, - priv_data->nodeID)); - }; -#endif /* VOLFS_DEBUG */ + firstchar = nameptr[0]; - /* first check for "." and ".." */ - if (cnp[0] == '.') - { - if (namelen == 1) - { + /* First check for "." and ".." */ + if (firstchar == '.') { + if (namelen == 1) { /* "." requested */ isdot_or_dotdot = 1; - *ap->a_vpp = ap->a_dvp; - VREF(*ap->a_vpp); - DBG_VOP_LOCKS_TEST(0); - ret_err = 0; - } - else if (cnp[1] == '.' && namelen == 2) - { + *ap->a_vpp = ap->a_dvp; + vnode_get(*ap->a_vpp); + ret_err = 0; + } else if (nameptr[1] == '.' && namelen == 2) { /* ".." 
requested */ isdot_or_dotdot = 1; - ret_err = volfs_root(ap->a_dvp->v_mount, ap->a_vpp); + ret_err = VFS_ROOT(ap->a_dvp->v_mount, ap->a_vpp, ap->a_context); } - } - - /* then look for special file system root symbol ('@') */ - else if (cnp[0] == '@') - { + } else if (firstchar == '@') { /* '@' is alias for system root */ if ((namelen == 1) && (priv_data->vnode_type != VOLFS_ROOT)) { - parent_fs = priv_data->fs_mount; - if (!(ap->a_cnp->cn_flags & LOCKPARENT) || !(ap->a_cnp->cn_flags & ISLASTCN)) { - VOP_UNLOCK(ap->a_dvp, 0, ap->a_cnp->cn_proc); - unlocked_parent = 1; - }; - ret_err = VFS_ROOT(parent_fs, ap->a_vpp); - } else { - DBG_VOP(("volfs_lookup: pathname = '@' but namelen = %ld and parent vnode_type = %d.\n", namelen, priv_data->vnode_type)); - *ap->a_vpp = NULL; - ret_err = ENOENT; - }; - } - - /* finally, just look for numeric ids... */ - else if (namelen <= 10 && cnp[0] > '0' && cnp[0] <= '9') /* 10 digits max lead digit must be 1 - 9 */ - { + /* the following returns with iteration count on mount point */ + parent_fs = mount_list_lookupby_fsid(&priv_data->fs_fsid, 0, 1); + if (parent_fs) { + ret_val = vfs_busy(parent_fs, LK_NOWAIT); + mount_iterdrop(parent_fs); + if (ret_val !=0) { + *ap->a_vpp = NULL; + ret_err = ENOENT; + } else { + ret_err = VFS_ROOT(parent_fs, ap->a_vpp, ap->a_context); + vfs_unbusy(parent_fs); + } + } else { + *ap->a_vpp = NULL; + ret_err = ENOENT; + } + } else { + *ap->a_vpp = NULL; + ret_err = ENOENT; + } + } else if (namelen <= 10 && firstchar > '0' && firstchar <= '9') { char *check_ptr; u_long id; - id = strtoul(cnp, &check_ptr, 10); + id = strtoul(nameptr, &check_ptr, 10); - /* + /* * strtol will leave us at the first non-numeric character. * we've checked to make sure the component name does * begin with a numeric so check_ptr must wind up on * the terminating null or there was other junk following the * number */ - if ((check_ptr - cnp) == namelen) - { - if (priv_data->vnode_type == VOLFS_ROOT) + if ((check_ptr - nameptr) == namelen) { + if (priv_data->vnode_type == VOLFS_ROOT) { + /* + * OPTIMIZATION + * + * Obtain the mountpoint and call VFS_VGET in + * one step (ie without creating a vnode for + * the mountpoint). + */ + if (check_ptr[0] == '/' && + check_ptr[1] > '0' && check_ptr[1] <= '9') { + struct mount *mp; + struct vnode *vp; + u_long id2; + char *endptr; + + /* this call will return mount point with vfs_busy held */ + mp = mount_lookupby_volfsid(id, 1); + if (mp == NULL) { + *ap->a_vpp = NULL; + return ENOENT; + } + id2 = strtoul(&check_ptr[1], &endptr, 10); + if ((endptr[0] == '/' || endptr[0] == '\0') && + get_filevnode(mp, id2, &vp, ap->a_context) == 0) { + ap->a_cnp->cn_consume = endptr - check_ptr; + *ap->a_vpp = vp; + vfs_unbusy(mp); + return (0); + } + vfs_unbusy(mp); + } + /* Fall through to default behavior... 
*/ + ret_err = get_fsvnode(ap->a_dvp->v_mount, id, ap->a_vpp); - else { - parent_fs = priv_data->fs_mount; - if (!(ap->a_cnp->cn_flags & LOCKPARENT) || !(ap->a_cnp->cn_flags & ISLASTCN)) { - VOP_UNLOCK(ap->a_dvp, 0, ap->a_cnp->cn_proc); - unlocked_parent = 1; - }; - ret_err = get_filevnode(parent_fs, id, ap->a_vpp, ap->a_cnp->cn_proc); - } - } - } - if (!isdot_or_dotdot && *ap->a_vpp && VPARENT(*ap->a_vpp) == NULL && ap->a_dvp != *ap->a_vpp) { - if (VPARENT(ap->a_dvp) == *ap->a_vpp) { - panic("volfs: ap->a_dvp 0x%x has parent == a_vpp 0x%x\n", - ap->a_dvp, *ap->a_vpp); + } else { + parent_fs = mount_list_lookupby_fsid(&priv_data->fs_fsid, 0, 1); + if (parent_fs) { + ret_val = vfs_busy(parent_fs, LK_NOWAIT); + mount_iterdrop(parent_fs); + if (ret_val !=0) { + *ap->a_vpp = NULL; + ret_err = ENOENT; + } else { + ret_err = get_filevnode(parent_fs, id, ap->a_vpp, ap->a_context); + vfs_unbusy(parent_fs); + } + } else { + *ap->a_vpp = NULL; + ret_err = ENOENT; + } + } } - vget(ap->a_dvp, 0, ap->a_cnp->cn_proc); - VPARENT(*ap->a_vpp) = ap->a_dvp; } + vp = *ap->a_vpp; - if (!unlocked_parent && (!(ap->a_cnp->cn_flags & LOCKPARENT) || !(ap->a_cnp->cn_flags & ISLASTCN))) { - VOP_UNLOCK(ap->a_dvp, 0, ap->a_cnp->cn_proc); - }; - - /* XXX PPD Should we do something special in case LOCKLEAF isn't set? */ + if ( ret_err == 0 && !isdot_or_dotdot && (vp != NULLVP) && (vp->v_parent == NULLVP)) + vnode_update_identity(vp, ap->a_dvp, NULL, 0, 0, VNODE_UPDATE_PARENT); -Err_Exit: - - DBG_VOP_UPDATE_VP(1, *ap->a_vpp); - DBG_VOP_LOCKS_TEST(ret_err); - #if 0 KERNEL_DEBUG((FSDBG_CODE(DBG_FSVN, 8)) | DBG_FUNC_START, (unsigned int)ap->a_dvp, (unsigned int)ap->a_cnp, (unsigned int)p, ret_err, 0); #endif - return (ret_err); -} - -#if DBG_VOP_TEST_LOCKS - -#if 0 -static void DbgLookupTest( char *funcname, struct componentname *cnp, struct vnode *dvp, struct vnode *vp) -{ - int flags = cnp->cn_flags; - int nameiop = cnp->cn_nameiop; - - DBG_VOP (("%s: Action:", funcname)); - switch (nameiop) - { - case LOOKUP: - PRINTIT ("LOOKUP"); - break; - case CREATE: - PRINTIT ("CREATE"); - break; - case DELETE: - PRINTIT ("DELETE"); - break; - case RENAME: - PRINTIT ("RENAME"); - break; - default: - PRINTIT ("!!!UNKNOWN!!!!"); - break; - } - PRINTIT(" flags: 0x%x ",flags ); - if (flags & LOCKPARENT) - PRINTIT (" Lock Parent"); - if (flags & ISLASTCN) - PRINTIT (" Last Action"); - PRINTIT("\n"); - - if (dvp) - { - PRINTIT ("%s: Parent vnode exited ", funcname); - if (VOP_ISLOCKED(dvp)) - PRINTIT("LOCKED\n"); - else - PRINTIT("UNLOCKED\n"); - } - if (vp && vp==dvp) - { - PRINTIT ("%s: Found and Parent are the same\n", funcname); - } - else if (vp) - { - PRINTIT ("%s: Found vnode exited ", funcname); - if (VOP_ISLOCKED(vp)) - PRINTIT("LOCKED\n"); - else - PRINTIT("UNLOCKED\n"); - } - else - PRINTIT ("%s: Found vnode exited NULL\n", funcname); - - -} -#endif - -static void DbgVopTest( int maxSlots, - int retval, - VopDbgStoreRec *VopDbgStore, - char *funcname) -{ - int index; - - for (index = 0; index < maxSlots; index++) - { - if (VopDbgStore[index].id != index) { - PRINTIT("%s: DBG_VOP_LOCK: invalid id field (%d) in target entry (#%d).\n", funcname, VopDbgStore[index].id, index); - return; - }; - - if ((VopDbgStore[index].vp != NULL) && - ((VopDbgStore[index].vp->v_data==NULL))) - continue; - - switch (VopDbgStore[index].inState) - { - case VOPDBG_IGNORE: - case VOPDBG_SAME: - /* Do Nothing !!! 
*/ - break; - case VOPDBG_LOCKED: - case VOPDBG_UNLOCKED: - case VOPDBG_LOCKNOTNIL: - { - if (VopDbgStore[index].vp == NULL && (VopDbgStore[index].inState != VOPDBG_LOCKNOTNIL)) { - PRINTIT ("%s: InState check: Null vnode ptr in entry #%d\n", funcname, index); - } else if (VopDbgStore[index].vp != NULL) { - switch (VopDbgStore[index].inState) - { - case VOPDBG_LOCKED: - case VOPDBG_LOCKNOTNIL: - if (VopDbgStore[index].inValue == 0) - { - PRINTIT ("%s: %d Entry: not LOCKED:", funcname, index); DBG_VOP(("\n")); - } - break; - case VOPDBG_UNLOCKED: - if (VopDbgStore[index].inValue != 0) - { - PRINTIT ("%s: %d Entry: not UNLOCKED:", funcname, index); DBG_VOP(("\n")); - } - break; - } - } - break; - } - default: - PRINTIT ("%s: DBG_VOP_LOCK on entry: bad lock test value: %d\n", funcname, VopDbgStore[index].errState); - } - - - if (retval != 0) - { - switch (VopDbgStore[index].errState) - { - case VOPDBG_IGNORE: - /* Do Nothing !!! */ - break; - case VOPDBG_LOCKED: - case VOPDBG_UNLOCKED: - case VOPDBG_SAME: - { - if (VopDbgStore[index].vp == NULL) { - PRINTIT ("%s: ErrState check: Null vnode ptr in entry #%d\n", funcname, index); - } else { - VopDbgStore[index].outValue = VOP_ISLOCKED(VopDbgStore[index].vp); - switch (VopDbgStore[index].errState) - { - case VOPDBG_LOCKED: - if (VopDbgStore[index].outValue == 0) - { - PRINTIT ("%s: %d Error: not LOCKED:", funcname, index); DBG_VOP(("\n")); - } - break; - case VOPDBG_UNLOCKED: - if (VopDbgStore[index].outValue != 0) - { - PRINTIT ("%s: %d Error: not UNLOCKED:", funcname, index); DBG_VOP(("\n")); - } - break; - case VOPDBG_SAME: - if (VopDbgStore[index].outValue != VopDbgStore[index].inValue) - PRINTIT ("%s: Error: In/Out locks are DIFFERENT: 0x%x, inis %d and out is %d\n", funcname, (u_int)VopDbgStore[index].vp, VopDbgStore[index].inValue, VopDbgStore[index].outValue); - break; - } - } - break; - } - case VOPDBG_LOCKNOTNIL: - if (VopDbgStore[index].vp != NULL) { - VopDbgStore[index].outValue = VOP_ISLOCKED(VopDbgStore[index].vp); - if (VopDbgStore[index].outValue == 0) - PRINTIT ("%s: Error: %d Not LOCKED: 0x%x\n", funcname, index, (u_int)VopDbgStore[index].vp); - } - break; - default: - PRINTIT ("%s: Error: bad lock test value: %d\n", funcname, VopDbgStore[index].errState); - } - } - else - { - switch (VopDbgStore[index].outState) - { - case VOPDBG_IGNORE: - /* Do Nothing !!! 
*/ - break; - case VOPDBG_LOCKED: - case VOPDBG_UNLOCKED: - case VOPDBG_SAME: - if (VopDbgStore[index].vp == NULL) { - PRINTIT ("%s: OutState: Null vnode ptr in entry #%d\n", funcname, index); - }; - if (VopDbgStore[index].vp != NULL) - { - VopDbgStore[index].outValue = VOP_ISLOCKED(VopDbgStore[index].vp); - switch (VopDbgStore[index].outState) - { - case VOPDBG_LOCKED: - if (VopDbgStore[index].outValue == 0) - { - PRINTIT ("%s: %d Out: not LOCKED:", funcname, index); DBG_VOP(("\n")); - } - break; - case VOPDBG_UNLOCKED: - if (VopDbgStore[index].outValue != 0) - { - PRINTIT ("%s: %d Out: not UNLOCKED:", funcname, index); DBG_VOP(("\n")); - } - break; - case VOPDBG_SAME: - if (VopDbgStore[index].outValue != VopDbgStore[index].inValue) - PRINTIT ("%s: Out: In/Out locks are DIFFERENT: 0x%x, inis %d and out is %d\n", funcname, (u_int)VopDbgStore[index].vp, VopDbgStore[index].inValue, VopDbgStore[index].outValue); - break; - } - } - break; - case VOPDBG_LOCKNOTNIL: - if (VopDbgStore[index].vp != NULL) { - if (&((struct volfs_vndata *)(VopDbgStore[index].vp->v_data))->lock == NULL) - PRINTIT ("%s: DBG_VOP_LOCK on out: Null lock on vnode 0x%x\n", funcname, (u_int)VopDbgStore[index].vp); - else { - VopDbgStore[index].outValue = VOP_ISLOCKED(VopDbgStore[index].vp); - if (VopDbgStore[index].outValue == 0) - { - PRINTIT ("%s: DBG_VOP_LOCK on out: Should be LOCKED:", funcname); DBG_VOP(("\n")); - } - } - } - break; - default: - PRINTIT ("%s: DBG_VOP_LOCK on out: bad lock test value: %d\n", funcname, VopDbgStore[index].outState); - } - } - - VopDbgStore[index].id = -1; /* Invalidate the entry to allow panic-free re-use */ - } + return (ret_err); } -#endif /* DBG_VOP_TEST_LOCKS */ - diff --git a/bsd/net/Makefile b/bsd/net/Makefile index 304c2be7d..e29e3a849 100644 --- a/bsd/net/Makefile +++ b/bsd/net/Makefile @@ -20,29 +20,37 @@ EXPINC_SUBDIRS_PPC = \ EXPINC_SUBDIRS_I386 = \ DATAFILES= \ - bpf.h bpf_compat.h bpfdesc.h dlil.h dlil_pvt.h \ - etherdefs.h ethernet.h if.h if_arp.h \ + bpf.h dlil.h \ + ethernet.h if.h if_arp.h \ if_dl.h if_llc.h if_media.h if_mib.h \ - if_slvar.h \ - if_types.h if_var.h iso88025.h \ - kext_net.h ndrv.h net_osdep.h netisr.h pfkeyv2.h \ - radix.h raw_cb.h route.h slcompress.h slip.h + if_types.h if_var.h \ + kext_net.h ndrv.h pfkeyv2.h \ + route.h + +KERNELFILES= \ + kpi_interface.h kpi_interfacefilter.h kpi_protocol.h \ + if_ether.h init.h PRIVATE_DATAFILES = \ - ndrv_var.h zlib.h if_pppvar.h if_sppp.h ppp_comp.h if_atm.h \ - if_tun.h if_vlan_var.h if_ppp.h firewire.h ppp_defs.h + if_atm.h if_vlan_var.h if_ppp.h firewire.h \ + ppp_defs.h radix.h if_bond_var.h lacp.h ndrv_var.h \ + raw_cb.h etherdefs.h iso88025.h + +PRIVATE_KERNELFILES = ${KERNELFILES} \ + bpfdesc.h dlil_pvt.h if_faith.h ppp_comp.h \ + zlib.h bpf_compat.h net_osdep.h INSTALL_MI_LIST = ${DATAFILES} INSTALL_MI_DIR = net -EXPORT_MI_LIST = ${INSTALL_MI_LIST} +EXPORT_MI_LIST = ${INSTALL_MI_LIST} ${KERNELFILES} EXPORT_MI_DIR = ${INSTALL_MI_DIR} INSTALL_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} -INSTALL_MI_LCL_KERN_LIST = ${PRIVATE_DATAFILES} +INSTALL_KF_MI_LCL_LIST = ${INSTALL_MI_LCL_LIST} ${PRIVATE_KERNELFILES} include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/net/bpf.c b/bsd/net/bpf.c index 1da778677..3d025ccbc 100644 --- a/bsd/net/bpf.c +++ b/bsd/net/bpf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -81,6 +81,7 @@ #include <sys/sockio.h> #include <sys/ttycom.h> #include <sys/filedesc.h> +#include <sys/uio_internal.h> #if defined(sparc) && BSD < 199103 #include <sys/stream.h> @@ -100,10 +101,14 @@ #include <sys/sysctl.h> #include <net/firewire.h> -#include <machine/ansi.h> +#include <machine/spl.h> #include <miscfs/devfs/devfs.h> #include <net/dlil.h> +#include <kern/locks.h> + +extern int tvtohz(struct timeval *); + #if NBPFILTER > 0 /* @@ -126,12 +131,15 @@ static caddr_t bpf_alloc(); /* * The default read buffer size is patchable. */ -static int bpf_bufsize = BPF_BUFSIZE; +static unsigned int bpf_bufsize = BPF_BUFSIZE; SYSCTL_INT(_debug, OID_AUTO, bpf_bufsize, CTLFLAG_RW, &bpf_bufsize, 0, ""); -static int bpf_maxbufsize = BPF_MAXBUFSIZE; +static unsigned int bpf_maxbufsize = BPF_MAXBUFSIZE; SYSCTL_INT(_debug, OID_AUTO, bpf_maxbufsize, CTLFLAG_RW, &bpf_maxbufsize, 0, ""); +static unsigned int bpf_maxdevices = 256; +SYSCTL_UINT(_debug, OID_AUTO, bpf_maxdevices, CTLFLAG_RW, + &bpf_maxdevices, 0, ""); /* * bpf_iflist is the list of interfaces; each corresponds to an ifnet @@ -143,42 +151,48 @@ static struct bpf_if *bpf_iflist; * BSD now stores the bpf_d in the dev_t which is a struct * on their system. Our dev_t is an int, so we still store * the bpf_d in a separate table indexed by minor device #. + * + * The value stored in bpf_dtab[n] represent three states: + * 0: device not opened + * 1: device opening or closing + * other: device <n> opened with pointer to storage */ static struct bpf_d **bpf_dtab = NULL; -static int bpf_dtab_size = 0; -static int nbpfilter = 0; +static unsigned int bpf_dtab_size = 0; +static unsigned int nbpfilter = 0; + +static lck_mtx_t *bpf_mlock; +static lck_grp_t *bpf_mlock_grp; +static lck_grp_attr_t *bpf_mlock_grp_attr; +static lck_attr_t *bpf_mlock_attr; /* * Mark a descriptor free by making it point to itself. * This is probably cheaper than marking with a constant since * the address should be in a register anyway. 
*/ -#define D_ISFREE(d) ((d) == (d)->bd_next) -#define D_MARKFREE(d) ((d)->bd_next = (d)) -#define D_MARKUSED(d) ((d)->bd_next = 0) #endif /* __APPLE__ */ -static int bpf_allocbufs __P((struct bpf_d *)); -static void bpf_attachd __P((struct bpf_d *d, struct bpf_if *bp)); -static void bpf_detachd __P((struct bpf_d *d)); -static void bpf_freed __P((struct bpf_d *)); -static void bpf_mcopy __P((const void *, void *, size_t)); -static int bpf_movein __P((struct uio *, int, - struct mbuf **, struct sockaddr *, int *)); -static int bpf_setif __P((struct bpf_d *, struct ifreq *)); -static inline void - bpf_wakeup __P((struct bpf_d *)); -static void catchpacket __P((struct bpf_d *, u_char *, u_int, - u_int, void (*)(const void *, void *, size_t))); -static void reset_d __P((struct bpf_d *)); -static int bpf_setf __P((struct bpf_d *, struct bpf_program *)); +static int bpf_allocbufs(struct bpf_d *); +static void bpf_attachd(struct bpf_d *d, struct bpf_if *bp); +static void bpf_detachd(struct bpf_d *d); +static void bpf_freed(struct bpf_d *); +static void bpf_mcopy(const void *, void *, size_t); +static int bpf_movein(struct uio *, int, + struct mbuf **, struct sockaddr *, int *); +static int bpf_setif(struct bpf_d *, struct ifreq *); +static void bpf_wakeup(struct bpf_d *); +static void catchpacket(struct bpf_d *, u_char *, u_int, + u_int, void (*)(const void *, void *, size_t)); +static void reset_d(struct bpf_d *); +static int bpf_setf(struct bpf_d *, struct user_bpf_program *); /*static void *bpf_devfs_token[MAXBPFILTER];*/ static int bpf_devsw_installed; -void bpf_init __P((void *unused)); - +void bpf_init(void *unused); +int bpf_tap_callback(struct ifnet *ifp, struct mbuf *m); /* * Darwin differs from BSD here, the following are static @@ -188,15 +202,9 @@ void bpf_init __P((void *unused)); d_close_t bpfclose; d_read_t bpfread; d_write_t bpfwrite; - d_ioctl_t bpfioctl; + ioctl_fcn_t bpfioctl; select_fcn_t bpfpoll; -#ifdef __APPLE__ -void bpf_mtap(struct ifnet *, struct mbuf *); - -int bpfopen(), bpfclose(), bpfread(), bpfwrite(), bpfioctl(), - bpfpoll(); -#endif /* Darwin's cdevsw struct differs slightly from BSDs */ #define CDEV_MAJOR 23 @@ -206,98 +214,101 @@ static struct cdevsw bpf_cdevsw = { /* read */ bpfread, /* write */ bpfwrite, /* ioctl */ bpfioctl, - /* stop */ nulldev, - /* reset */ nulldev, - /* tty */ NULL, + /* stop */ eno_stop, + /* reset */ eno_reset, + /* tty */ NULL, /* select */ bpfpoll, - /* mmap */ eno_mmap, + /* mmap */ eno_mmap, /* strategy*/ eno_strat, - /* getc */ eno_getc, - /* putc */ eno_putc, - /* type */ 0 + /* getc */ eno_getc, + /* putc */ eno_putc, + /* type */ 0 }; #define SOCKADDR_HDR_LEN offsetof(struct sockaddr, sa_data) static int -bpf_movein(uio, linktype, mp, sockp, datlen) - register struct uio *uio; - int linktype, *datlen; - register struct mbuf **mp; - register struct sockaddr *sockp; +bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *sockp, int *datlen) { struct mbuf *m; int error; int len; int hlen; - /* - * Build a sockaddr based on the data link layer type. - * We do this at this level because the ethernet header - * is copied directly into the data field of the sockaddr. - * In the case of SLIP, there is no header and the packet - * is forwarded as is. - * Also, we are careful to leave room at the front of the mbuf - * for the link level header. 
- */ - switch (linktype) { - - case DLT_SLIP: - sockp->sa_family = AF_INET; - hlen = 0; - break; - - case DLT_EN10MB: - sockp->sa_family = AF_UNSPEC; - /* XXX Would MAXLINKHDR be better? */ - hlen = sizeof(struct ether_header); - break; - - case DLT_FDDI: -#if defined(__FreeBSD__) || defined(__bsdi__) - sockp->sa_family = AF_IMPLINK; - hlen = 0; -#else - sockp->sa_family = AF_UNSPEC; - /* XXX 4(FORMAC)+6(dst)+6(src)+3(LLC)+5(SNAP) */ - hlen = 24; -#endif - break; - - case DLT_RAW: - case DLT_NULL: - sockp->sa_family = AF_UNSPEC; - hlen = 0; - break; - -#ifdef __FreeBSD__ - case DLT_ATM_RFC1483: + if (sockp) { /* - * en atm driver requires 4-byte atm pseudo header. - * though it isn't standard, vpi:vci needs to be - * specified anyway. + * Build a sockaddr based on the data link layer type. + * We do this at this level because the ethernet header + * is copied directly into the data field of the sockaddr. + * In the case of SLIP, there is no header and the packet + * is forwarded as is. + * Also, we are careful to leave room at the front of the mbuf + * for the link level header. */ - sockp->sa_family = AF_UNSPEC; - hlen = 12; /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */ - break; -#endif - case DLT_PPP: - sockp->sa_family = AF_UNSPEC; - hlen = 4; /* This should match PPP_HDRLEN */ - break; - - case DLT_APPLE_IP_OVER_IEEE1394: - sockp->sa_family = AF_UNSPEC; - hlen = sizeof(struct firewire_header); - break; - - default: - return (EIO); + switch (linktype) { + + case DLT_SLIP: + sockp->sa_family = AF_INET; + hlen = 0; + break; + + case DLT_EN10MB: + sockp->sa_family = AF_UNSPEC; + /* XXX Would MAXLINKHDR be better? */ + hlen = sizeof(struct ether_header); + break; + + case DLT_FDDI: + #if defined(__FreeBSD__) || defined(__bsdi__) + sockp->sa_family = AF_IMPLINK; + hlen = 0; + #else + sockp->sa_family = AF_UNSPEC; + /* XXX 4(FORMAC)+6(dst)+6(src)+3(LLC)+5(SNAP) */ + hlen = 24; + #endif + break; + + case DLT_RAW: + case DLT_NULL: + sockp->sa_family = AF_UNSPEC; + hlen = 0; + break; + + #ifdef __FreeBSD__ + case DLT_ATM_RFC1483: + /* + * en atm driver requires 4-byte atm pseudo header. + * though it isn't standard, vpi:vci needs to be + * specified anyway. + */ + sockp->sa_family = AF_UNSPEC; + hlen = 12; /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */ + break; + #endif + case DLT_PPP: + sockp->sa_family = AF_UNSPEC; + hlen = 4; /* This should match PPP_HDRLEN */ + break; + + case DLT_APPLE_IP_OVER_IEEE1394: + sockp->sa_family = AF_UNSPEC; + hlen = sizeof(struct firewire_header); + break; + + default: + return (EIO); + } + if ((hlen + SOCKADDR_HDR_LEN) > sockp->sa_len) { + return (EIO); + } } - if ((hlen + SOCKADDR_HDR_LEN) > sockp->sa_len) { - return (EIO); + else { + hlen = 0; } - len = uio->uio_resid; + + // LP64todo - fix this! + len = uio_resid(uio); *datlen = len - hlen; if ((unsigned)len > MCLBYTES) return (EIO); @@ -305,7 +316,7 @@ bpf_movein(uio, linktype, mp, sockp, datlen) MGETHDR(m, M_WAIT, MT_DATA); if (m == 0) return (ENOBUFS); - if (len > MHLEN) { + if ((unsigned)len > MHLEN) { #if BSD >= 199103 MCLGET(m, M_WAIT); if ((m->m_flags & M_EXT) == 0) { @@ -347,76 +358,71 @@ bpf_movein(uio, linktype, mp, sockp, datlen) /* Callback registered with Ethernet driver. */ int bpf_tap_callback(struct ifnet *ifp, struct mbuf *m) { - boolean_t funnel_state; - - funnel_state = thread_funnel_set(network_flock, TRUE); - /* * Do nothing if the BPF tap has been turned off. * This is to protect from a potential race where this - * call blocks on the funnel lock. And in the meantime + * call blocks on the lock. 
And in the meantime * BPF is turned off, which will clear if_bpf. */ if (ifp->if_bpf) bpf_mtap(ifp, m); - - thread_funnel_set(network_flock, funnel_state); return 0; } /* - * Returns 1 on sucess, 0 on failure + * The dynamic addition of a new device node must block all processes that are opening + * the last device so that no process will get an unexpected ENOENT */ -static int -bpf_dtab_grow(int increment) +static void +bpf_make_dev_t(int maj) { - struct bpf_d **new_dtab = NULL; - - new_dtab = (struct bpf_d **)_MALLOC(sizeof(struct bpf_d *) * (bpf_dtab_size + increment), M_DEVBUF, M_WAIT); - if (new_dtab == NULL) - return 0; - - if (bpf_dtab) { - struct bpf_d **old_dtab; + static int bpf_growing = 0; + unsigned int cur_size = nbpfilter, i; - bcopy(bpf_dtab, new_dtab, sizeof(struct bpf_d *) * bpf_dtab_size); - /* - * replace must be atomic with respect to free do bpf_dtab - * is always valid. - */ - old_dtab = bpf_dtab; - bpf_dtab = new_dtab; - _FREE(old_dtab, M_DEVBUF); - } - else bpf_dtab = new_dtab; - - bzero(bpf_dtab + bpf_dtab_size, sizeof(struct bpf_d *) * increment); - - bpf_dtab_size += increment; - - return 1; -} + if (nbpfilter >= bpf_maxdevices) + return; -static struct bpf_d * -bpf_make_dev_t(int maj) -{ - struct bpf_d *d; + while (bpf_growing) { + /* Wait until new device has been created */ + (void)tsleep((caddr_t)&bpf_growing, PZERO, "bpf_growing", 0); + } + if (nbpfilter > cur_size) { + /* other thread grew it already */ + return; + } + bpf_growing = 1; - if (nbpfilter >= bpf_dtab_size && bpf_dtab_grow(NBPFILTER) == 0) - return NULL; - - d = (struct bpf_d *)_MALLOC(sizeof(struct bpf_d), M_DEVBUF, M_WAIT); - if (d != NULL) { - int i = nbpfilter++; - - bzero(d, sizeof(struct bpf_d)); - bpf_dtab[i] = d; - D_MARKFREE(bpf_dtab[i]); - /*bpf_devfs_token[i] = */devfs_make_node(makedev(maj, i), - DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0600, - "bpf%d", i); + /* need to grow bpf_dtab first */ + if (nbpfilter == bpf_dtab_size) { + int new_dtab_size; + struct bpf_d **new_dtab = NULL; + struct bpf_d **old_dtab = NULL; + + new_dtab_size = bpf_dtab_size + NBPFILTER; + new_dtab = (struct bpf_d **)_MALLOC(sizeof(struct bpf_d *) * new_dtab_size, M_DEVBUF, M_WAIT); + if (new_dtab == 0) { + printf("bpf_make_dev_t: malloc bpf_dtab failed\n"); + goto done; + } + if (bpf_dtab) { + bcopy(bpf_dtab, new_dtab, + sizeof(struct bpf_d *) * bpf_dtab_size); + } + bzero(new_dtab + bpf_dtab_size, + sizeof(struct bpf_d *) * NBPFILTER); + old_dtab = bpf_dtab; + bpf_dtab = new_dtab; + bpf_dtab_size = new_dtab_size; + if (old_dtab != NULL) + _FREE(old_dtab, M_DEVBUF); } - return d; + i = nbpfilter++; + (void) devfs_make_node(makedev(maj, i), + DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0600, + "bpf%d", i); +done: + bpf_growing = 0; + wakeup((caddr_t)&bpf_growing); } #endif @@ -426,9 +432,7 @@ bpf_make_dev_t(int maj) * Must be called at splimp. */ static void -bpf_attachd(d, bp) - struct bpf_d *d; - struct bpf_if *bp; +bpf_attachd(struct bpf_d *d, struct bpf_if *bp) { /* * Point d at bp, and add d to the interface's list of listeners. @@ -442,8 +446,7 @@ bpf_attachd(d, bp) bp->bif_ifp->if_bpf = bp; #ifdef __APPLE__ - if (bp->bif_ifp->if_set_bpf_tap) - (*bp->bif_ifp->if_set_bpf_tap)(bp->bif_ifp, BPF_TAP_INPUT_OUTPUT, bpf_tap_callback); + dlil_set_bpf_tap(bp->bif_ifp, BPF_TAP_INPUT_OUTPUT, bpf_tap_callback); #endif } @@ -451,8 +454,7 @@ bpf_attachd(d, bp) * Detach a file from its interface. 
 */
 static void
-bpf_detachd(d)
-	struct bpf_d *d;
+bpf_detachd(struct bpf_d *d)
 {
 	struct bpf_d **p;
 	struct bpf_if *bp;
@@ -470,14 +472,14 @@ bpf_detachd(d)
 	 */
 	if (d->bd_promisc) {
 		d->bd_promisc = 0;
-		if (ifpromisc(bp->bif_ifp, 0))
+		if (ifnet_set_promiscuous(bp->bif_ifp, 0))
 			/*
 			 * Something is really wrong if we were able to put
 			 * the driver into promiscuous mode, but can't
 			 * take it out.
 			 * Most likely the network interface is gone.
 			 */
-			printf("bpf: ifpromisc failed");
+			printf("bpf: ifnet_set_promiscuous failed");
 	}
 	/* Remove d from the interface's descriptor list. */
 	p = &bp->bif_dlist;
@@ -505,58 +507,57 @@ bpf_detachd(d)
  */
 /* ARGSUSED */
 int
-bpfopen(dev, flags, fmt, p)
-	dev_t dev;
-	int flags;
-	int fmt;
-	struct proc *p;
+bpfopen(dev_t dev, __unused int flags, __unused int fmt, __unused struct proc *p)
 {
 	register struct bpf_d *d;

-#ifdef __APPLE__
-	/* new device nodes on demand when opening the last one */
-	if (minor(dev) == nbpfilter - 1)
-		bpf_make_dev_t(major(dev));
-
-	if (minor(dev) >= nbpfilter)
+	if ((unsigned int) minor(dev) >= nbpfilter)
 		return (ENXIO);
-
-	d = bpf_dtab[minor(dev)];
-
-	thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL);
-#else
-	if (p->p_prison)
-		return (EPERM);
+
+	/*
+	 * New device nodes are created on demand when opening the last one.
+	 * The programming model is for processes to loop on the minor starting at 0
+	 * as long as EBUSY is returned. The loop stops when either the open succeeds
+	 * or an error other than EBUSY is returned. That means that bpf_make_dev_t()
+	 * must block all processes that are opening the last node. If not all
+	 * processes are blocked, they could unexpectedly get ENOENT and abort their
+	 * opening loop.
+	 */
+	if ((unsigned int) minor(dev) == (nbpfilter - 1))
+		bpf_make_dev_t(major(dev));

-	d = dev->si_drv1;
-#endif
 	/*
 	 * Each minor can be opened by only one process.  If the requested
 	 * minor is in use, return EBUSY.
+	 *
+	 * Important: bpfopen() and bpfclose() have to check and set the status
+	 * of a device in the same locking context, otherwise the device may be
+	 * leaked because the vnode use count will be unexpectedly greater than 1
+	 * when close() is called.
 	 */
-#ifdef __APPLE__
-	if (!D_ISFREE(d)) {
-		thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL);
-		return (EBUSY);
+	if (bpf_dtab[minor(dev)] == 0)
+		bpf_dtab[minor(dev)] = (void *)1;	/* Mark opening */
+	else
+		return (EBUSY);
+
+	d = (struct bpf_d *)_MALLOC(sizeof(struct bpf_d), M_DEVBUF, M_WAIT);
+	if (d == NULL) {
+		/* this really is a catastrophic failure */
+		printf("bpfopen: malloc bpf_d failed\n");
+		bpf_dtab[minor(dev)] = 0;
+		return ENOMEM;
 	}
+	bzero(d, sizeof(struct bpf_d));

-	/* Mark "free" and do most initialization. */
-	bzero((char *)d, sizeof(*d));
-#else
-	if (d)
-		return (EBUSY);
-	make_dev(&bpf_cdevsw, minor(dev), 0, 0, 0600, "bpf%d", lminor(dev));
-	MALLOC(d, struct bpf_d *, sizeof(*d), M_BPF, M_WAITOK);
-	bzero(d, sizeof(*d));
-	dev->si_drv1 = d;
-#endif
+	/*
+	 * It is not necessary to take the BPF lock here because no other
+	 * thread can access the device until it is marked opened...
+	 */
+
+	/* Mark "in use" and do most initialization.
*/ d->bd_bufsize = bpf_bufsize; d->bd_sig = SIGIO; d->bd_seesent = 1; - -#ifdef __APPLE__ - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); -#endif + bpf_dtab[minor(dev)] = d; /* Mark opened */ return (0); } @@ -567,93 +568,49 @@ bpfopen(dev, flags, fmt, p) */ /* ARGSUSED */ int -bpfclose(dev, flags, fmt, p) - dev_t dev; - int flags; - int fmt; - struct proc *p; +bpfclose(dev_t dev, __unused int flags, __unused int fmt, __unused struct proc *p) { register struct bpf_d *d; - register int s; -#ifdef __APPLE__ - struct bpf_d **bpf_dtab_schk; -#endif -#ifndef __APPLE__ - funsetown(d->bd_sigio); -#endif - s = splimp(); -#ifdef __APPLE__ -again: d = bpf_dtab[minor(dev)]; - bpf_dtab_schk = bpf_dtab; -#endif - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + if (d == 0 || d == (void *)1) + return (ENXIO); + + bpf_dtab[minor(dev)] = (void *)1; /* Mark closing */ -#ifdef __APPLE__ - /* - * If someone grows bpf_dtab[] while we were waiting for the - * funnel, then we will be pointing off into freed memory; - * check to see if this is the case. - */ - if (bpf_dtab_schk != bpf_dtab) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - goto again; - } -#endif + /* Take BPF lock to ensure no other thread is using the device */ + lck_mtx_lock(bpf_mlock); if (d->bd_bif) bpf_detachd(d); - splx(s); -#ifdef __APPLE__ selthreadclear(&d->bd_sel); -#endif bpf_freed(d); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + + lck_mtx_unlock(bpf_mlock); + + /* Mark free in same context as bpfopen comes to check */ + bpf_dtab[minor(dev)] = 0; /* Mark closed */ + _FREE(d, M_DEVBUF); + return (0); } -/* - * Support for SunOS, which does not have tsleep. - */ -#if BSD < 199103 -static -bpf_timeout(arg) - caddr_t arg; -{ - boolean_t funnel_state; - struct bpf_d *d = (struct bpf_d *)arg; - funnel_state = thread_funnel_set(network_flock, TRUE); - d->bd_timedout = 1; - wakeup(arg); - (void) thread_funnel_set(network_flock, FALSE); -} -#define BPF_SLEEP(chan, pri, s, t) bpf_sleep((struct bpf_d *)chan) +#define BPF_SLEEP bpf_sleep -int -bpf_sleep(d) - register struct bpf_d *d; +static int +bpf_sleep(struct bpf_d *d, int pri, const char *wmesg, int timo) { - register int rto = d->bd_rtout; register int st; - if (rto != 0) { - d->bd_timedout = 0; - timeout(bpf_timeout, (caddr_t)d, rto); - } - st = sleep((caddr_t)d, PRINET|PCATCH); - if (rto != 0) { - if (d->bd_timedout == 0) - untimeout(bpf_timeout, (caddr_t)d); - else if (st == 0) - return EWOULDBLOCK; - } - return (st != 0) ? EINTR : 0; + lck_mtx_unlock(bpf_mlock); + + st = tsleep((caddr_t)d, pri, wmesg, timo); + + lck_mtx_lock(bpf_mlock); + + return st; } -#else -#define BPF_SLEEP tsleep -#endif /* * Rotate the packet buffers in descriptor d. Move the store buffer @@ -670,25 +627,26 @@ bpf_sleep(d) * bpfread - read next chunk of packets from buffers */ int -bpfread(dev, uio, ioflag) - dev_t dev; - struct uio *uio; - int ioflag; +bpfread(dev_t dev, struct uio *uio, int ioflag) { register struct bpf_d *d; int error; int s; d = bpf_dtab[minor(dev)]; + if (d == 0 || d == (void *)1) + return (ENXIO); + + lck_mtx_lock(bpf_mlock); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); /* * Restrict application to use a buffer the same size as * as kernel buffers. 
*/ + // LP64todo - fix this if (uio->uio_resid != d->bd_bufsize) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + lck_mtx_unlock(bpf_mlock); return (EINVAL); } @@ -717,18 +675,18 @@ bpfread(dev, uio, ioflag) */ if (d->bd_bif == NULL) { splx(s); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + lck_mtx_unlock(bpf_mlock); return (ENXIO); } if (ioflag & IO_NDELAY) error = EWOULDBLOCK; else - error = BPF_SLEEP((caddr_t)d, PRINET|PCATCH, "bpf", + error = BPF_SLEEP(d, PRINET|PCATCH, "bpf", d->bd_rtout); if (error == EINTR || error == ERESTART) { splx(s); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + lck_mtx_unlock(bpf_mlock); return (error); } if (error == EWOULDBLOCK) { @@ -747,7 +705,7 @@ bpfread(dev, uio, ioflag) if (d->bd_slen == 0) { splx(s); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + lck_mtx_unlock(bpf_mlock); return (0); } ROTATE_BUFFERS(d); @@ -771,7 +729,7 @@ bpfread(dev, uio, ioflag) d->bd_hbuf = 0; d->bd_hlen = 0; splx(s); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + lck_mtx_unlock(bpf_mlock); return (error); } @@ -779,9 +737,8 @@ bpfread(dev, uio, ioflag) /* * If there are processes sleeping on this descriptor, wake them up. */ -static inline void -bpf_wakeup(d) - register struct bpf_d *d; +static void +bpf_wakeup(struct bpf_d *d) { wakeup((caddr_t)d); if (d->bd_async && d->bd_sig && d->bd_sigio) @@ -806,57 +763,54 @@ bpf_wakeup(d) #define MAX_DATALINK_HDR_LEN (sizeof(struct firewire_header)) int -bpfwrite(dev, uio, ioflag) - dev_t dev; - struct uio *uio; - int ioflag; +bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag) { register struct bpf_d *d; struct ifnet *ifp; struct mbuf *m; - int error, s; + int error; char dst_buf[SOCKADDR_HDR_LEN + MAX_DATALINK_HDR_LEN]; int datlen; d = bpf_dtab[minor(dev)]; - - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + if (d == 0 || d == (void *)1) + return (ENXIO); + + lck_mtx_lock(bpf_mlock); if (d->bd_bif == 0) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + lck_mtx_unlock(bpf_mlock); return (ENXIO); } ifp = d->bd_bif->bif_ifp; if (uio->uio_resid == 0) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + lck_mtx_unlock(bpf_mlock); return (0); } ((struct sockaddr *)dst_buf)->sa_len = sizeof(dst_buf); error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, &m, - (struct sockaddr *)dst_buf, &datlen); + d->bd_hdrcmplt ? 0 : (struct sockaddr *)dst_buf, &datlen); if (error) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + lck_mtx_unlock(bpf_mlock); return (error); } - if (datlen > ifp->if_mtu) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + if ((unsigned)datlen > ifp->if_mtu) { + lck_mtx_unlock(bpf_mlock); return (EMSGSIZE); } + lck_mtx_unlock(bpf_mlock); + if (d->bd_hdrcmplt) { - ((struct sockaddr *)dst_buf)->sa_family = pseudo_AF_HDRCMPLT; + error = dlil_output(ifp, 0, m, NULL, NULL, 1); } - - s = splnet(); - - error = dlil_output(ifptodlt(ifp, PF_INET), m, - (caddr_t) 0, (struct sockaddr *)dst_buf, 0); - - splx(s); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + else { + error = dlil_output(ifp, PF_INET, m, NULL, (struct sockaddr *)dst_buf, 0); + } + /* * The driver frees the mbuf. */ @@ -868,8 +822,7 @@ bpfwrite(dev, uio, ioflag) * receive and drop counts. Should be called at splimp. */ static void -reset_d(d) - struct bpf_d *d; +reset_d(struct bpf_d *d) { if (d->bd_hbuf) { /* Free the hold buffer. 
*/ @@ -904,19 +857,16 @@ reset_d(d) */ /* ARGSUSED */ int -bpfioctl(dev, cmd, addr, flags, p) - dev_t dev; - u_long cmd; - caddr_t addr; - int flags; - struct proc *p; +bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, struct proc *p) { register struct bpf_d *d; int s, error = 0; d = bpf_dtab[minor(dev)]; + if (d == 0 || d == (void *)1) + return (ENXIO); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + lck_mtx_lock(bpf_mlock); switch (cmd) { @@ -949,7 +899,7 @@ bpfioctl(dev, cmd, addr, flags, p) error = EINVAL; else { ifp = d->bd_bif->bif_ifp; - error = (*ifp->if_ioctl)(ifp, cmd, addr); + error = dlil_ioctl(0, ifp, cmd, addr); } break; } @@ -986,7 +936,18 @@ bpfioctl(dev, cmd, addr, flags, p) * Set link layer read filter. */ case BIOCSETF: - error = bpf_setf(d, (struct bpf_program *)addr); + if (proc_is64bit(p)) { + error = bpf_setf(d, (struct user_bpf_program *)addr); + } + else { + struct bpf_program * tmpp; + struct user_bpf_program tmp; + + tmpp = (struct bpf_program *)addr; + tmp.bf_len = tmpp->bf_len; + tmp.bf_insns = CAST_USER_ADDR_T(tmpp->bf_insns); + error = bpf_setf(d, &tmp); + } break; /* @@ -1011,7 +972,7 @@ bpfioctl(dev, cmd, addr, flags, p) } s = splimp(); if (d->bd_promisc == 0) { - error = ifpromisc(d->bd_bif->bif_ifp, 1); + error = ifnet_set_promiscuous(d->bd_bif->bif_ifp, 1); if (error == 0) d->bd_promisc = 1; } @@ -1175,7 +1136,9 @@ bpfioctl(dev, cmd, addr, flags, p) *(u_int *)addr = d->bd_sig; break; } - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + + lck_mtx_unlock(bpf_mlock); + return (error); } @@ -1184,16 +1147,14 @@ bpfioctl(dev, cmd, addr, flags, p) * free it and replace it. Returns EINVAL for bogus requests. */ static int -bpf_setf(d, fp) - struct bpf_d *d; - struct bpf_program *fp; +bpf_setf(struct bpf_d *d, struct user_bpf_program *fp) { struct bpf_insn *fcode, *old; u_int flen, size; int s; old = d->bd_filter; - if (fp->bf_insns == 0) { + if (fp->bf_insns == USER_ADDR_NULL) { if (fp->bf_len != 0) return (EINVAL); s = splimp(); @@ -1208,13 +1169,13 @@ bpf_setf(d, fp) if (flen > BPF_MAXINSNS) return (EINVAL); - size = flen * sizeof(*fp->bf_insns); + size = flen * sizeof(struct bpf_insn); fcode = (struct bpf_insn *) _MALLOC(size, M_DEVBUF, M_WAIT); #ifdef __APPLE__ if (fcode == NULL) return (ENOBUFS); #endif - if (copyin((caddr_t)fp->bf_insns, (caddr_t)fcode, size) == 0 && + if (copyin(fp->bf_insns, (caddr_t)fcode, size) == 0 && bpf_validate(fcode, (int)flen)) { s = splimp(); d->bd_filter = fcode; @@ -1235,9 +1196,7 @@ bpf_setf(d, fp) * Return an errno or 0. */ static int -bpf_setif(d, ifr) - struct bpf_d *d; - struct ifreq *ifr; +bpf_setif(struct bpf_d *d, struct ifreq *ifr) { struct bpf_if *bp; int s, error; @@ -1295,24 +1254,23 @@ bpf_setif(d, ifr) * Otherwise, return false but make a note that a selwakeup() must be done. */ int -bpfpoll(dev, events, wql, p) - register dev_t dev; - int events; - void * wql; - struct proc *p; +bpfpoll(dev_t dev, int events, void * wql, struct proc *p) { register struct bpf_d *d; register int s; int revents = 0; d = bpf_dtab[minor(dev)]; + if (d == 0 || d == (void *)1) + return (ENXIO); + + lck_mtx_lock(bpf_mlock); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); /* * An imitation of the FIONREAD ioctl code. 
*/ if (d->bd_bif == NULL) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + lck_mtx_unlock(bpf_mlock); return (ENXIO); } @@ -1324,7 +1282,8 @@ bpfpoll(dev, events, wql, p) selrecord(p, &d->bd_sel, wql); } splx(s); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + + lck_mtx_unlock(bpf_mlock); return (revents); } @@ -1335,10 +1294,7 @@ bpfpoll(dev, events, wql, p) * buffer. */ void -bpf_tap(ifp, pkt, pktlen) - struct ifnet *ifp; - register u_char *pkt; - register u_int pktlen; +bpf_tap(struct ifnet *ifp, u_char *pkt, u_int pktlen) { struct bpf_if *bp; register struct bpf_d *d; @@ -1348,20 +1304,21 @@ bpf_tap(ifp, pkt, pktlen) * The only problem that could arise here is that if two different * interfaces shared any data. This is not the case. */ - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + lck_mtx_lock(bpf_mlock); + bp = ifp->if_bpf; #ifdef __APPLE__ if (bp) { #endif - for (d = bp->bif_dlist; d != 0; d = d->bd_next) { - ++d->bd_rcount; - slen = bpf_filter(d->bd_filter, pkt, pktlen, pktlen); - if (slen != 0) - catchpacket(d, pkt, pktlen, slen, bcopy); - } + for (d = bp->bif_dlist; d != 0; d = d->bd_next) { + ++d->bd_rcount; + slen = bpf_filter(d->bd_filter, pkt, pktlen, pktlen); + if (slen != 0) + catchpacket(d, pkt, pktlen, slen, bcopy); + } #ifdef __APPLE__ } - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + lck_mtx_unlock(bpf_mlock); #endif } @@ -1370,13 +1327,10 @@ bpf_tap(ifp, pkt, pktlen) * from m_copydata in sys/uipc_mbuf.c. */ static void -bpf_mcopy(src_arg, dst_arg, len) - const void *src_arg; - void *dst_arg; - register size_t len; +bpf_mcopy(const void *src_arg, void *dst_arg, size_t len) { - register const struct mbuf *m; - register u_int count; + const struct mbuf *m; + u_int count; u_char *dst; m = src_arg; @@ -1385,7 +1339,7 @@ bpf_mcopy(src_arg, dst_arg, len) if (m == 0) panic("bpf_mcopy"); count = min(m->m_len, len); - bcopy(mtod((struct mbuf *)m, void *), dst, count); + bcopy(mtod(m, const void *), dst, count); m = m->m_next; dst += count; len -= count; @@ -1396,27 +1350,32 @@ bpf_mcopy(src_arg, dst_arg, len) * Incoming linkage from device drivers, when packet is in an mbuf chain. */ void -bpf_mtap(ifp, m) - struct ifnet *ifp; - struct mbuf *m; +bpf_mtap(struct ifnet *ifp, struct mbuf *m) { - struct bpf_if *bp = ifp->if_bpf; + struct bpf_if *bp; struct bpf_d *d; u_int pktlen, slen; struct mbuf *m0; + lck_mtx_lock(bpf_mlock); + + bp = ifp->if_bpf; + if (bp) { pktlen = 0; for (m0 = m; m0 != 0; m0 = m0->m_next) pktlen += m0->m_len; - - for (d = bp->bif_dlist; d != 0; d = d->bd_next) { - if (!d->bd_seesent && (m->m_pkthdr.rcvif == NULL)) - continue; - ++d->bd_rcount; - slen = bpf_filter(d->bd_filter, (u_char *)m, pktlen, 0); - if (slen != 0) - catchpacket(d, (u_char *)m, pktlen, slen, bpf_mcopy); + + for (d = bp->bif_dlist; d != 0; d = d->bd_next) { + if (!d->bd_seesent && (m->m_pkthdr.rcvif == NULL)) + continue; + ++d->bd_rcount; + slen = bpf_filter(d->bd_filter, (u_char *)m, pktlen, 0); + if (slen != 0) + catchpacket(d, (u_char *)m, pktlen, slen, bpf_mcopy); + } } + + lck_mtx_unlock(bpf_mlock); } /* @@ -1428,11 +1387,8 @@ bpf_mtap(ifp, m) * pkt is really an mbuf. 
*/ static void -catchpacket(d, pkt, pktlen, snaplen, cpfn) - register struct bpf_d *d; - register u_char *pkt; - register u_int pktlen, snaplen; - register void (*cpfn) __P((const void *, void *, size_t)); +catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, + void (*cpfn)(const void *, void *, size_t)) { register struct bpf_hdr *hp; register int totlen, curlen; @@ -1500,8 +1456,7 @@ catchpacket(d, pkt, pktlen, snaplen, cpfn) * Initialize all nonzero fields of a descriptor. */ static int -bpf_allocbufs(d) - register struct bpf_d *d; +bpf_allocbufs(struct bpf_d *d) { d->bd_fbuf = (caddr_t) _MALLOC(d->bd_bufsize, M_DEVBUF, M_WAIT); if (d->bd_fbuf == 0) @@ -1522,8 +1477,7 @@ bpf_allocbufs(d) * Called on close. */ static void -bpf_freed(d) - register struct bpf_d *d; +bpf_freed(struct bpf_d *d) { /* * We don't need to lock out interrupts since this descriptor has @@ -1539,8 +1493,6 @@ bpf_freed(d) } if (d->bd_filter) FREE((caddr_t)d->bd_filter, M_DEVBUF); - - D_MARKFREE(d); } /* @@ -1549,16 +1501,15 @@ bpf_freed(d) * size of the link header (variable length headers not yet supported). */ void -bpfattach(ifp, dlt, hdrlen) - struct ifnet *ifp; - u_int dlt, hdrlen; +bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { struct bpf_if *bp; - int i; bp = (struct bpf_if *) _MALLOC(sizeof(*bp), M_DEVBUF, M_WAIT); if (bp == 0) panic("bpfattach"); + lck_mtx_lock(bpf_mlock); + bp->bif_dlist = 0; bp->bif_ifp = ifp; bp->bif_dlt = dlt; @@ -1575,6 +1526,11 @@ bpfattach(ifp, dlt, hdrlen) * performance reasons and to alleviate alignment restrictions). */ bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen; + + /* Take a reference on the interface */ + ifp_reference(ifp); + + lck_mtx_unlock(bpf_mlock); #ifndef __APPLE__ if (bootverbose) @@ -1589,14 +1545,15 @@ bpfattach(ifp, dlt, hdrlen) * ENXIO. 
*/ void -bpfdetach(ifp) - struct ifnet *ifp; +bpfdetach(struct ifnet *ifp) { struct bpf_if *bp, *bp_prev; struct bpf_d *d; int s; s = splimp(); + + lck_mtx_lock(bpf_mlock); /* Locate BPF interface information */ bp_prev = NULL; @@ -1633,6 +1590,10 @@ bpfdetach(ifp) } else { bpf_iflist = bp->bif_next; } + + ifp_release(ifp); + + lck_mtx_unlock(bpf_mlock); FREE(bp, M_DEVBUF); @@ -1640,25 +1601,51 @@ bpfdetach(ifp) } void -bpf_init(unused) - void *unused; +bpf_init(__unused void *unused) { #ifdef __APPLE__ int i; int maj; - if (!bpf_devsw_installed ) { + if (bpf_devsw_installed == 0) { bpf_devsw_installed = 1; + + bpf_mlock_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setdefault(bpf_mlock_grp_attr); + + bpf_mlock_grp = lck_grp_alloc_init("bpf", bpf_mlock_grp_attr); + + bpf_mlock_attr = lck_attr_alloc_init(); + lck_attr_setdefault(bpf_mlock_attr); + + bpf_mlock = lck_mtx_alloc_init(bpf_mlock_grp, bpf_mlock_attr); + + if (bpf_mlock == 0) { + printf("bpf_init: failed to allocate bpf_mlock\n"); + bpf_devsw_installed = 0; + return; + } + maj = cdevsw_add(CDEV_MAJOR, &bpf_cdevsw); if (maj == -1) { + if (bpf_mlock) + lck_mtx_free(bpf_mlock, bpf_mlock_grp); + if (bpf_mlock_attr) + lck_attr_free(bpf_mlock_attr); + if (bpf_mlock_grp) + lck_grp_free(bpf_mlock_grp); + if (bpf_mlock_grp_attr) + lck_grp_attr_free(bpf_mlock_grp_attr); + + bpf_mlock = 0; + bpf_mlock_attr = 0; + bpf_mlock_grp = 0; + bpf_mlock_grp_attr = 0; + bpf_devsw_installed = 0; printf("bpf_init: failed to allocate a major number!\n"); - nbpfilter = 0; - return; - } - if (bpf_dtab_grow(NBPFILTER) == 0) { - printf("bpf_init: failed to allocate bpf_dtab\n"); return; } + for (i = 0 ; i < NBPFILTER; i++) bpf_make_dev_t(maj); } diff --git a/bsd/net/bpf.h b/bsd/net/bpf.h index 89e9163c6..b6b0a3995 100644 --- a/bsd/net/bpf.h +++ b/bsd/net/bpf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -67,6 +67,11 @@ #include <sys/appleapiopts.h> #include <sys/types.h> #include <sys/time.h> +#include <sys/cdefs.h> + +#ifdef KERNEL +#include <sys/kernel_types.h> +#endif /* BSD style release date */ #define BPF_RELEASE 199606 @@ -82,7 +87,7 @@ typedef u_int32_t bpf_u_int32; #define BPF_WORDALIGN(x) (((x)+(BPF_ALIGNMENT-1))&~(BPF_ALIGNMENT-1)) #define BPF_MAXINSNS 512 -#define BPF_MAXBUFSIZE 0x8000 +#define BPF_MAXBUFSIZE 0x80000 #define BPF_MINBUFSIZE 32 /* @@ -93,6 +98,26 @@ struct bpf_program { struct bpf_insn *bf_insns; }; +#ifdef KERNEL +/* LP64 version of bpf_program. all pointers + * grow when we're dealing with a 64-bit process. + * WARNING - keep in sync with bpf_program + */ +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_bpf_program { + u_int bf_len; + user_addr_t bf_insns; +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +#endif // KERNEL + /* * Struct returned by BIOCGSTATS. 
*/ @@ -332,26 +357,40 @@ struct bpf_insn { #define BPF_STMT(code, k) { (u_short)(code), 0, 0, k } #define BPF_JUMP(code, k, jt, jf) { (u_short)(code), jt, jf, k } +#ifdef KERNEL_PRIVATE /* Forward declerations */ struct ifnet; struct mbuf; -#ifdef KERNEL -#ifdef __APPLE_API_UNSTABLE -int bpf_validate __P((const struct bpf_insn *, int)); -void bpf_tap __P((struct ifnet *, u_char *, u_int)); -void bpf_mtap __P((struct ifnet *, struct mbuf *)); -void bpfattach __P((struct ifnet *, u_int, u_int)); -void bpfdetach __P((struct ifnet *)); +int bpf_validate(const struct bpf_insn *, int); +void bpf_tap(struct ifnet *, u_char *, u_int); +void bpf_mtap(struct ifnet *, struct mbuf *); + +void bpfdetach(struct ifnet *); -void bpfilterattach __P((int)); -u_int bpf_filter __P((const struct bpf_insn *, u_char *, u_int, u_int)); +void bpfilterattach(int); +u_int bpf_filter(const struct bpf_insn *, u_char *, u_int, u_int); #ifdef __APPLE__ #define BPF_TAP(x, y, z) bpf_tap(x,y,z) #define BPF_MTAP(x, y) bpf_mtap(x, y) #endif /* __APPLE__ */ -#endif /* __APPLE_API_UNSTABLE */ + +#endif /* KERNEL_PRIVATE */ + +#ifdef KERNEL +/*! + @function bpfattach + @discussion Registers an interface with BPF. This allows bpf devices + to attach to your interface to capture packets. Your interface + will be unregistered automatically when your interface is + detached. + @param interface The interface to register with BPF. + @param data_link_type The data link type of the interface. See the + DLT_* defines in bpf.h. + @param header_length The length, in bytes, of the data link header. + */ +void bpfattach(ifnet_t interface, u_int data_link_type, u_int header_length); #endif /* KERNEL */ /* diff --git a/bsd/net/bpf_filter.c b/bsd/net/bpf_filter.c index 102e3c492..d697c1363 100644 --- a/bsd/net/bpf_filter.c +++ b/bsd/net/bpf_filter.c @@ -93,7 +93,7 @@ #ifdef KERNEL #define MINDEX(m, k) \ { \ - register int len = m->m_len; \ + register unsigned int len = m->m_len; \ \ while (k >= len) { \ k -= len; \ @@ -104,14 +104,11 @@ } \ } -static u_int16_t m_xhalf __P((struct mbuf *m, bpf_u_int32 k, int *err)); -static u_int32_t m_xword __P((struct mbuf *m, bpf_u_int32 k, int *err)); +static u_int16_t m_xhalf(struct mbuf *m, bpf_u_int32 k, int *err); +static u_int32_t m_xword(struct mbuf *m, bpf_u_int32 k, int *err); static u_int32_t -m_xword(m, k, err) - register struct mbuf *m; - register bpf_u_int32 k; - register int *err; +m_xword(struct mbuf *m, bpf_u_int32 k, int *err) { register size_t len; register u_char *cp, *np; @@ -164,10 +161,7 @@ m_xword(m, k, err) } static u_int16_t -m_xhalf(m, k, err) - register struct mbuf *m; - register bpf_u_int32 k; - register int *err; +m_xhalf(struct mbuf *m, bpf_u_int32 k, int *err) { register size_t len; register u_char *cp; @@ -203,11 +197,7 @@ m_xhalf(m, k, err) * buflen is the amount of data present */ u_int -bpf_filter(pc, p, wirelen, buflen) - register const struct bpf_insn *pc; - register u_char *p; - u_int wirelen; - register u_int buflen; +bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) { register u_int32_t A = 0, X = 0; register bpf_u_int32 k; @@ -540,9 +530,7 @@ bpf_filter(pc, p, wirelen, buflen) * Otherwise, a bogus program could easily crash the system. 
*/ int -bpf_validate(f, len) - const struct bpf_insn *f; - int len; +bpf_validate(const struct bpf_insn *f, int len) { register int i; const struct bpf_insn *p; @@ -557,7 +545,7 @@ bpf_validate(f, len) register int from = i + 1; if (BPF_OP(p->code) == BPF_JA) { - if (from >= len || p->k >= len - from) + if (from >= len || p->k >= (bpf_u_int32)(len - from)) return 0; } else if (from >= len || p->jt >= len - from || diff --git a/bsd/net/bpfdesc.h b/bsd/net/bpfdesc.h index b8f80bb63..1958e635b 100644 --- a/bsd/net/bpfdesc.h +++ b/bsd/net/bpfdesc.h @@ -64,7 +64,7 @@ #ifndef _NET_BPFDESC_H_ #define _NET_BPFDESC_H_ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE /* * The items in this header file should be wrapped in #ifdef KERNEL. */ @@ -134,5 +134,5 @@ struct bpf_if { struct ifnet *bif_ifp; /* corresponding interface */ }; -#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL_PRIVATE */ #endif diff --git a/bsd/net/bridge.c b/bsd/net/bridge.c index 2f364dee5..2770e4a76 100644 --- a/bsd/net/bridge.c +++ b/bsd/net/bridge.c @@ -166,22 +166,24 @@ static void bdg_promisc_off(int clear_used) { struct ifnet *ifp ; - TAILQ_FOREACH(ifp, &ifnet, if_link) { - if ( (ifp2sc[ifp->if_index].flags & IFF_BDG_PROMISC) ) { - int s, ret ; - s = splimp(); - ret = ifpromisc(ifp, 0); - splx(s); - ifp2sc[ifp->if_index].flags &= ~(IFF_BDG_PROMISC|IFF_MUTE) ; - DEB(printf(">> now %s%d promisc OFF if_flags 0x%x bdg_flags 0x%x\n", - ifp->if_name, ifp->if_unit, - ifp->if_flags, ifp2sc[ifp->if_index].flags);) - } - if (clear_used) { - ifp2sc[ifp->if_index].flags &= ~(IFF_USED) ; - bdg_stats.s[ifp->if_index].name[0] = '\0'; - } + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if ( (ifp2sc[ifp->if_index].flags & IFF_BDG_PROMISC) ) { + int s, ret ; + s = splimp(); + ret = ifnet_set_promiscuous(ifp, 0); + splx(s); + ifp2sc[ifp->if_index].flags &= ~(IFF_BDG_PROMISC|IFF_MUTE) ; + DEB(printf(">> now %s%d promisc OFF if_flags 0x%x bdg_flags 0x%x\n", + ifp->if_name, ifp->if_unit, + ifp->if_flags, ifp2sc[ifp->if_index].flags);) + } + if (clear_used) { + ifp2sc[ifp->if_index].flags &= ~(IFF_USED) ; + bdg_stats.s[ifp->if_index].name[0] = '\0'; + } } + ifnet_head_done(); } /* @@ -193,29 +195,31 @@ bdg_promisc_on() struct ifnet *ifp ; int s ; - TAILQ_FOREACH(ifp, &ifnet, if_link) { - if ( !BDG_USED(ifp) ) - continue ; - if ( 0 == ( ifp->if_flags & IFF_UP) ) { - s = splimp(); - if_up(ifp); - splx(s); - } - if ( !(ifp2sc[ifp->if_index].flags & IFF_BDG_PROMISC) ) { - int ret ; - s = splimp(); - ret = ifpromisc(ifp, 1); - splx(s); - ifp2sc[ifp->if_index].flags |= IFF_BDG_PROMISC ; - printf(">> now %s%d promisc ON if_flags 0x%x bdg_flags 0x%x\n", - ifp->if_name, ifp->if_unit, - ifp->if_flags, ifp2sc[ifp->if_index].flags); - } - if (BDG_MUTED(ifp)) { - printf(">> unmuting %s%d\n", ifp->if_name, ifp->if_unit); - BDG_UNMUTE(ifp) ; - } + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if ( !BDG_USED(ifp) ) + continue ; + if ( 0 == ( ifp->if_flags & IFF_UP) ) { + s = splimp(); + if_up(ifp); + splx(s); + } + if ( !(ifp2sc[ifp->if_index].flags & IFF_BDG_PROMISC) ) { + int ret ; + s = splimp(); + ret = ifnet_set_promiscuous(ifp, 1); + splx(s); + ifp2sc[ifp->if_index].flags |= IFF_BDG_PROMISC ; + printf(">> now %s%d promisc ON if_flags 0x%x bdg_flags 0x%x\n", + ifp->if_name, ifp->if_unit, + ifp->if_flags, ifp2sc[ifp->if_index].flags); + } + if (BDG_MUTED(ifp)) { + printf(">> unmuting %s%d\n", ifp->if_name, ifp->if_unit); + BDG_UNMUTE(ifp) ; + } } + ifnet_head_done(); } 
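The two bridge.c hunks above replace direct walks of the old ifnet list with walks of ifnet_head bracketed by ifnet_head_lock_shared()/ifnet_head_done(). A minimal sketch of that traversal discipline, using only the lock calls and list names visible in this patch; the function itself is hypothetical:

/*
 * Sketch only: the locked interface-list walk the bridge hunks above
 * adopt.  example_count_up_interfaces() is hypothetical; the lock
 * calls and the TAILQ walk are the ones used in this patch.
 */
static int
example_count_up_interfaces(void)
{
	struct ifnet *ifp;
	int count = 0;

	ifnet_head_lock_shared();	/* shared (read) lock on the list head */
	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
		if (ifp->if_flags & IFF_UP)
			count++;	/* cheap inspection only; do not sleep here */
	}
	ifnet_head_done();		/* release the shared lock */
	return count;
}

The point of the discipline is that the list can no longer be assumed stable between scheduling points, so every reader takes the shared lock for the whole walk and drops it before doing anything that might block.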
static int @@ -394,17 +398,6 @@ flush_table() splx(s); } -/* wrapper for funnel */ -void -bdg_timeout_funneled(void * dummy) -{ - boolean_t funnel_state; - - funnel_state = thread_funnel_set(network_flock, TRUE); - bdg_timeout(dummy); - funnel_state = thread_funnel_set(network_flock, FALSE); -} - /* * called periodically to flush entries etc. */ @@ -438,7 +431,7 @@ bdg_timeout(void *dummy) bdg_loops = 0 ; } } - timeout(bdg_timeout_funneled, (void *)0, 2*hz ); + timeout(bdg_timeout, (void *)0, 2*hz ); } /* @@ -477,7 +470,6 @@ bdgtakeifaces(void) { int i ; struct ifnet *ifp; - struct arpcom *ac ; bdg_addr *p = bdg_addresses ; struct bdg_softc *bp; @@ -485,32 +477,32 @@ bdgtakeifaces(void) *bridge_cfg = '\0'; printf("BRIDGE 010131, have %d interfaces\n", if_index); + ifnet_head_lock_shared(); for (i = 0 , ifp = ifnet.tqh_first ; i < if_index ; i++, ifp = TAILQ_NEXT(ifp, if_link) ) - if (ifp->if_type == IFT_ETHER) { /* ethernet ? */ - bp = &ifp2sc[ifp->if_index] ; - ac = (struct arpcom *)ifp; - sprintf(bridge_cfg + strlen(bridge_cfg), - "%s%d:1,", ifp->if_name, ifp->if_unit); - printf("-- index %d %s type %d phy %d addrl %d addr %6D\n", - ifp->if_index, - bdg_stats.s[ifp->if_index].name, - (int)ifp->if_type, (int) ifp->if_physical, - (int)ifp->if_addrlen, - ac->ac_enaddr, "." ); - bcopy(ac->ac_enaddr, p->etheraddr, 6); - p++ ; - bp->ifp = ifp ; - bp->flags = IFF_USED ; - bp->cluster_id = htons(1) ; - bp->magic = 0xDEADBEEF ; - - sprintf(bdg_stats.s[ifp->if_index].name, - "%s%d:%d", ifp->if_name, ifp->if_unit, - ntohs(bp->cluster_id)); - bdg_ports ++ ; - } - + if (ifp->if_type == IFT_ETHER) { /* ethernet ? */ + ifnet_lladdr_copy_bytes(ifp, p->etheraddr, ETHER_ADDR_LEN); + bp = &ifp2sc[ifp->if_index] ; + sprintf(bridge_cfg + strlen(bridge_cfg), + "%s%d:1,", ifp->if_name, ifp->if_unit); + printf("-- index %d %s type %d phy %d addrl %d addr %6D\n", + ifp->if_index, + bdg_stats.s[ifp->if_index].name, + (int)ifp->if_type, (int) ifp->if_physical, + (int)ifp->if_addrlen, + p->etheraddr, "." 
); + p++ ; + bp->ifp = ifp ; + bp->flags = IFF_USED ; + bp->cluster_id = htons(1) ; + bp->magic = 0xDEADBEEF ; + + sprintf(bdg_stats.s[ifp->if_index].name, + "%s%d:%d", ifp->if_name, ifp->if_unit, + ntohs(bp->cluster_id)); + bdg_ports ++ ; + } + ifnet_head_done(); } /* @@ -666,27 +658,27 @@ bdg_forward(struct mbuf *m0, struct ether_header *const eh, struct ifnet *dst) bdg_thru++; /* only count once */ if (src == NULL) /* packet from ether_output */ - dst = bridge_dst_lookup(eh); + dst = bridge_dst_lookup(eh); if (dst == BDG_DROP) { /* this should not happen */ - printf("xx bdg_forward for BDG_DROP\n"); - m_freem(m0); - return NULL; + printf("xx bdg_forward for BDG_DROP\n"); + m_freem(m0); + return NULL; } if (dst == BDG_LOCAL) { /* this should not happen as well */ - printf("xx ouch, bdg_forward for local pkt\n"); - return m0; + printf("xx ouch, bdg_forward for local pkt\n"); + return m0; } if (dst == BDG_BCAST || dst == BDG_MCAST || dst == BDG_UNKNOWN) { - ifp = ifnet.tqh_first ; /* scan all ports */ - once = 0 ; - if (dst != BDG_UNKNOWN) /* need a copy for the local stack */ - shared = 1 ; + ifp = ifnet_head.tqh_first ; /* scan all ports */ + once = 0 ; + if (dst != BDG_UNKNOWN) /* need a copy for the local stack */ + shared = 1 ; } else { - ifp = dst ; - once = 1 ; + ifp = dst ; + once = 1 ; } if ( (u_int)(ifp) <= (u_int)BDG_FORWARD ) - panic("bdg_forward: bad dst"); + panic("bdg_forward: bad dst"); #ifdef IPFIREWALL /* diff --git a/bsd/net/bridge.h b/bsd/net/bridge.h index a9c6b277d..3f49914b4 100644 --- a/bsd/net/bridge.h +++ b/bsd/net/bridge.h @@ -87,7 +87,6 @@ extern struct bdg_softc *ifp2sc; #define BDG_MUTE(ifp) ifp2sc[ifp->if_index].flags |= IFF_MUTE #define BDG_UNMUTE(ifp) ifp2sc[ifp->if_index].flags &= ~IFF_MUTE #define BDG_CLUSTER(ifp) (ifp2sc[ifp->if_index].cluster_id) -#define BDG_EH(ifp) ((struct arpcom *)ifp)->ac_enaddr #define BDG_SAMECLUSTER(ifp,src) \ (src == NULL || BDG_CLUSTER(ifp) == BDG_CLUSTER(src) ) diff --git a/bsd/net/bsd_comp.c b/bsd/net/bsd_comp.c index d81e66f93..1bfc725ea 100644 --- a/bsd/net/bsd_comp.c +++ b/bsd/net/bsd_comp.c @@ -146,26 +146,27 @@ struct bsd_db { #define BSD_OVHD 2 /* BSD compress overhead/packet */ #define BSD_INIT_BITS BSD_MIN_BITS -static void bsd_clear __P((struct bsd_db *db)); -static int bsd_check __P((struct bsd_db *db)); -static void *bsd_alloc __P((u_char *options, int opt_len, int decomp)); -static int bsd_init_comp_db __P((struct bsd_db *db, u_char *options, int opt_len, +static void bsd_clear(struct bsd_db *db); +static int bsd_check(struct bsd_db *db); +static void *bsd_alloc(u_char *options, int opt_len, int decomp); +static int bsd_init_comp_db(struct bsd_db *db, u_char *options, + int opt_len, int unit, int hdrlen, int mru, int debug, - int decomp)); -static void *bsd_comp_alloc __P((u_char *options, int opt_len)); -static void *bsd_decomp_alloc __P((u_char *options, int opt_len)); -static void bsd_free __P((void *state)); -static int bsd_comp_init __P((void *state, u_char *options, int opt_len, - int unit, int hdrlen, int debug)); -static int bsd_decomp_init __P((void *state, u_char *options, int opt_len, - int unit, int hdrlen, int mru, int debug)); -static int bsd_compress __P((void *state, struct mbuf **mret, - struct mbuf *mp, int slen, int maxolen)); -static void bsd_incomp __P((void *state, struct mbuf *dmsg)); -static int bsd_decompress __P((void *state, struct mbuf *cmp, - struct mbuf **dmpp)); -static void bsd_reset __P((void *state)); -static void bsd_comp_stats __P((void *state, struct compstat *stats)); + int 
decomp);
+static void *bsd_comp_alloc(u_char *options, int opt_len);
+static void *bsd_decomp_alloc(u_char *options, int opt_len);
+static void bsd_free(void *state);
+static int bsd_comp_init(void *state, u_char *options, int opt_len,
+			int unit, int hdrlen, int debug);
+static int bsd_decomp_init(void *state, u_char *options, int opt_len,
+			int unit, int hdrlen, int mru, int debug);
+static int bsd_compress(void *state, struct mbuf **mret,
+			struct mbuf *mp, int slen, int maxolen);
+static void bsd_incomp(void *state, struct mbuf *dmsg);
+static int bsd_decompress(void *state, struct mbuf *cmp,
+			struct mbuf **dmpp);
+static void bsd_reset(void *state);
+static void bsd_comp_stats(void *state, struct compstat *stats);

 /*
  * Procedures exported to if_ppp.c.
diff --git a/bsd/net/devtimer.c b/bsd/net/devtimer.c
new file mode 100644
index 000000000..4344f3a9f
--- /dev/null
+++ b/bsd/net/devtimer.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2004 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License").  You may not use this file except in compliance with the
+ * License.  Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+/*
+ * devtimer.c
+ * - timer source based on <kern/thread_call.h>
+ */
+
+/*
+ * Modification History:
+ *
+ * June 22, 2004	Dieter Siegmund (dieter@apple.com)
+ * - created
+ */
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <kern/thread_call.h>
+#include <net/devtimer.h>
+#include <libkern/OSAtomic.h>
+
+#ifdef DEVTIMER_DEBUG
+#define _devtimer_printf	printf
+#else /* DEVTIMER_DEBUG */
+static __inline__ void
+_devtimer_printf(__unused const char * fmt, ...)
+{
+}
+#endif /* DEVTIMER_DEBUG */
+
+struct devtimer_s {
+    void *			dt_callout;
+    devtimer_timeout_func	dt_timeout_func;
+    devtimer_process_func	dt_process_func;
+    void *			dt_arg0;
+    void *			dt_arg1;
+    void *			dt_arg2;
+    int				dt_generation;
+    UInt32			dt_retain_count;
+};
+
+#define M_DEVTIMER	M_DEVBUF
+
+static __inline__ void
+timeval_add(struct timeval tv1, struct timeval tv2,
+	    struct timeval * result)
+{
+    result->tv_sec = tv1.tv_sec + tv2.tv_sec;
+    result->tv_usec = tv1.tv_usec + tv2.tv_usec;
+    /* carry into seconds once the microseconds reach one full second */
+    if (result->tv_usec >= DEVTIMER_USECS_PER_SEC) {
+	result->tv_usec -= DEVTIMER_USECS_PER_SEC;
+	result->tv_sec++;
+    }
+    return;
+}
+
+static __inline__ uint64_t
+timeval_to_absolutetime(struct timeval tv)
+{
+    uint64_t	secs;
+    uint64_t	usecs;
+
+    clock_interval_to_absolutetime_interval(tv.tv_sec, NSEC_PER_SEC,
+					    &secs);
+    clock_interval_to_absolutetime_interval(tv.tv_usec, NSEC_PER_USEC,
+					    &usecs);
+    return (secs + usecs);
+}
+
+
+__private_extern__ int
+devtimer_valid(devtimer_ref timer)
+{
+    return (timer->dt_callout != NULL);
+}
+
+__private_extern__ void
+devtimer_retain(devtimer_ref timer)
+{
+    OSIncrementAtomic(&timer->dt_retain_count);
+    return;
+}
+
+__private_extern__ void
+devtimer_invalidate(devtimer_ref timer)
+{
+    devtimer_cancel(timer);
+    timer->dt_arg0 = NULL;
+    if (timer->dt_callout != NULL) {
+	thread_call_free(timer->dt_callout);
+	timer->dt_callout = NULL;
+    }
+    return;
+}
+
+__private_extern__ void
+devtimer_release(devtimer_ref timer)
+{
+    UInt32	old_retain_count;
+
+    old_retain_count = OSDecrementAtomic(&timer->dt_retain_count);
+    switch (old_retain_count) {
+    case 0:
+	panic("devtimer_release: retain count is 0\n");
+	break;
+    case 1:
+	devtimer_invalidate(timer);
+	FREE(timer, M_DEVTIMER);
+	_devtimer_printf("devtimer: timer released\n");
+	break;
+    default:
+	break;
+    }
+    return;
+}
+
+static void
+devtimer_process(void * param0, void * param1)
+{
+    int				generation = (int)param1;
+    devtimer_process_func	process_func;
+    devtimer_timeout_func	timeout_func;
+    devtimer_ref		timer = (devtimer_ref)param0;
+
+    process_func = timer->dt_process_func;
+    if (process_func != NULL) {
+	(*process_func)(timer, devtimer_process_func_event_lock);
+    }
+    timeout_func = timer->dt_timeout_func;
+    if (timeout_func != NULL) {
+	timer->dt_timeout_func = NULL;
+	if (timer->dt_generation == generation) {
+	    (*timeout_func)(timer->dt_arg0, timer->dt_arg1, timer->dt_arg2);
+	}
+    }
+    devtimer_release(timer);
+    if (process_func != NULL) {
+	(*process_func)(timer, devtimer_process_func_event_unlock);
+    }
+    return;
+}
+
+__private_extern__ void *
+devtimer_arg0(devtimer_ref timer)
+{
+    return (timer->dt_arg0);
+}
+
+__private_extern__ devtimer_ref
+devtimer_create(devtimer_process_func process_func, void * arg0)
+{
+    devtimer_ref	timer;
+
+    timer = _MALLOC(sizeof(*timer), M_DEVTIMER, M_WAITOK);
+    if (timer == NULL) {
+	return (timer);
+    }
+    bzero(timer, sizeof(*timer));
+    devtimer_retain(timer);
+    timer->dt_callout = thread_call_allocate(devtimer_process, timer);
+    if (timer->dt_callout == NULL) {
+	/* callout allocation failed: drop our reference and bail */
+	_devtimer_printf("devtimer: thread_call_allocate failed\n");
+	devtimer_release(timer);
+	return (NULL);
+    }
+    timer->dt_process_func = process_func;
+    timer->dt_arg0 = arg0;
+    return (timer);
+}
+
+__private_extern__ void
+devtimer_set_absolute(devtimer_ref timer,
+		      struct timeval abs_time,
+		      devtimer_timeout_func timeout_func,
+		      void * arg1, void * arg2)
+{
+    if (timer->dt_callout == NULL) {
+	printf("devtimer_set_absolute: uninitialized/freed timer\n");
+	return;
+    }
+    devtimer_cancel(timer);
+    if (timeout_func == NULL)
{ + return; + } + timer->dt_timeout_func = timeout_func; + timer->dt_arg1 = arg1; + timer->dt_arg2 = arg2; + _devtimer_printf("devtimer: wakeup time is (%d.%d)\n", + abs_time.tv_sec, abs_time.tv_usec); + timer->dt_generation++; + devtimer_retain(timer); + thread_call_enter1_delayed(timer->dt_callout, + (thread_call_param_t)timer->dt_generation, + timeval_to_absolutetime(abs_time)); + return; +} + +__private_extern__ void +devtimer_set_relative(devtimer_ref timer, + struct timeval rel_time, + devtimer_timeout_func timeout_func, + void * arg1, void * arg2) +{ + struct timeval abs_time; + struct timeval current_time; + + current_time = devtimer_current_time(); + timeval_add(current_time, rel_time, &abs_time); + devtimer_set_absolute(timer, abs_time, timeout_func, arg1, arg2); + return; +} + +__private_extern__ void +devtimer_cancel(devtimer_ref timer) +{ + if (timer->dt_timeout_func != NULL) { + timer->dt_timeout_func = NULL; + if (timer->dt_callout != NULL) { + _devtimer_printf("devtimer: cancelling timer source\n"); + if (thread_call_cancel(timer->dt_callout)) { + devtimer_release(timer); + } + else { + _devtimer_printf("devtimer: delayed release\n"); + } + } + } + return; +} + +__private_extern__ int +devtimer_enabled(devtimer_ref timer) +{ + return (timer->dt_timeout_func != NULL); +} + +__private_extern__ int32_t +devtimer_current_secs(void) +{ + struct timeval tv; + + tv = devtimer_current_time(); + return (tv.tv_sec); +} + +__private_extern__ struct timeval +devtimer_current_time(void) +{ + struct timeval tv; + uint32_t sec; + uint32_t usec; + + clock_get_system_microtime(&sec, &usec); + tv.tv_sec = sec; + tv.tv_usec = usec; + return (tv); +} diff --git a/bsd/net/devtimer.h b/bsd/net/devtimer.h new file mode 100644 index 000000000..9e8aeca91 --- /dev/null +++ b/bsd/net/devtimer.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +/* + * devtimer.h + * - timer source based on <kern/thread_call.h> + */ + + +#ifndef _NET_DEVTIMER_H +#define _NET_DEVTIMER_H + +#include <sys/types.h> +#include <sys/systm.h> + +#define DEVTIMER_USECS_PER_SEC (1000 * 1000) + +enum { + devtimer_process_func_event_lock, + devtimer_process_func_event_unlock, +}; +typedef int devtimer_process_func_event; + +typedef struct devtimer_s * devtimer_ref; +typedef void (*devtimer_process_func)(devtimer_ref timer, + devtimer_process_func_event event); +typedef void (*devtimer_timeout_func)(void * arg0, void * arg1, void * arg2); + +int +devtimer_valid(devtimer_ref timer); + +void +devtimer_retain(devtimer_ref timer); + +void * +devtimer_arg0(devtimer_ref timer); + +devtimer_ref +devtimer_create(devtimer_process_func process_func, void * arg0); + +void +devtimer_invalidate(devtimer_ref timer); + +void +devtimer_release(devtimer_ref timer); + +void +devtimer_set_absolute(devtimer_ref t, + struct timeval abs_time, + devtimer_timeout_func func, + void * arg1, void * arg2); + +void +devtimer_set_relative(devtimer_ref t, + struct timeval rel_time, + devtimer_timeout_func func, + void * arg1, void * arg2); +void +devtimer_cancel(devtimer_ref t); + +int +devtimer_enabled(devtimer_ref t); + +struct timeval +devtimer_current_time(void); + +int32_t +devtimer_current_secs(void); + +#endif _NET_DEVTIMER_H diff --git a/bsd/net/dlil.c b/bsd/net/dlil.c index 98973ea10..f69a1c9e0 100644 --- a/bsd/net/dlil.c +++ b/bsd/net/dlil.c @@ -26,28 +26,33 @@ * Author: Ted Walker */ - - #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/socket.h> +#include <sys/domain.h> +#include <sys/user.h> #include <net/if_dl.h> #include <net/if.h> +#include <net/route.h> #include <net/if_var.h> #include <net/dlil.h> +#include <net/if_arp.h> #include <sys/kern_event.h> #include <sys/kdebug.h> -#include <string.h> +#include <kern/assert.h> #include <kern/task.h> #include <kern/thread.h> #include <kern/sched_prim.h> +#include <kern/locks.h> -#include <net/netisr.h> #include <net/if_types.h> +#include <net/kpi_interfacefilter.h> + +#include <libkern/OSAtomic.h> #include <machine/machine_routines.h> @@ -67,49 +72,79 @@ #define PFILT(x) ((struct dlil_filterq_entry *) (x))->variants.pr_filter #define IFILT(x) ((struct dlil_filterq_entry *) (x))->variants.if_filter -struct dl_tag_str { - struct ifnet *ifp; - struct if_proto *proto; - struct dlil_filterq_head *pr_flt_head; +#if 0 +#define DLIL_PRINTF printf +#else +#define DLIL_PRINTF kprintf +#endif + +//#define DLIL_ALWAYS_DELAY_DETACH 1 + +enum { + kProtoKPI_DLIL = 0, + kProtoKPI_v1 = 1 +}; + +struct if_proto { + SLIST_ENTRY(if_proto) next_hash; + int refcount; + int detaching; + struct ifnet *ifp; + struct domain *dl_domain; + protocol_family_t protocol_family; + int proto_kpi; + union { + struct { + dl_input_func dl_input; + dl_pre_output_func dl_pre_output; + dl_event_func dl_event; + dl_offer_func dl_offer; + dl_ioctl_func dl_ioctl; + dl_detached_func dl_detached; + } dlil; + struct { + proto_media_input input; + proto_media_preout pre_output; + proto_media_event event; + proto_media_ioctl ioctl; + proto_media_detached detached; + proto_media_resolve_multi resolve_multi; + proto_media_send_arp send_arp; + } v1; + } kpi; }; +SLIST_HEAD(proto_hash_entry, if_proto); + struct dlil_ifnet { /* ifnet and drvr_ext are used by the stack and drivers drvr_ext extends the public ifnet and must follow dl_if */ struct ifnet dl_if; /* 
public ifnet */ - void *drvr_ext[4]; /* driver reserved (e.g arpcom extension for enet) */ /* dlil private fields */ TAILQ_ENTRY(dlil_ifnet) dl_if_link; /* dlil_ifnet are link together */ /* it is not the ifnet list */ void *if_uniqueid; /* unique id identifying the interface */ size_t if_uniqueid_len;/* length of the unique id */ - char if_namestorage[IFNAMSIZ]; /* interface name storage for detached interfaces */ -}; - -struct dlil_stats_str { - int inject_pr_in1; - int inject_pr_in2; - int inject_pr_out1; - int inject_pr_out2; - int inject_if_in1; - int inject_if_in2; - int inject_if_out1; - int inject_if_out2; + char if_namestorage[IFNAMSIZ]; /* interface name storage */ }; - -struct dlil_filter_id_str { - int type; - struct dlil_filterq_head *head; - struct dlil_filterq_entry *filter_ptr; - struct ifnet *ifp; - struct if_proto *proto; +struct ifnet_filter { + TAILQ_ENTRY(ifnet_filter) filt_next; + ifnet_t filt_ifp; + int filt_detaching; + + const char *filt_name; + void *filt_cookie; + protocol_family_t filt_protocol; + iff_input_func filt_input; + iff_output_func filt_output; + iff_event_func filt_event; + iff_ioctl_func filt_ioctl; + iff_detached_func filt_detached; }; - - struct if_family_str { TAILQ_ENTRY(if_family_str) if_fam_next; u_long if_family; @@ -121,49 +156,47 @@ struct if_family_str { int (*add_if)(struct ifnet *ifp); int (*del_if)(struct ifnet *ifp); int (*init_if)(struct ifnet *ifp); - int (*add_proto)(struct ddesc_head_str *demux_desc_head, - struct if_proto *proto, u_long dl_tag); - int (*del_proto)(struct if_proto *proto, u_long dl_tag); - int (*ifmod_ioctl)(struct ifnet *ifp, u_long command, caddr_t data); - int (*shutdown)(); + int (*add_proto)(struct ifnet *ifp, u_long protocol_family, struct ddesc_head_str *demux_desc_head); + ifnet_del_proto_func del_proto; + ifnet_ioctl_func ifmod_ioctl; + int (*shutdown)(void); }; - struct proto_family_str { TAILQ_ENTRY(proto_family_str) proto_fam_next; u_long proto_family; u_long if_family; + int usecnt; - int (*attach_proto)(struct ifnet *ifp, u_long *dl_tag); - int (*detach_proto)(struct ifnet *ifp, u_long dl_tag); + int (*attach_proto)(struct ifnet *ifp, u_long protocol_family); + int (*detach_proto)(struct ifnet *ifp, u_long protocol_family); }; +enum { + kIfNetUseCount_MayBeZero = 0, + kIfNetUseCount_MustNotBeZero = 1 +}; - -struct dlil_stats_str dlil_stats; - -static -struct dlil_filter_id_str *dlil_filters; - -static -struct dl_tag_str *dl_tag_array; - -static -TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head; - -static -TAILQ_HEAD(, if_family_str) if_family_head; - -static -TAILQ_HEAD(, proto_family_str) proto_family_head; - -static ifnet_inited = 0; -static u_long dl_tag_nb = 0; -static u_long dlil_filters_nb = 0; +static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head; +static TAILQ_HEAD(, if_family_str) if_family_head; +static TAILQ_HEAD(, proto_family_str) proto_family_head; +static lck_grp_t *dlil_lock_group; +static lck_grp_t *ifnet_lock_group; +static lck_grp_t *ifnet_head_lock_group; +static lck_attr_t *ifnet_lock_attr; +static lck_mtx_t *proto_family_mutex; +static lck_rw_t *ifnet_head_mutex; +static lck_mtx_t *dlil_ifnet_mutex; +static lck_mtx_t *dlil_mutex; +static unsigned long dlil_read_count = 0; +static unsigned long dlil_detach_waiting = 0; +extern u_int32_t ipv4_ll_arp_aware; int dlil_initialized = 0; -decl_simple_lock_data(, dlil_input_lock) +lck_spin_t *dlil_input_lock; +__private_extern__ thread_t dlil_input_thread_ptr = 0; int dlil_input_thread_wakeup = 0; +__private_extern__ int dlil_output_thread_wakeup = 0; 
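The dlil_read_count and dlil_detach_waiting globals above, together with the dlil_writer_waiting bit declared a few lines below, implement dlil's hand-rolled reader/writer discipline: readers bump the count with a compare-and-swap in dlil_read_begin(), and a writer sets the high bit and waits for the readers to drain. A hedged sketch of how a read-side path is expected to bracket its work with the helpers defined below; the lookup function itself is illustrative only:

/*
 * Sketch: read-side bracketing with dlil_read_begin()/dlil_read_end()
 * (defined below).  example_find_proto() is hypothetical;
 * find_attached_proto() and if_proto_ref() appear later in this file.
 */
static struct if_proto *
example_find_proto(struct ifnet *ifp, u_long protocol_family)
{
	struct if_proto *proto;

	dlil_read_begin();		/* blocks while a writer is waiting */
	proto = find_attached_proto(ifp, protocol_family);
	if (proto != NULL)
		if_proto_ref(proto);	/* take a reference before leaving the read section */
	dlil_read_end();		/* last reader out wakes a pending writer */

	return proto;
}

Taking the protocol reference while still inside the read section is the essential step: once dlil_read_end() runs, a writer may detach and free the entry, so the reference is what keeps the returned pointer valid.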
static struct mbuf *dlil_input_mbuf_head = NULL; static struct mbuf *dlil_input_mbuf_tail = NULL; #if NLOOP > 1 @@ -171,18 +204,140 @@ static struct mbuf *dlil_input_mbuf_tail = NULL; #endif static struct mbuf *dlil_input_loop_head = NULL; static struct mbuf *dlil_input_loop_tail = NULL; -extern struct ifmultihead ifma_lostlist; static void dlil_input_thread(void); -extern void run_netisr(void); +static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg); +struct ifnet *ifbyfamily(u_long family, short unit); +static int dlil_detach_filter_internal(interface_filter_t filter, int detached); +static void dlil_call_delayed_detach_thread(void); + +static void dlil_read_begin(void); +static void dlil_read_end(void); +static int dlil_write_begin(void); +static void dlil_write_end(void); + +static int ifp_use(struct ifnet *ifp, int handle_zero); +static int ifp_unuse(struct ifnet *ifp); +static void ifp_use_reached_zero(struct ifnet *ifp); + extern void bpfdetach(struct ifnet*); +extern void proto_input_run(void); // new run_netisr + + +int dlil_input_packet(struct ifnet *ifp, struct mbuf *m, char *frame_header); + +__private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *); int dlil_expand_mcl; +static const u_int32_t dlil_writer_waiting = 0x80000000; + +static __inline__ void* +_cast_non_const(const void * ptr) { + union { + const void* cval; + void* val; + } ret; + + ret.cval = ptr; + return (ret.val); +} + +/* Should these be inline? */ +static void +dlil_read_begin(void) +{ + unsigned long new_value; + unsigned long old_value; + struct uthread *uth = get_bsdthread_info(current_thread()); + + if (uth->dlil_incremented_read == dlil_writer_waiting) + panic("dlil_read_begin - thread is already a writer"); + + do { +again: + old_value = dlil_read_count; + + if ((old_value & dlil_writer_waiting) != 0 && uth->dlil_incremented_read == 0) + { + tsleep(&dlil_read_count, PRIBIO, "dlil_read_count", 1); + goto again; + } + + new_value = old_value + 1; + } while (!OSCompareAndSwap((UInt32)old_value, (UInt32)new_value, (UInt32*)&dlil_read_count)); + + uth->dlil_incremented_read++; +} + +static void +dlil_read_end(void) +{ + struct uthread *uth = get_bsdthread_info(current_thread()); + + OSDecrementAtomic((UInt32*)&dlil_read_count); + uth->dlil_incremented_read--; + if (dlil_read_count == dlil_writer_waiting) + wakeup(_cast_non_const(&dlil_writer_waiting)); +} + +static int +dlil_write_begin(void) +{ + struct uthread *uth = get_bsdthread_info(current_thread()); + + if (uth->dlil_incremented_read != 0) { + return EDEADLK; + } + lck_mtx_lock(dlil_mutex); + OSBitOrAtomic((UInt32)dlil_writer_waiting, (UInt32*)&dlil_read_count); +again: + if (dlil_read_count == dlil_writer_waiting) { + uth->dlil_incremented_read = dlil_writer_waiting; + return 0; + } + else { + tsleep(_cast_non_const(&dlil_writer_waiting), PRIBIO, "dlil_writer_waiting", 1); + goto again; + } +} + +static void +dlil_write_end(void) +{ + struct uthread *uth = get_bsdthread_info(current_thread()); + + if (uth->dlil_incremented_read != dlil_writer_waiting) + panic("dlil_write_end - thread is not a writer"); + OSBitAndAtomic((UInt32)~dlil_writer_waiting, (UInt32*)&dlil_read_count); + lck_mtx_unlock(dlil_mutex); + uth->dlil_incremented_read = 0; + wakeup(&dlil_read_count); +} + +#define PROTO_HASH_SLOTS 0x5 + /* * Internal functions. 
*/ +static int +proto_hash_value(u_long protocol_family) +{ + switch(protocol_family) { + case PF_INET: + return 0; + case PF_INET6: + return 1; + case PF_APPLETALK: + return 2; + case PF_VLAN: + return 3; + default: + return 4; + } +} + static struct if_family_str *find_family_module(u_long if_family) { @@ -197,7 +352,8 @@ struct if_family_str *find_family_module(u_long if_family) } static -struct proto_family_str *find_proto_module(u_long proto_family, u_long if_family) +struct proto_family_str* +find_proto_module(u_long proto_family, u_long if_family) { struct proto_family_str *mod = NULL; @@ -210,1305 +366,2068 @@ struct proto_family_str *find_proto_module(u_long proto_family, u_long if_family return mod; } - -/* - * Public functions. - */ - -struct ifnet *ifbyfamily(u_long family, short unit) +static struct if_proto* +find_attached_proto(struct ifnet *ifp, u_long protocol_family) { - struct ifnet *ifp; - - TAILQ_FOREACH(ifp, &ifnet, if_link) - if ((family == ifp->if_family) && - (ifp->if_unit == unit)) - return ifp; - - return 0; + struct if_proto *proto = NULL; + u_long i = proto_hash_value(protocol_family); + if (ifp->if_proto_hash) { + proto = SLIST_FIRST(&ifp->if_proto_hash[i]); + } + + while(proto && proto->protocol_family != protocol_family) { + proto = SLIST_NEXT(proto, next_hash); + } + + return proto; } -struct if_proto *dlttoproto(u_long dl_tag) +static void +if_proto_ref(struct if_proto *proto) { - if (dl_tag < dl_tag_nb && dl_tag_array[dl_tag].ifp) - return dl_tag_array[dl_tag].proto; - return 0; + OSAddAtomic(1, (UInt32*)&proto->refcount); } - -static int dlil_ifp_proto_count(struct ifnet * ifp) +static void +if_proto_free(struct if_proto *proto) { - int count = 0; - struct if_proto * proto; - struct dlil_proto_head * tmp; - - tmp = (struct dlil_proto_head *) &ifp->proto_head; - - TAILQ_FOREACH(proto, tmp, next) - count++; - - return count; + int oldval = OSAddAtomic(-1, (UInt32*)&proto->refcount); + + if (oldval == 1) { /* This was the last reference */ + FREE(proto, M_IFADDR); + } } -u_long ifptodlt(struct ifnet *ifp, u_long proto_family) +__private_extern__ void +ifnet_lock_assert( + __unused struct ifnet *ifp, + __unused int what) { - struct if_proto *proto; - struct dlil_proto_head *tmp = (struct dlil_proto_head *) &ifp->proto_head; - - - TAILQ_FOREACH(proto, tmp, next) - if (proto->protocol_family == proto_family) - return proto->dl_tag; - - return 0; +#if IFNET_RW_LOCK + /* + * Not implemented for rw locks. + * + * Function exists so when/if we use mutex we can + * enable this check. 
+ */ +#else + lck_mtx_assert(ifp->if_lock, what); +#endif } - -int dlil_find_dltag(u_long if_family, short unit, u_long proto_family, u_long *dl_tag) +__private_extern__ void +ifnet_lock_shared( + struct ifnet *ifp) { - struct ifnet *ifp; - - ifp = ifbyfamily(if_family, unit); - if (!ifp) - return ENOENT; - - *dl_tag = ifptodlt(ifp, proto_family); - if (*dl_tag == 0) - return EPROTONOSUPPORT; - else - return 0; +#if IFNET_RW_LOCK + lck_rw_lock_shared(ifp->if_lock); +#else + lck_mtx_assert(ifp->if_lock, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(ifp->if_lock); +#endif } - -void dlil_post_msg(struct ifnet *ifp, u_long event_subclass, u_long event_code, - struct net_event_data *event_data, u_long event_data_len) +__private_extern__ void +ifnet_lock_exclusive( + struct ifnet *ifp) { - struct net_event_data ev_data; - struct kev_msg ev_msg; - - /* - * a net event always start with a net_event_data structure - * but the caller can generate a simple net event or - * provide a longer event structure to post - */ - - ev_msg.vendor_code = KEV_VENDOR_APPLE; - ev_msg.kev_class = KEV_NETWORK_CLASS; - ev_msg.kev_subclass = event_subclass; - ev_msg.event_code = event_code; - - if (event_data == 0) { - event_data = &ev_data; - event_data_len = sizeof(struct net_event_data); - } - - strncpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ); - event_data->if_family = ifp->if_family; - event_data->if_unit = (unsigned long) ifp->if_unit; - - ev_msg.dv[0].data_length = event_data_len; - ev_msg.dv[0].data_ptr = event_data; - ev_msg.dv[1].data_length = 0; - - kev_post_msg(&ev_msg); +#if IFNET_RW_LOCK + lck_rw_lock_exclusive(ifp->if_lock); +#else + lck_mtx_assert(ifp->if_lock, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(ifp->if_lock); +#endif } - - -void -dlil_init() +__private_extern__ void +ifnet_lock_done( + struct ifnet *ifp) { - int i; - - TAILQ_INIT(&dlil_ifnet_head); - TAILQ_INIT(&if_family_head); - TAILQ_INIT(&proto_family_head); - - // create the dl tag array - MALLOC(dl_tag_array, void *, sizeof(struct dl_tag_str) * MAX_DL_TAGS, M_NKE, M_WAITOK); - if (dl_tag_array == 0) { - printf("dlil_init tags array allocation failed\n"); - return; //very bad - } - bzero(dl_tag_array, sizeof(struct dl_tag_str) * MAX_DL_TAGS); - dl_tag_nb = MAX_DL_TAGS; - - // create the dl filters array - MALLOC(dlil_filters, void *, sizeof(struct dlil_filter_id_str) * MAX_DLIL_FILTERS, M_NKE, M_WAITOK); - if (dlil_filters == 0) { - printf("dlil_init filters array allocation failed\n"); - return; //very bad - } - bzero(dlil_filters, sizeof(struct dlil_filter_id_str) * MAX_DLIL_FILTERS); - dlil_filters_nb = MAX_DLIL_FILTERS; - - bzero(&dlil_stats, sizeof(dlil_stats)); - - simple_lock_init(&dlil_input_lock); - - /* - * Start up the dlil input thread once everything is initialized - */ - (void) kernel_thread(kernel_task, dlil_input_thread); +#if IFNET_RW_LOCK + lck_rw_done(ifp->if_lock); +#else + lck_mtx_assert(ifp->if_lock, LCK_MTX_ASSERT_OWNED); + lck_mtx_unlock(ifp->if_lock); +#endif } -u_long get_new_filter_id() +__private_extern__ void +ifnet_head_lock_shared() { - u_long i; - u_char *p; - - for (i=1; i < dlil_filters_nb; i++) - if (dlil_filters[i].type == 0) - break; - - if (i == dlil_filters_nb) { - // expand the filters array by MAX_DLIL_FILTERS - MALLOC(p, u_char *, sizeof(struct dlil_filter_id_str) * (dlil_filters_nb + MAX_DLIL_FILTERS), M_NKE, M_WAITOK); - if (p == 0) - return 0; - - bcopy(dlil_filters, p, sizeof(struct dlil_filter_id_str) * dlil_filters_nb); - bzero(p + sizeof(struct dlil_filter_id_str) * dlil_filters_nb, 
sizeof(struct dlil_filter_id_str) * MAX_DL_TAGS); - dlil_filters_nb += MAX_DLIL_FILTERS; - FREE(dlil_filters, M_NKE); - dlil_filters = (struct dlil_filter_id_str *)p; - } - - return i; + lck_rw_lock_shared(ifnet_head_mutex); } +__private_extern__ void +ifnet_head_lock_exclusive() +{ + lck_rw_lock_exclusive(ifnet_head_mutex); +} -int dlil_attach_interface_filter(struct ifnet *ifp, - struct dlil_if_flt_str *if_filter, - u_long *filter_id, - int insertion_point) +__private_extern__ void +ifnet_head_done() { - int s; - int retval = 0; - struct dlil_filterq_entry *tmp_ptr; - struct dlil_filterq_entry *if_filt; - struct dlil_filterq_head *fhead = (struct dlil_filterq_head *) &ifp->if_flt_head; - boolean_t funnel_state; + lck_rw_done(ifnet_head_mutex); +} - MALLOC(tmp_ptr, struct dlil_filterq_entry *, sizeof(*tmp_ptr), M_NKE, M_WAITOK); - if (tmp_ptr == NULL) - return (ENOBUFS); +/* + * Public functions. + */ +struct ifnet *ifbyfamily(u_long family, short unit) +{ + struct ifnet *ifp; - bcopy((caddr_t) if_filter, (caddr_t) &tmp_ptr->variants.if_filter, - sizeof(struct dlil_if_flt_str)); + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) + if ((family == ifp->if_family) && (ifp->if_unit == unit)) + break; + ifnet_head_done(); - funnel_state = thread_funnel_set(network_flock, TRUE); - s = splnet(); + return ifp; +} - *filter_id = get_new_filter_id(); - if (*filter_id == 0) { - FREE(tmp_ptr, M_NKE); - retval = ENOMEM; - goto end; - } - - dlil_filters[*filter_id].filter_ptr = tmp_ptr; - dlil_filters[*filter_id].head = (struct dlil_filterq_head *) &ifp->if_flt_head; - dlil_filters[*filter_id].type = DLIL_IF_FILTER; - dlil_filters[*filter_id].ifp = ifp; - tmp_ptr->filter_id = *filter_id; - tmp_ptr->type = DLIL_IF_FILTER; - - if (insertion_point != DLIL_LAST_FILTER) { - TAILQ_FOREACH(if_filt, fhead, que) - if (insertion_point == if_filt->filter_id) { - TAILQ_INSERT_BEFORE(if_filt, tmp_ptr, que); - break; - } - } - else - TAILQ_INSERT_TAIL(fhead, tmp_ptr, que); +static int dlil_ifp_proto_count(struct ifnet * ifp) +{ + int count = 0; + int i; + + if (ifp->if_proto_hash != NULL) { + for (i = 0; i < PROTO_HASH_SLOTS; i++) { + struct if_proto *proto; + SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) { + count++; + } + } + } + + return count; +} -end: - splx(s); - thread_funnel_set(network_flock, funnel_state); - return retval; +__private_extern__ void +dlil_post_msg(struct ifnet *ifp, u_long event_subclass, u_long event_code, + struct net_event_data *event_data, u_long event_data_len) +{ + struct net_event_data ev_data; + struct kev_msg ev_msg; + + /* + * a net event always start with a net_event_data structure + * but the caller can generate a simple net event or + * provide a longer event structure to post + */ + + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = event_subclass; + ev_msg.event_code = event_code; + + if (event_data == 0) { + event_data = &ev_data; + event_data_len = sizeof(struct net_event_data); + } + + strncpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ); + event_data->if_family = ifp->if_family; + event_data->if_unit = (unsigned long) ifp->if_unit; + + ev_msg.dv[0].data_length = event_data_len; + ev_msg.dv[0].data_ptr = event_data; + ev_msg.dv[1].data_length = 0; + + dlil_event_internal(ifp, &ev_msg); } +void dlil_init(void); +void +dlil_init(void) +{ + lck_grp_attr_t *grp_attributes = 0; + lck_attr_t *lck_attributes = 0; + lck_grp_t *input_lock_grp = 0; + + TAILQ_INIT(&dlil_ifnet_head); + 
TAILQ_INIT(&if_family_head); + TAILQ_INIT(&proto_family_head); + TAILQ_INIT(&ifnet_head); + + /* Setup the lock groups we will use */ + grp_attributes = lck_grp_attr_alloc_init(); + lck_grp_attr_setdefault(grp_attributes); + + dlil_lock_group = lck_grp_alloc_init("dlil internal locks", grp_attributes); +#if IFNET_RW_LOCK + ifnet_lock_group = lck_grp_alloc_init("ifnet locks", grp_attributes); +#else + ifnet_lock_group = lck_grp_alloc_init("ifnet locks", grp_attributes); +#endif + ifnet_head_lock_group = lck_grp_alloc_init("ifnet head lock", grp_attributes); + input_lock_grp = lck_grp_alloc_init("dlil input lock", grp_attributes); + lck_grp_attr_free(grp_attributes); + grp_attributes = 0; + + /* Setup the lock attributes we will use */ + lck_attributes = lck_attr_alloc_init(); + lck_attr_setdefault(lck_attributes); + + ifnet_lock_attr = lck_attr_alloc_init(); + lck_attr_setdefault(ifnet_lock_attr); + + dlil_input_lock = lck_spin_alloc_init(input_lock_grp, lck_attributes); + input_lock_grp = 0; + + ifnet_head_mutex = lck_rw_alloc_init(ifnet_head_lock_group, lck_attributes); + proto_family_mutex = lck_mtx_alloc_init(dlil_lock_group, lck_attributes); + dlil_ifnet_mutex = lck_mtx_alloc_init(dlil_lock_group, lck_attributes); + dlil_mutex = lck_mtx_alloc_init(dlil_lock_group, lck_attributes); + + lck_attr_free(lck_attributes); + lck_attributes = 0; + + /* + * Start up the dlil input thread once everything is initialized + */ + (void) kernel_thread(kernel_task, dlil_input_thread); + (void) kernel_thread(kernel_task, dlil_call_delayed_detach_thread); +} -int dlil_attach_protocol_filter(u_long dl_tag, - struct dlil_pr_flt_str *pr_filter, - u_long *filter_id, - int insertion_point) +int +dlil_attach_filter( + struct ifnet *ifp, + const struct iff_filter *if_filter, + interface_filter_t *filter_ref) { - struct dlil_filterq_entry *tmp_ptr, *pr_filt; - int s; int retval = 0; - boolean_t funnel_state; + struct ifnet_filter *filter; - if (dl_tag >= dl_tag_nb || dl_tag_array[dl_tag].ifp == 0) - return (ENOENT); - - MALLOC(tmp_ptr, struct dlil_filterq_entry *, sizeof(*tmp_ptr), M_NKE, M_WAITOK); - if (tmp_ptr == NULL) - return (ENOBUFS); - - bcopy((caddr_t) pr_filter, (caddr_t) &tmp_ptr->variants.pr_filter, - sizeof(struct dlil_pr_flt_str)); - - funnel_state = thread_funnel_set(network_flock, TRUE); - s = splnet(); + MALLOC(filter, struct ifnet_filter *, sizeof(*filter), M_NKE, M_WAITOK); + if (filter == NULL) + return ENOMEM; + bzero(filter, sizeof(*filter)); - *filter_id = get_new_filter_id(); - if (*filter_id == 0) { - FREE(tmp_ptr, M_NKE); - retval = ENOMEM; - goto end; - } - dlil_filters[*filter_id].filter_ptr = tmp_ptr; - dlil_filters[*filter_id].head = dl_tag_array[dl_tag].pr_flt_head; - dlil_filters[*filter_id].type = DLIL_PR_FILTER; - dlil_filters[*filter_id].proto = dl_tag_array[dl_tag].proto; - dlil_filters[*filter_id].ifp = dl_tag_array[dl_tag].ifp; - tmp_ptr->filter_id = *filter_id; - tmp_ptr->type = DLIL_PR_FILTER; - - if (insertion_point != DLIL_LAST_FILTER) { - TAILQ_FOREACH(pr_filt, dl_tag_array[dl_tag].pr_flt_head, que) - if (insertion_point == pr_filt->filter_id) { - TAILQ_INSERT_BEFORE(pr_filt, tmp_ptr, que); - break; - } - } - else - TAILQ_INSERT_TAIL(dl_tag_array[dl_tag].pr_flt_head, tmp_ptr, que); - -end: - splx(s); - thread_funnel_set(network_flock, funnel_state); - return retval; + filter->filt_ifp = ifp; + filter->filt_cookie = if_filter->iff_cookie; + filter->filt_name = if_filter->iff_name; + filter->filt_protocol = if_filter->iff_protocol; + filter->filt_input = 
if_filter->iff_input; + filter->filt_output = if_filter->iff_output; + filter->filt_event = if_filter->iff_event; + filter->filt_ioctl = if_filter->iff_ioctl; + filter->filt_detached = if_filter->iff_detached; + + if ((retval = dlil_write_begin()) != 0) { + /* Failed to acquire the write lock */ + FREE(filter, M_NKE); + return retval; + } + TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next); + dlil_write_end(); + *filter_ref = filter; + return retval; } - -int -dlil_detach_filter(u_long filter_id) +static int +dlil_detach_filter_internal(interface_filter_t filter, int detached) { - struct dlil_filter_id_str *flt; - int s, retval = 0; - boolean_t funnel_state; - - funnel_state = thread_funnel_set(network_flock, TRUE); - s = splnet(); - - if (filter_id >= dlil_filters_nb || dlil_filters[filter_id].type == 0) { - retval = ENOENT; - goto end; - } - - flt = &dlil_filters[filter_id]; - - if (flt->type == DLIL_IF_FILTER) { - if (IFILT(flt->filter_ptr).filter_detach) - (*IFILT(flt->filter_ptr).filter_detach)(IFILT(flt->filter_ptr).cookie); - } - else { - if (flt->type == DLIL_PR_FILTER) { - if (PFILT(flt->filter_ptr).filter_detach) - (*PFILT(flt->filter_ptr).filter_detach)(PFILT(flt->filter_ptr).cookie); + int retval = 0; + + + /* Take the write lock */ +#if DLIL_ALWAYS_DELAY_DETACH + retval = EDEADLK; +#else + if (detached == 0 && (retval = dlil_write_begin()) != 0) +#endif + { + if (retval == EDEADLK) { + /* Perform a delayed detach */ + filter->filt_detaching = 1; + dlil_detach_waiting = 1; + wakeup(&dlil_detach_waiting); + retval = 0; + } + return retval; } - } - - TAILQ_REMOVE(flt->head, flt->filter_ptr, que); - FREE(flt->filter_ptr, M_NKE); - flt->type = 0; + + if (detached == 0) + TAILQ_REMOVE(&filter->filt_ifp->if_flt_head, filter, filt_next); + + /* release the write lock */ + if (detached == 0) + dlil_write_end(); + + if (filter->filt_detached) + filter->filt_detached(filter->filt_cookie, filter->filt_ifp); -end: - splx(s); - thread_funnel_set(network_flock, funnel_state); - return retval; + FREE(filter, M_NKE); + + return retval; } void -dlil_input_thread_continue(void) -{ - while (1) { - struct mbuf *m, *m_loop; - - usimple_lock(&dlil_input_lock); - m = dlil_input_mbuf_head; - dlil_input_mbuf_head = NULL; - dlil_input_mbuf_tail = NULL; - m_loop = dlil_input_loop_head; - dlil_input_loop_head = NULL; - dlil_input_loop_tail = NULL; - usimple_unlock(&dlil_input_lock); - - /* - * NOTE warning %%% attention !!!! - * We should think about putting some thread starvation safeguards if - * we deal with long chains of packets. 
- */ - while (m) { - struct mbuf *m0 = m->m_nextpkt; - void *header = m->m_pkthdr.header; - - m->m_nextpkt = NULL; - m->m_pkthdr.header = NULL; - (void) dlil_input_packet(m->m_pkthdr.rcvif, m, header); - m = m0; - } - m = m_loop; - while (m) { - struct mbuf *m0 = m->m_nextpkt; - void *header = m->m_pkthdr.header; - struct ifnet *ifp = &loif[0]; - - m->m_nextpkt = NULL; - m->m_pkthdr.header = NULL; - (void) dlil_input_packet(ifp, m, header); - m = m0; - } +dlil_detach_filter(interface_filter_t filter) +{ + dlil_detach_filter_internal(filter, 0); +} - if (netisr != 0) - run_netisr(); +static void +dlil_input_thread_continue( + __unused void* foo, + __unused wait_result_t wait) +{ + while (1) { + struct mbuf *m, *m_loop; + + lck_spin_lock(dlil_input_lock); + m = dlil_input_mbuf_head; + dlil_input_mbuf_head = NULL; + dlil_input_mbuf_tail = NULL; + m_loop = dlil_input_loop_head; + dlil_input_loop_head = NULL; + dlil_input_loop_tail = NULL; + lck_spin_unlock(dlil_input_lock); + + /* + * NOTE warning %%% attention !!!! + * We should think about putting some thread starvation safeguards if + * we deal with long chains of packets. + */ + while (m) { + struct mbuf *m0 = m->m_nextpkt; + void *header = m->m_pkthdr.header; + + m->m_nextpkt = NULL; + m->m_pkthdr.header = NULL; + (void) dlil_input_packet(m->m_pkthdr.rcvif, m, header); + m = m0; + } + m = m_loop; + while (m) { + struct mbuf *m0 = m->m_nextpkt; + void *header = m->m_pkthdr.header; + struct ifnet *ifp = &loif[0]; + + m->m_nextpkt = NULL; + m->m_pkthdr.header = NULL; + (void) dlil_input_packet(ifp, m, header); + m = m0; + } + + proto_input_run(); - if (dlil_input_mbuf_head == NULL && - dlil_input_loop_head == NULL && - netisr == 0) { - assert_wait(&dlil_input_thread_wakeup, THREAD_UNINT); - (void) thread_block(dlil_input_thread_continue); - /* NOTREACHED */ - } - } + if (dlil_input_mbuf_head == NULL && + dlil_input_loop_head == NULL) { + assert_wait(&dlil_input_thread_wakeup, THREAD_UNINT); + (void) thread_block(dlil_input_thread_continue); + /* NOTREACHED */ + } + } } void dlil_input_thread(void) { - register thread_t self = current_act(); - - ml_thread_policy(self, MACHINE_GROUP, - (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_NETISR)); + register thread_t self = current_thread(); + + ml_thread_policy(self, MACHINE_GROUP, + (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_NETISR)); + + dlil_initialized = 1; + dlil_input_thread_ptr = current_thread(); + dlil_input_thread_continue(NULL, THREAD_RESTART); +} - /* The dlil thread is always funneled */ - thread_funnel_set(network_flock, TRUE); - dlil_initialized = 1; - dlil_input_thread_continue(); +int +dlil_input_with_stats( + struct ifnet *ifp, + struct mbuf *m_head, + struct mbuf *m_tail, + const struct ifnet_stat_increment_param *stats) +{ + /* WARNING + * Because of loopbacked multicast we cannot stuff the ifp in + * the rcvif of the packet header: loopback has its own dlil + * input queue + */ + + lck_spin_lock(dlil_input_lock); + if (ifp->if_type != IFT_LOOP) { + if (dlil_input_mbuf_head == NULL) + dlil_input_mbuf_head = m_head; + else if (dlil_input_mbuf_tail != NULL) + dlil_input_mbuf_tail->m_nextpkt = m_head; + dlil_input_mbuf_tail = m_tail ? m_tail : m_head; + } else { + if (dlil_input_loop_head == NULL) + dlil_input_loop_head = m_head; + else if (dlil_input_loop_tail != NULL) + dlil_input_loop_tail->m_nextpkt = m_head; + dlil_input_loop_tail = m_tail ? 
m_tail : m_head; + } + if (stats) { + ifp->if_data.ifi_ipackets += stats->packets_in; + ifp->if_data.ifi_ibytes += stats->bytes_in; + ifp->if_data.ifi_ierrors += stats->errors_in; + + ifp->if_data.ifi_opackets += stats->packets_out; + ifp->if_data.ifi_obytes += stats->bytes_out; + ifp->if_data.ifi_oerrors += stats->errors_out; + + ifp->if_data.ifi_collisions += stats->collisions; + ifp->if_data.ifi_iqdrops += stats->dropped; + } + lck_spin_unlock(dlil_input_lock); + + wakeup((caddr_t)&dlil_input_thread_wakeup); + + return 0; } int dlil_input(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail) -{ - /* WARNING - * Because of loopbacked multicast we cannot stuff the ifp in - * the rcvif of the packet header: loopback has its own dlil - * input queue - */ - - usimple_lock(&dlil_input_lock); - if (ifp->if_type != IFT_LOOP) { - if (dlil_input_mbuf_head == NULL) - dlil_input_mbuf_head = m_head; - else if (dlil_input_mbuf_tail != NULL) - dlil_input_mbuf_tail->m_nextpkt = m_head; - dlil_input_mbuf_tail = m_tail ? m_tail : m_head; - } else { - if (dlil_input_loop_head == NULL) - dlil_input_loop_head = m_head; - else if (dlil_input_loop_tail != NULL) - dlil_input_loop_tail->m_nextpkt = m_head; - dlil_input_loop_tail = m_tail ? m_tail : m_head; - } - usimple_unlock(&dlil_input_lock); - - wakeup((caddr_t)&dlil_input_thread_wakeup); - - return 0; +{ + return dlil_input_with_stats(ifp, m_head, m_tail, NULL); } int -dlil_input_packet(struct ifnet *ifp, struct mbuf *m, +dlil_input_packet(struct ifnet *ifp, struct mbuf *m, char *frame_header) { - struct ifnet *orig_ifp = 0; - struct dlil_filterq_entry *tmp; int retval; struct if_proto *ifproto = 0; - struct if_proto *proto; - struct dlil_filterq_head *fhead = (struct dlil_filterq_head *) &ifp->if_flt_head; + protocol_family_t protocol_family; + struct ifnet_filter *filter; KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START,0,0,0,0,0); - /* - * Run interface filters - */ - - while (orig_ifp != ifp) { - orig_ifp = ifp; - - TAILQ_FOREACH_REVERSE(tmp, fhead, que, dlil_filterq_head) { - if (IFILT(tmp).filter_if_input) { - retval = (*IFILT(tmp).filter_if_input)(IFILT(tmp).cookie, - &ifp, - &m, - &frame_header); - if (retval) { - if (retval == EJUSTRETURN) - return 0; - else { - m_freem(m); - return retval; - } - } - } + /* + * Lock the interface while we run through + * the filters and the demux. This lock + * protects the filter list and the demux list. + */ + dlil_read_begin(); - if (ifp != orig_ifp) - break; - } - } + /* + * Call family demux module. If the demux module finds a match + * for the frame it will fill-in the ifproto pointer. + */ - ifp->if_lastchange = time; - - /* - * Call family demux module. If the demux module finds a match - * for the frame it will fill-in the ifproto pointer. - */ + retval = ifp->if_demux(ifp, m, frame_header, &protocol_family); + if (retval != 0) + protocol_family = 0; + if (retval == EJUSTRETURN) { + dlil_read_end(); + return 0; + } - retval = (*ifp->if_demux)(ifp, m, frame_header, &ifproto ); + /* DANGER!!! */ + if (m->m_flags & (M_BCAST|M_MCAST)) + ifp->if_imcasts++; - if (m->m_flags & (M_BCAST|M_MCAST)) - ifp->if_imcasts++; - - if ((retval) && (retval != EJUSTRETURN) && (ifp->offercnt)) { /* - * No match was found, look for any offers. 
+ * Run interface filters */ - struct dlil_proto_head *tmp = (struct dlil_proto_head *) &ifp->proto_head; - TAILQ_FOREACH(proto, tmp, next) { - if ((proto->dl_offer) && (proto->dl_offer(m, frame_header) == 0)) { - ifproto = proto; - retval = 0; - break; - } + + /* Do not pass VLAN tagged packets to filters PR-3586856 */ + if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) { + TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { + int filter_result; + if (filter->filt_input && (filter->filt_protocol == 0 || + filter->filt_protocol == protocol_family)) { + filter_result = filter->filt_input(filter->filt_cookie, ifp, protocol_family, &m, &frame_header); + + if (filter_result) { + dlil_read_end(); + if (filter_result == EJUSTRETURN) { + filter_result = 0; + } + else { + m_freem(m); + } + + return filter_result; + } + } + } } - } - if (retval) { - if (retval != EJUSTRETURN) { - m_freem(m); - return retval; + /* Demux is done, interface filters have been processed, unlock the mutex */ + if (retval || ((m->m_flags & M_PROMISC) != 0) ) { + dlil_read_end(); + if (retval != EJUSTRETURN) { + m_freem(m); + return retval; + } + else + return 0; } - else - return 0; - } - else + + ifproto = find_attached_proto(ifp, protocol_family); + if (ifproto == 0) { - printf("ERROR - dlil_input - if_demux didn't return an if_proto pointer\n"); - m_freem(m); - return 0; + dlil_read_end(); + DLIL_PRINTF("ERROR - dlil_input - if_demux didn't return an if_proto pointer\n"); + m_freem(m); + return 0; } + + /* + * Hand the packet off to the protocol. + */ -/* - * Call any attached protocol filters. - */ - - TAILQ_FOREACH_REVERSE(tmp, &ifproto->pr_flt_head, que, dlil_filterq_head) { - if (PFILT(tmp).filter_dl_input) { - retval = (*PFILT(tmp).filter_dl_input)(PFILT(tmp).cookie, - &m, - &frame_header, - &ifp); + if (ifproto->dl_domain && (ifproto->dl_domain->dom_flags & DOM_REENTRANT) == 0) { + lck_mtx_lock(ifproto->dl_domain->dom_mtx); + } - if (retval) { - if (retval == EJUSTRETURN) - return 0; - else { - m_freem(m); - return retval; - } - } - } - } + if (ifproto->proto_kpi == kProtoKPI_DLIL) + retval = (*ifproto->kpi.dlil.dl_input)(m, frame_header, + ifp, ifproto->protocol_family, + TRUE); + else + retval = ifproto->kpi.v1.input(ifp, ifproto->protocol_family, m, frame_header); + if (ifproto->dl_domain && (ifproto->dl_domain->dom_flags & DOM_REENTRANT) == 0) { + lck_mtx_unlock(ifproto->dl_domain->dom_mtx); + } + dlil_read_end(); - retval = (*ifproto->dl_input)(m, frame_header, - ifp, ifproto->dl_tag, - TRUE); - - if (retval == EJUSTRETURN) - retval = 0; - else - if (retval) - m_freem(m); + if (retval == EJUSTRETURN) + retval = 0; + else + if (retval) + m_freem(m); - KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END,0,0,0,0,0); - return retval; + KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END,0,0,0,0,0); + return retval; } - - -void ether_input(ifp, eh, m) - struct ifnet *ifp; - struct ether_header *eh; - struct mbuf *m; - +static int +dlil_event_internal(struct ifnet *ifp, struct kev_msg *event) { - kprintf("Someone is calling ether_input!!\n"); - - dlil_input(ifp, m, NULL); + struct ifnet_filter *filter; + + if (ifp_use(ifp, kIfNetUseCount_MustNotBeZero) == 0) { + dlil_read_begin(); + + /* Pass the event to the interface filters */ + TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { + if (filter->filt_event) + filter->filt_event(filter->filt_cookie, ifp, filter->filt_protocol, event); + } + + if (ifp->if_proto_hash) { + int i; + + for (i = 0; i < PROTO_HASH_SLOTS; i++) { + struct if_proto *proto; + + 
SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) { + /* Pass the event to the protocol */ + if (proto->proto_kpi == kProtoKPI_DLIL) { + if (proto->kpi.dlil.dl_event) + proto->kpi.dlil.dl_event(ifp, event); + } + else { + if (proto->kpi.v1.event) + proto->kpi.v1.event(ifp, proto->protocol_family, event); + } + } + } + } + + dlil_read_end(); + + /* Pass the event to the interface */ + if (ifp->if_event) + ifp->if_event(ifp, event); + + if (ifp_unuse(ifp)) + ifp_use_reached_zero(ifp); + } + + return kev_post_msg(event); } - int dlil_event(struct ifnet *ifp, struct kern_event_msg *event) { - struct dlil_filterq_entry *filt; - int retval = 0; - struct ifnet *orig_ifp = 0; - struct if_proto *proto; - struct dlil_filterq_head *fhead = (struct dlil_filterq_head *) &ifp->if_flt_head; - struct kev_msg kev_msg; - struct dlil_proto_head *tmp = (struct dlil_proto_head *) &ifp->proto_head; - boolean_t funnel_state; - - - funnel_state = thread_funnel_set(network_flock, TRUE); - - while (orig_ifp != ifp) { - orig_ifp = ifp; - - TAILQ_FOREACH_REVERSE(filt, fhead, que, dlil_filterq_head) { - if (IFILT(filt).filter_if_event) { - retval = (*IFILT(filt).filter_if_event)(IFILT(filt).cookie, - &ifp, - &event); - - if (retval) { - (void) thread_funnel_set(network_flock, funnel_state); - if (retval == EJUSTRETURN) - return 0; - else - return retval; - } - } - - if (ifp != orig_ifp) - break; - } - } + int result = 0; + struct kev_msg kev_msg; - /* - * Call Interface Module event hook, if any. - */ + kev_msg.vendor_code = event->vendor_code; + kev_msg.kev_class = event->kev_class; + kev_msg.kev_subclass = event->kev_subclass; + kev_msg.event_code = event->event_code; + kev_msg.dv[0].data_ptr = &event->event_data[0]; + kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE; + kev_msg.dv[1].data_length = 0; + - if (ifp->if_event) { - retval = ifp->if_event(ifp, (caddr_t) event); + result = dlil_event_internal(ifp, &kev_msg); - if (retval) { - (void) thread_funnel_set(network_flock, funnel_state); - if (retval == EJUSTRETURN) - return 0; - else - return retval; - } - } + return result; +} - /* - * Call dl_event entry point for all protocols attached to this interface - */ +dlil_output_list( + struct ifnet* ifp, + u_long proto_family, + struct mbuf *packetlist, + caddr_t route, + const struct sockaddr *dest, + int raw) +{ + char *frame_type = 0; + char *dst_linkaddr = 0; + int error, retval = 0; + char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4]; + char dst_linkaddr_buffer[MAX_LINKADDR * 4]; + struct ifnet_filter *filter; + struct if_proto *proto = 0; + struct mbuf *m; + + KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START,0,0,0,0,0); +#if BRIDGE + if ((raw != 0) || proto_family != PF_INET || do_brige) { +#else + if ((raw != 0) || proto_family != PF_INET) { +#endif + while (packetlist) { + m = packetlist; + packetlist = packetlist->m_nextpkt; + m->m_nextpkt = NULL; + error = dlil_output(ifp, proto_family, m, route, dest, raw); + if (error) { + if (packetlist) + m_freem_list(packetlist); + return (error); + } + } + return (0); + } + + dlil_read_begin(); + + frame_type = frame_type_buffer; + dst_linkaddr = dst_linkaddr_buffer; + m = packetlist; + packetlist = packetlist->m_nextpkt; + m->m_nextpkt = NULL; + + proto = find_attached_proto(ifp, proto_family); + if (proto == NULL) { + retval = ENXIO; + goto cleanup; + } - TAILQ_FOREACH(proto, tmp, next) { - /* - * Call any attached protocol filters. 
- */ + retval = 0; + if (proto->proto_kpi == kProtoKPI_DLIL) { + if (proto->kpi.dlil.dl_pre_output) + retval = proto->kpi.dlil.dl_pre_output(ifp, proto_family, &m, dest, route, frame_type, dst_linkaddr); + } + else { + if (proto->kpi.v1.pre_output) + retval = proto->kpi.v1.pre_output(ifp, proto_family, &m, dest, route, frame_type, dst_linkaddr); + } - TAILQ_FOREACH_REVERSE(filt, &proto->pr_flt_head, que, dlil_filterq_head) { - if (PFILT(filt).filter_dl_event) { - retval = (*PFILT(filt).filter_dl_event)(PFILT(filt).cookie, - event); - - if (retval) { - (void) thread_funnel_set(network_flock, funnel_state); - if (retval == EJUSTRETURN) - return 0; - else - return retval; - } - } - } - - - /* - * Finally, call the dl_event entry point (if any) - */ - - if (proto->dl_event) - retval = (*proto->dl_event)(event, proto->dl_tag); - - if (retval == EJUSTRETURN) { - (void) thread_funnel_set(network_flock, funnel_state); - return 0; - } - } - + if (retval) { + if (retval != EJUSTRETURN) { + m_freem(m); + } + goto cleanup; + } - /* - * Now, post this event to the Kernel Event message queue - */ + do { + + + if (ifp->if_framer) { + retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr, frame_type); + if (retval) { + if (retval != EJUSTRETURN) { + m_freem(m); + } + goto cleanup; + } + } + + /* + * Let interface filters (if any) do their thing ... + */ + /* Do not pass VLAN tagged packets to filters PR-3586856 */ + if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) { + TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { + if ((filter->filt_protocol == 0 || (filter->filt_protocol == proto_family)) && + filter->filt_output) { + retval = filter->filt_output(filter->filt_cookie, ifp, proto_family, &m); + if (retval) { + if (retval == EJUSTRETURN) + continue; + else { + m_freem(m); + } + goto cleanup; + } + } + } + } + /* + * Finally, call the driver. 
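+ *
+ * A non-zero return here aborts the rest of the chain; any packets
+ * still queued on packetlist are freed in the cleanup path below.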
+ */ + + KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0,0,0,0,0); + retval = ifp->if_output(ifp, m); + if (retval) { + printf("dlil_output_list: output error retval = %x\n", retval); + goto cleanup; + } + KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0,0,0,0,0); - kev_msg.vendor_code = event->vendor_code; - kev_msg.kev_class = event->kev_class; - kev_msg.kev_subclass = event->kev_subclass; - kev_msg.event_code = event->event_code; - kev_msg.dv[0].data_ptr = &event->event_data[0]; - kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE; - kev_msg.dv[1].data_length = 0; + m = packetlist; + if (m) { + packetlist = packetlist->m_nextpkt; + m->m_nextpkt = NULL; + } + } while (m); - kev_post_msg(&kev_msg); + + KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END,0,0,0,0,0); - (void) thread_funnel_set(network_flock, funnel_state); - return 0; +cleanup: + dlil_read_end(); + if (packetlist) /* if any packet left, clean up */ + m_freem_list(packetlist); + if (retval == EJUSTRETURN) + retval = 0; + return retval; } - - -int -dlil_output(u_long dl_tag, - struct mbuf *m, - caddr_t route, - struct sockaddr *dest, - int raw - ) -{ - char *frame_type; - char *dst_linkaddr; - struct ifnet *orig_ifp = 0; - struct ifnet *ifp; - struct if_proto *proto; - struct dlil_filterq_entry *tmp; - int retval = 0; - char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4]; - char dst_linkaddr_buffer[MAX_LINKADDR * 4]; - struct dlil_filterq_head *fhead; - - KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START,0,0,0,0,0); - - if (dl_tag >= dl_tag_nb || dl_tag_array[dl_tag].ifp == 0) { - m_freem(m); - return ENOENT; - } - - ifp = dl_tag_array[dl_tag].ifp; - proto = dl_tag_array[dl_tag].proto; - - frame_type = frame_type_buffer; - dst_linkaddr = dst_linkaddr_buffer; - - fhead = (struct dlil_filterq_head *) &ifp->if_flt_head; - - if ((raw == 0) && (proto->dl_pre_output)) { - retval = (*proto->dl_pre_output)(ifp, &m, dest, route, - frame_type, dst_linkaddr, dl_tag); - if (retval) { - if (retval == EJUSTRETURN) - return 0; - else { - m_freem(m); - return retval; - } - } - } - /* - * Run any attached protocol filters. + * dlil_output + * + * Caller should have a lock on the protocol domain if the protocol + * doesn't support finer grained locking. In most cases, the lock + * will be held from the socket layer and won't be released until + * we return back to the socket layer. + * + * This does mean that we must take a protocol lock before we take + * an interface lock if we're going to take both. This makes sense + * because a protocol is likely to interact with an ifp while it + * is under the protocol lock. 
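+ *
+ * A hedged sketch of a conforming caller (proto_domain is illustrative,
+ * not a name used in this file):
+ *
+ *	lck_mtx_lock(proto_domain->dom_mtx);
+ *	error = dlil_output(ifp, PF_INET, m, (caddr_t)ro, dest, 0);
+ *	lck_mtx_unlock(proto_domain->dom_mtx);
+ *
+ * The domain mutex is the coarse per-protocol lock used elsewhere in
+ * this file (see the dom_mtx handling in dlil_input_packet).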
*/ - - if (TAILQ_EMPTY(dl_tag_array[dl_tag].pr_flt_head) == 0) { - TAILQ_FOREACH(tmp, dl_tag_array[dl_tag].pr_flt_head, que) { - if (PFILT(tmp).filter_dl_output) { - retval = (*PFILT(tmp).filter_dl_output)(PFILT(tmp).cookie, - &m, &ifp, &dest, dst_linkaddr, frame_type); - if (retval) { - if (retval == EJUSTRETURN) - return 0; - else { +int +dlil_output( + struct ifnet* ifp, + u_long proto_family, + struct mbuf *m, + caddr_t route, + const struct sockaddr *dest, + int raw) +{ + char *frame_type = 0; + char *dst_linkaddr = 0; + int retval = 0; + char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4]; + char dst_linkaddr_buffer[MAX_LINKADDR * 4]; + struct ifnet_filter *filter; + + KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START,0,0,0,0,0); + + dlil_read_begin(); + + frame_type = frame_type_buffer; + dst_linkaddr = dst_linkaddr_buffer; + + if (raw == 0) { + struct if_proto *proto = 0; + + proto = find_attached_proto(ifp, proto_family); + if (proto == NULL) { m_freem(m); - return retval; - } + retval = ENXIO; + goto cleanup; + } + + retval = 0; + if (proto->proto_kpi == kProtoKPI_DLIL) { + if (proto->kpi.dlil.dl_pre_output) + retval = proto->kpi.dlil.dl_pre_output(ifp, proto_family, &m, dest, route, frame_type, dst_linkaddr); + } + else { + if (proto->kpi.v1.pre_output) + retval = proto->kpi.v1.pre_output(ifp, proto_family, &m, dest, route, frame_type, dst_linkaddr); + } + + if (retval) { + if (retval != EJUSTRETURN) { + m_freem(m); + } + goto cleanup; } - } } - } - - -/* - * Call framing module - */ - if ((raw == 0) && (ifp->if_framer)) { - retval = (*ifp->if_framer)(ifp, &m, dest, dst_linkaddr, frame_type); - if (retval) { - if (retval == EJUSTRETURN) - return 0; - else - { - m_freem(m); - return retval; - } + + /* + * Call framing module + */ + if ((raw == 0) && (ifp->if_framer)) { + retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr, frame_type); + if (retval) { + if (retval != EJUSTRETURN) { + m_freem(m); + } + goto cleanup; + } } - } - -#if BRIDGE - if (do_bridge) { - struct mbuf *m0 = m ; - struct ether_header *eh = mtod(m, struct ether_header *); - if (m->m_pkthdr.rcvif) - m->m_pkthdr.rcvif = NULL ; - ifp = bridge_dst_lookup(eh); - bdg_forward(&m0, ifp); - if (m0) - m_freem(m0); - - return 0; - } +#if BRIDGE + /* !!!LOCKING!!! + * + * Need to consider how to handle this. + */ + broken-locking + if (do_bridge) { + struct mbuf *m0 = m; + struct ether_header *eh = mtod(m, struct ether_header *); + + if (m->m_pkthdr.rcvif) + m->m_pkthdr.rcvif = NULL; + ifp = bridge_dst_lookup(eh); + bdg_forward(&m0, ifp); + if (m0) + m_freem(m0); + + return 0; + } #endif - - -/* - * Let interface filters (if any) do their thing ... - */ - - fhead = (struct dlil_filterq_head *) &ifp->if_flt_head; - if (TAILQ_EMPTY(fhead) == 0) { - while (orig_ifp != ifp) { - orig_ifp = ifp; - TAILQ_FOREACH(tmp, fhead, que) { - if (IFILT(tmp).filter_if_output) { - retval = (*IFILT(tmp).filter_if_output)(IFILT(tmp).cookie, - &ifp, - &m); - if (retval) { - if (retval == EJUSTRETURN) - return 0; - else { - m_freem(m); - return retval; + + + /* + * Let interface filters (if any) do their thing ... 
+ */ + + /* Do not pass VLAN tagged packets to filters PR-3586856 */ + if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) { + TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { + if ((filter->filt_protocol == 0 || (filter->filt_protocol == proto_family)) && + filter->filt_output) { + retval = filter->filt_output(filter->filt_cookie, ifp, proto_family, &m); + if (retval) { + if (retval != EJUSTRETURN) + m_freem(m); + goto cleanup; + } } - } - } - - if (ifp != orig_ifp) - break; - } } - } - -/* - * Finally, call the driver. - */ - - KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0,0,0,0,0); - retval = (*ifp->if_output)(ifp, m); - KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0,0,0,0,0); - - KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END,0,0,0,0,0); + + /* + * Finally, call the driver. + */ + + KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0,0,0,0,0); + retval = ifp->if_output(ifp, m); + KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0,0,0,0,0); + + KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END,0,0,0,0,0); - if ((retval == 0) || (retval == EJUSTRETURN)) - return 0; - else +cleanup: + dlil_read_end(); + if (retval == EJUSTRETURN) + retval = 0; return retval; } - int dlil_ioctl(u_long proto_fam, struct ifnet *ifp, u_long ioctl_code, caddr_t ioctl_arg) { - struct dlil_filterq_entry *tmp; - struct dlil_filterq_head *fhead; - int retval = EOPNOTSUPP; - int retval2 = EOPNOTSUPP; - u_long dl_tag; - struct if_family_str *if_family; + struct ifnet_filter *filter; + int retval = EOPNOTSUPP; + int result = 0; + struct if_family_str *if_family; + int holding_read = 0; + + /* Attempt to increment the use count. If it's zero, bail out, the ifp is invalid */ + result = ifp_use(ifp, kIfNetUseCount_MustNotBeZero); + if (result != 0) + return EOPNOTSUPP; + + dlil_read_begin(); + holding_read = 1; + + /* Run the interface filters first. + * We want to run all filters before calling the protocol, + * interface family, or interface. + */ + TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { + if ((filter->filt_protocol == 0 || (filter->filt_protocol == proto_fam)) && + filter->filt_ioctl != NULL) { + result = filter->filt_ioctl(filter->filt_cookie, ifp, proto_fam, ioctl_code, ioctl_arg); + /* Only update retval if no one has handled the ioctl */ + if (retval == EOPNOTSUPP || result == EJUSTRETURN) { + if (result == ENOTSUP) + result = EOPNOTSUPP; + retval = result; + if (retval && retval != EOPNOTSUPP) { + goto cleanup; + } + } + } + } + + /* Allow the protocol to handle the ioctl */ + if (proto_fam) { + struct if_proto *proto = find_attached_proto(ifp, proto_fam); + + if (proto != 0) { + result = EOPNOTSUPP; + if (proto->proto_kpi == kProtoKPI_DLIL) { + if (proto->kpi.dlil.dl_ioctl) + result = proto->kpi.dlil.dl_ioctl(proto_fam, ifp, ioctl_code, ioctl_arg); + } + else { + if (proto->kpi.v1.ioctl) + result = proto->kpi.v1.ioctl(ifp, proto_fam, ioctl_code, ioctl_arg); + } + + /* Only update retval if no one has handled the ioctl */ + if (retval == EOPNOTSUPP || result == EJUSTRETURN) { + if (result == ENOTSUP) + result = EOPNOTSUPP; + retval = result; + if (retval && retval != EOPNOTSUPP) { + goto cleanup; + } + } + } + } + + /* + * Since we have incremented the use count on the ifp, we are guaranteed + * that the ifp will not go away (the function pointers may not be changed). + * We release the dlil read lock so the interface ioctl may trigger a + * protocol attach. This happens with vlan and may occur with other virtual + * interfaces. 
+ */ + dlil_read_end(); + holding_read = 0; + + /* retval is either 0 or EOPNOTSUPP */ + + /* + * Let the family handle this ioctl. + * If it returns something non-zero and not EOPNOTSUPP, we're done. + * If it returns zero, the ioctl was handled, so set retval to zero. + */ + if_family = find_family_module(ifp->if_family); + if ((if_family) && (if_family->ifmod_ioctl)) { + result = (*if_family->ifmod_ioctl)(ifp, ioctl_code, ioctl_arg); + + /* Only update retval if no one has handled the ioctl */ + if (retval == EOPNOTSUPP || result == EJUSTRETURN) { + if (result == ENOTSUP) + result = EOPNOTSUPP; + retval = result; + if (retval && retval != EOPNOTSUPP) { + goto cleanup; + } + } + } + + /* + * Let the interface handle this ioctl. + * If it returns EOPNOTSUPP, ignore that, we may have + * already handled this in the protocol or family. + */ + if (ifp->if_ioctl) + result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg); + + /* Only update retval if no one has handled the ioctl */ + if (retval == EOPNOTSUPP || result == EJUSTRETURN) { + if (result == ENOTSUP) + result = EOPNOTSUPP; + retval = result; + if (retval && retval != EOPNOTSUPP) { + goto cleanup; + } + } + +cleanup: + if (holding_read) + dlil_read_end(); + if (ifp_unuse(ifp)) + ifp_use_reached_zero(ifp); + if (retval == EJUSTRETURN) + retval = 0; + return retval; +} - if (proto_fam) { - if (dlil_find_dltag(ifp->if_family, ifp->if_unit, - proto_fam, &dl_tag) == 0) { - if (dl_tag_array[dl_tag].ifp != ifp) - return ENOENT; +__private_extern__ errno_t +dlil_set_bpf_tap( + ifnet_t ifp, + bpf_tap_mode mode, + bpf_packet_func callback) +{ + errno_t error = 0; -/* - * Run any attached protocol filters. - */ - TAILQ_FOREACH(tmp, dl_tag_array[dl_tag].pr_flt_head, que) { - if (PFILT(tmp).filter_dl_ioctl) { - retval = - (*PFILT(tmp).filter_dl_ioctl)(PFILT(tmp).cookie, - dl_tag_array[dl_tag].ifp, - ioctl_code, - ioctl_arg); - - if (retval) { - if (retval == EJUSTRETURN) - return 0; - else - return retval; - } - } - } - - if (dl_tag_array[dl_tag].proto->dl_ioctl) - retval = - (*dl_tag_array[dl_tag].proto->dl_ioctl)(dl_tag, - dl_tag_array[dl_tag].ifp, - ioctl_code, - ioctl_arg); - else - retval = EOPNOTSUPP; - } - } - - if ((retval) && (retval != EOPNOTSUPP)) { - if (retval == EJUSTRETURN) - return 0; - else - return retval; - } - - - fhead = (struct dlil_filterq_head *) &ifp->if_flt_head; - TAILQ_FOREACH(tmp, fhead, que) { - if (IFILT(tmp).filter_if_ioctl) { - retval2 = (*IFILT(tmp).filter_if_ioctl)(IFILT(tmp).cookie, ifp, - ioctl_code, ioctl_arg); - if (retval2) { - if (retval2 == EJUSTRETURN) - return 0; - else - return retval2; - } - } - } - - - if_family = find_family_module(ifp->if_family); - if ((if_family) && (if_family->ifmod_ioctl)) { - retval2 = (*if_family->ifmod_ioctl)(ifp, ioctl_code, ioctl_arg); - - if ((retval2) && (retval2 != EOPNOTSUPP)) { - if (retval2 == EJUSTRETURN) - return 0; - else - return retval; - } - - if (retval == EOPNOTSUPP) - retval = retval2; - } - - if (ifp->if_ioctl) - retval2 = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg); - - if (retval == EOPNOTSUPP) - return retval2; - else { - if (retval2 == EOPNOTSUPP) - return 0; - else - return retval2; - } + dlil_read_begin(); + if (ifp->if_set_bpf_tap) + error = ifp->if_set_bpf_tap(ifp, mode, callback); + dlil_read_end(); + + return error; } - -int -dlil_attach_protocol(struct dlil_proto_reg_str *proto, - u_long *dl_tag) +__private_extern__ errno_t +dlil_resolve_multi( + struct ifnet *ifp, + const struct sockaddr *proto_addr, + struct sockaddr *ll_addr, + size_t ll_len) { 
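+	/*
+	 * Resolution order: the attached protocol translates proto_addr
+	 * to a link-layer address first; the driver can then verify or
+	 * reject the result via if_check_multi below.
+	 */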
- struct ifnet *ifp; - struct if_proto *ifproto; - u_long i; - struct if_family_str *if_family; - struct dlil_proto_head *tmp; - struct kev_dl_proto_data ev_pr_data; - int s, retval = 0; - boolean_t funnel_state; - u_char *p; - - if ((proto->protocol_family == 0) || (proto->interface_family == 0)) - return EINVAL; - - funnel_state = thread_funnel_set(network_flock, TRUE); - s = splnet(); - if_family = find_family_module(proto->interface_family); - if ((!if_family) || (if_family->flags & DLIL_SHUTDOWN)) { - kprintf("dlil_attach_protocol -- no interface family module %d", - proto->interface_family); - retval = ENOENT; - goto end; - } + errno_t result = EOPNOTSUPP; + struct if_proto *proto; + const struct sockaddr *verify; + + dlil_read_begin(); + + bzero(ll_addr, ll_len); + + /* Call the protocol first */ + proto = find_attached_proto(ifp, proto_addr->sa_family); + if (proto != NULL && proto->proto_kpi != kProtoKPI_DLIL && + proto->kpi.v1.resolve_multi != NULL) { + result = proto->kpi.v1.resolve_multi(ifp, proto_addr, + (struct sockaddr_dl*)ll_addr, ll_len); + } + + /* Let the interface verify the multicast address */ + if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) { + if (result == 0) + verify = ll_addr; + else + verify = proto_addr; + result = ifp->if_check_multi(ifp, verify); + } + + dlil_read_end(); + + return result; +} - ifp = ifbyfamily(proto->interface_family, proto->unit_number); - if (!ifp) { - kprintf("dlil_attach_protocol -- no such interface %d unit %d\n", - proto->interface_family, proto->unit_number); - retval = ENOENT; - goto end; - } +__private_extern__ errno_t +dlil_send_arp_internal( + ifnet_t ifp, + u_short arpop, + const struct sockaddr_dl* sender_hw, + const struct sockaddr* sender_proto, + const struct sockaddr_dl* target_hw, + const struct sockaddr* target_proto) +{ + struct if_proto *proto; + errno_t result = 0; + + dlil_read_begin(); + + proto = find_attached_proto(ifp, target_proto->sa_family); + if (proto == NULL || proto->proto_kpi == kProtoKPI_DLIL || + proto->kpi.v1.send_arp == NULL) { + result = ENOTSUP; + } + else { + result = proto->kpi.v1.send_arp(ifp, arpop, sender_hw, sender_proto, + target_hw, target_proto); + } + + dlil_read_end(); + + return result; +} - if (dlil_find_dltag(proto->interface_family, proto->unit_number, - proto->protocol_family, &i) == 0) { - retval = EEXIST; - goto end; - } +__private_extern__ errno_t +dlil_send_arp( + ifnet_t ifp, + u_short arpop, + const struct sockaddr_dl* sender_hw, + const struct sockaddr* sender_proto, + const struct sockaddr_dl* target_hw, + const struct sockaddr* target_proto) +{ + errno_t result = 0; + + if (target_proto == NULL || (sender_proto && + sender_proto->sa_family != target_proto->sa_family)) + return EINVAL; + + /* + * If this is an ARP request and the target IP is IPv4LL, + * send the request on all interfaces. + */ + if (IN_LINKLOCAL(((const struct sockaddr_in*)target_proto)->sin_addr.s_addr) + && ipv4_ll_arp_aware != 0 && target_proto->sa_family == AF_INET && + arpop == ARPOP_REQUEST) { + ifnet_t *ifp_list; + u_int32_t count; + u_int32_t ifp_on; + + result = ENOTSUP; + + if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) { + for (ifp_on = 0; ifp_on < count; ifp_on++) { + errno_t new_result; + ifaddr_t source_hw = NULL; + ifaddr_t source_ip = NULL; + struct sockaddr_in source_ip_copy; + + /* + * Only arp on interfaces marked for IPv4LL ARPing. This may + * mean that we don't ARP on the interface the subnet route + * points to. 
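+ * Interfaces opt in by setting IFEF_ARPLL in if_eflags.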
+ */ + if ((ifp_list[ifp_on]->if_eflags & IFEF_ARPLL) == 0) { + continue; + } + + source_hw = TAILQ_FIRST(&ifp_list[ifp_on]->if_addrhead); + + /* Find the source IP address */ + ifnet_lock_shared(ifp_list[ifp_on]); + TAILQ_FOREACH(source_ip, &ifp_list[ifp_on]->if_addrhead, + ifa_link) { + if (source_ip->ifa_addr && + source_ip->ifa_addr->sa_family == AF_INET) { + break; + } + } + + /* No IP Source, don't arp */ + if (source_ip == NULL) { + ifnet_lock_done(ifp_list[ifp_on]); + continue; + } + + /* Copy the source IP address */ + source_ip_copy = *(struct sockaddr_in*)source_ip->ifa_addr; + + ifnet_lock_done(ifp_list[ifp_on]); + + /* Send the ARP */ + new_result = dlil_send_arp_internal(ifp_list[ifp_on], arpop, + (struct sockaddr_dl*)source_hw->ifa_addr, + (struct sockaddr*)&source_ip_copy, NULL, + target_proto); + + if (result == ENOTSUP) { + result = new_result; + } + } + } + + ifnet_list_free(ifp_list); + } + else { + result = dlil_send_arp_internal(ifp, arpop, sender_hw, sender_proto, + target_hw, target_proto); + } + + return result; +} - for (i=1; i < dl_tag_nb; i++) - if (dl_tag_array[i].ifp == 0) - break; +static int +ifp_use( + struct ifnet *ifp, + int handle_zero) +{ + int old_value; + int retval = 0; + + do { + old_value = ifp->if_usecnt; + if (old_value == 0 && handle_zero == kIfNetUseCount_MustNotBeZero) { + retval = ENXIO; // ifp is invalid + break; + } + } while (!OSCompareAndSwap((UInt32)old_value, (UInt32)old_value + 1, (UInt32*)&ifp->if_usecnt)); + + return retval; +} - if (i == dl_tag_nb) { - // expand the tag array by MAX_DL_TAGS - MALLOC(p, u_char *, sizeof(struct dl_tag_str) * (dl_tag_nb + MAX_DL_TAGS), M_NKE, M_WAITOK); - if (p == 0) { - retval = ENOBUFS; - goto end; - } - bcopy(dl_tag_array, p, sizeof(struct dl_tag_str) * dl_tag_nb); - bzero(p + sizeof(struct dl_tag_str) * dl_tag_nb, sizeof(struct dl_tag_str) * MAX_DL_TAGS); - dl_tag_nb += MAX_DL_TAGS; - FREE(dl_tag_array, M_NKE); - dl_tag_array = (struct dl_tag_str *)p; - } - - /* - * Allocate and init a new if_proto structure - */ +/* ifp_unuse is broken into two pieces. + * + * ifp_use and ifp_unuse must be called between when the caller calls + * dlil_write_begin and dlil_write_end. ifp_unuse needs to perform some + * operations after dlil_write_end has been called. For this reason, + * anyone calling ifp_unuse must call ifp_use_reached_zero if ifp_unuse + * returns a non-zero value. The caller must call ifp_use_reached_zero + * after the caller has called dlil_write_end. + */ +static void +ifp_use_reached_zero( + struct ifnet *ifp) +{ + struct if_family_str *if_family; + ifnet_detached_func free_func; + + dlil_read_begin(); + + if (ifp->if_usecnt != 0) + panic("ifp_use_reached_zero: ifp->if_usecnt != 0"); + + /* Let BPF know we're detaching */ + bpfdetach(ifp); + + ifnet_head_lock_exclusive(); + ifnet_lock_exclusive(ifp); + + /* Remove ourselves from the list */ + TAILQ_REMOVE(&ifnet_head, ifp, if_link); + ifnet_addrs[ifp->if_index - 1] = 0; + + /* ifp should be removed from the interface list */ + while (ifp->if_multiaddrs.lh_first) { + struct ifmultiaddr *ifma = ifp->if_multiaddrs.lh_first; + + /* + * When the interface is gone, we will no longer + * be listening on these multicasts. Various bits + * of the stack may be referencing these multicasts, + * release only our reference. 
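+ * Clearing ifma_ifp below leaves any remaining holders with a valid
+ * ifmultiaddr that no longer points at a live interface.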
+ */ + LIST_REMOVE(ifma, ifma_link); + ifma->ifma_ifp = NULL; + ifma_release(ifma); + } + ifnet_head_done(); + + ifp->if_eflags &= ~IFEF_DETACHING; // clear the detaching flag + ifnet_lock_done(ifp); - ifproto = _MALLOC(sizeof(struct if_proto), M_IFADDR, M_WAITOK); - if (!ifproto) { - printf("ERROR - DLIL failed if_proto allocation\n"); - retval = ENOMEM; - goto end; - } - - bzero(ifproto, sizeof(struct if_proto)); + if_family = find_family_module(ifp->if_family); + if (if_family && if_family->del_if) + if_family->del_if(ifp); +#if 0 + if (--if_family->if_usecnt == 0) { + if (if_family->shutdown) + (*if_family->shutdown)(); + + TAILQ_REMOVE(&if_family_head, if_family, if_fam_next); + FREE(if_family, M_IFADDR); + } +#endif - dl_tag_array[i].ifp = ifp; - dl_tag_array[i].proto = ifproto; - dl_tag_array[i].pr_flt_head = &ifproto->pr_flt_head; - ifproto->dl_tag = i; - *dl_tag = i; + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, 0, 0); + free_func = ifp->if_free; + dlil_read_end(); + + if (free_func) + free_func(ifp); +} - if (proto->default_proto) { - if (ifp->if_data.default_proto == 0) - ifp->if_data.default_proto = i; - else - printf("ERROR - dlil_attach_protocol -- Attempt to attach more than one default protocol\n"); - } +static int +ifp_unuse( + struct ifnet *ifp) +{ + int oldval; + oldval = OSDecrementAtomic((UInt32*)&ifp->if_usecnt); + if (oldval == 0) + panic("ifp_unuse: ifp(%s%n)->if_usecnt was zero\n", ifp->if_name, ifp->if_unit); + + if (oldval > 1) + return 0; + + if ((ifp->if_eflags & IFEF_DETACHING) == 0) + panic("ifp_unuse: use count reached zero but detching flag is not set!"); + + return 1; /* caller must call ifp_use_reached_zero */ +} - ifproto->protocol_family = proto->protocol_family; - ifproto->dl_input = proto->input; - ifproto->dl_pre_output = proto->pre_output; - ifproto->dl_event = proto->event; - ifproto->dl_offer = proto->offer; - ifproto->dl_ioctl = proto->ioctl; - ifproto->ifp = ifp; - TAILQ_INIT(&ifproto->pr_flt_head); +void +ifp_reference( + struct ifnet *ifp) +{ + int oldval; + oldval = OSIncrementAtomic(&ifp->if_refcnt); +} - /* - * Call family module add_proto routine so it can refine the - * demux descriptors as it wishes. 
- */ - retval = (*if_family->add_proto)(&proto->demux_desc_head, ifproto, *dl_tag); - if (retval) { - dl_tag_array[i].ifp = 0; - FREE(ifproto, M_IFADDR); - goto end; - } +void +ifp_release( + struct ifnet *ifp) +{ + int oldval; + oldval = OSDecrementAtomic((UInt32*)&ifp->if_refcnt); + if (oldval == 0) + panic("dlil_if_reference - refcount decremented past zero!"); +} - /* - * Add to if_proto list for this interface - */ +extern lck_mtx_t *domain_proto_mtx; - tmp = (struct dlil_proto_head *) &ifp->proto_head; - TAILQ_INSERT_TAIL(tmp, ifproto, next); - ifp->refcnt++; - if (ifproto->dl_offer) - ifp->offercnt++; +static int +dlil_attach_protocol_internal( + struct if_proto *proto, + const struct ddesc_head_str *demux, + const struct ifnet_demux_desc *demux_list, + u_int32_t demux_count) +{ + struct ddesc_head_str temp_head; + struct kev_dl_proto_data ev_pr_data; + struct ifnet *ifp = proto->ifp; + int retval = 0; + u_long hash_value = proto_hash_value(proto->protocol_family); + int if_using_kpi = (ifp->if_eflags & IFEF_USEKPI) != 0; + void* free_me = NULL; + + /* setup some of the common values */ + + { + lck_mtx_lock(domain_proto_mtx); + struct domain *dp = domains; + while (dp && (protocol_family_t)dp->dom_family != proto->protocol_family) + dp = dp->dom_next; + proto->dl_domain = dp; + lck_mtx_unlock(domain_proto_mtx); + } + + /* + * Convert the demux descriptors to a type the interface + * will understand. Checking e_flags should be safe, this + * flag won't change. + */ + if (if_using_kpi && demux) { + /* Convert the demux linked list to a demux_list */ + struct dlil_demux_desc *demux_entry; + struct ifnet_demux_desc *temp_list = NULL; + u_int32_t i = 0; + + TAILQ_FOREACH(demux_entry, demux, next) { + i++; + } + + temp_list = _MALLOC(sizeof(struct ifnet_demux_desc) * i, M_TEMP, M_WAITOK); + free_me = temp_list; + + if (temp_list == NULL) + return ENOMEM; + + i = 0; + TAILQ_FOREACH(demux_entry, demux, next) { + /* dlil_demux_desc types 1, 2, and 3 are obsolete and can not be translated */ + if (demux_entry->type == 1 || + demux_entry->type == 2 || + demux_entry->type == 3) { + FREE(free_me, M_TEMP); + return ENOTSUP; + } + + temp_list[i].type = demux_entry->type; + temp_list[i].data = demux_entry->native_type; + temp_list[i].datalen = demux_entry->variants.native_type_length; + i++; + } + demux_count = i; + demux_list = temp_list; + } + else if (!if_using_kpi && demux_list != NULL) { + struct dlil_demux_desc *demux_entry; + u_int32_t i = 0; + + demux_entry = _MALLOC(sizeof(struct dlil_demux_desc) * demux_count, M_TEMP, M_WAITOK); + free_me = demux_entry; + if (demux_entry == NULL) + return ENOMEM; + + TAILQ_INIT(&temp_head); + + for (i = 0; i < demux_count; i++) { + demux_entry[i].type = demux_list[i].type; + demux_entry[i].native_type = demux_list[i].data; + demux_entry[i].variants.native_type_length = demux_list[i].datalen; + TAILQ_INSERT_TAIL(&temp_head, &demux_entry[i], next); + } + demux = &temp_head; + } + + /* + * Take the write lock to protect readers and exclude other writers. 
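+ *
+ * Note: dlil_write_begin() fails only for a thread that already holds
+ * the read lock (EDEADLK); the return value is not checked here on the
+ * assumption that attach is never called with the read lock held.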
+ */ + dlil_write_begin(); + + /* Check that the interface isn't currently detaching */ + ifnet_lock_shared(ifp); + if ((ifp->if_eflags & IFEF_DETACHING) != 0) { + ifnet_lock_done(ifp); + dlil_write_end(); + if (free_me) + FREE(free_me, M_TEMP); + return ENXIO; + } + ifnet_lock_done(ifp); + + if (find_attached_proto(ifp, proto->protocol_family) != NULL) { + dlil_write_end(); + if (free_me) + FREE(free_me, M_TEMP); + return EEXIST; + } + + /* + * Call family module add_proto routine so it can refine the + * demux descriptors as it wishes. + */ + if (if_using_kpi) + retval = ifp->if_add_proto_u.kpi(ifp, proto->protocol_family, demux_list, demux_count); + else { + retval = ifp->if_add_proto_u.original(ifp, proto->protocol_family, + _cast_non_const(demux)); + } + if (retval) { + dlil_write_end(); + if (free_me) + FREE(free_me, M_TEMP); + return retval; + } + + /* + * We can't fail from this point on. + * Increment the number of uses (protocol attachments + interface attached). + */ + ifp_use(ifp, kIfNetUseCount_MustNotBeZero); + + /* + * Insert the protocol in the hash + */ + { + struct if_proto* prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]); + while (prev_proto && SLIST_NEXT(prev_proto, next_hash) != NULL) + prev_proto = SLIST_NEXT(prev_proto, next_hash); + if (prev_proto) + SLIST_INSERT_AFTER(prev_proto, proto, next_hash); + else + SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value], proto, next_hash); + } - /* the reserved field carries the number of protocol still attached (subject to change) */ - ev_pr_data.proto_family = proto->protocol_family; - ev_pr_data.proto_remaining_count = dlil_ifp_proto_count(ifp); - dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED, - (struct net_event_data *)&ev_pr_data, - sizeof(struct kev_dl_proto_data)); + /* + * Add to if_proto list for this interface + */ + if_proto_ref(proto); + if (proto->proto_kpi == kProtoKPI_DLIL && proto->kpi.dlil.dl_offer) + ifp->offercnt++; + dlil_write_end(); + + /* the reserved field carries the number of protocol still attached (subject to change) */ + ev_pr_data.proto_family = proto->protocol_family; + ev_pr_data.proto_remaining_count = dlil_ifp_proto_count(ifp); + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED, + (struct net_event_data *)&ev_pr_data, + sizeof(struct kev_dl_proto_data)); + + DLIL_PRINTF("Attached protocol %d to %s%d - %d\n", proto->protocol_family, + ifp->if_name, ifp->if_unit, retval); + if (free_me) + FREE(free_me, M_TEMP); + return retval; +} +__private_extern__ int +dlil_attach_protocol_kpi(ifnet_t ifp, protocol_family_t protocol, + const struct ifnet_attach_proto_param *proto_details) +{ + int retval = 0; + struct if_proto *ifproto = NULL; + + ifproto = _MALLOC(sizeof(struct if_proto), M_IFADDR, M_WAITOK); + if (ifproto == 0) { + DLIL_PRINTF("ERROR - DLIL failed if_proto allocation\n"); + retval = ENOMEM; + goto end; + } + bzero(ifproto, sizeof(*ifproto)); + + ifproto->ifp = ifp; + ifproto->protocol_family = protocol; + ifproto->proto_kpi = kProtoKPI_v1; + ifproto->kpi.v1.input = proto_details->input; + ifproto->kpi.v1.pre_output = proto_details->pre_output; + ifproto->kpi.v1.event = proto_details->event; + ifproto->kpi.v1.ioctl = proto_details->ioctl; + ifproto->kpi.v1.detached = proto_details->detached; + ifproto->kpi.v1.resolve_multi = proto_details->resolve; + ifproto->kpi.v1.send_arp = proto_details->send_arp; + + retval = dlil_attach_protocol_internal(ifproto, NULL, + proto_details->demux_list, proto_details->demux_count); + end: - splx(s); - 
thread_funnel_set(network_flock, funnel_state); - return retval; + if (retval && ifproto) + FREE(ifproto, M_IFADDR); + return retval; } +int +dlil_attach_protocol(struct dlil_proto_reg_str *proto) +{ + struct ifnet *ifp = NULL; + struct if_proto *ifproto = NULL; + int retval = 0; + /* + * Do everything we can before taking the write lock + */ + + if ((proto->protocol_family == 0) || (proto->interface_family == 0)) + return EINVAL; -int -dlil_detach_protocol(u_long dl_tag) -{ - struct ifnet *ifp; - struct ifnet *orig_ifp=0; - struct if_proto *proto; - struct dlil_proto_head *tmp; - struct if_family_str *if_family; - struct dlil_filterq_entry *filter; - int s, retval = 0; - struct dlil_filterq_head *fhead; - struct kev_dl_proto_data ev_pr_data; - boolean_t funnel_state; - - funnel_state = thread_funnel_set(network_flock, TRUE); - s = splnet(); - - if (dl_tag >= dl_tag_nb || dl_tag_array[dl_tag].ifp == 0) { - retval = ENOENT; - goto end; - } + /* + * Allocate and init a new if_proto structure + */ + ifproto = _MALLOC(sizeof(struct if_proto), M_IFADDR, M_WAITOK); + if (!ifproto) { + DLIL_PRINTF("ERROR - DLIL failed if_proto allocation\n"); + retval = ENOMEM; + goto end; + } + - ifp = dl_tag_array[dl_tag].ifp; - proto = dl_tag_array[dl_tag].proto; + /* ifbyfamily returns us an ifp with an incremented if_usecnt */ + ifp = ifbyfamily(proto->interface_family, proto->unit_number); + if (!ifp) { + DLIL_PRINTF("dlil_attach_protocol -- no such interface %d unit %d\n", + proto->interface_family, proto->unit_number); + retval = ENXIO; + goto end; + } - if_family = find_family_module(ifp->if_family); - if (if_family == NULL) { - retval = ENOENT; - goto end; - } + bzero(ifproto, sizeof(struct if_proto)); + + ifproto->ifp = ifp; + ifproto->protocol_family = proto->protocol_family; + ifproto->proto_kpi = kProtoKPI_DLIL; + ifproto->kpi.dlil.dl_input = proto->input; + ifproto->kpi.dlil.dl_pre_output = proto->pre_output; + ifproto->kpi.dlil.dl_event = proto->event; + ifproto->kpi.dlil.dl_offer = proto->offer; + ifproto->kpi.dlil.dl_ioctl = proto->ioctl; + ifproto->kpi.dlil.dl_detached = proto->detached; + + retval = dlil_attach_protocol_internal(ifproto, &proto->demux_desc_head, NULL, 0); + +end: + if (retval && ifproto) + FREE(ifproto, M_IFADDR); + return retval; +} - tmp = (struct dlil_proto_head *) &ifp->proto_head; +extern void if_rtproto_del(struct ifnet *ifp, int protocol); - /* - * Call family module del_proto - */ +static int +dlil_detach_protocol_internal( + struct if_proto *proto) +{ + struct ifnet *ifp = proto->ifp; + u_long proto_family = proto->protocol_family; + struct kev_dl_proto_data ev_pr_data; + + if (proto->proto_kpi == kProtoKPI_DLIL) { + if (proto->kpi.dlil.dl_detached) + proto->kpi.dlil.dl_detached(proto->protocol_family, ifp); + } + else { + if (proto->kpi.v1.detached) + proto->kpi.v1.detached(ifp, proto->protocol_family); + } + if_proto_free(proto); + + /* + * Cleanup routes that may still be in the routing table for that interface/protocol pair. 
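+ * This runs after the protocol's detached callback and if_proto_free()
+ * above; if_rtproto_del() then drops any routes still bound to this
+ * interface/protocol pair so nothing is left referencing the detached
+ * protocol.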
+ */ + + if_rtproto_del(ifp, proto_family); + + /* the reserved field carries the number of protocol still attached (subject to change) */ + ev_pr_data.proto_family = proto_family; + ev_pr_data.proto_remaining_count = dlil_ifp_proto_count(ifp); + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED, + (struct net_event_data *)&ev_pr_data, + sizeof(struct kev_dl_proto_data)); + return 0; +} - (*if_family->del_proto)(proto, dl_tag); +int +dlil_detach_protocol(struct ifnet *ifp, u_long proto_family) +{ + struct if_proto *proto = NULL; + int retval = 0; + int use_reached_zero = 0; + +#if DLIL_ALWAYS_DELAY_DETACH + { + retval = EDEADLK; +#else + if ((retval = dlil_write_begin()) != 0) { +#endif + if (retval == EDEADLK) { + retval = 0; + dlil_read_begin(); + proto = find_attached_proto(ifp, proto_family); + if (proto == 0) { + retval = ENXIO; + } + else { + proto->detaching = 1; + dlil_detach_waiting = 1; + wakeup(&dlil_detach_waiting); + } + dlil_read_end(); + } + goto end; + } + + proto = find_attached_proto(ifp, proto_family); + + if (proto == NULL) { + retval = ENXIO; + dlil_write_end(); + goto end; + } + + /* + * Call family module del_proto + */ + + if (ifp->if_del_proto) + ifp->if_del_proto(ifp, proto->protocol_family); - /* - * Remove and deallocate any attached protocol filters - */ + if (proto->proto_kpi == kProtoKPI_DLIL && proto->kpi.dlil.dl_offer) + ifp->offercnt--; - while (filter = TAILQ_FIRST(&proto->pr_flt_head)) - dlil_detach_filter(filter->filter_id); - - if (proto->dl_offer) - ifp->offercnt--; + SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)], proto, if_proto, next_hash); + + /* + * We can do the rest of the work outside of the write lock. + */ + use_reached_zero = ifp_unuse(ifp); + dlil_write_end(); + + dlil_detach_protocol_internal(proto); + + /* + * Only handle the case where the interface will go away after + * we've sent the message. This way post message can send the + * message to the interface safely. + */ + + if (use_reached_zero) + ifp_use_reached_zero(ifp); + +end: + return retval; +} - if (ifp->if_data.default_proto == dl_tag) - ifp->if_data.default_proto = 0; - dl_tag_array[dl_tag].ifp = 0; +/* + * dlil_delayed_detach_thread is responsible for detaching + * protocols, protocol filters, and interface filters after + * an attempt was made to detach one of those items while + * it was not safe to do so (i.e. called dlil_read_begin). + * + * This function will take the dlil write lock and walk + * through each of the interfaces looking for items with + * the detaching flag set. When an item is found, it is + * detached from the interface and placed on a local list. + * After all of the items have been collected, we drop the + * write lock and performed the post detach. This is done + * so we only have to take the write lock once. + * + * When detaching a protocol filter, if we find that we + * have detached the very last protocol and we need to call + * ifp_use_reached_zero, we have to break out of our work + * to drop the write lock so we can call ifp_use_reached_zero. 
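+ *
+ * In outline (a sketch of the loop below, not additional behavior):
+ *
+ *	while (1) {
+ *		if (dlil_detach_waiting && dlil_write_begin() == 0) {
+ *			move detaching protos/filters to local lists;
+ *			dlil_write_end();
+ *			finish the detaches outside the lock;
+ *		}
+ *		assert_wait(&dlil_detach_waiting, THREAD_UNINT);
+ *		thread_block(dlil_delayed_detach_thread);
+ *	}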
+ */ + +static void +dlil_delayed_detach_thread(__unused void* foo, __unused wait_result_t wait) +{ + thread_t self = current_thread(); + int asserted = 0; - /* the reserved field carries the number of protocol still attached (subject to change) */ - ev_pr_data.proto_family = proto->protocol_family; + ml_thread_policy(self, MACHINE_GROUP, + (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_NETISR)); - /* - * Cleanup routes that may still be in the routing table for that interface/protocol pair. - */ + + while (1) { + if (dlil_detach_waiting != 0 && dlil_write_begin() == 0) { + struct ifnet *ifp; + struct proto_hash_entry detached_protos; + struct ifnet_filter_head detached_filters; + struct if_proto *proto; + struct if_proto *next_proto; + struct ifnet_filter *filt; + struct ifnet_filter *next_filt; + int reached_zero; + + reached_zero = 0; + + /* Clear the detach waiting flag */ + dlil_detach_waiting = 0; + TAILQ_INIT(&detached_filters); + SLIST_INIT(&detached_protos); + + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + int i; + + // Look for protocols and protocol filters + for (i = 0; i < PROTO_HASH_SLOTS && !reached_zero; i++) { + struct if_proto **prev_nextptr = &SLIST_FIRST(&ifp->if_proto_hash[i]); + for (proto = *prev_nextptr; proto; proto = *prev_nextptr) { + + // Detach this protocol + if (proto->detaching) { + if (ifp->if_del_proto) + ifp->if_del_proto(ifp, proto->protocol_family); + if (proto->proto_kpi == kProtoKPI_DLIL && proto->kpi.dlil.dl_offer) + ifp->offercnt--; + *prev_nextptr = SLIST_NEXT(proto, next_hash); + SLIST_INSERT_HEAD(&detached_protos, proto, next_hash); + reached_zero = ifp_unuse(ifp); + if (reached_zero) { + break; + } + } + else { + // Update prev_nextptr to point to our next ptr + prev_nextptr = &SLIST_NEXT(proto, next_hash); + } + } + } + + // look for interface filters that need to be detached + for (filt = TAILQ_FIRST(&ifp->if_flt_head); filt; filt = next_filt) { + next_filt = TAILQ_NEXT(filt, filt_next); + if (filt->filt_detaching != 0) { + // take this interface filter off the interface filter list + TAILQ_REMOVE(&ifp->if_flt_head, filt, filt_next); + + // put this interface filter on the detached filters list + TAILQ_INSERT_TAIL(&detached_filters, filt, filt_next); + } + } + + if (ifp->if_delayed_detach) { + ifp->if_delayed_detach = 0; + reached_zero = ifp_unuse(ifp); + } + + if (reached_zero) + break; + } + ifnet_head_done(); + dlil_write_end(); + + for (filt = TAILQ_FIRST(&detached_filters); filt; filt = next_filt) { + next_filt = TAILQ_NEXT(filt, filt_next); + /* + * dlil_detach_filter_internal won't remove an item from + * the list if it is already detached (second parameter). + * The item will be freed though. 
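+ *
+ * Both cleanup loops below run after dlil_write_end() by design: the
+ * detach callbacks may block, which would not be safe while still
+ * holding the DLIL write lock.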
+ */ + dlil_detach_filter_internal(filt, 1); + } + + for (proto = SLIST_FIRST(&detached_protos); proto; proto = next_proto) { + next_proto = SLIST_NEXT(proto, next_hash); + dlil_detach_protocol_internal(proto); + } + + if (reached_zero) { + ifp_use_reached_zero(ifp); + dlil_detach_waiting = 1; // we may have missed something + } + } + + if (!asserted && dlil_detach_waiting == 0) { + asserted = 1; + assert_wait(&dlil_detach_waiting, THREAD_UNINT); + } + + if (dlil_detach_waiting == 0) { + asserted = 0; + thread_block(dlil_delayed_detach_thread); + } + } +} - if_rtproto_del(ifp, proto->protocol_family); +static void +dlil_call_delayed_detach_thread(void) { + dlil_delayed_detach_thread(NULL, THREAD_RESTART); +} - TAILQ_REMOVE(tmp, proto, next); - FREE(proto, M_IFADDR); +extern int if_next_index(void); + +__private_extern__ int +dlil_if_attach_with_address( + struct ifnet *ifp, + const struct sockaddr_dl *ll_addr) +{ + u_long interface_family = ifp->if_family; + struct if_family_str *if_family = NULL; + int stat; + struct ifnet *tmp_if; + struct proto_hash_entry *new_proto_list = NULL; + int locked = 0; + + + ifnet_head_lock_shared(); - ifp->refcnt--; - ev_pr_data.proto_remaining_count = dlil_ifp_proto_count(ifp); - dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED, - (struct net_event_data *)&ev_pr_data, - sizeof(struct kev_dl_proto_data)); + /* Verify we aren't already on the list */ + TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) { + if (tmp_if == ifp) { + ifnet_head_done(); + return EEXIST; + } + } + + ifnet_head_done(); + + if ((ifp->if_eflags & IFEF_REUSE) == 0 || ifp->if_lock == 0) +#if IFNET_RW_LOCK + ifp->if_lock = lck_rw_alloc_init(ifnet_lock_group, ifnet_lock_attr); +#else + ifp->if_lock = lck_mtx_alloc_init(ifnet_lock_group, ifnet_lock_attr); +#endif - if (ifp->refcnt == 0) { + if (ifp->if_lock == 0) { + return ENOMEM; + } - TAILQ_REMOVE(&ifnet, ifp, if_link); + // Only use family if this is not a KPI interface + if ((ifp->if_eflags & IFEF_USEKPI) == 0) { + if_family = find_family_module(interface_family); + } - (*if_family->del_if)(ifp); + /* + * Allow interfaces withouth protocol families to attach + * only if they have the necessary fields filled out. + */ + + if ((if_family == 0) && + (ifp->if_add_proto == 0 || ifp->if_del_proto == 0)) { + DLIL_PRINTF("Attempt to attach interface without family module - %d\n", + interface_family); + return ENODEV; + } + + if ((ifp->if_eflags & IFEF_REUSE) == 0 || ifp->if_proto_hash == NULL) { + MALLOC(new_proto_list, struct proto_hash_entry*, sizeof(struct proto_hash_entry) * PROTO_HASH_SLOTS, + M_NKE, M_WAITOK); - if (--if_family->refcnt == 0) { - if (if_family->shutdown) - (*if_family->shutdown)(); - - TAILQ_REMOVE(&if_family_head, if_family, if_fam_next); - FREE(if_family, M_IFADDR); + if (new_proto_list == 0) { + return ENOBUFS; + } } - fhead = (struct dlil_filterq_head *) &ifp->if_flt_head; - while (orig_ifp != ifp) { - orig_ifp = ifp; + dlil_write_begin(); + locked = 1; + + /* + * Call the family module to fill in the appropriate fields in the + * ifnet structure. 
+ */ + + if (if_family) { + stat = if_family->add_if(ifp); + if (stat) { + DLIL_PRINTF("dlil_if_attach -- add_if failed with %d\n", stat); + dlil_write_end(); + return stat; + } + ifp->if_add_proto_u.original = if_family->add_proto; + ifp->if_del_proto = if_family->del_proto; + if_family->refcnt++; + } + + ifp->offercnt = 0; + TAILQ_INIT(&ifp->if_flt_head); + + + if (new_proto_list) { + bzero(new_proto_list, (PROTO_HASH_SLOTS * sizeof(struct proto_hash_entry))); + ifp->if_proto_hash = new_proto_list; + new_proto_list = 0; + } + + /* old_if_attach */ + { + struct ifaddr *ifa = 0; + + if (ifp->if_snd.ifq_maxlen == 0) + ifp->if_snd.ifq_maxlen = ifqmaxlen; + TAILQ_INIT(&ifp->if_prefixhead); + LIST_INIT(&ifp->if_multiaddrs); + ifnet_touch_lastchange(ifp); + + /* usecount to track attachment to the ifnet list */ + ifp_use(ifp, kIfNetUseCount_MayBeZero); + + /* Lock the list of interfaces */ + ifnet_head_lock_exclusive(); + ifnet_lock_exclusive(ifp); + + if ((ifp->if_eflags & IFEF_REUSE) == 0 || ifp->if_index == 0) { + char workbuf[64]; + int namelen, masklen, socksize, ifasize; + + ifp->if_index = if_next_index(); + + namelen = snprintf(workbuf, sizeof(workbuf), "%s%d", ifp->if_name, ifp->if_unit); +#define _offsetof(t, m) ((int)((caddr_t)&((t *)0)->m)) + masklen = _offsetof(struct sockaddr_dl, sdl_data[0]) + namelen; + socksize = masklen + ifp->if_addrlen; +#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof(long) - 1))) + if ((u_long)socksize < sizeof(struct sockaddr_dl)) + socksize = sizeof(struct sockaddr_dl); + socksize = ROUNDUP(socksize); + ifasize = sizeof(struct ifaddr) + 2 * socksize; + ifa = (struct ifaddr*)_MALLOC(ifasize, M_IFADDR, M_WAITOK); + if (ifa) { + struct sockaddr_dl *sdl = (struct sockaddr_dl *)(ifa + 1); + ifnet_addrs[ifp->if_index - 1] = ifa; + bzero(ifa, ifasize); + sdl->sdl_len = socksize; + sdl->sdl_family = AF_LINK; + bcopy(workbuf, sdl->sdl_data, namelen); + sdl->sdl_nlen = namelen; + sdl->sdl_index = ifp->if_index; + sdl->sdl_type = ifp->if_type; + if (ll_addr) { + sdl->sdl_alen = ll_addr->sdl_alen; + if (ll_addr->sdl_alen != ifp->if_addrlen) + panic("dlil_if_attach - ll_addr->sdl_alen != ifp->if_addrlen"); + bcopy(CONST_LLADDR(ll_addr), LLADDR(sdl), sdl->sdl_alen); + } + ifa->ifa_ifp = ifp; + ifa->ifa_rtrequest = link_rtrequest; + ifa->ifa_addr = (struct sockaddr*)sdl; + sdl = (struct sockaddr_dl*)(socksize + (caddr_t)sdl); + ifa->ifa_netmask = (struct sockaddr*)sdl; + sdl->sdl_len = masklen; + while (namelen != 0) + sdl->sdl_data[--namelen] = 0xff; + } + } + else { + /* preserve the first ifaddr */ + ifnet_addrs[ifp->if_index - 1] = TAILQ_FIRST(&ifp->if_addrhead); + } + - TAILQ_FOREACH(filter, fhead, que) { - if (IFILT(filter).filter_if_free) { - retval = (*IFILT(filter).filter_if_free)(IFILT(filter).cookie, ifp); - if (retval) { - splx(s); - thread_funnel_set(network_flock, funnel_state); - return 0; - } + TAILQ_INIT(&ifp->if_addrhead); + ifa = ifnet_addrs[ifp->if_index - 1]; + + if (ifa) { + /* + * We don't use if_attach_ifa because we want + * this address to be first on the list. 
+ */ + ifaref(ifa); + ifa->ifa_debug |= IFA_ATTACHED; + TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); } - if (ifp != orig_ifp) - break; - } + + TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link); + ifindex2ifnet[ifp->if_index] = ifp; + + ifnet_head_done(); } - - dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, 0, 0); - - (*ifp->if_free)(ifp); - } + dlil_write_end(); + + if (if_family && if_family->init_if) { + stat = if_family->init_if(ifp); + if (stat) { + DLIL_PRINTF("dlil_if_attach -- init_if failed with %d\n", stat); + } + } + + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, 0, 0); + ifnet_lock_done(ifp); -end: - splx(s); - thread_funnel_set(network_flock, funnel_state); - return retval; + return 0; } - - - - int dlil_if_attach(struct ifnet *ifp) { - u_long interface_family = ifp->if_family; - struct if_family_str *if_family; - struct dlil_proto_head *tmp; - int stat; - int s; - boolean_t funnel_state; - - funnel_state = thread_funnel_set(network_flock, TRUE); - s = splnet(); - if (ifnet_inited == 0) { - TAILQ_INIT(&ifnet); - ifnet_inited = 1; - } - - if_family = find_family_module(interface_family); - - if ((!if_family) || (if_family->flags & DLIL_SHUTDOWN)) { - splx(s); - kprintf("Attempt to attach interface without family module - %d\n", - interface_family); - thread_funnel_set(network_flock, funnel_state); - return ENODEV; - } - - if (ifp->refcnt == 0) { - /* - * Call the family module to fill in the appropriate fields in the - * ifnet structure. - */ - - stat = (*if_family->add_if)(ifp); - if (stat) { - splx(s); - kprintf("dlil_if_attach -- add_if failed with %d\n", stat); - thread_funnel_set(network_flock, funnel_state); - return stat; - } - if_family->refcnt++; - - /* - * Add the ifp to the interface list. - */ - - tmp = (struct dlil_proto_head *) &ifp->proto_head; - TAILQ_INIT(tmp); - - ifp->if_data.default_proto = 0; - ifp->offercnt = 0; - TAILQ_INIT(&ifp->if_flt_head); - old_if_attach(ifp); - - if (if_family->init_if) { - stat = (*if_family->init_if)(ifp); - if (stat) { - kprintf("dlil_if_attach -- init_if failed with %d\n", stat); - } - } - } - - ifp->refcnt++; - - dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, 0, 0); - - splx(s); - thread_funnel_set(network_flock, funnel_state); - return 0; + dlil_if_attach_with_address(ifp, NULL); } int dlil_if_detach(struct ifnet *ifp) { - struct if_proto *proto; - struct dlil_filterq_entry *if_filter; - struct if_family_str *if_family; - struct dlil_filterq_head *fhead = (struct dlil_filterq_head *) &ifp->if_flt_head; - struct kev_msg ev_msg; - boolean_t funnel_state; + struct ifnet_filter *filter; + struct ifnet_filter *filter_next; + int zeroed = 0; + int retval = 0; + struct ifnet_filter_head fhead; - funnel_state = thread_funnel_set(network_flock, TRUE); - if_family = find_family_module(ifp->if_family); + ifnet_lock_exclusive(ifp); - if (!if_family) { - kprintf("Attempt to detach interface without family module - %s\n", - ifp->if_name); - thread_funnel_set(network_flock, funnel_state); - return ENODEV; + if ((ifp->if_eflags & IFEF_DETACHING) != 0) { + /* Interface has already been detached */ + ifnet_lock_done(ifp); + return ENXIO; } - while (if_filter = TAILQ_FIRST(fhead)) - dlil_detach_filter(if_filter->filter_id); + /* + * Indicate this interface is being detached. + * + * This should prevent protocols from attaching + * from this point on. Interface will remain on + * the list until all of the protocols are detached. 
+ */ + ifp->if_eflags |= IFEF_DETACHING; + ifnet_lock_done(ifp); - ifp->refcnt--; + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, 0, 0); - if (ifp->refcnt > 0) { - dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, 0, 0); - thread_funnel_set(network_flock, funnel_state); - return DLIL_WAIT_FOR_FREE; + if ((retval = dlil_write_begin()) != 0) { + if (retval == EDEADLK) { + retval = DLIL_WAIT_FOR_FREE; + + /* We need to perform a delayed detach */ + ifp->if_delayed_detach = 1; + dlil_detach_waiting = 1; + wakeup(&dlil_detach_waiting); + } + return retval; } - while (ifp->if_multiaddrs.lh_first) { - struct ifmultiaddr *ifma = ifp->if_multiaddrs.lh_first; - - /* - * When the interface is gone, we will no - * longer be listening on these multicasts. - * Various bits of the stack may be referencing - * these multicasts, so we can't just free them. - * We place them on a list so they may be cleaned - * up later as the other bits of the stack release - * them. - */ - LIST_REMOVE(ifma, ifma_link); - ifma->ifma_ifp = NULL; - LIST_INSERT_HEAD(&ifma_lostlist, ifma, ifma_link); - } + /* Steal the list of interface filters */ + fhead = ifp->if_flt_head; + TAILQ_INIT(&ifp->if_flt_head); - /* Let BPF know the interface is detaching. */ - bpfdetach(ifp); - TAILQ_REMOVE(&ifnet, ifp, if_link); + /* unuse the interface */ + zeroed = ifp_unuse(ifp); - (*if_family->del_if)(ifp); + dlil_write_end(); - if (--if_family->refcnt == 0) { - if (if_family->shutdown) - (*if_family->shutdown)(); - - TAILQ_REMOVE(&if_family_head, if_family, if_fam_next); - FREE(if_family, M_IFADDR); + for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) { + filter_next = TAILQ_NEXT(filter, filt_next); + dlil_detach_filter_internal(filter, 1); } - dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, 0, 0); - thread_funnel_set(network_flock, funnel_state); - return 0; + if (zeroed == 0) { + retval = DLIL_WAIT_FOR_FREE; + } + else + { + ifp_use_reached_zero(ifp); + } + + return retval; } @@ -1517,25 +2436,17 @@ dlil_reg_if_modules(u_long interface_family, struct dlil_ifmod_reg_str *ifmod) { struct if_family_str *if_family; - int s; - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); - s = splnet(); if (find_family_module(interface_family)) { - kprintf("Attempt to register dlil family module more than once - %d\n", + DLIL_PRINTF("Attempt to register dlil family module more than once - %d\n", interface_family); - splx(s); - thread_funnel_set(network_flock, funnel_state); return EEXIST; } if ((!ifmod->add_if) || (!ifmod->del_if) || (!ifmod->add_proto) || (!ifmod->del_proto)) { - kprintf("dlil_reg_if_modules passed at least one null pointer\n"); - splx(s); - thread_funnel_set(network_flock, funnel_state); + DLIL_PRINTF("dlil_reg_if_modules passed at least one null pointer\n"); return EINVAL; } @@ -1551,17 +2462,13 @@ dlil_reg_if_modules(u_long interface_family, if (interface_family == 123) { /* Vicom */ ifmod->init_if = 0; } else { - splx(s); - thread_funnel_set(network_flock, funnel_state); return EINVAL; } } if_family = (struct if_family_str *) _MALLOC(sizeof(struct if_family_str), M_IFADDR, M_WAITOK); if (!if_family) { - kprintf("dlil_reg_if_modules failed allocation\n"); - splx(s); - thread_funnel_set(network_flock, funnel_state); + DLIL_PRINTF("dlil_reg_if_modules failed allocation\n"); return ENOMEM; } @@ -1574,29 +2481,23 @@ dlil_reg_if_modules(u_long interface_family, if_family->init_if = ifmod->init_if; if_family->add_proto = ifmod->add_proto; if_family->del_proto = 
ifmod->del_proto; - if_family->ifmod_ioctl = ifmod->ifmod_ioctl; + if_family->ifmod_ioctl = ifmod->ifmod_ioctl; if_family->refcnt = 1; if_family->flags = 0; TAILQ_INSERT_TAIL(&if_family_head, if_family, if_fam_next); - splx(s); - thread_funnel_set(network_flock, funnel_state); return 0; } int dlil_dereg_if_modules(u_long interface_family) { struct if_family_str *if_family; - int s, ret = 0; - boolean_t funnel_state; + int ret = 0; + - funnel_state = thread_funnel_set(network_flock, TRUE); - s = splnet(); if_family = find_family_module(interface_family); if (if_family == 0) { - splx(s); - thread_funnel_set(network_flock, funnel_state); - return ENOENT; + return ENXIO; } if (--if_family->refcnt == 0) { @@ -1611,104 +2512,84 @@ int dlil_dereg_if_modules(u_long interface_family) ret = DLIL_WAIT_FOR_FREE; } - splx(s); - thread_funnel_set(network_flock, funnel_state); return ret; } int -dlil_reg_proto_module(u_long protocol_family, u_long interface_family, - struct dlil_protomod_reg_str *protomod_reg) +dlil_reg_proto_module( + u_long protocol_family, + u_long interface_family, + int (*attach)(struct ifnet *ifp, u_long protocol_family), + int (*detach)(struct ifnet *ifp, u_long protocol_family)) { struct proto_family_str *proto_family; - int s; - boolean_t funnel_state; + if (attach == NULL) return EINVAL; - funnel_state = thread_funnel_set(network_flock, TRUE); - s = splnet(); - if (find_proto_module(protocol_family, interface_family)) { - splx(s); - thread_funnel_set(network_flock, funnel_state); - return EEXIST; - } - - if (protomod_reg->reserved[0] != 0 || protomod_reg->reserved[1] != 0 - || protomod_reg->reserved[2] != 0 || protomod_reg->reserved[3] !=0) { - splx(s); - thread_funnel_set(network_flock, funnel_state); - return EINVAL; - } - - if (protomod_reg->attach_proto == NULL) { - splx(s); - thread_funnel_set(network_flock, funnel_state); - return EINVAL; + lck_mtx_lock(proto_family_mutex); + + TAILQ_FOREACH(proto_family, &proto_family_head, proto_fam_next) { + if (proto_family->proto_family == protocol_family && + proto_family->if_family == interface_family) { + lck_mtx_unlock(proto_family_mutex); + return EEXIST; + } } proto_family = (struct proto_family_str *) _MALLOC(sizeof(struct proto_family_str), M_IFADDR, M_WAITOK); if (!proto_family) { - splx(s); - thread_funnel_set(network_flock, funnel_state); + lck_mtx_unlock(proto_family_mutex); return ENOMEM; } bzero(proto_family, sizeof(struct proto_family_str)); proto_family->proto_family = protocol_family; proto_family->if_family = interface_family & 0xffff; - proto_family->attach_proto = protomod_reg->attach_proto; - proto_family->detach_proto = protomod_reg->detach_proto; + proto_family->attach_proto = attach; + proto_family->detach_proto = detach; TAILQ_INSERT_TAIL(&proto_family_head, proto_family, proto_fam_next); - splx(s); - thread_funnel_set(network_flock, funnel_state); + lck_mtx_unlock(proto_family_mutex); return 0; } int dlil_dereg_proto_module(u_long protocol_family, u_long interface_family) { struct proto_family_str *proto_family; - int s, ret = 0; - boolean_t funnel_state; + int ret = 0; + + lck_mtx_lock(proto_family_mutex); - funnel_state = thread_funnel_set(network_flock, TRUE); - s = splnet(); proto_family = find_proto_module(protocol_family, interface_family); if (proto_family == 0) { - splx(s); - thread_funnel_set(network_flock, funnel_state); - return ENOENT; + lck_mtx_unlock(proto_family_mutex); + return ENXIO; } TAILQ_REMOVE(&proto_family_head, proto_family, proto_fam_next); FREE(proto_family, M_IFADDR); - - splx(s); 
- thread_funnel_set(network_flock, funnel_state); + + lck_mtx_unlock(proto_family_mutex); return ret; } -int dlil_plumb_protocol(u_long protocol_family, struct ifnet *ifp, u_long *dl_tag) +int dlil_plumb_protocol(u_long protocol_family, struct ifnet *ifp) { struct proto_family_str *proto_family; - int s, ret = 0; - boolean_t funnel_state; + int ret = 0; - funnel_state = thread_funnel_set(network_flock, TRUE); - s = splnet(); + lck_mtx_lock(proto_family_mutex); proto_family = find_proto_module(protocol_family, ifp->if_family); if (proto_family == 0) { - splx(s); - thread_funnel_set(network_flock, funnel_state); - return ENOENT; + lck_mtx_unlock(proto_family_mutex); + return ENXIO; } - ret = (*proto_family->attach_proto)(ifp, dl_tag); + ret = proto_family->attach_proto(ifp, protocol_family); - splx(s); - thread_funnel_set(network_flock, funnel_state); + lck_mtx_unlock(proto_family_mutex); return ret; } @@ -1716,488 +2597,65 @@ int dlil_plumb_protocol(u_long protocol_family, struct ifnet *ifp, u_long *dl_ta int dlil_unplumb_protocol(u_long protocol_family, struct ifnet *ifp) { struct proto_family_str *proto_family; - int s, ret = 0; - u_long tag; - boolean_t funnel_state; - - funnel_state = thread_funnel_set(network_flock, TRUE); - s = splnet(); - - ret = dlil_find_dltag(ifp->if_family, ifp->if_unit, protocol_family, &tag); - - if (ret == 0) { - proto_family = find_proto_module(protocol_family, ifp->if_family); - if (proto_family && proto_family->detach_proto) - ret = (*proto_family->detach_proto)(ifp, tag); - else - ret = dlil_detach_protocol(tag); - } - - splx(s); - thread_funnel_set(network_flock, funnel_state); - return ret; -} - - - -/* - * Old if_attach no-op'ed function defined here for temporary backwards compatibility - */ - -void if_attach(ifp) - struct ifnet *ifp; -{ - dlil_if_attach(ifp); -} - - - -int -dlil_inject_if_input(struct mbuf *m, char *frame_header, u_long from_id) -{ - struct ifnet *orig_ifp = 0; - struct ifnet *ifp; - struct if_proto *ifproto; - struct if_proto *proto; - struct dlil_filterq_entry *tmp; - int retval = 0; - struct dlil_filterq_head *fhead; - int match_found; - - dlil_stats.inject_if_in1++; - - if (from_id >= dlil_filters_nb || dlil_filters[from_id].type != DLIL_IF_FILTER) - return ENOENT; - - ifp = dlil_filters[from_id].ifp; - -/* - * Let interface filters (if any) do their thing ... - */ - - fhead = (struct dlil_filterq_head *) &ifp->if_flt_head; - match_found = 0; - - if (TAILQ_EMPTY(fhead) == 0) { - while (orig_ifp != ifp) { - orig_ifp = ifp; - TAILQ_FOREACH_REVERSE(tmp, fhead, que, dlil_filterq_head) { - if ((match_found) && (IFILT(tmp).filter_if_input)) { - retval = (*IFILT(tmp).filter_if_input)(IFILT(tmp).cookie, - &ifp, - &m, - &frame_header); - if (retval) { - if (retval == EJUSTRETURN) - return 0; - else { - m_freem(m); - return retval; - } - } - - } - - if (ifp != orig_ifp) - break; - - if (from_id == tmp->filter_id) - match_found = 1; - } - } - } - - ifp->if_lastchange = time; - - /* - * Call family demux module. If the demux module finds a match - * for the frame it will fill-in the ifproto pointer. - */ - - retval = (*ifp->if_demux)(ifp, m, frame_header, &ifproto ); - - if (m->m_flags & (M_BCAST|M_MCAST)) - ifp->if_imcasts++; - - if ((retval) && (ifp->offercnt)) { - /* - * No match was found, look for any offers. 
- */ - struct dlil_proto_head *tmp = (struct dlil_proto_head *) &ifp->proto_head; - TAILQ_FOREACH(proto, tmp, next) { - if ((proto->dl_offer) && (proto->dl_offer(m, frame_header) == 0)) { - ifproto = proto; - retval = 0; - break; - } - } - } - - if (retval) { - if (retval != EJUSTRETURN) { - m_freem(m); - return retval; - } - else - return 0; - } - else - if (ifproto == 0) { - printf("ERROR - dlil_inject_if_input -- if_demux didn't return an if_proto pointer\n"); - m_freem(m); - return 0; - } - -/* - * Call any attached protocol filters. - */ - TAILQ_FOREACH_REVERSE(tmp, &ifproto->pr_flt_head, que, dlil_filterq_head) { - if (PFILT(tmp).filter_dl_input) { - retval = (*PFILT(tmp).filter_dl_input)(PFILT(tmp).cookie, - &m, - &frame_header, - &ifp); - - if (retval) { - if (retval == EJUSTRETURN) - return 0; - else { - m_freem(m); - return retval; - } - } - } - } - - - - retval = (*ifproto->dl_input)(m, frame_header, - ifp, ifproto->dl_tag, - FALSE); - - dlil_stats.inject_if_in2++; - if (retval == EJUSTRETURN) - retval = 0; - else - if (retval) - m_freem(m); - - return retval; - -} - - - - - -int -dlil_inject_pr_input(struct mbuf *m, char *frame_header, u_long from_id) -{ - struct ifnet *orig_ifp = 0; - struct dlil_filterq_entry *tmp; - int retval; - struct if_proto *ifproto = 0; - int match_found; - struct ifnet *ifp; - - dlil_stats.inject_pr_in1++; - if (from_id >= dlil_filters_nb || dlil_filters[from_id].type != DLIL_PR_FILTER) - return ENOENT; - - ifproto = dlil_filters[from_id].proto; - ifp = dlil_filters[from_id].ifp; - -/* - * Call any attached protocol filters. - */ - - match_found = 0; - TAILQ_FOREACH_REVERSE(tmp, &ifproto->pr_flt_head, que, dlil_filterq_head) { - if ((match_found) && (PFILT(tmp).filter_dl_input)) { - retval = (*PFILT(tmp).filter_dl_input)(PFILT(tmp).cookie, - &m, - &frame_header, - &ifp); - - if (retval) { - if (retval == EJUSTRETURN) - return 0; - else { - m_freem(m); - return retval; - } - } - } - - if (tmp->filter_id == from_id) - match_found = 1; - } - - - retval = (*ifproto->dl_input)(m, frame_header, - ifp, ifproto->dl_tag, - FALSE); - - if (retval == EJUSTRETURN) - retval = 0; - else - if (retval) - m_freem(m); - - dlil_stats.inject_pr_in2++; - return retval; -} - + int ret = 0; + lck_mtx_lock(proto_family_mutex); -int -dlil_inject_pr_output(struct mbuf *m, - struct sockaddr *dest, - int raw, - char *frame_type, - char *dst_linkaddr, - u_long from_id) -{ - struct ifnet *orig_ifp = 0; - struct ifnet *ifp; - struct dlil_filterq_entry *tmp; - int retval = 0; - char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4]; - char dst_linkaddr_buffer[MAX_LINKADDR * 4]; - struct dlil_filterq_head *fhead; - int match_found; - u_long dl_tag; - - dlil_stats.inject_pr_out1++; - if (raw == 0) { - if (frame_type) - bcopy(frame_type, &frame_type_buffer[0], MAX_FRAME_TYPE_SIZE * 4); - else - return EINVAL; - - if (dst_linkaddr) - bcopy(dst_linkaddr, &dst_linkaddr_buffer, MAX_LINKADDR * 4); + proto_family = find_proto_module(protocol_family, ifp->if_family); + if (proto_family && proto_family->detach_proto) + ret = proto_family->detach_proto(ifp, protocol_family); else - return EINVAL; - } - - if (from_id >= dlil_filters_nb || dlil_filters[from_id].type != DLIL_PR_FILTER) - return ENOENT; - - ifp = dlil_filters[from_id].ifp; - dl_tag = dlil_filters[from_id].proto->dl_tag; - - frame_type = frame_type_buffer; - dst_linkaddr = dst_linkaddr_buffer; - - fhead = (struct dlil_filterq_head *) &ifp->if_flt_head; - -/* - * Run any attached protocol filters. 
- */ - match_found = 0; - - if (TAILQ_EMPTY(dl_tag_array[dl_tag].pr_flt_head) == 0) { - TAILQ_FOREACH(tmp, dl_tag_array[dl_tag].pr_flt_head, que) { - if ((match_found) && (PFILT(tmp).filter_dl_output)) { - retval = (*PFILT(tmp).filter_dl_output)(PFILT(tmp).cookie, - &m, &ifp, &dest, dst_linkaddr, frame_type); - if (retval) { - if (retval == EJUSTRETURN) - return 0; - else { - m_freem(m); - return retval; - } - } - } - - if (tmp->filter_id == from_id) - match_found = 1; - } - } - - -/* - * Call framing module - */ - if ((raw == 0) && (ifp->if_framer)) { - retval = (*ifp->if_framer)(ifp, &m, dest, dst_linkaddr, frame_type); - if (retval) { - if (retval == EJUSTRETURN) - return 0; - else - { - m_freem(m); - return retval; - } - } - } - - -#if BRIDGE - if (do_bridge) { - struct mbuf *m0 = m ; - struct ether_header *eh = mtod(m, struct ether_header *); - - if (m->m_pkthdr.rcvif) - m->m_pkthdr.rcvif = NULL ; - ifp = bridge_dst_lookup(eh); - bdg_forward(&m0, ifp); - if (m0) - m_freem(m0); - - return 0; - } -#endif - - -/* - * Let interface filters (if any) do their thing ... - */ - - fhead = (struct dlil_filterq_head *) &ifp->if_flt_head; - if (TAILQ_EMPTY(fhead) == 0) { - while (orig_ifp != ifp) { - orig_ifp = ifp; - TAILQ_FOREACH(tmp, fhead, que) { - if (IFILT(tmp).filter_if_output) { - retval = (*IFILT(tmp).filter_if_output)(IFILT(tmp).cookie, - &ifp, - &m); - if (retval) { - if (retval == EJUSTRETURN) - return 0; - else { - m_freem(m); - return retval; - } - } - - } - - if (ifp != orig_ifp) - break; - } - } - } - -/* - * Finally, call the driver. - */ - - retval = (*ifp->if_output)(ifp, m); - dlil_stats.inject_pr_out2++; - if ((retval == 0) || (retval == EJUSTRETURN)) - return 0; - else - return retval; -} - - -int -dlil_inject_if_output(struct mbuf *m, u_long from_id) -{ - struct ifnet *orig_ifp = 0; - struct ifnet *ifp; - struct dlil_filterq_entry *tmp; - int retval = 0; - struct dlil_filterq_head *fhead; - int match_found; - - dlil_stats.inject_if_out1++; - if (from_id > dlil_filters_nb || dlil_filters[from_id].type != DLIL_IF_FILTER) - return ENOENT; - - ifp = dlil_filters[from_id].ifp; - -/* - * Let interface filters (if any) do their thing ... - */ - - fhead = (struct dlil_filterq_head *) &ifp->if_flt_head; - match_found = 0; - - if (TAILQ_EMPTY(fhead) == 0) { - while (orig_ifp != ifp) { - orig_ifp = ifp; - TAILQ_FOREACH(tmp, fhead, que) { - if ((match_found) && (IFILT(tmp).filter_if_output)) { - retval = (*IFILT(tmp).filter_if_output)(IFILT(tmp).cookie, - &ifp, - &m); - if (retval) { - if (retval == EJUSTRETURN) - return 0; - else { - m_freem(m); - return retval; - } - } - - } - - if (ifp != orig_ifp) - break; - - if (from_id == tmp->filter_id) - match_found = 1; - } - } - } - -/* - * Finally, call the driver. 
- */ + ret = dlil_detach_protocol(ifp, protocol_family); - retval = (*ifp->if_output)(ifp, m); - dlil_stats.inject_if_out2++; - if ((retval == 0) || (retval == EJUSTRETURN)) - return 0; - else - return retval; + lck_mtx_unlock(proto_family_mutex); + return ret; } -static -int dlil_recycle_ioctl(struct ifnet *ifnet_ptr, u_long ioctl_code, void *ioctl_arg) +static errno_t +dlil_recycle_ioctl( + __unused ifnet_t ifnet_ptr, + __unused u_int32_t ioctl_code, + __unused void *ioctl_arg) { - return EOPNOTSUPP; } -static -int dlil_recycle_output(struct ifnet *ifnet_ptr, struct mbuf *m) +static int +dlil_recycle_output( + __unused struct ifnet *ifnet_ptr, + struct mbuf *m) { - m_freem(m); return 0; } -static -int dlil_recycle_free(struct ifnet *ifnet_ptr) +static void +dlil_recycle_free( + __unused ifnet_t ifnet_ptr) { - return 0; } -static -int dlil_recycle_set_bpf_tap(struct ifnet *ifp, int mode, - int (*bpf_callback)(struct ifnet *, struct mbuf *)) +static errno_t +dlil_recycle_set_bpf_tap( + __unused ifnet_t ifp, + __unused bpf_tap_mode mode, + __unused bpf_packet_func callback) { /* XXX not sure what to do here */ return 0; } -int dlil_if_acquire(u_long family, void *uniqueid, size_t uniqueid_len, - struct ifnet **ifp) +int dlil_if_acquire( + u_long family, + const void *uniqueid, + size_t uniqueid_len, + struct ifnet **ifp) { struct ifnet *ifp1 = NULL; struct dlil_ifnet *dlifp1 = NULL; - int s, ret = 0; - boolean_t funnel_state; - - funnel_state = thread_funnel_set(network_flock, TRUE); - s = splnet(); + int ret = 0; + lck_mtx_lock(dlil_ifnet_mutex); TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) { ifp1 = (struct ifnet *)dlifp1; @@ -2216,8 +2674,11 @@ int dlil_if_acquire(u_long family, void *uniqueid, size_t uniqueid_len, } } else { - - ifp1->if_eflags |= (IFEF_INUSE + IFEF_REUSE); + if (!ifp1->if_lock) + panic("ifp's lock is gone\n"); + ifnet_lock_exclusive(ifp1); + ifp1->if_eflags |= (IFEF_INUSE | IFEF_REUSE); + ifnet_lock_done(ifp1); *ifp = ifp1; goto end; } @@ -2247,27 +2708,26 @@ int dlil_if_acquire(u_long family, void *uniqueid, size_t uniqueid_len, ifp1 = (struct ifnet *)dlifp1; ifp1->if_eflags |= IFEF_INUSE; + ifp1->if_name = dlifp1->if_namestorage; TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link); *ifp = ifp1; end: + lck_mtx_unlock(dlil_ifnet_mutex); - splx(s); - thread_funnel_set(network_flock, funnel_state); return ret; } void dlil_if_release(struct ifnet *ifp) { struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp; - int s; - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); - s = splnet(); + /* Interface does not have a lock until it is attached - radar 3713951 */ + if (ifp->if_lock) + ifnet_lock_exclusive(ifp); ifp->if_eflags &= ~IFEF_INUSE; ifp->if_ioctl = dlil_recycle_ioctl; ifp->if_output = dlil_recycle_output; @@ -2276,8 +2736,7 @@ void dlil_if_release(struct ifnet *ifp) strncpy(dlifp->if_namestorage, ifp->if_name, IFNAMSIZ); ifp->if_name = dlifp->if_namestorage; + if (ifp->if_lock) + ifnet_lock_done(ifp); - splx(s); - thread_funnel_set(network_flock, funnel_state); } - diff --git a/bsd/net/dlil.h b/bsd/net/dlil.h index 21f6d01fe..6824572cb 100644 --- a/bsd/net/dlil.h +++ b/bsd/net/dlil.h @@ -25,22 +25,23 @@ * Data Link Inteface Layer * Author: Ted Walker */ - - #ifndef DLIL_H #define DLIL_H -#include <sys/appleapiopts.h> +#ifdef KERNEL +#include <sys/kernel_types.h> +#include <net/kpi_interface.h> #if __STDC__ struct ifnet; struct mbuf; struct ether_header; +struct sockaddr_dl; #endif -#ifdef __APPLE_API_UNSTABLE +#ifdef 
KERNEL_PRIVATE #define DLIL_LAST_FILTER -1 #define DLIL_NULL_FILTER -2 @@ -54,6 +55,8 @@ struct ether_header; #include <net/if_var.h> #include <sys/kern_event.h> +#endif KERNEL_PRIVATE + enum { BPF_TAP_DISABLE, BPF_TAP_INPUT, @@ -61,143 +64,62 @@ enum { BPF_TAP_INPUT_OUTPUT }; - -struct dl_tag_attr_str { - u_long dl_tag; - short if_flags; - short if_unit; - u_long if_family; - u_long protocol_family; -}; - - -struct dlil_pr_flt_str { - caddr_t cookie; - - int (*filter_dl_input)(caddr_t cookie, - struct mbuf **m, - char **frame_header, - struct ifnet **ifp); - - - int (*filter_dl_output)(caddr_t cookie, - struct mbuf **m, - struct ifnet **ifp, - struct sockaddr **dest, - char *dest_linkaddr, - char *frame_type); - - int (*filter_dl_event)(caddr_t cookie, - struct kern_event_msg *event_msg); - - int (*filter_dl_ioctl)(caddr_t cookie, - struct ifnet *ifp, - u_long ioctl_cmd, - caddr_t ioctl_arg); - - int (*filter_detach)(caddr_t cookie); - u_long reserved[2]; -}; +#ifdef KERNEL_PRIVATE +struct kev_msg; +struct iff_filter; struct dlil_if_flt_str { caddr_t cookie; int (*filter_if_input)(caddr_t cookie, - struct ifnet **ifnet_ptr, + struct ifnet **ifp, struct mbuf **mbuf_ptr, char **frame_ptr); int (*filter_if_event)(caddr_t cookie, - struct ifnet **ifnet_ptr, - struct kern_event_msg **event_msg_ptr); + struct ifnet *ifp, + struct kev_msg *event_msg_ptr); int (*filter_if_output)(caddr_t cookie, - struct ifnet **ifnet_ptr, + struct ifnet **ifp, struct mbuf **mbuf_ptr); int (*filter_if_ioctl)(caddr_t cookie, - struct ifnet *ifnet_ptr, + struct ifnet *ifp, u_long ioctl_code_ptr, caddr_t ioctl_arg_ptr); int (*filter_if_free)(caddr_t cookie, - struct ifnet *ifnet_ptr); + struct ifnet *ifp); - int (*filter_detach)(caddr_t cookie); + int (*filter_detach)(caddr_t cookie); u_long reserved[2]; }; - #define DLIL_PR_FILTER 1 #define DLIL_IF_FILTER 2 typedef int (*dl_input_func)(struct mbuf *m, char *frame_header, - struct ifnet *ifp, u_long dl_tag, int sync_ok); + struct ifnet *ifp, u_long protocol_family, int sync_ok); typedef int (*dl_pre_output_func)(struct ifnet *ifp, - struct mbuf **m, - struct sockaddr *dest, - caddr_t route_entry, - char *frame_type, - char *dst_addr, - u_long dl_tag); + u_long protocol_family, + struct mbuf **m, + const struct sockaddr *dest, + caddr_t route_entry, + char *frame_type, + char *dst_addr); -typedef int (*dl_event_func)(struct kern_event_msg *event, - u_long dl_tag); +typedef void (*dl_event_func)(struct ifnet *ifp, struct kev_msg *event); typedef int (*dl_offer_func)(struct mbuf *m, char *frame_header); -typedef int (*dl_ioctl_func)(u_long dl_tag, +typedef int (*dl_ioctl_func)(u_long protocol_family, struct ifnet *ifp, u_long ioctl_cmd, caddr_t ioctl_arg); +typedef int (*dl_detached_func)(u_long protocol_family, struct ifnet *ifp); - - -#ifdef __APPLE_API_PRIVATE -struct dlil_filterq_entry { - TAILQ_ENTRY(dlil_filterq_entry) que; - u_long filter_id; - int type; - union { - struct dlil_if_flt_str if_filter; - struct dlil_pr_flt_str pr_filter; - } variants; -}; -#else -struct dlil_filterq_entry; -#endif /* __APPLE_API_PRIVATE */ - -TAILQ_HEAD(dlil_filterq_head, dlil_filterq_entry); - - -struct if_proto { - TAILQ_ENTRY(if_proto) next; - u_long dl_tag; - struct dlil_filterq_head pr_flt_head; - struct ifnet *ifp; - dl_input_func dl_input; - dl_pre_output_func dl_pre_output; - dl_event_func dl_event; - dl_offer_func dl_offer; - dl_ioctl_func dl_ioctl; - u_long protocol_family; - u_long reserved[4]; - -}; - -#ifdef __APPLE_API_PRIVATE -TAILQ_HEAD(dlil_proto_head, if_proto); - 
-struct dlil_tag_list_entry { - TAILQ_ENTRY(dlil_tag_list_entry) next; - struct ifnet *ifp; - u_long dl_tag; -}; -#endif /* __APPLE_API_PRIVATE */ - - -#ifdef __APPLE_API_OBSOLETE /* Obsolete types */ #define DLIL_DESC_RAW 1 #define DLIL_DESC_802_2 2 @@ -210,9 +132,9 @@ struct dlil_tag_list_entry { * DLIL_DESC_802_2_SNAP - obsolete, data in variants.desc_802_2_SNAP * protocol field in host byte order */ -#endif /* __APPLE_API_OBSOLETE */ +#endif KERNEL_PRIVATE -/* Ehernet specific types */ +/* Ethernet specific types */ #define DLIL_DESC_ETYPE2 4 #define DLIL_DESC_SAP 5 #define DLIL_DESC_SNAP 6 @@ -232,6 +154,7 @@ struct dlil_tag_list_entry { * variants.native_type_length. */ +#ifdef KERNEL_PRIVATE struct dlil_demux_desc { TAILQ_ENTRY(dlil_demux_desc) next; @@ -269,7 +192,6 @@ struct dlil_demux_desc { TAILQ_HEAD(ddesc_head_str, dlil_demux_desc); - struct dlil_proto_reg_str { struct ddesc_head_str demux_desc_head; u_long interface_family; @@ -281,23 +203,39 @@ struct dlil_proto_reg_str { dl_event_func event; dl_offer_func offer; dl_ioctl_func ioctl; - u_long reserved[4]; + dl_detached_func detached; + u_long reserved[3]; }; -int dlil_attach_interface_filter(struct ifnet *ifnet_ptr, - struct dlil_if_flt_str *interface_filter, - u_long *filter_id, - int insertion_point); +int dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter, + interface_filter_t *filter_ref); + +struct ifnet_stat_increment_param; + +int +dlil_input_with_stats(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, + const struct ifnet_stat_increment_param *stats); int dlil_input(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail); int -dlil_output(u_long dl_tag, +dlil_output_list( + struct ifnet *ifp, + u_long protocol_family, + struct mbuf *packetlist, + caddr_t route, + const struct sockaddr *dest, + int raw); + +int +dlil_output( + struct ifnet *ifp, + u_long protocol_family, struct mbuf *m, caddr_t route, - struct sockaddr *dest, + const struct sockaddr *dest, int raw); @@ -307,32 +245,82 @@ dlil_ioctl(u_long proto_family, u_long ioctl_code, caddr_t ioctl_arg); +errno_t +dlil_resolve_multi( + struct ifnet *ifp, + const struct sockaddr *proto_addr, + struct sockaddr *ll_addr, + size_t ll_len); + +/* + * Send arp internal bypasses the check for + * IPv4LL. 
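+ * Callers should normally use dlil_send_arp(), which performs the
+ * IPv4 link-local checks that this _internal variant deliberately
+ * skips.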
+ */ +errno_t +dlil_send_arp_internal( + ifnet_t ifp, + u_int16_t arpop, + const struct sockaddr_dl* sender_hw, + const struct sockaddr* sender_proto, + const struct sockaddr_dl* target_hw, + const struct sockaddr* target_proto); + +errno_t +dlil_send_arp( + ifnet_t ifp, + u_int16_t arpop, + const struct sockaddr_dl* sender_hw, + const struct sockaddr* sender_proto, + const struct sockaddr_dl* target_hw, + const struct sockaddr* target_proto); + +int +dlil_ioctl_locked(u_long proto_family, + struct ifnet *ifp, + u_long ioctl_code, + caddr_t ioctl_arg); + int -dlil_attach_protocol(struct dlil_proto_reg_str *proto, - u_long *dl_tag); +dlil_attach_protocol(struct dlil_proto_reg_str *proto); int -dlil_detach_protocol(u_long dl_tag); +dlil_detach_protocol(struct ifnet *ifp, u_long protocol_family); int dlil_if_attach(struct ifnet *ifp); +#ifdef BSD_KERNEL_PRIVATE + int -dlil_attach_protocol_filter(u_long dl_tag, - struct dlil_pr_flt_str *proto_filter, - u_long *filter_id, - int insertion_point); +dlil_if_attach_with_address( + struct ifnet *ifp, + const struct sockaddr_dl *ll_addr); + int -dlil_detach_filter(u_long filter_id); +dlil_attach_protocol_kpi(ifnet_t ifp, protocol_family_t protocol, + const struct ifnet_attach_proto_param *proto_details); + +errno_t dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, + bpf_packet_func callback); + +#endif + +void +dlil_detach_filter(interface_filter_t filter); struct dlil_ifmod_reg_str { int (*add_if)(struct ifnet *ifp); int (*del_if)(struct ifnet *ifp); - int (*add_proto)(struct ddesc_head_str *demux_desc_head, - struct if_proto *proto, u_long dl_tag); - int (*del_proto)(struct if_proto *proto, u_long dl_tag); - int (*ifmod_ioctl)(struct ifnet *ifp, u_long ioctl_cmd, caddr_t data); - int (*shutdown)(); + int (*add_proto)(struct ifnet *ifp, u_long protocol_family, + struct ddesc_head_str *demux_desc_head); +#ifdef __KPI_INTERFACE__ + ifnet_del_proto_func del_proto; + ifnet_ioctl_func ifmod_ioctl; +#else + void* del_proto; + void* ifmod_ioctl; +#endif + int (*shutdown)(void); int (*init_if)(struct ifnet *ifp); u_long reserved[3]; }; @@ -341,24 +329,6 @@ struct dlil_ifmod_reg_str { int dlil_reg_if_modules(u_long interface_family, struct dlil_ifmod_reg_str *ifmod_reg); -struct dlil_protomod_reg_str { - /* - * attach the protocol to the interface and return the dl_tag - */ - int (*attach_proto)(struct ifnet *ifp, u_long *dl_tag); - - /* - * detach the protocol from the interface. - * this is optionnal. If it is NULL, DLIL will use 0 default detach function. - */ - int (*detach_proto)(struct ifnet *ifp, u_long dl_tag); - - /* - * reserved for future use. MUST be NULL. - */ - u_long reserved[4]; -}; - /* Function : dlil_reg_proto_module @@ -402,7 +372,8 @@ EINVAL: */ int dlil_reg_proto_module(u_long protocol_family, u_long interface_family, - struct dlil_protomod_reg_str *protomod_reg); + int (*attach)(struct ifnet *ifp, u_long protocol_family), + int (*detach)(struct ifnet *ifp, u_long protocol_family)); /* @@ -438,16 +409,11 @@ Function : dlil_plumb_protocol dlil_plumb_protocol() will plumb a protocol to an actual interface. This will find a registered protocol module and call its attach function. - The module will typically call dlil_attach_protocol with the appropriate parameters, - and will return the dl_tag of the attachement. - It is up to the caller to handle the dl_tag. - Some protocol (IPv4) will stick it in their internal structure for future use. - Some other protocol (IPv6) can ignore the dl_tag. 
- + The module will typically call dlil_attach_protocol with the appropriate parameters. + Parameters : 'protocol_family' is PF_INET, PF_INET6, ... 'ifp' is the interface to plumb the protocol to. - 'dl_tag' is the tag returned from the succesful attachement. Return code : @@ -464,7 +430,7 @@ other: Error returned by the attach_proto function */ -int dlil_plumb_protocol(u_long protocol_family, struct ifnet *ifp, u_long *dl_tag); +int dlil_plumb_protocol(u_long protocol_family, struct ifnet *ifp); /* @@ -513,9 +479,11 @@ dlil_inject_pr_output(struct mbuf *m, int dlil_inject_if_output(struct mbuf *m, u_long from_id); -int -dlil_find_dltag(u_long if_family, short unit, u_long proto_family, u_long *dl_tag); - +#ifdef KERNEL_PRIVATE +void +dlil_post_msg(struct ifnet *ifp,u_long event_subclass, u_long event_code, + struct net_event_data *event_data, u_long event_data_len); +#endif int dlil_event(struct ifnet *ifp, struct kern_event_msg *event); @@ -525,6 +493,12 @@ int dlil_dereg_if_modules(u_long interface_family); int dlil_if_detach(struct ifnet *ifp); +void +ifp_reference(struct ifnet *ifp); + +void +ifp_release(struct ifnet *ifp); + /* @@ -604,7 +578,7 @@ EBUSY: */ -int dlil_if_acquire(u_long family, void *uniqueid, size_t uniqueid_len, +int dlil_if_acquire(u_long family, const void *uniqueid, size_t uniqueid_len, struct ifnet **ifp); @@ -619,10 +593,10 @@ Function : dlil_if_release The if_eflags IF_INUSE will be cleared. The fields if_output, if_ioctl, if_free and if_set_bpf_tap will be changed to point to DLIL private functions. - After calling dlil_if_acquire, the driver can safely terminate and + After calling dlil_if_release, the driver can safely terminate and unload if necessary. - Note : if the call to dlil_if_detach returns DLIL_WAIT_FOR_FREE, the - driver can safely ignore it and call dlil_if_release. + Note: your driver should only call dlil_if_release once your if_free + function has been called. Parameters : ifp is the pointer to the ifnet to release. 
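
To tie the acquire/attach/detach/release calls documented in this header
together, here is a minimal sketch of the driver lifecycle they imply. It is
not from the original source: it uses only the functions declared above, the
my_* names and the unique id are hypothetical, and error handling is trimmed.

	static const char my_uniqueid[] = "com.example.mydriver";

	/* if_free fires once the last use of the ifnet has dropped;
	   per the note below, only then is it safe to hand the ifnet
	   back for recycling. */
	static void
	my_if_free(struct ifnet *ifp)
	{
		dlil_if_release(ifp);
	}

	static int
	my_driver_start(u_long family)
	{
		struct ifnet *ifp;
		int err;

		/* Recycles an ifnet previously tagged with our unique id,
		   or allocates a fresh one. */
		err = dlil_if_acquire(family, my_uniqueid,
		                      sizeof(my_uniqueid), &ifp);
		if (err)
			return err;

		/* ... fill in if_name, if_unit, if_output, if_ioctl ... */
		ifp->if_free = my_if_free;

		return dlil_if_attach(ifp);
	}

	static void
	my_driver_stop(struct ifnet *ifp)
	{
		/* DLIL_WAIT_FOR_FREE only means protocols are still
		   attached; my_if_free will run, and release the ifnet,
		   when the last reference goes away. */
		(void)dlil_if_detach(ifp);
	}

The revised note below is the key constraint this sketch follows:
dlil_if_release is called from the if_free callback, not immediately after
dlil_if_detach.
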
@@ -631,5 +605,6 @@ Parameters : void dlil_if_release(struct ifnet *ifp); -#endif /* __APPLE_API_UNSTABLE */ +#endif /* KERNEL_PRIVATE */ +#endif /* KERNEL */ #endif /* DLIL_H */ diff --git a/bsd/net/dlil_pvt.h b/bsd/net/dlil_pvt.h index 91da52b03..af688c107 100644 --- a/bsd/net/dlil_pvt.h +++ b/bsd/net/dlil_pvt.h @@ -22,26 +22,20 @@ #ifndef DLIL_PVT_H #define DLIL_PVT_H #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE #include <net/dlil.h> #include <sys/queue.h> -struct dlil_if_filterq_entry { - TAILQ_ENTRY(dlil_if_filterq_entry) que; - struct dlil_interface_filter_str if_filter; -}; - - struct dlil_family_mod_str { TAILQ_ENTRY(dlil_family_mod_str) dl_fam_next; char *interface_family; int (*add_if)(struct ifnet_ptr *ifp); int (*del_if)(struct ifnet *ifp); - int (*add_proto)(TAILQ_HEAD(ddesc_head_name, dlil_demux_desc) demux_desc_head, - struct if_proto *proto); - int (*del_proto)(struct if_proto *proto); + int (*add_proto)(struct ifnet *ifp, u_long protocol_family, + struct ddesc_head_str *demux_desc_head); + int (*del_proto)(struct ifnet *ifp, u_long proto_family); } -#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL_PRIVATE */ #endif diff --git a/bsd/net/ether_at_pr_module.c b/bsd/net/ether_at_pr_module.c index d811b9c33..7f031bdd8 100644 --- a/bsd/net/ether_at_pr_module.c +++ b/bsd/net/ether_at_pr_module.c @@ -65,7 +65,6 @@ #include <sys/sysctl.h> #include <net/if.h> -#include <net/netisr.h> #include <net/route.h> #include <net/if_llc.h> #include <net/if_dl.h> @@ -90,19 +89,22 @@ extern struct ifqueue atalkintrq; #include <net/if_vlan_var.h> #endif /* NVLAN > 0 */ -static -u_char etherbroadcastaddr[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; - -#define IFP2AC(IFP) ((struct arpcom *)IFP) - - struct dl_es_at_entry { - struct ifnet *ifp; - u_long dl_tag; - int ref_count; + struct ifnet *ifp; + int ref_count; }; +/* Local fuction declerations */ +int at_ether_input(struct mbuf *m, char *frame_header, struct ifnet *ifp, + u_long protocol_family, int sync_ok); +int ether_pre_output(struct ifnet *ifp, u_long protocol_family, struct mbuf **m0, + const struct sockaddr *dst_netaddr, caddr_t route, char *type, char *edst); +int ether_prmod_ioctl(u_long protocol_family, struct ifnet *ifp, u_long command, + caddr_t data); +int ether_attach_at(struct ifnet *ifp); +void ether_detach_at(struct ifnet *ifp); + /* * Temp static for protocol registration XXX @@ -118,155 +120,44 @@ static struct dl_es_at_entry en_at_array[MAX_EN_COUNT]; * the ether header, which is provided separately. */ int -at_ether_input(m, frame_header, ifp, dl_tag, sync_ok) - struct mbuf *m; - char *frame_header; - struct ifnet *ifp; - u_long dl_tag; - int sync_ok; +at_ether_input( + struct mbuf *m, + __unused char *frame_header, + __unused struct ifnet *ifp, + __unused u_long protocol_family, + __unused int sync_ok) { - register struct ether_header *eh = (struct ether_header *) frame_header; - register struct ifqueue *inq=0; - u_short ether_type; - int s; - u_int16_t ptype = -1; - unsigned char buf[18]; - -#if NETAT - register struct llc *l; -#endif - - if ((ifp->if_flags & IFF_UP) == 0) { - m_freem(m); - return EJUSTRETURN; - } - - ifp->if_lastchange = time; - - if (eh->ether_dhost[0] & 1) { - if (bcmp((caddr_t)etherbroadcastaddr, (caddr_t)eh->ether_dhost, - sizeof(etherbroadcastaddr)) == 0) - m->m_flags |= M_BCAST; - else - m->m_flags |= M_MCAST; - } - if (m->m_flags & (M_BCAST|M_MCAST)) - ifp->if_imcasts++; + /* + * note: for AppleTalk we need to pass the enet header of the + * packet up stack. 
To do so, we made sure in that the FULL packet + * is copied in the mbuf by the mace driver, and only the m_data and + * length have been shifted to make IP and the other guys happy. + */ + + m->m_data -= sizeof(struct ether_header); + m->m_len += sizeof(struct ether_header); + m->m_pkthdr.len += sizeof(struct ether_header); + proto_input(PF_APPLETALK, m); - ether_type = ntohs(eh->ether_type); - -#if NVLAN > 0 - if (ether_type == vlan_proto) { - if (vlan_input(eh, m) < 0) - ifp->if_data.ifi_noproto++; - return EJUSTRETURN; - } -#endif /* NVLAN > 0 */ - - if (ether_type > ETHERMTU) - return ENOENT; - -#if NETAT - l = mtod(m, struct llc *); - - switch (l->llc_dsap) { - case LLC_SNAP_LSAP: - - /* Temporary hack: check for AppleTalk and AARP packets */ - /* WARNING we're checking only on the "ether_type" (the 2 bytes - * of the SNAP header. This shouldn't be a big deal, - * AppleTalk pat_input is making sure we have the right packets - * because it needs to discrimante AARP from EtherTalk packets. - */ - - if (l->llc_ssap == LLC_SNAP_LSAP && - l->llc_un.type_snap.control == 0x03) { - -#ifdef APPLETALK_DEBUG - printf("new_ether_input: SNAP Cntrol type=0x%x Src=%s\n", - l->llc_un.type_snap.ether_type, - ether_sprintf(buf, &eh->ether_shost)); - printf(" Dst=%s\n", - ether_sprintf(buf, &eh->ether_dhost)); -#endif /* APPLETALK_DEBUG */ - - if ((l->llc_un.type_snap.ether_type == 0x809B) || - (l->llc_un.type_snap.ether_type == 0x80F3)) { - - - /* - * note: for AppleTalk we need to pass the enet header of the - * packet up stack. To do so, we made sure in that the FULL packet - * is copied in the mbuf by the mace driver, and only the m_data and - * length have been shifted to make IP and the other guys happy. - */ - - m->m_data -= sizeof(*eh); - m->m_len += sizeof(*eh); - m->m_pkthdr.len += sizeof(*eh); -#ifdef APPLETALK_DEBUG - l == (struct llc *)(eh+1); - if (l->llc_un.type_snap.ether_type == 0x80F3) { - kprintf("new_ether_input: RCV AppleTalk type=0x%x Src=%s\n", - l->llc_un.type_snap.ether_type, - ether_sprintf(buf, &eh->ether_shost)); - kprintf(" Dst=%s\n", - ether_sprintf(buf, &eh->ether_dhost)); - } -#endif /* APPLETALK_DEBUG */ - schednetisr(NETISR_APPLETALK); - inq = &atalkintrq ; - - break; - } - } - - break; - - - default: - return ENOENT; - } - - - if (inq == 0) - return ENOENT; - - s = splimp(); - if (IF_QFULL(inq)) { - IF_DROP(inq); - m_freem(m); - splx(s); - return EJUSTRETURN; - } else - IF_ENQUEUE(inq, m); - splx(s); return 0; -#else - return ENOENT; -#endif /* NETAT */ } int -ether_pre_output(ifp, m0, dst_netaddr, route, type, edst, dl_tag ) - struct ifnet *ifp; - struct mbuf **m0; - struct sockaddr *dst_netaddr; - caddr_t route; - char *type; - char *edst; - u_long dl_tag; +ether_pre_output( + struct ifnet *ifp, + __unused u_long protocol_family, + struct mbuf **m0, + const struct sockaddr *dst_netaddr, + __unused caddr_t route, + char *type, + char *edst) { - int s; register struct mbuf *m = *m0; - register struct rtentry *rt; register struct ether_header *eh; - int off, len = m->m_pkthdr.len; int hlen; /* link layer header lenght */ - struct arpcom *ac = IFP2AC(ifp); @@ -314,42 +205,27 @@ ether_pre_output(ifp, m0, dst_netaddr, route, type, edst, dl_tag ) int -ether_prmod_ioctl(dl_tag, ifp, command, data) - u_long dl_tag; - struct ifnet *ifp; - int command; - caddr_t data; +ether_prmod_ioctl( + __unused u_long protocol_family, + struct ifnet *ifp, + u_long command, + caddr_t data) { - struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error 
= 0; - boolean_t funnel_state; - struct arpcom *ac = (struct arpcom *) ifp; - struct sockaddr_dl *sdl; - struct sockaddr_in *sin; - u_char *e_addr; - - - funnel_state = thread_funnel_set(network_flock, TRUE); switch (command) { case SIOCSIFADDR: if ((ifp->if_flags & IFF_RUNNING) == 0) { - ifp->if_flags |= IFF_UP; + ifnet_set_flags(ifp, IFF_UP, IFF_UP); dlil_ioctl(0, ifp, SIOCSIFFLAGS, (caddr_t) 0); } break; case SIOCGIFADDR: - { - struct sockaddr *sa; - - sa = (struct sockaddr *) & ifr->ifr_data; - bcopy(IFP2AC(ifp)->ac_enaddr, - (caddr_t) sa->sa_data, ETHER_ADDR_LEN); - } + ifnet_lladdr_copy_bytes(ifp, ifr->ifr_addr.sa_data, ETHER_ADDR_LEN); break; case SIOCSIFMTU: @@ -367,102 +243,92 @@ ether_prmod_ioctl(dl_tag, ifp, command, data) return EOPNOTSUPP; } - (void) thread_funnel_set(network_flock, funnel_state); return (error); } -void -ether_attach_at(struct ifnet *ifp, u_long *at_dl_tag, u_long *aarp_dl_tag) +int +ether_attach_at( + struct ifnet *ifp) { struct dlil_proto_reg_str reg; struct dlil_demux_desc desc; struct dlil_demux_desc desc2; - u_short native = 0; /* 802.2 frames use a length here */ int stat; int first_empty; int i; - + u_int8_t atalk_snap[5] = {0x08, 0x00, 0x07, 0x80, 0x9b}; + u_int8_t aarp_snap[5] = {0x00, 0x00, 0x00, 0x80, 0xf3}; first_empty = MAX_EN_COUNT; - for (i=0; i < MAX_EN_COUNT; i++) { - if (en_at_array[i].ifp == 0) - first_empty = i; - - if (en_at_array[i].ifp == ifp) { - en_at_array[i].ref_count++; - *at_dl_tag = *aarp_dl_tag = en_at_array[i].dl_tag; - return; - } + for (i=0; i < MAX_EN_COUNT; i++) { + if (en_at_array[i].ifp == 0) + first_empty = i; + + if (en_at_array[i].ifp == ifp) { + en_at_array[i].ref_count++; + return 0; + } } - if (first_empty == MAX_EN_COUNT) - return; - - TAILQ_INIT(&reg.demux_desc_head); - desc.type = DLIL_DESC_802_2_SNAP; - desc.variants.desc_802_2_SNAP.dsap = LLC_SNAP_LSAP; - desc.variants.desc_802_2_SNAP.ssap = LLC_SNAP_LSAP; - desc.variants.desc_802_2_SNAP.control_code = 0x03; - desc.variants.desc_802_2_SNAP.org[0] = 0x08; - desc.variants.desc_802_2_SNAP.org[1] = 0x00; - desc.variants.desc_802_2_SNAP.org[2] = 0x07; - desc.variants.desc_802_2_SNAP.protocol_type = 0x809B; - desc.native_type = NULL; - TAILQ_INSERT_TAIL(&reg.demux_desc_head, &desc, next); - reg.interface_family = ifp->if_family; - reg.unit_number = ifp->if_unit; - reg.input = at_ether_input; - reg.pre_output = ether_pre_output; - reg.event = 0; - reg.offer = 0; - reg.ioctl = ether_prmod_ioctl; - reg.default_proto = 0; - reg.protocol_family = PF_APPLETALK; - - desc2 = desc; - desc2.variants.desc_802_2_SNAP.protocol_type = 0x80F3; - desc2.variants.desc_802_2_SNAP.org[0] = 0; - desc2.variants.desc_802_2_SNAP.org[1] = 0; - desc2.variants.desc_802_2_SNAP.org[2] = 0; - - TAILQ_INSERT_TAIL(&reg.demux_desc_head, &desc2, next); - - stat = dlil_attach_protocol(&reg, at_dl_tag); - if (stat) { - printf("WARNING: ether_attach_at can't attach at to interface\n"); - return; - } - - *aarp_dl_tag = *at_dl_tag; - - en_at_array[first_empty].ifp = ifp; - en_at_array[first_empty].dl_tag = *at_dl_tag; - en_at_array[first_empty].ref_count = 1; + if (first_empty == MAX_EN_COUNT) + return ENOMEM; + + bzero(&reg, sizeof(reg)); + bzero(&desc, sizeof(desc)); + bzero(&desc2, sizeof(desc2)); + + TAILQ_INIT(&reg.demux_desc_head); + reg.interface_family = ifp->if_family; + reg.unit_number = ifp->if_unit; + reg.input = at_ether_input; + reg.pre_output = ether_pre_output; + reg.ioctl = ether_prmod_ioctl; + reg.protocol_family = PF_APPLETALK; + + desc.type = DLIL_DESC_SNAP; + desc.native_type = atalk_snap; + 
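/*
 * The two 5-byte SNAP templates used here encode a 3-byte OUI followed
 * by a 2-byte protocol type: 08:00:07 + 0x809b matches EtherTalk and
 * 00:00:00 + 0x80f3 matches AARP, the same values the removed
 * DLIL_DESC_802_2_SNAP descriptors spelled out field by field.
 */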
desc.variants.native_type_length = sizeof(atalk_snap); + TAILQ_INSERT_TAIL(&reg.demux_desc_head, &desc, next); + + desc2.type = DLIL_DESC_SNAP; + desc2.native_type = aarp_snap; + desc2.variants.native_type_length = sizeof(aarp_snap); + TAILQ_INSERT_TAIL(&reg.demux_desc_head, &desc2, next); + + stat = dlil_attach_protocol(&reg); + if (stat) { + printf("WARNING: ether_attach_at can't attach at to interface\n"); + return stat; + } + en_at_array[first_empty].ifp = ifp; + en_at_array[first_empty].ref_count = 1; + + return 0; } /* ether_attach_at */ void ether_detach_at(struct ifnet *ifp) { - int i; - - for (i=0; i < MAX_EN_COUNT; i++) { - if (en_at_array[i].ifp == ifp) - break; - } - - if (i < MAX_EN_COUNT) { - if (en_at_array[i].ref_count > 1) - en_at_array[i].ref_count--; - else { - if (en_at_array[i].ref_count == 1) { - dlil_detach_protocol(en_at_array[i].dl_tag); - en_at_array[i].ifp = 0; - } - } - } + int i; + + for (i=0; i < MAX_EN_COUNT; i++) { + if (en_at_array[i].ifp == ifp) + break; + } + + if (i < MAX_EN_COUNT) { + if (en_at_array[i].ref_count > 1) + en_at_array[i].ref_count--; + else { + if (en_at_array[i].ref_count == 1) { + dlil_detach_protocol(ifp, PF_APPLETALK); + en_at_array[i].ifp = 0; + } + } + } } diff --git a/bsd/net/ether_if_module.c b/bsd/net/ether_if_module.c index 5d47f82b2..0e50b415b 100644 --- a/bsd/net/ether_if_module.c +++ b/bsd/net/ether_if_module.c @@ -65,11 +65,11 @@ #include <sys/sysctl.h> #include <net/if.h> -#include <net/netisr.h> #include <net/route.h> #include <net/if_llc.h> #include <net/if_dl.h> #include <net/if_types.h> +#include <net/if_ether.h> #include <netinet/if_ether.h> #include <netinet/in.h> /* For M_LOOP */ @@ -85,12 +85,10 @@ #include <sys/socketvar.h> #include <net/if_vlan_var.h> +#include <net/if_bond_var.h> #include <net/dlil.h> -extern int vlan_demux(struct ifnet * ifp, struct mbuf *, - char * frame_header, struct if_proto * * proto); - #if LLC && CCITT extern struct ifqueue pkintrq; #endif @@ -107,340 +105,422 @@ extern struct ifqueue atalkintrq; #include <net/bridge.h> #endif -static u_long lo_dlt = 0; +#define memcpy(x,y,z) bcopy(y, x, z) + -#define IFP2AC(IFP) ((struct arpcom *)IFP) +SYSCTL_DECL(_net_link); +SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet"); struct en_desc { - u_int16_t type; /* Type of protocol stored in data */ - struct if_proto *proto; /* Protocol structure */ - u_long data[2]; /* Protocol data */ + u_int16_t type; /* Type of protocol stored in data */ + u_long protocol_family; /* Protocol family */ + u_long data[2]; /* Protocol data */ }; - -#define ETHER_DESC_BLK_SIZE (10) -#define MAX_INTERFACES 50 +/* descriptors are allocated in blocks of ETHER_DESC_BLK_SIZE */ +#define ETHER_DESC_BLK_SIZE (10) /* - * Statics for demux module + * Header for the demux list, hangs off of IFP at family_cookie */ struct ether_desc_blk_str { - u_long n_max_used; - u_long n_count; - struct en_desc *block_ptr; + u_long n_max_used; + u_long n_count; + u_long n_used; + struct en_desc block_ptr[1]; }; +/* Size of the above struct before the array of struct en_desc */ +#define ETHER_DESC_HEADER_SIZE ((size_t)&(((struct ether_desc_blk_str*)0)->block_ptr[0])) +__private_extern__ u_char etherbroadcastaddr[ETHER_ADDR_LEN] = + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; - -static struct ether_desc_blk_str ether_desc_blk[MAX_INTERFACES]; - - -/* from if_ethersubr.c */ -int ether_resolvemulti __P((struct ifnet *, struct sockaddr **, - struct sockaddr *)); +int ether_add_proto_old(struct ifnet *ifp, u_long protocol_family, struct 
ddesc_head_str *desc_head); +int ether_add_if(struct ifnet *ifp); +int ether_del_if(struct ifnet *ifp); +int ether_init_if(struct ifnet *ifp); +int ether_family_init(void); /* - * Release all descriptor entries owned by this dl_tag (there may be several). + * Release all descriptor entries owned by this protocol (there may be several). * Setting the type to 0 releases the entry. Eventually we should compact-out * the unused entries. */ -__private_extern__ int -ether_del_proto(struct if_proto *proto, u_long dl_tag) +int +ether_del_proto( + ifnet_t ifp, + protocol_family_t protocol_family) { - struct en_desc* ed = ether_desc_blk[proto->ifp->family_cookie].block_ptr; - u_long current = 0; - int found = 0; - - for (current = ether_desc_blk[proto->ifp->family_cookie].n_max_used; - current > 0; current--) { - if (ed[current - 1].proto == proto) { - found = 1; - ed[current - 1].type = 0; - - if (current == ether_desc_blk[proto->ifp->family_cookie].n_max_used) { - ether_desc_blk[proto->ifp->family_cookie].n_max_used--; - } - } - } - - return found; + struct ether_desc_blk_str *desc_blk = (struct ether_desc_blk_str *)ifp->family_cookie; + u_long current = 0; + int found = 0; + + if (desc_blk == NULL) + return 0; + + for (current = desc_blk->n_max_used; current > 0; current--) { + if (desc_blk->block_ptr[current - 1].protocol_family == protocol_family) { + found = 1; + desc_blk->block_ptr[current - 1].type = 0; + desc_blk->n_used--; + } + } + + if (desc_blk->n_used == 0) { + FREE(ifp->family_cookie, M_IFADDR); + ifp->family_cookie = 0; + } + else { + /* Decrement n_max_used */ + for (; desc_blk->n_max_used > 0 && desc_blk->block_ptr[desc_blk->n_max_used - 1].type == 0; desc_blk->n_max_used--) + ; + } + + return 0; } +static int +ether_add_proto_internal( + struct ifnet *ifp, + protocol_family_t protocol, + const struct ifnet_demux_desc *demux) +{ + struct en_desc *ed; + struct ether_desc_blk_str *desc_blk = (struct ether_desc_blk_str *)ifp->family_cookie; + u_int32_t i; + + switch (demux->type) { + /* These types are supported */ + /* Top three are preferred */ + case DLIL_DESC_ETYPE2: + if (demux->datalen != 2) { + return EINVAL; + } + break; + + case DLIL_DESC_SAP: + if (demux->datalen != 3) { + return EINVAL; + } + break; + + case DLIL_DESC_SNAP: + if (demux->datalen != 5) { + return EINVAL; + } + break; + + default: + return ENOTSUP; + } + + // Verify a matching descriptor does not exist. 
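/*
 * Bookkeeping note: n_count is the number of slots allocated, n_used
 * the number of live descriptors, and n_max_used the highest slot ever
 * occupied plus one, so the duplicate scan below and ether_demux() only
 * walk n_max_used entries. For illustration, a caller-built ETYPE2
 * descriptor that passes the datalen checks above might look like this
 * (hypothetical sketch):
 *
 *	u_int16_t ip_type = htons(ETHERTYPE_IP);
 *	struct ifnet_demux_desc d = { DLIL_DESC_ETYPE2, &ip_type, sizeof(ip_type) };
 */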
+ if (desc_blk != NULL) { + switch (demux->type) { + case DLIL_DESC_ETYPE2: + for (i = 0; i < desc_blk->n_max_used; i++) { + if (desc_blk->block_ptr[i].type == DLIL_DESC_ETYPE2 && + desc_blk->block_ptr[i].data[0] == + *(u_int16_t*)demux->data) { + return EADDRINUSE; + } + } + break; + case DLIL_DESC_SAP: + case DLIL_DESC_SNAP: + for (i = 0; i < desc_blk->n_max_used; i++) { + if (desc_blk->block_ptr[i].type == demux->type && + bcmp(desc_blk->block_ptr[i].data, demux->data, + demux->datalen) == 0) { + return EADDRINUSE; + } + } + break; + } + } + + // Check for the case where all of the descriptor entries are in use + if (desc_blk == NULL || desc_blk->n_used == desc_blk->n_count) { + struct ether_desc_blk_str *tmp; + u_long new_count = ETHER_DESC_BLK_SIZE; + u_long new_size; + u_long old_size = 0; + + i = 0; + + if (desc_blk) { + new_count += desc_blk->n_count; + old_size = desc_blk->n_count * sizeof(struct en_desc) + ETHER_DESC_HEADER_SIZE; + i = desc_blk->n_used; + } + + new_size = new_count * sizeof(struct en_desc) + ETHER_DESC_HEADER_SIZE; + + tmp = _MALLOC(new_size, M_IFADDR, M_WAITOK); + if (tmp == 0) { + /* + * The caller (ether_add_proto) removes any descriptors + * already added by this call. + */ + return ENOMEM; + } + + bzero(((char*)tmp) + old_size, new_size - old_size); + if (desc_blk) { + bcopy(desc_blk, tmp, old_size); + FREE(desc_blk, M_IFADDR); + } + desc_blk = tmp; + ifp->family_cookie = (u_long)desc_blk; + desc_blk->n_count = new_count; + } + else { + /* Find a free entry */ + for (i = 0; i < desc_blk->n_count; i++) { + if (desc_blk->block_ptr[i].type == 0) { + break; + } + } + } + + /* Bump n_max_used if appropriate */ + if (i + 1 > desc_blk->n_max_used) { + desc_blk->n_max_used = i + 1; + } + + ed = &desc_blk->block_ptr[i]; + ed->protocol_family = protocol; + ed->data[0] = 0; + ed->data[1] = 0; + + switch (demux->type) { + case DLIL_DESC_ETYPE2: + /* 2 byte ethernet raw protocol type is at demux->data */ + /* protocol must be in network byte order */ + ed->type = DLIL_DESC_ETYPE2; + ed->data[0] = *(u_int16_t*)demux->data; + break; + + case DLIL_DESC_SAP: + ed->type = DLIL_DESC_SAP; + bcopy(demux->data, &ed->data[0], 3); + break; + + case DLIL_DESC_SNAP: { + u_int8_t* pDest = ((u_int8_t*)&ed->data[0]) + 3; + ed->type = DLIL_DESC_SNAP; + bcopy(demux->data, pDest, 5); + } + break; + } + + desc_blk->n_used++; + + return 0; +} +int +ether_add_proto( + ifnet_t ifp, + protocol_family_t protocol, + const struct ifnet_demux_desc *demux_list, + u_int32_t demux_count) +{ + int error = 0; + u_int32_t i; + + for (i = 0; i < demux_count; i++) { + error = ether_add_proto_internal(ifp, protocol, &demux_list[i]); + if (error) { + ether_del_proto(ifp, protocol); + break; + } + } + + return error; +} __private_extern__ int -ether_add_proto(struct ddesc_head_str *desc_head, struct if_proto *proto, u_long dl_tag) +ether_add_proto_old( + struct ifnet *ifp, + u_long protocol_family, + struct ddesc_head_str *desc_head) { - char *current_ptr; - struct dlil_demux_desc *desc; - struct en_desc *ed; - struct en_desc *last; - u_long *bitmask; - u_long *proto_id; - u_long i; - short total_length; - u_long block_count; - u_long *tmp; - - - TAILQ_FOREACH(desc, desc_head, next) { - switch (desc->type) { - /* These types are supported */ - /* Top three are preferred */ - case DLIL_DESC_ETYPE2: - if (desc->variants.native_type_length != 2) - return EINVAL; - break; - - case DLIL_DESC_SAP: - if (desc->variants.native_type_length != 3) - return EINVAL; - break; - - case DLIL_DESC_SNAP: - if (desc->variants.native_type_length != 5) - return EINVAL; - break; - 
- case DLIL_DESC_802_2: - case DLIL_DESC_802_2_SNAP: - break; - - case DLIL_DESC_RAW: - if (desc->variants.bitmask.proto_id_length == 0) - break; - /* else fall through, bitmask variant not supported */ - - default: - ether_del_proto(proto, dl_tag); - return EINVAL; - } - - ed = ether_desc_blk[proto->ifp->family_cookie].block_ptr; - - /* Find a free entry */ - for (i = 0; i < ether_desc_blk[proto->ifp->family_cookie].n_count; i++) { - if (ed[i].type == 0) { - break; - } - } - - if (i >= ether_desc_blk[proto->ifp->family_cookie].n_count) { - u_long new_count = ETHER_DESC_BLK_SIZE + - ether_desc_blk[proto->ifp->family_cookie].n_count; - tmp = _MALLOC((new_count * (sizeof(*ed))), M_IFADDR, M_WAITOK); - if (tmp == 0) { - /* - * Remove any previous descriptors set in the call. - */ - ether_del_proto(proto, dl_tag); - return ENOMEM; - } - - bzero(tmp, new_count * sizeof(*ed)); - bcopy(ether_desc_blk[proto->ifp->family_cookie].block_ptr, - tmp, ether_desc_blk[proto->ifp->family_cookie].n_count * sizeof(*ed)); - FREE(ether_desc_blk[proto->ifp->family_cookie].block_ptr, M_IFADDR); - ether_desc_blk[proto->ifp->family_cookie].n_count = new_count; - ether_desc_blk[proto->ifp->family_cookie].block_ptr = (struct en_desc*)tmp; - ed = ether_desc_blk[proto->ifp->family_cookie].block_ptr; - } - - /* Bump n_max_used if appropriate */ - if (i + 1 > ether_desc_blk[proto->ifp->family_cookie].n_max_used) { - ether_desc_blk[proto->ifp->family_cookie].n_max_used = i + 1; - } - - ed[i].proto = proto; - ed[i].data[0] = 0; - ed[i].data[1] = 0; - - switch (desc->type) { - case DLIL_DESC_RAW: - /* 2 byte ethernet raw protocol type is at native_type */ - /* protocol is not in network byte order */ - ed[i].type = DLIL_DESC_ETYPE2; - ed[i].data[0] = htons(*(u_int16_t*)desc->native_type); - break; - - case DLIL_DESC_ETYPE2: - /* 2 byte ethernet raw protocol type is at native_type */ - /* prtocol must be in network byte order */ - ed[i].type = DLIL_DESC_ETYPE2; - ed[i].data[0] = *(u_int16_t*)desc->native_type; - break; - - case DLIL_DESC_802_2: - ed[i].type = DLIL_DESC_SAP; - ed[i].data[0] = *(u_int32_t*)&desc->variants.desc_802_2; - ed[i].data[0] &= htonl(0xFFFFFF00); - break; - - case DLIL_DESC_SAP: - ed[i].type = DLIL_DESC_SAP; - bcopy(desc->native_type, &ed[i].data[0], 3); - break; - - case DLIL_DESC_802_2_SNAP: - ed[i].type = DLIL_DESC_SNAP; - desc->variants.desc_802_2_SNAP.protocol_type = - htons(desc->variants.desc_802_2_SNAP.protocol_type); - bcopy(&desc->variants.desc_802_2_SNAP, &ed[i].data[0], 8); - ed[i].data[0] &= htonl(0x000000FF); - desc->variants.desc_802_2_SNAP.protocol_type = - ntohs(desc->variants.desc_802_2_SNAP.protocol_type); - break; - - case DLIL_DESC_SNAP: { - u_int8_t* pDest = ((u_int8_t*)&ed[i].data[0]) + 3; - ed[i].type = DLIL_DESC_SNAP; - bcopy(desc->native_type, pDest, 5); - } - break; - } - } - - return 0; + struct dlil_demux_desc *desc; + int error = 0; + + TAILQ_FOREACH(desc, desc_head, next) { + struct ifnet_demux_desc dmx; + int swapped = 0; + + // Convert dlil_demux_desc to ifnet_demux_desc + dmx.type = desc->type; + dmx.datalen = desc->variants.native_type_length; + dmx.data = desc->native_type; + +#ifdef DLIL_DESC_RAW + if (dmx.type == DLIL_DESC_RAW) { + swapped = 1; + dmx.type = DLIL_DESC_ETYPE2; + dmx.datalen = 2; + *(u_int16_t*)dmx.data = htons(*(u_int16_t*)dmx.data); + } +#endif + + error = ether_add_proto_internal(ifp, protocol_family, &dmx); + if (swapped) { + *(u_int16_t*)dmx.data = ntohs(*(u_int16_t*)dmx.data); + swapped = 0; + } + if (error) { + ether_del_proto(ifp, 
protocol_family); + break; + } + } + + return error; } -static -int ether_shutdown() +static int +ether_shutdown(void) { return 0; } -int ether_demux(ifp, m, frame_header, proto) - struct ifnet *ifp; - struct mbuf *m; - char *frame_header; - struct if_proto **proto; - +int +ether_demux( + ifnet_t ifp, + mbuf_t m, + char *frame_header, + protocol_family_t *protocol_family) { - register struct ether_header *eh = (struct ether_header *)frame_header; - u_short ether_type = eh->ether_type; - u_short ether_type_host; - u_int16_t type; - u_int8_t *data; - u_long i = 0; - u_long max = ether_desc_blk[ifp->family_cookie].n_max_used; - struct en_desc *ed = ether_desc_blk[ifp->family_cookie].block_ptr; - u_int32_t extProto1 = 0; - u_int32_t extProto2 = 0; - - if (eh->ether_dhost[0] & 1) { - /* Check for broadcast */ - if (*(u_int32_t*)eh->ether_dhost == 0xFFFFFFFF && - *(u_int16_t*)(eh->ether_dhost + sizeof(u_int32_t)) == 0xFFFF) - m->m_flags |= M_BCAST; - else - m->m_flags |= M_MCAST; - } else { - /* - * When the driver is put into promiscuous mode we may receive unicast - * frames that are not intended for our interfaces. They are filtered - * here to keep them from traveling further up the stack to code that - * is not expecting them or prepared to deal with them. In the near - * future, the filtering done here will be moved even further down the - * stack into the IONetworkingFamily, preventing even interface - * filter NKE's from receiving promiscuous packets. Please use BPF. - */ - #define ETHER_CMP(x, y) ( ((u_int16_t *) x)[0] != ((u_int16_t *) y)[0] || \ - ((u_int16_t *) x)[1] != ((u_int16_t *) y)[1] || \ - ((u_int16_t *) x)[2] != ((u_int16_t *) y)[2] ) - - if (ETHER_CMP(eh->ether_dhost, ((struct arpcom *) ifp)->ac_enaddr)) { - m_freem(m); - return EJUSTRETURN; - } - } - ether_type_host = ntohs(ether_type); - if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) - || ether_type_host == ETHERTYPE_VLAN) { - return (vlan_demux(ifp, m, frame_header, proto)); - } - data = mtod(m, u_int8_t*); - - /* - * Determine the packet's protocol type and stuff the protocol into - * longs for quick compares. - */ - if (ether_type_host <= 1500) { - extProto1 = *(u_int32_t*)data; - - // SAP or SNAP - if ((extProto1 & htonl(0xFFFFFF00)) == htonl(0xAAAA0300)) { - // SNAP - type = DLIL_DESC_SNAP; - extProto2 = *(u_int32_t*)(data + sizeof(u_int32_t)); - extProto1 &= htonl(0x000000FF); - } else { - type = DLIL_DESC_SAP; - extProto1 &= htonl(0xFFFFFF00); - } - } else { - type = DLIL_DESC_ETYPE2; - } - - /* - * Search through the connected protocols for a match. - */ - - switch (type) { - case DLIL_DESC_ETYPE2: - for (i = 0; i < max; i++) { - if ((ed[i].type == type) && (ed[i].data[0] == ether_type)) { - *proto = ed[i].proto; - return 0; - } - } - break; - - case DLIL_DESC_SAP: - for (i = 0; i < max; i++) { - if ((ed[i].type == type) && (ed[i].data[0] == extProto1)) { - *proto = ed[i].proto; - return 0; - } - } - break; - - case DLIL_DESC_SNAP: - for (i = 0; i < max; i++) { - if ((ed[i].type == type) && (ed[i].data[0] == extProto1) && - (ed[i].data[1] == extProto2)) { - *proto = ed[i].proto; - return 0; - } - } - break; - } - - return ENOENT; -} + struct ether_header *eh = (struct ether_header *)frame_header; + u_short ether_type = eh->ether_type; + u_int16_t type; + u_int8_t *data; + u_long i = 0; + struct ether_desc_blk_str *desc_blk = (struct ether_desc_blk_str *)ifp->family_cookie; + u_long maxd = desc_blk ? desc_blk->n_max_used : 0; + struct en_desc *ed = desc_blk ? 
desc_blk->block_ptr : NULL; + u_int32_t extProto1 = 0; + u_int32_t extProto2 = 0; + + if (eh->ether_dhost[0] & 1) { + /* Check for broadcast */ + if (*(u_int32_t*)eh->ether_dhost == 0xFFFFFFFF && + *(u_int16_t*)(eh->ether_dhost + sizeof(u_int32_t)) == 0xFFFF) + m->m_flags |= M_BCAST; + else + m->m_flags |= M_MCAST; + } + if (ifp->if_eflags & IFEF_BOND) { + /* if we're bonded, bond "protocol" gets all the packets */ + *protocol_family = PF_BOND; + return (0); + } + if ((eh->ether_dhost[0] & 1) == 0) { + /* + * When the driver is put into promiscuous mode we may receive unicast + * frames that are not intended for our interfaces. They are marked here + * as being promiscuous so the caller may dispose of them after passing + * the packets to any interface filters. + */ + #define ETHER_CMP(x, y) ( ((u_int16_t *) x)[0] != ((u_int16_t *) y)[0] || \ + ((u_int16_t *) x)[1] != ((u_int16_t *) y)[1] || \ + ((u_int16_t *) x)[2] != ((u_int16_t *) y)[2] ) + + if (ETHER_CMP(eh->ether_dhost, ifnet_lladdr(ifp))) { + m->m_flags |= M_PROMISC; + } + } + + /* Quick check for VLAN */ + if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0 || + ether_type == htons(ETHERTYPE_VLAN)) { + *protocol_family = PF_VLAN; + return 0; + } + + data = mtod(m, u_int8_t*); + + /* + * Determine the packet's protocol type and stuff the protocol into + * longs for quick compares. + */ + + if (ntohs(ether_type) <= 1500) { + extProto1 = *(u_int32_t*)data; + + // SAP or SNAP + if ((extProto1 & htonl(0xFFFFFF00)) == htonl(0xAAAA0300)) { + // SNAP + type = DLIL_DESC_SNAP; + extProto2 = *(u_int32_t*)(data + sizeof(u_int32_t)); + extProto1 &= htonl(0x000000FF); + } else { + type = DLIL_DESC_SAP; + extProto1 &= htonl(0xFFFFFF00); + } + } else { + type = DLIL_DESC_ETYPE2; + } + + /* + * Search through the connected protocols for a match. + */ + + switch (type) { + case DLIL_DESC_ETYPE2: + for (i = 0; i < maxd; i++) { + if ((ed[i].type == type) && (ed[i].data[0] == ether_type)) { + *protocol_family = ed[i].protocol_family; + return 0; + } + } + break; + + case DLIL_DESC_SAP: + for (i = 0; i < maxd; i++) { + if ((ed[i].type == type) && (ed[i].data[0] == extProto1)) { + *protocol_family = ed[i].protocol_family; + return 0; + } + } + break; + + case DLIL_DESC_SNAP: + for (i = 0; i < maxd; i++) { + if ((ed[i].type == type) && (ed[i].data[0] == extProto1) && + (ed[i].data[1] == extProto2)) { + *protocol_family = ed[i].protocol_family; + return 0; + } + } + break; + } + + return ENOENT; +} /* * Ethernet output routine. * Encapsulate a packet of type family for the local net. * Use trailer local net encapsulation if enough data in first * packet leaves a multiple of 512 bytes of data in remainder. - * Assumes that ifp is actually pointer to arpcom structure. 
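 * (In the rewritten routine below, the source hardware address comes
 * from ifnet_lladdr_copy_bytes() rather than from an arpcom overlay,
 * which is why the assumption above is being dropped.)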
*/ int -ether_frameout(ifp, m, ndest, edst, ether_type) - register struct ifnet *ifp; - struct mbuf **m; - struct sockaddr *ndest; - char *edst; - char *ether_type; +ether_frameout( + struct ifnet *ifp, + struct mbuf **m, + const struct sockaddr *ndest, + const char *edst, + const char *ether_type) { - register struct ether_header *eh; + struct ether_header *eh; int hlen; /* link layer header length */ - struct arpcom *ac = IFP2AC(ifp); - hlen = ETHER_HDR_LEN; @@ -455,26 +535,21 @@ ether_frameout(ifp, m, ndest, edst, ether_type) */ if ((ifp->if_flags & IFF_SIMPLEX) && ((*m)->m_flags & M_LOOP)) { - if (lo_dlt == 0) - dlil_find_dltag(APPLE_IF_FAM_LOOPBACK, 0, PF_INET, &lo_dlt); - - if (lo_dlt) { + if (lo_ifp) { if ((*m)->m_flags & M_BCAST) { struct mbuf *n = m_copy(*m, 0, (int)M_COPYALL); if (n != NULL) - dlil_output(lo_dlt, n, 0, ndest, 0); - } - else - { - if (bcmp(edst, ac->ac_enaddr, ETHER_ADDR_LEN) == 0) { - dlil_output(lo_dlt, *m, 0, ndest, 0); + dlil_output(lo_ifp, ndest->sa_family, n, 0, ndest, 0); + } + else { + if (bcmp(edst, ifnet_lladdr(ifp), ETHER_ADDR_LEN) == 0) { + dlil_output(lo_ifp, ndest->sa_family, *m, 0, ndest, 0); return EJUSTRETURN; } } } } - /* * Add local net header. If no space in first mbuf, * allocate another. @@ -489,41 +564,17 @@ ether_frameout(ifp, m, ndest, edst, ether_type) (void)memcpy(&eh->ether_type, ether_type, sizeof(eh->ether_type)); (void)memcpy(eh->ether_dhost, edst, 6); - (void)memcpy(eh->ether_shost, ac->ac_enaddr, - sizeof(eh->ether_shost)); + ifnet_lladdr_copy_bytes(ifp, eh->ether_shost, ETHER_ADDR_LEN); return 0; } - __private_extern__ int ether_add_if(struct ifnet *ifp) { - u_long i; - - ifp->if_framer = ether_frameout; - ifp->if_demux = ether_demux; - ifp->if_event = 0; - ifp->if_resolvemulti = ether_resolvemulti; - ifp->if_nvlans = 0; - - for (i=0; i < MAX_INTERFACES; i++) - if (ether_desc_blk[i].n_count == 0) - break; - - if (i == MAX_INTERFACES) - return ENOMEM; - - ether_desc_blk[i].block_ptr = _MALLOC(ETHER_DESC_BLK_SIZE * sizeof(struct en_desc), - M_IFADDR, M_WAITOK); - if (ether_desc_blk[i].block_ptr == 0) - return ENOMEM; - - ether_desc_blk[i].n_count = ETHER_DESC_BLK_SIZE; - bzero(ether_desc_blk[i].block_ptr, ETHER_DESC_BLK_SIZE * sizeof(struct en_desc)); - - ifp->family_cookie = i; + ifp->if_framer = ether_frameout; + ifp->if_demux = ether_demux; return 0; } @@ -531,108 +582,95 @@ ether_add_if(struct ifnet *ifp) __private_extern__ int ether_del_if(struct ifnet *ifp) { - if ((ifp->family_cookie < MAX_INTERFACES) && - (ether_desc_blk[ifp->family_cookie].n_count)) - { - FREE(ether_desc_blk[ifp->family_cookie].block_ptr, M_IFADDR); - ether_desc_blk[ifp->family_cookie].block_ptr = NULL; - ether_desc_blk[ifp->family_cookie].n_count = 0; - ether_desc_blk[ifp->family_cookie].n_max_used = 0; - return 0; - } - else - return ENOENT; + if (ifp->family_cookie) { + FREE(ifp->family_cookie, M_IFADDR); + return 0; + } + else + return ENOENT; } __private_extern__ int ether_init_if(struct ifnet *ifp) { - register struct ifaddr *ifa; - register struct sockaddr_dl *sdl; - - ifa = ifnet_addrs[ifp->if_index - 1]; - if (ifa == 0) { - printf("ether_ifattach: no lladdr!\n"); - return (EINVAL); - } - sdl = (struct sockaddr_dl *)ifa->ifa_addr; - sdl->sdl_type = IFT_ETHER; - sdl->sdl_alen = ifp->if_addrlen; - bcopy((IFP2AC(ifp))->ac_enaddr, LLADDR(sdl), ifp->if_addrlen); - + /* + * Copy ethernet address out of old style arpcom. New + * interfaces created using the KPIs will not have an + * interface family. 
Those interfaces will have the + * lladdr passed in when the interface is created. + */ + u_char *enaddr = ((u_char*)ifp) + sizeof(struct ifnet); + ifnet_set_lladdr(ifp, enaddr, 6); + bzero(enaddr, 6); + return 0; } +errno_t +ether_check_multi( + ifnet_t ifp, + const struct sockaddr *proto_addr) +{ + errno_t result = EAFNOSUPPORT; + const u_char *e_addr; + + /* + * AF_UNSPEC and AF_LINK don't require translation. We do + * want to verify that they specify a valid multicast. + */ + switch(proto_addr->sa_family) { + case AF_UNSPEC: + e_addr = (const u_char*)&proto_addr->sa_data[0]; + if ((e_addr[0] & 0x01) != 0x01) + result = EADDRNOTAVAIL; + else + result = 0; + break; + + case AF_LINK: + e_addr = CONST_LLADDR((const struct sockaddr_dl*)proto_addr); + if ((e_addr[0] & 0x01) != 0x01) + result = EADDRNOTAVAIL; + else + result = 0; + break; + } + + return result; +} + int -ether_ifmod_ioctl(ifp, command, data) - struct ifnet *ifp; - u_long command; - caddr_t data; +ether_ioctl( + __unused ifnet_t ifp, + __unused u_int32_t command, + __unused void* data) { - struct rslvmulti_req *rsreq = (struct rslvmulti_req *) data; - int error = 0; - struct sockaddr_dl *sdl; - struct sockaddr_in *sin; - u_char *e_addr; - - - switch (command) { - case SIOCRSLVMULTI: - switch(rsreq->sa->sa_family) { - case AF_UNSPEC: - /* AppleTalk uses AF_UNSPEC for multicast registration. - * No mapping needed. Just check that it's a valid MC address. - */ - e_addr = &rsreq->sa->sa_data[0]; - if ((e_addr[0] & 1) != 1) - return EADDRNOTAVAIL; - *rsreq->llsa = 0; - return EJUSTRETURN; - - - case AF_LINK: - /* - * No mapping needed. Just check that it's a valid MC address. - */ - sdl = (struct sockaddr_dl *)rsreq->sa; - e_addr = LLADDR(sdl); - if ((e_addr[0] & 1) != 1) - return EADDRNOTAVAIL; - *rsreq->llsa = 0; - return EJUSTRETURN; - - default: - return EAFNOSUPPORT; - } - - default: - return EOPNOTSUPP; - } + return EOPNOTSUPP; } -extern int ether_attach_inet(struct ifnet *ifp, u_long *dl_tag); -extern int ether_detach_inet(struct ifnet *ifp, u_long dl_tag); -extern int ether_attach_inet6(struct ifnet *ifp, u_long *dl_tag); -extern int ether_detach_inet6(struct ifnet *ifp, u_long dl_tag); -int ether_family_init() +extern int ether_attach_inet(struct ifnet *ifp, u_long proto_family); +extern int ether_detach_inet(struct ifnet *ifp, u_long proto_family); +extern int ether_attach_inet6(struct ifnet *ifp, u_long proto_family); +extern int ether_detach_inet6(struct ifnet *ifp, u_long proto_family); + +extern void kprintf(const char *, ...); + +int ether_family_init(void) { - int i, error=0; + int error=0; struct dlil_ifmod_reg_str ifmod_reg; - struct dlil_protomod_reg_str enet_protoreg; - extern int vlan_family_init(void); /* ethernet family is built-in, called from bsd_init */ - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); bzero(&ifmod_reg, sizeof(ifmod_reg)); ifmod_reg.add_if = ether_add_if; ifmod_reg.del_if = ether_del_if; ifmod_reg.init_if = ether_init_if; - ifmod_reg.add_proto = ether_add_proto; + ifmod_reg.add_proto = ether_add_proto_old; ifmod_reg.del_proto = ether_del_proto; - ifmod_reg.ifmod_ioctl = ether_ifmod_ioctl; + ifmod_reg.ifmod_ioctl = ether_ioctl; ifmod_reg.shutdown = ether_shutdown; if (dlil_reg_if_modules(APPLE_IF_FAM_ETHERNET, &ifmod_reg)) { @@ -641,29 +679,24 @@ int ether_family_init() goto done; } + /* Register protocol registration functions */ - /* Register protocol registration functions */ - - bzero(&enet_protoreg, sizeof(enet_protoreg)); - enet_protoreg.attach_proto = ether_attach_inet; - 
enet_protoreg.detach_proto = ether_detach_inet; - - if (error = dlil_reg_proto_module(PF_INET, APPLE_IF_FAM_ETHERNET, &enet_protoreg) != 0) { - printf("ether_family_init: dlil_reg_proto_module failed for AF_INET error=%d\n", error); - goto done; - } - - enet_protoreg.attach_proto = ether_attach_inet6; - enet_protoreg.detach_proto = ether_detach_inet6; - - if (error = dlil_reg_proto_module(PF_INET6, APPLE_IF_FAM_ETHERNET, &enet_protoreg) != 0) { - printf("ether_family_init: dlil_reg_proto_module failed for AF_INET6 error=%d\n", error); - goto done; - } - vlan_family_init(); + if ((error = dlil_reg_proto_module(PF_INET, APPLE_IF_FAM_ETHERNET, + ether_attach_inet, ether_detach_inet)) != 0) { + kprintf("dlil_reg_proto_module failed for AF_INET error=%d\n", error); + goto done; + } + + + if ((error = dlil_reg_proto_module(PF_INET6, APPLE_IF_FAM_ETHERNET, + ether_attach_inet6, ether_detach_inet6)) != 0) { + kprintf("dlil_reg_proto_module failed for AF_INET6 error=%d\n", error); + goto done; + } + vlan_family_init(); + bond_family_init(); done: - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); return (error); } diff --git a/bsd/net/ether_inet6_pr_module.c b/bsd/net/ether_inet6_pr_module.c index e1ac8180f..1c2175efe 100644 --- a/bsd/net/ether_inet6_pr_module.c +++ b/bsd/net/ether_inet6_pr_module.c @@ -63,14 +63,15 @@ #include <sys/socket.h> #include <sys/sockio.h> #include <sys/sysctl.h> +#include <kern/lock.h> #include <net/if.h> -#include <net/netisr.h> #include <net/route.h> #include <net/if_llc.h> #include <net/if_dl.h> #include <net/if_types.h> #include <net/ndrv.h> +#include <net/kpi_protocol.h> #include <netinet/in.h> #include <netinet/in_var.h> @@ -104,252 +105,112 @@ extern struct ifqueue pkintrq; #include <net/if_vlan_var.h> #endif /* NVLAN > 0 */ -static u_long lo_dlt = 0; -static ivedonethis = 0; -static u_char etherbroadcastaddr[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; - -#define IFP2AC(IFP) ((struct arpcom *)IFP) - - - +/* Local function declarations */ +int ether_attach_inet6(struct ifnet *ifp, u_long protocol_family); +int ether_detach_inet6(struct ifnet *ifp, u_long protocol_family); /* * Process a received Ethernet packet; * the packet is in the mbuf chain m without * the ether header, which is provided separately.
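 * With the KPI input path the chain is handed straight to proto_input();
 * the per-protocol ifqueue, splimp() and schednetisr() machinery in the
 * removed code below is gone.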
*/ -int -inet6_ether_input(m, frame_header, ifp, dl_tag, sync_ok) - struct mbuf *m; - char *frame_header; - struct ifnet *ifp; - u_long dl_tag; - int sync_ok; - +static errno_t +inet6_ether_input( + __unused ifnet_t ifp, + protocol_family_t protocol, + mbuf_t packet, + __unused char *header) { - register struct ether_header *eh = (struct ether_header *) frame_header; - register struct ifqueue *inq=0; - u_short ether_type; - int s; - u_int16_t ptype = -1; - unsigned char buf[18]; - - - - if ((ifp->if_flags & IFF_UP) == 0) { - m_freem(m); - return EJUSTRETURN; - } - - ifp->if_lastchange = time; - - if (eh->ether_dhost[0] & 1) { - if (bcmp((caddr_t)etherbroadcastaddr, (caddr_t)eh->ether_dhost, - sizeof(etherbroadcastaddr)) == 0) - m->m_flags |= M_BCAST; - else - m->m_flags |= M_MCAST; - } - if (m->m_flags & (M_BCAST|M_MCAST)) - ifp->if_imcasts++; - - ether_type = ntohs(eh->ether_type); - - - switch (ether_type) { + proto_input(protocol, packet); + return 0; +} - case ETHERTYPE_IPV6: - schednetisr(NETISR_IPV6); - inq = &ip6intrq; - break; +static errno_t +inet6_ether_pre_output( + ifnet_t ifp, + __unused protocol_family_t protocol_family, + mbuf_t *m0, + const struct sockaddr *dst_netaddr, + void *route, + char *type, + char *edst) +{ + errno_t result; + struct sockaddr_dl sdl; + register struct mbuf *m = *m0; - default: { - return ENOENT; + /* + * Tell ether_frameout it's ok to loop packet if necessary + */ + m->m_flags |= M_LOOP; + + result = nd6_lookup_ipv6(ifp, (const struct sockaddr_in6*)dst_netaddr, + &sdl, sizeof(sdl), route, *m0); + + if (result == 0) { + *(u_int16_t*)type = htons(ETHERTYPE_IPV6); + bcopy(LLADDR(&sdl), edst, sdl.sdl_alen); } - } - - if (inq == 0) - return ENOENT; - s = splimp(); - if (IF_QFULL(inq)) { - IF_DROP(inq); - m_freem(m); - splx(s); - return EJUSTRETURN; - } else - IF_ENQUEUE(inq, m); - splx(s); - return 0; + return result; } - - - -int -inet6_ether_pre_output(ifp, m0, dst_netaddr, route, type, edst, dl_tag ) - struct ifnet *ifp; - struct mbuf **m0; - struct sockaddr *dst_netaddr; - caddr_t route; - char *type; - char *edst; - u_long dl_tag; +static int +ether_inet6_resolve_multi( + ifnet_t ifp, + const struct sockaddr *proto_addr, + struct sockaddr_dl *out_ll, + size_t ll_len) { - struct rtentry *rt0 = (struct rtentry *) route; - int s; - register struct mbuf *m = *m0; - register struct rtentry *rt; - register struct ether_header *eh; - int hlen; /* link layer header lenght */ - struct arpcom *ac = IFP2AC(ifp); - - - - if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING)) - return ENETDOWN; - - rt = rt0; - if (rt) { - if ((rt->rt_flags & RTF_UP) == 0) { - rt0 = rt = rtalloc1(dst_netaddr, 1, 0UL); - if (rt0) - rtunref(rt); - else - return EHOSTUNREACH; - } - - if (rt->rt_flags & RTF_GATEWAY) { - if (rt->rt_gwroute == 0) - goto lookup; - if (((rt = rt->rt_gwroute)->rt_flags & RTF_UP) == 0) { - rtfree(rt); rt = rt0; - lookup: rt->rt_gwroute = rtalloc1(rt->rt_gateway, 1, - 0UL); - if ((rt = rt->rt_gwroute) == 0) - return (EHOSTUNREACH); - } - } - + static const size_t minsize = offsetof(struct sockaddr_dl, sdl_data[0]) + ETHER_ADDR_LEN; + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6*)proto_addr; - if (rt->rt_flags & RTF_REJECT) - if (rt->rt_rmx.rmx_expire == 0 || - time_second < rt->rt_rmx.rmx_expire) - return (rt == rt0 ? EHOSTDOWN : EHOSTUNREACH); - } - - hlen = ETHER_HDR_LEN; - - /* - * Tell ether_frameout it's ok to loop packet unless negated below. 
- */ - m->m_flags |= M_LOOP; - - switch (dst_netaddr->sa_family) { - - - case AF_INET6: - if (!nd6_storelladdr(&ac->ac_if, rt, m, dst_netaddr, (u_char *)edst)) { - /* this must be impossible, so we bark */ - printf("nd6_storelladdr failed\n"); - return(EADDRNOTAVAIL); /* dlil_output will free the mbuf */ - } - *(u_short *)type = htons(ETHERTYPE_IPV6); - break; - - default: - printf("%s%d: can't handle af%d\n", ifp->if_name, ifp->if_unit, - dst_netaddr->sa_family); - - /* dlil_output will free the mbuf */ - return EAFNOSUPPORT; - } - - return (0); + if (proto_addr->sa_family != AF_INET6) + return EAFNOSUPPORT; + + if (proto_addr->sa_len < sizeof(struct sockaddr_in6)) + return EINVAL; + + if (ll_len < minsize) + return EMSGSIZE; + + bzero(out_ll, minsize); + out_ll->sdl_len = minsize; + out_ll->sdl_family = AF_LINK; + out_ll->sdl_index = ifp->if_index; + out_ll->sdl_type = IFT_ETHER; + out_ll->sdl_nlen = 0; + out_ll->sdl_alen = ETHER_ADDR_LEN; + out_ll->sdl_slen = 0; + ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, LLADDR(out_ll)); + + return 0; } -int -ether_inet6_prmod_ioctl(dl_tag, ifp, command, data) - u_long dl_tag; - struct ifnet *ifp; - int command; - caddr_t data; +static errno_t +ether_inet6_prmod_ioctl( + ifnet_t ifp, + __unused protocol_family_t protocol_family, + u_int32_t command, + void* data) { - struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; - struct rslvmulti_req *rsreq = (struct rslvmulti_req *) data; int error = 0; - boolean_t funnel_state; - struct arpcom *ac = (struct arpcom *) ifp; - struct sockaddr_dl *sdl; - struct sockaddr_in *sin; - struct sockaddr_in6 *sin6; - - u_char *e_addr; - switch (command) { - case SIOCRSLVMULTI: { - switch(rsreq->sa->sa_family) { - - case AF_INET6: - sin6 = (struct sockaddr_in6 *)rsreq->sa; - if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { - /* - * An IP6 address of 0 means listen to all - * of the Ethernet multicast address used for IP6. - * (This is used for multicast routers.) - */ - ifp->if_flags |= IFF_ALLMULTI; - *rsreq->llsa = 0; - return 0; - } - MALLOC(sdl, struct sockaddr_dl *, sizeof *sdl, M_IFMADDR, - M_WAITOK); - sdl->sdl_len = sizeof *sdl; - sdl->sdl_family = AF_LINK; - sdl->sdl_index = ifp->if_index; - sdl->sdl_type = IFT_ETHER; - sdl->sdl_nlen = 0; - sdl->sdl_alen = ETHER_ADDR_LEN; - sdl->sdl_slen = 0; - e_addr = LLADDR(sdl); - ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); -#ifndef __APPLE__ - printf("ether_resolvemulti AF_INET6 Adding %x:%x:%x:%x:%x:%x\n", - e_addr[0], e_addr[1], e_addr[2], e_addr[3], e_addr[4], e_addr[5]); -#endif - *rsreq->llsa = (struct sockaddr *)sdl; - return 0; - - default: - /* - * Well, the text isn't quite right, but it's the name - * that counts... 
- */ - return EAFNOSUPPORT; - } - - } case SIOCSIFADDR: if ((ifp->if_flags & IFF_RUNNING) == 0) { - ifp->if_flags |= IFF_UP; + ifnet_set_flags(ifp, IFF_UP, IFF_UP); dlil_ioctl(0, ifp, SIOCSIFFLAGS, (caddr_t) 0); } break; case SIOCGIFADDR: - { - struct sockaddr *sa; - - sa = (struct sockaddr *) & ifr->ifr_data; - bcopy(IFP2AC(ifp)->ac_enaddr, - (caddr_t) sa->sa_data, ETHER_ADDR_LEN); - } + ifnet_lladdr_copy_bytes(ifp, ifr->ifr_addr.sa_data, ETHER_ADDR_LEN); break; case SIOCSIFMTU: @@ -366,59 +227,48 @@ ether_inet6_prmod_ioctl(dl_tag, ifp, command, data) return (error); } - - - - -int ether_attach_inet6(struct ifnet *ifp, u_long *dl_tag) +int +ether_attach_inet6( + struct ifnet *ifp, + u_long protocol_family) { - struct dlil_proto_reg_str reg; - struct dlil_demux_desc desc; - u_short en_6native=ETHERTYPE_IPV6; - int stat; - int i; - - - stat = dlil_find_dltag(ifp->if_family, ifp->if_unit, PF_INET6, dl_tag); - if (stat == 0) - return stat; - - TAILQ_INIT(&reg.demux_desc_head); - desc.type = DLIL_DESC_RAW; - desc.variants.bitmask.proto_id_length = 0; - desc.variants.bitmask.proto_id = 0; - desc.variants.bitmask.proto_id_mask = 0; - desc.native_type = (char *) &en_6native; - TAILQ_INSERT_TAIL(&reg.demux_desc_head, &desc, next); - reg.interface_family = ifp->if_family; - reg.unit_number = ifp->if_unit; - reg.input = inet6_ether_input; - reg.pre_output = inet6_ether_pre_output; - reg.event = 0; - reg.offer = 0; - reg.ioctl = ether_inet6_prmod_ioctl; - reg.default_proto = 0; - reg.protocol_family = PF_INET6; - - stat = dlil_attach_protocol(&reg, dl_tag); - if (stat) { - printf("WARNING: ether_attach_inet6 can't attach ip to interface\n"); - } - - return stat; + struct ifnet_attach_proto_param proto; + struct ifnet_demux_desc demux[1]; + u_short en_6native=htons(ETHERTYPE_IPV6); + errno_t error; + + bzero(&proto, sizeof(proto)); + demux[0].type = DLIL_DESC_ETYPE2; + demux[0].data = &en_6native; + demux[0].datalen = sizeof(en_6native); + proto.demux_list = demux; + proto.demux_count = 1; + proto.input = inet6_ether_input; + proto.pre_output = inet6_ether_pre_output; + proto.ioctl = ether_inet6_prmod_ioctl; + proto.resolve = ether_inet6_resolve_multi; + error = ifnet_attach_protocol(ifp, protocol_family, &proto); + if (error && error != EEXIST) { + printf("WARNING: ether_attach_inet6 can't attach ipv6 to %s%d\n", + ifp->if_name, ifp->if_unit); + } + + return error; } -int ether_detach_inet6(struct ifnet *ifp, u_long dl_tag) +int +ether_detach_inet6( + struct ifnet *ifp, + u_long protocol_family) { - int stat; - - stat = dlil_find_dltag(ifp->if_family, ifp->if_unit, PF_INET6, &dl_tag); - if (stat == 0) { - stat = dlil_detach_protocol(dl_tag); - if (stat) { - printf("WARNING: ether_detach_inet6 can't detach ip6 from interface\n"); - } - } - return stat; + errno_t error; + + error = ifnet_detach_protocol(ifp, protocol_family); + if (error && error != ENOENT) { + printf("WARNING: ether_detach_inet6 can't detach ipv6 from %s%d\n", + ifp->if_name, ifp->if_unit); + } + + return error; } diff --git a/bsd/net/ether_inet_pr_module.c b/bsd/net/ether_inet_pr_module.c index 73ae154ac..43d413070 100644 --- a/bsd/net/ether_inet_pr_module.c +++ b/bsd/net/ether_inet_pr_module.c @@ -63,19 +63,21 @@ #include <sys/socket.h> #include <sys/sockio.h> #include <sys/sysctl.h> +#include <kern/lock.h> #include <net/if.h> -#include <net/netisr.h> #include <net/route.h> #include <net/if_llc.h> #include <net/if_dl.h> #include <net/if_types.h> +#include <net/kpi_protocol.h> #include <netinet/in.h> #include <netinet/in_var.h>
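/*
 * For reference, the attach pattern used by ether_attach_inet6() above
 * reduces to the following minimal sketch (hypothetical caller;
 * my_input stands in for a real protocol input handler):
 *
 *	static errno_t
 *	attach_my_proto(ifnet_t ifp)
 *	{
 *		struct ifnet_attach_proto_param proto;
 *		struct ifnet_demux_desc demux[1];
 *		u_short etype = htons(ETHERTYPE_IPV6);
 *
 *		bzero(&proto, sizeof(proto));
 *		demux[0].type = DLIL_DESC_ETYPE2;
 *		demux[0].data = &etype;
 *		demux[0].datalen = sizeof(etype);
 *		proto.demux_list = demux;
 *		proto.demux_count = 1;
 *		proto.input = my_input;
 *		return ifnet_attach_protocol(ifp, PF_INET6, &proto);
 *	}
 */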
#include <netinet/if_ether.h> #include <netinet/in_systm.h> #include <netinet/ip.h> +#include <netinet/in_arp.h> #include <sys/socketvar.h> @@ -85,7 +87,6 @@ extern struct ifqueue pkintrq; #endif - #if BRIDGE #include <net/bridge.h> #endif @@ -95,283 +96,217 @@ extern struct ifqueue pkintrq; #include <net/if_vlan_var.h> #endif /* NVLAN > 0 */ -static u_long lo_dlt = 0; -static ivedonethis = 0; -static u_char etherbroadcastaddr[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; - -#define IFP2AC(IFP) ((struct arpcom *)IFP) +/* Local function declarations */ +int ether_attach_inet(struct ifnet *ifp, u_long proto_family); +int ether_detach_inet(struct ifnet *ifp, u_long proto_family); +extern void * kdp_get_interface(void); +extern void ipintr(void); +extern void arp_input(struct mbuf* m); - -extern void * kdp_get_interface(); +static void +inet_ether_arp_input( + struct mbuf *m) +{ + struct ether_arp *ea; + struct sockaddr_dl sender_hw; + struct sockaddr_in sender_ip; + struct sockaddr_in target_ip; + + if (mbuf_len(m) < sizeof(*ea) && + mbuf_pullup(&m, sizeof(*ea)) != 0) + return; + + ea = mbuf_data(m); + + /* Verify this is an ethernet/ip arp and address lengths are correct */ + if (ntohs(ea->arp_hrd) != ARPHRD_ETHER || + ntohs(ea->arp_pro) != ETHERTYPE_IP || + ea->arp_pln != sizeof(struct in_addr) || + ea->arp_hln != ETHER_ADDR_LEN) { + mbuf_free(m); + return; + } + + /* Verify the sender is not broadcast or multicast */ + if ((ea->arp_sha[0] & 0x01) != 0) { + mbuf_free(m); + return; + } + + bzero(&sender_ip, sizeof(sender_ip)); + sender_ip.sin_len = sizeof(sender_ip); + sender_ip.sin_family = AF_INET; + sender_ip.sin_addr = *(struct in_addr*)ea->arp_spa; + target_ip = sender_ip; + target_ip.sin_addr = *(struct in_addr*)ea->arp_tpa; + + bzero(&sender_hw, sizeof(sender_hw)); + sender_hw.sdl_len = sizeof(sender_hw); + sender_hw.sdl_family = AF_LINK; + sender_hw.sdl_type = IFT_ETHER; + sender_hw.sdl_alen = ETHER_ADDR_LEN; + bcopy(ea->arp_sha, LLADDR(&sender_hw), ETHER_ADDR_LEN); + + arp_ip_handle_input(mbuf_pkthdr_rcvif(m), ntohs(ea->arp_op), &sender_hw, &sender_ip, &target_ip); + mbuf_free(m); +} /* * Process a received Ethernet packet; * the packet is in the mbuf chain m without * the ether header, which is provided separately.
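 * In the rewritten routine below, IP frames are passed straight to
 * proto_input(PF_INET, m), while ARP frames are validated and decoded
 * by inet_ether_arp_input() above before being handed to
 * arp_ip_handle_input().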
*/ -int -inet_ether_input(m, frame_header, ifp, dl_tag, sync_ok) - struct mbuf *m; - char *frame_header; - struct ifnet *ifp; - u_long dl_tag; - int sync_ok; - +static errno_t +inet_ether_input( + __unused ifnet_t ifp, + __unused protocol_family_t protocol_family, + mbuf_t m, + char *frame_header) { register struct ether_header *eh = (struct ether_header *) frame_header; - register struct ifqueue *inq=0; u_short ether_type; - int s; - u_int16_t ptype = -1; - unsigned char buf[18]; - -#if ISO || LLC || NETAT - register struct llc *l; -#endif - - if ((ifp->if_flags & IFF_UP) == 0) { - m_freem(m); - return EJUSTRETURN; - } - - ifp->if_lastchange = time; - - if (eh->ether_dhost[0] & 1) { - if (bcmp((caddr_t)etherbroadcastaddr, (caddr_t)eh->ether_dhost, - sizeof(etherbroadcastaddr)) == 0) - m->m_flags |= M_BCAST; - else - m->m_flags |= M_MCAST; - } - if (m->m_flags & (M_BCAST|M_MCAST)) - ifp->if_imcasts++; ether_type = ntohs(eh->ether_type); -#if NVLAN > 0 - if (ether_type == vlan_proto) { - if (vlan_input(eh, m) < 0) - ifp->if_data.ifi_noproto++; - return EJUSTRETURN; - } -#endif /* NVLAN > 0 */ - switch (ether_type) { - case ETHERTYPE_IP: - if (ipflow_fastforward(m)) - return EJUSTRETURN; - ptype = mtod(m, struct ip *)->ip_p; - if ((sync_ok == 0) || - (ptype != IPPROTO_TCP && ptype != IPPROTO_UDP)) { - schednetisr(NETISR_IP); - } - - inq = &ipintrq; - break; - - case ETHERTYPE_ARP: - schednetisr(NETISR_ARP); - inq = &arpintrq; - break; - - default: { - return ENOENT; - } + case ETHERTYPE_IP: + proto_input(PF_INET, m); + break; + + case ETHERTYPE_ARP: { + inet_ether_arp_input(m); + } + break; + + default: { + return ENOENT; + } } - - if (inq == 0) - return ENOENT; - - s = splimp(); - if (IF_QFULL(inq)) { - IF_DROP(inq); - m_freem(m); - splx(s); - return EJUSTRETURN; - } else - IF_ENQUEUE(inq, m); - splx(s); - - if ((sync_ok) && - (ptype == IPPROTO_TCP || ptype == IPPROTO_UDP)) { - extern void ipintr(void); - - s = splnet(); - ipintr(); - splx(s); - } - + return 0; } - - - -int -inet_ether_pre_output(ifp, m0, dst_netaddr, route, type, edst, dl_tag ) - struct ifnet *ifp; - struct mbuf **m0; - struct sockaddr *dst_netaddr; - caddr_t route; - char *type; - char *edst; - u_long dl_tag; +static errno_t +inet_ether_pre_output( + ifnet_t ifp, + __unused protocol_family_t protocol_family, + mbuf_t *m0, + const struct sockaddr *dst_netaddr, + void* route, + char *type, + char *edst) { - struct rtentry *rt0 = (struct rtentry *) route; - int s; register struct mbuf *m = *m0; - register struct rtentry *rt; register struct ether_header *eh; - int off, len = m->m_pkthdr.len; - int hlen; /* link layer header length */ - struct arpcom *ac = IFP2AC(ifp); - + errno_t result = 0; if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING)) - return ENETDOWN; - - rt = rt0; - if (rt) { - if ((rt->rt_flags & RTF_UP) == 0) { - rt0 = rt = rtalloc1(dst_netaddr, 1, 0UL); - if (rt0) - rtunref(rt); - else - return EHOSTUNREACH; - } - - if (rt->rt_flags & RTF_GATEWAY) { - if (rt->rt_gwroute == 0) - goto lookup; - if (((rt = rt->rt_gwroute)->rt_flags & RTF_UP) == 0) { - rtfree(rt); rt = rt0; - lookup: rt->rt_gwroute = rtalloc1(rt->rt_gateway, 1, - 0UL); - if ((rt = rt->rt_gwroute) == 0) - return (EHOSTUNREACH); - } - } - + return ENETDOWN; - if (rt->rt_flags & RTF_REJECT) - if (rt->rt_rmx.rmx_expire == 0 || - time_second < rt->rt_rmx.rmx_expire) - return (rt == rt0 ? EHOSTDOWN : EHOSTUNREACH); - } - - hlen = ETHER_HDR_LEN; - /* * Tell ether_frameout it's ok to loop packet unless negated below. 
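 * (M_LOOP is cleared again below for pseudo_AF_HDRCMPLT and AF_UNSPEC
 * destinations, whose callers supply a prebuilt Ethernet header that
 * should not be looped back.)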
*/ m->m_flags |= M_LOOP; switch (dst_netaddr->sa_family) { + + case AF_INET: { + struct sockaddr_dl ll_dest; + result = arp_lookup_ip(ifp, (const struct sockaddr_in*)dst_netaddr, + &ll_dest, sizeof(ll_dest), (route_t)route, *m0); + if (result == 0) { + bcopy(LLADDR(&ll_dest), edst, ETHER_ADDR_LEN); + *(u_int16_t*)type = htons(ETHERTYPE_IP); + } + } + break; + + case pseudo_AF_HDRCMPLT: + case AF_UNSPEC: + m->m_flags &= ~M_LOOP; + eh = (struct ether_header *)dst_netaddr->sa_data; + (void)memcpy(edst, eh->ether_dhost, 6); + *(u_short *)type = eh->ether_type; + break; + + default: + printf("%s%d: can't handle af%d\n", ifp->if_name, ifp->if_unit, + dst_netaddr->sa_family); + + result = EAFNOSUPPORT; + } - case AF_INET: - if (!arpresolve(ac, rt, m, dst_netaddr, edst, rt0)) - return (EJUSTRETURN); /* if not yet resolved */ - off = m->m_pkthdr.len - m->m_len; - *(u_short *)type = htons(ETHERTYPE_IP); - break; - - case AF_UNSPEC: - m->m_flags &= ~M_LOOP; - eh = (struct ether_header *)dst_netaddr->sa_data; - (void)memcpy(edst, eh->ether_dhost, 6); - *(u_short *)type = eh->ether_type; - break; - - default: - kprintf("%s%d: can't handle af%d\n", ifp->if_name, ifp->if_unit, - dst_netaddr->sa_family); + return result; +} - return EAFNOSUPPORT; - } +static errno_t +ether_inet_resolve_multi( + ifnet_t ifp, + const struct sockaddr *proto_addr, + struct sockaddr_dl *out_ll, + size_t ll_len) +{ + static const size_t minsize = offsetof(struct sockaddr_dl, sdl_data[0]) + ETHER_ADDR_LEN; + const struct sockaddr_in *sin = (const struct sockaddr_in*)proto_addr; + + if (proto_addr->sa_family != AF_INET) + return EAFNOSUPPORT; + + if (proto_addr->sa_len < sizeof(struct sockaddr_in)) + return EINVAL; - return (0); + if (ll_len < minsize) + return EMSGSIZE; + + bzero(out_ll, minsize); + out_ll->sdl_len = minsize; + out_ll->sdl_family = AF_LINK; + out_ll->sdl_index = ifp->if_index; + out_ll->sdl_type = IFT_ETHER; + out_ll->sdl_nlen = 0; + out_ll->sdl_alen = ETHER_ADDR_LEN; + out_ll->sdl_slen = 0; + ETHER_MAP_IP_MULTICAST(&sin->sin_addr, LLADDR(out_ll)); + + return 0; } -int -ether_inet_prmod_ioctl(dl_tag, ifp, command, data) - u_long dl_tag; - struct ifnet *ifp; - int command; - caddr_t data; +static errno_t +ether_inet_prmod_ioctl( + ifnet_t ifp, + __unused protocol_family_t protocol_family, + u_int32_t command, + void* data) { - struct ifaddr *ifa = (struct ifaddr *) data; - struct ifreq *ifr = (struct ifreq *) data; - struct rslvmulti_req *rsreq = (struct rslvmulti_req *) data; + ifaddr_t ifa = data; + struct ifreq *ifr = data; int error = 0; - boolean_t funnel_state; - struct arpcom *ac = (struct arpcom *) ifp; - struct sockaddr_dl *sdl; - struct sockaddr_in *sin; - u_char *e_addr; -#if 0 - /* No tneeded at soo_ioctlis already funnelled */ - funnel_state = thread_funnel_set(network_flock,TRUE); -#endif - switch (command) { - case SIOCRSLVMULTI: { - switch(rsreq->sa->sa_family) { - - case AF_INET: - sin = (struct sockaddr_in *)rsreq->sa; - if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) - return EADDRNOTAVAIL; - MALLOC(sdl, struct sockaddr_dl *, sizeof *sdl, M_IFMADDR, - M_WAITOK); - sdl->sdl_len = sizeof *sdl; - sdl->sdl_family = AF_LINK; - sdl->sdl_index = ifp->if_index; - sdl->sdl_type = IFT_ETHER; - sdl->sdl_nlen = 0; - sdl->sdl_alen = ETHER_ADDR_LEN; - sdl->sdl_slen = 0; - e_addr = LLADDR(sdl); - ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); - *rsreq->llsa = (struct sockaddr *)sdl; - return EJUSTRETURN; - - default: - /* - * Well, the text isn't quite right, but it's the name - * that counts... 
- */ - return EAFNOSUPPORT; - } - - } case SIOCSIFADDR: - if ((ifp->if_flags & IFF_RUNNING) == 0) { - ifp->if_flags |= IFF_UP; - dlil_ioctl(0, ifp, SIOCSIFFLAGS, (caddr_t) 0); - } + if ((ifnet_flags(ifp) & IFF_RUNNING) == 0) { + ifnet_set_flags(ifp, IFF_UP, IFF_UP); + ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL); + } - switch (ifa->ifa_addr->sa_family) { + switch (ifaddr_address_family(ifa)) { case AF_INET: - if (ifp->if_init) - ifp->if_init(ifp->if_softc); /* before arpwhohas */ - - arp_ifinit(IFP2AC(ifp), ifa); + inet_arp_init_ifaddr(ifp, ifa); /* * Register new IP and MAC addresses with the kernel debugger * if the interface is the same as was registered by IOKernelDebugger. If * no interface was registered, fall back and just match against en0 interface. */ - if ((kdp_get_interface() != 0 && kdp_get_interface() == ifp->if_private) + if ((kdp_get_interface() != 0 && kdp_get_interface() == ifp->if_softc) || (kdp_get_interface() == 0 && ifp->if_unit == 0)) - kdp_set_ip_and_mac_addresses(&(IA_SIN(ifa)->sin_addr), &(IFP2AC(ifp)->ac_enaddr)); + kdp_set_ip_and_mac_addresses(&(IA_SIN(ifa)->sin_addr), ifnet_lladdr(ifp)); break; @@ -382,13 +317,7 @@ ether_inet_prmod_ioctl(dl_tag, ifp, command, data) break; case SIOCGIFADDR: - { - struct sockaddr *sa; - - sa = (struct sockaddr *) & ifr->ifr_data; - bcopy(IFP2AC(ifp)->ac_enaddr, - (caddr_t) sa->sa_data, ETHER_ADDR_LEN); - } + ifnet_lladdr_copy_bytes(ifp, ifr->ifr_addr.sa_data, ETHER_ADDR_LEN); break; case SIOCSIFMTU: @@ -402,71 +331,183 @@ ether_inet_prmod_ioctl(dl_tag, ifp, command, data) return EOPNOTSUPP; } - //(void) thread_funnel_set(network_flock, FALSE); - return (error); } +static void +ether_inet_event( + ifnet_t ifp, + __unused protocol_family_t protocol, + const struct kev_msg *event) +{ + ifaddr_t *addresses; + + if (event->vendor_code != KEV_VENDOR_APPLE || + event->kev_class != KEV_NETWORK_CLASS || + event->kev_subclass != KEV_DL_SUBCLASS || + event->event_code != KEV_DL_LINK_ADDRESS_CHANGED) { + return; + } + + if (ifnet_get_address_list_family(ifp, &addresses, AF_INET) == 0) { + int i; + + for (i = 0; addresses[i] != NULL; i++) { + inet_arp_init_ifaddr(ifp, addresses[i]); + } + + ifnet_free_address_list(addresses); + } +} - - +static errno_t +ether_inet_arp( + ifnet_t ifp, + u_short arpop, + const struct sockaddr_dl* sender_hw, + const struct sockaddr* sender_proto, + const struct sockaddr_dl* target_hw, + const struct sockaddr* target_proto) +{ + mbuf_t m; + errno_t result; + struct ether_header *eh; + struct ether_arp *ea; + const struct sockaddr_in* sender_ip = (const struct sockaddr_in*)sender_proto; + const struct sockaddr_in* target_ip = (const struct sockaddr_in*)target_proto; + char *datap; + + if (target_ip == NULL) + return EINVAL; + + if ((sender_ip && sender_ip->sin_family != AF_INET) || + (target_ip && target_ip->sin_family != AF_INET)) + return EAFNOSUPPORT; + + result = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &m); + if (result != 0) + return result; + + mbuf_setlen(m, sizeof(*ea)); + mbuf_pkthdr_setlen(m, sizeof(*ea)); + + /* Move the data pointer in the mbuf to the end, aligned to 4 bytes */ + datap = mbuf_datastart(m); + datap += mbuf_trailingspace(m); + datap -= (((u_long)datap) & 0x3); + mbuf_setdata(m, datap, sizeof(*ea)); + ea = mbuf_data(m); + + /* Prepend the ethernet header, we will send the raw frame */ + mbuf_prepend(&m, sizeof(*eh), MBUF_WAITOK); + eh = mbuf_data(m); + eh->ether_type = htons(ETHERTYPE_ARP); + + /* Fill out the arp header */ + ea->arp_pro = htons(ETHERTYPE_IP); + ea->arp_hln = 
sizeof(ea->arp_sha); + ea->arp_pln = sizeof(ea->arp_spa); + ea->arp_hrd = htons(ARPHRD_ETHER); + ea->arp_op = htons(arpop); + + /* Sender Hardware */ + if (sender_hw != NULL) { + bcopy(CONST_LLADDR(sender_hw), ea->arp_sha, sizeof(ea->arp_sha)); + } + else { + ifnet_lladdr_copy_bytes(ifp, ea->arp_sha, ETHER_ADDR_LEN); + } + ifnet_lladdr_copy_bytes(ifp, eh->ether_shost, sizeof(eh->ether_shost)); + + /* Sender IP */ + if (sender_ip != NULL) { + bcopy(&sender_ip->sin_addr, ea->arp_spa, sizeof(ea->arp_spa)); + } + else { + struct ifaddr *ifa; + + /* Look for an IP address to use as our source */ + ifnet_lock_shared(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET) + break; + } + if (ifa) { + bcopy(&((struct sockaddr_in*)ifa->ifa_addr)->sin_addr, ea->arp_spa, + sizeof(ea->arp_spa)); + } + ifnet_lock_done(ifp); + + if (ifa == NULL) { + mbuf_free(m); + return ENXIO; + } + } + + /* Target Hardware */ + if (target_hw == 0) { + bzero(ea->arp_tha, sizeof(ea->arp_tha)); + bcopy(etherbroadcastaddr, eh->ether_dhost, sizeof(eh->ether_dhost)); + } + else { + bcopy(CONST_LLADDR(target_hw), ea->arp_tha, sizeof(ea->arp_tha)); + bcopy(CONST_LLADDR(target_hw), eh->ether_dhost, sizeof(eh->ether_dhost)); + } + + /* Target IP */ + bcopy(&target_ip->sin_addr, ea->arp_tpa, sizeof(ea->arp_tpa)); + + ifnet_output_raw(ifp, PF_INET, m); + + return 0; +} int -ether_attach_inet(struct ifnet *ifp, u_long *dl_tag) +ether_attach_inet( + struct ifnet *ifp, + __unused u_long proto_family) { - struct dlil_proto_reg_str reg; - struct dlil_demux_desc desc; - struct dlil_demux_desc desc2; - u_short en_native=ETHERTYPE_IP; - u_short arp_native=ETHERTYPE_ARP; - int stat; - int i; - - - stat = dlil_find_dltag(ifp->if_family, ifp->if_unit, PF_INET, dl_tag); - if (stat == 0) - return (stat); - - TAILQ_INIT(®.demux_desc_head); - desc.type = DLIL_DESC_RAW; - desc.variants.bitmask.proto_id_length = 0; - desc.variants.bitmask.proto_id = 0; - desc.variants.bitmask.proto_id_mask = 0; - desc.native_type = (char *) &en_native; - TAILQ_INSERT_TAIL(®.demux_desc_head, &desc, next); - reg.interface_family = ifp->if_family; - reg.unit_number = ifp->if_unit; - reg.input = inet_ether_input; - reg.pre_output = inet_ether_pre_output; - reg.event = 0; - reg.offer = 0; - reg.ioctl = ether_inet_prmod_ioctl; - reg.default_proto = 1; - reg.protocol_family = PF_INET; - - desc2 = desc; - desc2.native_type = (char *) &arp_native; - TAILQ_INSERT_TAIL(®.demux_desc_head, &desc2, next); - - stat = dlil_attach_protocol(®, dl_tag); - if (stat) { - printf("WARNING: ether_attach_inet can't attach ip to interface\n"); - return stat; - } - return (0); + struct ifnet_attach_proto_param proto; + struct ifnet_demux_desc demux[2]; + u_short en_native=htons(ETHERTYPE_IP); + u_short arp_native=htons(ETHERTYPE_ARP); + errno_t error; + + bzero(&demux[0], sizeof(demux)); + demux[0].type = DLIL_DESC_ETYPE2; + demux[0].data = &en_native; + demux[0].datalen = sizeof(en_native); + demux[1].type = DLIL_DESC_ETYPE2; + demux[1].data = &arp_native; + demux[1].datalen = sizeof(arp_native); + + bzero(&proto, sizeof(proto)); + proto.demux_list = demux; + proto.demux_count = sizeof(demux) / sizeof(demux[0]); + proto.input = inet_ether_input; + proto.pre_output = inet_ether_pre_output; + proto.ioctl = ether_inet_prmod_ioctl; + proto.event = ether_inet_event; + proto.resolve = ether_inet_resolve_multi; + proto.send_arp = ether_inet_arp; + + error = ifnet_attach_protocol(ifp, proto_family, &proto); + if (error && error != 
EEXIST) { + printf("WARNING: ether_attach_inet can't attach ip to %s%d\n", + ifp->if_name, ifp->if_unit); + } + return error; } -int ether_detach_inet(struct ifnet *ifp, u_long dl_tag) +int +ether_detach_inet( + struct ifnet *ifp, + u_long proto_family) { int stat; - stat = dlil_find_dltag(ifp->if_family, ifp->if_unit, PF_INET, &dl_tag); - if (stat == 0) { - stat = dlil_detach_protocol(dl_tag); - if (stat) { - printf("WARNING: ether_detach_inet can't detach ip from interface\n"); - } - } + stat = dlil_detach_protocol(ifp, proto_family); + return stat; } diff --git a/bsd/net/ethernet.h b/bsd/net/ethernet.h index 703660a07..b52bd815f 100644 --- a/bsd/net/ethernet.h +++ b/bsd/net/ethernet.h @@ -102,11 +102,17 @@ struct ether_addr { #define ETHERMTU (ETHER_MAX_LEN-ETHER_HDR_LEN-ETHER_CRC_LEN) #define ETHERMIN (ETHER_MIN_LEN-ETHER_HDR_LEN-ETHER_CRC_LEN) -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -struct ether_addr *ether_aton __P((char *)); -#endif /* __APPLE_API_PRIVATE */ +#ifdef KERNEL_PRIVATE +/* + * The following are used by ethernet interfaces. + */ + +struct ether_addr *ether_aton(const char *); + +#ifdef BSD_KERNEL_PRIVATE +extern u_char etherbroadcastaddr[ETHER_ADDR_LEN]; #endif +#endif /* KERNEL_PRIVATE */ #ifndef KERNEL #include <sys/cdefs.h> @@ -116,10 +122,11 @@ struct ether_addr *ether_aton __P((char *)); */ __BEGIN_DECLS -int ether_hostton __P((char *, struct ether_addr *)); -int ether_line __P((char *, struct ether_addr *, char *)); -char *ether_ntoa __P((const struct ether_addr *)); -int ether_ntohost __P((char *, struct ether_addr *)); +int ether_hostton(const char *, struct ether_addr *); +int ether_line(const char *, struct ether_addr *, char *); +char *ether_ntoa(const struct ether_addr *); +struct ether_addr *ether_aton(const char *); +int ether_ntohost(char *, const struct ether_addr *); __END_DECLS #endif /* !KERNEL */ diff --git a/bsd/ppc/label_t.h b/bsd/net/ieee8023ad.h similarity index 58% rename from bsd/ppc/label_t.h rename to bsd/net/ieee8023ad.h index d4a45ac77..6de8d3ea9 100644 --- a/bsd/ppc/label_t.h +++ b/bsd/net/ieee8023ad.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -19,27 +19,29 @@ * * @APPLE_LICENSE_HEADER_END@ */ + /* - * Copyright (c) 1993 NeXT Computer, Inc. - * - * PowerPC Family: For setjmp/longjmp (kernel version). 
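
/*
 * The rewritten ether_attach_inet() above expresses its packet filters as
 * DLIL_DESC_ETYPE2 demux descriptors handed to ifnet_attach_protocol().
 * A minimal sketch of the same KPI for a single protocol; the ethertype
 * 0x88B5 and the empty handler set are assumptions for illustration only:
 */
static errno_t
attach_demo_proto(ifnet_t ifp, protocol_family_t family)
{
	struct ifnet_attach_proto_param proto;
	struct ifnet_demux_desc demux;
	u_short etype = htons(0x88B5);		/* hypothetical ethertype */

	bzero(&demux, sizeof(demux));
	demux.type = DLIL_DESC_ETYPE2;		/* match a 2-byte ethertype */
	demux.data = &etype;
	demux.datalen = sizeof(etype);

	bzero(&proto, sizeof(proto));
	proto.demux_list = &demux;
	proto.demux_count = 1;
	/* input/ioctl/event handlers would be filled in here */

	return ifnet_attach_protocol(ifp, family, &proto);
}
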
- * - * HISTORY - * + * ieee8023ad.h */ - -#ifndef _BSD_PPC_LABEL_T_H_ -#define _BSD_PPC_LABEL_T_H_ -#include <sys/appleapiopts.h> +/* + * Modification History + * + * May 14, 2004 Dieter Siegmund (dieter@apple.com) + * - created + */ -#ifdef __APPLE_API_OBSOLETE -typedef struct label_t { - int val[59]; -} label_t; +#ifndef _NET_IEEE8023AD_H_ +#define _NET_IEEE8023AD_H_ -#endif /* __APPLE_API_OBSOLETE */ +#include <sys/types.h> -#endif /* _BSD_PPC_LABEL_T_H_ */ +#define IEEE8023AD_SLOW_PROTO_ETHERTYPE 0x8809 +#define IEEE8023AD_SLOW_PROTO_MULTICAST { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x02 } +#define IEEE8023AD_SLOW_PROTO_SUBTYPE_LACP 1 +#define IEEE8023AD_SLOW_PROTO_SUBTYPE_LA_MARKER_PROTOCOL 2 +#define IEEE8023AD_SLOW_PROTO_SUBTYPE_RESERVED_START 3 +#define IEEE8023AD_SLOW_PROTO_SUBTYPE_RESERVED_END 10 +#endif _NET_IEEE8023AD_H_ diff --git a/bsd/net/if.c b/bsd/net/if.c index e5f361d70..3f6d1157d 100644 --- a/bsd/net/if.c +++ b/bsd/net/if.c @@ -55,6 +55,8 @@ * $FreeBSD: src/sys/net/if.c,v 1.85.2.9 2001/07/24 19:10:17 brooks Exp $ */ +#include <kern/locks.h> + #include <sys/param.h> #include <sys/malloc.h> #include <sys/mbuf.h> @@ -73,12 +75,15 @@ #include <net/if_dl.h> #include <net/if_types.h> #include <net/if_var.h> +#include <net/net_osdep.h> + #include <net/radix.h> #include <net/route.h> #ifdef __APPLE__ #include <net/dlil.h> //#include <string.h> #include <sys/domain.h> +#include <libkern/OSAtomic.h> #endif #if defined(INET) || defined(INET6) @@ -95,19 +100,21 @@ * System initialization */ -static int ifconf __P((u_long, caddr_t)); -static void if_qflush __P((struct ifqueue *)); -static void link_rtrequest __P((int, struct rtentry *, struct sockaddr *)); +static int ifconf(u_long cmd, user_addr_t ifrp, int * ret_space); +static void if_qflush(struct ifqueue *); +__private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *); +void if_rtproto_del(struct ifnet *ifp, int protocol); static struct if_clone *if_clone_lookup(const char *, int *); -static int if_clone_list(struct if_clonereq *); +#ifdef IF_CLONE_LIST +static int if_clone_list(int count, int * total, user_addr_t dst); +#endif MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); int ifqmaxlen = IFQ_MAXLEN; -struct ifnethead ifnet = TAILQ_HEAD_INITIALIZER(ifnet); -struct ifmultihead ifma_lostlist = LIST_HEAD_INITIALIZER(ifma_lostlist); +struct ifnethead ifnet_head = TAILQ_HEAD_INITIALIZER(ifnet_head); static int if_cloners_count; LIST_HEAD(, if_clone) if_cloners = LIST_HEAD_INITIALIZER(if_cloners); @@ -117,7 +124,7 @@ LIST_HEAD(, if_clone) if_cloners = LIST_HEAD_INITIALIZER(if_cloners); * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? 
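
/*
 * ieee8023ad.h above only defines constants.  A sketch of how a consumer
 * might classify an IEEE 802.3ad "slow protocols" frame with them; the
 * frame layout (Ethernet header followed by a one-byte subtype) is the
 * assumption here, for illustration only:
 */
static int
is_lacp_frame(const struct ether_header *eh, const u_char *payload)
{
	/* slow-protocols frames carry ethertype 0x8809; LACP is subtype 1 */
	return ntohs(eh->ether_type) == IEEE8023AD_SLOW_PROTO_ETHERTYPE &&
	    payload[0] == IEEE8023AD_SLOW_PROTO_SUBTYPE_LACP;
}
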
 */
-extern void nd6_setmtu __P((struct ifnet *));
+extern void nd6_setmtu(struct ifnet *);
 #endif
 
 #define M_CLONE M_IFADDR
 
@@ -133,6 +140,48 @@ int if_index;
 struct ifaddr **ifnet_addrs;
 struct ifnet **ifindex2ifnet;
 
+__private_extern__ void
+if_attach_ifa(
+	struct ifnet *ifp,
+	struct ifaddr *ifa)
+{
+	ifnet_lock_assert(ifp, LCK_MTX_ASSERT_OWNED);
+	if (ifa->ifa_debug & IFA_ATTACHED) {
+		panic("if_attach_ifa: Attempted to attach address that's already attached!\n");
+	}
+	ifaref(ifa);
+	ifa->ifa_debug |= IFA_ATTACHED;
+	TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link);
+}
+
+__private_extern__ void
+if_detach_ifa(
+	struct ifnet *ifp,
+	struct ifaddr *ifa)
+{
+	ifnet_lock_assert(ifp, LCK_MTX_ASSERT_OWNED);
+#if 1
+	/* Debugging code */
+	if ((ifa->ifa_debug & IFA_ATTACHED) == 0) {
+		printf("if_detach_ifa: ifa is not attached to any interface! flags=%x\n", ifa->ifa_debug);
+		return;
+	}
+	else {
+		struct ifaddr *ifa2;
+		TAILQ_FOREACH(ifa2, &ifp->if_addrhead, ifa_link) {
+			if (ifa2 == ifa)
+				break;
+		}
+		if (ifa2 != ifa) {
+			printf("if_detach_ifa: Attempted to detach IFA that was not attached!\n");
+		}
+	}
+#endif
+	TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link);
+	ifa->ifa_debug &= ~IFA_ATTACHED;
+	ifafree(ifa);
+}
+
 #define INITIAL_IF_INDEXLIM	8
 
 /*
@@ -148,17 +197,14 @@ struct ifnet **ifindex2ifnet;
  * always allocate one extra element to hold ifindex2ifnet[0], which
  * is unused.
  */
-static int
+int if_next_index(void);
+
+__private_extern__ int
 if_next_index(void)
 {
 	static int if_indexlim = 0;
-	static int if_list_growing = 0;
 	int new_index;
 
-	while (if_list_growing) {
-		/* wait until list is done growing */
-		(void)tsleep((caddr_t)&ifnet_addrs, PZERO, "if_next_index", 0);
-	}
 	new_index = ++if_index;
 	if (if_index > if_indexlim) {
 		unsigned	n;
@@ -167,9 +213,6 @@ if_next_index(void)
 		caddr_t		new_ifindex2ifnet;
 		caddr_t		old_ifnet_addrs;
 
-		/* mark list as growing */
-		if_list_growing = 1;
-
 		old_ifnet_addrs = (caddr_t)ifnet_addrs;
 		if (ifnet_addrs == NULL) {
 			new_if_indexlim = INITIAL_IF_INDEXLIM;
@@ -201,89 +244,8 @@ if_next_index(void)
 		if (old_ifnet_addrs != NULL) {
 			_FREE((caddr_t)old_ifnet_addrs, M_IFADDR);
 		}
-
-		/* wake up others that might be blocked */
-		if_list_growing = 0;
-		wakeup((caddr_t)&ifnet_addrs);
 	}
 	return (new_index);
-
-}
-
-/*
- * Attach an interface to the
- * list of "active" interfaces.
- */
-void
-old_if_attach(ifp)
-	struct ifnet *ifp;
-{
-	unsigned socksize, ifasize;
-	int namelen, masklen;
-	char workbuf[64];
-	register struct sockaddr_dl *sdl;
-	register struct ifaddr *ifa;
-
-	if (ifp->if_snd.ifq_maxlen == 0)
-		ifp->if_snd.ifq_maxlen = ifqmaxlen;
-
-	/*
-	 * XXX -
-	 * The old code would work if the interface passed a pre-existing
-	 * chain of ifaddrs to this code.  We don't trust our callers to
-	 * properly initialize the tailq, however, so we no longer allow
-	 * this unlikely case.
- */ - TAILQ_INIT(&ifp->if_addrhead); - TAILQ_INIT(&ifp->if_prefixhead); - LIST_INIT(&ifp->if_multiaddrs); - getmicrotime(&ifp->if_lastchange); - - if ((ifp->if_eflags & IFEF_REUSE) == 0 || ifp->if_index == 0) { - /* allocate a new entry */ - ifp->if_index = if_next_index(); - ifindex2ifnet[ifp->if_index] = ifp; - - /* - * create a Link Level name for this device - */ - namelen = snprintf(workbuf, sizeof(workbuf), - "%s%d", ifp->if_name, ifp->if_unit); -#define _offsetof(t, m) ((int)((caddr_t)&((t *)0)->m)) - masklen = _offsetof(struct sockaddr_dl, sdl_data[0]) + namelen; - socksize = masklen + ifp->if_addrlen; -#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof(long) - 1))) - if (socksize < sizeof(*sdl)) - socksize = sizeof(*sdl); - socksize = ROUNDUP(socksize); - ifasize = sizeof(*ifa) + 2 * socksize; - ifa = (struct ifaddr *) _MALLOC(ifasize, M_IFADDR, M_WAITOK); - if (ifa) { - bzero((caddr_t)ifa, ifasize); - sdl = (struct sockaddr_dl *)(ifa + 1); - sdl->sdl_len = socksize; - sdl->sdl_family = AF_LINK; - bcopy(workbuf, sdl->sdl_data, namelen); - sdl->sdl_nlen = namelen; - sdl->sdl_index = ifp->if_index; - sdl->sdl_type = ifp->if_type; - ifnet_addrs[ifp->if_index - 1] = ifa; - ifa->ifa_ifp = ifp; - ifa->ifa_rtrequest = link_rtrequest; - ifa->ifa_addr = (struct sockaddr *)sdl; - sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); - ifa->ifa_netmask = (struct sockaddr *)sdl; - sdl->sdl_len = masklen; - while (namelen != 0) - sdl->sdl_data[--namelen] = 0xff; - } - } else { - ifa = ifnet_addrs[ifp->if_index - 1]; - } - if (ifa != NULL) { - TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); - } - TAILQ_INSERT_TAIL(&ifnet, ifp, if_link); } /* @@ -361,7 +323,7 @@ if_clone_create(char *name, int len) /* * Destroy a clone network interface. */ -int +static int if_clone_destroy(const char *name) { struct if_clone *ifc; @@ -405,7 +367,7 @@ if_clone_lookup(const char *name, int *unitp) { struct if_clone *ifc; const char *cp; - int i; + size_t i; for (ifc = LIST_FIRST(&if_cloners); ifc != NULL;) { for (cp = name, i = 0; i < ifc->ifc_namelen; i++, cp++) { @@ -492,27 +454,27 @@ if_clone_detach(struct if_clone *ifc) if_cloners_count--; } +#ifdef IF_CLONE_LIST /* * Provide list of interface cloners to userspace. */ static int -if_clone_list(struct if_clonereq *ifcr) +if_clone_list(int count, int * total, user_addr_t dst) { - char outbuf[IFNAMSIZ], *dst; + char outbuf[IFNAMSIZ]; struct if_clone *ifc; - int count, error = 0; + int error = 0; - ifcr->ifcr_total = if_cloners_count; - if ((dst = ifcr->ifcr_buffer) == NULL) { + *total = if_cloners_count; + if (dst == USER_ADDR_NULL) { /* Just asking how many there are. */ return (0); } - if (ifcr->ifcr_count < 0) + if (count < 0) return (EINVAL); - count = (if_cloners_count < ifcr->ifcr_count) ? - if_cloners_count : ifcr->ifcr_count; + count = (if_cloners_count < count) ? 
if_cloners_count : count; for (ifc = LIST_FIRST(&if_cloners); ifc != NULL && count != 0; ifc = LIST_NEXT(ifc, ifc_list), count--, dst += IFNAMSIZ) { @@ -524,27 +486,37 @@ if_clone_list(struct if_clonereq *ifcr) return (error); } +#endif IF_CLONE_LIST +int ifa_foraddr(unsigned int addr); __private_extern__ int -ifa_foraddr(addr) - unsigned int addr; +ifa_foraddr( + unsigned int addr) { - register struct ifnet *ifp; - register struct ifaddr *ifa; - register unsigned int addr2; + struct ifnet *ifp; + struct ifaddr *ifa; + unsigned int addr2; + int result = 0; - - for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_link.tqe_next) + ifnet_head_lock_shared(); + for (ifp = ifnet_head.tqh_first; ifp && !result; ifp = ifp->if_link.tqe_next) { + ifnet_lock_shared(ifp); for (ifa = ifp->if_addrhead.tqh_first; ifa; ifa = ifa->ifa_link.tqe_next) { - if (ifa->ifa_addr->sa_family != AF_INET) - continue; - addr2 = IA_SIN(ifa)->sin_addr.s_addr; - - if (addr == addr2) - return (1); + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + addr2 = IA_SIN(ifa)->sin_addr.s_addr; + + if (addr == addr2) { + result = 1; + break; + } + } + ifnet_lock_done(ifp); } - return (0); + ifnet_head_done(); + + return result; } /* @@ -552,50 +524,75 @@ ifa_foraddr(addr) */ /*ARGSUSED*/ struct ifaddr * -ifa_ifwithaddr(addr) - register struct sockaddr *addr; +ifa_ifwithaddr( + const struct sockaddr *addr) { - register struct ifnet *ifp; - register struct ifaddr *ifa; + struct ifnet *ifp; + struct ifaddr *ifa; + struct ifaddr *result = 0; #define equal(a1, a2) \ - (bcmp((caddr_t)(a1), (caddr_t)(a2), ((struct sockaddr *)(a1))->sa_len) == 0) - for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_link.tqe_next) - for (ifa = ifp->if_addrhead.tqh_first; ifa; - ifa = ifa->ifa_link.tqe_next) { - if (ifa->ifa_addr->sa_family != addr->sa_family) - continue; - if (equal(addr, ifa->ifa_addr)) - return (ifa); - if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && - /* IP6 doesn't have broadcast */ - ifa->ifa_broadaddr->sa_len != 0 && - equal(ifa->ifa_broadaddr, addr)) - return (ifa); + (bcmp((const void*)(a1), (const void*)(a2), ((const struct sockaddr *)(a1))->sa_len) == 0) + + ifnet_head_lock_shared(); + for (ifp = ifnet_head.tqh_first; ifp && !result; ifp = ifp->if_link.tqe_next) { + ifnet_lock_shared(ifp); + for (ifa = ifp->if_addrhead.tqh_first; ifa; + ifa = ifa->ifa_link.tqe_next) { + if (ifa->ifa_addr->sa_family != addr->sa_family) + continue; + if (equal(addr, ifa->ifa_addr)) { + result = ifa; + break; + } + if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && + /* IP6 doesn't have broadcast */ + ifa->ifa_broadaddr->sa_len != 0 && + equal(ifa->ifa_broadaddr, addr)) { + result = ifa; + break; + } + } + if (result) + ifaref(result); + ifnet_lock_done(ifp); } - return ((struct ifaddr *)0); + ifnet_head_done(); + + return result; } /* * Locate the point to point interface with a given destination address. 
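
/*
 * Under the locking rework above, ifa_ifwithaddr() returns a referenced
 * ifaddr rather than a bare pointer into a list that could change.  A
 * sketch of the resulting caller contract, assuming 'sa' is an already
 * initialized sockaddr (illustrative only):
 */
static int
address_is_local(const struct sockaddr *sa)
{
	struct ifaddr *ifa = ifa_ifwithaddr(sa);

	if (ifa == NULL)
		return 0;
	/* ... safe to use ifa here; the reference keeps it alive ... */
	ifafree(ifa);			/* the caller must drop the reference */
	return 1;
}
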
*/ /*ARGSUSED*/ struct ifaddr * -ifa_ifwithdstaddr(addr) - register struct sockaddr *addr; +ifa_ifwithdstaddr( + const struct sockaddr *addr) { - register struct ifnet *ifp; - register struct ifaddr *ifa; - - for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_link.tqe_next) - if (ifp->if_flags & IFF_POINTOPOINT) - for (ifa = ifp->if_addrhead.tqh_first; ifa; - ifa = ifa->ifa_link.tqe_next) { - if (ifa->ifa_addr->sa_family != addr->sa_family) - continue; - if (ifa->ifa_dstaddr && equal(addr, ifa->ifa_dstaddr)) - return (ifa); + struct ifnet *ifp; + struct ifaddr *ifa; + struct ifaddr *result = 0; + + ifnet_head_lock_shared(); + for (ifp = ifnet_head.tqh_first; ifp && !result; ifp = ifp->if_link.tqe_next) { + if (ifp->if_flags & IFF_POINTOPOINT) { + ifnet_lock_shared(ifp); + for (ifa = ifp->if_addrhead.tqh_first; ifa; + ifa = ifa->ifa_link.tqe_next) { + if (ifa->ifa_addr->sa_family != addr->sa_family) + continue; + if (ifa->ifa_dstaddr && equal(addr, ifa->ifa_dstaddr)) { + result = ifa; + break; + } + } + if (result) + ifaref(result); + ifnet_lock_done(ifp); + } } - return ((struct ifaddr *)0); + ifnet_head_done(); + return result; } /* @@ -603,33 +600,42 @@ ifa_ifwithdstaddr(addr) * is most specific found. */ struct ifaddr * -ifa_ifwithnet(addr) - struct sockaddr *addr; +ifa_ifwithnet( + const struct sockaddr *addr) { - register struct ifnet *ifp; - register struct ifaddr *ifa; + struct ifnet *ifp; + struct ifaddr *ifa = NULL; struct ifaddr *ifa_maybe = (struct ifaddr *) 0; u_int af = addr->sa_family; char *addr_data = addr->sa_data, *cplim; + ifnet_head_lock_shared(); /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { - register struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr; - if (sdl->sdl_index && sdl->sdl_index <= if_index) - return (ifnet_addrs[sdl->sdl_index - 1]); + const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr; + if (sdl->sdl_index && sdl->sdl_index <= if_index) { + ifa = ifnet_addrs[sdl->sdl_index - 1]; + + if (ifa) + ifaref(ifa); + + ifnet_head_done(); + return ifa; + } } /* * Scan though each interface, looking for ones that have * addresses in this address family. */ - for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_link.tqe_next) { + for (ifp = ifnet_head.tqh_first; ifp; ifp = ifp->if_link.tqe_next) { + ifnet_lock_shared(ifp); for (ifa = ifp->if_addrhead.tqh_first; ifa; ifa = ifa->ifa_link.tqe_next) { - register char *cp, *cp2, *cp3; + char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; @@ -653,8 +659,9 @@ next: continue; * netmask for the remote end. */ if (ifa->ifa_dstaddr != 0 - && equal(addr, ifa->ifa_dstaddr)) - return (ifa); + && equal(addr, ifa->ifa_dstaddr)) { + break; + } } else #endif /* __APPLE__*/ { @@ -663,8 +670,8 @@ next: continue; * then use it instead of the generic one. */ if (ifa->ifa_claim_addr) { - if ((*ifa->ifa_claim_addr)(ifa, addr)) { - return (ifa); + if (ifa->ifa_claim_addr(ifa, addr)) { + break; } else { continue; } @@ -696,12 +703,38 @@ next: continue; */ if (ifa_maybe == 0 || rn_refines((caddr_t)ifa->ifa_netmask, - (caddr_t)ifa_maybe->ifa_netmask)) + (caddr_t)ifa_maybe->ifa_netmask)) { + ifaref(ifa); + if (ifa_maybe) + ifafree(ifa_maybe); ifa_maybe = ifa; + } } } + + if (ifa) { + ifaref(ifa); + } + + /* + * ifa is set if we found an exact match. 
+ * take a reference to the ifa before + * releasing the ifp lock + */ + ifnet_lock_done(ifp); + + if (ifa) { + break; + } + } + ifnet_head_done(); + if (!ifa) + ifa = ifa_maybe; + else if (ifa_maybe) { + ifafree(ifa_maybe); + ifa_maybe = NULL; } - return (ifa_maybe); + return ifa; } /* @@ -709,18 +742,20 @@ next: continue; * a given address. */ struct ifaddr * -ifaof_ifpforaddr(addr, ifp) - struct sockaddr *addr; - register struct ifnet *ifp; +ifaof_ifpforaddr( + const struct sockaddr *addr, + struct ifnet *ifp) { - register struct ifaddr *ifa; - register char *cp, *cp2, *cp3; - register char *cplim; + struct ifaddr *ifa = 0; + const char *cp, *cp2, *cp3; + char *cplim; struct ifaddr *ifa_maybe = 0; u_int af = addr->sa_family; if (af >= AF_MAX) return (0); + + ifnet_lock_shared(ifp); for (ifa = ifp->if_addrhead.tqh_first; ifa; ifa = ifa->ifa_link.tqe_next) { if (ifa->ifa_addr->sa_family != af) @@ -730,12 +765,12 @@ ifaof_ifpforaddr(addr, ifp) if (ifa->ifa_netmask == 0) { if (equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && equal(addr, ifa->ifa_dstaddr))) - return (ifa); + break; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (equal(addr, ifa->ifa_dstaddr)) - return (ifa); + break; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; @@ -745,10 +780,15 @@ ifaof_ifpforaddr(addr, ifp) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) - return (ifa); + break; } } - return (ifa_maybe); + + if (!ifa) ifa = ifa_maybe; + if (ifa) ifaref(ifa); + + ifnet_lock_done(ifp); + return ifa; } #include <net/route.h> @@ -758,13 +798,13 @@ ifaof_ifpforaddr(addr, ifp) * Lookup an appropriate real ifa to point to. * This should be moved to /sys/net/link.c eventually. */ -static void +void link_rtrequest(cmd, rt, sa) int cmd; - register struct rtentry *rt; + struct rtentry *rt; struct sockaddr *sa; { - register struct ifaddr *ifa; + struct ifaddr *ifa; struct sockaddr *dst; struct ifnet *ifp; @@ -776,75 +816,102 @@ link_rtrequest(cmd, rt, sa) rtsetifa(rt, ifa); if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest) ifa->ifa_rtrequest(cmd, rt, sa); + ifafree(ifa); } } /* - * Mark an interface down and notify protocols of - * the transition. - * NOTE: must be called at splnet or eqivalent. - */ -void -if_unroute(ifp, flag, fam) - register struct ifnet *ifp; - int flag, fam; -{ - register struct ifaddr *ifa; - - ifp->if_flags &= ~flag; - getmicrotime(&ifp->if_lastchange); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) - if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) - pfctlinput(PRC_IFDOWN, ifa->ifa_addr); - if_qflush(&ifp->if_snd); - rt_ifmsg(ifp); -} - -/* - * Mark an interface up and notify protocols of - * the transition. - * NOTE: must be called at splnet or eqivalent. + * if_updown will set the interface up or down. It will + * prevent other up/down events from occurring until this + * up/down event has completed. + * + * Caller must lock ifnet. This function will drop the + * lock. This allows ifnet_set_flags to set the rest of + * the flags after we change the up/down state without + * dropping the interface lock between setting the + * up/down state and updating the rest of the flags. 
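
/*
 * if_updown() below serializes concurrent up/down transitions with the
 * IFEF_UPDOWNCHANGE flag plus msleep()/wakeup() instead of holding the
 * ifnet lock across the notifications.  The shape of that idiom in
 * isolation, as a sketch (illustrative only):
 */
static void
transition_serialized(struct ifnet *ifp)
{
	struct timespec tv = { 0, NSEC_PER_SEC / 10 };

	ifnet_lock_exclusive(ifp);
	while (ifp->if_eflags & IFEF_UPDOWNCHANGE) {
		/* drop the lock while waiting for the other transition */
		ifnet_lock_done(ifp);
		msleep(&ifp->if_eflags, NULL, 0, "updownwait", &tv);
		ifnet_lock_exclusive(ifp);
	}
	ifp->if_eflags |= IFEF_UPDOWNCHANGE;
	/* ... change state, possibly dropping the lock to notify ... */
	ifp->if_eflags &= ~IFEF_UPDOWNCHANGE;
	wakeup(&ifp->if_eflags);
	ifnet_lock_done(ifp);
}
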
 */
-void
-if_route(ifp, flag, fam)
-	register struct ifnet *ifp;
-	int flag, fam;
+__private_extern__ void
+if_updown(
+	struct ifnet *ifp,
+	int up)
 {
-	register struct ifaddr *ifa;
-
-	ifp->if_flags |= flag;
-	getmicrotime(&ifp->if_lastchange);
-	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
-		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
-			pfctlinput(PRC_IFUP, ifa->ifa_addr);
+	int i;
+	struct ifaddr **ifa;
+	struct timespec	tv;
+
+	/* Wait until no one else is changing the up/down state */
+	while ((ifp->if_eflags & IFEF_UPDOWNCHANGE) != 0) {
+		tv.tv_sec = 0;
+		tv.tv_nsec = NSEC_PER_SEC / 10;
+		ifnet_lock_done(ifp);
+		msleep(&ifp->if_eflags, NULL, 0, "if_updown", &tv);
+		ifnet_lock_exclusive(ifp);
+	}
+
+	/* Verify that the interface isn't already in the right state */
+	if ((!up && (ifp->if_flags & IFF_UP) == 0) ||
+		(up && (ifp->if_flags & IFF_UP) == IFF_UP)) {
+		return;
+	}
+
+	/* Indicate that the up/down state is changing */
+	ifp->if_eflags |= IFEF_UPDOWNCHANGE;
+
+	/* Mark interface up or down */
+	if (up) {
+		ifp->if_flags |= IFF_UP;
+	}
+	else {
+		ifp->if_flags &= ~IFF_UP;
+	}
+
+	ifnet_touch_lastchange(ifp);
+
+	/* Drop the lock to notify addresses and route */
+	ifnet_lock_done(ifp);
+	if (ifnet_get_address_list(ifp, &ifa) == 0) {
+		for (i = 0; ifa[i] != 0; i++) {
+			pfctlinput(up ? PRC_IFUP : PRC_IFDOWN, ifa[i]->ifa_addr);
+		}
+		ifnet_free_address_list(ifa);
+	}
 	rt_ifmsg(ifp);
-	
+
+	/* Acquire the lock to clear the changing flag and flush the send queue */
+	ifnet_lock_exclusive(ifp);
+	if (!up)
+		if_qflush(&ifp->if_snd);
+	ifp->if_eflags &= ~IFEF_UPDOWNCHANGE;
+	wakeup(&ifp->if_eflags);
+
+	return;
 }
 
 /*
  * Mark an interface down and notify protocols of
  * the transition.
- * NOTE: must be called at splnet or eqivalent.
  */
 void
-if_down(ifp)
-	register struct ifnet *ifp;
+if_down(
+	struct ifnet *ifp)
 {
-
-	if_unroute(ifp, IFF_UP, AF_UNSPEC);
+	ifnet_lock_exclusive(ifp);
+	if_updown(ifp, 0);
+	ifnet_lock_done(ifp);
 }
 
 /*
  * Mark an interface up and notify protocols of
  * the transition.
- * NOTE: must be called at splnet or eqivalent.
  */
 void
-if_up(ifp)
-	register struct ifnet *ifp;
+if_up(
+	struct ifnet *ifp)
 {
-
-	if_route(ifp, IFF_UP, AF_UNSPEC);
+	ifnet_lock_exclusive(ifp);
+	if_updown(ifp, 1);
+	ifnet_lock_done(ifp);
 }
 
 /*
@@ -852,9 +919,9 @@ if_up(ifp)
  */
 static void
 if_qflush(ifq)
-	register struct ifqueue *ifq;
+	struct ifqueue *ifq;
 {
-	register struct mbuf *m, *n;
+	struct mbuf *m, *n;
 
 	n = ifq->ifq_head;
 	while ((m = n) != 0) {
@@ -904,12 +971,14 @@ ifunit(const char *name)
 	/*
 	 * Now search all the interfaces for this name/number
 	 */
-	for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_link.tqe_next) {
+	ifnet_head_lock_shared();
+	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
 		if (strcmp(ifp->if_name, namebuf))
 			continue;
 		if (unit == ifp->if_unit)
			break;
 	}
+	ifnet_head_done();
 	return (ifp);
 }
 
@@ -952,8 +1021,8 @@ ifioctl(so, cmd, data, p)
 	caddr_t data;
 	struct proc *p;
 {
-	register struct ifnet *ifp;
-	register struct ifreq *ifr;
+	struct ifnet *ifp;
+	struct ifreq *ifr;
 	struct ifstat *ifs;
 	int error = 0;
 	short oif_flags;
@@ -961,26 +1030,41 @@ ifioctl(so, cmd, data, p)
 	struct net_event_data	ev_data;
 
 	switch (cmd) {
-	case SIOCGIFCONF:
 	case OSIOCGIFCONF:
-		return (ifconf(cmd, data));
+	case SIOCGIFCONF64:
+	{
+		struct ifconf64 * ifc = (struct ifconf64 *)data;
+		user_addr_t user_addr;
+
+		user_addr = proc_is64bit(p)
+			? 
ifc->ifc_req64 : CAST_USER_ADDR_T(ifc->ifc_req);
+		return (ifconf(cmd, user_addr, &ifc->ifc_len));
+	}
+	break;
 	}
 
 	ifr = (struct ifreq *)data;
-
 	switch (cmd) {
 	case SIOCIFCREATE:
 	case SIOCIFDESTROY:
-		error = suser(p->p_ucred, &p->p_acflag);
+		error = proc_suser(p);
 		if (error)
 			return (error);
 		return ((cmd == SIOCIFCREATE) ?
			if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name)) :
			if_clone_destroy(ifr->ifr_name));
-#if 0
+#if IF_CLONE_LIST
 	case SIOCIFGCLONERS:
-		return (if_clone_list((struct if_clonereq *)data));
-#endif 0
+	case SIOCIFGCLONERS64:
+	{
+		struct if_clonereq64 * ifcr = (struct if_clonereq64 *)data;
+		user_addr_t user_addr = proc_is64bit(p)
+			? ifcr->ifcr_ifcru.ifcru_buffer64
+			: CAST_USER_ADDR_T(ifcr->ifcr_ifcru.ifcru_buffer32);
+		return (if_clone_list(ifcr->ifcr_count, &ifcr->ifcr_total,
			user_addr));
+	}
+#endif IF_CLONE_LIST
 	}
 
 	ifp = ifunit(ifr->ifr_name);
@@ -989,43 +1073,35 @@ ifioctl(so, cmd, data, p)
 	switch (cmd) {
 
 	case SIOCGIFFLAGS:
+		ifnet_lock_shared(ifp);
 		ifr->ifr_flags = ifp->if_flags;
+		ifnet_lock_done(ifp);
 		break;
 
 	case SIOCGIFMETRIC:
+		ifnet_lock_shared(ifp);
 		ifr->ifr_metric = ifp->if_metric;
+		ifnet_lock_done(ifp);
 		break;
 
 	case SIOCGIFMTU:
+		ifnet_lock_shared(ifp);
 		ifr->ifr_mtu = ifp->if_mtu;
+		ifnet_lock_done(ifp);
 		break;
 
 	case SIOCGIFPHYS:
+		ifnet_lock_shared(ifp);
 		ifr->ifr_phys = ifp->if_physical;
+		ifnet_lock_done(ifp);
 		break;
 
 	case SIOCSIFFLAGS:
-		error = suser(p->p_ucred, &p->p_acflag);
+		error = proc_suser(p);
 		if (error)
 			return (error);
-#ifndef __APPLE__
-		if (ifp->if_flags & IFF_SMART) {
-			/* Smart drivers twiddle their own routes */
-		} else
-#endif
-		if (ifp->if_flags & IFF_UP &&
-		    (ifr->ifr_flags & IFF_UP) == 0) {
-			int s = splimp();
-			if_down(ifp);
-			splx(s);
-		} else if (ifr->ifr_flags & IFF_UP &&
-		    (ifp->if_flags & IFF_UP) == 0) {
-			int s = splimp();
-			if_up(ifp);
-			splx(s);
-		}
-		ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
-			(ifr->ifr_flags &~ IFF_CANTCHANGE);
+
+		ifnet_set_flags(ifp, ifr->ifr_flags, ~IFF_CANTCHANGE);
 
 		error = dlil_ioctl(so->so_proto->pr_domain->dom_family,
 				   ifp, cmd, (caddr_t) data);
@@ -1044,11 +1120,11 @@ ifioctl(so, cmd, data, p)
 			ev_msg.dv[1].data_length = 0;
 			kev_post_msg(&ev_msg);
 		}
-		getmicrotime(&ifp->if_lastchange);
+		ifnet_touch_lastchange(ifp);
 		break;
 
 	case SIOCSIFMETRIC:
-		error = suser(p->p_ucred, &p->p_acflag);
+		error = proc_suser(p);
 		if (error)
 			return (error);
 		ifp->if_metric = ifr->ifr_metric;
@@ -1068,11 +1144,11 @@ ifioctl(so, cmd, data, p)
 		ev_msg.dv[1].data_length = 0;
 		kev_post_msg(&ev_msg);
 
-		getmicrotime(&ifp->if_lastchange);
+		ifnet_touch_lastchange(ifp);
 		break;
 
 	case SIOCSIFPHYS:
-		error = suser(p->p_ucred, &p->p_acflag);
+		error = proc_suser(p);
 		if (error)
 			return error;
 
@@ -1093,7 +1169,7 @@ ifioctl(so, cmd, data, p)
 			ev_msg.dv[1].data_length = 0;
 			kev_post_msg(&ev_msg);
 
-			getmicrotime(&ifp->if_lastchange);
+			ifnet_touch_lastchange(ifp);
 		}
 		return(error);
 
@@ -1101,7 +1177,7 @@ ifioctl(so, cmd, data, p)
 	{
 		u_long oldmtu = ifp->if_mtu;
 
-		error = suser(p->p_ucred, &p->p_acflag);
+		error = proc_suser(p);
 		if (error)
 			return (error);
 		if (ifp->if_ioctl == NULL)
@@ -1126,7 +1202,7 @@ ifioctl(so, cmd, data, p)
 			ev_msg.dv[1].data_length = 0;
 			kev_post_msg(&ev_msg);
 
-			getmicrotime(&ifp->if_lastchange);
+			ifnet_touch_lastchange(ifp);
 			rt_ifmsg(ifp);
 		}
 		/*
@@ -1142,7 +1218,7 @@ ifioctl(so, cmd, data, p)
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
-		error = suser(p->p_ucred, &p->p_acflag);
+		error = proc_suser(p);
 		if (error)
 			return (error);
 
@@ -1157,8 +1233,7 @@ ifioctl(so, cmd, data, p)
 #endif
 
 		if (cmd == SIOCADDMULTI) {
-			struct ifmultiaddr *ifma;
-			error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
+			error = if_addmulti(ifp, &ifr->ifr_addr, NULL);
 			ev_msg.event_code = KEV_DL_ADDMULTI;
 		} else {
 			error = if_delmulti(ifp, &ifr->ifr_addr);
@@ -1177,14 +1252,10 @@ ifioctl(so, cmd, data, p)
 			ev_msg.dv[1].data_length = 0;
 			kev_post_msg(&ev_msg);
 
-			getmicrotime(&ifp->if_lastchange);
+			ifnet_touch_lastchange(ifp);
 		}
 		return error;
 
-	case SIOCSETVLAN:
-		if (ifp->if_type != IFT_L2VLAN) {
-			return (EOPNOTSUPP);
-		}
 	case SIOCSIFPHYADDR:
 	case SIOCDIFPHYADDR:
 #ifdef INET6
@@ -1194,7 +1265,10 @@ ifioctl(so, cmd, data, p)
 	case SIOCSIFMEDIA:
 	case SIOCSIFGENERIC:
 	case SIOCSIFLLADDR:
-		error = suser(p->p_ucred, &p->p_acflag);
+	case SIOCSIFALTMTU:
+	case SIOCSIFVLAN:
+	case SIOCSIFBOND:
+		error = proc_suser(p);
 		if (error)
 			return (error);
 
@@ -1202,7 +1276,7 @@ ifioctl(so, cmd, data, p)
 				 ifp, cmd, (caddr_t) data);
 
 		if (error == 0)
-			getmicrotime(&ifp->if_lastchange);
+			ifnet_touch_lastchange(ifp);
 		return error;
 
 	case SIOCGIFSTATUS:
@@ -1214,13 +1288,11 @@ ifioctl(so, cmd, data, p)
 	case SIOCGLIFPHYADDR:
 	case SIOCGIFMEDIA:
 	case SIOCGIFGENERIC:
-
+	case SIOCGIFDEVMTU:
 		return dlil_ioctl(so->so_proto->pr_domain->dom_family,
 				  ifp, cmd, (caddr_t) data);
 
-	case SIOCGETVLAN:
-		if (ifp->if_type != IFT_L2VLAN) {
-			return (EOPNOTSUPP);
-		}
+	case SIOCGIFVLAN:
+	case SIOCGIFBOND:
 		return dlil_ioctl(so->so_proto->pr_domain->dom_family,
 				  ifp, cmd, (caddr_t) data);
 
@@ -1228,10 +1300,11 @@ ifioctl(so, cmd, data, p)
 		oif_flags = ifp->if_flags;
 		if (so->so_proto == 0)
 			return (EOPNOTSUPP);
-#if !COMPAT_43
-		return ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd,
-								 data,
-								 ifp, p));
+#if !COMPAT_43_SOCKET
+		socket_lock(so, 1);
+		error = (*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, p);
+		socket_unlock(so, 1);
+		return (error);
 #else
 	{
 		int ocmd = cmd;
@@ -1269,10 +1342,10 @@ ifioctl(so, cmd, data, p)
 		case OSIOCGIFNETMASK:
 			cmd = SIOCGIFNETMASK;
 		}
-		error =  ((*so->so_proto->pr_usrreqs->pru_control)(so,
-								   cmd,
-								   data,
-								   ifp, p));
+		socket_lock(so, 1);
+		error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd,
				data, ifp, p));
+		socket_unlock(so, 1);
 		switch (ocmd) {
 
 		case OSIOCGIFADDR:
@@ -1283,9 +1356,9 @@ ifioctl(so, cmd, data, p)
 		}
 	}
-#endif /* COMPAT_43 */
+#endif /* COMPAT_43_SOCKET */
 
-	if (error == EOPNOTSUPP)
+	if (error == EOPNOTSUPP || error == ENOTSUP)
 		error = dlil_ioctl(so->so_proto->pr_domain->dom_family,
 				   ifp, cmd, (caddr_t) data);
 
@@ -1294,47 +1367,74 @@ ifioctl(so, cmd, data, p)
 	return (0);
 }
 
+int
+ifioctllocked(so, cmd, data, p)
+	struct socket *so;
+	u_long cmd;
+	caddr_t data;
+	struct proc *p;
+{
+	int error;
+
+	socket_unlock(so, 0);
+	error = ifioctl(so, cmd, data, p);
+	socket_lock(so, 0);
+	return(error);
+}
+
 /*
 * Set/clear promiscuous mode on interface ifp based on the truth value
 * of pswitch.  The calls are reference counted so that only the first
 * "on" request actually has an effect, as does the final "off" request.
 * Results are undefined if the "off" and "on" requests are not matched.
 */
-int
-ifpromisc(ifp, pswitch)
-	struct ifnet *ifp;
-	int pswitch;
+errno_t
+ifnet_set_promiscuous(
+	ifnet_t	ifp,
+	int pswitch)
 {
 	struct ifreq ifr;
-	int error;
+	int error = 0;
 	int oldflags;
+	int locked = 0;
+	int changed = 0;
 
+	ifnet_lock_exclusive(ifp);
+	locked = 1;
 	oldflags = ifp->if_flags;
 	if (pswitch) {
 		/*
 		 * If the device is not configured up, we cannot put it in
 		 * promiscuous mode.
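
/*
 * ifnet_set_promiscuous() above reference counts enables through
 * if_pcount, so only the first enable and the matching last disable are
 * pushed down to the driver.  A balanced client might look like this
 * sketch (illustrative only):
 */
static errno_t
with_promiscuous(ifnet_t ifp)
{
	errno_t error = ifnet_set_promiscuous(ifp, 1);

	if (error != 0)
		return error;	/* e.g. ENETDOWN if the interface is down */
	/* ... observe traffic ... */
	return ifnet_set_promiscuous(ifp, 0);	/* balance the enable */
}
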
*/ - if ((ifp->if_flags & IFF_UP) == 0) - return (ENETDOWN); - if (ifp->if_pcount++ != 0) - return (0); + if ((ifp->if_flags & IFF_UP) == 0) { + error = ENETDOWN; + goto done; + } + if (ifp->if_pcount++ != 0) { + goto done; + } ifp->if_flags |= IFF_PROMISC; - log(LOG_INFO, "%s%d: promiscuous mode enabled\n", - ifp->if_name, ifp->if_unit); } else { if (--ifp->if_pcount > 0) - return (0); + goto done; ifp->if_flags &= ~IFF_PROMISC; - log(LOG_INFO, "%s%d: promiscuous mode disabled\n", - ifp->if_name, ifp->if_unit); } ifr.ifr_flags = ifp->if_flags; + locked = 0; + ifnet_lock_done(ifp); error = dlil_ioctl(0, ifp, SIOCSIFFLAGS, (caddr_t)&ifr); if (error == 0) rt_ifmsg(ifp); else ifp->if_flags = oldflags; +done: + if (locked) ifnet_lock_done(ifp); + if (changed) { + log(LOG_INFO, "%s%d: promiscuous mode %s\n", + ifp->if_name, ifp->if_unit, + pswitch != 0 ? "enabled" : "disabled"); + } return error; } @@ -1346,20 +1446,19 @@ ifpromisc(ifp, pswitch) */ /*ARGSUSED*/ static int -ifconf(cmd, data) - u_long cmd; - caddr_t data; +ifconf(u_long cmd, user_addr_t ifrp, int * ret_space) { - register struct ifconf *ifc = (struct ifconf *)data; - register struct ifnet *ifp = ifnet.tqh_first; - register struct ifaddr *ifa; - struct ifreq ifr, *ifrp; - int space = ifc->ifc_len, error = 0; - - ifrp = ifc->ifc_req; - for (; space > sizeof (ifr) && ifp; ifp = ifp->if_link.tqe_next) { + struct ifnet *ifp = NULL; + struct ifaddr *ifa; + struct ifreq ifr; + int error = 0; + size_t space; + + space = *ret_space; + ifnet_head_lock_shared(); + for (ifp = ifnet_head.tqh_first; space > sizeof(ifr) && ifp; ifp = ifp->if_link.tqe_next) { char workbuf[64]; - int ifnlen, addrs; + size_t ifnlen, addrs; ifnlen = snprintf(workbuf, sizeof(workbuf), "%s%d", ifp->if_name, ifp->if_unit); @@ -1369,63 +1468,64 @@ ifconf(cmd, data) } else { strcpy(ifr.ifr_name, workbuf); } + + ifnet_lock_shared(ifp); addrs = 0; ifa = ifp->if_addrhead.tqh_first; for ( ; space > sizeof (ifr) && ifa; ifa = ifa->ifa_link.tqe_next) { - register struct sockaddr *sa = ifa->ifa_addr; + struct sockaddr *sa = ifa->ifa_addr; #ifndef __APPLE__ if (curproc->p_prison && prison_if(curproc, sa)) continue; #endif addrs++; -#ifdef COMPAT_43 +#if COMPAT_43_SOCKET if (cmd == OSIOCGIFCONF) { struct osockaddr *osa = (struct osockaddr *)&ifr.ifr_addr; ifr.ifr_addr = *sa; osa->sa_family = sa->sa_family; - error = copyout((caddr_t)&ifr, (caddr_t)ifrp, - sizeof (ifr)); - ifrp++; + error = copyout((caddr_t)&ifr, ifrp, sizeof(ifr)); + ifrp += sizeof(struct ifreq); } else #endif if (sa->sa_len <= sizeof(*sa)) { ifr.ifr_addr = *sa; - error = copyout((caddr_t)&ifr, (caddr_t)ifrp, - sizeof (ifr)); - ifrp++; + error = copyout((caddr_t)&ifr, ifrp, sizeof(ifr)); + ifrp += sizeof(struct ifreq); } else { - if (space < sizeof (ifr) + sa->sa_len - - sizeof(*sa)) + if (space < sizeof (ifr) + sa->sa_len - sizeof(*sa)) break; space -= sa->sa_len - sizeof(*sa); - error = copyout((caddr_t)&ifr, (caddr_t)ifrp, - sizeof (ifr.ifr_name)); - if (error == 0) + error = copyout((caddr_t)&ifr, ifrp, sizeof (ifr.ifr_name)); + if (error == 0) { error = copyout((caddr_t)sa, - (caddr_t)&ifrp->ifr_addr, sa->sa_len); - ifrp = (struct ifreq *) - (sa->sa_len + (caddr_t)&ifrp->ifr_addr); + (ifrp + offsetof(struct ifreq, ifr_addr)), + sa->sa_len); + } + ifrp += (sa->sa_len + offsetof(struct ifreq, ifr_addr)); } if (error) break; space -= sizeof (ifr); } + ifnet_lock_done(ifp); + if (error) break; if (!addrs) { bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr)); - error = copyout((caddr_t)&ifr, (caddr_t)ifrp, - 
sizeof (ifr)); + error = copyout((caddr_t)&ifr, ifrp, sizeof (ifr)); if (error) break; space -= sizeof (ifr); - ifrp++; + ifrp += sizeof(struct ifreq); } } - ifc->ifc_len -= space; + ifnet_head_done(); + *ret_space -= space; return (error); } @@ -1438,12 +1538,14 @@ if_allmulti(ifp, onswitch) int onswitch; { int error = 0; - int s = splimp(); + int modified = 0; + + ifnet_lock_exclusive(ifp); if (onswitch) { if (ifp->if_amcount++ == 0) { ifp->if_flags |= IFF_ALLMULTI; - error = dlil_ioctl(0, ifp, SIOCSIFFLAGS, (caddr_t) 0); + modified = 1; } } else { if (ifp->if_amcount > 1) { @@ -1451,207 +1553,250 @@ if_allmulti(ifp, onswitch) } else { ifp->if_amcount = 0; ifp->if_flags &= ~IFF_ALLMULTI; - error = dlil_ioctl(0, ifp, SIOCSIFFLAGS, (caddr_t) 0); + modified = 1; } } - splx(s); + ifnet_lock_done(ifp); + + if (modified) + error = dlil_ioctl(0, ifp, SIOCSIFFLAGS, (caddr_t) 0); if (error == 0) rt_ifmsg(ifp); return error; } -/* - * Add a multicast listenership to the interface in question. - * The link layer provides a routine which converts - */ -int -if_addmulti(ifp, sa, retifma) - struct ifnet *ifp; /* interface to manipulate */ - struct sockaddr *sa; /* address to add */ - struct ifmultiaddr **retifma; +void +ifma_reference( + struct ifmultiaddr *ifma) { - struct sockaddr *llsa = 0; - struct sockaddr *dupsa; - int error, s; - struct ifmultiaddr *ifma; - struct rslvmulti_req rsreq; + if (OSIncrementAtomic((SInt32 *)&ifma->ifma_refcount) <= 0) + panic("ifma_reference: ifma already released or invalid\n"); +} - /* - * If the matching multicast address already exists - * then don't add a new one, just add a reference - */ +void +ifma_release( + struct ifmultiaddr *ifma) +{ + while (ifma) { + struct ifmultiaddr *next; + int32_t prevValue = OSDecrementAtomic((SInt32 *)&ifma->ifma_refcount); + if (prevValue < 1) + panic("ifma_release: ifma already released or invalid\n"); + if (prevValue != 1) + break; + + /* Allow the allocator of the protospec to free it */ + if (ifma->ifma_protospec && ifma->ifma_free) { + ifma->ifma_free(ifma->ifma_protospec); + } + + next = ifma->ifma_ll; + FREE(ifma->ifma_addr, M_IFMADDR); + FREE(ifma, M_IFMADDR); + ifma = next; + } +} + + /* + * Find an ifmultiaddr that matches a socket address on an interface. + * + * Caller is responsible for holding the ifnet_lock while calling + * this function. + */ +static int +if_addmulti_doesexist( + struct ifnet *ifp, + const struct sockaddr *sa, + struct ifmultiaddr **retifma) +{ + struct ifmultiaddr *ifma; for (ifma = ifp->if_multiaddrs.lh_first; ifma; ifma = ifma->ifma_link.le_next) { if (equal(sa, ifma->ifma_addr)) { - ifma->ifma_refcount++; - if (retifma) + ifma->ifma_usecount++; + if (retifma) { *retifma = ifma; + ifma_reference(*retifma); + } return 0; } } + + return ENOENT; +} + +/* + * Add a multicast listenership to the interface in question. + * The link layer provides a routine which converts + */ +int +if_addmulti( + struct ifnet *ifp, /* interface to manipulate */ + const struct sockaddr *sa, /* address to add */ + struct ifmultiaddr **retifma) +{ + struct sockaddr_storage storage; + struct sockaddr *llsa = NULL; + struct sockaddr *dupsa; + int error; + struct ifmultiaddr *ifma; + struct ifmultiaddr *llifma = NULL; + + ifnet_lock_exclusive(ifp); + error = if_addmulti_doesexist(ifp, sa, retifma); + ifnet_lock_done(ifp); + + if (error == 0) + return 0; /* * Give the link layer a chance to accept/reject it, and also * find out which AF_LINK address this maps to, if it isn't one * already. 
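
/*
 * The multicast rework around this point keeps two counters per record:
 * ifma_usecount counts repeated if_addmulti() requests for the same
 * address, while ifma_refcount counts outstanding pointers managed by
 * ifma_reference()/ifma_release().  A balanced join and leave, assuming
 * 'group' was built by the caller (illustrative only):
 */
static int
join_then_leave(struct ifnet *ifp, const struct sockaddr *group)
{
	struct ifmultiaddr *ifma = NULL;
	int error = if_addmulti(ifp, group, &ifma);

	if (error != 0)
		return error;
	/* ... member of the group; ifma is held by our reference ... */
	ifma_release(ifma);		/* drop the pointer reference */
	return if_delmulti(ifp, group);	/* undo the membership itself */
}
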
*/ - rsreq.sa = sa; - rsreq.llsa = &llsa; - - error = dlil_ioctl(sa->sa_family, ifp, SIOCRSLVMULTI, (caddr_t) &rsreq); + error = dlil_resolve_multi(ifp, sa, (struct sockaddr*)&storage, sizeof(storage)); + if (error == 0 && storage.ss_len != 0) { + MALLOC(llsa, struct sockaddr*, storage.ss_len, M_IFMADDR, M_WAITOK); + MALLOC(llifma, struct ifmultiaddr *, sizeof *llifma, M_IFMADDR, M_WAITOK); + bcopy(&storage, llsa, storage.ss_len); + } /* to be similar to FreeBSD */ if (error == EOPNOTSUPP) error = 0; - if (error) - return error; + if (error) { + return error; + } + /* Allocate while we aren't holding any locks */ MALLOC(ifma, struct ifmultiaddr *, sizeof *ifma, M_IFMADDR, M_WAITOK); MALLOC(dupsa, struct sockaddr *, sa->sa_len, M_IFMADDR, M_WAITOK); bcopy(sa, dupsa, sa->sa_len); + + ifnet_lock_exclusive(ifp); + /* + * Check again for the matching multicast. + */ + if ((error = if_addmulti_doesexist(ifp, sa, retifma)) == 0) { + ifnet_lock_done(ifp); + FREE(ifma, M_IFMADDR); + FREE(dupsa, M_IFMADDR); + if (llsa) + FREE(llsa, M_IFMADDR); + return 0; + } + bzero(ifma, sizeof(*ifma)); ifma->ifma_addr = dupsa; - ifma->ifma_lladdr = llsa; ifma->ifma_ifp = ifp; + ifma->ifma_usecount = 1; ifma->ifma_refcount = 1; - ifma->ifma_protospec = 0; - rt_newmaddrmsg(RTM_NEWMADDR, ifma); - - /* - * Some network interfaces can scan the address list at - * interrupt time; lock them out. - */ - s = splimp(); - LIST_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); - splx(s); - if (retifma) - *retifma = ifma; - - if (llsa != 0) { - for (ifma = ifp->if_multiaddrs.lh_first; ifma; - ifma = ifma->ifma_link.le_next) { - if (equal(ifma->ifma_addr, llsa)) - break; - } - if (ifma) { - ifma->ifma_refcount++; + + if (llifma != 0) { + if (if_addmulti_doesexist(ifp, llsa, &ifma->ifma_ll) == 0) { + FREE(llsa, M_IFMADDR); + FREE(llifma, M_IFMADDR); } else { - MALLOC(ifma, struct ifmultiaddr *, sizeof *ifma, - M_IFMADDR, M_WAITOK); - MALLOC(dupsa, struct sockaddr *, llsa->sa_len, - M_IFMADDR, M_WAITOK); - bcopy(llsa, dupsa, llsa->sa_len); - ifma->ifma_addr = dupsa; - ifma->ifma_lladdr = 0; - ifma->ifma_ifp = ifp; - ifma->ifma_refcount = 1; - s = splimp(); - LIST_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); - splx(s); + bzero(llifma, sizeof(*llifma)); + llifma->ifma_addr = llsa; + llifma->ifma_ifp = ifp; + llifma->ifma_usecount = 1; + llifma->ifma_refcount = 1; + LIST_INSERT_HEAD(&ifp->if_multiaddrs, llifma, ifma_link); + + ifma->ifma_ll = llifma; + ifma_reference(ifma->ifma_ll); } } + + LIST_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); + + if (retifma) { + *retifma = ifma; + ifma_reference(*retifma); + } + + ifnet_lock_done(ifp); + + if (llsa != 0) + rt_newmaddrmsg(RTM_NEWMADDR, ifma); + /* * We are certain we have added something, so call down to the * interface to let them know about it. */ - s = splimp(); dlil_ioctl(0, ifp, SIOCADDMULTI, (caddr_t) 0); - splx(s); - + return 0; } int -if_delmultiaddr(struct ifmultiaddr *ifma) +if_delmultiaddr( + struct ifmultiaddr *ifma, + int locked) { - struct sockaddr *sa; struct ifnet *ifp; + int do_del_multi = 0; - /* Verify ifma is valid */ - { - struct ifmultiaddr *match = NULL; - for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_link.tqe_next) { - for (match = ifp->if_multiaddrs.lh_first; match; match = match->ifma_link.le_next) { - if (match->ifma_ifp != ifp) { - printf("if_delmultiaddr: ifma (%x) on ifp i(%s) is stale\n", - match, if_name(ifp)); - return (0) ; /* swallow error ? 
*/ - } - if (match == ifma) - break; - } - if (match == ifma) - break; + ifp = ifma->ifma_ifp; + + if (!locked && ifp) { + ifnet_lock_exclusive(ifp); + } + + while (ifma != NULL) { + struct ifmultiaddr *ll_ifma; + + if (ifma->ifma_usecount > 1) { + ifma->ifma_usecount--; + break; } - if (match != ifma) { - for (match = ifma_lostlist.lh_first; match; match = match->ifma_link.le_next) { - if (match->ifma_ifp != NULL) { - printf("if_delmultiaddr: item on lost list (%x) contains non-null ifp=%s\n", - match, if_name(match->ifma_ifp)); - return (0) ; /* swallow error ? */ - } - if (match == ifma) - break; - } + + if (ifp) + LIST_REMOVE(ifma, ifma_link); + + ll_ifma = ifma->ifma_ll; + + if (ll_ifma) { /* send a routing msg for network addresses only */ + if (ifp) + ifnet_lock_done(ifp); + rt_newmaddrmsg(RTM_DELMADDR, ifma); + if (ifp) + ifnet_lock_exclusive(ifp); } - if (match != ifma) { - printf("if_delmultiaddr: ifma 0x%X is invalid\n", ifma); - return 0; + /* + * Make sure the interface driver is notified + * in the case of a link layer mcast group being left. + */ + if (ll_ifma == 0) { + if (ifp && ifma->ifma_addr->sa_family == AF_LINK) + do_del_multi = 1; + break; } + + if (ifp) + ifma_release(ifma); + + ifma = ll_ifma; } - if (ifma->ifma_refcount > 1) { - ifma->ifma_refcount--; - return 0; + if (!locked && ifp) { + /* This wasn't initially locked, we should unlock it */ + ifnet_lock_done(ifp); } - - sa = ifma->ifma_lladdr; - - if (sa) /* send a routing msg for network addresses only */ - rt_newmaddrmsg(RTM_DELMADDR, ifma); - - ifp = ifma->ifma_ifp; - LIST_REMOVE(ifma, ifma_link); - /* - * Make sure the interface driver is notified - * in the case of a link layer mcast group being left. - */ - if (ifp && ifma->ifma_addr->sa_family == AF_LINK && sa == 0) + if (do_del_multi) { + if (locked) + ifnet_lock_done(ifp); dlil_ioctl(0, ifp, SIOCDELMULTI, 0); - FREE(ifma->ifma_addr, M_IFMADDR); - FREE(ifma, M_IFMADDR); - if (sa == 0) - return 0; - - /* - * Now look for the link-layer address which corresponds to - * this network address. It had been squirreled away in - * ifma->ifma_lladdr for this purpose (so we don't have - * to call SIOCRSLVMULTI again), and we saved that - * value in sa above. If some nasty deleted the - * link-layer address out from underneath us, we can deal because - * the address we stored was is not the same as the one which was - * in the record for the link-layer address. (So we don't complain - * in that case.) - */ - if (ifp) - ifma = ifp->if_multiaddrs.lh_first; - else - ifma = ifma_lostlist.lh_first; - for (; ifma; ifma = ifma->ifma_link.le_next) - if (equal(sa, ifma->ifma_addr)) - break; - - FREE(sa, M_IFMADDR); - if (ifma == 0) { - return 0; + if (locked) + ifnet_lock_exclusive(ifp); } - - return if_delmultiaddr(ifma); + + return 0; } /* @@ -1659,20 +1804,27 @@ if_delmultiaddr(struct ifmultiaddr *ifma) * if the request does not match an existing membership. 
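
/*
 * Note the discipline in if_delmultiaddr() above: the ifnet lock is
 * dropped around rt_newmaddrmsg() and retaken afterwards, so routing
 * socket listeners never run with the interface lock held.  The bare
 * idiom, with a hypothetical helper name (illustrative only):
 */
static void
notify_without_lock(struct ifnet *ifp, struct ifmultiaddr *ifma)
{
	/* caller holds the exclusive ifnet lock on entry and on exit */
	ifnet_lock_done(ifp);
	rt_newmaddrmsg(RTM_DELMADDR, ifma);	/* may block or re-enter */
	ifnet_lock_exclusive(ifp);
}
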
*/ int -if_delmulti(ifp, sa) - struct ifnet *ifp; - struct sockaddr *sa; +if_delmulti( + struct ifnet *ifp, + const struct sockaddr *sa) { struct ifmultiaddr *ifma; + int retval = 0; + ifnet_lock_exclusive(ifp); for (ifma = ifp->if_multiaddrs.lh_first; ifma; ifma = ifma->ifma_link.le_next) if (equal(sa, ifma->ifma_addr)) break; - if (ifma == 0) + if (ifma == 0) { + ifnet_lock_done(ifp); return ENOENT; + } + + retval = if_delmultiaddr(ifma, 1); + ifnet_lock_done(ifp); - return if_delmultiaddr(ifma); + return retval; } @@ -1690,15 +1842,17 @@ if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) struct ifmultiaddr * ifmaof_ifpforaddr(sa, ifp) - struct sockaddr *sa; + const struct sockaddr *sa; struct ifnet *ifp; { struct ifmultiaddr *ifma; + ifnet_lock_shared(ifp); for (ifma = ifp->if_multiaddrs.lh_first; ifma; ifma = ifma->ifma_link.le_next) if (equal(ifma->ifma_addr, sa)) break; + ifnet_lock_done(ifp); return ifma; } @@ -1711,17 +1865,21 @@ SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); * Shutdown all network activity. Used boot() when halting * system. */ +int if_down_all(void); int if_down_all(void) { - struct ifnet *ifp; - int s; - - s = splnet(); - TAILQ_FOREACH(ifp, &ifnet, if_link) - if_down(ifp); + struct ifnet **ifp; + u_int32_t count; + u_int32_t i; - splx(s); - return(0); /* Sheesh */ + if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp, &count) != 0) { + for (i = 0; i < count; i++) { + if_down(ifp[i]); + } + ifnet_list_free(ifp); + } + + return 0; } /* @@ -1740,9 +1898,9 @@ int if_down_all(void) * */ static int -if_rtdel(rn, arg) - struct radix_node *rn; - void *arg; +if_rtdel( + struct radix_node *rn, + void *arg) { struct rtentry *rt = (struct rtentry *)rn; struct ifnet *ifp = arg; @@ -1757,7 +1915,7 @@ if_rtdel(rn, arg) if ((rt->rt_flags & RTF_UP) == 0) return (0); - err = rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway, + err = rtrequest_locked(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, (struct rtentry **) NULL); if (err) { @@ -1772,12 +1930,111 @@ if_rtdel(rn, arg) * Removes routing table reference to a given interfacei * for a given protocol family */ - void if_rtproto_del(struct ifnet *ifp, int protocol) { struct radix_node_head *rnh; - if ((protocol <= AF_MAX) && ((rnh = rt_tables[protocol]) != NULL) && (ifp != NULL)) + if ((protocol <= AF_MAX) && ((rnh = rt_tables[protocol]) != NULL) && (ifp != NULL)) { + lck_mtx_lock(rt_mtx); (void) rnh->rnh_walktree(rnh, if_rtdel, ifp); + lck_mtx_unlock(rt_mtx); + } +} + +extern lck_spin_t *dlil_input_lock; + +__private_extern__ void +if_data_internal_to_if_data( + const struct if_data_internal *if_data_int, + struct if_data *if_data) +{ +#define COPYFIELD(fld) if_data->fld = if_data_int->fld +#define COPYFIELD32(fld) if_data->fld = (u_int32_t)(if_data_int->fld) + COPYFIELD(ifi_type); + COPYFIELD(ifi_typelen); + COPYFIELD(ifi_physical); + COPYFIELD(ifi_addrlen); + COPYFIELD(ifi_hdrlen); + COPYFIELD(ifi_recvquota); + COPYFIELD(ifi_xmitquota); + if_data->ifi_unused1 = 0; + COPYFIELD(ifi_mtu); + COPYFIELD(ifi_metric); + if (if_data_int->ifi_baudrate & 0xFFFFFFFF00000000LL) { + if_data->ifi_baudrate = 0xFFFFFFFF; + } + else { + COPYFIELD32(ifi_baudrate); + } + + lck_spin_lock(dlil_input_lock); + COPYFIELD32(ifi_ipackets); + COPYFIELD32(ifi_ierrors); + COPYFIELD32(ifi_opackets); + COPYFIELD32(ifi_oerrors); + COPYFIELD32(ifi_collisions); + COPYFIELD32(ifi_ibytes); + COPYFIELD32(ifi_obytes); + COPYFIELD32(ifi_imcasts); + COPYFIELD32(ifi_omcasts); + COPYFIELD32(ifi_iqdrops); + 
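	/*
	 * Narrowing note: in this 32-bit export path the wide counters are
	 * copied with plain casts, while ifi_baudrate above is clamped to
	 * 0xFFFFFFFF instead of being truncated.  The clamping idiom in
	 * isolation (illustrative only):
	 *
	 *	u_int32_t
	 *	clamp64to32(u_int64_t v)
	 *	{
	 *		return (v > 0xFFFFFFFFULL) ? 0xFFFFFFFFU : (u_int32_t)v;
	 *	}
	 */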
COPYFIELD32(ifi_noproto); + COPYFIELD32(ifi_recvtiming); + COPYFIELD32(ifi_xmittiming); + COPYFIELD(ifi_lastchange); + lck_spin_unlock(dlil_input_lock); + +#if IF_LASTCHANGEUPTIME + if_data->ifi_lastchange.tv_sec += boottime_sec(); +#endif + + if_data->ifi_unused2 = 0; + COPYFIELD(ifi_hwassist); + if_data->ifi_reserved1 = 0; + if_data->ifi_reserved2 = 0; +#undef COPYFIELD32 +#undef COPYFIELD +} + +__private_extern__ void +if_data_internal_to_if_data64( + const struct if_data_internal *if_data_int, + struct if_data64 *if_data64) +{ +#define COPYFIELD(fld) if_data64->fld = if_data_int->fld + COPYFIELD(ifi_type); + COPYFIELD(ifi_typelen); + COPYFIELD(ifi_physical); + COPYFIELD(ifi_addrlen); + COPYFIELD(ifi_hdrlen); + COPYFIELD(ifi_recvquota); + COPYFIELD(ifi_xmitquota); + if_data64->ifi_unused1 = 0; + COPYFIELD(ifi_mtu); + COPYFIELD(ifi_metric); + COPYFIELD(ifi_baudrate); + + lck_spin_lock(dlil_input_lock); + COPYFIELD(ifi_ipackets); + COPYFIELD(ifi_ierrors); + COPYFIELD(ifi_opackets); + COPYFIELD(ifi_oerrors); + COPYFIELD(ifi_collisions); + COPYFIELD(ifi_ibytes); + COPYFIELD(ifi_obytes); + COPYFIELD(ifi_imcasts); + COPYFIELD(ifi_omcasts); + COPYFIELD(ifi_iqdrops); + COPYFIELD(ifi_noproto); + COPYFIELD(ifi_recvtiming); + COPYFIELD(ifi_xmittiming); + COPYFIELD(ifi_lastchange); + lck_spin_unlock(dlil_input_lock); + +#if IF_LASTCHANGEUPTIME + if_data64->ifi_lastchange.tv_sec += boottime_sec(); +#endif + +#undef COPYFIELD } diff --git a/bsd/net/if.h b/bsd/net/if.h index 1aa754ce5..1763d9383 100644 --- a/bsd/net/if.h +++ b/bsd/net/if.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -57,8 +57,11 @@ #ifndef _NET_IF_H_ #define _NET_IF_H_ -#include <sys/appleapiopts.h> +#define IF_NAMESIZE 16 + +#ifndef _POSIX_C_SOURCE +#include <sys/appleapiopts.h> #ifdef __APPLE__ /* * Define Data-Link event subclass, and associated @@ -82,17 +85,10 @@ #define KEV_DL_LINK_ON 13 #define KEV_DL_PROTO_ATTACHED 14 #define KEV_DL_PROTO_DETACHED 15 -#endif - -/* - * <net/if.h> does not depend on <sys/time.h> on most other systems. This - * helps userland compatability. (struct timeval ifi_lastchange) - */ -#include <sys/time.h> - +#define KEV_DL_LINK_ADDRESS_CHANGED 16 -#ifdef __APPLE__ #include <net/if_var.h> +#include <sys/types.h> #endif #ifdef KERNEL_PRIVATE @@ -101,6 +97,19 @@ struct if_clonereq { int ifcr_count; /* room for this many in user buffer */ char *ifcr_buffer; /* buffer for cloner names */ }; + +/* in-kernel, LP64-aware version of if_clonereq. all pointers + * grow when we're dealing with a 64-bit process. 
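
/*
 * The *64 structures introduced in this header carry user buffers as
 * explicit 64-bit fields so one ioctl path can serve 32-bit and 64-bit
 * processes.  A sketch of the kernel-side selection step, mirroring the
 * ifioctl() code earlier in this patch (illustrative only):
 */
static user_addr_t
cloner_buffer(struct if_clonereq64 *ifcr, struct proc *p)
{
	/* pick the union arm that matches the calling process's ABI */
	return proc_is64bit(p)
	    ? ifcr->ifcr_ifcru.ifcru_buffer64
	    : CAST_USER_ADDR_T(ifcr->ifcr_ifcru.ifcru_buffer32);
}
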
+ * WARNING - keep in sync with if_clonereq + */ +struct if_clonereq64 { + int ifcr_total; /* total cloners (out) */ + int ifcr_count; /* room for this many in user buffer */ + union { + u_int64_t ifcru_buffer64; + char * ifcru_buffer32; + } ifcr_ifcru; +}; #endif KERNEL_PRIVATE #define IFF_UP 0x1 /* interface is up */ @@ -126,45 +135,51 @@ struct if_clonereq { #define IFEF_AUTOCONFIGURING 0x1 #define IFEF_DVR_REENTRY_OK 0x20 /* When set, driver may be reentered from its own thread */ #define IFEF_ACCEPT_RTADVD 0x40 /* set to accept IPv6 router advertisement on the interface */ -#define IFEF_INUSE 0x40000000 /* DLIL ifnet recycler, ifnet in use */ +#define IFEF_DETACHING 0x80 /* Set when interface is detaching */ +#define IFEF_USEKPI 0x100 /* Set when interface is created through the KPIs */ +#define IFEF_VLAN 0x200 /* interface has one or more vlans */ +#define IFEF_BOND 0x400 /* interface is part of bond */ +#define IFEF_ARPLL 0x800 /* ARP for IPv4LL addresses on this port */ #define IFEF_REUSE 0x20000000 /* DLIL ifnet recycler, ifnet is not new */ -#endif /* KERNEL_PRIVATE */ - +#define IFEF_INUSE 0x40000000 /* DLIL ifnet recycler, ifnet in use */ +#define IFEF_UPDOWNCHANGE 0x80000000 /* Interface's up/down state is changing */ /* flags set internally only: */ #define IFF_CANTCHANGE \ (IFF_BROADCAST|IFF_POINTOPOINT|IFF_RUNNING|IFF_OACTIVE|\ IFF_SIMPLEX|IFF_MULTICAST|IFF_ALLMULTI) +#endif /* KERNEL_PRIVATE */ + #define IFQ_MAXLEN 50 #define IFNET_SLOWHZ 1 /* granularity is 1 second */ /* * Message format for use in obtaining information about interfaces - * from getkerninfo and the routing socket + * from sysctl and the routing socket */ struct if_msghdr { - u_short ifm_msglen; /* to skip over non-understood messages */ - u_char ifm_version; /* future binary compatability */ - u_char ifm_type; /* message type */ - int ifm_addrs; /* like rtm_addrs */ - int ifm_flags; /* value of if_flags */ - u_short ifm_index; /* index for associated ifp */ - struct if_data ifm_data;/* statistics and other data about if */ + unsigned short ifm_msglen; /* to skip over non-understood messages */ + unsigned char ifm_version; /* future binary compatability */ + unsigned char ifm_type; /* message type */ + int ifm_addrs; /* like rtm_addrs */ + int ifm_flags; /* value of if_flags */ + unsigned short ifm_index; /* index for associated ifp */ + struct if_data ifm_data; /* statistics and other data about if */ }; /* * Message format for use in obtaining information about interface addresses - * from getkerninfo and the routing socket + * from sysctl and the routing socket */ struct ifa_msghdr { - u_short ifam_msglen; /* to skip over non-understood messages */ - u_char ifam_version; /* future binary compatability */ - u_char ifam_type; /* message type */ - int ifam_addrs; /* like rtm_addrs */ - int ifam_flags; /* value of ifa_flags */ - u_short ifam_index; /* index for associated ifp */ - int ifam_metric; /* value of ifa_metric */ + unsigned short ifam_msglen; /* to skip over non-understood messages */ + unsigned char ifam_version; /* future binary compatability */ + unsigned char ifam_type; /* message type */ + int ifam_addrs; /* like rtm_addrs */ + int ifam_flags; /* value of ifa_flags */ + unsigned short ifam_index; /* index for associated ifp */ + int ifam_metric; /* value of ifa_metric */ }; /* @@ -172,12 +187,55 @@ struct ifa_msghdr { * from the routing socket */ struct ifma_msghdr { + unsigned short ifmam_msglen; /* to skip over non-understood messages */ + unsigned char ifmam_version; /* future binary 
compatability */ + unsigned char ifmam_type; /* message type */ + int ifmam_addrs; /* like rtm_addrs */ + int ifmam_flags; /* value of ifa_flags */ + unsigned short ifmam_index; /* index for associated ifp */ +}; + +/* + * Message format for use in obtaining information about interfaces + * from sysctl + */ +struct if_msghdr2 { + u_short ifm_msglen; /* to skip over non-understood messages */ + u_char ifm_version; /* future binary compatability */ + u_char ifm_type; /* message type */ + int ifm_addrs; /* like rtm_addrs */ + int ifm_flags; /* value of if_flags */ + u_short ifm_index; /* index for associated ifp */ + int ifm_snd_len; /* instantaneous length of send queue */ + int ifm_snd_maxlen; /* maximum length of send queue */ + int ifm_snd_drops; /* number of drops in send queue */ + int ifm_timer; /* time until if_watchdog called */ + struct if_data64 ifm_data; /* statistics and other data about if */ +}; + +/* + * Message format for use in obtaining information about multicast addresses + * from sysctl + */ +struct ifma_msghdr2 { u_short ifmam_msglen; /* to skip over non-understood messages */ u_char ifmam_version; /* future binary compatability */ u_char ifmam_type; /* message type */ int ifmam_addrs; /* like rtm_addrs */ int ifmam_flags; /* value of ifa_flags */ u_short ifmam_index; /* index for associated ifp */ + int32_t ifmam_refcount; +}; + +/* + * ifdevmtu: interface device mtu + * Used with SIOCGIFDEVMTU to get the current mtu in use by the device, + * as well as the minimum and maximum mtu allowed by the device. + */ +struct ifdevmtu { + int ifdm_current; + int ifdm_min; + int ifdm_max; }; /* @@ -186,9 +244,10 @@ struct ifma_msghdr { * definitions which begin with ifr_name. The * remainder may be interface specific. */ -#define IF_NAMESIZE IFNAMSIZ struct ifreq { -#define IFNAMSIZ 16 +#ifndef IFNAMSIZ +#define IFNAMSIZ IF_NAMESIZE +#endif char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ union { struct sockaddr ifru_addr; @@ -199,7 +258,12 @@ struct ifreq { int ifru_mtu; int ifru_phys; int ifru_media; + int ifru_intval; caddr_t ifru_data; +#ifdef KERNEL_PRIVATE + u_int64_t ifru_data64; /* 64-bit ifru_data */ +#endif KERNEL_PRIVATE + struct ifdevmtu ifru_devmtu; } ifr_ifru; #define ifr_addr ifr_ifru.ifru_addr /* address */ #define ifr_dstaddr ifr_ifru.ifru_dstaddr /* other end of p-to-p link */ @@ -215,6 +279,11 @@ struct ifreq { #define ifr_phys ifr_ifru.ifru_phys /* physical wire */ #define ifr_media ifr_ifru.ifru_media /* physical media */ #define ifr_data ifr_ifru.ifru_data /* for use by interface */ +#define ifr_devmtu ifr_ifru.ifru_devmtu +#define ifr_intval ifr_ifru.ifru_intval /* integer value */ +#ifdef KERNEL_PRIVATE +#define ifr_data64 ifr_ifru.ifru_data64 /* 64-bit pointer */ +#endif KERNEL_PRIVATE }; #define _SIZEOF_ADDR_IFREQ(ifr) \ @@ -244,6 +313,25 @@ struct ifmediareq { int *ifm_ulist; /* media words */ }; +#ifdef KERNEL_PRIVATE +/* LP64 version of ifmediareq. all pointers + * grow when we're dealing with a 64-bit process. + * WARNING - keep in sync with ifmediareq + */ +struct ifmediareq64 { + char ifm_name[IFNAMSIZ]; /* if name, e.g. 
"en0" */ + int ifm_current; /* current media options */ + int ifm_mask; /* don't care mask */ + int ifm_status; /* media status */ + int ifm_active; /* active options */ + int ifm_count; /* # entries in ifm_ulist array */ + union { /* media words */ + int * ifmu_ulist32; /* 32-bit pointer */ + u_int64_t ifmu_ulist64; /* 64-bit pointer */ + } ifm_ifmu; +}; +#endif // KERNEL_PRIVATE + /* * Structure used to retrieve aux status data from interfaces. * Kernel suppliers to this interface should respect the formatting @@ -273,30 +361,40 @@ struct ifconf { #define ifc_req ifc_ifcu.ifcu_req /* array of structures returned */ }; -#ifdef __APPLE__ -#ifdef __APPLE_API_UNSTABLE +#ifdef KERNEL_PRIVATE +/* LP64 version of ifconf. all pointers + * grow when we're dealing with a 64-bit process. + * WARNING - keep in sync with ifconf + */ +struct ifconf64 { + int ifc_len; /* size of associated buffer */ + union { + struct ifreq * ifcu_req; + u_int64_t ifcu_req64; + } ifc_ifcu; +}; +#define ifc_req64 ifc_ifcu.ifcu_req64 +#endif // KERNEL_PRIVATE + /* * DLIL KEV_DL_PROTO_ATTACHED/DETACHED structure */ struct kev_dl_proto_data { struct net_event_data link_data; - u_long proto_family; - u_long proto_remaining_count; + unsigned long proto_family; + unsigned long proto_remaining_count; }; -#endif /* __APPLE_API_UNSTABLE */ -#endif - /* * Structure for SIOC[AGD]LIFADDR */ struct if_laddrreq { - char iflr_name[IFNAMSIZ]; - u_int flags; + char iflr_name[IFNAMSIZ]; + unsigned int flags; #define IFLR_PREFIX 0x8000 /* in: prefix given out: kernel fills id */ - u_int prefixlen; /* in/out */ - struct sockaddr_storage addr; /* in/out */ - struct sockaddr_storage dstaddr; /* out */ + unsigned int prefixlen; /* in/out */ + struct sockaddr_storage addr; /* in/out */ + struct sockaddr_storage dstaddr; /* out */ }; #ifdef KERNEL @@ -305,28 +403,24 @@ MALLOC_DECLARE(M_IFADDR); MALLOC_DECLARE(M_IFMADDR); #endif #endif +#endif /* _POSIX_C_SOURCE */ #ifndef KERNEL struct if_nameindex { - u_int if_index; /* 1, 2, ... */ - char *if_name; /* null terminated name: "le0", ... */ + unsigned int if_index; /* 1, 2, ... */ + char *if_name; /* null terminated name: "le0", ... */ }; __BEGIN_DECLS -u_int if_nametoindex __P((const char *)); -char *if_indextoname __P((u_int, char *)); -struct if_nameindex *if_nameindex __P((void)); -void if_freenameindex __P((struct if_nameindex *)); +unsigned int if_nametoindex(const char *); +char *if_indextoname(unsigned int, char *); +struct if_nameindex *if_nameindex(void); +void if_freenameindex(struct if_nameindex *); __END_DECLS #endif #ifdef KERNEL -#ifndef __APPLE__ -struct proc; - -int prison_if __P((struct proc *p, struct sockaddr *sa)); -#endif - +#include <net/kpi_interface.h> #endif #endif /* !_NET_IF_H_ */ diff --git a/bsd/net/if_arp.h b/bsd/net/if_arp.h index 0a752d965..24caabe63 100644 --- a/bsd/net/if_arp.h +++ b/bsd/net/if_arp.h @@ -114,58 +114,4 @@ struct arpreq { #define ATF_PUBL 0x08 /* publish entry (respond for other host) */ #define ATF_USETRAILERS 0x10 /* has requested trailers */ -#ifdef __APPLE_API_UNSTABLE - -#ifdef __APPLE__ -/* - * Ethernet multicast address structure. There is one of these for each - * multicast address or range of multicast addresses that we are supposed - * to listen to on a particular interface. They are kept in a linked list, - * rooted in the interface's arpcom structure. (This really has nothing to - * do with ARP, or with the Internet address family, but this appears to be - * the minimally-disrupting place to put it.) 
- */ -struct ether_multi { - u_char enm_addrlo[6]; /* low or only address of range */ - u_char enm_addrhi[6]; /* high or only address of range */ - struct arpcom *enm_ac; /* back pointer to arpcom */ - u_int enm_refcount; /* no. claims to this addr/range */ - struct ether_multi *enm_next; /* ptr to next ether_multi */ -}; - -/* - * Structure used by macros below to remember position when stepping through - * all of the ether_multi records. - */ -struct ether_multistep { - struct ether_multi *e_enm; -}; -#endif /* __APPLE__ */ - -#ifdef KERNEL -/* - * Structure shared between the ethernet driver modules and - * the address resolution code. For example, each ec_softc or il_softc - * begins with this structure. - */ -struct arpcom { - /* - * The ifnet struct _must_ be at the head of this structure. - */ - struct ifnet ac_if; /* network-visible interface */ - u_char ac_enaddr[6]; /* ethernet hardware address */ -#ifdef __APPLE__ - struct in_addr ac_ipaddr; /* copy of ip address- XXX */ - struct ether_multi *ac_multiaddrs; /* list of ether multicast addrs */ -#endif - int ac_multicnt; /* length of ac_multiaddrs list */ -#ifndef __APPLE__ - void *ac_netgraph; /* ng_ether(4) netgraph node info */ -#endif -}; - - -#endif -#endif /* __APPLE_API_UNSTABLE */ - #endif /* !_NET_IF_ARP_H_ */ diff --git a/bsd/net/if_atm.h b/bsd/net/if_atm.h index 9a0518ac8..e8fe0eceb 100644 --- a/bsd/net/if_atm.h +++ b/bsd/net/if_atm.h @@ -58,12 +58,13 @@ * net/if_atm.h */ +#ifdef KERNEL_PRIVATE #if defined(__NetBSD__) || defined(__OpenBSD__) || defined(__bsdi__) #define RTALLOC1(A,B) rtalloc1((A),(B)) #elif defined(__FreeBSD__) || defined(__APPLE__) #define RTALLOC1(A,B) rtalloc1((A),(B),0UL) #endif - +#endif /* KERNEL_PRIVATE */ /* * pseudo header for packet transmission @@ -119,13 +120,11 @@ struct atmllc { (X)->type[0] = ((V) & 0xff); \ } -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -void atm_ifattach __P((struct ifnet *)); -void atm_input __P((struct ifnet *, struct atm_pseudohdr *, - struct mbuf *, void *)); -int atm_output __P((struct ifnet *, struct mbuf *, struct sockaddr *, - struct rtentry *)); -#endif /* __APPLE_API_PRIVATE */ -#endif +#ifdef KERNEL_PRIVATE +void atm_ifattach(struct ifnet *); +void atm_input(struct ifnet *, struct atm_pseudohdr *, + struct mbuf *, void *); +int atm_output(struct ifnet *, struct mbuf *, struct sockaddr *, + struct rtentry *); +#endif /* KERNEL_PRIVATE */ diff --git a/bsd/net/if_bond.c b/bsd/net/if_bond.c new file mode 100644 index 000000000..5c7005dc4 --- /dev/null +++ b/bsd/net/if_bond.c @@ -0,0 +1,4485 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +/* + * if_bond.c + * - bond/failover interface + * - implements IEEE 802.3ad Link Aggregation + */ + +/* + * Modification History: + * + * April 29, 2004 Dieter Siegmund (dieter@apple.com) + * - created + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/queue.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/kern_event.h> + +#include <net/bpf.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <net/kpi_interface.h> +#include <net/if_arp.h> +#include <net/if_dl.h> +#include <net/if_ether.h> +#include <net/if_types.h> +#include <net/if_bond_var.h> +#include <net/ieee8023ad.h> +#include <net/lacp.h> +#include <net/dlil.h> +#include <sys/time.h> +#include <net/devtimer.h> +#include <net/if_vlan_var.h> + +#include <kern/locks.h> +#include <libkern/OSAtomic.h> + +#include <netinet/in.h> +#include <netinet/if_ether.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> + +#include <net/if_media.h> +#include <net/multicast_list.h> + +extern int dlil_input_packet(struct ifnet *, struct mbuf *, char *); + +static struct ether_addr slow_proto_multicast = { + IEEE8023AD_SLOW_PROTO_MULTICAST +}; + +#define BOND_MAXUNIT 128 +#define BONDNAME "bond" +#define M_BOND M_DEVBUF + +#define EA_FORMAT "%x:%x:%x:%x:%x:%x" +#define EA_CH(e, i) ((u_char)((u_char *)(e))[(i)]) +#define EA_LIST(ea) EA_CH(ea,0),EA_CH(ea,1),EA_CH(ea,2),EA_CH(ea,3),EA_CH(ea,4),EA_CH(ea,5) + +#define timestamp_printf printf + +/** + ** bond locks + **/ +static __inline__ lck_grp_t * +my_lck_grp_alloc_init(const char * grp_name) +{ + lck_grp_t * grp; + lck_grp_attr_t * grp_attrs; + + grp_attrs = lck_grp_attr_alloc_init(); + lck_grp_attr_setdefault(grp_attrs); + grp = lck_grp_alloc_init(grp_name, grp_attrs); + lck_grp_attr_free(grp_attrs); + return (grp); +} + +static __inline__ lck_mtx_t * +my_lck_mtx_alloc_init(lck_grp_t * lck_grp) +{ + lck_attr_t * lck_attrs; + lck_mtx_t * lck_mtx; + + lck_attrs = lck_attr_alloc_init(); + lck_attr_setdefault(lck_attrs); + lck_mtx = lck_mtx_alloc_init(lck_grp, lck_attrs); + lck_attr_free(lck_attrs); + return (lck_mtx); +} + +static lck_mtx_t * bond_lck_mtx; + +static __inline__ void +bond_lock_init(void) +{ + lck_grp_t * bond_lck_grp; + + bond_lck_grp = my_lck_grp_alloc_init("if_bond"); + bond_lck_mtx = my_lck_mtx_alloc_init(bond_lck_grp); +} + +static __inline__ void +bond_assert_lock_held(void) +{ + lck_mtx_assert(bond_lck_mtx, LCK_MTX_ASSERT_OWNED); + return; +} + +static __inline__ void +bond_assert_lock_not_held(void) +{ + lck_mtx_assert(bond_lck_mtx, LCK_MTX_ASSERT_NOTOWNED); + return; +} + +static __inline__ void +bond_lock(void) +{ + lck_mtx_lock(bond_lck_mtx); + return; +} + +static __inline__ void +bond_unlock(void) +{ + lck_mtx_unlock(bond_lck_mtx); + return; +} + +/** + ** bond structures, types + **/ + +struct LAG_info_s { + lacp_system li_system; + lacp_system_priority li_system_priority; + lacp_key li_key; +}; +typedef struct LAG_info_s LAG_info, * LAG_info_ref; + +struct bondport_s; +TAILQ_HEAD(port_list, bondport_s); +struct ifbond_s; +TAILQ_HEAD(ifbond_list, ifbond_s); +struct LAG_s; +TAILQ_HEAD(lag_list, LAG_s); + +typedef struct ifbond_s ifbond, * ifbond_ref; +typedef struct bondport_s bondport, * bondport_ref; + +struct LAG_s { + TAILQ_ENTRY(LAG_s) lag_list; + struct port_list lag_port_list; + short lag_port_count; + short 
lag_selected_port_count; + int lag_active_media; + LAG_info lag_info; +}; +typedef struct LAG_s LAG, * LAG_ref; + +typedef struct partner_state_s { + LAG_info ps_lag_info; + lacp_port ps_port; + lacp_port_priority ps_port_priority; + lacp_actor_partner_state ps_state; +} partner_state, * partner_state_ref; + +struct ifbond_s { + TAILQ_ENTRY(ifbond_s) ifb_bond_list; + int ifb_flags; + UInt32 ifb_retain_count; + char ifb_name[IFNAMSIZ]; + struct ifnet * ifb_ifp; + bpf_packet_func ifb_bpf_input; + bpf_packet_func ifb_bpf_output; + int ifb_altmtu; + struct port_list ifb_port_list; + short ifb_port_count; + struct lag_list ifb_lag_list; + lacp_key ifb_key; + short ifb_max_active; /* 0 == unlimited */ + LAG_ref ifb_active_lag; + struct ifmultiaddr * ifb_ifma_slow_proto; + bondport_ref * ifb_distributing_array; + int ifb_distributing_count; +}; + +struct media_info { + int mi_active; + int mi_status; +}; + +enum { + ReceiveState_none = 0, + ReceiveState_INITIALIZE = 1, + ReceiveState_PORT_DISABLED = 2, + ReceiveState_EXPIRED = 3, + ReceiveState_LACP_DISABLED = 4, + ReceiveState_DEFAULTED = 5, + ReceiveState_CURRENT = 6, +}; + +typedef u_char ReceiveState; + +enum { + SelectedState_UNSELECTED = IF_BOND_STATUS_SELECTED_STATE_UNSELECTED, + SelectedState_SELECTED = IF_BOND_STATUS_SELECTED_STATE_SELECTED, + SelectedState_STANDBY = IF_BOND_STATUS_SELECTED_STATE_STANDBY +}; +typedef u_char SelectedState; + +static __inline__ const char * +SelectedStateString(SelectedState s) +{ + static const char * names[] = { "UNSELECTED", "SELECTED", "STANDBY" }; + + if (s <= SelectedState_STANDBY) { + return (names[s]); + } + return ("<unknown>"); +} + +enum { + MuxState_none = 0, + MuxState_DETACHED = 1, + MuxState_WAITING = 2, + MuxState_ATTACHED = 3, + MuxState_COLLECTING_DISTRIBUTING = 4, +}; + +typedef u_char MuxState; + +struct bondport_s { + TAILQ_ENTRY(bondport_s) po_port_list; + ifbond_ref po_bond; + struct multicast_list po_multicast; + struct ifnet * po_ifp; + struct ether_addr po_saved_addr; + int po_enabled; + char po_name[IFNAMSIZ]; + struct ifdevmtu po_devmtu; + + /* LACP */ + TAILQ_ENTRY(bondport_s) po_lag_port_list; + devtimer_ref po_current_while_timer; + devtimer_ref po_periodic_timer; + devtimer_ref po_wait_while_timer; + devtimer_ref po_transmit_timer; + partner_state po_partner_state; + lacp_port_priority po_priority; + lacp_actor_partner_state po_actor_state; + u_char po_flags; + u_char po_periodic_interval; + u_char po_n_transmit; + ReceiveState po_receive_state; + MuxState po_mux_state; + SelectedState po_selected; + int32_t po_last_transmit_secs; + struct media_info po_media_info; + LAG_ref po_lag; +}; + +#define IFBF_PROMISC 0x1 /* promiscuous mode */ +#define IFBF_IF_DETACHING 0x2 /* interface is detaching */ +#define IFBF_LLADDR 0x4 /* specific link address requested */ +#define IFBF_CHANGE_IN_PROGRESS 0x8 /* interface add/remove in progress */ + +static int bond_get_status(ifbond_ref ifb, struct if_bond_req * ibr_p, + user_addr_t datap); + +static __inline__ int +ifbond_flags_promisc(ifbond_ref ifb) +{ + return ((ifb->ifb_flags & IFBF_PROMISC) != 0); +} + +static __inline__ void +ifbond_flags_set_promisc(ifbond_ref ifb) +{ + ifb->ifb_flags |= IFBF_PROMISC; + return; +} + +static __inline__ void +ifbond_flags_clear_promisc(ifbond_ref ifb) +{ + ifb->ifb_flags &= ~IFBF_PROMISC; + return; +} + +static __inline__ int +ifbond_flags_if_detaching(ifbond_ref ifb) +{ + return ((ifb->ifb_flags & IFBF_IF_DETACHING) != 0); +} + +static __inline__ void +ifbond_flags_set_if_detaching(ifbond_ref ifb) 
+{ + ifb->ifb_flags |= IFBF_IF_DETACHING; + return; +} + +static __inline__ int +ifbond_flags_lladdr(ifbond_ref ifb) +{ + return ((ifb->ifb_flags & IFBF_LLADDR) != 0); +} + +static __inline__ void +ifbond_flags_set_lladdr(ifbond_ref ifb) +{ + ifb->ifb_flags |= IFBF_LLADDR; + return; +} + +static __inline__ void +ifbond_flags_clear_lladdr(ifbond_ref ifb) +{ + ifb->ifb_flags &= ~IFBF_LLADDR; + return; +} + +static __inline__ int +ifbond_flags_change_in_progress(ifbond_ref ifb) +{ + return ((ifb->ifb_flags & IFBF_CHANGE_IN_PROGRESS) != 0); +} + +static __inline__ void +ifbond_flags_set_change_in_progress(ifbond_ref ifb) +{ + ifb->ifb_flags |= IFBF_CHANGE_IN_PROGRESS; + return; +} + +static __inline__ void +ifbond_flags_clear_change_in_progress(ifbond_ref ifb) +{ + ifb->ifb_flags &= ~IFBF_CHANGE_IN_PROGRESS; + return; +} + +/* + * bondport_ref->po_flags bits + */ +#define BONDPORT_FLAGS_NTT 0x01 +#define BONDPORT_FLAGS_READY 0x02 +#define BONDPORT_FLAGS_SELECTED_CHANGED 0x04 +#define BONDPORT_FLAGS_MUX_ATTACHED 0x08 +#define BONDPORT_FLAGS_DISTRIBUTING 0x10 +#define BONDPORT_FLAGS_UNUSED2 0x20 +#define BONDPORT_FLAGS_UNUSED3 0x40 +#define BONDPORT_FLAGS_UNUSED4 0x80 + +static __inline__ void +bondport_flags_set_ntt(bondport_ref p) +{ + p->po_flags |= BONDPORT_FLAGS_NTT; + return; +} + +static __inline__ void +bondport_flags_clear_ntt(bondport_ref p) +{ + p->po_flags &= ~BONDPORT_FLAGS_NTT; + return; +} + +static __inline__ int +bondport_flags_ntt(bondport_ref p) +{ + return ((p->po_flags & BONDPORT_FLAGS_NTT) != 0); +} + +static __inline__ void +bondport_flags_set_ready(bondport_ref p) +{ + p->po_flags |= BONDPORT_FLAGS_READY; + return; +} + +static __inline__ void +bondport_flags_clear_ready(bondport_ref p) +{ + p->po_flags &= ~BONDPORT_FLAGS_READY; + return; +} + +static __inline__ int +bondport_flags_ready(bondport_ref p) +{ + return ((p->po_flags & BONDPORT_FLAGS_READY) != 0); +} + +static __inline__ void +bondport_flags_set_selected_changed(bondport_ref p) +{ + p->po_flags |= BONDPORT_FLAGS_SELECTED_CHANGED; + return; +} + +static __inline__ void +bondport_flags_clear_selected_changed(bondport_ref p) +{ + p->po_flags &= ~BONDPORT_FLAGS_SELECTED_CHANGED; + return; +} + +static __inline__ int +bondport_flags_selected_changed(bondport_ref p) +{ + return ((p->po_flags & BONDPORT_FLAGS_SELECTED_CHANGED) != 0); +} + +static __inline__ void +bondport_flags_set_mux_attached(bondport_ref p) +{ + p->po_flags |= BONDPORT_FLAGS_MUX_ATTACHED; + return; +} + +static __inline__ void +bondport_flags_clear_mux_attached(bondport_ref p) +{ + p->po_flags &= ~BONDPORT_FLAGS_MUX_ATTACHED; + return; +} + +static __inline__ int +bondport_flags_mux_attached(bondport_ref p) +{ + return ((p->po_flags & BONDPORT_FLAGS_MUX_ATTACHED) != 0); +} + +static __inline__ void +bondport_flags_set_distributing(bondport_ref p) +{ + p->po_flags |= BONDPORT_FLAGS_DISTRIBUTING; + return; +} + +static __inline__ void +bondport_flags_clear_distributing(bondport_ref p) +{ + p->po_flags &= ~BONDPORT_FLAGS_DISTRIBUTING; + return; +} + +static __inline__ int +bondport_flags_distributing(bondport_ref p) +{ + return ((p->po_flags & BONDPORT_FLAGS_DISTRIBUTING) != 0); +} + +typedef struct bond_globals_s { + struct ifbond_list ifbond_list; + lacp_system system; + lacp_system_priority system_priority; + int verbose; +} * bond_globals_ref; + +static bond_globals_ref g_bond; + +/** + ** packet_buffer routines + ** - thin wrapper for mbuf + **/ + +typedef struct mbuf * packet_buffer_ref; + +static packet_buffer_ref +packet_buffer_allocate(int 
length) +{ + packet_buffer_ref m; + int size; + + /* leave room for ethernet header */ + size = length + sizeof(struct ether_header); + if (size > (int)MHLEN) { + /* XXX doesn't handle large payloads */ + printf("bond: packet_buffer_allocate size %d > max %d\n", size, MHLEN); + return (NULL); + } + m = m_gethdr(M_WAITOK, MT_DATA); + if (m == NULL) { + return (NULL); + } + m->m_len = size; + m->m_pkthdr.len = size; + return (m); +} + +static void * +packet_buffer_byteptr(packet_buffer_ref buf) +{ + return (buf->m_data + sizeof(struct ether_header)); +} + +typedef enum { + LAEventStart, + LAEventTimeout, + LAEventPacket, + LAEventMediaChange, + LAEventSelectedChange, + LAEventPortMoved, + LAEventReady +} LAEvent; + +/** + ** Receive machine + **/ +static void +bondport_receive_machine(bondport_ref p, LAEvent event, + void * event_data); +/** + ** Periodic Transmission machine + **/ +static void +bondport_periodic_transmit_machine(bondport_ref p, LAEvent event, + void * event_data); + +/** + ** Transmit machine + **/ +static void +bondport_transmit_machine(bondport_ref p, LAEvent event, + void * event_data); + +/** + ** Mux machine + **/ +static void +bondport_mux_machine(bondport_ref p, LAEvent event, + void * event_data); + +/** + ** bond, LAG + **/ +static void +ifbond_activate_LAG(ifbond_ref bond, LAG_ref lag, int active_media); + +static void +ifbond_deactivate_LAG(ifbond_ref bond, LAG_ref lag); + +static int +ifbond_all_ports_ready(ifbond_ref bond); + +static LAG_ref +ifbond_find_best_LAG(ifbond_ref bond, int * active_media); + +static int +LAG_get_aggregatable_port_count(LAG_ref lag, int * active_media); + +static int +ifbond_selection(ifbond_ref bond); + + +/** + ** bondport + **/ + +static void +bondport_receive_lacpdu(bondport_ref p, lacpdu_ref in_lacpdu_p); + +static void +bondport_slow_proto_transmit(bondport_ref p, packet_buffer_ref buf); + +static bondport_ref +bondport_create(struct ifnet * port_ifp, lacp_port_priority priority, + int active, int short_timeout, int * error); +static void +bondport_start(bondport_ref p); + +static void +bondport_free(bondport_ref p); + +static int +bondport_aggregatable(bondport_ref p); + +static int +bondport_remove_from_LAG(bondport_ref p); + +static void +bondport_set_selected(bondport_ref p, SelectedState s); + +static int +bondport_matches_LAG(bondport_ref p, LAG_ref lag); + +static void +bondport_link_status_changed(bondport_ref p); + +static void +bondport_enable_distributing(bondport_ref p); + +static void +bondport_disable_distributing(bondport_ref p); + +static __inline__ int +bondport_collecting(bondport_ref p) +{ + return (lacp_actor_partner_state_collecting(p->po_actor_state)); +} + +/** + ** bond interface/dlil specific routines + **/ +static int bond_clone_create(struct if_clone *, int); +static void bond_clone_destroy(struct ifnet *); +static int bond_input(struct mbuf *m, char *frame_header, struct ifnet *ifp, + u_long protocol_family, int sync_ok); +static int bond_output(struct ifnet *ifp, struct mbuf *m); +static int bond_ioctl(struct ifnet *ifp, u_int32_t cmd, void * addr); +static int bond_set_bpf_tap(struct ifnet * ifp, bpf_tap_mode mode, + bpf_packet_func func); +static int bond_attach_protocol(struct ifnet *ifp); +static int bond_detach_protocol(struct ifnet *ifp); +static int bond_setmulti(struct ifnet *ifp); +static int bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp); +static int bond_remove_interface(ifbond_ref ifb, struct ifnet * port_ifp); +static void bond_if_free(struct ifnet * ifp); + +static 
struct if_clone bond_cloner = IF_CLONE_INITIALIZER(BONDNAME, + bond_clone_create, + bond_clone_destroy, + 0, + BOND_MAXUNIT); +static void interface_link_event(struct ifnet * ifp, u_long event_code); + +static int +siocsifmtu(struct ifnet * ifp, int mtu) +{ + struct ifreq ifr; + + bzero(&ifr, sizeof(ifr)); + ifr.ifr_mtu = mtu; + return (dlil_ioctl(0, ifp, SIOCSIFMTU, (caddr_t)&ifr)); +} + +static int +siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p) +{ + struct ifreq ifr; + int error; + + bzero(&ifr, sizeof(ifr)); + error = dlil_ioctl(0, ifp, SIOCGIFDEVMTU, (caddr_t)&ifr); + if (error == 0) { + *ifdm_p = ifr.ifr_devmtu; + } + return (error); +} + +static __inline__ void +ether_addr_copy(void * dest, const void * source) +{ + bcopy(source, dest, ETHER_ADDR_LEN); + return; +} + +static __inline__ void +ifbond_retain(ifbond_ref ifb) +{ + OSIncrementAtomic(&ifb->ifb_retain_count); +} + +static __inline__ void +ifbond_release(ifbond_ref ifb) +{ + UInt32 old_retain_count; + + old_retain_count = OSDecrementAtomic(&ifb->ifb_retain_count); + switch (old_retain_count) { + case 0: + panic("ifbond_release: retain count is 0\n"); + break; + case 1: + if (g_bond->verbose) { + printf("ifbond_release(%s)\n", ifb->ifb_name); + } + if (ifb->ifb_ifma_slow_proto != NULL) { + if (g_bond->verbose) { + printf("ifbond_release(%s) removing multicast\n", + ifb->ifb_name); + } + (void)if_delmultiaddr(ifb->ifb_ifma_slow_proto, 0); + ifma_release(ifb->ifb_ifma_slow_proto); + } + if (ifb->ifb_distributing_array != NULL) { + FREE(ifb->ifb_distributing_array, M_BOND); + } + FREE(ifb, M_BOND); + break; + default: + break; + } + return; +} + +/* + * Function: ifbond_wait + * Purpose: + * Allows a single thread to gain exclusive access to the ifbond + * data structure. Some operations take a long time to complete, + * and some have side-effects that we can't predict. Holding the + * bond_lock() across such operations is not possible. + * + * For example: + * 1) The SIOCSIFLLADDR ioctl takes a long time (several seconds) to + * complete. Simply holding the bond_lock() would freeze all other + * data structure accesses during that time. + * 2) When we attach our protocol to the interface, a dlil event is + * generated and invokes our bond_event() function. bond_event() + * needs to take the bond_lock(), but we're already holding it, so + * we're deadlocked against ourselves. + * Notes: + * Before calling, you must be holding the bond_lock and have taken + * a reference on the ifbond_ref. + */ +static void +ifbond_wait(ifbond_ref ifb, const char * msg) +{ + int waited = 0; + + /* other add/remove in progress */ + while (ifbond_flags_change_in_progress(ifb)) { + if (g_bond->verbose) { + printf("%s: %s msleep\n", ifb->ifb_name, msg); + } + waited = 1; + (void)msleep(ifb, bond_lck_mtx, PZERO, msg, 0); + } + /* prevent other bond list remove/add from taking place */ + ifbond_flags_set_change_in_progress(ifb); + if (g_bond->verbose && waited) { + printf("%s: %s woke up\n", ifb->ifb_name, msg); + } + return; +} + +/* + * Function: ifbond_signal + * Purpose: + * Allows the thread that previously invoked ifbond_wait() to + * give up exclusive access to the ifbond data structure, and wake up + * any other threads waiting to access + * Notes: + * Before calling, you must be holding the bond_lock and have taken + * a reference on the ifbond_ref. 
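
Together with ifbond_signal() immediately below, ifbond_wait() implements a sleep-based exclusive-access token layered over bond_lock(): a thread claims the token, drops the mutex for the slow work, then wakes the next waiter. The canonical caller sequence, condensed from bond_setmulti() and bond_add_interface() later in this file (a sketch only; the middle section stands for any work that cannot hold bond_lock()):

	bond_lock();
	ifbond_retain(ifb);		/* keep ifb alive while we sleep */
	ifbond_wait(ifb, "example");	/* claim exclusive access */
	bond_unlock();

	/* ... long-running work, e.g. SIOCSIFLLADDR on every port ... */

	bond_lock();
	ifbond_release(ifb);
	ifbond_signal(ifb, "example");	/* wake the next waiter */
	bond_unlock();
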
+ */ +static void +ifbond_signal(ifbond_ref ifb, const char * msg) +{ + ifbond_flags_clear_change_in_progress(ifb); + wakeup((caddr_t)ifb); + if (g_bond->verbose) { + printf("%s: %s wakeup\n", ifb->ifb_name, msg); + } + return; +} + +/** + ** Media information + **/ + +static int +link_speed(int active) +{ + switch (IFM_SUBTYPE(active)) { + case IFM_10_T: + case IFM_10_2: + case IFM_10_5: + case IFM_10_STP: + case IFM_10_FL: + return (10); + case IFM_100_TX: + case IFM_100_FX: + case IFM_100_T4: + case IFM_100_VG: + case IFM_100_T2: + return (100); + case IFM_1000_SX: + case IFM_1000_LX: + case IFM_1000_CX: + case IFM_1000_TX: + return (1000); + case IFM_HPNA_1: + return (0); + default: + /* assume that new defined types are going to be at least 10GigE */ + case IFM_10G_SR: + case IFM_10G_LR: + return (10000); + } +} + +static __inline__ int +media_active(const struct media_info * mi) +{ + if ((mi->mi_status & IFM_AVALID) == 0) { + return (1); + } + return ((mi->mi_status & IFM_ACTIVE) != 0); +} + +static __inline__ int +media_full_duplex(const struct media_info * mi) +{ + return ((mi->mi_active & IFM_FDX) != 0); +} + +static __inline__ int +media_speed(const struct media_info * mi) +{ + return (link_speed(mi->mi_active)); +} + +static struct media_info +interface_media_info(struct ifnet * ifp) +{ + struct ifmediareq ifmr; + struct media_info mi; + + bzero(&mi, sizeof(mi)); + bzero(&ifmr, sizeof(ifmr)); + if (dlil_ioctl(0, ifp, SIOCGIFMEDIA, (caddr_t)&ifmr) == 0) { + if (ifmr.ifm_count != 0) { + mi.mi_status = ifmr.ifm_status; + mi.mi_active = ifmr.ifm_active; + } + } + return (mi); +} + +/** + ** interface utility functions + **/ +static __inline__ struct ifaddr * +ifindex_get_ifaddr(int i) +{ + if (i > if_index || i == 0) { + return (NULL); + } + return (ifnet_addrs[i - 1]); +} + +static __inline__ struct ifaddr * +ifp_get_ifaddr(struct ifnet * ifp) +{ + return (ifindex_get_ifaddr(ifp->if_index)); +} + +static __inline__ struct sockaddr_dl * +ifp_get_sdl(struct ifnet * ifp) +{ + struct ifaddr * ifa; + + ifa = ifp_get_ifaddr(ifp); + return ((struct sockaddr_dl *)(ifa->ifa_addr)); +} + +static int +if_siflladdr(struct ifnet * ifp, const struct ether_addr * ea_p) +{ + struct ifreq ifr; + + /* + * XXX setting the sa_len to ETHER_ADDR_LEN is wrong, but the driver + * currently expects it that way + */ + ifr.ifr_addr.sa_family = AF_UNSPEC; + ifr.ifr_addr.sa_len = ETHER_ADDR_LEN; + ether_addr_copy(ifr.ifr_addr.sa_data, ea_p); +#if 0 + snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s%d", ifp->if_name, + ifp->if_unit); +#endif /* 0 */ + return (dlil_ioctl(0, ifp, SIOCSIFLLADDR, (caddr_t)&ifr)); +} + +/** + ** bond_globals + **/ +static bond_globals_ref +bond_globals_create(lacp_system_priority sys_pri, + lacp_system_ref sys) +{ + bond_globals_ref b; + + b = _MALLOC(sizeof(*b), M_BOND, M_WAITOK); + if (b == NULL) { + return (NULL); + } + bzero(b, sizeof(*b)); + TAILQ_INIT(&b->ifbond_list); + b->system = *sys; + b->system_priority = sys_pri; +#if 0 + b->verbose = 1; +#endif /* 0 */ + return (b); +} + +static int +bond_globals_init(void) +{ + bond_globals_ref b; + int i; + struct ifnet * ifp; + + bond_assert_lock_not_held(); + + if (g_bond != NULL) { + return (0); + } + + /* + * use en0's ethernet address as the system identifier, and if it's not + * there, use en1 .. 
en3 + */ + ifp = NULL; + for (i = 0; i < 4; i++) { + char ifname[IFNAMSIZ+1]; + snprintf(ifname, sizeof(ifname), "en%d", i); + /* XXX ifunit() needs to return a reference on the ifp */ + ifp = ifunit(ifname); + if (ifp != NULL) { + break; + } + } + b = NULL; + if (ifp != NULL) { + b = bond_globals_create(0x8000, + (lacp_system_ref)LLADDR(ifp_get_sdl(ifp))); + } + bond_lock(); + if (g_bond != NULL) { + bond_unlock(); + _FREE(b, M_BOND); + return (0); + } + g_bond = b; + bond_unlock(); + if (ifp == NULL) { + return (ENXIO); + } + if (b == NULL) { + return (ENOMEM); + } + return (0); +} + +static void +bond_bpf_vlan(struct ifnet * ifp, struct mbuf * m, + const struct ether_header * eh_p, + u_int16_t vlan_tag, bpf_packet_func func) +{ + struct ether_vlan_header * vlh_p; + struct mbuf * vl_m; + + vl_m = m_get(M_DONTWAIT, MT_DATA); + if (vl_m == NULL) { + return; + } + /* populate a new mbuf containing the vlan ethernet header */ + vl_m->m_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; + vlh_p = mtod(vl_m, struct ether_vlan_header *); + bcopy(eh_p, vlh_p, offsetof(struct ether_header, ether_type)); + vlh_p->evl_encap_proto = htons(ETHERTYPE_VLAN); + vlh_p->evl_tag = htons(vlan_tag); + vlh_p->evl_proto = eh_p->ether_type; + vl_m->m_next = m; + (*func)(ifp, vl_m); + vl_m->m_next = NULL; + m_free(vl_m); + return; +} + +static __inline__ void +bond_bpf_output(struct ifnet * ifp, struct mbuf * m, + bpf_packet_func func) +{ + if (func != NULL) { + if (m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) { + const struct ether_header * eh_p; + eh_p = mtod(m, const struct ether_header *); + m->m_data += ETHER_HDR_LEN; + m->m_len -= ETHER_HDR_LEN; + bond_bpf_vlan(ifp, m, eh_p, m->m_pkthdr.vlan_tag, func); + m->m_data -= ETHER_HDR_LEN; + m->m_len += ETHER_HDR_LEN; + } else { + (*func)(ifp, m); + } + } + return; +} + +static __inline__ void +bond_bpf_input(ifnet_t ifp, mbuf_t m, const struct ether_header * eh_p, + bpf_packet_func func) +{ + if (func != NULL) { + if (m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) { + bond_bpf_vlan(ifp, m, eh_p, m->m_pkthdr.vlan_tag, func); + } else { + /* restore the header */ + m->m_data -= ETHER_HDR_LEN; + m->m_len += ETHER_HDR_LEN; + (*func)(ifp, m); + m->m_data += ETHER_HDR_LEN; + m->m_len -= ETHER_HDR_LEN; + } + } + return; +} + +/* + * Function: bond_setmulti + * Purpose: + * Enable multicast reception on "our" interface by enabling multicasts on + * each of the member ports. 
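
bond_bpf_vlan() above rebuilds the on-wire form of a hardware-tagged frame before handing it to BPF, because the 802.1Q tag travels out-of-band in m_pkthdr.vlan_tag rather than in the mbuf data. For reference, the header it populates is the standard BSD struct ether_vlan_header (reproduced here for orientation; it is not part of this patch):

struct ether_vlan_header {
	u_char		evl_dhost[ETHER_ADDR_LEN];	/* destination MAC */
	u_char		evl_shost[ETHER_ADDR_LEN];	/* source MAC */
	u_int16_t	evl_encap_proto;	/* ETHERTYPE_VLAN (0x8100) */
	u_int16_t	evl_tag;		/* priority + 12-bit VLAN id */
	u_int16_t	evl_proto;		/* encapsulated ether_type */
};
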
+ */ +static int +bond_setmulti(struct ifnet * ifp) +{ + ifbond_ref ifb; + int error; + int result = 0; + bondport_ref p; + + bond_lock(); + ifb = ifp->if_private; + if (ifb == NULL || ifbond_flags_if_detaching(ifb) + || TAILQ_EMPTY(&ifb->ifb_port_list)) { + bond_unlock(); + return (0); + } + ifbond_retain(ifb); + ifbond_wait(ifb, "bond_setmulti"); + + if (ifbond_flags_if_detaching(ifb)) { + /* someone destroyed the bond while we were waiting */ + result = EBUSY; + goto signal_done; + } + bond_unlock(); + + /* ifbond_wait() lets us safely walk the list without holding the lock */ + TAILQ_FOREACH(p, &ifb->ifb_port_list, po_port_list) { + struct ifnet * port_ifp = p->po_ifp; + + error = multicast_list_program(&p->po_multicast, + ifp, port_ifp); + if (error != 0) { + printf("bond_setmulti(%s): " + "multicast_list_program(%s%d) failed, %d\n", + ifb->ifb_name, port_ifp->if_name, + port_ifp->if_unit, error); + result = error; + } + } + bond_lock(); + signal_done: + ifbond_release(ifb); + ifbond_signal(ifb, "bond_setmulti"); + bond_unlock(); + return (result); +} + +static void +bond_clone_attach(void) +{ + if_clone_attach(&bond_cloner); + bond_lock_init(); + return; +} + +static int +ifbond_add_slow_proto_multicast(ifbond_ref ifb) +{ + int error; + struct ifmultiaddr * ifma = NULL; + struct sockaddr_dl sdl; + + bond_assert_lock_not_held(); + + bzero(&sdl, sizeof(sdl)); + sdl.sdl_len = sizeof(sdl); + sdl.sdl_family = AF_LINK; + sdl.sdl_type = IFT_ETHER; + sdl.sdl_nlen = 0; + sdl.sdl_alen = sizeof(slow_proto_multicast); + bcopy(&slow_proto_multicast, sdl.sdl_data, sizeof(slow_proto_multicast)); + error = if_addmulti(ifb->ifb_ifp, (struct sockaddr *)&sdl, + &ifma); + if (error == 0) { + ifb->ifb_ifma_slow_proto = ifma; + } + return (error); +} + +static int +bond_clone_create(struct if_clone * ifc, int unit) +{ + int error; + ifbond_ref ifb; + struct ifnet * ifp; + + error = bond_globals_init(); + if (error != 0) { + return (error); + } + + ifb = _MALLOC(sizeof(ifbond), M_BOND, M_WAITOK); + if (ifb == NULL) { + return (ENOMEM); + } + bzero(ifb, sizeof(*ifb)); + + ifbond_retain(ifb); + TAILQ_INIT(&ifb->ifb_port_list); + TAILQ_INIT(&ifb->ifb_lag_list); + ifb->ifb_key = unit + 1; + + /* use the interface name as the unique id for ifp recycle */ + if ((u_long)snprintf(ifb->ifb_name, sizeof(ifb->ifb_name), "%s%d", + ifc->ifc_name, unit) >= sizeof(ifb->ifb_name)) { + ifbond_release(ifb); + return (EINVAL); + } + error = dlil_if_acquire(APPLE_IF_FAM_BOND, + ifb->ifb_name, + strlen(ifb->ifb_name), + &ifp); + if (error) { + ifbond_release(ifb); + return (error); + } + ifb->ifb_ifp = ifp; + ifp->if_name = ifc->ifc_name; + ifp->if_unit = unit; + ifp->if_family = APPLE_IF_FAM_BOND; + ifp->if_private = NULL; + ifp->if_ioctl = bond_ioctl; + ifp->if_set_bpf_tap = bond_set_bpf_tap; + ifp->if_free = bond_if_free; + ifp->if_output = bond_output; + ifp->if_hwassist = 0; + ifp->if_addrlen = ETHER_ADDR_LEN; + ifp->if_baudrate = 0; + ifp->if_type = IFT_IEEE8023ADLAG; + ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX; + ifp->if_mtu = 0; + + /* XXX ethernet specific */ + ifp->if_broadcast.length = ETHER_ADDR_LEN; + bcopy(etherbroadcastaddr, ifp->if_broadcast.u.buffer, ETHER_ADDR_LEN); + + error = dlil_if_attach(ifp); + if (error != 0) { + dlil_if_release(ifp); + ifbond_release(ifb); + return (error); + } + error = ifbond_add_slow_proto_multicast(ifb); + if (error != 0) { + printf("bond_clone_create(%s): " + "failed to add slow_proto multicast, %d\n", + ifb->ifb_name, error); + } + + /* attach as ethernet */ + 
bpfattach(ifp, DLT_EN10MB, sizeof(struct ether_header)); + + bond_lock(); + ifp->if_private = ifb; + TAILQ_INSERT_HEAD(&g_bond->ifbond_list, ifb, ifb_bond_list); + bond_unlock(); + + return (0); +} + +static void +bond_remove_all_interfaces(ifbond_ref ifb) +{ + bondport_ref p; + + bond_assert_lock_held(); + + /* + * do this in reverse order to avoid re-programming the mac address + * as each head interface is removed + */ + while ((p = TAILQ_LAST(&ifb->ifb_port_list, port_list)) != NULL) { + bond_remove_interface(ifb, p->po_ifp); + } + return; +} + +static void +bond_remove(ifbond_ref ifb) +{ + bond_assert_lock_held(); + ifbond_flags_set_if_detaching(ifb); + TAILQ_REMOVE(&g_bond->ifbond_list, ifb, ifb_bond_list); + bond_remove_all_interfaces(ifb); + return; +} + +static void +bond_if_detach(struct ifnet * ifp) +{ + int error; + + error = dlil_if_detach(ifp); + if (error != DLIL_WAIT_FOR_FREE) { + if (error) { + printf("bond_if_detach %s%d: dlil_if_detach failed, %d\n", + ifp->if_name, ifp->if_unit, error); + } + bond_if_free(ifp); + } + return; +} + +static void +bond_clone_destroy(struct ifnet * ifp) +{ + ifbond_ref ifb; + + bond_lock(); + ifb = ifp->if_private; + if (ifb == NULL || ifp->if_type != IFT_IEEE8023ADLAG) { + bond_unlock(); + return; + } + if (ifbond_flags_if_detaching(ifb)) { + bond_unlock(); + return; + } + bond_remove(ifb); + bond_unlock(); + bond_if_detach(ifp); + return; +} + +static int +bond_set_bpf_tap(struct ifnet * ifp, bpf_tap_mode mode, bpf_packet_func func) +{ + ifbond_ref ifb; + + bond_lock(); + ifb = ifp->if_private; + if (ifb == NULL || ifbond_flags_if_detaching(ifb)) { + bond_unlock(); + return (ENODEV); + } + switch (mode) { + case BPF_TAP_DISABLE: + ifb->ifb_bpf_input = ifb->ifb_bpf_output = NULL; + break; + + case BPF_TAP_INPUT: + ifb->ifb_bpf_input = func; + break; + + case BPF_TAP_OUTPUT: + ifb->ifb_bpf_output = func; + break; + + case BPF_TAP_INPUT_OUTPUT: + ifb->ifb_bpf_input = ifb->ifb_bpf_output = func; + break; + default: + break; + } + bond_unlock(); + return 0; +} + +static uint32_t +ether_header_hash(struct ether_header * eh_p) +{ + uint32_t h; + + /* get 32-bits from destination ether and ether type */ + h = (*((uint16_t *)&eh_p->ether_dhost[4]) << 16) + | eh_p->ether_type; + h ^= *((uint32_t *)&eh_p->ether_dhost[0]); + return (h); +} + +static struct mbuf * +S_mbuf_skip_to_offset(struct mbuf * m, long * offset) +{ + int len; + + len = m->m_len; + while (*offset >= len) { + *offset -= len; + m = m->m_next; + if (m == NULL) { + break; + } + len = m->m_len; + } + return (m); +} + +#if BYTE_ORDER == BIG_ENDIAN +static __inline__ uint32_t +make_uint32(u_char c0, u_char c1, u_char c2, u_char c3) +{ + return (((uint32_t)c0 << 24) | ((uint32_t)c1 << 16) + | ((uint32_t)c2 << 8) | (uint32_t)c3); +} +#else /* BYTE_ORDER == LITTLE_ENDIAN */ +static __inline__ uint32_t +make_uint32(u_char c0, u_char c1, u_char c2, u_char c3) +{ + return (((uint32_t)c3 << 24) | ((uint32_t)c2 << 16) + | ((uint32_t)c1 << 8) | (uint32_t)c0); +} +#endif /* BYTE_ORDER == LITTLE_ENDIAN */ + +static int +S_mbuf_copy_uint32(struct mbuf * m, long offset, uint32_t * val) +{ + struct mbuf * current; + u_char * current_data; + struct mbuf * next; + u_char * next_data; + int space_current; + + current = S_mbuf_skip_to_offset(m, &offset); + if (current == NULL) { + return (1); + } + current_data = mtod(current, u_char *) + offset; + space_current = current->m_len - offset; + if (space_current >= (int)sizeof(uint32_t)) { + *val = *((uint32_t *)current_data); + return (0); + } + next = 
current->m_next; + if (next == NULL || (next->m_len + space_current) < (int)sizeof(uint32_t)) { + return (1); + } + next_data = mtod(next, u_char *); + switch (space_current) { + case 1: + *val = make_uint32(current_data[0], next_data[0], + next_data[1], next_data[2]); + break; + case 2: + *val = make_uint32(current_data[0], current_data[1], + next_data[0], next_data[1]); + break; + default: + *val = make_uint32(current_data[0], current_data[1], + current_data[2], next_data[0]); + break; + } + return (0); +} + +#define IP_SRC_OFFSET (offsetof(struct ip, ip_src) - offsetof(struct ip, ip_p)) +#define IP_DST_OFFSET (offsetof(struct ip, ip_dst) - offsetof(struct ip, ip_p)) + +static uint32_t +ip_header_hash(struct mbuf * m) +{ + u_char * data; + struct in_addr ip_dst; + struct in_addr ip_src; + u_char ip_p; + long offset; + struct mbuf * orig_m = m; + + /* find the IP protocol field relative to the start of the packet */ + offset = offsetof(struct ip, ip_p) + sizeof(struct ether_header); + m = S_mbuf_skip_to_offset(m, &offset); + if (m == NULL || m->m_len < 1) { + goto bad_ip_packet; + } + data = mtod(m, u_char *) + offset; + ip_p = *data; + + /* find the IP src relative to the IP protocol */ + if ((m->m_len - offset) + >= (int)(IP_SRC_OFFSET + sizeof(struct in_addr) * 2)) { + /* this should be the normal case */ + ip_src = *(struct in_addr *)(data + IP_SRC_OFFSET); + ip_dst = *(struct in_addr *)(data + IP_DST_OFFSET); + } + else { + if (S_mbuf_copy_uint32(m, offset + IP_SRC_OFFSET, + (uint32_t *)&ip_src.s_addr)) { + goto bad_ip_packet; + } + if (S_mbuf_copy_uint32(m, offset + IP_DST_OFFSET, + (uint32_t *)&ip_dst.s_addr)) { + goto bad_ip_packet; + } + } + return (ntohl(ip_dst.s_addr) ^ ntohl(ip_src.s_addr) ^ ((uint32_t)ip_p)); + + bad_ip_packet: + return (ether_header_hash(mtod(orig_m, struct ether_header *))); +} + +#define IP6_ADDRS_LEN (sizeof(struct in6_addr) * 2) +static uint32_t +ipv6_header_hash(struct mbuf * m) +{ + u_char * data; + int i; + long offset; + struct mbuf * orig_m = m; + uint32_t * scan; + uint32_t val; + + /* find the IP protocol field relative to the start of the packet */ + offset = offsetof(struct ip6_hdr, ip6_src) + sizeof(struct ether_header); + m = S_mbuf_skip_to_offset(m, &offset); + if (m == NULL) { + goto bad_ipv6_packet; + } + data = mtod(m, u_char *) + offset; + val = 0; + if ((m->m_len - offset) >= (int)IP6_ADDRS_LEN) { + /* this should be the normal case */ + for (i = 0, scan = (uint32_t *)data; + i < (int)(IP6_ADDRS_LEN / sizeof(uint32_t)); + i++, scan++) { + val ^= *scan; + } + } + else { + for (i = 0; i < (int)(IP6_ADDRS_LEN / sizeof(uint32_t)); i++) { + uint32_t tmp; + if (S_mbuf_copy_uint32(m, offset + i * sizeof(uint32_t), + (uint32_t *)&tmp)) { + goto bad_ipv6_packet; + } + val ^= tmp; + } + } + return (ntohl(val)); + + bad_ipv6_packet: + return (ether_header_hash(mtod(orig_m, struct ether_header *))); +} + +static int +bond_output(struct ifnet * ifp, struct mbuf * m) +{ + bpf_packet_func bpf_func; + uint32_t h; + ifbond_ref ifb; + struct ifnet * port_ifp = NULL; + + if (m == 0) { + return (0); + } + if ((m->m_flags & M_PKTHDR) == 0) { + m_freem(m); + return (0); + } + if (m->m_pkthdr.socket_id != 0) { + h = m->m_pkthdr.socket_id; + } + else { + struct ether_header * eh_p; + + eh_p = mtod(m, struct ether_header *); + switch (ntohs(eh_p->ether_type)) { + case ETHERTYPE_IP: + h = ip_header_hash(m); + break; + case ETHERTYPE_IPV6: + h = ipv6_header_hash(m); + break; + default: + h = ether_header_hash(eh_p); + break; + } + } + bond_lock(); + ifb = 
ifp->if_private; + if (ifb == NULL || ifbond_flags_if_detaching(ifb) + || ifb->ifb_distributing_count == 0) { + goto done; + } + h %= ifb->ifb_distributing_count; + port_ifp = ifb->ifb_distributing_array[h]->po_ifp; + bpf_func = ifb->ifb_bpf_output; + bond_unlock(); + + if (m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) { + (void)ifnet_stat_increment_out(ifp, 1, + m->m_pkthdr.len + ETHER_VLAN_ENCAP_LEN, + 0); + } else { + (void)ifnet_stat_increment_out(ifp, 1, m->m_pkthdr.len, 0); + } + bond_bpf_output(ifp, m, bpf_func); + + return (dlil_output(port_ifp, 0, m, NULL, NULL, 1)); + + done: + bond_unlock(); + m_freem(m); + return (0); +} + +static bondport_ref +ifbond_lookup_port(ifbond_ref ifb, struct ifnet * port_ifp) +{ + bondport_ref p; + TAILQ_FOREACH(p, &ifb->ifb_port_list, po_port_list) { + if (p->po_ifp == port_ifp) { + return (p); + } + } + return (NULL); +} + +static bondport_ref +bond_lookup_port(struct ifnet * port_ifp) +{ + ifbond_ref ifb; + bondport_ref port; + + TAILQ_FOREACH(ifb, &g_bond->ifbond_list, ifb_bond_list) { + port = ifbond_lookup_port(ifb, port_ifp); + if (port != NULL) { + return (port); + } + } + return (NULL); +} + +static void +bond_receive_lacpdu(struct mbuf * m, struct ifnet * port_ifp) +{ + struct ifnet * bond_ifp = NULL; + int event_code = 0; + bondport_ref p; + + bond_lock(); + if ((port_ifp->if_eflags & IFEF_BOND) == 0) { + goto done; + } + p = bond_lookup_port(port_ifp); + if (p == NULL) { + goto done; + } + if (p->po_enabled == 0) { + goto done; + } + bondport_receive_lacpdu(p, (lacpdu_ref)m->m_data); + if (ifbond_selection(p->po_bond)) { + event_code = (p->po_bond->ifb_active_lag == NULL) + ? KEV_DL_LINK_OFF + : KEV_DL_LINK_ON; + /* XXX need to take a reference on bond_ifp */ + bond_ifp = p->po_bond->ifb_ifp; + } + + done: + bond_unlock(); + if (bond_ifp != NULL) { + interface_link_event(bond_ifp, event_code); + } + m_freem(m); + return; +} + +static void +bond_receive_la_marker_pdu(struct mbuf * m, struct ifnet * port_ifp) +{ + la_marker_pdu_ref marker_p; + bondport_ref p; + + marker_p = (la_marker_pdu_ref)(m->m_data + ETHER_HDR_LEN); + if (marker_p->lm_marker_tlv_type != LA_MARKER_TLV_TYPE_MARKER) { + goto failed; + } + bond_lock(); + if ((port_ifp->if_eflags & IFEF_BOND) == 0) { + bond_unlock(); + goto failed; + } + p = bond_lookup_port(port_ifp); + if (p == NULL || p->po_enabled == 0) { + bond_unlock(); + goto failed; + } + /* echo back the same packet as a marker response */ + marker_p->lm_marker_tlv_type = LA_MARKER_TLV_TYPE_MARKER_RESPONSE; + bondport_slow_proto_transmit(p, (packet_buffer_ref)m); + bond_unlock(); + return; + + failed: + m_freem(m); + return; +} + +static int +bond_input(struct mbuf * m, char * frame_header, struct ifnet * port_ifp, + __unused u_long protocol_family, __unused int sync_ok) +{ + bpf_packet_func bpf_func; + const struct ether_header * eh_p; + ifbond_ref ifb; + struct ifnet * ifp; + bondport_ref p; + + eh_p = (const struct ether_header *)frame_header; + if ((m->m_flags & M_MCAST) != 0 + && bcmp(eh_p->ether_dhost, &slow_proto_multicast, + sizeof(eh_p->ether_dhost)) == 0 + && ntohs(eh_p->ether_type) == IEEE8023AD_SLOW_PROTO_ETHERTYPE) { + u_char subtype = *mtod(m, u_char *); + + if (subtype == IEEE8023AD_SLOW_PROTO_SUBTYPE_LACP) { + if (m->m_pkthdr.len < (int)offsetof(lacpdu, la_reserved)) { + m_freem(m); + return (0); + } + /* send to lacp */ + if (m->m_len < (int)offsetof(lacpdu, la_reserved)) { + m = m_pullup(m, offsetof(lacpdu, la_reserved)); + if (m == NULL) { + return (0); + } + } + bond_receive_lacpdu(m, port_ifp); 
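
The hash helpers above (ether_header_hash, ip_header_hash, ipv6_header_hash) reduce every outbound frame to a flow-stable 32-bit value, and bond_output() uses it to index the distributing array, so all frames of a given flow leave on the same port and per-flow ordering is preserved. The selection step, isolated with hypothetical values:

	/* Illustrative values only: equal hashes always pick the same port.
	 * Assume ifb is a bond with ifb_distributing_count == 3. */
	uint32_t h = 0x1234abcd;		/* e.g. from ip_header_hash(m) */
	h %= ifb->ifb_distributing_count;	/* 0x1234abcd % 3 == 2 */
	port_ifp = ifb->ifb_distributing_array[h]->po_ifp;
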
+ return (0); + } + else if (subtype == IEEE8023AD_SLOW_PROTO_SUBTYPE_LA_MARKER_PROTOCOL) { + int min_size; + + /* restore the ethernet header pointer in the mbuf */ + m->m_pkthdr.len += ETHER_HDR_LEN; + m->m_data -= ETHER_HDR_LEN; + m->m_len += ETHER_HDR_LEN; + min_size = ETHER_HDR_LEN + offsetof(la_marker_pdu, lm_reserved); + if (m->m_pkthdr.len < min_size) { + m_freem(m); + return (0); + } + /* send to lacp */ + if (m->m_len < min_size) { + m = m_pullup(m, min_size); + if (m == NULL) { + return (0); + } + } + /* send to marker responder */ + bond_receive_la_marker_pdu(m, port_ifp); + return (0); + } + else if (subtype == 0 + || subtype > IEEE8023AD_SLOW_PROTO_SUBTYPE_RESERVED_END) { + /* invalid subtype, discard the frame */ + m_freem(m); + return (0); + } + } + bond_lock(); + if ((port_ifp->if_eflags & IFEF_BOND) == 0) { + goto done; + } + p = bond_lookup_port(port_ifp); + if (p == NULL || bondport_collecting(p) == 0) { + goto done; + } + + /* make the packet appear as if it arrived on the bonded interface */ + ifb = p->po_bond; + ifp = ifb->ifb_ifp; + bpf_func = ifb->ifb_bpf_input; + bond_unlock(); + + if (m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) { + (void)ifnet_stat_increment_in(ifp, 1, + (m->m_pkthdr.len + ETHER_HDR_LEN + + ETHER_VLAN_ENCAP_LEN), 0); + } + else { + (void)ifnet_stat_increment_in(ifp, 1, + (m->m_pkthdr.len + ETHER_HDR_LEN), 0); + } + m->m_pkthdr.rcvif = ifp; + bond_bpf_input(ifp, m, eh_p, bpf_func); + dlil_input_packet(ifp, m, frame_header); + return 0; + + done: + bond_unlock(); + m_freem(m); + return (0); +} + +static __inline__ const char * +bondport_get_name(bondport_ref p) +{ + return (p->po_name); +} + +static __inline__ int +bondport_get_index(bondport_ref p) +{ + return (p->po_ifp->if_index); +} + +static void +bondport_slow_proto_transmit(bondport_ref p, packet_buffer_ref buf) +{ + struct ether_header * eh_p; + int error; + + /* packet_buffer_allocate leaves room for ethernet header */ + eh_p = mtod(buf, struct ether_header *); + bcopy(&slow_proto_multicast, &eh_p->ether_dhost, sizeof(eh_p->ether_dhost)); + bcopy(&p->po_saved_addr, eh_p->ether_shost, sizeof(eh_p->ether_shost)); + eh_p->ether_type = htons(IEEE8023AD_SLOW_PROTO_ETHERTYPE); + error = dlil_output(p->po_ifp, 0, buf, NULL, NULL, 1); + if (error != 0) { + printf("bondport_slow_proto_transmit(%s) failed %d\n", + bondport_get_name(p), error); + } + return; +} + +static void +bondport_timer_process_func(devtimer_ref timer, + devtimer_process_func_event event) +{ + bondport_ref p; + + switch (event) { + case devtimer_process_func_event_lock: + bond_lock(); + devtimer_retain(timer); + break; + case devtimer_process_func_event_unlock: + if (devtimer_valid(timer)) { + /* as long as the devtimer is valid, we can look at arg0 */ + int event_code = 0; + struct ifnet * bond_ifp = NULL; + + p = (bondport_ref)devtimer_arg0(timer); + if (ifbond_selection(p->po_bond)) { + event_code = (p->po_bond->ifb_active_lag == NULL) + ? 
KEV_DL_LINK_OFF + : KEV_DL_LINK_ON; + /* XXX need to take a reference on bond_ifp */ + bond_ifp = p->po_bond->ifb_ifp; + } + devtimer_release(timer); + bond_unlock(); + if (bond_ifp != NULL) { + interface_link_event(bond_ifp, event_code); + } + } + else { + /* timer is going away */ + devtimer_release(timer); + bond_unlock(); + } + break; + default: + break; + } +} + +static bondport_ref +bondport_create(struct ifnet * port_ifp, lacp_port_priority priority, + int active, int short_timeout, int * ret_error) +{ + int error = 0; + bondport_ref p = NULL; + lacp_actor_partner_state s; + + *ret_error = 0; + p = _MALLOC(sizeof(*p), M_BOND, M_WAITOK); + if (p == NULL) { + *ret_error = ENOMEM; + return (NULL); + } + bzero(p, sizeof(*p)); + multicast_list_init(&p->po_multicast); + if ((u_long)snprintf(p->po_name, sizeof(p->po_name), "%s%d", + port_ifp->if_name, port_ifp->if_unit) + >= sizeof(p->po_name)) { + printf("if_bond: name too large\n"); + *ret_error = EINVAL; + goto failed; + } + error = siocgifdevmtu(port_ifp, &p->po_devmtu); + if (error != 0) { + printf("if_bond: SIOCGIFDEVMTU %s failed, %d\n", + bondport_get_name(p), error); + goto failed; + } + /* remember the current interface MTU so it can be restored */ + p->po_devmtu.ifdm_current = port_ifp->if_mtu; + p->po_ifp = port_ifp; + p->po_media_info = interface_media_info(port_ifp); + p->po_current_while_timer = devtimer_create(bondport_timer_process_func, p); + if (p->po_current_while_timer == NULL) { + *ret_error = ENOMEM; + goto failed; + } + p->po_periodic_timer = devtimer_create(bondport_timer_process_func, p); + if (p->po_periodic_timer == NULL) { + *ret_error = ENOMEM; + goto failed; + } + p->po_wait_while_timer = devtimer_create(bondport_timer_process_func, p); + if (p->po_wait_while_timer == NULL) { + *ret_error = ENOMEM; + goto failed; + } + p->po_transmit_timer = devtimer_create(bondport_timer_process_func, p); + if (p->po_transmit_timer == NULL) { + *ret_error = ENOMEM; + goto failed; + } + p->po_receive_state = ReceiveState_none; + p->po_mux_state = MuxState_none; + p->po_priority = priority; + s = 0; + s = lacp_actor_partner_state_set_aggregatable(s); + if (short_timeout) { + s = lacp_actor_partner_state_set_short_timeout(s); + } + if (active) { + s = lacp_actor_partner_state_set_active_lacp(s); + } + p->po_actor_state = s; + return (p); + + failed: + bondport_free(p); + return (NULL); +} + +static void +bondport_start(bondport_ref p) +{ + bondport_receive_machine(p, LAEventStart, NULL); + bondport_mux_machine(p, LAEventStart, NULL); + bondport_periodic_transmit_machine(p, LAEventStart, NULL); + bondport_transmit_machine(p, LAEventStart, NULL); + return; +} + +/* + * Function: bondport_invalidate_timers + * Purpose: + * Invalidate all of the timers for the bondport. 
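
bondport_timer_process_func() above illustrates the devtimer callout protocol: the timer framework takes no locks of its own; it calls the process function with explicit lock and unlock events and lets the client bracket the expiry work. A skeleton of a conforming process function (names are from this file; the body is illustrative only):

static void
example_timer_func(devtimer_ref timer, devtimer_process_func_event event)
{
	switch (event) {
	case devtimer_process_func_event_lock:
		bond_lock();		/* framework asks us to lock */
		devtimer_retain(timer);	/* hold the timer across the callout */
		break;
	case devtimer_process_func_event_unlock:
		if (devtimer_valid(timer)) {
			/* safe to look at devtimer_arg0(timer) here */
		}
		devtimer_release(timer);
		bond_unlock();		/* and to unlock afterwards */
		break;
	default:
		break;
	}
}
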
+ */ +static void +bondport_invalidate_timers(bondport_ref p) +{ + devtimer_invalidate(p->po_current_while_timer); + devtimer_invalidate(p->po_periodic_timer); + devtimer_invalidate(p->po_wait_while_timer); + devtimer_invalidate(p->po_transmit_timer); +} + +static void +bondport_free(bondport_ref p) +{ + multicast_list_remove(&p->po_multicast); + devtimer_release(p->po_current_while_timer); + devtimer_release(p->po_periodic_timer); + devtimer_release(p->po_wait_while_timer); + devtimer_release(p->po_transmit_timer); + FREE(p, M_BOND); + return; +} + +#define BOND_ADD_PROGRESS_IN_LIST 0x1 +#define BOND_ADD_PROGRESS_PROTO_ATTACHED 0x2 +#define BOND_ADD_PROGRESS_LLADDR_SET 0x4 +#define BOND_ADD_PROGRESS_MTU_SET 0x8 + +static __inline__ int +bond_device_mtu(struct ifnet * ifp, ifbond_ref ifb) +{ + return (((int)ifp->if_mtu > ifb->ifb_altmtu) + ? (int)ifp->if_mtu : ifb->ifb_altmtu); +} + +static int +bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp) +{ + int devmtu; + int error = 0; + int event_code = 0; + ifbond_ref ifb; + struct sockaddr_dl * ifb_sdl; + bondport_ref * new_array = NULL; + bondport_ref * old_array = NULL; + bondport_ref p; + struct sockaddr_dl * port_sdl; + int progress = 0; + + /* pre-allocate space for new port */ + p = bondport_create(port_ifp, 0x8000, 1, 0, &error); + if (p == NULL) { + return (error); + } + bond_lock(); + ifb = (ifbond_ref)ifp->if_private; + if (ifb == NULL || ifbond_flags_if_detaching(ifb)) { + bond_unlock(); + bondport_free(p); + return ((ifb == NULL ? EOPNOTSUPP : EBUSY)); + } + + /* make sure this interface can handle our current MTU */ + devmtu = bond_device_mtu(ifp, ifb); + if (devmtu != 0 + && (devmtu > p->po_devmtu.ifdm_max || devmtu < p->po_devmtu.ifdm_min)) { + bond_unlock(); + printf("if_bond: interface %s doesn't support mtu %d", + bondport_get_name(p), devmtu); + bondport_free(p); + return (EINVAL); + } + + /* make sure ifb doesn't get de-allocated while we wait */ + ifbond_retain(ifb); + + /* wait for other add or remove to complete */ + ifbond_wait(ifb, "bond_add_interface"); + + if (ifbond_flags_if_detaching(ifb)) { + /* someone destroyed the bond while we were waiting */ + error = EBUSY; + goto signal_done; + } + if (bond_lookup_port(port_ifp) != NULL) { + /* port is already part of a bond */ + error = EBUSY; + goto signal_done; + } + ifnet_lock_exclusive(port_ifp); + if ((port_ifp->if_eflags & (IFEF_VLAN | IFEF_BOND)) != 0) { + /* interface already has VLAN's, or is part of bond */ + ifnet_lock_done(port_ifp); + error = EBUSY; + goto signal_done; + } + + /* mark the interface busy */ + port_ifp->if_eflags |= IFEF_BOND; + ifnet_lock_done(port_ifp); + + port_sdl = ifp_get_sdl(port_ifp); + ifb_sdl = ifp_get_sdl(ifp); + + if (TAILQ_EMPTY(&ifb->ifb_port_list)) { + ifp->if_hwassist = port_ifp->if_hwassist; + ifp->if_flags |= IFF_RUNNING; + if (ifbond_flags_lladdr(ifb) == FALSE) { + /* first port added to bond determines bond's ethernet address */ + ether_addr_copy(LLADDR(ifb_sdl), LLADDR(port_sdl)); + ifb_sdl->sdl_type = IFT_ETHER; + ifb_sdl->sdl_alen = ETHER_ADDR_LEN; + } + } else { + if (ifp->if_hwassist != port_ifp->if_hwassist) { + printf("bond_add_interface(%s, %s) " + "hwassist values don't match 0x%x != 0x%x\n", + ifb->ifb_name, bondport_get_name(p), + ifp->if_hwassist, port_ifp->if_hwassist); + /* + * XXX + * if the bond has VLAN's, we can't simply change the hwassist + * field behind its back: this needs work + */ + ifp->if_hwassist = 0; + } + } + p->po_bond = ifb; + + /* remember the port's ethernet address so it can be 
restored */ + ether_addr_copy(&p->po_saved_addr, LLADDR(port_sdl)); + + /* add it to the list of ports */ + TAILQ_INSERT_TAIL(&ifb->ifb_port_list, p, po_port_list); + ifb->ifb_port_count++; + + /* set the default MTU */ + if (ifp->if_mtu == 0) { + ifp->if_mtu = ETHERMTU; + } + bond_unlock(); + progress |= BOND_ADD_PROGRESS_IN_LIST; + + /* allocate a larger distributing array */ + new_array = (bondport_ref *) + _MALLOC(sizeof(*new_array) * ifb->ifb_port_count, M_BOND, M_WAITOK); + if (new_array == NULL) { + error = ENOMEM; + goto failed; + } + + /* attach our BOND "protocol" to the interface */ + error = bond_attach_protocol(port_ifp); + if (error) { + goto failed; + } + progress |= BOND_ADD_PROGRESS_PROTO_ATTACHED; + + /* set the interface MTU */ + devmtu = bond_device_mtu(ifp, ifb); + error = siocsifmtu(port_ifp, devmtu); + if (error != 0) { + printf("bond_add_interface(%s, %s):" + " SIOCSIFMTU %d failed %d\n", + ifb->ifb_name, bondport_get_name(p), devmtu, error); + goto failed; + } + progress |= BOND_ADD_PROGRESS_MTU_SET; + + /* program the port with our multicast addresses */ + error = multicast_list_program(&p->po_multicast, ifp, port_ifp); + if (error) { + printf("bond_add_interface(%s, %s):" + " multicast_list_program failed %d\n", + ifb->ifb_name, bondport_get_name(p), error); + goto failed; + } + + /* mark the interface up */ + ifnet_set_flags(port_ifp, IFF_UP, IFF_UP); + + error = dlil_ioctl(0, port_ifp, SIOCSIFFLAGS, (caddr_t)NULL); + if (error != 0) { + printf("bond_add_interface(%s, %s): SIOCSIFFLAGS failed %d\n", + ifb->ifb_name, bondport_get_name(p), error); + goto failed; + } + + /* re-program the port's ethernet address */ + error = if_siflladdr(port_ifp, + (const struct ether_addr *)LLADDR(ifb_sdl)); + if (error != 0) { + /* port doesn't support setting the link address */ + printf("bond_add_interface(%s, %s): if_siflladdr failed %d\n", + ifb->ifb_name, bondport_get_name(p), error); + goto failed; + } + progress |= BOND_ADD_PROGRESS_LLADDR_SET; + + bond_lock(); + + /* no failures past this point */ + p->po_enabled = 1; + + /* copy the contents of the existing distributing array */ + if (ifb->ifb_distributing_count) { + bcopy(ifb->ifb_distributing_array, new_array, + sizeof(*new_array) * ifb->ifb_distributing_count); + } + old_array = ifb->ifb_distributing_array; + ifb->ifb_distributing_array = new_array; + + /* clear the busy state, and wakeup anyone waiting */ + ifbond_signal(ifb, "bond_add_interface"); + bondport_start(p); + + /* check if we need to generate a link status event */ + if (ifbond_selection(ifb)) { + event_code = (ifb->ifb_active_lag == NULL) + ? 
KEV_DL_LINK_OFF + : KEV_DL_LINK_ON; + } + bond_unlock(); + if (event_code != 0) { + interface_link_event(ifp, event_code); + } + if (old_array != NULL) { + FREE(old_array, M_BOND); + } + return 0; + + failed: + bond_assert_lock_not_held(); + + if (new_array != NULL) { + FREE(new_array, M_BOND); + } + if ((progress & BOND_ADD_PROGRESS_LLADDR_SET) != 0) { + int error1; + + error1 = if_siflladdr(port_ifp, &p->po_saved_addr); + if (error1 != 0) { + printf("bond_add_interface(%s, %s): if_siflladdr failed %d\n", + ifb->ifb_name, bondport_get_name(p), error1); + } + } + if ((progress & BOND_ADD_PROGRESS_PROTO_ATTACHED) != 0) { + (void)bond_detach_protocol(port_ifp); + } + if ((progress & BOND_ADD_PROGRESS_MTU_SET) != 0) { + int error1; + + error1 = siocsifmtu(port_ifp, p->po_devmtu.ifdm_current); + if (error1 != 0) { + printf("bond_add_interface(%s, %s): SIOCSIFMTU %d failed %d\n", + ifb->ifb_name, bondport_get_name(p), p->po_devmtu.ifdm_current, + error1); + } + } + bond_lock(); + if ((progress & BOND_ADD_PROGRESS_IN_LIST) != 0) { + TAILQ_REMOVE(&ifb->ifb_port_list, p, po_port_list); + ifb->ifb_port_count--; + } + ifnet_lock_exclusive(port_ifp); + port_ifp->if_eflags &= ~IFEF_BOND; + ifnet_lock_done(port_ifp); + if (TAILQ_EMPTY(&ifb->ifb_port_list)) { + ifb->ifb_altmtu = 0; + ifp->if_mtu = 0; + ifp->if_hwassist = 0; + if (ifbond_flags_lladdr(ifb) == FALSE) { + bzero(LLADDR(ifb_sdl), ETHER_ADDR_LEN); + ifb_sdl->sdl_type = IFT_IEEE8023ADLAG; + ifb_sdl->sdl_alen = 0; + } + } + + signal_done: + ifbond_release(ifb); + ifbond_signal(ifb, "bond_add_interface"); + bond_unlock(); + bondport_free(p); + return (error); +} + +static int +bond_remove_interface(ifbond_ref ifb, struct ifnet * port_ifp) +{ + int active_lag = 0; + int error = 0; + int event_code = 0; + bondport_ref head_port; + struct sockaddr_dl * ifb_sdl; + struct ifnet * ifp; + int new_link_address = 0; + bondport_ref p; + lacp_actor_partner_state s; + + bond_assert_lock_held(); + + ifbond_retain(ifb); + ifbond_wait(ifb, "bond_remove_interface"); + + p = ifbond_lookup_port(ifb, port_ifp); + if (p == NULL) { + error = ENXIO; + /* it got removed by another thread */ + goto signal_done; + } + + /* de-select it and remove it from the lists */ + bondport_disable_distributing(p); + bondport_set_selected(p, SelectedState_UNSELECTED); + active_lag = bondport_remove_from_LAG(p); + TAILQ_REMOVE(&ifb->ifb_port_list, p, po_port_list); + ifb->ifb_port_count--; + + /* invalidate timers here while holding the bond_lock */ + bondport_invalidate_timers(p); + + /* announce that we're Individual now */ + s = p->po_actor_state; + s = lacp_actor_partner_state_set_individual(s); + s = lacp_actor_partner_state_set_not_collecting(s); + s = lacp_actor_partner_state_set_not_distributing(s); + s = lacp_actor_partner_state_set_out_of_sync(s); + p->po_actor_state = s; + bondport_flags_set_ntt(p); + + ifp = ifb->ifb_ifp; + ifb_sdl = ifp_get_sdl(ifp); + head_port = TAILQ_FIRST(&ifb->ifb_port_list); + if (head_port == NULL) { + ifp->if_flags &= ~IFF_RUNNING; + if (ifbond_flags_lladdr(ifb) == FALSE) { + ifb_sdl->sdl_type = IFT_IEEE8023ADLAG; + ifb_sdl->sdl_alen = 0; + bzero(LLADDR(ifb_sdl), ETHER_ADDR_LEN); + } + ifp->if_hwassist = 0; + ifp->if_mtu = 0; + ifb->ifb_altmtu = 0; + } else if (ifbond_flags_lladdr(ifb) == FALSE + && bcmp(&p->po_saved_addr, LLADDR(ifb_sdl), + ETHER_ADDR_LEN) == 0) { + /* this port gave the bond its ethernet address, switch to new one */ + ether_addr_copy(LLADDR(ifb_sdl), &head_port->po_saved_addr); + ifb_sdl->sdl_type = IFT_ETHER; + 
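
The failed: path above shows the progress-bitmask idiom that bond_add_interface() relies on: each side effect records a BOND_ADD_PROGRESS_* bit once it succeeds, and the error path tests the bits and undoes only the steps that actually completed. Reduced to a skeleton (two steps shown; the real function tracks four bits):

	int progress = 0;

	if ((error = bond_attach_protocol(port_ifp)) != 0)
		goto failed;
	progress |= BOND_ADD_PROGRESS_PROTO_ATTACHED;
	if ((error = siocsifmtu(port_ifp, devmtu)) != 0)
		goto failed;
	progress |= BOND_ADD_PROGRESS_MTU_SET;
	/* ... further steps; on success the unwind below is skipped ... */
 failed:
	if ((progress & BOND_ADD_PROGRESS_MTU_SET) != 0)
		(void)siocsifmtu(port_ifp, p->po_devmtu.ifdm_current);
	if ((progress & BOND_ADD_PROGRESS_PROTO_ATTACHED) != 0)
		(void)bond_detach_protocol(port_ifp);
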
ifb_sdl->sdl_alen = ETHER_ADDR_LEN; + new_link_address = 1; + } + /* check if we need to generate a link status event */ + if (ifbond_selection(ifb) || active_lag) { + event_code = (ifb->ifb_active_lag == NULL) + ? KEV_DL_LINK_OFF + : KEV_DL_LINK_ON; + } + bond_unlock(); + + bondport_transmit_machine(p, LAEventStart, (void *)1); + + if (new_link_address) { + struct ifnet * scan_ifp; + bondport_ref scan_port; + + /* ifbond_wait() allows port list traversal without holding the lock */ + + /* re-program each port with the new link address */ + TAILQ_FOREACH(scan_port, &ifb->ifb_port_list, po_port_list) { + scan_ifp = scan_port->po_ifp; + + error = if_siflladdr(scan_ifp, + (const struct ether_addr *) LLADDR(ifb_sdl)); + if (error != 0) { + printf("bond_remove_interface(%s, %s): " + "if_siflladdr (%s) failed %d\n", + ifb->ifb_name, bondport_get_name(p), + bondport_get_name(scan_port), error); + } + } + } + + /* restore the port's ethernet address */ + error = if_siflladdr(port_ifp, &p->po_saved_addr); + if (error != 0) { + printf("bond_remove_interface(%s, %s): if_siflladdr failed %d\n", + ifb->ifb_name, bondport_get_name(p), error); + } + + /* restore the port's MTU */ + error = siocsifmtu(port_ifp, p->po_devmtu.ifdm_current); + if (error != 0) { + printf("bond_remove_interface(%s, %s): SIOCSIFMTU %d failed %d\n", + ifb->ifb_name, bondport_get_name(p), + p->po_devmtu.ifdm_current, error); + } + + /* remove the bond "protocol" */ + bond_detach_protocol(port_ifp); + + /* generate link event */ + if (event_code != 0) { + interface_link_event(ifp, event_code); + } + + bond_lock(); + ifbond_release(ifb); + bondport_free(p); + ifnet_lock_exclusive(port_ifp); + port_ifp->if_eflags &= ~IFEF_BOND; + ifnet_lock_done(port_ifp); + + signal_done: + ifbond_signal(ifb, "bond_remove_interface"); + ifbond_release(ifb); /* a second release for the second reference */ + return (error); +} + +static int +bond_get_status(ifbond_ref ifb, struct if_bond_req * ibr_p, user_addr_t datap) +{ + int count; + user_addr_t dst; + int error = 0; + struct if_bond_status_req * ibsr; + struct if_bond_status ibs; + bondport_ref port; + + ibsr = &(ibr_p->ibr_ibru.ibru_status); + if (ibsr->ibsr_version != IF_BOND_STATUS_REQ_VERSION) { + return (EINVAL); + } + ibsr->ibsr_key = ifb->ifb_key; + ibsr->ibsr_total = ifb->ifb_port_count; + dst = proc_is64bit(current_proc()) + ? ibsr->ibsr_ibsru.ibsru_buffer64 + : CAST_USER_ADDR_T(ibsr->ibsr_ibsru.ibsru_buffer32); + if (dst == USER_ADDR_NULL) { + /* just want to know how many there are */ + goto done; + } + if (ibsr->ibsr_count < 0) { + return (EINVAL); + } + count = (ifb->ifb_port_count < ibsr->ibsr_count) + ? 
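/*
 * (Aside: the status loop just below clamps the caller-supplied capacity
 * against the live port count, copies one record per iteration, and
 * always reports the total via ibsr_total so userland can size a larger
 * buffer and retry. A self-contained sketch of the same export pattern,
 * with hypothetical names:
 *
 *	static int
 *	export_records(user_addr_t dst, int capacity,
 *	    int live_count, int * total_out)
 *	{
 *	    struct record	r;	// hypothetical fixed-size record
 *	    int			error = 0;
 *	    int			i, n;
 *
 *	    *total_out = live_count;
 *	    n = (live_count < capacity) ? live_count : capacity;
 *	    for (i = 0; i < n; i++) {
 *		fill_record(&r, i);	// hypothetical
 *		error = copyout(&r, dst, sizeof(r));
 *		if (error != 0)
 *		    break;
 *		dst += sizeof(r);
 *	    }
 *	    return (error);
 *	}
 *
 * Passing USER_ADDR_NULL as the buffer, as bond_get_status() allows,
 * degenerates into a pure "how many are there" query.)
 */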
ifb->ifb_port_count : ibsr->ibsr_count; + TAILQ_FOREACH(port, &ifb->ifb_port_list, po_port_list) { + struct if_bond_partner_state * ibps_p; + partner_state_ref ps; + + if (count == 0) { + break; + } + bzero(&ibs, sizeof(ibs)); + strncpy(ibs.ibs_if_name, port->po_name, sizeof(ibs.ibs_if_name)); + ibs.ibs_port_priority = port->po_priority; + ibs.ibs_state = port->po_actor_state; + ibs.ibs_selected_state = port->po_selected; + ps = &port->po_partner_state; + ibps_p = &ibs.ibs_partner_state; + ibps_p->ibps_system = ps->ps_lag_info.li_system; + ibps_p->ibps_system_priority = ps->ps_lag_info.li_system_priority; + ibps_p->ibps_key = ps->ps_lag_info.li_key; + ibps_p->ibps_port = ps->ps_port; + ibps_p->ibps_port_priority = ps->ps_port_priority; + ibps_p->ibps_state = ps->ps_state; + error = copyout(&ibs, dst, sizeof(ibs)); + if (error != 0) { + break; + } + dst += sizeof(ibs); + count--; + } + + done: + if (error == 0) { + error = copyout(ibr_p, datap, sizeof(*ibr_p)); + } + else { + (void)copyout(ibr_p, datap, sizeof(*ibr_p)); + } + return (error); +} + +static int +bond_set_promisc(__unused struct ifnet *ifp) +{ + int error = 0; +#if 0 + ifbond_ref ifb = ifp->if_private; + + + if ((ifp->if_flags & IFF_PROMISC) != 0) { + if ((ifb->ifb_flags & IFBF_PROMISC) == 0) { + error = ifnet_set_promiscuous(ifb->ifb_p, 1); + if (error == 0) + ifb->ifb_flags |= IFBF_PROMISC; + } + } else { + if ((ifb->ifb_flags & IFBF_PROMISC) != 0) { + error = ifnet_set_promiscuous(ifb->ifb_p, 0); + if (error == 0) + ifb->ifb_flags &= ~IFBF_PROMISC; + } + } +#endif /* 0 */ + return (error); +} + +static void +bond_get_mtu_values(ifbond_ref ifb, int * ret_min, int * ret_max) +{ + int mtu_min = 0; + int mtu_max = 0; + bondport_ref p; + + if (TAILQ_FIRST(&ifb->ifb_port_list) != NULL) { + mtu_min = IF_MINMTU; + } + TAILQ_FOREACH(p, &ifb->ifb_port_list, po_port_list) { + struct ifdevmtu * devmtu_p = &p->po_devmtu; + + if (devmtu_p->ifdm_min > mtu_min) { + mtu_min = devmtu_p->ifdm_min; + } + if (mtu_max == 0 || devmtu_p->ifdm_max < mtu_max) { + mtu_max = devmtu_p->ifdm_max; + } + } + *ret_min = mtu_min; + *ret_max = mtu_max; + return; +} + +static int +bond_set_mtu_on_ports(ifbond_ref ifb, int mtu) +{ + int error = 0; + bondport_ref p; + + TAILQ_FOREACH(p, &ifb->ifb_port_list, po_port_list) { + error = siocsifmtu(p->po_ifp, mtu); + if (error != 0) { + printf("if_bond(%s): SIOCSIFMTU %s failed, %d\n", + ifb->ifb_name, bondport_get_name(p), error); + break; + } + } + return (error); +} + +static int +bond_set_mtu(struct ifnet * ifp, int mtu, int isdevmtu) +{ + int error = 0; + ifbond_ref ifb; + int mtu_min; + int mtu_max; + int new_max; + int old_max; + + bond_lock(); + ifb = (ifbond_ref)ifp->if_private; + if (ifb == NULL || ifbond_flags_if_detaching(ifb)) { + error = (ifb == NULL) ? EOPNOTSUPP : EBUSY; + goto done; + } + ifbond_retain(ifb); + ifbond_wait(ifb, "bond_set_mtu"); + + /* check again */ + if (ifp->if_private == NULL || ifbond_flags_if_detaching(ifb)) { + error = EBUSY; + goto signal_done; + } + bond_get_mtu_values(ifb, &mtu_min, &mtu_max); + if (mtu > mtu_max) { + error = EINVAL; + goto signal_done; + } + if (mtu < mtu_min && (isdevmtu == 0 || mtu != 0)) { + /* allow SIOCSIFALTMTU to set the mtu to 0 */ + error = EINVAL; + goto signal_done; + } + if (isdevmtu) { + new_max = (mtu > (int)ifp->if_mtu) ? mtu : (int)ifp->if_mtu; + } + else { + new_max = (mtu > ifb->ifb_altmtu) ? mtu : ifb->ifb_altmtu; + } + old_max = ((int)ifp->if_mtu > ifb->ifb_altmtu) + ? 
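/*
 * (Aside: bond_get_mtu_values() above computes the intersection of the
 * member ports' MTU ranges: the bond's usable minimum is the largest
 * per-port minimum, its maximum the smallest per-port maximum. The same
 * computation over plain arrays, as a standalone sketch with
 * hypothetical names:
 *
 *	static void
 *	mtu_bounds(const int * mins, const int * maxs, int n,
 *	    int * lo, int * hi)
 *	{
 *	    int		i;
 *
 *	    *lo = 0;
 *	    *hi = 0;
 *	    for (i = 0; i < n; i++) {
 *		if (mins[i] > *lo)
 *		    *lo = mins[i];
 *		if (*hi == 0 || maxs[i] < *hi)
 *		    *hi = maxs[i];
 *	    }
 *	}
 *
 * Any MTU the bond accepts must then fall inside [*lo, *hi], which is
 * exactly the check bond_set_mtu() performs before touching the ports.)
 */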
(int)ifp->if_mtu : ifb->ifb_altmtu; + if (new_max != old_max) { + /* we can safely walk the list of ports without the lock held */ + bond_unlock(); + error = bond_set_mtu_on_ports(ifb, new_max); + if (error != 0) { + /* try our best to back out of it */ + (void)bond_set_mtu_on_ports(ifb, old_max); + } + bond_lock(); + } + if (error == 0) { + if (isdevmtu) { + ifb->ifb_altmtu = mtu; + } + else { + ifp->if_mtu = mtu; + } + } + + signal_done: + ifbond_signal(ifb, "bond_set_mtu"); + ifbond_release(ifb); + + done: + bond_unlock(); + return (error); +} + +static int +bond_ioctl(struct ifnet *ifp, u_int32_t cmd, void * data) +{ + int error = 0; + struct if_bond_req ibr; + struct ifaddr * ifa; + ifbond_ref ifb; + struct ifreq * ifr; + struct ifmediareq64 *ifmr; + struct ifnet * port_ifp = NULL; + user_addr_t user_addr; + + if (ifp->if_type != IFT_IEEE8023ADLAG) { + return (EOPNOTSUPP); + } + ifr = (struct ifreq *)data; + ifa = (struct ifaddr *)data; + + switch (cmd) { + case SIOCSIFADDR: + ifnet_set_flags(ifp, IFF_UP, IFF_UP); + break; + + case SIOCGIFMEDIA64: + case SIOCGIFMEDIA: + bond_lock(); + ifb = (ifbond_ref)ifp->if_private; + if (ifb == NULL || ifbond_flags_if_detaching(ifb)) { + bond_unlock(); + return (ifb == NULL ? EOPNOTSUPP : EBUSY); + } + ifmr = (struct ifmediareq64 *)data; + ifmr->ifm_current = IFM_ETHER; + ifmr->ifm_mask = 0; + ifmr->ifm_status = IFM_AVALID; + ifmr->ifm_active = IFM_ETHER; + ifmr->ifm_count = 1; + if (ifb->ifb_active_lag != NULL) { + ifmr->ifm_active = ifb->ifb_active_lag->lag_active_media; + ifmr->ifm_status |= IFM_ACTIVE; + } + bond_unlock(); + user_addr = (cmd == SIOCGIFMEDIA64) + ? ifmr->ifm_ifmu.ifmu_ulist64 + : CAST_USER_ADDR_T(ifmr->ifm_ifmu.ifmu_ulist32); + if (user_addr != USER_ADDR_NULL) { + error = copyout(&ifmr->ifm_current, + user_addr, + sizeof(int)); + } + break; + + case SIOCSIFMEDIA: + /* XXX send the SIFMEDIA to all children? Or force autoselect? */ + error = EINVAL; + break; + + case SIOCGIFDEVMTU: + bond_lock(); + ifb = (ifbond_ref)ifp->if_private; + if (ifb == NULL || ifbond_flags_if_detaching(ifb)) { + bond_unlock(); + error = (ifb == NULL) ? EOPNOTSUPP : EBUSY; + break; + } + ifr->ifr_devmtu.ifdm_current = bond_device_mtu(ifp, ifb); + bond_get_mtu_values(ifb, &ifr->ifr_devmtu.ifdm_min, + &ifr->ifr_devmtu.ifdm_max); + bond_unlock(); + break; + + case SIOCGIFALTMTU: + bond_lock(); + ifb = (ifbond_ref)ifp->if_private; + if (ifb == NULL || ifbond_flags_if_detaching(ifb)) { + bond_unlock(); + error = (ifb == NULL) ? EOPNOTSUPP : EBUSY; + break; + } + ifr->ifr_mtu = ifb->ifb_altmtu; + bond_unlock(); + break; + + case SIOCSIFALTMTU: + error = bond_set_mtu(ifp, ifr->ifr_mtu, 1); + break; + + case SIOCSIFMTU: + error = bond_set_mtu(ifp, ifr->ifr_mtu, 0); + break; + + case SIOCSIFBOND: + user_addr = proc_is64bit(current_proc()) + ? 
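/*
 * (Aside: the SIOCSIFBOND handler being completed here must fetch the
 * request from whichever pointer width the calling process uses;
 * proc_is64bit() selects between the 64-bit field and the zero-extended
 * 32-bit one. The same selection as a sketch, with a hypothetical
 * request struct; proc_is64bit(), current_proc(), user_addr_t and
 * CAST_USER_ADDR_T are real xnu facilities:
 *
 *	struct my_req {			// hypothetical
 *	    union {
 *		u_int32_t	data32;
 *		u_int64_t	data64;
 *	    } u;
 *	};
 *
 *	static user_addr_t
 *	req_user_buffer(struct my_req * req)
 *	{
 *	    return (proc_is64bit(current_proc())
 *		? req->u.data64
 *		: CAST_USER_ADDR_T(req->u.data32));
 *	}
 *
 * Collapsing both ABIs to a user_addr_t up front lets a single
 * copyin()/copyout() path serve 32- and 64-bit callers.)
 */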
ifr->ifr_data64 : CAST_USER_ADDR_T(ifr->ifr_data); + error = copyin(user_addr, &ibr, sizeof(ibr)); + if (error) { + break; + } + switch (ibr.ibr_op) { + case IF_BOND_OP_ADD_INTERFACE: + case IF_BOND_OP_REMOVE_INTERFACE: + /* XXX ifunit() needs to return a reference on the ifp */ + port_ifp = ifunit(ibr.ibr_ibru.ibru_if_name); + if (port_ifp == NULL) { + error = ENXIO; + break; + } + if (port_ifp->if_type != IFT_ETHER) { + error = EPROTONOSUPPORT; + break; + } + break; + case IF_BOND_OP_SET_VERBOSE: + break; + default: + error = EOPNOTSUPP; + break; + } + if (error != 0) { + break; + } + switch (ibr.ibr_op) { + case IF_BOND_OP_ADD_INTERFACE: + error = bond_add_interface(ifp, port_ifp); + break; + case IF_BOND_OP_REMOVE_INTERFACE: + bond_lock(); + ifb = (ifbond_ref)ifp->if_private; + if (ifb == NULL || ifbond_flags_if_detaching(ifb)) { + bond_unlock(); + return (ifb == NULL ? EOPNOTSUPP : EBUSY); + } + error = bond_remove_interface(ifb, port_ifp); + bond_unlock(); + break; + case IF_BOND_OP_SET_VERBOSE: + bond_lock(); + if (g_bond == NULL) { + bond_unlock(); + error = ENXIO; + break; + } + g_bond->verbose = ibr.ibr_ibru.ibru_int_val; + bond_unlock(); + break; + } + break; + + case SIOCGIFBOND: + user_addr = proc_is64bit(current_proc()) + ? ifr->ifr_data64 : CAST_USER_ADDR_T(ifr->ifr_data); + error = copyin(user_addr, &ibr, sizeof(ibr)); + if (error) { + break; + } + switch (ibr.ibr_op) { + case IF_BOND_OP_GET_STATUS: + break; + default: + error = EOPNOTSUPP; + break; + } + if (error != 0) { + break; + } + bond_lock(); + ifb = (ifbond_ref)ifp->if_private; + if (ifb == NULL || ifbond_flags_if_detaching(ifb)) { + bond_unlock(); + return (ifb == NULL ? EOPNOTSUPP : EBUSY); + } + switch (ibr.ibr_op) { + case IF_BOND_OP_GET_STATUS: + error = bond_get_status(ifb, &ibr, user_addr); + break; + } + bond_unlock(); + break; + + case SIOCSIFLLADDR: + error = EOPNOTSUPP; + break; + + case SIOCSIFFLAGS: + /* enable/disable promiscuous mode */ + bond_lock(); + error = bond_set_promisc(ifp); + bond_unlock(); + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: + error = bond_setmulti(ifp); + break; + default: + error = EOPNOTSUPP; + } + return error; +} + +static void +bond_if_free(struct ifnet * ifp) +{ + ifbond_ref ifb; + + if (ifp == NULL) { + return; + } + bond_lock(); + ifb = (ifbond_ref)ifp->if_private; + if (ifb == NULL) { + bond_unlock(); + return; + } + ifp->if_private = NULL; + ifbond_release(ifb); + bond_unlock(); + dlil_if_release(ifp); + return; +} + +static void +bond_event(struct ifnet * port_ifp, struct kev_msg * event) +{ + struct ifnet * bond_ifp = NULL; + int event_code = 0; + bondport_ref p; + struct media_info media_info; + + if (event->vendor_code != KEV_VENDOR_APPLE + || event->kev_class != KEV_NETWORK_CLASS + || event->kev_subclass != KEV_DL_SUBCLASS) { + return; + } + switch (event->event_code) { + case KEV_DL_IF_DETACHING: + break; + case KEV_DL_LINK_OFF: + case KEV_DL_LINK_ON: + media_info = interface_media_info(port_ifp); + break; + default: + return; + } + bond_lock(); + p = bond_lookup_port(port_ifp); + if (p == NULL) { + bond_unlock(); + return; + } + switch (event->event_code) { + case KEV_DL_IF_DETACHING: + bond_remove_interface(p->po_bond, p->po_ifp); + break; + case KEV_DL_LINK_OFF: + case KEV_DL_LINK_ON: + p->po_media_info = media_info; + if (p->po_enabled) { + bondport_link_status_changed(p); + } + break; + } + /* generate a link-event */ + if (ifbond_selection(p->po_bond)) { + event_code = (p->po_bond->ifb_active_lag == NULL) + ? 
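/*
 * (Aside: note the pattern in bond_event() around this point: the
 * link-state decision is made while holding bond_lock, but
 * interface_link_event() is only called after the lock is dropped just
 * below, so posting the event cannot re-enter bond code under the lock.
 * A condensed sketch of the capture-then-publish idiom, names
 * hypothetical; as the XXX in the source notes, a production version
 * would also need to hold a reference on the interface across the
 * unlock:
 *
 *	my_lock();
 *	struct ifnet *	notify_ifp = NULL;
 *	int		code = 0;
 *	if (selection_changed()) {	// hypothetical predicate
 *	    code = lag_is_active() ? KEV_DL_LINK_ON : KEV_DL_LINK_OFF;
 *	    notify_ifp = bond_ifp;
 *	}
 *	my_unlock();
 *	if (notify_ifp != NULL)
 *	    interface_link_event(notify_ifp, code);
 */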
KEV_DL_LINK_OFF + : KEV_DL_LINK_ON; + /* XXX need to take a reference on bond_ifp */ + bond_ifp = p->po_bond->ifb_ifp; + } + bond_unlock(); + if (bond_ifp != NULL) { + interface_link_event(bond_ifp, event_code); + } + return; +} + +static void +interface_link_event(struct ifnet * ifp, u_long event_code) +{ + struct { + struct kern_event_msg header; + u_long unit; + char if_name[IFNAMSIZ]; + } event; + + event.header.total_size = sizeof(event); + event.header.vendor_code = KEV_VENDOR_APPLE; + event.header.kev_class = KEV_NETWORK_CLASS; + event.header.kev_subclass = KEV_DL_SUBCLASS; + event.header.event_code = event_code; + event.header.event_data[0] = ifp->if_family; + event.unit = (u_long) ifp->if_unit; + strncpy(event.if_name, ifp->if_name, IFNAMSIZ); + dlil_event(ifp, &event.header); + return; +} + +/* + * Function: bond_attach_protocol + * Purpose: + * Attach a DLIL protocol to the interface. + * + * The ethernet demux special cases to always return PF_BOND if the + * interface is bonded. That means we receive all traffic from that + * interface without passing any of the traffic to any other attached + * protocol. + */ +static int +bond_attach_protocol(struct ifnet *ifp) +{ + int error; + struct dlil_proto_reg_str reg; + + bzero(&reg, sizeof(reg)); + TAILQ_INIT(&reg.demux_desc_head); + reg.interface_family = ifp->if_family; + reg.unit_number = ifp->if_unit; + reg.input = bond_input; + reg.event = bond_event; + reg.protocol_family = PF_BOND; + + error = dlil_attach_protocol(&reg); + if (error) { + printf("bond over %s%d: dlil_attach_protocol failed, %d\n", + ifp->if_name, ifp->if_unit, error); + } + return (error); +} + +/* + * Function: bond_detach_protocol + * Purpose: + * Detach our DLIL protocol from an interface + */ +static int +bond_detach_protocol(struct ifnet *ifp) +{ + int error; + + error = dlil_detach_protocol(ifp, PF_BOND); + if (error) { + printf("bond over %s%d: dlil_detach_protocol failed, %d\n", + ifp->if_name, ifp->if_unit, error); + } + return (error); +} + +/* + * DLIL interface family functions + */ +extern int ether_add_if(struct ifnet *ifp); +extern int ether_del_if(struct ifnet *ifp); +extern int ether_init_if(struct ifnet *ifp); +extern int ether_add_proto_old(struct ifnet *ifp, u_long protocol_family, + struct ddesc_head_str *desc_head); + +extern int ether_attach_inet(struct ifnet *ifp, u_long protocol_family); +extern int ether_detach_inet(struct ifnet *ifp, u_long protocol_family); +extern int ether_attach_inet6(struct ifnet *ifp, u_long protocol_family); +extern int ether_detach_inet6(struct ifnet *ifp, u_long protocol_family); + +__private_extern__ int +bond_family_init(void) +{ + int error=0; + struct dlil_ifmod_reg_str ifmod_reg; + + bzero(&ifmod_reg, sizeof(ifmod_reg)); + ifmod_reg.add_if = ether_add_if; + ifmod_reg.del_if = ether_del_if; + ifmod_reg.init_if = NULL; + ifmod_reg.add_proto = ether_add_proto_old; + ifmod_reg.del_proto = ether_del_proto; + ifmod_reg.ifmod_ioctl = ether_ioctl; + ifmod_reg.shutdown = NULL; + + if (dlil_reg_if_modules(APPLE_IF_FAM_BOND, &ifmod_reg)) { + printf("WARNING: bond_family_init -- " + "Can't register if family modules\n"); + error = EIO; + goto done; + } + + error = dlil_reg_proto_module(PF_INET, APPLE_IF_FAM_BOND, + ether_attach_inet, + ether_detach_inet); + if (error != 0) { + printf("bond: dlil_reg_proto_module failed for AF_INET error=%d\n", + error); + goto done; + } + + error = dlil_reg_proto_module(PF_INET6, APPLE_IF_FAM_BOND, + ether_attach_inet6, + ether_detach_inet6); + if (error != 0) { + printf("bond: 
dlil_reg_proto_module failed for AF_INET6 error=%d\n", + error); + goto done; + } + bond_clone_attach(); + + done: + return (error); +} +/** + ** + ** LACP routines: + ** + **/ + +/** + ** LACP ifbond_list routines + **/ +static bondport_ref +ifbond_list_find_moved_port(bondport_ref rx_port, + const lacp_actor_partner_tlv_ref atlv) +{ + ifbond_ref bond; + bondport_ref p; + partner_state_ref ps; + LAG_info_ref ps_li; + + TAILQ_FOREACH(bond, &g_bond->ifbond_list, ifb_bond_list) { + TAILQ_FOREACH(p, &bond->ifb_port_list, po_port_list) { + + if (rx_port == p) { + /* no point in comparing against ourselves */ + continue; + } + if (p->po_receive_state != ReceiveState_PORT_DISABLED) { + /* it's not clear that we should be checking this */ + continue; + } + ps = &p->po_partner_state; + if (lacp_actor_partner_state_defaulted(ps->ps_state)) { + continue; + } + ps_li = &ps->ps_lag_info; + if (ps->ps_port == lacp_actor_partner_tlv_get_port(atlv) + && bcmp(&ps_li->li_system, atlv->lap_system, + sizeof(ps_li->li_system)) == 0) { + if (g_bond->verbose) { + timestamp_printf("System " EA_FORMAT + " Port 0x%x moved from %s to %s\n", + EA_LIST(&ps_li->li_system), ps->ps_port, + bondport_get_name(p), + bondport_get_name(rx_port)); + } + return (p); + } + } + } + return (NULL); +} + +/** + ** LACP ifbond, LAG routines + **/ + +static int +ifbond_selection(ifbond_ref bond) +{ + int all_ports_ready = 0; + int active_media = 0; + LAG_ref lag = NULL; + int lag_changed = 0; + bondport_ref p; + int port_speed = 0; + + lag = ifbond_find_best_LAG(bond, &active_media); + if (lag != bond->ifb_active_lag) { + if (bond->ifb_active_lag != NULL) { + ifbond_deactivate_LAG(bond, bond->ifb_active_lag); + bond->ifb_active_lag = NULL; + } + bond->ifb_active_lag = lag; + if (lag != NULL) { + ifbond_activate_LAG(bond, lag, active_media); + } + lag_changed = 1; + } + else if (lag != NULL) { + if (lag->lag_active_media != active_media) { + if (g_bond->verbose) { + timestamp_printf("LAG PORT SPEED CHANGED from %d to %d\n", + link_speed(lag->lag_active_media), + link_speed(active_media)); + } + ifbond_deactivate_LAG(bond, lag); + ifbond_activate_LAG(bond, lag, active_media); + lag_changed = 1; + } + } + if (lag != NULL) { + port_speed = link_speed(active_media); + all_ports_ready = ifbond_all_ports_ready(bond); + } + TAILQ_FOREACH(p, &bond->ifb_port_list, po_port_list) { + if (lag != NULL && p->po_lag == lag + && media_speed(&p->po_media_info) == port_speed + && (p->po_mux_state == MuxState_DETACHED + || p->po_selected == SelectedState_SELECTED + || p->po_selected == SelectedState_STANDBY) + && bondport_aggregatable(p)) { + if (bond->ifb_max_active > 0) { + if (lag->lag_selected_port_count < bond->ifb_max_active) { + if (p->po_selected == SelectedState_STANDBY + || p->po_selected == SelectedState_UNSELECTED) { + bondport_set_selected(p, SelectedState_SELECTED); + } + } + else if (p->po_selected == SelectedState_UNSELECTED) { + bondport_set_selected(p, SelectedState_STANDBY); + } + } + else { + bondport_set_selected(p, SelectedState_SELECTED); + } + } + if (bondport_flags_selected_changed(p)) { + bondport_flags_clear_selected_changed(p); + bondport_mux_machine(p, LAEventSelectedChange, NULL); + } + if (all_ports_ready + && bondport_flags_ready(p) + && p->po_mux_state == MuxState_WAITING) { + bondport_mux_machine(p, LAEventReady, NULL); + } + bondport_transmit_machine(p, LAEventStart, NULL); + } + return (lag_changed); +} + +static LAG_ref +ifbond_find_best_LAG(ifbond_ref bond, int * active_media) +{ + int best_active = 0; + LAG_ref 
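/*
 * (Aside: ifbond_find_best_LAG() below scores each candidate LAG as
 * aggregatable-port-count times per-link speed, clamping the count at
 * ifb_max_active since extra links beyond the limit add no bandwidth.
 * The scoring step in isolation, as a sketch:
 *
 *	static int
 *	lag_score(int count, int speed, int max_active)
 *	{
 *	    if (max_active > 0 && count > max_active)
 *		count = max_active;	// capped links add no bandwidth
 *	    return (count * speed);
 *	}
 *
 * Seeding the search with the currently active LAG and using a strictly
 * greater-than comparison gives the active LAG the tie, which avoids
 * flapping between equally good aggregations.)
 */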
best_lag = NULL; + int best_count = 0; + int best_speed = 0; + LAG_ref lag; + + if (bond->ifb_active_lag != NULL) { + best_lag = bond->ifb_active_lag; + best_count = LAG_get_aggregatable_port_count(best_lag, &best_active); + if (bond->ifb_max_active > 0 + && best_count > bond->ifb_max_active) { + best_count = bond->ifb_max_active; + } + best_speed = link_speed(best_active); + } + TAILQ_FOREACH(lag, &bond->ifb_lag_list, lag_list) { + int active; + int count; + int speed; + + if (lag == bond->ifb_active_lag) { + /* we've already computed it */ + continue; + } + count = LAG_get_aggregatable_port_count(lag, &active); + if (count == 0) { + continue; + } + if (bond->ifb_max_active > 0 + && count > bond->ifb_max_active) { + /* if there's a limit, don't count extra links */ + count = bond->ifb_max_active; + } + speed = link_speed(active); + if ((count * speed) > (best_count * best_speed)) { + best_count = count; + best_speed = speed; + best_active = active; + best_lag = lag; + } + } + if (best_count == 0) { + return (NULL); + } + *active_media = best_active; + return (best_lag); +} + +static void +ifbond_deactivate_LAG(__unused ifbond_ref bond, LAG_ref lag) +{ + bondport_ref p; + + TAILQ_FOREACH(p, &lag->lag_port_list, po_lag_port_list) { + bondport_set_selected(p, SelectedState_UNSELECTED); + } + return; +} + +static void +ifbond_activate_LAG(ifbond_ref bond, LAG_ref lag, int active_media) +{ + int need = 0; + bondport_ref p; + + if (bond->ifb_max_active > 0) { + need = bond->ifb_max_active; + } + lag->lag_active_media = active_media; + TAILQ_FOREACH(p, &lag->lag_port_list, po_lag_port_list) { + if (bondport_aggregatable(p) == 0) { + bondport_set_selected(p, SelectedState_UNSELECTED); + } + else if (media_speed(&p->po_media_info) != link_speed(active_media)) { + bondport_set_selected(p, SelectedState_UNSELECTED); + } + else if (p->po_mux_state == MuxState_DETACHED) { + if (bond->ifb_max_active > 0) { + if (need > 0) { + bondport_set_selected(p, SelectedState_SELECTED); + need--; + } + else { + bondport_set_selected(p, SelectedState_STANDBY); + } + } + else { + bondport_set_selected(p, SelectedState_SELECTED); + } + } + else { + bondport_set_selected(p, SelectedState_UNSELECTED); + } + } + return; +} + +#if 0 +static void +ifbond_set_max_active(ifbond_ref bond, int max_active) +{ + LAG_ref lag = bond->ifb_active_lag; + + bond->ifb_max_active = max_active; + if (bond->ifb_max_active <= 0 || lag == NULL) { + return; + } + if (lag->lag_selected_port_count > bond->ifb_max_active) { + bondport_ref p; + int remove_count; + + remove_count = lag->lag_selected_port_count - bond->ifb_max_active; + TAILQ_FOREACH(p, &lag->lag_port_list, po_lag_port_list) { + if (p->po_selected == SelectedState_SELECTED) { + bondport_set_selected(p, SelectedState_UNSELECTED); + remove_count--; + if (remove_count == 0) { + break; + } + } + } + } + return; +} +#endif /* 0 */ + +static int +ifbond_all_ports_ready(ifbond_ref bond) +{ + int ready = 0; + bondport_ref p; + + if (bond->ifb_active_lag == NULL) { + return (0); + } + TAILQ_FOREACH(p, &bond->ifb_active_lag->lag_port_list, po_lag_port_list) { + if (p->po_mux_state == MuxState_WAITING + && p->po_selected == SelectedState_SELECTED) { + if (bondport_flags_ready(p) == 0) { + return (0); + } + } + /* note that there was at least one ready port */ + ready = 1; + } + return (ready); +} + +static int +ifbond_all_ports_attached(ifbond_ref bond, bondport_ref this_port) +{ + bondport_ref p; + + TAILQ_FOREACH(p, &bond->ifb_port_list, po_port_list) { + if (this_port == p) { + continue; + } + 
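/*
 * (Aside: ifbond_activate_LAG() above hands out SELECTED slots with a
 * simple countdown: the first ifb_max_active eligible, detached ports
 * are selected, the rest parked in STANDBY, and anything mismatched in
 * speed or not aggregatable is unselected. The same allotment over a
 * plain array, names hypothetical:
 *
 *	static void
 *	allot_slots(struct port ** ports, int n, int lag_speed,
 *	    int max_active)
 *	{
 *	    int		need = max_active;	// 0 means unlimited
 *	    int		i;
 *
 *	    for (i = 0; i < n; i++) {
 *		struct port * p = ports[i];
 *
 *		if (!port_aggregatable(p)
 *		    || port_speed(p) != lag_speed)
 *		    port_select(p, UNSELECTED);
 *		else if (port_detached(p))
 *		    port_select(p, (max_active == 0 || need-- > 0)
 *			? SELECTED : STANDBY);
 *		else
 *		    port_select(p, UNSELECTED);
 *	    }
 *	}
 */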
if (bondport_flags_mux_attached(p) == 0) { + return (0); + } + } + return (1); +} + +static LAG_ref +ifbond_get_LAG_matching_port(ifbond_ref bond, bondport_ref p) +{ + LAG_ref lag; + + TAILQ_FOREACH(lag, &bond->ifb_lag_list, lag_list) { + if (bcmp(&lag->lag_info, &p->po_partner_state.ps_lag_info, + sizeof(lag->lag_info)) == 0) { + return (lag); + } + } + return (NULL); +} + +static int +LAG_get_aggregatable_port_count(LAG_ref lag, int * active_media) +{ + int active; + int count; + bondport_ref p; + int speed; + + active = 0; + count = 0; + speed = 0; + TAILQ_FOREACH(p, &lag->lag_port_list, po_lag_port_list) { + if (bondport_aggregatable(p)) { + int this_speed; + + this_speed = media_speed(&p->po_media_info); + if (this_speed == 0) { + continue; + } + if (this_speed > speed) { + active = p->po_media_info.mi_active; + speed = this_speed; + count = 1; + } + else if (this_speed == speed) { + count++; + } + } + } + *active_media = active; + return (count); +} + + +/** + ** LACP bondport routines + **/ +static void +bondport_link_status_changed(bondport_ref p) +{ + ifbond_ref bond = p->po_bond; + + if (g_bond->verbose) { + if (media_active(&p->po_media_info)) { + timestamp_printf("[%s] Link UP %d Mbit/s %s duplex\n", + bondport_get_name(p), + media_speed(&p->po_media_info), + media_full_duplex(&p->po_media_info) + ? "full" : "half"); + } + else { + timestamp_printf("[%s] Link DOWN\n", bondport_get_name(p)); + } + } + if (media_active(&p->po_media_info) + && bond->ifb_active_lag != NULL + && p->po_lag == bond->ifb_active_lag + && p->po_selected != SelectedState_UNSELECTED) { + if (media_speed(&p->po_media_info) + != link_speed(p->po_lag->lag_active_media)) { + if (g_bond->verbose) { + timestamp_printf("[%s] Port speed %d differs from LAG %d\n", + bondport_get_name(p), + media_speed(&p->po_media_info), + link_speed(p->po_lag->lag_active_media)); + } + bondport_set_selected(p, SelectedState_UNSELECTED); + } + } + bondport_receive_machine(p, LAEventMediaChange, NULL); + bondport_mux_machine(p, LAEventMediaChange, NULL); + bondport_periodic_transmit_machine(p, LAEventMediaChange, NULL); + + return; +} + +static int +bondport_aggregatable(bondport_ref p) +{ + partner_state_ref ps = &p->po_partner_state; + + if (lacp_actor_partner_state_aggregatable(p->po_actor_state) == 0 + || lacp_actor_partner_state_aggregatable(ps->ps_state) == 0) { + /* we and/or our partner are individual */ + return (0); + } + if (p->po_lag == NULL) { + return (0); + } + switch (p->po_receive_state) { + default: + if (g_bond->verbose) { + timestamp_printf("[%s] Port is not selectable\n", + bondport_get_name(p)); + } + return (0); + case ReceiveState_CURRENT: + case ReceiveState_EXPIRED: + break; + } + return (1); +} + +static int +bondport_matches_LAG(bondport_ref p, LAG_ref lag) +{ + LAG_info_ref lag_li; + partner_state_ref ps; + LAG_info_ref ps_li; + + ps = &p->po_partner_state; + ps_li = &ps->ps_lag_info; + lag_li = &lag->lag_info; + if (ps_li->li_system_priority == lag_li->li_system_priority + && ps_li->li_key == lag_li->li_key + && (bcmp(&ps_li->li_system, &lag_li->li_system, + sizeof(lag_li->li_system)) + == 0)) { + return (1); + } + return (0); +} + +static int +bondport_remove_from_LAG(bondport_ref p) +{ + int active_lag = 0; + ifbond_ref bond = p->po_bond; + LAG_ref lag = p->po_lag; + + if (lag == NULL) { + return (0); + } + TAILQ_REMOVE(&lag->lag_port_list, p, po_lag_port_list); + if (g_bond->verbose) { + timestamp_printf("[%s] Removed from LAG (0x%04x," EA_FORMAT + ",0x%04x)\n", + bondport_get_name(p), + 
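/*
 * (Aside: the lacp_actor_partner_state_* accessors used throughout this
 * file operate on the single LACP state octet. A sketch of plausible
 * implementations, assuming the IEEE 802.3ad bit assignments; the actual
 * xnu definitions live in net/lacp.h and may differ in form:
 *
 *	#define LACP_STATE_ACTIVITY		0x01
 *	#define LACP_STATE_TIMEOUT		0x02	// 1 = short timeout
 *	#define LACP_STATE_AGGREGATION		0x04	// 0 = individual
 *	#define LACP_STATE_SYNCHRONIZATION	0x08
 *	#define LACP_STATE_COLLECTING		0x10
 *	#define LACP_STATE_DISTRIBUTING	0x20
 *	#define LACP_STATE_DEFAULTED		0x40
 *	#define LACP_STATE_EXPIRED		0x80
 *
 *	static __inline__ u_char
 *	state_set_in_sync(u_char s)
 *	{
 *	    return (s | LACP_STATE_SYNCHRONIZATION);
 *	}
 *	static __inline__ u_char
 *	state_set_out_of_sync(u_char s)
 *	{
 *	    return (s & ~LACP_STATE_SYNCHRONIZATION);
 *	}
 *	static __inline__ int
 *	state_aggregatable(u_char s)
 *	{
 *	    return ((s & LACP_STATE_AGGREGATION) != 0);
 *	}
 *
 * updateNTTBits() later in this file confirms the first four bit names;
 * the values shown are the standard's wire encoding.)
 */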
lag->lag_info.li_system_priority, + EA_LIST(&lag->lag_info.li_system), + lag->lag_info.li_key); + } + p->po_lag = NULL; + lag->lag_port_count--; + if (lag->lag_port_count > 0) { + return (bond->ifb_active_lag == lag); + } + if (g_bond->verbose) { + timestamp_printf("Key 0x%04x: LAG Released (0x%04x," EA_FORMAT + ",0x%04x)\n", + bond->ifb_key, + lag->lag_info.li_system_priority, + EA_LIST(&lag->lag_info.li_system), + lag->lag_info.li_key); + } + TAILQ_REMOVE(&bond->ifb_lag_list, lag, lag_list); + if (bond->ifb_active_lag == lag) { + bond->ifb_active_lag = NULL; + active_lag = 1; + } + FREE(lag, M_BOND); + return (active_lag); +} + +static void +bondport_add_to_LAG(bondport_ref p, LAG_ref lag) +{ + TAILQ_INSERT_TAIL(&lag->lag_port_list, p, po_lag_port_list); + p->po_lag = lag; + lag->lag_port_count++; + if (g_bond->verbose) { + timestamp_printf("[%s] Added to LAG (0x%04x," EA_FORMAT ",0x%04x)\n", + bondport_get_name(p), + lag->lag_info.li_system_priority, + EA_LIST(&lag->lag_info.li_system), + lag->lag_info.li_key); + } + return; +} + +static void +bondport_assign_to_LAG(bondport_ref p) +{ + ifbond_ref bond = p->po_bond; + LAG_ref lag; + + if (lacp_actor_partner_state_defaulted(p->po_actor_state)) { + bondport_remove_from_LAG(p); + return; + } + lag = p->po_lag; + if (lag != NULL) { + if (bondport_matches_LAG(p, lag)) { + /* still OK */ + return; + } + bondport_remove_from_LAG(p); + } + lag = ifbond_get_LAG_matching_port(bond, p); + if (lag != NULL) { + bondport_add_to_LAG(p, lag); + return; + } + lag = (LAG_ref)_MALLOC(sizeof(*lag), M_BOND, M_WAITOK); + TAILQ_INIT(&lag->lag_port_list); + lag->lag_port_count = 0; + lag->lag_selected_port_count = 0; + lag->lag_info = p->po_partner_state.ps_lag_info; + TAILQ_INSERT_TAIL(&bond->ifb_lag_list, lag, lag_list); + if (g_bond->verbose) { + timestamp_printf("Key 0x%04x: LAG Created (0x%04x," EA_FORMAT + ",0x%04x)\n", + bond->ifb_key, + lag->lag_info.li_system_priority, + EA_LIST(&lag->lag_info.li_system), + lag->lag_info.li_key); + } + bondport_add_to_LAG(p, lag); + return; +} + +static void +bondport_receive_lacpdu(bondport_ref p, lacpdu_ref in_lacpdu_p) +{ + bondport_ref moved_port; + + moved_port + = ifbond_list_find_moved_port(p, (const lacp_actor_partner_tlv_ref) + &in_lacpdu_p->la_actor_tlv); + if (moved_port != NULL) { + bondport_receive_machine(moved_port, LAEventPortMoved, NULL); + } + bondport_receive_machine(p, LAEventPacket, in_lacpdu_p); + bondport_mux_machine(p, LAEventPacket, in_lacpdu_p); + bondport_periodic_transmit_machine(p, LAEventPacket, in_lacpdu_p); + return; +} + +static void +bondport_set_selected(bondport_ref p, SelectedState s) +{ + if (s != p->po_selected) { + ifbond_ref bond = p->po_bond; + LAG_ref lag = p->po_lag; + + bondport_flags_set_selected_changed(p); + if (lag != NULL && bond->ifb_active_lag == lag) { + if (p->po_selected == SelectedState_SELECTED) { + lag->lag_selected_port_count--; + } + else if (s == SelectedState_SELECTED) { + lag->lag_selected_port_count++; + } + if (g_bond->verbose) { + timestamp_printf("[%s] SetSelected: %s (was %s)\n", + bondport_get_name(p), + SelectedStateString(s), + SelectedStateString(p->po_selected)); + } + } + } + p->po_selected = s; + return; +} + +/** + ** Receive machine + **/ + +static void +bondport_UpdateDefaultSelected(bondport_ref p) +{ + bondport_set_selected(p, SelectedState_UNSELECTED); + return; +} + +static void +bondport_RecordDefault(bondport_ref p) +{ + bzero(&p->po_partner_state, sizeof(p->po_partner_state)); + p->po_actor_state + = 
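/*
 * (Aside: a LAG's identity is the partner's (system, system priority,
 * key) triple; bondport_matches_LAG() and ifbond_get_LAG_matching_port()
 * above are both comparisons of that tuple. Stated minimally, with
 * hypothetical types:
 *
 *	struct lag_id {
 *	    u_char	system[6];	// partner's MAC-style system ID
 *	    u_int16_t	system_priority;
 *	    u_int16_t	key;
 *	};
 *
 *	static int
 *	lag_id_equal(const struct lag_id * a, const struct lag_id * b)
 *	{
 *	    return (a->system_priority == b->system_priority
 *		&& a->key == b->key
 *		&& bcmp(a->system, b->system, sizeof(a->system)) == 0);
 *	}
 *
 * Ports whose partner reports the same tuple may legally aggregate, so
 * bondport_assign_to_LAG() either joins the matching LAG or creates a
 * fresh one keyed by that tuple.)
 */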
lacp_actor_partner_state_set_defaulted(p->po_actor_state); + bondport_assign_to_LAG(p); + return; +} + +static void +bondport_UpdateSelected(bondport_ref p, lacpdu_ref lacpdu_p) +{ + lacp_actor_partner_tlv_ref actor; + partner_state_ref ps; + LAG_info_ref ps_li; + + /* compare the PDU's Actor information to our Partner state */ + actor = (lacp_actor_partner_tlv_ref)lacpdu_p->la_actor_tlv; + ps = &p->po_partner_state; + ps_li = &ps->ps_lag_info; + if (lacp_actor_partner_tlv_get_port(actor) != ps->ps_port + || (lacp_actor_partner_tlv_get_port_priority(actor) + != ps->ps_port_priority) + || bcmp(actor->lap_system, &ps_li->li_system, sizeof(ps_li->li_system)) + || (lacp_actor_partner_tlv_get_system_priority(actor) + != ps_li->li_system_priority) + || (lacp_actor_partner_tlv_get_key(actor) != ps_li->li_key) + || (lacp_actor_partner_state_aggregatable(actor->lap_state) + != lacp_actor_partner_state_aggregatable(ps->ps_state))) { + bondport_set_selected(p, SelectedState_UNSELECTED); + if (g_bond->verbose) { + timestamp_printf("[%s] updateSelected UNSELECTED\n", + bondport_get_name(p)); + } + } + return; +} + +static void +bondport_RecordPDU(bondport_ref p, lacpdu_ref lacpdu_p) +{ + lacp_actor_partner_tlv_ref actor; + ifbond_ref bond = p->po_bond; + int lacp_maintain = 0; + partner_state_ref ps; + lacp_actor_partner_tlv_ref partner; + LAG_info_ref ps_li; + + /* copy the PDU's Actor information into our Partner state */ + actor = (lacp_actor_partner_tlv_ref)lacpdu_p->la_actor_tlv; + ps = &p->po_partner_state; + ps_li = &ps->ps_lag_info; + ps->ps_port = lacp_actor_partner_tlv_get_port(actor); + ps->ps_port_priority = lacp_actor_partner_tlv_get_port_priority(actor); + ps_li->li_system = *((lacp_system_ref)actor->lap_system); + ps_li->li_system_priority + = lacp_actor_partner_tlv_get_system_priority(actor); + ps_li->li_key = lacp_actor_partner_tlv_get_key(actor); + ps->ps_state = lacp_actor_partner_state_set_out_of_sync(actor->lap_state); + p->po_actor_state + = lacp_actor_partner_state_set_not_defaulted(p->po_actor_state); + + /* compare the PDU's Partner information to our own information */ + partner = (lacp_actor_partner_tlv_ref)lacpdu_p->la_partner_tlv; + + if (lacp_actor_partner_state_active_lacp(ps->ps_state) + || (lacp_actor_partner_state_active_lacp(p->po_actor_state) + && lacp_actor_partner_state_active_lacp(partner->lap_state))) { + if (g_bond->verbose) { + timestamp_printf("[%s] recordPDU: LACP will maintain\n", + bondport_get_name(p)); + } + lacp_maintain = 1; + } + if ((lacp_actor_partner_tlv_get_port(partner) + == bondport_get_index(p)) + && lacp_actor_partner_tlv_get_port_priority(partner) == p->po_priority + && bcmp(partner->lap_system, &g_bond->system, + sizeof(g_bond->system)) == 0 + && (lacp_actor_partner_tlv_get_system_priority(partner) + == g_bond->system_priority) + && lacp_actor_partner_tlv_get_key(partner) == bond->ifb_key + && (lacp_actor_partner_state_aggregatable(partner->lap_state) + == lacp_actor_partner_state_aggregatable(p->po_actor_state)) + && lacp_actor_partner_state_in_sync(actor->lap_state) + && lacp_maintain) { + ps->ps_state = lacp_actor_partner_state_set_in_sync(ps->ps_state); + if (g_bond->verbose) { + timestamp_printf("[%s] recordPDU: LACP partner in sync\n", + bondport_get_name(p)); + } + } + else if (lacp_actor_partner_state_aggregatable(actor->lap_state) == 0 + && lacp_actor_partner_state_in_sync(actor->lap_state) + && lacp_maintain) { + ps->ps_state = lacp_actor_partner_state_set_in_sync(ps->ps_state); + if (g_bond->verbose) { + timestamp_printf("[%s] 
recordPDU: LACP partner in sync (ind)\n", + bondport_get_name(p)); + } + } + bondport_assign_to_LAG(p); + return; +} + +static __inline__ lacp_actor_partner_state +updateNTTBits(lacp_actor_partner_state s) +{ + return (s & (LACP_ACTOR_PARTNER_STATE_LACP_ACTIVITY + | LACP_ACTOR_PARTNER_STATE_LACP_TIMEOUT + | LACP_ACTOR_PARTNER_STATE_AGGREGATION + | LACP_ACTOR_PARTNER_STATE_SYNCHRONIZATION)); +} + +static void +bondport_UpdateNTT(bondport_ref p, lacpdu_ref lacpdu_p) +{ + ifbond_ref bond = p->po_bond; + lacp_actor_partner_tlv_ref partner; + + /* compare the PDU's Actor information to our Partner state */ + partner = (lacp_actor_partner_tlv_ref)lacpdu_p->la_partner_tlv; + if ((lacp_actor_partner_tlv_get_port(partner) != bondport_get_index(p)) + || lacp_actor_partner_tlv_get_port_priority(partner) != p->po_priority + || bcmp(partner->lap_system, &g_bond->system, sizeof(g_bond->system)) + || (lacp_actor_partner_tlv_get_system_priority(partner) + != g_bond->system_priority) + || lacp_actor_partner_tlv_get_key(partner) != bond->ifb_key + || (updateNTTBits(partner->lap_state) + != updateNTTBits(p->po_actor_state))) { + bondport_flags_set_ntt(p); + if (g_bond->verbose) { + timestamp_printf("[%s] updateNTT: Need To Transmit\n", + bondport_get_name(p)); + } + } + return; +} + +static void +bondport_AttachMuxToAggregator(bondport_ref p) +{ + if (bondport_flags_mux_attached(p) == 0) { + if (g_bond->verbose) { + timestamp_printf("[%s] Attached Mux To Aggregator\n", + bondport_get_name(p)); + } + bondport_flags_set_mux_attached(p); + } + return; +} + +static void +bondport_DetachMuxFromAggregator(bondport_ref p) +{ + if (bondport_flags_mux_attached(p)) { + if (g_bond->verbose) { + timestamp_printf("[%s] Detached Mux From Aggregator\n", + bondport_get_name(p)); + } + bondport_flags_clear_mux_attached(p); + } + return; +} + +static void +bondport_enable_distributing(bondport_ref p) +{ + if (bondport_flags_distributing(p) == 0) { + ifbond_ref bond = p->po_bond; + + bond->ifb_distributing_array[bond->ifb_distributing_count++] = p; + if (g_bond->verbose) { + timestamp_printf("[%s] Distribution Enabled\n", + bondport_get_name(p)); + } + bondport_flags_set_distributing(p); + } + return; +} + +static void +bondport_disable_distributing(bondport_ref p) +{ + if (bondport_flags_distributing(p)) { + bondport_ref * array; + ifbond_ref bond; + int count; + int i; + + bond = p->po_bond; + array = bond->ifb_distributing_array; + count = bond->ifb_distributing_count; + for (i = 0; i < count; i++) { + if (array[i] == p) { + int j; + + for (j = i; j < (count - 1); j++) { + array[j] = array[j + 1]; + } + break; + } + } + bond->ifb_distributing_count--; + if (g_bond->verbose) { + timestamp_printf("[%s] Distribution Disabled\n", + bondport_get_name(p)); + } + bondport_flags_clear_distributing(p); + } + return; +} + +/** + ** Receive machine functions + **/ +static void +bondport_receive_machine_initialize(bondport_ref p, LAEvent event, + void * event_data); +static void +bondport_receive_machine_port_disabled(bondport_ref p, LAEvent event, + void * event_data); +static void +bondport_receive_machine_expired(bondport_ref p, LAEvent event, + void * event_data); +static void +bondport_receive_machine_lacp_disabled(bondport_ref p, LAEvent event, + void * event_data); +static void +bondport_receive_machine_defaulted(bondport_ref p, LAEvent event, + void * event_data); +static void +bondport_receive_machine_current(bondport_ref p, LAEvent event, + void * event_data); + +static void +bondport_receive_machine_event(bondport_ref p, 
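/*
 * (Aside: bondport_disable_distributing() above removes a port from the
 * distributing array by shifting the tail down one slot, preserving
 * order; it can decrement the count unconditionally because the
 * distributing flag guarantees membership. A standalone version that
 * only decrements on a hit, names hypothetical:
 *
 *	static void
 *	array_remove(struct port ** array, int * count, struct port * p)
 *	{
 *	    int		i, j;
 *
 *	    for (i = 0; i < *count; i++) {
 *		if (array[i] != p)
 *		    continue;
 *		for (j = i; j < (*count - 1); j++)
 *		    array[j] = array[j + 1];
 *		(*count)--;
 *		return;
 *	    }
 *	}
 *
 * Keeping the array dense and ordered is what lets the transmit path
 * pick a distributing port with a simple hash-modulo-count index.)
 */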
LAEvent event, + void * event_data) +{ + switch (p->po_receive_state) { + case ReceiveState_none: + bondport_receive_machine_initialize(p, LAEventStart, NULL); + break; + case ReceiveState_INITIALIZE: + bondport_receive_machine_initialize(p, event, event_data); + break; + case ReceiveState_PORT_DISABLED: + bondport_receive_machine_port_disabled(p, event, event_data); + break; + case ReceiveState_EXPIRED: + bondport_receive_machine_expired(p, event, event_data); + break; + case ReceiveState_LACP_DISABLED: + bondport_receive_machine_lacp_disabled(p, event, event_data); + break; + case ReceiveState_DEFAULTED: + bondport_receive_machine_defaulted(p, event, event_data); + break; + case ReceiveState_CURRENT: + bondport_receive_machine_current(p, event, event_data); + break; + default: + break; + } + return; +} + +static void +bondport_receive_machine(bondport_ref p, LAEvent event, + void * event_data) +{ + switch (event) { + case LAEventPacket: + if (p->po_receive_state != ReceiveState_LACP_DISABLED) { + bondport_receive_machine_current(p, event, event_data); + } + break; + case LAEventMediaChange: + if (media_active(&p->po_media_info)) { + switch (p->po_receive_state) { + case ReceiveState_PORT_DISABLED: + case ReceiveState_LACP_DISABLED: + bondport_receive_machine_port_disabled(p, LAEventMediaChange, NULL); + break; + default: + break; + } + } + else { + bondport_receive_machine_port_disabled(p, LAEventStart, NULL); + } + break; + default: + bondport_receive_machine_event(p, event, event_data); + break; + } + return; +} + +static void +bondport_receive_machine_initialize(bondport_ref p, LAEvent event, + __unused void * event_data) +{ + switch (event) { + case LAEventStart: + devtimer_cancel(p->po_current_while_timer); + if (g_bond->verbose) { + timestamp_printf("[%s] Receive INITIALIZE\n", + bondport_get_name(p)); + } + p->po_receive_state = ReceiveState_INITIALIZE; + bondport_set_selected(p, SelectedState_UNSELECTED); + bondport_RecordDefault(p); + p->po_actor_state + = lacp_actor_partner_state_set_not_expired(p->po_actor_state); + bondport_receive_machine_port_disabled(p, LAEventStart, NULL); + break; + default: + break; + } + return; +} + +static void +bondport_receive_machine_port_disabled(bondport_ref p, LAEvent event, + __unused void * event_data) +{ + partner_state_ref ps; + + switch (event) { + case LAEventStart: + devtimer_cancel(p->po_current_while_timer); + if (g_bond->verbose) { + timestamp_printf("[%s] Receive PORT_DISABLED\n", + bondport_get_name(p)); + } + p->po_receive_state = ReceiveState_PORT_DISABLED; + ps = &p->po_partner_state; + ps->ps_state = lacp_actor_partner_state_set_out_of_sync(ps->ps_state); + /* FALL THROUGH */ + case LAEventMediaChange: + if (media_active(&p->po_media_info)) { + if (media_full_duplex(&p->po_media_info)) { + bondport_receive_machine_expired(p, LAEventStart, NULL); + } + else { + bondport_receive_machine_lacp_disabled(p, LAEventStart, NULL); + } + } + else if (p->po_selected == SelectedState_SELECTED) { + struct timeval tv; + + if (g_bond->verbose) { + timestamp_printf("[%s] Receive PORT_DISABLED: " + "link timer started\n", + bondport_get_name(p)); + } + tv.tv_sec = 1; + tv.tv_usec = 0; + devtimer_set_relative(p->po_current_while_timer, tv, + (devtimer_timeout_func) + bondport_receive_machine_port_disabled, + (void *)LAEventTimeout, NULL); + } + else if (p->po_selected == SelectedState_STANDBY) { + bondport_set_selected(p, SelectedState_UNSELECTED); + } + break; + case LAEventTimeout: + if (p->po_selected == SelectedState_SELECTED) { + if 
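/*
 * (Aside: bondport_receive_machine_event() above dispatches on
 * po_receive_state with a switch; an equivalent table-driven form is
 * sketched below. It assumes the ReceiveState_* values are small and
 * dense enough to index an array, which the switch does not require:
 *
 *	typedef void (*rx_handler_t)(bondport_ref, LAEvent, void *);
 *
 *	static const rx_handler_t rx_dispatch[] = {
 *	    [ReceiveState_INITIALIZE]	 = bondport_receive_machine_initialize,
 *	    [ReceiveState_PORT_DISABLED] = bondport_receive_machine_port_disabled,
 *	    [ReceiveState_EXPIRED]	 = bondport_receive_machine_expired,
 *	    [ReceiveState_LACP_DISABLED] = bondport_receive_machine_lacp_disabled,
 *	    [ReceiveState_DEFAULTED]	 = bondport_receive_machine_defaulted,
 *	    [ReceiveState_CURRENT]	 = bondport_receive_machine_current,
 *	};
 *
 *	// call as, after a bounds/NULL check and treating
 *	// ReceiveState_none as an implicit start, as the switch does:
 *	// rx_dispatch[p->po_receive_state](p, event, event_data);
 */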
(g_bond->verbose) { + timestamp_printf("[%s] Receive PORT_DISABLED: " + "link timer completed, marking UNSELECTED\n", + bondport_get_name(p)); + } + bondport_set_selected(p, SelectedState_UNSELECTED); + } + break; + case LAEventPortMoved: + bondport_receive_machine_initialize(p, LAEventStart, NULL); + break; + default: + break; + } + return; +} + +static void +bondport_receive_machine_expired(bondport_ref p, LAEvent event, + __unused void * event_data) +{ + lacp_actor_partner_state s; + struct timeval tv; + + switch (event) { + case LAEventStart: + devtimer_cancel(p->po_current_while_timer); + if (g_bond->verbose) { + timestamp_printf("[%s] Receive EXPIRED\n", + bondport_get_name(p)); + } + p->po_receive_state = ReceiveState_EXPIRED; + s = p->po_partner_state.ps_state; + s = lacp_actor_partner_state_set_out_of_sync(s); + s = lacp_actor_partner_state_set_short_timeout(s); + p->po_partner_state.ps_state = s; + p->po_actor_state + = lacp_actor_partner_state_set_expired(p->po_actor_state); + /* start current_while timer */ + tv.tv_sec = LACP_SHORT_TIMEOUT_TIME; + tv.tv_usec = 0; + devtimer_set_relative(p->po_current_while_timer, tv, + (devtimer_timeout_func) + bondport_receive_machine_expired, + (void *)LAEventTimeout, NULL); + + break; + case LAEventTimeout: + bondport_receive_machine_defaulted(p, LAEventStart, NULL); + break; + default: + break; + } + return; +} + +static void +bondport_receive_machine_lacp_disabled(bondport_ref p, LAEvent event, + __unused void * event_data) +{ + partner_state_ref ps; + switch (event) { + case LAEventStart: + devtimer_cancel(p->po_current_while_timer); + if (g_bond->verbose) { + timestamp_printf("[%s] Receive LACP_DISABLED\n", + bondport_get_name(p)); + } + p->po_receive_state = ReceiveState_LACP_DISABLED; + bondport_set_selected(p, SelectedState_UNSELECTED); + bondport_RecordDefault(p); + ps = &p->po_partner_state; + ps->ps_state = lacp_actor_partner_state_set_individual(ps->ps_state); + p->po_actor_state + = lacp_actor_partner_state_set_not_expired(p->po_actor_state); + break; + default: + break; + } + return; +} + +static void +bondport_receive_machine_defaulted(bondport_ref p, LAEvent event, + __unused void * event_data) +{ + switch (event) { + case LAEventStart: + devtimer_cancel(p->po_current_while_timer); + if (g_bond->verbose) { + timestamp_printf("[%s] Receive DEFAULTED\n", + bondport_get_name(p)); + } + p->po_receive_state = ReceiveState_DEFAULTED; + bondport_UpdateDefaultSelected(p); + bondport_RecordDefault(p); + p->po_actor_state + = lacp_actor_partner_state_set_not_expired(p->po_actor_state); + break; + default: + break; + } + return; +} + +static void +bondport_receive_machine_current(bondport_ref p, LAEvent event, + void * event_data) +{ + partner_state_ref ps; + struct timeval tv; + + switch (event) { + case LAEventPacket: + devtimer_cancel(p->po_current_while_timer); + if (g_bond->verbose) { + timestamp_printf("[%s] Receive CURRENT\n", + bondport_get_name(p)); + } + p->po_receive_state = ReceiveState_CURRENT; + bondport_UpdateSelected(p, event_data); + bondport_UpdateNTT(p, event_data); + bondport_RecordPDU(p, event_data); + p->po_actor_state + = lacp_actor_partner_state_set_not_expired(p->po_actor_state); + bondport_assign_to_LAG(p); + /* start current_while timer */ + ps = &p->po_partner_state; + if (lacp_actor_partner_state_short_timeout(ps->ps_state)) { + tv.tv_sec = LACP_SHORT_TIMEOUT_TIME; + } + else { + tv.tv_sec = LACP_LONG_TIMEOUT_TIME; + } + tv.tv_usec = 0; + devtimer_set_relative(p->po_current_while_timer, tv, + 
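/*
 * (Aside: the current_while deadline chosen just above follows 802.3ad:
 * partner information expires after three times the periodic interval
 * the partner asked us to use, i.e. 3 x 1s = 3s when the partner
 * requested the short timeout and 3 x 30s = 90s for the long one.
 * Assuming xnu's constants carry the standard's values:
 *
 *	struct timeval	tv = { 0, 0 };
 *
 *	tv.tv_sec = lacp_actor_partner_state_short_timeout(ps->ps_state)
 *	    ? 3		// LACP_SHORT_TIMEOUT_TIME, per the standard
 *	    : 90;	// LACP_LONG_TIMEOUT_TIME
 *
 * Expiry of this timer is what drives the EXPIRED -> DEFAULTED
 * transitions seen in the handlers above.)
 */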
(devtimer_timeout_func) + bondport_receive_machine_current, + (void *)LAEventTimeout, NULL); + break; + case LAEventTimeout: + bondport_receive_machine_expired(p, LAEventStart, NULL); + break; + default: + break; + } + return; +} + +/** + ** Periodic Transmission machine + **/ + +static void +bondport_periodic_transmit_machine(bondport_ref p, LAEvent event, + __unused void * event_data) +{ + int interval; + partner_state_ref ps; + struct timeval tv; + + switch (event) { + case LAEventStart: + if (g_bond->verbose) { + timestamp_printf("[%s] periodic_transmit Start\n", + bondport_get_name(p)); + } + /* FALL THROUGH */ + case LAEventMediaChange: + devtimer_cancel(p->po_periodic_timer); + p->po_periodic_interval = 0; + if (media_active(&p->po_media_info) == 0 + || media_full_duplex(&p->po_media_info) == 0) { + break; + } + case LAEventPacket: + /* Neither Partner nor Actor are LACP Active, no periodic tx */ + ps = &p->po_partner_state; + if (lacp_actor_partner_state_active_lacp(p->po_actor_state) == 0 + && (lacp_actor_partner_state_active_lacp(ps->ps_state) + == 0)) { + devtimer_cancel(p->po_periodic_timer); + p->po_periodic_interval = 0; + break; + } + if (lacp_actor_partner_state_short_timeout(ps->ps_state)) { + interval = LACP_FAST_PERIODIC_TIME; + } + else { + interval = LACP_SLOW_PERIODIC_TIME; + } + if (p->po_periodic_interval != interval) { + if (interval == LACP_FAST_PERIODIC_TIME + && p->po_periodic_interval == LACP_SLOW_PERIODIC_TIME) { + if (g_bond->verbose) { + timestamp_printf("[%s] periodic_transmit:" + " Need To Transmit\n", + bondport_get_name(p)); + } + bondport_flags_set_ntt(p); + } + p->po_periodic_interval = interval; + tv.tv_usec = 0; + tv.tv_sec = interval; + devtimer_set_relative(p->po_periodic_timer, tv, + (devtimer_timeout_func) + bondport_periodic_transmit_machine, + (void *)LAEventTimeout, NULL); + if (g_bond->verbose) { + timestamp_printf("[%s] Periodic Transmission Timer: %d secs\n", + bondport_get_name(p), + p->po_periodic_interval); + } + } + break; + case LAEventTimeout: + bondport_flags_set_ntt(p); + tv.tv_sec = p->po_periodic_interval; + tv.tv_usec = 0; + devtimer_set_relative(p->po_periodic_timer, tv, (devtimer_timeout_func) + bondport_periodic_transmit_machine, + (void *)LAEventTimeout, NULL); + if (g_bond->verbose > 1) { + timestamp_printf("[%s] Periodic Transmission Timer: %d secs\n", + bondport_get_name(p), p->po_periodic_interval); + } + break; + default: + break; + } + return; +} + +/** + ** Transmit machine + **/ +static int +bondport_can_transmit(bondport_ref p, int32_t current_secs, + long * next_secs) +{ + if (p->po_last_transmit_secs != current_secs) { + p->po_last_transmit_secs = current_secs; + p->po_n_transmit = 0; + } + if (p->po_n_transmit < LACP_PACKET_RATE) { + p->po_n_transmit++; + return (1); + } + if (next_secs != NULL) { + *next_secs = current_secs + 1; + } + return (0); +} + +static void +bondport_transmit_machine(bondport_ref p, LAEvent event, + void * event_data) +{ + lacp_actor_partner_tlv_ref aptlv; + lacp_collector_tlv_ref ctlv; + struct timeval next_tick_time = {0, 0}; + lacpdu_ref out_lacpdu_p; + packet_buffer_ref pkt; + partner_state_ref ps; + LAG_info_ref ps_li; + + switch (event) { + case LAEventTimeout: + case LAEventStart: + if (p->po_periodic_interval == 0 || bondport_flags_ntt(p) == 0) { + break; + } + if (event_data != NULL) { + /* we're going away, transmit the packet no matter what */ + } + else if (bondport_can_transmit(p, devtimer_current_secs(), + &next_tick_time.tv_sec) == 0) { + if 
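/*
 * (Aside: bondport_can_transmit() above implements the standard's
 * per-second LACPDU rate limit: a counter that resets whenever the
 * wall-clock second changes, plus the next second boundary as the retry
 * deadline when throttled. The same logic as a self-contained sketch,
 * names hypothetical:
 *
 *	struct tx_limiter {
 *	    long	stamp;	// second the counter belongs to
 *	    int		used;	// transmits consumed in that second
 *	};
 *
 *	static int
 *	limiter_allow(struct tx_limiter * l, long now_secs, int rate,
 *	    long * retry_secs)
 *	{
 *	    if (l->stamp != now_secs) {
 *		l->stamp = now_secs;
 *		l->used = 0;
 *	    }
 *	    if (l->used < rate) {
 *		l->used++;
 *		return (1);
 *	    }
 *	    if (retry_secs != NULL)
 *		*retry_secs = now_secs + 1;
 *	    return (0);
 *	}
 *
 * The transmit machine uses the returned deadline to arm a one-shot
 * timer rather than spinning, so a throttled NTT is sent at the start
 * of the next second.)
 */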
(devtimer_enabled(p->po_transmit_timer)) { + if (g_bond->verbose > 0) { + timestamp_printf("[%s] Transmit Timer Already Set\n", + bondport_get_name(p)); + } + } + else { + devtimer_set_absolute(p->po_transmit_timer, next_tick_time, + (devtimer_timeout_func) + bondport_transmit_machine, + (void *)LAEventTimeout, NULL); + if (g_bond->verbose > 0) { + timestamp_printf("[%s] Transmit Timer Deadline %d secs\n", + bondport_get_name(p), + next_tick_time.tv_sec); + } + } + break; + } + if (g_bond->verbose > 0) { + if (event == LAEventTimeout) { + timestamp_printf("[%s] Transmit Timer Complete\n", + bondport_get_name(p)); + } + } + pkt = packet_buffer_allocate(sizeof(*out_lacpdu_p)); + if (pkt == NULL) { + printf("[%s] Transmit: failed to allocate packet buffer\n", + bondport_get_name(p)); + break; + } + out_lacpdu_p = (lacpdu_ref)packet_buffer_byteptr(pkt); + bzero(out_lacpdu_p, sizeof(*out_lacpdu_p)); + out_lacpdu_p->la_subtype = IEEE8023AD_SLOW_PROTO_SUBTYPE_LACP; + out_lacpdu_p->la_version = LACPDU_VERSION_1; + + /* Actor */ + aptlv = (lacp_actor_partner_tlv_ref)out_lacpdu_p->la_actor_tlv; + aptlv->lap_tlv_type = LACPDU_TLV_TYPE_ACTOR; + aptlv->lap_length = LACPDU_ACTOR_TLV_LENGTH; + *((lacp_system_ref)aptlv->lap_system) = g_bond->system; + lacp_actor_partner_tlv_set_system_priority(aptlv, + g_bond->system_priority); + lacp_actor_partner_tlv_set_port_priority(aptlv, p->po_priority); + lacp_actor_partner_tlv_set_port(aptlv, bondport_get_index(p)); + lacp_actor_partner_tlv_set_key(aptlv, p->po_bond->ifb_key); + aptlv->lap_state = p->po_actor_state; + + /* Partner */ + aptlv = (lacp_actor_partner_tlv_ref)out_lacpdu_p->la_partner_tlv; + aptlv->lap_tlv_type = LACPDU_TLV_TYPE_PARTNER; + aptlv->lap_length = LACPDU_PARTNER_TLV_LENGTH; + ps = &p->po_partner_state; + ps_li = &ps->ps_lag_info; + lacp_actor_partner_tlv_set_port(aptlv, ps->ps_port); + lacp_actor_partner_tlv_set_port_priority(aptlv, ps->ps_port_priority); + *((lacp_system_ref)aptlv->lap_system) = ps_li->li_system; + lacp_actor_partner_tlv_set_system_priority(aptlv, + ps_li->li_system_priority); + lacp_actor_partner_tlv_set_key(aptlv, ps_li->li_key); + aptlv->lap_state = ps->ps_state; + + /* Collector */ + ctlv = (lacp_collector_tlv_ref)out_lacpdu_p->la_collector_tlv; + ctlv->lac_tlv_type = LACPDU_TLV_TYPE_COLLECTOR; + ctlv->lac_length = LACPDU_COLLECTOR_TLV_LENGTH; + + bondport_slow_proto_transmit(p, pkt); + bondport_flags_clear_ntt(p); + if (g_bond->verbose > 0) { + timestamp_printf("[%s] Transmit Packet %d\n", + bondport_get_name(p), p->po_n_transmit); + } + break; + default: + break; + } + return; +} + +/** + ** Mux machine functions + **/ + +static void +bondport_mux_machine_detached(bondport_ref p, LAEvent event, + void * event_data); +static void +bondport_mux_machine_waiting(bondport_ref p, LAEvent event, + void * event_data); +static void +bondport_mux_machine_attached(bondport_ref p, LAEvent event, + void * event_data); + +static void +bondport_mux_machine_collecting_distributing(bondport_ref p, LAEvent event, + void * event_data); + +static void +bondport_mux_machine(bondport_ref p, LAEvent event, void * event_data) +{ + switch (p->po_mux_state) { + case MuxState_none: + bondport_mux_machine_detached(p, LAEventStart, NULL); + break; + case MuxState_DETACHED: + bondport_mux_machine_detached(p, event, event_data); + break; + case MuxState_WAITING: + bondport_mux_machine_waiting(p, event, event_data); + break; + case MuxState_ATTACHED: + bondport_mux_machine_attached(p, event, event_data); + break; + case 
MuxState_COLLECTING_DISTRIBUTING: + bondport_mux_machine_collecting_distributing(p, event, event_data); + break; + default: + break; + } + return; +} + +static void +bondport_mux_machine_detached(bondport_ref p, LAEvent event, + __unused void * event_data) +{ + lacp_actor_partner_state s; + + switch (event) { + case LAEventStart: + devtimer_cancel(p->po_wait_while_timer); + if (g_bond->verbose) { + timestamp_printf("[%s] Mux DETACHED\n", + bondport_get_name(p)); + } + p->po_mux_state = MuxState_DETACHED; + bondport_flags_clear_ready(p); + bondport_DetachMuxFromAggregator(p); + bondport_disable_distributing(p); + s = p->po_actor_state; + s = lacp_actor_partner_state_set_out_of_sync(s); + s = lacp_actor_partner_state_set_not_collecting(s); + s = lacp_actor_partner_state_set_not_distributing(s); + p->po_actor_state = s; + bondport_flags_set_ntt(p); + break; + case LAEventSelectedChange: + case LAEventPacket: + case LAEventMediaChange: + if (p->po_selected == SelectedState_SELECTED + || p->po_selected == SelectedState_STANDBY) { + bondport_mux_machine_waiting(p, LAEventStart, NULL); + } + break; + default: + break; + } + return; +} + +static void +bondport_mux_machine_waiting(bondport_ref p, LAEvent event, + __unused void * event_data) +{ + struct timeval tv; + + switch (event) { + case LAEventStart: + devtimer_cancel(p->po_wait_while_timer); + if (g_bond->verbose) { + timestamp_printf("[%s] Mux WAITING\n", + bondport_get_name(p)); + } + p->po_mux_state = MuxState_WAITING; + /* FALL THROUGH */ + default: + case LAEventSelectedChange: + if (p->po_selected == SelectedState_UNSELECTED) { + bondport_mux_machine_detached(p, LAEventStart, NULL); + break; + } + if (p->po_selected == SelectedState_STANDBY) { + devtimer_cancel(p->po_wait_while_timer); + /* wait until state changes to SELECTED */ + if (g_bond->verbose) { + timestamp_printf("[%s] Mux WAITING: Standby\n", + bondport_get_name(p)); + } + break; + } + if (bondport_flags_ready(p)) { + if (g_bond->verbose) { + timestamp_printf("[%s] Mux WAITING: Port is already ready\n", + bondport_get_name(p)); + } + break; + } + if (devtimer_enabled(p->po_wait_while_timer)) { + if (g_bond->verbose) { + timestamp_printf("[%s] Mux WAITING: Timer already set\n", + bondport_get_name(p)); + } + break; + } + if (ifbond_all_ports_attached(p->po_bond, p)) { + devtimer_cancel(p->po_wait_while_timer); + if (g_bond->verbose) { + timestamp_printf("[%s] Mux WAITING: No waiting\n", + bondport_get_name(p)); + } + bondport_flags_set_ready(p); + goto no_waiting; + } + if (g_bond->verbose) { + timestamp_printf("[%s] Mux WAITING: 2 seconds\n", + bondport_get_name(p)); + } + tv.tv_sec = LACP_AGGREGATE_WAIT_TIME; + tv.tv_usec = 0; + devtimer_set_relative(p->po_wait_while_timer, tv, + (devtimer_timeout_func) + bondport_mux_machine_waiting, + (void *)LAEventTimeout, NULL); + break; + case LAEventTimeout: + if (g_bond->verbose) { + timestamp_printf("[%s] Mux WAITING: Ready\n", + bondport_get_name(p)); + } + bondport_flags_set_ready(p); + break; + case LAEventReady: + no_waiting: + if (bondport_flags_ready(p)){ + if (g_bond->verbose) { + timestamp_printf("[%s] Mux WAITING: All Ports Ready\n", + bondport_get_name(p)); + } + bondport_mux_machine_attached(p, LAEventStart, NULL); + break; + } + break; + } + return; +} + +static void +bondport_mux_machine_attached(bondport_ref p, LAEvent event, + __unused void * event_data) +{ + lacp_actor_partner_state s; + + switch (event) { + case LAEventStart: + devtimer_cancel(p->po_wait_while_timer); + if (g_bond->verbose) { + 
timestamp_printf("[%s] Mux ATTACHED\n", + bondport_get_name(p)); + } + p->po_mux_state = MuxState_ATTACHED; + bondport_AttachMuxToAggregator(p); + s = p->po_actor_state; + s = lacp_actor_partner_state_set_in_sync(s); + s = lacp_actor_partner_state_set_not_collecting(s); + s = lacp_actor_partner_state_set_not_distributing(s); + bondport_disable_distributing(p); + p->po_actor_state = s; + bondport_flags_set_ntt(p); + /* FALL THROUGH */ + default: + switch (p->po_selected) { + case SelectedState_SELECTED: + s = p->po_partner_state.ps_state; + if (lacp_actor_partner_state_in_sync(s)) { + bondport_mux_machine_collecting_distributing(p, LAEventStart, + NULL); + } + break; + default: + bondport_mux_machine_detached(p, LAEventStart, NULL); + break; + } + break; + } + return; +} + +static void +bondport_mux_machine_collecting_distributing(bondport_ref p, + LAEvent event, + __unused void * event_data) +{ + lacp_actor_partner_state s; + + switch (event) { + case LAEventStart: + devtimer_cancel(p->po_wait_while_timer); + if (g_bond->verbose) { + timestamp_printf("[%s] Mux COLLECTING_DISTRIBUTING\n", + bondport_get_name(p)); + } + p->po_mux_state = MuxState_COLLECTING_DISTRIBUTING; + bondport_enable_distributing(p); + s = p->po_actor_state; + s = lacp_actor_partner_state_set_collecting(s); + s = lacp_actor_partner_state_set_distributing(s); + p->po_actor_state = s; + bondport_flags_set_ntt(p); + /* FALL THROUGH */ + default: + s = p->po_partner_state.ps_state; + if (lacp_actor_partner_state_in_sync(s) == 0) { + bondport_mux_machine_attached(p, LAEventStart, NULL); + break; + } + switch (p->po_selected) { + case SelectedState_UNSELECTED: + case SelectedState_STANDBY: + bondport_mux_machine_attached(p, LAEventStart, NULL); + break; + default: + break; + } + break; + } + return; +} diff --git a/bsd/net/if_bond_var.h b/bsd/net/if_bond_var.h new file mode 100644 index 000000000..f07728fbf --- /dev/null +++ b/bsd/net/if_bond_var.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. 
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+#ifndef _NET_IF_BOND_VAR_H_
+#define _NET_IF_BOND_VAR_H_
+
+#include <sys/types.h>
+
+#include <net/lacp.h>
+
+#define IF_BOND_OP_ADD_INTERFACE 1
+#define IF_BOND_OP_REMOVE_INTERFACE 2
+#define IF_BOND_OP_GET_STATUS 3
+#define IF_BOND_OP_SET_VERBOSE 4
+
+struct if_bond_partner_state {
+ lacp_system ibps_system;
+ lacp_system_priority ibps_system_priority;
+ lacp_key ibps_key;
+ lacp_port ibps_port;
+ lacp_port_priority ibps_port_priority;
+ lacp_actor_partner_state ibps_state;
+ u_char ibps_reserved1;
+};
+
+#define IF_BOND_STATUS_SELECTED_STATE_UNSELECTED 0
+#define IF_BOND_STATUS_SELECTED_STATE_SELECTED 1
+#define IF_BOND_STATUS_SELECTED_STATE_STANDBY 2
+
+struct if_bond_status {
+ char ibs_if_name[IFNAMSIZ]; /* interface name */
+ lacp_port_priority ibs_port_priority;
+ lacp_actor_partner_state ibs_state;
+ u_char ibs_selected_state;
+ struct if_bond_partner_state ibs_partner_state;
+ u_int32_t ibs_reserved[8];
+};
+
+#define IF_BOND_STATUS_REQ_VERSION 1
+
+struct if_bond_status_req {
+ int ibsr_version; /* version */
+ int ibsr_total; /* returned number of struct if_bond_status's */
+ int ibsr_count; /* number that will fit in ibsr_buffer */
+ union { /* buffer to hold if_bond_status's */
+ char * ibsru_buffer32;
+ u_int64_t ibsru_buffer64;
+ } ibsr_ibsru;
+ lacp_key ibsr_key; /* returned */
+ u_int16_t ibsr_reserved0; /* for future use */
+ u_int32_t ibsr_reserved[3];/* for future use */
+};
+
+#if defined(__LP64__)
+#define ibsr_buffer ibsr_ibsru.ibsru_buffer64
+#else
+#define ibsr_buffer ibsr_ibsru.ibsru_buffer32
+#endif
+
+struct if_bond_req {
+ u_int32_t ibr_op; /* operation */
+ union {
+ char ibru_if_name[IFNAMSIZ]; /* interface name */
+ struct if_bond_status_req ibru_status; /* status information */
+ int ibru_int_val;
+ } ibr_ibru;
+};
+
+#ifdef KERNEL_PRIVATE
+int bond_family_init(void);
+#endif /* KERNEL_PRIVATE */
+
+#endif /* _NET_IF_BOND_VAR_H_ */
diff --git a/bsd/net/if_disc.c b/bsd/net/if_disc.c
index c8c266bc4..b5e751f41 100644
--- a/bsd/net/if_disc.c
+++ b/bsd/net/if_disc.c
@@ -78,7 +78,7 @@
 #define DSMTU 65532
 #endif
-static void discattach __P((void));
+static void discattach(void);
 static struct ifnet discif;
 static int discoutput(struct ifnet *, struct mbuf *, struct sockaddr *,
@@ -200,7 +200,7 @@ discioctl(ifp, cmd, data)
 switch (cmd) {
 case SIOCSIFADDR:
- ifp->if_flags |= IFF_UP;
+ ifnet_set_flags(ifp, IFF_UP, IFF_UP);
 ifa = (struct ifaddr *)data;
 if (ifa != 0)
 ifa->ifa_rtrequest = discrtrequest;
diff --git a/bsd/net/if_dl.h b/bsd/net/if_dl.h
index 16201a909..51e9262e0 100644
--- a/bsd/net/if_dl.h
+++ b/bsd/net/if_dl.h
@@ -98,14 +98,21 @@
 };
 #define LLADDR(s) ((caddr_t)((s)->sdl_data + (s)->sdl_nlen))
+#ifdef KERNEL_PRIVATE
+#define CONST_LLADDR(s) ((const u_char*)((s)->sdl_data + (s)->sdl_nlen))
+#endif
+
+#ifdef BSD_KERNEL_PRIVATE
+#define SDL(s) ((struct sockaddr_dl *)s)
+#endif
 #ifndef KERNEL
 #include <sys/cdefs.h>
 __BEGIN_DECLS
-void link_addr __P((const char *, struct sockaddr_dl *));
-char *link_ntoa __P((const struct sockaddr_dl *));
+void link_addr(const char *, struct sockaddr_dl *);
+char *link_ntoa(const struct sockaddr_dl *);
 __END_DECLS
 #endif /* !KERNEL */
diff --git a/bsd/net/if_dummy.c b/bsd/net/if_dummy.c
index 50ce06568..c9b330fa3 100644
--- a/bsd/net/if_dummy.c
+++ b/bsd/net/if_dummy.c
@@ -77,7 +77,6 @@
 #include <net/if.h>
 #include <net/if_types.h>
-#include <net/netisr.h>
 #include <net/route.h>
 #include <net/bpf.h>
@@ -109,12 +108,12 @@
 #include "bpfilter.h"
-static int dummyioctl
__P((struct ifnet *, u_long, caddr_t)); -int dummyoutput __P((struct ifnet *, register struct mbuf *, struct sockaddr *, - register struct rtentry *)); -static void dummyrtrequest __P((int, struct rtentry *, struct sockaddr *)); +static int dummyioctl(struct ifnet *, u_long, caddr_t); +int dummyoutput(struct ifnet *, register struct mbuf *, struct sockaddr *, + register struct rtentry *); +static void dummyrtrequest(int, struct rtentry *, struct sockaddr *); -static void dummyattach __P((void *)); +static void dummyattach(void *); PSEUDO_SET(dummyattach, if_dummy); #if TINY_DUMMYMTU @@ -171,8 +170,6 @@ dummyoutput(ifp, m, dst, rt) struct sockaddr *dst; register struct rtentry *rt; { - int s, isr; - register struct ifqueue *ifq = 0; if ((m->m_flags & M_PKTHDR) == 0) panic("dummyoutput no HDR"); @@ -216,62 +213,9 @@ dummyoutput(ifp, m, dst, rt) } ifp->if_opackets++; ifp->if_obytes += m->m_pkthdr.len; - switch (dst->sa_family) { - -#if INET - case AF_INET: - ifq = &ipintrq; - isr = NETISR_IP; - break; -#endif -#if IPX - case AF_IPX: - ifq = &ipxintrq; - isr = NETISR_IPX; - break; -#endif -#if INET6 - case AF_INET6: - ifq = &ip6intrq; - isr = NETISR_IPV6; - break; -#endif -#if NS - case AF_NS: - ifq = &nsintrq; - isr = NETISR_NS; - break; -#endif -#if ISO - case AF_ISO: - ifq = &clnlintrq; - isr = NETISR_ISO; - break; -#endif -#if NETATALK - case AF_APPLETALK: - ifq = &atintrq2; - isr = NETISR_ATALK; - break; -#endif NETATALK - default: - printf("%s: can't handle af%d\n", - if_name(ifp), dst->sa_family); - m_freem(m); - return (EAFNOSUPPORT); - } - s = splimp(); - if (IF_QFULL(ifq)) { - IF_DROP(ifq); - m_freem(m); - splx(s); - return (ENOBUFS); - } - IF_ENQUEUE(ifq, m); - schednetisr(isr); + proto_inject(dst->sa_family, m); ifp->if_ipackets++; ifp->if_ibytes += m->m_pkthdr.len; - splx(s); return (0); } @@ -311,7 +255,7 @@ dummyioctl(ifp, cmd, data) switch (cmd) { case SIOCSIFADDR: - ifp->if_flags |= IFF_UP | IFF_RUNNING; + ifnet_set_flags(ifp, IFF_UP | IFF_RUNNING, IFF_UP | IFF_RUNNING); ifa = (struct ifaddr *)data; ifa->ifa_rtrequest = dummyrtrequest; /* diff --git a/bsd/net/if_ether.h b/bsd/net/if_ether.h new file mode 100644 index 000000000..a8e1bce0d --- /dev/null +++ b/bsd/net/if_ether.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef _NET_IF_ETHER_H +#define _NET_IF_ETHER_H +#ifdef KERNEL + +#include <net/kpi_interface.h> + +__BEGIN_DECLS + +/* + * These functions may be used for an interface emulating an ethernet + * interface and not using IOKit. If you use IOKit and the IOKit + * Ethernet Family, these functions will be set for you. 
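The dummyoutput() hunk above collapses the per-family netisr queue selection (pick an ifq, splimp, IF_ENQUEUE, schednetisr) into a single proto_inject() call, a pattern this patch repeats in if_faith and if_gif. A minimal kernel-only sketch of the resulting shape; reflect_packet() is a hypothetical helper, not code from the patch:

    /*
     * Hand an outbound packet back to the stack's receive side in one
     * call; proto_inject() replaces IF_ENQUEUE + schednetisr.
     */
    static int
    reflect_packet(struct ifnet *ifp, struct mbuf *m, u_long family)
    {
            ifp->if_ipackets++;
            ifp->if_ibytes += m->m_pkthdr.len;
            return proto_inject(family, m);
    }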
Use these
+ * functions when filling out the ifnet_init_params structure.
+ */
+errno_t ether_demux(ifnet_t interface, mbuf_t packet, char* header,
+ protocol_family_t *protocol);
+errno_t ether_add_proto(ifnet_t interface, protocol_family_t protocol,
+ const struct ifnet_demux_desc *demux_list,
+ u_int32_t demux_count);
+errno_t ether_del_proto(ifnet_t interface, protocol_family_t protocol);
+errno_t ether_frameout(ifnet_t interface, mbuf_t *packet,
+ const struct sockaddr *dest, const char *dest_lladdr,
+ const char *frame_type);
+errno_t ether_ioctl(ifnet_t interface, u_int32_t command, void* data);
+errno_t ether_check_multi(ifnet_t ifp, const struct sockaddr *multicast);
+
+__END_DECLS
+
+#endif /* KERNEL */
+#endif /* _NET_IF_ETHER_H */
diff --git a/bsd/net/if_ethersubr.c b/bsd/net/if_ethersubr.c
index 0df1345ba..205ec4392 100644
--- a/bsd/net/if_ethersubr.c
+++ b/bsd/net/if_ethersubr.c
@@ -65,7 +65,6 @@
 #include <sys/sysctl.h>
 #include <net/if.h>
-#include <net/netisr.h>
 #include <net/route.h>
 #include <net/if_llc.h>
 #include <net/if_dl.h>
@@ -101,47 +100,16 @@
 extern struct ifqueue pkintrq;
 extern u_char etherbroadcastaddr[];
 #define senderr(e) do { error = (e); goto bad;} while (0)
-#define IFP2AC(IFP) ((struct arpcom *)IFP)
 /*
  * Perform common duties while attaching to interface list
  */
-
-/*
- IONetworkingFamily should call dlil_if_attach
- ether_ifattach becomes obsolete, but remains for
- temporary compatibility with third parties extensions
-*/
-void
-ether_ifattach(ifp)
- register struct ifnet *ifp;
-{
- boolean_t funnel_state;
-
- funnel_state = thread_funnel_set(network_flock, TRUE);
-
- ifp->if_name = "en";
- ifp->if_family = APPLE_IF_FAM_ETHERNET;
- ifp->if_type = IFT_ETHER;
- ifp->if_addrlen = 6;
- ifp->if_hdrlen = 14;
- ifp->if_mtu = ETHERMTU;
- if (ifp->if_baudrate == 0)
- ifp->if_baudrate = 10000000;
-
- dlil_if_attach(ifp);
- (void) thread_funnel_set(network_flock, funnel_state);
-}
-
-SYSCTL_DECL(_net_link);
-SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet");
-
 int
-ether_resolvemulti(ifp, llsa, sa)
- struct ifnet *ifp;
- struct sockaddr **llsa;
- struct sockaddr *sa;
+ether_resolvemulti(
+ struct ifnet *ifp,
+ struct sockaddr **llsa,
+ struct sockaddr *sa)
 {
 struct sockaddr_dl *sdl;
 struct sockaddr_in *sin;
@@ -232,185 +200,6 @@
 }
-
-
-
-u_char ether_ipmulticast_min[6] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 };
-u_char ether_ipmulticast_max[6] = { 0x01, 0x00, 0x5e, 0x7f, 0xff, 0xff };
-/*
- * Add an Ethernet multicast address or range of addresses to the list for a
- * given interface.
- */
-int
-ether_addmulti(ifr, ac)
- struct ifreq *ifr;
- register struct arpcom *ac;
-{
- register struct ether_multi *enm;
- struct sockaddr_in *sin;
- u_char addrlo[6];
- u_char addrhi[6];
- int s = splimp();
-
- switch (ifr->ifr_addr.sa_family) {
-
- case AF_UNSPEC:
- bcopy(ifr->ifr_addr.sa_data, addrlo, 6);
- bcopy(addrlo, addrhi, 6);
- break;
-
-#if INET
- case AF_INET:
- sin = (struct sockaddr_in *)&(ifr->ifr_addr);
- if (sin->sin_addr.s_addr == INADDR_ANY) {
- /*
- * An IP address of INADDR_ANY means listen to all
- * of the Ethernet multicast addresses used for IP.
- * (This is for the sake of IP multicast routers.)
- */
- bcopy(ether_ipmulticast_min, addrlo, 6);
- bcopy(ether_ipmulticast_max, addrhi, 6);
- }
- else {
- ETHER_MAP_IP_MULTICAST(&sin->sin_addr, addrlo);
- bcopy(addrlo, addrhi, 6);
- }
- break;
-#endif
-
- default:
- splx(s);
- return (EAFNOSUPPORT);
- }
-
- /*
- * Verify that we have valid Ethernet multicast addresses.
- */ - if ((addrlo[0] & 0x01) != 1 || (addrhi[0] & 0x01) != 1) { - splx(s); - return (EINVAL); - } - /* - * See if the address range is already in the list. - */ - ETHER_LOOKUP_MULTI(addrlo, addrhi, ac, enm); - if (enm != NULL) { - /* - * Found it; just increment the reference count. - */ - ++enm->enm_refcount; - splx(s); - return (0); - } - /* - * New address or range; malloc a new multicast record - * and link it into the interface's multicast list. - */ - enm = (struct ether_multi *)_MALLOC(sizeof(*enm), M_IFMADDR, M_WAITOK); - if (enm == NULL) { - splx(s); - return (ENOBUFS); - } - bcopy(addrlo, enm->enm_addrlo, 6); - bcopy(addrhi, enm->enm_addrhi, 6); - enm->enm_ac = ac; - enm->enm_refcount = 1; - enm->enm_next = ac->ac_multiaddrs; - ac->ac_multiaddrs = enm; - splx(s); - /* - * Return ENETRESET to inform the driver that the list has changed - * and its reception filter should be adjusted accordingly. - */ - return (ENETRESET); -} - -/* - * Delete a multicast address record. - */ -int -ether_delmulti(ifr, ac, ret_mca) - struct ifreq *ifr; - register struct arpcom *ac; - struct ether_addr * ret_mca; -{ - register struct ether_multi *enm; - register struct ether_multi **p; - struct sockaddr_in *sin; - u_char addrlo[6]; - u_char addrhi[6]; - int s = splimp(); - - switch (ifr->ifr_addr.sa_family) { - - case AF_UNSPEC: - bcopy(ifr->ifr_addr.sa_data, addrlo, 6); - bcopy(addrlo, addrhi, 6); - break; - -#if INET - case AF_INET: - sin = (struct sockaddr_in *)&(ifr->ifr_addr); - if (sin->sin_addr.s_addr == INADDR_ANY) { - /* - * An IP address of INADDR_ANY means stop listening - * to the range of Ethernet multicast addresses used - * for IP. - */ - bcopy(ether_ipmulticast_min, addrlo, 6); - bcopy(ether_ipmulticast_max, addrhi, 6); - } - else { - ETHER_MAP_IP_MULTICAST(&sin->sin_addr, addrlo); - bcopy(addrlo, addrhi, 6); - } - break; -#endif - - default: - splx(s); - return (EAFNOSUPPORT); - } - - /* - * Look up the address in our list. - */ - ETHER_LOOKUP_MULTI(addrlo, addrhi, ac, enm); - if (enm == NULL) { - splx(s); - return (ENXIO); - } - if (--enm->enm_refcount != 0) { - /* - * Still some claims to this record. - */ - splx(s); - return (0); - } - - /* save the low and high address of the range before deletion */ - if (ret_mca) { - *ret_mca = *((struct ether_addr *)addrlo); - *(ret_mca + 1) = *((struct ether_addr *)addrhi); - } - - /* - * No remaining claims to this record; unlink and free it. - */ - for (p = &enm->enm_ac->ac_multiaddrs; - *p != enm; - p = &(*p)->enm_next) - continue; - *p = (*p)->enm_next; - FREE(enm, M_IFMADDR); - splx(s); - /* - * Return ENETRESET to inform the driver that the list has changed - * and its reception filter should be adjusted accordingly. - */ - return (ENETRESET); -} - /* * Convert Ethernet address to printable (loggable) representation. 
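The ether_addmulti()/ether_delmulti() pair deleted above leaned on ETHER_MAP_IP_MULTICAST, which places an IPv4 group into the 01:00:5e:00:00:00 through 01:00:5e:7f:ff:ff window quoted in ether_ipmulticast_min/max. A self-contained userland illustration of that RFC 1112 mapping (not code from this patch):

    #include <stdint.h>
    #include <stdio.h>

    /* 01:00:5e prefix plus the low 23 bits of the group address. */
    static void
    ip_to_ether_multicast(uint32_t group, uint8_t mac[6])
    {
            mac[0] = 0x01;
            mac[1] = 0x00;
            mac[2] = 0x5e;
            mac[3] = (group >> 16) & 0x7f;  /* only 23 bits survive */
            mac[4] = (group >> 8) & 0xff;
            mac[5] = group & 0xff;
    }

    int
    main(void)
    {
            uint8_t mac[6];
            ip_to_ether_multicast(0xe0000001, mac); /* 224.0.0.1 */
            printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
                mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
            return 0;
    }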
 */
diff --git a/bsd/net/if_faith.c b/bsd/net/if_faith.c
index 1a33f89d7..5c4c8e487 100644
--- a/bsd/net/if_faith.c
+++ b/bsd/net/if_faith.c
@@ -80,7 +80,6 @@
 #include <net/if.h>
 #include <net/if_types.h>
-#include <net/netisr.h>
 #include <net/route.h>
 #include <net/bpf.h>
 #include <net/if_faith.h>
@@ -106,12 +105,12 @@
 #include <net/net_osdep.h>
-static int faithioctl __P((struct ifnet *, u_long, void*));
-int faith_pre_output __P((struct ifnet *, register struct mbuf **, struct sockaddr *,
- caddr_t, char *, char *, u_long));
-static void faithrtrequest __P((int, struct rtentry *, struct sockaddr *));
+static int faithioctl(struct ifnet *, u_long, void*);
+int faith_pre_output(struct ifnet *, register struct mbuf **,
+ const struct sockaddr *, caddr_t, char *, char *, u_long);
+static void faithrtrequest(int, struct rtentry *, struct sockaddr *);
-void faithattach __P((void));
+void faithattach(void);
 #ifndef __APPLE__
 PSEUDO_SET(faithattach, if_faith);
 #endif
@@ -196,21 +195,15 @@ int faith_attach_inet(struct ifnet *ifp, u_long *dl_tag)
 }
 }
+ bzero(&reg, sizeof(reg));
+ bzero(&desc, sizeof(desc));
 TAILQ_INIT(&reg.demux_desc_head);
 desc.type = DLIL_DESC_RAW;
- desc.variants.bitmask.proto_id_length = 0;
- desc.variants.bitmask.proto_id = 0;
- desc.variants.bitmask.proto_id_mask = 0;
 desc.native_type = (char *) &native;
 TAILQ_INSERT_TAIL(&reg.demux_desc_head, &desc, next);
 reg.interface_family = ifp->if_family;
 reg.unit_number = ifp->if_unit;
- reg.input = 0;
 reg.pre_output = faith_pre_output;
- reg.event = 0;
- reg.offer = 0;
- reg.ioctl = 0;
- reg.default_proto = 0;
 reg.protocol_family = PF_INET;
 stat = dlil_attach_protocol(&reg, dl_tag);
@@ -288,14 +281,12 @@
 int
 faith_pre_output(ifp, m0, dst, route_entry, frame_type, dst_addr, dl_tag)
 struct ifnet *ifp;
 register struct mbuf **m0;
- struct sockaddr *dst;
+ const struct sockaddr *dst;
 caddr_t route_entry;
 char *frame_type;
 char *dst_addr;
 u_long dl_tag;
 {
- int s, isr;
- register struct ifqueue *ifq = 0;
 register struct mbuf *m = *m0;
 struct rtentry *rt = (struct rtentry*)route_entry;
@@ -339,37 +330,10 @@
 }
 ifp->if_opackets++;
 ifp->if_obytes += m->m_pkthdr.len;
- switch (dst->sa_family) {
-#if INET
- case AF_INET:
- ifq = &ipintrq;
- isr = NETISR_IP;
- break;
-#endif
-#if INET6
- case AF_INET6:
- ifq = &ip6intrq;
- isr = NETISR_IPV6;
- break;
-#endif
- default:
- return EAFNOSUPPORT;
- }
-
- /* XXX do we need more sanity checks?
*/ - m->m_pkthdr.rcvif = ifp; - s = splimp(); - if (IF_QFULL(ifq)) { - IF_DROP(ifq); - splx(s); - return (ENOBUFS); - } - IF_ENQUEUE(ifq, m); - schednetisr(isr); + proto_inject(dst->sa_family, m); ifp->if_ipackets++; ifp->if_ibytes += m->m_pkthdr.len; - splx(s); return (EJUSTRETURN); } @@ -409,7 +373,7 @@ faithioctl(ifp, cmd, data) switch (cmd) { case SIOCSIFADDR: - ifp->if_flags |= IFF_UP | IFF_RUNNING; + ifnet_set_flags(ifp, IFF_UP | IFF_RUNNING, IFF_UP | IFF_RUNNING); ifa = (struct ifaddr *)data; ifa->ifa_rtrequest = faithrtrequest; /* diff --git a/bsd/net/if_faith.h b/bsd/net/if_faith.h index 3e97f5744..953c9b081 100644 --- a/bsd/net/if_faith.h +++ b/bsd/net/if_faith.h @@ -34,13 +34,11 @@ #define _NET_IF_FAITH_H_ #include <sys/appleapiopts.h> -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE #if INET6 struct in6_addr; -int faithprefix __P((struct in6_addr *)); +int faithprefix(struct in6_addr *); #endif /* INET6 */ -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ #endif /* _NET_IF_FAITH_H_ */ diff --git a/bsd/net/if_gif.c b/bsd/net/if_gif.c index f3e6be8c9..6db06ef62 100644 --- a/bsd/net/if_gif.c +++ b/bsd/net/if_gif.c @@ -66,7 +66,6 @@ #include <net/if.h> #include <net/if_types.h> -#include <net/netisr.h> #include <net/route.h> #include <net/bpf.h> @@ -104,9 +103,9 @@ static MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface"); TAILQ_HEAD(gifhead, gif_softc) gifs = TAILQ_HEAD_INITIALIZER(gifs); #ifdef __APPLE__ -void gifattach __P((void)); -int gif_pre_output __P((struct ifnet *, register struct mbuf **, struct sockaddr *, - caddr_t, char *, char *, u_long)); +void gifattach(void); +int gif_pre_output(struct ifnet *ifp, u_long protocol_family, struct mbuf **m0, + const struct sockaddr *dst, caddr_t rt, char *frame, char *address); static void gif_create_dev(void); static int gif_encapcheck(const struct mbuf*, int, int, void*); @@ -119,20 +118,22 @@ struct protosw in_gif_protosw = { SOCK_RAW, 0, 0/*IPPROTO_IPV[46]*/, PR_ATOMIC|PR_ADDR, in_gif_input, 0, 0, 0, 0, - 0, 0, 0, 0, + 0, 0, 0, 0, 0, - &rip_usrreqs + &rip_usrreqs, + 0, rip_unlock, 0 }; #endif #if INET6 struct ip6protosw in6_gif_protosw = { SOCK_RAW, 0, 0/*IPPROTO_IPV[46]*/, PR_ATOMIC|PR_ADDR, - in6_gif_input, - 0, 0, 0, + in6_gif_input, 0, 0, 0, 0, 0, 0, 0, 0, - 0, - &rip6_usrreqs + 0, + &rip6_usrreqs, + 0, rip_unlock, 0, + }; #endif @@ -163,185 +164,88 @@ static int max_gif_nesting = MAX_GIF_NEST; */ /* GIF interface module support */ -int gif_demux(ifp, m, frame_header, proto) - struct ifnet *ifp; - struct mbuf *m; - char *frame_header; - struct if_proto **proto; +int gif_demux( + struct ifnet *ifp, + struct mbuf *m, + char *frame_header, + u_long *protocol_family) { struct gif_softc* gif = (struct gif_softc*)ifp->if_softc; /* Only one protocol may be attached to a gif interface. 
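gif_demux() above shows the reworked DLIL demux contract: the callback now reports a protocol family number through an out parameter instead of returning a struct if_proto pointer. A hedged sketch of the same contract for a hypothetical single-protocol driver (the softc layout and ENOENT choice are assumptions, not part of the patch):

    struct example_softc {
            struct ifnet sc_if;
            u_long sc_family;       /* 0 when nothing is attached */
    };

    static int
    example_demux(struct ifnet *ifp, struct mbuf *m, char *frame_header,
        u_long *protocol_family)
    {
            struct example_softc *sc = ifp->if_softc;

            if (sc->sc_family == 0)
                    return ENOENT;  /* no protocol attached yet */
            *protocol_family = sc->sc_family;
            return 0;
    }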
 */
- *proto = gif->gif_proto;
+ *protocol_family = gif->gif_proto;
 return 0;
 }
 static
-int gif_add_if(struct ifnet *ifp)
-{
- ifp->if_demux = gif_demux;
- ifp->if_framer = 0;
- return 0;
-}
-
-static
-int gif_del_if(struct ifnet *ifp)
-{
- return 0;
-}
-
-static
-int gif_add_proto(struct ddesc_head_str *desc_head, struct if_proto *proto, u_long dl_tag)
+int gif_add_proto(struct ifnet *ifp, u_long protocol_family, struct ddesc_head_str *desc_head)
 {
 /* Only one protocol may be attached at a time */
- struct gif_softc* gif = (struct gif_softc*)proto->ifp;
+ struct gif_softc* gif = (struct gif_softc*)ifp->if_softc;
- if (gif->gif_proto != NULL)
+ if (gif->gif_proto != 0)
 printf("gif_add_proto: request add_proto for gif%d\n", gif->gif_if.if_unit);
- gif->gif_proto = proto;
+ gif->gif_proto = protocol_family;
 return 0;
 }
 static
-int gif_del_proto(struct if_proto *proto, u_long dl_tag)
+int gif_del_proto(struct ifnet *ifp, u_long protocol_family)
 {
- if (((struct gif_softc*)proto->ifp)->gif_proto == proto)
- ((struct gif_softc*)proto->ifp)->gif_proto = NULL;
+ if (((struct gif_softc*)ifp)->gif_proto == protocol_family)
+ ((struct gif_softc*)ifp)->gif_proto = 0;
 else
 return ENOENT;
 return 0;
 }
-int gif_shutdown()
-{
- return 0;
-}
-
-void gif_reg_if_mods()
-{
- struct dlil_ifmod_reg_str gif_ifmod;
-
- bzero(&gif_ifmod, sizeof(gif_ifmod));
- gif_ifmod.add_if = gif_add_if;
- gif_ifmod.del_if = gif_del_if;
- gif_ifmod.add_proto = gif_add_proto;
- gif_ifmod.del_proto = gif_del_proto;
- gif_ifmod.ifmod_ioctl = 0;
- gif_ifmod.shutdown = gif_shutdown;
-
- if (dlil_reg_if_modules(APPLE_IF_FAM_GIF, &gif_ifmod))
- panic("Couldn't register gif modules\n");
-
-}
-
 /* Glue code to attach inet to a gif interface through DLIL */
-
-u_long gif_attach_proto_family(struct ifnet *ifp, int af)
+int
+gif_attach_proto_family(
+ struct ifnet *ifp,
+ u_long protocol_family)
 {
 struct dlil_proto_reg_str reg;
- struct dlil_demux_desc desc;
- u_long dl_tag=0;
- short native=0;
 int stat;
- /* Check if we're already attached */
- stat = dlil_find_dltag(ifp->if_family, ifp->if_unit, af, &dl_tag);
- if (stat == 0)
- return dl_tag;
-
+ bzero(&reg, sizeof(reg));
 TAILQ_INIT(&reg.demux_desc_head);
- desc.type = DLIL_DESC_RAW;
- desc.variants.bitmask.proto_id_length = 0;
- desc.variants.bitmask.proto_id = 0;
- desc.variants.bitmask.proto_id_mask = 0;
- desc.native_type = (char *) &native;
- TAILQ_INSERT_TAIL(&reg.demux_desc_head, &desc, next);
 reg.interface_family = ifp->if_family;
 reg.unit_number = ifp->if_unit;
 reg.input = gif_input;
 reg.pre_output = gif_pre_output;
- reg.event = 0;
- reg.offer = 0;
- reg.ioctl = 0;
- reg.default_proto = 0;
- reg.protocol_family = af;
-
- stat = dlil_attach_protocol(&reg, &dl_tag);
- if (stat) {
- panic("gif_attach_proto_family can't attach interface fam=%d\n", af);
- }
-
- return dl_tag;
-}
+ reg.protocol_family = protocol_family;
-u_long gif_detach_proto_family(struct ifnet *ifp, int af)
-{
- u_long ip_dl_tag = 0;
- int stat;
-
- stat = dlil_find_dltag(ifp->if_family, ifp->if_unit, af, &ip_dl_tag);
- if (stat == 0) {
- stat = dlil_detach_protocol(ip_dl_tag);
- if (stat) {
- printf("WARNING: gif_detach can't detach IP fam=%d from interface\n", af);
- }
+ stat = dlil_attach_protocol(&reg);
+ if (stat && stat != EEXIST) {
+ panic("gif_attach_proto_family can't attach interface fam=%d\n", protocol_family);
 }
- return (stat);
-}
-
-int gif_attach_inet(struct ifnet *ifp, u_long *dl_tag) {
- *dl_tag = gif_attach_proto_family(ifp, AF_INET);
- return 0;
-}
-
-int gif_detach_inet(struct ifnet *ifp, u_long dl_tag) {
- gif_detach_proto_family(ifp, AF_INET);
- return 0;
-}
-int gif_attach_inet6(struct ifnet *ifp, u_long *dl_tag) {
- *dl_tag = gif_attach_proto_family(ifp, AF_INET6);
- return 0;
+ return stat;
 }
-int gif_detach_inet6(struct ifnet *ifp, u_long dl_tag) {
- gif_detach_proto_family(ifp, AF_INET6);
- return 0;
-}
 #endif
 /* Function to setup the first gif interface */
 void
 gifattach(void)
 {
- struct dlil_protomod_reg_str gif_protoreg;
 int error;
 /* Init the list of interfaces */
 TAILQ_INIT(&gifs);
- gif_reg_if_mods(); /* DLIL modules */
- /* Register protocol registration functions */
-
- bzero(&gif_protoreg, sizeof(gif_protoreg));
- gif_protoreg.attach_proto = gif_attach_inet;
- gif_protoreg.detach_proto = gif_detach_inet;
-
- if ( error = dlil_reg_proto_module(AF_INET, APPLE_IF_FAM_GIF, &gif_protoreg) != 0)
+ if ((error = dlil_reg_proto_module(AF_INET, APPLE_IF_FAM_GIF, gif_attach_proto_family, NULL)) != 0)
 printf("dlil_reg_proto_module failed for AF_INET error=%d\n", error);
-
- gif_protoreg.attach_proto = gif_attach_inet6;
- gif_protoreg.detach_proto = gif_detach_inet6;
- if ( error = dlil_reg_proto_module(AF_INET6, APPLE_IF_FAM_GIF, &gif_protoreg) != 0)
+ if ((error = dlil_reg_proto_module(AF_INET6, APPLE_IF_FAM_GIF, gif_attach_proto_family, NULL)) != 0)
 printf("dlil_reg_proto_module failed for AF_INET6 error=%d\n", error);
-
 /* Create first device */
 gif_create_dev();
 }
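The two added registrations in gifattach() are written with the assignment parenthesized, ((error = ...) != 0), the same form if_loop.c uses later in this patch. The parentheses matter because != binds tighter than =, so the unparenthesized form stores the boolean comparison result rather than the errno. A small standalone demonstration:

    #include <stdio.h>

    static int
    fails(void)
    {
            return 7;       /* stand-in for an errno-style failure */
    }

    int
    main(void)
    {
            int error;

            /* Parenthesized: error receives the return value, 7. */
            if ((error = fails()) != 0)
                    printf("parenthesized: error=%d\n", error);

            /* What the unparenthesized form means: error receives 1. */
            if ((error = (fails() != 0)))
                    printf("without parentheses: error=%d\n", error);
            return 0;
    }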
@@ -399,6 +303,7 @@
 }
 #endif
+ sc->gif_called = 0;
 sc->gif_if.if_family= APPLE_IF_FAM_GIF;
 sc->gif_if.if_mtu = GIF_MTU;
 sc->gif_if.if_flags = IFF_POINTOPOINT | IFF_MULTICAST;
@@ -406,9 +311,12 @@
 /* turn off ingress filter */
 sc->gif_if.if_flags |= IFF_LINK2;
 #endif
+ sc->gif_if.if_demux = gif_demux;
 sc->gif_if.if_ioctl = gif_ioctl;
 sc->gif_if.if_output = NULL; /* pre_output returns error or EJUSTRETURN */
 sc->gif_if.if_type = IFT_GIF;
+ sc->gif_if.if_add_proto = gif_add_proto;
+ sc->gif_if.if_del_proto = gif_del_proto;
 dlil_if_attach(&sc->gif_if);
 bpfattach(&sc->gif_if, DLT_NULL, sizeof(u_int));
 TAILQ_INSERT_TAIL(&gifs, sc, gif_link);
@@ -473,20 +381,19 @@ gif_encapcheck(m, off, proto, arg)
 }
 int
-gif_pre_output(ifp, m0, dst, rt, frame, address, dl_tag)
- struct ifnet *ifp;
- struct mbuf **m0;
- struct sockaddr *dst;
- caddr_t rt;
- char *frame;
- char *address;
- u_long dl_tag;
+gif_pre_output(
+ struct ifnet *ifp,
+ u_long protocol_family,
+ struct mbuf **m0,
+ const struct sockaddr *dst,
+ caddr_t rt,
+ char *frame,
+ char *address)
 {
 struct gif_softc *sc = (struct gif_softc*)ifp;
 register struct mbuf * m = *m0;
 int error = 0;
- static int called = 0; /* XXX: MUTEX */
-
+
 /*
 * gif may cause infinite recursion calls when misconfigured.
 * We'll prevent this by introducing upper limit.
@@ -494,16 +401,16 @@
 * mutual exclusion of the variable CALLED, especially if we
 * use kernel thread.
 */
- if (++called > max_gif_nesting) {
+ if (++sc->gif_called > max_gif_nesting) {
 log(LOG_NOTICE,
 "gif_output: recursively called too many times(%d)\n",
- called);
+ sc->gif_called);
 m_freem(m); /* free it here not in dlil_output*/
 error = EIO; /* is there better errno? */
 goto end;
 }
- getmicrotime(&ifp->if_lastchange);
+ ifnet_touch_lastchange(ifp);
 m->m_flags &= ~(M_BCAST|M_MCAST);
 if (!(ifp->if_flags & IFF_UP) ||
 sc->gif_psrc == NULL || sc->gif_pdst == NULL) {
@@ -521,11 +428,11 @@
 * try to free it or keep a pointer a to it).
 */
 struct mbuf m0;
- u_int32_t af = dst->sa_family;
+ u_int32_t protocol_family = dst->sa_family;
 m0.m_next = m;
 m0.m_len = 4;
- m0.m_data = (char *)&af;
+ m0.m_data = (char *)&protocol_family;
 bpf_mtap(ifp, &m0);
 }
@@ -554,7 +461,7 @@
 }
 end:
- called = 0; /* reset recursion counter */
+ sc->gif_called = 0; /* reset recursion counter */
 if (error) {
 /* the mbuf was freed either by in_gif_output or in here */
 *m0 = NULL; /* avoid getting dlil_output freeing it */
@@ -566,16 +473,13 @@
 }
 int
-gif_input(m, frame_header, gifp, dl_tag, sync_ok)
- struct mbuf *m;
- char* frame_header;
- struct ifnet* gifp;
- u_long dl_tag;
- int sync_ok;
+gif_input(
+ struct mbuf *m,
+ char* frame_header,
+ struct ifnet* gifp,
+ u_long protocol_family,
+ int sync_ok)
 {
- int s, isr;
- struct ifqueue *ifq = 0;
- int af;
 if (gifp == NULL) {
 /* just in case */
@@ -583,9 +487,6 @@
 return;
 }
- /* Assume packet is of type of protocol attached to this interface */
- af = ((struct gif_softc*)(gifp->if_softc))->gif_proto->protocol_family;
-
 if (m->m_pkthdr.rcvif)
 m->m_pkthdr.rcvif = gifp;
@@ -598,11 +499,11 @@
 * try to free it or keep a pointer a to it).
 */
 struct mbuf m0;
- u_int32_t af1 = af;
+ u_int32_t protocol_family1 = protocol_family;
 m0.m_next = m;
 m0.m_len = 4;
- m0.m_data = (char *)&af1;
+ m0.m_data = (char *)&protocol_family1;
 bpf_mtap(gifp, &m0);
 }
@@ -619,37 +520,9 @@
 * it occurs more times than we thought, we may change the policy
 * again.
 */
- switch (af) {
-#if INET
- case AF_INET:
- ifq = &ipintrq;
- isr = NETISR_IP;
- break;
-#endif
-#if INET6
- case AF_INET6:
- ifq = &ip6intrq;
- isr = NETISR_IPV6;
- break;
-#endif
- default:
- m_freem(m);
- return (EJUSTRETURN);
- }
-
- s = splimp();
- if (IF_QFULL(ifq)) {
- IF_DROP(ifq); /* update statistics */
- m_freem(m);
- splx(s);
- return (EJUSTRETURN);
- }
- IF_ENQUEUE(ifq, m);
- /* we need schednetisr since the address family may change */
- schednetisr(isr);
+ proto_input(protocol_family, m);
 gifp->if_ipackets++;
 gifp->if_ibytes += m->m_pkthdr.len;
- splx(s);
 return (0);
 }
@@ -781,7 +654,8 @@ gif_ioctl(ifp, cmd, data)
 break;
 }
- TAILQ_FOREACH(ifp2, &ifnet, if_link) {
+ ifnet_head_lock_shared();
+ TAILQ_FOREACH(ifp2, &ifnet_head, if_link) {
 if (strcmp(ifp2->if_name, GIFNAME) != 0)
 continue;
 sc2 = ifp2->if_softc;
@@ -799,6 +673,7 @@
 if (bcmp(sc2->gif_pdst, dst, dst->sa_len) == 0 &&
 bcmp(sc2->gif_psrc, src, src->sa_len) == 0) {
 error = EADDRNOTAVAIL;
+ ifnet_head_done();
 goto bad;
 }
 #endif
@@ -813,16 +688,19 @@
 if (dst->sa_family == AF_INET &&
 multidest(dst) && multidest(sc2->gif_pdst)) {
 error = EADDRNOTAVAIL;
+ ifnet_head_done();
 goto bad;
 }
 #if INET6
 if (dst->sa_family == AF_INET6 &&
 multidest6(dst) && multidest6(sc2->gif_pdst)) {
 error = EADDRNOTAVAIL;
+ ifnet_head_done();
 goto bad;
 }
 #endif
 }
+ ifnet_head_done();
 if (sc->gif_psrc)
 FREE((caddr_t)sc->gif_psrc, M_IFADDR);
@@ -838,8 +716,6 @@
 ifp->if_flags |= IFF_RUNNING;
- gif_attach_proto_family(ifp, src->sa_family);
-
 s = splimp();
 if_up(ifp); /* mark interface UP and send up RTM_IFINFO */
 #ifdef __APPLE__
@@ -966,6 +842,8 @@
 return error;
 }
+#ifndef __APPLE__
+/* This function is not used in our stack */
 void
 gif_delete_tunnel(sc)
 struct gif_softc
*sc; @@ -982,3 +860,4 @@ gif_delete_tunnel(sc) } /* change the IFF_UP flag as well? */ } +#endif diff --git a/bsd/net/if_gif.h b/bsd/net/if_gif.h index a74b84a4e..bfd647244 100644 --- a/bsd/net/if_gif.h +++ b/bsd/net/if_gif.h @@ -61,15 +61,15 @@ #include <netinet/in.h> /* xxx sigh, why route have struct route instead of pointer? */ +#ifdef KERNEL_PRIVATE struct encaptab; -#ifdef __APPLE_API_PRIVATE struct gif_softc { struct ifnet gif_if; /* common area - must be at the top */ struct sockaddr *gif_psrc; /* Physical src addr */ struct sockaddr *gif_pdst; /* Physical dst addr */ #ifdef __APPLE__ - struct if_proto *gif_proto; /* dlil protocol attached */ + u_long gif_proto; /* dlil protocol attached */ #endif union { struct route gifscr_ro; /* xxx */ @@ -78,6 +78,7 @@ struct gif_softc { #endif } gifsc_gifscr; int gif_flags; + int gif_called; const struct encaptab *encap_cookie4; const struct encaptab *encap_cookie6; TAILQ_ENTRY(gif_softc) gif_link; /* all gif's are linked */ @@ -87,18 +88,20 @@ struct gif_softc { #if INET6 #define gif_ro6 gifsc_gifscr.gifscr_ro6 #endif -#endif /* __APPLE_API_PRIVATE */ + +#endif /* KERNEL_PRIVATE */ #define GIF_MTU (1280) /* Default MTU */ #define GIF_MTU_MIN (1280) /* Minimum MTU */ #define GIF_MTU_MAX (8192) /* Maximum MTU */ -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE + /* Prototypes */ -int gif_input __P((struct mbuf *, char*, struct ifnet *, u_long, int)); -int gif_output __P((struct ifnet *, struct mbuf *, - struct sockaddr *, struct rtentry *)); -int gif_ioctl __P((struct ifnet *, u_long, void*)); -#endif /* __APPLE_API_PRIVATE */ +int gif_input(struct mbuf *, char*, struct ifnet *, u_long, int); +int gif_output(struct ifnet *, struct mbuf *, + struct sockaddr *, struct rtentry *); +int gif_ioctl(struct ifnet *, u_long, void*); +#endif /* KERNEL_PRIVATE */ #endif /* _NET_IF_GIF_H_ */ diff --git a/bsd/net/if_loop.c b/bsd/net/if_loop.c index 33255ecb1..57914cd96 100644 --- a/bsd/net/if_loop.c +++ b/bsd/net/if_loop.c @@ -70,7 +70,6 @@ #include <net/if.h> #include <net/if_types.h> -#include <net/netisr.h> #include <net/route.h> #include <net/bpf.h> #include <sys/malloc.h> @@ -94,6 +93,7 @@ #endif #include <net/dlil.h> +#include <net/kpi_protocol.h> #if NETAT extern struct ifqueue atalkintrq; @@ -107,14 +107,14 @@ extern struct ifqueue atalkintrq; #define NLOOP_ATTACHMENTS (NLOOP * 12) struct lo_statics_str { - int bpf_mode; - int (*bpf_callback)(struct ifnet *, struct mbuf *); + int bpf_mode; + bpf_packet_func bpf_callback; }; -static struct if_proto *lo_array[NLOOP_ATTACHMENTS]; -static struct lo_statics_str lo_statics[NLOOP]; -static lo_count = 0; +void loopattach(void *dummy); +static struct lo_statics_str lo_statics[NLOOP]; +int loopattach_done = 0; /* used to sync ip6_init2 loopback configuration */ #ifdef TINY_LOMTU #define LOMTU (1024+512) @@ -123,102 +123,70 @@ static lo_count = 0; #endif struct ifnet loif[NLOOP]; +struct ifnet *lo_ifp = &loif[0]; -void lo_reg_if_mods(); - +struct loopback_header { + u_long protocol; +}; +void lo_reg_if_mods(void); +/* Local forward declerations */ -int lo_demux(ifp, m, frame_header, proto) - struct ifnet *ifp; - struct mbuf *m; - char *frame_header; - struct if_proto **proto; +static errno_t +lo_demux( + __unused ifnet_t ifp, + __unused mbuf_t m, + char *frame_header, + protocol_family_t *protocol_family) { - int i; - struct if_proto **proto_ptr; - - proto_ptr = mtod(m, struct if_proto **); - *proto = *proto_ptr; - m_adj(m, sizeof(u_long)); - return 0; + struct loopback_header *header = (struct 
loopback_header *)frame_header; + + *protocol_family = header->protocol; + + return 0; } -int lo_framer(ifp, m, dest, dest_linkaddr, frame_type) - struct ifnet *ifp; - struct mbuf **m; - struct sockaddr *dest; - char *dest_linkaddr; - char *frame_type; - +static errno_t +lo_framer( + __unused ifnet_t ifp, + mbuf_t *m, + __unused const struct sockaddr *dest, + __unused const char *dest_linkaddr, + const char *frame_type) { - char *to_ptr; + struct loopback_header *header; - M_PREPEND(*m, (4 * sizeof(u_long)), M_WAITOK); - to_ptr = mtod(*m, char *); - bcopy(dest_linkaddr, to_ptr, (4 * sizeof(u_long))); + M_PREPEND(*m, sizeof(struct loopback_header), M_WAITOK); + header = mtod(*m, struct loopback_header*); + header->protocol = *(const u_long*)frame_type; return 0; } -static -int lo_add_if(struct ifnet *ifp) -{ - ifp->if_demux = lo_demux; - ifp->if_framer = lo_framer; - ifp->if_event = 0; - return 0; -} - -static -int lo_del_if(struct ifnet *ifp) +static errno_t +lo_add_proto( + __unused struct ifnet *ifp, + __unused u_long protocol_family, + __unused struct ddesc_head_str *demux_desc_head) { return 0; } - - -static -int lo_add_proto(struct ddesc_head_str *desc_head, struct if_proto *proto, u_long dl_tag) +static errno_t +lo_del_proto( + __unused ifnet_t ifp, + __unused protocol_family_t protocol) { - int i; - - for (i=0; i < lo_count; i++) - if (lo_array[i] == 0) { - lo_array[lo_count] = proto; - return 0; - } - - if ((i == lo_count) && (lo_count == NLOOP_ATTACHMENTS)) - panic("lo_add_proto -- Too many attachments\n"); - - lo_array[lo_count++] = proto; - return 0; -} - - -static -int lo_del_proto(struct if_proto *proto, u_long dl_tag) -{ - int i; - - for (i=0; i < lo_count; i++) - if (lo_array[i] == proto) { - lo_array[i] = 0; - return 0; - } - - return ENOENT; + return 0; } static int -lo_output(ifp, m) - struct ifnet *ifp; - register struct mbuf *m; -{ u_int *prepend_ptr; - u_int af; - u_long saved_header[3]; +lo_output( + struct ifnet *ifp, + struct mbuf *m) +{ if ((m->m_flags & M_PKTHDR) == 0) panic("lo_output: no HDR"); @@ -230,20 +198,26 @@ lo_output(ifp, m) */ if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = ifp; - prepend_ptr = mtod(m, u_int *); - af = *prepend_ptr; - m_adj(m, sizeof(u_int)); + ifp->if_ibytes += m->m_pkthdr.len; + ifp->if_obytes += m->m_pkthdr.len; + + ifp->if_opackets++; + ifp->if_ipackets++; + + m->m_pkthdr.header = mtod(m, char *); + m->m_pkthdr.csum_data = 0xffff; /* loopback checksums are always OK */ + m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR | + CSUM_IP_CHECKED | CSUM_IP_VALID; + m_adj(m, sizeof(struct loopback_header)); #if NBPFILTER > 0 if (lo_statics[ifp->if_unit].bpf_mode != BPF_TAP_DISABLE) { struct mbuf m0, *n; - bcopy(mtod(m, caddr_t), &saved_header[0], (3 * sizeof(u_long))); - m_adj(m, (3 * sizeof(u_long))); - n = m; if (ifp->if_bpf->bif_dlt == DLT_NULL) { + struct loopback_header *header; /* * We need to prepend the address family as * a four byte field. Cons up a dummy header @@ -251,156 +225,70 @@ lo_output(ifp, m) * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer a to it). 
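lo_framer() and lo_demux() above replace the old four-word prepend with a typed loopback_header carrying only the protocol family. A userland simulation of the round trip (illustrative only; a flat buffer stands in for the mbuf and M_PREPEND):

    #include <stdio.h>
    #include <string.h>

    struct loopback_header {
            unsigned long protocol;         /* mirrors the header above */
    };

    int
    main(void)
    {
            unsigned char frame[64];
            struct loopback_header out = { 2 };     /* 2 == PF_INET */
            struct loopback_header in;

            /* lo_framer: prepend the family in front of the payload */
            memcpy(frame, &out, sizeof(out));
            /* lo_demux: read it back from the frame header */
            memcpy(&in, frame, sizeof(in));
            printf("demuxed protocol family %lu\n", in.protocol);
            return 0;
    }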
 */
+ header = (struct loopback_header*)m->m_pkthdr.header;
 m0.m_next = m;
 m0.m_len = 4;
- m0.m_data = (char *)&af;
+ m0.m_data = (char *)&header->protocol;
 n = &m0;
 }
- (*lo_statics[ifp->if_unit].bpf_callback)(ifp, n);
-
- M_PREPEND(m, (3 * sizeof(u_long)), M_WAITOK);
- bcopy(&saved_header[0], mtod(m, caddr_t), (3 * sizeof(u_long)));
-
+ lo_statics[ifp->if_unit].bpf_callback(ifp, n);
 }
 #endif
- ifp->if_ibytes += m->m_pkthdr.len;
- ifp->if_obytes += m->m_pkthdr.len;
-
- ifp->if_opackets++;
- ifp->if_ipackets++;
-
- m->m_pkthdr.header = mtod(m, char *);
- m->m_pkthdr.csum_data = 0xffff; /* loopback checksums are always OK */
- m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
- CSUM_IP_CHECKED | CSUM_IP_VALID;
 return dlil_input(ifp, m, m);
 }
 /*
- * This is a common pre-output route used by INET, AT, etc. This could
+ * This is a common pre-output route used by INET and INET6. This could
 * (should?) be split into separate pre-output routines for each protocol.
 */
 static int
-lo_pre_output(ifp, m, dst, route, frame_type, dst_addr, dl_tag)
- struct ifnet *ifp;
- register struct mbuf **m;
- struct sockaddr *dst;
- void *route;
- char *frame_type;
- char *dst_addr;
- u_long dl_tag;
+lo_pre_output(
+ __unused struct ifnet *ifp,
+ u_long protocol_family,
+ struct mbuf **m,
+ __unused const struct sockaddr *dst,
+ caddr_t route,
+ char *frame_type,
+ __unused char *dst_addr)
 {
- int s, isr;
- register struct ifqueue *ifq = 0;
- u_long *prepend_ptr;
 register struct rtentry *rt = (struct rtentry *) route;
- prepend_ptr = (u_long *) dst_addr;
 if (((*m)->m_flags & M_PKTHDR) == 0)
 panic("looutput no HDR");
 if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
- if (rt->rt_flags & RTF_BLACKHOLE) {
- m_freem(*m);
- return EJUSTRETURN;
- }
- else
- return ((rt->rt_flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH);
- }
-
- switch (dst->sa_family) {
-#if INET
- case AF_INET:
- ifq = &ipintrq;
- isr = NETISR_IP;
- break;
-#endif
-#if INET6
- case AF_INET6:
- (*m)->m_flags |= M_LOOP;
- ifq = &ip6intrq;
- isr = NETISR_IPV6;
- break;
-#endif
-#if IPX
- case AF_IPX:
- ifq = &ipxintrq;
- isr = NETISR_IPX;
- break;
-#endif
-#if NS
- case AF_NS:
- ifq = &nsintrq;
- isr = NETISR_NS;
- break;
-#endif
-#if ISO
- case AF_ISO:
- ifq = &clnlintrq;
- isr = NETISR_ISO;
- break;
-#endif
-#if NETAT
- case AF_APPLETALK:
- ifq = &atalkintrq;
- isr = NETISR_APPLETALK;
- break;
-#endif /* NETAT */
- default:
- return (EAFNOSUPPORT);
+ if (rt->rt_flags & RTF_BLACKHOLE) {
+ m_freem(*m);
+ return EJUSTRETURN;
+ }
+ else
+ return ((rt->rt_flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH);
 }
-
- *prepend_ptr++ = dst->sa_family; /* For lo_output(BPF) */
- *prepend_ptr++ = dlttoproto(dl_tag); /* For lo_demux */
- *prepend_ptr++ = (u_long) ifq; /* For lo_input */
- *prepend_ptr = isr; /* For lo_input */
+
+ *(u_long *)frame_type = protocol_family;
 return 0;
 }
-
-
-
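The "ifq/schednetisr input mechanism" wording in the lo_input comment just below predates this patch: the rewritten lo_input() simply hands the chain to proto_input(). A kernel-only sketch of the apparent division of labor between the two KPIs, an assumption drawn from how this patch uses them (proto_input() on the DLIL input path, proto_inject() from other contexts, as in if_dummy and if_faith); deliver() is hypothetical:

    static int
    deliver(u_long family, struct mbuf *m, int on_input_path)
    {
            int err = on_input_path ? proto_input(family, m)
                                    : proto_inject(family, m);
            if (err != 0)
                    m_freem(m);     /* lo_input below frees on failure too */
            return err;
    }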
 /*
 * lo_input - This should work for all attached protocols that use the
 * ifq/schednetisr input mechanism.
 */
-
-int
-lo_input(m, fh, ifp, dl_tag, sync_ok)
- register struct mbuf *m;
- char *fh;
- struct ifnet *ifp;
- u_long dl_tag;
- int sync_ok;
-
+static int
+lo_input(
+ struct mbuf *m,
+ __unused char *fh,
+ __unused struct ifnet *ifp,
+ __unused u_long protocol_family,
+ __unused int sync_ok)
 {
- u_long *prepend_ptr;
- int s, isr;
- register struct ifqueue *ifq = 0;
-
- prepend_ptr = mtod(m, u_long *);
- ifq = (struct ifqueue *) *prepend_ptr++;
- isr = *prepend_ptr;
- m_adj(m, (2 * sizeof(u_long)));
-
- s = splimp();
- if (IF_QFULL(ifq)) {
- IF_DROP(ifq);
+ if (proto_input(protocol_family, m) != 0)
 m_freem(m);
- splx(s);
- return (EJUSTRETURN);
- }
-
- IF_ENQUEUE(ifq, m);
- schednetisr(isr);
- splx(s);
 return (0);
 }
@@ -409,10 +297,10 @@
 /* ARGSUSED */
 static void
-lortrequest(cmd, rt, sa)
- int cmd;
- struct rtentry *rt;
- struct sockaddr *sa;
+lortrequest(
+ __unused int cmd,
+ struct rtentry *rt,
+ __unused struct sockaddr *sa)
 {
 if (rt) {
 rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; /* for ISO */
@@ -429,8 +317,11 @@
 /*
 * Process an ioctl request.
 */
-static int
-lo_if_ioctl(struct ifnet *ifp, u_long cmd, void * data)
+static errno_t
+loioctl(
+ ifnet_t ifp,
+ u_int32_t cmd,
+ void* data)
 {
 register struct ifaddr *ifa;
 register struct ifreq *ifr = (struct ifreq *)data;
@@ -439,7 +330,7 @@
 switch (cmd) {
 case SIOCSIFADDR:
- ifp->if_flags |= IFF_UP | IFF_RUNNING;
+ ifnet_set_flags(ifp, IFF_UP | IFF_RUNNING, IFF_UP | IFF_RUNNING);
 ifa = (struct ifaddr *)data;
 ifa->ifa_rtrequest = lortrequest;
 /*
@@ -483,140 +374,49 @@
 }
 return (error);
 }
-
-static int
-loioctl(u_long dl_tag, struct ifnet *ifp, u_long cmd, caddr_t data)
-{
- return (lo_if_ioctl(ifp, cmd, data));
-}
-
 #endif /* NLOOP > 0 */
-int lo_shutdown()
-{
- return 0;
-}
-
-int lo_attach_inet(struct ifnet *ifp, u_long *dl_tag)
+static int lo_attach_proto(struct ifnet *ifp, u_long protocol_family)
 {
- struct dlil_proto_reg_str reg;
- struct dlil_demux_desc desc;
- short native=0;
- int stat =0 ;
- int i;
-
- for (i=0; i < lo_count; i++) {
- if ((lo_array[i]) && (lo_array[i]->ifp == ifp)) {
- if (lo_array[i]->protocol_family == PF_INET) {
- *dl_tag = lo_array[i]->dl_tag;
- return (0);
- }
- }
- }
-
- TAILQ_INIT(&reg.demux_desc_head);
- desc.type = DLIL_DESC_RAW;
- desc.variants.bitmask.proto_id_length = 0;
- desc.variants.bitmask.proto_id = 0;
- desc.variants.bitmask.proto_id_mask = 0;
- desc.native_type = (char *) &native;
- TAILQ_INSERT_TAIL(&reg.demux_desc_head, &desc, next);
- reg.interface_family = ifp->if_family;
- reg.unit_number = ifp->if_unit;
- reg.input = lo_input;
- reg.pre_output = lo_pre_output;
- reg.event = 0;
- reg.offer = 0;
- reg.ioctl = loioctl;
- reg.default_proto = 0;
- reg.protocol_family = PF_INET;
-
- stat = dlil_attach_protocol(&reg, dl_tag);
-
- if (stat)
- printf("lo_attach_inet: dlil_attach_protocol returned=%d\n", stat);
-
- return stat;
-}
+ struct dlil_proto_reg_str reg;
+ int stat = 0;
+
+ bzero(&reg, sizeof(reg));
+ TAILQ_INIT(&reg.demux_desc_head);
+ reg.interface_family = ifp->if_family;
+ reg.unit_number = ifp->if_unit;
+ reg.input = lo_input;
+ reg.pre_output = lo_pre_output;
+ reg.protocol_family = protocol_family;
+
+ stat = dlil_attach_protocol(&reg);
-int lo_attach_inet6(struct ifnet *ifp, u_long *dl_tag)
-{
- struct dlil_proto_reg_str reg;
- struct dlil_demux_desc desc;
- short native=0;
- int stat;
- int i;
-
- for (i=0; i < lo_count; i++) {
- if ((lo_array[i]) && (lo_array[i]->ifp == ifp)) {
- if (lo_array[i]->protocol_family == PF_INET6) {
- *dl_tag = lo_array[i]->dl_tag;
- return (0);
- }
+ if (stat && stat != EEXIST) {
+ printf("lo_attach_proto: dlil_attach_protocol for %d returned=%d\n",
+ protocol_family, stat);
 }
- }
-
- TAILQ_INIT(&reg.demux_desc_head);
- desc.type = DLIL_DESC_RAW;
- desc.variants.bitmask.proto_id_length = 0;
- desc.variants.bitmask.proto_id = 0;
- desc.variants.bitmask.proto_id_mask = 0;
- desc.native_type = (char *) &native;
- TAILQ_INSERT_TAIL(&reg.demux_desc_head, &desc, next);
- reg.interface_family = ifp->if_family;
- reg.unit_number = ifp->if_unit;
- reg.input = lo_input;
- reg.pre_output = lo_pre_output;
- reg.event = 0;
- reg.offer = 0;
- reg.ioctl = loioctl;
- reg.default_proto = 0;
- reg.protocol_family = PF_INET6;
-
- stat = dlil_attach_protocol(&reg, dl_tag);
-
- if (stat)
- printf("lo_attach_inet6: dlil_attach_protocol returned=%d\n", stat);
-
- return stat;
+
+ return stat;
 }
 void lo_reg_if_mods()
 {
- struct dlil_ifmod_reg_str lo_ifmod;
- struct dlil_protomod_reg_str lo_protoreg;
 int error;
- bzero(&lo_ifmod, sizeof(lo_ifmod));
- lo_ifmod.add_if = lo_add_if;
- lo_ifmod.del_if = lo_del_if;
- lo_ifmod.add_proto = lo_add_proto;
- lo_ifmod.del_proto = lo_del_proto;
- lo_ifmod.ifmod_ioctl = 0;
- lo_ifmod.shutdown = lo_shutdown;
-
- if (dlil_reg_if_modules(APPLE_IF_FAM_LOOPBACK, &lo_ifmod))
- panic("Couldn't register lo modules\n");
-
 /* Register protocol registration functions */
-
- bzero(&lo_protoreg, sizeof(lo_protoreg));
- lo_protoreg.attach_proto = lo_attach_inet;
- lo_protoreg.detach_proto = NULL; /* no detach function for loopback */
-
- if ( error = dlil_reg_proto_module(PF_INET, APPLE_IF_FAM_LOOPBACK, &lo_protoreg) != 0)
+ if ((error = dlil_reg_proto_module(PF_INET, APPLE_IF_FAM_LOOPBACK, lo_attach_proto, NULL)) != 0)
 printf("dlil_reg_proto_module failed for AF_INET error=%d\n", error);
- lo_protoreg.attach_proto = lo_attach_inet6;
- lo_protoreg.detach_proto = NULL;
-
- if ( error = dlil_reg_proto_module(PF_INET6, APPLE_IF_FAM_LOOPBACK, &lo_protoreg) != 0)
+ if ((error = dlil_reg_proto_module(PF_INET6, APPLE_IF_FAM_LOOPBACK, lo_attach_proto, NULL)) != 0)
 printf("dlil_reg_proto_module failed for AF_INET6 error=%d\n", error);
 }
-int lo_set_bpf_tap(struct ifnet *ifp, int mode, int (*bpf_callback)(struct ifnet *, struct mbuf *))
+static errno_t
+lo_set_bpf_tap(
+ ifnet_t ifp,
+ bpf_tap_mode mode,
+ bpf_packet_func bpf_callback)
 {
 /*
@@ -637,32 +437,38 @@
 /* ARGSUSED */
 void
-loopattach(dummy)
- void *dummy;
+loopattach(
+ __unused void *dummy)
 {
- register struct ifnet *ifp;
- register int i = 0;
+ struct ifnet *ifp;
+ int i = 0;
- thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL);
 lo_reg_if_mods();
 for (ifp = loif; i < NLOOP; ifp++) {
 lo_statics[i].bpf_callback = 0;
 lo_statics[i].bpf_mode = BPF_TAP_DISABLE;
+ bzero(ifp, sizeof(struct ifnet));
 ifp->if_name = "lo";
 ifp->if_family = APPLE_IF_FAM_LOOPBACK;
 ifp->if_unit = i++;
 ifp->if_mtu = LOMTU;
 ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST;
- ifp->if_ioctl = lo_if_ioctl;
+ ifp->if_ioctl = loioctl;
+ ifp->if_demux = lo_demux;
+ ifp->if_framer = lo_framer;
+ ifp->if_add_proto = lo_add_proto;
+ ifp->if_del_proto = lo_del_proto;
 ifp->if_set_bpf_tap = lo_set_bpf_tap;
 ifp->if_output = lo_output;
 ifp->if_type = IFT_LOOP;
- ifp->if_hwassist = 0; /* HW cksum on send side breaks Classic loopback */
+ ifp->if_hwassist = IF_HWASSIST_CSUM_IP | IF_HWASSIST_CSUM_TCP |
IF_HWASSIST_CSUM_UDP; + ifp->if_hdrlen = sizeof(struct loopback_header); + lo_ifp = ifp; dlil_if_attach(ifp); #if NBPFILTER > 0 bpfattach(ifp, DLT_NULL, sizeof(u_int)); #endif } - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + loopattach_done = 1; } diff --git a/bsd/net/if_media.c b/bsd/net/if_media.c index ed5ec87ab..f7c4e96dc 100644 --- a/bsd/net/if_media.c +++ b/bsd/net/if_media.c @@ -83,12 +83,12 @@ * Useful for debugging newly-ported drivers. */ -static struct ifmedia_entry *ifmedia_match __P((struct ifmedia *ifm, - int flags, int mask)); +static struct ifmedia_entry *ifmedia_match(struct ifmedia *ifm, + int flags, int mask); #ifdef IFMEDIA_DEBUG int ifmedia_debug = 0; -static void ifmedia_printword __P((int)); +static void ifmedia_printword(int); #endif /* @@ -213,11 +213,11 @@ ifmedia_set(ifm, target) * Device-independent media ioctl support function. */ int -ifmedia_ioctl(ifp, ifr, ifm, cmd) - struct ifnet *ifp; - struct ifreq *ifr; - struct ifmedia *ifm; - u_long cmd; +ifmedia_ioctl( + struct ifnet *ifp, + struct ifreq *ifr, + struct ifmedia *ifm, + u_long cmd) { struct ifmedia_entry *match; struct ifmediareq *ifmr = (struct ifmediareq *) ifr; @@ -349,7 +349,7 @@ ifmedia_ioctl(ifp, ifr, ifm, cmd) sticky = error; if ((error == 0 || error == E2BIG) && ifmr->ifm_count != 0) { error = copyout((caddr_t)kptr, - (caddr_t)ifmr->ifm_ulist, + CAST_USER_ADDR_T(ifmr->ifm_ulist), ifmr->ifm_count * sizeof(int)); } diff --git a/bsd/net/if_media.h b/bsd/net/if_media.h index f17ed30fa..077fc3e69 100644 --- a/bsd/net/if_media.h +++ b/bsd/net/if_media.h @@ -72,16 +72,14 @@ * to implement this interface. */ -#ifdef KERNEL - +#ifdef KERNEL_PRIVATE #include <sys/queue.h> -#ifdef __APPLE_API_UNSTABLE /* * Driver callbacks for media status and change requests. */ -typedef int (*ifm_change_cb_t) __P((struct ifnet *ifp)); -typedef void (*ifm_stat_cb_t) __P((struct ifnet *ifp, struct ifmediareq *req)); +typedef int (*ifm_change_cb_t)(struct ifnet *ifp); +typedef void (*ifm_stat_cb_t)(struct ifnet *ifp, struct ifmediareq *req); /* * In-kernel representation of a single supported media type. @@ -107,25 +105,24 @@ struct ifmedia { }; /* Initialize an interface's struct if_media field. */ -void ifmedia_init __P((struct ifmedia *ifm, int dontcare_mask, - ifm_change_cb_t change_callback, ifm_stat_cb_t status_callback)); +void ifmedia_init(struct ifmedia *ifm, int dontcare_mask, + ifm_change_cb_t change_callback, ifm_stat_cb_t status_callback); /* Add one supported medium to a struct ifmedia. */ -void ifmedia_add __P((struct ifmedia *ifm, int mword, int data, void *aux)); +void ifmedia_add(struct ifmedia *ifm, int mword, int data, void *aux); /* Add an array (of ifmedia_entry) media to a struct ifmedia. */ void ifmedia_list_add(struct ifmedia *mp, struct ifmedia_entry *lp, int count); /* Set default media type on initialization. */ -void ifmedia_set __P((struct ifmedia *ifm, int mword)); +void ifmedia_set(struct ifmedia *ifm, int mword); /* Common ioctl function for getting/setting media, called by driver. 
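The ifmedia_ioctl() this hunk touches services the SIOCGIFMEDIA/SIOCSIFMEDIA requests through which the word-encoded media subtypes (including the renamed IFM_1000_T) are read. A userland sketch of the standard query sequence (assumptions: the conventional BSD ifmediareq usage, and that an interface named "en0" exists; not code from this patch):

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <sys/sockio.h>
    #include <net/if.h>
    #include <net/if_media.h>

    int
    main(void)
    {
            struct ifmediareq ifmr;
            int s = socket(AF_INET, SOCK_DGRAM, 0);

            if (s < 0)
                    return 1;
            memset(&ifmr, 0, sizeof(ifmr));
            strncpy(ifmr.ifm_name, "en0", sizeof(ifmr.ifm_name));
            if (ioctl(s, SIOCGIFMEDIA, &ifmr) == 0)
                    printf("media word 0x%x, subtype %d\n",
                        ifmr.ifm_active, IFM_SUBTYPE(ifmr.ifm_active));
            close(s);
            return 0;
    }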
*/ -int ifmedia_ioctl __P((struct ifnet *ifp, struct ifreq *ifr, - struct ifmedia *ifm, u_long cmd)); +int ifmedia_ioctl(struct ifnet *ifp, struct ifreq *ifr, + struct ifmedia *ifm, u_long cmd); -#endif /* __APPLE_API_UNSTABLE */ -#endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ /* * if_media Options word: @@ -156,8 +153,13 @@ int ifmedia_ioctl __P((struct ifnet *ifp, struct ifreq *ifr, #define IFM_10_FL 13 /* 10baseFL - Fiber */ #define IFM_1000_LX 14 /* 1000baseLX - single-mode fiber */ #define IFM_1000_CX 15 /* 1000baseCX - 150ohm STP */ -#define IFM_1000_TX 16 /* 1000baseTX - 4 pair cat 5 */ +#define IFM_1000_T 16 /* 1000baseT - 4 pair cat 5 */ +#ifdef PRIVATE +#define IFM_1000_TX IFM_1000_T /* For compatibility */ +#endif /* PRIVATE */ #define IFM_HPNA_1 17 /* HomePNA 1.0 (1Mb/s) */ +#define IFM_10G_SR 18 /* 10GbaseSR - multi-mode fiber */ +#define IFM_10G_LR 19 /* 10GbaseLR - single-mode fiber */ /* * Token ring @@ -283,8 +285,10 @@ struct ifmedia_description { { IFM_10_FL, "10baseFL" }, \ { IFM_1000_LX, "1000baseLX" }, \ { IFM_1000_CX, "1000baseCX" }, \ - { IFM_1000_TX, "1000baseTX" }, \ + { IFM_1000_T, "1000baseT" }, \ { IFM_HPNA_1, "HomePNA1" }, \ + { IFM_10G_SR, "10GbaseSR" }, \ + { IFM_10G_LR, "10GbaseLR" }, \ { 0, NULL }, \ } @@ -307,8 +311,10 @@ struct ifmedia_description { { IFM_10_FL, "10FL" }, \ { IFM_1000_LX, "1000LX" }, \ { IFM_1000_CX, "1000CX" }, \ - { IFM_1000_TX, "1000TX" }, \ + { IFM_1000_T, "1000T" }, \ { IFM_HPNA_1, "HPNA1" }, \ + { IFM_10G_SR, "10GSR" }, \ + { IFM_10G_LR, "10GLR" }, \ { 0, NULL }, \ } diff --git a/bsd/net/if_mib.c b/bsd/net/if_mib.c index fc94438fe..faa462f44 100644 --- a/bsd/net/if_mib.c +++ b/bsd/net/if_mib.c @@ -55,6 +55,7 @@ #include <sys/kernel.h> #include <sys/socket.h> #include <sys/sysctl.h> +#include <sys/systm.h> #include <net/if.h> #include <net/if_mib.h> @@ -85,48 +86,44 @@ */ SYSCTL_DECL(_net_link_generic); -SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system, CTLFLAG_RW, 0, + +SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system, CTLFLAG_RD, 0, "Variables global to all interfaces"); + SYSCTL_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount, CTLFLAG_RD, &if_index, 0, "Number of configured interfaces"); -static int -sysctl_ifdata SYSCTL_HANDLER_ARGS /* XXX bad syntax! 
*/ -{ - int *name = (int *)arg1; - int error, ifnlen; - u_int namelen = arg2; - struct ifnet *ifp; - char workbuf[64]; - struct ifmibdata ifmd; +static int sysctl_ifdata SYSCTL_HANDLER_ARGS; +SYSCTL_NODE(_net_link_generic, IFMIB_IFDATA, ifdata, CTLFLAG_RD, + sysctl_ifdata, "Interface table"); - if (namelen != 2) - return EINVAL; +static int sysctl_ifalldata SYSCTL_HANDLER_ARGS; +SYSCTL_NODE(_net_link_generic, IFMIB_IFALLDATA, ifalldata, CTLFLAG_RD, + sysctl_ifalldata, "Interface table"); - if (name[0] <= 0 || name[0] > if_index) - return ENOENT; +static int make_ifmibdata(struct ifnet *, int *, struct sysctl_req *); - ifp = ifnet_addrs[name[0] - 1]->ifa_ifp; +int +make_ifmibdata(struct ifnet *ifp, int *name, struct sysctl_req *req) +{ + struct ifmibdata ifmd; + int error = 0; + switch(name[1]) { default: - return ENOENT; + error = ENOENT; + break; case IFDATA_GENERAL: - /* - ifnlen = snprintf(workbuf, sizeof(workbuf), - "%s%d", ifp->if_name, ifp->if_unit); - if(ifnlen + 1 > sizeof ifmd.ifmd_name) { - return ENAMETOOLONG; - } else { - strcpy(ifmd.ifmd_name, workbuf); - } - */ + + snprintf(ifmd.ifmd_name, sizeof(ifmd.ifmd_name), "%s%d", + ifp->if_name, ifp->if_unit); #define COPY(fld) ifmd.ifmd_##fld = ifp->if_##fld COPY(pcount); COPY(flags); - COPY(data); + if_data_internal_to_if_data64(&ifp->if_data, &ifmd.ifmd_data); #undef COPY ifmd.ifmd_snd_len = ifp->if_snd.ifq_len; ifmd.ifmd_snd_maxlen = ifp->if_snd.ifq_maxlen; @@ -134,11 +131,12 @@ sysctl_ifdata SYSCTL_HANDLER_ARGS /* XXX bad syntax! */ error = SYSCTL_OUT(req, &ifmd, sizeof ifmd); if (error || !req->newptr) - return error; + break; +#ifdef IF_MIB_WR error = SYSCTL_IN(req, &ifmd, sizeof ifmd); if (error) - return error; + break; #define DONTCOPY(fld) ifmd.ifmd_data.ifi_##fld = ifp->if_data.ifi_##fld DONTCOPY(type); @@ -154,22 +152,72 @@ sysctl_ifdata SYSCTL_HANDLER_ARGS /* XXX bad syntax! */ ifp->if_snd.ifq_maxlen = ifmd.ifmd_snd_maxlen; ifp->if_snd.ifq_drops = ifmd.ifmd_snd_drops; #undef COPY +#endif /* IF_MIB_WR */ break; case IFDATA_LINKSPECIFIC: error = SYSCTL_OUT(req, ifp->if_linkmib, ifp->if_linkmiblen); if (error || !req->newptr) - return error; + break; +#ifdef IF_MIB_WR error = SYSCTL_IN(req, ifp->if_linkmib, ifp->if_linkmiblen); if (error) - return error; - + break; +#endif /* IF_MIB_WR */ + break; } - return 0; + + return error; +} + +int +sysctl_ifdata SYSCTL_HANDLER_ARGS /* XXX bad syntax! */ +{ + int *name = (int *)arg1; + int error = 0; + u_int namelen = arg2; + struct ifnet *ifp; + + if (namelen != 2) + return EINVAL; + + if (name[0] <= 0 || name[0] > if_index || + (ifp = ifindex2ifnet[name[0]]) == NULL) + return ENOENT; + + ifnet_lock_shared(ifp); + + error = make_ifmibdata(ifp, name, req); + + ifnet_lock_done(ifp); + + return error; } -SYSCTL_NODE(_net_link_generic, IFMIB_IFDATA, ifdata, CTLFLAG_RW, - sysctl_ifdata, "Interface table"); +int +sysctl_ifalldata SYSCTL_HANDLER_ARGS /* XXX bad syntax! 
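The make_ifmibdata()/sysctl_ifdata() split above serves the classic net.link.generic.ifdata table, now carrying the 64-bit if_data64 counters. A userland sketch of reading one row (assumptions: the conventional if_mib MIB path, with the third name component NETLINK_GENERIC spelled as its value 0, and interface index 1 present):

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/socket.h>
    #include <sys/sysctl.h>
    #include <net/if.h>
    #include <net/if_mib.h>

    int
    main(void)
    {
            struct ifmibdata ifmd;
            size_t len = sizeof(ifmd);
            /* net.link.generic.ifdata.<ifindex>.general */
            int mib[6] = { CTL_NET, PF_LINK, 0 /* NETLINK_GENERIC */,
                           IFMIB_IFDATA, 1 /* if_index */, IFDATA_GENERAL };

            if (sysctl(mib, 6, &ifmd, &len, NULL, 0) == 0)
                    printf("%s: %llu packets in, %llu out\n", ifmd.ifmd_name,
                        (unsigned long long)ifmd.ifmd_data.ifi_ipackets,
                        (unsigned long long)ifmd.ifmd_data.ifi_opackets);
            return 0;
    }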
*/ +{ + int *name = (int *)arg1; + int error = 0; + u_int namelen = arg2; + struct ifnet *ifp; + + if (namelen != 2) + return EINVAL; + + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + ifnet_lock_shared(ifp); + + error = make_ifmibdata(ifp, name, req); + + ifnet_lock_done(ifp); + if (error) + break; + } + ifnet_head_done(); + return error; +} #endif diff --git a/bsd/net/if_mib.h b/bsd/net/if_mib.h index dc7417e89..0175e68a9 100644 --- a/bsd/net/if_mib.h +++ b/bsd/net/if_mib.h @@ -54,16 +54,15 @@ #define _NET_IF_MIB_H 1 #include <sys/appleapiopts.h> -#ifdef __APPLE_API_UNSTABLE struct ifmibdata { - char ifmd_name[IFNAMSIZ]; /* name of interface */ - int ifmd_pcount; /* number of promiscuous listeners */ - int ifmd_flags; /* interface flags */ - int ifmd_snd_len; /* instantaneous length of send queue */ - int ifmd_snd_maxlen; /* maximum length of send queue */ - int ifmd_snd_drops; /* number of drops in send queue */ - int ifmd_filler[4]; /* for future expansion */ - struct if_data ifmd_data; /* generic information and statistics */ + char ifmd_name[IFNAMSIZ]; /* name of interface */ + unsigned int ifmd_pcount; /* number of promiscuous listeners */ + unsigned int ifmd_flags; /* interface flags */ + unsigned int ifmd_snd_len; /* instantaneous length of send queue */ + unsigned int ifmd_snd_maxlen; /* maximum length of send queue */ + unsigned int ifmd_snd_drops; /* number of drops in send queue */ + unsigned int ifmd_filler[4]; /* for future expansion */ + struct if_data64 ifmd_data; /* generic information and statistics */ }; /* @@ -71,12 +70,15 @@ struct ifmibdata { */ #define IFMIB_SYSTEM 1 /* non-interface-specific */ #define IFMIB_IFDATA 2 /* per-interface data table */ +#define IFMIB_IFALLDATA 3 /* all interfaces data at once */ /* * MIB tags for the various net.link.generic.ifdata tables */ -#define IFDATA_GENERAL 1 /* generic stats for all kinds of ifaces */ -#define IFDATA_LINKSPECIFIC 2 /* specific to the type of interface */ +#define IFDATA_GENERAL 1 /* generic stats for all kinds of ifaces */ +#define IFDATA_LINKSPECIFIC 2 /* specific to the type of interface */ +#define IFDATA_ADDRS 3 /* addresses assigned to interface */ +#define IFDATA_MULTIADDRS 4 /* multicast addresses assigned to interface */ /* * MIB tags at the net.link.generic.system level @@ -105,7 +107,7 @@ struct ifmibdata { */ /* For IFT_ETHER, IFT_ISO88023, and IFT_STARLAN, as used by RFC 1650 */ -struct ifmib_iso_8802_3 { +struct ifs_iso_8802_3 { u_int32_t dot3StatsAlignmentErrors; u_int32_t dot3StatsFCSErrors; u_int32_t dot3StatsSingleCollisionFrames; @@ -190,5 +192,4 @@ enum { * Put other types of interface MIBs here, or in interface-specific * header files if convenient ones already exist. */ -#endif /* __APPLE_API_UNSTABLE */ #endif /* _NET_IF_MIB_H */ diff --git a/bsd/net/if_pppvar.h b/bsd/net/if_pppvar.h index ec688a95b..35628358c 100644 --- a/bsd/net/if_pppvar.h +++ b/bsd/net/if_pppvar.h @@ -68,7 +68,7 @@ #warning if_pppvar.h is not used by the darwin kernel #endif -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE /* * Supported network protocols. 
These values are used for @@ -85,10 +85,10 @@ struct ppp_softc { /*hi*/ u_int sc_flags; /* control/status bits; see if_ppp.h */ struct callout_handle sc_ch; /* Used for scheduling timeouts */ void *sc_devp; /* pointer to device-dep structure */ - void (*sc_start) __P((struct ppp_softc *)); /* start output proc */ - void (*sc_ctlp) __P((struct ppp_softc *)); /* rcvd control pkt */ - void (*sc_relinq) __P((struct ppp_softc *)); /* relinquish ifunit */ - void (*sc_setmtu) __P((struct ppp_softc *)); /* set mtu */ + void (*sc_start)(struct ppp_softc *); /* start output proc */ + void (*sc_ctlp)(struct ppp_softc *); /* rcvd control pkt */ + void (*sc_relinq)(struct ppp_softc *); /* relinquish ifunit */ + void (*sc_setmtu)(struct ppp_softc *); /* set mtu */ short sc_mru; /* max receive unit */ pid_t sc_xfer; /* used in transferring unit */ /*hi*/ struct ifqueue sc_rawq; /* received packets */ @@ -128,13 +128,13 @@ struct ppp_softc { extern struct ppp_softc ppp_softc[]; -struct ppp_softc *pppalloc __P((pid_t pid)); -void pppdealloc __P((struct ppp_softc *sc)); -int pppioctl __P((struct ppp_softc *sc, u_long cmd, caddr_t data, - int flag, struct proc *p)); -int pppoutput __P((struct ifnet *ifp, struct mbuf *m0, - struct sockaddr *dst, struct rtentry *rtp)); -void ppp_restart __P((struct ppp_softc *sc)); -void ppppktin __P((struct ppp_softc *sc, struct mbuf *m, int lost)); -struct mbuf *ppp_dequeue __P((struct ppp_softc *sc)); +struct ppp_softc *pppalloc(pid_t pid); +void pppdealloc(struct ppp_softc *sc); +int pppioctl(struct ppp_softc *sc, u_long cmd, caddr_t data, + int flag, struct proc *p); +int pppoutput(struct ifnet *ifp, struct mbuf *m0, + struct sockaddr *dst, struct rtentry *rtp); +void ppp_restart(struct ppp_softc *sc); +void ppppktin(struct ppp_softc *sc, struct mbuf *m, int lost); +struct mbuf *ppp_dequeue(struct ppp_softc *sc); #endif /* __APPLE_API_PRIVATE */ diff --git a/bsd/net/if_sppp.h b/bsd/net/if_sppp.h deleted file mode 100644 index f0b6435bc..000000000 --- a/bsd/net/if_sppp.h +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * Defines for synchronous PPP/Cisco link level subroutines. - * - * Copyright (C) 1994 Cronyx Ltd. - * Author: Serge Vakulenko, <vak@cronyx.ru> - * - * Heavily revamped to conform to RFC 1661. - * Copyright (C) 1997, Joerg Wunsch. - * - * This software is distributed with NO WARRANTIES, not even the implied - * warranties for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
- * - * Authors grant any other persons or organizations permission to use - * or modify this software as long as this message is kept with the software, - * all derivative works or modified versions. - * - * From: Version 2.0, Fri Oct 6 20:39:21 MSK 1995 - * - */ - -#ifndef _NET_IF_SPPP_H_ -#define _NET_IF_SPPP_H_ 1 -#include <sys/appleapiopts.h> - -#ifndef DONT_WARN_OBSOLETE -#warning if_sppp.h is not used by the darwin kernel -#endif - -#define IDX_LCP 0 /* idx into state table */ - -struct slcp { - u_long opts; /* LCP options to send (bitfield) */ - u_long magic; /* local magic number */ - u_long mru; /* our max receive unit */ - u_long their_mru; /* their max receive unit */ - u_long protos; /* bitmask of protos that are started */ - u_char echoid; /* id of last keepalive echo request */ - /* restart max values, see RFC 1661 */ - int timeout; - int max_terminate; - int max_configure; - int max_failure; -}; - -#define IDX_IPCP 1 /* idx into state table */ - -struct sipcp { - u_long opts; /* IPCP options to send (bitfield) */ - u_int flags; -#define IPCP_HISADDR_SEEN 1 /* have seen his address already */ -#define IPCP_MYADDR_DYN 2 /* my address is dynamically assigned */ -#define IPCP_MYADDR_SEEN 4 /* have seen his address already */ -}; - -#define AUTHNAMELEN 32 -#define AUTHKEYLEN 16 - -struct sauth { - u_short proto; /* authentication protocol to use */ - u_short flags; -#define AUTHFLAG_NOCALLOUT 1 /* do not require authentication on */ - /* callouts */ -#define AUTHFLAG_NORECHALLENGE 2 /* do not re-challenge CHAP */ - u_char name[AUTHNAMELEN]; /* system identification name */ - u_char secret[AUTHKEYLEN]; /* secret password */ - u_char challenge[AUTHKEYLEN]; /* random challenge */ -}; - -#define IDX_PAP 2 -#define IDX_CHAP 3 - -#define IDX_COUNT (IDX_CHAP + 1) /* bump this when adding cp's! */ - -/* - * Don't change the order of this. Ordering the phases this way allows - * for a comparision of ``pp_phase >= PHASE_AUTHENTICATE'' in order to - * know whether LCP is up. - */ -enum ppp_phase { - PHASE_DEAD, PHASE_ESTABLISH, PHASE_TERMINATE, - PHASE_AUTHENTICATE, PHASE_NETWORK -}; - -#ifdef __APPLE_API_PRIVATE -struct sppp { - /* NB: pp_if _must_ be first */ - struct ifnet pp_if; /* network interface data */ - struct ifqueue pp_fastq; /* fast output queue */ - struct ifqueue pp_cpq; /* PPP control protocol queue */ - struct sppp *pp_next; /* next interface in keepalive list */ - u_int pp_flags; /* use Cisco protocol instead of PPP */ - u_short pp_alivecnt; /* keepalive packets counter */ - u_short pp_loopcnt; /* loopback detection counter */ - u_long pp_seq; /* local sequence number */ - u_long pp_rseq; /* remote sequence number */ - enum ppp_phase pp_phase; /* phase we're currently in */ - int state[IDX_COUNT]; /* state machine */ - u_char confid[IDX_COUNT]; /* id of last configuration request */ - int rst_counter[IDX_COUNT]; /* restart counter */ - int fail_counter[IDX_COUNT]; /* negotiation failure counter */ - struct callout_handle ch[IDX_COUNT]; /* per-proto and if callouts */ - struct callout_handle pap_my_to_ch; /* PAP needs one more... */ - struct slcp lcp; /* LCP params */ - struct sipcp ipcp; /* IPCP params */ - struct sauth myauth; /* auth params, i'm peer */ - struct sauth hisauth; /* auth params, i'm authenticator */ - /* - * These functions are filled in by sppp_attach(), and are - * expected to be used by the lower layer (hardware) drivers - * in order to communicate the (un)availability of the - * communication link. 
Lower layer drivers that are always - * ready to communicate (like hardware HDLC) can shortcut - * pp_up from pp_tls, and pp_down from pp_tlf. - */ - void (*pp_up)(struct sppp *sp); - void (*pp_down)(struct sppp *sp); - /* - * These functions need to be filled in by the lower layer - * (hardware) drivers if they request notification from the - * PPP layer whether the link is actually required. They - * correspond to the tls and tlf actions. - */ - void (*pp_tls)(struct sppp *sp); - void (*pp_tlf)(struct sppp *sp); - /* - * These (optional) functions may be filled by the hardware - * driver if any notification of established connections - * (currently: IPCP up) is desired (pp_con) or any internal - * state change of the interface state machine should be - * signaled for monitoring purposes (pp_chg). - */ - void (*pp_con)(struct sppp *sp); - void (*pp_chg)(struct sppp *sp, int new_state); - /* These two fields are for use by the lower layer */ - void *pp_lowerp; - int pp_loweri; -}; - -#endif /* __APPLE_API_PRIVATE */ - -#define PP_KEEPALIVE 0x01 /* use keepalive protocol */ -#define PP_CISCO 0x02 /* use Cisco protocol instead of PPP */ - /* 0x04 was PP_TIMO */ -#define PP_CALLIN 0x08 /* we are being called */ -#define PP_NEEDAUTH 0x10 /* remote requested authentication */ - - -#define PP_MTU 1500 /* default/minimal MRU */ -#define PP_MAX_MRU 2048 /* maximal MRU we want to negotiate */ - -/* - * Definitions to pass struct sppp data down into the kernel using the - * SIOC[SG]IFGENERIC ioctl interface. - * - * In order to use this, create a struct spppreq, fill in the cmd - * field with SPPPIOGDEFS, and put the address of this structure into - * the ifr_data portion of a struct ifreq. Pass this struct to a - * SIOCGIFGENERIC ioctl. Then replace the cmd field by SPPPIOCDEFS, - * modify the defs field as desired, and pass the struct ifreq now - * to a SIOCSIFGENERIC ioctl. 
- */ - -#define SPPPIOGDEFS ((caddr_t)(('S' << 24) + (1 << 16) + sizeof(struct sppp))) -#define SPPPIOSDEFS ((caddr_t)(('S' << 24) + (2 << 16) + sizeof(struct sppp))) - -struct spppreq { - int cmd; - struct sppp defs; -}; - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -void sppp_attach (struct ifnet *ifp); -void sppp_detach (struct ifnet *ifp); -void sppp_input (struct ifnet *ifp, struct mbuf *m); -int sppp_ioctl (struct ifnet *ifp, u_long cmd, void *data); -struct mbuf *sppp_dequeue (struct ifnet *ifp); -struct mbuf *sppp_pick(struct ifnet *ifp); -int sppp_isempty (struct ifnet *ifp); -void sppp_flush (struct ifnet *ifp); -#endif /* __APPLE_API_PRIVATE */ -#endif - -#endif /* _NET_IF_SPPP_H_ */ diff --git a/bsd/net/if_stf.c b/bsd/net/if_stf.c index 379550287..6bf96e349 100644 --- a/bsd/net/if_stf.c +++ b/bsd/net/if_stf.c @@ -83,15 +83,12 @@ #include <sys/protosw.h> #include <sys/kernel.h> #include <sys/syslog.h> -#include <machine/cpu.h> #include <sys/malloc.h> #include <net/if.h> #include <net/route.h> -#include <net/netisr.h> #include <net/if_types.h> -#include <net/if_stf.h> #include <netinet/in.h> #include <netinet/in_systm.h> @@ -118,7 +115,7 @@ struct stf_softc { struct ifnet sc_if; /* common area */ #ifdef __APPLE__ - struct if_proto *stf_proto; /* dlil protocol attached */ + u_long sc_protocol_family; /* dlil protocol attached */ #endif union { struct route __sc_ro4; @@ -131,10 +128,7 @@ struct stf_softc { static struct stf_softc *stf; #ifdef __APPLE__ -void stfattach __P((void)); -int stf_pre_output __P((struct ifnet *, register struct mbuf **, struct sockaddr *, - caddr_t, char *, char *, u_long)); -static u_long stf_dl_tag=0; +void stfattach (void); #endif #ifndef __APPLE__ @@ -142,147 +136,94 @@ static MALLOC_DEFINE(M_STF, "stf", "6to4 Tunnel Interface"); #endif static int ip_stf_ttl = 40; +static void in_stf_input(struct mbuf *, int); extern struct domain inetdomain; struct protosw in_stf_protosw = { SOCK_RAW, &inetdomain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR, in_stf_input, 0, 0, rip_ctloutput, 0, - 0, 0, 0, 0, 0, - &rip_usrreqs + &rip_usrreqs, + 0, rip_unlock, 0 }; -static int stf_encapcheck __P((const struct mbuf *, int, int, void *)); -static struct in6_ifaddr *stf_getsrcifa6 __P((struct ifnet *)); -int stf_pre_output __P((struct ifnet *, register struct mbuf **, struct sockaddr *, - caddr_t, char *, char *, u_long)); -static int stf_checkaddr4 __P((struct stf_softc *, struct in_addr *, - struct ifnet *)); -static int stf_checkaddr6 __P((struct stf_softc *, struct in6_addr *, - struct ifnet *)); -static void stf_rtrequest __P((int, struct rtentry *, struct sockaddr *)); -int stf_ioctl __P((struct ifnet *, u_long, void *)); - +static int stf_encapcheck(const struct mbuf *, int, int, void *); +static struct in6_ifaddr *stf_getsrcifa6(struct ifnet *); +int stf_pre_output(struct ifnet *, u_long, register struct mbuf **, + const struct sockaddr *, caddr_t, char *, char *); +static int stf_checkaddr4(struct stf_softc *, struct in_addr *, + struct ifnet *); +static int stf_checkaddr6(struct stf_softc *, struct in6_addr *, + struct ifnet *); +static void stf_rtrequest(int, struct rtentry *, struct sockaddr *); +int stf_ioctl(struct ifnet *, u_long, void *); static -int stf_add_if(struct ifnet *ifp) -{ - ifp->if_demux = 0; - ifp->if_framer = 0; - return 0; -} - -static -int stf_del_if(struct ifnet *ifp) -{ - return 0; -} - -static -int stf_add_proto(struct ddesc_head_str *desc_head, struct if_proto *proto, u_long dl_tag) +int stf_add_proto( + struct ifnet *ifp, + u_long protocol_family, + 
struct ddesc_head_str *desc_head) { /* Only one protocol may be attached at a time */ - struct stf_softc* stf = (struct stf_softc*)proto->ifp; - if (stf->stf_proto == NULL) - stf->stf_proto = proto; + struct stf_softc* stf = (struct stf_softc*)ifp; + if (stf->sc_protocol_family == 0) + stf->sc_protocol_family = protocol_family; else { printf("stf_add_proto: stf already has a proto\n"); - return (EBUSY); + return EBUSY; } - - return (0); + + return 0; } static -int stf_del_proto(struct if_proto *proto, u_long dl_tag) +int stf_del_proto( + struct ifnet *ifp, + u_long protocol_family) { - if (((struct stf_softc*)proto->ifp)->stf_proto == proto) - ((struct stf_softc*)proto->ifp)->stf_proto = NULL; + if (((struct stf_softc*)ifp)->sc_protocol_family == protocol_family) + ((struct stf_softc*)ifp)->sc_protocol_family = 0; else return ENOENT; return 0; } -int stf_shutdown() -{ - return 0; -} - -int stf_attach_inet6(struct ifnet *ifp, u_long *dl_tag) +static int +stf_attach_inet6(struct ifnet *ifp, u_long protocol_family) { struct dlil_proto_reg_str reg; - struct dlil_demux_desc desc; - short native=0; int stat, i; - if (stf_dl_tag != 0) { - *dl_tag = stf_dl_tag; - return 0; - } - + bzero(&reg, sizeof(reg)); TAILQ_INIT(&reg.demux_desc_head); - desc.type = DLIL_DESC_RAW; - desc.variants.bitmask.proto_id_length = 0; - desc.variants.bitmask.proto_id = 0; - desc.variants.bitmask.proto_id_mask = 0; - desc.native_type = (char *) &native; - TAILQ_INSERT_TAIL(&reg.demux_desc_head, &desc, next); reg.interface_family = ifp->if_family; reg.unit_number = ifp->if_unit; - reg.input = 0; reg.pre_output = stf_pre_output; - reg.event = 0; - reg.offer = 0; - reg.ioctl = 0; - reg.default_proto = 0; reg.protocol_family = PF_INET6; - stat = dlil_attach_protocol(&reg, &stf_dl_tag); - *dl_tag = stf_dl_tag; + stat = dlil_attach_protocol(&reg); return stat; } -int stf_detach_inet6(struct ifnet *ifp, u_long dl_tag) +static int +stf_demux( + struct ifnet *ifp, + struct mbuf *m, + char *frame_ptr, + u_long *protocol_family) { - int stat; - - stat = dlil_find_dltag(ifp->if_family, ifp->if_unit, AF_INET6, &dl_tag); - if (stat == 0) { - stat = dlil_detach_protocol(dl_tag); - if (stat) { - printf("WARNING: stf_detach can't detach IP AF_INET6 from interface\n"); - } - } - return (stat); + *protocol_family = PF_INET6; + return 0; } void stf_reg_if_mods() { - struct dlil_ifmod_reg_str stf_ifmod; - struct dlil_protomod_reg_str stf_protoreg; int error; - bzero(&stf_ifmod, sizeof(stf_ifmod)); - stf_ifmod.add_if = stf_add_if; - stf_ifmod.del_if = stf_del_if; - stf_ifmod.add_proto = stf_add_proto; - stf_ifmod.del_proto = stf_del_proto; - stf_ifmod.ifmod_ioctl = 0; - stf_ifmod.shutdown = stf_shutdown; - - - if (dlil_reg_if_modules(APPLE_IF_FAM_STF, &stf_ifmod)) - panic("Couldn't register stf modules\n"); - /* Register protocol registration functions */ - - bzero(&stf_protoreg, sizeof(stf_protoreg)); - stf_protoreg.attach_proto = stf_attach_inet6; - stf_protoreg.detach_proto = stf_detach_inet6; - - if ( error = dlil_reg_proto_module(AF_INET6, APPLE_IF_FAM_STF, &stf_protoreg) != 0) + if ( (error = dlil_reg_proto_module(AF_INET6, APPLE_IF_FAM_STF, stf_attach_inet6, NULL)) != 0) kprintf("dlil_reg_proto_module failed for AF_INET6 error=%d\n", error); } @@ -292,8 +233,6 @@ stfattach(void) struct ifnet *ifp; struct stf_softc *sc; int i, error; - - int err; const struct encaptab *p; @@ -323,6 +262,9 @@ stfattach(void) sc->sc_if.if_output = NULL; /* processing done in pre_output */ sc->sc_if.if_type = IFT_STF; sc->sc_if.if_family= APPLE_IF_FAM_STF; +
sc->sc_if.if_add_proto = stf_add_proto; + sc->sc_if.if_del_proto = stf_del_proto; + sc->sc_if.if_demux = stf_demux; #if 0 /* turn off ingress filter */ sc->sc_if.if_flags |= IFF_LINK2; @@ -409,6 +351,7 @@ stf_getsrcifa6(ifp) struct sockaddr_in6 *sin6; struct in_addr in; + ifnet_lock_shared(ifp); for (ia = ifp->if_addrlist.tqh_first; ia; ia = ia->ifa_list.tqe_next) @@ -422,6 +365,7 @@ stf_getsrcifa6(ifp) continue; bcopy(GET_V4(&sin6->sin6_addr), &in, sizeof(in)); + lck_mtx_lock(rt_mtx); for (ia4 = TAILQ_FIRST(&in_ifaddrhead); ia4; ia4 = TAILQ_NEXT(ia4, ia_link)) @@ -429,24 +373,27 @@ stf_getsrcifa6(ifp) if (ia4->ia_addr.sin_addr.s_addr == in.s_addr) break; } + lck_mtx_unlock(rt_mtx); if (ia4 == NULL) continue; + ifnet_lock_done(ifp); return (struct in6_ifaddr *)ia; } + ifnet_lock_done(ifp); return NULL; } int -stf_pre_output(ifp, m0, dst, rt, frame_type, address, dl_tag) - struct ifnet *ifp; - register struct mbuf **m0; - struct sockaddr *dst; - caddr_t rt; - char *frame_type; - char *address; - u_long dl_tag; +stf_pre_output( + struct ifnet *ifp, + u_long protocol_family, + register struct mbuf **m0, + const struct sockaddr *dst, + caddr_t rt, + char *frame_type, + char *address) { register struct mbuf *m = *m0; struct stf_softc *sc; @@ -506,14 +453,14 @@ stf_pre_output(ifp, m0, dst, rt, frame_type, address, dl_tag) * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer a to it). */ - struct mbuf m0; + struct mbuf m1; u_int32_t af = AF_INET6; - m0.m_next = m; - m0.m_len = 4; - m0.m_data = (char *)&af; + m1.m_next = m; + m1.m_len = 4; + m1.m_data = (char *)&af; - bpf_mtap(ifp, &m0); + bpf_mtap(ifp, &m1); } M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); @@ -544,7 +491,7 @@ stf_pre_output(ifp, m0, dst, rt, frame_type, address, dl_tag) dst4->sin_len = sizeof(struct sockaddr_in); bcopy(&ip->ip_dst, &dst4->sin_addr, sizeof(dst4->sin_addr)); if (sc->sc_ro.ro_rt) { - RTFREE(sc->sc_ro.ro_rt); + rtfree(sc->sc_ro.ro_rt); sc->sc_ro.ro_rt = NULL; } } @@ -559,6 +506,8 @@ stf_pre_output(ifp, m0, dst, rt, frame_type, address, dl_tag) error = ip_output(m, NULL, &sc->sc_ro, 0, NULL); if (error == 0) return EJUSTRETURN; + + return error; } static int @@ -583,15 +532,19 @@ stf_checkaddr4(sc, in, inifp) /* * reject packets with broadcast */ + lck_mtx_lock(rt_mtx); for (ia4 = TAILQ_FIRST(&in_ifaddrhead); ia4; ia4 = TAILQ_NEXT(ia4, ia_link)) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) continue; - if (in->s_addr == ia4->ia_broadaddr.sin_addr.s_addr) + if (in->s_addr == ia4->ia_broadaddr.sin_addr.s_addr) { + lck_mtx_unlock(rt_mtx); return -1; + } } + lck_mtx_unlock(rt_mtx); /* * perform ingress filter @@ -645,7 +598,7 @@ stf_checkaddr6(sc, in6, inifp) return 0; } -void +static void in_stf_input(m, off) struct mbuf *m; int off; @@ -654,8 +607,7 @@ in_stf_input(m, off) struct ip *ip; struct ip6_hdr *ip6; u_int8_t otos, itos; - int s, isr, proto; - struct ifqueue *ifq = NULL; + int proto; struct ifnet *ifp; ip = mtod(m, struct ip *); @@ -746,21 +698,9 @@ in_stf_input(m, off) * See net/if_gif.c for possible issues with packet processing * reorder due to extra queueing.
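 * (The removed lines below implemented the classic netisr hand-off:
 * enqueue on ip6intrq, then schednetisr(NETISR_IPV6). The replacement
 * hands the packet directly to the registered IPv6 input path via
 * proto_input(PF_INET6, m), so the splimp()/queue-full handling goes
 * away together with the extra queue.)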
*/ - ifq = &ip6intrq; - isr = NETISR_IPV6; - - s = splimp(); - if (IF_QFULL(ifq)) { - IF_DROP(ifq); /* update statistics */ - m_freem(m); - splx(s); - return; - } - IF_ENQUEUE(ifq, m); - schednetisr(isr); + proto_input(PF_INET6, m); ifp->if_ipackets++; ifp->if_ibytes += m->m_pkthdr.len; - splx(s); return; } @@ -798,8 +738,11 @@ stf_ioctl(ifp, cmd, data) } sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) { - ifa->ifa_rtrequest = stf_rtrequest; - ifp->if_flags |= IFF_UP; + if ( !(ifnet_flags( ifp ) & IFF_UP) ) { + /* do this only if the interface is not already up */ + ifa->ifa_rtrequest = stf_rtrequest; + ifnet_set_flags(ifp, IFF_UP, IFF_UP); + } } else error = EINVAL; break; diff --git a/bsd/net/if_stf.h b/bsd/net/if_stf.h deleted file mode 100644 index c374f426d..000000000 --- a/bsd/net/if_stf.h +++ /dev/null @@ -1,41 +0,0 @@ -/* $FreeBSD: src/sys/net/if_stf.h,v 1.1.2.1 2000/07/15 07:14:29 kris Exp $ */ -/* $KAME: if_stf.h,v 1.3 2000/03/25 07:23:33 sumikawa Exp $ */ - -/* - * Copyright (C) 2000 WIDE Project. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the project nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifndef _NET_IF_STF_H_ -#define _NET_IF_STF_H_ -#include <sys/appleapiopts.h> - -#ifdef __APPLE_API_PRIVATE -void in_stf_input __P((struct mbuf *, int)); -#endif /* __APPLE_API_PRIVATE */ - -#endif /* _NET_IF_STF_H_ */ diff --git a/bsd/net/if_tun.c b/bsd/net/if_tun.c deleted file mode 100644 index 8ed3afd27..000000000 --- a/bsd/net/if_tun.c +++ /dev/null @@ -1,764 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. 
- * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */ - -/* - * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk> - * Nottingham University 1987. - * - * This source may be freely distributed, however I would be interested - * in any changes that are made. - * - * This driver takes packets off the IP i/f and hands them up to a - * user process to have its wicked way with. This driver has it's - * roots in a similar driver written by Phil Cockcroft (formerly) at - * UCL. This driver is based much more on read/write/poll mode of - * operation though. - */ - -#include "tun.h" -#if NTUN > 0 - -#include "opt_devfs.h" -#include "opt_inet.h" - -#include <sys/param.h> -#include <sys/proc.h> -#include <sys/systm.h> -#include <sys/mbuf.h> -#include <sys/socket.h> -#include <sys/filio.h> -#include <sys/sockio.h> -#include <sys/ttycom.h> -#include <sys/poll.h> -#include <sys/signalvar.h> -#include <sys/filedesc.h> -#include <sys/kernel.h> -#include <sys/sysctl.h> -#if DEVFS -#include <sys/devfsext.h> -#endif /*DEVFS*/ -#include <sys/conf.h> -#include <sys/uio.h> -#include <sys/vnode.h> - -#include <net/if.h> -#include <net/if_types.h> -#include <net/netisr.h> -#include <net/route.h> - -#if INET -#include <netinet/in.h> -#include <netinet/in_var.h> -#endif - -#if INET6 -#include <netinet/ip6.h> -#include <netinet6/ip6_var.h> -#include <netinet6/in6_ifattach.h> -#endif /* INET6 */ - -#include "bpfilter.h" -#if NBPFILTER > 0 -#include <net/bpf.h> -#endif - -#include <net/if_tunvar.h> -#include <net/if_tun.h> - -static void tunattach __P((void *)); -PSEUDO_SET(tunattach, if_tun); - -#define TUNDEBUG if (tundebug) printf -static int tundebug = 0; -SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); - -static struct tun_softc tunctl[NTUN]; - -static int tunoutput __P((struct ifnet *, struct mbuf *, struct sockaddr *, - struct rtentry *rt)); -static int tunifioctl __P((struct ifnet *, u_long, caddr_t)); -static int tuninit __P((int, int, u_char)); - -static d_open_t tunopen; -static d_close_t tunclose; -static d_read_t tunread; -static d_write_t tunwrite; -static d_ioctl_t tunioctl; -static d_poll_t tunpoll; - -#define CDEV_MAJOR 52 -static struct cdevsw tun_cdevsw = { - tunopen, tunclose, tunread, tunwrite, - tunioctl, nullstop, noreset, nodevtotty, - tunpoll, nommap, nostrategy, "tun", NULL, -1 -}; - - -static int tun_devsw_installed; -#if DEVFS -static void *tun_devfs_token[NTUN]; -#endif - -#define minor_val(n) ((((n) & ~0xff) << 8) | ((n) & 0xff)) -#define dev_val(n) (((n) >> 8) | ((n) & 0xff)) - -static void -tunattach(dummy) - void *dummy; -{ - register int i; - struct ifnet *ifp; - dev_t dev; - - if ( tun_devsw_installed ) - return; - dev = makedev(CDEV_MAJOR, 0); - cdevsw_add(&dev, &tun_cdevsw, NULL); - tun_devsw_installed = 1; - for ( i = 0; i < NTUN; i++ ) { -#if DEVFS - tun_devfs_token[i] = devfs_add_devswf(&tun_cdevsw, minor_val(i), - DV_CHR, UID_UUCP, - GID_DIALER, 0600, - "tun%d", i); -#endif - tunctl[i].tun_flags = TUN_INITED; - - ifp = &tunctl[i].tun_if; - 
ifp->if_unit = i; - ifp->if_name = "tun"; - ifp->if_family = APPLE_IF_FAM_TUN; - ifp->if_mtu = TUNMTU; - ifp->if_ioctl = tunifioctl; - ifp->if_output = tunoutput; - ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; - ifp->if_type = IFT_PPP; /* necessary init value for IPv6 lladdr auto conf */ - ifp->if_snd.ifq_maxlen = ifqmaxlen; - if_attach(ifp); -#if NBPFILTER > 0 - bpfattach(ifp, DLT_NULL, sizeof(u_int)); -#endif - } -} - -/* - * tunnel open - must be superuser & the device must be - * configured in - */ -static int -tunopen(dev, flag, mode, p) - dev_t dev; - int flag, mode; - struct proc *p; -{ - struct ifnet *ifp; - struct tun_softc *tp; - register int unit, error; - - error = suser(p->p_ucred, &p->p_acflag); - if (error) - return (error); - - if ((unit = dev_val(minor(dev))) >= NTUN) - return (ENXIO); - tp = &tunctl[unit]; - if (tp->tun_flags & TUN_OPEN) - return EBUSY; - ifp = &tp->tun_if; - tp->tun_flags |= TUN_OPEN; - TUNDEBUG("%s%d: open\n", ifp->if_name, ifp->if_unit); - return (0); -} - -/* - * tunclose - close the device - mark i/f down & delete - * routing info - */ -static int -tunclose(dev, foo, bar, p) - dev_t dev; - int foo; - int bar; - struct proc *p; -{ - register int unit = dev_val(minor(dev)), s; - struct tun_softc *tp = &tunctl[unit]; - struct ifnet *ifp = &tp->tun_if; - struct mbuf *m; - - tp->tun_flags &= ~TUN_OPEN; - - /* - * junk all pending output - */ - do { - s = splimp(); - IF_DEQUEUE(&ifp->if_snd, m); - splx(s); - if (m) - m_freem(m); - } while (m); - - if (ifp->if_flags & IFF_UP) { - s = splimp(); - if_down(ifp); - if (ifp->if_flags & IFF_RUNNING) { - /* find internet addresses and delete routes */ - register struct ifaddr *ifa; - for (ifa = ifp->if_addrhead.tqh_first; ifa; - ifa = ifa->ifa_link.tqe_next) { - switch (ifa->ifa_addr->sa_family) { -#if INET - case AF_INET: -#endif -#if INET6 - case AF_INET6: -#endif - rtinit(ifa, (int)RTM_DELETE, - tp->tun_flags & TUN_DSTADDR ? RTF_HOST : 0); - break; - } - } - } - splx(s); - } - ifp->if_flags &= ~IFF_RUNNING; - funsetown(tp->tun_sigio); - selwakeup(&tp->tun_rsel); - selthreadclear(&tp->tun_rsel); - - TUNDEBUG ("%s%d: closed\n", ifp->if_name, ifp->if_unit); - return (0); -} - -static int -tuninit(unit, cmd, af) - int unit; - int cmd; - u_char af; -{ - struct tun_softc *tp = &tunctl[unit]; - struct ifnet *ifp = &tp->tun_if; - register struct ifaddr *ifa; - - TUNDEBUG("%s%d: tuninit\n", ifp->if_name, ifp->if_unit); - - ifp->if_flags |= IFF_UP | IFF_RUNNING; - getmicrotime(&ifp->if_lastchange); - - for (ifa = ifp->if_addrhead.tqh_first; ifa; - ifa = ifa->ifa_link.tqe_next) { -#if INET - if (ifa->ifa_addr->sa_family == AF_INET) { - struct sockaddr_in *si; - - si = (struct sockaddr_in *)ifa->ifa_addr; - if (si && si->sin_addr.s_addr) - tp->tun_flags |= TUN_IASET; - - si = (struct sockaddr_in *)ifa->ifa_dstaddr; - if (si && si->sin_addr.s_addr) - tp->tun_flags |= TUN_DSTADDR; - } -#endif - } - return 0; -} - -/* - * Process an ioctl request. 
- */ -int -tunifioctl(ifp, cmd, data) - struct ifnet *ifp; - u_long cmd; - caddr_t data; -{ - register struct ifreq *ifr = (struct ifreq *)data; - int error = 0, s; - - s = splimp(); - switch(cmd) { - case SIOCGIFSTATUS: - ifs = (struct ifstat *)data; - if (tp->tun_pid) - sprintf(ifs->ascii + strlen(ifs->ascii), - "\tOpened by PID %d\n", tp->tun_pid); - break; - case SIOCSIFADDR: - tuninit(ifp->if_unit); - TUNDEBUG("%s%d: address set\n", - ifp->if_name, ifp->if_unit); - break; - case SIOCSIFDSTADDR: -#if 0 -#if defined(INET6) && defined(__FreeBSD__) && __FreeBSD__ >= 3 - if (found_first_ifid == 0) - in6_ifattach_noifid(ifp); -#endif /* defined(INET6) && defined(__FreeBSD__) && __FreeBSD__ >= 3 */ -#endif - tuninit(ifp->if_unit, cmd, ifr->ifr_addr.sa_family); - break; - case SIOCSIFMTU: - ifp->if_mtu = ifr->ifr_mtu; - TUNDEBUG("%s%d: mtu set\n", - ifp->if_name, ifp->if_unit); - break; - case SIOCADDMULTI: - case SIOCDELMULTI: - break; - - case SIOCSIFFLAGS: - if ((ifp->if_flags & IFF_UP) != 0) - ifp->if_flags |= IFF_RUNNING; - else if ((ifp->if_flags & IFF_UP) == 0) - ifp->if_flags &= ~IFF_RUNNING; - break; - - default: - error = EINVAL; - } - splx(s); - return (error); -} - -/* - * tunoutput - queue packets from higher level ready to put out. - */ -/* Packet data format between tun and ppp is changed to enable checking of - * Address Family of sending packet. When INET6 is defined, 4byte AF field - * is appended to packet data as following. - * - * 0 1 2 3 4 5 6 7 8 ..... - * ------------------------------ - * | af | packet data ..... - * ------------------------------ - * ^^^^^^^^^^^^^ - * Newly added part. The size is sizeof(u_long). - * - * However, this is not adopted for tun -> ppp AF_INET packet for - * backword compatibility, because the ppp process may be an existing - * ip only supporting one. - * Also in ppp->tun case, when af value is unknown, (af > 255) is checked and - * if it is true, AF_INET is assumed. (the 4byte may be the head of - * AF_INET packet. Despite the byte order, the value must always be - * greater than 255, because of ip_len field or (ip_v and ip_hl) - * field. (Idea from Mr. Noritoshi Demize) - */ -int -tunoutput(ifp, m0, dst, rt) - struct ifnet *ifp; - struct mbuf *m0; - struct sockaddr *dst; - struct rtentry *rt; -{ - struct tun_softc *tp = &tunctl[ifp->if_unit]; - int s; - - TUNDEBUG ("%s%d: tunoutput\n", ifp->if_name, ifp->if_unit); - - if ((tp->tun_flags & TUN_READY) != TUN_READY) { - TUNDEBUG ("%s%d: not ready 0%o\n", ifp->if_name, - ifp->if_unit, tp->tun_flags); - m_freem (m0); - return EHOSTDOWN; - } - -#if NBPFILTER > 0 - /* BPF write needs to be handled specially */ - if (dst->sa_family == AF_UNSPEC) { - dst->sa_family = *(mtod(m0, int *)); - m0->m_len -= sizeof(int); - m0->m_pkthdr.len -= sizeof(int); - m0->m_data += sizeof(int); - } - - if (ifp->if_bpf) { - /* - * We need to prepend the address family as - * a four byte field. Cons up a dummy header - * to pacify bpf. This is safe because bpf - * will only read from the mbuf (i.e., it won't - * try to free it or keep a pointer to it). 
- */ - struct mbuf m; - u_int af = dst->sa_family; - - m.m_next = m0; - m.m_len = 4; - m.m_data = (char *)&af; - - bpf_mtap(ifp, &m); - } -#endif - - switch(dst->sa_family) { -#if defined(INET) || defined(INET6) -#if INET6 - case AF_INET6: - M_PREPEND(m0, sizeof(u_long) /* af field passed to upper */, - M_DONTWAIT); - if (m0 == 0) - return (ENOBUFS); - *mtod(m0, u_long *) = (u_long)dst->sa_family; - /* FALLTHROUGH */ -#endif /* INET6 */ -#if INET - case AF_INET: -#endif /* INET */ -#endif /* INET || INET6 */ - s = splimp(); - if (IF_QFULL(&ifp->if_snd)) { - IF_DROP(&ifp->if_snd); - m_freem(m0); - splx(s); - ifp->if_collisions++; - return (ENOBUFS); - } - ifp->if_obytes += m0->m_pkthdr.len; - IF_ENQUEUE(&ifp->if_snd, m0); - splx(s); - ifp->if_opackets++; - break; - default: - m_freem(m0); - return EAFNOSUPPORT; - } - - if (tp->tun_flags & TUN_RWAIT) { - tp->tun_flags &= ~TUN_RWAIT; - wakeup((caddr_t)tp); - } - if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) - pgsigio(tp->tun_sigio, SIGIO, 0); - selwakeup(&tp->tun_rsel); - return 0; -} - -/* - * the cdevsw interface is now pretty minimal. - */ -static int -tunioctl(dev, cmd, data, flag, p) - dev_t dev; - u_long cmd; - caddr_t data; - int flag; - struct proc *p; -{ - int unit = dev_val(minor(dev)), s; - struct tun_softc *tp = &tunctl[unit]; - struct tuninfo *tunp; - - switch (cmd) { - case TUNSIFINFO: - tunp = (struct tuninfo *)data; - tp->tun_if.if_mtu = tunp->mtu; - tp->tun_if.if_type = tunp->type; - tp->tun_if.if_baudrate = tunp->baudrate; - break; - case TUNGIFINFO: - tunp = (struct tuninfo *)data; - tunp->mtu = tp->tun_if.if_mtu; - tunp->type = tp->tun_if.if_type; - tunp->baudrate = tp->tun_if.if_baudrate; - break; - case TUNSDEBUG: - tundebug = *(int *)data; - break; - case TUNGDEBUG: - *(int *)data = tundebug; - break; - case FIONBIO: - break; - case FIOASYNC: - if (*(int *)data) - tp->tun_flags |= TUN_ASYNC; - else - tp->tun_flags &= ~TUN_ASYNC; - break; - case FIONREAD: - s = splimp(); - if (tp->tun_if.if_snd.ifq_head) { - struct mbuf *mb = tp->tun_if.if_snd.ifq_head; - for( *(int *)data = 0; mb != 0; mb = mb->m_next) - *(int *)data += mb->m_len; - } else - *(int *)data = 0; - splx(s); - break; - case FIOSETOWN: - return (fsetown(*(int *)data, &tp->tun_sigio)); - - case FIOGETOWN: - *(int *)data = fgetown(tp->tun_sigio); - return (0); - - /* This is deprecated, FIOSETOWN should be used instead. */ - case TIOCSPGRP: - return (fsetown(-(*(int *)data), &tp->tun_sigio)); - - /* This is deprecated, FIOGETOWN should be used instead. */ - case TIOCGPGRP: - *(int *)data = -fgetown(tp->tun_sigio); - return (0); - - default: - return (ENOTTY); - } - return (0); -} - -/* - * The cdevsw read interface - reads a packet at a time, or at - * least as much of a packet as can be read.
- */ -static int -tunread(dev, uio, flag) - dev_t dev; - struct uio *uio; - int flag; -{ - int unit = dev_val(minor(dev)); - struct tun_softc *tp = &tunctl[unit]; - struct ifnet *ifp = &tp->tun_if; - struct mbuf *m, *m0; - int error=0, len, s; - - TUNDEBUG ("%s%d: read\n", ifp->if_name, ifp->if_unit); - if ((tp->tun_flags & TUN_READY) != TUN_READY) { - TUNDEBUG ("%s%d: not ready 0%o\n", ifp->if_name, - ifp->if_unit, tp->tun_flags); - return EHOSTDOWN; - } - - tp->tun_flags &= ~TUN_RWAIT; - - s = splimp(); - do { - IF_DEQUEUE(&ifp->if_snd, m0); - if (m0 == 0) { - if (flag & IO_NDELAY) { - splx(s); - return EWOULDBLOCK; - } - tp->tun_flags |= TUN_RWAIT; - if( error = tsleep((caddr_t)tp, PCATCH | (PZERO + 1), - "tunread", 0)) { - splx(s); - return error; - } - } - } while (m0 == 0); - splx(s); - - while (m0 && uio->uio_resid > 0 && error == 0) { - len = min(uio->uio_resid, m0->m_len); - if (len == 0) - break; - error = uiomove(mtod(m0, caddr_t), len, uio); - MFREE(m0, m); - m0 = m; - } - - if (m0) { - TUNDEBUG("Dropping mbuf\n"); - m_freem(m0); - } - return error; -} - -/* - * the cdevsw write interface - an atomic write is a packet - or else! - */ -/* See top of tunoutput() about interface change between ppp process and - * tun. */ -static int -tunwrite(dev, uio, flag) - dev_t dev; - struct uio *uio; - int flag; -{ - int unit = dev_val(minor(dev)); - struct ifnet *ifp = &tunctl[unit].tun_if; - struct mbuf *top, **mp, *m; - int error=0, s, tlen, mlen; - u_long af; - u_int netisr_af; - struct ifqueue *afintrq = NULL; - - TUNDEBUG("%s%d: tunwrite\n", ifp->if_name, ifp->if_unit); - - if (uio->uio_resid < 0 || uio->uio_resid > TUNMRU) { - TUNDEBUG("%s%d: len=%d!\n", ifp->if_name, ifp->if_unit, - uio->uio_resid); - return EIO; - } - tlen = uio->uio_resid; - - /* get a header mbuf */ - MGETHDR(m, M_DONTWAIT, MT_DATA); - if (m == NULL) - return ENOBUFS; - if (tlen > MHLEN) { - MCLGET(m, M_DONTWAIT); - if ((m->m_flags & M_EXT) == 0) { - m_free(m); - return ENOBUFS; - } - mlen = m->m_ext.ext_size; - } else - mlen = MHLEN; - - top = 0; - mp = &top; - while (error == 0 && uio->uio_resid > 0) { - m->m_len = min(mlen, uio->uio_resid); - error = uiomove(mtod (m, caddr_t), m->m_len, uio); - *mp = m; - mp = &m->m_next; - if (uio->uio_resid > 0) { - MGET (m, M_DONTWAIT, MT_DATA); - if (m == 0) { - error = ENOBUFS; - break; - } - mlen = MLEN; - } - } - /* Change for checking Address Family of sending packet. */ - af = *mtod(top, u_long *); - switch (af) { -#if INET - case AF_INET: - netisr_af = NETISR_IP; - afintrq = &ipintrq; - break; -#endif /* INET */ -#if INET6 - case AF_INET6: - netisr_af = NETISR_IPV6; - afintrq = &ip6intrq; - break; -#endif /* INET6 */ - default: - if (af > 255) { /* see description at the top of tunoutput */ - af = AF_INET; - netisr_af = NETISR_IP; - afintrq = &ipintrq; - goto af_decided; - } - error = EAFNOSUPPORT; - break; - } - m_adj(top, sizeof(u_long)); /* remove af field passed from upper */ - tlen -= sizeof(u_long); - af_decided: - if (error) { - if (top) - m_freem (top); - return error; - } - - top->m_pkthdr.len = tlen; - top->m_pkthdr.rcvif = ifp; - -#if NBPFILTER > 0 - if (ifp->if_bpf) { - /* - * We need to prepend the address family as - * a four byte field. Cons up a dummy header - * to pacify bpf. This is safe because bpf - * will only read from the mbuf (i.e., it won't - * try to free it or keep a pointer to it).
- */ - struct mbuf m; - - m.m_next = top; - m.m_len = 4; - m.m_data = (char *)&af; - - bpf_mtap(ifp, &m); - } -#endif - - /* just for safety */ - if (!afintrq) - return EAFNOSUPPORT; - - s = splimp(); - if (IF_QFULL (afintrq)) { - IF_DROP(afintrq); - splx(s); - ifp->if_collisions++; - m_freem(top); - return ENOBUFS; - } - IF_ENQUEUE(afintrq, top); - splx(s); - ifp->if_ibytes += tlen; - ifp->if_ipackets++; - schednetisr(netisr_af); - return error; -} - -/* - * tunpoll - the poll interface, this is only useful on reads - * really. The write detect always returns true, write never blocks - * anyway, it either accepts the packet or drops it. - */ -static int -tunpoll(dev, events, wql, p) - dev_t dev; - int events; - void * wql; - struct proc *p; -{ - int unit = dev_val(minor(dev)), s; - struct tun_softc *tp = &tunctl[unit]; - struct ifnet *ifp = &tp->tun_if; - int revents = 0; - - s = splimp(); - TUNDEBUG("%s%d: tunpoll\n", ifp->if_name, ifp->if_unit); - - if (events & (POLLIN | POLLRDNORM)) - if (ifp->if_snd.ifq_len > 0) { - TUNDEBUG("%s%d: tunpoll q=%d\n", ifp->if_name, - ifp->if_unit, ifp->if_snd.ifq_len); - revents |= events & (POLLIN | POLLRDNORM); - } else { - TUNDEBUG("%s%d: tunpoll waiting\n", ifp->if_name, - ifp->if_unit); - selrecord(p, &tp->tun_rsel, wql); - } - - if (events & (POLLOUT | POLLWRNORM)) - revents |= events & (POLLOUT | POLLWRNORM); - - splx(s); - return (revents); -} - - -#endif /* NTUN */ diff --git a/bsd/net/if_tun.h b/bsd/net/if_tun.h deleted file mode 100644 index 23731386f..000000000 --- a/bsd/net/if_tun.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* $NetBSD: if_tun.h,v 1.5 1994/06/29 06:36:27 cgd Exp $ */ - -/* - * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk> - * Nottingham University 1987. - * - * This source may be freely distributed, however I would be interested - * in any changes that are made. - * - * This driver takes packets off the IP i/f and hands them up to a - * user process to have its wicked way with. This driver has it's - * roots in a similar driver written by Phil Cockcroft (formerly) at - * UCL. This driver is based much more on read/write/select mode of - * operation though. - * - */ - -#ifndef _NET_IF_TUN_H_ -#define _NET_IF_TUN_H_ -#include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE - -/* Refer to if_tunvar.h for the softc stuff */ - -/* Maximum transmit packet size (default) */ -#define TUNMTU 1500 - -/* Maximum receive packet size (hard limit) */ -#define TUNMRU 16384 - -struct tuninfo { - int baudrate; /* linespeed */ - short mtu; /* maximum transmission unit */ - u_char type; /* ethernet, tokenring, etc.
*/ - u_char dummy; /* place holder */ -}; - -/* ioctl's for get/set debug */ -#define TUNSDEBUG _IOW('t', 90, int) -#define TUNGDEBUG _IOR('t', 89, int) -#define TUNSIFINFO _IOW('t', 91, struct tuninfo) -#define TUNGIFINFO _IOR('t', 92, struct tuninfo) - -#endif /* __APPLE_API_PRIVATE */ -#endif /* !_NET_IF_TUN_H_ */ diff --git a/bsd/net/if_tunvar.h b/bsd/net/if_tunvar.h deleted file mode 100644 index 9b93e0867..000000000 --- a/bsd/net/if_tunvar.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/*- - * Copyright (c) 1998 Brian Somers <brian@Awfulhak.org> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - */ - -#ifndef _NET_IF_TUNVAR_H_ -#define _NET_IF_TUNVAR_H_ -#include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE - -struct tun_softc { - u_short tun_flags; /* misc flags */ -#define TUN_OPEN 0x0001 -#define TUN_INITED 0x0002 -#define TUN_RCOLL 0x0004 -#define TUN_IASET 0x0008 -#define TUN_DSTADDR 0x0010 -#define TUN_RWAIT 0x0040 -#define TUN_ASYNC 0x0080 - -#define TUN_READY (TUN_OPEN | TUN_INITED) - - struct ifnet tun_if; /* the interface */ - struct sigio *tun_sigio; /* information for async I/O */ - struct selinfo tun_rsel; /* read select */ - struct selinfo tun_wsel; /* write select (not used) */ -}; - -#endif /* __APPLE_API_PRIVATE */ -#endif /* !_NET_IF_TUNVAR_H_ */ diff --git a/bsd/net/if_types.h b/bsd/net/if_types.h index befa53907..282c0f811 100644 --- a/bsd/net/if_types.h +++ b/bsd/net/if_types.h @@ -63,6 +63,8 @@ * Interface types for benefit of parsing media address headers. * This list is derived from the SNMP list of ifTypes, currently * documented in RFC1573. + * The current list of assignments is maintained at: + * http://www.iana.org/assignments/smi-numbers */ #define IFT_OTHER 0x1 /* none of the following */ @@ -123,7 +125,14 @@ #define IFT_FAITH 0x38 /*0xf2*/ #define IFT_STF 0x39 /*0xf3*/ #define IFT_L2VLAN 0x87 /* Layer 2 Virtual LAN using 802.1Q */ - +#define IFT_IEEE8023ADLAG 0x88 /* IEEE802.3ad Link Aggregate */ #define IFT_IEEE1394 0x90 /* IEEE1394 High Performance SerialBus*/ +/* + * These are not based on IANA assignments: + * Note: IFT_STF has a defined ifType: 0xd7 (215), but we use 0x39. + */ +#define IFT_GIF 0x37 /*0xf0*/ +#define IFT_FAITH 0x38 /*0xf2*/ +#define IFT_STF 0x39 /*0xf3*/ #endif diff --git a/bsd/net/if_var.h b/bsd/net/if_var.h index 1f813e917..8ac2039fb 100644 --- a/bsd/net/if_var.h +++ b/bsd/net/if_var.h @@ -57,7 +57,18 @@ #ifndef _NET_IF_VAR_H_ #define _NET_IF_VAR_H_ + #include <sys/appleapiopts.h> +#include <sys/types.h> +#include <sys/time.h> +#include <sys/queue.h> /* get TAILQ macros */ +#ifdef KERNEL_PRIVATE +#include <kern/locks.h> +#endif /* KERNEL_PRIVATE */ + +#ifdef KERNEL +#include <net/kpi_interface.h> +#endif KERNEL #ifdef __APPLE__ #define APPLE_IF_FAM_LOOPBACK 1 @@ -73,7 +84,15 @@ #define APPLE_IF_FAM_FAITH 11 #define APPLE_IF_FAM_STF 12 #define APPLE_IF_FAM_FIREWIRE 13 -#endif +#define APPLE_IF_FAM_BOND 14 +#endif __APPLE__ + +/* + * 72 was chosen below because it is the size of a TCP/IP + * header (40) + the minimum mss (32). + */ +#define IF_MINMTU 72 +#define IF_MAXMTU 65535 /* * Structures defining a network interface, providing a packet @@ -100,122 +119,210 @@ * interfaces. These routines live in the files if.c and route.c */ -#ifdef __STDC__ -/* - * Forward structure declarations for function prototypes [sic]. - */ -struct mbuf; -struct proc; -struct rtentry; -struct socket; -struct ether_header; -struct sockaddr_dl; -#endif - #define IFNAMSIZ 16 -#include <sys/queue.h> /* get TAILQ macros */ - -#ifdef __APPLE_API_UNSTABLE -#ifdef __APPLE__ -struct tqdummy; - -TAILQ_HEAD(tailq_head, tqdummy); - /* This belongs up in socket.h or socketvar.h, depending on how far the * event bubbles up. 
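 * (net_event_data is the common payload of the network kernel events
 * delivered through the kev_msg mechanism; the if_family/if_unit/if_name
 * triple identifies which interface an event refers to.)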
*/ struct net_event_data { - u_long if_family; - u_long if_unit; + unsigned long if_family; + unsigned long if_unit; char if_name[IFNAMSIZ]; }; -#endif - - -TAILQ_HEAD(ifnethead, ifnet); /* we use TAILQs so that the order of */ -TAILQ_HEAD(ifaddrhead, ifaddr); /* instantiation is preserved in the list */ -TAILQ_HEAD(ifprefixhead, ifprefix); -LIST_HEAD(ifmultihead, ifmultiaddr); +/* + * Structure describing information about an interface + * which may be of interest to management entities. + */ +struct if_data { + /* generic interface information */ + unsigned char ifi_type; /* ethernet, tokenring, etc */ #ifdef __APPLE__ -#ifdef KERNEL_PRIVATE -/* bottom 16 bits reserved for hardware checksum */ -#define IF_HWASSIST_CSUM_IP 0x0001 /* will csum IP */ -#define IF_HWASSIST_CSUM_TCP 0x0002 /* will csum TCP */ -#define IF_HWASSIST_CSUM_UDP 0x0004 /* will csum UDP */ -#define IF_HWASSIST_CSUM_IP_FRAGS 0x0008 /* will csum IP fragments */ -#define IF_HWASSIST_CSUM_FRAGMENT 0x0010 /* will do IP fragmentation */ -#define IF_HWASSIST_CSUM_TCP_SUM16 0x1000 /* simple TCP Sum16 computation */ -#define IF_HWASSIST_CSUM_MASK 0xffff -#define IF_HWASSIST_CSUM_FLAGS(hwassist) ((hwassist) & IF_HWASSIST_CSUM_MASK) + unsigned char ifi_typelen; /* Length of frame type id */ +#endif + unsigned char ifi_physical; /* e.g., AUI, Thinnet, 10base-T, etc */ + unsigned char ifi_addrlen; /* media address length */ + unsigned char ifi_hdrlen; /* media header length */ + unsigned char ifi_recvquota; /* polling quota for receive intrs */ + unsigned char ifi_xmitquota; /* polling quota for xmit intrs */ + unsigned char ifi_unused1; /* for future use */ + unsigned long ifi_mtu; /* maximum transmission unit */ + unsigned long ifi_metric; /* routing metric (external only) */ + unsigned long ifi_baudrate; /* linespeed */ + /* volatile statistics */ + unsigned long ifi_ipackets; /* packets received on interface */ + unsigned long ifi_ierrors; /* input errors on interface */ + unsigned long ifi_opackets; /* packets sent on interface */ + unsigned long ifi_oerrors; /* output errors on interface */ + unsigned long ifi_collisions; /* collisions on csma interfaces */ + unsigned long ifi_ibytes; /* total number of octets received */ + unsigned long ifi_obytes; /* total number of octets sent */ + unsigned long ifi_imcasts; /* packets received via multicast */ + unsigned long ifi_omcasts; /* packets sent via multicast */ + unsigned long ifi_iqdrops; /* dropped on input, this interface */ + unsigned long ifi_noproto; /* destined for unsupported protocol */ + unsigned long ifi_recvtiming; /* usec spent receiving when timing */ + unsigned long ifi_xmittiming; /* usec spent xmitting when timing */ + struct timeval ifi_lastchange; /* time of last administrative change */ + unsigned long ifi_unused2; /* used to be the default_proto */ + unsigned long ifi_hwassist; /* HW offload capabilities */ + unsigned long ifi_reserved1; /* for future use */ + unsigned long ifi_reserved2; /* for future use */ +}; -/* VLAN support */ -#define IF_HWASSIST_VLAN_TAGGING 0x10000 /* supports VLAN tagging */ -#define IF_HWASSIST_VLAN_MTU 0x20000 /* supports VLAN MTU-sized packet (for software VLAN) */ -#endif KERNEL_PRIVATE /* * Structure describing information about an interface * which may be of interest to management entities. 
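 * (if_data64 mirrors if_data but widens the volatile traffic counters to
 * u_int64_t so byte and packet counts on fast interfaces do not wrap; it
 * is the type now exported through ifmd_data in the revised struct
 * ifmibdata in the if_mib.h hunk above.)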
*/ -struct if_data +struct if_data64 { /* generic interface information */ u_char ifi_type; /* ethernet, tokenring, etc */ #ifdef __APPLE__ u_char ifi_typelen; /* Length of frame type id */ #endif - u_char ifi_physical; /* e.g., AUI, Thinnet, 10base-T, etc */ - u_char ifi_addrlen; /* media address length */ + u_char ifi_physical; /* e.g., AUI, Thinnet, 10base-T, etc */ + u_char ifi_addrlen; /* media address length */ + u_char ifi_hdrlen; /* media header length */ + u_char ifi_recvquota; /* polling quota for receive intrs */ + u_char ifi_xmitquota; /* polling quota for xmit intrs */ + u_char ifi_unused1; /* for future use */ + u_long ifi_mtu; /* maximum transmission unit */ + u_long ifi_metric; /* routing metric (external only) */ + u_int64_t ifi_baudrate; /* linespeed */ + /* volatile statistics */ + u_int64_t ifi_ipackets; /* packets received on interface */ + u_int64_t ifi_ierrors; /* input errors on interface */ + u_int64_t ifi_opackets; /* packets sent on interface */ + u_int64_t ifi_oerrors; /* output errors on interface */ + u_int64_t ifi_collisions; /* collisions on csma interfaces */ + u_int64_t ifi_ibytes; /* total number of octets received */ + u_int64_t ifi_obytes; /* total number of octets sent */ + u_int64_t ifi_imcasts; /* packets received via multicast */ + u_int64_t ifi_omcasts; /* packets sent via multicast */ + u_int64_t ifi_iqdrops; /* dropped on input, this interface */ + u_int64_t ifi_noproto; /* destined for unsupported protocol */ + u_long ifi_recvtiming; /* usec spent receiving when timing */ + u_long ifi_xmittiming; /* usec spent xmitting when timing */ + struct timeval ifi_lastchange; /* time of last administrative change */ +}; + +#ifdef PRIVATE +/* + * Internal storage of if_data. This is bound to change. Various places in the + * stack will translate this data structure into the externally visible + * if_data structure above.
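+ * A minimal sketch of one such translation, the widening copy invoked as
+ * if_data_internal_to_if_data64() by the if_mib handler earlier in this
+ * patch (the exact signature is assumed here, not quoted):
+ *     void
+ *     if_data_internal_to_if_data64(const struct if_data_internal *src,
+ *         struct if_data64 *dst)
+ *     {
+ *         dst->ifi_type = src->ifi_type;
+ *         dst->ifi_mtu = src->ifi_mtu;
+ *         dst->ifi_ipackets = src->ifi_ipackets;
+ *         dst->ifi_ibytes = src->ifi_ibytes;
+ *         ... and so on, field by field; the 64-bit counters copy
+ *         through unchanged, while the narrow if_data variant would
+ *         truncate them to unsigned long.
+ *     }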
+ */ +struct if_data_internal { + /* generic interface information */ + u_char ifi_type; /* ethernet, tokenring, etc */ + u_char ifi_typelen; /* Length of frame type id */ + u_char ifi_physical; /* e.g., AUI, Thinnet, 10base-T, etc */ + u_char ifi_addrlen; /* media address length */ u_char ifi_hdrlen; /* media header length */ - u_char ifi_recvquota; /* polling quota for receive intrs */ - u_char ifi_xmitquota; /* polling quota for xmit intrs */ + u_char ifi_recvquota; /* polling quota for receive intrs */ + u_char ifi_xmitquota; /* polling quota for xmit intrs */ + u_char ifi_unused1; /* for future use */ u_long ifi_mtu; /* maximum transmission unit */ u_long ifi_metric; /* routing metric (external only) */ - u_long ifi_baudrate; /* linespeed */ + u_long ifi_baudrate; /* linespeed */ /* volatile statistics */ - u_long ifi_ipackets; /* packets received on interface */ - u_long ifi_ierrors; /* input errors on interface */ - u_long ifi_opackets; /* packets sent on interface */ - u_long ifi_oerrors; /* output errors on interface */ - u_long ifi_collisions; /* collisions on csma interfaces */ - u_long ifi_ibytes; /* total number of octets received */ - u_long ifi_obytes; /* total number of octets sent */ - u_long ifi_imcasts; /* packets received via multicast */ - u_long ifi_omcasts; /* packets sent via multicast */ - u_long ifi_iqdrops; /* dropped on input, this interface */ - u_long ifi_noproto; /* destined for unsupported protocol */ -#ifdef __APPLE__ + u_int64_t ifi_ipackets; /* packets received on interface */ + u_int64_t ifi_ierrors; /* input errors on interface */ + u_int64_t ifi_opackets; /* packets sent on interface */ + u_int64_t ifi_oerrors; /* output errors on interface */ + u_int64_t ifi_collisions; /* collisions on csma interfaces */ + u_int64_t ifi_ibytes; /* total number of octets received */ + u_int64_t ifi_obytes; /* total number of octets sent */ + u_int64_t ifi_imcasts; /* packets received via multicast */ + u_int64_t ifi_omcasts; /* packets sent via multicast */ + u_int64_t ifi_iqdrops; /* dropped on input, this interface */ + u_int64_t ifi_noproto; /* destined for unsupported protocol */ u_long ifi_recvtiming; /* usec spent receiving when timing */ u_long ifi_xmittiming; /* usec spent xmitting when timing */ -#endif +#define IF_LASTCHANGEUPTIME 1 /* lastchange: 1-uptime 0-calendar time */ struct timeval ifi_lastchange; /* time of last administrative change */ -#ifdef __APPLE__ - u_long default_proto; /* Default dl_tag when none is specified - * on dlil_output */ -#endif u_long ifi_hwassist; /* HW offload capabilities */ -#ifdef KERNEL_PRIVATE - u_short ifi_nvlans; /* number of attached vlans */ - u_short ifi_reserved_1; /* for future use */ -#else KERNEL_PRIVATE - u_long ifi_reserved1; /* for future use */ -#endif KERNEL_PRIVATE - u_long ifi_reserved2; /* for future use */ }; -#endif + +#define if_mtu if_data.ifi_mtu +#define if_type if_data.ifi_type +#define if_typelen if_data.ifi_typelen +#define if_physical if_data.ifi_physical +#define if_addrlen if_data.ifi_addrlen +#define if_hdrlen if_data.ifi_hdrlen +#define if_metric if_data.ifi_metric +#define if_baudrate if_data.ifi_baudrate +#define if_hwassist if_data.ifi_hwassist +#define if_ipackets if_data.ifi_ipackets +#define if_ierrors if_data.ifi_ierrors +#define if_opackets if_data.ifi_opackets +#define if_oerrors if_data.ifi_oerrors +#define if_collisions if_data.ifi_collisions +#define if_ibytes if_data.ifi_ibytes +#define if_obytes if_data.ifi_obytes +#define if_imcasts if_data.ifi_imcasts +#define if_omcasts 
if_data.ifi_omcasts +#define if_iqdrops if_data.ifi_iqdrops +#define if_noproto if_data.ifi_noproto +#define if_lastchange if_data.ifi_lastchange +#define if_recvquota if_data.ifi_recvquota +#define if_xmitquota if_data.ifi_xmitquota +#define if_iflags if_data.ifi_iflags + +struct mbuf; +struct ifaddr; +TAILQ_HEAD(ifnethead, ifnet); /* we use TAILQs so that the order of */ +TAILQ_HEAD(ifaddrhead, ifaddr); /* instantiation is preserved in the list */ +TAILQ_HEAD(ifprefixhead, ifprefix); +LIST_HEAD(ifmultihead, ifmultiaddr); +struct tqdummy; +TAILQ_HEAD(tailq_head, tqdummy); + +/* + * Forward structure declarations for function prototypes [sic]. + */ +struct proc; +struct rtentry; +struct socket; +struct ether_header; +struct sockaddr_dl; +struct ifnet_filter; + +TAILQ_HEAD(ifnet_filter_head, ifnet_filter); +TAILQ_HEAD(ddesc_head_name, dlil_demux_desc); + +/* bottom 16 bits reserved for hardware checksum */ +#define IF_HWASSIST_CSUM_IP 0x0001 /* will csum IP */ +#define IF_HWASSIST_CSUM_TCP 0x0002 /* will csum TCP */ +#define IF_HWASSIST_CSUM_UDP 0x0004 /* will csum UDP */ +#define IF_HWASSIST_CSUM_IP_FRAGS 0x0008 /* will csum IP fragments */ +#define IF_HWASSIST_CSUM_FRAGMENT 0x0010 /* will do IP fragmentation */ +#define IF_HWASSIST_CSUM_TCP_SUM16 0x1000 /* simple TCP Sum16 computation */ +#define IF_HWASSIST_CSUM_MASK 0xffff +#define IF_HWASSIST_CSUM_FLAGS(hwassist) ((hwassist) & IF_HWASSIST_CSUM_MASK) + +/* VLAN support */ +#define IF_HWASSIST_VLAN_TAGGING 0x10000 /* supports VLAN tagging */ +#define IF_HWASSIST_VLAN_MTU 0x20000 /* supports VLAN MTU-sized packet (for software VLAN) */ + +#define IFNET_RW_LOCK 1 /* * Structure defining a queue for a network interface. */ struct ifqueue { - struct mbuf *ifq_head; - struct mbuf *ifq_tail; + void *ifq_head; + void *ifq_tail; int ifq_len; int ifq_maxlen; int ifq_drops; }; +struct ddesc_head_str; +struct proto_hash_entry; +struct kev_msg; + /* * Structure defining a network interface. * @@ -223,13 +330,16 @@ struct ifqueue { */ struct ifnet { void *if_softc; /* pointer to driver state */ - char *if_name; /* name, e.g. ``en'' or ``lo'' */ + const char *if_name; /* name, e.g. 
 /*
  * Structure defining a network interface.
  *
@@ -223,13 +330,16 @@ struct ifqueue {
  */
 struct ifnet {
 	void	*if_softc;	/* pointer to driver state */
-	char	*if_name;	/* name, e.g. ``en'' or ``lo'' */
+	const char *if_name;	/* name, e.g. ``en'' or ``lo'' */
 	TAILQ_ENTRY(ifnet) if_link;	/* all struct ifnets are chained */
 	struct	ifaddrhead if_addrhead;	/* linked list of addresses per if */
-#ifdef __APPLE__
-	struct	tailq_head proto_head;	/* Head for if_proto structures */
-#endif
-	int	if_pcount;	/* number of promiscuous listeners */
+	u_long	if_refcnt;
+#ifdef __KPI_INTERFACE__
+	ifnet_check_multi	if_check_multi;
+#else
+	void*	if_check_multi;
+#endif /* __KPI_INTERFACE__ */
+	int	if_pcount;	/* number of promiscuous listeners */
 	struct	bpf_if *if_bpf;	/* packet filter structure */
 	u_short	if_index;	/* numeric abbreviation for this if */
 	short	if_unit;	/* sub-unit for lower level driver */
@@ -238,118 +348,109 @@ struct ifnet {
 	int	if_ipending;	/* interrupts pending */
 	void	*if_linkmib;	/* link-type-specific MIB data */
 	size_t	if_linkmiblen;	/* length of above data */
-	struct	if_data if_data;
+	struct	if_data_internal if_data;
 
-#ifdef __APPLE__
-	/* New with DLIL */
+#ifdef BSD_KERNEL_PRIVATE
+	int	if_usecnt;
+#else
 	int	refcnt;
+#endif
 	int	offercnt;
-	int	(*if_output)(struct ifnet *ifnet_ptr, struct mbuf *m);
-	int	(*if_ioctl)(struct ifnet *ifnet_ptr, u_long ioctl_code, void *ioctl_arg);
-	int	(*if_set_bpf_tap)(struct ifnet *ifp, int mode,
-			int (*bpf_callback)(struct ifnet *, struct mbuf *));
-	int	(*if_free)(struct ifnet *ifnet_ptr);
-	int	(*if_demux)(struct ifnet *ifnet_ptr, struct mbuf *mbuf_ptr,
-			char *frame_ptr, void *if_proto_ptr);
-
-	int	(*if_event)(struct ifnet *ifnet_ptr, caddr_t event_ptr);
-
-	int	(*if_framer)(struct ifnet *ifp,
-			struct mbuf **m,
-			struct sockaddr *dest,
-			char *dest_linkaddr,
-			char *frame_type);
+#ifdef __KPI_INTERFACE__
+	ifnet_output_func	if_output;
+	ifnet_ioctl_func	if_ioctl;
+	ifnet_set_bpf_tap	if_set_bpf_tap;
+	ifnet_detached_func	if_free;
+	ifnet_demux_func	if_demux;
+	ifnet_event_func	if_event;
+	ifnet_framer_func	if_framer;
+	ifnet_family_t		if_family;	/* ulong assigned by Apple */
+#else
+	void*	if_output;
+	void*	if_ioctl;
+	void*	if_set_bpf_tap;
+	void*	if_free;
+	void*	if_demux;
+	void*	if_event;
+	void*	if_framer;
+	u_long	if_family;	/* ulong assigned by Apple */
+#endif
 
-	u_long	if_family;	/* ulong assigned by Apple */
-	struct	tailq_head if_flt_head;
+	struct	ifnet_filter_head if_flt_head;
 
 	/* End DLIL specific */
-	void	*reserved0;	/* for future use */
+	u_long	if_delayed_detach;	/* need to perform delayed detach */
 	void	*if_private;	/* private to interface */
 	long	if_eflags;	/* autoaddr, autoaddr done, etc. */
-#endif /* __APPLE__ */
 	struct	ifmultihead if_multiaddrs; /* multicast addresses configured */
 	int	if_amcount;	/* number of all-multicast requests */
 /* procedure handles */
-#ifndef __APPLE__
-	int	(*if_output)	/* output routine (enqueue) */
-		__P((struct ifnet *, struct mbuf *, struct sockaddr *,
-		     struct rtentry *));
-	void	(*if_start)	/* initiate output routine */
-		__P((struct ifnet *));
-	int	(*if_done)	/* output complete routine */
-		__P((struct ifnet *));	/* (XXX not used; fake prototype) */
-	int	(*if_ioctl)	/* ioctl routine */
-		__P((struct ifnet *, u_long, caddr_t));
-	void	(*if_watchdog)	/* timer routine */
-		__P((struct ifnet *));
-#endif
-	int	(*if_poll_recv)	/* polled receive routine */
-		__P((struct ifnet *, int *));
-	int	(*if_poll_xmit)	/* polled transmit routine */
-		__P((struct ifnet *, int *));
-	void	(*if_poll_intren)	/* polled interrupt reenable routine */
-		__P((struct ifnet *));
-	void	(*if_poll_slowinput)	/* input routine for slow devices */
-		__P((struct ifnet *, struct mbuf *));
-	void	(*if_init)	/* Init routine */
-		__P((void *));
-	int	(*if_resolvemulti)	/* validate/resolve multicast */
-		__P((struct ifnet *, struct sockaddr **, struct sockaddr *));
-	struct	ifqueue if_snd;	/* output queue */
-	struct	ifqueue *if_poll_slowq;	/* input queue for slow devices */
+#ifdef __KPI_INTERFACE__
+	union {
+		int	(*original)(struct ifnet *ifp, u_long protocol_family,
+				    struct ddesc_head_str *demux_desc_head);
+		ifnet_add_proto_func	kpi;
+	} if_add_proto_u;
+	ifnet_del_proto_func	if_del_proto;
+#else /* __KPI_INTERFACE__ */
+	void*	if_add_proto;
+	void*	if_del_proto;
+#endif /* __KPI_INTERFACE__ */
+	struct proto_hash_entry	*if_proto_hash;
+	void	*if_kpi_storage;
+
+	void	*unused_was_init;
+	void	*unused_was_resolvemulti;
+
+	struct	ifqueue if_snd;
+	u_long	unused_2[1];
 #ifdef __APPLE__
-	u_long	family_cookie;
+	u_long	family_cookie;
 	struct	ifprefixhead if_prefixhead; /* list of prefixes per if */
-	void	*reserved1;	/* for future use */
+
+#ifdef _KERN_LOCKS_H_
+#if IFNET_RW_LOCK
+	lck_rw_t	*if_lock;	/* Lock to protect this interface */
+#else
+	lck_mtx_t	*if_lock;	/* Lock to protect this interface */
+#endif
+#else
+	void	*if_lock;
+#endif
+
 #else
 	struct	ifprefixhead if_prefixhead; /* list of prefixes per if */
 #endif /* __APPLE__ */
+	struct {
+		u_long	length;
+		union {
+			u_char	buffer[8];
+			u_char	*ptr;
+		} u;
+	} if_broadcast;
 };
 
-typedef void if_init_f_t __P((void *));
-
-#define	if_mtu		if_data.ifi_mtu
-#define	if_type		if_data.ifi_type
-#define	if_typelen	if_data.ifi_typelen
-#define	if_physical	if_data.ifi_physical
-#define	if_addrlen	if_data.ifi_addrlen
-#define	if_hdrlen	if_data.ifi_hdrlen
-#define	if_metric	if_data.ifi_metric
-#define	if_baudrate	if_data.ifi_baudrate
-#define	if_hwassist	if_data.ifi_hwassist
-#define	if_ipackets	if_data.ifi_ipackets
-#define	if_ierrors	if_data.ifi_ierrors
-#define	if_opackets	if_data.ifi_opackets
-#define	if_oerrors	if_data.ifi_oerrors
-#define	if_collisions	if_data.ifi_collisions
-#define	if_ibytes	if_data.ifi_ibytes
-#define	if_obytes	if_data.ifi_obytes
-#define	if_imcasts	if_data.ifi_imcasts
-#define	if_omcasts	if_data.ifi_omcasts
-#define	if_iqdrops	if_data.ifi_iqdrops
-#define	if_noproto	if_data.ifi_noproto
-#define	if_lastchange	if_data.ifi_lastchange
-#define	if_recvquota	if_data.ifi_recvquota
-#define	if_xmitquota	if_data.ifi_xmitquota
-#ifdef KERNEL_PRIVATE
-#define if_nvlans	if_data.ifi_nvlans
-#endif KERNEL_PRIVATE
-#define	if_rawoutput(if, m, sa)	if_output(if, m, sa, (struct rtentry *)0)
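The if_broadcast member at the tail of struct ifnet above avoids a separate allocation for short link-layer broadcast addresses: addresses that fit in 8 bytes live in the inline buffer, longer ones hang off u.ptr. A standalone sketch of the read side of that small-buffer union, under the assumption that length alone selects the representation (types simplified for a user-space build):

#include <stdio.h>
#include <string.h>

struct if_broadcast {
	unsigned long	length;
	union {
		unsigned char	buffer[8];
		unsigned char	*ptr;
	} u;
};

/* return a pointer to the address bytes, whichever representation
   is in use */
static const unsigned char *
broadcast_bytes(const struct if_broadcast *b)
{
	return (b->length <= sizeof(b->u.buffer)) ? b->u.buffer : b->u.ptr;
}

int
main(void)
{
	/* Ethernet's 6-byte broadcast address fits inline, which is
	   the case vlan_clone_create() sets up with etherbroadcastaddr */
	static const unsigned char enaddr[6] =
	    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
	struct if_broadcast b;

	b.length = sizeof(enaddr);
	memcpy(b.u.buffer, enaddr, sizeof(enaddr));

	const unsigned char *p = broadcast_bytes(&b);
	for (unsigned long i = 0; i < b.length; i++)
		printf("%02x%s", p[i], i + 1 < b.length ? ":" : "\n");
	return 0;
}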
+#define if_add_proto	if_add_proto_u.original
 
 #ifndef __APPLE__
 /* for compatibility with other BSDs */
 #define	if_addrlist	if_addrhead
 #define	if_list		if_link
-#endif
+#endif /* !__APPLE__ */
+
+
+#endif /* PRIVATE */
+
 #ifdef KERNEL_PRIVATE
 /*
  * Structure describing a `cloning' interface.
  */
 struct if_clone {
 	LIST_ENTRY(if_clone) ifc_list;	/* on list of cloners */
-	const char *ifc_name;			/* name of device, e.g. `vlan' */
+	const char *ifc_name;	/* name of device, e.g. `vlan' */
 	size_t ifc_namelen;	/* length of name */
 	int ifc_minifs;		/* minimum number of interfaces */
 	int ifc_maxunit;	/* maximum unit number */
@@ -361,8 +462,7 @@ struct if_clone {
 };
 
 #define IF_CLONE_INITIALIZER(name, create, destroy, minifs, maxunit)	\
-	{ { 0 }, name, sizeof(name) - 1, minifs, maxunit, NULL, 0, create, destroy }
-#endif
+	{ { 0, 0 }, name, sizeof(name) - 1, minifs, maxunit, NULL, 0, create, destroy }
 
 /*
  * Bit values in if_ipending
@@ -383,7 +483,7 @@ struct if_clone {
 	if ((ifq)->ifq_tail == 0) \
 		(ifq)->ifq_head = m; \
 	else \
-		(ifq)->ifq_tail->m_nextpkt = m; \
+		((struct mbuf*)(ifq)->ifq_tail)->m_nextpkt = m; \
 	(ifq)->ifq_tail = m; \
 	(ifq)->ifq_len++; \
 }
@@ -404,12 +504,11 @@ struct if_clone {
 	} \
 }
 
-#ifdef KERNEL
 #define	IF_ENQ_DROP(ifq, m)	if_enq_drop(ifq, m)
 
 #if defined(__GNUC__) && defined(MT_HEADER)
 static __inline int
-if_queue_drop(struct ifqueue *ifq, struct mbuf *m)
+if_queue_drop(struct ifqueue *ifq, __unused struct mbuf *m)
 {
 	IF_DROP(ifq);
 	return 0;
@@ -427,22 +526,15 @@ if_enq_drop(struct ifqueue *ifq, struct mbuf *m)
 #else
 
 #ifdef MT_HEADER
-int	if_enq_drop __P((struct ifqueue *, struct mbuf *));
-#endif
+int	if_enq_drop(struct ifqueue *, struct mbuf *);
+#endif /* MT_HEADER */
 
-#endif
-#endif /* __APPLE_API_UNSTABLE */
+#endif /* defined(__GNUC__) && defined(MT_HEADER) */
 
-/*
- * 72 was chosen below because it is the size of a TCP/IP
- * header (40) + the minimum mss (32).
- */
-#define	IF_MINMTU	72
-#define	IF_MAXMTU	65535
+#endif /* KERNEL_PRIVATE */
 
-#endif /* KERNEL */
-#ifdef __APPLE_API_UNSTABLE
+#ifdef PRIVATE
 /*
  * The ifaddr structure contains information about one address
  * of an interface.  They are maintained by the different address families,
@@ -454,26 +546,27 @@ struct ifaddr {
 	struct	sockaddr *ifa_dstaddr;	/* other end of p-to-p link */
 #define	ifa_broadaddr	ifa_dstaddr	/* broadcast address interface */
 	struct	sockaddr *ifa_netmask;	/* used to determine subnet */
-#ifndef __APPLE__
-	/* Use of if_data doesn't justify change of API */
-	struct	if_data if_data;	/* not all members are meaningful */
-#endif
 	struct	ifnet *ifa_ifp;	/* back-pointer to interface */
 	TAILQ_ENTRY(ifaddr) ifa_link;	/* queue macro glue */
 	void	(*ifa_rtrequest)	/* check or clean routes (+ or -)'d */
-		__P((int, struct rtentry *, struct sockaddr *));
+		(int, struct rtentry *, struct sockaddr *);
 	u_short	ifa_flags;	/* mostly rt_flags for cloning */
-	short	ifa_refcnt;/* 16bit ref count, use ifaref, ifafree */
+	int	ifa_refcnt;/* 32bit ref count, use ifaref, ifafree */
 	int	ifa_metric;	/* cost of going out this interface */
 #ifdef notdef
 	struct	rtentry *ifa_rt;	/* XXXX for ROUTETOIF ????? */
 #endif
-	u_long	ifa_dlt;
 	int	(*ifa_claim_addr)	/* check if an addr goes to this if */
-		__P((struct ifaddr *, struct sockaddr *));
+		(struct ifaddr *, const struct sockaddr *);
+	u_long	ifa_debug;	/* debug flags */
 };
 
-#define IFA_ROUTE	RTF_UP	/* route installed */
+#define	IFA_ROUTE	RTF_UP		/* route installed (0x1) */
+#define	IFA_CLONING	RTF_CLONING	/* (0x100) */
+#define	IFA_ATTACHED	0x1	/* ifa_debug: IFA is attached to an interface */
+
+#endif /* PRIVATE */
+#ifdef KERNEL_PRIVATE
 /*
  * The prefix structure contains information about one prefix
  * of an interface.  They are maintained by the different address families,
@@ -487,6 +580,10 @@ struct ifprefix {
 	u_char	ifpr_plen;	/* prefix length in bits */
 	u_char	ifpr_type;	/* protocol dependent prefix type */
 };
+#endif /* KERNEL_PRIVATE */
+
+#ifdef PRIVATE
+typedef void (*ifma_protospec_free_func)(void* ifma_protospec);
 
 /*
  * Multicast address structure.  This is analogous to the ifaddr
 *
@@ -497,81 +594,81 @@ struct ifprefix {
 struct ifmultiaddr {
 	LIST_ENTRY(ifmultiaddr) ifma_link;	/* queue macro glue */
 	struct	sockaddr *ifma_addr;	/* address this membership is for */
-	struct	sockaddr *ifma_lladdr;	/* link-layer translation, if any */
-	struct	ifnet *ifma_ifp;	/* back-pointer to interface */
-	u_int	ifma_refcount;	/* reference count */
-	void	*ifma_protospec;	/* protocol-specific state, if any */
+	struct	ifmultiaddr *ifma_ll;	/* link-layer translation, if any */
+	struct	ifnet *ifma_ifp;	/* back-pointer to interface */
+	u_int	ifma_usecount;	/* use count, protected by ifp's lock */
+	void	*ifma_protospec;	/* protocol-specific state, if any */
+	int32_t	ifma_refcount;	/* reference count, atomically protected */
+	ifma_protospec_free_func ifma_free;	/* function called to free ifma_protospec */
 };
+#endif /* PRIVATE */
 
-#ifdef KERNEL
+#ifdef KERNEL_PRIVATE
 #define	IFAREF(ifa)	ifaref(ifa)
 #define	IFAFREE(ifa)	ifafree(ifa)
-#ifdef __APPLE_API_PRIVATE
-extern	struct ifnethead ifnet;
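ifa_refcnt was widened above to a 32-bit count touched only through ifaref()/ifafree(), and ifmultiaddr now carries an atomically updated ifma_refcount next to the lock-protected ifma_usecount. A user-space sketch of that retain/release discipline using C11 <stdatomic.h>, where the releaser that observes the old value 1 frees the object, the same shape as vlan_parent_release() further down; the obj names are hypothetical:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int	refcount;	/* plays the role of ifma_refcount */
};

static void
obj_retain(struct obj *o)
{
	atomic_fetch_add(&o->refcount, 1);
}

static void
obj_release(struct obj *o)
{
	/* fetch_sub returns the old value, like OSDecrementAtomic */
	int old = atomic_fetch_sub(&o->refcount, 1);
	if (old == 0)
		abort();	/* over-release: the kernel panics here */
	if (old == 1)
		free(o);	/* last reference is gone */
}

int
main(void)
{
	struct obj *o = malloc(sizeof(*o));
	atomic_init(&o->refcount, 1);	/* creation reference */
	obj_retain(o);			/* a second holder */
	obj_release(o);
	obj_release(o);			/* old == 1: frees here */
	return 0;
}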
+/*
+ * To preserve kmem compatibility, we define
+ * ifnet_head to ifnet. This should be temp.
+ */
+#define ifnet_head	ifnet
+extern	struct ifnethead ifnet_head;
 extern struct	ifnet	**ifindex2ifnet;
 extern	int ifqmaxlen;
 extern	struct ifnet loif[];
 extern	int if_index;
 extern	struct ifaddr **ifnet_addrs;
-#endif /* __APPLE_API_PRIVATE */
+extern struct ifnet *lo_ifp;
+
+int if_addmulti(struct ifnet *, const struct sockaddr *, struct ifmultiaddr **);
+int if_allmulti(struct ifnet *, int);
+void if_attach(struct ifnet *);
+int if_delmultiaddr(struct ifmultiaddr *ifma, int locked);
+int if_delmulti(struct ifnet *, const struct sockaddr *);
+void if_down(struct ifnet *);
+void if_route(struct ifnet *, int flag, int fam);
+void if_unroute(struct ifnet *, int flag, int fam);
+void if_up(struct ifnet *);
+void if_updown(struct ifnet *ifp, int up);
+/*void ifinit(void);*/	/* declared in systm.h for main() */
+int ifioctl(struct socket *, u_long, caddr_t, struct proc *);
+int ifioctllocked(struct socket *, u_long, caddr_t, struct proc *);
+struct ifnet *ifunit(const char *);
+struct ifnet *if_withname(struct sockaddr *);
-#ifndef __APPLE__
-void	ether_ifattach __P((struct ifnet *, int));
-void	ether_ifdetach __P((struct ifnet *, int));
-void	ether_input __P((struct ifnet *, struct ether_header *, struct mbuf *));
-void	ether_demux __P((struct ifnet *, struct ether_header *, struct mbuf *));
-int	ether_output __P((struct ifnet *,
-	   struct mbuf *, struct sockaddr *, struct rtentry *));
-int	ether_output_frame __P((struct ifnet *, struct mbuf *));
-int	ether_ioctl __P((struct ifnet *, int, caddr_t));
-#endif
-
-int	if_addmulti __P((struct ifnet *, struct sockaddr *,
-			 struct ifmultiaddr **));
-int	if_allmulti __P((struct ifnet *, int));
-void	if_attach __P((struct ifnet *));
-int	if_delmultiaddr __P((struct ifmultiaddr *ifma));
-int	if_delmulti __P((struct ifnet *, struct sockaddr *));
-void	if_down __P((struct ifnet *));
-void	if_route __P((struct ifnet *, int flag, int fam));
-void	if_unroute __P((struct ifnet *, int flag, int fam));
-void	if_up __P((struct ifnet *));
-/*void	ifinit __P((void));*/ /* declared in systm.h for main() */
-int	ifioctl __P((struct socket *, u_long, caddr_t, struct proc *));
-int	ifpromisc __P((struct ifnet *, int));
-struct	ifnet *ifunit __P((const char *));
-struct	ifnet *if_withname __P((struct sockaddr *));
-
-int	if_poll_recv_slow __P((struct ifnet *ifp, int *quotap));
-void	if_poll_xmit_slow __P((struct ifnet *ifp, int *quotap));
-void	if_poll_throttle __P((void));
-void	if_poll_unthrottle __P((void *));
-void	if_poll_init __P((void));
-void	if_poll __P((void));
-#ifdef KERNEL_PRIVATE
 void	if_clone_attach(struct if_clone *);
 void	if_clone_detach(struct if_clone *);
-#endif KERNEL_PRIVATE
-
-struct	ifaddr *ifa_ifwithaddr __P((struct sockaddr *));
-struct	ifaddr *ifa_ifwithdstaddr __P((struct sockaddr *));
-struct	ifaddr *ifa_ifwithnet __P((struct sockaddr *));
-struct	ifaddr *ifa_ifwithroute __P((int, struct sockaddr *,
-					struct sockaddr *));
-struct	ifaddr *ifaof_ifpforaddr __P((struct sockaddr *, struct ifnet *));
-void	ifafree __P((struct ifaddr *));
-void	ifaref __P((struct ifaddr *));
-
-struct	ifmultiaddr *ifmaof_ifpforaddr __P((struct sockaddr *,
-					    struct ifnet *));
-#ifndef __APPLE__
-int	if_simloop __P((struct ifnet *ifp, struct mbuf *m,
-		struct sockaddr *dst, int hlen));
-#endif
-
-#endif /* KERNEL */
-
-#endif /* __APPLE_API_UNSTABLE */
+void	ifnet_lock_assert(struct ifnet *ifp, int what);
+void	ifnet_lock_shared(struct ifnet *ifp);
+void	ifnet_lock_exclusive(struct ifnet *ifp);
+void	ifnet_lock_done(struct ifnet *ifp);
+
+void
ifnet_head_lock_shared(void); +void ifnet_head_lock_exclusive(void); +void ifnet_head_done(void); + +void if_attach_ifa(struct ifnet * ifp, struct ifaddr *ifa); +void if_detach_ifa(struct ifnet * ifp, struct ifaddr *ifa); + +void ifma_reference(struct ifmultiaddr *ifma); +void ifma_release(struct ifmultiaddr *ifma); + +struct ifaddr *ifa_ifwithaddr(const struct sockaddr *); +struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *); +struct ifaddr *ifa_ifwithnet(const struct sockaddr *); +struct ifaddr *ifa_ifwithroute(int, const struct sockaddr *, const struct sockaddr *); +struct ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *); +void ifafree(struct ifaddr *); +void ifaref(struct ifaddr *); + +struct ifmultiaddr *ifmaof_ifpforaddr(const struct sockaddr *, struct ifnet *); + +#ifdef BSD_KERNEL_PRIVATE +void if_data_internal_to_if_data(const struct if_data_internal *if_data_int, + struct if_data *if_data); +void if_data_internal_to_if_data64(const struct if_data_internal *if_data_int, + struct if_data64 *if_data64); +#endif /* BSD_KERNEL_PRIVATE */ +#endif /* KERNEL_PRIVATE */ #endif /* !_NET_IF_VAR_H_ */ diff --git a/bsd/net/if_vlan.c b/bsd/net/if_vlan.c index 74f9e99ba..21e88cddd 100644 --- a/bsd/net/if_vlan.c +++ b/bsd/net/if_vlan.c @@ -79,63 +79,264 @@ #include <net/if.h> #include <net/if_arp.h> #include <net/if_dl.h> +#include <net/if_ether.h> #include <net/if_types.h> #include <net/if_vlan_var.h> +#include <libkern/OSAtomic.h> #include <net/dlil.h> +#include <kern/locks.h> + #ifdef INET #include <netinet/in.h> #include <netinet/if_ether.h> #endif #include <net/if_media.h> +#include <net/multicast_list.h> -#define ETHER_VLAN_ENCAP_LEN 4 /* len of 802.1Q VLAN encapsulation */ -#define IF_MAXUNIT 0x7fff /* historical value */ - -#define IFP2AC(p) ((struct arpcom *)p) - -#define VLAN_PROTO_FAMILY 0x766c616e /* 'vlan' */ +#define IF_MAXUNIT 0x7fff /* historical value */ #define VLANNAME "vlan" typedef int (bpf_callback_func)(struct ifnet *, struct mbuf *); typedef int (if_set_bpf_tap_func)(struct ifnet *ifp, int mode, bpf_callback_func * func); -struct vlan_mc_entry { - struct ether_addr mc_addr; - SLIST_ENTRY(vlan_mc_entry) mc_entries; -}; +/** + ** vlan locks + **/ +static __inline__ lck_grp_t * +my_lck_grp_alloc_init(const char * grp_name) +{ + lck_grp_t * grp; + lck_grp_attr_t * grp_attrs; + + grp_attrs = lck_grp_attr_alloc_init(); + lck_grp_attr_setdefault(grp_attrs); + grp = lck_grp_alloc_init(grp_name, grp_attrs); + lck_grp_attr_free(grp_attrs); + return (grp); +} + +static __inline__ lck_mtx_t * +my_lck_mtx_alloc_init(lck_grp_t * lck_grp) +{ + lck_attr_t * lck_attrs; + lck_mtx_t * lck_mtx; + + lck_attrs = lck_attr_alloc_init(); + lck_attr_setdefault(lck_attrs); + lck_mtx = lck_mtx_alloc_init(lck_grp, lck_attrs); + lck_attr_free(lck_attrs); + return (lck_mtx); +} + +static lck_mtx_t * vlan_lck_mtx; + +static __inline__ void +vlan_lock_init(void) +{ + lck_grp_t * vlan_lck_grp; + + vlan_lck_grp = my_lck_grp_alloc_init("if_vlan"); + vlan_lck_mtx = my_lck_mtx_alloc_init(vlan_lck_grp); +} + +static __inline__ void +vlan_assert_lock_held(void) +{ + lck_mtx_assert(vlan_lck_mtx, LCK_MTX_ASSERT_OWNED); + return; +} + +static __inline__ void +vlan_assert_lock_not_held(void) +{ + lck_mtx_assert(vlan_lck_mtx, LCK_MTX_ASSERT_NOTOWNED); + return; +} -struct ifvlan { - char ifv_name[IFNAMSIZ]; /* our unique id */ - struct ifnet *ifv_ifp; /* our interface */ - struct ifnet *ifv_p; /* parent interface of this vlan */ +static __inline__ void +vlan_lock(void) +{ + 
lck_mtx_lock(vlan_lck_mtx); + return; +} + +static __inline__ void +vlan_unlock(void) +{ + lck_mtx_unlock(vlan_lck_mtx); + return; +} + +/** + ** vlan structures, types + **/ +struct vlan_parent; +LIST_HEAD(vlan_parent_list, vlan_parent); +struct ifvlan; +LIST_HEAD(ifvlan_list, ifvlan); + +typedef struct vlan_parent { + LIST_ENTRY(vlan_parent) vlp_parent_list;/* list of parents */ + struct ifnet * vlp_ifp; /* interface */ + struct ifvlan_list vlp_vlan_list; /* list of VLAN's */ +#define VLPF_SUPPORTS_VLAN_MTU 0x1 +#define VLPF_CHANGE_IN_PROGRESS 0x2 +#define VLPF_DETACHING 0x4 + u_int32_t vlp_flags; + struct ifdevmtu vlp_devmtu; + UInt32 vlp_retain_count; +} vlan_parent, * vlan_parent_ref; + +struct ifvlan { + LIST_ENTRY(ifvlan) ifv_vlan_list; + char ifv_name[IFNAMSIZ]; /* our unique id */ + struct ifnet * ifv_ifp; /* our interface */ + vlan_parent_ref ifv_vlp; /* parent information */ struct ifv_linkmib { - int ifvm_parent; - int ifvm_encaplen; /* encapsulation length */ - int ifvm_mtufudge; /* MTU fudged by this much */ - int ifvm_mintu; /* min transmission unit */ - u_int16_t ifvm_proto; /* encapsulation ethertype */ - u_int16_t ifvm_tag; /* tag to apply on packets leaving if */ + u_int16_t ifvm_encaplen;/* encapsulation length */ + u_int16_t ifvm_mtufudge;/* MTU fudged by this much */ + u_int16_t ifvm_proto; /* encapsulation ethertype */ + u_int16_t ifvm_tag; /* tag to apply on packets leaving if */ } ifv_mib; - SLIST_HEAD(__vlan_mchead, vlan_mc_entry) vlan_mc_listhead; - LIST_ENTRY(ifvlan) ifv_list; - int ifv_flags; - int ifv_detaching; - u_long ifv_filter_id; - int ifv_filter_valid; - bpf_callback_func * ifv_bpf_input; - bpf_callback_func * ifv_bpf_output; + struct multicast_list ifv_multicast; +#define IFVF_PROMISC 0x1 /* promiscuous mode enabled */ +#define IFVF_DETACHING 0x2 /* interface is detaching */ +#define IFVF_READY 0x4 /* interface is ready */ + u_int32_t ifv_flags; + bpf_packet_func ifv_bpf_input; + bpf_packet_func ifv_bpf_output; }; -#define ifv_tag ifv_mib.ifvm_tag +typedef struct ifvlan * ifvlan_ref; + +typedef struct vlan_globals_s { + struct vlan_parent_list parent_list; + int verbose; +} * vlan_globals_ref; + +static vlan_globals_ref g_vlan; + +#define ifv_tag ifv_mib.ifvm_tag #define ifv_encaplen ifv_mib.ifvm_encaplen #define ifv_mtufudge ifv_mib.ifvm_mtufudge -#define ifv_mintu ifv_mib.ifvm_mintu -#define IFVF_PROMISC 0x01 /* promiscuous mode enabled */ + +/** + ** vlan_parent_ref vlp_flags in-lines + **/ +static __inline__ int +vlan_parent_flags_supports_vlan_mtu(vlan_parent_ref vlp) +{ + return ((vlp->vlp_flags & VLPF_SUPPORTS_VLAN_MTU) != 0); +} + +static __inline__ void +vlan_parent_flags_set_supports_vlan_mtu(vlan_parent_ref vlp) +{ + vlp->vlp_flags |= VLPF_SUPPORTS_VLAN_MTU; + return; +} + +static __inline__ void +vlan_parent_flags_clear_supports_vlan_mtu(vlan_parent_ref vlp) +{ + vlp->vlp_flags &= ~VLPF_SUPPORTS_VLAN_MTU; + return; +} + +static __inline__ int +vlan_parent_flags_change_in_progress(vlan_parent_ref vlp) +{ + return ((vlp->vlp_flags & VLPF_CHANGE_IN_PROGRESS) != 0); +} + +static __inline__ void +vlan_parent_flags_set_change_in_progress(vlan_parent_ref vlp) +{ + vlp->vlp_flags |= VLPF_CHANGE_IN_PROGRESS; + return; +} + +static __inline__ void +vlan_parent_flags_clear_change_in_progress(vlan_parent_ref vlp) +{ + vlp->vlp_flags &= ~VLPF_CHANGE_IN_PROGRESS; + return; +} + +static __inline__ int +vlan_parent_flags_detaching(struct vlan_parent * vlp) +{ + return ((vlp->vlp_flags & VLPF_DETACHING) != 0); +} + +static __inline__ void 
+vlan_parent_flags_set_detaching(struct vlan_parent * vlp) +{ + vlp->vlp_flags |= VLPF_DETACHING; + return; +} + + +/** + ** ifvlan_flags in-lines routines + **/ +static __inline__ int +ifvlan_flags_promisc(ifvlan_ref ifv) +{ + return ((ifv->ifv_flags & IFVF_PROMISC) != 0); +} + +static __inline__ void +ifvlan_flags_set_promisc(ifvlan_ref ifv) +{ + ifv->ifv_flags |= IFVF_PROMISC; + return; +} + +static __inline__ void +ifvlan_flags_clear_promisc(ifvlan_ref ifv) +{ + ifv->ifv_flags &= ~IFVF_PROMISC; + return; +} + +static __inline__ int +ifvlan_flags_ready(ifvlan_ref ifv) +{ + return ((ifv->ifv_flags & IFVF_READY) != 0); +} + +static __inline__ void +ifvlan_flags_set_ready(ifvlan_ref ifv) +{ + ifv->ifv_flags |= IFVF_READY; + return; +} + +static __inline__ void +ifvlan_flags_clear_ready(ifvlan_ref ifv) +{ + ifv->ifv_flags &= ~IFVF_READY; + return; +} + +static __inline__ int +ifvlan_flags_detaching(ifvlan_ref ifv) +{ + return ((ifv->ifv_flags & IFVF_DETACHING) != 0); +} + +static __inline__ void +ifvlan_flags_set_detaching(ifvlan_ref ifv) +{ + ifv->ifv_flags |= IFVF_DETACHING; + return; +} #if 0 SYSCTL_DECL(_net_link); @@ -145,69 +346,102 @@ SYSCTL_NODE(_net_link_vlan, PF_LINK, link, CTLFLAG_RW, 0, "for consistency"); #define M_VLAN M_DEVBUF -MALLOC_DEFINE(M_VLAN, VLANNAME, "802.1Q Virtual LAN Interface"); - -static LIST_HEAD(, ifvlan) ifv_list; - -#if 0 -/* - * Locking: one lock is used to guard both the ifv_list and modification - * to vlan data structures. We are rather conservative here; probably - * more than necessary. - */ -static struct mtx ifv_mtx; -#define VLAN_LOCK_INIT() mtx_init(&ifv_mtx, VLANNAME, NULL, MTX_DEF) -#define VLAN_LOCK_DESTROY() mtx_destroy(&ifv_mtx) -#define VLAN_LOCK_ASSERT() mtx_assert(&ifv_mtx, MA_OWNED) -#define VLAN_LOCK() mtx_lock(&ifv_mtx) -#define VLAN_UNLOCK() mtx_unlock(&ifv_mtx) -#else -#define VLAN_LOCK_INIT() -#define VLAN_LOCK_DESTROY() -#define VLAN_LOCK_ASSERT() -#define VLAN_LOCK() -#define VLAN_UNLOCK() -#endif 0 - static int vlan_clone_create(struct if_clone *, int); static void vlan_clone_destroy(struct ifnet *); +static int vlan_input(struct mbuf *m, char *frame_header, struct ifnet *ifp, + u_long protocol_family, int sync_ok); static int vlan_output(struct ifnet *ifp, struct mbuf *m); -static void vlan_ifinit(void *foo); -static int vlan_ioctl(struct ifnet *ifp, u_long cmd, void * addr); -static int vlan_set_bpf_tap(struct ifnet * ifp, int mode, - bpf_callback_func * func); +static int vlan_ioctl(ifnet_t ifp, u_int32_t cmd, void * addr); +static int vlan_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, + bpf_packet_func func); static int vlan_attach_protocol(struct ifnet *ifp); static int vlan_detach_protocol(struct ifnet *ifp); -static int vlan_attach_filter(struct ifnet * ifp, u_long * filter_id); -static int vlan_detach_filter(u_long filter_id); static int vlan_setmulti(struct ifnet *ifp); static int vlan_unconfig(struct ifnet *ifp); -static int vlan_config(struct ifvlan *ifv, struct ifnet *p, int tag); -static int vlan_if_free(struct ifnet * ifp); +static int vlan_config(struct ifnet * ifp, struct ifnet * p, int tag); +static void vlan_if_free(struct ifnet * ifp); +static void vlan_remove(ifvlan_ref ifv); +static void vlan_if_detach(struct ifnet * ifp); +static int vlan_new_mtu(struct ifnet * ifp, int mtu); static struct if_clone vlan_cloner = IF_CLONE_INITIALIZER(VLANNAME, - vlan_clone_create, vlan_clone_destroy, 0, IF_MAXUNIT); + vlan_clone_create, + vlan_clone_destroy, + 0, + IF_MAXUNIT); +static void interface_link_event(struct ifnet * 
ifp, u_long event_code);
+static void vlan_parent_link_event(vlan_parent_ref vlp,
+				   u_long event_code);
+extern int dlil_input_packet(struct ifnet *ifp, struct mbuf *m, char *frame_header);
+
+static int
+vlan_globals_init(void)
+{
+    vlan_globals_ref	v;
+
+    vlan_assert_lock_not_held();
+
+    if (g_vlan != NULL) {
+	return (0);
+    }
+    v = _MALLOC(sizeof(*v), M_VLAN, M_WAITOK);
+    if (v != NULL) {
+	LIST_INIT(&v->parent_list);
+	v->verbose = 0;
+    }
+    vlan_lock();
+    if (g_vlan != NULL) {
+	vlan_unlock();
+	if (v != NULL) {
+	    _FREE(v, M_VLAN);
+	}
+	return (0);
+    }
+    g_vlan = v;
+    vlan_unlock();
+    if (v == NULL) {
+	return (ENOMEM);
+    }
+    return (0);
+}

+static int
+siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
+{
+    struct ifreq	ifr;
+    int			error;
+
+    bzero(&ifr, sizeof(ifr));
+    error = dlil_ioctl(0, ifp, SIOCGIFDEVMTU, (caddr_t)&ifr);
+    if (error == 0) {
+	*ifdm_p = ifr.ifr_devmtu;
+    }
+    return (error);
+}
 
-static if_set_bpf_tap_func nop_if_bpf;
-static int nop_if_free(struct ifnet *);
-static int nop_if_ioctl(struct ifnet *, u_long, void *);
-static int nop_if_output(struct ifnet * ifp, struct mbuf * m);
+static int
+siocsifaltmtu(struct ifnet * ifp, int mtu)
+{
+    struct ifreq	ifr;
 
-static void interface_link_event(struct ifnet * ifp, u_long event_code);
+    bzero(&ifr, sizeof(ifr));
+    ifr.ifr_mtu = mtu;
+    return (dlil_ioctl(0, ifp, SIOCSIFALTMTU, (caddr_t)&ifr));
+}
 
 static __inline__ void
 vlan_bpf_output(struct ifnet * ifp, struct mbuf * m,
-	       bpf_callback_func func)
+	       bpf_packet_func func)
 {
     if (func != NULL) {
-	func(ifp, m);
+	(*func)(ifp, m);
     }
     return;
 }
 
 static __inline__ void
 vlan_bpf_input(struct ifnet * ifp, struct mbuf * m,
-	       bpf_callback_func func, char * frame_header,
+	       bpf_packet_func func, char * frame_header,
 	       int frame_header_len, int encap_len)
 {
     if (func != NULL) {
@@ -217,7 +451,7 @@ vlan_bpf_input(struct ifnet * ifp, struct mbuf * m,
 	}
 	m->m_data -= frame_header_len;
 	m->m_len += frame_header_len;
-	func(ifp, m);
+	(*func)(ifp, m);
 	m->m_data += frame_header_len;
 	m->m_len -= frame_header_len;
 	if (encap_len > 0) {
@@ -229,7 +463,7 @@ vlan_bpf_input(struct ifnet * ifp, struct mbuf * m,
 }
 
 static struct ifaddr *
-ifaddr_byindex(unsigned int i)
+ifaddr_byindex(int i)
 {
-    if (i > if_index || i == 0) {
+    if (i > if_index || i <= 0) {
	return (NULL);
@@ -237,6 +471,100 @@ ifaddr_byindex(unsigned int i)
     return (ifnet_addrs[i - 1]);
 }
 
+/**
+ ** vlan_parent synchronization routines
+ **/
+static __inline__ void
+vlan_parent_retain(vlan_parent_ref vlp)
+{
+    OSIncrementAtomic(&vlp->vlp_retain_count);
+}
+
+static __inline__ void
+vlan_parent_release(vlan_parent_ref vlp)
+{
+    UInt32		old_retain_count;
+
+    old_retain_count = OSDecrementAtomic(&vlp->vlp_retain_count);
+    switch (old_retain_count) {
+    case 0:
+	panic("vlan_parent_release: retain count is 0\n");
+	break;
+    case 1:
+	if (g_vlan->verbose) {
+	    struct ifnet * ifp = vlp->vlp_ifp;
+	    printf("vlan_parent_release(%s%d)\n", ifp->if_name,
+		   ifp->if_unit);
+	}
+	FREE(vlp, M_VLAN);
+	break;
+    default:
+	break;
+    }
+    return;
+}
+
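vlan_parent_wait() and vlan_parent_signal() below implement a small monitor: a "change in progress" flag guarded by vlan_lck_mtx, with msleep()/wakeup() parking and releasing contenders so the flag holder can drop the mutex across slow operations. A user-space analog using pthreads, with a condition variable standing in for msleep()/wakeup(); the parent_wait/parent_signal names are hypothetical stand-ins for the kernel routines:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static int change_in_progress;	/* plays VLPF_CHANGE_IN_PROGRESS */

/* caller holds lock, as vlan_parent_wait() is entered with vlan_lock */
static void
parent_wait(const char *msg)
{
	while (change_in_progress) {
		printf("%s: waiting\n", msg);
		pthread_cond_wait(&wake, &lock);	/* drops, then retakes, lock */
	}
	change_in_progress = 1;		/* we now have exclusive access */
}

/* caller holds lock, as vlan_parent_signal() does */
static void
parent_signal(const char *msg)
{
	change_in_progress = 0;
	pthread_cond_broadcast(&wake);	/* the wakeup(vlp) analog */
	printf("%s: done\n", msg);
}

int
main(void)
{
	pthread_mutex_lock(&lock);
	parent_wait("config");
	/* ... long operation; the kernel code may even drop the mutex here ... */
	parent_signal("config");
	pthread_mutex_unlock(&lock);
	return 0;
}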
+/*
+ * Function: vlan_parent_wait
+ * Purpose:
+ *   Allows a single thread to gain exclusive access to the vlan_parent
+ *   data structure.  Some operations take a long time to complete,
+ *   and some have side-effects that we can't predict.  Holding the
+ *   vlan_lock() across such operations is not possible.
+ *
+ * Notes:
+ *   Before calling, you must be holding the vlan_lock and have taken
+ *   a reference on the vlan_parent_ref.
+ */
+static void
+vlan_parent_wait(vlan_parent_ref vlp, const char * msg)
+{
+    int		waited = 0;
+
+    /* other add/remove/multicast-change in progress */
+    while (vlan_parent_flags_change_in_progress(vlp)) {
+	if (g_vlan->verbose) {
+	    struct ifnet * ifp = vlp->vlp_ifp;
+
+	    printf("%s%d: %s msleep\n", ifp->if_name, ifp->if_unit, msg);
+	}
+	waited = 1;
+	(void)msleep(vlp, vlan_lck_mtx, PZERO, msg, 0);
+    }
+    /* prevent other vlan parent remove/add from taking place */
+    vlan_parent_flags_set_change_in_progress(vlp);
+    if (g_vlan->verbose && waited) {
+	struct ifnet * ifp = vlp->vlp_ifp;
+
+	printf("%s%d: %s woke up\n", ifp->if_name, ifp->if_unit, msg);
+    }
+    return;
+}
+
+/*
+ * Function: vlan_parent_signal
+ * Purpose:
+ *   Allows the thread that previously invoked vlan_parent_wait() to
+ *   give up exclusive access to the vlan_parent data structure, and wake up
+ *   any other threads waiting to access it.
+ * Notes:
+ *   Before calling, you must be holding the vlan_lock and have taken
+ *   a reference on the vlan_parent_ref.
+ */
+static void
+vlan_parent_signal(vlan_parent_ref vlp, const char * msg)
+{
+    vlan_parent_flags_clear_change_in_progress(vlp);
+    wakeup((caddr_t)vlp);
+    if (g_vlan->verbose) {
+	struct ifnet * ifp = vlp->vlp_ifp;
+
+	printf("%s%d: %s wakeup\n", ifp->if_name, ifp->if_unit, msg);
+    }
+    return;
+}
+
+
 /*
  * Program our multicast filter. What we're actually doing is
 * programming the multicast filter of the parent. This has the
@@ -246,154 +574,232 @@ ifaddr_byindex(unsigned int i)
 * to avoid this: there really is only one physical interface.
 */
 static int
-vlan_setmulti(struct ifnet *ifp)
+vlan_setmulti(struct ifnet * ifp)
 {
-	struct ifnet *p;
-	struct ifmultiaddr *ifma, *rifma = NULL;
-	struct ifvlan *sc;
-	struct vlan_mc_entry *mc = NULL;
-	struct sockaddr_dl sdl;
-	int error;
-
-	/* Find the parent. */
-	sc = ifp->if_private;
-	p = sc->ifv_p;
-	if (p == NULL) {
-		/* no parent, so no need to program the multicast filter */
-		return (0);
+    int			error = 0;
+    ifvlan_ref		ifv;
+    struct ifnet *	p;
+    vlan_parent_ref	vlp;
+
+    vlan_lock();
+    ifv = (ifvlan_ref)ifp->if_private;
+    if (ifv == NULL || ifvlan_flags_detaching(ifv)) {
+	goto unlock_done;
 	}
-
-	bzero((char *)&sdl, sizeof sdl);
-	sdl.sdl_len = sizeof sdl;
-	sdl.sdl_family = AF_LINK;
-	sdl.sdl_index = p->if_index;
-	sdl.sdl_type = IFT_ETHER;
-	sdl.sdl_alen = ETHER_ADDR_LEN;
-
-	/* First, remove any existing filter entries. */
-	while (SLIST_FIRST(&sc->vlan_mc_listhead) != NULL) {
-		mc = SLIST_FIRST(&sc->vlan_mc_listhead);
-		bcopy((char *)&mc->mc_addr, LLADDR(&sdl), ETHER_ADDR_LEN);
-		error = if_delmulti(p, (struct sockaddr *)&sdl);
-		if (error)
-			return(error);
-		SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries);
-		FREE(mc, M_VLAN);
+    vlp = ifv->ifv_vlp;
+    if (vlp == NULL) {
+	/* no parent, no need to program the multicast filter */
+	goto unlock_done;
+    }
+    if (vlan_parent_flags_detaching(vlp)) {
+	goto unlock_done;
 	}
+    vlan_parent_retain(vlp);
+    vlan_parent_wait(vlp, "vlan_setmulti");
 
-	/* Now program new ones.
*/ - LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { - if (ifma->ifma_addr->sa_family != AF_LINK) - continue; - mc = _MALLOC(sizeof(struct vlan_mc_entry), M_VLAN, M_WAITOK); - bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), - (char *)&mc->mc_addr, ETHER_ADDR_LEN); - SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries); - bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), - LLADDR(&sdl), ETHER_ADDR_LEN); - error = if_addmulti(p, (struct sockaddr *)&sdl, &rifma); - if (error) - return(error); + /* check again, things could have changed */ + ifv = (ifvlan_ref)ifp->if_private; + if (ifv == NULL || ifvlan_flags_detaching(ifv)) { + goto signal_done; } + if (ifv->ifv_vlp != vlp) { + /* vlan parent changed */ + goto signal_done; + } + if (vlp == NULL) { + /* no parent, no need to program the multicast filter */ + goto signal_done; + } + p = vlp->vlp_ifp; + vlan_unlock(); - return(0); -} + /* update parent interface with our multicast addresses */ + error = multicast_list_program(&ifv->ifv_multicast, ifp, p); -#if 0 -/* - * VLAN support can be loaded as a module. The only place in the - * system that's intimately aware of this is ether_input. We hook - * into this code through vlan_input_p which is defined there and - * set here. Noone else in the system should be aware of this so - * we use an explicit reference here. - * - * NB: Noone should ever need to check if vlan_input_p is null or - * not. This is because interfaces have a count of the number - * of active vlans (if_nvlans) and this should never be bumped - * except by vlan_config--which is in this module so therefore - * the module must be loaded and vlan_input_p must be non-NULL. - */ -extern void (*vlan_input_p)(struct ifnet *, struct mbuf *); + vlan_lock(); -static int -vlan_modevent(module_t mod, int type, void *data) -{ - - switch (type) { - case MOD_LOAD: - LIST_INIT(&ifv_list); - VLAN_LOCK_INIT(); - vlan_input_p = vlan_input; - if_clone_attach(&vlan_cloner); - break; - case MOD_UNLOAD: - if_clone_detach(&vlan_cloner); - vlan_input_p = NULL; - while (!LIST_EMPTY(&ifv_list)) - vlan_clone_destroy(LIST_FIRST(&ifv_list)->ifv_ifp); - VLAN_LOCK_DESTROY(); - break; - } - return 0; -} + signal_done: + vlan_parent_signal(vlp, "vlan_setmulti"); -static moduledata_t vlan_mod = { - "if_vlan", - vlan_modevent, - 0 -}; + unlock_done: + vlan_unlock(); + return (error); +} -DECLARE_MODULE(if_vlan, vlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +/** + ** vlan_parent list manipulation/lookup routines + **/ +static vlan_parent_ref +parent_list_lookup(struct ifnet * p) +{ + vlan_parent_ref vlp; -#endif 0 + LIST_FOREACH(vlp, &g_vlan->parent_list, vlp_parent_list) { + if (vlp->vlp_ifp == p) { + return (vlp); + } + } + return (NULL); +} -static struct ifvlan * -vlan_lookup_ifp_and_tag(struct ifnet * ifp, int tag) +static ifvlan_ref +vlan_parent_lookup_tag(vlan_parent_ref vlp, int tag) { - struct ifvlan * ifv; + ifvlan_ref ifv; - LIST_FOREACH(ifv, &ifv_list, ifv_list) { - if (ifp == ifv->ifv_p && tag == ifv->ifv_tag) { + LIST_FOREACH(ifv, &vlp->vlp_vlan_list, ifv_vlan_list) { + if (tag == ifv->ifv_tag) { return (ifv); } } return (NULL); } -static struct ifvlan * -vlan_lookup_ifp(struct ifnet * ifp) +static ifvlan_ref +vlan_lookup_parent_and_tag(struct ifnet * p, int tag) { - struct ifvlan * ifv; + vlan_parent_ref vlp; - LIST_FOREACH(ifv, &ifv_list, ifv_list) { - if (ifp == ifv->ifv_p) { - return (ifv); - } + vlp = parent_list_lookup(p); + if (vlp != NULL) { + return (vlan_parent_lookup_tag(vlp, tag)); } return (NULL); } +static int 
+vlan_parent_find_max_mtu(vlan_parent_ref vlp, ifvlan_ref exclude_ifv) +{ + int max_mtu = 0; + ifvlan_ref ifv; + + LIST_FOREACH(ifv, &vlp->vlp_vlan_list, ifv_vlan_list) { + int req_mtu; + + if (exclude_ifv == ifv) { + continue; + } + req_mtu = ifv->ifv_ifp->if_mtu + ifv->ifv_mtufudge; + if (req_mtu > max_mtu) { + max_mtu = req_mtu; + } + } + return (max_mtu); +} + +/* + * Function: vlan_parent_create + * Purpose: + * Create a vlan_parent structure to hold the VLAN's for the given + * interface. Add it to the list of VLAN parents. + */ +static int +vlan_parent_create(struct ifnet * p, vlan_parent_ref * ret_vlp) +{ + int error; + vlan_parent_ref vlp; + + *ret_vlp = NULL; + vlp = _MALLOC(sizeof(*vlp), M_VLAN, M_WAITOK); + if (vlp == NULL) { + return (ENOMEM); + } + bzero(vlp, sizeof(*vlp)); + error = siocgifdevmtu(p, &vlp->vlp_devmtu); + if (error != 0) { + printf("vlan_parent_create (%s%d): siocgifdevmtu failed, %d\n", + p->if_name, p->if_unit, error); + FREE(vlp, M_VLAN); + return (error); + } + LIST_INIT(&vlp->vlp_vlan_list); + vlp->vlp_ifp = p; + vlan_parent_retain(vlp); + if (p->if_hwassist + & (IF_HWASSIST_VLAN_MTU | IF_HWASSIST_VLAN_TAGGING)) { + vlan_parent_flags_set_supports_vlan_mtu(vlp); + } + *ret_vlp = vlp; + return (0); +} + +static void +vlan_parent_remove_all_vlans(vlan_parent_ref vlp) +{ + ifvlan_ref ifv; + struct ifnet * p; + + vlan_assert_lock_held(); + + while ((ifv = LIST_FIRST(&vlp->vlp_vlan_list)) != NULL) { + vlan_remove(ifv); + vlan_unlock(); + vlan_if_detach(ifv->ifv_ifp); + vlan_lock(); + } + + /* the vlan parent has no more VLAN's */ + p = vlp->vlp_ifp; + ifnet_lock_exclusive(p); + p->if_eflags &= ~IFEF_VLAN; + ifnet_lock_done(p); + LIST_REMOVE(vlp, vlp_parent_list); + vlan_unlock(); + vlan_parent_release(vlp); + vlan_lock(); + + return; +} + +static __inline__ int +vlan_parent_no_vlans(vlan_parent_ref vlp) +{ + return (LIST_EMPTY(&vlp->vlp_vlan_list)); +} + +static void +vlan_parent_add_vlan(vlan_parent_ref vlp, ifvlan_ref ifv, int tag) +{ + LIST_INSERT_HEAD(&vlp->vlp_vlan_list, ifv, ifv_vlan_list); + ifv->ifv_vlp = vlp; + ifv->ifv_tag = tag; + return; +} + +static void +vlan_parent_remove_vlan(__unused vlan_parent_ref vlp, ifvlan_ref ifv) +{ + ifv->ifv_vlp = NULL; + LIST_REMOVE(ifv, ifv_vlan_list); + return; +} + static void vlan_clone_attach(void) { if_clone_attach(&vlan_cloner); + vlan_lock_init(); return; } static int vlan_clone_create(struct if_clone *ifc, int unit) { - int error; - struct ifvlan *ifv; - struct ifnet *ifp; + int error; + ifvlan_ref ifv; + struct ifnet * ifp; + error = vlan_globals_init(); + if (error != 0) { + return (error); + } ifv = _MALLOC(sizeof(struct ifvlan), M_VLAN, M_WAITOK); bzero(ifv, sizeof(struct ifvlan)); - SLIST_INIT(&ifv->vlan_mc_listhead); + multicast_list_init(&ifv->ifv_multicast); /* use the interface name as the unique id for ifp recycle */ - if (snprintf(ifv->ifv_name, sizeof(ifv->ifv_name), "%s%d", - ifc->ifc_name, unit) >= sizeof(ifv->ifv_name)) { + if ((unsigned int)snprintf(ifv->ifv_name, sizeof(ifv->ifv_name), "%s%d", + ifc->ifc_name, unit) >= sizeof(ifv->ifv_name)) { FREE(ifv, M_VLAN); return (EINVAL); } @@ -405,9 +811,7 @@ vlan_clone_create(struct if_clone *ifc, int unit) FREE(ifv, M_VLAN); return (error); } - ifv->ifv_ifp = ifp; - ifp->if_private = ifv; - ifp->if_name = (char *)ifc->ifc_name; + ifp->if_name = ifc->ifc_name; ifp->if_unit = unit; ifp->if_family = APPLE_IF_FAM_VLAN; @@ -420,49 +824,45 @@ vlan_clone_create(struct if_clone *ifc, int unit) ifp->if_ioctl = vlan_ioctl; ifp->if_set_bpf_tap = 
vlan_set_bpf_tap; - ifp->if_free = nop_if_free; - ifp->if_output = nop_if_output; + ifp->if_free = vlan_if_free; + ifp->if_output = vlan_output; ifp->if_hwassist = 0; ifp->if_addrlen = ETHER_ADDR_LEN; /* XXX ethernet specific */ ifp->if_baudrate = 0; ifp->if_type = IFT_L2VLAN; ifp->if_hdrlen = ETHER_VLAN_ENCAP_LEN; + + /* XXX ethernet specific */ + ifp->if_broadcast.length = ETHER_ADDR_LEN; + bcopy(etherbroadcastaddr, ifp->if_broadcast.u.buffer, ETHER_ADDR_LEN); + error = dlil_if_attach(ifp); if (error) { dlil_if_release(ifp); FREE(ifv, M_VLAN); return (error); } + ifp->if_private = ifv; + ifv->ifv_ifp = ifp; /* attach as ethernet */ bpfattach(ifp, DLT_EN10MB, sizeof(struct ether_header)); - - VLAN_LOCK(); - LIST_INSERT_HEAD(&ifv_list, ifv, ifv_list); - VLAN_UNLOCK(); - return (0); } static void -vlan_remove(struct ifvlan * ifv) +vlan_remove(ifvlan_ref ifv) { - VLAN_LOCK_ASSERT(); - ifv->ifv_detaching = 1; + vlan_assert_lock_held(); + ifvlan_flags_set_detaching(ifv); vlan_unconfig(ifv->ifv_ifp); - LIST_REMOVE(ifv, ifv_list); return; } static void vlan_if_detach(struct ifnet * ifp) { - ifp->if_output = nop_if_output; - ifp->if_ioctl = nop_if_ioctl; - ifp->if_set_bpf_tap = &nop_if_bpf; - if (dlil_if_detach(ifp) == DLIL_WAIT_FOR_FREE) { - ifp->if_free = vlan_if_free; - } else { + if (dlil_if_detach(ifp) != DLIL_WAIT_FOR_FREE) { vlan_if_free(ifp); } return; @@ -471,27 +871,35 @@ vlan_if_detach(struct ifnet * ifp) static void vlan_clone_destroy(struct ifnet *ifp) { - struct ifvlan *ifv = ifp->if_private; + ifvlan_ref ifv; + vlan_lock(); + ifv = ifp->if_private; if (ifv == NULL || ifp->if_type != IFT_L2VLAN) { + vlan_unlock(); return; } - VLAN_LOCK(); - if (ifv->ifv_detaching) { - VLAN_UNLOCK(); + if (ifvlan_flags_detaching(ifv)) { + vlan_unlock(); return; } vlan_remove(ifv); - VLAN_UNLOCK(); + vlan_unlock(); vlan_if_detach(ifp); return; } static int -vlan_set_bpf_tap(struct ifnet * ifp, int mode, bpf_callback_func * func) +vlan_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func func) { - struct ifvlan *ifv = ifp->if_private; + ifvlan_ref ifv; + vlan_lock(); + ifv = ifp->if_private; + if (ifv == NULL || ifvlan_flags_detaching(ifv)) { + vlan_unlock(); + return (ENODEV); + } switch (mode) { case BPF_TAP_DISABLE: ifv->ifv_bpf_input = ifv->ifv_bpf_output = NULL; @@ -511,42 +919,52 @@ vlan_set_bpf_tap(struct ifnet * ifp, int mode, bpf_callback_func * func) default: break; } + vlan_unlock(); return 0; } -static void -vlan_ifinit(void *foo) -{ - return; -} - static int -vlan_output(struct ifnet *ifp, struct mbuf *m) +vlan_output(struct ifnet * ifp, struct mbuf * m) { - struct ifvlan *ifv; - struct ifnet *p; - struct ether_vlan_header *evl; - int soft_vlan; - - ifv = ifp->if_private; - p = ifv->ifv_p; - if (p == NULL) { - return (nop_if_output(ifp, m)); - } + bpf_packet_func bpf_func; + struct ether_vlan_header * evl; + int encaplen; + ifvlan_ref ifv; + struct ifnet * p; + int soft_vlan; + u_short tag; + vlan_parent_ref vlp; + if (m == 0) { - printf("%s: NULL output mbuf\n", ifv->ifv_name); - return (EINVAL); + return (0); } if ((m->m_flags & M_PKTHDR) == 0) { - printf("%s: M_PKTHDR bit not set\n", ifv->ifv_name); - m_freem(m); - return (EINVAL); + m_freem_list(m); + return (0); } - ifp->if_obytes += m->m_pkthdr.len; - ifp->if_opackets++; + vlan_lock(); + ifv = (ifvlan_ref)ifp->if_private; + if (ifv == NULL || ifvlan_flags_detaching(ifv) + || ifvlan_flags_ready(ifv) == 0) { + vlan_unlock(); + m_freem_list(m); + return (0); + } + vlp = ifv->ifv_vlp; + if (vlp == NULL) { + vlan_unlock(); + 
m_freem_list(m); + return (0); + } + p = vlp->vlp_ifp; + (void)ifnet_stat_increment_out(ifp, 1, m->m_pkthdr.len, 0); soft_vlan = (p->if_hwassist & IF_HWASSIST_VLAN_TAGGING) == 0; - vlan_bpf_output(ifp, m, ifv->ifv_bpf_output); - + bpf_func = ifv->ifv_bpf_output; + tag = ifv->ifv_tag; + encaplen = ifv->ifv_encaplen; + vlan_unlock(); + vlan_bpf_output(ifp, m, bpf_func); + /* do not run parent's if_output() if the parent is not up */ if ((p->if_flags & (IFF_UP | IFF_RUNNING)) != (IFF_UP | IFF_RUNNING)) { m_freem(m); @@ -563,22 +981,22 @@ vlan_output(struct ifnet *ifp, struct mbuf *m) */ if (soft_vlan == 0) { m->m_pkthdr.csum_flags |= CSUM_VLAN_TAG_VALID; - m->m_pkthdr.vlan_tag = ifv->ifv_tag; + m->m_pkthdr.vlan_tag = tag; } else { - M_PREPEND(m, ifv->ifv_encaplen, M_DONTWAIT); + M_PREPEND(m, encaplen, M_DONTWAIT); if (m == NULL) { - printf("%s: unable to prepend VLAN header\n", - ifv->ifv_name); - ifp->if_ierrors++; + printf("%s%d: unable to prepend VLAN header\n", ifp->if_name, + ifp->if_unit); + ifp->if_oerrors++; return (0); } /* M_PREPEND takes care of m_len, m_pkthdr.len for us */ - if (m->m_len < sizeof(*evl)) { + if (m->m_len < (int)sizeof(*evl)) { m = m_pullup(m, sizeof(*evl)); if (m == NULL) { - printf("%s: cannot pullup VLAN header\n", - ifv->ifv_name); - ifp->if_ierrors++; + printf("%s%d: unable to pullup VLAN header\n", ifp->if_name, + ifp->if_unit); + ifp->if_oerrors++; return (0); } } @@ -587,31 +1005,25 @@ vlan_output(struct ifnet *ifp, struct mbuf *m) * Transform the Ethernet header into an Ethernet header * with 802.1Q encapsulation. */ - bcopy(mtod(m, char *) + ifv->ifv_encaplen, + bcopy(mtod(m, char *) + encaplen, mtod(m, char *), ETHER_HDR_LEN); evl = mtod(m, struct ether_vlan_header *); evl->evl_proto = evl->evl_encap_proto; evl->evl_encap_proto = htons(ETHERTYPE_VLAN); - evl->evl_tag = htons(ifv->ifv_tag); - m->m_pkthdr.len += ifv->ifv_encaplen; + evl->evl_tag = htons(tag); } - - /* - * Send it, precisely as ether_output() would have. - * We are already running at splimp. - */ - return ((*p->if_output)(p, m)); + return dlil_output(p, 0, m, NULL, NULL, 1); } -extern int -vlan_demux(struct ifnet * ifp, struct mbuf * m, - char * frame_header, struct if_proto * * proto) +static int +vlan_input(struct mbuf * m, char * frame_header, struct ifnet * p, + __unused u_long protocol_family, __unused int sync_ok) { - register struct ether_header *eh = (struct ether_header *)frame_header; - struct ether_vlan_header *evl; - struct ifvlan *ifv = NULL; - int soft_vlan = 0; - u_int tag; + bpf_packet_func bpf_func = NULL; + struct ether_vlan_header * evl; + struct ifnet * ifp = NULL; + int soft_vlan = 0; + u_int tag = 0; if (m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) { /* @@ -623,21 +1035,20 @@ vlan_demux(struct ifnet * ifp, struct mbuf * m, m->m_pkthdr.vlan_tag = 0; } else { soft_vlan = 1; - - switch (ifp->if_type) { + switch (p->if_type) { case IFT_ETHER: if (m->m_len < ETHER_VLAN_ENCAP_LEN) { m_freem(m); - return (EJUSTRETURN); + return 0; } evl = (struct ether_vlan_header *)frame_header; if (ntohs(evl->evl_proto) == ETHERTYPE_VLAN) { /* don't allow VLAN within VLAN */ m_freem(m); - return (EJUSTRETURN); + return (0); } tag = EVL_VLANOFTAG(ntohs(evl->evl_tag)); - + /* * Restore the original ethertype. 
We'll remove
 		 * the encapsulation after we've found the vlan
@@ -647,28 +1058,34 @@
 			break;
 		default:
-			printf("vlan_demux: unsupported if type %u",
-			    ifp->if_type);
+			printf("vlan_input: unsupported if type %u\n",
+			    p->if_type);
 			m_freem(m);
-			return (EJUSTRETURN);
+			return 0;
 			break;
 		}
 	}
 	if (tag != 0) {
-		if (ifp->if_nvlans == 0) {
+		ifvlan_ref		ifv;
+
+		if ((p->if_eflags & IFEF_VLAN) == 0) {
 			/* don't bother looking through the VLAN list */
 			m_freem(m);
-			ifp->if_noproto++;
-			return (EJUSTRETURN);
+			return 0;
+		}
+		vlan_lock();
+		ifv = vlan_lookup_parent_and_tag(p, tag);
+		if (ifv != NULL) {
+			ifp = ifv->ifv_ifp;
 		}
-		VLAN_LOCK();
-		ifv = vlan_lookup_ifp_and_tag(ifp, tag);
-		if (ifv == NULL || (ifv->ifv_ifp->if_flags & IFF_UP) == 0) {
-			VLAN_UNLOCK();
+		if (ifv == NULL
+		    || ifvlan_flags_ready(ifv) == 0
+		    || (ifp->if_flags & IFF_UP) == 0) {
+			vlan_unlock();
 			m_freem(m);
-			ifp->if_noproto++;
-			return (EJUSTRETURN);
+			return 0;
 		}
-		VLAN_UNLOCK(); /* XXX extend below? */
+		bpf_func = ifv->ifv_bpf_input;
+		vlan_unlock();
 	}
 	if (soft_vlan) {
 		/*
@@ -682,105 +1099,137 @@ vlan_demux(struct ifnet * ifp, struct mbuf * m,
 		 */
 		m->m_pkthdr.csum_flags = 0; /* can't trust hardware checksum */
 	}
 	if (tag != 0) {
-		/* we found a vlan interface above, so send it up */
-		m->m_pkthdr.rcvif = ifv->ifv_ifp;
-		ifv->ifv_ifp->if_ipackets++;
-		ifv->ifv_ifp->if_ibytes += m->m_pkthdr.len;
-
-		vlan_bpf_input(ifv->ifv_ifp, m, ifv->ifv_bpf_input, frame_header,
-		    ETHER_HDR_LEN, soft_vlan ? ETHER_VLAN_ENCAP_LEN : 0);
-
-		/* Pass it back through the parent's demux routine. */
-		return ((*ifp->if_demux)(ifv->ifv_ifp, m, frame_header, proto));
+		m->m_pkthdr.rcvif = ifp;
+		(void)ifnet_stat_increment_in(ifp, 1,
+		    m->m_pkthdr.len + ETHER_HDR_LEN, 0);
+		vlan_bpf_input(ifp, m, bpf_func, frame_header, ETHER_HDR_LEN,
+		    soft_vlan ? ETHER_VLAN_ENCAP_LEN : 0);
+		/* We found a vlan interface, inject on that interface. */
+		dlil_input_packet(ifp, m, frame_header);
+	} else {
+		/* Send priority-tagged packet up through the parent */
+		dlil_input_packet(p, m, frame_header);
 	}
-	/* Pass it back through calling demux routine.
*/ - return ((*ifp->if_demux)(ifp, m, frame_header, proto)); + return 0; } +#define VLAN_CONFIG_PROGRESS_VLP_RETAINED 0x1 +#define VLAN_CONFIG_PROGRESS_IN_LIST 0x2 + static int -vlan_config(struct ifvlan *ifv, struct ifnet *p, int tag) +vlan_config(struct ifnet * ifp, struct ifnet * p, int tag) { - struct ifnet * ifp; - struct ifaddr *ifa1, *ifa2; - struct sockaddr_dl *sdl1, *sdl2; - int supports_vlan_mtu = 0; + int error; + int first_vlan = 0; + ifvlan_ref ifv = NULL; + struct ifaddr * ifa1; + struct ifaddr * ifa2; + vlan_parent_ref new_vlp = NULL; + int need_vlp_release = 0; + u_int32_t progress = 0; + struct sockaddr_dl *sdl1; + struct sockaddr_dl *sdl2; + vlan_parent_ref vlp = NULL; + + /* pre-allocate space for vlan_parent, in case we're first */ + error = vlan_parent_create(p, &new_vlp); + if (error != 0) { + return (error); + } - VLAN_LOCK_ASSERT(); - if (p->if_data.ifi_type != IFT_ETHER) - return EPROTONOSUPPORT; - if (ifv->ifv_p != NULL || ifv->ifv_detaching) { - return EBUSY; + vlan_lock(); + ifv = (ifvlan_ref)ifp->if_private; + if (ifv != NULL && ifv->ifv_vlp != NULL) { + vlan_unlock(); + vlan_parent_release(new_vlp); + return (EBUSY); } - if (vlan_lookup_ifp_and_tag(p, tag) != NULL) { - /* already a VLAN with that tag on this interface */ - return (EADDRINUSE); + vlp = parent_list_lookup(p); + if (vlp != NULL) { + if (vlan_parent_lookup_tag(vlp, tag) != NULL) { + /* already a VLAN with that tag on this interface */ + error = EADDRINUSE; + goto unlock_done; + } + } + else { + /* we're the first VLAN on this interface */ + LIST_INSERT_HEAD(&g_vlan->parent_list, new_vlp, vlp_parent_list); + vlp = new_vlp; } - ifp = ifv->ifv_ifp; - ifv->ifv_encaplen = ETHER_VLAN_ENCAP_LEN; - ifv->ifv_mintu = ETHERMIN; - ifv->ifv_flags = 0; - /* - * If the parent supports the VLAN_MTU capability, - * i.e. can Tx/Rx larger than ETHER_MAX_LEN frames, - * enable it. 
- */ - if (p->if_hwassist & (IF_HWASSIST_VLAN_MTU | IF_HWASSIST_VLAN_TAGGING)) { - supports_vlan_mtu = 1; + /* need to wait to ensure no one else is trying to add/remove */ + vlan_parent_retain(vlp); + progress |= VLAN_CONFIG_PROGRESS_VLP_RETAINED; + vlan_parent_wait(vlp, "vlan_config"); + + ifv = (ifvlan_ref)ifp->if_private; + if (ifv == NULL) { + error = EOPNOTSUPP; + goto signal_done; + } + if (vlan_parent_flags_detaching(vlp) + || ifvlan_flags_detaching(ifv) || ifv->ifv_vlp != NULL) { + error = EBUSY; + goto signal_done; } - if (p->if_nvlans == 0) { - u_long dltag; - u_long filter_id; - int error; - /* attach our VLAN "interface filter" to the interface */ - error = vlan_attach_filter(p, &filter_id); - if (error) { - return (error); - } + /* check again because someone might have gotten in */ + if (vlan_parent_lookup_tag(vlp, tag) != NULL) { + /* already a VLAN with that tag on this interface */ + error = EADDRINUSE; + goto signal_done; + } + if (vlan_parent_no_vlans(vlp)) { + first_vlan = 1; + } + vlan_parent_add_vlan(vlp, ifv, tag); + progress |= VLAN_CONFIG_PROGRESS_IN_LIST; + + /* check whether bond interface is using parent interface */ + ifnet_lock_exclusive(p); + if ((p->if_eflags & IFEF_BOND) != 0) { + ifnet_lock_done(p); + /* don't allow VLAN over interface that's already part of a bond */ + error = EBUSY; + goto signal_done; + } + /* prevent BOND interface from using it */ + p->if_eflags |= IFEF_VLAN; + ifnet_lock_done(p); + vlan_unlock(); + + if (first_vlan) { /* attach our VLAN "protocol" to the interface */ error = vlan_attach_protocol(p); if (error) { - (void)vlan_detach_filter(filter_id); - return (error); + vlan_lock(); + goto signal_done; } - ifv->ifv_filter_id = filter_id; - ifv->ifv_filter_valid = TRUE; -#if 0 - if (supports_vlan_mtu) { - /* - * Enable Tx/Rx of VLAN-sized frames. - */ - p->if_capenable |= IFCAP_VLAN_MTU; - if (p->if_flags & IFF_UP) { - struct ifreq ifr; - int error; - - ifr.ifr_flags = p->if_flags; - error = (*p->if_ioctl)(p, SIOCSIFFLAGS, - (caddr_t) &ifr); - if (error) { - if (p->if_nvlans == 0) - p->if_capenable &= ~IFCAP_VLAN_MTU; - return (error); - } - } - } -#endif 0 - } else { - struct ifvlan * other_ifv; + /* mark the parent interface up */ + ifnet_lock_exclusive(p); + p->if_flags |= IFF_UP; + ifnet_lock_done(p); + (void)dlil_ioctl(0, p, SIOCSIFFLAGS, (caddr_t)NULL); + } - other_ifv = vlan_lookup_ifp(p); - if (other_ifv == NULL) { - printf("vlan: other_ifv can't be NULL\n"); - return (EINVAL); + /* configure parent to receive our multicast addresses */ + error = multicast_list_program(&ifv->ifv_multicast, ifp, p); + if (error != 0) { + if (first_vlan) { + (void)vlan_detach_protocol(p); } - ifv->ifv_filter_id = other_ifv->ifv_filter_id; - ifv->ifv_filter_valid = TRUE; + vlan_lock(); + goto signal_done; } - p->if_nvlans++; - if (supports_vlan_mtu) { + + /* no failures past this point */ + vlan_lock(); + + ifv->ifv_encaplen = ETHER_VLAN_ENCAP_LEN; + ifv->ifv_flags = 0; + if (vlan_parent_flags_supports_vlan_mtu(vlp)) { ifv->ifv_mtufudge = 0; } else { /* @@ -792,9 +1241,8 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, int tag) */ ifv->ifv_mtufudge = ifv->ifv_encaplen; } + ifp->if_mtu = ETHERMTU - ifv->ifv_mtufudge; - ifv->ifv_p = p; - ifp->if_mtu = p->if_mtu - ifv->ifv_mtufudge; /* * Copy only a selected subset of flags from the parent. * Other flags are none of our business. 
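The 4-byte 802.1Q header is either absorbed by a parent that can carry oversized frames, in which case ifv_mtufudge is 0, or taken out of the VLAN interface's own MTU, in which case ifv_mtufudge equals ifv_encaplen; that is why vlan_config() brings the interface up at ETHERMTU - mtufudge. A standalone check of that arithmetic, with the constants repeated locally for a self-contained build:

#include <stdio.h>

#define ETHERMTU		1500
#define ETHER_VLAN_ENCAP_LEN	4	/* 802.1Q tag: TPID + TCI */

/* effective MTU for a vlan interface, given whether the parent can
   carry ETHERMTU + 4 byte frames (the VLPF_SUPPORTS_VLAN_MTU case) */
static int
vlan_mtu(int parent_supports_vlan_mtu)
{
	int mtufudge = parent_supports_vlan_mtu ? 0 : ETHER_VLAN_ENCAP_LEN;
	return ETHERMTU - mtufudge;
}

int
main(void)
{
	printf("parent with VLAN_MTU:    %d\n", vlan_mtu(1));	/* 1500 */
	printf("parent without VLAN_MTU: %d\n", vlan_mtu(0));	/* 1496 */
	return 0;
}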
@@ -809,10 +1257,8 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, int tag) if (p->if_hwassist & IF_HWASSIST_VLAN_TAGGING) { ifp->if_hwassist |= IF_HWASSIST_CSUM_FLAGS(p->if_hwassist); } - /* - * Set up our ``Ethernet address'' to reflect the underlying - * physical interface's. - */ + + /* set our ethernet address to that of the parent */ ifa1 = ifaddr_byindex(ifp->if_index); ifa2 = ifaddr_byindex(p->if_index); sdl1 = (struct sockaddr_dl *)ifa1->ifa_addr; @@ -820,17 +1266,46 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, int tag) sdl1->sdl_type = IFT_ETHER; sdl1->sdl_alen = ETHER_ADDR_LEN; bcopy(LLADDR(sdl2), LLADDR(sdl1), ETHER_ADDR_LEN); - bcopy(LLADDR(sdl2), IFP2AC(ifp)->ac_enaddr, ETHER_ADDR_LEN); - - /* - * Configure multicast addresses that may already be - * joined on the vlan device. - */ - (void)vlan_setmulti(ifp); - ifp->if_output = vlan_output; - ifv->ifv_tag = tag; + ifp->if_flags |= IFF_RUNNING; + ifvlan_flags_set_ready(ifv); + vlan_parent_signal(vlp, "vlan_config"); + vlan_unlock(); + if (new_vlp != vlp) { + /* throw it away, it wasn't needed */ + vlan_parent_release(new_vlp); + } return 0; + + signal_done: + vlan_assert_lock_held(); + vlan_parent_signal(vlp, "vlan_config"); + + unlock_done: + if ((progress & VLAN_CONFIG_PROGRESS_IN_LIST) != 0) { + vlan_parent_remove_vlan(vlp, ifv); + } + if (!vlan_parent_flags_detaching(vlp) && vlan_parent_no_vlans(vlp)) { + /* the vlan parent has no more VLAN's */ + ifnet_lock_exclusive(p); + p->if_eflags &= ~IFEF_VLAN; + ifnet_lock_done(p); + LIST_REMOVE(vlp, vlp_parent_list); + /* release outside of the lock below */ + need_vlp_release = 1; + } + vlan_unlock(); + + if ((progress & VLAN_CONFIG_PROGRESS_VLP_RETAINED) != 0) { + vlan_parent_release(vlp); + } + if (need_vlp_release) { + vlan_parent_release(vlp); + } + if (new_vlp != vlp) { + vlan_parent_release(new_vlp); + } + return (error); } static void @@ -854,252 +1329,410 @@ vlan_link_event(struct ifnet * ifp, struct ifnet * p) } static int -vlan_unconfig(struct ifnet *ifp) +vlan_unconfig(struct ifnet * ifp) { - struct ifaddr *ifa; + int error = 0; + struct ifaddr * ifa; + ifvlan_ref ifv; + int last_vlan = 0; + int need_vlp_release = 0; + struct ifnet * p; struct sockaddr_dl *sdl; - struct vlan_mc_entry *mc; - struct ifvlan *ifv; - struct ifnet *p; - int error; - - VLAN_LOCK_ASSERT(); - - ifv = ifp->if_private; - - /* Disconnect from parent. */ - p = ifv->ifv_p; - ifv->ifv_p = NULL; - - if (p != NULL) { - struct sockaddr_dl sdl; + vlan_parent_ref vlp; - /* - * Since the interface is being unconfigured, we need to - * empty the list of multicast groups that we may have joined - * while we were alive from the parent's list. - */ - bzero((char *)&sdl, sizeof sdl); - sdl.sdl_len = sizeof sdl; - sdl.sdl_family = AF_LINK; - sdl.sdl_index = p->if_index; - sdl.sdl_type = IFT_ETHER; - sdl.sdl_alen = ETHER_ADDR_LEN; - - while (SLIST_FIRST(&ifv->vlan_mc_listhead) != NULL) { - mc = SLIST_FIRST(&ifv->vlan_mc_listhead); - bcopy((char *)&mc->mc_addr, LLADDR(&sdl), ETHER_ADDR_LEN); - error = if_delmulti(p, (struct sockaddr *)&sdl); - if (error) { - printf("vlan_unconfig: if_delmulti %s failed, %d\n", - ifv->ifv_name, error); - } - SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries); - FREE(mc, M_VLAN); - } - p->if_nvlans--; - if (p->if_nvlans == 0) { - /* detach our VLAN "protocol" from the interface */ - if (ifv->ifv_filter_valid) { - (void)vlan_detach_filter(ifv->ifv_filter_id); - } - (void)vlan_detach_protocol(p); -#if 0 - /* - * Disable Tx/Rx of VLAN-sized frames. 
- */ - p->if_capenable &= ~IFCAP_VLAN_MTU; - if (p->if_flags & IFF_UP) { - struct ifreq ifr; - - ifr.ifr_flags = p->if_flags; - (*p->if_ioctl)(p, SIOCSIFFLAGS, (caddr_t) &ifr); - } -#endif 0 + vlan_assert_lock_held(); + ifv = (ifvlan_ref)ifp->if_private; + if (ifv == NULL) { + return (0); + } + vlp = ifv->ifv_vlp; + if (vlp == NULL) { + return (0); + } + vlan_parent_retain(vlp); + vlan_parent_wait(vlp, "vlan_unconfig"); + + /* check again because another thread could be in vlan_unconfig */ + ifv = (ifvlan_ref)ifp->if_private; + if (ifv == NULL) { + goto signal_done; + } + if (ifv->ifv_vlp != vlp) { + /* vlan parent changed */ + goto signal_done; + } + need_vlp_release++; + p = vlp->vlp_ifp; + + /* remember whether we're the last VLAN on the parent */ + if (LIST_NEXT(LIST_FIRST(&vlp->vlp_vlan_list), ifv_vlan_list) == NULL) { + if (g_vlan->verbose) { + printf("vlan_unconfig: last vlan on %s%d\n", + p->if_name, p->if_unit); } + last_vlan = 1; + } + + /* back-out any effect our mtu might have had on the parent */ + (void)vlan_new_mtu(ifp, ETHERMTU - ifv->ifv_mtufudge); + + vlan_unlock(); + + /* detach VLAN "protocol" */ + if (last_vlan) { + (void)vlan_detach_protocol(p); } - /* return to the state we were in before SETVLAN */ + /* un-join multicast on parent interface */ + (void)multicast_list_remove(&ifv->ifv_multicast); + + vlan_lock(); + + /* Disconnect from parent. */ + vlan_parent_remove_vlan(vlp, ifv); + + /* return to the state we were in before SIFVLAN */ ifp->if_mtu = 0; ifp->if_flags &= ~(IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX | IFF_RUNNING); - ifv->ifv_ifp->if_hwassist = 0; + ifp->if_hwassist = 0; ifv->ifv_flags = 0; - ifv->ifv_ifp->if_output = nop_if_output; ifv->ifv_mtufudge = 0; - ifv->ifv_filter_valid = FALSE; /* Clear our MAC address. */ - ifa = ifaddr_byindex(ifv->ifv_ifp->if_index); + ifa = ifaddr_byindex(ifp->if_index); sdl = (struct sockaddr_dl *)(ifa->ifa_addr); sdl->sdl_type = IFT_L2VLAN; sdl->sdl_alen = 0; bzero(LLADDR(sdl), ETHER_ADDR_LEN); - bzero(IFP2AC(ifv->ifv_ifp)->ac_enaddr, ETHER_ADDR_LEN); - /* send a link down event */ - if (p != NULL) { - interface_link_event(ifv->ifv_ifp, KEV_DL_LINK_OFF); + if (!vlan_parent_flags_detaching(vlp) && vlan_parent_no_vlans(vlp)) { + /* the vlan parent has no more VLAN's */ + ifnet_lock_exclusive(p); + p->if_eflags &= ~IFEF_VLAN; + ifnet_lock_done(p); + LIST_REMOVE(vlp, vlp_parent_list); + /* release outside of the lock below */ + need_vlp_release++; } - return 0; + + signal_done: + vlan_parent_signal(vlp, "vlan_unconfig"); + vlan_unlock(); + vlan_parent_release(vlp); /* one because we waited */ + + while (need_vlp_release--) { + vlan_parent_release(vlp); + } + vlan_lock(); + return (error); } static int -vlan_set_promisc(struct ifnet *ifp) +vlan_set_promisc(struct ifnet * ifp) { - struct ifvlan *ifv = ifp->if_private; - int error = 0; + int error = 0; + ifvlan_ref ifv; + vlan_parent_ref vlp; + + vlan_lock(); + ifv = (ifvlan_ref)ifp->if_private; + if (ifv == NULL || ifvlan_flags_detaching(ifv)) { + error = (ifv == NULL) ? 
EOPNOTSUPP : EBUSY; + goto done; + } + vlp = ifv->ifv_vlp; + if (vlp == NULL) { + goto done; + } if ((ifp->if_flags & IFF_PROMISC) != 0) { - if ((ifv->ifv_flags & IFVF_PROMISC) == 0) { - error = ifpromisc(ifv->ifv_p, 1); - if (error == 0) - ifv->ifv_flags |= IFVF_PROMISC; + if (!ifvlan_flags_promisc(ifv)) { + error = ifnet_set_promiscuous(vlp->vlp_ifp, 1); + if (error == 0) { + ifvlan_flags_set_promisc(ifv); + } } } else { - if ((ifv->ifv_flags & IFVF_PROMISC) != 0) { - error = ifpromisc(ifv->ifv_p, 0); - if (error == 0) - ifv->ifv_flags &= ~IFVF_PROMISC; + if (ifvlan_flags_promisc(ifv)) { + error = ifnet_set_promiscuous(vlp->vlp_ifp, 0); + if (error == 0) { + ifvlan_flags_clear_promisc(ifv); + } } } + done: + vlan_unlock(); + return (error); +} +static int +vlan_new_mtu(struct ifnet * ifp, int mtu) +{ + struct ifdevmtu * devmtu_p; + int error = 0; + ifvlan_ref ifv; + int max_mtu; + int new_mtu = 0; + int req_mtu; + vlan_parent_ref vlp; + + vlan_assert_lock_held(); + ifv = (ifvlan_ref)ifp->if_private; + vlp = ifv->ifv_vlp; + devmtu_p = &vlp->vlp_devmtu; + req_mtu = mtu + ifv->ifv_mtufudge; + if (req_mtu > devmtu_p->ifdm_max || req_mtu < devmtu_p->ifdm_min) { + return (EINVAL); + } + max_mtu = vlan_parent_find_max_mtu(vlp, ifv); + if (req_mtu > max_mtu) { + new_mtu = req_mtu; + } + else if (max_mtu < devmtu_p->ifdm_current) { + new_mtu = max_mtu; + } + if (new_mtu != 0) { + struct ifnet * p = vlp->vlp_ifp; + vlan_unlock(); + error = siocsifaltmtu(p, new_mtu); + vlan_lock(); + } + if (error == 0) { + if (new_mtu != 0) { + devmtu_p->ifdm_current = new_mtu; + } + ifp->if_mtu = mtu; + } return (error); } static int -vlan_ioctl(struct ifnet *ifp, u_long cmd, void * data) +vlan_set_mtu(struct ifnet * ifp, int mtu) { - struct ifaddr *ifa; - struct ifnet *p; - struct ifreq *ifr; - struct ifvlan *ifv; - struct vlanreq vlr; - int error = 0; + int error = 0; + ifvlan_ref ifv; + vlan_parent_ref vlp; + + if (mtu < IF_MINMTU) { + return (EINVAL); + } + vlan_lock(); + ifv = (ifvlan_ref)ifp->if_private; + if (ifv == NULL || ifvlan_flags_detaching(ifv)) { + vlan_unlock(); + return ((ifv == NULL) ? EOPNOTSUPP : EBUSY); + } + vlp = ifv->ifv_vlp; + if (vlp == NULL || vlan_parent_flags_detaching(vlp)) { + vlan_unlock(); + if (mtu != 0) { + return (EINVAL); + } + return (0); + } + vlan_parent_retain(vlp); + vlan_parent_wait(vlp, "vlan_set_mtu"); + + /* check again, something might have changed */ + ifv = (ifvlan_ref)ifp->if_private; + if (ifv == NULL || ifvlan_flags_detaching(ifv)) { + error = (ifv == NULL) ? 
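/*
 * Same retain/wait pattern as vlan_unconfig: vlan_parent_wait()
 * dropped the vlan lock while waiting, so ifv and vlp must be
 * re-validated here before touching them; vlan_parent_signal()
 * below wakes the next configuring thread.
 */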
EOPNOTSUPP : EBUSY; + goto signal_done; + } + if (ifv->ifv_vlp != vlp) { + /* vlan parent changed */ + goto signal_done; + } + if (vlp == NULL || vlan_parent_flags_detaching(vlp)) { + if (mtu != 0) { + error = EINVAL; + } + goto signal_done; + } + error = vlan_new_mtu(ifp, mtu); + + signal_done: + vlan_parent_signal(vlp, "vlan_set_mtu"); + vlan_unlock(); + vlan_parent_release(vlp); + + return (error); +} +static int +vlan_ioctl(ifnet_t ifp, u_int32_t cmd, void * data) +{ + struct ifdevmtu * devmtu_p; + int error = 0; + struct ifaddr * ifa; + struct ifmediareq64 * ifmr; + struct ifreq * ifr; + ifvlan_ref ifv; + struct ifnet * p; + u_short tag; + user_addr_t user_addr; + vlan_parent_ref vlp; + struct vlanreq vlr; + + if (ifp->if_type != IFT_L2VLAN) { + return (EOPNOTSUPP); + } ifr = (struct ifreq *)data; ifa = (struct ifaddr *)data; - ifv = (struct ifvlan *)ifp->if_private; switch (cmd) { case SIOCSIFADDR: - ifp->if_flags |= IFF_UP; + ifnet_set_flags(ifp, IFF_UP, IFF_UP); break; + case SIOCGIFMEDIA64: case SIOCGIFMEDIA: - VLAN_LOCK(); - if (ifv->ifv_p != NULL) { - error = (*ifv->ifv_p->if_ioctl)(ifv->ifv_p, - SIOCGIFMEDIA, data); - VLAN_UNLOCK(); - /* Limit the result to the parent's current config. */ - if (error == 0) { - struct ifmediareq *ifmr; + vlan_lock(); + ifv = (ifvlan_ref)ifp->if_private; + if (ifv == NULL || ifvlan_flags_detaching(ifv)) { + vlan_unlock(); + return (ifv == NULL ? EOPNOTSUPP : EBUSY); + } + p = (ifv->ifv_vlp == NULL) ? NULL : ifv->ifv_vlp->vlp_ifp; + vlan_unlock(); + ifmr = (struct ifmediareq64 *)data; + user_addr = (cmd == SIOCGIFMEDIA64) + ? ifmr->ifm_ifmu.ifmu_ulist64 + : CAST_USER_ADDR_T(ifmr->ifm_ifmu.ifmu_ulist32); + if (p != NULL) { + struct ifmediareq64 p_ifmr; - ifmr = (struct ifmediareq *) data; - if (ifmr->ifm_count >= 1 && ifmr->ifm_ulist) { + bzero(&p_ifmr, sizeof(p_ifmr)); + error = dlil_ioctl(0, p, SIOCGIFMEDIA, (caddr_t)&p_ifmr); + if (error == 0) { + ifmr->ifm_active = p_ifmr.ifm_active; + ifmr->ifm_current = p_ifmr.ifm_current; + ifmr->ifm_mask = p_ifmr.ifm_mask; + ifmr->ifm_status = p_ifmr.ifm_status; + ifmr->ifm_count = p_ifmr.ifm_count; + /* Limit the result to the parent's current config. */ + if (ifmr->ifm_count >= 1 && user_addr != USER_ADDR_NULL) { ifmr->ifm_count = 1; - error = copyout(&ifmr->ifm_current, - ifmr->ifm_ulist, + error = copyout(&ifmr->ifm_current, user_addr, sizeof(int)); } } } else { - struct ifmediareq *ifmr; - VLAN_UNLOCK(); - - ifmr = (struct ifmediareq *) data; - ifmr->ifm_current = 0; + ifmr->ifm_active = ifmr->ifm_current = IFM_NONE; ifmr->ifm_mask = 0; ifmr->ifm_status = IFM_AVALID; - ifmr->ifm_active = 0; ifmr->ifm_count = 1; - if (ifmr->ifm_ulist) { - error = copyout(&ifmr->ifm_current, - ifmr->ifm_ulist, - sizeof(int)); + if (user_addr != USER_ADDR_NULL) { + error = copyout(&ifmr->ifm_current, user_addr, sizeof(int)); } - error = 0; } break; case SIOCSIFMEDIA: - error = EINVAL; - break; - - case SIOCSIFMTU: - /* - * Set the interface MTU. 
- */ - VLAN_LOCK(); - if (ifv->ifv_p != NULL) { - if (ifr->ifr_mtu > (ifv->ifv_p->if_mtu - ifv->ifv_mtufudge) - || ifr->ifr_mtu < (ifv->ifv_mintu - ifv->ifv_mtufudge)) { - error = EINVAL; - } else { - ifp->if_mtu = ifr->ifr_mtu; - } - } else { - error = EINVAL; - } - VLAN_UNLOCK(); + error = EOPNOTSUPP; break; - case SIOCSETVLAN: - error = copyin(ifr->ifr_data, &vlr, sizeof(vlr)); - if (error) - break; - if (vlr.vlr_parent[0] == '\0') { - VLAN_LOCK(); - vlan_unconfig(ifp); -#if 0 - if (ifp->if_flags & IFF_UP) - if_down(ifp); - ifp->if_flags &= ~IFF_RUNNING; -#endif 0 - VLAN_UNLOCK(); - break; + case SIOCGIFDEVMTU: + vlan_lock(); + ifv = (ifvlan_ref)ifp->if_private; + if (ifv == NULL || ifvlan_flags_detaching(ifv)) { + vlan_unlock(); + return (ifv == NULL ? EOPNOTSUPP : EBUSY); } - p = ifunit(vlr.vlr_parent); - if (p == 0) { - error = ENOENT; - break; + vlp = ifv->ifv_vlp; + if (vlp != NULL) { + int min_mtu = vlp->vlp_devmtu.ifdm_min - ifv->ifv_mtufudge; + devmtu_p = &ifr->ifr_devmtu; + devmtu_p->ifdm_current = ifp->if_mtu; + devmtu_p->ifdm_min = max(min_mtu, IF_MINMTU); + devmtu_p->ifdm_max = vlp->vlp_devmtu.ifdm_max - ifv->ifv_mtufudge; } - /* - * Don't let the caller set up a VLAN tag with - * anything except VLID bits. - */ - if (vlr.vlr_tag & ~EVL_VLID_MASK) { - error = EINVAL; - break; + else { + devmtu_p = &ifr->ifr_devmtu; + devmtu_p->ifdm_current = 0; + devmtu_p->ifdm_min = 0; + devmtu_p->ifdm_max = 0; } - VLAN_LOCK(); - error = vlan_config(ifv, p, vlr.vlr_tag); + vlan_unlock(); + break; + + case SIOCSIFMTU: + error = vlan_set_mtu(ifp, ifr->ifr_mtu); + break; + + case SIOCSIFVLAN: + user_addr = proc_is64bit(current_proc()) + ? ifr->ifr_data64 : CAST_USER_ADDR_T(ifr->ifr_data); + error = copyin(user_addr, &vlr, sizeof(vlr)); if (error) { - VLAN_UNLOCK(); break; } - ifp->if_flags |= IFF_RUNNING; - VLAN_UNLOCK(); - - /* Update promiscuous mode, if necessary. */ - vlan_set_promisc(ifp); - - /* generate a link event */ - vlan_link_event(ifp, p); + p = NULL; + if (vlr.vlr_parent[0] != '\0') { + if (vlr.vlr_tag & ~EVL_VLID_MASK) { + /* + * Don't let the caller set up a VLAN tag with + * anything except VLID bits. + */ + error = EINVAL; + break; + } + p = ifunit(vlr.vlr_parent); + if (p == NULL) { + error = ENXIO; + break; + } + /* can't do VLAN over anything but ethernet or ethernet aggregate */ + if (p->if_type != IFT_ETHER && p->if_type != IFT_IEEE8023ADLAG) { + error = EPROTONOSUPPORT; + break; + } + error = vlan_config(ifp, p, vlr.vlr_tag); + if (error) { + break; + } + + /* Update promiscuous mode, if necessary. */ + (void)vlan_set_promisc(ifp); + + /* generate a link event based on the state of the parent */ + vlan_link_event(ifp, p); + } else { + vlan_lock(); + ifv = (ifvlan_ref)ifp->if_private; + if (ifv == NULL || ifvlan_flags_detaching(ifv)) { + vlan_unlock(); + error = (ifv == NULL ? EOPNOTSUPP : EBUSY); + break; + } + error = vlan_unconfig(ifp); + vlan_unlock(); + if (error == 0) { + interface_link_event(ifp, KEV_DL_LINK_OFF); + } + } break; - case SIOCGETVLAN: + case SIOCGIFVLAN: bzero(&vlr, sizeof vlr); - VLAN_LOCK(); - if (ifv->ifv_p != NULL) { + vlan_lock(); + ifv = (ifvlan_ref)ifp->if_private; + if (ifv == NULL || ifvlan_flags_detaching(ifv)) { + vlan_unlock(); + return (ifv == NULL ? EOPNOTSUPP : EBUSY); + } + p = (ifv->ifv_vlp == NULL) ? 
NULL : ifv->ifv_vlp->vlp_ifp; + tag = ifv->ifv_tag; + vlan_unlock(); + if (p != NULL) { snprintf(vlr.vlr_parent, sizeof(vlr.vlr_parent), - "%s%d", ifv->ifv_p->if_name, - ifv->ifv_p->if_unit); - vlr.vlr_tag = ifv->ifv_tag; + "%s%d", p->if_name, p->if_unit); + vlr.vlr_tag = tag; } - VLAN_UNLOCK(); - error = copyout(&vlr, ifr->ifr_data, sizeof vlr); + user_addr = proc_is64bit(current_proc()) + ? ifr->ifr_data64 : CAST_USER_ADDR_T(ifr->ifr_data); + error = copyout(&vlr, user_addr, sizeof(vlr)); break; case SIOCSIFFLAGS: @@ -1107,8 +1740,7 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, void * data) * For promiscuous mode, we enable promiscuous mode on * the parent if we need promiscuous on the VLAN interface. */ - if (ifv->ifv_p != NULL) - error = vlan_set_promisc(ifp); + error = vlan_set_promisc(ifp); break; case SIOCADDMULTI: @@ -1121,156 +1753,72 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, void * data) return error; } -static int -nop_if_ioctl(struct ifnet * ifp, u_long cmd, void * data) -{ - return EOPNOTSUPP; -} - -static int -nop_if_bpf(struct ifnet *ifp, int mode, bpf_callback_func * func) -{ - return ENODEV; -} - -static int -nop_if_free(struct ifnet * ifp) -{ - return 0; -} - -static int -nop_if_output(struct ifnet * ifp, struct mbuf * m) -{ - if (m != NULL) { - m_freem_list(m); - } - return 0; -} - -static int +static void vlan_if_free(struct ifnet * ifp) { - struct ifvlan *ifv; + ifvlan_ref ifv; if (ifp == NULL) { - return 0; + return; } - ifv = (struct ifvlan *)ifp->if_private; + vlan_lock(); + ifv = (ifvlan_ref)ifp->if_private; if (ifv == NULL) { - return 0; + vlan_unlock(); + return; } ifp->if_private = NULL; + vlan_unlock(); dlil_if_release(ifp); FREE(ifv, M_VLAN); - return 0; } -/* - * Function: vlan_if_filter_detach - * Purpose: - * Destroy all vlan interfaces that refer to the interface - */ -static int -vlan_if_filter_detach(caddr_t cookie) +static void +vlan_event(struct ifnet * p, struct kev_msg * event) { - struct ifnet * ifp; - struct ifvlan * ifv; - struct ifnet * p = (struct ifnet *)cookie; + vlan_parent_ref vlp; - VLAN_LOCK(); - while (TRUE) { - ifv = vlan_lookup_ifp(p); - if (ifv == NULL) { - break; - } - if (ifv->ifv_detaching) { - continue; - } - /* make sure we don't invoke vlan_detach_filter */ - ifv->ifv_filter_valid = FALSE; - vlan_remove(ifv); - ifp = ifv->ifv_ifp; - VLAN_UNLOCK(); - vlan_if_detach(ifp); - VLAN_LOCK(); + /* Check if the interface we are attached to is being detached */ + if (event->vendor_code != KEV_VENDOR_APPLE + || event->kev_class != KEV_NETWORK_CLASS + || event->kev_subclass != KEV_DL_SUBCLASS) { + return; } - VLAN_UNLOCK(); - return (0); -} - -/* - * Function: vlan_attach_filter - * Purpose: - * We attach an interface filter to detect when the underlying interface - * goes away. We are forced to do that because dlil does not call our - * protocol's dl_event function for KEV_DL_IF_DETACHING. 
- */ - -static int -vlan_attach_filter(struct ifnet * ifp, u_long * filter_id) -{ - int error; - struct dlil_if_flt_str filt; - - bzero(&filt, sizeof(filt)); - filt.filter_detach = vlan_if_filter_detach; - filt.cookie = (caddr_t)ifp; - error = dlil_attach_interface_filter(ifp, &filt, filter_id, - DLIL_LAST_FILTER); - if (error) { - printf("vlan: dlil_attach_interface_filter(%s%d) failed, %d\n", - ifp->if_name, ifp->if_unit, error); + switch (event->event_code) { + case KEV_DL_IF_DETACHING: + case KEV_DL_LINK_OFF: + case KEV_DL_LINK_ON: + break; + default: + return; } - return (error); -} - -/* - * Function: vlan_detach_filter - * Purpose: - * Remove our interface filter. - */ -static int -vlan_detach_filter(u_long filter_id) -{ - int error; - - error = dlil_detach_filter(filter_id); - if (error) { - printf("vlan: dlil_detach_filter failed, %d\n", error); + vlan_lock(); + if ((p->if_eflags & IFEF_VLAN) == 0) { + vlan_unlock(); + /* no VLAN's */ + return; } - return (error); -} - -/* - * Function: vlan_proto_input - * Purpose: - * This function is never called. We aren't allowed to leave the - * function pointer NULL, so this function simply free's the mbuf. - */ -static int -vlan_proto_input(m, frame_header, ifp, dl_tag, sync_ok) - struct mbuf *m; - char *frame_header; - struct ifnet *ifp; - u_long dl_tag; - int sync_ok; -{ - m_freem(m); - return (EJUSTRETURN); -} - -static struct ifnet * -find_if_name_unit(const char * if_name, int unit) -{ - struct ifnet * ifp; - - TAILQ_FOREACH(ifp, &ifnet, if_link) { - if (strcmp(if_name, ifp->if_name) == 0 && unit == ifp->if_unit) { - return (ifp); - } + vlp = parent_list_lookup(p); + if (vlp == NULL) { + /* no VLAN's */ + vlan_unlock(); + return; + } + switch (event->event_code) { + case KEV_DL_IF_DETACHING: + vlan_parent_flags_set_detaching(vlp); + vlan_parent_remove_all_vlans(vlp); + break; + + case KEV_DL_LINK_OFF: + case KEV_DL_LINK_ON: + vlan_parent_link_event(vlp, event->event_code); + break; + default: + break; } - return (ifp); + vlan_unlock(); + return; } static void @@ -1295,103 +1843,41 @@ interface_link_event(struct ifnet * ifp, u_long event_code) } static void -parent_link_event(struct ifnet * p, u_long event_code) +vlan_parent_link_event(vlan_parent_ref vlp, u_long event_code) { - struct ifvlan * ifv; + ifvlan_ref ifv; - LIST_FOREACH(ifv, &ifv_list, ifv_list) { - if (p == ifv->ifv_p) { - interface_link_event(ifv->ifv_ifp, event_code); - } + LIST_FOREACH(ifv, &vlp->vlp_vlan_list, ifv_vlan_list) { + interface_link_event(ifv->ifv_ifp, event_code); } return; } -/* - * Function: vlan_dl_event - * Purpose: - * Process DLIL events that interest us. Currently, that is - * just the interface UP and DOWN. Ideally, this would also - * include the KEV_DL_IF_DETACH{ING} messages, which would eliminate - * the need for an interface filter. 
- */ -static int -vlan_dl_event(struct kern_event_msg * event, u_long dl_tag) -{ - struct ifnet * p; - struct net_event_data * net_event; - - if (event->vendor_code != KEV_VENDOR_APPLE - || event->kev_class != KEV_NETWORK_CLASS - || event->kev_subclass != KEV_DL_SUBCLASS) { - goto done; - } - net_event = (struct net_event_data *)(event->event_data); - switch (event->event_code) { - case KEV_DL_LINK_OFF: - case KEV_DL_LINK_ON: - p = find_if_name_unit(net_event->if_name, net_event->if_unit); - if (p != NULL) { - parent_link_event(p, event->event_code); - } - break; -#if 0 - case KEV_DL_IF_DETACHING: - case KEV_DL_IF_DETACHED: - /* we don't get these, unfortunately */ - break; -#endif 0 - default: - break; - } - - done: - return (0); -} - /* * Function: vlan_attach_protocol * Purpose: * Attach a DLIL protocol to the interface, using the ETHERTYPE_VLAN - * demux ether type. We're not a real protocol, we'll never receive - * any packets because they're intercepted by ether_demux before - * our input routine would be called. + * demux ether type. * - * The reasons for attaching a protocol to the interface are: - * 1) add a protocol reference to the interface so that the underlying - * interface automatically gets marked up while we're attached - * 2) receive link status events which we can propagate to our - * VLAN interfaces. + * The ethernet demux actually special cases VLAN to support hardware. + * The demux here isn't used. The demux will return PF_VLAN for the + * appropriate packets and our vlan_input function will be called. */ static int vlan_attach_protocol(struct ifnet *ifp) { - struct dlil_demux_desc desc; - u_long dl_tag; - u_short en_native = ETHERTYPE_VLAN; int error; - int i; struct dlil_proto_reg_str reg; - + + bzero(®, sizeof(reg)); TAILQ_INIT(®.demux_desc_head); - desc.type = DLIL_DESC_RAW; - desc.variants.bitmask.proto_id_length = 0; - desc.variants.bitmask.proto_id = 0; - desc.variants.bitmask.proto_id_mask = 0; - desc.native_type = (char *) &en_native; - TAILQ_INSERT_TAIL(®.demux_desc_head, &desc, next); reg.interface_family = ifp->if_family; reg.unit_number = ifp->if_unit; - reg.input = vlan_proto_input; - reg.pre_output = 0; - reg.event = vlan_dl_event; - reg.offer = 0; - reg.ioctl = 0; - reg.default_proto = 0; - reg.protocol_family = VLAN_PROTO_FAMILY; - - error = dlil_attach_protocol(®, &dl_tag); + reg.input = vlan_input; + reg.event = vlan_event; + reg.protocol_family = PF_VLAN; + error = dlil_attach_protocol(®); if (error) { printf("vlan_proto_attach(%s%d) dlil_attach_protocol failed, %d\n", ifp->if_name, ifp->if_unit, error); @@ -1407,21 +1893,14 @@ vlan_attach_protocol(struct ifnet *ifp) static int vlan_detach_protocol(struct ifnet *ifp) { - u_long dl_tag; int error; - error = dlil_find_dltag(ifp->if_family, ifp->if_unit, - VLAN_PROTO_FAMILY, &dl_tag); + error = dlil_detach_protocol(ifp, PF_VLAN); if (error) { - printf("vlan_proto_detach(%s%d) dlil_find_dltag failed, %d\n", + printf("vlan_proto_detach(%s%d) dlil_detach_protocol failed, %d\n", ifp->if_name, ifp->if_unit, error); - } else { - error = dlil_detach_protocol(dl_tag); - if (error) { - printf("vlan_proto_detach(%s%d) dlil_detach_protocol failed, %d\n", - ifp->if_name, ifp->if_unit, error); - } } + return (error); } @@ -1435,41 +1914,36 @@ vlan_detach_protocol(struct ifnet *ifp) extern int ether_add_if(struct ifnet *ifp); extern int ether_del_if(struct ifnet *ifp); extern int ether_init_if(struct ifnet *ifp); -extern int ether_add_proto(struct ddesc_head_str *desc_head, - struct if_proto *proto, u_long 
dl_tag); -extern int ether_del_proto(struct if_proto *proto, u_long dl_tag); -extern int ether_ifmod_ioctl(struct ifnet *ifp, u_long command, - caddr_t data); -extern int ether_del_proto(struct if_proto *proto, u_long dl_tag); -extern int ether_add_proto(struct ddesc_head_str *desc_head, struct if_proto *proto, u_long dl_tag); - -extern int ether_attach_inet(struct ifnet *ifp, u_long *dl_tag); -extern int ether_detach_inet(struct ifnet *ifp, u_long dl_tag); -extern int ether_attach_inet6(struct ifnet *ifp, u_long *dl_tag); -extern int ether_detach_inet6(struct ifnet *ifp, u_long dl_tag); +extern int ether_add_proto_old(struct ifnet *ifp, u_long protocol_family, + struct ddesc_head_str *desc_head); + +extern int ether_attach_inet(struct ifnet *ifp, u_long protocol_family); +extern int ether_detach_inet(struct ifnet *ifp, u_long protocol_family); +extern int ether_attach_inet6(struct ifnet *ifp, u_long protocol_family); +extern int ether_detach_inet6(struct ifnet *ifp, u_long protocol_family); static int -vlan_attach_inet(struct ifnet *ifp, u_long *dl_tag) +vlan_attach_inet(struct ifnet *ifp, u_long protocol_family) { - return (ether_attach_inet(ifp, dl_tag)); + return (ether_attach_inet(ifp, protocol_family)); } static int -vlan_detach_inet(struct ifnet *ifp, u_long dl_tag) +vlan_detach_inet(struct ifnet *ifp, u_long protocol_family) { - return (ether_detach_inet(ifp, dl_tag)); + return (ether_detach_inet(ifp, protocol_family)); } static int -vlan_attach_inet6(struct ifnet *ifp, u_long *dl_tag) +vlan_attach_inet6(struct ifnet *ifp, u_long protocol_family) { - return (ether_attach_inet6(ifp, dl_tag)); + return (ether_attach_inet6(ifp, protocol_family)); } static int -vlan_detach_inet6(struct ifnet *ifp, u_long dl_tag) +vlan_detach_inet6(struct ifnet *ifp, u_long protocol_family) { - return (ether_detach_inet6(ifp, dl_tag)); + return (ether_detach_inet6(ifp, protocol_family)); } static int @@ -1484,38 +1958,21 @@ vlan_del_if(struct ifnet *ifp) return (ether_del_if(ifp)); } -static int -vlan_init_if(struct ifnet *ifp) -{ - return (0); -} - -static int -vlan_shutdown() -{ - return 0; -} __private_extern__ int -vlan_family_init() +vlan_family_init(void) { - int i, error=0; + int error=0; struct dlil_ifmod_reg_str ifmod_reg; - struct dlil_protomod_reg_str vlan_protoreg; - -#if 0 - /* VLAN family is built-in, called from ether_family_init */ - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); -#endif 0 - + bzero(&ifmod_reg, sizeof(ifmod_reg)); ifmod_reg.add_if = vlan_add_if; ifmod_reg.del_if = vlan_del_if; - ifmod_reg.init_if = vlan_init_if; - ifmod_reg.add_proto = ether_add_proto; + ifmod_reg.init_if = NULL; + ifmod_reg.add_proto = ether_add_proto_old; ifmod_reg.del_proto = ether_del_proto; - ifmod_reg.ifmod_ioctl = ether_ifmod_ioctl; - ifmod_reg.shutdown = vlan_shutdown; + ifmod_reg.ifmod_ioctl = ether_ioctl; + ifmod_reg.shutdown = NULL; if (dlil_reg_if_modules(APPLE_IF_FAM_VLAN, &ifmod_reg)) { printf("WARNING: vlan_family_init -- " @@ -1524,31 +1981,22 @@ vlan_family_init() goto done; } - /* Register protocol registration functions */ - bzero(&vlan_protoreg, sizeof(vlan_protoreg)); - vlan_protoreg.attach_proto = vlan_attach_inet; - vlan_protoreg.detach_proto = vlan_detach_inet; - - if (error = dlil_reg_proto_module(PF_INET, APPLE_IF_FAM_VLAN, - &vlan_protoreg) != 0) { - kprintf("dlil_reg_proto_module failed for AF_INET6 error=%d\n", - error); + error = dlil_reg_proto_module(PF_INET, APPLE_IF_FAM_VLAN, + vlan_attach_inet, vlan_detach_inet); + if (error != 0) { + printf("dlil_reg_proto_module 
failed for AF_INET error=%d\n", + error); goto done; } - vlan_protoreg.attach_proto = vlan_attach_inet6; - vlan_protoreg.detach_proto = vlan_detach_inet6; - - if (error = dlil_reg_proto_module(PF_INET6, APPLE_IF_FAM_VLAN, - &vlan_protoreg) != 0) { - kprintf("dlil_reg_proto_module failed for AF_INET6 error=%d\n", - error); + error = dlil_reg_proto_module(PF_INET6, APPLE_IF_FAM_VLAN, + vlan_attach_inet6, vlan_detach_inet6); + if (error != 0) { + printf("dlil_reg_proto_module failed for AF_INET6 error=%d\n", + error); goto done; } vlan_clone_attach(); done: -#if 0 - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); -#endif 0 return (error); } diff --git a/bsd/net/if_vlan_var.h b/bsd/net/if_vlan_var.h index 3588b9dd2..6772ddac8 100644 --- a/bsd/net/if_vlan_var.h +++ b/bsd/net/if_vlan_var.h @@ -53,6 +53,7 @@ #ifndef _NET_IF_VLAN_VAR_H_ #define _NET_IF_VLAN_VAR_H_ 1 +#define ETHER_VLAN_ENCAP_LEN 4 /* len of 802.1Q VLAN encapsulation */ struct ether_vlan_header { u_char evl_dhost[ETHER_ADDR_LEN]; u_char evl_shost[ETHER_ADDR_LEN]; @@ -79,4 +80,7 @@ struct vlanreq { u_short vlr_tag; }; +#ifdef KERNEL_PRIVATE +int vlan_family_init(void); +#endif KERNEL_PRIVATE #endif /* _NET_IF_VLAN_VAR_H_ */ diff --git a/bsd/net/init.c b/bsd/net/init.c new file mode 100644 index 000000000..82c2882df --- /dev/null +++ b/bsd/net/init.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include <kern/kalloc.h> +#include <libkern/OSAtomic.h> +#include <sys/errno.h> +#include <net/init.h> +#include <libkern/libkern.h> +#include <string.h> + +struct init_list_entry { + struct init_list_entry *next; + net_init_func_ptr func; +}; + +#define LIST_RAN ((struct init_list_entry*)0xffffffff) +static struct init_list_entry *list_head = 0; + +errno_t +net_init_add( + net_init_func_ptr init_func) +{ + struct init_list_entry *entry; + + if (init_func == 0) { + return EINVAL; + } + + /* Check if we've already started */ + if (list_head == LIST_RAN) { + return EALREADY; + } + + entry = kalloc(sizeof(*entry)); + if (entry == 0) { + printf("net_init_add: no memory\n"); + return ENOMEM; + } + + bzero(entry, sizeof(*entry)); + entry->func = init_func; + + do { + entry->next = list_head; + + if (entry->next == LIST_RAN) { + /* List already ran, cleanup and call the function */ + kfree(entry, sizeof(*entry)); + return EALREADY; + } + } while(!OSCompareAndSwap((UInt32)entry->next, (UInt32)entry, + (UInt32*)&list_head)); + + return 0; +} + +__private_extern__ void +net_init_run(void) +{ + struct init_list_entry *backward_head = 0; + struct init_list_entry *forward_head = 0; + struct init_list_entry *current = 0; + + /* + * Grab the list, replacing the head with 0xffffffff to indicate + * that we've already run. + */ + do { + backward_head = list_head; + } while (!OSCompareAndSwap((UInt32)backward_head, (UInt32)LIST_RAN, + (UInt32*)&list_head)); + + /* Reverse the order of the list */ + while (backward_head != 0) { + current = backward_head; + backward_head = current->next; + current->next = forward_head; + forward_head = current; + } + + /* Call each function pointer registered */ + while (forward_head != 0) { + current = forward_head; + forward_head = current->next; + current->func(); + kfree(current, sizeof(*current)); + } +} diff --git a/bsd/net/init.h b/bsd/net/init.h new file mode 100644 index 000000000..fc3ad5120 --- /dev/null +++ b/bsd/net/init.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +/*! + @header init.h + This header defines an API to register a function that will be called when + the network stack is being initialized. This gives a kernel extensions an + opportunity to install filters before sockets are created and network + operations occur. + */ +#ifndef _NET_INIT_H_ +#define _NET_INIT_H_ +#include <sys/kernel_types.h> + +/*! + @typedef net_init_func_ptr + @discussion net_init_func_ptr will be called once the networking stack + initialized and before network operations occur. 
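+	For example, a kext that wants its socket filters installed before
+	any socket exists might register a callback (my_filter_init is a
+	hypothetical function belonging to the kext):
+
+	    static void my_filter_init(void);
+
+	    errno_t err = net_init_add(my_filter_init);
+	    if (err == EALREADY)
+	        my_filter_init();	(stack already ran; invoke directly)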
+ */ +typedef void (*net_init_func_ptr)(void); + +/*! + @function net_init_add + @discussion Add a function to be called during network initialization. Your + kext must not unload until the function you register is called if + net_init_add returns success. + @param init_func A pointer to a function to be called when the stack is + initialized. + @result EINVAL - the init_func value was NULL. + EALREADY - the network has already been initialized + ENOMEM - there was not enough memory to perform this operation + 0 - success + */ +errno_t net_init_add(net_init_func_ptr init_func); + +#ifdef BSD_KERNEL_PRIVATE +/* net_init_run is called from bsd_init */ +extern void net_init_run(void); +#endif /* BSD_KERNEL_PRIVATE */ + +#endif /* _NET_INIT_H_ */ diff --git a/bsd/net/iso88025.h b/bsd/net/iso88025.h index e1d1a68dc..243499354 100644 --- a/bsd/net/iso88025.h +++ b/bsd/net/iso88025.h @@ -42,8 +42,6 @@ #ifndef _NET_ISO88025_H_ #define _NET_ISO88025_H_ -#include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE /* * General ISO 802.5 definitions @@ -116,13 +114,4 @@ struct iso88025_addr { #define ISO88025_DEFAULT_MTU 1500 #define senderr(e) { error = (e); goto bad;} -#ifndef __APPLE__ -/* Not implemented in Darwin */ -void iso88025_ifattach __P((struct ifnet *)); -int iso88025_ioctl __P((struct ifnet *, int , caddr_t )); -int iso88025_output __P((struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *)); -void iso88025_input __P((struct ifnet *, struct iso88025_header *, struct mbuf *)); -#endif - -#endif /* __APPLE_API_PRIVATE */ #endif diff --git a/bsd/net/kext_net.c b/bsd/net/kext_net.c index fe79fa1c5..3acfce5de 100644 --- a/bsd/net/kext_net.c +++ b/bsd/net/kext_net.c @@ -96,7 +96,8 @@ register_sockfilter(struct NFDescriptor *nfp, struct NFDescriptor *nfp1, return(0); } -unregister_sockfilter(struct NFDescriptor *nfp, struct protosw *pr, int flags) +int +unregister_sockfilter(struct NFDescriptor *nfp, struct protosw *pr, __unused int flags) { int s; s = splhigh(); @@ -129,7 +130,7 @@ find_nke(unsigned int handle) */ int nke_insert(struct socket *so, struct so_nke *np) -{ int s, error; +{ struct kextcb *kp, *kp1; struct NFDescriptor *nf1, *nf2 = NULL; diff --git a/bsd/net/kext_net.h b/bsd/net/kext_net.h index ebe88e6c3..f81b5e93f 100644 --- a/bsd/net/kext_net.h +++ b/bsd/net/kext_net.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -19,77 +19,73 @@ * * @APPLE_LICENSE_HEADER_END@ */ -/* Copyright (C) 1999 Apple Computer, Inc. */ + /* - * Support for network filter kernel extensions - * Justin C. Walker, 990319 + * Support for socket filter kernel extensions */ + #ifndef NET_KEXT_NET_H #define NET_KEXT_NET_H + #include <sys/appleapiopts.h> #include <sys/queue.h> -#include <sys/socketvar.h> +#include <sys/cdefs.h> -struct mbuf; -struct socket; -struct uio; -struct sockbuf; -struct sockaddr; -struct kextcb; -struct protosw; -struct sockif; -struct sockutil; -struct sockopt; +#ifdef BSD_KERNEL_PRIVATE -#ifdef __APPLE_API_UNSTABLE +#include <sys/kpi_socketfilter.h> /* - * This structure gives access to the functionality of the filter. - * The kextcb provides the link from the socket structure. 
+ * Internal implementation bits */ -struct NFDescriptor -{ TAILQ_ENTRY(NFDescriptor) nf_next; /* protosw chain */ - TAILQ_ENTRY(NFDescriptor) nf_list; /* descriptor list */ - unsigned int nf_handle; /* Identifier */ - int nf_flags; - /* Dispatch for PF_FILTER control */ - int (*nf_connect)(); /* Make contact */ - void (*nf_disconnect)(); /* Break contact */ - int (*nf_read)(); /* Get data from filter */ - int (*nf_write)(); /* Send data to filter */ - int (*nf_get)(); /* Get filter config */ - int (*nf_set)(); /* Set filter config */ - /* - * Socket function dispatch vectors - copied to kextcb - * during socreate() - */ - struct sockif *nf_soif; /* Socket functions */ - struct sockutil *nf_soutil; /* Sockbuf utility functions */ - u_long reserved[4]; /* for future use if needed */ + +struct socket_filter; + +#define SFEF_DETACHING 0x1 + +struct socket_filter_entry { + struct socket_filter_entry *sfe_next_onsocket; + struct socket_filter_entry *sfe_next_onfilter; + + struct socket_filter *sfe_filter; + struct socket *sfe_socket; + void *sfe_cookie; + + u_int32_t sfe_flags; }; -#define NFD_GLOBAL 0x01 -#define NFD_PROG 0x02 -#define NFD_VISIBLE 0x80000000 +#define SFF_DETACHING 0x1 -#define NFF_BEFORE 0x01 -#define NFF_AFTER 0x02 +struct socket_filter { + TAILQ_ENTRY(socket_filter) sf_protosw_next; + TAILQ_ENTRY(socket_filter) sf_global_next; + struct socket_filter_entry *sf_entry_head; + + struct protosw *sf_proto; + struct sflt_filter sf_filter; + u_int32_t sf_flags; + u_int32_t sf_usecount; +}; -#ifdef KERNEL -/* How to register: filter, insert location, target protosw, flags */ -extern int register_sockfilter(struct NFDescriptor *, - struct NFDescriptor *, - struct protosw *, int); -/* How to unregister: filter, original protosw, flags */ -extern int unregister_sockfilter(struct NFDescriptor *, struct protosw *, int); +TAILQ_HEAD(socket_filter_list, socket_filter); -#ifdef __APPLE_API_PRIVATE -TAILQ_HEAD(nf_list, NFDescriptor); +/* Private, internal implementation functions */ +void sflt_init(void); +void sflt_initsock(struct socket *so); +void sflt_termsock(struct socket *so); +void sflt_use(struct socket *so); +void sflt_unuse(struct socket *so); +void sflt_notify(struct socket *so, sflt_event_t event, void *param); +int sflt_data_in(struct socket *so, const struct sockaddr *from, mbuf_t *data, + mbuf_t *control, sflt_data_flag_t flags); +int sflt_attach_private(struct socket *so, struct socket_filter *filter, sflt_handle handle, int locked); +void sflt_detach_private(struct socket_filter_entry *entry, int filter_detached); -extern struct nf_list nf_list; -#endif /* __APPLE_API_PRIVATE */ -#endif +#endif /* BSD_KERNEL_PRIVATE */ + +#define NFF_BEFORE 0x01 +#define NFF_AFTER 0x02 #define NKE_OK 0 #define NKE_REMOVE -1 @@ -102,6 +98,10 @@ extern struct nf_list nf_list; * the 'where' NKE. If the latter is NULL, the flags indicate "first" * or "last" */ +#if __DARWIN_ALIGN_POWER +#pragma options align=power +#endif + struct so_nke { unsigned int nke_handle; unsigned int nke_where; @@ -109,102 +109,9 @@ struct so_nke unsigned long reserved[4]; /* for future use */ }; -/* - * sockif: - * Contains socket interface: - * dispatch vector abstracting the interface between protocols and - * the socket layer. 
- * TODO: add sf_sosense() - */ -struct sockif -{ int (*sf_soabort)(struct socket *, struct kextcb *); - int (*sf_soaccept)(struct socket *, struct sockaddr **, - struct kextcb *); - int (*sf_sobind)(struct socket *, struct sockaddr *, struct kextcb *); - int (*sf_soclose)(struct socket *, struct kextcb *); - int (*sf_soconnect)(struct socket *, struct sockaddr *, - struct kextcb *); - int (*sf_soconnect2)(struct socket *, struct socket *, - struct kextcb *); - int (*sf_socontrol)(struct socket *, struct sockopt *, - struct kextcb *); - int (*sf_socreate)(struct socket *, struct protosw *, struct kextcb *); - int (*sf_sodisconnect)(struct socket *, struct kextcb *); - int (*sf_sofree)(struct socket *, struct kextcb *); - int (*sf_sogetopt)(struct socket *, int, int, struct mbuf **, - struct kextcb *); - int (*sf_sohasoutofband)(struct socket *, struct kextcb *); - int (*sf_solisten)(struct socket *, struct kextcb *); - int (*sf_soreceive)(struct socket *, struct sockaddr **, struct uio **, - struct mbuf **, struct mbuf **, int *, - struct kextcb *); - int (*sf_sorflush)(struct socket *, struct kextcb *); - int (*sf_sosend)(struct socket *, struct sockaddr **, struct uio **, - struct mbuf **, struct mbuf **, int *, - struct kextcb *); - int (*sf_sosetopt)(struct socket *, int, int, struct mbuf *, - struct kextcb *); - int (*sf_soshutdown)(struct socket *, int, struct kextcb *); - /* Calls sorwakeup() */ - int (*sf_socantrcvmore)(struct socket *, struct kextcb *); - /* Calls sowwakeup() */ - int (*sf_socantsendmore)(struct socket *, struct kextcb *); - /* Calls soqinsque(), sorwakeup(), sowwakeup() */ - int (*sf_soisconnected)(struct socket *, struct kextcb *); - int (*sf_soisconnecting)(struct socket *, struct kextcb *); - /* Calls sowwakeup(), sorwakeup() */ - int (*sf_soisdisconnected)(struct socket *, struct kextcb *); - /* Calls sowwakeup(), sorwakeup() */ - int (*sf_soisdisconnecting)(struct socket *, struct kextcb *); - /* Calls soreserve(), soqinsque(), soqremque(), sorwakeup() */ - int (*sf_sonewconn)(struct socket *, int, struct kextcb *); - int (*sf_soqinsque)(struct socket *, struct socket *, int, - struct kextcb *); - int (*sf_soqremque)(struct socket *, int, struct kextcb *); - int (*sf_soreserve)(struct socket *, u_long, u_long, struct kextcb *); - int (*sf_sowakeup)(struct socket *, struct sockbuf *, - struct kextcb *); - u_long reserved[4]; -}; - +#if __DARWIN_ALIGN_POWER +#pragma options align=reset +#endif -/* - * sockutil: - * Contains the utility functions for socket layer access - */ -struct sockutil -{ /* Sleeps if locked */ - int (*su_sb_lock)(struct sockbuf *, struct kextcb *); - /* Conditionally calls sbappendrecord, Calls sbcompress */ - int (*su_sbappend)(struct sockbuf *, struct mbuf *, struct kextcb *); - /* Calls sbspace(), sballoc() */ - int (*su_sbappendaddr)(struct sockbuf *, struct sockaddr *, - struct mbuf *, struct mbuf *, struct kextcb *); - /* Calls sbspace(), sballoc() */ - int (*su_sbappendcontrol)(struct sockbuf *, struct mbuf *, - struct mbuf *, struct kextcb *); - /* Calls sballoc(), sbcompress() */ - int (*su_sbappendrecord)(struct sockbuf *, struct mbuf *, - struct kextcb *); - /* Calls sballoc() */ - int (*su_sbcompress)(struct sockbuf *, struct mbuf *, struct mbuf *, - struct kextcb *); - /* Calls sbfree() */ - int (*su_sbdrop)(struct sockbuf *, int, struct kextcb *); - /* Calls sbfree() */ - int (*su_sbdroprecord)(struct sockbuf *, struct kextcb *); - /* Calls sbdrop() */ - int (*su_sbflush)(struct sockbuf *, struct kextcb *); - /* Calls 
sballoc(), sbcompress() */ - int (*su_sbinsertoob)(struct sockbuf *, struct mbuf *, - struct kextcb *); - /* Calls sbflush() */ - int (*su_sbrelease)(struct sockbuf *, struct kextcb *); - int (*su_sbreserve)(struct sockbuf *, u_long, struct kextcb *); - /* Calls tsleep() */ - int (*su_sbwait)(struct sockbuf *, struct kextcb *); - u_long reserved[4]; -}; -#endif /* __APPLE_API_UNSTABLE */ +#endif /* NET_KEXT_NET_H */ -#endif diff --git a/bsd/net/kpi_interface.c b/bsd/net/kpi_interface.c new file mode 100644 index 000000000..a5d64adac --- /dev/null +++ b/bsd/net/kpi_interface.c @@ -0,0 +1,1355 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include "kpi_interface.h" + +#include <sys/queue.h> +#include <sys/param.h> /* for definition of NULL */ +#include <sys/errno.h> +#include <sys/socket.h> +#include <sys/kern_event.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/kpi_mbuf.h> +#include <net/if_var.h> +#include <net/if_dl.h> +#include <net/dlil.h> +#include <net/if_types.h> +#include <net/if_dl.h> +#include <net/if_arp.h> +#include <libkern/libkern.h> +#include <kern/locks.h> + +#if IF_LASTCHANGEUPTIME +#define TOUCHLASTCHANGE(__if_lastchange) microuptime(__if_lastchange) +#else +#define TOUCHLASTCHANGE(__if_lastchange) microtime(__if_lastchange) +#endif + +extern lck_spin_t *dlil_input_lock; + +/* + Temporary work around until we have real reference counting + + We keep the bits about calling dlil_if_release (which should be + called recycle) transparent by calling it from our if_free function + pointer. We have to keep the client's original detach function + somewhere so we can call it. 
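+
+	Concretely, the free path implemented just below runs:
+	if_free -> ifnet_kpi_free -> client detach function (stashed in
+	if_kpi_storage) -> free any malloc'd broadcast address ->
+	dlil_if_release.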
+ */ +static void +ifnet_kpi_free( + ifnet_t ifp) +{ + ifnet_detached_func detach_func = ifp->if_kpi_storage; + + if (detach_func) + detach_func(ifp); + + if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) { + FREE(ifp->if_broadcast.u.ptr, M_IFADDR); + ifp->if_broadcast.u.ptr = NULL; + } + + dlil_if_release(ifp); +} + +errno_t +ifnet_allocate( + const struct ifnet_init_params *init, + ifnet_t *interface) +{ + int error; + struct ifnet *ifp = NULL; + + if (init->family == 0) + return EINVAL; + if (init->name == NULL || + init->output == NULL) + return EINVAL; + if (strlen(init->name) >= IFNAMSIZ) + return EINVAL; + if ((init->type & 0xFFFFFF00) != 0 || init->type == 0) + return EINVAL; + + error = dlil_if_acquire(init->family, init->uniqueid, init->uniqueid_len, &ifp); + if (error == 0) + { + strncpy(ifp->if_name, init->name, IFNAMSIZ); + ifp->if_type = init->type; + ifp->if_family = init->family; + ifp->if_unit = init->unit; + ifp->if_output = init->output; + ifp->if_demux = init->demux; + ifp->if_add_proto_u.kpi = init->add_proto; + ifp->if_del_proto = init->del_proto; + ifp->if_check_multi = init->check_multi; + ifp->if_framer = init->framer; + ifp->if_softc = init->softc; + ifp->if_ioctl = init->ioctl; + ifp->if_set_bpf_tap = init->set_bpf_tap; + ifp->if_free = ifnet_kpi_free; + ifp->if_event = init->event; + ifp->if_kpi_storage = init->detach; + ifp->if_eflags |= IFEF_USEKPI; + + if (init->broadcast_len && init->broadcast_addr) { + if (init->broadcast_len > sizeof(ifp->if_broadcast.u.buffer)) { + MALLOC(ifp->if_broadcast.u.ptr, u_char*, init->broadcast_len, M_IFADDR, M_NOWAIT); + if (ifp->if_broadcast.u.ptr == NULL) { + error = ENOMEM; + } + else { + bcopy(init->broadcast_addr, ifp->if_broadcast.u.ptr, init->broadcast_len); + } + } + else { + bcopy(init->broadcast_addr, ifp->if_broadcast.u.buffer, init->broadcast_len); + } + ifp->if_broadcast.length = init->broadcast_len; + } + else { + bzero(&ifp->if_broadcast, sizeof(ifp->if_broadcast)); + } + + if (error == 0) { + *interface = ifp; + ifnet_reference(ifp); // temporary - this should be done in dlil_if_acquire + } + else { + dlil_if_release(ifp); + *interface = 0; + } + } + + /* + Note: We should do something here to indicate that we haven't been + attached yet. By doing so, we can catch the case in ifnet_release + where the reference count reaches zero and call the recycle + function. If the interface is attached, the interface will be + recycled when the interface's if_free function is called. If the + interface is never attached, the if_free function will never be + called and the interface will never be recycled. 
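+
+	A minimal allocation sketch under the current scheme, where
+	my_output, my_demux and my_detach are hypothetical client
+	functions (my_detach being what ifnet_kpi_free above will call):
+
+	    struct ifnet_init_params init;
+	    ifnet_t ifp;
+	    bzero(&init, sizeof(init));
+	    init.name = "foo"; init.unit = 0;
+	    init.family = IFNET_FAMILY_ETHERNET; init.type = IFT_ETHER;
+	    init.output = my_output; init.demux = my_demux;
+	    init.detach = my_detach;
+	    if (ifnet_allocate(&init, &ifp) == 0)
+	        ifnet_attach(ifp, ll_addr);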
+ */ + + return error; +} + +errno_t +ifnet_reference( + ifnet_t interface) +{ + if (interface == NULL) return EINVAL; + ifp_reference(interface); + return 0; +} + +errno_t +ifnet_release( + ifnet_t interface) +{ + if (interface == NULL) return EINVAL; + ifp_release(interface); + return 0; +} + +errno_t +ifnet_attach( + ifnet_t interface, + const struct sockaddr_dl *ll_addr) +{ + if (interface == NULL) return EINVAL; + if (ll_addr && interface->if_addrlen == 0) { + interface->if_addrlen = ll_addr->sdl_alen; + } + else if (ll_addr && ll_addr->sdl_alen != interface->if_addrlen) { + return EINVAL; + } + return dlil_if_attach_with_address(interface, ll_addr); +} + +errno_t +ifnet_detach( + ifnet_t interface) +{ + errno_t error; + + if (interface == NULL) return EINVAL; + + error = dlil_if_detach(interface); + if (error == DLIL_WAIT_FOR_FREE) error = 0; /* Client should always wait for detach */ + + return error; +} + +void* +ifnet_softc( + ifnet_t interface) +{ + return interface == NULL ? NULL : interface->if_softc; +} + +const char* +ifnet_name( + ifnet_t interface) +{ + return interface == NULL ? NULL : interface->if_name; +} + +ifnet_family_t +ifnet_family( + ifnet_t interface) +{ + return interface == NULL ? 0 : interface->if_family; +} + +u_int32_t +ifnet_unit( + ifnet_t interface) +{ + return interface == NULL ? (u_int32_t)0xffffffff : (u_int32_t)interface->if_unit; +} + +u_int32_t +ifnet_index( + ifnet_t interface) +{ + return interface == NULL ? (u_int32_t)0xffffffff : interface->if_index; +} + +errno_t +ifnet_set_flags( + ifnet_t interface, + u_int16_t new_flags, + u_int16_t mask) +{ + int lock; + + if (interface == NULL) return EINVAL; + lock = (interface->if_lock != 0); + + if (lock) ifnet_lock_exclusive(interface); + + /* If we are modifying the up/down state, call if_updown */ + if (lock && (mask & IFF_UP) != 0) { + if_updown(interface, (new_flags & IFF_UP) == IFF_UP); + } + + interface->if_flags = (new_flags & mask) | (interface->if_flags & ~mask); + if (lock) ifnet_lock_done(interface); + + return 0; +} + +u_int16_t +ifnet_flags( + ifnet_t interface) +{ + return interface == NULL ? 0 : interface->if_flags; +} + +errno_t +ifnet_set_eflags( + ifnet_t interface, + u_int32_t new_flags, + u_int32_t mask) +{ + int lock; + + if (interface == NULL) return EINVAL; + lock = (interface->if_lock != 0); + + if (lock) ifnet_lock_exclusive(interface); + interface->if_eflags = (new_flags & mask) | (interface->if_eflags & ~mask); + if (lock) ifnet_lock_done(interface); + + return 0; +} + +u_int32_t +ifnet_eflags( + ifnet_t interface) +{ + return interface == NULL ? 0 : interface->if_eflags; +} + +static const ifnet_offload_t offload_mask = IFNET_CSUM_IP | IFNET_CSUM_TCP | + IFNET_CSUM_UDP | IFNET_CSUM_FRAGMENT | IFNET_IP_FRAGMENT | + IFNET_CSUM_SUM16 | IFNET_VLAN_TAGGING | IFNET_VLAN_MTU; + +errno_t +ifnet_set_offload( + ifnet_t interface, + ifnet_offload_t offload) +{ + int lock; + + if (interface == NULL) return EINVAL; + lock = (interface->if_lock != 0); + + if (lock) ifnet_lock_exclusive(interface); + interface->if_hwassist = (offload & offload_mask); + if (lock) ifnet_lock_done(interface); + + return 0; +} + +ifnet_offload_t +ifnet_offload( + ifnet_t interface) +{ + return interface == NULL ? 0 : (interface->if_hwassist & offload_mask); +} + +/* + * Should MIB data store a copy? 
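+
+	(As implemented below it does not: ifnet_set_link_mib_data keeps
+	the caller's pointer rather than copying, so the caller must keep
+	mibData valid, and at least mibLen bytes long, for as long as it
+	remains installed.)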
+ */ +errno_t +ifnet_set_link_mib_data( + ifnet_t interface, + void* mibData, + u_int32_t mibLen) +{ + int lock; + + if (interface == NULL) return EINVAL; + lock = (interface->if_lock != 0); + + if (lock) ifnet_lock_exclusive(interface); + interface->if_linkmib = (void*)mibData; + interface->if_linkmiblen = mibLen; + if (lock) ifnet_lock_done(interface); + return 0; +} + +errno_t +ifnet_get_link_mib_data( + ifnet_t interface, + void *mibData, + u_int32_t *mibLen) +{ + errno_t result = 0; + int lock; + + if (interface == NULL) return EINVAL; + lock = (interface->if_lock != NULL); + + if (lock) ifnet_lock_shared(interface); + if (*mibLen < interface->if_linkmiblen) + result = EMSGSIZE; + if (result == 0 && interface->if_linkmib == NULL) + result = ENOTSUP; + + if (result == 0) { + *mibLen = interface->if_linkmiblen; + bcopy(interface->if_linkmib, mibData, *mibLen); + } + if (lock) ifnet_lock_done(interface); + + return result; +} + +u_int32_t +ifnet_get_link_mib_data_length( + ifnet_t interface) +{ + return interface == NULL ? 0 : interface->if_linkmiblen; +} + +errno_t +ifnet_attach_protocol( + ifnet_t interface, + protocol_family_t protocol, + const struct ifnet_attach_proto_param *proto_details) +{ + if (interface == NULL || protocol == 0 || proto_details == NULL) + return EINVAL; + return dlil_attach_protocol_kpi(interface, protocol, proto_details); +} + +errno_t +ifnet_detach_protocol( + ifnet_t interface, + protocol_family_t protocol) +{ + if (interface == NULL || protocol == 0) return EINVAL; + return dlil_detach_protocol(interface, protocol); +} + +errno_t +ifnet_output( + ifnet_t interface, + protocol_family_t protocol_family, + mbuf_t m, + void *route, + const struct sockaddr *dest) +{ + if (interface == NULL || protocol_family == 0 || m == NULL) { + if (m) + mbuf_freem_list(m); + return EINVAL; + } + return dlil_output(interface, protocol_family, m, route, dest, 0); +} + +errno_t +ifnet_output_raw( + ifnet_t interface, + protocol_family_t protocol_family, + mbuf_t m) +{ + if (interface == NULL || protocol_family == 0 || m == NULL) { + if (m) + mbuf_freem_list(m); + return EINVAL; + } + return dlil_output(interface, protocol_family, m, NULL, NULL, 1); +} + +errno_t +ifnet_input( + ifnet_t interface, + mbuf_t first_packet, + const struct ifnet_stat_increment_param *stats) +{ + mbuf_t last_packet = first_packet; + + if (interface == NULL || first_packet == NULL) { + if (first_packet) + mbuf_freem_list(first_packet); + return EINVAL; + } + + while (mbuf_nextpkt(last_packet) != NULL) + last_packet = mbuf_nextpkt(last_packet); + return dlil_input_with_stats(interface, first_packet, last_packet, stats); +} + +errno_t +ifnet_ioctl( + ifnet_t interface, + protocol_family_t protocol_family, + u_int32_t ioctl_code, + void *ioctl_arg) +{ + if (interface == NULL || protocol_family == 0 || ioctl_code == 0) + return EINVAL; + return dlil_ioctl(protocol_family, interface, + ioctl_code, ioctl_arg); +} + +errno_t +ifnet_event( + ifnet_t interface, + struct kern_event_msg* event_ptr) +{ + if (interface == NULL || event_ptr == NULL) return EINVAL; + return dlil_event(interface, event_ptr); +} + +errno_t +ifnet_set_mtu( + ifnet_t interface, + u_int32_t mtu) +{ + if (interface == NULL) return EINVAL; + interface->if_data.ifi_mtu = mtu; + return 0; +} + +u_int32_t +ifnet_mtu( + ifnet_t interface) +{ + u_int32_t retval; + retval = interface == NULL ? 0 : interface->if_data.ifi_mtu; + return retval; +} + +u_char +ifnet_type( + ifnet_t interface) +{ + u_char retval; + + retval = interface == NULL ? 
0 : interface->if_data.ifi_type; + return retval; +} + +#if 0 +errno_t +ifnet_set_typelen( + ifnet_t interface, + u_char typelen) +{ + int lock = (interface->if_lock != 0); + if (lock) ifnet_lock_exclusive(interface); + interface->if_data.ifi_typelen = typelen; + if (lock) ifnet_lock_done(interface); + return 0; +} + +u_char +ifnet_typelen( + ifnet_t interface) +{ + u_char retval; + retval = interface == NULL ? 0 : interface->if_data.ifi_typelen; + return retval; +} +#endif + +errno_t +ifnet_set_addrlen( + ifnet_t interface, + u_char addrlen) +{ + if (interface == NULL) return EINVAL; + interface->if_data.ifi_addrlen = addrlen; + return 0; +} + +u_char +ifnet_addrlen( + ifnet_t interface) +{ + u_char retval; + retval = interface == NULL ? 0 : interface->if_data.ifi_addrlen; + return retval; +} + +errno_t +ifnet_set_hdrlen( + ifnet_t interface, + u_char hdrlen) +{ + if (interface == NULL) return EINVAL; + interface->if_data.ifi_hdrlen = hdrlen; + return 0; +} + +u_char +ifnet_hdrlen( + ifnet_t interface) +{ + u_char retval; + retval = interface == NULL ? 0 : interface->if_data.ifi_hdrlen; + return retval; +} + +errno_t +ifnet_set_metric( + ifnet_t interface, + u_int32_t metric) +{ + if (interface == NULL) return EINVAL; + interface->if_data.ifi_metric = metric; + return 0; +} + +u_int32_t +ifnet_metric( + ifnet_t interface) +{ + u_int32_t retval; + retval = interface == NULL ? 0 : interface->if_data.ifi_metric; + return retval; +} + +errno_t +ifnet_set_baudrate( + ifnet_t interface, + u_int64_t baudrate) +{ + if (interface == NULL) return EINVAL; + /* Pin baudrate to 32 bits until we can change the storage size */ + interface->if_data.ifi_baudrate = baudrate > 0xFFFFFFFF ? 0xFFFFFFFF : baudrate; + return 0; +} + +u_int64_t +ifnet_baudrate( + ifnet_t interface) +{ + u_int64_t retval; + retval = interface == NULL ? 0 : interface->if_data.ifi_baudrate; + return retval; +} + +errno_t +ifnet_stat_increment( + ifnet_t interface, + const struct ifnet_stat_increment_param *counts) +{ + if (interface == NULL) return EINVAL; + + lck_spin_lock(dlil_input_lock); + + interface->if_data.ifi_ipackets += counts->packets_in; + interface->if_data.ifi_ibytes += counts->bytes_in; + interface->if_data.ifi_ierrors += counts->errors_in; + + interface->if_data.ifi_opackets += counts->packets_out; + interface->if_data.ifi_obytes += counts->bytes_out; + interface->if_data.ifi_oerrors += counts->errors_out; + + interface->if_data.ifi_collisions += counts->collisions; + interface->if_data.ifi_iqdrops += counts->dropped; + + /* Touch the last change time. 
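	   All counter updates in this function run under dlil_input_lock,
	   the same lock ifnet_stat() takes to read them, so readers see a
	   consistent snapshot.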
*/ + TOUCHLASTCHANGE(&interface->if_lastchange); + + lck_spin_unlock(dlil_input_lock); + + return 0; +} + +errno_t +ifnet_stat_increment_in( + ifnet_t interface, + u_int32_t packets_in, + u_int32_t bytes_in, + u_int32_t errors_in) +{ + if (interface == NULL) return EINVAL; + + lck_spin_lock(dlil_input_lock); + + interface->if_data.ifi_ipackets += packets_in; + interface->if_data.ifi_ibytes += bytes_in; + interface->if_data.ifi_ierrors += errors_in; + + TOUCHLASTCHANGE(&interface->if_lastchange); + + lck_spin_unlock(dlil_input_lock); + + return 0; +} + +errno_t +ifnet_stat_increment_out( + ifnet_t interface, + u_int32_t packets_out, + u_int32_t bytes_out, + u_int32_t errors_out) +{ + if (interface == NULL) return EINVAL; + + lck_spin_lock(dlil_input_lock); + + interface->if_data.ifi_opackets += packets_out; + interface->if_data.ifi_obytes += bytes_out; + interface->if_data.ifi_oerrors += errors_out; + + TOUCHLASTCHANGE(&interface->if_lastchange); + + lck_spin_unlock(dlil_input_lock); + + return 0; +} + +errno_t +ifnet_set_stat( + ifnet_t interface, + const struct ifnet_stats_param *stats) +{ + if (interface == NULL) return EINVAL; + + lck_spin_lock(dlil_input_lock); + + interface->if_data.ifi_ipackets = stats->packets_in; + interface->if_data.ifi_ibytes = stats->bytes_in; + interface->if_data.ifi_imcasts = stats->multicasts_in; + interface->if_data.ifi_ierrors = stats->errors_in; + + interface->if_data.ifi_opackets = stats->packets_out; + interface->if_data.ifi_obytes = stats->bytes_out; + interface->if_data.ifi_omcasts = stats->multicasts_out; + interface->if_data.ifi_oerrors = stats->errors_out; + + interface->if_data.ifi_collisions = stats->collisions; + interface->if_data.ifi_iqdrops = stats->dropped; + interface->if_data.ifi_noproto = stats->no_protocol; + + /* Touch the last change time. 
*/ + TOUCHLASTCHANGE(&interface->if_lastchange); + + lck_spin_unlock(dlil_input_lock); + + return 0; +} + +errno_t +ifnet_stat( + ifnet_t interface, + struct ifnet_stats_param *stats) +{ + if (interface == NULL) return EINVAL; + + lck_spin_lock(dlil_input_lock); + + stats->packets_in = interface->if_data.ifi_ipackets; + stats->bytes_in = interface->if_data.ifi_ibytes; + stats->multicasts_in = interface->if_data.ifi_imcasts; + stats->errors_in = interface->if_data.ifi_ierrors; + + stats->packets_out = interface->if_data.ifi_opackets; + stats->bytes_out = interface->if_data.ifi_obytes; + stats->multicasts_out = interface->if_data.ifi_omcasts; + stats->errors_out = interface->if_data.ifi_oerrors; + + stats->collisions = interface->if_data.ifi_collisions; + stats->dropped = interface->if_data.ifi_iqdrops; + stats->no_protocol = interface->if_data.ifi_noproto; + + lck_spin_unlock(dlil_input_lock); + + return 0; +} + +errno_t +ifnet_touch_lastchange( + ifnet_t interface) +{ + if (interface == NULL) return EINVAL; + + lck_spin_lock(dlil_input_lock); + TOUCHLASTCHANGE(&interface->if_lastchange); + lck_spin_unlock(dlil_input_lock); + + return 0; +} + +errno_t +ifnet_lastchange( + ifnet_t interface, + struct timeval *last_change) +{ + if (interface == NULL) return EINVAL; + + lck_spin_lock(dlil_input_lock); + *last_change = interface->if_data.ifi_lastchange; + lck_spin_unlock(dlil_input_lock); + +#if IF_LASTCHANGEUPTIME + /* Crude conversion from uptime to calendar time */ + last_change->tv_sec += boottime_sec(); +#endif + + return 0; +} + +errno_t +ifnet_get_address_list( + ifnet_t interface, + ifaddr_t **addresses) +{ + if (interface == NULL || addresses == NULL) return EINVAL; + return ifnet_get_address_list_family(interface, addresses, 0); +} + +errno_t +ifnet_get_address_list_family( + ifnet_t interface, + ifaddr_t **addresses, + sa_family_t family) +{ + struct ifnet *ifp; + int count = 0; + int cmax = 0; + + if (interface == NULL || addresses == NULL) return EINVAL; + *addresses = NULL; + + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet, if_link) + { + if (interface && ifp != interface) continue; + + ifnet_lock_shared(ifp); + if ((ifp->if_eflags & IFEF_DETACHING) == 0) { + if (interface == NULL || interface == ifp) + { + struct ifaddr *addr; + TAILQ_FOREACH(addr, &ifp->if_addrhead, ifa_link) + { + if (family == 0 || addr->ifa_addr->sa_family == family) + cmax++; + } + } + } + else if (interface != NULL) { + ifnet_lock_done(ifp); + ifnet_head_done(); + return ENXIO; + } + ifnet_lock_done(ifp); + } + + MALLOC(*addresses, ifaddr_t*, sizeof(ifaddr_t) * (cmax + 1), M_TEMP, M_NOWAIT); + if (*addresses == NULL) { + ifnet_head_done(); + return ENOMEM; + } + + TAILQ_FOREACH(ifp, &ifnet, if_link) + { + if (interface && ifp != interface) continue; + + ifnet_lock_shared(ifp); + if ((ifp->if_eflags & IFEF_DETACHING) == 0) { + if (interface == NULL || (struct ifnet*)interface == ifp) + { + struct ifaddr *addr; + TAILQ_FOREACH(addr, &ifp->if_addrhead, ifa_link) + { + if (count + 1 > cmax) break; + if (family == 0 || addr->ifa_addr->sa_family == family) { + (*addresses)[count] = (ifaddr_t)addr; + ifaddr_reference((*addresses)[count]); + count++; + } + } + } + } + ifnet_lock_done(ifp); + if (interface || count == cmax) + break; + } + ifnet_head_done(); + (*addresses)[cmax] = 0; + + return 0; +} + +void +ifnet_free_address_list( + ifaddr_t *addresses) +{ + int i; + + if (addresses == NULL) return; + + for (i = 0; addresses[i] != NULL; i++) + { + ifaddr_release(addresses[i]); + } + + FREE(addresses, 
M_TEMP); +} + +void* +ifnet_lladdr( + ifnet_t interface) +{ + if (interface == NULL) return NULL; + return LLADDR(SDL(interface->if_addrhead.tqh_first->ifa_addr)); +} + +errno_t +ifnet_llbroadcast_copy_bytes( + ifnet_t interface, + void *addr, + size_t buffer_len, + size_t *out_len) +{ + if (interface == NULL || addr == NULL || out_len == NULL) return EINVAL; + + *out_len = interface->if_broadcast.length; + + if (buffer_len < interface->if_broadcast.length) { + return EMSGSIZE; + } + + if (interface->if_broadcast.length == 0) + return ENXIO; + + if (interface->if_broadcast.length <= sizeof(interface->if_broadcast.u.buffer)) { + bcopy(interface->if_broadcast.u.buffer, addr, interface->if_broadcast.length); + } + else { + bcopy(interface->if_broadcast.u.ptr, addr, interface->if_broadcast.length); + } + + return 0; +} + +errno_t +ifnet_lladdr_copy_bytes( + ifnet_t interface, + void* lladdr, + size_t lladdr_len) +{ + struct sockaddr_dl *sdl; + if (interface == NULL || lladdr == NULL) return EINVAL; + + sdl = SDL(interface->if_addrhead.tqh_first->ifa_addr); + + while (1) { + if (lladdr_len != sdl->sdl_alen) { + bzero(lladdr, lladdr_len); + return EMSGSIZE; + } + bcopy(LLADDR(sdl), lladdr, lladdr_len); + if (bcmp(lladdr, LLADDR(sdl), lladdr_len) == 0 && + lladdr_len == sdl->sdl_alen) + break; + } + return 0; +} + +static errno_t +ifnet_set_lladdr_internal( + ifnet_t interface, + const void *lladdr, + size_t lladdr_len, + u_char new_type, + int apply_type) +{ + struct ifaddr *ifa; + struct sockaddr_dl *sdl; + errno_t error = 0; + + if (interface == NULL) return EINVAL; + + if (lladdr_len != 0 && (lladdr_len != interface->if_addrlen || lladdr == 0)) + return EINVAL; + + ifnet_head_lock_shared(); + ifa = ifnet_addrs[interface->if_index - 1]; + if (ifa != NULL) { + sdl = (struct sockaddr_dl*)ifa->ifa_addr; + if (lladdr_len != 0) { + bcopy(lladdr, LLADDR(sdl), lladdr_len); + } + else { + bzero(LLADDR(sdl), interface->if_addrlen); + } + sdl->sdl_alen = lladdr_len; + + if (apply_type) { + sdl->sdl_type = new_type; + } + } + else { + error = ENXIO; + } + ifnet_head_done(); + + /* Generate a kernel event */ + if (error == 0) { + dlil_post_msg(interface, KEV_DL_SUBCLASS, + KEV_DL_LINK_ADDRESS_CHANGED, NULL, 0); + } + + return error; +} + +errno_t +ifnet_set_lladdr( + ifnet_t interface, + const void* lladdr, + size_t lladdr_len) +{ + return ifnet_set_lladdr_internal(interface, lladdr, lladdr_len, 0, 0); +} + +errno_t +ifnet_set_lladdr_and_type( + ifnet_t interface, + const void* lladdr, + size_t lladdr_len, + u_char type) +{ + return ifnet_set_lladdr_internal(interface, lladdr, lladdr_len, type, 1); +} + +errno_t +ifnet_add_multicast( + ifnet_t interface, + const struct sockaddr *maddr, + ifmultiaddr_t *address) +{ + if (interface == NULL || maddr == NULL) return EINVAL; + return if_addmulti(interface, maddr, address); +} + +errno_t +ifnet_remove_multicast( + ifmultiaddr_t address) +{ + if (address == NULL) return EINVAL; + return if_delmultiaddr(address, 0); +} + +errno_t ifnet_get_multicast_list(ifnet_t interface, ifmultiaddr_t **addresses) +{ + int count = 0; + int cmax = 0; + struct ifmultiaddr *addr; + int lock; + + if (interface == NULL || addresses == NULL) + return EINVAL; + + lock = (interface->if_lock != 0); + if (lock) ifnet_lock_shared(interface); + if ((interface->if_eflags & IFEF_DETACHING) == 0) { + LIST_FOREACH(addr, &interface->if_multiaddrs, ifma_link) + { + cmax++; + } + } + else { + if (lock) ifnet_lock_done(interface); + return ENXIO; + } + + MALLOC(*addresses, ifmultiaddr_t*, 
sizeof(ifmultiaddr_t) * (cmax + 1), M_TEMP, M_NOWAIT);
+	if (*addresses == NULL) {
+		/* Release the interface lock before bailing out on allocation failure */
+		if (lock) ifnet_lock_done(interface);
+		return ENOMEM;
+	}
+
+	LIST_FOREACH(addr, &interface->if_multiaddrs, ifma_link)
+	{
+		if (count + 1 > cmax)
+			break;
+		(*addresses)[count] = (ifmultiaddr_t)addr;
+		ifmaddr_reference((*addresses)[count]);
+		count++;
+	}
+	(*addresses)[cmax] = 0;
+	if (lock) ifnet_lock_done(interface);
+
+	return 0;
+}
+
+void
+ifnet_free_multicast_list(
+	ifmultiaddr_t *addresses)
+{
+	int i;
+
+	if (addresses == NULL) return;
+
+	for (i = 0; addresses[i] != NULL; i++)
+	{
+		ifmaddr_release(addresses[i]);
+	}
+
+	FREE(addresses, M_TEMP);
+}
+
+errno_t
+ifnet_find_by_name(
+	const char *ifname,
+	ifnet_t *interface)
+{
+	struct ifnet *ifp;
+	int namelen;
+
+	if (ifname == NULL || interface == NULL) return EINVAL;
+
+	namelen = strlen(ifname);
+
+	*interface = NULL;
+
+	ifnet_head_lock_shared();
+	TAILQ_FOREACH(ifp, &ifnet, if_link)
+	{
+		struct sockaddr_dl *ll_addr =
+			(struct sockaddr_dl *)ifnet_addrs[ifp->if_index - 1]->ifa_addr;
+		if ((ifp->if_eflags & IFEF_DETACHING) == 0 &&
+			namelen == ll_addr->sdl_nlen &&
+			(strncmp(ll_addr->sdl_data, ifname, ll_addr->sdl_nlen) == 0))
+		{
+			break;
+		}
+	}
+	if (ifp) {
+		*interface = ifp;
+		ifnet_reference(*interface);
+	}
+	ifnet_head_done();
+
+	return (ifp == NULL) ? ENXIO : 0;
+}
+
+errno_t
+ifnet_list_get(
+	ifnet_family_t family,
+	ifnet_t **list,
+	u_int32_t *count)
+{
+	struct ifnet *ifp;
+	u_int32_t cmax = 0;
+	errno_t result = 0;
+
+	if (list == NULL || count == NULL) return EINVAL;
+	*count = 0;
+
+	ifnet_head_lock_shared();
+	TAILQ_FOREACH(ifp, &ifnet, if_link)
+	{
+		if (ifp->if_eflags & IFEF_DETACHING) continue;
+		if (family == 0 || ifp->if_family == family)
+			cmax++;
+	}
+
+	if (cmax == 0)
+		result = ENXIO;
+
+	if (result == 0) {
+		MALLOC(*list, ifnet_t*, sizeof(ifnet_t) * (cmax + 1), M_TEMP, M_NOWAIT);
+		if (*list == NULL)
+			result = ENOMEM;
+	}
+
+	if (result == 0) {
+		TAILQ_FOREACH(ifp, &ifnet, if_link)
+		{
+			if (ifp->if_eflags & IFEF_DETACHING) continue;
+			if (*count + 1 > cmax) break;
+			if (family == 0 || ((ifnet_family_t)ifp->if_family) == family)
+			{
+				(*list)[*count] = (ifnet_t)ifp;
+				ifnet_reference((*list)[*count]);
+				(*count)++;
+			}
+		}
+		(*list)[*count] = NULL;
+	}
+	ifnet_head_done();
+
+	/* Propagate ENXIO/ENOMEM rather than unconditionally returning success */
+	return result;
+}
+
+void
+ifnet_list_free(
+	ifnet_t *interfaces)
+{
+	int i;
+
+	if (interfaces == NULL) return;
+
+	for (i = 0; interfaces[i]; i++)
+	{
+		ifnet_release(interfaces[i]);
+	}
+
+	FREE(interfaces, M_TEMP);
+}
+
+/****************************************************************************/
+/* ifaddr_t accessors */
+/****************************************************************************/
+
+errno_t
+ifaddr_reference(
+	ifaddr_t ifa)
+{
+	if (ifa == NULL) return EINVAL;
+	ifaref(ifa);
+	return 0;
+}
+
+errno_t
+ifaddr_release(
+	ifaddr_t ifa)
+{
+	if (ifa == NULL) return EINVAL;
+	ifafree(ifa);
+	return 0;
+}
+
+sa_family_t
+ifaddr_address_family(
+	ifaddr_t ifa)
+{
+	if (ifa && ifa->ifa_addr)
+		return ifa->ifa_addr->sa_family;
+
+	return 0;
+}
+
+errno_t
+ifaddr_address(
+	ifaddr_t ifa,
+	struct sockaddr *out_addr,
+	u_int32_t addr_size)
+{
+	u_int32_t copylen;
+
+	if (ifa == NULL || out_addr == NULL) return EINVAL;
+	if (ifa->ifa_addr == NULL) return ENOTSUP;
+
+	copylen = (addr_size >= ifa->ifa_addr->sa_len) ?
ifa->ifa_addr->sa_len : addr_size; + bcopy(ifa->ifa_addr, out_addr, copylen); + + if (ifa->ifa_addr->sa_len > addr_size) return EMSGSIZE; + + return 0; +} + +errno_t +ifaddr_dstaddress( + ifaddr_t ifa, + struct sockaddr *out_addr, + u_int32_t addr_size) +{ + u_int32_t copylen; + if (ifa == NULL || out_addr == NULL) return EINVAL; + if (ifa->ifa_dstaddr == NULL) return ENOTSUP; + + copylen = (addr_size >= ifa->ifa_dstaddr->sa_len) ? ifa->ifa_dstaddr->sa_len : addr_size; + bcopy(ifa->ifa_dstaddr, out_addr, copylen); + + if (ifa->ifa_dstaddr->sa_len > addr_size) return EMSGSIZE; + + return 0; +} + +errno_t +ifaddr_netmask( + ifaddr_t ifa, + struct sockaddr *out_addr, + u_int32_t addr_size) +{ + u_int32_t copylen; + if (ifa == NULL || out_addr == NULL) return EINVAL; + if (ifa->ifa_netmask == NULL) return ENOTSUP; + + copylen = addr_size >= ifa->ifa_netmask->sa_len ? ifa->ifa_netmask->sa_len : addr_size; + bcopy(ifa->ifa_netmask, out_addr, copylen); + + if (ifa->ifa_netmask->sa_len > addr_size) return EMSGSIZE; + + return 0; +} + +ifnet_t +ifaddr_ifnet( + ifaddr_t ifa) +{ + struct ifnet *ifp; + if (ifa == NULL) return NULL; + ifp = ifa->ifa_ifp; + + return (ifnet_t)ifp; +} + +ifaddr_t +ifaddr_withaddr( + const struct sockaddr* address) +{ + if (address == NULL) return NULL; + return ifa_ifwithaddr(address); +} + +ifaddr_t +ifaddr_withdstaddr( + const struct sockaddr* address) +{ + if (address == NULL) return NULL; + return ifa_ifwithdstaddr(address); +} + +ifaddr_t +ifaddr_withnet( + const struct sockaddr* net) +{ + if (net == NULL) return NULL; + return ifa_ifwithnet(net); +} + +ifaddr_t +ifaddr_withroute( + int flags, + const struct sockaddr* destination, + const struct sockaddr* gateway) +{ + if (destination == NULL || gateway == NULL) return NULL; + return ifa_ifwithroute(flags, destination, gateway); +} + +ifaddr_t +ifaddr_findbestforaddr( + const struct sockaddr *addr, + ifnet_t interface) +{ + if (addr == NULL || interface == NULL) return NULL; + return ifaof_ifpforaddr(addr, interface); +} + +errno_t +ifmaddr_reference( + ifmultiaddr_t ifmaddr) +{ + if (ifmaddr == NULL) return EINVAL; + ifma_reference(ifmaddr); + return 0; +} + +errno_t +ifmaddr_release( + ifmultiaddr_t ifmaddr) +{ + if (ifmaddr == NULL) return EINVAL; + ifma_release(ifmaddr); + return 0; +} + +errno_t +ifmaddr_address( + ifmultiaddr_t ifmaddr, + struct sockaddr *out_addr, + u_int32_t addr_size) +{ + u_int32_t copylen; + + if (ifmaddr == NULL || out_addr == NULL) return EINVAL; + if (ifmaddr->ifma_addr == NULL) return ENOTSUP; + + copylen = addr_size >= ifmaddr->ifma_addr->sa_len ? ifmaddr->ifma_addr->sa_len : addr_size; + bcopy(ifmaddr->ifma_addr, out_addr, copylen); + + if (ifmaddr->ifma_addr->sa_len > addr_size) return EMSGSIZE; + + return 0; +} + +errno_t +ifmaddr_lladdress( + ifmultiaddr_t ifmaddr, + struct sockaddr *out_addr, + u_int32_t addr_size) +{ + if (ifmaddr == NULL || out_addr == NULL) return EINVAL; + if (ifmaddr->ifma_ll == NULL) return ENOTSUP; + + return ifmaddr_address(ifmaddr->ifma_ll, out_addr, addr_size); +} + +ifnet_t +ifmaddr_ifnet( + ifmultiaddr_t ifmaddr) +{ + if (ifmaddr == NULL || ifmaddr->ifma_ifp == NULL) return NULL; + return ifmaddr->ifma_ifp; +} diff --git a/bsd/net/kpi_interface.h b/bsd/net/kpi_interface.h new file mode 100644 index 000000000..8f09d0985 --- /dev/null +++ b/bsd/net/kpi_interface.h @@ -0,0 +1,1617 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. 
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+/*!
+ @header kpi_interface.h
+ This header defines an API to interact with network interfaces in
+ the kernel. The network interface KPI may be used to implement
+ network interfaces or to attach protocols to existing interfaces.
+ */
+
+#ifndef __KPI_INTERFACE__
+#define __KPI_INTERFACE__
+#include <sys/kernel_types.h>
+
+#ifndef _SA_FAMILY_T
+#define _SA_FAMILY_T
+typedef __uint8_t sa_family_t;
+#endif
+
+struct timeval;
+struct sockaddr;
+struct sockaddr_dl;
+struct kern_event_msg;
+struct kev_msg;
+struct ifnet_demux_desc;
+
+/*!
+ @enum Interface Families
+ @abstract Constants defining interface families.
+ @constant IFNET_FAMILY_ANY Match interface of any family type.
+ @constant IFNET_FAMILY_LOOPBACK A software loopback interface.
+ @constant IFNET_FAMILY_ETHERNET An Ethernet interface.
+ @constant IFNET_FAMILY_SLIP A SLIP interface.
+ @constant IFNET_FAMILY_TUN A tunnel interface.
+ @constant IFNET_FAMILY_VLAN A virtual LAN interface.
+ @constant IFNET_FAMILY_PPP A PPP interface.
+ @constant IFNET_FAMILY_PVC A PVC interface.
+ @constant IFNET_FAMILY_DISC A DISC interface.
+ @constant IFNET_FAMILY_MDECAP An MDECAP interface.
+ @constant IFNET_FAMILY_GIF A generic tunnel interface.
+ @constant IFNET_FAMILY_FAITH A FAITH (IPv4/IPv6 translation) interface.
+ @constant IFNET_FAMILY_STF A 6to4 interface.
+ @constant IFNET_FAMILY_FIREWIRE An IEEE 1394 (firewire) interface.
+ @constant IFNET_FAMILY_BOND A virtual bonded interface.
+*/
+
+enum {
+ IFNET_FAMILY_ANY = 0,
+ IFNET_FAMILY_LOOPBACK = 1,
+ IFNET_FAMILY_ETHERNET = 2,
+ IFNET_FAMILY_SLIP = 3,
+ IFNET_FAMILY_TUN = 4,
+ IFNET_FAMILY_VLAN = 5,
+ IFNET_FAMILY_PPP = 6,
+ IFNET_FAMILY_PVC = 7,
+ IFNET_FAMILY_DISC = 8,
+ IFNET_FAMILY_MDECAP = 9,
+ IFNET_FAMILY_GIF = 10,
+ IFNET_FAMILY_FAITH = 11,
+ IFNET_FAMILY_STF = 12,
+ IFNET_FAMILY_FIREWIRE = 13,
+ IFNET_FAMILY_BOND = 14
+};
+/*!
+ @typedef ifnet_family_t
+ @abstract Storage type for the interface family.
+*/
+typedef u_int32_t ifnet_family_t;
+
+/*!
+ @enum BPF tap mode
+ @abstract Constants defining the BPF tap modes.
+ @constant BPF_MODE_DISABLED Disable bpf.
+ @constant BPF_MODE_INPUT Enable input only.
+ @constant BPF_MODE_OUTPUT Enable output only.
+ @constant BPF_MODE_INPUT_OUTPUT Enable input and output.
+*/
+
+enum {
+ BPF_MODE_DISABLED = 0,
+ BPF_MODE_INPUT = 1,
+ BPF_MODE_OUTPUT = 2,
+ BPF_MODE_INPUT_OUTPUT = 3
+};
+/*!
+ @typedef bpf_tap_mode
+ @abstract The mode for tapping: BPF_MODE_DISABLED, BPF_MODE_INPUT,
+ BPF_MODE_OUTPUT, or BPF_MODE_INPUT_OUTPUT.
+*/
+typedef u_int32_t bpf_tap_mode;
+
+/*!
+ @typedef protocol_family_t
+ @abstract Storage type for the protocol family.
+*/
+typedef u_int32_t protocol_family_t;
+
+/*!
+ @enum Interface Abilities
+ @abstract Constants defining interface offload support.
+ @constant IFNET_CSUM_IP Hardware will calculate IPv4 checksums.
+ @constant IFNET_CSUM_TCP Hardware will calculate TCP checksums.
+ @constant IFNET_CSUM_UDP Hardware will calculate UDP checksums.
+ @constant IFNET_CSUM_FRAGMENT Hardware will checksum IP fragments.
+ @constant IFNET_IP_FRAGMENT Hardware will fragment IP packets.
+ @constant IFNET_VLAN_TAGGING Hardware will generate VLAN headers.
+ @constant IFNET_VLAN_MTU Hardware supports VLAN MTU.
+*/
+
+enum {
+ IFNET_CSUM_IP = 0x00000001,
+ IFNET_CSUM_TCP = 0x00000002,
+ IFNET_CSUM_UDP = 0x00000004,
+ IFNET_CSUM_FRAGMENT = 0x00000008,
+ IFNET_IP_FRAGMENT = 0x00000010,
+#ifdef KERNEL_PRIVATE
+ IFNET_CSUM_SUM16 = 0x00001000,
+#endif
+ IFNET_VLAN_TAGGING = 0x00010000,
+ IFNET_VLAN_MTU = 0x00020000,
+};
+/*!
+ @typedef ifnet_offload_t
+ @abstract Flags indicating the offload support of the interface.
+*/
+typedef u_int32_t ifnet_offload_t;
+
+/*
+ * Callbacks
+ *
+ * These are function pointers you supply to the kernel in the interface.
+ */
+/*!
+ @typedef bpf_packet_func
+
+ @discussion The bpf_packet_func is used to intercept
+ inbound and outbound packets. The tap function will never free
+ the mbuf. The tap function will only copy the mbuf in to various
+ bpf file descriptors tapping this interface.
+ @param interface The interface being sent or received on.
+ @param data The packet to be transmitted or received.
+ @result An errno value or zero upon success.
+ */
+/* Fast path - do not block or spend excessive amounts of time */
+typedef errno_t (*bpf_packet_func)(ifnet_t interface, mbuf_t data);
+
+/*!
+ @typedef ifnet_output_func
+
+ @discussion ifnet_output_func is used to transmit packets. The stack
+ will pass fully formed packets, including frame header, to the
+ ifnet_output function for an interface. The driver is
+ responsible for freeing the mbuf.
+ @param interface The interface being sent on.
+ @param data The packet to be sent.
+ */
+/* Fast path - do not block or spend excessive amounts of time */
+typedef errno_t (*ifnet_output_func)(ifnet_t interface, mbuf_t data);
+
+/*!
+ @typedef ifnet_ioctl_func
+ @discussion ifnet_ioctl_func is used to communicate ioctls from the
+ stack to the driver.
+ @param interface The interface the ioctl is being sent to.
+ @param cmd The ioctl command.
+ @param data A pointer to any data related to the ioctl.
+ */
+typedef errno_t (*ifnet_ioctl_func)(ifnet_t interface, u_int32_t cmd, void *data);
+
+/*!
+ @typedef ifnet_set_bpf_tap
+ @discussion ifnet_set_bpf_tap is used to set the bpf tap function to
+ be called when packets are sent and/or received.
+ @param interface The interface the bpf tap function is being set on.
+ @param mode Sets the mode of the tap to either disabled, input,
+ output, or input/output.
+ @param callback A function pointer to be called when a packet is
+ sent or received.
+ */
+typedef errno_t (*ifnet_set_bpf_tap)(ifnet_t interface, bpf_tap_mode mode,
+ bpf_packet_func callback);
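As an illustrative aside (not part of the original patch): a minimal sketch of an ifnet_output_func, assuming a hypothetical my_hw_transmit() driver routine.

static errno_t
my_output(ifnet_t interface, mbuf_t data)
{
	/* Hand the fully framed packet to the (hypothetical) hardware.
	 * The driver owns the mbuf: free it here on failure; on success
	 * it would be freed when the hardware completes the transmit. */
	errno_t err = my_hw_transmit(ifnet_softc(interface), data);
	if (err != 0)
		mbuf_freem(data);
	return err;
}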
+
+/*!
+ @typedef ifnet_detached_func
+ @discussion ifnet_detached_func is called when an interface is detached
+ from the list of interfaces. When ifnet_detach is called, it may
+ not detach the interface immediately if protocols are attached.
+ ifnet_detached_func is used to notify the interface that it has
+ been detached from the networking stack. This is the last
+ function that will be called on an interface. Until this
+ function returns, you must not unload a kext supplying function
+ pointers to this interface, even if ifnet_detach has been
+ called. Your detach function may be called during your call to
+ ifnet_detach.
+ @param interface The interface that has been detached.
+ */
+typedef void (*ifnet_detached_func)(ifnet_t interface);
+
+/*!
+ @typedef ifnet_demux_func
+ @discussion ifnet_demux_func is called for each inbound packet to determine
+ which protocol family the packet belongs to. This information is then
+ used by the stack to determine which protocol to pass the packet to.
+ This function may return protocol families for protocols that are
+ not attached. If the protocol family has not been attached to the
+ interface, the packet will be discarded.
+ @param interface The interface the packet was received on.
+ @param packet The mbuf containing the packet.
+ @param frame_header A pointer to the frame header.
+ @param protocol_family Upon return, the protocol family matching the
+ packet should be stored here.
+ @result
+ If the result is zero, processing will continue normally.
+ If the result is EJUSTRETURN, processing will stop but the packet will not be freed.
+ If the result is anything else, the processing will stop and the packet will be freed.
+ */
+typedef errno_t (*ifnet_demux_func)(ifnet_t interface, mbuf_t packet,
+ char *frame_header,
+ protocol_family_t *protocol_family);
+
+/*!
+ @typedef ifnet_event_func
+ @discussion ifnet_event_func is called when an event occurs on a
+ specific interface.
+ @param interface The interface the event occurred on.
+ @param msg Pointer to a kev_msg structure describing the
+ event.
+ */
+typedef void (*ifnet_event_func)(ifnet_t interface, const struct kev_msg *msg);
+
+/*!
+ @typedef ifnet_framer_func
+ @discussion ifnet_framer_func is called for each outbound packet to
+ give the interface an opportunity to prepend interface specific
+ headers.
+ @param interface The interface the packet is being sent on.
+ @param packet Pointer to the mbuf containing the packet, caller may
+ set this to a different mbuf upon return. This can happen if the
+ frameout function needs to prepend another mbuf to the chain to
+ have enough space for the header.
+ @param dest The higher layer protocol destination (i.e. IP address).
+ @param dest_linkaddr The link layer address as determined by the
+ protocol's pre-output function.
+ @param frame_type The frame type as determined by the protocol's
+ pre-output function.
+ @result
+ If the result is zero, processing will continue normally.
+ If the result is EJUSTRETURN, processing will stop but the packet will not be freed.
+ If the result is anything else, the processing will stop and the packet will be freed.
+ */
+typedef errno_t (*ifnet_framer_func)(ifnet_t interface, mbuf_t *packet,
+ const struct sockaddr *dest,
+ const char *dest_linkaddr,
+ const char *frame_type);
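For illustration only: a sketch of an ifnet_demux_func, assuming a hypothetical framing format with a 2-byte type field in network byte order at the start of the frame header. The cast assumes the header is suitably aligned.

static errno_t
my_demux(ifnet_t interface, mbuf_t packet, char *frame_header,
	protocol_family_t *protocol_family)
{
	/* Hypothetical framing: 2-byte type field, network byte order. */
	u_int16_t type = ntohs(*(const u_int16_t *)(const void *)frame_header);

	if (type == 0x0800) {		/* IPv4 */
		*protocol_family = PF_INET;
		return 0;
	}
	/* Unknown type: a non-EJUSTRETURN error causes the packet to be freed. */
	return ENOENT;
}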
+
+/*!
+ @typedef ifnet_add_proto_func
+ @discussion ifnet_add_proto_func is called by the stack when a protocol
+ is attached to an interface. This gives the interface an
+ opportunity to get a list of protocol description structures
+ for demuxing packets to this protocol (demux descriptors).
+ @param interface The interface the protocol will be attached to.
+ @param protocol_family The family of the protocol being attached.
+ @param demux_array An array of demux descriptors that describe
+ the interface specific ways of identifying packets belonging
+ to this protocol family.
+ @param demux_count The number of demux descriptors in the array.
+ @result
+ If the result is zero, processing will continue normally.
+ If the result is anything else, the add protocol will be aborted.
+ */
+typedef errno_t (*ifnet_add_proto_func)(ifnet_t interface,
+ protocol_family_t protocol_family,
+ const struct ifnet_demux_desc *demux_array,
+ u_int32_t demux_count);
+
+/*!
+ @typedef ifnet_del_proto_func
+ @discussion ifnet_del_proto_func is called by the stack when a protocol
+ is being detached from an interface. This gives the interface an
+ opportunity to free any storage related to this specific
+ protocol being attached to this interface.
+ @param interface The interface the protocol will be detached from.
+ @param protocol_family The family of the protocol being detached.
+ @result
+ If the result is zero, processing will continue normally.
+ If the result is anything else, the detach will continue
+ and the error will be returned to the caller.
+ */
+typedef errno_t (*ifnet_del_proto_func)(ifnet_t interface,
+ protocol_family_t protocol_family);
+
+/*!
+ @typedef ifnet_check_multi
+ @discussion ifnet_check_multi is called for each multicast address
+ added to an interface. This gives the interface an opportunity
+ to reject invalid multicast addresses before they are attached
+ to the interface.
+
+ To prevent an address from being added to your multicast list,
+ return EADDRNOTAVAIL. If you don't know how to parse/translate
+ the address, return EOPNOTSUPP.
+ @param interface The interface.
+ @param mcast The multicast address.
+ @result
+ Zero upon success, EADDRNOTAVAIL on invalid multicast,
+ EOPNOTSUPP for addresses the interface does not understand.
+ */
+typedef errno_t (*ifnet_check_multi)(ifnet_t interface,
+ const struct sockaddr* mcast);
+
+/*!
+ @typedef proto_media_input
+ @discussion proto_media_input is called for all inbound packets for
+ a specific protocol on a specific interface. This function is
+ registered on an interface using ifnet_attach_protocol.
+ @param ifp The interface the packet was received on.
+ @param protocol The protocol of the packet received.
+ @param packet The packet being input.
+ @param header The frame header.
+ @result
+ If the result is zero, the caller will assume the packet was passed
+ to the protocol.
+ If the result is non-zero and not EJUSTRETURN, the caller will free
+ the packet.
+ */
+typedef errno_t (*proto_media_input)(ifnet_t ifp, protocol_family_t protocol,
+ mbuf_t packet, char* header);
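A hedged sketch of a proto_media_input handler; my_proto_enqueue() is a hypothetical protocol-level queue, not part of this KPI.

static errno_t
my_proto_input(ifnet_t ifp, protocol_family_t protocol, mbuf_t packet,
	char *header)
{
	/* Hand the packet to a hypothetical protocol queue. */
	if (my_proto_enqueue(packet) != 0) {
		/* Non-zero and not EJUSTRETURN: the caller frees the mbuf. */
		return ENOBUFS;
	}
	return 0;
}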
+
+/*!
+ @typedef proto_media_preout
+ @discussion proto_media_preout is called just before the packet
+ is transmitted. This gives the proto_media_preout function an
+ opportunity to specify the media specific frame type and
+ destination.
+ @param ifp The interface the packet will be sent on.
+ @param protocol The protocol of the packet being sent
+ (PF_INET/etc...).
+ @param packet The packet being sent.
+ @param dest The protocol level destination address.
+ @param route A pointer to the routing structure for the packet.
+ @param frame_type The media specific frame type.
+ @param link_layer_dest The media specific destination.
+ @result
+ If the result is zero, processing will continue normally. If the
+ result is non-zero, processing will stop. If the result is
+ non-zero and not EJUSTRETURN, the packet will be freed by the
+ caller.
+ */
+typedef errno_t (*proto_media_preout)(ifnet_t ifp, protocol_family_t protocol,
+ mbuf_t *packet, const struct sockaddr *dest,
+ void *route, char *frame_type, char *link_layer_dest);
+
+/*!
+ @typedef proto_media_event
+ @discussion proto_media_event is called to notify this layer of
+ interface specific events.
+ @param ifp The interface.
+ @param protocol The protocol family.
+ @param event The event.
+ */
+typedef void (*proto_media_event)(ifnet_t ifp, protocol_family_t protocol,
+ const struct kev_msg *event);
+
+/*!
+ @typedef proto_media_ioctl
+ @discussion proto_media_ioctl allows this layer to handle ioctls.
+ When an ioctl is handled, it is passed to the interface filters,
+ protocol filters, protocol, and interface. If you do not support
+ this ioctl, return EOPNOTSUPP. If you successfully handle the
+ ioctl, return zero. If you return any error other than
+ EOPNOTSUPP, other parts of the stack may not get an opportunity
+ to process the ioctl. If you return EJUSTRETURN, processing will
+ stop and a result of zero will be returned to the caller.
+ @param ifp The interface.
+ @param protocol The protocol family.
+ @param command The ioctl command.
+ @param argument The argument to the ioctl.
+ @result
+ See the discussion.
+ */
+typedef errno_t (*proto_media_ioctl)(ifnet_t ifp, protocol_family_t protocol,
+ u_int32_t command, void* argument);
+
+/*!
+ @typedef proto_media_detached
+ @discussion proto_media_detached notifies you that your protocol
+ has been detached.
+ @param ifp The interface.
+ @param protocol The protocol family.
+ @result
+ See the discussion.
+ */
+typedef errno_t (*proto_media_detached)(ifnet_t ifp, protocol_family_t protocol);
+
+
+/*!
+ @typedef proto_media_resolve_multi
+ @discussion proto_media_resolve_multi is called to resolve a
+ protocol layer multicast address to a link layer multicast
+ address.
+ @param ifp The interface.
+ @param proto_addr The protocol address.
+ @param out_ll A sockaddr_dl to copy the link layer multicast in to.
+ @param ll_len The length of data allocated for out_ll.
+ @result Return zero on success or an errno error value on failure.
+ */
+typedef errno_t (*proto_media_resolve_multi)(ifnet_t ifp,
+ const struct sockaddr *proto_addr,
+ struct sockaddr_dl *out_ll, size_t ll_len);
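A sketch, for illustration, of a proto_media_resolve_multi for IPv4 over an Ethernet-like medium, using the conventional 01:00:5e mapping of the low 23 bits of the group address. Assumes <netinet/in.h>, <net/if_dl.h>, and <net/if_types.h> definitions.

static errno_t
my_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
	struct sockaddr_dl *out_ll, size_t ll_len)
{
	const struct sockaddr_in *sin = (const struct sockaddr_in *)proto_addr;
	u_int32_t group;
	u_char *mac;

	if (proto_addr->sa_family != AF_INET) return EOPNOTSUPP;
	group = ntohl(sin->sin_addr.s_addr);
	if (!IN_MULTICAST(group)) return EADDRNOTAVAIL;
	if (ll_len < sizeof(struct sockaddr_dl)) return EMSGSIZE;

	bzero(out_ll, sizeof(*out_ll));
	out_ll->sdl_len = sizeof(*out_ll);
	out_ll->sdl_family = AF_LINK;
	out_ll->sdl_type = IFT_ETHER;
	out_ll->sdl_alen = 6;
	mac = (u_char *)LLADDR(out_ll);
	mac[0] = 0x01; mac[1] = 0x00; mac[2] = 0x5e;
	mac[3] = (group >> 16) & 0x7f;
	mac[4] = (group >> 8) & 0xff;
	mac[5] = group & 0xff;
	return 0;
}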
+
+/*!
+ @typedef proto_media_send_arp
+ @discussion proto_media_send_arp is called by the stack to generate
+ an ARP packet. This field is currently only used with IP. This
+ function should inspect the parameters and transmit an arp
+ packet using the information passed in.
+ @param ifp The interface the arp packet should be sent on.
+ @param arpop The arp operation (usually ARPOP_REQUEST or
+ ARPOP_REPLY).
+ @param sender_hw The value to use for the sender hardware
+ address field. If this is NULL, use the hardware address
+ of the interface.
+ @param sender_proto The value to use for the sender protocol
+ address field. This will not be NULL.
+ @param target_hw The value to use for the target hardware address.
+ If this is NULL, the target hardware address in the ARP packet
+ should be NULL and the link-layer destination for the packet
+ should be a broadcast. If this is not NULL, this value should be
+ used for both the link-layer destination and the target hardware
+ address.
+ @param target_proto The target protocol address. This will not be
+ NULL.
+ @result Return zero on success or an errno error value on failure.
+ */
+typedef errno_t (*proto_media_send_arp)(ifnet_t ifp,
+ u_short arpop,
+ const struct sockaddr_dl* sender_hw,
+ const struct sockaddr* sender_proto,
+ const struct sockaddr_dl* target_hw,
+ const struct sockaddr* target_proto);
+
+/*!
+ @struct ifnet_stat_increment_param
+ @discussion This structure is used to increment the counters on a
+ network interface.
+ @field packets_in The number of packets received.
+ @field bytes_in The number of bytes received.
+ @field errors_in The number of receive errors.
+ @field packets_out The number of packets transmitted.
+ @field bytes_out The number of bytes transmitted.
+ @field errors_out The number of transmission errors.
+ @field collisions The number of collisions seen by this interface.
+ @field dropped The number of packets dropped.
+*/
+
+struct ifnet_stat_increment_param {
+ u_int32_t packets_in;
+ u_int32_t bytes_in;
+ u_int32_t errors_in;
+
+ u_int32_t packets_out;
+ u_int32_t bytes_out;
+ u_int32_t errors_out;
+
+ u_int32_t collisions;
+ u_int32_t dropped;
+};
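A hedged sketch of how a driver might fill this structure when handing a received packet to the stack via ifnet_input() (declared later in this header); my_deliver_packet is a hypothetical name.

static void
my_deliver_packet(ifnet_t interface, mbuf_t packet)
{
	struct ifnet_stat_increment_param stats;

	bzero(&stats, sizeof(stats));
	stats.packets_in = 1;
	stats.bytes_in = (u_int32_t)mbuf_pkthdr_len(packet);

	/* ifnet_input consumes the mbuf chain and applies the counts;
	 * on error it frees the chain itself. */
	(void)ifnet_input(interface, packet, &stats);
}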
+
+/*!
+ @struct ifnet_init_params
+ @discussion This structure is used to define various properties of
+ the interface when calling ifnet_allocate. A copy of these values
+ will be stored in the ifnet and can not be modified while the
+ interface is attached.
+ @field uniqueid An identifier unique to this instance of the
+ interface.
+ @field uniqueid_len The length, in bytes, of the uniqueid.
+ @field name The interface name (i.e. en).
+ @field unit The interface unit number (en0's unit number is 0).
+ @field family The interface family.
+ @field type The interface type (see sys/if_types.h). Must be less
+ than 256. For new types, use IFT_OTHER.
+ @field output The output function for the interface. Every packet the
+ stack attempts to send through this interface will go out through
+ this function.
+ @field demux The function used to determine the protocol family of an
+ incoming packet.
+ @field add_proto The function used to attach a protocol to this interface.
+ @field del_proto The function used to remove a protocol from this interface.
+ @field framer The function used to frame outbound packets, may be NULL.
+ @field softc Driver specific storage. This value can be retrieved from the
+ ifnet using the ifnet_softc function.
+ @field ioctl The function used to handle ioctls.
+ @field set_bpf_tap The function used to set the bpf_tap function.
+ @field detach The function called to let the driver know the interface has been detached.
+ @field event The function to notify the interface of various interface specific kernel events.
+ @field broadcast_addr The link-layer broadcast address for this interface.
+ @field broadcast_len The length of the link-layer broadcast address.
+*/
+
+struct ifnet_init_params {
+ /* used to match recycled interface */
+ const void* uniqueid; /* optional */
+ u_int32_t uniqueid_len; /* optional */
+
+ /* used to fill out initial values for interface */
+ const char* name; /* required */
+ u_int32_t unit; /* required */
+ ifnet_family_t family; /* required */
+ u_int32_t type; /* required */
+ ifnet_output_func output; /* required */
+ ifnet_demux_func demux; /* required */
+ ifnet_add_proto_func add_proto; /* required */
+ ifnet_del_proto_func del_proto; /* required */
+ ifnet_check_multi check_multi; /* required for non point-to-point interfaces */
+ ifnet_framer_func framer; /* optional */
+ void* softc; /* optional */
+ ifnet_ioctl_func ioctl; /* optional */
+ ifnet_set_bpf_tap set_bpf_tap; /* optional */
+ ifnet_detached_func detach; /* optional */
+ ifnet_event_func event; /* optional */
+ const void *broadcast_addr;/* required for non point-to-point interfaces */
+ u_int32_t broadcast_len; /* required for non point-to-point interfaces */
+};
+
+/*!
+ @struct ifnet_stats_param
+ @discussion This structure is used to get and set the interface
+ statistics.
+ @field packets_in The number of packets received.
+ @field bytes_in The number of bytes received.
+ @field multicasts_in The number of multicast packets received.
+ @field errors_in The number of receive errors.
+ @field packets_out The number of packets transmitted.
+ @field bytes_out The number of bytes transmitted.
+ @field multicasts_out The number of multicast packets transmitted.
+ @field errors_out The number of transmission errors.
+ @field collisions The number of collisions seen by this interface.
+ @field dropped The number of packets dropped.
+ @field no_protocol The number of packets discarded because no
+ protocol claimed them.
+*/
+
+struct ifnet_stats_param {
+ u_int64_t packets_in;
+ u_int64_t bytes_in;
+ u_int64_t multicasts_in;
+ u_int64_t errors_in;
+
+ u_int64_t packets_out;
+ u_int64_t bytes_out;
+ u_int64_t multicasts_out;
+ u_int64_t errors_out;
+
+ u_int64_t collisions;
+ u_int64_t dropped;
+ u_int64_t no_protocol;
+};
+
+/*!
+ @struct ifnet_demux_desc
+ @discussion This structure is used to identify packets that belong to a
+ specific protocol. The types supported are interface specific.
+ Ethernet supports ETHER_DESC_ETYPE2, ETHER_DESC_SAP, and
+ ETHER_DESC_SNAP. The type defines the offset in the packet where
+ the data will be matched as well as context. For example, if
+ ETHER_DESC_SNAP is specified, the only valid datalen is 5, and
+ the 5 bytes will only be matched when the packet header
+ indicates that the packet is a SNAP packet.
+ @field type The type of identifier data (i.e. ETHER_DESC_ETYPE2)
+ @field data A pointer to an entry of type (i.e. pointer to 0x0800).
+ @field datalen The number of bytes of data used to describe the
+ packet.
+*/
+
+struct ifnet_demux_desc {
+ u_int32_t type;
+ void* data;
+ u_int32_t datalen;
+};
+
+/*!
+ @struct ifnet_attach_proto_param
+ @discussion This structure is used to attach a protocol to an
+ interface. This structure provides the various functions for
+ handling operations related to the protocol on the interface as
+ well as information for how to demux packets for this protocol.
+ @field demux_array An array of ifnet_demux_desc structures
+ describing the protocol.
+ @field demux_count The number of entries in the demux_array array.
+ @field input The function to be called for inbound packets.
+ @field pre_output The function to be called for outbound packets.
+ @field event The function to be called for interface events.
+ @field ioctl The function to be called for ioctls.
+ @field detached The function to be called for handling the detach.
+*/
+#ifdef KERNEL_PRIVATE
+#define demux_list demux_array
+#endif /* KERNEL_PRIVATE */
+
+struct ifnet_attach_proto_param {
+ struct ifnet_demux_desc *demux_array; /* interface may/may not require */
+ u_int32_t demux_count; /* interface may/may not require */
+
+ proto_media_input input; /* required */
+ proto_media_preout pre_output; /* required */
+ proto_media_event event; /* optional */
+ proto_media_ioctl ioctl; /* optional */
+ proto_media_detached detached; /* optional */
+ proto_media_resolve_multi resolve; /* optional */
+ proto_media_send_arp send_arp; /* optional */
+};
+
+__BEGIN_DECLS
+
+/*
+ * Ifnet creation and reference counting
+ */
+
+/*!
+ @function ifnet_allocate
+ @discussion Allocate an ifnet_t with an initial refcount of 1. Many
+ parts of the stack do not properly refcount the ifnet_t. In
+ order to avoid freeing the ifnet_t while some parts of the stack
+ may contain a reference to it, the ifnet_ts are only recycled,
+ never freed. A unique id is used to try to recycle the same
+ ifnet_t when allocating an interface. For example, for an
+ ethernet interface, the hardware address of the ethernet card is
+ usually used for the uniqueid. If a PC Card is removed and
+ inserted again, if the ethernet address of the PC card is used,
+ the same ifnet_t will be used for the card the second time it is
+ inserted. In the future, when the ifnet_t is correctly
+ refcounted by all of the stack, the interfaces may be freed and
+ the unique ids ignored.
+ @param init The initial values for the interface. These values can
+ not be changed after the interface has been allocated.
+ @param interface The interface allocated upon success.
+ @result May return ENOMEM if there is insufficient memory or EEXIST
+ if an interface with the same uniqueid and family has already
+ been allocated and is in use.
+ */
+errno_t ifnet_allocate(const struct ifnet_init_params *init, ifnet_t *interface);
+
+/*!
+ @function ifnet_reference
+ @discussion Increment the reference count of the ifnet to assure
+ that it will not go away. The interface must already have at
+ least one reference.
+ @param interface The interface to increment the reference count of.
+ @result May return EINVAL if the interface is not valid.
+ */
+errno_t ifnet_reference(ifnet_t interface);
+
+/*!
+ @function ifnet_release
+ @discussion Release a reference of the ifnet; this may trigger a
+ free if the reference count reaches 0.
+ @param interface The interface to decrement the reference count of
+ and possibly free.
+ @result May return EINVAL if the interface is not valid.
+ */
+errno_t ifnet_release(ifnet_t interface);
+
+/*!
+ @function ifnet_attach
+ @discussion Attaches an interface to the global interface list. The
+ interface must be set up properly before calling attach. The
+ stack will take a reference on the interface and hold it until
+ ifnet_detach is called.
+
+ This function is intended to be called by the driver. A kext
+ must not call this function on an interface the kext does not
+ own.
+ @param interface The interface to attach.
+ @param ll_addr The link layer address of the interface. This is used
+ to fill out the first ifaddr in the list of addresses for the
+ interface. This parameter is not required for interfaces such as
+ PPP that have no link-layer address.
+ @result Will return an error if there is anything wrong with the
+ interface.
+ */
+errno_t ifnet_attach(ifnet_t interface, const struct sockaddr_dl *ll_addr);
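A minimal bring-up sketch, under stated assumptions: my_output/my_demux were sketched earlier, my_add_proto/my_del_proto are hypothetical, and a real non point-to-point driver would also supply check_multi and the other optional callbacks.

static errno_t
my_driver_start(void *softc, const struct sockaddr_dl *mac)
{
	struct ifnet_init_params init;
	ifnet_t ifp;
	errno_t err;

	bzero(&init, sizeof(init));
	init.name = "foo";			/* hypothetical interface name */
	init.unit = 0;
	init.family = IFNET_FAMILY_ETHERNET;
	init.type = IFT_ETHER;			/* from net/if_types.h */
	init.output = my_output;
	init.demux = my_demux;
	init.add_proto = my_add_proto;		/* assumed to exist */
	init.del_proto = my_del_proto;		/* assumed to exist */
	init.softc = softc;
	init.broadcast_addr = "\xff\xff\xff\xff\xff\xff";
	init.broadcast_len = 6;

	err = ifnet_allocate(&init, &ifp);
	if (err == 0) {
		err = ifnet_attach(ifp, mac);
		if (err != 0)
			ifnet_release(ifp);	/* drop the allocate reference on failure */
	}
	return err;
}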
+
+/*!
+ @function ifnet_detach
+ @discussion Detaches the interface.
+
+ Call this to indicate this interface is no longer valid (i.e. PC
+ Card was removed). This function will begin the process of
+ removing knowledge of this interface from the stack.
+
+ The function will return before the interface is detached. The
+ functions you supplied in to the interface may continue to be
+ called. When the detach has been completed, your detached
+ function will be called. Your kext must not unload until the
+ detached function has been called. The interface will be
+ properly freed when the reference count reaches zero.
+
+ An interface may not be attached again. You must call
+ ifnet_allocate to create a new interface to attach.
+
+ This function is intended to be called by the driver. A kext
+ must not call this function on an interface the kext does not
+ own.
+ @param interface The interface to detach.
+ @result 0 on success, otherwise errno error.
+ */
+errno_t ifnet_detach(ifnet_t interface);
+
+/*
+ * Interface manipulation.
+ */
+
+/*!
+ @function ifnet_softc
+ @discussion Returns the driver's private storage on the interface.
+ @param interface Interface to retrieve the storage from.
+ @result Driver's private storage.
+ */
+void* ifnet_softc(ifnet_t interface);
+
+/*!
+ @function ifnet_name
+ @discussion Returns a pointer to the name of the interface.
+ @param interface Interface to retrieve the name from.
+ @result Pointer to the name.
+ */
+const char* ifnet_name(ifnet_t interface);
+
+/*!
+ @function ifnet_family
+ @discussion Returns the family of the interface.
+ @param interface Interface to retrieve the family from.
+ @result Interface family.
+ */
+ifnet_family_t ifnet_family(ifnet_t interface);
+
+/*!
+ @function ifnet_unit
+ @discussion Returns the unit number of the interface.
+ @param interface Interface to retrieve the unit number from.
+ @result Unit number.
+ */
+u_int32_t ifnet_unit(ifnet_t interface);
+
+/*!
+ @function ifnet_index
+ @discussion Returns the index of the interface. This index value
+ will match the index you would find in a sockaddr_dl or using
+ if_nametoindex or if_indextoname in user space. The value of the
+ interface index is undefined for an interface that is not
+ currently attached.
+ @param interface Interface to retrieve the index of.
+ @result Index.
+ */
+u_int32_t ifnet_index(ifnet_t interface);
+
+/*!
+ @function ifnet_set_flags
+ @discussion Sets the interface flags to new_flags. This function
+ lets you specify which flags you want to change using the mask.
+ The kernel will effectively take the lock, then set the
+ interface's flags to (if_flags & ~mask) | (new_flags & mask).
+ @param interface Interface to set the flags on.
+ @param new_flags The new set of flags that should be set. These
+ flags are defined in net/if.h
+ @param mask The mask of flags to be modified.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_set_flags(ifnet_t interface, u_int16_t new_flags, u_int16_t mask);
+
+/*!
+ @function ifnet_flags
+ @discussion Returns the interface flags that are set.
+ @param interface Interface to retrieve the flags from.
+ @result Flags. These flags are defined in net/if.h
+ */
+u_int16_t ifnet_flags(ifnet_t interface);
+
+
+#ifdef KERNEL_PRIVATE
+/*!
+ @function ifnet_set_eflags
+ @discussion Sets the extended interface flags to new_flags. This
+ function lets you specify which flags you want to change using
+ the mask. The kernel will effectively take the lock, then set
+ the interface's extended flags to (if_eflags & ~mask) |
+ (new_flags & mask).
+ @param interface The interface.
+ @param new_flags The new set of flags that should be set. These
+ flags are defined in net/if.h
+ @param mask The mask of flags to be modified.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_set_eflags(ifnet_t interface, u_int32_t new_flags, u_int32_t mask);
+
+/*!
+ @function ifnet_eflags
+ @discussion Returns the extended interface flags that are set.
+ @param interface Interface to retrieve the flags from.
+ @result Extended flags. These flags are defined in net/if.h
+ */
+u_int32_t ifnet_eflags(ifnet_t interface);
+#endif /* KERNEL_PRIVATE */
+
+/*!
+ @function ifnet_set_offload
+ @discussion Sets a bitfield to indicate special hardware offload
+ support provided by the interface such as hardware checksums and
+ VLAN. This replaces the if_hwassist flags field. Any flags
+ unrecognized by the stack will not be set.
+ @param interface The interface.
+ @param offload The new set of flags indicating which offload options
+ the device supports.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_set_offload(ifnet_t interface, ifnet_offload_t offload);
+
+/*!
+ @function ifnet_offload
+ @discussion Returns flags indicating which operations can be
+ offloaded to the interface.
+ @param interface Interface to retrieve the offload from.
+ @result Abilities flags, see ifnet_offload_t.
+ */
+ifnet_offload_t ifnet_offload(ifnet_t interface);
+
+/*!
+ @function ifnet_set_link_mib_data
+ @discussion Sets the mib link data. The ifnet_t will store the
+ pointer you supply and copy mibLen bytes from the pointer
+ whenever the sysctl for getting interface specific MIB data is
+ used. Since the ifnet_t stores a pointer to your data instead of
+ a copy, you may update the data at the address at any time.
+
+ This function is intended to be called by the driver. A kext
+ must not call this function on an interface the kext does not
+ own.
+ @param interface Interface to set the link MIB data on.
+ @param mibData A pointer to the data.
+ @param mibLen Length of data pointed to.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_set_link_mib_data(ifnet_t interface, void *mibData, u_int32_t mibLen);
+
+/*!
+ @function ifnet_get_link_mib_data
+ @discussion Copies the link MIB data in to mibData, up to mibLen
+ bytes. Returns error if the buffer is too small to hold all of
+ the MIB data.
+ @param interface The interface.
+ @param mibData A pointer to space for the mibData to be copied in
+ to.
+ @param mibLen When calling, this should be the size of the buffer
+ passed in mibData. Upon return, this will be the size of data
+ copied in to mibData.
+ @result Returns an error if the buffer size is too small or there is
+ no data.
+ */
+errno_t ifnet_get_link_mib_data(ifnet_t interface, void *mibData, u_int32_t *mibLen);
+
+/*!
+ @function ifnet_get_link_mib_data_length
+ @discussion Retrieve the size of the mib data.
+ @param interface The interface.
+ @result Returns the number of bytes of mib data associated with the
+ interface.
+ */
+u_int32_t ifnet_get_link_mib_data_length(ifnet_t interface);
+
+/*!
+ @function ifnet_attach_protocol
+ @discussion Attaches a protocol to an interface.
+ @param interface The interface.
+ @param protocol_family The protocol family being attached
+ (PF_INET/PF_APPLETALK/etc...).
+ @param proto_details Details of the protocol being attached.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_attach_protocol(ifnet_t interface, protocol_family_t protocol_family,
+ const struct ifnet_attach_proto_param *proto_details);
+
+/*!
+ @function ifnet_detach_protocol
+ @discussion Detaches a protocol from an interface.
+ @param interface The interface.
+ @param protocol_family The protocol family of the protocol to
+ detach.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_detach_protocol(ifnet_t interface, protocol_family_t protocol_family);
+
+/*!
+ @function ifnet_output
+ @discussion Handles an outbound packet on the interface by calling
+ any filters, a protocol preoutput function, the interface framer
+ function, and finally the interface's output function. The
+ protocol_family will be used to apply protocol filters and
+ determine which preoutput function to call. The route and dest
+ parameters will be passed to the preoutput function defined for
+ the attachment of the specified protocol to the specified
+ interface. ifnet_output will free the mbuf chain in the event of
+ an error.
+ @param interface The interface.
+ @param protocol_family The family of the protocol generating this
+ packet (i.e. AF_INET).
+ @param packet The packet to be transmitted.
+ @param route A pointer to a routing structure for this packet. The
+ preoutput function determines whether this value may be NULL or
+ not.
+ @param dest The destination address of protocol_family type. This
+ will be passed to the preoutput function. If the preoutput
+ function does not require this value, you may pass NULL.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_output(ifnet_t interface, protocol_family_t protocol_family, mbuf_t packet,
+ void* route, const struct sockaddr *dest);
+
+/*!
+ @function ifnet_output_raw
+ @discussion Handles an outbound raw packet on the interface by
+ calling any filters followed by the interface's output function.
+ protocol_family may be zero. If the packet is from a specific
+ protocol the protocol_family will be used to apply protocol
+ filters. All interface filters will be applied to the outgoing
+ packet. Processing, such as calling the protocol preoutput and
+ interface framer functions, will be bypassed. The packet will
+ pass through the filters and be sent on the interface as is.
+ ifnet_output_raw will free the packet chain in the event of an
+ error.
+ @param interface The interface.
+ @param protocol_family The family of the protocol generating this
+ packet (i.e. AF_INET).
+ @param packet The fully formed packet to be transmitted.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_output_raw(ifnet_t interface, protocol_family_t protocol_family, mbuf_t packet);
+
+/*!
+ @function ifnet_input
+ @discussion Inputs packets from the interface. The interface's demux
+ will be called to determine the protocol. Once the protocol is
+ determined, the interface filters and protocol filters will be
+ called. From there, the packet will be passed to the registered
+ protocol. If there is an error, the mbuf chain will be freed.
+ @param interface The interface.
+ @param first_packet The first packet in a chain of packets.
+ @param stats Counts to be integrated in to the stats. The interface
+ statistics will be incremented by the amounts specified in
+ stats. This parameter may be NULL.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_input(ifnet_t interface, mbuf_t first_packet,
+ const struct ifnet_stat_increment_param *stats);
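Tying the pieces above together, a hedged sketch of attaching IPv4 to an Ethernet-like interface; my_proto_input was sketched earlier, my_proto_preout is assumed to exist, and the ETHER_DESC_ETYPE2 descriptor is assumed to take the ethertype in network byte order.

static errno_t
my_attach_ip(ifnet_t interface)
{
	struct ifnet_attach_proto_param proto;
	struct ifnet_demux_desc desc;
	u_int16_t etype = htons(0x0800);	/* IPv4 ethertype, assumed network order */

	bzero(&proto, sizeof(proto));
	desc.type = ETHER_DESC_ETYPE2;		/* from the Ethernet headers */
	desc.data = &etype;
	desc.datalen = sizeof(etype);

	proto.demux_array = &desc;
	proto.demux_count = 1;
	proto.input = my_proto_input;
	proto.pre_output = my_proto_preout;	/* assumed to exist */

	return ifnet_attach_protocol(interface, PF_INET, &proto);
}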
+
+/*!
+ @function ifnet_ioctl
+ @discussion Calls the interface's ioctl function with the parameters
+ passed.
+ @param interface The interface.
+ @param protocol The protocol family of the protocol to send the
+ ioctl to (may be zero). Some ioctls apply to a protocol while
+ other ioctls apply to just an interface.
+ @param ioctl_code The ioctl to perform.
+ @param ioctl_arg Any parameters to the ioctl.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_ioctl(ifnet_t interface, protocol_family_t protocol,
+ u_int32_t ioctl_code, void *ioctl_arg);
+
+/*!
+ @function ifnet_event
+ @discussion Calls the interface's event function.
+ @param interface The interface.
+ @param event_ptr Pointer to a kern_event_msg structure describing the
+ event.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_event(ifnet_t interface, struct kern_event_msg* event_ptr);
+
+/*!
+ @function ifnet_set_mtu
+ @discussion Sets the value of the MTU in the interface structure.
+ Calling this function will not notify the driver that the MTU
+ should be changed. Use the appropriate ioctl.
+
+ This function is intended to be called by the driver. A kext
+ must not call this function on an interface the kext does not
+ own.
+ @param interface The interface.
+ @param mtu The new MTU.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_set_mtu(ifnet_t interface, u_int32_t mtu);
+
+/*!
+ @function ifnet_mtu
+ @param interface The interface.
+ @result The MTU.
+ */
+u_int32_t ifnet_mtu(ifnet_t interface);
+
+/*!
+ @function ifnet_type
+ @param interface The interface.
+ @result The type. See net/if_types.h.
+ */
+u_int8_t ifnet_type(ifnet_t interface);
+
+/*!
+ @function ifnet_set_addrlen
+ @discussion
+ This function is intended to be called by the driver. A kext
+ must not call this function on an interface the kext does not
+ own.
+ @param interface The interface.
+ @param addrlen The new address length.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_set_addrlen(ifnet_t interface, u_int8_t addrlen);
+
+/*!
+ @function ifnet_addrlen
+ @param interface The interface.
+ @result The address length.
+ */
+u_int8_t ifnet_addrlen(ifnet_t interface);
+
+/*!
+ @function ifnet_set_hdrlen
+ @discussion
+ This function is intended to be called by the driver. A kext
+ must not call this function on an interface the kext does not
+ own.
+ @param interface The interface.
+ @param hdrlen The new header length.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_set_hdrlen(ifnet_t interface, u_int8_t hdrlen);
+
+/*!
+ @function ifnet_hdrlen
+ @param interface The interface.
+ @result The header length.
+ */
+u_int8_t ifnet_hdrlen(ifnet_t interface);
+
+/*!
+ @function ifnet_set_metric
+ @discussion
+ This function is intended to be called by the driver. A kext
+ must not call this function on an interface the kext does not
+ own.
+ @param interface The interface.
+ @param metric The new metric.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_set_metric(ifnet_t interface, u_int32_t metric);
+
+/*!
+ @function ifnet_metric
+ @param interface The interface.
+ @result The metric.
+ */
+u_int32_t ifnet_metric(ifnet_t interface);
+
+/*!
+ @function ifnet_set_baudrate
+ @discussion
+ This function is intended to be called by the driver. A kext
+ must not call this function on an interface the kext does not
+ own.
+ @param interface The interface.
+ @param baudrate The new baudrate.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_set_baudrate(ifnet_t interface, u_int64_t baudrate);
+
+/*!
+ @function ifnet_baudrate
+ @param interface The interface.
+ @result The baudrate.
+ */
+u_int64_t ifnet_baudrate(ifnet_t interface);
+
+/*!
+ @function ifnet_stat_increment
+ @discussion
+ This function is intended to be called by the driver. A kext
+ must not call this function on an interface the kext does not
+ own.
+ @param interface The interface.
+ @param counts A pointer to a structure containing the amount to
+ increment each counter by. Any counts not appearing in the
+ ifnet_stat_increment_param structure are handled in the stack.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_stat_increment(ifnet_t interface,
+ const struct ifnet_stat_increment_param *counts);
+
+/*!
+ @function ifnet_stat_increment_in
+ @discussion
+ This function is intended to be called by the driver. This
+ function allows a driver to update the inbound interface counts.
+ The most efficient time to update these counts is when calling
+ ifnet_input.
+
+ A lock protects the counts, which makes the increment functions
+ expensive. The increment function will update the lastchanged
+ value.
+ @param interface The interface.
+ @param packets_in The number of additional packets received.
+ @param bytes_in The number of additional bytes received.
+ @param errors_in The number of additional receive errors.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_stat_increment_in(ifnet_t interface,
+ u_int32_t packets_in, u_int32_t bytes_in,
+ u_int32_t errors_in);
+
+/*!
+ @function ifnet_stat_increment_out
+ @discussion
+ This function is intended to be called by the driver. This
+ function allows a driver to update the outbound interface counts.
+
+ A lock protects the counts, which makes the increment functions
+ expensive. The increment function will update the lastchanged
+ value.
+ @param interface The interface.
+ @param packets_out The number of additional packets sent.
+ @param bytes_out The number of additional bytes sent.
+ @param errors_out The number of additional send errors.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_stat_increment_out(ifnet_t interface,
+ u_int32_t packets_out, u_int32_t bytes_out,
+ u_int32_t errors_out);
+
+/*!
+ @function ifnet_set_stat
+ @discussion
+ This function is intended to be called by the driver. A kext
+ must not call this function on an interface the kext does not
+ own.
+
+ The one exception would be the case where a kext wants to zero
+ all of the counters.
+ @param interface The interface.
+ @param stats The new stats values.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_set_stat(ifnet_t interface,
+ const struct ifnet_stats_param *stats);
+
+/*!
+ @function ifnet_stat
+ @param interface The interface.
+ @param out_stats Storage for the values.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_stat(ifnet_t interface,
+ struct ifnet_stats_param *out_stats);
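For illustration: a hedged sketch of a hypothetical transmit-completion handler that updates the outbound counters with ifnet_stat_increment_out before releasing the packet.

static void
my_tx_complete(ifnet_t interface, mbuf_t packet, int hw_error)
{
	/* One packet completed; count it before freeing the mbuf. */
	ifnet_stat_increment_out(interface, 1,
		(u_int32_t)mbuf_pkthdr_len(packet), hw_error ? 1 : 0);
	mbuf_freem(packet);
}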
+
+/*!
+ @function ifnet_set_promiscuous
+ @discussion Enable or disable promiscuous mode on the interface. The
+ interface keeps an internal count of the number of times
+ promiscuous mode has been enabled. Promiscuous mode is only
+ disabled when this count reaches zero. Be sure to disable
+ promiscuous mode only once for every time you enable it.
+ @param interface The interface to toggle promiscuous mode on.
+ @param on If set, the number of promiscuous on requests will be
+ incremented. If this is the first request, promiscuous mode
+ will be enabled. If this is not set, the number of promiscuous
+ clients will be decremented. If this causes the number to reach
+ zero, promiscuous mode will be disabled.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_set_promiscuous(ifnet_t interface, int on);
+
+/*!
+ @function ifnet_touch_lastchange
+ @discussion Updates the lastchange value to now.
+ @param interface The interface.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_touch_lastchange(ifnet_t interface);
+
+/*!
+ @function ifnet_lastchange
+ @param interface The interface.
+ @param last_change A timeval struct to copy the last time changed in
+ to.
+ */
+errno_t ifnet_lastchange(ifnet_t interface, struct timeval *last_change);
+
+/*!
+ @function ifnet_get_address_list
+ @discussion Get a list of addresses on the interface. Passing NULL
+ for the interface will return a list of all addresses. The
+ addresses will have their reference count bumped so they will
+ not go away. Calling ifnet_free_address_list will decrement the
+ refcount and free the array. If you wish to hold on to a
+ reference to an ifaddr_t, be sure to bump the reference count
+ before calling ifnet_free_address_list.
+ @param interface The interface.
+ @param addresses A pointer to a NULL terminated array of ifaddr_ts.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_get_address_list(ifnet_t interface, ifaddr_t **addresses);
+
+/*!
+ @function ifnet_get_address_list_family
+ @discussion Get a list of addresses on the interface. Passing NULL
+ for the interface will return a list of all addresses. The
+ addresses will have their reference count bumped so they will
+ not go away. Calling ifnet_free_address_list will decrement the
+ refcount and free the array. If you wish to hold on to a
+ reference to an ifaddr_t, be sure to bump the reference count
+ before calling ifnet_free_address_list. Unlike
+ ifnet_get_address_list, this function lets the caller specify
+ the address family to get a list of only a specific address type.
+ @param interface The interface.
+ @param addresses A pointer to a NULL terminated array of ifaddr_ts.
+ @param family The address family; pass zero for addresses of all
+ families.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_get_address_list_family(ifnet_t interface, ifaddr_t **addresses, sa_family_t family);
+
+/*!
+ @function ifnet_free_address_list
+ @discussion Free a list of addresses returned from
+ ifnet_get_address_list. Decrements the refcounts and frees the
+ memory used for the array of references.
+ @param addresses An array of ifaddr_ts.
+ */
+void ifnet_free_address_list(ifaddr_t *addresses);
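A short hedged sketch of walking an interface's IPv4 addresses with this API; struct sockaddr_in and AF_INET come from netinet/in.h, and ifaddr_address is declared later in this header.

static void
my_walk_ipv4_addresses(ifnet_t interface)
{
	ifaddr_t *addrs;
	int i;

	if (ifnet_get_address_list_family(interface, &addrs, AF_INET) != 0)
		return;

	for (i = 0; addrs[i] != NULL; i++) {
		struct sockaddr_in sin;

		if (ifaddr_address(addrs[i], (struct sockaddr *)&sin,
			sizeof(sin)) == 0) {
			/* use sin.sin_addr here */
		}
	}
	/* Releases each reference and frees the array. */
	ifnet_free_address_list(addrs);
}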
+/*!
+ @function ifnet_set_lladdr
+ @discussion Sets the link-layer address for this interface.
+ @param interface The interface the link layer address is being
+ changed on.
+ @param lladdr A pointer to the raw link layer address (pointer to
+ the 6 byte ethernet address for ethernet).
+ @param lladdr_len The length, in bytes, of the link layer address.
+ */
+errno_t ifnet_set_lladdr(ifnet_t interface, const void* lladdr, size_t lladdr_len);
+
+/*!
+ @function ifnet_lladdr_copy_bytes
+ @discussion Copies the bytes of the link-layer address into the
+ specified buffer.
+ @param interface The interface to copy the link-layer address from.
+ @param lladdr The buffer to copy the link-layer address into.
+ @param length The length of the buffer. This value must match the
+ length of the link-layer address.
+ */
+errno_t ifnet_lladdr_copy_bytes(ifnet_t interface, void* lladdr, size_t length);
+
+#ifdef KERNEL_PRIVATE
+/*!
+ @function ifnet_lladdr
+ @discussion Returns a pointer to the link-layer address.
+ @param interface The interface the link-layer address is on.
+ */
+void* ifnet_lladdr(ifnet_t interface);
+#endif /* KERNEL_PRIVATE */
+
+/*!
+ @function ifnet_llbroadcast_copy_bytes
+ @discussion Retrieves the link-layer broadcast address for this
+ interface.
+ @param interface The interface.
+ @param addr A buffer to copy the broadcast address into.
+ @param bufferlen The length of the buffer at addr.
+ @param addr_len On return, the length of the broadcast address.
+ */
+errno_t ifnet_llbroadcast_copy_bytes(ifnet_t interface, void* addr,
+ size_t bufferlen, size_t* addr_len);
+
+#ifdef KERNEL_PRIVATE
+/*!
+ @function ifnet_set_lladdr_and_type
+ @discussion Sets the link-layer address as well as the type field in
+ the sockaddr_dl. Support for setting the type was added for vlan
+ and bond interfaces.
+ @param interface The interface the link layer address is being
+ changed on.
+ @param lladdr A pointer to the raw link layer address (pointer to
+ the 6 byte ethernet address for ethernet).
+ @param length The length, in bytes, of the link layer address.
+ @param type The link-layer address type.
+ */
+errno_t ifnet_set_lladdr_and_type(ifnet_t interface, const void* lladdr, size_t length, u_char type);
+#endif /* KERNEL_PRIVATE */
+
+/*!
+ @function ifnet_add_multicast
+ @discussion Joins a multicast and returns an ifmultiaddr_t with the
+ reference count incremented for you. You are responsible for
+ decrementing the reference count after calling
+ ifnet_remove_multicast and making sure you no longer have any
+ references to the multicast.
+ @param interface The interface.
+ @param maddr The multicast address to join. Either a physical
+ address or logical address to be translated to a physical
+ address.
+ @param multicast The resulting ifmultiaddr_t multicast address.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_add_multicast(ifnet_t interface, const struct sockaddr *maddr,
+ ifmultiaddr_t *multicast);
+
+/*!
+ @function ifnet_remove_multicast
+ @discussion Causes the interface to leave the multicast group. The
+ stack keeps track of how many times ifnet_add_multicast has been
+ called for a given multicast address. The multicast will only be
+ removed when the number of times ifnet_remove_multicast has been
+ called matches the number of times ifnet_add_multicast has been
+ called.
+
+ The memory for the multicast address is not actually freed until
+ the separate reference count has reached zero. Some parts of the
+ stack may keep a pointer to the multicast even after that
+ multicast has been removed from the interface.
+
+ When an interface is detached, all of the multicasts are
+ removed. If the interface of the multicast passed in is no
+ longer attached, this function will gracefully return,
+ performing no work.
+
+ It is the caller's responsibility to release the multicast
+ address after calling this function.
+ @param multicast The multicast to be removed.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_remove_multicast(ifmultiaddr_t multicast);
+
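+/*
+ Example (editor's sketch): joining a multicast group and leaving it
+ again. maddr would be a driver-supplied sockaddr describing the
+ group; ifmaddr_release is declared later in this header.
+
+ ifmultiaddr_t multicast;
+
+ if (ifnet_add_multicast(interface, maddr, &multicast) == 0) {
+     ... receive traffic for the group ...
+     ifnet_remove_multicast(multicast);
+     ifmaddr_release(multicast);
+ }
+ */
+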
+/*!
+ @function ifnet_get_multicast_list
+ @discussion Retrieves a list of the multicast addresses the interface
+ is set to receive. This function allocates and returns an array
+ of references to the various multicast addresses. The multicasts
+ have their reference counts bumped on your behalf. Calling
+ ifnet_free_multicast_list will decrement the reference counts
+ and free the array.
+ @param interface The interface.
+ @param addresses A pointer to a NULL terminated array of references
+ to the multicast addresses.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_get_multicast_list(ifnet_t interface, ifmultiaddr_t **addresses);
+
+/*!
+ @function ifnet_free_multicast_list
+ @discussion Frees a list of multicasts returned by
+ ifnet_get_multicast_list. Decrements the refcount on each
+ multicast address and frees the array.
+ @param multicasts An array of references to the multicast addresses.
+ @result 0 on success otherwise the errno error.
+ */
+void ifnet_free_multicast_list(ifmultiaddr_t *multicasts);
+
+/*!
+ @function ifnet_find_by_name
+ @discussion Find an interface by the name including the unit number.
+ Caller must call ifnet_release on any non-null interface return
+ value.
+ @param ifname The name of the interface, including any unit number
+ (e.g. "en0").
+ @param interface A pointer to an interface reference. This will be
+ filled in if a matching interface is found.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_find_by_name(const char *ifname, ifnet_t *interface);
+
+/*!
+ @function ifnet_list_get
+ @discussion Get a list of attached interfaces. List will be set to
+ point to an array allocated by ifnet_list_get. The interfaces
+ are refcounted and the counts will be incremented before the
+ function returns. The list of interfaces must be freed using
+ ifnet_list_free.
+ @param family The interface family (e.g. IFNET_FAMILY_ETHERNET). To
+ find interfaces of all families, use IFNET_FAMILY_ANY.
+ @param interfaces A pointer to an array of interface references.
+ @param count A pointer that will be filled in with the number of
+ matching interfaces in the array.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t ifnet_list_get(ifnet_family_t family, ifnet_t **interfaces, u_int32_t *count);
+
+/*!
+ @function ifnet_list_free
+ @discussion Free a list of interfaces returned by ifnet_list_get.
+ Decrements the reference count on each interface and frees the
+ array of references. If you keep a reference to an interface, be
+ sure to increment the reference count before calling
+ ifnet_list_free.
+ @param interfaces An array of interface references from ifnet_list_get.
+ */
+void ifnet_list_free(ifnet_t *interfaces);
+
+/********************************************************************************************/
+/* ifaddr_t accessors */
+/********************************************************************************************/
+
+/*!
+ @function ifaddr_reference
+ @discussion Increment the reference count of an address tied to an
+ interface.
+ @param ifaddr The interface address.
+ @result 0 upon success
+ */
+errno_t ifaddr_reference(ifaddr_t ifaddr);
+
+/*!
+ @function ifaddr_release
+ @discussion Decrements the reference count of and possibly frees an
+ address tied to an interface.
+ @param ifaddr The interface address.
+ @result 0 upon success
+ */
+errno_t ifaddr_release(ifaddr_t ifaddr);
+
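+/*
+ Example (editor's sketch): enumerating the attached ethernet
+ interfaces and reading each one's baudrate. All calls come from this
+ header; only the variable names are assumptions.
+
+ ifnet_t *interfaces;
+ u_int32_t count, i;
+
+ if (ifnet_list_get(IFNET_FAMILY_ETHERNET, &interfaces, &count) == 0) {
+     for (i = 0; i < count; i++)
+         printf("baudrate: %llu\n", ifnet_baudrate(interfaces[i]));
+     ifnet_list_free(interfaces);
+ }
+ */
+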
+/*!
+ @function ifaddr_address
+ @discussion Copies the address out of the ifaddr.
+ @param ifaddr The interface address.
+ @param out_addr The sockaddr storage for the address.
+ @param addr_size The size of the storage for the address.
+ @result 0 upon success
+ */
+errno_t ifaddr_address(ifaddr_t ifaddr, struct sockaddr *out_addr, u_int32_t addr_size);
+
+/*!
+ @function ifaddr_address_family
+ @discussion Returns the address family of the address.
+ @param ifaddr The interface address.
+ @result 0 on failure, address family on success.
+ */
+sa_family_t ifaddr_address_family(ifaddr_t ifaddr);
+
+/*!
+ @function ifaddr_dstaddress
+ @discussion Copies the destination address out of the ifaddr.
+ @param ifaddr The interface address.
+ @param out_dstaddr The sockaddr storage for the destination address.
+ @param dstaddr_size The size of the storage for the destination address.
+ @result 0 upon success
+ */
+errno_t ifaddr_dstaddress(ifaddr_t ifaddr, struct sockaddr *out_dstaddr, u_int32_t dstaddr_size);
+
+/*!
+ @function ifaddr_netmask
+ @discussion Copies the netmask out of the ifaddr.
+ @param ifaddr The interface address.
+ @param out_netmask The sockaddr storage for the netmask.
+ @param netmask_size The size of the storage for the netmask.
+ @result 0 upon success
+ */
+errno_t ifaddr_netmask(ifaddr_t ifaddr, struct sockaddr *out_netmask, u_int32_t netmask_size);
+
+/*!
+ @function ifaddr_ifnet
+ @discussion Returns the interface the address is attached to. The
+ reference is only valid until the ifaddr is released. If you
+ need to hold a reference to the ifnet for longer than you hold a
+ reference to the ifaddr, increment the reference using
+ ifnet_reference.
+ @param ifaddr The interface address.
+ @result A reference to the interface the address is attached to.
+ */
+ifnet_t ifaddr_ifnet(ifaddr_t ifaddr);
+
+/*!
+ @function ifaddr_withaddr
+ @discussion Returns an interface address with the address specified.
+ Increments the reference count on the ifaddr before returning to
+ the caller. Caller is responsible for calling ifaddr_release.
+ @param address The address to search for.
+ @result A reference to the interface address.
+ */
+ifaddr_t ifaddr_withaddr(const struct sockaddr* address);
+
+/*!
+ @function ifaddr_withdstaddr
+ @discussion Returns an interface address for the interface address
+ that matches the destination when the netmask is applied.
+ Increments the reference count on the ifaddr before returning to
+ the caller. Caller is responsible for calling ifaddr_release.
+ @param destination The destination to search for.
+ @result A reference to the interface address.
+ */
+ifaddr_t ifaddr_withdstaddr(const struct sockaddr* destination);
+
+/*!
+ @function ifaddr_withnet
+ @discussion Returns an interface address for the interface with the
+ network described by net. Increments the reference count on the
+ ifaddr before returning to the caller. Caller is responsible for
+ calling ifaddr_release.
+ @param net The network to search for.
+ @result A reference to the interface address.
+ */
+ifaddr_t ifaddr_withnet(const struct sockaddr* net);
+
+/*!
+ @function ifaddr_withroute
+ @discussion Returns an interface address given a destination and
+ gateway. Increments the reference count on the ifaddr before
+ returning to the caller. Caller is responsible for calling
+ ifaddr_release.
+ @param flags Routing flags. See net/route.h, RTF_GATEWAY etc.
+ @param destination The destination to search for.
+ @param gateway A gateway to search for.
+ @result A reference to the interface address.
+ */
+ifaddr_t ifaddr_withroute(int flags, const struct sockaddr* destination,
+ const struct sockaddr* gateway);
+
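+/*
+ Example (editor's sketch): copying an ifaddr's address into caller
+ storage; sockaddr_storage is large enough for any address family.
+
+ struct sockaddr_storage ss;
+
+ if (ifaddr_address(ifaddr, (struct sockaddr *)&ss, sizeof(ss)) == 0 &&
+     ifaddr_address_family(ifaddr) == AF_INET) {
+     ... ss holds a sockaddr_in ...
+ }
+ */
+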
+/*!
+ @function ifaddr_findbestforaddr
+ @discussion Finds the best local address assigned to a specific
+ interface to use when communicating with another address.
+ Increments the reference count on the ifaddr before returning to
+ the caller. Caller is responsible for calling ifaddr_release.
+ @param addr The remote address.
+ @param interface The local interface.
+ @result A reference to the interface address.
+ */
+ifaddr_t ifaddr_findbestforaddr(const struct sockaddr *addr, ifnet_t interface);
+
+/********************************************************************************************/
+/* ifmultiaddr_t accessors */
+/********************************************************************************************/
+
+/*!
+ @function ifmaddr_reference
+ @discussion Increment the reference count of an interface multicast
+ address.
+ @param ifmaddr The interface multicast address.
+ @result 0 on success. The only error, EINVAL, indicates ifmaddr is not valid.
+ */
+errno_t ifmaddr_reference(ifmultiaddr_t ifmaddr);
+
+/*!
+ @function ifmaddr_release
+ @discussion Decrement the reference count of an interface multicast
+ address. If the reference count reaches zero, the ifmultiaddr
+ will be removed from the interface and the ifmultiaddr will be
+ freed.
+ @param ifmaddr The interface multicast address.
+ @result 0 on success. The only error, EINVAL, indicates ifmaddr is not valid.
+ */
+errno_t ifmaddr_release(ifmultiaddr_t ifmaddr);
+
+/*!
+ @function ifmaddr_address
+ @discussion Copies the multicast address to out_multicast.
+ @param ifmaddr The interface multicast address.
+ @param out_multicast Storage for a sockaddr.
+ @param addr_size Size of the storage.
+ @result 0 on success.
+ */
+errno_t ifmaddr_address(ifmultiaddr_t ifmaddr, struct sockaddr *out_multicast, u_int32_t addr_size);
+
+/*!
+ @function ifmaddr_lladdress
+ @discussion Copies the link layer multicast address to
+ out_link_layer_multicast.
+ @param ifmaddr The interface multicast address.
+ @param out_link_layer_multicast Storage for a sockaddr.
+ @param addr_size Size of the storage.
+ @result 0 on success.
+ */
+errno_t ifmaddr_lladdress(ifmultiaddr_t ifmaddr, struct sockaddr *out_link_layer_multicast,
+ u_int32_t addr_size);
+
+/*!
+ @function ifmaddr_ifnet
+ @discussion Returns the interface this multicast address is attached
+ to. The interface reference count is not bumped by this
+ function. The interface is only valid as long as you don't
+ release the reference to the multicast address. If you need to
+ maintain your pointer to the ifnet, call ifnet_reference
+ followed by ifnet_release when you're finished.
+ @param ifmaddr The interface multicast address.
+ @result A reference to the interface.
+ */
+ifnet_t ifmaddr_ifnet(ifmultiaddr_t ifmaddr);
+
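+/*
+ Example (editor's sketch): listing the multicast addresses an
+ interface receives and copying each one out. All calls come from
+ this header; the variable names are assumptions.
+
+ ifmultiaddr_t *multicasts;
+ struct sockaddr_storage ss;
+ int i;
+
+ if (ifnet_get_multicast_list(interface, &multicasts) == 0) {
+     for (i = 0; multicasts[i] != NULL; i++) {
+         if (ifmaddr_address(multicasts[i],
+             (struct sockaddr *)&ss, sizeof(ss)) == 0) {
+             ... inspect ss ...
+         }
+     }
+     ifnet_free_multicast_list(multicasts);
+ }
+ */
+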
+__END_DECLS
+
+#endif
diff --git a/bsd/net/kpi_interfacefilter.c b/bsd/net/kpi_interfacefilter.c
new file mode 100644
index 000000000..ee9b28174
--- /dev/null
+++ b/bsd/net/kpi_interfacefilter.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+#include "kpi_interfacefilter.h"
+
+#include <sys/malloc.h>
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/kern_event.h>
+#include <net/dlil.h>
+
+errno_t
+iflt_attach(
+ ifnet_t interface,
+ const struct iff_filter *filter,
+ interface_filter_t *filter_ref)
+{
+ if (interface == NULL) return ENOENT;
+
+ return dlil_attach_filter(interface, filter, filter_ref);
+}
+
+void
+iflt_detach(
+ interface_filter_t filter_ref)
+{
+ dlil_detach_filter(filter_ref);
+}
diff --git a/bsd/net/kpi_interfacefilter.h b/bsd/net/kpi_interfacefilter.h
new file mode 100644
index 000000000..e4140b1a6
--- /dev/null
+++ b/bsd/net/kpi_interfacefilter.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+/*!
+ @header kpi_interfacefilter.h
+ This header defines an API to attach interface filters. Interface
+ filters may be attached to a specific interface. The filters can
+ intercept all packets into and out of the specific interface. In
+ addition, the filters may intercept interface specific events and
+ ioctls.
+ */
+
+#ifndef __KPI_INTERFACEFILTER__
+#define __KPI_INTERFACEFILTER__
+#include <sys/kernel_types.h>
+#include <net/kpi_interface.h>
+
+struct kev_msg;
+
+/*!
+ @typedef iff_input_func
+
+ @discussion iff_input_func is used to filter incoming packets. The
+ interface is only valid for the duration of the filter call. If
+ you need to keep a reference to the interface, be sure to call
+ ifnet_reference and ifnet_release. The packets passed to the
+ inbound filter are different from those passed to the outbound
+ filter. Packets to the inbound filter have the frame header
+ passed in separately from the rest of the packet. The outbound
+ data filter is passed the whole packet including the frame
+ header.
+
+ The frame header usually precedes the data in the mbuf. This
+ ensures that the frame header will be a valid pointer as long as
+ the mbuf is not freed.
+ If you need to change the frame header to point somewhere else,
+ the recommended method is to prepend a new frame header to the
+ mbuf chain (mbuf_prepend), set the header to point to that data,
+ then call mbuf_adj to move the mbuf data pointer back to the
+ start of the packet payload.
+ @param cookie The cookie specified when this filter was attached.
+ @param interface The interface the packet was received on.
+ @param protocol The protocol of this packet. If you specified a
+ protocol when attaching your filter, the protocol will only ever
+ be the protocol you specified.
+ @param data The inbound packet, after the frame header as determined
+ by the interface.
+ @param frame_ptr A pointer to the pointer to the frame header. The
+ frame header length can be found by inspecting the interface's
+ frame header length (ifnet_hdrlen).
+ @result Return:
+ 0 - The caller will continue with normal processing of the packet.
+ EJUSTRETURN - The caller will stop processing the packet, the packet will not be freed.
+ Anything Else - The caller will free the packet and stop processing.
+*/
+typedef errno_t (*iff_input_func)(void* cookie, ifnet_t interface, protocol_family_t protocol,
+ mbuf_t *data, char **frame_ptr);
+
+/*!
+ @typedef iff_output_func
+
+ @discussion iff_output_func is used to filter fully formed outbound
+ packets. The interface is only valid for the duration of the
+ filter call. If you need to keep a reference to the interface,
+ be sure to call ifnet_reference and ifnet_release.
+ @param cookie The cookie specified when this filter was attached.
+ @param interface The interface the packet is being transmitted on.
+ @param protocol The protocol of this packet.
+ @param data The fully formed outbound packet in a chain of mbufs.
+ The frame header is already included. The filter function may
+ modify the packet or return a different mbuf chain.
+ @result Return:
+ 0 - The caller will continue with normal processing of the packet.
+ EJUSTRETURN - The caller will stop processing the packet, the packet will not be freed.
+ Anything Else - The caller will free the packet and stop processing.
+*/
+typedef errno_t (*iff_output_func)(void* cookie, ifnet_t interface, protocol_family_t protocol,
+ mbuf_t *data);
+
+/*!
+ @typedef iff_event_func
+
+ @discussion iff_event_func is used to filter interface specific
+ events. The interface is only valid for the duration of the
+ filter call. If you need to keep a reference to the interface,
+ be sure to call ifnet_reference and ifnet_release.
+ @param cookie The cookie specified when this filter was attached.
+ @param interface The interface the event occurred on.
+ @param event_msg The kernel event, may not be changed.
+*/
+typedef void (*iff_event_func)(void* cookie, ifnet_t interface, protocol_family_t protocol,
+ const struct kev_msg *event_msg);
+
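+/*
+ Example (editor's sketch): a minimal inbound filter that counts
+ packets and lets them continue. The names my_input, my_filter and
+ my_packet_count are assumptions; the iff_filter structure and
+ iflt_attach, used to register it, are declared later in this header.
+
+ static u_long my_packet_count;
+
+ static errno_t
+ my_input(void *cookie, ifnet_t interface, protocol_family_t protocol,
+     mbuf_t *data, char **frame_ptr)
+ {
+     my_packet_count++;
+     return 0;
+ }
+
+ static struct iff_filter my_filter = {
+     NULL, "com.example.myfilter", 0,
+     my_input, NULL, NULL, NULL, NULL
+ };
+
+ Registration: iflt_attach(interface, &my_filter, &my_filter_ref);
+ */
+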
+/*!
+ @typedef iff_ioctl_func
+
+ @discussion iff_ioctl_func is used to filter ioctls sent to an
+ interface. The interface is only valid for the duration of the
+ filter call. If you need to keep a reference to the interface,
+ be sure to call ifnet_reference and ifnet_release.
+ @param cookie The cookie specified when this filter was attached.
+ @param interface The interface the ioctl is being sent to.
+ @param ioctl_cmd The ioctl command.
+ @param ioctl_arg A pointer to the ioctl argument.
+ @result Return:
+ 0 - The caller will continue with normal processing of the ioctl.
+ EJUSTRETURN - The caller will stop processing the ioctl.
+ Anything Else - The caller will stop processing and return the error.
+*/
+typedef errno_t (*iff_ioctl_func)(void* cookie, ifnet_t interface, protocol_family_t protocol,
+ u_long ioctl_cmd, void* ioctl_arg);
+
+/*!
+ @typedef iff_detached_func
+
+ @discussion iff_detached_func is called to notify the filter that it
+ has been detached from an interface. This is the last call to
+ the filter that will be made. A filter may be detached if the
+ interface is detached or the detach filter function is called.
+ In the case that the interface is being detached, your filter's
+ event function will be called with the interface detaching event
+ before your detached function is called.
+ @param cookie The cookie specified when this filter was attached.
+ @param interface The interface this filter was detached from.
+*/
+typedef void (*iff_detached_func)(void* cookie, ifnet_t interface);
+
+/*!
+ @struct iff_filter
+ @discussion This structure is used to define an interface filter for
+ use with the iflt_attach function.
+ @field iff_cookie A kext defined cookie that will be passed to all
+ filter functions.
+ @field iff_name A filter name used for debugging purposes.
+ @field iff_protocol The protocol of the packets this filter is
+ interested in. If you specify zero, packets from all protocols
+ will be passed to the filter.
+ @field iff_input The filter function to handle inbound packets, may
+ be NULL.
+ @field iff_output The filter function to handle outbound packets,
+ may be NULL.
+ @field iff_event The filter function to handle interface events, may
+ be NULL.
+ @field iff_ioctl The filter function to handle interface ioctls, may
+ be NULL.
+ @field iff_detached The filter function used to notify the filter that
+ it has been detached.
+*/
+
+struct iff_filter {
+ void* iff_cookie;
+ const char* iff_name;
+ protocol_family_t iff_protocol;
+ iff_input_func iff_input;
+ iff_output_func iff_output;
+ iff_event_func iff_event;
+ iff_ioctl_func iff_ioctl;
+ iff_detached_func iff_detached;
+};
+
+/*!
+ @function iflt_attach
+ @discussion Attaches an interface filter to an interface.
+ @param interface The interface the filter should be attached to.
+ @param filter A structure defining the filter.
+ @param filter_ref A reference to the filter used to detach.
+ @result 0 on success otherwise the errno error.
+ */
+errno_t iflt_attach(ifnet_t interface, const struct iff_filter* filter,
+ interface_filter_t *filter_ref);
+
+/*!
+ @function iflt_detach
+ @discussion Detaches an interface filter from an interface.
+ @param filter_ref The reference to the filter from iflt_attach.
+ */
+void iflt_detach(interface_filter_t filter_ref);
+
+#endif
diff --git a/bsd/net/kpi_protocol.c b/bsd/net/kpi_protocol.c
new file mode 100644
index 000000000..ad16db5c1
--- /dev/null
+++ b/bsd/net/kpi_protocol.c
@@ -0,0 +1,366 @@
+/*
+ * Copyright (c) 2004 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include "kpi_protocol.h" + +#include <sys/param.h> +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/systm.h> +#include <sys/kpi_mbuf.h> +#include <sys/domain.h> +#include <net/if.h> +#include <net/dlil.h> +#include <libkern/OSAtomic.h> + +void proto_kpi_init(void); +void proto_input_run(void); + +typedef int (*attach_t)(struct ifnet *ifp, u_long protocol_family); +typedef int (*detach_t)(struct ifnet *ifp, u_long protocol_family); + +/****************************************************************************/ +/* WARNING: Big assumption made here - there can be only one input thread */ +struct proto_input_entry { + struct proto_input_entry *next; + int detach; + struct domain *domain; + + protocol_family_t protocol; + proto_input_handler input; + proto_input_detached_handler detached; + + mbuf_t first_packet; + mbuf_t last_packet; +}; + +#define PROTO_HASH_SLOTS 5 + +static struct proto_input_entry *proto_hash[PROTO_HASH_SLOTS]; +static struct proto_input_entry *proto_input_add_list; +static lck_mtx_t *proto_input_lock = 0; +static u_int32_t inject_buckets = 0; + +extern thread_t dlil_input_thread_ptr; +extern int dlil_input_thread_wakeup; + +static int +proto_hash_value( + protocol_family_t protocol) +{ + switch(protocol) { + case PF_INET: + return 0; + case PF_INET6: + return 1; + case PF_APPLETALK: + return 2; + case PF_VLAN: + return 3; + } + return 4; +} + +__private_extern__ void +proto_kpi_init(void) +{ + lck_grp_attr_t *grp_attrib = 0; + lck_attr_t *lck_attrib = 0; + lck_grp_t *lck_group = 0; + + /* Allocate a mtx lock */ + grp_attrib = lck_grp_attr_alloc_init(); + lck_grp_attr_setdefault(grp_attrib); + lck_group = lck_grp_alloc_init("protocol kpi", grp_attrib); + lck_grp_attr_free(grp_attrib); + lck_attrib = lck_attr_alloc_init(); + lck_attr_setdefault(lck_attrib); + proto_input_lock = lck_mtx_alloc_init(lck_group, lck_attrib); + lck_grp_free(lck_group); + lck_attr_free(lck_attrib); +} + +__private_extern__ errno_t +proto_register_input( + protocol_family_t protocol, + proto_input_handler input, + proto_input_detached_handler detached) +{ + + struct proto_input_entry *entry; + + entry = _MALLOC(sizeof(*entry), M_IFADDR, M_WAITOK); + + if (entry == NULL) + return ENOMEM; + + bzero(entry, sizeof(*entry)); + entry->protocol = protocol; + entry->input = input; + entry->detached = detached; + + { + struct domain *dp = domains; + extern lck_mtx_t *domain_proto_mtx; + + lck_mtx_assert(domain_proto_mtx, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(domain_proto_mtx); + while (dp && dp->dom_family != protocol) + dp = dp->dom_next; + entry->domain = dp; + lck_mtx_unlock(domain_proto_mtx); + } + + + do { + entry->next = proto_input_add_list; + } while(!OSCompareAndSwap((UInt32)entry->next, (UInt32)entry, (UInt32*)&proto_input_add_list)); + + wakeup((caddr_t)&dlil_input_thread_wakeup); + + return 0; +} + + +__private_extern__ void +proto_unregister_input( + protocol_family_t protocol) +{ + struct proto_input_entry *entry = NULL; + + for (entry = proto_hash[proto_hash_value(protocol)]; 
entry; entry = entry->next) + if (entry->protocol == protocol) + break; + + if (entry) + entry->detach = 1; +} + + +static void +proto_delayed_attach( + struct proto_input_entry *entry) +{ + struct proto_input_entry *next_entry; + for (next_entry = entry->next; entry; entry = next_entry) { + struct proto_input_entry *exist; + int hash_slot; + + hash_slot = proto_hash_value(entry->protocol); + next_entry = entry->next; + + for (exist = proto_hash[hash_slot]; exist; exist = exist->next) + if (exist->protocol == entry->protocol) + break; + + /* If the entry already exists, call detached and dispose */ + if (exist) { + if (entry->detached) + entry->detached(entry->protocol); + FREE(entry, M_IFADDR); + } + else { + entry->next = proto_hash[hash_slot]; + proto_hash[hash_slot] = entry; + } + } +} + +static void +proto_delayed_inject( + struct proto_input_entry *entry) +{ + mbuf_t packet_list; + mbuf_t packet; + int locked = 0; + + lck_mtx_lock(proto_input_lock); + packet_list = entry->first_packet; + entry->first_packet = entry->last_packet = 0; + lck_mtx_unlock(proto_input_lock); + + if (packet_list == NULL) + return; + + if (entry->domain && (entry->domain->dom_flags & DOM_REENTRANT) == 0) { + lck_mtx_lock(entry->domain->dom_mtx); + locked = 1; + } + + for (packet = packet_list; packet; packet = packet_list) { + packet_list = mbuf_nextpkt(packet); + mbuf_setnextpkt(packet, NULL); + entry->input(entry->protocol, packet); + } + + if (locked) { + lck_mtx_unlock(entry->domain->dom_mtx); + } +} + +/* This function must be called from a single dlil input thread */ +__private_extern__ void +proto_input_run(void) +{ + struct proto_input_entry *entry; + u_int32_t inject; + int i; + + if (current_thread() != dlil_input_thread_ptr) + panic("proto_input_run called from a thread other than dlil_input_thread!\n"); + + do { + entry = proto_input_add_list; + } while (entry && !OSCompareAndSwap((UInt32)entry, 0, (UInt32*)&proto_input_add_list)); + + if (entry) + proto_delayed_attach(entry); + + do { + inject = inject_buckets; + } while (inject && !OSCompareAndSwap(inject, 0, (UInt32*)&inject_buckets)); + + if (inject) { + for (i = 0; i < PROTO_HASH_SLOTS; i++) { + if ((inject & (1L << i)) != 0) { + for (entry = proto_hash[i]; entry; entry = entry->next) { + if (entry->first_packet) { + proto_delayed_inject(entry); + } + } + } + } + } +} + +errno_t +proto_input( + protocol_family_t protocol, + mbuf_t packet_list) +{ + struct proto_input_entry *entry; + + if (current_thread() != dlil_input_thread_ptr) + panic("proto_input called from a thread other than dlil_input_thread!\n"); + + for (entry = proto_hash[proto_hash_value(protocol)]; entry; entry = entry->next) { + if (entry->protocol == protocol) + break; + } + + if (entry) { + mbuf_t packet; +#if DIRECT_PROTO_INPUT + // See <rdar://problem/3687868> for why this is disabled + // We need to release the dlil lock before taking the protocol lock + for (packet = packet_list; packet; packet = packet_list) { + packet_list = mbuf_nextpkt(packet); + mbuf_setnextpkt(packet, NULL); + entry->input(entry->protocol, packet); + } +#else + mbuf_t last_packet; + int hash_slot = proto_hash_value(protocol); + + for (last_packet = packet_list; mbuf_nextpkt(last_packet); + last_packet = mbuf_nextpkt(last_packet)) + /* find the last packet */; + + lck_mtx_lock(proto_input_lock); + if (entry->first_packet == NULL) { + entry->first_packet = packet_list; + } + else { + mbuf_setnextpkt(entry->last_packet, packet_list); + } + entry->last_packet = last_packet; + 
lck_mtx_unlock(proto_input_lock);
+
+ OSBitOrAtomic((1L << hash_slot), (UInt32*)&inject_buckets);
+#endif
+ }
+ else
+ {
+ return ENOENT;
+ }
+
+ return 0;
+}
+
+errno_t
+proto_inject(
+ protocol_family_t protocol,
+ mbuf_t packet_list)
+{
+ struct proto_input_entry *entry;
+ mbuf_t last_packet;
+ int hash_slot = proto_hash_value(protocol);
+
+ for (last_packet = packet_list; mbuf_nextpkt(last_packet);
+ last_packet = mbuf_nextpkt(last_packet))
+ /* find the last packet */;
+
+ for (entry = proto_hash[hash_slot]; entry; entry = entry->next) {
+ if (entry->protocol == protocol)
+ break;
+ }
+
+ if (entry) {
+ lck_mtx_lock(proto_input_lock);
+ if (entry->first_packet == NULL) {
+ entry->first_packet = packet_list;
+ }
+ else {
+ mbuf_setnextpkt(entry->last_packet, packet_list);
+ }
+ entry->last_packet = last_packet;
+ lck_mtx_unlock(proto_input_lock);
+
+ OSBitOrAtomic((1L << hash_slot), (UInt32*)&inject_buckets);
+
+ wakeup((caddr_t)&dlil_input_thread_wakeup);
+ }
+ else
+ {
+ return ENOENT;
+ }
+
+ return 0;
+}
+
+errno_t
+proto_register_plumber(
+ protocol_family_t proto_fam,
+ ifnet_family_t if_fam,
+ proto_plumb_handler plumb,
+ proto_unplumb_handler unplumb)
+{
+ return dlil_reg_proto_module(proto_fam, if_fam, (attach_t)plumb, (detach_t)unplumb);
+}
+
+void
+proto_unregister_plumber(
+ protocol_family_t proto_fam,
+ ifnet_family_t if_fam)
+{
+ (void)dlil_dereg_proto_module(proto_fam, if_fam);
+}
diff --git a/bsd/net/kpi_protocol.h b/bsd/net/kpi_protocol.h
new file mode 100644
index 000000000..5ea4188da
--- /dev/null
+++ b/bsd/net/kpi_protocol.h
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2004 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+/*!
+ @header kpi_protocol.h
+ This header defines an API to interact with protocols in the kernel.
+ The KPIs in this header file can be used to interact with protocols
+ that already exist in the stack. These KPIs can be used to support
+ existing protocols over media types that are not natively supported
+ in the kernel, such as ATM.
+ */
+
+#ifndef __KPI_PROTOCOL__
+#define __KPI_PROTOCOL__
+#include <sys/kernel_types.h>
+#include <net/kpi_interface.h>
+
+
+__BEGIN_DECLS
+
+/****************************************************************************/
+/* Protocol input/inject */
+/****************************************************************************/
+
+#ifdef KERNEL_PRIVATE
+/*!
+ @typedef proto_input_handler
+ @discussion proto_input_handler is called to input a packet. If
+ your protocol has specified a global lock, the lock will be held
+ when this function is called.
+ @param protocol The protocol this packet is intended for.
+ @param packet The packet that should be input.
+ */
+typedef void (*proto_input_handler)(protocol_family_t protocol, mbuf_t packet);
+
+/*!
+ @typedef proto_input_detached_handler
+ @discussion proto_input_detached_handler is called to notify the
+ protocol that it has been detached. When this function is
+ called, the proto_input_handler will not be called again, making
+ it safe to unload.
+ @param protocol The protocol detached.
+ */
+typedef void (*proto_input_detached_handler)(protocol_family_t protocol);
+
+/*!
+ @function proto_register_input
+ @discussion Allows the caller to specify the functions called when a
+ packet for a protocol is received.
+ @param protocol The protocol family these functions will receive
+ packets for.
+ @param input The function called when a packet is input.
+ @param detached The function called when the input handler has been
+ detached and will no longer be called.
+ @result An errno error on failure.
+ */
+errno_t proto_register_input(protocol_family_t protocol, proto_input_handler input,
+ proto_input_detached_handler detached);
+
+/*!
+ @function proto_unregister_input
+ @discussion Allows the caller to unregister the input and inject
+ functions for a protocol. The input/inject functions may not be
+ unregistered immediately if there is a chance they are in use.
+ To notify the owner when the functions are no longer in use, the
+ proto_input_detached_handler function will be called. It is not
+ safe to unload until the proto_input_detached_handler is called.
+ @param protocol The protocol family these functions were registered
+ for.
+ */
+void proto_unregister_input(protocol_family_t protocol);
+#endif /* KERNEL_PRIVATE */
+
+/*!
+ @function proto_input
+ @discussion Inputs a packet on the specified protocol from the input
+ path.
+ @param protocol The protocol of the packet.
+ @param packet The first packet in a chain of packets to be input.
+ @result An errno error on failure. Unless proto_input returns zero,
+ the caller is responsible for freeing the mbuf.
+ */
+errno_t proto_input(protocol_family_t protocol, mbuf_t packet);
+
+/*!
+ @function proto_inject
+ @discussion Injects a packet on the specified protocol from
+ anywhere. To avoid recursion, the protocol may need to queue the
+ packet to be handled later.
+ @param protocol The protocol of the packet.
+ @param packet The first packet in a chain of packets to be injected.
+ @result An errno error on failure. Unless proto_inject returns zero,
+ the caller is responsible for freeing the mbuf.
+ */
+errno_t proto_inject(protocol_family_t protocol, mbuf_t packet);
+
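+/*
+ Example (editor's sketch): delivering a chain of packets to a
+ registered protocol from an arbitrary context. my_chain is an
+ assumption; on failure the chain still belongs to the caller, so it
+ is freed here (mbuf_freem_list is assumed from the mbuf KPI).
+
+ errno_t err = proto_inject(PF_INET, my_chain);
+ if (err != 0)
+     mbuf_freem_list(my_chain);
+ */
+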
+/****************************************************************************/
+/* Protocol plumbing */
+/****************************************************************************/
+
+/*!
+ @typedef proto_plumb_handler
+ @discussion proto_plumb_handler is called to attach a protocol to an
+ interface. A typical protocol plumb function would fill out an
+ ifnet_attach_proto_param and call ifnet_attach_protocol.
+ @param ifp The interface the protocol should be attached to.
+ @param protocol_family The protocol that should be attached to the
+ interface.
+ @result
+ A non-zero value if the attach failed.
+ */
+typedef errno_t (*proto_plumb_handler)(ifnet_t ifp, protocol_family_t protocol);
+
+/*!
+ @typedef proto_unplumb_handler
+ @discussion proto_unplumb_handler is called to detach a protocol
+ from an interface. A typical unplumb function would call
+ ifnet_detach_protocol and perform any necessary cleanup.
+ @param ifp The interface the protocol should be detached from.
+ @param protocol_family The protocol that should be detached from the
+ interface.
+ */
+typedef void (*proto_unplumb_handler)(ifnet_t ifp, protocol_family_t protocol);
+
+/*!
+ @function proto_register_plumber
+ @discussion Allows the caller to specify the functions called when a protocol
+ is attached to an interface belonging to the specified family and when
+ that protocol is detached.
+ @param proto_fam The protocol family these plumbing functions will
+ handle.
+ @param if_fam The interface family these plumbing functions will
+ handle.
+ @param plumb The function to call to attach the protocol to an
+ interface.
+ @param unplumb The function to call to detach the protocol from an
+ interface, may be NULL in which case ifnet_detach_protocol will
+ be used to detach the protocol.
+ @result A non-zero value if the registration failed.
+ */
+errno_t proto_register_plumber(protocol_family_t proto_fam, ifnet_family_t if_fam,
+ proto_plumb_handler plumb, proto_unplumb_handler unplumb);
+
+/*!
+ @function proto_unregister_plumber
+ @discussion Unregisters a previously registered plumbing function.
+ @param proto_fam The protocol family these plumbing functions
+ handle.
+ @param if_fam The interface family these plumbing functions handle.
+ */
+void proto_unregister_plumber(protocol_family_t proto_fam, ifnet_family_t if_fam);
+
+__END_DECLS
+
+#endif
diff --git a/bsd/net/lacp.h b/bsd/net/lacp.h
new file mode 100644
index 000000000..0aad344c0
--- /dev/null
+++ b/bsd/net/lacp.h
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2004 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+/*
+ * lacp.h
+ * - definitions for the Link Aggregation Control Protocol (LACP) and
+ *   the Link Aggregation Marker Protocol
+ */
+
+/*
+ * Modification History
+ *
+ * May 14, 2004 Dieter Siegmund (dieter@apple.com)
+ * - created
+ */
+
+#ifndef _NET_LACP_H_
+#define _NET_LACP_H_
+
+#include <sys/types.h>
+
+/**
+ ** Link Aggregation Control Protocol (LACP) definitions
+ **/
+#define LACPDU_VERSION_1 1
+
+#define LACPDU_TLV_TYPE_TERMINATOR 0x00
+#define LACPDU_TLV_TYPE_ACTOR 0x01
+#define LACPDU_TLV_TYPE_PARTNER 0x02
+#define LACPDU_TLV_TYPE_COLLECTOR 0x03
+
+#define LACPDU_ACTOR_TLV_LENGTH 20
+#define LACPDU_PARTNER_TLV_LENGTH 20
+#define LACPDU_COLLECTOR_TLV_LENGTH 16
+
+typedef u_char lacp_actor_partner_state;
+typedef u_int16_t lacp_key;
+typedef u_int16_t lacp_system_priority, lacp_port_priority, lacp_port;
+typedef u_int16_t lacp_collector_max_delay;
+typedef struct {
+ u_char system_id[6];
+} lacp_system, *lacp_system_ref;
+
+/*
+ * LACP Actor/Partner TLV
+ */
+typedef struct lacp_actor_partner_tlv_s {
+ u_char lap_tlv_type; /* 0x01 or 0x02 */
+ u_char lap_length; /* 20 */
+ u_char lap_system_priority[2];
+ u_char lap_system[6];
+ u_char lap_key[2];
+ u_char lap_port_priority[2];
+ u_char lap_port[2];
+ u_char lap_state;
+ u_char lap_reserved[3];
+} lacp_actor_partner_tlv, *lacp_actor_partner_tlv_ref;
+
+/*
+ * LACP Collector TLV
+ */
+typedef struct lacp_collector_tlv_s {
+ u_char lac_tlv_type; /* 0x03 */
+ u_char lac_length; /* 16 */
+ u_char lac_max_delay[2];
+ u_char lac_reserved[12];
+} lacp_collector_tlv, *lacp_collector_tlv_ref;
+
+
+/*
+ * LACP Actor/Partner State bits
+ */
+#define LACP_ACTOR_PARTNER_STATE_LACP_ACTIVITY 0x01
+#define LACP_ACTOR_PARTNER_STATE_LACP_TIMEOUT 0x02
+#define LACP_ACTOR_PARTNER_STATE_AGGREGATION 0x04
+#define LACP_ACTOR_PARTNER_STATE_SYNCHRONIZATION 0x08
+#define LACP_ACTOR_PARTNER_STATE_COLLECTING 0x10
+#define LACP_ACTOR_PARTNER_STATE_DISTRIBUTING 0x20
+#define LACP_ACTOR_PARTNER_STATE_DEFAULTED 0x40
+#define LACP_ACTOR_PARTNER_STATE_EXPIRED 0x80
+
+static __inline__ lacp_actor_partner_state
+lacp_actor_partner_state_set_active_lacp(lacp_actor_partner_state state)
+{
+ return (state | LACP_ACTOR_PARTNER_STATE_LACP_ACTIVITY);
+}
+
+static __inline__ lacp_actor_partner_state
+lacp_actor_partner_state_set_passive_lacp(lacp_actor_partner_state state)
+{
+ return (state & ~LACP_ACTOR_PARTNER_STATE_LACP_ACTIVITY);
+}
+
+static __inline__ int
+lacp_actor_partner_state_active_lacp(lacp_actor_partner_state state)
+{
+ return ((state & LACP_ACTOR_PARTNER_STATE_LACP_ACTIVITY) != 0);
+}
+
+static __inline__ lacp_actor_partner_state
+lacp_actor_partner_state_set_short_timeout(lacp_actor_partner_state state)
+{
+ return (state | LACP_ACTOR_PARTNER_STATE_LACP_TIMEOUT);
+}
+
+static __inline__ lacp_actor_partner_state
+lacp_actor_partner_state_set_long_timeout(lacp_actor_partner_state state)
+{
+ return (state & ~LACP_ACTOR_PARTNER_STATE_LACP_TIMEOUT);
+}
+
+static __inline__ int
+lacp_actor_partner_state_short_timeout(lacp_actor_partner_state state)
+{
+ return ((state & LACP_ACTOR_PARTNER_STATE_LACP_TIMEOUT) != 0);
+}
+
+static __inline__ lacp_actor_partner_state
+lacp_actor_partner_state_set_aggregatable(lacp_actor_partner_state state)
+{
+ return (state | LACP_ACTOR_PARTNER_STATE_AGGREGATION);
+}
+
+static __inline__ lacp_actor_partner_state
+lacp_actor_partner_state_set_individual(lacp_actor_partner_state state)
+{
+ return (state & ~LACP_ACTOR_PARTNER_STATE_AGGREGATION);
+}
+
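+/*
+ Example (editor's sketch): composing an actor state byte with the
+ helpers above for an active, short-timeout, aggregatable actor.
+
+ lacp_actor_partner_state s = 0;
+
+ s = lacp_actor_partner_state_set_active_lacp(s);
+ s = lacp_actor_partner_state_set_short_timeout(s);
+ s = lacp_actor_partner_state_set_aggregatable(s);
+ */
+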
+static __inline__ int
+lacp_actor_partner_state_aggregatable(lacp_actor_partner_state state)
+{
+ return ((state & LACP_ACTOR_PARTNER_STATE_AGGREGATION) != 0);
+}
+
+static __inline__ lacp_actor_partner_state
+lacp_actor_partner_state_set_in_sync(lacp_actor_partner_state state)
+{
+ return (state | LACP_ACTOR_PARTNER_STATE_SYNCHRONIZATION);
+}
+
+static __inline__ lacp_actor_partner_state
+lacp_actor_partner_state_set_out_of_sync(lacp_actor_partner_state state)
+{
+ return (state & ~LACP_ACTOR_PARTNER_STATE_SYNCHRONIZATION);
+}
+
+static __inline__ int
+lacp_actor_partner_state_in_sync(lacp_actor_partner_state state)
+{
+ return ((state & LACP_ACTOR_PARTNER_STATE_SYNCHRONIZATION) != 0);
+}
+
+static __inline__ lacp_actor_partner_state
+lacp_actor_partner_state_set_collecting(lacp_actor_partner_state state)
+{
+ return (state | LACP_ACTOR_PARTNER_STATE_COLLECTING);
+}
+
+static __inline__ lacp_actor_partner_state
+lacp_actor_partner_state_set_not_collecting(lacp_actor_partner_state state)
+{
+ return (state & ~LACP_ACTOR_PARTNER_STATE_COLLECTING);
+}
+
+static __inline__ int
+lacp_actor_partner_state_collecting(lacp_actor_partner_state state)
+{
+ return ((state & LACP_ACTOR_PARTNER_STATE_COLLECTING) != 0);
+}
+
+static __inline__ lacp_actor_partner_state
+lacp_actor_partner_state_set_distributing(lacp_actor_partner_state state)
+{
+ return (state | LACP_ACTOR_PARTNER_STATE_DISTRIBUTING);
+}
+
+static __inline__ lacp_actor_partner_state
+lacp_actor_partner_state_set_not_distributing(lacp_actor_partner_state state)
+{
+ return (state & ~LACP_ACTOR_PARTNER_STATE_DISTRIBUTING);
+}
+
+static __inline__ int
+lacp_actor_partner_state_distributing(lacp_actor_partner_state state)
+{
+ return ((state & LACP_ACTOR_PARTNER_STATE_DISTRIBUTING) != 0);
+}
+
+static __inline__ lacp_actor_partner_state
+lacp_actor_partner_state_set_defaulted(lacp_actor_partner_state state)
+{
+ return (state | LACP_ACTOR_PARTNER_STATE_DEFAULTED);
+}
+
+static __inline__ lacp_actor_partner_state
+lacp_actor_partner_state_set_not_defaulted(lacp_actor_partner_state state)
+{
+ return (state & ~LACP_ACTOR_PARTNER_STATE_DEFAULTED);
+}
+
+static __inline__ int
+lacp_actor_partner_state_defaulted(lacp_actor_partner_state state)
+{
+ return ((state & LACP_ACTOR_PARTNER_STATE_DEFAULTED) != 0);
+}
+
+static __inline__ lacp_actor_partner_state
+lacp_actor_partner_state_set_expired(lacp_actor_partner_state state)
+{
+ return (state | LACP_ACTOR_PARTNER_STATE_EXPIRED);
+}
+
+static __inline__ lacp_actor_partner_state
+lacp_actor_partner_state_set_not_expired(lacp_actor_partner_state state)
+{
+ return (state & ~LACP_ACTOR_PARTNER_STATE_EXPIRED);
+}
+
+static __inline__ int
+lacp_actor_partner_state_expired(lacp_actor_partner_state state)
+{
+ return ((state & LACP_ACTOR_PARTNER_STATE_EXPIRED) != 0);
+}
+
+
+/*
+ * LACP Actor/Partner TLV access functions
+ */
+static __inline__ void
+lacp_actor_partner_tlv_set_system_priority(lacp_actor_partner_tlv_ref tlv,
+ lacp_system_priority system_priority)
+{
+ *((lacp_system_priority *)tlv->lap_system_priority)
+ = (lacp_system_priority)htons(system_priority);
+ return;
+}
+
+static __inline__ lacp_system_priority
+lacp_actor_partner_tlv_get_system_priority(const lacp_actor_partner_tlv_ref tlv)
+{
+ return ((lacp_system_priority)
+ ntohs(*((u_short *)tlv->lap_system_priority)));
+}
+
+static __inline__ void
+lacp_actor_partner_tlv_set_key(lacp_actor_partner_tlv_ref tlv, lacp_key key)
+{
+ *((lacp_key
*)tlv->lap_key) = (lacp_key)htons(key); + return; +} + +static __inline__ lacp_key +lacp_actor_partner_tlv_get_key(const lacp_actor_partner_tlv_ref tlv) +{ + return ((lacp_key)ntohs(*((u_short *)tlv->lap_key))); +} + +static __inline__ void +lacp_actor_partner_tlv_set_port_priority(lacp_actor_partner_tlv_ref tlv, + lacp_port_priority port_priority) +{ + *((lacp_port_priority *)tlv->lap_port_priority) + = (lacp_port_priority)htons(port_priority); + return; +} + +static __inline__ lacp_port_priority +lacp_actor_partner_tlv_get_port_priority(const lacp_actor_partner_tlv_ref tlv) +{ + return ((lacp_port_priority)ntohs(*((u_short *)tlv->lap_port_priority))); +} + +static __inline__ void +lacp_actor_partner_tlv_set_port(lacp_actor_partner_tlv_ref tlv, lacp_port port) +{ + *((lacp_port *)tlv->lap_port) = (lacp_port)htons(port); + return; +} + +static __inline__ lacp_port +lacp_actor_partner_tlv_get_port(const lacp_actor_partner_tlv_ref tlv) +{ + return ((lacp_port)ntohs(*((u_short *)tlv->lap_port))); +} + +/* + * LACP Collector TLV access functions + */ +static __inline__ void +lacp_collector_tlv_set_max_delay(lacp_collector_tlv_ref tlv, + lacp_collector_max_delay delay) +{ + *((lacp_collector_max_delay *)tlv->lac_max_delay) + = (lacp_collector_max_delay)htons(delay); + return; +} + +static __inline__ lacp_collector_max_delay +lacp_collector_tlv_get_max_delay(const lacp_collector_tlv_ref tlv) +{ + return ((lacp_collector_max_delay)ntohs(*((u_short *)tlv->lac_max_delay))); +} + +typedef struct lacpdu_s { + u_char la_subtype; + u_char la_version; + u_char la_actor_tlv[LACPDU_ACTOR_TLV_LENGTH]; + u_char la_partner_tlv[LACPDU_PARTNER_TLV_LENGTH]; + u_char la_collector_tlv[LACPDU_COLLECTOR_TLV_LENGTH]; + u_char la_terminator_type; + u_char la_terminator_length; + u_char la_reserved[50]; +} lacpdu, *lacpdu_ref; + +/* timer values in seconds */ +#define LACP_FAST_PERIODIC_TIME 1 +#define LACP_SLOW_PERIODIC_TIME 30 +#define LACP_SHORT_TIMEOUT_TIME 3 +#define LACP_LONG_TIMEOUT_TIME 90 +#define LACP_CHURN_DETECTION_TIME 60 +#define LACP_AGGREGATE_WAIT_TIME 2 + +/* packet rate per second */ +#define LACP_PACKET_RATE 3 + +/** + ** Link Aggregation Marker Protocol definitions + **/ +#define LA_MARKER_PDU_VERSION_1 1 +#define LA_MARKER_TLV_TYPE_TERMINATOR 0x00 +#define LA_MARKER_TLV_TYPE_MARKER 0x01 +#define LA_MARKER_TLV_TYPE_MARKER_RESPONSE 0x02 + +#define LA_MARKER_TLV_LENGTH 16 +#define LA_MARKER_RESPONSE_TLV_LENGTH 16 + +typedef u_int32_t la_marker_transaction_id; + +typedef struct la_marker_pdu_s { + u_char lm_subtype; /* 0x02 */ + u_char lm_version; /* 0x01 */ + u_char lm_marker_tlv_type; /* 0x01 or 0x02 */ + u_char lm_marker_tlv_length; /* 16 */ + u_char lm_requestor_port[2]; + u_char lm_requestor_system[6]; + u_char lm_requestor_transaction_id[4]; + u_char lm_pad[2]; + u_char lm_terminator_type; /* 0x00 */ + u_char lm_terminator_length; /* 0 */ + u_char lm_reserved[90]; +} la_marker_pdu, *la_marker_pdu_ref, + la_marker_response_pdu, * la_marker_response_pdu_ref; + +static __inline__ void +la_marker_pdu_set_requestor_port(la_marker_pdu_ref lmpdu, lacp_port port) +{ + *((lacp_port *)lmpdu->lm_requestor_port) = (lacp_port)htons(port); + return; +} + +static __inline__ lacp_port +la_marker_pdu_get_requestor_port(la_marker_pdu_ref lmpdu) +{ + return ((lacp_port)ntohs(*((lacp_port *)lmpdu->lm_requestor_port))); +} + +static __inline__ void +la_marker_pdu_set_requestor_transaction_id(la_marker_pdu_ref lmpdu, + la_marker_transaction_id xid) +{ + *((la_marker_transaction_id 
*)lmpdu->lm_requestor_transaction_id)
+ = (la_marker_transaction_id)htonl(xid);
+ return;
+}
+
+static __inline__ la_marker_transaction_id
+la_marker_pdu_get_requestor_transaction_id(la_marker_pdu_ref lmpdu)
+{
+ la_marker_transaction_id * xid_p;
+
+ xid_p = (la_marker_transaction_id *)lmpdu->lm_requestor_transaction_id;
+ return ((la_marker_transaction_id)ntohl(*xid_p));
+}
+
+static __inline__ void
+la_marker_pdu_set_requestor_system(la_marker_pdu_ref lmpdu, lacp_system sys)
+{
+ *((lacp_system_ref)lmpdu->lm_requestor_system) = sys;
+ return;
+}
+
+static __inline__ lacp_system
+la_marker_pdu_get_requestor_system(la_marker_pdu_ref lmpdu)
+{
+ return (*(lacp_system_ref)(lmpdu->lm_requestor_system));
+}
+
+#endif /* _NET_LACP_H_ */
diff --git a/bsd/net/multicast_list.c b/bsd/net/multicast_list.c
new file mode 100644
index 000000000..6fbc66f25
--- /dev/null
+++ b/bsd/net/multicast_list.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2004 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+/*
+ * multicast_list.c:
+ * - keep track of multicast addresses added to one interface based on the
+ *   actual multicast addresses in another
+ * - used by VLAN and BOND
+ */
+
+/*
+ * Modification History:
+ *
+ * April 29, 2004 Dieter Siegmund (dieter@apple.com)
+ * - created
+ */
+
+#include <net/multicast_list.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <net/if_dl.h>
+
+__private_extern__ void
+multicast_list_init(struct multicast_list * mc_list)
+{
+ SLIST_INIT(mc_list);
+ return;
+}
+
+/*
+ * Function: multicast_list_remove
+ * Purpose:
+ *   Remove the given list of multicast addresses from the interface and from
+ *   the multicast list structure.
+ */
+__private_extern__ int
+multicast_list_remove(struct multicast_list * mc_list)
+{
+ int error;
+ struct multicast_entry * mc;
+ int result = 0;
+
+ while ((mc = SLIST_FIRST(mc_list)) != NULL) {
+ error = ifnet_remove_multicast(mc->mc_ifma);
+ if (error != 0) {
+ result = error;
+ }
+ SLIST_REMOVE_HEAD(mc_list, mc_entries);
+ ifmaddr_release(mc->mc_ifma);
+ FREE(mc, M_DEVBUF);
+ }
+ return (result);
+}
+
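+/*
+ * Example (editor's sketch): how a virtual interface such as a VLAN
+ * might mirror its multicast filter onto its underlying interface with
+ * this module. vlan_ifp and hw_ifp are assumptions.
+ *
+ *	struct multicast_list mc_list;
+ *
+ *	multicast_list_init(&mc_list);
+ *	if (multicast_list_program(&mc_list, vlan_ifp, hw_ifp) != 0)
+ *		printf("programming multicast filter failed\n");
+ *	...
+ *	(void)multicast_list_remove(&mc_list);
+ */
+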
+/*
+ * Function: multicast_list_program
+ * Purpose:
+ *   Program the multicast filter on "target_ifp" using the values from
+ *   "source_ifp", and save the result in "mc_list".
+ *
+ *   We build a new list of multicast addresses while programming the new list.
+ *   If that completes successfully, we remove the old list, and return the
+ *   new list.
+ *
+ *   If it fails, we remove what we've added to the new list, and
+ *   return an error.
+ */
+__private_extern__ int
+multicast_list_program(struct multicast_list * mc_list,
+ struct ifnet * source_ifp,
+ struct ifnet * target_ifp)
+{
+ int alen;
+ int error = 0;
+ int i;
+ struct multicast_entry * mc = NULL;
+ struct multicast_list new_mc_list;
+ struct sockaddr_dl source_sdl;
+ ifmultiaddr_t * source_multicast_list;
+ struct sockaddr_dl target_sdl;
+
+ alen = target_ifp->if_addrlen;
+ bzero((char *)&target_sdl, sizeof(target_sdl));
+ target_sdl.sdl_len = sizeof(target_sdl);
+ target_sdl.sdl_family = AF_LINK;
+ target_sdl.sdl_type = target_ifp->if_type;
+ target_sdl.sdl_alen = alen;
+ target_sdl.sdl_index = target_ifp->if_index;
+
+ /* build a new list */
+ multicast_list_init(&new_mc_list);
+ error = ifnet_get_multicast_list(source_ifp, &source_multicast_list);
+ if (error != 0) {
+ printf("multicast_list_program: "
+ "ifnet_get_multicast_list(%s%d) failed, %d\n",
+ source_ifp->if_name, source_ifp->if_unit, error);
+ return (error);
+ }
+ for (i = 0; source_multicast_list[i] != NULL; i++) {
+ if (ifmaddr_address(source_multicast_list[i],
+ (struct sockaddr *)&source_sdl,
+ sizeof(source_sdl)) != 0
+ || source_sdl.sdl_family != AF_LINK) {
+ continue;
+ }
+ mc = _MALLOC(sizeof(struct multicast_entry), M_DEVBUF, M_WAITOK);
+ bcopy(LLADDR(&source_sdl), LLADDR(&target_sdl), alen);
+ error = ifnet_add_multicast(target_ifp, (struct sockaddr *)&target_sdl,
+ &mc->mc_ifma);
+ if (error != 0) {
+ FREE(mc, M_DEVBUF);
+ break;
+ }
+ SLIST_INSERT_HEAD(&new_mc_list, mc, mc_entries);
+ }
+ if (error != 0) {
+ /* restore previous state */
+ (void)multicast_list_remove(&new_mc_list);
+ } else {
+ /* remove the old entries, and return the new list */
+ (void)multicast_list_remove(mc_list);
+ *mc_list = new_mc_list;
+ }
+ return (error);
+}
diff --git a/bsd/machine/ansi.h b/bsd/net/multicast_list.h
similarity index 52%
rename from bsd/machine/ansi.h
rename to bsd/net/multicast_list.h
index 8f34cdd79..6a6f64b65 100644
--- a/bsd/machine/ansi.h
+++ b/bsd/net/multicast_list.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
@@ -19,28 +19,35 @@
 *
 * @APPLE_LICENSE_HEADER_END@
 */
-/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
+
+#ifndef _NET_MULTICAST_LIST_H
+#define _NET_MULTICAST_LIST_H
+
+#include <sys/queue.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <net/if.h>
+#include <net/kpi_interface.h>
+
 /*
- * The NEXTSTEP Software License Agreement specifies the terms
- * and conditions for redistribution.
- * + * multicast_util.h: + * - keep track of multicast addresses on one device for programming on + * another (VLAN, BOND) */ +struct multicast_entry { + SLIST_ENTRY(multicast_entry) mc_entries; + ifmultiaddr_t mc_ifma; +}; +SLIST_HEAD(multicast_list, multicast_entry); -#ifndef _MACHINE_ANSI_H_ -#define _MACHINE_ANSI_H_ - -#if defined (__ppc__) -#include "ppc/ansi.h" -#elif defined (__i386__) -#include "i386/ansi.h" -#else -#error architecture not supported -#endif +void +multicast_list_init(struct multicast_list * mc_list); -#ifdef KERNEL -#ifndef offsetof -#define offsetof(type, member) ((size_t)(&((type *)0)->member)) -#endif /* offsetof */ -#endif /* KERNEL */ +int +multicast_list_program(struct multicast_list * mc_list, + struct ifnet * source_ifp, + struct ifnet * target_ifp); +int +multicast_list_remove(struct multicast_list * mc_list); -#endif /* _MACHINE_ANSI_H_ */ +#endif /* _NET_MULTICAST_LIST_H */ diff --git a/bsd/net/ndrv.c b/bsd/net/ndrv.c index 9c4595c7a..38429b2f1 100644 --- a/bsd/net/ndrv.c +++ b/bsd/net/ndrv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -60,7 +60,6 @@ #include <kern/queue.h> #include <net/ndrv.h> -#include <net/netisr.h> #include <net/route.h> #include <net/if_llc.h> #include <net/if_dl.h> @@ -75,14 +74,13 @@ #include <machine/spl.h> -int ndrv_do_detach(struct ndrv_cb *); -int ndrv_do_disconnect(struct ndrv_cb *); -struct ndrv_cb *ndrv_find_tag(unsigned int); -void ndrv_read_event(struct socket* inSo, caddr_t ref, int waitf); -int ndrv_setspec(struct ndrv_cb *np, struct sockopt *sopt); -int ndrv_delspec(struct ndrv_cb *); -int ndrv_to_dlil_demux(struct ndrv_demux_desc* ndrv, struct dlil_demux_desc* dlil); -void ndrv_handle_ifp_detach(u_long family, short unit); +static int ndrv_do_detach(struct ndrv_cb *); +static int ndrv_do_disconnect(struct ndrv_cb *); +static struct ndrv_cb *ndrv_find_inbound(struct ifnet *ifp, u_long protocol_family); +static int ndrv_setspec(struct ndrv_cb *np, struct sockopt *sopt); +static int ndrv_delspec(struct ndrv_cb *); +static int ndrv_to_dlil_demux(struct ndrv_demux_desc* ndrv, struct dlil_demux_desc* dlil); +static void ndrv_handle_ifp_detach(u_long family, short unit); static int ndrv_do_add_multicast(struct ndrv_cb *np, struct sockopt *sopt); static int ndrv_do_remove_multicast(struct ndrv_cb *np, struct sockopt *sopt); static struct ndrv_multiaddr* ndrv_have_multicast(struct ndrv_cb *np, struct sockaddr* addr); @@ -90,62 +88,39 @@ static void ndrv_remove_all_multicast(struct ndrv_cb *np); unsigned long ndrv_sendspace = NDRVSNDQ; unsigned long ndrv_recvspace = NDRVRCVQ; -struct ndrv_cb ndrvl; /* Head of controlblock list */ +TAILQ_HEAD(, ndrv_cb) ndrvl = TAILQ_HEAD_INITIALIZER(ndrvl); -struct domain ndrvdomain; -struct protosw ndrvsw; -static struct socket* ndrv_so; +extern struct domain ndrvdomain; +extern struct protosw ndrvsw; +extern lck_mtx_t *domain_proto_mtx; +extern void kprintf(const char *, ...); /* - * Protocol init function for NDRV protocol - * Init the control block list. + * Verify these values match. + * To keep clients from including dlil.h, we define + * these values independently in ndrv.h. They must + * match or a conversion function must be written.
*/ -void -ndrv_init() -{ - int retval; - struct kev_request kev_request; - - ndrvl.nd_next = ndrvl.nd_prev = &ndrvl; - - /* Create a PF_SYSTEM socket so we can listen for events */ - retval = socreate(PF_SYSTEM, &ndrv_so, SOCK_RAW, SYSPROTO_EVENT); - if (retval != 0 || ndrv_so == NULL) - retval = KERN_FAILURE; - - /* Install a callback function for the socket */ - ndrv_so->so_rcv.sb_flags |= SB_NOTIFY|SB_UPCALL; - ndrv_so->so_upcall = ndrv_read_event; - ndrv_so->so_upcallarg = NULL; - - /* Configure the socket to receive the events we're interested in */ - kev_request.vendor_code = KEV_VENDOR_APPLE; - kev_request.kev_class = KEV_NETWORK_CLASS; - kev_request.kev_subclass = KEV_DL_SUBCLASS; - retval = ndrv_so->so_proto->pr_usrreqs->pru_control(ndrv_so, SIOCSKEVFILT, (caddr_t)&kev_request, 0, 0); - if (retval != 0) - { - /* - * We will not get attaching or detaching events in this case. - * We should probably prevent any sockets from binding so we won't - * panic later if the interface goes away. - */ - log(LOG_WARNING, "PF_NDRV: ndrv_init - failed to set event filter (%d)", - retval); - } -} +#if NDRV_DEMUXTYPE_ETHERTYPE != DLIL_DESC_ETYPE2 +#error NDRV_DEMUXTYPE_ETHERTYPE must match DLIL_DESC_ETYPE2 +#endif +#if NDRV_DEMUXTYPE_SAP != DLIL_DESC_SAP +#error NDRV_DEMUXTYPE_SAP must match DLIL_DESC_SAP +#endif +#if NDRV_DEMUXTYPE_SNAP != DLIL_DESC_SNAP +#error NDRV_DEMUXTYPE_SNAP must match DLIL_DESC_SNAP +#endif /* * Protocol output - Called to output a raw network packet directly * to the driver. */ -int -ndrv_output(register struct mbuf *m, register struct socket *so) +static int +ndrv_output(struct mbuf *m, struct socket *so) { - register struct ndrv_cb *np = sotondrvcb(so); - register struct ifnet *ifp = np->nd_if; - extern void kprintf(const char *, ...); + struct ndrv_cb *np = sotondrvcb(so); + struct ifnet *ifp = np->nd_if; int result = 0; #if NDRV_DEBUG @@ -158,32 +133,33 @@ ndrv_output(register struct mbuf *m, register struct socket *so) if ((m->m_flags&M_PKTHDR) == 0) return(EINVAL); + /* Unlock before calling dlil_output */ + socket_unlock(so, 0); + /* * Call DLIL if we can. DLIL is much safer than calling the * ifp directly. 
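Note the socket_unlock()/socket_lock() bracket added just above: the socket lock is dropped across the call down into DLIL. A compilable pthread sketch of that drop-the-lock-around-a-downcall discipline, with sender and lower_layer_send as hypothetical stand-ins for the socket and dlil_output():

#include <pthread.h>

struct sender {
    pthread_mutex_t lock;
    int in_flight;
};

static int
lower_layer_send(int frame)
{
    return (frame == 0);              /* pretend 0 is an invalid frame */
}

static int
sender_output(struct sender *s, int frame)
{
    int error;

    pthread_mutex_lock(&s->lock);
    s->in_flight++;                   /* mutate state only under the lock */
    pthread_mutex_unlock(&s->lock);   /* never hold it across the downcall */

    error = lower_layer_send(frame);  /* may block or take its own locks */

    pthread_mutex_lock(&s->lock);
    s->in_flight--;                   /* revalidate/repair state afterward */
    pthread_mutex_unlock(&s->lock);
    return (error);
}

Holding a socket lock across a driver call invites lock-order reversals with whatever the driver locks internally; releasing first means the state must be re-checked after relocking.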
*/ - if (np->nd_tag != 0) - result = dlil_output(np->nd_tag, m, (caddr_t)NULL, - (struct sockaddr*)NULL, 1); - else if (np->nd_send_tag != 0) - result = dlil_output(np->nd_send_tag, m, (caddr_t)NULL, - (struct sockaddr*)NULL, 1); - else - result = ENXIO; + result = dlil_output(ifp, np->nd_proto_family, m, (caddr_t)NULL, + (struct sockaddr*)NULL, 1); + + socket_lock(so, 0); + return (result); } /* Our input routine called from DLIL */ -int +static int ndrv_input(struct mbuf *m, char *frame_header, struct ifnet *ifp, - u_long dl_tag, - int sync_ok) + u_long proto_family, + __unused int sync_ok) { struct socket *so; struct sockaddr_dl ndrvsrc = {sizeof (struct sockaddr_dl), AF_NDRV}; - register struct ndrv_cb *np; + struct ndrv_cb *np; + int error = 0; /* move packet from if queue to socket */ @@ -194,43 +170,36 @@ ndrv_input(struct mbuf *m, ndrvsrc.sdl_slen = 0; bcopy(frame_header, &ndrvsrc.sdl_data, 6); - np = ndrv_find_tag(dl_tag); + np = ndrv_find_inbound(ifp, proto_family); if (np == NULL) { return(ENOENT); } so = np->nd_socket; /* prepend the frame header */ - m = m_prepend(m, ifp->if_data.ifi_hdrlen, M_NOWAIT); + m = m_prepend(m, ifp->if_hdrlen, M_NOWAIT); if (m == NULL) return EJUSTRETURN; - bcopy(frame_header, m->m_data, ifp->if_data.ifi_hdrlen); + bcopy(frame_header, m->m_data, ifp->if_hdrlen); + + lck_mtx_assert(so->so_proto->pr_domain->dom_mtx, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(so->so_proto->pr_domain->dom_mtx); if (sbappendaddr(&(so->so_rcv), (struct sockaddr *)&ndrvsrc, - m, (struct mbuf *)0) == 0) - { - /* yes, sbappendaddr returns zero if the sockbuff is full... */ - /* caller will free m */ - return(ENOMEM); - } else + m, (struct mbuf *)0, &error) != 0) { sorwakeup(so); - return(0); -} - -int -ndrv_control(struct socket *so, u_long cmd, caddr_t data, - struct ifnet *ifp, struct proc *p) -{ - return (0); + } + lck_mtx_unlock(so->so_proto->pr_domain->dom_mtx); + return 0; /* radar 4030377 - always return 0 */ } /* * Allocate an ndrv control block and some buffer space for the socket */ -int -ndrv_attach(struct socket *so, int proto, struct proc *p) +static int +ndrv_attach(struct socket *so, int proto, __unused struct proc *p) { int error; - register struct ndrv_cb *np = sotondrvcb(so); + struct ndrv_cb *np = sotondrvcb(so); if ((so->so_state & SS_PRIV) == 0) return(EPERM); @@ -256,10 +225,10 @@ ndrv_attach(struct socket *so, int proto, struct proc *p) np->nd_proto.sp_family = so->so_proto->pr_domain->dom_family; np->nd_proto.sp_protocol = proto; np->nd_if = NULL; - np->nd_tag = 0; + np->nd_proto_family = 0; np->nd_family = 0; np->nd_unit = 0; - insque((queue_t)np, (queue_t)&ndrvl); + TAILQ_INSERT_TAIL(&ndrvl, np, nd_next); return(0); } @@ -268,10 +237,10 @@ ndrv_attach(struct socket *so, int proto, struct proc *p) * Flush data or not depending on the options. */ -int +static int ndrv_detach(struct socket *so) { - register struct ndrv_cb *np = sotondrvcb(so); + struct ndrv_cb *np = sotondrvcb(so); if (np == 0) return EINVAL; @@ -288,9 +257,10 @@ ndrv_detach(struct socket *so) * Don't expect this to be used. 
*/ -int ndrv_connect(struct socket *so, struct sockaddr *nam, struct proc *p) +static int +ndrv_connect(struct socket *so, struct sockaddr *nam, __unused struct proc *p) { - register struct ndrv_cb *np = sotondrvcb(so); + struct ndrv_cb *np = sotondrvcb(so); int result = 0; if (np == 0) @@ -312,22 +282,34 @@ int ndrv_connect(struct socket *so, struct sockaddr *nam, struct proc *p) return 0; } +static void +ndrv_event(struct ifnet *ifp, struct kev_msg *event) +{ + if (event->vendor_code == KEV_VENDOR_APPLE && + event->kev_class == KEV_NETWORK_CLASS && + event->kev_subclass == KEV_DL_SUBCLASS && + event->event_code == KEV_DL_IF_DETACHING) { + ndrv_handle_ifp_detach(ifp->if_family, ifp->if_unit); + } +} + +static int name_cmp(struct ifnet *, char *); + /* * This is the "driver open" hook - we 'bind' to the * named driver. * Here's where we latch onto the driver. */ -int -ndrv_bind(struct socket *so, struct sockaddr *nam, struct proc *p) +static int +ndrv_bind(struct socket *so, struct sockaddr *nam, __unused struct proc *p) { - register struct sockaddr_ndrv *sa = (struct sockaddr_ndrv *) nam; - register char *dname; - register struct ndrv_cb *np; - register struct ifnet *ifp; - extern int name_cmp(struct ifnet *, char *); + struct sockaddr_ndrv *sa = (struct sockaddr_ndrv *) nam; + char *dname; + struct ndrv_cb *np; + struct ifnet *ifp; int result; - if TAILQ_EMPTY(&ifnet) + if TAILQ_EMPTY(&ifnet_head) return(EADDRNOTAVAIL); /* Quick sanity check */ np = sotondrvcb(so); if (np == 0) @@ -351,46 +333,40 @@ ndrv_bind(struct socket *so, struct sockaddr *nam, struct proc *p) * There's no internal call for this so we have to dup the code * in if.c/ifconf() */ - TAILQ_FOREACH(ifp, &ifnet, if_link) { + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { if (name_cmp(ifp, dname) == 0) break; } + ifnet_head_done(); if (ifp == NULL) return(EADDRNOTAVAIL); - - /* - * Loopback demuxing doesn't work with PF_NDRV. - * The first 4 bytes of the packet must be the - * protocol ptr. Can't get that from userland. - */ - if (ifp->if_family == APPLE_IF_FAM_LOOPBACK) - return (ENOTSUP); - - if ((dlil_find_dltag(ifp->if_family, ifp->if_unit, - PF_NDRV, &np->nd_send_tag) != 0) && - (ifp->if_family != APPLE_IF_FAM_PPP)) { - /* NDRV isn't registered on this interface, lets change that */ - struct dlil_proto_reg_str ndrv_proto; - int result = 0; - bzero(&ndrv_proto, sizeof(ndrv_proto)); - TAILQ_INIT(&ndrv_proto.demux_desc_head); - - ndrv_proto.interface_family = ifp->if_family; - ndrv_proto.protocol_family = PF_NDRV; - ndrv_proto.unit_number = ifp->if_unit; - - result = dlil_attach_protocol(&ndrv_proto, &np->nd_send_tag); - - /* - * If the interface does not allow PF_NDRV to attach, we will - * respect it's wishes. Sending will be disabled. No error is - * returned because the client may later attach a real protocol - * that the interface may accept. - */ - if (result != 0) - np->nd_send_tag = 0; - } + + // PPP doesn't support PF_NDRV. 
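The bind path above now walks the interface list inside an ifnet_head_lock_shared()/ifnet_head_done() pair instead of touching the list bare. A userland analogue of that iterate-under-a-reader-lock pattern, with iface and iface_find as hypothetical names:

#include <sys/queue.h>
#include <pthread.h>
#include <string.h>
#include <stddef.h>

struct iface {
    TAILQ_ENTRY(iface) link;
    char name[16];
};

static TAILQ_HEAD(, iface) iface_head = TAILQ_HEAD_INITIALIZER(iface_head);
static pthread_rwlock_t iface_lock = PTHREAD_RWLOCK_INITIALIZER;

static struct iface *
iface_find(const char *name)
{
    struct iface *ifp;

    pthread_rwlock_rdlock(&iface_lock);     /* shared: many readers at once */
    TAILQ_FOREACH(ifp, &iface_head, link) {
        if (strcmp(ifp->name, name) == 0)
            break;
    }
    pthread_rwlock_unlock(&iface_lock);     /* cf. ifnet_head_done() */
    return (ifp);   /* as in the kernel code, the caller still needs its own
                       scheme for keeping the entry alive after the unlock */
}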
+ if (ifp->if_family != APPLE_IF_FAM_PPP) + { + /* NDRV on this interface */ + struct dlil_proto_reg_str ndrv_proto; + result = 0; + bzero(&ndrv_proto, sizeof(ndrv_proto)); + TAILQ_INIT(&ndrv_proto.demux_desc_head); + + ndrv_proto.interface_family = ifp->if_family; + ndrv_proto.protocol_family = PF_NDRV; + ndrv_proto.unit_number = ifp->if_unit; + ndrv_proto.event = ndrv_event; + + /* We aren't worried about double attaching, that should just return an error */ + result = dlil_attach_protocol(&ndrv_proto); + if (result && result != EEXIST) { + return result; + } + np->nd_proto_family = PF_NDRV; + } + else { + np->nd_proto_family = 0; + } np->nd_if = ifp; np->nd_family = ifp->if_family; @@ -399,10 +375,10 @@ ndrv_bind(struct socket *so, struct sockaddr *nam, struct proc *p) return(0); } -int +static int ndrv_disconnect(struct socket *so) { - register struct ndrv_cb *np = sotondrvcb(so); + struct ndrv_cb *np = sotondrvcb(so); if (np == 0) return EINVAL; @@ -414,41 +390,13 @@ ndrv_disconnect(struct socket *so) return 0; } -/* - * Accessor function - */ -struct ifnet* -ndrv_get_ifp(caddr_t ndrv_pcb) -{ - struct ndrv_cb* np = (struct ndrv_cb*)ndrv_pcb; - -#if DEBUG - { - struct ndrv_cb* temp = ndrvl.nd_next; - /* Verify existence of pcb */ - for (temp = ndrvl.nd_next; temp != &ndrvl; temp = temp->nd_next) - { - if (temp == np) - break; - } - - if (temp != np) - { - log(LOG_WARNING, "PF_NDRV: ndrv_get_ifp called with invalid ndrv_cb!"); - return NULL; - } - } -#endif - - return np->nd_if; -} - /* * Mark the connection as being incapable of further input. */ -int +static int ndrv_shutdown(struct socket *so) { + lck_mtx_assert(so->so_proto->pr_domain->dom_mtx, LCK_MTX_ASSERT_OWNED); socantsendmore(so); return 0; } @@ -458,10 +406,10 @@ ndrv_shutdown(struct socket *so) * to the appropriate driver. The really tricky part * is the destination address... */ -int -ndrv_send(struct socket *so, int flags, struct mbuf *m, - struct sockaddr *addr, struct mbuf *control, - struct proc *p) +static int +ndrv_send(struct socket *so, __unused int flags, struct mbuf *m, + __unused struct sockaddr *addr, struct mbuf *control, + __unused struct proc *p) { int error; @@ -474,10 +422,10 @@ ndrv_send(struct socket *so, int flags, struct mbuf *m, } -int +static int ndrv_abort(struct socket *so) { - register struct ndrv_cb *np = sotondrvcb(so); + struct ndrv_cb *np = sotondrvcb(so); if (np == 0) return EINVAL; @@ -486,19 +434,10 @@ ndrv_abort(struct socket *so) return 0; } -int -ndrv_sense(struct socket *so, struct stat *sb) -{ - /* - * stat: don't bother with a blocksize. 
- */ - return (0); -} - -int +static int ndrv_sockaddr(struct socket *so, struct sockaddr **nam) { - register struct ndrv_cb *np = sotondrvcb(so); + struct ndrv_cb *np = sotondrvcb(so); int len; if (np == 0) @@ -508,16 +447,19 @@ ndrv_sockaddr(struct socket *so, struct sockaddr **nam) return EINVAL; len = np->nd_laddr->snd_len; + MALLOC(*nam, struct sockaddr *, len, M_SONAME, M_WAITOK); + if (*nam == NULL) + return ENOMEM; bcopy((caddr_t)np->nd_laddr, *nam, (unsigned)len); return 0; } -int +static int ndrv_peeraddr(struct socket *so, struct sockaddr **nam) { - register struct ndrv_cb *np = sotondrvcb(so); + struct ndrv_cb *np = sotondrvcb(so); int len; if (np == 0) @@ -527,25 +469,21 @@ ndrv_peeraddr(struct socket *so, struct sockaddr **nam) return ENOTCONN; len = np->nd_faddr->snd_len; + MALLOC(*nam, struct sockaddr *, len, M_SONAME, M_WAITOK); + if (*nam == NULL) + return ENOMEM; bcopy((caddr_t)np->nd_faddr, *nam, (unsigned)len); return 0; } -/* Control input */ - -void -ndrv_ctlinput(int dummy1, struct sockaddr *dummy2, void *dummy3) -{ -} - /* Control output */ -int +static int ndrv_ctloutput(struct socket *so, struct sockopt *sopt) { - register struct ndrv_cb *np = sotondrvcb(so); + struct ndrv_cb *np = sotondrvcb(so); int error = 0; switch(sopt->sopt_name) @@ -580,25 +518,11 @@ ndrv_ctloutput(struct socket *so, struct sockopt *sopt) return(error); } -/* Drain the queues */ -void -ndrv_drain() -{ -} - -/* Sysctl hook for NDRV */ -int -ndrv_sysctl() -{ - return(0); -} - -int -ndrv_do_detach(register struct ndrv_cb *np) +static int +ndrv_do_detach(struct ndrv_cb *np) { struct ndrv_cb* cur_np = NULL; struct socket *so = np->nd_socket; - struct ndrv_multicast* next; int error = 0; #if NDRV_DEBUG @@ -606,47 +530,38 @@ ndrv_do_detach(register struct ndrv_cb *np) #endif ndrv_remove_all_multicast(np); - if (np->nd_tag != 0) - { - error = dlil_detach_protocol(np->nd_tag); - if (error) - { - log(LOG_WARNING, "NDRV ndrv_do_detach: error %d removing dl_tag %d", - error, np->nd_tag); - return error; - } - } - - /* Remove from the linked list of control blocks */ - remque((queue_t)np); - - if (np->nd_send_tag != 0) - { - /* Check if this is the last socket attached to this interface */ - for (cur_np = ndrvl.nd_next; cur_np != &ndrvl; cur_np = cur_np->nd_next) - { - if (cur_np->nd_family == np->nd_family && - cur_np->nd_unit == np->nd_unit) - { - break; - } - } - - /* If there are no other interfaces, detach PF_NDRV from the interface */ - if (cur_np == &ndrvl) - { - dlil_detach_protocol(np->nd_send_tag); - } - } + if (np->nd_if) { + if (np->nd_proto_family != PF_NDRV && + np->nd_proto_family != 0) { + dlil_detach_protocol(np->nd_if, np->nd_proto_family); + } + + /* Remove from the linked list of control blocks */ + TAILQ_REMOVE(&ndrvl, np, nd_next); + + /* Check if this is the last socket attached to this interface */ + TAILQ_FOREACH(cur_np, &ndrvl, nd_next) { + if (cur_np->nd_family == np->nd_family && + cur_np->nd_unit == np->nd_unit) { + break; + } + } + + /* If there are no other interfaces, detach PF_NDRV from the interface */ + if (cur_np == NULL) { + dlil_detach_protocol(np->nd_if, PF_NDRV); + } + } FREE((caddr_t)np, M_PCB); so->so_pcb = 0; + so->so_flags |= SOF_PCBCLEARING; sofree(so); return error; } -int -ndrv_do_disconnect(register struct ndrv_cb *np) +static int +ndrv_do_disconnect(struct ndrv_cb *np) { #if NDRV_DEBUG kprintf("NDRV disconnect: %x\n", np); @@ -662,15 +577,30 @@ ndrv_do_disconnect(register struct ndrv_cb *np) return(0); } +/* Hackery - return a string version of a decimal 
number */ +static char * +sprint_d(u_int n, char *buf, int buflen) +{ char dbuf[IFNAMSIZ]; + char *cp = dbuf+IFNAMSIZ-1; + + *cp = 0; + do { buflen--; + cp--; + *cp = "0123456789"[n % 10]; + n /= 10; + } while (n != 0 && buflen > 0); + strncpy(buf, cp, IFNAMSIZ-buflen); + return (cp); +} + /* * Try to compare a device name (q) with one of the funky ifnet * device names (ifp). */ -int name_cmp(register struct ifnet *ifp, register char *q) -{ register char *r; - register int len; +static int name_cmp(struct ifnet *ifp, char *q) +{ char *r; + int len; char buf[IFNAMSIZ]; - static char *sprint_d(); r = buf; len = strlen(ifp->if_name); @@ -683,32 +613,15 @@ int name_cmp(register struct ifnet *ifp, register char *q) return(strncmp(buf, q, IFNAMSIZ)); } -/* Hackery - return a string version of a decimal number */ -static char * -sprint_d(n, buf, buflen) - u_int n; - char *buf; - int buflen; -{ char dbuf[IFNAMSIZ]; - register char *cp = dbuf+IFNAMSIZ-1; - - *cp = 0; - do { buflen--; - cp--; - *cp = "0123456789"[n % 10]; - n /= 10; - } while (n != 0 && buflen > 0); - strncpy(buf, cp, IFNAMSIZ-buflen); - return (cp); -} - +#if 0 +//### Not used /* * When closing, dump any enqueued mbufs. */ void -ndrv_flushq(register struct ifqueue *q) +ndrv_flushq(struct ifqueue *q) { - register struct mbuf *m; + struct mbuf *m; for (;;) { IF_DEQUEUE(q, m); @@ -719,6 +632,7 @@ ndrv_flushq(register struct ifqueue *q) m_freem(m); } } +#endif int ndrv_setspec(struct ndrv_cb *np, struct sockopt *sopt) @@ -730,7 +644,7 @@ ndrv_setspec(struct ndrv_cb *np, struct sockopt *sopt) int error = 0; /* Sanity checking */ - if (np->nd_tag) + if (np->nd_proto_family != PF_NDRV) return EBUSY; if (np->nd_if == NULL) return EINVAL; @@ -764,7 +678,7 @@ ndrv_setspec(struct ndrv_cb *np, struct sockopt *sopt) if (error == 0) { /* Copy the ndrv demux array from userland */ - error = copyin(ndrvSpec.demux_list, ndrvDemux, + error = copyin(CAST_USER_ADDR_T(ndrvSpec.demux_list), ndrvDemux, ndrvSpec.demux_count * sizeof(struct ndrv_demux_desc)); ndrvSpec.demux_list = ndrvDemux; } @@ -779,6 +693,7 @@ ndrv_setspec(struct ndrv_cb *np, struct sockopt *sopt) dlilSpec.interface_family = np->nd_family; dlilSpec.unit_number = np->nd_unit; dlilSpec.input = ndrv_input; + dlilSpec.event = ndrv_event; dlilSpec.protocol_family = ndrvSpec.protocol_family; for (demuxOn = 0; demuxOn < ndrvSpec.demux_count; demuxOn++) @@ -796,7 +711,9 @@ ndrv_setspec(struct ndrv_cb *np, struct sockopt *sopt) if (error == 0) { /* We've got all our ducks lined up...lets attach! 
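sprint_d() above builds the digits right to left in a scratch buffer and copies them out. Two quirks are worth calling out: the returned pointer aims into a buffer local to the function (harmless here because name_cmp() uses only buf), and the strncpy() length excludes the terminating NUL. A standalone copy you can compile, with the scratch buffer made static so the returned pointer is also usable:

#include <stdio.h>
#include <string.h>

#define IFNAMSIZ 16

static char *
sprint_d(unsigned int n, char *buf, int buflen)
{
    static char dbuf[IFNAMSIZ];        /* static: the return value survives */
    char *cp = dbuf + IFNAMSIZ - 1;

    *cp = '\0';
    do {
        buflen--;
        cp--;
        *cp = "0123456789"[n % 10];    /* least-significant digit first */
        n /= 10;
    } while (n != 0 && buflen > 0);
    strncpy(buf, cp, IFNAMSIZ - buflen);   /* digits only, no NUL */
    return (cp);
}

int
main(void)
{
    char buf[IFNAMSIZ];

    printf("%s\n", sprint_d(792, buf, sizeof(buf)));   /* prints 792 */
    return (0);
}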
*/ - error = dlil_attach_protocol(&dlilSpec, &np->nd_tag); + error = dlil_attach_protocol(&dlilSpec); + if (error == 0) + np->nd_proto_family = dlilSpec.protocol_family; } /* Free any memory we've allocated */ @@ -837,32 +754,27 @@ ndrv_delspec(struct ndrv_cb *np) { int result = 0; - if (np->nd_tag == 0) + if (np->nd_proto_family == PF_NDRV || + np->nd_proto_family == 0) return EINVAL; /* Detach the protocol */ - result = dlil_detach_protocol(np->nd_tag); - if (result == 0) - { - np->nd_tag = 0; - } + result = dlil_detach_protocol(np->nd_if, np->nd_proto_family); + np->nd_proto_family = PF_NDRV; return result; } struct ndrv_cb * -ndrv_find_tag(unsigned int tag) +ndrv_find_inbound(struct ifnet *ifp, u_long protocol) { struct ndrv_cb* np; - int i; - - if (tag == 0) - return NULL; + + if (protocol == PF_NDRV) return NULL; - for (np = ndrvl.nd_next; np != NULL; np = np->nd_next) - { - if (np->nd_tag == tag) - { + TAILQ_FOREACH(np, &ndrvl, nd_next) { + if (np->nd_proto_family == protocol && + np->nd_if == ifp) { return np; } } @@ -870,7 +782,7 @@ ndrv_find_tag(unsigned int tag) return NULL; } -void ndrv_dominit() +static void ndrv_dominit(void) { static int ndrv_dominited = 0; @@ -879,55 +791,22 @@ void ndrv_dominit() ndrv_dominited = 1; } -void -ndrv_read_event(struct socket* so, caddr_t ref, int waitf) -{ - // Read an event - struct mbuf *m = NULL; - struct kern_event_msg *msg; - struct uio auio = {0}; - int result = 0; - int flags = 0; - - // Get the data - auio.uio_resid = 1000000; // large number to get all of the data - flags = MSG_DONTWAIT; - result = soreceive(so, (struct sockaddr**)NULL, &auio, &m, - (struct mbuf**)NULL, &flags); - if (result != 0 || m == NULL) - return; - - // cast the mbuf to a kern_event_msg - // this is dangerous, doesn't handle linked mbufs - msg = mtod(m, struct kern_event_msg*); - - // check for detaches, assume even filtering is working - if (msg->event_code == KEV_DL_IF_DETACHING || - msg->event_code == KEV_DL_IF_DETACHED) - { - struct net_event_data *ev_data; - ev_data = (struct net_event_data*)msg->event_data; - ndrv_handle_ifp_detach(ev_data->if_family, ev_data->if_unit); - } - - m_free(m); -} - -void +static void ndrv_handle_ifp_detach(u_long family, short unit) { struct ndrv_cb* np; - u_long dl_tag; + struct ifnet *ifp = NULL; + struct socket *so; /* Find all sockets using this interface. */ - for (np = ndrvl.nd_next; np != &ndrvl; np = np->nd_next) - { + TAILQ_FOREACH(np, &ndrvl, nd_next) { if (np->nd_family == family && np->nd_unit == unit) { /* This cb is using the detaching interface, but not for long. */ /* Let the protocol go */ - if (np->nd_tag != 0) + ifp = np->nd_if; + if (np->nd_proto_family != 0) ndrv_delspec(np); /* Delete the multicasts first */ @@ -937,18 +816,19 @@ ndrv_handle_ifp_detach(u_long family, short unit) np->nd_if = NULL; np->nd_unit = 0; np->nd_family = 0; - np->nd_send_tag = 0; - + + so = np->nd_socket; /* Make sure sending returns an error */ /* Is this safe? Will we drop the funnel? 
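ndrv_handle_ifp_detach(), which begins here, sweeps every control block bound to the dying interface, severs each one, and only then drops the shared PF_NDRV registration a single time. The shape of that sweep, reduced to compilable form with hypothetical names:

#include <sys/queue.h>

struct binding {
    TAILQ_ENTRY(binding) link;
    int family, unit;           /* which interface the binding uses */
    int bound;                  /* nonzero while usable */
};

static TAILQ_HEAD(, binding) bindings = TAILQ_HEAD_INITIALIZER(bindings);

static void
handle_detach(int family, int unit)
{
    struct binding *b;
    int was_used = 0;

    TAILQ_FOREACH(b, &bindings, link) {
        if (b->family == family && b->unit == unit) {
            b->bound = 0;       /* sever; the entry stays on the list */
            was_used = 1;       /* remember the interface had users */
        }
    }
    if (was_used) {
        /* drop the shared per-interface registration exactly once */
    }
}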
*/ - socantsendmore(np->nd_socket); - socantrcvmore(np->nd_socket); + lck_mtx_assert(so->so_proto->pr_domain->dom_mtx, LCK_MTX_ASSERT_OWNED); + socantsendmore(so); + socantrcvmore(so); } } /* Unregister our protocol */ - if (dlil_find_dltag(family, unit, PF_NDRV, &dl_tag) == 0) { - dlil_detach_protocol(dl_tag); + if (ifp) { + dlil_detach_protocol(ifp, PF_NDRV); } } @@ -983,7 +863,7 @@ ndrv_do_add_multicast(struct ndrv_cb *np, struct sockopt *sopt) if (result == 0) { // Try adding the multicast - result = if_addmulti(np->nd_if, &ndrv_multi->addr, NULL); + result = if_addmulti(np->nd_if, &ndrv_multi->addr, &ndrv_multi->ifma); } if (result == 0) @@ -1039,7 +919,7 @@ ndrv_do_remove_multicast(struct ndrv_cb *np, struct sockopt *sopt) if (result == 0) { // Try deleting the multicast - result = if_delmulti(np->nd_if, &ndrv_entry->addr); + result = if_delmultiaddr(ndrv_entry->ifma, 0); } if (result == 0) @@ -1047,6 +927,8 @@ ndrv_do_remove_multicast(struct ndrv_cb *np, struct sockopt *sopt) // Remove from our linked list struct ndrv_multiaddr* cur = np->nd_multiaddrs; + ifma_release(ndrv_entry->ifma); + if (cur == ndrv_entry) { np->nd_multiaddrs = cur->next; @@ -1101,7 +983,8 @@ ndrv_remove_all_multicast(struct ndrv_cb* np) cur = np->nd_multiaddrs; np->nd_multiaddrs = cur->next; - if_delmulti(np->nd_if, &cur->addr); + if_delmultiaddr(cur->ifma, 0); + ifma_release(cur->ifma); FREE(cur, M_IFADDR); } } @@ -1109,17 +992,19 @@ ndrv_remove_all_multicast(struct ndrv_cb* np) struct pr_usrreqs ndrv_usrreqs = { ndrv_abort, pru_accept_notsupp, ndrv_attach, ndrv_bind, - ndrv_connect, pru_connect2_notsupp, ndrv_control, ndrv_detach, + ndrv_connect, pru_connect2_notsupp, pru_control_notsupp, ndrv_detach, ndrv_disconnect, pru_listen_notsupp, ndrv_peeraddr, pru_rcvd_notsupp, - pru_rcvoob_notsupp, ndrv_send, ndrv_sense, ndrv_shutdown, - ndrv_sockaddr, sosend, soreceive, sopoll + pru_rcvoob_notsupp, ndrv_send, pru_sense_null, ndrv_shutdown, + ndrv_sockaddr, sosend, soreceive, pru_sopoll_notsupp }; struct protosw ndrvsw = { SOCK_RAW, &ndrvdomain, NDRVPROTO_NDRV, PR_ATOMIC|PR_ADDR, - 0, ndrv_output, ndrv_ctlinput, ndrv_ctloutput, - 0, ndrv_init, 0, 0, - ndrv_drain, ndrv_sysctl, &ndrv_usrreqs + 0, ndrv_output, 0, ndrv_ctloutput, + 0, 0, 0, 0, + 0, 0, + &ndrv_usrreqs, + 0, 0, 0 }; struct domain ndrvdomain = diff --git a/bsd/net/ndrv.h b/bsd/net/ndrv.h index 0824dafca..6aa8e14bd 100644 --- a/bsd/net/ndrv.h +++ b/bsd/net/ndrv.h @@ -42,9 +42,9 @@ struct sockaddr_ndrv * Support for user-mode protocol handlers */ -#define NDRV_DEMUXTYPE_ETHERTYPE DLIL_DESC_ETYPE2 -#define NDRV_DEMUXTYPE_SAP DLIL_DESC_SAP -#define NDRV_DEMUXTYPE_SNAP DLIL_DESC_SNAP +#define NDRV_DEMUXTYPE_ETHERTYPE 4 +#define NDRV_DEMUXTYPE_SAP 5 +#define NDRV_DEMUXTYPE_SNAP 6 #define NDRVPROTO_NDRV 0 @@ -115,9 +115,7 @@ struct ndrv_protocol_desc }; #define SOL_NDRVPROTO NDRVPROTO_NDRV /* Use this socket level */ -/* NDRV_DMXSPEC 0x01 Obsolete */ #define NDRV_DELDMXSPEC 0x02 /* Delete the registered protocol */ -/* NDRV_DMXSPECCNT 0x03 Obsolete */ #define NDRV_SETDMXSPEC 0x04 /* Set the protocol spec */ #define NDRV_ADDMULTICAST 0x05 /* Add a physical multicast address */ #define NDRV_DELMULTICAST 0x06 /* Delete a phyiscal multicast */ @@ -145,11 +143,4 @@ struct ndrv_protocol_desc * you a second or two. 
*/ -#ifdef KERNEL -#ifdef __APPLE_API_UNSTABLE -/* Additional Kernel APIs */ -struct ifnet* ndrv_get_ifp(caddr_t ndrv_pcb); -#endif /* __APPLE_API_UNSTABLE */ -#endif - #endif /* _NET_NDRV_H */ diff --git a/bsd/net/ndrv_var.h b/bsd/net/ndrv_var.h index 74f8aaf2b..d39c2a7ad 100644 --- a/bsd/net/ndrv_var.h +++ b/bsd/net/ndrv_var.h @@ -27,9 +27,7 @@ #ifndef _NET_NDRV_VAR_H #define _NET_NDRV_VAR_H -#include <sys/appleapiopts.h> -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef PRIVATE /* * structure for storing a linked list of multicast addresses @@ -39,6 +37,7 @@ struct ndrv_multiaddr { struct ndrv_multiaddr* next; + struct if_multiaddr* ifma; struct sockaddr addr; }; @@ -49,8 +48,7 @@ struct ndrv_multiaddr */ struct ndrv_cb { - struct ndrv_cb *nd_next; /* Doubly-linked list */ - struct ndrv_cb *nd_prev; + TAILQ_ENTRY(ndrv_cb) nd_next; struct socket *nd_socket; /* Back to the socket */ u_int32_t nd_signature; /* Just double-checking */ struct sockaddr_ndrv *nd_faddr; @@ -59,8 +57,7 @@ struct ndrv_cb int nd_descrcnt; /* # elements in nd_dlist - Obsolete */ TAILQ_HEAD(dlist, dlil_demux_desc) nd_dlist; /* Descr. list */ struct ifnet *nd_if; /* obsolete, maintained for binary compatibility */ - u_long nd_send_tag; - u_long nd_tag; + u_long nd_proto_family; u_long nd_family; struct ndrv_multiaddr* nd_multiaddrs; short nd_unit; @@ -73,7 +70,5 @@ struct ndrv_cb #define NDRVSNDQ 8192 #define NDRVRCVQ 8192 -extern struct ndrv_cb ndrvl; /* Head of controlblock list */ -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#endif /* PRIVATE */ #endif /* _NET_NDRV_VAR_H */ diff --git a/bsd/net/net_osdep.c b/bsd/net/net_osdep.c index 8d2f62168..51bea3c12 100644 --- a/bsd/net/net_osdep.c +++ b/bsd/net/net_osdep.c @@ -61,7 +61,6 @@ #include <net/if.h> #include <net/if_types.h> -#include <net/netisr.h> #include <net/route.h> #include <net/bpf.h> diff --git a/bsd/net/net_osdep.h b/bsd/net/net_osdep.h index e350606ac..15775cb29 100644 --- a/bsd/net/net_osdep.h +++ b/bsd/net/net_osdep.h @@ -250,10 +250,10 @@ #ifndef __NET_NET_OSDEP_H_DEFINED_ #define __NET_NET_OSDEP_H_DEFINED_ #include <sys/appleapiopts.h> -#ifdef KERNEL +#ifdef KERNEL_PRIVATE struct ifnet; -extern const char *if_name __P((struct ifnet *)); +extern const char *if_name(struct ifnet *); #define HAVE_OLD_BPF @@ -272,5 +272,5 @@ extern const char *if_name __P((struct ifnet *)); #define WITH_CONVERT_IP_OFF #endif -#endif /*_KERNEL*/ +#endif /* KERNEL_PRIVATE */ #endif /*__NET_NET_OSDEP_H_DEFINED_ */ diff --git a/bsd/net/netisr.c b/bsd/net/netisr.c deleted file mode 100644 index 2ae5be8eb..000000000 --- a/bsd/net/netisr.c +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. 
- * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * Mach Operating System - * Copyright (c) 1987 Carnegie-Mellon University - * All rights reserved. The CMU software License Agreement specifies - * the terms and conditions for use and redistribution. - */ - -/* HISTORY - * 18-May-90 Avadis Tevanian (avie) at NeXT - * Changed to use sensible priorities (higher numbers -> higher pri). - * - * 1-Feb-88 David Golub (dbg) at Carnegie-Mellon University - * Goofed... netisr thread must run at splnet, because the routines - * it calls expect to be called from the softnet interrupt (at - * splnet). - * - * 19-Nov-87 David Golub (dbg) at Carnegie-Mellon University - * Created. - * - */ - -/* - * netisr.c - * - * Kernel thread for network code. - */ - - -#include <meta_features.h> -#include <machine/spl.h> -#include <net/netisr.h> - -#include <kern/thread.h> -#include <kern/processor.h> - -volatile int netisr; - - -void run_netisr(void) -{ - spl_t spl = splnet(); - - while (netisr != 0) { -#ifdef NIMP -#if NIMP > 0 - if (netisr & (1<<NETISR_IMP)){ - netisr &= ~(1<<NETISR_IMP); - impintr(); - } -#endif /* NIMP > 0 */ -#endif /* NIMP */ - -#if INET - if (netisr & (1<<NETISR_IP)){ - void ipintr(void); - - netisr &= ~(1<<NETISR_IP); - ipintr(); - } - if (netisr & (1<<NETISR_ARP)) { - void arpintr(void); - - netisr &= ~(1<<NETISR_ARP); - arpintr(); - } -#endif /* INET */ - -#if INET6 - if (netisr & (1<<NETISR_IPV6)){ - void ip6intr(void); - - netisr &= ~(1<<NETISR_IPV6); - ip6intr(); - } -#endif /* INET6 */ - -#if ISO - if (netisr & (1<<NETISR_ISO)) { - netisr &= ~(1<<NETISR_ISO); - isointr(); - } -#endif /* ISO */ - -#if CCITT - if (netisr & (1<<NETISR_CCITT)) { - netisr &= ~(1<<NETISR_CCITT); - ccittintr(); - } -#endif /* CCITT */ - -#if NS - if (netisr & (1<<NETISR_NS)){ - netisr &= ~(1<<NETISR_NS); - nsintr(); - } -#endif /* NS */ - -#if NETAT - if (netisr & (1<<NETISR_APPLETALK)){ - void atalkintr(void); - - netisr &= ~(1<<NETISR_APPLETALK); - atalkintr(); - } -#endif /* NETAT */ - } - - splx(spl); - - return; -} - diff --git a/bsd/net/pfkeyv2.h b/bsd/net/pfkeyv2.h index 2ea4e167f..eeff8a049 100644 --- a/bsd/net/pfkeyv2.h +++ b/bsd/net/pfkeyv2.h @@ -125,14 +125,14 @@ struct sadb_sa { u_int32_t sadb_sa_flags; }; -#ifdef __APPLE_API_PRIVATE +#ifdef PRIVATE struct sadb_sa_2 { struct sadb_sa sa; u_int16_t sadb_sa_natt_port; u_int16_t sadb_reserved0; u_int32_t sadb_reserved1; }; -#endif +#endif /* PRIVATE */ struct sadb_lifetime { u_int16_t sadb_lifetime_len; @@ -373,11 +373,11 @@ struct sadb_x_ipsecrequest { /* `flags' in sadb_sa structure holds followings */ #define SADB_X_EXT_NONE 0x0000 /* i.e. new format. */ #define SADB_X_EXT_OLD 0x0001 /* old format. */ -#ifdef __APPLE_API_PRIVATE +#ifdef PRIVATE #define SADB_X_EXT_NATT 0x0002 /* Use UDP encapsulation to traverse NAT */ #define SADB_X_EXT_NATT_KEEPALIVE 0x0004 /* Local node is behind NAT, send keepalives */ /* Should only be set for outbound SAs */ -#endif +#endif /* PRIVATE */ #define SADB_X_EXT_IV4B 0x0010 /* IV length of 4 bytes in use */ #define SADB_X_EXT_DERIV 0x0020 /* DES derived */ diff --git a/bsd/net/ppp_comp.h b/bsd/net/ppp_comp.h index 475e85e67..69a7f8b89 100644 --- a/bsd/net/ppp_comp.h +++ b/bsd/net/ppp_comp.h @@ -49,8 +49,7 @@ #ifndef _NET_PPP_COMP_H #define _NET_PPP_COMP_H -#include <sys/appleapiopts.h> -#ifdef __APPLE_API_UNSTABLE +#ifdef KERNEL_PRIVATE /* * The following symbols control whether we include code for * various compression methods. 
@@ -72,36 +71,35 @@ struct compressor { int compress_proto; /* CCP compression protocol number */ /* Allocate space for a compressor (transmit side) */ - void *(*comp_alloc) __P((u_char *options, int opt_len)); + void *(*comp_alloc)(u_char *options, int opt_len); /* Free space used by a compressor */ - void (*comp_free) __P((void *state)); + void (*comp_free)(void *state); /* Initialize a compressor */ - int (*comp_init) __P((void *state, u_char *options, int opt_len, - int unit, int hdrlen, int debug)); + int (*comp_init)(void *state, u_char *options, int opt_len, + int unit, int hdrlen, int debug); /* Reset a compressor */ - void (*comp_reset) __P((void *state)); + void (*comp_reset)(void *state); /* Compress a packet */ - int (*compress) __P((void *state, PACKETPTR *mret, - PACKETPTR mp, int orig_len, int max_len)); + int (*compress)(void *state, PACKETPTR *mret, + PACKETPTR mp, int orig_len, int max_len); /* Return compression statistics */ - void (*comp_stat) __P((void *state, struct compstat *stats)); + void (*comp_stat)(void *state, struct compstat *stats); /* Allocate space for a decompressor (receive side) */ - void *(*decomp_alloc) __P((u_char *options, int opt_len)); + void *(*decomp_alloc)(u_char *options, int opt_len); /* Free space used by a decompressor */ - void (*decomp_free) __P((void *state)); + void (*decomp_free)(void *state); /* Initialize a decompressor */ - int (*decomp_init) __P((void *state, u_char *options, int opt_len, - int unit, int hdrlen, int mru, int debug)); + int (*decomp_init)(void *state, u_char *options, int opt_len, + int unit, int hdrlen, int mru, int debug); /* Reset a decompressor */ - void (*decomp_reset) __P((void *state)); + void (*decomp_reset)(void *state); /* Decompress a packet. */ - int (*decompress) __P((void *state, PACKETPTR mp, - PACKETPTR *dmpp)); + int (*decompress)(void *state, PACKETPTR mp, PACKETPTR *dmpp); /* Update state for an incompressible packet received */ - void (*incomp) __P((void *state, PACKETPTR mp)); + void (*incomp)(void *state, PACKETPTR mp); /* Return decompression statistics */ - void (*decomp_stat) __P((void *state, struct compstat *stats)); + void (*decomp_stat)(void *state, struct compstat *stats); }; #endif /* PACKETPTR */ @@ -183,5 +181,5 @@ struct compressor { #define CI_PREDICTOR_2 2 /* config option for Predictor-2 */ #define CILEN_PREDICTOR_2 2 /* length of its config option */ -#endif /* __APPLE_API_UNSTABLE */ +#endif /* KERNEL_PRIVATE */ #endif /* _NET_PPP_COMP_H */ diff --git a/bsd/net/ppp_deflate.c b/bsd/net/ppp_deflate.c index 7d28a5b41..3d2b95df3 100644 --- a/bsd/net/ppp_deflate.c +++ b/bsd/net/ppp_deflate.c @@ -78,24 +78,23 @@ struct deflate_state { #define DEFLATE_OVHD 2 /* Deflate overhead/packet */ -static void *z_alloc __P((void *, u_int items, u_int size)); -static void z_free __P((void *, void *ptr)); -static void *z_comp_alloc __P((u_char *options, int opt_len)); -static void *z_decomp_alloc __P((u_char *options, int opt_len)); -static void z_comp_free __P((void *state)); -static void z_decomp_free __P((void *state)); -static int z_comp_init __P((void *state, u_char *options, int opt_len, - int unit, int hdrlen, int debug)); -static int z_decomp_init __P((void *state, u_char *options, int opt_len, - int unit, int hdrlen, int mru, int debug)); -static int z_compress __P((void *state, struct mbuf **mret, - struct mbuf *mp, int slen, int maxolen)); -static void z_incomp __P((void *state, struct mbuf *dmsg)); -static int z_decompress __P((void *state, struct mbuf *cmp, - struct mbuf **dmpp)); 
-static void z_comp_reset __P((void *state)); -static void z_decomp_reset __P((void *state)); -static void z_comp_stats __P((void *state, struct compstat *stats)); +static void *z_alloc(void *, u_int items, u_int size); +static void z_free(void *, void *ptr); +static void *z_comp_alloc(u_char *options, int opt_len); +static void *z_decomp_alloc(u_char *options, int opt_len); +static void z_comp_free(void *state); +static void z_decomp_free(void *state); +static int z_comp_init(void *state, u_char *options, int opt_len, + int unit, int hdrlen, int debug); +static int z_decomp_init(void *state, u_char *options, int opt_len, + int unit, int hdrlen, int mru, int debug); +static int z_compress(void *state, struct mbuf **mret, + struct mbuf *mp, int slen, int maxolen); +static void z_incomp(void *state, struct mbuf *dmsg); +static int z_decompress(void *state, struct mbuf *cmp, struct mbuf **dmpp); +static void z_comp_reset(void *state); +static void z_decomp_reset(void *state); +static void z_comp_stats(void *state, struct compstat *stats); /* * Procedures exported to if_ppp.c. diff --git a/bsd/net/ppp_defs.h b/bsd/net/ppp_defs.h index 416aa9ede..b902632a9 100644 --- a/bsd/net/ppp_defs.h +++ b/bsd/net/ppp_defs.h @@ -167,12 +167,4 @@ struct ppp_idle { time_t recv_idle; /* time since last NP packet received */ }; -#ifndef __P -#ifdef __STDC__ -#define __P(x) x -#else -#define __P(x) () -#endif -#endif - #endif /* _PPP_DEFS_H_ */ diff --git a/bsd/net/radix.c b/bsd/net/radix.c index cfe854974..bbc3572b1 100644 --- a/bsd/net/radix.c +++ b/bsd/net/radix.c @@ -70,17 +70,20 @@ #endif #include <sys/syslog.h> #include <net/radix.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <kern/locks.h> #endif -static int rn_walktree_from __P((struct radix_node_head *h, void *a, - void *m, walktree_f_t *f, void *w)); -static int rn_walktree __P((struct radix_node_head *, walktree_f_t *, void *)); +static int rn_walktree_from(struct radix_node_head *h, void *a, + void *m, walktree_f_t *f, void *w); +static int rn_walktree(struct radix_node_head *, walktree_f_t *, void *); static struct radix_node - *rn_insert __P((void *, struct radix_node_head *, int *, - struct radix_node [2])), - *rn_newpair __P((void *, int, struct radix_node[2])), - *rn_search __P((void *, struct radix_node *)), - *rn_search_m __P((void *, struct radix_node *, void *)); + *rn_insert(void *, struct radix_node_head *, int *, + struct radix_node [2]), + *rn_newpair(void *, int, struct radix_node[2]), + *rn_search(void *, struct radix_node *), + *rn_search_m(void *, struct radix_node *, void *); static int max_keylen; static struct radix_mask *rn_mkfreelist; @@ -89,17 +92,22 @@ static char *addmask_key; static char normal_chars[] = {0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, -1}; static char *rn_zeros, *rn_ones; + +extern lck_grp_t *domain_proto_mtx_grp; +extern lck_attr_t *domain_proto_mtx_attr; +lck_mtx_t *rn_mutex; + #define rn_masktop (mask_rnhead->rnh_treetop) #undef Bcmp #define Bcmp(a, b, l) \ (l == 0 ? 
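Much of the churn in these hunks is mechanical removal of the 4.4BSD __P() wrapper, whose definition is deleted from ppp_defs.h above. A reminder of what it did and why every use carried doubled parentheses:

/* Pre-ANSI (K&R) compilers rejected prototypes, so headers wrapped each
 * parameter list in __P().  The inner parentheses turn the whole list
 * into a single macro argument. */
#ifdef __STDC__
#define __P(protos) protos      /* ANSI: keep the parameter list */
#else
#define __P(protos) ()          /* K&R: fall back to empty parens */
#endif

int z_compress __P((void *state, int slen));
/* ANSI expansion: int z_compress (void *state, int slen);
 * K&R expansion:  int z_compress ();                      */

With only ANSI compilers left to support, the wrapper is pure noise, which is why this patch strips it wholesale.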
0 : bcmp((caddr_t)(a), (caddr_t)(b), (u_long)l)) -static int rn_lexobetter __P((void *m_arg, void *n_arg)); +static int rn_lexobetter(void *m_arg, void *n_arg); static struct radix_mask * - rn_new_radix_mask __P((struct radix_node *tt, - struct radix_mask *next)); -static int rn_satsifies_leaf __P((char *trial, struct radix_node *leaf, - int skip)); + rn_new_radix_mask(struct radix_node *tt, + struct radix_mask *next); +static int rn_satsifies_leaf(char *trial, struct radix_node *leaf, + int skip); /* * The data structure for the keys is a radix tree with one way @@ -496,7 +504,7 @@ rn_addmask(n_arg, search, skip) x = rn_insert(cp, mask_rnhead, &maskduplicated, x); if (maskduplicated) { log(LOG_ERR, "rn_addmask: mask impossibly already in tree"); - Free(saved_x); + R_Free(saved_x); return (x); } /* @@ -1084,6 +1092,7 @@ rn_init() #ifdef KERNEL struct domain *dom; + /* lock already held when rn_init is called */ for (dom = domains; dom; dom = dom->dom_next) if (dom->dom_maxrtkey > max_keylen) max_keylen = dom->dom_maxrtkey; @@ -1103,4 +1112,41 @@ rn_init() *cp++ = -1; if (rn_inithead((void **)&mask_rnhead, 0) == 0) panic("rn_init 2"); + + rn_mutex = lck_mtx_alloc_init(domain_proto_mtx_grp, domain_proto_mtx_attr); +} +int +rn_lock(so, refcount, lr) + struct socket *so; + int refcount; + int lr; +{ +// printf("rn_lock: (global) so=%x ref=%d lr=%x\n", so, so->so_usecount, lr); + lck_mtx_assert(rn_mutex, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(rn_mutex); + if (refcount) + so->so_usecount++; + return (0); +} + +int +rn_unlock(so, refcount, lr) + struct socket *so; + int refcount; + int lr; +{ +// printf("rn_unlock: (global) so=%x ref=%d lr=%x\n", so, so->so_usecount, lr); + if (refcount) + so->so_usecount--; + lck_mtx_assert(rn_mutex, LCK_MTX_ASSERT_OWNED); + lck_mtx_unlock(rn_mutex); + return (0); +} +lck_mtx_t * +rn_getlock(so, locktype) + struct socket *so; + int locktype; +{ +// printf("rn_getlock: (global) so=%x\n", so); + return (rn_mutex); } diff --git a/bsd/net/radix.h b/bsd/net/radix.h index 833e9f714..73d0b6da7 100644 --- a/bsd/net/radix.h +++ b/bsd/net/radix.h @@ -59,7 +59,7 @@ #define _RADIX_H_ #include <sys/appleapiopts.h> -#if !defined(KERNEL) || defined(__APPLE_API_PRIVATE) +#ifdef PRIVATE #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_RTABLE); @@ -104,14 +104,6 @@ struct radix_node { #define rn_left rn_u.rn_node.rn_L #define rn_right rn_u.rn_node.rn_R -#if 0 -/* for backward compatibility with previous definitions */ -#define rn_p rn_parent -#define rn_b rn_bit -#define rn_off rn_offset -#define rn_l rn_left -#define rn_r rn_right -#endif /* * Annotations to tree concerning potential routes applying to subtrees. 
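rn_lock()/rn_unlock() above pair one global mutex with a per-socket use count, so the socket cannot be freed out from under code that has dropped into the radix layer. A pthread rendering of that pin-while-locked pairing; sock and usecount are stand-ins for struct socket and so_usecount:

#include <pthread.h>

struct sock {
    int usecount;               /* references pinned by lock holders */
};

static pthread_mutex_t table_mtx = PTHREAD_MUTEX_INITIALIZER;

static int
table_lock(struct sock *so, int refcount)
{
    pthread_mutex_lock(&table_mtx);   /* one mutex for the whole table */
    if (refcount)
        so->usecount++;               /* pin: socket outlives this section */
    return (0);
}

static int
table_unlock(struct sock *so, int refcount)
{
    if (refcount)
        so->usecount--;               /* unpin under the same mutex */
    pthread_mutex_unlock(&table_mtx);
    return (0);
}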
*/ @@ -141,35 +133,35 @@ struct radix_mask { #define MKFree(m) { (m)->rm_mklist = rn_mkfreelist; rn_mkfreelist = (m);} -typedef int walktree_f_t __P((struct radix_node *, void *)); +typedef int walktree_f_t(struct radix_node *, void *); struct radix_node_head { struct radix_node *rnh_treetop; int rnh_addrsize; /* permit, but not require fixed keys */ int rnh_pktsize; /* permit, but not require fixed keys */ struct radix_node *(*rnh_addaddr) /* add based on sockaddr */ - __P((void *v, void *mask, - struct radix_node_head *head, struct radix_node nodes[])); + (void *v, void *mask, + struct radix_node_head *head, struct radix_node nodes[]); struct radix_node *(*rnh_addpkt) /* add based on packet hdr */ - __P((void *v, void *mask, - struct radix_node_head *head, struct radix_node nodes[])); + (void *v, void *mask, + struct radix_node_head *head, struct radix_node nodes[]); struct radix_node *(*rnh_deladdr) /* remove based on sockaddr */ - __P((void *v, void *mask, struct radix_node_head *head)); + (void *v, void *mask, struct radix_node_head *head); struct radix_node *(*rnh_delpkt) /* remove based on packet hdr */ - __P((void *v, void *mask, struct radix_node_head *head)); + (void *v, void *mask, struct radix_node_head *head); struct radix_node *(*rnh_matchaddr) /* locate based on sockaddr */ - __P((void *v, struct radix_node_head *head)); + (void *v, struct radix_node_head *head); struct radix_node *(*rnh_lookup) /* locate based on sockaddr */ - __P((void *v, void *mask, struct radix_node_head *head)); + (void *v, void *mask, struct radix_node_head *head); struct radix_node *(*rnh_matchpkt) /* locate based on packet hdr */ - __P((void *v, struct radix_node_head *head)); + (void *v, struct radix_node_head *head); int (*rnh_walktree) /* traverse tree */ - __P((struct radix_node_head *head, walktree_f_t *f, void *w)); + (struct radix_node_head *head, walktree_f_t *f, void *w); int (*rnh_walktree_from) /* traverse tree below a */ - __P((struct radix_node_head *head, void *a, void *m, - walktree_f_t *f, void *w)); + (struct radix_node_head *head, void *a, void *m, + walktree_f_t *f, void *w); void (*rnh_close) /* do something when the last ref drops */ - __P((struct radix_node *rn, struct radix_node_head *head)); + (struct radix_node *rn, struct radix_node_head *head); struct radix_node rnh_nodes[3]; /* empty tree for common case */ }; @@ -178,26 +170,25 @@ struct radix_node_head { #define Bcopy(a, b, n) bcopy(((char *)(a)), ((char *)(b)), (unsigned)(n)) #define Bzero(p, n) bzero((char *)(p), (int)(n)); #define R_Malloc(p, t, n) (p = (t) malloc((unsigned int)(n))) -#define Free(p) free((char *)p); +#define R_Free(p) free((char *)p); #else #define Bcmp(a, b, n) bcmp(((caddr_t)(a)), ((caddr_t)(b)), (unsigned)(n)) #define Bcopy(a, b, n) bcopy(((caddr_t)(a)), ((caddr_t)(b)), (unsigned)(n)) #define Bzero(p, n) bzero((caddr_t)(p), (unsigned)(n)); #define R_Malloc(p, t, n) (p = (t) _MALLOC((unsigned long)(n), M_RTABLE, M_WAITOK)) -#define Free(p) FREE((caddr_t)p, M_RTABLE); +#define R_Free(p) FREE((caddr_t)p, M_RTABLE); #endif /*KERNEL*/ -void rn_init __P((void)); -int rn_inithead __P((void **, int)); -int rn_refines __P((void *, void *)); +void rn_init(void); +int rn_inithead(void **, int); +int rn_refines(void *, void *); struct radix_node - *rn_addmask __P((void *, int, int)), - *rn_addroute __P((void *, void *, struct radix_node_head *, - struct radix_node [2])), - *rn_delete __P((void *, void *, struct radix_node_head *)), - *rn_lookup __P((void *v_arg, void *m_arg, - struct radix_node_head *head)), 
- *rn_match __P((void *, struct radix_node_head *)); - -#endif /* __APPLE_API_PRIVATE || !KERNEL */ + *rn_addmask(void *, int, int), + *rn_addroute(void *, void *, struct radix_node_head *, + struct radix_node [2]), + *rn_delete(void *, void *, struct radix_node_head *), + *rn_lookup(void *v_arg, void *m_arg, struct radix_node_head *head), + *rn_match(void *, struct radix_node_head *); + +#endif /* PRIVATE */ #endif /* _RADIX_H_ */ diff --git a/bsd/net/raw_cb.c b/bsd/net/raw_cb.c index 105ef13c8..cba291fe0 100644 --- a/bsd/net/raw_cb.c +++ b/bsd/net/raw_cb.c @@ -60,6 +60,7 @@ #include <sys/socketvar.h> #include <sys/domain.h> #include <sys/protosw.h> +#include <kern/locks.h> #include <net/raw_cb.h> @@ -76,6 +77,7 @@ struct rawcb_list_head rawcb_list; static u_long raw_sendspace = RAWSNDQ; static u_long raw_recvspace = RAWRCVQ; +extern lck_mtx_t *raw_mtx; /*### global raw cb mutex for now */ /* * Allocate a control block and a nominal amount @@ -102,7 +104,9 @@ raw_attach(so, proto) rp->rcb_socket = so; rp->rcb_proto.sp_family = so->so_proto->pr_domain->dom_family; rp->rcb_proto.sp_protocol = proto; + lck_mtx_lock(raw_mtx); LIST_INSERT_HEAD(&rawcb_list, rp, list); + lck_mtx_unlock(raw_mtx); return (0); } @@ -117,13 +121,21 @@ raw_detach(rp) struct socket *so = rp->rcb_socket; so->so_pcb = 0; + so->so_flags |= SOF_PCBCLEARING; sofree(so); + if (!lck_mtx_try_lock(raw_mtx)) { + socket_unlock(so, 0); + lck_mtx_lock(raw_mtx); + socket_lock(so, 0); + } LIST_REMOVE(rp, list); + lck_mtx_unlock(raw_mtx); #ifdef notdef if (rp->rcb_laddr) m_freem(dtom(rp->rcb_laddr)); rp->rcb_laddr = 0; #endif + rp->rcb_socket = NULL; FREE((caddr_t)(rp), M_PCB); } diff --git a/bsd/net/raw_cb.h b/bsd/net/raw_cb.h index d047ae38f..478cdd571 100644 --- a/bsd/net/raw_cb.h +++ b/bsd/net/raw_cb.h @@ -60,7 +60,7 @@ #include <sys/queue.h> -#ifdef __APPLE_API_PRIVATE +#ifdef PRIVATE /* * Raw protocol interface control block. Used * to tie a socket to the generic raw interface. 
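raw_detach() above is a tidy example of lock-order discipline: the socket lock is already held, the list lock ranks ahead of it, so the code tries the list lock and, on failure, backs all the way off before taking both in the correct order. In pthread form (list_mtx and the revalidation comment are the hypothetical parts):

#include <pthread.h>

static pthread_mutex_t list_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Caller holds *so_mtx; established order is list_mtx before so_mtx. */
static void
detach_from_list(pthread_mutex_t *so_mtx)
{
    if (pthread_mutex_trylock(&list_mtx) != 0) {
        pthread_mutex_unlock(so_mtx);   /* back off to respect the order */
        pthread_mutex_lock(&list_mtx);
        pthread_mutex_lock(so_mtx);     /* re-taken: state may have changed
                                           and must be revalidated */
    }
    /* ... unlink the entry from the global list ... */
    pthread_mutex_unlock(&list_mtx);
}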
@@ -73,7 +73,6 @@ struct rawcb { struct sockproto rcb_proto; /* protocol family, protocol */ u_long reserved[4]; /* for future use */ }; -#endif /* __APPLE_API_PRIVATE */ #define sotorawcb(so) ((struct rawcb *)(so)->so_pcb) @@ -82,21 +81,20 @@ */ #define RAWSNDQ 8192 #define RAWRCVQ 8192 +#endif /* PRIVATE */ -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE extern LIST_HEAD(rawcb_list_head, rawcb) rawcb_list; -int raw_attach __P((struct socket *, int)); -void raw_ctlinput __P((int, struct sockaddr *, void *)); -void raw_detach __P((struct rawcb *)); -void raw_disconnect __P((struct rawcb *)); -void raw_init __P((void)); -void raw_input __P((struct mbuf *, - struct sockproto *, struct sockaddr *, struct sockaddr *)); +int raw_attach(struct socket *, int); +void raw_ctlinput(int, struct sockaddr *, void *); +void raw_detach(struct rawcb *); +void raw_disconnect(struct rawcb *); +void raw_init(void); +void raw_input(struct mbuf *, + struct sockproto *, struct sockaddr *, struct sockaddr *); extern struct pr_usrreqs raw_usrreqs; -#endif /* __APPLE_API_PRIVATE */ -#endif +#endif /* KERNEL_PRIVATE */ #endif diff --git a/bsd/net/raw_usrreq.c b/bsd/net/raw_usrreq.c index 5e8a246a9..d9bf97217 100644 --- a/bsd/net/raw_usrreq.c +++ b/bsd/net/raw_usrreq.c @@ -59,18 +59,38 @@ #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/proc.h> +#include <sys/domain.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <kern/locks.h> #include <net/raw_cb.h> +lck_mtx_t *raw_mtx; /*### global raw cb mutex for now */ +lck_attr_t *raw_mtx_attr; +lck_grp_t *raw_mtx_grp; +lck_grp_attr_t *raw_mtx_grp_attr; /* * Initialize raw connection block q. */ void raw_init() { + raw_mtx_grp_attr = lck_grp_attr_alloc_init(); + + lck_grp_attr_setdefault(raw_mtx_grp_attr); + + raw_mtx_grp = lck_grp_alloc_init("rawcb", raw_mtx_grp_attr); + + raw_mtx_attr = lck_attr_alloc_init(); + + lck_attr_setdefault(raw_mtx_attr); + + if ((raw_mtx = lck_mtx_alloc_init(raw_mtx_grp, raw_mtx_attr)) == NULL) { + printf("raw_init: can't alloc raw_mtx\n"); + return; + } LIST_INIT(&rawcb_list); } @@ -93,8 +113,14 @@ raw_input(m0, proto, src, dst) register struct mbuf *m = m0; register int sockets = 0; struct socket *last; + int error; +//####LD raw_input is called from many places, input & output path. We have to assume the +//####LD socket we'll find and need to append to is unlocked.
+//####LD calls from the output (locked) path need to make sure the socket is not locked when +//####LD we call in raw_input last = 0; + lck_mtx_lock(raw_mtx); LIST_FOREACH(rp, &rawcb_list, list) { if (rp->rcb_proto.sp_family != proto->sp_family) continue; @@ -119,28 +145,28 @@ raw_input(m0, proto, src, dst) struct mbuf *n; n = m_copy(m, 0, (int)M_COPYALL); if (n) { + socket_lock(last, 1); if (sbappendaddr(&last->so_rcv, src, - n, (struct mbuf *)0) == 0) - /* should notify about lost packet */ - m_freem(n); - else { + n, (struct mbuf *)0, &error) != 0) { sorwakeup(last); sockets++; } + socket_unlock(last, 1); } } last = rp->rcb_socket; } if (last) { + socket_lock(last, 1); if (sbappendaddr(&last->so_rcv, src, - m, (struct mbuf *)0) == 0) - m_freem(m); - else { + m, (struct mbuf *)0, &error) != 0) { sorwakeup(last); sockets++; } + socket_unlock(last, 1); } else m_freem(m); + lck_mtx_unlock(raw_mtx); } /*ARGSUSED*/ @@ -161,6 +187,13 @@ raw_uabort(struct socket *so) { struct rawcb *rp = sotorawcb(so); + lck_mtx_t * mutex_held; + if (so->so_proto->pr_getlock != NULL) + mutex_held = (*so->so_proto->pr_getlock)(so, 0); + else + mutex_held = so->so_proto->pr_domain->dom_mtx; + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); + if (rp == 0) return EINVAL; raw_disconnect(rp); @@ -175,7 +208,9 @@ static int raw_uattach(struct socket *so, int proto, struct proc *p) { struct rawcb *rp = sotorawcb(so); +#ifndef __APPLE__ int error; +#endif if (rp == 0) return EINVAL; @@ -209,6 +244,12 @@ raw_udetach(struct socket *so) { struct rawcb *rp = sotorawcb(so); + lck_mtx_t * mutex_held; + if (so->so_proto->pr_getlock != NULL) + mutex_held = (*so->so_proto->pr_getlock)(so, 0); + else + mutex_held = so->so_proto->pr_domain->dom_mtx; + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); if (rp == 0) return EINVAL; @@ -257,6 +298,13 @@ raw_usend(struct socket *so, int flags, struct mbuf *m, int error; struct rawcb *rp = sotorawcb(so); + lck_mtx_t * mutex_held; + if (so->so_proto->pr_getlock != NULL) + mutex_held = (*so->so_proto->pr_getlock)(so, 0); + else + mutex_held = so->so_proto->pr_domain->dom_mtx; + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); + if (rp == 0) { error = EINVAL; goto release; @@ -297,6 +345,12 @@ static int raw_ushutdown(struct socket *so) { struct rawcb *rp = sotorawcb(so); + lck_mtx_t * mutex_held; + if (so->so_proto->pr_getlock != NULL) + mutex_held = (*so->so_proto->pr_getlock)(so, 0); + else + mutex_held = so->so_proto->pr_domain->dom_mtx; + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); if (rp == 0) return EINVAL; @@ -322,5 +376,5 @@ struct pr_usrreqs raw_usrreqs = { pru_connect2_notsupp, pru_control_notsupp, raw_udetach, raw_udisconnect, pru_listen_notsupp, raw_upeeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, raw_usend, pru_sense_null, raw_ushutdown, - raw_usockaddr, sosend, soreceive, sopoll + raw_usockaddr, sosend, soreceive, pru_sopoll_notsupp }; diff --git a/bsd/net/route.c b/bsd/net/route.c index 0f41d2360..4ab8d1d16 100644 --- a/bsd/net/route.c +++ b/bsd/net/route.c @@ -62,6 +62,7 @@ #include <sys/socket.h> #include <sys/domain.h> #include <sys/syslog.h> +#include <kern/lock.h> #include <net/if.h> #include <net/route.h> @@ -73,15 +74,22 @@ #define SA(p) ((struct sockaddr *)(p)) +extern struct domain routedomain; struct route_cb route_cb; -static struct rtstat rtstat; +__private_extern__ struct rtstat rtstat = { 0, 0, 0, 0, 0 }; struct radix_node_head *rt_tables[AF_MAX+1]; -static int rttrash; /* routes not in table but not freed */ +lck_mtx_t *rt_mtx; /*### global routing 
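Several of the raw_u*() entry points above repeat one idiom verbatim: prefer a per-socket lock when the protocol provides pr_getlock(), otherwise fall back to the shared domain mutex, then assert ownership. Distilled into a helper; socket_get_mutex is a hypothetical name, but the fields are the ones used above:

static lck_mtx_t *
socket_get_mutex(struct socket *so)
{
    if (so->so_proto->pr_getlock != NULL)
        return ((*so->so_proto->pr_getlock)(so, 0));
    return (so->so_proto->pr_domain->dom_mtx);   /* shared domain lock */
}

/* typical use at the top of an entry point:
 *     lck_mtx_assert(socket_get_mutex(so), LCK_MTX_ASSERT_OWNED);
 */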
tables mutex for now */ +lck_attr_t *rt_mtx_attr; +lck_grp_t *rt_mtx_grp; +lck_grp_attr_t *rt_mtx_grp_attr; -static void rt_maskedcopy __P((struct sockaddr *, - struct sockaddr *, struct sockaddr *)); -static void rtable_init __P((void **)); +lck_mtx_t *route_domain_mtx; /*### global routing tables mutex for now */ +__private_extern__ int rttrash = 0; /* routes not in table but not freed */ + +static void rt_maskedcopy(struct sockaddr *, + struct sockaddr *, struct sockaddr *); +static void rtable_init(void **); __private_extern__ u_long route_generation = 0; extern int use_routegenid; @@ -101,8 +109,26 @@ rtable_init(table) void route_init() { + rt_mtx_grp_attr = lck_grp_attr_alloc_init(); + + lck_grp_attr_setdefault(rt_mtx_grp_attr); + + rt_mtx_grp = lck_grp_alloc_init("route", rt_mtx_grp_attr); + + rt_mtx_attr = lck_attr_alloc_init(); + + lck_attr_setdefault(rt_mtx_attr); + + if ((rt_mtx = lck_mtx_alloc_init(rt_mtx_grp, rt_mtx_attr)) == NULL) { + printf("route_init: can't alloc rt_mtx\n"); + return; + } + + lck_mtx_lock(rt_mtx); rn_init(); /* initialize all zeroes, all ones, mask table */ + lck_mtx_unlock(rt_mtx); rtable_init((void **)rt_tables); + route_domain_mtx = routedomain.dom_mtx; } /* @@ -116,34 +142,41 @@ rtalloc(ro) } void -rtalloc_ign(ro, ignore) +rtalloc_ign_locked(ro, ignore) register struct route *ro; u_long ignore; { struct rtentry *rt; - int s; if ((rt = ro->ro_rt) != NULL) { if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) return; /* XXX - We are probably always at splnet here already. */ - s = splnet(); - rtfree(rt); + rtfree_locked(rt); ro->ro_rt = NULL; - splx(s); } - ro->ro_rt = rtalloc1(&ro->ro_dst, 1, ignore); + ro->ro_rt = rtalloc1_locked(&ro->ro_dst, 1, ignore); if (ro->ro_rt) ro->ro_rt->generation_id = route_generation; } +void +rtalloc_ign(ro, ignore) + register struct route *ro; + u_long ignore; +{ + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(rt_mtx); + rtalloc_ign_locked(ro, ignore); + lck_mtx_unlock(rt_mtx); +} /* * Look up the route that matches the address given * Or, at least try.. Create a cloned route if needed. */ struct rtentry * -rtalloc1(dst, report, ignflags) - register struct sockaddr *dst; +rtalloc1_locked(dst, report, ignflags) + const struct sockaddr *dst; int report; u_long ignflags; { @@ -153,8 +186,7 @@ rtalloc1(dst, report, ignflags) struct rtentry *newrt = 0; struct rt_addrinfo info; u_long nflags; - int s = splnet(), err = 0, msgtype = RTM_MISS; - + int err = 0, msgtype = RTM_MISS; /* * Look up the address in the table for that Address Family */ @@ -172,7 +204,7 @@ rtalloc1(dst, report, ignflags) * If it requires that it be cloned, do so. * (This implies it wasn't a HOST route.) */ - err = rtrequest(RTM_RESOLVE, dst, SA(0), + err = rtrequest_locked(RTM_RESOLVE, dst, SA(0), SA(0), 0, &newrt); if (err) { /* @@ -211,27 +243,46 @@ rtalloc1(dst, report, ignflags) rt_missmsg(msgtype, &info, 0, err); } } - splx(s); return (newrt); } +struct rtentry * +rtalloc1(dst, report, ignflags) + register struct sockaddr *dst; + int report; + u_long ignflags; +{ + struct rtentry * entry; + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(rt_mtx); + entry = rtalloc1_locked(dst, report, ignflags); + lck_mtx_unlock(rt_mtx); + return (entry); +} + /* * Remove a reference count from an rtentry. 
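route.c is being split into *_locked() cores plus thin public wrappers: the wrapper asserts rt_mtx is not already owned, takes it, calls the core, and releases it, while internal callers that already hold the mutex call the core directly. The pattern in miniature, with hypothetical names:

#include <pthread.h>

static pthread_mutex_t table_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Core: caller must hold table_mtx; free to call other *_locked() helpers. */
static int
lookup_locked(int key)
{
    return (key * 2);
}

/* Public wrapper: must never be entered with table_mtx held, or the
 * non-recursive mutex below would self-deadlock. */
int
lookup(int key)
{
    int result;

    pthread_mutex_lock(&table_mtx);
    result = lookup_locked(key);
    pthread_mutex_unlock(&table_mtx);
    return (result);
}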
* If the count gets low enough, take it out of the routing table */ void -rtfree(rt) +rtfree_locked(rt) register struct rtentry *rt; { /* * find the tree for that address family * Note: in the case of igmp packets, there might not be an rnh */ - register struct radix_node_head *rnh = - rt_tables[rt_key(rt)->sa_family]; + register struct radix_node_head *rnh; - if (rt == 0) - panic("rtfree"); + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); + + /* See 3582620 - We hit this during the transition from funnels to locks */ + if (rt == 0) { + printf("rtfree - rt is NULL\n"); + return; + } + + rnh = rt_tables[rt_key(rt)->sa_family]; /* * decrement the reference count by one and if it reaches 0, @@ -258,7 +309,7 @@ rtfree(rt) #ifdef DIAGNOSTIC if (rt->rt_refcnt < 0) { - printf("rtfree: %p not freed (neg refs)\n", rt); + printf("rtfree: %p not freed (neg refs) cnt=%d\n", rt, rt->rt_refcnt); return; } #endif @@ -268,20 +319,11 @@ rtfree(rt) * e.g other routes and ifaddrs. */ if (rt->rt_parent) - rtfree(rt->rt_parent); + rtfree_locked(rt->rt_parent); - if(rt->rt_ifa && !(rt->rt_parent && rt->rt_parent->rt_ifa == rt->rt_ifa)) { - /* - * Only release the ifa if our parent doesn't hold it for us. - * The parent route is responsible for holding a reference - * to the ifa for us. Ifa refcounts are 16bit, if every - * cloned route held a reference, the 16bit refcount may - * rollover, making a mess :( - * - * FreeBSD solved this by making the ifa_refcount 32bits, but - * we can't do that since it changes the size of the ifaddr struct. - */ + if(rt->rt_ifa) { ifafree(rt->rt_ifa); + rt->rt_ifa = NULL; } /* @@ -289,15 +331,25 @@ rtfree(rt) * This also frees the gateway, as they are always malloc'd * together. */ - Free(rt_key(rt)); + R_Free(rt_key(rt)); /* * and the rtentry itself of course */ - Free(rt); + R_Free(rt); } } +void +rtfree(rt) + register struct rtentry *rt; +{ + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(rt_mtx); + rtfree_locked(rt); + lck_mtx_unlock(rt_mtx); +} + /* * Decrements the refcount but does not free the route when * the refcount reaches zero. 
Unless you have really good reason, @@ -306,6 +358,8 @@ rtfree(rt) void rtunref(struct rtentry* rt) { + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); + if (rt == NULL) panic("rtunref"); rt->rt_refcnt--; @@ -321,6 +375,8 @@ rtunref(struct rtentry* rt) void rtref(struct rtentry* rt) { + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); + if (rt == NULL) panic("rtref"); @@ -336,15 +392,15 @@ rtsetifa(struct rtentry *rt, struct ifaddr* ifa) if (rt->rt_ifa == ifa) return; - /* Release the old ifa if it isn't our parent route's ifa */ - if (rt->rt_ifa && !(rt->rt_parent && rt->rt_parent->rt_ifa == rt->rt_ifa)) + /* Release the old ifa */ + if (rt->rt_ifa) ifafree(rt->rt_ifa); /* Set rt_ifa */ rt->rt_ifa = ifa; - /* Take a reference to the ifa if it isn't our parent route's ifa */ - if (rt->rt_ifa && !(rt->rt_parent && rt->rt_parent->rt_ifa == ifa)) + /* Take a reference to the ifa */ + if (rt->rt_ifa) ifaref(rt->rt_ifa); } @@ -352,43 +408,31 @@ void ifafree(ifa) register struct ifaddr *ifa; { + int i, oldval; + u_char *ptr = (u_char*)ifa; + if (ifa == NULL) panic("ifafree"); - if (ifa->ifa_refcnt == 0) { -#ifdef __APPLE__ - /* Detect case where an ifa is being freed before it should */ - struct ifnet* ifp; - /* Verify this ifa isn't attached to an interface */ - for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_link.tqe_next) { - struct ifaddr *ifaInUse; - for (ifaInUse = ifp->if_addrhead.tqh_first; ifaInUse; ifaInUse = ifaInUse->ifa_link.tqe_next) { - if (ifa == ifaInUse) { - /* - * This is an ugly hack done because we can't move to a 32 bit - * refcnt like bsd has. We have to maintain binary compatibility - * in our kernel, unlike FreeBSD. - */ - log(LOG_ERR, "ifa attached to ifp is being freed, leaking insted\n"); - return; - } - } + + oldval = OSAddAtomic(-1, &ifa->ifa_refcnt); + + if (oldval == 0) { + if ((ifa->ifa_flags & IFA_ATTACHED) != 0) { + panic("ifa attached to ifp is being freed\n"); } -#endif FREE(ifa, M_IFADDR); } - else - ifa->ifa_refcnt--; } -#ifdef __APPLE__ void ifaref(struct ifaddr *ifa) { if (ifa == NULL) panic("ifaref"); - ifa->ifa_refcnt++; + + if (OSAddAtomic(1, &ifa->ifa_refcnt) == 0xffffffff) + panic("ifaref - reference count rolled over!"); } -#endif /* * Force a routing table entry to the specified @@ -409,14 +453,18 @@ rtredirect(dst, gateway, netmask, flags, src, rtp) int error = 0; short *stat = 0; struct rt_addrinfo info; - struct ifaddr *ifa; + struct ifaddr *ifa = NULL; + + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(rt_mtx); /* verify the gateway is directly reachable */ if ((ifa = ifa_ifwithnet(gateway)) == 0) { error = ENETUNREACH; goto out; } - rt = rtalloc1(dst, 0, 0UL); + + rt = rtalloc1_locked(dst, 0, 0UL); /* * If the redirect isn't from our current router for this dst, * it's either old or wrong. 
If it redirects us to ourselves, @@ -427,8 +475,20 @@ rtredirect(dst, gateway, netmask, flags, src, rtp) if (!(flags & RTF_DONE) && rt && (!equal(src, rt->rt_gateway) || rt->rt_ifa != ifa)) error = EINVAL; - else if (ifa_ifwithaddr(gateway)) - error = EHOSTUNREACH; + else { + ifafree(ifa); + if ((ifa = ifa_ifwithaddr(gateway))) { + ifafree(ifa); + ifa = NULL; + error = EHOSTUNREACH; + } + } + + if (ifa) { + ifafree(ifa); + ifa = NULL; + } + if (error) goto done; /* @@ -451,7 +511,7 @@ rtredirect(dst, gateway, netmask, flags, src, rtp) */ create: flags |= RTF_GATEWAY | RTF_DYNAMIC; - error = rtrequest((int)RTM_ADD, dst, gateway, + error = rtrequest_locked((int)RTM_ADD, dst, gateway, netmask, flags, (struct rtentry **)0); stat = &rtstat.rts_dynamic; @@ -475,7 +535,7 @@ done: if (rtp && !error) *rtp = rt; else - rtfree(rt); + rtfree_locked(rt); } out: if (error) @@ -488,6 +548,7 @@ out: info.rti_info[RTAX_NETMASK] = netmask; info.rti_info[RTAX_AUTHOR] = src; rt_missmsg(RTM_REDIRECT, &info, flags, error); + lck_mtx_unlock(rt_mtx); } /* @@ -512,11 +573,15 @@ rtioctl(req, data, p) } struct ifaddr * -ifa_ifwithroute(flags, dst, gateway) - int flags; - struct sockaddr *dst, *gateway; +ifa_ifwithroute( + int flags, + const struct sockaddr *dst, + const struct sockaddr *gateway) { - register struct ifaddr *ifa; + + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); + + struct ifaddr *ifa = 0; if ((flags & RTF_GATEWAY) == 0) { /* * If we are adding a route to an interface, @@ -525,7 +590,6 @@ ifa_ifwithroute(flags, dst, gateway) * as our clue to the interface. Otherwise * we can use the local address. */ - ifa = 0; if (flags & RTF_HOST) { ifa = ifa_ifwithdstaddr(dst); } @@ -542,18 +606,23 @@ ifa_ifwithroute(flags, dst, gateway) if (ifa == 0) ifa = ifa_ifwithnet(gateway); if (ifa == 0) { - struct rtentry *rt = rtalloc1(dst, 0, 0UL); + struct rtentry *rt = rtalloc1_locked(dst, 0, 0UL); if (rt == 0) return (0); + ifa = rt->rt_ifa; + if (ifa) + ifaref(ifa); rtunref(rt); - if ((ifa = rt->rt_ifa) == 0) - return (0); + if (ifa == 0) + return 0; } if (ifa->ifa_addr->sa_family != dst->sa_family) { - struct ifaddr *oifa = ifa; - ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); - if (ifa == 0) - ifa = oifa; + struct ifaddr *newifa; + newifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); + if (newifa != 0) { + ifafree(ifa); + ifa = newifa; + } } return (ifa); } @@ -572,20 +641,23 @@ struct rtfc_arg { * Do appropriate manipulations of a routing tree given * all the bits of info needed */ -int -rtrequest(req, dst, gateway, netmask, flags, ret_nrt) - int req, flags; - struct sockaddr *dst, *gateway, *netmask; - struct rtentry **ret_nrt; +rtrequest_locked( + int req, + struct sockaddr *dst, + struct sockaddr *gateway, + struct sockaddr *netmask, + int flags, + struct rtentry **ret_nrt) { - int s = splnet(); int error = 0; + int error = 0; register struct rtentry *rt; register struct radix_node *rn; register struct radix_node_head *rnh; - struct ifaddr *ifa; + struct ifaddr *ifa = NULL; struct sockaddr *ndst; #define senderr(x) { error = x ; goto bad; } + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); /* * Find the correct routing tree to use for this Address Family */ @@ -626,7 +698,7 @@ rtrequest(req, dst, gateway, netmask, flags, ret_nrt) */ if (rt->rt_gwroute) { rt = rt->rt_gwroute; - rtfree(rt); + rtfree_locked(rt); (rt = (struct rtentry *)rn)->rt_gwroute = 0; } @@ -644,6 +716,7 @@ rtrequest(req, dst, gateway, netmask, flags, ret_nrt) */ if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) ifa->ifa_rtrequest(RTM_DELETE, rt, SA(0)); + ifa 
= NULL; /* * one more rtentry floating around that is not @@ -660,7 +733,7 @@ rtrequest(req, dst, gateway, netmask, flags, ret_nrt) *ret_nrt = rt; else if (rt->rt_refcnt <= 0) { rt->rt_refcnt++; /* make a 1->0 transition */ - rtfree(rt); + rtfree_locked(rt); } break; @@ -668,6 +741,7 @@ rtrequest(req, dst, gateway, netmask, flags, ret_nrt) if (ret_nrt == 0 || (rt = *ret_nrt) == 0) senderr(EINVAL); ifa = rt->rt_ifa; + ifaref(ifa); flags = rt->rt_flags & ~(RTF_CLONING | RTF_PRCLONING | RTF_STATIC); flags |= RTF_WASCLONED; @@ -694,7 +768,7 @@ rtrequest(req, dst, gateway, netmask, flags, ret_nrt) * also add the rt_gwroute if possible. */ if ((error = rt_setgate(rt, dst, gateway)) != 0) { - Free(rt); + R_Free(rt); senderr(error); } @@ -716,13 +790,8 @@ rtrequest(req, dst, gateway, netmask, flags, ret_nrt) * This moved from below so that rnh->rnh_addaddr() can * examine the ifa and ifa->ifa_ifp if it so desires. */ - /* - * Note that we do not use rtsetifa here because - * rt_parent has not been setup yet. - */ - ifaref(ifa); - rt->rt_ifa = ifa; - rt->rt_ifp = ifa->ifa_ifp; + rtsetifa(rt, ifa); + rt->rt_ifp = rt->rt_ifa->ifa_ifp; /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ @@ -737,19 +806,19 @@ rtrequest(req, dst, gateway, netmask, flags, ret_nrt) * mechanism, then we just blow it away and retry * the insertion of the new one. */ - rt2 = rtalloc1(dst, 0, RTF_PRCLONING); + rt2 = rtalloc1_locked(dst, 0, RTF_PRCLONING); if (rt2 && rt2->rt_parent) { - rtrequest(RTM_DELETE, + rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt2), rt2->rt_gateway, rt_mask(rt2), rt2->rt_flags, 0); - rtfree(rt2); + rtfree_locked(rt2); rn = rnh->rnh_addaddr((caddr_t)ndst, (caddr_t)netmask, rnh, rt->rt_nodes); } else if (rt2) { /* undo the extra ref we got */ - rtfree(rt2); + rtfree_locked(rt2); } } @@ -759,12 +828,12 @@ rtrequest(req, dst, gateway, netmask, flags, ret_nrt) */ if (rn == 0) { if (rt->rt_gwroute) - rtfree(rt->rt_gwroute); + rtfree_locked(rt->rt_gwroute); if (rt->rt_ifa) { ifafree(rt->rt_ifa); } - Free(rt_key(rt)); - Free(rt); + R_Free(rt_key(rt)); + R_Free(rt); senderr(EEXIST); } @@ -780,13 +849,6 @@ rtrequest(req, dst, gateway, netmask, flags, ret_nrt) if ((*ret_nrt)->rt_flags & (RTF_CLONING | RTF_PRCLONING)) { rt->rt_parent = (*ret_nrt); rtref(*ret_nrt); - - /* - * If our parent is holding a reference to the same ifa, - * free our reference and rely on the parent holding it. - */ - if (rt->rt_parent && rt->rt_parent->rt_ifa == rt->rt_ifa) - ifafree(rt->rt_ifa); } } @@ -796,6 +858,8 @@ rtrequest(req, dst, gateway, netmask, flags, ret_nrt) */ if (ifa->ifa_rtrequest) ifa->ifa_rtrequest(req, rt, SA(ret_nrt ? *ret_nrt : 0)); + ifafree(ifa); + ifa = 0; /* * We repeat the same procedure from rt_setgate() here because @@ -821,10 +885,27 @@ rtrequest(req, dst, gateway, netmask, flags, ret_nrt) break; } bad: - splx(s); + if (ifa) + ifafree(ifa); return (error); } +int +rtrequest( + int req, + struct sockaddr *dst, + struct sockaddr *gateway, + struct sockaddr *netmask, + int flags, + struct rtentry **ret_nrt) +{ + int error; + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(rt_mtx); + error = rtrequest_locked(req, dst, gateway, netmask, flags, ret_nrt); + lck_mtx_unlock(rt_mtx); + return (error); +} /* * Called from rtrequest(RTM_DELETE, ...) to fix up the route's ``family'' * (i.e., the routes related to it by the operation of cloning). 
This @@ -840,8 +921,10 @@ rt_fixdelete(rn, vp) struct rtentry *rt = (struct rtentry *)rn; struct rtentry *rt0 = vp; + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); + if (rt->rt_parent == rt0 && !(rt->rt_flags & RTF_PINNED)) { - return rtrequest(RTM_DELETE, rt_key(rt), + return rtrequest_locked(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, rt_mask(rt), rt->rt_flags, (struct rtentry **)0); } @@ -882,6 +965,8 @@ rt_fixchange(rn, vp) printf("rt_fixchange: rt %p, rt0 %p\n", rt, rt0); #endif + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); + if (!rt->rt_parent || (rt->rt_flags & RTF_PINNED)) { #ifdef DEBUG if(rtfcdebug) printf("no parent or pinned\n"); @@ -893,7 +978,7 @@ rt_fixchange(rn, vp) #ifdef DEBUG if(rtfcdebug) printf("parent match\n"); #endif - return rtrequest(RTM_DELETE, rt_key(rt), + return rtrequest_locked(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, rt_mask(rt), rt->rt_flags, (struct rtentry **)0); } @@ -947,7 +1032,7 @@ rt_fixchange(rn, vp) #ifdef DEBUG if(rtfcdebug) printf("deleting\n"); #endif - return rtrequest(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, + return rtrequest_locked(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, rt_mask(rt), rt->rt_flags, (struct rtentry **)0); } @@ -966,6 +1051,9 @@ rt_setgate(rt0, dst, gate) * will interfere with keeping LLINFO in the routing * table, so disallow it. */ + + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); + if (((rt0->rt_flags & (RTF_HOST|RTF_GATEWAY|RTF_LLINFO)) == (RTF_HOST|RTF_GATEWAY)) && (dst->sa_len == gate->sa_len) && @@ -975,7 +1063,7 @@ rt_setgate(rt0, dst, gate) * or a routing redirect, so try to delete it. */ if (rt_key(rt0)) - rtrequest(RTM_DELETE, (struct sockaddr *)rt_key(rt0), + rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt0), rt0->rt_gateway, rt_mask(rt0), rt0->rt_flags, 0); return EADDRNOTAVAIL; } @@ -1011,7 +1099,7 @@ rt_setgate(rt0, dst, gate) */ if (old) { Bcopy(dst, new, dlen); - Free(old); + R_Free(old); } /* @@ -1019,7 +1107,7 @@ rt_setgate(rt0, dst, gate) * so drop it. */ if (rt->rt_gwroute) { - rt = rt->rt_gwroute; rtfree(rt); + rt = rt->rt_gwroute; rtfree_locked(rt); rt = rt0; rt->rt_gwroute = 0; } /* @@ -1033,9 +1121,9 @@ rt_setgate(rt0, dst, gate) * This is obviously mandatory when we get rt->rt_output(). */ if (rt->rt_flags & RTF_GATEWAY) { - rt->rt_gwroute = rtalloc1(gate, 1, RTF_PRCLONING); + rt->rt_gwroute = rtalloc1_locked(gate, 1, RTF_PRCLONING); if (rt->rt_gwroute == rt) { - rtfree(rt->rt_gwroute); + rtfree_locked(rt->rt_gwroute); rt->rt_gwroute = 0; return EDQUOT; /* failure */ } @@ -1091,6 +1179,19 @@ int rtinit(ifa, cmd, flags) register struct ifaddr *ifa; int cmd, flags; +{ + int error; + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(rt_mtx); + error = rtinit_locked(ifa, cmd, flags); + lck_mtx_unlock(rt_mtx); + return (error); +} + +int +rtinit_locked(ifa, cmd, flags) + register struct ifaddr *ifa; + int cmd, flags; { register struct rtentry *rt; register struct sockaddr *dst; @@ -1113,8 +1214,9 @@ rtinit(ifa, cmd, flags) */ if ((flags & RTF_HOST) == 0 && ifa->ifa_netmask) { m = m_get(M_DONTWAIT, MT_SONAME); - if (m == NULL) + if (m == NULL) { return(ENOBUFS); + } deldst = mtod(m, struct sockaddr *); rt_maskedcopy(dst, deldst, ifa->ifa_netmask); dst = deldst; @@ -1125,7 +1227,7 @@ rtinit(ifa, cmd, flags) * We set "report" to FALSE so that if it doesn't exist, * it doesn't report an error or clone a route, etc. etc. */ - rt = rtalloc1(dst, 0, 0UL); + rt = rtalloc1_locked(dst, 0, 0UL); if (rt) { /* * Ok so we found the rtentry. 
it has an extra reference @@ -1155,6 +1257,7 @@ rtinit(ifa, cmd, flags) * it doesn't exist, we could just return at this point * with an "ELSE" clause, but apparently not.. */ + lck_mtx_unlock(rt_mtx); return (flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); } @@ -1163,7 +1266,7 @@ rtinit(ifa, cmd, flags) /* * Do the actual request */ - error = rtrequest(cmd, dst, ifa->ifa_addr, ifa->ifa_netmask, + error = rtrequest_locked(cmd, dst, ifa->ifa_addr, ifa->ifa_netmask, flags | ifa->ifa_flags, &nrt); if (m) (void) m_free(m); @@ -1180,7 +1283,7 @@ rtinit(ifa, cmd, flags) route_generation++; if (rt->rt_refcnt <= 0) { rt->rt_refcnt++; /* need a 1->0 transition to free */ - rtfree(rt); + rtfree_locked(rt); } } @@ -1232,6 +1335,6 @@ rtinit(ifa, cmd, flags) rt_newaddrmsg(cmd, ifa, error, nrt); if (use_routegenid) route_generation++; - } + } return (error); } diff --git a/bsd/net/route.h b/bsd/net/route.h index 85f010666..5d1ab8ab2 100644 --- a/bsd/net/route.h +++ b/bsd/net/route.h @@ -58,6 +58,8 @@ #ifndef _NET_ROUTE_H_ #define _NET_ROUTE_H_ #include <sys/appleapiopts.h> +#include <sys/types.h> +#include <sys/socket.h> /* * Kernel resident routing tables. @@ -71,7 +73,8 @@ * to a routing entry. These are often held by protocols * in their control blocks, e.g. inpcb. */ -#if !defined(KERNEL) || defined(__APPLE_API_PRIVATE) +#ifdef PRIVATE +struct rtentry; struct route { struct rtentry *ro_rt; struct sockaddr ro_dst; @@ -79,7 +82,7 @@ struct route { }; #else struct route; -#endif +#endif /* PRIVATE */ /* * These numbers are used by reliable protocols for determining @@ -89,7 +92,7 @@ struct rt_metrics { u_long rmx_locks; /* Kernel must leave these values alone */ u_long rmx_mtu; /* MTU for this path */ u_long rmx_hopcount; /* max hops expected */ - u_long rmx_expire; /* lifetime for route, e.g. redirect */ + int32_t rmx_expire; /* lifetime for route, e.g. redirect */ u_long rmx_recvpipe; /* inbound delay-bandwidth product */ u_long rmx_sendpipe; /* outbound delay-bandwidth product */ u_long rmx_ssthresh; /* outbound gateway buffer limit */ @@ -110,7 +113,6 @@ struct rt_metrics { /* * XXX kernel function pointer `rt_output' is visible to applications. */ -struct mbuf; /* * We distinguish between routes to hosts and routes to networks, @@ -120,10 +122,10 @@ struct mbuf; * gateways are marked so that the output routines know to address the * gateway rather than the ultimate destination. */ +#ifdef PRIVATE #ifndef RNF_NORMAL #include <net/radix.h> #endif -#ifdef __APPLE_API_UNSTABLE struct rtentry { struct radix_node rt_nodes[2]; /* tree glue, and other values */ #define rt_key(r) ((struct sockaddr *)((r)->rt_nodes->rn_key)) @@ -138,14 +140,15 @@ struct rtentry { caddr_t rt_llinfo; /* pointer to link level info cache */ struct rt_metrics rt_rmx; /* metrics used by rx'ing protocols */ struct rtentry *rt_gwroute; /* implied entry for gatewayed routes */ - int (*rt_output) __P((struct ifnet *, struct mbuf *, - struct sockaddr *, struct rtentry *)); + int (*rt_output)(struct ifnet *, struct mbuf *, + struct sockaddr *, struct rtentry *); /* output routine for this (rt,if) */ struct rtentry *rt_parent; /* cloning parent of this route */ u_long generation_id; /* route generation id */ }; -#endif /* __APPLE_API_UNSTABLE */ +#endif /* PRIVATE */ +#ifdef __APPLE_API_OBSOLETE /* * Following structure necessary for 4.3 compatibility; * We should eventually move it to a compat file. 
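The route.c hunks above apply one locking convention throughout: each exported entry point asserts that the global rt_mtx is not already held, takes it, and delegates to a _locked worker that asserts ownership (rtalloc1/rtalloc1_locked, rtfree/rtfree_locked, rtrequest/rtrequest_locked, rtinit/rtinit_locked). A minimal sketch of that convention follows; rt_mtx and the lck_mtx calls are taken from the patch, while example/example_locked and the placeholder body are illustrative only, not xnu code.

#include <kern/lock.h>

extern lck_mtx_t *rt_mtx;	/* global routing-table mutex from route.c */

static int
example_locked(void)
{
	/* _locked workers require the caller to already hold rt_mtx. */
	lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED);
	/* ... manipulate rt_tables[] here ... */
	return (0);
}

int
example(void)
{
	int error;

	/* rt_mtx is not recursive; acquiring it twice would deadlock. */
	lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(rt_mtx);
	error = example_locked();
	lck_mtx_unlock(rt_mtx);
	return (error);
}

The split lets in-kernel callers that already hold rt_mtx (for example, rtredirect calling rtalloc1_locked) avoid recursive acquisition, while external callers keep the old single-call interface.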
@@ -159,8 +162,11 @@ struct ortentry { u_long rt_use; /* raw # packets forwarded */ struct ifnet *rt_ifp; /* the answer: interface to use */ }; +#endif /* __APPLE_API_OBSOLETE */ +#ifdef PRIVATE #define rt_use rt_rmx.rmx_pksent +#endif /* PRIVATE */ #define RTF_UP 0x1 /* route usable */ #define RTF_GATEWAY 0x2 /* destination is a gateway */ @@ -181,7 +187,7 @@ struct ortentry { #define RTF_PRCLONING 0x10000 /* protocol requires cloning */ #define RTF_WASCLONED 0x20000 /* route generated through cloning */ #define RTF_PROTO3 0x40000 /* protocol specific routing flag */ -/* 0x80000 unused */ + /* 0x80000 unused */ #define RTF_PINNED 0x100000 /* future use */ #define RTF_LOCAL 0x200000 /* route represents a local address */ #define RTF_BROADCAST 0x400000 /* route represents a bcast address */ @@ -198,6 +204,7 @@ struct rtstat { short rts_unreach; /* lookups which failed */ short rts_wildcard; /* lookups satisfied by a wildcard */ }; + /* * Structures for routing messages. */ @@ -208,14 +215,30 @@ struct rt_msghdr { u_short rtm_index; /* index for associated ifp */ int rtm_flags; /* flags, incl. kern & message, e.g. DONE */ int rtm_addrs; /* bitmask identifying sockaddrs in msg */ - pid_t rtm_pid; /* identify sender */ - int rtm_seq; /* for sender to identify action */ - int rtm_errno; /* why failed */ + pid_t rtm_pid; /* identify sender */ + int rtm_seq; /* for sender to identify action */ + int rtm_errno; /* why failed */ int rtm_use; /* from rtentry */ u_long rtm_inits; /* which metrics we are initializing */ struct rt_metrics rtm_rmx; /* metrics themselves */ }; +struct rt_msghdr2 { + u_short rtm_msglen; /* to skip over non-understood messages */ + u_char rtm_version; /* future binary compatibility */ + u_char rtm_type; /* message type */ + u_short rtm_index; /* index for associated ifp */ + int rtm_flags; /* flags, incl. kern & message, e.g. DONE */ + int rtm_addrs; /* bitmask identifying sockaddrs in msg */ + int32_t rtm_refcnt; /* reference count */ + int rtm_parentflags; /* flags of the parent route */ + int rtm_reserved; /* reserved field set to 0 */ + int rtm_use; /* from rtentry */ + u_long rtm_inits; /* which metrics we are initializing */ + struct rt_metrics rtm_rmx; /* metrics themselves */ +}; + + #define RTM_VERSION 5 /* Up the ante and ignore older versions */ /* @@ -237,9 +260,12 @@ struct rt_msghdr { #define RTM_IFINFO 0xe /* iface going up/down etc. */ #define RTM_NEWMADDR 0xf /* mcast group membership being added to if */ #define RTM_DELMADDR 0x10 /* mcast group membership being deleted */ -#ifdef KERNEL_PRIVATE +#ifdef PRIVATE #define RTM_GET_SILENT 0x11 -#endif +#endif PRIVATE +#define RTM_IFINFO2 0x12 /* */ +#define RTM_NEWMADDR2 0x13 /* */ +#define RTM_GET2 0x14 /* */ /* * Bitmask values for rtm_inits and rmx_locks. 
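The struct rt_msghdr2 layout above pairs with the RTM_GET2 and NET_RT_DUMP2 handling added to rtsock.c later in this patch, exposing per-route reference counts and parent flags to userland. A hedged userland sketch of the conventional two-pass sysctl walk, not part of the patch itself (the function name dump_routes2 is illustrative; the mib layout follows standard BSD routing-sysctl usage):

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <net/route.h>

int
dump_routes2(void)
{
	/* net.route.0.<af>.NET_RT_DUMP2.0; af 0 requests all families */
	int mib[6] = { CTL_NET, PF_ROUTE, 0, 0, NET_RT_DUMP2, 0 };
	size_t len;
	char *buf, *next, *lim;

	/* First call reports the needed size, second call fills the buffer. */
	if (sysctl(mib, 6, NULL, &len, NULL, 0) < 0)
		return (-1);
	if ((buf = malloc(len)) == NULL)
		return (-1);
	if (sysctl(mib, 6, buf, &len, NULL, 0) < 0) {
		free(buf);
		return (-1);
	}
	lim = buf + len;
	for (next = buf; next < lim;
	    next += ((struct rt_msghdr2 *)next)->rtm_msglen) {
		struct rt_msghdr2 *rtm = (struct rt_msghdr2 *)next;

		if (rtm->rtm_version != RTM_VERSION || rtm->rtm_msglen == 0)
			break;	/* malformed record; stop rather than loop */
		printf("flags=0x%x refcnt=%d parentflags=0x%x\n",
		    (unsigned)rtm->rtm_flags, (int)rtm->rtm_refcnt,
		    (unsigned)rtm->rtm_parentflags);
	}
	free(buf);
	return (0);
}

Each record is followed by the sockaddrs named in rtm_addrs, exactly as with the original rt_msghdr stream, so existing RTM parsing code carries over unchanged.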
@@ -292,53 +318,45 @@ struct route_cb { int any_count; }; -#ifdef KERNEL -#ifndef __APPLE__ -#define RTFREE(rt) \ - do { \ - if ((rt)->rt_refcnt <= 1) \ - rtfree(rt); \ - else \ - (rt)->rt_refcnt--; \ - } while (0) -#else +#ifdef KERNEL_PRIVATE #define RTFREE(rt) rtfree(rt) -#endif - -#ifdef __APPLE_API_PRIVATE extern struct route_cb route_cb; extern struct radix_node_head *rt_tables[AF_MAX+1]; struct ifmultiaddr; struct proc; -void route_init __P((void)); -void rt_ifmsg __P((struct ifnet *)); -void rt_missmsg __P((int, struct rt_addrinfo *, int, int)); -void rt_newaddrmsg __P((int, struct ifaddr *, int, struct rtentry *)); -void rt_newmaddrmsg __P((int, struct ifmultiaddr *)); -int rt_setgate __P((struct rtentry *, - struct sockaddr *, struct sockaddr *)); -void rtalloc __P((struct route *)); -void rtalloc_ign __P((struct route *, u_long)); +void route_init(void); +void rt_ifmsg(struct ifnet *); +void rt_missmsg(int, struct rt_addrinfo *, int, int); +void rt_newaddrmsg(int, struct ifaddr *, int, struct rtentry *); +void rt_newmaddrmsg(int, struct ifmultiaddr *); +int rt_setgate(struct rtentry *, struct sockaddr *, struct sockaddr *); +void rtalloc(struct route *); +void rtalloc_ign(struct route *, u_long); +struct rtentry * + rtalloc1(struct sockaddr *, int, u_long); struct rtentry * - rtalloc1 __P((struct sockaddr *, int, u_long)); -void rtfree __P((struct rtentry *)); -void rtref __P((struct rtentry *)); + rtalloc1_locked(const struct sockaddr *, int, u_long); +void rtfree(struct rtentry *); +void rtfree_locked(struct rtentry *); +void rtref(struct rtentry *); /* * rtunref will decrement the refcount, rtfree will decrement and free if * the refcount has reached zero and the route is not up. * Unless you have good reason to do otherwise, use rtfree. 
*/ -void rtunref __P((struct rtentry *)); -void rtsetifa __P((struct rtentry *, struct ifaddr *)); -int rtinit __P((struct ifaddr *, int, int)); -int rtioctl __P((int, caddr_t, struct proc *)); -void rtredirect __P((struct sockaddr *, struct sockaddr *, - struct sockaddr *, int, struct sockaddr *, struct rtentry **)); -int rtrequest __P((int, struct sockaddr *, - struct sockaddr *, struct sockaddr *, int, struct rtentry **)); -#endif /* __APPLE_API_PRIVATE */ -#endif +void rtunref(struct rtentry *); +void rtsetifa(struct rtentry *, struct ifaddr *); +int rtinit(struct ifaddr *, int, int); +int rtinit_locked(struct ifaddr *, int, int); +int rtioctl(int, caddr_t, struct proc *); +void rtredirect(struct sockaddr *, struct sockaddr *, + struct sockaddr *, int, struct sockaddr *, struct rtentry **); +int rtrequest(int, struct sockaddr *, + struct sockaddr *, struct sockaddr *, int, struct rtentry **); +int rtrequest_locked(int, struct sockaddr *, + struct sockaddr *, struct sockaddr *, int, struct rtentry **); +#endif KERNEL_PRIVATE #endif diff --git a/bsd/net/rtsock.c b/bsd/net/rtsock.c index 8a9742722..54abd75f0 100644 --- a/bsd/net/rtsock.c +++ b/bsd/net/rtsock.c @@ -67,18 +67,28 @@ #include <sys/domain.h> #include <sys/protosw.h> #include <sys/syslog.h> +#include <kern/lock.h> #include <net/if.h> #include <net/route.h> #include <net/raw_cb.h> #include <netinet/in.h> +#include <machine/spl.h> + +extern void m_copydata(struct mbuf *, int, int, caddr_t); +extern void m_copyback(struct mbuf *, int, int, caddr_t); + +extern struct rtstat rtstat; +extern int rttrash; + MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); -static struct sockaddr route_dst = { 2, PF_ROUTE, }; -static struct sockaddr route_src = { 2, PF_ROUTE, }; -static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, }; -static struct sockproto route_proto = { PF_ROUTE, }; +extern lck_mtx_t *rt_mtx; +static struct sockaddr route_dst = { 2, PF_ROUTE, { 0, } }; +static struct sockaddr route_src = { 2, PF_ROUTE, { 0, } }; +static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, { 0, } }; +static struct sockproto route_proto = { PF_ROUTE, 0 }; struct walkarg { int w_tmemsize; @@ -88,16 +98,16 @@ struct walkarg { }; static struct mbuf * - rt_msg1 __P((int, struct rt_addrinfo *)); -static int rt_msg2 __P((int, - struct rt_addrinfo *, caddr_t, struct walkarg *)); -static int rt_xaddrs __P((caddr_t, caddr_t, struct rt_addrinfo *)); -static int sysctl_dumpentry __P((struct radix_node *rn, void *vw)); -static int sysctl_iflist __P((int af, struct walkarg *w)); -static int route_output __P((struct mbuf *, struct socket *)); -static void rt_setmetrics __P((u_long, struct rt_metrics *, struct rt_metrics *)); -static void rt_setif __P((struct rtentry *, struct sockaddr *, struct sockaddr *, - struct sockaddr *)); + rt_msg1(int, struct rt_addrinfo *); +static int rt_msg2(int, struct rt_addrinfo *, caddr_t, struct walkarg *); +static int rt_xaddrs(caddr_t, caddr_t, struct rt_addrinfo *); +static int sysctl_dumpentry(struct radix_node *rn, void *vw); +static int sysctl_iflist(int af, struct walkarg *w); +static int sysctl_iflist2(int af, struct walkarg *w); +static int route_output(struct mbuf *, struct socket *); +static void rt_setmetrics(u_long, struct rt_metrics *, struct rt_metrics *); +static void rt_setif(struct rtentry *, struct sockaddr *, struct sockaddr *, + struct sockaddr *); /* Sleazy use of local variables throughout file, warning!!!! 
*/ #define dst info.rti_info[RTAX_DST] @@ -115,20 +125,19 @@ static void rt_setif __P((struct rtentry *, struct sockaddr *, struct sockaddr * static int rts_abort(struct socket *so) { - int s, error; - s = splnet(); + int error; + error = raw_usrreqs.pru_abort(so); - splx(s); return error; } /* pru_accept is EOPNOTSUPP */ static int -rts_attach(struct socket *so, int proto, struct proc *p) +rts_attach(struct socket *so, int proto, __unused struct proc *p) { struct rawcb *rp; - int s, error; + int error; if (sotorawcb(so) != 0) return EISCONN; /* XXX panic? */ @@ -144,17 +153,18 @@ rts_attach(struct socket *so, int proto, struct proc *p) * Probably we should try to do more of this work beforehand and * eliminate the spl. */ - s = splnet(); so->so_pcb = (caddr_t)rp; error = raw_attach(so, proto); /* don't use raw_usrreqs.pru_attach, it checks for SS_PRIV */ rp = sotorawcb(so); if (error) { - splx(s); FREE(rp, M_PCB); so->so_pcb = 0; + so->so_flags |= SOF_PCBCLEARING; return error; } + socket_lock(so, 1); switch(rp->rcb_proto.sp_protocol) { +//####LD route_cb needs looking case AF_INET: route_cb.ip_count++; break; @@ -172,7 +182,7 @@ rts_attach(struct socket *so, int proto, struct proc *p) route_cb.any_count++; soisconnected(so); so->so_options |= SO_USELOOPBACK; - splx(s); + socket_unlock(so, 1); return 0; } @@ -287,36 +297,41 @@ rts_sockaddr(struct socket *so, struct sockaddr **nam) } static struct pr_usrreqs route_usrreqs = { - rts_abort, pru_accept_notsupp, rts_attach, rts_bind, rts_connect, - pru_connect2_notsupp, pru_control_notsupp, rts_detach, rts_disconnect, - pru_listen_notsupp, rts_peeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, - rts_send, pru_sense_null, rts_shutdown, rts_sockaddr, - sosend, soreceive, sopoll + rts_abort, pru_accept_notsupp, rts_attach, rts_bind, + rts_connect, pru_connect2_notsupp, pru_control_notsupp, + rts_detach, rts_disconnect, pru_listen_notsupp, rts_peeraddr, + pru_rcvd_notsupp, pru_rcvoob_notsupp, rts_send, pru_sense_null, + rts_shutdown, rts_sockaddr, sosend, soreceive, pru_sopoll_notsupp }; /*ARGSUSED*/ static int route_output(m, so) - register struct mbuf *m; + struct mbuf *m; struct socket *so; { - register struct rt_msghdr *rtm = 0; - register struct rtentry *rt = 0; + struct rt_msghdr *rtm = 0; + struct rtentry *rt = 0; struct rtentry *saved_nrt = 0; struct radix_node_head *rnh; struct rt_addrinfo info; int len, error = 0; struct ifnet *ifp = 0; - struct ifaddr *ifa = 0; +#ifndef __APPLE__ struct proc *curproc = current_proc(); +#endif int sendonlytoself = 0; #define senderr(e) { error = e; goto flush;} - if (m == 0 || ((m->m_len < sizeof(long)) && - (m = m_pullup(m, sizeof(long))) == 0)) + if (m == 0 || ((m->m_len < sizeof(long)) && (m = m_pullup(m, sizeof(long))) == 0)) return (ENOBUFS); if ((m->m_flags & M_PKTHDR) == 0) panic("route_output"); + + /* unlock the socket (but keep a reference) it won't be accessed until raw_input appends to it. 
*/ + socket_unlock(so, 0); + lck_mtx_lock(rt_mtx); + len = m->m_pkthdr.len; if (len < sizeof(*rtm) || len != mtod(m, struct rt_msghdr *)->rtm_msglen) { @@ -353,15 +368,17 @@ route_output(m, so) dst = 0; senderr(EPERM); } - rtm->rtm_pid = curproc->p_pid; + + rtm->rtm_pid = proc_selfpid(); info.rti_addrs = rtm->rtm_addrs; if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) { dst = 0; senderr(EINVAL); } if (dst == 0 || (dst->sa_family >= AF_MAX) - || (gate != 0 && (gate->sa_family >= AF_MAX))) + || (gate != 0 && (gate->sa_family >= AF_MAX))) { senderr(EINVAL); + } if (genmask) { struct radix_node *t; t = rn_addmask((caddr_t)genmask, 0, 1); @@ -371,10 +388,10 @@ route_output(m, so) senderr(ENOBUFS); } switch (rtm->rtm_type) { - - case RTM_ADD: - if (gate == 0) - senderr(EINVAL); + + case RTM_ADD: + if (gate == 0) + senderr(EINVAL); #ifdef __APPLE__ /* XXX LD11JUL02 Special case for AOL 5.1.2 connectivity issue to AirPort BS (Radar 2969954) @@ -389,169 +406,174 @@ route_output(m, so) * confusing the routing table with a wrong route to the previous default gateway */ { - extern int check_routeselfref; + extern int check_routeselfref; #define satosinaddr(sa) (((struct sockaddr_in *)sa)->sin_addr.s_addr) - - if (check_routeselfref && (dst && dst->sa_family == AF_INET) && - (netmask && satosinaddr(netmask) == INADDR_BROADCAST) && - (gate && satosinaddr(dst) == satosinaddr(gate))) { - log(LOG_WARNING, "route_output: circular route %ld.%ld.%ld.%ld/32 ignored\n", - (ntohl(satosinaddr(gate)>>24))&0xff, - (ntohl(satosinaddr(gate)>>16))&0xff, - (ntohl(satosinaddr(gate)>>8))&0xff, - (ntohl(satosinaddr(gate)))&0xff); - - senderr(EINVAL); - } + + if (check_routeselfref && (dst && dst->sa_family == AF_INET) && + (netmask && satosinaddr(netmask) == INADDR_BROADCAST) && + (gate && satosinaddr(dst) == satosinaddr(gate))) { + log(LOG_WARNING, "route_output: circular route %ld.%ld.%ld.%ld/32 ignored\n", + (ntohl(satosinaddr(gate)>>24))&0xff, + (ntohl(satosinaddr(gate)>>16))&0xff, + (ntohl(satosinaddr(gate)>>8))&0xff, + (ntohl(satosinaddr(gate)))&0xff); + + senderr(EINVAL); + } } #endif - error = rtrequest(RTM_ADD, dst, gate, netmask, - rtm->rtm_flags, &saved_nrt); - if (error == 0 && saved_nrt) { + error = rtrequest_locked(RTM_ADD, dst, gate, netmask, + rtm->rtm_flags, &saved_nrt); + if (error == 0 && saved_nrt) { #ifdef __APPLE__ - /* - * If the route request specified an interface with - * IFA and/or IFP, we set the requested interface on - * the route with rt_setif. It would be much better - * to do this inside rtrequest, but that would - * require passing the desired interface, in some - * form, to rtrequest. Since rtrequest is called in - * so many places (roughly 40 in our source), adding - * a parameter is to much for us to swallow; this is - * something for the FreeBSD developers to tackle. - * Instead, we let rtrequest compute whatever - * interface it wants, then come in behind it and - * stick in the interface that we really want. This - * works reasonably well except when rtrequest can't - * figure out what interface to use (with - * ifa_withroute) and returns ENETUNREACH. Ideally - * it shouldn't matter if rtrequest can't figure out - * the interface if we're going to explicitly set it - * ourselves anyway. But practically we can't - * recover here because rtrequest will not do any of - * the work necessary to add the route if it can't - * find an interface. 
As long as there is a default - * route that leads to some interface, rtrequest will - * find an interface, so this problem should be - * rarely encountered. - * dwiggins@bbn.com - */ - - rt_setif(saved_nrt, ifpaddr, ifaaddr, gate); + /* + * If the route request specified an interface with + * IFA and/or IFP, we set the requested interface on + * the route with rt_setif. It would be much better + * to do this inside rtrequest, but that would + * require passing the desired interface, in some + * form, to rtrequest. Since rtrequest is called in + * so many places (roughly 40 in our source), adding + * a parameter is to much for us to swallow; this is + * something for the FreeBSD developers to tackle. + * Instead, we let rtrequest compute whatever + * interface it wants, then come in behind it and + * stick in the interface that we really want. This + * works reasonably well except when rtrequest can't + * figure out what interface to use (with + * ifa_withroute) and returns ENETUNREACH. Ideally + * it shouldn't matter if rtrequest can't figure out + * the interface if we're going to explicitly set it + * ourselves anyway. But practically we can't + * recover here because rtrequest will not do any of + * the work necessary to add the route if it can't + * find an interface. As long as there is a default + * route that leads to some interface, rtrequest will + * find an interface, so this problem should be + * rarely encountered. + * dwiggins@bbn.com + */ + + rt_setif(saved_nrt, ifpaddr, ifaaddr, gate); #endif - rt_setmetrics(rtm->rtm_inits, - &rtm->rtm_rmx, &saved_nrt->rt_rmx); - saved_nrt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits); - saved_nrt->rt_rmx.rmx_locks |= - (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); - rtunref(saved_nrt); - saved_nrt->rt_genmask = genmask; - } - break; - - case RTM_DELETE: - error = rtrequest(RTM_DELETE, dst, gate, netmask, - rtm->rtm_flags, &saved_nrt); - if (error == 0) { - if ((rt = saved_nrt)) - rtref(rt); - goto report; - } - break; - - case RTM_GET: - case RTM_CHANGE: - case RTM_LOCK: - if ((rnh = rt_tables[dst->sa_family]) == 0) { - senderr(EAFNOSUPPORT); - } else if ((rt = (struct rtentry *) - rnh->rnh_lookup(dst, netmask, rnh)) != NULL) - rtref(rt); - else - senderr(ESRCH); - switch(rtm->rtm_type) { - - case RTM_GET: - report: - dst = rt_key(rt); - gate = rt->rt_gateway; - netmask = rt_mask(rt); - genmask = rt->rt_genmask; - if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { - ifp = rt->rt_ifp; - if (ifp) { - ifpaddr = ifp->if_addrhead.tqh_first->ifa_addr; - ifaaddr = rt->rt_ifa->ifa_addr; - rtm->rtm_index = ifp->if_index; - } else { - ifpaddr = 0; - ifaaddr = 0; - } + rt_setmetrics(rtm->rtm_inits, + &rtm->rtm_rmx, &saved_nrt->rt_rmx); + saved_nrt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits); + saved_nrt->rt_rmx.rmx_locks |= + (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); + rtunref(saved_nrt); + saved_nrt->rt_genmask = genmask; } - len = rt_msg2(rtm->rtm_type, &info, (caddr_t)0, - (struct walkarg *)0); - if (len > rtm->rtm_msglen) { - struct rt_msghdr *new_rtm; - R_Malloc(new_rtm, struct rt_msghdr *, len); - if (new_rtm == 0) - senderr(ENOBUFS); - Bcopy(rtm, new_rtm, rtm->rtm_msglen); - Free(rtm); rtm = new_rtm; + break; + + case RTM_DELETE: + error = rtrequest_locked(RTM_DELETE, dst, gate, netmask, + rtm->rtm_flags, &saved_nrt); + if (error == 0) { + if ((rt = saved_nrt)) + rtref(rt); + goto report; } - (void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm, - (struct walkarg *)0); - rtm->rtm_flags = rt->rt_flags; - rtm->rtm_rmx = rt->rt_rmx; - rtm->rtm_addrs = info.rti_addrs; 
break; + case RTM_GET: case RTM_CHANGE: - if (gate && (error = rt_setgate(rt, rt_key(rt), gate))) - senderr(error); - - /* - * If they tried to change things but didn't specify - * the required gateway, then just use the old one. - * This can happen if the user tries to change the - * flags on the default route without changing the - * default gateway. Changing flags still doesn't work. - */ - if ((rt->rt_flags & RTF_GATEWAY) && !gate) - gate = rt->rt_gateway; - + case RTM_LOCK: + if ((rnh = rt_tables[dst->sa_family]) == 0) { + senderr(EAFNOSUPPORT); + } else if ((rt = (struct rtentry *) + rnh->rnh_lookup(dst, netmask, rnh)) != NULL) + rtref(rt); + else + senderr(ESRCH); + switch(rtm->rtm_type) { + + case RTM_GET: { + struct ifaddr *ifa2; + report: + dst = rt_key(rt); + gate = rt->rt_gateway; + netmask = rt_mask(rt); + genmask = rt->rt_genmask; + if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { + ifp = rt->rt_ifp; + if (ifp) { + ifnet_lock_shared(ifp); + ifa2 = ifp->if_addrhead.tqh_first; + ifpaddr = ifa2->ifa_addr; + ifnet_lock_done(ifp); + ifaaddr = rt->rt_ifa->ifa_addr; + rtm->rtm_index = ifp->if_index; + } else { + ifpaddr = 0; + ifaaddr = 0; + } + } + len = rt_msg2(rtm->rtm_type, &info, (caddr_t)0, + (struct walkarg *)0); + if (len > rtm->rtm_msglen) { + struct rt_msghdr *new_rtm; + R_Malloc(new_rtm, struct rt_msghdr *, len); + if (new_rtm == 0) { + senderr(ENOBUFS); + } + Bcopy(rtm, new_rtm, rtm->rtm_msglen); + R_Free(rtm); rtm = new_rtm; + } + (void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm, + (struct walkarg *)0); + rtm->rtm_flags = rt->rt_flags; + rtm->rtm_rmx = rt->rt_rmx; + rtm->rtm_addrs = info.rti_addrs; + } + break; + + case RTM_CHANGE: + if (gate && (error = rt_setgate(rt, rt_key(rt), gate))) + senderr(error); + + /* + * If they tried to change things but didn't specify + * the required gateway, then just use the old one. + * This can happen if the user tries to change the + * flags on the default route without changing the + * default gateway. Changing flags still doesn't work. + */ + if ((rt->rt_flags & RTF_GATEWAY) && !gate) + gate = rt->rt_gateway; + #ifdef __APPLE__ - /* - * On Darwin, we call rt_setif which contains the - * equivalent to the code found at this very spot - * in BSD. - */ - rt_setif(rt, ifpaddr, ifaaddr, gate); + /* + * On Darwin, we call rt_setif which contains the + * equivalent to the code found at this very spot + * in BSD. 
+ */ + rt_setif(rt, ifpaddr, ifaaddr, gate); #endif - - rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx, - &rt->rt_rmx); + + rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx, + &rt->rt_rmx); #ifndef __APPLE__ - /* rt_setif, called above does this for us on darwin */ - if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest) - rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, gate); + /* rt_setif, called above does this for us on darwin */ + if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest) + rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, gate); #endif - if (genmask) - rt->rt_genmask = genmask; - /* - * Fall into - */ - case RTM_LOCK: - rt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits); - rt->rt_rmx.rmx_locks |= - (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); + if (genmask) + rt->rt_genmask = genmask; + /* + * Fall into + */ + case RTM_LOCK: + rt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits); + rt->rt_rmx.rmx_locks |= + (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); + break; + } break; - } - break; - - default: - senderr(EOPNOTSUPP); + + default: + senderr(EOPNOTSUPP); } - flush: if (rtm) { if (error) @@ -560,16 +582,18 @@ flush: rtm->rtm_flags |= RTF_DONE; } if (rt) - rtfree(rt); + rtfree_locked(rt); + lck_mtx_unlock(rt_mtx); + socket_lock(so, 0); /* relock the socket now */ { - register struct rawcb *rp = 0; + struct rawcb *rp = 0; /* * Check to see if we don't want our own messages. */ if ((so->so_options & SO_USELOOPBACK) == 0) { if (route_cb.any_count <= 1) { if (rtm) - Free(rtm); + R_Free(rtm); m_freem(m); return (error); } @@ -583,22 +607,25 @@ flush: m = NULL; } else if (m->m_pkthdr.len > rtm->rtm_msglen) m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len); - Free(rtm); + R_Free(rtm); } if (sendonlytoself && m) { - if (sbappendaddr(&so->so_rcv, &route_src, m, (struct mbuf*)0) == 0) { - m_freem(m); - error = ENOBUFS; - } else { + error = 0; + if (sbappendaddr(&so->so_rcv, &route_src, m, (struct mbuf*)0, &error) != 0) { sorwakeup(so); } + if (error) + return error; } else { if (rp) rp->rcb_proto.sp_family = 0; /* Avoid us */ if (dst) route_proto.sp_protocol = dst->sa_family; - if (m) + if (m) { + socket_unlock(so, 0); raw_input(m, &route_proto, &route_src, &route_dst); + socket_lock(so, 0); + } if (rp) rp->rcb_proto.sp_family = PF_ROUTE; } @@ -609,7 +636,7 @@ flush: static void rt_setmetrics(which, in, out) u_long which; - register struct rt_metrics *in, *out; + struct rt_metrics *in, *out; { #define metric(f, e) if (which & (f)) out->e = in->e; metric(RTV_RPIPE, rmx_recvpipe); @@ -627,30 +654,53 @@ rt_setmetrics(which, in, out) * Set route's interface given ifpaddr, ifaaddr, and gateway. */ static void -rt_setif(rt, Ifpaddr, Ifaaddr, Gate) - struct rtentry *rt; - struct sockaddr *Ifpaddr, *Ifaaddr, *Gate; +rt_setif( + struct rtentry *rt, + struct sockaddr *Ifpaddr, + struct sockaddr *Ifaaddr, + struct sockaddr *Gate) { struct ifaddr *ifa = 0; struct ifnet *ifp = 0; + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); + /* new gateway could require new ifaddr, ifp; flags may also be different; ifp may be specified by ll sockaddr when protocol address is ambiguous */ if (Ifpaddr && (ifa = ifa_ifwithnet(Ifpaddr)) && - (ifp = ifa->ifa_ifp) && (Ifaaddr || Gate)) + (ifp = ifa->ifa_ifp) && (Ifaaddr || Gate)) { + ifafree(ifa); ifa = ifaof_ifpforaddr(Ifaaddr ? Ifaaddr : Gate, ifp); - else if (Ifpaddr && (ifp = if_withname(Ifpaddr)) ) { - ifa = Gate ? 
ifaof_ifpforaddr(Gate, ifp) : - TAILQ_FIRST(&ifp->if_addrhead); } - else if ((Ifaaddr && (ifa = ifa_ifwithaddr(Ifaaddr))) || - (Gate && (ifa = ifa_ifwithroute(rt->rt_flags, - rt_key(rt), Gate)))) - ifp = ifa->ifa_ifp; + else + { + if (ifa) { + ifafree(ifa); + ifa = 0; + } + if (Ifpaddr && (ifp = if_withname(Ifpaddr)) ) { + if (Gate) { + ifa = ifaof_ifpforaddr(Gate, ifp); + } + else { + ifnet_lock_shared(ifp); + ifa = TAILQ_FIRST(&ifp->if_addrhead); + ifaref(ifa); + ifnet_lock_done(ifp); + } + } + else if (Ifaaddr && (ifa = ifa_ifwithaddr(Ifaaddr))) { + ifp = ifa->ifa_ifp; + } + else if (Gate && (ifa = ifa_ifwithroute(rt->rt_flags, + rt_key(rt), Gate))) { + ifp = ifa->ifa_ifp; + } + } if (ifa) { - register struct ifaddr *oifa = rt->rt_ifa; + struct ifaddr *oifa = rt->rt_ifa; if (oifa != ifa) { if (oifa && oifa->ifa_rtrequest) oifa->ifa_rtrequest(RTM_DELETE, @@ -660,8 +710,11 @@ rt_setif(rt, Ifpaddr, Ifaaddr, Gate) rt->rt_rmx.rmx_mtu = ifp->if_mtu; if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest) rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, Gate); - } else + } else { + ifafree(ifa); goto call_ifareq; + } + ifafree(ifa); return; } call_ifareq: @@ -683,11 +736,11 @@ rt_setif(rt, Ifpaddr, Ifaaddr, Gate) */ static int rt_xaddrs(cp, cplim, rtinfo) - register caddr_t cp, cplim; - register struct rt_addrinfo *rtinfo; + caddr_t cp, cplim; + struct rt_addrinfo *rtinfo; { - register struct sockaddr *sa; - register int i; + struct sockaddr *sa; + int i; bzero(rtinfo->rti_info, sizeof(rtinfo->rti_info)); for (i = 0; (i < RTAX_MAX) && (cp < cplim); i++) { @@ -721,14 +774,14 @@ rt_xaddrs(cp, cplim, rtinfo) } static struct mbuf * -rt_msg1(type, rtinfo) - int type; - register struct rt_addrinfo *rtinfo; +rt_msg1( + int type, + struct rt_addrinfo *rtinfo) { - register struct rt_msghdr *rtm; - register struct mbuf *m; - register int i; - register struct sockaddr *sa; + struct rt_msghdr *rtm; + struct mbuf *m; + int i; + struct sockaddr *sa; int len, dlen; switch (type) { @@ -787,11 +840,11 @@ rt_msg1(type, rtinfo) static int rt_msg2(type, rtinfo, cp, w) int type; - register struct rt_addrinfo *rtinfo; + struct rt_addrinfo *rtinfo; caddr_t cp; struct walkarg *w; { - register int i; + int i; int len, dlen, second_time = 0; caddr_t cp0; @@ -804,10 +857,27 @@ again: len = sizeof(struct ifa_msghdr); break; + case RTM_DELMADDR: + case RTM_NEWMADDR: + len = sizeof(struct ifma_msghdr); + break; + case RTM_IFINFO: len = sizeof(struct if_msghdr); break; + case RTM_IFINFO2: + len = sizeof(struct if_msghdr2); + break; + + case RTM_NEWMADDR2: + len = sizeof(struct ifma_msghdr2); + break; + + case RTM_GET2: + len = sizeof(struct rt_msghdr2); + break; + default: len = sizeof(struct rt_msghdr); } @@ -815,7 +885,7 @@ again: if (cp0) cp += len; for (i = 0; i < RTAX_MAX; i++) { - register struct sockaddr *sa; + struct sockaddr *sa; if ((sa = rtinfo->rti_info[i]) == 0) continue; @@ -828,7 +898,7 @@ again: len += dlen; } if (cp == 0 && w != NULL && !second_time) { - register struct walkarg *rw = w; + struct walkarg *rw = w; if (rw->w_req) { if (rw->w_tmemsize < len) { @@ -847,7 +917,7 @@ again: } } if (cp) { - register struct rt_msghdr *rtm = (struct rt_msghdr *)cp0; + struct rt_msghdr *rtm = (struct rt_msghdr *)cp0; rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; @@ -858,19 +928,21 @@ again: /* * This routine is called to generate a message from the routing - * socket indicating that a redirect has occured, a routing lookup + * socket indicating that a redirect has occurred, a routing lookup * has failed, or that a protocol has 
detected timeouts to a particular * destination. */ void rt_missmsg(type, rtinfo, flags, error) int type, flags, error; - register struct rt_addrinfo *rtinfo; + struct rt_addrinfo *rtinfo; { - register struct rt_msghdr *rtm; - register struct mbuf *m; + struct rt_msghdr *rtm; + struct mbuf *m; struct sockaddr *sa = rtinfo->rti_info[RTAX_DST]; + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); + if (route_cb.any_count == 0) return; m = rt_msg1(type, rtinfo); @@ -889,10 +961,10 @@ rt_missmsg(type, rtinfo, flags, error) * socket indicating that the status of a network interface has changed. */ void -rt_ifmsg(ifp) - register struct ifnet *ifp; +rt_ifmsg( + struct ifnet *ifp) { - register struct if_msghdr *ifm; + struct if_msghdr *ifm; struct mbuf *m; struct rt_addrinfo info; @@ -905,7 +977,7 @@ rt_ifmsg(ifp) ifm = mtod(m, struct if_msghdr *); ifm->ifm_index = ifp->if_index; ifm->ifm_flags = (u_short)ifp->if_flags; - ifm->ifm_data = ifp->if_data; + if_data_internal_to_if_data(&ifp->if_data, &ifm->ifm_data); ifm->ifm_addrs = 0; route_proto.sp_protocol = 0; raw_input(m, &route_proto, &route_src, &route_dst); @@ -918,12 +990,15 @@ rt_ifmsg(ifp) * socket indicate a request to configure interfaces, then it will * be unnecessary as the routing socket will automatically generate * copies of it. + * + * Since this is coming from the interface, it is expected that the + * interface will be locked. */ void rt_newaddrmsg(cmd, ifa, error, rt) int cmd, error; - register struct ifaddr *ifa; - register struct rtentry *rt; + struct ifaddr *ifa; + struct rtentry *rt; { struct rt_addrinfo info; struct sockaddr *sa = 0; @@ -937,7 +1012,7 @@ rt_newaddrmsg(cmd, ifa, error, rt) bzero((caddr_t)&info, sizeof(info)); if ((cmd == RTM_ADD && pass == 1) || (cmd == RTM_DELETE && pass == 2)) { - register struct ifa_msghdr *ifam; + struct ifa_msghdr *ifam; int ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR; ifaaddr = sa = ifa->ifa_addr; @@ -954,7 +1029,7 @@ rt_newaddrmsg(cmd, ifa, error, rt) } if ((cmd == RTM_ADD && pass == 2) || (cmd == RTM_DELETE && pass == 1)) { - register struct rt_msghdr *rtm; + struct rt_msghdr *rtm; if (rt == 0) continue; @@ -1002,11 +1077,11 @@ rt_newmaddrmsg(cmd, ifma) * If a link-layer address is present, present it as a ``gateway'' * (similarly to how ARP entries, e.g., are presented). */ - gate = ifma->ifma_lladdr; + gate = ifma->ifma_ll->ifma_addr; if ((m = rt_msg1(cmd, &info)) == NULL) return; ifmam = mtod(m, struct ifma_msghdr *); - ifmam->ifmam_index = ifp->if_index; + ifmam->ifmam_index = ifp ? 
ifp->if_index : 0; ifmam->ifmam_addrs = info.rti_addrs; route_proto.sp_protocol = ifma->ifma_addr->sa_family; raw_input(m, &route_proto, &route_src, &route_dst); @@ -1020,8 +1095,8 @@ sysctl_dumpentry(rn, vw) struct radix_node *rn; void *vw; { - register struct walkarg *w = vw; - register struct rtentry *rt = (struct rtentry *)rn; + struct walkarg *w = vw; + struct rtentry *rt = (struct rtentry *)rn; int error = 0, size; struct rt_addrinfo info; @@ -1032,51 +1107,81 @@ sysctl_dumpentry(rn, vw) gate = rt->rt_gateway; netmask = rt_mask(rt); genmask = rt->rt_genmask; - size = rt_msg2(RTM_GET, &info, 0, w); - if (w->w_req && w->w_tmem) { - register struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; - - rtm->rtm_flags = rt->rt_flags; - rtm->rtm_use = rt->rt_use; - rtm->rtm_rmx = rt->rt_rmx; - rtm->rtm_index = rt->rt_ifp->if_index; - rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0; - rtm->rtm_addrs = info.rti_addrs; - error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); - return (error); + if (w->w_op != NET_RT_DUMP2) { + size = rt_msg2(RTM_GET, &info, 0, w); + if (w->w_req && w->w_tmem) { + struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; + + rtm->rtm_flags = rt->rt_flags; + rtm->rtm_use = rt->rt_use; + rtm->rtm_rmx = rt->rt_rmx; + rtm->rtm_index = rt->rt_ifp->if_index; + rtm->rtm_pid = 0; + rtm->rtm_seq = 0; + rtm->rtm_errno = 0; + rtm->rtm_addrs = info.rti_addrs; + error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); + return (error); + } + } else { + size = rt_msg2(RTM_GET2, &info, 0, w); + if (w->w_req && w->w_tmem) { + struct rt_msghdr2 *rtm = (struct rt_msghdr2 *)w->w_tmem; + + rtm->rtm_flags = rt->rt_flags; + rtm->rtm_use = rt->rt_use; + rtm->rtm_rmx = rt->rt_rmx; + rtm->rtm_index = rt->rt_ifp->if_index; + rtm->rtm_refcnt = rt->rt_refcnt; + if (rt->rt_parent) + rtm->rtm_parentflags = rt->rt_parent->rt_flags; + else + rtm->rtm_parentflags = 0; + rtm->rtm_reserved = 0; + rtm->rtm_addrs = info.rti_addrs; + error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); + return (error); + + } } return (error); } int -sysctl_iflist(af, w) - int af; - register struct walkarg *w; +sysctl_iflist( + int af, + struct walkarg *w) { - register struct ifnet *ifp; - register struct ifaddr *ifa; + struct ifnet *ifp; + struct ifaddr *ifa; struct rt_addrinfo info; int len, error = 0; bzero((caddr_t)&info, sizeof(info)); - for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_link.tqe_next) { + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if (error) + break; if (w->w_arg && w->w_arg != ifp->if_index) continue; + ifnet_lock_shared(ifp); ifa = ifp->if_addrhead.tqh_first; ifpaddr = ifa->ifa_addr; len = rt_msg2(RTM_IFINFO, &info, (caddr_t)0, w); ifpaddr = 0; if (w->w_req && w->w_tmem) { - register struct if_msghdr *ifm; + struct if_msghdr *ifm; ifm = (struct if_msghdr *)w->w_tmem; ifm->ifm_index = ifp->if_index; ifm->ifm_flags = (u_short)ifp->if_flags; - ifm->ifm_data = ifp->if_data; + if_data_internal_to_if_data(&ifp->if_data, &ifm->ifm_data); ifm->ifm_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req,(caddr_t)ifm, len); - if (error) - return (error); + if (error) { + ifnet_lock_done(ifp); + break; + } } while ((ifa = ifa->ifa_link.tqe_next) != 0) { if (af && af != ifa->ifa_addr->sa_family) @@ -1090,7 +1195,7 @@ sysctl_iflist(af, w) brdaddr = ifa->ifa_dstaddr; len = rt_msg2(RTM_NEWADDR, &info, 0, w); if (w->w_req && w->w_tmem) { - register struct ifa_msghdr *ifam; + struct ifa_msghdr *ifam; ifam = (struct ifa_msghdr *)w->w_tmem; ifam->ifam_index = ifa->ifa_ifp->if_index; @@ -1099,21 +1204,148 
@@ sysctl_iflist(af, w) ifam->ifam_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, w->w_tmem, len); if (error) - return (error); + break; } } + ifnet_lock_done(ifp); ifaaddr = netmask = brdaddr = 0; } - return (0); + ifnet_head_done(); + return error; } +int +sysctl_iflist2( + int af, + struct walkarg *w) +{ + struct ifnet *ifp; + struct ifaddr *ifa; + struct rt_addrinfo info; + int len, error = 0; + + bzero((caddr_t)&info, sizeof(info)); + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if (error) + break; + if (w->w_arg && w->w_arg != ifp->if_index) + continue; + ifnet_lock_shared(ifp); + ifa = ifp->if_addrhead.tqh_first; + ifpaddr = ifa->ifa_addr; + len = rt_msg2(RTM_IFINFO2, &info, (caddr_t)0, w); + ifpaddr = 0; + if (w->w_req && w->w_tmem) { + struct if_msghdr2 *ifm; + + ifm = (struct if_msghdr2 *)w->w_tmem; + ifm->ifm_addrs = info.rti_addrs; + ifm->ifm_flags = (u_short)ifp->if_flags; + ifm->ifm_index = ifp->if_index; + ifm->ifm_snd_len = ifp->if_snd.ifq_len; + ifm->ifm_snd_maxlen = ifp->if_snd.ifq_maxlen; + ifm->ifm_snd_drops = ifp->if_snd.ifq_drops; + ifm->ifm_timer = ifp->if_timer; + if_data_internal_to_if_data64(&ifp->if_data, &ifm->ifm_data); + error = SYSCTL_OUT(w->w_req, w->w_tmem, len); + if (error) { + ifnet_lock_done(ifp); + break; + } + } + while ((ifa = ifa->ifa_link.tqe_next) != 0) { + if (af && af != ifa->ifa_addr->sa_family) + continue; + ifaaddr = ifa->ifa_addr; + netmask = ifa->ifa_netmask; + brdaddr = ifa->ifa_dstaddr; + len = rt_msg2(RTM_NEWADDR, &info, 0, w); + if (w->w_req && w->w_tmem) { + struct ifa_msghdr *ifam; + + ifam = (struct ifa_msghdr *)w->w_tmem; + ifam->ifam_index = ifa->ifa_ifp->if_index; + ifam->ifam_flags = ifa->ifa_flags; + ifam->ifam_metric = ifa->ifa_metric; + ifam->ifam_addrs = info.rti_addrs; + error = SYSCTL_OUT(w->w_req, w->w_tmem, len); + if (error) + break; + } + } + if (error) { + ifnet_lock_done(ifp); + break; + } + { + struct ifmultiaddr *ifma; + + for (ifma = ifp->if_multiaddrs.lh_first; ifma; + ifma = ifma->ifma_link.le_next) { + if (af && af != ifma->ifma_addr->sa_family) + continue; + bzero((caddr_t)&info, sizeof(info)); + ifaaddr = ifma->ifma_addr; + if (ifp->if_addrhead.tqh_first) + ifpaddr = ifp->if_addrhead.tqh_first->ifa_addr; + if (ifma->ifma_ll) + gate = ifma->ifma_ll->ifma_addr; + len = rt_msg2(RTM_NEWMADDR2, &info, 0, w); + if (w->w_req && w->w_tmem) { + struct ifma_msghdr2 *ifmam; + + ifmam = (struct ifma_msghdr2 *)w->w_tmem; + ifmam->ifmam_addrs = info.rti_addrs; + ifmam->ifmam_flags = 0; + ifmam->ifmam_index = ifma->ifma_ifp->if_index; + ifmam->ifmam_refcount = ifma->ifma_refcount; + error = SYSCTL_OUT(w->w_req, w->w_tmem, len); + if (error) + break; + } + } + } + ifnet_lock_done(ifp); + ifaaddr = netmask = brdaddr = 0; + } + ifnet_head_done(); + return error; +} + + +static int +sysctl_rtstat(struct sysctl_req *req) +{ + int error; + + error = SYSCTL_OUT(req, &rtstat, sizeof(struct rtstat)); + if (error) + return (error); + + return 0; +} + +static int +sysctl_rttrash(struct sysctl_req *req) +{ + int error; + + error = SYSCTL_OUT(req, &rttrash, sizeof(rttrash)); + if (error) + return (error); + + return 0; +} + + static int sysctl_rtsock SYSCTL_HANDLER_ARGS { int *name = (int *)arg1; u_int namelen = arg2; - register struct radix_node_head *rnh; - int i, s, error = EINVAL; + struct radix_node_head *rnh; + int i, error = EINVAL; u_char af; struct walkarg w; @@ -1129,10 +1361,11 @@ sysctl_rtsock SYSCTL_HANDLER_ARGS w.w_arg = name[2]; w.w_req = req; - s = splnet(); + lck_mtx_lock(rt_mtx); switch 
(w.w_op) { case NET_RT_DUMP: + case NET_RT_DUMP2: case NET_RT_FLAGS: for (i = 1; i <= AF_MAX; i++) if ((rnh = rt_tables[i]) && (af == 0 || af == i) && @@ -1140,11 +1373,20 @@ sysctl_rtsock SYSCTL_HANDLER_ARGS sysctl_dumpentry, &w))) break; break; - case NET_RT_IFLIST: error = sysctl_iflist(af, &w); + break; + case NET_RT_IFLIST2: + error = sysctl_iflist2(af, &w); + break; + case NET_RT_STAT: + error = sysctl_rtstat(req); + break; + case NET_RT_TRASH: + error = sysctl_rttrash(req); + break; } - splx(s); + lck_mtx_unlock(rt_mtx); if (w.w_tmem) FREE(w.w_tmem, M_RTABLE); return (error); @@ -1163,13 +1405,18 @@ static struct protosw routesw[] = { 0, route_output, raw_ctlinput, 0, 0, raw_init, 0, 0, 0, - 0, &route_usrreqs, 0, 0 + 0, + &route_usrreqs, + 0, 0, 0, + { 0, 0 }, 0, { 0 } } }; struct domain routedomain = { PF_ROUTE, "route", route_init, 0, 0, - routesw}; + routesw, + 0, 0, 0, 0, 0, 0, 0, 0, + { 0, 0 } }; DOMAIN_SET(route); diff --git a/bsd/net/slcompress.c b/bsd/net/slcompress.c deleted file mode 100644 index 47108b410..000000000 --- a/bsd/net/slcompress.c +++ /dev/null @@ -1,635 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/*- - * Copyright (c) 1989, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *	@(#)slcompress.c	8.2 (Berkeley) 4/16/94
- * $FreeBSD: src/sys/net/slcompress.c,v 1.16 1999/12/29 04:38:37 peter Exp $
- */
-
-/*
- * Routines to compress and uncompess tcp packets (for transmission
- * over low speed serial lines.
- *
- * Van Jacobson (van@helios.ee.lbl.gov), Dec 31, 1989:
- *	- Initial distribution.
- *
- */
-
-#include <sys/param.h>
-#include <sys/mbuf.h>
-#include <sys/systm.h>
-
-#include <netinet/in.h>
-#include <netinet/in_systm.h>
-#include <netinet/ip.h>
-#include <netinet/tcp.h>
-
-#include <net/slcompress.h>
-
-#ifndef SL_NO_STATS
-#define INCR(counter) ++comp->counter;
-#else
-#define INCR(counter)
-#endif
-
-#define BCMP(p1, p2, n) bcmp((char *)(p1), (char *)(p2), (int)(n))
-#define BCOPY(p1, p2, n) bcopy((char *)(p1), (char *)(p2), (int)(n))
-#ifndef KERNEL
-#define ovbcopy bcopy
-#endif
-
-void
-sl_compress_init(comp, max_state)
-	struct slcompress *comp;
-	int max_state;
-{
-	register u_int i;
-	register struct cstate *tstate = comp->tstate;
-
-	if (max_state == -1) {
-		max_state = MAX_STATES - 1;
-		bzero((char *)comp, sizeof(*comp));
-	} else {
-		/* Don't reset statistics */
-		bzero((char *)comp->tstate, sizeof(comp->tstate));
-		bzero((char *)comp->rstate, sizeof(comp->rstate));
-	}
-	for (i = max_state; i > 0; --i) {
-		tstate[i].cs_id = i;
-		tstate[i].cs_next = &tstate[i - 1];
-	}
-	tstate[0].cs_next = &tstate[max_state];
-	tstate[0].cs_id = 0;
-	comp->last_cs = &tstate[0];
-	comp->last_recv = 255;
-	comp->last_xmit = 255;
-	comp->flags = SLF_TOSS;
-}
-
-
-/* ENCODE encodes a number that is known to be non-zero.  ENCODEZ
- * checks for zero (since zero has to be encoded in the long, 3 byte
- * form).
- */
-#define ENCODE(n) { \
-	if ((u_int16_t)(n) >= 256) { \
-		*cp++ = 0; \
-		cp[1] = (n); \
-		cp[0] = (n) >> 8; \
-		cp += 2; \
-	} else { \
-		*cp++ = (n); \
-	} \
-}
-#define ENCODEZ(n) { \
-	if ((u_int16_t)(n) >= 256 || (u_int16_t)(n) == 0) { \
-		*cp++ = 0; \
-		cp[1] = (n); \
-		cp[0] = (n) >> 8; \
-		cp += 2; \
-	} else { \
-		*cp++ = (n); \
-	} \
-}
-
-#define DECODEL(f) { \
-	if (*cp == 0) {\
-		(f) = htonl(ntohl(f) + ((cp[1] << 8) | cp[2])); \
-		cp += 3; \
-	} else { \
-		(f) = htonl(ntohl(f) + (u_int32_t)*cp++); \
-	} \
-}
-
-#define DECODES(f) { \
-	if (*cp == 0) {\
-		(f) = htons(ntohs(f) + ((cp[1] << 8) | cp[2])); \
-		cp += 3; \
-	} else { \
-		(f) = htons(ntohs(f) + (u_int32_t)*cp++); \
-	} \
-}
-
-#define DECODEU(f) { \
-	if (*cp == 0) {\
-		(f) = htons((cp[1] << 8) | cp[2]); \
-		cp += 3; \
-	} else { \
-		(f) = htons((u_int32_t)*cp++); \
-	} \
-}
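
The ENCODE/ENCODEZ macros in the deleted code above implement the Van Jacobson variable-length delta coding: a delta of 1-255 goes out as a single octet, while 0 or anything >= 256 goes out as three octets (an escape 0, then MSB, then LSB). A standalone restatement of ENCODEZ, with a hypothetical helper name and assuming nothing beyond <stdint.h>:

#include <stdint.h>

/* vj_encodez: stand-alone rendering of the ENCODEZ macro's byte layout.
 * Writes the delta 'n' at 'cp' and returns the advanced write pointer. */
static uint8_t *
vj_encodez(uint8_t *cp, uint16_t n)
{
	if (n >= 256 || n == 0) {
		*cp++ = 0;                  /* escape octet: 3-byte form follows */
		*cp++ = (uint8_t)(n >> 8);  /* MSB */
		*cp++ = (uint8_t)n;         /* LSB */
	} else {
		*cp++ = (uint8_t)n;         /* 1-byte form for deltas 1-255 */
	}
	return cp;
}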
-
-/*
- * Attempt to compress an outgoing TCP packet and return the type of
- * the result.  The caller must have already verified that the protocol
- * is TCP.  The first mbuf must contain the complete IP and TCP headers,
- * and "ip" must be == mtod(m, struct ip *).  "comp" supplies the
- * compression state, and "compress_cid" tells us whether it is OK
- * to leave out the CID field when feasible.
- *
- * The caller is responsible for adjusting m->m_pkthdr.len upon return,
- * if m is an M_PKTHDR mbuf.
- */
-u_int
-sl_compress_tcp(m, ip, comp, compress_cid)
-	struct mbuf *m;
-	register struct ip *ip;
-	struct slcompress *comp;
-	int compress_cid;
-{
-	register struct cstate *cs = comp->last_cs->cs_next;
-	register u_int hlen = ip->ip_hl;
-	register struct tcphdr *oth;
-	register struct tcphdr *th;
-	register u_int deltaS, deltaA;
-	register u_int changes = 0;
-	u_char new_seq[16];
-	register u_char *cp = new_seq;
-
-	/*
-	 * Bail if this is an IP fragment or if the TCP packet isn't
-	 * `compressible' (i.e., ACK isn't set or some other control bit is
-	 * set).  (We assume that the caller has already made sure the
-	 * packet is IP proto TCP).
-	 */
-	if ((ip->ip_off & htons(0x3fff)) || m->m_len < 40)
-		return (TYPE_IP);
-
-	th = (struct tcphdr *)&((int32_t *)ip)[hlen];
-	if ((th->th_flags & (TH_SYN|TH_FIN|TH_RST|TH_ACK)) != TH_ACK)
-		return (TYPE_IP);
-	/*
-	 * Packet is compressible -- we're going to send either a
-	 * COMPRESSED_TCP or UNCOMPRESSED_TCP packet.  Either way we need
-	 * to locate (or create) the connection state.  Special case the
-	 * most recently used connection since it's most likely to be used
-	 * again & we don't have to do any reordering if it's used.
-	 */
-	INCR(sls_packets)
-	if (ip->ip_src.s_addr != cs->cs_ip.ip_src.s_addr ||
-	    ip->ip_dst.s_addr != cs->cs_ip.ip_dst.s_addr ||
-	    *(int32_t *)th != ((int32_t *)&cs->cs_ip)[cs->cs_ip.ip_hl]) {
-		/*
-		 * Wasn't the first -- search for it.
-		 *
-		 * States are kept in a circularly linked list with
-		 * last_cs pointing to the end of the list.  The
-		 * list is kept in lru order by moving a state to the
-		 * head of the list whenever it is referenced.  Since
-		 * the list is short and, empirically, the connection
-		 * we want is almost always near the front, we locate
-		 * states via linear search.  If we don't find a state
-		 * for the datagram, the oldest state is (re-)used.
-		 */
-		register struct cstate *lcs;
-		register struct cstate *lastcs = comp->last_cs;
-
-		do {
-			lcs = cs; cs = cs->cs_next;
-			INCR(sls_searches)
-			if (ip->ip_src.s_addr == cs->cs_ip.ip_src.s_addr
-			    && ip->ip_dst.s_addr == cs->cs_ip.ip_dst.s_addr
-			    && *(int32_t *)th ==
-			    ((int32_t *)&cs->cs_ip)[cs->cs_ip.ip_hl])
-				goto found;
-		} while (cs != lastcs);
-
-		/*
-		 * Didn't find it -- re-use oldest cstate.  Send an
-		 * uncompressed packet that tells the other side what
-		 * connection number we're using for this conversation.
-		 * Note that since the state list is circular, the oldest
-		 * state points to the newest and we only need to set
-		 * last_cs to update the lru linkage.
-		 */
-		INCR(sls_misses)
-		comp->last_cs = lcs;
-		hlen += th->th_off;
-		hlen <<= 2;
-		if (hlen > m->m_len)
-			return TYPE_IP;
-		goto uncompressed;
-
-	found:
-		/*
-		 * Found it -- move to the front on the connection list.
-		 */
-		if (cs == lastcs)
-			comp->last_cs = lcs;
-		else {
-			lcs->cs_next = cs->cs_next;
-			cs->cs_next = lastcs->cs_next;
-			lastcs->cs_next = cs;
-		}
-	}
-
-	/*
-	 * Make sure that only what we expect to change changed. The first
-	 * line of the `if' checks the IP protocol version, header length &
-	 * type of service.  The 2nd line checks the "Don't fragment" bit.
-	 * The 3rd line checks the time-to-live and protocol (the protocol
-	 * check is unnecessary but costless).  The 4th line checks the TCP
-	 * header length.  The 5th line checks IP options, if any.  The 6th
-	 * line checks TCP options, if any.
If any of these things are - * different between the previous & current datagram, we send the - * current datagram `uncompressed'. - */ - oth = (struct tcphdr *)&((int32_t *)&cs->cs_ip)[hlen]; - deltaS = hlen; - hlen += th->th_off; - hlen <<= 2; - if (hlen > m->m_len) - return TYPE_IP; - - if (((u_int16_t *)ip)[0] != ((u_int16_t *)&cs->cs_ip)[0] || - ((u_int16_t *)ip)[3] != ((u_int16_t *)&cs->cs_ip)[3] || - ((u_int16_t *)ip)[4] != ((u_int16_t *)&cs->cs_ip)[4] || - th->th_off != oth->th_off || - (deltaS > 5 && - BCMP(ip + 1, &cs->cs_ip + 1, (deltaS - 5) << 2)) || - (th->th_off > 5 && - BCMP(th + 1, oth + 1, (th->th_off - 5) << 2))) - goto uncompressed; - - /* - * Figure out which of the changing fields changed. The - * receiver expects changes in the order: urgent, window, - * ack, seq (the order minimizes the number of temporaries - * needed in this section of code). - */ - if (th->th_flags & TH_URG) { - deltaS = ntohs(th->th_urp); - ENCODEZ(deltaS); - changes |= NEW_U; - } else if (th->th_urp != oth->th_urp) - /* argh! URG not set but urp changed -- a sensible - * implementation should never do this but RFC793 - * doesn't prohibit the change so we have to deal - * with it. */ - goto uncompressed; - - deltaS = (u_int16_t)(ntohs(th->th_win) - ntohs(oth->th_win)); - if (deltaS) { - ENCODE(deltaS); - changes |= NEW_W; - } - - deltaA = ntohl(th->th_ack) - ntohl(oth->th_ack); - if (deltaA) { - if (deltaA > 0xffff) - goto uncompressed; - ENCODE(deltaA); - changes |= NEW_A; - } - - deltaS = ntohl(th->th_seq) - ntohl(oth->th_seq); - if (deltaS) { - if (deltaS > 0xffff) - goto uncompressed; - ENCODE(deltaS); - changes |= NEW_S; - } - - switch(changes) { - - case 0: - /* - * Nothing changed. If this packet contains data and the - * last one didn't, this is probably a data packet following - * an ack (normal on an interactive connection) and we send - * it compressed. Otherwise it's probably a retransmit, - * retransmitted ack or window probe. Send it uncompressed - * in case the other side missed the compressed version. - */ - if (ip->ip_len != cs->cs_ip.ip_len && - ntohs(cs->cs_ip.ip_len) == hlen) - break; - - /* (fall through) */ - - case SPECIAL_I: - case SPECIAL_D: - /* - * actual changes match one of our special case encodings -- - * send packet uncompressed. - */ - goto uncompressed; - - case NEW_S|NEW_A: - if (deltaS == deltaA && - deltaS == ntohs(cs->cs_ip.ip_len) - hlen) { - /* special case for echoed terminal traffic */ - changes = SPECIAL_I; - cp = new_seq; - } - break; - - case NEW_S: - if (deltaS == ntohs(cs->cs_ip.ip_len) - hlen) { - /* special case for data xfer */ - changes = SPECIAL_D; - cp = new_seq; - } - break; - } - - deltaS = ntohs(ip->ip_id) - ntohs(cs->cs_ip.ip_id); - if (deltaS != 1) { - ENCODEZ(deltaS); - changes |= NEW_I; - } - if (th->th_flags & TH_PUSH) - changes |= TCP_PUSH_BIT; - /* - * Grab the cksum before we overwrite it below. Then update our - * state with this packet's header. - */ - deltaA = ntohs(th->th_sum); - BCOPY(ip, &cs->cs_ip, hlen); - - /* - * We want to use the original packet as our compressed packet. - * (cp - new_seq) is the number of bytes we need for compressed - * sequence numbers. In addition we need one byte for the change - * mask, one for the connection id and two for the tcp checksum. - * So, (cp - new_seq) + 4 bytes of header are needed. hlen is how - * many bytes of the original packet to toss so subtract the two to - * get the new packet size. 
- */ - deltaS = cp - new_seq; - cp = (u_char *)ip; - if (compress_cid == 0 || comp->last_xmit != cs->cs_id) { - comp->last_xmit = cs->cs_id; - hlen -= deltaS + 4; - cp += hlen; - *cp++ = changes | NEW_C; - *cp++ = cs->cs_id; - } else { - hlen -= deltaS + 3; - cp += hlen; - *cp++ = changes; - } - m->m_len -= hlen; - m->m_data += hlen; - *cp++ = deltaA >> 8; - *cp++ = deltaA; - BCOPY(new_seq, cp, deltaS); - INCR(sls_compressed) - return (TYPE_COMPRESSED_TCP); - - /* - * Update connection state cs & send uncompressed packet ('uncompressed' - * means a regular ip/tcp packet but with the 'conversation id' we hope - * to use on future compressed packets in the protocol field). - */ -uncompressed: - BCOPY(ip, &cs->cs_ip, hlen); - ip->ip_p = cs->cs_id; - comp->last_xmit = cs->cs_id; - return (TYPE_UNCOMPRESSED_TCP); -} - - -int -sl_uncompress_tcp(bufp, len, type, comp) - u_char **bufp; - int len; - u_int type; - struct slcompress *comp; -{ - u_char *hdr, *cp; - int hlen, vjlen; - - cp = bufp? *bufp: NULL; - vjlen = sl_uncompress_tcp_core(cp, len, len, type, comp, &hdr, &hlen); - if (vjlen < 0) - return (0); /* error */ - if (vjlen == 0) - return (len); /* was uncompressed already */ - - cp += vjlen; - len -= vjlen; - - /* - * At this point, cp points to the first byte of data in the - * packet. If we're not aligned on a 4-byte boundary, copy the - * data down so the ip & tcp headers will be aligned. Then back up - * cp by the tcp/ip header length to make room for the reconstructed - * header (we assume the packet we were handed has enough space to - * prepend 128 bytes of header). - */ - if ((intptr_t)cp & 3) { - if (len > 0) - (void) ovbcopy(cp, (caddr_t)((intptr_t)cp &~ 3), len); - cp = (u_char *)((intptr_t)cp &~ 3); - } - cp -= hlen; - len += hlen; - BCOPY(hdr, cp, hlen); - - *bufp = cp; - return (len); -} - -/* - * Uncompress a packet of total length total_len. The first buflen - * bytes are at buf; this must include the entire (compressed or - * uncompressed) TCP/IP header. This procedure returns the length - * of the VJ header, with a pointer to the uncompressed IP header - * in *hdrp and its length in *hlenp. - */ -int -sl_uncompress_tcp_core(buf, buflen, total_len, type, comp, hdrp, hlenp) - u_char *buf; - int buflen, total_len; - u_int type; - struct slcompress *comp; - u_char **hdrp; - u_int *hlenp; -{ - register u_char *cp; - register u_int hlen, changes; - register struct tcphdr *th; - register struct cstate *cs; - register struct ip *ip; - register u_int16_t *bp; - register u_int vjlen; - - switch (type) { - - case TYPE_UNCOMPRESSED_TCP: - ip = (struct ip *) buf; - if (ip->ip_p >= MAX_STATES) - goto bad; - cs = &comp->rstate[comp->last_recv = ip->ip_p]; - comp->flags &=~ SLF_TOSS; - ip->ip_p = IPPROTO_TCP; - /* - * Calculate the size of the TCP/IP header and make sure that - * we don't overflow the space we have available for it. - */ - hlen = ip->ip_hl << 2; - if (hlen + sizeof(struct tcphdr) > buflen) - goto bad; - hlen += ((struct tcphdr *)&((char *)ip)[hlen])->th_off << 2; - if (hlen > MAX_HDR || hlen > buflen) - goto bad; - BCOPY(ip, &cs->cs_ip, hlen); - cs->cs_hlen = hlen; - INCR(sls_uncompressedin) - *hdrp = (u_char *) &cs->cs_ip; - *hlenp = hlen; - return (0); - - default: - goto bad; - - case TYPE_COMPRESSED_TCP: - break; - } - /* We've got a compressed packet. */ - INCR(sls_compressedin) - cp = buf; - changes = *cp++; - if (changes & NEW_C) { - /* Make sure the state index is in range, then grab the state. - * If we have a good state index, clear the 'discard' flag. 
*/ - if (*cp >= MAX_STATES) - goto bad; - - comp->flags &=~ SLF_TOSS; - comp->last_recv = *cp++; - } else { - /* this packet has an implicit state index. If we've - * had a line error since the last time we got an - * explicit state index, we have to toss the packet. */ - if (comp->flags & SLF_TOSS) { - INCR(sls_tossed) - return (-1); - } - } - cs = &comp->rstate[comp->last_recv]; - hlen = cs->cs_ip.ip_hl << 2; - th = (struct tcphdr *)&((u_char *)&cs->cs_ip)[hlen]; - th->th_sum = htons((*cp << 8) | cp[1]); - cp += 2; - if (changes & TCP_PUSH_BIT) - th->th_flags |= TH_PUSH; - else - th->th_flags &=~ TH_PUSH; - - switch (changes & SPECIALS_MASK) { - case SPECIAL_I: - { - register u_int i = ntohs(cs->cs_ip.ip_len) - cs->cs_hlen; - th->th_ack = htonl(ntohl(th->th_ack) + i); - th->th_seq = htonl(ntohl(th->th_seq) + i); - } - break; - - case SPECIAL_D: - th->th_seq = htonl(ntohl(th->th_seq) + ntohs(cs->cs_ip.ip_len) - - cs->cs_hlen); - break; - - default: - if (changes & NEW_U) { - th->th_flags |= TH_URG; - DECODEU(th->th_urp) - } else - th->th_flags &=~ TH_URG; - if (changes & NEW_W) - DECODES(th->th_win) - if (changes & NEW_A) - DECODEL(th->th_ack) - if (changes & NEW_S) - DECODEL(th->th_seq) - break; - } - if (changes & NEW_I) { - DECODES(cs->cs_ip.ip_id) - } else - cs->cs_ip.ip_id = htons(ntohs(cs->cs_ip.ip_id) + 1); - - /* - * At this point, cp points to the first byte of data in the - * packet. Fill in the IP total length and update the IP - * header checksum. - */ - vjlen = cp - buf; - buflen -= vjlen; - if (buflen < 0) - /* we must have dropped some characters (crc should detect - * this but the old slip framing won't) */ - goto bad; - - total_len += cs->cs_hlen - vjlen; - cs->cs_ip.ip_len = htons(total_len); - - /* recompute the ip header checksum */ - bp = (u_int16_t *) &cs->cs_ip; - cs->cs_ip.ip_sum = 0; - for (changes = 0; hlen > 0; hlen -= 2) - changes += *bp++; - changes = (changes & 0xffff) + (changes >> 16); - changes = (changes & 0xffff) + (changes >> 16); - cs->cs_ip.ip_sum = ~ changes; - - *hdrp = (u_char *) &cs->cs_ip; - *hlenp = cs->cs_hlen; - return vjlen; - -bad: - comp->flags |= SLF_TOSS; - INCR(sls_errorin) - return (-1); -} diff --git a/bsd/net/slcompress.h b/bsd/net/slcompress.h deleted file mode 100644 index b784ad8db..000000000 --- a/bsd/net/slcompress.h +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * Definitions for tcp compression routines. - * - * Copyright (c) 1989, 1993 - * The Regents of the University of California. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * Van Jacobson (van@helios.ee.lbl.gov), Dec 31, 1989: - * - Initial distribution. - * $FreeBSD: src/sys/net/slcompress.h,v 1.14.2.1 2000/05/05 13:37:06 jlemon Exp $ - */ - -#ifndef _NET_SLCOMPRESS_H_ -#define _NET_SLCOMPRESS_H_ -#include <sys/appleapiopts.h> - -#include <netinet/ip.h> - -#define MAX_STATES 16 /* must be > 2 and < 256 */ -#define MAX_HDR 128 - -/* - * Compressed packet format: - * - * The first octet contains the packet type (top 3 bits), TCP - * 'push' bit, and flags that indicate which of the 4 TCP sequence - * numbers have changed (bottom 5 bits). The next octet is a - * conversation number that associates a saved IP/TCP header with - * the compressed packet. The next two octets are the TCP checksum - * from the original datagram. The next 0 to 15 octets are - * sequence number changes, one change per bit set in the header - * (there may be no changes and there are two special cases where - * the receiver implicitly knows what changed -- see below). - * - * There are 5 numbers which can change (they are always inserted - * in the following order): TCP urgent pointer, window, - * acknowledgement, sequence number and IP ID. (The urgent pointer - * is different from the others in that its value is sent, not the - * change in value.) Since typical use of SLIP links is biased - * toward small packets (see comments on MTU/MSS below), changes - * use a variable length coding with one octet for numbers in the - * range 1 - 255 and 3 octets (0, MSB, LSB) for numbers in the - * range 256 - 65535 or 0. (If the change in sequence number or - * ack is more than 65535, an uncompressed packet is sent.) - */ - -/* - * Packet types (must not conflict with IP protocol version) - * - * The top nibble of the first octet is the packet type. 
There are - * three possible types: IP (not proto TCP or tcp with one of the - * control flags set); uncompressed TCP (a normal IP/TCP packet but - * with the 8-bit protocol field replaced by an 8-bit connection id -- - * this type of packet syncs the sender & receiver); and compressed - * TCP (described above). - * - * LSB of 4-bit field is TCP "PUSH" bit (a worthless anachronism) and - * is logically part of the 4-bit "changes" field that follows. Top - * three bits are actual packet type. For backward compatibility - * and in the interest of conserving bits, numbers are chosen so the - * IP protocol version number (4) which normally appears in this nibble - * means "IP packet". - */ - -/* packet types */ -#define TYPE_IP 0x40 -#define TYPE_UNCOMPRESSED_TCP 0x70 -#define TYPE_COMPRESSED_TCP 0x80 -#define TYPE_ERROR 0x00 - -/* Bits in first octet of compressed packet */ -#define NEW_C 0x40 /* flag bits for what changed in a packet */ -#define NEW_I 0x20 -#define NEW_S 0x08 -#define NEW_A 0x04 -#define NEW_W 0x02 -#define NEW_U 0x01 - -/* reserved, special-case values of above */ -#define SPECIAL_I (NEW_S|NEW_W|NEW_U) /* echoed interactive traffic */ -#define SPECIAL_D (NEW_S|NEW_A|NEW_W|NEW_U) /* unidirectional data */ -#define SPECIALS_MASK (NEW_S|NEW_A|NEW_W|NEW_U) - -#define TCP_PUSH_BIT 0x10 - - -/* - * "state" data for each active tcp conversation on the wire. This is - * basically a copy of the entire IP/TCP header from the last packet - * we saw from the conversation together with a small identifier - * the transmit & receive ends of the line use to locate saved header. - */ -struct cstate { - struct cstate *cs_next; /* next most recently used cstate (xmit only) */ - u_int16_t cs_hlen; /* size of hdr (receive only) */ - u_char cs_id; /* connection # associated with this state */ - u_char cs_filler; - union { - char csu_hdr[MAX_HDR]; - struct ip csu_ip; /* ip/tcp hdr from most recent packet */ - } slcs_u; -}; -#define cs_ip slcs_u.csu_ip -#define cs_hdr slcs_u.csu_hdr - -/* - * all the state data for one serial line (we need one of these - * per line). - */ -struct slcompress { - struct cstate *last_cs; /* most recently used tstate */ - u_char last_recv; /* last rcvd conn. id */ - u_char last_xmit; /* last sent conn. id */ - u_int16_t flags; -#ifndef SL_NO_STATS - int sls_packets; /* outbound packets */ - int sls_compressed; /* outbound compressed packets */ - int sls_searches; /* searches for connection state */ - int sls_misses; /* times couldn't find conn. 
state */ - int sls_uncompressedin; /* inbound uncompressed packets */ - int sls_compressedin; /* inbound compressed packets */ - int sls_errorin; /* inbound unknown type packets */ - int sls_tossed; /* inbound packets tossed because of error */ -#endif - struct cstate tstate[MAX_STATES]; /* xmit connection states */ - struct cstate rstate[MAX_STATES]; /* receive connection states */ -}; -/* flag values */ -#define SLF_TOSS 1 /* tossing rcvd frames because of input err */ - -#if !defined(KERNEL) || defined(__APPLE_API_PRIVATE) -void sl_compress_init __P((struct slcompress *, int)); -u_int sl_compress_tcp __P((struct mbuf *, - struct ip *, struct slcompress *, int)); -int sl_uncompress_tcp __P((u_char **, int, u_int, struct slcompress *)); -int sl_uncompress_tcp_core __P((u_char *, int, int, u_int, - struct slcompress *, u_char **, u_int *)); - -#endif /* !KERNEL || __APPLE_API_PRIVATE */ -#endif /* !_NET_SLCOMPRESS_H_ */ diff --git a/bsd/net/zlib.c b/bsd/net/zlib.c index 78130d5b8..a3d4c72ba 100644 --- a/bsd/net/zlib.c +++ b/bsd/net/zlib.c @@ -49,7 +49,7 @@ subject to change. Applications should only use zlib.h. */ -/* @(#) $Id: zlib.c,v 1.9 2002/11/28 00:56:55 lindak Exp $ */ +/* @(#) $Id: zlib.c,v 1.10 2004/07/29 19:17:20 lindak Exp $ */ #ifndef _Z_UTIL_H #define _Z_UTIL_H @@ -295,7 +295,7 @@ void zcfree OF((voidpf opaque, voidpf ptr)); subject to change. Applications should only use zlib.h. */ -/* @(#) $Id: zlib.c,v 1.9 2002/11/28 00:56:55 lindak Exp $ */ +/* @(#) $Id: zlib.c,v 1.10 2004/07/29 19:17:20 lindak Exp $ */ #ifndef _DEFLATE_H #define _DEFLATE_H @@ -655,7 +655,7 @@ void _tr_stored_block OF((deflate_state *s, charf *buf, ulg stored_len, * */ -/* @(#) $Id: zlib.c,v 1.9 2002/11/28 00:56:55 lindak Exp $ */ +/* @(#) $Id: zlib.c,v 1.10 2004/07/29 19:17:20 lindak Exp $ */ /* #include "deflate.h" */ @@ -1997,7 +1997,7 @@ local block_state deflate_slow(s, flush) * Addison-Wesley, 1983. ISBN 0-201-06672-6. 
*/ -/* @(#) $Id: zlib.c,v 1.9 2002/11/28 00:56:55 lindak Exp $ */ +/* @(#) $Id: zlib.c,v 1.10 2004/07/29 19:17:20 lindak Exp $ */ /* #define GEN_TREES_H */ @@ -2359,7 +2359,7 @@ local int tr_static_init( ush bl_count[MAX_BITS+1]; /* number of codes at each bit length for an optimal tree */ - if (static_init_done) return; + if (static_init_done) return Z_OK; /* allocate storage for static structures */ if (static_storage == Z_NULL) { @@ -2439,6 +2439,7 @@ local int tr_static_init( gen_trees_header(); # endif #endif /* defined(GEN_TREES_H) || !defined(STDC) */ + return Z_OK; } /* =========================================================================== @@ -5541,7 +5542,7 @@ z_streamp z; * For conditions of distribution and use, see copyright notice in zlib.h */ -/* @(#) $Id: zlib.c,v 1.9 2002/11/28 00:56:55 lindak Exp $ */ +/* @(#) $Id: zlib.c,v 1.10 2004/07/29 19:17:20 lindak Exp $ */ /* #include "zutil.h" */ @@ -5771,7 +5772,7 @@ void zcfree (opaque, ptr) * For conditions of distribution and use, see copyright notice in zlib.h */ -/* @(#) $Id: zlib.c,v 1.9 2002/11/28 00:56:55 lindak Exp $ */ +/* @(#) $Id: zlib.c,v 1.10 2004/07/29 19:17:20 lindak Exp $ */ /* #include "zlib.h" */ diff --git a/bsd/net/zlib.h b/bsd/net/zlib.h index 688673ae5..5f90b2049 100644 --- a/bsd/net/zlib.h +++ b/bsd/net/zlib.h @@ -40,7 +40,7 @@ #define _ZLIB_H #include <sys/appleapiopts.h> -#if !defined(KERNEL) || defined(__APPLE_API_PRIVATE) +#ifdef KERNEL_PRIVATE #if __cplusplus extern "C" { @@ -81,8 +81,10 @@ extern "C" { # define compress2 z_compress2 # define uncompress z_uncompress # define adler32 z_adler32 +#if 0 # define crc32 z_crc32 # define get_crc_table z_get_crc_table +#endif # define Byte z_Byte # define uInt z_uInt @@ -1126,6 +1128,7 @@ ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len)); if (adler != original_adler) error(); */ +#if 0 ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len)); /* Update a running crc with the bytes buf[0..len-1] and return the updated @@ -1141,6 +1144,7 @@ ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len)); } if (crc != original_crc) error(); */ +#endif /* various hacks, don't look :) */ @@ -1181,6 +1185,6 @@ ZEXTERN const uLongf * ZEXPORT get_crc_table OF((void)); } #endif -#endif /* !KERNEL || __APPLE_API_PRIVATE */ +#endif KERNEL_PRIVATE #endif /* _ZLIB_H */ /* --- zlib.h */ diff --git a/bsd/netat/Makefile b/bsd/netat/Makefile index c31806cc6..9a37bee86 100644 --- a/bsd/netat/Makefile +++ b/bsd/netat/Makefile @@ -25,11 +25,17 @@ COMP_SUBDIRS = \ INST_SUBDIRS = \ -DATAFILES = appletalk.h atp.h asp.h at_pcb.h at_var.h aurp.h \ - debug.h ddp.h ep.h lap.h nbp.h pap.h sysglue.h zip.h \ - adsp.h adsp_internal.h \ - at_pat.h at_snmp.h at_aarp.h at_ddp_brt.h \ - routing_tables.h rtmp.h +DATAFILES = appletalk.h atp.h asp.h aurp.h \ + ddp.h ep.h lap.h nbp.h pap.h zip.h \ + adsp.h at_pat.h at_snmp.h at_aarp.h \ + rtmp.h + +PRIVATE_DATAFILES = \ + debug.h routing_tables.h sysglue.h at_var.h + +PRIVATE_KERNELFILES = \ + adsp_internal.h at_ddp_brt.h at_pcb.h + INSTALL_MI_LIST = ${DATAFILES} @@ -39,6 +45,10 @@ EXPORT_MI_LIST = ${DATAFILES} EXPORT_MI_DIR = netat +INSTALL_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} + +INSTALL_KF_MI_LCL_LIST = ${INSTALL_MI_LCL_LIST} ${PRIVATE_KERNELFILES} + include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/netat/adsp.c b/bsd/netat/adsp.c index 02d01c963..b6c1cf6e6 100644 --- a/bsd/netat/adsp.c +++ b/bsd/netat/adsp.c @@ -133,150 +133,151 @@ adspWriteHandler(gref, mp) void 
*sp; switch(gbuf_type(mp)) { - case MSG_DATA: - if (gref->info == 0) { - gbuf_freem(mp); - return(STR_IGNORE); - } - /* - * Fill in the global stuff - */ - ap = (struct adspcmd *)gbuf_rptr(mp); - ap->gref = gref; - ap->ioc = 0; - ap->mp = mp; - sp = (void *)gbuf_rptr(((gbuf_t *)gref->info)); - switch(ap->csCode) { - case dspWrite: - if ((error = adspWrite(sp, ap))) - gbuf_freem(mp); - return(STR_IGNORE); - case dspAttention: - if ((error = adspAttention(sp, ap))) - gbuf_freem(mp); - return(STR_IGNORE); - } - case MSG_IOCTL: - if (gref->info == 0) { - adspioc_ack(EPROTO, mp, gref); - return(STR_IGNORE); - } - iocbp = (ioc_t *) gbuf_rptr(mp); - if (ADSP_IOCTL(iocbp->ioc_cmd)) { - iocbp->ioc_count = sizeof(*ap) - 1; - if (gbuf_cont(mp) == 0) { - adspioc_ack(EINVAL, mp, gref); - return(STR_IGNORE); - } - ap = (struct adspcmd *) gbuf_rptr(gbuf_cont(mp)); - ap->gref = gref; - ap->ioc = (caddr_t) mp; - ap->mp = gbuf_cont(mp); /* request head */ - ap->ioResult = 0; - - if ((gref->info == 0) && ((iocbp->ioc_cmd != ADSPOPEN) && - (iocbp->ioc_cmd != ADSPCLLISTEN))) { - ap->ioResult = errState; - - adspioc_ack(EINVAL, mp, gref); - return(STR_IGNORE); - } - } - sp = (void *)gbuf_rptr(((gbuf_t *)gref->info)); - switch(iocbp->ioc_cmd) { - case ADSPOPEN: - case ADSPCLLISTEN: - ap->socket = ((CCBPtr)sp)->localSocket; - flag = (adspMode(ap) == ocAccept) ? 1 : 0; - if (flag && ap->socket) { - if (adspDeassignSocket((CCBPtr)sp) >= 0) - ap->socket = 0; - } - if ((ap->socket == 0) && - ((ap->socket = - (at_socket)adspAssignSocket(gref, flag)) == 0)) { - adspioc_ack(EADDRNOTAVAIL, mp, gref); - return(STR_IGNORE); - } - ap->csCode = iocbp->ioc_cmd == ADSPOPEN ? dspInit : dspCLInit; - if ((error = adspInit(sp, ap)) == 0) { + case MSG_DATA: + if (gref->info == 0) { + gbuf_freem(mp); + return(STR_IGNORE); + } + /* + * Fill in the global stuff + */ + ap = (struct adspcmd *)gbuf_rptr(mp); + ap->gref = gref; + ap->ioc = 0; + ap->mp = mp; + sp = (void *)gbuf_rptr(((gbuf_t *)gref->info)); switch(ap->csCode) { - case dspInit: - /* and open the connection */ - ap->csCode = dspOpen; - error = adspOpen(sp, ap); - break; - case dspCLInit: - /* ADSPCLLISTEN */ - ap->csCode = dspCLListen; - error = adspCLListen(sp, ap); - break; + case dspWrite: + if ((error = adspWrite(sp, ap))) + gbuf_freem(mp); + return(STR_IGNORE); + case dspAttention: + if ((error = adspAttention(sp, ap))) + gbuf_freem(mp); + return(STR_IGNORE); } - } - if (error) - adspioc_ack(error, mp, gref); /* if this failed req complete */ - return(STR_IGNORE); - case ADSPCLOSE: - ap->csCode = dspClose; - if ((error = adspClose(sp, ap))) { - adspioc_ack(error, mp, gref); - break; - } - break; - case ADSPCLREMOVE: - ap->csCode = dspCLRemove; - error = adspClose(sp, ap); - adspioc_ack(error, mp, gref); - return(STR_IGNORE); - case ADSPCLDENY: - ap->csCode = dspCLDeny; - if ((error = adspCLDeny(sp, ap))) { - adspioc_ack(error, mp, gref); - } - return(STR_IGNORE); - case ADSPSTATUS: - ap->csCode = dspStatus; - if ((error = adspStatus(sp, ap))) { - adspioc_ack(error, mp, gref); - } - return(STR_IGNORE); - case ADSPREAD: - ap->csCode = dspRead; - if ((error = adspRead(sp, ap))) { - adspioc_ack(error, mp, gref); - } - return(STR_IGNORE); - case ADSPATTENTION: - ap->csCode = dspAttention; - if ((error = adspReadAttention(sp, ap))) { - adspioc_ack(error, mp, gref); - } - return(STR_IGNORE); - case ADSPOPTIONS: - ap->csCode = dspOptions; - if ((error = adspOptions(sp, ap))) { - adspioc_ack(error, mp, gref); - } - return(STR_IGNORE); - case ADSPRESET: - ap->csCode = dspReset; - 
if ((error = adspReset(sp, ap))) { - adspioc_ack(error, mp, gref); - } - return(STR_IGNORE); - case ADSPNEWCID: - ap->csCode = dspNewCID; - if ((error = adspNewCID(sp, ap))) { - adspioc_ack(error, mp, gref); - } - return(STR_IGNORE); - default: - return(STR_PUTNEXT); /* pass it on down */ - } - return(STR_IGNORE); - case MSG_PROTO: - default: - gbuf_freem(mp); + case MSG_IOCTL: + if (gref->info == 0) { + adspioc_ack(EPROTOTYPE, mp, gref); + return(STR_IGNORE); + } + iocbp = (ioc_t *) gbuf_rptr(mp); + if (ADSP_IOCTL(iocbp->ioc_cmd)) { + iocbp->ioc_count = sizeof(*ap) - 1; + if (gbuf_cont(mp) == 0) { + adspioc_ack(EINVAL, mp, gref); + return(STR_IGNORE); + } + ap = (struct adspcmd *) gbuf_rptr(gbuf_cont(mp)); + ap->gref = gref; + ap->ioc = (caddr_t) mp; + ap->mp = gbuf_cont(mp); /* request head */ + ap->ioResult = 0; + + if ((gref->info == 0) && ((iocbp->ioc_cmd != ADSPOPEN) && + (iocbp->ioc_cmd != ADSPCLLISTEN))) { + ap->ioResult = errState; + + adspioc_ack(EINVAL, mp, gref); + return(STR_IGNORE); + } + } else + return(STR_PUTNEXT); /* pass it on down */ + sp = (void *)gbuf_rptr(((gbuf_t *)gref->info)); + switch(iocbp->ioc_cmd) { + case ADSPOPEN: + case ADSPCLLISTEN: + ap->socket = ((CCBPtr)sp)->localSocket; + flag = (adspMode(ap) == ocAccept) ? 1 : 0; + if (flag && ap->socket) { + if (adspDeassignSocket((CCBPtr)sp) >= 0) + ap->socket = 0; + } + if ((ap->socket == 0) && + ((ap->socket = + (at_socket)adspAssignSocket(gref, flag)) == 0)) { + adspioc_ack(EADDRNOTAVAIL, mp, gref); + return(STR_IGNORE); + } + ap->csCode = iocbp->ioc_cmd == ADSPOPEN ? dspInit : dspCLInit; + if ((error = adspInit(sp, ap)) == 0) { + switch(ap->csCode) { + case dspInit: + /* and open the connection */ + ap->csCode = dspOpen; + error = adspOpen(sp, ap); + break; + case dspCLInit: + /* ADSPCLLISTEN */ + ap->csCode = dspCLListen; + error = adspCLListen(sp, ap); + break; + } + } + if (error) + adspioc_ack(error, mp, gref); /* if this failed req complete */ + return(STR_IGNORE); + case ADSPCLOSE: + ap->csCode = dspClose; + if ((error = adspClose(sp, ap))) { + adspioc_ack(error, mp, gref); + break; + } + break; + case ADSPCLREMOVE: + ap->csCode = dspCLRemove; + error = adspClose(sp, ap); + adspioc_ack(error, mp, gref); + return(STR_IGNORE); + case ADSPCLDENY: + ap->csCode = dspCLDeny; + if ((error = adspCLDeny(sp, ap))) { + adspioc_ack(error, mp, gref); + } + return(STR_IGNORE); + case ADSPSTATUS: + ap->csCode = dspStatus; + if ((error = adspStatus(sp, ap))) { + adspioc_ack(error, mp, gref); + } + return(STR_IGNORE); + case ADSPREAD: + ap->csCode = dspRead; + if ((error = adspRead(sp, ap))) { + adspioc_ack(error, mp, gref); + } + return(STR_IGNORE); + case ADSPATTENTION: + ap->csCode = dspAttention; + if ((error = adspReadAttention(sp, ap))) { + adspioc_ack(error, mp, gref); + } + return(STR_IGNORE); + case ADSPOPTIONS: + ap->csCode = dspOptions; + if ((error = adspOptions(sp, ap))) { + adspioc_ack(error, mp, gref); + } + return(STR_IGNORE); + case ADSPRESET: + ap->csCode = dspReset; + if ((error = adspReset(sp, ap))) { + adspioc_ack(error, mp, gref); + } + return(STR_IGNORE); + case ADSPNEWCID: + ap->csCode = dspNewCID; + if ((error = adspNewCID(sp, ap))) { + adspioc_ack(error, mp, gref); + } + return(STR_IGNORE); + default: + return(STR_PUTNEXT); /* pass it on down */ + } + return(STR_IGNORE); + case MSG_PROTO: + default: + gbuf_freem(mp); } } diff --git a/bsd/netat/adsp.h b/bsd/netat/adsp.h index bf40ffbd5..315a7decc 100644 --- a/bsd/netat/adsp.h +++ b/bsd/netat/adsp.h @@ -31,6 +31,10 @@ #ifndef _NETAT_ADSP_H_ #define 
_NETAT_ADSP_H_ #include <sys/appleapiopts.h> +#include <netat/appletalk.h> + +#ifdef __APPLE_API_OBSOLETE + /* ADSP flags for read, write, and close routines */ #define ADSP_EOM 0x01 /* Sent or received EOM with data */ @@ -269,7 +273,7 @@ struct tpb { #endif */ -typedef long (*ProcPtr)(); +typedef long (*ProcPtr)(); /* XXX */ typedef ProcPtr *ProcHandle; typedef char *Ptr; typedef Ptr *Handle; @@ -663,8 +667,7 @@ typedef struct { #define ADSPGETSOCK ((AT_MID_ADSP<<8) | 239) #define ADSPGETPEER ((AT_MID_ADSP<<8) | 238) -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE /* from h/adsp_adsp.h */ @@ -674,19 +677,22 @@ typedef struct { #define STR_PUTBACK 2 #define STR_QTIME (HZ >> 3) -extern int adspInit(); -extern int adspOpen(); -extern int adspCLListen(); -extern int adspClose(); -extern int adspCLDeny(); -extern int adspStatus(); -extern int adspRead(); -extern int adspWrite(); -extern int adspAttention(); -extern int adspOptions(); -extern int adspReset(); -extern int adspNewCID(); -extern int adspPacket(); +struct ccb; +#define CCBPtr struct ccb * +extern int adspInit(CCBPtr sp, struct adspcmd *ap); +extern int adspOpen(register CCBPtr sp, register struct adspcmd *pb); +extern int adspCLListen( register CCBPtr sp, register struct adspcmd *pb); +extern int adspClose(register CCBPtr sp, register struct adspcmd *pb); +extern int adspCLDeny(struct adspcmd *pb, CCBPtr sp); +extern int adspStatus(CCBPtr sp, register struct adspcmd *pb); +extern int adspRead(register CCBPtr sp, register struct adspcmd *pb); +extern int adspWrite(CCBPtr sp, struct adspcmd *pb); +extern int adspAttention(register struct adspcmd *pb, register CCBPtr sp); +extern int adspOptions(CCBPtr sp, struct adspcmd *pb); +extern int adspReset(CCBPtr sp, struct adspcmd *pb); +extern int adspNewCID(CCBPtr sp, struct adspcmd *pb); +extern int adspPacket(gref_t *gref, gbuf_t *mp); +#undef CCBPtr struct adsp_debug { @@ -701,6 +707,6 @@ struct adsp_debug { int ad_sendWdwSeq; }; -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_ADSP_H_ */ diff --git a/bsd/netat/adsp_CLDeny.c b/bsd/netat/adsp_CLDeny.c index 9d6a9b7e7..a7e14c4b0 100644 --- a/bsd/netat/adsp_CLDeny.c +++ b/bsd/netat/adsp_CLDeny.c @@ -62,9 +62,7 @@ * errState not a connection listener * errAborted request aborted by a Remove call */ -int adspCLDeny(sp, pb) /* (DSPPBPtr pb) */ - struct adspcmd *pb; - CCBPtr sp; +int adspCLDeny(struct adspcmd *pb, CCBPtr sp) { gbuf_t *mp; ADSP_FRAMEPtr adspp; diff --git a/bsd/netat/adsp_Close.c b/bsd/netat/adsp_Close.c index 684c47204..179d60e96 100644 --- a/bsd/netat/adsp_Close.c +++ b/bsd/netat/adsp_Close.c @@ -222,7 +222,7 @@ int AbortIO(sp, err) /* * Complete all outstanding transactions. */ - total += CompleteQueue(&sp->sapb, err); /* Abort outstanding send attentions */ + total = CompleteQueue(&sp->sapb, err); /* Abort outstanding send attentions */ CompleteQueue(&sp->frpb, err); /* Abort outstanding forward resets */ if (sp->sbuf_mb) { /* clear the send queue */ diff --git a/bsd/netat/adsp_Control.c b/bsd/netat/adsp_Control.c index 69221b4fb..fe9d6803a 100644 --- a/bsd/netat/adsp_Control.c +++ b/bsd/netat/adsp_Control.c @@ -165,7 +165,7 @@ top: gbuf_wset(mp,DDPL_FRAME_LEN); /* leave room for DDP header */ if (sp->sendCtl) { - short mask; + short mask = 0; i = sp->sendCtl; /* get local copy bitmap of */ /* which ctl packets to send. 
*/ diff --git a/bsd/netat/adsp_Timer.c b/bsd/netat/adsp_Timer.c index 94127cf21..d528e7e3a 100644 --- a/bsd/netat/adsp_Timer.c +++ b/bsd/netat/adsp_Timer.c @@ -179,9 +179,9 @@ send: void TimerTick_funnel(void *arg) { - thread_funnel_set(network_flock, TRUE); + atalk_lock(); TimerTick(); - thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } static StopTimer; diff --git a/bsd/netat/adsp_attention.c b/bsd/netat/adsp_attention.c index 18ce61f65..8a0510ac4 100644 --- a/bsd/netat/adsp_attention.c +++ b/bsd/netat/adsp_attention.c @@ -70,9 +70,7 @@ * errAttention attention message too long * errAborted request aborted by Remove or Close call */ -int adspAttention(sp, pb) /* (DSPPBPtr pb) */ - register struct adspcmd *pb; - register CCBPtr sp; +int adspAttention(register struct adspcmd *pb, register CCBPtr sp) { int s; register gbuf_t *mp, *nmp; diff --git a/bsd/netat/adsp_internal.h b/bsd/netat/adsp_internal.h index 268cbe068..e5a48476d 100644 --- a/bsd/netat/adsp_internal.h +++ b/bsd/netat/adsp_internal.h @@ -21,12 +21,11 @@ */ #ifndef _NETAT_ADSP_INTERNAL_H_ #define _NETAT_ADSP_INTERNAL_H_ -#include <sys/appleapiopts.h> #include <sys/types.h> -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef __APPLE_API_OBSOLETE +#ifdef KERNEL_PRIVATE /* from h/adsp_portab.h */ @@ -141,9 +140,8 @@ typedef struct TNetworkTransition { ProcPtr netValidProc; /* pointer to the network valid procedure */ } TNetworkTransition, *TPNetworkTransition; -typedef long (*NetworkTransitionProcPtr)(); - /* (TPNetworkTransition nettrans, - unsigned long thenet); */ +typedef long (*NetworkTransitionProcPtr)(TPNetworkTransition nettrans, + unsigned long thenet); /* * This is the connection control block */ @@ -309,34 +307,33 @@ typedef struct { /* fron h/adsp_supp.h */ -void CallUserRoutine(); /* (CCB FPTR sp); */ +void CallUserRoutine(CCBPtr sp); /* (CCB FPTR sp); */ /* * Add queue element to end of queue. Pass Address of ptr to * 1st element of queue +int qAddToEnd(struct qlink **qhead, struct qlink *qelem); */ -int qAddToEnd(); /* (void FPTR FPTR qhead, void FPTR qelem); */ + /* (void FPTR FPTR qhead, void FPTR qelem); */ /* * Hunt down a linked list of queue elements looking for an element with * 'data' at 'offset' bytes into the queue element. 
*/ -void *qfind_b(); /* (void *qhead, word offset, word data); */ -void *qfind_w(); /* (void *qhead, word offset, word data); */ -void *qfind_p(); /* (void *qhead, word offset, void *ptr); */ -void *qfind_o(); /* (void *qhead, word offset, void *ptr); */ -void *qfind_m(); /* (void *qhead, void *match, - ProcPtr compare_fnx); */ +void *qfind_b(void *qhead, word offset, word data); +void *qfind_w(void *qhead, word offset, word data); +void *qfind_p(void *qhead, word offset, void *ptr); +void *qfind_o(void *qhead, word offset, void *ptr); +void *qfind_m(CCBPtr qhead, void *match, ProcPtr compare_fnx); /* * Routines to handle sorted timer queues */ -void InsertTimerElem(); /* (TimerElemPtr *qhead, TimerElemPtr t, - word val); */ -void RemoveTimerElem(); /* (TimerElemPtr *qhead, TimerElemPtr t); */ -void TimerQueueTick(); /* (TimerElemPtr *qhead);*/ +void InsertTimerElem(TimerElemPtr *qhead, TimerElemPtr t, int val); +void RemoveTimerElem(TimerElemPtr *qhead, TimerElemPtr t); +void TimerQueueTick(TimerElemPtr *qhead); /* from h/adsp_global.h */ @@ -355,7 +352,7 @@ extern GLOBAL adspGlobal; /* Address of ptr to list of ccb's */ #define AT_ADSP_STREAMS ((CCB **)&(adspGlobal.ccbList)) -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_ADSP_INTERNAL_H_ */ diff --git a/bsd/netat/appletalk.h b/bsd/netat/appletalk.h index 7fbc4e38e..615164c37 100644 --- a/bsd/netat/appletalk.h +++ b/bsd/netat/appletalk.h @@ -39,6 +39,8 @@ #include <sys/types.h> #include <sys/uio.h> +#ifdef __APPLE_API_OBSOLETE + /* Non-aligned types are used in packet headers. */ @@ -281,19 +283,18 @@ typedef struct at_state { #define AT_ST_ZT_CHANGED 0x0800 /* zone table changed (for SNMP) */ #define AT_ST_NBP_CHANGED 0x1000 /* if nbp table changed (for SNMP)*/ -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE extern at_state_t at_state; /* global state of AT network */ #define ROUTING_MODE (at_state.flags & AT_ST_ROUTER) #define MULTIHOME_MODE (at_state.flags & AT_ST_MULTIHOME) #define MULTIPORT_MODE (ROUTING_MODE || MULTIHOME_MODE) -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ /* defines originally from h/at_elap.h */ #define AT_ADDR 0 #define ET_ADDR 1 #define AT_ADDR_NO_LOOP 2 /* disables packets from looping back */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_APPLETALK_H_ */ diff --git a/bsd/netat/asp.h b/bsd/netat/asp.h index a18a74d85..fd7d85c24 100644 --- a/bsd/netat/asp.h +++ b/bsd/netat/asp.h @@ -32,6 +32,8 @@ #define _NETAT_ASP_H_ #include <sys/appleapiopts.h> +#ifdef __APPLE_API_OBSOLETE + #define ASP_Version 0x100 #define ASPFUNC_CloseSess 1 @@ -125,8 +127,7 @@ union asp_primitives { asp_command_ind_t CommandInd; }; -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE #define ASPSTATE_Close 0 #define ASPSTATE_Idle 1 @@ -214,6 +215,6 @@ typedef struct asp_scb { atevent_t delay_event; } asp_scb_t; -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_ASP_H_ */ diff --git a/bsd/netat/asp_proto.c b/bsd/netat/asp_proto.c index bcaef1ac3..265de3369 100644 --- a/bsd/netat/asp_proto.c +++ b/bsd/netat/asp_proto.c @@ -40,6 +40,7 @@ #include <sys/mbuf.h> #include <sys/ioctl.h> #include <sys/malloc.h> +#include <kern/locks.h> #include <sys/socket.h> #include <sys/socketvar.h> @@ -81,7 +82,7 @@ void asp_init(); void asp_ack_reply(); void asp_nak_reply(); void asp_clock(); -void 
asp_clock_funnel(void *); +void asp_clock_locked(void *); int asp_open(); int asp_close(); int asp_wput(); @@ -97,13 +98,14 @@ StaticProc void asp_timout(); StaticProc void asp_untimout(); StaticProc void asp_hangup(); StaticProc void asp_send_tickle(); -StaticProc void asp_send_tickle_funnel(void *); +StaticProc void asp_send_tickle_locked(void *); StaticProc void asp_accept(); StaticProc int asp_send_req(); extern at_ifaddr_t *ifID_home; extern int atp_pidM[]; extern gref_t *atp_inputQ[]; +extern lck_mtx_t *atalk_mutex; gbuf_t *scb_resource_m = 0; unsigned char asp_inpC[256]; asp_scb_t *asp_scbQ[256]; @@ -265,7 +267,7 @@ asp_close(gref) */ scb->tmo_cnt = 0; asp_untimout(asp_hangup, scb); - untimeout(asp_send_tickle_funnel, (void *)scb); /* added for 2225395 */ + untimeout(asp_send_tickle_locked, (void *)scb); /* added for 2225395 */ /* * free the asp session control block @@ -493,7 +495,7 @@ int asp_wput(gref, m) case ASPIOC_GetLocEntity: if ((gbuf_cont(mioc) == 0) || (scb->atp_state == 0)) { - asp_iocnak(gref, mioc, EPROTO); + asp_iocnak(gref, mioc, EPROTOTYPE); return 0; } *(at_inet_t *)gbuf_rptr(gbuf_cont(mioc)) = scb->loc_addr; @@ -501,7 +503,7 @@ int asp_wput(gref, m) case ASPIOC_GetRemEntity: if ((gbuf_cont(mioc) == 0) || (scb->atp_state == 0)) { - asp_iocnak(gref, mioc, EPROTO); + asp_iocnak(gref, mioc, EPROTOTYPE); return 0; } *(at_inet_t *)gbuf_rptr(gbuf_cont(mioc)) = scb->rem_addr; @@ -509,7 +511,7 @@ int asp_wput(gref, m) case ASPIOC_GetSession: if ((mdata = gbuf_cont(mioc)) == 0) { - asp_iocnak(gref, mioc, EPROTO); + asp_iocnak(gref, mioc, EPROTOTYPE); return 0; } addr = (at_inet_t *)gbuf_rptr(mdata); @@ -518,11 +520,11 @@ int asp_wput(gref, m) server_scb = asp_scbQ[addr->socket]; /*### LD 10/28/97: changed to make sure we're not accessing a null server_scb */ if (server_scb == 0) { - asp_iocnak(gref, mioc, EPROTO); + asp_iocnak(gref, mioc, EPROTOTYPE); return 0; } if (server_scb->sess_ioc == 0) { - asp_iocnak(gref, mioc, EPROTO); + asp_iocnak(gref, mioc, EPROTOTYPE); return 0; } @@ -774,15 +776,15 @@ asp_send_req(gref, mioc, dest, retry, awp, xo, state, bitmap) } /* - * send tickle routine - funnelled version + * send tickle routine - locked version */ StaticProc void -asp_send_tickle_funnel(scb) +asp_send_tickle_locked(scb) void *scb; { - thread_funnel_set(network_flock, TRUE); + atalk_lock(); asp_send_tickle((asp_scb_t *)scb); - thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } @@ -810,7 +812,7 @@ asp_send_tickle(scb) dPrintf(D_M_ASP, D_L_WARNING, ("asp_send_tickle: ENOBUFS 0, loc=%d, rem=%d\n", scb->loc_addr.socket,scb->rem_addr.socket)); - timeout(asp_send_tickle_funnel, (void *)scb, 10); + timeout(asp_send_tickle_locked, (void *)scb, 10); return; } gbuf_wset(mioc,sizeof(ioc_t)); @@ -832,7 +834,7 @@ asp_send_tickle(scb) ("asp_send_tickle: ENOBUFS 1, loc=%d, rem=%d\n", scb->loc_addr.socket,scb->rem_addr.socket)); - timeout(asp_send_tickle_funnel, (void *)scb, 10); + timeout(asp_send_tickle_locked, (void *)scb, 10); return; } } @@ -893,14 +895,14 @@ asp_accept(scb, sess_scb, m) } /* asp_accept */ /* - * timer routine - funneled version + * timer routine - locked version */ -void asp_clock_funnel(arg) +void asp_clock_locked(arg) void *arg; { - thread_funnel_set(network_flock, TRUE); + atalk_lock(); asp_clock(arg); - thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } /* @@ -929,7 +931,7 @@ void asp_clock(arg) ATENABLE(s, asptmo_lock); if (++scb_tmo_cnt == 0) scb_tmo_cnt++; - timeout(asp_clock_funnel, (void *)arg, (1<<SESS_TMO_RES)*TICKS_PER_SEC); + 
timeout(asp_clock_locked, (void *)arg, (1<<SESS_TMO_RES)*TICKS_PER_SEC); } @@ -1520,7 +1522,7 @@ asp_nak_reply(gref, mioc) /* last remaining use of MSG_ERROR */ gbuf_set_type(mioc, MSG_ERROR); - *gbuf_rptr(mioc) = (u_char)EPROTO; + *gbuf_rptr(mioc) = (u_char)EPROTOTYPE; gbuf_wset(mioc, 1); if (gbuf_cont(mioc)) { gbuf_freem(gbuf_cont(mioc)); @@ -1953,9 +1955,9 @@ int ASPputmsg(gref_t *gref, strbuf_t *ctlptr, strbuf_t *datptr, gbuf_t *mreq, in bcopy (datptr, &datbuf, sizeof (strbuf_t)); } else { /* being called from user space */ - if ((err = copyin((caddr_t)ctlptr, (caddr_t)&ctlbuf, sizeof(ctlbuf))) != 0) + if ((err = copyin(CAST_USER_ADDR_T(ctlptr), (caddr_t)&ctlbuf, sizeof(ctlbuf))) != 0) goto l_err; - if ((err = copyin((caddr_t)datptr, (caddr_t)&datbuf, sizeof(datbuf))) != 0) + if ((err = copyin(CAST_USER_ADDR_T(datptr), (caddr_t)&datbuf, sizeof(datbuf))) != 0) goto l_err; } @@ -1975,7 +1977,7 @@ int ASPputmsg(gref_t *gref, strbuf_t *ctlptr, strbuf_t *datptr, gbuf_t *mreq, in bcopy (ctlbuf.buf, gbuf_rptr(mioc), ctlbuf.len); } else { /* being called from user space */ - if ((err = copyin((caddr_t)ctlbuf.buf, (caddr_t)gbuf_rptr(mioc), ctlbuf.len)) != 0) { + if ((err = copyin(CAST_USER_ADDR_T(ctlbuf.buf), (caddr_t)gbuf_rptr(mioc), ctlbuf.len)) != 0) { gbuf_freem(mioc); goto l_err; } @@ -2028,7 +2030,7 @@ int ASPputmsg(gref_t *gref, strbuf_t *ctlptr, strbuf_t *datptr, gbuf_t *mreq, in remain -= copy_len; if (mreq != NULL) bcopy (dataptr, (gbuf_rptr(mdata) + offset), copy_len); - else if ((err = copyin(dataptr, (caddr_t)(gbuf_rptr(mdata) + offset), copy_len)) != 0) { + else if ((err = copyin(CAST_USER_ADDR_T(dataptr), (caddr_t)(gbuf_rptr(mdata) + offset), copy_len)) != 0) { gbuf_freem(mioc); goto l_err; } @@ -2240,7 +2242,8 @@ int ASPgetmsg(gref_t *gref, strbuf_t *ctlptr, strbuf_t *datptr, gbuf_t **mreply, */ while ((mproto = scb->sess_ioc) == 0) { scb->get_wait = 1; - err = tsleep(&scb->event, PSOCK | PCATCH, "aspgetmsg", 0); + lck_mtx_assert(atalk_mutex, LCK_MTX_ASSERT_OWNED); + err = msleep(&scb->event, atalk_mutex, PSOCK | PCATCH, "aspgetmsg", 0); if (err != 0) { scb->get_wait = 0; ATENABLE(s, scb->lock); @@ -2278,10 +2281,10 @@ int ASPgetmsg(gref_t *gref, strbuf_t *ctlptr, strbuf_t *datptr, gbuf_t **mreply, bcopy (datptr, &datbuf, sizeof(datbuf)); } else { /* called from user space */ - if ((err = copyin((caddr_t)ctlptr, + if ((err = copyin(CAST_USER_ADDR_T(ctlptr), (caddr_t)&ctlbuf, sizeof(ctlbuf))) != 0) goto l_err; - if ((err = copyin((caddr_t)datptr, + if ((err = copyin(CAST_USER_ADDR_T(datptr), (caddr_t)&datbuf, sizeof(datbuf))) != 0) goto l_err; } @@ -2320,10 +2323,10 @@ int ASPgetmsg(gref_t *gref, strbuf_t *ctlptr, strbuf_t *datptr, gbuf_t **mreply, } else { /* called from user space */ if ((err = copyout((caddr_t)gbuf_rptr(mproto), - (caddr_t)ctlbuf.buf, ctlbuf.len)) != 0) + CAST_USER_ADDR_T(ctlbuf.buf), ctlbuf.len)) != 0) goto l_err; if ((err = copyout((caddr_t)&ctlbuf, - (caddr_t)ctlptr, sizeof(ctlbuf))) != 0) + CAST_USER_ADDR_T(ctlptr), sizeof(ctlbuf))) != 0) goto l_err; } @@ -2341,7 +2344,7 @@ int ASPgetmsg(gref_t *gref, strbuf_t *ctlptr, strbuf_t *datptr, gbuf_t **mreply, if (mreply == NULL) { /* called from user space */ - if ((err = copyout((caddr_t)gbuf_rptr(mdata), (caddr_t)&datbuf.buf[sum], len)) != 0) + if ((err = copyout((caddr_t)gbuf_rptr(mdata), CAST_USER_ADDR_T(&datbuf.buf[sum]), len)) != 0) goto l_err; } sum += len; @@ -2353,7 +2356,7 @@ int ASPgetmsg(gref_t *gref, strbuf_t *ctlptr, strbuf_t *datptr, gbuf_t **mreply, bcopy (&datbuf, datptr, sizeof(datbuf)); } else 
{ /* called from user space */ - if ((err = copyout((caddr_t)&datbuf, (caddr_t)datptr, sizeof(datbuf))) != 0) + if ((err = copyout((caddr_t)&datbuf, CAST_USER_ADDR_T(datptr), sizeof(datbuf))) != 0) goto l_err; } diff --git a/bsd/netat/at.c b/bsd/netat/at.c index 089fd3889..3909aa1d9 100644 --- a/bsd/netat/at.c +++ b/bsd/netat/at.c @@ -34,6 +34,7 @@ #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/file.h> +#include <sys/kauth.h> #include <net/if.h> #include <net/if_dl.h> @@ -124,7 +125,8 @@ static int set_zones(ifz) * ifp is 0 if not an interface-specific ioctl. */ -int at_control(so, cmd, data, ifp) +int +at_control(so, cmd, data, ifp) struct socket *so; u_long cmd; caddr_t data; @@ -137,15 +139,18 @@ int at_control(so, cmd, data, ifp) struct ifaddr *ifa; struct sockaddr_dl *sdl; - if (cmd == 0x2000ff99) { + if ((cmd & 0xffff) == 0xff99) { + u_long fixed_command; + char ioctl_buffer[32]; /* *** this is a temporary hack to get at_send_to_dev() to work with BSD-style sockets instead of the special purpose system calls, ATsocket() and ATioctl(). *** */ - if ((error = at_ioctl((struct atpcb *)so->so_pcb, cmd, data, 0))) { + fixed_command = _IOW(0, 0xff99, user_addr_t); + if ((error = at_ioctl((struct atpcb *)so->so_pcb, fixed_command, data, 0))) { if (((struct atpcb *)so->so_pcb)->proto != ATPROTO_LAP) { ((struct atpcb *)so->so_pcb)->proto = ATPROTO_LAP; - error = at_ioctl((struct atpcb *)so->so_pcb, cmd, data, 0); + error = at_ioctl((struct atpcb *)so->so_pcb, fixed_command, data , 0); } } return(error); @@ -216,7 +221,7 @@ int at_control(so, cmd, data, ifp) at_def_zone_t *defzonep = (at_def_zone_t *)data; /* check for root access */ - if (error = suser(p->p_ucred, &p->p_acflag)) + if (error = suser(kauth_cred_get(), 0)) return(EACCES); ifID = 0; @@ -326,7 +331,7 @@ int at_control(so, cmd, data, ifp) { at_nbp_reg_t *nbpP = (at_nbp_reg_t *)data; nve_entry_t nve; - int error; + int error2; if (!(at_state.flags & AT_ST_STARTED) || !ifID_home) return(ENOTREADY); @@ -382,14 +387,14 @@ int at_control(so, cmd, data, ifp) * this tuple in the registry and return ok response. 
*/ ATDISABLE(nve_lock, NVE_LOCK); - if ((error = nbp_new_nve_entry(&nve, ifID)) == 0) { + if ((error2 = nbp_new_nve_entry(&nve, ifID)) == 0) { nbpP->addr.net = ifID->ifThisNode.s_net; nbpP->addr.node = ifID->ifThisNode.s_node; nbpP->unique_nbp_id = nve.unique_nbp_id; } ATENABLE(nve_lock, NVE_LOCK); - return(error); + return(error2); break; } @@ -463,7 +468,7 @@ int at_control(so, cmd, data, ifp) at_router_params_t *rt = (at_router_params_t *)data; /* check for root access */ - if (error = suser(p->p_ucred, &p->p_acflag)) + if (error = suser(kauth_cred_get(), 0)) return(EACCES); /* when in routing/multihome mode the AIOCSETROUTER IOCTL @@ -503,7 +508,7 @@ int at_control(so, cmd, data, ifp) at_kern_err_t *keP = (at_kern_err_t *)data; /* check for root access */ - if (suser(p->p_ucred, &p->p_acflag)) + if (suser(kauth_cred_get(), 0)) return(EACCES); if (!(at_state.flags & AT_ST_STARTED)) @@ -534,7 +539,7 @@ int at_control(so, cmd, data, ifp) ret; /* check for root access */ - if (error = suser(p->p_ucred, &p->p_acflag)) + if (error = suser(kauth_cred_get(), 0)) return(EACCES); ret = ddp_shutdown(*count_only); @@ -561,7 +566,7 @@ int at_control(so, cmd, data, ifp) case SIOCSIFADDR: /* check for root access */ - if (error = suser(p->p_ucred, &p->p_acflag)) + if (error = suser(kauth_cred_get(), 0)) error = EACCES; else if (ifID) error = EEXIST; @@ -579,6 +584,7 @@ int at_control(so, cmd, data, ifp) ifID->aa_ifp = ifp; ifa = &ifID->aa_ifa; + ifnet_lock_exclusive(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if ((sdl = (struct sockaddr_dl *)ifa->ifa_addr) && (sdl->sdl_family == AF_LINK)) { @@ -599,14 +605,14 @@ int at_control(so, cmd, data, ifp) ifID->ifNodeAddress.sat_family = AF_APPLETALK; /* the address itself will be filled in when ifThisNode is set */ - s = splnet(); - TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); - splx(s); + if_attach_ifa(ifp, ifa); + ifnet_lock_done(ifp); switch (ifp->if_type) { case IFT_ETHER: - ether_attach_at(ifp, &ifID->at_dl_tag, - &ifID->aarp_dl_tag); + case IFT_L2VLAN: + case IFT_IEEE8023ADLAG: /* bonded ethernet */ + ether_attach_at(ifp); error = 0; ifID->cable_multicast_addr = etalk_multicast_addr; @@ -649,7 +655,7 @@ int at_control(so, cmd, data, ifp) deletion of interfaces *** */ case SIOCDIFADDR: /* check for root access */ - if (error = suser(p->p_ucred, &p->p_acflag)) + if (error = suser(kauth_cred_get(), 0)) error = EACCES; else if (!ifID) error = EINVAL; @@ -669,13 +675,11 @@ int at_control(so, cmd, data, ifp) /* let's make sure it's either -1 or a valid file descriptor */ if (cloned_fd != -1) { struct socket *cloned_so; - struct file *cloned_fp; - error = getsock(p->p_fd, cloned_fd, &cloned_fp); + error = file_socket(cloned_fd, &cloned_so); if (error){ splx(s); /* XXX */ break; } - cloned_so = (struct socket *)cloned_fp->f_data; clonedat_pcb = sotoatpcb(cloned_so); } else { clonedat_pcb = NULL; @@ -687,6 +691,7 @@ int at_control(so, cmd, data, ifp) at_pcb->ddp_flags = clonedat_pcb->ddp_flags; } splx(s); /* XXX */ + file_drop(cloned_fd); break; } diff --git a/bsd/netat/at_aarp.h b/bsd/netat/at_aarp.h index 80c8b88ec..61bd23db8 100644 --- a/bsd/netat/at_aarp.h +++ b/bsd/netat/at_aarp.h @@ -22,6 +22,12 @@ #ifndef _NETAT_AT_AARP_H_ #define _NETAT_AT_AARP_H_ #include <sys/appleapiopts.h> +#ifdef KERNEL_PRIVATE +#include <netat/at_var.h> +#endif KERNEL_PRIVATE + +#ifdef __APPLE_API_OBSOLETE + /* * Copyright (c) 1988, 1989 Apple Computer, Inc. 
*/ @@ -90,6 +96,8 @@ typedef struct { /* Errors returned by AARP routines */ #define AARP_ERR_NOT_OURS 1 /* not our appletalk address */ +#ifdef KERNEL_PRIVATE + /*************************************************/ /* Declarations for AARP Address Map Table (AMT) */ /*************************************************/ @@ -174,13 +182,10 @@ typedef struct { ) ? 1 : 0 \ ) -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE - int aarp_chk_addr(at_ddp_t *, at_ifaddr_t *); int aarp_rcv_pkt(aarp_pkt_t *, at_ifaddr_t *); -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_AT_AARP_H_ */ diff --git a/bsd/netat/at_config.h b/bsd/netat/at_config.h index b9f6abef3..0955e7457 100644 --- a/bsd/netat/at_config.h +++ b/bsd/netat/at_config.h @@ -27,6 +27,8 @@ #define _NETAT_AT_CONFIG_H_ #include <sys/appleapiopts.h> +#ifdef __APPLE_API_OBSOLETE + /* originally from if_cnt.h * * defines for if_stat struct. @@ -73,4 +75,5 @@ typedef struct if_zone { } if_zone_t; +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_AT_CONFIG_H_ */ diff --git a/bsd/netat/at_ddp_brt.h b/bsd/netat/at_ddp_brt.h index 0b40c9893..aab59d573 100644 --- a/bsd/netat/at_ddp_brt.h +++ b/bsd/netat/at_ddp_brt.h @@ -27,7 +27,8 @@ #ifndef _NETAT_AT_DDP_BRT_H_ #define _NETAT_AT_DDP_BRT_H_ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE +#ifdef __APPLE_API_OBSOLETE typedef struct { int age_flag; @@ -80,6 +81,7 @@ typedef struct { /* Best Router Cache */ extern ddp_brt_t at_ddp_brt[BRTSIZE]; -#endif /* __APPLE_API_PRIVATE */ +#endif /* __APPLE_API_OBSOLETE */ +#endif /* KERNEL_PRIVATE */ #endif /* _NETAT_AT_DDP_BRT_H_ */ diff --git a/bsd/netat/at_pat.h b/bsd/netat/at_pat.h index 107b3857d..99dc7ee29 100644 --- a/bsd/netat/at_pat.h +++ b/bsd/netat/at_pat.h @@ -28,6 +28,8 @@ #define _NETAT_AT_PAT_H_ #include <sys/appleapiopts.h> +#ifdef __APPLE_API_OBSOLETE + /* This is header for the PAT module. This contains a table of pointers that * should get initialized with the BNET stuff and the ethernet driver. The * number of interfaces supported should be communicated. Should include @@ -62,4 +64,5 @@ typedef struct { ((*((unsigned long *)(a1)) == *((unsigned long *)(a2))) && \ (a1[4] == a2[4]) \ ) +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_AT_PAT_H_ */ diff --git a/bsd/netat/at_pcb.c b/bsd/netat/at_pcb.c index 7c120650e..09b6801e2 100644 --- a/bsd/netat/at_pcb.c +++ b/bsd/netat/at_pcb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -130,9 +130,11 @@ int at_pcballoc(so, head) pcb->atpcb_head = head; pcb->atpcb_socket = so; + atalk_lock(); /* makes sure the list is locked while inserting atpcb */ if (head) insque((queue_t)pcb, (queue_t)head); so->so_pcb = (caddr_t)pcb; + atalk_unlock(); return (0); } @@ -149,9 +151,10 @@ int at_pcbdetach(pcb) } so->so_pcb = 0; + so->so_flags |= SOF_PCBCLEARING; if ((pcb->atpcb_next) && (pcb->atpcb_prev)) remque((queue_t)pcb); - zfree(atpcb_zone, (vm_offset_t)pcb); + zfree(atpcb_zone, pcb); sofree(so); return(0); } diff --git a/bsd/netat/at_pcb.h b/bsd/netat/at_pcb.h index 9a4d4297a..6d9dfb28b 100644 --- a/bsd/netat/at_pcb.h +++ b/bsd/netat/at_pcb.h @@ -60,7 +60,8 @@ /* at_pcb.h */ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE +#ifdef __APPLE_API_OBSOLETE +#ifdef KERNEL_PRIVATE /* * Common structure pcb for internet protocol implementation. 
* Here are stored pointers to local and foreign host table @@ -68,6 +69,8 @@ * up (to a socket structure) and down (to a protocol-specific) * control block. */ +struct atpcb; +typedef struct atpcb gref_t; struct atpcb { struct atpcb *atpcb_next, /* pointers to other pcb's */ *atpcb_prev, @@ -96,14 +99,13 @@ struct atpcb { atlock_t lock; atevent_t event; atevent_t iocevent; - int (*writeable)(); - int (*readable)(); + int (*writeable)(gref_t *gref); + int (*readable)(gref_t *gref); struct selinfo si; /* BSD 4.4 selinfo structure for selrecord/selwakeup */ }; #define sotoatpcb(so)((struct atpcb *)(so)->so_pcb) -#endif /* __APPLE_API_PRIVATE */ /* ddp_flags */ #define DDPFLG_CHKSUM 0x01 /* DDP checksums to be used on this connection */ @@ -112,12 +114,13 @@ struct atpcb { #define DDPFLG_HDRINCL 0x08 /* user supplies entire DDP header */ #define DDPFLG_STRIPHDR 0x200 /* drop DDP header on receive (raw) */ -#ifdef __APPLE_API_PRIVATE -#ifdef KERNEL -typedef struct atpcb gref_t; +int at_pcballoc(struct socket *, struct atpcb *); +int at_pcbdetach(struct atpcb *); +int at_pcbbind(struct atpcb *, struct sockaddr *); + +int atalk_getref(struct fileproc *, int , gref_t ** , struct proc *, int); +int atalk_getref_locked(struct fileproc *, int , gref_t ** , struct proc *, int); + -int at_pcballoc __P((struct socket *, struct atpcb *)); -int at_pcbdetach __P((struct atpcb *)); -int at_pcbbind __P((struct atpcb *, struct sockaddr *)); -#endif /* KERNEL */ -#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL_PRIVATE */ +#endif /* __APPLE_API_OBSOLETE */ diff --git a/bsd/netat/at_proto.c b/bsd/netat/at_proto.c index 3a8cb6055..d0e0934e9 100644 --- a/bsd/netat/at_proto.c +++ b/bsd/netat/at_proto.c @@ -36,6 +36,7 @@ #include <sys/protosw.h> #include <sys/domain.h> #include <sys/mbuf.h> +#include <kern/locks.h> #include <sys/sysctl.h> @@ -45,7 +46,6 @@ #include <netat/at_var.h> #include <netat/ddp.h> -struct domain atalkdomain; extern int ddp_pru_abort(struct socket *so); @@ -71,6 +71,7 @@ extern int ddp_pru_send(struct socket *so, int flags, struct mbuf *m, extern int ddp_pru_shutdown(struct socket *so); extern int ddp_pru_sockaddr(struct socket *so, struct sockaddr **nam); +void atalk_dominit(); /* * Dummy usrreqs struct created by Ted for FreeBSD 3.x integration. 
@@ -81,24 +82,64 @@ struct pr_usrreqs ddp_usrreqs = { ddp_pru_connect, pru_connect2_notsupp, ddp_pru_control, ddp_pru_detach, ddp_pru_disconnect, pru_listen_notsupp, ddp_pru_peeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, ddp_pru_send, pru_sense_null, ddp_pru_shutdown, - ddp_pru_sockaddr, sosend, soreceive, sopoll + ddp_pru_sockaddr, sosend, soreceive, pru_sopoll_notsupp }; +struct domain atalkdomain; struct protosw atalksw[] = { { SOCK_RAW, &atalkdomain, /*protocol*/ 0, PR_ATOMIC|PR_ADDR, /*input*/ 0, /*output*/ 0, /*clinput*/ 0, ddp_ctloutput, /*ousrreq*/ 0, ddp_init, /*fastto*/ 0, /*slowto*/ 0, /*drain*/ 0, - /*sysctl*/ 0, &ddp_usrreqs + /*sysctl*/ 0, &ddp_usrreqs, + 0, 0, 0 } }; struct domain atalkdomain = -{ AF_APPLETALK, "appletalk", 0, 0, 0, +{ AF_APPLETALK, "appletalk", atalk_dominit, 0, 0, atalksw, 0, 0, 0, 0, DDP_X_HDR_SIZE, 0 }; +struct domain * atalkdom = &atalkdomain; +lck_mtx_t *atalk_mutex = NULL; + SYSCTL_NODE(_net, PF_APPLETALK, appletalk, CTLFLAG_RW, 0, "AppleTalk Family"); +void +atalk_dominit() +{ + atalk_mutex = atalkdom->dom_mtx; +} + +void +atalk_lock() +{ + int error = 0, lr, lr_saved; +#ifdef __ppc__ + __asm__ volatile("mflr %0" : "=r" (lr)); + lr_saved = lr; +#endif + lck_mtx_assert(atalkdom->dom_mtx, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(atalkdom->dom_mtx); +} + +void +atalk_unlock() +{ + int error = 0, lr, lr_saved; +#ifdef __ppc__ + __asm__ volatile("mflr %0" : "=r" (lr)); + lr_saved = lr; +#endif + lck_mtx_assert(atalkdom->dom_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_unlock(atalkdom->dom_mtx); + +} + + + + + diff --git a/bsd/netat/at_snmp.h b/bsd/netat/at_snmp.h index 476e6d481..3ec477cac 100644 --- a/bsd/netat/at_snmp.h +++ b/bsd/netat/at_snmp.h @@ -23,6 +23,8 @@ #define _NETAT_AT_SNMP_H_ #include <sys/appleapiopts.h> +#ifdef __APPLE_API_OBSOLETE + #define MAX_PHYS_ADDR_SIZE 6 /* maximum physical addr size */ #define MAX_IFS 25 /* max # interfaces */ #define IF_NAME_SIZE 6 /* max name length of I/F name */ @@ -212,4 +214,5 @@ typedef struct snmpStats { #define SNMP_TYPE(var,type) ((var & SNMP_OBJ_TYPE_MASK) == type) +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_AT_SNMP_H_ */ diff --git a/bsd/netat/at_var.h b/bsd/netat/at_var.h index bd8a4f82e..9eb7c5795 100644 --- a/bsd/netat/at_var.h +++ b/bsd/netat/at_var.h @@ -23,8 +23,11 @@ * Copyright (c) 1998 Apple Computer, Inc. */ +#ifndef _NETAT_AT_VAR_H_ +#define _NETAT_AT_VAR_H_ + #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE +#ifdef __APPLE_API_OBSOLETE #include <sys/queue.h> /* at_var.h */ @@ -37,20 +40,17 @@ #define MCAST_TRACK_DELETE 2 #define MCAST_TRACK_CHECK 3 -#define ETHERNET_ADDR_LEN 6 -#define IFNAMESIZ 16 - /* maximum number of I/F's allowed */ #define IF_TOTAL_MAX 17 /* max count of any combination of I/F's */ /* 17 == (1+(4*4)); 9 and 13 would also be reasonable values */ #define FDDI_OR_TOKENRING(i) ((i == IFT_FDDI) || (i == IFT_ISO88025)) - +#define ETHERNET_ADDR_LEN 6 +#define IFNAMESIZ 16 typedef struct etalk_addr { u_char etalk_addr_octet[ETHERNET_ADDR_LEN]; } etalk_addr_t; - typedef char if_name_t[IFNAMESIZ]; typedef struct at_ifname_list { if_name_t at_if[IF_TOTAL_MAX]; @@ -118,6 +118,7 @@ typedef struct { short router_mix; } at_router_params_t; + typedef struct at_kern_err { int error; /* kernel error # (KE_xxx) */ int port1; @@ -143,6 +144,7 @@ typedef struct at_kern_err { #define KE_RTMP_OVERFLOW 10 #define KE_ZIP_OVERFLOW 11 +#ifdef KERNEL_PRIVATE /* * Interface address, AppleTalk version. One of these structures * is allocated for each AppleTalk address on an interface. 
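atalk_dominit(), atalk_lock(), and atalk_unlock() in the at_proto.c hunks above are the replacement for the old network-funnel brackets used throughout bsd/netat: the domain init hook caches atalkdomain's dom_mtx, and the lock/unlock pair asserts ownership state before taking or releasing it. A minimal usage sketch of the convention the rest of this patch follows; the function name is hypothetical, only the two calls and the assertions come from the patch:

/* Sketch only: any netat path that formerly ran under
 * thread_funnel_set(network_flock, ...) now looks like this. */
static void
example_netat_entry(void)	/* hypothetical name, for illustration */
{
	atalk_lock();	/* asserts LCK_MTX_ASSERT_NOTOWNED, then lck_mtx_lock() */
	/* ... touch AppleTalk state: pcb queues, routing tables, timers ... */
	atalk_unlock();	/* asserts LCK_MTX_ASSERT_OWNED, then lck_mtx_unlock() */
}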
@@ -181,6 +183,7 @@ typedef struct at_ifaddr { /* for use by ZIP */ u_char ifNumRetries; + u_char ifGNIScheduled; /* to keep getnetinfo from being scheduled more than once */ at_nvestr_t ifZoneName; /* Added for routing support */ @@ -213,6 +216,7 @@ typedef struct at_ifaddr { middle of an elap_online operation */ } at_ifaddr_t; +#endif /* KERNEL_PRIVATE */ #define LAP_OFFLINE 0 /* LAP_OFFLINE MUST be 0 */ #define LAP_ONLINE 1 @@ -269,16 +273,16 @@ typedef struct at_ifaddr { #define ELAP_CFG_HOME 0x02 /* designate home port (one allowed) */ #define ELAP_CFG_SEED 0x08 /* set if it's a seed port */ -#ifdef KERNEL +#ifdef KERNEL_PRIVATE extern TAILQ_HEAD(at_ifQueueHd, at_ifaddr) at_ifQueueHd; -int at_control __P((struct socket *, u_long, caddr_t, struct ifnet *)); -int ddp_usrreq __P((struct socket *, int, struct mbuf *, struct mbuf *, - struct mbuf *)); -int ddp_ctloutput __P((struct socket *, struct sockopt *)); -void ddp_init __P((void));; -void ddp_slowtimo __P((void)); -#endif +int at_control(struct socket *, u_long, caddr_t, struct ifnet *); +int ddp_usrreq(struct socket *, int, struct mbuf *, struct mbuf *, + struct mbuf *); +int ddp_ctloutput(struct socket *, struct sockopt *); +void ddp_init(void); +void ddp_slowtimo(void); +#endif /* KERNEL_PRIVATE */ /* * Define AppleTalk event subclass and specific AppleTalk events. @@ -302,7 +306,13 @@ struct kev_atalk_data { } node_data; }; +#ifdef KERNEL_PRIVATE + void atalk_post_msg(struct ifnet *ifp, u_long event_code, struct at_addr *address, at_nvestr_t *zone); void aarp_sched_probe(void *); +void atalk_lock(); +void atalk_unlock(); -#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL_PRIVATE */ +#endif /* __APPLE_API_OBSOLETE */ +#endif /* _NETAT_AT_VAR_H_ */ diff --git a/bsd/netat/atp.h b/bsd/netat/atp.h index cc590c659..421c265f0 100644 --- a/bsd/netat/atp.h +++ b/bsd/netat/atp.h @@ -48,6 +48,8 @@ #define _NETAT_ATP_H_ #include <sys/appleapiopts.h> +#ifdef __APPLE_API_OBSOLETE + /* ATP function codes */ #define ATP_CMD_TREQ 0x01 /* TRequest packet */ @@ -184,8 +186,7 @@ typedef struct { #define ATP_SENDRESPONSE 2 #define ATP_GETREQUEST 3 -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE /* @@ -456,6 +457,6 @@ void atp_timout(void (*func)(), struct atp_trans *, int); void atp_untimout(void (*func)(), struct atp_trans *); int atp_tid(struct atp_state *); -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_ATP_H_ */ diff --git a/bsd/netat/atp_misc.c b/bsd/netat/atp_misc.c index f8d34a439..91fecf154 100644 --- a/bsd/netat/atp_misc.c +++ b/bsd/netat/atp_misc.c @@ -277,13 +277,15 @@ void atp_rcb_timer() register struct atp_rcb *next_rcbp; extern struct atp_rcb_qhead atp_need_rel; extern struct atp_trans *trp_tmo_rcb; + struct timeval timenow; l_again: ATDISABLE(s, atpgen_lock); + getmicrouptime(&timenow); for (rcbp = atp_need_rel.head; rcbp; rcbp = next_rcbp) { next_rcbp = rcbp->rc_tlist.next; - if (abs(time.tv_sec - rcbp->rc_timestamp) > 30) { + if (abs(timenow.tv_sec - rcbp->rc_timestamp) > 30) { ATENABLE(s, atpgen_lock); atp_rcb_free(rcbp); goto l_again; diff --git a/bsd/netat/atp_open.c b/bsd/netat/atp_open.c index 5271592ab..8f07c0332 100644 --- a/bsd/netat/atp_open.c +++ b/bsd/netat/atp_open.c @@ -140,7 +140,7 @@ int atp_open(gref) if (atp_rcb_data == NULL) { if (kmem_alloc(kernel_map, &temp, sizeof(struct atp_rcb) * NATP_RCB) != KERN_SUCCESS) return(ENOMEM); - if (atp_rcb_data == NULL) { /* in case we lost funnel while allocating */
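The atp_misc.c hunk above is one instance of a conversion repeated through these files: expiry checks no longer read the old global time variable but sample the clock locally with getmicrouptime(). A condensed sketch of the resulting idiom, reusing the fields and the 30-second release threshold from atp_rcb_timer(); the wrapper function is hypothetical:

static void
rcb_expiry_check(struct atp_rcb *rcbp)	/* hypothetical helper, for illustration */
{
	struct timeval timenow;

	getmicrouptime(&timenow);	/* per-call snapshot replaces the global `time` */
	if (abs(timenow.tv_sec - rcbp->rc_timestamp) > 30)
		atp_rcb_free(rcbp);	/* same 30s release timeout as atp_rcb_timer() */
}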
+ if (atp_rcb_data == NULL) { bzero((caddr_t)temp, sizeof(struct atp_rcb) * NATP_RCB); atp_rcb_data = (struct atp_rcb*)temp; for (i = 0; i < NATP_RCB; i++) { diff --git a/bsd/netat/atp_read.c b/bsd/netat/atp_read.c index 92f0ac6cd..25690a3c6 100644 --- a/bsd/netat/atp_read.c +++ b/bsd/netat/atp_read.c @@ -50,7 +50,7 @@ static void atp_trans_complete(); void atp_x_done(); -void atp_x_done_funnel(void *); +void atp_x_done_locked(void *); extern void atp_req_timeout(); /* @@ -63,9 +63,8 @@ void atp_treq_event(void *arg) register gref_t *gref = (gref_t *)arg; register gbuf_t *m; register struct atp_state *atp; - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); + atalk_lock(); atp = (struct atp_state *)gref->info; if (atp->dflag) atp = (struct atp_state *)atp->atp_msgq; @@ -86,7 +85,7 @@ void atp_treq_event(void *arg) if (m == 0) timeout(atp_treq_event, gref, 10); - (void) thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } void atp_rput(gref, m) @@ -97,6 +96,7 @@ gbuf_t *m; register struct atp_state *atp; register int s, s_gen; gbuf_t *m_asp = NULL; + struct timeval timenow; atp = (struct atp_state *)gref->info; if (atp->dflag) @@ -399,9 +399,10 @@ gbuf_t *m; * update the bitmap and resend * the replies */ + getmicrouptime(&timenow); ATDISABLE(s_gen, atpgen_lock); if (rcbp->rc_timestamp) { - rcbp->rc_timestamp = time.tv_sec; + rcbp->rc_timestamp = timenow.tv_sec; if (rcbp->rc_timestamp == 0) rcbp->rc_timestamp = 1; } @@ -455,12 +456,12 @@ gbuf_t *m; } /* atp_rput */ void -atp_x_done_funnel(trp) +atp_x_done_locked(trp) void *trp; { - thread_funnel_set(network_flock, TRUE); + atalk_lock(); atp_x_done((struct atp_trans *)trp); - (void) thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } @@ -491,7 +492,7 @@ register struct atp_trans *trp; atp = trp->tr_queue; trp->tr_state = TRANS_RELEASE; - timeout(atp_x_done_funnel, trp, 10); + timeout(atp_x_done_locked, trp, 10); } } } diff --git a/bsd/netat/atp_write.c b/bsd/netat/atp_write.c index 668fee473..865962ef6 100644 --- a/bsd/netat/atp_write.c +++ b/bsd/netat/atp_write.c @@ -38,6 +38,7 @@ #include <sys/proc.h> #include <sys/filedesc.h> #include <sys/fcntl.h> +#include <kern/locks.h> #include <sys/mbuf.h> #include <sys/ioctl.h> #include <sys/malloc.h> @@ -67,7 +68,7 @@ static int loop_cnt; /* for debugging loops */ static void atp_pack_bdsp(struct atp_trans *, struct atpBDS *); static int atp_unpack_bdsp(struct atp_state *, gbuf_t *, struct atp_rcb *, int, int); -void atp_trp_clock(), asp_clock(), asp_clock_funnel(), atp_trp_clock_funnel();; +void atp_trp_clock(), asp_clock(), asp_clock_locked(), atp_trp_clock_locked();; extern struct atp_rcb_qhead atp_need_rel; extern int atp_inited; @@ -82,6 +83,7 @@ extern gbuf_t *atp_resource_m; extern gref_t *atp_inputQ[]; extern int atp_pidM[]; extern at_ifaddr_t *ifID_home; +extern lck_mtx_t * atalk_mutex; static struct atp_trans *trp_tmo_list; struct atp_trans *trp_tmo_rcb; @@ -104,8 +106,8 @@ void atp_link() void atp_unlink() { - untimeout(asp_clock_funnel, (void *)&atp_inited); - untimeout(atp_trp_clock_funnel, (void *)&atp_inited); + untimeout(asp_clock_locked, (void *)&atp_inited); + untimeout(atp_trp_clock_locked, (void *)&atp_inited); atp_untimout(atp_rcb_timer, trp_tmo_rcb); trp_tmo_list = 0; @@ -464,6 +466,7 @@ void atp_send_replies(atp, rcbp) struct ddp_atp { char ddp_atp_hdr[TOTAL_ATP_HDR_SIZE]; }; + struct timeval timenow; ATDISABLE(s, atp->atp_lock); if (rcbp->rc_queue != atp) { @@ -501,6 +504,8 @@ void atp_send_replies(atp, rcbp) offset = 0; if 
(m0) space = gbuf_msgsize(m0); + else + space = 0; for (i = 0; i < cnt; i++) { if (rcbp->rc_snd[i] == 0) { if ((len = UAS_VALUE(bdsp->bdsBuffSz))) { @@ -583,9 +588,10 @@ nothing_to_send: * resources. */ if (rcbp->rc_xo && rcbp->rc_state != RCB_RELEASED) { + getmicrouptime(&timenow); ATDISABLE(s_gen, atpgen_lock); if (rcbp->rc_timestamp == 0) { - rcbp->rc_timestamp = time.tv_sec; + rcbp->rc_timestamp = timenow.tv_sec; if (rcbp->rc_timestamp == 0) rcbp->rc_timestamp = 1; ATP_Q_APPEND(atp_need_rel, rcbp, rc_tlist); @@ -638,7 +644,7 @@ atp_pack_bdsp(trp, bdsp) if (len > bufsize) len = bufsize; copyout((caddr_t)gbuf_rptr(m), - (caddr_t)&buf[tmp], + CAST_USER_ADDR_T(&buf[tmp]), len); bufsize -= len; tmp += len; @@ -687,6 +693,7 @@ atp_unpack_bdsp(atp, m, rcbp, cnt, wait) gbuf_t *rc_xmt[ATP_TRESP_MAX]; unsigned char *m0_rptr, *m0_wptr; int err, offset, space; + struct timeval timenow; /* * get the user data structure pointer @@ -821,9 +828,10 @@ atp_unpack_bdsp(atp, m, rcbp, cnt, wait) */ l_send: if (rcbp->rc_xo) { + getmicrouptime(&timenow); ATDISABLE(s_gen, atpgen_lock); if (rcbp->rc_timestamp == 0) { - if ((rcbp->rc_timestamp = time.tv_sec) == 0) + if ((rcbp->rc_timestamp = timenow.tv_sec) == 0) rcbp->rc_timestamp = 1; ATP_Q_APPEND(atp_need_rel, rcbp, rc_tlist); } @@ -1118,12 +1126,12 @@ atp_untimout(func, trp) } void -atp_trp_clock_funnel(arg) +atp_trp_clock_locked(arg) void *arg; { - thread_funnel_set(network_flock, TRUE); + atalk_lock(); atp_trp_clock(arg); - thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } void @@ -1149,7 +1157,7 @@ atp_trp_clock(arg) } ATENABLE(s, atptmo_lock); - timeout(atp_trp_clock_funnel, (void *)arg, (1<<5)); + timeout(atp_trp_clock_locked, (void *)arg, (1<<5)); } void @@ -1262,16 +1270,15 @@ void atp_retry_req(arg) { gbuf_t *m = (gbuf_t *)arg; gref_t *gref; - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); + atalk_lock(); gref = (gref_t *)((ioc_t *)gbuf_rptr(m))->ioc_private; if (gref->info) { ((asp_scb_t *)gref->info)->stat_msg = 0; atp_send_req(gref, m); } - (void) thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } void atp_send_rsp(gref, m, wait) @@ -1372,7 +1379,7 @@ int asp_pack_bdsp(trp, xm) gbuf_rinc(m, ATP_HDR_SIZE); if (UAL_VALUE(bdsp->bdsBuffAddr)) { - short tmp; + short tmp = 0; /* user expects data back */ m = gbuf_strip(m); @@ -1442,24 +1449,31 @@ _ATPsndreq(fd, buf, len, nowait, err, proc) gbuf_t *m2, *m, *mioc; char bds[atpBDSsize]; - if ((*err = atalk_getref(0, fd, &gref, proc)) != 0) + if ((*err = atalk_getref(0, fd, &gref, proc, 1)) != 0) return -1; if ((gref == 0) || ((atp = (struct atp_state *)gref->info) == 0) || (atp->atp_flags & ATP_CLOSING)) { dPrintf(D_M_ATP, D_L_ERROR, ("ATPsndreq: stale handle=0x%x, pid=%d\n", (u_int) gref, gref->pid)); - + file_drop(fd); *err = EINVAL; return -1; } + while ((mioc = gbuf_alloc(sizeof(ioc_t), PRI_MED)) == 0) { + struct timespec ts; + /* the value of 10 in terms of hz is 100ms */ + ts.tv_sec = 0; + ts.tv_nsec = 100 *1000 * NSEC_PER_USEC; + ATDISABLE(s, atp->atp_delay_lock); - rc = tsleep(&atp->atp_delay_event, PSOCK | PCATCH, "atpmioc", 10); + rc = msleep(&atp->atp_delay_event, atalk_mutex, PSOCK | PCATCH, "atpmioc", &ts); ATENABLE(s, atp->atp_delay_lock); if (rc != 0) { *err = rc; + file_drop(fd); return -1; } @@ -1467,21 +1481,28 @@ _ATPsndreq(fd, buf, len, nowait, err, proc) gbuf_wset(mioc,sizeof(ioc_t)); len -= atpBDSsize; while ((m2 = gbuf_alloc(len, PRI_MED)) == 0) { + struct timespec ts; + /* the value of 10 in terms of hz is 100ms */ + ts.tv_sec = 0; +
ts.tv_nsec = 100 *1000 * NSEC_PER_USEC; + ATDISABLE(s, atp->atp_delay_lock); - rc = tsleep(&atp->atp_delay_event, PSOCK | PCATCH, "atpm2", 10); + rc = msleep(&atp->atp_delay_event, atalk_mutex, PSOCK | PCATCH, "atpm2", &ts); ATENABLE(s, atp->atp_delay_lock); if (rc != 0) { gbuf_freeb(mioc); + file_drop(fd); *err = rc; return -1; } } gbuf_wset(m2, len); gbuf_cont(mioc) = m2; - if (((*err = copyin((caddr_t)buf, (caddr_t)bds, atpBDSsize)) != 0) - || ((*err = copyin((caddr_t)&buf[atpBDSsize], + if (((*err = copyin(CAST_USER_ADDR_T(buf), (caddr_t)bds, atpBDSsize)) != 0) + || ((*err = copyin(CAST_USER_ADDR_T(&buf[atpBDSsize]), (caddr_t)gbuf_rptr(m2), len)) != 0)) { gbuf_freem(mioc); + file_drop(fd); return -1; } gbuf_set_type(mioc, MSG_IOCTL); @@ -1503,11 +1524,17 @@ _ATPsndreq(fd, buf, len, nowait, err, proc) * allocate and set up the transaction record */ while ((trp = atp_trans_alloc(atp)) == 0) { + struct timespec ts; + /* the value of 10 in terms of hz is 100ms */ + ts.tv_sec = 0; + ts.tv_nsec = 100 *1000 * NSEC_PER_USEC; + ATDISABLE(s, atp->atp_delay_lock); - rc = tsleep(&atp->atp_delay_event, PSOCK | PCATCH, "atptrp", 10); + rc = msleep(&atp->atp_delay_event, atalk_mutex, PSOCK | PCATCH, "atptrp", &ts); ATENABLE(s, atp->atp_delay_lock); if (rc != 0) { gbuf_freem(mioc); + file_drop(fd); *err = rc; return -1; } @@ -1570,8 +1597,10 @@ _ATPsndreq(fd, buf, len, nowait, err, proc) if (m) DDP_OUTPUT(m); - if (nowait) + if (nowait) { + file_drop(fd); return (int)tid; + } /* * wait for the transaction to complete */ @@ -1580,10 +1609,11 @@ _ATPsndreq(fd, buf, len, nowait, err, proc) while ((trp->tr_state != TRANS_DONE) && (trp->tr_state != TRANS_FAILED) && (trp->tr_state != TRANS_ABORTING)) { trp->tr_rsp_wait = 1; - rc = tsleep(&trp->tr_event, PSOCK | PCATCH, "atpsndreq", 0); + rc = msleep(&trp->tr_event, atalk_mutex, PSOCK | PCATCH, "atpsndreq", 0); if (rc != 0) { trp->tr_rsp_wait = 0; ATENABLE(s, trp->tr_lock); + file_drop(fd); *err = rc; return -1; } @@ -1597,6 +1627,7 @@ _ATPsndreq(fd, buf, len, nowait, err, proc) * transaction timed out, return error */ atp_free(trp); + file_drop(fd); *err = ETIMEDOUT; return -1; } @@ -1609,9 +1640,10 @@ _ATPsndreq(fd, buf, len, nowait, err, proc) /* * copyout the result info */ - copyout((caddr_t)bds, (caddr_t)buf, atpBDSsize); + copyout((caddr_t)bds, CAST_USER_ADDR_T(buf), atpBDSsize); atp_free(trp); + file_drop(fd); return (int)tid; } /* _ATPsndreq */ @@ -1646,7 +1678,7 @@ _ATPsndrsp(fd, respbuff, resplen, datalen, err, proc) int bds_cnt, count, len; caddr_t dataptr; - if ((*err = atalk_getref(0, fd, &gref, proc)) != 0) + if ((*err = atalk_getref(0, fd, &gref, proc, 1)) != 0) return -1; if ((gref == 0) || ((atp = (struct atp_state *)gref->info) == 0) @@ -1654,6 +1686,7 @@ _ATPsndrsp(fd, respbuff, resplen, datalen, err, proc) dPrintf(D_M_ATP, D_L_ERROR, ("ATPsndrsp: stale handle=0x%x, pid=%d\n", (u_int) gref, gref->pid)); + file_drop(fd); *err = EINVAL; return -1; } @@ -1663,10 +1696,12 @@ _ATPsndrsp(fd, respbuff, resplen, datalen, err, proc) */ if ((m = gbuf_alloc_wait(resplen, TRUE)) == 0) { *err = ENOMEM; + file_drop(fd); return -1; } - if ((*err = copyin((caddr_t)respbuff, (caddr_t)gbuf_rptr(m), resplen)) != 0) { + if ((*err = copyin(CAST_USER_ADDR_T(respbuff), (caddr_t)gbuf_rptr(m), resplen)) != 0) { gbuf_freeb(m); + file_drop(fd); return -1; } gbuf_wset(m,resplen); @@ -1683,6 +1718,7 @@ _ATPsndrsp(fd, respbuff, resplen, datalen, err, proc) if (bds_cnt > ATP_TRESP_MAX) { gbuf_freem(m); *err = EINVAL; + file_drop(fd); return -1; } @@ -1692,12 +1728,14 @@
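The allocation loops in _ATPsndreq above all make the same conversion: tsleep() with a tick count becomes msleep() against atalk_mutex with an explicit timespec, so the sleep drops and retakes the domain mutex instead of relying on the funnel, and the old 10-tick interval (100ms at hz=100) is spelled out as a timespec. The converted wait, condensed from the hunks above (atp and s are locals of the enclosing function):

struct timespec ts;
int rc;

ts.tv_sec = 0;
ts.tv_nsec = 100 * 1000 * NSEC_PER_USEC;	/* 100ms, the old 10-tick tsleep interval */

ATDISABLE(s, atp->atp_delay_lock);
/* msleep() atomically releases atalk_mutex while blocked and reacquires it */
rc = msleep(&atp->atp_delay_event, atalk_mutex, PSOCK | PCATCH, "atpmioc", &ts);
ATENABLE(s, atp->atp_delay_lock);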
_ATPsndrsp(fd, respbuff, resplen, datalen, err, proc) if (size > datalen) { gbuf_freem(m); *err = EINVAL; + file_drop(fd); return -1; } /* get the first mbuf */ if ((mdata = gbuf_alloc_wait((space = (size > MCLBYTES ? MCLBYTES : size)), TRUE)) == 0) { gbuf_freem(m); + file_drop(fd); *err = ENOMEM; return -1; } @@ -1711,6 +1749,7 @@ _ATPsndrsp(fd, respbuff, resplen, datalen, err, proc) /* allocate the next mbuf */ if ((gbuf_cont(mdata) = m_get((M_WAIT), MSG_DATA)) == 0) { gbuf_freem(m); + file_drop(fd); *err = ENOMEM; return -1; } @@ -1718,14 +1757,16 @@ _ATPsndrsp(fd, respbuff, resplen, datalen, err, proc) MCLGET(mdata, M_WAIT); if (!(mdata->m_flags & M_EXT)) { m_freem(m); + file_drop(fd); return(NULL); } dataptr = mtod(mdata, caddr_t); space = MCLBYTES; } /* do the copyin */ - if ((*err = copyin((caddr_t)bufaddr, dataptr, len)) != 0) { + if ((*err = copyin(CAST_USER_ADDR_T(bufaddr), dataptr, len)) != 0) { gbuf_freem(m); + file_drop(fd); return -1; } dataptr += len; @@ -1736,6 +1777,7 @@ _ATPsndrsp(fd, respbuff, resplen, datalen, err, proc) gbuf_cont(m)->m_pkthdr.len = size; /* set packet hdr len */ atp_send_rsp(gref, m, TRUE); + file_drop(fd); return 0; } @@ -1753,13 +1795,14 @@ _ATPgetreq(fd, buf, buflen, err, proc) register gbuf_t *m, *m_head; int s, size, len; - if ((*err = atalk_getref(0, fd, &gref, proc)) != 0) + if ((*err = atalk_getref(0, fd, &gref, proc, 1)) != 0) return -1; if ((gref == 0) || ((atp = (struct atp_state *)gref->info) == 0) || (atp->atp_flags & ATP_CLOSING)) { dPrintf(D_M_ATP, D_L_ERROR, ("ATPgetreq: stale handle=0x%x, pid=%d\n", (u_int) gref, gref->pid)); + file_drop(fd); *err = EINVAL; return -1; } @@ -1790,17 +1833,19 @@ _ATPgetreq(fd, buf, buflen, err, proc) for (size=0, m=m_head; m; m = gbuf_cont(m)) { if ((len = gbuf_len(m)) > buflen) len = buflen; - copyout((caddr_t)gbuf_rptr(m), (caddr_t)&buf[size], len); + copyout((caddr_t)gbuf_rptr(m), CAST_USER_ADDR_T(&buf[size]), len); size += len; if ((buflen -= len) == 0) break; } gbuf_freem(m_head); + file_drop(fd); return size; } ATENABLE(s, atp->atp_lock); + file_drop(fd); return -1; } @@ -1817,13 +1862,14 @@ _ATPgetrsp(fd, bdsp, err, proc) int s, tid; char bds[atpBDSsize]; - if ((*err = atalk_getref(0, fd, &gref, proc)) != 0) + if ((*err = atalk_getref(0, fd, &gref, proc, 1)) != 0) return -1; if ((gref == 0) || ((atp = (struct atp_state *)gref->info) == 0) || (atp->atp_flags & ATP_CLOSING)) { dPrintf(D_M_ATP, D_L_ERROR, ("ATPgetrsp: stale handle=0x%x, pid=%d\n", (u_int) gref, gref->pid)); + file_drop(fd); *err = EINVAL; return -1; } @@ -1837,13 +1883,16 @@ _ATPgetrsp(fd, bdsp, err, proc) switch (trp->tr_state) { case TRANS_DONE: ATENABLE(s, atp->atp_lock); - if ((*err = copyin((caddr_t)bdsp, - (caddr_t)bds, sizeof(bds))) != 0) + if ((*err = copyin(CAST_USER_ADDR_T(bdsp), + (caddr_t)bds, sizeof(bds))) != 0) { + file_drop(fd); return -1; + } atp_pack_bdsp(trp, (struct atpBDS *)bds); tid = (int)trp->tr_tid; atp_free(trp); - copyout((caddr_t)bds, (caddr_t)bdsp, sizeof(bds)); + copyout((caddr_t)bds, CAST_USER_ADDR_T(bdsp), sizeof(bds)); + file_drop(fd); return tid; case TRANS_FAILED: @@ -1852,6 +1901,7 @@ _ATPgetrsp(fd, bdsp, err, proc) */ ATENABLE(s, atp->atp_lock); atp_free(trp); + file_drop(fd); *err = ETIMEDOUT; return -1; @@ -1861,6 +1911,7 @@ _ATPgetrsp(fd, bdsp, err, proc) } ATENABLE(s, atp->atp_lock); + file_drop(fd); *err = EINVAL; return -1; } diff --git a/bsd/netat/aurp.h b/bsd/netat/aurp.h index 9bc832279..c98f2321f 100644 --- a/bsd/netat/aurp.h +++ b/bsd/netat/aurp.h @@ -31,6 +31,8 @@ #define _NETAT_AURP_H_ 
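A second pattern runs through every _ATP* entry point above: atalk_getref() now takes a trailing flag (1 here) and leaves the fileproc referenced, so each return path, success or failure, must balance it with file_drop(fd). A condensed sketch of that discipline; the function name is hypothetical, the call shapes are the ones used above:

static int
atp_call_template(int fd, struct proc *proc, int *err)	/* hypothetical */
{
	gref_t *gref;

	if ((*err = atalk_getref(0, fd, &gref, proc, 1)) != 0)
		return -1;	/* getref failed: nothing held, nothing to drop */

	if ((gref == 0) || (gref->info == 0)) {
		file_drop(fd);	/* held reference must be released on error... */
		*err = EINVAL;
		return -1;
	}

	/* per-call work goes here */

	file_drop(fd);		/* ...and on the success path */
	return 0;
}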
#include <sys/appleapiopts.h> +#ifdef __APPLE_API_OBSOLETE + /* * AURP device ioctl (I_STR) 'subcommands' */ @@ -49,8 +51,7 @@ #define AURP_SOCKNUM 387 #define AURP_MAXNETACCESS 64 -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE #define AURPCODE_REG 0 #define AURPCODE_RTMPPKT 1 @@ -187,6 +188,8 @@ struct myq #define LOCK_DECL(x) atlock_t x +#include <sys/uio_internal.h> + /* * Quandry: if we use a single socket, we have to rebind on each call. * If we use separate sockets per tunnel endpoint, we have to examine @@ -215,44 +218,44 @@ struct aurp_global_t #define AE_UDPIP 0x02 /* UDP/IP input event */ #define AE_SHUTDOWN 0x04 /* Shutdown AURP process */ -void aurp_wakeup __P((struct socket *, caddr_t, int)); -struct mbuf *at_gbuf_to_mbuf __P((gbuf_t *)); -gbuf_t *at_mbuf_to_gbuf __P((struct mbuf *, int)); -int at_insert __P((gbuf_t *m, unsigned int type, unsigned int node)); -int ddp_AURPfuncx __P((int code, void *param, unsigned char node)); -int AURPinit __P((void)); -int aurpd_start __P((void)); -void atalk_to_ip __P((gbuf_t *m)); -void AURPaccess __P((void)); -void AURPshutdown __P((void)); -void AURPiocack __P((gref_t *gref, gbuf_t *m)); -void AURPiocnak __P((gref_t *gref, gbuf_t *m, int error)); -void AURPsndZReq __P((aurp_state_t *state)); -void AURPsndZRsp __P((aurp_state_t *state, gbuf_t *dat_m, int flag)); -void AURPsndRIUpd __P((aurp_state_t *state)); -void AURPsndRIReq __P((aurp_state_t *state)); -void AURPsndRIAck __P((aurp_state_t *state, gbuf_t *m, unsigned short flags)); -void AURPsndOpenReq __P((aurp_state_t *state)); -void AURPsndRDReq __P((aurp_state_t *state)); -void AURPrcvZReq __P((aurp_state_t *state, gbuf_t *m)); -void AURPrcvZRsp __P((aurp_state_t *state, gbuf_t *m)); -void AURPrcvRIUpd __P((aurp_state_t *state, gbuf_t *m)); -void AURPrcvRIReq __P((aurp_state_t *state, gbuf_t *m)); -void AURPrcvRIAck __P((aurp_state_t *state, gbuf_t *m)); -void AURPrcvRIRsp __P((aurp_state_t *state, gbuf_t *m)); -void AURPrcvOpenReq __P((aurp_state_t *state, gbuf_t *m)); -void AURPrcvOpenRsp __P((aurp_state_t *state, gbuf_t *m)); -void AURPrcvTickle __P((aurp_state_t *state, gbuf_t *m)); -void AURPrcvTickleAck __P((aurp_state_t *state, gbuf_t *m)); -void AURPrcvRDReq __P((aurp_state_t *state, gbuf_t *m)); -void AURPfreemsg __P((gbuf_t *m)); -void AURPrtupdate __P((RT_entry *entry, unsigned char ev)); -void AURPsend __P((gbuf_t *mdata, int type, int node)); -void AURPcleanup __P((aurp_state_t *state)); -void AURPpurgeri __P((unsigned char node)); -int AURPgetri __P((short next_entry, unsigned char *buf, short *len)); -int AURPsetri __P((unsigned char node, gbuf_t *m)); -int AURPupdateri __P((unsigned char node, gbuf_t *m)); +void aurp_wakeup(struct socket *, caddr_t, int); +struct mbuf *at_gbuf_to_mbuf(gbuf_t *); +gbuf_t *at_mbuf_to_gbuf(struct mbuf *, int); +int at_insert(gbuf_t *m, unsigned int type, unsigned int node); +int ddp_AURPfuncx(int code, void *param, unsigned char node); +int AURPinit(void); +int aurpd_start(void); +void atalk_to_ip(gbuf_t *m); +void AURPaccess(void); +void AURPshutdown(void); +void AURPiocack(gref_t *gref, gbuf_t *m); +void AURPiocnak(gref_t *gref, gbuf_t *m, int error); +void AURPsndZReq(aurp_state_t *state); +void AURPsndZRsp(aurp_state_t *state, gbuf_t *dat_m, int flag); +void AURPsndRIUpd(aurp_state_t *state); +void AURPsndRIReq(aurp_state_t *state); +void AURPsndRIAck(aurp_state_t *state, gbuf_t *m, unsigned short flags); +void AURPsndOpenReq(aurp_state_t *state); +void AURPsndRDReq(aurp_state_t *state); +void 
AURPrcvZReq(aurp_state_t *state, gbuf_t *m); +void AURPrcvZRsp(aurp_state_t *state, gbuf_t *m); +void AURPrcvRIUpd(aurp_state_t *state, gbuf_t *m); +void AURPrcvRIReq(aurp_state_t *state, gbuf_t *m); +void AURPrcvRIAck(aurp_state_t *state, gbuf_t *m); +void AURPrcvRIRsp(aurp_state_t *state, gbuf_t *m); +void AURPrcvOpenReq(aurp_state_t *state, gbuf_t *m); +void AURPrcvOpenRsp(aurp_state_t *state, gbuf_t *m); +void AURPrcvTickle(aurp_state_t *state, gbuf_t *m); +void AURPrcvTickleAck(aurp_state_t *state, gbuf_t *m); +void AURPrcvRDReq(aurp_state_t *state, gbuf_t *m); +void AURPfreemsg(gbuf_t *m); +void AURPrtupdate(RT_entry *entry, unsigned char ev); +void AURPsend(gbuf_t *mdata, int type, int node); +void AURPcleanup(aurp_state_t *state); +void AURPpurgeri(unsigned char node); +int AURPgetri(short next_entry, unsigned char *buf, short *len); +int AURPsetri(unsigned char node, gbuf_t *m); +int AURPupdateri(unsigned char node, gbuf_t *m); /* AURP header for IP tunneling */ typedef struct aurp_domain @@ -283,6 +286,6 @@ typedef struct aurp_domain /****### LD 9/26/97*/ extern struct aurp_global_t aurp_global; -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_AURP_H_ */ diff --git a/bsd/netat/aurp_aurpd.c b/bsd/netat/aurp_aurpd.c index a051e9ced..bcd8365ac 100644 --- a/bsd/netat/aurp_aurpd.c +++ b/bsd/netat/aurp_aurpd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -40,6 +40,7 @@ #include <sys/systm.h> #include <sys/kernel.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <sys/filedesc.h> #include <sys/fcntl.h> #include <sys/mbuf.h> @@ -48,6 +49,8 @@ #include <sys/protosw.h> #include <sys/malloc.h> #include <sys/proc.h> +#include <sys/uio_internal.h> +#include <kern/locks.h> #include <netinet/in.h> #include <net/if.h> @@ -62,6 +65,8 @@ #define M_RCVBUF (64 * 1024) #define M_SNDBUF (64 * 1024) +extern lck_mtx_t * atalk_mutex; + static int ip_to_atalk(struct sockaddr_in *fp, register gbuf_t *p_mbuf); static int aurp_bindrp(struct socket *so); @@ -89,7 +94,7 @@ aurpd_start() int maxbuf; struct sockopt sopt; - if (suser(current_proc()->p_ucred, ¤t_proc()->p_acflag) != 0 ) + if (suser(kauth_cred_get(), 0) != 0 ) return(EPERM); /* @@ -134,7 +139,7 @@ aurpd_start() goto out; } else { maxbuf = M_RCVBUF; - sopt.sopt_val = &maxbuf; + sopt.sopt_val = CAST_USER_ADDR_T(&maxbuf); sopt.sopt_valsize = sizeof(maxbuf); sopt.sopt_level = SOL_SOCKET; sopt.sopt_name = SO_RCVBUF; @@ -154,7 +159,7 @@ aurpd_start() } else { maxbuf = M_SNDBUF; - sopt.sopt_val = &maxbuf; + sopt.sopt_val = CAST_USER_ADDR_T(&maxbuf); sopt.sopt_valsize = sizeof(maxbuf); sopt.sopt_level = SOL_SOCKET; sopt.sopt_name = SO_SNDBUF; @@ -171,8 +176,8 @@ aurpd_start() so->so_snd.sb_flags |=(SB_SEL|SB_NOINTR); out: - sbunlock(&so->so_snd); - sbunlock(&so->so_rcv); + sbunlock(&so->so_snd, 0); + sbunlock(&so->so_rcv, 0); return(error); } @@ -181,7 +186,7 @@ int AURPgetmsg(err) int *err; { register struct socket *so; - register int s, events; + register int events; so = aurp_global.tunnel; *err = 0; @@ -189,7 +194,8 @@ AURPgetmsg(err) for (;;) { gbuf_t *from, *p_mbuf; int flags = MSG_DONTWAIT; - struct uio auio; + uio_t auio; + char uio_buf[ UIO_SIZEOF(0) ]; /* * Wait for a package to arrive. 
This will be from the @@ -201,7 +207,8 @@ AURPgetmsg(err) events = aurp_global.event; if (((*err == 0) || (*err == EWOULDBLOCK)) && events == 0) { - *err = tsleep(&aurp_global.event_anchor, PSOCK | PCATCH, "AURPgetmsg", 0); + lck_mtx_assert(atalk_mutex, LCK_MTX_ASSERT_OWNED); + *err = msleep(&aurp_global.event_anchor, atalk_mutex, PSOCK | PCATCH, "AURPgetmsg", 0); events = aurp_global.event; aurp_global.event = 0; } @@ -237,11 +244,8 @@ AURPgetmsg(err) * give it no iov's, point off to non-existant user space, * but make sure the 'resid' count means somehting. */ - - auio.uio_iov = NULL; - auio.uio_iovcnt = 0; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_offset = 0; /* XXX */ + auio = uio_createwithbuffer(0, 0, UIO_SYSSPACE, UIO_READ, + &uio_buf[0], sizeof(uio_buf)); /* Keep up an even flow... */ for (;;) @@ -253,8 +257,8 @@ AURPgetmsg(err) #define A_LARGE_SIZE 700 flags = MSG_DONTWAIT; - auio.uio_resid = A_LARGE_SIZE; - *err = soreceive(so, (struct sockaddr **)&from, &auio, &p_mbuf, 0, &flags); + uio_setresid(auio, A_LARGE_SIZE); + *err = soreceive(so, (struct sockaddr **)&from, auio, &p_mbuf, 0, &flags); dPrintf(D_M_AURP, D_L_VERBOSE, ("AURPgetmsg: soreceive returned %d, aurp_global.event==0x%x\n", *err, events)); /* soreceive() sets *mp to zero! at start */ @@ -278,7 +282,6 @@ AURPgetmsg(err) * which will wake us from the sleep at * the top of the outer loop. */ - int s; ATDISABLE(s, aurp_global.glock); aurp_global.event &= ~AE_UDPIP; ATENABLE(s, aurp_global.glock); @@ -296,9 +299,8 @@ AURPgetmsg(err) * * This conforms to the so_upcall function pointer member of struct sockbuf. */ -void aurp_wakeup(struct socket *so, register caddr_t p, int state) +void aurp_wakeup(__unused struct socket *so, register caddr_t p, __unused int state) { - register int s; register int bit; bit = (int) p; @@ -322,7 +324,6 @@ aurp_bindrp(struct socket *so) { struct sockaddr_in sin; struct proc *p = current_proc(); - gbuf_t *m; int error; @@ -336,8 +337,8 @@ aurp_bindrp(struct socket *so) sblock(&so->so_snd, M_WAIT); so->so_state |= SS_PRIV; error = (*so->so_proto->pr_usrreqs->pru_bind)(so, (struct sockaddr *) &sin, p); - sbunlock(&so->so_snd); - sbunlock(&so->so_rcv); + sbunlock(&so->so_snd, 0); + sbunlock(&so->so_rcv, 0); return (error); } @@ -409,7 +410,6 @@ atalk_to_ip(register gbuf_t *m) int error; int flags = MSG_DONTWAIT; struct sockaddr_in rem_addr; - int s; m->m_type = MT_HEADER; m->m_pkthdr.len = gbuf_msgsize(m); diff --git a/bsd/netat/aurp_cfg.c b/bsd/netat/aurp_cfg.c index bc7de2587..97f8c6d33 100644 --- a/bsd/netat/aurp_cfg.c +++ b/bsd/netat/aurp_cfg.c @@ -77,7 +77,7 @@ int aurp_open(gref) if (ddp_AURPfuncx(AURPCODE_REG, AURPcmdx, 0)) { aurp_gref = 0; aurp_minor_no[i] = 0; - return EPROTO; + return EPROTOTYPE; } } diff --git a/bsd/netat/aurp_misc.c b/bsd/netat/aurp_misc.c index 662499730..f1cd9a728 100644 --- a/bsd/netat/aurp_misc.c +++ b/bsd/netat/aurp_misc.c @@ -84,15 +84,14 @@ void AURPupdate(arg) void *arg; { unsigned char node; - boolean_t funnel_state; aurp_state_t *state; - funnel_state = thread_funnel_set(network_flock, TRUE); + atalk_lock(); state = (aurp_state_t *)&aurp_state[1]; if (aurp_gref == 0) { - (void) thread_funnel_set(network_flock, FALSE); + atalk_unlock(); return; } /* @@ -110,7 +109,7 @@ void AURPupdate(arg) timeout(AURPupdate, arg, AURP_UpdateRate*10*HZ); update_tmo = 1; - (void) thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } /* */ diff --git a/bsd/netat/aurp_open.c b/bsd/netat/aurp_open.c index 948ccda85..2157d68c7 100644 --- a/bsd/netat/aurp_open.c +++ 
b/bsd/netat/aurp_open.c @@ -50,13 +50,13 @@ #include <netat/debug.h> -/* funnel version of AURPsndOpenReq */ -void AURPsndOpenReq_funnel(state) +/* locked version of AURPsndOpenReq */ +void AURPsndOpenReq_locked(state) aurp_state_t *state; { - thread_funnel_set(network_flock, TRUE); + atalk_lock(); AURPsndOpenReq(state); - thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } /* */ @@ -116,7 +116,7 @@ void AURPsndOpenReq(state) } /* start the retry timer */ - timeout(AURPsndOpenReq_funnel, state, AURP_RetryInterval*HZ); + timeout(AURPsndOpenReq_locked, state, AURP_RetryInterval*HZ); state->rcv_tmo = 1; } @@ -218,7 +218,7 @@ void AURPrcvOpenRsp(state, m) } /* cancel the retry timer */ - untimeout(AURPsndOpenReq_funnel, state); + untimeout(AURPsndOpenReq_locked, state); state->rcv_tmo = 0; state->rcv_retry = 0; diff --git a/bsd/netat/aurp_ri.c b/bsd/netat/aurp_ri.c index 6cab1f01c..44d8df254 100644 --- a/bsd/netat/aurp_ri.c +++ b/bsd/netat/aurp_ri.c @@ -86,13 +86,13 @@ void AURPsndRIAck(state, m, flags) AURPsend(m, AUD_AURP, state->rem_node); } -/* funneled version of AURPsndRIReq */ -void AURPsndRIReq_funnel(state) +/* locked version of AURPsndRIReq */ +void AURPsndRIReq_locked(state) aurp_state_t *state; { - thread_funnel_set(network_flock, TRUE); + atalk_lock(); AURPsndRIReq(state); - thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } /* */ @@ -132,17 +132,17 @@ void AURPsndRIReq(state) } /* start the retry timer */ - timeout(AURPsndRIReq_funnel, state, AURP_RetryInterval*HZ); + timeout(AURPsndRIReq_locked, state, AURP_RetryInterval*HZ); state->rcv_tmo = 1; } -/* funneled version of AURPsndRIRsp */ -void AURPsndRIRsp_funnel(state) +/* locked version of AURPsndRIRsp */ +void AURPsndRIRsp_locked(state) aurp_state_t *state; { - thread_funnel_set(network_flock, TRUE); + atalk_lock(); AURPsndRIRsp(state); - thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } /* */ @@ -170,7 +170,7 @@ void AURPsndRIRsp(state) ATENABLE(s, aurpgen_lock); msize = sizeof(aurp_hdr_t); if ((m = (gbuf_t *)gbuf_alloc(msize+AURP_MaxPktSize, PRI_MED)) == 0) { - timeout(AURPsndRIRsp_funnel, state, AURP_RetryInterval*HZ); + timeout(AURPsndRIRsp_locked, state, AURP_RetryInterval*HZ); state->snd_tmo = 1; return; } @@ -198,7 +198,7 @@ void AURPsndRIRsp(state) m = (gbuf_t *)gbuf_dupb(state->rsp_m); /* start the retry timer */ - timeout(AURPsndRIRsp_funnel, state, AURP_RetryInterval*HZ); + timeout(AURPsndRIRsp_locked, state, AURP_RetryInterval*HZ); state->snd_tmo = 1; if (msize == 0) @@ -212,12 +212,12 @@ void AURPsndRIRsp(state) } -void AURPsndRIUpd_funnel(state) +void AURPsndRIUpd_locked(state) aurp_state_t *state; { - thread_funnel_set(network_flock, TRUE); + atalk_lock(); AURPsndRIUpd(state); - thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } /* */ @@ -261,7 +261,7 @@ void AURPsndRIUpd(state) m = (gbuf_t *)gbuf_dupb(state->upd_m); /* start the retry timer */ - timeout(AURPsndRIUpd_funnel, state, AURP_RetryInterval*HZ); + timeout(AURPsndRIUpd_locked, state, AURP_RetryInterval*HZ); state->snd_tmo = 1; if (msize == 0) @@ -369,7 +369,7 @@ void AURPrcvRIRsp(state, m) dPrintf(D_M_AURP, D_L_INFO, ("AURPrcvRIRsp: len=%ld\n", gbuf_len(m))); /* cancel the retry timer */ - untimeout(AURPsndRIReq_funnel, state); + untimeout(AURPsndRIReq_locked, state); state->rcv_tmo = 0; /* send RI ack */ @@ -472,13 +472,13 @@ void AURPrcvRIAck(state, m) if (snd_state == AURPSTATE_WaitingForRIAck1) { /* ack from the tunnel peer to our RI response */ - untimeout(AURPsndRIRsp_funnel, state); + 
untimeout(AURPsndRIRsp_locked, state); dat_m = state->rsp_m; state->rsp_m = 0; flag = 1; } else { /* ack from the tunnel peer to our RI update */ - untimeout(AURPsndRIUpd_funnel, state); + untimeout(AURPsndRIUpd_locked, state); dat_m = state->upd_m; state->upd_m = 0; flag = 2; diff --git a/bsd/netat/aurp_rx.c b/bsd/netat/aurp_rx.c index 5d5c43a3a..2a3d85b30 100644 --- a/bsd/netat/aurp_rx.c +++ b/bsd/netat/aurp_rx.c @@ -106,6 +106,7 @@ aurp_wput(gref, m) break; case AUC_UDPPORT: + mdata = gbuf_cont(m); aurp_global.udp_port = *(char *)gbuf_rptr(mdata); break; diff --git a/bsd/netat/aurp_tickle.c b/bsd/netat/aurp_tickle.c index ce8772d0f..91994a9bc 100644 --- a/bsd/netat/aurp_tickle.c +++ b/bsd/netat/aurp_tickle.c @@ -56,12 +56,11 @@ void AURPsndTickle(state) int msize; gbuf_t *m; aurp_hdr_t *hdrp; - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); + atalk_lock(); if (state->rcv_state == AURPSTATE_Unconnected) { - (void) thread_funnel_set(network_flock, FALSE); + atalk_unlock(); return; } /* stop trying if the retry count exceeds the maximum retry value */ @@ -78,7 +77,7 @@ void AURPsndTickle(state) /* purge all routes associated with the tunnel peer */ AURPpurgeri(state->rem_node); - (void) thread_funnel_set(network_flock, FALSE); + atalk_unlock(); return; } @@ -102,7 +101,7 @@ void AURPsndTickle(state) /* start the retry timer */ timeout(AURPsndTickle, state, AURP_TickleRetryInterval*HZ); - (void) thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } /* */ diff --git a/bsd/netat/ddp.c b/bsd/netat/ddp.c index 93da77ebf..e471d55c1 100644 --- a/bsd/netat/ddp.c +++ b/bsd/netat/ddp.c @@ -381,10 +381,10 @@ void ddp_rem_if(ifID) /* un-do processing done in SIOCSIFADDR */ if (ifa->ifa_addr) { - int s = splnet(); - TAILQ_REMOVE(&ifID->aa_ifp->if_addrhead, ifa, ifa_link); + ifnet_lock_exclusive(ifID->aa_ifp); + if_detach_ifa(ifID->aa_ifp, ifa); ifa->ifa_addr = NULL; - splx(s); + ifnet_lock_done(ifID->aa_ifp); } if (ifID->at_dl_tag) { /* dlil_detach_protocol(ifID->at_dl_tag); */ @@ -1080,10 +1080,9 @@ void ddp_input(mp, ifID) if (sbappendaddr(&((gref->atpcb_socket)->so_rcv), (struct sockaddr *)&ddp_in, - mp, 0) == 0) - gbuf_freem(mp); - else + mp, 0, NULL) != 0) { sorwakeup(gref->atpcb_socket); + } } else { atalk_putnext(gref, mp); } diff --git a/bsd/netat/ddp.h b/bsd/netat/ddp.h index ad79d612b..919d73373 100644 --- a/bsd/netat/ddp.h +++ b/bsd/netat/ddp.h @@ -32,6 +32,8 @@ #define _NETAT_DDP_H_ #include <sys/appleapiopts.h> +#ifdef __APPLE_API_OBSOLETE + /* Header and data sizes */ #define DDP_HDR_SIZE 5 /* DDP (short) header size */ @@ -74,11 +76,9 @@ typedef struct { char data[DDP_DATA_SIZE]; } at_ddp_t; - #define DDPLEN_ASSIGN(ddp, len) ddp->length = len #define DDPLEN_VALUE(ddp) ddp->length - /* DDP module statistics and configuration */ typedef struct at_ddp_stats { @@ -102,6 +102,7 @@ typedef struct at_ddp_stats { u_int xmit_dropped_nobuf; } at_ddp_stats_t; + /* DDP streams module ioctls */ #define AT_MID_DDP 203 @@ -124,8 +125,7 @@ typedef struct at_ddp_stats { #define DDP_IOC_SET_PROTO ((AT_MID_DDP<<8) | 13) #endif -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE #define DDP_MIN_NETWORK 0x0001 #define DDP_MAX_NETWORK 0xfffe @@ -184,6 +184,6 @@ void ddp_bit_reverse(unsigned char *); /* in ddp_lap.c */ int ddp_shutdown(int); -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_DDP_H_ */ diff --git a/bsd/netat/ddp_aarp.c b/bsd/netat/ddp_aarp.c index 
8789856a9..d3692d60c 100644 --- a/bsd/netat/ddp_aarp.c +++ b/bsd/netat/ddp_aarp.c @@ -434,6 +434,8 @@ int aarp_send_data(m, elapp, dest_at_addr, loop) register at_ddp_t *ddp_hdrp; int error; int s; + struct timeval timenow; + getmicrouptime(&timenow); if (gbuf_len(m) <= 0) ddp_hdrp = (at_ddp_t *)gbuf_rptr(gbuf_cont(m)); @@ -514,7 +516,8 @@ int aarp_send_data(m, elapp, dest_at_addr, loop) amt_ptr->dest_at_addr = *dest_at_addr; amt_ptr->dest_at_addr.atalk_unused = 0; - amt_ptr->last_time = time.tv_sec; + getmicrouptime(&timenow); + amt_ptr->last_time = timenow.tv_sec; amt_ptr->m = m; amt_ptr->elapp = elapp; amt_ptr->no_of_retries = 0; @@ -765,9 +768,8 @@ register aarp_amt_t *amt_ptr; void aarp_sched_probe(void *arg) { - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); + atalk_lock(); if (probe_cb.elapp->aa_ifp != 0 && probe_cb.no_of_retries != AARP_MAX_PROBE_RETRIES) { @@ -778,7 +780,7 @@ void aarp_sched_probe(void *arg) AARPwakeup(&probe_cb); } - (void) thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } @@ -810,10 +812,9 @@ StaticProc void aarp_sched_req(arg) void *arg; { int s, i; - boolean_t funnel_state; aarp_amt_t *amt_ptr = (aarp_amt_t *)arg; - funnel_state = thread_funnel_set(network_flock, TRUE); + atalk_lock(); /* * make sure pointer still valid in case interface removed @@ -831,13 +832,13 @@ StaticProc void aarp_sched_req(arg) ATDISABLE(s, arpinp_lock); if (amt_ptr->tmo == 0) { ATENABLE(s, arpinp_lock); - (void) thread_funnel_set(network_flock, FALSE); + atalk_unlock(); return; } if (amt_ptr->no_of_retries < AARP_MAX_REQ_RETRIES) { ATENABLE(s, arpinp_lock); if (aarp_send_req(amt_ptr) == 0) { - (void) thread_funnel_set(network_flock, FALSE); + atalk_unlock(); return; } ATDISABLE(s, arpinp_lock); @@ -846,7 +847,7 @@ StaticProc void aarp_sched_req(arg) aarp_delete_amt_info(amt_ptr); break; } - (void) thread_funnel_set(network_flock, FALSE); + atalk_unlock(); return; } diff --git a/bsd/netat/ddp_brt.c b/bsd/netat/ddp_brt.c index 9d9c66253..6f125f76e 100644 --- a/bsd/netat/ddp_brt.c +++ b/bsd/netat/ddp_brt.c @@ -130,7 +130,7 @@ void ddp_brt_init() bzero(at_ddp_brt, sizeof(at_ddp_brt)); ddp_brt_sweep_timer = 1; #ifdef NOT_USED - timeout(ddp_brt_sweep_funnel, (long)0, BRT_SWEEP_INT * SYS_HZ); + timeout(ddp_brt_sweep_locked, (long)0, BRT_SWEEP_INT * SYS_HZ); #endif } @@ -139,17 +139,17 @@ void ddp_brt_shutdown() #ifdef NOT_USED bzero(at_ddp_brt, sizeof(at_ddp_brt)); if (ddp_brt_sweep_timer) - untimeout(ddp_brt_sweep_funnel, 0); + untimeout(ddp_brt_sweep_locked, 0); #endif ddp_brt_sweep_timer = 0; } -/* funneled version */ -void ddp_brt_sweep_funnel() +/* locked version */ +void ddp_brt_sweep_locked() { - thread_funnel_set(network_flock, TRUE); + atalk_lock(); ddp_brt_sweep(); - thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } void ddp_brt_sweep() @@ -182,7 +182,7 @@ void ddp_brt_sweep() } #ifdef NOT_USED /* set up the next sweep... 
*/ - timeout(ddp_brt_sweep_funnel, (long)0, BRT_SWEEP_INT * SYS_HZ); + timeout(ddp_brt_sweep_locked, (long)0, BRT_SWEEP_INT * SYS_HZ); #endif } diff --git a/bsd/netat/ddp_lap.c b/bsd/netat/ddp_lap.c index 5c2414e8d..f2ed8d1b4 100644 --- a/bsd/netat/ddp_lap.c +++ b/bsd/netat/ddp_lap.c @@ -53,8 +53,10 @@ #include <sys/mbuf.h> #include <sys/ioctl.h> #include <sys/socket.h> +#include <net/if_dl.h> #include <sys/socketvar.h> #include <sys/malloc.h> +#include <sys/domain.h> #include <sys/sockio.h> #include <vm/vm_kern.h> /* for kernel_map */ @@ -125,6 +127,7 @@ extern asp_scb_t *scb_used_list; extern CCB *adsp_inputQ[]; extern CCB *ccb_used_list; extern at_ddp_stats_t at_ddp_stats; +extern lck_mtx_t * atalk_mutex; /* protos */ extern snmpAarpEnt_t * getAarp(int *); @@ -305,7 +308,7 @@ int elap_wput(gref, m) register ioc_t *iocbp; register at_if_cfg_t *cfgp; at_elap_stats_t *statsp; - int error, i; + int i; int (*func)(); gbuf_t *tmpm; at_ifaddr_t *patp; @@ -774,7 +777,7 @@ elap_dataput(m, elapp, addr_flag, addr) char *addr; { register int size; - int error; + int error = 0; extern int zip_type_packet(); struct etalk_addr dest_addr; struct atalk_addr dest_at_addr; @@ -901,9 +904,11 @@ static int elap_online1(elapp) return ENOENT; elapp->startup_inprogress = TRUE; - if (! (elapp->startup_error = re_aarp(elapp))) - (void)tsleep(&elapp->startup_inprogress, PSOCK | PCATCH, + if (! (elapp->startup_error = re_aarp(elapp))) { + lck_mtx_assert(atalk_mutex, LCK_MTX_ASSERT_OWNED); + (void)msleep(&elapp->startup_inprogress, atalk_mutex, PSOCK | PCATCH, "elap_online1", 0); + } /* then later, after some timeouts AARPwakeup() is called */ @@ -1004,7 +1009,8 @@ int elap_online3(elapp) /* then later, after some timeouts AARPwakeup() is called */ - (void)tsleep(&elapp->startup_inprogress, PSOCK | PCATCH, + lck_mtx_assert(atalk_mutex, LCK_MTX_ASSERT_OWNED); + (void)msleep(&elapp->startup_inprogress, atalk_mutex, PSOCK | PCATCH, "elap_online3", 0); return(elapp->startup_error); } /* elap_online3 */ @@ -1041,6 +1047,7 @@ void elap_offline(elapp) ATENABLE(s, ddpinp_lock); /* make sure no zip timeouts are left running */ + elapp->ifGNIScheduled = 0; untimeout(zip_sched_getnetinfo, elapp); } ddp_rem_if(elapp); @@ -1251,6 +1258,7 @@ int routerStart(keP) { register at_ifaddr_t *ifID; int error; + struct timespec ts; if (! 
ifID_home) return(EINVAL); @@ -1274,12 +1282,18 @@ int routerStart(keP) dPrintf(D_M_ELAP, D_L_STARTUP_INFO, ("router_start: waiting 20 sec before starting up\n")); + lck_mtx_assert(atalk_mutex, LCK_MTX_ASSERT_OWNED); /* sleep for 20 seconds */ + + /* 20 seconds, expressed as a timespec for msleep() */ + ts.tv_sec = 20; + ts.tv_nsec = 0; + if ((error = /* *** eventually this will be the ifID for the interface being brought up in router mode *** */ - tsleep(&ifID_home->startup_inprogress, - PSOCK | PCATCH, "routerStart", 20 * SYS_HZ)) + msleep(&ifID_home->startup_inprogress, atalk_mutex, + PSOCK | PCATCH, "routerStart", &ts)) != EWOULDBLOCK) { /* if (!error) @@ -1428,7 +1442,9 @@ static int elap_trackMcast(patp, func, addr) u_char c; switch(patp->aa_ifp->if_type) { case IFT_ETHER: - case IFT_FDDI: + case IFT_FDDI: + case IFT_L2VLAN: + case IFT_IEEE8023ADLAG: /* bonded ethernet */ /* set addr to point to unique part of addr */ c = addr[5]; @@ -1515,6 +1531,8 @@ static getSnmpCfg(snmp) ifc->ifc_addrSize = getPhysAddrSize(i); switch (elapp->aa_ifp->if_type) { case IFT_ETHER: + case IFT_L2VLAN: + case IFT_IEEE8023ADLAG: /* bonded ethernet */ ifc->ifc_type = SNMP_TYPE_ETHER2; break; case IFT_ISO88025: /* token ring */ @@ -1570,7 +1588,7 @@ int at_reg_mcast(ifID, data) caddr_t data; { struct ifnet *nddp = ifID->aa_ifp; - struct sockaddr sa; + struct sockaddr_dl sdl; if (*(int *)data) { if (!nddp) { @@ -1582,16 +1600,22 @@ int at_reg_mcast(ifID, data) return(0); /* this is for ether_output */ - sa.sa_family = AF_UNSPEC; - sa.sa_len = 2 + sizeof(struct etalk_addr); - bcopy (data, &sa.sa_data[0], sizeof(struct etalk_addr)); + bzero(&sdl, sizeof(sdl)); + sdl.sdl_family = AF_LINK; + sdl.sdl_alen = sizeof(struct etalk_addr); + sdl.sdl_len = offsetof(struct sockaddr_dl, sdl_data) + + sizeof(struct etalk_addr); + bcopy(data, sdl.sdl_data, sizeof(struct etalk_addr)); + /* these next two lines should not really be needed XXX */ + sdl.sdl_index = nddp->if_index; + sdl.sdl_type = IFT_ETHER; dPrintf(D_M_PAT, D_L_STARTUP, ("pat_mcast: adding multicast %08x%04x ifID:0x%x\n", *(unsigned*)data, (*(unsigned *)(data+2))&0x0000ffff, (unsigned)ifID)); - if (if_addmulti(nddp, &sa, 0)) + if (if_addmulti(nddp, &sdl, 0)) return -1; } return 0; @@ -1603,7 +1627,7 @@ int at_unreg_mcast(ifID, data) caddr_t data; { struct ifnet *nddp = ifID->aa_ifp; - struct sockaddr sa; + struct sockaddr_dl sdl; if (*(int *)data) { if (!nddp) { @@ -1614,9 +1638,15 @@ int at_unreg_mcast(ifID, data) elap_trackMcast(ifID, MCAST_TRACK_DELETE, data); /* this is for ether_output */ - sa.sa_family = AF_UNSPEC; - sa.sa_len = 2 + sizeof(struct etalk_addr); - bcopy (data, &sa.sa_data[0], sizeof(struct etalk_addr)); + bzero(&sdl, sizeof(sdl)); + sdl.sdl_family = AF_LINK; + sdl.sdl_alen = sizeof(struct etalk_addr); + sdl.sdl_len = offsetof(struct sockaddr_dl, sdl_data) + + sizeof(struct etalk_addr); + bcopy(data, sdl.sdl_data, sizeof(struct etalk_addr)); + /* these next two lines should not really be needed XXX */ + sdl.sdl_index = nddp->if_index; + sdl.sdl_type = IFT_ETHER; dPrintf(D_M_PAT, D_L_STARTUP, ("pat_mcast: deleting multicast %08x%04x ifID:0x%x\n", @@ -1624,7 +1654,7 @@ int at_unreg_mcast(ifID, data) (unsigned)ifID)); bzero(data, sizeof(struct etalk_addr)); - if (if_delmulti(nddp, &sa)) + if (if_delmulti(nddp, &sdl)) return -1; } return 0; diff --git a/bsd/netat/ddp_nbp.c b/bsd/netat/ddp_nbp.c index 1bfec9e17..db4629db5 100644 --- a/bsd/netat/ddp_nbp.c +++ b/bsd/netat/ddp_nbp.c @@ -226,7 +226,7 @@ void nbp_input(m, ifID) /* true if home zone ==
destination zone */ register int zno, i; register gbuf_t *m2; - register error_found =0; + register int error_found =0; register at_ifaddr_t *ifIDorig; if (!ROUTING_MODE) /* for routers only! */ @@ -1423,7 +1423,7 @@ int nbp_new_nve_entry(nve_entry, ifID) new_entry->zone_hash = nbp_strhash(&new_entry->zone); } new_entry->tag = tag; - new_entry->pid = current_proc()->p_pid; + new_entry->pid = proc_selfpid(); ATDISABLE(nve_lock_pri,NVE_LOCK); TAILQ_INSERT_TAIL(&name_registry, new_entry, nve_link); diff --git a/bsd/netat/ddp_proto.c b/bsd/netat/ddp_proto.c index 7a6298a4d..347ba3e77 100644 --- a/bsd/netat/ddp_proto.c +++ b/bsd/netat/ddp_proto.c @@ -42,7 +42,6 @@ #include <sys/malloc.h> #include <sys/socket.h> #include <sys/socketvar.h> -#include <sys/buf.h> #include <net/if.h> diff --git a/bsd/netat/ddp_r_rtmp.c b/bsd/netat/ddp_r_rtmp.c index 46c378fd8..438d9220f 100644 --- a/bsd/netat/ddp_r_rtmp.c +++ b/bsd/netat/ddp_r_rtmp.c @@ -49,6 +49,7 @@ #include <sys/mbuf.h> #include <sys/ioctl.h> #include <sys/malloc.h> +#include <kern/locks.h> #include <sys/socket.h> #include <sys/socketvar.h> @@ -82,7 +83,7 @@ gbuf_t *rtmp_prep_new_packet(); void rtmp_timeout(); void rtmp_send_port(); -void rtmp_send_port_funnel(); +void rtmp_send_port_locked(); void rtmp_dropper(void *); void rtmp_shutdown(); static void rtmp_update(); @@ -92,6 +93,7 @@ extern int elap_online3(); extern pktsIn, pktsOut, pktsDropped, pktsHome; extern short ErrorRTMPoverflow, ErrorZIPoverflow; extern atlock_t ddpinp_lock; +extern lck_mtx_t * atalk_mutex; /* * rtmp_router_input: function called by DDP (in router mode) to handle @@ -739,12 +741,11 @@ register at_ifaddr_t *ifID; register unsigned int s; short i; RT_entry *en = &RT_table[0]; - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); + atalk_lock(); if (ifID->ifRoutingState < PORT_ONLINE) { - (void) thread_funnel_set(network_flock, FALSE); + atalk_unlock(); return; } @@ -799,7 +800,7 @@ register at_ifaddr_t *ifID; ATENABLE(s, ddpinp_lock); timeout(rtmp_timeout, (caddr_t) ifID, 20*SYS_HZ); - (void) thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } /* @@ -1168,13 +1169,13 @@ static void rtmp_request(ifID, ddp) } -/* funnel version of rtmp_send_port */ -void rtmp_send_port_funnel(ifID) +/* locked version of rtmp_send_port */ +void rtmp_send_port_locked(ifID) register at_ifaddr_t *ifID; { - thread_funnel_set(network_flock, TRUE); + atalk_lock(); rtmp_send_port(ifID); - thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } @@ -1212,7 +1213,7 @@ void rtmp_send_port(ifID) dPrintf(D_M_RTMP_LOW, D_L_TRACE, ("rtmp_send_port: func=0x%x, ifID=0x%x\n", (u_int) rtmp_send_port, (u_int) ifID)); - timeout (rtmp_send_port_funnel, (caddr_t)ifID, 10 * SYS_HZ); + timeout (rtmp_send_port_locked, (caddr_t)ifID, 10 * SYS_HZ); } @@ -1222,14 +1223,13 @@ void rtmp_send_port(ifID) void rtmp_dropper(void *arg) { - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); + atalk_lock(); pktsIn = pktsOut = pktsHome = pktsDropped = 0; timeout(rtmp_dropper, NULL, 2*SYS_HZ); - (void) thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } /* @@ -1248,9 +1248,8 @@ int rtmp_router_start(keP) register short Index, router_starting_timer = 0; register RT_entry *Entry; register at_net_al netStart, netStop; - boolean_t funnel_state; + struct timespec ts; - funnel_state = thread_funnel_set(network_flock, TRUE); /* clear the static structure used to record routing errors */ bzero(&ke, sizeof(ke)); @@ -1422,13 +1421,15 @@ int 
rtmp_router_start(keP) goto error; } - /* sleep for 10 seconds */ + /* sleep for 11 seconds */ + ts.tv_sec = 11; + ts.tv_nsec = 0; if ((err = /* *** eventually this will be the ifID for the interface being brought up in router mode *** */ /* *** router sends rtmp packets every 10 seconds *** */ - tsleep(&ifID_home->startup_inprogress, - PSOCK | PCATCH, "router_start1", (10+1) * SYS_HZ)) + msleep(&ifID_home->startup_inprogress, atalk_mutex, + PSOCK | PCATCH, "router_start1", &ts)) != EWOULDBLOCK) { goto error; } @@ -1470,11 +1471,13 @@ startZoneInfo: dPrintf(D_M_RTMP, D_L_STARTUP, ("rtmp_router_start: waiting for zone info to complete\n")); /* sleep for 10 seconds */ + ts.tv_sec = 10; + ts.tv_nsec = 0; if ((err = /* *** eventually this will be the ifID for the interface being brought up in router mode *** */ - tsleep(&ifID_home->startup_inprogress, - PSOCK | PCATCH, "router_start2", 10 * SYS_HZ)) + msleep(&ifID_home->startup_inprogress, atalk_mutex, + PSOCK | PCATCH, "router_start2", &ts)) != EWOULDBLOCK) { goto error; } @@ -1558,22 +1561,20 @@ startZoneInfo: /* prepare the packet dropper timer */ timeout (rtmp_dropper, NULL, 1*SYS_HZ); - (void) thread_funnel_set(network_flock, funnel_state); return(0); error: dPrintf(D_M_RTMP,D_L_ERROR, - ("rtmp_router_start: error type=%d occured on port %d\n", + ("rtmp_router_start: error type=%d occurred on port %d\n", ifID->ifRoutingState, ifID->ifPort)); /* if there's no keP->error, copy the local ke structure, - since the error occured asyncronously */ + since the error occurred asynchronously */ if ((!keP->error) && ke.error) bcopy(&ke, keP, sizeof(ke)); rtmp_shutdown(); /* to return the error in keP, the ioctl has to return 0 */ - (void) thread_funnel_set(network_flock, funnel_state); return((keP->error)? 0: err); } /* rtmp_router_start */ @@ -1597,7 +1598,7 @@ void rtmp_shutdown() TAILQ_FOREACH(ifID, &at_ifQueueHd, aa_link) { if (ifID->ifRoutingState > PORT_OFFLINE ) { if (ifID->ifRoutingState == PORT_ONLINE) { - untimeout(rtmp_send_port_funnel, (caddr_t)ifID); + untimeout(rtmp_send_port_locked, (caddr_t)ifID); untimeout(rtmp_timeout, (caddr_t) ifID); } /* diff --git a/bsd/netat/ddp_r_zip.c b/bsd/netat/ddp_r_zip.c index eaf337695..bde37beb8 100644 --- a/bsd/netat/ddp_r_zip.c +++ b/bsd/netat/ddp_r_zip.c @@ -89,7 +89,7 @@ extern short ErrorZIPoverflow; static int netinfo_reply_pending; static void zip_netinfo_reply(at_x_zip_t *, at_ifaddr_t *); static void zip_getnetinfo(at_ifaddr_t *); -static void zip_getnetinfo_funnel(void *); +static void zip_getnetinfo_locked(void *); static void send_phony_reply(void *); /* @@ -742,6 +742,7 @@ void zip_router_input (m, ifID) */ ifID->ifNumRetries = ZIP_NETINFO_RETRIES; netinfo_reply_pending = 1; + ifID->ifGNIScheduled = 1; timeout(zip_sched_getnetinfo, (caddr_t) ifID, 2*ZIP_TIMER_INT); @@ -881,6 +882,7 @@ static void zip_netinfo_reply (netinfo, ifID) ifID->ifThisCableStart, ifID->ifThisCableEnd)); /* The packet is in response to our request */ + ifID->ifGNIScheduled = 0; untimeout (zip_sched_getnetinfo, (caddr_t) ifID); netinfo_reply_pending = 0; zone_name_len = netinfo->data[0]; @@ -965,13 +967,15 @@ int zip_control (ifID, control) switch (control) { case ZIP_ONLINE : case ZIP_LATE_ROUTER : - ifID->ifNumRetries = 0; - /* Get the desired zone name from elap and put it in - * ifID for zip_getnetinfo() to use.
- */ - if (ifID->startup_zone.len) - ifID->ifZoneName = ifID->startup_zone; - zip_getnetinfo(ifID); + if (!ifID->ifGNIScheduled) { + ifID->ifNumRetries = 0; + /* Get the desired zone name from elap and put it in + * ifID for zip_getnetinfo() to use. + */ + if (ifID->startup_zone.len) + ifID->ifZoneName = ifID->startup_zone; + zip_getnetinfo(ifID); + } break; case ZIP_NO_ROUTER : ifID->ifZoneName.len = 1; @@ -988,14 +992,19 @@ int zip_control (ifID, control) return (0); } -/* funnel version of zip_getnetinfo */ -static void zip_getnetinfo_funnel(arg) +/* locked version of zip_getnetinfo */ +static void zip_getnetinfo_locked(arg) void *arg; { - at_ifaddr_t *ifID = (at_ifaddr_t *)arg; - thread_funnel_set(network_flock, TRUE); - zip_getnetinfo(ifID); - thread_funnel_set(network_flock, FALSE); + at_ifaddr_t *ifID; + + atalk_lock(); + ifID = (at_ifaddr_t *)arg; + if (ifID != NULL) { // make sure it hasn't been closed + ifID->ifGNIScheduled = 0; + zip_getnetinfo(ifID); + } + atalk_unlock(); } @@ -1012,6 +1021,7 @@ static void zip_getnetinfo (ifID) void zip_sched_getnetinfo(); register struct atalk_addr *at_dest; register int size; + size = DDP_X_HDR_SIZE + ZIP_X_HDR_SIZE + ifID->ifZoneName.len + 1 + sizeof(struct atalk_addr) + 1; @@ -1022,7 +1032,8 @@ static void zip_getnetinfo (ifID) */ dPrintf(D_M_ZIP, D_L_WARNING, ("zip_getnetinfo: no buffer, call later port=%d\n", ifID->ifPort)); - timeout (zip_getnetinfo_funnel, (caddr_t) ifID, ZIP_TIMER_INT/10); + ifID->ifGNIScheduled = 1; + timeout (zip_getnetinfo_locked, (caddr_t) ifID, ZIP_TIMER_INT/10); return; } @@ -1075,7 +1086,7 @@ static void zip_getnetinfo (ifID) ifID->ifNumRetries++; netinfo_reply_pending = 1; - + ifID->ifGNIScheduled = 1; timeout (zip_sched_getnetinfo, (caddr_t) ifID, ZIP_TIMER_INT); } /* zip_getnetinfo */ @@ -1088,9 +1099,10 @@ static void zip_getnetinfo (ifID) void zip_sched_getnetinfo (ifID) register at_ifaddr_t *ifID; { - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); + atalk_lock(); + + ifID->ifGNIScheduled = 0; if (ifID->ifNumRetries >= ZIP_NETINFO_RETRIES) { /* enough packets sent.... give up!
*/ @@ -1119,7 +1131,7 @@ void zip_sched_getnetinfo (ifID) } else zip_getnetinfo(ifID); - (void) thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } @@ -1263,13 +1275,11 @@ send_phony_reply(arg) void *arg; { gbuf_t *rm = (gbuf_t *)arg; - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); - + atalk_lock(); ddp_input(rm, ifID_home); + atalk_unlock(); - (void) thread_funnel_set(network_flock, FALSE); return; } diff --git a/bsd/netat/ddp_rtmp.c b/bsd/netat/ddp_rtmp.c index c1b40b63e..023418dfc 100644 --- a/bsd/netat/ddp_rtmp.c +++ b/bsd/netat/ddp_rtmp.c @@ -174,10 +174,15 @@ void trackrouter(ifID, net, node) void ddp_age_router(deadrouter) register struct routerinfo *deadrouter; { - register at_ifaddr_t *ourrouter = deadrouter->ifID; - boolean_t funnel_state; + register at_ifaddr_t *ourrouter; - funnel_state = thread_funnel_set(network_flock, TRUE); + atalk_lock(); + + ourrouter = deadrouter->ifID; + if (ourrouter == NULL) { + atalk_unlock(); + return; + } dPrintf(D_M_RTMP, D_L_INFO, ("ddp_age_router called deadrouter=%d:%d\n", NODE(deadrouter), NET(deadrouter))); @@ -238,7 +243,7 @@ void ddp_age_router(deadrouter) } else bzero((caddr_t) deadrouter, sizeof(struct routerinfo)); - (void) thread_funnel_set(network_flock, FALSE); + atalk_unlock(); } /* ddp_age_router */ diff --git a/bsd/netat/ddp_usrreq.c b/bsd/netat/ddp_usrreq.c index 30d2d217e..9331419cb 100644 --- a/bsd/netat/ddp_usrreq.c +++ b/bsd/netat/ddp_usrreq.c @@ -94,7 +94,7 @@ int ddp_pru_attach(struct socket *so, int proto, if (error) return error; pcb = (struct atpcb *)((so)->so_pcb); - pcb->pid = current_proc()->p_pid; + pcb->pid = proc_selfpid(); pcb->ddptype = (u_char) proto; /* set in socreate() */ pcb->proto = ATPROTO_DDP; diff --git a/bsd/netat/debug.h b/bsd/netat/debug.h index 34c8517f5..d59f52235 100644 --- a/bsd/netat/debug.h +++ b/bsd/netat/debug.h @@ -28,7 +28,8 @@ #ifndef _NETAT_DEBUG_H_ #define _NETAT_DEBUG_H_ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE +#ifdef __APPLE_API_OBSOLETE +#ifdef PRIVATE #define D_L_FATAL 0x00000001 #define D_L_ERROR 0x00000002 @@ -260,6 +261,7 @@ static char *at_mid_strings[] = { #endif -#endif /* __APPLE_API_PRIVATE */ +#endif /* PRIVATE */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_DEBUG_H_ */ diff --git a/bsd/netat/drv_dep.c b/bsd/netat/drv_dep.c index 13f845952..8f286db63 100644 --- a/bsd/netat/drv_dep.c +++ b/bsd/netat/drv_dep.c @@ -68,6 +68,8 @@ static llc_header_t snap_hdr_aarp = SNAP_HDR_AARP; static unsigned char snap_proto_ddp[5] = SNAP_PROTO_AT; static unsigned char snap_proto_aarp[5] = SNAP_PROTO_AARP; +static void at_input_packet(protocol_family_t protocol, mbuf_t m); + int pktsIn, pktsOut; struct ifqueue atalkintrq; /* appletalk and aarp packet input queue */ @@ -86,6 +88,7 @@ void atalk_load() { extern int _ATsocket(), _ATgetmsg(), _ATputmsg(); extern int _ATPsndreq(), _ATPsndrsp(), _ATPgetreq(), _ATPgetrsp(); + extern lck_mtx_t *domain_proto_mtx; sys_ATsocket = _ATsocket; sys_ATgetmsg = _ATgetmsg; @@ -114,6 +117,9 @@ void atalk_load() for 2225395 this happens in adsp_open and is undone on ADSP_UNLINK */ + lck_mtx_unlock(domain_proto_mtx); + proto_register_input(PF_APPLETALK, at_input_packet, NULL); + lck_mtx_lock(domain_proto_mtx); } /* atalk_load */ /* Undo everything atalk_load() did. 
*/ @@ -208,7 +214,9 @@ int pat_output(patp, mlist, dst_addr, type) kprintf("po: mlen= %d, m2len= %d\n", m->m_len, (m->m_next)->m_len); #endif - dlil_output(patp->at_dl_tag, m, NULL, &dst, 0); + atalk_unlock(); + dlil_output(patp->aa_ifp, PF_APPLETALK, m, NULL, &dst, 0); + atalk_lock(); pktsOut++; } @@ -216,44 +224,30 @@ int pat_output(patp, mlist, dst_addr, type) return 0; } /* pat_output */ -void atalkintr() +static void +at_input_packet( + __unused protocol_family_t protocol, + mbuf_t m) { - struct mbuf *m, *m1, *mlist = NULL; + struct mbuf *m1; struct ifnet *ifp; - int s; llc_header_t *llc_header; at_ifaddr_t *ifID; char src[6]; enet_header_t *enet_header; - -next: - s = splimp(); - IF_DEQUEUE(&atalkintrq, m); - splx(s); - - if (m == 0) - return; - - for ( ; m ; m = mlist) { - mlist = m->m_nextpkt; -#ifdef APPLETALK_DEBUG - /* packet chains are not yet in use on input */ - if (mlist) kprintf("atalkintr: packet chain\n"); -#endif - m->m_nextpkt = 0; - if (!appletalk_inited) { + if (!appletalk_inited) { m_freem(m); - continue; - } + return; + } - if ((m->m_flags & M_PKTHDR) == 0) { + if ((m->m_flags & M_PKTHDR) == 0) { #ifdef APPLETALK_DEBUG - kprintf("atalkintr: no HDR on packet received"); + kprintf("atalkintr: no HDR on packet received"); #endif m_freem(m); - continue; - } + return; + } /* make sure the interface this packet was received on is configured for AppleTalk */ @@ -265,7 +259,7 @@ next: /* if we didn't find a matching interface */ if (!ifID) { m_freem(m); - continue; /* was EAFNOSUPPORT */ + return; /* was EAFNOSUPPORT */ } /* make sure the entire packet header is in the current mbuf */ @@ -275,13 +269,15 @@ next: kprintf("atalkintr: packet too small\n"); #endif m_freem(m); - continue; + return; } enet_header = mtod(m, enet_header_t *); /* Ignore multicast packets from local station */ /* *** Note: code for IFTYPE_TOKENTALK may be needed here. *** */ - if (ifID->aa_ifp->if_type == IFT_ETHER) { + if (ifID->aa_ifp->if_type == IFT_ETHER || + ifID->aa_ifp->if_type == IFT_L2VLAN || + ifID->aa_ifp->if_type == IFT_IEEE8023ADLAG) { bcopy((char *)enet_header->src, src, sizeof(src)); #ifdef COMMENT /* In order to receive packets from the Blue Box, we cannot @@ -291,7 +287,7 @@ next: (bcmp(src, ifID->xaddr, sizeof(src)) == 0)) { /* Packet rejected: think it's a local mcast. 
*/ m_freem(m); - continue; /* was EAFNOSUPPORT */ + return; /* was EAFNOSUPPORT */ } #endif /* COMMENT */ @@ -321,7 +317,7 @@ next: llc_header->protocol[4]); #endif m_freem(m); - continue; /* was EAFNOSUPPORT */ + return; /* was EAFNOSUPPORT */ } } MCHTYPE(m, MSG_DATA); /* set the mbuf type */ @@ -342,7 +338,5 @@ next: #endif m_freem(m); } - } } - goto next; -} /* atalkintr */ +} diff --git a/bsd/netat/ep.h b/bsd/netat/ep.h index 2f46f707b..fd917a57a 100644 --- a/bsd/netat/ep.h +++ b/bsd/netat/ep.h @@ -31,6 +31,8 @@ #define _NETAT_EP_H_ #include <sys/appleapiopts.h> +#ifdef __APPLE_API_OBSOLETE + #define EP_REQUEST 1 /* Echo request packet */ #define EP_REPLY 2 /* Echo reply packet */ @@ -38,4 +40,5 @@ #define EP_DATA_SIZE 585 /* Maximum size of EP data */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_EP_H_ */ diff --git a/bsd/netat/lap.h b/bsd/netat/lap.h index aec11df28..85542c5d9 100644 --- a/bsd/netat/lap.h +++ b/bsd/netat/lap.h @@ -30,6 +30,8 @@ #define _NETAT_LAP_H_ #include <sys/appleapiopts.h> +#ifdef __APPLE_API_OBSOLETE + #define AT_MID_ELAP 202 /* elap ioctl's */ @@ -88,5 +90,6 @@ #endif /* NOT_USED */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_LAP_H_ */ diff --git a/bsd/netat/nbp.h b/bsd/netat/nbp.h index 8c77bed8b..9bac6ef0f 100644 --- a/bsd/netat/nbp.h +++ b/bsd/netat/nbp.h @@ -54,6 +54,8 @@ #define _NETAT_NBP_H_ #include <sys/appleapiopts.h> +#ifdef __APPLE_API_OBSOLETE + /* NBP packet types */ #define NBP_BRRQ 0x01 /* Broadcast request */ @@ -97,8 +99,7 @@ typedef struct at_nbp { #define DEFAULT_ZONE(zone) (!(zone)->len || ((zone)->len == 1 && (zone)->str[0] == '*')) -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE /* Struct for name registry */ typedef struct _nve_ { @@ -123,8 +124,10 @@ typedef struct _nve_ { #define NBP_WILD_TYPE 0x02 #define NBP_WILD_MASK 0x03 -typedef struct nbp_req { - int (*func)(); +struct nbp_req; +typedef struct nbp_req nbp_req_t; +struct nbp_req { + int (*func)(nbp_req_t *, nve_entry_t *); gbuf_t *response; /* the response datagram */ int space_unused; /* Space available in the resp */ /* packet. 
*/ @@ -134,16 +137,16 @@ typedef struct nbp_req { u_char flags; /* Flags to indicate whether or */ /* not the request tuple has */ /* wildcards in it */ -} nbp_req_t; +}; extern int nbp_insert_entry(nve_entry_t *); extern u_int nbp_strhash (at_nvestr_t *); extern nve_entry_t *nbp_find_nve(nve_entry_t *); -extern int nbp_fillin_nve(); +extern int nbp_fillin_nve(at_entity_t *, nve_entry_t *); extern at_nvestr_t *getSPLocalZone(int); extern at_nvestr_t *getLocalZone(int); -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_NBP_H_ */ diff --git a/bsd/netat/pap.h b/bsd/netat/pap.h index 6abed58d8..51388c274 100644 --- a/bsd/netat/pap.h +++ b/bsd/netat/pap.h @@ -37,6 +37,8 @@ #define _NETAT_PAP_H_ #include <sys/appleapiopts.h> +#ifdef __APPLE_API_OBSOLETE + #define AT_PAP_DATA_SIZE 512 /* Maximum PAP data size */ #define AT_PAP_STATUS_SIZE 255 /* Maximum PAP status length */ #define PAP_TIMEOUT 120 @@ -125,4 +127,5 @@ struct pap_state { int pap_tickle_id; /* the transaction ID for tickles */ }; +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_PAP_H_ */ diff --git a/bsd/netat/routing_tables.h b/bsd/netat/routing_tables.h index 5376ab6a7..f3b46283d 100644 --- a/bsd/netat/routing_tables.h +++ b/bsd/netat/routing_tables.h @@ -32,7 +32,7 @@ #ifndef _NETAT_ROUTING_TABLES_H_ #define _NETAT_ROUTING_TABLES_H_ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE +#ifdef PRIVATE /* RTMP table entry state bitmap (EntryState) values */ @@ -141,7 +141,7 @@ typedef struct { ZT_entry zt; /* the zone table entry */ } ZT_entryno; -#ifdef KERNEL +#ifdef KERNEL_PRIVATE /* Macros for Routing table B-tree easy access */ @@ -191,9 +191,10 @@ extern int zonename_equal(at_nvestr_t *, at_nvestr_t *); extern RT_entry *RT_table_freelist; extern RT_entry RT_table_start; extern RT_entry *RT_table; -extern RT_entry *rt_binsert(); -extern RT_entry *rt_insert(); -extern RT_entry *rt_bdelete(); +extern RT_entry *rt_binsert (RT_entry *); +extern RT_entry *rt_insert( at_net_al NStop, at_net_al NStart, at_net_al NxNet, + at_node NxNode, u_char NtDist, u_char NtPort, u_char EntS); +extern RT_entry *rt_bdelete (at_net_al NetStop, at_net_al NetStart); extern RT_entry *rt_blookup(int); extern RT_entry *rt_getNextRoute(int); @@ -209,9 +210,9 @@ extern int zt_ent_zindex(u_char *); extern ZT_entryno *zt_getNextZone(int); extern void zt_remove_zones(u_char *); extern void zt_set_zmap(u_short, char *); -extern void rtmp_router_input(); +extern void rtmp_router_input(gbuf_t *, at_ifaddr_t *); -#endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ -#endif /* __APPLE_API_PRIVATE */ +#endif /* PRIVATE */ #endif /* _NETAT_ROUTING_TABLES_H_ */ diff --git a/bsd/netat/rtmp.h b/bsd/netat/rtmp.h index 8f7365b21..31b1a1b68 100644 --- a/bsd/netat/rtmp.h +++ b/bsd/netat/rtmp.h @@ -28,6 +28,8 @@ #define _NETAT_RTMP_H_ #include <sys/appleapiopts.h> +#ifdef __APPLE_API_OBSOLETE + /* Changed 03-22-94 for router support LD */ /* RTMP function codes */ @@ -62,4 +64,5 @@ typedef struct { unsigned char at_rtmp_data; } at_rtmp_tuple; +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_RTMP_H_ */ diff --git a/bsd/netat/sys_dep.c b/bsd/netat/sys_dep.c index 0c20c8f1c..272d890dc 100644 --- a/bsd/netat/sys_dep.c +++ b/bsd/netat/sys_dep.c @@ -33,14 +33,16 @@ #include <machine/spl.h> #include <sys/systm.h> #include <sys/kernel.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> /* for p_fd in fdflags */ #include <sys/filedesc.h> #include <sys/fcntl.h> #include <sys/mbuf.h> 
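The nbp.h and routing_tables.h hunks above replace K&R-style empty parameter lists such as "int (*func)();" and "extern RT_entry *rt_binsert();" with full prototypes; for struct nbp_req this requires a forward declaration so the typedef can be used inside the struct's own function-pointer member. A minimal standalone sketch of that forward-declaration pattern (illustrative names, not from this patch):

    #include <stdio.h>

    /* forward declaration: lets the struct's own type appear in the
     * prototype of one of its members */
    struct req;
    typedef struct req req_t;

    struct req {
        int (*func)(req_t *, int);  /* fully prototyped, unlike int (*func)() */
        int space;
    };

    static int accumulate(req_t *r, int v)
    {
        r->space += v;
        return 0;
    }

    int main(void)
    {
        req_t r = { accumulate, 0 };
        r.func(&r, 5);              /* argument types are now checked */
        printf("space = %d\n", r.space);
        return 0;
    }

With the old empty parameter lists the compiler accepted any arguments at a call site; the prototyped form turns such mistakes into compile-time errors.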
#include <sys/malloc.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <sys/sysproto.h> +#include <sys/kdebug.h> #include <net/if_var.h> #include <netat/sysglue.h> @@ -59,17 +61,23 @@ int (*sys_ATPgetrsp)() = 0; extern at_state_t at_state; /* global state of AT network */ extern at_ifaddr_t *ifID_home; /* default interface */ +extern lck_mtx_t * atalk_mutex; + +#define f_flag f_fglob->fg_flag +#define f_type f_fglob->fg_type +#define f_msgcount f_fglob->fg_msgcount +#define f_cred f_fglob->fg_cred +#define f_ops f_fglob->fg_ops +#define f_offset f_fglob->fg_offset +#define f_data f_fglob->fg_data -struct ATsocket_args { - int proto; -}; int ATsocket(proc, uap, retval) - void *proc; + struct proc *proc; struct ATsocket_args *uap; int *retval; { int err; - + atalk_lock(); if (sys_ATsocket) { /* required check for all AppleTalk system calls */ if (!(at_state.flags & AT_ST_STARTED) || !ifID_home) { @@ -82,22 +90,18 @@ int ATsocket(proc, uap, retval) *retval = -1; err = ENXIO; } + atalk_unlock(); return err; } -struct ATgetmsg_args { - int fd; - void *ctlptr; - void *datptr; - int *flags; -}; int ATgetmsg(proc, uap, retval) - void *proc; + struct proc *proc; struct ATgetmsg_args *uap; int *retval; { int err; + atalk_lock(); if (sys_ATgetmsg) { /* required check for all AppleTalk system calls */ if (!(at_state.flags & AT_ST_STARTED) || !ifID_home) { @@ -112,22 +116,18 @@ int ATgetmsg(proc, uap, retval) *retval = -1; err = ENXIO; } + atalk_unlock(); return err; } -struct ATputmsg_args { - int fd; - void *ctlptr; - void *datptr; - int flags; -}; int ATputmsg(proc, uap, retval) - void *proc; + struct proc *proc; struct ATputmsg_args *uap; int *retval; { int err; + atalk_lock(); if (sys_ATputmsg) { /* required check for all AppleTalk system calls */ if (!(at_state.flags & AT_ST_STARTED) || !ifID_home) { @@ -142,22 +142,18 @@ int ATputmsg(proc, uap, retval) *retval = -1; err = ENXIO; } + atalk_unlock(); return err; } -struct ATPsndreq_args { - int fd; - unsigned char *buf; - int len; - int nowait; -}; int ATPsndreq(proc, uap, retval) - void *proc; + struct proc *proc; struct ATPsndreq_args *uap; int *retval; { int err; + atalk_lock(); if (sys_ATPsndreq) { /* required check for all AppleTalk system calls */ if (!(at_state.flags & AT_ST_STARTED) || !ifID_home) { @@ -172,22 +168,18 @@ int ATPsndreq(proc, uap, retval) *retval = -1; err= ENXIO; } + atalk_unlock(); return err; } -struct ATPsndrsp_args { - int fd; - unsigned char *respbuff; - int resplen; - int datalen; -}; int ATPsndrsp(proc, uap, retval) - void *proc; + struct proc *proc; struct ATPsndrsp_args *uap; int *retval; { int err; + atalk_lock(); if (sys_ATPsndrsp) { /* required check for all AppleTalk system calls */ if (!(at_state.flags & AT_ST_STARTED) || !ifID_home) { @@ -202,21 +194,18 @@ int ATPsndrsp(proc, uap, retval) *retval = -1; err = ENXIO; } + atalk_unlock(); return err; } -struct ATPgetreq_args { - int fd; - unsigned char *buf; - int buflen; -}; int ATPgetreq(proc, uap, retval) - void *proc; + struct proc *proc; struct ATPgetreq_args *uap; int *retval; { int err; + atalk_lock(); if (sys_ATPgetreq) { /* required check for all AppleTalk system calls */ if (!(at_state.flags & AT_ST_STARTED) || !ifID_home) { @@ -231,20 +220,18 @@ int ATPgetreq(proc, uap, retval) *retval = -1; err = ENXIO; } + atalk_unlock(); return err; } -struct ATPgetrsp_args { - int fd; - unsigned char *bdsp; -}; int ATPgetrsp(proc, uap, retval) - void *proc; + struct proc *proc; struct 
ATPgetrsp_args *uap; int *retval; { int err = 0; + atalk_lock(); if (sys_ATPgetrsp) { /* required check for all AppleTalk system calls */ if (!(at_state.flags & AT_ST_STARTED) || !ifID_home) { @@ -258,19 +245,16 @@ int ATPgetrsp(proc, uap, retval) *retval = -1; err = ENXIO; } + atalk_unlock(); return err; } -int atalk_closeref(fp, grefp) - struct file *fp; +int atalk_closeref(fg, grefp) + struct fileglob *fg; gref_t **grefp; { - if ((*grefp = (gref_t *)fp->f_data)) { - fp->f_data = 0; -/* - kprintf("atalk_closeref: fp = 0x%x, gref = 0x%x\n", (u_int)fp, - (u_int)*grefp); -*/ + if ((*grefp = (gref_t *)fg->fg_data)) { + fg->fg_data = 0; return(0); } return(EBADF); @@ -283,14 +267,15 @@ int atalk_openref(gref, retfd, proc) { extern int _ATread(), _ATwrite(),_ATioctl(), _ATselect(), _ATclose(), _ATkqfilter(); static struct fileops fileops = - {_ATread, _ATwrite, _ATioctl, _ATselect, _ATclose, _ATkqfilter}; + {_ATread, _ATwrite, _ATioctl, _ATselect, _ATclose, _ATkqfilter, 0}; int err, fd; - struct file *fp; - - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - - if ((err = falloc(proc, &fp, &fd)) != 0) { - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + struct fileproc *fp; + + lck_mtx_assert(atalk_mutex, LCK_MTX_ASSERT_OWNED); + + proc_fdlock(proc); + if ((err = falloc_locked(proc, &fp, &fd, 1)) != 0) { + proc_fdunlock(proc); return err; } @@ -300,45 +285,69 @@ int atalk_openref(gref, retfd, proc) */ fp->f_type = DTYPE_ATALK+1; fp->f_ops = &fileops; + fp->f_data = (void *)gref; + *fdflags(proc, fd) &= ~UF_RESERVED; *retfd = fd; - fp->f_data = (void *)gref; + fp_drop(proc, fd, fp, 1); + proc_fdunlock(proc); /* kprintf("atalk_openref: fp = 0x%x, gref = 0x%x\n", (u_int)fp, (u_int)gref); */ - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); return 0; } -/* go from file descriptor to gref, which has been saved in fp->f_data */ -int atalk_getref(fp, fd, grefp, proc) -struct file *fp; +/* + * go from file descriptor to gref, which has been saved in fp->f_data + * + * This routine returns with an iocount on the fileproc when the fp is null + * as it converts fd to fileproc. 
Callers of this api who pass fp as null + * need to drop the iocount when they are done with the fp + */ +int atalk_getref(fp, fd, grefp, proc, droponerr) +struct fileproc *fp; int fd; gref_t **grefp; struct proc *proc; +int droponerr; { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - if (fp == 0) { - int error = fdgetf(proc, fd, &fp); - - if (error) { - - *grefp = (gref_t *) 0; - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - return EBADF; - } - } - *grefp = (gref_t *)fp->f_data; - if (*grefp == 0 || *grefp == (gref_t *)(-1)) { - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - return EBADF; - } + int error; - if ((*grefp)->errno) { - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - return (int)(*grefp)->errno; - } + proc_fdlock(proc); + error = atalk_getref_locked(fp, fd, grefp, proc, droponerr); + proc_fdunlock(proc); + return error; +} - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - return 0; +int atalk_getref_locked(fp, fd, grefp, proc, droponerr) +struct fileproc *fp; +int fd; +gref_t **grefp; +struct proc *proc; +int droponerr; +{ + lck_mtx_assert(atalk_mutex, LCK_MTX_ASSERT_OWNED); + if (fp == 0) { + int error = fp_lookup(proc, fd, &fp, 1); + + if (error) { + + *grefp = (gref_t *) 0; + return EBADF; + } + } + *grefp = (gref_t *)fp->f_data; + if (*grefp == 0 || *grefp == (gref_t *)(-1)) { + if (droponerr) + fp_drop(proc, fd, fp, 1); + printf("atalk_getref_locked EBADF f_data: %x\n", fp->f_data); + return EBADF; + } + + if ((*grefp)->errno) { + if (droponerr) + fp_drop(proc, fd, fp, 1); + return (int)(*grefp)->errno; + } + return 0; } diff --git a/bsd/netat/sys_glue.c b/bsd/netat/sys_glue.c index 7b6538136..a59859f2f 100644 --- a/bsd/netat/sys_glue.c +++ b/bsd/netat/sys_glue.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -35,12 +35,15 @@ #include <sys/proc.h> #include <sys/filedesc.h> #include <sys/fcntl.h> +#include <sys/file_internal.h> #include <sys/mbuf.h> #include <sys/ioctl.h> #include <sys/malloc.h> +#include <kern/locks.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/ioccom.h> +#include <sys/uio_internal.h> #include <sys/sysctl.h> @@ -55,6 +58,7 @@ #include <netat/debug.h> extern struct atpcb ddp_head; +extern lck_mtx_t * atalk_mutex; extern void ddp_putmsg(gref_t *gref, gbuf_t *m), @@ -84,6 +88,9 @@ at_ddp_stats_t at_ddp_stats; /* DDP statistics */ SYSCTL_STRUCT(_net_appletalk, OID_AUTO, ddpstats, CTLFLAG_RD, &at_ddp_stats, at_ddp_stats, "AppleTalk DDP Stats"); +static void ioccmd_t_32_to_64( ioccmd_t *from_p, user_ioccmd_t *to_p ); +static void ioccmd_t_64_to_32( user_ioccmd_t *from_p, ioccmd_t *to_p ); + atlock_t refall_lock; caddr_t atp_free_cluster_list = 0; @@ -112,7 +119,7 @@ void gref_wput(gref, m) gbuf_freem(gbuf_cont(m)); gbuf_cont(m) = 0; ((ioc_t *)gbuf_rptr(m))->ioc_rval = -1; - ((ioc_t *)gbuf_rptr(m))->ioc_error = EPROTO; + ((ioc_t *)gbuf_rptr(m))->ioc_error = EPROTOTYPE; gbuf_set_type(m, MSG_IOCNAK); atalk_putnext(gref, m); } else @@ -159,7 +166,7 @@ int _ATsocket(proto, err, proc) return -1; } gref->proto = proto; - gref->pid = ((struct proc *)proc)->p_pid; + gref->pid = proc_pid((struct proc *)proc); /* open the specified protocol */ switch (gref->proto) { @@ -211,7 +218,7 @@ int _ATgetmsg(fd, ctlptr, datptr, flags, err, proc) int rc = -1; gref_t *gref; - if ((*err = atalk_getref(0, fd, &gref, proc)) == 0) { + if ((*err = atalk_getref(0, fd, &gref, proc, 1)) == 0) { switch (gref->proto) { case ATPROTO_ASP: rc = ASPgetmsg(gref, ctlptr, datptr, NULL, flags, err); @@ -225,6 +232,7 @@ int _ATgetmsg(fd, ctlptr, datptr, flags, err, proc) *err = EPROTONOSUPPORT; break; } + file_drop(fd); } /* kprintf("_ATgetmsg: return=%d\n", *err);*/ @@ -242,30 +250,31 @@ int _ATputmsg(fd, ctlptr, datptr, flags, err, proc) int rc = -1; gref_t *gref; - if ((*err = atalk_getref(0, fd, &gref, proc)) == 0) { + if ((*err = atalk_getref(0, fd, &gref, proc, 1)) == 0) { switch (gref->proto) { case ATPROTO_ASP: rc = ASPputmsg(gref, ctlptr, datptr, NULL, flags, err); break; default: *err = EPROTONOSUPPORT; break; } + file_drop(fd); } /* kprintf("_ATputmsg: return=%d\n", *err); */ return rc; } -int _ATclose(fp, proc) - struct file *fp; +int _ATclose(fg, proc) + struct fileglob *fg; struct proc *proc; { int err; gref_t *gref; - if ((err = atalk_closeref(fp, &gref)) == 0) { - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + if ((err = atalk_closeref(fg, &gref)) == 0) { + atalk_lock(); (void)gref_close(gref); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + atalk_unlock(); } return err; @@ -281,10 +290,12 @@ int _ATrw(fp, rw, uio, ext) gref_t *gref; gbuf_t *m, *mhead, *mprev; - if ((err = atalk_getref(fp, 0, &gref, 0)) != 0) + /* no need to get/drop iocount as the fp already has one */ + if ((err = atalk_getref_locked(fp, 0, &gref, 0, 1)) != 0) return err; - if ((len = uio->uio_resid) == 0) + // LP64todo - fix this! 
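The _ATgetmsg and _ATputmsg hunks above show the new fileproc reference discipline: atalk_getref() now looks up the fd itself and holds an iocount on the fileproc while the gref is in use, and the caller releases that hold with file_drop(fd) when done. A userspace-flavored sketch of the same acquire/use/release idea (hypothetical table and helper names, not the kernel API):

    #include <stdio.h>

    #define MAXFD 8

    struct fileref { int valid; int iocount; const char *data; };
    static struct fileref ftable[MAXFD];

    /* look up fd and take a hold so the entry cannot be torn down
     * while the caller is still using it */
    static int ref_lookup(int fd, struct fileref **out)
    {
        if (fd < 0 || fd >= MAXFD || !ftable[fd].valid)
            return -1;                  /* EBADF analogue */
        ftable[fd].iocount++;
        *out = &ftable[fd];
        return 0;
    }

    /* release the hold taken by ref_lookup() */
    static void ref_drop(int fd)
    {
        ftable[fd].iocount--;
    }

    int main(void)
    {
        struct fileref *fp;
        ftable[3] = (struct fileref){ 1, 0, "gref" };
        if (ref_lookup(3, &fp) == 0) {
            printf("using %s, iocount=%d\n", fp->data, fp->iocount);
            ref_drop(3);                /* mirrors file_drop(fd) after use */
        }
        return 0;
    }

The droponerr flag in atalk_getref_locked() serves the error paths: when the lookup succeeds but the gref is unusable, the iocount still has to be released before returning.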
+ if ((len = uio_resid(uio)) == 0) return 0; ATDISABLE(s, gref->lock); @@ -293,7 +304,7 @@ int _ATrw(fp, rw, uio, ext) KERNEL_DEBUG(DBG_ADSP_ATRW, 0, gref, len, gref->rdhead, 0); while ((gref->errno == 0) && ((mhead = gref->rdhead) == 0)) { gref->sevents |= POLLMSG; - err = tsleep(&gref->event, PSOCK | PCATCH, "AT read", 0); + err = msleep(&gref->event, atalk_mutex, PSOCK | PCATCH, "AT read", 0); gref->sevents &= ~POLLMSG; if (err != 0) { ATENABLE(s, gref->lock); @@ -359,7 +370,7 @@ int _ATrw(fp, rw, uio, ext) while (!(*gref->writeable)(gref)) { /* flow control on, wait to be enabled to write */ gref->sevents |= POLLSYNC; - err = tsleep(&gref->event, PSOCK | PCATCH, "AT write", 0); + err = msleep(&gref->event, atalk_mutex, PSOCK | PCATCH, "AT write", 0); gref->sevents &= ~POLLSYNC; if (err != 0) { ATENABLE(s, gref->lock); @@ -394,7 +405,7 @@ int _ATrw(fp, rw, uio, ext) } /* _ATrw */ int _ATread(fp, uio, cred, flags, p) - void *fp; + struct fileproc *fp; struct uio *uio; void *cred; int flags; @@ -402,14 +413,14 @@ int _ATread(fp, uio, cred, flags, p) { int stat; - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + atalk_lock(); stat = _ATrw(fp, UIO_READ, uio, 0); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + atalk_unlock(); return stat; } int _ATwrite(fp, uio, cred, flags, p) - void *fp; + struct fileproc *fp; struct uio *uio; void *cred; int flags; @@ -417,10 +428,9 @@ int _ATwrite(fp, uio, cred, flags, p) { int stat; - - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + atalk_lock(); stat = _ATrw(fp, UIO_WRITE, uio, 0); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + atalk_unlock(); return stat; } @@ -431,27 +441,43 @@ int _ATwrite(fp, uio, cred, flags, p) int at_ioctl(gref_t *gref, u_long cmd, caddr_t arg, int fromKernel) { int s, err = 0, len; + u_int size; gbuf_t *m, *mdata; ioc_t *ioc; - ioccmd_t ioccmd; + user_addr_t user_arg; + user_ioccmd_t user_ioccmd; + boolean_t is64bit; /* error if not for us */ if ((cmd & 0xffff) != 0xff99) return EOPNOTSUPP; + size = IOCPARM_LEN(cmd); + if (size != sizeof(user_addr_t)) + return EINVAL; + + user_arg = *((user_addr_t *)arg); + /* copy in ioc command info */ -/* - kprintf("at_ioctl: arg ioccmd.ic_cmd=%x ic_len=%x gref->lock=%x, gref->event=%x\n", - ((ioccmd_t *)arg)->ic_cmd, ((ioccmd_t *)arg)->ic_len, - gref->lock, gref->event); -*/ - if (fromKernel) - bcopy (arg, &ioccmd, sizeof (ioccmd_t)); + is64bit = proc_is64bit(current_proc()); + if (fromKernel) { + ioccmd_t tmp; + bcopy (CAST_DOWN(caddr_t, user_arg), &tmp, sizeof (tmp)); + ioccmd_t_32_to_64(&tmp, &user_ioccmd); + } else { - if ((err = copyin((caddr_t)arg, (caddr_t)&ioccmd, sizeof(ioccmd_t))) != 0) { + if (is64bit) { + err = copyin(user_arg, (caddr_t)&user_ioccmd, sizeof(user_ioccmd)); + } + else { + ioccmd_t tmp; + err = copyin(user_arg, (caddr_t)&tmp, sizeof(tmp)); + ioccmd_t_32_to_64(&tmp, &user_ioccmd); + } + if (err != 0) { #ifdef APPLETALK_DEBUG - kprintf("at_ioctl: err = %d, copyin(%x, %x, %d)\n", err, - (caddr_t)arg, (caddr_t)&ioccmd, sizeof(ioccmd_t)); + kprintf("at_ioctl: err = %d, copyin(%llx, %x, %d)\n", err, + user_arg, (caddr_t)&user_ioccmd, sizeof(user_ioccmd)); #endif return err; } @@ -466,27 +492,27 @@ int at_ioctl(gref_t *gref, u_long cmd, caddr_t arg, int fromKernel) /* create the ioc command second mbuf contains the actual ASP command */ - if (ioccmd.ic_len) { - if ((gbuf_cont(m) = gbuf_alloc(ioccmd.ic_len, PRI_HI)) == 0) { + if (user_ioccmd.ic_len) { + if ((gbuf_cont(m) = gbuf_alloc(user_ioccmd.ic_len, PRI_HI)) == 0) { gbuf_freem(m); 
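The tsleep()-to-msleep() conversions in _ATrw above follow the pattern applied throughout this patch: the wait channel is unchanged, but msleep() also names atalk_mutex, which is atomically released while the thread sleeps and reacquired before the call returns. The closest userspace analogue is a condition-variable wait; a minimal pthreads sketch (illustrative only):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  event = PTHREAD_COND_INITIALIZER;
    static int ready = 0;

    static void *producer(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lock);
        ready = 1;
        pthread_cond_signal(&event);    /* plays the role of wakeup(chan) */
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        pthread_mutex_lock(&lock);
        pthread_create(&t, NULL, producer, NULL);
        /* like msleep(chan, mutex, ...): the mutex is dropped while
         * waiting and held again by the time the wait returns */
        while (!ready)
            pthread_cond_wait(&event, &lock);
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);
        printf("woke with the lock held, ready=%d\n", ready);
        return 0;
    }

This is why the conversion matters: the old funnel was implicitly released when a thread blocked in tsleep(), whereas a plain mutex held across a sleep would deadlock the waker.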
#ifdef APPLETALK_DEBUG kprintf("at_ioctl: gbuf_alloc err=%d\n",ENOBUFS); #endif return ENOBUFS; } - gbuf_wset(gbuf_cont(m), ioccmd.ic_len); /* mbuf->m_len */ + gbuf_wset(gbuf_cont(m), user_ioccmd.ic_len); /* mbuf->m_len */ if (fromKernel) - bcopy (ioccmd.ic_dp, gbuf_rptr(gbuf_cont(m)), ioccmd.ic_len); + bcopy (CAST_DOWN(caddr_t, user_ioccmd.ic_dp), gbuf_rptr(gbuf_cont(m)), user_ioccmd.ic_len); else { - if ((err = copyin((caddr_t)ioccmd.ic_dp, (caddr_t)gbuf_rptr(gbuf_cont(m)), ioccmd.ic_len)) != 0) { + if ((err = copyin(user_ioccmd.ic_dp, (caddr_t)gbuf_rptr(gbuf_cont(m)), user_ioccmd.ic_len)) != 0) { gbuf_freem(m); return err; } } } ioc = (ioc_t *) gbuf_rptr(m); - ioc->ioc_cmd = ioccmd.ic_cmd; - ioc->ioc_count = ioccmd.ic_len; + ioc->ioc_cmd = user_ioccmd.ic_cmd; + ioc->ioc_count = user_ioccmd.ic_len; ioc->ioc_error = 0; ioc->ioc_rval = 0; @@ -500,7 +526,7 @@ int at_ioctl(gref_t *gref, u_long cmd, caddr_t arg, int fromKernel) #ifdef APPLETALK_DEBUG kprintf("sleep gref = 0x%x\n", (unsigned)gref); #endif - err = tsleep(&gref->iocevent, PSOCK | PCATCH, "AT ioctl", 0); + err = msleep(&gref->iocevent, atalk_mutex, PSOCK | PCATCH, "AT ioctl", 0); gref->sevents &= ~POLLPRI; if (err != 0) { ATENABLE(s, gref->lock); @@ -527,19 +553,19 @@ int at_ioctl(gref_t *gref, u_long cmd, caddr_t arg, int fromKernel) /* process the ioc response */ ioc = (ioc_t *) gbuf_rptr(m); if ((err = ioc->ioc_error) == 0) { - ioccmd.ic_timout = ioc->ioc_rval; - ioccmd.ic_len = 0; + user_ioccmd.ic_timout = ioc->ioc_rval; + user_ioccmd.ic_len = 0; mdata = gbuf_cont(m); - if (mdata && ioccmd.ic_dp) { - ioccmd.ic_len = gbuf_msgsize(mdata); + if (mdata && user_ioccmd.ic_dp) { + user_ioccmd.ic_len = gbuf_msgsize(mdata); for (len = 0; mdata; mdata = gbuf_cont(mdata)) { if (fromKernel) - bcopy (gbuf_rptr(mdata), &ioccmd.ic_dp[len], gbuf_len(mdata)); + bcopy (gbuf_rptr(mdata), CAST_DOWN(caddr_t, (user_ioccmd.ic_dp + len)), gbuf_len(mdata)); else { - if ((err = copyout((caddr_t)gbuf_rptr(mdata), (caddr_t)&ioccmd.ic_dp[len], gbuf_len(mdata))) < 0) { + if ((err = copyout((caddr_t)gbuf_rptr(mdata), (user_ioccmd.ic_dp + len), gbuf_len(mdata))) != 0) { #ifdef APPLETALK_DEBUG kprintf("at_ioctl: len=%d error copyout=%d from=%x to=%x gbuf_len=%x\n", - len, err, (caddr_t)gbuf_rptr(mdata), (caddr_t)&ioccmd.ic_dp[len], gbuf_len(mdata)); + len, err, (caddr_t)gbuf_rptr(mdata), (caddr_t)&user_ioccmd.ic_dp[len], gbuf_len(mdata)); #endif goto l_done; } @@ -548,14 +574,21 @@ } } - if (fromKernel) - bcopy (&ioccmd, arg, sizeof(ioccmd_t)); + if (fromKernel) { + ioccmd_t tmp; + ioccmd_t_64_to_32(&user_ioccmd, &tmp); + bcopy (&tmp, CAST_DOWN(caddr_t, user_arg), sizeof(tmp)); + } else { - if ((err = copyout((caddr_t)&ioccmd, (caddr_t)arg, sizeof(ioccmd_t))) != 0) { -#ifdef APPLETALK_DEBUG - kprintf("at_ioctl: error copyout2=%d from=%x to=%x len=%d\n", - err, &ioccmd, arg, sizeof(ioccmd_t)); -#endif + if (is64bit) { + err = copyout((caddr_t)&user_ioccmd, user_arg, sizeof(user_ioccmd)); + } + else { + ioccmd_t tmp; + ioccmd_t_64_to_32(&user_ioccmd, &tmp); + err = copyout((caddr_t)&tmp, user_arg, sizeof(tmp)); + } + if (err != 0) { goto l_done; } } @@ -576,8 +609,9 @@ int _ATioctl(fp, cmd, arg, proc) int err; gref_t *gref; - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - if ((err = atalk_getref(fp, 0, &gref, 0)) != 0) { + atalk_lock(); + /* No need to get a reference on fp as it already has one */ + if ((err = atalk_getref_locked(fp, 0, &gref, 0, 0)) != 0) { #ifdef APPLETALK_DEBUG
kprintf("_ATioctl: atalk_getref err = %d\n", err); #endif @@ -585,13 +619,13 @@ int _ATioctl(fp, cmd, arg, proc) else err = at_ioctl(gref, cmd, arg, 0); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + atalk_unlock(); return err; } int _ATselect(fp, which, wql, proc) - struct file *fp; + struct fileproc *fp; int which; void * wql; struct proc *proc; @@ -599,9 +633,10 @@ int _ATselect(fp, which, wql, proc) int s, err, rc = 0; gref_t *gref; - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - err = atalk_getref(fp, 0, &gref, 0); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + atalk_lock(); + /* no need to drop the iocount as select covers that */ + err = atalk_getref_locked(fp, 0, &gref, 0, 0); + atalk_unlock(); if (err != 0) rc = 1; @@ -633,7 +668,7 @@ int _ATselect(fp, which, wql, proc) } int _ATkqfilter(fp, kn, p) - struct file *fp; + struct fileproc *fp; struct knote *kn; struct proc *p; { @@ -1317,3 +1352,20 @@ void ioc_ack(errno, m, gref) atalk_putnext(gref, m); } + +static void ioccmd_t_32_to_64( ioccmd_t *from_p, user_ioccmd_t *to_p ) +{ + to_p->ic_cmd = from_p->ic_cmd; + to_p->ic_timout = from_p->ic_timout; + to_p->ic_len = from_p->ic_len; + to_p->ic_dp = CAST_USER_ADDR_T(from_p->ic_dp); +} + + +static void ioccmd_t_64_to_32( user_ioccmd_t *from_p, ioccmd_t *to_p ) +{ + to_p->ic_cmd = from_p->ic_cmd; + to_p->ic_timout = from_p->ic_timout; + to_p->ic_len = from_p->ic_len; + to_p->ic_dp = CAST_DOWN(caddr_t, from_p->ic_dp); +} diff --git a/bsd/netat/sysglue.h b/bsd/netat/sysglue.h index 4235ac253..6aa9e37c6 100644 --- a/bsd/netat/sysglue.h +++ b/bsd/netat/sysglue.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -35,6 +35,9 @@ #ifndef _NETAT_SYSGLUE_H_ #define _NETAT_SYSGLUE_H_ #include <sys/appleapiopts.h> +#include <sys/cdefs.h> + +#ifdef __APPLE_API_OBSOLETE /* The following is originally from netat/h/localglue.h, which was @@ -48,6 +51,31 @@ typedef struct { char *ic_dp; } ioccmd_t; +#ifdef KERNEL +#ifdef KERNEL_PRIVATE + +/* LP64 version of ioccmd_t. all pointers + * grow when we're dealing with a 64-bit process. + * WARNING - keep in sync with ioccmd_t + */ +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +typedef struct { + int ic_cmd; + int ic_timout; + int ic_len; + user_addr_t ic_dp; +} user_ioccmd_t; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +#endif // KERNEL_PRIVATE +#endif // KERNEL + typedef struct { int ioc_cmd; void *ioc_cr; @@ -89,7 +117,7 @@ typedef struct { #endif #ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE #define SYS_HZ HZ /* Number of clock (SYS_SETTIMER) ticks per second */ #define HZ hz /* HZ ticks definition used throughout AppleTalk */ @@ -100,7 +128,6 @@ typedef struct { * in MacOSX. Need to find a better Error code ###LD */ #define ENOTREADY ESHUTDOWN -#define EPROTO EPROTOTYPE /* T_MPSAFE is used only in atp_open. I suspect it's a * trick to accelerate local atp transactions. 
@@ -130,11 +157,11 @@ typedef struct { #endif typedef int atevent_t; -typedef simple_lock_t atlock_t; +typedef int atlock_t; typedef int *atomic_p; #define ATLOCKINIT(a) (a = (atlock_t) EVENT_NULL) -#define ATDISABLE(l, a) (l = splimp()) -#define ATENABLE(l, a) splx(l) +#define ATDISABLE(l, a) +#define ATENABLE(l, a) #define ATEVENTINIT(a) (a = (atevent_t) EVENT_NULL) #define DDP_OUTPUT(m) ddp_putmsg(0,m) #define StaticProc static @@ -187,6 +214,7 @@ int gbuf_msgsize(gbuf_t *m); #undef timeout #undef untimeout -#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL_PRIVATE */ #endif /* KERNEL */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_SYSGLUE_H_ */ diff --git a/bsd/netat/zip.h b/bsd/netat/zip.h index 457c246cb..fd0b38a5f 100644 --- a/bsd/netat/zip.h +++ b/bsd/netat/zip.h @@ -32,6 +32,8 @@ #define _NETAT_ZIP_H_ #include <sys/appleapiopts.h> +#ifdef __APPLE_API_OBSOLETE + /* Definitions for ZIP, per AppleTalk Zone Information Protocol * documentation from `Inside AppleTalk', July 14, 1986. */ @@ -90,4 +92,5 @@ typedef struct { #define ZIP_RE_AARP -1 +#endif /* __APPLE_API_OBSOLETE */ #endif /* _NETAT_ZIP_H_ */ diff --git a/bsd/netinet/Makefile b/bsd/netinet/Makefile index a680385a0..de3d2890a 100644 --- a/bsd/netinet/Makefile +++ b/bsd/netinet/Makefile @@ -20,29 +20,36 @@ EXPINC_SUBDIRS_PPC = \ EXPINC_SUBDIRS_I386 = \ DATAFILES = \ - bootp.h icmp6.h icmp_var.h if_ether.h \ - igmp.h igmp_var.h in.h in_gif.h in_pcb.h \ - in_systm.h in_var.h ip.h ip6.h ip_compat.h \ - ip_dummynet.h ip_ecn.h ip_encap.h \ - ip_fw.h ip_icmp.h ip_mroute.h \ - ip_var.h tcp.h \ - tcp_debug.h tcp_fsm.h tcp_seq.h tcp_timer.h tcp_var.h \ + bootp.h icmp6.h if_ether.h icmp_var.h \ + igmp.h igmp_var.h in.h in_pcb.h \ + in_systm.h in_var.h ip.h ip6.h \ + ip_fw.h ip_fw2.h \ + ip_icmp.h ip_mroute.h ip_var.h tcp.h \ + tcp_fsm.h tcp_seq.h tcp_timer.h tcp_var.h \ tcpip.h udp.h udp_var.h +KERNELFILES = \ + kpi_ipfilter.h in_arp.h + PRIVATE_DATAFILES = \ - if_fddi.h if_atm.h ip_flow.h + if_fddi.h if_atm.h ip_dummynet.h \ + tcp_debug.h \ + in_gif.h ip_compat.h + +PRIVATE_KERNELFILES = ${KERNELFILES} \ + ip_ecn.h ip_encap.h ip_flow.h INSTALL_MI_LIST = ${DATAFILES} INSTALL_MI_DIR = netinet -EXPORT_MI_LIST = ${DATAFILES} +EXPORT_MI_LIST = ${DATAFILES} ${KERNELFILES} -EXPORT_MI_DIR = netinet +EXPORT_MI_DIR = ${INSTALL_MI_DIR} INSTALL_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} -INSTALL_MI_LCL_KERN_LIST = ${PRIVATE_DATAFILES} +INSTALL_KF_MI_LCL_LIST = ${INSTALL_MI_LCL_LIST} ${PRIVATE_KERNELFILES} include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/netinet/bootp.h b/bsd/netinet/bootp.h index 4de5d87cd..8cbbfa7c9 100644 --- a/bsd/netinet/bootp.h +++ b/bsd/netinet/bootp.h @@ -33,6 +33,8 @@ * packet. 
*/ +#include <netinet/udp.h> + #define iaddr_t struct in_addr struct bootp { diff --git a/bsd/netinet/dhcp_options.h b/bsd/netinet/dhcp_options.h index cbef0e298..8c5daabde 100644 --- a/bsd/netinet/dhcp_options.h +++ b/bsd/netinet/dhcp_options.h @@ -174,7 +174,7 @@ typedef struct { typedef ptrlist_t dhcpol_t; -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE void dhcpol_init(dhcpol_t * list); void dhcpol_free(dhcpol_t * list); @@ -194,5 +194,5 @@ boolean_t dhcpol_parse_vendor(dhcpol_t * vendor, dhcpol_t * options, unsigned char * err); void dhcpol_print(dhcpol_t * list); -#endif /* __APPLE_API_PRIVATE */ +#endif KERNEL_PRIVATE #endif /* _NETINET_DHCP_OPTIONS_H */ diff --git a/bsd/netinet/icmp6.h b/bsd/netinet/icmp6.h index aaa2ee50d..87380d493 100644 --- a/bsd/netinet/icmp6.h +++ b/bsd/netinet/icmp6.h @@ -506,7 +506,6 @@ struct icmp6_filter { }; #ifdef KERNEL -#ifdef __APPLE_API_UNSTABLE #define ICMP6_FILTER_SETPASSALL(filterp) \ do { \ int i; u_char *p; \ @@ -516,7 +515,6 @@ do { \ } while (0) #define ICMP6_FILTER_SETBLOCKALL(filterp) \ bzero(filterp, sizeof(struct icmp6_filter)) -#endif /* __APPLE_API_UNSTABLE */ #else /* KERNEL */ #define ICMP6_FILTER_SETPASSALL(filterp) \ memset(filterp, 0xff, sizeof(struct icmp6_filter)) @@ -533,7 +531,6 @@ do { \ #define ICMP6_FILTER_WILLBLOCK(type, filterp) \ ((((filterp)->icmp6_filt[(type) >> 5]) & (1 << ((type) & 31))) == 0) -#ifdef __APPLE_API_UNSTABLE /* * Variables related to this implementation * of the internet control message protocol version 6. @@ -618,6 +615,7 @@ struct icmp6stat { #define ICMPV6CTL_ND6_PRLIST 20 #define ICMPV6CTL_MAXID 21 +#ifdef KERNEL_PRIVATE #define ICMPV6CTL_NAMES { \ { 0, 0 }, \ { 0, 0 }, \ @@ -641,29 +639,26 @@ struct icmp6stat { { 0, 0 }, \ { 0, 0 }, \ } -#endif /* __APPLE_API_UNSTABLE */ #define RTF_PROBEMTU RTF_PROTO1 -#ifdef KERNEL # ifdef __STDC__ struct rtentry; struct rttimer; struct in6_multi; # endif -#ifdef __APPLE_API_PRIVATE -void icmp6_init __P((void)); -void icmp6_paramerror __P((struct mbuf *, int)); -void icmp6_error __P((struct mbuf *, int, int, int)); -int icmp6_input __P((struct mbuf **, int *)); -void icmp6_fasttimo __P((void)); -void icmp6_reflect __P((struct mbuf *, size_t)); -void icmp6_prepare __P((struct mbuf *)); -void icmp6_redirect_input __P((struct mbuf *, int)); -void icmp6_redirect_output __P((struct mbuf *, struct rtentry *)); +void icmp6_init(void); +void icmp6_paramerror(struct mbuf *, int); +void icmp6_error(struct mbuf *, int, int, int); +int icmp6_input(struct mbuf **, int *); +void icmp6_fasttimo(void); +void icmp6_reflect(struct mbuf *, size_t); +void icmp6_prepare(struct mbuf *); +void icmp6_redirect_input(struct mbuf *, int); +void icmp6_redirect_output(struct mbuf *, struct rtentry *); struct ip6ctlparam; -void icmp6_mtudisc_update __P((struct ip6ctlparam *, int)); +void icmp6_mtudisc_update(struct ip6ctlparam *, int); /* XXX: is this the right place for these macros? 
*/ #define icmp6_ifstat_inc(ifp, tag) \ @@ -730,7 +725,6 @@ do { \ extern int icmp6_rediraccept; /* accept/process redirects */ extern int icmp6_redirtimeout; /* cache time for redirect routes */ -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#endif KERNEL_PRIVATE #endif /* !_NETINET_ICMP6_H_ */ diff --git a/bsd/netinet/icmp_var.h b/bsd/netinet/icmp_var.h index cc55d4fd1..ec3a8ef5c 100644 --- a/bsd/netinet/icmp_var.h +++ b/bsd/netinet/icmp_var.h @@ -58,7 +58,6 @@ #ifndef _NETINET_ICMP_VAR_H_ #define _NETINET_ICMP_VAR_H_ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_UNSTABLE /* * Variables related to this implementation @@ -90,6 +89,7 @@ struct icmpstat { #define ICMPCTL_TIMESTAMP 4 /* allow replies to time stamp requests */ #define ICMPCTL_MAXID 5 +#ifdef KERNEL_PRIVATE #define ICMPCTL_NAMES { \ { 0, 0 }, \ { "maskrepl", CTLTYPE_INT }, \ @@ -97,13 +97,10 @@ struct icmpstat { { "icmplim", CTLTYPE_INT }, \ { "icmptimestamp", CTLTYPE_INT }, \ } -#endif /* __APPLE_API_UNSTABLE */ -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE SYSCTL_DECL(_net_inet_icmp); #ifdef ICMP_BANDLIM -extern int badport_bandlim __P((int)); +extern int badport_bandlim(int); #endif #define BANDLIM_UNLIMITED -1 #define BANDLIM_ICMP_UNREACH 0 @@ -112,6 +109,5 @@ extern int badport_bandlim __P((int)); #define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */ #define BANDLIM_RST_OPENPORT 4 /* No connection, listener */ #define BANDLIM_MAX 4 -#endif /* __APPLE_API_PRIVATE */ -#endif -#endif +#endif KERNEL_PRIVATE +#endif _NETINET_ICMP_VAR_H_ diff --git a/bsd/netinet/if_atm.h b/bsd/netinet/if_atm.h index 9c31d421b..e1b147905 100644 --- a/bsd/netinet/if_atm.h +++ b/bsd/netinet/if_atm.h @@ -59,13 +59,13 @@ */ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE struct atm_pseudohdr; struct mbuf; struct rtentry; struct sockaddr; -void atm_rtrequest __P((int, struct rtentry *, struct sockaddr *)); -int atmresolve __P((struct rtentry *, struct mbuf *, struct sockaddr *, - struct atm_pseudohdr *)); -#endif /* __APPLE_API_PRIVATE */ +void atm_rtrequest(int, struct rtentry *, struct sockaddr *); +int atmresolve(struct rtentry *, struct mbuf *, struct sockaddr *, + struct atm_pseudohdr *); +#endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet/if_ether.c b/bsd/netinet/if_ether.c deleted file mode 100644 index 499057940..000000000 --- a/bsd/netinet/if_ether.c +++ /dev/null @@ -1,923 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1982, 1986, 1988, 1993 - * The Regents of the University of California. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)if_ether.c 8.1 (Berkeley) 6/10/93 - * $FreeBSD: src/sys/netinet/if_ether.c,v 1.64.2.11 2001/07/25 17:27:56 jlemon Exp $ - */ - -/* - * Ethernet address resolution protocol. - * TODO: - * add "inuse/lock" bit (or ref. 
count) along with valid bit - */ - -#include <sys/param.h> -#include <sys/kernel.h> -#include <sys/queue.h> -#include <sys/sysctl.h> -#include <sys/systm.h> -#include <sys/mbuf.h> -#include <sys/malloc.h> -#include <sys/socket.h> -#include <sys/syslog.h> - -#include <net/if.h> -#include <net/if_dl.h> -#include <net/if_types.h> -#include <net/route.h> -#include <net/netisr.h> -#include <net/if_llc.h> -#if BRIDGE -#include <net/ethernet.h> -#include <net/bridge.h> -#endif - -#include <netinet/in.h> -#include <netinet/in_var.h> -#include <netinet/if_ether.h> - -#include <net/iso88025.h> - -#define SIN(s) ((struct sockaddr_in *)s) -#define SDL(s) ((struct sockaddr_dl *)s) - -SYSCTL_DECL(_net_link_ether); -SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); - -/* timer values */ -static int arpt_prune = (5*60*1); /* walk list every 5 minutes */ -static int arpt_keep = (20*60); /* once resolved, good for 20 more minutes */ -static int arpt_down = 20; /* once declared down, don't send for 20 sec */ - -/* Apple Hardware SUM16 checksuming */ -int apple_hwcksum_tx = 1; -int apple_hwcksum_rx = 1; - -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, prune_intvl, CTLFLAG_RW, - &arpt_prune, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW, - &arpt_keep, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, host_down_time, CTLFLAG_RW, - &arpt_down, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, apple_hwcksum_tx, CTLFLAG_RW, - &apple_hwcksum_tx, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, apple_hwcksum_rx, CTLFLAG_RW, - &apple_hwcksum_rx, 0, ""); - -#define rt_expire rt_rmx.rmx_expire - -struct llinfo_arp { - LIST_ENTRY(llinfo_arp) la_le; - struct rtentry *la_rt; - struct mbuf *la_hold; /* last packet until resolved/timeout */ - long la_asked; /* last time we QUERIED for this addr */ -#define la_timer la_rt->rt_rmx.rmx_expire /* deletion time in seconds */ -}; - -static LIST_HEAD(, llinfo_arp) llinfo_arp; - -struct ifqueue arpintrq = {0, 0, 0, 50}; -static int arp_inuse, arp_allocated; - -static int arp_maxtries = 5; -static int useloopback = 1; /* use loopback interface for local traffic */ -static int arp_proxyall = 0; -static int arp_init_called = 0; - -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW, - &arp_maxtries, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW, - &useloopback, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW, - &arp_proxyall, 0, ""); - -void arp_rtrequest __P((int, struct rtentry *, struct sockaddr *)); -static void arprequest __P((struct arpcom *, - struct in_addr *, struct in_addr *, u_char *)); -void arpintr __P((void)); -static void arptfree __P((struct llinfo_arp *)); -static void arptimer __P((void *)); -static u_char etherbroadcastaddr[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; -static struct llinfo_arp - *arplookup __P((u_long, int, int)); -#if INET -static void in_arpinput __P((struct mbuf *)); -#endif - -/* - * Timeout routine. Age arp_tab entries periodically. 
- */ -/* ARGSUSED */ -static void -arptimer(ignored_arg) - void *ignored_arg; -{ -#ifdef __APPLE__ - boolean_t funnel_state = thread_funnel_set(network_flock, TRUE); -#endif - int s = splnet(); - register struct llinfo_arp *la = llinfo_arp.lh_first; - struct llinfo_arp *ola; - - timeout(arptimer, (caddr_t)0, arpt_prune * hz); - while ((ola = la) != 0) { - register struct rtentry *rt = la->la_rt; - la = la->la_le.le_next; - if (rt->rt_expire && rt->rt_expire <= time_second) - arptfree(ola); /* timer has expired, clear */ - } - splx(s); -#ifdef __APPLE__ - (void) thread_funnel_set(network_flock, FALSE); -#endif -} - -/* - * Parallel to llc_rtrequest. - */ -void -arp_rtrequest(req, rt, sa) - int req; - register struct rtentry *rt; - struct sockaddr *sa; -{ - register struct sockaddr *gate = rt->rt_gateway; - register struct llinfo_arp *la = (struct llinfo_arp *)rt->rt_llinfo; - static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; - static int arpinit_done; - - if (!arpinit_done) { - arpinit_done = 1; - LIST_INIT(&llinfo_arp); - timeout(arptimer, (caddr_t)0, hz); -#ifndef __APPLE__ - register_netisr(NETISR_ARP, arpintr); -#endif - } - if (rt->rt_flags & RTF_GATEWAY) - return; - switch (req) { - - case RTM_ADD: - /* - * XXX: If this is a manually added route to interface - * such as older version of routed or gated might provide, - * restore cloning bit. - */ - if ((rt->rt_flags & RTF_HOST) == 0 && - SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff) - rt->rt_flags |= RTF_CLONING; - if (rt->rt_flags & RTF_CLONING) { - /* - * Case 1: This route should come from a route to iface. - */ - rt_setgate(rt, rt_key(rt), - (struct sockaddr *)&null_sdl); - gate = rt->rt_gateway; - SDL(gate)->sdl_type = rt->rt_ifp->if_type; - SDL(gate)->sdl_index = rt->rt_ifp->if_index; - rt->rt_expire = time_second; - break; - } - /* Announce a new entry if requested. */ - if (rt->rt_flags & RTF_ANNOUNCE) - arprequest((struct arpcom *)rt->rt_ifp, - &SIN(rt_key(rt))->sin_addr, - &SIN(rt_key(rt))->sin_addr, - (u_char *)LLADDR(SDL(gate))); - /*FALLTHROUGH*/ - case RTM_RESOLVE: - if (gate->sa_family != AF_LINK || - gate->sa_len < sizeof(null_sdl)) { - log(LOG_DEBUG, "arp_rtrequest: bad gateway value\n"); - break; - } - SDL(gate)->sdl_type = rt->rt_ifp->if_type; - SDL(gate)->sdl_index = rt->rt_ifp->if_index; - if (la != 0) - break; /* This happens on a route change */ - /* - * Case 2: This route may come from cloning, or a manual route - * add with a LL address. - */ - R_Malloc(la, struct llinfo_arp *, sizeof(*la)); - rt->rt_llinfo = (caddr_t)la; - if (la == 0) { - log(LOG_DEBUG, "arp_rtrequest: malloc failed\n"); - break; - } - arp_inuse++, arp_allocated++; - Bzero(la, sizeof(*la)); - la->la_rt = rt; - rt->rt_flags |= RTF_LLINFO; - LIST_INSERT_HEAD(&llinfo_arp, la, la_le); - -#if INET - /* - * This keeps the multicast addresses from showing up - * in `arp -a' listings as unresolved. It's not actually - * functional. Then the same for broadcast. 
- */ - if (IN_MULTICAST(ntohl(SIN(rt_key(rt))->sin_addr.s_addr))) { - ETHER_MAP_IP_MULTICAST(&SIN(rt_key(rt))->sin_addr, - LLADDR(SDL(gate))); - SDL(gate)->sdl_alen = 6; - rt->rt_expire = 0; - } - if (in_broadcast(SIN(rt_key(rt))->sin_addr, rt->rt_ifp)) { - memcpy(LLADDR(SDL(gate)), etherbroadcastaddr, 6); - SDL(gate)->sdl_alen = 6; - rt->rt_expire = time_second; - } -#endif - - if (SIN(rt_key(rt))->sin_addr.s_addr == - (IA_SIN(rt->rt_ifa))->sin_addr.s_addr) { - /* - * This test used to be - * if (loif.if_flags & IFF_UP) - * It allowed local traffic to be forced - * through the hardware by configuring the loopback down. - * However, it causes problems during network configuration - * for boards that can't receive packets they send. - * It is now necessary to clear "useloopback" and remove - * the route to force traffic out to the hardware. - */ - rt->rt_expire = 0; - Bcopy(((struct arpcom *)rt->rt_ifp)->ac_enaddr, - LLADDR(SDL(gate)), SDL(gate)->sdl_alen = 6); - if (useloopback) - rt->rt_ifp = loif; - - } - break; - - case RTM_DELETE: - if (la == 0) - break; - arp_inuse--; - LIST_REMOVE(la, la_le); - rt->rt_llinfo = 0; - rt->rt_flags &= ~RTF_LLINFO; - if (la->la_hold) - m_freem(la->la_hold); - Free((caddr_t)la); - } -} - -/* - * Broadcast an ARP packet, asking who has addr on interface ac. - */ -void -arpwhohas(ac, addr) - struct arpcom *ac; - struct in_addr *addr; -{ - struct ifnet *ifp = (struct ifnet *)ac; - struct ifaddr *ifa = TAILQ_FIRST(&ifp->if_addrhead); - - while (ifa) { - if (ifa->ifa_addr->sa_family == AF_INET) { - arprequest(ac, &SIN(ifa->ifa_addr)->sin_addr, addr, ac->ac_enaddr); - return; - } - ifa = TAILQ_NEXT(ifa, ifa_link); - } - return; /* XXX */ -} - -/* - * Broadcast an ARP request. Caller specifies: - * - arp header source ip address - * - arp header target ip address - * - arp header source ethernet address - */ -static void -arprequest(ac, sip, tip, enaddr) - register struct arpcom *ac; - register struct in_addr *sip, *tip; - register u_char *enaddr; -{ - register struct mbuf *m; - register struct ether_header *eh; - register struct ether_arp *ea; - struct sockaddr sa; - static u_char llcx[] = { 0x82, 0x40, LLC_SNAP_LSAP, LLC_SNAP_LSAP, - LLC_UI, 0x00, 0x00, 0x00, 0x08, 0x06 }; - - if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL) - return; - m->m_pkthdr.rcvif = (struct ifnet *)0; - switch (ac->ac_if.if_type) { - case IFT_ISO88025: - m->m_len = sizeof(*ea) + sizeof(llcx); - m->m_pkthdr.len = sizeof(*ea) + sizeof(llcx); - MH_ALIGN(m, sizeof(*ea) + sizeof(llcx)); - (void)memcpy(mtod(m, caddr_t), llcx, sizeof(llcx)); - (void)memcpy(sa.sa_data, etherbroadcastaddr, 6); - (void)memcpy(sa.sa_data + 6, enaddr, 6); - sa.sa_data[6] |= TR_RII; - sa.sa_data[12] = TR_AC; - sa.sa_data[13] = TR_LLC_FRAME; - ea = (struct ether_arp *)(mtod(m, char *) + sizeof(llcx)); - bzero((caddr_t)ea, sizeof (*ea)); - ea->arp_hrd = htons(ARPHRD_IEEE802); - break; - case IFT_FDDI: - case IFT_ETHER: - /* - * This may not be correct for types not explicitly - * listed, but this is our best guess - */ - default: - m->m_len = sizeof(*ea); - m->m_pkthdr.len = sizeof(*ea); - MH_ALIGN(m, sizeof(*ea)); - ea = mtod(m, struct ether_arp *); - eh = (struct ether_header *)sa.sa_data; - bzero((caddr_t)ea, sizeof (*ea)); - /* if_output will not swap */ - eh->ether_type = htons(ETHERTYPE_ARP); - (void)memcpy(eh->ether_dhost, etherbroadcastaddr, - sizeof(eh->ether_dhost)); - ea->arp_hrd = htons(ARPHRD_ETHER); - break; - } - ea->arp_pro = htons(ETHERTYPE_IP); - ea->arp_hln = sizeof(ea->arp_sha); /* hardware address 
length */ - ea->arp_pln = sizeof(ea->arp_spa); /* protocol address length */ - ea->arp_op = htons(ARPOP_REQUEST); - (void)memcpy(ea->arp_sha, enaddr, sizeof(ea->arp_sha)); - (void)memcpy(ea->arp_spa, sip, sizeof(ea->arp_spa)); - (void)memcpy(ea->arp_tpa, tip, sizeof(ea->arp_tpa)); - sa.sa_family = AF_UNSPEC; - sa.sa_len = sizeof(sa); - dlil_output(ifptodlt(((struct ifnet *)ac), PF_INET), m, 0, &sa, 0); -} - -/* - * Resolve an IP address into an ethernet address. If success, - * desten is filled in. If there is no entry in arptab, - * set one up and broadcast a request for the IP address. - * Hold onto this mbuf and resend it once the address - * is finally resolved. A return value of 1 indicates - * that desten has been filled in and the packet should be sent - * normally; a 0 return indicates that the packet has been - * taken over here, either now or for later transmission. - */ -int -arpresolve(ac, rt, m, dst, desten, rt0) - register struct arpcom *ac; - register struct rtentry *rt; - struct mbuf *m; - register struct sockaddr *dst; - register u_char *desten; - struct rtentry *rt0; -{ - struct llinfo_arp *la = 0; - struct sockaddr_dl *sdl; - - if (m->m_flags & M_BCAST) { /* broadcast */ - (void)memcpy(desten, etherbroadcastaddr, sizeof(etherbroadcastaddr)); - return (1); - } - if (m->m_flags & M_MCAST) { /* multicast */ - ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); - return(1); - } - if (rt) - la = (struct llinfo_arp *)rt->rt_llinfo; - if (la == 0) { - la = arplookup(SIN(dst)->sin_addr.s_addr, 1, 0); - if (la) - rt = la->la_rt; - } - if (la == 0 || rt == 0) { - log(LOG_DEBUG, "arpresolve: can't allocate llinfo for %s%s%s\n", - inet_ntoa(SIN(dst)->sin_addr), la ? "la" : "", - rt ? "rt" : ""); - m_freem(m); - return (0); - } - sdl = SDL(rt->rt_gateway); - /* - * Check the address family and length is valid, the address - * is resolved; otherwise, try to resolve. - */ - if ((rt->rt_expire == 0 || rt->rt_expire > time_second) && - sdl->sdl_family == AF_LINK && sdl->sdl_alen != 0) { - bcopy(LLADDR(sdl), desten, sdl->sdl_alen); - return 1; - } - /* - * If ARP is disabled on this interface, stop. - * XXX - * Probably should not allocate empty llinfo struct if we are - * not going to be sending out an arp request. - */ - if (ac->ac_if.if_flags & IFF_NOARP) - return (0); - /* - * There is an arptab entry, but no ethernet address - * response yet. Replace the held mbuf with this - * latest one. - */ - if (la->la_hold) - m_freem(la->la_hold); - la->la_hold = m; - if (rt->rt_expire) { - rt->rt_flags &= ~RTF_REJECT; - if (la->la_asked == 0 || rt->rt_expire != time_second) { - rt->rt_expire = time_second; - if (la->la_asked++ < arp_maxtries) - arprequest(ac, - &SIN(rt->rt_ifa->ifa_addr)->sin_addr, - &SIN(dst)->sin_addr, ac->ac_enaddr); - else { - rt->rt_flags |= RTF_REJECT; - rt->rt_expire += arpt_down; - la->la_asked = 0; - } - - } - } - return (0); -} - -/* - * Common length and type checks are done here, - * then the protocol-specific routine is called. 
- */ -void -arpintr() -{ - register struct mbuf *m; - register struct arphdr *ar; - int s; - - while (arpintrq.ifq_head) { - s = splimp(); - IF_DEQUEUE(&arpintrq, m); - splx(s); - if (m == 0 || (m->m_flags & M_PKTHDR) == 0) - panic("arpintr"); - - if (m->m_len < sizeof(struct arphdr) && - ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) { - log(LOG_ERR, "arp: runt packet -- m_pullup failed\n"); - continue; - } - ar = mtod(m, struct arphdr *); - - if (ntohs(ar->ar_hrd) != ARPHRD_ETHER - && ntohs(ar->ar_hrd) != ARPHRD_IEEE802) { - log(LOG_ERR, - "arp: unknown hardware address format (0x%2D)\n", - (unsigned char *)&ar->ar_hrd, ""); - m_freem(m); - continue; - } - - if (m->m_pkthdr.len < sizeof(struct arphdr) + 2 * ar->ar_hln - + 2 * ar->ar_pln) { - log(LOG_ERR, "arp: runt packet\n"); - m_freem(m); - continue; - } - - switch (ntohs(ar->ar_pro)) { -#ifdef INET - case ETHERTYPE_IP: - in_arpinput(m); - continue; -#endif - } - m_freem(m); - } -} - -#if INET -/* - * ARP for Internet protocols on 10 Mb/s Ethernet. - * Algorithm is that given in RFC 826. - * In addition, a sanity check is performed on the sender - * protocol address, to catch impersonators. - * We no longer handle negotiations for use of trailer protocol: - * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent - * along with IP replies if we wanted trailers sent to us, - * and also sent them in response to IP replies. - * This allowed either end to announce the desire to receive - * trailer packets. - * We no longer reply to requests for ETHERTYPE_TRAIL protocol either, - * but formerly didn't normally send requests. - */ -static int log_arp_wrong_iface = 0; - -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW, - &log_arp_wrong_iface, 0, - "log arp packets arriving on the wrong interface"); - -static void -in_arpinput(m) - struct mbuf *m; -{ - register struct ether_arp *ea; - register struct arpcom *ac = (struct arpcom *)m->m_pkthdr.rcvif; - struct ether_header *eh; - struct iso88025_header *th = (struct iso88025_header *)0; - register struct llinfo_arp *la = 0; - register struct rtentry *rt; - struct in_ifaddr *ia, *maybe_ia = 0; - struct sockaddr_dl *sdl; - struct sockaddr sa; - struct in_addr isaddr, itaddr, myaddr; - int op, rif_len; - unsigned char buf[18]; - unsigned char buf2[18]; - - if (m->m_len < sizeof(struct ether_arp) && - (m = m_pullup(m, sizeof(struct ether_arp))) == NULL) { - log(LOG_ERR, "in_arp: runt packet -- m_pullup failed\n"); - return; - } - - ea = mtod(m, struct ether_arp *); - op = ntohs(ea->arp_op); - (void)memcpy(&isaddr, ea->arp_spa, sizeof (isaddr)); - (void)memcpy(&itaddr, ea->arp_tpa, sizeof (itaddr)); - -#if __APPLE__ - /* Don't respond to requests for 0.0.0.0 */ - if (itaddr.s_addr == 0 && op == ARPOP_REQUEST) { - m_freem(m); - return; - } -#endif - - for (ia = in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) { - /* - * For a bridge, we want to check the address irrespective - * of the receive interface. (This will change slightly - * when we have clusters of interfaces). - */ -#if BRIDGE -#define BRIDGE_TEST (do_bridge) -#else -#define BRIDGE_TEST (0) /* cc will optimise the test away */ -#endif - if ((BRIDGE_TEST) || (ia->ia_ifp == &ac->ac_if)) { - maybe_ia = ia; - if ((itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) || - (isaddr.s_addr == ia->ia_addr.sin_addr.s_addr)) { - break; - } - } - } - if (maybe_ia == 0) { - m_freem(m); - return; - } - myaddr = ia ? 
ia->ia_addr.sin_addr : maybe_ia->ia_addr.sin_addr; - if (!bcmp((caddr_t)ea->arp_sha, (caddr_t)ac->ac_enaddr, - sizeof (ea->arp_sha))) { - m_freem(m); /* it's from me, ignore it. */ - return; - } - if (!bcmp((caddr_t)ea->arp_sha, (caddr_t)etherbroadcastaddr, - sizeof (ea->arp_sha))) { - log(LOG_ERR, - "arp: ether address is broadcast for IP address %s!\n", - inet_ntoa(isaddr)); - m_freem(m); - return; - } - if (isaddr.s_addr == myaddr.s_addr) { - struct kev_msg ev_msg; - struct kev_in_collision *in_collision; - u_char storage[sizeof(struct kev_in_collision) + 6]; - in_collision = (struct kev_in_collision*)storage; - - log(LOG_ERR, - "duplicate IP address %s sent from ethernet address %s\n", - inet_ntoa(isaddr), ether_sprintf(buf, ea->arp_sha)); - - /* Send a kernel event so anyone can learn of the conflict */ - in_collision->link_data.if_family = ac->ac_if.if_family; - in_collision->link_data.if_unit = ac->ac_if.if_unit; - strncpy(&in_collision->link_data.if_name[0], ac->ac_if.if_name, IFNAMSIZ); - in_collision->ia_ipaddr = isaddr; - in_collision->hw_len = ETHER_ADDR_LEN; - bcopy((caddr_t)ea->arp_sha, (caddr_t)in_collision->hw_addr, sizeof(ea->arp_sha)); - ev_msg.vendor_code = KEV_VENDOR_APPLE; - ev_msg.kev_class = KEV_NETWORK_CLASS; - ev_msg.kev_subclass = KEV_INET_SUBCLASS; - ev_msg.event_code = KEV_INET_ARPCOLLISION; - ev_msg.dv[0].data_ptr = in_collision; - ev_msg.dv[0].data_length = sizeof(struct kev_in_collision) + 6; - ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); - - itaddr = myaddr; - goto reply; - } - la = arplookup(isaddr.s_addr, itaddr.s_addr == myaddr.s_addr, 0); - if (la && (rt = la->la_rt) && (sdl = SDL(rt->rt_gateway))) { - /* the following is not an error when doing bridging */ - if (!BRIDGE_TEST && rt->rt_ifp != &ac->ac_if) { - if (log_arp_wrong_iface) - log(LOG_ERR, "arp: %s is on %s%d but got reply from %s on %s%d\n", - inet_ntoa(isaddr), - rt->rt_ifp->if_name, rt->rt_ifp->if_unit, - ether_sprintf(buf, ea->arp_sha), - ac->ac_if.if_name, ac->ac_if.if_unit); - goto reply; - } - if (sdl->sdl_alen && - bcmp((caddr_t)ea->arp_sha, LLADDR(sdl), sdl->sdl_alen)) { - if (rt->rt_expire) - log(LOG_INFO, "arp: %s moved from %s to %s on %s%d\n", - inet_ntoa(isaddr), - ether_sprintf(buf, (u_char *)LLADDR(sdl)), - ether_sprintf(buf2, ea->arp_sha), - ac->ac_if.if_name, ac->ac_if.if_unit); - else { - log(LOG_ERR, - "arp: %s attempts to modify permanent entry for %s on %s%d", - ether_sprintf(buf, ea->arp_sha), inet_ntoa(isaddr), - ac->ac_if.if_name, ac->ac_if.if_unit); - goto reply; - } - } - (void)memcpy(LLADDR(sdl), ea->arp_sha, sizeof(ea->arp_sha)); - sdl->sdl_alen = sizeof(ea->arp_sha); -#ifndef __APPLE__ - /* TokenRing */ - sdl->sdl_rcf = (u_short)0; - /* - * If we receive an arp from a token-ring station over - * a token-ring nic then try to save the source - * routing info. 
- */ - if (ac->ac_if.if_type == IFT_ISO88025) { - th = (struct iso88025_header *)m->m_pkthdr.header; - rif_len = TR_RCF_RIFLEN(th->rcf); - if ((th->iso88025_shost[0] & TR_RII) && - (rif_len > 2)) { - sdl->sdl_rcf = th->rcf; - sdl->sdl_rcf ^= htons(TR_RCF_DIR); - memcpy(sdl->sdl_route, th->rd, rif_len - 2); - sdl->sdl_rcf &= ~htons(TR_RCF_BCST_MASK); - /* - * Set up source routing information for - * reply packet (XXX) - */ - m->m_data -= rif_len; - m->m_len += rif_len; - m->m_pkthdr.len += rif_len; - } else { - th->iso88025_shost[0] &= ~TR_RII; - } - m->m_data -= 8; - m->m_len += 8; - m->m_pkthdr.len += 8; - th->rcf = sdl->sdl_rcf; - } -#endif - if (rt->rt_expire) - rt->rt_expire = time_second + arpt_keep; - rt->rt_flags &= ~RTF_REJECT; - la->la_asked = 0; - if (la->la_hold) { - dlil_output(((struct ifnet *)ac)->if_data.default_proto, la->la_hold, rt, - rt_key(rt), 0); - la->la_hold = 0; - } - } -reply: - if (op != ARPOP_REQUEST) { - m_freem(m); - return; - } - if (itaddr.s_addr == myaddr.s_addr) { - /* I am the target */ - (void)memcpy(ea->arp_tha, ea->arp_sha, sizeof(ea->arp_sha)); - (void)memcpy(ea->arp_sha, ac->ac_enaddr, sizeof(ea->arp_sha)); - } else { - la = arplookup(itaddr.s_addr, 0, SIN_PROXY); - if (la == NULL) { - struct sockaddr_in sin; - - if (!arp_proxyall) { - m_freem(m); - return; - } - - bzero(&sin, sizeof sin); - sin.sin_family = AF_INET; - sin.sin_len = sizeof sin; - sin.sin_addr = itaddr; - - rt = rtalloc1((struct sockaddr *)&sin, 0, 0UL); - if (!rt) { - m_freem(m); - return; - } - /* - * Don't send proxies for nodes on the same interface - * as this one came out of, or we'll get into a fight - * over who claims what Ether address. - */ - if (rt->rt_ifp == &ac->ac_if) { - rtfree(rt); - m_freem(m); - return; - } - (void)memcpy(ea->arp_tha, ea->arp_sha, sizeof(ea->arp_sha)); - (void)memcpy(ea->arp_sha, ac->ac_enaddr, sizeof(ea->arp_sha)); - rtfree(rt); -#if DEBUG_PROXY - printf("arp: proxying for %s\n", - inet_ntoa(itaddr)); -#endif - } else { - rt = la->la_rt; - (void)memcpy(ea->arp_tha, ea->arp_sha, sizeof(ea->arp_sha)); - sdl = SDL(rt->rt_gateway); - (void)memcpy(ea->arp_sha, LLADDR(sdl), sizeof(ea->arp_sha)); - } - } - - (void)memcpy(ea->arp_tpa, ea->arp_spa, sizeof(ea->arp_spa)); - (void)memcpy(ea->arp_spa, &itaddr, sizeof(ea->arp_spa)); - ea->arp_op = htons(ARPOP_REPLY); - ea->arp_pro = htons(ETHERTYPE_IP); /* let's be sure! */ - switch (ac->ac_if.if_type) { - case IFT_ISO88025: - /* Re-arrange the source/dest address */ - memcpy(th->iso88025_dhost, th->iso88025_shost, - sizeof(th->iso88025_dhost)); - memcpy(th->iso88025_shost, ac->ac_enaddr, - sizeof(th->iso88025_shost)); - /* Set the source routing bit if neccesary */ - if (th->iso88025_dhost[0] & TR_RII) { - th->iso88025_dhost[0] &= ~TR_RII; - if (TR_RCF_RIFLEN(th->rcf) > 2) - th->iso88025_shost[0] |= TR_RII; - } - /* Copy the addresses, ac and fc into sa_data */ - memcpy(sa.sa_data, th->iso88025_dhost, - sizeof(th->iso88025_dhost) * 2); - sa.sa_data[(sizeof(th->iso88025_dhost) * 2)] = TR_AC; - sa.sa_data[(sizeof(th->iso88025_dhost) * 2) + 1] = TR_LLC_FRAME; - break; - case IFT_ETHER: - case IFT_FDDI: - /* - * May not be correct for types not explictly - * listed, but it is our best guess. 
- */ - default: - eh = (struct ether_header *)sa.sa_data; -#ifdef __APPLE__ - if (IN_LINKLOCAL(ntohl(*((u_int32_t*)ea->arp_spa)))) - (void)memcpy(eh->ether_dhost, etherbroadcastaddr, sizeof(eh->ether_dhost)); - else -#endif - (void)memcpy(eh->ether_dhost, ea->arp_tha, sizeof(eh->ether_dhost)); - eh->ether_type = htons(ETHERTYPE_ARP); - break; - } - sa.sa_family = AF_UNSPEC; - sa.sa_len = sizeof(sa); - dlil_output(((struct ifnet *)ac)->if_data.default_proto, m, 0, &sa, 0); - return; -} -#endif - -/* - * Free an arp entry. - */ -static void -arptfree(la) - register struct llinfo_arp *la; -{ - register struct rtentry *rt = la->la_rt; - register struct sockaddr_dl *sdl; - if (rt == 0) - panic("arptfree"); - if (rt->rt_refcnt > 0 && (sdl = SDL(rt->rt_gateway)) && - sdl->sdl_family == AF_LINK) { - sdl->sdl_alen = 0; - la->la_asked = 0; - rt->rt_flags &= ~RTF_REJECT; - return; - } - rtrequest(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, rt_mask(rt), - 0, (struct rtentry **)0); -} -/* - * Lookup or enter a new address in arptab. - */ -static struct llinfo_arp * -arplookup(addr, create, proxy) - u_long addr; - int create, proxy; -{ - register struct rtentry *rt; - static struct sockaddr_inarp sin = {sizeof(sin), AF_INET }; - const char *why = 0; - - sin.sin_addr.s_addr = addr; - sin.sin_other = proxy ? SIN_PROXY : 0; - rt = rtalloc1((struct sockaddr *)&sin, create, 0UL); - if (rt == 0) - return (0); - rtunref(rt); - - if (rt->rt_flags & RTF_GATEWAY) { - why = "host is not on local network"; - - /* If there are no references to this route, purge it */ - if (rt->rt_refcnt <= 0 && (rt->rt_flags & RTF_WASCLONED) != 0) { - rtrequest(RTM_DELETE, - (struct sockaddr *)rt_key(rt), - rt->rt_gateway, rt_mask(rt), - rt->rt_flags, 0); - } - } - else if ((rt->rt_flags & RTF_LLINFO) == 0) - why = "could not allocate llinfo"; - else if (rt->rt_gateway->sa_family != AF_LINK) - why = "gateway route is not ours"; - - if (why && create) { - log(LOG_DEBUG, "arplookup %s failed: %s\n", - inet_ntoa(sin.sin_addr), why); - return 0; - } else if (why) { - return 0; - } - return ((struct llinfo_arp *)rt->rt_llinfo); -} - -void -arp_ifinit(ac, ifa) - struct arpcom *ac; - struct ifaddr *ifa; -{ - if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) - arprequest(ac, &IA_SIN(ifa)->sin_addr, - &IA_SIN(ifa)->sin_addr, ac->ac_enaddr); - ifa->ifa_rtrequest = arp_rtrequest; - ifa->ifa_flags |= RTF_CLONING; -} diff --git a/bsd/netinet/if_ether.h b/bsd/netinet/if_ether.h index 0d9799b6b..fa65cfe8b 100644 --- a/bsd/netinet/if_ether.h +++ b/bsd/netinet/if_ether.h @@ -65,52 +65,6 @@ #include <net/if_arp.h> #define ea_byte ether_addr_octet -#ifdef __APPLE__ -#ifdef __APPLE_API_UNSTABLE -/* - * Macro for looking up the ether_multi record for a given range of Ethernet - * multicast addresses connected to a given arpcom structure. If no matching - * record is found, "enm" returns NULL. - */ -#define ETHER_LOOKUP_MULTI(addrlo, addrhi, ac, enm) \ - /* u_char addrlo[6]; */ \ - /* u_char addrhi[6]; */ \ - /* struct arpcom *ac; */ \ - /* struct ether_multi *enm; */ \ -{ \ - for ((enm) = (ac)->ac_multiaddrs; \ - (enm) != NULL && \ - (bcmp((enm)->enm_addrlo, (addrlo), 6) != 0 || \ - bcmp((enm)->enm_addrhi, (addrhi), 6) != 0); \ - (enm) = (enm)->enm_next); \ -} - -/* - * Macro to step through all of the ether_multi records, one at a time. - * The current position is remembered in "step", which the caller must - * provide. ETHER_FIRST_MULTI(), below, must be called to initialize "step" - * and get the first record. 
Both macros return a NULL "enm" when there - * are no remaining records. - */ -#define ETHER_NEXT_MULTI(step, enm) \ - /* struct ether_multistep step; */ \ - /* struct ether_multi *enm; */ \ -{ \ - if (((enm) = (step).e_enm) != NULL) \ - (step).e_enm = (enm)->enm_next; \ -} - -#define ETHER_FIRST_MULTI(step, ac, enm) \ - /* struct ether_multistep step; */ \ - /* struct arpcom *ac; */ \ - /* struct ether_multi *enm; */ \ -{ \ - (step).e_enm = (ac)->ac_multiaddrs; \ - ETHER_NEXT_MULTI((step), (enm)); \ -} -#endif /* __APPLE_API_UNSTABLE */ -#endif /* __APPLE__ */ - /* * Macro to map an IP multicast address to an Ethernet multicast address. * The high-order 25 bits of the Ethernet address are statically assigned, @@ -123,9 +77,9 @@ (enaddr)[0] = 0x01; \ (enaddr)[1] = 0x00; \ (enaddr)[2] = 0x5e; \ - (enaddr)[3] = ((u_char *)ipaddr)[1] & 0x7f; \ - (enaddr)[4] = ((u_char *)ipaddr)[2]; \ - (enaddr)[5] = ((u_char *)ipaddr)[3]; \ + (enaddr)[3] = ((const u_char *)ipaddr)[1] & 0x7f; \ + (enaddr)[4] = ((const u_char *)ipaddr)[2]; \ + (enaddr)[5] = ((const u_char *)ipaddr)[3]; \ } /* * Macro to map an IP6 multicast address to an Ethernet multicast address. @@ -138,10 +92,10 @@ { \ (enaddr)[0] = 0x33; \ (enaddr)[1] = 0x33; \ - (enaddr)[2] = ((u_char *)ip6addr)[12]; \ - (enaddr)[3] = ((u_char *)ip6addr)[13]; \ - (enaddr)[4] = ((u_char *)ip6addr)[14]; \ - (enaddr)[5] = ((u_char *)ip6addr)[15]; \ + (enaddr)[2] = ((const u_char *)ip6addr)[12]; \ + (enaddr)[3] = ((const u_char *)ip6addr)[13]; \ + (enaddr)[4] = ((const u_char *)ip6addr)[14]; \ + (enaddr)[5] = ((const u_char *)ip6addr)[15]; \ } /* @@ -180,18 +134,14 @@ struct sockaddr_inarp { #define RTF_USETRAILERS RTF_PROTO1 /* use trailers */ #define RTF_ANNOUNCE RTF_PROTO2 /* announce new arp entry */ -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE extern u_char ether_ipmulticast_min[ETHER_ADDR_LEN]; extern u_char ether_ipmulticast_max[ETHER_ADDR_LEN]; extern struct ifqueue arpintrq; -int arpresolve __P((struct arpcom *, struct rtentry *, struct mbuf *, - struct sockaddr *, u_char *, struct rtentry *)); -#endif /* __APPLE_API_PRIVATE */ -#ifdef __APPLE_API_UNSTABLE -void arp_ifinit __P((struct arpcom *, struct ifaddr *)); -#endif /* __APPLE_API_UNSTABLE */ -#endif +int arpresolve(struct ifnet *, struct rtentry *, struct mbuf *, + struct sockaddr *, u_char *, struct rtentry *); +void arp_ifinit(struct ifnet *, struct ifaddr *); +#endif KERNEL_PRIVATE -#endif +#endif _NETINET_IF_ETHER_H_ diff --git a/bsd/netinet/if_fddi.h b/bsd/netinet/if_fddi.h index dd33f7311..ed9071d4c 100644 --- a/bsd/netinet/if_fddi.h +++ b/bsd/netinet/if_fddi.h @@ -91,8 +91,7 @@ struct fddi_header { #define FDDIFC_LLC_SYNC 0xd0 #define FDDIFC_SMT 0x40 -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE #define fddibroadcastaddr etherbroadcastaddr #define fddi_ipmulticast_min ether_ipmulticast_min #define fddi_ipmulticast_max ether_ipmulticast_max @@ -100,11 +99,10 @@ struct fddi_header { #define fddi_delmulti ether_delmulti #define fddi_sprintf ether_sprintf -void fddi_ifattach __P((struct ifnet *)); -void fddi_input __P((struct ifnet *, struct fddi_header *, struct mbuf *)); -int fddi_output __P((struct ifnet *, - struct mbuf *, struct sockaddr *, struct rtentry *)); -#endif /* __APPLE_API_PRIVATE */ -#endif +void fddi_ifattach(struct ifnet *); +void fddi_input(struct ifnet *, struct fddi_header *, struct mbuf *); +int fddi_output(struct ifnet *, + struct mbuf *, struct sockaddr *, struct rtentry *); +#endif KERNEL_PRIVATE -#endif +#endif 
_NETINET_IF_FDDI_H_ diff --git a/bsd/netinet/if_tun.h b/bsd/netinet/if_tun.h index 6ffd6734b..9f748c7a3 100644 --- a/bsd/netinet/if_tun.h +++ b/bsd/netinet/if_tun.h @@ -39,7 +39,7 @@ #define _NET_IF_TUN_H_ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE struct tun_softc { u_short tun_flags; /* misc flags */ #define TUN_OPEN 0x0001 @@ -61,13 +61,14 @@ struct tun_softc { caddr_t tun_bpf; #endif }; -#endif /* __APPLE_API_PRIVATE */ -/* Maximum packet size */ -#define TUNMTU 1500 +#endif KERNEL_PRIVATE /* ioctl's for get/set debug */ #define TUNSDEBUG _IOW('t', 90, int) #define TUNGDEBUG _IOR('t', 89, int) +/* Maximum packet size */ +#define TUNMTU 1500 + #endif /* !_NET_IF_TUN_H_ */ diff --git a/bsd/netinet/igmp.c b/bsd/netinet/igmp.c index 1430acc47..0f427e7e3 100644 --- a/bsd/netinet/igmp.c +++ b/bsd/netinet/igmp.c @@ -94,7 +94,7 @@ static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); #endif static struct router_info * - find_rti __P((struct ifnet *ifp)); + find_rti(struct ifnet *ifp); static struct igmpstat igmpstat; @@ -107,7 +107,7 @@ static u_long igmp_all_rtrs_group; static struct mbuf *router_alert; static struct router_info *Head; -static void igmp_sendpkt __P((struct in_multi *, int, unsigned long)); +static void igmp_sendpkt(struct in_multi *, int, unsigned long); void igmp_init() @@ -138,10 +138,10 @@ igmp_init() } static struct router_info * -find_rti(ifp) - struct ifnet *ifp; +find_rti( + struct ifnet *ifp) { - register struct router_info *rti = Head; + struct router_info *rti = Head; #if IGMP_DEBUG @@ -173,17 +173,17 @@ find_rti(ifp) } void -igmp_input(m, iphlen) - register struct mbuf *m; - register int iphlen; +igmp_input( + struct mbuf *m, + int iphlen) { - register struct igmp *igmp; - register struct ip *ip; - register int igmplen; - register struct ifnet *ifp = m->m_pkthdr.rcvif; - register int minlen; - register struct in_multi *inm; - register struct in_ifaddr *ia; + struct igmp *igmp; + struct ip *ip; + int igmplen; + struct ifnet *ifp = m->m_pkthdr.rcvif; + int minlen; + struct in_multi *inm; + struct in_ifaddr *ia; struct in_multistep step; struct router_info *rti; @@ -293,6 +293,7 @@ igmp_input(m, iphlen) * - Use the value specified in the query message as * the maximum timeout. */ + lck_mtx_lock(rt_mtx); IN_FIRST_MULTI(step, inm); while (inm != NULL) { if (inm->inm_ifp == ifp && @@ -308,6 +309,7 @@ igmp_input(m, iphlen) } IN_NEXT_MULTI(step, inm); } + lck_mtx_unlock(rt_mtx); break; @@ -350,7 +352,9 @@ igmp_input(m, iphlen) * If we belong to the group being reported, stop * our timer for that group. 
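 * (This is IGMP report suppression, per RFC 1112: once any member
 * on the segment reports, the remaining members cancel their own
 * pending reports by zeroing inm_timer below.)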
*/ + ifnet_lock_shared(ifp); IN_LOOKUP_MULTI(igmp->igmp_group, ifp, inm); + ifnet_lock_done(ifp); if (inm != NULL) { inm->inm_timer = 0; @@ -373,7 +377,6 @@ int igmp_joingroup(inm) struct in_multi *inm; { - int s = splnet(); if (inm->inm_addr.s_addr == igmp_all_hosts_group || inm->inm_ifp->if_flags & IFF_LOOPBACK) { @@ -389,7 +392,6 @@ igmp_joingroup(inm) igmp_timers_are_running = 1; } return 0; - splx(s); } void @@ -406,9 +408,8 @@ igmp_leavegroup(inm) void igmp_fasttimo() { - register struct in_multi *inm; + struct in_multi *inm; struct in_multistep step; - int s; /* * Quick check to see if any work needs to be done, in order @@ -418,7 +419,6 @@ igmp_fasttimo() if (!igmp_timers_are_running) return; - s = splnet(); igmp_timers_are_running = 0; IN_FIRST_MULTI(step, inm); while (inm != NULL) { @@ -432,14 +432,12 @@ igmp_fasttimo() } IN_NEXT_MULTI(step, inm); } - splx(s); } void igmp_slowtimo() { - int s = splnet(); - register struct router_info *rti = Head; + struct router_info *rti = Head; #if IGMP_DEBUG printf("[igmp.c,_slowtimo] -- > entering \n"); @@ -456,7 +454,6 @@ igmp_slowtimo() #if IGMP_DEBUG printf("[igmp.c,_slowtimo] -- > exiting \n"); #endif - splx(s); } static struct route igmprt; diff --git a/bsd/netinet/igmp_var.h b/bsd/netinet/igmp_var.h index b7e99ed14..435197b7c 100644 --- a/bsd/netinet/igmp_var.h +++ b/bsd/netinet/igmp_var.h @@ -62,7 +62,6 @@ #define _NETINET_IGMP_VAR_H_ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_UNSTABLE /* * Internet Group Management Protocol (IGMP), @@ -85,8 +84,8 @@ struct igmpstat { u_int igps_snd_reports; /* sent membership reports */ }; +#ifdef KERNEL_PRIVATE #ifdef KERNEL -#ifdef __APPLE_API_PRIVATE #define IGMP_RANDOM_DELAY(X) (random() % (X) + 1) /* @@ -109,17 +108,17 @@ struct igmpstat { */ #define IGMP_AGE_THRESHOLD 540 -void igmp_init __P((void)); -void igmp_input __P((struct mbuf *, int)); -int igmp_joingroup __P((struct in_multi *)); -void igmp_leavegroup __P((struct in_multi *)); -void igmp_fasttimo __P((void)); -void igmp_slowtimo __P((void)); +void igmp_init(void); +void igmp_input(struct mbuf *, int); +int igmp_joingroup(struct in_multi *); +void igmp_leavegroup(struct in_multi *); +void igmp_fasttimo(void); +void igmp_slowtimo(void); SYSCTL_DECL(_net_inet_igmp); -#endif /* __APPLE_API_PRIVATE */ -#endif +#endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ /* * Names for IGMP sysctl objects @@ -127,11 +126,11 @@ SYSCTL_DECL(_net_inet_igmp); #define IGMPCTL_STATS 1 /* statistics (read-only) */ #define IGMPCTL_MAXID 2 +#ifdef KERNEL_PRIVATE #define IGMPCTL_NAMES { \ { 0, 0 }, \ { "stats", CTLTYPE_STRUCT }, \ } -#endif /* __APPLE_API_UNSTABLE */ +#endif /* KERNEL_PRIVATE */ #endif - diff --git a/bsd/netinet/in.c b/bsd/netinet/in.c index cbbf21679..6fa1e7ab8 100644 --- a/bsd/netinet/in.c +++ b/bsd/netinet/in.c @@ -65,6 +65,7 @@ #include <sys/kernel.h> #include <sys/sysctl.h> #include <sys/kern_event.h> +#include <sys/syslog.h> #include <net/if.h> #include <net/if_types.h> @@ -86,14 +87,14 @@ #include <sys/file.h> -static int in_mask2len __P((struct in_addr *)); -static void in_len2mask __P((struct in_addr *, int)); -static int in_lifaddr_ioctl __P((struct socket *, u_long, caddr_t, - struct ifnet *, struct proc *)); +static int in_mask2len(struct in_addr *); +static void in_len2mask(struct in_addr *, int); +static int in_lifaddr_ioctl(struct socket *, u_long, caddr_t, + struct ifnet *, struct proc *); -static void in_socktrim __P((struct sockaddr_in *)); -static int in_ifinit __P((struct ifnet *, - struct in_ifaddr *, struct sockaddr_in 
*, int)); +static void in_socktrim(struct sockaddr_in *); +static int in_ifinit(struct ifnet *, + struct in_ifaddr *, struct sockaddr_in *, int); static int subnetsarelocal = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW, @@ -101,7 +102,10 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW, struct in_multihead in_multihead; /* XXX BSS initialization */ -extern void arp_rtrequest(); +extern lck_mtx_t *rt_mtx; + +/* Track whether or not the SIOCARPIPLL ioctl has been called */ +__private_extern__ u_int32_t ipv4_ll_arp_aware = 0; /* * Return 1 if an internet address is for a ``local'' host @@ -113,19 +117,27 @@ int in_localaddr(in) struct in_addr in; { - register u_long i = ntohl(in.s_addr); - register struct in_ifaddr *ia; + u_long i = ntohl(in.s_addr); + struct in_ifaddr *ia; if (subnetsarelocal) { + lck_mtx_lock(rt_mtx); for (ia = in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) - if ((i & ia->ia_netmask) == ia->ia_net) + if ((i & ia->ia_netmask) == ia->ia_net) { + lck_mtx_unlock(rt_mtx); return (1); + } + lck_mtx_unlock(rt_mtx); } else { + lck_mtx_lock(rt_mtx); for (ia = in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) - if ((i & ia->ia_subnetmask) == ia->ia_subnet) + if ((i & ia->ia_subnetmask) == ia->ia_subnet) { + lck_mtx_unlock(rt_mtx); return (1); + } + lck_mtx_unlock(rt_mtx); } return (0); } @@ -139,8 +151,8 @@ int in_canforward(in) struct in_addr in; { - register u_long i = ntohl(in.s_addr); - register u_long net; + u_long i = ntohl(in.s_addr); + u_long net; if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i)) return (0); @@ -159,8 +171,8 @@ static void in_socktrim(ap) struct sockaddr_in *ap; { - register char *cplim = (char *) &ap->sin_addr; - register char *cp = (char *) (&ap->sin_addr + 1); + char *cplim = (char *) &ap->sin_addr; + char *cp = (char *) (&ap->sin_addr + 1); ap->sin_len = 0; while (--cp >= cplim) @@ -216,21 +228,21 @@ static int in_interfaces; /* number of external internet interfaces */ */ /* ARGSUSED */ int -in_control(so, cmd, data, ifp, p) - struct socket *so; - u_long cmd; - caddr_t data; - register struct ifnet *ifp; - struct proc *p; +in_control( + struct socket *so, + u_long cmd, + caddr_t data, + struct ifnet *ifp, + struct proc *p) { - register struct ifreq *ifr = (struct ifreq *)data; - register struct in_ifaddr *ia = 0, *iap; - register struct ifaddr *ifa; + struct ifreq *ifr = (struct ifreq *)data; + struct in_ifaddr *ia = 0, *iap; + struct ifaddr *ifa; struct in_ifaddr *oia; struct in_aliasreq *ifra = (struct in_aliasreq *)data; struct sockaddr_in oldaddr; - int error, hostIsNew, maskIsNew, s; - u_long i, dl_tag; + int error, hostIsNew, maskIsNew; + u_long i; struct kev_msg ev_msg; struct kev_in_data in_event_data; @@ -238,7 +250,7 @@ in_control(so, cmd, data, ifp, p) switch (cmd) { case SIOCALIFADDR: case SIOCDLIFADDR: - if (p && (error = suser(p->p_ucred, &p->p_acflag)) != 0) + if (p && (error = proc_suser(p)) != 0) return error; /*fall through*/ case SIOCGLIFADDR: @@ -253,7 +265,8 @@ in_control(so, cmd, data, ifp, p) * If an alias address was specified, find that one instead of * the first one on the interface. 
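 * (The walk below keeps the first address found on the interface
 * as a fallback and stops early on an exact match against the
 * address carried in the request.)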
*/ - if (ifp) + if (ifp) { + lck_mtx_lock(rt_mtx); for (iap = in_ifaddrhead.tqh_first; iap; iap = iap->ia_link.tqe_next) if (iap->ia_ifp == ifp) { @@ -267,10 +280,12 @@ in_control(so, cmd, data, ifp, p) break; } } - + lck_mtx_unlock(rt_mtx); + } switch (cmd) { case SIOCAUTOADDR: - if (p && (error = suser(p->p_ucred, &p->p_acflag)) != 0) + case SIOCARPIPLL: + if (p && (error = proc_suser(p)) != 0) return error; break; @@ -279,12 +294,14 @@ in_control(so, cmd, data, ifp, p) if (ifp == 0) return (EADDRNOTAVAIL); if (ifra->ifra_addr.sin_family == AF_INET) { + lck_mtx_lock(rt_mtx); for (oia = ia; ia; ia = ia->ia_link.tqe_next) { if (ia->ia_ifp == ifp && ia->ia_addr.sin_addr.s_addr == ifra->ifra_addr.sin_addr.s_addr) break; } + lck_mtx_unlock(rt_mtx); if ((ifp->if_flags & IFF_POINTOPOINT) && (cmd == SIOCAIFADDR) && (ifra->ifra_dstaddr.sin_addr.s_addr @@ -300,13 +317,8 @@ in_control(so, cmd, data, ifp, p) case SIOCSIFADDR: case SIOCSIFNETMASK: case SIOCSIFDSTADDR: -#ifdef __APPLE__ if ((so->so_state & SS_PRIV) == 0) return (EPERM); -#else - if (p && (error = suser(p)) != 0) - return error; -#endif if (ifp == 0) return (EADDRNOTAVAIL); @@ -322,29 +334,14 @@ in_control(so, cmd, data, ifp, p) * Protect from ipintr() traversing address list * while we're modifying it. */ - s = splnet(); - TAILQ_INSERT_TAIL(&in_ifaddrhead, ia, ia_link); ifa = &ia->ia_ifa; - TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); - -/* - * Temorary code for protocol attachment XXX - */ - - /* Generic protocol plumbing */ - - if (error = dlil_plumb_protocol(PF_INET, ifp, &dl_tag)) { - kprintf("in.c: warning can't plumb proto if=%s%n type %d error=%d\n", - ifp->if_name, ifp->if_unit, ifp->if_type, error); - error = 0; /*discard error, can be cold with unsupported interfaces */ - } -/* End of temp code */ ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; ia->ia_sockmask.sin_len = 8; + ifnet_lock_exclusive(ifp); if (ifp->if_flags & IFF_BROADCAST) { ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr); ia->ia_broadaddr.sin_family = AF_INET; @@ -352,13 +349,27 @@ in_control(so, cmd, data, ifp, p) ia->ia_ifp = ifp; if (!(ifp->if_flags & IFF_LOOPBACK)) in_interfaces++; - splx(s); + if_attach_ifa(ifp, ifa); + ifnet_lock_done(ifp); + + lck_mtx_lock(rt_mtx); + TAILQ_INSERT_TAIL(&in_ifaddrhead, ia, ia_link); + lck_mtx_unlock(rt_mtx); + + /* Generic protocol plumbing */ + + if (error = dlil_plumb_protocol(PF_INET, ifp)) { + kprintf("in.c: warning can't plumb proto if=%s%n type %d error=%d\n", + ifp->if_name, ifp->if_unit, ifp->if_type, error); + error = 0; /*discard error, can be cold with unsupported interfaces */ + } + } break; case SIOCPROTOATTACH: case SIOCPROTODETACH: - if (p && (error = suser(p->p_ucred, &p->p_acflag)) != 0) + if (p && (error = proc_suser(p)) != 0) return error; if (ifp == 0) return (EADDRNOTAVAIL); @@ -386,10 +397,24 @@ in_control(so, cmd, data, ifp, p) case SIOCAUTOADDR: if (ifp == 0) return (EADDRNOTAVAIL); - if (ifr->ifr_data) + ifnet_lock_exclusive(ifp); + if (ifr->ifr_intval) ifp->if_eflags |= IFEF_AUTOCONFIGURING; else ifp->if_eflags &= ~IFEF_AUTOCONFIGURING; + ifnet_lock_done(ifp); + break; + + case SIOCARPIPLL: + if (ifp == 0) + return (EADDRNOTAVAIL); + ipv4_ll_arp_aware = 1; + ifnet_lock_exclusive(ifp); + if (ifr->ifr_data) + ifp->if_eflags |= IFEF_ARPLL; + else + ifp->if_eflags &= ~IFEF_ARPLL; + ifnet_lock_done(ifp); break; case SIOCGIFADDR: @@ -504,16 +529,20 @@ in_control(so, cmd, data, ifp, p) 
(struct sockaddr_in *) &ifr->ifr_addr, 1)); case SIOCPROTOATTACH: - error = dlil_plumb_protocol(PF_INET, ifp, &dl_tag); + error = dlil_plumb_protocol(PF_INET, ifp); if (error) return(error); break; case SIOCPROTODETACH: // if an ip address is still present, refuse to detach - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) - if (ifa->ifa_addr->sa_family == AF_INET) - return EBUSY; + ifnet_lock_shared(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) + if (ifa->ifa_addr->sa_family == AF_INET) + break; + ifnet_lock_done(ifp); + if (ifa != 0) + return EBUSY; error = dlil_unplumb_protocol(PF_INET, ifp); if (error) @@ -567,7 +596,7 @@ in_control(so, cmd, data, ifp, p) hostIsNew = 0; } if (ifra->ifra_mask.sin_len) { - in_ifscrub(ifp, ia); + in_ifscrub(ifp, ia, 0); ia->ia_sockmask = ifra->ifra_mask; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); @@ -575,7 +604,7 @@ in_control(so, cmd, data, ifp, p) } if ((ifp->if_flags & IFF_POINTOPOINT) && (ifra->ifra_dstaddr.sin_family == AF_INET)) { - in_ifscrub(ifp, ia); + in_ifscrub(ifp, ia, 0); ia->ia_dstaddr = ifra->ifra_dstaddr; maskIsNew = 1; /* We lie; but the effect's the same */ } @@ -627,12 +656,13 @@ in_control(so, cmd, data, ifp, p) return (error); case SIOCDIFADDR: - error = dlil_ioctl(PF_INET, ifp, SIOCDIFADDR, (caddr_t)ia); - if (error == EOPNOTSUPP) - error = 0; + error = dlil_ioctl(PF_INET, ifp, SIOCDIFADDR, (caddr_t)ia); + if (error == EOPNOTSUPP) + error = 0; if (error) return error; + /* Fill out the kernel event information */ ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_INET_SUBCLASS; @@ -656,47 +686,23 @@ in_control(so, cmd, data, ifp, p) in_event_data.link_data.if_unit = (unsigned long) ifp->if_unit; ev_msg.dv[0].data_ptr = &in_event_data; - ev_msg.dv[0].data_length = sizeof(struct kev_in_data); + ev_msg.dv[0].data_length = sizeof(struct kev_in_data); ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); - + lck_mtx_lock(rt_mtx); + TAILQ_REMOVE(&in_ifaddrhead, ia, ia_link); /* * in_ifscrub kills the interface route. */ - in_ifscrub(ifp, ia); -#ifndef __APPLE__ - /* - * in_ifadown gets rid of all the rest of - * the routes. This is not quite the right - * thing to do, but at least if we are running - * a routing process they will come back. - */ - in_ifadown(&ia->ia_ifa, 1); - /* - * XXX horrible hack to detect that we are being called - * from if_detach() - */ - if (!ifnet_addrs[ifp->if_index - 1]) { - in_pcbpurgeif0(LIST_FIRST(ripcbinfo.listhead), ifp); - in_pcbpurgeif0(LIST_FIRST(udbinfo.listhead), ifp); - } -#endif - - /* - * Protect from ipintr() traversing address list - * while we're modifying it. - */ - s = splnet(); - + in_ifscrub(ifp, ia, 1); ifa = &ia->ia_ifa; - TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); - oia = ia; - TAILQ_REMOVE(&in_ifaddrhead, oia, ia_link); - ifafree(&oia->ia_ifa); + lck_mtx_unlock(rt_mtx); + ifnet_lock_exclusive(ifp); + if_detach_ifa(ifp, ifa); + ifafree(&ia->ia_ifa); #ifdef __APPLE__ - /* + /* * If the interface supports multicast, and no address is left, * remove the "all hosts" multicast group from that interface. 
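 * (in_ifinit() joins INADDR_ALLHOSTS_GROUP, 224.0.0.1, when the
 * first address is configured; dropping it here keeps that
 * membership from leaking once the last IPv4 address goes away.)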
*/ @@ -711,12 +717,16 @@ in_control(so, cmd, data, ifp, p) if (ifa == 0) { addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); IN_LOOKUP_MULTI(addr, ifp, inm); - if (inm) - in_delmulti(inm); } - } + ifnet_lock_done(ifp); + if (inm) + in_delmulti(&inm); + } else + ifnet_lock_done(ifp); #endif - splx(s); + + /* Post the kernel event */ + kev_post_msg(&ev_msg); break; #ifdef __APPLE__ @@ -726,27 +736,23 @@ in_control(so, cmd, data, ifp, p) * Special ioctl for OpenTransport sockets */ struct inpcb *inp, *cloned_inp; - int error = 0; + int error2 = 0; int cloned_fd = *(int *)data; - s = splnet(); /* XXX */ inp = sotoinpcb(so); if (inp == NULL) { - splx(s); break; } /* let's make sure it's either -1 or a valid file descriptor */ if (cloned_fd != -1) { struct socket *cloned_so; - struct file *cloned_fp; - error = getsock(p->p_fd, cloned_fd, &cloned_fp); - if (error){ - splx(s); + error2 = file_socket(cloned_fd, &cloned_so); + if (error2){ break; } - cloned_so = (struct socket *)cloned_fp->f_data; cloned_inp = sotoinpcb(cloned_so); + file_drop(cloned_fd); } else { cloned_inp = NULL; } @@ -768,7 +774,7 @@ in_control(so, cmd, data, ifp, p) /* Multicast options */ if (cloned_inp->inp_moptions != NULL) { - int i; + int i; struct ip_moptions *cloned_imo = cloned_inp->inp_moptions; struct ip_moptions *imo = inp->inp_moptions; @@ -777,14 +783,12 @@ in_control(so, cmd, data, ifp, p) * No multicast option buffer attached to the pcb; * allocate one. */ - splx(); imo = (struct ip_moptions*) _MALLOC(sizeof(*imo), M_IPMOPTS, M_WAITOK); if (imo == NULL) { - error = ENOBUFS; + error2 = ENOBUFS; break; } - s = splnet(); /* XXX */ inp->inp_moptions = imo; } imo->imo_multicast_ifp = cloned_imo->imo_multicast_ifp; @@ -797,29 +801,25 @@ in_control(so, cmd, data, ifp, p) in_addmulti(&cloned_imo->imo_membership[i]->inm_addr, cloned_imo->imo_membership[i]->inm_ifp); if (imo->imo_membership[i] == NULL) { - error = ENOBUFS; + error2 = ENOBUFS; break; } } if (i < cloned_imo->imo_num_memberships) { /* Failed, perform cleanup */ for (i--; i >= 0; i--) - in_delmulti(imo->imo_membership[i]); + in_delmulti(&imo->imo_membership[i]); + imo->imo_num_memberships = 0; break; } } } - splx(s); break; } #endif /* __APPLE__ */ default: return EOPNOTSUPP; - /* Darwin: dlil_ioctl called from ifioctl */ -#ifndef __APPLE__ - return ((*ifp->if_ioctl)(ifp, cmd, data)); -#endif } return (0); } @@ -841,12 +841,12 @@ in_control(so, cmd, data, ifp, p) * other values may be returned from in_ioctl() */ static int -in_lifaddr_ioctl(so, cmd, data, ifp, p) - struct socket *so; - u_long cmd; - caddr_t data; - struct ifnet *ifp; - struct proc *p; +in_lifaddr_ioctl( + struct socket *so, + u_long cmd, + caddr_t data, + struct ifnet *ifp, + struct proc *p) { struct if_laddrreq *iflr = (struct if_laddrreq *)data; struct ifaddr *ifa; @@ -946,6 +946,7 @@ in_lifaddr_ioctl(so, cmd, data, ifp, p) } } + ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; @@ -956,6 +957,7 @@ in_lifaddr_ioctl(so, cmd, data, ifp, p) if (candidate.s_addr == match.s_addr) break; } + ifnet_lock_done(ifp); if (!ifa) return EADDRNOTAVAIL; ia = (struct in_ifaddr *)ifa; @@ -1006,18 +1008,23 @@ in_lifaddr_ioctl(so, cmd, data, ifp, p) * Delete any existing route for an interface. 
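 * (The new "locked" argument indicates that the caller already
 * holds rt_mtx, in which case the rtinit_locked() calls below are
 * made without retaking the lock.)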
*/ void -in_ifscrub(ifp, ia) - register struct ifnet *ifp; - register struct in_ifaddr *ia; +in_ifscrub( + struct ifnet *ifp, + struct in_ifaddr *ia, + int locked) { if ((ia->ia_flags & IFA_ROUTE) == 0) return; + if (!locked) + lck_mtx_lock(rt_mtx); if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) - rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); + rtinit_locked(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); else - rtinit(&(ia->ia_ifa), (int)RTM_DELETE, 0); + rtinit_locked(&(ia->ia_ifa), (int)RTM_DELETE, 0); ia->ia_flags &= ~IFA_ROUTE; + if (!locked) + lck_mtx_unlock(rt_mtx); } /* @@ -1025,15 +1032,15 @@ in_ifscrub(ifp, ia) * and routing table entry. */ static int -in_ifinit(ifp, ia, sin, scrub) - register struct ifnet *ifp; - register struct in_ifaddr *ia; - struct sockaddr_in *sin; - int scrub; +in_ifinit( + struct ifnet *ifp, + struct in_ifaddr *ia, + struct sockaddr_in *sin, + int scrub) { - register u_long i = ntohl(sin->sin_addr.s_addr); + u_long i = ntohl(sin->sin_addr.s_addr); struct sockaddr_in oldaddr; - int s = splimp(), flags = RTF_UP, error; + int flags = RTF_UP, error; oldaddr = ia->ia_addr; ia->ia_addr = *sin; @@ -1047,14 +1054,12 @@ in_ifinit(ifp, ia, sin, scrub) if (error == EOPNOTSUPP) error = 0; if (error) { - splx(s); ia->ia_addr = oldaddr; return (error); } - splx(s); if (scrub) { ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr; - in_ifscrub(ifp, ia); + in_ifscrub(ifp, ia, 0); ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; } if (IN_CLASSA(i)) @@ -1108,7 +1113,9 @@ in_ifinit(ifp, ia, sin, scrub) struct in_addr addr; addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); + ifnet_lock_shared(ifp); IN_LOOKUP_MULTI(addr, ifp, inm); + ifnet_lock_done(ifp); if (inm == 0) in_addmulti(&addr, ifp); } @@ -1120,11 +1127,11 @@ in_ifinit(ifp, ia, sin, scrub) * Return 1 if the address might be a local broadcast address. */ int -in_broadcast(in, ifp) - struct in_addr in; - struct ifnet *ifp; +in_broadcast( + struct in_addr in, + struct ifnet *ifp) { - register struct ifaddr *ifa; + struct ifaddr *ifa; u_long t; if (in.s_addr == INADDR_BROADCAST || @@ -1138,10 +1145,12 @@ in_broadcast(in, ifp) * with a broadcast address. */ #define ia ((struct in_ifaddr *)ifa) - for (ifa = ifp->if_addrhead.tqh_first; ifa; - ifa = ifa->ifa_link.tqe_next) { - if (ifa->ifa_addr == NULL) + ifnet_lock_shared(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr == NULL) { + ifnet_lock_done(ifp); return (0); + } if (ifa->ifa_addr->sa_family == AF_INET && (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || in.s_addr == ia->ia_netbroadcast.s_addr || @@ -1154,25 +1163,45 @@ in_broadcast(in, ifp) * only exist when an interface gets a secondary * address. */ - ia->ia_subnetmask != (u_long)0xffffffff) - return 1; + ia->ia_subnetmask != (u_long)0xffffffff) { + ifnet_lock_done(ifp); + return 1; + } } + ifnet_lock_done(ifp); return (0); #undef ia } + +static void +in_free_inm( + void* ifma_protospec) +{ + struct in_multi *inm = ifma_protospec; + + /* + * No remaining claims to this record; let IGMP know that + * we are leaving the multicast group. + */ + igmp_leavegroup(inm); + lck_mtx_lock(rt_mtx); + LIST_REMOVE(inm, inm_link); + lck_mtx_unlock(rt_mtx); + FREE(inm, M_IPMADDR); +} + /* * Add an address to the list of IP multicast addresses for a given interface. 
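 * (Only the first join allocates an in_multi: the record is
 * published through ifma->ifma_protospec under rt_mtx, and later
 * joins simply take another reference on the ifmultiaddr.)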
*/ struct in_multi * -in_addmulti(ap, ifp) - register struct in_addr *ap; - register struct ifnet *ifp; +in_addmulti( + struct in_addr *ap, + struct ifnet *ifp) { - register struct in_multi *inm; + struct in_multi *inm; int error; struct sockaddr_in sin; struct ifmultiaddr *ifma; - int s = splnet(); /* * Call generic routine to add membership or increment @@ -1185,7 +1214,6 @@ in_addmulti(ap, ifp) sin.sin_addr = *ap; error = if_addmulti(ifp, (struct sockaddr *)&sin, &ifma); if (error) { - splx(s); return 0; } @@ -1194,13 +1222,11 @@ in_addmulti(ap, ifp) * a new record. Otherwise, we are done. */ if (ifma->ifma_protospec != 0) { - splx(s); return ifma->ifma_protospec; } inm = (struct in_multi *) _MALLOC(sizeof(*inm), M_IPMADDR, M_WAITOK); if (inm == NULL) { - splx(s); return (NULL); } @@ -1208,20 +1234,37 @@ in_addmulti(ap, ifp) inm->inm_addr = *ap; inm->inm_ifp = ifp; inm->inm_ifma = ifma; - ifma->ifma_protospec = inm; - LIST_INSERT_HEAD(&in_multihead, inm, inm_link); + lck_mtx_lock(rt_mtx); + if (ifma->ifma_protospec == NULL) { + ifma->ifma_protospec = inm; + ifma->ifma_free = in_free_inm; + LIST_INSERT_HEAD(&in_multihead, inm, inm_link); + } + lck_mtx_unlock(rt_mtx); + + if (ifma->ifma_protospec != inm) { + _FREE(inm, M_IPMADDR); + return ifma->ifma_protospec; + } /* * Let IGMP know that we have joined a new IP multicast group. */ error = igmp_joingroup(inm); if (error) { - if_delmultiaddr(ifma); - LIST_REMOVE(inm, inm_link); - _FREE(inm, M_IPMADDR); - inm = NULL; + char addrbuf[16]; + + /* + * We can't free the inm because someone else may already be + * using it. Once we put it in to ifma->ifma_protospec, it + * must exist as long as the ifma does. Might be nice to flag + * the error so we can try igmp_joingroup the next time through. + */ + log(LOG_ERR, "igmp_joingroup error %d joining multicast %s on %s%d\n", + error, inet_ntop(AF_INET, &sin.sin_addr, addrbuf, sizeof(addrbuf)), + ifp->if_name, ifp->if_unit); } - splx(s); + return (inm); } @@ -1229,26 +1272,49 @@ in_addmulti(ap, ifp) * Delete a multicast address record. */ void -in_delmulti(inm) - register struct in_multi *inm; +in_delmulti( + struct in_multi **inm) { - struct ifmultiaddr *ifma = inm->inm_ifma; - int s = splnet(); + struct in_multi *inm2; + + lck_mtx_lock(rt_mtx); + LIST_FOREACH(inm2, &in_multihead, inm_link) { + if (inm2 == *inm) + break; + } + if (inm2 != *inm) { + lck_mtx_unlock(rt_mtx); + printf("in_delmulti - ignorning invalid inm (0x%x)\n", *inm); + return; + } + lck_mtx_unlock(rt_mtx); /* We intentionally do this a bit differently than BSD */ - - if (ifma && ifma->ifma_refcount == 1) { - /* - * No remaining claims to this record; let IGMP know that - * we are leaving the multicast group. - */ - igmp_leavegroup(inm); - ifma->ifma_protospec = 0; - LIST_REMOVE(inm, inm_link); - FREE(inm, M_IPMADDR); + if ((*inm)->inm_ifma) { + if_delmultiaddr((*inm)->inm_ifma, 0); + ifma_release((*inm)->inm_ifma); } - /* XXX - should be separate API for when we have an ifma? 
*/ - if (ifma) - if_delmultiaddr(ifma); - splx(s); + *inm = NULL; } + +#if !NFSCLIENT +int +inet_aton(char * cp, struct in_addr * pin) +{ + u_char * b = (char *)pin; + int i; + char * p; + + for (p = cp, i = 0; i < 4; i++) { + u_long l = strtoul(p, 0, 0); + if (l > 255) + return (FALSE); + b[i] = l; + p = strchr(p, '.'); + if (i < 3 && p == NULL) + return (FALSE); + p++; + } + return (TRUE); +} +#endif diff --git a/bsd/netinet/in.h b/bsd/netinet/in.h index c8e73f38b..695e07bff 100644 --- a/bsd/netinet/in.h +++ b/bsd/netinet/in.h @@ -58,6 +58,30 @@ #ifndef _NETINET_IN_H_ #define _NETINET_IN_H_ #include <sys/appleapiopts.h> +#include <sys/_types.h> + +#ifndef _IN_ADDR_T +#define _IN_ADDR_T +typedef __uint32_t in_addr_t; /* base type for internet address */ +#endif + +#ifndef _IN_PORT_T +#define _IN_PORT_T +typedef __uint16_t in_port_t; +#endif + +/* + * POSIX 1003.1-2003 + * "Inclusion of the <netinet/in.h> header may also make visible all + * symbols from <inttypes.h> and <sys/socket.h>". + */ +#include <sys/socket.h> + +/* + * The following two #includes insure htonl and family are defined + */ +#include <machine/endian.h> +#include <sys/_endian.h> /* * Constants and structures defined by the internet system, @@ -68,13 +92,18 @@ * Protocols (RFC 1700) */ #define IPPROTO_IP 0 /* dummy for IP */ +#ifndef _POSIX_C_SOURCE #define IPPROTO_HOPOPTS 0 /* IP6 hop-by-hop options */ +#endif /* !_POSIX_C_SOURCE */ #define IPPROTO_ICMP 1 /* control message protocol */ +#ifndef _POSIX_C_SOURCE #define IPPROTO_IGMP 2 /* group mgmt protocol */ #define IPPROTO_GGP 3 /* gateway^2 (deprecated) */ #define IPPROTO_IPV4 4 /* IPv4 encapsulation */ #define IPPROTO_IPIP IPPROTO_IPV4 /* for compatibility */ +#endif /* !_POSIX_C_SOURCE */ #define IPPROTO_TCP 6 /* tcp */ +#ifndef _POSIX_C_SOURCE #define IPPROTO_ST 7 /* Stream protocol II */ #define IPPROTO_EGP 8 /* exterior gateway protocol */ #define IPPROTO_PIGP 9 /* private interior gateway */ @@ -85,7 +114,9 @@ #define IPPROTO_EMCON 14 /* EMCON */ #define IPPROTO_XNET 15 /* Cross Net Debugger */ #define IPPROTO_CHAOS 16 /* Chaos*/ +#endif /* !_POSIX_C_SOURCE */ #define IPPROTO_UDP 17 /* user datagram protocol */ +#ifndef _POSIX_C_SOURCE #define IPPROTO_MUX 18 /* Multiplexing */ #define IPPROTO_MEAS 19 /* DCN Measurement Subsystems */ #define IPPROTO_HMP 20 /* Host Monitoring */ @@ -109,7 +140,9 @@ #define IPPROTO_CMTP 38 /* Control Message Transport */ #define IPPROTO_TPXX 39 /* TP++ Transport */ #define IPPROTO_IL 40 /* IL transport protocol */ +#endif /* !_POSIX_C_SOURCE */ #define IPPROTO_IPV6 41 /* IP6 header */ +#ifndef _POSIX_C_SOURCE #define IPPROTO_SDRP 42 /* Source Demand Routing */ #define IPPROTO_ROUTING 43 /* IP6 routing header */ #define IPPROTO_FRAGMENT 44 /* IP6 fragmentation header */ @@ -174,11 +207,15 @@ /* 255: Reserved */ /* BSD Private, local use, namespace incursion */ #define IPPROTO_DIVERT 254 /* divert pseudo-protocol */ +#endif /* !_POSIX_C_SOURCE */ #define IPPROTO_RAW 255 /* raw IP packet */ + +#ifndef _POSIX_C_SOURCE #define IPPROTO_MAX 256 /* last return value of *_input(), meaning "all job for this pkt is done". */ #define IPPROTO_DONE 257 +#endif /* _POSIX_C_SOURCE */ /* * Local port number conventions: @@ -225,13 +262,18 @@ * */ +#define __DARWIN_IPPORT_RESERVED 1024 + +#ifndef _POSIX_C_SOURCE /* * Ports < IPPORT_RESERVED are reserved for * privileged processes (e.g. root). (IP_PORTRANGE_LOW) * Ports > IPPORT_USERRESERVED are reserved * for servers, not necessarily privileged. 
(IP_PORTRANGE_DEFAULT) */ -#define IPPORT_RESERVED 1024 +#ifndef IPPORT_RESERVED +#define IPPORT_RESERVED __DARWIN_IPPORT_RESERVED +#endif #define IPPORT_USERRESERVED 5000 /* @@ -247,6 +289,7 @@ * have a fit if we use. */ #define IPPORT_RESERVEDSTART 600 +#endif /* !_POSIX_C_SOURCE */ /* * Internet address (a structure for historical reasons) @@ -260,6 +303,10 @@ struct in_addr { * On subnets, the decomposition of addresses to host and net parts * is done according to subnet mask, not the masks here. */ +#define INADDR_ANY (u_int32_t)0x00000000 +#define INADDR_BROADCAST (u_int32_t)0xffffffff /* must be masked */ + +#ifndef _POSIX_C_SOURCE #define IN_CLASSA(i) (((u_int32_t)(i) & 0x80000000) == 0) #define IN_CLASSA_NET 0xff000000 #define IN_CLASSA_NSHIFT 24 @@ -286,9 +333,7 @@ struct in_addr { #define IN_EXPERIMENTAL(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000) #define IN_BADCLASS(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000) -#define INADDR_ANY (u_int32_t)0x00000000 #define INADDR_LOOPBACK (u_int32_t)0x7f000001 -#define INADDR_BROADCAST (u_int32_t)0xffffffff /* must be masked */ #ifndef KERNEL #define INADDR_NONE 0xffffffff /* -1 return */ #endif @@ -304,20 +349,22 @@ struct in_addr { #endif #define IN_LOOPBACKNET 127 /* official! */ +#endif /* !_POSIX_C_SOURCE */ /* * Socket address, internet style. */ struct sockaddr_in { - u_char sin_len; - u_char sin_family; - u_short sin_port; + __uint8_t sin_len; + sa_family_t sin_family; + in_port_t sin_port; struct in_addr sin_addr; - char sin_zero[8]; + char sin_zero[8]; /* XXX bwg2001-004 */ }; #define INET_ADDRSTRLEN 16 +#ifndef _POSIX_C_SOURCE /* * Structure used to describe IP options. * Used to store options internally, to pass them to a process, @@ -407,6 +454,7 @@ struct ip_mreq { #define IP_PORTRANGE_HIGH 1 /* "high" - request firewall bypass */ #define IP_PORTRANGE_LOW 2 /* "low" - vouchsafe security */ + /* * Definitions for inet sysctl operations. 
* @@ -415,6 +463,8 @@ struct ip_mreq { */ #define IPPROTO_MAXID (IPPROTO_AH + 1) /* don't list to IPPROTO_MAX */ +#ifdef KERNEL_PRIVATE + #define CTL_IPPROTO_NAMES { \ { "ip", CTLTYPE_NODE }, \ { "icmp", CTLTYPE_NODE }, \ @@ -470,6 +520,8 @@ struct ip_mreq { { "ipsec", CTLTYPE_NODE }, \ } +#endif /* KERNEL_PRIVATE */ + /* * Names for IP sysctl objects */ @@ -493,6 +545,8 @@ struct ip_mreq { #define IPCTL_GIF_TTL 16 /* default TTL for gif encap packet */ #define IPCTL_MAXID 17 +#ifdef KERNEL_PRIVATE + #define IPCTL_NAMES { \ { 0, 0 }, \ { "forwarding", CTLTYPE_INT }, \ @@ -512,6 +566,10 @@ struct ip_mreq { { "keepfaith", CTLTYPE_INT }, \ { "gifttl", CTLTYPE_INT }, \ } +#endif /* KERNEL_PRIVATE */ + +#endif /* !_POSIX_C_SOURCE */ + /* INET6 stuff */ #define __KAME_NETINET_IN_H_INCLUDED_ @@ -519,19 +577,22 @@ struct ip_mreq { #undef __KAME_NETINET_IN_H_INCLUDED_ #ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE struct ifnet; struct mbuf; /* forward declarations for Standard C */ -int in_broadcast __P((struct in_addr, struct ifnet *)); -int in_canforward __P((struct in_addr)); -int in_cksum __P((struct mbuf *, int)); -int in_cksum_skip __P((struct mbuf *, u_short, u_short)); -u_short in_addword __P((u_short, u_short)); -u_short in_pseudo __P((u_int, u_int, u_int)); -int in_localaddr __P((struct in_addr)); -char *inet_ntoa __P((struct in_addr)); /* in libkern */ -u_long in_netof __P((struct in_addr)); -#endif /* __APPLE_API_PRIVATE */ +int in_broadcast(struct in_addr, struct ifnet *); +int in_canforward(struct in_addr); +int in_cksum(struct mbuf *, int); +int in_cksum_skip(struct mbuf *, u_short, u_short); +u_short in_addword(u_short, u_short); +u_short in_pseudo(u_int, u_int, u_int); +int in_localaddr(struct in_addr); +u_long in_netof(struct in_addr); +#endif /* KERNEL_PRIVATE */ +#define MAX_IPv4_STR_LEN 16 +#define MAX_IPv6_STR_LEN 64 + +const char *inet_ntop(int, const void *, char *, size_t); /* in libkern */ #endif /* KERNEL */ -#endif +#endif _NETINET_IN_H_ diff --git a/bsd/netinet/in_arp.c b/bsd/netinet/in_arp.c new file mode 100644 index 000000000..44c526395 --- /dev/null +++ b/bsd/netinet/in_arp.c @@ -0,0 +1,876 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 1982, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include <kern/debug.h> +#include <netinet/in_arp.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/kernel_types.h> +#include <sys/syslog.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/kernel.h> +#include <sys/mbuf.h> +#include <sys/sysctl.h> +#include <string.h> +#include <net/if_arp.h> +#include <net/if_dl.h> +#include <net/dlil.h> +#include <net/route.h> +#include <netinet/if_ether.h> +#include <netinet/in_var.h> + +#define SIN(s) ((struct sockaddr_in *)s) +#define CONST_LLADDR(s) ((const u_char*)((s)->sdl_data + (s)->sdl_nlen)) +#define rt_expire rt_rmx.rmx_expire + +static const size_t MAX_HW_LEN = 10; + +SYSCTL_DECL(_net_link_ether); +SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); + +/* timer values */ +static int arpt_prune = (5*60*1); /* walk list every 5 minutes */ +static int arpt_keep = (20*60); /* once resolved, good for 20 more minutes */ +static int arpt_down = 20; /* once declared down, don't send for 20 sec */ + +/* Apple Hardware SUM16 checksuming */ +int apple_hwcksum_tx = 1; +int apple_hwcksum_rx = 1; + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, prune_intvl, CTLFLAG_RW, + &arpt_prune, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW, + &arpt_keep, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, host_down_time, CTLFLAG_RW, + &arpt_down, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, apple_hwcksum_tx, CTLFLAG_RW, + &apple_hwcksum_tx, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, apple_hwcksum_rx, CTLFLAG_RW, + &apple_hwcksum_rx, 0, ""); + +struct llinfo_arp { + LIST_ENTRY(llinfo_arp) la_le; + struct rtentry *la_rt; + struct mbuf *la_hold; /* last packet until resolved/timeout */ + long la_asked; /* last time we QUERIED for this addr */ +}; + +static LIST_HEAD(, llinfo_arp) llinfo_arp; + +static int arp_inuse, arp_allocated; + +static int arp_maxtries = 5; +static int useloopback = 1; /* use loopback interface for local traffic */ +static int arp_proxyall = 0; + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW, + &arp_maxtries, 0, ""); 
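
The ARP tunables declared in this block are exported to user space under the net.link.ether.inet sysctl node set up above. A minimal user-space sketch of reading two of them (the MIB names follow from the SYSCTL_NODE/SYSCTL_INT declarations; the program itself is illustrative, not part of the patch):

#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>

int
main(void)
{
	int max_age, maxtries;
	size_t len;

	/* Seconds a resolved entry stays valid (arpt_keep above). */
	len = sizeof(max_age);
	if (sysctlbyname("net.link.ether.inet.max_age",
	    &max_age, &len, NULL, 0) == -1) {
		perror("net.link.ether.inet.max_age");
		return 1;
	}

	/* Unanswered requests before the entry is marked RTF_REJECT. */
	len = sizeof(maxtries);
	if (sysctlbyname("net.link.ether.inet.maxtries",
	    &maxtries, &len, NULL, 0) == -1) {
		perror("net.link.ether.inet.maxtries");
		return 1;
	}

	printf("ARP: entries live %d s, up to %d tries\n",
	    max_age, maxtries);
	return 0;
}
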
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW,
+    &useloopback, 0, "");
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW,
+    &arp_proxyall, 0, "");
+
+static int log_arp_warnings = 0;
+
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_warnings, CTLFLAG_RW,
+    &log_arp_warnings, 0,
+    "log arp warning messages");
+
+extern u_int32_t ipv4_ll_arp_aware;
+
+/*
+ * Free an arp entry.
+ */
+static void
+arptfree(
+	struct llinfo_arp *la)
+{
+	struct rtentry *rt = la->la_rt;
+	struct sockaddr_dl *sdl;
+	lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED);
+	if (rt == 0)
+		panic("arptfree");
+	if (rt->rt_refcnt > 0 && (sdl = SDL(rt->rt_gateway)) &&
+	    sdl->sdl_family == AF_LINK) {
+		sdl->sdl_alen = 0;
+		la->la_asked = 0;
+		rt->rt_flags &= ~RTF_REJECT;
+		return;
+	}
+	rtrequest_locked(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, rt_mask(rt),
+	    0, (struct rtentry **)0);
+}
+
+/*
+ * Timeout routine. Age ARP entries periodically.
+ */
+/* ARGSUSED */
+static void
+arptimer(
+	__unused void *ignored_arg)
+{
+	struct llinfo_arp *la = llinfo_arp.lh_first;
+	struct llinfo_arp *ola;
+	struct timeval timenow;
+
+	lck_mtx_lock(rt_mtx);
+	getmicrotime(&timenow);
+	while ((ola = la) != 0) {
+		struct rtentry *rt = la->la_rt;
+		la = la->la_le.le_next;
+		if (rt->rt_expire && rt->rt_expire <= timenow.tv_sec)
+			arptfree(ola); /* timer has expired, clear */
+	}
+	lck_mtx_unlock(rt_mtx);
+	timeout(arptimer, (caddr_t)0, arpt_prune * hz);
+}
+
+/*
+ * Parallel to llc_rtrequest.
+ */
+static void
+arp_rtrequest(
+	int req,
+	struct rtentry *rt,
+	__unused struct sockaddr *sa)
+{
+	struct sockaddr *gate = rt->rt_gateway;
+	struct llinfo_arp *la = (struct llinfo_arp *)rt->rt_llinfo;
+	static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK, 0, 0, 0, 0, 0, {0}};
+	static int arpinit_done;
+	struct timeval timenow;
+
+	if (!arpinit_done) {
+		arpinit_done = 1;
+		LIST_INIT(&llinfo_arp);
+		timeout(arptimer, (caddr_t)0, hz);
+	}
+	lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED);
+
+	if (rt->rt_flags & RTF_GATEWAY)
+		return;
+	getmicrotime(&timenow);
+	switch (req) {
+
+	case RTM_ADD:
+		/*
+		 * XXX: If this is a manually added route to an interface,
+		 * such as an older version of routed or gated might provide,
+		 * restore the cloning bit.
+		 */
+		if ((rt->rt_flags & RTF_HOST) == 0 &&
+		    SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff)
+			rt->rt_flags |= RTF_CLONING;
+		if (rt->rt_flags & RTF_CLONING) {
+			/*
+			 * Case 1: This route should come from a route to iface.
+			 */
+			rt_setgate(rt, rt_key(rt),
+			    (struct sockaddr *)&null_sdl);
+			gate = rt->rt_gateway;
+			SDL(gate)->sdl_type = rt->rt_ifp->if_type;
+			SDL(gate)->sdl_index = rt->rt_ifp->if_index;
+			rt->rt_expire = timenow.tv_sec;
+			break;
+		}
+		/* Announce a new entry if requested. */
+		if (rt->rt_flags & RTF_ANNOUNCE)
+			dlil_send_arp(rt->rt_ifp, ARPOP_REQUEST, SDL(gate), rt_key(rt), (struct sockaddr_dl *)rt_key(rt), NULL);
+		/*FALLTHROUGH*/
+	case RTM_RESOLVE:
+		if (gate->sa_family != AF_LINK ||
+		    gate->sa_len < sizeof(null_sdl)) {
+			if (log_arp_warnings)
+				log(LOG_DEBUG, "arp_rtrequest: bad gateway value\n");
+			break;
+		}
+		SDL(gate)->sdl_type = rt->rt_ifp->if_type;
+		SDL(gate)->sdl_index = rt->rt_ifp->if_index;
+		if (la != 0)
+			break; /* This happens on a route change */
+		/*
+		 * Case 2: This route may come from cloning, or a manual route
+		 * add with a LL address.
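+		 * In either case an llinfo_arp entry is allocated below,
+		 * hung off the route through rt_llinfo, and the route is
+		 * tagged RTF_LLINFO so this module knows it owns the entry.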
+		 */
+		R_Malloc(la, struct llinfo_arp *, sizeof(*la));
+		rt->rt_llinfo = (caddr_t)la;
+		if (la == 0) {
+			if (log_arp_warnings)
+				log(LOG_DEBUG, "arp_rtrequest: malloc failed\n");
+			break;
+		}
+		arp_inuse++, arp_allocated++;
+		Bzero(la, sizeof(*la));
+		la->la_rt = rt;
+		rt->rt_flags |= RTF_LLINFO;
+		LIST_INSERT_HEAD(&llinfo_arp, la, la_le);
+
+#if INET
+		/*
+		 * This keeps the multicast addresses from showing up
+		 * in `arp -a' listings as unresolved. It's not actually
+		 * functional. The same is then done for broadcast.
+		 */
+		if (IN_MULTICAST(ntohl(SIN(rt_key(rt))->sin_addr.s_addr))) {
+			dlil_resolve_multi(rt->rt_ifp, rt_key(rt), gate, sizeof(struct sockaddr_dl));
+			rt->rt_expire = 0;
+		}
+		else if (in_broadcast(SIN(rt_key(rt))->sin_addr, rt->rt_ifp)) {
+			struct sockaddr_dl *gate_ll = SDL(gate);
+			size_t broadcast_len;
+			ifnet_llbroadcast_copy_bytes(rt->rt_ifp, LLADDR(gate_ll),
+			    sizeof(gate_ll->sdl_data),
+			    &broadcast_len);
+			gate_ll->sdl_alen = broadcast_len;
+			gate_ll->sdl_family = AF_LINK;
+			gate_ll->sdl_len = sizeof(struct sockaddr_dl);
+			rt->rt_expire = timenow.tv_sec;
+		}
+#endif
+
+		if (SIN(rt_key(rt))->sin_addr.s_addr ==
+		    (IA_SIN(rt->rt_ifa))->sin_addr.s_addr) {
+			/*
+			 * This test used to be
+			 *	if (loif.if_flags & IFF_UP)
+			 * It allowed local traffic to be forced
+			 * through the hardware by configuring the loopback down.
+			 * However, it causes problems during network configuration
+			 * for boards that can't receive packets they send.
+			 * It is now necessary to clear "useloopback" and remove
+			 * the route to force traffic out to the hardware.
+			 */
+			rt->rt_expire = 0;
+			ifnet_lladdr_copy_bytes(rt->rt_ifp, LLADDR(SDL(gate)), SDL(gate)->sdl_alen = 6);
+			if (useloopback)
+				rt->rt_ifp = loif;
+
+		}
+		break;
+
+	case RTM_DELETE:
+		if (la == 0)
+			break;
+		arp_inuse--;
+		LIST_REMOVE(la, la_le);
+		rt->rt_llinfo = 0;
+		rt->rt_flags &= ~RTF_LLINFO;
+		if (la->la_hold) {
+			m_freem(la->la_hold);
+		}
+		la->la_hold = NULL;
+		R_Free((caddr_t)la);
+	}
+}
+
+/*
+ * Convert a hardware address to a hex string for logging errors.
+ */
+static const char *
+sdl_addr_to_hex(const struct sockaddr_dl *sdl, char * orig_buf, int buflen)
+{
+	char * buf = orig_buf;
+	int i;
+	const u_char * lladdr = sdl->sdl_data;
+	int maxbytes = buflen / 3;
+
+	if (maxbytes > sdl->sdl_alen) {
+		maxbytes = sdl->sdl_alen;
+	}
+	*buf = '\0';
+	for (i = 0; i < maxbytes; i++) {
+		snprintf(buf, 3, "%02x", lladdr[i]);
+		buf += 2;
+		*buf = (i == maxbytes - 1) ? '\0' : ':';
+		buf++;
+	}
+	return (orig_buf);
+}
+
+/*
+ * arp_lookup_route will look up the route for a given address.
+ *
+ * The routing lock must be held. The address must be for a
+ * host on a local network on this interface.
+ */
+static errno_t
+arp_lookup_route(
+	const struct in_addr *addr,
+	int create,
+	int proxy,
+	route_t *route)
+{
+	struct sockaddr_inarp sin = {sizeof(sin), AF_INET, 0, {0}, {0}, 0, 0};
+	const char *why = 0;
+	errno_t error = 0;
+
+	// Caller is responsible for taking the routing lock
+	lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED);
+
+	sin.sin_addr.s_addr = addr->s_addr;
+	sin.sin_other = proxy ?
SIN_PROXY : 0; + + *route = rtalloc1_locked((const struct sockaddr*)&sin, create, 0); + if (*route == NULL) + return ENETUNREACH; + + rtunref(*route); + + if ((*route)->rt_flags & RTF_GATEWAY) { + why = "host is not on local network"; + + /* If there are no references to this route, purge it */ + if ((*route)->rt_refcnt <= 0 && ((*route)->rt_flags & RTF_WASCLONED) != 0) { + rtrequest_locked(RTM_DELETE, + (struct sockaddr *)rt_key(*route), + (*route)->rt_gateway, rt_mask(*route), + (*route)->rt_flags, 0); + } + *route = NULL; + error = ENETUNREACH; + } + else if (((*route)->rt_flags & RTF_LLINFO) == 0) { + why = "could not allocate llinfo"; + *route = NULL; + error = ENOMEM; + } + else if ((*route)->rt_gateway->sa_family != AF_LINK) { + why = "gateway route is not ours"; + *route = NULL; + error = EPROTONOSUPPORT; + } + + if (why && create && log_arp_warnings) { + char tmp[MAX_IPv4_STR_LEN]; + log(LOG_DEBUG, "arplookup %s failed: %s\n", + inet_ntop(AF_INET, addr, tmp, sizeof(tmp)), why); + } + + return error; +} + + +__private_extern__ errno_t +arp_route_to_gateway_route( + const struct sockaddr *net_dest, + route_t hint, + route_t *out_route); +/* + * arp_route_to_gateway_route will find the gateway route for a given route. + * + * If the route is down, look the route up again. + * If the route goes through a gateway, get the route to the gateway. + * If the gateway route is down, look it up again. + * If the route is set to reject, verify it hasn't expired. + */ +__private_extern__ errno_t +arp_route_to_gateway_route( + const struct sockaddr *net_dest, + route_t hint, + route_t *out_route) +{ + route_t route = hint; + *out_route = NULL; + struct timeval timenow; + + /* If we got a hint from the higher layers, check it out */ + if (route) { + lck_mtx_lock(rt_mtx); + + if ((route->rt_flags & RTF_UP) == 0) { + /* route is down, find a new one */ + hint = route = rtalloc1_locked(net_dest, 1, 0); + if (hint) { + rtunref(hint); + } + else { + /* No route to host */ + lck_mtx_unlock(rt_mtx); + return EHOSTUNREACH; + } + } + + if (route->rt_flags & RTF_GATEWAY) { + /* + * We need the gateway route. If it is NULL or down, + * look it up. + */ + if (route->rt_gwroute == 0 || + (route->rt_gwroute->rt_flags & RTF_UP) == 0) { + if (route->rt_gwroute != 0) + rtfree_locked(route->rt_gwroute); + + route->rt_gwroute = rtalloc1_locked(route->rt_gateway, 1, 0); + if (route->rt_gwroute == 0) { + lck_mtx_unlock(rt_mtx); + return EHOSTUNREACH; + } + } + + route = route->rt_gwroute; + } + + if (route->rt_flags & RTF_REJECT) { + getmicrotime(&timenow); + if (route->rt_rmx.rmx_expire == 0 || + timenow.tv_sec < route->rt_rmx.rmx_expire) { + lck_mtx_unlock(rt_mtx); + return route == hint ? 
EHOSTDOWN : EHOSTUNREACH; + } + } + + lck_mtx_unlock(rt_mtx); + } + + *out_route = route; + return 0; +} + +errno_t +arp_lookup_ip( + ifnet_t ifp, + const struct sockaddr_in *net_dest, + struct sockaddr_dl *ll_dest, + size_t ll_dest_len, + route_t hint, + mbuf_t packet) +{ + route_t route = NULL; + errno_t result = 0; + struct sockaddr_dl *gateway; + struct llinfo_arp *llinfo; + struct timeval timenow; + + if (net_dest->sin_family != AF_INET) + return EAFNOSUPPORT; + + if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING)) + return ENETDOWN; + + /* + * If we were given a route, verify the route and grab the gateway + */ + if (hint) { + result = arp_route_to_gateway_route((const struct sockaddr*)net_dest, + hint, &route); + if (result != 0) + return result; + } + + if (packet->m_flags & M_BCAST) { + u_long broadcast_len; + bzero(ll_dest, ll_dest_len); + result = ifnet_llbroadcast_copy_bytes(ifp, LLADDR(ll_dest), ll_dest_len + - offsetof(struct sockaddr_dl, + sdl_data), &broadcast_len); + if (result != 0) { + return result; + } + + ll_dest->sdl_alen = broadcast_len; + ll_dest->sdl_family = AF_LINK; + ll_dest->sdl_len = sizeof(struct sockaddr_dl); + + return 0; + } + if (packet->m_flags & M_MCAST) { + return dlil_resolve_multi(ifp, (const struct sockaddr*)net_dest, + (struct sockaddr*)ll_dest, ll_dest_len); + } + + lck_mtx_lock(rt_mtx); + + /* + * If we didn't find a route, or the route doesn't have + * link layer information, trigger the creation of the + * route and link layer information. + */ + if (route == NULL || route->rt_llinfo == NULL) + result = arp_lookup_route(&net_dest->sin_addr, 1, 0, &route); + + if (result || route == NULL || route->rt_llinfo == NULL) { + char tmp[MAX_IPv4_STR_LEN]; + lck_mtx_unlock(rt_mtx); + if (log_arp_warnings) + log(LOG_DEBUG, "arpresolve: can't allocate llinfo for %s\n", + inet_ntop(AF_INET, &net_dest->sin_addr, tmp, sizeof(tmp))); + return result; + } + + /* + * Now that we have the right route, is it filled in? + */ + gateway = SDL(route->rt_gateway); + getmicrotime(&timenow); + if ((route->rt_rmx.rmx_expire == 0 || route->rt_rmx.rmx_expire > timenow.tv_sec) && + gateway != NULL && gateway->sdl_family == AF_LINK && gateway->sdl_alen != 0) { + bcopy(gateway, ll_dest, MIN(gateway->sdl_len, ll_dest_len)); + lck_mtx_unlock(rt_mtx); + return 0; + } + + /* + * Route wasn't complete/valid. We need to arp. 
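+	 * The latest packet is parked in la_hold (replacing any previous
+	 * one), a request is sent at most once per second until
+	 * arp_maxtries attempts have gone unanswered, and after that the
+	 * route is flagged RTF_REJECT for arpt_down seconds so we stop
+	 * flooding an unresponsive host with requests.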
+ */ + if (ifp->if_flags & IFF_NOARP) { + lck_mtx_unlock(rt_mtx); + return ENOTSUP; + } + + llinfo = (struct llinfo_arp*)route->rt_llinfo; + if (packet) { + if (llinfo->la_hold) { + m_freem(llinfo->la_hold); + } + llinfo->la_hold = packet; + } + + if (route->rt_rmx.rmx_expire) { + route->rt_flags &= ~RTF_REJECT; + if (llinfo->la_asked == 0 || route->rt_rmx.rmx_expire != timenow.tv_sec) { + route->rt_rmx.rmx_expire = timenow.tv_sec; + if (llinfo->la_asked++ < arp_maxtries) { + lck_mtx_unlock(rt_mtx); + dlil_send_arp(ifp, ARPOP_REQUEST, NULL, route->rt_ifa->ifa_addr, + NULL, (const struct sockaddr*)net_dest); + return EJUSTRETURN; + } + else { + route->rt_flags |= RTF_REJECT; + route->rt_rmx.rmx_expire += arpt_down; + llinfo->la_asked = 0; + llinfo->la_hold = 0; + lck_mtx_unlock(rt_mtx); + return EHOSTUNREACH; + } + } + } + lck_mtx_unlock(rt_mtx); + + return EJUSTRETURN; +} + +errno_t +arp_ip_handle_input( + ifnet_t ifp, + u_short arpop, + const struct sockaddr_dl *sender_hw, + const struct sockaddr_in *sender_ip, + const struct sockaddr_in *target_ip) +{ + char ipv4str[MAX_IPv4_STR_LEN]; + struct sockaddr_dl *gateway; + struct in_ifaddr *ia; + struct in_ifaddr *best_ia = NULL; + route_t route = NULL; + char buf[3 * MAX_HW_LEN]; // enough for MAX_HW_LEN byte hw address + struct llinfo_arp *llinfo; + struct timeval timenow; + errno_t error; + + /* Do not respond to requests for 0.0.0.0 */ + if (target_ip->sin_addr.s_addr == 0 && arpop == ARPOP_REQUEST) { + return 0; + } + + /* + * Determine if this ARP is for us + */ + lck_mtx_lock(rt_mtx); + for (ia = in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) { + /* do_bridge should be tested here for bridging */ + if (ia->ia_ifp == ifp) { + best_ia = ia; + if (target_ip->sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr || + sender_ip->sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr) { + break; + } + } + } + + /* If we don't have an IP address on this interface, ignore the packet */ + if (best_ia == 0) { + lck_mtx_unlock(rt_mtx); + return 0; + } + + /* If the packet is from this interface, ignore the packet */ + if (!bcmp(CONST_LLADDR(sender_hw), ifnet_lladdr(ifp), sender_hw->sdl_len)) { + lck_mtx_unlock(rt_mtx); + return 0; + } + + /* Check for a conflict */ + if (sender_ip->sin_addr.s_addr == best_ia->ia_addr.sin_addr.s_addr) { + struct kev_msg ev_msg; + struct kev_in_collision *in_collision; + u_char storage[sizeof(struct kev_in_collision) + MAX_HW_LEN]; + in_collision = (struct kev_in_collision*)storage; + log(LOG_ERR, "%s%d duplicate IP address %s sent from address %s\n", + ifp->if_name, ifp->if_unit, + inet_ntop(AF_INET, &sender_ip->sin_addr, ipv4str, sizeof(ipv4str)), + sdl_addr_to_hex(sender_hw, buf, sizeof(buf))); + + /* Send a kernel event so anyone can learn of the conflict */ + in_collision->link_data.if_family = ifp->if_family; + in_collision->link_data.if_unit = ifp->if_unit; + strncpy(&in_collision->link_data.if_name[0], ifp->if_name, IFNAMSIZ); + in_collision->ia_ipaddr = sender_ip->sin_addr; + in_collision->hw_len = sender_hw->sdl_alen < MAX_HW_LEN ? 
sender_hw->sdl_alen : MAX_HW_LEN; + bcopy(CONST_LLADDR(sender_hw), (caddr_t)in_collision->hw_addr, in_collision->hw_len); + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = KEV_INET_SUBCLASS; + ev_msg.event_code = KEV_INET_ARPCOLLISION; + ev_msg.dv[0].data_ptr = in_collision; + ev_msg.dv[0].data_length = sizeof(struct kev_in_collision) + in_collision->hw_len; + ev_msg.dv[1].data_length = 0; + kev_post_msg(&ev_msg); + + goto respond; + } + + /* + * Look up the routing entry. If it doesn't exist and we are the + * target, go ahead and create one. + */ + error = arp_lookup_route(&sender_ip->sin_addr, (target_ip->sin_addr.s_addr == + best_ia->ia_addr.sin_addr.s_addr), 0, &route); + + if (error || route == 0 || route->rt_gateway == 0) { + if (ipv4_ll_arp_aware != 0 && IN_LINKLOCAL(target_ip->sin_addr.s_addr) + && arpop == ARPOP_REQUEST && sender_ip->sin_addr.s_addr == 0) { + /* + * Verify this ARP probe doesn't conflict with an IPv4LL we know of + * on another interface. + */ + error = arp_lookup_route(&target_ip->sin_addr, 0, 0, &route); + if (error == 0 && route && route->rt_gateway) { + gateway = SDL(route->rt_gateway); + if (route->rt_ifp != ifp && + (gateway->sdl_alen != sender_hw->sdl_alen || + bcmp(CONST_LLADDR(gateway), CONST_LLADDR(sender_hw), + gateway->sdl_alen) != 0)) { + /* + * A node is probing for an IPv4LL we know exists on a + * different interface. We respond with a conflicting probe + * to force the new device to pick a different IPv4LL + * address. + */ + log(LOG_INFO, + "arp: %s on %s%d sent probe for %s, already on %s%d\n", + sdl_addr_to_hex(sender_hw, buf, sizeof(buf)), + ifp->if_name, ifp->if_unit, + inet_ntop(AF_INET, &target_ip->sin_addr, ipv4str, + sizeof(ipv4str)), + route->rt_ifp->if_name, route->rt_ifp->if_unit); + log(LOG_INFO, + "arp: sending conflicting probe to %s on %s%d\n", + sdl_addr_to_hex(sender_hw, buf, sizeof(buf)), + ifp->if_name, ifp->if_unit); + + /* + * Send a conservative unicast "ARP probe". + * This should force the other device to pick a new number. + * This will not force the device to pick a new number if the device + * has already assigned that number. + * This will not imply to the device that we own that address. + */ + dlil_send_arp_internal(ifp, ARPOP_REQUEST, + (struct sockaddr_dl*)TAILQ_FIRST(&ifp->if_addrhead)->ifa_addr, + (const struct sockaddr*)sender_ip, sender_hw, + (const struct sockaddr*)target_ip); + } + } + } + + goto respond; + } + + gateway = SDL(route->rt_gateway); + if (route->rt_ifp != ifp) { + if (!IN_LINKLOCAL(sender_ip->sin_addr.s_addr) || (ifp->if_eflags & IFEF_ARPLL) == 0) { + if (log_arp_warnings) + log(LOG_ERR, "arp: %s is on %s%d but got reply from %s on %s%d\n", + inet_ntop(AF_INET, &sender_ip->sin_addr, ipv4str, + sizeof(ipv4str)), + route->rt_ifp->if_name, + route->rt_ifp->if_unit, + sdl_addr_to_hex(sender_hw, buf, sizeof(buf)), + ifp->if_name, ifp->if_unit); + goto respond; + } + else { + /* Don't change a permanent address */ + if (route->rt_rmx.rmx_expire == 0) { + goto respond; + } + + /* + * Don't change the cloned route away from the parent's interface + * if the address did resolve. 
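+			 * Only an entry that is still unresolved
+			 * (sdl_alen == 0), or whose parent route sits on a
+			 * different interface, may migrate to the interface
+			 * the packet actually arrived on.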
+			 */
+			if (gateway->sdl_alen != 0 && route->rt_parent &&
+			    route->rt_parent->rt_ifp == route->rt_ifp) {
+				goto respond;
+			}
+
+			/* Change the interface when the existing route is on */
+			route->rt_ifp = ifp;
+			rtsetifa(route, &best_ia->ia_ifa);
+			gateway->sdl_index = ifp->if_index;
+		}
+	}
+
+	if (gateway->sdl_alen && bcmp(LLADDR(gateway), CONST_LLADDR(sender_hw), gateway->sdl_alen)) {
+		if (route->rt_rmx.rmx_expire) {
+			char buf2[3 * MAX_HW_LEN];
+			log(LOG_INFO, "arp: %s moved from %s to %s on %s%d\n",
+			    inet_ntop(AF_INET, &sender_ip->sin_addr, ipv4str,
+			        sizeof(ipv4str)),
+			    sdl_addr_to_hex(gateway, buf, sizeof(buf)),
+			    sdl_addr_to_hex(sender_hw, buf2, sizeof(buf2)), ifp->if_name,
+			    ifp->if_unit);
+		}
+		else {
+			log(LOG_ERR,
+			    "arp: %s attempts to modify permanent entry for %s on %s%d\n",
+			    sdl_addr_to_hex(sender_hw, buf, sizeof(buf)),
+			    inet_ntop(AF_INET, &sender_ip->sin_addr, ipv4str,
+			        sizeof(ipv4str)),
+			    ifp->if_name, ifp->if_unit);
+			goto respond;
+		}
+	}
+
+	/* Copy the sender hardware address into the route's gateway address */
+	gateway->sdl_alen = sender_hw->sdl_alen;
+	bcopy(CONST_LLADDR(sender_hw), LLADDR(gateway), gateway->sdl_alen);
+
+	/* Update the expire time for the route and clear the reject flag */
+	getmicrotime(&timenow);
+	if (route->rt_rmx.rmx_expire)
+		route->rt_rmx.rmx_expire = timenow.tv_sec + arpt_keep;
+	route->rt_flags &= ~RTF_REJECT;
+
+	/* update the llinfo, send a queued packet if there is one */
+	llinfo = (struct llinfo_arp*)route->rt_llinfo;
+	llinfo->la_asked = 0;
+	if (llinfo->la_hold) {
+		struct mbuf *m0;
+		m0 = llinfo->la_hold;
+		llinfo->la_hold = 0;
+
+		/* Should we take a reference on the route first? */
+		lck_mtx_unlock(rt_mtx);
+		dlil_output(ifp, PF_INET, m0, (caddr_t)route, rt_key(route), 0);
+		lck_mtx_lock(rt_mtx);
+	}
+
+respond:
+	if (arpop != ARPOP_REQUEST) {
+		lck_mtx_unlock(rt_mtx);
+		return 0;
+	}
+
+	/* If we are not the target, check if we should proxy */
+	if (target_ip->sin_addr.s_addr != best_ia->ia_addr.sin_addr.s_addr) {
+
+		/* Find a proxy route */
+		error = arp_lookup_route(&target_ip->sin_addr, 0, SIN_PROXY, &route);
+		if (error || route == NULL) {
+
+			/* We don't have a route entry indicating we should use proxy */
+			/* If we aren't supposed to proxy all, we are done */
+			if (!arp_proxyall) {
+				lck_mtx_unlock(rt_mtx);
+				return 0;
+			}
+
+			/* See if we have a route to the target ip before we proxy it */
+			route = rtalloc1_locked((const struct sockaddr*)target_ip, 0, 0);
+			if (!route) {
+				lck_mtx_unlock(rt_mtx);
+				return 0;
+			}
+
+			/*
+			 * Don't proxy for hosts already on the same interface.
+			 */
+			if (route->rt_ifp == ifp) {
+				rtfree_locked(route);
+				lck_mtx_unlock(rt_mtx);
+				return 0;
+			}
+		}
+	}
+	lck_mtx_unlock(rt_mtx);
+
+	dlil_send_arp(ifp, ARPOP_REPLY, NULL, (const struct sockaddr*)target_ip,
+	    sender_hw, (const struct sockaddr*)sender_ip);
+
+	return 0;
+}
+
+void
+arp_ifinit(
+	struct ifnet *ifp,
+	struct ifaddr *ifa)
+{
+	ifa->ifa_rtrequest = arp_rtrequest;
+	ifa->ifa_flags |= RTF_CLONING;
+	dlil_send_arp(ifp, ARPOP_REQUEST, NULL, ifa->ifa_addr, NULL, ifa->ifa_addr);
+}
diff --git a/bsd/netinet/in_arp.h b/bsd/netinet/in_arp.h
new file mode 100644
index 000000000..e32f597a7
--- /dev/null
+++ b/bsd/netinet/in_arp.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2004 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License").
You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef _NETINET_IN_ARP_H_ +#define _NETINET_IN_ARP_H_ +#include <sys/kernel_types.h> + +struct sockaddr_dl; +struct sockaddr_in; + +/*! + @function inet_arp_lookup + @discussion This function will check the routing table for a cached + arp entry or trigger an arp query to resolve the ip address to a + link-layer address. + + Arp entries are stored in the routing table. This function will + lookup the ip destination in the routing table. If the + destination requires forwarding to a gateway, the route of the + gateway will be looked up. The route entry is inspected to + determine if the link layer destination address is known. If + unknown, the arp generation function for IP attached to the + interface is called to create an arp request packet. + @param interface The interface the packet is being sent on. + @param ip_dest The ip destination of the packet. + @param ll_dest On output, the link-layer destination. + @param ll_dest_len The length of the buffer for ll_dest. + @param hint Any routing hint passed down from the protocol. + @param packet The packet being transmitted. + @result May return an error such as EHOSTDOWN or ENETUNREACH. If + this function returns EJUSTRETURN, the packet has been queued + and will be sent when an arp response is received. If any other + value is returned, the caller is responsible for disposing of + the packet. + */ +#ifdef BSD_KERNEL_PRIVATE +#define inet_arp_lookup arp_lookup_ip +#else +errno_t inet_arp_lookup(ifnet_t interface, const struct sockaddr_in *ip_dest, + struct sockaddr_dl *ll_dest, size_t ll_dest_len, route_t hint, + mbuf_t packet); +#endif /* BSD_KERNEL_PRIVATE */ +#ifdef KERNEL_PRIVATE +/* arp_lookup_ip is obsolete, use inet_arp_lookup */ +errno_t arp_lookup_ip(ifnet_t interface, const struct sockaddr_in *ip_dest, + struct sockaddr_dl *ll_dest, size_t ll_dest_len, route_t hint, + mbuf_t packet); +#endif /* KERNEL_PRIVATE */ + +/*! + @function inet_arp_handle_input + @discussion This function should be called by code that handles + inbound arp packets. The caller should parse the ARP packet to + pull out the operation and the relevant addresses. If a response + is required, the proto_media_send_arp function will be called. + + This function will lookup the sender in the routing table and + add an arp entry if necessary. Any queued packets waiting for + the arp resolution will also be transmitted. + @param interface The interface the packet was received on. + @param arp_op The arp operation, ARPOP_REQUEST or ARPOP_REPLY + @param sender_hw The sender hardware address from the arp payload. + @param sender_ip The sender IP address from the arp payload. + @param target_ip The target IP address from the arp payload. + @result 0 on success or an errno error value on failure. 
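+
+	A minimal call sketch (hypothetical; the caller must already have
+	parsed the ARP payload into sockaddr_dl/sockaddr_in form):
+
+	errno_t err = inet_arp_handle_input(interface, arp_op,
+	    sender_hw, sender_ip, target_ip);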
+ */
+#ifdef BSD_KERNEL_PRIVATE
+#define inet_arp_handle_input arp_ip_handle_input
+#else
+errno_t inet_arp_handle_input(ifnet_t ifp, u_int16_t arpop,
+    const struct sockaddr_dl *sender_hw,
+    const struct sockaddr_in *sender_ip,
+    const struct sockaddr_in *target_ip);
+#endif /* BSD_KERNEL_PRIVATE */
+#ifdef KERNEL_PRIVATE
+/* arp_ip_handle_input is obsolete, use inet_arp_handle_input */
+errno_t arp_ip_handle_input(ifnet_t ifp, u_int16_t arpop,
+    const struct sockaddr_dl *sender_hw,
+    const struct sockaddr_in *sender_ip,
+    const struct sockaddr_in *target_ip);
+#endif /* KERNEL_PRIVATE */
+
+/*!
+	@function inet_arp_init_ifaddr
+	@discussion This function should be called in two places, when an IP
+		address is added and when the hardware address changes. This
+		function will set up the ifaddr_t for use with the IP ARP
+		functions. This function will also trigger the transmission of a
+		gratuitous ARP packet.
+
+		When the SIOCSIFADDR ioctl is handled, the data parameter will
+		be an ifaddr_t. If this is an IP address, inet_arp_init_ifaddr
+		should be called. This is usually performed in the protocol
+		attachment's ioctl handler.
+
+		When the event handler for the protocol attachment receives a
+		KEV_DL_LINK_ADDRESS_CHANGED event, the event handler should call
+		inet_arp_init_ifaddr for each interface ip address.
+
+		For an example, see bsd/net/ether_inet_pr_module.c in xnu.
+		Search for inet_arp_init_ifaddr.
+	@param interface The interface the address is assigned to.
+	@param ipaddr The ip interface address.
+ */
+#ifdef BSD_KERNEL_PRIVATE
+/* inet_arp_init_ifaddr is aliased to arp_ifinit */
+#define inet_arp_init_ifaddr arp_ifinit
+#else
+void inet_arp_init_ifaddr(ifnet_t interface, ifaddr_t ipaddr);
+#endif
+
+#endif _NETINET_IN_ARP_H_
diff --git a/bsd/netinet/in_bootp.c b/bsd/netinet/in_bootp.c
index 8a1055ebc..903262d4c 100644
--- a/bsd/netinet/in_bootp.c
+++ b/bsd/netinet/in_bootp.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
* * @APPLE_LICENSE_HEADER_START@ * @@ -49,6 +49,7 @@ #include <sys/vnode.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <sys/uio_internal.h> #include <net/if.h> #include <net/if_dl.h> #include <net/if_types.h> @@ -66,12 +67,22 @@ #include <sys/malloc.h> #include <netinet/dhcp_options.h> +#include <kern/kern_types.h> +#include <kern/kalloc.h> + #ifdef BOOTP_DEBUG #define dprintf(x) printf x; #else /* !BOOTP_DEBUG */ #define dprintf(x) #endif /* BOOTP_DEBUG */ +int bootp(struct ifnet * ifp, struct in_addr * iaddr_p, int max_try, + struct in_addr * netmask_p, struct in_addr * router_p, + struct proc * procp); +struct mbuf * ip_pkt_to_mbuf(caddr_t pkt, int pktsize); +int receive_packet(struct socket * so, caddr_t pp, int psize, int * actual_size); + + /* ip address formatting macros */ #define IP_FORMAT "%d.%d.%d.%d" #define IP_CH(ip) ((u_char *)ip) @@ -85,7 +96,7 @@ blank_sin() } static __inline__ void -print_reply(struct bootp *bp, int bp_len) +print_reply(struct bootp *bp, __unused int bp_len) { int i, j, len; @@ -130,7 +141,7 @@ print_reply(struct bootp *bp, int bp_len) } static __inline__ void -print_reply_short(struct bootp *bp, int bp_len) +print_reply_short(struct bootp *bp, __unused int bp_len) { printf("bp_yiaddr = " IP_FORMAT "\n", IP_LIST(&bp->bp_yiaddr)); printf("bp_sname = %s\n", bp->bp_sname); @@ -240,13 +251,16 @@ link_from_ifnet(struct ifnet * ifp) /* for (addr = ifp->if_addrlist; addr; addr = addr->ifa_next) */ + ifnet_lock_shared(ifp); TAILQ_FOREACH(addr, &ifp->if_addrhead, ifa_link) { if (addr->ifa_addr->sa_family == AF_LINK) { struct sockaddr_dl * dl_p = (struct sockaddr_dl *)(addr->ifa_addr); + ifnet_lock_done(ifp); return (dl_p); } } + ifnet_lock_done(ifp); return (NULL); } @@ -257,7 +271,7 @@ link_from_ifnet(struct ifnet * ifp) * bypassing routing code. 
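+ * (The request is built as a raw IP/UDP packet in an mbuf chain and
+ * handed directly to the interface via dlil_output() with a broadcast
+ * destination, presumably because no usable route or source address
+ * exists yet while bootp is still configuring the interface.)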
*/ static int -send_bootp_request(struct ifnet * ifp, struct socket * so, +send_bootp_request(struct ifnet * ifp, __unused struct socket * so, struct bootp_packet * pkt) { struct mbuf * m; @@ -269,7 +283,7 @@ send_bootp_request(struct ifnet * ifp, struct socket * so, sin.sin_addr.s_addr = INADDR_BROADCAST; m = ip_pkt_to_mbuf((caddr_t)pkt, sizeof(*pkt)); - return (dlil_output(ifptodlt(ifp, PF_INET), m, 0, (struct sockaddr *)&sin, 0)); + return dlil_output(ifp, PF_INET, m, 0, (struct sockaddr *)&sin, 0); } /* @@ -280,23 +294,18 @@ send_bootp_request(struct ifnet * ifp, struct socket * so, int receive_packet(struct socket * so, caddr_t pp, int psize, int * actual_size) { - struct iovec aiov; - struct uio auio; + uio_t auio; int rcvflg; int error; + char uio_buf[ UIO_SIZEOF(1) ]; - aiov.iov_base = pp; - aiov.iov_len = psize; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_offset = 0; - auio.uio_resid = psize; - auio.uio_rw = UIO_READ; + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, + &uio_buf[0], sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(pp), psize); rcvflg = MSG_WAITALL; - error = soreceive(so, (struct sockaddr **) 0, &auio, 0, 0, &rcvflg); - *actual_size = psize - auio.uio_resid; + error = soreceive(so, (struct sockaddr **) 0, auio, 0, 0, &rcvflg); + *actual_size = psize - uio_resid(auio); return (error); } @@ -310,14 +319,13 @@ bootp_timeout(void * arg) { struct socket * * socketflag = (struct socket * *)arg; struct socket * so = *socketflag; - boolean_t funnel_state; dprintf(("bootp: timeout\n")); - funnel_state = thread_funnel_set(network_flock,TRUE); *socketflag = NULL; + socket_lock(so, 1); sowakeup(so, &so->so_rcv); - (void) thread_funnel_set(network_flock, FALSE); + socket_unlock(so, 1); return; } @@ -331,7 +339,7 @@ bootp_timeout(void * arg) */ #define GOOD_RATING 3 static __inline__ int -rate_packet(struct bootp * pkt, int pkt_size, dhcpol_t * options_p) +rate_packet(__unused struct bootp * pkt, __unused int pkt_size, dhcpol_t * options_p) { int len; int rating = 1; @@ -501,8 +509,11 @@ bootp_loop(struct socket * so, struct ifnet * ifp, int max_try, } break; /* retry */ } - else - sbwait(&so->so_rcv); + else { + socket_lock(so, 1); + error = sbwait(&so->so_rcv); + socket_unlock(so, 1); + } } if (error && (error != EWOULDBLOCK)) { dprintf(("bootp: failed to receive packets: %d\n", error)); @@ -523,9 +534,9 @@ bootp_loop(struct socket * so, struct ifnet * ifp, int max_try, cleanup: if (request) - kfree((caddr_t)request, sizeof (*request)); + kfree(request, sizeof (*request)); if (reply) - kfree((caddr_t)reply, reply_size); + kfree(reply, reply_size); return (error); } @@ -583,7 +594,9 @@ int bootp(struct ifnet * ifp, struct in_addr * iaddr_p, int max_try, dprintf(("bootp: sobind failed, %d\n", error)); goto cleanup; } + socket_lock(so, 1); so->so_state |= SS_NBIO; + socket_unlock(so, 1); } /* do the protocol */ error = bootp_loop(so, ifp, max_try, iaddr_p, netmask_p, router_p); diff --git a/bsd/netinet/in_gif.c b/bsd/netinet/in_gif.c index 1971b49bc..6b5b9efff 100644 --- a/bsd/netinet/in_gif.c +++ b/bsd/netinet/in_gif.c @@ -91,11 +91,11 @@ SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW, &ip_gif_ttl, 0, ""); int -in_gif_output(ifp, family, m, rt) - struct ifnet *ifp; - int family; - struct mbuf *m; - struct rtentry *rt; +in_gif_output( + struct ifnet *ifp, + int family, + struct mbuf *m, + struct rtentry *rt) { struct gif_softc *sc = (struct gif_softc*)ifp; struct sockaddr_in *dst = (struct sockaddr_in 
*)&sc->gif_ro.ro_dst; @@ -342,14 +342,18 @@ gif_encapcheck4(m, off, proto, arg) return 0; } /* reject packets with broadcast on source */ + lck_mtx_lock(rt_mtx); for (ia4 = TAILQ_FIRST(&in_ifaddrhead); ia4; ia4 = TAILQ_NEXT(ia4, ia_link)) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) continue; - if (ip.ip_src.s_addr == ia4->ia_broadaddr.sin_addr.s_addr) + if (ip.ip_src.s_addr == ia4->ia_broadaddr.sin_addr.s_addr) { + lck_mtx_unlock(rt_mtx); return 0; + } } + lck_mtx_unlock(rt_mtx); /* ingress filters on outer source */ if ((sc->gif_if.if_flags & IFF_LINK2) == 0 && diff --git a/bsd/netinet/in_gif.h b/bsd/netinet/in_gif.h index e6d09d293..4321eba46 100644 --- a/bsd/netinet/in_gif.h +++ b/bsd/netinet/in_gif.h @@ -53,15 +53,17 @@ #ifndef _NETINET_IN_GIF_H_ #define _NETINET_IN_GIF_H_ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL +#ifdef KERNEL_PRIVATE #define GIF_TTL 30 extern int ip_gif_ttl; -void in_gif_input __P((struct mbuf *, int)); -int in_gif_output __P((struct ifnet *, int, struct mbuf *, struct rtentry *)); -int gif_encapcheck4 __P((const struct mbuf *, int, int, void *)); +void in_gif_input(struct mbuf *, int); +int in_gif_output(struct ifnet *, int, struct mbuf *, struct rtentry *); +int gif_encapcheck4(const struct mbuf *, int, int, void *); -#endif /* __APPLE_API_PRIVATE */ -#endif /*_NETINET_IN_GIF_H_*/ +#endif KERNEL_PRIVATE +#endif KERNEL +#endif _NETINET_IN_GIF_H_ diff --git a/bsd/netinet/in_pcb.c b/bsd/netinet/in_pcb.c index 9a8696d2b..c3b56f784 100644 --- a/bsd/netinet/in_pcb.c +++ b/bsd/netinet/in_pcb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -69,6 +69,7 @@ #endif #include <sys/kernel.h> #include <sys/sysctl.h> +#include <libkern/OSAtomic.h> #include <machine/limits.h> @@ -100,6 +101,7 @@ #if IPSEC extern int ipsec_bypass; +extern lck_mtx_t *sadb_mutex; #endif extern u_long route_generation; @@ -182,7 +184,9 @@ in_pcballoc(so, pcbinfo, p) register struct inpcb *inp; caddr_t temp; #if IPSEC +#ifndef __APPLE__ int error; +#endif #endif if (so->cached_in_sock_layer == 0) { @@ -207,12 +211,24 @@ in_pcballoc(so, pcbinfo, p) inp->inp_gencnt = ++pcbinfo->ipi_gencnt; inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; + so->so_pcb = (caddr_t)inp; + + if (so->so_proto->pr_flags & PR_PCBLOCK) { + inp->inpcb_mtx = lck_mtx_alloc_init(pcbinfo->mtx_grp, pcbinfo->mtx_attr); + if (inp->inpcb_mtx == NULL) { + printf("in_pcballoc: can't alloc mutex! 
so=%x\n", so); + return(ENOMEM); + } + } + #if IPSEC #ifndef __APPLE__ if (ipsec_bypass == 0) { + lck_mtx_lock(sadb_mutex); error = ipsec_init_policy(so, &inp->inp_sp); + lck_mtx_unlock(sadb_mutex); if (error != 0) { - zfree(pcbinfo->ipi_zone, (vm_offset_t)inp); + zfree(pcbinfo->ipi_zone, inp); return error; } } @@ -222,13 +238,16 @@ in_pcballoc(so, pcbinfo, p) if (INP_SOCKAF(so) == AF_INET6 && !ip6_mapped_addr_on) inp->inp_flags |= IN6P_IPV6_V6ONLY; #endif - LIST_INSERT_HEAD(pcbinfo->listhead, inp, inp_list); - pcbinfo->ipi_count++; - so->so_pcb = (caddr_t)inp; + #if INET6 if (ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif + lck_rw_lock_exclusive(pcbinfo->mtx); + inp->inp_gencnt = ++pcbinfo->ipi_gencnt; + LIST_INSERT_HEAD(pcbinfo->listhead, inp, inp_list); + pcbinfo->ipi_count++; + lck_rw_done(pcbinfo->mtx); return (0); } @@ -252,17 +271,25 @@ in_pcbbind(inp, nam, p) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) wild = 1; + socket_unlock(so, 0); /* keep reference on socket */ + lck_rw_lock_exclusive(pcbinfo->mtx); if (nam) { sin = (struct sockaddr_in *)nam; - if (nam->sa_len != sizeof (*sin)) + if (nam->sa_len != sizeof (*sin)) { + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); return (EINVAL); + } #ifdef notdef /* * We should check the family, but old programs * incorrectly fail to initialize it. */ - if (sin->sin_family != AF_INET) + if (sin->sin_family != AF_INET) { + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); return (EAFNOSUPPORT); + } #endif lport = sin->sin_port; if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { @@ -276,17 +303,27 @@ in_pcbbind(inp, nam, p) if (so->so_options & SO_REUSEADDR) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (sin->sin_addr.s_addr != INADDR_ANY) { + struct ifaddr *ifa; sin->sin_port = 0; /* yech... */ - if (ifa_ifwithaddr((struct sockaddr *)sin) == 0) + if ((ifa = ifa_ifwithaddr((struct sockaddr *)sin)) == 0) { + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); return (EADDRNOTAVAIL); + } + else { + ifafree(ifa); + } } if (lport) { struct inpcb *t; /* GROSS */ if (ntohs(lport) < IPPORT_RESERVED && p && - suser(p->p_ucred, &p->p_acflag)) + proc_suser(p)) { + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); return (EACCES); + } if (so->so_uid && !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { t = in_pcblookup_local(inp->inp_pcbinfo, @@ -303,9 +340,12 @@ in_pcbbind(inp, nam, p) ntohl(t->inp_laddr.s_addr) != INADDR_ANY || INP_SOCKAF(so) == - INP_SOCKAF(t->inp_socket)) + INP_SOCKAF(t->inp_socket)) { #endif /* defined(INET6) */ - return (EADDRINUSE); + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); + return (EADDRINUSE); + } } } t = in_pcblookup_local(pcbinfo, sin->sin_addr, @@ -319,9 +359,12 @@ in_pcbbind(inp, nam, p) ntohl(t->inp_laddr.s_addr) != INADDR_ANY || INP_SOCKAF(so) == - INP_SOCKAF(t->inp_socket)) + INP_SOCKAF(t->inp_socket)) { #endif /* defined(INET6) */ - return (EADDRINUSE); + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); + return (EADDRINUSE); + } } } inp->inp_laddr = sin->sin_addr; @@ -337,8 +380,11 @@ in_pcbbind(inp, nam, p) last = ipport_hilastauto; lastport = &pcbinfo->lasthi; } else if (inp->inp_flags & INP_LOWPORT) { - if (p && (error = suser(p->p_ucred, &p->p_acflag))) + if (p && (error = proc_suser(p))) { + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); return error; + } first = ipport_lowfirstauto; /* 1023 */ last = ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->lastlow; @@ -362,6 +408,8 @@ in_pcbbind(inp, nam, p) do { if (count-- < 0) { /* completely used? 
*/ + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); inp->inp_laddr.s_addr = INADDR_ANY; return (EADDRNOTAVAIL); } @@ -379,6 +427,8 @@ in_pcbbind(inp, nam, p) do { if (count-- < 0) { /* completely used? */ + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); inp->inp_laddr.s_addr = INADDR_ANY; return (EADDRNOTAVAIL); } @@ -390,12 +440,15 @@ in_pcbbind(inp, nam, p) inp->inp_laddr, lport, wild)); } } + socket_lock(so, 0); inp->inp_lport = lport; - if (in_pcbinshash(inp) != 0) { + if (in_pcbinshash(inp, 1) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; + lck_rw_done(pcbinfo->mtx); return (EAGAIN); } + lck_rw_done(pcbinfo->mtx); return (0); } @@ -426,6 +479,7 @@ in_pcbladdr(inp, nam, plocal_sin) return (EAFNOSUPPORT); if (sin->sin_port == 0) return (EADDRNOTAVAIL); + lck_mtx_lock(rt_mtx); if (!TAILQ_EMPTY(&in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, @@ -460,19 +514,19 @@ in_pcbladdr(inp, nam, plocal_sin) sin->sin_addr.s_addr || inp->inp_socket->so_options & SO_DONTROUTE || ro->ro_rt->generation_id != route_generation)) { - rtfree(ro->ro_rt); + rtfree_locked(ro->ro_rt); ro->ro_rt = (struct rtentry *)0; } if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0 && /*XXX*/ (ro->ro_rt == (struct rtentry *)0 || - ro->ro_rt->rt_ifp == (struct ifnet *)0)) { + ro->ro_rt->rt_ifp == 0)) { /* No route yet, so try to acquire one */ bzero(&ro->ro_dst, sizeof(struct sockaddr_in)); ro->ro_dst.sa_family = AF_INET; ro->ro_dst.sa_len = sizeof(struct sockaddr_in); ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = sin->sin_addr; - rtalloc(ro); + rtalloc_ign_locked(ro, 0UL); } /* * If we found a route, use the address @@ -480,20 +534,29 @@ in_pcbladdr(inp, nam, plocal_sin) * unless it is the loopback (in case a route * to our address on another net goes to loopback). */ - if (ro->ro_rt && !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) + if (ro->ro_rt && !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) { ia = ifatoia(ro->ro_rt->rt_ifa); + if (ia) + ifaref(&ia->ia_ifa); + } if (ia == 0) { u_short fport = sin->sin_port; sin->sin_port = 0; ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin))); - if (ia == 0) + if (ia == 0) { ia = ifatoia(ifa_ifwithnet(sintosa(sin))); + } sin->sin_port = fport; - if (ia == 0) + if (ia == 0) { ia = TAILQ_FIRST(&in_ifaddrhead); - if (ia == 0) + if (ia) + ifaref(&ia->ia_ifa); + } + if (ia == 0) { + lck_mtx_unlock(rt_mtx); return (EADDRNOTAVAIL); + } } /* * If the destination address is multicast and an outgoing @@ -506,22 +569,29 @@ in_pcbladdr(inp, nam, plocal_sin) struct ifnet *ifp; imo = inp->inp_moptions; - if (imo->imo_multicast_ifp != NULL) { + if (imo->imo_multicast_ifp != NULL && (ia == NULL || + ia->ia_ifp != imo->imo_multicast_ifp)) { ifp = imo->imo_multicast_ifp; + if (ia) + ifafree(&ia->ia_ifa); TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) if (ia->ia_ifp == ifp) break; - if (ia == 0) + if (ia == 0) { + lck_mtx_unlock(rt_mtx); return (EADDRNOTAVAIL); + } + ifaref(ia); } } - /* - * Don't do pcblookup call here; return interface in plocal_sin - * and exit to caller, that will do the lookup. - */ + /* + * Don't do pcblookup call here; return interface in plocal_sin + * and exit to caller, that will do the lookup. 
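+	 * (That caller is in_pcbconnect, which runs in_pcblookup_hash on
+	 * the chosen local address/port pair and fails with EADDRINUSE if
+	 * an identical connection already exists.)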
+ */ *plocal_sin = &ia->ia_addr; - + ifafree(&ia->ia_ifa); } + lck_mtx_unlock(rt_mtx); return(0); } @@ -540,7 +610,7 @@ in_pcbconnect(inp, nam, p) { struct sockaddr_in *ifaddr; struct sockaddr_in *sin = (struct sockaddr_in *)nam; - struct sockaddr_in sa; + struct inpcb *pcb; int error; /* @@ -549,9 +619,13 @@ in_pcbconnect(inp, nam, p) if ((error = in_pcbladdr(inp, nam, &ifaddr)) != 0) return(error); - if (in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port, + socket_unlock(inp->inp_socket, 0); + pcb = in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port, inp->inp_laddr.s_addr ? inp->inp_laddr : ifaddr->sin_addr, - inp->inp_lport, 0, NULL) != NULL) { + inp->inp_lport, 0, NULL); + socket_lock(inp->inp_socket, 0); + if (pcb != NULL) { + in_pcb_checkstate(pcb, WNT_RELEASE, 0); return (EADDRINUSE); } if (inp->inp_laddr.s_addr == INADDR_ANY) { @@ -560,12 +634,27 @@ in_pcbconnect(inp, nam, p) if (error) return (error); } + if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) { + /*lock inversion issue, mostly with udp multicast packets */ + socket_unlock(inp->inp_socket, 0); + lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx); + socket_lock(inp->inp_socket, 0); + } inp->inp_laddr = ifaddr->sin_addr; inp->inp_flags |= INP_INADDR_ANY; + } + else { + if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) { + /*lock inversion issue, mostly with udp multicast packets */ + socket_unlock(inp->inp_socket, 0); + lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx); + socket_lock(inp->inp_socket, 0); + } } inp->inp_faddr = sin->sin_addr; inp->inp_fport = sin->sin_port; in_pcbrehash(inp); + lck_rw_done(inp->inp_pcbinfo->mtx); return (0); } @@ -576,8 +665,18 @@ in_pcbdisconnect(inp) inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; + + if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) { + /*lock inversion issue, mostly with udp multicast packets */ + socket_unlock(inp->inp_socket, 0); + lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx); + socket_lock(inp->inp_socket, 0); + } + in_pcbrehash(inp); - if (inp->inp_socket->so_state & SS_NOFDREF) + lck_rw_done(inp->inp_pcbinfo->mtx); + + if (inp->inp_socket->so_state & SS_NOFDREF) in_pcbdetach(inp); } @@ -586,56 +685,115 @@ in_pcbdetach(inp) struct inpcb *inp; { struct socket *so = inp->inp_socket; - struct inpcbinfo *ipi = inp->inp_pcbinfo; struct rtentry *rt = inp->inp_route.ro_rt; - - if (so->so_pcb == 0) /* we've been called twice, ignore */ - return; + if (so->so_pcb == 0) { /* we've been called twice */ + panic("in_pcbdetach: inp=%x so=%x proto=%x so_pcb is null!\n", + inp, so, so->so_proto->pr_protocol); + } #if IPSEC - ipsec4_delete_pcbpolicy(inp); + if (ipsec_bypass == 0) { + lck_mtx_lock(sadb_mutex); + ipsec4_delete_pcbpolicy(inp); + lck_mtx_unlock(sadb_mutex); + } #endif /*IPSEC*/ - inp->inp_gencnt = ++ipi->ipi_gencnt; - in_pcbremlists(inp); + + /* mark socket state as dead */ + if (in_pcb_checkstate(inp, WNT_STOPUSING, 1) != WNT_STOPUSING) + panic("in_pcbdetach so=%x prot=%x couldn't set to STOPUSING\n", so, so->so_proto->pr_protocol); #if TEMPDEBUG if (so->cached_in_sock_layer) - printf("PCB_DETACH for cached socket %x\n", so); + printf("in_pcbdetach for cached socket %x flags=%x\n", so, so->so_flags); else - printf("PCB_DETACH for allocated socket %x\n", so); + printf("in_pcbdetach for allocated socket %x flags=%x\n", so, so->so_flags); #endif + if ((so->so_flags & SOF_PCBCLEARING) == 0) { + inp->inp_vflag = 0; + if (inp->inp_options) + (void)m_free(inp->inp_options); + if (rt) { + /* + * route deletion requires reference count 
to be <= zero + */ + lck_mtx_lock(rt_mtx); + if ((rt->rt_flags & RTF_DELCLONE) && + (rt->rt_flags & RTF_WASCLONED) && + (rt->rt_refcnt <= 1)) { + rtunref(rt); + rt->rt_flags &= ~RTF_UP; + rtrequest_locked(RTM_DELETE, rt_key(rt), + rt->rt_gateway, rt_mask(rt), + rt->rt_flags, (struct rtentry **)0); + } + else { + rtfree_locked(rt); + inp->inp_route.ro_rt = 0; + } + lck_mtx_unlock(rt_mtx); + } + ip_freemoptions(inp->inp_moptions); + inp->inp_moptions = NULL; + sofreelastref(so, 0); + inp->inp_state = INPCB_STATE_DEAD; + so->so_flags |= SOF_PCBCLEARING; /* makes sure we're not called twice from so_close */ + } +} - so->so_pcb = 0; - if (inp->inp_options) - (void)m_free(inp->inp_options); - if (rt) { - /* - * route deletion requires reference count to be <= zero - */ - if ((rt->rt_flags & RTF_DELCLONE) && - (rt->rt_flags & RTF_WASCLONED) && - (rt->rt_refcnt <= 1)) { - rtunref(rt); - rt->rt_flags &= ~RTF_UP; - rtrequest(RTM_DELETE, rt_key(rt), - rt->rt_gateway, rt_mask(rt), - rt->rt_flags, (struct rtentry **)0); +void +in_pcbdispose(inp) + struct inpcb *inp; +{ + struct socket *so = inp->inp_socket; + struct inpcbinfo *ipi = inp->inp_pcbinfo; + +#if TEMPDEBUG + if (inp->inp_state != INPCB_STATE_DEAD) { + printf("in_pcbdispose: not dead yet? so=%x\n", so); + } +#endif + + if (so && so->so_usecount != 0) + panic("in_pcbdispose: use count=%x so=%x\n", so->so_usecount, so); + + + inp->inp_gencnt = ++ipi->ipi_gencnt; + /*### access ipi in in_pcbremlists */ + in_pcbremlists(inp); + + if (so) { + if (so->so_proto->pr_flags & PR_PCBLOCK) { + sofreelastref(so, 0); + if (so->so_rcv.sb_cc || so->so_snd.sb_cc) { +#if TEMPDEBUG + printf("in_pcbdispose sb not cleaned up so=%x rc_cci=%x snd_cc=%x\n", + so, so->so_rcv.sb_cc, so->so_snd.sb_cc); +#endif + sbrelease(&so->so_rcv); + sbrelease(&so->so_snd); + } + if (so->so_head != NULL) + panic("in_pcbdispose, so=%x head still exist\n", so); + lck_mtx_unlock(inp->inpcb_mtx); + lck_mtx_free(inp->inpcb_mtx, ipi->mtx_grp); } - else { - rtfree(rt); - inp->inp_route.ro_rt = 0; + so->so_flags |= SOF_PCBCLEARING; /* makes sure we're not called twice from so_close */ + so->so_saved_pcb = (caddr_t) inp; + so->so_pcb = 0; + inp->inp_socket = 0; + inp->reserved[0] = so; + if (so->cached_in_sock_layer == 0) { + zfree(ipi->ipi_zone, inp); } + sodealloc(so); } - ip_freemoptions(inp->inp_moptions); - inp->inp_vflag = 0; - if (so->cached_in_sock_layer) - so->so_saved_pcb = (caddr_t) inp; +#if TEMPDEBUG else - zfree(ipi->ipi_zone, (vm_offset_t) inp); - - sofree(so); + printf("in_pcbdispose: no socket for inp=%x\n", inp); +#endif } /* @@ -653,7 +811,6 @@ in_setsockaddr(so, nam) struct socket *so; struct sockaddr **nam; { - int s; register struct inpcb *inp; register struct sockaddr_in *sin; @@ -667,16 +824,13 @@ in_setsockaddr(so, nam) sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); - s = splnet(); inp = sotoinpcb(so); if (!inp) { - splx(s); FREE(sin, M_SONAME); return ECONNRESET; } sin->sin_port = inp->inp_lport; sin->sin_addr = inp->inp_laddr; - splx(s); *nam = (struct sockaddr *)sin; return 0; @@ -687,7 +841,6 @@ in_setpeeraddr(so, nam) struct socket *so; struct sockaddr **nam; { - int s; struct inpcb *inp; register struct sockaddr_in *sin; @@ -701,33 +854,29 @@ in_setpeeraddr(so, nam) sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); - s = splnet(); inp = sotoinpcb(so); if (!inp) { - splx(s); FREE(sin, M_SONAME); return ECONNRESET; } sin->sin_port = inp->inp_fport; sin->sin_addr = inp->inp_faddr; - splx(s); *nam = (struct sockaddr *)sin; return 0; } void 
-in_pcbnotifyall(head, faddr, errno, notify) - struct inpcbhead *head; +in_pcbnotifyall(pcbinfo, faddr, errno, notify) + struct inpcbinfo *pcbinfo; struct in_addr faddr; - void (*notify) __P((struct inpcb *, int)); + void (*notify) (struct inpcb *, int); { - struct inpcb *inp, *ninp; - int s; + struct inpcb *inp; + + lck_rw_lock_shared(pcbinfo->mtx); - s = splnet(); - for (inp = LIST_FIRST(head); inp != NULL; inp = ninp) { - ninp = LIST_NEXT(inp, inp_list); + LIST_FOREACH(inp, pcbinfo->listhead, inp_list) { #if INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; @@ -735,15 +884,20 @@ in_pcbnotifyall(head, faddr, errno, notify) if (inp->inp_faddr.s_addr != faddr.s_addr || inp->inp_socket == NULL) continue; + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) + continue; + socket_lock(inp->inp_socket, 1); (*notify)(inp, errno); + (void)in_pcb_checkstate(inp, WNT_RELEASE, 1); + socket_unlock(inp->inp_socket, 1); } - splx(s); + lck_rw_done(pcbinfo->mtx); } void -in_pcbpurgeif0(head, ifp) - struct inpcb *head; - struct ifnet *ifp; +in_pcbpurgeif0( + struct inpcb *head, + struct ifnet *ifp) { struct inpcb *inp; struct ip_moptions *imo; @@ -767,7 +921,7 @@ in_pcbpurgeif0(head, ifp) for (i = 0, gap = 0; i < imo->imo_num_memberships; i++) { if (imo->imo_membership[i]->inm_ifp == ifp) { - in_delmulti(imo->imo_membership[i]); + in_delmulti(&imo->imo_membership[i]); gap++; } else if (gap != 0) imo->imo_membership[i - gap] = @@ -792,6 +946,7 @@ in_losing(inp) struct rt_addrinfo info; if ((rt = inp->inp_route.ro_rt)) { + lck_mtx_lock(rt_mtx); bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&inp->inp_route.ro_dst; @@ -799,11 +954,12 @@ in_losing(inp) info.rti_info[RTAX_NETMASK] = rt_mask(rt); rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0); if (rt->rt_flags & RTF_DYNAMIC) - (void) rtrequest(RTM_DELETE, rt_key(rt), + (void) rtrequest_locked(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, (struct rtentry **)0); inp->inp_route.ro_rt = 0; - rtfree(rt); + rtfree_locked(rt); + lck_mtx_unlock(rt_mtx); /* * A new route can be allocated * the next time output is attempted. @@ -821,7 +977,7 @@ in_rtchange(inp, errno) int errno; { if (inp->inp_route.ro_rt) { - if (ifa_foraddr(inp->inp_laddr.s_addr) == NULL) + if ((ifa_foraddr(inp->inp_laddr.s_addr)) == 0) return; /* we can't remove the route now. not sure if still ok to use src */ rtfree(inp->inp_route.ro_rt); inp->inp_route.ro_rt = 0; @@ -930,13 +1086,14 @@ in_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay) * Lookup PCB in hash list. */ struct inpcb * -in_pcblookup_hash(pcbinfo, faddr, fport_arg, laddr, lport_arg, wildcard, - ifp) - struct inpcbinfo *pcbinfo; - struct in_addr faddr, laddr; - u_int fport_arg, lport_arg; - int wildcard; - struct ifnet *ifp; +in_pcblookup_hash( + struct inpcbinfo *pcbinfo, + struct in_addr faddr, + u_int fport_arg, + struct in_addr laddr, + u_int lport_arg, + int wildcard, + struct ifnet *ifp) { struct inpcbhead *head; register struct inpcb *inp; @@ -946,19 +1103,7 @@ in_pcblookup_hash(pcbinfo, faddr, fport_arg, laddr, lport_arg, wildcard, * We may have found the pcb in the last lookup - check this first. */ - if ((!IN_MULTICAST(laddr.s_addr)) && (pcbinfo->last_pcb)) { - if (faddr.s_addr == pcbinfo->last_pcb->inp_faddr.s_addr && - laddr.s_addr == pcbinfo->last_pcb->inp_laddr.s_addr && - fport_arg == pcbinfo->last_pcb->inp_fport && - lport_arg == pcbinfo->last_pcb->inp_lport) { - /* - * Found. 
- */ - return (pcbinfo->last_pcb); - } - - pcbinfo->last_pcb = 0; - } + lck_rw_lock_shared(pcbinfo->mtx); /* * First look for an exact match. @@ -976,7 +1121,14 @@ in_pcblookup_hash(pcbinfo, faddr, fport_arg, laddr, lport_arg, wildcard, /* * Found. */ - return (inp); + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) { + lck_rw_done(pcbinfo->mtx); + return (inp); + } + else { /* it's there but dead, say it isn't found */ + lck_rw_done(pcbinfo->mtx); + return(NULL); + } } } if (wildcard) { @@ -998,8 +1150,16 @@ in_pcblookup_hash(pcbinfo, faddr, fport_arg, laddr, lport_arg, wildcard, (inp->inp_flags & INP_FAITH) == 0) continue; #endif - if (inp->inp_laddr.s_addr == laddr.s_addr) - return (inp); + if (inp->inp_laddr.s_addr == laddr.s_addr) { + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) { + lck_rw_done(pcbinfo->mtx); + return (inp); + } + else { /* it's there but dead, say it isn't found */ + lck_rw_done(pcbinfo->mtx); + return(NULL); + } + } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #if defined(INET6) if (INP_CHECK_SOCKAF(inp->inp_socket, @@ -1012,15 +1172,35 @@ in_pcblookup_hash(pcbinfo, faddr, fport_arg, laddr, lport_arg, wildcard, } } #if defined(INET6) - if (local_wild == NULL) - return (local_wild_mapped); + if (local_wild == NULL) { + if (local_wild_mapped != NULL) { + if (in_pcb_checkstate(local_wild_mapped, WNT_ACQUIRE, 0) != WNT_STOPUSING) { + lck_rw_done(pcbinfo->mtx); + return (local_wild_mapped); + } + else { /* it's there but dead, say it isn't found */ + lck_rw_done(pcbinfo->mtx); + return(NULL); + } + } + lck_rw_done(pcbinfo->mtx); + return (NULL); + } #endif /* defined(INET6) */ - return (local_wild); + if (in_pcb_checkstate(local_wild, WNT_ACQUIRE, 0) != WNT_STOPUSING) { + lck_rw_done(pcbinfo->mtx); + return (local_wild); + } + else { /* it's there but dead, say it isn't found */ + lck_rw_done(pcbinfo->mtx); + return(NULL); + } } /* * Not found. */ + lck_rw_done(pcbinfo->mtx); return (NULL); } @@ -1028,8 +1208,9 @@ in_pcblookup_hash(pcbinfo, faddr, fport_arg, laddr, lport_arg, wildcard, * Insert PCB onto various hash lists. 
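+ * The new "locked" argument says whether the caller already holds
+ * pcbinfo->mtx exclusive; when it is zero, in_pcbinshash takes the lock
+ * itself, dropping the socket lock first if a try-lock fails so the
+ * lock order (pcbinfo->mtx before socket lock) is preserved.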
*/ int -in_pcbinshash(inp) +in_pcbinshash(inp, locked) struct inpcb *inp; + int locked; /* list already locked exclusive */ { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; @@ -1044,8 +1225,18 @@ in_pcbinshash(inp) #endif /* INET6 */ hashkey_faddr = inp->inp_faddr.s_addr; - pcbhash = &pcbinfo->hashbase[INP_PCBHASH(hashkey_faddr, - inp->inp_lport, inp->inp_fport, pcbinfo->hashmask)]; + inp->hash_element = INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->hashmask); + + if (!locked) { + if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) { + /*lock inversion issue, mostly with udp multicast packets */ + socket_unlock(inp->inp_socket, 0); + lck_rw_lock_exclusive(pcbinfo->mtx); + socket_lock(inp->inp_socket, 0); + } + } + + pcbhash = &pcbinfo->hashbase[inp->hash_element]; pcbporthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(inp->inp_lport, pcbinfo->porthashmask)]; @@ -1063,6 +1254,8 @@ in_pcbinshash(inp) if (phd == NULL) { MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_WAITOK); if (phd == NULL) { + if (!locked) + lck_rw_done(pcbinfo->mtx); return (ENOBUFS); /* XXX */ } phd->phd_port = inp->inp_lport; @@ -1072,10 +1265,8 @@ in_pcbinshash(inp) inp->inp_phd = phd; LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); LIST_INSERT_HEAD(pcbhash, inp, inp_hash); -#ifdef __APPLE__ - inp->hash_element = INP_PCBHASH(inp->inp_faddr.s_addr, inp->inp_lport, - inp->inp_fport, pcbinfo->hashmask); -#endif + if (!locked) + lck_rw_done(pcbinfo->mtx); return (0); } @@ -1098,30 +1289,23 @@ in_pcbrehash(inp) else #endif /* INET6 */ hashkey_faddr = inp->inp_faddr.s_addr; - - head = &inp->inp_pcbinfo->hashbase[INP_PCBHASH(hashkey_faddr, - inp->inp_lport, inp->inp_fport, inp->inp_pcbinfo->hashmask)]; + inp->hash_element = INP_PCBHASH(hashkey_faddr, inp->inp_lport, + inp->inp_fport, inp->inp_pcbinfo->hashmask); + head = &inp->inp_pcbinfo->hashbase[inp->hash_element]; LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); -#ifdef __APPLE__ - inp->hash_element = INP_PCBHASH(inp->inp_faddr.s_addr, inp->inp_lport, - inp->inp_fport, inp->inp_pcbinfo->hashmask); -#endif } /* * Remove PCB from various lists. */ +//###LOCK must be called with list lock held void in_pcbremlists(inp) struct inpcb *inp; { inp->inp_gencnt = ++inp->inp_pcbinfo->ipi_gencnt; -#ifdef __APPLE__ - if (inp == inp->inp_pcbinfo->last_pcb) - inp->inp_pcbinfo->last_pcb = 0; -#endif if (inp->inp_lport) { struct inpcbport *phd = inp->inp_phd; @@ -1137,17 +1321,18 @@ in_pcbremlists(inp) inp->inp_pcbinfo->ipi_count--; } +static void in_pcb_detach_port( struct inpcb *inp); int -in_pcb_grab_port __P((struct inpcbinfo *pcbinfo, +in_pcb_grab_port (struct inpcbinfo *pcbinfo, u_short options, struct in_addr laddr, u_short *lport, struct in_addr faddr, u_short fport, u_int cookie, - u_char owner_id)) + u_char owner_id) { - struct inpcb *pcb; + struct inpcb *inp, *pcb; struct sockaddr_in sin; struct proc *p = current_proc(); int stat; @@ -1164,19 +1349,25 @@ in_pcb_grab_port __P((struct inpcbinfo *pcbinfo, * it's from the acceptance of an incoming connection. */ if (laddr.s_addr == 0) { + pcbinfo->nat_dummy_socket.so_pcb = (caddr_t)pcbinfo->nat_dummy_pcb; return EINVAL; } - if (in_pcblookup_hash(pcbinfo, faddr, fport, - laddr, *lport, 0, NULL) != NULL) { - if (!(IN_MULTICAST(ntohl(laddr.s_addr)))) { - return (EADDRINUSE); - } + inp = in_pcblookup_hash(pcbinfo, faddr, fport, laddr, *lport, 0, NULL); + if (inp) { + /* pcb was found, its count was upped. 
need to decrease it here */ + in_pcb_checkstate(inp, WNT_RELEASE, 0); + if (!(IN_MULTICAST(ntohl(laddr.s_addr)))) { + pcbinfo->nat_dummy_socket.so_pcb = (caddr_t)pcbinfo->nat_dummy_pcb; + return (EADDRINUSE); + } } stat = in_pcballoc(&pcbinfo->nat_dummy_socket, pcbinfo, p); - if (stat) + if (stat) { + pcbinfo->nat_dummy_socket.so_pcb = (caddr_t)pcbinfo->nat_dummy_pcb; return stat; + } pcb = sotoinpcb(&pcbinfo->nat_dummy_socket); pcb->inp_vflag |= INP_IPV4; @@ -1185,7 +1376,10 @@ in_pcb_grab_port __P((struct inpcbinfo *pcbinfo, pcb->inp_faddr = faddr; pcb->inp_fport = fport; - in_pcbinshash(pcb); + + lck_rw_lock_exclusive(pcbinfo->mtx); + in_pcbinshash(pcb, 1); + lck_rw_done(pcbinfo->mtx); } else { /* @@ -1198,8 +1392,10 @@ in_pcb_grab_port __P((struct inpcbinfo *pcbinfo, */ stat = in_pcballoc(&pcbinfo->nat_dummy_socket, pcbinfo, p); - if (stat) + if (stat) { + pcbinfo->nat_dummy_socket.so_pcb = (caddr_t)pcbinfo->nat_dummy_pcb; return stat; + } pcb = sotoinpcb(&pcbinfo->nat_dummy_socket); pcb->inp_vflag |= INP_IPV4; @@ -1209,21 +1405,26 @@ in_pcb_grab_port __P((struct inpcbinfo *pcbinfo, sin.sin_family = AF_INET; sin.sin_addr.s_addr = laddr.s_addr; sin.sin_port = *lport; - + + socket_lock(&pcbinfo->nat_dummy_socket, 1); stat = in_pcbbind((struct inpcb *) pcbinfo->nat_dummy_socket.so_pcb, (struct sockaddr *) &sin, p); if (stat) { - in_pcbdetach(pcb); - return stat; + socket_unlock(&pcbinfo->nat_dummy_socket, 1); /*detach first */ + in_pcb_detach_port(pcb); /* will restore dummy pcb */ + return stat; } + socket_unlock(&pcbinfo->nat_dummy_socket, 1); } } else { /* The grabber wants an ephemeral port */ stat = in_pcballoc(&pcbinfo->nat_dummy_socket, pcbinfo, p); - if (stat) + if (stat) { + pcbinfo->nat_dummy_socket.so_pcb = (caddr_t)pcbinfo->nat_dummy_pcb; return stat; + } pcb = sotoinpcb(&pcbinfo->nat_dummy_socket); pcb->inp_vflag |= INP_IPV4; @@ -1240,26 +1441,34 @@ in_pcb_grab_port __P((struct inpcbinfo *pcbinfo, */ if (laddr.s_addr == 0) { - in_pcbdetach(pcb); + in_pcb_detach_port(pcb); /* restores dummy pcb */ return EINVAL; } + socket_lock(&pcbinfo->nat_dummy_socket, 1); stat = in_pcbbind((struct inpcb *) pcbinfo->nat_dummy_socket.so_pcb, (struct sockaddr *) &sin, p); if (stat) { - in_pcbdetach(pcb); + socket_unlock(&pcbinfo->nat_dummy_socket, 1); + in_pcb_detach_port(pcb); /* restores dummy pcb */ return stat; } - if (in_pcblookup_hash(pcbinfo, faddr, fport, - pcb->inp_laddr, pcb->inp_lport, 0, NULL) != NULL) { - in_pcbdetach(pcb); - return (EADDRINUSE); + socket_unlock(&pcbinfo->nat_dummy_socket, 1); + inp = in_pcblookup_hash(pcbinfo, faddr, fport, + pcb->inp_laddr, pcb->inp_lport, 0, NULL); + if (inp) { + /* pcb was found, its count was upped. need to decrease it here */ + in_pcb_checkstate(inp, WNT_RELEASE, 0); + in_pcb_detach_port(pcb); + return (EADDRINUSE); } + lck_rw_lock_exclusive(pcbinfo->mtx); pcb->inp_faddr = faddr; pcb->inp_fport = fport; in_pcbrehash(pcb); + lck_rw_done(pcbinfo->mtx); } else { /* @@ -1267,12 +1476,15 @@ in_pcb_grab_port __P((struct inpcbinfo *pcbinfo, * may or may not be defined. 
*/ + socket_lock(&pcbinfo->nat_dummy_socket, 1); stat = in_pcbbind((struct inpcb *) pcbinfo->nat_dummy_socket.so_pcb, (struct sockaddr *) &sin, p); if (stat) { - in_pcbdetach(pcb); + socket_unlock(&pcbinfo->nat_dummy_socket, 1); + in_pcb_detach_port(pcb); return stat; } + socket_unlock(&pcbinfo->nat_dummy_socket, 1); } *lport = pcb->inp_lport; } @@ -1281,12 +1493,33 @@ in_pcb_grab_port __P((struct inpcbinfo *pcbinfo, pcb->nat_owner = owner_id; pcb->nat_cookie = cookie; pcb->inp_ppcb = (caddr_t) pcbinfo->dummy_cb; + pcbinfo->nat_dummy_socket.so_pcb = (caddr_t)pcbinfo->nat_dummy_pcb; /* restores dummypcb */ return 0; } +/* 3962035 - in_pcb_letgo_port needs a special case function for detaching */ +static void +in_pcb_detach_port( + struct inpcb *inp) +{ + struct socket *so = inp->inp_socket; + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + + if (so != &pcbinfo->nat_dummy_socket) + panic("in_pcb_detach_port: not a dummy_sock: so=%x, inp=%x\n", so, inp); + inp->inp_gencnt = ++pcbinfo->ipi_gencnt; + /*### access ipi in in_pcbremlists */ + in_pcbremlists(inp); + + inp->inp_socket = 0; + inp->reserved[0] = so; + zfree(pcbinfo->ipi_zone, inp); + pcbinfo->nat_dummy_socket.so_pcb = (caddr_t)pcbinfo->nat_dummy_pcb; /* restores dummypcb */ +} + int -in_pcb_letgo_port __P((struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short lport, - struct in_addr faddr, u_short fport, u_char owner_id)) +in_pcb_letgo_port(struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short lport, + struct in_addr faddr, u_short fport, u_char owner_id) { struct inpcbhead *head; register struct inpcb *inp; @@ -1295,6 +1528,8 @@ in_pcb_letgo_port __P((struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short /* * First look for an exact match. */ + + lck_rw_lock_exclusive(pcbinfo->mtx); head = &pcbinfo->hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->hashmask)]; for (inp = head->lh_first; inp != NULL; inp = inp->inp_hash.le_next) { if (inp->inp_faddr.s_addr == faddr.s_addr && @@ -1305,11 +1540,13 @@ in_pcb_letgo_port __P((struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short /* * Found. */ - in_pcbdetach(inp); + in_pcb_detach_port(inp); + lck_rw_done(pcbinfo->mtx); return 0; } } + lck_rw_done(pcbinfo->mtx); return ENOENT; } @@ -1332,6 +1569,7 @@ in_pcb_get_owner(struct inpcbinfo *pcbinfo, * owners. */ + lck_rw_lock_shared(pcbinfo->mtx); porthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(lport, pcbinfo->porthashmask)]; for (phd = porthash->lh_first; phd != NULL; phd = phd->phd_hash.le_next) { @@ -1340,6 +1578,7 @@ in_pcb_get_owner(struct inpcbinfo *pcbinfo, } if (phd == 0) { + lck_rw_done(pcbinfo->mtx); return INPCB_NO_OWNER; } @@ -1355,18 +1594,23 @@ in_pcb_get_owner(struct inpcbinfo *pcbinfo, } } + lck_rw_done(pcbinfo->mtx); return owner_id; } else { inp = in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 1, NULL); if (inp) { + /* pcb was found, its count was upped. 
need to decrease it here */ + /* if we found it, that pcb is already locked by the caller */ + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) + return(INPCB_NO_OWNER); + if (inp->nat_owner) { owner_id = inp->nat_owner; *cookie = inp->nat_cookie; } else { - pcbinfo->last_pcb = inp; owner_id = INPCB_OWNED_BY_X; } } @@ -1401,6 +1645,7 @@ in_pcb_rem_share_client(struct inpcbinfo *pcbinfo, u_char owner_id) struct inpcb *inp; + lck_rw_lock_exclusive(pcbinfo->mtx); if (pcbinfo->all_owners & owner_id) { pcbinfo->all_owners &= ~owner_id; for (inp = pcbinfo->listhead->lh_first; inp != NULL; inp = inp->inp_list.le_next) { @@ -1409,16 +1654,18 @@ in_pcb_rem_share_client(struct inpcbinfo *pcbinfo, u_char owner_id) /* * Deallocate the pcb */ - in_pcbdetach(inp); + in_pcb_detach_port(inp); else inp->nat_owner &= ~owner_id; } } } else { + lck_rw_done(pcbinfo->mtx); return ENOENT; } + lck_rw_done(pcbinfo->mtx); return 0; } @@ -1427,11 +1674,147 @@ in_pcb_rem_share_client(struct inpcbinfo *pcbinfo, u_char owner_id) void in_pcb_nat_init(struct inpcbinfo *pcbinfo, int afamily, int pfamily, int protocol) { + int stat; + struct proc *p = current_proc(); + bzero(&pcbinfo->nat_dummy_socket, sizeof(struct socket)); - pcbinfo->nat_dummy_socket.so_proto = pffindproto(afamily, pfamily, protocol); + pcbinfo->nat_dummy_socket.so_proto = pffindproto_locked(afamily, pfamily, protocol); pcbinfo->all_owners = 0; + stat = in_pcballoc(&pcbinfo->nat_dummy_socket, pcbinfo, p); + if (stat) + panic("in_pcb_nat_init: can't alloc fakepcb err=%d\n", stat); + pcbinfo->nat_dummy_pcb = pcbinfo->nat_dummy_socket.so_pcb; } +/* Mechanism used to defer the memory release of PCBs + * The pcb list will contain the pcb until the ripper can clean it up if + * the following conditions are met: 1) state "DEAD", 2) wantcnt is STOPUSING + * 3) usecount is null + * This function will be called to either mark the pcb as dead (WNT_STOPUSING), + * or to acquire (WNT_ACQUIRE) and release (WNT_RELEASE) a reference to it. +*/ +int +in_pcb_checkstate(struct inpcb *pcb, int mode, int locked) + +{ + + volatile UInt32 *wantcnt = (volatile UInt32 *)&pcb->inp_wantcnt; + UInt32 origwant; + UInt32 newwant; + + switch (mode) { + + case WNT_STOPUSING: /* try to mark the pcb as ready for recycling */ + + /* compare-and-swap with STOPUSING, if success we're good, if it's in use, will be marked later */ + + if (locked == 0) + socket_lock(pcb->inp_socket, 1); + pcb->inp_state = INPCB_STATE_DEAD; +stopusing: + if (pcb->inp_socket->so_usecount < 0) + panic("in_pcb_checkstate STOP pcb=%x so=%x usecount is negative\n", pcb, pcb->inp_socket); + if (locked == 0) + socket_unlock(pcb->inp_socket, 1); + + origwant = *wantcnt; + if ((UInt16) origwant == 0xffff ) /* should stop using */ + return (WNT_STOPUSING); + newwant = 0xffff; + if ((UInt16) origwant == 0) {/* try to mark it as unusable now */ + OSCompareAndSwap(origwant, newwant, (UInt32 *) wantcnt) ; + } + return (WNT_STOPUSING); + break; + + case WNT_ACQUIRE: /* try to increase reference to pcb */ + /* if WNT_STOPUSING, bail out */ + /* + * if socket state DEAD, try to set count to STOPUSING, return failed + * otherwise increase cnt + */ + do { + origwant = *wantcnt; + if ((UInt16) origwant == 0xffff ) {/* should stop using */ +// printf("in_pcb_checkstate: ACQ PCB was STOPUSING while acquiring. odd pcb=%x\n", pcb); + return (WNT_STOPUSING); + } + newwant = origwant + 1; + } while (!OSCompareAndSwap(origwant, newwant, (UInt32 *) wantcnt)); + return (WNT_ACQUIRE); + break; + + case WNT_RELEASE: /* release reference.
if result is null and pcb state is DEAD, + set wanted bit to STOPUSING + */ + + if (locked == 0) + socket_lock(pcb->inp_socket, 1); + + do { + origwant = *wantcnt; + if ((UInt16) origwant == 0x0 ) + panic("in_pcb_checkstate pcb=%x release with zero count", pcb); + if ((UInt16) origwant == 0xffff ) {/* should stop using */ +#if TEMPDEBUG + printf("in_pcb_checkstate: REL PCB was STOPUSING while release. odd pcb=%x\n", pcb); +#endif + if (locked == 0) + socket_unlock(pcb->inp_socket, 1); + return (WNT_STOPUSING); + } + newwant = origwant - 1; + } while (!OSCompareAndSwap(origwant, newwant, (UInt32 *) wantcnt)); + + if (pcb->inp_state == INPCB_STATE_DEAD) + goto stopusing; + if (pcb->inp_socket->so_usecount < 0) + panic("in_pcb_checkstate RELEASE pcb=%x so=%x usecount is negative\n", pcb, pcb->inp_socket); + + if (locked == 0) + socket_unlock(pcb->inp_socket, 1); + return (WNT_RELEASE); + break; + + default: + + panic("in_pcb_checkstate: so=%x not a valid state =%x\n", pcb->inp_socket, mode); + } + + /* NOTREACHED */ + return (mode); +} + +/* + * inpcb_to_compat copies specific bits of an inpcb to a inpcb_compat. + * The inpcb_compat data structure is passed to user space and must + * not change. We intentionally avoid copying pointers. The socket is + * the one exception, though we probably shouldn't copy that either. + */ +void +inpcb_to_compat( + struct inpcb *inp, + struct inpcb_compat *inp_compat) +{ + bzero(inp_compat, sizeof(*inp_compat)); + inp_compat->inp_fport = inp->inp_fport; + inp_compat->inp_lport = inp->inp_lport; + inp_compat->inp_socket = inp->inp_socket; + inp_compat->nat_owner = inp->nat_owner; + inp_compat->nat_cookie = inp->nat_cookie; + inp_compat->inp_gencnt = inp->inp_gencnt; + inp_compat->inp_flags = inp->inp_flags; + inp_compat->inp_flow = inp->inp_flow; + inp_compat->inp_vflag = inp->inp_vflag; + inp_compat->inp_ip_ttl = inp->inp_ip_ttl; + inp_compat->inp_ip_p = inp->inp_ip_p; + inp_compat->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign; + inp_compat->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local; + inp_compat->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos; + inp_compat->inp_depend6.inp6_hlim = inp->inp_depend6.inp6_hlim; + inp_compat->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum; + inp_compat->inp6_ifindex = inp->inp6_ifindex; + inp_compat->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops; +} #ifndef __APPLE__ prison_xinpcb(struct proc *p, struct inpcb *inp) diff --git a/bsd/netinet/in_pcb.h b/bsd/netinet/in_pcb.h index a32277efc..9f8b77058 100644 --- a/bsd/netinet/in_pcb.h +++ b/bsd/netinet/in_pcb.h @@ -59,12 +59,17 @@ #define _NETINET_IN_PCB_H_ #include <sys/appleapiopts.h> +#include <sys/types.h> #include <sys/queue.h> - +#ifdef KERNEL_PRIVATE +#ifdef KERNEL +#include <kern/locks.h> +#endif +#endif /* KERNEL_PRIVATE */ #include <netinet6/ipsec.h> /* for IPSEC */ -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE #define in6pcb inpcb /* for KAME src sync over BSD*'s */ #define in6p_sp inp_sp /* for KAME src sync over BSD*'s */ @@ -78,6 +83,7 @@ */ LIST_HEAD(inpcbhead, inpcb); LIST_HEAD(inpcbporthead, inpcbport); +#endif /* KERNEL_PRIVATE */ typedef u_quad_t inp_gen_t; /* @@ -90,6 +96,7 @@ struct in_addr_4in6 { struct in_addr ia46_addr4; }; +#ifdef KERNEL_PRIVATE /* * NB: the zone allocator is type-stable EXCEPT FOR THE FIRST TWO LONGS * of the structure. 
Therefore, it is important that the members in @@ -100,8 +107,8 @@ struct icmp6_filter; struct inpcb { LIST_ENTRY(inpcb) inp_hash; /* hash list */ - struct in_addr reserved1; /* APPLE reserved: inp_faddr defined in protcol indep. part */ - struct in_addr reserved2; /* APPLE reserved */ + int inp_wantcnt; /* pcb wanted count. protected by pcb list lock */ + int inp_state; /* state of this pcb, in use, recycled, ready for recycling... */ u_short inp_fport; /* foreign port */ u_short inp_lport; /* local port */ LIST_ENTRY(inpcb) inp_list; /* list for all PCBs of this proto */ @@ -116,9 +123,7 @@ struct inpcb { int inp_flags; /* generic IP/datagram flags */ u_int32_t inp_flow; - u_char inp_vflag; -#define INP_IPV4 0x1 -#define INP_IPV6 0x2 + u_char inp_vflag; /* INP_IPV4 or INP_IPV6 */ u_char inp_ip_ttl; /* time to live proto */ u_char inp_ip_p; /* protocol proto */ @@ -146,12 +151,6 @@ struct inpcb { /* IP multicast options */ struct ip_moptions *inp4_moptions; } inp_depend4; -#define inp_faddr inp_dependfaddr.inp46_foreign.ia46_addr4 -#define inp_laddr inp_dependladdr.inp46_local.ia46_addr4 -#define inp_route inp_dependroute.inp4_route -#define inp_ip_tos inp_depend4.inp4_ip_tos -#define inp_options inp_depend4.inp4_options -#define inp_moptions inp_depend4.inp4_moptions struct { /* IP options */ struct mbuf *inp6_options; @@ -169,32 +168,20 @@ struct inpcb { u_short inp6_ifindex; short inp6_hops; } inp_depend6; -#define in6p_faddr inp_dependfaddr.inp6_foreign -#define in6p_laddr inp_dependladdr.inp6_local -#define in6p_route inp_dependroute.inp6_route -#define in6p_ip6_hlim inp_depend6.inp6_hlim -#define in6p_hops inp_depend6.inp6_hops /* default hop limit */ -#define in6p_ip6_nxt inp_ip_p -#define in6p_flowinfo inp_flow -#define in6p_vflag inp_vflag -#define in6p_options inp_depend6.inp6_options -#define in6p_outputopts inp_depend6.inp6_outputopts -#define in6p_moptions inp_depend6.inp6_moptions -#define in6p_icmp6filt inp_depend6.inp6_icmp6filt -#define in6p_cksum inp_depend6.inp6_cksum -#define inp6_ifindex inp_depend6.inp6_ifindex -#define in6p_flags inp_flags /* for KAME src sync over BSD*'s */ -#define in6p_socket inp_socket /* for KAME src sync over BSD*'s */ -#define in6p_lport inp_lport /* for KAME src sync over BSD*'s */ -#define in6p_fport inp_fport /* for KAME src sync over BSD*'s */ -#define in6p_ppcb inp_ppcb /* for KAME src sync over BSD*'s */ int hash_element; /* Array index of pcb's hash list */ caddr_t inp_saved_ppcb; /* place to save pointer while cached */ struct inpcbpolicy *inp_sp; - u_long reserved[3]; /* For future use */ +#ifdef _KERN_LOCKS_H_ + lck_mtx_t *inpcb_mtx; /* inpcb per-socket mutex */ +#else + void *inpcb_mtx; +#endif + u_long reserved[2]; /* For future use */ }; -#endif /* __APPLE_API_PRIVATE */ + +#endif /* KERNEL_PRIVATE */ + /* * The range of the generation count, as used in this implementation, * is 9e19. We would have to create 300 billion connections per @@ -206,10 +193,105 @@ struct inpcb { * Interface exported to userland by various protocols which use * inpcbs. Hack alert -- only define if struct xsocket is in scope. */ -#ifdef _SYS_SOCKETVAR_H_ + +/* + * This is a copy of the inpcb as it shipped in Panther. This structure + * is filled out in a copy function. This allows the inpcb to change + * without breaking userland tools. + * + * CAUTION: Many fields may not be filled out. Fewer may be filled out + * in the future. Code defensively. 
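 */

/*
 * A sketch of the defensive userland consumption this structure is
 * frozen for (illustrative; assumes a Darwin pcblist sysctl such as
 * "net.inet.tcp.pcblist" and the exported struct xinpgen framing --
 * a header record, N pcb records, then a trailer record, each record
 * beginning with its own length).  Nothing here depends on field
 * offsets beyond that leading length, which is the point.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	size_t len = 0, hdrlen, reclen;
	char *buf, *p;
	int n = 0;

	if (sysctlbyname("net.inet.tcp.pcblist", NULL, &len, NULL, 0) < 0)
		return (1);
	len += len / 8;			/* slack: the list may grow under us */
	if ((buf = malloc(len)) == NULL ||
	    sysctlbyname("net.inet.tcp.pcblist", buf, &len, NULL, 0) < 0)
		return (1);

	/* Walk: skip the leading xinpgen, then step record by record. */
	memcpy(&hdrlen, buf, sizeof(hdrlen));	/* xig_len of the header */
	for (p = buf + hdrlen; p + sizeof(reclen) <= buf + len; p += reclen) {
		memcpy(&reclen, p, sizeof(reclen));	/* xi_len / xig_len */
		if (reclen < sizeof(reclen) || reclen > (size_t)(buf + len - p))
			break;		/* malformed or truncated: stop */
		if (reclen == hdrlen)
			break;		/* trailer xinpgen: done */
		n++;
	}
	printf("%d pcb records\n", n);
	free(buf);
	return (0);
}

/*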
+ */ +#ifdef KERNEL_PRIVATE +struct inpcb_compat { +#else +struct inpcbinfo; +struct inpcbport; +struct mbuf; +struct ip6_pktopts; +struct ip6_moptions; +struct icmp6_filter; +struct inpcbpolicy; + +struct inpcb { +#endif /* KERNEL_PRIVATE */ + LIST_ENTRY(inpcb) inp_hash; /* hash list */ + struct in_addr reserved1; /* APPLE reserved: inp_faddr defined in protcol indep. part */ + struct in_addr reserved2; /* APPLE reserved */ + u_short inp_fport; /* foreign port */ + u_short inp_lport; /* local port */ + LIST_ENTRY(inpcb) inp_list; /* list for all PCBs of this proto */ + caddr_t inp_ppcb; /* pointer to per-protocol pcb */ + struct inpcbinfo *inp_pcbinfo; /* PCB list info */ + void* inp_socket; /* back pointer to socket */ + u_char nat_owner; /* Used to NAT TCP/UDP traffic */ + u_long nat_cookie; /* Cookie stored and returned to NAT */ + LIST_ENTRY(inpcb) inp_portlist; /* list for this PCB's local port */ + struct inpcbport *inp_phd; /* head of this list */ + inp_gen_t inp_gencnt; /* generation count of this instance */ + int inp_flags; /* generic IP/datagram flags */ + u_int32_t inp_flow; + + u_char inp_vflag; + + u_char inp_ip_ttl; /* time to live proto */ + u_char inp_ip_p; /* protocol proto */ + /* protocol dependent part */ + union { + /* foreign host table entry */ + struct in_addr_4in6 inp46_foreign; + struct in6_addr inp6_foreign; + } inp_dependfaddr; + union { + /* local host table entry */ + struct in_addr_4in6 inp46_local; + struct in6_addr inp6_local; + } inp_dependladdr; + union { + /* placeholder for routing entry */ + u_char inp4_route[20]; + u_char inp6_route[32]; + } inp_dependroute; + struct { + /* type of service proto */ + u_char inp4_ip_tos; + /* IP options */ + struct mbuf *inp4_options; + /* IP multicast options */ + struct ip_moptions *inp4_moptions; + } inp_depend4; + + struct { + /* IP options */ + struct mbuf *inp6_options; + u_int8_t inp6_hlim; + u_int8_t unused_uint8_1; + ushort unused_uint16_1; + /* IP6 options for outgoing packets */ + struct ip6_pktopts *inp6_outputopts; + /* IP multicast options */ + struct ip6_moptions *inp6_moptions; + /* ICMPv6 code type filter */ + struct icmp6_filter *inp6_icmp6filt; + /* IPV6_CHECKSUM setsockopt */ + int inp6_cksum; + u_short inp6_ifindex; + short inp6_hops; + } inp_depend6; + + int hash_element; /* Array index of pcb's hash list */ + caddr_t inp_saved_ppcb; /* place to save pointer while cached */ + struct inpcbpolicy *inp_sp; + u_long reserved[3]; /* For future use */ +}; + struct xinpcb { size_t xi_len; /* length of this structure */ +#ifdef KERNEL_PRIVATE + struct inpcb_compat xi_inp; +#else struct inpcb xi_inp; +#endif struct xsocket xi_socket; u_quad_t xi_alignment_hack; }; @@ -220,9 +302,41 @@ struct xinpgen { inp_gen_t xig_gen; /* generation count at this time */ so_gen_t xig_sogen; /* socket generation count at this time */ }; -#endif /* _SYS_SOCKETVAR_H_ */ -#ifdef __APPLE_API_PRIVATE +/* + * These defines are for use with the inpcb. 
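 */

/*
 * The kernel-side export path this frozen copy exists for, distilled
 * (a sketch mirroring the div_pcblist sysctl handler later in this
 * patch; export_one_pcb is a hypothetical name): the wire record is
 * built from the compat copy, never from a raw bcopy of the live inpcb.
 */
static int
export_one_pcb(struct sysctl_req *req, struct inpcb *inp)
{
	struct xinpcb xi;

	bzero(&xi, sizeof(xi));
	xi.xi_len = sizeof(xi);
	inpcb_to_compat(inp, &xi.xi_inp);	/* copy only the frozen fields */
	if (inp->inp_socket)
		sotoxsocket(inp->inp_socket, &xi.xi_socket);
	return (SYSCTL_OUT(req, &xi, sizeof(xi)));
}

/*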
+ */ +#define INP_IPV4 0x1 +#define INP_IPV6 0x2 +#define inp_faddr inp_dependfaddr.inp46_foreign.ia46_addr4 +#define inp_laddr inp_dependladdr.inp46_local.ia46_addr4 +#define inp_route inp_dependroute.inp4_route +#define inp_ip_tos inp_depend4.inp4_ip_tos +#define inp_options inp_depend4.inp4_options +#define inp_moptions inp_depend4.inp4_moptions +#define in6p_faddr inp_dependfaddr.inp6_foreign +#define in6p_laddr inp_dependladdr.inp6_local +#define in6p_route inp_dependroute.inp6_route +#define in6p_ip6_hlim inp_depend6.inp6_hlim +#define in6p_hops inp_depend6.inp6_hops /* default hop limit */ +#define in6p_ip6_nxt inp_ip_p +#define in6p_flowinfo inp_flow +#define in6p_vflag inp_vflag +#define in6p_options inp_depend6.inp6_options +#define in6p_outputopts inp_depend6.inp6_outputopts +#define in6p_moptions inp_depend6.inp6_moptions +#define in6p_icmp6filt inp_depend6.inp6_icmp6filt +#define in6p_cksum inp_depend6.inp6_cksum +#define inp6_ifindex inp_depend6.inp6_ifindex +#define in6p_flags inp_flags /* for KAME src sync over BSD*'s */ +#define in6p_socket inp_socket /* for KAME src sync over BSD*'s */ +#define in6p_lport inp_lport /* for KAME src sync over BSD*'s */ +#define in6p_fport inp_fport /* for KAME src sync over BSD*'s */ +#define in6p_ppcb inp_ppcb /* for KAME src sync over BSD*'s */ +#define in6p_state inp_state +#define in6p_wantcnt inp_wantcnt + +#ifdef KERNEL_PRIVATE struct inpcbport { LIST_ENTRY(inpcbport) phd_hash; struct inpcbhead phd_pcblist; @@ -245,10 +359,21 @@ struct inpcbinfo { /* XXX documentation, prefixes */ u_int ipi_count; /* number of pcbs in this list */ u_quad_t ipi_gencnt; /* current generation count */ #ifdef __APPLE__ - u_char all_owners; - struct socket nat_dummy_socket; - struct inpcb *last_pcb; - caddr_t dummy_cb; + u_char all_owners; + struct socket nat_dummy_socket; /* fake socket for NAT pcb backpointer */ + struct inpcb *nat_dummy_pcb; /* fake pcb for finding NAT mutex */ + caddr_t dummy_cb; +#ifdef _KERN_LOCKS_H_ + lck_attr_t *mtx_attr; /* mutex attributes */ + lck_grp_t *mtx_grp; /* mutex group definition */ + lck_grp_attr_t *mtx_grp_attr; /* mutex group attributes */ + lck_rw_t *mtx; /* global mutex for the pcblist*/ +#else + void *mtx_attr; /* mutex attributes */ + void *mtx_grp; /* mutex group definition */ + void *mtx_grp_attr; /* mutex group attributes */ + void *mtx; /* global mutex for the pcblist*/ +#endif #endif }; @@ -257,6 +382,8 @@ struct inpcbinfo { /* XXX documentation, prefixes */ #define INP_PCBPORTHASH(lport, mask) \ (ntohs((lport)) & (mask)) +#endif /* KERNEL_PRIVATE */ + /* flags in inp_flags: */ #define INP_RECVOPTS 0x01 /* receive incoming IP options */ #define INP_RECVRETOPTS 0x02 /* receive IP options for reply */ @@ -268,12 +395,12 @@ struct inpcbinfo { /* XXX documentation, prefixes */ #define INP_RECVIF 0x80 /* receive incoming interface */ #define INP_MTUDISC 0x100 /* user can do MTU discovery */ #ifdef __APPLE__ -#define INP_STRIPHDR 0x200 /* Strip headers in raw_ip, for OT support */ +#define INP_STRIPHDR 0x200 /* Strip headers in raw_ip, for OT support */ #endif #define INP_FAITH 0x400 /* accept FAITH'ed connections */ #define INP_INADDR_ANY 0x800 /* local address wasn't specified */ -#define INP_RECVTTL 0x1000 +#define INP_RECVTTL 0x1000 #define IN6P_IPV6_V6ONLY 0x008000 /* restrict AF_INET6 socket for v6 */ @@ -286,6 +413,7 @@ struct inpcbinfo { /* XXX documentation, prefixes */ #define IN6P_AUTOFLOWLABEL 0x800000 /* attach flowlabel automatically */ #define IN6P_BINDV6ONLY 0x10000000 /* do not grab IPv4 traffic 
*/ +#ifdef KERNEL_PRIVATE #define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\ INP_RECVIF|\ IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\ @@ -313,7 +441,7 @@ struct inpcbinfo { /* XXX documentation, prefixes */ #define INPCB_NO_OWNER 0x0 #define INPCB_OWNED_BY_X 0x80 #define INPCB_MAX_IDS 7 -#endif +#endif /* __APPLE__ */ #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) #define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ @@ -330,55 +458,63 @@ extern int ipport_lastauto; extern int ipport_hifirstauto; extern int ipport_hilastauto; -void in_pcbpurgeif0 __P((struct inpcb *, struct ifnet *)); -void in_losing __P((struct inpcb *)); -void in_rtchange __P((struct inpcb *, int)); -int in_pcballoc __P((struct socket *, struct inpcbinfo *, struct proc *)); -int in_pcbbind __P((struct inpcb *, struct sockaddr *, struct proc *)); -int in_pcbconnect __P((struct inpcb *, struct sockaddr *, struct proc *)); -void in_pcbdetach __P((struct inpcb *)); -void in_pcbdisconnect __P((struct inpcb *)); -int in_pcbinshash __P((struct inpcb *)); -int in_pcbladdr __P((struct inpcb *, struct sockaddr *, - struct sockaddr_in **)); +#define INPCB_STATE_INUSE 0x1 /* freshly allocated PCB, it's in use */ +#define INPCB_STATE_CACHED 0x2 /* this pcb is sitting in a cache */ +#define INPCB_STATE_DEAD 0x3 /* should treat as gone, will be garbage collected and freed */ + +#define WNT_STOPUSING 0xffff /* marked as ready to be garbage collected, should be treated as not found */ +#define WNT_ACQUIRE 0x1 /* that pcb is being acquired, do not recycle this time */ +#define WNT_RELEASE 0x2 /* release acquired mode, can be garbage collected when wantcnt is null */ + + +void in_pcbpurgeif0(struct inpcb *, struct ifnet *); +void in_losing(struct inpcb *); +void in_rtchange(struct inpcb *, int); +int in_pcballoc(struct socket *, struct inpcbinfo *, struct proc *); +int in_pcbbind(struct inpcb *, struct sockaddr *, struct proc *); +int in_pcbconnect(struct inpcb *, struct sockaddr *, struct proc *); +void in_pcbdetach(struct inpcb *); +void in_pcbdispose (struct inpcb *); +void in_pcbdisconnect(struct inpcb *); +int in_pcbinshash(struct inpcb *, int); +int in_pcbladdr(struct inpcb *, struct sockaddr *, struct sockaddr_in **); struct inpcb * - in_pcblookup_local __P((struct inpcbinfo *, - struct in_addr, u_int, int)); + in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_int, int); struct inpcb * - in_pcblookup_hash __P((struct inpcbinfo *, + in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, - int, struct ifnet *)); -void in_pcbnotifyall __P((struct inpcbhead *, struct in_addr, - int, void (*)(struct inpcb *, int))); -void in_pcbrehash __P((struct inpcb *)); -int in_setpeeraddr __P((struct socket *so, struct sockaddr **nam)); -int in_setsockaddr __P((struct socket *so, struct sockaddr **nam)); + int, struct ifnet *); +void in_pcbnotifyall(struct inpcbinfo *, struct in_addr, + int, void (*)(struct inpcb *, int)); +void in_pcbrehash(struct inpcb *); +int in_setpeeraddr(struct socket *so, struct sockaddr **nam); +int in_setsockaddr(struct socket *so, struct sockaddr **nam); +int in_pcb_checkstate(struct inpcb *pcb, int mode, int locked); -#ifdef __APPLE__ -int -in_pcb_grab_port __P((struct inpcbinfo *pcbinfo, +int +in_pcb_grab_port (struct inpcbinfo *pcbinfo, u_short options, struct in_addr laddr, u_short *lport, struct in_addr faddr, u_short fport, u_int cookie, - u_char owner_id)); + u_char owner_id); int -in_pcb_letgo_port __P((struct inpcbinfo *pcbinfo,
+in_pcb_letgo_port(struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short lport, struct in_addr faddr, - u_short fport, u_char owner_id)); + u_short fport, u_char owner_id); u_char -in_pcb_get_owner __P((struct inpcbinfo *pcbinfo, +in_pcb_get_owner(struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short lport, struct in_addr faddr, u_short fport, - u_int *cookie)); + u_int *cookie); void in_pcb_nat_init(struct inpcbinfo *pcbinfo, int afamily, int pfamily, int protocol); @@ -388,13 +524,12 @@ in_pcb_new_share_client(struct inpcbinfo *pcbinfo, u_char *owner_id); int in_pcb_rem_share_client(struct inpcbinfo *pcbinfo, u_char owner_id); -#endif /* __APPLE__ */ -void in_pcbremlists __P((struct inpcb *inp)); -#ifndef __APPLE__ -int prison_xinpcb __P((struct proc *p, struct inpcb *inp)); -#endif -#endif /* _KERNEL */ -#endif /* __APPLE_API_PRIVATE */ +void in_pcbremlists(struct inpcb *inp); +void inpcb_to_compat(struct inpcb *inp, struct inpcb_compat *inp_compat); + +#endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ #endif /* !_NETINET_IN_PCB_H_ */ diff --git a/bsd/netinet/in_proto.c b/bsd/netinet/in_proto.c index ad4409a07..d910aa342 100644 --- a/bsd/netinet/in_proto.c +++ b/bsd/netinet/in_proto.c @@ -61,6 +61,7 @@ #include <sys/protosw.h> #include <sys/queue.h> #include <sys/sysctl.h> +#include <sys/mbuf.h> #include <net/if.h> #include <net/route.h> @@ -78,6 +79,7 @@ #include <netinet/udp.h> #include <netinet/udp_var.h> #include <netinet/ip_encap.h> +#include <netinet/ip_divert.h> /* @@ -102,98 +104,127 @@ static struct pr_usrreqs nousrreqs; extern struct pr_usrreqs icmp_dgram_usrreqs; extern int icmp_dgram_ctloutput(struct socket *, struct sockopt *); + struct protosw inetsw[] = { { 0, &inetdomain, 0, 0, 0, 0, 0, 0, 0, ip_init, 0, ip_slowtimo, ip_drain, - 0, &nousrreqs + 0, + &nousrreqs, + 0, 0, 0, { 0, 0 }, 0, { 0 } }, -{ SOCK_DGRAM, &inetdomain, IPPROTO_UDP, PR_ATOMIC|PR_ADDR, +{ SOCK_DGRAM, &inetdomain, IPPROTO_UDP, PR_ATOMIC|PR_ADDR|PR_PROTOLOCK|PR_PCBLOCK, udp_input, 0, udp_ctlinput, ip_ctloutput, 0, - udp_init, 0, 0, 0, - 0, &udp_usrreqs + udp_init, 0, udp_slowtimo, 0, + 0, + &udp_usrreqs, + udp_lock, udp_unlock, udp_getlock, { 0, 0 }, 0, { 0 } }, -{ SOCK_STREAM, &inetdomain, IPPROTO_TCP, - PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, +{ SOCK_STREAM, &inetdomain, IPPROTO_TCP, + PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD|PR_PCBLOCK|PR_PROTOLOCK|PR_DISPOSE, tcp_input, 0, tcp_ctlinput, tcp_ctloutput, 0, tcp_init, tcp_fasttimo, tcp_slowtimo, tcp_drain, - 0, &tcp_usrreqs + 0, + &tcp_usrreqs, + tcp_lock, tcp_unlock, tcp_getlock, { 0, 0 }, 0, { 0 } }, { SOCK_RAW, &inetdomain, IPPROTO_RAW, PR_ATOMIC|PR_ADDR, rip_input, 0, rip_ctlinput, rip_ctloutput, 0, 0, 0, 0, 0, - 0, &rip_usrreqs + 0, + &rip_usrreqs, + 0, rip_unlock, 0, { 0, 0 }, 0, { 0 } }, { SOCK_RAW, &inetdomain, IPPROTO_ICMP, PR_ATOMIC|PR_ADDR|PR_LASTHDR, icmp_input, 0, 0, rip_ctloutput, 0, 0, 0, 0, 0, - 0, &rip_usrreqs + 0, + &rip_usrreqs, + 0, rip_unlock, 0, { 0, 0 }, 0, { 0 } }, -{ SOCK_DGRAM, &inetdomain, IPPROTO_ICMP, PR_ATOMIC|PR_ADDR|PR_LASTHDR, - icmp_input, 0, 0, icmp_dgram_ctloutput, +{ SOCK_DGRAM, &inetdomain, IPPROTO_ICMP, PR_ATOMIC|PR_ADDR|PR_LASTHDR, + icmp_input, 0, 0, icmp_dgram_ctloutput, 0, - 0, 0, 0, 0, - 0, &icmp_dgram_usrreqs + 0, 0, 0, 0, + 0, + &icmp_dgram_usrreqs, + 0, rip_unlock, 0, { 0, 0 }, 0, { 0 } }, { SOCK_RAW, &inetdomain, IPPROTO_IGMP, PR_ATOMIC|PR_ADDR|PR_LASTHDR, igmp_input, 0, 0, rip_ctloutput, 0, igmp_init, igmp_fasttimo, igmp_slowtimo, 0, - 0, &rip_usrreqs +
0, + &rip_usrreqs, + 0, rip_unlock, 0, { 0, 0 }, 0, { 0 } }, { SOCK_RAW, &inetdomain, IPPROTO_RSVP, PR_ATOMIC|PR_ADDR|PR_LASTHDR, rsvp_input, 0, 0, rip_ctloutput, 0, 0, 0, 0, 0, - 0, &rip_usrreqs + 0, + &rip_usrreqs, + 0, rip_unlock, 0, { 0, 0 }, 0, { 0 } }, #if IPSEC -{ SOCK_RAW, &inetdomain, IPPROTO_AH, PR_ATOMIC|PR_ADDR, +{ SOCK_RAW, &inetdomain, IPPROTO_AH, PR_ATOMIC|PR_ADDR|PR_PROTOLOCK, ah4_input, 0, 0, 0, 0, 0, 0, 0, 0, - 0, &nousrreqs + 0, + &nousrreqs, + 0, 0, 0, { 0, 0 }, 0, { 0 } }, #if IPSEC_ESP -{ SOCK_RAW, &inetdomain, IPPROTO_ESP, PR_ATOMIC|PR_ADDR, +{ SOCK_RAW, &inetdomain, IPPROTO_ESP, PR_ATOMIC|PR_ADDR|PR_PROTOLOCK, esp4_input, 0, 0, 0, 0, 0, 0, 0, 0, - 0, &nousrreqs + 0, + &nousrreqs, + 0, 0, 0, { 0, 0 }, 0, { 0 } }, #endif -{ SOCK_RAW, &inetdomain, IPPROTO_IPCOMP, PR_ATOMIC|PR_ADDR, +{ SOCK_RAW, &inetdomain, IPPROTO_IPCOMP, PR_ATOMIC|PR_ADDR|PR_PROTOLOCK, ipcomp4_input, 0, 0, 0, 0, 0, 0, 0, 0, - 0, &nousrreqs + 0, + &nousrreqs, + 0, 0, 0, { 0, 0 }, 0, { 0 } }, #endif /* IPSEC */ { SOCK_RAW, &inetdomain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR|PR_LASTHDR, encap4_input, 0, 0, rip_ctloutput, 0, encap_init, 0, 0, 0, - 0, &rip_usrreqs + 0, + &rip_usrreqs, + 0, 0, 0, { 0, 0 }, 0, { 0 } }, # if INET6 { SOCK_RAW, &inetdomain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR|PR_LASTHDR, encap4_input, 0, 0, rip_ctloutput, 0, encap_init, 0, 0, 0, - 0, &rip_usrreqs + 0, + &rip_usrreqs, + 0, 0, 0, { 0, 0 }, 0, { 0 } }, #endif #if IPDIVERT -{ SOCK_RAW, &inetdomain, IPPROTO_DIVERT, PR_ATOMIC|PR_ADDR, +{ SOCK_RAW, &inetdomain, IPPROTO_DIVERT, PR_ATOMIC|PR_ADDR|PR_PCBLOCK, div_input, 0, 0, ip_ctloutput, 0, div_init, 0, 0, 0, - 0, &div_usrreqs, + 0, + &div_usrreqs, + div_lock, div_unlock, div_getlock, { 0, 0 }, 0, { 0 } }, #endif #if IPXIP @@ -201,7 +232,9 @@ struct protosw inetsw[] = { ipxip_input, 0, ipxip_ctlinput, 0, 0, 0, 0, 0, 0, - 0, &rip_usrreqs + 0, + &rip_usrreqs, + 0, 0, 0, { 0, 0 }, 0, { 0 } }, #endif #if NSIP @@ -209,29 +242,44 @@ struct protosw inetsw[] = { idpip_input, 0, nsip_ctlinput, 0, 0, 0, 0, 0, 0, - 0, &rip_usrreqs + 0, + &rip_usrreqs, + 0, 0, 0, { 0, 0 }, 0, { 0 } }, #endif /* raw wildcard */ -{ SOCK_RAW, &inetdomain, 0, PR_ATOMIC|PR_ADDR, +{ SOCK_RAW, &inetdomain, 0, PR_ATOMIC|PR_ADDR|PR_LASTHDR, rip_input, 0, 0, rip_ctloutput, 0, rip_init, 0, 0, 0, - 0, &rip_usrreqs + 0, + &rip_usrreqs, + 0, rip_unlock, 0, { 0, 0 }, 0, { 0 } }, }; -extern int in_inithead __P((void **, int)); +extern int in_inithead(void **, int); int in_proto_count = (sizeof (inetsw) / sizeof (struct protosw)); extern void in_dinit(void); /* A routing init function, and a header size */ struct domain inetdomain = - { AF_INET, "internet", in_dinit, 0, 0, - inetsw, 0, - in_inithead, 32, sizeof(struct sockaddr_in), - sizeof(struct tcpiphdr), 0 + { AF_INET, + "internet", + in_dinit, + 0, + 0, + inetsw, + 0, + in_inithead, + 32, + sizeof(struct sockaddr_in), + sizeof(struct tcpiphdr), + 0, + 0, + 0, + { 0, 0} }; DOMAIN_SET(inet); diff --git a/bsd/netinet/in_rmx.c b/bsd/netinet/in_rmx.c index 62d6eb597..3a7afc3cd 100644 --- a/bsd/netinet/in_rmx.c +++ b/bsd/netinet/in_rmx.c @@ -70,19 +70,21 @@ #include <sys/socket.h> #include <sys/mbuf.h> #include <sys/syslog.h> +#include <kern/lock.h> #include <net/if.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/in_var.h> -extern int in_inithead __P((void **head, int off)); +extern int in_inithead(void **head, int off); #ifdef __APPLE__ static void in_rtqtimo(void *rock); #endif #define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ +extern lck_mtx_t *rt_mtx; /* * 
Do what we need to do when inserting a route. @@ -145,21 +147,21 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, * Find out if it is because of an * ARP entry and delete it if so. */ - rt2 = rtalloc1((struct sockaddr *)sin, 0, + rt2 = rtalloc1_locked((struct sockaddr *)sin, 0, RTF_CLONING | RTF_PRCLONING); if (rt2) { if (rt2->rt_flags & RTF_LLINFO && rt2->rt_flags & RTF_HOST && rt2->rt_gateway && rt2->rt_gateway->sa_family == AF_LINK) { - rtrequest(RTM_DELETE, + rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt2), rt2->rt_gateway, rt_mask(rt2), rt2->rt_flags, 0); ret = rn_addroute(v_arg, n_arg, head, treenodes); } - rtfree(rt2); + rtfree_locked(rt2); } } return ret; @@ -232,6 +234,7 @@ static void in_clsroute(struct radix_node *rn, struct radix_node_head *head) { struct rtentry *rt = (struct rtentry *)rn; + struct timeval timenow; if(!(rt->rt_flags & RTF_UP)) return; /* prophylactic measures */ @@ -249,10 +252,11 @@ in_clsroute(struct radix_node *rn, struct radix_node_head *head) * waiting for a timeout cycle to kill it. */ if(rtq_reallyold != 0) { + getmicrotime(&timenow); rt->rt_flags |= RTPRF_OURS; - rt->rt_rmx.rmx_expire = time_second + rtq_reallyold; + rt->rt_rmx.rmx_expire = timenow.tv_sec + rtq_reallyold; } else { - rtrequest(RTM_DELETE, + rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0); @@ -279,15 +283,18 @@ in_rtqkill(struct radix_node *rn, void *rock) struct rtqk_arg *ap = rock; struct rtentry *rt = (struct rtentry *)rn; int err; + struct timeval timenow; + getmicrotime(&timenow); + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); if(rt->rt_flags & RTPRF_OURS) { ap->found++; - if(ap->draining || rt->rt_rmx.rmx_expire <= time_second) { + if(ap->draining || rt->rt_rmx.rmx_expire <= timenow.tv_sec) { if(rt->rt_refcnt > 0) panic("rtqkill route really not free"); - err = rtrequest(RTM_DELETE, + err = rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0); @@ -298,9 +305,9 @@ in_rtqkill(struct radix_node *rn, void *rock) } } else { if(ap->updating - && (rt->rt_rmx.rmx_expire - time_second + && (rt->rt_rmx.rmx_expire - timenow.tv_sec > rtq_reallyold)) { - rt->rt_rmx.rmx_expire = time_second + rt->rt_rmx.rmx_expire = timenow.tv_sec + rtq_reallyold; } ap->nextstop = lmin(ap->nextstop, @@ -314,11 +321,7 @@ in_rtqkill(struct radix_node *rn, void *rock) static void in_rtqtimo_funnel(void *rock) { - boolean_t funnel_state; - - funnel_state = thread_funnel_set(network_flock, TRUE); in_rtqtimo(rock); - (void) thread_funnel_set(network_flock, FALSE); } #define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ @@ -331,15 +334,15 @@ in_rtqtimo(void *rock) struct rtqk_arg arg; struct timeval atv; static time_t last_adjusted_timeout = 0; - int s; + struct timeval timenow; + getmicrotime(&timenow); arg.found = arg.killed = 0; arg.rnh = rnh; - arg.nextstop = time_second + rtq_timeout; + arg.nextstop = timenow.tv_sec + rtq_timeout; arg.draining = arg.updating = 0; - s = splnet(); + lck_mtx_lock(rt_mtx); rnh->rnh_walktree(rnh, in_rtqkill, &arg); - splx(s); /* * Attempt to be somewhat dynamic about this: @@ -350,27 +353,26 @@ in_rtqtimo(void *rock) * hard. 
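 * As a worked example (with the usual defaults of a one-hour
 * rtq_reallyold and a ten-second rtq_minreallyold, set near the top
 * of this file): a sweep that leaves more than rtq_toomany unexpired
 * clones cuts the lifetime to two thirds, so successive crowded
 * sweeps age it 3600 -> 2400 -> 1600 -> 1066 -> ... until it bottoms
 * out at rtq_minreallyold, with at most one cut per rtq_timeout
 * interval.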
*/ if((arg.found - arg.killed > rtq_toomany) - && (time_second - last_adjusted_timeout >= rtq_timeout) + && (timenow.tv_sec - last_adjusted_timeout >= rtq_timeout) && rtq_reallyold > rtq_minreallyold) { rtq_reallyold = 2*rtq_reallyold / 3; if(rtq_reallyold < rtq_minreallyold) { rtq_reallyold = rtq_minreallyold; } - last_adjusted_timeout = time_second; + last_adjusted_timeout = timenow.tv_sec; #if DIAGNOSTIC log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n", rtq_reallyold); #endif arg.found = arg.killed = 0; arg.updating = 1; - s = splnet(); rnh->rnh_walktree(rnh, in_rtqkill, &arg); - splx(s); } atv.tv_usec = 0; - atv.tv_sec = arg.nextstop - time_second; + atv.tv_sec = arg.nextstop - timenow.tv_sec; + lck_mtx_unlock(rt_mtx); timeout(in_rtqtimo_funnel, rock, tvtohz(&atv)); } @@ -379,15 +381,14 @@ in_rtqdrain(void) { struct radix_node_head *rnh = rt_tables[AF_INET]; struct rtqk_arg arg; - int s; arg.found = arg.killed = 0; arg.rnh = rnh; arg.nextstop = 0; arg.draining = 1; arg.updating = 0; - s = splnet(); + lck_mtx_lock(rt_mtx); rnh->rnh_walktree(rnh, in_rtqkill, &arg); - splx(s); + lck_mtx_unlock(rt_mtx); } /* @@ -451,7 +452,7 @@ in_ifadownkill(struct radix_node *rn, void *xap) * so that behavior is not needed there. */ rt->rt_flags &= ~(RTF_CLONING | RTF_PRCLONING); - err = rtrequest(RTM_DELETE, (struct sockaddr *)rt_key(rt), + err = rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0); if (err) { log(LOG_WARNING, "in_ifadownkill: error %d\n", err); @@ -466,6 +467,8 @@ in_ifadown(struct ifaddr *ifa, int delete) struct in_ifadown_arg arg; struct radix_node_head *rnh; + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); + if (ifa->ifa_addr->sa_family != AF_INET) return 1; diff --git a/bsd/netinet/in_systm.h b/bsd/netinet/in_systm.h index 347a460e3..ecfcc405d 100644 --- a/bsd/netinet/in_systm.h +++ b/bsd/netinet/in_systm.h @@ -58,6 +58,7 @@ #ifndef _NETINET_IN_SYSTM_H_ #define _NETINET_IN_SYSTM_H_ #include <sys/appleapiopts.h> +#include <sys/_types.h> /* * Miscellaneous internetwork @@ -72,15 +73,13 @@ * the bytes before transmission at each protocol level. The n_ types * represent the types with the bytes in ``high-ender'' order. */ -typedef u_int16_t n_short; /* short as received from the net */ -typedef u_int32_t n_long; /* long as received from the net */ +typedef __uint16_t n_short; /* short as received from the net */ +typedef __uint32_t n_long; /* long as received from the net */ -typedef u_int32_t n_time; /* ms since 00:00 GMT, byte rev */ +typedef __uint32_t n_time; /* ms since 00:00 GMT, byte rev */ -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -n_time iptime __P((void)); -#endif /* __APPLE_API_PRIVATE */ -#endif +#ifdef KERNEL_PRIVATE +n_time iptime(void); +#endif KERNEL_PRIVATE #endif diff --git a/bsd/netinet/in_var.h b/bsd/netinet/in_var.h index e33e1ea2e..1ec7420be 100644 --- a/bsd/netinet/in_var.h +++ b/bsd/netinet/in_var.h @@ -64,7 +64,7 @@ #include <sys/kern_event.h> #endif -#ifdef __APPLE_API_UNSTABLE +#ifdef PRIVATE /* * Interface address, Internet version. One of these structures @@ -88,6 +88,7 @@ struct in_ifaddr { #define ia_broadaddr ia_dstaddr struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ }; +#endif /* PRIVATE */ struct in_aliasreq { char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ @@ -100,7 +101,6 @@ struct in_aliasreq { #endif }; -#ifdef __APPLE__ /* * Event data, internet style. 
*/ @@ -136,8 +136,8 @@ struct kev_in_collision { #define KEV_INET_SIFBRDADDR 5 #define KEV_INET_SIFNETMASK 6 #define KEV_INET_ARPCOLLISION 7 /* use kev_in_collision */ -#endif /* __APPLE__ */ +#ifdef KERNEL_PRIVATE /* * Given a pointer to an in_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in. @@ -148,14 +148,11 @@ struct kev_in_collision { #define IN_LNAOF(in, ifa) \ ((ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa)->ia_subnetmask)) -#endif /* __APPLE_API_UNSTABLE */ - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE extern TAILQ_HEAD(in_ifaddrhead, in_ifaddr) in_ifaddrhead; extern struct ifqueue ipintrq; /* ip packet input queue */ extern struct in_addr zeroin_addr; extern u_char inetctlerrmap[]; +extern lck_mtx_t *rt_mtx; /* * Macro for finding the interface (ifnet structure) corresponding to one @@ -167,10 +164,13 @@ extern u_char inetctlerrmap[]; { \ struct in_ifaddr *ia; \ \ + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_NOTOWNED); \ + lck_mtx_lock(rt_mtx); \ TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) \ if (IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \ break; \ (ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \ + lck_mtx_unlock(rt_mtx); \ } /* @@ -181,15 +181,15 @@ extern u_char inetctlerrmap[]; /* struct ifnet *ifp; */ \ /* struct in_ifaddr *ia; */ \ { \ + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_NOTOWNED); \ + lck_mtx_lock(rt_mtx); \ for ((ia) = TAILQ_FIRST(&in_ifaddrhead); \ (ia) != NULL && (ia)->ia_ifp != (ifp); \ (ia) = TAILQ_NEXT((ia), ia_link)) \ continue; \ + lck_mtx_unlock(rt_mtx); \ } -#endif /* __APPLE_API_PRIVATE */ -#endif -#ifdef __APPLE_API_UNSTABLE /* * This information should be part of the ifnet structure but we don't wish * to change that - as it might break a number of things @@ -219,10 +219,6 @@ struct in_multi { u_int inm_state; /* state of the membership */ struct router_info *inm_rti; /* router info*/ }; -#endif /* __APPLE_API_UNSTABLE */ - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_ip); @@ -283,20 +279,19 @@ do { \ } while(0) struct route; -struct in_multi *in_addmulti __P((struct in_addr *, struct ifnet *)); -void in_delmulti __P((struct in_multi *)); -int in_control __P((struct socket *, u_long, caddr_t, struct ifnet *, - struct proc *)); -void in_rtqdrain __P((void)); -void ip_input __P((struct mbuf *)); -int in_ifadown __P((struct ifaddr *ifa, int)); -void in_ifscrub __P((struct ifnet *, struct in_ifaddr *)); -int ipflow_fastforward __P((struct mbuf *)); -void ipflow_create __P((const struct route *, struct mbuf *)); -void ipflow_slowtimo __P((void)); +struct in_multi *in_addmulti(struct in_addr *, struct ifnet *); +void in_delmulti(struct in_multi **); +int in_control(struct socket *, u_long, caddr_t, struct ifnet *, + struct proc *); +void in_rtqdrain(void); +void ip_input(struct mbuf *); +int in_ifadown(struct ifaddr *ifa, int); +void in_ifscrub(struct ifnet *, struct in_ifaddr *, int); +int ipflow_fastforward(struct mbuf *); +void ipflow_create(const struct route *, struct mbuf *); +void ipflow_slowtimo(void); -#endif /* __APPLE_API_PRIVATE */ -#endif /* _KERNEL */ +#endif /* KERNEL_PRIVATE */ /* INET6 stuff */ #include <netinet6/in6_var.h> diff --git a/bsd/netinet/ip.h b/bsd/netinet/ip.h index 8f19e94a6..7b75ffb2c 100644 --- a/bsd/netinet/ip.h +++ b/bsd/netinet/ip.h @@ -58,6 +58,10 @@ #ifndef _NETINET_IP_H_ #define _NETINET_IP_H_ #include <sys/appleapiopts.h> +#include <sys/types.h>> /* XXX temporary hack to get u_ types */ +#include <netinet/in.h> +#include <netinet/in_systm.h> + /* * Definitions for internet 
protocol version 4. diff --git a/bsd/netinet/ip6.h b/bsd/netinet/ip6.h index 6a3b65785..c677f2b6a 100644 --- a/bsd/netinet/ip6.h +++ b/bsd/netinet/ip6.h @@ -237,8 +237,7 @@ struct ip6_frag { #define IPV6_MMTU 1280 /* minimal MTU and reassembly. 1024 + 256 */ #define IPV6_MAXPACKET 65535 /* ip6 max packet size without Jumbo payload*/ -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE /* * IP6_EXTHDR_CHECK ensures that region between the IP6 header and the * target header (including IPv6 itself, extension headers and @@ -248,25 +247,25 @@ struct ip6_frag { * supposed to never be matched but is prepared just in case. */ -#define IP6_EXTHDR_CHECK(m, off, hlen, ret) \ +#define IP6_EXTHDR_CHECK(m, off, hlen, action) \ do { \ if ((m)->m_next != NULL) { \ if (((m)->m_flags & M_LOOP) && \ ((m)->m_len < (off) + (hlen)) && \ (((m) = m_pullup((m), (off) + (hlen))) == NULL)) { \ ip6stat.ip6s_exthdrtoolong++; \ - return ret; \ + action; \ } else if ((m)->m_flags & M_EXT) { \ if ((m)->m_len < (off) + (hlen)) { \ ip6stat.ip6s_exthdrtoolong++; \ m_freem(m); \ - return ret; \ + action; \ } \ } else { \ if ((m)->m_len < (off) + (hlen)) { \ ip6stat.ip6s_exthdrtoolong++; \ m_freem(m); \ - return ret; \ + action; \ } \ } \ } else { \ @@ -274,7 +273,7 @@ do { \ ip6stat.ip6s_tooshort++; \ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); \ m_freem(m); \ - return ret; \ + action; \ } \ } \ } while (0) @@ -325,7 +324,6 @@ do { \ } \ } \ } while (0) -#endif /* __APPLE_API_PRIVATE */ -#endif /*KERNEL*/ -#endif /* not _NETINET_IP6_H_ */ +#endif KERNEL_PRIVATE +#endif !_NETINET_IP6_H_ diff --git a/bsd/netinet/ip_compat.h b/bsd/netinet/ip_compat.h index a39802e7f..8fc0d1ea5 100644 --- a/bsd/netinet/ip_compat.h +++ b/bsd/netinet/ip_compat.h @@ -35,13 +35,8 @@ #ifndef __IP_COMPAT_H__ #define __IP_COMPAT_H__ -#ifndef __P -# ifdef __STDC__ -# define __P(x) x -# else -# define __P(x) () -# define const -# endif +#ifndef __STDC__ +# define const #endif #ifndef SOLARIS @@ -245,7 +240,7 @@ typedef struct qif { */ int qf_hl; /* header length */ } qif_t; -extern ill_t *get_unit __P((char *)); +extern ill_t *get_unit(char *); # define GETUNIT(n) get_unit((n)) # else /* SOLARIS */ # if defined(__sgi) @@ -289,8 +284,8 @@ typedef struct { # define KFREE(x) kmem_free((char *)(x), sizeof(*(x))) # define KFREES(x,s) kmem_free((char *)(x), (s)) # if !SOLARIS -extern void m_copydata __P((struct mbuf *, int, int, caddr_t)); -extern void m_copyback __P((struct mbuf *, int, int, caddr_t)); +extern void m_copydata(struct mbuf *, int, int, caddr_t); +extern void m_copyback(struct mbuf *, int, int, caddr_t); # endif # ifdef __sgi # include <sys/kmem.h> diff --git a/bsd/netinet/ip_divert.c b/bsd/netinet/ip_divert.c index f33097694..e92d7e6a3 100644 --- a/bsd/netinet/ip_divert.c +++ b/bsd/netinet/ip_divert.c @@ -51,10 +51,9 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: src/sys/netinet/ip_divert.c,v 1.42.2.4 2001/07/29 19:32:40 ume Exp $ + * $FreeBSD: src/sys/netinet/ip_divert.c,v 1.98 2004/08/17 22:05:54 andre Exp $ */ - #ifndef INET #error "IPDIVERT requires INET." 
#endif @@ -64,6 +63,7 @@ #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/socket.h> +#include <sys/domain.h> #include <sys/protosw.h> #include <sys/socketvar.h> #include <sys/sysctl.h> @@ -80,6 +80,10 @@ #include <netinet/in_pcb.h> #include <netinet/in_var.h> #include <netinet/ip_var.h> +#include <netinet/ip_fw.h> +#include <netinet/ip_divert.h> + +#include <kern/zalloc.h> /* * Divert sockets @@ -92,20 +96,28 @@ #define DIVRCVQ (65536 + 100) /* - * A 16 bit cookie is passed to and from the user process. - * The user process can send it back to help the caller know - * something about where the packet originally came from. + * Divert sockets work in conjunction with ipfw, see the divert(4) + * manpage for features. + * Internally, packets selected by ipfw in ip_input() or ip_output(), + * and never diverted before, are passed to the input queue of the + * divert socket with a given 'divert_port' number (as specified in + * the matching ipfw rule), and they are tagged with a 16 bit cookie + * (representing the rule number of the matching ipfw rule), which + * is passed to process reading from the socket. * - * In the case of ipfw, then the cookie is the rule that sent - * us here. On reinjection is is the rule after which processing - * should continue. Leaving it the same will make processing start - * at the rule number after that which sent it here. Setting it to - * 0 will restart processing at the beginning. + * Packets written to the divert socket are again tagged with a cookie + * (usually the same as above) and a destination address. + * If the destination address is INADDR_ANY then the packet is + * treated as outgoing and sent to ip_output(), otherwise it is + * treated as incoming and sent to ip_input(). + * In both cases, the packet is tagged with the cookie. * - * For divert_packet(), ip_divert_cookie is an input value only. - * For div_output(), ip_divert_cookie is an output value only. + * On reinjection, processing in ip_input() and ip_output() + * will be exactly the same as for the original packet, except that + * ipfw processing will start at the rule number after the one + * written in the cookie (so, tagging a packet with a cookie of 0 + * will cause it to be effectively considered as a standard packet). */ -u_int16_t ip_divert_cookie; /* Internal variables */ static struct inpcbhead divcb; @@ -115,18 +127,20 @@ static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ /* Optimization: have this preinitialized */ -static struct sockaddr_in divsrc = { sizeof(divsrc), AF_INET }; +static struct sockaddr_in divsrc = { sizeof(divsrc), AF_INET, }; /* Internal functions */ static int div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr, struct mbuf *control); +extern int load_ipfw(void); /* * Initialize divert connection block queue. 
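 */

/*
 * A minimal userland divert(4) client for the mechanism described
 * above (illustrative; port 8668 is an arbitrary example and must
 * match an "ipfw add ... divert 8668 ..." rule; requires privilege).
 * It receives one diverted packet -- the matching rule arrives as the
 * cookie in sin_port -- and reinjects it unchanged, so ipfw processing
 * resumes after the recorded rule.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	unsigned char pkt[65535];
	struct sockaddr_in sin;
	socklen_t sl = sizeof(sin);
	ssize_t n;
	int s;

	if ((s = socket(PF_INET, SOCK_RAW, IPPROTO_DIVERT)) < 0)
		return (1);
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(8668);		/* the divert_port */
	sin.sin_addr.s_addr = INADDR_ANY;
	if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0)
		return (1);

	if ((n = recvfrom(s, pkt, sizeof(pkt), 0,
	    (struct sockaddr *)&sin, &sl)) < 0)
		return (1);
	/* The cookie's byte order is implementation-defined here. */
	printf("%zd bytes, rule cookie %u\n", n, (unsigned)sin.sin_port);

	/* Same sockaddr back: INADDR_ANY means "treat as outgoing". */
	(void)sendto(s, pkt, n, 0, (struct sockaddr *)&sin, sl);
	close(s);
	return (0);
}

/*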
*/ void div_init(void) { + struct inpcbinfo *pcbinfo; LIST_INIT(&divcb); divcbinfo.listhead = &divcb; /* @@ -138,13 +152,26 @@ div_init(void) divcbinfo.porthashbase = hashinit(1, M_PCB, &divcbinfo.porthashmask); divcbinfo.ipi_zone = (void *) zinit(sizeof(struct inpcb),(maxsockets * sizeof(struct inpcb)), 4096, "divzone"); + pcbinfo = &divcbinfo; + /* + * allocate lock group attribute and group for udp pcb mutexes + */ + pcbinfo->mtx_grp_attr = lck_grp_attr_alloc_init(); -/* - * ### LD 08/03: init IP forwarding at this point [ipfw is not a module yet] - */ -#if !IPFIREWALL_KEXT - ip_fw_init(); -#endif + pcbinfo->mtx_grp = lck_grp_alloc_init("divcb", pcbinfo->mtx_grp_attr); + + /* + * allocate the lock attribute for divert pcb mutexes + */ + pcbinfo->mtx_attr = lck_attr_alloc_init(); + lck_attr_setdefault(pcbinfo->mtx_attr); + + if ((pcbinfo->mtx = lck_rw_alloc_init(pcbinfo->mtx_grp, pcbinfo->mtx_attr)) == NULL) + return; /* pretty much dead if this fails... */ + + if (!IPFW_LOADED) { + load_ipfw(); + } } /* @@ -152,7 +179,7 @@ div_init(void) * with that protocol number to enter the system from the outside. */ void -div_input(struct mbuf *m, int off) +div_input(struct mbuf *m, __unused int off) { ipstat.ips_noproto++; m_freem(m); @@ -163,9 +190,10 @@ div_input(struct mbuf *m, int off) * * Setup generic address and protocol structures for div_input routine, * then pass them along with mbuf chain. + * ###LOCK called in ip_mutex from ip_output/ip_input */ void -divert_packet(struct mbuf *m, int incoming, int port) +divert_packet(struct mbuf *m, int incoming, int port, int rule) { struct ip *ip; struct inpcb *inp; @@ -175,9 +203,7 @@ divert_packet(struct mbuf *m, int incoming, int port) /* Sanity check */ KASSERT(port != 0, ("%s: port=0", __FUNCTION__)); - /* Record and reset divert cookie */ - divsrc.sin_port = ip_divert_cookie; - ip_divert_cookie = 0; + divsrc.sin_port = rule; /* record matching rule */ /* Assure header */ if (m->m_len < sizeof(struct ip) && @@ -198,6 +224,7 @@ divert_packet(struct mbuf *m, int incoming, int port) KASSERT((m->m_flags & M_PKTHDR), ("%s: !PKTHDR", __FUNCTION__)); /* Find IP address for receive interface */ + ifnet_lock_shared(m->m_pkthdr.rcvif); TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL) continue; @@ -207,6 +234,7 @@ divert_packet(struct mbuf *m, int incoming, int port) ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; break; } + ifnet_lock_done(m->m_pkthdr.rcvif); } /* * Record the incoming interface name whenever we have one. @@ -239,21 +267,25 @@ divert_packet(struct mbuf *m, int incoming, int port) /* Put packet on socket queue, if any */ sa = NULL; nport = htons((u_int16_t)port); + lck_rw_lock_shared(divcbinfo.mtx); LIST_FOREACH(inp, &divcb, inp_list) { if (inp->inp_lport == nport) sa = inp->inp_socket; } if (sa) { + int error = 0; + + socket_lock(sa, 1); if (sbappendaddr(&sa->so_rcv, (struct sockaddr *)&divsrc, - m, (struct mbuf *)0) == 0) - m_freem(m); - else + m, (struct mbuf *)0, &error) != 0) sorwakeup(sa); + socket_unlock(sa, 1); } else { m_freem(m); ipstat.ips_noproto++; ipstat.ips_delivered--; } + lck_rw_done(divcbinfo.mtx); } /* @@ -262,6 +294,7 @@ divert_packet(struct mbuf *m, int incoming, int port) * If no address specified, or address is 0.0.0.0, send to ip_output(); * otherwise, send to ip_input() and mark as having been received on * the interface with that address. + * ###LOCK called in inet_proto mutex when from div_send. 
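 */

/*
 * div_output() below and in_pcbinshash() earlier in this patch face
 * the same problem: the caller holds a per-socket mutex but needs a
 * lock that is ordered before it.  The pattern used is try-lock first;
 * on failure, drop the socket lock, block on the other lock, retake
 * the socket lock, and revalidate.  Distilled (hypothetical helper,
 * not in the patch):
 */
static void
take_list_then_socket(struct socket *so, lck_rw_t *list_mtx)
{
	if (!lck_rw_try_lock_exclusive(list_mtx)) {
		/* Taking list_mtx while holding the socket lock would
		 * invert the ordering, so back off first. */
		socket_unlock(so, 0);
		lck_rw_lock_exclusive(list_mtx);
		socket_lock(so, 0);
		/* NB: socket state may have changed while unlocked;
		 * the caller must recheck before proceeding. */
	}
}

/*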
*/ static int div_output(so, m, addr, control) @@ -280,10 +313,21 @@ div_output(so, m, addr, control) /* Loopback avoidance and state recovery */ if (sin) { + struct m_tag *mtag; + struct divert_tag *dt; int len = 0; char *c = sin->sin_zero; - ip_divert_cookie = sin->sin_port; + mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DIVERT, + sizeof(struct divert_tag), M_NOWAIT); + if (mtag == NULL) { + error = ENOBUFS; + goto cantsend; + } + dt = (struct divert_tag *)(mtag+1); + dt->info = 0; + dt->cookie = sin->sin_port; + m_tag_prepend(m, mtag); /* * Find receive interface with the given name or IP address. @@ -297,8 +341,6 @@ div_output(so, m, addr, control) while (*c++ && (len++ < sizeof(sin->sin_zero))); if ((len > 0) && (len < sizeof(sin->sin_zero))) m->m_pkthdr.rcvif = ifunit(sin->sin_zero); - } else { - ip_divert_cookie = 0; } /* Reinject packet into the system as incoming or outgoing */ @@ -319,10 +361,13 @@ div_output(so, m, addr, control) /* Send packet to output processing */ ipstat.ips_rawout++; /* XXX */ - error = ip_output(m, inp->inp_options, &inp->inp_route, + socket_unlock(so, 0); + error = ip_output(m, + inp->inp_options, &inp->inp_route, (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST | IP_RAWOUTPUT, inp->inp_moptions); + socket_lock(so, 0); } else { struct ifaddr *ifa; @@ -341,20 +386,40 @@ div_output(so, m, addr, control) goto cantsend; } m->m_pkthdr.rcvif = ifa->ifa_ifp; + ifafree(ifa); + } + + if ((~IF_HWASSIST_CSUM_FLAGS(m->m_pkthdr.rcvif->if_hwassist) & + m->m_pkthdr.csum_flags) == 0) { + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } + m->m_pkthdr.csum_flags |= + CSUM_DATA_VALID | CSUM_PSEUDO_HDR | + CSUM_IP_CHECKED | CSUM_IP_VALID; + m->m_pkthdr.csum_data = 0xffff; + } + else if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + int hlen; + +#ifdef _IP_VHL + hlen = IP_VHL_HL(ip->ip_vhl) << 2; +#else + hlen = ip->ip_hl << 2; +#endif + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + ip->ip_sum = in_cksum(m, hlen); } /* Send packet to input processing */ - ip_input(m); + proto_inject(PF_INET, m); } - /* paranoid: Reset for next time (and other packets) */ - /* almost definitly already done in the ipfw filter but.. 
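 */

/*
 * The consumer side of the tag prepended above, for reference:
 * downstream code (ipfw, on reinjection) can recover the cookie from
 * the mbuf itself instead of the old ip_divert_cookie global.  A
 * sketch assuming the usual m_tag_locate() lookup; the helper name
 * is hypothetical.
 */
static u_int16_t
divert_cookie_sketch(struct mbuf *m)
{
	struct m_tag *mtag;
	struct divert_tag *dt;

	mtag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
	    KERNEL_TAG_TYPE_DIVERT, NULL);
	if (mtag == NULL)
		return (0);		/* untagged: start at the first rule */
	dt = (struct divert_tag *)(mtag + 1);
	return (dt->cookie);		/* resume after this rule */
}

/*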
*/ - ip_divert_cookie = 0; return error; cantsend: m_freem(m); - ip_divert_cookie = 0; return error; } @@ -362,20 +427,19 @@ static int div_attach(struct socket *so, int proto, struct proc *p) { struct inpcb *inp; - int error, s; + int error; + inp = sotoinpcb(so); if (inp) panic("div_attach"); - if (p && (error = suser(p->p_ucred, &p->p_acflag)) != 0) + if (p && (error = proc_suser(p)) != 0) return error; error = soreserve(so, div_sendspace, div_recvspace); if (error) return error; - s = splnet(); error = in_pcballoc(so, &divcbinfo, p); - splx(s); if (error) return error; inp = (struct inpcb *)so->so_pcb; @@ -385,6 +449,11 @@ div_attach(struct socket *so, int proto, struct proc *p) /* The socket is always "connected" because we always know "where" to send the packet */ so->so_state |= SS_ISCONNECTED; + +#ifdef MORE_DICVLOCK_DEBUG + printf("div_attach: so=%x sopcb=%x lock=%x ref=%x\n", + so, so->so_pcb, ((struct inpcb *)so->so_pcb)->inpcb_mtx, so->so_usecount); +#endif return 0; } @@ -393,10 +462,15 @@ div_detach(struct socket *so) { struct inpcb *inp; +#ifdef MORE_DICVLOCK_DEBUG + printf("div_detach: so=%x sopcb=%x lock=%x ref=%x\n", + so, so->so_pcb, ((struct inpcb *)so->so_pcb)->inpcb_mtx, so->so_usecount); +#endif inp = sotoinpcb(so); if (inp == 0) - panic("div_detach"); + panic("div_detach: so=%x null inp\n", so); in_pcbdetach(inp); + inp->inp_state = INPCB_STATE_DEAD; return 0; } @@ -419,10 +493,8 @@ static int div_bind(struct socket *so, struct sockaddr *nam, struct proc *p) { struct inpcb *inp; - int s; int error; - s = splnet(); inp = sotoinpcb(so); /* in_pcbbind assumes that the socket is a sockaddr_in * and in_pcbbind requires a valid address. Since divert @@ -437,7 +509,6 @@ div_bind(struct socket *so, struct sockaddr *nam, struct proc *p) ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; error = in_pcbbind(inp, nam, p); } - splx(s); return error; } @@ -449,8 +520,8 @@ div_shutdown(struct socket *so) } static int -div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, - struct mbuf *control, struct proc *p) +div_send(struct socket *so, __unused int flags, struct mbuf *m, struct sockaddr *nam, + struct mbuf *control, __unused struct proc *p) { /* Packet must have a header (but that's about it) */ if (m->m_len < sizeof (struct ip) && @@ -467,7 +538,7 @@ div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, static int div_pcblist SYSCTL_HANDLER_ARGS { - int error, i, n, s; + int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; @@ -476,57 +547,61 @@ div_pcblist SYSCTL_HANDLER_ARGS * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ - if (req->oldptr == 0) { + lck_rw_lock_exclusive(divcbinfo.mtx); + if (req->oldptr == USER_ADDR_NULL) { n = divcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); + lck_rw_done(divcbinfo.mtx); return 0; } - if (req->newptr != 0) + if (req->newptr != USER_ADDR_NULL) { + lck_rw_done(divcbinfo.mtx); return EPERM; + } /* * OK, now we're committed to doing something. 
*/ - s = splnet(); gencnt = divcbinfo.ipi_gencnt; n = divcbinfo.ipi_count; - splx(s); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); - if (error) + if (error) { + lck_rw_done(divcbinfo.mtx); return error; + } inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK); - if (inp_list == 0) + if (inp_list == 0) { + lck_rw_done(divcbinfo.mtx); return ENOMEM; + } - s = splnet(); for (inp = LIST_FIRST(divcbinfo.listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { #ifdef __APPLE__ - if (inp->inp_gencnt <= gencnt) + if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) #else if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp)) #endif inp_list[i++] = inp; } - splx(s); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; - if (inp->inp_gencnt <= gencnt) { + if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) { struct xinpcb xi; xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ - bcopy(inp, &xi.xi_inp, sizeof *inp); + inpcb_to_compat(inp, &xi.xi_inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); error = SYSCTL_OUT(req, &xi, sizeof xi); @@ -540,30 +615,119 @@ div_pcblist SYSCTL_HANDLER_ARGS * while we were processing this request, and it * might be necessary to retry. */ - s = splnet(); xig.xig_gen = divcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = divcbinfo.ipi_count; - splx(s); error = SYSCTL_OUT(req, &xig, sizeof xig); } FREE(inp_list, M_TEMP); + lck_rw_done(divcbinfo.mtx); return error; } -#ifndef __APPLE__ -#warning Fix SYSCTL net_inet_divert +__private_extern__ int +div_lock(struct socket *so, int refcount, int lr) + { + int lr_saved; +#ifdef __ppc__ + if (lr == 0) { + __asm__ volatile("mflr %0" : "=r" (lr_saved)); + } + else lr_saved = lr; #endif -#if 0 -SYSCTL_DECL(_net_inet_divert); -SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0, - div_pcblist, "S,xinpcb", "List of active divert sockets"); + +#ifdef MORE_DICVLOCK_DEBUG + printf("div_lock: so=%x sopcb=%x lock=%x ref=%x lr=%x\n", + so, + so->so_pcb, + so->so_pcb ? ((struct inpcb *)so->so_pcb)->inpcb_mtx : 0, + so->so_usecount, + lr_saved); #endif + if (so->so_pcb) { + lck_mtx_lock(((struct inpcb *)so->so_pcb)->inpcb_mtx); + } else { + panic("div_lock: so=%x NO PCB! lr=%x\n", so, lr_saved); + lck_mtx_lock(so->so_proto->pr_domain->dom_mtx); + } + + if (so->so_usecount < 0) + panic("div_lock: so=%x so_pcb=%x lr=%x ref=%x\n", + so, so->so_pcb, lr_saved, so->so_usecount); + + if (refcount) + so->so_usecount++; + so->reserved3 = (void *)lr_saved; + + return (0); +} + +__private_extern__ int +div_unlock(struct socket *so, int refcount, int lr) +{ + int lr_saved; + lck_mtx_t * mutex_held; + struct inpcb *inp = sotoinpcb(so); +#ifdef __ppc__ + if (lr == 0) { + __asm__ volatile("mflr %0" : "=r" (lr_saved)); + } + else lr_saved = lr; +#endif + +#ifdef MORE_DICVLOCK_DEBUG + printf("div_unlock: so=%x sopcb=%x lock=%x ref=%x lr=%x\n", + so, + so->so_pcb, + so->so_pcb ? 
((struct inpcb *)so->so_pcb)->inpcb_mtx : 0, + so->so_usecount, + lr_saved); +#endif + if (refcount) + so->so_usecount--; + + if (so->so_usecount < 0) + panic("div_unlock: so=%x usecount=%x\n", so, so->so_usecount); + if (so->so_pcb == NULL) { + panic("div_unlock: so=%x NO PCB usecount=%x lr=%x\n", so, so->so_usecount, lr_saved); + mutex_held = so->so_proto->pr_domain->dom_mtx; + } else { + mutex_held = ((struct inpcb *)so->so_pcb)->inpcb_mtx; + } + + if (so->so_usecount == 0 && (inp->inp_wantcnt == WNT_STOPUSING)) { + lck_rw_lock_exclusive(divcbinfo.mtx); + in_pcbdispose(inp); + lck_rw_done(divcbinfo.mtx); + return (0); + } + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); + lck_mtx_unlock(mutex_held); + so->reserved4 = (void *)lr_saved; + return (0); +} + +__private_extern__ lck_mtx_t * +div_getlock(struct socket *so, __unused int locktype) +{ + struct inpcb *inpcb = (struct inpcb *)so->so_pcb; + + if (so->so_pcb) { + if (so->so_usecount < 0) + panic("div_getlock: so=%x usecount=%x\n", so, so->so_usecount); + return(inpcb->inpcb_mtx); + } else { + panic("div_getlock: so=%x NULL so_pcb\n", so); + return (so->so_proto->pr_domain->dom_mtx); + } +} + struct pr_usrreqs div_usrreqs = { div_abort, pru_accept_notsupp, div_attach, div_bind, pru_connect_notsupp, pru_connect2_notsupp, in_control, div_detach, div_disconnect, pru_listen_notsupp, in_setpeeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, div_send, pru_sense_null, div_shutdown, - in_setsockaddr, sosend, soreceive, sopoll + in_setsockaddr, sosend, soreceive, pru_sopoll_notsupp }; + diff --git a/bsd/netinet/ip_divert.h b/bsd/netinet/ip_divert.h new file mode 100644 index 000000000..b2f1a2939 --- /dev/null +++ b/bsd/netinet/ip_divert.h @@ -0,0 +1,92 @@ +/*- + * Copyright (c) 2003 Sam Leffler, Errno Consulting + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any + * redistribution must be conditioned upon including a substantially + * similar Disclaimer requirement for further binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + * of any contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGES. 
+ * + * $FreeBSD: src/sys/netinet/ip_divert.h,v 1.3 2004/02/25 19:55:28 mlaier Exp $ + */ + +#ifndef _NETINET_IP_DIVERT_H_ +#define _NETINET_IP_DIVERT_H_ + +#if IPDIVERT +/* + * Divert socket definitions. + */ + +/* 32-bit unique unsigned value used to identify a module */ + +struct divert_tag { + u_int32_t info; /* port & flags */ + u_int16_t cookie; /* ipfw rule number */ +}; + +/* + * Return the divert cookie associated with the mbuf, if any. + */ +static __inline u_int16_t +divert_cookie(struct m_tag *mtag) +{ + return ((struct divert_tag *)(mtag+1))->cookie; +} +static __inline u_int16_t +divert_find_cookie(struct mbuf *m) +{ + struct m_tag *mtag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_DIVERT, NULL); + return mtag ? divert_cookie(mtag) : 0; +} + +/* + * Return the divert info associated with the mbuf, if any. + */ +static __inline u_int32_t +divert_info(struct m_tag *mtag) +{ + return ((struct divert_tag *)(mtag+1))->info; +} +static __inline u_int32_t +divert_find_info(struct mbuf *m) +{ + struct m_tag *mtag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_DIVERT, NULL); + return mtag ? divert_info(mtag) : 0; +} + +extern void div_init(void); +extern void div_input(struct mbuf *, int); +lck_mtx_t * + div_getlock(struct socket *, int ); +int div_unlock(struct socket *, int, int); +int div_lock(struct socket *, int , int ); +extern void divert_packet(struct mbuf *m, int incoming, int port, int rule); +extern struct pr_usrreqs div_usrreqs; + +#endif /* IPDIVERT */ +#endif /* _NETINET_IP_DIVERT_H_ */ diff --git a/bsd/netinet/ip_dummynet.c b/bsd/netinet/ip_dummynet.c index 0979e45d3..013ccbf43 100644 --- a/bsd/netinet/ip_dummynet.c +++ b/bsd/netinet/ip_dummynet.c @@ -19,7 +19,8 @@ * * @APPLE_LICENSE_HEADER_END@ */ - * Copyright (c) 1998-2001 Luigi Rizzo, Universita` di Pisa +/* + * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa * Portions Copyright (c) 2000 Akamba Corp. * All rights reserved * @@ -44,11 +45,10 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.24.2.11 2001/02/09 23:18:08 luigi Exp $ + * $FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.84 2004/08/25 09:31:30 pjd Exp $ */ -#define DEB(x) -#define DDB(x) x +#define DUMMYNET_DEBUG /* * This module implements IP dummynet, a bandwidth limiter/delay emulator @@ -60,10 +60,7 @@ * + scheduler and dummynet functions; * + configuration and initialization. * - * NOTA BENE: critical sections are protected by splimp()/splx() - * pairs. One would think that splnet() is enough as for most of - * the netinet code, but it is not so because when used with - * bridging, dummynet is invoked at splimp(). + * NOTA BENE: critical sections are protected by the "dummynet lock".
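+ * (The "dummynet lock" is the dn_mutex allocated in ip_dn_init() below;
+ * it serializes all_pipes, all_flow_sets, the three scheduling heaps
+ * and curr_time, and is deliberately dropped around the calls back into
+ * ip_output()/proto_inject() in transmit_event() to avoid lock-order
+ * reversals.)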
* * Most important Changes: * @@ -88,6 +85,7 @@ #include <sys/sysctl.h> #include <net/if.h> #include <net/route.h> +#include <net/kpi_protocol.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> @@ -140,7 +138,7 @@ static void ready_event(struct dn_flow_queue *q); static struct dn_pipe *all_pipes = NULL ; /* list of all pipes */ static struct dn_flow_set *all_flow_sets = NULL ;/* list of all flow_sets */ -#if SYSCTL_NODE +#ifdef SYSCTL_NODE SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size, @@ -168,30 +166,35 @@ SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size"); #endif +#ifdef DUMMYNET_DEBUG +int dummynet_debug = 0; +#ifdef SYSCTL_NODE +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug, + 0, "control debugging printfs"); +#endif +#define DPRINTF(X) if (dummynet_debug) printf X +#else +#define DPRINTF(X) +#endif + +/* dummynet lock */ +lck_grp_t *dn_mutex_grp; +lck_grp_attr_t *dn_mutex_grp_attr; +lck_attr_t *dn_mutex_attr; +lck_mtx_t *dn_mutex; + static int config_pipe(struct dn_pipe *p); static int ip_dn_ctl(struct sockopt *sopt); -static void rt_unref(struct rtentry *); static void dummynet(void *); static void dummynet_flush(void); void dummynet_drain(void); -int if_tx_rdy(struct ifnet *ifp); +static ip_dn_io_t dummynet_io; +static void dn_rule_delete(void *); -/* - * ip_fw_chain is used when deleting a pipe, because ipfw rules can - * hold references to the pipe. - */ -extern LIST_HEAD (ip_fw_head, ip_fw_chain) ip_fw_chain_head; +int if_tx_rdy(struct ifnet *ifp); -static void -rt_unref(struct rtentry *rt) -{ - if (rt == NULL) - return ; - if (rt->rt_refcnt <= 0) - printf("-- warning, refcnt now %ld, decreasing\n", rt->rt_refcnt); - rtfree(rt); -} +extern lck_mtx_t *rt_mtx; /* route global lock */ /* * Heap management functions. 
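The dn_heap reworked in the hunks below is a plain binary min-heap ordered by a 64-bit dn_key; dummynet keeps pipes and queues in these heaps sorted by the time of their next event. A minimal user-space sketch of the same insert and extract-min logic, for orientation only (heap_push, heap_pop, HEAP_MAX and the fixed-size array are illustrative names, not the kernel's):

#include <stdint.h>

typedef uint64_t dn_key;
#define DN_KEY_LT(a, b)	((int64_t)((a) - (b)) < 0)	/* wraparound-safe compare */
#define HEAP_LEFT(x)	(2*(x) + 1)
#define HEAP_MAX	64	/* the kernel grows its array with heap_init() instead */

struct heap_entry { dn_key key; void *object; };
struct heap { int elements; struct heap_entry p[HEAP_MAX]; };

static void
heap_push(struct heap *h, dn_key key, void *obj)	/* assumes elements < HEAP_MAX */
{
	int son = h->elements++;

	h->p[son].key = key;
	h->p[son].object = obj;
	while (son > 0) {	/* bubble up while not larger than the parent */
		int father = (son - 1) / 2;
		struct heap_entry tmp;

		if (DN_KEY_LT(h->p[father].key, h->p[son].key))
			break;
		tmp = h->p[son];
		h->p[son] = h->p[father];
		h->p[father] = tmp;
		son = father;
	}
}

static void *
heap_pop(struct heap *h)	/* extract-min; assumes elements > 0 */
{
	void *top = h->p[0].object;
	int n = --h->elements;
	int father = 0;

	h->p[0] = h->p[n];	/* move the last entry to the root... */
	for (;;) {		/* ...and sift it down */
		int child = HEAP_LEFT(father);
		struct heap_entry tmp;

		if (child >= n)
			break;
		if (child + 1 < n && DN_KEY_LT(h->p[child+1].key, h->p[child].key))
			child++;	/* take the smaller of the two children */
		if (!DN_KEY_LT(h->p[child].key, h->p[father].key))
			break;
		tmp = h->p[father];
		h->p[father] = h->p[child];
		h->p[child] = tmp;
		father = child;
	}
	return top;
}

The kernel version additionally records each object's array index at a configurable offset inside the object (SET_OFFSET/RESET_OFFSET) so heap_extract() can also remove an entry from the middle of the heap, which the WF2Q+ idle_heap relies on.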
@@ -214,23 +217,23 @@ rt_unref(struct rtentry *rt) static int heap_init(struct dn_heap *h, int new_size) -{ +{ struct dn_heap_entry *p; if (h->size >= new_size ) { - printf("heap_init, Bogus call, have %d want %d\n", + printf("dummynet: heap_init, Bogus call, have %d want %d\n", h->size, new_size); return 0 ; - } + } new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ; - p = _MALLOC(new_size * sizeof(*p), M_IPFW, M_DONTWAIT ); + p = _MALLOC(new_size * sizeof(*p), M_DUMMYNET, M_DONTWAIT ); if (p == NULL) { - printf(" heap_init, resize %d failed\n", new_size ); + printf("dummynet: heap_init, resize %d failed\n", new_size ); return 1 ; /* error */ } if (h->size > 0) { bcopy(h->p, p, h->size * sizeof(*p) ); - FREE(h->p, M_IPFW); + FREE(h->p, M_DUMMYNET); } h->p = p ; h->size = new_size ; @@ -258,7 +261,7 @@ heap_init(struct dn_heap *h, int new_size) *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ; static int heap_insert(struct dn_heap *h, dn_key key1, void *p) -{ +{ int son = h->elements ; if (p == NULL) /* data already there, set starting point */ @@ -277,7 +280,7 @@ heap_insert(struct dn_heap *h, dn_key key1, void *p) struct dn_heap_entry tmp ; if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) - break ; /* found right position */ + break ; /* found right position */ /* son smaller than father, swap and repeat */ HEAP_SWAP(h->p[son], h->p[father], tmp) ; SET_OFFSET(h, son); @@ -292,22 +295,22 @@ heap_insert(struct dn_heap *h, dn_key key1, void *p) */ static void heap_extract(struct dn_heap *h, void *obj) -{ +{ int child, father, max = h->elements - 1 ; if (max < 0) { - printf("warning, extract from empty heap 0x%p\n", h); + printf("dummynet: warning, extract from empty heap 0x%p\n", h); return ; } father = 0 ; /* default: move up smallest child */ if (obj != NULL) { /* extract specific element, index is at offset */ if (h->offset <= 0) - panic("*** heap_extract from middle not supported on this heap!!!\n"); + panic("dummynet: heap_extract from middle not supported on this heap!!!\n"); father = *((int *)((char *)obj + h->offset)) ; if (father < 0 || father >= h->elements) { printf("dummynet: heap_extract, father %d out of bound 0..%d\n", father, h->elements); - panic("heap_extract"); + panic("dummynet: heap_extract"); } } RESET_OFFSET(h, father); @@ -319,7 +322,7 @@ heap_extract(struct dn_heap *h, void *obj) SET_OFFSET(h, father); father = child ; child = HEAP_LEFT(child) ; /* left child for next loop */ - } + } h->elements-- ; if (father != max) { /* @@ -328,7 +331,7 @@ heap_extract(struct dn_heap *h, void *obj) h->p[father] = h->p[max] ; heap_insert(h, father, NULL); /* this one cannot fail */ } -} +} #if 0 /* @@ -391,7 +394,7 @@ static void heap_free(struct dn_heap *h) { if (h->size >0 ) - FREE(h->p, M_IPFW); + FREE(h->p, M_DUMMYNET); bzero(h, sizeof(*h) ); } @@ -399,6 +402,23 @@ heap_free(struct dn_heap *h) * --- end of heap management functions --- */ +/* + * Return the mbuf tag holding the dummynet state. As an optimization + * this is assumed to be the first tag on the list. If this turns out + * wrong we'll need to search the list. 
+ */ +static struct dn_pkt_tag * +dn_tag_get(struct mbuf *m) +{ + struct m_tag *mtag = m_tag_first(m); +/* KASSERT(mtag != NULL && + mtag->m_tag_id == KERNEL_MODULE_TAG_ID && + mtag->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET, + ("packet on dummynet queue w/o dummynet tag!")); +*/ + return (struct dn_pkt_tag *)(mtag+1); +} + /* * Scheduler functions: * @@ -420,76 +440,78 @@ heap_free(struct dn_heap *h) static void transmit_event(struct dn_pipe *pipe) { - struct dn_pkt *pkt ; - - while ( (pkt = pipe->head) && DN_KEY_LEQ(pkt->output_time, curr_time) ) { - /* - * first unlink, then call procedures, since ip_input() can invoke - * ip_output() and viceversa, thus causing nested calls - */ - pipe->head = DN_NEXT(pkt) ; - - /* - * The actual mbuf is preceded by a struct dn_pkt, resembling an mbuf - * (NOT A REAL one, just a small block of malloc'ed memory) with - * m_type = MT_DUMMYNET - * m_next = actual mbuf to be processed by ip_input/output - * m_data = the matching rule - * and some other fields. - * The block IS FREED HERE because it contains parameters passed - * to the called routine. - */ - switch (pkt->dn_dir) { - case DN_TO_IP_OUT: - (void)ip_output((struct mbuf *)pkt, NULL, NULL, 0, NULL); - rt_unref (pkt->ro.ro_rt) ; - break ; - - case DN_TO_IP_IN : - ip_input((struct mbuf *)pkt) ; - break ; - + struct mbuf *m ; + struct dn_pkt_tag *pkt ; + struct ip *ip; + + lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED); + + while ( (m = pipe->head) ) { + pkt = dn_tag_get(m); + if ( !DN_KEY_LEQ(pkt->output_time, curr_time) ) + break; + /* + * first unlink, then call procedures, since ip_input() can invoke + * ip_output() and vice versa, thus causing nested calls + */ + pipe->head = m->m_nextpkt ; + m->m_nextpkt = NULL; + + /* XXX: drop the lock for now to avoid LOR's */ + lck_mtx_unlock(dn_mutex); + switch (pkt->dn_dir) { + case DN_TO_IP_OUT: { + struct route tmp_rt = pkt->ro; + (void)ip_output(m, NULL, NULL, pkt->flags, NULL); + if (tmp_rt.ro_rt) { + rtfree(tmp_rt.ro_rt); + } + break ; + } + case DN_TO_IP_IN : + ip = mtod(m, struct ip *); + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + proto_inject(PF_INET, m); + break ; + #if BRIDGE - case DN_TO_BDG_FWD : { - struct mbuf *m = (struct mbuf *)pkt ; - struct ether_header *eh; - - if (pkt->dn_m->m_len < ETHER_HDR_LEN - && (pkt->dn_m = m_pullup(pkt->dn_m, ETHER_HDR_LEN)) == NULL) { - printf("dummynet/bridge: pullup fail, dropping pkt\n"); - break; - } - /* - * same as ether_input, make eh be a pointer into the mbuf - */ - eh = mtod(pkt->dn_m, struct ether_header *); - m_adj(pkt->dn_m, ETHER_HDR_LEN); - /* - * bdg_forward() wants a pointer to the pseudo-mbuf-header, but - * on return it will supply the pointer to the actual packet - * (originally pkt->dn_m, but could be something else now) if - * it has not consumed it. - */ - m = bdg_forward(m, eh, pkt->ifp); - if (m) - m_freem(m); - } - break ; -#endif - - default: - printf("dummynet: bad switch %d!\n", pkt->dn_dir); - m_freem(pkt->dn_m); - break ; - } - FREE(pkt, M_IPFW); + case DN_TO_BDG_FWD : + /* + * The bridge requires/assumes the Ethernet header is + * contiguous in the first mbuf header. Ensure this is true. + */ + if (BDG_LOADED) { + if (m->m_len < ETHER_HDR_LEN && + (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { + printf("dummynet/bridge: pullup fail, dropping pkt\n"); + break; + } + m = bdg_forward_ptr(m, pkt->ifp); + } else { + /* somebody unloaded the bridge module. 
Drop pkt */ + /* XXX rate limit */ + printf("dummynet: dropping bridged packet trapped in pipe\n"); + } + if (m) + m_freem(m); + break; +#endif + default: + printf("dummynet: bad switch %d!\n", pkt->dn_dir); + m_freem(m); + break ; + } + lck_mtx_lock(dn_mutex); } /* if there are leftover packets, put into the heap for next event */ - if ( (pkt = pipe->head) ) - heap_insert(&extract_heap, pkt->output_time, pipe ) ; - /* XXX should check errors on heap_insert, by draining the - * whole pipe p and hoping in the future we are more successful - */ + if ( (m = pipe->head) ) { + pkt = dn_tag_get(m); + /* XXX should check errors on heap_insert, by draining the + * whole pipe p and hoping in the future we are more successful + */ + heap_insert(&extract_heap, pkt->output_time, pipe); + } } /* @@ -497,8 +519,8 @@ transmit_event(struct dn_pipe *pipe) * before being able to transmit a packet. The credit is taken from * either a pipe (WF2Q) or a flow_queue (per-flow queueing) */ -#define SET_TICKS(pkt, q, p) \ - (pkt->dn_m->m_pkthdr.len*8*hz - (q)->numbytes + p->bandwidth - 1 ) / \ +#define SET_TICKS(_m, q, p) \ + ((_m)->m_pkthdr.len*8*hz - (q)->numbytes + p->bandwidth - 1 ) / \ p->bandwidth ; /* @@ -506,21 +528,23 @@ transmit_event(struct dn_pipe *pipe) * and put into delay line (p_queue) */ static void -move_pkt(struct dn_pkt *pkt, struct dn_flow_queue *q, +move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, struct dn_pipe *p, int len) { - q->head = DN_NEXT(pkt) ; + struct dn_pkt_tag *dt = dn_tag_get(pkt); + + q->head = pkt->m_nextpkt ; q->len-- ; q->len_bytes -= len ; - pkt->output_time = curr_time + p->delay ; + dt->output_time = curr_time + p->delay ; if (p->head == NULL) p->head = pkt; else - DN_NEXT(p->tail) = pkt; + p->tail->m_nextpkt = pkt; p->tail = pkt; - DN_NEXT(p->tail) = NULL; + p->tail->m_nextpkt = NULL; } /* @@ -533,12 +557,14 @@ move_pkt(struct dn_pkt *pkt, struct dn_flow_queue *q, static void ready_event(struct dn_flow_queue *q) { - struct dn_pkt *pkt; + struct mbuf *pkt; struct dn_pipe *p = q->fs->pipe ; int p_was_empty ; + lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED); + if (p == NULL) { - printf("ready_event- pipe is gone\n"); + printf("dummynet: ready_event- pipe is gone\n"); return ; } p_was_empty = (p->head == NULL) ; @@ -553,7 +579,7 @@ ready_event(struct dn_flow_queue *q) */ q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth; while ( (pkt = q->head) != NULL ) { - int len = pkt->dn_m->m_pkthdr.len; + int len = pkt->m_pkthdr.len; int len_scaled = p->bandwidth ? len*8*hz : 0 ; if (len_scaled > q->numbytes ) break ; @@ -574,8 +600,10 @@ ready_event(struct dn_flow_queue *q) /* XXX should check errors on heap_insert, and drain the whole * queue on error hoping next time we are luckier. */ - } else /* RED needs to know when the queue becomes empty */ + } else { /* RED needs to know when the queue becomes empty */ q->q_time = curr_time; + q->numbytes = 0; + } /* * If the delay line was empty call transmit_event(p) now. * Otherwise, the scheduler will take care of it. 
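The fixed-rate path above is driven by a simple credit scheme: an idle queue accrues q->numbytes at the link bandwidth, and a packet of len bytes may leave once len*8*hz credit units are available. A self-contained sketch of the same arithmetic, mirroring the SET_TICKS() macro (credit_accrued, ticks_until_ready and the HZ constant are illustrative, not kernel names):

#include <stdint.h>

#define HZ 100	/* assumed scheduler tick rate; the kernel uses the global hz */

/* credit earned while the queue was idle from sched_time to curr_time */
static int64_t
credit_accrued(int64_t numbytes, uint64_t curr_time, uint64_t sched_time,
    int bandwidth /* bits per second */)
{
	return numbytes + (int64_t)(curr_time - sched_time) * bandwidth;
}

/*
 * A packet of len bytes costs len*8*HZ credit units.  If credit is short,
 * return the number of whole ticks to wait before it can go out -- the
 * same rounding-up division that SET_TICKS() performs.
 */
static int64_t
ticks_until_ready(int len, int64_t numbytes, int bandwidth)
{
	int64_t len_scaled = (int64_t)len * 8 * HZ;

	if (bandwidth == 0 || len_scaled <= numbytes)
		return 0;	/* unlimited bandwidth, or enough credit already */
	return (len_scaled - numbytes + bandwidth - 1) / bandwidth;
}

For example, at 1 Mbit/s and a tick rate of 100, a 1500-byte packet with no accumulated credit costs 1500*8*100 = 1,200,000 units and therefore waits ceil(1,200,000/1,000,000) = 2 ticks.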
@@ -599,14 +627,16 @@ ready_event_wfq(struct dn_pipe *p) struct dn_heap *sch = &(p->scheduler_heap); struct dn_heap *neh = &(p->not_eligible_heap) ; + lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED); + if (p->if_name[0] == 0) /* tx clock is simulated */ p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth; else { /* tx clock is for real, the ifq must be empty or this is a NOP */ if (p->ifp && p->ifp->if_snd.ifq_head != NULL) return ; else { - DEB(printf("pipe %d ready from %s --\n", - p->pipe_nr, p->if_name);) + DPRINTF(("dummynet: pipe %d ready from %s --\n", + p->pipe_nr, p->if_name)); } } @@ -617,9 +647,9 @@ ready_event_wfq(struct dn_pipe *p) while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) { if (sch->elements > 0) { /* have some eligible pkts to send out */ struct dn_flow_queue *q = sch->p[0].object ; - struct dn_pkt *pkt = q->head; - struct dn_flow_set *fs = q->fs; - u_int64_t len = pkt->dn_m->m_pkthdr.len; + struct mbuf *pkt = q->head; + struct dn_flow_set *fs = q->fs; + u_int64_t len = pkt->m_pkthdr.len; int len_scaled = p->bandwidth ? len*8*hz : 0 ; heap_extract(sch, NULL); /* remove queue from heap */ @@ -636,7 +666,7 @@ ready_event_wfq(struct dn_pipe *p) * update F and position in backlogged queue, then * put flow in not_eligible_heap (we will fix this later). */ - len = (q->head)->dn_m->m_pkthdr.len; + len = (q->head)->m_pkthdr.len; q->F += (len<<MY_M)/(u_int64_t) fs->weight ; if (DN_KEY_LEQ(q->S, p->V)) heap_insert(neh, q->S, q); @@ -691,7 +721,7 @@ ready_event_wfq(struct dn_pipe *p) if (p->bandwidth > 0) t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ; - p->tail->output_time += t ; + dn_tag_get(p->tail)->output_time += t ; p->sched_time = curr_time ; heap_insert(&wfq_ready_heap, curr_time + t, (void *)p); /* XXX should check errors on heap_insert, and drain the whole @@ -715,7 +745,6 @@ dummynet(void * __unused unused) { void *p ; /* generic parameter to handler */ struct dn_heap *h ; - int s ; struct dn_heap *heaps[3]; int i; struct dn_pipe *pe ; @@ -723,14 +752,16 @@ dummynet(void * __unused unused) heaps[0] = &ready_heap ; /* fixed-rate queues */ heaps[1] = &wfq_ready_heap ; /* wfq queues */ heaps[2] = &extract_heap ; /* delay line */ - s = splimp(); /* see note on top, splnet() is not enough */ + + lck_mtx_lock(dn_mutex); + curr_time++ ; for (i=0; i < 3 ; i++) { h = heaps[i]; while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) { - DDB(if (h->p[0].key > curr_time) - printf("-- dummynet: warning, heap %d is %d ticks late\n", - i, (int)(curr_time - h->p[0].key));) + if (h->p[0].key > curr_time) + printf("dummynet: warning, heap %d is %d ticks late\n", + i, (int)(curr_time - h->p[0].key)); p = h->p[0].object ; /* store a copy before heap_extract */ heap_extract(h, NULL); /* need to extract before processing */ if (i == 0) @@ -738,7 +769,7 @@ dummynet(void * __unused unused) else if (i == 1) { struct dn_pipe *pipe = p; if (pipe->if_name[0] != '\0') - printf("*** bad ready_event_wfq for pipe %s\n", + printf("dummynet: bad ready_event_wfq for pipe %s\n", pipe->if_name); else ready_event_wfq(p) ; @@ -756,7 +787,9 @@ dummynet(void * __unused unused) q->S = q->F + 1 ; /* mark timestamp as invalid */ pe->sum -= q->fs->weight ; } - splx(s); + + lck_mtx_unlock(dn_mutex); + timeout(dummynet, NULL, 1); } @@ -768,6 +801,7 @@ if_tx_rdy(struct ifnet *ifp) { struct dn_pipe *p; + lck_mtx_lock(dn_mutex); for (p = all_pipes; p ; p = p->next ) if (p->ifp == ifp) break ; @@ -777,16 +811,18 @@ if_tx_rdy(struct ifnet *ifp) for (p = all_pipes; p ; p = p->next 
) if (!strcmp(p->if_name, buf) ) { p->ifp = ifp ; - DEB(printf("++ tx rdy from %s (now found)\n", buf);) + DPRINTF(("dummynet: ++ tx rdy from %s (now found)\n", buf)); break ; } } if (p != NULL) { - DEB(printf("++ tx rdy from %s%d - qlen %d\n", ifp->if_name, - ifp->if_unit, ifp->if_snd.ifq_len);) + DPRINTF(("dummynet: ++ tx rdy from %s%d - qlen %d\n", ifp->if_name, + ifp->if_unit, ifp->if_snd.ifq_len)); p->numbytes = 0 ; /* mark ready for I/O */ ready_event_wfq(p); } + lck_mtx_unlock(dn_mutex); + return 0; } @@ -799,10 +835,13 @@ expire_queues(struct dn_flow_set *fs) { struct dn_flow_queue *q, *prev ; int i, initial_elements = fs->rq_elements ; + struct timeval timenow; - if (fs->last_expired == time_second) + getmicrotime(&timenow); + + if (fs->last_expired == timenow.tv_sec) return 0 ; - fs->last_expired = time_second ; + fs->last_expired = timenow.tv_sec ; for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */ for (prev=NULL, q = fs->rq[i] ; q != NULL ; ) if (q->head != NULL || q->S != q->F+1) { @@ -816,7 +855,7 @@ expire_queues(struct dn_flow_set *fs) else fs->rq[i] = q = q->next ; fs->rq_elements-- ; - FREE(old_q, M_IPFW); + FREE(old_q, M_DUMMYNET); } return initial_elements - fs->rq_elements ; } @@ -839,12 +878,11 @@ create_queue(struct dn_flow_set *fs, int i) if ( fs->rq[i] != NULL ) return fs->rq[i] ; } - q = _MALLOC(sizeof(*q), M_IPFW, M_DONTWAIT) ; + q = _MALLOC(sizeof(*q), M_DUMMYNET, M_DONTWAIT | M_ZERO); if (q == NULL) { - printf("sorry, cannot allocate queue for new flow\n"); + printf("dummynet: sorry, cannot allocate queue for new flow\n"); return NULL ; } - bzero(q, sizeof(*q) ); /* needed */ q->fs = fs ; q->hash_slot = i ; q->next = fs->rq[i] ; @@ -860,7 +898,7 @@ create_queue(struct dn_flow_set *fs, int i) * so that further searches take less time. 
*/ static struct dn_flow_queue * -find_queue(struct dn_flow_set *fs) +find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id) { int i = 0 ; /* we need i and q for new allocations */ struct dn_flow_queue *q, *prev; @@ -869,25 +907,30 @@ find_queue(struct dn_flow_set *fs) q = fs->rq[0] ; else { /* first, do the masking */ - last_pkt.dst_ip &= fs->flow_mask.dst_ip ; - last_pkt.src_ip &= fs->flow_mask.src_ip ; - last_pkt.dst_port &= fs->flow_mask.dst_port ; - last_pkt.src_port &= fs->flow_mask.src_port ; - last_pkt.proto &= fs->flow_mask.proto ; - last_pkt.flags = 0 ; /* we don't care about this one */ + id->dst_ip &= fs->flow_mask.dst_ip ; + id->src_ip &= fs->flow_mask.src_ip ; + id->dst_port &= fs->flow_mask.dst_port ; + id->src_port &= fs->flow_mask.src_port ; + id->proto &= fs->flow_mask.proto ; + id->flags = 0 ; /* we don't care about this one */ /* then, hash function */ - i = ( (last_pkt.dst_ip) & 0xffff ) ^ - ( (last_pkt.dst_ip >> 15) & 0xffff ) ^ - ( (last_pkt.src_ip << 1) & 0xffff ) ^ - ( (last_pkt.src_ip >> 16 ) & 0xffff ) ^ - (last_pkt.dst_port << 1) ^ (last_pkt.src_port) ^ - (last_pkt.proto ); + i = ( (id->dst_ip) & 0xffff ) ^ + ( (id->dst_ip >> 15) & 0xffff ) ^ + ( (id->src_ip << 1) & 0xffff ) ^ + ( (id->src_ip >> 16 ) & 0xffff ) ^ + (id->dst_port << 1) ^ (id->src_port) ^ + (id->proto ); i = i % fs->rq_size ; /* finally, scan the current list for a match */ searches++ ; for (prev=NULL, q = fs->rq[i] ; q ; ) { search_steps++; - if (bcmp(&last_pkt, &(q->id), sizeof(q->id) ) == 0) + if (id->dst_ip == q->id.dst_ip && + id->src_ip == q->id.src_ip && + id->dst_port == q->id.dst_port && + id->src_port == q->id.src_port && + id->proto == q->id.proto && + id->flags == q->id.flags) break ; /* found */ else if (pipe_expire && q->head == NULL && q->S == q->F+1 ) { /* entry is idle and not in any heap, expire it */ @@ -898,7 +941,7 @@ find_queue(struct dn_flow_set *fs) else fs->rq[i] = q = q->next ; fs->rq_elements-- ; - FREE(old_q, M_IPFW); + FREE(old_q, M_DUMMYNET); continue ; } prev = q ; @@ -913,7 +956,7 @@ find_queue(struct dn_flow_set *fs) if (q == NULL) { /* no match, need to allocate a new entry */ q = create_queue(fs, i); if (q != NULL) - q->id = last_pkt ; + q->id = *id ; } return q ; } @@ -923,27 +966,27 @@ red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len) { /* * RED algorithm - * + * * RED calculates the average queue size (avg) using a low-pass filter * with an exponential weighted (w_q) moving average: * avg <- (1-w_q) * avg + w_q * q_size * where q_size is the queue length (measured in bytes or * packets). - * + * * If q_size == 0, we compute the idle time for the link, and set * avg = (1 - w_q)^(idle/s) * where s is the time needed for transmitting a medium-sized packet. - * + * * Now, if avg < min_th the packet is enqueued. * If avg > max_th the packet is dropped. Otherwise, the packet is * dropped with probability P function of avg. - * + * */ int64_t p_b = 0; /* queue in bytes or packets ? */ u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len; - DEB(printf("\n%d q: %2u ", (int) curr_time, q_size);) + DPRINTF(("\ndummynet: %d q: %2u ", (int) curr_time, q_size)); /* average queue size estimation */ if (q_size != 0) { @@ -969,7 +1012,7 @@ red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len) SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; } } - DEB(printf("avg: %u ", SCALE_VAL(q->avg));) + DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg))); /* should i drop ? 
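 *
 * Illustrative numbers, not from the code: with the SCALE_RED fixed
 * point used above, c_1 = max_p/(max_th - min_th) and c_2 = c_1*min_th,
 * so for avg between the two thresholds
 *
 *	p_b = c_1*avg - c_2 = max_p * (avg - min_th) / (max_th - min_th)
 *
 * e.g. max_p = 0.1, min_th = 5 and max_th = 15 give p_b = 0.05 at
 * avg = 10.  The code below then drops once p_b * q->count exceeds the
 * cached 16-bit random value, so the drop chance grows with the number
 * of packets enqueued since the last drop, and a fresh random value is
 * drawn after every drop.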
*/ @@ -988,7 +1031,7 @@ red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len) p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4; } else { q->count = -1; - printf("- drop"); + DPRINTF(("dummynet: - drop")); return 1 ; } } else if (q->avg > fs->min_th) { @@ -1010,7 +1053,7 @@ red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len) */ if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) { q->count = 0; - DEB(printf("- red drop");) + DPRINTF(("dummynet: - red drop")); /* after a drop we calculate a new random value */ q->random = random() & 0xffff; return 1; /* drop */ @@ -1022,51 +1065,83 @@ red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len) static __inline struct dn_flow_set * -locate_flowset(int pipe_nr, struct ip_fw_chain *rule) +locate_flowset(int pipe_nr, struct ip_fw *rule) { - struct dn_flow_set *fs = NULL ; + struct dn_flow_set *fs; + ipfw_insn *cmd = rule->cmd + rule->act_ofs; - if ( (rule->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_QUEUE ) - for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next) - ; + if (cmd->opcode == O_LOG) + cmd += F_LEN(cmd); + + bcopy(& ((ipfw_insn_pipe *)cmd)->pipe_ptr, &fs, sizeof(fs)); + + if (fs != NULL) + return fs; + + if (cmd->opcode == O_QUEUE) { + for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next) + ; + } else { - struct dn_pipe *p1; - for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next) - ; - if (p1 != NULL) - fs = &(p1->fs) ; + struct dn_pipe *p1; + for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next) + ; + if (p1 != NULL) + fs = &(p1->fs) ; } - if (fs != NULL) - rule->rule->pipe_ptr = fs ; /* record for the future */ + /* record for the future */ + bcopy(&fs, & ((ipfw_insn_pipe *)cmd)->pipe_ptr, sizeof(fs)); + return fs ; } /* * dummynet hook for packets. Below 'pipe' is a pipe or a queue * depending on whether WF2Q or fixed bw is used. + * + * pipe_nr pipe or queue the packet is destined for. + * dir where shall we send the packet after dummynet. + * m the mbuf with the packet + * ifp the 'ifp' parameter from the caller. + * NULL in ip_input, destination interface in ip_output, + * real_dst in bdg_forward + * ro route parameter (only used in ip_output, NULL otherwise) + * dst destination address, only used by ip_output + * rule matching rule, in case of multiple passes + * flags flags from the caller, only used in ip_output + * */ -int -dummynet_io(int pipe_nr, int dir, /* pipe_nr can also be a fs_nr */ - struct mbuf *m, struct ifnet *ifp, struct route *ro, - struct sockaddr_in *dst, - struct ip_fw_chain *rule, int flags) +static int +dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) { - struct dn_pkt *pkt; + struct dn_pkt_tag *pkt; + struct m_tag *mtag; struct dn_flow_set *fs; struct dn_pipe *pipe ; u_int64_t len = m->m_pkthdr.len ; struct dn_flow_queue *q = NULL ; - int s ; - - s = splimp(); + int is_pipe; + +#if IPFW2 + ipfw_insn *cmd = fwa->rule->cmd + fwa->rule->act_ofs; + + if (cmd->opcode == O_LOG) + cmd += F_LEN(cmd); + is_pipe = (cmd->opcode == O_PIPE); +#else + is_pipe = (fwa->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE; +#endif pipe_nr &= 0xffff ; - if ( (fs = rule->rule->pipe_ptr) == NULL ) { - fs = locate_flowset(pipe_nr, rule); - if (fs == NULL) - goto dropit ; /* this queue/pipe does not exist! */ - } + lck_mtx_lock(dn_mutex); + + /* + * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule. 
+ */ + fs = locate_flowset(pipe_nr, fwa->rule); + if (fs == NULL) + goto dropit ; /* this queue/pipe does not exist! */ pipe = fs->pipe ; if (pipe == NULL) { /* must be a queue, try find a matching pipe */ for (pipe = all_pipes; pipe && pipe->pipe_nr != fs->parent_nr; @@ -1075,12 +1150,12 @@ dummynet_io(int pipe_nr, int dir, /* pipe_nr can also be a fs_nr */ if (pipe != NULL) fs->pipe = pipe ; else { - printf("No pipe %d for queue %d, drop pkt\n", + printf("dummynet: no pipe %d for queue %d, drop pkt\n", fs->parent_nr, fs->fs_nr); goto dropit ; } } - q = find_queue(fs); + q = find_queue(fs, &(fwa->f_id)); if ( q == NULL ) goto dropit ; /* cannot allocate queue */ /* @@ -1100,56 +1175,59 @@ dummynet_io(int pipe_nr, int dir, /* pipe_nr can also be a fs_nr */ if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) ) goto dropit ; - pkt = (struct dn_pkt *)_MALLOC(sizeof (*pkt), M_IPFW, M_NOWAIT) ; - if ( pkt == NULL ) - goto dropit ; /* cannot allocate packet header */ + /* XXX expensive to zero, see if we can remove it*/ + mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, + sizeof(struct dn_pkt_tag), M_NOWAIT|M_ZERO); + if ( mtag == NULL ) + goto dropit ; /* cannot allocate packet header */ + m_tag_prepend(m, mtag); /* attach to mbuf chain */ + + pkt = (struct dn_pkt_tag *)(mtag+1); /* ok, i can handle the pkt now... */ - bzero(pkt, sizeof(*pkt) ); /* XXX expensive, see if we can remove it*/ /* build and enqueue packet + parameters */ - pkt->hdr.mh_type = MT_DUMMYNET ; - (struct ip_fw_chain *)pkt->hdr.mh_data = rule ; - DN_NEXT(pkt) = NULL; - pkt->dn_m = m; + pkt->rule = fwa->rule ; pkt->dn_dir = dir ; - pkt->ifp = ifp; + pkt->ifp = fwa->oif; if (dir == DN_TO_IP_OUT) { /* * We need to copy *ro because for ICMP pkts (and maybe others) * the caller passed a pointer into the stack; dst might also be * a pointer into *ro so it needs to be updated. */ - pkt->ro = *ro; - if (ro->ro_rt) - rtref(ro->ro_rt); - if (dst == (struct sockaddr_in *)&ro->ro_dst) /* dst points into ro */ - dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ; - - pkt->dn_dst = dst; - pkt->flags = flags ; - } + lck_mtx_lock(rt_mtx); + pkt->ro = *(fwa->ro); + if (fwa->ro->ro_rt) + fwa->ro->ro_rt->rt_refcnt++ ; + if (fwa->dst == (struct sockaddr_in *)&fwa->ro->ro_dst) /* dst points into ro */ + fwa->dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ; + lck_mtx_unlock(rt_mtx); + + pkt->dn_dst = fwa->dst; + pkt->flags = fwa->flags; + } if (q->head == NULL) - q->head = pkt; + q->head = m; else - DN_NEXT(q->tail) = pkt; - q->tail = pkt; + q->tail->m_nextpkt = m; + q->tail = m; q->len++; q->len_bytes += len ; - if ( q->head != pkt ) /* flow was not idle, we are done */ + if ( q->head != m ) /* flow was not idle, we are done */ goto done; /* * If we reach this point the flow was previously idle, so we need * to schedule it. This involves different actions for fixed-rate or * WF2Q queues. */ - if ( (rule->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE ) { + if (is_pipe) { /* * Fixed-rate queue: just insert into the ready_heap. */ dn_key t = 0 ; - if (pipe->bandwidth) - t = SET_TICKS(pkt, q, pipe); + if (pipe->bandwidth) + t = SET_TICKS(m, q, pipe); q->sched_time = curr_time ; if (t == 0) /* must process it now */ ready_event( q ); @@ -1193,42 +1271,46 @@ dummynet_io(int pipe_nr, int dir, /* pipe_nr can also be a fs_nr */ */ if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */ if (pipe->scheduler_heap.elements == 0) - printf("++ ouch! not eligible but empty scheduler!\n"); + printf("dummynet: ++ ouch! 
not eligible but empty scheduler!\n"); heap_insert(&(pipe->not_eligible_heap), q->S, q); } else { heap_insert(&(pipe->scheduler_heap), q->F, q); if (pipe->numbytes >= 0) { /* pipe is idle */ if (pipe->scheduler_heap.elements != 1) - printf("*** OUCH! pipe should have been idle!\n"); - DEB(printf("Waking up pipe %d at %d\n", - pipe->pipe_nr, (int)(q->F >> MY_M)); ) + printf("dummynet: OUCH! pipe should have been idle!\n"); + DPRINTF(("dummynet: waking up pipe %d at %d\n", + pipe->pipe_nr, (int)(q->F >> MY_M))); pipe->sched_time = curr_time ; ready_event_wfq(pipe); } } } done: - splx(s); + lck_mtx_unlock(dn_mutex); return 0; dropit: - splx(s); if (q) q->drops++ ; + lck_mtx_unlock(dn_mutex); m_freem(m); - return ENOBUFS ; + return ( (fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS); } /* - * Below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT) + * Below, the rtfree is only needed when (pkt->dn_dir == DN_TO_IP_OUT) * Doing this would probably save us the initial bzero of dn_pkt */ -#define DN_FREE_PKT(pkt) { \ - struct dn_pkt *n = pkt ; \ - rt_unref ( n->ro.ro_rt ) ; \ - m_freem(n->dn_m); \ - pkt = DN_NEXT(n) ; \ - FREE(n, M_IPFW) ; } +#define DN_FREE_PKT(_m) do { \ + struct m_tag *tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, NULL); \ + if (tag) { \ + struct dn_pkt_tag *n = (struct dn_pkt_tag *)(tag+1); \ + if (n->ro.ro_rt) \ + rtfree(n->ro.ro_rt); \ + } \ + m_tag_delete(_m, tag); \ + m_freem(_m); \ +} while (0) /* * Dispose all packets and flow_queues on a flow_set. @@ -1239,16 +1321,22 @@ dropit: static void purge_flow_set(struct dn_flow_set *fs, int all) { - struct dn_pkt *pkt ; struct dn_flow_queue *q, *qn ; int i ; + lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED); + for (i = 0 ; i <= fs->rq_size ; i++ ) { for (q = fs->rq[i] ; q ; q = qn ) { - for (pkt = q->head ; pkt ; ) - DN_FREE_PKT(pkt) ; + struct mbuf *m, *mnext; + + mnext = q->head; + while ((m = mnext) != NULL) { + mnext = m->m_nextpkt; + DN_FREE_PKT(m); + } qn = q->next ; - FREE(q, M_IPFW); + FREE(q, M_DUMMYNET); } fs->rq[i] = NULL ; } @@ -1256,12 +1344,12 @@ purge_flow_set(struct dn_flow_set *fs, int all) if (all) { /* RED - free lookup table */ if (fs->w_q_lookup) - FREE(fs->w_q_lookup, M_IPFW); + FREE(fs->w_q_lookup, M_DUMMYNET); if (fs->rq) - FREE(fs->rq, M_IPFW); + FREE(fs->rq, M_DUMMYNET); /* if this fs is not part of a pipe, free it */ if (fs->pipe && fs != &(fs->pipe->fs) ) - FREE(fs, M_IPFW); + FREE(fs, M_DUMMYNET); } } @@ -1273,12 +1361,15 @@ purge_flow_set(struct dn_flow_set *fs, int all) static void purge_pipe(struct dn_pipe *pipe) { - struct dn_pkt *pkt ; + struct mbuf *m, *mnext; purge_flow_set( &(pipe->fs), 1 ); - for (pkt = pipe->head ; pkt ; ) - DN_FREE_PKT(pkt) ; + mnext = pipe->head; + while ((m = mnext) != NULL) { + mnext = m->m_nextpkt; + DN_FREE_PKT(m); + } heap_free( &(pipe->scheduler_heap) ); heap_free( &(pipe->not_eligible_heap) ); @@ -1293,25 +1384,22 @@ static void dummynet_flush() { struct dn_pipe *curr_p, *p ; - struct ip_fw_chain *chain ; struct dn_flow_set *fs, *curr_fs; - int s ; - s = splimp() ; + lck_mtx_lock(dn_mutex); /* remove all references to pipes ...*/ - LIST_FOREACH(chain, &ip_fw_chain_head, next) - chain->rule->pipe_ptr = NULL ; + flush_pipe_ptrs(NULL); /* prevent future matches... 
*/ p = all_pipes ; - all_pipes = NULL ; + all_pipes = NULL ; fs = all_flow_sets ; all_flow_sets = NULL ; /* and free heaps so we don't have unwanted events */ heap_free(&ready_heap); heap_free(&wfq_ready_heap); heap_free(&extract_heap); - splx(s) ; + /* * Now purge all queued pkts and delete all pipes */ @@ -1325,24 +1413,27 @@ dummynet_flush() purge_pipe(p); curr_p = p ; p = p->next ; - FREE(q, M_IPFW); + FREE(curr_p, M_DUMMYNET); } + lck_mtx_unlock(dn_mutex); } -extern struct ip_fw_chain *ip_fw_default_rule ; +extern struct ip_fw *ip_fw_default_rule ; static void dn_rule_delete_fs(struct dn_flow_set *fs, void *r) { int i ; struct dn_flow_queue *q ; - struct dn_pkt *pkt ; + struct mbuf *m ; for (i = 0 ; i <= fs->rq_size ; i++) /* last one is ovflow */ for (q = fs->rq[i] ; q ; q = q->next ) - for (pkt = q->head ; pkt ; pkt = DN_NEXT(pkt) ) - if (pkt->hdr.mh_data == r) - pkt->hdr.mh_data = (void *)ip_fw_default_rule ; + for (m = q->head ; m ; m = m->m_nextpkt ) { + struct dn_pkt_tag *pkt = dn_tag_get(m) ; + if (pkt->rule == r) + pkt->rule = ip_fw_default_rule ; + } } /* * when a firewall rule is deleted, scan all queues and remove the flow-id @@ -1352,8 +1443,11 @@ void dn_rule_delete(void *r) { struct dn_pipe *p ; - struct dn_pkt *pkt ; struct dn_flow_set *fs ; + struct dn_pkt_tag *pkt ; + struct mbuf *m ; + + lck_mtx_lock(dn_mutex); /* * If the rule references a queue (dn_flow_set), then scan @@ -1365,17 +1459,20 @@ dn_rule_delete(void *r) for ( p = all_pipes ; p ; p = p->next ) { fs = &(p->fs) ; dn_rule_delete_fs(fs, r); - for (pkt = p->head ; pkt ; pkt = DN_NEXT(pkt) ) - if (pkt->hdr.mh_data == r) - pkt->hdr.mh_data = (void *)ip_fw_default_rule ; + for (m = p->head ; m ; m = m->m_nextpkt ) { + pkt = dn_tag_get(m) ; + if (pkt->rule == r) + pkt->rule = ip_fw_default_rule ; + } } + lck_mtx_unlock(dn_mutex); } /* * setup RED parameters */ static int -config_red(struct dn_flow_set *p, struct dn_flow_set * x) +config_red(struct dn_flow_set *p, struct dn_flow_set * x) { int i; @@ -1392,19 +1489,21 @@ config_red(struct dn_flow_set *p, struct dn_flow_set * x) } /* if the lookup table already exist, free and create it again */ - if (x->w_q_lookup) - FREE(x->w_q_lookup, M_IPFW); + if (x->w_q_lookup) { + FREE(x->w_q_lookup, M_DUMMYNET); + x->w_q_lookup = NULL ; + } if (red_lookup_depth == 0) { - printf("\nnet.inet.ip.dummynet.red_lookup_depth must be > 0"); - FREE(x, M_IPFW); + printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth must be > 0\n"); + FREE(x, M_DUMMYNET); return EINVAL; } x->lookup_depth = red_lookup_depth; x->w_q_lookup = (u_int *) _MALLOC(x->lookup_depth * sizeof(int), - M_IPFW, M_DONTWAIT); + M_DUMMYNET, M_DONTWAIT); if (x->w_q_lookup == NULL) { - printf("sorry, cannot allocate red lookup table\n"); - FREE(x, M_IPFW); + printf("dummynet: sorry, cannot allocate red lookup table\n"); + FREE(x, M_DUMMYNET); return ENOSPC; } @@ -1433,18 +1532,17 @@ alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs) l = dn_hash_size; if (l < 4) l = 4; - else if (l > 1024) - l = 1024; + else if (l > DN_MAX_HASH_SIZE) + l = DN_MAX_HASH_SIZE; x->rq_size = l; } else /* one is enough for null mask */ x->rq_size = 1; x->rq = _MALLOC((1 + x->rq_size) * sizeof(struct dn_flow_queue *), - M_IPFW, M_DONTWAIT); + M_DUMMYNET, M_DONTWAIT | M_ZERO); if (x->rq == NULL) { - printf("sorry, cannot allocate queue\n"); + printf("dummynet: sorry, cannot allocate queue\n"); return ENOSPC; } - bzero(x->rq, (1+x->rq_size) * sizeof(struct dn_flow_queue *)); x->rq_elements = 0; return 0 ; } @@ -1474,19 +1572,20 @@ 
set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src) * setup pipe or queue parameters. */ -static int +static int config_pipe(struct dn_pipe *p) { - int s ; + int i, r; struct dn_flow_set *pfs = &(p->fs); + struct dn_flow_queue *q; - /* - * The config program passes parameters as follows: + /* + * The config program passes parameters as follows: * bw = bits/second (0 means no limits), * delay = ms, must be translated into ticks. * qsize = slots/bytes - */ - p->delay = ( p->delay * hz ) / 1000 ; + */ + p->delay = ( p->delay * hz ) / 1000 ; /* We need either a pipe number or a flow_set number */ if (p->pipe_nr == 0 && pfs->fs_nr == 0) return EINVAL ; @@ -1494,64 +1593,74 @@ config_pipe(struct dn_pipe *p) return EINVAL ; if (p->pipe_nr != 0) { /* this is a pipe */ struct dn_pipe *x, *a, *b; - /* locate pipe */ + + lck_mtx_lock(dn_mutex); +/* locate pipe */ for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; a = b , b = b->next) ; if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */ - x = _MALLOC(sizeof(struct dn_pipe), M_IPFW, M_DONTWAIT) ; + x = _MALLOC(sizeof(struct dn_pipe), M_DUMMYNET, M_DONTWAIT | M_ZERO) ; if (x == NULL) { - printf("ip_dummynet.c: no memory for new pipe\n"); + lck_mtx_unlock(dn_mutex); + printf("dummynet: no memory for new pipe\n"); return ENOSPC; } - bzero(x, sizeof(struct dn_pipe)); x->pipe_nr = p->pipe_nr; x->fs.pipe = x ; /* idle_heap is the only one from which we extract from the middle. */ x->idle_heap.size = x->idle_heap.elements = 0 ; x->idle_heap.offset=OFFSET_OF(struct dn_flow_queue, heap_pos); - } else + } else { x = b; + /* Flush accumulated credit for all queues */ + for (i = 0; i <= x->fs.rq_size; i++) + for (q = x->fs.rq[i]; q; q = q->next) + q->numbytes = 0; + } - x->bandwidth = p->bandwidth ; + x->bandwidth = p->bandwidth ; x->numbytes = 0; /* just in case... 
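 *
 * (Worked example for the conversion at the top of config_pipe(),
 * taking hz = 100 for illustration: a configured delay of 50 ms
 * becomes 50*100/1000 = 5 ticks, while anything below 10 ms truncates
 * to 0 ticks and adds no latency.)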
*/ bcopy(p->if_name, x->if_name, sizeof(p->if_name) ); x->ifp = NULL ; /* reset interface ptr */ - x->delay = p->delay ; + x->delay = p->delay ; set_fs_parms(&(x->fs), pfs); if ( x->fs.rq == NULL ) { /* a new pipe */ - s = alloc_hash(&(x->fs), pfs) ; - if (s) { - FREE(x, M_IPFW); - return s ; + r = alloc_hash(&(x->fs), pfs) ; + if (r) { + lck_mtx_unlock(dn_mutex); + FREE(x, M_DUMMYNET); + return r ; } - s = splimp() ; x->next = b ; if (a == NULL) all_pipes = x ; else a->next = x ; - splx(s); } + lck_mtx_unlock(dn_mutex); } else { /* config queue */ struct dn_flow_set *x, *a, *b ; + lck_mtx_lock(dn_mutex); /* locate flow_set */ for (a=NULL, b=all_flow_sets ; b && b->fs_nr < pfs->fs_nr ; a = b , b = b->next) ; if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new */ - if (pfs->parent_nr == 0) /* need link to a pipe */ - return EINVAL ; - x = _MALLOC(sizeof(struct dn_flow_set), M_IPFW, M_DONTWAIT); + if (pfs->parent_nr == 0) { /* need link to a pipe */ + lck_mtx_unlock(dn_mutex); + return EINVAL ; + } + x = _MALLOC(sizeof(struct dn_flow_set), M_DUMMYNET, M_DONTWAIT | M_ZERO); if (x == NULL) { - printf("ip_dummynet.c: no memory for new flow_set\n"); - return ENOSPC; + lck_mtx_unlock(dn_mutex); + printf("dummynet: no memory for new flow_set\n"); + return ENOSPC; } - bzero(x, sizeof(struct dn_flow_set)); x->fs_nr = pfs->fs_nr; x->parent_nr = pfs->parent_nr; x->weight = pfs->weight ; @@ -1561,26 +1670,28 @@ config_pipe(struct dn_pipe *p) x->weight = 100 ; } else { /* Change parent pipe not allowed; must delete and recreate */ - if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) - return EINVAL ; + if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) { + lck_mtx_unlock(dn_mutex); + return EINVAL ; + } x = b; } set_fs_parms(x, pfs); if ( x->rq == NULL ) { /* a new flow_set */ - s = alloc_hash(x, pfs) ; - if (s) { - FREE(x, M_IPFW); - return s ; + r = alloc_hash(x, pfs) ; + if (r) { + lck_mtx_unlock(dn_mutex); + FREE(x, M_DUMMYNET); + return r ; } - s = splimp() ; x->next = b; if (a == NULL) all_flow_sets = x; else a->next = x; - splx(s); } + lck_mtx_unlock(dn_mutex); } return 0 ; } @@ -1631,7 +1742,9 @@ dummynet_drain() { struct dn_flow_set *fs; struct dn_pipe *p; - struct dn_pkt *pkt; + struct mbuf *m, *mnext; + + lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED); heap_free(&ready_heap); heap_free(&wfq_ready_heap); @@ -1642,8 +1755,12 @@ dummynet_drain() for (p = all_pipes; p; p= p->next ) { purge_flow_set(&(p->fs), 0); - for (pkt = p->head ; pkt ; ) - DN_FREE_PKT(pkt) ; + + mnext = p->head; + while ((m = mnext) != NULL) { + mnext = m->m_nextpkt; + DN_FREE_PKT(m); + } p->head = p->tail = NULL ; } } @@ -1651,12 +1768,9 @@ dummynet_drain() /* * Fully delete a pipe or a queue, cleaning up associated info. */ -static int +static int delete_pipe(struct dn_pipe *p) { - int s ; - struct ip_fw_chain *chain ; - if (p->pipe_nr == 0 && p->fs.fs_nr == 0) return EINVAL ; if (p->pipe_nr != 0 && p->fs.fs_nr != 0) @@ -1665,13 +1779,14 @@ delete_pipe(struct dn_pipe *p) struct dn_pipe *a, *b; struct dn_flow_set *fs; + lck_mtx_lock(dn_mutex); /* locate pipe */ for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; a = b , b = b->next) ; - if (b == NULL || (b->pipe_nr != p->pipe_nr) ) + if (b == NULL || (b->pipe_nr != p->pipe_nr) ) { + lck_mtx_unlock(dn_mutex); return EINVAL ; /* not found */ - - s = splimp() ; + } /* unlink from list of pipes */ if (a == NULL) @@ -1679,14 +1794,12 @@ delete_pipe(struct dn_pipe *p) else a->next = b->next ; /* remove references to this pipe from the ip_fw rules. 
*/ - LIST_FOREACH(chain, &ip_fw_chain_head, next) - if (chain->rule->pipe_ptr == &(b->fs)) - chain->rule->pipe_ptr = NULL ; + flush_pipe_ptrs(&(b->fs)); /* remove all references to this pipe from flow_sets */ for (fs = all_flow_sets; fs; fs= fs->next ) if (fs->pipe == b) { - printf("++ ref to pipe %d from fs %d\n", + printf("dummynet: ++ ref to pipe %d from fs %d\n", p->pipe_nr, fs->fs_nr); fs->pipe = NULL ; purge_flow_set(fs, 0); @@ -1696,26 +1809,27 @@ delete_pipe(struct dn_pipe *p) /* remove reference to here from extract_heap and wfq_ready_heap */ pipe_remove_from_heap(&extract_heap, b); pipe_remove_from_heap(&wfq_ready_heap, b); - splx(s); - FREE(b, M_IPFW); + lck_mtx_unlock(dn_mutex); + + FREE(b, M_DUMMYNET); } else { /* this is a WF2Q queue (dn_flow_set) */ struct dn_flow_set *a, *b; + lck_mtx_lock(dn_mutex); /* locate set */ for (a = NULL, b = all_flow_sets ; b && b->fs_nr < p->fs.fs_nr ; a = b , b = b->next) ; - if (b == NULL || (b->fs_nr != p->fs.fs_nr) ) + if (b == NULL || (b->fs_nr != p->fs.fs_nr) ) { + lck_mtx_unlock(dn_mutex); return EINVAL ; /* not found */ + } - s = splimp() ; if (a == NULL) all_flow_sets = b->next ; else a->next = b->next ; /* remove references to this flow_set from the ip_fw rules. */ - LIST_FOREACH(chain, &ip_fw_chain_head, next) - if (chain->rule->pipe_ptr == b) - chain->rule->pipe_ptr = NULL ; + flush_pipe_ptrs(b); if (b->pipe != NULL) { /* Update total weight on parent pipe and cleanup parent heaps */ @@ -1727,7 +1841,7 @@ delete_pipe(struct dn_pipe *p) #endif } purge_flow_set(b, 1); - splx(s); + lck_mtx_unlock(dn_mutex); } return 0 ; } @@ -1741,13 +1855,15 @@ dn_copy_set(struct dn_flow_set *set, char *bp) int i, copied = 0 ; struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp; + lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED); + for (i = 0 ; i <= set->rq_size ; i++) for (q = set->rq[i] ; q ; q = q->next, qp++ ) { if (q->hash_slot != i) - printf("++ at %d: wrong slot (have %d, " + printf("dummynet: ++ at %d: wrong slot (have %d, " "should be %d)\n", copied, q->hash_slot, i); if (q->fs != set) - printf("++ at %d: wrong fs ptr (have %p, should be %p)\n", + printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n", i, q->fs, set); copied++ ; bcopy(q, qp, sizeof( *q ) ); @@ -1757,21 +1873,20 @@ dn_copy_set(struct dn_flow_set *set, char *bp) qp->fs = NULL ; } if (copied != set->rq_elements) - printf("++ wrong count, have %d should be %d\n", + printf("dummynet: ++ wrong count, have %d should be %d\n", copied, set->rq_elements); return (char *)qp ; } -static int -dummynet_get(struct sockopt *sopt) +static size_t +dn_calc_size(void) { - char *buf, *bp ; /* bp is the "copy-pointer" */ - size_t size ; struct dn_flow_set *set ; struct dn_pipe *p ; - int s, error=0 ; + size_t size ; + + lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED); - s = splimp(); /* * compute size of data structures: list of pipes and flow_sets. 
*/ @@ -1781,10 +1896,37 @@ dummynet_get(struct sockopt *sopt) for (set = all_flow_sets ; set ; set = set->next ) size += sizeof ( *set ) + set->rq_elements * sizeof(struct dn_flow_queue); - buf = _MALLOC(size, M_TEMP, M_DONTWAIT); - if (buf == 0) { - splx(s); - return ENOBUFS ; + return size ; +} + +static int +dummynet_get(struct sockopt *sopt) +{ + char *buf, *bp ; /* bp is the "copy-pointer" */ + size_t size ; + struct dn_flow_set *set ; + struct dn_pipe *p ; + int error=0, i ; + + /* XXX lock held too long */ + lck_mtx_lock(dn_mutex); + /* + * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we + * cannot use this flag while holding a mutex. + */ + for (i = 0; i < 10; i++) { + size = dn_calc_size(); + lck_mtx_unlock(dn_mutex); + buf = _MALLOC(size, M_TEMP, M_WAITOK); + lck_mtx_lock(dn_mutex); + if (size == dn_calc_size()) + break; + FREE(buf, M_TEMP); + buf = NULL; + } + if (buf == NULL) { + lck_mtx_unlock(dn_mutex); + return ENOBUFS ; } for (p = all_pipes, bp = buf ; p ; p = p->next ) { struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ; @@ -1822,7 +1964,8 @@ dummynet_get(struct sockopt *sopt) bp += sizeof( *set ) ; bp = dn_copy_set( set, bp ); } - splx(s); + lck_mtx_unlock(dn_mutex); + error = sooptcopyout(sopt, buf, size); FREE(buf, M_TEMP); return error ; @@ -1843,7 +1986,7 @@ ip_dn_ctl(struct sockopt *sopt) switch (sopt->sopt_name) { default : - printf("ip_dn_ctl -- unknown option %d", sopt->sopt_name); + printf("dummynet: -- unknown option %d", sopt->sopt_name); return EINVAL ; case IP_DUMMYNET_GET : @@ -1853,6 +1996,7 @@ ip_dn_ctl(struct sockopt *sopt) case IP_DUMMYNET_FLUSH : dummynet_flush() ; break ; + case IP_DUMMYNET_CONFIGURE : p = &tmp_pipe ; error = sooptcopyin(sopt, p, sizeof *p, sizeof *p); @@ -1873,10 +2017,20 @@ ip_dn_ctl(struct sockopt *sopt) return error ; } -static void +void ip_dn_init(void) { - printf("DUMMYNET initialized (010124)\n"); + /* setup locks */ + dn_mutex_grp_attr = lck_grp_attr_alloc_init(); + dn_mutex_grp = lck_grp_alloc_init("dn", dn_mutex_grp_attr); + dn_mutex_attr = lck_attr_alloc_init(); + lck_attr_setdefault(dn_mutex_attr); + + if ((dn_mutex = lck_mtx_alloc_init(dn_mutex_grp, dn_mutex_attr)) == NULL) { + printf("ip_dn_init: can't alloc dn_mutex\n"); + return; + } + all_pipes = NULL ; all_flow_sets = NULL ; ready_heap.size = ready_heap.elements = 0 ; @@ -1888,37 +2042,8 @@ ip_dn_init(void) extract_heap.size = extract_heap.elements = 0 ; extract_heap.offset = 0 ; ip_dn_ctl_ptr = ip_dn_ctl; + ip_dn_io_ptr = dummynet_io; + ip_dn_ruledel_ptr = dn_rule_delete; + timeout(dummynet, NULL, 1); } - -static ip_dn_ctl_t *old_dn_ctl_ptr ; - -static int -dummynet_modevent(module_t mod, int type, void *data) -{ - int s ; - switch (type) { - case MOD_LOAD: - s = splimp(); - old_dn_ctl_ptr = ip_dn_ctl_ptr; - ip_dn_init(); - splx(s); - break; - case MOD_UNLOAD: - s = splimp(); - ip_dn_ctl_ptr = old_dn_ctl_ptr; - splx(s); - dummynet_flush(); - break ; - default: - break ; - } - return 0 ; -} - -static moduledata_t dummynet_mod = { - "dummynet", - dummynet_modevent, - NULL -} ; -DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); diff --git a/bsd/netinet/ip_dummynet.h b/bsd/netinet/ip_dummynet.h index 9a13ae239..c334a1f2a 100644 --- a/bsd/netinet/ip_dummynet.h +++ b/bsd/netinet/ip_dummynet.h @@ -20,25 +20,39 @@ * @APPLE_LICENSE_HEADER_END@ */ /* - * Copyright (c) 1998 Luigi Rizzo + * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa + * Portions Copyright (c) 2000 Akamba Corp. 
+ * All rights reserved * - * Redistribution and use in source forms, with and without modification, - * are permitted provided that this entire comment appears intact. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. * - * Redistribution in binary form may occur without any restrictions. - * Obviously, it would be nice if you gave credit where credit is due - * but requiring it would be too onerous. + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. * - * This software is provided ``AS IS'' without any warranties of any kind. - * - * $FreeBSD: src/sys/netinet/ip_dummynet.h,v 1.10.2.3 2001/02/01 20:25:09 luigi Exp $ + * $FreeBSD: src/sys/netinet/ip_dummynet.h,v 1.32 2004/08/17 22:05:54 andre Exp $ */ #ifndef _IP_DUMMYNET_H #define _IP_DUMMYNET_H #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE +#ifdef PRIVATE /* * Definition of dummynet data structures. In the structures, I decided * not to use the macros in <sys/queue.h> in the hope of making the code @@ -85,6 +99,12 @@ typedef u_int64_t dn_key ; /* sorting key */ */ #define OFFSET_OF(type, field) ((int)&( ((type *)0)->field) ) +/* + * The maximum hash table size for queues. This value must be a power + * of 2. + */ +#define DN_MAX_HASH_SIZE 65536 + /* * A heap entry is made of a key and a pointer to the actual * object stored in the heap. @@ -113,39 +133,28 @@ struct dn_heap { } ; /* - * MT_DUMMYNET is a new (fake) mbuf type that is prepended to the - * packet when it comes out of a pipe. The definition - * ought to go in /sys/sys/mbuf.h but here it is less intrusive. + * Packets processed by dummynet have an mbuf tag associated with + * them that carries their dummynet state. This is used within + * the dummynet code as well as outside when checking for special + * processing requirements. */ - -#define MT_DUMMYNET MT_CONTROL - -/* - * struct dn_pkt identifies a packet in the dummynet queue. The - * first part is really an m_hdr for implementation purposes, and some - * fields are saved there. When passing the packet back to the ip_input/ - * ip_output()/bdg_forward, the struct is prepended to the mbuf chain with type - * MT_DUMMYNET, and contains the pointer to the matching rule. - * - * Note: there is no real need to make this structure contain an m_hdr, - * in the future this should be changed to a normal data structure. 
- */ -struct dn_pkt { - struct m_hdr hdr ; -#define dn_next hdr.mh_nextpkt /* next element in queue */ -#define DN_NEXT(x) (struct dn_pkt *)(x)->dn_next -#define dn_m hdr.mh_next /* packet to be forwarded */ -#define dn_dir hdr.mh_flags /* action when pkt extracted from a queue */ +#ifdef KERNEL +struct dn_pkt_tag { + struct ip_fw *rule; /* matching rule */ + int dn_dir; /* action when packet comes out. */ #define DN_TO_IP_OUT 1 #define DN_TO_IP_IN 2 #define DN_TO_BDG_FWD 3 - dn_key output_time; /* when the pkt is due for delivery */ - struct ifnet *ifp; /* interface, for ip_output */ - struct sockaddr_in *dn_dst ; - struct route ro; /* route, for ip_output. MUST COPY */ - int flags ; /* flags, for ip_output (IPv6 ?) */ + dn_key output_time; /* when the pkt is due for delivery */ + struct ifnet *ifp; /* interface, for ip_output */ + struct sockaddr_in *dn_dst ; + struct route ro; /* route, for ip_output. MUST COPY */ + int flags ; /* flags, for ip_output (IPv6 ?) */ }; +#else +struct dn_pkt; +#endif /* KERNEL */ /* * Overall structure of dummynet (with WF2Q+): @@ -211,19 +220,24 @@ flow using a number of heaps defined into the pipe itself. * per flow queue. This contains the flow identifier, the queue * of packets, counters, and parameters used to support both RED and * WF2Q+. + * + * A dn_flow_queue is created and initialized whenever a packet for + * a new flow arrives. */ struct dn_flow_queue { struct dn_flow_queue *next ; struct ipfw_flow_id id ; - struct dn_pkt *head, *tail ; /* queue of packets */ + + struct mbuf *head, *tail ; /* queue of packets */ u_int len ; u_int len_bytes ; - long numbytes ; /* credit for transmission (dynamic queues) */ + u_long numbytes ; /* credit for transmission (dynamic queues) */ u_int64_t tot_pkts ; /* statistics counters */ u_int64_t tot_bytes ; u_int32_t drops ; - int hash_slot ; /* debugging/diagnostic */ + + int hash_slot ; /* debugging/diagnostic */ /* RED parameters */ int avg ; /* average queue length est. (scaled) */ @@ -232,12 +246,13 @@ struct dn_flow_queue { u_int32_t q_time ; /* start of queue idle time */ /* WF2Q+ support */ - struct dn_flow_set *fs ; /* parent flow set */ - int heap_pos ; /* position (index) of struct in heap */ - dn_key sched_time ; /* current time when queue enters ready_heap */ + struct dn_flow_set *fs ; /* parent flow set */ + int heap_pos ; /* position (index) of struct in heap */ + dn_key sched_time ; /* current time when queue enters ready_heap */ - dn_key S,F ; /* start-time, finishing time */ - /* setting F < S means the timestamp is invalid. We only need + dn_key S,F ; /* start time, finish time */ + /* + * Setting F < S means the timestamp is invalid. We only need * to test this when the queue is empty. */ } ; @@ -250,6 +265,9 @@ struct dn_flow_queue { * hashing the flow-id, then scan the list looking for a match. * The size of the hash table (buckets) is configurable on a per-queue * basis. + * + * A dn_flow_set is created whenever a new queue or pipe is created (in the + * latter case, the structure is located inside the struct dn_pipe). 
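+ *
+ * Purely as an illustration (the helper names below are made up and are
+ * not part of this header), the lookup amounts to: mask the packet's
+ * flow id, hash it into fs->rq[], then scan one short list:
+ *
+ *	masked = apply_mask(&pkt_id, &fs->flow_mask);
+ *	q = fs->rq[hash_id(&masked) % fs->rq_size];
+ *	while (q != NULL && !id_equal(&q->id, &masked))
+ *		q = q->next;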
*/ struct dn_flow_set { struct dn_flow_set *next; /* next flow set in all_flow_sets list */ @@ -257,26 +275,28 @@ struct dn_flow_set { u_short fs_nr ; /* flow_set number */ u_short flags_fs; #define DN_HAVE_FLOW_MASK 0x0001 -#define DN_IS_PIPE 0x4000 -#define DN_IS_QUEUE 0x8000 #define DN_IS_RED 0x0002 #define DN_IS_GENTLE_RED 0x0004 -#define DN_QSIZE_IS_BYTES 0x0008 /* queue measured in bytes */ +#define DN_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */ +#define DN_NOERROR 0x0010 /* do not report ENOBUFS on drops */ +#define DN_IS_PIPE 0x4000 +#define DN_IS_QUEUE 0x8000 - struct dn_pipe *pipe ; /* pointer to parent pipe */ + struct dn_pipe *pipe ; /* pointer to parent pipe */ u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */ - int weight ; /* WFQ queue weight */ - int qsize ; /* queue size in slots or bytes */ - int plr ; /* pkt loss rate (2^31-1 means 100%) */ + int weight ; /* WFQ queue weight */ + int qsize ; /* queue size in slots or bytes */ + int plr ; /* pkt loss rate (2^31-1 means 100%) */ struct ipfw_flow_id flow_mask ; + /* hash table of queues onto this flow_set */ int rq_size ; /* number of slots */ int rq_elements ; /* active elements */ struct dn_flow_queue **rq; /* array of rq_size entries */ + u_int32_t last_expired ; /* do not expire too frequently */ - /* XXX some RED parameters as well ? */ int backlogged ; /* #active queues for this flowset */ /* RED parameters */ @@ -284,64 +304,61 @@ struct dn_flow_set { #define SCALE(x) ( (x) << SCALE_RED ) #define SCALE_VAL(x) ( (x) >> SCALE_RED ) #define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) - int w_q ; /* queue weight (scaled) */ - int max_th ; /* maximum threshold for queue (scaled) */ - int min_th ; /* minimum threshold for queue (scaled) */ - int max_p ; /* maximum value for p_b (scaled) */ - u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ - u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ - u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ - u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ - u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ - u_int lookup_depth ; /* depth of lookup table */ - int lookup_step ; /* granularity inside the lookup table */ - int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ - int avg_pkt_size ; /* medium packet size */ - int max_pkt_size ; /* max packet size */ + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ + u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ + u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ + u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ + u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ + u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ + u_int lookup_depth ; /* depth of lookup table */ + int lookup_step ; /* granularity inside the lookup table */ + int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ + int avg_pkt_size ; /* medium packet size */ + int max_pkt_size ; /* max packet size */ } ; /* * Pipe descriptor. Contains global parameters, delay-line queue, * and the flow_set used for fixed-rate queues. - * - * For WF2Q support it also has 4 heaps holding dn_flow_queue: + * + * For WF2Q+ support it also has 3 heaps holding dn_flow_queue: * not_eligible_heap, for queues whose start time is higher * than the virtual time. Sorted by start time. * scheduler_heap, for queues eligible for scheduling. 
Sorted by * finish time. - * backlogged_heap, all flows in the two heaps above, sorted by - * start time. This is used to compute the virtual time. * idle_heap, all flows that are idle and can be removed. We * do that on each tick so we do not slow down too much * operations during forwarding. * */ -struct dn_pipe { /* a pipe */ - struct dn_pipe *next ; +struct dn_pipe { /* a pipe */ + struct dn_pipe *next ; int pipe_nr ; /* number */ - int bandwidth; /* really, bytes/tick. */ - int delay ; /* really, ticks */ + int bandwidth; /* really, bytes/tick. */ + int delay ; /* really, ticks */ - struct dn_pkt *head, *tail ; /* packets in delay line */ + struct mbuf *head, *tail ; /* packets in delay line */ /* WF2Q+ */ struct dn_heap scheduler_heap ; /* top extract - key Finish time*/ struct dn_heap not_eligible_heap; /* top extract- key Start time */ struct dn_heap idle_heap ; /* random extract - key Start=Finish time */ - dn_key V ; /* virtual time */ - int sum; /* sum of weights of all active sessions */ - int numbytes; /* bit i can transmit (more or less). */ + dn_key V ; /* virtual time */ + int sum; /* sum of weights of all active sessions */ + int numbytes; /* bits I can transmit (more or less). */ - dn_key sched_time ; /* first time pipe is scheduled in ready_heap */ + dn_key sched_time ; /* time pipe was scheduled in ready_heap */ - /* the tx clock can come from an interface. In this case, the - * name is below, and the pointer is filled when the rule is - * configured. We identify this by setting the if_name to a - * non-empty string. + /* + * When the tx clock come from an interface (if_name[0] != '\0'), its name + * is stored below, whereas the ifp is filled when the rule is configured. */ - char if_name[16]; + char if_name[IFNAMSIZ]; struct ifnet *ifp ; int ready ; /* set if ifp != NULL and we got a signal from it */ @@ -350,17 +367,33 @@ struct dn_pipe { /* a pipe */ #ifdef KERNEL -MALLOC_DECLARE(M_IPFW); +void ip_dn_init(void); /* called from raw_ip.c:load_ipfw() */ -typedef int ip_dn_ctl_t __P((struct sockopt *)) ; -extern ip_dn_ctl_t *ip_dn_ctl_ptr; +typedef int ip_dn_ctl_t(struct sockopt *); /* raw_ip.c */ +typedef void ip_dn_ruledel_t(void *); /* ip_fw.c */ +typedef int ip_dn_io_t(struct mbuf *m, int pipe_nr, int dir, + struct ip_fw_args *fwa); +extern ip_dn_ctl_t *ip_dn_ctl_ptr; +extern ip_dn_ruledel_t *ip_dn_ruledel_ptr; +extern ip_dn_io_t *ip_dn_io_ptr; +#define DUMMYNET_LOADED (ip_dn_io_ptr != NULL) -void dn_rule_delete(void *r); /* used in ip_fw.c */ -int dummynet_io(int pipe, int dir, - struct mbuf *m, struct ifnet *ifp, struct route *ro, - struct sockaddr_in * dst, - struct ip_fw_chain *rule, int flags); -#endif - -#endif /* __APPLE_API_PRIVATE */ +/* + * Return the IPFW rule associated with the dummynet tag; if any. + * Make sure that the dummynet tag is not reused by lower layers. 
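+ *
+ * Sketch of the intended use (caller and error handling elided):
+ *
+ *	struct ip_fw *rule = ip_dn_claim_rule(m);
+ *	if (rule != NULL)
+ *		... packet came out of a pipe: resume processing at rule ...
+ *
+ * A second call on the same mbuf returns NULL, because the first call
+ * rewrites the tag type to KERNEL_TAG_TYPE_NONE.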
+ */ +static __inline struct ip_fw * +ip_dn_claim_rule(struct mbuf *m) +{ + struct m_tag *mtag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_DUMMYNET, NULL); + if (mtag != NULL) { + mtag->m_tag_type = KERNEL_TAG_TYPE_NONE; + return (((struct dn_pkt_tag *)(mtag+1))->rule); + } else + return (NULL); +} +#endif /* KERNEL */ + +#endif /* PRIVATE */ #endif /* _IP_DUMMYNET_H */ diff --git a/bsd/netinet/ip_ecn.h b/bsd/netinet/ip_ecn.h index 4aa2132c5..6e452f578 100644 --- a/bsd/netinet/ip_ecn.h +++ b/bsd/netinet/ip_ecn.h @@ -54,13 +54,11 @@ */ #include <sys/appleapiopts.h> +#ifdef KERNEL_PRIVATE #define ECN_ALLOWED 1 /* ECN allowed */ #define ECN_FORBIDDEN 0 /* ECN forbidden */ #define ECN_NOCARE (-1) /* no consideration to ECN */ -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -extern void ip_ecn_ingress __P((int, u_int8_t *, const u_int8_t *)); -extern void ip_ecn_egress __P((int, const u_int8_t *, u_int8_t *)); -#endif /* __APPLE_API_PRIVATE */ -#endif +extern void ip_ecn_ingress(int, u_int8_t *, const u_int8_t *); +extern void ip_ecn_egress(int, const u_int8_t *, u_int8_t *); +#endif KERNEL_PRIVATE diff --git a/bsd/netinet/ip_encap.c b/bsd/netinet/ip_encap.c index e4c9c7c9b..9517bbb05 100644 --- a/bsd/netinet/ip_encap.c +++ b/bsd/netinet/ip_encap.c @@ -113,10 +113,10 @@ MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); #endif -static void encap_add __P((struct encaptab *)); -static int mask_match __P((const struct encaptab *, const struct sockaddr *, - const struct sockaddr *)); -static void encap_fillarg __P((struct mbuf *, const struct encaptab *)); +static void encap_add(struct encaptab *); +static int mask_match(const struct encaptab *, const struct sockaddr *, + const struct sockaddr *); +static void encap_fillarg(struct mbuf *, const struct encaptab *); #ifndef LIST_HEAD_INITIALIZER /* rely upon BSS initialization */ @@ -412,7 +412,7 @@ const struct encaptab * encap_attach_func(af, proto, func, psw, arg) int af; int proto; - int (*func) __P((const struct mbuf *, int, int, void *)); + int (*func)(const struct mbuf *, int, int, void *); const struct protosw *psw; void *arg; { diff --git a/bsd/netinet/ip_encap.h b/bsd/netinet/ip_encap.h index a1c472e22..0a3aba152 100644 --- a/bsd/netinet/ip_encap.h +++ b/bsd/netinet/ip_encap.h @@ -54,8 +54,7 @@ #define _NETINET_IP_ENCAP_H_ #include <sys/appleapiopts.h> -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE struct encaptab { LIST_ENTRY(encaptab) chain; @@ -65,23 +64,22 @@ struct encaptab { struct sockaddr_storage srcmask; struct sockaddr_storage dst; /* remote addr */ struct sockaddr_storage dstmask; - int (*func) __P((const struct mbuf *, int, int, void *)); + int (*func)(const struct mbuf *, int, int, void *); const struct protosw *psw; /* only pr_input will be used */ void *arg; /* passed via m->m_pkthdr.aux */ }; -void encap_init __P((void)); -void encap4_input __P((struct mbuf *, int)); -int encap6_input __P((struct mbuf **, int *)); -const struct encaptab *encap_attach __P((int, int, const struct sockaddr *, +void encap_init(void); +void encap4_input(struct mbuf *, int); +int encap6_input(struct mbuf **, int *); +const struct encaptab *encap_attach(int, int, const struct sockaddr *, const struct sockaddr *, const struct sockaddr *, - const struct sockaddr *, const struct protosw *, void *)); -const struct encaptab *encap_attach_func __P((int, int, - int (*) __P((const struct mbuf *, int, int, void *)), - const struct protosw *, void *)); -int encap_detach __P((const struct encaptab *)); -void 
*encap_getarg __P((struct mbuf *)); -#endif /* __APPLE_API_PRIVATE */ -#endif + const struct sockaddr *, const struct protosw *, void *); +const struct encaptab *encap_attach_func(int, int, + int (*)(const struct mbuf *, int, int, void *), + const struct protosw *, void *); +int encap_detach(const struct encaptab *); +void *encap_getarg(struct mbuf *); +#endif KERNEL_PRIVATE #endif /*_NETINET_IP_ENCAP_H_*/ diff --git a/bsd/netinet/ip_flow.c b/bsd/netinet/ip_flow.c index 59db2291f..ba3ca9816 100644 --- a/bsd/netinet/ip_flow.c +++ b/bsd/netinet/ip_flow.c @@ -194,7 +194,7 @@ ipflow_fastforward( dst = &ipf->ipf_ro.ro_dst; #ifdef __APPLE__ /* Not sure the rt_dlt is valid here !! XXX */ - if ((error = dlil_output(ifptodlt(rt->rt_ifp, PF_INET), m, (caddr_t) rt, dst, 0)) != 0) { + if ((error = dlil_output(rt->rt_ifp, PF_INET, m, (caddr_t) rt, dst, 0)) != 0) { #else if ((error = (*rt->rt_ifp->if_output)(rt->rt_ifp, m, dst, rt)) != 0) { @@ -352,7 +352,7 @@ ipflow_create( * Fill in the updated information. */ ipf->ipf_ro = *ro; - rtref(ro->ro_rt); + rtref(ro->ro_rt); //### LD 5/25/04 needs rt_mtx lock ipf->ipf_dst = ip->ip_dst; ipf->ipf_src = ip->ip_src; ipf->ipf_tos = ip->ip_tos; diff --git a/bsd/netinet/ip_flow.h b/bsd/netinet/ip_flow.h index 23c25d366..0fed616e0 100644 --- a/bsd/netinet/ip_flow.h +++ b/bsd/netinet/ip_flow.h @@ -61,7 +61,7 @@ #define _NETINET_IP_FLOW_H #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE struct ipflow { LIST_ENTRY(ipflow) ipf_next; /* next ipflow in bucket */ struct in_addr ipf_dst; /* destination address */ @@ -76,6 +76,6 @@ struct ipflow { u_long ipf_errors; /* other errors returned by if_output */ u_long ipf_last_uses; /* number of uses in last period */ }; -#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL_PRIVATE */ #endif diff --git a/bsd/netinet/ip_fw.h b/bsd/netinet/ip_fw.h index 41eae12e2..3f19ae79f 100644 --- a/bsd/netinet/ip_fw.h +++ b/bsd/netinet/ip_fw.h @@ -38,6 +38,10 @@ #define _IP_FW_H #include <sys/appleapiopts.h> +#ifdef IPFW2 +#include <netinet/ip_fw2.h> +#else /* !IPFW2, good old ipfw */ + #include <sys/queue.h> @@ -287,8 +291,7 @@ struct ipfw_dyn_rule { /* * Main firewall chains definitions and global var's definitions. */ -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE #define IP_FW_PORT_DYNT_FLAG 0x10000 #define IP_FW_PORT_TEE_FLAG 0x20000 @@ -297,20 +300,20 @@ struct ipfw_dyn_rule { /* * Function definitions. 
 */
-void ip_fw_init __P((void));
+void ip_fw_init(void);

 /* Firewall hooks */
 struct ip;
 struct sockopt;
-typedef int ip_fw_chk_t __P((struct ip **, int, struct ifnet *, u_int16_t *,
-	struct mbuf **, struct ip_fw_chain **, struct sockaddr_in **));
-typedef int ip_fw_ctl_t __P((struct sockopt *));
+typedef int ip_fw_chk_t(struct ip **, int, struct ifnet *, u_int16_t *,
+	struct mbuf **, struct ip_fw_chain **, struct sockaddr_in **);
+typedef int ip_fw_ctl_t(struct sockopt *);
 extern ip_fw_chk_t *ip_fw_chk_ptr;
 extern ip_fw_ctl_t *ip_fw_ctl_ptr;
 extern int fw_one_pass;
 extern int fw_enable;
 extern struct ipfw_flow_id last_pkt ;
-#endif /* __APPLE_API_PRIVATE */
-#endif /* KERNEL */
+#endif /* KERNEL_PRIVATE */

-#endif /* _IP_FW_H */
+#endif /* !IPFW2 */
+#endif /* _IP_FW_H */
diff --git a/bsd/netinet/ip_fw2.c b/bsd/netinet/ip_fw2.c
new file mode 100644
index 000000000..5f45949f1
--- /dev/null
+++ b/bsd/netinet/ip_fw2.c
@@ -0,0 +1,3324 @@
+/*
+ * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/netinet/ip_fw2.c,v 1.6.2.18 2003/10/17 11:01:03 scottl Exp $
+ */
+
+#define DEB(x)
+#define DDB(x) x
+
+/*
+ * Implement IP packet firewall (new version)
+ */
+
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+
+#ifdef IPFW2
+#include <machine/spl.h>
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/ucred.h>
+#include <net/if.h>
+#include <net/route.h>
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ip_divert.h>
+
+#if DUMMYNET
+#include <netinet/ip_dummynet.h>
+#endif /* DUMMYNET */
+
+#include <netinet/tcp.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+
+#ifdef IPSEC
+#include <netinet6/ipsec.h>
+#endif
+
+#include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
+
+#include "ip_fw2_compat.h"
+
+#include <sys/kern_event.h>
+#include <stdarg.h>
+
+/*
+#include <machine/in_cksum.h>
+*/ /* XXX for in_cksum */
+
+/*
+ * XXX This one should go in sys/mbuf.h. It is used to prevent a
+ * firewall-generated packet from looping forever through the firewall.
+ */
+#ifndef M_SKIP_FIREWALL
+#define M_SKIP_FIREWALL 0x4000
+#endif
+
+/*
+ * set_disable contains one bit per set value (0..31).
+ * If the bit is set, all rules with the corresponding set
+ * are disabled. Set RESVD_SET(31) is reserved for the default rule
+ * and for rules that are not deleted by the flush command,
+ * and CANNOT be disabled.
+ * Rules in set RESVD_SET can only be deleted explicitly.
+ */
+static u_int32_t set_disable;
+
+int fw_verbose;
+static int verbose_limit;
+
+#define IPFW_DEFAULT_RULE 65535
+
+#define IPFW_RULE_INACTIVE 1
+
+/*
+ * list of rules for layer 3
+ */
+static struct ip_fw *layer3_chain;
+
+MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
+
+static int fw_debug = 1;
+static int autoinc_step = 100; /* bounded to 1..1000 in add_rule() */
+
+#ifdef SYSCTL_NODE
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, enable,
+    CTLFLAG_RW,
+    &fw_enable, 0, "Enable ipfw");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW,
+    &autoinc_step, 0, "Rule number autoincrement step");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
+    CTLFLAG_RW,
+    &fw_one_pass, 0,
+    "Only do a single pass through ipfw when using dummynet(4)");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug,
+    CTLFLAG_RW,
+    &fw_debug, 0, "Enable printing of debug ip_fw statements");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose,
+    CTLFLAG_RW,
+    &fw_verbose, 0, "Log matches to ipfw rules");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
+    &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
+
+/*
+ * Description of dynamic rules.
+ *
+ * Dynamic rules are stored in lists accessed through a hash table
+ * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
+ * be modified through the sysctl variable dyn_buckets, which takes
+ * effect when the table becomes empty.
+ *
+ * XXX currently there is only one list, ipfw_dyn.
+ *
+ * When a packet is received, its address fields are first masked
+ * with the mask defined for the rule, then hashed, then matched
+ * against the entries in the corresponding list.
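+ * For instance (mirroring hash_packet() further below), the hash is a
+ * plain XOR of the flow id fields, masked down to the table size:
+ *
+ *	i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port);
+ *	i &= (curr_dyn_buckets - 1);
+ *
+ * curr_dyn_buckets is kept a power of 2, so the AND is a cheap modulo;
+ * the XOR is commutative in source and destination, so both directions
+ * of a session land in the same bucket.
+ *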
+ * Dynamic rules can be used for different purposes: + * + stateful rules; + * + enforcing limits on the number of sessions; + * + in-kernel NAT (not implemented yet) + * + * The lifetime of dynamic rules is regulated by dyn_*_lifetime, + * measured in seconds and depending on the flags. + * + * The total number of dynamic rules is stored in dyn_count. + * The max number of dynamic rules is dyn_max. When we reach + * the maximum number of rules we do not create anymore. This is + * done to avoid consuming too much memory, but also too much + * time when searching on each packet (ideally, we should try instead + * to put a limit on the length of the list on each bucket...). + * + * Each dynamic rule holds a pointer to the parent ipfw rule so + * we know what action to perform. Dynamic rules are removed when + * the parent rule is deleted. XXX we should make them survive. + * + * There are some limitations with dynamic rules -- we do not + * obey the 'randomized match', and we do not do multiple + * passes through the firewall. XXX check the latter!!! + */ +static ipfw_dyn_rule **ipfw_dyn_v = NULL; +static u_int32_t dyn_buckets = 256; /* must be power of 2 */ +static u_int32_t curr_dyn_buckets = 256; /* must be power of 2 */ + +/* + * Timeouts for various events in handing dynamic rules. + */ +static u_int32_t dyn_ack_lifetime = 300; +static u_int32_t dyn_syn_lifetime = 20; +static u_int32_t dyn_fin_lifetime = 1; +static u_int32_t dyn_rst_lifetime = 1; +static u_int32_t dyn_udp_lifetime = 10; +static u_int32_t dyn_short_lifetime = 5; + +/* + * Keepalives are sent if dyn_keepalive is set. They are sent every + * dyn_keepalive_period seconds, in the last dyn_keepalive_interval + * seconds of lifetime of a rule. + * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower + * than dyn_keepalive_period. + */ + +static u_int32_t dyn_keepalive_interval = 20; +static u_int32_t dyn_keepalive_period = 5; +static u_int32_t dyn_keepalive = 1; /* do send keepalives */ + +static u_int32_t static_count; /* # of static rules */ +static u_int32_t static_len; /* size in bytes of static rules */ +static u_int32_t dyn_count; /* # of dynamic rules */ +static u_int32_t dyn_max = 4096; /* max # of dynamic rules */ + +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW, + &dyn_buckets, 0, "Number of dyn. buckets"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD, + &curr_dyn_buckets, 0, "Current Number of dyn. buckets"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD, + &dyn_count, 0, "Number of dyn. rules"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW, + &dyn_max, 0, "Max number of dyn. rules"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD, + &static_count, 0, "Number of static rules"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW, + &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW, + &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW, + &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW, + &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW, + &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW, + &dyn_short_lifetime, 0, "Lifetime of dyn. 
rules for other situations"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW, + &dyn_keepalive, 0, "Enable keepalives for dyn. rules"); + +#endif /* SYSCTL_NODE */ + + +extern lck_mtx_t *ip_mutex; +static ip_fw_chk_t ipfw_chk; + +/* firewall lock */ +lck_grp_t *ipfw_mutex_grp; +lck_grp_attr_t *ipfw_mutex_grp_attr; +lck_attr_t *ipfw_mutex_attr; +lck_mtx_t *ipfw_mutex; + +extern void ipfwsyslog( int level, char *format,...); + +#if DUMMYNET +ip_dn_ruledel_t *ip_dn_ruledel_ptr = NULL; /* hook into dummynet */ +#endif /* DUMMYNET */ + +#define KEV_LOG_SUBCLASS 10 +#define IPFWLOGEVENT 0 + +#define ipfwstring "ipfw:" +static size_t ipfwstringlen; + +#define dolog( a ) { \ + if ( fw_verbose == 2 ) /* Apple logging, log to ipfw.log */ \ + ipfwsyslog a ; \ + else log a ; \ +} + +void ipfwsyslog( int level, char *format,...) +{ +#define msgsize 100 + + struct kev_msg ev_msg; + va_list ap; + char msgBuf[msgsize]; + char *dptr = msgBuf; + unsigned char pri; + int loglen; + + va_start( ap, format ); + loglen = vsnprintf(msgBuf, msgsize, format, ap); + va_end( ap ); + + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = KEV_LOG_SUBCLASS; + ev_msg.event_code = IPFWLOGEVENT; + + /* get rid of the trailing \n */ + dptr[loglen-1] = 0; + + pri = LOG_PRI(level); + + /* remove "ipfw:" prefix if logging to ipfw log */ + if ( !(strncmp( ipfwstring, msgBuf, ipfwstringlen))){ + dptr = msgBuf+ipfwstringlen; + } + + ev_msg.dv[0].data_ptr = &pri; + ev_msg.dv[0].data_length = 1; + ev_msg.dv[1].data_ptr = dptr; + ev_msg.dv[1].data_length = 100; /* bug in kern_post_msg, it can't handle size > 256-msghdr */ + ev_msg.dv[2].data_length = 0; + + kev_post_msg(&ev_msg); +} + +/* + * This macro maps an ip pointer into a layer3 header pointer of type T + */ +#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) + +static __inline int +icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd) +{ + int type = L3HDR(struct icmp,ip)->icmp_type; + + return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<<type)) ); +} + +#define TT ( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \ + (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) ) + +static int +is_icmp_query(struct ip *ip) +{ + int type = L3HDR(struct icmp, ip)->icmp_type; + return (type <= ICMP_MAXTYPE && (TT & (1<<type)) ); +} +#undef TT + +/* + * The following checks use two arrays of 8 or 16 bits to store the + * bits that we want set or clear, respectively. They are in the + * low and high half of cmd->arg1 or cmd->d[0]. + * + * We scan options and store the bits we find set. We succeed if + * + * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear + * + * The code is sometimes optimized not to store additional variables. 
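+ *
+ * Worked example: an ipfw match such as "tcpflags syn,!ack" would put
+ * TH_SYN in the low byte of cmd->arg1 (want set) and TH_ACK in the high
+ * byte (want clear). A segment with th_flags == TH_SYN satisfies the
+ * predicate above, while th_flags == (TH_SYN|TH_ACK) fails the
+ * want_clear half of the test.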
+ */ + +static int +flags_match(ipfw_insn *cmd, u_int8_t bits) +{ + u_char want_clear; + bits = ~bits; + + if ( ((cmd->arg1 & 0xff) & bits) != 0) + return 0; /* some bits we want set were clear */ + want_clear = (cmd->arg1 >> 8) & 0xff; + if ( (want_clear & bits) != want_clear) + return 0; /* some bits we want clear were set */ + return 1; +} + +static int +ipopts_match(struct ip *ip, ipfw_insn *cmd) +{ + int optlen, bits = 0; + u_char *cp = (u_char *)(ip + 1); + int x = (ip->ip_hl << 2) - sizeof (struct ip); + + for (; x > 0; x -= optlen, cp += optlen) { + int opt = cp[IPOPT_OPTVAL]; + + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + optlen = cp[IPOPT_OLEN]; + if (optlen <= 0 || optlen > x) + return 0; /* invalid or truncated */ + } + switch (opt) { + + default: + break; + + case IPOPT_LSRR: + bits |= IP_FW_IPOPT_LSRR; + break; + + case IPOPT_SSRR: + bits |= IP_FW_IPOPT_SSRR; + break; + + case IPOPT_RR: + bits |= IP_FW_IPOPT_RR; + break; + + case IPOPT_TS: + bits |= IP_FW_IPOPT_TS; + break; + } + } + return (flags_match(cmd, bits)); +} + +static int +tcpopts_match(struct ip *ip, ipfw_insn *cmd) +{ + int optlen, bits = 0; + struct tcphdr *tcp = L3HDR(struct tcphdr,ip); + u_char *cp = (u_char *)(tcp + 1); + int x = (tcp->th_off << 2) - sizeof(struct tcphdr); + + for (; x > 0; x -= optlen, cp += optlen) { + int opt = cp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + optlen = cp[1]; + if (optlen <= 0) + break; + } + + switch (opt) { + + default: + break; + + case TCPOPT_MAXSEG: + bits |= IP_FW_TCPOPT_MSS; + break; + + case TCPOPT_WINDOW: + bits |= IP_FW_TCPOPT_WINDOW; + break; + + case TCPOPT_SACK_PERMITTED: + case TCPOPT_SACK: + bits |= IP_FW_TCPOPT_SACK; + break; + + case TCPOPT_TIMESTAMP: + bits |= IP_FW_TCPOPT_TS; + break; + + case TCPOPT_CC: + case TCPOPT_CCNEW: + case TCPOPT_CCECHO: + bits |= IP_FW_TCPOPT_CC; + break; + } + } + return (flags_match(cmd, bits)); +} + +static int +iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) +{ + if (ifp == NULL) /* no iface with this packet, match fails */ + return 0; + /* Check by name or by IP address */ + if (cmd->name[0] != '\0') { /* match by name */ + /* Check unit number (-1 is wildcard) */ + if (cmd->p.unit != -1 && cmd->p.unit != ifp->if_unit) + return(0); + /* Check name */ + if (!strncmp(ifp->if_name, cmd->name, IFNAMSIZ)) + return(1); + } else { + struct ifaddr *ia; + + ifnet_lock_shared(ifp); + TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { + if (ia->ifa_addr == NULL) + continue; + if (ia->ifa_addr->sa_family != AF_INET) + continue; + if (cmd->p.ip.s_addr == ((struct sockaddr_in *) + (ia->ifa_addr))->sin_addr.s_addr) { + ifnet_lock_done(ifp); + return(1); /* match */ + } + } + ifnet_lock_done(ifp); + } + return(0); /* no match, fail ... */ +} + +/* + * The 'verrevpath' option checks that the interface that an IP packet + * arrives on is the same interface that traffic destined for the + * packet's source address would be routed out of. This is a measure + * to block forged packets. This is also commonly known as "anti-spoofing" + * or Unicast Reverse Path Forwarding (Unicast RFP) in Cisco-ese. The + * name of the knob is purposely reminisent of the Cisco IOS command, + * + * ip verify unicast reverse-path + * + * which implements the same functionality. But note that syntax is + * misleading. The check may be performed on all IP packets whether unicast, + * multicast, or broadcast. 
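+ *
+ * Concretely, with an anti-spoofing rule such as (ipfw2 syntax)
+ *
+ *	ipfw add deny ip from any to any not verrevpath in
+ *
+ * a packet arriving on en1 whose claimed source address routes back
+ * out en0 fails the check and is denied.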
+ */ +static int +verify_rev_path(struct in_addr src, struct ifnet *ifp) +{ + static struct route ro; + struct sockaddr_in *dst; + + dst = (struct sockaddr_in *)&(ro.ro_dst); + + /* Check if we've cached the route from the previous call. */ + if (src.s_addr != dst->sin_addr.s_addr) { + ro.ro_rt = NULL; + + bzero(dst, sizeof(*dst)); + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = src; + + rtalloc_ign(&ro, RTF_CLONING|RTF_PRCLONING); + } + + if ((ro.ro_rt == NULL) || (ifp == NULL) || + (ro.ro_rt->rt_ifp->if_index != ifp->if_index)) + return 0; + + return 1; +} + + +static u_int64_t norule_counter; /* counter for ipfw_log(NULL...) */ + +#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0 +#define SNP(buf) buf, sizeof(buf) + +/* + * We enter here when we have a rule with O_LOG. + * XXX this function alone takes about 2Kbytes of code! + */ +static void +ipfw_log(struct ip_fw *f, u_int hlen, struct ether_header *eh, + struct mbuf *m, struct ifnet *oif) +{ + char *action; + int limit_reached = 0; + char ipv4str[MAX_IPv4_STR_LEN]; + char action2[40], proto[48], fragment[28]; + + fragment[0] = '\0'; + proto[0] = '\0'; + + if (f == NULL) { /* bogus pkt */ + if (verbose_limit != 0 && norule_counter >= verbose_limit) + return; + norule_counter++; + if (norule_counter == verbose_limit) + limit_reached = verbose_limit; + action = "Refuse"; + } else { /* O_LOG is the first action, find the real one */ + ipfw_insn *cmd = ACTION_PTR(f); + ipfw_insn_log *l = (ipfw_insn_log *)cmd; + + if (l->max_log != 0 && l->log_left == 0) + return; + l->log_left--; + if (l->log_left == 0) + limit_reached = l->max_log; + cmd += F_LEN(cmd); /* point to first action */ + if (cmd->opcode == O_PROB) + cmd += F_LEN(cmd); + + action = action2; + switch (cmd->opcode) { + case O_DENY: + action = "Deny"; + break; + + case O_REJECT: + if (cmd->arg1==ICMP_REJECT_RST) + action = "Reset"; + else if (cmd->arg1==ICMP_UNREACH_HOST) + action = "Reject"; + else + snprintf(SNPARGS(action2, 0), "Unreach %d", + cmd->arg1); + break; + + case O_ACCEPT: + action = "Accept"; + break; + case O_COUNT: + action = "Count"; + break; + case O_DIVERT: + snprintf(SNPARGS(action2, 0), "Divert %d", + cmd->arg1); + break; + case O_TEE: + snprintf(SNPARGS(action2, 0), "Tee %d", + cmd->arg1); + break; + case O_SKIPTO: + snprintf(SNPARGS(action2, 0), "SkipTo %d", + cmd->arg1); + break; + case O_PIPE: + snprintf(SNPARGS(action2, 0), "Pipe %d", + cmd->arg1); + break; + case O_QUEUE: + snprintf(SNPARGS(action2, 0), "Queue %d", + cmd->arg1); + break; + case O_FORWARD_IP: { + ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; + int len; + + if (f->reserved_1 == IPFW_RULE_INACTIVE) { + break; + } + len = snprintf(SNPARGS(action2, 0), "Forward to %s", + inet_ntop(AF_INET, &sa->sa.sin_addr, ipv4str, sizeof(ipv4str))); + if (sa->sa.sin_port) + snprintf(SNPARGS(action2, len), ":%d", + sa->sa.sin_port); + } + break; + default: + action = "UNKNOWN"; + break; + } + } + + if (hlen == 0) { /* non-ip */ + snprintf(SNPARGS(proto, 0), "MAC"); + } else { + struct ip *ip = mtod(m, struct ip *); + /* these three are all aliases to the same thing */ + struct icmp *const icmp = L3HDR(struct icmp, ip); + struct tcphdr *const tcp = (struct tcphdr *)icmp; + struct udphdr *const udp = (struct udphdr *)icmp; + + int ip_off, offset, ip_len; + + int len; + + if (eh != NULL) { /* layer 2 packets are as on the wire */ + ip_off = ntohs(ip->ip_off); + ip_len = ntohs(ip->ip_len); + } else { + ip_off = ip->ip_off; + ip_len = ip->ip_len; + } + offset = 
ip_off & IP_OFFMASK; + switch (ip->ip_p) { + case IPPROTO_TCP: + len = snprintf(SNPARGS(proto, 0), "TCP %s", + inet_ntop(AF_INET, &ip->ip_src, ipv4str, sizeof(ipv4str))); + if (offset == 0) + snprintf(SNPARGS(proto, len), ":%d %s:%d", + ntohs(tcp->th_sport), + inet_ntop(AF_INET, &ip->ip_dst, ipv4str, sizeof(ipv4str)), + ntohs(tcp->th_dport)); + else + snprintf(SNPARGS(proto, len), " %s", + inet_ntop(AF_INET, &ip->ip_dst, ipv4str, sizeof(ipv4str))); + break; + + case IPPROTO_UDP: + len = snprintf(SNPARGS(proto, 0), "UDP %s", + inet_ntop(AF_INET, &ip->ip_src, ipv4str, sizeof(ipv4str))); + if (offset == 0) + snprintf(SNPARGS(proto, len), ":%d %s:%d", + ntohs(udp->uh_sport), + inet_ntop(AF_INET, &ip->ip_dst, ipv4str, sizeof(ipv4str)), + ntohs(udp->uh_dport)); + else + snprintf(SNPARGS(proto, len), " %s", + inet_ntop(AF_INET, &ip->ip_dst, ipv4str, sizeof(ipv4str))); + break; + + case IPPROTO_ICMP: + if (offset == 0) + len = snprintf(SNPARGS(proto, 0), + "ICMP:%u.%u ", + icmp->icmp_type, icmp->icmp_code); + else + len = snprintf(SNPARGS(proto, 0), "ICMP "); + len += snprintf(SNPARGS(proto, len), "%s", + inet_ntop(AF_INET, &ip->ip_src, ipv4str, sizeof(ipv4str))); + snprintf(SNPARGS(proto, len), " %s", + inet_ntop(AF_INET, &ip->ip_dst, ipv4str, sizeof(ipv4str))); + break; + + default: + len = snprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p, + inet_ntop(AF_INET, &ip->ip_src, ipv4str, sizeof(ipv4str))); + snprintf(SNPARGS(proto, len), " %s", + inet_ntop(AF_INET, &ip->ip_dst, ipv4str, sizeof(ipv4str))); + break; + } + + if (ip_off & (IP_MF | IP_OFFMASK)) + snprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)", + ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2), + offset << 3, + (ip_off & IP_MF) ? "+" : ""); + } + if (oif || m->m_pkthdr.rcvif) + { + dolog((LOG_AUTHPRIV | LOG_INFO, + "ipfw: %d %s %s %s via %s%d%s\n", + f ? f->rulenum : -1, + action, proto, oif ? "out" : "in", + oif ? oif->if_name : m->m_pkthdr.rcvif->if_name, + oif ? oif->if_unit : m->m_pkthdr.rcvif->if_unit, + fragment)); + } + else{ + dolog((LOG_AUTHPRIV | LOG_INFO, + "ipfw: %d %s %s [no if info]%s\n", + f ? f->rulenum : -1, + action, proto, fragment)); + } + if (limit_reached){ + dolog((LOG_AUTHPRIV | LOG_NOTICE, + "ipfw: limit %d reached on entry %d\n", + limit_reached, f ? f->rulenum : -1)); + } +} + +/* + * IMPORTANT: the hash function for dynamic rules must be commutative + * in source and destination (ip,port), because rules are bidirectional + * and we want to find both in the same bucket. + */ +static __inline int +hash_packet(struct ipfw_flow_id *id) +{ + u_int32_t i; + + i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); + i &= (curr_dyn_buckets - 1); + return i; +} + +/** + * unlink a dynamic rule from a chain. prev is a pointer to + * the previous one, q is a pointer to the rule to delete, + * head is a pointer to the head of the queue. + * Modifies q and potentially also head. + */ +#define UNLINK_DYN_RULE(prev, head, q) { \ + ipfw_dyn_rule *old_q = q; \ + \ + /* remove a refcount to the parent */ \ + if (q->dyn_type == O_LIMIT) \ + q->parent->count--; \ + DEB(printf("ipfw: unlink entry 0x%08x %d -> 0x%08x %d, %d left\n",\ + (q->id.src_ip), (q->id.src_port), \ + (q->id.dst_ip), (q->id.dst_port), dyn_count-1 ); ) \ + if (prev != NULL) \ + prev->next = q = q->next; \ + else \ + head = q = q->next; \ + dyn_count--; \ + _FREE(old_q, M_IPFW); } + +#define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) + +/** + * Remove dynamic rules pointing to "rule", or all of them if rule == NULL. 
+ * + * If keep_me == NULL, rules are deleted even if not expired, + * otherwise only expired rules are removed. + * + * The value of the second parameter is also used to point to identify + * a rule we absolutely do not want to remove (e.g. because we are + * holding a reference to it -- this is the case with O_LIMIT_PARENT + * rules). The pointer is only used for comparison, so any non-null + * value will do. + */ +static void +remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me) +{ + static u_int32_t last_remove = 0; + +#define FORCE (keep_me == NULL) + + ipfw_dyn_rule *prev, *q; + int i, pass = 0, max_pass = 0; + struct timeval timenow; + + getmicrotime(&timenow); + + if (ipfw_dyn_v == NULL || dyn_count == 0) + return; + /* do not expire more than once per second, it is useless */ + if (!FORCE && last_remove == timenow.tv_sec) + return; + last_remove = timenow.tv_sec; + + /* + * because O_LIMIT refer to parent rules, during the first pass only + * remove child and mark any pending LIMIT_PARENT, and remove + * them in a second pass. + */ +next_pass: + for (i = 0 ; i < curr_dyn_buckets ; i++) { + for (prev=NULL, q = ipfw_dyn_v[i] ; q ; ) { + /* + * Logic can become complex here, so we split tests. + */ + if (q == keep_me) + goto next; + if (rule != NULL && rule != q->rule) + goto next; /* not the one we are looking for */ + if (q->dyn_type == O_LIMIT_PARENT) { + /* + * handle parent in the second pass, + * record we need one. + */ + max_pass = 1; + if (pass == 0) + goto next; + if (FORCE && q->count != 0 ) { + /* XXX should not happen! */ + printf("ipfw: OUCH! cannot remove rule," + " count %d\n", q->count); + } + } else { + if (!FORCE && + !TIME_LEQ( q->expire, timenow.tv_sec )) + goto next; + } + if (q->dyn_type != O_LIMIT_PARENT || !q->count) { + UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); + continue; + } +next: + prev=q; + q=q->next; + } + } + if (pass++ < max_pass) + goto next_pass; +} + + +/** + * lookup a dynamic rule. + */ +static ipfw_dyn_rule * +lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, + struct tcphdr *tcp) +{ + /* + * stateful ipfw extensions. 
+ * Lookup into dynamic session queue + */ +#define MATCH_REVERSE 0 +#define MATCH_FORWARD 1 +#define MATCH_NONE 2 +#define MATCH_UNKNOWN 3 +#define BOTH_SYN (TH_SYN | (TH_SYN << 8)) +#define BOTH_FIN (TH_FIN | (TH_FIN << 8)) + + int i, dir = MATCH_NONE; + ipfw_dyn_rule *prev, *q=NULL; + struct timeval timenow; + + getmicrotime(&timenow); + + if (ipfw_dyn_v == NULL) + goto done; /* not found */ + i = hash_packet( pkt ); + for (prev=NULL, q = ipfw_dyn_v[i] ; q != NULL ; ) { + if (q->dyn_type == O_LIMIT_PARENT && q->count) + goto next; + if (TIME_LEQ( q->expire, timenow.tv_sec)) { /* expire entry */ + int dounlink = 1; + + /* check if entry is TCP */ + if ( q->id.proto == IPPROTO_TCP ) + { + /* do not delete an established TCP connection which hasn't been closed by both sides */ + if ( (q->state & (BOTH_SYN | BOTH_FIN)) != (BOTH_SYN | BOTH_FIN) ) + dounlink = 0; + } + if ( dounlink ){ + UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); + continue; + } + } + if (pkt->proto == q->id.proto && + q->dyn_type != O_LIMIT_PARENT) { + if (pkt->src_ip == q->id.src_ip && + pkt->dst_ip == q->id.dst_ip && + pkt->src_port == q->id.src_port && + pkt->dst_port == q->id.dst_port ) { + dir = MATCH_FORWARD; + break; + } + if (pkt->src_ip == q->id.dst_ip && + pkt->dst_ip == q->id.src_ip && + pkt->src_port == q->id.dst_port && + pkt->dst_port == q->id.src_port ) { + dir = MATCH_REVERSE; + break; + } + } +next: + prev = q; + q = q->next; + } + if (q == NULL) + goto done; /* q = NULL, not found */ + + if ( prev != NULL) { /* found and not in front */ + prev->next = q->next; + q->next = ipfw_dyn_v[i]; + ipfw_dyn_v[i] = q; + } + if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */ + u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST); + + q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8); + switch (q->state) { + case TH_SYN: /* opening */ + q->expire = timenow.tv_sec + dyn_syn_lifetime; + break; + + case BOTH_SYN: /* move to established */ + case BOTH_SYN | TH_FIN : /* one side tries to close */ + case BOTH_SYN | (TH_FIN << 8) : + if (tcp) { +#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0) + u_int32_t ack = ntohl(tcp->th_ack); + if (dir == MATCH_FORWARD) { + if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd)) + q->ack_fwd = ack; + else { /* ignore out-of-sequence */ + break; + } + } else { + if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev)) + q->ack_rev = ack; + else { /* ignore out-of-sequence */ + break; + } + } + } + q->expire = timenow.tv_sec + dyn_ack_lifetime; + break; + + case BOTH_SYN | BOTH_FIN: /* both sides closed */ + if (dyn_fin_lifetime >= dyn_keepalive_period) + dyn_fin_lifetime = dyn_keepalive_period - 1; + q->expire = timenow.tv_sec + dyn_fin_lifetime; + break; + + default: +#if 0 + /* + * reset or some invalid combination, but can also + * occur if we use keep-state the wrong way. + */ + if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) + printf("invalid state: 0x%x\n", q->state); +#endif + if (dyn_rst_lifetime >= dyn_keepalive_period) + dyn_rst_lifetime = dyn_keepalive_period - 1; + q->expire = timenow.tv_sec + dyn_rst_lifetime; + break; + } + } else if (pkt->proto == IPPROTO_UDP) { + q->expire = timenow.tv_sec + dyn_udp_lifetime; + } else { + /* other protocols */ + q->expire = timenow.tv_sec + dyn_short_lifetime; + } +done: + if (match_direction) + *match_direction = dir; + return q; +} + +static void +realloc_dynamic_table(void) +{ + /* + * Try reallocation, make sure we have a power of 2 and do + * not allow more than 64k entries. In case of overflow, + * default to 1024. 
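+	 *
+	 * The power-of-2 test below, (x & (x-1)) == 0, holds exactly when
+	 * x has at most one bit set: e.g. 256 & 255 == 0, while
+	 * 320 & 319 == 256 != 0, so an invalid dyn_buckets setting is
+	 * rejected and curr_dyn_buckets is kept unchanged.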
+ */ + + if (dyn_buckets > 65536) + dyn_buckets = 1024; + if ((dyn_buckets & (dyn_buckets-1)) != 0) { /* not a power of 2 */ + dyn_buckets = curr_dyn_buckets; /* reset */ + return; + } + curr_dyn_buckets = dyn_buckets; + if (ipfw_dyn_v != NULL) + _FREE(ipfw_dyn_v, M_IPFW); + for (;;) { + ipfw_dyn_v = _MALLOC(curr_dyn_buckets * sizeof(ipfw_dyn_rule *), + M_IPFW, M_NOWAIT | M_ZERO); + if (ipfw_dyn_v != NULL || curr_dyn_buckets <= 2) + break; + curr_dyn_buckets /= 2; + } +} + +/** + * Install state of type 'type' for a dynamic session. + * The hash table contains two type of rules: + * - regular rules (O_KEEP_STATE) + * - rules for sessions with limited number of sess per user + * (O_LIMIT). When they are created, the parent is + * increased by 1, and decreased on delete. In this case, + * the third parameter is the parent rule and not the chain. + * - "parent" rules for the above (O_LIMIT_PARENT). + */ +static ipfw_dyn_rule * +add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) +{ + ipfw_dyn_rule *r; + int i; + struct timeval timenow; + + getmicrotime(&timenow); + + if (ipfw_dyn_v == NULL || + (dyn_count == 0 && dyn_buckets != curr_dyn_buckets)) { + realloc_dynamic_table(); + if (ipfw_dyn_v == NULL) + return NULL; /* failed ! */ + } + i = hash_packet(id); + + r = _MALLOC(sizeof *r, M_IPFW, M_NOWAIT | M_ZERO); + if (r == NULL) { +#if IPFW_DEBUG + printf ("ipfw: sorry cannot allocate state\n"); +#endif + return NULL; + } + + /* increase refcount on parent, and set pointer */ + if (dyn_type == O_LIMIT) { + ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule; + if ( parent->dyn_type != O_LIMIT_PARENT) + panic("invalid parent"); + parent->count++; + r->parent = parent; + rule = parent->rule; + } + + r->id = *id; + r->expire = timenow.tv_sec + dyn_syn_lifetime; + r->rule = rule; + r->dyn_type = dyn_type; + r->pcnt = r->bcnt = 0; + r->count = 0; + + r->bucket = i; + r->next = ipfw_dyn_v[i]; + ipfw_dyn_v[i] = r; + dyn_count++; + DEB(printf("ipfw: add dyn entry ty %d 0x%08x %d -> 0x%08x %d, total %d\n", + dyn_type, + (r->id.src_ip), (r->id.src_port), + (r->id.dst_ip), (r->id.dst_port), + dyn_count ); ) + return r; +} + +/** + * lookup dynamic parent rule using pkt and rule as search keys. + * If the lookup fails, then install one. + */ +static ipfw_dyn_rule * +lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule) +{ + ipfw_dyn_rule *q; + int i; + struct timeval timenow; + + getmicrotime(&timenow); + + if (ipfw_dyn_v) { + i = hash_packet( pkt ); + for (q = ipfw_dyn_v[i] ; q != NULL ; q=q->next) + if (q->dyn_type == O_LIMIT_PARENT && + rule== q->rule && + pkt->proto == q->id.proto && + pkt->src_ip == q->id.src_ip && + pkt->dst_ip == q->id.dst_ip && + pkt->src_port == q->id.src_port && + pkt->dst_port == q->id.dst_port) { + q->expire = timenow.tv_sec + dyn_short_lifetime; + DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);) + return q; + } + } + return add_dyn_rule(pkt, O_LIMIT_PARENT, rule); +} + +/** + * Install dynamic state for rule type cmd->o.opcode + * + * Returns 1 (failure) if state is not installed because of errors or because + * session limitations are enforced. 
+ */ +static int +install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, + struct ip_fw_args *args) +{ + static int last_log; + struct timeval timenow; + + ipfw_dyn_rule *q; + getmicrotime(&timenow); + + DEB(printf("ipfw: install state type %d 0x%08x %u -> 0x%08x %u\n", + cmd->o.opcode, + (args->f_id.src_ip), (args->f_id.src_port), + (args->f_id.dst_ip), (args->f_id.dst_port) );) + + q = lookup_dyn_rule(&args->f_id, NULL, NULL); + + if (q != NULL) { /* should never occur */ + if (last_log != timenow.tv_sec) { + last_log = timenow.tv_sec; + printf("ipfw: install_state: entry already present, done\n"); + } + return 0; + } + + if (dyn_count >= dyn_max) + /* + * Run out of slots, try to remove any expired rule. + */ + remove_dyn_rule(NULL, (ipfw_dyn_rule *)1); + + if (dyn_count >= dyn_max) { + if (last_log != timenow.tv_sec) { + last_log = timenow.tv_sec; + printf("ipfw: install_state: Too many dynamic rules\n"); + } + return 1; /* cannot install, notify caller */ + } + + switch (cmd->o.opcode) { + case O_KEEP_STATE: /* bidir rule */ + add_dyn_rule(&args->f_id, O_KEEP_STATE, rule); + break; + + case O_LIMIT: /* limit number of sessions */ + { + u_int16_t limit_mask = cmd->limit_mask; + struct ipfw_flow_id id; + ipfw_dyn_rule *parent; + + DEB(printf("ipfw: installing dyn-limit rule %d\n", + cmd->conn_limit);) + + id.dst_ip = id.src_ip = 0; + id.dst_port = id.src_port = 0; + id.proto = args->f_id.proto; + + if (limit_mask & DYN_SRC_ADDR) + id.src_ip = args->f_id.src_ip; + if (limit_mask & DYN_DST_ADDR) + id.dst_ip = args->f_id.dst_ip; + if (limit_mask & DYN_SRC_PORT) + id.src_port = args->f_id.src_port; + if (limit_mask & DYN_DST_PORT) + id.dst_port = args->f_id.dst_port; + parent = lookup_dyn_parent(&id, rule); + if (parent == NULL) { + printf("ipfw: add parent failed\n"); + return 1; + } + if (parent->count >= cmd->conn_limit) { + /* + * See if we can remove some expired rule. + */ + remove_dyn_rule(rule, parent); + if (parent->count >= cmd->conn_limit) { + if (fw_verbose && last_log != timenow.tv_sec) { + last_log = timenow.tv_sec; + dolog((LOG_AUTHPRIV | LOG_DEBUG, + "drop session, too many entries\n")); + } + return 1; + } + } + add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent); + } + break; + default: + printf("ipfw: unknown dynamic rule type %u\n", cmd->o.opcode); + return 1; + } + lookup_dyn_rule(&args->f_id, NULL, NULL); /* XXX just set lifetime */ + return 0; +} + +/* + * Transmit a TCP packet, containing either a RST or a keepalive. + * When flags & TH_RST, we are sending a RST packet, because of a + * "reset" action matched the packet. + * Otherwise we are sending a keepalive, and flags & TH_ + */ +static void +send_pkt(struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) +{ + struct mbuf *m; + struct ip *ip; + struct tcphdr *tcp; + struct route sro; /* fake route */ + + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == 0) + return; + m->m_pkthdr.rcvif = (struct ifnet *)0; + m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr); + m->m_data += max_linkhdr; + + ip = mtod(m, struct ip *); + bzero(ip, m->m_len); + tcp = (struct tcphdr *)(ip + 1); /* no IP options */ + ip->ip_p = IPPROTO_TCP; + tcp->th_off = 5; + /* + * Assume we are sending a RST (or a keepalive in the reverse + * direction), swap src and destination addresses and ports. 
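+	 *
+	 * For example, a RST answering a lone SYN must acknowledge
+	 * seq + 1 (the SYN consumes one sequence number) and use
+	 * sequence 0 itself; the "if (flags & TH_SYN) seq++" below does
+	 * exactly that adjustment.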
+ */ + ip->ip_src.s_addr = htonl(id->dst_ip); + ip->ip_dst.s_addr = htonl(id->src_ip); + tcp->th_sport = htons(id->dst_port); + tcp->th_dport = htons(id->src_port); + if (flags & TH_RST) { /* we are sending a RST */ + if (flags & TH_ACK) { + tcp->th_seq = htonl(ack); + tcp->th_ack = htonl(0); + tcp->th_flags = TH_RST; + } else { + if (flags & TH_SYN) + seq++; + tcp->th_seq = htonl(0); + tcp->th_ack = htonl(seq); + tcp->th_flags = TH_RST | TH_ACK; + } + } else { + /* + * We are sending a keepalive. flags & TH_SYN determines + * the direction, forward if set, reverse if clear. + * NOTE: seq and ack are always assumed to be correct + * as set by the caller. This may be confusing... + */ + if (flags & TH_SYN) { + /* + * we have to rewrite the correct addresses! + */ + ip->ip_dst.s_addr = htonl(id->dst_ip); + ip->ip_src.s_addr = htonl(id->src_ip); + tcp->th_dport = htons(id->dst_port); + tcp->th_sport = htons(id->src_port); + } + tcp->th_seq = htonl(seq); + tcp->th_ack = htonl(ack); + tcp->th_flags = TH_ACK; + } + /* + * set ip_len to the payload size so we can compute + * the tcp checksum on the pseudoheader + * XXX check this, could save a couple of words ? + */ + ip->ip_len = htons(sizeof(struct tcphdr)); + tcp->th_sum = in_cksum(m, m->m_pkthdr.len); + /* + * now fill fields left out earlier + */ + ip->ip_ttl = ip_defttl; + ip->ip_len = m->m_pkthdr.len; + bzero (&sro, sizeof (sro)); + ip_rtaddr(ip->ip_dst, &sro); + m->m_flags |= M_SKIP_FIREWALL; + ip_output_list(m, 0, NULL, &sro, 0, NULL); + if (sro.ro_rt) + RTFREE(sro.ro_rt); +} + +/* + * sends a reject message, consuming the mbuf passed as an argument. + */ +static void +send_reject(struct ip_fw_args *args, int code, int offset, int ip_len) +{ + + if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */ + /* We need the IP header in host order for icmp_error(). */ + if (args->eh != NULL) { + struct ip *ip = mtod(args->m, struct ip *); + ip->ip_len = ntohs(ip->ip_len); + ip->ip_off = ntohs(ip->ip_off); + } + lck_mtx_unlock(ip_mutex); + icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); + lck_mtx_lock(ip_mutex); + } else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) { + struct tcphdr *const tcp = + L3HDR(struct tcphdr, mtod(args->m, struct ip *)); + if ( (tcp->th_flags & TH_RST) == 0) { + lck_mtx_unlock(ip_mutex); + send_pkt(&(args->f_id), ntohl(tcp->th_seq), + ntohl(tcp->th_ack), + tcp->th_flags | TH_RST); + lck_mtx_lock(ip_mutex); + } + m_freem(args->m); + } else + m_freem(args->m); + args->m = NULL; +} + +/** + * + * Given an ip_fw *, lookup_next_rule will return a pointer + * to the next rule, which can be either the jump + * target (for skipto instructions) or the next one in the list (in + * all other cases including a missing jump target). + * The result is also written in the "next_rule" field of the rule. + * Backward jumps are not allowed, so start looking from the next + * rule... + * + * This never returns NULL -- in case we do not have an exact match, + * the next rule is returned. When the ruleset is changed, + * pointers are flushed so we are always correct. 
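+ *
+ * Example: if rule 100 is "skipto 500" and the chain holds rules 100,
+ * 200 and 600, the scan below returns rule 600, the first rule with
+ * rulenum >= 500; if no rule qualifies, it falls back to me->next.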
+ */ + +static struct ip_fw * +lookup_next_rule(struct ip_fw *me) +{ + struct ip_fw *rule = NULL; + ipfw_insn *cmd; + + /* look for action, in case it is a skipto */ + cmd = ACTION_PTR(me); + if (cmd->opcode == O_LOG) + cmd += F_LEN(cmd); + if ( cmd->opcode == O_SKIPTO ) + for (rule = me->next; rule ; rule = rule->next) + if (rule->rulenum >= cmd->arg1) + break; + if (rule == NULL) /* failure or not a skipto */ + rule = me->next; + me->next_rule = rule; + return rule; +} + +/* + * The main check routine for the firewall. + * + * All arguments are in args so we can modify them and return them + * back to the caller. + * + * Parameters: + * + * args->m (in/out) The packet; we set to NULL when/if we nuke it. + * Starts with the IP header. + * args->eh (in) Mac header if present, or NULL for layer3 packet. + * args->oif Outgoing interface, or NULL if packet is incoming. + * The incoming interface is in the mbuf. (in) + * args->divert_rule (in/out) + * Skip up to the first rule past this rule number; + * upon return, non-zero port number for divert or tee. + * + * args->rule Pointer to the last matching rule (in/out) + * args->next_hop Socket we are forwarding to (out). + * args->f_id Addresses grabbed from the packet (out) + * + * Return value: + * + * IP_FW_PORT_DENY_FLAG the packet must be dropped. + * 0 The packet is to be accepted and routed normally OR + * the packet was denied/rejected and has been dropped; + * in the latter case, *m is equal to NULL upon return. + * port Divert the packet to port, with these caveats: + * + * - If IP_FW_PORT_TEE_FLAG is set, tee the packet instead + * of diverting it (ie, 'ipfw tee'). + * + * - If IP_FW_PORT_DYNT_FLAG is set, interpret the lower + * 16 bits as a dummynet pipe number instead of diverting + */ + +static int +ipfw_chk(struct ip_fw_args *args) +{ + /* + * Local variables hold state during the processing of a packet. + * + * IMPORTANT NOTE: to speed up the processing of rules, there + * are some assumption on the values of the variables, which + * are documented here. Should you change them, please check + * the implementation of the various instructions to make sure + * that they still work. + * + * args->eh The MAC header. It is non-null for a layer2 + * packet, it is NULL for a layer-3 packet. + * + * m | args->m Pointer to the mbuf, as received from the caller. + * It may change if ipfw_chk() does an m_pullup, or if it + * consumes the packet because it calls send_reject(). + * XXX This has to change, so that ipfw_chk() never modifies + * or consumes the buffer. + * ip is simply an alias of the value of m, and it is kept + * in sync with it (the packet is supposed to start with + * the ip header). + */ + struct mbuf *m = args->m; + struct ip *ip = mtod(m, struct ip *); + + /* + * oif | args->oif If NULL, ipfw_chk has been called on the + * inbound path (ether_input, bdg_forward, ip_input). + * If non-NULL, ipfw_chk has been called on the outbound path + * (ether_output, ip_output). + */ + struct ifnet *oif = args->oif; + + struct ip_fw *f = NULL; /* matching rule */ + int retval = 0; + + /* + * hlen The length of the IPv4 header. + * hlen >0 means we have an IPv4 packet. + */ + u_int hlen = 0; /* hlen >0 means we have an IP pkt */ + + /* + * offset The offset of a fragment. offset != 0 means that + * we have a fragment at this offset of an IPv4 packet. + * offset == 0 means that (if this is an IPv4 packet) + * this is the first or only fragment. + */ + u_short offset = 0; + + /* + * Local copies of addresses. 
They are only valid if we have + * an IP packet. + * + * proto The protocol. Set to 0 for non-ip packets, + * or to the protocol read from the packet otherwise. + * proto != 0 means that we have an IPv4 packet. + * + * src_port, dst_port port numbers, in HOST format. Only + * valid for TCP and UDP packets. + * + * src_ip, dst_ip ip addresses, in NETWORK format. + * Only valid for IPv4 packets. + */ + u_int8_t proto; + u_int16_t src_port = 0, dst_port = 0; /* NOTE: host format */ + struct in_addr src_ip, dst_ip; /* NOTE: network format */ + u_int16_t ip_len=0; + int pktlen; + int dyn_dir = MATCH_UNKNOWN; + ipfw_dyn_rule *q = NULL; + struct timeval timenow; + + if (m->m_flags & M_SKIP_FIREWALL) { + return 0; /* accept */ + } + + lck_mtx_lock(ipfw_mutex); + + getmicrotime(&timenow); + /* + * dyn_dir = MATCH_UNKNOWN when rules unchecked, + * MATCH_NONE when checked and not matched (q = NULL), + * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) + */ + + pktlen = m->m_pkthdr.len; + if (args->eh == NULL || /* layer 3 packet */ + ( m->m_pkthdr.len >= sizeof(struct ip) && + ntohs(args->eh->ether_type) == ETHERTYPE_IP)) + hlen = ip->ip_hl << 2; + + /* + * Collect parameters into local variables for faster matching. + */ + if (hlen == 0) { /* do not grab addresses for non-ip pkts */ + proto = args->f_id.proto = 0; /* mark f_id invalid */ + goto after_ip_checks; + } + + proto = args->f_id.proto = ip->ip_p; + src_ip = ip->ip_src; + dst_ip = ip->ip_dst; + if (args->eh != NULL) { /* layer 2 packets are as on the wire */ + offset = ntohs(ip->ip_off) & IP_OFFMASK; + ip_len = ntohs(ip->ip_len); + } else { + offset = ip->ip_off & IP_OFFMASK; + ip_len = ip->ip_len; + } + pktlen = ip_len < pktlen ? ip_len : pktlen; + +#define PULLUP_TO(len) \ + do { \ + if ((m)->m_len < (len)) { \ + args->m = m = m_pullup(m, (len)); \ + if (m == 0) \ + goto pullup_failed; \ + ip = mtod(m, struct ip *); \ + } \ + } while (0) + + if (offset == 0) { + switch (proto) { + case IPPROTO_TCP: + { + struct tcphdr *tcp; + + PULLUP_TO(hlen + sizeof(struct tcphdr)); + tcp = L3HDR(struct tcphdr, ip); + dst_port = tcp->th_dport; + src_port = tcp->th_sport; + args->f_id.flags = tcp->th_flags; + } + break; + + case IPPROTO_UDP: + { + struct udphdr *udp; + + PULLUP_TO(hlen + sizeof(struct udphdr)); + udp = L3HDR(struct udphdr, ip); + dst_port = udp->uh_dport; + src_port = udp->uh_sport; + } + break; + + case IPPROTO_ICMP: + PULLUP_TO(hlen + 4); /* type, code and checksum. */ + args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type; + break; + + default: + break; + } +#undef PULLUP_TO + } + + args->f_id.src_ip = ntohl(src_ip.s_addr); + args->f_id.dst_ip = ntohl(dst_ip.s_addr); + args->f_id.src_port = src_port = ntohs(src_port); + args->f_id.dst_port = dst_port = ntohs(dst_port); + +after_ip_checks: + if (args->rule) { + /* + * Packet has already been tagged. Look for the next rule + * to restart processing. + * + * If fw_one_pass != 0 then just accept it. + * XXX should not happen here, but optimized out in + * the caller. + */ + if (fw_one_pass) { + lck_mtx_unlock(ipfw_mutex); + return 0; + } + + f = args->rule->next_rule; + if (f == NULL) + f = lookup_next_rule(args->rule); + } else { + /* + * Find the starting rule. It can be either the first + * one, or the one after divert_rule if asked so. 
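+	 * For example (illustrative): a packet re-injected by a divert
+	 * socket with divert cookie 300 resumes matching at the first
+	 * rule numbered above 300, rather than at the head of the chain.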
+ */ + int skipto = args->divert_rule; + + f = layer3_chain; + if (args->eh == NULL && skipto != 0) { + if (skipto >= IPFW_DEFAULT_RULE) { + lck_mtx_unlock(ipfw_mutex); + return(IP_FW_PORT_DENY_FLAG); /* invalid */ + } + while (f && f->rulenum <= skipto) + f = f->next; + if (f == NULL) { /* drop packet */ + lck_mtx_unlock(ipfw_mutex); + return(IP_FW_PORT_DENY_FLAG); + } + } + } + args->divert_rule = 0; /* reset to avoid confusion later */ + + /* + * Now scan the rules, and parse microinstructions for each rule. + */ + for (; f; f = f->next) { + int l, cmdlen; + ipfw_insn *cmd; + int skip_or; /* skip rest of OR block */ + +again: + if (f->reserved_1 == IPFW_RULE_INACTIVE) { + continue; + } + + if (set_disable & (1 << f->set) ) + continue; + + skip_or = 0; + for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; + l -= cmdlen, cmd += cmdlen) { + int match; + + /* + * check_body is a jump target used when we find a + * CHECK_STATE, and need to jump to the body of + * the target rule. + */ + +check_body: + cmdlen = F_LEN(cmd); + /* + * An OR block (insn_1 || .. || insn_n) has the + * F_OR bit set in all but the last instruction. + * The first match will set "skip_or", and cause + * the following instructions to be skipped until + * past the one with the F_OR bit clear. + */ + if (skip_or) { /* skip this instruction */ + if ((cmd->len & F_OR) == 0) + skip_or = 0; /* next one is good */ + continue; + } + match = 0; /* set to 1 if we succeed */ + + switch (cmd->opcode) { + /* + * The first set of opcodes compares the packet's + * fields with some pattern, setting 'match' if a + * match is found. At the end of the loop there is + * logic to deal with F_NOT and F_OR flags associated + * with the opcode. + */ + case O_NOP: + match = 1; + break; + + case O_FORWARD_MAC: + printf("ipfw: opcode %d unimplemented\n", + cmd->opcode); + break; + +#ifndef __APPLE__ + case O_GID: +#endif + case O_UID: + /* + * We only check offset == 0 && proto != 0, + * as this ensures that we have an IPv4 + * packet with the ports info. + */ + if (offset!=0) + break; + + { + struct inpcbinfo *pi; + int wildcard; + struct inpcb *pcb; + + if (proto == IPPROTO_TCP) { + wildcard = 0; + pi = &tcbinfo; + } else if (proto == IPPROTO_UDP) { + wildcard = 1; + pi = &udbinfo; + } else + break; + + pcb = (oif) ? + in_pcblookup_hash(pi, + dst_ip, htons(dst_port), + src_ip, htons(src_port), + wildcard, oif) : + in_pcblookup_hash(pi, + src_ip, htons(src_port), + dst_ip, htons(dst_port), + wildcard, NULL); + + if (pcb == NULL || pcb->inp_socket == NULL) + break; +#if __FreeBSD_version < 500034 +#define socheckuid(a,b) (kauth_cred_getuid((a)->so_cred) != (b)) +#endif + if (cmd->opcode == O_UID) { + match = +#ifdef __APPLE__ + (pcb->inp_socket->so_uid == (uid_t)((ipfw_insn_u32 *)cmd)->d[0]); +#else + !socheckuid(pcb->inp_socket, + (uid_t)((ipfw_insn_u32 *)cmd)->d[0]); +#endif + } +#ifndef __APPLE__ + else { + match = 0; + kauth_cred_ismember_gid(pcb->inp_socket->so_cred, + (gid_t)((ipfw_insn_u32 *)cmd)->d[0], &match); + } +#endif + } + + break; + + case O_RECV: + match = iface_match(m->m_pkthdr.rcvif, + (ipfw_insn_if *)cmd); + break; + + case O_XMIT: + match = iface_match(oif, (ipfw_insn_if *)cmd); + break; + + case O_VIA: + match = iface_match(oif ? 
oif : + m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd); + break; + + case O_MACADDR2: + if (args->eh != NULL) { /* have MAC header */ + u_int32_t *want = (u_int32_t *) + ((ipfw_insn_mac *)cmd)->addr; + u_int32_t *mask = (u_int32_t *) + ((ipfw_insn_mac *)cmd)->mask; + u_int32_t *hdr = (u_int32_t *)args->eh; + + match = + ( want[0] == (hdr[0] & mask[0]) && + want[1] == (hdr[1] & mask[1]) && + want[2] == (hdr[2] & mask[2]) ); + } + break; + + case O_MAC_TYPE: + if (args->eh != NULL) { + u_int16_t t = + ntohs(args->eh->ether_type); + u_int16_t *p = + ((ipfw_insn_u16 *)cmd)->ports; + int i; + + for (i = cmdlen - 1; !match && i>0; + i--, p += 2) + match = (t>=p[0] && t<=p[1]); + } + break; + + case O_FRAG: + match = (hlen > 0 && offset != 0); + break; + + case O_IN: /* "out" is "not in" */ + match = (oif == NULL); + break; + + case O_LAYER2: + match = (args->eh != NULL); + break; + + case O_PROTO: + /* + * We do not allow an arg of 0 so the + * check of "proto" only suffices. + */ + match = (proto == cmd->arg1); + break; + + case O_IP_SRC: + match = (hlen > 0 && + ((ipfw_insn_ip *)cmd)->addr.s_addr == + src_ip.s_addr); + break; + + case O_IP_SRC_MASK: + case O_IP_DST_MASK: + if (hlen > 0) { + uint32_t a = + (cmd->opcode == O_IP_DST_MASK) ? + dst_ip.s_addr : src_ip.s_addr; + uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; + int i = cmdlen-1; + + for (; !match && i>0; i-= 2, p+= 2) + match = (p[0] == (a & p[1])); + } + break; + + case O_IP_SRC_ME: + if (hlen > 0) { + struct ifnet *tif; + + INADDR_TO_IFP(src_ip, tif); + match = (tif != NULL); + } + break; + + case O_IP_DST_SET: + case O_IP_SRC_SET: + if (hlen > 0) { + u_int32_t *d = (u_int32_t *)(cmd+1); + u_int32_t addr = + cmd->opcode == O_IP_DST_SET ? + args->f_id.dst_ip : + args->f_id.src_ip; + + if (addr < d[0]) + break; + addr -= d[0]; /* subtract base */ + match = (addr < cmd->arg1) && + ( d[ 1 + (addr>>5)] & + (1<<(addr & 0x1f)) ); + } + break; + + case O_IP_DST: + match = (hlen > 0 && + ((ipfw_insn_ip *)cmd)->addr.s_addr == + dst_ip.s_addr); + break; + + case O_IP_DST_ME: + if (hlen > 0) { + struct ifnet *tif; + + INADDR_TO_IFP(dst_ip, tif); + match = (tif != NULL); + } + break; + + case O_IP_SRCPORT: + case O_IP_DSTPORT: + /* + * offset == 0 && proto != 0 is enough + * to guarantee that we have an IPv4 + * packet with port info. + */ + if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP) + && offset == 0) { + u_int16_t x = + (cmd->opcode == O_IP_SRCPORT) ? 
+ src_port : dst_port ; + u_int16_t *p = + ((ipfw_insn_u16 *)cmd)->ports; + int i; + + for (i = cmdlen - 1; !match && i>0; + i--, p += 2) + match = (x>=p[0] && x<=p[1]); + } + break; + + case O_ICMPTYPE: + match = (offset == 0 && proto==IPPROTO_ICMP && + icmptype_match(ip, (ipfw_insn_u32 *)cmd) ); + break; + + case O_IPOPT: + match = (hlen > 0 && ipopts_match(ip, cmd) ); + break; + + case O_IPVER: + match = (hlen > 0 && cmd->arg1 == ip->ip_v); + break; + + case O_IPID: + case O_IPLEN: + case O_IPTTL: + if (hlen > 0) { /* only for IP packets */ + uint16_t x; + uint16_t *p; + int i; + + if (cmd->opcode == O_IPLEN) + x = ip_len; + else if (cmd->opcode == O_IPTTL) + x = ip->ip_ttl; + else /* must be IPID */ + x = ntohs(ip->ip_id); + if (cmdlen == 1) { + match = (cmd->arg1 == x); + break; + } + /* otherwise we have ranges */ + p = ((ipfw_insn_u16 *)cmd)->ports; + i = cmdlen - 1; + for (; !match && i>0; i--, p += 2) + match = (x >= p[0] && x <= p[1]); + } + break; + + case O_IPPRECEDENCE: + match = (hlen > 0 && + (cmd->arg1 == (ip->ip_tos & 0xe0)) ); + break; + + case O_IPTOS: + match = (hlen > 0 && + flags_match(cmd, ip->ip_tos)); + break; + + case O_TCPFLAGS: + match = (proto == IPPROTO_TCP && offset == 0 && + flags_match(cmd, + L3HDR(struct tcphdr,ip)->th_flags)); + break; + + case O_TCPOPTS: + match = (proto == IPPROTO_TCP && offset == 0 && + tcpopts_match(ip, cmd)); + break; + + case O_TCPSEQ: + match = (proto == IPPROTO_TCP && offset == 0 && + ((ipfw_insn_u32 *)cmd)->d[0] == + L3HDR(struct tcphdr,ip)->th_seq); + break; + + case O_TCPACK: + match = (proto == IPPROTO_TCP && offset == 0 && + ((ipfw_insn_u32 *)cmd)->d[0] == + L3HDR(struct tcphdr,ip)->th_ack); + break; + + case O_TCPWIN: + match = (proto == IPPROTO_TCP && offset == 0 && + cmd->arg1 == + L3HDR(struct tcphdr,ip)->th_win); + break; + + case O_ESTAB: + /* reject packets which have SYN only */ + /* XXX should i also check for TH_ACK ? */ + match = (proto == IPPROTO_TCP && offset == 0 && + (L3HDR(struct tcphdr,ip)->th_flags & + (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); + break; + + case O_LOG: + if (fw_verbose) + ipfw_log(f, hlen, args->eh, m, oif); + match = 1; + break; + + case O_PROB: + match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); + break; + + case O_VERREVPATH: + /* Outgoing packets automatically pass/match */ + match = ((oif != NULL) || + (m->m_pkthdr.rcvif == NULL) || + verify_rev_path(src_ip, m->m_pkthdr.rcvif)); + break; + + case O_IPSEC: +#ifdef FAST_IPSEC + match = (m_tag_find(m, + PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); +#endif +#ifdef IPSEC + match = (ipsec_gethist(m, NULL) != NULL); +#endif + /* otherwise no match */ + break; + + /* + * The second set of opcodes represents 'actions', + * i.e. the terminal part of a rule once the packet + * matches all previous patterns. + * Typically there is only one action for each rule, + * and the opcode is stored at the end of the rule + * (but there are exceptions -- see below). + * + * In general, here we set retval and terminate the + * outer loop (would be a 'break 3' in some language, + * but we need to do a 'goto done'). + * + * Exceptions: + * O_COUNT and O_SKIPTO actions: + * instead of terminating, we jump to the next rule + * ('goto next_rule', equivalent to a 'break 2'), + * or to the SKIPTO target ('goto again' after + * having set f, cmd and l), respectively. + * + * O_LIMIT and O_KEEP_STATE: these opcodes are + * not real 'actions', and are stored right + * before the 'action' part of the rule. 
+ * These opcodes try to install an entry in the + * state tables; if successful, we continue with + * the next opcode (match=1; break;), otherwise + * the packet * must be dropped + * ('goto done' after setting retval); + * + * O_PROBE_STATE and O_CHECK_STATE: these opcodes + * cause a lookup of the state table, and a jump + * to the 'action' part of the parent rule + * ('goto check_body') if an entry is found, or + * (CHECK_STATE only) a jump to the next rule if + * the entry is not found ('goto next_rule'). + * The result of the lookup is cached to make + * further instances of these opcodes are + * effectively NOPs. + */ + case O_LIMIT: + case O_KEEP_STATE: + if (install_state(f, + (ipfw_insn_limit *)cmd, args)) { + retval = IP_FW_PORT_DENY_FLAG; + goto done; /* error/limit violation */ + } + match = 1; + break; + + case O_PROBE_STATE: + case O_CHECK_STATE: + /* + * dynamic rules are checked at the first + * keep-state or check-state occurrence, + * with the result being stored in dyn_dir. + * The compiler introduces a PROBE_STATE + * instruction for us when we have a + * KEEP_STATE (because PROBE_STATE needs + * to be run first). + */ + if (dyn_dir == MATCH_UNKNOWN && + (q = lookup_dyn_rule(&args->f_id, + &dyn_dir, proto == IPPROTO_TCP ? + L3HDR(struct tcphdr, ip) : NULL)) + != NULL) { + /* + * Found dynamic entry, update stats + * and jump to the 'action' part of + * the parent rule. + */ + q->pcnt++; + q->bcnt += pktlen; + f = q->rule; + cmd = ACTION_PTR(f); + l = f->cmd_len - f->act_ofs; + goto check_body; + } + /* + * Dynamic entry not found. If CHECK_STATE, + * skip to next rule, if PROBE_STATE just + * ignore and continue with next opcode. + */ + if (cmd->opcode == O_CHECK_STATE) + goto next_rule; + match = 1; + break; + + case O_ACCEPT: + retval = 0; /* accept */ + goto done; + + case O_PIPE: + case O_QUEUE: + args->rule = f; /* report matching rule */ + retval = cmd->arg1 | IP_FW_PORT_DYNT_FLAG; + goto done; + + case O_DIVERT: + case O_TEE: + if (args->eh) /* not on layer 2 */ + break; + args->divert_rule = f->rulenum; + retval = (cmd->opcode == O_DIVERT) ? + cmd->arg1 : + cmd->arg1 | IP_FW_PORT_TEE_FLAG; + goto done; + + case O_COUNT: + case O_SKIPTO: + f->pcnt++; /* update stats */ + f->bcnt += pktlen; + f->timestamp = timenow.tv_sec; + if (cmd->opcode == O_COUNT) + goto next_rule; + /* handle skipto */ + if (f->next_rule == NULL) + lookup_next_rule(f); + f = f->next_rule; + goto again; + + case O_REJECT: + /* + * Drop the packet and send a reject notice + * if the packet is not ICMP (or is an ICMP + * query), and it is not multicast/broadcast. 
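+			 * (Illustrative: arg1 == ICMP_REJECT_RST produces a
+			 * TCP RST via send_pkt(), while any other arg1 is
+			 * sent as the code of an ICMP unreachable via
+			 * icmp_error(); see send_reject() above.)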
+ */ + if (hlen > 0 && + (proto != IPPROTO_ICMP || + is_icmp_query(ip)) && + !(m->m_flags & (M_BCAST|M_MCAST)) && + !IN_MULTICAST(dst_ip.s_addr)) { + send_reject(args, cmd->arg1, + offset,ip_len); + m = args->m; + } + /* FALLTHROUGH */ + case O_DENY: + retval = IP_FW_PORT_DENY_FLAG; + goto done; + + case O_FORWARD_IP: + if (args->eh) /* not valid on layer2 pkts */ + break; + if (!q || dyn_dir == MATCH_FORWARD) + args->next_hop = + &((ipfw_insn_sa *)cmd)->sa; + retval = 0; + goto done; + + default: + panic("-- unknown opcode %d\n", cmd->opcode); + } /* end of switch() on opcodes */ + + if (cmd->len & F_NOT) + match = !match; + + if (match) { + if (cmd->len & F_OR) + skip_or = 1; + } else { + if (!(cmd->len & F_OR)) /* not an OR block, */ + break; /* try next rule */ + } + + } /* end of inner for, scan opcodes */ + +next_rule:; /* try next rule */ + + } /* end of outer for, scan rules */ + printf("ipfw: ouch!, skip past end of rules, denying packet\n"); + lck_mtx_unlock(ipfw_mutex); + return(IP_FW_PORT_DENY_FLAG); + +done: + /* Update statistics */ + f->pcnt++; + f->bcnt += pktlen; + f->timestamp = timenow.tv_sec; + lck_mtx_unlock(ipfw_mutex); + return retval; + +pullup_failed: + if (fw_verbose) + printf("ipfw: pullup failed\n"); + lck_mtx_unlock(ipfw_mutex); + return(IP_FW_PORT_DENY_FLAG); +} + +/* + * When a rule is added/deleted, clear the next_rule pointers in all rules. + * These will be reconstructed on the fly as packets are matched. + * Must be called at splimp(). + */ +static void +flush_rule_ptrs(void) +{ + struct ip_fw *rule; + + for (rule = layer3_chain; rule; rule = rule->next) + rule->next_rule = NULL; +} + +/* + * When pipes/queues are deleted, clear the "pipe_ptr" pointer to a given + * pipe/queue, or to all of them (match == NULL). + * Must be called at splimp(). + */ +void +flush_pipe_ptrs(struct dn_flow_set *match) +{ + struct ip_fw *rule; + + for (rule = layer3_chain; rule; rule = rule->next) { + ipfw_insn_pipe *cmd = (ipfw_insn_pipe *)ACTION_PTR(rule); + + if (cmd->o.opcode != O_PIPE && cmd->o.opcode != O_QUEUE) + continue; + /* + * XXX Use bcmp/bzero to handle pipe_ptr to overcome + * possible alignment problems on 64-bit architectures. + * This code is seldom used so we do not worry too + * much about efficiency. + */ + if (match == NULL || + !bcmp(&cmd->pipe_ptr, &match, sizeof(match)) ) + bzero(&cmd->pipe_ptr, sizeof(cmd->pipe_ptr)); + } +} + +/* + * Add a new rule to the list. Copy the rule into a malloc'ed area, then + * possibly create a rule number and add the rule to the list. + * Update the rule_number in the input struct so the caller knows it as well. 
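+ * For example (illustrative): with autoinc_step == 100 and rules 100
+ * and 200 already installed in front of the default rule, a rule
+ * submitted with rulenum == 0 is installed as rule 300.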
+ */ +static int +add_rule(struct ip_fw **head, struct ip_fw *input_rule) +{ + struct ip_fw *rule, *f, *prev; + int s; + int l = RULESIZE(input_rule); + + if (*head == NULL && input_rule->rulenum != IPFW_DEFAULT_RULE) + return (EINVAL); + + rule = _MALLOC(l, M_IPFW, M_WAIT); + if (rule == NULL) { + printf("ipfw2: add_rule MALLOC failed\n"); + return (ENOSPC); + } + + bzero(rule, l); + bcopy(input_rule, rule, l); + + rule->next = NULL; + rule->next_rule = NULL; + + rule->pcnt = 0; + rule->bcnt = 0; + rule->timestamp = 0; + + if (*head == NULL) { /* default rule */ + *head = rule; + goto done; + } + + /* + * If rulenum is 0, find highest numbered rule before the + * default rule, and add autoinc_step + */ + if (autoinc_step < 1) + autoinc_step = 1; + else if (autoinc_step > 1000) + autoinc_step = 1000; + if (rule->rulenum == 0) { + /* + * locate the highest numbered rule before default + */ + for (f = *head; f; f = f->next) { + if (f->rulenum == IPFW_DEFAULT_RULE) + break; + rule->rulenum = f->rulenum; + } + if (rule->rulenum < IPFW_DEFAULT_RULE - autoinc_step) + rule->rulenum += autoinc_step; + input_rule->rulenum = rule->rulenum; + } + + /* + * Now insert the new rule in the right place in the sorted list. + */ + for (prev = NULL, f = *head; f; prev = f, f = f->next) { + if (f->rulenum > rule->rulenum) { /* found the location */ + if (prev) { + rule->next = f; + prev->next = rule; + } else { /* head insert */ + rule->next = *head; + *head = rule; + } + break; + } + } + flush_rule_ptrs(); +done: + static_count++; + static_len += l; + DEB(printf("ipfw: installed rule %d, static count now %d\n", + rule->rulenum, static_count);) + return (0); +} + +/** + * Free storage associated with a static rule (including derived + * dynamic rules). + * The caller is in charge of clearing rule pointers to avoid + * dangling pointers. + * @return a pointer to the next entry. + * Arguments are not checked, so they better be correct. + * Must be called at splimp(). 
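+ * Typical use (sketch): walk the chain with a trailing 'prev' pointer
+ * and let the return value advance the scan,
+ *	rule = delete_rule(chain, prev, rule);
+ * as free_chain() and del_entry() below do.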
+ */ +static struct ip_fw * +delete_rule(struct ip_fw **head, struct ip_fw *prev, struct ip_fw *rule) +{ + struct ip_fw *n; + int l = RULESIZE(rule); + + n = rule->next; + remove_dyn_rule(rule, NULL /* force removal */); + if (prev == NULL) + *head = n; + else + prev->next = n; + static_count--; + static_len -= l; + +#if DUMMYNET + if (DUMMYNET_LOADED) + ip_dn_ruledel_ptr(rule); +#endif /* DUMMYNET */ + _FREE(rule, M_IPFW); + return n; +} + +#if DEBUG_INACTIVE_RULES +static void +print_chain(struct ip_fw **chain) +{ + struct ip_fw *rule = *chain; + + for (; rule; rule = rule->next) { + ipfw_insn *cmd = ACTION_PTR(rule); + + printf("ipfw: rule->rulenum = %d\n", rule->rulenum); + + if (rule->reserved_1 == IPFW_RULE_INACTIVE) { + printf("ipfw: rule->reserved = IPFW_RULE_INACTIVE\n"); + } + + switch (cmd->opcode) { + case O_DENY: + printf("ipfw: ACTION: Deny\n"); + break; + + case O_REJECT: + if (cmd->arg1==ICMP_REJECT_RST) + printf("ipfw: ACTION: Reset\n"); + else if (cmd->arg1==ICMP_UNREACH_HOST) + printf("ipfw: ACTION: Reject\n"); + break; + + case O_ACCEPT: + printf("ipfw: ACTION: Accept\n"); + break; + case O_COUNT: + printf("ipfw: ACTION: Count\n"); + break; + case O_DIVERT: + printf("ipfw: ACTION: Divert\n"); + break; + case O_TEE: + printf("ipfw: ACTION: Tee\n"); + break; + case O_SKIPTO: + printf("ipfw: ACTION: SkipTo\n"); + break; + case O_PIPE: + printf("ipfw: ACTION: Pipe\n"); + break; + case O_QUEUE: + printf("ipfw: ACTION: Queue\n"); + break; + case O_FORWARD_IP: + printf("ipfw: ACTION: Forward\n"); + break; + default: + printf("ipfw: invalid action! %d\n", cmd->opcode); + } + } +} +#endif /* DEBUG_INACTIVE_RULES */ + +static void +flush_inactive(void *param) +{ + struct ip_fw *inactive_rule = (struct ip_fw *)param; + struct ip_fw *rule, *prev; + + lck_mtx_lock(ipfw_mutex); + + for (rule = layer3_chain, prev = NULL; rule; ) { + if (rule == inactive_rule && rule->reserved_1 == IPFW_RULE_INACTIVE) { + struct ip_fw *n = rule; + + if (prev == NULL) { + layer3_chain = rule->next; + } + else { + prev->next = rule->next; + } + rule = rule->next; + _FREE(n, M_IPFW); + } + else { + prev = rule; + rule = rule->next; + } + } + +#if DEBUG_INACTIVE_RULES + print_chain(&layer3_chain); +#endif + lck_mtx_unlock(ipfw_mutex); +} + +static void +mark_inactive(struct ip_fw **prev, struct ip_fw **rule) +{ + int l = RULESIZE(*rule); + + if ((*rule)->reserved_1 != IPFW_RULE_INACTIVE) { + (*rule)->reserved_1 = IPFW_RULE_INACTIVE; + static_count--; + static_len -= l; + + timeout(flush_inactive, *rule, 30*hz); /* 30 sec. */ + } + + *prev = *rule; + *rule = (*rule)->next; +} + +/* + * Deletes all rules from a chain (except rules in set RESVD_SET + * unless kill_default = 1). + * Must be called at splimp(). + */ +static void +free_chain(struct ip_fw **chain, int kill_default) +{ + struct ip_fw *prev, *rule; + + flush_rule_ptrs(); /* more efficient to do outside the loop */ + for (prev = NULL, rule = *chain; rule ; ) + if (kill_default || rule->set != RESVD_SET) { + ipfw_insn *cmd = ACTION_PTR(rule); + + /* skip over forwarding rules so struct isn't + * deleted while pointer is still in use elsewhere + */ + if (cmd->opcode == O_FORWARD_IP) { + mark_inactive(&prev, &rule); + } + else { + rule = delete_rule(chain, prev, rule); + } + } + else { + prev = rule; + rule = rule->next; + } +} + +/** + * Remove all rules with given number, and also do set manipulation. + * Assumes chain != NULL && *chain != NULL. + * + * The argument is an u_int32_t. 
The low 16 bit are the rule or set number, + * the next 8 bits are the new set, the top 8 bits are the command: + * + * 0 delete rules with given number + * 1 delete rules with given set number + * 2 move rules with given number to new set + * 3 move rules with given set number to new set + * 4 swap sets with given numbers + */ +static int +del_entry(struct ip_fw **chain, u_int32_t arg) +{ + struct ip_fw *prev = NULL, *rule = *chain; + int s; + u_int16_t rulenum; /* rule or old_set */ + u_int8_t cmd, new_set; + + rulenum = arg & 0xffff; + cmd = (arg >> 24) & 0xff; + new_set = (arg >> 16) & 0xff; + + if (cmd > 4) + return EINVAL; + if (new_set > RESVD_SET) + return EINVAL; + if (cmd == 0 || cmd == 2) { + if (rulenum >= IPFW_DEFAULT_RULE) + return EINVAL; + } else { + if (rulenum > RESVD_SET) /* old_set */ + return EINVAL; + } + + switch (cmd) { + case 0: /* delete rules with given number */ + /* + * locate first rule to delete + */ + for (; rule->rulenum < rulenum; prev = rule, rule = rule->next) + ; + if (rule->rulenum != rulenum) + return EINVAL; + + /* + * flush pointers outside the loop, then delete all matching + * rules. prev remains the same throughout the cycle. + */ + flush_rule_ptrs(); + while (rule->rulenum == rulenum) { + ipfw_insn *cmd = ACTION_PTR(rule); + + /* keep forwarding rules around so struct isn't + * deleted while pointer is still in use elsewhere + */ + if (cmd->opcode == O_FORWARD_IP) { + mark_inactive(&prev, &rule); + } + else { + rule = delete_rule(chain, prev, rule); + } + } + break; + + case 1: /* delete all rules with given set number */ + flush_rule_ptrs(); + while (rule->rulenum < IPFW_DEFAULT_RULE) { + if (rule->set == rulenum) { + ipfw_insn *cmd = ACTION_PTR(rule); + + /* keep forwarding rules around so struct isn't + * deleted while pointer is still in use elsewhere + */ + if (cmd->opcode == O_FORWARD_IP) { + mark_inactive(&prev, &rule); + } + else { + rule = delete_rule(chain, prev, rule); + } + } + else { + prev = rule; + rule = rule->next; + } + } + break; + + case 2: /* move rules with given number to new set */ + for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next) + if (rule->rulenum == rulenum) + rule->set = new_set; + break; + + case 3: /* move rules with given set number to new set */ + for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next) + if (rule->set == rulenum) + rule->set = new_set; + break; + + case 4: /* swap two sets */ + for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next) + if (rule->set == rulenum) + rule->set = new_set; + else if (rule->set == new_set) + rule->set = rulenum; + break; + } + return 0; +} + +/* + * Clear counters for a specific rule. + */ +static void +clear_counters(struct ip_fw *rule, int log_only) +{ + ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); + + if (log_only == 0) { + rule->bcnt = rule->pcnt = 0; + rule->timestamp = 0; + } + if (l->o.opcode == O_LOG) + l->log_left = l->max_log; +} + +/** + * Reset some or all counters on firewall rules. + * @arg frwl is null to clear all entries, or contains a specific + * rule number. + * @arg log_only is 1 if we only want to reset logs, zero otherwise. + */ +static int +zero_entry(int rulenum, int log_only) +{ + struct ip_fw *rule; + int s; + char *msg; + + if (rulenum == 0) { + norule_counter = 0; + for (rule = layer3_chain; rule; rule = rule->next) + clear_counters(rule, log_only); + msg = log_only ? 
"ipfw: All logging counts reset.\n" : + "ipfw: Accounting cleared.\n"; + } else { + int cleared = 0; + /* + * We can have multiple rules with the same number, so we + * need to clear them all. + */ + for (rule = layer3_chain; rule; rule = rule->next) + if (rule->rulenum == rulenum) { + while (rule && rule->rulenum == rulenum) { + clear_counters(rule, log_only); + rule = rule->next; + } + cleared = 1; + break; + } + if (!cleared) /* we did not find any matching rules */ + return (EINVAL); + msg = log_only ? "ipfw: Entry %d logging count reset.\n" : + "ipfw: Entry %d cleared.\n"; + } + if (fw_verbose) + { + dolog((LOG_AUTHPRIV | LOG_NOTICE, msg, rulenum)); + } + return (0); +} + +/* + * Check validity of the structure before insert. + * Fortunately rules are simple, so this mostly need to check rule sizes. + */ +static int +check_ipfw_struct(struct ip_fw *rule, int size) +{ + int l, cmdlen = 0; + int have_action=0; + ipfw_insn *cmd; + + if (size < sizeof(*rule)) { + printf("ipfw: rule too short\n"); + return (EINVAL); + } + /* first, check for valid size */ + l = RULESIZE(rule); + if (l != size) { + printf("ipfw: size mismatch (have %d want %d)\n", size, l); + return (EINVAL); + } + /* + * Now go for the individual checks. Very simple ones, basically only + * instruction sizes. + */ + for (l = rule->cmd_len, cmd = rule->cmd ; + l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + if (cmdlen > l) { + printf("ipfw: opcode %d size truncated\n", + cmd->opcode); + return EINVAL; + } + DEB(printf("ipfw: opcode %d\n", cmd->opcode);) + switch (cmd->opcode) { + case O_PROBE_STATE: + case O_KEEP_STATE: + case O_PROTO: + case O_IP_SRC_ME: + case O_IP_DST_ME: + case O_LAYER2: + case O_IN: + case O_FRAG: + case O_IPOPT: + case O_IPTOS: + case O_IPPRECEDENCE: + case O_IPVER: + case O_TCPWIN: + case O_TCPFLAGS: + case O_TCPOPTS: + case O_ESTAB: + case O_VERREVPATH: + case O_IPSEC: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + break; + case O_UID: +#ifndef __APPLE__ + case O_GID: +#endif /* __APPLE__ */ + case O_IP_SRC: + case O_IP_DST: + case O_TCPSEQ: + case O_TCPACK: + case O_PROB: + case O_ICMPTYPE: + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) + goto bad_size; + break; + + case O_LIMIT: + if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) + goto bad_size; + break; + + case O_LOG: + if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) + goto bad_size; + + /* enforce logging limit */ + if (fw_verbose && + ((ipfw_insn_log *)cmd)->max_log == 0 && verbose_limit != 0) { + ((ipfw_insn_log *)cmd)->max_log = verbose_limit; + } + + ((ipfw_insn_log *)cmd)->log_left = + ((ipfw_insn_log *)cmd)->max_log; + + break; + + case O_IP_SRC_MASK: + case O_IP_DST_MASK: + /* only odd command lengths */ + if ( !(cmdlen & 1) || cmdlen > 31) + goto bad_size; + break; + + case O_IP_SRC_SET: + case O_IP_DST_SET: + if (cmd->arg1 == 0 || cmd->arg1 > 256) { + printf("ipfw: invalid set size %d\n", + cmd->arg1); + return EINVAL; + } + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + + (cmd->arg1+31)/32 ) + goto bad_size; + break; + + case O_MACADDR2: + if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) + goto bad_size; + break; + + case O_NOP: + case O_IPID: + case O_IPTTL: + case O_IPLEN: + if (cmdlen < 1 || cmdlen > 31) + goto bad_size; + break; + + case O_MAC_TYPE: + case O_IP_SRCPORT: + case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ + if (cmdlen < 2 || cmdlen > 31) + goto bad_size; + break; + + case O_RECV: + case O_XMIT: + case O_VIA: + if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) + goto bad_size; + break; + + case O_PIPE: + case 
O_QUEUE: + if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe)) + goto bad_size; + goto check_action; + + case O_FORWARD_IP: + if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) + goto bad_size; + goto check_action; + + case O_FORWARD_MAC: /* XXX not implemented yet */ + case O_CHECK_STATE: + case O_COUNT: + case O_ACCEPT: + case O_DENY: + case O_REJECT: + case O_SKIPTO: + case O_DIVERT: + case O_TEE: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; +check_action: + if (have_action) { + printf("ipfw: opcode %d, multiple actions" + " not allowed\n", + cmd->opcode); + return EINVAL; + } + have_action = 1; + if (l != cmdlen) { + printf("ipfw: opcode %d, action must be" + " last opcode\n", + cmd->opcode); + return EINVAL; + } + break; + default: + printf("ipfw: opcode %d, unknown opcode\n", + cmd->opcode); + return EINVAL; + } + } + if (have_action == 0) { + printf("ipfw: missing action\n"); + return EINVAL; + } + return 0; + +bad_size: + printf("ipfw: opcode %d size %d wrong\n", + cmd->opcode, cmdlen); + return EINVAL; +} + + +/** + * {set|get}sockopt parser. + */ +static int +ipfw_ctl(struct sockopt *sopt) +{ +#define RULE_MAXSIZE (256*sizeof(u_int32_t)) + u_int32_t api_version; + int command; + int error, s; + size_t size; + struct ip_fw *bp , *buf, *rule; + + /* copy of orig sopt to send to ipfw_get_command_and_version() */ + struct sockopt tmp_sopt = *sopt; + struct timeval timenow; + + getmicrotime(&timenow); + + /* + * Disallow modifications in really-really secure mode, but still allow + * the logging counters to be reset. + */ + if (sopt->sopt_name == IP_FW_ADD || + (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) { +#if __FreeBSD_version >= 500034 + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); + if (error) + return (error); +#else /* FreeBSD 4.x */ + if (securelevel >= 3) + return (EPERM); +#endif + } + + /* first get the command and version, then do conversion as necessary */ + error = ipfw_get_command_and_version(&tmp_sopt, &command, &api_version); + + if (error) { + /* error getting the version */ + return error; + } + + switch (command) { + case IP_FW_GET: + /* + * pass up a copy of the current rules. Static rules + * come first (the last of which has number IPFW_DEFAULT_RULE), + * followed by a possibly empty list of dynamic rule. + * The last dynamic rule has NULL in the "next" field. + */ + lck_mtx_lock(ipfw_mutex); + size = static_len; /* size of static rules */ + if (ipfw_dyn_v) /* add size of dyn.rules */ + size += (dyn_count * sizeof(ipfw_dyn_rule)); + + /* + * XXX todo: if the user passes a short length just to know + * how much room is needed, do not bother filling up the + * buffer, just jump to the sooptcopyout. + */ + buf = _MALLOC(size, M_TEMP, M_WAITOK); + if (buf == 0) { + lck_mtx_unlock(ipfw_mutex); + error = ENOBUFS; + break; + } + + bzero(buf, size); + + bp = buf; + for (rule = layer3_chain; rule ; rule = rule->next) { + int i = RULESIZE(rule); + + if (rule->reserved_1 == IPFW_RULE_INACTIVE) { + continue; + } + bcopy(rule, bp, i); + bcopy(&set_disable, &(bp->next_rule), + sizeof(set_disable)); + bp = (struct ip_fw *)((char *)bp + i); + } + if (ipfw_dyn_v) { + int i; + ipfw_dyn_rule *p, *dst, *last = NULL; + + dst = (ipfw_dyn_rule *)bp; + for (i = 0 ; i < curr_dyn_buckets ; i++ ) + for ( p = ipfw_dyn_v[i] ; p != NULL ; + p = p->next, dst++ ) { + bcopy(p, dst, sizeof *p); + bcopy(&(p->rule->rulenum), &(dst->rule), + sizeof(p->rule->rulenum)); + /* + * store a non-null value in "next". 
+ * The userland code will interpret a + * NULL here as a marker + * for the last dynamic rule. + */ + bcopy(&dst, &dst->next, sizeof(dst)); + last = dst ; + dst->expire = + TIME_LEQ(dst->expire, timenow.tv_sec) ? + 0 : dst->expire - timenow.tv_sec ; + } + if (last != NULL) /* mark last dynamic rule */ + bzero(&last->next, sizeof(last)); + } + lck_mtx_unlock(ipfw_mutex); + + /* convert back if necessary and copyout */ + if (api_version == IP_FW_VERSION_0) { + int i, len = 0; + struct ip_old_fw *buf2, *rule_vers0; + + buf2 = _MALLOC(static_count * sizeof(struct ip_old_fw), M_TEMP, M_WAITOK); + if (buf2 == 0) { + error = ENOBUFS; + } + + if (!error) { + bp = buf; + rule_vers0 = buf2; + + for (i = 0; i < static_count; i++) { + /* static rules have different sizes */ + int j = RULESIZE(bp); + ipfw_convert_from_latest(bp, rule_vers0, api_version); + bp = (struct ip_fw *)((char *)bp + j); + len += sizeof(*rule_vers0); + rule_vers0++; + } + error = sooptcopyout(sopt, buf2, len); + _FREE(buf2, M_TEMP); + } + } else if (api_version == IP_FW_VERSION_1) { + int i, len = 0, buf_size; + struct ip_fw_compat *buf2, *rule_vers1; + struct ipfw_dyn_rule_compat *dyn_rule_vers1, *dyn_last = NULL; + ipfw_dyn_rule *p; + + buf_size = static_count * sizeof(struct ip_fw_compat) + + dyn_count * sizeof(struct ipfw_dyn_rule_compat); + + buf2 = _MALLOC(buf_size, M_TEMP, M_WAITOK); + if (buf2 == 0) { + error = ENOBUFS; + } + + if (!error) { + bp = buf; + rule_vers1 = buf2; + + /* first do static rules */ + for (i = 0; i < static_count; i++) { + /* static rules have different sizes */ + int j = RULESIZE(bp); + ipfw_convert_from_latest(bp, rule_vers1, api_version); + bp = (struct ip_fw *)((char *)bp + j); + len += sizeof(*rule_vers1); + rule_vers1++; + } + + /* now do dynamic rules */ + dyn_rule_vers1 = (struct ipfw_dyn_rule_compat *)rule_vers1; + if (ipfw_dyn_v) { + for (i = 0; i < curr_dyn_buckets; i++) { + for ( p = ipfw_dyn_v[i] ; p != NULL ; p = p->next) { + (int) dyn_rule_vers1->chain = p->rule->rulenum; + dyn_rule_vers1->id = p->id; + dyn_rule_vers1->mask = p->id; + dyn_rule_vers1->type = p->dyn_type; + dyn_rule_vers1->expire = p->expire; + dyn_rule_vers1->pcnt = p->pcnt; + dyn_rule_vers1->bcnt = p->bcnt; + dyn_rule_vers1->bucket = p->bucket; + dyn_rule_vers1->state = p->state; + + dyn_rule_vers1->next = dyn_rule_vers1; + dyn_last = dyn_rule_vers1; + + len += sizeof(*dyn_rule_vers1); + dyn_rule_vers1++; + } + } + + if (dyn_last != NULL) { + dyn_last->next = NULL; + } + } + + error = sooptcopyout(sopt, buf2, len); + _FREE(buf2, M_TEMP); + } + } else { + error = sooptcopyout(sopt, buf, size); + } + + _FREE(buf, M_TEMP); + break; + + case IP_FW_FLUSH: + /* + * Normally we cannot release the lock on each iteration. + * We could do it here only because we start from the head all + * the times so there is no risk of missing some entries. + * On the other hand, the risk is that we end up with + * a very inconsistent ruleset, so better keep the lock + * around the whole cycle. + * + * XXX this code can be improved by resetting the head of + * the list to point to the default rule, and then freeing + * the old list without the need for a lock. 
+ */ + + lck_mtx_lock(ipfw_mutex); + free_chain(&layer3_chain, 0 /* keep default rule */); +#if DEBUG_INACTIVE_RULES + print_chain(&layer3_chain); +#endif + lck_mtx_unlock(ipfw_mutex); + break; + + case IP_FW_ADD: + rule = _MALLOC(RULE_MAXSIZE, M_TEMP, M_WAITOK); + if (rule == 0) { + error = ENOBUFS; + break; + } + + bzero(rule, RULE_MAXSIZE); + + if (api_version != IP_FW_CURRENT_API_VERSION) { + error = ipfw_convert_to_latest(sopt, rule, api_version); + } + else { + error = sooptcopyin(sopt, rule, RULE_MAXSIZE, + sizeof(struct ip_fw) ); + } + + if (!error) { + if ((api_version == IP_FW_VERSION_0) || (api_version == IP_FW_VERSION_1)) { + /* the rule has already been checked so just + * adjust sopt_valsize to match what would be expected. + */ + sopt->sopt_valsize = RULESIZE(rule); + } + error = check_ipfw_struct(rule, sopt->sopt_valsize); + if (!error) { + lck_mtx_lock(ipfw_mutex); + error = add_rule(&layer3_chain, rule); + lck_mtx_unlock(ipfw_mutex); + + size = RULESIZE(rule); + if (!error && sopt->sopt_dir == SOPT_GET) { + /* convert back if necessary and copyout */ + if (api_version == IP_FW_VERSION_0) { + struct ip_old_fw rule_vers0; + + ipfw_convert_from_latest(rule, &rule_vers0, api_version); + sopt->sopt_valsize = sizeof(struct ip_old_fw); + + error = sooptcopyout(sopt, &rule_vers0, sizeof(struct ip_old_fw)); + } else if (api_version == IP_FW_VERSION_1) { + struct ip_fw_compat rule_vers1; + + ipfw_convert_from_latest(rule, &rule_vers1, api_version); + sopt->sopt_valsize = sizeof(struct ip_fw_compat); + + error = sooptcopyout(sopt, &rule_vers1, sizeof(struct ip_fw_compat)); + } else { + error = sooptcopyout(sopt, rule, size); + } + } + } + } + + _FREE(rule, M_TEMP); + break; + + case IP_FW_DEL: + { + /* + * IP_FW_DEL is used for deleting single rules or sets, + * and (ab)used to atomically manipulate sets. + * rule->set_masks is used to distinguish between the two: + * rule->set_masks[0] == 0 + * delete single rule or set of rules, + * or reassign rules (or sets) to a different set. + * rule->set_masks[0] != 0 + * atomic disable/enable sets. + * rule->set_masks[0] contains sets to be disabled, + * rule->set_masks[1] contains sets to be enabled. 
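+	 * For example (illustrative): set_masks[0] == (1 << 3) with
+	 * set_masks[1] == (1 << 5) atomically disables set 3 and enables
+	 * set 5; bit RESVD_SET (31) is always forced clear, so the set
+	 * holding the default rule can never be disabled.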
+	 */
+		/* there is only a simple rule passed in
+		 * (no cmds), so use a temp struct to copy
+		 */
+		struct ip_fw	temp_rule = { 0 };
+
+		if (api_version != IP_FW_CURRENT_API_VERSION) {
+			error = ipfw_convert_to_latest(sopt, &temp_rule, api_version);
+		}
+		else {
+			error = sooptcopyin(sopt, &temp_rule, sizeof(struct ip_fw),
+						sizeof(struct ip_fw) );
+		}
+
+		if (!error) {
+			/* set_masks is used to distinguish between deleting
+			 * single rules or atomically manipulating sets
+			 */
+			lck_mtx_lock(ipfw_mutex);
+
+			if (temp_rule.set_masks[0] != 0) {
+				/* set manipulation */
+				set_disable =
+					(set_disable | temp_rule.set_masks[0]) & ~temp_rule.set_masks[1] &
+					~(1<<RESVD_SET); /* set RESVD_SET always enabled */
+			}
+			else {
+				/* single rule */
+				error = del_entry(&layer3_chain, temp_rule.rulenum);
+#if DEBUG_INACTIVE_RULES
+				print_chain(&layer3_chain);
+#endif
+
+			}
+
+			lck_mtx_unlock(ipfw_mutex);
+		}
+		break;
+	}
+	case IP_FW_ZERO:
+	case IP_FW_RESETLOG: /* using rule->rulenum */
+	{
+		/* there is only a simple rule passed in
+		 * (no cmds), so use a temp struct to copy
+		 */
+		struct ip_fw	temp_rule = { 0 };
+
+		if (api_version != IP_FW_CURRENT_API_VERSION) {
+			error = ipfw_convert_to_latest(sopt, &temp_rule, api_version);
+		}
+		else {
+			if (sopt->sopt_val != 0) {
+				error = sooptcopyin(sopt, &temp_rule, sizeof(struct ip_fw),
+							sizeof(struct ip_fw) );
+			}
+		}
+
+		if (!error) {
+			lck_mtx_lock(ipfw_mutex);
+			error = zero_entry(temp_rule.rulenum, sopt->sopt_name == IP_FW_RESETLOG);
+			lck_mtx_unlock(ipfw_mutex);
+		}
+		break;
+	}
+	default:
+		printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name);
+		error = EINVAL;
+	}
+
+	return (error);
+}
+
+/**
+ * dummynet needs a reference to the default rule, because rules can be
+ * deleted while packets hold a reference to them. When this happens,
+ * dummynet changes the reference to the default rule (it could well be a
+ * NULL pointer, but this way we do not need to check for the special
+ * case, plus here we have info on the default behaviour).
+ */
+struct ip_fw *ip_fw_default_rule;
+
+/*
+ * This procedure is only used to handle keepalives.
It is invoked + * every dyn_keepalive_period + */ +static void +ipfw_tick(void * __unused unused) +{ + int i; + int s; + ipfw_dyn_rule *q; + struct timeval timenow; + + + if (dyn_keepalive == 0 || ipfw_dyn_v == NULL || dyn_count == 0) + goto done; + + getmicrotime(&timenow); + + lck_mtx_lock(ipfw_mutex); + for (i = 0 ; i < curr_dyn_buckets ; i++) { + for (q = ipfw_dyn_v[i] ; q ; q = q->next ) { + if (q->dyn_type == O_LIMIT_PARENT) + continue; + if (q->id.proto != IPPROTO_TCP) + continue; + if ( (q->state & BOTH_SYN) != BOTH_SYN) + continue; + if (TIME_LEQ( timenow.tv_sec+dyn_keepalive_interval, + q->expire)) + continue; /* too early */ + if (TIME_LEQ(q->expire, timenow.tv_sec)) + continue; /* too late, rule expired */ + + send_pkt(&(q->id), q->ack_rev - 1, q->ack_fwd, TH_SYN); + send_pkt(&(q->id), q->ack_fwd - 1, q->ack_rev, 0); + } + } + lck_mtx_unlock(ipfw_mutex); +done: + timeout(ipfw_tick, NULL, dyn_keepalive_period*hz); +} + +void +ipfw_init(void) +{ + struct ip_fw default_rule; + + /* setup locks */ + ipfw_mutex_grp_attr = lck_grp_attr_alloc_init(); + ipfw_mutex_grp = lck_grp_alloc_init("ipfw", ipfw_mutex_grp_attr); + ipfw_mutex_attr = lck_attr_alloc_init(); + lck_attr_setdefault(ipfw_mutex_attr); + + if ((ipfw_mutex = lck_mtx_alloc_init(ipfw_mutex_grp, ipfw_mutex_attr)) == NULL) { + printf("ipfw_init: can't alloc ipfw_mutex\n"); + return; + } + + layer3_chain = NULL; + + bzero(&default_rule, sizeof default_rule); + + default_rule.act_ofs = 0; + default_rule.rulenum = IPFW_DEFAULT_RULE; + default_rule.cmd_len = 1; + default_rule.set = RESVD_SET; + + default_rule.cmd[0].len = 1; + default_rule.cmd[0].opcode = +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT + 1 ? O_ACCEPT : +#endif + O_DENY; + + if (add_rule(&layer3_chain, &default_rule)) { + printf("ipfw2: add_rule failed adding default rule\n"); + printf("ipfw2 failed initialization!!\n"); + fw_enable = 0; + } + else { + ip_fw_default_rule = layer3_chain; +#if 0 + /* Radar 3920649, don't print unncessary messages to the log */ + printf("ipfw2 initialized, divert %s, " + "rule-based forwarding enabled, default to %s, logging ", + #ifdef IPDIVERT + "enabled", + #else + "disabled", + #endif + default_rule.cmd[0].opcode == O_ACCEPT ? "accept" : "deny"); +#endif + + #ifdef IPFIREWALL_VERBOSE + fw_verbose = 1; + #endif + #ifdef IPFIREWALL_VERBOSE_LIMIT + verbose_limit = IPFIREWALL_VERBOSE_LIMIT; + #endif + if (fw_verbose == 0) + printf("disabled\n"); + else if (verbose_limit == 0) + printf("unlimited\n"); + else + printf("limited to %d packets/entry by default\n", + verbose_limit); + } + + ip_fw_chk_ptr = ipfw_chk; + ip_fw_ctl_ptr = ipfw_ctl; + + ipfwstringlen = strlen( ipfwstring ); + + timeout(ipfw_tick, NULL, hz); +} + +#endif /* IPFW2 */ diff --git a/bsd/netinet/ip_fw2.h b/bsd/netinet/ip_fw2.h new file mode 100644 index 000000000..43dcf98ce --- /dev/null +++ b/bsd/netinet/ip_fw2.h @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/netinet/ip_fw2.h,v 1.1.2.4 2003/07/17 06:03:39 luigi Exp $ + */ + +#ifndef _IPFW2_H +#define _IPFW2_H + +/* + * The kernel representation of ipfw rules is made of a list of + * 'instructions' (for all practical purposes equivalent to BPF + * instructions), which specify which fields of the packet + * (or its metadata) should be analysed. + * + * Each instruction is stored in a structure which begins with + * "ipfw_insn", and can contain extra fields depending on the + * instruction type (listed below). + * Note that the code is written so that individual instructions + * have a size which is a multiple of 32 bits. This means that, if + * such structures contain pointers or other 64-bit entities, + * (there is just one instance now) they may end up unaligned on + * 64-bit architectures, so the must be handled with care. + * + * "enum ipfw_opcodes" are the opcodes supported. We can have up + * to 256 different opcodes. + */ + +enum ipfw_opcodes { /* arguments (4 byte each) */ + O_NOP, + + O_IP_SRC, /* u32 = IP */ + O_IP_SRC_MASK, /* ip = IP/mask */ + O_IP_SRC_ME, /* none */ + O_IP_SRC_SET, /* u32=base, arg1=len, bitmap */ + + O_IP_DST, /* u32 = IP */ + O_IP_DST_MASK, /* ip = IP/mask */ + O_IP_DST_ME, /* none */ + O_IP_DST_SET, /* u32=base, arg1=len, bitmap */ + + O_IP_SRCPORT, /* (n)port list:mask 4 byte ea */ + O_IP_DSTPORT, /* (n)port list:mask 4 byte ea */ + O_PROTO, /* arg1=protocol */ + + O_MACADDR2, /* 2 mac addr:mask */ + O_MAC_TYPE, /* same as srcport */ + + O_LAYER2, /* none */ + O_IN, /* none */ + O_FRAG, /* none */ + + O_RECV, /* none */ + O_XMIT, /* none */ + O_VIA, /* none */ + + O_IPOPT, /* arg1 = 2*u8 bitmap */ + O_IPLEN, /* arg1 = len */ + O_IPID, /* arg1 = id */ + + O_IPTOS, /* arg1 = id */ + O_IPPRECEDENCE, /* arg1 = precedence << 5 */ + O_IPTTL, /* arg1 = TTL */ + + O_IPVER, /* arg1 = version */ + O_UID, /* u32 = id */ + O_GID, /* u32 = id */ + O_ESTAB, /* none (tcp established) */ + O_TCPFLAGS, /* arg1 = 2*u8 bitmap */ + O_TCPWIN, /* arg1 = desired win */ + O_TCPSEQ, /* u32 = desired seq. */ + O_TCPACK, /* u32 = desired seq. */ + O_ICMPTYPE, /* u32 = icmp bitmap */ + O_TCPOPTS, /* arg1 = 2*u8 bitmap */ + + O_VERREVPATH, /* none */ + + O_PROBE_STATE, /* none */ + O_KEEP_STATE, /* none */ + O_LIMIT, /* ipfw_insn_limit */ + O_LIMIT_PARENT, /* dyn_type, not an opcode. */ + + /* + * These are really 'actions'. 
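+	 * Most of these terminate the evaluation of the packet against
+	 * the ruleset; see the long comment in ipfw_chk() for the
+	 * exceptions (e.g. O_LOG, O_COUNT and O_SKIPTO).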
+	 */
+
+	O_LOG,			/* ipfw_insn_log */
+	O_PROB,			/* u32 = match probability */
+
+	O_CHECK_STATE,		/* none */
+	O_ACCEPT,		/* none */
+	O_DENY,			/* none */
+	O_REJECT,		/* arg1=icmp arg (same as deny) */
+	O_COUNT,		/* none */
+	O_SKIPTO,		/* arg1=next rule number */
+	O_PIPE,			/* arg1=pipe number */
+	O_QUEUE,		/* arg1=queue number */
+	O_DIVERT,		/* arg1=port number */
+	O_TEE,			/* arg1=port number */
+	O_FORWARD_IP,		/* fwd sockaddr */
+	O_FORWARD_MAC,		/* fwd mac */
+
+	/*
+	 * More opcodes.
+	 */
+	O_IPSEC,		/* has ipsec history */
+
+	O_LAST_OPCODE		/* not an opcode! */
+};
+
+/*
+ * Template for instructions.
+ *
+ * ipfw_insn is used for all instructions which require no operands,
+ * a single 16-bit value (arg1), or a couple of 8-bit values.
+ *
+ * For other instructions which require different/larger arguments
+ * we have derived structures, ipfw_insn_*.
+ *
+ * The size of the instruction (in 32-bit words) is in the low
+ * 6 bits of "len". The 2 remaining bits are used to implement
+ * NOT and OR on individual instructions. Given a type, you can
+ * compute the length to be put in "len" using F_INSN_SIZE(t).
+ *
+ * F_NOT negates the match result of the instruction.
+ *
+ * F_OR is used to build OR blocks. By default, instructions
+ * are evaluated as part of a logical AND. An "or" block
+ * { X or Y or Z } contains F_OR set in all but the last
+ * instruction of the block. A match will cause the code
+ * to skip past the last instruction of the block.
+ *
+ * NOTA BENE: in a couple of places we assume that
+ *	sizeof(ipfw_insn) == sizeof(u_int32_t)
+ * this needs to be fixed.
+ *
+ */
+typedef struct _ipfw_insn {	/* template for instructions */
+	enum ipfw_opcodes	opcode:8;
+	u_int8_t	len;	/* number of 32-bit words */
+#define	F_NOT		0x80
+#define	F_OR		0x40
+#define	F_LEN_MASK	0x3f
+#define	F_LEN(cmd)	((cmd)->len & F_LEN_MASK)
+
+	u_int16_t	arg1;
+} ipfw_insn;
+
+/*
+ * F_INSN_SIZE(type) computes the size, in 4-byte words, of
+ * a given type.
+ */
+#define	F_INSN_SIZE(t)	((sizeof (t))/sizeof(u_int32_t))
+
+/*
+ * This is used to store an array of 16-bit entries (ports etc.)
+ */
+typedef struct _ipfw_insn_u16 {
+	ipfw_insn o;
+	u_int16_t ports[2];	/* there may be more */
+} ipfw_insn_u16;
+
+/*
+ * This is used to store an array of 32-bit entries
+ * (uid, single IPv4 addresses etc.)
+ */
+typedef struct _ipfw_insn_u32 {
+	ipfw_insn o;
+	u_int32_t d[1];	/* one or more */
+} ipfw_insn_u32;
+
+/*
+ * This is used to store IP addr-mask pairs.
+ */
+typedef struct _ipfw_insn_ip {
+	ipfw_insn o;
+	struct in_addr addr;
+	struct in_addr mask;
+} ipfw_insn_ip;
+
+/*
+ * This is used to forward to a given address (ip).
+ */
+typedef struct _ipfw_insn_sa {
+	ipfw_insn o;
+	struct sockaddr_in sa;
+} ipfw_insn_sa;
+
+/*
+ * This is used for MAC addr-mask pairs.
+ */
+typedef struct _ipfw_insn_mac {
+	ipfw_insn o;
+	u_char addr[12];	/* dst[6] + src[6] */
+	u_char mask[12];	/* dst[6] + src[6] */
+} ipfw_insn_mac;
+
+/*
+ * This is used for interface match rules (recv xx, xmit xx).
+ */
+typedef struct _ipfw_insn_if {
+	ipfw_insn o;
+	union {
+		struct in_addr ip;
+		int32_t unit;
+	} p;
+	char name[IFNAMSIZ];
+} ipfw_insn_if;
+
+/*
+ * This is used for pipe and queue actions, which need to store
+ * a single pointer (which can have different sizes on different
+ * architectures).
+ * Note that, because of previous instructions, pipe_ptr might
+ * be unaligned in the overall structure, so it needs to be
+ * manipulated with care.
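+ * (flush_pipe_ptrs() in ip_fw2.c clears this field with bcmp/bzero
+ * for exactly this reason.)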
+ */ +typedef struct _ipfw_insn_pipe { + ipfw_insn o; + void *pipe_ptr; /* XXX */ +} ipfw_insn_pipe; + +/* + * This is used for limit rules. + */ +typedef struct _ipfw_insn_limit { + ipfw_insn o; + u_int8_t _pad; + u_int8_t limit_mask; /* combination of DYN_* below */ +#define DYN_SRC_ADDR 0x1 +#define DYN_SRC_PORT 0x2 +#define DYN_DST_ADDR 0x4 +#define DYN_DST_PORT 0x8 + + u_int16_t conn_limit; +} ipfw_insn_limit; + +/* + * This is used for log instructions. + */ +typedef struct _ipfw_insn_log { + ipfw_insn o; + u_int32_t max_log; /* how many do we log -- 0 = all */ + u_int32_t log_left; /* how many left to log */ +} ipfw_insn_log; + +/* Version of this API */ +#define IP_FW_VERSION_NONE 0 +#define IP_FW_VERSION_0 10 /* old ipfw */ +#define IP_FW_VERSION_1 20 /* ipfw in Jaguar/Panther */ +#define IP_FW_VERSION_2 30 /* ipfw2 */ +#define IP_FW_CURRENT_API_VERSION IP_FW_VERSION_2 + +/* + * Here we have the structure representing an ipfw rule. + * + * It starts with a general area (with link fields and counters) + * followed by an array of one or more instructions, which the code + * accesses as an array of 32-bit values. + * + * Given a rule pointer r: + * + * r->cmd is the start of the first instruction. + * ACTION_PTR(r) is the start of the first action (things to do + * once a rule matched). + * + * When assembling instruction, remember the following: + * + * + if a rule has a "keep-state" (or "limit") option, then the + * first instruction (at r->cmd) MUST BE an O_PROBE_STATE + * + if a rule has a "log" option, then the first action + * (at ACTION_PTR(r)) MUST be O_LOG + * + * NOTE: we use a simple linked list of rules because we never need + * to delete a rule without scanning the list. We do not use + * queue(3) macros for portability and readability. + */ + +struct ip_fw { + u_int32_t version; /* Version of this structure. MUST be set */ + /* by clients. Should always be */ + /* set to IP_FW_CURRENT_API_VERSION. */ + void *context; /* Context that is usable by user processes to */ + /* identify this rule. */ + struct ip_fw *next; /* linked list of rules */ + struct ip_fw *next_rule; /* ptr to next [skipto] rule */ + /* 'next_rule' is used to pass up 'set_disable' status */ + + u_int16_t act_ofs; /* offset of action in 32-bit units */ + u_int16_t cmd_len; /* # of 32-bit words in cmd */ + u_int16_t rulenum; /* rule number */ + u_int8_t set; /* rule set (0..31) */ + u_int32_t set_masks[2]; /* masks for manipulating sets atomically */ +#define RESVD_SET 31 /* set for default and persistent rules */ + u_int8_t _pad; /* padding */ + + /* These fields are present in all rules. */ + u_int64_t pcnt; /* Packet counter */ + u_int64_t bcnt; /* Byte counter */ + u_int32_t timestamp; /* tv_sec of last match */ + + u_int32_t reserved_1; /* reserved - set to 0 */ + u_int32_t reserved_2; /* reserved - set to 0 */ + + ipfw_insn cmd[1]; /* storage for commands */ +}; + +#define ACTION_PTR(rule) \ + (ipfw_insn *)( (u_int32_t *)((rule)->cmd) + ((rule)->act_ofs) ) + +#define RULESIZE(rule) (sizeof(struct ip_fw) + \ + ((struct ip_fw *)(rule))->cmd_len * 4 - 4) + +/* + * This structure is used as a flow mask and a flow id for various + * parts of the code. + */ +struct ipfw_flow_id { + u_int32_t dst_ip; + u_int32_t src_ip; + u_int16_t dst_port; + u_int16_t src_port; + u_int8_t proto; + u_int8_t flags; /* protocol-specific flags */ +}; + +/* + * Dynamic ipfw rule. + */ +typedef struct _ipfw_dyn_rule ipfw_dyn_rule; + +struct _ipfw_dyn_rule { + ipfw_dyn_rule *next; /* linked list of rules. 
*/ + struct ip_fw *rule; /* pointer to rule */ + /* 'rule' is used to pass up the rule number (from the parent) */ + + ipfw_dyn_rule *parent; /* pointer to parent rule */ + u_int64_t pcnt; /* packet match counter */ + u_int64_t bcnt; /* byte match counter */ + struct ipfw_flow_id id; /* (masked) flow id */ + u_int32_t expire; /* expire time */ + u_int32_t bucket; /* which bucket in hash table */ + u_int32_t state; /* state of this rule (typically a + * combination of TCP flags) + */ + u_int32_t ack_fwd; /* most recent ACKs in forward */ + u_int32_t ack_rev; /* and reverse directions (used */ + /* to generate keepalives) */ + u_int16_t dyn_type; /* rule type */ + u_int16_t count; /* refcount */ +}; + +/* + * Definitions for IP option names. + */ +#define IP_FW_IPOPT_LSRR 0x01 +#define IP_FW_IPOPT_SSRR 0x02 +#define IP_FW_IPOPT_RR 0x04 +#define IP_FW_IPOPT_TS 0x08 + +/* + * Definitions for TCP option names. + */ +#define IP_FW_TCPOPT_MSS 0x01 +#define IP_FW_TCPOPT_WINDOW 0x02 +#define IP_FW_TCPOPT_SACK 0x04 +#define IP_FW_TCPOPT_TS 0x08 +#define IP_FW_TCPOPT_CC 0x10 + +#define ICMP_REJECT_RST 0x100 /* fake ICMP code (send a TCP RST) */ + +/* + * Main firewall chains definitions and global var's definitions. + */ +#ifdef KERNEL + +#define IP_FW_PORT_DYNT_FLAG 0x10000 +#define IP_FW_PORT_TEE_FLAG 0x20000 +#define IP_FW_PORT_DENY_FLAG 0x40000 + +/* + * Arguments for calling ipfw_chk() and dummynet_io(). We put them + * all into a structure because this way it is easier and more + * efficient to pass variables around and extend the interface. + */ +struct ip_fw_args { + struct mbuf *m; /* the mbuf chain */ + struct ifnet *oif; /* output interface */ + struct sockaddr_in *next_hop; /* forward address */ + struct ip_fw *rule; /* matching rule */ + struct ether_header *eh; /* for bridged packets */ + + struct route *ro; /* for dummynet */ + struct sockaddr_in *dst; /* for dummynet */ + int flags; /* for dummynet */ + + struct ipfw_flow_id f_id; /* grabbed from IP header */ + u_int16_t divert_rule; /* divert cookie */ + u_int32_t retval; +}; + +/* + * Function definitions. + */ + +/* Firewall hooks */ +struct sockopt; +struct dn_flow_set; + +void flush_pipe_ptrs(struct dn_flow_set *match); /* used by dummynet */ +void ipfw_init(void); /* called from raw_ip.c: load_ipfw() */ + +typedef int ip_fw_chk_t (struct ip_fw_args *args); +typedef int ip_fw_ctl_t (struct sockopt *); +extern ip_fw_chk_t *ip_fw_chk_ptr; +extern ip_fw_ctl_t *ip_fw_ctl_ptr; +extern int fw_one_pass; +extern int fw_enable; +#define IPFW_LOADED (ip_fw_chk_ptr != NULL) +#endif /* KERNEL */ + +#endif /* _IPFW2_H */ diff --git a/bsd/netinet/ip_fw2_compat.c b/bsd/netinet/ip_fw2_compat.c new file mode 100644 index 000000000..1e0ee62a6 --- /dev/null +++ b/bsd/netinet/ip_fw2_compat.c @@ -0,0 +1,2253 @@ +/* IPFW2 Backward Compatibility */ + +/* Convert to and from IPFW2 structures. */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/socket.h> +#include <sys/socketvar.h> + +#include <sys/types.h> + +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> +#include <netinet/ip_fw.h> +#include <netinet/tcp.h> + +#include "ip_fw2_compat.h" + +#define FW2_DEBUG_VERBOSE 0 + +/* + * _s_x is a structure that stores a string <-> token pairs, used in + * various places in the parser. Entries are stored in arrays, + * with an entry with s=NULL as terminator. + * The search routines are match_token() and match_value(). 
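+ * For example, the { "syn", TH_SYN } entry in f_tcpflags below maps
+ * the token "syn" to the TCP flag TH_SYN.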
+ * Often, an element with x=0 contains an error string. + * + */ +struct _s_x { + char const *s; + int x; +}; + +#define NO_VERSION_STR "IP_FW_VERSION_NONE" +#define VERSION_ZERO_STR "IP_FW_VERSION_0" +#define VERSION_ONE_STR "IP_FW_VERSION_1" +#define CURRENT_API_VERSION_STR "IP_FW_CURRENT_API_VERSION" + +static struct _s_x f_tcpflags[] = { + { "syn", TH_SYN }, + { "fin", TH_FIN }, + { "ack", TH_ACK }, + { "psh", TH_PUSH }, + { "rst", TH_RST }, + { "urg", TH_URG }, + { "tcp flag", 0 }, + { NULL, 0 } +}; + +static struct _s_x f_tcpopts[] = { + { "mss", IP_FW_TCPOPT_MSS }, + { "maxseg", IP_FW_TCPOPT_MSS }, + { "window", IP_FW_TCPOPT_WINDOW }, + { "sack", IP_FW_TCPOPT_SACK }, + { "ts", IP_FW_TCPOPT_TS }, + { "timestamp", IP_FW_TCPOPT_TS }, + { "cc", IP_FW_TCPOPT_CC }, + { "tcp option", 0 }, + { NULL, 0 } +}; + +/* + * IP options span the range 0 to 255 so we need to remap them + * (though in fact only the low 5 bits are significant). + */ +static struct _s_x f_ipopts[] = { + { "ssrr", IP_FW_IPOPT_SSRR}, + { "lsrr", IP_FW_IPOPT_LSRR}, + { "rr", IP_FW_IPOPT_RR}, + { "ts", IP_FW_IPOPT_TS}, + { "ip option", 0 }, + { NULL, 0 } +}; + +static struct _s_x f_iptos[] = { + { "lowdelay", IPTOS_LOWDELAY}, + { "throughput", IPTOS_THROUGHPUT}, + { "reliability", IPTOS_RELIABILITY}, + { "mincost", IPTOS_MINCOST}, + { "congestion", IPTOS_CE}, + { "ecntransport", IPTOS_ECT}, + { "ip tos option", 0}, + { NULL, 0 } +}; + +static struct _s_x limit_masks[] = { + {"all", DYN_SRC_ADDR|DYN_SRC_PORT|DYN_DST_ADDR|DYN_DST_PORT}, + {"src-addr", DYN_SRC_ADDR}, + {"src-port", DYN_SRC_PORT}, + {"dst-addr", DYN_DST_ADDR}, + {"dst-port", DYN_DST_PORT}, + {NULL, 0} +}; + +static void +ipfw_print_fw_flags(u_int flags) +{ + /* print action */ + switch (flags & IP_FW_F_COMMAND_COMPAT) { + case IP_FW_F_ACCEPT_COMPAT: + printf("IP_FW_F_ACCEPT_COMPAT\n"); + break; + case IP_FW_F_COUNT_COMPAT: + printf("IP_FW_F_COUNT_COMPAT\n"); + break; + case IP_FW_F_PIPE_COMPAT: + printf("IP_FW_F_PIPE_COMPAT\n"); + break; + case IP_FW_F_QUEUE_COMPAT: + printf("IP_FW_F_QUEUE_COMPAT\n"); + break; + case IP_FW_F_SKIPTO_COMPAT: + printf("IP_FW_F_SKIPTO_COMPAT\n"); + break; + case IP_FW_F_DIVERT_COMPAT: + printf("IP_FW_F_DIVERT_COMPAT\n"); + break; + case IP_FW_F_TEE_COMPAT: + printf("IP_FW_F_TEE_COMPAT\n"); + break; + case IP_FW_F_FWD_COMPAT: + printf("IP_FW_F_FWD_COMPAT\n"); + break; + case IP_FW_F_DENY_COMPAT: + printf("IP_FW_F_DENY_COMPAT\n"); + break; + case IP_FW_F_REJECT_COMPAT: + printf("IP_FW_F_REJECT_COMPAT\n"); + break; + case IP_FW_F_CHECK_S_COMPAT: + printf("IP_FW_F_CHECK_S_COMPAT\n"); + break; + default: + printf("No action given\n"); + break; + } + + /* print commands */ + if (flags & IP_FW_F_IN_COMPAT) { + printf("IP_FW_F_IN_COMPAT\n"); + } + if (flags & IP_FW_F_OUT_COMPAT) { + printf("IP_FW_F_OUT_COMPAT\n"); + } + if (flags & IP_FW_F_IIFACE_COMPAT) { + printf("IP_FW_F_IIFACE_COMPAT\n"); + } + if (flags & IP_FW_F_OIFACE_COMPAT) { + printf("IP_FW_F_OIFACE_COMPAT\n"); + } + if (flags & IP_FW_F_PRN_COMPAT) { + printf("IP_FW_F_PRN_COMPAT\n"); + } + if (flags & IP_FW_F_SRNG_COMPAT) { + printf("IP_FW_F_SRNG_COMPAT\n"); + } + if (flags & IP_FW_F_DRNG_COMPAT) { + printf("IP_FW_F_DRNG_COMPAT\n"); + } + if (flags & IP_FW_F_FRAG_COMPAT) { + printf("IP_FW_F_FRAG_COMPAT\n"); + } + if (flags & IP_FW_F_IIFNAME_COMPAT) { + printf("IP_FW_F_IIFNAME_COMPAT\n"); + } + if (flags & IP_FW_F_OIFNAME_COMPAT) { + printf("IP_FW_F_OIFNAME_COMPAT\n"); + } + if (flags & IP_FW_F_INVSRC_COMPAT) { + printf("IP_FW_F_INVSRC_COMPAT\n"); + } + if (flags & 
IP_FW_F_INVDST_COMPAT) { + printf("IP_FW_F_INVDST_COMPAT\n"); + } + if (flags & IP_FW_F_ICMPBIT_COMPAT) { + printf("IP_FW_F_ICMPBIT_COMPAT\n"); + } + if (flags & IP_FW_F_UID_COMPAT) { + printf("IP_FW_F_UID_COMPAT\n"); + } + if (flags & IP_FW_F_RND_MATCH_COMPAT) { + printf("IP_FW_F_RND_MATCH_COMPAT\n"); + } + if (flags & IP_FW_F_SMSK_COMPAT) { + printf("IP_FW_F_SMSK_COMPAT\n"); + } + if (flags & IP_FW_F_DMSK_COMPAT) { + printf("IP_FW_F_DMSK_COMPAT\n"); + } + if (flags & IP_FW_BRIDGED_COMPAT) { + printf("IP_FW_BRIDGED_COMPAT\n"); + } + if (flags & IP_FW_F_KEEP_S_COMPAT) { + printf("IP_FW_F_KEEP_S_COMPAT\n"); + } + if (flags & IP_FW_F_CHECK_S_COMPAT) { + printf("IP_FW_F_CHECK_S_COMPAT\n"); + } + if (flags & IP_FW_F_SME_COMPAT) { + printf("IP_FW_F_SME_COMPAT\n"); + } + if (flags & IP_FW_F_DME_COMPAT) { + printf("IP_FW_F_DME_COMPAT\n"); + } +} + +static void +print_fw_version(u_int32_t api_version) +{ + switch (api_version) { + case IP_FW_VERSION_0: + printf("Version: %s\n", VERSION_ZERO_STR); + break; + case IP_FW_VERSION_1: + printf("Version: %s\n", VERSION_ONE_STR); + break; + case IP_FW_CURRENT_API_VERSION: + printf("Version: %s\n", CURRENT_API_VERSION_STR); + break; + case IP_FW_VERSION_NONE: + printf("Version: %s\n", NO_VERSION_STR); + break; + default: + printf("Unrecognized version\n"); + break; + } +} + +static void +ipfw_print_vers1_struct(struct ip_fw_compat *vers1_rule) +{ + char ipv4str[MAX_IPv4_STR_LEN]; + print_fw_version(vers1_rule->version); + printf("Rule #%d\n", vers1_rule->fw_number); + + ipfw_print_fw_flags(vers1_rule->fw_flg); + + printf("fw_pcnt: %d\n", vers1_rule->fw_pcnt); + printf("fw_bcnt: %d\n", vers1_rule->fw_bcnt); + printf("fw_src: %s\n", + inet_ntop(AF_INET, &vers1_rule->fw_src, ipv4str, sizeof(ipv4str))); + printf("fw_dst: %s\n", + inet_ntop(AF_INET, &vers1_rule->fw_dst, ipv4str, sizeof(ipv4str))); + printf("fw_smsk: %s\n", + inet_ntop(AF_INET, &vers1_rule->fw_smsk, ipv4str, sizeof(ipv4str))); + printf("fw_dmsk: %s\n", + inet_ntop(AF_INET, &vers1_rule->fw_dmsk, ipv4str, sizeof(ipv4str))); + + if (vers1_rule->fw_flg & IP_FW_F_ICMPBIT_COMPAT) { + int type_index; + int first = 1; + + printf(" icmptype"); + + for (type_index = 0; + type_index < (IP_FW_ICMPTYPES_DIM_COMPAT * sizeof(unsigned) * 8); + ++type_index) { + if (vers1_rule->fw_uar_compat.fw_icmptypes[type_index / (sizeof(unsigned) * 8)] & + (1U << (type_index % (sizeof(unsigned) * 8)))) { + printf("%c%d", first == 1 ? 
' ' : ',', type_index); + first = 0; + } + } + } else { + int i, nsp, ndp; + + nsp = IP_FW_GETNSRCP_COMPAT(vers1_rule); + for (i = 0; i < nsp; i++) { + printf("source ports: fw_uar_compat.fw_pts: %04x", vers1_rule->fw_uar_compat.fw_pts[i]); + if (i == 0 && (vers1_rule->fw_flg & IP_FW_F_SRNG_COMPAT)) + printf("-"); + else if (i == 0 && (vers1_rule->fw_flg & IP_FW_F_SMSK_COMPAT)) + printf(":"); + else + printf(","); + } + + printf("\n"); + + ndp = IP_FW_GETNDSTP_COMPAT(vers1_rule); + for (i = 0; i < ndp; i++) { + printf("source ports: fw_uar_compat.fw_pts: %04x", vers1_rule->fw_uar_compat.fw_pts[nsp+i]); + if (i == 0 && (vers1_rule->fw_flg & IP_FW_F_DRNG_COMPAT)) + printf("-"); + else if (i == 0 && (vers1_rule->fw_flg & IP_FW_F_DMSK_COMPAT)) + printf(":"); + else + printf(","); + } + + printf("\n"); + } + + printf("fw_ipflg: %d\n", vers1_rule->fw_ipflg); + printf("fw_ipopt: %d\n", vers1_rule->fw_ipopt); + printf("fw_ipnopt: %d\n", vers1_rule->fw_ipnopt); + printf("fw_tcpopt: %d\n", vers1_rule->fw_tcpopt); + printf("fw_tcpnopt: %d\n", vers1_rule->fw_tcpnopt); + printf("fw_tcpf: %d\n", vers1_rule->fw_tcpf); + printf("fw_tcpnf: %d\n", vers1_rule->fw_tcpnf); + printf("timestamp: %d\n", vers1_rule->timestamp); + + if ((vers1_rule->fw_flg & IF_FW_F_VIAHACK_COMPAT) == IF_FW_F_VIAHACK_COMPAT) { + printf("fw_in_if: "); + inet_ntop(AF_INET, &vers1_rule->fw_in_if.fu_via_ip, ipv4str, + sizeof(ipv4str)); + printf("fu_via_ip: %s\n", ipv4str); + printf("fu_via_if_compat.name: %s\n", vers1_rule->fw_in_if.fu_via_if_compat.name); + printf("fu_via_if_compat.unit: %d\n", vers1_rule->fw_in_if.fu_via_if_compat.unit); + } else { + if (vers1_rule->fw_flg & IP_FW_F_IIFACE_COMPAT) { + printf("fw_in_if: "); + printf("fu_via_ip: %s\n", + inet_ntop(AF_INET, &vers1_rule->fw_in_if.fu_via_ip, ipv4str, + sizeof(ipv4str))); + printf("fu_via_if_compat.name: %s\n", vers1_rule->fw_in_if.fu_via_if_compat.name); + printf("fu_via_if_compat.unit: %d\n", vers1_rule->fw_in_if.fu_via_if_compat.unit); + } + if (vers1_rule->fw_flg & IP_FW_F_OIFACE_COMPAT) { + printf("fw_out_if: "); + printf("fu_via_ip: %s\n", + inet_ntop(AF_INET, &vers1_rule->fw_out_if.fu_via_ip, + ipv4str, sizeof(ipv4str))); + printf("fu_via_if_compat.name: %s\n", vers1_rule->fw_out_if.fu_via_if_compat.name); + printf("fu_via_if_compat.unit: %d\n", vers1_rule->fw_out_if.fu_via_if_compat.unit); + } + } + + printf("fw_prot: %d\n", vers1_rule->fw_prot); + printf("fw_nports: %d\n", vers1_rule->fw_nports); + printf("pipe_ptr: %x\n", vers1_rule->pipe_ptr); + printf("next_rule_ptr: %x\n", vers1_rule->next_rule_ptr); + printf("fw_uid: %d\n", vers1_rule->fw_uid); + printf("fw_logamount: %d\n", vers1_rule->fw_logamount); + printf("fw_loghighest: %d\n", vers1_rule->fw_loghighest); +} + +static void +print_icmptypes(ipfw_insn_u32 *cmd) +{ + int i; + char sep= ' '; + + printf(" icmptypes"); + for (i = 0; i < 32; i++) { + if ( (cmd->d[0] & (1 << (i))) == 0) + continue; + printf("%c%d", sep, i); + sep = ','; + } +} + +/* + * print flags set/clear in the two bitmasks passed as parameters. + * There is a specialized check for f_tcpflags. 
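+ * The set mask is packed in the low byte of cmd->arg1 and the
+ * clear mask in the byte above it, so a command built as
+ *
+ *	cmd->arg1 = TH_SYN | (TH_ACK << 8);
+ *
+ * is recognized by the f_tcpflags check and printed as " setup".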
+ */ +static void +print_flags(char const *name, ipfw_insn *cmd, struct _s_x *list) +{ + char const *comma = ""; + int i; + uint8_t set = cmd->arg1 & 0xff; + uint8_t clear = (cmd->arg1 >> 8) & 0xff; + + if (list == f_tcpflags && set == TH_SYN && clear == TH_ACK) { + printf(" setup"); + return; + } + + printf(" %s ", name); + for (i=0; list[i].x != 0; i++) { + if (set & list[i].x) { + set &= ~list[i].x; + printf("%s%s", comma, list[i].s); + comma = ","; + } + if (clear & list[i].x) { + clear &= ~list[i].x; + printf("%s!%s", comma, list[i].s); + comma = ","; + } + } +} + +static int +contigmask(uint8_t *p, int len) +{ + int i, n; + + for (i=0; i<len ; i++) + if ( (p[i/8] & (1 << (7 - (i%8)))) == 0) /* first bit unset */ + break; + for (n=i+1; n < len; n++) + if ( (p[n/8] & (1 << (7 - (n%8)))) != 0) + return -1; /* mask not contiguous */ + return i; +} + +/* + * Print the ip address contained in a command. + */ +static void +print_ip(ipfw_insn_ip *cmd) +{ + int len = F_LEN((ipfw_insn *)cmd); + uint32_t *a = ((ipfw_insn_u32 *)cmd)->d; + char ipv4str[MAX_IPv4_STR_LEN]; + + printf("%s ", cmd->o.len & F_NOT ? " not": ""); + + if (cmd->o.opcode == O_IP_SRC_ME || cmd->o.opcode == O_IP_DST_ME) { + printf("me"); + return; + } + + /* + * len == 2 indicates a single IP, whereas lists of 1 or more + * addr/mask pairs have len = (2n+1). We convert len to n so we + * use that to count the number of entries. + */ + for (len = len / 2; len > 0; len--, a += 2) { + int mb = /* mask length */ + (cmd->o.opcode == O_IP_SRC || cmd->o.opcode == O_IP_DST) ? + 32 : contigmask((uint8_t *)&(a[1]), 32); + if (mb == 0) { /* any */ + printf("any"); + } else { /* numeric IP followed by some kind of mask */ + printf("%s", inet_ntop(AF_INET, &a[0], ipv4str, sizeof(ipv4str))); + if (mb < 0) + printf(":%s", inet_ntop(AF_INET, &a[1], ipv4str, sizeof(ipv4str))); + else if (mb < 32) + printf("/%d", mb); + } + if (len > 1) + printf(","); + } +} + +/* + * prints a MAC address/mask pair + */ +static void +print_mac(uint8_t *addr, uint8_t *mask) +{ + int l = contigmask(mask, 48); + + if (l == 0) + printf(" any"); + else { + printf(" %02x:%02x:%02x:%02x:%02x:%02x", + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); + if (l == -1) + printf("&%02x:%02x:%02x:%02x:%02x:%02x", + mask[0], mask[1], mask[2], + mask[3], mask[4], mask[5]); + else if (l < 48) + printf("/%d", l); + } +} + +static void +ipfw_print_vers2_struct(struct ip_fw *vers2_rule) +{ + int l; + ipfw_insn *cmd; + ipfw_insn_log *logptr = NULL; + char ipv4str[MAX_IPv4_STR_LEN]; + + print_fw_version(vers2_rule->version); + + printf("act_ofs: %d\n", vers2_rule->act_ofs); + printf("cmd_len: %d\n", vers2_rule->cmd_len); + printf("rulenum: %d\n", vers2_rule->rulenum); + printf("set: %d\n", vers2_rule->set); + printf("pcnt: %d\n", vers2_rule->pcnt); + printf("bcnt: %d\n", vers2_rule->bcnt); + printf("timestamp: %d\n", vers2_rule->timestamp); + + /* + * first print actions + */ + for (l = vers2_rule->cmd_len - vers2_rule->act_ofs, cmd = ACTION_PTR(vers2_rule); + l > 0 ; l -= F_LEN(cmd), cmd += F_LEN(cmd)) { + switch(cmd->opcode) { + case O_CHECK_STATE: + printf("check-state"); + break; + + case O_ACCEPT: + printf("allow"); + break; + + case O_COUNT: + printf("count"); + break; + + case O_DENY: + printf("deny"); + break; + + case O_REJECT: + if (cmd->arg1 == ICMP_REJECT_RST) + printf("reset"); + else if (cmd->arg1 == ICMP_UNREACH_HOST) + printf("reject"); + else + printf("unreach %u", cmd->arg1); + break; + + case O_SKIPTO: + printf("skipto %u", cmd->arg1); + break; + + case 
O_PIPE: + printf("pipe %u", cmd->arg1); + break; + + case O_QUEUE: + printf("queue %u", cmd->arg1); + break; + + case O_DIVERT: + printf("divert %u", cmd->arg1); + break; + + case O_TEE: + printf("tee %u", cmd->arg1); + break; + + case O_FORWARD_IP: + { + ipfw_insn_sa *s = (ipfw_insn_sa *)cmd; + + printf("fwd %s", + inet_ntop(AF_INET, &s->sa.sin_addr, ipv4str, + sizeof(ipv4str))); + if (s->sa.sin_port) + printf(",%d", s->sa.sin_port); + break; + } + + case O_LOG: /* O_LOG is printed last */ + logptr = (ipfw_insn_log *)cmd; + break; + + default: + printf("** unrecognized action %d len %d", + cmd->opcode, cmd->len); + } + } + if (logptr) { + if (logptr->max_log > 0) + printf(" log logamount %d", logptr->max_log); + else + printf(" log"); + } + + /* + * then print the body. + */ + for (l = vers2_rule->act_ofs, cmd = vers2_rule->cmd ; + l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) { + /* useful alias */ + ipfw_insn_u32 *cmd32 = (ipfw_insn_u32 *)cmd; + + switch(cmd->opcode) { + case O_PROB: + break; /* done already */ + + case O_PROBE_STATE: + break; /* no need to print anything here */ + + case O_MACADDR2: + { + ipfw_insn_mac *m = (ipfw_insn_mac *)cmd; + + if (cmd->len & F_NOT) + printf(" not"); + printf(" MAC"); + print_mac(m->addr, m->mask); + print_mac(m->addr + 6, m->mask + 6); + printf("\n"); + break; + } + case O_MAC_TYPE: + { + uint16_t *p = ((ipfw_insn_u16 *)cmd)->ports; + int i; + + for (i = F_LEN((ipfw_insn *)cmd) - 1; i > 0; i--, p += 2) { + printf("0x%04x", p[0]); + if (p[0] != p[1]) { + printf("-"); + printf("0x%04x", p[1]); + } + printf(","); + } + break; + } + case O_IP_SRC: + case O_IP_SRC_MASK: + case O_IP_SRC_ME: + print_ip((ipfw_insn_ip *)cmd); + break; + + case O_IP_DST: + case O_IP_DST_MASK: + case O_IP_DST_ME: + print_ip((ipfw_insn_ip *)cmd); + break; + + case O_IP_DSTPORT: + case O_IP_SRCPORT: + { + uint16_t *p = ((ipfw_insn_u16 *)cmd)->ports; + int i; + + for (i = F_LEN((ipfw_insn *)cmd) - 1; i > 0; i--, p += 2) { + printf("0x%04x", p[0]); + if (p[0] != p[1]) { + printf("-"); + printf("0x%04x", p[1]); + } + printf(","); + } + break; + } + case O_PROTO: + { + printf("O_PROTO"); + + if (cmd->len & F_NOT) + printf(" not"); + + printf(" %u", cmd->arg1); + + break; + } + + default: /*options ... */ + { + if (cmd->len & F_NOT && cmd->opcode != O_IN) + printf(" not"); + switch(cmd->opcode) { + case O_FRAG: + printf("O_FRAG"); + break; + + case O_IN: + printf(cmd->len & F_NOT ? 
" out" : " O_IN"); + break; + + case O_LAYER2: + printf(" O_LAYER2"); + break; + case O_XMIT: + case O_RECV: + case O_VIA: + { + char const *s; + ipfw_insn_if *cmdif = (ipfw_insn_if *)cmd; + + if (cmd->opcode == O_XMIT) + s = "O_XMIT"; + else if (cmd->opcode == O_RECV) + s = "O_RECV"; + else /* if (cmd->opcode == O_VIA) */ + s = "O_VIA"; + if (cmdif->name[0] == '\0') { + printf(" %s %s", s, + inet_ntop(AF_INET, &cmdif->p.ip, ipv4str, + sizeof(ipv4str))); + } + else if (cmdif->p.unit == -1) + printf(" %s %s*", s, cmdif->name); + else + printf(" %s %s%d", s, cmdif->name, + cmdif->p.unit); + } + break; + + case O_IPID: + if (F_LEN(cmd) == 1) + printf(" ipid %u", cmd->arg1 ); + else { + uint16_t *p = ((ipfw_insn_u16 *)cmd)->ports; + int i; + + for (i = F_LEN((ipfw_insn *)cmd) - 1; i > 0; i--, p += 2) { + printf("0x%04x", p[0]); + if (p[0] != p[1]) { + printf("-"); + printf("0x%04x", p[1]); + } + printf(","); + } + } + + break; + + case O_IPTTL: + if (F_LEN(cmd) == 1) + printf(" ipttl %u", cmd->arg1 ); + else { + uint16_t *p = ((ipfw_insn_u16 *)cmd)->ports; + int i; + + for (i = F_LEN((ipfw_insn *)cmd) - 1; i > 0; i--, p += 2) { + printf("0x%04x", p[0]); + if (p[0] != p[1]) { + printf("-"); + printf("0x%04x", p[1]); + } + printf(","); + } + } + + break; + + case O_IPVER: + printf(" ipver %u", cmd->arg1 ); + break; + + case O_IPPRECEDENCE: + printf(" ipprecedence %u", (cmd->arg1) >> 5 ); + break; + + case O_IPLEN: + if (F_LEN(cmd) == 1) + printf(" iplen %u", cmd->arg1 ); + else { + uint16_t *p = ((ipfw_insn_u16 *)cmd)->ports; + int i; + + for (i = F_LEN((ipfw_insn *)cmd) - 1; i > 0; i--, p += 2) { + printf("0x%04x", p[0]); + if (p[0] != p[1]) { + printf("-"); + printf("0x%04x", p[1]); + } + printf(","); + } + } + + break; + + case O_IPOPT: + print_flags("ipoptions", cmd, f_ipopts); + break; + + case O_IPTOS: + print_flags("iptos", cmd, f_iptos); + break; + + case O_ICMPTYPE: + print_icmptypes((ipfw_insn_u32 *)cmd); + break; + + case O_ESTAB: + printf(" established"); + break; + + case O_TCPFLAGS: + print_flags("tcpflags", cmd, f_tcpflags); + break; + + case O_TCPOPTS: + print_flags("tcpoptions", cmd, f_tcpopts); + break; + + case O_TCPWIN: + printf(" tcpwin %d", ntohs(cmd->arg1)); + break; + + case O_TCPACK: + printf(" tcpack %ld", ntohl(cmd32->d[0])); + break; + + case O_TCPSEQ: + printf(" tcpseq %ld", ntohl(cmd32->d[0])); + break; + + case O_UID: + printf(" uid %u", cmd32->d[0]); + break; + + case O_GID: + printf(" gid %u", cmd32->d[0]); + break; + + case O_VERREVPATH: + printf(" verrevpath"); + break; + + case O_IPSEC: + printf(" ipsec"); + break; + + case O_NOP: + break; + + case O_KEEP_STATE: + printf(" keep-state"); + break; + + case O_LIMIT: + { + struct _s_x *p = limit_masks; + ipfw_insn_limit *c = (ipfw_insn_limit *)cmd; + uint8_t x = c->limit_mask; + char const *comma = " "; + + printf(" limit"); + for (; p->x != 0 ; p++) + if ((x & p->x) == p->x) { + x &= ~p->x; + printf("%s%s", comma, p->s); + comma = ","; + } + printf(" %d", c->conn_limit); + + break; + } + + default: + printf(" [opcode %d len %d]", + cmd->opcode, cmd->len); + } /* switch */ + } /* default */ + } /* switch */ + } /* for */ +} + +/* + * helper function, updates the pointer to cmd with the length + * of the current command, and also cleans up the first word of + * the new command in case it has been clobbered before. + * from ipfw2.c + */ +static ipfw_insn * +next_cmd(ipfw_insn *cmd) +{ + cmd += F_LEN(cmd); + bzero(cmd, sizeof(*cmd)); + return cmd; +} + +/* + * A function to fill simple commands of size 1. 
+ * Existing flags are preserved. + * from ipfw2.c + */ +static void +fill_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, uint16_t arg) +{ + cmd->opcode = opcode; + cmd->len = ((cmd->len) & (F_NOT | F_OR)) | 1; + cmd->arg1 = arg; +} + + +static u_int32_t +fill_compat_tcpflags(u_int32_t flags) { + u_int32_t flags_compat = 0; + + if (flags & TH_FIN) + flags_compat |= IP_FW_TCPF_FIN_COMPAT; + if (flags & TH_SYN) + flags_compat |= IP_FW_TCPF_SYN_COMPAT; + if (flags & TH_RST) + flags_compat |= IP_FW_TCPF_RST_COMPAT; + if (flags & TH_PUSH) + flags_compat |= IP_FW_TCPF_PSH_COMPAT; + if (flags & TH_ACK) + flags_compat |= IP_FW_TCPF_ACK_COMPAT; + if (flags & TH_URG) + flags_compat |= IP_FW_TCPF_URG_COMPAT; + + return flags_compat; +} + + +/* ******************************************** + * *********** Convert from Latest ************ + * ********************************************/ + +/* + * Things we're actively ignoring: + * sets, sets of addresses, blocks (NOT, OR) + */ +static void +ipfw_map_from_cmds(struct ip_fw *curr_rule, struct ip_fw_compat *compat_rule) +{ + int l; + ipfw_insn *cmd; + + for (l = curr_rule->act_ofs, cmd = curr_rule->cmd ; + l > 0 ; + l -= F_LEN(cmd) , cmd += F_LEN(cmd)) { + /* useful alias */ + ipfw_insn_u32 *cmd32 = (ipfw_insn_u32 *)cmd; + + switch (cmd->opcode) { + case O_PROTO: + /* protocol */ + compat_rule->fw_prot = cmd->arg1; + break; + + case O_IP_SRC_ME: + compat_rule->fw_flg |= IP_FW_F_SME_COMPAT; + if (cmd->len & F_NOT) { + compat_rule->fw_flg |= IP_FW_F_INVSRC_COMPAT; + } + break; + + case O_IP_SRC_MASK: + { + /* addr/mask */ + ipfw_insn_ip *ip = (ipfw_insn_ip *)cmd; + + compat_rule->fw_src = ip->addr; + compat_rule->fw_smsk = ip->mask; + if (cmd->len & F_NOT) { + compat_rule->fw_flg |= IP_FW_F_INVSRC_COMPAT; + } + break; + } + + case O_IP_SRC: + /* one IP */ + /* source - + * for now we only deal with one address + * per rule and ignore sets of addresses + */ + compat_rule->fw_src.s_addr = cmd32->d[0]; + if (cmd->len & F_NOT) { + compat_rule->fw_flg |= IP_FW_F_INVSRC_COMPAT; + } + break; + + case O_IP_SRCPORT: + { + /* source ports */ + ipfw_insn_u16 *ports = (ipfw_insn_u16 *)cmd; + uint16_t *p = ports->ports; + int i, j; + + /* copy list of ports */ + for (i = F_LEN(cmd) - 1, j = 0; i > 0; i--, j++, p += 2) { + if (p[0] != p[1]) { + /* this is a range */ + compat_rule->fw_flg |= IP_FW_F_SRNG_COMPAT; + compat_rule->fw_uar_compat.fw_pts[j++] = p[0]; + compat_rule->fw_uar_compat.fw_pts[j] = p[1]; + } else { + compat_rule->fw_uar_compat.fw_pts[j] = p[0]; + } + } + IP_FW_SETNSRCP_COMPAT(compat_rule, j); + + break; + } + + case O_IP_DST_ME: + /* destination */ + compat_rule->fw_flg |= IP_FW_F_DME_COMPAT; + if (cmd->len & F_NOT) { + compat_rule->fw_flg |= IP_FW_F_INVDST_COMPAT; + } + break; + + case O_IP_DST_MASK: + { + /* addr/mask */ + ipfw_insn_ip *ip = (ipfw_insn_ip *)cmd; + + compat_rule->fw_dst = ip->addr; + compat_rule->fw_dmsk = ip->mask; + if (cmd->len & F_NOT) { + compat_rule->fw_flg |= IP_FW_F_INVDST_COMPAT; + } + break; + } + case O_IP_DST: + /* one IP */ + /* dest - + * for now we only deal with one address + * per rule, and ignore sets of addresses + */ + compat_rule->fw_dst.s_addr = cmd32->d[0]; + if (cmd->len & F_NOT) { + compat_rule->fw_flg |= IP_FW_F_INVDST_COMPAT; + } + break; + + case O_IP_DSTPORT: + { + /* dest. 
ports */ + ipfw_insn_u16 *ports = (ipfw_insn_u16 *)cmd; + uint16_t *p = ports->ports; + int i, + j = IP_FW_GETNSRCP_COMPAT(compat_rule); + + /* copy list of ports */ + for (i = F_LEN(cmd) - 1; i > 0; i--, j++, p += 2) { + if (p[0] != p[1]) { + /* this is a range */ + compat_rule->fw_flg |= IP_FW_F_DRNG_COMPAT; + compat_rule->fw_uar_compat.fw_pts[j++] = p[0]; + compat_rule->fw_uar_compat.fw_pts[j] = p[1]; + } else { + compat_rule->fw_uar_compat.fw_pts[j] = p[0]; + } + } + IP_FW_SETNDSTP_COMPAT(compat_rule, (j - IP_FW_GETNSRCP_COMPAT(compat_rule))); + + break; + } + + case O_LOG: + { + ipfw_insn_log *c = (ipfw_insn_log *)cmd; + + compat_rule->fw_flg |= IP_FW_F_PRN_COMPAT; + compat_rule->fw_logamount = c->max_log; + break; + } + case O_UID: + compat_rule->fw_flg |= IP_FW_F_UID_COMPAT; + compat_rule->fw_uid = cmd32->d[0]; + break; + + case O_IN: + if (cmd->len & F_NOT) { + compat_rule->fw_flg |= IP_FW_F_OUT_COMPAT; + } else { + compat_rule->fw_flg |= IP_FW_F_IN_COMPAT; + } + break; + + case O_KEEP_STATE: + compat_rule->fw_flg |= IP_FW_F_KEEP_S_COMPAT; + break; + + case O_LAYER2: + compat_rule->fw_flg |= IP_FW_BRIDGED_COMPAT; + break; + + case O_XMIT: + { + ipfw_insn_if *ifcmd = (ipfw_insn_if *)cmd; + union ip_fw_if_compat ifu; + + if ((ifcmd->o.len == 0) && (ifcmd->name[0] == '\0')) { + /* any */ + compat_rule->fw_flg |= IP_FW_F_OIFACE_COMPAT; + ifu.fu_via_ip.s_addr = 0; + } + else if (ifcmd->p.ip.s_addr != 0) { + compat_rule->fw_flg |= IP_FW_F_OIFACE_COMPAT; + ifu.fu_via_ip = ifcmd->p.ip; + } else { + compat_rule->fw_flg |= IP_FW_F_OIFNAME_COMPAT; + strncpy(ifu.fu_via_if_compat.name, ifcmd->name, sizeof(ifu.fu_via_if_compat.name)); + ifu.fu_via_if_compat.unit = ifcmd->p.unit; + } + compat_rule->fw_out_if = ifu; + + break; + } + + case O_RECV: + { + ipfw_insn_if *ifcmd = (ipfw_insn_if *)cmd; + union ip_fw_if_compat ifu; + + if ((ifcmd->o.len == 0) && (ifcmd->name[0] == '\0')) { + /* any */ + compat_rule->fw_flg |= IP_FW_F_IIFACE_COMPAT; + ifu.fu_via_ip.s_addr = 0; + } + else if (ifcmd->p.ip.s_addr != 0) { + compat_rule->fw_flg |= IP_FW_F_IIFACE_COMPAT; + ifu.fu_via_ip = ifcmd->p.ip; + } else { + compat_rule->fw_flg |= IP_FW_F_IIFNAME_COMPAT; + strncpy(ifu.fu_via_if_compat.name, ifcmd->name, sizeof(ifu.fu_via_if_compat.name)); + ifu.fu_via_if_compat.unit = ifcmd->p.unit; + } + compat_rule->fw_in_if = ifu; + + break; + } + + case O_VIA: + { + ipfw_insn_if *ifcmd = (ipfw_insn_if *)cmd; + union ip_fw_if_compat ifu; + + if ((ifcmd->o.len == 0) && (ifcmd->name[0] == '\0')) { + /* any */ + ifu.fu_via_ip.s_addr = 0; + } + else if (ifcmd->name[0] != '\0') { + compat_rule->fw_flg |= IP_FW_F_IIFNAME_COMPAT; + strncpy(ifu.fu_via_if_compat.name, ifcmd->name, sizeof(ifu.fu_via_if_compat.name)); + ifu.fu_via_if_compat.unit = ifcmd->p.unit; + } else { + ifu.fu_via_ip = ifcmd->p.ip; + } + compat_rule->fw_flg |= IF_FW_F_VIAHACK_COMPAT; + compat_rule->fw_out_if = compat_rule->fw_in_if = ifu; + + break; + } + + case O_FRAG: + compat_rule->fw_flg |= IP_FW_F_FRAG_COMPAT; + break; + + case O_IPOPT: + /* IP options */ + compat_rule->fw_ipopt = (cmd->arg1 & 0xff); + compat_rule->fw_ipnopt = ((cmd->arg1 >> 8) & 0xff); + break; + + case O_TCPFLAGS: + /* check for "setup" */ + if ((cmd->arg1 & 0xff) == TH_SYN && + ((cmd->arg1 >> 8) & 0xff) == TH_ACK) { + compat_rule->fw_tcpf = IP_FW_TCPF_SYN_COMPAT; + compat_rule->fw_tcpnf = IP_FW_TCPF_ACK_COMPAT; + } + else { + compat_rule->fw_tcpf = fill_compat_tcpflags(cmd->arg1 & 0xff); + compat_rule->fw_tcpnf = fill_compat_tcpflags((cmd->arg1 >> 8) & 0xff); + } + break; + + case 
O_TCPOPTS: + /* TCP options */ + compat_rule->fw_tcpopt = (cmd->arg1 & 0xff); + compat_rule->fw_tcpnopt = ((cmd->arg1 >> 8) & 0xff); + break; + + case O_ESTAB: + compat_rule->fw_ipflg |= IP_FW_IF_TCPEST_COMPAT; + break; + + case O_ICMPTYPE: + { + /* ICMP */ + /* XXX: check this */ + int i, type; + + compat_rule->fw_flg |= IP_FW_F_ICMPBIT_COMPAT; + for (i = 0; i < sizeof(uint32_t) ; i++) { + type = cmd32->d[0] & i; + + compat_rule->fw_uar_compat.fw_icmptypes[type / (sizeof(unsigned) * 8)] |= + 1 << (type % (sizeof(unsigned) * 8)); + } + break; + } + default: + break; + } /* switch */ + } /* for */ +} + +static void +ipfw_map_from_actions(struct ip_fw *curr_rule, struct ip_fw_compat *compat_rule) +{ + int l; + ipfw_insn *cmd; + + for (l = curr_rule->cmd_len - curr_rule->act_ofs, cmd = ACTION_PTR(curr_rule); + l > 0 ; + l -= F_LEN(cmd), cmd += F_LEN(cmd)) { + switch (cmd->opcode) { + case O_ACCEPT: + compat_rule->fw_flg |= IP_FW_F_ACCEPT_COMPAT; + break; + case O_COUNT: + compat_rule->fw_flg |= IP_FW_F_COUNT_COMPAT; + break; + case O_PIPE: + compat_rule->fw_flg |= IP_FW_F_PIPE_COMPAT; + compat_rule->fw_divert_port_compat = cmd->arg1; + break; + case O_QUEUE: + compat_rule->fw_flg |= IP_FW_F_QUEUE_COMPAT; + compat_rule->fw_divert_port_compat = cmd->arg1; + break; + case O_SKIPTO: + compat_rule->fw_flg |= IP_FW_F_SKIPTO_COMPAT; + compat_rule->fw_skipto_rule_compat = cmd->arg1; + break; + case O_DIVERT: + compat_rule->fw_flg |= IP_FW_F_DIVERT_COMPAT; + compat_rule->fw_divert_port_compat = cmd->arg1; + break; + case O_TEE: + compat_rule->fw_flg |= IP_FW_F_TEE_COMPAT; + compat_rule->fw_divert_port_compat = cmd->arg1; + break; + case O_FORWARD_IP: + { + ipfw_insn_sa *p = (ipfw_insn_sa *)cmd; + + compat_rule->fw_flg |= IP_FW_F_FWD_COMPAT; + compat_rule->fw_fwd_ip_compat.sin_len = p->sa.sin_len; + compat_rule->fw_fwd_ip_compat.sin_family = p->sa.sin_family; + compat_rule->fw_fwd_ip_compat.sin_port = p->sa.sin_port; + compat_rule->fw_fwd_ip_compat.sin_addr = p->sa.sin_addr; + + break; + } + case O_DENY: + compat_rule->fw_flg |= IP_FW_F_DENY_COMPAT; + break; + case O_REJECT: + compat_rule->fw_flg |= IP_FW_F_REJECT_COMPAT; + compat_rule->fw_reject_code_compat = cmd->arg1; + break; + case O_CHECK_STATE: + compat_rule->fw_flg |= IP_FW_F_CHECK_S_COMPAT; + break; + default: + break; + } + } +} + +static void +ipfw_version_latest_to_one(struct ip_fw *curr_rule, struct ip_fw_compat *rule_vers1) +{ + if (!rule_vers1) + return; + + bzero(rule_vers1, sizeof(struct ip_fw_compat)); + + rule_vers1->version = IP_FW_VERSION_1; + rule_vers1->context = curr_rule->context; + rule_vers1->fw_number = curr_rule->rulenum; + rule_vers1->fw_pcnt = curr_rule->pcnt; + rule_vers1->fw_bcnt = curr_rule->bcnt; + rule_vers1->timestamp = curr_rule->timestamp; + + /* convert actions */ + ipfw_map_from_actions(curr_rule, rule_vers1); + + /* convert commands */ + ipfw_map_from_cmds(curr_rule, rule_vers1); + +#if FW2_DEBUG_VERBOSE + ipfw_print_vers1_struct(rule_vers1); +#endif +} + +/* first convert to version one then to version zero */ +static void +ipfw_version_latest_to_zero(struct ip_fw *curr_rule, struct ip_old_fw *rule_vers0) +{ + struct ip_fw_compat rule_vers1; + + ipfw_version_latest_to_one(curr_rule, &rule_vers1); + + bzero(rule_vers0, sizeof(struct ip_old_fw)); + bcopy(&rule_vers1.fw_uar_compat, &rule_vers0->fw_uar, sizeof(rule_vers1.fw_uar_compat)); + bcopy(&rule_vers1.fw_in_if, &rule_vers0->fw_in_if, sizeof(rule_vers1.fw_in_if)); + bcopy(&rule_vers1.fw_out_if, &rule_vers0->fw_out_if, sizeof(rule_vers1.fw_out_if)); + 
bcopy(&rule_vers1.fw_un_compat, &rule_vers0->fw_un, sizeof(rule_vers1.fw_un_compat)); + + rule_vers0->fw_pcnt = rule_vers1.fw_pcnt; + rule_vers0->fw_bcnt = rule_vers1.fw_bcnt; + rule_vers0->fw_src = rule_vers1.fw_src; + rule_vers0->fw_dst = rule_vers1.fw_dst; + rule_vers0->fw_smsk = rule_vers1.fw_smsk; + rule_vers0->fw_dmsk = rule_vers1.fw_dmsk; + rule_vers0->fw_number = rule_vers1.fw_number; + rule_vers0->fw_flg = rule_vers1.fw_flg; + rule_vers0->fw_ipopt = rule_vers1.fw_ipopt; + rule_vers0->fw_ipnopt = rule_vers1.fw_ipnopt; + rule_vers0->fw_tcpf = rule_vers1.fw_tcpf; + rule_vers0->fw_tcpnf = rule_vers1.fw_tcpnf; + rule_vers0->timestamp = rule_vers1.timestamp; + rule_vers0->fw_prot = rule_vers1.fw_prot; + rule_vers0->fw_nports = rule_vers1.fw_nports; + rule_vers0->pipe_ptr = rule_vers1.pipe_ptr; + rule_vers0->next_rule_ptr = rule_vers1.next_rule_ptr; + + if (rule_vers1.fw_ipflg && IP_FW_IF_TCPEST_COMPAT) rule_vers0->fw_tcpf |= IP_OLD_FW_TCPF_ESTAB; +} + +void +ipfw_convert_from_latest(struct ip_fw *curr_rule, void *old_rule, u_int32_t api_version) +{ + switch (api_version) { + case IP_FW_VERSION_0: + { + struct ip_old_fw *rule_vers0 = old_rule; + + ipfw_version_latest_to_zero(curr_rule, rule_vers0); + break; + } + case IP_FW_VERSION_1: + { + struct ip_fw_compat *rule_vers1 = old_rule; + + ipfw_version_latest_to_one(curr_rule, rule_vers1); + break; + } + case IP_FW_CURRENT_API_VERSION: + /* ipfw2 for now, don't need to do anything */ + break; + + default: + /* unknown version */ + break; + } +} + + +/* ******************************************** + * *********** Convert to Latest ************** + * ********************************************/ + +/* from ip_fw.c */ +static int +ipfw_check_vers1_struct(struct ip_fw_compat *frwl) +{ + /* Check for invalid flag bits */ + if ((frwl->fw_flg & ~IP_FW_F_MASK_COMPAT) != 0) { + /* + printf(("%s undefined flag bits set (flags=%x)\n", + err_prefix, frwl->fw_flg)); + */ + return (EINVAL); + } + if (frwl->fw_flg == IP_FW_F_CHECK_S_COMPAT) { + /* check-state */ + return 0 ; + } + /* Must apply to incoming or outgoing (or both) */ + if (!(frwl->fw_flg & (IP_FW_F_IN_COMPAT | IP_FW_F_OUT_COMPAT))) { + /* + printf(("%s neither in nor out\n", err_prefix)); + */ + return (EINVAL); + } + /* Empty interface name is no good */ + if (((frwl->fw_flg & IP_FW_F_IIFNAME_COMPAT) + && !*frwl->fw_in_if.fu_via_if_compat.name) + || ((frwl->fw_flg & IP_FW_F_OIFNAME_COMPAT) + && !*frwl->fw_out_if.fu_via_if_compat.name)) { + /* + printf(("%s empty interface name\n", err_prefix)); + */ + return (EINVAL); + } + /* Sanity check interface matching */ + if ((frwl->fw_flg & IF_FW_F_VIAHACK_COMPAT) == IF_FW_F_VIAHACK_COMPAT) { + ; /* allow "via" backwards compatibility */ + } else if ((frwl->fw_flg & IP_FW_F_IN_COMPAT) + && (frwl->fw_flg & IP_FW_F_OIFACE_COMPAT)) { + /* + printf(("%s outgoing interface check on incoming\n", + err_prefix)); + */ + return (EINVAL); + } + /* Sanity check port ranges */ + if ((frwl->fw_flg & IP_FW_F_SRNG_COMPAT) && IP_FW_GETNSRCP_COMPAT(frwl) < 2) { + /* + printf(("%s src range set but n_src_p=%d\n", + err_prefix, IP_FW_GETNSRCP_COMPAT(frwl))); + */ + return (EINVAL); + } + if ((frwl->fw_flg & IP_FW_F_DRNG_COMPAT) && IP_FW_GETNDSTP_COMPAT(frwl) < 2) { + /* + printf(("%s dst range set but n_dst_p=%d\n", + err_prefix, IP_FW_GETNDSTP_COMPAT(frwl))); + */ + return (EINVAL); + } + if (IP_FW_GETNSRCP_COMPAT(frwl) + IP_FW_GETNDSTP_COMPAT(frwl) > IP_FW_MAX_PORTS_COMPAT) { + /* + printf(("%s too many ports (%d+%d)\n", + err_prefix, IP_FW_GETNSRCP_COMPAT(frwl), 
IP_FW_GETNDSTP_COMPAT(frwl))); + */ + return (EINVAL); + } + /* + * Protocols other than TCP/UDP don't use port range + */ + if ((frwl->fw_prot != IPPROTO_TCP) && + (frwl->fw_prot != IPPROTO_UDP) && + (IP_FW_GETNSRCP_COMPAT(frwl) || IP_FW_GETNDSTP_COMPAT(frwl))) { + /* + printf(("%s port(s) specified for non TCP/UDP rule\n", + err_prefix)); + */ + return (EINVAL); + } + + /* + * Rather than modify the entry to make such entries work, + * we reject this rule and require user level utilities + * to enforce whatever policy they deem appropriate. + */ + if ((frwl->fw_src.s_addr & (~frwl->fw_smsk.s_addr)) || + (frwl->fw_dst.s_addr & (~frwl->fw_dmsk.s_addr))) { + /* + printf(("%s rule never matches\n", err_prefix)); + */ + return (EINVAL); + } + + if ((frwl->fw_flg & IP_FW_F_FRAG_COMPAT) && + (frwl->fw_prot == IPPROTO_UDP || frwl->fw_prot == IPPROTO_TCP)) { + if (frwl->fw_nports) { + /* + printf(("%s cannot mix 'frag' and ports\n", err_prefix)); + */ + return (EINVAL); + } + if (frwl->fw_prot == IPPROTO_TCP && + frwl->fw_tcpf != frwl->fw_tcpnf) { + /* + printf(("%s cannot mix 'frag' and TCP flags\n", err_prefix)); + */ + return (EINVAL); + } + } + + /* Check command specific stuff */ + switch (frwl->fw_flg & IP_FW_F_COMMAND_COMPAT) + { + case IP_FW_F_REJECT_COMPAT: + if (frwl->fw_reject_code_compat >= 0x100 + && !(frwl->fw_prot == IPPROTO_TCP + && frwl->fw_reject_code_compat == IP_FW_REJECT_RST_COMPAT)) { + /* + printf(("%s unknown reject code\n", err_prefix)); + */ + return (EINVAL); + } + break; + case IP_FW_F_DIVERT_COMPAT: /* Diverting to port zero is invalid */ + case IP_FW_F_TEE_COMPAT: + case IP_FW_F_PIPE_COMPAT: /* piping through 0 is invalid */ + case IP_FW_F_QUEUE_COMPAT: /* piping through 0 is invalid */ + if (frwl->fw_divert_port_compat == 0) { + /* + printf(("%s can't divert to port 0\n", err_prefix)); + */ + return (EINVAL); + } + break; + case IP_FW_F_DENY_COMPAT: + case IP_FW_F_ACCEPT_COMPAT: + case IP_FW_F_COUNT_COMPAT: + case IP_FW_F_SKIPTO_COMPAT: + case IP_FW_F_FWD_COMPAT: + case IP_FW_F_UID_COMPAT: + break; + default: + /* + printf(("%s invalid command\n", err_prefix)); + */ + return (EINVAL); + } + + return 0; +} + +static void +ipfw_convert_to_cmds(struct ip_fw *curr_rule, struct ip_fw_compat *compat_rule) +{ + int k; + uint32_t actbuf[255], cmdbuf[255]; + ipfw_insn *action, *cmd, *src, *dst; + ipfw_insn *have_state = NULL, /* track check-state or keep-state */ + *end_action = NULL, + *end_cmd = NULL; + + if (!compat_rule || !curr_rule || !(curr_rule->cmd)) { + return; + } + + /* preemptively check the old ip_fw rule to + * make sure it's valid before starting to copy stuff + */ + if (ipfw_check_vers1_struct(compat_rule)) { + /* bad rule */ + return; + } + + bzero(actbuf, sizeof(actbuf)); /* actions go here */ + bzero(cmdbuf, sizeof(cmdbuf)); + + /* fill in action */ + action = (ipfw_insn *)actbuf; + { + u_int flag = compat_rule->fw_flg; + + action->len = 1; /* default */ + + if (flag & IP_FW_F_CHECK_S_COMPAT) { + have_state = action; + action->opcode = O_CHECK_STATE; + } + else { + switch (flag & IP_FW_F_COMMAND_COMPAT) { + case IP_FW_F_ACCEPT_COMPAT: + action->opcode = O_ACCEPT; + break; + case IP_FW_F_COUNT_COMPAT: + action->opcode = O_COUNT; + break; + case IP_FW_F_PIPE_COMPAT: + action->opcode = O_PIPE; + action->len = F_INSN_SIZE(ipfw_insn_pipe); + action->arg1 = compat_rule->fw_divert_port_compat; + break; + case IP_FW_F_QUEUE_COMPAT: + action->opcode = O_QUEUE; + action->len = F_INSN_SIZE(ipfw_insn_pipe); + action->arg1 = compat_rule->fw_divert_port_compat; + 
break; + case IP_FW_F_SKIPTO_COMPAT: + action->opcode = O_SKIPTO; + action->arg1 = compat_rule->fw_skipto_rule_compat; + break; + case IP_FW_F_DIVERT_COMPAT: + action->opcode = O_DIVERT; + action->arg1 = compat_rule->fw_divert_port_compat; + break; + case IP_FW_F_TEE_COMPAT: + action->opcode = O_TEE; + action->arg1 = compat_rule->fw_divert_port_compat; + break; + case IP_FW_F_FWD_COMPAT: + { + ipfw_insn_sa *p = (ipfw_insn_sa *)action; + + action->opcode = O_FORWARD_IP; + action->len = F_INSN_SIZE(ipfw_insn_sa); + + p->sa.sin_len = compat_rule->fw_fwd_ip_compat.sin_len; + p->sa.sin_family = compat_rule->fw_fwd_ip_compat.sin_family; + p->sa.sin_port = compat_rule->fw_fwd_ip_compat.sin_port; + p->sa.sin_addr = compat_rule->fw_fwd_ip_compat.sin_addr; + + break; + } + case IP_FW_F_DENY_COMPAT: + action->opcode = O_DENY; + action->arg1 = 0; + break; + case IP_FW_F_REJECT_COMPAT: + action->opcode = O_REJECT; + action->arg1 = compat_rule->fw_reject_code_compat; + break; + default: + action->opcode = O_NOP; + break; + } + } + + /* action is mandatory */ + if (action->opcode == O_NOP) { + return; + } + + action = next_cmd(action); + } /* end actions */ + + cmd = (ipfw_insn *)cmdbuf; + + /* this is O_CHECK_STATE, we're done */ + if (have_state) { + goto done; + } + + { + ipfw_insn *prev = NULL; + u_int flag = compat_rule->fw_flg; + + /* logging */ + if (flag & IP_FW_F_PRN_COMPAT) { + ipfw_insn_log *c = (ipfw_insn_log *)cmd; + + cmd->opcode = O_LOG; + cmd->len |= F_INSN_SIZE(ipfw_insn_log); + c->max_log = compat_rule->fw_logamount; + + prev = cmd; + cmd = next_cmd(cmd); + } + + /* protocol */ + if (compat_rule->fw_prot != 0) { + fill_cmd(cmd, O_PROTO, compat_rule->fw_prot); + prev = cmd; + cmd = next_cmd(cmd); + } + + /* source */ + if (flag & IP_FW_F_SME_COMPAT) { + cmd->opcode = O_IP_SRC_ME; + cmd->len |= F_INSN_SIZE(ipfw_insn); + if (flag & IP_FW_F_INVSRC_COMPAT) { + cmd->len ^= F_NOT; /* toggle F_NOT */ + } + + prev = cmd; + cmd = next_cmd(cmd); + } else { + if (compat_rule->fw_smsk.s_addr != 0) { + /* addr/mask */ + ipfw_insn_ip *ip = (ipfw_insn_ip *)cmd; + + ip->addr = compat_rule->fw_src; + ip->mask = compat_rule->fw_smsk; + cmd->opcode = O_IP_SRC_MASK; + cmd->len |= F_INSN_SIZE(ipfw_insn_ip); /* double check this */ + } else { + /* one IP */ + ipfw_insn_u32 *cmd32 = (ipfw_insn_u32 *)cmd; /* alias for cmd */ + + if (compat_rule->fw_src.s_addr == 0) { + /* any */ + cmd32->o.len &= ~F_LEN_MASK; /* zero len */ + } else { + cmd32->d[0] = compat_rule->fw_src.s_addr; + cmd32->o.opcode = O_IP_SRC; + cmd32->o.len |= F_INSN_SIZE(ipfw_insn_u32); + } + } + + if (flag & IP_FW_F_INVSRC_COMPAT) { + cmd->len ^= F_NOT; /* toggle F_NOT */ + } + + if (F_LEN(cmd) != 0) { /* !any */ + prev = cmd; + cmd = next_cmd(cmd); + } + } + + /* source ports */ + { + ipfw_insn_u16 *ports = (ipfw_insn_u16 *)cmd; + uint16_t *p = ports->ports; + int i, j = 0, + nports = IP_FW_GETNSRCP_COMPAT(compat_rule), + have_range = 0; + + cmd->opcode = O_IP_SRCPORT; + for (i = 0; i < nports; i++) { + if (((flag & IP_FW_F_SRNG_COMPAT) || + (flag & IP_FW_F_SMSK_COMPAT)) && !have_range) { + p[0] = compat_rule->fw_uar_compat.fw_pts[i++]; + p[1] = compat_rule->fw_uar_compat.fw_pts[i]; + have_range = 1; + } else { + p[0] = p[1] = compat_rule->fw_uar_compat.fw_pts[i]; + } + p += 2; + j++; + } + + if (j > 0) { + ports->o.len |= j+1; /* leave F_NOT and F_OR untouched */ + } + + prev = cmd; + cmd = next_cmd(cmd); + } + + /* destination */ + if (flag & IP_FW_F_DME_COMPAT) { + cmd->opcode = O_IP_DST_ME; + cmd->len |= F_INSN_SIZE(ipfw_insn); + if (flag 
& IP_FW_F_INVDST_COMPAT) { + cmd->len ^= F_NOT; /* toggle F_NOT */ + } + + prev = cmd; + cmd = next_cmd(cmd); + } else { + if (compat_rule->fw_dmsk.s_addr != 0) { + /* addr/mask */ + ipfw_insn_ip *ip = (ipfw_insn_ip *)cmd; + + ip->addr = compat_rule->fw_dst; + ip->mask = compat_rule->fw_dmsk; + cmd->opcode = O_IP_DST_MASK; + cmd->len |= F_INSN_SIZE(ipfw_insn_ip); /* double check this */ + } else { + /* one IP */ + ipfw_insn_u32 *cmd32 = (ipfw_insn_u32 *)cmd; /* alias for cmd */ + + if (compat_rule->fw_dst.s_addr == 0) { + /* any */ + cmd32->o.len &= ~F_LEN_MASK; /* zero len */ + } else { + cmd32->d[0] = compat_rule->fw_dst.s_addr; + cmd32->o.opcode = O_IP_DST; + cmd32->o.len |= F_INSN_SIZE(ipfw_insn_u32); + } + } + + if (flag & IP_FW_F_INVDST_COMPAT) { + cmd->len ^= F_NOT; /* toggle F_NOT */ + } + + if (F_LEN(cmd) != 0) { /* !any */ + prev = cmd; + cmd = next_cmd(cmd); + } + } + + /* dest. ports */ + { + ipfw_insn_u16 *ports = (ipfw_insn_u16 *)cmd; + uint16_t *p = ports->ports; + int i = IP_FW_GETNSRCP_COMPAT(compat_rule), + j = 0, + nports = (IP_FW_GETNDSTP_COMPAT(compat_rule) + i), + have_range = 0; + + cmd->opcode = O_IP_DSTPORT; + for (; i < nports; i++, p += 2) { + if (((flag & IP_FW_F_DRNG_COMPAT) || + (flag & IP_FW_F_DMSK_COMPAT)) && !have_range) { + /* range */ + p[0] = compat_rule->fw_uar_compat.fw_pts[i++]; + p[1] = compat_rule->fw_uar_compat.fw_pts[i]; + have_range = 1; + } else { + p[0] = p[1] = compat_rule->fw_uar_compat.fw_pts[i]; + } + j++; + } + + if (j > 0) { + ports->o.len |= j+1; /* leave F_NOT and F_OR untouched */ + } + + prev = cmd; + cmd = next_cmd(cmd); + } + + if (flag & IP_FW_F_UID_COMPAT) { + ipfw_insn_u32 *cmd32 = (ipfw_insn_u32 *)cmd; /* alias for cmd */ + + cmd32->o.opcode = O_UID; + cmd32->o.len |= F_INSN_SIZE(ipfw_insn_u32); + cmd32->d[0] = compat_rule->fw_uid; + + prev = cmd; + cmd = next_cmd(cmd); + } + + if (flag & IP_FW_F_KEEP_S_COMPAT) { + have_state = cmd; + fill_cmd(cmd, O_KEEP_STATE, 0); + + prev = cmd; + cmd = next_cmd(cmd); + } + if (flag & IP_FW_BRIDGED_COMPAT) { + fill_cmd(cmd, O_LAYER2, 0); + + prev = cmd; + cmd = next_cmd(cmd); + } + + if ((flag & IF_FW_F_VIAHACK_COMPAT) == IF_FW_F_VIAHACK_COMPAT) { + /* via */ + ipfw_insn_if *ifcmd = (ipfw_insn_if *)cmd; + union ip_fw_if_compat ifu = compat_rule->fw_in_if; + + cmd->opcode = O_VIA; + ifcmd->o.len |= F_INSN_SIZE(ipfw_insn_if); + + if (ifu.fu_via_ip.s_addr == 0) { + /* "any" */ + ifcmd->name[0] = '\0'; + ifcmd->o.len = 0; + } + else if (compat_rule->fw_flg & IP_FW_F_IIFNAME_COMPAT) { + /* by name */ + strncpy(ifcmd->name, ifu.fu_via_if_compat.name, sizeof(ifcmd->name)); + ifcmd->p.unit = ifu.fu_via_if_compat.unit; + } else { + /* by addr */ + ifcmd->p.ip = ifu.fu_via_ip; + } + + prev = cmd; + cmd = next_cmd(cmd); + } else { + if (flag & IP_FW_F_IN_COMPAT) { + fill_cmd(cmd, O_IN, 0); + + prev = cmd; + cmd = next_cmd(cmd); + } + if (flag & IP_FW_F_OUT_COMPAT) { + /* if the previous command was O_IN, and this + * is being set as well, it's equivalent to not + * having either command, so let's back up prev + * to the cmd before it and move cmd to prev. 
+ */ + if (prev->opcode == O_IN) { + cmd = prev; + bzero(cmd, sizeof(*cmd)); + } else { + cmd->len ^= F_NOT; /* toggle F_NOT */ + fill_cmd(cmd, O_IN, 0); + + prev = cmd; + cmd = next_cmd(cmd); + } + } + if (flag & IP_FW_F_OIFACE_COMPAT) { + /* xmit */ + ipfw_insn_if *ifcmd = (ipfw_insn_if *)cmd; + union ip_fw_if_compat ifu = compat_rule->fw_out_if; + + cmd->opcode = O_XMIT; + ifcmd->o.len |= F_INSN_SIZE(ipfw_insn_if); + + if (ifu.fu_via_ip.s_addr == 0) { + /* "any" */ + ifcmd->name[0] = '\0'; + ifcmd->o.len = 0; + } + else if (flag & IP_FW_F_OIFNAME_COMPAT) { + /* by name */ + strncpy(ifcmd->name, ifu.fu_via_if_compat.name, sizeof(ifcmd->name)); + ifcmd->p.unit = ifu.fu_via_if_compat.unit; + } else { + /* by addr */ + ifcmd->p.ip = ifu.fu_via_ip; + } + + prev = cmd; + cmd = next_cmd(cmd); + } + else if (flag & IP_FW_F_IIFACE_COMPAT) { + /* recv */ + ipfw_insn_if *ifcmd = (ipfw_insn_if *)cmd; + union ip_fw_if_compat ifu = compat_rule->fw_in_if; + + cmd->opcode = O_RECV; + ifcmd->o.len |= F_INSN_SIZE(ipfw_insn_if); + + if (ifu.fu_via_ip.s_addr == 0) { + /* "any" */ + ifcmd->name[0] = '\0'; + ifcmd->o.len = 0; + } + else if (flag & IP_FW_F_IIFNAME_COMPAT) { + /* by name */ + strncpy(ifcmd->name, ifu.fu_via_if_compat.name, sizeof(ifcmd->name)); + ifcmd->p.unit = ifu.fu_via_if_compat.unit; + } else { + /* by addr */ + ifcmd->p.ip = ifu.fu_via_ip; + } + + prev = cmd; + cmd = next_cmd(cmd); + } + } + + if (flag & IP_FW_F_FRAG_COMPAT) { + fill_cmd(cmd, O_FRAG, 0); + + prev = cmd; + cmd = next_cmd(cmd); + } + + /* IP options */ + if (compat_rule->fw_ipopt != 0 || compat_rule->fw_ipnopt != 0) { + fill_cmd(cmd, O_IPOPT, (compat_rule->fw_ipopt & 0xff) | + (compat_rule->fw_ipnopt & 0xff) << 8); + + prev = cmd; + cmd = next_cmd(cmd); + } + + if (compat_rule->fw_prot == IPPROTO_TCP) { + if (compat_rule->fw_ipflg & IP_FW_IF_TCPEST_COMPAT) { + fill_cmd(cmd, O_ESTAB, 0); + + prev = cmd; + cmd = next_cmd(cmd); + } + + /* TCP options and flags */ + if (compat_rule->fw_tcpf != 0 || compat_rule->fw_tcpnf != 0) { + if ((compat_rule->fw_tcpf & IP_FW_TCPF_SYN_COMPAT) && + compat_rule->fw_tcpnf & IP_FW_TCPF_ACK_COMPAT) { + fill_cmd(cmd, O_TCPFLAGS, (TH_SYN) | ( (TH_ACK) & 0xff) <<8); + + prev = cmd; + cmd = next_cmd(cmd); + } + else { + fill_cmd(cmd, O_TCPFLAGS, (compat_rule->fw_tcpf & 0xff) | + (compat_rule->fw_tcpnf & 0xff) << 8); + + prev = cmd; + cmd = next_cmd(cmd); + } + } + if (compat_rule->fw_tcpopt != 0 || compat_rule->fw_tcpnopt != 0) { + fill_cmd(cmd, O_TCPOPTS, (compat_rule->fw_tcpopt & 0xff) | + (compat_rule->fw_tcpnopt & 0xff) << 8); + + prev = cmd; + cmd = next_cmd(cmd); + } + } + + /* ICMP */ + /* XXX: check this */ + if (flag & IP_FW_F_ICMPBIT_COMPAT) { + int i; + ipfw_insn_u32 *cmd32 = (ipfw_insn_u32 *)cmd; /* alias for cmd */ + + cmd32->o.opcode = O_ICMPTYPE; + cmd32->o.len |= F_INSN_SIZE(ipfw_insn_u32); + + for (i = 0; i < IP_FW_ICMPTYPES_DIM_COMPAT; i++) { + cmd32->d[0] |= compat_rule->fw_uar_compat.fw_icmptypes[i]; + } + + prev = cmd; + cmd = next_cmd(cmd); + } + } /* end commands */ + +done: + /* finally, copy everything into the current + * rule buffer in the right order. 
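+ * The resulting layout is: O_PROB (if the rule has a match
+ * probability), O_PROBE_STATE (for keep-state rules), the match
+ * opcodes, the keep-state opcode last in the body, then O_LOG as
+ * the first action followed by the remaining actions.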
+ */ + dst = curr_rule->cmd; + + /* first, do match probability */ + if (compat_rule->fw_flg & IP_FW_F_RND_MATCH_COMPAT) { + dst->opcode = O_PROB; + dst->len = 2; + *((int32_t *)(dst+1)) = compat_rule->pipe_ptr; + dst += dst->len; + } + + /* generate O_PROBE_STATE if necessary */ + if (have_state && have_state->opcode != O_CHECK_STATE) { + fill_cmd(dst, O_PROBE_STATE, 0); + dst = next_cmd(dst); + } + + /* + * copy all commands but O_LOG, O_KEEP_STATE + */ + for (src = (ipfw_insn *)cmdbuf; src != cmd; src += k) { + k = F_LEN(src); + + switch (src->opcode) { + case O_LOG: + case O_KEEP_STATE: + break; + default: + bcopy(src, dst, k * sizeof(uint32_t)); + dst += k; + } + } + + /* + * put back the have_state command as last opcode + */ + if (have_state && have_state->opcode != O_CHECK_STATE) { + k = F_LEN(have_state); + bcopy(have_state, dst, k * sizeof(uint32_t)); + dst += k; + } + + /* + * start action section + */ + curr_rule->act_ofs = dst - curr_rule->cmd; + + /* + * put back O_LOG if necessary + */ + src = (ipfw_insn *)cmdbuf; + if (src->opcode == O_LOG) { + k = F_LEN(src); + bcopy(src, dst, k * sizeof(uint32_t)); + dst += k; + } + + /* + * copy all other actions + */ + for (src = (ipfw_insn *)actbuf; src != action; src += k) { + k = F_LEN(src); + bcopy(src, dst, k * sizeof(uint32_t)); + dst += k; + } + + curr_rule->cmd_len = (uint32_t *)dst - (uint32_t *)(curr_rule->cmd); + + return; +} + +static int +ipfw_version_one_to_version_two(struct sockopt *sopt, struct ip_fw *curr_rule, + struct ip_fw_compat *rule_vers1) +{ + int err = EINVAL; + struct ip_fw_compat *rule_ptr; + struct ip_fw_compat rule; + + if (rule_vers1) { + rule_ptr = rule_vers1; + err = 0; + } else { + /* do some basic size checking here, more extensive checking later */ + if (!sopt->sopt_val || sopt->sopt_valsize < sizeof(struct ip_fw_compat)) + return err; + + if ((err = sooptcopyin(sopt, &rule, sizeof(struct ip_fw_compat), + sizeof(struct ip_fw_compat)))) { + return err; + } + + rule_ptr = &rule; + } + + /* deal with commands */ + ipfw_convert_to_cmds(curr_rule, rule_ptr); + + curr_rule->version = IP_FW_CURRENT_API_VERSION; + curr_rule->context = rule_ptr->context; + curr_rule->rulenum = rule_ptr->fw_number; + curr_rule->pcnt = rule_ptr->fw_pcnt; + curr_rule->bcnt = rule_ptr->fw_bcnt; + curr_rule->timestamp = rule_ptr->timestamp; + + +#if FW2_DEBUG_VERBOSE + ipfw_print_vers2_struct(curr_rule); +#endif /* FW2_DEBUG_VERBOSE */ + + return err; +} + +/* This converts to whatever the latest version is. Currently the + * latest version of the firewall is ipfw2. + */ +static int +ipfw_version_one_to_latest(struct sockopt *sopt, struct ip_fw *curr_rule, struct ip_fw_compat *rule_vers1) +{ + int err; + + /* if rule_vers1 is not null then this is coming from + * ipfw_version_zero_to_latest(), so pass that along; + * otherwise let ipfw_version_one_to_version_two() + * get the rule from sopt. 
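+ * Either way the same path is taken: a version one rule is
+ * rewritten in place as a current (ipfw2) rule.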
+ */ + err = ipfw_version_one_to_version_two(sopt, curr_rule, rule_vers1); + + return err; +} + +static void +ipfw_version_zero_to_one(struct ip_old_fw *rule_vers0, struct ip_fw_compat *rule_vers1) +{ + bzero(rule_vers1, sizeof(struct ip_fw_compat)); + bcopy(&rule_vers0->fw_uar, &rule_vers1->fw_uar_compat, sizeof(rule_vers0->fw_uar)); + bcopy(&rule_vers0->fw_in_if, &rule_vers1->fw_in_if, sizeof(rule_vers0->fw_in_if)); + bcopy(&rule_vers0->fw_out_if, &rule_vers1->fw_out_if, sizeof(rule_vers0->fw_out_if)); + bcopy(&rule_vers0->fw_un, &rule_vers1->fw_un_compat, sizeof(rule_vers0->fw_un)); + + rule_vers1->version = 10; + rule_vers1->fw_pcnt = rule_vers0->fw_pcnt; + rule_vers1->fw_bcnt = rule_vers0->fw_bcnt; + rule_vers1->fw_src = rule_vers0->fw_src; + rule_vers1->fw_dst = rule_vers0->fw_dst; + rule_vers1->fw_smsk = rule_vers0->fw_smsk; + rule_vers1->fw_dmsk = rule_vers0->fw_dmsk; + rule_vers1->fw_number = rule_vers0->fw_number; + rule_vers1->fw_flg = rule_vers0->fw_flg; + rule_vers1->fw_ipopt = rule_vers0->fw_ipopt; + rule_vers1->fw_ipnopt = rule_vers0->fw_ipnopt; + rule_vers1->fw_tcpf = rule_vers0->fw_tcpf & ~IP_OLD_FW_TCPF_ESTAB; + rule_vers1->fw_tcpnf = rule_vers0->fw_tcpnf; + rule_vers1->timestamp = rule_vers0->timestamp; + rule_vers1->fw_prot = rule_vers0->fw_prot; + rule_vers1->fw_nports = rule_vers0->fw_nports; + rule_vers1->pipe_ptr = rule_vers0->pipe_ptr; + rule_vers1->next_rule_ptr = rule_vers0->next_rule_ptr; + rule_vers1->fw_ipflg = (rule_vers0->fw_tcpf & IP_OLD_FW_TCPF_ESTAB) ? IP_FW_IF_TCPEST_COMPAT : 0; +} + +/* first convert to version one, then to version two */ +static int +ipfw_version_zero_to_latest(struct sockopt *sopt, struct ip_fw *curr_rule) +{ + int err; + struct ip_old_fw rule_vers0; + struct ip_fw_compat rule_vers1; + + if (sopt->sopt_name == IP_OLD_FW_GET || + sopt->sopt_name == IP_OLD_FW_FLUSH || + sopt->sopt_val == NULL) { + /* In the old-style API, it was legal to not pass in a rule + * structure for certain firewall operations (e.g. flush, + * reset log). If that's the situation, we pretend we received + * a blank structure. */ + bzero(curr_rule, sizeof(struct ip_fw)); + curr_rule->version = 10; + } + else { + if (!sopt->sopt_val || sopt->sopt_valsize < sizeof(struct ip_old_fw)) { + return EINVAL; + } + + err = sooptcopyin(sopt, &rule_vers0, sizeof(struct ip_old_fw), + sizeof(struct ip_old_fw)); + if (err) { + return err; + } + + ipfw_version_zero_to_one(&rule_vers0, &rule_vers1); + } + + return (ipfw_version_one_to_latest(sopt, curr_rule, &rule_vers1)); +} + +/* rule is a u_int32_t buffer[255] into which the converted + * (if necessary) rules go. + */ +int +ipfw_convert_to_latest(struct sockopt *sopt, struct ip_fw *curr_rule, int api_version) +{ + int err = 0; + + /* the following functions copy the rules passed in and + * convert to latest structures based on version + */ + switch (api_version) { + case IP_FW_VERSION_0: + /* this is the oldest version we support */ + err = ipfw_version_zero_to_latest(sopt, curr_rule); + break; + + case IP_FW_VERSION_1: + /* this is the version supported in Panther */ + err = ipfw_version_one_to_latest(sopt, curr_rule, NULL); + break; + + case IP_FW_CURRENT_API_VERSION: + /* IPFW2 for now */ + /* do nothing here... 
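+ * the sopt already carries a rule in the latest
+ * format, so there is nothing to convert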
*/ + break; + + default: + /* unrecognized/unsupported version */ + err = EINVAL; + break; + } + + return err; +} + +int +ipfw_get_command_and_version(struct sockopt *sopt, int *command, u_int32_t *api_version) +{ + int cmd; + int err = 0; + u_int32_t vers = IP_FW_VERSION_NONE; + + /* first deal with the oldest version */ + if (sopt->sopt_name == IP_OLD_FW_GET) { + vers = IP_FW_VERSION_0; + cmd = IP_FW_GET; + } + else if (sopt->sopt_name == IP_OLD_FW_FLUSH) { + vers = IP_FW_VERSION_0; + cmd = IP_FW_FLUSH; + } + else if (sopt->sopt_name == IP_OLD_FW_ZERO) { + vers = IP_FW_VERSION_0; + cmd = IP_FW_ZERO; + } + else if (sopt->sopt_name == IP_OLD_FW_ADD) { + vers = IP_FW_VERSION_0; + cmd = IP_FW_ADD; + } + else if (sopt->sopt_name == IP_OLD_FW_DEL) { + vers = IP_FW_VERSION_0; + cmd = IP_FW_DEL; + } + else if (sopt->sopt_name == IP_OLD_FW_RESETLOG) { + vers = IP_FW_VERSION_0; + cmd = IP_FW_RESETLOG; + } + else { + cmd = sopt->sopt_name; + } + + if (vers == IP_FW_VERSION_NONE) { + /* working off the fact that the offset + * is the same in both structs. + */ + struct ip_fw rule; + + if (!sopt->sopt_val || sopt->sopt_valsize < sizeof(struct ip_fw)) + return EINVAL; + + if ((err = sooptcopyin(sopt, &rule, sizeof(struct ip_fw), + sizeof(struct ip_fw)))) { + return err; + } + + vers = rule.version; + } + + if (command) { + *command = cmd; + } + + if (api_version) { + *api_version = vers; + } + + return err; +} + diff --git a/bsd/netinet/ip_fw2_compat.h b/bsd/netinet/ip_fw2_compat.h new file mode 100644 index 000000000..f0b7da0db --- /dev/null +++ b/bsd/netinet/ip_fw2_compat.h @@ -0,0 +1,375 @@ +/* IPFW backward compatibility */ + +#ifndef _IP_FW_COMPAT_H_ +#define _IP_FW_COMPAT_H_ + + +/* prototypes */ +void ipfw_convert_from_latest(struct ip_fw *curr_rule, void *old_rule, u_int32_t api_version); +int ipfw_convert_to_latest(struct sockopt *sopt, struct ip_fw *rule, int api_version); +int ipfw_get_command_and_version(struct sockopt *sopt, int *command, u_int32_t *api_version); + + +/* + * ****************************** + * ****** IPFW version one ****** + * ****************************** + */ + +/* + * This union structure identifies an interface, either explicitly + * by name or implicitly by IP address. The flags IP_FW_F_IIFNAME + * and IP_FW_F_OIFNAME say how to interpret this structure. An + * interface unit number of -1 matches any unit number, while an + * IP address of 0.0.0.0 indicates matches any interface. + * + * The receive and transmit interfaces are only compared against the + * the packet if the corresponding bit (IP_FW_F_IIFACE or IP_FW_F_OIFACE) + * is set. Note some packets lack a receive or transmit interface + * (in which case the missing "interface" never matches). + */ + +union ip_fw_if_compat { + struct in_addr fu_via_ip; /* Specified by IP address */ + struct { /* Specified by interface name */ +#define FW_IFNLEN_COMPAT 10 /* need room ! was IFNAMSIZ */ + char name[FW_IFNLEN_COMPAT]; + short unit; /* -1 means match any unit */ + } fu_via_if_compat; +}; + +/* + * Format of an IP firewall descriptor + * + * fw_src, fw_dst, fw_smsk, fw_dmsk are always stored in network byte order. + * fw_flg and fw_n*p are stored in host byte order (of course). + * Port numbers are stored in HOST byte order. + */ + +struct ip_fw_compat { + u_int32_t version; /* Version of this structure. Should always be */ + /* set to IP_FW_CURRENT_API_VERSION by clients. */ + void *context; /* Context that is usable by user processes to */ + /* identify this rule. 
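+ * (the converters copy this field through unchanged)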
*/ + u_int64_t fw_pcnt,fw_bcnt; /* Packet and byte counters */ + struct in_addr fw_src, fw_dst; /* Source and destination IP addr */ + struct in_addr fw_smsk, fw_dmsk; /* Mask for src and dest IP addr */ + u_short fw_number; /* Rule number */ + u_int fw_flg; /* Flags word */ +#define IP_FW_MAX_PORTS_COMPAT 10 /* A reasonable maximum */ + union { + u_short fw_pts[IP_FW_MAX_PORTS_COMPAT]; /* Array of port numbers to match */ +#define IP_FW_ICMPTYPES_MAX_COMPAT 128 +#define IP_FW_ICMPTYPES_DIM_COMPAT (IP_FW_ICMPTYPES_MAX_COMPAT / (sizeof(unsigned) * 8)) + unsigned fw_icmptypes[IP_FW_ICMPTYPES_DIM_COMPAT]; /* ICMP types bitmap */ + } fw_uar_compat; + u_int fw_ipflg; /* IP flags word */ + u_char fw_ipopt,fw_ipnopt; /* IP options set/unset */ + u_char fw_tcpopt,fw_tcpnopt; /* TCP options set/unset */ + u_char fw_tcpf,fw_tcpnf; /* TCP flags set/unset */ + long timestamp; /* timestamp (tv_sec) of last match */ + union ip_fw_if_compat fw_in_if, fw_out_if; /* Incoming and outgoing interfaces */ + union { + u_short fu_divert_port; /* Divert/tee port (options IPDIVERT) */ + u_short fu_pipe_nr; /* queue number (option DUMMYNET) */ + u_short fu_skipto_rule; /* SKIPTO command rule number */ + u_short fu_reject_code; /* REJECT response code */ + struct sockaddr_in fu_fwd_ip; + } fw_un_compat; + u_char fw_prot; /* IP protocol */ + /* + * N'of src ports and # of dst ports in ports array (dst ports + * follow src ports; max of 10 ports in all; count of 0 means + * match all ports) + */ + u_char fw_nports; + void *pipe_ptr; /* flow_set ptr for dummynet pipe */ + void *next_rule_ptr ; /* next rule in case of match */ + uid_t fw_uid; /* uid to match */ + int fw_logamount; /* amount to log */ + u_int64_t fw_loghighest; /* highest number packet to log */ +}; + +/* + * extended ipfw structure... some fields in the original struct + * can be used to pass parameters up/down, namely pointers + * void *pipe_ptr + * void *next_rule_ptr + * some others can be used to pass parameters down, namely counters etc. + * u_int64_t fw_pcnt,fw_bcnt; + * long timestamp; + */ + +struct ip_fw_ext_compat { /* extended structure */ + struct ip_fw rule; /* must be at offset 0 */ + long dont_match_prob; /* 0x7fffffff means 1.0, always fail */ + u_int dyn_type; /* type for dynamic rule */ +}; + +struct ip_fw_chain_compat { + LIST_ENTRY(ip_fw_chain_compat) next; + struct ip_fw_compat *rule; +}; + +/* + * dynamic ipfw rule + */ +struct ipfw_dyn_rule_compat { + struct ipfw_dyn_rule *next ; + + struct ipfw_flow_id id ; + struct ipfw_flow_id mask ; + struct ip_fw_chain_compat *chain ; /* pointer to parent rule */ + u_int32_t type ; /* rule type */ + u_int32_t expire ; /* expire time */ + u_int64_t pcnt, bcnt; /* match counters */ + u_int32_t bucket ; /* which bucket in hash table */ + u_int32_t state ; /* state of this rule (typ. 
a */ + /* combination of TCP flags) */ +} ; + +#define IP_FW_GETNSRCP_COMPAT(rule) ((rule)->fw_nports & 0x0f) +#define IP_FW_SETNSRCP_COMPAT(rule, n) do { \ + (rule)->fw_nports &= ~0x0f; \ + (rule)->fw_nports |= (n); \ + } while (0) +#define IP_FW_GETNDSTP_COMPAT(rule) ((rule)->fw_nports >> 4) +#define IP_FW_SETNDSTP_COMPAT(rule, n) do { \ + (rule)->fw_nports &= ~0xf0; \ + (rule)->fw_nports |= (n) << 4;\ + } while (0) + +#define fw_divert_port_compat fw_un_compat.fu_divert_port +#define fw_skipto_rule_compat fw_un_compat.fu_skipto_rule +#define fw_reject_code_compat fw_un_compat.fu_reject_code +#define fw_pipe_nr_compat fw_un_compat.fu_pipe_nr +#define fw_fwd_ip_compat fw_un_compat.fu_fwd_ip + +/* + * Values for "flags" field . + */ +#define IP_FW_F_COMMAND_COMPAT 0x000000ff /* Mask for type of chain entry: */ +#define IP_FW_F_DENY_COMPAT 0x00000000 /* This is a deny rule */ +#define IP_FW_F_REJECT_COMPAT 0x00000001 /* Deny and send a response packet */ +#define IP_FW_F_ACCEPT_COMPAT 0x00000002 /* This is an accept rule */ +#define IP_FW_F_COUNT_COMPAT 0x00000003 /* This is a count rule */ +#define IP_FW_F_DIVERT_COMPAT 0x00000004 /* This is a divert rule */ +#define IP_FW_F_TEE_COMPAT 0x00000005 /* This is a tee rule */ +#define IP_FW_F_SKIPTO_COMPAT 0x00000006 /* This is a skipto rule */ +#define IP_FW_F_FWD_COMPAT 0x00000007 /* This is a "change forwarding address" rule */ +#define IP_FW_F_PIPE_COMPAT 0x00000008 /* This is a dummynet rule */ +#define IP_FW_F_QUEUE_COMPAT 0x00000009 /* This is a dummynet queue */ + +#define IP_FW_F_IN_COMPAT 0x00000100 /* Check inbound packets */ +#define IP_FW_F_OUT_COMPAT 0x00000200 /* Check outbound packets */ +#define IP_FW_F_IIFACE_COMPAT 0x00000400 /* Apply inbound interface test */ +#define IP_FW_F_OIFACE_COMPAT 0x00000800 /* Apply outbound interface test */ + +#define IP_FW_F_PRN_COMPAT 0x00001000 /* Print if this rule matches */ + +#define IP_FW_F_SRNG_COMPAT 0x00002000 /* The first two src ports are a min * + * and max range (stored in host byte * + * order). */ + +#define IP_FW_F_DRNG_COMPAT 0x00004000 /* The first two dst ports are a min * + * and max range (stored in host byte * + * order). */ + +#define IP_FW_F_FRAG_COMPAT 0x00008000 /* Fragment */ + +#define IP_FW_F_IIFNAME_COMPAT 0x00010000 /* In interface by name/unit (not IP) */ +#define IP_FW_F_OIFNAME_COMPAT 0x00020000 /* Out interface by name/unit (not IP) */ + +#define IP_FW_F_INVSRC_COMPAT 0x00040000 /* Invert sense of src check */ +#define IP_FW_F_INVDST_COMPAT 0x00080000 /* Invert sense of dst check */ + +#define IP_FW_F_ICMPBIT_COMPAT 0x00100000 /* ICMP type bitmap is valid */ + +#define IP_FW_F_UID_COMPAT 0x00200000 /* filter by uid */ + +#define IP_FW_F_RND_MATCH_COMPAT 0x00800000 /* probabilistic rule match */ +#define IP_FW_F_SMSK_COMPAT 0x01000000 /* src-port + mask */ +#define IP_FW_F_DMSK_COMPAT 0x02000000 /* dst-port + mask */ +#define IP_FW_BRIDGED_COMPAT 0x04000000 /* only match bridged packets */ +#define IP_FW_F_KEEP_S_COMPAT 0x08000000 /* keep state */ +#define IP_FW_F_CHECK_S_COMPAT 0x10000000 /* check state */ + +#define IP_FW_F_SME_COMPAT 0x20000000 /* source = me */ +#define IP_FW_F_DME_COMPAT 0x40000000 /* destination = me */ + +#define IP_FW_F_MASK_COMPAT 0x7FFFFFFF /* All possible flag bits mask */ + +/* + * Flags for the 'fw_ipflg' field, for comparing values of ip and its protocols. 
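The *NSRCP/*NDSTP macros above pack both port counts into the single fw_nports byte, source count in the low nibble and destination count in the high nibble. A self-contained illustration of that encoding (plain C, no kernel headers needed):

    #include <stdio.h>

    int
    main(void)
    {
        unsigned char fw_nports = 0;

        fw_nports = (fw_nports & ~0x0f) | 3;        /* SETNSRCP: 3 src ports */
        fw_nports = (fw_nports & ~0xf0) | (2 << 4); /* SETNDSTP: 2 dst ports */

        /* GETNSRCP and GETNDSTP, expanded */
        printf("src=%u dst=%u\n", fw_nports & 0x0f, fw_nports >> 4);
        return 0;
    }

This prints src=3 dst=2; the limit of 10 ports total fits comfortably in two 4-bit counts.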
+ */ +#define IP_FW_IF_TCPEST_COMPAT 0x00000020 /* established TCP connection */ +#define IP_FW_IF_TCPMSK_COMPAT 0x00000020 /* mask of all TCP values */ + +/* + * Definitions for TCP flags. + */ +#define IP_FW_TCPF_FIN_COMPAT TH_FIN +#define IP_FW_TCPF_SYN_COMPAT TH_SYN +#define IP_FW_TCPF_RST_COMPAT TH_RST +#define IP_FW_TCPF_PSH_COMPAT TH_PUSH +#define IP_FW_TCPF_ACK_COMPAT TH_ACK +#define IP_FW_TCPF_URG_COMPAT TH_URG + +/* + * For backwards compatibility with rules specifying "via iface" but + * not restricted to only "in" or "out" packets, we define this combination + * of bits to represent this configuration. + */ + +#define IF_FW_F_VIAHACK_COMPAT (IP_FW_F_IN_COMPAT|IP_FW_F_OUT_COMPAT|IP_FW_F_IIFACE_COMPAT|IP_FW_F_OIFACE_COMPAT) + +/* + * Definitions for REJECT response codes. + * Values less than 256 correspond to ICMP unreachable codes. + */ +#define IP_FW_REJECT_RST_COMPAT 0x0100 /* TCP packets: send RST */ + + +/* + * ****************************** + * ****** IPFW version zero ***** + * ****************************** + */ + +/* + * This union structure identifies an interface, either explicitly + * by name or implicitly by IP address. The flags IP_FW_F_IIFNAME + * and IP_FW_F_OIFNAME say how to interpret this structure. An + * interface unit number of -1 matches any unit number, while an + * IP address of 0.0.0.0 indicates matches any interface. + * + * The receive and transmit interfaces are only compared against the + * the packet if the corresponding bit (IP_FW_F_IIFACE or IP_FW_F_OIFACE) + * is set. Note some packets lack a receive or transmit interface + * (in which case the missing "interface" never matches). + */ + +union ip_old_fw_if { + struct in_addr fu_via_ip; /* Specified by IP address */ + struct { /* Specified by interface name */ +#define OLD_FW_IFNLEN 10 /* need room ! was IFNAMSIZ */ + char name[OLD_FW_IFNLEN]; + short unit; /* -1 means match any unit */ + } fu_via_if; +}; + +/* + * Format of an IP firewall descriptor + * + * fw_src, fw_dst, fw_smsk, fw_dmsk are always stored in network byte order. + * fw_flg and fw_n*p are stored in host byte order (of course). + * Port numbers are stored in HOST byte order. 
+ * Warning: setsockopt() will fail if sizeof(struct ip_fw) > MLEN (108) + */ + +struct ip_old_fw { + u_int64_t fw_pcnt,fw_bcnt; /* Packet and byte counters */ + struct in_addr fw_src, fw_dst; /* Source and destination IP addr */ + struct in_addr fw_smsk, fw_dmsk; /* Mask for src and dest IP addr */ + u_short fw_number; /* Rule number */ + u_int fw_flg; /* Flags word */ +#define IP_OLD_FW_MAX_PORTS 10 /* A reasonable maximum */ + union { + u_short fw_pts[IP_OLD_FW_MAX_PORTS]; /* Array of port numbers to match */ +#define IP_OLD_FW_ICMPTYPES_MAX 128 +#define IP_OLD_FW_ICMPTYPES_DIM (IP_OLD_FW_ICMPTYPES_MAX / (sizeof(unsigned) * 8)) + unsigned fw_icmptypes[IP_OLD_FW_ICMPTYPES_DIM]; /* ICMP types bitmap */ + } fw_uar; + u_char fw_ipopt,fw_ipnopt; /* IP options set/unset */ + u_char fw_tcpf,fw_tcpnf; /* TCP flags set/unset */ + long timestamp; /* timestamp (tv_sec) of last match */ + union ip_old_fw_if fw_in_if, fw_out_if; /* Incoming and outgoing interfaces */ + union { + u_short fu_divert_port; /* Divert/tee port (options IPDIVERT) */ + u_short fu_pipe_nr; /* pipe number (option DUMMYNET) */ + u_short fu_skipto_rule; /* SKIPTO command rule number */ + u_short fu_reject_code; /* REJECT response code */ + struct sockaddr_in fu_fwd_ip; + } fw_un; + u_char fw_prot; /* IP protocol */ + u_char fw_nports; /* N'of src ports and # of dst ports */ + /* in ports array (dst ports follow */ + /* src ports; max of 10 ports in all; */ + /* count of 0 means match all ports) */ + void *pipe_ptr; /* Pipe ptr in case of dummynet pipe */ + void *next_rule_ptr ; /* next rule in case of match */ +}; + +#define IP_OLD_FW_GETNSRCP(rule) ((rule)->fw_nports & 0x0f) +#define IP_OLD_FW_SETNSRCP(rule, n) do { \ + (rule)->fw_nports &= ~0x0f; \ + (rule)->fw_nports |= (n); \ + } while (0) +#define IP_OLD_FW_GETNDSTP(rule) ((rule)->fw_nports >> 4) +#define IP_OLD_FW_SETNDSTP(rule, n) do { \ + (rule)->fw_nports &= ~0xf0; \ + (rule)->fw_nports |= (n) << 4;\ + } while (0) + +#define old_fw_divert_port fw_un.fu_divert_port +#define old_fw_skipto_rule fw_un.fu_skipto_rule +#define old_fw_reject_code fw_un.fu_reject_code +#define old_fw_pipe_nr fw_un.fu_pipe_nr +#define old_fw_fwd_ip fw_un.fu_fwd_ip + +/* + * Values for "flags" field . + */ +#define IP_OLD_FW_F_COMMAND 0x000000ff /* Mask for type of chain entry: */ +#define IP_OLD_FW_F_DENY 0x00000000 /* This is a deny rule */ +#define IP_OLD_FW_F_REJECT 0x00000001 /* Deny and send a response packet */ +#define IP_OLD_FW_F_ACCEPT 0x00000002 /* This is an accept rule */ +#define IP_OLD_FW_F_COUNT 0x00000003 /* This is a count rule */ +#define IP_OLD_FW_F_DIVERT 0x00000004 /* This is a divert rule */ +#define IP_OLD_FW_F_TEE 0x00000005 /* This is a tee rule */ +#define IP_OLD_FW_F_SKIPTO 0x00000006 /* This is a skipto rule */ +#define IP_OLD_FW_F_FWD 0x00000007 /* This is a "change forwarding address" rule */ +#define IP_OLD_FW_F_PIPE 0x00000008 /* This is a dummynet rule */ + +#define IP_OLD_FW_F_IN 0x00000100 /* Check inbound packets */ +#define IP_OLD_FW_F_OUT 0x00000200 /* Check outbound packets */ +#define IP_OLD_FW_F_IIFACE 0x00000400 /* Apply inbound interface test */ +#define IP_OLD_FW_F_OIFACE 0x00000800 /* Apply outbound interface test */ + +#define IP_OLD_FW_F_PRN 0x00001000 /* Print if this rule matches */ + +#define IP_OLD_FW_F_SRNG 0x00002000 /* The first two src ports are a min * + * and max range (stored in host byte * + * order). */ + +#define IP_OLD_FW_F_DRNG 0x00004000 /* The first two dst ports are a min * + * and max range (stored in host byte * + * order). 
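Per the comments on IP_OLD_FW_F_SRNG and IP_OLD_FW_F_DRNG, those flags turn the first two entries of the port array into an inclusive min/max range instead of a discrete list. A sketch of the matching semantics the comments describe, assuming the definitions above are in scope (this illustrates the rule format, not the kernel's actual checker):

    static int
    match_src_port(const struct ip_old_fw *rule, u_short port)
    {
        int n = IP_OLD_FW_GETNSRCP(rule);

        if (n == 0)
            return 1;                   /* count of 0: match all ports */
        if (rule->fw_flg & IP_OLD_FW_F_SRNG)
            return port >= rule->fw_uar.fw_pts[0] &&
                port <= rule->fw_uar.fw_pts[1];
        for (int i = 0; i < n; i++)
            if (rule->fw_uar.fw_pts[i] == port)
                return 1;
        return 0;
    }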
*/ + +#define IP_OLD_FW_F_FRAG 0x00008000 /* Fragment */ + +#define IP_OLD_FW_F_IIFNAME 0x00010000 /* In interface by name/unit (not IP) */ +#define IP_OLD_FW_F_OIFNAME 0x00020000 /* Out interface by name/unit (not IP) */ + +#define IP_OLD_FW_F_INVSRC 0x00040000 /* Invert sense of src check */ +#define IP_OLD_FW_F_INVDST 0x00080000 /* Invert sense of dst check */ + +#define IP_OLD_FW_F_ICMPBIT 0x00100000 /* ICMP type bitmap is valid */ + +#define IP_OLD_FW_F_MASK 0x001FFFFF /* All possible flag bits mask */ + +/* + * For backwards compatibility with rules specifying "via iface" but + * not restricted to only "in" or "out" packets, we define this combination + * of bits to represent this configuration. + */ + +#define IF_OLD_FW_F_VIAHACK (IP_OLD_FW_F_IN|IP_OLD_FW_F_OUT|IP_OLD_FW_F_IIFACE|IP_OLD_FW_F_OIFACE) + +/* + * Definitions for TCP flags - abridged + */ +#define IP_OLD_FW_TCPF_ESTAB 0x40 + + +#endif /* _IP_FW_COMPAT_H_ */ diff --git a/bsd/netinet/ip_icmp.c b/bsd/netinet/ip_icmp.c index 82ba9a44f..d6fbacb09 100644 --- a/bsd/netinet/ip_icmp.c +++ b/bsd/netinet/ip_icmp.c @@ -148,9 +148,9 @@ SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW, int icmpprintfs = 0; #endif -static void icmp_reflect __P((struct mbuf *)); -static void icmp_send __P((struct mbuf *, struct mbuf *)); -static int ip_next_mtu __P((int, int)); +static void icmp_reflect(struct mbuf *); +static void icmp_send(struct mbuf *, struct mbuf *); +static int ip_next_mtu(int, int); extern struct protosw inetsw[]; @@ -159,11 +159,12 @@ extern struct protosw inetsw[]; * in response to bad packet ip. */ void -icmp_error(n, type, code, dest, destifp) - struct mbuf *n; - int type, code; - n_long dest; - struct ifnet *destifp; +icmp_error( + struct mbuf *n, + int type, + int code, + n_long dest, + struct ifnet *destifp) { register struct ip *oip = mtod(n, struct ip *), *nip; register unsigned oiplen = IP_VHL_HL(oip->ip_vhl) << 2; @@ -279,8 +280,9 @@ icmp_input(m, hlen) int icmplen = ip->ip_len; register int i; struct in_ifaddr *ia; - void (*ctlfunc) __P((int, struct sockaddr *, void *)); + void (*ctlfunc)(int, struct sockaddr *, void *); int code; + char ipv4str[MAX_IPv4_STR_LEN]; /* * Locate icmp structure in mbuf, and check @@ -288,10 +290,12 @@ icmp_input(m, hlen) */ #if ICMPPRINTFS if (icmpprintfs) { - char buf[4 * sizeof "123"]; - strcpy(buf, inet_ntoa(ip->ip_src)); + char buf[MAX_IPv4_STR_LEN]; + printf("icmp_input from %s to %s, len %d\n", - buf, inet_ntoa(ip->ip_dst), icmplen); + inet_ntop(AF_INET, &ip->ip_src, buf, sizeof(buf)), + inet_ntop(AF_INET, &ip->ip_dst, ipv4str, sizeof(ipv4str)), + icmplen); } #endif if (icmplen < ICMP_MINLEN) { @@ -446,7 +450,9 @@ icmp_input(m, hlen) 1); #if DEBUG_MTUDISC printf("MTU for %s reduced to %d\n", - inet_ntoa(icmpsrc.sin_addr), mtu); + inet_ntop(AF_INET, &icmpsrc.sin_addr, ipv4str, + sizeof(ipv4str)), + mtu); #endif if (mtu < max(296, (tcp_minmss + sizeof(struct tcpiphdr)))) { /* rt->rt_rmx.rmx_mtu = @@ -537,8 +543,11 @@ icmp_input(m, hlen) (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); if (ia == 0) break; - if (ia->ia_ifp == 0) + if (ia->ia_ifp == 0) { + ifafree(&ia->ia_ifa); + ia = 0; break; + } icp->icmp_type = ICMP_MASKREPLY; icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr; if (ip->ip_src.s_addr == 0) { @@ -547,6 +556,7 @@ icmp_input(m, hlen) else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; } + ifafree(&ia->ia_ifa); reflect: ip->ip_len += hlen; /* since ip_input deducts this */ icmpstat.icps_reflect++; @@ -590,11 +600,12 
@@ reflect: icmpdst.sin_addr = icp->icmp_gwaddr; #if ICMPPRINTFS if (icmpprintfs) { - char buf[4 * sizeof "123"]; - strcpy(buf, inet_ntoa(icp->icmp_ip.ip_dst)); + char buf[MAX_IPv4_STR_LEN]; printf("redirect dst %s to %s\n", - buf, inet_ntoa(icp->icmp_gwaddr)); + inet_ntop(AF_INET, &icp->icmp_ip.ip_dst, buf, sizeof(buf)), + inet_ntop(AF_INET, &icp->icmp_gwaddr, ipv4str, + sizeof(ipv4str))); } #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; @@ -657,6 +668,7 @@ icmp_reflect(m) * or anonymous), use the address which corresponds * to the incoming interface. */ + lck_mtx_lock(rt_mtx); for (ia = in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) { if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) break; @@ -664,6 +676,8 @@ icmp_reflect(m) t.s_addr == satosin(&ia->ia_broadaddr)->sin_addr.s_addr) break; } + if (ia) + ifaref(&ia->ia_ifa); icmpdst.sin_addr = t; if ((ia == (struct in_ifaddr *)0) && m->m_pkthdr.rcvif) ia = (struct in_ifaddr *)ifaof_ifpforaddr( @@ -672,11 +686,16 @@ icmp_reflect(m) * The following happens if the packet was not addressed to us, * and was received on an interface with no IP address. */ - if (ia == (struct in_ifaddr *)0) + if (ia == (struct in_ifaddr *)0) { ia = in_ifaddrhead.tqh_first; + ifaref(&ia->ia_ifa); + } + lck_mtx_unlock(rt_mtx); t = IA_SIN(ia)->sin_addr; ip->ip_src = t; ip->ip_ttl = ip_defttl; + ifafree(&ia->ia_ifa); + ia = NULL; if (optlen > 0) { register u_char *cp; @@ -770,6 +789,7 @@ icmp_send(m, opts) register int hlen; register struct icmp *icp; struct route ro; + char ipv4str[MAX_IPv4_STR_LEN]; hlen = IP_VHL_HL(ip->ip_vhl) << 2; m->m_data += hlen; @@ -779,16 +799,17 @@ icmp_send(m, opts) icp->icmp_cksum = in_cksum(m, ip->ip_len - hlen); m->m_data -= hlen; m->m_len += hlen; - m->m_pkthdr.rcvif = (struct ifnet *)0; + m->m_pkthdr.rcvif = 0; m->m_pkthdr.aux = NULL; m->m_pkthdr.csum_data = 0; m->m_pkthdr.csum_flags = 0; #if ICMPPRINTFS if (icmpprintfs) { - char buf[4 * sizeof "123"]; - strcpy(buf, inet_ntoa(ip->ip_dst)); + char buf[MAX_IPv4_STR_LEN]; + printf("icmp_send dst %s src %s\n", - buf, inet_ntoa(ip->ip_src)); + inet_ntop(AF_INET, &ip->ip_dst, buf, sizeof(buf)), + inet_ntop(AF_INET, &ip->ip_src, ipv4str, sizeof(ipv4str))); } #endif bzero(&ro, sizeof ro); @@ -893,7 +914,7 @@ badport_bandlim(int which) if (icmplim <= 0 || which > BANDLIM_MAX || which < 0) return(0); - getmicrotime(&time); + getmicrouptime(&time); secs = time.tv_sec - lticks[which].tv_sec ; @@ -959,7 +980,7 @@ __private_extern__ struct pr_usrreqs icmp_dgram_usrreqs = { pru_connect2_notsupp, in_control, rip_detach, rip_disconnect, pru_listen_notsupp, in_setpeeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, icmp_dgram_send, pru_sense_null, rip_shutdown, - in_setsockaddr, sosend, soreceive, sopoll + in_setsockaddr, sosend, soreceive, pru_sopoll_notsupp }; /* Like rip_attach but without root privilege enforcement */ @@ -1059,8 +1080,7 @@ icmp_dgram_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *n /* Only IPv4 */ if (IP_VHL_V(ip->ip_vhl) != 4) goto bad; - if (hlen < 20 || hlen > 40 || ip->ip_len != m->m_pkthdr.len || - ip->ip_len > 65535) + if (hlen < 20 || hlen > 40 || ip->ip_len != m->m_pkthdr.len) goto bad; /* Bogus fragments can tie up peer resources */ if (ip->ip_off != 0) @@ -1070,12 +1090,22 @@ icmp_dgram_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *n goto bad; /* To prevent spoofing, specified source address must be one of ours */ if (ip->ip_src.s_addr != INADDR_ANY) { - if (TAILQ_EMPTY(&in_ifaddrhead)) + socket_unlock(so, 0); + 
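A pattern worth calling out in these icmp_input()/icmp_send() hunks: inet_ntoa() returns a pointer to a single static buffer, so two calls in one printf() would print the same address twice, while inet_ntop() writes into caller-supplied storage (MAX_IPv4_STR_LEN bytes here), which is why each conversion now gets its own buffer. User-space equivalent, with INET_ADDRSTRLEN standing in for MAX_IPv4_STR_LEN:

    #include <stdio.h>
    #include <arpa/inet.h>

    int
    main(void)
    {
        struct in_addr src = { .s_addr = htonl(0x0a000001) }; /* 10.0.0.1 */
        struct in_addr dst = { .s_addr = htonl(0x0a000002) }; /* 10.0.0.2 */
        char b1[INET_ADDRSTRLEN], b2[INET_ADDRSTRLEN];

        /* Two distinct buffers, so both strings survive to the call. */
        printf("icmp_input from %s to %s\n",
            inet_ntop(AF_INET, &src, b1, sizeof(b1)),
            inet_ntop(AF_INET, &dst, b2, sizeof(b2)));
        return 0;
    }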
lck_mtx_lock(rt_mtx); + if (TAILQ_EMPTY(&in_ifaddrhead)) { + lck_mtx_unlock(rt_mtx); + socket_lock(so, 0); goto bad; + } TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { - if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_src.s_addr) + if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_src.s_addr) { + lck_mtx_unlock(rt_mtx); + socket_lock(so, 0); goto ours; + } } + lck_mtx_unlock(rt_mtx); + socket_lock(so, 0); goto bad; } ours: diff --git a/bsd/netinet/ip_icmp.h b/bsd/netinet/ip_icmp.h index 22e119eb1..8aff81cb0 100644 --- a/bsd/netinet/ip_icmp.h +++ b/bsd/netinet/ip_icmp.h @@ -206,11 +206,9 @@ struct icmp { (type) == ICMP_IREQ || (type) == ICMP_IREQREPLY || \ (type) == ICMP_MASKREQ || (type) == ICMP_MASKREPLY) -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -void icmp_error __P((struct mbuf *, int, int, n_long, struct ifnet *)); -void icmp_input __P((struct mbuf *, int)); -#endif /* __APPLE_API_PRIVATE */ -#endif +#ifdef KERNEL_PRIVATE +void icmp_error(struct mbuf *, int, int, n_long, struct ifnet *); +void icmp_input(struct mbuf *, int); +#endif KERNEL_PRIVATE #endif diff --git a/bsd/netinet/ip_id.c b/bsd/netinet/ip_id.c index 4e7e70980..42630415b 100644 --- a/bsd/netinet/ip_id.c +++ b/bsd/netinet/ip_id.c @@ -87,9 +87,9 @@ static u_int16_t ru_msb = 0; static long ru_reseed; static u_int32_t tmp; /* Storage for unused random */ -static u_int16_t pmod __P((u_int16_t, u_int16_t, u_int16_t)); -static void ip_initid __P((void)); -u_int16_t ip_randomid __P((void)); +static u_int16_t pmod(u_int16_t, u_int16_t, u_int16_t); +static void ip_initid(void); +u_int16_t ip_randomid(void); /* * Do a fast modular exponation, returned value will be in the range @@ -135,7 +135,7 @@ ip_initid(void) int noprime = 1; struct timeval time; - getmicrotime(&time); + getmicrouptime(&time); read_random((void *) &tmp, sizeof(tmp)); ru_x = (tmp & 0xFFFF) % RU_M; @@ -186,7 +186,7 @@ ip_randomid(void) int i, n; struct timeval time; - getmicrotime(&time); + getmicrouptime(&time); if (ru_counter >= RU_MAX || time.tv_sec > ru_reseed) ip_initid(); diff --git a/bsd/netinet/ip_input.c b/bsd/netinet/ip_input.c index 5ac01c9e1..4a219f43c 100644 --- a/bsd/netinet/ip_input.c +++ b/bsd/netinet/ip_input.c @@ -70,12 +70,13 @@ #include <sys/sysctl.h> #include <kern/queue.h> +#include <kern/locks.h> #include <net/if.h> #include <net/if_var.h> #include <net/if_dl.h> #include <net/route.h> -#include <net/netisr.h> +#include <net/kpi_protocol.h> #include <netinet/in.h> #include <netinet/in_systm.h> @@ -87,6 +88,9 @@ #include <sys/socketvar.h> #include <netinet/ip_fw.h> +#include <netinet/ip_divert.h> + +#include <netinet/kpi_ipfilter_var.h> /* needed for AUTOCONFIGURING: */ #include <netinet/udp.h> @@ -116,6 +120,7 @@ #if IPSEC extern int ipsec_bypass; +extern lck_mtx_t *sadb_mutex; #endif int rsvp_on = 0; @@ -149,7 +154,7 @@ SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW, "Enable packet capture for FAITH IPv4->IPv6 translater daemon"); static int nipq = 0; /* total # of reass queues */ -static int maxnipq = 0; +static int maxnipq; SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_RW, &maxnipq, 0, "Maximum number of IPv4 fragment reassembly queue entries"); @@ -159,6 +164,12 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW, &maxfragsperpacket, 0, "Maximum number of IPv4 fragments allowed per packet"); +static int maxfrags; +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW, + &maxfrags, 0, "Maximum number of IPv4 fragments allowed"); + +static int currentfrags = 0; + /* * XXX - Setting ip_checkinterface mostly 
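In the icmp_dgram_send() hunk above, the anti-spoofing walk over in_ifaddrhead now needs rt_mtx, and the socket lock is deliberately dropped before taking it and retaken afterward, presumably to keep the socket-lock/rt_mtx ordering consistent everywhere. A user-space analog of that discipline with pthread mutexes (names are illustrative; the callback stands in for the TAILQ_FOREACH):

    #include <pthread.h>

    static pthread_mutex_t so_lock = PTHREAD_MUTEX_INITIALIZER; /* per socket */
    static pthread_mutex_t rt_lock = PTHREAD_MUTEX_INITIALIZER; /* addr list  */

    /* Called with so_lock held; returns with so_lock held again. */
    static int
    source_addr_is_ours(int (*walk_list)(void))
    {
        int ours;

        pthread_mutex_unlock(&so_lock);  /* drop before taking rt_lock */
        pthread_mutex_lock(&rt_lock);
        ours = walk_list();
        pthread_mutex_unlock(&rt_lock);
        pthread_mutex_lock(&so_lock);    /* reacquire before returning */
        return ours;
    }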
implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table @@ -203,7 +214,15 @@ SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RD, (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) static struct ipq ipq[IPREASS_NHASH]; +static TAILQ_HEAD(ipq_list, ipq) ipq_list = + TAILQ_HEAD_INITIALIZER(ipq_list); const int ipintrq_present = 1; +lck_mtx_t *ip_mutex; +lck_attr_t *ip_mutex_attr; +lck_grp_t *ip_mutex_grp; +lck_grp_attr_t *ip_mutex_grp_attr; +lck_mtx_t *inet_domain_mutex; +extern lck_mtx_t *domain_proto_mtx; #if IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, @@ -219,14 +238,14 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, /* Firewall hooks */ ip_fw_chk_t *ip_fw_chk_ptr; -ip_fw_ctl_t *ip_fw_ctl_ptr; int fw_enable = 1 ; +int fw_one_pass = 1; #if DUMMYNET -ip_dn_ctl_t *ip_dn_ctl_ptr; +ip_dn_io_t *ip_dn_io_ptr; #endif -int (*fr_checkp) __P((struct ip *, int, struct ifnet *, int, struct mbuf **)) = NULL; +int (*fr_checkp)(struct ip *, int, struct ifnet *, int, struct mbuf **) = NULL; SYSCTL_NODE(_net_inet_ip, OID_AUTO, linklocal, CTLFLAG_RW, 0, "link local"); @@ -237,7 +256,7 @@ SYSCTL_STRUCT(_net_inet_ip_linklocal, OID_AUTO, stat, CTLFLAG_RD, SYSCTL_NODE(_net_inet_ip_linklocal, OID_AUTO, in, CTLFLAG_RW, 0, "link local input"); -int ip_linklocal_in_allowbadttl = 0; +int ip_linklocal_in_allowbadttl = 1; SYSCTL_INT(_net_inet_ip_linklocal_in, OID_AUTO, allowbadttl, CTLFLAG_RW, &ip_linklocal_in_allowbadttl, 0, "Allow incoming link local packets with TTL less than 255"); @@ -258,29 +277,26 @@ static struct ip_srcrt { struct in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)]; } ip_srcrt; -struct sockaddr_in *ip_fw_fwd_addr; - #ifdef __APPLE__ extern struct mbuf* m_dup(register struct mbuf *m, int how); #endif -static void save_rte __P((u_char *, struct in_addr)); -static int ip_dooptions __P((struct mbuf *)); -static void ip_forward __P((struct mbuf *, int)); -static void ip_freef __P((struct ipq *)); +static void save_rte(u_char *, struct in_addr); +static int ip_dooptions(struct mbuf *, int, struct sockaddr_in *, struct route *ipforward_rt); +static void ip_forward(struct mbuf *, int, struct sockaddr_in *, struct route *ipforward_rt); +static void ip_freef(struct ipq *); #if IPDIVERT #ifdef IPDIVERT_44 -static struct mbuf *ip_reass __P((struct mbuf *, - struct ipq *, struct ipq *, u_int32_t *, u_int16_t *)); +static struct mbuf *ip_reass(struct mbuf *, + struct ipq *, struct ipq *, u_int32_t *, u_int16_t *); #else -static struct mbuf *ip_reass __P((struct mbuf *, - struct ipq *, struct ipq *, u_int16_t *, u_int16_t *)); +static struct mbuf *ip_reass(struct mbuf *, + struct ipq *, struct ipq *, u_int16_t *, u_int16_t *); #endif #else -static struct mbuf *ip_reass __P((struct mbuf *, struct ipq *, struct ipq *)); +static struct mbuf *ip_reass(struct mbuf *, struct ipq *, struct ipq *); #endif -static struct in_ifaddr *ip_rtaddr __P((struct in_addr)); -void ipintr __P((void)); +void ipintr(void); #if RANDOM_IP_ID extern u_short ip_id; @@ -299,11 +315,13 @@ ip_init() register struct protosw *pr; register int i; static ip_initialized = 0; + struct timeval timenow; + if (!ip_initialized) { TAILQ_INIT(&in_ifaddrhead); - pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); + pr = pffindproto_locked(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == 0) panic("ip_init"); for (i = 0; i < IPPROTO_MAX; i++) @@ -318,16 +336,43 @@ ip_init() ipq[i].next = ipq[i].prev = &ipq[i]; maxnipq = nmbclusters / 32; - 
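The new maxfrags knob above is the stock SYSCTL_INT shape: a static backing variable exported read-write under the parent node, so the global fragment budget can be tuned at runtime. The declaration pair, as it appears in the hunk:

    static int maxfrags;
    SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW,
        &maxfrags, 0, "Maximum number of IPv4 fragments allowed");

Assuming the usual node naming, an administrator would adjust it with sysctl -w net.inet.ip.maxfrags=<n>.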
maxfragsperpacket = 16; + maxfrags = maxnipq * 2; + maxfragsperpacket = 128; /* enough for 64k in 512 byte fragments */ #if RANDOM_IP_ID - ip_id = time_second & 0xffff; + getmicrouptime(&timenow); + ip_id = timenow.tv_sec & 0xffff; #endif ipintrq.ifq_maxlen = ipqmaxlen; + + ipf_init(); + + ip_mutex_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setdefault(ip_mutex_grp_attr); + + ip_mutex_grp = lck_grp_alloc_init("ip", ip_mutex_grp_attr); + + ip_mutex_attr = lck_attr_alloc_init(); + + lck_attr_setdefault(ip_mutex_attr); + + if ((ip_mutex = lck_mtx_alloc_init(ip_mutex_grp, ip_mutex_attr)) == NULL) { + printf("ip_init: can't alloc ip_mutex\n"); + return; + } + ip_initialized = 1; } } +static void +ip_proto_input( + protocol_family_t protocol, + mbuf_t packet) +{ + ip_input(packet); +} + /* Initialize the PF_INET domain, and add in the pre-defined protos */ void in_dinit() @@ -341,15 +386,90 @@ in_dinit() { kprintf("Initing %d protosw entries\n", in_proto_count); dp = &inetdomain; + dp->dom_flags = DOM_REENTRANT; for (i=0, pr = &inetsw[0]; i<in_proto_count; i++, pr++) net_add_proto(pr, dp); + inet_domain_mutex = dp->dom_mtx; inetdomain_initted = 1; + + lck_mtx_unlock(domain_proto_mtx); + proto_register_input(PF_INET, ip_proto_input, NULL); + lck_mtx_lock(domain_proto_mtx); } } +__private_extern__ void +ip_proto_dispatch_in( + struct mbuf *m, + int hlen, + u_int8_t proto, + ipfilter_t inject_ipfref) +{ + struct ipfilter *filter; + int seen = (inject_ipfref == 0); + int changed_header = 0; + struct ip *ip; + + if (!TAILQ_EMPTY(&ipv4_filters)) { + ipf_ref(); + TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { + if (seen == 0) { + if ((struct ipfilter *)inject_ipfref == filter) + seen = 1; + } else if (filter->ipf_filter.ipf_input) { + errno_t result; + + if (changed_header == 0) { + changed_header = 1; + ip = mtod(m, struct ip *); + ip->ip_len = htons(ip->ip_len + hlen); + ip->ip_off = htons(ip->ip_off); + ip->ip_sum = 0; + ip->ip_sum = in_cksum(m, hlen); + } + result = filter->ipf_filter.ipf_input( + filter->ipf_filter.cookie, (mbuf_t*)&m, hlen, proto); + if (result == EJUSTRETURN) { + ipf_unref(); + return; + } + if (result != 0) { + ipf_unref(); + m_freem(m); + return; + } + } + } + ipf_unref(); + } + /* + * If there isn't a specific lock for the protocol + * we're about to call, use the generic lock for AF_INET. + * otherwise let the protocol deal with its own locking + */ + ip = mtod(m, struct ip *); + + if (changed_header) { + ip->ip_len = ntohs(ip->ip_len) - hlen; + ip->ip_off = ntohs(ip->ip_off); + } + + if (!(ip_protox[ip->ip_p]->pr_flags & PR_PROTOLOCK)) { + lck_mtx_lock(inet_domain_mutex); + (*ip_protox[ip->ip_p]->pr_input)(m, hlen); + lck_mtx_unlock(inet_domain_mutex); + } + else + (*ip_protox[ip->ip_p]->pr_input)(m, hlen); + +} + +/* + * ipforward_rt cleared in in_addroute() + * when a new route is successfully created. + */ static struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; -static struct route ipforward_rt; /* * Ip input routine. Checksum and byte swap header. 
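ip_init() above also shows the four-step xnu lock-allocation idiom this patch applies throughout the stack: allocate group attributes, a named lock group (the name shows up in lock-debugging output), lock attributes, and then the mutex itself. Condensed to the calls the hunk makes (kernel-only KPIs, shown here just to make the boilerplate legible):

    lck_grp_attr_t *grp_attr = lck_grp_attr_alloc_init();
    lck_grp_attr_setdefault(grp_attr);

    lck_grp_t *grp = lck_grp_alloc_init("ip", grp_attr);

    lck_attr_t *attr = lck_attr_alloc_init();
    lck_attr_setdefault(attr);

    lck_mtx_t *mtx = lck_mtx_alloc_init(grp, attr);
    if (mtx == NULL)
        printf("ip_init: can't alloc ip_mutex\n");  /* init bails out */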
If fragmented @@ -363,41 +483,77 @@ ip_input(struct mbuf *m) struct in_ifaddr *ia = NULL; int i, hlen, mff, checkif; u_short sum; - u_int16_t divert_cookie; /* firewall cookie */ struct in_addr pkt_dst; -#if IPDIVERT - u_int16_t divert_info = 0; /* packet divert/tee info */ -#endif - struct ip_fw_chain *rule = NULL; + u_int32_t div_info = 0; /* packet divert/tee info */ + struct ip_fw_args args; + ipfilter_t inject_filter_ref = 0; + struct m_tag *tag; + struct route ipforward_rt = { 0 }; + + lck_mtx_lock(ip_mutex); + + args.eh = NULL; + args.oif = NULL; + args.rule = NULL; + args.divert_rule = 0; /* divert cookie */ + args.next_hop = NULL; + + /* Grab info from mtags prepended to the chain */ +#if DUMMYNET + if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) { + struct dn_pkt_tag *dn_tag; + + dn_tag = (struct dn_pkt_tag *)(tag+1); + args.rule = dn_tag->rule; + + m_tag_delete(m, tag); + } +#endif /* DUMMYNET */ -#if IPDIVERT - /* Get and reset firewall cookie */ - divert_cookie = ip_divert_cookie; - ip_divert_cookie = 0; -#else - divert_cookie = 0; -#endif + if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) { + struct divert_tag *div_tag; + + div_tag = (struct divert_tag *)(tag+1); + args.divert_rule = div_tag->cookie; -#if IPFIREWALL && DUMMYNET - /* - * dummynet packet are prepended a vestigial mbuf with - * m_type = MT_DUMMYNET and m_data pointing to the matching - * rule. - */ - if (m->m_type == MT_DUMMYNET) { - rule = (struct ip_fw_chain *)(m->m_data) ; - m = m->m_next ; - ip = mtod(m, struct ip *); - hlen = IP_VHL_HL(ip->ip_vhl) << 2; - goto iphack ; - } else - rule = NULL ; -#endif + m_tag_delete(m, tag); + } + if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) { + struct ip_fwd_tag *ipfwd_tag; + + ipfwd_tag = (struct ip_fwd_tag *)(tag+1); + args.next_hop = ipfwd_tag->next_hop; + m_tag_delete(m, tag); + } + #if DIAGNOSTIC if (m == NULL || (m->m_flags & M_PKTHDR) == 0) panic("ip_input no HDR"); #endif + + if (args.rule) { /* dummynet already filtered us */ + ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + inject_filter_ref = ipf_get_inject_filter(m); + goto iphack ; + } + + /* + * No need to proccess packet twice if we've + * already seen it + */ + inject_filter_ref = ipf_get_inject_filter(m); + if (inject_filter_ref != 0) { + lck_mtx_unlock(ip_mutex); + ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + ip->ip_len = ntohs(ip->ip_len) - hlen; + ip->ip_off = ntohs(ip->ip_off); + ip_proto_dispatch_in(m, hlen, ip->ip_p, inject_filter_ref); + return; + } + ipstat.ips_total++; if (m->m_pkthdr.len < sizeof(struct ip)) @@ -406,6 +562,7 @@ ip_input(struct mbuf *m) if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == 0) { ipstat.ips_toosmall++; + lck_mtx_unlock(ip_mutex); return; } ip = mtod(m, struct ip *); @@ -426,6 +583,7 @@ ip_input(struct mbuf *m) if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == 0) { ipstat.ips_badhlen++; + lck_mtx_unlock(ip_mutex); return; } ip = mtod(m, struct ip *); @@ -447,14 +605,15 @@ ip_input(struct mbuf *m) if (ip->ip_ttl != MAXTTL) { ip_linklocal_stat.iplls_in_badttl++; /* Silently drop link local traffic with bad TTL */ - if (ip_linklocal_in_allowbadttl != 0) + if (!ip_linklocal_in_allowbadttl) goto bad; } } if ((IF_HWASSIST_CSUM_FLAGS(m->m_pkthdr.rcvif->if_hwassist) == 0) || (apple_hwcksum_rx == 0) || - ((m->m_pkthdr.csum_flags & CSUM_TCP_SUM16) && ip->ip_p != IPPROTO_TCP)) - 
m->m_pkthdr.csum_flags = 0; /* invalidate HW generated checksum flags */ + ((m->m_pkthdr.csum_flags & CSUM_TCP_SUM16) && ip->ip_p != IPPROTO_TCP)) { + m->m_pkthdr.csum_flags = 0; /* invalidate HW generated checksum flags */ + } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); @@ -526,55 +685,57 @@ iphack: if (fr_checkp) { struct mbuf *m1 = m; - if ((*fr_checkp)(ip, hlen, m->m_pkthdr.rcvif, 0, &m1) || !m1) + if (fr_checkp(ip, hlen, m->m_pkthdr.rcvif, 0, &m1) || !m1) return; ip = mtod(m = m1, struct ip *); } - if (fw_enable && ip_fw_chk_ptr) { + if (fw_enable && IPFW_LOADED) { #if IPFIREWALL_FORWARD /* * If we've been forwarded from the output side, then * skip the firewall a second time */ - if (ip_fw_fwd_addr) + if (args.next_hop) goto ours; #endif /* IPFIREWALL_FORWARD */ - /* - * See the comment in ip_output for the return values - * produced by the firewall. - */ - i = (*ip_fw_chk_ptr)(&ip, - hlen, NULL, &divert_cookie, &m, &rule, &ip_fw_fwd_addr); + + args.m = m; + i = ip_fw_chk_ptr(&args); + m = args.m; + if ( (i & IP_FW_PORT_DENY_FLAG) || m == NULL) { /* drop */ - if (m) - m_freem(m); + if (m) + m_freem(m); + lck_mtx_unlock(ip_mutex); return; - } + } ip = mtod(m, struct ip *); /* just in case m changed */ - if (i == 0 && ip_fw_fwd_addr == NULL) /* common case */ + if (i == 0 && args.next_hop == NULL) /* common case */ goto pass; #if DUMMYNET - if ((i & IP_FW_PORT_DYNT_FLAG) != 0) { - /* send packet to the appropriate pipe */ - dummynet_io(i&0xffff,DN_TO_IP_IN,m,NULL,NULL,0, rule); + if (DUMMYNET_LOADED && (i & IP_FW_PORT_DYNT_FLAG) != 0) { + /* Send packet to the appropriate pipe */ + lck_mtx_unlock(ip_mutex); + ip_dn_io_ptr(m, i&0xffff, DN_TO_IP_IN, &args); return; } -#endif +#endif /* DUMMYNET */ #if IPDIVERT if (i != 0 && (i & IP_FW_PORT_DYNT_FLAG) == 0) { /* Divert or tee packet */ - divert_info = i; + div_info = i; goto ours; } #endif #if IPFIREWALL_FORWARD - if (i == 0 && ip_fw_fwd_addr != NULL) + if (i == 0 && args.next_hop != NULL) goto pass; #endif /* * if we get here, the packet must be dropped */ m_freem(m); + lck_mtx_unlock(ip_mutex); return; } pass: @@ -586,10 +747,8 @@ pass: * to be sent and the original packet to be freed). */ ip_nhops = 0; /* for source routed packets */ - if (hlen > sizeof (struct ip) && ip_dooptions(m)) { -#if IPFIREWALL_FORWARD - ip_fw_fwd_addr = NULL; -#endif + if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, args.next_hop, &ipforward_rt)) { + lck_mtx_unlock(ip_mutex); return; } @@ -616,8 +775,8 @@ pass: * Cache the destination address of the packet; this may be * changed by use of 'ipfw fwd'. */ - pkt_dst = ip_fw_fwd_addr == NULL ? - ip->ip_dst : ip_fw_fwd_addr->sin_addr; + pkt_dst = args.next_hop == NULL ? + ip->ip_dst : args.next_hop->sin_addr; /* * Enable a consistency check between the destination address @@ -635,13 +794,16 @@ pass: */ checkif = ip_checkinterface && (ipforwarding == 0) && ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) && - (ip_fw_fwd_addr == NULL); + (args.next_hop == NULL); + lck_mtx_lock(rt_mtx); TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { #define satosin(sa) ((struct sockaddr_in *)(sa)) - if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) + if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { + lck_mtx_unlock(rt_mtx); goto ours; + } /* * If the address matches, verify that the packet @@ -649,8 +811,10 @@ pass: * enabled. 
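The firewall entry point is now a single-argument call: rather than threading seven by-reference parameters through ip_fw_chk_ptr, ip_input() packs its state into a struct ip_fw_args and reads any mutations back out. Reduced to its essentials (a fragment, not standalone code; the fields are the ones the hunk initializes):

    struct ip_fw_args args;
    int i;

    args.m = m;            /* the packet; the filter may replace the mbuf */
    args.eh = NULL;        /* no ethernet header on this path             */
    args.oif = NULL;       /* NULL oif marks the input direction          */
    args.rule = NULL;      /* non-NULL only on re-entry after dummynet    */
    args.divert_rule = 0;  /* divert cookie, carried both ways            */
    args.next_hop = NULL;  /* set by 'ipfw fwd' matches                   */

    i = ip_fw_chk_ptr(&args);
    m = args.m;            /* pick up a possibly substituted chain        */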
*/ if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst.s_addr && - (!checkif || ia->ia_ifp == m->m_pkthdr.rcvif)) + (!checkif || ia->ia_ifp == m->m_pkthdr.rcvif)) { + lck_mtx_unlock(rt_mtx); goto ours; + } /* * Only accept broadcast packets that arrive via the * matching interface. Reception of forwarded directed @@ -661,12 +825,17 @@ pass: if ((!checkif || ia->ia_ifp == m->m_pkthdr.rcvif) && ia->ia_ifp && ia->ia_ifp->if_flags & IFF_BROADCAST) { if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == - pkt_dst.s_addr) + pkt_dst.s_addr) { + lck_mtx_unlock(rt_mtx); goto ours; - if (ia->ia_netbroadcast.s_addr == pkt_dst.s_addr) + } + if (ia->ia_netbroadcast.s_addr == pkt_dst.s_addr) { + lck_mtx_unlock(rt_mtx); goto ours; + } } } + lck_mtx_unlock(rt_mtx); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct in_multi *inm; if (ip_mrouter) { @@ -678,9 +847,11 @@ pass: * ip_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. */ - if (ip_mforward(ip, m->m_pkthdr.rcvif, m, 0) != 0) { + if (ip_mforward && + ip_mforward(ip, m->m_pkthdr.rcvif, m, 0) != 0) { ipstat.ips_cantforward++; m_freem(m); + lck_mtx_unlock(ip_mutex); return; } @@ -701,6 +872,7 @@ pass: if (inm == NULL) { ipstat.ips_notmember++; m_freem(m); + lck_mtx_unlock(ip_mutex); return; } goto ours; @@ -719,6 +891,7 @@ pass: if (m->m_len < sizeof(struct udpiphdr) && (m = m_pullup(m, sizeof(struct udpiphdr))) == 0) { udpstat.udps_hdrops++; + lck_mtx_unlock(ip_mutex); return; } ui = mtod(m, struct udpiphdr *); @@ -738,20 +911,20 @@ pass: goto ours; } m_freem(m); + lck_mtx_unlock(ip_mutex); return; } #endif + lck_mtx_unlock(ip_mutex); /* * Not for us; forward if possible and desirable. */ if (ipforwarding == 0) { ipstat.ips_cantforward++; m_freem(m); - } else - ip_forward(m, 0); -#if IPFIREWALL_FORWARD - ip_fw_fwd_addr = NULL; -#endif + } else { + ip_forward(m, 0, args.next_hop, &ipforward_rt); + } return; ours: @@ -775,10 +948,34 @@ ours: /* If maxnipq is 0, never accept fragments. */ if (maxnipq == 0) { - ipstat.ips_fragments++; + ipstat.ips_fragments++; ipstat.ips_fragdropped++; goto bad; + } + + /* + * If we will exceed the number of fragments in queues, timeout the + * oldest fragemented packet to make space. + */ + if (currentfrags >= maxfrags) { + fp = TAILQ_LAST(&ipq_list, ipq_list); + ipstat.ips_fragtimeout += fp->ipq_nfrags; + + if (ip->ip_id == fp->ipq_id && + ip->ip_src.s_addr == fp->ipq_src.s_addr && + ip->ip_dst.s_addr == fp->ipq_dst.s_addr && + ip->ip_p == fp->ipq_p) { + /* + * If we match the fragment queue we were going to + * discard, drop this packet too. 
+ */ + ipstat.ips_fragdropped++; + ip_freef(fp); + goto bad; } + + ip_freef(fp); + } sum = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); /* @@ -792,8 +989,6 @@ ours: ip->ip_p == fp->ipq_p) goto found; - fp = 0; - /* * Enforce upper bound on number of fragmented packets * for which we attempt reassembly; @@ -801,23 +996,15 @@ ours: */ if ((nipq > maxnipq) && (maxnipq > 0)) { /* - * drop something from the tail of the current queue - * before proceeding further + * drop the oldest fragment before proceeding further */ - if (ipq[sum].prev == &ipq[sum]) { /* gak */ - for (i = 0; i < IPREASS_NHASH; i++) { - if (ipq[i].prev != &ipq[i]) { - ipstat.ips_fragtimeout += - ipq[i].prev->ipq_nfrags; - ip_freef(ipq[i].prev); - break; - } - } - } else { - ipstat.ips_fragtimeout += ipq[sum].prev->ipq_nfrags; - ip_freef(ipq[sum].prev); - } + fp = TAILQ_LAST(&ipq_list, ipq_list); + ipstat.ips_fragtimeout += fp->ipq_nfrags; + ip_freef(fp); } + + fp = NULL; + found: /* * Adjust ip_len to not reflect header, @@ -827,34 +1014,34 @@ found: if (ip->ip_off & IP_MF) { /* * Make sure that fragments have a data length - * that's a non-zero multiple of 8 bytes. + * that's a non-zero multiple of 8 bytes. */ if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) { ipstat.ips_toosmall++; /* XXX */ goto bad; } m->m_flags |= M_FRAG; - } else + } else { + /* Clear the flag in case packet comes from loopback */ m->m_flags &= ~M_FRAG; + } ip->ip_off <<= 3; /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf, and update - * the divert info in divert_info and args.divert_rule. + * the divert info in div_info and args.divert_rule. */ ipstat.ips_fragments++; m->m_pkthdr.header = ip; #if IPDIVERT m = ip_reass(m, - fp, &ipq[sum], &divert_info, &divert_cookie); + fp, &ipq[sum], &div_info, &args.divert_rule); #else m = ip_reass(m, fp, &ipq[sum]); #endif if (m == 0) { -#if IPFIREWALL_FORWARD - ip_fw_fwd_addr = NULL; -#endif + lck_mtx_unlock(ip_mutex); return; } ipstat.ips_reassembled++; @@ -863,7 +1050,7 @@ found: hlen = IP_VHL_HL(ip->ip_vhl) << 2; #if IPDIVERT /* Restore original checksum before diverting packet */ - if (divert_info != 0) { + if (div_info != 0) { ip->ip_len += hlen; HTONS(ip->ip_len); HTONS(ip->ip_off); @@ -881,14 +1068,14 @@ found: /* * Divert or tee packet to the divert protocol if required. * - * If divert_info is zero then cookie should be too, so we shouldn't + * If div_info is zero then cookie should be too, so we shouldn't * need to clear them here. Assume divert_packet() does so also. */ - if (divert_info != 0) { + if (div_info != 0) { struct mbuf *clone = NULL; /* Clone packet if we're doing a 'tee' */ - if ((divert_info & IP_FW_PORT_TEE_FLAG) != 0) + if ((div_info & IP_FW_PORT_TEE_FLAG) != 0) clone = m_dup(m, M_DONTWAIT); /* Restore packet header fields to original values */ @@ -897,13 +1084,15 @@ found: HTONS(ip->ip_off); /* Deliver packet to divert input routine */ - ip_divert_cookie = divert_cookie; - divert_packet(m, 1, divert_info & 0xffff); ipstat.ips_delivered++; + lck_mtx_unlock(ip_mutex); + divert_packet(m, 1, div_info & 0xffff, args.divert_rule); /* If 'tee', continue with original packet */ - if (clone == NULL) + if (clone == NULL) { return; + } + lck_mtx_lock(ip_mutex); m = clone; ip = mtod(m, struct ip *); } @@ -915,10 +1104,14 @@ found: * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. 
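Fragment accounting now has a global budget: every reassembly queue also sits on the age-ordered ipq_list (new queues go on the head, so the tail is always the oldest), and once currentfrags reaches maxfrags the tail queue is reclaimed. A self-contained user-space model of that eviction policy:

    #include <sys/queue.h>
    #include <stdlib.h>

    struct frag_q {
        TAILQ_ENTRY(frag_q) ipq_list;  /* insert at head: head = newest */
        int ipq_nfrags;
    };
    TAILQ_HEAD(frag_q_head, frag_q);

    static void
    evict_oldest(struct frag_q_head *head, int *currentfrags)
    {
        struct frag_q *fp = TAILQ_LAST(head, frag_q_head);

        if (fp != NULL) {
            *currentfrags -= fp->ipq_nfrags;  /* as ip_freef() does */
            TAILQ_REMOVE(head, fp, ipq_list);
            free(fp);
        }
    }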
*/ - if (ipsec_bypass == 0 && (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR) != 0 && - ipsec4_in_reject(m, NULL)) { - ipsecstat.in_polvio++; - goto bad; + if (ipsec_bypass == 0 && (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR) != 0) { + lck_mtx_lock(sadb_mutex); + if (ipsec4_in_reject(m, NULL)) { + ipsecstat.in_polvio++; + lck_mtx_unlock(sadb_mutex); + goto bad; + } + lck_mtx_unlock(sadb_mutex); } #endif @@ -927,49 +1120,45 @@ found: */ ipstat.ips_delivered++; { - KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, - ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); - - (*ip_protox[ip->ip_p]->pr_input)(m, hlen); -#if IPFIREWALL_FORWARD - ip_fw_fwd_addr = NULL; /* tcp needed it */ -#endif + if (args.next_hop && ip->ip_p == IPPROTO_TCP) { + /* TCP needs IPFORWARD info if available */ + struct m_tag *fwd_tag; + struct ip_fwd_tag *ipfwd_tag; + + fwd_tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, + sizeof(struct sockaddr_in), M_NOWAIT); + if (fwd_tag == NULL) { + goto bad; + } + + ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1); + ipfwd_tag->next_hop = args.next_hop; + + m_tag_prepend(m, fwd_tag); + + KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, + ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); + + lck_mtx_unlock(ip_mutex); + + /* TCP deals with its own locking */ + ip_proto_dispatch_in(m, hlen, ip->ip_p, 0); + } else { + KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, + ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); + + lck_mtx_unlock(ip_mutex); + ip_proto_dispatch_in(m, hlen, ip->ip_p, 0); + } + return; } bad: -#if IPFIREWALL_FORWARD - ip_fw_fwd_addr = NULL; -#endif KERNEL_DEBUG(DBG_LAYER_END, 0,0,0,0,0); + lck_mtx_unlock(ip_mutex); m_freem(m); } -/* - * IP software interrupt routine - to go away sometime soon - */ -void -ipintr(void) -{ - int s; - struct mbuf *m; - - KERNEL_DEBUG(DBG_FNC_IP_INPUT | DBG_FUNC_START, 0,0,0,0,0); - - while(1) { - s = splimp(); - IF_DEQUEUE(&ipintrq, m); - splx(s); - if (m == 0) { - KERNEL_DEBUG(DBG_FNC_IP_INPUT | DBG_FUNC_END, 0,0,0,0,0); - return; - } - - ip_input(m); - } -} - -NETISR_SET(NETISR_IP, ipintr); - /* * Take incoming datagram fragment and try to reassemble it into * whole datagram. 
If a chain for reassembly of this datagram already @@ -1037,6 +1226,7 @@ ip_reass(m, fp, where) #endif fp->ipq_div_cookie = 0; #endif + TAILQ_INSERT_HEAD(&ipq_list, fp, ipq_list); goto inserted; } else { fp->ipq_nfrags++; @@ -1100,6 +1290,7 @@ ip_reass(m, fp, where) } inserted: + currentfrags++; #if IPDIVERT /* @@ -1172,8 +1363,12 @@ inserted: for (q = nq; q != NULL; q = nq) { nq = q->m_nextpkt; q->m_nextpkt = NULL; + if (q->m_pkthdr.csum_flags & CSUM_TCP_SUM16) + m->m_pkthdr.csum_flags = 0; + else { m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags; m->m_pkthdr.csum_data += q->m_pkthdr.csum_data; + } m_cat(m, q); } @@ -1199,6 +1394,8 @@ inserted: ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; remque((void*)fp); + TAILQ_REMOVE(&ipq_list, fp, ipq_list); + currentfrags -= fp->ipq_nfrags; nipq--; (void) m_free(dtom(fp)); m->m_len += (IP_VHL_HL(ip->ip_vhl) << 2); @@ -1234,14 +1431,10 @@ static void ip_freef(fp) struct ipq *fp; { - register struct mbuf *q; - - while (fp->ipq_frags) { - q = fp->ipq_frags; - fp->ipq_frags = q->m_nextpkt; - m_freem(q); - } + currentfrags -= fp->ipq_nfrags; + m_freem_list(fp->ipq_frags); remque((void*)fp); + TAILQ_REMOVE(&ipq_list, fp, ipq_list); (void) m_free(dtom(fp)); nipq--; } @@ -1255,9 +1448,8 @@ void ip_slowtimo() { register struct ipq *fp; - int s = splnet(); int i; - + lck_mtx_lock(ip_mutex); for (i = 0; i < IPREASS_NHASH; i++) { fp = ipq[i].next; if (fp == 0) @@ -1287,7 +1479,7 @@ ip_slowtimo() } } ipflow_slowtimo(); - splx(s); + lck_mtx_unlock(ip_mutex); } /* @@ -1298,12 +1490,14 @@ ip_drain() { int i; + lck_mtx_lock(ip_mutex); for (i = 0; i < IPREASS_NHASH; i++) { while (ipq[i].next != &ipq[i]) { ipstat.ips_fragdropped += ipq[i].next->ipq_nfrags; ip_freef(ipq[i].next); } } + lck_mtx_unlock(ip_mutex); in_rtqdrain(); } @@ -1311,12 +1505,16 @@ ip_drain() * Do option processing on a datagram, * possibly discarding it if bad options are encountered, * or forwarding it if source-routed. + * The pass argument is used when operating in the IPSTEALTH + * mode to tell what options to process: + * [LS]SRR (pass 0) or the others (pass 1). + * The reason for as many as two passes is that when doing IPSTEALTH, + * non-routing options should be processed only if the packet is for us. * Returns 1 if packet has been forwarded/freed, * 0 if the packet should be processed further. 
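Reassembly can preserve hardware-computed partial checksums because the Internet checksum is a ones'-complement sum: per-fragment partials combine by plain addition plus end-around-carry folding, which is what the csum_data accumulation above exploits (CSUM_TCP_SUM16 partials are the exception and are discarded). The folding step, as a small portable function:

    #include <stdint.h>

    /* Fold a 32-bit accumulated sum down to 16 bits, carrying around. */
    static uint16_t
    csum_fold(uint32_t sum)
    {
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)sum;
    }

    /* Combine two fragments' 16-bit partial sums. */
    static uint16_t
    csum_combine(uint16_t a, uint16_t b)
    {
        return csum_fold((uint32_t)a + b);
    }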
*/ static int -ip_dooptions(m) - struct mbuf *m; +ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop, struct route *ipforward_rt) { register struct ip *ip = mtod(m, struct ip *); register u_char *cp; @@ -1387,6 +1585,10 @@ ip_dooptions(m) */ break; } + else { + ifafree(&ia->ia_ifa); + ia = NULL; + } off--; /* 0 origin */ if (off > optlen - (int)sizeof(struct in_addr)) { /* @@ -1400,15 +1602,16 @@ ip_dooptions(m) if (!ip_dosourceroute) { if (ipforwarding) { - char buf[16]; /* aaa.bbb.ccc.ddd\0 */ + char buf[MAX_IPv4_STR_LEN]; + char buf2[MAX_IPv4_STR_LEN]; /* * Acting as a router, so generate ICMP */ nosourcerouting: - strcpy(buf, inet_ntoa(ip->ip_dst)); - log(LOG_WARNING, + log(LOG_WARNING, "attempted source route from %s to %s\n", - inet_ntoa(ip->ip_src), buf); + inet_ntop(AF_INET, &ip->ip_src, buf, sizeof(buf)), + inet_ntop(AF_INET, &ip->ip_dst, buf2, sizeof(buf2))); type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; @@ -1431,10 +1634,12 @@ nosourcerouting: if (opt == IPOPT_SSRR) { #define INA struct in_ifaddr * #define SA struct sockaddr * - if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == 0) - ia = (INA)ifa_ifwithnet((SA)&ipaddr); - } else - ia = ip_rtaddr(ipaddr.sin_addr); + if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == 0) { + ia = (INA)ifa_ifwithnet((SA)&ipaddr); + } + } else { + ia = ip_rtaddr(ipaddr.sin_addr, ipforward_rt); + } if (ia == 0) { type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; @@ -1443,6 +1648,8 @@ nosourcerouting: ip->ip_dst = ipaddr.sin_addr; (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); + ifafree(&ia->ia_ifa); + ia = NULL; cp[IPOPT_OFFSET] += sizeof(struct in_addr); /* * Let ip_intr's mcast routing check handle mcast pkts @@ -1471,14 +1678,17 @@ nosourcerouting: * locate outgoing interface; if we're the destination, * use the incoming interface (should be same). */ - if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == 0 && - (ia = ip_rtaddr(ipaddr.sin_addr)) == 0) { - type = ICMP_UNREACH; - code = ICMP_UNREACH_HOST; - goto bad; + if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) != 0) { + if ((ia = ip_rtaddr(ipaddr.sin_addr, ipforward_rt)) == 0) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_HOST; + goto bad; + } } (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); + ifafree(&ia->ia_ifa); + ia = NULL; cp[IPOPT_OFFSET] += sizeof(struct in_addr); break; @@ -1523,6 +1733,8 @@ nosourcerouting: (void)memcpy(sin, &IA_SIN(ia)->sin_addr, sizeof(struct in_addr)); ipt->ipt_ptr += sizeof(struct in_addr); + ifafree(&ia->ia_ifa); + ia = NULL; break; case IPOPT_TS_PRESPEC: @@ -1534,8 +1746,10 @@ nosourcerouting: } (void)memcpy(&ipaddr.sin_addr, sin, sizeof(struct in_addr)); - if (ifa_ifwithaddr((SA)&ipaddr) == 0) + if ((ia = (struct in_ifaddr*)ifa_ifwithaddr((SA)&ipaddr)) == 0) continue; + ifafree(&ia->ia_ifa); + ia = NULL; ipt->ipt_ptr += sizeof(struct in_addr); break; @@ -1552,13 +1766,15 @@ nosourcerouting: } } if (forward && ipforwarding) { - ip_forward(m, 1); + ip_forward(m, 1, next_hop, ipforward_rt); return (1); } return (0); bad: ip->ip_len -= IP_VHL_HL(ip->ip_vhl) << 2; /* XXX icmp_error adds in hdr length */ + lck_mtx_unlock(ip_mutex); icmp_error(m, type, code, 0, 0); + lck_mtx_lock(ip_mutex); ipstat.ips_badoptions++; return (1); } @@ -1567,29 +1783,37 @@ bad: * Given address of next destination (final or next hop), * return internet address info of interface to be used to get there. 
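The other systematic change in ip_dooptions() is reference discipline: ifaddr lookups now return held references, so every successful lookup is paired with an ifafree() as soon as the address has been copied out, and the pointer is cleared so a dropped reference cannot be reused. Schematically, following the source-route hunk above (kernel-side fragment):

    ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr);      /* returns a held ref */
    if (ia == 0)
        ia = (INA)ifa_ifwithnet((SA)&ipaddr);
    if (ia) {
        (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
            sizeof(struct in_addr));
        ifafree(&ia->ia_ifa);            /* drop it right away */
        ia = NULL;                       /* never reuse a dropped ref */
    }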
*/ -static struct in_ifaddr * -ip_rtaddr(dst) - struct in_addr dst; +struct in_ifaddr * +ip_rtaddr(dst, rt) + struct in_addr dst; + struct route *rt; { register struct sockaddr_in *sin; - sin = (struct sockaddr_in *) &ipforward_rt.ro_dst; + sin = (struct sockaddr_in *)&rt->ro_dst; - if (ipforward_rt.ro_rt == 0 || dst.s_addr != sin->sin_addr.s_addr || - ipforward_rt.ro_rt->generation_id != route_generation) { - if (ipforward_rt.ro_rt) { - rtfree(ipforward_rt.ro_rt); - ipforward_rt.ro_rt = 0; + lck_mtx_lock(rt_mtx); + if (rt->ro_rt == 0 || dst.s_addr != sin->sin_addr.s_addr || + rt->ro_rt->generation_id != route_generation) { + if (rt->ro_rt) { + rtfree_locked(rt->ro_rt); + rt->ro_rt = 0; } sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = dst; - rtalloc_ign(&ipforward_rt, RTF_PRCLONING); + rtalloc_ign_locked(rt, RTF_PRCLONING); } - if (ipforward_rt.ro_rt == 0) + if (rt->ro_rt == 0) { + lck_mtx_unlock(rt_mtx); return ((struct in_ifaddr *)0); - return ((struct in_ifaddr *) ipforward_rt.ro_rt->rt_ifa); + } + + if (rt->ro_rt->rt_ifa) + ifaref(rt->ro_rt->rt_ifa); + lck_mtx_unlock(rt_mtx); + return ((struct in_ifaddr *) rt->ro_rt->rt_ifa); } /* @@ -1735,9 +1959,7 @@ u_char inetctlerrmap[PRC_NCMDS] = { * via a source route. */ static void -ip_forward(m, srcrt) - struct mbuf *m; - int srcrt; +ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop, struct route *ipforward_rt) { register struct ip *ip = mtod(m, struct ip *); register struct sockaddr_in *sin; @@ -1745,21 +1967,28 @@ ip_forward(m, srcrt) int error, type = 0, code = 0; struct mbuf *mcopy; n_long dest; + struct in_addr pkt_dst; struct ifnet *destifp; #if IPSEC struct ifnet dummyifp; #endif dest = 0; + /* + * Cache the destination address of the packet; this may be + * changed by use of 'ipfw fwd'. + */ + pkt_dst = next_hop ? 
next_hop->sin_addr : ip->ip_dst; + #if DIAGNOSTIC if (ipprintfs) printf("forward: src %lx dst %lx ttl %x\n", - (u_long)ip->ip_src.s_addr, (u_long)ip->ip_dst.s_addr, + (u_long)ip->ip_src.s_addr, (u_long)pkt_dst.s_addr, ip->ip_ttl); #endif - if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { + if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(pkt_dst) == 0) { ipstat.ips_cantforward++; m_freem(m); return; @@ -1776,24 +2005,24 @@ ip_forward(m, srcrt) } #endif - sin = (struct sockaddr_in *)&ipforward_rt.ro_dst; - if ((rt = ipforward_rt.ro_rt) == 0 || - ip->ip_dst.s_addr != sin->sin_addr.s_addr || - ipforward_rt.ro_rt->generation_id != route_generation) { - if (ipforward_rt.ro_rt) { - rtfree(ipforward_rt.ro_rt); - ipforward_rt.ro_rt = 0; + sin = (struct sockaddr_in *)&ipforward_rt->ro_dst; + if ((rt = ipforward_rt->ro_rt) == 0 || + pkt_dst.s_addr != sin->sin_addr.s_addr || + ipforward_rt->ro_rt->generation_id != route_generation) { + if (ipforward_rt->ro_rt) { + rtfree(ipforward_rt->ro_rt); + ipforward_rt->ro_rt = 0; } sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); - sin->sin_addr = ip->ip_dst; + sin->sin_addr = pkt_dst; - rtalloc_ign(&ipforward_rt, RTF_PRCLONING); - if (ipforward_rt.ro_rt == 0) { + rtalloc_ign(ipforward_rt, RTF_PRCLONING); + if (ipforward_rt->ro_rt == 0) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0); return; } - rt = ipforward_rt.ro_rt; + rt = ipforward_rt->ro_rt; } /* @@ -1842,7 +2071,7 @@ ip_forward(m, srcrt) if (rt->rt_flags & RTF_GATEWAY) dest = satosin(rt->rt_gateway)->sin_addr.s_addr; else - dest = ip->ip_dst.s_addr; + dest = pkt_dst.s_addr; /* Router requirements says to only send host redirects */ type = ICMP_REDIRECT; code = ICMP_REDIRECT_HOST; @@ -1853,8 +2082,28 @@ ip_forward(m, srcrt) } } - error = ip_output(m, (struct mbuf *)0, &ipforward_rt, + { + if (next_hop) { + /* Pass IPFORWARD info if available */ + struct m_tag *tag; + struct ip_fwd_tag *ipfwd_tag; + + tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, + sizeof(struct sockaddr_in), M_NOWAIT); + if (tag == NULL) { + error = ENOBUFS; + m_freem(m); + return; + } + + ipfwd_tag = (struct ip_fwd_tag *)(tag+1); + ipfwd_tag->next_hop = next_hop; + + m_tag_prepend(m, tag); + } + error = ip_output_list(m, 0, (struct mbuf *)0, ipforward_rt, IP_FORWARDING, 0); + } if (error) ipstat.ips_cantforward++; else { @@ -1863,7 +2112,7 @@ ip_forward(m, srcrt) ipstat.ips_redirectsent++; else { if (mcopy) { - ipflow_create(&ipforward_rt, mcopy); + ipflow_create(ipforward_rt, mcopy); m_freem(mcopy); } return; @@ -1892,8 +2141,8 @@ ip_forward(m, srcrt) type = ICMP_UNREACH; code = ICMP_UNREACH_NEEDFRAG; #ifndef IPSEC - if (ipforward_rt.ro_rt) - destifp = ipforward_rt.ro_rt->rt_ifp; + if (ipforward_rt->ro_rt) + destifp = ipforward_rt->ro_rt->rt_ifp; #else /* * If the packet is routed over IPsec tunnel, tell the @@ -1901,25 +2150,25 @@ ip_forward(m, srcrt) * tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz * XXX quickhack!!! 
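With the ip_fw_fwd_addr global gone, the 'ipfw fwd' next hop travels with the packet as an mbuf tag: allocate a KERNEL_TAG_TYPE_IPFORWARD tag, write the sockaddr into the payload area that follows the tag header, and prepend it to the chain. The allocation pattern, as used both here and in ip_input():

    struct m_tag *tag;
    struct ip_fwd_tag *ipfwd_tag;

    tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD,
        sizeof(struct sockaddr_in), M_NOWAIT);
    if (tag == NULL) {
        error = ENOBUFS;        /* no mbufs: drop, as ip_forward() does */
        m_freem(m);
        return;
    }
    ipfwd_tag = (struct ip_fwd_tag *)(tag + 1);  /* payload follows header */
    ipfwd_tag->next_hop = next_hop;
    m_tag_prepend(m, tag);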
*/ - if (ipforward_rt.ro_rt) { + if (ipforward_rt->ro_rt) { struct secpolicy *sp = NULL; int ipsecerror; int ipsechdr; struct route *ro; if (ipsec_bypass) { - destifp = ipforward_rt.ro_rt->rt_ifp; + destifp = ipforward_rt->ro_rt->rt_ifp; ipstat.ips_cantfrag++; break; } - + lck_mtx_lock(sadb_mutex); sp = ipsec4_getpolicybyaddr(mcopy, IPSEC_DIR_OUTBOUND, IP_FORWARDING, &ipsecerror); if (sp == NULL) - destifp = ipforward_rt.ro_rt->rt_ifp; + destifp = ipforward_rt->ro_rt->rt_ifp; else { /* count IPsec header size */ ipsechdr = ipsec4_hdrsiz(mcopy, @@ -1950,6 +2199,7 @@ ip_forward(m, srcrt) key_freesp(sp); } + lck_mtx_unlock(sadb_mutex); } #endif /*IPSEC*/ ipstat.ips_cantfrag++; @@ -1968,11 +2218,11 @@ ip_forward(m, srcrt) } void -ip_savecontrol(inp, mp, ip, m) - register struct inpcb *inp; - register struct mbuf **mp; - register struct ip *ip; - register struct mbuf *m; +ip_savecontrol( + register struct inpcb *inp, + register struct mbuf **mp, + register struct ip *ip, + register struct mbuf *m) { if (inp->inp_socket->so_options & SO_TIMESTAMP) { struct timeval tv; @@ -2018,6 +2268,7 @@ ip_savecontrol(inp, mp, ip, m) struct sockaddr_dl *sdp; struct sockaddr_dl *sdl2 = &sdlbuf.sdl; + ifnet_head_lock_shared(); if (((ifp = m->m_pkthdr.rcvif)) && ( ifp->if_index && (ifp->if_index <= if_index))) { sdp = (struct sockaddr_dl *)(ifnet_addrs @@ -2038,6 +2289,7 @@ makedummy: sdl2->sdl_index = 0; sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } + ifnet_head_done(); *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len, IP_RECVIF, IPPROTO_IP); if (*mp) diff --git a/bsd/netinet/ip_mroute.c b/bsd/netinet/ip_mroute.c index b522d5f95..584058aa0 100644 --- a/bsd/netinet/ip_mroute.c +++ b/bsd/netinet/ip_mroute.c @@ -71,13 +71,13 @@ #endif #ifndef MROUTING -extern u_long _ip_mcast_src __P((int vifi)); -extern int _ip_mforward __P((struct ip *ip, struct ifnet *ifp, - struct mbuf *m, struct ip_moptions *imo)); -extern int _ip_mrouter_done __P((void)); -extern int _ip_mrouter_get __P((struct socket *so, struct sockopt *sopt)); -extern int _ip_mrouter_set __P((struct socket *so, struct sockopt *sopt)); -extern int _mrt_ioctl __P((int req, caddr_t data, struct proc *p)); +extern u_long _ip_mcast_src(int vifi); +extern int _ip_mforward(struct ip *ip, struct ifnet *ifp, + struct mbuf *m, struct ip_moptions *imo); +extern int _ip_mrouter_done(void); +extern int _ip_mrouter_get(struct socket *so, struct sockopt *sopt); +extern int _ip_mrouter_set(struct socket *so, struct sockopt *sopt); +extern int _mrt_ioctl(int req, caddr_t data, struct proc *p); /* * Dummy routines and globals used when multicast routing is not compiled in. 
@@ -215,7 +215,7 @@ ip_rsvp_force_done(so) struct socket *ip_mrouter = NULL; static struct mrtstat mrtstat; #else /* MROUTE_LKM */ -extern void X_ipip_input __P((struct mbuf *m, int iphlen)); +extern void X_ipip_input(struct mbuf *m, int iphlen); extern struct mrtstat mrtstat; static int ip_mrtproto; #endif @@ -286,13 +286,13 @@ static int have_encap_tunnel = 0; static u_long last_encap_src; static struct vif *last_encap_vif; -static u_long X_ip_mcast_src __P((int vifi)); -static int X_ip_mforward __P((struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo)); -static int X_ip_mrouter_done __P((void)); -static int X_ip_mrouter_get __P((struct socket *so, struct sockopt *m)); -static int X_ip_mrouter_set __P((struct socket *so, struct sockopt *m)); -static int X_legal_vif_num __P((int vif)); -static int X_mrt_ioctl __P((int cmd, caddr_t data)); +static u_long X_ip_mcast_src(int vifi); +static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo); +static int X_ip_mrouter_done(void); +static int X_ip_mrouter_get(struct socket *so, struct sockopt *m); +static int X_ip_mrouter_set(struct socket *so, struct sockopt *m); +static int X_legal_vif_num(int vif); +static int X_mrt_ioctl(int cmd, caddr_t data); static int get_sg_cnt(struct sioc_sg_req *); static int get_vif_cnt(struct sioc_vif_req *); @@ -713,6 +713,8 @@ add_vif(vifcp) ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == 0) return EADDRNOTAVAIL; ifp = ifa->ifa_ifp; + ifafree(ifa); + ifa = NULL; if (vifcp->vifc_flags & VIFF_TUNNEL) { if ((vifcp->vifc_flags & VIFF_SRCRT) == 0) { @@ -1076,14 +1078,17 @@ socket_send(s, mm, src) struct mbuf *mm; struct sockaddr_in *src; { + socket_lock(s, 1); if (s) { if (sbappendaddr(&s->so_rcv, (struct sockaddr *)src, - mm, (struct mbuf *)0) != 0) { + mm, (struct mbuf *)0, NULL) != 0) { sorwakeup(s); + socket_unlock(s, 1); return 0; } } + socket_unlock(s, 1); m_freem(mm); return -1; } @@ -1336,10 +1341,7 @@ expire_upcalls(void *unused) struct mfc *mfc, **nptr; int i; int s; - boolean_t funnel_state; - - funnel_state = thread_funnel_set(network_flock, TRUE); s = splnet(); for (i = 0; i < MFCTBLSIZ; i++) { @@ -1382,7 +1384,6 @@ expire_upcalls(void *unused) } splx(s); timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); - (void) thread_funnel_set(network_flock, FALSE); } /* @@ -1646,8 +1647,6 @@ ipip_input(m, iphlen) struct ifnet *ifp = m->m_pkthdr.rcvif; register struct ip *ip = mtod(m, struct ip *); register int hlen = ip->ip_hl << 2; - register int s; - register struct ifqueue *ifq; register struct vif *vifp; if (!have_encap_tunnel) { @@ -1698,23 +1697,8 @@ ipip_input(m, iphlen) m->m_len -= IP_HDR_LEN; m->m_pkthdr.len -= IP_HDR_LEN; m->m_pkthdr.rcvif = ifp; - - ifq = &ipintrq; - s = splimp(); - if (IF_QFULL(ifq)) { - IF_DROP(ifq); - m_freem(m); - } else { - IF_ENQUEUE(ifq, m); - /* - * normally we would need a "schednetisr(NETISR_IP)" - * here but we were called by ip_input and it is going - * to loop back & try to dequeue the packet we just - * queued as soon as we return so we avoid the - * unnecessary software interrrupt. 
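The ipip_input() hunk above is the template for retiring the software-interrupt queue: where the old code raised splimp, tested IF_QFULL, and enqueued on ipintrq to be dequeued later, the de-encapsulated packet now re-enters the IP input path directly. The replacement is two lines (proto_inject is the KPI the patch uses for this):

    m->m_pkthdr.rcvif = ifp;   /* attribute the packet to the tunnel if */
    proto_inject(PF_INET, m);  /* hand it straight back to IP input     */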
- */ - } - splx(s); + + proto_inject(PF_INET, m); } /* @@ -1852,11 +1836,8 @@ tbf_reprocess_q(xvifp) void *xvifp; { register struct vif *vifp = xvifp; - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); if (ip_mrouter == NULL) { - (void) thread_funnel_set(network_flock, FALSE); return; } @@ -1866,7 +1847,6 @@ tbf_reprocess_q(xvifp) if (vifp->v_tbf->tbf_q_len) timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS); - (void) thread_funnel_set(network_flock, FALSE); } /* function that will selectively discard a member of the queue diff --git a/bsd/netinet/ip_mroute.h b/bsd/netinet/ip_mroute.h index a667f4ea4..c9e42d75d 100644 --- a/bsd/netinet/ip_mroute.h +++ b/bsd/netinet/ip_mroute.h @@ -87,7 +87,9 @@ #define MRT_ASSERT 107 /* enable PIM assert processing */ +#ifdef KERNEL_PRIVATE #define GET_TIME(t) microtime(&t) +#endif KERNEL_PRIVATE /* * Types and macros for handling bitmaps with one bit per virtual interface. @@ -172,11 +174,14 @@ struct sioc_vif_req { u_long ibytes; /* Input byte count on vif */ u_long obytes; /* Output byte count on vif */ }; - +#ifdef PRIVATE /* * The kernel's virtual-interface structure. */ +struct tbf; +struct ifnet; +struct socket; struct vif { u_char v_flags; /* VIFF_ flags defined above */ u_char v_threshold; /* min ttl required to forward on vif*/ @@ -193,6 +198,7 @@ struct vif { u_int v_rsvp_on; /* RSVP listening on this vif */ struct socket *v_rsvpd; /* RSVP daemon socket */ }; +#endif /* * The kernel's multicast forwarding cache entry structure @@ -228,7 +234,9 @@ struct igmpmsg { u_char unused3; struct in_addr im_src, im_dst; }; +#define MFCTBLSIZ 256 +#ifdef KERNEL_PRIVATE /* * Argument structure used for pkt info. while upcall is made */ @@ -242,7 +250,6 @@ struct rtdetq { struct rtdetq *next; /* Next in list of packets */ }; -#define MFCTBLSIZ 256 #if (MFCTBLSIZ & (MFCTBLSIZ - 1)) == 0 /* from sys:route.h */ #define MFCHASHMOD(h) ((h) & (MFCTBLSIZ - 1)) #else @@ -270,21 +277,17 @@ struct tbf struct mbuf *tbf_t; /* tail-insertion pointer */ }; -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE struct sockopt; -extern int (*ip_mrouter_set) __P((struct socket *, struct sockopt *)); -extern int (*ip_mrouter_get) __P((struct socket *, struct sockopt *)); -extern int (*ip_mrouter_done) __P((void)); +extern int (*ip_mrouter_set)(struct socket *, struct sockopt *); +extern int (*ip_mrouter_get)(struct socket *, struct sockopt *); +extern int (*ip_mrouter_done)(void); #if MROUTING -extern int (*mrt_ioctl) __P((int, caddr_t)); +extern int (*mrt_ioctl)(int, caddr_t); #else -extern int (*mrt_ioctl) __P((int, caddr_t, struct proc *)); +extern int (*mrt_ioctl)(int, caddr_t, struct proc *); #endif -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ - -#endif /* _NETINET_IP_MROUTE_H_ */ +#endif KERNEL_PRIVATE +#endif _NETINET_IP_MROUTE_H_ diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index edee063bc..9fd7a09a1 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -65,6 +65,8 @@ #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <kern/locks.h> +#include <sys/sysctl.h> #include <net/if.h> #include <net/route.h> @@ -76,6 +78,8 @@ #include <netinet/in_var.h> #include <netinet/ip_var.h> +#include <netinet/kpi_ipfilter_var.h> + #include "faith.h" #include <net/dlil.h> @@ -87,16 +91,6 @@ #define DBG_FNC_IPSEC4_OUTPUT NETDBG_CODE(DBG_NETIP, (2 << 8) | 1) -#if vax -#include <machine/mtpr.h> -#endif - -#if __FreeBSD__ -#include <machine/in_cksum.h> - -static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); -#endif - #if IPSEC #include <netinet6/ipsec.h> #include <netkey/key.h> @@ -108,6 +102,7 @@ static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); #endif /*IPSEC*/ #include <netinet/ip_fw.h> +#include <netinet/ip_divert.h> #if DUMMYNET #include <netinet/ip_dummynet.h> @@ -120,42 +115,45 @@ static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); (ntohl(a.s_addr))&0xFF); #endif +#if IPSEC +extern lck_mtx_t *sadb_mutex; +#endif + u_short ip_id; -static struct mbuf *ip_insertoptions __P((struct mbuf *, struct mbuf *, int *)); -static struct ifnet *ip_multicast_if __P((struct in_addr *, int *)); -static void ip_mloopback - __P((struct ifnet *, struct mbuf *, struct sockaddr_in *, int)); -static int ip_getmoptions - __P((struct sockopt *, struct ip_moptions *)); -static int ip_pcbopts __P((int, struct mbuf **, struct mbuf *)); -static int ip_setmoptions - __P((struct sockopt *, struct ip_moptions **)); +static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); +static struct ifnet *ip_multicast_if(struct in_addr *, int *); +static void ip_mloopback(struct ifnet *, struct mbuf *, + struct sockaddr_in *, int); +static int ip_getmoptions(struct sockopt *, struct ip_moptions *); +static int ip_pcbopts(int, struct mbuf **, struct mbuf *); +static int ip_setmoptions(struct sockopt *, struct ip_moptions **); int ip_createmoptions(struct ip_moptions **imop); int ip_addmembership(struct ip_moptions *imo, struct ip_mreq *mreq); int ip_dropmembership(struct ip_moptions *imo, struct ip_mreq *mreq); -int ip_optcopy __P((struct ip *, struct ip *)); -extern int (*fr_checkp) __P((struct ip *, int, struct ifnet *, int, struct mbuf **)); +int ip_optcopy(struct ip *, struct ip *); +extern int (*fr_checkp)(struct ip *, int, struct ifnet *, int, struct mbuf **); #ifdef __APPLE__ extern struct mbuf* m_dup(register struct mbuf *m, int how); #endif -static u_long lo_dl_tag = 0; - -void in_delayed_cksum(struct mbuf *m); extern int apple_hwcksum_tx; extern u_long route_generation; extern struct protosw inetsw[]; extern struct ip_linklocal_stat ip_linklocal_stat; +extern lck_mtx_t *ip_mutex; /* temporary: for testing */ #if IPSEC extern int ipsec_bypass; #endif +static int ip_maxchainsent = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, CTLFLAG_RW, + &ip_maxchainsent, 0, "use dlil_output_list"); /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). @@ -163,12 +161,26 @@ extern int ipsec_bypass; * The mbuf opt, if present, will not be freed. 
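The hunk below replaces the body of ip_output() with a thin wrapper around a new ip_output_list() entry point that accepts a chain of packets linked through m_nextpkt. A caller-side sketch, illustrative only (everything other than the ip_output_list/ip_output signatures is an assumption):

	struct mbuf *chain;	/* first packet; chain->m_nextpkt links the rest */
	int error;

	/* route lookup and locking are amortized over the whole chain */
	error = ip_output_list(chain, 1, NULL, &ro, 0, NULL);

	/* the classic single-packet call is now equivalent to */
	error = ip_output_list(m, 0, NULL, &ro, 0, NULL);

The ip_maxchainsent high-water mark declared above is exported read-write; from user space it can be read with sysctlbyname(3), e.g. sysctlbyname("net.inet.ip.maxchainsent", &val, &len, NULL, 0).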
 */
int
-ip_output(m0, opt, ro, flags, imo)
-	struct mbuf *m0;
-	struct mbuf *opt;
-	struct route *ro;
-	int flags;
-	struct ip_moptions *imo;
+ip_output(
+	struct mbuf *m0,
+	struct mbuf *opt,
+	struct route *ro,
+	int flags,
+	struct ip_moptions *imo)
+{
+	int error;
+	error = ip_output_list(m0, 0, opt, ro, flags, imo);
+	return error;
+}
+
+int
+ip_output_list(
+	struct mbuf *m0,
+	int packetchain,
+	struct mbuf *opt,
+	struct route *ro,
+	int flags,
+	struct ip_moptions *imo)
 {
 	struct ip *ip, *mhip;
 	struct ifnet *ifp = NULL;
@@ -178,80 +190,116 @@ ip_output(m0, opt, ro, flags, imo)
 	struct sockaddr_in *dst = NULL;
 	struct in_ifaddr *ia = NULL;
 	int isbroadcast, sw_csum;
+	struct in_addr pkt_dst;
 #if IPSEC
 	struct route iproute;
 	struct socket *so = NULL;
 	struct secpolicy *sp = NULL;
 #endif
-	u_int16_t divert_cookie;		/* firewall cookie */
 #if IPFIREWALL_FORWARD
 	int fwd_rewrite_src = 0;
 #endif
-	struct ip_fw_chain *rule = NULL;
-
-#if IPDIVERT
-	/* Get and reset firewall cookie */
-	divert_cookie = ip_divert_cookie;
-	ip_divert_cookie = 0;
-#else
-	divert_cookie = 0;
-#endif
+	struct ip_fw_args args;
+	int didfilter = 0;
+	ipfilter_t inject_filter_ref = 0;
+	struct m_tag *tag;
+	struct route dn_route;
+	struct mbuf * packetlist;
+	int pktcnt = 0;
+
+	lck_mtx_lock(ip_mutex);
 	KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
-#if IPFIREWALL && DUMMYNET
-	/*
-	 * dummynet packet are prepended a vestigial mbuf with
-	 * m_type = MT_DUMMYNET and m_data pointing to the matching
-	 * rule.
-	 */
-	if (m->m_type == MT_DUMMYNET) {
-	    /*
-	     * the packet was already tagged, so part of the
-	     * processing was already done, and we need to go down.
-	     * Get parameters from the header.
-	     */
-	    rule = (struct ip_fw_chain *)(m->m_data) ;
-	    opt = NULL ;
-	    ro = & ( ((struct dn_pkt *)m)->ro ) ;
-	    imo = NULL ;
-	    dst = ((struct dn_pkt *)m)->dn_dst ;
-	    ifp = ((struct dn_pkt *)m)->ifp ;
-	    flags = ((struct dn_pkt *)m)->flags;
-	    m0 = m = m->m_next ;
-#if IPSEC
-	    if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) {
-		    so = ipsec_getsocket(m);
-		    (void)ipsec_setsocket(m, NULL);
-	    }
+	packetlist = m0;
+	args.eh = NULL;
+	args.rule = NULL;
+	args.next_hop = NULL;
+	args.divert_rule = 0;			/* divert cookie */
+
+	/* Grab info from mtags prepended to the chain */
+#if DUMMYNET
+	if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) {
+		struct dn_pkt_tag *dn_tag;
+
+		dn_tag = (struct dn_pkt_tag *)(tag+1);
+		args.rule = dn_tag->rule;
+		opt = NULL;
+		dn_route = dn_tag->ro;
+		ro = &dn_route;
+
+		imo = NULL;
+		dst = dn_tag->dn_dst;
+		ifp = dn_tag->ifp;
+		flags = dn_tag->flags;
+
+		m_tag_delete(m0, tag);
+	}
+#endif /* DUMMYNET */
+
+	if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) {
+		struct divert_tag *div_tag;
+
+		div_tag = (struct divert_tag *)(tag+1);
+		args.divert_rule = div_tag->cookie;
+
+		m_tag_delete(m0, tag);
+	}
+	if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) {
+		struct ip_fwd_tag *ipfwd_tag;
+
+		ipfwd_tag = (struct ip_fwd_tag *)(tag+1);
+		args.next_hop = ipfwd_tag->next_hop;
+
+		m_tag_delete(m0, tag);
+	}
+
+	m = m0;
+
+#if DIAGNOSTIC
+	if ( !m || (m->m_flags & M_PKTHDR) == 0)
+		panic("ip_output no HDR");
+	if (!ro)
+		panic("ip_output no route, proto = %d",
+		mtod(m, struct ip *)->ip_p);
 #endif
+
+	if (args.rule != NULL) {	/* dummynet already saw us */
 	    ip = mtod(m, struct ip *);
 	    hlen = IP_VHL_HL(ip->ip_vhl) << 2 ;
 	    if (ro->ro_rt != NULL)
 		ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa;
-	    goto sendit;
-	} else
-	    rule = NULL ;
+	    if (ia)
+		ifaref(&ia->ia_ifa);
+#if IPSEC
+	    if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) {
+		    so = ipsec_getsocket(m);
+		    (void)ipsec_setsocket(m, NULL);
+	    }
 #endif
+	    goto sendit;
+	}
+
 #if IPSEC
 	if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) {
 		so = ipsec_getsocket(m);
 		(void)ipsec_setsocket(m, NULL);
 	}
 #endif
+loopit:
+	/*
+	 * No need to process packet twice if we've
+	 * already seen it
+	 */
+	inject_filter_ref = ipf_get_inject_filter(m);
-#if DIAGNOSTIC
-	if ((m->m_flags & M_PKTHDR) == 0)
-		panic("ip_output no HDR");
-	if (!ro)
-		panic("ip_output no route, proto = %d",
-		      mtod(m, struct ip *)->ip_p);
-#endif
 	if (opt) {
 		m = ip_insertoptions(m, opt, &len);
 		hlen = len;
 	}
 	ip = mtod(m, struct ip *);
+	pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst;
+
 	/*
 	 * Fill in IP header.
 	 */
@@ -281,15 +329,17 @@ ip_output(m0, opt, ro, flags, imo)
 	 * cache with IPv6.
 	 */
 
-	if (ro->ro_rt && (ro->ro_rt->generation_id != route_generation) &&
-		((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0) && (ip->ip_src.s_addr != INADDR_ANY) &&
-		(ifa_foraddr(ip->ip_src.s_addr) == NULL)) {
-		error = EADDRNOTAVAIL;
-		goto bad;
+	{
+		if (ro->ro_rt && (ro->ro_rt->generation_id != route_generation) &&
+			((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0) && (ip->ip_src.s_addr != INADDR_ANY) &&
+			(ifa_foraddr(ip->ip_src.s_addr) == 0)) {
+			error = EADDRNOTAVAIL;
+			goto bad;
+		}
 	}
 	if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
 	   dst->sin_family != AF_INET ||
-	   dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
+	   dst->sin_addr.s_addr != pkt_dst.s_addr)) {
 		rtfree(ro->ro_rt);
 		ro->ro_rt = (struct rtentry *)0;
 	}
@@ -297,7 +347,7 @@ ip_output(m0, opt, ro, flags, imo)
 		bzero(dst, sizeof(*dst));
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
-		dst->sin_addr = ip->ip_dst;
+		dst->sin_addr = pkt_dst;
 	}
 	/*
 	 * If routing to interface only,
@@ -306,11 +356,14 @@ ip_output(m0, opt, ro, flags, imo)
 #define ifatoia(ifa)	((struct in_ifaddr *)(ifa))
 #define sintosa(sin)	((struct sockaddr *)(sin))
 	if (flags & IP_ROUTETOIF) {
-		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 &&
-		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) {
-			ipstat.ips_noroute++;
-			error = ENETUNREACH;
-			goto bad;
+		if (ia)
+			ifafree(&ia->ia_ifa);
+		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0) {
+			if ((ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) {
+				ipstat.ips_noroute++;
+				error = ENETUNREACH;
+				goto bad;
+			}
 		}
 		ifp = ia->ia_ifp;
 		ip->ip_ttl = 1;
@@ -332,7 +385,11 @@ ip_output(m0, opt, ro, flags, imo)
 			error = EHOSTUNREACH;
 			goto bad;
 		}
+		if (ia)
+			ifafree(&ia->ia_ifa);
 		ia = ifatoia(ro->ro_rt->rt_ifa);
+		if (ia)
+			ifaref(&ia->ia_ifa);
 		ifp = ro->ro_rt->rt_ifp;
 		ro->ro_rt->rt_use++;
 		if (ro->ro_rt->rt_flags & RTF_GATEWAY)
@@ -342,7 +399,7 @@ ip_output(m0, opt, ro, flags, imo)
 		else
 			isbroadcast = in_broadcast(dst->sin_addr, ifp);
 	}
-	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
+	if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
 		struct in_multi *inm;
 
 		m->m_flags |= M_MCAST;
@@ -395,7 +452,9 @@ ip_output(m0, opt, ro, flags, imo)
 			}
 		}
 
-		IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm);
+		ifnet_lock_shared(ifp);
+		IN_LOOKUP_MULTI(pkt_dst, ifp, inm);
+		ifnet_lock_done(ifp);
 		if (inm != NULL &&
 		   (imo == NULL || imo->imo_multicast_loop)) {
 			/*
@@ -403,6 +462,42 @@ ip_output(m0, opt, ro, flags, imo)
 			 * on the outgoing interface, and the caller did not
 			 * forbid loopback, loop back a copy.
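The filter walks added in the next hunk (and again at the injectit: label and after IPsec processing below) share a small idiom for packets that were re-injected by a filter: stay quiet until the walk passes the injecting filter, so neither it nor its predecessors sees the packet a second time. A condensed restatement of the patch's logic, not the literal code:

	struct ipfilter *filter;
	int seen = (inject_filter_ref == 0);	/* no inject tag: run every filter */

	TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
		if (seen == 0) {
			/* skip filters up to and including the injector */
			if ((struct ipfilter *)inject_filter_ref == filter)
				seen = 1;
		} else if (filter->ipf_filter.ipf_output) {
			errno_t result = filter->ipf_filter.ipf_output(
			    filter->ipf_filter.cookie, (mbuf_t *)&m, ippo);
			if (result == EJUSTRETURN)	/* filter swallowed the packet */
				goto done;
			if (result != 0)		/* any other error: drop it */
				goto bad;
		}
	}

Note that ip_mutex is dropped around the walk and the list is pinned with ipf_ref()/ipf_unref() instead, which is what makes the deferred-removal machinery in kpi_ipfilter.c necessary.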
*/ + if (!TAILQ_EMPTY(&ipv4_filters)) { + struct ipfilter *filter; + int seen = (inject_filter_ref == 0); + struct ipf_pktopts *ippo = 0, ipf_pktopts; + + if (imo) { + ippo = &ipf_pktopts; + ipf_pktopts.ippo_mcast_ifnet = imo->imo_multicast_ifp; + ipf_pktopts.ippo_mcast_ttl = imo->imo_multicast_ttl; + ipf_pktopts.ippo_mcast_loop = imo->imo_multicast_loop; + } + + lck_mtx_unlock(ip_mutex); + ipf_ref(); + TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { + if (seen == 0) { + if ((struct ipfilter *)inject_filter_ref == filter) + seen = 1; + } else if (filter->ipf_filter.ipf_output) { + errno_t result; + result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); + if (result == EJUSTRETURN) { + ipf_unref(); + goto done; + } + if (result != 0) { + ipf_unref(); + lck_mtx_lock(ip_mutex); + goto bad; + } + } + } + lck_mtx_lock(ip_mutex); + ipf_unref(); + didfilter = 1; + } ip_mloopback(ifp, m, dst, hlen); } else { @@ -429,6 +524,7 @@ ip_output(m0, opt, ro, flags, imo) imo = NULL; if (ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); + lck_mtx_unlock(ip_mutex); goto done; } } @@ -444,6 +540,7 @@ ip_output(m0, opt, ro, flags, imo) */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); + lck_mtx_unlock(ip_mutex); goto done; } @@ -466,15 +563,6 @@ ip_output(m0, opt, ro, flags, imo) #endif /* IPFIREWALL_FORWARD */ } #endif /* notdef */ - /* - * Verify that we have any chance at all of being able to queue - * the packet or packet fragments - */ - if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= - ifp->if_snd.ifq_maxlen) { - error = ENOBUFS; - goto bad; - } /* * Look for broadcast address and @@ -512,6 +600,35 @@ sendit: } } +injectit: + if (!didfilter && !TAILQ_EMPTY(&ipv4_filters)) { + struct ipfilter *filter; + int seen = (inject_filter_ref == 0); + + lck_mtx_unlock(ip_mutex); + ipf_ref(); + TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { + if (seen == 0) { + if ((struct ipfilter *)inject_filter_ref == filter) + seen = 1; + } else if (filter->ipf_filter.ipf_output) { + errno_t result; + result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, 0); + if (result == EJUSTRETURN) { + ipf_unref(); + goto done; + } + if (result != 0) { + ipf_unref(); + lck_mtx_lock(ip_mutex); + goto bad; + } + } + } + ipf_unref(); + lck_mtx_lock(ip_mutex); + } + #if IPSEC /* temporary for testing only: bypass ipsec alltogether */ @@ -520,6 +637,8 @@ sendit: KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); + lck_mtx_lock(sadb_mutex); + /* get SP for this packet */ if (so == NULL) sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error); @@ -529,6 +648,7 @@ sendit: if (sp == NULL) { ipsecstat.out_inval++; KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); + lck_mtx_unlock(sadb_mutex); goto bad; } @@ -542,12 +662,14 @@ sendit: */ ipsecstat.out_polvio++; KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1,0,0,0,0); + lck_mtx_unlock(sadb_mutex); goto bad; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: /* no need to do IPsec. 
*/ KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 2,0,0,0,0); + lck_mtx_unlock(sadb_mutex); goto skip_ipsec; case IPSEC_POLICY_IPSEC: @@ -555,6 +677,7 @@ sendit: /* acquire a policy */ error = key_spdacquire(sp); KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 3,0,0,0,0); + lck_mtx_unlock(sadb_mutex); goto bad; } break; @@ -588,8 +711,11 @@ sendit: HTONS(ip->ip_len); HTONS(ip->ip_off); + lck_mtx_unlock(ip_mutex); error = ipsec4_output(&state, sp, flags); - + lck_mtx_unlock(sadb_mutex); + lck_mtx_lock(ip_mutex); + m0 = m = state.m; if (flags & IP_ROUTETOIF) { @@ -639,7 +765,7 @@ sendit: /* Check that there wasn't a route change and src is still valid */ if (ro->ro_rt->generation_id != route_generation) { - if (ifa_foraddr(ip->ip_src.s_addr) == NULL && ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0)) { + if (ifa_foraddr(ip->ip_src.s_addr) == 0 && ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0)) { error = EADDRNOTAVAIL; KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 5,0,0,0,0); goto bad; @@ -657,7 +783,11 @@ sendit: goto bad; } } else { + if (ia) + ifafree(&ia->ia_ifa); ia = ifatoia(ro->ro_rt->rt_ifa); + if (ia) + ifaref(&ia->ia_ifa); ifp = ro->ro_rt->rt_ifp; } @@ -665,6 +795,31 @@ sendit: NTOHS(ip->ip_len); NTOHS(ip->ip_off); KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 7,0xff,0xff,0xff,0xff); + + /* Pass to filters again */ + if (!TAILQ_EMPTY(&ipv4_filters)) { + struct ipfilter *filter; + + lck_mtx_unlock(ip_mutex); + ipf_ref(); + TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { + if (filter->ipf_filter.ipf_output) { + errno_t result; + result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, 0); + if (result == EJUSTRETURN) { + ipf_unref(); + goto done; + } + if (result != 0) { + ipf_unref(); + lck_mtx_lock(ip_mutex); + goto bad; + } + } + } + ipf_unref(); + lck_mtx_lock(ip_mutex); + } skip_ipsec: #endif /*IPSEC*/ @@ -678,19 +833,27 @@ skip_ipsec: if (fr_checkp) { struct mbuf *m1 = m; - if ((error = (*fr_checkp)(ip, hlen, ifp, 1, &m1)) || !m1) + if ((error = (*fr_checkp)(ip, hlen, ifp, 1, &m1)) || !m1) { + lck_mtx_unlock(ip_mutex); goto done; + } ip = mtod(m0 = m = m1, struct ip *); } /* * Check with the firewall... + * but not if we are already being fwd'd from a firewall. */ - if (fw_enable && ip_fw_chk_ptr) { + if (fw_enable && IPFW_LOADED && !args.next_hop) { struct sockaddr_in *old = dst; - off = (*ip_fw_chk_ptr)(&ip, - hlen, ifp, &divert_cookie, &m, &rule, &dst); + args.m = m; + args.next_hop = dst; + args.oif = ifp; + off = ip_fw_chk_ptr(&args); + m = args.m; + dst = args.next_hop; + /* * On return we must do the following: * IP_FW_PORT_DENY_FLAG -> drop the pkt (XXX new) @@ -710,13 +873,14 @@ skip_ipsec: if (m) m_freem(m); error = EACCES ; + lck_mtx_unlock(ip_mutex); goto done ; } ip = mtod(m, struct ip *); if (off == 0 && dst == old) /* common case */ goto pass ; #if DUMMYNET - if ((off & IP_FW_PORT_DYNT_FLAG) != 0) { + if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) { /* * pass the pkt to dummynet. Need to include * pipe number, m, ifp, ro, dst because these are @@ -726,11 +890,16 @@ skip_ipsec: * XXX note: if the ifp or ro entry are deleted * while a pkt is in dummynet, we are in trouble! 
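Where the old code carried dummynet state by prepending a vestigial MT_DUMMYNET mbuf, the reworked path (here and at the top of ip_output_list) moves that state into an m_tag that travels with the packet. A condensed sketch of the round trip, using the tag layout from this patch (the producer side is illustrative):

	/* producer: attach the state before queueing the packet */
	struct m_tag *tag = m_tag_alloc(KERNEL_MODULE_TAG_ID,
	    KERNEL_TAG_TYPE_DUMMYNET, sizeof(struct dn_pkt_tag), M_NOWAIT);
	if (tag != NULL) {
		struct dn_pkt_tag *dn_tag = (struct dn_pkt_tag *)(tag + 1);
		dn_tag->rule = rule;		/* the matching ipfw rule */
		m_tag_prepend(m, tag);
	}

	/* consumer (top of ip_output_list): recover and strip the state */
	tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
	    KERNEL_TAG_TYPE_DUMMYNET, NULL);
	if (tag != NULL) {
		args.rule = ((struct dn_pkt_tag *)(tag + 1))->rule;
		m_tag_delete(m0, tag);
	}

The same pattern carries the divert cookie (KERNEL_TAG_TYPE_DIVERT) and the ipfw forward next hop (KERNEL_TAG_TYPE_IPFORWARD).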
*/ - error = dummynet_io(off & 0xffff, DN_TO_IP_OUT, m, - ifp,ro,dst,rule, flags); + args.ro = ro; + args.dst = dst; + args.flags = flags; + + lck_mtx_unlock(ip_mutex); + error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT, + &args); goto done; } -#endif +#endif /* DUMMYNET */ #if IPDIVERT if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) { struct mbuf *clone = NULL; @@ -753,8 +922,7 @@ skip_ipsec: HTONS(ip->ip_off); /* Deliver packet to divert input routine */ - ip_divert_cookie = divert_cookie; - divert_packet(m, 0, off & 0xffff); + divert_packet(m, 0, off & 0xffff, args.divert_rule); /* If 'tee', continue with original packet */ if (clone != NULL) { @@ -762,6 +930,7 @@ skip_ipsec: ip = mtod(m, struct ip *); goto pass; } + lck_mtx_unlock(ip_mutex); goto done; } #endif @@ -777,7 +946,7 @@ skip_ipsec: * And I'm babbling. */ if (off == 0 && old != dst) { - struct in_ifaddr *ia; + struct in_ifaddr *ia_fw; /* It's changed... */ /* There must be a better way to do this next line... */ @@ -800,31 +969,65 @@ skip_ipsec: * as the packet runs through ip_input() as * it is done through a ISR. */ - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + TAILQ_FOREACH(ia_fw, &in_ifaddrhead, ia_link) { /* * If the addr to forward to is one * of ours, we pretend to * be the destination for this packet. */ - if (IA_SIN(ia)->sin_addr.s_addr == + if (IA_SIN(ia_fw)->sin_addr.s_addr == dst->sin_addr.s_addr) break; } if (ia) { /* tell ip_input "dont filter" */ - ip_fw_fwd_addr = dst; + struct m_tag *fwd_tag; + struct ip_fwd_tag *ipfwd_tag; + + fwd_tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, + sizeof(struct sockaddr_in), M_NOWAIT); + if (fwd_tag == NULL) { + error = ENOBUFS; + goto bad; + } + + ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1); + ipfwd_tag->next_hop = args.next_hop; + + m_tag_prepend(m, fwd_tag); + if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = ifunit("lo0"); - if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + if ((~IF_HWASSIST_CSUM_FLAGS(m->m_pkthdr.rcvif->if_hwassist) & + m->m_pkthdr.csum_flags) == 0) { + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + m->m_pkthdr.csum_flags |= + CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + m->m_pkthdr.csum_data = 0xffff; + } m->m_pkthdr.csum_flags |= - CSUM_DATA_VALID | CSUM_PSEUDO_HDR; - m->m_pkthdr.csum_data = 0xffff; + CSUM_IP_CHECKED | CSUM_IP_VALID; + } + else if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + ip->ip_sum = in_cksum(m, hlen); } - m->m_pkthdr.csum_flags |= - CSUM_IP_CHECKED | CSUM_IP_VALID; HTONS(ip->ip_len); HTONS(ip->ip_off); - ip_input(m); + + lck_mtx_unlock(ip_mutex); + + /* we need to call dlil_output to run filters + * and resync to avoid recursion loops. + */ + if (lo_ifp) { + dlil_output(lo_ifp, PF_INET, m, 0, (struct sockaddr *)dst, 0); + } + else { + printf("ip_output: no loopback ifp for forwarding!!!\n"); + } goto done; } /* Some of the logic for this was @@ -844,7 +1047,7 @@ skip_ipsec: goto bad; } - ia = ifatoia(ro_fwd->ro_rt->rt_ifa); + ia_fw = ifatoia(ro_fwd->ro_rt->rt_ifa); ifp = ro_fwd->ro_rt->rt_ifp; ro_fwd->ro_rt->rt_use++; if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) @@ -864,7 +1067,7 @@ skip_ipsec: * interface, do it again, from the new one. 
 */
		if (fwd_rewrite_src)
-			ip->ip_src = IA_SIN(ia)->sin_addr;
+			ip->ip_src = IA_SIN(ia_fw)->sin_addr;
 		goto pass ;
 	}
 #endif /* IPFIREWALL_FORWARD */
@@ -873,8 +1076,9 @@ skip_ipsec:
 	 * we have to drop the pkt
 	 */
 	m_freem(m);
-	error = EACCES; /* not sure this is the right error msg */
-	goto done;
+	error = EACCES;	/* not sure this is the right error msg */
+	lck_mtx_unlock(ip_mutex);
+	goto done;
 	}
 
 pass:
@@ -885,13 +1089,15 @@ pass:
 	    (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
 		ipstat.ips_badaddr++;
 		m_freem(m);
-		/*
-		 * Simply drop the packet just like a firewall -- we do not want the
-		 * the application to feel the pain, not yet...
-		 * Returning ENETUNREACH like ip6_output does in some similar cases
-		 * could startle the otherwise clueless process that specifies
+		/*
+		 * Do not simply drop the packet just like a firewall -- we want
+		 * the application to feel the pain.
+		 * Return ENETUNREACH like ip6_output does in some similar cases.
+		 * This can startle the otherwise clueless process that specifies
 		 * loopback as the source address.
 		 */
+		error = ENETUNREACH;
+		lck_mtx_unlock(ip_mutex);
 		goto done;
 	}
 #endif
@@ -955,14 +1161,29 @@ pass:
 	if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0)
 		ipsec_delaux(m);
 #endif
-#if __APPLE__
-	error = dlil_output(ifptodlt(ifp, PF_INET), m, (void *) ro->ro_rt,
+	if (packetchain == 0) {
+		lck_mtx_unlock(ip_mutex);
+		error = dlil_output(ifp, PF_INET, m, (void *) ro->ro_rt,
 			    (struct sockaddr *)dst, 0);
-#else
-	error = (*ifp->if_output)(ifp, m,
-				(struct sockaddr *)dst, ro->ro_rt);
-#endif
-	goto done;
+		goto done;
+	}
+	else { /* packet chaining allows us to reuse the route for all packets */
+		m = m->m_nextpkt;
+		if (m == NULL) {
+			if (pktcnt > ip_maxchainsent)
+				ip_maxchainsent = pktcnt;
+			//send
+			lck_mtx_unlock(ip_mutex);
+			error = dlil_output_list(ifp, PF_INET, packetlist, (void *) ro->ro_rt,
+			    (struct sockaddr *)dst, 0);
+			pktcnt = 0;
+			goto done;
+
+		}
+		m0 = m;
+		pktcnt++;
+		goto loopit;
+	}
 }
 /*
  * Too large for interface; fragment if possible.
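Before the fragmentation path that the next hunks touch, a transport checksum deferred to hardware must be finalized in software whenever the interface cannot checksum individual fragments; the patch also generalizes the finalization with offset-taking variants for packets whose IP header does not start at m_data. The callers' view, as a sketch (the 14-byte link-layer offset is an assumption for the example):

	in_delayed_cksum(m);			/* IP header at m_data, as before */
	in_delayed_cksum_offset(m, 14);		/* a link-layer header still in front */
	in_cksum_offset(m, 14);			/* recompute ip_sum at that offset */

Both *_offset variants are defined in the hunks that follow.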
@@ -998,8 +1219,10 @@ pass:
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
 	    (ifp->if_hwassist & CSUM_IP_FRAGS) == 0) {
 		in_delayed_cksum(m);
-		if (m == NULL)
+		if (m == NULL) {
+			lck_mtx_unlock(ip_mutex);
 			return(ENOMEM);
+		}
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
@@ -1047,8 +1270,9 @@ pass:
 			goto sendorfree;
 		}
 		m->m_pkthdr.len = mhlen + len;
-		m->m_pkthdr.rcvif = (struct ifnet *)0;
+		m->m_pkthdr.rcvif = 0;
 		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
+		m->m_pkthdr.socket_id = m0->m_pkthdr.socket_id;
 		HTONS(mhip->ip_off);
 		mhip->ip_sum = 0;
 		if (sw_csum & CSUM_DELAY_IP) {
@@ -1084,6 +1308,7 @@ sendorfree:
 	KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
 		     ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
+	lck_mtx_unlock(ip_mutex);
 	for (m = m0; m; m = m0) {
 		m0 = m->m_nextpkt;
 		m->m_nextpkt = 0;
@@ -1100,14 +1325,10 @@ sendorfree:
 				ia->ia_ifa.if_obytes += m->m_pkthdr.len;
 			}
 #endif
-
-#if __APPLE__
-			error = dlil_output(ifptodlt(ifp, PF_INET), m, (void *) ro->ro_rt,
+			if ((packetchain != 0)  && (pktcnt > 0))
+				panic("ip_output: mix of packet in packetlist is wrong=%x", packetlist);
+			error = dlil_output(ifp, PF_INET, m, (void *) ro->ro_rt,
 					(struct sockaddr *)dst, 0);
-#else
-			error = (*ifp->if_output)(ifp, m,
-			    (struct sockaddr *)dst, ro->ro_rt);
-#endif
 		} else
 			m_freem(m);
 	}
@@ -1116,6 +1337,10 @@
 		ipstat.ips_fragmented++;
 	}
done:
+	if (ia) {
+		ifafree(&ia->ia_ifa);
+		ia = NULL;
+	}
 #if IPSEC
 	if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) {
 		if (ro == &iproute && ro->ro_rt) {
@@ -1125,7 +1350,9 @@ done:
 		if (sp != NULL) {
 			KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 				printf("DP ip_output call free SP:%x\n", sp));
+			lck_mtx_lock(sadb_mutex);
 			key_freesp(sp);
+			lck_mtx_unlock(sadb_mutex);
 		}
 	}
 #endif /* IPSEC */
@@ -1134,36 +1361,128 @@ done:
 	return (error);
bad:
 	m_freem(m0);
+	lck_mtx_unlock(ip_mutex);
 	goto done;
 }
 
 void
-in_delayed_cksum(struct mbuf *m)
+in_delayed_cksum_offset(struct mbuf *m, int ip_offset)
 {
 	struct ip *ip;
 	u_short csum, offset;
-	ip = mtod(m, struct ip *);
+
+	while (ip_offset > m->m_len) {
+		ip_offset -= m->m_len;
+		m = m->m_next;
+		if (m == NULL) {
+			printf("in_delayed_cksum_withoffset failed - ip_offset wasn't in the packet\n");
+			return;
+		}
+	}
+
+	if (ip_offset + sizeof(struct ip) > m->m_len) {
+		printf("delayed m_pullup, m->len: %d off: %d\n",
+			m->m_len, ip_offset);
+		/*
+		 * XXX
+		 * this shouldn't happen
+		 */
+		m = m_pullup(m, ip_offset + sizeof(struct ip));
+	}
+
+	/* Gross */
+	if (ip_offset) {
+		m->m_len -= ip_offset;
+		m->m_data += ip_offset;
+	}
+
+	ip = mtod(m, struct ip*);
 	offset = IP_VHL_HL(ip->ip_vhl) << 2 ;
 	csum = in_cksum_skip(m, ip->ip_len, offset);
 	if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
 		csum = 0xffff;
 	offset += m->m_pkthdr.csum_data & 0xFFFF;        /* checksum offset */
+
+	/* Gross */
+	if (ip_offset) {
+		if (M_LEADINGSPACE(m) < ip_offset)
+			panic("in_delayed_cksum_withoffset - chain modified!\n");
+		m->m_len += ip_offset;
+		m->m_data -= ip_offset;
+	}
 
 	if (offset > ip->ip_len) /* bogus offset */
 		return;
-	if (offset + sizeof(u_short) > m->m_len) {
+	if (offset + ip_offset + sizeof(u_short) > m->m_len) {
 		printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
-		    m->m_len, offset, ip->ip_p);
+		    m->m_len, offset + ip_offset, ip->ip_p);
 		/*
 		 * XXX
 		 * this shouldn't happen, but if it does, the
 		 * correct behavior may be to insert the checksum
 		 * in the existing chain instead of rearranging it.
-		m = m_pullup(m, offset + sizeof(u_short));
+		m = m_pullup(m, offset + ip_offset + sizeof(u_short));
+	}
+	*(u_short *)(m->m_data + offset + ip_offset) = csum;
+}
+
+void
+in_delayed_cksum(struct mbuf *m)
+{
+	in_delayed_cksum_offset(m, 0);
+}
+
+void
+in_cksum_offset(struct mbuf* m, size_t ip_offset)
+{
+	struct ip* ip = NULL;
+	int hlen = 0;
+
+	while (ip_offset > m->m_len) {
+		ip_offset -= m->m_len;
+		m = m->m_next;
+		if (m == NULL) {
+			printf("in_cksum_offset failed - ip_offset wasn't in the packet\n");
+			return;
+		}
+	}
+
+	if (ip_offset + sizeof(struct ip) > m->m_len) {
+		printf("in_cksum_offset - delayed m_pullup, m->len: %d off: %d\n",
+			m->m_len, ip_offset);
+		/*
+		 * XXX
+		 * this shouldn't happen
+		 */
+		m = m_pullup(m, ip_offset + sizeof(struct ip));
+	}
+
+	/* Gross */
+	if (ip_offset) {
+		m->m_len -= ip_offset;
+		m->m_data += ip_offset;
+	}
+
+	ip = mtod(m, struct ip*);
+
+#ifdef _IP_VHL
+	hlen = IP_VHL_HL(ip->ip_vhl) << 2;
+#else
+	hlen = ip->ip_hl << 2;
+#endif
+
+	ip->ip_sum = 0;
+	ip->ip_sum = in_cksum(m, hlen);
+
+	/* Gross */
+	if (ip_offset) {
+		if (M_LEADINGSPACE(m) < ip_offset)
+			panic("in_cksum_offset - chain modified!\n");
+		m->m_len += ip_offset;
+		m->m_data -= ip_offset;
 	}
-	*(u_short *)(m->m_data + offset) = csum;
 }
 
 /*
@@ -1193,7 +1512,7 @@ ip_insertoptions(m, opt, phlen)
 		MGETHDR(n, M_DONTWAIT, MT_HEADER);
 		if (n == 0)
 			return (m);
-		n->m_pkthdr.rcvif = (struct ifnet *)0;
+		n->m_pkthdr.rcvif = 0;
 		n->m_pkthdr.len = m->m_pkthdr.len + optlen;
 		m->m_len -= sizeof(struct ip);
 		m->m_data += sizeof(struct ip);
@@ -1410,24 +1729,24 @@ ip_ctloutput(so, sopt)
 		struct mbuf *m;
 		int optname;
 
-		if (sopt->sopt_valsize > MCLBYTES) {
-			error = EMSGSIZE;
-			break;
-		}
-
+		if (sopt->sopt_valsize > MCLBYTES) {
+			error = EMSGSIZE;
+			break;
+		}
 		if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
 			break;
 		if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
 			break;
 		priv = (sopt->sopt_p != NULL &&
-			suser(sopt->sopt_p->p_ucred,
-			      &sopt->sopt_p->p_acflag) != 0) ? 0 : 1;
+			proc_suser(sopt->sopt_p) != 0) ? 0 : 1;
 		if (m) {
 			req = mtod(m, caddr_t);
 			len = m->m_len;
 		}
 		optname = sopt->sopt_name;
+		lck_mtx_lock(sadb_mutex);
 		error = ipsec4_set_policy(inp, optname, req, len, priv);
+		lck_mtx_unlock(sadb_mutex);
 		m_freem(m);
 		break;
 	}
@@ -1533,7 +1852,9 @@ ip_ctloutput(so, sopt)
 			req = mtod(m, caddr_t);
 			len = m->m_len;
 		}
+		lck_mtx_lock(sadb_mutex);
 		error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
+		lck_mtx_unlock(sadb_mutex);
 		if (error == 0)
 			error = soopt_mcopyout(sopt, m); /* XXX */
 		if (error == 0)
@@ -1679,9 +2000,13 @@ ip_multicast_if(a, ifindexp)
 		*ifindexp = 0;
 	if (ntohl(a->s_addr) >> 24 == 0) {
 		ifindex = ntohl(a->s_addr) & 0xffffff;
-		if (ifindex < 0 || if_index < ifindex)
+		ifnet_head_lock_shared();
+		if (ifindex < 0 || if_index < ifindex) {
+			ifnet_head_done();
 			return NULL;
+		}
 		ifp = ifindex2ifnet[ifindex];
+		ifnet_head_done();
 		if (ifindexp)
 			*ifindexp = ifindex;
 	} else {
@@ -1705,7 +2030,6 @@ ip_setmoptions(sopt, imop)
 	struct ifnet *ifp = NULL;
 	struct ip_moptions *imo = *imop;
 	int ifindex;
-	int s;
 
 	if (imo == NULL) {
 		/*
@@ -1756,10 +2080,8 @@ ip_setmoptions(sopt, imop)
 		 * IP address. Find the interface and confirm that
 		 * it supports multicasting.
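The option parsing that continues below is the kernel half of the standard multicast socket options; the user-space half, for reference (ordinary sockets code, not from this patch; s is an assumed UDP socket descriptor):

	struct ip_mreq mreq;
	u_char ttl = 8;

	mreq.imr_multiaddr.s_addr = inet_addr("239.1.2.3");	/* example group */
	mreq.imr_interface.s_addr = htonl(INADDR_ANY);		/* kernel picks the interface */

	setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
	setsockopt(s, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl));

IP_ADD_MEMBERSHIP lands in ip_addmembership() below; an in_addr whose high byte is zero is treated as an encoded interface index by ip_multicast_if() above.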
*/ - s = splimp(); ifp = ip_multicast_if(&addr, &ifindex); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { - splx(s); error = EADDRNOTAVAIL; break; } @@ -1768,7 +2090,6 @@ ip_setmoptions(sopt, imop) imo->imo_multicast_addr = addr; else imo->imo_multicast_addr.s_addr = INADDR_ANY; - splx(s); break; case IP_MULTICAST_TTL: @@ -1900,14 +2221,12 @@ ip_addmembership( struct sockaddr_in *dst; struct ifnet *ifp = NULL; int error = 0; - int s = 0; int i; if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { error = EINVAL; return error; } - s = splimp(); /* * If no interface address was provided, use the interface of * the route to the given multicast address. @@ -1939,7 +2258,6 @@ ip_addmembership( */ if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; - splx(s); return error; } /* @@ -1954,12 +2272,10 @@ ip_addmembership( } if (i < imo->imo_num_memberships) { error = EADDRINUSE; - splx(s); return error; } if (i == IP_MAX_MEMBERSHIPS) { error = ETOOMANYREFS; - splx(s); return error; } /* @@ -1969,11 +2285,9 @@ ip_addmembership( if ((imo->imo_membership[i] = in_addmulti(&mreq->imr_multiaddr, ifp)) == NULL) { error = ENOBUFS; - splx(s); return error; } ++imo->imo_num_memberships; - splx(s); return error; } @@ -1987,7 +2301,6 @@ ip_dropmembership( struct ip_mreq *mreq) { int error = 0; - int s = 0; struct ifnet* ifp = NULL; int i; @@ -1996,7 +2309,6 @@ ip_dropmembership( return error; } - s = splimp(); /* * If an interface address was specified, get a pointer * to its ifnet structure. @@ -2007,7 +2319,6 @@ ip_dropmembership( ifp = ip_multicast_if(&mreq->imr_interface, NULL); if (ifp == NULL) { error = EADDRNOTAVAIL; - splx(s); return error; } } @@ -2023,21 +2334,19 @@ ip_dropmembership( } if (i == imo->imo_num_memberships) { error = EADDRNOTAVAIL; - splx(s); return error; } /* * Give up the multicast address record to which the * membership points. */ - in_delmulti(imo->imo_membership[i]); + in_delmulti(&imo->imo_membership[i]); /* * Remove the gap in the membership array. */ for (++i; i < imo->imo_num_memberships; ++i) imo->imo_membership[i-1] = imo->imo_membership[i]; --imo->imo_num_memberships; - splx(s); return error; } @@ -2119,8 +2428,7 @@ ip_freemoptions(imo) if (imo != NULL) { for (i = 0; i < imo->imo_num_memberships; ++i) - if (imo->imo_membership[i] != NULL) - in_delmulti(imo->imo_membership[i]); + in_delmulti(&imo->imo_membership[i]); FREE(imo, M_IPMOPTS); } } @@ -2204,18 +2512,15 @@ ip_mloopback(ifp, m, dst, hlen) * a filter has tapped-in. */ - if (lo_dl_tag == 0) - dlil_find_dltag(APPLE_IF_FAM_LOOPBACK, 0, PF_INET, &lo_dl_tag); - /* * Stuff the 'real' ifp into the pkthdr, to be used in matching * in ip_input(); we need the loopback ifp/dl_tag passed as args * to make the loopback driver compliant with the data link * requirements. */ - if (lo_dl_tag) { + if (lo_ifp) { copym->m_pkthdr.rcvif = ifp; - dlil_output(lo_dl_tag, copym, 0, (struct sockaddr *) dst, 0); + dlil_output(lo_ifp, PF_INET, copym, 0, (struct sockaddr *) dst, 0); } else { printf("Warning: ip_output call to dlil_find_dltag failed!\n"); m_freem(copym); diff --git a/bsd/netinet/ip_var.h b/bsd/netinet/ip_var.h index d5c0f22f6..ddba0e79f 100644 --- a/bsd/netinet/ip_var.h +++ b/bsd/netinet/ip_var.h @@ -58,7 +58,6 @@ #define _NETINET_IP_VAR_H_ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE /* * Overlay for ip header used by other protocols (tcp, udp). 
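The overlay declared below lines its ih_pr, ih_len, ih_src and ih_dst fields up with the protocol, checksum, source and destination fields of the real IP header, which is how the transport protocols build the checksum pseudo-header in place. Roughly what tcp_input() and udp_input() do (sketch only; tlen stands for the transport-segment length):

	struct ipovly *ipov = (struct ipovly *)ip;	/* overlay the real header */

	bzero(ipov->ih_x1, sizeof(ipov->ih_x1));	/* scratch bytes must be zero */
	ipov->ih_len = htons(tlen);			/* length written over ip_sum */
	/* ih_pr/ih_src/ih_dst already hold ip_p/ip_src/ip_dst, so the header
	 * now reads as exactly the TCP/UDP pseudo-header in_cksum() expects */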
*/ @@ -70,6 +69,7 @@ struct ipovly { struct in_addr ih_dst; /* destination internet address */ }; +#ifdef KERNEL_PRIVATE /* * Ip reassembly queue structure. Each fragment * being reassembled is attached to one of these structures. @@ -84,7 +84,8 @@ struct ipq { struct mbuf *ipq_frags; /* to ip headers of fragments */ struct in_addr ipq_src,ipq_dst; u_long ipq_nfrags; - u_long reserved[3]; /* for future use */ + TAILQ_ENTRY(ipq) ipq_list; + u_long reserved[1]; /* for future use */ #if IPDIVERT #ifdef IPDIVERT_44 u_int32_t ipq_div_info; /* ipfw divert port & flags */ @@ -101,7 +102,9 @@ struct ipq { * The actual length of the options (including ipopt_dst) * is in m_len. */ +#endif /* KERNEL_PRIVATE */ #define MAX_IPOPTLEN 40 +#ifdef KERNEL_PRIVATE struct ipoption { struct in_addr ipopt_dst; /* first-hop dst if source routed */ @@ -121,9 +124,14 @@ struct ip_moptions { u_long imo_multicast_vif; /* vif num outgoing multicasts */ struct in_addr imo_multicast_addr; /* ifindex/addr on MULTICAST_IF */ }; -#endif /* __APPLE_API_PRIVATE */ -#ifdef __APPLE_API_UNSTABLE +/* mbuf tag for ip_forwarding info */ +struct ip_fwd_tag { + struct sockaddr_in *next_hop; /* next_hop */ +}; + +#endif /* KERNEL_PRIVATE */ + struct ipstat { u_long ips_total; /* total packets received */ u_long ips_badsum; /* checksum bad */ @@ -155,10 +163,6 @@ struct ipstat { u_long ips_nogif; /* no match gif found */ u_long ips_badaddr; /* invalid address on header */ }; -#endif /* __APPLE_API_UNSTABLE */ - -#ifdef __APPLE_API_PRIVATE -#ifdef KERNEL struct ip_linklocal_stat { u_long iplls_in_total; @@ -167,6 +171,7 @@ struct ip_linklocal_stat { u_long iplls_out_badttl; }; +#ifdef KERNEL_PRIVATE /* flags passed to ip_output as last parameter */ #define IP_FORWARDING 0x1 /* most of ip header exists */ #define IP_RAWOUTPUT 0x2 /* raw ip header exists */ @@ -188,55 +193,48 @@ extern int ipforwarding; /* ip forwarding */ extern struct protosw *ip_protox[]; extern struct socket *ip_rsvpd; /* reservation protocol daemon */ extern struct socket *ip_mrouter; /* multicast routing daemon */ -extern int (*legal_vif_num) __P((int)); -extern u_long (*ip_mcast_src) __P((int)); +extern int (*legal_vif_num)(int); +extern u_long (*ip_mcast_src)(int); extern int rsvp_on; extern struct pr_usrreqs rip_usrreqs; -int ip_ctloutput __P((struct socket *, struct sockopt *sopt)); -void ip_drain __P((void)); -void ip_freemoptions __P((struct ip_moptions *)); -void ip_init __P((void)); -extern int (*ip_mforward) __P((struct ip *, struct ifnet *, struct mbuf *, - struct ip_moptions *)); -int ip_output __P((struct mbuf *, - struct mbuf *, struct route *, int, struct ip_moptions *)); -void ip_savecontrol __P((struct inpcb *, struct mbuf **, struct ip *, - struct mbuf *)); -void ip_slowtimo __P((void)); +int ip_ctloutput(struct socket *, struct sockopt *sopt); +void ip_drain(void); +void ip_freemoptions(struct ip_moptions *); +void ip_init(void); +extern int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, + struct ip_moptions *); +int ip_output(struct mbuf *, + struct mbuf *, struct route *, int, struct ip_moptions *); +int ip_output_list(struct mbuf *, int, + struct mbuf *, struct route *, int, struct ip_moptions *); +struct in_ifaddr * + ip_rtaddr(struct in_addr, struct route *); +void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, + struct mbuf *); +void ip_slowtimo(void); struct mbuf * - ip_srcroute __P((void)); -void ip_stripoptions __P((struct mbuf *, struct mbuf *)); + ip_srcroute(void); +void ip_stripoptions(struct mbuf *, struct 
mbuf *); #if RANDOM_IP_ID u_int16_t - ip_randomid __P((void)); + ip_randomid(void); #endif -int rip_ctloutput __P((struct socket *, struct sockopt *)); -void rip_ctlinput __P((int, struct sockaddr *, void *)); -void rip_init __P((void)); -void rip_input __P((struct mbuf *, int)); -int rip_output __P((struct mbuf *, struct socket *, u_long)); -void ipip_input __P((struct mbuf *, int)); -void rsvp_input __P((struct mbuf *, int)); -int ip_rsvp_init __P((struct socket *)); -int ip_rsvp_done __P((void)); -int ip_rsvp_vif_init __P((struct socket *, struct sockopt *)); -int ip_rsvp_vif_done __P((struct socket *, struct sockopt *)); -void ip_rsvp_force_done __P((struct socket *)); - -#if IPDIVERT -void div_init __P((void)); -void div_input __P((struct mbuf *, int)); -void divert_packet __P((struct mbuf *, int, int)); -extern struct pr_usrreqs div_usrreqs; -extern u_int16_t ip_divert_cookie; -#endif - -extern struct sockaddr_in *ip_fw_fwd_addr; +int rip_ctloutput(struct socket *, struct sockopt *); +void rip_ctlinput(int, struct sockaddr *, void *); +void rip_init(void); +void rip_input(struct mbuf *, int); +int rip_output(struct mbuf *, struct socket *, u_long); +int rip_unlock(struct socket *, int, int); +void ipip_input(struct mbuf *, int); +void rsvp_input(struct mbuf *, int); +int ip_rsvp_init(struct socket *); +int ip_rsvp_done(void); +int ip_rsvp_vif_init(struct socket *, struct sockopt *); +int ip_rsvp_vif_done(struct socket *, struct sockopt *); +void ip_rsvp_force_done(struct socket *); void in_delayed_cksum(struct mbuf *m); -#endif /* _KERNEL */ -#endif /* __APPLE_API_PRIVATE */ - -#endif /* !_NETINET_IP_VAR_H_ */ +#endif KERNEL_PRIVATE +#endif !_NETINET_IP_VAR_H_ diff --git a/bsd/netinet/kpi_ipfilter.c b/bsd/netinet/kpi_ipfilter.c new file mode 100644 index 000000000..52b8a0f25 --- /dev/null +++ b/bsd/netinet/kpi_ipfilter.c @@ -0,0 +1,496 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. 
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+#include <sys/param.h>	/* for definition of NULL */
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/mbuf.h>
+#include <sys/systm.h>
+
+#define _IP_VHL
+#include <net/if_var.h>
+#include <net/route.h>
+#include <net/kpi_protocol.h>
+
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet6/in6_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/ip_var.h>
+#include <netinet6/ip6_var.h>
+#include <netinet/kpi_ipfilter_var.h>
+
+/*
+ * kipf_lock and kipf_ref protect the linkage of the list of IP filters
+ * An IP filter can be removed only when kipf_ref is zero
+ * If an IP filter cannot be removed because kipf_ref is not zero, then
+ * the IP filter is marked and kipf_delayed_remove is set so that when
+ * kipf_ref eventually goes down to zero, the IP filter is removed
+ */
+static lck_mtx_t *kipf_lock = 0;
+static unsigned long kipf_ref = 0;
+static unsigned long kipf_delayed_remove = 0;
+
+__private_extern__ struct ipfilter_list ipv4_filters = TAILQ_HEAD_INITIALIZER(ipv4_filters);
+__private_extern__ struct ipfilter_list ipv6_filters = TAILQ_HEAD_INITIALIZER(ipv6_filters);
+__private_extern__ struct ipfilter_list tbr_filters = TAILQ_HEAD_INITIALIZER(tbr_filters);
+
+__private_extern__ void
+ipf_ref(void)
+{
+	lck_mtx_lock(kipf_lock);
+	kipf_ref++;
+	lck_mtx_unlock(kipf_lock);
+}
+
+__private_extern__ void
+ipf_unref(void)
+{
+	lck_mtx_lock(kipf_lock);
+
+	if (kipf_ref == 0)
+		panic("ipf_unref: kipf_ref == 0\n");
+
+	kipf_ref--;
+	if (kipf_ref == 0 && kipf_delayed_remove != 0) {
+		struct ipfilter *filter;
+
+		while ((filter = TAILQ_FIRST(&tbr_filters))) {
+			ipf_detach_func ipf_detach = filter->ipf_filter.ipf_detach;
+			void* cookie = filter->ipf_filter.cookie;
+
+			TAILQ_REMOVE(filter->ipf_head, filter, ipf_link);
+			TAILQ_REMOVE(&tbr_filters, filter, ipf_tbr);
+			kipf_delayed_remove--;
+
+			if (ipf_detach) {
+				lck_mtx_unlock(kipf_lock);
+				ipf_detach(cookie);
+				lck_mtx_lock(kipf_lock);
+				/* In case some filter got to run while we released the lock */
+				if (kipf_ref != 0)
+					break;
+			}
+		}
+	}
+	lck_mtx_unlock(kipf_lock);
+}
+
+static errno_t
+ipf_add(
+	const struct ipf_filter* filter,
+	ipfilter_t *filter_ref,
+	struct ipfilter_list *head)
+{
+	struct ipfilter *new_filter;
+	if (filter->name == NULL || (filter->ipf_input == NULL && filter->ipf_output == NULL))
+		return EINVAL;
+
+	MALLOC(new_filter, struct ipfilter*, sizeof(*new_filter), M_IFADDR, M_WAITOK);
+	if (new_filter == NULL)
+		return ENOMEM;
+
+	lck_mtx_lock(kipf_lock);
+	new_filter->ipf_filter = *filter;
+	new_filter->ipf_head = head;
+
+	/*
+	 * 3957298
+	 * Make sure third parties have a chance to filter packets before
+	 * SharedIP. Always keep SharedIP at the end of the list.
+	 */
+	if (filter->name != NULL &&
+		strcmp(filter->name, "com.apple.nke.SharedIP") == 0) {
+		TAILQ_INSERT_TAIL(head, new_filter, ipf_link);
+	}
+	else {
+		TAILQ_INSERT_HEAD(head, new_filter, ipf_link);
+	}
+
+	lck_mtx_unlock(kipf_lock);
+
+	*filter_ref = (ipfilter_t)new_filter;
+	return 0;
+}
+
+errno_t
+ipf_addv4(
+	const struct ipf_filter* filter,
+	ipfilter_t *filter_ref)
+{
+	return ipf_add(filter, filter_ref, &ipv4_filters);
+}
+
+errno_t
+ipf_addv6(
+	const struct ipf_filter* filter,
+	ipfilter_t *filter_ref)
+{
+	return ipf_add(filter, filter_ref, &ipv6_filters);
+}
+
+errno_t
+ipf_remove(
+	ipfilter_t filter_ref)
+{
+	struct ipfilter *match = (struct ipfilter*)filter_ref;
+	struct ipfilter_list *head;
+
+	if (match == 0 || (match->ipf_head != &ipv4_filters && match->ipf_head != &ipv6_filters))
+		return EINVAL;
+
+	head = match->ipf_head;
+
+	lck_mtx_lock(kipf_lock);
+	TAILQ_FOREACH(match, head, ipf_link) {
+		if (match == (struct ipfilter*)filter_ref) {
+			ipf_detach_func ipf_detach = match->ipf_filter.ipf_detach;
+			void* cookie = match->ipf_filter.cookie;
+
+			/*
+			 * Cannot detach while there are filters running
+			 */
+			if (kipf_ref) {
+				kipf_delayed_remove++;
+				TAILQ_INSERT_TAIL(&tbr_filters, match, ipf_tbr);
+				match->ipf_filter.ipf_input = 0;
+				match->ipf_filter.ipf_output = 0;
+				lck_mtx_unlock(kipf_lock);
+			} else {
+				TAILQ_REMOVE(head, match, ipf_link);
+				lck_mtx_unlock(kipf_lock);
+				if (ipf_detach)
+					ipf_detach(cookie);
+				FREE(match, M_IFADDR);
+			}
+			return 0;
+		}
+	}
+	lck_mtx_unlock(kipf_lock);
+
+	return ENOENT;
+}
+
+int log_for_en1 = 0;
+
+errno_t
+ipf_inject_input(
+	mbuf_t data,
+	ipfilter_t filter_ref)
+{
+	struct mbuf *m = (struct mbuf*)data;
+	struct m_tag *mtag = 0;
+	struct ip *ip = mtod(m, struct ip *);
+	u_int8_t vers;
+	int hlen;
+	errno_t error = 0;
+	protocol_family_t proto;
+
+	vers = IP_VHL_V(ip->ip_vhl);
+
+	switch (vers) {
+		case 4:
+			proto = PF_INET;
+			break;
+		case 6:
+			proto = PF_INET6;
+			break;
+		default:
+			error = ENOTSUP;
+			goto done;
+	}
+
+	if (filter_ref == 0 && m->m_pkthdr.rcvif == 0) {
+		m->m_pkthdr.rcvif = ifunit("lo0");
+		m->m_pkthdr.csum_data = 0;
+		m->m_pkthdr.csum_flags = 0;
+		if (vers == 4) {
+			hlen = IP_VHL_HL(ip->ip_vhl) << 2;
+			ip->ip_sum = 0;
+			ip->ip_sum = in_cksum(m, hlen);
+		}
+	}
+	if (filter_ref != 0) {
+		mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFILT,
+			sizeof (ipfilter_t), M_NOWAIT);
+		if (mtag == NULL) {
+			error = ENOMEM;
+			goto done;
+		}
+		*(ipfilter_t*)(mtag+1) = filter_ref;
+		m_tag_prepend(m, mtag);
+	}
+
+	error = proto_inject(proto, data);
+
+done:
+	return error;
+}
+
+static errno_t
+ipf_injectv4_out(
+	mbuf_t data,
+	ipfilter_t filter_ref,
+	ipf_pktopts_t options)
+{
+	struct route ro;
+	struct sockaddr_in *sin = (struct sockaddr_in*)&ro.ro_dst;
+	struct ip *ip;
+	struct mbuf *m = (struct mbuf*)data;
+	errno_t error = 0;
+	struct m_tag *mtag = 0;
+	struct ip_moptions *imo = 0, ip_moptions;
+
+	/* Make the IP header contiguous in the mbuf */
+	if ((size_t)m->m_len < sizeof(struct ip)) {
+		m = m_pullup(m, sizeof(struct ip));
+		if (m == NULL) return ENOMEM;
+	}
+	ip = (struct ip*)m_mtod(m);
+
+	if (filter_ref != 0) {
+		mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFILT,
+			sizeof (ipfilter_t), M_NOWAIT);
+		if (mtag == NULL) {
+			m_freem(m);
+			return ENOMEM;
+		}
+		*(ipfilter_t*)(mtag+1) = filter_ref;
+		m_tag_prepend(m, mtag);
+	}
+
+	if (options && (options->ippo_flags & IPPOF_MCAST_OPTS)) {
+		imo = &ip_moptions;
+
+		bzero(imo, sizeof(struct ip_moptions));
+		imo->imo_multicast_ifp =
+		    options->ippo_mcast_ifnet;
+		imo->imo_multicast_ttl = options->ippo_mcast_ttl;
+		imo->imo_multicast_loop = options->ippo_mcast_loop;
+	}
+
+	/* Fill out a route structure and get a route */
+	bzero(&ro, sizeof(struct route));
+	sin->sin_len = sizeof(struct sockaddr_in);
+	sin->sin_family = AF_INET;
+	sin->sin_port = 0;
+	sin->sin_addr = ip->ip_dst;
+	rtalloc(&ro);
+	if (ro.ro_rt == NULL) {
+		m_freem(m);
+		return ENETUNREACH;
+	}
+	/* Send  */
+	error = ip_output(m, NULL, &ro, IP_ALLOWBROADCAST | IP_RAWOUTPUT, imo);
+
+	/* Release the route */
+	if (ro.ro_rt)
+		rtfree(ro.ro_rt);
+
+	return error;
+}
+
+static errno_t
+ipf_injectv6_out(
+	mbuf_t data,
+	ipfilter_t filter_ref,
+	ipf_pktopts_t options)
+{
+	struct route_in6 ro;
+	struct sockaddr_in6 *sin6 = &ro.ro_dst;
+	struct ip6_hdr *ip6;
+	struct mbuf *m = (struct mbuf*)data;
+	errno_t error = 0;
+	struct m_tag *mtag = 0;
+	struct ip6_moptions *im6o = 0, ip6_moptions;
+
+	/* Make the IP header contiguous in the mbuf */
+	if ((size_t)m->m_len < sizeof(struct ip6_hdr)) {
+		m = m_pullup(m, sizeof(struct ip6_hdr));
+		if (m == NULL) return ENOMEM;
+	}
+	ip6 = (struct ip6_hdr*)m_mtod(m);
+
+	if (filter_ref != 0) {
+		mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFILT,
+			sizeof (ipfilter_t), M_NOWAIT);
+		if (mtag == NULL) {
+			m_freem(m);
+			return ENOMEM;
+		}
+		*(ipfilter_t*)(mtag+1) = filter_ref;
+		m_tag_prepend(m, mtag);
+	}
+
+	if (options && (options->ippo_flags & IPPOF_MCAST_OPTS)) {
+		im6o = &ip6_moptions;
+
+		bzero(im6o, sizeof(struct ip6_moptions));
+		im6o->im6o_multicast_ifp = options->ippo_mcast_ifnet;
+		im6o->im6o_multicast_hlim = options->ippo_mcast_ttl;
+		im6o->im6o_multicast_loop = options->ippo_mcast_loop;
+	}
+
+
+	/* Fill out a route structure and get a route */
+	bzero(&ro, sizeof(struct route_in6));
+	sin6->sin6_len = sizeof(struct sockaddr_in6);
+	sin6->sin6_family = AF_INET6;
+	sin6->sin6_addr = ip6->ip6_dst;
+#if 0
+	/* This breaks loopback multicast!
*/ + /* The scope ID should already at s6_addr16[1] */ + if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { + /* Hack, pull the scope_id out of the dest addr */ + sin6->sin6_scope_id = ntohs(ip6->ip6_dst.s6_addr16[1]); + ip6->ip6_dst.s6_addr16[1] = 0; + } else + sin6->sin6_scope_id = 0; +#endif + rtalloc((struct route*)&ro); + if (ro.ro_rt == NULL) { + m_freem(m); + return ENETUNREACH; + } + + /* Send */ + error = ip6_output(m, NULL, &ro, 0, im6o, NULL, 0); + + /* Release the route */ + if (ro.ro_rt) + rtfree(ro.ro_rt); + + return error; +} + +errno_t +ipf_inject_output( + mbuf_t data, + ipfilter_t filter_ref, + ipf_pktopts_t options) +{ + struct mbuf *m = (struct mbuf*)data; + u_int8_t vers; + errno_t error = 0; + + /* Make one byte of the header contiguous in the mbuf */ + if (m->m_len < 1) { + m = m_pullup(m, 1); + if (m == NULL) + goto done; + } + + vers = (*(u_int8_t*)m_mtod(m)) >> 4; + switch (vers) + { + case 4: + error = ipf_injectv4_out(data, filter_ref, options); + break; + case 6: + error = ipf_injectv6_out(data, filter_ref, options); + break; + default: + m_freem(m); + error = ENOTSUP; + break; + } + +done: + return error; +} + +__private_extern__ ipfilter_t +ipf_get_inject_filter(struct mbuf *m) +{ + ipfilter_t filter_ref = 0; + struct m_tag *mtag; + + mtag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFILT, NULL); + if (mtag) { + filter_ref = *(ipfilter_t *)(mtag+1); + + m_tag_delete(m, mtag); + } + return filter_ref; +} + +__private_extern__ int +ipf_init(void) +{ + int error = 0; + lck_grp_attr_t *grp_attributes = 0; + lck_attr_t *lck_attributes = 0; + lck_grp_t *lck_grp = 0; + + grp_attributes = lck_grp_attr_alloc_init(); + if (grp_attributes == 0) { + printf("ipf_init: lck_grp_attr_alloc_init failed\n"); + error = ENOMEM; + goto done; + } + lck_grp_attr_setdefault(grp_attributes); + + lck_grp = lck_grp_alloc_init("IP Filter", grp_attributes); + if (lck_grp == 0) { + printf("ipf_init: lck_grp_alloc_init failed\n"); + error = ENOMEM; + goto done; + } + + lck_attributes = lck_attr_alloc_init(); + if (lck_attributes == 0) { + printf("ipf_init: lck_attr_alloc_init failed\n"); + error = ENOMEM; + goto done; + } + lck_attr_setdefault(lck_attributes); + + kipf_lock = lck_mtx_alloc_init(lck_grp, lck_attributes); + if (kipf_lock == 0) { + printf("ipf_init: lck_mtx_alloc_init failed\n"); + error = ENOMEM; + goto done; + } + done: + if (error != 0) { + if (kipf_lock) { + lck_mtx_free(kipf_lock, lck_grp); + kipf_lock = 0; + } + } + if (lck_grp) { + lck_grp_free(lck_grp); + lck_grp = 0; + } + if (grp_attributes) { + lck_grp_attr_free(grp_attributes); + grp_attributes = 0; + } + if (lck_attributes) { + lck_attr_free(lck_attributes); + lck_attributes = 0; + } + + return error; +} diff --git a/bsd/netinet/kpi_ipfilter.h b/bsd/netinet/kpi_ipfilter.h new file mode 100644 index 000000000..bc0ae7867 --- /dev/null +++ b/bsd/netinet/kpi_ipfilter.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. 
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+/*!
+	@header kpi_ipfilter.h
+	This header defines an API to attach IP filters. IP filters may be
+	attached to intercept either IPv4 or IPv6 packets. The filters can
+	intercept all IP packets into and out of the host regardless of
+	interface.
+ */
+
+#ifndef __KPI_IPFILTER__
+#define __KPI_IPFILTER__
+
+#include <sys/kernel_types.h>
+
+/*
+ * ipf_pktopts
+ *
+ * Options for outgoing packets. The options need to be preserved when
+ * re-injecting a packet.
+ */
+struct ipf_pktopts {
+	u_int32_t	ippo_flags;
+	ifnet_t		ippo_mcast_ifnet;
+	int		ippo_mcast_loop;
+	u_int8_t	ippo_mcast_ttl;
+};
+#define IPPOF_MCAST_OPTS	0x1
+
+typedef struct ipf_pktopts* ipf_pktopts_t;
+
+/*!
+	@typedef ipf_input_func
+
+	@discussion ipf_input_func is used to filter incoming ip packets.
+		The IP filter is called for packets from all interfaces. The
+		filter is called between when the general IP processing is
+		handled and when the packet is passed up to the next layer
+		protocol such as udp or tcp. In the case of encapsulation, such
+		as UDP in ESP (IPSec), your filter will be called once for ESP
+		and then again for UDP. This will give your filter an
+		opportunity to process the ESP header as well as the decrypted
+		packet. Offset and protocol are used to determine where in the
+		packet processing is currently occurring. If you're only
+		interested in TCP or UDP packets, just return 0 if protocol
+		doesn't match TCP or UDP.
+	@param cookie The cookie specified when your filter was attached.
+	@param data The reassembled ip packet, data will start at the ip
+		header.
+	@param offset An offset to the next header
+		(udp/tcp/icmp/esp/etc...).
+	@param protocol The protocol type (udp/tcp/icmp/etc...) of the IP packet
+	@result Return:
+		0 - The caller will continue with normal processing of the packet.
+		EJUSTRETURN - The caller will stop processing the packet, the packet will not be freed.
+		Anything Else - The caller will free the packet and stop processing.
+*/
+typedef errno_t (*ipf_input_func)(void* cookie, mbuf_t *data, int offset, u_int8_t protocol);
+
+/*!
+	@typedef ipf_output_func
+
+	@discussion ipf_output_func is used to filter outbound ip packets.
+		The IP filter is called for packets to all interfaces. The
+		filter is called before fragmentation and IPSec processing. If
+		you need to change the destination IP address, call
+		ipf_inject_output and return EJUSTRETURN.
+	@param cookie The cookie specified when your filter was attached.
+	@param data The ip packet, will contain an IP header followed by the
+		rest of the IP packet.
+	@result Return:
+		0 - The caller will continue with normal processing of the packet.
+		EJUSTRETURN - The caller will stop processing the packet, the packet will not be freed.
+		Anything Else - The caller will free the packet and stop processing.
+*/
+typedef errno_t (*ipf_output_func)(void* cookie, mbuf_t *data, ipf_pktopts_t options);
+
+/*!
+	@typedef ipf_detach_func
+
+	@discussion ipf_detach_func is called to notify your filter that it
+		has been detached.
+ @param cookie The cookie specified when your filter was attached. +*/ +typedef void (*ipf_detach_func)(void* cookie); + +/*! + @typedef ipf_filter + @discussion This structure is used to define an IP filter for + use with the ipf_addv4 or ipf_addv6 function. + @field cookie A kext defined cookie that will be passed to all + filter functions. + @field name A filter name used for debugging purposes. + @field ipf_input The filter function to handle inbound packets. + @field ipf_output The filter function to handle outbound packets. + @field ipf_detach The filter function to notify of a detach. +*/ +struct ipf_filter { + void* cookie; + const char* name; + ipf_input_func ipf_input; + ipf_output_func ipf_output; + ipf_detach_func ipf_detach; +}; + +struct opaque_ipfilter; +typedef struct opaque_ipfilter* ipfilter_t; + +/*! + @function ipf_addv4 + @discussion Attaches an IPv4 ip filter. + @param filter A structure defining the filter. + @param filter_ref A reference to the filter used to detach it. + @result 0 on success otherwise the errno error. + */ +errno_t ipf_addv4(const struct ipf_filter* filter, ipfilter_t *filter_ref); + +/*! + @function ipf_addv6 + @discussion Attaches an IPv6 ip filter. + @param filter A structure defining the filter. + @param filter_ref A reference to the filter used to detach it. + @result 0 on success otherwise the errno error. + */ +errno_t ipf_addv6(const struct ipf_filter* filter, ipfilter_t *filter_ref); + +/*! + @function ipf_remove + @discussion Detaches an IPv4 or IPv6 filter. + @param filter_ref The reference to the filter returned from ipf_addv4 or + ipf_addv6. + @result 0 on success otherwise the errno error. + */ +errno_t ipf_remove(ipfilter_t filter_ref); + +/*! + @function ipf_inject_input + @discussion Inject an IP packet as though it had just been + reassembled in ip_input. When re-injecting a packet intercepted + by the filter's ipf_input function, an IP filter can pass its + reference to avoid processing the packet twice. This also + prevents ip filters installed before this filter from + getting a chance to process the packet. If the filter modified + the packet, it should not specify the filter ref to give other + filters a chance to process the new packet. + + Caller is responsible for freeing mbuf chain in the event that + ipf_inject_input returns an error. + @param data The complete IPv4 or IPv6 packet, receive interface must + be set. + @param filter_ref The reference to the filter injecting the data + @result 0 on success otherwise the errno error. + */ +errno_t ipf_inject_input(mbuf_t data, ipfilter_t filter_ref); + +/*! + @function ipf_inject_output + @discussion Inject an IP packet as though it had just been sent to + ip_output. When re-injecting a packet intercepted by the + filter's ipf_output function, an IP filter can pass its + reference to avoid processing the packet twice. This also + prevents ip filters installed before this filter from getting a + chance to process the packet. If the filter modified the packet, + it should not specify the filter ref to give other filters a + chance to process the new packet. + @param data The complete IPv4 or IPv6 packet. + @param filter_ref The reference to the filter injecting the data + @param options Output options for the packet + @result 0 on success otherwise the errno error. ipf_inject_output + will always free the mbuf. 
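Putting the pieces of this header together, a minimal sketch of an in-kernel client; every com.example name and my_/example_ function below is hypothetical, and only the KPI types and calls come from this header:

	#include <netinet/kpi_ipfilter.h>
	#include <netinet/in.h>

	static ipfilter_t example_ref;

	static errno_t
	example_input(void *cookie, mbuf_t *data, int offset, u_int8_t protocol)
	{
		if (protocol != IPPROTO_UDP)
			return 0;	/* not ours: continue normal processing */
		/* inspect *data here; returning EJUSTRETURN would swallow it */
		return 0;
	}

	static void
	example_detach(void *cookie)
	{
		/* last callback: safe to release any state hung off the cookie */
	}

	static errno_t
	example_attach(void)
	{
		static struct ipf_filter spec = {
			NULL,			/* cookie */
			"com.example.ipfilter",	/* hypothetical reverse-DNS name */
			example_input,
			NULL,			/* no output filtering */
			example_detach,
		};
		return ipf_addv4(&spec, &example_ref);
	}

	/* teardown: ipf_remove(example_ref) */

Note that ipf_remove() may defer the actual detach until no packet is inside a filter walk (the kipf_delayed_remove path in kpi_ipfilter.c), so state should be released in the ipf_detach callback rather than by the caller of ipf_remove().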
+ */ +errno_t ipf_inject_output(mbuf_t data, ipfilter_t filter_ref, ipf_pktopts_t options); + +#endif /* __KPI_IPFILTER__ */ diff --git a/bsd/ppc/table.h b/bsd/netinet/kpi_ipfilter_var.h similarity index 51% rename from bsd/ppc/table.h rename to bsd/netinet/kpi_ipfilter_var.h index ff4d1dc34..f77f97214 100644 --- a/bsd/ppc/table.h +++ b/bsd/netinet/kpi_ipfilter_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -19,20 +19,40 @@ * * @APPLE_LICENSE_HEADER_END@ */ -/* - * Copyright (c) 1989 Next, Inc. - */ - -#ifndef _BSD_PPC_TABLE_H_ -#define _BSD_PPC_TABLE_H_ + +#ifndef __KPI_IPFILTER_VAR__ +#define __KPI_IPFILTER_VAR__ #include <sys/appleapiopts.h> +#include <netinet/kpi_ipfilter.h> + +#ifdef KERNEL_PRIVATE + +/* Private data structure, stripped out by ifdef tool */ +/* Implementation specific bits */ + +#include <sys/queue.h> + +struct ipfilter { + TAILQ_ENTRY(ipfilter) ipf_link; + struct ipf_filter ipf_filter; + struct ipfilter_list *ipf_head; + TAILQ_ENTRY(ipfilter) ipf_tbr; +}; +TAILQ_HEAD(ipfilter_list, ipfilter); + + +extern struct ipfilter_list ipv6_filters; +extern struct ipfilter_list ipv4_filters; + +ipfilter_t ipf_get_inject_filter(struct mbuf *m); +void ipf_ref(void); +void ipf_unref(void); +int ipf_init(void); + +void ip_proto_dispatch_in(struct mbuf *m, int hlen, u_int8_t proto, ipfilter_t ipfref); -#ifdef __APPLE_API_OBSOLETE -/* - * No machine dependent table calls for ppc. - */ -#endif /* __APPLE_API_OBSOLETE */ +#endif /* KERNEL_PRIVATE */ -#endif /* _BSD_PPC_TABLE_H_ */ +#endif /*__KPI_IPFILTER_VAR__*/ diff --git a/bsd/netinet/raw_ip.c b/bsd/netinet/raw_ip.c index 60f2ce01b..f361be892 100644 --- a/bsd/netinet/raw_ip.c +++ b/bsd/netinet/raw_ip.c @@ -60,6 +60,7 @@ #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/proc.h> +#include <sys/domain.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> @@ -93,11 +94,19 @@ #if IPSEC extern int ipsec_bypass; +extern lck_mtx_t *sadb_mutex; #endif +extern u_long route_generation; struct inpcbhead ripcb; struct inpcbinfo ripcbinfo; +/* control hooks for ipfw and dummynet */ +ip_fw_ctl_t *ip_fw_ctl_ptr; +#if DUMMYNET +ip_dn_ctl_t *ip_dn_ctl_ptr; +#endif /* DUMMYNET */ + /* * Nominal space allocated to a raw ip socket. */ @@ -114,6 +123,8 @@ struct inpcbinfo ripcbinfo; void rip_init() { + struct inpcbinfo *pcbinfo; + LIST_INIT(&ripcb); ripcbinfo.listhead = &ripcb; /* @@ -128,6 +139,24 @@ rip_init() (4096 * sizeof(struct inpcb)), 4096, "ripzone"); + pcbinfo = &ripcbinfo; + /* + * allocate lock group attribute and group for udp pcb mutexes + */ + pcbinfo->mtx_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setdefault(pcbinfo->mtx_grp_attr); + + pcbinfo->mtx_grp = lck_grp_alloc_init("ripcb", pcbinfo->mtx_grp_attr); + + /* + * allocate the lock attribute for udp pcb mutexes + */ + pcbinfo->mtx_attr = lck_attr_alloc_init(); + lck_attr_setdefault(pcbinfo->mtx_attr); + + if ((pcbinfo->mtx = lck_rw_alloc_init(pcbinfo->mtx_grp, pcbinfo->mtx_attr)) == NULL) + return; /* pretty much dead if this fails... 
*/ + } static struct sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET }; @@ -145,8 +174,10 @@ rip_input(m, iphlen) register struct inpcb *inp; struct inpcb *last = 0; struct mbuf *opts = 0; + int skipit; ripsrc.sin_addr = ip->ip_src; + lck_rw_lock_shared(ripcbinfo.mtx); LIST_FOREACH(inp, &ripcb, inp_list) { #if INET6 if ((inp->inp_vflag & INP_IPV4) == 0) @@ -162,16 +193,23 @@ rip_input(m, iphlen) continue; if (last) { struct mbuf *n = m_copy(m, 0, (int)M_COPYALL); - + #if IPSEC /* check AH/ESP integrity. */ - if (ipsec_bypass == 0 && n && ipsec4_in_reject_so(n, last->inp_socket)) { - m_freem(n); - ipsecstat.in_polvio++; - /* do not inject data to pcb */ - } else + skipit = 0; + if (ipsec_bypass == 0 && n) { + lck_mtx_lock(sadb_mutex); + if (ipsec4_in_reject_so(n, last->inp_socket)) { + m_freem(n); + ipsecstat.in_polvio++; + /* do not inject data to pcb */ + skipit = 1; + } + lck_mtx_unlock(sadb_mutex); + } #endif /*IPSEC*/ - if (n) { + if (n && skipit == 0) { + int error = 0; if (last->inp_flags & INP_CONTROLOPTS || last->inp_socket->so_options & SO_TIMESTAMP) ip_savecontrol(last, &opts, ip, n); @@ -180,51 +218,60 @@ rip_input(m, iphlen) n->m_pkthdr.len -= iphlen; n->m_data += iphlen; } +// ###LOCK need to lock that socket? if (sbappendaddr(&last->inp_socket->so_rcv, (struct sockaddr *)&ripsrc, n, - opts) == 0) { - /* should notify about lost packet */ - kprintf("rip_input can't append to socket\n"); - m_freem(n); - if (opts) - m_freem(opts); - } else + opts, &error) != 0) { sorwakeup(last->inp_socket); + } + else { + if (error) { + /* should notify about lost packet */ + kprintf("rip_input can't append to socket\n"); + } + } opts = 0; } } last = inp; } + lck_rw_done(ripcbinfo.mtx); #if IPSEC /* check AH/ESP integrity. */ - if (ipsec_bypass == 0 && last && ipsec4_in_reject_so(m, last->inp_socket)) { - m_freem(m); - ipsecstat.in_polvio++; - ipstat.ips_delivered--; - /* do not inject data to pcb */ - } else + skipit = 0; + if (ipsec_bypass == 0 && last) { + lck_mtx_lock(sadb_mutex); + if (ipsec4_in_reject_so(m, last->inp_socket)) { + m_freem(m); + ipsecstat.in_polvio++; + ipstat.ips_delivered--; + /* do not inject data to pcb */ + skipit = 1; + } + lck_mtx_unlock(sadb_mutex); + } #endif /*IPSEC*/ - if (last) { - if (last->inp_flags & INP_CONTROLOPTS || - last->inp_socket->so_options & SO_TIMESTAMP) - ip_savecontrol(last, &opts, ip, m); - if (last->inp_flags & INP_STRIPHDR) { - m->m_len -= iphlen; - m->m_pkthdr.len -= iphlen; - m->m_data += iphlen; - } - if (sbappendaddr(&last->inp_socket->so_rcv, - (struct sockaddr *)&ripsrc, m, opts) == 0) { - kprintf("rip_input(2) can't append to socket\n"); + if (skipit == 0) { + if (last) { + if (last->inp_flags & INP_CONTROLOPTS || + last->inp_socket->so_options & SO_TIMESTAMP) + ip_savecontrol(last, &opts, ip, m); + if (last->inp_flags & INP_STRIPHDR) { + m->m_len -= iphlen; + m->m_pkthdr.len -= iphlen; + m->m_data += iphlen; + } + if (sbappendaddr(&last->inp_socket->so_rcv, + (struct sockaddr *)&ripsrc, m, opts, NULL) != 0) { + sorwakeup(last->inp_socket); + } else { + kprintf("rip_input(2) can't append to socket\n"); + } + } else { m_freem(m); - if (opts) - m_freem(opts); - } else - sorwakeup(last->inp_socket); - } else { - m_freem(m); - ipstat.ips_noproto++; - ipstat.ips_delivered--; + ipstat.ips_noproto++; + ipstat.ips_delivered--; + } } } @@ -293,23 +340,27 @@ rip_output(m, so, dst) } #endif /*IPSEC*/ - return (ip_output(m, inp->inp_options, &inp->inp_route, flags, + if (inp->inp_route.ro_rt && inp->inp_route.ro_rt->generation_id != 
route_generation) { + rtfree(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = (struct rtentry *)0; + } + + return (ip_output_list(m, 0, inp->inp_options, &inp->inp_route, flags, inp->inp_moptions)); } -int +extern int load_ipfw() { kern_return_t err; - /* Load the kext by the identifier */ - err = kmod_load_extension("com.apple.nke.IPFirewall"); - if (err) return err; + ipfw_init(); - if (ip_fw_ctl_ptr == NULL) { - /* Wait for the kext to finish loading */ - err = tsleep(&ip_fw_ctl_ptr, PWAIT | PCATCH, "load_ipfw_kext", 5 * 60 /* 5 seconds */); - } +#if DUMMYNET + if (!DUMMYNET_LOADED) + ip_dn_init(); +#endif /* DUMMYNET */ + err = 0; return err == 0 && ip_fw_ctl_ptr == NULL ? -1 : err; } @@ -357,10 +408,10 @@ rip_ctloutput(so, sopt) #if DUMMYNET case IP_DUMMYNET_GET: - if (ip_dn_ctl_ptr == NULL) - error = ENOPROTOOPT ; - else + if (DUMMYNET_LOADED) error = ip_dn_ctl_ptr(sopt); + else + error = ENOPROTOOPT; break ; #endif /* DUMMYNET */ @@ -428,10 +479,10 @@ rip_ctloutput(so, sopt) case IP_DUMMYNET_CONFIGURE: case IP_DUMMYNET_DEL: case IP_DUMMYNET_FLUSH: - if (ip_dn_ctl_ptr == NULL) - error = ENOPROTOOPT ; - else + if (DUMMYNET_LOADED) error = ip_dn_ctl_ptr(sopt); + else + error = ENOPROTOOPT ; break ; #endif @@ -493,6 +544,7 @@ rip_ctlinput(cmd, sa, vip) switch (cmd) { case PRC_IFDOWN: + lck_mtx_lock(rt_mtx); for (ia = in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) { if (ia->ia_ifa.ifa_addr == sa @@ -500,7 +552,7 @@ rip_ctlinput(cmd, sa, vip) /* * in_ifscrub kills the interface route. */ - in_ifscrub(ia->ia_ifp, ia); + in_ifscrub(ia->ia_ifp, ia, 1); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right @@ -511,16 +563,20 @@ rip_ctlinput(cmd, sa, vip) break; } } + lck_mtx_unlock(rt_mtx); break; case PRC_IFUP: + lck_mtx_lock(rt_mtx); for (ia = in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) { if (ia->ia_ifa.ifa_addr == sa) break; } - if (ia == 0 || (ia->ia_flags & IFA_ROUTE)) + if (ia == 0 || (ia->ia_flags & IFA_ROUTE)) { + lck_mtx_unlock(rt_mtx); return; + } flags = RTF_UP; ifp = ia->ia_ifa.ifa_ifp; @@ -528,7 +584,8 @@ rip_ctlinput(cmd, sa, vip) || (ifp->if_flags & IFF_POINTOPOINT)) flags |= RTF_HOST; - err = rtinit(&ia->ia_ifa, RTM_ADD, flags); + err = rtinit_locked(&ia->ia_ifa, RTM_ADD, flags); + lck_mtx_unlock(rt_mtx); if (err == 0) ia->ia_flags |= IFA_ROUTE; break; @@ -612,15 +669,21 @@ rip_bind(struct socket *so, struct sockaddr *nam, struct proc *p) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in *addr = (struct sockaddr_in *)nam; + struct ifaddr *ifa = NULL; if (nam->sa_len != sizeof(*addr)) return EINVAL; - if (TAILQ_EMPTY(&ifnet) || ((addr->sin_family != AF_INET) && + if (TAILQ_EMPTY(&ifnet_head) || ((addr->sin_family != AF_INET) && (addr->sin_family != AF_IMPLINK)) || (addr->sin_addr.s_addr && - ifa_ifwithaddr((struct sockaddr *)addr) == 0)) + (ifa = ifa_ifwithaddr((struct sockaddr *)addr)) == 0)) { return EADDRNOTAVAIL; + } + else if (ifa) { + ifafree(ifa); + ifa = NULL; + } inp->inp_laddr = addr->sin_addr; return 0; } @@ -633,7 +696,7 @@ rip_connect(struct socket *so, struct sockaddr *nam, struct proc *p) if (nam->sa_len != sizeof(*addr)) return EINVAL; - if (TAILQ_EMPTY(&ifnet)) + if (TAILQ_EMPTY(&ifnet_head)) return EADDRNOTAVAIL; if ((addr->sin_family != AF_INET) && (addr->sin_family != AF_IMPLINK)) @@ -673,6 +736,33 @@ rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, return rip_output(m, so, dst); } +int +rip_unlock(struct socket *so, int refcount, int debug) +{ + int lr_saved; + struct 
inpcb *inp = sotoinpcb(so); +#ifdef __ppc__ + if (debug == 0) { + __asm__ volatile("mflr %0" : "=r" (lr_saved)); + } + else lr_saved = debug; +#endif + if (refcount) { + if (so->so_usecount <= 0) + panic("rip_unlock: bad refoucnt so=%x val=%x\n", so, so->so_usecount); + so->so_usecount--; + if (so->so_usecount == 0 && (inp->inp_wantcnt == WNT_STOPUSING)) { + lck_mtx_unlock(so->so_proto->pr_domain->dom_mtx); + lck_rw_lock_exclusive(ripcbinfo.mtx); + in_pcbdispose(inp); + lck_rw_done(ripcbinfo.mtx); + return(0); + } + } + lck_mtx_unlock(so->so_proto->pr_domain->dom_mtx); + return(0); +} + static int rip_pcblist SYSCTL_HANDLER_ARGS { @@ -685,58 +775,64 @@ rip_pcblist SYSCTL_HANDLER_ARGS * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ - if (req->oldptr == 0) { + lck_rw_lock_exclusive(ripcbinfo.mtx); + if (req->oldptr == USER_ADDR_NULL) { n = ripcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); + lck_rw_done(ripcbinfo.mtx); return 0; } - if (req->newptr != 0) + if (req->newptr != USER_ADDR_NULL) { + lck_rw_done(ripcbinfo.mtx); return EPERM; + } /* * OK, now we're committed to doing something. */ - s = splnet(); gencnt = ripcbinfo.ipi_gencnt; n = ripcbinfo.ipi_count; - splx(s); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); - if (error) + if (error) { + lck_rw_done(ripcbinfo.mtx); return error; + } /* * We are done if there is no pcb */ - if (n == 0) + if (n == 0) { + lck_rw_done(ripcbinfo.mtx); return 0; + } inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK); - if (inp_list == 0) + if (inp_list == 0) { + lck_rw_done(ripcbinfo.mtx); return ENOMEM; + } - s = splnet(); for (inp = ripcbinfo.listhead->lh_first, i = 0; inp && i < n; inp = inp->inp_list.le_next) { - if (inp->inp_gencnt <= gencnt) + if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) inp_list[i++] = inp; } - splx(s); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; - if (inp->inp_gencnt <= gencnt) { + if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) { struct xinpcb xi; xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ - bcopy(inp, &xi.xi_inp, sizeof *inp); + inpcb_to_compat(inp, &xi.xi_inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); error = SYSCTL_OUT(req, &xi, sizeof xi); @@ -750,14 +846,13 @@ rip_pcblist SYSCTL_HANDLER_ARGS * while we were processing this request, and it * might be necessary to retry. */ - s = splnet(); xig.xig_gen = ripcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = ripcbinfo.ipi_count; - splx(s); error = SYSCTL_OUT(req, &xig, sizeof xig); } FREE(inp_list, M_TEMP); + lck_rw_done(ripcbinfo.mtx); return error; } @@ -769,5 +864,5 @@ struct pr_usrreqs rip_usrreqs = { pru_connect2_notsupp, in_control, rip_detach, rip_disconnect, pru_listen_notsupp, in_setpeeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, rip_send, pru_sense_null, rip_shutdown, - in_setsockaddr, sosend, soreceive, sopoll + in_setsockaddr, sosend, soreceive, pru_sopoll_notsupp }; diff --git a/bsd/netinet/tcp.h b/bsd/netinet/tcp.h index 30479bd88..89e6ba826 100644 --- a/bsd/netinet/tcp.h +++ b/bsd/netinet/tcp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
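The rip_pcblist handler above follows the classic two-pass sysctl shape: with no output buffer it returns a padded size estimate, otherwise it copies a snapshot out while holding the pcbinfo lock. A user-space analogue of that pattern, assuming a pthread rwlock in place of lck_rw:

/*
 * Illustrative user-space analogue of the two-pass sysctl pattern:
 * pass 1 (buf == NULL) reports a size estimate, pass 2 copies a
 * snapshot under a read-write lock.
 */
#include <errno.h>
#include <pthread.h>
#include <stddef.h>
#include <string.h>

struct entry { int id; };

static struct entry table[128];
static size_t table_count;
static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;

int
snapshot(void *buf, size_t *len)
{
	int error = 0;

	pthread_rwlock_rdlock(&table_lock);
	if (buf == NULL) {
		/* Pass 1: report required size, padded like the kernel's
		 * (n + n/8) to tolerate growth between the two calls. */
		size_t n = table_count;
		*len = (n + n / 8) * sizeof(struct entry);
	} else if (*len < table_count * sizeof(struct entry)) {
		error = ENOMEM;
	} else {
		memcpy(buf, table, table_count * sizeof(struct entry));
		*len = table_count * sizeof(struct entry);
	}
	pthread_rwlock_unlock(&table_lock);
	return error;
}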
* * @APPLE_LICENSE_HEADER_START@ * @@ -58,9 +58,12 @@ #ifndef _NETINET_TCP_H_ #define _NETINET_TCP_H_ #include <sys/appleapiopts.h> +#include <sys/_types.h> +#include <machine/endian.h> -typedef u_int32_t tcp_seq; -typedef u_int32_t tcp_cc; /* connection count per rfc1644 */ +#ifndef _POSIX_C_SOURCE +typedef __uint32_t tcp_seq; +typedef __uint32_t tcp_cc; /* connection count per rfc1644 */ #define tcp6_seq tcp_seq /* for KAME src sync over BSD*'s */ #define tcp6hdr tcphdr /* for KAME src sync over BSD*'s */ @@ -70,19 +73,19 @@ typedef u_int32_t tcp_cc; /* connection count per rfc1644 */ * Per RFC 793, September, 1981. */ struct tcphdr { - u_short th_sport; /* source port */ - u_short th_dport; /* destination port */ + unsigned short th_sport; /* source port */ + unsigned short th_dport; /* destination port */ tcp_seq th_seq; /* sequence number */ tcp_seq th_ack; /* acknowledgement number */ -#if BYTE_ORDER == LITTLE_ENDIAN - u_int th_x2:4, /* (unused) */ - th_off:4; /* data offset */ +#if __DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN + unsigned int th_x2:4, /* (unused) */ + th_off:4; /* data offset */ #endif -#if BYTE_ORDER == BIG_ENDIAN - u_int th_off:4, /* data offset */ - th_x2:4; /* (unused) */ +#if __DARWIN_BYTE_ORDER == __DARWIN_BIG_ENDIAN + unsigned int th_off:4, /* data offset */ + th_x2:4; /* (unused) */ #endif - u_char th_flags; + unsigned char th_flags; #define TH_FIN 0x01 #define TH_SYN 0x02 #define TH_RST 0x04 @@ -93,9 +96,9 @@ struct tcphdr { #define TH_CWR 0x80 #define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG|TH_ECE|TH_CWR) - u_short th_win; /* window */ - u_short th_sum; /* checksum */ - u_short th_urp; /* urgent pointer */ + unsigned short th_win; /* window */ + unsigned short th_sum; /* checksum */ + unsigned short th_urp; /* urgent pointer */ }; #define TCPOPT_EOL 0 @@ -140,6 +143,15 @@ struct tcphdr { */ #define TCP_MINMSS 216 +/* + * TCP_MINMSSOVERLOAD is defined to be 1000 which should cover any type + * of interactive TCP session. + * See tcp_subr.c tcp_minmssoverload SYSCTL declaration and tcp_input.c + * for more comments. + * Setting this to "0" disables the minmssoverload check. + */ +#define TCP_MINMSSOVERLOAD 1000 + /* * Default maximum segment size for TCP6. * With an IP6 MSS of 1280, this is 1220, @@ -158,14 +170,17 @@ struct tcphdr { #define TCP_MAXHLEN (0xf<<2) /* max length of header in bytes */ #define TCP_MAXOLEN (TCP_MAXHLEN - sizeof(struct tcphdr)) /* max space left for options */ +#endif /* _POSIX_C_SOURCE */ /* * User-settable options (used with setsockopt). */ #define TCP_NODELAY 0x01 /* don't delay send to coalesce packets */ +#ifndef _POSIX_C_SOURCE #define TCP_MAXSEG 0x02 /* set maximum segment size */ #define TCP_NOPUSH 0x04 /* don't push last block of write */ #define TCP_NOOPT 0x08 /* don't use TCP options */ #define TCP_KEEPALIVE 0x10 /* idle time used when SO_KEEPALIVE is enabled */ +#endif /* _POSIX_C_SOURCE */ #endif diff --git a/bsd/netinet/tcp_debug.h b/bsd/netinet/tcp_debug.h index 76a05474e..1fbf37aa9 100644 --- a/bsd/netinet/tcp_debug.h +++ b/bsd/netinet/tcp_debug.h @@ -58,7 +58,7 @@ #ifndef _NETINET_TCP_DEBUG_H_ #define _NETINET_TCP_DEBUG_H_ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE +#ifdef PRIVATE struct tcp_debug { n_time td_time; @@ -97,11 +97,6 @@ static char *tanames[] = #define TCP_NDEBUG 100 -#ifndef KERNEL -/* XXX common variables for broken applications. 
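The byte-order-conditional bitfields in struct tcphdr above exist so that direct field access matches the wire layout, where the data offset occupies the high nibble of byte 12. A small, stand-alone check of that equivalence:

/*
 * Illustrative: the header length in bytes is th_off (32-bit words)
 * shifted left by 2, and th_off sits in the high nibble of byte 12
 * on the wire regardless of host endianness.
 */
#include <stdio.h>

static unsigned
tcp_header_len(const unsigned char *th)
{
	return (unsigned)(th[12] >> 4) << 2;
}

int
main(void)
{
	unsigned char hdr[20] = { 0 };
	hdr[12] = 5 << 4;	/* minimal header: 5 words = 20 bytes */
	printf("header length: %u bytes\n", tcp_header_len(hdr));
	return 0;
}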
*/ -struct tcp_debug tcp_debug[TCP_NDEBUG]; -int tcp_debx; -#endif -#endif /* __APPLE_API_PRIVATE */ +#endif /* PRIVATE */ #endif /* !_NETINET_TCP_DEBUG_H_ */ diff --git a/bsd/netinet/tcp_fsm.h b/bsd/netinet/tcp_fsm.h index e4a2e6826..6bbb41490 100644 --- a/bsd/netinet/tcp_fsm.h +++ b/bsd/netinet/tcp_fsm.h @@ -99,7 +99,7 @@ #define TCPS_HAVEESTABLISHED(s) ((s) >= TCPS_ESTABLISHED) #define TCPS_HAVERCVDFIN(s) ((s) >= TCPS_TIME_WAIT) -#ifdef __APPLE_API_UNSTABLE +#ifdef KERNEL_PRIVATE #ifdef TCPOUTFLAGS /* * Flags used when sending segments in tcp_output. @@ -121,19 +121,19 @@ static u_char tcp_outflags[TCP_NSTATES] = { TH_ACK, /* 10, TIME_WAIT */ }; #endif -#endif /* __APPLE_API_UNSTABLE */ +#endif KERNEL_PRIVATE #if KPROF -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE int tcp_acounts[TCP_NSTATES][PRU_NREQ]; -#endif /* __APPLE_API_PRIVATE */ +#endif KERNEL_PRIVATE #endif #ifdef TCPSTATES char *tcpstates[] = { "CLOSED", "LISTEN", "SYN_SENT", "SYN_RCVD", "ESTABLISHED", "CLOSE_WAIT", "FIN_WAIT_1", "CLOSING", - "LAST_ACK", "FIN_WAIT_2", "TIME_WAIT", + "LAST_ACK", "FIN_WAIT_2", "TIME_WAIT" }; #endif diff --git a/bsd/netinet/tcp_input.c b/bsd/netinet/tcp_input.c index 46fe20ed1..b931b642d 100644 --- a/bsd/netinet/tcp_input.c +++ b/bsd/netinet/tcp_input.c @@ -129,6 +129,7 @@ extern int apple_hwcksum_rx; #if IPSEC extern int ipsec_bypass; +extern lck_mtx_t *sadb_mutex; #endif struct tcpstat tcpstat; @@ -188,14 +189,14 @@ struct inpcbhead tcb; #define tcb6 tcb /* for KAME src sync over BSD*'s */ struct inpcbinfo tcbinfo; -static void tcp_dooptions __P((struct tcpcb *, - u_char *, int, struct tcphdr *, struct tcpopt *)); -static void tcp_pulloutofband __P((struct socket *, - struct tcphdr *, struct mbuf *, int)); -static int tcp_reass __P((struct tcpcb *, struct tcphdr *, int *, - struct mbuf *)); -static void tcp_xmit_timer __P((struct tcpcb *, int)); -static int tcp_newreno __P((struct tcpcb *, struct tcphdr *)); +static void tcp_dooptions(struct tcpcb *, + u_char *, int, struct tcphdr *, struct tcpopt *); +static void tcp_pulloutofband(struct socket *, + struct tcphdr *, struct mbuf *, int); +static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, + struct mbuf *); +static void tcp_xmit_timer(struct tcpcb *, int); +static int tcp_newreno __P((struct tcpcb *, struct tcphdr *)); /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #if INET6 @@ -212,16 +213,39 @@ do { \ extern u_long *delack_bitmask; +extern void ipfwsyslog( int level, char *format,...); +extern int ChkAddressOK( __uint32_t dstaddr, __uint32_t srcaddr ); +extern int fw_verbose; + +#define log_in_vain_log( a ) { \ + if ( (log_in_vain == 3 ) && (fw_verbose == 2)) { /* Apple logging, log to ipfw.log */ \ + ipfwsyslog a ; \ + } \ + else log a ; \ +} + /* - * Indicate whether this ack should be delayed. We can delay the ack if - * - delayed acks are enabled and - * - there is no delayed ack timer in progress and + * Indicate whether this ack should be delayed. + * We can delay the ack if: + * - delayed acks are enabled (set to 1) and * - our last ack wasn't a 0-sized window. We never want to delay - * the ack that opens up a 0-sized window. + * the ack that opens up a 0-sized window. + * - delayed acks are enabled (set to 2, "more compatible") and + * - our last ack wasn't a 0-sized window. + * - if the peer hasn't sent us a TH_PUSH data packet (this solves 3649245) + * - the peer hasn't sent us a TH_PUSH data packet, if he did, take this as a clue that we + * need to ACK with no delay. 
This helps higher level protocols who won't send + * us more data even if the window is open because their last "segment" hasn't been ACKed + * + * */ #define DELAY_ACK(tp) \ - (tcp_delack_enabled && !callout_pending(tp->tt_delack) && \ - (tp->t_flags & TF_RXWIN0SENT) == 0) + (((tcp_delack_enabled == 1) && ((tp->t_flags & TF_RXWIN0SENT) == 0)) || \ + (((tcp_delack_enabled == 2) && (tp->t_flags & TF_RXWIN0SENT) == 0) && \ + ((thflags & TH_PUSH) == 0) && ((tp->t_flags & TF_DELACK) == 0))) + + +static int tcpdropdropablreq(struct socket *head); static int @@ -237,6 +261,7 @@ tcp_reass(tp, th, tlenp, m) struct tseg_qent *te; struct socket *so = tp->t_inpcb->inp_socket; int flags; + int dowakeup = 0; /* * Call with th==0 after become established to @@ -362,8 +387,10 @@ present: LIST_REMOVE(q, tqe_q); if (so->so_state & SS_CANTRCVMORE) m_freem(q->tqe_m); - else - sbappend(&so->so_rcv, q->tqe_m); + else { + if (sbappend(&so->so_rcv, q->tqe_m)) + dowakeup = 1; + } FREE(q, M_TSEGQ); tcp_reass_qsize--; q = nq; @@ -387,8 +414,9 @@ present: (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) | (tp->t_inpcb->inp_faddr.s_addr & 0xffff)), 0,0,0); - } - sorwakeup(so); + } + if (dowakeup) + sorwakeup(so); /* done with socket lock held */ return (flags); } @@ -407,7 +435,7 @@ tcp6_input(mp, offp) register struct mbuf *m = *mp; struct in6_ifaddr *ia6; - IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); + IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE); /* * draft-itojun-ipv6-tcp-to-anycast @@ -451,13 +479,26 @@ tcp_input(m, off0) #endif int dropsocket = 0; int iss = 0; + int nosock = 0; u_long tiwin; struct tcpopt to; /* options in this segment */ struct rmxp_tao *taop; /* pointer to our TAO cache entry */ struct rmxp_tao tao_noncached; /* in case there's no cached entry */ + struct sockaddr_in *next_hop = NULL; #if TCPDEBUG short ostate = 0; #endif + struct m_tag *fwd_tag; + + /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ + fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, NULL); + if (fwd_tag != NULL) { + struct ip_fwd_tag *ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1); + + next_hop = ipfwd_tag->next_hop; + m_tag_delete(m, fwd_tag); + } + #if INET6 struct ip6_hdr *ip6 = NULL; int isipv6; @@ -483,7 +524,7 @@ tcp_input(m, off0) tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { tcpstat.tcps_rcvbadsum++; - goto drop; + goto dropnosock; } th = (struct tcphdr *)((caddr_t)ip6 + off0); @@ -500,7 +541,7 @@ tcp_input(m, off0) */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ - goto drop; + goto dropnosock; } } else #endif /* INET6 */ @@ -579,7 +620,7 @@ tcp_input(m, off0) } if (th->th_sum) { tcpstat.tcps_rcvbadsum++; - goto drop; + goto dropnosock; } #if INET6 /* Re-initialization for later version check */ @@ -594,13 +635,13 @@ tcp_input(m, off0) off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { tcpstat.tcps_rcvbadoff++; - goto drop; + goto dropnosock; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { #if INET6 if (isipv6) { - IP6_EXTHDR_CHECK(m, off0, off, ); + IP6_EXTHDR_CHECK(m, off0, off, return); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); } else @@ -647,7 +688,7 @@ tcp_input(m, off0) * This is incompatible with RFC1644 extensions (T/TCP). 
*/ if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) - goto drop; + goto dropnosock; #endif /* @@ -673,7 +714,7 @@ tcp_input(m, off0) */ findpcb: #if IPFIREWALL_FORWARD - if (ip_fw_fwd_addr != NULL + if (next_hop != NULL #if INET6 && isipv6 == NULL /* IPv6 support is not yet */ #endif /* INET6 */ @@ -688,19 +729,18 @@ findpcb: /* * No, then it's new. Try find the ambushing socket */ - if (!ip_fw_fwd_addr->sin_port) { + if (!next_hop->sin_port) { inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, - th->th_sport, ip_fw_fwd_addr->sin_addr, + th->th_sport, next_hop->sin_addr, th->th_dport, 1, m->m_pkthdr.rcvif); } else { inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, - ip_fw_fwd_addr->sin_addr, - ntohs(ip_fw_fwd_addr->sin_port), 1, + next_hop->sin_addr, + ntohs(next_hop->sin_port), 1, m->m_pkthdr.rcvif); } } - ip_fw_fwd_addr = NULL; } else #endif /* IPFIREWALL_FORWARD */ { @@ -716,17 +756,23 @@ findpcb: } #if IPSEC + if (ipsec_bypass == 0) { + lck_mtx_lock(sadb_mutex); #if INET6 - if (isipv6) { - if (ipsec_bypass == 0 && inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) { - ipsec6stat.in_polvio++; - goto drop; - } - } else + if (isipv6) { + if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) { + ipsec6stat.in_polvio++; + lck_mtx_unlock(sadb_mutex); + goto dropnosock; + } + } else #endif /* INET6 */ - if (ipsec_bypass == 0 && inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) { - ipsecstat.in_polvio++; - goto drop; + if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) { + ipsecstat.in_polvio++; + lck_mtx_unlock(sadb_mutex); + goto dropnosock; + } + lck_mtx_unlock(sadb_mutex); } #endif /*IPSEC*/ @@ -739,55 +785,88 @@ findpcb: if (inp == NULL) { if (log_in_vain) { #if INET6 - char dbuf[INET6_ADDRSTRLEN], sbuf[INET6_ADDRSTRLEN]; + char dbuf[MAX_IPv6_STR_LEN], sbuf[MAX_IPv6_STR_LEN]; #else /* INET6 */ - char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"]; + char dbuf[MAX_IPv4_STR_LEN], sbuf[MAX_IPv4_STR_LEN]; #endif /* INET6 */ #if INET6 if (isipv6) { - strcpy(dbuf, ip6_sprintf(&ip6->ip6_dst)); - strcpy(sbuf, ip6_sprintf(&ip6->ip6_src)); + inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf)); + inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf)); } else #endif - { - strcpy(dbuf, inet_ntoa(ip->ip_dst)); - strcpy(sbuf, inet_ntoa(ip->ip_src)); - } + { + inet_ntop(AF_INET, &ip->ip_dst, dbuf, sizeof(dbuf)); + inet_ntop(AF_INET, &ip->ip_src, sbuf, sizeof(sbuf)); + } switch (log_in_vain) { case 1: if(thflags & TH_SYN) log(LOG_INFO, - "Connection attempt to TCP %s:%d from %s:%d\n", - dbuf, ntohs(th->th_dport), - sbuf, - ntohs(th->th_sport)); + "Connection attempt to TCP %s:%d from %s:%d\n", + dbuf, ntohs(th->th_dport), + sbuf, + ntohs(th->th_sport)); break; case 2: log(LOG_INFO, - "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n", - dbuf, ntohs(th->th_dport), sbuf, - ntohs(th->th_sport), thflags); + "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n", + dbuf, ntohs(th->th_dport), sbuf, + ntohs(th->th_sport), thflags); + break; + case 3: + if ((thflags & TH_SYN) && + !(m->m_flags & (M_BCAST | M_MCAST)) && +#if INET6 + ((isipv6 && !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) || + (!isipv6 && ip->ip_dst.s_addr != ip->ip_src.s_addr)) +#else + ip->ip_dst.s_addr != ip->ip_src.s_addr +#endif + ) + log_in_vain_log((LOG_INFO, + "Stealth Mode connection attempt to TCP %s:%d from %s:%d\n", + dbuf, ntohs(th->th_dport), + sbuf, + ntohs(th->th_sport))); break; default: break; } } if (blackhole) { - switch (blackhole) { - case 1: - if (thflags & 
TH_SYN) - goto drop; - break; - case 2: - goto drop; - default: - goto drop; - } + if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP) + switch (blackhole) { + case 1: + if (thflags & TH_SYN) + goto dropnosock; + break; + case 2: + goto dropnosock; + default: + goto dropnosock; + } } rstreason = BANDLIM_RST_CLOSEDPORT; - goto dropwithreset; + goto dropwithresetnosock; + } + so = inp->inp_socket; + if (so == NULL) { + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) + inp = NULL; // pretend we didn't find it +#if TEMPDEBUG + printf("tcp_input: no more socket for inp=%x\n", inp); +#endif + goto dropnosock; } + tcp_lock(so, 1, 2); + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { + tcp_unlock(so, 1, 2); + inp = NULL; // pretend we didn't find it + goto dropnosock; + } + tp = intotcpcb(inp); if (tp == 0) { rstreason = BANDLIM_RST_CLOSEDPORT; @@ -811,7 +890,6 @@ findpcb: else tiwin = th->th_win; - so = inp->inp_socket; if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { #if TCPDEBUG if (so->so_options & SO_DEBUG) { @@ -827,11 +905,10 @@ findpcb: } #endif if (so->so_options & SO_ACCEPTCONN) { - register struct tcpcb *tp0 = tp; + register struct tcpcb *tp0 = tp; struct socket *so2; -#if IPSEC struct socket *oso; -#endif + struct sockaddr_storage from; #if INET6 struct inpcb *oinp = sotoinpcb(so); #endif /* INET6 */ @@ -900,31 +977,51 @@ findpcb: } } #endif - - so2 = sonewconn(so, 0); + if (so->so_filt) { + if (isipv6) { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6*)&from; + + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = th->th_sport; + sin6->sin6_flowinfo = 0; + sin6->sin6_addr = ip6->ip6_src; + sin6->sin6_scope_id = 0; + } else { + struct sockaddr_in *sin = (struct sockaddr_in*)&from; + + sin->sin_len = sizeof(*sin); + sin->sin_family = AF_INET; + sin->sin_port = th->th_sport; + sin->sin_addr = ip->ip_src; + } + so2 = sonewconn(so, 0, (struct sockaddr*)&from); + } else { + so2 = sonewconn(so, 0, NULL); + } if (so2 == 0) { tcpstat.tcps_listendrop++; - so2 = sodropablereq(so); - if (so2) { - if (tcp_lq_overflow) - sototcpcb(so2)->t_flags |= - TF_LQ_OVERFLOW; - tcp_drop(sototcpcb(so2), ETIMEDOUT); - so2 = sonewconn(so, 0); + if (tcpdropdropablreq(so)) { + if (so->so_filt) + so2 = sonewconn(so, 0, (struct sockaddr*)&from); + else + so2 = sonewconn(so, 0, NULL); } - if (!so2) + if (!so2) goto drop; } /* * Make sure listening socket did not get closed during socket allocation, - * not only this is incorrect but it is know to cause panic - */ + * not only this is incorrect but it is know to cause panic + */ if (so->so_gencnt != ogencnt) goto drop; -#if IPSEC + oso = so; -#endif + tcp_unlock(so, 0, 0); /* Unlock but keep a reference on listener for now */ + so = so2; + tcp_lock(so, 1, 0); /* * This is ugly, but .... * @@ -950,7 +1047,7 @@ findpcb: } #endif /* INET6 */ inp->inp_lport = th->th_dport; - if (in_pcbinshash(inp) != 0) { + if (in_pcbinshash(inp, 0) != 0) { /* * Undo the assignments above if we failed to * put the PCB on the hash lists. @@ -962,6 +1059,8 @@ findpcb: #endif /* INET6 */ inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; + tcp_lock(oso, 0, 0); /* release ref on parent */ + tcp_unlock(oso, 1, 0); goto drop; } #if IPSEC @@ -978,6 +1077,8 @@ findpcb: * Note: dropwithreset makes sure we don't * send a RST in response to a RST. 
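The socket lookup above follows a consistent idiom: find the pcb, take the socket lock, then re-check the pcb state because it may have been marked for teardown in the window between the two steps. A user-space sketch of that shape, with a pthread mutex standing in for the socket lock:

#include <pthread.h>
#include <stddef.h>

struct conn {
	pthread_mutex_t lock;
	int dying;	/* set while the connection is being torn down */
};

/* Returns the connection locked, or NULL if it died before we got in. */
static struct conn *
lookup_and_lock(struct conn *c)
{
	if (c == NULL)
		return NULL;
	pthread_mutex_lock(&c->lock);
	if (c->dying) {
		/* Lost the race: pretend we never found it. */
		pthread_mutex_unlock(&c->lock);
		return NULL;
	}
	return c;
}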
*/ + tcp_lock(oso, 0, 0); /* release ref on parent */ + tcp_unlock(oso, 1, 0); if (thflags & TH_ACK) { tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; @@ -1010,22 +1111,26 @@ findpcb: } else #endif /* INET6 */ inp->inp_options = ip_srcroute(); + tcp_lock(oso, 0, 0); #if IPSEC /* copy old policy into new socket's */ if (sotoinpcb(oso)->inp_sp) { int error = 0; + lck_mtx_lock(sadb_mutex); /* Is it a security hole here to silently fail to copy the policy? */ if (inp->inp_sp != NULL) error = ipsec_init_policy(so, &inp->inp_sp); if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp)) printf("tcp_input: could not copy policy\n"); + lck_mtx_unlock(sadb_mutex); } #endif + tcp_unlock(oso, 1, 0); /* now drop the reference on the listener */ tp = intotcpcb(inp); tp->t_state = TCPS_LISTEN; tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY); - + tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl; /* Compute proper scaling value from buffer space */ while (tp->request_r_scale < TCP_MAX_WINSHIFT && TCP_MAXWIN << tp->request_r_scale < @@ -1036,6 +1141,68 @@ findpcb: } } +#if 1 + lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); +#endif + /* + * Radar 3529618 + * This is the second part of the MSS DoS prevention code (after + * minmss on the sending side) and it deals with too many too small + * tcp packets in a too short timeframe (1 second). + * + * For every full second we count the number of received packets + * and bytes. If we get a lot of packets per second for this connection + * (tcp_minmssoverload) we take a closer look at it and compute the + * average packet size for the past second. If that is less than + * tcp_minmss we get too many packets with very small payload which + * is not good and burdens our system (and every packet generates + * a wakeup to the process connected to our socket). We can reasonable + * expect this to be small packet DoS attack to exhaust our CPU + * cycles. + * + * Care has to be taken for the minimum packet overload value. This + * value defines the minimum number of packets per second before we + * start to worry. This must not be too low to avoid killing for + * example interactive connections with many small packets like + * telnet or SSH. + * + * Setting either tcp_minmssoverload or tcp_minmss to "0" disables + * this check. + * + * Account for packet if payload packet, skip over ACK, etc. + */ + if (tcp_minmss && tcp_minmssoverload && + tp->t_state == TCPS_ESTABLISHED && tlen > 0) { + if (tp->rcv_reset > tcp_now) { + tp->rcv_pps++; + tp->rcv_byps += tlen + off; + if (tp->rcv_pps > tcp_minmssoverload) { + if ((tp->rcv_byps / tp->rcv_pps) < tcp_minmss) { + char ipstrbuf[MAX_IPv6_STR_LEN]; + printf("too many small tcp packets from " + "%s:%u, av. %lubyte/packet, " + "dropping connection\n", +#ifdef INET6 + isipv6 ? + inet_ntop(AF_INET6, &inp->in6p_faddr, ipstrbuf, + sizeof(ipstrbuf)) : +#endif + inet_ntop(AF_INET, &inp->inp_faddr, ipstrbuf, + sizeof(ipstrbuf)), + inp->inp_fport, + tp->rcv_byps / tp->rcv_pps); + tp = tcp_drop(tp, ECONNRESET); +/* tcpstat.tcps_minmssdrops++; */ + goto drop; + } + } + } else { + tp->rcv_reset = tcp_now + PR_SLOWHZ; + tp->rcv_pps = 1; + tp->rcv_byps = tlen + off; + } + } + /* * Segment received on connection. * Reset idle time and keep-alive timer. 
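A stand-alone restatement of the small-packet flood check introduced above: count packets and bytes over a one-second window, and once the packet count crosses a threshold, compare the average payload with the minimum acceptable MSS. The constants mirror TCP_MINMSS and TCP_MINMSSOVERLOAD but are illustrative here.

#include <stdint.h>

#define MINMSS		216	/* mirrors TCP_MINMSS */
#define MINMSSOVERLOAD	1000	/* packets/second before we check */

struct rate {
	uint32_t reset;	/* end of the current one-second window */
	uint32_t pps;	/* packets seen in the window */
	uint32_t byps;	/* bytes seen in the window */
};

/* Returns nonzero if the connection should be dropped. */
static int
small_packet_flood(struct rate *r, uint32_t now, uint32_t len)
{
	if (r->reset > now) {
		r->pps++;
		r->byps += len;
		if (r->pps > MINMSSOVERLOAD && (r->byps / r->pps) < MINMSS)
			return 1;	/* many packets, tiny average payload */
	} else {
		r->reset = now + 1;	/* open a new one-second window */
		r->pps = 1;
		r->byps = len;
	}
	return 0;
}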
@@ -1144,9 +1311,10 @@ findpcb: else if (tp->t_timer[TCPT_PERSIST] == 0) tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; - if (so->so_snd.sb_cc) + sowwakeup(so); /* has to be done with socket lock held */ + if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW)) (void) tcp_output(tp); - sowwakeup(so); + tcp_unlock(so, 1, 0); KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); return; } @@ -1167,7 +1335,8 @@ findpcb: * Add data to socket buffer. */ m_adj(m, drop_hdrlen); /* delayed header drop */ - sbappend(&so->so_rcv, m); + if (sbappend(&so->so_rcv, m)) + sorwakeup(so); #if INET6 if (isipv6) { KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport), @@ -1181,14 +1350,13 @@ findpcb: (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)), th->th_seq, th->th_ack, th->th_win); } - if (tcp_delack_enabled) { - TCP_DELACK_BITSET(tp->t_inpcb->hash_element); + if (DELAY_ACK(tp)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); } - sorwakeup(so); + tcp_unlock(so, 1, 0); KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); return; } @@ -1200,6 +1368,9 @@ findpcb: * Receive window is amount of space in rcv queue, * but not less than advertised window. */ +#if 1 + lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); +#endif { int win; win = sbspace(&so->so_rcv); @@ -1234,7 +1405,10 @@ findpcb: register struct sockaddr_in6 *sin6; #endif - if (thflags & TH_RST) +#if 1 + lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); +#endif + if (thflags & TH_RST) goto drop; if (thflags & TH_ACK) { rstreason = BANDLIM_RST_OPENPORT; @@ -1299,6 +1473,9 @@ findpcb: } else #endif { +#if 1 + lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); +#endif MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_NOWAIT); if (sin == NULL) @@ -1368,7 +1545,7 @@ findpcb: * segment. Otherwise must send ACK now in case * the other side is slow starting. */ - if (tcp_delack_enabled && ((thflags & TH_FIN) || + if (DELAY_ACK(tp) && ((thflags & TH_FIN) || (tlen != 0 && #if INET6 (isipv6 && in6_localaddr(&inp->in6p_faddr)) @@ -1380,11 +1557,11 @@ findpcb: ) #endif /* INET6 */ ))) { - TCP_DELACK_BITSET(tp->t_inpcb->hash_element); tp->t_flags |= (TF_DELACK | TF_NEEDSYN); } - else + else { tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); + } /* * Limit the `virtual advertised window' to TCP_MAXWIN @@ -1523,12 +1700,12 @@ findpcb: * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ - if (tcp_delack_enabled && tlen != 0) { - TCP_DELACK_BITSET(tp->t_inpcb->hash_element); + if (DELAY_ACK(tp) && tlen != 0) { tp->t_flags |= TF_DELACK; } - else + else { tp->t_flags |= TF_ACKNOW; + } /* * Received <SYN,ACK> in SYN_SENT[*] state. * Transitions: @@ -1634,6 +1811,7 @@ trimthenstep6: } if (CC_GT(to.to_cc, tp->cc_recv)) { tp = tcp_close(tp); + tcp_unlock(so, 1, 50); goto findpcb; } else @@ -1744,6 +1922,9 @@ trimthenstep6: goto drop; } +#if 1 + lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); +#endif /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. 
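The PAWS test referenced above compares 32-bit timestamps with serial-number arithmetic, so "older" means a negative signed difference modulo 2^32. A minimal sketch:

#include <stdint.h>

#define TSTMP_LT(a, b)	((int32_t)((a) - (b)) < 0)

static int
paws_reject(uint32_t ts_val, uint32_t ts_recent)
{
	/* Reject the segment if its timestamp predates what we've seen. */
	return ts_recent != 0 && TSTMP_LT(ts_val, ts_recent);
}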
@@ -1873,6 +2054,7 @@ trimthenstep6: SEQ_GT(th->th_seq, tp->rcv_nxt)) { iss = tcp_new_isn(tp); tp = tcp_close(tp); + tcp_unlock(so, 1, 0); goto findpcb; } /* @@ -2203,10 +2385,17 @@ process_ACK: tp->snd_wnd -= acked; ourfinisacked = 0; } + sowwakeup(so); + /* detect una wraparound */ + if (SEQ_GEQ(tp->snd_una, tp->snd_recover) && + SEQ_LT(th->th_ack, tp->snd_recover)) + tp->snd_recover = th->th_ack; + if (SEQ_GT(tp->snd_una, tp->snd_high) && + SEQ_LEQ(th->th_ack, tp->snd_high)) + tp->snd_high = th->th_ack - 1; tp->snd_una = th->th_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; - sowwakeup(so); switch (tp->t_state) { @@ -2230,6 +2419,7 @@ process_ACK: } add_to_time_wait(tp); tp->t_state = TCPS_FIN_WAIT_2; + goto drop; } break; @@ -2389,25 +2579,19 @@ dodata: /* XXX */ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && TCPS_HAVEESTABLISHED(tp->t_state)) { -#ifdef __APPLE__ - if (tcp_delack_enabled) { - TCP_DELACK_BITSET(tp->t_inpcb->hash_element); + if (DELAY_ACK(tp) && ((tp->t_flags & TF_ACKNOW) == 0)) { tp->t_flags |= TF_DELACK; } -#else - if (DELAY_ACK(tp)) - callout_reset(tp->tt_delack, tcp_delacktime, - tcp_timer_delack, tp); -#endif - else + else { tp->t_flags |= TF_ACKNOW; + } tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); - sbappend(&so->so_rcv, m); - sorwakeup(so); + if (sbappend(&so->so_rcv, m)) + sorwakeup(so); } else { thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; @@ -2456,12 +2640,12 @@ dodata: /* XXX */ * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ - if (tcp_delack_enabled && (tp->t_flags & TF_NEEDSYN)) { - TCP_DELACK_BITSET(tp->t_inpcb->hash_element); + if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN)) { tp->t_flags |= TF_DELACK; } - else + else { tp->t_flags |= TF_ACKNOW; + } tp->rcv_nxt++; } switch (tp->t_state) { @@ -2527,6 +2711,7 @@ dodata: /* XXX */ */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tcp_output(tp); + tcp_unlock(so, 1, 0); KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); return; @@ -2560,9 +2745,11 @@ dropafterack: m_freem(m); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); + tcp_unlock(so, 1, 0); KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); return; - +dropwithresetnosock: + nosock = 1; dropwithreset: /* * Generate a RST, dropping incoming segment. @@ -2610,11 +2797,17 @@ dropwithreset: (tcp_seq)0, TH_RST|TH_ACK); } /* destroy temporarily created socket */ - if (dropsocket) - (void) soabort(so); + if (dropsocket) { + (void) soabort(so); + tcp_unlock(so, 1, 0); + } + else + if ((inp != NULL) && (nosock == 0)) + tcp_unlock(so, 1, 0); KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); return; - +dropnosock: + nosock = 1; drop: /* * Drop space held by incoming segment and return. 
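The snd_una/snd_recover bookkeeping above relies on the same modular comparisons. An illustrative sketch of the macros and the wraparound adjustment, with names matching the diff:

#include <stdint.h>

typedef uint32_t tcp_seq;

#define SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
#define SEQ_GEQ(a, b)	((int32_t)((a) - (b)) >= 0)

/* Before advancing snd_una to 'ack', pull snd_recover back if the
 * acknowledged point wrapped past it. */
static tcp_seq
update_recover(tcp_seq snd_una, tcp_seq snd_recover, tcp_seq ack)
{
	if (SEQ_GEQ(snd_una, snd_recover) && SEQ_LT(ack, snd_recover))
		return ack;
	return snd_recover;
}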
@@ -2626,8 +2819,13 @@ drop:
  */
 #endif
 	m_freem(m);
 	/* destroy temporarily created socket */
-	if (dropsocket)
-		(void) soabort(so);
+	if (dropsocket) {
+		(void) soabort(so);
+		tcp_unlock(so, 1, 0);
+	}
+	else
+		if (nosock == 0)
+			tcp_unlock(so, 1, 0);
 	KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
 	return;
 }
@@ -3188,6 +3386,7 @@ tcp_newreno(tp, th)
 	 * is called)
 	 */
 	tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
+	tp->t_flags |= TF_ACKNOW;
 	(void) tcp_output(tp);
 	tp->snd_cwnd = ocwnd;
 	if (SEQ_GT(onxt, tp->snd_nxt))
@@ -3201,3 +3400,78 @@
 	}
 	return (0);
 }
+
+/*
+ * Drop a random TCP connection that hasn't been serviced yet and
+ * is eligible for discard. There is a one in qlen chance that
+ * we will return null, saying that there are no droppable
+ * requests. In this case, the protocol-specific code should drop
+ * the new request. This ensures fairness.
+ *
+ * The listening TCP socket "head" must be locked.
+ */
+static int
+tcpdropdropablreq(struct socket *head)
+{
+	struct socket *so;
+	unsigned int i, j, qlen;
+	static int rnd;
+	static struct timeval old_runtime;
+	static unsigned int cur_cnt, old_cnt;
+	struct timeval tv;
+	struct inpcb *inp = NULL;
+
+	microtime(&tv);
+	if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) {
+		old_runtime = tv;
+		old_cnt = cur_cnt / i;
+		cur_cnt = 0;
+	}
+
+	so = TAILQ_FIRST(&head->so_incomp);
+	if (!so)
+		return 0;
+
+	qlen = head->so_incqlen;
+	if (++cur_cnt > qlen || old_cnt > qlen) {
+		rnd = (314159 * rnd + 66329) & 0xffff;
+		j = ((qlen + 1) * rnd) >> 16;
+
+		while (j-- && so)
+			so = TAILQ_NEXT(so, so_list);
+	}
+	/* Find a connection that is not already closing */
+	while (so) {
+		inp = (struct inpcb *)so->so_pcb;
+
+		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING)
+			break;
+
+		so = TAILQ_NEXT(so, so_list);
+	}
+	if (!so)
+		return 0;
+
+	/* Let's remove this connection from the incomplete list */
+	tcp_lock(so, 1, 0);
+
+	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
+		tcp_unlock(so, 1, 0);
+		return 0;
+	}
+	sototcpcb(so)->t_flags |= TF_LQ_OVERFLOW;
+	head->so_incqlen--;
+	head->so_qlen--;
+	so->so_head = NULL;
+	TAILQ_REMOVE(&head->so_incomp, so, so_list);
+	so->so_usecount--;	/* No more held by so_head */
+
+	tcp_drop(sototcpcb(so), ETIMEDOUT);
+
+	tcp_unlock(so, 1, 0);
+
+	return 1;
+}
diff --git a/bsd/netinet/tcp_output.c b/bsd/netinet/tcp_output.c
index e9fd4814e..36e310fd1 100644
--- a/bsd/netinet/tcp_output.c
+++ b/bsd/netinet/tcp_output.c
@@ -121,7 +121,14 @@ int tcp_do_newreno = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno,
 	0, "Enable NewReno Algorithms");
-struct mbuf *m_copym_with_hdrs __P((struct mbuf*, int, int, int, struct mbuf**, int*));
+int tcp_packet_chaining = 50;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, packetchain, CTLFLAG_RW, &tcp_packet_chaining,
+	0, "Enable TCP output packet chaining");
+
+struct mbuf *m_copym_with_hdrs(struct mbuf*, int, int, int, struct mbuf**, int*);
+static long packchain_newlist = 0;
+static long packchain_looped = 0;
+static long packchain_sent = 0;
 /* temporary: for testing */
@@ -131,7 +138,25 @@ extern int ipsec_bypass;
 extern int slowlink_wsize;	/* window correction for slow links */
 extern u_long route_generation;
+extern int fw_enable;		/* firewall is on: disable packet chaining */
+extern int ipsec_bypass;
+extern vm_size_t so_cache_zone_element_size;
+
+static __inline__ u_int16_t
+get_socket_id(struct socket * s)
+{
+	u_int16_t val;
+
+	if (so_cache_zone_element_size == 0) {
+		return (0);
+	}
+
val = (u_int16_t)(((u_int32_t)s) / so_cache_zone_element_size); + if (val == 0) { + val = 0xffff; + } + return (val); +} /* * Tcp output routine: figure out what should be sent and send it. @@ -152,7 +177,7 @@ tcp_output(tp) register struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; - int idle, sendalot; + int idle, sendalot, howmuchsent = 0; int maxburst = TCP_MAXBURST; struct rmxp_tao *taop; struct rmxp_tao tao_noncached; @@ -160,9 +185,13 @@ tcp_output(tp) int m_off; struct mbuf *m_last = 0; struct mbuf *m_head = 0; + struct mbuf *packetlist = 0; + struct mbuf *lastpacket = 0; #if INET6 int isipv6 = tp->t_inpcb->inp_vflag & INP_IPV6 ; #endif + short packchain_listadd = 0; + u_int16_t socket_id = get_socket_id(so); /* @@ -172,11 +201,7 @@ tcp_output(tp) * to send, then transmit; otherwise, investigate further. */ idle = (tp->snd_max == tp->snd_una); -#ifdef __APPLE__ if (idle && tp->t_rcvtime >= tp->t_rxtcur) { -#else - if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { -#endif /* * We have been idle for "a while" and no acks are * expected to clock out any data we send -- @@ -231,7 +256,7 @@ again: if ((tp->t_inpcb->inp_route.ro_rt != NULL && (tp->t_inpcb->inp_route.ro_rt->generation_id != route_generation)) || (tp->t_inpcb->inp_route.ro_rt == NULL)) { /* check that the source address is still valid */ - if (ifa_foraddr(tp->t_inpcb->inp_laddr.s_addr) == NULL) { + if (ifa_foraddr(tp->t_inpcb->inp_laddr.s_addr) == 0) { if (tp->t_state >= TCPS_CLOSE_WAIT) { tcp_close(tp); return(EADDRNOTAVAIL); @@ -250,6 +275,11 @@ again: } } + if (packetlist) { + error = ip_output_list(packetlist, packchain_listadd, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + (so->so_options & SO_DONTROUTE), 0); + tp->t_lastchain = 0; + } if (so->so_flags & SOF_NOADDRAVAIL) return(EADDRNOTAVAIL); else @@ -323,6 +353,11 @@ again: off--, len++; if (len > 0 && tp->t_state == TCPS_SYN_SENT && taop->tao_ccsent == 0) { + if (packetlist) { + error = ip_output_list(packetlist, packchain_listadd, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + (so->so_options & SO_DONTROUTE), 0); + tp->t_lastchain = 0; + } KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return 0; } @@ -363,6 +398,7 @@ again: } if (len > tp->t_maxseg) { len = tp->t_maxseg; + howmuchsent += len; sendalot = 1; } if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) @@ -469,8 +505,13 @@ again: } /* - * No reason to send a segment, just return. + * If there is no reason to send a segment, just return. + * but if there is some packets left in the packet list, send them now. */ + if (packetlist) { + error = ip_output_list(packetlist, packchain_listadd, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + (so->so_options & SO_DONTROUTE), 0); + } KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (0); @@ -634,6 +675,7 @@ send: */ flags &= ~TH_FIN; len = tp->t_maxopd - optlen - ipoptlen; + howmuchsent += len; sendalot = 1; } @@ -798,7 +840,7 @@ send: m->m_data += max_linkhdr; m->m_len = hdrlen; } - m->m_pkthdr.rcvif = (struct ifnet *)0; + m->m_pkthdr.rcvif = 0; #if INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); @@ -864,6 +906,20 @@ send: win = (long)TCP_MAXWIN << tp->rcv_scale; th->th_win = htons((u_short) (win>>tp->rcv_scale)); } + + /* + * Adjust the RXWIN0SENT flag - indicate that we have advertised + * a 0 window. This may cause the remote transmitter to stall. This + * flag tells soreceive() to disable delayed acknowledgements when + * draining the buffer. 
This can occur if the receiver is attempting + * to read more data then can be buffered prior to transmitting on + * the connection. + */ + if (win == 0) + tp->t_flags |= TF_RXWIN0SENT; + else + tp->t_flags &= ~TF_RXWIN0SENT; + if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; @@ -994,10 +1050,11 @@ send: goto out; } #endif /*IPSEC*/ + m->m_pkthdr.socket_id = socket_id; error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &tp->t_inpcb->in6p_route, - (so->so_options & SO_DONTROUTE), NULL, NULL); + (so->so_options & SO_DONTROUTE), NULL, NULL, 0); } else #endif /* INET6 */ { @@ -1050,9 +1107,49 @@ send: if (ipsec_bypass == 0) ipsec_setsocket(m, so); #endif /*IPSEC*/ - error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, - (so->so_options & SO_DONTROUTE), 0); - } + + /* + * The socket is kept locked while sending out packets in ip_output, even if packet chaining is not active. + */ + + m->m_pkthdr.socket_id = socket_id; + if (packetlist) { + m->m_nextpkt = NULL; + lastpacket->m_nextpkt = m; + lastpacket = m; + packchain_listadd++; + } + else { + m->m_nextpkt = NULL; + packchain_newlist++; + packetlist = lastpacket = m; + packchain_listadd=0; + } + + if ((ipsec_bypass == 0) || fw_enable || sendalot == 0 || (tp->t_state != TCPS_ESTABLISHED) || + (tp->snd_cwnd <= (tp->snd_wnd / 4)) || + (tp->t_flags & (TH_PUSH | TF_ACKNOW)) || tp->t_force != 0 || + packchain_listadd >= tcp_packet_chaining) { + lastpacket->m_nextpkt = 0; + error = ip_output_list(packetlist, packchain_listadd, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + (so->so_options & SO_DONTROUTE), 0); + tp->t_lastchain = packchain_listadd; + packchain_sent++; + packetlist = NULL; + if (error == 0) + howmuchsent = 0; + } + else { + error = 0; + packchain_looped++; + tcpstat.tcps_sndtotal++; + if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) + tp->rcv_adv = tp->rcv_nxt + win; + tp->last_ack_sent = tp->rcv_nxt; + tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); + goto again; + } + } if (error) { /* @@ -1064,15 +1161,19 @@ send: * No need to check for TH_FIN here because * the TF_SENTFIN flag handles that case. */ - if ((flags & TH_SYN) == 0) - tp->snd_nxt -= len; + if ((flags & TH_SYN) == 0) + tp->snd_nxt -= howmuchsent; } + howmuchsent = 0; out: if (error == ENOBUFS) { if (!tp->t_timer[TCPT_REXMT] && !tp->t_timer[TCPT_PERSIST]) tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; tcp_quench(tp->t_inpcb, 0); + if (packetlist) + m_freem_list(packetlist); + tp->t_lastchain = 0; KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (0); } @@ -1084,18 +1185,28 @@ out: * not do so here. 
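The packetlist logic above batches segments on m_nextpkt and flushes them with ip_output_list once the chain is long enough or no further segment is known to follow. A generic, illustrative sketch of that accumulate-and-flush shape, with the threshold mirroring the tcp_packet_chaining default:

#include <stddef.h>

struct pkt {
	struct pkt *nextpkt;
	/* payload elided */
};

#define CHAIN_MAX 50	/* mirrors the tcp_packet_chaining default */

struct chain {
	struct pkt *head, *tail;
	int count;
};

/* Returns nonzero when the caller should flush (send) the chain. */
static int
chain_append(struct chain *c, struct pkt *p, int more_to_come)
{
	p->nextpkt = NULL;
	if (c->head == NULL)
		c->head = p;
	else
		c->tail->nextpkt = p;
	c->tail = p;
	c->count++;
	/* Flush when the batch is full, or nothing else is queued up. */
	return !more_to_come || c->count >= CHAIN_MAX;
}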
*/ tcp_mtudisc(tp->t_inpcb, 0); + if (packetlist) + m_freem_list(packetlist); + tp->t_lastchain = 0; KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return 0; } if ((error == EHOSTUNREACH || error == ENETDOWN) && TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; + if (packetlist) + m_freem_list(packetlist); + tp->t_lastchain = 0; KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (0); } + if (packetlist) + m_freem_list(packetlist); + tp->t_lastchain = 0; KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (error); } +sentit: tcpstat.tcps_sndtotal++; /* @@ -1109,8 +1220,8 @@ out: tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); - KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); - if (sendalot) + KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,0,0,0,0,0); + if (sendalot && (!tcp_do_newreno || --maxburst)) goto again; return (0); } @@ -1120,7 +1231,6 @@ tcp_setpersist(tp) register struct tcpcb *tp; { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; - int tt; if (tp->t_timer[TCPT_REXMT]) panic("tcp_setpersist: retransmit pending"); diff --git a/bsd/netinet/tcp_seq.h b/bsd/netinet/tcp_seq.h index 57f1c9262..5023c0a18 100644 --- a/bsd/netinet/tcp_seq.h +++ b/bsd/netinet/tcp_seq.h @@ -85,7 +85,7 @@ /* Macro to increment a CC: skip 0 which has a special meaning */ #define CC_INC(c) (++(c) == 0 ? ++(c) : (c)) -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE /* * Macros to initialize tcp sequence numbers for * send and receive from initial send and receive @@ -96,15 +96,11 @@ #define tcp_sendseqinit(tp) \ (tp)->snd_una = (tp)->snd_nxt = (tp)->snd_max = (tp)->snd_up = \ - (tp)->iss + (tp)->snd_recover = (tp)->snd_high = (tp)->iss #define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ) /* timestamp wrap-around time */ -#endif /* __APPLE_API_PRIVATE */ -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE extern tcp_cc tcp_ccgen; /* global connection count */ -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#endif KERNEL_PRIVATE #endif /* _NETINET_TCP_SEQ_H_ */ diff --git a/bsd/netinet/tcp_subr.c b/bsd/netinet/tcp_subr.c index cb7bda653..0d8a16867 100644 --- a/bsd/netinet/tcp_subr.c +++ b/bsd/netinet/tcp_subr.c @@ -67,11 +67,13 @@ #include <sys/domain.h> #endif #include <sys/proc.h> +#include <sys/kauth.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/protosw.h> #include <sys/random.h> #include <sys/syslog.h> +#include <kern/locks.h> @@ -120,10 +122,12 @@ #define DBG_FNC_TCP_CLOSE NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2)) +extern int tcp_lq_overflow; /* temporary: for testing */ #if IPSEC extern int ipsec_bypass; +extern lck_mtx_t *sadb_mutex; #endif int tcp_mssdflt = TCP_MSS; @@ -149,6 +153,23 @@ int tcp_minmss = TCP_MINMSS; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, &tcp_minmss , 0, "Minmum TCP Maximum Segment Size"); +/* + * Number of TCP segments per second we accept from remote host + * before we start to calculate average segment size. If average + * segment size drops below the minimum TCP MSS we assume a DoS + * attack and reset+drop the connection. Care has to be taken not to + * set this value too small to not kill interactive type connections + * (telnet, SSH) which send many small packets. 
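tcp_setpersist above seeds the persist timer from the smoothed RTT state; in the tcpcb's fixed-point units the base value is (srtt/4 + rttvar) / 2. A plain-integer sketch of that computation:

static int
persist_base(int srtt, int rttvar)
{
	/* srtt is kept scaled in the tcpcb; the >> 2 here mirrors the
	 * (tp->t_srtt >> 2) term in tcp_setpersist. */
	return ((srtt >> 2) + rttvar) >> 1;
}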
+ */ +#ifdef FIX_WORKAROUND_FOR_3894301 +__private_extern__ int tcp_minmssoverload = TCP_MINMSSOVERLOAD; +#else +__private_extern__ int tcp_minmssoverload = 0; +#endif +SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW, + &tcp_minmssoverload , 0, "Number of TCP Segments per Second allowed to" + "be under the MINMSS Size"); + static int tcp_do_rfc1323 = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); @@ -161,7 +182,7 @@ static int tcp_tcbhashsize = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); -static int do_tcpdrain = 1; +static int do_tcpdrain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); @@ -180,8 +201,8 @@ static int tcp_isn_reseed_interval = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); -static void tcp_cleartaocache __P((void)); -static void tcp_notify __P((struct inpcb *, int)); +static void tcp_cleartaocache(void); +static void tcp_notify(struct inpcb *, int); /* * Target size of TCP PCB hash tables. Must be a power of two. @@ -237,7 +258,7 @@ int get_tcp_str_size() return sizeof(struct tcpcb); } -int tcp_freeq __P((struct tcpcb *tp)); +int tcp_freeq(struct tcpcb *tp); /* @@ -249,6 +270,7 @@ tcp_init() int hashsize = TCBHASHSIZE; vm_size_t str_size; int i; + struct inpcbinfo *pcbinfo; tcp_ccgen = 1; tcp_cleartaocache(); @@ -260,11 +282,12 @@ tcp_init() tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; read_random(&tcp_now, sizeof(tcp_now)); - tcp_now = tcp_now & 0x7fffffffffffffff; /* Starts tcp internal 500ms clock at a random value */ + tcp_now = tcp_now & 0x7fffffff; /* Starts tcp internal 500ms clock at a random value */ LIST_INIT(&tcb); tcbinfo.listhead = &tcb; + pcbinfo = &tcbinfo; #ifndef __APPLE__ TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); #endif @@ -301,10 +324,29 @@ tcp_init() if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR - tcbinfo.last_pcb = 0; dummy_tcb.t_state = TCP_NSTATES; dummy_tcb.t_flags = 0; tcbinfo.dummy_cb = (caddr_t) &dummy_tcb; + + /* + * allocate lock group attribute and group for tcp pcb mutexes + */ + pcbinfo->mtx_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setdefault(pcbinfo->mtx_grp_attr); + pcbinfo->mtx_grp = lck_grp_alloc_init("tcppcb", pcbinfo->mtx_grp_attr); + + /* + * allocate the lock attribute for tcp pcb mutexes + */ + pcbinfo->mtx_attr = lck_attr_alloc_init(); + lck_attr_setdefault(pcbinfo->mtx_attr); + + if ((pcbinfo->mtx = lck_rw_alloc_init(pcbinfo->mtx_grp, pcbinfo->mtx_attr)) == NULL) { + printf("tcp_init: mutex not alloced!\n"); + return; /* pretty much dead if this fails... 
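The lock bring-up in tcp_init below (and in rip_init earlier in this patch) repeats one pattern: allocate a group attribute, a group, a lock attribute, then the read-write lock itself. A condensed, illustrative sketch using the same lck_* calls for a hypothetical subsystem:

#include <kern/locks.h>

struct ex_info {
	lck_grp_attr_t *mtx_grp_attr;
	lck_grp_t *mtx_grp;
	lck_attr_t *mtx_attr;
	lck_rw_t *mtx;
};

static int
ex_locks_init(struct ex_info *info, const char *name)
{
	info->mtx_grp_attr = lck_grp_attr_alloc_init();
	lck_grp_attr_setdefault(info->mtx_grp_attr);
	info->mtx_grp = lck_grp_alloc_init(name, info->mtx_grp_attr);

	info->mtx_attr = lck_attr_alloc_init();
	lck_attr_setdefault(info->mtx_attr);

	info->mtx = lck_rw_alloc_init(info->mtx_grp, info->mtx_attr);
	return (info->mtx == NULL) ? -1 : 0;	/* caller decides how fatal */
}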
*/ + } + + in_pcb_nat_init(&tcbinfo, AF_INET, IPPROTO_TCP, SOCK_STREAM); delack_bitmask = _MALLOC((4 * hashsize)/32, M_PCB, M_WAITOK); @@ -530,7 +572,7 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags) } m->m_len = tlen; m->m_pkthdr.len = tlen; - m->m_pkthdr.rcvif = (struct ifnet *) 0; + m->m_pkthdr.rcvif = 0; nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; @@ -571,7 +613,7 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags) #endif #if INET6 if (isipv6) { - (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL); + (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL, 0); if (ro6 == &sro6 && ro6->ro_rt) { rtfree(ro6->ro_rt); ro6->ro_rt = NULL; @@ -579,7 +621,7 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags) } else #endif /* INET6 */ { - (void) ip_output(m, NULL, ro, ipflags, NULL); + (void) ip_output_list(m, 0, NULL, ro, ipflags, NULL); if (ro == &sro && ro->ro_rt) { rtfree(ro->ro_rt); ro->ro_rt = NULL; @@ -731,7 +773,6 @@ tcp_close(tp) } } #endif - KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp,0,0,0,0); switch (tp->t_state) @@ -859,7 +900,7 @@ tcp_close(tp) * mark route for deletion if no information is * cached. */ - if ((tp->t_flags & TF_LQ_OVERFLOW) && + if ((tp->t_flags & TF_LQ_OVERFLOW) && tcp_lq_overflow && ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0)){ if (rt->rt_rmx.rmx_rtt == 0) rt->rt_flags |= RTF_DELCLONE; @@ -874,7 +915,6 @@ tcp_close(tp) inp->inp_saved_ppcb = (caddr_t) tp; #endif - inp->inp_ppcb = NULL; soisdisconnected(so); #if INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) @@ -908,6 +948,9 @@ tcp_freeq(tp) void tcp_drain() { +/* + * ###LD 05/19/04 locking issue, tcpdrain is disabled, deadlock situation with tcbinfo.mtx + */ if (do_tcpdrain) { struct inpcb *inpb; @@ -922,6 +965,7 @@ tcp_drain() * where we're really low on mbufs, this is potentially * usefull. */ + lck_rw_lock_exclusive(tcbinfo.mtx); for (inpb = LIST_FIRST(tcbinfo.listhead); inpb; inpb = LIST_NEXT(inpb, inp_list)) { if ((tcpb = intotcpcb(inpb))) { @@ -934,6 +978,7 @@ tcp_drain() } } } + lck_rw_done(tcbinfo.mtx); } } @@ -953,7 +998,7 @@ tcp_notify(inp, error) { struct tcpcb *tp; - if (inp == NULL) + if (inp == NULL || (inp->inp_state == INPCB_STATE_DEAD)) return; /* pcb is gone already */ tp = (struct tcpcb *)inp->inp_ppcb; @@ -993,66 +1038,73 @@ tcp_pcblist SYSCTL_HANDLER_ARGS * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ - if (req->oldptr == 0) { + lck_rw_lock_shared(tcbinfo.mtx); + if (req->oldptr == USER_ADDR_NULL) { n = tcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xtcpcb); + lck_rw_done(tcbinfo.mtx); return 0; } - if (req->newptr != 0) + if (req->newptr != USER_ADDR_NULL) { + lck_rw_done(tcbinfo.mtx); return EPERM; + } /* * OK, now we're committed to doing something. 
 */
-	s = splnet();
 	gencnt = tcbinfo.ipi_gencnt;
 	n = tcbinfo.ipi_count;
-	splx(s);
 
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
-	if (error)
+	if (error) {
+		lck_rw_done(tcbinfo.mtx);
 		return error;
+	}
 	/*
 	 * We are done if there is no pcb
 	 */
-	if (n == 0)
+	if (n == 0) {
+		lck_rw_done(tcbinfo.mtx);
 		return 0;
+	}
 
 	inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK);
-	if (inp_list == 0)
+	if (inp_list == 0) {
+		lck_rw_done(tcbinfo.mtx);
 		return ENOMEM;
+	}
 
-	s = splnet();
 	for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
 	     inp = LIST_NEXT(inp, inp_list)) {
 #ifdef __APPLE__
-		if (inp->inp_gencnt <= gencnt)
+		if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD)
 #else
 		if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp))
 #endif
 			inp_list[i++] = inp;
 	}
-	splx(s);
 	n = i;
 
 	error = 0;
 	for (i = 0; i < n; i++) {
 		inp = inp_list[i];
-		if (inp->inp_gencnt <= gencnt) {
+		if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) {
 			struct xtcpcb xt;
 			caddr_t inp_ppcb;
 			xt.xt_len = sizeof xt;
 			/* XXX should avoid extra copy */
-			bcopy(inp, &xt.xt_inp, sizeof *inp);
+			inpcb_to_compat(inp, &xt.xt_inp);
 			inp_ppcb = inp->inp_ppcb;
-			if (inp_ppcb != NULL)
+			if (inp_ppcb != NULL) {
 				bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
+			}
 			else
 				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
 			if (inp->inp_socket)
@@ -1068,14 +1120,13 @@ tcp_pcblist SYSCTL_HANDLER_ARGS
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
-		s = splnet();
 		xig.xig_gen = tcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = tcbinfo.ipi_count;
-		splx(s);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	FREE(inp_list, M_TEMP);
+	lck_rw_done(tcbinfo.mtx);
 	return error;
 }
 
@@ -1103,7 +1154,7 @@ tcp_getcred(SYSCTL_HANDLER_ARGS)
 		error = ENOENT;
 		goto out;
 	}
-	error = SYSCTL_OUT(req, inp->inp_socket->so_cred, sizeof(struct ucred));
+	error = SYSCTL_OUT(req, inp->inp_socket->so_cred, sizeof(*(kauth_cred_t)0));
 out:
 	splx(s);
 	return (error);
@@ -1150,7 +1201,7 @@ tcp6_getcred(SYSCTL_HANDLER_ARGS)
 		goto out;
 	}
 	error = SYSCTL_OUT(req, inp->inp_socket->so_cred,
-	    sizeof(struct ucred));
+	    sizeof(*(kauth_cred_t)0));
 out:
 	splx(s);
 	return (error);
@@ -1173,7 +1224,7 @@ tcp_ctlinput(cmd, sa, vip)
 	struct in_addr faddr;
 	struct inpcb *inp;
 	struct tcpcb *tp;
-	void (*notify) __P((struct inpcb *, int)) = tcp_notify;
+	void (*notify)(struct inpcb *, int) = tcp_notify;
 	tcp_seq icmp_seq;
 	int s;
 
@@ -1196,21 +1247,25 @@ tcp_ctlinput(cmd, sa, vip)
 	else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)
 		return;
 	if (ip) {
-		s = splnet();
 		th = (struct tcphdr *)((caddr_t)ip
 		    + (IP_VHL_HL(ip->ip_vhl) << 2));
 		inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
 		    ip->ip_src, th->th_sport, 0, NULL);
 		if (inp != NULL && inp->inp_socket != NULL) {
+			tcp_lock(inp->inp_socket, 1, 0);
+			if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
+				tcp_unlock(inp->inp_socket, 1, 0);
+				return;
+			}
 			icmp_seq = htonl(th->th_seq);
 			tp = intotcpcb(inp);
 			if (SEQ_GEQ(icmp_seq, tp->snd_una) &&
 			    SEQ_LT(icmp_seq, tp->snd_max))
 				(*notify)(inp, inetctlerrmap[cmd]);
+			tcp_unlock(inp->inp_socket, 1, 0);
 		}
-		splx(s);
 	} else
-		in_pcbnotifyall(&tcb, faddr, inetctlerrmap[cmd], notify);
+		in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
 }
 
 #if INET6
@@ -1221,7 +1276,7 @@ tcp6_ctlinput(cmd, sa, d)
 	void *d;
 {
 	struct tcphdr th;
-	void (*notify) __P((struct inpcb *, int)) = tcp_notify;
+	void (*notify)(struct inpcb *, int) = tcp_notify;
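/*
 * The tcp_pcblist changes above replace splnet() with a shared hold on
 * tcbinfo.mtx, which means every early return must now drop the lock
 * first. A self-contained user-space analogue of that idiom, with a
 * pthread rwlock standing in for lck_rw_t (names are illustrative):
 *
 *	#include <errno.h>
 *	#include <pthread.h>
 *
 *	static pthread_rwlock_t list_lock = PTHREAD_RWLOCK_INITIALIZER;
 *
 *	int
 *	pcblist_handler(const void *oldptr, const void *newptr)
 *	{
 *		pthread_rwlock_rdlock(&list_lock);	// shared: read-only walk
 *		if (oldptr == NULL) {			// size probe only
 *			pthread_rwlock_unlock(&list_lock);
 *			return 0;
 *		}
 *		if (newptr != NULL) {			// writes not allowed
 *			pthread_rwlock_unlock(&list_lock);
 *			return EPERM;
 *		}
 *		// ... copy entries out while the lock pins the list ...
 *		pthread_rwlock_unlock(&list_lock);
 *		return 0;
 *	}
 */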
struct ip6_hdr *ip6; struct mbuf *m; struct ip6ctlparam *ip6cp = NULL; @@ -1271,11 +1326,11 @@ tcp6_ctlinput(cmd, sa, d) bzero(&th, sizeof(th)); m_copydata(m, off, sizeof(*thp), (caddr_t)&th); - in6_pcbnotify(&tcb, sa, th.th_dport, + in6_pcbnotify(&tcbinfo, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, th.th_sport, cmd, notify); } else - in6_pcbnotify(&tcb, sa, 0, (struct sockaddr *)sa6_src, + in6_pcbnotify(&tcbinfo, sa, 0, (struct sockaddr *)sa6_src, 0, cmd, notify); } #endif /* INET6 */ @@ -1586,6 +1641,7 @@ ipsec_hdrsiz_tcp(tp) if (!m) return 0; + lck_mtx_lock(sadb_mutex); #if INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { ip6 = mtod(m, struct ip6_hdr *); @@ -1603,7 +1659,7 @@ ipsec_hdrsiz_tcp(tp) tcp_fillheaders(tp, ip, th); hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } - + lck_mtx_unlock(sadb_mutex); m_free(m); return hdrsiz; } @@ -1647,3 +1703,88 @@ static void tcp_cleartaocache() { } + +int +tcp_lock(so, refcount, lr) + struct socket *so; + int refcount; + int lr; +{ + int lr_saved; +#ifdef __ppc__ + if (lr == 0) { + __asm__ volatile("mflr %0" : "=r" (lr_saved)); + } + else lr_saved = lr; +#endif + + if (so->so_pcb) { + lck_mtx_lock(((struct inpcb *)so->so_pcb)->inpcb_mtx); + } + else { + panic("tcp_lock: so=%x NO PCB! lr=%x\n", so, lr_saved); + lck_mtx_lock(so->so_proto->pr_domain->dom_mtx); + } + + if (so->so_usecount < 0) + panic("tcp_lock: so=%x so_pcb=%x lr=%x ref=%x\n", + so, so->so_pcb, lr_saved, so->so_usecount); + + if (refcount) + so->so_usecount++; + so->reserved3 = (void *)lr_saved; + return (0); +} + +int +tcp_unlock(so, refcount, lr) + struct socket *so; + int refcount; + int lr; +{ + int lr_saved; +#ifdef __ppc__ + if (lr == 0) { + __asm__ volatile("mflr %0" : "=r" (lr_saved)); + } + else lr_saved = lr; +#endif + +#ifdef MORE_TCPLOCK_DEBUG + printf("tcp_unlock: so=%x sopcb=%x lock=%x ref=%x lr=%x\n", + so, so->so_pcb, ((struct inpcb *)so->so_pcb)->inpcb_mtx, so->so_usecount, lr_saved); +#endif + if (refcount) + so->so_usecount--; + + if (so->so_usecount < 0) + panic("tcp_unlock: so=%x usecount=%x\n", so, so->so_usecount); + if (so->so_pcb == NULL) { + panic("tcp_unlock: so=%x NO PCB usecount=%x lr=%x\n", so, so->so_usecount, lr_saved); + lck_mtx_unlock(so->so_proto->pr_domain->dom_mtx); + } + else { + lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_unlock(((struct inpcb *)so->so_pcb)->inpcb_mtx); + } + so->reserved4 = (void *)lr_saved; + return (0); +} + +lck_mtx_t * +tcp_getlock(so, locktype) + struct socket *so; + int locktype; +{ + struct inpcb *inp = sotoinpcb(so); + + if (so->so_pcb) { + if (so->so_usecount < 0) + panic("tcp_getlock: so=%x usecount=%x\n", so, so->so_usecount); + return(inp->inpcb_mtx); + } + else { + panic("tcp_getlock: so=%x NULL so_pcb\n", so); + return (so->so_proto->pr_domain->dom_mtx); + } +} diff --git a/bsd/netinet/tcp_timer.c b/bsd/netinet/tcp_timer.c index a99efeea7..b3c5388b9 100644 --- a/bsd/netinet/tcp_timer.c +++ b/bsd/netinet/tcp_timer.c @@ -64,6 +64,7 @@ #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/protosw.h> +#include <kern/locks.h> #include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */ @@ -154,11 +155,16 @@ int cur_tw_slot = 0; u_long *delack_bitmask; -void add_to_time_wait(tp) +void add_to_time_wait_locked(tp) struct tcpcb *tp; { int tw_slot; + /* pcb list should be locked when we get here */ +#if 0 + lck_mtx_assert(tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED); +#endif + LIST_REMOVE(tp->t_inpcb, inp_list); if (tp->t_timer[TCPT_2MSL] == 
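/*
 * In tcp_lock()/tcp_unlock() above, the __ppc__ mflr asm samples the
 * link register when the caller passes lr == 0, and the value is
 * stashed in so->reserved3/reserved4 as a "who touched this lock last"
 * breadcrumb for post-mortem debugging. A user-space analogue of the
 * same idea, using a GCC builtin; the struct and names are invented:
 *
 *	#include <pthread.h>
 *
 *	struct tagged_lock {
 *		pthread_mutex_t mtx;
 *		void *last_locker;	// debugging breadcrumb
 *	};
 *
 *	void
 *	lock_with_caller_tag(struct tagged_lock *tl)
 *	{
 *		void *caller = __builtin_return_address(0);
 *		pthread_mutex_lock(&tl->mtx);
 *		tl->last_locker = caller;
 *	}
 */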
0) @@ -172,6 +178,19 @@ void add_to_time_wait(tp) LIST_INSERT_HEAD(&time_wait_slots[tw_slot], tp->t_inpcb, inp_list); } +void add_to_time_wait(tp) + struct tcpcb *tp; +{ + struct inpcbinfo *pcbinfo = &tcbinfo; + + if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) { + tcp_unlock(tp->t_inpcb->inp_socket, 0, 0); + lck_rw_lock_exclusive(pcbinfo->mtx); + tcp_lock(tp->t_inpcb->inp_socket, 0, 0); + } + add_to_time_wait_locked(tp); + lck_rw_done(pcbinfo->mtx); +} @@ -182,49 +201,46 @@ void add_to_time_wait(tp) void tcp_fasttimo() { - register struct inpcb *inp; + struct inpcb *inp, *inpnxt; register struct tcpcb *tp; - register u_long i,j; - register u_long temp_mask; - register u_long elem_base = 0; - struct inpcbhead *head; - int s = splnet(); + struct inpcbinfo *pcbinfo = &tcbinfo; - static - int delack_checked = 0; + int delack_checked = 0, delack_done = 0; KERNEL_DEBUG(DBG_FNC_TCP_FAST | DBG_FUNC_START, 0,0,0,0,0); - if (!tcp_delack_enabled) + if (tcp_delack_enabled == 0) return; - for (i=0; i < (tcbinfo.hashsize / 32); i++) { - if (delack_bitmask[i]) { - temp_mask = 1; - for (j=0; j < 32; j++) { - if (temp_mask & delack_bitmask[i]) { - head = &tcbinfo.hashbase[elem_base + j]; - for (inp=head->lh_first; inp != 0; inp = inp->inp_hash.le_next) { - delack_checked++; - if ((tp = (struct tcpcb *)inp->inp_ppcb) && (tp->t_flags & TF_DELACK)) { - tp->t_flags &= ~TF_DELACK; - tp->t_flags |= TF_ACKNOW; - tcpstat.tcps_delack++; - (void) tcp_output(tp); - } - } - } - temp_mask <<= 1; + lck_rw_lock_shared(pcbinfo->mtx); + + /* Walk the list of valid tcpcbs and send ACKS on the ones with DELACK bit set */ + + for (inp = tcb.lh_first; inp != NULL; inp = inpnxt) { + inpnxt = inp->inp_list.le_next; + /* NOTE: it's OK to check the tp because the pcb can't be removed while we hold pcbinfo->mtx) */ + if ((tp = (struct tcpcb *)inp->inp_ppcb) && (tp->t_flags & TF_DELACK)) { + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) + continue; + tcp_lock(inp->inp_socket, 1, 0); + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { + tcp_unlock(inp->inp_socket, 1, 0); + continue; + } + if (tp->t_flags & TF_DELACK) { + delack_done++; + tp->t_flags &= ~TF_DELACK; + tp->t_flags |= TF_ACKNOW; + tcpstat.tcps_delack++; + (void) tcp_output(tp); } - delack_bitmask[i] = 0; - } - elem_base += 32; + tcp_unlock(inp->inp_socket, 1, 0); + } } - KERNEL_DEBUG(DBG_FNC_TCP_FAST | DBG_FUNC_END, delack_checked,tcpstat.tcps_delack,0,0,0); - splx(s); - + KERNEL_DEBUG(DBG_FNC_TCP_FAST | DBG_FUNC_END, delack_checked, delack_done, tcpstat.tcps_delack,0,0); + lck_rw_done(pcbinfo->mtx); } /* @@ -235,41 +251,54 @@ tcp_fasttimo() void tcp_slowtimo() { - register struct inpcb *ip, *ipnxt; - register struct tcpcb *tp; - register int i; - int s; + struct inpcb *inp, *inpnxt; + struct tcpcb *tp; + struct socket *so; + int i; #if TCPDEBUG int ostate; #endif #if KDEBUG static int tws_checked; #endif + struct inpcbinfo *pcbinfo = &tcbinfo; KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0,0,0,0,0); - s = splnet(); tcp_maxidle = tcp_keepcnt * tcp_keepintvl; - ip = tcb.lh_first; - if (ip == NULL) { - splx(s); - return; - } + lck_rw_lock_shared(pcbinfo->mtx); + /* * Search through tcb's and update active timers. 
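
   add_to_time_wait() above illustrates the lock-ordering rule in this
   rework: the pcb list lock is taken before any per-socket lock, so
   with the socket lock already held the code may only try for the list
   lock, and on failure must drop the socket lock, block, and retake
   both in order. A runnable user-space analogue, pthreads in place of
   lck_rw_t, function names invented:

	#include <pthread.h>

	static pthread_rwlock_t pcb_list_lock = PTHREAD_RWLOCK_INITIALIZER;

	void
	move_to_time_wait(pthread_mutex_t *socket_lock)
	{
		if (pthread_rwlock_trywrlock(&pcb_list_lock) != 0) {
			// would invert lock order: back off, reacquire in order
			pthread_mutex_unlock(socket_lock);
			pthread_rwlock_wrlock(&pcb_list_lock);
			pthread_mutex_lock(socket_lock);
			// NB: socket state may have changed while unlocked
		}
		// ... relink the pcb into its time-wait slot ...
		pthread_rwlock_unlock(&pcb_list_lock);
	}
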
*/ - for (; ip != NULL; ip = ipnxt) { - ipnxt = ip->inp_list.le_next; - tp = intotcpcb(ip); - if (tp == 0 || tp->t_state == TCPS_LISTEN) + for (inp = tcb.lh_first; inp != NULL; inp = inpnxt) { + inpnxt = inp->inp_list.le_next; + + if (in_pcb_checkstate(inp, WNT_ACQUIRE,0) == WNT_STOPUSING) continue; + + so = inp->inp_socket; + tcp_lock(so, 1, 0); + + if ((in_pcb_checkstate(inp, WNT_RELEASE,1) == WNT_STOPUSING) && so->so_usecount == 1) { + tcp_unlock(so, 1, 0); + continue; + } + tp = intotcpcb(inp); + if (tp == 0 || tp->t_state == TCPS_LISTEN) { + tcp_unlock(so, 1, 0); + continue; + } /* * Bogus state when port owned by SharedIP with loopback as the * only configured interface: BlueBox does not filters loopback */ - if (tp->t_state == TCP_NSTATES) - continue; + if (tp->t_state == TCP_NSTATES) { + tcp_unlock(so, 1, 0); + continue; + } + for (i = 0; i < TCPT_NTIMERS; i++) { if (tp->t_timer[i] && --tp->t_timer[i] == 0) { @@ -292,9 +321,9 @@ tcp_slowtimo() tp->t_rcvtime++; tp->t_starttime++; if (tp->t_rtttime) - tp->t_rtttime++; + tp->t_rtttime++; tpgone: - ; + tcp_unlock(so, 1, 0); } #if KDEBUG @@ -306,16 +335,27 @@ tpgone: * Process the items in the current time-wait slot */ - for (ip = time_wait_slots[cur_tw_slot].lh_first; ip; ip = ipnxt) + for (inp = time_wait_slots[cur_tw_slot].lh_first; inp; inp = inpnxt) { + inpnxt = inp->inp_list.le_next; #if KDEBUG tws_checked++; #endif - ipnxt = ip->inp_list.le_next; - tp = intotcpcb(ip); + + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) + continue; + + tcp_lock(inp->inp_socket, 1, 0); + + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) + goto twunlock; + + tp = intotcpcb(inp); if (tp == NULL) { /* tp already closed, remove from list */ - LIST_REMOVE(ip, inp_list); - continue; +#if TEMPDEBUG + printf("tcp_slowtimo: tp is null in time-wait slot!\n"); +#endif + goto twunlock; } if (tp->t_timer[TCPT_2MSL] >= N_TIME_WAIT_SLOTS) { tp->t_timer[TCPT_2MSL] -= N_TIME_WAIT_SLOTS; @@ -324,14 +364,77 @@ tpgone: else tp->t_timer[TCPT_2MSL] = 0; - if (tp->t_timer[TCPT_2MSL] == 0) - tp = tcp_timers(tp, TCPT_2MSL); + if (tp->t_timer[TCPT_2MSL] == 0) + tp = tcp_timers(tp, TCPT_2MSL); /* tp can be returned null if tcp_close is called */ +twunlock: + tcp_unlock(inp->inp_socket, 1, 0); + } + + if (lck_rw_lock_shared_to_exclusive(pcbinfo->mtx) != 0) + lck_rw_lock_exclusive(pcbinfo->mtx); /* Upgrade failed, lost lock no take it again exclusive */ + + + for (inp = tcb.lh_first; inp != NULL; inp = inpnxt) { + inpnxt = inp->inp_list.le_next; + /* Ignore nat/SharedIP dummy pcbs */ + if (inp->inp_socket == &tcbinfo.nat_dummy_socket) + continue; + + if (inp->inp_wantcnt != WNT_STOPUSING) + continue; + + so = inp->inp_socket; + if (!lck_mtx_try_lock(inp->inpcb_mtx)) {/* skip if in use */ +#if TEMPDEBUG + printf("tcp_slowtimo so=%x STOPUSING but locked...\n", so); +#endif + continue; + } + + if (so->so_usecount == 0) + in_pcbdispose(inp); + else { + tp = intotcpcb(inp); + /* Check for embryonic socket stuck on listener queue (4023660) */ + if ((so->so_usecount == 1) && (tp->t_state == TCPS_CLOSED) && + (so->so_head != NULL) && (so->so_state & SS_INCOMP)) { + so->so_usecount--; + in_pcbdispose(inp); + } else + lck_mtx_unlock(inp->inpcb_mtx); + } } + /* Now cleanup the time wait ones */ + for (inp = time_wait_slots[cur_tw_slot].lh_first; inp; inp = inpnxt) + { + inpnxt = inp->inp_list.le_next; + + if (inp->inp_wantcnt != WNT_STOPUSING) + continue; + + so = inp->inp_socket; + if (!lck_mtx_try_lock(inp->inpcb_mtx)) /* skip if in use */ + continue; + if 
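/*
 * The walk above follows the new pcb reference protocol: pin the pcb
 * with WNT_ACQUIRE while only the shared list lock is held, take the
 * socket lock, then drop the pin with WNT_RELEASE; a WNT_STOPUSING
 * answer at either step means the pcb is being torn down and must be
 * skipped. Condensed kernel-context skeleton, not standalone:
 *
 *	for (inp = tcb.lh_first; inp != NULL; inp = inpnxt) {
 *		inpnxt = inp->inp_list.le_next;
 *		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
 *			continue;		// dying: leave it alone
 *		tcp_lock(inp->inp_socket, 1, 0);
 *		if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
 *			tcp_unlock(inp->inp_socket, 1, 0);
 *			continue;		// died while we waited
 *		}
 *		// ... safe to touch the tcpcb here ...
 *		tcp_unlock(inp->inp_socket, 1, 0);
 *	}
 */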
(so->so_usecount == 0) + in_pcbdispose(inp); + else { + tp = intotcpcb(inp); + /* Check for embryonic socket stuck on listener queue (4023660) */ + if ((so->so_usecount == 1) && (tp->t_state == TCPS_CLOSED) && + (so->so_head != NULL) && (so->so_state & SS_INCOMP)) { + so->so_usecount--; + in_pcbdispose(inp); + } else + lck_mtx_unlock(inp->inpcb_mtx); + } + } + + tcp_now++; if (++cur_tw_slot >= N_TIME_WAIT_SLOTS) cur_tw_slot = 0; - tcp_now++; /* for timestamps */ - splx(s); + + lck_rw_done(pcbinfo->mtx); KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, cur_tw_slot,0,0,0); } @@ -376,6 +479,7 @@ tcp_timers(tp, timer) int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0; #endif /* INET6 */ + so_tmp = tp->t_inpcb->inp_socket; switch (timer) { @@ -388,11 +492,13 @@ tcp_timers(tp, timer) case TCPT_2MSL: if (tp->t_state != TCPS_TIME_WAIT && tp->t_rcvtime <= tcp_maxidle) { - tp->t_timer[TCPT_2MSL] = tcp_keepintvl; - add_to_time_wait(tp); + tp->t_timer[TCPT_2MSL] = (unsigned long)tcp_keepintvl; + add_to_time_wait_locked(tp); } - else + else { tp = tcp_close(tp); + return(tp); + } break; /* @@ -404,7 +510,6 @@ tcp_timers(tp, timer) if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; tcpstat.tcps_timeoutdrop++; - so_tmp = tp->t_inpcb->inp_socket; tp = tcp_drop(tp, tp->t_softerror ? tp->t_softerror : ETIMEDOUT); postevent(so_tmp, 0, EV_TIMEOUT); @@ -549,7 +654,7 @@ tcp_timers(tp, timer) if ((always_keepalive || tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { - if (tp->t_rcvtime >= TCP_KEEPIDLE(tp) + tcp_maxidle) + if (tp->t_rcvtime >= TCP_KEEPIDLE(tp) + (unsigned long)tcp_maxidle) goto dropit; /* * Send a packet designed to force a response @@ -583,7 +688,6 @@ tcp_timers(tp, timer) #endif dropit: tcpstat.tcps_keepdrops++; - so_tmp = tp->t_inpcb->inp_socket; tp = tcp_drop(tp, ETIMEDOUT); postevent(so_tmp, 0, EV_TIMEOUT); break; diff --git a/bsd/netinet/tcp_timer.h b/bsd/netinet/tcp_timer.h index e3f5282d5..e4979a078 100644 --- a/bsd/netinet/tcp_timer.h +++ b/bsd/netinet/tcp_timer.h @@ -106,10 +106,11 @@ * amount of time probing, then we drop the connection. */ +#ifdef PRIVATE + /* * Time constants. */ -#ifdef __APPLE_API_PRIVATE #define TCPTV_MSL ( 30*PR_SLOWHZ) /* max seg lifetime (hah!) */ #define TCPTV_SRTTBASE 0 /* base roundtrip time; if 0, no idea yet */ @@ -140,6 +141,7 @@ static char *tcptimers[] = { "REXMT", "PERSIST", "KEEP", "2MSL" }; #endif +#ifdef KERNEL /* * Force a time value to be in a certain range. */ @@ -151,8 +153,6 @@ static char *tcptimers[] = (tv) = (tvmax); \ } while(0) -#ifdef KERNEL - #define TCP_KEEPIDLE(tp) \ (tp->t_keepidle && (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) ? 
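/*
 * TCPT_RANGESET, shown just above, clamps a computed timer into
 * [tvmin, tvmax]. A standalone exercise of the macro; the definition
 * is reconstructed from the fragment visible in this hunk, and the
 * values are arbitrary:
 *
 *	#include <assert.h>
 *
 *	#define TCPT_RANGESET(tv, value, tvmin, tvmax) do { \
 *		(tv) = (value); \
 *		if ((tv) < (tvmin)) \
 *			(tv) = (tvmin); \
 *		else if ((tv) > (tvmax)) \
 *			(tv) = (tvmax); \
 *	} while(0)
 *
 *	int
 *	main(void)
 *	{
 *		int t;
 *		TCPT_RANGESET(t, 1, 2, 64);	// below range, clamps up
 *		assert(t == 2);
 *		TCPT_RANGESET(t, 99, 2, 64);	// above range, clamps down
 *		assert(t == 64);
 *		return 0;
 *	}
 */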
\ tp->t_keepidle : tcp_keepidle) @@ -167,13 +167,13 @@ extern int tcp_msl; extern int tcp_ttl; /* time to live for TCP segs */ extern int tcp_backoff[]; -void tcp_timer_2msl __P((void *xtp)); -void tcp_timer_keep __P((void *xtp)); -void tcp_timer_persist __P((void *xtp)); -void tcp_timer_rexmt __P((void *xtp)); -void tcp_timer_delack __P((void *xtp)); +void tcp_timer_2msl(void *xtp); +void tcp_timer_keep(void *xtp); +void tcp_timer_persist(void *xtp); +void tcp_timer_rexmt(void *xtp); +void tcp_timer_delack(void *xtp); #endif /* KERNEL */ -#endif /* __APPLE_API_PRIVATE */ - +#endif /* PRIVATE */ #endif /* !_NETINET_TCP_TIMER_H_ */ + diff --git a/bsd/netinet/tcp_usrreq.c b/bsd/netinet/tcp_usrreq.c index 246e031c0..d29331e28 100644 --- a/bsd/netinet/tcp_usrreq.c +++ b/bsd/netinet/tcp_usrreq.c @@ -104,17 +104,15 @@ */ extern char *tcpstates[]; /* XXX ??? */ -static int tcp_attach __P((struct socket *, struct proc *)); -static int tcp_connect __P((struct tcpcb *, struct sockaddr *, - struct proc *)); +static int tcp_attach(struct socket *, struct proc *); +static int tcp_connect(struct tcpcb *, struct sockaddr *, struct proc *); #if INET6 -static int tcp6_connect __P((struct tcpcb *, struct sockaddr *, - struct proc *)); +static int tcp6_connect(struct tcpcb *, struct sockaddr *, struct proc *); #endif /* INET6 */ static struct tcpcb * - tcp_disconnect __P((struct tcpcb *)); + tcp_disconnect(struct tcpcb *); static struct tcpcb * - tcp_usrclosed __P((struct tcpcb *)); + tcp_usrclosed(struct tcpcb *); #if TCPDEBUG #define TCPDEBUG0 int ostate = 0 @@ -134,7 +132,6 @@ static struct tcpcb * static int tcp_usr_attach(struct socket *so, int proto, struct proc *p) { - int s = splnet(); int error; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = 0; @@ -155,7 +152,6 @@ tcp_usr_attach(struct socket *so, int proto, struct proc *p) tp = sototcpcb(so); out: TCPDEBUG2(PRU_ATTACH); - splx(s); return error; } @@ -169,16 +165,17 @@ out: static int tcp_usr_detach(struct socket *so) { - int s = splnet(); int error = 0; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; TCPDEBUG0; - if (inp == 0) { - splx(s); + if (inp == 0 || (inp->inp_state == INPCB_STATE_DEAD)) { return EINVAL; /* XXX */ } +#if 1 + lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); +#endif tp = intotcpcb(inp); /* In case we got disconnected from the peer */ if (tp == 0) @@ -187,21 +184,19 @@ tcp_usr_detach(struct socket *so) tp = tcp_disconnect(tp); out: TCPDEBUG2(PRU_DETACH); - splx(s); return error; } #define COMMON_START() TCPDEBUG0; \ do { \ - if (inp == 0) { \ - splx(s); \ + if (inp == 0 || (inp->inp_state == INPCB_STATE_DEAD)) { \ return EINVAL; \ } \ tp = intotcpcb(inp); \ TCPDEBUG1(); \ } while(0) -#define COMMON_END(req) out: TCPDEBUG2(req); splx(s); return error; goto out +#define COMMON_END(req) out: TCPDEBUG2(req); return error; goto out /* @@ -210,7 +205,6 @@ out: static int tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p) { - int s = splnet(); int error = 0; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; @@ -239,7 +233,6 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p) static int tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p) { - int s = splnet(); int error = 0; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; @@ -285,7 +278,6 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p) static int tcp_usr_listen(struct socket *so, struct proc *p) { - int s = splnet(); int error = 0; struct inpcb *inp = 
sotoinpcb(so); struct tcpcb *tp; @@ -302,7 +294,6 @@ tcp_usr_listen(struct socket *so, struct proc *p) static int tcp6_usr_listen(struct socket *so, struct proc *p) { - int s = splnet(); int error = 0; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; @@ -330,7 +321,6 @@ tcp6_usr_listen(struct socket *so, struct proc *p) static int tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) { - int s = splnet(); int error = 0; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; @@ -362,7 +352,6 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) static int tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) { - int s = splnet(); int error = 0; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; @@ -419,11 +408,13 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) static int tcp_usr_disconnect(struct socket *so) { - int s = splnet(); int error = 0; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; - + +#if 1 + lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); +#endif COMMON_START(); /* In case we got disconnected from the peer */ if (tp == 0) @@ -440,7 +431,6 @@ tcp_usr_disconnect(struct socket *so) static int tcp_usr_accept(struct socket *so, struct sockaddr **nam) { - int s = splnet(); int error = 0; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = NULL; @@ -450,8 +440,7 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam) error = ECONNABORTED; goto out; } - if (inp == 0) { - splx(s); + if (inp == 0 || (inp->inp_state == INPCB_STATE_DEAD)) { return (EINVAL); } tp = intotcpcb(inp); @@ -464,7 +453,6 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam) static int tcp6_usr_accept(struct socket *so, struct sockaddr **nam) { - int s = splnet(); int error = 0; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = NULL; @@ -474,8 +462,7 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam) error = ECONNABORTED; goto out; } - if (inp == 0) { - splx(s); + if (inp == 0 || (inp->inp_state == INPCB_STATE_DEAD)) { return (EINVAL); } tp = intotcpcb(inp); @@ -490,7 +477,6 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam) static int tcp_usr_shutdown(struct socket *so) { - int s = splnet(); int error = 0; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; @@ -512,7 +498,6 @@ tcp_usr_shutdown(struct socket *so) static int tcp_usr_rcvd(struct socket *so, int flags) { - int s = splnet(); int error = 0; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; @@ -536,7 +521,6 @@ static int tcp_usr_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct proc *p) { - int s = splnet(); int error = 0; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; @@ -545,7 +529,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, #endif TCPDEBUG0; - if (inp == NULL) { + if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD) { /* * OOPS! 
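
   The lck_mtx_assert(..., LCK_MTX_ASSERT_OWNED) checks appearing in
   these handlers verify that the caller really did enter with the
   socket lock held, now that the splnet() brackets are gone. A
   self-contained user-space cousin of that assertion, built on an
   error-checking pthread mutex; all names invented for illustration:

	#include <assert.h>
	#include <errno.h>
	#include <pthread.h>

	static pthread_mutex_t so_lock;

	static void
	handler_expects_lock_held(void)
	{
		// with PTHREAD_MUTEX_ERRORCHECK, a relock attempt by the
		// owning thread fails with EDEADLK, proving ownership
		assert(pthread_mutex_lock(&so_lock) == EDEADLK);
	}

	int
	main(void)
	{
		pthread_mutexattr_t a;
		pthread_mutexattr_init(&a);
		pthread_mutexattr_settype(&a, PTHREAD_MUTEX_ERRORCHECK);
		pthread_mutex_init(&so_lock, &a);
		pthread_mutex_lock(&so_lock);
		handler_expects_lock_held();
		pthread_mutex_unlock(&so_lock);
		return 0;
	}
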
we lost a race, the TCP session got reset after * we checked SS_CANTSENDMORE, eg: while doing uiomove or a @@ -660,7 +644,6 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, static int tcp_usr_abort(struct socket *so) { - int s = splnet(); int error = 0; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; @@ -670,6 +653,7 @@ tcp_usr_abort(struct socket *so) if (tp == 0) goto out; tp = tcp_drop(tp, ECONNABORTED); + so->so_usecount--; COMMON_END(PRU_ABORT); } @@ -679,7 +663,6 @@ tcp_usr_abort(struct socket *so) static int tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) { - int s = splnet(); int error = 0; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; @@ -709,7 +692,7 @@ struct pr_usrreqs tcp_usrreqs = { tcp_usr_connect, pru_connect2_notsupp, in_control, tcp_usr_detach, tcp_usr_disconnect, tcp_usr_listen, in_setpeeraddr, tcp_usr_rcvd, tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown, - in_setsockaddr, sosend, soreceive, sopoll + in_setsockaddr, sosend, soreceive, pru_sopoll_notsupp }; #if INET6 @@ -718,7 +701,7 @@ struct pr_usrreqs tcp6_usrreqs = { tcp6_usr_connect, pru_connect2_notsupp, in6_control, tcp_usr_detach, tcp_usr_disconnect, tcp6_usr_listen, in6_mapped_peeraddr, tcp_usr_rcvd, tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown, - in6_mapped_sockaddr, sosend, soreceive, sopoll + in6_mapped_sockaddr, sosend, soreceive, pru_sopoll_notsupp }; #endif /* INET6 */ @@ -761,29 +744,51 @@ tcp_connect(tp, nam, p) error = in_pcbladdr(inp, nam, &ifaddr); if (error) return error; + + tcp_unlock(inp->inp_socket, 0, 0); oinp = in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port, inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr : ifaddr->sin_addr, inp->inp_lport, 0, NULL); + + tcp_lock(inp->inp_socket, 0, 0); if (oinp) { + tcp_lock(oinp->inp_socket, 1, 0); + if (in_pcb_checkstate(oinp, WNT_RELEASE, 1) == WNT_STOPUSING) { + tcp_unlock(oinp->inp_socket, 1, 0); + goto skip_oinp; + } + if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && otp->t_state == TCPS_TIME_WAIT && otp->t_starttime < tcp_msl && (otp->t_flags & TF_RCVD_CC)) otp = tcp_close(otp); - else + else { + printf("tcp_connect: inp=%x err=EADDRINUSE\n", inp); + tcp_unlock(oinp->inp_socket, 1, 0); return EADDRINUSE; + } + tcp_unlock(oinp->inp_socket, 1, 0); } +skip_oinp: if ((inp->inp_laddr.s_addr == INADDR_ANY ? ifaddr->sin_addr.s_addr : inp->inp_laddr.s_addr) == sin->sin_addr.s_addr && inp->inp_lport == sin->sin_port) return EINVAL; + if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) { + /*lock inversion issue, mostly with udp multicast packets */ + socket_unlock(inp->inp_socket, 0); + lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx); + socket_lock(inp->inp_socket, 0); + } if (inp->inp_laddr.s_addr == INADDR_ANY) inp->inp_laddr = ifaddr->sin_addr; inp->inp_faddr = sin->sin_addr; inp->inp_fport = sin->sin_port; in_pcbrehash(inp); + lck_rw_done(inp->inp_pcbinfo->mtx); /* Compute window scaling to request. 
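
   Note the shape tcp_connect() acquired above: the socket lock is
   dropped around in_pcblookup_hash(), because the lookup needs the pcb
   list lock and taking that while holding a socket lock would invert
   the established order (list lock first, then socket lock).
   Condensed from the hunk:

	tcp_unlock(inp->inp_socket, 0, 0);	// give up socket lock
	oinp = in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr,
	    sin->sin_port, laddr, inp->inp_lport, 0, NULL);
	tcp_lock(inp->inp_socket, 0, 0);	// retake before touching inp

   Anything read from inp before the unlock is stale afterwards, which
   is why the duplicate-connection check relocks oinp and re-runs
   in_pcb_checkstate() before trusting it.
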
*/ while (tp->request_r_scale < TCP_MAX_WINSHIFT && @@ -829,7 +834,7 @@ tcp6_connect(tp, nam, p) struct socket *so = inp->inp_socket; struct tcpcb *otp; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; - struct in6_addr *addr6; + struct in6_addr addr6; struct rmxp_tao *taop; struct rmxp_tao tao_noncached; int error; @@ -848,12 +853,14 @@ tcp6_connect(tp, nam, p) error = in6_pcbladdr(inp, nam, &addr6); if (error) return error; + tcp_unlock(inp->inp_socket, 0, 0); oinp = in6_pcblookup_hash(inp->inp_pcbinfo, &sin6->sin6_addr, sin6->sin6_port, IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) - ? addr6 + ? &addr6 : &inp->in6p_laddr, inp->inp_lport, 0, NULL); + tcp_lock(inp->inp_socket, 0, 0); if (oinp) { if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && otp->t_state == TCPS_TIME_WAIT && @@ -863,13 +870,20 @@ tcp6_connect(tp, nam, p) else return EADDRINUSE; } + if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) { + /*lock inversion issue, mostly with udp multicast packets */ + socket_unlock(inp->inp_socket, 0); + lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx); + socket_lock(inp->inp_socket, 0); + } if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) - inp->in6p_laddr = *addr6; + inp->in6p_laddr = addr6; inp->in6p_faddr = sin6->sin6_addr; inp->inp_fport = sin6->sin6_port; if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != NULL) inp->in6p_flowinfo = sin6->sin6_flowinfo; in_pcbrehash(inp); + lck_rw_done(inp->inp_pcbinfo->mtx); /* Compute window scaling to request. */ while (tp->request_r_scale < TCP_MAX_WINSHIFT && @@ -917,15 +931,13 @@ tcp_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { - int error, opt, optval, s; + int error, opt, optval; struct inpcb *inp; struct tcpcb *tp; error = 0; - s = splnet(); /* XXX */ inp = sotoinpcb(so); if (inp == NULL) { - splx(s); return (ECONNRESET); } if (sopt->sopt_level != IPPROTO_TCP) { @@ -935,12 +947,10 @@ tcp_ctloutput(so, sopt) else #endif /* INET6 */ error = ip_ctloutput(so, sopt); - splx(s); return (error); } tp = intotcpcb(inp); if (tp == NULL) { - splx(s); return (ECONNRESET); } @@ -1031,7 +1041,6 @@ tcp_ctloutput(so, sopt) error = sooptcopyout(sopt, &optval, sizeof optval); break; } - splx(s); return (error); } diff --git a/bsd/netinet/tcp_var.h b/bsd/netinet/tcp_var.h index f47cd5e4c..08b897f6c 100644 --- a/bsd/netinet/tcp_var.h +++ b/bsd/netinet/tcp_var.h @@ -58,8 +58,11 @@ #ifndef _NETINET_TCP_VAR_H_ #define _NETINET_TCP_VAR_H_ #include <sys/appleapiopts.h> +#include <sys/queue.h> +#include <netinet/in_pcb.h> #include <netinet/tcp_timer.h> -#ifdef __APPLE_API_PRIVATE + +#ifdef KERNEL_PRIVATE #define N_TIME_WAIT_SLOTS 128 /* must be power of 2 */ @@ -88,19 +91,10 @@ struct tcptemp { #define tcp6cb tcpcb /* for KAME src sync over BSD*'s */ -#ifdef __APPLE__ -#define TCP_DELACK_BITSET(hash_elem)\ -delack_bitmask[((hash_elem) >> 5)] |= 1 << ((hash_elem) & 0x1F) - -#define DELACK_BITMASK_ON 1 -#define DELACK_BITMASK_THRESH 300 -#endif - /* * Tcp control block, one per tcp; fields: * Organized for 16 byte cacheline efficiency. 
*/ -#if KERNEL struct tcpcb { struct tsegqe_head t_segq; int t_dupacks; /* consecutive dup acks recd */ @@ -192,26 +186,109 @@ struct tcpcb { /* RFC 1644 variables */ tcp_cc cc_send; /* send connection count */ tcp_cc cc_recv; /* receive connection count */ - tcp_seq snd_recover; /* for use in fast recovery */ + tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ /* experimental */ u_long snd_cwnd_prev; /* cwnd prior to retransmit */ u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */ u_long t_badrxtwin; /* window for retransmit recovery */ int t_keepidle; /* keepalive idle timer (override global if > 0) */ + int t_lastchain; /* amount of packets chained last time around */ + +/* 3529618 MSS overload prevention */ + u_long rcv_reset; + u_long rcv_pps; + u_long rcv_byps; + tcp_seq snd_high; /* for use in NewReno Fast Recovery */ + tcp_seq snd_high_prev; /* snd_high prior to retransmit */ + }; -#else -#define tcpcb otcpcb +/* + * Structure to hold TCP options that are only used during segment + * processing (in tcp_input), but not held in the tcpcb. + * It's basically used to reduce the number of parameters + * to tcp_dooptions. + */ +struct tcpopt { + u_long to_flag; /* which options are present */ +#define TOF_TS 0x0001 /* timestamp */ +#define TOF_CC 0x0002 /* CC and CCnew are exclusive */ +#define TOF_CCNEW 0x0004 +#define TOF_CCECHO 0x0008 + u_long to_tsval; + u_long to_tsecr; + tcp_cc to_cc; /* holds CC or CCnew */ + tcp_cc to_ccecho; + u_short reserved; /* unused now: was to_maxseg */ +}; -#endif +/* + * The TAO cache entry which is stored in the protocol family specific + * portion of the route metrics. + */ +struct rmxp_tao { + tcp_cc tao_cc; /* latest CC in valid SYN */ + tcp_cc tao_ccsent; /* latest CC sent to peer */ + u_short tao_mssopt; /* peer's cached MSS */ +#ifdef notyet + u_short tao_flags; /* cache status flags */ +#define TAOF_DONT 0x0001 /* peer doesn't understand rfc1644 */ +#define TAOF_OK 0x0002 /* peer does understand rfc1644 */ +#define TAOF_UNDEF 0 /* we don't know yet */ +#endif /* notyet */ +}; +#define rmx_taop(r) ((struct rmxp_tao *)(r).rmx_filler) + +#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) +#define sototcpcb(so) (intotcpcb(sotoinpcb(so))) +/* + * The smoothed round-trip time and estimated variance + * are stored as fixed point numbers scaled by the values below. + * For convenience, these scales are also used in smoothing the average + * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). + * With these scales, srtt has 3 bits to the right of the binary point, + * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the + * binary point, and is smoothed with an ALPHA of 0.75. + */ +#define TCP_RTT_SCALE 32 /* multiplier for srtt; 3 bits frac. */ +#define TCP_RTT_SHIFT 5 /* shift for srtt; 3 bits frac. */ +#define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 2 bits */ +#define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 2 bits */ +#define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ + +/* + * The initial retransmission should happen at rtt + 4 * rttvar. + * Because of the way we do the smoothing, srtt and rttvar + * will each average +1/2 tick of bias. When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks). 
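
   A standalone illustration of the smoothing rule the scale comments
   above describe, smoothed = (1/scale)sample + ((scale-1)/scale)smoothed:
   keeping the average pre-scaled by a power of two turns alpha = 7/8
   into pure shift-and-add, which is the whole point of these
   fixed-point scales. Generic sketch, not the literal tcp_input.c
   update:

	// value stored scaled by 8, i.e. with 3 fraction bits
	static unsigned
	ewma8_update(unsigned smoothed_x8, unsigned sample)
	{
		// new = old - old/8 + sample, all in x8 units,
		// which is (7/8)old + (1/8)sample once descaled
		return smoothed_x8 - (smoothed_x8 >> 3) + sample;
	}
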
+ * This version of the macro adapted from a paper by Lawrence + * Brakmo and Larry Peterson which outlines a problem caused + * by insufficient precision in the original implementation, + * which results in inappropriately large RTO values for very + * fast networks. + */ +#define TCP_REXMTVAL(tp) \ + max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ + + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) /* * Jaguar compatible TCP control block, for xtcpcb * Does not have the old fields */ struct otcpcb { +#else +struct tseg_qent; +LIST_HEAD(tsegqe_head, tseg_qent); + +struct tcpcb { +#endif /* KERNEL_PRIVATE */ struct tsegqe_head t_segq; int t_dupacks; /* consecutive dup acks recd */ struct tcptemp *unused; /* unused now: was t_template */ @@ -302,90 +379,13 @@ struct otcpcb { /* RFC 1644 variables */ tcp_cc cc_send; /* send connection count */ tcp_cc cc_recv; /* receive connection count */ - tcp_seq snd_recover; /* for use in fast recovery */ + tcp_seq snd_recover; /* for use in fast recovery */ /* experimental */ u_long snd_cwnd_prev; /* cwnd prior to retransmit */ u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */ u_long t_badrxtwin; /* window for retransmit recovery */ }; - -/* - * Structure to hold TCP options that are only used during segment - * processing (in tcp_input), but not held in the tcpcb. - * It's basically used to reduce the number of parameters - * to tcp_dooptions. - */ -struct tcpopt { - u_long to_flag; /* which options are present */ -#define TOF_TS 0x0001 /* timestamp */ -#define TOF_CC 0x0002 /* CC and CCnew are exclusive */ -#define TOF_CCNEW 0x0004 -#define TOF_CCECHO 0x0008 - u_long to_tsval; - u_long to_tsecr; - tcp_cc to_cc; /* holds CC or CCnew */ - tcp_cc to_ccecho; - u_short reserved; /* unused now: was to_maxseg */ -}; - -/* - * The TAO cache entry which is stored in the protocol family specific - * portion of the route metrics. - */ -struct rmxp_tao { - tcp_cc tao_cc; /* latest CC in valid SYN */ - tcp_cc tao_ccsent; /* latest CC sent to peer */ - u_short tao_mssopt; /* peer's cached MSS */ -#ifdef notyet - u_short tao_flags; /* cache status flags */ -#define TAOF_DONT 0x0001 /* peer doesn't understand rfc1644 */ -#define TAOF_OK 0x0002 /* peer does understand rfc1644 */ -#define TAOF_UNDEF 0 /* we don't know yet */ -#endif /* notyet */ -}; -#define rmx_taop(r) ((struct rmxp_tao *)(r).rmx_filler) - -#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) -#define sototcpcb(so) (intotcpcb(sotoinpcb(so))) - -/* - * The smoothed round-trip time and estimated variance - * are stored as fixed point numbers scaled by the values below. - * For convenience, these scales are also used in smoothing the average - * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). - * With these scales, srtt has 3 bits to the right of the binary point, - * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the - * binary point, and is smoothed with an ALPHA of 0.75. - */ -#define TCP_RTT_SCALE 32 /* multiplier for srtt; 3 bits frac. */ -#define TCP_RTT_SHIFT 5 /* shift for srtt; 3 bits frac. */ -#define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 2 bits */ -#define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 2 bits */ -#define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ - -/* - * The initial retransmission should happen at rtt + 4 * rttvar. - * Because of the way we do the smoothing, srtt and rttvar - * will each average +1/2 tick of bias. 
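
   Worked numbers for TCP_REXMTVAL above (TCP_RTT_SHIFT 5,
   TCP_DELTA_SHIFT 2, so srtt is shifted right by 3 and the sum by 2):
   take t_srtt = 256, t_rttvar = 48 and a 2-tick t_rttmin. Then

	((256 >> 3) + 48) >> 2  =  (32 + 48) >> 2  =  20 ticks

   and max(2, 20) = 20. Dividing the fixed-point scales back out,
   t_srtt/32 is 8 ticks and t_rttvar/16 is 3 ticks, so 20 ticks is
   exactly the "rtt + 4 * rttvar" target the comment above states,
   with t_rttmin as the floor.
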
When we compute - * the retransmit timer, we want 1/2 tick of rounding and - * 1 extra tick because of +-1/2 tick uncertainty in the - * firing of the timer. The bias will give us exactly the - * 1.5 tick we need. But, because the bias is - * statistical, we have to test that we don't drop below - * the minimum feasible timer (which is 2 ticks). - * This version of the macro adapted from a paper by Lawrence - * Brakmo and Larry Peterson which outlines a problem caused - * by insufficient precision in the original implementation, - * which results in inappropriately large RTO values for very - * fast networks. - */ -#define TCP_REXMTVAL(tp) \ - max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ - + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) -#endif /* __APPLE_API_PRIVATE */ - -#ifdef __APPLE_API_UNSTABLE /* * TCP statistics. * Many of these should be kept per connection, @@ -456,18 +456,20 @@ struct tcpstat { u_long tcps_mturesent; /* resends due to MTU discovery */ u_long tcps_listendrop; /* listen queue overflows */ }; -#endif /* __APPLE_API_UNSTABLE */ /* * TCB structure exported to user-land via sysctl(3). * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been * included. Not all of our clients do. */ -#if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) struct xtcpcb { size_t xt_len; +#ifdef KERNEL_PRIVATE + struct inpcb_compat xt_inp; +#else struct inpcb xt_inp; -#if KERNEL +#endif +#ifdef KERNEL_PRIVATE struct otcpcb xt_tp; #else struct tcpcb xt_tp; @@ -475,7 +477,6 @@ struct xtcpcb { struct xsocket xt_socket; u_quad_t xt_alignment_hack; }; -#endif /* * Names for TCP sysctl objects @@ -495,6 +496,7 @@ struct xtcpcb { #define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ #define TCPCTL_MAXID 14 +#ifdef KERNEL_PRIVATE #define TCPCTL_NAMES { \ { 0, 0 }, \ { "rfc1323", CTLTYPE_INT }, \ @@ -512,68 +514,72 @@ struct xtcpcb { { "v6mssdflt", CTLTYPE_INT }, \ } -#ifdef __APPLE_API_PRIVATE -#ifdef KERNEL #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_tcp); -#endif +#endif /* SYSCTL_DECL */ extern struct inpcbhead tcb; /* head of queue of active tcpcb's */ extern struct inpcbinfo tcbinfo; extern struct tcpstat tcpstat; /* tcp statistics */ extern int tcp_mssdflt; /* XXX */ extern int tcp_minmss; -extern int tcp_delack_enabled; +extern int tcp_minmssoverload; extern int tcp_do_newreno; extern int ss_fltsz; extern int ss_fltsz_local; #ifdef __APPLE__ extern u_long tcp_now; /* for RFC 1323 timestamps */ extern int tcp_delack_enabled; -#endif +#endif /* __APPLE__ */ -void tcp_canceltimers __P((struct tcpcb *)); +void tcp_canceltimers(struct tcpcb *); struct tcpcb * - tcp_close __P((struct tcpcb *)); -void tcp_ctlinput __P((int, struct sockaddr *, void *)); -int tcp_ctloutput __P((struct socket *, struct sockopt *)); + tcp_close(struct tcpcb *); +void tcp_ctlinput(int, struct sockaddr *, void *); +int tcp_ctloutput(struct socket *, struct sockopt *); struct tcpcb * - tcp_drop __P((struct tcpcb *, int)); -void tcp_drain __P((void)); -void tcp_fasttimo __P((void)); + tcp_drop(struct tcpcb *, int); +void tcp_drain(void); +void tcp_fasttimo(void); struct rmxp_tao * - tcp_gettaocache __P((struct inpcb *)); -void tcp_init __P((void)); -void tcp_input __P((struct mbuf *, int)); -void tcp_mss __P((struct tcpcb *, int)); -int tcp_mssopt __P((struct tcpcb *)); -void tcp_drop_syn_sent __P((struct inpcb *, int)); -void tcp_mtudisc __P((struct inpcb *, int)); + tcp_gettaocache(struct inpcb *); +void tcp_init(void); +void tcp_input(struct mbuf *, int); +void tcp_mss(struct tcpcb 
*, int); +int tcp_mssopt(struct tcpcb *); +void tcp_drop_syn_sent(struct inpcb *, int); +void tcp_mtudisc(struct inpcb *, int); struct tcpcb * - tcp_newtcpcb __P((struct inpcb *)); -int tcp_output __P((struct tcpcb *)); -void tcp_quench __P((struct inpcb *, int)); -void tcp_respond __P((struct tcpcb *, void *, - struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int)); + tcp_newtcpcb(struct inpcb *); +int tcp_output(struct tcpcb *); +void tcp_quench(struct inpcb *, int); +void tcp_respond(struct tcpcb *, void *, + struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); struct rtentry * - tcp_rtlookup __P((struct inpcb *)); -void tcp_setpersist __P((struct tcpcb *)); -void tcp_slowtimo __P((void)); + tcp_rtlookup(struct inpcb *); +void tcp_setpersist(struct tcpcb *); +void tcp_slowtimo(void); struct tcptemp * - tcp_maketemplate __P((struct tcpcb *)); -void tcp_fillheaders __P((struct tcpcb *, void *, void *)); + tcp_maketemplate(struct tcpcb *); +void tcp_fillheaders(struct tcpcb *, void *, void *); struct tcpcb * - tcp_timers __P((struct tcpcb *, int)); -void tcp_trace __P((int, int, struct tcpcb *, void *, struct tcphdr *, - int)); + tcp_timers(struct tcpcb *, int); +void tcp_trace(int, int, struct tcpcb *, void *, struct tcphdr *, int); +int tcp_lock (struct socket *, int, int); +int tcp_unlock (struct socket *, int, int); +#ifdef _KERN_LOCKS_H_ +lck_mtx_t * tcp_getlock (struct socket *, int); +#else +void * tcp_getlock (struct socket *, int); +#endif + extern struct pr_usrreqs tcp_usrreqs; extern u_long tcp_sendspace; extern u_long tcp_recvspace; -tcp_seq tcp_new_isn __P((struct tcpcb *)); +tcp_seq tcp_new_isn(struct tcpcb *); -#endif /* KERNEL */ -#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL_RPIVATE */ #endif /* _NETINET_TCP_VAR_H_ */ diff --git a/bsd/netinet/tcpip.h b/bsd/netinet/tcpip.h index 79410f82d..b9830e140 100644 --- a/bsd/netinet/tcpip.h +++ b/bsd/netinet/tcpip.h @@ -58,7 +58,7 @@ #ifndef _NETINET_TCPIP_H_ #define _NETINET_TCPIP_H_ #include <sys/appleapiopts.h> - +#include <netinet/ip_var.h> /* * Tcp+ip header, after ip options removed. 
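
   The __P() conversions running through these headers are mechanical:
   __P is the old <sys/cdefs.h> portability macro that let one
   declaration serve both K&R and ANSI compilers, essentially

	#ifdef __STDC__
	#define	__P(protos)	protos	// ANSI: keep the prototype
	#else
	#define	__P(protos)	()	// K&R: drop the parameter list
	#endif

   so the doubled parentheses made the whole list a single macro
   argument. With K&R support gone, __P((args)) collapses to (args),
   which is all these hunks change:

	void tcp_canceltimers __P((struct tcpcb *));	// before
	void tcp_canceltimers(struct tcpcb *);		// after
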
*/ diff --git a/bsd/netinet/udp_usrreq.c b/bsd/netinet/udp_usrreq.c index 5d7ce5151..4eafd6c8f 100644 --- a/bsd/netinet/udp_usrreq.c +++ b/bsd/netinet/udp_usrreq.c @@ -68,6 +68,7 @@ #include <sys/syslog.h> #include <net/if.h> +#include <net/if_types.h> #include <net/route.h> #include <netinet/in.h> @@ -91,6 +92,7 @@ #if IPSEC #include <netinet6/ipsec.h> extern int ipsec_bypass; +extern lck_mtx_t *sadb_mutex; #endif /*IPSEC*/ @@ -133,9 +135,22 @@ extern int apple_hwcksum_rx; extern int esp_udp_encap_port; extern u_long route_generation; +extern void ipfwsyslog( int level, char *format,...); + +extern int fw_verbose; + +#define log_in_vain_log( a ) { \ + if ( (log_in_vain == 3 ) && (fw_verbose == 2)) { /* Apple logging, log to ipfw.log */ \ + ipfwsyslog a ; \ + } \ + else log a ; \ +} + struct udpstat udpstat; /* from udp_var.h */ SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RD, &udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); +SYSCTL_INT(_net_inet_udp, OID_AUTO, pcbcount, CTLFLAG_RD, + &udbinfo.ipi_count, 0, "Number of active PCBs"); static struct sockaddr_in udp_in = { sizeof(udp_in), AF_INET }; #if INET6 @@ -152,25 +167,23 @@ struct udp_ip6 { } udp_ip6; #endif /* INET6 */ -static void udp_append __P((struct inpcb *last, struct ip *ip, - struct mbuf *n, int off)); +static void udp_append(struct inpcb *last, struct ip *ip, + struct mbuf *n, int off); #if INET6 -static void ip_2_ip6_hdr __P((struct ip6_hdr *ip6, struct ip *ip)); +static void ip_2_ip6_hdr(struct ip6_hdr *ip6, struct ip *ip); #endif -static int udp_detach __P((struct socket *so)); -static int udp_output __P((struct inpcb *, struct mbuf *, struct sockaddr *, - struct mbuf *, struct proc *)); +static int udp_detach(struct socket *so); +static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, + struct mbuf *, struct proc *); +extern int ChkAddressOK( __uint32_t dstaddr, __uint32_t srcaddr ); void udp_init() { - vm_size_t str_size; - int stat; - u_char fake_owner; - struct in_addr laddr; - struct in_addr faddr; - u_short lport; + vm_size_t str_size; + struct inpcbinfo *pcbinfo; + LIST_INIT(&udb); udbinfo.listhead = &udb; @@ -181,7 +194,21 @@ udp_init() str_size = (vm_size_t) sizeof(struct inpcb); udbinfo.ipi_zone = (void *) zinit(str_size, 80000*str_size, 8192, "udpcb"); - udbinfo.last_pcb = 0; + pcbinfo = &udbinfo; + /* + * allocate lock group attribute and group for udp pcb mutexes + */ + pcbinfo->mtx_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setdefault(pcbinfo->mtx_grp_attr); + + pcbinfo->mtx_grp = lck_grp_alloc_init("udppcb", pcbinfo->mtx_grp_attr); + + pcbinfo->mtx_attr = lck_attr_alloc_init(); + lck_attr_setdefault(pcbinfo->mtx_attr); + + if ((pcbinfo->mtx = lck_rw_alloc_init(pcbinfo->mtx_grp, pcbinfo->mtx_attr)) == NULL) + return; /* pretty much dead if this fails... */ + in_pcb_nat_init(&udbinfo, AF_INET, IPPROTO_UDP, SOCK_DGRAM); #else udbinfo.ipi_zone = zinit("udpcb", sizeof(struct inpcb), maxsockets, @@ -227,6 +254,7 @@ udp_input(m, iphlen) int len; struct ip save_ip; struct sockaddr *append_sa; + struct inpcbinfo *pcbinfo = &udbinfo; udpstat.udps_ipackets++; @@ -325,6 +353,7 @@ doudpcksum: if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { struct inpcb *last; + lck_rw_lock_shared(pcbinfo->mtx); /* * Deliver a multicast or broadcast datagram to *all* sockets * for which the local and remote addresses and ports match @@ -341,6 +370,7 @@ doudpcksum: * fixing the interface. Maybe 4.5BSD will remedy this?) 
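
   A note on the log_in_vain_log() macro defined near the top of this
   file: C89 preprocessors have no variadic macros, so the entire
   argument list travels as one parenthesized macro argument, as in
   log_in_vain_log((LOG_INFO, "fmt %d\n", n)), and the body pastes it
   after either ipfwsyslog or log, turning it back into a call. A
   minimal standalone rendition of the trick, names invented:

	#include <stdio.h>

	static int use_alt_channel = 1;

	#define report( a ) { \
		if (use_alt_channel) { fprintf a ; } \
		else { printf("fallback\n"); } \
	}

	int
	main(void)
	{
		report((stderr, "%d packets dropped\n", 42));
		return 0;
	}
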
*/ + /* * Construct sockaddr format source address. */ @@ -360,39 +390,67 @@ doudpcksum: if (inp->inp_socket == &udbinfo.nat_dummy_socket) continue; #endif -#if INET6 - if ((inp->inp_vflag & INP_IPV4) == 0) + if (inp->inp_socket == NULL) continue; + if (inp != sotoinpcb(inp->inp_socket)) + panic("udp_input: bad so back ptr inp=%x\n", inp); +#if INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; #endif - if (inp->inp_lport != uh->uh_dport) + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) { + continue; + } + + udp_lock(inp->inp_socket, 1, 0); + + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { + udp_unlock(inp->inp_socket, 1, 0); continue; + } + + if (inp->inp_lport != uh->uh_dport) { + udp_unlock(inp->inp_socket, 1, 0); + continue; + } if (inp->inp_laddr.s_addr != INADDR_ANY) { if (inp->inp_laddr.s_addr != - ip->ip_dst.s_addr) + ip->ip_dst.s_addr) { + udp_unlock(inp->inp_socket, 1, 0); continue; + } } if (inp->inp_faddr.s_addr != INADDR_ANY) { if (inp->inp_faddr.s_addr != ip->ip_src.s_addr || - inp->inp_fport != uh->uh_sport) + inp->inp_fport != uh->uh_sport) { + udp_unlock(inp->inp_socket, 1, 0); continue; + } } if (last != NULL) { struct mbuf *n; - #if IPSEC + int skipit = 0; /* check AH/ESP integrity. */ - if (ipsec_bypass == 0 && ipsec4_in_reject_so(m, last->inp_socket)) { - ipsecstat.in_polvio++; - /* do not inject data to pcb */ - } else + if (ipsec_bypass == 0) { + lck_mtx_lock(sadb_mutex); + if (ipsec4_in_reject_so(m, last->inp_socket)) { + ipsecstat.in_polvio++; + /* do not inject data to pcb */ + skipit = 1; + } + lck_mtx_unlock(sadb_mutex); + } + if (skipit == 0) #endif /*IPSEC*/ if ((n = m_copy(m, 0, M_COPYALL)) != NULL) { udp_append(last, ip, n, iphlen + sizeof(struct udphdr)); } + udp_unlock(last->inp_socket, 1, 0); } last = inp; /* @@ -406,6 +464,7 @@ doudpcksum: if ((last->inp_socket->so_options&(SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } + lck_rw_done(pcbinfo->mtx); if (last == NULL) { /* @@ -418,15 +477,23 @@ doudpcksum: } #if IPSEC /* check AH/ESP integrity. */ - if (ipsec_bypass == 0 && m && ipsec4_in_reject_so(m, last->inp_socket)) { - ipsecstat.in_polvio++; - goto bad; + if (ipsec_bypass == 0 && m) { + lck_mtx_lock(sadb_mutex); + if (ipsec4_in_reject_so(m, last->inp_socket)) { + ipsecstat.in_polvio++; + lck_mtx_unlock(sadb_mutex); + udp_unlock(last->inp_socket, 1, 0); + goto bad; + } + lck_mtx_unlock(sadb_mutex); } #endif /*IPSEC*/ udp_append(last, ip, m, iphlen + sizeof(struct udphdr)); + udp_unlock(last->inp_socket, 1, 0); return; } +#if IPSEC /* * UDP to port 4500 with a payload where the first four bytes are * not zero is a UDP encapsulated IPSec packet. Packets where @@ -472,6 +539,7 @@ doudpcksum: return; } } +#endif /* * Locate pcb for datagram. 
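
   The multicast loop above keeps the classic "last match" shape: every
   matching socket before the last gets a copy via m_copy(), and the
   original mbuf chain goes to the final matcher, saving one copy per
   datagram. Condensed kernel-context sketch with error paths elided;
   matches() is a hypothetical predicate standing in for the
   port/address comparisons in the hunk:

	last = NULL;
	LIST_FOREACH(inp, &udb, inp_list) {
		if (!matches(inp, ip, uh))
			continue;
		if (last != NULL) {
			struct mbuf *n = m_copy(m, 0, M_COPYALL);
			if (n != NULL)
				udp_append(last, ip, n, hdrlen);
		}
		last = inp;			// defer the original chain
	}
	if (last != NULL)
		udp_append(last, ip, m, hdrlen);	// no copy needed
	else
		m_freem(m);			// nobody wanted it
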
@@ -480,13 +548,25 @@ doudpcksum: ip->ip_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif); if (inp == NULL) { if (log_in_vain) { - char buf[4*sizeof "123"]; - - strcpy(buf, inet_ntoa(ip->ip_dst)); - log(LOG_INFO, - "Connection attempt to UDP %s:%d from %s:%d\n", - buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), - ntohs(uh->uh_sport)); + char buf[MAX_IPv4_STR_LEN]; + char buf2[MAX_IPv4_STR_LEN]; + + /* check src and dst address */ + if (log_in_vain != 3) + log(LOG_INFO, + "Connection attempt to UDP %s:%d from %s:%d\n", + inet_ntop(AF_INET, &ip->ip_dst, buf, sizeof(buf)), + ntohs(uh->uh_dport), + inet_ntop(AF_INET, &ip->ip_src, buf2, sizeof(buf2)), + ntohs(uh->uh_sport)); + else if (!(m->m_flags & (M_BCAST | M_MCAST)) && + ip->ip_dst.s_addr != ip->ip_src.s_addr) + log_in_vain_log((LOG_INFO, + "Stealth Mode connection attempt to UDP %s:%d from %s:%d\n", + inet_ntop(AF_INET, &ip->ip_dst, buf, sizeof(buf)), + ntohs(uh->uh_dport), + inet_ntop(AF_INET, &ip->ip_src, buf2, sizeof(buf2)), + ntohs(uh->uh_sport))) } udpstat.udps_noport++; if (m->m_flags & (M_BCAST | M_MCAST)) { @@ -498,18 +578,31 @@ doudpcksum: goto bad; #endif if (blackhole) - goto bad; + if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP) + goto bad; *ip = save_ip; ip->ip_len += iphlen; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0,0,0,0,0); return; } -#if IPSEC - if (ipsec_bypass == 0 && inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) { - ipsecstat.in_polvio++; + udp_lock(inp->inp_socket, 1, 0); + + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { + udp_unlock(inp->inp_socket, 1, 0); goto bad; } +#if IPSEC + if (ipsec_bypass == 0 && inp != NULL) { + lck_mtx_lock(sadb_mutex); + if (ipsec4_in_reject_so(m, inp->inp_socket)) { + ipsecstat.in_polvio++; + lck_mtx_unlock(sadb_mutex); + udp_unlock(inp->inp_socket, 1, 0); + goto bad; + } + lck_mtx_unlock(sadb_mutex); + } #endif /*IPSEC*/ /* @@ -545,11 +638,13 @@ doudpcksum: } else #endif append_sa = (struct sockaddr *)&udp_in; - if (sbappendaddr(&inp->inp_socket->so_rcv, append_sa, m, opts) == 0) { + if (sbappendaddr(&inp->inp_socket->so_rcv, append_sa, m, opts, NULL) == 0) { udpstat.udps_fullsock++; - goto bad; } - sorwakeup(inp->inp_socket); + else { + sorwakeup(inp->inp_socket); + } + udp_unlock(inp->inp_socket, 1, 0); KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0,0,0,0,0); return; bad: @@ -622,10 +717,7 @@ udp_append(last, ip, n, off) #endif append_sa = (struct sockaddr *)&udp_in; m_adj(n, off); - if (sbappendaddr(&last->inp_socket->so_rcv, append_sa, n, opts) == 0) { - m_freem(n); - if (opts) - m_freem(opts); + if (sbappendaddr(&last->inp_socket->so_rcv, append_sa, n, opts, NULL) == 0) { udpstat.udps_fullsock++; } else sorwakeup(last->inp_socket); @@ -653,10 +745,9 @@ udp_ctlinput(cmd, sa, vip) { struct ip *ip = vip; struct udphdr *uh; - void (*notify) __P((struct inpcb *, int)) = udp_notify; + void (*notify)(struct inpcb *, int) = udp_notify; struct in_addr faddr; struct inpcb *inp; - int s; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) @@ -670,21 +761,26 @@ udp_ctlinput(cmd, sa, vip) else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip) { - s = splnet(); uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); inp = in_pcblookup_hash(&udbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, 0, NULL); - if (inp != NULL && inp->inp_socket != NULL) + if (inp != NULL && inp->inp_socket != NULL) { + 
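/*
 * The log_in_vain hunk above swaps inet_ntoa() for inet_ntop(): the
 * former formats into a single static buffer, so two calls in one
 * log() statement would both print the second address, and a static
 * buffer is hostile to a preemptible, multi-threaded stack anyway.
 * With inet_ntop() each address gets its own caller-supplied buffer:
 *
 *	char dst[MAX_IPv4_STR_LEN], src[MAX_IPv4_STR_LEN];
 *
 *	log(LOG_INFO, "Connection attempt to UDP %s:%d from %s:%d\n",
 *	    inet_ntop(AF_INET, &ip->ip_dst, dst, sizeof(dst)),
 *	    ntohs(uh->uh_dport),
 *	    inet_ntop(AF_INET, &ip->ip_src, src, sizeof(src)),
 *	    ntohs(uh->uh_sport));
 */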
udp_lock(inp->inp_socket, 1, 0); + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { + udp_unlock(inp->inp_socket, 1, 0); + return; + } (*notify)(inp, inetctlerrmap[cmd]); - splx(s); + udp_unlock(inp->inp_socket, 1, 0); + } } else - in_pcbnotifyall(&udb, faddr, inetctlerrmap[cmd], notify); + in_pcbnotifyall(&udbinfo, faddr, inetctlerrmap[cmd], notify); } static int udp_pcblist SYSCTL_HANDLER_ARGS { - int error, i, n, s; + int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; @@ -693,58 +789,64 @@ udp_pcblist SYSCTL_HANDLER_ARGS * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ - if (req->oldptr == 0) { + lck_rw_lock_exclusive(udbinfo.mtx); + if (req->oldptr == USER_ADDR_NULL) { n = udbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); + lck_rw_done(udbinfo.mtx); return 0; } - if (req->newptr != 0) + if (req->newptr != USER_ADDR_NULL) { + lck_rw_done(udbinfo.mtx); return EPERM; + } /* * OK, now we're committed to doing something. */ - s = splnet(); gencnt = udbinfo.ipi_gencnt; n = udbinfo.ipi_count; - splx(s); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); - if (error) + if (error) { + lck_rw_done(udbinfo.mtx); return error; + } /* * We are done if there is no pcb */ - if (n == 0) + if (n == 0) { + lck_rw_done(udbinfo.mtx); return 0; + } inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) { + lck_rw_done(udbinfo.mtx); return ENOMEM; } for (inp = LIST_FIRST(udbinfo.listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { - if (inp->inp_gencnt <= gencnt) + if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) inp_list[i++] = inp; } - splx(s); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; - if (inp->inp_gencnt <= gencnt) { + if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) { struct xinpcb xi; xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ - bcopy(inp, &xi.xi_inp, sizeof *inp); + inpcb_to_compat(inp, &xi.xi_inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); error = SYSCTL_OUT(req, &xi, sizeof xi); @@ -758,14 +860,13 @@ udp_pcblist SYSCTL_HANDLER_ARGS * while we were processing this request, and it * might be necessary to retry. 
*/ - s = splnet(); xig.xig_gen = udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = udbinfo.ipi_count; - splx(s); error = SYSCTL_OUT(req, &xig, sizeof xig); } FREE(inp_list, M_TEMP); + lck_rw_done(udbinfo.mtx); return error; } @@ -774,6 +875,21 @@ SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, +static __inline__ u_int16_t +get_socket_id(struct socket * s) +{ + u_int16_t val; + + if (s == NULL) { + return (0); + } + val = (u_int16_t)(((u_int32_t)s) / sizeof(struct socket)); + if (val == 0) { + val = 0xffff; + } + return (val); +} + static int udp_output(inp, m, addr, control, p) register struct inpcb *inp; @@ -784,8 +900,12 @@ udp_output(inp, m, addr, control, p) { register struct udpiphdr *ui; register int len = m->m_pkthdr.len; - struct in_addr laddr; - int s = 0, error = 0; + struct sockaddr_in *sin, src; + struct in_addr origladdr, laddr, faddr; + u_short lport, fport; + struct sockaddr_in *ifaddr; + int error = 0, udp_dodisconnect = 0; + KERNEL_DEBUG(DBG_FNC_UDP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); @@ -806,8 +926,12 @@ udp_output(inp, m, addr, control, p) * Reacquire a new source address if INADDR_ANY was specified */ +#if 1 + lck_mtx_assert(inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); +#endif + if (inp->inp_route.ro_rt && inp->inp_route.ro_rt->generation_id != route_generation) { - if (ifa_foraddr(inp->inp_laddr.s_addr) == NULL) { /* src address is gone */ + if (ifa_foraddr(inp->inp_laddr.s_addr) == 0) { /* src address is gone */ if (inp->inp_flags & INP_INADDR_ANY) inp->inp_faddr.s_addr = INADDR_ANY; /* new src will be set later */ else { @@ -819,23 +943,51 @@ udp_output(inp, m, addr, control, p) inp->inp_route.ro_rt = (struct rtentry *)0; } + origladdr= laddr = inp->inp_laddr; + faddr = inp->inp_faddr; + lport = inp->inp_lport; + fport = inp->inp_fport; + if (addr) { - laddr = inp->inp_laddr; - if (inp->inp_faddr.s_addr != INADDR_ANY) { + sin = (struct sockaddr_in *)addr; + if (faddr.s_addr != INADDR_ANY) { error = EISCONN; goto release; } - /* - * Must block input while temporarily connected. - */ - s = splnet(); - error = in_pcbconnect(inp, addr, p); - if (error) { - splx(s); - goto release; + if (lport == 0) { + /* + * In case we don't have a local port set, go through the full connect. + * We don't have a local port yet (ie, we can't be looked up), + * so it's not an issue if the input runs at the same time we do this. + */ + error = in_pcbconnect(inp, addr, p); + if (error) { + goto release; + } + laddr = inp->inp_laddr; + lport = inp->inp_lport; + faddr = inp->inp_faddr; + fport = inp->inp_fport; + udp_dodisconnect = 1; + } + else { + /* Fast path case + * we have a full address and a local port. + * use those info to build the packet without changing the pcb + * and interfering with the input path. See 3851370 + */ + if (laddr.s_addr == INADDR_ANY) { + if ((error = in_pcbladdr(inp, addr, &ifaddr)) != 0) + goto release; + laddr = ifaddr->sin_addr; + inp->inp_flags |= INP_INADDR_ANY; /* from pcbconnect: remember we don't care about src addr.*/ + } + + faddr = sin->sin_addr; + fport = sin->sin_port; } } else { - if (inp->inp_faddr.s_addr == INADDR_ANY) { + if (faddr.s_addr == INADDR_ANY) { error = ENOTCONN; goto release; } @@ -859,10 +1011,10 @@ udp_output(inp, m, addr, control, p) ui = mtod(m, struct udpiphdr *); bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? 
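
   About get_socket_id() above: it derives the 16-bit tag stored in
   m_pkthdr.socket_id by dividing the socket's address by
   sizeof(struct socket). Since sockets come out of a fixed-size zone,
   nearby allocations tend to map to distinct small integers (that
   reading is an inference, not stated in the hunk), and 0 is reserved
   as the "no socket" sentinel by remapping a zero quotient to 0xffff.
   The shape, restated with comments:

	static __inline__ u_int16_t
	get_socket_id(struct socket *s)
	{
		u_int16_t val;

		if (s == NULL)
			return (0);		// no socket attached
		val = (u_int16_t)(((u_int32_t)s) / sizeof(struct socket));
		if (val == 0)
			val = 0xffff;		// keep 0 as the sentinel
		return (val);
	}
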
*/ ui->ui_pr = IPPROTO_UDP; - ui->ui_src = inp->inp_laddr; - ui->ui_dst = inp->inp_faddr; - ui->ui_sport = inp->inp_lport; - ui->ui_dport = inp->inp_fport; + ui->ui_src = laddr; + ui->ui_dst = faddr; + ui->ui_sport = lport; + ui->ui_dport = fport; ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); /* @@ -890,23 +1042,22 @@ udp_output(inp, m, addr, control, p) goto abort; } #endif /*IPSEC*/ - error = ip_output(m, inp->inp_options, &inp->inp_route, + m->m_pkthdr.socket_id = get_socket_id(inp->inp_socket); + error = ip_output_list(m, 0, inp->inp_options, &inp->inp_route, (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST)), inp->inp_moptions); - if (addr) { + if (udp_dodisconnect) { in_pcbdisconnect(inp); - inp->inp_laddr = laddr; /* XXX rehash? */ - splx(s); + inp->inp_laddr = origladdr; /* XXX rehash? */ } KERNEL_DEBUG(DBG_FNC_UDP_OUTPUT | DBG_FUNC_END, error, 0,0,0,0); return (error); abort: - if (addr) { + if (udp_dodisconnect) { in_pcbdisconnect(inp); - inp->inp_laddr = laddr; /* XXX rehash? */ - splx(s); + inp->inp_laddr = origladdr; /* XXX rehash? */ } release: @@ -934,15 +1085,12 @@ static int udp_abort(struct socket *so) { struct inpcb *inp; - int s; inp = sotoinpcb(so); if (inp == 0) - return EINVAL; /* ??? possible? panic instead? */ + panic("udp_abort: so=%x null inp\n", so); /* ??? possible? panic instead? */ soisdisconnected(so); - s = splnet(); in_pcbdetach(inp); - splx(s); return 0; } @@ -950,20 +1098,18 @@ static int udp_attach(struct socket *so, int proto, struct proc *p) { struct inpcb *inp; - int error; long s; + int error; inp = sotoinpcb(so); if (inp != 0) - return EINVAL; + panic ("udp_attach so=%x inp=%x\n", so, inp); - error = soreserve(so, udp_sendspace, udp_recvspace); - if (error) - return error; - s = splnet(); error = in_pcballoc(so, &udbinfo, p); - splx(s); if (error) return error; + error = soreserve(so, udp_sendspace, udp_recvspace); + if (error) + return error; inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV4; inp->inp_ip_ttl = ip_defttl; @@ -974,14 +1120,12 @@ static int udp_bind(struct socket *so, struct sockaddr *nam, struct proc *p) { struct inpcb *inp; - int s, error; + int error; inp = sotoinpcb(so); if (inp == 0) return EINVAL; - s = splnet(); error = in_pcbbind(inp, nam, p); - splx(s); return error; } @@ -989,17 +1133,15 @@ static int udp_connect(struct socket *so, struct sockaddr *nam, struct proc *p) { struct inpcb *inp; - int s, error; + int error; inp = sotoinpcb(so); if (inp == 0) return EINVAL; if (inp->inp_faddr.s_addr != INADDR_ANY) return EISCONN; - s = splnet(); error = in_pcbconnect(inp, nam, p); - splx(s); - if (error == 0) + if (error == 0) soisconnected(so); return error; } @@ -1008,14 +1150,12 @@ static int udp_detach(struct socket *so) { struct inpcb *inp; - int s; inp = sotoinpcb(so); if (inp == 0) - return EINVAL; - s = splnet(); + panic("udp_detach: so=%x null inp\n", so); /* ??? possible? panic instead? 
*/ in_pcbdetach(inp); - splx(s); + inp->inp_state = INPCB_STATE_DEAD; return 0; } @@ -1023,7 +1163,6 @@ static int udp_disconnect(struct socket *so) { struct inpcb *inp; - int s; inp = sotoinpcb(so); if (inp == 0) @@ -1031,10 +1170,8 @@ udp_disconnect(struct socket *so) if (inp->inp_faddr.s_addr == INADDR_ANY) return ENOTCONN; - s = splnet(); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; - splx(s); so->so_state &= ~SS_ISCONNECTED; /* XXX */ return 0; } @@ -1070,6 +1207,135 @@ struct pr_usrreqs udp_usrreqs = { pru_connect2_notsupp, in_control, udp_detach, udp_disconnect, pru_listen_notsupp, in_setpeeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, udp_send, pru_sense_null, udp_shutdown, - in_setsockaddr, sosend, soreceive, sopoll + in_setsockaddr, sosend, soreceive, pru_sopoll_notsupp }; + +int +udp_lock(so, refcount, debug) + struct socket *so; + int refcount, debug; +{ + int lr_saved; +#ifdef __ppc__ + if (debug == 0) { + __asm__ volatile("mflr %0" : "=r" (lr_saved)); + } + else lr_saved = debug; +#endif + + if (so->so_pcb) { + lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(((struct inpcb *)so->so_pcb)->inpcb_mtx); + } + else { + panic("udp_lock: so=%x NO PCB! lr=%x\n", so, lr_saved); + lck_mtx_assert(so->so_proto->pr_domain->dom_mtx, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(so->so_proto->pr_domain->dom_mtx); + } + + if (refcount) + so->so_usecount++; + + so->reserved3= lr_saved; + return (0); +} + +int +udp_unlock(so, refcount, debug) + struct socket *so; + int refcount; + int debug; +{ + int lr_saved; + struct inpcb *inp = sotoinpcb(so); + struct inpcbinfo *pcbinfo = &udbinfo; +#ifdef __ppc__ + if (debug == 0) { + __asm__ volatile("mflr %0" : "=r" (lr_saved)); + } + else lr_saved = debug; +#endif + if (refcount) { + so->so_usecount--; +#if 0 + if (so->so_usecount == 0 && (inp->inp_wantcnt == WNT_STOPUSING)) { + if (lck_rw_try_lock_exclusive(pcbinfo->mtx)) { + in_pcbdispose(inp); + lck_rw_done(pcbinfo->mtx); + return(0); + } + } +#endif + } + if (so->so_pcb == NULL) { + panic("udp_unlock: so=%x NO PCB! lr=%x\n", so, lr_saved); + lck_mtx_assert(so->so_proto->pr_domain->dom_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_unlock(so->so_proto->pr_domain->dom_mtx); + } + else { + lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_unlock(((struct inpcb *)so->so_pcb)->inpcb_mtx); + } + + + so->reserved4 = lr_saved; + return (0); +} + +lck_mtx_t * +udp_getlock(so, locktype) + struct socket *so; + int locktype; +{ + struct inpcb *inp = sotoinpcb(so); + + + if (so->so_pcb) + return(inp->inpcb_mtx); + else { + panic("udp_getlock: so=%x NULL so_pcb\n", so); + return (so->so_proto->pr_domain->dom_mtx); + } +} + +void +udp_slowtimo() +{ + struct inpcb *inp, *inpnxt; + struct socket *so; + struct inpcbinfo *pcbinfo = &udbinfo; + + lck_rw_lock_exclusive(pcbinfo->mtx); + + for (inp = udb.lh_first; inp != NULL; inp = inpnxt) { + inpnxt = inp->inp_list.le_next; + + /* Ignore nat/SharedIP dummy pcbs */ + if (inp->inp_socket == &udbinfo.nat_dummy_socket) + continue; + + if (inp->inp_wantcnt != WNT_STOPUSING) + continue; + + so = inp->inp_socket; + if (!lck_mtx_try_lock(inp->inpcb_mtx)) /* skip if busy, no hurry for cleanup... 
*/ + continue; + + if (so->so_usecount == 0) + in_pcbdispose(inp); + else + lck_mtx_unlock(inp->inpcb_mtx); + } + lck_rw_done(pcbinfo->mtx); +} + +int +ChkAddressOK( __uint32_t dstaddr, __uint32_t srcaddr ) +{ + if ( dstaddr == srcaddr ){ + return 0; + } + return 1; +} + diff --git a/bsd/netinet/udp_var.h b/bsd/netinet/udp_var.h index 4c2976f31..3a1c1873b 100644 --- a/bsd/netinet/udp_var.h +++ b/bsd/netinet/udp_var.h @@ -59,7 +59,6 @@ #include <sys/appleapiopts.h> #include <sys/sysctl.h> -#ifdef __APPLE_API_PRIVATE /* * UDP kernel structures and variables. @@ -79,9 +78,7 @@ struct udpiphdr { #define ui_sum ui_u.uh_sum #define ui_next ui_i.ih_next #define ui_prev ui_i.ih_prev -#endif /* __APPLE_API_PRIVATE */ -#ifdef __APPLE_API_UNSTABLE struct udpstat { /* input statistics: */ u_long udps_ipackets; /* total input packets */ @@ -102,7 +99,6 @@ struct udpstat { u_long udps_noportmcast; #endif }; -#endif /* __APPLE_API_UNSTABLE */ /* * Names for UDP sysctl objects @@ -114,6 +110,7 @@ struct udpstat { #define UDPCTL_PCBLIST 5 /* list of PCBs for UDP sockets */ #define UDPCTL_MAXID 6 +#ifdef KERNEL_PRIVATE #define UDPCTL_NAMES { \ { 0, 0 }, \ { "checksum", CTLTYPE_INT }, \ @@ -123,8 +120,6 @@ struct udpstat { { "pcblist", CTLTYPE_STRUCT }, \ } -#ifdef __APPLE_API_PRIVATE -#ifdef KERNEL SYSCTL_DECL(_net_inet_udp); extern struct pr_usrreqs udp_usrreqs; @@ -135,13 +130,20 @@ extern u_long udp_recvspace; extern struct udpstat udpstat; extern int log_in_vain; -void udp_ctlinput __P((int, struct sockaddr *, void *)); -void udp_init __P((void)); -void udp_input __P((struct mbuf *, int)); +void udp_ctlinput(int, struct sockaddr *, void *); +void udp_init(void); +void udp_input(struct mbuf *, int); -void udp_notify __P((struct inpcb *inp, int errno)); -int udp_shutdown __P((struct socket *so)); +void udp_notify(struct inpcb *inp, int errno); +int udp_shutdown(struct socket *so); +int udp_lock (struct socket *, int, int); +int udp_unlock (struct socket *, int, int); +void udp_slowtimo (void); +#ifdef _KERN_LOCKS_H_ +lck_mtx_t * udp_getlock (struct socket *, int); +#else +void * udp_getlock (struct socket *, int); #endif -#endif /* __APPLE_API_PRIVATE */ -#endif +#endif /* KERNEL_PRIVATE */ +#endif /* _NETINET_UDP_VAR_H_ */ diff --git a/bsd/netinet6/Makefile b/bsd/netinet6/Makefile index 37a4991a7..fc12c8bef 100644 --- a/bsd/netinet6/Makefile +++ b/bsd/netinet6/Makefile @@ -20,16 +20,18 @@ EXPINC_SUBDIRS_PPC = \ EXPINC_SUBDIRS_I386 = \ DATAFILES = \ - ah.h esp_rijndael.h in6_ifattach.h ip6.h \ - ip6_var.h ipsec.h pim6.h tcp6_var.h \ - ah6.h icmp6.h in6_pcb.h ip6_ecn.h \ - ip6protosw.h ipsec6.h pim6_var.h udp6_var.h \ - esp.h in6.h in6_prefix.h \ - ipcomp.h mld6_var.h raw_ip6.h esp6.h \ - in6_gif.h in6_var.h ip6_mroute.h ipcomp6.h \ - nd6.h scope6_var.h ip6_fw.h + ah.h ipsec.h pim6.h \ + esp.h in6.h ipcomp.h raw_ip6.h \ + in6_var.h ip6_mroute.h nd6.h ip6_fw.h +PRIVATE_DATAFILES = \ + in6_pcb.h ip6_var.h pim6_var.h +PRIVATE_KERNELFILES = \ + ah6.h esp6.h esp_rijndael.h in6_gif.h in6_ifattach.h \ + in6_prefix.h ip6_ecn.h ip6_fw.h \ + ip6protosw.h ipcomp6.h ipsec6.h mld6_var.h \ + raw_ip6.h scope6_var.h tcp6_var.h udp6_var.h INSTALL_MI_LIST = ${DATAFILES} @@ -37,14 +39,11 @@ INSTALL_MI_DIR = netinet6 EXPORT_MI_LIST = ${DATAFILES} -EXPORT_MI_DIR = netinet6 +EXPORT_MI_DIR = ${INSTALL_MI_DIR} -PRIVATE_DATAFILES = \ - ip6_fw.h +INSTALL_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} -INSTALL_MI_LCL_KERN_LIST = ${PRIVATE_DATAFILES} +INSTALL_KF_MI_LCL_LIST = ${INSTALL_MI_LCL_LIST} ${PRIVATE_KERNELFILES} include $(MakeInc_rule) 
include $(MakeInc_dir) - - diff --git a/bsd/netinet6/ah.h b/bsd/netinet6/ah.h index 62c5eda54..3e7f8dcf8 100644 --- a/bsd/netinet6/ah.h +++ b/bsd/netinet6/ah.h @@ -55,8 +55,7 @@ struct newah { /* variable size, 32bit bound*/ /* Authentication data */ }; -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE struct secasvar; struct ah_algorithm_state { @@ -65,29 +64,28 @@ struct ah_algorithm_state { }; struct ah_algorithm { - int (*sumsiz) __P((struct secasvar *)); - int (*mature) __P((struct secasvar *)); + int (*sumsiz)(struct secasvar *); + int (*mature)(struct secasvar *); int keymin; /* in bits */ int keymax; /* in bits */ const char *name; - int (*init) __P((struct ah_algorithm_state *, struct secasvar *)); - void (*update) __P((struct ah_algorithm_state *, caddr_t, size_t)); - void (*result) __P((struct ah_algorithm_state *, caddr_t)); + int (*init)(struct ah_algorithm_state *, struct secasvar *); + void (*update)(struct ah_algorithm_state *, caddr_t, size_t); + void (*result)(struct ah_algorithm_state *, caddr_t); }; #define AH_MAXSUMSIZE 16 -extern const struct ah_algorithm *ah_algorithm_lookup __P((int)); +extern const struct ah_algorithm *ah_algorithm_lookup(int); /* cksum routines */ -extern int ah_hdrlen __P((struct secasvar *)); +extern int ah_hdrlen(struct secasvar *); -extern size_t ah_hdrsiz __P((struct ipsecrequest *)); -extern void ah4_input __P((struct mbuf *, int)); -extern int ah4_output __P((struct mbuf *, struct ipsecrequest *)); -extern int ah4_calccksum __P((struct mbuf *, caddr_t, size_t, - const struct ah_algorithm *, struct secasvar *)); -#endif /* __APPLE_API_PRIVATE */ -#endif /*KERNEL*/ +extern size_t ah_hdrsiz(struct ipsecrequest *); +extern void ah4_input(struct mbuf *, int); +extern int ah4_output(struct mbuf *, struct ipsecrequest *); +extern int ah4_calccksum(struct mbuf *, caddr_t, size_t, + const struct ah_algorithm *, struct secasvar *); +#endif /* KERNEL_PRIVATE */ -#endif /*_NETINET6_AH_H_*/ +#endif /* _NETINET6_AH_H_ */ diff --git a/bsd/netinet6/ah6.h b/bsd/netinet6/ah6.h index 8ac8dd613..1525e3fc5 100644 --- a/bsd/netinet6/ah6.h +++ b/bsd/netinet6/ah6.h @@ -38,18 +38,16 @@ #define _NETINET6_AH6_H_ #include <sys/appleapiopts.h> -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE struct secasvar; -extern int ah6_input __P((struct mbuf **, int *)); -extern int ah6_output __P((struct mbuf *, u_char *, struct mbuf *, - struct ipsecrequest *)); -extern int ah6_calccksum __P((struct mbuf *, caddr_t, size_t, - const struct ah_algorithm *, struct secasvar *)); +extern int ah6_input(struct mbuf **, int *); +extern int ah6_output(struct mbuf *, u_char *, struct mbuf *, + struct ipsecrequest *); +extern int ah6_calccksum(struct mbuf *, caddr_t, size_t, + const struct ah_algorithm *, struct secasvar *); -extern void ah6_ctlinput __P((int, struct sockaddr *, void *)); -#endif /* __APPLE_API_PRIVATE */ -#endif +extern void ah6_ctlinput(int, struct sockaddr *, void *); +#endif /* KERNEL_PRIVATE */ #endif /*_NETINET6_AH6_H_*/ diff --git a/bsd/netinet6/ah_core.c b/bsd/netinet6/ah_core.c index 519c1b6a4..c30b9c759 100644 --- a/bsd/netinet6/ah_core.c +++ b/bsd/netinet6/ah_core.c @@ -86,57 +86,46 @@ #define HMACSIZE 16 -static int ah_sumsiz_1216 __P((struct secasvar *)); -static int ah_sumsiz_zero __P((struct secasvar *)); -static int ah_none_mature __P((struct secasvar *)); -static int ah_none_init __P((struct ah_algorithm_state *, struct secasvar *)); -static void ah_none_loop __P((struct ah_algorithm_state *, caddr_t, size_t)); -static void 
ah_none_result __P((struct ah_algorithm_state *, caddr_t)); -static int ah_keyed_md5_mature __P((struct secasvar *)); -static int ah_keyed_md5_init __P((struct ah_algorithm_state *, - struct secasvar *)); -static void ah_keyed_md5_loop __P((struct ah_algorithm_state *, caddr_t, - size_t)); -static void ah_keyed_md5_result __P((struct ah_algorithm_state *, caddr_t)); -static int ah_keyed_sha1_mature __P((struct secasvar *)); -static int ah_keyed_sha1_init __P((struct ah_algorithm_state *, - struct secasvar *)); -static void ah_keyed_sha1_loop __P((struct ah_algorithm_state *, caddr_t, - size_t)); -static void ah_keyed_sha1_result __P((struct ah_algorithm_state *, caddr_t)); -static int ah_hmac_md5_mature __P((struct secasvar *)); -static int ah_hmac_md5_init __P((struct ah_algorithm_state *, - struct secasvar *)); -static void ah_hmac_md5_loop __P((struct ah_algorithm_state *, caddr_t, - size_t)); -static void ah_hmac_md5_result __P((struct ah_algorithm_state *, caddr_t)); -static int ah_hmac_sha1_mature __P((struct secasvar *)); -static int ah_hmac_sha1_init __P((struct ah_algorithm_state *, - struct secasvar *)); -static void ah_hmac_sha1_loop __P((struct ah_algorithm_state *, caddr_t, - size_t)); -static void ah_hmac_sha1_result __P((struct ah_algorithm_state *, caddr_t)); -static int ah_hmac_sha2_256_mature __P((struct secasvar *)); -static int ah_hmac_sha2_256_init __P((struct ah_algorithm_state *, - struct secasvar *)); -static void ah_hmac_sha2_256_loop __P((struct ah_algorithm_state *, caddr_t, - size_t)); -static void ah_hmac_sha2_256_result __P((struct ah_algorithm_state *, caddr_t)); -static int ah_hmac_sha2_384_mature __P((struct secasvar *)); -static int ah_hmac_sha2_384_init __P((struct ah_algorithm_state *, - struct secasvar *)); -static void ah_hmac_sha2_384_loop __P((struct ah_algorithm_state *, caddr_t, - size_t)); -static void ah_hmac_sha2_384_result __P((struct ah_algorithm_state *, caddr_t)); -static int ah_hmac_sha2_512_mature __P((struct secasvar *)); -static int ah_hmac_sha2_512_init __P((struct ah_algorithm_state *, - struct secasvar *)); -static void ah_hmac_sha2_512_loop __P((struct ah_algorithm_state *, caddr_t, - size_t)); -static void ah_hmac_sha2_512_result __P((struct ah_algorithm_state *, caddr_t)); - -static void ah_update_mbuf __P((struct mbuf *, int, int, - const struct ah_algorithm *, struct ah_algorithm_state *)); +static int ah_sumsiz_1216(struct secasvar *); +static int ah_sumsiz_zero(struct secasvar *); +static int ah_none_mature(struct secasvar *); +static int ah_none_init(struct ah_algorithm_state *, struct secasvar *); +static void ah_none_loop(struct ah_algorithm_state *, caddr_t, size_t); +static void ah_none_result(struct ah_algorithm_state *, caddr_t); +static int ah_keyed_md5_mature(struct secasvar *); +static int ah_keyed_md5_init(struct ah_algorithm_state *, struct secasvar *); +static void ah_keyed_md5_loop(struct ah_algorithm_state *, caddr_t, size_t); +static void ah_keyed_md5_result(struct ah_algorithm_state *, caddr_t); +static int ah_keyed_sha1_mature(struct secasvar *); +static int ah_keyed_sha1_init(struct ah_algorithm_state *, struct secasvar *); +static void ah_keyed_sha1_loop(struct ah_algorithm_state *, caddr_t, size_t); +static void ah_keyed_sha1_result(struct ah_algorithm_state *, caddr_t); +static int ah_hmac_md5_mature(struct secasvar *); +static int ah_hmac_md5_init(struct ah_algorithm_state *, struct secasvar *); +static void ah_hmac_md5_loop(struct ah_algorithm_state *, caddr_t, size_t); +static void 
ah_hmac_md5_result(struct ah_algorithm_state *, caddr_t); +static int ah_hmac_sha1_mature(struct secasvar *); +static int ah_hmac_sha1_init(struct ah_algorithm_state *, struct secasvar *); +static void ah_hmac_sha1_loop(struct ah_algorithm_state *, caddr_t, size_t); +static void ah_hmac_sha1_result(struct ah_algorithm_state *, caddr_t); +static int ah_hmac_sha2_256_mature(struct secasvar *); +static int ah_hmac_sha2_256_init(struct ah_algorithm_state *, + struct secasvar *); +static void ah_hmac_sha2_256_loop(struct ah_algorithm_state *, caddr_t, size_t); +static void ah_hmac_sha2_256_result(struct ah_algorithm_state *, caddr_t); +static int ah_hmac_sha2_384_mature(struct secasvar *); +static int ah_hmac_sha2_384_init(struct ah_algorithm_state *, + struct secasvar *); +static void ah_hmac_sha2_384_loop(struct ah_algorithm_state *, caddr_t, size_t); +static void ah_hmac_sha2_384_result(struct ah_algorithm_state *, caddr_t); +static int ah_hmac_sha2_512_mature(struct secasvar *); +static int ah_hmac_sha2_512_init(struct ah_algorithm_state *, + struct secasvar *); +static void ah_hmac_sha2_512_loop(struct ah_algorithm_state *, caddr_t, size_t); +static void ah_hmac_sha2_512_result(struct ah_algorithm_state *, caddr_t); + +static void ah_update_mbuf(struct mbuf *, int, int, + const struct ah_algorithm *, struct ah_algorithm_state *); const struct ah_algorithm * ah_algorithm_lookup(idx) diff --git a/bsd/netinet6/ah_input.c b/bsd/netinet6/ah_input.c index e055cd53b..06340aa1e 100644 --- a/bsd/netinet6/ah_input.c +++ b/bsd/netinet6/ah_input.c @@ -48,8 +48,8 @@ #include <net/if.h> #include <net/route.h> -#include <net/netisr.h> #include <kern/cpu_number.h> +#include <kern/locks.h> #include <netinet/in.h> #include <netinet/in_systm.h> @@ -91,6 +91,7 @@ #define IPLEN_FLIPPED +extern lck_mtx_t *sadb_mutex; #if INET extern struct protosw inetsw[]; @@ -110,6 +111,7 @@ ah4_input(struct mbuf *m, int off) int s; size_t stripsiz = 0; + lck_mtx_lock(sadb_mutex); #ifndef PULLDOWN_TEST if (m->m_len < off + sizeof(struct newah)) { @@ -447,17 +449,9 @@ ah4_input(struct mbuf *m, int off) ipsecstat.in_nomem++; goto fail; } - - s = splimp(); - if (IF_QFULL(&ipintrq)) { - ipsecstat.in_inval++; - splx(s); - goto fail; - } - IF_ENQUEUE(&ipintrq, m); - m = NULL; - schednetisr(NETISR_IP); /*can be skipped but to make sure*/ - splx(s); + lck_mtx_unlock(sadb_mutex); + proto_input(PF_INET, m); + lck_mtx_lock(sadb_mutex); nxt = IPPROTO_DONE; } else { /* @@ -531,7 +525,9 @@ ah4_input(struct mbuf *m, int off) ipsecstat.in_polvio++; goto fail; } - (*ip_protox[nxt]->pr_input)(m, off); + lck_mtx_unlock(sadb_mutex); + ip_proto_dispatch_in(m, off, nxt, 0); + lck_mtx_lock(sadb_mutex); } else m_freem(m); m = NULL; @@ -543,6 +539,7 @@ ah4_input(struct mbuf *m, int off) key_freesav(sav); } ipsecstat.in_success++; + lck_mtx_unlock(sadb_mutex); return; fail: @@ -551,6 +548,7 @@ fail: printf("DP ah4_input call free SA:%p\n", sav)); key_freesav(sav); } + lck_mtx_unlock(sadb_mutex); if (m) m_freem(m); return; @@ -577,14 +575,17 @@ ah6_input(mp, offp) int s; size_t stripsiz = 0; + lck_mtx_lock(sadb_mutex); + #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, sizeof(struct ah), IPPROTO_DONE); + IP6_EXTHDR_CHECK(m, off, sizeof(struct ah), {lck_mtx_unlock(sadb_mutex);return IPPROTO_DONE;}); ah = (struct ah *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(ah, struct ah *, m, off, sizeof(struct newah)); if (ah == NULL) { ipseclog((LOG_DEBUG, "IPv6 AH input: can't pullup\n")); ipsec6stat.in_inval++; + lck_mtx_unlock(sadb_mutex); return 
IPPROTO_DONE; } #endif @@ -662,7 +663,8 @@ ah6_input(mp, offp) goto fail; } #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, sizeof(struct ah) + sizoff + siz1, IPPROTO_DONE); + IP6_EXTHDR_CHECK(m, off, sizeof(struct ah) + sizoff + siz1, + {lck_mtx_unlock(sadb_mutex);return IPPROTO_DONE;}); #else IP6_EXTHDR_GET(ah, struct ah *, m, off, sizeof(struct ah) + sizoff + siz1); @@ -748,7 +750,8 @@ ah6_input(mp, offp) sizoff = (sav->flags & SADB_X_EXT_OLD) ? 0 : 4; IP6_EXTHDR_CHECK(m, off, sizeof(struct ah) + sizoff + siz1 - + sizeof(struct ip6_hdr), IPPROTO_DONE); + + sizeof(struct ip6_hdr), + {lck_mtx_unlock(sadb_mutex);return IPPROTO_DONE;}); nip6 = (struct ip6_hdr *)((u_char *)(ah + 1) + sizoff + siz1); if (!IN6_ARE_ADDR_EQUAL(&nip6->ip6_src, &ip6->ip6_src) @@ -849,17 +852,9 @@ ah6_input(mp, offp) ipsec6stat.in_nomem++; goto fail; } - - s = splimp(); - if (IF_QFULL(&ip6intrq)) { - ipsec6stat.in_inval++; - splx(s); - goto fail; - } - IF_ENQUEUE(&ip6intrq, m); - m = NULL; - schednetisr(NETISR_IPV6); /* can be skipped but to make sure */ - splx(s); + lck_mtx_unlock(sadb_mutex); + proto_input(PF_INET6, m); + lck_mtx_lock(sadb_mutex); nxt = IPPROTO_DONE; } else { /* @@ -933,6 +928,7 @@ ah6_input(mp, offp) key_freesav(sav); } ipsec6stat.in_success++; + lck_mtx_unlock(sadb_mutex); return nxt; fail: @@ -941,6 +937,7 @@ fail: printf("DP ah6_input call free SA:%p\n", sav)); key_freesav(sav); } + lck_mtx_unlock(sadb_mutex); if (m) m_freem(m); return IPPROTO_DONE; @@ -1007,6 +1004,7 @@ ah6_ctlinput(cmd, sa, d) */ sa6_src = ip6cp->ip6c_src; sa6_dst = (struct sockaddr_in6 *)sa; + lck_mtx_lock(sadb_mutex); sav = key_allocsa(AF_INET6, (caddr_t)&sa6_src->sin6_addr, (caddr_t)&sa6_dst->sin6_addr, @@ -1017,6 +1015,7 @@ ah6_ctlinput(cmd, sa, d) valid++; key_freesav(sav); } + lck_mtx_unlock(sadb_mutex); /* XXX Further validation? 
*/ diff --git a/bsd/netinet6/ah_output.c b/bsd/netinet6/ah_output.c index 5a4f92cff..4a7940c46 100644 --- a/bsd/netinet6/ah_output.c +++ b/bsd/netinet6/ah_output.c @@ -78,7 +78,7 @@ #include <net/net_osdep.h> #if INET -static struct in_addr *ah4_finaldst __P((struct mbuf *)); +static struct in_addr *ah4_finaldst(struct mbuf *); #endif /* diff --git a/bsd/netinet6/dest6.c b/bsd/netinet6/dest6.c index 8127ebeae..ae7a18b8a 100644 --- a/bsd/netinet6/dest6.c +++ b/bsd/netinet6/dest6.c @@ -68,7 +68,7 @@ dest6_input(mp, offp) /* validation of the length of the header */ #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, sizeof(*dstopts), IPPROTO_DONE); + IP6_EXTHDR_CHECK(m, off, sizeof(*dstopts), return IPPROTO_DONE); dstopts = (struct ip6_dest *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(dstopts, struct ip6_dest *, m, off, sizeof(*dstopts)); @@ -78,7 +78,7 @@ dest6_input(mp, offp) dstoptlen = (dstopts->ip6d_len + 1) << 3; #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, dstoptlen, IPPROTO_DONE); + IP6_EXTHDR_CHECK(m, off, dstoptlen, return IPPROTO_DONE); dstopts = (struct ip6_dest *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(dstopts, struct ip6_dest *, m, off, dstoptlen); @@ -107,7 +107,7 @@ dest6_input(mp, offp) default: /* unknown option */ optlen = ip6_unknown_opt(opt, m, - opt - mtod(m, u_int8_t *)); + opt - mtod(m, u_int8_t *), 0); if (optlen == -1) return (IPPROTO_DONE); optlen += 2; diff --git a/bsd/netinet6/esp.h b/bsd/netinet6/esp.h index 295048940..1d9d0c0ba 100644 --- a/bsd/netinet6/esp.h +++ b/bsd/netinet6/esp.h @@ -67,43 +67,41 @@ struct esptail { /*variable size, 32bit bound*/ /* Authentication data (new IPsec)*/ }; -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE struct secasvar; struct esp_algorithm { size_t padbound; /* pad boundary, in byte */ int ivlenval; /* iv length, in byte */ - int (*mature) __P((struct secasvar *)); + int (*mature)(struct secasvar *); int keymin; /* in bits */ int keymax; /* in bits */ - int (*schedlen) __P((const struct esp_algorithm *)); + int (*schedlen)(const struct esp_algorithm *); const char *name; - int (*ivlen) __P((const struct esp_algorithm *, struct secasvar *)); - int (*decrypt) __P((struct mbuf *, size_t, - struct secasvar *, const struct esp_algorithm *, int)); - int (*encrypt) __P((struct mbuf *, size_t, size_t, - struct secasvar *, const struct esp_algorithm *, int)); + int (*ivlen)(const struct esp_algorithm *, struct secasvar *); + int (*decrypt)(struct mbuf *, size_t, + struct secasvar *, const struct esp_algorithm *, int); + int (*encrypt)(struct mbuf *, size_t, size_t, + struct secasvar *, const struct esp_algorithm *, int); /* not supposed to be called directly */ - int (*schedule) __P((const struct esp_algorithm *, struct secasvar *)); - int (*blockdecrypt) __P((const struct esp_algorithm *, - struct secasvar *, u_int8_t *, u_int8_t *)); - int (*blockencrypt) __P((const struct esp_algorithm *, - struct secasvar *, u_int8_t *, u_int8_t *)); + int (*schedule)(const struct esp_algorithm *, struct secasvar *); + int (*blockdecrypt)(const struct esp_algorithm *, + struct secasvar *, u_int8_t *, u_int8_t *); + int (*blockencrypt)(const struct esp_algorithm *, + struct secasvar *, u_int8_t *, u_int8_t *); }; -extern const struct esp_algorithm *esp_algorithm_lookup __P((int)); -extern int esp_max_ivlen __P((void)); +extern const struct esp_algorithm *esp_algorithm_lookup(int); +extern int esp_max_ivlen(void); /* crypt routines */ -extern int esp4_output __P((struct mbuf *, struct ipsecrequest *)); -extern void 
esp4_input __P((struct mbuf *, int off)); -extern size_t esp_hdrsiz __P((struct ipsecrequest *)); +extern int esp4_output(struct mbuf *, struct ipsecrequest *); +extern void esp4_input(struct mbuf *, int off); +extern size_t esp_hdrsiz(struct ipsecrequest *); -extern int esp_schedule __P((const struct esp_algorithm *, struct secasvar *)); -extern int esp_auth __P((struct mbuf *, size_t, size_t, - struct secasvar *, u_char *)); -#endif /* __APPLE_API_PRIVATE */ -#endif /*KERNEL*/ +extern int esp_schedule(const struct esp_algorithm *, struct secasvar *); +extern int esp_auth(struct mbuf *, size_t, size_t, + struct secasvar *, u_char *); +#endif /* KERNEL_PRIVATE */ -#endif /*_NETINET6_ESP_H_*/ +#endif /* _NETINET6_ESP_H_ */ diff --git a/bsd/netinet6/esp6.h b/bsd/netinet6/esp6.h index 74b5acc91..bb2c20529 100644 --- a/bsd/netinet6/esp6.h +++ b/bsd/netinet6/esp6.h @@ -38,14 +38,12 @@ #define _NETINET6_ESP6_H_ #include <sys/appleapiopts.h> -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -extern int esp6_output __P((struct mbuf *, u_char *, struct mbuf *, - struct ipsecrequest *)); -extern int esp6_input __P((struct mbuf **, int *)); +#ifdef KERNEL_PRIVATE +extern int esp6_output(struct mbuf *, u_char *, struct mbuf *, + struct ipsecrequest *); +extern int esp6_input(struct mbuf **, int *); -extern void esp6_ctlinput __P((int, struct sockaddr *, void *)); -#endif /* __APPLE_API_PRIVATE */ -#endif /*_KERNEL*/ +extern void esp6_ctlinput(int, struct sockaddr *, void *); +#endif /* KERNEL_PRIVATE */ -#endif /*_NETINET6_ESP6_H_*/ +#endif /* _NETINET6_ESP6_H_ */ diff --git a/bsd/netinet6/esp_core.c b/bsd/netinet6/esp_core.c index 7b8b124c6..21daa3f41 100644 --- a/bsd/netinet6/esp_core.c +++ b/bsd/netinet6/esp_core.c @@ -44,6 +44,8 @@ #include <sys/kernel.h> #include <sys/syslog.h> +#include <kern/locks.h> + #include <net/if.h> #include <net/route.h> @@ -82,49 +84,51 @@ #define DBG_LAYER_END NETDBG_CODE(DBG_NETIPSEC, 3) #define DBG_FNC_ESPAUTH NETDBG_CODE(DBG_NETIPSEC, (8 << 8)) -static int esp_null_mature __P((struct secasvar *)); -static int esp_null_decrypt __P((struct mbuf *, size_t, - struct secasvar *, const struct esp_algorithm *, int)); -static int esp_null_encrypt __P((struct mbuf *, size_t, size_t, - struct secasvar *, const struct esp_algorithm *, int)); -static int esp_descbc_mature __P((struct secasvar *)); -static int esp_descbc_ivlen __P((const struct esp_algorithm *, - struct secasvar *)); -static int esp_des_schedule __P((const struct esp_algorithm *, - struct secasvar *)); -static int esp_des_schedlen __P((const struct esp_algorithm *)); -static int esp_des_blockdecrypt __P((const struct esp_algorithm *, - struct secasvar *, u_int8_t *, u_int8_t *)); -static int esp_des_blockencrypt __P((const struct esp_algorithm *, - struct secasvar *, u_int8_t *, u_int8_t *)); -static int esp_cbc_mature __P((struct secasvar *)); -static int esp_blowfish_schedule __P((const struct esp_algorithm *, - struct secasvar *)); -static int esp_blowfish_schedlen __P((const struct esp_algorithm *)); -static int esp_blowfish_blockdecrypt __P((const struct esp_algorithm *, - struct secasvar *, u_int8_t *, u_int8_t *)); -static int esp_blowfish_blockencrypt __P((const struct esp_algorithm *, - struct secasvar *, u_int8_t *, u_int8_t *)); -static int esp_cast128_schedule __P((const struct esp_algorithm *, - struct secasvar *)); -static int esp_cast128_schedlen __P((const struct esp_algorithm *)); -static int esp_cast128_blockdecrypt __P((const struct esp_algorithm *, - struct secasvar *, u_int8_t *, u_int8_t *)); -static int 
esp_cast128_blockencrypt __P((const struct esp_algorithm *, - struct secasvar *, u_int8_t *, u_int8_t *)); -static int esp_3des_schedule __P((const struct esp_algorithm *, - struct secasvar *)); -static int esp_3des_schedlen __P((const struct esp_algorithm *)); -static int esp_3des_blockdecrypt __P((const struct esp_algorithm *, - struct secasvar *, u_int8_t *, u_int8_t *)); -static int esp_3des_blockencrypt __P((const struct esp_algorithm *, - struct secasvar *, u_int8_t *, u_int8_t *)); -static int esp_common_ivlen __P((const struct esp_algorithm *, - struct secasvar *)); -static int esp_cbc_decrypt __P((struct mbuf *, size_t, - struct secasvar *, const struct esp_algorithm *, int)); -static int esp_cbc_encrypt __P((struct mbuf *, size_t, size_t, - struct secasvar *, const struct esp_algorithm *, int)); +extern lck_mtx_t *sadb_mutex; + +static int esp_null_mature(struct secasvar *); +static int esp_null_decrypt(struct mbuf *, size_t, + struct secasvar *, const struct esp_algorithm *, int); +static int esp_null_encrypt(struct mbuf *, size_t, size_t, + struct secasvar *, const struct esp_algorithm *, int); +static int esp_descbc_mature(struct secasvar *); +static int esp_descbc_ivlen(const struct esp_algorithm *, + struct secasvar *); +static int esp_des_schedule(const struct esp_algorithm *, + struct secasvar *); +static int esp_des_schedlen(const struct esp_algorithm *); +static int esp_des_blockdecrypt(const struct esp_algorithm *, + struct secasvar *, u_int8_t *, u_int8_t *); +static int esp_des_blockencrypt(const struct esp_algorithm *, + struct secasvar *, u_int8_t *, u_int8_t *); +static int esp_cbc_mature(struct secasvar *); +static int esp_blowfish_schedule(const struct esp_algorithm *, + struct secasvar *); +static int esp_blowfish_schedlen(const struct esp_algorithm *); +static int esp_blowfish_blockdecrypt(const struct esp_algorithm *, + struct secasvar *, u_int8_t *, u_int8_t *); +static int esp_blowfish_blockencrypt(const struct esp_algorithm *, + struct secasvar *, u_int8_t *, u_int8_t *); +static int esp_cast128_schedule(const struct esp_algorithm *, + struct secasvar *); +static int esp_cast128_schedlen(const struct esp_algorithm *); +static int esp_cast128_blockdecrypt(const struct esp_algorithm *, + struct secasvar *, u_int8_t *, u_int8_t *); +static int esp_cast128_blockencrypt(const struct esp_algorithm *, + struct secasvar *, u_int8_t *, u_int8_t *); +static int esp_3des_schedule(const struct esp_algorithm *, + struct secasvar *); +static int esp_3des_schedlen(const struct esp_algorithm *); +static int esp_3des_blockdecrypt(const struct esp_algorithm *, + struct secasvar *, u_int8_t *, u_int8_t *); +static int esp_3des_blockencrypt(const struct esp_algorithm *, + struct secasvar *, u_int8_t *, u_int8_t *); +static int esp_common_ivlen(const struct esp_algorithm *, + struct secasvar *); +static int esp_cbc_decrypt(struct mbuf *, size_t, + struct secasvar *, const struct esp_algorithm *, int); +static int esp_cbc_encrypt(struct mbuf *, size_t, size_t, + struct secasvar *, const struct esp_algorithm *, int); #define MAXIVLEN 16 @@ -151,11 +155,11 @@ static const struct esp_algorithm esp_algorithms[] = { esp_common_ivlen, esp_cbc_decrypt, esp_cbc_encrypt, esp_cast128_schedule, esp_cast128_blockdecrypt, esp_cast128_blockencrypt, }, - { 16, 16, esp_cbc_mature, 128, 256, esp_rijndael_schedlen, - "rijndael-cbc", - esp_common_ivlen, esp_cbc_decrypt, - esp_cbc_encrypt, esp_rijndael_schedule, - esp_rijndael_blockdecrypt, esp_rijndael_blockencrypt }, + { 16, 16, esp_cbc_mature, 
128, 256, esp_aes_schedlen, + "aes-cbc", + esp_common_ivlen, esp_cbc_decrypt_aes, + esp_cbc_encrypt_aes, esp_aes_schedule, + 0, 0 }, }; const struct esp_algorithm * @@ -819,10 +823,6 @@ esp_cbc_decrypt(m, off, sav, algo, ivlen) soff += s->m_len; s = s->m_next; } - - /* skip over empty mbuf */ - while (s && s->m_len == 0) - s = s->m_next; } m_freem(scut->m_next); @@ -1025,10 +1025,6 @@ esp_cbc_encrypt(m, off, plen, sav, algo, ivlen) soff += s->m_len; s = s->m_next; } - - /* skip over empty mbuf */ - while (s && s->m_len == 0) - s = s->m_next; } m_freem(scut->m_next); @@ -1129,7 +1125,7 @@ esp_auth(m0, skip, length, sav, sum) KERNEL_DEBUG(DBG_FNC_ESPAUTH | DBG_FUNC_END, 5,0,0,0,0); return error; } - + lck_mtx_unlock(sadb_mutex); while (0 < length) { if (!m) panic("mbuf chain?"); @@ -1147,7 +1143,7 @@ esp_auth(m0, skip, length, sav, sum) } (*algo->result)(&s, sumbuf); bcopy(sumbuf, sum, siz); /*XXX*/ - + lck_mtx_lock(sadb_mutex); KERNEL_DEBUG(DBG_FNC_ESPAUTH | DBG_FUNC_END, 6,0,0,0,0); return 0; } diff --git a/bsd/netinet6/esp_input.c b/bsd/netinet6/esp_input.c index 463a4182d..4b080c6b4 100644 --- a/bsd/netinet6/esp_input.c +++ b/bsd/netinet6/esp_input.c @@ -48,8 +48,8 @@ #include <net/if.h> #include <net/route.h> -#include <net/netisr.h> #include <kern/cpu_number.h> +#include <kern/locks.h> #include <netinet/in.h> #include <netinet/in_systm.h> @@ -96,6 +96,7 @@ #define DBG_FNC_DECRYPT NETDBG_CODE(DBG_NETIPSEC, (7 << 8)) #define IPLEN_FLIPPED +extern lck_mtx_t *sadb_mutex; #if INET extern struct protosw inetsw[]; @@ -121,6 +122,8 @@ esp4_input(m, off) size_t esplen; int s; + lck_mtx_lock(sadb_mutex); + KERNEL_DEBUG(DBG_FNC_ESPIN | DBG_FUNC_START, 0,0,0,0,0); /* sanity check for alignment. */ if (off % 4 != 0 || m->m_pkthdr.len % 4 != 0) { @@ -397,16 +400,9 @@ noreplaycheck: /* Clear the csum flags, they can't be valid for the inner headers */ m->m_pkthdr.csum_flags = 0; - s = splimp(); - if (IF_QFULL(&ipintrq)) { - ipsecstat.in_inval++; - splx(s); - goto bad; - } - IF_ENQUEUE(&ipintrq, m); - m = NULL; - schednetisr(NETISR_IP); /*can be skipped but to make sure*/ - splx(s); + lck_mtx_unlock(sadb_mutex); + proto_input(PF_INET, m); + lck_mtx_lock(sadb_mutex); nxt = IPPROTO_DONE; KERNEL_DEBUG(DBG_FNC_ESPIN | DBG_FUNC_END, 2,0,0,0,0); } else { @@ -457,7 +453,9 @@ noreplaycheck: goto bad; } KERNEL_DEBUG(DBG_FNC_ESPIN | DBG_FUNC_END, 3,0,0,0,0); - (*ip_protox[nxt]->pr_input)(m, off); + lck_mtx_unlock(sadb_mutex); + ip_proto_dispatch_in(m, off, nxt, 0); + lck_mtx_lock(sadb_mutex); } else m_freem(m); m = NULL; @@ -469,6 +467,7 @@ noreplaycheck: key_freesav(sav); } ipsecstat.in_success++; + lck_mtx_unlock(sadb_mutex); return; bad: @@ -477,6 +476,7 @@ bad: printf("DP esp4_input call free SA:%p\n", sav)); key_freesav(sav); } + lck_mtx_unlock(sadb_mutex); if (m) m_freem(m); KERNEL_DEBUG(DBG_FNC_ESPIN | DBG_FUNC_END, 4,0,0,0,0); @@ -504,6 +504,8 @@ esp6_input(mp, offp) size_t esplen; int s; + lck_mtx_lock(sadb_mutex); + /* sanity check for alignment. 
*/ if (off % 4 != 0 || m->m_pkthdr.len % 4 != 0) { ipseclog((LOG_ERR, "IPv6 ESP input: packet alignment problem " @@ -513,12 +515,13 @@ esp6_input(mp, offp) } #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, ESPMAXLEN, IPPROTO_DONE); + IP6_EXTHDR_CHECK(m, off, ESPMAXLEN, {lck_mtx_unlock(sadb_mutex); return IPPROTO_DONE;}); esp = (struct esp *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(esp, struct esp *, m, off, ESPMAXLEN); if (esp == NULL) { ipsec6stat.in_inval++; + lck_mtx_unlock(sadb_mutex); return IPPROTO_DONE; } #endif @@ -672,7 +675,7 @@ noreplaycheck: } #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, esplen + ivlen, IPPROTO_DONE); /*XXX*/ + IP6_EXTHDR_CHECK(m, off, esplen + ivlen, return IPPROTO_DONE); /*XXX*/ #else IP6_EXTHDR_GET(esp, struct esp *, m, off, esplen + ivlen); if (esp == NULL) { @@ -776,17 +779,9 @@ noreplaycheck: ipsec6stat.in_nomem++; goto bad; } - - s = splimp(); - if (IF_QFULL(&ip6intrq)) { - ipsec6stat.in_inval++; - splx(s); - goto bad; - } - IF_ENQUEUE(&ip6intrq, m); - m = NULL; - schednetisr(NETISR_IPV6); /*can be skipped but to make sure*/ - splx(s); + lck_mtx_unlock(sadb_mutex); + proto_input(PF_INET6, m); + lck_mtx_lock(sadb_mutex); nxt = IPPROTO_DONE; } else { /* @@ -894,6 +889,7 @@ noreplaycheck: key_freesav(sav); } ipsec6stat.in_success++; + lck_mtx_unlock(sadb_mutex); return nxt; bad: @@ -902,6 +898,7 @@ bad: printf("DP esp6_input call free SA:%p\n", sav)); key_freesav(sav); } + lck_mtx_unlock(sadb_mutex); if (m) m_freem(m); return IPPROTO_DONE; @@ -986,6 +983,7 @@ esp6_ctlinput(cmd, sa, d) */ sa6_src = ip6cp->ip6c_src; sa6_dst = (struct sockaddr_in6 *)sa; + lck_mtx_lock(sadb_mutex); sav = key_allocsa(AF_INET6, (caddr_t)&sa6_src->sin6_addr, (caddr_t)&sa6_dst->sin6_addr, @@ -996,6 +994,7 @@ esp6_ctlinput(cmd, sa, d) valid++; key_freesav(sav); } + lck_mtx_unlock(sadb_mutex); /* XXX Further validation? */ diff --git a/bsd/netinet6/esp_output.c b/bsd/netinet6/esp_output.c index b3dc4d22c..b1b00cda2 100644 --- a/bsd/netinet6/esp_output.c +++ b/bsd/netinet6/esp_output.c @@ -87,8 +87,8 @@ #define DBG_FNC_ESPOUT NETDBG_CODE(DBG_NETIPSEC, (4 << 8)) #define DBG_FNC_ENCRYPT NETDBG_CODE(DBG_NETIPSEC, (5 << 8)) -static int esp_output __P((struct mbuf *, u_char *, struct mbuf *, - struct ipsecrequest *, int)); +static int esp_output(struct mbuf *, u_char *, struct mbuf *, + struct ipsecrequest *, int); extern int esp_udp_encap_port; extern u_int32_t natt_now; diff --git a/bsd/netinet6/esp_rijndael.c b/bsd/netinet6/esp_rijndael.c index fa35c593c..f2ebe936d 100644 --- a/bsd/netinet6/esp_rijndael.c +++ b/bsd/netinet6/esp_rijndael.c @@ -34,6 +34,10 @@ #include <sys/systm.h> #include <sys/socket.h> #include <sys/queue.h> +#include <sys/syslog.h> +#include <sys/mbuf.h> + +#include <kern/locks.h> #include <net/if.h> #include <net/route.h> @@ -42,72 +46,381 @@ #include <netinet6/esp.h> #include <netinet6/esp_rijndael.h> -#include <crypto/rijndael/rijndael.h> +#include <crypto/aes/aes.h> #include <net/net_osdep.h> -/* as rijndael uses assymetric scheduled keys, we need to do it twice. 
*/ +#define AES_BLOCKLEN 16 + +extern lck_mtx_t *sadb_mutex; + int -esp_rijndael_schedlen(algo) +esp_aes_schedlen(algo) const struct esp_algorithm *algo; { - return sizeof(keyInstance) * 2; + return sizeof(aes_ctx); } int -esp_rijndael_schedule(algo, sav) +esp_aes_schedule(algo, sav) const struct esp_algorithm *algo; struct secasvar *sav; { - keyInstance *k; - - k = (keyInstance *)sav->sched; - if (rijndael_makeKey(&k[0], DIR_DECRYPT, _KEYLEN(sav->key_enc) * 8, - _KEYBUF(sav->key_enc)) < 0) - return -1; - if (rijndael_makeKey(&k[1], DIR_ENCRYPT, _KEYLEN(sav->key_enc) * 8, - _KEYBUF(sav->key_enc)) < 0) - return -1; + aes_ctx *ctx = (aes_ctx*)sav->sched; + + gen_tabs(); + aes_decrypt_key(_KEYBUF(sav->key_enc), _KEYLEN(sav->key_enc), &ctx->decrypt); + aes_encrypt_key(_KEYBUF(sav->key_enc), _KEYLEN(sav->key_enc), &ctx->encrypt); + return 0; } + +/* The following two functions decrypt or encrypt the contents of + * the mbuf chain passed in, keeping the IP and ESP headers in place, + * along with the IV. + * The code attempts to call the crypto code with the largest chunk + * of data it can based on the amount of source data in + * the current source mbuf and the space remaining in the current + * destination mbuf. The crypto code requires data to be a multiple + * of 16 bytes. A separate buffer is used when a 16 byte block spans + * mbufs. + * + * m = mbuf chain + * off = offset to ESP header + * + * local vars for source: + * soff = offset from beginning of the chain to the head of the + * current mbuf. + * scut = last mbuf that contains headers to be retained + * scutoff = offset to end of the headers in scut + * s = the current mbuf + * sn = current offset to data in s (next source data to process) + * + * local vars for dest: + * d0 = head of chain + * d = current mbuf + * dn = current offset in d (next location to store result) + */ + + int -esp_rijndael_blockdecrypt(algo, sav, s, d) - const struct esp_algorithm *algo; +esp_cbc_decrypt_aes(m, off, sav, algo, ivlen) + struct mbuf *m; + size_t off; struct secasvar *sav; - u_int8_t *s; - u_int8_t *d; + const struct esp_algorithm *algo; + int ivlen; { - cipherInstance c; - keyInstance *p; - - /* does not take advantage of CBC mode support */ - bzero(&c, sizeof(c)); - if (rijndael_cipherInit(&c, MODE_ECB, NULL) < 0) - return -1; - p = (keyInstance *)sav->sched; - if (rijndael_blockDecrypt(&c, &p[0], s, algo->padbound * 8, d) < 0) - return -1; + struct mbuf *s; + struct mbuf *d, *d0, *dp; + int soff; /* offset from the head of chain, to head of this mbuf */ + int sn, dn; /* offset from the head of the mbuf, to meat */ + size_t ivoff, bodyoff; + u_int8_t iv[AES_BLOCKLEN], *dptr; + u_int8_t sbuf[AES_BLOCKLEN], *sp; + struct mbuf *scut; + int scutoff; + int i, len; + + + if (ivlen != AES_BLOCKLEN) { + ipseclog((LOG_ERR, "esp_cbc_decrypt %s: " + "unsupported ivlen %d\n", algo->name, ivlen)); + m_freem(m); + return EINVAL; + } + + if (sav->flags & SADB_X_EXT_OLD) { + /* RFC 1827 */ + ivoff = off + sizeof(struct esp); + bodyoff = off + sizeof(struct esp) + ivlen; + } else { + ivoff = off + sizeof(struct newesp); + bodyoff = off + sizeof(struct newesp) + ivlen; + } + + if (m->m_pkthdr.len < bodyoff) { + ipseclog((LOG_ERR, "esp_cbc_decrypt %s: bad len %d/%lu\n", + algo->name, m->m_pkthdr.len, (unsigned long)bodyoff)); + m_freem(m); + return EINVAL; + } + if ((m->m_pkthdr.len - bodyoff) % AES_BLOCKLEN) { + ipseclog((LOG_ERR, "esp_cbc_decrypt %s: " + "payload length must be multiple of %d\n", + algo->name, AES_BLOCKLEN)); + m_freem(m); + return EINVAL; + } 
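/* (A worked example of the chunking below -- illustrative only; the
 * mbuf sizes are hypothetical and not taken from this patch.  Assume
 * the new ESP format with off = 20, so ivoff = 28 and bodyoff = 44,
 * and a chain of three mbufs of 56, 20 and 96 bytes: 172 bytes total,
 * leaving 128 payload bytes = 8 AES blocks.  The skip loop stops in
 * the first mbuf with sn = 44; that mbuf holds only 12 payload bytes,
 * so the first block straddles mbufs 1 and 2 and is gathered into
 * sbuf with m_copydata() and ciphered alone.  The next block lies
 * entirely within mbuf 2 (sn = 4, 16 contiguous bytes) and is passed
 * directly.  The remaining 96 bytes are contiguous in mbuf 3 and go
 * to aes_decrypt_cbc() in a single 6-block call.  Note that for
 * decryption the next IV is copied from the source (ciphertext)
 * chunk before the offsets advance, since the results land in a
 * freshly allocated output chain.) */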
+ + /* grab iv */ + m_copydata(m, ivoff, ivlen, iv); + + lck_mtx_unlock(sadb_mutex); + s = m; + soff = sn = dn = 0; + d = d0 = dp = NULL; + sp = dptr = NULL; + + /* skip header/IV offset */ + while (soff < bodyoff) { + if (soff + s->m_len > bodyoff) { + sn = bodyoff - soff; + break; + } + + soff += s->m_len; + s = s->m_next; + } + scut = s; + scutoff = sn; + + /* skip over empty mbuf */ + while (s && s->m_len == 0) + s = s->m_next; + + while (soff < m->m_pkthdr.len) { + /* source */ + if (sn + AES_BLOCKLEN <= s->m_len) { + /* body is continuous */ + sp = mtod(s, u_int8_t *) + sn; + len = s->m_len - sn; + len -= len % AES_BLOCKLEN; // full blocks only + } else { + /* body is non-continuous */ + m_copydata(s, sn, AES_BLOCKLEN, sbuf); + sp = sbuf; + len = AES_BLOCKLEN; // 1 block only in sbuf + } + + /* destination */ + if (!d || dn + AES_BLOCKLEN > d->m_len) { + if (d) + dp = d; + MGET(d, M_DONTWAIT, MT_DATA); + i = m->m_pkthdr.len - (soff + sn); + if (d && i > MLEN) { + MCLGET(d, M_DONTWAIT); + if ((d->m_flags & M_EXT) == 0) { + m_free(d); + d = NULL; + } + } + if (!d) { + m_freem(m); + if (d0) + m_freem(d0); + lck_mtx_lock(sadb_mutex); + return ENOBUFS; + } + if (!d0) + d0 = d; + if (dp) + dp->m_next = d; + d->m_len = M_TRAILINGSPACE(d); + d->m_len -= d->m_len % AES_BLOCKLEN; + if (d->m_len > i) + d->m_len = i; + dptr = mtod(d, u_int8_t *); + dn = 0; + } + + /* adjust len if greater than space available in dest */ + if (len > d->m_len - dn) + len = d->m_len - dn; + + /* decrypt */ + aes_decrypt_cbc(sp, iv, len >> 4, dptr + dn, + (aes_decrypt_ctx*)(&(((aes_ctx*)sav->sched)->decrypt))); + + /* update offsets */ + sn += len; + dn += len; + + // next iv + bcopy(sp + len - AES_BLOCKLEN, iv, AES_BLOCKLEN); + + /* find the next source block */ + while (s && sn >= s->m_len) { + sn -= s->m_len; + soff += s->m_len; + s = s->m_next; + } + + } + + /* free un-needed source mbufs and add dest mbufs to chain */ + m_freem(scut->m_next); + scut->m_len = scutoff; + scut->m_next = d0; + + /* just in case */ + bzero(iv, sizeof(iv)); + bzero(sbuf, sizeof(sbuf)); + lck_mtx_lock(sadb_mutex); + return 0; } int -esp_rijndael_blockencrypt(algo, sav, s, d) - const struct esp_algorithm *algo; +esp_cbc_encrypt_aes(m, off, plen, sav, algo, ivlen) + struct mbuf *m; + size_t off; + size_t plen; struct secasvar *sav; - u_int8_t *s; - u_int8_t *d; + const struct esp_algorithm *algo; + int ivlen; { - cipherInstance c; - keyInstance *p; - - /* does not take advantage of CBC mode support */ - bzero(&c, sizeof(c)); - if (rijndael_cipherInit(&c, MODE_ECB, NULL) < 0) - return -1; - p = (keyInstance *)sav->sched; - if (rijndael_blockEncrypt(&c, &p[1], s, algo->padbound * 8, d) < 0) - return -1; + struct mbuf *s; + struct mbuf *d, *d0, *dp; + int soff, doff; /* offset from the head of chain, to head of this mbuf */ + int sn, dn; /* offset from the head of the mbuf, to meat */ + size_t ivoff, bodyoff; + u_int8_t *ivp, *dptr; + u_int8_t sbuf[AES_BLOCKLEN], *sp; + struct mbuf *scut; + int scutoff; + int i, len; + + if (ivlen != AES_BLOCKLEN) { + ipseclog((LOG_ERR, "esp_cbc_encrypt %s: " + "unsupported ivlen %d\n", algo->name, ivlen)); + m_freem(m); + return EINVAL; + } + + if (sav->flags & SADB_X_EXT_OLD) { + /* RFC 1827 */ + ivoff = off + sizeof(struct esp); + bodyoff = off + sizeof(struct esp) + ivlen; + } else { + ivoff = off + sizeof(struct newesp); + bodyoff = off + sizeof(struct newesp) + ivlen; + } + + /* put iv into the packet */ + m_copyback(m, ivoff, ivlen, sav->iv); + ivp = sav->iv; + + if (m->m_pkthdr.len < bodyoff) { + 
ipseclog((LOG_ERR, "esp_cbc_encrypt %s: bad len %d/%lu\n", + algo->name, m->m_pkthdr.len, (unsigned long)bodyoff)); + m_freem(m); + return EINVAL; + } + if ((m->m_pkthdr.len - bodyoff) % AES_BLOCKLEN) { + ipseclog((LOG_ERR, "esp_cbc_encrypt %s: " + "payload length must be multiple of %d\n", + algo->name, AES_BLOCKLEN)); + m_freem(m); + return EINVAL; + } + lck_mtx_unlock(sadb_mutex); + + s = m; + soff = sn = dn = 0; + d = d0 = dp = NULL; + sp = dptr = NULL; + + /* skip headers/IV */ + while (soff < bodyoff) { + if (soff + s->m_len > bodyoff) { + sn = bodyoff - soff; + break; + } + + soff += s->m_len; + s = s->m_next; + } + scut = s; + scutoff = sn; + + /* skip over empty mbuf */ + while (s && s->m_len == 0) + s = s->m_next; + + while (soff < m->m_pkthdr.len) { + /* source */ + if (sn + AES_BLOCKLEN <= s->m_len) { + /* body is continuous */ + sp = mtod(s, u_int8_t *) + sn; + len = s->m_len - sn; + len -= len % AES_BLOCKLEN; // full blocks only + } else { + /* body is non-continuous */ + m_copydata(s, sn, AES_BLOCKLEN, sbuf); + sp = sbuf; + len = AES_BLOCKLEN; // 1 block only in sbuf + } + + /* destination */ + if (!d || dn + AES_BLOCKLEN > d->m_len) { + if (d) + dp = d; + MGET(d, M_DONTWAIT, MT_DATA); + i = m->m_pkthdr.len - (soff + sn); + if (d && i > MLEN) { + MCLGET(d, M_DONTWAIT); + if ((d->m_flags & M_EXT) == 0) { + m_free(d); + d = NULL; + } + } + if (!d) { + m_freem(m); + if (d0) + m_freem(d0); + lck_mtx_lock(sadb_mutex); + return ENOBUFS; + } + if (!d0) + d0 = d; + if (dp) + dp->m_next = d; + + d->m_len = M_TRAILINGSPACE(d); + d->m_len -= d->m_len % AES_BLOCKLEN; + if (d->m_len > i) + d->m_len = i; + dptr = mtod(d, u_int8_t *); + dn = 0; + } + + /* adjust len if greater than space available */ + if (len > d->m_len - dn) + len = d->m_len - dn; + + /* encrypt */ + aes_encrypt_cbc(sp, ivp, len >> 4, dptr + dn, + (aes_encrypt_ctx*)(&(((aes_ctx*)sav->sched)->encrypt))); + + /* update offsets */ + sn += len; + dn += len; + + /* next iv */ + ivp = dptr + dn - AES_BLOCKLEN; // last block encrypted + + /* find the next source block and skip empty mbufs */ + while (s && sn >= s->m_len) { + sn -= s->m_len; + soff += s->m_len; + s = s->m_next; + } + + } + + /* free un-needed source mbufs and add dest mbufs to chain */ + m_freem(scut->m_next); + scut->m_len = scutoff; + scut->m_next = d0; + + /* just in case */ + bzero(sbuf, sizeof(sbuf)); + lck_mtx_lock(sadb_mutex); + key_sa_stir_iv(sav); + return 0; } diff --git a/bsd/netinet6/esp_rijndael.h b/bsd/netinet6/esp_rijndael.h index e571f820f..9d60216a9 100644 --- a/bsd/netinet6/esp_rijndael.h +++ b/bsd/netinet6/esp_rijndael.h @@ -31,12 +31,13 @@ */ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE -int esp_rijndael_schedlen __P((const struct esp_algorithm *)); -int esp_rijndael_schedule __P((const struct esp_algorithm *, - struct secasvar *)); -int esp_rijndael_blockdecrypt __P((const struct esp_algorithm *, - struct secasvar *, u_int8_t *, u_int8_t *)); -int esp_rijndael_blockencrypt __P((const struct esp_algorithm *, - struct secasvar *, u_int8_t *, u_int8_t *)); -#endif /* __APPLE_API_PRIVATE */ +#ifdef KERNEL_PRIVATE +int esp_aes_schedlen(const struct esp_algorithm *); +int esp_aes_schedule(const struct esp_algorithm *, struct secasvar *); +int esp_cbc_decrypt_aes(struct mbuf *, size_t, struct secasvar *, + const struct esp_algorithm *, int); +int esp_cbc_encrypt_aes(struct mbuf *, size_t, size_t, struct secasvar *, + const struct esp_algorithm *, int); + +#endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet6/frag6.c b/bsd/netinet6/frag6.c 
index f686d01cd..a495a0464 100644 --- a/bsd/netinet6/frag6.c +++ b/bsd/netinet6/frag6.c @@ -42,6 +42,7 @@ #include <sys/kernel.h> #include <sys/syslog.h> #include <kern/queue.h> +#include <kern/locks.h> #include <net/if.h> #include <net/route.h> @@ -61,21 +62,23 @@ */ #define IN6_IFSTAT_STRICT -static void frag6_enq __P((struct ip6asfrag *, struct ip6asfrag *)); -static void frag6_deq __P((struct ip6asfrag *)); -static void frag6_insque __P((struct ip6q *, struct ip6q *)); -static void frag6_remque __P((struct ip6q *)); -static void frag6_freef __P((struct ip6q *)); +static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *); +static void frag6_deq(struct ip6asfrag *); +static void frag6_insque(struct ip6q *, struct ip6q *); +static void frag6_remque(struct ip6q *); +static void frag6_freef(struct ip6q *); /* XXX we eventually need splreass6, or some real semaphore */ int frag6_doing_reass; u_int frag6_nfragpackets; +static u_int frag6_nfrags; struct ip6q ip6q; /* ip6 reassemble queue */ #ifndef __APPLE__ MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header"); #endif +extern lck_mtx_t *inet6_domain_mutex; /* * Initialise reassembly queue and fragment identifier. */ @@ -85,6 +88,7 @@ frag6_init() struct timeval tv; ip6_maxfragpackets = nmbclusters / 32; + ip6_maxfrags = nmbclusters / 4; /* * in many cases, random() here does NOT return random number @@ -126,6 +130,8 @@ */ /* * Fragment input + * NOTE: this function is called with the inet6_domain_mutex held from ip6_input. + * inet6_domain_mutex protects the frag6 queue manipulation. */ int frag6_input(mp, offp) @@ -148,7 +154,7 @@ frag6_input(mp, offp) ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, offset, sizeof(struct ip6_frag), IPPROTO_DONE); + IP6_EXTHDR_CHECK(m, offset, sizeof(struct ip6_frag), return IPPROTO_DONE); ip6f = (struct ip6_frag *)((caddr_t)ip6 + offset); #else IP6_EXTHDR_GET(ip6f, struct ip6_frag *, m, offset, sizeof(*ip6f)); @@ -211,6 +217,16 @@ frag6_input(mp, offp) frag6_doing_reass = 1; + /* + * Enforce upper bound on number of fragments. + * If maxfrag is 0, never accept fragments. + * If maxfrag is -1, accept all fragments without limitation. + */ + if (ip6_maxfrags < 0) + ; + else if (frag6_nfrags >= (u_int)ip6_maxfrags) + goto dropfrag; + for (q6 = ip6q.ip6q_next; q6 != &ip6q; q6 = q6->ip6q_next) if (ip6f->ip6f_ident == q6->ip6q_ident && IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &q6->ip6q_src) && @@ -253,6 +269,8 @@ frag6_input(mp, offp) q6->ip6q_src = ip6->ip6_src; q6->ip6q_dst = ip6->ip6_dst; q6->ip6q_unfrglen = -1; /* The 1st fragment has not arrived. */ + + q6->ip6q_nfrag = 0; } /* @@ -431,6 +449,8 @@ insert: * the most recently active fragmented packet. 
*/ frag6_enq(ip6af, af6->ip6af_up); + frag6_nfrags++; + q6->ip6q_nfrag++; #if 0 /* xxx */ if (q6 != ip6q.ip6q_next) { frag6_remque(q6); @@ -493,6 +513,7 @@ insert: /* this comes with no copy if the boundary is on cluster */ if ((t = m_split(m, offset, M_DONTWAIT)) == NULL) { frag6_remque(q6); + frag6_nfrags -= q6->ip6q_nfrag; FREE(q6, M_FTABLE); frag6_nfragpackets--; goto dropfrag; @@ -510,6 +531,7 @@ insert: } frag6_remque(q6); + frag6_nfrags -= q6->ip6q_nfrag; FREE(q6, M_FTABLE); frag6_nfragpackets--; @@ -571,7 +593,6 @@ frag6_freef(q6) /* restore source and destination addresses */ ip6->ip6_src = q6->ip6q_src; ip6->ip6_dst = q6->ip6q_dst; - icmp6_error(m, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_REASSEMBLY, 0); } else @@ -580,6 +601,7 @@ frag6_freef(q6) } frag6_remque(q6); + frag6_nfrags -= q6->ip6q_nfrag; FREE(q6, M_FTABLE); frag6_nfragpackets--; } @@ -636,7 +658,7 @@ void frag6_slowtimo() { struct ip6q *q6; - int s = splnet(); + lck_mtx_lock(inet6_domain_mutex); frag6_doing_reass = 1; q6 = ip6q.ip6q_next; @@ -679,7 +701,7 @@ frag6_slowtimo() } #endif - splx(s); + lck_mtx_unlock(inet6_domain_mutex); } /* @@ -690,9 +712,11 @@ frag6_drain() { if (frag6_doing_reass) return; + lck_mtx_lock(inet6_domain_mutex); while (ip6q.ip6q_next != &ip6q) { ip6stat.ip6s_fragdropped++; /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ frag6_freef(ip6q.ip6q_next); } + lck_mtx_unlock(inet6_domain_mutex); } diff --git a/bsd/netinet6/icmp6.c b/bsd/netinet6/icmp6.c index 654dc8b15..042bdd76d 100644 --- a/bsd/netinet6/icmp6.c +++ b/bsd/netinet6/icmp6.c @@ -68,6 +68,7 @@ #include <sys/param.h> #include <sys/systm.h> +#include <sys/lock.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/protosw.h> @@ -109,35 +110,6 @@ extern int ipsec_bypass; #include <net/net_osdep.h> -#if HAVE_NRL_INPCB -/* inpcb members */ -#define in6pcb inpcb -#define in6p_laddr inp_laddr6 -#define in6p_faddr inp_faddr6 -#define in6p_icmp6filt inp_icmp6filt -#define in6p_route inp_route -#define in6p_socket inp_socket -#define in6p_flags inp_flags -#define in6p_moptions inp_moptions6 -#define in6p_outputopts inp_outputopts6 -#define in6p_ip6 inp_ipv6 -#define in6p_flowinfo inp_flowinfo -#define in6p_sp inp_sp -#define in6p_next inp_next -#define in6p_prev inp_prev -/* macro names */ -#define sotoin6pcb sotoinpcb -/* function names */ -#define in6_pcbdetach in_pcbdetach -#define in6_rtchange in_rtchange - -/* - * for KAME src sync over BSD*'s. XXX: FreeBSD (>=3) are VERY different from - others... 
- */ -#define in6p_ip6_nxt inp_ipv6.ip6_nxt -#endif - extern struct domain inet6domain; extern struct ip6protosw inet6sw[]; extern struct ip6protosw *ip6_protox[]; @@ -149,23 +121,26 @@ extern int icmp6errppslim; static int icmp6errpps_count = 0; static struct timeval icmp6errppslim_last; extern int icmp6_nodeinfo; - -static void icmp6_errcount __P((struct icmp6errstat *, int, int)); -static int icmp6_rip6_input __P((struct mbuf **, int)); -static int icmp6_ratelimit __P((const struct in6_addr *, const int, const int)); -static const char *icmp6_redirect_diag __P((struct in6_addr *, - struct in6_addr *, struct in6_addr *)); +extern struct inpcbinfo ripcbinfo; +extern lck_mtx_t *ip6_mutex; +extern lck_mtx_t *nd6_mutex; + +static void icmp6_errcount(struct icmp6errstat *, int, int); +static int icmp6_rip6_input(struct mbuf **, int); +static int icmp6_ratelimit(const struct in6_addr *, const int, const int); +static const char *icmp6_redirect_diag(struct in6_addr *, + struct in6_addr *, struct in6_addr *); #ifndef HAVE_PPSRATECHECK -static int ppsratecheck __P((struct timeval *, int *, int)); +static int ppsratecheck(struct timeval *, int *, int); #endif -static struct mbuf *ni6_input __P((struct mbuf *, int)); -static struct mbuf *ni6_nametodns __P((const char *, int, int)); -static int ni6_dnsmatch __P((const char *, int, const char *, int)); -static int ni6_addrs __P((struct icmp6_nodeinfo *, struct mbuf *, - struct ifnet **, char *)); -static int ni6_store_addrs __P((struct icmp6_nodeinfo *, struct icmp6_nodeinfo *, - struct ifnet *, int)); -static int icmp6_notify_error __P((struct mbuf *, int, int, int)); +static struct mbuf *ni6_input(struct mbuf *, int); +static struct mbuf *ni6_nametodns(const char *, int, int); +static int ni6_dnsmatch(const char *, int, const char *, int); +static int ni6_addrs(struct icmp6_nodeinfo *, + struct ifnet **, char *); +static int ni6_store_addrs(struct icmp6_nodeinfo *, struct icmp6_nodeinfo *, + struct ifnet *, int); +static int icmp6_notify_error(struct mbuf *, int, int, int); #ifdef COMPAT_RFC1885 static struct route_in6 icmp6_reflect_rt; @@ -252,6 +227,7 @@ icmp6_error(m, type, code, param) icmp6stat.icp6s_error++; + lck_mtx_assert(ip6_mutex, LCK_MTX_ASSERT_NOTOWNED); /* count per-type-code statistics */ icmp6_errcount(&icmp6stat.icp6s_outerrhist, type, code); @@ -263,7 +239,7 @@ icmp6_error(m, type, code, param) #endif #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); + IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), return); #else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); @@ -300,7 +276,7 @@ icmp6_error(m, type, code, param) struct icmp6_hdr *icp; #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, 0, off + sizeof(struct icmp6_hdr), ); + IP6_EXTHDR_CHECK(m, 0, off + sizeof(struct icmp6_hdr), return); icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icp, struct icmp6_hdr *, m, off, @@ -401,7 +377,7 @@ icmp6_input(mp, offp) int code, sum, noff; #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), IPPROTO_DONE); + IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), return IPPROTO_DONE); /* m might change if M_LOOP. 
So, call mtod after this */ #endif @@ -659,7 +635,7 @@ icmp6_input(mp, offp) if (mode == FQDN) { #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo), - IPPROTO_DONE); + return IPPROTO_DONE); #endif n = m_copy(m, 0, M_COPYALL); if (n) @@ -873,7 +849,7 @@ icmp6_notify_error(m, off, icmp6len, code) #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr), - -1); + return -1); icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, @@ -887,7 +863,7 @@ icmp6_notify_error(m, off, icmp6len, code) /* Detect the upper level protocol */ { - void (*ctlfunc) __P((int, struct sockaddr *, void *)); + void (*ctlfunc)(int, struct sockaddr *, void *); u_int8_t nxt = eip6->ip6_nxt; int eoff = off + sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr); @@ -909,7 +885,7 @@ icmp6_notify_error(m, off, icmp6len, code) #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_ext), - -1); + return -1); eh = (struct ip6_ext *)(mtod(m, caddr_t) + eoff); #else @@ -938,7 +914,7 @@ icmp6_notify_error(m, off, icmp6len, code) */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(*rth), - -1); + return -1); rth = (struct ip6_rthdr *)(mtod(m, caddr_t) + eoff); #else @@ -964,7 +940,7 @@ icmp6_notify_error(m, off, icmp6len, code) #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + rthlen, - -1); + return -1); rth0 = (struct ip6_rthdr0 *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth0, @@ -987,7 +963,7 @@ icmp6_notify_error(m, off, icmp6len, code) #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_frag), - -1); + return -1); fh = (struct ip6_frag *)(mtod(m, caddr_t) + eoff); #else @@ -1092,7 +1068,7 @@ icmp6_notify_error(m, off, icmp6len, code) icmp6_mtudisc_update(&ip6cp, 1); /*XXX*/ } - ctlfunc = (void (*) __P((int, struct sockaddr *, void *))) + ctlfunc = (void (*)(int, struct sockaddr *, void *)) (ip6_protox[nxt]->pr_ctlinput); if (ctlfunc) { (void) (*ctlfunc)(code, (struct sockaddr *)&icmp6dst, @@ -1209,11 +1185,15 @@ ni6_input(m, off) /* unicast/anycast, fine */ if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (icmp6_nodeinfo & 4) == 0) { + ifafree(&ia6->ia_ifa); + ia6 = NULL; nd6log((LOG_DEBUG, "ni6_input: ignore node info to " "a temporary address in %s:%d", __FILE__, __LINE__)); goto bad; } + ifafree(&ia6->ia_ifa); + ia6 = NULL; } else if (IN6_IS_ADDR_MC_LINKLOCAL(&sin6.sin6_addr)) ; /* link-local multicast, fine */ else @@ -1356,7 +1336,7 @@ ni6_input(m, off) replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); break; case NI_QTYPE_NODEADDR: - addrs = ni6_addrs(ni6, m, &ifp, subj); + addrs = ni6_addrs(ni6, &ifp, subj); if ((replylen += addrs * (sizeof(struct in6_addr) + sizeof(u_int32_t))) > MCLBYTES) replylen = MCLBYTES; /* XXX: will truncate pkt later */ @@ -1640,9 +1620,8 @@ ni6_dnsmatch(a, alen, b, blen) * calculate the number of addresses to be returned in the node info reply. 
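
The IP6_EXTHDR_CHECK conversions above change the macro's last argument from a bare value to a complete statement (return, return -1, return IPPROTO_DONE). That only works because the macro splices the argument in as code; a sketch of the assumed shape, for illustration only (the real definition lives in ip6_var.h and differs in detail):

    #define IP6_EXTHDR_CHECK(m, off, hlen, action)                     \
    do {                                                               \
            if ((m)->m_len < (off) + (hlen)) {                         \
                    /* header not contiguous in the mbuf: drop it */   \
                    m_freem(m);                                        \
                    action;        /* e.g. "return IPPROTO_DONE" */    \
            }                                                          \
    } while (0)
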
*/ static int -ni6_addrs(ni6, m, ifpp, subj) +ni6_addrs(ni6, ifpp, subj) struct icmp6_nodeinfo *ni6; - struct mbuf *m; struct ifnet **ifpp; char *subj; { @@ -1669,9 +1648,10 @@ ni6_addrs(ni6, m, ifpp, subj) } } - for (ifp = TAILQ_FIRST(&ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) - { + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_list) { addrsofif = 0; + ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family != AF_INET6) @@ -1724,13 +1704,16 @@ ni6_addrs(ni6, m, ifpp, subj) } addrsofif++; /* count the address */ } + ifnet_lock_done(ifp); if (iffound) { *ifpp = ifp; + ifnet_head_done(); return(addrsofif); } addrs += addrsofif; } + ifnet_head_done(); return(addrs); } @@ -1741,7 +1724,7 @@ ni6_store_addrs(ni6, nni6, ifp0, resid) struct ifnet *ifp0; int resid; { - struct ifnet *ifp = ifp0 ? ifp0 : TAILQ_FIRST(&ifnet); + struct ifnet *ifp = ifp0; struct in6_ifaddr *ifa6; struct ifaddr *ifa; struct ifnet *ifp_dep = NULL; @@ -1749,14 +1732,20 @@ ni6_store_addrs(ni6, nni6, ifp0, resid) u_char *cp = (u_char *)(nni6 + 1); int niflags = ni6->ni_flags; u_int32_t ltime; + struct timeval timenow; + + getmicrotime(&timenow); if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL)) return(0); /* needless to copy */ again: - for (; ifp; ifp = TAILQ_NEXT(ifp, if_list)) - { + ifnet_head_lock_shared(); + if (ifp == NULL) ifp = TAILQ_FIRST(&ifnet_head); + + for (; ifp; ifp = TAILQ_NEXT(ifp, if_list)) { + ifnet_lock_shared(ifp); for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) { @@ -1820,6 +1809,8 @@ ni6_store_addrs(ni6, nni6, ifp0, resid) */ nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE; + ifnet_lock_done(ifp); + ifnet_head_done(); return(copied); } @@ -1842,8 +1833,8 @@ ni6_store_addrs(ni6, nni6, ifp0, resid) ltime = ND6_INFINITE_LIFETIME; else { if (ifa6->ia6_lifetime.ia6t_expire > - time_second) - ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_second); + timenow.tv_sec) + ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - timenow.tv_sec); else ltime = 0; } @@ -1863,9 +1854,11 @@ ni6_store_addrs(ni6, nni6, ifp0, resid) copied += (sizeof(struct in6_addr) + sizeof(u_int32_t)); } + ifnet_lock_done(ifp); if (ifp0) /* we need search only on the specified IF */ break; } + ifnet_head_done(); if (allow_deprecated == 0 && ifp_dep != NULL) { ifp = ifp_dep; @@ -1910,6 +1903,7 @@ icmp6_rip6_input(mp, off) /* KAME hack: recover scopeid */ (void)in6_recoverscope(&rip6src, &ip6->ip6_src, m->m_pkthdr.rcvif); + lck_rw_lock_shared(ripcbinfo.mtx); LIST_FOREACH(in6p, &ripcb, inp_list) { if ((in6p->inp_vflag & INP_IPV6) == 0) @@ -1939,31 +1933,24 @@ icmp6_rip6_input(mp, off) m_adj(n, off); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&rip6src, - n, opts) == 0) { - /* should notify about lost packet */ - m_freem(n); - if (opts) { - m_freem(opts); - } - } else + n, opts, NULL) != 0) { sorwakeup(last->in6p_socket); + } opts = NULL; } } last = in6p; } + lck_rw_done(ripcbinfo.mtx); if (last) { if (last->in6p_flags & IN6P_CONTROLOPTS) ip6_savecontrol(last, &opts, ip6, m); /* strip intermediate headers */ m_adj(m, off); if (sbappendaddr(&last->in6p_socket->so_rcv, - (struct sockaddr *)&rip6src, m, opts) == 0) { - m_freem(m); - if (opts) - m_freem(opts); - } else + (struct sockaddr *)&rip6src, m, opts, NULL) != 0) { sorwakeup(last->in6p_socket); + } } else { m_freem(m); ip6stat.ip6s_delivered--; @@ -1983,7 +1970,7 @@ icmp6_reflect(m, off) struct ip6_hdr *ip6; struct icmp6_hdr *icmp6; struct in6_ifaddr *ia; - struct in6_addr t, 
*src = 0; + struct in6_addr t, src_storage, *src = 0; int plen; int type, code; struct ifnet *outif = NULL; @@ -2105,12 +2092,14 @@ icmp6_reflect(m, off) * (for example) when we encounter an error while forwarding procedure * destined to a duplicated address of ours. */ - for (ia = in6_ifaddr; ia; ia = ia->ia_next) + lck_mtx_lock(nd6_mutex); + for (ia = in6_ifaddrs; ia; ia = ia->ia_next) if (IN6_ARE_ADDR_EQUAL(&t, &ia->ia_addr.sin6_addr) && (ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY)) == 0) { src = &t; break; } + lck_mtx_unlock(nd6_mutex); if (ia == NULL && IN6_IS_ADDR_LINKLOCAL(&t) && (m->m_flags & M_LOOP)) { /* * This is the case if the dst is our link-local address @@ -2129,7 +2118,7 @@ icmp6_reflect(m, off) * source address of the erroneous packet. */ bzero(&ro, sizeof(ro)); - src = in6_selectsrc(&sa6_src, NULL, NULL, &ro, NULL, &e); + src = in6_selectsrc(&sa6_src, NULL, NULL, &ro, NULL, &src_storage, &e); if (ro.ro_rt) rtfree(ro.ro_rt); /* XXX: we could use this */ if (src == NULL) { @@ -2169,9 +2158,9 @@ icmp6_reflect(m, off) #endif /*IPSEC*/ #ifdef COMPAT_RFC1885 - ip6_output(m, NULL, &icmp6_reflect_rt, 0, NULL, &outif); + ip6_output(m, NULL, &icmp6_reflect_rt, 0, NULL, &outif, 0); #else - ip6_output(m, NULL, NULL, 0, NULL, &outif); + ip6_output(m, NULL, NULL, 0, NULL, &outif, 0); #endif if (outif) icmp6_ifoutstat_inc(outif, type, code); @@ -2233,7 +2222,7 @@ icmp6_redirect_input(m, off) goto freeit; #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, icmp6len,); + IP6_EXTHDR_CHECK(m, off, icmp6len, return); nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len); @@ -2410,7 +2399,7 @@ icmp6_redirect_output(m0, rt) struct rtentry *rt; { struct ifnet *ifp; /* my outgoing interface */ - struct in6_addr *ifp_ll6; + struct in6_addr ifp_ll6; struct in6_addr *router_ll6; struct ip6_hdr *sip6; /* m0 as struct ip6_hdr */ struct mbuf *m = NULL; /* newly allocated one */ @@ -2444,7 +2433,7 @@ icmp6_redirect_output(m0, rt) src_sa.sin6_addr = sip6->ip6_src; /* we don't currently use sin6_scope_id, but eventually use it */ src_sa.sin6_scope_id = in6_addr2scopeid(ifp, &sip6->ip6_src); - if (nd6_is_addr_neighbor(&src_sa, ifp) == 0) + if (nd6_is_addr_neighbor(&src_sa, ifp, 0) == 0) goto fail; if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst)) goto fail; /* what should we do here? */ @@ -2483,7 +2472,8 @@ icmp6_redirect_output(m0, rt) IN6_IFF_NOTREADY| IN6_IFF_ANYCAST)) == NULL) goto fail; - ifp_ll6 = &ia->ia_addr.sin6_addr; + ifp_ll6 = ia->ia_addr.sin6_addr; + ifafree(&ia->ia_ifa); } /* get ip6 linklocal address for the router. */ @@ -2505,7 +2495,7 @@ icmp6_redirect_output(m0, rt) ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; /* ip6->ip6_src must be linklocal addr for my outgoing if. 
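
Two hunks above share one discipline: never keep a pointer into a reference-counted object past the point where the reference is dropped. in6_selectsrc() now copies the chosen source into caller-provided storage (src_storage), and icmp6_redirect_output() copies the link-local address by value before calling ifafree(). The latter, condensed:

    struct in6_addr ifp_ll6;
    struct in6_ifaddr *ia;

    if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY |
        IN6_IFF_ANYCAST)) == NULL)
            goto fail;
    ifp_ll6 = ia->ia_addr.sin6_addr;    /* value copy while ia is pinned */
    ifafree(&ia->ia_ifa);               /* safe: no pointers into ia remain */
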
*/ - bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr)); + bcopy(&ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr)); bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr)); /* ND Redirect */ @@ -2545,7 +2535,7 @@ icmp6_redirect_output(m0, rt) struct nd_opt_hdr *nd_opt; char *lladdr; - rt_router = nd6_lookup(router_ll6, 0, ifp); + rt_router = nd6_lookup(router_ll6, 0, ifp, 0); if (!rt_router) goto nolladdropt; len = sizeof(*nd_opt) + ifp->if_addrlen; @@ -2676,7 +2666,7 @@ noredhdropt:; if (ipsec_bypass == 0) (void)ipsec_setsocket(m, NULL); #endif /*IPSEC*/ - ip6_output(m, NULL, NULL, 0, NULL, &outif); + ip6_output(m, NULL, NULL, 0, NULL, &outif, 0); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_redirect); diff --git a/bsd/netinet6/in6.c b/bsd/netinet6/in6.c index f3648e697..8887a091d 100644 --- a/bsd/netinet6/in6.c +++ b/bsd/netinet6/in6.c @@ -78,6 +78,7 @@ #include <sys/kernel.h> #include <sys/syslog.h> #include <sys/kern_event.h> +#include <kern/lock.h> #include <net/if.h> #include <net/if_types.h> @@ -130,13 +131,14 @@ const struct in6_addr in6mask128 = IN6MASK128; const struct sockaddr_in6 sa6_any = {sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0}; -static int in6_lifaddr_ioctl __P((struct socket *, u_long, caddr_t, - struct ifnet *, struct proc *)); -static int in6_ifinit __P((struct ifnet *, struct in6_ifaddr *, - struct sockaddr_in6 *, int)); -static void in6_unlink_ifa __P((struct in6_ifaddr *, struct ifnet *)); +static int in6_lifaddr_ioctl(struct socket *, u_long, caddr_t, + struct ifnet *, struct proc *); +static int in6_ifinit(struct ifnet *, struct in6_ifaddr *, + struct sockaddr_in6 *, int); +static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *, int); struct in6_multihead in6_multihead; /* XXX BSS initialization */ +extern struct lck_mtx_t *nd6_mutex; /* * Subroutine for in6_ifaddloop() and in6_ifremloop(). @@ -162,7 +164,7 @@ in6_ifloop_request(int cmd, struct ifaddr *ifa) * (probably implicitly) set nd6_rtrequest() to ifa->ifa_rtrequest, * which changes the outgoing interface to the loopback interface. */ - e = rtrequest(cmd, ifa->ifa_addr, ifa->ifa_addr, + e = rtrequest_locked(cmd, ifa->ifa_addr, ifa->ifa_addr, (struct sockaddr *)&all1_sa, RTF_UP|RTF_HOST|RTF_LLINFO, &nrt); if (e != 0) { @@ -196,7 +198,7 @@ in6_ifloop_request(int cmd, struct ifaddr *ifa) if (nrt->rt_refcnt <= 0) { /* XXX: we should free the entry ourselves. */ rtref(nrt); - rtfree(nrt); + rtfree_locked(nrt); } } else { /* the cmd must be RTM_ADD here */ @@ -217,13 +219,15 @@ in6_ifaddloop(struct ifaddr *ifa) { struct rtentry *rt; + lck_mtx_lock(rt_mtx); /* If there is no loopback entry, allocate one. */ - rt = rtalloc1(ifa->ifa_addr, 0, 0); + rt = rtalloc1_locked(ifa->ifa_addr, 0, 0); if (rt == NULL || (rt->rt_flags & RTF_HOST) == 0 || (rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) in6_ifloop_request(RTM_ADD, ifa); if (rt) rt->rt_refcnt--; + lck_mtx_unlock(rt_mtx); } /* @@ -231,7 +235,7 @@ in6_ifaddloop(struct ifaddr *ifa) * if it exists. */ static void -in6_ifremloop(struct ifaddr *ifa) +in6_ifremloop(struct ifaddr *ifa, int locked) { struct in6_ifaddr *ia; struct rtentry *rt; @@ -253,13 +257,17 @@ in6_ifremloop(struct ifaddr *ifa) * (probably p2p) interfaces. * XXX: we should avoid such a configuration in IPv6... 
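
in6_ifremloop() above gains an int locked parameter; its body (continuing just below) only takes nd6_mutex when the caller does not already hold it. The same idiom recurs throughout this patch. A sketch with a hypothetical helper:

    static int
    count_matching_ifaddrs(struct in6_addr *addr, int locked)   /* hypothetical */
    {
            struct in6_ifaddr *ia;
            int n = 0;

            if (!locked)
                    lck_mtx_lock(nd6_mutex);    /* in6_ifaddrs is nd6_mutex-protected */
            for (ia = in6_ifaddrs; ia; ia = ia->ia_next)
                    if (IN6_ARE_ADDR_EQUAL(addr, &ia->ia_addr.sin6_addr))
                            n++;
            if (!locked)
                    lck_mtx_unlock(nd6_mutex);
            return (n);
    }
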
*/ - for (ia = in6_ifaddr; ia; ia = ia->ia_next) { + if (!locked) + lck_mtx_lock(nd6_mutex); + for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { if (IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa), &ia->ia_addr.sin6_addr)) { ia_count++; if (ia_count > 1) break; } } + if (!locked) + lck_mtx_unlock(nd6_mutex); if (ia_count == 1) { /* @@ -270,12 +278,14 @@ in6_ifremloop(struct ifaddr *ifa) * a subnet-router anycast address on an interface attahced * to a shared medium. */ - rt = rtalloc1(ifa->ifa_addr, 0, 0); + lck_mtx_lock(rt_mtx); + rt = rtalloc1_locked(ifa->ifa_addr, 0, 0); if (rt != NULL && (rt->rt_flags & RTF_HOST) != 0 && (rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { rt->rt_refcnt--; in6_ifloop_request(RTM_DELETE, ifa); } + lck_mtx_unlock(rt_mtx); } } @@ -289,16 +299,23 @@ in6_ifindex2scopeid(idx) if (idx < 0 || if_index < idx) return -1; + + ifnet_head_lock_shared(); ifp = ifindex2ifnet[idx]; + ifnet_head_done(); + ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; - if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) + if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) { + ifnet_lock_done(ifp); return sin6->sin6_scope_id & 0xffff; + } } + ifnet_lock_done(ifp); return -1; } @@ -370,11 +387,14 @@ in6_control(so, cmd, data, ifp, p) struct in6_ifaddr *ia = NULL; struct in6_aliasreq *ifra = (struct in6_aliasreq *)data; int privileged, error = 0; - u_long dl_tag; + int index; + struct timeval timenow; + + getmicrotime(&timenow); privileged = 0; #ifdef __APPLE__ - if (p == NULL || !suser(p->p_ucred, &p->p_acflag)) + if (p == NULL || !proc_suser(p)) #else if (p == NULL || !suser(p)) #endif @@ -390,6 +410,15 @@ in6_control(so, cmd, data, ifp, p) return(EOPNOTSUPP); switch (cmd) { + case SIOCAUTOCONF_START: + case SIOCAUTOCONF_STOP: + case SIOCLL_START: + case SIOCLL_STOP: + case SIOCPROTOATTACH_IN6: + case SIOCPROTODETACH_IN6: + if (!privileged) + return(EPERM); + break; case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: @@ -449,28 +478,32 @@ in6_control(so, cmd, data, ifp, p) switch (cmd) { case SIOCAUTOCONF_START: + ifnet_lock_exclusive(ifp); ifp->if_eflags |= IFEF_ACCEPT_RTADVD; + ifnet_lock_done(ifp); return (0); case SIOCAUTOCONF_STOP: { - struct ifaddr *ifa, *nifa = NULL; - + struct in6_ifaddr *ia, *nia = NULL; + + ifnet_lock_exclusive(ifp); ifp->if_eflags &= ~IFEF_ACCEPT_RTADVD; + ifnet_lock_done(ifp); - /* nuke prefix list. this may try to remove some of ifaddrs as well */ + /* nuke prefix list. 
this may try to remove some ifaddrs as well */ in6_purgeprefix(ifp); /* removed autoconfigured address from interface */ - - for (ifa = TAILQ_FIRST(&ifp->if_addrlist); ifa != NULL; ifa = nifa) - { - nifa = TAILQ_NEXT(ifa, ifa_list); - if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != AF_INET6) + lck_mtx_lock(nd6_mutex); + for (ia = in6_ifaddrs; ia != NULL; ia = nia) { + nia = ia->ia_next; + if (ia->ia_ifa.ifa_ifp != ifp) continue; - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_AUTOCONF) - in6_purgeaddr(ifa); + if (ia->ia6_flags & IN6_IFF_AUTOCONF) + in6_purgeaddr(&ia->ia_ifa, 1); } + lck_mtx_unlock(nd6_mutex); return (0); } @@ -491,24 +524,25 @@ in6_control(so, cmd, data, ifp, p) case SIOCLL_STOP: { - struct ifaddr *ifa, *nifa = NULL; - + struct in6_ifaddr *ia, *nia = NULL; + /* removed link local addresses from interface */ - for (ifa = TAILQ_FIRST(&ifp->if_addrlist); ifa != NULL; ifa = nifa) - { - nifa = TAILQ_NEXT(ifa, ifa_list); - if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != AF_INET6) + lck_mtx_lock(nd6_mutex); + for (ia = in6_ifaddrs; ia != NULL; ia = nia) { + nia = ia->ia_next; + if (ia->ia_ifa.ifa_ifp != ifp) continue; - if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) - in6_purgeaddr(ifa); + if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) + in6_purgeaddr(&ia->ia_ifa, 1); } + lck_mtx_unlock(nd6_mutex); return (0); } case SIOCPROTOATTACH_IN6: - + switch (ifp->if_type) { #if IFT_BRIDGE /*OpenBSD 2.8*/ /* some of the interfaces are inherently not IPv6 capable */ @@ -517,7 +551,7 @@ in6_control(so, cmd, data, ifp, p) #endif default: - if (error = dlil_plumb_protocol(PF_INET6, ifp, &dl_tag)) + if (error = dlil_plumb_protocol(PF_INET6, ifp)) printf("SIOCPROTOATTACH_IN6: %s error=%d\n", if_name(ifp), error); break; @@ -573,7 +607,8 @@ in6_control(so, cmd, data, ifp, p) * and should be unused. */ /* we decided to obsolete this command (20000704) */ - return(EINVAL); + error = EINVAL; + goto ioctl_cleanup; case SIOCDIFADDR_IN6: /* @@ -583,8 +618,11 @@ in6_control(so, cmd, data, ifp, p) * address from the day one, we consider "remove the first one" * semantics to be not preferable. */ - if (ia == NULL) - return(EADDRNOTAVAIL); + if (ia == NULL) { + error = EADDRNOTAVAIL; + goto ioctl_cleanup; + } + /* FALLTHROUGH */ case SIOCAIFADDR_IN6: /* @@ -592,10 +630,14 @@ in6_control(so, cmd, data, ifp, p) * the corresponding operation. 
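
The error-path conversions that begin here (error = ...; goto ioctl_cleanup;) trade in6_control()'s many direct returns for one exit label, so teardown such as unlocking or reference drops has a single place to run. The shape, as an illustrative skeleton rather than kernel code:

    int
    example_ioctl(u_long cmd)
    {
            int error = 0;

            switch (cmd) {
            case 0:                         /* e.g. an obsoleted command */
                    error = EINVAL;
                    goto ioctl_cleanup;     /* was: return (EINVAL); */
            default:
                    break;
            }
    ioctl_cleanup:
            /* common teardown lives here exactly once */
            return (error);
    }
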
*/ if (ifra->ifra_addr.sin6_family != AF_INET6 || - ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) - return(EAFNOSUPPORT); - if (!privileged) - return(EPERM); + ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) { + error = EAFNOSUPPORT; + goto ioctl_cleanup; + } + if (!privileged) { + error = EPERM; + goto ioctl_cleanup; + } break; @@ -607,26 +649,34 @@ in6_control(so, cmd, data, ifp, p) case SIOCGIFDSTADDR_IN6: case SIOCGIFALIFETIME_IN6: /* must think again about its semantics */ - if (ia == NULL) - return(EADDRNOTAVAIL); + if (ia == NULL) { + error = EADDRNOTAVAIL; + goto ioctl_cleanup; + } break; case SIOCSIFALIFETIME_IN6: { struct in6_addrlifetime *lt; - if (!privileged) - return(EPERM); - if (ia == NULL) - return(EADDRNOTAVAIL); + if (!privileged) { + error = EPERM; + goto ioctl_cleanup; + } + if (ia == NULL) { + error = EADDRNOTAVAIL; + goto ioctl_cleanup; + } /* sanity for overflow - beware unsigned */ lt = &ifr->ifr_ifru.ifru_lifetime; if (lt->ia6t_vltime != ND6_INFINITE_LIFETIME - && lt->ia6t_vltime + time_second < time_second) { - return EINVAL; + && lt->ia6t_vltime + timenow.tv_sec < timenow.tv_sec) { + error = EINVAL; + goto ioctl_cleanup; } if (lt->ia6t_pltime != ND6_INFINITE_LIFETIME - && lt->ia6t_pltime + time_second < time_second) { - return EINVAL; + && lt->ia6t_pltime + timenow.tv_sec < timenow.tv_sec) { + error = EINVAL; + goto ioctl_cleanup; } break; } @@ -639,8 +689,10 @@ in6_control(so, cmd, data, ifp, p) break; case SIOCGIFDSTADDR_IN6: - if ((ifp->if_flags & IFF_POINTOPOINT) == 0) - return(EINVAL); + if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { + error = EINVAL; + goto ioctl_cleanup; + } /* * XXX: should we check if ifa_dstaddr is NULL and return * an error? @@ -657,28 +709,34 @@ in6_control(so, cmd, data, ifp, p) break; case SIOCGIFSTAT_IN6: - if (ifp == NULL) - return EINVAL; - if (in6_ifstat == NULL || ifp->if_index >= in6_ifstatmax - || in6_ifstat[ifp->if_index] == NULL) { + if (ifp == NULL) { + error = EINVAL; + goto ioctl_cleanup; + } + index = ifp->if_index; + if (in6_ifstat == NULL || index >= in6_ifstatmax + || in6_ifstat[index] == NULL) { /* return EAFNOSUPPORT? */ bzero(&ifr->ifr_ifru.ifru_stat, sizeof(ifr->ifr_ifru.ifru_stat)); } else - ifr->ifr_ifru.ifru_stat = *in6_ifstat[ifp->if_index]; + ifr->ifr_ifru.ifru_stat = *in6_ifstat[index]; break; case SIOCGIFSTAT_ICMP6: - if (ifp == NULL) - return EINVAL; - if (icmp6_ifstat == NULL || ifp->if_index >= icmp6_ifstatmax || - icmp6_ifstat[ifp->if_index] == NULL) { + if (ifp == NULL) { + error = EINVAL; + goto ioctl_cleanup; + } + index = ifp->if_index; + if (icmp6_ifstat == NULL || index >= icmp6_ifstatmax || + icmp6_ifstat[index] == NULL) { /* return EAFNOSUPPORT? 
*/ bzero(&ifr->ifr_ifru.ifru_stat, sizeof(ifr->ifr_ifru.ifru_icmp6stat)); } else ifr->ifr_ifru.ifru_icmp6stat = - *icmp6_ifstat[ifp->if_index]; + *icmp6_ifstat[index]; break; case SIOCGIFALIFETIME_IN6: @@ -690,38 +748,44 @@ in6_control(so, cmd, data, ifp, p) /* for sanity */ if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_expire = - time_second + ia->ia6_lifetime.ia6t_vltime; + timenow.tv_sec + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_preferred = - time_second + ia->ia6_lifetime.ia6t_pltime; + timenow.tv_sec + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; break; case SIOCAIFADDR_IN6: { - int i, error = 0; + int i; struct nd_prefix pr0, *pr; - if (dlil_find_dltag(ifp->if_family, ifp->if_unit, PF_INET6, &dl_tag) == EPROTONOSUPPORT) { - /* Address is added without previous IPv6 configurator support (gif, stf etc...) */ - if (error = dlil_plumb_protocol(PF_INET6, ifp, &dl_tag)) { + /* Attempt to attache the protocol, in case it isn't attached */ + error = dlil_plumb_protocol(PF_INET6, ifp); + if (error) { + if (error != EEXIST) { printf("SIOCAIFADDR_IN6: %s can't plumb protocol error=%d\n", if_name(ifp), error); - return (error); + goto ioctl_cleanup; } + + /* Ignore, EEXIST */ + error = 0; + } + else { + /* PF_INET6 wasn't previously attached */ in6_if_up(ifp, NULL); } - /* * first, make or update the interface address structure, * and link it to the list. */ if ((error = in6_update_ifa(ifp, ifra, ia)) != 0) - return(error); + goto ioctl_cleanup; /* * then, make the prefix on-link on the interface. @@ -767,11 +831,12 @@ in6_control(so, cmd, data, ifp, p) * interface route. */ if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) - return(error); + goto ioctl_cleanup; if (pr == NULL) { log(LOG_ERR, "nd6_prelist_add succedded but " "no prefix\n"); - return(EINVAL); /* XXX panic here? */ + error = EINVAL; + goto ioctl_cleanup; } } if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr)) @@ -808,8 +873,7 @@ in6_control(so, cmd, data, ifp, p) * addresses, that is, this address might make * other addresses detached. */ - pfxlist_onlink_check(); - in6_post_msg(ifp, KEV_INET6_NEW_USER_ADDR, ia); + pfxlist_onlink_check(0); } break; @@ -858,23 +922,22 @@ in6_control(so, cmd, data, ifp, p) } purgeaddr: - in6_purgeaddr(&ia->ia_ifa); + in6_purgeaddr(&ia->ia_ifa, 0); break; } default: #ifdef __APPLE__ - error = dlil_ioctl(PF_INET6, ifp, cmd, (caddr_t)data); - return error; - + error = dlil_ioctl(PF_INET6, ifp, cmd, (caddr_t)data); + goto ioctl_cleanup; #else if (ifp == NULL || ifp->if_ioctl == 0) return(EOPNOTSUPP); return((*ifp->if_ioctl)(ifp, cmd, data)); #endif } - - return(0); +ioctl_cleanup: + return error; } /* @@ -893,7 +956,10 @@ in6_update_ifa(ifp, ifra, ia) struct in6_ifaddr *oia; struct sockaddr_in6 dst6; struct in6_addrlifetime *lt; + struct timeval timenow; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); /* Validate parameters */ if (ifp == NULL || ifra == NULL) /* this maybe redundant */ return(EINVAL); @@ -912,6 +978,14 @@ in6_update_ifa(ifp, ifra, ia) */ if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6)) return(EINVAL); + /* + * Set the address family value for the mask if it was not set. + * Radar 3899482. 
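
The lifetime checks above replace the global time_second with a single getmicrotime() snapshot taken at function entry, and they read as wraparound guards: for this time arithmetic, now + lifetime < now can only hold if the addition overflowed. Condensed:

    struct timeval timenow;

    getmicrotime(&timenow);             /* one snapshot, reused below */
    if (lt->ia6t_vltime != ND6_INFINITE_LIFETIME &&
        lt->ia6t_vltime + timenow.tv_sec < timenow.tv_sec)
            return (EINVAL);            /* valid lifetime would wrap */
    if (lt->ia6t_pltime != ND6_INFINITE_LIFETIME &&
        lt->ia6t_pltime + timenow.tv_sec < timenow.tv_sec)
            return (EINVAL);            /* preferred lifetime would wrap */
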
+ */ + if (ifra->ifra_prefixmask.sin6_len == sizeof(struct sockaddr_in6) && + ifra->ifra_prefixmask.sin6_family == 0) { + ifra->ifra_prefixmask.sin6_family = AF_INET6; + } /* * Because the IPv6 address architecture is classless, we require * users to specify a (non 0) prefix length (mask) for a new address. @@ -985,9 +1059,11 @@ in6_update_ifa(ifp, ifra, ia) } } /* lifetime consistency check */ + + getmicrotime(&timenow); lt = &ifra->ifra_lifetime; if (lt->ia6t_vltime != ND6_INFINITE_LIFETIME - && lt->ia6t_vltime + time_second < time_second) { + && lt->ia6t_vltime + timenow.tv_sec < timenow.tv_sec) { return EINVAL; } if (lt->ia6t_vltime == 0) { @@ -1000,7 +1076,7 @@ in6_update_ifa(ifp, ifra, ia) ip6_sprintf(&ifra->ifra_addr.sin6_addr)); } if (lt->ia6t_pltime != ND6_INFINITE_LIFETIME - && lt->ia6t_pltime + time_second < time_second) { + && lt->ia6t_pltime + timenow.tv_sec < timenow.tv_sec) { return EINVAL; } @@ -1018,7 +1094,7 @@ in6_update_ifa(ifp, ifra, ia) ia = (struct in6_ifaddr *) _MALLOC(sizeof(*ia), M_IFADDR, M_NOWAIT); if (ia == NULL) - return (ENOBUFS); + return ENOBUFS; bzero((caddr_t)ia, sizeof(*ia)); /* Initialize the address and masks */ ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; @@ -1038,15 +1114,18 @@ in6_update_ifa(ifp, ifra, ia) = (struct sockaddr *)&ia->ia_prefixmask; ia->ia_ifp = ifp; - if ((oia = in6_ifaddr) != NULL) { + lck_mtx_lock(nd6_mutex); + if ((oia = in6_ifaddrs) != NULL) { for ( ; oia->ia_next; oia = oia->ia_next) continue; oia->ia_next = ia; } else - in6_ifaddr = ia; + in6_ifaddrs = ia; + lck_mtx_unlock(nd6_mutex); - TAILQ_INSERT_TAIL(&ifp->if_addrlist, &ia->ia_ifa, - ifa_list); + ifnet_lock_exclusive(ifp); + if_attach_ifa(ifp, &ia->ia_ifa); + ifnet_lock_done(ifp); } /* set prefix mask */ @@ -1108,7 +1187,7 @@ in6_update_ifa(ifp, ifra, ia) iilen = (sizeof(ia->ia_prefixmask.sin6_addr) << 3) - plen; if ((error = in6_prefix_add_ifid(iilen, ia)) != 0) { - in6_purgeaddr((struct ifaddr *)ia); + in6_purgeaddr((struct ifaddr *)ia, 0); return(error); } } @@ -1131,14 +1210,14 @@ in6_update_ifa(ifp, ifra, ia) llsol.s6_addr32[3] = ifra->ifra_addr.sin6_addr.s6_addr32[3]; llsol.s6_addr8[12] = 0xff; - (void)in6_addmulti(&llsol, ifp, &error); + (void)in6_addmulti(&llsol, ifp, &error, 0); if (error != 0) { log(LOG_WARNING, "in6_update_ifa: addmulti failed for " "%s on %s (errno=%d)\n", ip6_sprintf(&llsol), if_name(ifp), error); - in6_purgeaddr((struct ifaddr *)ia); + in6_purgeaddr((struct ifaddr *)ia, 0); return(error); } } @@ -1157,7 +1236,9 @@ in6_update_ifa(ifp, ifra, ia) mltaddr.sin6_addr = in6addr_linklocal_allnodes; mltaddr.sin6_addr.s6_addr16[1] = htons(ifp->if_index); + ifnet_lock_shared(ifp); IN6_LOOKUP_MULTI(mltaddr.sin6_addr, ifp, in6m); + ifnet_lock_done(ifp); if (in6m == NULL) { rtrequest(RTM_ADD, (struct sockaddr *)&mltaddr, @@ -1165,7 +1246,7 @@ in6_update_ifa(ifp, ifra, ia) (struct sockaddr *)&mltmask, RTF_UP|RTF_CLONING, /* xxx */ (struct rtentry **)0); - (void)in6_addmulti(&mltaddr.sin6_addr, ifp, &error); + (void)in6_addmulti(&mltaddr.sin6_addr, ifp, &error, 0); if (error != 0) { log(LOG_WARNING, "in6_update_ifa: addmulti failed for " @@ -1181,10 +1262,12 @@ in6_update_ifa(ifp, ifra, ia) #define hostnamelen strlen(hostname) if (in6_nigroup(ifp, hostname, hostnamelen, &mltaddr.sin6_addr) == 0) { + ifnet_lock_shared(ifp); IN6_LOOKUP_MULTI(mltaddr.sin6_addr, ifp, in6m); + ifnet_lock_done(ifp); if (in6m == NULL && ia != NULL) { (void)in6_addmulti(&mltaddr.sin6_addr, - ifp, &error); + ifp, &error, 0); if (error != 0) { log(LOG_WARNING, "in6_update_ifa: 
" "addmulti failed for " @@ -1210,7 +1293,9 @@ in6_update_ifa(ifp, ifra, ia) mltaddr.sin6_addr = in6addr_nodelocal_allnodes; + ifnet_lock_shared(ifp); IN6_LOOKUP_MULTI(mltaddr.sin6_addr, ifp, in6m); + ifnet_lock_done(ifp); if (in6m == NULL && ia_loop != NULL) { rtrequest(RTM_ADD, (struct sockaddr *)&mltaddr, @@ -1219,7 +1304,7 @@ in6_update_ifa(ifp, ifra, ia) RTF_UP, (struct rtentry **)0); (void)in6_addmulti(&mltaddr.sin6_addr, ifp, - &error); + &error, 0); if (error != 0) { log(LOG_WARNING, "in6_update_ifa: " "addmulti failed for %s on %s " @@ -1239,12 +1324,12 @@ in6_update_ifa(ifp, ifra, ia) /* for sanity */ if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_expire = - time_second + ia->ia6_lifetime.ia6t_vltime; + timenow.tv_sec + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_preferred = - time_second + ia->ia6_lifetime.ia6t_pltime; + timenow.tv_sec + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; @@ -1273,13 +1358,13 @@ in6_update_ifa(ifp, ifra, ia) * anyway. */ if (hostIsNew) - in6_unlink_ifa(ia, ifp); + in6_unlink_ifa(ia, ifp, 0); return(error); } void -in6_purgeaddr(ifa) - struct ifaddr *ifa; +in6_purgeaddr( + struct ifaddr *ifa, int nd6_locked) { struct ifnet *ifp = ifa->ifa_ifp; struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa; @@ -1308,7 +1393,7 @@ in6_purgeaddr(ifa) } /* Remove ownaddr's loopback rtentry, if it exists. */ - in6_ifremloop(&(ia->ia_ifa)); + in6_ifremloop(&(ia->ia_ifa), nd6_locked); if (ifp->if_flags & IFF_MULTICAST) { /* @@ -1325,29 +1410,35 @@ in6_purgeaddr(ifa) ia->ia_addr.sin6_addr.s6_addr32[3]; llsol.s6_addr8[12] = 0xff; + ifnet_lock_shared(ifp); IN6_LOOKUP_MULTI(llsol, ifp, in6m); + ifnet_lock_done(ifp); if (in6m) - in6_delmulti(in6m); + in6_delmulti(in6m, nd6_locked); } + in6_unlink_ifa(ia, ifp, nd6_locked); in6_post_msg(ifp, KEV_INET6_ADDR_DELETED, ia); - in6_unlink_ifa(ia, ifp); } static void -in6_unlink_ifa(ia, ifp) +in6_unlink_ifa(ia, ifp, nd6_locked) struct in6_ifaddr *ia; struct ifnet *ifp; + int nd6_locked; { int plen, iilen; struct in6_ifaddr *oia; - int s = splnet(); - TAILQ_REMOVE(&ifp->if_addrlist, &ia->ia_ifa, ifa_list); + ifnet_lock_exclusive(ifp); + if_detach_ifa(ifp, &ia->ia_ifa); + ifnet_lock_done(ifp); + if (!nd6_locked) + lck_mtx_lock(nd6_mutex); oia = ia; - if (oia == (ia = in6_ifaddr)) - in6_ifaddr = ia->ia_next; + if (oia == (ia = in6_ifaddrs)) + in6_ifaddrs = ia->ia_next; else { while (ia->ia_next && (ia->ia_next != oia)) ia = ia->ia_next; @@ -1358,7 +1449,6 @@ in6_unlink_ifa(ia, ifp) printf("Couldn't unlink in6_ifaddr from in6_ifaddr\n"); } } - if (oia->ia6_ifpr) { /* check for safety */ plen = in6_mask2len(&oia->ia_prefixmask.sin6_addr, NULL); iilen = (sizeof(oia->ia_prefixmask.sin6_addr) << 3) - plen; @@ -1381,36 +1471,38 @@ in6_unlink_ifa(ia, ifp) oia->ia6_ndpr = NULL; } - pfxlist_onlink_check(); + pfxlist_onlink_check(1); } + if (!nd6_locked) + lck_mtx_unlock(nd6_mutex); + /* - * release another refcnt for the link from in6_ifaddr. + * release another refcnt for the link from in6_ifaddrs. * Note that we should decrement the refcnt at least once for all *BSD. 
*/ ifafree(&oia->ia_ifa); - splx(s); } void in6_purgeif(ifp) struct ifnet *ifp; { - struct ifaddr *ifa, *nifa = NULL; + struct in6_ifaddr *ia, *nia = NULL; if (ifp == NULL || &ifp->if_addrlist == NULL) return; - for (ifa = TAILQ_FIRST(&ifp->if_addrlist); ifa != NULL; ifa = nifa) + lck_mtx_lock(nd6_mutex); + for (ia = in6_ifaddrs; ia != NULL; ia = nia) { - nifa = TAILQ_NEXT(ifa, ifa_list); - if (ifa->ifa_addr == NULL) + nia = ia->ia_next; + if (ia->ia_ifa.ifa_ifp != ifp) continue; - if (ifa->ifa_addr->sa_family != AF_INET6) - continue; - in6_purgeaddr(ifa); + in6_purgeaddr(&ia->ia_ifa, 1); } + lck_mtx_unlock(nd6_mutex); in6_ifdetach(ifp); } @@ -1492,8 +1584,9 @@ in6_lifaddr_ioctl(so, cmd, data, ifp, p) case SIOCALIFADDR: { struct in6_aliasreq ifra; - struct in6_addr *hostid = NULL; + struct in6_addr hostid; int prefixlen; + int hostid_found = 0; if ((iflr->flags & IFLR_PREFIX) != 0) { struct sockaddr_in6 *sin6; @@ -1506,7 +1599,8 @@ in6_lifaddr_ioctl(so, cmd, data, ifp, p) ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, 0); if (!ifa) return EADDRNOTAVAIL; - hostid = IFA_IN6(ifa); + hostid = *IFA_IN6(ifa); + hostid_found = 1; /* prefixlen must be <= 64. */ if (64 < iflr->prefixlen) @@ -1529,22 +1623,22 @@ in6_lifaddr_ioctl(so, cmd, data, ifp, p) bcopy(&iflr->addr, &ifra.ifra_addr, ((struct sockaddr *)&iflr->addr)->sa_len); - if (hostid) { + if (hostid_found) { /* fill in hostid part */ ifra.ifra_addr.sin6_addr.s6_addr32[2] = - hostid->s6_addr32[2]; + hostid.s6_addr32[2]; ifra.ifra_addr.sin6_addr.s6_addr32[3] = - hostid->s6_addr32[3]; + hostid.s6_addr32[3]; } if (((struct sockaddr *)&iflr->dstaddr)->sa_family) { /*XXX*/ bcopy(&iflr->dstaddr, &ifra.ifra_dstaddr, ((struct sockaddr *)&iflr->dstaddr)->sa_len); - if (hostid) { + if (hostid_found) { ifra.ifra_dstaddr.sin6_addr.s6_addr32[2] = - hostid->s6_addr32[2]; + hostid.s6_addr32[2]; ifra.ifra_dstaddr.sin6_addr.s6_addr32[3] = - hostid->s6_addr32[3]; + hostid.s6_addr32[3]; } } @@ -1593,6 +1687,7 @@ in6_lifaddr_ioctl(so, cmd, data, ifp, p) } } + ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family != AF_INET6) @@ -1617,6 +1712,7 @@ in6_lifaddr_ioctl(so, cmd, data, ifp, p) if (IN6_ARE_ADDR_EQUAL(&candidate, &match)) break; } + ifnet_lock_done(ifp); if (!ifa) return EADDRNOTAVAIL; ia = ifa2ia6(ifa); @@ -1700,7 +1796,6 @@ in6_ifinit(ifp, ia, sin6, newhost) int newhost; { int error = 0, plen, ifacount = 0; - int s = splimp(); struct ifaddr *ifa; /* @@ -1708,6 +1803,7 @@ in6_ifinit(ifp, ia, sin6, newhost) * if this is its first address, * and to validate the address if necessary. */ + ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr == NULL) @@ -1716,25 +1812,17 @@ in6_ifinit(ifp, ia, sin6, newhost) continue; ifacount++; } + ifnet_lock_done(ifp); ia->ia_addr = *sin6; if (ifacount <= 1 && -#ifdef __APPLE__ (error = dlil_ioctl(PF_INET6, ifp, SIOCSIFADDR, (caddr_t)ia))) { if (error) { - splx(s); return(error); } } -#else - ifp->if_ioctl && (error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia))) { - splx(s); - return(error); - } -#endif - splx(s); ia->ia_ifa.ifa_metric = ifp->if_metric; @@ -1775,15 +1863,15 @@ in6_ifinit(ifp, ia, sin6, newhost) * given interface. 
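
in6_lifaddr_ioctl() above applies the same value-copy discipline to the link-local host ID: the pointer hostid becomes a struct copy plus a hostid_found flag, so nothing dangles once the source ifaddr can go away. Condensed:

    struct in6_addr hostid;
    int hostid_found = 0;
    struct ifaddr *ifa;

    if ((ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, 0)) != NULL) {
            hostid = *IFA_IN6(ifa);     /* copy the 128-bit address out */
            hostid_found = 1;
    }
    if (hostid_found) {                 /* fill in the interface-ID half */
            ifra.ifra_addr.sin6_addr.s6_addr32[2] = hostid.s6_addr32[2];
            ifra.ifra_addr.sin6_addr.s6_addr32[3] = hostid.s6_addr32[3];
    }
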
*/ struct in6_multi * -in6_addmulti(maddr6, ifp, errorp) +in6_addmulti(maddr6, ifp, errorp, nd6_locked) struct in6_addr *maddr6; struct ifnet *ifp; int *errorp; + int nd6_locked; { struct in6_multi *in6m; struct sockaddr_in6 sin6; struct ifmultiaddr *ifma; - int s = splnet(); *errorp = 0; @@ -1798,7 +1886,6 @@ in6_addmulti(maddr6, ifp, errorp) sin6.sin6_addr = *maddr6; *errorp = if_addmulti(ifp, (struct sockaddr *)&sin6, &ifma); if (*errorp) { - splx(s); return 0; } @@ -1813,7 +1900,6 @@ in6_addmulti(maddr6, ifp, errorp) at interrupt time? If so, need to fix if_addmulti. XXX */ in6m = (struct in6_multi *)_MALLOC(sizeof(*in6m), M_IPMADDR, M_NOWAIT); if (in6m == NULL) { - splx(s); return (NULL); } @@ -1822,14 +1908,17 @@ in6_addmulti(maddr6, ifp, errorp) in6m->in6m_ifp = ifp; in6m->in6m_ifma = ifma; ifma->ifma_protospec = in6m; + if (nd6_locked == 0) + lck_mtx_lock(nd6_mutex); LIST_INSERT_HEAD(&in6_multihead, in6m, in6m_entry); + if (nd6_locked == 0) + lck_mtx_unlock(nd6_mutex); /* * Let MLD6 know that we have joined a new IP6 multicast * group. */ mld6_start_listening(in6m); - splx(s); return(in6m); } @@ -1837,26 +1926,30 @@ in6_addmulti(maddr6, ifp, errorp) * Delete a multicast address record. */ void -in6_delmulti(in6m) - struct in6_multi *in6m; +in6_delmulti( + struct in6_multi *in6m, int nd6locked) { struct ifmultiaddr *ifma = in6m->in6m_ifma; - int s = splnet(); - if (ifma && ifma->ifma_refcount == 1) { + if (ifma && ifma->ifma_usecount == 1) { /* * No remaining claims to this record; let MLD6 know * that we are leaving the multicast group. */ mld6_stop_listening(in6m); ifma->ifma_protospec = 0; + if (nd6locked == 0) + lck_mtx_lock(nd6_mutex); LIST_REMOVE(in6m, in6m_entry); + if (nd6locked == 0) + lck_mtx_unlock(nd6_mutex); FREE(in6m, M_IPMADDR); } /* XXX - should be separate API for when we have an ifma? */ - if (ifma) - if_delmultiaddr(ifma); - splx(s); + if (ifma) { + if_delmultiaddr(ifma, 0); + ifma_release(ifma); + } } /* @@ -1869,6 +1962,7 @@ in6ifa_ifpforlinklocal(ifp, ignoreflags) { struct ifaddr *ifa; + ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr == NULL) @@ -1882,11 +1976,11 @@ in6ifa_ifpforlinklocal(ifp, ignoreflags) break; } } + ifnet_lock_done(ifp); return((struct in6_ifaddr *)ifa); } - /* * find the internet address corresponding to a given interface and address. 
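
Releasing a multicast record above is now two explicit steps, matching the reference-counted ifmultiaddr: unlink it from the interface, then drop the caller's reference. The second argument to if_delmultiaddr() is read here as a locked flag, by analogy with the locked parameters added elsewhere in this patch; treat that reading as an assumption.

    if (ifma != NULL) {
            if_delmultiaddr(ifma, 0);   /* unlink; 0 = lock not held (assumed) */
            ifma_release(ifma);         /* drop this caller's reference */
    }
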
*/ @@ -1897,6 +1991,7 @@ in6ifa_ifpwithaddr(ifp, addr) { struct ifaddr *ifa; + ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr == NULL) @@ -1906,6 +2001,7 @@ in6ifa_ifpwithaddr(ifp, addr) if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) break; } + ifnet_lock_done(ifp); return((struct in6_ifaddr *)ifa); } @@ -1973,11 +2069,15 @@ in6_localaddr(in6) if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6)) return 1; - for (ia = in6_ifaddr; ia; ia = ia->ia_next) + lck_mtx_lock(nd6_mutex); + for (ia = in6_ifaddrs; ia; ia = ia->ia_next) if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr, - &ia->ia_prefixmask.sin6_addr)) + &ia->ia_prefixmask.sin6_addr)) { + lck_mtx_unlock(nd6_mutex); return 1; + } + lck_mtx_unlock(nd6_mutex); return (0); } @@ -1987,18 +2087,22 @@ in6_is_addr_deprecated(sa6) { struct in6_ifaddr *ia; - for (ia = in6_ifaddr; ia; ia = ia->ia_next) { + lck_mtx_lock(nd6_mutex); + for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { if (IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &sa6->sin6_addr) && #if SCOPEDROUTING ia->ia_addr.sin6_scope_id == sa6->sin6_scope_id && #endif - (ia->ia6_flags & IN6_IFF_DEPRECATED) != 0) + (ia->ia6_flags & IN6_IFF_DEPRECATED) != 0) { + lck_mtx_unlock(nd6_mutex); return(1); /* true */ + } /* XXX: do we still have to go thru the rest of the list? */ } + lck_mtx_unlock(nd6_mutex); return(0); /* false */ } @@ -2046,7 +2150,8 @@ in6_are_prefix_equal(p1, p2, len) if (bcmp(&p1->s6_addr, &p2->s6_addr, bytelen)) return(0); - if (p1->s6_addr[bytelen] >> (8 - bitlen) != + if (bitlen != 0 && + p1->s6_addr[bytelen] >> (8 - bitlen) != p2->s6_addr[bytelen] >> (8 - bitlen)) return(0); @@ -2081,9 +2186,9 @@ in6_prefixlen2mask(maskp, len) * return the best address out of the same scope */ struct in6_ifaddr * -in6_ifawithscope(oifp, dst) - struct ifnet *oifp; - struct in6_addr *dst; +in6_ifawithscope( + struct ifnet *oifp, + struct in6_addr *dst) { int dst_scope = in6_addrscope(dst), src_scope, best_scope = 0; int blen = -1; @@ -2103,8 +2208,8 @@ in6_ifawithscope(oifp, dst) * Comparing an interface with the outgoing interface will be done * only at the final stage of tiebreaking. */ - for (ifp = TAILQ_FIRST(&ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) - { + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_list) { /* * We can never take an address that breaks the scope zone * of the destination. @@ -2112,6 +2217,7 @@ in6_ifawithscope(oifp, dst) if (in6_addr2scopeid(ifp, dst) != in6_addr2scopeid(oifp, dst)) continue; + ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { int tlen = -1, dscopecmp, bscopecmp, matchcmp; @@ -2328,12 +2434,17 @@ in6_ifawithscope(oifp, dst) goto replace; /* (9) */ replace: + ifaref(ifa); + if (ifa_best) + ifafree(&ifa_best->ia_ifa); ifa_best = (struct in6_ifaddr *)ifa; blen = tlen >= 0 ? tlen : in6_matchlen(IFA_IN6(ifa), dst); best_scope = in6_addrscope(&ifa_best->ia_addr.sin6_addr); } + ifnet_lock_done(ifp); } + ifnet_head_done(); /* count statistics for future improvements */ if (ifa_best == NULL) @@ -2361,9 +2472,9 @@ in6_ifawithscope(oifp, dst) * found, return the first valid address from designated IF. */ struct in6_ifaddr * -in6_ifawithifp(ifp, dst) - struct ifnet *ifp; - struct in6_addr *dst; +in6_ifawithifp( + struct ifnet *ifp, + struct in6_addr *dst) { int dst_scope = in6_addrscope(dst), blen = -1, tlen; struct ifaddr *ifa; @@ -2378,6 +2489,7 @@ in6_ifawithifp(ifp, dst) * If two or more, return one which matches the dst longest. 
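
Two fixes above deserve a second look. in6_ifawithscope() now pins each new best candidate with ifaref() before releasing the previous one, so the winner cannot be freed mid-scan. And in6_are_prefix_equal() skips the trailing partial-byte compare when bitlen is 0: previously a byte-aligned prefix length (notably 128) read s6_addr[bytelen], one past the address, and shifted by 8. The fixed prefix check, condensed:

    bytelen = len / 8;                  /* whole bytes, compared with bcmp */
    bitlen = len % 8;                   /* leftover bits, may be 0 */
    if (bcmp(&p1->s6_addr, &p2->s6_addr, bytelen))
            return (0);
    if (bitlen != 0 &&                  /* a partial byte exists only if bitlen > 0 */
        p1->s6_addr[bytelen] >> (8 - bitlen) !=
        p2->s6_addr[bytelen] >> (8 - bitlen))
            return (0);
    return (1);
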
* If none, return one of global addresses assigned other ifs. */ + ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family != AF_INET6) @@ -2410,8 +2522,10 @@ in6_ifawithifp(ifp, dst) besta = (struct in6_ifaddr *)ifa; } } - if (besta) + if (besta) { + ifnet_lock_done(ifp); return(besta); + } TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { @@ -2428,9 +2542,11 @@ in6_ifawithifp(ifp, dst) dep[1] = (struct in6_ifaddr *)ifa; continue; } - + + ifnet_lock_done(ifp); return (struct in6_ifaddr *)ifa; } + ifnet_lock_done(ifp); /* use the last-resort values, that are, deprecated addresses */ if (dep[0]) @@ -2447,9 +2563,9 @@ extern int in6_init2done; * perform DAD when interface becomes IFF_UP. */ void -in6_if_up(ifp, ifra) - struct ifnet *ifp; - struct in6_aliasreq *ifra; +in6_if_up( + struct ifnet *ifp, + struct in6_aliasreq *ifra) { struct ifaddr *ifa; struct in6_ifaddr *ia; @@ -2464,6 +2580,7 @@ in6_if_up(ifp, ifra) in6_ifattach(ifp, NULL, ifra); dad_delay = 0; + ifnet_lock_exclusive(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family != AF_INET6) @@ -2472,11 +2589,12 @@ in6_if_up(ifp, ifra) if (ia->ia6_flags & IN6_IFF_TENTATIVE) nd6_dad_start(ifa, &dad_delay); } + ifnet_lock_done(ifp); } int -in6if_do_dad(ifp) - struct ifnet *ifp; +in6if_do_dad( + struct ifnet *ifp) { if ((ifp->if_flags & IFF_LOOPBACK) != 0) return(0); @@ -2521,12 +2639,13 @@ in6_setmaxmtu() unsigned long maxmtu = 0; struct ifnet *ifp; - for (ifp = TAILQ_FIRST(&ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) - { + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_list) { if ((ifp->if_flags & IFF_LOOPBACK) == 0 && nd_ifinfo[ifp->if_index].linkmtu > maxmtu) maxmtu = nd_ifinfo[ifp->if_index].linkmtu; } + ifnet_head_done(); if (maxmtu) /* update only when maxmtu is positive */ in6_maxmtu = maxmtu; } diff --git a/bsd/netinet6/in6.h b/bsd/netinet6/in6.h index d9df90b9b..47f18fed4 100644 --- a/bsd/netinet6/in6.h +++ b/bsd/netinet6/in6.h @@ -72,6 +72,13 @@ #ifndef _NETINET6_IN6_H_ #define _NETINET6_IN6_H_ #include <sys/appleapiopts.h> +#include <sys/_types.h> + + +#ifndef _SA_FAMILY_T +#define _SA_FAMILY_T +typedef __uint8_t sa_family_t; +#endif /* * Identification of the network protocol stack @@ -81,6 +88,7 @@ #define __KAME__ #define __KAME_VERSION "20010528/apple-darwin" +#ifndef _POSIX_C_SOURCE /* * Local port number conventions: * @@ -114,15 +122,16 @@ #define IPV6PORT_ANONMAX 65535 #define IPV6PORT_RESERVEDMIN 600 #define IPV6PORT_RESERVEDMAX (IPV6PORT_RESERVED-1) +#endif /* _POSIX_C_SOURCE */ /* * IPv6 address */ struct in6_addr { union { - u_int8_t __u6_addr8[16]; - u_int16_t __u6_addr16[8]; - u_int32_t __u6_addr32[4]; + __uint8_t __u6_addr8[16]; + __uint16_t __u6_addr16[8]; + __uint32_t __u6_addr32[4]; } __u6_addr; /* 128-bit IP6 address */ }; @@ -138,16 +147,16 @@ struct in6_addr { /* * Socket address for IPv6 */ -#if !defined(_XOPEN_SOURCE) +#ifndef _POSIX_C_SOURCE #define SIN6_LEN -#endif +#endif /* _POSIX_C_SOURCE */ struct sockaddr_in6 { - u_int8_t sin6_len; /* length of this struct(sa_family_t)*/ - u_int8_t sin6_family; /* AF_INET6 (sa_family_t) */ - u_int16_t sin6_port; /* Transport layer port # (in_port_t)*/ - u_int32_t sin6_flowinfo; /* IP6 flow information */ + __uint8_t sin6_len; /* length of this struct(sa_family_t)*/ + sa_family_t sin6_family; /* AF_INET6 (sa_family_t) */ + in_port_t sin6_port; /* Transport layer port # (in_port_t)*/ + __uint32_t sin6_flowinfo; /* IP6 flow information */ struct in6_addr sin6_addr; /* IP6 
address */ - u_int32_t sin6_scope_id; /* scope zone index */ + __uint32_t sin6_scope_id; /* scope zone index */ }; /* @@ -165,7 +174,7 @@ struct sockaddr_in6 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }}} #endif -#ifdef KERNEL +#ifdef KERNEL_PRIVATE extern const struct sockaddr_in6 sa6_any; extern const struct in6_addr in6mask0; @@ -173,7 +182,7 @@ extern const struct in6_addr in6mask32; extern const struct in6_addr in6mask64; extern const struct in6_addr in6mask96; extern const struct in6_addr in6mask128; -#endif /* KERNEL */ +#endif KERNEL_PRIVATE /* * Macros started with IPV6_ADDR is KAME local @@ -209,6 +218,7 @@ extern const struct in6_addr in6mask128; #define IN6ADDR_LOOPBACK_INIT \ {{{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }}} +#ifndef _POSIX_C_SOURCE #define IN6ADDR_NODELOCAL_ALLNODES_INIT \ {{{ 0xff, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }}} @@ -218,12 +228,15 @@ extern const struct in6_addr in6mask128; #define IN6ADDR_LINKLOCAL_ALLROUTERS_INIT \ {{{ 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 }}} +#endif /* _POSIX_C_SOURCE */ extern const struct in6_addr in6addr_any; extern const struct in6_addr in6addr_loopback; +#ifndef _POSIX_C_SOURCE extern const struct in6_addr in6addr_nodelocal_allnodes; extern const struct in6_addr in6addr_linklocal_allnodes; extern const struct in6_addr in6addr_linklocal_allrouters; +#endif /* _POSIX_C_SOURCE */ /* * Equality @@ -235,8 +248,10 @@ extern const struct in6_addr in6addr_linklocal_allrouters; #define IN6_ARE_ADDR_EQUAL(a, b) \ (bcmp(&(a)->s6_addr[0], &(b)->s6_addr[0], sizeof(struct in6_addr)) == 0) #else +#ifndef _POSIX_C_SOURCE #define IN6_ARE_ADDR_EQUAL(a, b) \ (memcmp(&(a)->s6_addr[0], &(b)->s6_addr[0], sizeof(struct in6_addr)) == 0) +#endif /* _POSIX_C_SOURCE */ #endif #ifdef KERNEL /* non standard */ @@ -251,37 +266,37 @@ extern const struct in6_addr in6addr_linklocal_allrouters; * Unspecified */ #define IN6_IS_ADDR_UNSPECIFIED(a) \ - ((*(const u_int32_t *)(const void *)(&(a)->s6_addr[0]) == 0) && \ - (*(const u_int32_t *)(const void *)(&(a)->s6_addr[4]) == 0) && \ - (*(const u_int32_t *)(const void *)(&(a)->s6_addr[8]) == 0) && \ - (*(const u_int32_t *)(const void *)(&(a)->s6_addr[12]) == 0)) + ((*(const __uint32_t *)(const void *)(&(a)->s6_addr[0]) == 0) && \ + (*(const __uint32_t *)(const void *)(&(a)->s6_addr[4]) == 0) && \ + (*(const __uint32_t *)(const void *)(&(a)->s6_addr[8]) == 0) && \ + (*(const __uint32_t *)(const void *)(&(a)->s6_addr[12]) == 0)) /* * Loopback */ #define IN6_IS_ADDR_LOOPBACK(a) \ - ((*(const u_int32_t *)(const void *)(&(a)->s6_addr[0]) == 0) && \ - (*(const u_int32_t *)(const void *)(&(a)->s6_addr[4]) == 0) && \ - (*(const u_int32_t *)(const void *)(&(a)->s6_addr[8]) == 0) && \ - (*(const u_int32_t *)(const void *)(&(a)->s6_addr[12]) == ntohl(1))) + ((*(const __uint32_t *)(const void *)(&(a)->s6_addr[0]) == 0) && \ + (*(const __uint32_t *)(const void *)(&(a)->s6_addr[4]) == 0) && \ + (*(const __uint32_t *)(const void *)(&(a)->s6_addr[8]) == 0) && \ + (*(const __uint32_t *)(const void *)(&(a)->s6_addr[12]) == ntohl(1))) /* * IPv4 compatible */ #define IN6_IS_ADDR_V4COMPAT(a) \ - ((*(const u_int32_t *)(const void *)(&(a)->s6_addr[0]) == 0) && \ - (*(const u_int32_t *)(const void *)(&(a)->s6_addr[4]) == 0) && \ - (*(const u_int32_t *)(const void *)(&(a)->s6_addr[8]) == 0) && \ - (*(const u_int32_t *)(const void *)(&(a)->s6_addr[12]) != 0) && \ - 
(*(const u_int32_t *)(const void *)(&(a)->s6_addr[12]) != ntohl(1))) + ((*(const __uint32_t *)(const void *)(&(a)->s6_addr[0]) == 0) && \ + (*(const __uint32_t *)(const void *)(&(a)->s6_addr[4]) == 0) && \ + (*(const __uint32_t *)(const void *)(&(a)->s6_addr[8]) == 0) && \ + (*(const __uint32_t *)(const void *)(&(a)->s6_addr[12]) != 0) && \ + (*(const __uint32_t *)(const void *)(&(a)->s6_addr[12]) != ntohl(1))) /* * Mapped */ #define IN6_IS_ADDR_V4MAPPED(a) \ - ((*(const u_int32_t *)(const void *)(&(a)->s6_addr[0]) == 0) && \ - (*(const u_int32_t *)(const void *)(&(a)->s6_addr[4]) == 0) && \ - (*(const u_int32_t *)(const void *)(&(a)->s6_addr[8]) == ntohl(0x0000ffff))) + ((*(const __uint32_t *)(const void *)(&(a)->s6_addr[0]) == 0) && \ + (*(const __uint32_t *)(const void *)(&(a)->s6_addr[4]) == 0) && \ + (*(const __uint32_t *)(const void *)(&(a)->s6_addr[8]) == ntohl(0x0000ffff))) /* * KAME Scope Values @@ -368,43 +383,47 @@ extern const struct in6_addr in6addr_linklocal_allrouters; #define IFA6_IS_DEPRECATED(a) \ ((a)->ia6_lifetime.ia6t_preferred != 0 && \ - (a)->ia6_lifetime.ia6t_preferred < time_second) + (a)->ia6_lifetime.ia6t_preferred < timenow.tv_sec) #define IFA6_IS_INVALID(a) \ ((a)->ia6_lifetime.ia6t_expire != 0 && \ - (a)->ia6_lifetime.ia6t_expire < time_second) -#endif /* _KERNEL */ + (a)->ia6_lifetime.ia6t_expire < timenow.tv_sec) +#endif /* KERNEL */ /* * IP6 route structure */ -#ifdef __APPLE_API_PRIVATE -#if !defined(_XOPEN_SOURCE) +#ifndef _POSIX_C_SOURCE +#ifdef PRIVATE struct route_in6 { struct rtentry *ro_rt; struct sockaddr_in6 ro_dst; }; -#endif -#endif /* __APPLE_API_PRIVATE */ +#endif /* PRIVATE */ +#endif /* _POSIX_C_SOURCE */ /* * Options for use with [gs]etsockopt at the IPV6 level. * First word of comment is data type; bool is stored in int. 
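
Note on the IFA6_IS_DEPRECATED / IFA6_IS_INVALID change above: the macros now expand against a variable named timenow, so every caller must declare and fill a local snapshot in scope before using them. Sketch:

    struct timeval timenow;

    getmicrotime(&timenow);             /* the macros read timenow.tv_sec */
    if (IFA6_IS_DEPRECATED(ia6) || IFA6_IS_INVALID(ia6))
            ;                           /* e.g. skip this source address */
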
*/ /* no hdrincl */ +#ifndef _POSIX_C_SOURCE #if 0 /* the followings are relic in IPv4 and hence are disabled */ #define IPV6_OPTIONS 1 /* buf/ip6_opts; set/get IP6 options */ #define IPV6_RECVOPTS 5 /* bool; receive all IP6 opts w/dgram */ #define IPV6_RECVRETOPTS 6 /* bool; receive IP6 opts for response */ #define IPV6_RECVDSTADDR 7 /* bool; receive IP6 dst addr w/dgram */ #define IPV6_RETOPTS 8 /* ip6_opts; set/get IP6 options */ -#endif +#endif 0 #define IPV6_SOCKOPT_RESERVED1 3 /* reserved for future use */ +#endif /* _POSIX_C_SOURCE */ #define IPV6_UNICAST_HOPS 4 /* int; IP6 hops */ -#define IPV6_MULTICAST_IF 9 /* u_char; set/get IP6 multicast i/f */ -#define IPV6_MULTICAST_HOPS 10 /* u_char; set/get IP6 multicast hops */ -#define IPV6_MULTICAST_LOOP 11 /* u_char; set/get IP6 multicast loopback */ +#define IPV6_MULTICAST_IF 9 /* __uint8_t; set/get IP6 multicast i/f */ +#define IPV6_MULTICAST_HOPS 10 /* __uint8_t; set/get IP6 multicast hops */ +#define IPV6_MULTICAST_LOOP 11 /* __uint8_t; set/get IP6 mcast loopback */ #define IPV6_JOIN_GROUP 12 /* ip6_mreq; join a group membership */ #define IPV6_LEAVE_GROUP 13 /* ip6_mreq; leave a group membership */ + +#ifndef _POSIX_C_SOURCE #define IPV6_PORTRANGE 14 /* int; range to choose for unspec port */ #define ICMP6_FILTER 18 /* icmp6_filter; icmp6 filter */ /* RFC2292 options */ @@ -417,15 +436,17 @@ struct route_in6 { #define IPV6_PKTOPTIONS 25 /* buf/cmsghdr; set/get IPv6 options */ #define IPV6_CHECKSUM 26 /* int; checksum offset for raw socket */ +#endif /* _POSIX_C_SOURCE */ #define IPV6_V6ONLY 27 /* bool; only bind INET6 at wildcard bind */ +#ifndef _POSIX_C_SOURCE #ifndef KERNEL #define IPV6_BINDV6ONLY IPV6_V6ONLY -#endif +#endif KERNEL #if 1 /*IPSEC*/ #define IPV6_IPSEC_POLICY 28 /* struct; get/set security policy */ -#endif +#endif 1 #define IPV6_FAITH 29 /* bool; accept FAITH'ed connections */ #if 1 /*IPV6FIREWALL*/ @@ -434,7 +455,7 @@ struct route_in6 { #define IPV6_FW_FLUSH 32 /* flush firewall rule chain */ #define IPV6_FW_ZERO 33 /* clear single/all firewall counter(s) */ #define IPV6_FW_GET 34 /* get entire firewall rule chain */ -#endif +#endif 1 /* to define items, should talk with KAME guys first, for *BSD compatibility */ @@ -472,7 +493,6 @@ struct in6_pktinfo { #define IPV6_PORTRANGE_HIGH 1 /* "high" - request firewall bypass */ #define IPV6_PORTRANGE_LOW 2 /* "low" - vouchsafe security */ -#if !defined(_XOPEN_SOURCE) /* * Definitions for inet6 sysctl operations. 
* @@ -481,6 +501,54 @@ struct in6_pktinfo { */ #define IPV6PROTO_MAXID (IPPROTO_PIM + 1) /* don't list to IPV6PROTO_MAX */ +/* + * Names for IP sysctl objects + */ +#define IPV6CTL_FORWARDING 1 /* act as router */ +#define IPV6CTL_SENDREDIRECTS 2 /* may send redirects when forwarding*/ +#define IPV6CTL_DEFHLIM 3 /* default Hop-Limit */ +#ifdef notyet +#define IPV6CTL_DEFMTU 4 /* default MTU */ +#endif +#define IPV6CTL_FORWSRCRT 5 /* forward source-routed dgrams */ +#define IPV6CTL_STATS 6 /* stats */ +#define IPV6CTL_MRTSTATS 7 /* multicast forwarding stats */ +#define IPV6CTL_MRTPROTO 8 /* multicast routing protocol */ +#define IPV6CTL_MAXFRAGPACKETS 9 /* max packets reassembly queue */ +#define IPV6CTL_SOURCECHECK 10 /* verify source route and intf */ +#define IPV6CTL_SOURCECHECK_LOGINT 11 /* minimume logging interval */ +#define IPV6CTL_ACCEPT_RTADV 12 +#define IPV6CTL_KEEPFAITH 13 +#define IPV6CTL_LOG_INTERVAL 14 +#define IPV6CTL_HDRNESTLIMIT 15 +#define IPV6CTL_DAD_COUNT 16 +#define IPV6CTL_AUTO_FLOWLABEL 17 +#define IPV6CTL_DEFMCASTHLIM 18 +#define IPV6CTL_GIF_HLIM 19 /* default HLIM for gif encap packet */ +#define IPV6CTL_KAME_VERSION 20 +#define IPV6CTL_USE_DEPRECATED 21 /* use deprecated addr (RFC2462 5.5.4) */ +#define IPV6CTL_RR_PRUNE 22 /* walk timer for router renumbering */ +#if 0 /*obsolete*/ +#define IPV6CTL_MAPPED_ADDR 23 +#endif +#define IPV6CTL_V6ONLY 24 +#define IPV6CTL_RTEXPIRE 25 /* cloned route expiration time */ +#define IPV6CTL_RTMINEXPIRE 26 /* min value for expiration time */ +#define IPV6CTL_RTMAXCACHE 27 /* trigger level for dynamic expire */ + +#define IPV6CTL_USETEMPADDR 32 /* use temporary addresses (RFC3041) */ +#define IPV6CTL_TEMPPLTIME 33 /* preferred lifetime for tmpaddrs */ +#define IPV6CTL_TEMPVLTIME 34 /* valid lifetime for tmpaddrs */ +#define IPV6CTL_AUTO_LINKLOCAL 35 /* automatic link-local addr assign */ +#define IPV6CTL_RIP6STATS 36 /* raw_ip6 stats */ + +#define IPV6CTL_MAXFRAGS 41 /* max fragments */ + +/* New entries should be added here from current IPV6CTL_MAXID value. 
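
The IPV6CTL_* names relocated above are consumed as the last component of a net.inet6.ip6 MIB vector. A user-level sketch of reading one, assuming a system built with this header:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <netinet/in.h>

    static int
    ipv6_forwarding_enabled(void)
    {
            int mib[4] = { CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_FORWARDING };
            int on = 0;
            size_t len = sizeof (on);

            if (sysctl(mib, 4, &on, &len, NULL, 0) == -1)
                    return (0);         /* treat errors as "off" */
            return (on != 0);           /* nonzero: node forwards IPv6 */
    }
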
*/ +/* to define items, should talk with KAME guys first, for *BSD compatibility */ +#define IPV6CTL_MAXID 42 + +#ifdef KERNEL_PRIVATE #define CTL_IPV6PROTO_NAMES { \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, \ @@ -527,54 +595,6 @@ struct in6_pktinfo { { 0, 0 }, \ { "pim6", CTLTYPE_NODE }, \ } - -/* - * Names for IP sysctl objects - */ -#define IPV6CTL_FORWARDING 1 /* act as router */ -#define IPV6CTL_SENDREDIRECTS 2 /* may send redirects when forwarding*/ -#define IPV6CTL_DEFHLIM 3 /* default Hop-Limit */ -#ifdef notyet -#define IPV6CTL_DEFMTU 4 /* default MTU */ -#endif -#define IPV6CTL_FORWSRCRT 5 /* forward source-routed dgrams */ -#define IPV6CTL_STATS 6 /* stats */ -#define IPV6CTL_MRTSTATS 7 /* multicast forwarding stats */ -#define IPV6CTL_MRTPROTO 8 /* multicast routing protocol */ -#define IPV6CTL_MAXFRAGPACKETS 9 /* max packets reassembly queue */ -#define IPV6CTL_SOURCECHECK 10 /* verify source route and intf */ -#define IPV6CTL_SOURCECHECK_LOGINT 11 /* minimume logging interval */ -#define IPV6CTL_ACCEPT_RTADV 12 -#define IPV6CTL_KEEPFAITH 13 -#define IPV6CTL_LOG_INTERVAL 14 -#define IPV6CTL_HDRNESTLIMIT 15 -#define IPV6CTL_DAD_COUNT 16 -#define IPV6CTL_AUTO_FLOWLABEL 17 -#define IPV6CTL_DEFMCASTHLIM 18 -#define IPV6CTL_GIF_HLIM 19 /* default HLIM for gif encap packet */ -#define IPV6CTL_KAME_VERSION 20 -#define IPV6CTL_USE_DEPRECATED 21 /* use deprecated addr (RFC2462 5.5.4) */ -#define IPV6CTL_RR_PRUNE 22 /* walk timer for router renumbering */ -#if 0 /*obsolete*/ -#define IPV6CTL_MAPPED_ADDR 23 -#endif -#define IPV6CTL_V6ONLY 24 -#define IPV6CTL_RTEXPIRE 25 /* cloned route expiration time */ -#define IPV6CTL_RTMINEXPIRE 26 /* min value for expiration time */ -#define IPV6CTL_RTMAXCACHE 27 /* trigger level for dynamic expire */ - -#define IPV6CTL_USETEMPADDR 32 /* use temporary addresses (RFC3041) */ -#define IPV6CTL_TEMPPLTIME 33 /* preferred lifetime for tmpaddrs */ -#define IPV6CTL_TEMPVLTIME 34 /* valid lifetime for tmpaddrs */ -#define IPV6CTL_AUTO_LINKLOCAL 35 /* automatic link-local addr assign */ -#define IPV6CTL_RIP6STATS 36 /* raw_ip6 stats */ - -/* New entries should be added here from current IPV6CTL_MAXID value. 
*/ -/* to define items, should talk with KAME guys first, for *BSD compatibility */ -#define IPV6CTL_MAXID 37 - -#endif /* !_XOPEN_SOURCE */ - /* * Redefinition of mbuf flags */ @@ -583,74 +603,73 @@ struct in6_pktinfo { #define M_LOOP M_PROTO4 #define M_AUTHIPDGM M_PROTO5 -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE struct cmsghdr; struct mbuf; struct ifnet; struct in6_aliasreq; -int in6_cksum __P((struct mbuf *, u_int8_t, u_int32_t, u_int32_t)); -int in6_localaddr __P((struct in6_addr *)); -int in6_addrscope __P((struct in6_addr *)); -struct in6_ifaddr *in6_ifawithscope __P((struct ifnet *, struct in6_addr *)); -struct in6_ifaddr *in6_ifawithifp __P((struct ifnet *, struct in6_addr *)); -extern void in6_if_up __P((struct ifnet *, struct in6_aliasreq *)); +int in6_cksum(struct mbuf *, __uint8_t, __uint32_t, __uint32_t); +int in6_localaddr(struct in6_addr *); +int in6_addrscope(struct in6_addr *); +struct in6_ifaddr *in6_ifawithscope(struct ifnet *, struct in6_addr *); +struct in6_ifaddr *in6_ifawithifp(struct ifnet *, struct in6_addr *); +extern void in6_if_up(struct ifnet *, struct in6_aliasreq *); struct sockaddr; -void in6_sin6_2_sin __P((struct sockaddr_in *sin, - struct sockaddr_in6 *sin6)); -void in6_sin_2_v4mapsin6 __P((struct sockaddr_in *sin, - struct sockaddr_in6 *sin6)); -void in6_sin6_2_sin_in_sock __P((struct sockaddr *nam)); -void in6_sin_2_v4mapsin6_in_sock __P((struct sockaddr **nam)); +void in6_sin6_2_sin(struct sockaddr_in *sin, + struct sockaddr_in6 *sin6); +void in6_sin_2_v4mapsin6(struct sockaddr_in *sin, + struct sockaddr_in6 *sin6); +void in6_sin6_2_sin_in_sock(struct sockaddr *nam); +void in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam); #define satosin6(sa) ((struct sockaddr_in6 *)(sa)) #define sin6tosa(sin6) ((struct sockaddr *)(sin6)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#endif KERNEL_PRIVATE +#ifndef KERNEL __BEGIN_DECLS struct cmsghdr; -extern int inet6_option_space __P((int)); -extern int inet6_option_init __P((void *, struct cmsghdr **, int)); -extern int inet6_option_append __P((struct cmsghdr *, const u_int8_t *, - int, int)); -extern u_int8_t *inet6_option_alloc __P((struct cmsghdr *, int, int, int)); -extern int inet6_option_next __P((const struct cmsghdr *, u_int8_t **)); -extern int inet6_option_find __P((const struct cmsghdr *, u_int8_t **, int)); - -extern size_t inet6_rthdr_space __P((int, int)); -extern struct cmsghdr *inet6_rthdr_init __P((void *, int)); -extern int inet6_rthdr_add __P((struct cmsghdr *, const struct in6_addr *, - unsigned int)); -extern int inet6_rthdr_lasthop __P((struct cmsghdr *, unsigned int)); +extern int inet6_option_space(int); +extern int inet6_option_init(void *, struct cmsghdr **, int); +extern int inet6_option_append(struct cmsghdr *, const __uint8_t *, + int, int); +extern __uint8_t *inet6_option_alloc(struct cmsghdr *, int, int, int); +extern int inet6_option_next(const struct cmsghdr *, __uint8_t **); +extern int inet6_option_find(const struct cmsghdr *, __uint8_t **, int); + +extern size_t inet6_rthdr_space(int, int); +extern struct cmsghdr *inet6_rthdr_init(void *, int); +extern int inet6_rthdr_add(struct cmsghdr *, const struct in6_addr *, + unsigned int); +extern int inet6_rthdr_lasthop(struct cmsghdr *, unsigned int); #if 0 /* not implemented yet */ -extern int inet6_rthdr_reverse __P((const struct cmsghdr *, struct cmsghdr *)); +extern int inet6_rthdr_reverse(const struct cmsghdr *, struct cmsghdr *); #endif -extern int inet6_rthdr_segments 
__P((const struct cmsghdr *)); -extern struct in6_addr *inet6_rthdr_getaddr __P((struct cmsghdr *, int)); -extern int inet6_rthdr_getflags __P((const struct cmsghdr *, int)); - -extern int inet6_opt_init __P((void *, size_t)); -extern int inet6_opt_append __P((void *, size_t, int, u_int8_t, - size_t, u_int8_t, void **)); -extern int inet6_opt_finish __P((void *, size_t, int)); -extern int inet6_opt_set_val __P((void *, size_t, void *, int)); - -extern int inet6_opt_next __P((void *, size_t, int, u_int8_t *, - size_t *, void **)); -extern int inet6_opt_find __P((void *, size_t, int, u_int8_t, - size_t *, void **)); -extern int inet6_opt_get_val __P((void *, size_t, void *, int)); -extern size_t inet6_rth_space __P((int, int)); -extern void *inet6_rth_init __P((void *, int, int, int)); -extern int inet6_rth_add __P((void *, const struct in6_addr *)); -extern int inet6_rth_reverse __P((const void *, void *)); -extern int inet6_rth_segments __P((const void *)); -extern struct in6_addr *inet6_rth_getaddr __P((const void *, int)); +extern int inet6_rthdr_segments(const struct cmsghdr *); +extern struct in6_addr *inet6_rthdr_getaddr(struct cmsghdr *, int); +extern int inet6_rthdr_getflags(const struct cmsghdr *, int); + +extern int inet6_opt_init(void *, size_t); +extern int inet6_opt_append(void *, size_t, int, __uint8_t, + size_t, __uint8_t, void **); +extern int inet6_opt_finish(void *, size_t, int); +extern int inet6_opt_set_val(void *, size_t, void *, int); + +extern int inet6_opt_next(void *, size_t, int, __uint8_t *, + size_t *, void **); +extern int inet6_opt_find(void *, size_t, int, __uint8_t, + size_t *, void **); +extern int inet6_opt_get_val(void *, size_t, void *, int); +extern size_t inet6_rth_space(int, int); +extern void *inet6_rth_init(void *, int, int, int); +extern int inet6_rth_add(void *, const struct in6_addr *); +extern int inet6_rth_reverse(const void *, void *); +extern int inet6_rth_segments(const void *); +extern struct in6_addr *inet6_rth_getaddr(const void *, int); __END_DECLS - +#endif !KERNEL +#endif /* _POSIX_C_SOURCE */ #endif /* !_NETINET6_IN6_H_ */ diff --git a/bsd/netinet6/in6_gif.c b/bsd/netinet6/in6_gif.c index 2886c784b..5c1138d0a 100644 --- a/bsd/netinet6/in6_gif.c +++ b/bsd/netinet6/in6_gif.c @@ -68,11 +68,11 @@ #include <net/net_osdep.h> int -in6_gif_output(ifp, family, m, rt) - struct ifnet *ifp; - int family; /* family of the packet to be encapsulate. */ - struct mbuf *m; - struct rtentry *rt; +in6_gif_output( + struct ifnet *ifp, + int family, /* family of the packet to be encapsulate. */ + struct mbuf *m, + struct rtentry *rt) { struct gif_softc *sc = (struct gif_softc*)ifp; struct sockaddr_in6 *dst = (struct sockaddr_in6 *)&sc->gif_ro6.ro_dst; @@ -201,9 +201,9 @@ in6_gif_output(ifp, family, m, rt) * it is too painful to ask for resend of inner packet, to achieve * path MTU discovery for encapsulated packets. 
*/ - return(ip6_output(m, 0, &sc->gif_ro6, IPV6_MINMTU, 0, NULL)); + return(ip6_output(m, 0, &sc->gif_ro6, IPV6_MINMTU, 0, NULL, 0)); #else - return(ip6_output(m, 0, &sc->gif_ro6, 0, 0, NULL)); + return(ip6_output(m, 0, &sc->gif_ro6, 0, 0, NULL, 0)); #endif } diff --git a/bsd/netinet6/in6_gif.h b/bsd/netinet6/in6_gif.h index f34f963dd..6e292cd5d 100644 --- a/bsd/netinet6/in6_gif.h +++ b/bsd/netinet6/in6_gif.h @@ -34,12 +34,12 @@ #define _NETINET6_IN6_GIF_H_ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE #define GIF_HLIM 30 -int in6_gif_input __P((struct mbuf **, int *)); -int in6_gif_output __P((struct ifnet *, int, struct mbuf *, struct rtentry *)); -int gif_encapcheck6 __P((const struct mbuf *, int, int, void *)); -#endif /* __APPLE_API_PRIVATE */ +int in6_gif_input(struct mbuf **, int *); +int in6_gif_output(struct ifnet *, int, struct mbuf *, struct rtentry *); +int gif_encapcheck6(const struct mbuf *, int, int, void *); +#endif KERNEL_PRIVATE -#endif /*_NETINET6_IN6_GIF_H_*/ +#endif _NETINET6_IN6_GIF_H_ diff --git a/bsd/netinet6/in6_ifattach.c b/bsd/netinet6/in6_ifattach.c index 2b627e522..2ccb29cd0 100644 --- a/bsd/netinet6/in6_ifattach.c +++ b/bsd/netinet6/in6_ifattach.c @@ -39,6 +39,7 @@ #include <sys/kernel.h> #include <sys/syslog.h> #include <sys/md5.h> +#include <kern/lock.h> #include <net/if.h> #include <net/if_dl.h> @@ -66,6 +67,7 @@ struct icmp6_ifstat **icmp6_ifstat = NULL; size_t in6_ifstatmax = 0; size_t icmp6_ifstatmax = 0; unsigned long in6_maxmtu = 0; +extern lck_mtx_t *nd6_mutex; #if IP6_AUTO_LINKLOCAL int ip6_auto_linklocal = IP6_AUTO_LINKLOCAL; @@ -76,13 +78,14 @@ int ip6_auto_linklocal = 1; /* enable by default */ extern struct inpcbinfo udbinfo; extern struct inpcbinfo ripcbinfo; +extern lck_mtx_t *rt_mtx; -static int get_rand_ifid __P((struct ifnet *, struct in6_addr *)); -static int generate_tmp_ifid __P((u_int8_t *, const u_int8_t *, u_int8_t *)); -static int get_hw_ifid __P((struct ifnet *, struct in6_addr *)); -static int get_ifid __P((struct ifnet *, struct ifnet *, struct in6_addr *)); -static int in6_ifattach_linklocal __P((struct ifnet *, struct ifnet *, struct in6_aliasreq *)); -static int in6_ifattach_loopback __P((struct ifnet *)); +static int get_rand_ifid(struct ifnet *, struct in6_addr *); +static int generate_tmp_ifid(u_int8_t *, const u_int8_t *, u_int8_t *); +static int get_hw_ifid(struct ifnet *, struct in6_addr *); +static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *); +static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *, struct in6_aliasreq *); +static int in6_ifattach_loopback(struct ifnet *); #define EUI64_GBIT 0x01 #define EUI64_UBIT 0x02 @@ -103,9 +106,9 @@ static int in6_ifattach_loopback __P((struct ifnet *)); * We currently use MD5(hostname) for it. 
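 *
 * A sketch of that derivation (illustrative only; the exact bit
 * handling lives in the body the diff elides, and hostname /
 * hostnamelen are the kernel globals this code is assumed to use):
 * hash the hostname with MD5, take the first 8 digest bytes as the
 * interface identifier, and force the EUI-64 u/g bits so the result
 * is a locally administered, individual identifier:
 *
 *	MD5Init(&ctxt);
 *	MD5Update(&ctxt, hostname, hostnamelen);
 *	MD5Final(digest, &ctxt);
 *	bcopy(digest, &in6->s6_addr[8], 8);	(lower 64 bits only)
 *	in6->s6_addr[8] &= ~EUI64_GBIT;
 *	in6->s6_addr[8] |= EUI64_UBIT;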
*/ static int -get_rand_ifid(ifp, in6) - struct ifnet *ifp; - struct in6_addr *in6; /* upper 64bits are preserved */ +get_rand_ifid( + struct ifnet *ifp, + struct in6_addr *in6) /* upper 64bits are preserved */ { MD5_CTX ctxt; u_int8_t digest[16]; @@ -137,9 +140,10 @@ get_rand_ifid(ifp, in6) } static int -generate_tmp_ifid(seed0, seed1, ret) - u_int8_t *seed0, *ret; - const u_int8_t *seed1; +generate_tmp_ifid( + u_int8_t *seed0, + const u_int8_t *seed1, + u_int8_t *ret) { MD5_CTX ctxt; u_int8_t seed[16], digest[16], nullbuf[8]; @@ -226,9 +230,9 @@ generate_tmp_ifid(seed0, seed1, ret) * XXX assumes single sockaddr_dl (AF_LINK address) per an interface */ static int -get_hw_ifid(ifp, in6) - struct ifnet *ifp; - struct in6_addr *in6; /* upper 64bits are preserved */ +get_hw_ifid( + struct ifnet *ifp, + struct in6_addr *in6) /* upper 64bits are preserved */ { struct ifaddr *ifa; struct sockaddr_dl *sdl; @@ -238,6 +242,8 @@ get_hw_ifid(ifp, in6) static u_int8_t allone[8] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + /* Why doesn't this code use ifnet_addrs? */ + ifnet_lock_shared(ifp); for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) @@ -252,10 +258,12 @@ get_hw_ifid(ifp, in6) goto found; } + ifnet_lock_done(ifp); return -1; found: + ifnet_lock_done(ifp); addr = LLADDR(sdl); addrlen = sdl->sdl_alen; @@ -265,6 +273,8 @@ found: case IFT_FDDI: case IFT_ATM: case IFT_IEEE1394: + case IFT_L2VLAN: + case IFT_IEEE8023ADLAG: #if IFT_IEEE80211 case IFT_IEEE80211: #endif @@ -359,10 +369,10 @@ found: * sources. */ static int -get_ifid(ifp0, altifp, in6) - struct ifnet *ifp0; - struct ifnet *altifp; /* secondary EUI64 source */ - struct in6_addr *in6; +get_ifid( + struct ifnet *ifp0, + struct ifnet *altifp, /* secondary EUI64 source */ + struct in6_addr *in6) { struct ifnet *ifp; @@ -381,8 +391,8 @@ get_ifid(ifp0, altifp, in6) } /* next, try to get it from some other hardware interface */ - for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_list.tqe_next) - { + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_list) { if (ifp == ifp0) continue; if (get_hw_ifid(ifp, in6) != 0) @@ -396,9 +406,11 @@ get_ifid(ifp0, altifp, in6) nd6log((LOG_DEBUG, "%s: borrow interface identifier from %s\n", if_name(ifp0), if_name(ifp))); + ifnet_head_done(); goto success; } } + ifnet_head_done(); /* last resort: get from random number source */ if (get_rand_ifid(ifp, in6) == 0) { @@ -423,22 +435,22 @@ success: } static int -in6_ifattach_linklocal(ifp, altifp, ifra_passed) - struct ifnet *ifp; - struct ifnet *altifp; /* secondary EUI64 source */ - struct in6_aliasreq *ifra_passed; +in6_ifattach_linklocal( + struct ifnet *ifp, + struct ifnet *altifp, /* secondary EUI64 source */ + struct in6_aliasreq *ifra_passed) { struct in6_ifaddr *ia; struct in6_aliasreq ifra; struct nd_prefix pr0; - int i, dl_tag, error; + int i, error; /* * configure link-local address. */ bzero(&ifra, sizeof(ifra)); - dlil_plumb_protocol(PF_INET6, ifp, &dl_tag); + dlil_plumb_protocol(PF_INET6, ifp); /* * in6_update_ifa() does not use ifra_name, but we accurately set it @@ -572,8 +584,8 @@ in6_ifattach_linklocal(ifp, altifp, ifra_passed) } static int -in6_ifattach_loopback(ifp) - struct ifnet *ifp; /* must be IFT_LOOP */ +in6_ifattach_loopback( + struct ifnet *ifp) /* must be IFT_LOOP */ { struct in6_aliasreq ifra; int error; @@ -633,11 +645,11 @@ in6_ifattach_loopback(ifp) * when ifp == NULL, the caller is responsible for filling scopeid. 
*/ int -in6_nigroup(ifp, name, namelen, in6) - struct ifnet *ifp; - const char *name; - int namelen; - struct in6_addr *in6; +in6_nigroup( + struct ifnet *ifp, + const char *name, + int namelen, + struct in6_addr *in6) { const char *p; u_char *q; @@ -680,9 +692,9 @@ in6_nigroup(ifp, name, namelen, in6) } void -in6_nigroup_attach(name, namelen) - const char *name; - int namelen; +in6_nigroup_attach( + const char *name, + int namelen) { struct ifnet *ifp; struct sockaddr_in6 mltaddr; @@ -695,12 +707,14 @@ in6_nigroup_attach(name, namelen) if (in6_nigroup(NULL, name, namelen, &mltaddr.sin6_addr) != 0) return; - for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_list.tqe_next) - { + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_list) { mltaddr.sin6_addr.s6_addr16[1] = htons(ifp->if_index); + ifnet_lock_shared(ifp); IN6_LOOKUP_MULTI(mltaddr.sin6_addr, ifp, in6m); + ifnet_lock_done(ifp); if (!in6m) { - if (!in6_addmulti(&mltaddr.sin6_addr, ifp, &error)) { + if (!in6_addmulti(&mltaddr.sin6_addr, ifp, &error, 0)) { nd6log((LOG_ERR, "%s: failed to join %s " "(errno=%d)\n", if_name(ifp), ip6_sprintf(&mltaddr.sin6_addr), @@ -708,12 +722,13 @@ in6_nigroup_attach(name, namelen) } } } + ifnet_head_done(); } void -in6_nigroup_detach(name, namelen) - const char *name; - int namelen; +in6_nigroup_detach( + const char *name, + int namelen) { struct ifnet *ifp; struct sockaddr_in6 mltaddr; @@ -725,13 +740,16 @@ in6_nigroup_detach(name, namelen) if (in6_nigroup(NULL, name, namelen, &mltaddr.sin6_addr) != 0) return; - for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_list.tqe_next) - { + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_list) { mltaddr.sin6_addr.s6_addr16[1] = htons(ifp->if_index); + ifnet_lock_shared(ifp); IN6_LOOKUP_MULTI(mltaddr.sin6_addr, ifp, in6m); + ifnet_lock_done(ifp); if (in6m) - in6_delmulti(in6m); + in6_delmulti(in6m, 0); } + ifnet_head_done(); } /* @@ -740,16 +758,15 @@ in6_nigroup_detach(name, namelen) * XXX multiple link-local address case */ void -in6_ifattach(ifp, altifp, ifra) - struct ifnet *ifp; - struct ifnet *altifp; /* secondary EUI64 source */ - struct in6_aliasreq *ifra; +in6_ifattach( + struct ifnet *ifp, + struct ifnet *altifp, /* secondary EUI64 source */ + struct in6_aliasreq *ifra) { static size_t if_indexlim = 8; struct in6_ifaddr *ia; struct in6_addr in6; - /* * We have some arrays that should be indexed by if_index. * since if_index will grow dynamically, they should grow too. @@ -830,11 +847,15 @@ in6_ifattach(ifp, altifp, ifra) * XXX multiple loopback interface case. */ if ((ifp->if_flags & IFF_LOOPBACK) != 0) { + struct in6_ifaddr *ia6 = NULL; in6 = in6addr_loopback; - if (in6ifa_ifpwithaddr(ifp, &in6) == NULL) { + if ((ia6 = in6ifa_ifpwithaddr(ifp, &in6)) == NULL) { if (in6_ifattach_loopback(ifp) != 0) return; } + else { + ifafree(&ia6->ia_ifa); + } } /* @@ -880,10 +901,10 @@ statinit: * from the ifnet list in bsdi. 
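 *
 * A note on the locking the rewritten detach path relies on
 * (condensed from the hunks that follow, not a formal lock-order
 * statement): the global in6_ifaddrs chain is walked under nd6_mutex,
 * the interface's own address list under the exclusive ifnet lock,
 * and each routing-table operation is bracketed by rt_mtx, in the
 * pattern
 *
 *	lck_mtx_lock(rt_mtx);
 *	rt = rtalloc1_locked((struct sockaddr *)&ia->ia_addr, 0, 0UL);
 *	...
 *	rtrequest_locked(RTM_DELETE, ...);
 *	lck_mtx_unlock(rt_mtx);
 *
 * replacing the old unlocked rtalloc1()/rtrequest() calls.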
*/ void -in6_ifdetach(ifp) - struct ifnet *ifp; +in6_ifdetach( + struct ifnet *ifp) { - struct in6_ifaddr *ia, *oia; + struct in6_ifaddr *ia, *oia, *nia; struct ifaddr *ifa, *next; struct rtentry *rt; short rtflags; @@ -898,13 +919,17 @@ in6_ifdetach(ifp) nd6_purge(ifp); /* nuke any of IPv6 addresses we have */ - for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = next) - { - next = ifa->ifa_list.tqe_next; - if (ifa->ifa_addr->sa_family != AF_INET6) + + lck_mtx_lock(nd6_mutex); + for (ia = in6_ifaddrs; ia != NULL; ia = nia) { + nia = ia->ia_next; + if (ia->ia_ifa.ifa_ifp != ifp) continue; - in6_purgeaddr(ifa); + in6_purgeaddr(&ia->ia_ifa, 1); } + lck_mtx_unlock(nd6_mutex); + + ifnet_lock_exclusive(ifp); /* undo everything done by in6_ifattach(), just in case */ for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = next) @@ -920,25 +945,28 @@ in6_ifdetach(ifp) ia = (struct in6_ifaddr *)ifa; /* remove from the routing table */ + lck_mtx_lock(rt_mtx); if ((ia->ia_flags & IFA_ROUTE) - && (rt = rtalloc1((struct sockaddr *)&ia->ia_addr, 0, 0UL))) { + && (rt = rtalloc1_locked((struct sockaddr *)&ia->ia_addr, 0, 0UL))) { rtflags = rt->rt_flags; - rtfree(rt); - rtrequest(RTM_DELETE, + rtfree_locked(rt); + rtrequest_locked(RTM_DELETE, (struct sockaddr *)&ia->ia_addr, (struct sockaddr *)&ia->ia_addr, (struct sockaddr *)&ia->ia_prefixmask, rtflags, (struct rtentry **)0); } + lck_mtx_unlock(rt_mtx); /* remove from the linked list */ - TAILQ_REMOVE(&ifp->if_addrlist, (struct ifaddr *)ia, ifa_list); + if_detach_ifa(ifp, &ia->ia_ifa); ifafree(&ia->ia_ifa); /* also remove from the IPv6 address chain(itojun&jinmei) */ oia = ia; - if (oia == (ia = in6_ifaddr)) - in6_ifaddr = ia->ia_next; + lck_mtx_lock(nd6_mutex); + if (oia == (ia = in6_ifaddrs)) + in6_ifaddrs = ia->ia_next; else { while (ia->ia_next && (ia->ia_next != oia)) ia = ia->ia_next; @@ -950,27 +978,11 @@ in6_ifdetach(ifp) "list\n", if_name(ifp))); } } + lck_mtx_unlock(nd6_mutex); - IFAFREE(&oia->ia_ifa); - } - -#ifndef __APPLE__ - -/* This is a cause for reentrency, as those multicast addresses are - * freed both from the interface detaching and triggered by the closing of the socket - * Let the socket do the cleanup and not force it from the interface level - */ - /* leave from all multicast groups joined */ - in6_pcbpurgeif0(LIST_FIRST(udbinfo.listhead), ifp); - in6_pcbpurgeif0(LIST_FIRST(ripcbinfo.listhead), ifp); - for (in6m = LIST_FIRST(&in6_multihead); in6m; in6m = in6m_next) { - in6m_next = LIST_NEXT(in6m, in6m_entry); - if (in6m->in6m_ifp != ifp) - continue; - in6_delmulti(in6m); - in6m = NULL; + ifafree(&oia->ia_ifa); } -#endif /* __APPLE__ */ + ifnet_lock_done(ifp); /* * remove neighbor management table. 
we call it twice just to make @@ -988,20 +1000,22 @@ in6_ifdetach(ifp) sin6.sin6_family = AF_INET6; sin6.sin6_addr = in6addr_linklocal_allnodes; sin6.sin6_addr.s6_addr16[1] = htons(ifp->if_index); - rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL); + lck_mtx_lock(rt_mtx); + rt = rtalloc1_locked((struct sockaddr *)&sin6, 0, 0UL); if (rt && rt->rt_ifp == ifp) { - rtrequest(RTM_DELETE, (struct sockaddr *)rt_key(rt), + rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0); - rtfree(rt); + rtfree_locked(rt); } + lck_mtx_unlock(rt_mtx); } void -in6_get_tmpifid(ifp, retbuf, baseid, generate) - struct ifnet *ifp; - u_int8_t *retbuf; - const u_int8_t *baseid; - int generate; +in6_get_tmpifid( + struct ifnet *ifp, + u_int8_t *retbuf, + const u_int8_t *baseid, + int generate) { u_int8_t nullbuf[8]; struct nd_ifinfo *ndi = &nd_ifinfo[ifp->if_index]; @@ -1022,31 +1036,18 @@ in6_get_tmpifid(ifp, retbuf, baseid, generate) bcopy(ndi->randomid, retbuf, 8); } -void -in6_tmpaddrtimer_funneled(void *ignored_arg) -{ -#ifdef __APPLE__ - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); -#endif - in6_tmpaddrtimer(ignored_arg); -#ifdef __APPLE__ - (void) thread_funnel_set(network_flock, FALSE); -#endif -} - extern size_t nd_ifinfo_indexlim; extern int ip6_use_tempaddr; void -in6_tmpaddrtimer(ignored_arg) - void *ignored_arg; +in6_tmpaddrtimer( + void *ignored_arg) { int i; struct nd_ifinfo *ndi; u_int8_t nullbuf[8]; int s = splnet(); - timeout(in6_tmpaddrtimer_funneled, (caddr_t)0, + timeout(in6_tmpaddrtimer, (caddr_t)0, (ip6_temp_preferred_lifetime - ip6_desync_factor - ip6_temp_regen_advance) * hz); @@ -1068,5 +1069,6 @@ in6_tmpaddrtimer(ignored_arg) } } } + splx(s); } diff --git a/bsd/netinet6/in6_ifattach.h b/bsd/netinet6/in6_ifattach.h index 307bd0b8f..f0b7d2d09 100644 --- a/bsd/netinet6/in6_ifattach.h +++ b/bsd/netinet6/in6_ifattach.h @@ -33,16 +33,14 @@ #define _NETINET6_IN6_IFATTACH_H_ #include <sys/appleapiopts.h> -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -void in6_nigroup_attach __P((const char *, int)); -void in6_nigroup_detach __P((const char *, int)); -void in6_ifattach __P((struct ifnet *, struct ifnet *, struct in6_aliasreq *)); -void in6_ifdetach __P((struct ifnet *)); -void in6_get_tmpifid __P((struct ifnet *, u_int8_t *, const u_int8_t *, int)); -void in6_tmpaddrtimer __P((void *)); -int in6_nigroup __P((struct ifnet *, const char *, int, struct in6_addr *)); -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#ifdef KERNEL_PRIVATE +void in6_nigroup_attach(const char *, int); +void in6_nigroup_detach(const char *, int); +void in6_ifattach(struct ifnet *, struct ifnet *, struct in6_aliasreq *); +void in6_ifdetach(struct ifnet *); +void in6_get_tmpifid(struct ifnet *, u_int8_t *, const u_int8_t *, int); +void in6_tmpaddrtimer(void *); +int in6_nigroup(struct ifnet *, const char *, int, struct in6_addr *); +#endif KERNEL_PRIVATE -#endif /* _NETINET6_IN6_IFATTACH_H_ */ +#endif _NETINET6_IN6_IFATTACH_H_ diff --git a/bsd/netinet6/in6_pcb.c b/bsd/netinet6/in6_pcb.c index 709196f22..2e82a0376 100644 --- a/bsd/netinet6/in6_pcb.c +++ b/bsd/netinet6/in6_pcb.c @@ -1,3 +1,24 @@ +/* + * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. 
Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. @@ -91,6 +112,9 @@ #include <netinet6/in6_pcb.h> #include <net/if_types.h> +#include <kern/kern_types.h> +#include <kern/zalloc.h> + #include "faith.h" #if defined(NFAITH) && NFAITH > 0 #include <net/if_faith.h> @@ -106,15 +130,16 @@ #include <netinet6/ah6.h> #endif #include <netkey/key.h> +extern lck_mtx_t *sadb_mutex; #endif /* IPSEC */ struct in6_addr zeroin6_addr; int -in6_pcbbind(inp, nam, p) - register struct inpcb *inp; - struct sockaddr *nam; - struct proc *p; +in6_pcbbind( + struct inpcb *inp, + struct sockaddr *nam, + struct proc *p) { struct socket *so = inp->inp_socket; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)NULL; @@ -122,25 +147,36 @@ in6_pcbbind(inp, nam, p) u_short lport = 0; int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); - if (!in6_ifaddr) /* XXX broken! */ + if (!in6_ifaddrs) /* XXX broken! */ return (EADDRNOTAVAIL); if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) return(EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) wild = 1; + socket_unlock(so, 0); /* keep reference */ + lck_rw_lock_exclusive(pcbinfo->mtx); if (nam) { sin6 = (struct sockaddr_in6 *)nam; - if (nam->sa_len != sizeof(*sin6)) + if (nam->sa_len != sizeof(*sin6)) { + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); return(EINVAL); + } /* * family check. */ - if (nam->sa_family != AF_INET6) + if (nam->sa_family != AF_INET6) { + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); return(EAFNOSUPPORT); + } /* KAME hack: embed scopeid */ - if (in6_embedscope(&sin6->sin6_addr, sin6, inp, NULL) != 0) + if (in6_embedscope(&sin6->sin6_addr, sin6, inp, NULL) != 0) { + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); return EINVAL; + } /* this must be cleared for ifa_ifwithaddr() */ sin6->sin6_scope_id = 0; @@ -159,8 +195,11 @@ in6_pcbbind(inp, nam, p) struct ifaddr *ia = NULL; sin6->sin6_port = 0; /* yech... 
*/ - if ((ia = ifa_ifwithaddr((struct sockaddr *)sin6)) == 0) + if ((ia = ifa_ifwithaddr((struct sockaddr *)sin6)) == 0) { + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); return(EADDRNOTAVAIL); + } /* * XXX: bind to an anycast address might accidentally @@ -171,20 +210,24 @@ in6_pcbbind(inp, nam, p) if (ia && ((struct in6_ifaddr *)ia)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_DETACHED)) { + ifafree(ia); + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); return(EADDRNOTAVAIL); } + ifafree(ia); + ia = NULL; } if (lport) { struct inpcb *t; /* GROSS */ if (ntohs(lport) < IPV6PORT_RESERVED && p && -#if 0 - suser(p->p_ucred, &p->p_acflag)) -#else - ((so->so_state & SS_PRIV) == 0)) -#endif + ((so->so_state & SS_PRIV) == 0)) { + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); return(EACCES); + } if (so->so_uid && !IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { @@ -196,8 +239,11 @@ in6_pcbbind(inp, nam, p) !IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) || (t->inp_socket->so_options & SO_REUSEPORT) == 0) && - so->so_uid != t->inp_socket->so_uid) + so->so_uid != t->inp_socket->so_uid) { + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); return (EADDRINUSE); + } if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 && IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct sockaddr_in sin; @@ -212,14 +258,21 @@ in6_pcbbind(inp, nam, p) (ntohl(t->inp_laddr.s_addr) != INADDR_ANY || INP_SOCKAF(so) == - INP_SOCKAF(t->inp_socket))) + INP_SOCKAF(t->inp_socket))) { + + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); return (EADDRINUSE); + } } } t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr, lport, wild); - if (t && (reuseport & t->inp_socket->so_options) == 0) + if (t && (reuseport & t->inp_socket->so_options) == 0) { + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); return(EADDRINUSE); + } if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 && IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct sockaddr_in sin; @@ -233,25 +286,33 @@ in6_pcbbind(inp, nam, p) (ntohl(t->inp_laddr.s_addr) != INADDR_ANY || INP_SOCKAF(so) == - INP_SOCKAF(t->inp_socket))) + INP_SOCKAF(t->inp_socket))) { + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); return (EADDRINUSE); + } } } inp->in6p_laddr = sin6->sin6_addr; } + socket_lock(so, 0); if (lport == 0) { int e; - if ((e = in6_pcbsetport(&inp->in6p_laddr, inp, p)) != 0) + if ((e = in6_pcbsetport(&inp->in6p_laddr, inp, p, 1)) != 0) { + lck_rw_done(pcbinfo->mtx); return(e); + } } else { inp->inp_lport = lport; - if (in_pcbinshash(inp) != 0) { + if (in_pcbinshash(inp, 1) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; + lck_rw_done(pcbinfo->mtx); return (EAGAIN); } - } + } + lck_rw_done(pcbinfo->mtx); return(0); } @@ -268,12 +329,15 @@ in6_pcbbind(inp, nam, p) */ int -in6_pcbladdr(inp, nam, plocal_addr6) - register struct inpcb *inp; - struct sockaddr *nam; - struct in6_addr **plocal_addr6; +in6_pcbladdr( + struct inpcb *inp, + struct sockaddr *nam, + struct in6_addr *plocal_addr6) { - register struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; + struct in6_addr *addr6 = NULL; + struct in6_addr src_storage; + struct ifnet *ifp = NULL; int error = 0; @@ -288,7 +352,7 @@ in6_pcbladdr(inp, nam, plocal_addr6) if (in6_embedscope(&sin6->sin6_addr, sin6, inp, &ifp) != 0) return EINVAL; - if (in6_ifaddr) { + if (in6_ifaddrs) { /* * If the destination address is UNSPECIFIED addr, * use the loopback addr, e.g ::1. 
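 *
 * (That is, a connect to the unspecified address :: behaves like a
 * connect to ::1, mirroring the BSD convention that connecting to
 * INADDR_ANY reaches the local host.  The substitution itself sits in
 * context the diff elides; presumably something like
 *
 *	if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
 *		sin6->sin6_addr = in6addr_loopback;
 *
 * precedes the source-address selection in the next hunk.)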
@@ -302,15 +366,16 @@ in6_pcbladdr(inp, nam, plocal_addr6) * with the address specified by setsockopt(IPV6_PKTINFO). * Is it the intended behavior? */ - *plocal_addr6 = in6_selectsrc(sin6, inp->in6p_outputopts, + addr6 = in6_selectsrc(sin6, inp->in6p_outputopts, inp->in6p_moptions, &inp->in6p_route, - &inp->in6p_laddr, &error); - if (*plocal_addr6 == 0) { + &inp->in6p_laddr, &src_storage, &error); + if (addr6 == 0) { if (error == 0) error = EADDRNOTAVAIL; return(error); } + *plocal_addr6 = *addr6; /* * Don't do pcblookup call here; return interface in * plocal_addr6 @@ -333,12 +398,13 @@ in6_pcbladdr(inp, nam, plocal_addr6) */ int in6_pcbconnect(inp, nam, p) - register struct inpcb *inp; + struct inpcb *inp; struct sockaddr *nam; struct proc *p; { - struct in6_addr *addr6; - register struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; + struct in6_addr addr6; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; + struct inpcb *pcb; int error; /* @@ -347,12 +413,15 @@ in6_pcbconnect(inp, nam, p) */ if ((error = in6_pcbladdr(inp, nam, &addr6)) != 0) return(error); - - if (in6_pcblookup_hash(inp->inp_pcbinfo, &sin6->sin6_addr, + socket_unlock(inp->inp_socket, 0); + pcb = in6_pcblookup_hash(inp->inp_pcbinfo, &sin6->sin6_addr, sin6->sin6_port, IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) - ? addr6 : &inp->in6p_laddr, - inp->inp_lport, 0, NULL) != NULL) { + ? &addr6 : &inp->in6p_laddr, + inp->inp_lport, 0, NULL); + socket_lock(inp->inp_socket, 0); + if (pcb != NULL) { + in_pcb_checkstate(pcb, WNT_RELEASE, 0); return (EADDRINUSE); } if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { @@ -361,7 +430,13 @@ in6_pcbconnect(inp, nam, p) if (error) return (error); } - inp->in6p_laddr = *addr6; + inp->in6p_laddr = addr6; + } + if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) { + /*lock inversion issue, mostly with udp multicast packets */ + socket_unlock(inp->inp_socket, 0); + lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx); + socket_lock(inp->inp_socket, 0); } inp->in6p_faddr = sin6->sin6_addr; inp->inp_fport = sin6->sin6_port; @@ -372,6 +447,7 @@ in6_pcbconnect(inp, nam, p) (htonl(ip6_flow_seq++) & IPV6_FLOWLABEL_MASK); in_pcbrehash(inp); + lck_rw_done(inp->inp_pcbinfo->mtx); return (0); } @@ -383,13 +459,14 @@ in6_pcbconnect(inp, nam, p) * an entry to the caller for later use. 
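 *
 * With this revision the chosen address is copied into the caller-
 * supplied src_storage buffer and the in6_ifaddr reference taken
 * during selection is dropped with ifafree() before returning, so
 * callers no longer hold a bare pointer into a refcounted ifaddr.
 * The calling convention (as in the in6_pcbladdr() hunk above) is:
 *
 *	struct in6_addr src_storage, *addr6;
 *	int error = 0;
 *
 *	addr6 = in6_selectsrc(sin6, inp->in6p_outputopts,
 *	    inp->in6p_moptions, &inp->in6p_route, &inp->in6p_laddr,
 *	    &src_storage, &error);
 *	if (addr6 == 0)
 *		return (error ? error : EADDRNOTAVAIL);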
*/ struct in6_addr * -in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) - struct sockaddr_in6 *dstsock; - struct ip6_pktopts *opts; - struct ip6_moptions *mopts; - struct route_in6 *ro; - struct in6_addr *laddr; - int *errorp; +in6_selectsrc( + struct sockaddr_in6 *dstsock, + struct ip6_pktopts *opts, + struct ip6_moptions *mopts, + struct route_in6 *ro, + struct in6_addr *laddr, + struct in6_addr *src_storage, + int *errorp) { struct in6_addr *dst; struct in6_ifaddr *ia6 = 0; @@ -426,7 +503,9 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) *errorp = EADDRNOTAVAIL; return(0); } - return(&satosin6(&ia6->ia_addr)->sin6_addr); + *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; + ifafree(&ia6->ia_ifa); + return(src_storage); } /* @@ -455,7 +534,9 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) *errorp = EADDRNOTAVAIL; return(0); } - return(&satosin6(&ia6->ia_addr)->sin6_addr); + *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; + ifafree(&ia6->ia_ifa); + return(src_storage); } /* @@ -480,7 +561,9 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) *errorp = EADDRNOTAVAIL; return(0); } - return(&ia6->ia_addr.sin6_addr); + *src_storage = ia6->ia_addr.sin6_addr; + ifafree(&ia6->ia_ifa); + return(src_storage); } } @@ -495,17 +578,21 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) if (opts && opts->ip6po_nexthop) { sin6_next = satosin6(opts->ip6po_nexthop); - rt = nd6_lookup(&sin6_next->sin6_addr, 1, NULL); + rt = nd6_lookup(&sin6_next->sin6_addr, 1, NULL, 0); if (rt) { ia6 = in6_ifawithscope(rt->rt_ifp, dst); - if (ia6 == 0) + if (ia6 == 0) { + ifaref(&rt->rt_ifa); ia6 = ifatoia6(rt->rt_ifa); + } } if (ia6 == 0) { *errorp = EADDRNOTAVAIL; return(0); } - return(&satosin6(&ia6->ia_addr)->sin6_addr); + *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; + ifaref(&rt->rt_ifa); + return(src_storage); } } @@ -546,14 +633,18 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) if (ro->ro_rt) { ia6 = in6_ifawithscope(ro->ro_rt->rt_ifa->ifa_ifp, dst); - if (ia6 == 0) /* xxx scope error ?*/ + if (ia6 == 0) { /* xxx scope error ?*/ + ifaref(ro->ro_rt->rt_ifa); ia6 = ifatoia6(ro->ro_rt->rt_ifa); + } } if (ia6 == 0) { *errorp = EHOSTUNREACH; /* no route */ return(0); } - return(&satosin6(&ia6->ia_addr)->sin6_addr); + *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; + ifaref(&rt->rt_ifa); + return(src_storage); } *errorp = EADDRNOTAVAIL; @@ -568,9 +659,9 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) * 3. The system default hoplimit. 
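 *
 * For example, a socket that never set IPV6_UNICAST_HOPS (so
 * in6p_hops stays negative) sending over an interface whose router
 * advertisements supplied no hop limit falls through to ip6_defhlim.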
*/ int -in6_selecthlim(in6p, ifp) - struct in6pcb *in6p; - struct ifnet *ifp; +in6_selecthlim( + struct in6pcb *in6p, + struct ifnet *ifp) { if (in6p && in6p->in6p_hops >= 0) return(in6p->in6p_hops); @@ -585,11 +676,18 @@ void in6_pcbdisconnect(inp) struct inpcb *inp; { + if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) { + /*lock inversion issue, mostly with udp multicast packets */ + socket_unlock(inp->inp_socket, 0); + lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx); + socket_lock(inp->inp_socket, 0); + } bzero((caddr_t)&inp->in6p_faddr, sizeof(inp->in6p_faddr)); inp->inp_fport = 0; /* clear flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK; in_pcbrehash(inp); + lck_rw_done(inp->inp_pcbinfo->mtx); if (inp->inp_socket->so_state & SS_NOFDREF) in6_pcbdetach(inp); } @@ -602,27 +700,35 @@ in6_pcbdetach(inp) struct inpcbinfo *ipi = inp->inp_pcbinfo; #if IPSEC - if (inp->in6p_sp != NULL) + if (inp->in6p_sp != NULL) { + lck_mtx_lock(sadb_mutex); ipsec6_delete_pcbpolicy(inp); + lck_mtx_unlock(sadb_mutex); + } #endif /* IPSEC */ - inp->inp_gencnt = ++ipi->ipi_gencnt; - in_pcbremlists(inp); - sotoinpcb(so) = 0; - sofree(so); - - if (inp->in6p_options) - m_freem(inp->in6p_options); - ip6_freepcbopts(inp->in6p_outputopts); - ip6_freemoptions(inp->in6p_moptions); - if (inp->in6p_route.ro_rt) - rtfree(inp->in6p_route.ro_rt); - /* Check and free IPv4 related resources in case of mapped addr */ - if (inp->inp_options) - (void)m_free(inp->inp_options); - ip_freemoptions(inp->inp_moptions); - inp->inp_vflag = 0; - zfree(ipi->ipi_zone, inp); + if (in_pcb_checkstate(inp, WNT_STOPUSING, 1) != WNT_STOPUSING) + printf("in6_pcbdetach so=%x can't be marked dead ok\n", so); + + inp->inp_state = INPCB_STATE_DEAD; + + if ((so->so_flags & SOF_PCBCLEARING) == 0) { + inp->inp_vflag = 0; + so->so_flags |= SOF_PCBCLEARING; + inp->inp_gencnt = ++ipi->ipi_gencnt; + if (inp->in6p_options) + m_freem(inp->in6p_options); + ip6_freepcbopts(inp->in6p_outputopts); + ip6_freemoptions(inp->in6p_moptions); + if (inp->in6p_route.ro_rt) + rtfree(inp->in6p_route.ro_rt); + /* Check and free IPv4 related resources in case of mapped addr */ + if (inp->inp_options) + (void)m_free(inp->inp_options); + ip_freemoptions(inp->inp_moptions); + inp->inp_moptions = NULL; + + } } struct sockaddr * @@ -684,20 +790,16 @@ in6_setsockaddr(so, nam) struct socket *so; struct sockaddr **nam; { - int s; - register struct inpcb *inp; + struct inpcb *inp; struct in6_addr addr; in_port_t port; - s = splnet(); inp = sotoinpcb(so); if (!inp) { - splx(s); return EINVAL; } port = inp->inp_lport; addr = inp->in6p_laddr; - splx(s); *nam = in6_sockaddr(port, &addr); return 0; @@ -708,20 +810,16 @@ in6_setpeeraddr(so, nam) struct socket *so; struct sockaddr **nam; { - int s; struct inpcb *inp; struct in6_addr addr; in_port_t port; - s = splnet(); inp = sotoinpcb(so); if (!inp) { - splx(s); return EINVAL; } port = inp->inp_fport; addr = inp->in6p_faddr; - splx(s); *nam = in6_sockaddr(port, &addr); return 0; @@ -777,20 +875,21 @@ in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam) * Must be called at splnet. 
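 *
 * (The splnet() requirement is historical.  In the body below the
 * walk is serialized by holding pcbinfo->mtx shared, and each matched
 * PCB is pinned and socket-locked around its callback:
 *
 *	if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
 *		continue;
 *	socket_lock(inp->inp_socket, 1);
 *	(*notify)(inp, errno);
 *	(void)in_pcb_checkstate(inp, WNT_RELEASE, 1);
 *	socket_unlock(inp->inp_socket, 1);
 *
 * so PCBs already marked for death are simply skipped.)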
*/ void -in6_pcbnotify(head, dst, fport_arg, src, lport_arg, cmd, notify) - struct inpcbhead *head; +in6_pcbnotify(pcbinfo, dst, fport_arg, src, lport_arg, cmd, notify) + struct inpcbinfo *pcbinfo; struct sockaddr *dst; const struct sockaddr *src; u_int fport_arg, lport_arg; int cmd; -// struct inpcb *(*notify) __P((struct inpcb *, int)); - void (*notify) __P((struct inpcb *, int)); +// struct inpcb *(*notify)(struct inpcb *, int); + void (*notify)(struct inpcb *, int); { struct inpcb *inp, *ninp; struct sockaddr_in6 sa6_src, *sa6_dst; u_short fport = fport_arg, lport = lport_arg; u_int32_t flowinfo; - int errno, s; + int errno; + struct inpcbhead *head = pcbinfo->listhead; if ((unsigned)cmd > PRC_NCMDS || dst->sa_family != AF_INET6) return; @@ -822,7 +921,7 @@ in6_pcbnotify(head, dst, fport_arg, src, lport_arg, cmd, notify) notify = in6_rtchange; } errno = inet6ctlerrmap[cmd]; - s = splnet(); + lck_rw_lock_shared(pcbinfo->mtx); for (inp = LIST_FIRST(head); inp != NULL; inp = ninp) { ninp = LIST_NEXT(inp, inp_list); @@ -849,14 +948,21 @@ in6_pcbnotify(head, dst, fport_arg, src, lport_arg, cmd, notify) (!IN6_IS_ADDR_UNSPECIFIED(&sa6_src.sin6_addr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr)) || - (fport && inp->inp_fport != fport)) + (fport && inp->inp_fport != fport)) continue; + do_notify: - if (notify) + if (notify) { + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) + continue; + socket_lock(inp->inp_socket, 1); (*notify)(inp, errno); + (void)in_pcb_checkstate(inp, WNT_RELEASE, 1); + socket_unlock(inp->inp_socket, 1); + } } - splx(s); + lck_rw_done(pcbinfo->mtx); } /* @@ -869,7 +975,7 @@ in6_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay) u_int lport_arg; int wild_okay; { - register struct inpcb *inp; + struct inpcb *inp; int matchwild = 3, wildcard; u_short lport = lport_arg; @@ -947,11 +1053,12 @@ in6_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay) return (match); } } - +#ifndef APPLE +/* this is not used in Darwin */ void -in6_pcbpurgeif0(head, ifp) - struct in6pcb *head; - struct ifnet *ifp; +in6_pcbpurgeif0( + struct in6pcb *head, + struct ifnet *ifp) { struct in6pcb *in6p; struct ip6_moptions *im6o; @@ -986,6 +1093,7 @@ in6_pcbpurgeif0(head, ifp) } } } +#endif /* * Check for alternatives when higher level complains @@ -1007,9 +1115,10 @@ in6_losing(in6p) (struct sockaddr *)&in6p->in6p_route.ro_dst; info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); + lck_mtx_lock(rt_mtx); rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0); if (rt->rt_flags & RTF_DYNAMIC) - (void)rtrequest(RTM_DELETE, rt_key(rt), + (void)rtrequest_locked(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, (struct rtentry **)0); else @@ -1017,7 +1126,8 @@ in6_losing(in6p) * A new route can be allocated * the next time output is attempted. */ - rtfree(rt); + rtfree_locked(rt); + lck_mtx_unlock(rt_mtx); } } @@ -1026,9 +1136,9 @@ in6_losing(in6p) * and allocate a (hopefully) better one. */ void -in6_rtchange(inp, errno) - struct inpcb *inp; - int errno; +in6_rtchange( + struct inpcb *inp, + int errno) { if (inp->in6p_route.ro_rt) { rtfree(inp->in6p_route.ro_rt); @@ -1044,15 +1154,17 @@ in6_rtchange(inp, errno) * Lookup PCB in hash list. 
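 *
 * A non-NULL result is now returned only after
 * in_pcb_checkstate(inp, WNT_ACQUIRE, 0) succeeds, i.e. the PCB is
 * pinned for the caller, who must balance it with a WNT_RELEASE.
 * Condensed from the in6_pcbconnect() hunk above:
 *
 *	pcb = in6_pcblookup_hash(inp->inp_pcbinfo, &sin6->sin6_addr,
 *	    sin6->sin6_port, laddr, inp->inp_lport, 0, NULL);
 *	if (pcb != NULL) {
 *		in_pcb_checkstate(pcb, WNT_RELEASE, 0);
 *		return (EADDRINUSE);
 *	}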
*/ struct inpcb * -in6_pcblookup_hash(pcbinfo, faddr, fport_arg, laddr, lport_arg, wildcard, ifp) - struct inpcbinfo *pcbinfo; - struct in6_addr *faddr, *laddr; - u_int fport_arg, lport_arg; - int wildcard; - struct ifnet *ifp; +in6_pcblookup_hash( + struct inpcbinfo *pcbinfo, + struct in6_addr *faddr, + u_int fport_arg, + struct in6_addr *laddr, + u_int lport_arg, + int wildcard, + struct ifnet *ifp) { struct inpcbhead *head; - register struct inpcb *inp; + struct inpcb *inp; u_short fport = fport_arg, lport = lport_arg; int faith; @@ -1062,6 +1174,8 @@ in6_pcblookup_hash(pcbinfo, faddr, fport_arg, laddr, lport_arg, wildcard, ifp) faith = 0; #endif + lck_rw_lock_shared(pcbinfo->mtx); + /* * First look for an exact match. */ @@ -1076,9 +1190,16 @@ in6_pcblookup_hash(pcbinfo, faddr, fport_arg, laddr, lport_arg, wildcard, ifp) inp->inp_fport == fport && inp->inp_lport == lport) { /* - * Found. - */ - return (inp); + * Found. Check if pcb is still valid + */ + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) { + lck_rw_done(pcbinfo->mtx); + return (inp); + } + else { /* it's there but dead, say it isn't found */ + lck_rw_done(pcbinfo->mtx); + return(NULL); + } } } if (wildcard) { @@ -1094,18 +1215,34 @@ in6_pcblookup_hash(pcbinfo, faddr, fport_arg, laddr, lport_arg, wildcard, ifp) if (faith && (inp->inp_flags & INP_FAITH) == 0) continue; if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, - laddr)) - return (inp); + laddr)) { + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) { + lck_rw_done(pcbinfo->mtx); + return (inp); + } + else { /* it's there but dead, say it isn't found */ + lck_rw_done(pcbinfo->mtx); + return(NULL); + } + } else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) local_wild = inp; } } - return (local_wild); + if (local_wild && in_pcb_checkstate(local_wild, WNT_ACQUIRE, 0) != WNT_STOPUSING) { + lck_rw_done(pcbinfo->mtx); + return (local_wild); + } + else { + lck_rw_done(pcbinfo->mtx); + return (NULL); + } } /* * Not found. 
*/ + lck_rw_done(pcbinfo->mtx); return (NULL); } diff --git a/bsd/netinet6/in6_pcb.h b/bsd/netinet6/in6_pcb.h index b0ebf339e..2bae22c14 100644 --- a/bsd/netinet6/in6_pcb.h +++ b/bsd/netinet6/in6_pcb.h @@ -67,50 +67,50 @@ #define _NETINET6_IN6_PCB_H_ #include <sys/appleapiopts.h> -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE #define satosin6(sa) ((struct sockaddr_in6 *)(sa)) #define sin6tosa(sin6) ((struct sockaddr *)(sin6)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) -void in6_pcbpurgeif0 __P((struct in6pcb *, struct ifnet *)); -void in6_losing __P((struct inpcb *)); -int in6_pcballoc __P((struct socket *, struct inpcbinfo *, struct proc *)); -int in6_pcbbind __P((struct inpcb *, struct sockaddr *, struct proc *)); -int in6_pcbconnect __P((struct inpcb *, struct sockaddr *, struct proc *)); -void in6_pcbdetach __P((struct inpcb *)); -void in6_pcbdisconnect __P((struct inpcb *)); -int in6_pcbladdr __P((struct inpcb *, struct sockaddr *, - struct in6_addr **)); +#ifndef APPLE +//void in6_pcbpurgeif0(struct in6pcb *, struct ifnet *); +#endif +void in6_losing(struct inpcb *); +int in6_pcballoc(struct socket *, struct inpcbinfo *, struct proc *); +int in6_pcbbind(struct inpcb *, struct sockaddr *, struct proc *); +int in6_pcbconnect(struct inpcb *, struct sockaddr *, struct proc *); +void in6_pcbdetach(struct inpcb *); +void in6_pcbdisconnect(struct inpcb *); +int in6_pcbladdr(struct inpcb *, struct sockaddr *, + struct in6_addr *); struct inpcb * - in6_pcblookup_local __P((struct inpcbinfo *, - struct in6_addr *, u_int, int)); + in6_pcblookup_local(struct inpcbinfo *, + struct in6_addr *, u_int, int); struct inpcb * - in6_pcblookup_hash __P((struct inpcbinfo *, + in6_pcblookup_hash(struct inpcbinfo *, struct in6_addr *, u_int, struct in6_addr *, - u_int, int, struct ifnet *)); -void in6_pcbnotify __P((struct inpcbhead *, struct sockaddr *, + u_int, int, struct ifnet *); +void in6_pcbnotify(struct inpcbinfo *, struct sockaddr *, u_int, const struct sockaddr *, u_int, int, - void (*)(struct inpcb *, int))); + void (*)(struct inpcb *, int)); void - in6_rtchange __P((struct inpcb *, int)); + in6_rtchange(struct inpcb *, int); struct sockaddr * - in6_sockaddr __P((in_port_t port, struct in6_addr *addr_p)); + in6_sockaddr(in_port_t port, struct in6_addr *addr_p); struct sockaddr * - in6_v4mapsin6_sockaddr __P((in_port_t port, struct in_addr *addr_p)); -int in6_setpeeraddr __P((struct socket *so, struct sockaddr **nam)); -int in6_setsockaddr __P((struct socket *so, struct sockaddr **nam)); -int in6_mapped_sockaddr __P((struct socket *so, struct sockaddr **nam)); -int in6_mapped_peeraddr __P((struct socket *so, struct sockaddr **nam)); -struct in6_addr *in6_selectsrc __P((struct sockaddr_in6 *, + in6_v4mapsin6_sockaddr(in_port_t port, struct in_addr *addr_p); +int in6_setpeeraddr(struct socket *so, struct sockaddr **nam); +int in6_setsockaddr(struct socket *so, struct sockaddr **nam); +int in6_mapped_sockaddr(struct socket *so, struct sockaddr **nam); +int in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam); +struct in6_addr *in6_selectsrc(struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, - struct route_in6 *, - struct in6_addr *, int *)); -int in6_selecthlim __P((struct in6pcb *, struct ifnet *)); -int in6_pcbsetport __P((struct in6_addr *, struct inpcb *, struct proc *)); -void init_sin6 __P((struct sockaddr_in6 *sin6, struct mbuf *m)); -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ + struct route_in6 *, struct in6_addr *, + struct 
in6_addr *, int *); +int in6_selecthlim(struct in6pcb *, struct ifnet *); +int in6_pcbsetport(struct in6_addr *, struct inpcb *, struct proc *, int); +void init_sin6(struct sockaddr_in6 *sin6, struct mbuf *m); +#endif KERNEL_PRIVATE -#endif /* !_NETINET6_IN6_PCB_H_ */ +#endif !_NETINET6_IN6_PCB_H_ diff --git a/bsd/netinet6/in6_prefix.c b/bsd/netinet6/in6_prefix.c index b9096fe2b..88a945059 100644 --- a/bsd/netinet6/in6_prefix.c +++ b/bsd/netinet6/in6_prefix.c @@ -94,13 +94,14 @@ struct rr_prhead rr_prefix; #include <net/net_osdep.h> -static void add_each_addr __P((struct socket *so, struct rr_prefix *rpp, - struct rp_addr *rap)); -static int create_ra_entry __P((struct rp_addr **rapp)); -static int add_each_prefix __P((struct socket *so, struct rr_prefix *rpp)); -static void free_rp_entries __P((struct rr_prefix *rpp)); -static int link_stray_ia6s __P((struct rr_prefix *rpp)); -static void rp_remove __P((struct rr_prefix *rpp)); +static void add_each_addr(struct socket *so, struct rr_prefix *rpp, + struct rp_addr *rap); +static int create_ra_entry(struct rp_addr **rapp); +static int add_each_prefix(struct socket *so, struct rr_prefix *rpp); +static void free_rp_entries(struct rr_prefix *rpp); +static int link_stray_ia6s(struct rr_prefix *rpp); +static void rp_remove(struct rr_prefix *rpp); +extern lck_mtx_t *prefix6_mutex; /* * Copy bits from src to tgt, from off bit for len bits. @@ -158,6 +159,7 @@ in6_prefixwithifp(struct ifnet *ifp, int plen, struct in6_addr *dst) struct ifprefix *ifpr; /* search matched prefix */ + ifnet_lock_shared(ifp); for (ifpr = TAILQ_FIRST(&ifp->if_prefixhead); ifpr; ifpr = TAILQ_NEXT(ifpr, ifpr_list)) { @@ -167,6 +169,7 @@ in6_prefixwithifp(struct ifnet *ifp, int plen, struct in6_addr *dst) if (plen <= in6_matchlen(dst, IFPR_IN6(ifpr))) break; } + ifnet_lock_done(ifp); return (ifpr); } @@ -192,6 +195,7 @@ search_matched_prefix(struct ifnet *ifp, struct in6_prefixreq *ipr) * which matches the addr */ + ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family != AF_INET6) @@ -200,13 +204,17 @@ search_matched_prefix(struct ifnet *ifp, struct in6_prefixreq *ipr) in6_matchlen(&ipr->ipr_prefix.sin6_addr, IFA_IN6(ifa))) break; } - if (ifa == NULL) + if (ifa == NULL) { + ifnet_lock_done(ifp); return NULL; + } rpp = ifpr2rp(((struct in6_ifaddr *)ifa)->ia6_ifpr); - if (rpp != 0) + if (rpp != 0) { + ifnet_lock_done(ifp); return rpp; - + } + for (ifpr = TAILQ_FIRST(&ifp->if_prefixhead); ifpr; ifpr = TAILQ_NEXT(ifpr, ifpr_list)) { @@ -217,6 +225,7 @@ search_matched_prefix(struct ifnet *ifp, struct in6_prefixreq *ipr) IFPR_IN6(ifpr))) break; } + ifnet_lock_done(ifp); if (ifpr != NULL) log(LOG_ERR, "in6_prefix.c: search_matched_prefix: addr %s" "has no pointer to prefix %s\n", ip6_sprintf(IFA_IN6(ifa)), @@ -237,6 +246,7 @@ mark_matched_prefixes(u_long cmd, struct ifnet *ifp, struct in6_rrenumreq *irr) int matchlen, matched = 0; /* search matched prefixes */ + ifnet_lock_exclusive(ifp); /* Should if_prefixhead be protected by IPv6?? 
*/ for (ifpr = TAILQ_FIRST(&ifp->if_prefixhead); ifpr; ifpr = TAILQ_NEXT(ifpr, ifpr_list)) { @@ -282,6 +292,7 @@ mark_matched_prefixes(u_long cmd, struct ifnet *ifp, struct in6_rrenumreq *irr) "ND autoconfigured addr?\n", ip6_sprintf(IFA_IN6(ifa))); } + ifnet_lock_done(ifp); return matched; } @@ -294,6 +305,7 @@ delmark_global_prefixes(struct ifnet *ifp, struct in6_rrenumreq *irr) struct ifprefix *ifpr; /* search matched prefixes */ + ifnet_lock_exclusive(ifp); for (ifpr = TAILQ_FIRST(&ifp->if_prefixhead); ifpr; ifpr = TAILQ_NEXT(ifpr, ifpr_list)) { @@ -305,6 +317,7 @@ delmark_global_prefixes(struct ifnet *ifp, struct in6_rrenumreq *irr) IPV6_ADDR_SCOPE_GLOBAL) ifpr2rp(ifpr)->rp_statef_delmark = 1; } + ifnet_lock_done(ifp); } /* Unmark prefixes */ @@ -314,6 +327,7 @@ unmark_prefixes(struct ifnet *ifp) struct ifprefix *ifpr; /* unmark all prefix */ + ifnet_lock_exclusive(ifp); for (ifpr = TAILQ_FIRST(&ifp->if_prefixhead); ifpr; ifpr = TAILQ_NEXT(ifpr, ifpr_list)) { @@ -324,22 +338,26 @@ unmark_prefixes(struct ifnet *ifp) ifpr2rp(ifpr)->rp_statef_addmark = 0; ifpr2rp(ifpr)->rp_statef_delmark = 0; } + ifnet_lock_done(ifp); } static void init_prefix_ltimes(struct rr_prefix *rpp) { + struct timeval timenow; + + getmicrotime(&timenow); if (rpp->rp_pltime == RR_INFINITE_LIFETIME || rpp->rp_rrf_decrprefd == 0) rpp->rp_preferred = 0; else - rpp->rp_preferred = time_second + rpp->rp_pltime; + rpp->rp_preferred = timenow.tv_sec + rpp->rp_pltime; if (rpp->rp_vltime == RR_INFINITE_LIFETIME || rpp->rp_rrf_decrvalid == 0) rpp->rp_expire = 0; else - rpp->rp_expire = time_second + rpp->rp_vltime; + rpp->rp_expire = timenow.tv_sec + rpp->rp_vltime; } static int @@ -377,6 +395,7 @@ search_ifidwithprefix(struct rr_prefix *rpp, struct in6_addr *ifid) { struct rp_addr *rap; + lck_mtx_lock(prefix6_mutex); LIST_FOREACH(rap, &rpp->rp_addrhead, ra_entry) { if (rr_are_ifid_equal(ifid, &rap->ra_ifid, @@ -384,6 +403,7 @@ search_ifidwithprefix(struct rr_prefix *rpp, struct in6_addr *ifid) rpp->rp_plen)) break; } + lck_mtx_unlock(prefix6_mutex); return rap; } @@ -407,9 +427,9 @@ assign_ra_entry(struct rr_prefix *rpp, int iilen, struct in6_ifaddr *ia) #if 0 /* Can't do this now, because rpp may be on th stack. should fix it? 
*/ ia->ia6_ifpr = rp2ifpr(rpp); #endif - s = splnet(); + lck_mtx_lock(prefix6_mutex); LIST_INSERT_HEAD(&rpp->rp_addrhead, rap, ra_entry); - splx(s); + lck_mtx_unlock(prefix6_mutex); return 0; } @@ -424,7 +444,7 @@ in6_prefix_add_llifid(int iilen, struct in6_ifaddr *ia) struct rr_prefix *rpp; struct rp_addr *rap; struct socket so; - int error, s; + int error; if ((error = create_ra_entry(&rap)) != 0) return(error); @@ -435,6 +455,7 @@ in6_prefix_add_llifid(int iilen, struct in6_ifaddr *ia) /* XXX: init dummy so */ bzero(&so, sizeof(so)); /* insert into list */ + lck_mtx_lock(prefix6_mutex); LIST_FOREACH(rpp, &rr_prefix, rp_entry) { /* @@ -443,11 +464,10 @@ in6_prefix_add_llifid(int iilen, struct in6_ifaddr *ia) if (rpp->rp_ifp != ia->ia_ifp) continue; - s = splnet(); LIST_INSERT_HEAD(&rpp->rp_addrhead, rap, ra_entry); - splx(s); add_each_addr(&so, rpp, rap); } + lck_mtx_unlock(prefix6_mutex); return 0; } @@ -546,9 +566,9 @@ in6_prefix_remove_ifid(int iilen, struct in6_ifaddr *ia) return; rap = search_ifidwithprefix(ifpr2rp(ia->ia6_ifpr), IA6_IN6(ia)); if (rap != NULL) { - int s = splnet(); + lck_mtx_lock(prefix6_mutex); LIST_REMOVE(rap, ra_entry); - splx(s); + lck_mtx_unlock(prefix6_mutex); if (rap->ra_addr) ifafree(&rap->ra_addr->ia_ifa); FREE(rap, M_RR_ADDR); @@ -559,12 +579,13 @@ in6_prefix_remove_ifid(int iilen, struct in6_ifaddr *ia) } void -in6_purgeprefix(ifp) - struct ifnet *ifp; +in6_purgeprefix( + struct ifnet *ifp) { struct ifprefix *ifpr, *nextifpr; /* delete prefixes before ifnet goes away */ + ifnet_lock_exclusive(ifp); for (ifpr = TAILQ_FIRST(&ifp->if_prefixhead); ifpr; ifpr = nextifpr) { @@ -574,6 +595,7 @@ in6_purgeprefix(ifp) continue; (void)delete_each_prefix(ifpr2rp(ifpr), PR_ORIG_KERNEL); } + ifnet_lock_done(ifp); } static void @@ -675,6 +697,7 @@ rrpr_update(struct socket *so, struct rr_prefix *new) int s; /* search existing prefix */ + ifnet_lock_exclusive(new->rp_ifp); for (ifpr = TAILQ_FIRST(&new->rp_ifp->if_prefixhead); ifpr; ifpr = TAILQ_NEXT(ifpr, ifpr_list)) { @@ -695,8 +718,10 @@ rrpr_update(struct socket *so, struct rr_prefix *new) * If the origin of the already-installed prefix is more * preferable than the new one, ignore installation request. */ - if (rpp->rp_origin > new->rp_origin) + if (rpp->rp_origin > new->rp_origin) { + ifnet_lock_done(new->rp_ifp); return(EPERM); + } /* update prefix information */ rpp->rp_flags.prf_ra = new->rp_flags.prf_ra; @@ -712,6 +737,7 @@ rrpr_update(struct socket *so, struct rr_prefix *new) * add rp_addr entries in new into rpp, if they have not * been already included in rpp. */ + lck_mtx_lock(prefix6_mutex); while (!LIST_EMPTY(&new->rp_addrhead)) { rap = LIST_FIRST(&new->rp_addrhead); @@ -723,10 +749,9 @@ rrpr_update(struct socket *so, struct rr_prefix *new) FREE(rap, M_RR_ADDR); continue; } - s = splnet(); LIST_INSERT_HEAD(&rpp->rp_addrhead, rap, ra_entry); - splx(s); } + lck_mtx_unlock(prefix6_mutex); } else { /* * We got a fresh prefix. 
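 *
 * (A recurring conversion in this file, visible in the hunks on both
 * sides of this point: every splnet()/splx() critical section that
 * guarded the rr_prefix list or an rp_addrhead chain becomes a
 * prefix6_mutex section, in the template
 *
 *	lck_mtx_lock(prefix6_mutex);
 *	LIST_INSERT_HEAD(&rpp->rp_addrhead, rap, ra_entry);
 *	lck_mtx_unlock(prefix6_mutex);
 *
 * with ifnet_lock_exclusive()/ifnet_lock_done() additionally
 * bracketing walks of if_prefixhead.)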
@@ -737,9 +762,11 @@ rrpr_update(struct socket *so, struct rr_prefix *new) if (rpp == NULL) { log(LOG_ERR, "in6_prefix.c: rrpr_update:%d" ": ENOBUFS for rr_prefix\n", __LINE__); + ifnet_lock_done(new->rp_ifp); return(ENOBUFS); } /* initilization */ + lck_mtx_lock(prefix6_mutex); *rpp = *new; LIST_INIT(&rpp->rp_addrhead); /* move rp_addr entries of new to rpp */ @@ -749,6 +776,7 @@ rrpr_update(struct socket *so, struct rr_prefix *new) LIST_REMOVE(rap, ra_entry); LIST_INSERT_HEAD(&rpp->rp_addrhead, rap, ra_entry); } + lck_mtx_unlock(prefix6_mutex); /* let rp_ifpr.ifpr_prefix point rr_prefix. */ rpp->rp_ifpr.ifpr_prefix = (struct sockaddr *)&rpp->rp_prefix; @@ -769,10 +797,11 @@ rrpr_update(struct socket *so, struct rr_prefix *new) rp2ifpr(rpp)->ifpr_type = IN6_PREFIX_RR; } /* link rr_prefix entry to rr_prefix list */ - s = splnet(); + lck_mtx_lock(prefix6_mutex); LIST_INSERT_HEAD(&rr_prefix, rpp, rp_entry); - splx(s); + lck_mtx_unlock(prefix6_mutex); } + ifnet_lock_done(new->rp_ifp); if (!new->rp_raf_auto) return 0; @@ -782,6 +811,7 @@ rrpr_update(struct socket *so, struct rr_prefix *new) * If it existed but not pointing to the prefix yet, * init the prefix pointer. */ + lck_mtx_lock(prefix6_mutex); LIST_FOREACH(rap, &rpp->rp_addrhead, ra_entry) { if (rap->ra_addr != NULL) { @@ -791,6 +821,7 @@ rrpr_update(struct socket *so, struct rr_prefix *new) } add_each_addr(so, rpp, rap); } + lck_mtx_unlock(prefix6_mutex); return 0; } @@ -804,14 +835,14 @@ add_each_prefix(struct socket *so, struct rr_prefix *rpp) static void rp_remove(struct rr_prefix *rpp) { - int s; - s = splnet(); /* unlink rp_entry from if_prefixlist */ + lck_mtx_lock(prefix6_mutex); { struct ifnet *ifp = rpp->rp_ifp; struct ifprefix *ifpr; + ifnet_lock_exclusive(ifp); if ((ifpr = TAILQ_FIRST(&ifp->if_prefixhead)) == rp2ifpr(rpp)) TAILQ_FIRST(&ifp->if_prefixhead) = TAILQ_NEXT(ifpr, ifpr_list); @@ -825,10 +856,11 @@ rp_remove(struct rr_prefix *rpp) else printf("Couldn't unlink rr_prefix from ifp\n"); } + ifnet_lock_done(ifp); } /* unlink rp_entry from rr_prefix list */ LIST_REMOVE(rpp, rp_entry); - splx(s); + lck_mtx_unlock(prefix6_mutex); FREE(rpp, M_IP6RR); } @@ -871,6 +903,7 @@ init_newprefix(struct in6_rrenumreq *irr, struct ifprefix *ifpr, irr->irr_u_uselen, min(ifpr->ifpr_plen - irr->irr_u_uselen, irr->irr_u_keeplen)); + lck_mtx_lock(prefix6_mutex); LIST_FOREACH(orap, &(ifpr2rp(ifpr)->rp_addrhead), ra_entry) { struct rp_addr *rap; @@ -893,6 +926,7 @@ init_newprefix(struct in6_rrenumreq *irr, struct ifprefix *ifpr, /* Is some FlagMasks for rrf necessary? */ rpp->rp_rrf = irr->irr_rrf; rpp->rp_origin = irr->irr_origin; + lck_mtx_unlock(prefix6_mutex); return 0; } @@ -904,6 +938,7 @@ free_rp_entries(struct rr_prefix *rpp) * This func is only called with rpp on stack(not on list). 
* So no splnet() here */ + lck_mtx_lock(prefix6_mutex); while (!LIST_EMPTY(&rpp->rp_addrhead)) { struct rp_addr *rap; @@ -914,6 +949,7 @@ free_rp_entries(struct rr_prefix *rpp) ifafree(&rap->ra_addr->ia_ifa); FREE(rap, M_RR_ADDR); } + lck_mtx_unlock(prefix6_mutex); } static int @@ -925,6 +961,7 @@ add_useprefixes(struct socket *so, struct ifnet *ifp, int error = 0; /* add prefixes to each of marked prefix */ + ifnet_lock_exclusive(ifp); for (ifpr = TAILQ_FIRST(&ifp->if_prefixhead); ifpr; ifpr = nextifpr) { nextifpr = TAILQ_NEXT(ifpr, ifpr_list); @@ -937,6 +974,7 @@ add_useprefixes(struct socket *so, struct ifnet *ifp, error = add_each_prefix(so, &rp); } } + ifnet_lock_done(ifp); /* free each rp_addr entry */ free_rp_entries(&rp); @@ -947,14 +985,20 @@ static void unprefer_prefix(struct rr_prefix *rpp) { struct rp_addr *rap; + struct timeval timenow; + getmicrotime(&timenow); + + lck_mtx_lock(prefix6_mutex); for (rap = rpp->rp_addrhead.lh_first; rap != NULL; rap = rap->ra_entry.le_next) { if (rap->ra_addr == NULL) continue; - rap->ra_addr->ia6_lifetime.ia6t_preferred = time_second; + rap->ra_addr->ia6_lifetime.ia6t_preferred = timenow.tv_sec; rap->ra_addr->ia6_lifetime.ia6t_pltime = 0; } + lck_mtx_unlock(prefix6_mutex); + } int @@ -965,29 +1009,28 @@ delete_each_prefix(struct rr_prefix *rpp, u_char origin) if (rpp->rp_origin > origin) return(EPERM); + lck_mtx_lock(prefix6_mutex); while (rpp->rp_addrhead.lh_first != NULL) { struct rp_addr *rap; int s; - s = splnet(); rap = LIST_FIRST(&rpp->rp_addrhead); if (rap == NULL) { - splx(s); break; } LIST_REMOVE(rap, ra_entry); - splx(s); if (rap->ra_addr == NULL) { FREE(rap, M_RR_ADDR); continue; } rap->ra_addr->ia6_ifpr = NULL; - in6_purgeaddr(&rap->ra_addr->ia_ifa); + in6_purgeaddr(&rap->ra_addr->ia_ifa, 0); ifafree(&rap->ra_addr->ia_ifa); FREE(rap, M_RR_ADDR); } rp_remove(rpp); + lck_mtx_unlock(prefix6_mutex); return error; } @@ -998,6 +1041,7 @@ delete_prefixes(struct ifnet *ifp, u_char origin) struct ifprefix *ifpr, *nextifpr; /* delete prefixes marked as tobe deleted */ + ifnet_lock_exclusive(ifp); for (ifpr = TAILQ_FIRST(&ifp->if_prefixhead); ifpr; ifpr = nextifpr) { nextifpr = TAILQ_NEXT(ifpr, ifpr_list); @@ -1007,6 +1051,7 @@ delete_prefixes(struct ifnet *ifp, u_char origin) if (ifpr2rp(ifpr)->rp_statef_delmark) (void)delete_each_prefix(ifpr2rp(ifpr), origin); } + ifnet_lock_done(ifp); } static int @@ -1060,7 +1105,7 @@ in6_prefix_ioctl(struct socket *so, u_long cmd, caddr_t data, int error = 0; /* - * Failsafe for errneous address config program. + * Failsafe for erroneous address config program. * Let's hope rrenumd don't make a mistakes. 
*/ if (ipr->ipr_origin <= PR_ORIG_RA) @@ -1132,6 +1177,7 @@ in6_prefix_ioctl(struct socket *so, u_long cmd, caddr_t data, free_rp_entries(&rp_tmp); break; } + ifnet_lock_exclusive(ifp); for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) @@ -1155,8 +1201,11 @@ in6_prefix_ioctl(struct socket *so, u_long cmd, caddr_t data, rp_tmp.rp_plen, (sizeof(rap->ra_ifid) << 3) - rp_tmp.rp_plen); /* insert into list */ + lck_mtx_lock(prefix6_mutex); LIST_INSERT_HEAD(&rp_tmp.rp_addrhead, rap, ra_entry); + lck_mtx_unlock(prefix6_mutex); } + ifnet_lock_done(ifp); error = add_each_prefix(so, &rp_tmp); @@ -1169,39 +1218,28 @@ in6_prefix_ioctl(struct socket *so, u_long cmd, caddr_t data, if (rpp == NULL || ifp != rpp->rp_ifp) return (EADDRNOTAVAIL); + ifnet_lock_exclusive(ifp); error = delete_each_prefix(rpp, ipr->ipr_origin); + ifnet_lock_done(ifp); break; } bad: return error; } -void -in6_rr_timer_funneled(void *ignored_arg) -{ -#ifdef __APPLE__ - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); -#endif - in6_rr_timer(ignored_arg); -#ifdef __APPLE__ - (void) thread_funnel_set(network_flock, FALSE); -#endif -} - void in6_rr_timer(void *ignored_arg) { - int s; struct rr_prefix *rpp; + struct timeval timenow; - timeout(in6_rr_timer_funneled, (caddr_t)0, ip6_rr_prune * hz); + getmicrotime(&timenow); - s = splnet(); /* expire */ + lck_mtx_lock(prefix6_mutex); rpp = LIST_FIRST(&rr_prefix); while (rpp) { - if (rpp->rp_expire && rpp->rp_expire < time_second) { + if (rpp->rp_expire && rpp->rp_expire < timenow.tv_sec) { struct rr_prefix *next_rpp; next_rpp = LIST_NEXT(rpp, rp_entry); @@ -1209,9 +1247,10 @@ in6_rr_timer(void *ignored_arg) rpp = next_rpp; continue; } - if (rpp->rp_preferred && rpp->rp_preferred < time_second) + if (rpp->rp_preferred && rpp->rp_preferred < timenow.tv_sec) unprefer_prefix(rpp); rpp = LIST_NEXT(rpp, rp_entry); } - splx(s); + lck_mtx_unlock(prefix6_mutex); + timeout(in6_rr_timer, (caddr_t)0, ip6_rr_prune * hz); } diff --git a/bsd/netinet6/in6_prefix.h b/bsd/netinet6/in6_prefix.h index 29a04cff3..d235a069b 100644 --- a/bsd/netinet6/in6_prefix.h +++ b/bsd/netinet6/in6_prefix.h @@ -30,7 +30,7 @@ #include <sys/callout.h> -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE struct rr_prefix { struct ifprefix rp_ifpr; LIST_ENTRY(rr_prefix) rp_entry; @@ -85,8 +85,7 @@ LIST_HEAD(rr_prhead, rr_prefix); extern struct rr_prhead rr_prefix; -void in6_rr_timer __P((void *)); -void in6_rr_timer_funneled __P((void *)); -int delete_each_prefix __P((struct rr_prefix *rpp, u_char origin)); +void in6_rr_timer(void *); +int delete_each_prefix (struct rr_prefix *rpp, u_char origin); -#endif /* __APPLE_API_PRIVATE */ +#endif KERNEL_PRIVATE diff --git a/bsd/netinet6/in6_proto.c b/bsd/netinet6/in6_proto.c index d33c161ce..0f5281bfb 100644 --- a/bsd/netinet6/in6_proto.c +++ b/bsd/netinet6/in6_proto.c @@ -102,6 +102,8 @@ #include <netinet6/nd6.h> #include <netinet6/in6_prefix.h> +#include <netinet6/ip6_mroute.h> + #if IPSEC #include <netinet6/ipsec.h> #if INET6 @@ -133,12 +135,13 @@ extern struct domain inet6domain; static struct pr_usrreqs nousrreqs; +lck_mtx_t *inet6_domain_mutex; #define PR_LISTEN 0 #define PR_ABRTACPTDIS 0 extern struct domain inet6domain; -extern int in6_inithead __P((void **, int)); +extern int in6_inithead(void **, int); void in6_dinit(void); static int rip6_pr_output(struct mbuf *m, struct socket *so, struct sockaddr_in6 *, struct mbuf *); @@ -148,15 +151,21 @@ struct ip6protosw inet6sw[] = { 0, 0, 0, 0, 0, ip6_init, 0, frag6_slowtimo, 
frag6_drain, - 0, &nousrreqs + 0, + &nousrreqs, + 0, 0, 0 + }, -{ SOCK_DGRAM, &inet6domain, IPPROTO_UDP, PR_ATOMIC|PR_ADDR, +{ SOCK_DGRAM, &inet6domain, IPPROTO_UDP, PR_ATOMIC|PR_ADDR|PR_PROTOLOCK|PR_PCBLOCK, udp6_input, 0, udp6_ctlinput, ip6_ctloutput, 0, 0, 0, 0, 0, - 0, &udp6_usrreqs + 0, + &udp6_usrreqs, + udp_lock, udp_unlock, udp_getlock + }, -{ SOCK_STREAM, &inet6domain, IPPROTO_TCP, PR_CONNREQUIRED|PR_WANTRCVD|PR_LISTEN, +{ SOCK_STREAM, &inet6domain, IPPROTO_TCP, PR_CONNREQUIRED|PR_WANTRCVD|PR_LISTEN|PR_PROTOLOCK|PR_PCBLOCK|PR_DISPOSE, tcp6_input, 0, tcp6_ctlinput, tcp_ctloutput, 0, #if INET /* don't call initialization and timeout routines twice */ @@ -164,60 +173,79 @@ struct ip6protosw inet6sw[] = { #else tcp_init, tcp_fasttimo, tcp_slowtimo, tcp_drain, #endif - 0, &tcp6_usrreqs, + 0, + &tcp6_usrreqs, + tcp_lock, tcp_unlock, tcp_getlock + }, { SOCK_RAW, &inet6domain, IPPROTO_RAW, PR_ATOMIC|PR_ADDR, rip6_input, rip6_pr_output, rip6_ctlinput, rip6_ctloutput, 0, 0, 0, 0, 0, - 0, &rip6_usrreqs + 0, + &rip6_usrreqs, + 0, rip_unlock, 0 }, { SOCK_RAW, &inet6domain, IPPROTO_ICMPV6, PR_ATOMIC|PR_ADDR|PR_LASTHDR, icmp6_input, rip6_pr_output, rip6_ctlinput, rip6_ctloutput, 0, icmp6_init, icmp6_fasttimo, 0, 0, - 0, &rip6_usrreqs + 0, + &rip6_usrreqs, + 0, rip_unlock, 0 }, { SOCK_RAW, &inet6domain, IPPROTO_DSTOPTS,PR_ATOMIC|PR_ADDR, dest6_input, 0, 0, 0, 0, 0, 0, 0, 0, - 0, &nousrreqs + 0, + &nousrreqs, + 0, 0, 0 }, { SOCK_RAW, &inet6domain, IPPROTO_ROUTING,PR_ATOMIC|PR_ADDR, route6_input, 0, 0, 0, 0, 0, 0, 0, 0, - 0, &nousrreqs + 0, + &nousrreqs, + 0, 0, 0 }, { SOCK_RAW, &inet6domain, IPPROTO_FRAGMENT,PR_ATOMIC|PR_ADDR, frag6_input, 0, 0, 0, 0, 0, 0, 0, 0, - 0, &nousrreqs + 0, + &nousrreqs, + 0, 0, 0 }, #if IPSEC -{ SOCK_RAW, &inet6domain, IPPROTO_AH, PR_ATOMIC|PR_ADDR, +{ SOCK_RAW, &inet6domain, IPPROTO_AH, PR_ATOMIC|PR_ADDR|PR_PROTOLOCK, ah6_input, 0, 0, 0, 0, 0, 0, 0, 0, - 0, &nousrreqs + 0, + &nousrreqs, + 0, 0, 0 }, #if IPSEC_ESP -{ SOCK_RAW, &inet6domain, IPPROTO_ESP, PR_ATOMIC|PR_ADDR, +{ SOCK_RAW, &inet6domain, IPPROTO_ESP, PR_ATOMIC|PR_ADDR|PR_PROTOLOCK, esp6_input, 0, esp6_ctlinput, 0, 0, 0, 0, 0, 0, - 0, &nousrreqs + 0, + &nousrreqs, + 0, 0, 0 }, #endif -{ SOCK_RAW, &inet6domain, IPPROTO_IPCOMP, PR_ATOMIC|PR_ADDR, +{ SOCK_RAW, &inet6domain, IPPROTO_IPCOMP, PR_ATOMIC|PR_ADDR|PR_PROTOLOCK, ipcomp6_input, 0, 0, 0, 0, 0, 0, 0, 0, - 0, &nousrreqs + 0, + &nousrreqs, + 0, 0, 0 }, #endif /* IPSEC */ #if INET @@ -225,27 +253,35 @@ struct ip6protosw inet6sw[] = { encap6_input, rip6_pr_output, 0, rip6_ctloutput, 0, encap_init, 0, 0, 0, - 0, &rip6_usrreqs + 0, + &rip6_usrreqs, + 0, rip_unlock, 0 }, #endif /*INET*/ { SOCK_RAW, &inet6domain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR|PR_LASTHDR, encap6_input, rip6_pr_output, 0, rip6_ctloutput, 0, encap_init, 0, 0, 0, - 0, &rip6_usrreqs + 0, + &rip6_usrreqs, + 0, rip_unlock, 0 }, { SOCK_RAW, &inet6domain, IPPROTO_PIM, PR_ATOMIC|PR_ADDR|PR_LASTHDR, pim6_input, rip6_pr_output, 0, rip6_ctloutput, 0, - 0, 0, 0, 0, - 0, &rip6_usrreqs + 0, 0, 0, 0, + 0, + &rip6_usrreqs, + 0, rip_unlock, 0 }, /* raw wildcard */ -{ SOCK_RAW, &inet6domain, 0, PR_ATOMIC|PR_ADDR, +{ SOCK_RAW, &inet6domain, 0, PR_ATOMIC|PR_ADDR|PR_LASTHDR, rip6_input, rip6_pr_output, 0, rip6_ctloutput, 0, 0, 0, 0, 0, - 0, &rip6_usrreqs + 0, + &rip6_usrreqs, + 0, rip_unlock, 0 }, }; @@ -276,6 +312,7 @@ in6_dinit() for (i=0, pr = &inet6sw[0]; i<in6_proto_count; i++, pr++) net_add_proto((struct protosw*)pr, dp); + inet6_domain_mutex = dp->dom_mtx; inet6domain_initted = 1; } } @@ -307,6 +344,7 @@ int 
ip6_defhlim = IPV6_DEFHLIM; int ip6_defmcasthlim = IPV6_DEFAULT_MULTICAST_HOPS; int ip6_accept_rtadv = 0; /* "IPV6FORWARDING ? 0 : 1" is dangerous */ int ip6_maxfragpackets; /* initialized in frag6.c:frag6_init() */ +int ip6_maxfrags; int ip6_log_interval = 5; int ip6_hdrnestlimit = 50; /* appropriate? */ int ip6_dad_count = 1; /* DupAddrDetectionTransmits */ @@ -415,6 +453,8 @@ SYSCTL_STRUCT(_net_inet6_ip6, IPV6CTL_STATS, stats, CTLFLAG_RD, &ip6stat, ip6stat, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets, CTLFLAG_RW, &ip6_maxfragpackets, 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, + maxfrags, CTLFLAG_RW, &ip6_maxfrags, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, accept_rtadv, CTLFLAG_RW, &ip6_accept_rtadv, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_KEEPFAITH, @@ -451,6 +491,9 @@ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL, auto_linklocal, CTLFLAG_RW, &ip6_auto_linklocal, 0, ""); SYSCTL_STRUCT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, CTLFLAG_RD, &rip6stat, rip6stat, ""); +SYSCTL_STRUCT(_net_inet6_ip6, OID_AUTO, mrt6stat, CTLFLAG_RD, + &mrt6stat, mrt6stat, ""); + /* net.inet6.icmp6 */ SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, diff --git a/bsd/netinet6/in6_rmx.c b/bsd/netinet6/in6_rmx.c index 805f2a9e0..1aa407220 100644 --- a/bsd/netinet6/in6_rmx.c +++ b/bsd/netinet6/in6_rmx.c @@ -82,6 +82,7 @@ #include <sys/socketvar.h> #include <sys/mbuf.h> #include <sys/syslog.h> +#include <kern/lock.h> #include <net/if.h> #include <net/route.h> @@ -99,9 +100,10 @@ #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> -extern int in6_inithead __P((void **head, int off)); -static void in6_rtqtimo __P((void *rock)); -static void in6_mtutimo __P((void *rock)); +extern int in6_inithead(void **head, int off); +static void in6_rtqtimo(void *rock); +static void in6_mtutimo(void *rock); +extern lck_mtx_t *rt_mtx; #define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ @@ -160,21 +162,21 @@ in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, * Find out if it is because of an * ARP entry and delete it if so. */ - rt2 = rtalloc1((struct sockaddr *)sin6, 0, + rt2 = rtalloc1_locked((struct sockaddr *)sin6, 0, RTF_CLONING | RTF_PRCLONING); if (rt2) { if (rt2->rt_flags & RTF_LLINFO && rt2->rt_flags & RTF_HOST && rt2->rt_gateway && rt2->rt_gateway->sa_family == AF_LINK) { - rtrequest(RTM_DELETE, + rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt2), rt2->rt_gateway, rt_mask(rt2), rt2->rt_flags, 0); ret = rn_addroute(v_arg, n_arg, head, treenodes); } - rtfree(rt2); + rtfree_locked(rt2); } } else if (ret == NULL && rt->rt_flags & RTF_CLONING) { struct rtentry *rt2; @@ -190,7 +192,7 @@ in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, * net route entry, 3ffe:0501:: -> if0. * This case should not raise an error. 
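in6_addroute() above switches from rtalloc1()/rtrequest()/rtfree() to their _locked variants, which expect the caller to already own rt_mtx rather than taking it internally. A hedged sketch of that convention, using only calls shown in these hunks (drop_clone is an illustrative name):

extern lck_mtx_t *rt_mtx;

static void
drop_clone(struct sockaddr *key)
{
        struct rtentry *rt;

        lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED);   /* caller owns the table */

        rt = rtalloc1_locked(key, 0, RTF_CLONING | RTF_PRCLONING);
        if (rt != NULL) {
                rtrequest_locked(RTM_DELETE, rt_key(rt), rt->rt_gateway,
                    rt_mask(rt), rt->rt_flags, 0);
                rtfree_locked(rt);      /* releases the rtalloc1_locked reference */
        }
}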
*/ - rt2 = rtalloc1((struct sockaddr *)sin6, 0, + rt2 = rtalloc1_locked((struct sockaddr *)sin6, 0, RTF_CLONING | RTF_PRCLONING); if (rt2) { if ((rt2->rt_flags & (RTF_CLONING|RTF_HOST|RTF_GATEWAY)) @@ -200,7 +202,7 @@ in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, && rt2->rt_ifp == rt->rt_ifp) { ret = rt2->rt_nodes; } - rtfree(rt2); + rtfree_locked(rt2); } } return ret; @@ -252,6 +254,8 @@ static void in6_clsroute(struct radix_node *rn, struct radix_node_head *head) { struct rtentry *rt = (struct rtentry *)rn; + struct timeval timenow; + if (!(rt->rt_flags & RTF_UP)) return; /* prophylactic measures */ @@ -268,11 +272,12 @@ in6_clsroute(struct radix_node *rn, struct radix_node_head *head) * If rtq_reallyold is 0, just delete the route without * waiting for a timeout cycle to kill it. */ + getmicrotime(&timenow); if (rtq_reallyold != 0) { rt->rt_flags |= RTPRF_OURS; - rt->rt_rmx.rmx_expire = time_second + rtq_reallyold; + rt->rt_rmx.rmx_expire = timenow.tv_sec + rtq_reallyold; } else { - rtrequest(RTM_DELETE, + rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0); @@ -300,15 +305,19 @@ in6_rtqkill(struct radix_node *rn, void *rock) struct rtqk_arg *ap = rock; struct rtentry *rt = (struct rtentry *)rn; int err; + struct timeval timenow; + + getmicrotime(&timenow); + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); if (rt->rt_flags & RTPRF_OURS) { ap->found++; - if (ap->draining || rt->rt_rmx.rmx_expire <= time_second) { + if (ap->draining || rt->rt_rmx.rmx_expire <= timenow.tv_sec) { if (rt->rt_refcnt > 0) panic("rtqkill route really not free"); - err = rtrequest(RTM_DELETE, + err = rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0); @@ -319,9 +328,9 @@ in6_rtqkill(struct radix_node *rn, void *rock) } } else { if (ap->updating - && (rt->rt_rmx.rmx_expire - time_second + && (rt->rt_rmx.rmx_expire - timenow.tv_sec > rtq_reallyold)) { - rt->rt_rmx.rmx_expire = time_second + rt->rt_rmx.rmx_expire = timenow.tv_sec + rtq_reallyold; } ap->nextstop = lmin(ap->nextstop, @@ -335,19 +344,6 @@ in6_rtqkill(struct radix_node *rn, void *rock) #define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ static int rtq_timeout = RTQ_TIMEOUT; -static void -in6_rtqtimo_funneled(void *rock) -{ -#ifdef __APPLE__ - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); - in6_rtqtimo(rock); -#endif -#ifdef __APPLE__ - (void) thread_funnel_set(network_flock, FALSE); -#endif -} - static void in6_rtqtimo(void *rock) { @@ -355,15 +351,16 @@ in6_rtqtimo(void *rock) struct rtqk_arg arg; struct timeval atv; static time_t last_adjusted_timeout = 0; - int s; + struct timeval timenow; + + getmicrotime(&timenow); arg.found = arg.killed = 0; arg.rnh = rnh; - arg.nextstop = time_second + rtq_timeout; + arg.nextstop = timenow.tv_sec + rtq_timeout; arg.draining = arg.updating = 0; - s = splnet(); + lck_mtx_lock(rt_mtx); rnh->rnh_walktree(rnh, in6_rtqkill, &arg); - splx(s); /* * Attempt to be somewhat dynamic about this: @@ -374,28 +371,27 @@ in6_rtqtimo(void *rock) * hard. 
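in6_rtqkill() above replaces the implicit splnet() guarantee with an explicit lck_mtx_assert(), so a callback invoked from rnh_walktree() documents the lock it relies on instead of assuming it. A minimal sketch of that contract (my_rtqkill is illustrative and simplified; the real function panics on a nonzero refcount rather than skipping):

extern lck_mtx_t *rt_mtx;

static int
my_rtqkill(struct radix_node *rn, void *rock)
{
        struct rtentry *rt = (struct rtentry *)rn;

        /* walktree callers must already hold rt_mtx; make that checkable */
        lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED);

        if ((rt->rt_flags & RTPRF_OURS) && rt->rt_refcnt == 0)
                return rtrequest_locked(RTM_DELETE, rt_key(rt),
                    rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0);
        return 0;
}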
*/ if ((arg.found - arg.killed > rtq_toomany) - && (time_second - last_adjusted_timeout >= rtq_timeout) + && (timenow.tv_sec - last_adjusted_timeout >= rtq_timeout) && rtq_reallyold > rtq_minreallyold) { rtq_reallyold = 2*rtq_reallyold / 3; if (rtq_reallyold < rtq_minreallyold) { rtq_reallyold = rtq_minreallyold; } - last_adjusted_timeout = time_second; + last_adjusted_timeout = timenow.tv_sec; #if DIAGNOSTIC log(LOG_DEBUG, "in6_rtqtimo: adjusted rtq_reallyold to %d", rtq_reallyold); #endif arg.found = arg.killed = 0; arg.updating = 1; - s = splnet(); rnh->rnh_walktree(rnh, in6_rtqkill, &arg); - splx(s); } atv.tv_usec = 0; - atv.tv_sec = arg.nextstop - time_second; - timeout(in6_rtqtimo_funneled, rock, tvtohz(&atv)); + atv.tv_sec = arg.nextstop - timenow.tv_sec; + lck_mtx_unlock(rt_mtx); + timeout(in6_rtqtimo, rock, tvtohz(&atv)); } /* @@ -411,13 +407,16 @@ in6_mtuexpire(struct radix_node *rn, void *rock) { struct rtentry *rt = (struct rtentry *)rn; struct mtuex_arg *ap = rock; + struct timeval timenow; + + getmicrotime(&timenow); /* sanity */ if (!rt) panic("rt == NULL in in6_mtuexpire"); if (rt->rt_rmx.rmx_expire && !(rt->rt_flags & RTF_PROBEMTU)) { - if (rt->rt_rmx.rmx_expire <= time_second) { + if (rt->rt_rmx.rmx_expire <= timenow.tv_sec) { rt->rt_flags |= RTF_PROBEMTU; } else { ap->nextstop = lmin(ap->nextstop, @@ -430,43 +429,32 @@ in6_mtuexpire(struct radix_node *rn, void *rock) #define MTUTIMO_DEFAULT (60*1) -static void -in6_mtutimo_funneled(void *rock) -{ -#ifdef __APPLE__ - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); - in6_mtutimo(rock); -#endif -#ifdef __APPLE__ - (void) thread_funnel_set(network_flock, FALSE); -#endif -} - static void in6_mtutimo(void *rock) { struct radix_node_head *rnh = rock; struct mtuex_arg arg; struct timeval atv; - int s; + struct timeval timenow; + + getmicrotime(&timenow); arg.rnh = rnh; - arg.nextstop = time_second + MTUTIMO_DEFAULT; - s = splnet(); + arg.nextstop = timenow.tv_sec + MTUTIMO_DEFAULT; + lck_mtx_lock(rt_mtx); rnh->rnh_walktree(rnh, in6_mtuexpire, &arg); - splx(s); atv.tv_usec = 0; atv.tv_sec = arg.nextstop; - if (atv.tv_sec < time_second) { + if (atv.tv_sec < timenow.tv_sec) { #if DIAGNOSTIC log(LOG_DEBUG, "IPv6: invalid mtu expiration time on routing table\n"); #endif - arg.nextstop = time_second + 30; /*last resort*/ + arg.nextstop = timenow.tv_sec + 30; /*last resort*/ } - atv.tv_sec -= time_second; - timeout(in6_mtutimo_funneled, rock, tvtohz(&atv)); + atv.tv_sec -= timenow.tv_sec; + lck_mtx_unlock(rt_mtx); + timeout(in6_mtutimo, rock, tvtohz(&atv)); } #if 0 diff --git a/bsd/netinet6/in6_src.c b/bsd/netinet6/in6_src.c index 3f9c8e4ff..dd0b5c0b2 100644 --- a/bsd/netinet6/in6_src.c +++ b/bsd/netinet6/in6_src.c @@ -76,6 +76,7 @@ #include <sys/errno.h> #include <sys/time.h> #include <sys/proc.h> +#include <kern/lock.h> #include <net/if.h> #include <net/route.h> @@ -97,6 +98,7 @@ #include <net/net_osdep.h> #include "loop.h" +extern lck_mtx_t *rt_mtx; /* * Return an IPv6 address, which is the most appropriate for a given @@ -105,13 +107,14 @@ * an entry to the caller for later use. 
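With the funnel gone, in6_rtqtimo() and in6_mtutimo() above share one shape: snapshot the time once, take rt_mtx around the tree walk, compute the next deadline, drop the lock, and re-arm directly, with no *_funneled trampoline. A condensed sketch under those assumptions (janitor, my_expire_cb, and MY_PERIOD are illustrative):

extern lck_mtx_t *rt_mtx;
static int my_expire_cb(struct radix_node *, void *);  /* walktree callback */
#define MY_PERIOD 600   /* seconds, cf. RTQ_TIMEOUT above */

static void
janitor(void *rock)
{
        struct radix_node_head *rnh = rock;
        struct timeval atv, timenow;
        time_t nextstop;

        getmicrotime(&timenow);         /* one snapshot, not repeated time_second reads */
        nextstop = timenow.tv_sec + MY_PERIOD;

        lck_mtx_lock(rt_mtx);           /* replaces splnet() around the walk */
        rnh->rnh_walktree(rnh, my_expire_cb, &nextstop);
        lck_mtx_unlock(rt_mtx);         /* drop before re-arming */

        atv.tv_sec = nextstop - timenow.tv_sec;
        atv.tv_usec = 0;
        timeout(janitor, rock, tvtohz(&atv));   /* no funneled wrapper needed */
}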
*/ struct in6_addr * -in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) - struct sockaddr_in6 *dstsock; - struct ip6_pktopts *opts; - struct ip6_moptions *mopts; - struct route_in6 *ro; - struct in6_addr *laddr; - int *errorp; +in6_selectsrc( + struct sockaddr_in6 *dstsock, + struct ip6_pktopts *opts, + struct ip6_moptions *mopts, + struct route_in6 *ro, + struct in6_addr *laddr, + struct in6_addr *src_storage, + int *errorp) { struct in6_addr *dst; struct in6_ifaddr *ia6 = 0; @@ -148,7 +151,8 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) *errorp = EADDRNOTAVAIL; return(0); } - return(&satosin6(&ia6->ia_addr)->sin6_addr); + *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; + return src_storage; } /* @@ -177,7 +181,8 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) *errorp = EADDRNOTAVAIL; return(0); } - return(&satosin6(&ia6->ia_addr)->sin6_addr); + *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; + return src_storage; } /* @@ -202,7 +207,8 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) *errorp = EADDRNOTAVAIL; return(0); } - return(&satosin6(&ia6->ia_addr)->sin6_addr); + *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; + return src_storage; } } @@ -217,7 +223,7 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) if (opts && opts->ip6po_nexthop) { sin6_next = satosin6(opts->ip6po_nexthop); - rt = nd6_lookup(&sin6_next->sin6_addr, 1, NULL); + rt = nd6_lookup(&sin6_next->sin6_addr, 1, NULL, 0); if (rt) { ia6 = in6_ifawithscope(rt->rt_ifp, dst); if (ia6 == 0) @@ -227,7 +233,8 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) *errorp = EADDRNOTAVAIL; return(0); } - return(&satosin6(&ia6->ia_addr)->sin6_addr); + *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; + return src_storage; } } @@ -236,16 +243,17 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) * our src addr is taken from the i/f, else punt. 
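The new in6_selectsrc() signature above returns a copy of the chosen source address through a caller-supplied src_storage buffer instead of a pointer into the in6_ifaddr itself, so the result cannot dangle once the address entry is released. A sketch of the calling convention (pick_source is illustrative; opts and mopts may be NULL, as in other callers):

static int
pick_source(struct sockaddr_in6 *dstsock, struct route_in6 *ro,
    struct in6_addr *laddr, struct in6_addr *result)
{
        struct in6_addr src_storage, *src;
        int error = 0;

        src = in6_selectsrc(dstsock, NULL, NULL, ro, laddr,
            &src_storage, &error);
        if (src == NULL)
                return (error ? error : EADDRNOTAVAIL);
        *result = *src;         /* src aliases our own buffer, not an ifaddr */
        return (0);
}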
*/ if (ro) { + lck_mtx_lock(rt_mtx); if (ro->ro_rt && (!(ro->ro_rt->rt_flags & RTF_UP) || satosin6(&ro->ro_dst)->sin6_family != AF_INET6 || !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr, dst))) { - rtfree(ro->ro_rt); + rtfree_locked(ro->ro_rt); ro->ro_rt = (struct rtentry *)0; } if (ro->ro_rt == (struct rtentry *)0 || - ro->ro_rt->rt_ifp == (struct ifnet *)0) { + ro->ro_rt->rt_ifp == 0) { struct sockaddr_in6 *sa6; /* No route yet, so try to acquire one */ @@ -256,12 +264,13 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) sa6->sin6_addr = *dst; sa6->sin6_scope_id = dstsock->sin6_scope_id; if (IN6_IS_ADDR_MULTICAST(dst)) { - ro->ro_rt = rtalloc1(&((struct route *)ro) + ro->ro_rt = rtalloc1_locked(&((struct route *)ro) ->ro_dst, 0, 0UL); } else { - rtalloc((struct route *)ro); + rtalloc_ign_locked((struct route *)ro, 0UL); } } + lck_mtx_unlock(rt_mtx); /* * in_pcbconnect() checks out IFF_LOOPBACK to skip using @@ -272,8 +281,14 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) if (ro->ro_rt) { ia6 = in6_ifawithscope(ro->ro_rt->rt_ifa->ifa_ifp, dst); - if (ia6 == 0) /* xxx scope error ?*/ + if (ia6 == 0) { ia6 = ifatoia6(ro->ro_rt->rt_ifa); + if (ia6) + ifaref(&ia6->ia_ifa); + } + else { + ifaref(&ia6->ia_ifa); + } } #if 0 /* @@ -291,14 +306,17 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) ia6 = ifatoia6(ifa_ifwithnet(sin6tosa(&sin6))); if (ia6 == 0) return(0); - return(&satosin6(&ia6->ia_addr)->sin6_addr); + *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; + return src_storage; } #endif /* 0 */ if (ia6 == 0) { *errorp = EHOSTUNREACH; /* no route */ return(0); } - return(&satosin6(&ia6->ia_addr)->sin6_addr); + *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; + ifafree(&ia6->ia_ifa); + return src_storage; } *errorp = EADDRNOTAVAIL; @@ -313,9 +331,9 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) * 3. The system default hoplimit. */ int -in6_selecthlim(in6p, ifp) - struct in6pcb *in6p; - struct ifnet *ifp; +in6_selecthlim( + struct in6pcb *in6p, + struct ifnet *ifp) { if (in6p && in6p->in6p_hops >= 0) return(in6p->in6p_hops); @@ -330,15 +348,23 @@ in6_selecthlim(in6p, ifp) * share this function by all *bsd*... */ int -in6_pcbsetport(laddr, inp, p) +in6_pcbsetport(laddr, inp, p, locked) struct in6_addr *laddr; struct inpcb *inp; struct proc *p; + int locked; { struct socket *so = inp->inp_socket; u_int16_t lport = 0, first, last, *lastport; int count, error = 0, wild = 0; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + if (!locked) { /* Make sure we don't run into a deadlock: 4052373 */ + if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) { + socket_unlock(inp->inp_socket, 0); + lck_rw_lock_exclusive(pcbinfo->mtx); + socket_lock(inp->inp_socket, 0); + } + } /* XXX: this is redundant when called from in6_pcbbind */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) @@ -351,13 +377,11 @@ in6_pcbsetport(laddr, inp, p) last = ipport_hilastauto; lastport = &pcbinfo->lasthi; } else if (inp->inp_flags & INP_LOWPORT) { -#ifdef __APPLE__ - if (p && (error = suser(p->p_ucred, &p->p_acflag))) -#else - if (p && (error = suser(p))) -#endif - + if (p && (error = proc_suser(p))) { + if (!locked) + lck_rw_done(pcbinfo->mtx); return error; + } first = ipport_lowfirstauto; /* 1023 */ last = ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->lastlow; @@ -386,6 +410,8 @@ in6_pcbsetport(laddr, inp, p) * occurred above. 
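in6_pcbsetport() above gains a `locked` flag and, per the comment citing 4052373, sidesteps a lock-order deadlock: it tries the pcbinfo lock opportunistically and only blocks on it after releasing the per-socket lock. The back-off in isolation, assuming the socket_lock/lck_rw calls shown above (take_pcbinfo_lock is an illustrative name):

static void
take_pcbinfo_lock(struct inpcb *inp, struct inpcbinfo *pcbinfo)
{
        if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) {
                /*
                 * Contended: drop the socket lock before blocking, so we
                 * never hold socket while waiting for pcbinfo when another
                 * thread holds pcbinfo while waiting for the socket.
                 */
                socket_unlock(inp->inp_socket, 0);
                lck_rw_lock_exclusive(pcbinfo->mtx);
                socket_lock(inp->inp_socket, 0);
        }
}

Note that every error return in the function must then pair with lck_rw_done() when it took the lock itself, which is exactly what the added `if (!locked)` branches above do.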
*/ inp->in6p_laddr = in6addr_any; + if (!locked) + lck_rw_done(pcbinfo->mtx); return (EAGAIN); } --*lastport; @@ -407,6 +433,8 @@ in6_pcbsetport(laddr, inp, p) * occurred above. */ inp->in6p_laddr = in6addr_any; + if (!locked) + lck_rw_done(pcbinfo->mtx); return (EAGAIN); } ++*lastport; @@ -418,12 +446,16 @@ in6_pcbsetport(laddr, inp, p) } inp->inp_lport = lport; - if (in_pcbinshash(inp) != 0) { + if (in_pcbinshash(inp, 1) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; + if (!locked) + lck_rw_done(pcbinfo->mtx); return (EAGAIN); } + if (!locked) + lck_rw_done(pcbinfo->mtx); return(0); } @@ -443,17 +475,17 @@ in6_pcbsetport(laddr, inp, p) * we may want to change the function to return something other than ifp. */ int -in6_embedscope(in6, sin6, in6p, ifpp) - struct in6_addr *in6; - const struct sockaddr_in6 *sin6; +in6_embedscope( + struct in6_addr *in6, + const struct sockaddr_in6 *sin6, #ifdef HAVE_NRL_INPCB - struct inpcb *in6p; + struct inpcb *in6p, #define in6p_outputopts inp_outputopts6 #define in6p_moptions inp_moptions6 #else - struct in6pcb *in6p; + struct in6pcb *in6p, #endif - struct ifnet **ifpp; + struct ifnet **ifpp) { struct ifnet *ifp = NULL; u_int32_t scopeid; @@ -518,10 +550,10 @@ in6_embedscope(in6, sin6, in6p, ifpp) * embedded scopeid thing. */ int -in6_recoverscope(sin6, in6, ifp) - struct sockaddr_in6 *sin6; - const struct in6_addr *in6; - struct ifnet *ifp; +in6_recoverscope( + struct sockaddr_in6 *sin6, + const struct in6_addr *in6, + struct ifnet *ifp) { u_int32_t scopeid; diff --git a/bsd/netinet6/in6_var.h b/bsd/netinet6/in6_var.h index 071ad9226..787393d33 100644 --- a/bsd/netinet6/in6_var.h +++ b/bsd/netinet6/in6_var.h @@ -95,7 +95,7 @@ struct in6_addrlifetime { u_int32_t ia6t_pltime; /* prefix lifetime */ }; -#ifdef __APPLE_API_PRIVATE +#ifdef PRIVATE struct in6_ifaddr { struct ifaddr ia_ifa; /* protocol-independent info */ #define ia_ifp ia_ifa.ifa_ifp @@ -115,12 +115,11 @@ struct in6_ifaddr { * (for autoconfigured addresses only) */ }; -#endif /* __APPLE_API_PRIVATE */ +#endif /* PRIVATE */ /* * IPv6 interface statistics, as defined in RFC2465 Ipv6IfStatsEntry (p12). */ -#ifdef __APPLE_API_UNSTABLE struct in6_ifstat { u_quad_t ifs6_in_receive; /* # of total input datagram */ u_quad_t ifs6_in_hdrerr; /* # of datagrams with invalid hdr */ @@ -236,7 +235,6 @@ struct icmp6_ifstat { /* ipv6IfIcmpOutGroupMembReductions, # of output MLD done */ u_quad_t ifs6_out_mlddone; }; -#endif /* __APPLE_API_UNSTABLE */ struct in6_ifreq { char ifr_name[IFNAMSIZ]; @@ -345,7 +343,7 @@ struct in6_rrenumreq { #define irr_rrf_decrvalid irr_flags.prf_rr.decrvalid #define irr_rrf_decrprefd irr_flags.prf_rr.decrprefd -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE /* * Given a pointer to an in6_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in6 @@ -359,9 +357,8 @@ struct in6_rrenumreq { #define IFA_DSTIN6(x) (&((struct sockaddr_in6 *)((x)->ifa_dstaddr))->sin6_addr) #define IFPR_IN6(x) (&((struct sockaddr_in6 *)((x)->ifpr_prefix))->sin6_addr) +#endif KERNEL_PRIVATE - -#ifdef __APPLE__ /* * Event data, internet6 style. 
*/ @@ -391,34 +388,26 @@ struct kev_in6_data { #define KEV_INET6_NEW_RTADV_ADDR 5 /* Autoconf router advertised address has appeared */ #define KEV_INET6_DEFROUTER 6 /* Default router dectected by kernel */ -#ifdef KERNEL +#ifdef KERNEL_PRIVATE /* Utility function used inside netinet6 kernel code for generating events */ void in6_post_msg(struct ifnet *, u_long, struct in6_ifaddr *); -#endif -#endif /* __APPLE__ */ -#endif /* __APPLE_API_PRIVATE */ +#endif KERNEL_PRIVATE -#ifdef KERNEL #define IN6_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ (((d)->s6_addr32[0] ^ (a)->s6_addr32[0]) & (m)->s6_addr32[0]) == 0 && \ (((d)->s6_addr32[1] ^ (a)->s6_addr32[1]) & (m)->s6_addr32[1]) == 0 && \ (((d)->s6_addr32[2] ^ (a)->s6_addr32[2]) & (m)->s6_addr32[2]) == 0 && \ (((d)->s6_addr32[3] ^ (a)->s6_addr32[3]) & (m)->s6_addr32[3]) == 0 ) -#endif #define SIOCSIFADDR_IN6 _IOW('i', 12, struct in6_ifreq) #define SIOCGIFADDR_IN6 _IOWR('i', 33, struct in6_ifreq) -#ifdef KERNEL -#ifdef __APPLE_API_OBSOLETE /* * SIOCSxxx ioctls should be unused (see comments in in6.c), but * we do not shift numbers for binary compatibility. */ #define SIOCSIFDSTADDR_IN6 _IOW('i', 14, struct in6_ifreq) #define SIOCSIFNETMASK_IN6 _IOW('i', 22, struct in6_ifreq) -#endif /* __APPLE_API_OBSOLETE */ -#endif #define SIOCGIFDSTADDR_IN6 _IOWR('i', 34, struct in6_ifreq) #define SIOCGIFNETMASK_IN6 _IOWR('i', 37, struct in6_ifreq) @@ -467,7 +456,6 @@ void in6_post_msg(struct ifnet *, u_long, struct in6_ifaddr *); #define SIOCGETMIFCNT_IN6 _IOWR('u', 107, \ struct sioc_mif_req6) /* get pkt cnt per if */ -#ifdef KERNEL_PRIVATE /* * temporary control calls to attach/detach IP to/from an ethernet interface */ @@ -478,8 +466,6 @@ void in6_post_msg(struct ifnet *, u_long, struct in6_ifaddr *); #define SIOCLL_STOP _IOWR('i', 131, struct in6_ifreq) /* deconfigure linklocal from interface */ #define SIOCAUTOCONF_START _IOWR('i', 132, struct in6_ifreq) /* accept rtadvd on this interface */ #define SIOCAUTOCONF_STOP _IOWR('i', 133, struct in6_ifreq) /* stop accepting rtadv for this interface */ -#endif KERNEL_PRIVATE - #define IN6_IFF_ANYCAST 0x01 /* anycast address */ #define IN6_IFF_TENTATIVE 0x02 /* tentative address */ @@ -503,9 +489,8 @@ void in6_post_msg(struct ifnet *, u_long, struct in6_ifaddr *); #define IN6_ARE_SCOPE_EQUAL(a,b) ((a)==(b)) #endif -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -extern struct in6_ifaddr *in6_ifaddr; +#ifdef KERNEL_PRIVATE +extern struct in6_ifaddr *in6_ifaddrs; extern struct in6_ifstat **in6_ifstat; extern size_t in6_ifstatmax; @@ -514,10 +499,11 @@ extern struct icmp6_ifstat **icmp6_ifstat; extern size_t icmp6_ifstatmax; #define in6_ifstat_inc(ifp, tag) \ do { \ - if ((ifp) && (ifp)->if_index <= if_index \ - && (ifp)->if_index < in6_ifstatmax \ - && in6_ifstat && in6_ifstat[(ifp)->if_index]) { \ - in6_ifstat[(ifp)->if_index]->tag++; \ + int _z_index = ifp ? ifp->if_index : 0; \ + if ((_z_index) && _z_index <= if_index \ + && _z_index < in6_ifstatmax \ + && in6_ifstat && in6_ifstat[_z_index]) { \ + in6_ifstat[_z_index]->tag++; \ } \ } while (0) @@ -527,7 +513,7 @@ extern u_char inet6ctlerrmap[]; extern unsigned long in6_maxmtu; #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_IPMADDR); -#endif +#endif MALLOC_DECLARE /* * Macro for finding the internet address structure (in6_ifaddr) corresponding @@ -548,10 +534,6 @@ do { \ (ia) = (struct in6_ifaddr *)ifa; \ } while (0) -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ - -#ifdef __APPLE_API_PRIVATE /* * Multi-cast membership entry. One for each group/ifp that a PCB * belongs to. 
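The reworked in6_ifstat_inc() above evaluates ifp exactly once into a local index, so a NULL interface pointer (or one changing under the caller) can no longer be dereferenced repeatedly with different results. The same single-evaluation guard reduced to its essentials (MY_STAT_INC, my_stat, and my_statmax are illustrative stand-ins for the in6_ifstat names):

extern struct in6_ifstat **my_stat;
extern size_t my_statmax;

#define MY_STAT_INC(ifp, tag) \
do { \
        int _idx = (ifp) ? (ifp)->if_index : 0; /* evaluate ifp once */ \
        if (_idx != 0 && _idx < (int)my_statmax && \
            my_stat != NULL && my_stat[_idx] != NULL) \
                my_stat[_idx]->tag++; \
} while (0)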
@@ -570,10 +552,7 @@ struct in6_multi { u_int in6m_state; /* state of the membership */ u_int in6m_timer; /* MLD6 listener report timer */ }; -#endif /* __APPLE_API_PRIVATE */ -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE extern LIST_HEAD(in6_multihead, in6_multi) in6_multihead; /* @@ -631,8 +610,8 @@ do { \ } while(0) struct in6_multi *in6_addmulti __P((struct in6_addr *, struct ifnet *, - int *)); -void in6_delmulti __P((struct in6_multi *)); + int *, int)); +void in6_delmulti __P((struct in6_multi *, int)); extern int in6_ifindex2scopeid __P((int)); extern int in6_mask2len __P((struct in6_addr *, u_char *)); extern void in6_len2mask __P((struct in6_addr *, int)); @@ -640,7 +619,7 @@ int in6_control __P((struct socket *, u_long, caddr_t, struct ifnet *, struct proc *)); int in6_update_ifa __P((struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *)); -void in6_purgeaddr __P((struct ifaddr *)); +void in6_purgeaddr __P((struct ifaddr *, int)); int in6if_do_dad __P((struct ifnet *)); void in6_purgeif __P((struct ifnet *)); void in6_savemkludge __P((struct in6_ifaddr *)); @@ -669,7 +648,6 @@ int in6_embedscope __P((struct in6_addr *, const struct sockaddr_in6 *, int in6_recoverscope __P((struct sockaddr_in6 *, const struct in6_addr *, struct ifnet *)); void in6_clearscope __P((struct in6_addr *)); -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#endif KERNEL_PRIVATE -#endif /* _NETINET6_IN6_VAR_H_ */ +#endif _NETINET6_IN6_VAR_H_ diff --git a/bsd/netinet6/ip6_ecn.h b/bsd/netinet6/ip6_ecn.h index 27104fcb8..6e926018a 100644 --- a/bsd/netinet6/ip6_ecn.h +++ b/bsd/netinet6/ip6_ecn.h @@ -36,9 +36,7 @@ */ #include <sys/appleapiopts.h> -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -extern void ip6_ecn_ingress __P((int, u_int32_t *, const u_int32_t *)); -extern void ip6_ecn_egress __P((int, const u_int32_t *, u_int32_t *)); -#endif /* __APPLE_API_PRIVATE */ -#endif +#ifdef KERNEL_PRIVATE +extern void ip6_ecn_ingress(int, u_int32_t *, const u_int32_t *); +extern void ip6_ecn_egress(int, const u_int32_t *, u_int32_t *); +#endif KERNEL_PRIVATE diff --git a/bsd/netinet6/ip6_forward.c b/bsd/netinet6/ip6_forward.c index d87abb122..d857cc307 100644 --- a/bsd/netinet6/ip6_forward.c +++ b/bsd/netinet6/ip6_forward.c @@ -66,6 +66,8 @@ #endif #include <netkey/key.h> extern int ipsec_bypass; +extern lck_mtx_t *sadb_mutex; +extern lck_mtx_t *ip6_mutex; #endif /* IPSEC */ #include <netinet6/ip6_fw.h> @@ -88,9 +90,10 @@ struct route_in6 ip6_forward_rt; */ void -ip6_forward(m, srcrt) +ip6_forward(m, srcrt, locked) struct mbuf *m; int srcrt; + int locked; { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct sockaddr_in6 *dst; @@ -101,6 +104,10 @@ ip6_forward(m, srcrt) #if IPSEC struct secpolicy *sp = NULL; #endif + struct timeval timenow; + + getmicrotime(&timenow); + #if IPSEC /* @@ -110,10 +117,15 @@ ip6_forward(m, srcrt) * Don't increment ip6s_cantforward because this is the check * before forwarding packet actually. 
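ip6_forward() above now takes a `locked` argument and declares externs for sadb_mutex and ip6_mutex; the body that follows brackets every SADB access with sadb_mutex and releases it on each early return. A reduced sketch of that bracketing for the inbound-policy check, using only calls from these hunks (check_in_reject is an illustrative name):

extern lck_mtx_t *sadb_mutex;

static int
check_in_reject(struct mbuf *m)
{
        int reject;

        lck_mtx_lock(sadb_mutex);
        reject = ipsec6_in_reject(m, NULL);
        if (reject)
                ipsec6stat.in_polvio++;
        lck_mtx_unlock(sadb_mutex);     /* unlock on every path */
        return (reject);
}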
*/ - if (ipsec_bypass == 0 && ipsec6_in_reject(m, NULL)) { - ipsec6stat.in_polvio++; - m_freem(m); - return; + if (ipsec_bypass == 0) { + lck_mtx_lock(sadb_mutex); + if (ipsec6_in_reject(m, NULL)) { + ipsec6stat.in_polvio++; + lck_mtx_unlock(sadb_mutex); + m_freem(m); + return; + } + lck_mtx_unlock(sadb_mutex); } #endif /*IPSEC*/ @@ -128,8 +140,8 @@ ip6_forward(m, srcrt) IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { ip6stat.ip6s_cantforward++; /* XXX in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard) */ - if (ip6_log_time + ip6_log_interval < time_second) { - ip6_log_time = time_second; + if (ip6_log_time + ip6_log_interval < timenow.tv_sec) { + ip6_log_time = timenow.tv_sec; log(LOG_DEBUG, "cannot forward " "from %s to %s nxt %d received on %s\n", @@ -144,8 +156,12 @@ ip6_forward(m, srcrt) if (ip6->ip6_hlim <= IPV6_HLIMDEC) { /* XXX in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard) */ + if (locked) + lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_TRANSIT, 0); + if (locked) + lck_mtx_lock(ip6_mutex); return; } ip6->ip6_hlim -= IPV6_HLIMDEC; @@ -164,7 +180,7 @@ ip6_forward(m, srcrt) #if IPSEC if (ipsec_bypass != 0) goto skip_ipsec; - + lck_mtx_lock(sadb_mutex); /* get a security policy for this packet */ sp = ipsec6_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, IP_FORWARDING, &error); @@ -178,6 +194,7 @@ ip6_forward(m, srcrt) m_freem(mcopy); #endif } + lck_mtx_unlock(sadb_mutex); m_freem(m); return; } @@ -200,6 +217,7 @@ ip6_forward(m, srcrt) m_freem(mcopy); #endif } + lck_mtx_unlock(sadb_mutex); m_freem(m); return; @@ -207,6 +225,7 @@ ip6_forward(m, srcrt) case IPSEC_POLICY_NONE: /* no need to do IPsec. */ key_freesp(sp); + lck_mtx_unlock(sadb_mutex); goto skip_ipsec; case IPSEC_POLICY_IPSEC: @@ -222,6 +241,7 @@ ip6_forward(m, srcrt) m_freem(mcopy); #endif } + lck_mtx_unlock(sadb_mutex); m_freem(m); return; } @@ -233,6 +253,7 @@ ip6_forward(m, srcrt) /* should be panic ?? */ printf("ip6_forward: Invalid policy found. 
%d\n", sp->policy); key_freesp(sp); + lck_mtx_unlock(sadb_mutex); goto skip_ipsec; } @@ -281,10 +302,12 @@ ip6_forward(m, srcrt) m_freem(mcopy); #endif } + lck_mtx_unlock(sadb_mutex); m_freem(m); return; } } + lck_mtx_unlock(sadb_mutex); skip_ipsec: #endif /* IPSEC */ @@ -308,8 +331,12 @@ ip6_forward(m, srcrt) ip6stat.ip6s_noroute++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute); if (mcopy) { + if (locked) + lck_mtx_unlock(ip6_mutex); icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE, 0); + if (locked) + lck_mtx_lock(ip6_mutex); } m_freem(m); return; @@ -330,8 +357,12 @@ ip6_forward(m, srcrt) ip6stat.ip6s_noroute++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute); if (mcopy) { + if (locked) + lck_mtx_unlock(ip6_mutex); icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE, 0); + if (locked) + lck_mtx_lock(ip6_mutex); } m_freem(m); return; @@ -352,8 +383,8 @@ ip6_forward(m, srcrt) ip6stat.ip6s_badscope++; in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard); - if (ip6_log_time + ip6_log_interval < time_second) { - ip6_log_time = time_second; + if (ip6_log_time + ip6_log_interval < timenow.tv_sec) { + ip6_log_time = timenow.tv_sec; log(LOG_DEBUG, "cannot forward " "src %s, dst %s, nxt %d, rcvif %s, outif %s\n", @@ -362,9 +393,14 @@ ip6_forward(m, srcrt) ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif), if_name(rt->rt_ifp)); } - if (mcopy) + if (mcopy) { + if (locked) + lck_mtx_unlock(ip6_mutex); icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_BEYONDSCOPE, 0); + if (locked) + lck_mtx_lock(ip6_mutex); + } m_freem(m); return; } @@ -388,6 +424,7 @@ ip6_forward(m, srcrt) * case, as we have the outgoing interface for * encapsulated packet as "rt->rt_ifp". */ + lck_mtx_lock(sadb_mutex); sp = ipsec6_getpolicybyaddr(mcopy, IPSEC_DIR_OUTBOUND, IP_FORWARDING, &ipsecerror); if (sp) { @@ -396,7 +433,7 @@ ip6_forward(m, srcrt) if (ipsechdrsiz < mtu) mtu -= ipsechdrsiz; } - + lck_mtx_unlock(sadb_mutex); /* * if mtu becomes less than minimum MTU, * tell minimum MTU (and I'll need to fragment it). @@ -404,7 +441,11 @@ ip6_forward(m, srcrt) if (mtu < IPV6_MMTU) mtu = IPV6_MMTU; #endif + if (locked) + lck_mtx_unlock(ip6_mutex); icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, mtu); + if (locked) + lck_mtx_lock(ip6_mutex); } m_freem(m); return; @@ -435,8 +476,12 @@ ip6_forward(m, srcrt) * type/code is based on suggestion by Rich Draves. * not sure if it is the best pick. 
*/ + if (locked) + lck_mtx_unlock(ip6_mutex); icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0); + if (locked) + lck_mtx_lock(ip6_mutex); m_freem(m); return; } @@ -449,7 +494,7 @@ ip6_forward(m, srcrt) if (ip6_fw_enable && ip6_fw_chk_ptr) { u_short port = 0; /* If ipfw says divert, we have to just drop packet */ - if ((*ip6_fw_chk_ptr)(&ip6, rt->rt_ifp, &port, &m)) { + if (ip6_fw_chk_ptr(&ip6, rt->rt_ifp, &port, &m)) { m_freem(m); goto freecopy; } @@ -503,7 +548,7 @@ ip6_forward(m, srcrt) in6_clearscope(&ip6->ip6_dst); #endif - error = nd6_output(rt->rt_ifp, origifp, m, dst, rt); + error = nd6_output(rt->rt_ifp, origifp, m, dst, rt, locked); if (error) { in6_ifstat_inc(rt->rt_ifp, ifs6_out_discard); ip6stat.ip6s_cantforward++; @@ -547,7 +592,11 @@ ip6_forward(m, srcrt) code = ICMP6_DST_UNREACH_ADDR; break; } + if (locked) + lck_mtx_unlock(ip6_mutex); icmp6_error(mcopy, type, code, 0); + if (locked) + lck_mtx_lock(ip6_mutex); return; freecopy: diff --git a/bsd/netinet6/ip6_fw.c b/bsd/netinet6/ip6_fw.c new file mode 100644 index 000000000..db7926f14 --- /dev/null +++ b/bsd/netinet6/ip6_fw.c @@ -0,0 +1,1369 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +/* $FreeBSD: src/sys/netinet6/ip6_fw.c,v 1.2.2.9 2002/04/28 05:40:27 suz Exp $ */ +/* $KAME: ip6_fw.c,v 1.21 2001/01/24 01:25:32 itojun Exp $ */ + +/* + * Copyright (C) 1998, 1999, 2000 and 2001 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright (c) 1993 Daniel Boulet + * Copyright (c) 1994 Ugen J.S.Antsilevich + * Copyright (c) 1996 Alex Nash + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + */ + +/* + * Implement IPv6 packet firewall + */ + + +#ifdef IP6DIVERT +#error "NOT SUPPORTED IPV6 DIVERT" +#endif +#ifdef IP6FW_DIVERT_RESTART +#error "NOT SUPPORTED IPV6 DIVERT" +#endif + +#include <string.h> +#include <machine/spl.h> + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/syslog.h> +#include <sys/lock.h> +#include <sys/time.h> +#include <net/if.h> +#include <net/route.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> + +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> +#include <netinet6/in6_var.h> +#include <netinet/icmp6.h> + +#include <netinet/in_pcb.h> + +#include <netinet6/ip6_fw.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/udp.h> + +#include <sys/sysctl.h> + +#include <net/net_osdep.h> + +MALLOC_DEFINE(M_IP6FW, "Ip6Fw/Ip6Acct", "Ip6Fw/Ip6Acct chain's"); + +static int fw6_debug = 1; +#ifdef IPV6FIREWALL_VERBOSE +static int fw6_verbose = 1; +#else +static int fw6_verbose = 0; +#endif +#ifdef IPV6FIREWALL_VERBOSE_LIMIT +static int fw6_verbose_limit = IPV6FIREWALL_VERBOSE_LIMIT; +#else +static int fw6_verbose_limit = 0; +#endif + +LIST_HEAD (ip6_fw_head, ip6_fw_chain) ip6_fw_chain; + +#ifdef SYSCTL_NODE +SYSCTL_DECL(_net_inet6_ip6); +SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); +SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, enable, CTLFLAG_RW, + &ip6_fw_enable, 0, "Enable ip6fw"); +SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, debug, CTLFLAG_RW, &fw6_debug, 0, ""); +SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, verbose, CTLFLAG_RW, &fw6_verbose, 0, ""); +SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, &fw6_verbose_limit, 0, ""); +#endif + +#define dprintf(a) do { \ + if (fw6_debug) \ + printf a; \ + } while (0) +#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? 
sizeof(buf) - len : 0 + +static int add_entry6 __P((struct ip6_fw_head *chainptr, struct ip6_fw *frwl)); +static int del_entry6 __P((struct ip6_fw_head *chainptr, u_short number)); +static int zero_entry6 __P((struct ip6_fw *frwl)); +static struct ip6_fw *check_ip6fw_struct __P((struct ip6_fw *m)); +static int ip6opts_match __P((struct ip6_hdr **ip6, struct ip6_fw *f, + struct mbuf **m, + int *off, int *nxt, u_short *offset)); +static int port_match6 __P((u_short *portptr, int nports, u_short port, + int range_flag)); +static int tcp6flg_match __P((struct tcphdr *tcp6, struct ip6_fw *f)); +static int icmp6type_match __P((struct icmp6_hdr * icmp, struct ip6_fw * f)); +static void ip6fw_report __P((struct ip6_fw *f, struct ip6_hdr *ip6, + struct ifnet *rif, struct ifnet *oif, int off, int nxt)); + +static int ip6_fw_chk __P((struct ip6_hdr **pip6, + struct ifnet *oif, u_int16_t *cookie, struct mbuf **m)); +static int ip6_fw_ctl __P((struct sockopt *)); + +static char err_prefix[] = "ip6_fw_ctl:"; +extern lck_mtx_t *ip6_mutex; + +/* + * Returns 1 if the port is matched by the vector, 0 otherwise + */ +static +__inline int +port_match6(u_short *portptr, int nports, u_short port, int range_flag) +{ + if (!nports) + return 1; + if (range_flag) { + if (portptr[0] <= port && port <= portptr[1]) { + return 1; + } + nports -= 2; + portptr += 2; + } + while (nports-- > 0) { + if (*portptr++ == port) { + return 1; + } + } + return 0; +} + +static int +tcp6flg_match(struct tcphdr *tcp6, struct ip6_fw *f) +{ + u_char flg_set, flg_clr; + + /* + * If an established connection is required, reject packets that + * have only SYN of RST|ACK|SYN set. Otherwise, fall through to + * other flag requirements. + */ + if ((f->fw_ipflg & IPV6_FW_IF_TCPEST) && + ((tcp6->th_flags & (IPV6_FW_TCPF_RST | IPV6_FW_TCPF_ACK | + IPV6_FW_TCPF_SYN)) == IPV6_FW_TCPF_SYN)) + return 0; + + flg_set = tcp6->th_flags & f->fw_tcpf; + flg_clr = tcp6->th_flags & f->fw_tcpnf; + + if (flg_set != f->fw_tcpf) + return 0; + if (flg_clr) + return 0; + + return 1; +} + +static int +icmp6type_match(struct icmp6_hdr *icmp6, struct ip6_fw *f) +{ + int type; + + if (!(f->fw_flg & IPV6_FW_F_ICMPBIT)) + return(1); + + type = icmp6->icmp6_type; + + /* check for matching type in the bitmap */ + if (type < IPV6_FW_ICMPTYPES_DIM * sizeof(unsigned) * 8 && + (f->fw_icmp6types[type / (sizeof(unsigned) * 8)] & + (1U << (type % (8 * sizeof(unsigned)))))) + return(1); + + return(0); /* no match */ +} + +static int +is_icmp6_query(struct ip6_hdr *ip6, int off) +{ + const struct icmp6_hdr *icmp6; + int icmp6_type; + + icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); + icmp6_type = icmp6->icmp6_type; + + if (icmp6_type == ICMP6_ECHO_REQUEST || + icmp6_type == ICMP6_MEMBERSHIP_QUERY || + icmp6_type == ICMP6_WRUREQUEST || + icmp6_type == ICMP6_FQDN_QUERY || + icmp6_type == ICMP6_NI_QUERY) + return(1); + + return(0); +} + +static int +ip6opts_match(struct ip6_hdr **pip6, struct ip6_fw *f, struct mbuf **m, + int *off, int *nxt, u_short *offset) +{ + int len; + struct ip6_hdr *ip6 = *pip6; + struct ip6_ext *ip6e; + u_char opts, nopts, nopts_sve; + + opts = f->fw_ip6opt; + nopts = nopts_sve = f->fw_ip6nopt; + + *nxt = ip6->ip6_nxt; + *off = sizeof(struct ip6_hdr); + len = ntohs(ip6->ip6_plen) + sizeof(struct ip6_hdr); + while (*off < len) { + ip6e = (struct ip6_ext *)((caddr_t) ip6 + *off); + if ((*m)->m_len < *off + sizeof(*ip6e)) + goto opts_check; /* XXX */ + + switch(*nxt) { + case IPPROTO_FRAGMENT: + if ((*m)->m_len >= *off + sizeof(struct ip6_frag)) { + struct 
ip6_frag *ip6f; + + ip6f = (struct ip6_frag *) ((caddr_t)ip6 + *off); + *offset = ip6f->ip6f_offlg & IP6F_OFF_MASK; + } + opts &= ~IPV6_FW_IP6OPT_FRAG; + nopts &= ~IPV6_FW_IP6OPT_FRAG; + *off += sizeof(struct ip6_frag); + break; + case IPPROTO_AH: + opts &= ~IPV6_FW_IP6OPT_AH; + nopts &= ~IPV6_FW_IP6OPT_AH; + *off += (ip6e->ip6e_len + 2) << 2; + break; + default: + switch (*nxt) { + case IPPROTO_HOPOPTS: + opts &= ~IPV6_FW_IP6OPT_HOPOPT; + nopts &= ~IPV6_FW_IP6OPT_HOPOPT; + break; + case IPPROTO_ROUTING: + opts &= ~IPV6_FW_IP6OPT_ROUTE; + nopts &= ~IPV6_FW_IP6OPT_ROUTE; + break; + case IPPROTO_ESP: + opts &= ~IPV6_FW_IP6OPT_ESP; + nopts &= ~IPV6_FW_IP6OPT_ESP; + break; + case IPPROTO_NONE: + opts &= ~IPV6_FW_IP6OPT_NONXT; + nopts &= ~IPV6_FW_IP6OPT_NONXT; + goto opts_check; + break; + case IPPROTO_DSTOPTS: + opts &= ~IPV6_FW_IP6OPT_OPTS; + nopts &= ~IPV6_FW_IP6OPT_OPTS; + break; + default: + goto opts_check; + break; + } + *off += (ip6e->ip6e_len + 1) << 3; + break; + } + *nxt = ip6e->ip6e_nxt; + + } + opts_check: + if (f->fw_ip6opt == f->fw_ip6nopt) /* XXX */ + return 1; + + if (opts == 0 && nopts == nopts_sve) + return 1; + else + return 0; +} + +static +__inline int +iface_match(struct ifnet *ifp, union ip6_fw_if *ifu, int byname) +{ + /* Check by name or by IP address */ + if (byname) { + /* Check unit number (-1 is wildcard) */ + if (ifu->fu_via_if.unit != -1 + && ifp->if_unit != ifu->fu_via_if.unit) + return(0); + /* Check name */ + if (strncmp(ifp->if_name, ifu->fu_via_if.name, IP6FW_IFNLEN)) + return(0); + return(1); + } else if (!IN6_IS_ADDR_UNSPECIFIED(&ifu->fu_via_ip6)) { /* Zero == wildcard */ + struct ifaddr *ia; + + ifnet_lock_shared(ifp); + for (ia = ifp->if_addrlist.tqh_first; ia; ia = ia->ifa_list.tqe_next) + { + + if (ia->ifa_addr == NULL) + continue; + if (ia->ifa_addr->sa_family != AF_INET6) + continue; + if (!IN6_ARE_ADDR_EQUAL(&ifu->fu_via_ip6, + &(((struct sockaddr_in6 *) + (ia->ifa_addr))->sin6_addr))) + continue; + ifnet_lock_done(ifp); + return(1); + } + ifnet_lock_done(ifp); + return(0); + } + return(1); +} + +static void +ip6fw_report(struct ip6_fw *f, struct ip6_hdr *ip6, + struct ifnet *rif, struct ifnet *oif, int off, int nxt) +{ + static int counter; + struct tcphdr *const tcp6 = (struct tcphdr *) ((caddr_t) ip6+ off); + struct udphdr *const udp = (struct udphdr *) ((caddr_t) ip6+ off); + struct icmp6_hdr *const icmp6 = (struct icmp6_hdr *) ((caddr_t) ip6+ off); + int count; + char *action; + char action2[32], proto[102], name[18]; + int len; + + count = f ? f->fw_pcnt : ++counter; + if (fw6_verbose_limit != 0 && count > fw6_verbose_limit) + return; + + /* Print command name */ + snprintf(SNPARGS(name, 0), "ip6fw: %d", f ? 
f->fw_number : -1); + + action = action2; + if (!f) + action = "Refuse"; + else { + switch (f->fw_flg & IPV6_FW_F_COMMAND) { + case IPV6_FW_F_DENY: + action = "Deny"; + break; + case IPV6_FW_F_REJECT: + if (f->fw_reject_code == IPV6_FW_REJECT_RST) + action = "Reset"; + else + action = "Unreach"; + break; + case IPV6_FW_F_ACCEPT: + action = "Accept"; + break; + case IPV6_FW_F_COUNT: + action = "Count"; + break; + case IPV6_FW_F_DIVERT: + snprintf(SNPARGS(action2, 0), "Divert %d", + f->fw_divert_port); + break; + case IPV6_FW_F_TEE: + snprintf(SNPARGS(action2, 0), "Tee %d", + f->fw_divert_port); + break; + case IPV6_FW_F_SKIPTO: + snprintf(SNPARGS(action2, 0), "SkipTo %d", + f->fw_skipto_rule); + break; + default: + action = "UNKNOWN"; + break; + } + } + + switch (nxt) { + case IPPROTO_TCP: + len = snprintf(SNPARGS(proto, 0), "TCP [%s]", + ip6_sprintf(&ip6->ip6_src)); + if (off > 0) + len += snprintf(SNPARGS(proto, len), ":%d ", + ntohs(tcp6->th_sport)); + else + len += snprintf(SNPARGS(proto, len), " "); + len += snprintf(SNPARGS(proto, len), "[%s]", + ip6_sprintf(&ip6->ip6_dst)); + if (off > 0) + snprintf(SNPARGS(proto, len), ":%d", + ntohs(tcp6->th_dport)); + break; + case IPPROTO_UDP: + len = snprintf(SNPARGS(proto, 0), "UDP [%s]", + ip6_sprintf(&ip6->ip6_src)); + if (off > 0) + len += snprintf(SNPARGS(proto, len), ":%d ", + ntohs(udp->uh_sport)); + else + len += snprintf(SNPARGS(proto, len), " "); + len += snprintf(SNPARGS(proto, len), "[%s]", + ip6_sprintf(&ip6->ip6_dst)); + if (off > 0) + snprintf(SNPARGS(proto, len), ":%d", + ntohs(udp->uh_dport)); + break; + case IPPROTO_ICMPV6: + if (off > 0) + len = snprintf(SNPARGS(proto, 0), "IPV6-ICMP:%u.%u ", + icmp6->icmp6_type, icmp6->icmp6_code); + else + len = snprintf(SNPARGS(proto, 0), "IPV6-ICMP "); + len += snprintf(SNPARGS(proto, len), "[%s]", + ip6_sprintf(&ip6->ip6_src)); + snprintf(SNPARGS(proto, len), " [%s]", + ip6_sprintf(&ip6->ip6_dst)); + break; + default: + len = snprintf(SNPARGS(proto, 0), "P:%d [%s]", nxt, + ip6_sprintf(&ip6->ip6_src)); + snprintf(SNPARGS(proto, len), " [%s]", + ip6_sprintf(&ip6->ip6_dst)); + break; + } + + if (oif) + log(LOG_AUTHPRIV | LOG_INFO, "%s %s %s out via %s\n", + name, action, proto, if_name(oif)); + else if (rif) + log(LOG_AUTHPRIV | LOG_INFO, "%s %s %s in via %s\n", + name, action, proto, if_name(rif)); + else + log(LOG_AUTHPRIV | LOG_INFO, "%s %s %s", + name, action, proto); + if (fw6_verbose_limit != 0 && count == fw6_verbose_limit) + log(LOG_AUTHPRIV | LOG_INFO, "ip6fw: limit reached on entry %d\n", + f ? f->fw_number : -1); +} + +/* + * Parameters: + * + * ip Pointer to packet header (struct ip6_hdr *) + * hlen Packet header length + * oif Outgoing interface, or NULL if packet is incoming + * #ifndef IP6FW_DIVERT_RESTART + * *cookie Ignore all divert/tee rules to this port (if non-zero) + * #else + * *cookie Skip up to the first rule past this rule number; + * #endif + * *m The packet; we set to NULL when/if we nuke it. + * + * Return value: + * + * 0 The packet is to be accepted and routed normally OR + * the packet was denied/rejected and has been dropped; + * in the latter case, *m is equal to NULL upon return. + * port Divert the packet to port. 
+ */ + +static int +ip6_fw_chk(struct ip6_hdr **pip6, + struct ifnet *oif, u_int16_t *cookie, struct mbuf **m) +{ + struct ip6_fw_chain *chain; + struct ip6_fw *rule = NULL; + struct ip6_hdr *ip6 = *pip6; + struct ifnet *const rif = (*m)->m_pkthdr.rcvif; + u_short offset = 0; + int off = sizeof(struct ip6_hdr), nxt = ip6->ip6_nxt; + u_short src_port, dst_port; +#ifdef IP6FW_DIVERT_RESTART + u_int16_t skipto = *cookie; +#else + u_int16_t ignport = ntohs(*cookie); +#endif + struct timeval timenow; + + getmicrotime(&timenow); + + *cookie = 0; + /* + * Go down the chain, looking for enlightment + * #ifdef IP6FW_DIVERT_RESTART + * If we've been asked to start at a given rule immediatly, do so. + * #endif + */ + chain = LIST_FIRST(&ip6_fw_chain); +#ifdef IP6FW_DIVERT_RESTART + if (skipto) { + if (skipto >= 65535) + goto dropit; + while (chain && (chain->rule->fw_number <= skipto)) { + chain = LIST_NEXT(chain, chain); + } + if (! chain) goto dropit; + } +#endif /* IP6FW_DIVERT_RESTART */ + for (; chain; chain = LIST_NEXT(chain, chain)) { + struct ip6_fw *const f = chain->rule; + + if (oif) { + /* Check direction outbound */ + if (!(f->fw_flg & IPV6_FW_F_OUT)) + continue; + } else { + /* Check direction inbound */ + if (!(f->fw_flg & IPV6_FW_F_IN)) + continue; + } + +#define IN6_ARE_ADDR_MASKEQUAL(x,y,z) (\ + (((x)->s6_addr32[0] & (y)->s6_addr32[0]) == (z)->s6_addr32[0]) && \ + (((x)->s6_addr32[1] & (y)->s6_addr32[1]) == (z)->s6_addr32[1]) && \ + (((x)->s6_addr32[2] & (y)->s6_addr32[2]) == (z)->s6_addr32[2]) && \ + (((x)->s6_addr32[3] & (y)->s6_addr32[3]) == (z)->s6_addr32[3])) + + /* If src-addr doesn't match, not this rule. */ + if (((f->fw_flg & IPV6_FW_F_INVSRC) != 0) ^ + (!IN6_ARE_ADDR_MASKEQUAL(&ip6->ip6_src,&f->fw_smsk,&f->fw_src))) + continue; + + /* If dest-addr doesn't match, not this rule. */ + if (((f->fw_flg & IPV6_FW_F_INVDST) != 0) ^ + (!IN6_ARE_ADDR_MASKEQUAL(&ip6->ip6_dst,&f->fw_dmsk,&f->fw_dst))) + continue; + +#undef IN6_ARE_ADDR_MASKEQUAL + /* Interface check */ + if ((f->fw_flg & IF6_FW_F_VIAHACK) == IF6_FW_F_VIAHACK) { + struct ifnet *const iface = oif ? oif : rif; + + /* Backwards compatibility hack for "via" */ + if (!iface || !iface_match(iface, + &f->fw_in_if, f->fw_flg & IPV6_FW_F_OIFNAME)) + continue; + } else { + /* Check receive interface */ + if ((f->fw_flg & IPV6_FW_F_IIFACE) + && (!rif || !iface_match(rif, + &f->fw_in_if, f->fw_flg & IPV6_FW_F_IIFNAME))) + continue; + /* Check outgoing interface */ + if ((f->fw_flg & IPV6_FW_F_OIFACE) + && (!oif || !iface_match(oif, + &f->fw_out_if, f->fw_flg & IPV6_FW_F_OIFNAME))) + continue; + } + + /* Check IP options */ + if (!ip6opts_match(&ip6, f, m, &off, &nxt, &offset)) + continue; + + /* Fragments */ + if ((f->fw_flg & IPV6_FW_F_FRAG) && !offset) + continue; + + /* Check protocol; if wildcard, match */ + if (f->fw_prot == IPPROTO_IPV6) + goto got_match; + + /* If different, don't match */ + if (nxt != f->fw_prot) + continue; + +#define PULLUP_TO(len) do { \ + if ((*m)->m_len < (len) \ + && (*m = m_pullup(*m, (len))) == 0) { \ + goto dropit; \ + } \ + *pip6 = ip6 = mtod(*m, struct ip6_hdr *); \ + } while (0) + + /* Protocol specific checks */ + switch (nxt) { + case IPPROTO_TCP: + { + struct tcphdr *tcp6; + + if (offset == 1) { /* cf. RFC 1858 */ + PULLUP_TO(off + 4); /* XXX ? */ + goto bogusfrag; + } + if (offset != 0) { + /* + * TCP flags and ports aren't available in this + * packet -- if this rule specified either one, + * we consider the rule a non-match. 
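PULLUP_TO() above guards every header cast: it calls m_pullup() to make the first `len` bytes of the chain contiguous and then refreshes the ip6 pointer, because m_pullup() may substitute a new mbuf (and frees the chain entirely on failure). The same idea as a standalone helper (tcp_header is an illustrative name):

static struct tcphdr *
tcp_header(struct mbuf **m, int off)
{
        struct ip6_hdr *ip6;

        if ((*m)->m_len < off + (int)sizeof(struct tcphdr) &&
            (*m = m_pullup(*m, off + sizeof(struct tcphdr))) == NULL)
                return (NULL);  /* chain was freed; caller must not touch it */
        ip6 = mtod(*m, struct ip6_hdr *);       /* data may have moved */
        return ((struct tcphdr *)((caddr_t)ip6 + off));
}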
+ */ + if (f->fw_nports != 0 || + f->fw_tcpf != f->fw_tcpnf) + continue; + + break; + } + PULLUP_TO(off + 14); + tcp6 = (struct tcphdr *) ((caddr_t)ip6 + off); + if (((f->fw_tcpf != f->fw_tcpnf) || + (f->fw_ipflg & IPV6_FW_IF_TCPEST)) && + !tcp6flg_match(tcp6, f)) + continue; + src_port = ntohs(tcp6->th_sport); + dst_port = ntohs(tcp6->th_dport); + goto check_ports; + } + + case IPPROTO_UDP: + { + struct udphdr *udp; + + if (offset != 0) { + /* + * Port specification is unavailable -- if this + * rule specifies a port, we consider the rule + * a non-match. + */ + if (f->fw_nports != 0) + continue; + + break; + } + PULLUP_TO(off + 4); + udp = (struct udphdr *) ((caddr_t)ip6 + off); + src_port = ntohs(udp->uh_sport); + dst_port = ntohs(udp->uh_dport); +check_ports: + if (!port_match6(&f->fw_pts[0], + IPV6_FW_GETNSRCP(f), src_port, + f->fw_flg & IPV6_FW_F_SRNG)) + continue; + if (!port_match6(&f->fw_pts[IPV6_FW_GETNSRCP(f)], + IPV6_FW_GETNDSTP(f), dst_port, + f->fw_flg & IPV6_FW_F_DRNG)) + continue; + break; + } + + case IPPROTO_ICMPV6: + { + struct icmp6_hdr *icmp; + + if (offset != 0) /* Type isn't valid */ + break; + PULLUP_TO(off + 2); + icmp = (struct icmp6_hdr *) ((caddr_t)ip6 + off); + if (!icmp6type_match(icmp, f)) + continue; + break; + } +#undef PULLUP_TO + +bogusfrag: + if (fw6_verbose) + ip6fw_report(NULL, ip6, rif, oif, off, nxt); + goto dropit; + } + +got_match: +#ifndef IP6FW_DIVERT_RESTART + /* Ignore divert/tee rule if socket port is "ignport" */ + switch (f->fw_flg & IPV6_FW_F_COMMAND) { + case IPV6_FW_F_DIVERT: + case IPV6_FW_F_TEE: + if (f->fw_divert_port == ignport) + continue; /* ignore this rule */ + break; + } + +#endif /* IP6FW_DIVERT_RESTART */ + /* Update statistics */ + f->fw_pcnt += 1; + f->fw_bcnt += ntohs(ip6->ip6_plen); + f->timestamp = timenow.tv_sec; + + /* Log to console if desired */ + if ((f->fw_flg & IPV6_FW_F_PRN) && fw6_verbose) + ip6fw_report(f, ip6, rif, oif, off, nxt); + + /* Take appropriate action */ + switch (f->fw_flg & IPV6_FW_F_COMMAND) { + case IPV6_FW_F_ACCEPT: + return(0); + case IPV6_FW_F_COUNT: + continue; + case IPV6_FW_F_DIVERT: +#ifdef IP6FW_DIVERT_RESTART + *cookie = f->fw_number; +#else + *cookie = htons(f->fw_divert_port); +#endif /* IP6FW_DIVERT_RESTART */ + return(f->fw_divert_port); + case IPV6_FW_F_TEE: + /* + * XXX someday tee packet here, but beware that you + * can't use m_copym() or m_copypacket() because + * the divert input routine modifies the mbuf + * (and these routines only increment reference + * counts in the case of mbuf clusters), so need + * to write custom routine. + */ + continue; + case IPV6_FW_F_SKIPTO: +#ifdef DIAGNOSTIC + while (chain->chain.le_next + && chain->chain.le_next->rule->fw_number + < f->fw_skipto_rule) +#else + while (chain->chain.le_next->rule->fw_number + < f->fw_skipto_rule) +#endif + chain = chain->chain.le_next; + continue; + } + + /* Deny/reject this packet using this rule */ + rule = f; + break; + } + +#ifdef DIAGNOSTIC + /* Rule 65535 should always be there and should always match */ + if (!chain) + panic("ip6_fw: chain"); +#endif + + /* + * At this point, we're going to drop the packet. 
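For reference, the fw_pts[] vector consulted by the check_ports code above packs source entries first and destination entries after them, and the SRNG/DRNG flags turn the first two slots of a group into an inclusive range. A small usage sketch of port_match6() under that layout (values are illustrative):

static int
ports_example(void)
{
        /* slots 0-1: source range 1000-2000; slots 2-3: dst list {80, 443} */
        u_short pts[4] = { 1000, 2000, 80, 443 };
        int src_ok, dst_ok;

        src_ok = port_match6(&pts[0], 2, 1500, 1);      /* range flag set: hit */
        dst_ok = port_match6(&pts[2], 2, 443, 0);       /* plain list: hit */
        return (src_ok && dst_ok);                      /* both match -> 1 */
}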
+ * Send a reject notice if all of the following are true: + * + * - The packet matched a reject rule + * - The packet is not an ICMP packet, or is an ICMP query packet + * - The packet is not a multicast or broadcast packet + */ + if ((rule->fw_flg & IPV6_FW_F_COMMAND) == IPV6_FW_F_REJECT + && (nxt != IPPROTO_ICMPV6 || is_icmp6_query(ip6, off)) + && !((*m)->m_flags & (M_BCAST|M_MCAST)) + && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { + switch (rule->fw_reject_code) { + case IPV6_FW_REJECT_RST: + { + struct tcphdr *const tcp = + (struct tcphdr *) ((caddr_t)ip6 + off); + struct { + struct ip6_hdr ip6; + struct tcphdr th; + } ti; + tcp_seq ack, seq; + int flags; + + if (offset != 0 || (tcp->th_flags & TH_RST)) + break; + + ti.ip6 = *ip6; + ti.th = *tcp; + ti.th.th_seq = ntohl(ti.th.th_seq); + ti.th.th_ack = ntohl(ti.th.th_ack); + ti.ip6.ip6_nxt = IPPROTO_TCP; + if (ti.th.th_flags & TH_ACK) { + ack = 0; + seq = ti.th.th_ack; + flags = TH_RST; + } else { + ack = ti.th.th_seq; + if (((*m)->m_flags & M_PKTHDR) != 0) { + ack += (*m)->m_pkthdr.len - off + - (ti.th.th_off << 2); + } else if (ip6->ip6_plen) { + ack += ntohs(ip6->ip6_plen) + sizeof(*ip6) + - off - (ti.th.th_off << 2); + } else { + m_freem(*m); + *m = 0; + break; + } + seq = 0; + flags = TH_RST|TH_ACK; + } + bcopy(&ti, ip6, sizeof(ti)); + tcp_respond(NULL, ip6, (struct tcphdr *)(ip6 + 1), + *m, ack, seq, flags); + *m = NULL; + break; + } + default: /* Send an ICMP unreachable using code */ + if (oif) + (*m)->m_pkthdr.rcvif = oif; + lck_mtx_assert(ip6_mutex, LCK_MTX_ASSERT_OWNED); + lck_mtx_unlock(ip6_mutex); + icmp6_error(*m, ICMP6_DST_UNREACH, + rule->fw_reject_code, 0); + lck_mtx_lock(ip6_mutex); + *m = NULL; + break; + } + } + +dropit: + /* + * Finally, drop the packet. + */ + if (*m) { + m_freem(*m); + *m = NULL; + } + return(0); +} + +static int +add_entry6(struct ip6_fw_head *chainptr, struct ip6_fw *frwl) +{ + struct ip6_fw *ftmp = 0; + struct ip6_fw_chain *fwc = 0, *fcp, *fcpl = 0; + u_short nbr = 0; + int s; + + fwc = _MALLOC(sizeof *fwc, M_IP6FW, M_WAITOK); + ftmp = _MALLOC(sizeof *ftmp, M_IP6FW, M_WAITOK); + if (!fwc || !ftmp) { + dprintf(("%s malloc said no\n", err_prefix)); + if (fwc) FREE(fwc, M_IP6FW); + if (ftmp) FREE(ftmp, M_IP6FW); + return (ENOSPC); + } + + bcopy(frwl, ftmp, sizeof(struct ip6_fw)); + ftmp->fw_in_if.fu_via_if.name[IP6FW_IFNLEN - 1] = '\0'; + ftmp->fw_pcnt = 0L; + ftmp->fw_bcnt = 0L; + fwc->rule = ftmp; + + s = splnet(); + + if (!chainptr->lh_first) { + LIST_INSERT_HEAD(chainptr, fwc, chain); + splx(s); + return(0); + } else if (ftmp->fw_number == (u_short)-1) { + if (fwc) FREE(fwc, M_IP6FW); + if (ftmp) FREE(ftmp, M_IP6FW); + splx(s); + dprintf(("%s bad rule number\n", err_prefix)); + return (EINVAL); + } + + /* If entry number is 0, find highest numbered rule and add 100 */ + if (ftmp->fw_number == 0) { + for (fcp = chainptr->lh_first; fcp; fcp = fcp->chain.le_next) { + if (fcp->rule->fw_number != (u_short)-1) + nbr = fcp->rule->fw_number; + else + break; + } + if (nbr < (u_short)-1 - 100) + nbr += 100; + ftmp->fw_number = nbr; + } + + /* Got a valid number; now insert it, keeping the list ordered */ + for (fcp = chainptr->lh_first; fcp; fcp = fcp->chain.le_next) { + if (fcp->rule->fw_number > ftmp->fw_number) { + if (fcpl) { + LIST_INSERT_AFTER(fcpl, fwc, chain); + } else { + LIST_INSERT_HEAD(chainptr, fwc, chain); + } + break; + } else { + fcpl = fcp; + } + } + + splx(s); + return (0); +} + +static int +del_entry6(struct ip6_fw_head *chainptr, u_short number) +{ + struct ip6_fw_chain *fcp; + int s; + 
+ s = splnet(); + + fcp = chainptr->lh_first; + if (number != (u_short)-1) { + for (; fcp; fcp = fcp->chain.le_next) { + if (fcp->rule->fw_number == number) { + LIST_REMOVE(fcp, chain); + splx(s); + FREE(fcp->rule, M_IP6FW); + FREE(fcp, M_IP6FW); + return 0; + } + } + } + + splx(s); + return (EINVAL); +} + +static int +zero_entry6(struct ip6_fw *frwl) +{ + struct ip6_fw_chain *fcp; + int s; + + /* + * It's possible to insert multiple chain entries with the + * same number, so we don't stop after finding the first + * match if zeroing a specific entry. + */ + s = splnet(); + for (fcp = ip6_fw_chain.lh_first; fcp; fcp = fcp->chain.le_next) + if (!frwl || frwl->fw_number == 0 || frwl->fw_number == fcp->rule->fw_number) { + fcp->rule->fw_bcnt = fcp->rule->fw_pcnt = 0; + fcp->rule->timestamp = 0; + } + splx(s); + + if (fw6_verbose) { + if (frwl) + log(LOG_AUTHPRIV | LOG_NOTICE, + "ip6fw: Entry %d cleared.\n", frwl->fw_number); + else + log(LOG_AUTHPRIV | LOG_NOTICE, + "ip6fw: Accounting cleared.\n"); + } + + return(0); +} + +static struct ip6_fw * +check_ip6fw_struct(struct ip6_fw *frwl) +{ + /* Check for invalid flag bits */ + if ((frwl->fw_flg & ~IPV6_FW_F_MASK) != 0) { + dprintf(("%s undefined flag bits set (flags=%x)\n", + err_prefix, frwl->fw_flg)); + return (NULL); + } + /* Must apply to incoming or outgoing (or both) */ + if (!(frwl->fw_flg & (IPV6_FW_F_IN | IPV6_FW_F_OUT))) { + dprintf(("%s neither in nor out\n", err_prefix)); + return (NULL); + } + /* Empty interface name is no good */ + if (((frwl->fw_flg & IPV6_FW_F_IIFNAME) + && !*frwl->fw_in_if.fu_via_if.name) + || ((frwl->fw_flg & IPV6_FW_F_OIFNAME) + && !*frwl->fw_out_if.fu_via_if.name)) { + dprintf(("%s empty interface name\n", err_prefix)); + return (NULL); + } + /* Sanity check interface matching */ + if ((frwl->fw_flg & IF6_FW_F_VIAHACK) == IF6_FW_F_VIAHACK) { + ; /* allow "via" backwards compatibility */ + } else if ((frwl->fw_flg & IPV6_FW_F_IN) + && (frwl->fw_flg & IPV6_FW_F_OIFACE)) { + dprintf(("%s outgoing interface check on incoming\n", + err_prefix)); + return (NULL); + } + /* Sanity check port ranges */ + if ((frwl->fw_flg & IPV6_FW_F_SRNG) && IPV6_FW_GETNSRCP(frwl) < 2) { + dprintf(("%s src range set but n_src_p=%d\n", + err_prefix, IPV6_FW_GETNSRCP(frwl))); + return (NULL); + } + if ((frwl->fw_flg & IPV6_FW_F_DRNG) && IPV6_FW_GETNDSTP(frwl) < 2) { + dprintf(("%s dst range set but n_dst_p=%d\n", + err_prefix, IPV6_FW_GETNDSTP(frwl))); + return (NULL); + } + if (IPV6_FW_GETNSRCP(frwl) + IPV6_FW_GETNDSTP(frwl) > IPV6_FW_MAX_PORTS) { + dprintf(("%s too many ports (%d+%d)\n", + err_prefix, IPV6_FW_GETNSRCP(frwl), IPV6_FW_GETNDSTP(frwl))); + return (NULL); + } + /* + * Protocols other than TCP/UDP don't use port range + */ + if ((frwl->fw_prot != IPPROTO_TCP) && + (frwl->fw_prot != IPPROTO_UDP) && + (IPV6_FW_GETNSRCP(frwl) || IPV6_FW_GETNDSTP(frwl))) { + dprintf(("%s port(s) specified for non TCP/UDP rule\n", + err_prefix)); + return(NULL); + } + + /* + * Rather than modify the entry to make such entries work, + * we reject this rule and require user level utilities + * to enforce whatever policy they deem appropriate. 
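 *
 * [Editor's note: worked example, not part of this patch. Matching
 *  compares (packet addr & fw_smsk) == fw_src, so a rule whose
 *  fw_src has bits set outside fw_smsk can never match: e.g.
 *  fw_src = fe80::1 with fw_smsk = ffff:: fails the check below
 *  because ::1 & ~(ffff::) != 0. The alternative would be to
 *  normalize with fw_src &= fw_smsk, which is exactly the silent
 *  modification this comment says we refuse to make.]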
+ */ + if ((frwl->fw_src.s6_addr32[0] & (~frwl->fw_smsk.s6_addr32[0])) || + (frwl->fw_src.s6_addr32[1] & (~frwl->fw_smsk.s6_addr32[1])) || + (frwl->fw_src.s6_addr32[2] & (~frwl->fw_smsk.s6_addr32[2])) || + (frwl->fw_src.s6_addr32[3] & (~frwl->fw_smsk.s6_addr32[3])) || + (frwl->fw_dst.s6_addr32[0] & (~frwl->fw_dmsk.s6_addr32[0])) || + (frwl->fw_dst.s6_addr32[1] & (~frwl->fw_dmsk.s6_addr32[1])) || + (frwl->fw_dst.s6_addr32[2] & (~frwl->fw_dmsk.s6_addr32[2])) || + (frwl->fw_dst.s6_addr32[3] & (~frwl->fw_dmsk.s6_addr32[3]))) { + dprintf(("%s rule never matches\n", err_prefix)); + return(NULL); + } + + if ((frwl->fw_flg & IPV6_FW_F_FRAG) && + (frwl->fw_prot == IPPROTO_UDP || frwl->fw_prot == IPPROTO_TCP)) { + if (frwl->fw_nports) { + dprintf(("%s cannot mix 'frag' and ports\n", err_prefix)); + return(NULL); + } + if (frwl->fw_prot == IPPROTO_TCP && + frwl->fw_tcpf != frwl->fw_tcpnf) { + dprintf(("%s cannot mix 'frag' with TCP flags\n", err_prefix)); + return(NULL); + } + } + + /* Check command specific stuff */ + switch (frwl->fw_flg & IPV6_FW_F_COMMAND) + { + case IPV6_FW_F_REJECT: + if (frwl->fw_reject_code >= 0x100 + && !(frwl->fw_prot == IPPROTO_TCP + && frwl->fw_reject_code == IPV6_FW_REJECT_RST)) { + dprintf(("%s unknown reject code\n", err_prefix)); + return(NULL); + } + break; + case IPV6_FW_F_DIVERT: /* Diverting to port zero is invalid */ + case IPV6_FW_F_TEE: + if (frwl->fw_divert_port == 0) { + dprintf(("%s can't divert to port 0\n", err_prefix)); + return (NULL); + } + break; + case IPV6_FW_F_DENY: + case IPV6_FW_F_ACCEPT: + case IPV6_FW_F_COUNT: + case IPV6_FW_F_SKIPTO: + break; + default: + dprintf(("%s invalid command\n", err_prefix)); + return(NULL); + } + + return frwl; +} + +/*#####*/ +#if 0 +static int +ip6_fw_ctl(int stage, struct mbuf **mm) +{ + int error; + struct mbuf *m; + + if (stage == IPV6_FW_GET) { + struct ip6_fw_chain *fcp = ip6_fw_chain.lh_first; + *mm = m = m_get(M_WAIT, MT_DATA); /* XXX */ + if (!m) + return(ENOBUFS); + if (sizeof *(fcp->rule) > MLEN) { + MCLGET(m, M_WAIT); + if ((m->m_flags & M_EXT) == 0) { + m_free(m); + return(ENOBUFS); + } + } + for (; fcp; fcp = fcp->chain.le_next) { + bcopy(fcp->rule, m->m_data, sizeof *(fcp->rule)); + m->m_len = sizeof *(fcp->rule); + m->m_next = m_get(M_WAIT, MT_DATA); /* XXX */ + if (!m->m_next) { + m_freem(*mm); + return(ENOBUFS); + } + m = m->m_next; + if (sizeof *(fcp->rule) > MLEN) { + MCLGET(m, M_WAIT); + if ((m->m_flags & M_EXT) == 0) { + m_freem(*mm); + return(ENOBUFS); + } + } + m->m_len = 0; + } + return (0); + } + m = *mm; + /* only allow get calls if secure mode > 2 */ + if (securelevel > 2) { + if (m) { + (void)m_freem(m); + *mm = 0; + } + return(EPERM); + } + if (stage == IPV6_FW_FLUSH) { + while (ip6_fw_chain.lh_first != NULL && + ip6_fw_chain.lh_first->rule->fw_number != (u_short)-1) { + struct ip6_fw_chain *fcp = ip6_fw_chain.lh_first; + int s = splnet(); + LIST_REMOVE(ip6_fw_chain.lh_first, chain); + splx(s); + FREE(fcp->rule, M_IP6FW); + FREE(fcp, M_IP6FW); + } + if (m) { + (void)m_freem(m); + *mm = 0; + } + return (0); + } + if (stage == IPV6_FW_ZERO) { + error = zero_entry6(m); + if (m) { + (void)m_freem(m); + *mm = 0; + } + return (error); + } + if (m == NULL) { + printf("%s NULL mbuf ptr\n", err_prefix); + return (EINVAL); + } + + if (stage == IPV6_FW_ADD) { + struct ip6_fw *frwl = check_ip6fw_mbuf(m); + + if (!frwl) + error = EINVAL; + else + error = add_entry6(&ip6_fw_chain, frwl); + if (m) { + (void)m_freem(m); + *mm = 0; + } + return error; + } + if (stage == IPV6_FW_DEL) { + if (m->m_len != 
sizeof(struct ip6_fw)) { + dprintf(("%s len=%ld, want %lu\n", err_prefix, m->m_len, + sizeof(struct ip6_fw))); + error = EINVAL; + } else if (mtod(m, struct ip6_fw *)->fw_number == (u_short)-1) { + dprintf(("%s can't delete rule 65535\n", err_prefix)); + error = EINVAL; + } else + error = del_entry6(&ip6_fw_chain, + mtod(m, struct ip6_fw *)->fw_number); + if (m) { + (void)m_freem(m); + *mm = 0; + } + return error; + } + + dprintf(("%s unknown request %d\n", err_prefix, stage)); + if (m) { + (void)m_freem(m); + *mm = 0; + } + return (EINVAL); +} +#endif + +static int +ip6_fw_ctl(struct sockopt *sopt) +{ + int error = 0; + int spl; + int valsize; + struct ip6_fw rule; + + if (securelevel >= 3 && + (sopt->sopt_dir != SOPT_GET || sopt->sopt_name != IPV6_FW_GET)) + return (EPERM); + + /* We ALWAYS expect the client to pass in a rule structure so that we can + * check the version of the API that they are using. In the case of a + * IPV6_FW_GET operation, the first rule of the output buffer passed to us + * must have the version set. */ + if (!sopt->sopt_val || sopt->sopt_valsize < sizeof rule) return EINVAL; + + /* save sopt->sopt_valsize */ + valsize = sopt->sopt_valsize; + if (error = sooptcopyin(sopt, &rule, sizeof(rule), sizeof(rule))) + return error; + + if (rule.version != IPV6_FW_CURRENT_API_VERSION) return EINVAL; + rule.version = 0xFFFFFFFF; /* version is meaningless once rules "make it in the door". */ + + switch (sopt->sopt_name) + { + case IPV6_FW_GET: + { + struct ip6_fw_chain *fcp; + struct ip6_fw *buf; + size_t size = 0; + + spl = splnet(); + LIST_FOREACH(fcp, &ip6_fw_chain, chain) + size += sizeof *buf; + + buf = _MALLOC(size, M_TEMP, M_WAITOK); + if (!buf) error = ENOBUFS; + else + { + struct ip6_fw *bp = buf; + LIST_FOREACH(fcp, &ip6_fw_chain, chain) + { + bcopy(fcp->rule, bp, sizeof *bp); + bp->version = IPV6_FW_CURRENT_API_VERSION; + bp++; + } + } + + splx(spl); + if (buf) + { + sopt->sopt_valsize = valsize; + error = sooptcopyout(sopt, buf, size); + FREE(buf, M_TEMP); + } + + break; + } + + case IPV6_FW_FLUSH: + spl = splnet(); + while (ip6_fw_chain.lh_first && + ip6_fw_chain.lh_first->rule->fw_number != (u_short)-1) + { + struct ip6_fw_chain *fcp = ip6_fw_chain.lh_first; + LIST_REMOVE(ip6_fw_chain.lh_first, chain); + FREE(fcp->rule, M_IP6FW); + FREE(fcp, M_IP6FW); + } + splx(spl); + break; + + case IPV6_FW_ZERO: + error = zero_entry6(&rule); + break; + + case IPV6_FW_ADD: + if (check_ip6fw_struct(&rule)) + error = add_entry6(&ip6_fw_chain, &rule); + else + error = EINVAL; + break; + + case IPV6_FW_DEL: + if (rule.fw_number == (u_short)-1) + { + dprintf(("%s can't delete rule 65535\n", err_prefix)); + error = EINVAL; + } + else + error = del_entry6(&ip6_fw_chain, rule.fw_number); + break; + + default: + dprintf(("%s invalid option %d\n", err_prefix, sopt->sopt_name)); + error = EINVAL; + } + + return error; +} + +void +ip6_fw_init(void) +{ + struct ip6_fw default_rule; + + ip6_fw_chk_ptr = ip6_fw_chk; + ip6_fw_ctl_ptr = ip6_fw_ctl; + LIST_INIT(&ip6_fw_chain); + + bzero(&default_rule, sizeof default_rule); + default_rule.fw_prot = IPPROTO_IPV6; + default_rule.fw_number = (u_short)-1; +#ifdef IPV6FIREWALL_DEFAULT_TO_ACCEPT + default_rule.fw_flg |= IPV6_FW_F_ACCEPT; +#else + default_rule.fw_flg |= IPV6_FW_F_DENY; +#endif + default_rule.fw_flg |= IPV6_FW_F_IN | IPV6_FW_F_OUT; + if (check_ip6fw_struct(&default_rule) == NULL || + add_entry6(&ip6_fw_chain, &default_rule)) + panic(__FUNCTION__); + + printf("IPv6 packet filtering initialized, "); +#ifdef 
IPV6FIREWALL_DEFAULT_TO_ACCEPT + printf("default to accept, "); +#endif +#ifndef IPV6FIREWALL_VERBOSE + printf("logging disabled\n"); +#else + if (fw6_verbose_limit == 0) + printf("unlimited logging\n"); + else + printf("logging limited to %d packets/entry\n", + fw6_verbose_limit); +#endif +} + diff --git a/bsd/netinet6/ip6_fw.h b/bsd/netinet6/ip6_fw.h index ed1a10c89..cfb2c4ab3 100644 --- a/bsd/netinet6/ip6_fw.h +++ b/bsd/netinet6/ip6_fw.h @@ -207,8 +207,7 @@ struct ip6_fw_chain { /* * Main firewall chains definitions and global var's definitions. */ -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE #define M_IP6FW M_IPFW @@ -221,14 +220,13 @@ void ip6_fw_init(void); /* Firewall hooks */ struct ip6_hdr; struct sockopt; -typedef int ip6_fw_chk_t __P((struct ip6_hdr**, struct ifnet*, - u_short *, struct mbuf**)); -typedef int ip6_fw_ctl_t __P((struct sockopt *)); +typedef int ip6_fw_chk_t(struct ip6_hdr**, struct ifnet*, + u_short *, struct mbuf**); +typedef int ip6_fw_ctl_t(struct sockopt *); extern ip6_fw_chk_t *ip6_fw_chk_ptr; extern ip6_fw_ctl_t *ip6_fw_ctl_ptr; extern int ip6_fw_enable; -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ #endif /* _IP6_FW_H */ diff --git a/bsd/netinet6/ip6_input.c b/bsd/netinet6/ip6_input.c index 2dc986a66..ccfce46ed 100644 --- a/bsd/netinet6/ip6_input.c +++ b/bsd/netinet6/ip6_input.c @@ -79,13 +79,13 @@ #include <sys/kernel.h> #include <sys/syslog.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <net/if.h> #include <net/if_var.h> #include <net/if_types.h> #include <net/if_dl.h> #include <net/route.h> -#include <net/netisr.h> #include <netinet/in.h> #include <netinet/in_systm.h> @@ -108,10 +108,13 @@ #include <netinet6/ipsec6.h> #endif extern int ipsec_bypass; +extern lck_mtx_t *sadb_mutex; #endif #include <netinet6/ip6_fw.h> +#include <netinet/kpi_ipfilter_var.h> + #include <netinet6/ip6protosw.h> /* we need it for NLOOP. 
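 *
 * [Editor's note: context, not part of this patch. The prototype
 *  hunks in these headers drop the K&R compatibility macro: on an
 *  ANSI compiler __P((args)) expanded to (args), and to () on a
 *  pre-ANSI one, so
 *      typedef int ip6_fw_ctl_t __P((struct sockopt *));
 *  becomes the equivalent plain declaration
 *      typedef int ip6_fw_ctl_t(struct sockopt *);
 *  with no change in meaning for modern compilers.]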
*/ @@ -125,11 +128,7 @@ extern struct ip6protosw inet6sw[]; struct ip6protosw * ip6_protox[IPPROTO_MAX]; static int ip6qmaxlen = IFQ_MAXLEN; -struct in6_ifaddr *in6_ifaddr; - -extern void in6_tmpaddrtimer_funneled(void *); -extern void nd6_timer_funneled(void *); -extern void in6_rr_timer_funneled(void *); +struct in6_ifaddr *in6_ifaddrs; int ip6_forward_srcrt; /* XXX */ int ip6_sourcecheck; /* XXX */ @@ -149,22 +148,39 @@ struct ip6stat ip6stat; #ifdef __APPLE__ struct ifqueue ip6intrq; +lck_mtx_t *ip6_mutex; +lck_mtx_t *dad6_mutex; +lck_mtx_t *nd6_mutex; +lck_mtx_t *prefix6_mutex; +lck_attr_t *ip6_mutex_attr; +lck_grp_t *ip6_mutex_grp; +lck_grp_attr_t *ip6_mutex_grp_attr; +extern lck_mtx_t *inet6_domain_mutex; #endif +extern int loopattach_done; -static void ip6_init2 __P((void *)); -static struct mbuf *ip6_setdstifaddr __P((struct mbuf *, struct in6_ifaddr *)); +static void ip6_init2(void *); +static struct mbuf *ip6_setdstifaddr(struct mbuf *, struct in6_ifaddr *); -static int ip6_hopopts_input __P((u_int32_t *, u_int32_t *, struct mbuf **, int *)); +static int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *); #if PULLDOWN_TEST -static struct mbuf *ip6_pullexthdr __P((struct mbuf *, size_t, int)); +static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int); #endif #ifdef __APPLE__ -void gifattach __P((void)); -void faithattach __P((void)); -void stfattach __P((void)); +void gifattach(void); +void faithattach(void); +void stfattach(void); #endif +static void +ip6_proto_input( + protocol_family_t protocol, + mbuf_t packet) +{ + ip6_input(packet); +} + /* * IP6 initialization: fill in IP6 protocol switch table. * All protocols not implemented in kernel go to raw IP6 protocol handler. @@ -175,12 +191,13 @@ ip6_init() struct ip6protosw *pr; int i; struct timeval tv; + extern lck_mtx_t *domain_proto_mtx; #if DIAGNOSTIC if (sizeof(struct protosw) != sizeof(struct ip6protosw)) panic("sizeof(protosw) != sizeof(ip6protosw)"); #endif - pr = (struct ip6protosw *)pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); + pr = (struct ip6protosw *)pffindproto_locked(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == 0) panic("ip6_init"); for (i = 0; i < IPPROTO_MAX; i++) @@ -193,10 +210,34 @@ ip6_init() } } + ip6_mutex_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setdefault(ip6_mutex_grp_attr); + + ip6_mutex_grp = lck_grp_alloc_init("ip6", ip6_mutex_grp_attr); + ip6_mutex_attr = lck_attr_alloc_init(); + lck_attr_setdefault(ip6_mutex_attr); + + if ((ip6_mutex = lck_mtx_alloc_init(ip6_mutex_grp, ip6_mutex_attr)) == NULL) { + printf("ip6_init: can't alloc ip6_mutex\n"); + return; + } + if ((dad6_mutex = lck_mtx_alloc_init(ip6_mutex_grp, ip6_mutex_attr)) == NULL) { + printf("ip6_init: can't alloc dad6_mutex\n"); + return; + } + if ((nd6_mutex = lck_mtx_alloc_init(ip6_mutex_grp, ip6_mutex_attr)) == NULL) { + printf("ip6_init: can't alloc nd6_mutex\n"); + return; + } + + if ((prefix6_mutex = lck_mtx_alloc_init(ip6_mutex_grp, ip6_mutex_attr)) == NULL) { + printf("ip6_init: can't alloc prefix6_mutex\n"); + return; + } + + inet6domain.dom_flags = DOM_REENTRANT; + ip6intrq.ifq_maxlen = ip6qmaxlen; -#ifndef __APPLE__ - register_netisr(NETISR_IPV6, ip6intr); -#endif nd6_init(); frag6_init(); icmp6_init(); @@ -208,32 +249,36 @@ ip6_init() ip6_flow_seq = random() ^ tv.tv_usec; microtime(&tv); ip6_desync_factor = (random() ^ tv.tv_usec) % MAX_TEMP_DESYNC_FACTOR; - timeout(ip6_init2, (caddr_t)0, 2 * hz); + timeout(ip6_init2, (caddr_t)0, 1 * hz); + + lck_mtx_unlock(domain_proto_mtx); + 
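/*
 * [Editor's note: illustrative sketch, not part of this patch.
 * The allocations above follow the standard xnu lock-setup idiom:
 * one group/attribute pair can back any number of mutexes, and all
 * four ip6 mutexes here share the "ip6" group. In isolation:
 *
 *     lck_grp_attr_t *ga = lck_grp_attr_alloc_init();
 *     lck_grp_t *grp = lck_grp_alloc_init("example", ga);
 *     lck_attr_t *attr = lck_attr_alloc_init();
 *     lck_mtx_t *mtx = lck_mtx_alloc_init(grp, attr);
 *
 *     lck_mtx_lock(mtx);
 *     // ... protected section ...
 *     lck_mtx_unlock(mtx);
 *
 * The group name is what identifies related locks in lock
 * debugging output, which is why the ip6 mutexes share one group
 * instead of each allocating its own.]
 */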
proto_register_input(PF_INET6, ip6_proto_input, NULL); + lck_mtx_lock(domain_proto_mtx); } static void ip6_init2(dummy) void *dummy; { -#ifdef __APPLE__ - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); -#endif /* * to route local address of p2p link to loopback, * assign loopback address first. */ + if (loopattach_done == 0) { + timeout(ip6_init2, (caddr_t)0, 1 * hz); + return; + } in6_ifattach(&loif[0], NULL, NULL); #ifdef __APPLE__ /* nd6_timer_init */ - timeout(nd6_timer_funneled, (caddr_t)0, hz); + timeout(nd6_timer, (caddr_t)0, hz); /* router renumbering prefix list maintenance */ - timeout(in6_rr_timer_funneled, (caddr_t)0, hz); + timeout(in6_rr_timer, (caddr_t)0, hz); /* timer for regeneranation of temporary addresses randomize ID */ - timeout(in6_tmpaddrtimer_funneled, (caddr_t)0, + timeout(in6_tmpaddrtimer, (caddr_t)0, (ip6_temp_preferred_lifetime - ip6_desync_factor - ip6_temp_regen_advance) * hz); @@ -264,9 +309,6 @@ ip6_init2(dummy) #endif in6_init2done = 1; -#ifdef __APPLE__ - (void) thread_funnel_set(network_flock, FALSE); -#endif } #if __FreeBSD__ @@ -275,25 +317,6 @@ ip6_init2(dummy) SYSINIT(netinet6init2, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ip6_init2, NULL); #endif -/* - * IP6 input interrupt handling. Just pass the packet to ip6_input. - */ -void -ip6intr(void) -{ - int s; - struct mbuf *m; - - for (;;) { - s = splimp(); - IF_DEQUEUE(&ip6intrq, m); - splx(s); - if (m == 0) - return; - ip6_input(m); - } -} - extern struct route_in6 ip6_forward_rt; void @@ -306,7 +329,22 @@ ip6_input(m) u_int32_t rtalert = ~0; int nxt = 0, ours = 0; struct ifnet *deliverifp = NULL; + ipfilter_t inject_ipfref = 0; + int seen; + /* + * No need to process packet twice if we've + * already seen it + */ + inject_ipfref = ipf_get_inject_filter(m); + if (inject_ipfref != 0) { + ip6 = mtod(m, struct ip6_hdr *); + nxt = ip6->ip6_nxt; + seen = 0; + goto injectit; + } else + seen = 1; + #if IPSEC /* * should the inner packet be considered authentic?
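 *
 * [Editor's note: context, not part of this patch. The
 *  inject_ipfref test above implements filter reinjection: a
 *  filter that returned EJUSTRETURN from its ipf_input callback
 *  may later hand the mbuf back to the stack with
 *      ipf_inject_input(m, filter_ref);
 *  The tag that ipf_get_inject_filter() reads back identifies that
 *  filter, and the loop at injectit uses it (via `seen`) to resume
 *  the filter chain after the injecting filter instead of running
 *  every filter a second time.]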
@@ -323,6 +361,7 @@ ip6_input(m) */ ip6_delaux(m); + lck_mtx_lock(ip6_mutex); /* * mbuf statistics */ @@ -369,6 +408,7 @@ ip6_input(m) } if (n == NULL) { m_freem(m); + lck_mtx_unlock(ip6_mutex); return; /*ENOBUFS*/ } @@ -377,7 +417,8 @@ ip6_input(m) m_freem(m); m = n; } - IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /*nothing*/); + IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), + {lck_mtx_unlock(ip6_mutex); return;}); #endif if (m->m_len < sizeof(struct ip6_hdr)) { @@ -386,6 +427,7 @@ ip6_input(m) if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == 0) { ip6stat.ip6s_toosmall++; in6_ifstat_inc(inifp, ifs6_in_hdrerr); + lck_mtx_unlock(ip6_mutex); return; } } @@ -411,8 +453,10 @@ ip6_input(m) m_freem(m); m = NULL; } - if (!m) + if (!m) { + lck_mtx_unlock(ip6_mutex); return; + } } /* @@ -502,12 +546,15 @@ ip6_input(m) */ if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) != 0 && IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst)) { - if (!in6ifa_ifpwithaddr(m->m_pkthdr.rcvif, &ip6->ip6_dst)) { + struct in6_ifaddr *ia6; + if (!(ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif, &ip6->ip6_dst))) { + lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0); /* m is already freed */ return; } + ifafree(&ia6->ia_ifa); ours = 1; deliverifp = m->m_pkthdr.rcvif; @@ -647,7 +694,6 @@ ip6_input(m) "ip6_input: packet to an unready address %s->%s\n", ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst))); - goto bad; } } @@ -713,6 +759,7 @@ ip6_input(m) #if 0 /*touches NULL pointer*/ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); #endif + lck_mtx_unlock(ip6_mutex); return; /* m have already been freed */ } @@ -733,6 +780,7 @@ ip6_input(m) ip6stat.ip6s_badoptions++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); + lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&ip6->ip6_plen - (caddr_t)ip6); @@ -746,6 +794,7 @@ ip6_input(m) sizeof(struct ip6_hbh)); if (hbh == NULL) { ip6stat.ip6s_tooshort++; + lck_mtx_unlock(ip6_mutex); return; } #endif @@ -794,14 +843,17 @@ ip6_input(m) if (ip6_mrouter && ip6_mforward(ip6, m->m_pkthdr.rcvif, m)) { ip6stat.ip6s_cantforward++; m_freem(m); + lck_mtx_unlock(ip6_mutex); return; } if (!ours) { m_freem(m); + lck_mtx_unlock(ip6_mutex); return; } } else if (!ours) { - ip6_forward(m, 0); + ip6_forward(m, 0, 1); + lck_mtx_unlock(ip6_mutex); return; } @@ -828,12 +880,17 @@ ip6_input(m) */ ip6stat.ip6s_delivered++; in6_ifstat_inc(deliverifp, ifs6_in_deliver); + + lck_mtx_unlock(ip6_mutex); +injectit: nest = 0; while (nxt != IPPROTO_DONE) { + struct ipfilter *filter; + if (ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) { ip6stat.ip6s_toomanyhdr++; - goto bad; + goto badunlocked; } /* @@ -843,7 +900,7 @@ ip6_input(m) if (m->m_pkthdr.len < off) { ip6stat.ip6s_tooshort++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); - goto bad; + goto badunlocked; } #if 0 @@ -868,18 +925,58 @@ ip6_input(m) * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. 
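 *
 * [Editor's note: locking pattern, not part of this patch. Every
 *  icmp6_error() call in this file is now bracketed as
 *      lck_mtx_unlock(ip6_mutex);
 *      icmp6_error(m, type, code, param);
 *      lck_mtx_lock(ip6_mutex);
 *  icmp6_error() consumes the mbuf and sends the reply through
 *  ip6_output(), which acquires ip6_mutex itself, so calling it
 *  with the mutex held would self-deadlock on this non-recursive
 *  lock.]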
*/ - if ((ipsec_bypass == 0) && (ip6_protox[nxt]->pr_flags & PR_LASTHDR) != 0 && - ipsec6_in_reject(m, NULL)) { - ipsec6stat.in_polvio++; - goto bad; + if ((ipsec_bypass == 0) && (ip6_protox[nxt]->pr_flags & PR_LASTHDR) != 0) { + lck_mtx_lock(sadb_mutex); + if (ipsec6_in_reject(m, NULL)) { + ipsec6stat.in_polvio++; + lck_mtx_unlock(sadb_mutex); + goto badunlocked; + } + lck_mtx_unlock(sadb_mutex); } #endif - nxt = (*ip6_protox[nxt]->pr_input)(&m, &off); + /* + * Call IP filter on last header only + */ + if ((ip6_protox[nxt]->pr_flags & PR_LASTHDR) != 0 && !TAILQ_EMPTY(&ipv6_filters)) { + ipf_ref(); + TAILQ_FOREACH(filter, &ipv6_filters, ipf_link) { + if (seen == 0) { + if ((struct ipfilter *)inject_ipfref == filter) + seen = 1; + } else if (filter->ipf_filter.ipf_input) { + errno_t result; + + result = filter->ipf_filter.ipf_input( + filter->ipf_filter.cookie, (mbuf_t*)&m, off, nxt); + if (result == EJUSTRETURN) { + ipf_unref(); + return; + } + if (result != 0) { + ipf_unref(); + m_freem(m); + return; + } + } + } + ipf_unref(); + } + if (!(ip6_protox[nxt]->pr_flags & PR_PROTOLOCK)) { + lck_mtx_lock(inet6_domain_mutex); + nxt = (*ip6_protox[nxt]->pr_input)(&m, &off); + lck_mtx_unlock(inet6_domain_mutex); + } + else + nxt = (*ip6_protox[nxt]->pr_input)(&m, &off); } return; bad: + lck_mtx_unlock(ip6_mutex); + badunlocked: m_freem(m); + return; } /* @@ -930,11 +1027,11 @@ ip6_hopopts_input(plenp, rtalertp, mp, offp) /* validation of the length of the header */ #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), -1); + IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), return -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); hbhlen = (hbh->ip6h_len + 1) << 3; - IP6_EXTHDR_CHECK(m, off, hbhlen, -1); + IP6_EXTHDR_CHECK(m, off, hbhlen, return -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, @@ -1009,9 +1106,11 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) } if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) { /* XXX stat */ + lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); + lck_mtx_lock(ip6_mutex); return(-1); } optlen = IP6OPT_RTALERT_LEN; @@ -1026,9 +1125,11 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) } if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) { /* XXX stat */ + lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); + lck_mtx_lock(ip6_mutex); return(-1); } optlen = IP6OPT_JUMBO_LEN; @@ -1040,9 +1141,11 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_plen) { ip6stat.ip6s_badoptions++; + lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt - opthead); + lck_mtx_lock(ip6_mutex); return(-1); } @@ -1064,9 +1167,11 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) */ if (*plenp != 0) { ip6stat.ip6s_badoptions++; + lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); + lck_mtx_lock(ip6_mutex); return(-1); } #endif @@ -1076,9 +1181,11 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) */ if (jumboplen <= IPV6_MAXPACKET) { ip6stat.ip6s_badoptions++; + lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); + lck_mtx_lock(ip6_mutex); return(-1); } *plenp = jumboplen; @@ -1090,9 +1197,11 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) goto bad; } optlen = 
ip6_unknown_opt(opt, m, - erroff + opt - opthead); - if (optlen == -1) + erroff + opt - opthead, 1); + if (optlen == -1) { + /* ip6_unknown opt unlocked ip6_mutex */ return(-1); + } optlen += 2; break; } @@ -1100,7 +1209,7 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) return(0); - bad: + bad: m_freem(m); return(-1); } @@ -1112,10 +1221,11 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) * is not continuous in order to return an ICMPv6 error. */ int -ip6_unknown_opt(optp, m, off) +ip6_unknown_opt(optp, m, off, locked) u_int8_t *optp; struct mbuf *m; int off; + int locked; { struct ip6_hdr *ip6; @@ -1127,7 +1237,11 @@ ip6_unknown_opt(optp, m, off) return(-1); case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */ ip6stat.ip6s_badoptions++; + if (locked) + lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); + if (locked) + lck_mtx_lock(ip6_mutex); return(-1); case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */ ip6stat.ip6s_badoptions++; @@ -1135,9 +1249,14 @@ ip6_unknown_opt(optp, m, off) if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || (m->m_flags & (M_BCAST|M_MCAST))) m_freem(m); - else + else { + if (locked) + lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); + if (locked) + lck_mtx_lock(ip6_mutex); + } return(-1); } @@ -1162,17 +1281,8 @@ ip6_savecontrol(in6p, mp, ip6, m) struct ip6_hdr *ip6; struct mbuf *m; { - struct proc *p = current_proc(); /* XXX */ - int privileged = 0; int rthdr_exist = 0; -#ifdef __APPLE__ - if (p && !suser(p->p_ucred, &p->p_acflag)) -#else - if (p && !suser(p)) -#endif - privileged++; - #if SO_TIMESTAMP if ((in6p->in6p_socket->so_options & SO_TIMESTAMP) != 0) { struct timeval tv; @@ -1211,12 +1321,13 @@ ip6_savecontrol(in6p, mp, ip6, m) } /* - * IPV6_HOPOPTS socket option. We require super-user privilege - * for the option, but it might be too strict, since there might - * be some hop-by-hop options which can be returned to normal user. + * IPV6_HOPOPTS socket option. Recall that we required super-user + * privilege for the option (see ip6_ctloutput), but it might be too + * strict, since there might be some hop-by-hop options which can be + * returned to normal user. * See RFC 2292 section 6. */ - if ((in6p->in6p_flags & IN6P_HOPOPTS) != 0 && privileged) { + if ((in6p->in6p_flags & IN6P_HOPOPTS) != 0) { /* * Check if a hop-by-hop options header is contatined in the * received packet, and if so, store the options as ancillary @@ -1224,7 +1335,7 @@ ip6_savecontrol(in6p, mp, ip6, m) * just after the IPv6 header, which fact is assured through * the IPv6 input processing. */ - struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); + ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; int hbhlen = 0; @@ -1300,7 +1411,7 @@ ip6_savecontrol(in6p, mp, ip6, m) if ((in6p->in6p_flags & (IN6P_RTHDR | IN6P_DSTOPTS | IN6P_RTHDRDSTOPTS)) != 0) { - struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); + ip6 = mtod(m, struct ip6_hdr *); int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr); /* @@ -1364,14 +1475,6 @@ ip6_savecontrol(in6p, mp, ip6, m) if ((in6p->in6p_flags & IN6P_DSTOPTS) == 0) break; - /* - * We also require super-user privilege for - * the option. - * See the comments on IN6_HOPOPTS. 
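 *
 * [Editor's note: not part of this patch. The super-user test
 *  deleted here is not lost: ip6_ctloutput() now returns EPERM
 *  when an unprivileged socket tries to set IPV6_HOPOPTS or
 *  IPV6_DSTOPTS (see the `!privileged` check added to
 *  ip6_output.c later in this patch), so re-checking at delivery
 *  time in ip6_savecontrol() became redundant.]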
- */ - if (!privileged) - break; - *mp = sbcreatecontrol((caddr_t)ip6e, elen, IPV6_DSTOPTS, IPPROTO_IPV6); @@ -1565,7 +1668,8 @@ ip6_nexthdr(m, off, proto, nxtp) if (m->m_pkthdr.len < off + sizeof(fh)) return -1; m_copydata(m, off, sizeof(fh), (caddr_t)&fh); - if ((ntohs(fh.ip6f_offlg) & IP6F_OFF_MASK) != 0) + /* IP6F_OFF_MASK = 0xfff8(BigEndian), 0xf8ff(LittleEndian) */ + if (fh.ip6f_offlg & IP6F_OFF_MASK) return -1; if (nxtp) *nxtp = fh.ip6f_nxt; diff --git a/bsd/netinet6/ip6_mroute.c b/bsd/netinet6/ip6_mroute.c index 595cf6c91..84e1a08ad 100644 --- a/bsd/netinet6/ip6_mroute.c +++ b/bsd/netinet6/ip6_mroute.c @@ -58,6 +58,7 @@ #include <sys/time.h> #include <sys/kernel.h> #include <sys/syslog.h> +#include <kern/locks.h> #include <net/if.h> #include <net/route.h> @@ -78,15 +79,16 @@ static MALLOC_DEFINE(M_MRTABLE, "mf6c", "multicast forwarding cache entry"); #define M_HASCL(m) ((m)->m_flags & M_EXT) -static int ip6_mdq __P((struct mbuf *, struct ifnet *, struct mf6c *)); -static void phyint_send __P((struct ip6_hdr *, struct mif6 *, struct mbuf *)); +static int ip6_mdq(struct mbuf *, struct ifnet *, struct mf6c *); +static void phyint_send(struct ip6_hdr *, struct mif6 *, struct mbuf *); -static int set_pim6 __P((int *)); -static int socket_send __P((struct socket *, struct mbuf *, - struct sockaddr_in6 *)); -static int register_send __P((struct ip6_hdr *, struct mif6 *, - struct mbuf *)); +static int set_pim6(int *); +static int socket_send(struct socket *, struct mbuf *, + struct sockaddr_in6 *); +static int register_send(struct ip6_hdr *, struct mif6 *, + struct mbuf *); +extern lck_mtx_t *ip6_mutex; /* * Globals. All but ip6_mrouter, ip6_mrtproto and mrt6stat could be static, * except for netstat or debugging purposes. @@ -112,8 +114,7 @@ u_int mrt6debug = 0; /* debug level */ #define DEBUG_PIM 0x40 #endif -static void expire_upcalls __P((void *)); -static void expire_upcalls_funneled __P((void *)); +static void expire_upcalls(void *); #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ @@ -124,7 +125,6 @@ extern struct socket *ip_mrouter; #endif #endif -static u_long lo_dl_tag = 0; /* * 'Interfaces' associated with decapsulator (so we can tell * packets that went through it from ones that get reflected @@ -208,13 +208,13 @@ u_long upcall_data[UPCALL_MAX + 1]; static void collate(); #endif /* UPCALL_TIMING */ -static int get_sg_cnt __P((struct sioc_sg_req6 *)); -static int get_mif6_cnt __P((struct sioc_mif_req6 *)); -static int ip6_mrouter_init __P((struct socket *, struct mbuf *, int)); -static int add_m6if __P((struct mif6ctl *)); -static int del_m6if __P((mifi_t *)); -static int add_m6fc __P((struct mf6cctl *)); -static int del_m6fc __P((struct mf6cctl *)); +static int get_sg_cnt(struct sioc_sg_req6 *); +static int get_mif6_cnt(struct sioc_mif_req6 *); +static int ip6_mrouter_init(struct socket *, int, int); +static int add_m6if(struct mif6ctl *); +static int del_m6if(mifi_t *); +static int add_m6fc(struct mf6cctl *); +static int del_m6fc(struct mf6cctl *); #ifndef __APPLE__ static struct callout expire_upcalls_ch; @@ -227,53 +227,66 @@ ip6_mrouter_set(so, sopt) struct socket *so; struct sockopt *sopt; { - int error = 0; - struct mbuf *m; + int error = 0; + int optval; + struct mif6ctl mifc; + struct mf6cctl mfcc; + mifi_t mifi; if (so != ip6_mrouter && sopt->sopt_name != MRT6_INIT) return (EACCES); - if (sopt->sopt_valsize > MCLBYTES) - return (EMSGSIZE); - - - if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ - return (error); - if 
((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ - return (error); - switch (sopt->sopt_name) { case MRT6_INIT: #if MRT6_OINIT case MRT6_OINIT: #endif - error = ip6_mrouter_init(so, m, sopt->sopt_name); + error = sooptcopyin(sopt, &optval, sizeof(optval), + sizeof(optval)); + if (error) + break; + error = ip6_mrouter_init(so, optval, sopt->sopt_name); break; case MRT6_DONE: error = ip6_mrouter_done(); break; case MRT6_ADD_MIF: - error = add_m6if(mtod(m, struct mif6ctl *)); - break; - case MRT6_DEL_MIF: - error = del_m6if(mtod(m, mifi_t *)); + error = sooptcopyin(sopt, &mifc, sizeof(mifc), sizeof(mifc)); + if (error) + break; + error = add_m6if(&mifc); break; case MRT6_ADD_MFC: - error = add_m6fc(mtod(m, struct mf6cctl *)); + error = sooptcopyin(sopt, &mfcc, sizeof(mfcc), sizeof(mfcc)); + if (error) + break; + error = add_m6fc(&mfcc); break; case MRT6_DEL_MFC: - error = del_m6fc(mtod(m, struct mf6cctl *)); + error = sooptcopyin(sopt, &mfcc, sizeof(mfcc), sizeof(mfcc)); + if (error) + break; + error = del_m6fc(&mfcc); + break; + case MRT6_DEL_MIF: + error = sooptcopyin(sopt, &mifi, sizeof(mifi), sizeof(mifi)); + if (error) + break; + error = del_m6if(&mifi); break; case MRT6_PIM: - error = set_pim6(mtod(m, int *)); + error = sooptcopyin(sopt, &optval, sizeof(optval), + sizeof(optval)); + if (error) + break; + error = set_pim6(&optval); break; default: error = EOPNOTSUPP; break; } - (void)m_freem(m); - return(error); + return (error); } /* @@ -328,11 +341,8 @@ get_sg_cnt(req) struct sioc_sg_req6 *req; { struct mf6c *rt; - int s; - s = splnet(); MF6CFIND(req->src.sin6_addr, req->grp.sin6_addr, rt); - splx(s); if (rt != NULL) { req->pktcnt = rt->mf6c_pkt_cnt; req->bytecnt = rt->mf6c_byte_cnt; @@ -382,13 +392,11 @@ set_pim6(i) * Enable multicast routing */ static int -ip6_mrouter_init(so, m, cmd) +ip6_mrouter_init(so, v, cmd) struct socket *so; - struct mbuf *m; + int v; int cmd; { - int *v; - #if MRT6DEBUG if (mrt6debug) log(LOG_DEBUG, @@ -400,12 +408,8 @@ ip6_mrouter_init(so, m, cmd) so->so_proto->pr_protocol != IPPROTO_ICMPV6) return EOPNOTSUPP; - if (!m || (m->m_len != sizeof(int *))) - return ENOPROTOOPT; - - v = mtod(m, int *); - if (*v != 1) - return ENOPROTOOPT; + if (v != 1) + return (ENOPROTOOPT); if (ip6_mrouter != NULL) return EADDRINUSE; @@ -421,7 +425,7 @@ ip6_mrouter_init(so, m, cmd) callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); #else - timeout(expire_upcalls_funneled, (caddr_t)NULL, EXPIRE_TIMEOUT); + timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); #endif #if MRT6DEBUG @@ -444,9 +448,7 @@ ip6_mrouter_done() struct in6_ifreq ifr; struct mf6c *rt; struct rtdetq *rte; - int s; - s = splnet(); /* * For each phyint in use, disable promiscuous reception of all IPv6 @@ -492,11 +494,12 @@ ip6_mrouter_done() #ifndef __APPLE__ callout_stop(&expire_upcalls_ch); #else - untimeout(expire_upcalls_funneled, (caddr_t)NULL); + untimeout(expire_upcalls, (caddr_t)NULL); #endif /* * Free all multicast forwarding cache entries. 
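 *
 * [Editor's note: context, not part of this patch. The
 *  ip6_mrouter_set() rewrite above replaces the mbuf staging of
 *  soopt_getm()/soopt_mcopyin() with one direct, size-checked copy
 *  per option, e.g.
 *      error = sooptcopyin(sopt, &mifc, sizeof(mifc), sizeof(mifc));
 *  where the third argument is the destination size and the fourth
 *  the minimum the caller must supply; an undersized user buffer
 *  now fails cleanly with EINVAL instead of being read out of a
 *  partially filled mbuf.]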
+ *###LD 5/27 needs locking */ for (i = 0; i < MF6CTBLSIZ; i++) { rt = mf6ctable[i]; @@ -526,7 +529,6 @@ ip6_mrouter_done() ip6_mrouter = NULL; ip6_mrouter_ver = 0; - splx(s); #if MRT6DEBUG if (mrt6debug) @@ -547,7 +549,7 @@ add_m6if(mifcp) { struct mif6 *mifp; struct ifnet *ifp; - int error, s; + int error; #if notyet struct tbf *m_tbf = tbftable + mifcp->mif6c_mifi; #endif @@ -577,14 +579,11 @@ add_m6if(mifcp) if ((ifp->if_flags & IFF_MULTICAST) == 0) return EOPNOTSUPP; - s = splnet(); error = if_allmulti(ifp, 1); - splx(s); if (error) return error; } - s = splnet(); mifp->m6_flags = mifcp->mif6c_flags; mifp->m6_ifp = ifp; #if notyet @@ -596,7 +595,6 @@ add_m6if(mifcp) mifp->m6_pkt_out = 0; mifp->m6_bytes_in = 0; mifp->m6_bytes_out = 0; - splx(s); /* Adjust nummifs up if the mifi is higher than nummifs */ if (nummifs <= mifcp->mif6c_mifi) @@ -623,14 +621,12 @@ del_m6if(mifip) struct mif6 *mifp = mif6table + *mifip; mifi_t mifi; struct ifnet *ifp; - int s; if (*mifip >= nummifs) return EINVAL; if (mifp->m6_ifp == NULL) return EINVAL; - s = splnet(); if (!(mifp->m6_flags & MIFF_REGISTER)) { /* @@ -646,7 +642,7 @@ del_m6if(mifip) bzero((caddr_t)qtable[*mifip], sizeof(qtable[*mifip])); bzero((caddr_t)mifp->m6_tbf, sizeof(*(mifp->m6_tbf))); #endif - bzero((caddr_t)mifp, sizeof (*mifp)); + bzero((caddr_t)mifp, sizeof(*mifp)); /* Adjust nummifs down */ for (mifi = nummifs; mifi > 0; mifi--) @@ -654,7 +650,6 @@ del_m6if(mifip) break; nummifs = mifi; - splx(s); #if MRT6DEBUG if (mrt6debug) @@ -691,17 +686,14 @@ add_m6fc(mfccp) mfccp->mf6cc_parent); #endif - s = splnet(); rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; - splx(s); return 0; } /* * Find the entry for which the upcall was made and update */ - s = splnet(); hash = MF6CHASH(mfccp->mf6cc_origin.sin6_addr, mfccp->mf6cc_mcastgrp.sin6_addr); for (rt = mf6ctable[hash], nstl = 0; rt; rt = rt->mf6c_next) { @@ -815,7 +807,6 @@ add_m6fc(mfccp) mf6ctable[hash] = rt; } } - splx(s); return 0; } @@ -858,7 +849,6 @@ del_m6fc(mfccp) struct mf6c *rt; struct mf6c **nptr; u_long hash; - int s; origin = mfccp->mf6cc_origin; mcastgrp = mfccp->mf6cc_mcastgrp; @@ -871,7 +861,6 @@ del_m6fc(mfccp) ip6_sprintf(&mcastgrp.sin6_addr)); #endif - s = splnet(); nptr = &mf6ctable[hash]; while ((rt = *nptr) != NULL) { @@ -885,14 +874,12 @@ del_m6fc(mfccp) nptr = &rt->mf6c_next; } if (rt == NULL) { - splx(s); return EADDRNOTAVAIL; } *nptr = rt->mf6c_next; FREE(rt, M_MRTABLE); - splx(s); return 0; } @@ -903,15 +890,16 @@ socket_send(s, mm, src) struct mbuf *mm; struct sockaddr_in6 *src; { +//### LD 5/27/04 needs locking! +// if (s) { if (sbappendaddr(&s->so_rcv, (struct sockaddr *)src, - mm, (struct mbuf *)0) != 0) { + mm, (struct mbuf *)0, NULL) != 0) { sorwakeup(s); return 0; } } - m_freem(mm); return -1; } @@ -937,6 +925,7 @@ ip6_mforward(ip6, ifp, m) struct mbuf *mm; int s; mifi_t mifi; + struct timeval timenow; #if MRT6DEBUG if (mrt6debug & DEBUG_FORWARD) @@ -961,10 +950,11 @@ ip6_mforward(ip6, ifp, m) * MLD packets can be sent with the unspecified source address * (although such packets must normally set 1 to the hop limit field). 
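 *
 * [Editor's note: sketch, not part of this patch. The log call
 *  below is rate limited with the idiom
 *      getmicrotime(&timenow);
 *      if (ip6_log_time + ip6_log_interval < timenow.tv_sec) {
 *              ip6_log_time = timenow.tv_sec;
 *              log(LOG_DEBUG, "cannot forward ...");
 *      }
 *  emitting at most one message per ip6_log_interval seconds; the
 *  getmicrotime() call replaces the direct reads of the old
 *  time_second global elsewhere in this hunk.]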
*/ + getmicrotime(&timenow); if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { ip6stat.ip6s_cantforward++; - if (ip6_log_time + ip6_log_interval < time_second) { - ip6_log_time = time_second; + if (ip6_log_time + ip6_log_interval < timenow.tv_sec) { + ip6_log_time = timenow.tv_sec; log(LOG_DEBUG, "cannot forward " "from %s to %s nxt %d received on %s\n", @@ -1188,20 +1178,6 @@ ip6_mforward(ip6, ifp, m) } } -static void -expire_upcalls_funneled(unused) - void *unused; -{ -#ifdef __APPLE__ - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); -#endif - expire_upcalls(unused); -#ifdef __APPLE__ - (void) thread_funnel_set(network_flock, FALSE); -#endif -} - /* * Clean up cache entries if upcalls are not serviced * Call from the Slow Timeout mechanism, every half second. @@ -1262,7 +1238,7 @@ expire_upcalls(unused) callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); #else - timeout(expire_upcalls_funneled, (caddr_t)NULL, EXPIRE_TIMEOUT); + timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); #endif } @@ -1483,7 +1459,8 @@ phyint_send(ip6, mifp, m) im6o.im6o_multicast_hlim = ip6->ip6_hlim; im6o.im6o_multicast_loop = 1; error = ip6_output(mb_copy, NULL, &ro, - IPV6_FORWARDING, &im6o, NULL); + IPV6_FORWARDING, &im6o, NULL, 0); + #if MRT6DEBUG if (mrt6debug & DEBUG_XMIT) @@ -1521,12 +1498,14 @@ phyint_send(ip6, mifp, m) #ifdef __APPLE__ /* Make sure the HW checksum flags are cleaned before sending the packet */ - mb_copy->m_pkthdr.rcvif = (struct ifnet *)0; + mb_copy->m_pkthdr.rcvif = 0; mb_copy->m_pkthdr.csum_data = 0; mb_copy->m_pkthdr.csum_flags = 0; - error = dlil_output(ifptodlt(ifp, PF_INET6), mb_copy, + lck_mtx_unlock(ip6_mutex); + error = dlil_output(ifp, PF_INET6, mb_copy, NULL, (struct sockaddr *)&ro.ro_dst, 0); + lck_mtx_lock(ip6_mutex); #else error = (*ifp->if_output)(ifp, mb_copy, (struct sockaddr *)&ro.ro_dst, @@ -1679,7 +1658,7 @@ pim6_input(mp, offp) * possibly the PIM REGISTER header */ #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, minlen, IPPROTO_DONE); + IP6_EXTHDR_CHECK(m, off, minlen, return IPPROTO_DONE); /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); @@ -1847,11 +1826,11 @@ pim6_input(mp, offp) #ifdef __APPLE__ - if (lo_dl_tag == 0) - dlil_find_dltag(APPLE_IF_FAM_LOOPBACK, 0, PF_INET, &lo_dl_tag); - - if (lo_dl_tag) - dlil_output(lo_dl_tag, m, 0, (struct sockaddr *)&dst, 0); + if (lo_ifp) { + lck_mtx_unlock(ip6_mutex); + dlil_output(lo_ifp, PF_INET6, m, 0, (struct sockaddr *)&dst, 0); + lck_mtx_lock(ip6_mutex); + } else { printf("Warning: pim6_input call to dlil_find_dltag failed!\n"); m_freem(m); diff --git a/bsd/netinet6/ip6_mroute.h b/bsd/netinet6/ip6_mroute.h index 5bae8b74e..f38b57753 100644 --- a/bsd/netinet6/ip6_mroute.h +++ b/bsd/netinet6/ip6_mroute.h @@ -51,7 +51,7 @@ /* * Multicast Routing set/getsockopt commands. 
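 *
 * [Editor's note: locking note, not part of this patch. As with
 *  icmp6_error() in ip6_input.c, the multicast transmit paths
 *  above drop the IPv6 lock around the driver call:
 *      lck_mtx_unlock(ip6_mutex);
 *      dlil_output(ifp, PF_INET6, m, 0, (struct sockaddr *)&dst, 0);
 *      lck_mtx_lock(ip6_mutex);
 *  dlil_output() may block in the driver or loop the packet
 *  straight back into ip6_input(), which takes ip6_mutex, so
 *  holding the mutex across the call risks deadlock.]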
*/ -#ifdef KERNEL +#ifdef KERNEL_PRIVATE #define MRT6_OINIT 100 /* initialize forwarder (omrt6msg) */ #endif #define MRT6_DONE 101 /* shut down forwarder */ @@ -62,12 +62,8 @@ #define MRT6_PIM 107 /* enable pim code */ #define MRT6_INIT 108 /* initialize forwarder (mrt6msg) */ -#if BSD >= 199103 -#define GET_TIME(t) microtime(&t) -#elif defined(sun) -#define GET_TIME(t) uniqtime(&t) -#else -#define GET_TIME(t) ((t) = time) +#ifdef __APPLE__ +#define GET_TIME(t) getmicrotime(&t) #endif /* @@ -140,6 +136,7 @@ struct mrt6stat { u_quad_t mrt6s_upq_sockfull; /* upcalls dropped - socket full */ }; +#ifdef KERNEL_PRIVATE #if MRT6_OINIT /* * Struct used to communicate from kernel to multicast router @@ -160,6 +157,7 @@ struct omrt6msg { struct in6_addr im6_src, im6_dst; }; #endif +#endif KERNEL_PRIVATE /* * Structure used to communicate from kernel to multicast router. @@ -203,8 +201,7 @@ struct sioc_mif_req6 { u_quad_t obytes; /* Output byte count on mif */ }; -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef PRIVATE /* * The kernel's multicast-interface structure. */ @@ -247,6 +244,15 @@ struct mf6c { #define MF6C_INCOMPLETE_PARENT ((mifi_t)-1) +#define MF6CTBLSIZ 256 +#if (MF6CTBLSIZ & (MF6CTBLSIZ - 1)) == 0 /* from sys:route.h */ +#define MF6CHASHMOD(h) ((h) & (MF6CTBLSIZ - 1)) +#else +#define MF6CHASHMOD(h) ((h) % MF6CTBLSIZ) +#endif + +#define MAX_UPQ6 4 /* max. no of pkts in upcall Q */ + /* * Argument structure used for pkt info. while upcall is made */ @@ -261,20 +267,14 @@ struct rtdetq { /* XXX: rtdetq is also defined in ip_mroute.h */ }; #endif /* _NETINET_IP_MROUTE_H_ */ -#define MF6CTBLSIZ 256 -#if (MF6CTBLSIZ & (MF6CTBLSIZ - 1)) == 0 /* from sys:route.h */ -#define MF6CHASHMOD(h) ((h) & (MF6CTBLSIZ - 1)) -#else -#define MF6CHASHMOD(h) ((h) % MF6CTBLSIZ) -#endif - -#define MAX_UPQ6 4 /* max. 
no of pkts in upcall Q */ +#ifdef KERNEL +extern struct mrt6stat mrt6stat; -int ip6_mrouter_set __P((struct socket *so, struct sockopt *sopt)); -int ip6_mrouter_get __P((struct socket *so, struct sockopt *sopt)); -int ip6_mrouter_done __P((void)); -int mrt6_ioctl __P((int, caddr_t)); -#endif /* __APPLE_API_PRIVATE */ +int ip6_mrouter_set(struct socket *so, struct sockopt *sopt); +int ip6_mrouter_get(struct socket *so, struct sockopt *sopt); +int ip6_mrouter_done(void); +int mrt6_ioctl(int, caddr_t); #endif /* KERNEL */ +#endif /* PRIVATE */ #endif /* !_NETINET6_IP6_MROUTE_H_ */ diff --git a/bsd/netinet6/ip6_output.c b/bsd/netinet6/ip6_output.c index f6023f234..c2bdcc28d 100644 --- a/bsd/netinet6/ip6_output.c +++ b/bsd/netinet6/ip6_output.c @@ -76,6 +76,7 @@ #include <sys/systm.h> #include <sys/kernel.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <net/if.h> #include <net/route.h> @@ -97,21 +98,23 @@ #endif #include <netkey/key.h> extern int ipsec_bypass; +extern lck_mtx_t *sadb_mutex; +extern lck_mtx_t *nd6_mutex; #endif /* IPSEC */ #include <netinet6/ip6_fw.h> #include <net/net_osdep.h> +#include <netinet/kpi_ipfilter_var.h> + #ifndef __APPLE__ static MALLOC_DEFINE(M_IPMOPTS, "ip6_moptions", "internet multicast options"); #endif -static u_long lo_dl_tag = 0; extern u_long route_generation; - struct ip6_exthdrs { struct mbuf *ip6e_ip6; struct mbuf *ip6e_hbh; @@ -120,19 +123,20 @@ struct ip6_exthdrs { struct mbuf *ip6e_dest2; }; -static int ip6_pcbopts __P((struct ip6_pktopts **, struct mbuf *, - struct socket *, struct sockopt *sopt)); -static int ip6_setmoptions __P((int, struct inpcb *, struct mbuf *)); -static int ip6_getmoptions __P((int, struct ip6_moptions *, struct mbuf **)); -static int ip6_copyexthdr __P((struct mbuf **, caddr_t, int)); -static int ip6_insertfraghdr __P((struct mbuf *, struct mbuf *, int, - struct ip6_frag **)); -static int ip6_insert_jumboopt __P((struct ip6_exthdrs *, u_int32_t)); -static int ip6_splithdr __P((struct mbuf *, struct ip6_exthdrs *)); +static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *, + struct socket *, struct sockopt *sopt); +static int ip6_setmoptions(int, struct inpcb *, struct mbuf *); +static int ip6_getmoptions(int, struct ip6_moptions *, struct mbuf **); +static int ip6_copyexthdr(struct mbuf **, caddr_t, int); +static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int, + struct ip6_frag **); +static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t); +static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *); extern int ip_createmoptions(struct ip_moptions **imop); extern int ip_addmembership(struct ip_moptions *imo, struct ip_mreq *mreq); extern int ip_dropmembership(struct ip_moptions *imo, struct ip_mreq *mreq); +extern lck_mtx_t *ip6_mutex; /* * IP6 output. The packet in mbuf chain m contains a skeletal IP6 @@ -146,13 +150,14 @@ extern int ip_dropmembership(struct ip_moptions *imo, struct ip_mreq *mreq); * which is rt_rmx.rmx_mtu. 
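 *
 * [Editor's note: convention note, not part of this patch. The new
 *  trailing `locked` argument to ip6_output() tells it whether the
 *  caller already holds ip6_mutex:
 *      ip6_output(m, opt, ro, flags, im6o, ifpp, 0);  // caller unlocked
 *      ip6_output(m, opt, ro, flags, im6o, ifpp, 1);  // caller holds ip6_mutex
 *  In the first form ip6_output() takes the mutex on entry and
 *  drops it at done:, so either way the lock is held exactly once
 *  across the send path.]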
*/ int -ip6_output(m0, opt, ro, flags, im6o, ifpp) - struct mbuf *m0; - struct ip6_pktopts *opt; - struct route_in6 *ro; - int flags; - struct ip6_moptions *im6o; - struct ifnet **ifpp; /* XXX: just for statistics */ +ip6_output( + struct mbuf *m0, + struct ip6_pktopts *opt, + struct route_in6 *ro, + int flags, + struct ip6_moptions *im6o, + struct ifnet **ifpp, /* XXX: just for statistics */ + int locked) { struct ip6_hdr *ip6, *mhip6; struct ifnet *ifp, *origifp; @@ -169,20 +174,26 @@ ip6_output(m0, opt, ro, flags, im6o, ifpp) struct route_in6 *ro_pmtu = NULL; int hdrsplit = 0; int needipsec = 0; + ipfilter_t inject_filter_ref; + #if IPSEC int needipsectun = 0; struct socket *so = NULL; struct secpolicy *sp = NULL; + if (!locked) + lck_mtx_lock(ip6_mutex); /* for AH processing. stupid to have "socket" variable in IP layer... */ if (ipsec_bypass == 0) { so = ipsec_getsocket(m); (void)ipsec_setsocket(m, NULL); } - ip6 = mtod(m, struct ip6_hdr *); #endif /* IPSEC */ + ip6 = mtod(m, struct ip6_hdr *); + inject_filter_ref = ipf_get_inject_filter(m); + #define MAKE_EXTHDR(hp, mp) \ do { \ if (hp) { \ @@ -210,7 +221,8 @@ ip6_output(m0, opt, ro, flags, im6o, ifpp) #if IPSEC if (ipsec_bypass != 0) goto skip_ipsec; - + + lck_mtx_lock(sadb_mutex); /* get a security policy for this packet */ if (so == NULL) sp = ipsec6_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, 0, &error); @@ -219,6 +231,7 @@ ip6_output(m0, opt, ro, flags, im6o, ifpp) if (sp == NULL) { ipsec6stat.out_inval++; + lck_mtx_unlock(sadb_mutex); goto freehdrs; } @@ -231,6 +244,7 @@ ip6_output(m0, opt, ro, flags, im6o, ifpp) * This packet is just discarded. */ ipsec6stat.out_polvio++; + lck_mtx_unlock(sadb_mutex); goto freehdrs; case IPSEC_POLICY_BYPASS: @@ -243,6 +257,7 @@ ip6_output(m0, opt, ro, flags, im6o, ifpp) if (sp->req == NULL) { /* acquire a policy */ error = key_spdacquire(sp); + lck_mtx_unlock(sadb_mutex); goto freehdrs; } needipsec = 1; @@ -252,6 +267,7 @@ ip6_output(m0, opt, ro, flags, im6o, ifpp) default: printf("ip6_output: Invalid policy found. 
%d\n", sp->policy); } + lck_mtx_unlock(sadb_mutex); skip_ipsec: #endif /* IPSEC */ @@ -361,6 +377,61 @@ ip6_output(m0, opt, ro, flags, im6o, ifpp) MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp, IPPROTO_ROUTING); + if (!TAILQ_EMPTY(&ipv6_filters)) { + struct ipfilter *filter; + int seen = (inject_filter_ref == 0); + int fixscope = 0; + struct ipf_pktopts *ippo = 0, ipf_pktopts; + + if (im6o != NULL && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { + ippo = &ipf_pktopts; + ippo->ippo_flags = IPPOF_MCAST_OPTS; + ippo->ippo_mcast_ifnet = im6o->im6o_multicast_ifp; + ippo->ippo_mcast_ttl = im6o->im6o_multicast_hlim; + ippo->ippo_mcast_loop = im6o->im6o_multicast_loop; + } + + /* Hack: embed the scope_id in the destination */ + if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst) && + (ip6->ip6_dst.s6_addr16[1] == 0) && (ro != NULL)) { + fixscope = 1; + ip6->ip6_dst.s6_addr16[1] = htons(ro->ro_dst.sin6_scope_id); + } + { + lck_mtx_unlock(ip6_mutex); + ipf_ref(); + TAILQ_FOREACH(filter, &ipv6_filters, ipf_link) { + /* + * No need to proccess packet twice if we've + * already seen it + */ + if (seen == 0) { + if ((struct ipfilter *)inject_filter_ref == filter) + seen = 1; + } else if (filter->ipf_filter.ipf_output) { + errno_t result; + + result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); + if (result == EJUSTRETURN) { + ipf_unref(); + locked = 1; /* Don't want to take lock to unlock it right away */ + goto done; + } + if (result != 0) { + ipf_unref(); + locked = 1; /* Don't want to take lock to unlock it right away */ + goto bad; + } + } + } + ipf_unref(); + lck_mtx_lock(ip6_mutex); + } + /* Hack: cleanup embedded scope_id if we put it there */ + if (fixscope) + ip6->ip6_dst.s6_addr16[1] = 0; + } + #if IPSEC if (!needipsec) goto skip_ipsec2; @@ -512,9 +583,10 @@ skip_ipsec2:; state.m = m; state.ro = (struct route *)ro; state.dst = (struct sockaddr *)dst; - + + lck_mtx_lock(sadb_mutex); error = ipsec6_output_tunnel(&state, sp, flags); - + lck_mtx_unlock(sadb_mutex); m = state.m; ro = (struct route_in6 *)state.ro; dst = (struct sockaddr_in6 *)state.dst; @@ -537,6 +609,7 @@ skip_ipsec2:; error = 0; break; } + lck_mtx_unlock(sadb_mutex); goto bad; } @@ -554,16 +627,18 @@ skip_ipsec2:; * if an interface is specified from an upper layer, * ifp must point it. */ + lck_mtx_lock(rt_mtx); if (ro->ro_rt == 0) { /* * non-bsdi always clone routes, if parent is * PRF_CLONING. */ - rtalloc((struct route *)ro); + rtalloc_ign_locked((struct route *)ro, 0UL); } if (ro->ro_rt == 0) { ip6stat.ip6s_noroute++; error = EHOSTUNREACH; + lck_mtx_unlock(rt_mtx); /* XXX in6_ifstat_inc(ifp, ifs6_out_discard); */ goto bad; } @@ -572,6 +647,7 @@ skip_ipsec2:; ro->ro_rt->rt_use++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in6 *)ro->ro_rt->rt_gateway; + lck_mtx_unlock(rt_mtx); m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */ in6_ifstat_inc(ifp, ifs6_out_request); @@ -652,12 +728,14 @@ skip_ipsec2:; * ``net'' ff00::/8). 
*/ if (ifp == NULL) { + lck_mtx_lock(rt_mtx); if (ro->ro_rt == 0) { - ro->ro_rt = rtalloc1((struct sockaddr *) + ro->ro_rt = rtalloc1_locked((struct sockaddr *) &ro->ro_dst, 0, 0UL); } if (ro->ro_rt == 0) { ip6stat.ip6s_noroute++; + lck_mtx_unlock(rt_mtx); error = EHOSTUNREACH; /* XXX in6_ifstat_inc(ifp, ifs6_out_discard) */ goto bad; @@ -665,6 +743,7 @@ skip_ipsec2:; ia = ifatoia6(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_use++; + lck_mtx_unlock(rt_mtx); } if ((flags & IPV6_FORWARDING) == 0) @@ -680,7 +759,9 @@ skip_ipsec2:; error = ENETUNREACH; goto bad; } + ifnet_lock_shared(ifp); IN6_LOOKUP_MULTI(ip6->ip6_dst, ifp, in6m); + ifnet_lock_done(ifp); if (in6m != NULL && (im6o == NULL || im6o->im6o_multicast_loop)) { /* @@ -836,7 +917,7 @@ skip_ipsec2:; u_short port = 0; m->m_pkthdr.rcvif = NULL; /* XXX */ /* If ipfw says divert, we have to just drop packet */ - if ((*ip6_fw_chk_ptr)(&ip6, ifp, &port, &m)) { + if (ip6_fw_chk_ptr(&ip6, ifp, &port, &m)) { m_freem(m); goto done; } @@ -914,7 +995,7 @@ skip_ipsec2:; ipsec_delaux(m); #endif - error = nd6_output(ifp, origifp, m, dst, ro->ro_rt); + error = nd6_output(ifp, origifp, m, dst, ro->ro_rt, 1); goto done; } else if (mtu < IPV6_MMTU) { /* @@ -1011,7 +1092,8 @@ skip_ipsec2:; } m_cat(m, m_frgpart); m->m_pkthdr.len = len + hlen + sizeof(*ip6f); - m->m_pkthdr.rcvif = (struct ifnet *)0; + m->m_pkthdr.rcvif = 0; + m->m_pkthdr.socket_id = m0->m_pkthdr.socket_id; ip6f->ip6f_reserved = 0; ip6f->ip6f_ident = id; ip6f->ip6f_nxt = nextproto; @@ -1044,7 +1126,8 @@ sendorfree: /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif - error = nd6_output(ifp, origifp, m, dst, ro->ro_rt); + error = nd6_output(ifp, origifp, m, dst, ro->ro_rt, 1); + } else m_freem(m); } @@ -1053,6 +1136,8 @@ sendorfree: ip6stat.ip6s_fragmented++; done: + if (!locked) + lck_mtx_unlock(ip6_mutex); if (ro == &ip6route && ro->ro_rt) { /* brace necessary for rtfree */ rtfree(ro->ro_rt); } else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt) { @@ -1060,8 +1145,11 @@ done: } #if IPSEC - if (sp != NULL) + if (sp != NULL) { + lck_mtx_lock(sadb_mutex); key_freesp(sp); + lck_mtx_unlock(sadb_mutex); + } #endif /* IPSEC */ return(error); @@ -1267,6 +1355,7 @@ ip6_ctloutput(so, sopt) int optlen; struct proc *p; + level = error = optval = 0; if (sopt == NULL) panic("ip6_ctloutput: arg soopt is NULL"); else { @@ -1276,9 +1365,8 @@ ip6_ctloutput(so, sopt) optlen = sopt->sopt_valsize; p = sopt->sopt_p; } - error = optval = 0; - privileged = (p == 0 || suser(p->p_ucred, &p->p_acflag)) ? 0 : 1; + privileged = (p == 0 || proc_suser(p)) ? 
0 : 1; if (level == IPPROTO_IPV6) { switch (op) { @@ -1485,10 +1573,10 @@ do { \ size_t len = 0; struct mbuf *m; - if (sopt->sopt_valsize > MCLBYTES) { - error = EMSGSIZE; - break; - } + if (sopt->sopt_valsize > MCLBYTES) { + error = EMSGSIZE; + break; + } if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ @@ -1497,8 +1585,10 @@ do { \ req = mtod(m, caddr_t); len = m->m_len; } + lck_mtx_lock(sadb_mutex); error = ipsec6_set_policy(in6p, optname, req, len, privileged); + lck_mtx_unlock(sadb_mutex); m_freem(m); } break; @@ -1583,8 +1673,8 @@ do { \ case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: - if (optname == IPV6_HOPOPTS || - optname == IPV6_DSTOPTS || + if ((optname == IPV6_HOPOPTS || + optname == IPV6_DSTOPTS) && !privileged) return(EPERM); switch (optname) { @@ -1636,10 +1726,10 @@ do { \ struct mbuf *m = NULL; struct mbuf **mp = &m; - if (sopt->sopt_valsize > MCLBYTES) { - error = EMSGSIZE; - break; - } + if (sopt->sopt_valsize > MCLBYTES) { + error = EMSGSIZE; + break; + } error = soopt_getm(sopt, &m); /* XXX */ if (error != NULL) break; @@ -1650,7 +1740,9 @@ do { \ req = mtod(m, caddr_t); len = m->m_len; } + lck_mtx_lock(sadb_mutex); error = ipsec6_get_policy(in6p, req, len, mp); + lck_mtx_unlock(sadb_mutex); if (error == 0) error = soopt_mcopyout(sopt, m); /*XXX*/ if (error == 0 && m) @@ -1720,7 +1812,7 @@ ip6_pcbopts(pktopt, m, so, sopt) } /* set options specified by user. */ - if (p && !suser(p->p_ucred, &p->p_acflag)) + if (p && !proc_suser(p)) priv = 1; if ((error = ip6_setpktoptions(m, opt, priv, 1)) != 0) { ip6_clearpktopts(opt, 1, -1); /* XXX: discard all options */ @@ -1869,10 +1961,10 @@ ip6_freepcbopts(pktopt) * Set the IP6 multicast options in response to user setsockopt(). */ static int -ip6_setmoptions(optname, in6p, m) - int optname; - struct inpcb* in6p; - struct mbuf *m; +ip6_setmoptions( + int optname, + struct inpcb* in6p, + struct mbuf *m) { int error = 0; u_int loop, ifindex; @@ -2005,7 +2097,7 @@ ip6_setmoptions(optname, in6p, m) * all multicast addresses. Only super user is allowed * to do this. */ - if (suser(p->p_ucred, &p->p_acflag)) + if (suser(kauth_cred_get(), 0)) { error = EACCES; break; @@ -2022,12 +2114,14 @@ ip6_setmoptions(optname, in6p, m) ifp = ifindex2ifnet[mreq->ipv6mr_interface]; + lck_mtx_lock(rt_mtx); TAILQ_FOREACH(ifa, &in_ifaddrhead, ia_link) { if (ifa->ia_ifp == ifp) { v4req.imr_interface = IA_SIN(ifa)->sin_addr; break; } } + lck_mtx_unlock(rt_mtx); if (v4req.imr_multiaddr.s_addr == 0) { /* Interface has no IPv4 address. */ @@ -2093,6 +2187,7 @@ ip6_setmoptions(optname, in6p, m) /* * See if the membership already exists. 
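 *
 * [Editor's note: not part of this patch. The credential checks in
 *  this function move from the old process-based form
 *      suser(p->p_ucred, &p->p_acflag)
 *  to the kauth form
 *      suser(kauth_cred_get(), 0)
 *  which evaluates the current thread's credential directly, so the
 *  privilege test no longer needs a struct proc threaded through
 *  the socket-option path.]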
*/ + lck_mtx_lock(nd6_mutex); for (imm = im6o->im6o_memberships.lh_first; imm != NULL; imm = imm->i6mm_chain.le_next) if (imm->i6mm_maddr->in6m_ifp == ifp && @@ -2101,6 +2196,7 @@ ip6_setmoptions(optname, in6p, m) break; if (imm != NULL) { error = EADDRINUSE; + lck_mtx_unlock(nd6_mutex); break; } /* @@ -2110,14 +2206,17 @@ ip6_setmoptions(optname, in6p, m) imm = _MALLOC(sizeof(*imm), M_IPMADDR, M_WAITOK); if (imm == NULL) { error = ENOBUFS; + lck_mtx_unlock(nd6_mutex); break; } if ((imm->i6mm_maddr = - in6_addmulti(&mreq->ipv6mr_multiaddr, ifp, &error)) == NULL) { + in6_addmulti(&mreq->ipv6mr_multiaddr, ifp, &error, 1)) == NULL) { FREE(imm, M_IPMADDR); + lck_mtx_unlock(nd6_mutex); break; } LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain); + lck_mtx_unlock(nd6_mutex); break; case IPV6_LEAVE_GROUP: @@ -2142,7 +2241,7 @@ ip6_setmoptions(optname, in6p, m) ifp = ifindex2ifnet[mreq->ipv6mr_interface]; if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) { - if (suser(p->p_ucred, &p->p_acflag)) { + if (suser(kauth_cred_get(), 0)) { error = EACCES; break; } @@ -2155,12 +2254,14 @@ ip6_setmoptions(optname, in6p, m) if (ifp != NULL) { struct in_ifaddr *ifa; + lck_mtx_lock(rt_mtx); TAILQ_FOREACH(ifa, &in_ifaddrhead, ia_link) { if (ifa->ia_ifp == ifp) { v4req.imr_interface = IA_SIN(ifa)->sin_addr; break; } } + lck_mtx_unlock(rt_mtx); } error = ip_dropmembership(imo, &v4req); @@ -2180,6 +2281,7 @@ ip6_setmoptions(optname, in6p, m) /* * Find the membership in the membership list. */ + lck_mtx_lock(nd6_mutex); for (imm = im6o->im6o_memberships.lh_first; imm != NULL; imm = imm->i6mm_chain.le_next) { if ((ifp == NULL || @@ -2191,6 +2293,7 @@ ip6_setmoptions(optname, in6p, m) if (imm == NULL) { /* Unable to resolve interface */ error = EADDRNOTAVAIL; + lck_mtx_unlock(nd6_mutex); break; } /* @@ -2198,7 +2301,8 @@ ip6_setmoptions(optname, in6p, m) * membership points. */ LIST_REMOVE(imm, i6mm_chain); - in6_delmulti(imm->i6mm_maddr); + in6_delmulti(imm->i6mm_maddr, 1); + lck_mtx_unlock(nd6_mutex); FREE(imm, M_IPMADDR); break; @@ -2210,6 +2314,7 @@ ip6_setmoptions(optname, in6p, m) /* * If all options have default values, no need to keep the mbuf. */ + lck_mtx_lock(nd6_mutex); if (im6o->im6o_multicast_ifp == NULL && im6o->im6o_multicast_hlim == ip6_defmcasthlim && im6o->im6o_multicast_loop == IPV6_DEFAULT_MULTICAST_LOOP && @@ -2225,6 +2330,7 @@ ip6_setmoptions(optname, in6p, m) ip_freemoptions(imo); in6p->inp_moptions = 0; } + lck_mtx_unlock(nd6_mutex); return(error); } @@ -2287,13 +2393,15 @@ ip6_freemoptions(im6o) if (im6o == NULL) return; - + + lck_mtx_lock(nd6_mutex); while ((imm = im6o->im6o_memberships.lh_first) != NULL) { LIST_REMOVE(imm, i6mm_chain); if (imm->i6mm_maddr) - in6_delmulti(imm->i6mm_maddr); + in6_delmulti(imm->i6mm_maddr, 1); FREE(imm, M_IPMADDR); } + lck_mtx_unlock(nd6_mutex); FREE(im6o, M_IPMOPTS); } @@ -2372,8 +2480,12 @@ ip6_setpktoptions(control, opt, priv, needcopy) ia6 = (struct in6_ifaddr *)ifa_ifwithaddr(sin6tosa(&sin6)); if (ia6 == NULL || (ia6->ia6_flags & (IN6_IFF_ANYCAST | - IN6_IFF_NOTREADY)) != 0) + IN6_IFF_NOTREADY)) != 0) { + if (ia6) ifafree(&ia6->ia_ifa); return(EADDRNOTAVAIL); + } + ifafree(&ia6->ia_ifa); + ia6 = NULL; } break; @@ -2519,10 +2631,10 @@ ip6_setpktoptions(control, opt, priv, needcopy) * pointer that might NOT be &loif -- easier than replicating that code here. 
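 *
 * [Editor's note: refcount note, not part of this patch. The
 *  ifafree() calls added in ip6_setpktoptions() above balance the
 *  reference that ifa_ifwithaddr() returns held:
 *      ia6 = (struct in6_ifaddr *)ifa_ifwithaddr(sin6tosa(&sin6));
 *      if (ia6 == NULL || (ia6->ia6_flags & bad_flags) != 0) {
 *              if (ia6) ifafree(&ia6->ia_ifa);
 *              return (EADDRNOTAVAIL);
 *      }
 *      ifafree(&ia6->ia_ifa);
 *  Dropping the reference on both the failure and success paths is
 *  what keeps each checked source address from leaking one ifaddr
 *  reference per send. (`bad_flags` here abbreviates
 *  IN6_IFF_ANYCAST | IN6_IFF_NOTREADY.)]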
*/ void -ip6_mloopback(ifp, m, dst) - struct ifnet *ifp; - struct mbuf *m; - struct sockaddr_in6 *dst; +ip6_mloopback( + struct ifnet *ifp, + struct mbuf *m, + struct sockaddr_in6 *dst) { struct mbuf *copym; struct ip6_hdr *ip6; @@ -2564,16 +2676,15 @@ ip6_mloopback(ifp, m, dst) /* Makes sure the HW checksum flags are cleaned before sending the packet */ - copym->m_pkthdr.rcvif = (struct ifnet *)0; + copym->m_pkthdr.rcvif = 0; copym->m_pkthdr.csum_data = 0; copym->m_pkthdr.csum_flags = 0; - if (lo_dl_tag == 0) - dlil_find_dltag(APPLE_IF_FAM_LOOPBACK, 0, PF_INET, &lo_dl_tag); - - if (lo_dl_tag) { + if (lo_ifp) { copym->m_pkthdr.rcvif = ifp; - dlil_output(lo_dl_tag, copym, 0, (struct sockaddr *)dst, 0); + lck_mtx_unlock(ip6_mutex); + dlil_output(lo_ifp, PF_INET6, copym, 0, (struct sockaddr *)dst, 0); + lck_mtx_lock(ip6_mutex); } else m_free(copym); #else diff --git a/bsd/netinet6/ip6_var.h b/bsd/netinet6/ip6_var.h index 254c8559f..96ca49088 100644 --- a/bsd/netinet6/ip6_var.h +++ b/bsd/netinet6/ip6_var.h @@ -69,7 +69,7 @@ #define _NETINET6_IP6_VAR_H_ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE /* * IP6 reassembly queue structure. Each fragment * being reassembled is attached to one of these structures. @@ -91,6 +91,7 @@ struct ip6q { #if notyet u_char *ip6q_nxtp; #endif + int ip6q_nfrag; /* number of fragments */ }; struct ip6asfrag { @@ -153,6 +154,7 @@ struct ip6_pktopts { /* * Control options for incoming packets */ +#endif /* KERNEL_PRIVATE */ struct ip6stat { u_quad_t ip6s_total; /* total packets received */ @@ -215,6 +217,7 @@ struct ip6stat { u_quad_t ip6s_forward_cachemiss; }; +#ifdef KERNEL_PRIVATE #ifdef KERNEL /* * IPv6 onion peeling state. @@ -244,9 +247,7 @@ struct ip6aux { * with IPsec it may not be accurate. */ }; -#endif -#ifdef KERNEL /* flags passed to ip6_output as last parameter */ #define IPV6_DADOUTPUT 0x01 /* DAD */ #define IPV6_FORWARDING 0x02 /* most of IPv6 header exists */ @@ -267,7 +268,8 @@ extern int ip6_v6only; extern struct socket *ip6_mrouter; /* multicast routing daemon */ extern int ip6_sendredirects; /* send IP redirects when forwarding? 
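[Annotation] Two things happen in the multicast-loopback hunk above: the per-family DLIL tag lookup (lo_dl_tag) is replaced by a direct pointer to the loopback interface (lo_ifp), and ip6_mutex is released across dlil_output(), presumably because the loopback path can re-enter the IPv6 stack synchronously. The core of the pattern, as it lands in ip6_mloopback():

	/* hand the copy to DLIL; drop ip6_mutex in case it re-enters ip6 */
	if (lo_ifp) {
		copym->m_pkthdr.rcvif = ifp;
		lck_mtx_unlock(ip6_mutex);
		dlil_output(lo_ifp, PF_INET6, copym, 0,
		    (struct sockaddr *)dst, 0);
		lck_mtx_lock(ip6_mutex);
	} else
		m_free(copym);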
*/ -extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */ +extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */ +extern int ip6_maxfrags; /* Maximum fragments in reassembly queue */ extern int ip6_sourcecheck; /* Verify source interface */ extern int ip6_sourcecheck_interval; /* Interval between log messages */ extern int ip6_accept_rtadv; /* Acts as a host not a router */ @@ -293,65 +295,65 @@ struct sockopt; struct inpcb; -int icmp6_ctloutput __P((struct socket *, struct sockopt *sopt)); +int icmp6_ctloutput(struct socket *, struct sockopt *sopt); struct in6_ifaddr; -void ip6_init __P((void)); -void ip6intr __P((void)); -void ip6_input __P((struct mbuf *)); -struct in6_ifaddr *ip6_getdstifaddr __P((struct mbuf *)); -void ip6_freepcbopts __P((struct ip6_pktopts *)); -void ip6_freemoptions __P((struct ip6_moptions *)); -int ip6_unknown_opt __P((u_int8_t *, struct mbuf *, int)); -char * ip6_get_prevhdr __P((struct mbuf *, int)); -int ip6_nexthdr __P((struct mbuf *, int, int, int *)); -int ip6_lasthdr __P((struct mbuf *, int, int, int *)); - -struct mbuf *ip6_addaux __P((struct mbuf *)); -struct mbuf *ip6_findaux __P((struct mbuf *)); -void ip6_delaux __P((struct mbuf *)); - -int ip6_mforward __P((struct ip6_hdr *, struct ifnet *, struct mbuf *)); -int ip6_process_hopopts __P((struct mbuf *, u_int8_t *, int, u_int32_t *, - u_int32_t *)); -void ip6_savecontrol __P((struct inpcb *, struct mbuf **, struct ip6_hdr *, - struct mbuf *)); -void ip6_notify_pmtu __P((struct inpcb *, struct sockaddr_in6 *, - u_int32_t *)); -int ip6_sysctl __P((int *, u_int, void *, size_t *, void *, size_t)); - -void ip6_forward __P((struct mbuf *, int)); - -void ip6_mloopback __P((struct ifnet *, struct mbuf *, struct sockaddr_in6 *)); -int ip6_output __P((struct mbuf *, struct ip6_pktopts *, +void ip6_init(void); +void ip6intr(void); +void ip6_input(struct mbuf *); +struct in6_ifaddr *ip6_getdstifaddr(struct mbuf *); +void ip6_freepcbopts(struct ip6_pktopts *); +void ip6_freemoptions(struct ip6_moptions *); +int ip6_unknown_opt(u_int8_t *, struct mbuf *, int, int); +char * ip6_get_prevhdr(struct mbuf *, int); +int ip6_nexthdr(struct mbuf *, int, int, int *); +int ip6_lasthdr(struct mbuf *, int, int, int *); + +struct mbuf *ip6_addaux(struct mbuf *); +struct mbuf *ip6_findaux(struct mbuf *); +void ip6_delaux(struct mbuf *); + +int ip6_mforward(struct ip6_hdr *, struct ifnet *, struct mbuf *); +int ip6_process_hopopts(struct mbuf *, u_int8_t *, int, u_int32_t *, + u_int32_t *); +void ip6_savecontrol(struct inpcb *, struct mbuf **, struct ip6_hdr *, + struct mbuf *); +void ip6_notify_pmtu(struct inpcb *, struct sockaddr_in6 *, + u_int32_t *); +int ip6_sysctl(int *, u_int, void *, size_t *, void *, size_t); + +void ip6_forward(struct mbuf *, int, int); + +void ip6_mloopback(struct ifnet *, struct mbuf *, struct sockaddr_in6 *); +int ip6_output(struct mbuf *, struct ip6_pktopts *, struct route_in6 *, int, - struct ip6_moptions *, struct ifnet **)); -int ip6_ctloutput __P((struct socket *, struct sockopt *sopt)); -void init_ip6pktopts __P((struct ip6_pktopts *)); -int ip6_setpktoptions __P((struct mbuf *, struct ip6_pktopts *, int, int)); -void ip6_clearpktopts __P((struct ip6_pktopts *, int, int)); -struct ip6_pktopts *ip6_copypktopts __P((struct ip6_pktopts *, int)); -int ip6_optlen __P((struct inpcb *)); - -int route6_input __P((struct mbuf **, int *)); - -void frag6_init __P((void)); -int frag6_input __P((struct mbuf **, int *)); -void frag6_slowtimo __P((void)); -void 
frag6_drain __P((void)); - -void rip6_init __P((void)); -int rip6_input __P((struct mbuf **mp, int *offset)); -void rip6_ctlinput __P((int, struct sockaddr *, void *)); -int rip6_ctloutput __P((struct socket *so, struct sockopt *sopt)); -int rip6_output __P((struct mbuf *, struct socket *, struct sockaddr_in6 *, struct mbuf *)); -int rip6_usrreq __P((struct socket *, - int, struct mbuf *, struct mbuf *, struct mbuf *, struct proc *)); - -int dest6_input __P((struct mbuf **, int *)); -int none_input __P((struct mbuf **, int *)); + struct ip6_moptions *, struct ifnet **, int locked); +int ip6_ctloutput(struct socket *, struct sockopt *sopt); +void init_ip6pktopts(struct ip6_pktopts *); +int ip6_setpktoptions(struct mbuf *, struct ip6_pktopts *, int, int); +void ip6_clearpktopts(struct ip6_pktopts *, int, int); +struct ip6_pktopts *ip6_copypktopts(struct ip6_pktopts *, int); +int ip6_optlen(struct inpcb *); + +int route6_input(struct mbuf **, int *); + +void frag6_init(void); +int frag6_input(struct mbuf **, int *); +void frag6_slowtimo(void); +void frag6_drain(void); + +void rip6_init(void); +int rip6_input(struct mbuf **mp, int *offset); +void rip6_ctlinput(int, struct sockaddr *, void *); +int rip6_ctloutput(struct socket *so, struct sockopt *sopt); +int rip6_output(struct mbuf *, struct socket *, struct sockaddr_in6 *, struct mbuf *); +int rip6_usrreq(struct socket *, + int, struct mbuf *, struct mbuf *, struct mbuf *, struct proc *); + +int dest6_input(struct mbuf **, int *); +int none_input(struct mbuf **, int *); #endif /* KERNEL */ -#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL_PRIVATE */ #endif /* !_NETINET6_IP6_VAR_H_ */ diff --git a/bsd/netinet6/ip6protosw.h b/bsd/netinet6/ip6protosw.h index beee88937..f0386fa39 100644 --- a/bsd/netinet6/ip6protosw.h +++ b/bsd/netinet6/ip6protosw.h @@ -71,7 +71,9 @@ #ifndef _NETINET6_IP6PROTOSW_H_ #define _NETINET6_IP6PROTOSW_H_ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE + +#ifdef KERNEL_PRIVATE +#include <kern/locks.h> /* * Protocol switch table for IPv6. 
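[Annotation] The long runs of prototype churn here and in the headers that follow are mechanical: the K&R-compatibility macro __P() is retired now that ANSI prototypes can be assumed everywhere. For reference, its classic <sys/cdefs.h> definition:

	#if defined(__STDC__) || defined(__cplusplus)
	#define	__P(protos)	protos		/* ANSI C: keep the list */
	#else
	#define	__P(protos)	()		/* K&R C: drop it */
	#endif

so that, for example,

	int	frag6_input __P((struct mbuf **, int *));

becomes

	int	frag6_input(struct mbuf **, int *);

Note that several of these hunks also rewrite trailers such as `#endif /* KERNEL_PRIVATE */` as bare `#endif KERNEL_PRIVATE`; tokens after #endif are not standard C and draw extra-tokens warnings, so the commented form remains the portable spelling.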
@@ -124,27 +126,27 @@ struct ip6protosw { short pr_protocol; /* protocol number */ unsigned int pr_flags; /* see below */ /* protocol-protocol hooks */ - int (*pr_input) __P((struct mbuf **, int *)); + int (*pr_input)(struct mbuf **, int *); /* input to protocol (from below) */ - int (*pr_output) __P((struct mbuf *m, struct socket *so, - struct sockaddr_in6 *, struct mbuf *)); + int (*pr_output)(struct mbuf *m, struct socket *so, + struct sockaddr_in6 *, struct mbuf *); /* output to protocol (from above) */ - void (*pr_ctlinput)__P((int, struct sockaddr *, void *)); + void (*pr_ctlinput)(int, struct sockaddr *, void *); /* control input (from below) */ - int (*pr_ctloutput)__P((struct socket *, struct sockopt *)); + int (*pr_ctloutput)(struct socket *, struct sockopt *); /* control output (from above) */ /* user-protocol hook */ - int (*pr_usrreq) /* user request: see list below */ - __P((struct socket *, int, struct mbuf *, - struct mbuf *, struct mbuf *, struct proc *)); + int (*pr_usrreq)(struct socket *, int, struct mbuf *, + struct mbuf *, struct mbuf *, struct proc *); + /* user request: see list below */ /* utility hooks */ - void (*pr_init) __P((void)); /* initialization hook */ - void (*pr_fasttimo) __P((void)); + void (*pr_init)(void); /* initialization hook */ + void (*pr_fasttimo)(void); /* fast timeout (200ms) */ - void (*pr_slowtimo) __P((void)); + void (*pr_slowtimo)(void); /* slow timeout (500ms) */ - void (*pr_drain) __P((void)); + void (*pr_drain)(void); /* flush any excess space possible */ #ifdef __APPLE__ /* for compat. with IPv4 protosw */ @@ -153,12 +155,19 @@ struct ip6protosw { struct pr_usrreqs *pr_usrreqs; /* supersedes pr_usrreq() */ #ifdef __APPLE__ + int (*pr_lock) (struct socket *so, int locktype, int debug); /* lock function for protocol */ + int (*pr_unlock) (struct socket *so, int locktype, int debug); /* unlock for protocol */ +#ifdef _KERN_LOCKS_H_ + lck_mtx_t * (*pr_getlock) (struct socket *so, int locktype); /* unlock for protocol */ +#else + void * (*pr_getlock) (struct socket *so, int locktype); /* unlock for protocol */ +#endif /* Filter hooks */ TAILQ_HEAD(pr6_sfilter, NFDescriptor) pr_sfilter; struct ip6protosw *pr_next; /* Chain for domain */ - u_long reserved[4]; + u_long reserved[1]; #endif }; -#endif /* __APPLE_API_PRIVATE */ -#endif +#endif KERNEL_PRIVATE +#endif _NETINET6_IP6PROTOSW_H_ diff --git a/bsd/netinet6/ipcomp.h b/bsd/netinet6/ipcomp.h index 383a67555..41ab61b85 100644 --- a/bsd/netinet6/ipcomp.h +++ b/bsd/netinet6/ipcomp.h @@ -53,18 +53,18 @@ struct ipcomp { #define IPCOMP_CPI_NEGOTIATE_MIN 256 #ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE struct ipcomp_algorithm { - int (*compress) __P((struct mbuf *, struct mbuf *, size_t *)); - int (*decompress) __P((struct mbuf *, struct mbuf *, size_t *)); + int (*compress)(struct mbuf *, struct mbuf *, size_t *); + int (*decompress)(struct mbuf *, struct mbuf *, size_t *); size_t minplen; /* minimum required length for compression */ }; struct ipsecrequest; -extern const struct ipcomp_algorithm *ipcomp_algorithm_lookup __P((int)); -extern void ipcomp4_input __P((struct mbuf *, int)); -extern int ipcomp4_output __P((struct mbuf *, struct ipsecrequest *)); -#endif /* __APPLE_API_PRIVATE */ -#endif /*KERNEL*/ +extern const struct ipcomp_algorithm *ipcomp_algorithm_lookup(int); +extern void ipcomp4_input(struct mbuf *, int); +extern int ipcomp4_output(struct mbuf *, struct ipsecrequest *); +#endif KERNEL_PRIVATE +#endif KERNEL -#endif /*_NETINET6_IPCOMP_H_*/ +#endif 
_NETINET6_IPCOMP_H_ diff --git a/bsd/netinet6/ipcomp6.h b/bsd/netinet6/ipcomp6.h index b0ca0316e..3091dc6b5 100644 --- a/bsd/netinet6/ipcomp6.h +++ b/bsd/netinet6/ipcomp6.h @@ -38,12 +38,10 @@ #define _NETINET6_IPCOMP6_H_ #include <sys/appleapiopts.h> -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -extern int ipcomp6_input __P((struct mbuf **, int *)); -extern int ipcomp6_output __P((struct mbuf *, u_char *, struct mbuf *, - struct ipsecrequest *)); -#endif /* __APPLE_API_PRIVATE */ -#endif /*KERNEL*/ +#ifdef KERNEL_PRIVATE +extern int ipcomp6_input(struct mbuf **, int *); +extern int ipcomp6_output(struct mbuf *, u_char *, struct mbuf *, + struct ipsecrequest *); +#endif KERNEL_PRIVATE #endif /*_NETINET6_IPCOMP6_H_*/ diff --git a/bsd/netinet6/ipcomp_core.c b/bsd/netinet6/ipcomp_core.c index 1ca103860..38f70861c 100644 --- a/bsd/netinet6/ipcomp_core.c +++ b/bsd/netinet6/ipcomp_core.c @@ -50,7 +50,6 @@ #include <net/if.h> #include <net/route.h> -#include <net/netisr.h> #include <net/zlib.h> #include <kern/cpu_number.h> @@ -65,11 +64,11 @@ #include <net/net_osdep.h> -static void *deflate_alloc __P((void *, u_int, u_int)); -static void deflate_free __P((void *, void *)); -static int deflate_common __P((struct mbuf *, struct mbuf *, size_t *, int)); -static int deflate_compress __P((struct mbuf *, struct mbuf *, size_t *)); -static int deflate_decompress __P((struct mbuf *, struct mbuf *, size_t *)); +static void *deflate_alloc(void *, u_int, u_int); +static void deflate_free(void *, void *); +static int deflate_common(struct mbuf *, struct mbuf *, size_t *, int); +static int deflate_compress(struct mbuf *, struct mbuf *, size_t *); +static int deflate_decompress(struct mbuf *, struct mbuf *, size_t *); /* * We need to use default window size (2^15 = 32Kbytes as of writing) for diff --git a/bsd/netinet6/ipcomp_input.c b/bsd/netinet6/ipcomp_input.c index 7ea00daf6..45821aa26 100644 --- a/bsd/netinet6/ipcomp_input.c +++ b/bsd/netinet6/ipcomp_input.c @@ -49,9 +49,9 @@ #include <net/if.h> #include <net/route.h> -#include <net/netisr.h> #include <net/zlib.h> #include <kern/cpu_number.h> +#include <kern/locks.h> #include <netinet/in.h> #include <netinet/in_systm.h> @@ -80,7 +80,7 @@ #define IPLEN_FLIPPED - +extern lck_mtx_t *sadb_mutex; void ipcomp4_input(struct mbuf *m, int off) { @@ -95,6 +95,7 @@ ipcomp4_input(struct mbuf *m, int off) size_t newlen, olen; struct secasvar *sav = NULL; + lck_mtx_lock(sadb_mutex); if (m->m_pkthdr.len < off + sizeof(struct ipcomp)) { ipseclog((LOG_DEBUG, "IPv4 IPComp input: assumption failed " @@ -153,7 +154,9 @@ ipcomp4_input(struct mbuf *m, int off) olen = m->m_pkthdr.len; newlen = m->m_pkthdr.len - off; + lck_mtx_unlock(sadb_mutex); error = (*algo->decompress)(m, m->m_next, &newlen); + lck_mtx_lock(sadb_mutex); if (error != 0) { if (error == EINVAL) ipsecstat.in_inval++; @@ -214,18 +217,22 @@ ipcomp4_input(struct mbuf *m, int off) ipsecstat.in_polvio++; goto fail; } - (*ip_protox[nxt]->pr_input)(m, off); - + lck_mtx_unlock(sadb_mutex); + ip_proto_dispatch_in(m, off, nxt, 0); + lck_mtx_lock(sadb_mutex); } else m_freem(m); m = NULL; ipsecstat.in_success++; + lck_mtx_unlock(sadb_mutex); return; fail: if (sav) key_freesav(sav); + + lck_mtx_unlock(sadb_mutex); if (m) m_freem(m); return; @@ -252,6 +259,7 @@ ipcomp6_input(mp, offp) m = *mp; off = *offp; + lck_mtx_lock(sadb_mutex); md = m_pulldown(m, off, sizeof(*ipcomp), NULL); if (!m) { m = NULL; /*already freed*/ @@ -291,7 +299,9 @@ ipcomp6_input(mp, offp) m->m_pkthdr.len -= sizeof(struct ipcomp); newlen = m->m_pkthdr.len 
- off; + lck_mtx_unlock(sadb_mutex); error = (*algo->decompress)(m, md, &newlen); + lck_mtx_lock(sadb_mutex); if (error != 0) { if (error == EINVAL) ipsec6stat.in_inval++; @@ -330,6 +340,7 @@ ipcomp6_input(mp, offp) *offp = off; *mp = m; ipsec6stat.in_success++; + lck_mtx_unlock(sadb_mutex); return nxt; fail: @@ -337,6 +348,7 @@ fail: m_freem(m); if (sav) key_freesav(sav); + lck_mtx_unlock(sadb_mutex); return IPPROTO_DONE; } #endif /* INET6 */ diff --git a/bsd/netinet6/ipcomp_output.c b/bsd/netinet6/ipcomp_output.c index a8a839b93..7a8d39b1d 100644 --- a/bsd/netinet6/ipcomp_output.c +++ b/bsd/netinet6/ipcomp_output.c @@ -49,9 +49,9 @@ #include <net/if.h> #include <net/route.h> -#include <net/netisr.h> #include <net/zlib.h> #include <kern/cpu_number.h> +#include <kern/locks.h> #include <netinet/in.h> #include <netinet/in_systm.h> @@ -78,8 +78,10 @@ #include <net/net_osdep.h> -static int ipcomp_output __P((struct mbuf *, u_char *, struct mbuf *, - struct ipsecrequest *, int)); +extern lck_mtx_t *sadb_mutex; + +static int ipcomp_output(struct mbuf *, u_char *, struct mbuf *, + struct ipsecrequest *, int); /* * Modify the packet so that the payload is compressed. @@ -204,7 +206,9 @@ ipcomp_output(m, nexthdrp, md, isr, af) mprev->m_next = md; /* compress data part */ + lck_mtx_unlock(sadb_mutex); if ((*algo->compress)(m, md, &plen) || mprev->m_next == NULL) { + lck_mtx_lock(sadb_mutex); ipseclog((LOG_ERR, "packet compression failure\n")); m = NULL; m_freem(md0); @@ -213,6 +217,7 @@ ipcomp_output(m, nexthdrp, md, isr, af) error = EINVAL; goto fail; } + lck_mtx_lock(sadb_mutex); stat->out_comphist[sav->alg_enc]++; md = mprev->m_next; diff --git a/bsd/netinet6/ipsec.c b/bsd/netinet6/ipsec.c index 9a9f6ebe8..7d8f334fd 100644 --- a/bsd/netinet6/ipsec.c +++ b/bsd/netinet6/ipsec.c @@ -47,6 +47,8 @@ #include <sys/kernel.h> #include <sys/syslog.h> #include <sys/sysctl.h> +#include <kern/locks.h> +#include <sys/kauth.h> #include <net/if.h> #include <net/route.h> @@ -111,6 +113,8 @@ int ipsec_debug = 0; #define DBG_FNC_GETPOL_ADDR NETDBG_CODE(DBG_NETIPSEC, (2 << 8)) #define DBG_FNC_IPSEC_OUT NETDBG_CODE(DBG_NETIPSEC, (3 << 8)) +extern lck_mtx_t *sadb_mutex; +extern lck_mtx_t *ip6_mutex; struct ipsecstat ipsecstat; int ip4_ah_cleartos = 1; @@ -201,43 +205,43 @@ SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_ESP_RANDPAD, esp_randpad, CTLFLAG_RW, &ip6_esp_randpad, 0, ""); #endif /* INET6 */ -static int ipsec_setspidx_mbuf - __P((struct secpolicyindex *, u_int, u_int, struct mbuf *, int)); -static int ipsec4_setspidx_inpcb __P((struct mbuf *, struct inpcb *pcb)); +static int ipsec_setspidx_mbuf(struct secpolicyindex *, u_int, u_int, + struct mbuf *, int); +static int ipsec4_setspidx_inpcb(struct mbuf *, struct inpcb *pcb); #if INET6 -static int ipsec6_setspidx_in6pcb __P((struct mbuf *, struct in6pcb *pcb)); +static int ipsec6_setspidx_in6pcb(struct mbuf *, struct in6pcb *pcb); #endif -static int ipsec_setspidx __P((struct mbuf *, struct secpolicyindex *, int)); -static void ipsec4_get_ulp __P((struct mbuf *m, struct secpolicyindex *, int)); -static int ipsec4_setspidx_ipaddr __P((struct mbuf *, struct secpolicyindex *)); +static int ipsec_setspidx(struct mbuf *, struct secpolicyindex *, int); +static void ipsec4_get_ulp(struct mbuf *m, struct secpolicyindex *, int); +static int ipsec4_setspidx_ipaddr(struct mbuf *, struct secpolicyindex *); #if INET6 -static void ipsec6_get_ulp __P((struct mbuf *m, struct secpolicyindex *, int)); -static int ipsec6_setspidx_ipaddr __P((struct mbuf *, struct secpolicyindex *)); 
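[Annotation] Both the IPComp input and output paths now release sadb_mutex around the zlib call: (de)compression is long-running and touches no SADB state, and the SA stays referenced across the window. The recurring shape, taken from the decompress side:

	/* don't hold the SADB lock across long-running zlib work */
	lck_mtx_unlock(sadb_mutex);
	error = (*algo->decompress)(m, md, &newlen);
	lck_mtx_lock(sadb_mutex);
	if (error != 0) {
		/* ... count the failure under the re-taken lock ... */
	}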
+static void ipsec6_get_ulp(struct mbuf *m, struct secpolicyindex *, int); +static int ipsec6_setspidx_ipaddr(struct mbuf *, struct secpolicyindex *); #endif -static struct inpcbpolicy *ipsec_newpcbpolicy __P((void)); -static void ipsec_delpcbpolicy __P((struct inpcbpolicy *)); -static struct secpolicy *ipsec_deepcopy_policy __P((struct secpolicy *src)); -static int ipsec_set_policy __P((struct secpolicy **pcb_sp, - int optname, caddr_t request, size_t len, int priv)); -static int ipsec_get_policy __P((struct secpolicy *pcb_sp, struct mbuf **mp)); -static void vshiftl __P((unsigned char *, int, int)); -static int ipsec_in_reject __P((struct secpolicy *, struct mbuf *)); -static size_t ipsec_hdrsiz __P((struct secpolicy *)); +static struct inpcbpolicy *ipsec_newpcbpolicy(void); +static void ipsec_delpcbpolicy(struct inpcbpolicy *); +static struct secpolicy *ipsec_deepcopy_policy(struct secpolicy *src); +static int ipsec_set_policy(struct secpolicy **pcb_sp, + int optname, caddr_t request, size_t len, int priv); +static int ipsec_get_policy(struct secpolicy *pcb_sp, struct mbuf **mp); +static void vshiftl(unsigned char *, int, int); +static int ipsec_in_reject(struct secpolicy *, struct mbuf *); +static size_t ipsec_hdrsiz(struct secpolicy *); #if INET -static struct mbuf *ipsec4_splithdr __P((struct mbuf *)); +static struct mbuf *ipsec4_splithdr(struct mbuf *); #endif #if INET6 -static struct mbuf *ipsec6_splithdr __P((struct mbuf *)); +static struct mbuf *ipsec6_splithdr(struct mbuf *); #endif #if INET -static int ipsec4_encapsulate __P((struct mbuf *, struct secasvar *)); +static int ipsec4_encapsulate(struct mbuf *, struct secasvar *); #endif #if INET6 -static int ipsec6_encapsulate __P((struct mbuf *, struct secasvar *)); +static int ipsec6_encapsulate(struct mbuf *, struct secasvar *); #endif -static struct mbuf *ipsec_addaux __P((struct mbuf *)); -static struct mbuf *ipsec_findaux __P((struct mbuf *)); -static void ipsec_optaux __P((struct mbuf *, struct mbuf *)); +static struct mbuf *ipsec_addaux(struct mbuf *); +static struct mbuf *ipsec_findaux(struct mbuf *); +static void ipsec_optaux(struct mbuf *, struct mbuf *); void ipsec_send_natt_keepalive(struct secasvar *sav); static int @@ -266,7 +270,7 @@ sysctl_def_policy SYSCTL_HANDLER_ARGS * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. - * others : error occured. + * others : error occurred. * others: a pointer to SP * * NOTE: IPv6 mapped adddress concern is implemented here. @@ -282,12 +286,13 @@ ipsec4_getpolicybysock(m, dir, so, error) struct secpolicy *currsp = NULL; /* policy on socket */ struct secpolicy *kernsp = NULL; /* policy on kernel */ + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); /* sanity check */ if (m == NULL || so == NULL || error == NULL) panic("ipsec4_getpolicybysock: NULL pointer was passed.\n"); - + if (so->so_pcb == NULL) { - /* Socket may be closing or without PCB */ + printf("ipsec4_getpolicybysock: so->so_pcb == NULL\n"); return ipsec4_getpolicybyaddr(m, dir, 0, error); } @@ -460,7 +465,7 @@ ipsec4_getpolicybysock(m, dir, so, error) * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. - * others : error occured. + * others : error occurred. 
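[Annotation] From here on, nearly every entry point of the policy engine gains the assertion below. It documents, and on kernels built with lock assertions enforces, the new caller contract that sadb_mutex is held on entry, so a caller that forgets the lock panics immediately rather than racing on the security-policy database:

	lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED);

LCK_MTX_ASSERT_NOTOWNED is the complementary check, useful in paths that are about to take the mutex themselves.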
*/ struct secpolicy * ipsec4_getpolicybyaddr(m, dir, flag, error) @@ -474,6 +479,8 @@ ipsec4_getpolicybyaddr(m, dir, flag, error) if (ipsec_bypass != 0) return 0; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (m == NULL || error == NULL) panic("ipsec4_getpolicybyaddr: NULL pointer was passed.\n"); @@ -528,7 +535,7 @@ ipsec4_getpolicybyaddr(m, dir, flag, error) * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. - * others : error occured. + * others : error occurred. * others: a pointer to SP */ struct secpolicy * @@ -542,6 +549,8 @@ ipsec6_getpolicybysock(m, dir, so, error) struct secpolicy *currsp = NULL; /* policy on socket */ struct secpolicy *kernsp = NULL; /* policy on kernel */ + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (m == NULL || so == NULL || error == NULL) panic("ipsec6_getpolicybysock: NULL pointer was passed.\n"); @@ -685,7 +694,7 @@ ipsec6_getpolicybysock(m, dir, so, error) * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. - * others : error occured. + * others : error occurred. */ #ifndef IP_FORWARDING #define IP_FORWARDING 1 @@ -700,6 +709,8 @@ ipsec6_getpolicybyaddr(m, dir, flag, error) { struct secpolicy *sp = NULL; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (m == NULL || error == NULL) panic("ipsec6_getpolicybyaddr: NULL pointer was passed.\n"); @@ -1011,7 +1022,7 @@ ipsec4_get_ulp(m, spidx, needport) uh.uh_dport; return; case IPPROTO_AH: - if (m->m_pkthdr.len > off + sizeof(ip6e)) + if (off + sizeof(ip6e) > m->m_pkthdr.len) return; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); off += (ip6e.ip6e_len + 2) << 2; @@ -1183,6 +1194,8 @@ ipsec_init_policy(so, pcb_sp) { struct inpcbpolicy *new; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check. */ if (so == NULL || pcb_sp == NULL) panic("ipsec_init_policy: NULL pointer was passed.\n"); @@ -1197,7 +1210,7 @@ ipsec_init_policy(so, pcb_sp) #ifdef __APPLE__ if (so->so_uid == 0) #else - if (so->so_cred != 0 && so->so_cred->pc_ucred->cr_uid == 0) + if (so->so_cred != 0 && !suser(so->so_cred->pc_ucred, NULL)) #endif new->priv = 1; else @@ -1233,6 +1246,8 @@ ipsec_copy_policy(old, new) if (ipsec_bypass != 0) return 0; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + sp = ipsec_deepcopy_policy(old->sp_in); if (sp) { key_freesp(new->sp_in); @@ -1323,6 +1338,8 @@ ipsec_set_policy(pcb_sp, optname, request, len, priv) struct secpolicy *newsp = NULL; int error; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check. */ if (pcb_sp == NULL || *pcb_sp == NULL || request == NULL) return EINVAL; @@ -1366,6 +1383,8 @@ ipsec_get_policy(pcb_sp, mp) struct mbuf **mp; { + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check. */ if (pcb_sp == NULL || mp == NULL) return EINVAL; @@ -1396,6 +1415,8 @@ ipsec4_set_policy(inp, optname, request, len, priv) struct secpolicy **pcb_sp; int error = 0; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check. */ if (inp == NULL || request == NULL) return EINVAL; @@ -1441,6 +1462,8 @@ ipsec4_get_policy(inp, request, len, mp) struct secpolicy *pcb_sp; int error = 0; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check. */ if (inp == NULL || request == NULL || mp == NULL) return EINVAL; @@ -1476,6 +1499,8 @@ int ipsec4_delete_pcbpolicy(inp) struct inpcb *inp; { + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check. 
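[Annotation] The one-line change in ipsec4_get_ulp()'s IPPROTO_AH case above is a genuine bug fix, not modernization: the old test returned when the AH header did fit in the packet, and fell through to m_copydata() when it did not. With the operands in the conventional order the case reads:

	case IPPROTO_AH:
		/* bail out if the AH header extends past the packet */
		if (off + sizeof(ip6e) > m->m_pkthdr.len)
			return;
		m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e);
		/* AH payload length is counted in 32-bit words, minus 2 */
		off += (ip6e.ip6e_len + 2) << 2;
		break;		/* continue parsing at the next header */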
*/ if (inp == NULL) panic("ipsec4_delete_pcbpolicy: NULL pointer was passed.\n"); @@ -1512,6 +1537,8 @@ ipsec6_set_policy(in6p, optname, request, len, priv) struct secpolicy **pcb_sp; int error = 0; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check. */ if (in6p == NULL || request == NULL) return EINVAL; @@ -1557,6 +1584,8 @@ ipsec6_get_policy(in6p, request, len, mp) struct secpolicy *pcb_sp; int error = 0; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check. */ if (in6p == NULL || request == NULL || mp == NULL) return EINVAL; @@ -1591,6 +1620,8 @@ int ipsec6_delete_pcbpolicy(in6p) struct in6pcb *in6p; { + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check. */ if (in6p == NULL) panic("ipsec6_delete_pcbpolicy: NULL pointer was passed.\n"); @@ -1626,6 +1657,8 @@ ipsec_get_reqlevel(isr) u_int level = 0; u_int esp_trans_deflev, esp_net_deflev, ah_trans_deflev, ah_net_deflev; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (isr == NULL || isr->sp == NULL) panic("ipsec_get_reqlevel: NULL pointer is passed.\n"); @@ -1685,6 +1718,7 @@ ipsec_get_reqlevel(isr) level = ah_net_deflev; else level = ah_trans_deflev; + break; case IPPROTO_IPCOMP: /* * we don't really care, as IPcomp document says that @@ -1734,6 +1768,8 @@ ipsec_in_reject(sp, m) printf("ipsec_in_reject: using SP\n"); kdebug_secpolicy(sp)); + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: @@ -1814,6 +1850,8 @@ ipsec4_in_reject_so(m, so) int error; int result; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (m == NULL) return 0; /* XXX should be panic ? */ @@ -1844,12 +1882,17 @@ ipsec4_in_reject(m, inp) struct mbuf *m; struct inpcb *inp; { + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + if (inp == NULL) return ipsec4_in_reject_so(m, NULL); if (inp->inp_socket) return ipsec4_in_reject_so(m, inp->inp_socket); else panic("ipsec4_in_reject: invalid inpcb/socket"); + + /* NOTREACHED */ + return 0; } #if INET6 @@ -1867,6 +1910,8 @@ ipsec6_in_reject_so(m, so) int error; int result; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (m == NULL) return 0; /* XXX should be panic ? */ @@ -1896,12 +1941,17 @@ ipsec6_in_reject(m, in6p) struct mbuf *m; struct in6pcb *in6p; { + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + if (in6p == NULL) return ipsec6_in_reject_so(m, NULL); if (in6p->in6p_socket) return ipsec6_in_reject_so(m, in6p->in6p_socket); else panic("ipsec6_in_reject: invalid in6p/socket"); + + /* NOTREACHED */ + return 0; } #endif @@ -1921,6 +1971,8 @@ ipsec_hdrsiz(sp) printf("ipsec_hdrsiz: using SP\n"); kdebug_secpolicy(sp)); + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: @@ -1992,6 +2044,8 @@ ipsec4_hdrsiz(m, dir, inp) int error; size_t size; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (m == NULL) return 0; /* XXX should be panic ? */ @@ -2034,6 +2088,8 @@ ipsec6_hdrsiz(m, dir, in6p) int error; size_t size; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (m == NULL) return 0; /* XXX shoud be panic ? 
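[Annotation] Note the lone `break;` added in ipsec_get_reqlevel(): the IPPROTO_AH arm previously fell straight through into IPPROTO_IPCOMP, clobbering the AH level it had just computed. After the fix the switch reads roughly as follows (the IPCOMP arm is paraphrased from the surrounding code and comment):

	case IPPROTO_AH:
		if (isr->saidx.mode == IPSEC_MODE_TUNNEL)
			level = ah_net_deflev;
		else
			level = ah_trans_deflev;
		break;			/* previously missing */
	case IPPROTO_IPCOMP:
		/*
		 * we don't really care, as the IPComp document says
		 * we shouldn't compress small packets anyway
		 */
		level = IPSEC_LEVEL_USE;
		break;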
*/ @@ -2075,6 +2131,8 @@ ipsec4_encapsulate(m, sav) size_t hlen; size_t plen; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* can't tunnel between different AFs */ if (((struct sockaddr *)&sav->sah->saidx.src)->sa_family != ((struct sockaddr *)&sav->sah->saidx.dst)->sa_family @@ -2194,6 +2252,8 @@ ipsec6_encapsulate(m, sav) struct ip6_hdr *ip6; size_t plen; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* can't tunnel between different AFs */ if (((struct sockaddr *)&sav->sah->saidx.src)->sa_family != ((struct sockaddr *)&sav->sah->saidx.dst)->sa_family @@ -2285,6 +2345,8 @@ ipsec_chkreplay(seq, sav) u_int32_t wsizeb; /* constant: bits of window size */ int frlast; /* constant: last frame */ + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (sav == NULL) panic("ipsec_chkreplay: NULL pointer was passed.\n"); @@ -2344,6 +2406,8 @@ ipsec_updatereplay(seq, sav) u_int32_t wsizeb; /* constant: bits of window size */ int frlast; /* constant: last frame */ + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (sav == NULL) panic("ipsec_chkreplay: NULL pointer was passed.\n"); @@ -2517,6 +2581,8 @@ ipsec_logsastr(sav) char *p; struct secasindex *saidx = &sav->sah->saidx; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* validity check */ if (((struct sockaddr *)&sav->sah->saidx.src)->sa_family != ((struct sockaddr *)&sav->sah->saidx.dst)->sa_family) @@ -2591,7 +2657,6 @@ ipsec4_output(state, sp, flags) struct ip *ip = NULL; struct ipsecrequest *isr = NULL; struct secasindex saidx; - int s; int error; struct sockaddr_in *dst4; struct sockaddr_in *sin; @@ -2605,6 +2670,8 @@ ipsec4_output(state, sp, flags) if (!state->dst) panic("state->dst == NULL in ipsec4_output"); + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + KERNEL_DEBUG(DBG_FNC_IPSEC_OUT | DBG_FUNC_START, 0,0,0,0,0); KEYDEBUG(KEYDEBUG_IPSEC_DATA, @@ -2687,7 +2754,6 @@ ipsec4_output(state, sp, flags) * There may be the case that SA status will be changed when * we are refering to one. So calling splsoftnet(). */ - s = splnet(); if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { /* @@ -2698,19 +2764,16 @@ ipsec4_output(state, sp, flags) ipseclog((LOG_ERR, "ipsec4_output: " "family mismatched between inner and outer spi=%u\n", (u_int32_t)ntohl(isr->sav->spi))); - splx(s); error = EAFNOSUPPORT; goto bad; } state->m = ipsec4_splithdr(state->m); if (!state->m) { - splx(s); error = ENOMEM; goto bad; } error = ipsec4_encapsulate(state->m, isr->sav); - splx(s); if (error) { state->m = NULL; goto bad; @@ -2743,8 +2806,7 @@ ipsec4_output(state, sp, flags) state->dst = (struct sockaddr *)state->ro->ro_rt->rt_gateway; dst4 = (struct sockaddr_in *)state->dst; } - } else - splx(s); + } state->m = ipsec4_splithdr(state->m); if (!state->m) { @@ -2841,7 +2903,8 @@ ipsec6_output_trans(state, nexthdrp, mprev, sp, flags, tun) KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec6_output_trans: applyed SP\n"); kdebug_secpolicy(sp)); - + + lck_mtx_lock(sadb_mutex); *tun = 0; for (isr = sp->req; isr; isr = isr->next) { if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { @@ -2900,8 +2963,10 @@ ipsec6_output_trans(state, nexthdrp, mprev, sp, flags, tun) * XXX: should we directly notify sockets via * pfctlinputs? 
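[Annotation] The deleted `int s;` declarations here and in the hunks below are the tail end of the spl migration: ipsec4_output() used to raise splnet() around the sections that dereference isr->sav, and now relies on the caller holding sadb_mutex for the whole call. Schematically:

	/* before: interrupt-level protection around SA use	*/
	/*	s = splnet();					*/
	/*	... dereference isr->sav ...			*/
	/*	splx(s);					*/

	/* after: a single lock, asserted once at entry		*/
	lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED);
	/* ... isr->sav is stable for the duration of the call ... */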
*/ + lck_mtx_unlock(ip6_mutex); icmp6_error(state->m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADMIN, 0); + lck_mtx_lock(ip6_mutex); state->m = NULL; /* icmp6_error freed the mbuf */ goto bad; } @@ -2971,9 +3036,11 @@ ipsec6_output_trans(state, nexthdrp, mprev, sp, flags, tun) if (isr != NULL) *tun = 1; + lck_mtx_unlock(sadb_mutex); return 0; bad: + lck_mtx_unlock(sadb_mutex); m_freem(state->m); state->m = NULL; return error; @@ -2994,7 +3061,6 @@ ipsec6_output_tunnel(state, sp, flags) int error = 0; int plen; struct sockaddr_in6* dst6; - int s; if (!state) panic("state == NULL in ipsec6_output_tunnel"); @@ -3003,6 +3069,8 @@ ipsec6_output_tunnel(state, sp, flags) if (!sp) panic("sp == NULL in ipsec6_output_tunnel"); + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec6_output_tunnel: applyed SP\n"); kdebug_secpolicy(sp)); @@ -3097,7 +3165,6 @@ ipsec6_output_tunnel(state, sp, flags) * There may be the case that SA status will be changed when * we are refering to one. So calling splsoftnet(). */ - s = splnet(); if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { /* @@ -3108,7 +3175,6 @@ ipsec6_output_tunnel(state, sp, flags) ipseclog((LOG_ERR, "ipsec6_output_tunnel: " "family mismatched between inner and outer, spi=%u\n", (u_int32_t)ntohl(isr->sav->spi))); - splx(s); ipsec6stat.out_inval++; error = EAFNOSUPPORT; goto bad; @@ -3116,13 +3182,11 @@ ipsec6_output_tunnel(state, sp, flags) state->m = ipsec6_splithdr(state->m); if (!state->m) { - splx(s); ipsec6stat.out_nomem++; error = ENOMEM; goto bad; } error = ipsec6_encapsulate(state->m, isr->sav); - splx(s); if (error) { state->m = 0; goto bad; @@ -3157,8 +3221,7 @@ ipsec6_output_tunnel(state, sp, flags) state->dst = (struct sockaddr *)state->ro->ro_rt->rt_gateway; dst6 = (struct sockaddr_in6 *)state->dst; } - } else - splx(s); + } state->m = ipsec6_splithdr(state->m); if (!state->m) { @@ -3311,6 +3374,8 @@ ipsec4_tunnel_validate(m, off, nxt0, sav) struct secpolicy *sp; struct ip *oip; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + #if DIAGNOSTIC if (m->m_len < sizeof(struct ip)) panic("too short mbuf on ipsec4_tunnel_validate"); @@ -3378,8 +3443,9 @@ ipsec4_tunnel_validate(m, off, nxt0, sav) sp = key_gettunnel((struct sockaddr *)&osrc, (struct sockaddr *)&odst, (struct sockaddr *)&isrc, (struct sockaddr *)&idst); - if (!sp) + if (!sp) { return 0; + } key_freesp(sp); return 1; @@ -3400,6 +3466,8 @@ ipsec6_tunnel_validate(m, off, nxt0, sav) struct secpolicy *sp; struct ip6_hdr *oip6; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + #if DIAGNOSTIC if (m->m_len < sizeof(struct ip6_hdr)) panic("too short mbuf on ipsec6_tunnel_validate"); @@ -3725,9 +3793,12 @@ ipsec_send_natt_keepalive( struct mbuf *m; struct udphdr *uh; struct ip *ip; - + int error; + if ((esp_udp_encap_port & 0xFFFF) == 0 || sav->remote_ike_port == 0) return; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; @@ -3742,17 +3813,23 @@ ipsec_send_natt_keepalive( uh = (struct udphdr*)((char*)m_mtod(m) + sizeof(struct ip)); m->m_len = sizeof(struct udpiphdr) + 1; bzero(m_mtod(m), m->m_len); - ip->ip_len = ntohs(m->m_len); + m->m_pkthdr.len = m->m_len; + + ip->ip_len = m->m_len; ip->ip_ttl = ip_defttl; ip->ip_p = IPPROTO_UDP; ip->ip_src = ((struct sockaddr_in*)&sav->sah->saidx.src)->sin_addr; ip->ip_dst = ((struct sockaddr_in*)&sav->sah->saidx.dst)->sin_addr; - uh->uh_sport = ntohs((u_short)esp_udp_encap_port); - uh->uh_dport = ntohs(sav->remote_ike_port); + uh->uh_sport = 
htons((u_short)esp_udp_encap_port); + uh->uh_dport = htons(sav->remote_ike_port); uh->uh_ulen = htons(1 + sizeof(struct udphdr)); uh->uh_sum = 0; *(u_int8_t*)((char*)m_mtod(m) + sizeof(struct ip) + sizeof(struct udphdr)) = 0xFF; - if (ip_output(m, NULL, &sav->sah->sa_route, IP_NOIPSEC, NULL) == 0) + lck_mtx_unlock(sadb_mutex); + error = ip_output(m, NULL, &sav->sah->sa_route, IP_NOIPSEC, NULL); + lck_mtx_lock(sadb_mutex); + if (error == 0) sav->natt_last_activity = natt_now; + } diff --git a/bsd/netinet6/ipsec.h b/bsd/netinet6/ipsec.h index 4fcddc80c..2a4f6dcef 100644 --- a/bsd/netinet6/ipsec.h +++ b/bsd/netinet6/ipsec.h @@ -39,10 +39,9 @@ #include <sys/appleapiopts.h> #include <net/pfkeyv2.h> +#ifdef KERNEL_PRIVATE #include <netkey/keydb.h> -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE /* * Security Policy Index * Ensure that both address families in the "src" and "dst" are same. @@ -123,8 +122,7 @@ struct secspacq { int count; /* for lifetime */ /* XXX: here is mbuf place holder to be sent ? */ }; -#endif /* __APPLE_API_PRIVATE */ -#endif /*KERNEL*/ +#endif /* KERNEL_PRIVATE */ /* according to IANA assignment, port 0x0000 and proto 0xff are reserved. */ #define IPSEC_PORT_ANY 0 @@ -179,7 +177,6 @@ struct secspacq { */ #define IPSEC_REPLAYWSIZE 32 -#ifdef __APPLE_API_UNSTABLE /* statistics for ipsec processing */ struct ipsecstat { u_quad_t in_success; /* succeeded inbound process */ @@ -209,8 +206,8 @@ struct ipsecstat { u_quad_t out_ahhist[256]; u_quad_t out_comphist[256]; }; -#endif /* __APPLE_API_UNSTABLE */ +#ifdef KERNEL_PRIVATE /* * Definitions for IPsec & Key sysctl operations. */ @@ -269,7 +266,6 @@ struct ipsecstat { } #ifdef KERNEL -#ifdef __APPLE_API_PRIVATE struct ipsec_output_state { struct mbuf *m; struct route *ro; @@ -297,59 +293,56 @@ extern int ip4_esp_randpad; #define ipseclog(x) do { if (ipsec_debug) log x; } while (0) -extern struct secpolicy *ipsec4_getpolicybysock - __P((struct mbuf *, u_int, struct socket *, int *)); -extern struct secpolicy *ipsec4_getpolicybyaddr - __P((struct mbuf *, u_int, int, int *)); +extern struct secpolicy *ipsec4_getpolicybysock(struct mbuf *, u_int, + struct socket *, int *); +extern struct secpolicy *ipsec4_getpolicybyaddr(struct mbuf *, u_int, int, + int *); struct inpcb; -extern int ipsec_init_policy __P((struct socket *so, struct inpcbpolicy **)); -extern int ipsec_copy_policy - __P((struct inpcbpolicy *, struct inpcbpolicy *)); -extern u_int ipsec_get_reqlevel __P((struct ipsecrequest *)); - -extern int ipsec4_set_policy __P((struct inpcb *inp, int optname, - caddr_t request, size_t len, int priv)); -extern int ipsec4_get_policy __P((struct inpcb *inpcb, caddr_t request, - size_t len, struct mbuf **mp)); -extern int ipsec4_delete_pcbpolicy __P((struct inpcb *)); -extern int ipsec4_in_reject_so __P((struct mbuf *, struct socket *)); -extern int ipsec4_in_reject __P((struct mbuf *, struct inpcb *)); +extern int ipsec_init_policy(struct socket *so, struct inpcbpolicy **); +extern int ipsec_copy_policy(struct inpcbpolicy *, struct inpcbpolicy *); +extern u_int ipsec_get_reqlevel(struct ipsecrequest *); + +extern int ipsec4_set_policy(struct inpcb *inp, int optname, + caddr_t request, size_t len, int priv); +extern int ipsec4_get_policy(struct inpcb *inpcb, caddr_t request, + size_t len, struct mbuf **mp); +extern int ipsec4_delete_pcbpolicy(struct inpcb *); +extern int ipsec4_in_reject_so(struct mbuf *, struct socket *); +extern int ipsec4_in_reject(struct mbuf *, struct inpcb *); struct secas; struct tcpcb; -extern int ipsec_chkreplay 
__P((u_int32_t, struct secasvar *)); -extern int ipsec_updatereplay __P((u_int32_t, struct secasvar *)); +extern int ipsec_chkreplay(u_int32_t, struct secasvar *); +extern int ipsec_updatereplay(u_int32_t, struct secasvar *); -extern size_t ipsec4_hdrsiz __P((struct mbuf *, u_int, struct inpcb *)); -extern size_t ipsec_hdrsiz_tcp __P((struct tcpcb *)); +extern size_t ipsec4_hdrsiz(struct mbuf *, u_int, struct inpcb *); +extern size_t ipsec_hdrsiz_tcp(struct tcpcb *); struct ip; -extern const char *ipsec4_logpacketstr __P((struct ip *, u_int32_t)); -extern const char *ipsec_logsastr __P((struct secasvar *)); - -extern void ipsec_dumpmbuf __P((struct mbuf *)); - -extern int ipsec4_output __P((struct ipsec_output_state *, struct secpolicy *, - int)); -extern int ipsec4_tunnel_validate __P((struct mbuf *, int, u_int, - struct secasvar *)); -extern struct mbuf *ipsec_copypkt __P((struct mbuf *)); -extern void ipsec_delaux __P((struct mbuf *)); -extern int ipsec_setsocket __P((struct mbuf *, struct socket *)); -extern struct socket *ipsec_getsocket __P((struct mbuf *)); -extern int ipsec_addhist __P((struct mbuf *, int, u_int32_t)); -extern struct ipsec_history *ipsec_gethist __P((struct mbuf *, int *)); -extern void ipsec_clearhist __P((struct mbuf *)); -#endif /* __APPLE_API_PRIVATE */ -#endif /*KERNEL*/ +extern const char *ipsec4_logpacketstr(struct ip *, u_int32_t); +extern const char *ipsec_logsastr(struct secasvar *); + +extern void ipsec_dumpmbuf(struct mbuf *); + +extern int ipsec4_output(struct ipsec_output_state *, struct secpolicy *, int); +extern int ipsec4_tunnel_validate(struct mbuf *, int, u_int, struct secasvar *); +extern struct mbuf *ipsec_copypkt(struct mbuf *); +extern void ipsec_delaux(struct mbuf *); +extern int ipsec_setsocket(struct mbuf *, struct socket *); +extern struct socket *ipsec_getsocket(struct mbuf *); +extern int ipsec_addhist(struct mbuf *, int, u_int32_t); +extern struct ipsec_history *ipsec_gethist(struct mbuf *, int *); +extern void ipsec_clearhist(struct mbuf *); +#endif KERNEL +#endif KERNEL_PRIVATE #ifndef KERNEL -extern caddr_t ipsec_set_policy __P((char *, int)); -extern int ipsec_get_policylen __P((caddr_t)); -extern char *ipsec_dump_policy __P((caddr_t, char *)); +extern caddr_t ipsec_set_policy(char *, int); +extern int ipsec_get_policylen(caddr_t); +extern char *ipsec_dump_policy(caddr_t, char *); -extern const char *ipsec_strerror __P((void)); -#endif /*!KERNEL*/ +extern const char *ipsec_strerror(void); +#endif KERNEL -#endif /*_NETINET6_IPSEC_H_*/ +#endif _NETINET6_IPSEC_H_ diff --git a/bsd/netinet6/ipsec6.h b/bsd/netinet6/ipsec6.h index 9fd4fc75a..33bcfe125 100644 --- a/bsd/netinet6/ipsec6.h +++ b/bsd/netinet6/ipsec6.h @@ -41,8 +41,7 @@ #include <net/pfkeyv2.h> #include <netkey/keydb.h> -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE extern struct ipsecstat ipsec6stat; extern struct secpolicy ip6_def_policy; extern int ip6_esp_trans_deflev; @@ -52,35 +51,34 @@ extern int ip6_ah_net_deflev; extern int ip6_ipsec_ecn; extern int ip6_esp_randpad; -extern struct secpolicy *ipsec6_getpolicybysock - __P((struct mbuf *, u_int, struct socket *, int *)); -extern struct secpolicy *ipsec6_getpolicybyaddr - __P((struct mbuf *, u_int, int, int *)); +extern struct secpolicy *ipsec6_getpolicybysock(struct mbuf *, u_int, + struct socket *, int *); +extern struct secpolicy *ipsec6_getpolicybyaddr(struct mbuf *, u_int, int, + int *); struct inpcb; -extern int ipsec6_in_reject_so __P((struct mbuf *, struct socket *)); -extern int 
ipsec6_delete_pcbpolicy __P((struct inpcb *)); -extern int ipsec6_set_policy __P((struct inpcb *inp, int optname, - caddr_t request, size_t len, int priv)); -extern int ipsec6_get_policy - __P((struct inpcb *inp, caddr_t request, size_t len, struct mbuf **mp)); -extern int ipsec6_in_reject __P((struct mbuf *, struct inpcb *)); +extern int ipsec6_in_reject_so(struct mbuf *, struct socket *); +extern int ipsec6_delete_pcbpolicy(struct inpcb *); +extern int ipsec6_set_policy(struct inpcb *inp, int optname, + caddr_t request, size_t len, int priv); +extern int ipsec6_get_policy(struct inpcb *inp, caddr_t request, size_t len, + struct mbuf **mp); +extern int ipsec6_in_reject(struct mbuf *, struct inpcb *); struct tcp6cb; -extern size_t ipsec6_hdrsiz __P((struct mbuf *, u_int, struct inpcb *)); +extern size_t ipsec6_hdrsiz(struct mbuf *, u_int, struct inpcb *); struct ip6_hdr; -extern const char *ipsec6_logpacketstr __P((struct ip6_hdr *, u_int32_t)); +extern const char *ipsec6_logpacketstr(struct ip6_hdr *, u_int32_t); -extern int ipsec6_output_trans __P((struct ipsec_output_state *, u_char *, - struct mbuf *, struct secpolicy *, int, int *)); -extern int ipsec6_output_tunnel __P((struct ipsec_output_state *, - struct secpolicy *, int)); -extern int ipsec6_tunnel_validate __P((struct mbuf *, int, u_int, - struct secasvar *)); -#endif /* __APPLE_API_PRIVATE */ -#endif /*KERNEL*/ +extern int ipsec6_output_trans(struct ipsec_output_state *, u_char *, + struct mbuf *, struct secpolicy *, int, int *); +extern int ipsec6_output_tunnel(struct ipsec_output_state *, + struct secpolicy *, int); +extern int ipsec6_tunnel_validate(struct mbuf *, int, u_int, + struct secasvar *); +#endif KERNEL_PRIVATE -#endif /*_NETINET6_IPSEC6_H_*/ +#endif _NETINET6_IPSEC6_H_ diff --git a/bsd/netinet6/mld6.c b/bsd/netinet6/mld6.c index 7b1b091ce..df44cb746 100644 --- a/bsd/netinet6/mld6.c +++ b/bsd/netinet6/mld6.c @@ -100,6 +100,7 @@ */ #define MLD6_UNSOLICITED_REPORT_INTERVAL 10 +extern lck_mtx_t *nd6_mutex; static struct ip6_pktopts ip6_opts; static int mld6_timers_are_running; static int mld6_init_done = 0 ; @@ -107,7 +108,7 @@ static int mld6_init_done = 0 ; static struct in6_addr mld6_all_nodes_linklocal = IN6ADDR_LINKLOCAL_ALLNODES_INIT; static struct in6_addr mld6_all_routers_linklocal = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; -static void mld6_sendpkt __P((struct in6_multi *, int, const struct in6_addr *)); +static void mld6_sendpkt(struct in6_multi *, int, const struct in6_addr *); void mld6_init() @@ -137,8 +138,8 @@ mld6_init() } void -mld6_start_listening(in6m) - struct in6_multi *in6m; +mld6_start_listening( + struct in6_multi *in6m) { int s = splnet(); @@ -166,8 +167,8 @@ mld6_start_listening(in6m) } void -mld6_stop_listening(in6m) - struct in6_multi *in6m; +mld6_stop_listening( + struct in6_multi *in6m) { mld6_all_nodes_linklocal.s6_addr16[1] = htons(in6m->in6m_ifp->if_index); /* XXX */ @@ -182,9 +183,9 @@ mld6_stop_listening(in6m) } void -mld6_input(m, off) - struct mbuf *m; - int off; +mld6_input( + struct mbuf *m, + int off) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct mld6_hdr *mldh; @@ -195,7 +196,7 @@ mld6_input(m, off) int timer; /* timer value in the MLD query header */ #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, sizeof(*mldh),); + IP6_EXTHDR_CHECK(m, off, sizeof(*mldh), return); mldh = (struct mld6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(mldh, struct mld6_hdr *, m, off, sizeof(*mldh)); @@ -255,6 +256,7 @@ mld6_input(m, off) * - Use the value specified in the query message 
as * the maximum timeout. */ + ifnet_lock_exclusive(ifp); IFP_TO_IA6(ifp, ia); if (ia == NULL) break; @@ -301,6 +303,7 @@ mld6_input(m, off) } } } + ifnet_lock_done(ifp); if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld6_addr)) mldh->mld6_addr.s6_addr16[1] = 0; /* XXX */ @@ -328,11 +331,13 @@ mld6_input(m, off) * If we belong to the group being reported, stop * our timer for that group. */ + ifnet_lock_shared(ifp); IN6_LOOKUP_MULTI(mldh->mld6_addr, ifp, in6m); if (in6m) { in6m->in6m_timer = 0; /* transit to idle state */ in6m->in6m_state = MLD6_OTHERLISTENER; /* clear flag */ } + ifnet_lock_done(ifp); if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld6_addr)) mldh->mld6_addr.s6_addr16[1] = 0; /* XXX */ @@ -350,7 +355,6 @@ mld6_fasttimeo() { struct in6_multi *in6m; struct in6_multistep step; - int s; /* * Quick check to see if any work needs to be done, in order @@ -359,7 +363,7 @@ mld6_fasttimeo() if (!mld6_timers_are_running) return; - s = splnet(); + lck_mtx_lock(nd6_mutex); mld6_timers_are_running = 0; IN6_FIRST_MULTI(step, in6m); while (in6m != NULL) { @@ -373,14 +377,14 @@ mld6_fasttimeo() } IN6_NEXT_MULTI(step, in6m); } - splx(s); + lck_mtx_unlock(nd6_mutex); } static void -mld6_sendpkt(in6m, type, dst) - struct in6_multi *in6m; - int type; - const struct in6_addr *dst; +mld6_sendpkt( + struct in6_multi *in6m, + int type, + const struct in6_addr *dst) { struct mbuf *mh, *md; struct mld6_hdr *mldh; @@ -458,7 +462,7 @@ mld6_sendpkt(in6m, type, dst) /* increment output statictics */ icmp6stat.icp6s_outhist[type]++; - ip6_output(mh, &ip6_opts, NULL, 0, &im6o, &outif); + ip6_output(mh, &ip6_opts, NULL, 0, &im6o, &outif, 0); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); switch (type) { diff --git a/bsd/netinet6/mld6_var.h b/bsd/netinet6/mld6_var.h index dd252c18a..cd583fef1 100644 --- a/bsd/netinet6/mld6_var.h +++ b/bsd/netinet6/mld6_var.h @@ -34,8 +34,7 @@ #define _NETINET6_MLD6_VAR_H_ #include <sys/appleapiopts.h> -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE #define MLD6_RANDOM_DELAY(X) (random() % (X) + 1) @@ -45,12 +44,11 @@ #define MLD6_OTHERLISTENER 0 #define MLD6_IREPORTEDLAST 1 -void mld6_init __P((void)); -void mld6_input __P((struct mbuf *, int)); -void mld6_start_listening __P((struct in6_multi *)); -void mld6_stop_listening __P((struct in6_multi *)); -void mld6_fasttimeo __P((void)); -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +void mld6_init(void); +void mld6_input(struct mbuf *, int); +void mld6_start_listening(struct in6_multi *); +void mld6_stop_listening(struct in6_multi *); +void mld6_fasttimeo(void); +#endif KERNEL_PRIVATE -#endif /* _NETINET6_MLD6_VAR_H_ */ +#endif _NETINET6_MLD6_VAR_H_ diff --git a/bsd/netinet6/nd6.c b/bsd/netinet6/nd6.c index baff898b4..228c6fa2e 100644 --- a/bsd/netinet6/nd6.c +++ b/bsd/netinet6/nd6.c @@ -50,6 +50,7 @@ #include <sys/syslog.h> #include <sys/protosw.h> #include <kern/queue.h> +#include <kern/lock.h> #define DONT_WARN_OBSOLETE #include <net/if.h> @@ -110,9 +111,12 @@ struct nd_prhead nd_prefix = { 0 }; int nd6_recalc_reachtm_interval = ND6_RECALC_REACHTM_INTERVAL; static struct sockaddr_in6 all1_sa; -static void nd6_slowtimo_funneled __P((void *)); -static int regen_tmpaddr __P((struct in6_ifaddr *)); +static int regen_tmpaddr(struct in6_ifaddr *); +extern lck_mtx_t *rt_mtx; +extern lck_mtx_t *ip6_mutex; +extern lck_mtx_t *nd6_mutex; +static void nd6_slowtimo(void *ignored_arg); void nd6_init() @@ -136,12 +140,12 @@ nd6_init() nd6_init_done = 1; /* start timer */ - timeout(nd6_slowtimo_funneled, (caddr_t)0, 
ND6_SLOWTIMER_INTERVAL * hz); + timeout(nd6_slowtimo, (caddr_t)0, ND6_SLOWTIMER_INTERVAL * hz); } void -nd6_ifattach(ifp) - struct ifnet *ifp; +nd6_ifattach( + struct ifnet *ifp) { /* @@ -194,15 +198,15 @@ nd6_ifattach(ifp) * changes, which means we might have to adjust the ND level MTU. */ void -nd6_setmtu(ifp) - struct ifnet *ifp; +nd6_setmtu( + struct ifnet *ifp) { #ifndef MIN #define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif struct nd_ifinfo *ndi; - u_long oldmaxmtu, oldlinkmtu, dl_tag; + u_long oldmaxmtu, oldlinkmtu; /* * Make sure IPv6 is enabled for the interface first, @@ -210,8 +214,6 @@ nd6_setmtu(ifp) */ if (ifp->if_index >= nd_ifinfo_indexlim) { - if (dlil_find_dltag(ifp->if_family, ifp->if_unit, PF_INET6, &dl_tag) != EPROTONOSUPPORT) - nd6log((LOG_INFO, "setmtu for ifp=% but nd6 is not attached\n", if_name(ifp))); return; /* we're out of bound for nd_ifinfo */ } @@ -223,6 +225,8 @@ nd6_setmtu(ifp) case IFT_ARCNET: /* XXX MTU handling needs more work */ ndi->maxmtu = MIN(60480, ifp->if_mtu); break; + case IFT_L2VLAN: /* XXX what if the VLAN isn't over ethernet? */ + case IFT_IEEE8023ADLAG: case IFT_ETHER: ndi->maxmtu = MIN(ETHERMTU, ifp->if_mtu); break; @@ -272,10 +276,10 @@ nd6_setmtu(ifp) } void -nd6_option_init(opt, icmp6len, ndopts) - void *opt; - int icmp6len; - union nd_opts *ndopts; +nd6_option_init( + void *opt, + int icmp6len, + union nd_opts *ndopts) { bzero(ndopts, sizeof(*ndopts)); ndopts->nd_opts_search = (struct nd_opt_hdr *)opt; @@ -292,8 +296,8 @@ nd6_option_init(opt, icmp6len, ndopts) * Take one ND option. */ struct nd_opt_hdr * -nd6_option(ndopts) - union nd_opts *ndopts; +nd6_option( + union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int olen; @@ -344,8 +348,8 @@ nd6_option(ndopts) * multiple options of the same type. 
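[Annotation] In nd6_setmtu() above, the new IFT_L2VLAN and IFT_IEEE8023ADLAG cases intentionally fall through to the Ethernet arm, so VLANs and 802.3ad link aggregates inherit the ETHERMTU clamp (the XXX concedes that a VLAN need not sit on Ethernet). maxmtu is the ceiling that later router-advertisement MTU processing is checked against:

	case IFT_L2VLAN:	/* XXX what if the VLAN isn't over ethernet? */
	case IFT_IEEE8023ADLAG:	/* 802.3ad aggregate: treat as Ethernet */
	case IFT_ETHER:
		ndi->maxmtu = MIN(ETHERMTU, ifp->if_mtu);
		break;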
*/ int -nd6_options(ndopts) - union nd_opts *ndopts; +nd6_options( + union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int i = 0; @@ -424,33 +428,20 @@ skip1: * ND6 timer routine to expire default route list and prefix list */ void -nd6_timer_funneled(ignored_arg) - void *ignored_arg; +nd6_timer( + void *ignored_arg) { -#ifdef __APPLE__ - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); -#endif - nd6_timer(ignored_arg); -#ifdef __APPLE__ - (void) thread_funnel_set(network_flock, FALSE); -#endif -} -void -nd6_timer(ignored_arg) - void *ignored_arg; -{ - int s; struct llinfo_nd6 *ln; struct nd_defrouter *dr; struct nd_prefix *pr; - struct ifnet *ifp; + struct ifnet *ifp = NULL; struct in6_ifaddr *ia6, *nia6; struct in6_addrlifetime *lt6; + struct timeval timenow; + + getmicrotime(&timenow); - s = splnet(); - timeout(nd6_timer_funneled, (caddr_t)0, nd6_prune * hz); ln = llinfo_nd6.ln_next; while (ln && ln != &llinfo_nd6) { @@ -471,7 +462,7 @@ nd6_timer(ignored_arg) ndi = &nd_ifinfo[ifp->if_index]; dst = (struct sockaddr_in6 *)rt_key(rt); - if (ln->ln_expire > time_second) { + if (ln->ln_expire > timenow.tv_sec) { ln = next; continue; } @@ -498,10 +489,10 @@ nd6_timer(ignored_arg) case ND6_LLINFO_INCOMPLETE: if (ln->ln_asked < nd6_mmaxtries) { ln->ln_asked++; - ln->ln_expire = time_second + + ln->ln_expire = timenow.tv_sec + nd_ifinfo[ifp->if_index].retrans / 1000; nd6_ns_output(ifp, NULL, &dst->sin6_addr, - ln, 0); + ln, 0, 0); } else { struct mbuf *m = ln->ln_hold; ln->ln_hold = NULL; @@ -526,7 +517,7 @@ nd6_timer(ignored_arg) case ND6_LLINFO_REACHABLE: if (ln->ln_expire) { ln->ln_state = ND6_LLINFO_STALE; - ln->ln_expire = time_second + nd6_gctimer; + ln->ln_expire = timenow.tv_sec + nd6_gctimer; } break; @@ -541,23 +532,23 @@ nd6_timer(ignored_arg) /* We need NUD */ ln->ln_asked = 1; ln->ln_state = ND6_LLINFO_PROBE; - ln->ln_expire = time_second + + ln->ln_expire = timenow.tv_sec + ndi->retrans / 1000; nd6_ns_output(ifp, &dst->sin6_addr, &dst->sin6_addr, - ln, 0); + ln, 0, 0); } else { ln->ln_state = ND6_LLINFO_STALE; /* XXX */ - ln->ln_expire = time_second + nd6_gctimer; + ln->ln_expire = timenow.tv_sec + nd6_gctimer; } break; case ND6_LLINFO_PROBE: if (ln->ln_asked < nd6_umaxtries) { ln->ln_asked++; - ln->ln_expire = time_second + + ln->ln_expire = timenow.tv_sec + nd_ifinfo[ifp->if_index].retrans / 1000; nd6_ns_output(ifp, &dst->sin6_addr, - &dst->sin6_addr, ln, 0); + &dst->sin6_addr, ln, 0, 0); } else { next = nd6_free(rt); } @@ -567,12 +558,13 @@ nd6_timer(ignored_arg) } /* expire default router list */ + lck_mtx_lock(nd6_mutex); dr = TAILQ_FIRST(&nd_defrouter); while (dr) { - if (dr->expire && dr->expire < time_second) { + if (dr->expire && dr->expire < timenow.tv_sec) { struct nd_defrouter *t; t = TAILQ_NEXT(dr, dr_entry); - defrtrlist_del(dr); + defrtrlist_del(dr, 1); dr = t; } else { dr = TAILQ_NEXT(dr, dr_entry); @@ -586,7 +578,7 @@ nd6_timer(ignored_arg) * rather separate address lifetimes and prefix lifetimes. */ addrloop: - for (ia6 = in6_ifaddr; ia6; ia6 = nia6) { + for (ia6 = in6_ifaddrs; ia6; ia6 = nia6) { nia6 = ia6->ia_next; /* check address lifetime */ lt6 = &ia6->ia6_lifetime; @@ -609,7 +601,7 @@ nd6_timer(ignored_arg) regen = 1; } - in6_purgeaddr(&ia6->ia_ifa); + in6_purgeaddr(&ia6->ia_ifa, 1); if (regen) goto addrloop; /* XXX: see below */ @@ -660,7 +652,7 @@ nd6_timer(ignored_arg) * since pltime is just for autoconf, pltime processing for * prefix is not necessary. 
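[Annotation] The deleted nd6_timer_funneled() wrapper marks the end of funnel-based serialization for the ND timer: callouts no longer bounce through thread_funnel_set(), shared lists are guarded by nd6_mutex instead, and wall-clock reads switch from the time_second global to a getmicrotime() sample taken once per pass. The retired trampoline, reassembled from the deletion above:

	static void
	nd6_timer_funneled(void *ignored_arg)
	{
	#ifdef __APPLE__
		boolean_t funnel_state;
		funnel_state = thread_funnel_set(network_flock, TRUE);
	#endif
		nd6_timer(ignored_arg);
	#ifdef __APPLE__
		(void) thread_funnel_set(network_flock, FALSE);
	#endif
	}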
*/ - if (pr->ndpr_expire && pr->ndpr_expire < time_second) { + if (pr->ndpr_expire && pr->ndpr_expire < timenow.tv_sec) { struct nd_prefix *t; t = pr->ndpr_next; @@ -669,23 +661,28 @@ nd6_timer(ignored_arg) * separate. NEVER perform in6_purgeaddr here. */ - prelist_remove(pr); + prelist_remove(pr, 1); pr = t; } else pr = pr->ndpr_next; } - splx(s); + lck_mtx_unlock(nd6_mutex); + timeout(nd6_timer, (caddr_t)0, nd6_prune * hz); } static int -regen_tmpaddr(ia6) - struct in6_ifaddr *ia6; /* deprecated/invalidated temporary address */ +regen_tmpaddr( + struct in6_ifaddr *ia6) /* deprecated/invalidated temporary address */ { struct ifaddr *ifa; struct ifnet *ifp; struct in6_ifaddr *public_ifa6 = NULL; + struct timeval timenow; + + getmicrotime(&timenow); ifp = ia6->ia_ifa.ifa_ifp; + ifnet_lock_exclusive(ifp); for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) { @@ -726,6 +723,7 @@ regen_tmpaddr(ia6) if (!IFA6_IS_DEPRECATED(it6)) public_ifa6 = it6; } + ifnet_lock_done(ifp); if (public_ifa6 != NULL) { int e; @@ -746,14 +744,15 @@ regen_tmpaddr(ia6) * ifp goes away. */ void -nd6_purge(ifp) - struct ifnet *ifp; +nd6_purge( + struct ifnet *ifp) { struct llinfo_nd6 *ln, *nln; struct nd_defrouter *dr, *ndr, drany; struct nd_prefix *pr, *npr; /* Nuke default router list entries toward ifp */ + lck_mtx_lock(nd6_mutex); if ((dr = TAILQ_FIRST(&nd_defrouter)) != NULL) { /* * The first entry of the list may be stored in @@ -762,11 +761,11 @@ nd6_purge(ifp) for (dr = TAILQ_NEXT(dr, dr_entry); dr; dr = ndr) { ndr = TAILQ_NEXT(dr, dr_entry); if (dr->ifp == ifp) - defrtrlist_del(dr); + defrtrlist_del(dr, 1); } dr = TAILQ_FIRST(&nd_defrouter); if (dr->ifp == ifp) - defrtrlist_del(dr); + defrtrlist_del(dr, 1); } /* Nuke prefix list entries toward ifp */ @@ -781,7 +780,7 @@ nd6_purge(ifp) * by itself. * (jinmei@kame.net 20010129) */ - prelist_remove(pr); + prelist_remove(pr, 1); } } @@ -795,6 +794,7 @@ nd6_purge(ifp) defrouter_delreq(&drany, 0); defrouter_select(); } + lck_mtx_unlock(nd6_mutex); /* * Nuke neighbor cache entries for the ifp. @@ -820,10 +820,11 @@ nd6_purge(ifp) } struct rtentry * -nd6_lookup(addr6, create, ifp) - struct in6_addr *addr6; - int create; - struct ifnet *ifp; +nd6_lookup( + struct in6_addr *addr6, + int create, + struct ifnet *ifp, + int rt_locked) { struct rtentry *rt; struct sockaddr_in6 sin6; @@ -835,7 +836,9 @@ nd6_lookup(addr6, create, ifp) #if SCOPEDROUTING sin6.sin6_scope_id = in6_addr2scopeid(ifp, addr6); #endif - rt = rtalloc1((struct sockaddr *)&sin6, create, 0UL); + if (!rt_locked) + lck_mtx_lock(rt_mtx); + rt = rtalloc1_locked((struct sockaddr *)&sin6, create, 0UL); if (rt && (rt->rt_flags & RTF_LLINFO) == 0) { /* * This is the case for the default route. @@ -844,7 +847,7 @@ nd6_lookup(addr6, create, ifp) * interface route. */ if (create) { - rtfree(rt); + rtfree_locked(rt); rt = 0; } } @@ -861,8 +864,11 @@ nd6_lookup(addr6, create, ifp) */ struct ifaddr *ifa = ifaof_ifpforaddr((struct sockaddr *)&sin6, ifp); - if (ifa == NULL) + if (ifa == NULL) { + if (!rt_locked) + lck_mtx_unlock(rt_mtx); return(NULL); + } /* * Create a new route. RTF_LLINFO is necessary @@ -870,26 +876,35 @@ nd6_lookup(addr6, create, ifp) * destination in nd6_rtrequest which will be * called in rtrequest via ifa->ifa_rtrequest. 
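[Annotation] nd6_lookup() now takes an rt_locked flag with the same convention seen earlier for nd6_mutex: the routing-table mutex is acquired only when the caller does not already hold it, and all internal work goes through the _locked variants of the route calls. The skeleton:

	if (!rt_locked)
		lck_mtx_lock(rt_mtx);
	rt = rtalloc1_locked((struct sockaddr *)&sin6, create, 0UL);
	/* ... create or validate the RTF_LLINFO entry under rt_mtx ... */
	if (!rt_locked)
		lck_mtx_unlock(rt_mtx);
	return (rt);

Note also the ifafree(ifa) added after the ifaof_ifpforaddr() call below: the lookup returns a referenced ifaddr, and the old code leaked that reference on every neighbor-cache creation.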
*/ - if ((e = rtrequest(RTM_ADD, (struct sockaddr *)&sin6, + if ((e = rtrequest_locked(RTM_ADD, (struct sockaddr *)&sin6, ifa->ifa_addr, (struct sockaddr *)&all1_sa, (ifa->ifa_flags | RTF_HOST | RTF_LLINFO) & ~RTF_CLONING, - &rt)) != 0) - log(LOG_ERR, - "nd6_lookup: failed to add route for a " - "neighbor(%s), errno=%d\n", - ip6_sprintf(addr6), e); - if (rt == NULL) + &rt)) != 0) { + if (e != EEXIST) + log(LOG_ERR, + "nd6_lookup: failed to add route for a " + "neighbor(%s), errno=%d\n", + ip6_sprintf(addr6), e); + } + ifafree(ifa); + if (rt == NULL) { + if (!rt_locked) + lck_mtx_unlock(rt_mtx); return(NULL); + } if (rt->rt_llinfo) { struct llinfo_nd6 *ln = (struct llinfo_nd6 *)rt->rt_llinfo; ln->ln_state = ND6_LLINFO_NOSTATE; } - } else + } else { + if (!rt_locked) + lck_mtx_unlock(rt_mtx); return(NULL); + } } rtunref(rt); /* @@ -908,13 +923,17 @@ nd6_lookup(addr6, create, ifp) rt->rt_gateway->sa_family != AF_LINK || rt->rt_llinfo == NULL || (ifp && rt->rt_ifa->ifa_ifp != ifp))) { + if (!rt_locked) + lck_mtx_unlock(rt_mtx); if (create) { log(LOG_DEBUG, "nd6_lookup: failed to lookup %s (if = %s)\n", ip6_sprintf(addr6), ifp ? if_name(ifp) : "unspec"); /* xxx more logs... kazu */ } return(NULL); - } + } + if (!rt_locked) + lck_mtx_unlock(rt_mtx); return(rt); } @@ -923,9 +942,10 @@ nd6_lookup(addr6, create, ifp) * XXX: should take care of the destination of a p2p link? */ int -nd6_is_addr_neighbor(addr, ifp) - struct sockaddr_in6 *addr; - struct ifnet *ifp; +nd6_is_addr_neighbor( + struct sockaddr_in6 *addr, + struct ifnet *ifp, + int rt_locked) { struct ifaddr *ifa; int i; @@ -946,27 +966,30 @@ nd6_is_addr_neighbor(addr, ifp) * If the address matches one of our addresses, * it should be a neighbor. */ + ifnet_lock_shared(ifp); for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) { if (ifa->ifa_addr->sa_family != AF_INET6) - next: continue; + continue; for (i = 0; i < 4; i++) { if ((IFADDR6(ifa).s6_addr32[i] ^ addr->sin6_addr.s6_addr32[i]) & - IFMASK6(ifa).s6_addr32[i]) - goto next; + IFMASK6(ifa).s6_addr32[i]) + continue; } + ifnet_lock_done(ifp); return(1); } + ifnet_lock_done(ifp); /* * Even if the address matches none of our addresses, it might be * in the neighbor cache. */ - if (nd6_lookup(&addr->sin6_addr, 0, ifp) != NULL) + if (nd6_lookup(&addr->sin6_addr, 0, ifp, rt_locked) != NULL) return(1); return(0); @@ -978,8 +1001,8 @@ nd6_is_addr_neighbor(addr, ifp) * Free an nd6 llinfo entry. */ struct llinfo_nd6 * -nd6_free(rt) - struct rtentry *rt; +nd6_free( + struct rtentry *rt) { struct llinfo_nd6 *ln = (struct llinfo_nd6 *)rt->rt_llinfo, *next; struct in6_addr in6 = ((struct sockaddr_in6 *)rt_key(rt))->sin6_addr; @@ -991,12 +1014,11 @@ nd6_free(rt) */ if (!ip6_forwarding && (ip6_accept_rtadv || (rt->rt_ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { - int s; - s = splnet(); + lck_mtx_lock(nd6_mutex); dr = defrouter_lookup(&((struct sockaddr_in6 *)rt_key(rt))->sin6_addr, rt->rt_ifp); - if (ln && ln->ln_router || dr) { + if ((ln && ln->ln_router) || dr) { /* * rt6_flush must be called whether or not the neighbor * is in the Default Router List. @@ -1027,7 +1049,7 @@ nd6_free(rt) * before the default router selection, we perform * the check now. */ - pfxlist_onlink_check(); + pfxlist_onlink_check(1); if (dr == TAILQ_FIRST(&nd_defrouter)) { /* @@ -1043,7 +1065,7 @@ nd6_free(rt) defrouter_select(); } } - splx(s); + lck_mtx_unlock(nd6_mutex); } /* @@ -1074,12 +1096,15 @@ nd6_free(rt) * XXX cost-effective metods? 
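Note how every early return added to nd6_lookup above is paired with a conditional lck_mtx_unlock(rt_mtx). Funneling the failure paths through one exit label makes that pairing mechanical to audit; a small sketch of the single-exit discipline:

    #include <errno.h>
    #include <pthread.h>

    static pthread_mutex_t rt_mtx = PTHREAD_MUTEX_INITIALIZER;

    /* Every path out of the function crosses exactly one unlock. */
    static int
    lookup_or_create(int key, int create, int *out)
    {
        int error = 0;

        pthread_mutex_lock(&rt_mtx);

        if (key < 0) {              /* malformed argument */
            error = EINVAL;
            goto done;
        }
        if (!create) {              /* lookup miss, creation not wanted */
            error = ENOENT;
            goto done;
        }
        *out = key;                 /* pretend the create succeeded */

    done:
        pthread_mutex_unlock(&rt_mtx);
        return error;
    }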
*/ void -nd6_nud_hint(rt, dst6, force) - struct rtentry *rt; - struct in6_addr *dst6; - int force; +nd6_nud_hint( + struct rtentry *rt, + struct in6_addr *dst6, + int force) { struct llinfo_nd6 *ln; + struct timeval timenow; + + getmicrotime(&timenow); /* * If the caller specified "rt", use that. Otherwise, resolve the @@ -1088,7 +1113,7 @@ nd6_nud_hint(rt, dst6, force) if (!rt) { if (!dst6) return; - if (!(rt = nd6_lookup(dst6, 0, NULL))) + if (!(rt = nd6_lookup(dst6, 0, NULL, 0))) return; } @@ -1116,21 +1141,23 @@ nd6_nud_hint(rt, dst6, force) ln->ln_state = ND6_LLINFO_REACHABLE; if (ln->ln_expire) - ln->ln_expire = time_second + + ln->ln_expire = timenow.tv_sec + nd_ifinfo[rt->rt_ifp->if_index].reachable; } void -nd6_rtrequest(req, rt, sa) - int req; - struct rtentry *rt; - struct sockaddr *sa; /* xxx unused */ +nd6_rtrequest( + int req, + struct rtentry *rt, + struct sockaddr *sa) /* xxx unused */ { struct sockaddr *gate = rt->rt_gateway; struct llinfo_nd6 *ln = (struct llinfo_nd6 *)rt->rt_llinfo; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; struct ifnet *ifp = rt->rt_ifp; struct ifaddr *ifa; + struct timeval timenow; + if ((rt->rt_flags & RTF_GATEWAY)) return; @@ -1148,7 +1175,7 @@ nd6_rtrequest(req, rt, sa) if (req == RTM_RESOLVE && (nd6_need_cache(ifp) == 0 || /* stf case */ - !nd6_is_addr_neighbor((struct sockaddr_in6 *)rt_key(rt), ifp))) { + !nd6_is_addr_neighbor((struct sockaddr_in6 *)rt_key(rt), ifp, 1))) { /* * FreeBSD and BSD/OS often make a cloned host route based * on a less-specific route (e.g. the default route). @@ -1167,6 +1194,7 @@ nd6_rtrequest(req, rt, sa) return; } + getmicrotime(&timenow); switch (req) { case RTM_ADD: /* @@ -1189,7 +1217,7 @@ nd6_rtrequest(req, rt, sa) SDL(gate)->sdl_type = ifp->if_type; SDL(gate)->sdl_index = ifp->if_index; if (ln) - ln->ln_expire = time_second; + ln->ln_expire = timenow.tv_sec; #if 1 if (ln && ln->ln_expire == 0) { /* kludge for desktops */ @@ -1278,7 +1306,7 @@ nd6_rtrequest(req, rt, sa) * initialized in rtrequest(), so rt_expire is 0. 
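In the reworked nd6_rtrequest the getmicrotime() call sits after the RTM_RESOLVE early return, so the cheap reject paths never touch the clock. The same ordering in isolation (field names and the state test are illustrative):

    #include <stddef.h>
    #include <sys/time.h>

    enum nbr_state { NBR_STALE, NBR_REACHABLE };

    struct nbr { enum nbr_state state; time_t expire; };

    /* Upper-layer reachability hint: bail out early, and read the
     * clock only once we know the entry will actually be updated. */
    static void
    nud_hint(struct nbr *n, int reachable_secs)
    {
        struct timeval timenow;

        if (n == NULL)
            return;                 /* no clock read on reject paths */
        if (n->state != NBR_STALE)
            return;

        gettimeofday(&timenow, NULL);
        n->state  = NBR_REACHABLE;
        n->expire = timenow.tv_sec + reachable_secs;
    }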
*/ ln->ln_state = ND6_LLINFO_NOSTATE; - ln->ln_expire = time_second; + ln->ln_expire = timenow.tv_sec; } rt->rt_flags |= RTF_LLINFO; ln->ln_next = llinfo_nd6.ln_next; @@ -1332,7 +1360,7 @@ nd6_rtrequest(req, rt, sa) llsol.s6_addr32[2] = htonl(1); llsol.s6_addr8[12] = 0xff; - if (!in6_addmulti(&llsol, ifp, &error)) { + if (!in6_addmulti(&llsol, ifp, &error, 0)) { nd6log((LOG_ERR, "%s: failed to join " "%s (errno=%d)\n", if_name(ifp), ip6_sprintf(&llsol), error)); @@ -1357,9 +1385,11 @@ nd6_rtrequest(req, rt, sa) llsol.s6_addr32[2] = htonl(1); llsol.s6_addr8[12] = 0xff; + ifnet_lock_shared(ifp); IN6_LOOKUP_MULTI(llsol, ifp, in6m); + ifnet_lock_done(ifp); if (in6m) - in6_delmulti(in6m); + in6_delmulti(in6m, 0); } nd6_inuse--; ln->ln_next->ln_prev = ln->ln_prev; @@ -1370,15 +1400,15 @@ nd6_rtrequest(req, rt, sa) if (ln->ln_hold) m_freem(ln->ln_hold); ln->ln_hold = NULL; - Free((caddr_t)ln); + R_Free((caddr_t)ln); } } int -nd6_ioctl(cmd, data, ifp) - u_long cmd; - caddr_t data; - struct ifnet *ifp; +nd6_ioctl( + u_long cmd, + caddr_t data, + struct ifnet *ifp) { struct in6_drlist *drl = (struct in6_drlist *)data; struct in6_prlist *prl = (struct in6_prlist *)data; @@ -1389,15 +1419,14 @@ nd6_ioctl(cmd, data, ifp) struct nd_prefix *pr; struct rtentry *rt; int i = 0, error = 0; - int s; switch (cmd) { case SIOCGDRLST_IN6: /* * obsolete API, use sysctl under net.inet6.icmp6 */ + lck_mtx_lock(nd6_mutex); bzero(drl, sizeof(*drl)); - s = splnet(); dr = TAILQ_FIRST(&nd_defrouter); while (dr && i < DRLSTSIZ) { drl->defrouter[i].rtaddr = dr->rtaddr; @@ -1417,7 +1446,7 @@ nd6_ioctl(cmd, data, ifp) i++; dr = TAILQ_NEXT(dr, dr_entry); } - splx(s); + lck_mtx_unlock(nd6_mutex); break; case SIOCGPRLST_IN6: /* @@ -1429,7 +1458,7 @@ nd6_ioctl(cmd, data, ifp) * how about separating ioctls into two? */ bzero(prl, sizeof(*prl)); - s = splnet(); + lck_mtx_lock(nd6_mutex); pr = nd_prefix.lh_first; while (pr && i < PRLSTSIZ) { struct nd_pfxrouter *pfr; @@ -1490,8 +1519,7 @@ nd6_ioctl(cmd, data, ifp) i++; } } - splx(s); - + lck_mtx_unlock(nd6_mutex); break; case OSIOCGIFINFO_IN6: if (!nd_ifinfo || i >= nd_ifinfo_indexlim) { @@ -1531,16 +1559,18 @@ nd6_ioctl(cmd, data, ifp) * route equals to the top of default router list */ bzero(&any, sizeof(any)); - defrouter_delreq(&any, 0); + lck_mtx_lock(nd6_mutex); + defrouter_delreq(&any, 1); defrouter_select(); + lck_mtx_unlock(nd6_mutex); /* xxx sumikawa: flush prefix list */ break; case SIOCSPFXFLUSH_IN6: { /* flush all the prefix advertised by routers */ struct nd_prefix *pr, *next; + lck_mtx_lock(nd6_mutex); - s = splnet(); for (pr = nd_prefix.lh_first; pr; pr = next) { struct in6_ifaddr *ia, *ia_next; @@ -1550,7 +1580,7 @@ nd6_ioctl(cmd, data, ifp) continue; /* XXX */ /* do we really have to remove addresses as well? */ - for (ia = in6_ifaddr; ia; ia = ia_next) { + for (ia = in6_ifaddrs; ia; ia = ia_next) { /* ia might be removed. keep the next ptr. 
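The ioctl hunks above swap splnet()/splx() critical sections for nd6_mutex. Raising the interrupt priority level only excluded code on the local CPU, so on SMP the list walks in SIOCGDRLST_IN6/SIOCGPRLST_IN6 genuinely need a lock. The shape of the conversion:

    #include <pthread.h>

    static pthread_mutex_t nd6_mutex = PTHREAD_MUTEX_INITIALIZER;

    static int defrouter_count;     /* stands in for the nd_defrouter list */

    /*
     * Uniprocessor original (shape only):
     *     s = splnet();  ... walk the list ...  splx(s);
     * splnet() masked interrupts on one CPU; another CPU could still
     * mutate the list, hence the mutex in the patched handlers.
     */
    static int
    snapshot_router_count(void)
    {
        int n;

        pthread_mutex_lock(&nd6_mutex);
        n = defrouter_count;        /* coherent view of shared state */
        pthread_mutex_unlock(&nd6_mutex);
        return n;
    }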
*/ ia_next = ia->ia_next; @@ -1558,11 +1588,11 @@ nd6_ioctl(cmd, data, ifp) continue; if (ia->ia6_ndpr == pr) - in6_purgeaddr(&ia->ia_ifa); + in6_purgeaddr(&ia->ia_ifa, 1); } - prelist_remove(pr); + prelist_remove(pr, 1); } - splx(s); + lck_mtx_unlock(nd6_mutex); break; } case SIOCSRTRFLUSH_IN6: @@ -1570,7 +1600,7 @@ nd6_ioctl(cmd, data, ifp) /* flush all the default routers */ struct nd_defrouter *dr, *next; - s = splnet(); + lck_mtx_lock(nd6_mutex); if ((dr = TAILQ_FIRST(&nd_defrouter)) != NULL) { /* * The first entry of the list may be stored in @@ -1578,11 +1608,11 @@ nd6_ioctl(cmd, data, ifp) */ for (dr = TAILQ_NEXT(dr, dr_entry); dr; dr = next) { next = TAILQ_NEXT(dr, dr_entry); - defrtrlist_del(dr); + defrtrlist_del(dr, 1); } - defrtrlist_del(TAILQ_FIRST(&nd_defrouter)); + defrtrlist_del(TAILQ_FIRST(&nd_defrouter), 1); } - splx(s); + lck_mtx_unlock(nd6_mutex); break; } case SIOCGNBRINFO_IN6: @@ -1602,10 +1632,8 @@ nd6_ioctl(cmd, data, ifp) *idp = htons(ifp->if_index); } - s = splnet(); - if ((rt = nd6_lookup(&nb_addr, 0, ifp)) == NULL) { + if ((rt = nd6_lookup(&nb_addr, 0, ifp, 0)) == NULL) { error = EINVAL; - splx(s); break; } ln = (struct llinfo_nd6 *)rt->rt_llinfo; @@ -1613,7 +1641,6 @@ nd6_ioctl(cmd, data, ifp) nbi->asked = ln->ln_asked; nbi->isrouter = ln->ln_router; nbi->expire = ln->ln_expire; - splx(s); break; } @@ -1632,13 +1659,13 @@ nd6_ioctl(cmd, data, ifp) * on reception of inbound ND6 packets. (RS/RA/NS/redirect) */ struct rtentry * -nd6_cache_lladdr(ifp, from, lladdr, lladdrlen, type, code) - struct ifnet *ifp; - struct in6_addr *from; - char *lladdr; - int lladdrlen; - int type; /* ICMP6 type */ - int code; /* type dependent information */ +nd6_cache_lladdr( + struct ifnet *ifp, + struct in6_addr *from, + char *lladdr, + int lladdrlen, + int type, /* ICMP6 type */ + int code) /* type dependent information */ { struct rtentry *rt = NULL; struct llinfo_nd6 *ln = NULL; @@ -1648,6 +1675,7 @@ nd6_cache_lladdr(ifp, from, lladdr, lladdrlen, type, code) int olladdr; int llchange; int newstate = 0; + struct timeval timenow; if (!ifp) panic("ifp == NULL in nd6_cache_lladdr"); @@ -1667,8 +1695,10 @@ nd6_cache_lladdr(ifp, from, lladdr, lladdrlen, type, code) * Spec says nothing in sections for RA, RS and NA. There's small * description on it in NS section (RFC 2461 7.2.3). */ + getmicrotime(&timenow); - rt = nd6_lookup(from, 0, ifp); + lck_mtx_lock(rt_mtx); + rt = nd6_lookup(from, 0, ifp, 1); if (!rt) { #if 0 /* nothing must be done if there's no lladdr */ @@ -1676,16 +1706,20 @@ nd6_cache_lladdr(ifp, from, lladdr, lladdrlen, type, code) return NULL; #endif - rt = nd6_lookup(from, 1, ifp); + rt = nd6_lookup(from, 1, ifp, 1); is_newentry = 1; } else { /* do nothing if static ndp is set */ - if (rt->rt_flags & RTF_STATIC) + if (rt->rt_flags & RTF_STATIC) { + lck_mtx_unlock(rt_mtx); return NULL; + } is_newentry = 0; } - if (!rt) + lck_mtx_unlock(rt_mtx); + + if (!rt) return NULL; if ((rt->rt_flags & (RTF_GATEWAY | RTF_LLINFO)) != RTF_LLINFO) { fail: @@ -1758,7 +1792,7 @@ fail: * we must set the timer now, although it is actually * meaningless. 
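nd6_cache_lladdr now takes rt_mtx once and issues both nd6_lookup calls (the miss probe and the create) under that single hold, closing the window in which another thread could insert the same neighbor between them. A self-contained model of the lookup-then-create pairing:

    #include <pthread.h>
    #include <stdlib.h>

    static pthread_mutex_t rt_mtx = PTHREAD_MUTEX_INITIALIZER;

    struct rtentry { int key; struct rtentry *next; };
    static struct rtentry *rt_table;    /* protected by rt_mtx */

    static struct rtentry *
    find_locked(int key)
    {
        struct rtentry *rt;

        for (rt = rt_table; rt != NULL; rt = rt->next)
            if (rt->key == key)
                return rt;
        return NULL;
    }

    /* One hold spans miss and insert: two racing threads cannot both
     * miss and then create duplicate entries for the same key. */
    static struct rtentry *
    lookup_or_create(int key)
    {
        struct rtentry *rt;

        pthread_mutex_lock(&rt_mtx);
        rt = find_locked(key);
        if (rt == NULL && (rt = calloc(1, sizeof(*rt))) != NULL) {
            rt->key = key;
            rt->next = rt_table;
            rt_table = rt;
        }
        pthread_mutex_unlock(&rt_mtx);
        return rt;
    }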
*/ - ln->ln_expire = time_second + nd6_gctimer; + ln->ln_expire = timenow.tv_sec + nd6_gctimer; if (ln->ln_hold) { /* @@ -1767,12 +1801,12 @@ fail: */ nd6_output(ifp, ifp, ln->ln_hold, (struct sockaddr_in6 *)rt_key(rt), - rt); + rt, 0); ln->ln_hold = NULL; } } else if (ln->ln_state == ND6_LLINFO_INCOMPLETE) { /* probe right away */ - ln->ln_expire = time_second; + ln->ln_expire = timenow.tv_sec; } } @@ -1856,22 +1890,23 @@ fail: * for those are not autoconfigured hosts, we explicitly avoid such * cases for safety. */ - if (do_update && ln->ln_router && !ip6_forwarding && (ip6_accept_rtadv || (ifp->if_eflags & IFEF_ACCEPT_RTADVD))) + if (do_update && ln->ln_router && !ip6_forwarding && (ip6_accept_rtadv || (ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { + lck_mtx_lock(nd6_mutex); defrouter_select(); + lck_mtx_unlock(nd6_mutex); + } return rt; } static void -nd6_slowtimo(ignored_arg) - void *ignored_arg; +nd6_slowtimo( + void *ignored_arg) { - int s = splnet(); int i; struct nd_ifinfo *nd6if; - s = splnet(); - timeout(nd6_slowtimo_funneled, (caddr_t)0, ND6_SLOWTIMER_INTERVAL * hz); + lck_mtx_lock(nd6_mutex); for (i = 1; i < if_index + 1; i++) { if (!nd_ifinfo || i >= nd_ifinfo_indexlim) continue; @@ -1888,37 +1923,27 @@ nd6_slowtimo(ignored_arg) nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable); } } - splx(s); + lck_mtx_unlock(nd6_mutex); + timeout(nd6_slowtimo, (caddr_t)0, ND6_SLOWTIMER_INTERVAL * hz); } -static void -nd6_slowtimo_funneled(ignored_arg) - void *ignored_arg; -{ -#ifdef __APPLE__ - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); -#endif - nd6_slowtimo(ignored_arg); -#ifdef __APPLE__ - (void) thread_funnel_set(network_flock, FALSE); -#endif -} #define senderr(e) { error = (e); goto bad;} int -nd6_output(ifp, origifp, m0, dst, rt0) - struct ifnet *ifp; - struct ifnet *origifp; - struct mbuf *m0; - struct sockaddr_in6 *dst; - struct rtentry *rt0; +nd6_output( + struct ifnet *ifp, + struct ifnet *origifp, + struct mbuf *m0, + struct sockaddr_in6 *dst, + struct rtentry *rt0, + int locked) { struct mbuf *m = m0; struct rtentry *rt = rt0; struct sockaddr_in6 *gw6 = NULL; struct llinfo_nd6 *ln = NULL; int error = 0; + struct timeval timenow; if (IN6_IS_ADDR_MULTICAST(&dst->sin6_addr)) goto sendpkt; @@ -1929,19 +1954,23 @@ nd6_output(ifp, origifp, m0, dst, rt0) /* * next hop determination. This routine is derived from ether_outpout. */ + lck_mtx_lock(rt_mtx); if (rt) { if ((rt->rt_flags & RTF_UP) == 0) { - if ((rt0 = rt = rtalloc1((struct sockaddr *)dst, 1, 0UL)) != + if ((rt0 = rt = rtalloc1_locked((struct sockaddr *)dst, 1, 0UL)) != NULL) { rtunref(rt); if (rt->rt_ifp != ifp) { /* XXX: loop care? */ + lck_mtx_unlock(rt_mtx); return nd6_output(ifp, origifp, m0, - dst, rt); + dst, rt, locked); } - } else + } else { + lck_mtx_unlock(rt_mtx); senderr(EHOSTUNREACH); + } } if (rt->rt_flags & RTF_GATEWAY) { @@ -1955,13 +1984,14 @@ nd6_output(ifp, origifp, m0, dst, rt0) * if the gateway is our own address, which is * sometimes used to install a route to a p2p link. */ - if (!nd6_is_addr_neighbor(gw6, ifp) || + if (!nd6_is_addr_neighbor(gw6, ifp, 1) || in6ifa_ifpwithaddr(ifp, &gw6->sin6_addr)) { /* * We allow this kind of tricky route only * when the outgoing interface is p2p. * XXX: we may need a more generic rule here. 
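With the funneled trampoline deleted, nd6_slowtimo re-queues itself directly, and the timeout() call moves to after lck_mtx_unlock(nd6_mutex), so a callout that fires immediately never contends with a handler that still holds the lock. A sketch of that ordering, with a stub scheduler:

    #include <pthread.h>

    static pthread_mutex_t nd6_mutex = PTHREAD_MUTEX_INITIALIZER;
    static int reachable_time;      /* protected by nd6_mutex */

    /* Stub: a real system would enqueue a callout for 'ticks' later. */
    static void
    schedule_after(void (*fn)(void *), void *arg, int ticks)
    {
        (void)fn; (void)arg; (void)ticks;
    }

    static void
    slowtimo(void *unused)
    {
        (void)unused;

        pthread_mutex_lock(&nd6_mutex);
        if (reachable_time == 0)
            reachable_time = 30;    /* recompute, cf. ND_COMPUTE_RTIME */
        pthread_mutex_unlock(&nd6_mutex);

        /* Rearm only after the lock is dropped, as in the patch. */
        schedule_after(slowtimo, NULL, 100);
    }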
*/ + lck_mtx_unlock(rt_mtx); if ((ifp->if_flags & IFF_POINTOPOINT) == 0) senderr(EHOSTUNREACH); @@ -1971,10 +2001,12 @@ nd6_output(ifp, origifp, m0, dst, rt0) if (rt->rt_gwroute == 0) goto lookup; if (((rt = rt->rt_gwroute)->rt_flags & RTF_UP) == 0) { - rtfree(rt); rt = rt0; - lookup: rt->rt_gwroute = rtalloc1(rt->rt_gateway, 1, 0UL); - if ((rt = rt->rt_gwroute) == 0) + rtfree_locked(rt); rt = rt0; + lookup: rt->rt_gwroute = rtalloc1_locked(rt->rt_gateway, 1, 0UL); + if ((rt = rt->rt_gwroute) == 0) { + lck_mtx_unlock(rt_mtx); senderr(EHOSTUNREACH); + } } } } @@ -1995,10 +2027,11 @@ nd6_output(ifp, origifp, m0, dst, rt0) * the condition below is not very efficient. But we believe * it is tolerable, because this should be a rare case. */ - if (nd6_is_addr_neighbor(dst, ifp) && - (rt = nd6_lookup(&dst->sin6_addr, 1, ifp)) != NULL) + if (nd6_is_addr_neighbor(dst, ifp, 1) && + (rt = nd6_lookup(&dst->sin6_addr, 1, ifp, 1)) != NULL) ln = (struct llinfo_nd6 *)rt->rt_llinfo; } + lck_mtx_unlock(rt_mtx); if (!ln || !rt) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0 && !(nd_ifinfo[ifp->if_index].flags & ND6_IFF_PERFORMNUD)) { @@ -2012,11 +2045,13 @@ nd6_output(ifp, origifp, m0, dst, rt0) goto sendpkt; /* send anyway */ } + getmicrotime(&timenow); + /* We don't have to do link-layer address resolution on a p2p link. */ if ((ifp->if_flags & IFF_POINTOPOINT) != 0 && ln->ln_state < ND6_LLINFO_REACHABLE) { ln->ln_state = ND6_LLINFO_STALE; - ln->ln_expire = time_second + nd6_gctimer; + ln->ln_expire = timenow.tv_sec + nd6_gctimer; } /* @@ -2029,7 +2064,7 @@ nd6_output(ifp, origifp, m0, dst, rt0) if (ln->ln_state == ND6_LLINFO_STALE) { ln->ln_asked = 0; ln->ln_state = ND6_LLINFO_DELAY; - ln->ln_expire = time_second + nd6_delay; + ln->ln_expire = timenow.tv_sec + nd6_delay; } /* @@ -2056,11 +2091,11 @@ nd6_output(ifp, origifp, m0, dst, rt0) ln->ln_hold = m; if (ln->ln_expire) { if (ln->ln_asked < nd6_mmaxtries && - ln->ln_expire < time_second) { + ln->ln_expire < timenow.tv_sec) { ln->ln_asked++; - ln->ln_expire = time_second + + ln->ln_expire = timenow.tv_sec + nd_ifinfo[ifp->if_index].retrans / 1000; - nd6_ns_output(ifp, NULL, &dst->sin6_addr, ln, 0); + nd6_ns_output(ifp, NULL, &dst->sin6_addr, ln, 0, locked); } } return(0); @@ -2075,7 +2110,12 @@ nd6_output(ifp, origifp, m0, dst, rt0) if ((ifp->if_flags & IFF_LOOPBACK) != 0) { m->m_pkthdr.rcvif = origifp; /* forwarding rules require the original scope_id */ - return (dlil_output(ifptodlt(origifp, PF_INET6), m, (caddr_t)rt, (struct sockaddr *)dst,0)); + if (locked) + lck_mtx_unlock(ip6_mutex); + error = dlil_output(origifp, PF_INET6, m, (caddr_t)rt, (struct sockaddr *)dst, 0); + if (locked) + lck_mtx_lock(ip6_mutex); + return error; } else { /* Do not allow loopback address to wind up on a wire */ struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); @@ -2084,18 +2124,24 @@ nd6_output(ifp, origifp, m0, dst, rt0) IN6_IS_ADDR_LOOPBACK(&ip6->ip6_dst))) { ip6stat.ip6s_badscope++; /* - * Simply drop the packet just like a firewall -- we do not want the - * the application to feel the pain, not yet... - * Returning ENETUNREACH like ip6_output does in some similar cases - * could startle the otherwise clueless process that specifies + * Do not simply drop the packet just like a firewall -- we want the + * the application to feel the pain. + * Return ENETUNREACH like ip6_output does in some similar cases. + * This can startle the otherwise clueless process that specifies * loopback as the source address. 
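The output-path logic above is the RFC 2461 neighbor state machine with only the clock source changed: a first transmit against a STALE entry starts the Delay phase, and an INCOMPLETE entry retransmits solicitations until nd6_mmaxtries. A compact model of those two transitions (the constants are illustrative stand-ins for nd6_delay and retrans/1000):

    #include <time.h>

    enum nd6_state { INCOMPLETE, REACHABLE, STALE, DELAY, PROBE };

    struct nbr { enum nd6_state state; time_t expire; int asked; };

    #define DELAY_SECS   5          /* cf. nd6_delay */
    #define RETRANS_SECS 1          /* cf. retrans / 1000 */

    static void
    on_transmit(struct nbr *n, time_t now, int mmaxtries)
    {
        if (n->state == STALE) {            /* first use: start Delay */
            n->asked = 0;
            n->state = DELAY;
            n->expire = now + DELAY_SECS;
        }
        if (n->state == INCOMPLETE && n->expire != 0 &&
            n->asked < mmaxtries && n->expire < now) {
            n->asked++;                     /* resend the solicitation */
            n->expire = now + RETRANS_SECS;
        }
    }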
*/ + error = ENETUNREACH; goto bad; } } - m->m_pkthdr.rcvif = (struct ifnet *)0; - return (dlil_output(ifptodlt(ifp, PF_INET6), m, (caddr_t)rt, (struct sockaddr *)dst, 0)); + m->m_pkthdr.rcvif = 0; + if (locked) + lck_mtx_unlock(ip6_mutex); + error = dlil_output(ifp, PF_INET6, m, (caddr_t)rt, (struct sockaddr *)dst, 0); + if (locked) + lck_mtx_lock(ip6_mutex); + return(error); #else if ((ifp->if_flags & IFF_LOOPBACK) != 0) { return((*ifp->if_output)(origifp, m, (struct sockaddr *)dst, @@ -2112,8 +2158,8 @@ nd6_output(ifp, origifp, m0, dst, rt0) #undef senderr int -nd6_need_cache(ifp) - struct ifnet *ifp; +nd6_need_cache( + struct ifnet *ifp) { /* * XXX: we currently do not make neighbor cache on any interface @@ -2127,9 +2173,8 @@ nd6_need_cache(ifp) case IFT_ETHER: case IFT_FDDI: case IFT_IEEE1394: -#if IFT_L2VLAN case IFT_L2VLAN: -#endif + case IFT_IEEE8023ADLAG: #if IFT_IEEE80211 case IFT_IEEE80211: #endif @@ -2141,12 +2186,12 @@ nd6_need_cache(ifp) } int -nd6_storelladdr(ifp, rt, m, dst, desten) - struct ifnet *ifp; - struct rtentry *rt; - struct mbuf *m; - struct sockaddr *dst; - u_char *desten; +nd6_storelladdr( + struct ifnet *ifp, + struct rtentry *rt, + struct mbuf *m, + struct sockaddr *dst, + u_char *desten) { int i; struct sockaddr_dl *sdl; @@ -2155,9 +2200,8 @@ nd6_storelladdr(ifp, rt, m, dst, desten) switch (ifp->if_type) { case IFT_ETHER: case IFT_FDDI: -#if IFT_L2VLAN - case IFT_L2VLAN: -#endif + case IFT_L2VLAN: + case IFT_IEEE8023ADLAG: #if IFT_IEEE80211 case IFT_IEEE80211: #endif @@ -2194,6 +2238,69 @@ nd6_storelladdr(ifp, rt, m, dst, desten) bcopy(LLADDR(sdl), desten, sdl->sdl_alen); return(1); } + +extern errno_t arp_route_to_gateway_route(const struct sockaddr *net_dest, + route_t hint, route_t *out_route); + +errno_t +nd6_lookup_ipv6( + ifnet_t ifp, + const struct sockaddr_in6 *ip6_dest, + struct sockaddr_dl *ll_dest, + size_t ll_dest_len, + route_t hint, + mbuf_t packet) +{ + route_t route = hint; + errno_t result = 0; + struct sockaddr_dl *sdl = NULL; + size_t copy_len; + + if (ip6_dest->sin6_family != AF_INET6) + return EAFNOSUPPORT; + + if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING)) + return ENETDOWN; + + if (hint) { + result = arp_route_to_gateway_route((const struct sockaddr*)ip6_dest, hint, &route); + if (result != 0) + return result; + } + + if ((packet->m_flags & M_MCAST) != 0) { + return dlil_resolve_multi(ifp, (const struct sockaddr*)ip6_dest, + ll_dest, ll_dest_len); + } + + if (route == NULL) { + /* this could happen, if we could not allocate memory */ + return ENOBUFS; + } + + lck_mtx_lock(rt_mtx); + + if (route->rt_gateway->sa_family != AF_LINK) { + printf("nd6_lookup_ipv6: gateway address not AF_LINK\n"); + result = EADDRNOTAVAIL; + goto done; + } + + sdl = SDL(route->rt_gateway); + if (sdl->sdl_alen == 0) { + /* this should be impossible, but we bark here for debugging */ + printf("nd6_storelladdr: sdl_alen == 0\n"); + result = EHOSTUNREACH; + } + + copy_len = sdl->sdl_len <= ll_dest_len ? 
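Both dlil_output() call sites gain the same bracket: if the caller entered with ip6_mutex held (the new locked parameter), the mutex is dropped for the duration of the downcall and retaken afterwards, so the driver layer never runs under an IP-layer lock. The pattern in isolation:

    #include <pthread.h>

    static pthread_mutex_t ip6_mutex = PTHREAD_MUTEX_INITIALIZER;

    static int
    driver_output(int pkt)          /* stub for dlil_output() */
    {
        return pkt >= 0 ? 0 : -1;
    }

    /* 'locked' records whether the caller owns ip6_mutex; release it
     * across the downcall and reacquire before returning. */
    static int
    output_packet(int pkt, int locked)
    {
        int error;

        if (locked)
            pthread_mutex_unlock(&ip6_mutex);
        error = driver_output(pkt);
        if (locked)
            pthread_mutex_lock(&ip6_mutex);
        return error;
    }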
sdl->sdl_len : ll_dest_len; + bcopy(sdl, ll_dest, copy_len); + +done: + lck_mtx_unlock(rt_mtx); + return result; +} + #ifndef __APPLE__ static int nd6_sysctl_drlist SYSCTL_HANDLER_ARGS; static int nd6_sysctl_prlist SYSCTL_HANDLER_ARGS; @@ -2215,6 +2322,7 @@ nd6_sysctl_drlist SYSCTL_HANDLER_ARGS return EPERM; error = 0; + lck_mtx_lock(nd6_mutex); for (dr = TAILQ_FIRST(&nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { @@ -2242,6 +2350,7 @@ nd6_sysctl_drlist SYSCTL_HANDLER_ARGS if (error) break; } + lck_mtx_unlock(nd6_mutex); return error; } @@ -2257,6 +2366,8 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS return EPERM; error = 0; + lck_mtx_lock(nd6_mutex); + for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { u_short advrtrs; size_t advance; @@ -2316,6 +2427,7 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS if (error) break; } + lck_mtx_unlock(nd6_mutex); return error; } #endif diff --git a/bsd/netinet6/nd6.h b/bsd/netinet6/nd6.h index d774afb3a..8f525ce33 100644 --- a/bsd/netinet6/nd6.h +++ b/bsd/netinet6/nd6.h @@ -41,6 +41,7 @@ #include <sys/queue.h> +#ifdef KERNEL_PRIVATE struct llinfo_nd6 { struct llinfo_nd6 *ln_next; struct llinfo_nd6 *ln_prev; @@ -52,6 +53,7 @@ struct llinfo_nd6 { short ln_router; /* 2^0: ND6 router bit */ int ln_byhint; /* # of times we made it reachable by UL hint */ }; +#endif /* KERNEL_PRIVATE */ #define ND6_LLINFO_NOSTATE -2 /* @@ -185,8 +187,7 @@ struct in6_ndifreq { #define ND6_INFINITE_LIFETIME 0xffffffff -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE /* node constants */ #define MAX_REACHABLE_TIME 3600000 /* msec */ #define REACHABLE_TIME 30000 /* msec */ @@ -338,67 +339,96 @@ union nd_opts { /* XXX: need nd6_var.h?? */ /* nd6.c */ -void nd6_init __P((void)); -void nd6_ifattach __P((struct ifnet *)); -int nd6_is_addr_neighbor __P((struct sockaddr_in6 *, struct ifnet *)); -void nd6_option_init __P((void *, int, union nd_opts *)); -struct nd_opt_hdr *nd6_option __P((union nd_opts *)); -int nd6_options __P((union nd_opts *)); -struct rtentry *nd6_lookup __P((struct in6_addr *, int, struct ifnet *)); -void nd6_setmtu __P((struct ifnet *)); -void nd6_timer __P((void *)); -void nd6_purge __P((struct ifnet *)); -struct llinfo_nd6 *nd6_free __P((struct rtentry *)); -void nd6_nud_hint __P((struct rtentry *, struct in6_addr *, int)); -int nd6_resolve __P((struct ifnet *, struct rtentry *, - struct mbuf *, struct sockaddr *, u_char *)); -void nd6_rtrequest __P((int, struct rtentry *, struct sockaddr *)); -int nd6_ioctl __P((u_long, caddr_t, struct ifnet *)); -struct rtentry *nd6_cache_lladdr __P((struct ifnet *, struct in6_addr *, - char *, int, int, int)); -int nd6_output __P((struct ifnet *, struct ifnet *, struct mbuf *, - struct sockaddr_in6 *, struct rtentry *)); -int nd6_storelladdr __P((struct ifnet *, struct rtentry *, struct mbuf *, - struct sockaddr *, u_char *)); -int nd6_need_cache __P((struct ifnet *)); +void nd6_init(void); +void nd6_ifattach(struct ifnet *); +int nd6_is_addr_neighbor(struct sockaddr_in6 *, struct ifnet *, int); +void nd6_option_init(void *, int, union nd_opts *); +struct nd_opt_hdr *nd6_option(union nd_opts *); +int nd6_options(union nd_opts *); +struct rtentry *nd6_lookup(struct in6_addr *, int, struct ifnet *, int); +void nd6_setmtu(struct ifnet *); +void nd6_timer(void *); +void nd6_purge(struct ifnet *); +struct llinfo_nd6 *nd6_free(struct rtentry *); +void nd6_nud_hint(struct rtentry *, struct in6_addr *, int); +int nd6_resolve(struct ifnet *, struct rtentry *, + struct mbuf *, struct sockaddr *, u_char *); +void 
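The copy at the end of nd6_lookup_ipv6 clamps to the smaller of the stored sockaddr_dl length and the caller's buffer, trusting neither side alone. The same guard as a standalone helper:

    #include <string.h>

    /* Copy at most dst_len bytes: a short destination yields a
     * truncated, in-bounds copy instead of an overflow. */
    static size_t
    bounded_copy(void *dst, size_t dst_len, const void *src, size_t src_len)
    {
        size_t copy_len = src_len <= dst_len ? src_len : dst_len;

        memcpy(dst, src, copy_len);
        return copy_len;
    }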
nd6_rtrequest(int, struct rtentry *, struct sockaddr *); +int nd6_ioctl(u_long, caddr_t, struct ifnet *); +struct rtentry *nd6_cache_lladdr(struct ifnet *, struct in6_addr *, + char *, int, int, int); +int nd6_output(struct ifnet *, struct ifnet *, struct mbuf *, + struct sockaddr_in6 *, struct rtentry *, int); +int nd6_storelladdr(struct ifnet *, struct rtentry *, struct mbuf *, + struct sockaddr *, u_char *); +int nd6_need_cache(struct ifnet *); /* nd6_nbr.c */ -void nd6_na_input __P((struct mbuf *, int, int)); -void nd6_na_output __P((struct ifnet *, const struct in6_addr *, - const struct in6_addr *, u_long, int, struct sockaddr *)); -void nd6_ns_input __P((struct mbuf *, int, int)); -void nd6_ns_output __P((struct ifnet *, const struct in6_addr *, - const struct in6_addr *, struct llinfo_nd6 *, int)); -caddr_t nd6_ifptomac __P((struct ifnet *)); -void nd6_dad_start __P((struct ifaddr *, int *)); -void nd6_dad_stop __P((struct ifaddr *)); -void nd6_dad_duplicated __P((struct ifaddr *)); +void nd6_na_input(struct mbuf *, int, int); +void nd6_na_output(struct ifnet *, const struct in6_addr *, + const struct in6_addr *, u_long, int, struct sockaddr *); +void nd6_ns_input(struct mbuf *, int, int); +void nd6_ns_output(struct ifnet *, const struct in6_addr *, + const struct in6_addr *, struct llinfo_nd6 *, int, int); +caddr_t nd6_ifptomac(struct ifnet *); +void nd6_dad_start(struct ifaddr *, int *); +void nd6_dad_stop(struct ifaddr *); +void nd6_dad_duplicated(struct ifaddr *); /* nd6_rtr.c */ -void nd6_rs_input __P((struct mbuf *, int, int)); -void nd6_ra_input __P((struct mbuf *, int, int)); -void prelist_del __P((struct nd_prefix *)); -void defrouter_addreq __P((struct nd_defrouter *)); -void defrouter_delreq __P((struct nd_defrouter *, int)); -void defrouter_select __P((void)); -void defrtrlist_del __P((struct nd_defrouter *)); -void prelist_remove __P((struct nd_prefix *)); -int prelist_update __P((struct nd_prefix *, struct nd_defrouter *, - struct mbuf *)); -int nd6_prelist_add __P((struct nd_prefix *, struct nd_defrouter *, - struct nd_prefix **)); -int nd6_prefix_onlink __P((struct nd_prefix *)); -int nd6_prefix_offlink __P((struct nd_prefix *)); -void pfxlist_onlink_check __P((void)); -struct nd_defrouter *defrouter_lookup __P((struct in6_addr *, - struct ifnet *)); -struct nd_prefix *nd6_prefix_lookup __P((struct nd_prefix *)); -int in6_init_prefix_ltimes __P((struct nd_prefix *ndpr)); -void rt6_flush __P((struct in6_addr *, struct ifnet *)); -int nd6_setdefaultiface __P((int)); -int in6_tmpifadd __P((const struct in6_ifaddr *, int)); - -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +void nd6_rs_input(struct mbuf *, int, int); +void nd6_ra_input(struct mbuf *, int, int); +void prelist_del(struct nd_prefix *); +void defrouter_addreq(struct nd_defrouter *); +void defrouter_delreq(struct nd_defrouter *, int); +void defrouter_select(void); +void defrtrlist_del(struct nd_defrouter *, int); +void prelist_remove(struct nd_prefix *, int); +int prelist_update(struct nd_prefix *, struct nd_defrouter *, + struct mbuf *); +int nd6_prelist_add(struct nd_prefix *, struct nd_defrouter *, + struct nd_prefix **); +int nd6_prefix_onlink(struct nd_prefix *, int, int); +int nd6_prefix_offlink(struct nd_prefix *); +void pfxlist_onlink_check(int); +struct nd_defrouter *defrouter_lookup(struct in6_addr *, + struct ifnet *); +struct nd_prefix *nd6_prefix_lookup(struct nd_prefix *); +int in6_init_prefix_ltimes(struct nd_prefix *ndpr); +void rt6_flush(struct in6_addr *, struct ifnet *); +int 
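These header hunks retire the __P(()) compatibility wrapper throughout. The macro existed so one header could serve both K&R and ANSI compilers; with K&R support gone, the plain prototype is equivalent and clearer. How the retired macro worked, in shape:

    struct ifnet;                       /* opaque, for the prototypes */

    #if defined(__STDC__)
    #define __P(protos) protos          /* ANSI: keep the parameter list */
    #else
    #define __P(protos) ()              /* K&R: erase it */
    #endif

    int nd6_need_cache_old __P((struct ifnet *));   /* old spelling */
    int nd6_need_cache_new(struct ifnet *);         /* what the patch writes */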
nd6_setdefaultiface(int); +int in6_tmpifadd(const struct in6_ifaddr *, int); +#endif /* KERNEL_PRIVATE */ + +#ifdef KERNEL + +/*! + @function nd6_lookup_ipv6 + @discussion This function will check the routing table for a cached + neighbor discovery entry or trigger an neighbor discovery query + to resolve the IPv6 address to a link-layer address. + + nd entries are stored in the routing table. This function will + lookup the IPv6 destination in the routing table. If the + destination requires forwarding to a gateway, the route of the + gateway will be looked up. The route entry is inspected to + determine if the link layer destination address is known. If + unknown, neighbor discovery will be used to resolve the entry. + @param interface The interface the packet is being sent on. + @param ip6_dest The IPv6 destination of the packet. + @param ll_dest On output, the link-layer destination. + @param ll_dest_len The length of the buffer for ll_dest. + @param hint Any routing hint passed down from the protocol. + @param packet The packet being transmitted. + @result May return an error such as EHOSTDOWN or ENETUNREACH. If + this function returns EJUSTRETURN, the packet has been queued + and will be sent when the address is resolved. If any other + value is returned, the caller is responsible for disposing of + the packet. + */ +errno_t nd6_lookup_ipv6(ifnet_t interface, const struct sockaddr_in6 *ip6_dest, + struct sockaddr_dl *ll_dest, size_t ll_dest_len, route_t hint, + mbuf_t packet); +#endif KERNEL #endif /* _NETINET6_ND6_H_ */ diff --git a/bsd/netinet6/nd6_nbr.c b/bsd/netinet6/nd6_nbr.c index ae8185854..5e968a96e 100644 --- a/bsd/netinet6/nd6_nbr.c +++ b/bsd/netinet6/nd6_nbr.c @@ -68,22 +68,23 @@ extern int ipsec_bypass; #define SDL(s) ((struct sockaddr_dl *)s) struct dadq; -static struct dadq *nd6_dad_find __P((struct ifaddr *)); +static struct dadq *nd6_dad_find(struct ifaddr *); #ifndef __APPLE__ -static void nd6_dad_starttimer __P((struct dadq *, int)); -static void nd6_dad_stoptimer __P((struct dadq *)); +static void nd6_dad_starttimer(struct dadq *, int); +static void nd6_dad_stoptimer(struct dadq *); #else -void nd6_dad_stoptimer __P((struct ifaddr *)); +void nd6_dad_stoptimer(struct ifaddr *); #endif -static void nd6_dad_timer __P((struct ifaddr *)); -static void nd6_dad_timer_funnel __P((struct ifaddr *)); -static void nd6_dad_ns_output __P((struct dadq *, struct ifaddr *)); -static void nd6_dad_ns_input __P((struct ifaddr *)); -static void nd6_dad_na_input __P((struct ifaddr *)); +static void nd6_dad_timer(struct ifaddr *); +static void nd6_dad_ns_output(struct dadq *, struct ifaddr *); +static void nd6_dad_ns_input(struct ifaddr *); +static void nd6_dad_na_input(struct ifaddr *); static int dad_ignore_ns = 0; /* ignore NS in DAD - specwise incorrect*/ static int dad_maxtry = 15; /* max # of *tries* to transmit DAD packet */ +extern lck_mtx_t *dad6_mutex; +extern lck_mtx_t *nd6_mutex; /* * Input an Neighbor Solicitation Message. 
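The HeaderDoc above fixes the caller contract for the newly exported KPI: EJUSTRETURN means the resolver queued the packet and owns it; any other error leaves disposal to the caller. A mock illustrating just that ownership rule (resolve() and the types here are stand-ins, not the real KPI):

    #include <errno.h>

    #ifndef EJUSTRETURN
    #define EJUSTRETURN (-2)        /* BSD kernel value, stubbed here */
    #endif

    struct lladdr_stub { unsigned char addr[6]; };

    static int
    resolve(struct lladdr_stub *ll) /* stand-in for nd6_lookup_ipv6() */
    {
        (void)ll;
        return 0;
    }

    static int
    send_resolved(struct lladdr_stub *ll, void (*free_pkt)(void))
    {
        int result = resolve(ll);

        if (result == EJUSTRETURN)
            return 0;               /* queued: the resolver owns the packet */
        if (result != 0)
            free_pkt();             /* any other error: we dispose of it */
        return result;
    }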
* @@ -91,9 +92,10 @@ static int dad_maxtry = 15; /* max # of *tries* to transmit DAD packet */ * Based on RFC 2462 (duplicated address detection) */ void -nd6_ns_input(m, off, icmp6len) - struct mbuf *m; - int off, icmp6len; +nd6_ns_input( + struct mbuf *m, + int off, + int icmp6len) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); @@ -111,7 +113,7 @@ nd6_ns_input(m, off, icmp6len) struct sockaddr_dl *proxydl = NULL; #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, icmp6len,); + IP6_EXTHDR_CHECK(m, off, icmp6len, return); nd_ns = (struct nd_neighbor_solicit *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_ns, struct nd_neighbor_solicit *, m, off, icmp6len); @@ -333,11 +335,13 @@ nd6_ns_input(m, off, icmp6len) * Based on RFC 2462 (duplicated address detection) */ void -nd6_ns_output(ifp, daddr6, taddr6, ln, dad) - struct ifnet *ifp; - const struct in6_addr *daddr6, *taddr6; - struct llinfo_nd6 *ln; /* for source address determination */ - int dad; /* duplicated address detection */ +nd6_ns_output( + struct ifnet *ifp, + const struct in6_addr *daddr6, + const struct in6_addr *taddr6, + struct llinfo_nd6 *ln, /* for source address determination */ + int dad, /* duplicated address detection */ + int locked) { struct mbuf *m; struct ip6_hdr *ip6; @@ -513,7 +517,7 @@ nd6_ns_output(ifp, daddr6, taddr6, ln, dad) if (ipsec_bypass == 0) (void)ipsec_setsocket(m, NULL); #endif - ip6_output(m, NULL, NULL, dad ? IPV6_DADOUTPUT : 0, &im6o, &outif); + ip6_output(m, NULL, NULL, dad ? IPV6_DADOUTPUT : 0, &im6o, &outif, locked); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_neighborsolicit); @@ -532,9 +536,10 @@ nd6_ns_output(ifp, daddr6, taddr6, ln, dad) * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD) */ void -nd6_na_input(m, off, icmp6len) - struct mbuf *m; - int off, icmp6len; +nd6_na_input( + struct mbuf *m, + int off, + int icmp6len) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); @@ -555,6 +560,7 @@ nd6_na_input(m, off, icmp6len) struct rtentry *rt; struct sockaddr_dl *sdl; union nd_opts ndopts; + struct timeval timenow; if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, @@ -565,7 +571,7 @@ nd6_na_input(m, off, icmp6len) } #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, icmp6len,); + IP6_EXTHDR_CHECK(m, off, icmp6len, return); nd_na = (struct nd_neighbor_advert *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_na, struct nd_neighbor_advert *, m, off, icmp6len); @@ -646,12 +652,13 @@ nd6_na_input(m, off, icmp6len) /* * If no neighbor cache entry is found, NA SHOULD silently be discarded. */ - rt = nd6_lookup(&taddr6, 0, ifp); + rt = nd6_lookup(&taddr6, 0, ifp, 0); if ((rt == NULL) || ((ln = (struct llinfo_nd6 *)rt->rt_llinfo) == NULL) || ((sdl = SDL(rt->rt_gateway)) == NULL)) goto freeit; + getmicrotime(&timenow); if (ln->ln_state == ND6_LLINFO_INCOMPLETE) { /* * If the link-layer has address, and no lladdr option came, @@ -669,11 +676,11 @@ nd6_na_input(m, off, icmp6len) ln->ln_state = ND6_LLINFO_REACHABLE; ln->ln_byhint = 0; if (ln->ln_expire) - ln->ln_expire = time_second + + ln->ln_expire = timenow.tv_sec + nd_ifinfo[rt->rt_ifp->if_index].reachable; } else { ln->ln_state = ND6_LLINFO_STALE; - ln->ln_expire = time_second + nd6_gctimer; + ln->ln_expire = timenow.tv_sec + nd6_gctimer; } if ((ln->ln_router = is_router) != 0) { /* @@ -681,7 +688,7 @@ nd6_na_input(m, off, icmp6len) * non-reachable to probably reachable, and might * affect the status of associated prefixes.. 
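The trailing argument added to every IP6_EXTHDR_CHECK call site is the statement to execute when the header does not fit in the mbuf; the old invocations passed an empty action, which silently fell through. A hypothetical reconstruction of a macro parameterized this way:

    #include <stddef.h>

    struct pkt { size_t len; };

    /* Verify [off, off+hlen) lies inside the packet; on failure run
     * the caller-supplied action, e.g. 'return'. */
    #define EXTHDR_CHECK(p, off, hlen, action)          \
        do {                                            \
            if ((off) + (hlen) > (p)->len) {            \
                action;                                 \
            }                                           \
        } while (0)

    static void
    parse_header(struct pkt *p, size_t off, size_t hlen)
    {
        EXTHDR_CHECK(p, off, hlen, return);
        /* past this point the header bytes are known to be present */
    }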
*/ - pfxlist_onlink_check(); + pfxlist_onlink_check(0); } } else { int llchange; @@ -727,7 +734,7 @@ nd6_na_input(m, off, icmp6len) */ if (ln->ln_state == ND6_LLINFO_REACHABLE) { ln->ln_state = ND6_LLINFO_STALE; - ln->ln_expire = time_second + nd6_gctimer; + ln->ln_expire = timenow.tv_sec + nd6_gctimer; } goto freeit; } else if (is_override /* (2a) */ @@ -750,13 +757,13 @@ nd6_na_input(m, off, icmp6len) ln->ln_state = ND6_LLINFO_REACHABLE; ln->ln_byhint = 0; if (ln->ln_expire) { - ln->ln_expire = time_second + + ln->ln_expire = timenow.tv_sec + nd_ifinfo[ifp->if_index].reachable; } } else { if (lladdr && llchange) { ln->ln_state = ND6_LLINFO_STALE; - ln->ln_expire = time_second + nd6_gctimer; + ln->ln_expire = timenow.tv_sec + nd6_gctimer; } } } @@ -779,21 +786,25 @@ nd6_na_input(m, off, icmp6len) * is only called under the network software interrupt * context. However, we keep it just for safety. */ - s = splnet(); + lck_mtx_lock(nd6_mutex); dr = defrouter_lookup(in6, rt->rt_ifp); - if (dr) - defrtrlist_del(dr); - else if (!ip6_forwarding && (ip6_accept_rtadv || (rt->rt_ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { - /* - * Even if the neighbor is not in the default - * router list, the neighbor may be used - * as a next hop for some destinations - * (e.g. redirect case). So we must - * call rt6_flush explicitly. - */ - rt6_flush(&ip6->ip6_src, rt->rt_ifp); + if (dr) { + defrtrlist_del(dr, 1); + lck_mtx_unlock(nd6_mutex); + } + else { + lck_mtx_unlock(nd6_mutex); + if (!ip6_forwarding && (ip6_accept_rtadv || (rt->rt_ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { + /* + * Even if the neighbor is not in the default + * router list, the neighbor may be used + * as a next hop for some destinations + * (e.g. redirect case). So we must + * call rt6_flush explicitly. + */ + rt6_flush(&ip6->ip6_src, rt->rt_ifp); + } } - splx(s); } ln->ln_router = is_router; } @@ -805,7 +816,7 @@ nd6_na_input(m, off, icmp6len) * argument as the 1st one. 
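The router-demotion hunk keeps nd6_mutex held only for the default-router list surgery and deliberately drops it before rt6_flush(), which takes routing locks of its own. Narrowing the hold this way avoids nesting the two lock domains; in outline:

    #include <pthread.h>

    static pthread_mutex_t nd6_mutex = PTHREAD_MUTEX_INITIALIZER;

    static void list_remove(int id)  { (void)id; }  /* needs nd6_mutex   */
    static void flush_routes(int id) { (void)id; }  /* takes other locks */

    static void
    router_demoted(int id, int on_list)
    {
        pthread_mutex_lock(&nd6_mutex);
        if (on_list)
            list_remove(id);        /* list surgery under the mutex */
        pthread_mutex_unlock(&nd6_mutex);

        if (!on_list)
            flush_routes(id);       /* runs with nd6_mutex dropped */
    }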
*/ nd6_output(ifp, ifp, ln->ln_hold, - (struct sockaddr_in6 *)rt_key(rt), rt); + (struct sockaddr_in6 *)rt_key(rt), rt, 0); ln->ln_hold = 0; } @@ -828,12 +839,13 @@ nd6_na_input(m, off, icmp6len) * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD) */ void -nd6_na_output(ifp, daddr6, taddr6, flags, tlladdr, sdl0) - struct ifnet *ifp; - const struct in6_addr *daddr6, *taddr6; - u_long flags; - int tlladdr; /* 1 if include target link-layer address */ - struct sockaddr *sdl0; /* sockaddr_dl (= proxy NA) or NULL */ +nd6_na_output( + struct ifnet *ifp, + const struct in6_addr *daddr6, + const struct in6_addr *taddr6, + u_long flags, + int tlladdr, /* 1 if include target link-layer address */ + struct sockaddr *sdl0) /* sockaddr_dl (= proxy NA) or NULL */ { struct mbuf *m; struct ip6_hdr *ip6; @@ -962,7 +974,7 @@ nd6_na_output(ifp, daddr6, taddr6, flags, tlladdr, sdl0) if (ipsec_bypass == 0) (void)ipsec_setsocket(m, NULL); #endif - ip6_output(m, NULL, NULL, 0, &im6o, &outif); + ip6_output(m, NULL, NULL, 0, &im6o, &outif, 0); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_neighboradvert); @@ -971,25 +983,10 @@ nd6_na_output(ifp, daddr6, taddr6, flags, tlladdr, sdl0) } caddr_t -nd6_ifptomac(ifp) - struct ifnet *ifp; +nd6_ifptomac( + struct ifnet *ifp) { - switch (ifp->if_type) { - case IFT_ARCNET: - case IFT_ETHER: - case IFT_FDDI: - case IFT_IEEE1394: -#if IFT_L2VLAN - case IFT_L2VLAN: -#endif -#if IFT_IEEE80211 - case IFT_IEEE80211: -#endif - return ((caddr_t)(ifp + 1)); - break; - default: - return NULL; - } + return ((caddr_t)ifnet_lladdr(ifp)); } TAILQ_HEAD(dadq_head, dadq); @@ -1007,40 +1004,43 @@ static struct dadq_head dadq; static int dad_init = 0; static struct dadq * -nd6_dad_find(ifa) - struct ifaddr *ifa; +nd6_dad_find( + struct ifaddr *ifa) { struct dadq *dp; - + lck_mtx_lock(dad6_mutex); for (dp = dadq.tqh_first; dp; dp = dp->dad_list.tqe_next) { - if (dp->dad_ifa == ifa) + if (dp->dad_ifa == ifa) { + lck_mtx_unlock(dad6_mutex); return dp; + } } + lck_mtx_unlock(dad6_mutex); return NULL; } #ifdef __APPLE__ void -nd6_dad_stoptimer(ifa) - struct ifaddr *ifa; +nd6_dad_stoptimer( + struct ifaddr *ifa) { - untimeout((void (*) __P((void *)))nd6_dad_timer_funnel, (void *)ifa); + untimeout((void (*)(void *))nd6_dad_timer, (void *)ifa); } #else static void -nd6_dad_starttimer(dp, ticks) - struct dadq *dp; - int ticks; +nd6_dad_starttimer( + struct dadq *dp, + int ticks) { callout_reset(&dp->dad_timer_ch, ticks, - (void (*) __P((void *)))nd6_dad_timer, (void *)dp->dad_ifa); + (void (*)(void *))nd6_dad_timer, (void *)dp->dad_ifa); } static void -nd6_dad_stoptimer(dp) - struct dadq *dp; +nd6_dad_stoptimer( + struct dadq *dp) { callout_stop(&dp->dad_timer_ch); @@ -1051,9 +1051,9 @@ nd6_dad_stoptimer(dp) * Start Duplicated Address Detection (DAD) for specified interface address. 
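nd6_ifptomac shrinks from a per-if_type switch returning (caddr_t)(ifp + 1) to a single ifnet_lladdr() call: the knowledge of where an interface stores its link-layer address now lives behind one accessor instead of being replicated in each caller. The refactoring in miniature (layout and names illustrative):

    struct netif {
        int           if_type;
        unsigned char lladdr[8];    /* illustrative storage */
    };

    /*
     * Before (shape only):
     *     switch (ifp->if_type) { case ETHER: ... return (caddr_t)(ifp + 1); }
     * After: one accessor owns the layout.
     */
    static unsigned char *
    netif_lladdr(struct netif *ifp)
    {
        return ifp->lladdr;
    }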
*/ void -nd6_dad_start(ifa, tick) - struct ifaddr *ifa; - int *tick; /* minimum delay ticks for IFF_UP event */ +nd6_dad_start( + struct ifaddr *ifa, + int *tick) /* minimum delay ticks for IFF_UP event */ { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; @@ -1103,7 +1103,9 @@ nd6_dad_start(ifa, tick) return; } bzero(dp, sizeof(*dp)); + lck_mtx_lock(dad6_mutex); TAILQ_INSERT_TAIL(&dadq, (struct dadq *)dp, dad_list); + lck_mtx_unlock(dad6_mutex); nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr))); @@ -1121,7 +1123,7 @@ nd6_dad_start(ifa, tick) dp->dad_ns_ocount = dp->dad_ns_tcount = 0; if (tick == NULL) { nd6_dad_ns_output(dp, ifa); - timeout((void (*) __P((void *)))nd6_dad_timer_funnel, (void *)ifa, + timeout((void (*)(void *))nd6_dad_timer, (void *)ifa, nd_ifinfo[ifa->ifa_ifp->if_index].retrans * hz / 1000); } else { int ntick; @@ -1131,7 +1133,7 @@ nd6_dad_start(ifa, tick) else ntick = *tick + random() % (hz / 2); *tick = ntick; - timeout((void (*) __P((void *)))nd6_dad_timer_funnel, (void *)ifa, + timeout((void (*)(void *))nd6_dad_timer, (void *)ifa, ntick); } } @@ -1140,8 +1142,8 @@ nd6_dad_start(ifa, tick) * terminate DAD unconditionally. used for address removals. */ void -nd6_dad_stop(ifa) - struct ifaddr *ifa; +nd6_dad_stop( + struct ifaddr *ifa) { struct dadq *dp; @@ -1153,34 +1155,20 @@ nd6_dad_stop(ifa) return; } - untimeout((void (*) __P((void *)))nd6_dad_timer_funnel, (void *)ifa); + untimeout((void (*)(void *))nd6_dad_timer, (void *)ifa); + lck_mtx_lock(dad6_mutex); TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); + lck_mtx_unlock(dad6_mutex); FREE(dp, M_IP6NDP); dp = NULL; ifafree(ifa); } -static void -nd6_dad_timer_funnel(ifa) - struct ifaddr *ifa; -{ - -#ifdef __APPLE__ - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); -#endif - nd6_dad_timer(ifa); -#ifdef __APPLE__ - (void) thread_funnel_set(network_flock, FALSE); -#endif - -} - static void -nd6_dad_timer(ifa) - struct ifaddr *ifa; +nd6_dad_timer( + struct ifaddr *ifa) { int s; struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; @@ -1218,7 +1206,9 @@ nd6_dad_timer(ifa) nd6log((LOG_INFO, "%s: could not run DAD, driver problem?\n", if_name(ifa->ifa_ifp))); + lck_mtx_lock(dad6_mutex); TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); + lck_mtx_unlock(dad6_mutex); FREE(dp, M_IP6NDP); dp = NULL; ifafree(ifa); @@ -1231,7 +1221,7 @@ nd6_dad_timer(ifa) * We have more NS to go. Send NS packet for DAD. */ nd6_dad_ns_output(dp, ifa); - timeout((void (*) __P((void *)))nd6_dad_timer_funnel, (void *)ifa, + timeout((void (*)(void *))nd6_dad_timer, (void *)ifa, nd_ifinfo[ifa->ifa_ifp->if_index].retrans * hz / 1000); } else { /* @@ -1296,7 +1286,10 @@ nd6_dad_timer(ifa) if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr))); + lck_mtx_lock(dad6_mutex); TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); + lck_mtx_unlock(dad6_mutex); + in6_post_msg(ia->ia_ifp, KEV_INET6_NEW_USER_ADDR, ia); FREE(dp, M_IP6NDP); dp = NULL; ifafree(ifa); @@ -1308,8 +1301,8 @@ done: } void -nd6_dad_duplicated(ifa) - struct ifaddr *ifa; +nd6_dad_duplicated( + struct ifaddr *ifa) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; @@ -1329,7 +1322,7 @@ nd6_dad_duplicated(ifa) ia->ia6_flags |= IN6_IFF_DUPLICATED; /* We are done with DAD, with duplicated address found. 
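Every insertion into and removal from the DAD queue is now bracketed by dad6_mutex, while the find path unlocks before returning its result; the returned entry stays valid only because the caller holds a reference on the owning ifaddr. A pthread model of that queue discipline:

    #include <pthread.h>
    #include <sys/queue.h>

    static pthread_mutex_t dad_mutex = PTHREAD_MUTEX_INITIALIZER;

    struct dad_entry {
        TAILQ_ENTRY(dad_entry) link;
        void *owner;                /* the ifaddr in the real code */
    };

    static TAILQ_HEAD(, dad_entry) dad_queue =
        TAILQ_HEAD_INITIALIZER(dad_queue);

    static void
    dad_enqueue(struct dad_entry *dp)
    {
        pthread_mutex_lock(&dad_mutex);
        TAILQ_INSERT_TAIL(&dad_queue, dp, link);
        pthread_mutex_unlock(&dad_mutex);
    }

    /* The lock is dropped before returning: the entry must be kept
     * alive by the owner's reference, as in nd6_dad_find(). */
    static struct dad_entry *
    dad_find(void *owner)
    {
        struct dad_entry *dp;

        pthread_mutex_lock(&dad_mutex);
        TAILQ_FOREACH(dp, &dad_queue, link)
            if (dp->owner == owner)
                break;
        pthread_mutex_unlock(&dad_mutex);
        return dp;
    }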
(failure) */ - untimeout((void (*) __P((void *)))nd6_dad_timer_funnel, (void *)ifa); + untimeout((void (*)(void *))nd6_dad_timer, (void *)ifa); log(LOG_ERR, "%s: DAD complete for %s - duplicate found\n", @@ -1337,16 +1330,18 @@ nd6_dad_duplicated(ifa) log(LOG_ERR, "%s: manual intervention required\n", if_name(ifa->ifa_ifp)); + lck_mtx_lock(dad6_mutex); TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); + lck_mtx_unlock(dad6_mutex); FREE(dp, M_IP6NDP); dp = NULL; ifafree(ifa); } static void -nd6_dad_ns_output(dp, ifa) - struct dadq *dp; - struct ifaddr *ifa; +nd6_dad_ns_output( + struct dadq *dp, + struct ifaddr *ifa) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct ifnet *ifp = ifa->ifa_ifp; @@ -1366,15 +1361,14 @@ nd6_dad_ns_output(dp, ifa) } dp->dad_ns_ocount++; - nd6_ns_output(ifp, NULL, &ia->ia_addr.sin6_addr, NULL, 1); + nd6_ns_output(ifp, NULL, &ia->ia_addr.sin6_addr, NULL, 1, 0); } static void -nd6_dad_ns_input(ifa) - struct ifaddr *ifa; +nd6_dad_ns_input( + struct ifaddr *ifa) { struct in6_ifaddr *ia; - struct ifnet *ifp; const struct in6_addr *taddr6; struct dadq *dp; int duplicate; @@ -1383,7 +1377,6 @@ nd6_dad_ns_input(ifa) panic("ifa == NULL in nd6_dad_ns_input"); ia = (struct in6_ifaddr *)ifa; - ifp = ifa->ifa_ifp; taddr6 = &ia->ia_addr.sin6_addr; duplicate = 0; dp = nd6_dad_find(ifa); @@ -1420,8 +1413,8 @@ nd6_dad_ns_input(ifa) } static void -nd6_dad_na_input(ifa) - struct ifaddr *ifa; +nd6_dad_na_input( + struct ifaddr *ifa) { struct dadq *dp; diff --git a/bsd/netinet6/nd6_rtr.c b/bsd/netinet6/nd6_rtr.c index c3fd29bc6..6ca948351 100644 --- a/bsd/netinet6/nd6_rtr.c +++ b/bsd/netinet6/nd6_rtr.c @@ -42,6 +42,7 @@ #include <sys/errno.h> #include <sys/syslog.h> #include <sys/queue.h> +#include <kern/lock.h> #include <net/if.h> #include <net/if_types.h> @@ -62,22 +63,21 @@ #define SDL(s) ((struct sockaddr_dl *)s) -static struct nd_defrouter *defrtrlist_update __P((struct nd_defrouter *)); -static struct in6_ifaddr *in6_ifadd __P((struct nd_prefix *, - struct in6_addr *)); -static struct nd_pfxrouter *pfxrtr_lookup __P((struct nd_prefix *, - struct nd_defrouter *)); -static void pfxrtr_add __P((struct nd_prefix *, struct nd_defrouter *)); -static void pfxrtr_del __P((struct nd_pfxrouter *)); -static struct nd_pfxrouter *find_pfxlist_reachable_router - __P((struct nd_prefix *)); -static void defrouter_addifreq __P((struct ifnet *)); -static void nd6_rtmsg __P((int, struct rtentry *)); +static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *); +static struct in6_ifaddr *in6_ifadd(struct nd_prefix *, + struct in6_addr *); +static struct nd_pfxrouter *pfxrtr_lookup(struct nd_prefix *, + struct nd_defrouter *); +static void pfxrtr_add(struct nd_prefix *, struct nd_defrouter *); +static void pfxrtr_del(struct nd_pfxrouter *); +static struct nd_pfxrouter *find_pfxlist_reachable_router(struct nd_prefix *); +static void defrouter_addifreq(struct ifnet *); +static void nd6_rtmsg(int, struct rtentry *); -static void in6_init_address_ltimes __P((struct nd_prefix *ndpr, - struct in6_addrlifetime *lt6)); +static void in6_init_address_ltimes(struct nd_prefix *ndpr, + struct in6_addrlifetime *lt6); -static int rt6_deleteroute __P((struct radix_node *, void *)); +static int rt6_deleteroute(struct radix_node *, void *); extern int nd6_recalc_reachtm_interval; @@ -96,6 +96,9 @@ static int ip6_temp_valid_lifetime = 1800; */ int ip6_temp_regen_advance = TEMPADDR_REGEN_ADVANCE; +extern lck_mtx_t *rt_mtx; +extern lck_mtx_t *nd6_mutex; + /* * Receive Router Solicitation Message - 
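With the funnel wrapper gone, DAD arms and cancels its callout with the same (function, argument) pair; for timeout()/untimeout() that pair is the identity of the pending callout, which is why both sites must now name nd6_dad_timer directly. A toy model of the matching rule:

    #include <stddef.h>

    struct callout { void (*fn)(void *); void *arg; int armed; };

    static struct callout pending;

    static void
    arm(void (*fn)(void *), void *arg)
    {
        pending.fn = fn;
        pending.arg = arg;
        pending.armed = 1;
    }

    /* untimeout() semantics: cancel only an exact (fn, arg) match. */
    static void
    cancel(void (*fn)(void *), void *arg)
    {
        if (pending.armed && pending.fn == fn && pending.arg == arg)
            pending.armed = 0;
    }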
just for routers. * Router solicitation/advertisement is mostly managed by userland program @@ -104,9 +107,10 @@ int ip6_temp_regen_advance = TEMPADDR_REGEN_ADVANCE; * Based on RFC 2461 */ void -nd6_rs_input(m, off, icmp6len) - struct mbuf *m; - int off, icmp6len; +nd6_rs_input( + struct mbuf *m, + int off, + int icmp6len) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); @@ -146,7 +150,7 @@ nd6_rs_input(m, off, icmp6len) goto freeit; #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, icmp6len,); + IP6_EXTHDR_CHECK(m, off, icmp6len, return); nd_rs = (struct nd_router_solicit *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_rs, struct nd_router_solicit *, m, off, icmp6len); @@ -197,9 +201,10 @@ nd6_rs_input(m, off, icmp6len) * TODO: ND_RA_FLAG_{OTHER,MANAGED} processing */ void -nd6_ra_input(m, off, icmp6len) - struct mbuf *m; - int off, icmp6len; +nd6_ra_input( + struct mbuf *m, + int off, + int icmp6len) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct nd_ifinfo *ndi = &nd_ifinfo[ifp->if_index]; @@ -214,6 +219,9 @@ nd6_ra_input(m, off, icmp6len) #endif union nd_opts ndopts; struct nd_defrouter *dr; + struct timeval timenow; + + getmicrotime(&timenow); if (ip6_accept_rtadv == 0 && ((ifp->if_eflags & IFEF_ACCEPT_RTADVD) == 0)) goto freeit; @@ -234,7 +242,7 @@ nd6_ra_input(m, off, icmp6len) } #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, icmp6len,); + IP6_EXTHDR_CHECK(m, off, icmp6len, return); nd_ra = (struct nd_router_advert *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_ra, struct nd_router_advert *, m, off, icmp6len); @@ -260,7 +268,7 @@ nd6_ra_input(m, off, icmp6len) dr0.rtaddr = saddr6; dr0.flags = nd_ra->nd_ra_flags_reserved; dr0.rtlifetime = ntohs(nd_ra->nd_ra_router_lifetime); - dr0.expire = time_second + dr0.rtlifetime; + dr0.expire = timenow.tv_sec + dr0.rtlifetime; dr0.ifp = ifp; dr0.advint = 0; /* Mobile IPv6 */ dr0.advint_expire = 0; /* Mobile IPv6 */ @@ -338,7 +346,7 @@ nd6_ra_input(m, off, icmp6len) pr.ndpr_prefix.sin6_family = AF_INET6; pr.ndpr_prefix.sin6_len = sizeof(pr.ndpr_prefix); pr.ndpr_prefix.sin6_addr = pi->nd_opt_pi_prefix; - pr.ndpr_ifp = (struct ifnet *)m->m_pkthdr.rcvif; + pr.ndpr_ifp = m->m_pkthdr.rcvif; pr.ndpr_raf_onlink = (pi->nd_opt_pi_flags_reserved & ND_OPT_PI_FLAG_ONLINK) ? 1 : 0; @@ -422,7 +430,7 @@ nd6_ra_input(m, off, icmp6len) * router's neighbor cache, which might also affect our on-link * detection of adveritsed prefixes. 
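A router advertisement carries its lifetime as a relative count of seconds; the hunk above turns it into an absolute deadline (dr0.expire) against the snapshot taken when the packet arrived. The conversion on its own:

    #include <sys/time.h>

    struct defrouter_stub {
        time_t         expire;      /* absolute deadline */
        unsigned short lifetime;    /* seconds, as received */
    };

    /* One snapshot per received RA; expiry then becomes a plain
     * integer comparison in the timer, cf. dr->expire < timenow.tv_sec. */
    static void
    record_router(struct defrouter_stub *dr, unsigned short ra_lifetime)
    {
        struct timeval timenow;

        gettimeofday(&timenow, NULL);
        dr->lifetime = ra_lifetime;
        dr->expire   = timenow.tv_sec + ra_lifetime;
    }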
*/ - pfxlist_onlink_check(); + pfxlist_onlink_check(0); } freeit: @@ -446,6 +454,8 @@ nd6_rtmsg(cmd, rt) { struct rt_addrinfo info; + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); + bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; @@ -458,12 +468,11 @@ nd6_rtmsg(cmd, rt) } void -defrouter_addreq(new) - struct nd_defrouter *new; +defrouter_addreq( + struct nd_defrouter *new) { struct sockaddr_in6 def, mask, gate; struct rtentry *newrt = NULL; - int s; Bzero(&def, sizeof(def)); Bzero(&mask, sizeof(mask)); @@ -474,27 +483,28 @@ defrouter_addreq(new) def.sin6_family = mask.sin6_family = gate.sin6_family = AF_INET6; gate.sin6_addr = new->rtaddr; - s = splnet(); - (void)rtrequest(RTM_ADD, (struct sockaddr *)&def, + lck_mtx_lock(rt_mtx); + (void)rtrequest_locked(RTM_ADD, (struct sockaddr *)&def, (struct sockaddr *)&gate, (struct sockaddr *)&mask, RTF_GATEWAY, &newrt); if (newrt) { nd6_rtmsg(RTM_ADD, newrt); /* tell user process */ rtunref(newrt); } - splx(s); + lck_mtx_unlock(rt_mtx); return; } /* Add a route to a given interface as default */ void -defrouter_addifreq(ifp) - struct ifnet *ifp; +defrouter_addifreq( + struct ifnet *ifp) { struct sockaddr_in6 def, mask; - struct ifaddr *ifa; + struct ifaddr *ifa = NULL; struct rtentry *newrt = NULL; - int error, flags; + int error; + u_long flags; bzero(&def, sizeof(def)); bzero(&mask, sizeof(mask)); @@ -514,8 +524,9 @@ defrouter_addifreq(ifp) return; } + lck_mtx_lock(rt_mtx); flags = ifa->ifa_flags; - error = rtrequest(RTM_ADD, (struct sockaddr *)&def, ifa->ifa_addr, + error = rtrequest_locked(RTM_ADD, (struct sockaddr *)&def, ifa->ifa_addr, (struct sockaddr *)&mask, flags, &newrt); if (error != 0) { nd6log((LOG_ERR, @@ -532,15 +543,20 @@ defrouter_addifreq(ifp) } in6_post_msg(ifp, KEV_INET6_DEFROUTER, (struct in6_ifaddr *)ifa); } + lck_mtx_unlock(rt_mtx); + ifafree(ifa); } struct nd_defrouter * -defrouter_lookup(addr, ifp) - struct in6_addr *addr; - struct ifnet *ifp; +defrouter_lookup( + struct in6_addr *addr, + struct ifnet *ifp) { struct nd_defrouter *dr; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + for (dr = TAILQ_FIRST(&nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { if (dr->ifp == ifp && IN6_ARE_ADDR_EQUAL(addr, &dr->rtaddr)) @@ -551,9 +567,9 @@ defrouter_lookup(addr, ifp) } void -defrouter_delreq(dr, dofree) - struct nd_defrouter *dr; - int dofree; +defrouter_delreq( + struct nd_defrouter *dr, + int dofree) { struct sockaddr_in6 def, mask, gate; struct rtentry *oldrt = NULL; @@ -567,7 +583,8 @@ defrouter_delreq(dr, dofree) def.sin6_family = mask.sin6_family = gate.sin6_family = AF_INET6; gate.sin6_addr = dr->rtaddr; - rtrequest(RTM_DELETE, (struct sockaddr *)&def, + lck_mtx_lock(rt_mtx); + rtrequest_locked(RTM_DELETE, (struct sockaddr *)&def, (struct sockaddr *)&gate, (struct sockaddr *)&mask, RTF_GATEWAY, &oldrt); @@ -579,17 +596,18 @@ defrouter_delreq(dr, dofree) * rtrequest(). */ rtref(oldrt); - rtfree(oldrt); + rtfree_locked(oldrt); } } if (dofree) /* XXX: necessary? */ FREE(dr, M_IP6NDP); + lck_mtx_unlock(rt_mtx); } void -defrtrlist_del(dr) - struct nd_defrouter *dr; +defrtrlist_del( + struct nd_defrouter *dr, int nd6locked) { struct nd_defrouter *deldr = NULL; struct nd_prefix *pr; @@ -603,6 +621,8 @@ defrtrlist_del(dr) rt6_flush(&dr->rtaddr, dr->ifp); } + if (nd6locked == 0) + lck_mtx_lock(nd6_mutex); if (dr == TAILQ_FIRST(&nd_defrouter)) deldr = dr; /* The router is primary. 
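nd6_rtmsg and defrouter_lookup gain lck_mtx_assert(..., LCK_MTX_ASSERT_OWNED), turning the implicit "caller must hold the lock" rule into a checked one. POSIX mutexes do not expose an ownership query, so a portable analogue has to track the owner itself; purely illustrative:

    #include <assert.h>
    #include <pthread.h>

    struct owned_mutex {
        pthread_mutex_t m;
        pthread_t       owner;
        int             held;
    };

    static void
    om_lock(struct owned_mutex *om)
    {
        pthread_mutex_lock(&om->m);
        om->owner = pthread_self();
        om->held  = 1;
    }

    static void
    om_unlock(struct owned_mutex *om)
    {
        om->held = 0;
        pthread_mutex_unlock(&om->m);
    }

    /* Analogue of lck_mtx_assert(mtx, LCK_MTX_ASSERT_OWNED); the racy
     * read is tolerable for a debug-only check. */
    static void
    om_assert_owned(struct owned_mutex *om)
    {
        assert(om->held && pthread_equal(om->owner, pthread_self()));
    }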
*/ @@ -616,7 +636,7 @@ defrtrlist_del(dr) if ((pfxrtr = pfxrtr_lookup(pr, dr)) != NULL) pfxrtr_del(pfxrtr); } - pfxlist_onlink_check(); + pfxlist_onlink_check(1); /* * If the router is the primary one, choose a new one. @@ -626,6 +646,9 @@ defrtrlist_del(dr) if (deldr) defrouter_select(); + if (nd6locked == 0) + lck_mtx_unlock(nd6_mutex); + FREE(dr, M_IP6NDP); } @@ -642,7 +665,6 @@ defrtrlist_del(dr) void defrouter_select() { - int s = splnet(); struct nd_defrouter *dr, anydr; struct rtentry *rt = NULL; struct llinfo_nd6 *ln = NULL; @@ -650,9 +672,11 @@ defrouter_select() /* * Search for a (probably) reachable router from the list. */ + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + for (dr = TAILQ_FIRST(&nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { - if ((rt = nd6_lookup(&dr->rtaddr, 0, dr->ifp)) && + if ((rt = nd6_lookup(&dr->rtaddr, 0, dr->ifp, 0)) && (ln = (struct llinfo_nd6 *)rt->rt_llinfo) && ND6_IS_LLINFO_PROBREACH(ln)) { /* Got it, and move it to the head */ @@ -707,21 +731,20 @@ defrouter_select() } } - splx(s); return; } static struct nd_defrouter * -defrtrlist_update(new) - struct nd_defrouter *new; +defrtrlist_update( + struct nd_defrouter *new) { struct nd_defrouter *dr, *n; - int s = splnet(); + lck_mtx_lock(nd6_mutex); if ((dr = defrouter_lookup(&new->rtaddr, new->ifp)) != NULL) { /* entry exists */ if (new->rtlifetime == 0) { - defrtrlist_del(dr); + defrtrlist_del(dr, 1); dr = NULL; } else { /* override */ @@ -729,19 +752,19 @@ defrtrlist_update(new) dr->rtlifetime = new->rtlifetime; dr->expire = new->expire; } - splx(s); + lck_mtx_unlock(nd6_mutex); return(dr); } /* entry does not exist */ if (new->rtlifetime == 0) { - splx(s); + lck_mtx_unlock(nd6_mutex); return(NULL); } n = (struct nd_defrouter *)_MALLOC(sizeof(*n), M_IP6NDP, M_NOWAIT); if (n == NULL) { - splx(s); + lck_mtx_unlock(nd6_mutex); return(NULL); } bzero(n, sizeof(*n)); @@ -755,18 +778,19 @@ defrtrlist_update(new) TAILQ_INSERT_TAIL(&nd_defrouter, n, dr_entry); if (TAILQ_FIRST(&nd_defrouter) == n) defrouter_select(); - splx(s); + lck_mtx_unlock(nd6_mutex); return(n); } static struct nd_pfxrouter * -pfxrtr_lookup(pr, dr) - struct nd_prefix *pr; - struct nd_defrouter *dr; +pfxrtr_lookup( + struct nd_prefix *pr, + struct nd_defrouter *dr) { struct nd_pfxrouter *search; + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); for (search = pr->ndpr_advrtrs.lh_first; search; search = search->pfr_next) { if (search->router == dr) break; @@ -776,12 +800,14 @@ pfxrtr_lookup(pr, dr) } static void -pfxrtr_add(pr, dr) - struct nd_prefix *pr; - struct nd_defrouter *dr; +pfxrtr_add( + struct nd_prefix *pr, + struct nd_defrouter *dr) { struct nd_pfxrouter *new; + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + new = (struct nd_pfxrouter *)_MALLOC(sizeof(*new), M_IP6NDP, M_NOWAIT); if (new == NULL) return; @@ -790,23 +816,25 @@ pfxrtr_add(pr, dr) LIST_INSERT_HEAD(&pr->ndpr_advrtrs, new, pfr_entry); - pfxlist_onlink_check(); + pfxlist_onlink_check(1); } static void -pfxrtr_del(pfr) - struct nd_pfxrouter *pfr; +pfxrtr_del( + struct nd_pfxrouter *pfr) { + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); LIST_REMOVE(pfr, pfr_entry); FREE(pfr, M_IP6NDP); } struct nd_prefix * -nd6_prefix_lookup(pr) - struct nd_prefix *pr; +nd6_prefix_lookup( + struct nd_prefix *pr) { struct nd_prefix *search; + lck_mtx_lock(nd6_mutex); for (search = nd_prefix.lh_first; search; search = search->ndpr_next) { if (pr->ndpr_ifp == search->ndpr_ifp && pr->ndpr_plen == search->ndpr_plen && @@ -817,17 +845,19 @@ nd6_prefix_lookup(pr) break; } } + 
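defrtrlist_del now deletes the router and, if it was the primary, runs defrouter_select() before the same nd6_mutex hold ends, so no other thread can observe a primary pointer to a router that is already unlinked. In outline (nd6locked plays the same conditional-locking role as rt_locked earlier):

    #include <pthread.h>

    static pthread_mutex_t nd6_mutex = PTHREAD_MUTEX_INITIALIZER;

    static int primary = -1;

    static void
    select_primary(void)            /* stub reselection */
    {
        primary = 0;
    }

    /* Delete and reselect under one hold; 'nd6locked' lets callers
     * that already own the mutex reuse their hold. */
    static void
    router_delete(int id, int nd6locked)
    {
        if (!nd6locked)
            pthread_mutex_lock(&nd6_mutex);

        if (primary == id) {        /* unlinked router was the primary */
            primary = -1;
            select_primary();
        }

        if (!nd6locked)
            pthread_mutex_unlock(&nd6_mutex);
    }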
lck_mtx_unlock(nd6_mutex); return(search); } int -nd6_prelist_add(pr, dr, newp) - struct nd_prefix *pr, **newp; - struct nd_defrouter *dr; +nd6_prelist_add( + struct nd_prefix *pr, + struct nd_defrouter *dr, + struct nd_prefix **newp) { struct nd_prefix *new = NULL; - int i, s; + int i; new = (struct nd_prefix *)_MALLOC(sizeof(*new), M_IP6NDP, M_NOWAIT); if (new == NULL) @@ -845,16 +875,15 @@ nd6_prelist_add(pr, dr, newp) new->ndpr_prefix.sin6_addr.s6_addr32[i] &= new->ndpr_mask.s6_addr32[i]; - s = splnet(); /* link ndpr_entry to nd_prefix list */ + lck_mtx_lock(nd6_mutex); LIST_INSERT_HEAD(&nd_prefix, new, ndpr_entry); - splx(s); /* ND_OPT_PI_FLAG_ONLINK processing */ if (new->ndpr_raf_onlink) { int e; - if ((e = nd6_prefix_onlink(new)) != 0) { + if ((e = nd6_prefix_onlink(new, 0, 1)) != 0) { nd6log((LOG_ERR, "nd6_prelist_add: failed to make " "the prefix %s/%d on-link on %s (errno=%d)\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), @@ -866,16 +895,17 @@ nd6_prelist_add(pr, dr, newp) if (dr) { pfxrtr_add(new, dr); } + lck_mtx_unlock(nd6_mutex); return 0; } void -prelist_remove(pr) - struct nd_prefix *pr; +prelist_remove( + struct nd_prefix *pr, int nd6locked) { struct nd_pfxrouter *pfr, *next; - int e, s; + int e; /* make sure to invalidate the prefix until it is really freed. */ pr->ndpr_vltime = 0; @@ -900,8 +930,8 @@ prelist_remove(pr) if (pr->ndpr_refcnt > 0) return; /* notice here? */ - s = splnet(); - + if (nd6locked == 0) + lck_mtx_lock(nd6_mutex); /* unlink ndpr_entry from nd_prefix list */ LIST_REMOVE(pr, ndpr_entry); @@ -911,28 +941,29 @@ prelist_remove(pr) FREE(pfr, M_IP6NDP); } - splx(s); FREE(pr, M_IP6NDP); - pfxlist_onlink_check(); + pfxlist_onlink_check(1); + if (nd6locked == 0) + lck_mtx_unlock(nd6_mutex); } int -prelist_update(new, dr, m) - struct nd_prefix *new; - struct nd_defrouter *dr; /* may be NULL */ - struct mbuf *m; +prelist_update( + struct nd_prefix *new, + struct nd_defrouter *dr, /* may be NULL */ + struct mbuf *m) { struct in6_ifaddr *ia6 = NULL, *ia6_match = NULL; struct ifaddr *ifa; struct ifnet *ifp = new->ndpr_ifp; struct nd_prefix *pr; - int s = splnet(); int error = 0; int newprefix = 0; int auth; struct in6_addrlifetime lt6_tmp; + struct timeval timenow; auth = 0; if (m) { @@ -973,7 +1004,7 @@ prelist_update(new, dr, m) (pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { int e; - if ((e = nd6_prefix_onlink(pr)) != 0) { + if ((e = nd6_prefix_onlink(pr, 0, 0)) != 0) { nd6log((LOG_ERR, "prelist_update: failed to make " "the prefix %s/%d on-link on %s " @@ -983,9 +1014,11 @@ prelist_update(new, dr, m) /* proceed anyway. XXX: is it correct? */ } } - + + lck_mtx_lock(nd6_mutex); if (dr && pfxrtr_lookup(pr, dr) == NULL) pfxrtr_add(pr, dr); + lck_mtx_unlock(nd6_mutex); } else { struct nd_prefix *newpr = NULL; @@ -1050,6 +1083,9 @@ prelist_update(new, dr, m) * form an address. Note that even a manually configured address * should reject autoconfiguration of a new address. */ + getmicrotime(&timenow); + + ifnet_lock_exclusive(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { struct in6_ifaddr *ifa6; @@ -1092,7 +1128,7 @@ prelist_update(new, dr, m) lt6_tmp = ifa6->ia6_lifetime; storedlifetime = IFA6_IS_INVALID(ifa6) ? 
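prelist_remove zeroes the prefix lifetimes first ("make sure to invalidate the prefix until it is really freed") and bails out while references remain, so a concurrent holder sees an invalid prefix rather than freed memory. The ordering as a standalone routine:

    #include <stdlib.h>

    struct prefix {
        unsigned       vltime, pltime;
        int            refcnt;
        struct prefix *next;
    };

    static void
    prefix_remove(struct prefix **head, struct prefix *pr)
    {
        struct prefix **pp;

        pr->vltime = 0;             /* invalidate before anything else */
        pr->pltime = 0;

        if (pr->refcnt > 0)
            return;                 /* a holder triggers removal later */

        for (pp = head; *pp != NULL; pp = &(*pp)->next) {
            if (*pp == pr) {
                *pp = pr->next;     /* unlink, then free */
                break;
            }
        }
        free(pr);
    }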
0 : - (lt6_tmp.ia6t_expire - time_second); + (lt6_tmp.ia6t_expire - timenow.tv_sec); if (TWOHOUR < new->ndpr_vltime || storedlifetime < new->ndpr_vltime) { @@ -1146,6 +1182,7 @@ prelist_update(new, dr, m) ifa6->ia6_lifetime = lt6_tmp; } + ifnet_lock_done(ifp); if (ia6_match == NULL && new->ndpr_vltime) { /* * No address matched and the valid lifetime is non-zero. @@ -1190,7 +1227,7 @@ prelist_update(new, dr, m) * of other addresses, so we check and update it. * XXX: what if address duplication happens? */ - pfxlist_onlink_check(); + pfxlist_onlink_check(0); } else { /* just set an error. do not bark here. */ error = EADDRNOTAVAIL; /* XXX: might be unused. */ @@ -1200,7 +1237,6 @@ prelist_update(new, dr, m) afteraddrconf: end: - splx(s); return error; } @@ -1210,17 +1246,19 @@ prelist_update(new, dr, m) * XXX: lengthy function name... */ static struct nd_pfxrouter * -find_pfxlist_reachable_router(pr) - struct nd_prefix *pr; +find_pfxlist_reachable_router( + struct nd_prefix *pr) { struct nd_pfxrouter *pfxrtr; struct rtentry *rt; struct llinfo_nd6 *ln; + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + for (pfxrtr = LIST_FIRST(&pr->ndpr_advrtrs); pfxrtr; pfxrtr = LIST_NEXT(pfxrtr, pfr_entry)) { if ((rt = nd6_lookup(&pfxrtr->router->rtaddr, 0, - pfxrtr->router->ifp)) && + pfxrtr->router->ifp, 0)) && (ln = (struct llinfo_nd6 *)rt->rt_llinfo) && ND6_IS_LLINFO_PROBREACH(ln)) break; /* found */ @@ -1244,7 +1282,7 @@ find_pfxlist_reachable_router(pr) * is no router around us. */ void -pfxlist_onlink_check() +pfxlist_onlink_check(int nd6locked) { struct nd_prefix *pr; struct in6_ifaddr *ifa; @@ -1253,6 +1291,9 @@ pfxlist_onlink_check() * Check if there is a prefix that has a reachable advertising * router. */ + if (nd6locked == 0) + lck_mtx_lock(nd6_mutex); + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { if (pr->ndpr_raf_onlink && find_pfxlist_reachable_router(pr)) break; @@ -1327,7 +1368,7 @@ pfxlist_onlink_check() if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 && (pr->ndpr_stateflags & NDPRF_ONLINK) == 0 && pr->ndpr_raf_onlink) { - if ((e = nd6_prefix_onlink(pr)) != 0) { + if ((e = nd6_prefix_onlink(pr, 0, 1)) != 0) { nd6log((LOG_ERR, "pfxlist_onlink_check: failed to " "make %s/%d offlink, errno=%d\n", @@ -1345,7 +1386,7 @@ pfxlist_onlink_check() * always be attached. * The precise detection logic is same as the one for prefixes. */ - for (ifa = in6_ifaddr; ifa; ifa = ifa->ia_next) { + for (ifa = in6_ifaddrs; ifa; ifa = ifa->ia_next) { if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; @@ -1362,7 +1403,7 @@ pfxlist_onlink_check() break; } if (ifa) { - for (ifa = in6_ifaddr; ifa; ifa = ifa->ia_next) { + for (ifa = in6_ifaddrs; ifa; ifa = ifa->ia_next) { if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; @@ -1376,18 +1417,20 @@ pfxlist_onlink_check() } } else { - for (ifa = in6_ifaddr; ifa; ifa = ifa->ia_next) { + for (ifa = in6_ifaddrs; ifa; ifa = ifa->ia_next) { if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; ifa->ia6_flags &= ~IN6_IFF_DETACHED; } } + if (nd6locked == 0) + lck_mtx_unlock(nd6_mutex); } int -nd6_prefix_onlink(pr) - struct nd_prefix *pr; +nd6_prefix_onlink( + struct nd_prefix *pr, int rtlocked, int nd6locked) { struct ifaddr *ifa; struct ifnet *ifp = pr->ndpr_ifp; @@ -1412,6 +1455,10 @@ nd6_prefix_onlink(pr) * Although such a configuration is expected to be rare, we explicitly * allow it. 
*/ + if (nd6locked == 0) + lck_mtx_lock(nd6_mutex); + else + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); for (opr = nd_prefix.lh_first; opr; opr = opr->ndpr_next) { if (opr == pr) continue; @@ -1422,10 +1469,15 @@ nd6_prefix_onlink(pr) if (opr->ndpr_plen == pr->ndpr_plen && in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, &opr->ndpr_prefix.sin6_addr, - pr->ndpr_plen)) + pr->ndpr_plen)) { + if (nd6locked == 0) + lck_mtx_unlock(nd6_mutex); return(0); + } } + if (nd6locked == 0) + lck_mtx_unlock(nd6_mutex); /* * We prefer link-local addresses as the associated interface address. */ @@ -1435,11 +1487,13 @@ nd6_prefix_onlink(pr) IN6_IFF_ANYCAST); if (ifa == NULL) { /* XXX: freebsd does not have ifa_ifwithaf */ + ifnet_lock_exclusive(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family == AF_INET6) break; } + ifnet_lock_done(ifp); /* should we care about ia6_flags? */ } if (ifa == NULL) { @@ -1464,6 +1518,10 @@ nd6_prefix_onlink(pr) bzero(&mask6, sizeof(mask6)); mask6.sin6_len = sizeof(mask6); mask6.sin6_addr = pr->ndpr_mask; + + if (rtlocked == 0) + lck_mtx_lock(rt_mtx); + rtflags = ifa->ifa_flags | RTF_CLONING | RTF_UP; if (nd6_need_cache(ifp)) { /* explicitly set in case ifa_flags does not set the flag. */ @@ -1474,7 +1532,7 @@ nd6_prefix_onlink(pr) */ rtflags &= ~RTF_CLONING; } - error = rtrequest(RTM_ADD, (struct sockaddr *)&pr->ndpr_prefix, + error = rtrequest_locked(RTM_ADD, (struct sockaddr *)&pr->ndpr_prefix, ifa->ifa_addr, (struct sockaddr *)&mask6, rtflags, &rt); if (error == 0) { @@ -1495,12 +1553,14 @@ nd6_prefix_onlink(pr) if (rt != NULL) rtunref(rt); + if (rtlocked == 0) + lck_mtx_unlock(rt_mtx); return(error); } int -nd6_prefix_offlink(pr) - struct nd_prefix *pr; +nd6_prefix_offlink( + struct nd_prefix *pr) { int error = 0; struct ifnet *ifp = pr->ndpr_ifp; @@ -1525,7 +1585,8 @@ nd6_prefix_offlink(pr) mask6.sin6_family = AF_INET6; mask6.sin6_len = sizeof(sa6); bcopy(&pr->ndpr_mask, &mask6.sin6_addr, sizeof(struct in6_addr)); - error = rtrequest(RTM_DELETE, (struct sockaddr *)&sa6, NULL, + lck_mtx_lock(rt_mtx); + error = rtrequest_locked(RTM_DELETE, (struct sockaddr *)&sa6, NULL, (struct sockaddr *)&mask6, 0, &rt); if (error == 0) { pr->ndpr_stateflags &= ~NDPRF_ONLINK; @@ -1541,6 +1602,7 @@ nd6_prefix_offlink(pr) * If there's one, try to make the prefix on-link on the * interface. */ + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); for (opr = nd_prefix.lh_first; opr; opr = opr->ndpr_next) { if (opr == pr) continue; @@ -1561,7 +1623,7 @@ nd6_prefix_offlink(pr) pr->ndpr_plen)) { int e; - if ((e = nd6_prefix_onlink(opr)) != 0) { + if ((e = nd6_prefix_onlink(opr, 1, 1)) != 0) { nd6log((LOG_ERR, "nd6_prefix_offlink: failed to " "recover a prefix %s/%d from %s " @@ -1586,17 +1648,18 @@ nd6_prefix_offlink(pr) if (rt->rt_refcnt <= 0) { /* XXX: we should free the entry ourselves. 
*/ rtref(rt); - rtfree(rt); + rtfree_locked(rt); } } + lck_mtx_unlock(rt_mtx); return(error); } static struct in6_ifaddr * -in6_ifadd(pr, ifid) - struct nd_prefix *pr; - struct in6_addr *ifid; /* Mobile IPv6 addition */ +in6_ifadd( + struct nd_prefix *pr, + struct in6_addr *ifid) /* Mobile IPv6 addition */ { struct ifnet *ifp = pr->ndpr_ifp; struct ifaddr *ifa; @@ -1733,9 +1796,9 @@ in6_ifadd(pr, ifid) } int -in6_tmpifadd(ia0, forcegen) - const struct in6_ifaddr *ia0; /* corresponding public address */ - int forcegen; +in6_tmpifadd( + const struct in6_ifaddr *ia0, /* corresponding public address */ + int forcegen) { struct ifnet *ifp = ia0->ia_ifa.ifa_ifp; struct in6_ifaddr *newia; @@ -1744,6 +1807,9 @@ in6_tmpifadd(ia0, forcegen) int trylimit = 3; /* XXX: adhoc value */ u_int32_t randid[2]; time_t vltime0, pltime0; + struct timeval timenow; + + getmicrotime(&timenow); bzero(&ifra, sizeof(ifra)); strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name)); @@ -1790,14 +1856,14 @@ in6_tmpifadd(ia0, forcegen) */ if (ia0->ia6_lifetime.ia6t_expire != 0) { vltime0 = IFA6_IS_INVALID(ia0) ? 0 : - (ia0->ia6_lifetime.ia6t_expire - time_second); + (ia0->ia6_lifetime.ia6t_expire - timenow.tv_sec); if (vltime0 > ip6_temp_valid_lifetime) vltime0 = ip6_temp_valid_lifetime; } else vltime0 = ip6_temp_valid_lifetime; if (ia0->ia6_lifetime.ia6t_preferred != 0) { pltime0 = IFA6_IS_DEPRECATED(ia0) ? 0 : - (ia0->ia6_lifetime.ia6t_preferred - time_second); + (ia0->ia6_lifetime.ia6t_preferred - timenow.tv_sec); if (pltime0 > ip6_temp_preferred_lifetime - ip6_desync_factor){ pltime0 = ip6_temp_preferred_lifetime - ip6_desync_factor; @@ -1840,7 +1906,7 @@ in6_tmpifadd(ia0, forcegen) * and, in fact, we surely need the check when we create a new * temporary address due to deprecation of an old temporary address. */ - pfxlist_onlink_check(); + pfxlist_onlink_check(0); return(0); } @@ -1848,6 +1914,9 @@ in6_tmpifadd(ia0, forcegen) int in6_init_prefix_ltimes(struct nd_prefix *ndpr) { + struct timeval timenow; + + getmicrotime(&timenow); /* check if preferred lifetime > valid lifetime. RFC2462 5.5.3 (c) */ if (ndpr->ndpr_pltime > ndpr->ndpr_vltime) { nd6log((LOG_INFO, "in6_init_prefix_ltimes: preferred lifetime" @@ -1858,11 +1927,11 @@ in6_init_prefix_ltimes(struct nd_prefix *ndpr) if (ndpr->ndpr_pltime == ND6_INFINITE_LIFETIME) ndpr->ndpr_preferred = 0; else - ndpr->ndpr_preferred = time_second + ndpr->ndpr_pltime; + ndpr->ndpr_preferred = timenow.tv_sec + ndpr->ndpr_pltime; if (ndpr->ndpr_vltime == ND6_INFINITE_LIFETIME) ndpr->ndpr_expire = 0; else - ndpr->ndpr_expire = time_second + ndpr->ndpr_vltime; + ndpr->ndpr_expire = timenow.tv_sec + ndpr->ndpr_vltime; return 0; } @@ -1870,12 +1939,15 @@ in6_init_prefix_ltimes(struct nd_prefix *ndpr) static void in6_init_address_ltimes(struct nd_prefix *new, struct in6_addrlifetime *lt6) { + struct timeval timenow; + + getmicrotime(&timenow); /* Valid lifetime must not be updated unless explicitly specified. 
*/ /* init ia6t_expire */ if (lt6->ia6t_vltime == ND6_INFINITE_LIFETIME) lt6->ia6t_expire = 0; else { - lt6->ia6t_expire = time_second; + lt6->ia6t_expire = timenow.tv_sec; lt6->ia6t_expire += lt6->ia6t_vltime; } @@ -1883,7 +1955,7 @@ in6_init_address_ltimes(struct nd_prefix *new, struct in6_addrlifetime *lt6) if (lt6->ia6t_pltime == ND6_INFINITE_LIFETIME) lt6->ia6t_preferred = 0; else { - lt6->ia6t_preferred = time_second; + lt6->ia6t_preferred = timenow.tv_sec; lt6->ia6t_preferred += lt6->ia6t_pltime; } } @@ -1894,34 +1966,35 @@ in6_init_address_ltimes(struct nd_prefix *new, struct in6_addrlifetime *lt6) * it shouldn't be called when acting as a router. */ void -rt6_flush(gateway, ifp) - struct in6_addr *gateway; - struct ifnet *ifp; +rt6_flush( + struct in6_addr *gateway, + struct ifnet *ifp) { struct radix_node_head *rnh = rt_tables[AF_INET6]; - int s = splnet(); /* We'll care only link-local addresses */ if (!IN6_IS_ADDR_LINKLOCAL(gateway)) { - splx(s); return; } + lck_mtx_lock(rt_mtx); /* XXX: hack for KAME's link-local address kludge */ gateway->s6_addr16[1] = htons(ifp->if_index); rnh->rnh_walktree(rnh, rt6_deleteroute, (void *)gateway); - splx(s); + lck_mtx_unlock(rt_mtx); } static int -rt6_deleteroute(rn, arg) - struct radix_node *rn; - void *arg; +rt6_deleteroute( + struct radix_node *rn, + void *arg) { #define SIN6(s) ((struct sockaddr_in6 *)s) struct rtentry *rt = (struct rtentry *)rn; struct in6_addr *gate = (struct in6_addr *)arg; + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); + if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6) return(0); @@ -1943,20 +2016,21 @@ rt6_deleteroute(rn, arg) if ((rt->rt_flags & RTF_HOST) == 0) return(0); - return(rtrequest(RTM_DELETE, rt_key(rt), + return(rtrequest_locked(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0)); #undef SIN6 } int -nd6_setdefaultiface(ifindex) - int ifindex; +nd6_setdefaultiface( + int ifindex) { int error = 0; if (ifindex < 0 || if_index < ifindex) return(EINVAL); + lck_mtx_lock(nd6_mutex); if (nd6_defifindex != ifindex) { nd6_defifindex = ifindex; if (nd6_defifindex > 0) @@ -1983,5 +2057,6 @@ nd6_setdefaultiface(ifindex) scope6_setdefault(nd6_defifp); } + lck_mtx_unlock(nd6_mutex); return(error); } diff --git a/bsd/netinet6/pim6_var.h b/bsd/netinet6/pim6_var.h index 29abc1192..1cb8ec648 100644 --- a/bsd/netinet6/pim6_var.h +++ b/bsd/netinet6/pim6_var.h @@ -42,7 +42,6 @@ * Modified by Pavlin Ivanov Radoslavov, USC/ISI, May 1998 */ -#ifdef __APPLE_API_UNSTABLE struct pim6stat { u_quad_t pim6s_rcv_total; /* total PIM messages received */ u_quad_t pim6s_rcv_tooshort; /* received with too few bytes */ @@ -52,15 +51,9 @@ struct pim6stat { u_quad_t pim6s_rcv_badregisters; /* received invalid registers */ u_quad_t pim6s_snd_registers; /* sent registers */ }; -#endif -#if (defined(KERNEL)) || (defined(_KERNEL)) -#ifdef __APPLE_API_PRIVATE extern struct pim6stat pim6stat; -int pim6_input __P((struct mbuf **, int*)); -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ /* * Names for PIM sysctl objects @@ -68,8 +61,13 @@ int pim6_input __P((struct mbuf **, int*)); #define PIM6CTL_STATS 1 /* statistics (read-only) */ #define PIM6CTL_MAXID 2 +#ifdef KERNEL_PRIVATE #define PIM6CTL_NAMES { \ { 0, 0 }, \ { 0, 0 }, \ } -#endif /* _NETINET6_PIM6_VAR_H_ */ + +int pim6_input(struct mbuf **, int*); + +#endif KERNEL_PRIVATE +#endif _NETINET6_PIM6_VAR_H_ diff --git a/bsd/netinet6/raw_ip6.c b/bsd/netinet6/raw_ip6.c index 97eca96dc..7e3094d30 100644 --- a/bsd/netinet6/raw_ip6.c +++ 
b/bsd/netinet6/raw_ip6.c @@ -92,11 +92,13 @@ #include <netinet6/scope6_var.h> #endif #include <netinet6/raw_ip6.h> +#include <netinet6/ip6_fw.h> #if IPSEC #include <netinet6/ipsec.h> #include <netinet6/ipsec6.h> extern int ipsec_bypass; +extern lck_mtx_t *sadb_mutex; #endif /*IPSEC*/ @@ -116,6 +118,7 @@ extern struct inpcbhead ripcb; extern struct inpcbinfo ripcbinfo; extern u_long rip_sendspace; extern u_long rip_recvspace; +extern u_long route_generation; struct rip6stat rip6stat; @@ -125,9 +128,9 @@ struct rip6stat rip6stat; * mbuf chain. */ int -rip6_input(mp, offp) - struct mbuf **mp; - int *offp; +rip6_input( + struct mbuf **mp, + int *offp) { struct mbuf *m = *mp; register struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); @@ -149,6 +152,7 @@ rip6_input(mp, offp) init_sin6(&rip6src, m); /* general init */ + lck_rw_lock_shared(ripcbinfo.mtx); LIST_FOREACH(in6p, &ripcb, inp_list) { if ((in6p->in6p_vflag & INP_IPV6) == 0) continue; @@ -176,10 +180,14 @@ rip6_input(mp, offp) /* * Check AH/ESP integrity. */ - if (ipsec_bypass == 0 && n && ipsec6_in_reject_so(n, last->inp_socket)) { - m_freem(n); - ipsec6stat.in_polvio++; - /* do not inject data into pcb */ + if (ipsec_bypass == 0 && n) { + lck_mtx_lock(sadb_mutex); + if (ipsec6_in_reject_so(n, last->inp_socket)) { + m_freem(n); + ipsec6stat.in_polvio++; + /* do not inject data into pcb */ + } + lck_mtx_unlock(sadb_mutex); } else #endif /*IPSEC*/ if (n) { @@ -190,10 +198,7 @@ rip6_input(mp, offp) m_adj(n, *offp); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&rip6src, - n, opts) == 0) { - m_freem(n); - if (opts) - m_freem(opts); + n, opts, NULL) == 0) { rip6stat.rip6s_fullsock++; } else sorwakeup(last->in6p_socket); @@ -202,15 +207,20 @@ rip6_input(mp, offp) } last = in6p; } + lck_rw_done(ripcbinfo.mtx); #if IPSEC /* * Check AH/ESP integrity. */ - if (ipsec_bypass == 0 && last && ipsec6_in_reject_so(m, last->inp_socket)) { - m_freem(m); - ipsec6stat.in_polvio++; - ip6stat.ip6s_delivered--; - /* do not inject data into pcb */ + if (ipsec_bypass == 0 && last) { + lck_mtx_lock(sadb_mutex); + if (ipsec6_in_reject_so(m, last->inp_socket)) { + m_freem(m); + ipsec6stat.in_polvio++; + ip6stat.ip6s_delivered--; + /* do not inject data into pcb */ + } + lck_mtx_unlock(sadb_mutex); } else #endif /*IPSEC*/ if (last) { @@ -220,10 +230,7 @@ rip6_input(mp, offp) /* strip intermediate headers */ m_adj(m, *offp); if (sbappendaddr(&last->in6p_socket->so_rcv, - (struct sockaddr *)&rip6src, m, opts) == 0) { - m_freem(m); - if (opts) - m_freem(opts); + (struct sockaddr *)&rip6src, m, opts, NULL) == 0) { rip6stat.rip6s_fullsock++; } else sorwakeup(last->in6p_socket); @@ -245,17 +252,17 @@ rip6_input(mp, offp) } void -rip6_ctlinput(cmd, sa, d) - int cmd; - struct sockaddr *sa; - void *d; +rip6_ctlinput( + int cmd, + struct sockaddr *sa, + void *d) { struct ip6_hdr *ip6; struct mbuf *m; int off = 0; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; - void (*notify) __P((struct inpcb *, int)) = in6_rtchange; + void (*notify)(struct inpcb *, int) = in6_rtchange; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) @@ -283,7 +290,7 @@ rip6_ctlinput(cmd, sa, d) sa6_src = &sa6_any; } - (void) in6_pcbnotify(&ripcb, sa, 0, (struct sockaddr *)sa6_src, + (void) in6_pcbnotify(&ripcbinfo, sa, 0, (struct sockaddr *)sa6_src, 0, cmd, notify); } @@ -292,11 +299,11 @@ rip6_ctlinput(cmd, sa, d) * Tack on options user may have setup with control call. 
*/ int -rip6_output(m, so, dstsock, control) - register struct mbuf *m; - struct socket *so; - struct sockaddr_in6 *dstsock; - struct mbuf *control; +rip6_output( + register struct mbuf *m, + struct socket *so, + struct sockaddr_in6 *dstsock, + struct mbuf *control) { struct in6_addr *dst; struct ip6_hdr *ip6; @@ -383,12 +390,13 @@ rip6_output(m, so, dstsock, control) */ { struct in6_addr *in6a; + struct in6_addr storage; if ((in6a = in6_selectsrc(dstsock, optp, in6p->in6p_moptions, &in6p->in6p_route, &in6p->in6p_laddr, - &error)) == 0) { + &storage, &error)) == 0) { if (error == 0) error = EADDRNOTAVAIL; goto bad; @@ -441,8 +449,13 @@ rip6_output(m, so, dstsock, control) } #endif /*IPSEC*/ + if (in6p->in6p_route.ro_rt && in6p->in6p_route.ro_rt->generation_id != route_generation) { + rtfree(in6p->in6p_route.ro_rt); + in6p->in6p_route.ro_rt = (struct rtentry *)0; + } + error = ip6_output(m, optp, &in6p->in6p_route, 0, - in6p->in6p_moptions, &oifp); + in6p->in6p_moptions, &oifp, 0); if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (oifp) icmp6_ifoutstat_inc(oifp, type, code); @@ -467,13 +480,19 @@ rip6_output(m, so, dstsock, control) return(error); } +static void +load_ip6fw() +{ + ip6_fw_init(); +} + /* * Raw IPv6 socket option processing. */ int -rip6_ctloutput(so, sopt) - struct socket *so; - struct sockopt *sopt; +rip6_ctloutput( + struct socket *so, + struct sockopt *sopt) { int error; @@ -491,6 +510,16 @@ rip6_ctloutput(so, sopt) switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { + case IPV6_FW_ADD: + case IPV6_FW_GET: + if (ip6_fw_ctl_ptr == 0) + load_ip6fw(); + if (ip6_fw_ctl_ptr) + error = ip6_fw_ctl_ptr(sopt); + else + error = ENOPROTOOPT; + break; + case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: @@ -508,6 +537,18 @@ rip6_ctloutput(so, sopt) case SOPT_SET: switch (sopt->sopt_name) { + case IPV6_FW_ADD: + case IPV6_FW_DEL: + case IPV6_FW_FLUSH: + case IPV6_FW_ZERO: + if (ip6_fw_ctl_ptr == 0) + load_ip6fw(); + if (ip6_fw_ctl_ptr) + error = ip6_fw_ctl_ptr(sopt); + else + error = ENOPROTOOPT; + break; + case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: @@ -536,7 +577,7 @@ rip6_attach(struct socket *so, int proto, struct proc *p) inp = sotoinpcb(so); if (inp) panic("rip6_attach"); - if (p && (error = suser(p->p_ucred, &p->p_acflag)) != 0) + if (p && (error = proc_suser(p)) != 0) return error; error = soreserve(so, rip_sendspace, rip_recvspace); @@ -607,7 +648,7 @@ rip6_bind(struct socket *so, struct sockaddr *nam, struct proc *p) if (nam->sa_len != sizeof(*addr)) return EINVAL; - if (TAILQ_EMPTY(&ifnet) || addr->sin6_family != AF_INET6) + if (TAILQ_EMPTY(&ifnet_head) || addr->sin6_family != AF_INET6) return EADDRNOTAVAIL; #if ENABLE_DEFAULT_SCOPE if (addr->sin6_scope_id == 0) { /* not change if specified */ @@ -621,8 +662,10 @@ rip6_bind(struct socket *so, struct sockaddr *nam, struct proc *p) ((struct in6_ifaddr *)ia)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { + if (ia) ifafree(ia); return(EADDRNOTAVAIL); } + ifafree(ia); inp->in6p_laddr = addr->sin6_addr; return 0; } @@ -633,6 +676,7 @@ rip6_connect(struct socket *so, struct sockaddr *nam, struct proc *p) struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct in6_addr *in6a = NULL; + struct in6_addr storage; int error = 0; #if ENABLE_DEFAULT_SCOPE struct sockaddr_in6 tmp; @@ -640,7 +684,7 @@ rip6_connect(struct socket *so, struct sockaddr *nam, struct proc *p) if (nam->sa_len != sizeof(*addr)) return EINVAL; - if 
(TAILQ_EMPTY(&ifnet)) + if (TAILQ_EMPTY(&ifnet_head)) return EADDRNOTAVAIL; if (addr->sin6_family != AF_INET6) return EAFNOSUPPORT; @@ -655,7 +699,7 @@ rip6_connect(struct socket *so, struct sockaddr *nam, struct proc *p) /* Source address selection. XXX: need pcblookup? */ in6a = in6_selectsrc(addr, inp->in6p_outputopts, inp->in6p_moptions, &inp->in6p_route, - &inp->in6p_laddr, &error); + &inp->in6p_laddr, &storage, &error); if (in6a == NULL) return (error ? error : EADDRNOTAVAIL); inp->in6p_laddr = *in6a; @@ -713,5 +757,5 @@ struct pr_usrreqs rip6_usrreqs = { pru_connect2_notsupp, in6_control, rip6_detach, rip6_disconnect, pru_listen_notsupp, in6_setpeeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, rip6_send, pru_sense_null, rip6_shutdown, - in6_setsockaddr, sosend, soreceive, sopoll + in6_setsockaddr, sosend, soreceive, pru_sopoll_notsupp }; diff --git a/bsd/netinet6/raw_ip6.h b/bsd/netinet6/raw_ip6.h index 879428b11..30cf70e60 100644 --- a/bsd/netinet6/raw_ip6.h +++ b/bsd/netinet6/raw_ip6.h @@ -48,10 +48,6 @@ struct rip6stat { u_quad_t rip6s_opackets; /* total output packets */ }; -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE extern struct rip6stat rip6stat; -#endif -#endif #endif diff --git a/bsd/netinet6/route6.c b/bsd/netinet6/route6.c index acd4263cb..9e01f4b16 100644 --- a/bsd/netinet6/route6.c +++ b/bsd/netinet6/route6.c @@ -44,8 +44,8 @@ #include <netinet/icmp6.h> -static int ip6_rthdr0 __P((struct mbuf *, struct ip6_hdr *, - struct ip6_rthdr0 *)); +static int ip6_rthdr0(struct mbuf *, struct ip6_hdr *, + struct ip6_rthdr0 *); int route6_input(mp, offp) @@ -70,7 +70,7 @@ route6_input(mp, offp) } #ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, sizeof(*rh), IPPROTO_DONE); + IP6_EXTHDR_CHECK(m, off, sizeof(*rh), return IPPROTO_DONE); ip6 = mtod(m, struct ip6_hdr *); rh = (struct ip6_rthdr *)((caddr_t)ip6 + off); #else @@ -91,7 +91,7 @@ route6_input(mp, offp) * due to IP6_EXTHDR_CHECK assumption, we cannot handle * very big routing header (max rhlen == 2048). */ - IP6_EXTHDR_CHECK(m, off, rhlen, IPPROTO_DONE); + IP6_EXTHDR_CHECK(m, off, rhlen, return IPPROTO_DONE); #else /* * note on option length: @@ -207,11 +207,11 @@ ip6_rthdr0(m, ip6, rh0) #if COMPAT_RFC1883 if (rh0->ip6r0_slmap[index / 8] & (1 << (7 - (index % 8)))) - ip6_forward(m, IPV6_SRCRT_NEIGHBOR); + ip6_forward(m, IPV6_SRCRT_NEIGHBOR, 0); else - ip6_forward(m, IPV6_SRCRT_NOTNEIGHBOR); + ip6_forward(m, IPV6_SRCRT_NOTNEIGHBOR, 0); #else - ip6_forward(m, 1); + ip6_forward(m, 1, 0); #endif return(-1); /* m would be freed in ip6_forward() */ diff --git a/bsd/netinet6/scope6.c b/bsd/netinet6/scope6.c index ed33f804d..845695f71 100644 --- a/bsd/netinet6/scope6.c +++ b/bsd/netinet6/scope6.c @@ -56,8 +56,8 @@ static size_t if_indexlim = 8; struct scope6_id *scope6_ids = NULL; void -scope6_ifattach(ifp) - struct ifnet *ifp; +scope6_ifattach( + struct ifnet *ifp) { int s = splnet(); @@ -108,9 +108,9 @@ scope6_ifattach(ifp) } int -scope6_set(ifp, idlist) - struct ifnet *ifp; - u_int32_t *idlist; +scope6_set( + struct ifnet *ifp, + u_int32_t *idlist) { int i, s; int error = 0; @@ -159,9 +159,9 @@ scope6_set(ifp, idlist) } int -scope6_get(ifp, idlist) - struct ifnet *ifp; - u_int32_t *idlist; +scope6_get( + struct ifnet *ifp, + u_int32_t *idlist) { if (scope6_ids == NULL) /* paranoid? 
*/ return(EINVAL); @@ -233,18 +233,19 @@ struct in6_addr *addr; } int -in6_addr2scopeid(ifp, addr) - struct ifnet *ifp; /* must not be NULL */ - struct in6_addr *addr; /* must not be NULL */ +in6_addr2scopeid( + struct ifnet *ifp, /* must not be NULL */ + struct in6_addr *addr) /* must not be NULL */ { int scope = in6_addrscope(addr); + int index = ifp->if_index; if (scope6_ids == NULL) /* paranoid? */ return(0); /* XXX */ - if (ifp->if_index >= if_indexlim) + if (index >= if_indexlim) return(0); /* XXX */ -#define SID scope6_ids[ifp->if_index] +#define SID scope6_ids[index] switch(scope) { case IPV6_ADDR_SCOPE_NODELOCAL: return(-1); /* XXX: is this an appropriate value? */ @@ -265,8 +266,8 @@ in6_addr2scopeid(ifp, addr) } void -scope6_setdefault(ifp) - struct ifnet *ifp; /* note that this might be NULL */ +scope6_setdefault( + struct ifnet *ifp) /* note that this might be NULL */ { /* * Currently, this function just set the default "link" according to @@ -283,8 +284,8 @@ scope6_setdefault(ifp) } int -scope6_get_default(idlist) - u_int32_t *idlist; +scope6_get_default( + u_int32_t *idlist) { if (scope6_ids == NULL) /* paranoid? */ return(EINVAL); @@ -296,8 +297,8 @@ scope6_get_default(idlist) } u_int32_t -scope6_addr2default(addr) - struct in6_addr *addr; +scope6_addr2default( + struct in6_addr *addr) { return(scope6_ids[0].s6id_list[in6_addrscope(addr)]); } diff --git a/bsd/netinet6/scope6_var.h b/bsd/netinet6/scope6_var.h index 5831fde09..d7fd15e77 100644 --- a/bsd/netinet6/scope6_var.h +++ b/bsd/netinet6/scope6_var.h @@ -34,16 +34,14 @@ #define _NETINET6_SCOPE6_VAR_H_ #include <sys/appleapiopts.h> -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -void scope6_ifattach __P((struct ifnet *)); -int scope6_set __P((struct ifnet *, u_int32_t *)); -int scope6_get __P((struct ifnet *, u_int32_t *)); -void scope6_setdefault __P((struct ifnet *)); -int scope6_get_default __P((u_int32_t *)); -u_int32_t scope6_in6_addrscope __P((struct in6_addr *)); -u_int32_t scope6_addr2default __P((struct in6_addr *)); -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#ifdef KERNEL_PRIVATE +void scope6_ifattach(struct ifnet *); +int scope6_set(struct ifnet *, u_int32_t *); +int scope6_get(struct ifnet *, u_int32_t *); +void scope6_setdefault(struct ifnet *); +int scope6_get_default(u_int32_t *); +u_int32_t scope6_in6_addrscope(struct in6_addr *); +u_int32_t scope6_addr2default(struct in6_addr *); +#endif KERNEL_PRIVATE -#endif /* _NETINET6_SCOPE6_VAR_H_ */ +#endif _NETINET6_SCOPE6_VAR_H_ diff --git a/bsd/netinet6/tcp6_var.h b/bsd/netinet6/tcp6_var.h index 286307c32..5b535dda5 100644 --- a/bsd/netinet6/tcp6_var.h +++ b/bsd/netinet6/tcp6_var.h @@ -69,8 +69,7 @@ #define _NETINET_TCP6_VAR_H_ #include <sys/appleapiopts.h> -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet6_tcp6); #endif @@ -78,14 +77,13 @@ SYSCTL_DECL(_net_inet6_tcp6); extern int tcp_v6mssdflt; /* XXX */ struct ip6_hdr; -void tcp6_ctlinput __P((int, struct sockaddr *, void *)); -void tcp6_init __P((void)); -int tcp6_input __P((struct mbuf **, int *)); -struct rtentry *tcp_rtlookup6 __P((struct inpcb *)); +void tcp6_ctlinput(int, struct sockaddr *, void *); +void tcp6_init(void); +int tcp6_input(struct mbuf **, int *); +struct rtentry *tcp_rtlookup6(struct inpcb *); extern struct pr_usrreqs tcp6_usrreqs; -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#endif KERNEL_PRIVATE -#endif /* _NETINET_TCP6_VAR_H_ */ +#endif _NETINET_TCP6_VAR_H_ diff --git a/bsd/netinet6/udp6_output.c 
b/bsd/netinet6/udp6_output.c index 8d8c6df34..226b199c0 100644 --- a/bsd/netinet6/udp6_output.c +++ b/bsd/netinet6/udp6_output.c @@ -118,6 +118,21 @@ extern int ipsec_bypass; #define udp6stat udpstat #define udp6s_opackets udps_opackets +static __inline__ u_int16_t +get_socket_id(struct socket * s) +{ + u_int16_t val; + + if (s == NULL) { + return (0); + } + val = (u_int16_t)(((u_int32_t)s) / sizeof(struct socket)); + if (val == 0) { + val = 0xffff; + } + return (val); +} + int udp6_output(in6p, m, addr6, control, p) struct in6pcb *in6p; @@ -138,10 +153,11 @@ udp6_output(in6p, m, addr6, control, p) int af = AF_INET6, hlen = sizeof(struct ip6_hdr); int flags; struct sockaddr_in6 tmp; + struct in6_addr storage; priv = 0; #ifdef __APPLE__ - if (p && !suser(p->p_ucred, &p->p_acflag)) + if (p && !proc_suser(p)) #else if (p && !suser(p)) #endif @@ -208,7 +224,7 @@ udp6_output(in6p, m, addr6, control, p) laddr = in6_selectsrc(sin6, in6p->in6p_outputopts, in6p->in6p_moptions, &in6p->in6p_route, - &in6p->in6p_laddr, &error); + &in6p->in6p_laddr, &storage, &error); } else laddr = &in6p->in6p_laddr; /* XXX */ if (laddr == NULL) { @@ -217,7 +233,7 @@ udp6_output(in6p, m, addr6, control, p) goto release; } if (in6p->in6p_lport == 0 && - (error = in6_pcbsetport(laddr, in6p, p)) != 0) + (error = in6_pcbsetport(laddr, in6p, p, 0)) != 0) goto release; } else { if (IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) { @@ -300,8 +316,9 @@ udp6_output(in6p, m, addr6, control, p) goto release; } #endif /*IPSEC*/ + m->m_pkthdr.socket_id = get_socket_id(in6p->in6p_socket); error = ip6_output(m, in6p->in6p_outputopts, &in6p->in6p_route, - flags, in6p->in6p_moptions, NULL); + flags, in6p->in6p_moptions, NULL, 0); break; case AF_INET: error = EAFNOSUPPORT; diff --git a/bsd/netinet6/udp6_usrreq.c b/bsd/netinet6/udp6_usrreq.c index 9bab08c72..3be50feec 100644 --- a/bsd/netinet6/udp6_usrreq.c +++ b/bsd/netinet6/udp6_usrreq.c @@ -78,6 +78,7 @@ #include <sys/systm.h> #include <sys/syslog.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <net/if.h> #include <net/route.h> @@ -102,6 +103,8 @@ #include <netinet6/ipsec.h> #include <netinet6/ipsec6.h> extern int ipsec_bypass; +extern lck_mtx_t *sadb_mutex; +extern lck_mtx_t *nd6_mutex; #endif /*IPSEC*/ #include "faith.h" @@ -115,14 +118,26 @@ extern int ipsec_bypass; */ extern struct protosw inetsw[]; -static int in6_mcmatch __P((struct inpcb *, struct in6_addr *, struct ifnet *)); -static int udp6_detach __P((struct socket *so)); +static int in6_mcmatch(struct inpcb *, struct in6_addr *, struct ifnet *); +static int udp6_detach(struct socket *so); + + +extern void ipfwsyslog( int level, char *format,...); +extern int fw_verbose; + +#define log_in_vain_log( a ) { \ + if ( (log_in_vain == 3 ) && (fw_verbose == 2)) { /* Apple logging, log to ipfw.log */ \ + ipfwsyslog a ; \ + } \ + else log a ; \ +} + static int -in6_mcmatch(in6p, ia6, ifp) - struct inpcb *in6p; - register struct in6_addr *ia6; - struct ifnet *ifp; +in6_mcmatch( + struct inpcb *in6p, + register struct in6_addr *ia6, + struct ifnet *ifp) { struct ip6_moptions *im6o = in6p->in6p_moptions; struct in6_multi_mship *imm; @@ -130,21 +145,25 @@ in6_mcmatch(in6p, ia6, ifp) if (im6o == NULL) return 0; + lck_mtx_lock(nd6_mutex); for (imm = im6o->im6o_memberships.lh_first; imm != NULL; imm = imm->i6mm_chain.le_next) { if ((ifp == NULL || imm->i6mm_maddr->in6m_ifp == ifp) && IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr, - ia6)) + ia6)) { + lck_mtx_unlock(nd6_mutex); return 1; + } } + lck_mtx_unlock(nd6_mutex); return 0; } 
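The get_socket_id() helper added to udp6_output.c above tags each outgoing UDP6 mbuf (via m->m_pkthdr.socket_id) with a 16-bit value derived from the owning socket's address, reserving 0 to mean "no socket". A minimal userspace sketch of the same scheme follows; here struct socket is a padded stand-in and get_object_id an illustrative name, not the kernel API:

/*
 * Sketch of the socket-id scheme: scale the object's address down by
 * the object size so neighboring allocations tend to map to distinct
 * ids, truncate to 16 bits, and remap 0 to 0xffff so that 0 stays
 * reserved for "no socket".  Userspace illustration only.
 */
#include <stdio.h>
#include <stdint.h>

struct socket { char pad[768]; };	/* stand-in for the kernel struct */

static uint16_t
get_object_id(struct socket *s)
{
	uint16_t val;

	if (s == NULL)
		return (0);
	val = (uint16_t)((uintptr_t)s / sizeof(struct socket));
	if (val == 0)
		val = 0xffff;		/* keep 0 reserved for "none" */
	return (val);
}

int
main(void)
{
	struct socket a, b;

	printf("id(a)=%u id(b)=%u id(NULL)=%u\n",
	    (unsigned)get_object_id(&a), (unsigned)get_object_id(&b),
	    (unsigned)get_object_id(NULL));
	return (0);
}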
int -udp6_input(mp, offp) - struct mbuf **mp; - int *offp; +udp6_input( + struct mbuf **mp, + int *offp) { struct mbuf *m = *mp; register struct ip6_hdr *ip6; @@ -154,8 +173,9 @@ udp6_input(mp, offp) int off = *offp; int plen, ulen; struct sockaddr_in6 udp_in6; + struct inpcbinfo *pcbinfo = &udbinfo; - IP6_EXTHDR_CHECK(m, off, sizeof(struct udphdr), IPPROTO_DONE); + IP6_EXTHDR_CHECK(m, off, sizeof(struct udphdr), return IPPROTO_DONE); ip6 = mtod(m, struct ip6_hdr *); @@ -234,23 +254,42 @@ udp6_input(mp, offp) * (Algorithm copied from raw_intr().) */ last = NULL; + lck_rw_lock_shared(pcbinfo->mtx); + LIST_FOREACH(in6p, &udb, inp_list) { + if ((in6p->inp_vflag & INP_IPV6) == 0) continue; - if (in6p->in6p_lport != uh->uh_dport) + + if (in_pcb_checkstate(in6p, WNT_ACQUIRE, 0) == WNT_STOPUSING) + continue; + + udp_lock(in6p->in6p_socket, 1, 0); + + if (in_pcb_checkstate(in6p, WNT_RELEASE, 1) == WNT_STOPUSING) { + udp_unlock(in6p->in6p_socket, 1, 0); + continue; + } + if (in6p->in6p_lport != uh->uh_dport) { + udp_unlock(in6p->in6p_socket, 1, 0); continue; + } if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) { if (!IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst) && !in6_mcmatch(in6p, &ip6->ip6_dst, - m->m_pkthdr.rcvif)) + m->m_pkthdr.rcvif)) { + udp_unlock(in6p->in6p_socket, 1, 0); continue; + } } if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) { if (!IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src) || - in6p->in6p_fport != uh->uh_sport) + in6p->in6p_fport != uh->uh_sport) { + udp_unlock(in6p->in6p_socket, 1, 0); continue; + } } if (last != NULL) { @@ -260,9 +299,13 @@ udp6_input(mp, offp) /* * Check AH/ESP integrity. */ - if (ipsec_bypass == 0 && ipsec6_in_reject_so(m, last->inp_socket)) - ipsec6stat.in_polvio++; + if (ipsec_bypass == 0) { + lck_mtx_lock(sadb_mutex); + if (ipsec6_in_reject_so(m, last->inp_socket)) + ipsec6stat.in_polvio++; /* do not inject data into pcb */ + lck_mtx_unlock(sadb_mutex); + } else #endif /*IPSEC*/ if ((n = m_copy(m, 0, M_COPYALL)) != NULL) { @@ -281,15 +324,13 @@ udp6_input(mp, offp) m_adj(n, off + sizeof(struct udphdr)); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&udp_in6, - n, opts) == 0) { - m_freem(n); - if (opts) - m_freem(opts); + n, opts, NULL) == 0) { udpstat.udps_fullsock++; } else sorwakeup(last->in6p_socket); opts = NULL; } + udp_unlock(last->in6p_socket, 1, 0); } last = in6p; /* @@ -304,6 +345,7 @@ udp6_input(mp, offp) (SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } + lck_rw_done(pcbinfo->mtx); if (last == NULL) { /* @@ -321,9 +363,15 @@ udp6_input(mp, offp) /* * Check AH/ESP integrity. 
*/ - if (ipsec_bypass == 0 && ipsec6_in_reject_so(m, last->inp_socket)) { - ipsec6stat.in_polvio++; - goto bad; + if (ipsec_bypass == 0) { + lck_mtx_lock(sadb_mutex); + if (ipsec6_in_reject_so(m, last->inp_socket)) { + ipsec6stat.in_polvio++; + lck_mtx_unlock(sadb_mutex); + udp_unlock(last->in6p_socket, 1, 0); + goto bad; + } + lck_mtx_unlock(sadb_mutex); } #endif /*IPSEC*/ if (last->in6p_flags & IN6P_CONTROLOPTS @@ -333,11 +381,15 @@ udp6_input(mp, offp) m_adj(m, off + sizeof(struct udphdr)); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&udp_in6, - m, opts) == 0) { + m, opts, NULL) == 0) { udpstat.udps_fullsock++; + m = NULL; + opts = NULL; + udp_unlock(last->in6p_socket, 1, 0); goto bad; } sorwakeup(last->in6p_socket); + udp_unlock(last->in6p_socket, 1, 0); return IPPROTO_DONE; } /* @@ -351,10 +403,17 @@ udp6_input(mp, offp) char buf[INET6_ADDRSTRLEN]; strcpy(buf, ip6_sprintf(&ip6->ip6_dst)); - log(LOG_INFO, - "Connection attempt to UDP %s:%d from %s:%d\n", - buf, ntohs(uh->uh_dport), - ip6_sprintf(&ip6->ip6_src), ntohs(uh->uh_sport)); + if (log_in_vain != 3) + log(LOG_INFO, + "Connection attempt to UDP %s:%d from %s:%d\n", + buf, ntohs(uh->uh_dport), + ip6_sprintf(&ip6->ip6_src), ntohs(uh->uh_sport)); + else if (!(m->m_flags & (M_BCAST | M_MCAST)) && + !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) + log_in_vain_log((LOG_INFO, + "Connection attempt to UDP %s:%d from %s:%d\n", + buf, ntohs(uh->uh_dport), + ip6_sprintf(&ip6->ip6_src), ntohs(uh->uh_sport))); } udpstat.udps_noport++; if (m->m_flags & M_MCAST) { @@ -371,9 +430,15 @@ udp6_input(mp, offp) /* * Check AH/ESP integrity. */ - if (ipsec_bypass == 0 && ipsec6_in_reject_so(m, in6p->in6p_socket)) { - ipsec6stat.in_polvio++; - goto bad; + if (ipsec_bypass == 0) { + lck_mtx_lock(sadb_mutex); + if (ipsec6_in_reject_so(m, in6p->in6p_socket)) { + ipsec6stat.in_polvio++; + lck_mtx_unlock(sadb_mutex); + in_pcb_checkstate(in6p, WNT_RELEASE, 0); + goto bad; + } + lck_mtx_unlock(sadb_mutex); } #endif /*IPSEC*/ @@ -381,6 +446,13 @@ udp6_input(mp, offp) * Construct sockaddr format source address. * Stuff source address and datagram in user buffer. 
*/ + udp_lock(in6p->in6p_socket, 1, 0); + + if (in_pcb_checkstate(in6p, WNT_RELEASE, 1) == WNT_STOPUSING) { + udp_unlock(in6p->in6p_socket, 1, 0); + goto bad; + } + init_sin6(&udp_in6, m); /* general init */ udp_in6.sin6_port = uh->uh_sport; if (in6p->in6p_flags & IN6P_CONTROLOPTS @@ -389,11 +461,15 @@ udp6_input(mp, offp) m_adj(m, off + sizeof(struct udphdr)); if (sbappendaddr(&in6p->in6p_socket->so_rcv, (struct sockaddr *)&udp_in6, - m, opts) == 0) { + m, opts, NULL) == 0) { + m = NULL; + opts = NULL; udpstat.udps_fullsock++; + udp_unlock(in6p->in6p_socket, 1, 0); goto bad; } sorwakeup(in6p->in6p_socket); + udp_unlock(in6p->in6p_socket, 1, 0); return IPPROTO_DONE; bad: if (m) @@ -404,10 +480,10 @@ bad: } void -udp6_ctlinput(cmd, sa, d) - int cmd; - struct sockaddr *sa; - void *d; +udp6_ctlinput( + int cmd, + struct sockaddr *sa, + void *d) { struct udphdr uh; struct ip6_hdr *ip6; @@ -415,7 +491,7 @@ udp6_ctlinput(cmd, sa, d) int off = 0; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; - void (*notify) __P((struct inpcb *, int)) = udp_notify; + void (*notify)(struct inpcb *, int) = udp_notify; struct udp_portonly { u_int16_t uh_sport; u_int16_t uh_dport; @@ -460,11 +536,11 @@ udp6_ctlinput(cmd, sa, d) bzero(&uh, sizeof(uh)); m_copydata(m, off, sizeof(*uhp), (caddr_t)&uh); - (void) in6_pcbnotify(&udb, sa, uh.uh_dport, + (void) in6_pcbnotify(&udbinfo, sa, uh.uh_dport, (struct sockaddr*)ip6cp->ip6c_src, uh.uh_sport, cmd, notify); } else - (void) in6_pcbnotify(&udb, sa, 0, (struct sockaddr *)&sa6_src, + (void) in6_pcbnotify(&udbinfo, sa, 0, (struct sockaddr *)&sa6_src, 0, cmd, notify); } @@ -482,7 +558,7 @@ udp6_getcred SYSCTL_HANDLER_ARGS if (req->newlen != sizeof(addrs)) return (EINVAL); - if (req->oldlen != sizeof(struct ucred)) + if (req->oldlen != sizeof(*(kauth_cred_t)0)) return (EINVAL); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) @@ -497,7 +573,7 @@ udp6_getcred SYSCTL_HANDLER_ARGS goto out; } error = SYSCTL_OUT(req, inp->inp_socket->so_cred->pc_ucred, - sizeof(struct ucred)); + sizeof(*(kauth_cred_t)0)); out: splx(s); @@ -513,15 +589,12 @@ static int udp6_abort(struct socket *so) { struct inpcb *inp; - int s; inp = sotoinpcb(so); if (inp == 0) return EINVAL; /* ??? possible? panic instead? 
*/ soisdisconnected(so); - s = splnet(); in6_pcbdetach(inp); - splx(s); return 0; } @@ -529,22 +602,21 @@ static int udp6_attach(struct socket *so, int proto, struct proc *p) { struct inpcb *inp; - int s, error; + int error; inp = sotoinpcb(so); if (inp != 0) return EINVAL; + error = in_pcballoc(so, &udbinfo, p); + if (error) + return error; + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return error; } - s = splnet(); - error = in_pcballoc(so, &udbinfo, p); - splx(s); - if (error) - return error; inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV6; if (ip6_mapped_addr_on) @@ -565,7 +637,7 @@ static int udp6_bind(struct socket *so, struct sockaddr *nam, struct proc *p) { struct inpcb *inp; - int s, error; + int error; inp = sotoinpcb(so); if (inp == 0) @@ -586,16 +658,12 @@ udp6_bind(struct socket *so, struct sockaddr *nam, struct proc *p) in6_sin6_2_sin(&sin, sin6_p); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; - s = splnet(); error = in_pcbbind(inp, (struct sockaddr *)&sin, p); - splx(s); return error; } } - s = splnet(); error = in6_pcbbind(inp, nam, p); - splx(s); return error; } @@ -603,7 +671,7 @@ static int udp6_connect(struct socket *so, struct sockaddr *nam, struct proc *p) { struct inpcb *inp; - int s, error; + int error; inp = sotoinpcb(so); if (inp == 0) @@ -619,9 +687,7 @@ udp6_connect(struct socket *so, struct sockaddr *nam, struct proc *p) if (inp->inp_faddr.s_addr != INADDR_ANY) return EISCONN; in6_sin6_2_sin(&sin, sin6_p); - s = splnet(); error = in_pcbconnect(inp, (struct sockaddr *)&sin, p); - splx(s); if (error == 0) { inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; @@ -633,9 +699,7 @@ udp6_connect(struct socket *so, struct sockaddr *nam, struct proc *p) if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) return EISCONN; - s = splnet(); error = in6_pcbconnect(inp, nam, p); - splx(s); if (error == 0) { if (ip6_mapped_addr_on || (inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { /* should be non mapped addr */ inp->inp_vflag &= ~INP_IPV4; @@ -650,14 +714,11 @@ static int udp6_detach(struct socket *so) { struct inpcb *inp; - int s; inp = sotoinpcb(so); if (inp == 0) return EINVAL; - s = splnet(); in6_pcbdetach(inp); - splx(s); return 0; } @@ -665,7 +726,6 @@ static int udp6_disconnect(struct socket *so) { struct inpcb *inp; - int s; inp = sotoinpcb(so); if (inp == 0) @@ -681,10 +741,8 @@ udp6_disconnect(struct socket *so) if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) return ENOTCONN; - s = splnet(); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; - splx(s); so->so_state &= ~SS_ISCONNECTED; /* XXX */ return 0; } @@ -749,5 +807,5 @@ struct pr_usrreqs udp6_usrreqs = { pru_connect2_notsupp, in6_control, udp6_detach, udp6_disconnect, pru_listen_notsupp, in6_mapped_peeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, udp6_send, pru_sense_null, udp_shutdown, - in6_mapped_sockaddr, sosend, soreceive, sopoll + in6_mapped_sockaddr, sosend, soreceive, pru_sopoll_notsupp }; diff --git a/bsd/netinet6/udp6_var.h b/bsd/netinet6/udp6_var.h index 417190c62..49e35cc55 100644 --- a/bsd/netinet6/udp6_var.h +++ b/bsd/netinet6/udp6_var.h @@ -66,18 +66,16 @@ #define _NETINET6_UDP6_VAR_H_ #include <sys/appleapiopts.h> -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE SYSCTL_DECL(_net_inet6_udp6); extern struct pr_usrreqs udp6_usrreqs; -void udp6_ctlinput __P((int, struct sockaddr *, void *)); -int udp6_input __P((struct mbuf **, int *)); -int udp6_output __P((struct inpcb *inp, 
struct mbuf *m, +void udp6_ctlinput(int, struct sockaddr *, void *); +int udp6_input(struct mbuf **, int *); +int udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, - struct proc *p)); -#endif /* __APPLE_API_PRIVATE */ -#endif + struct proc *p); +#endif KERNEL_PRIVATE -#endif /*_NETINET6_UDP6_VAR_H_*/ +#endif _NETINET6_UDP6_VAR_H_ diff --git a/bsd/netkey/Makefile b/bsd/netkey/Makefile index 0ef16274a..def3c0629 100644 --- a/bsd/netkey/Makefile +++ b/bsd/netkey/Makefile @@ -20,7 +20,13 @@ EXPINC_SUBDIRS_PPC = \ EXPINC_SUBDIRS_I386 = \ DATAFILES = \ - key.h key_debug.h keydb.h keysock.h keyv2.h key_var.h + keysock.h + +PRIVATE_DATAFILES = \ + key_debug.h keydb.h key_var.h + +PRIVATE_KERNELFILES = \ + key.h INSTALL_MI_LIST = ${DATAFILES} @@ -30,6 +36,9 @@ EXPORT_MI_LIST = ${DATAFILES} EXPORT_MI_DIR = netkey +INSTALL_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} + +INSTALL_KF_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} ${PRIVATE_KERNELFILES} include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/netkey/key.c b/bsd/netkey/key.c index bb183c5a1..9576d7afe 100644 --- a/bsd/netkey/key.c +++ b/bsd/netkey/key.c @@ -50,6 +50,8 @@ #include <sys/queue.h> #include <sys/syslog.h> +#include <kern/locks.h> + #include <net/if.h> #include <net/route.h> #include <net/raw_cb.h> @@ -111,6 +113,12 @@ #define FULLMASK 0xff +lck_grp_t *sadb_mutex_grp; +lck_grp_attr_t *sadb_mutex_grp_attr; +lck_attr_t *sadb_mutex_attr; +lck_mtx_t *sadb_mutex; +extern lck_mtx_t *nd6_mutex; + /* * Note on SA reference counting: * - SAs that are not in DEAD state will have (total external reference + 1) @@ -132,7 +140,7 @@ static u_int key_larval_lifetime = 30; /* interval to expire acquiring, 30(s)*/ static int key_blockacq_count = 10; /* counter for blocking SADB_ACQUIRE.*/ static int key_blockacq_lifetime = 20; /* lifetime for blocking SADB_ACQUIRE.*/ static int key_preferred_oldsa = 0; /* preferred old sa rather than new sa.*/ -static int natt_keepalive_interval = 29; /* interval between natt keepalives.*/ +static int natt_keepalive_interval = 20; /* interval between natt keepalives.*/ static u_int32_t acq_seq = 0; static int key_tick_init_random = 0; @@ -142,6 +150,11 @@ static LIST_HEAD(_sptree, secpolicy) sptree[IPSEC_DIR_MAX]; /* SPD */ static LIST_HEAD(_sahtree, secashead) sahtree; /* SAD */ static LIST_HEAD(_regtree, secreg) regtree[SADB_SATYPE_MAX + 1]; /* registed list */ + +#define SPIHASHSIZE 128 +#define SPIHASH(x) (((x) ^ ((x) >> 16)) % SPIHASHSIZE) +static LIST_HEAD(_spihash, secasvar) spihash[SPIHASHSIZE]; + #ifndef IPSEC_NONBLOCK_ACQUIRE static LIST_HEAD(_acqtree, secacq) acqtree; /* acquiring list */ #endif @@ -268,6 +281,10 @@ SYSCTL_INT(_net_key, KEYCTL_PREFERED_OLDSA, prefered_oldsa, CTLFLAG_RW,\ SYSCTL_INT(_net_key, KEYCTL_NATT_KEEPALIVE_INTERVAL, natt_keepalive_interval, CTLFLAG_RW,\ &natt_keepalive_interval, 0, ""); +/* PF_KEY statistics */ +SYSCTL_STRUCT(_net_key, KEYCTL_PFKEYSTAT, pfkeystat, CTLFLAG_RD,\ + &pfkeystat, pfkeystat, ""); + #ifndef LIST_FOREACH #define LIST_FOREACH(elm, head, field) \ for (elm = LIST_FIRST(head); elm; elm = LIST_NEXT(elm, field)) @@ -365,60 +382,61 @@ struct sadb_msghdr { int extlen[SADB_EXT_MAX + 1]; }; -static struct secasvar *key_allocsa_policy __P((struct secasindex *)); -static void key_freesp_so __P((struct secpolicy **)); -static struct secasvar *key_do_allocsa_policy __P((struct secashead *, u_int)); -static void key_delsp __P((struct secpolicy *)); -static struct secpolicy *key_getsp __P((struct secpolicyindex *)); 
-static struct secpolicy *key_getspbyid __P((u_int32_t)); -static u_int32_t key_newreqid __P((void)); -static struct mbuf *key_gather_mbuf __P((struct mbuf *, - const struct sadb_msghdr *, int, int, int *)); -static int key_spdadd __P((struct socket *, struct mbuf *, - const struct sadb_msghdr *)); -static u_int32_t key_getnewspid __P((void)); -static int key_spddelete __P((struct socket *, struct mbuf *, - const struct sadb_msghdr *)); -static int key_spddelete2 __P((struct socket *, struct mbuf *, - const struct sadb_msghdr *)); -static int key_spdget __P((struct socket *, struct mbuf *, - const struct sadb_msghdr *)); -static int key_spdflush __P((struct socket *, struct mbuf *, - const struct sadb_msghdr *)); -static int key_spddump __P((struct socket *, struct mbuf *, - const struct sadb_msghdr *)); -static struct mbuf *key_setdumpsp __P((struct secpolicy *, - u_int8_t, u_int32_t, u_int32_t)); -static u_int key_getspreqmsglen __P((struct secpolicy *)); -static int key_spdexpire __P((struct secpolicy *)); -static struct secashead *key_newsah __P((struct secasindex *)); -static void key_delsah __P((struct secashead *)); -static struct secasvar *key_newsav __P((struct mbuf *, - const struct sadb_msghdr *, struct secashead *, int *)); -static void key_delsav __P((struct secasvar *)); -static struct secashead *key_getsah __P((struct secasindex *)); -static struct secasvar *key_checkspidup __P((struct secasindex *, u_int32_t)); -static struct secasvar *key_getsavbyspi __P((struct secashead *, u_int32_t)); -static int key_setsaval __P((struct secasvar *, struct mbuf *, - const struct sadb_msghdr *)); -static int key_mature __P((struct secasvar *)); -static struct mbuf *key_setdumpsa __P((struct secasvar *, u_int8_t, - u_int8_t, u_int32_t, u_int32_t)); -static struct mbuf *key_setsadbmsg __P((u_int8_t, u_int16_t, u_int8_t, - u_int32_t, pid_t, u_int16_t)); -static struct mbuf *key_setsadbsa __P((struct secasvar *)); -static struct mbuf *key_setsadbaddr __P((u_int16_t, - struct sockaddr *, u_int8_t, u_int16_t)); +static struct secasvar *key_allocsa_policy(struct secasindex *); +static void key_freesp_so(struct secpolicy **); +static struct secasvar *key_do_allocsa_policy(struct secashead *, u_int); +static void key_delsp(struct secpolicy *); +static struct secpolicy *key_getsp(struct secpolicyindex *); +static struct secpolicy *key_getspbyid(u_int32_t); +static u_int32_t key_newreqid(void); +static struct mbuf *key_gather_mbuf(struct mbuf *, + const struct sadb_msghdr *, int, int, int *); +static int key_spdadd(struct socket *, struct mbuf *, + const struct sadb_msghdr *); +static u_int32_t key_getnewspid(void); +static int key_spddelete(struct socket *, struct mbuf *, + const struct sadb_msghdr *); +static int key_spddelete2(struct socket *, struct mbuf *, + const struct sadb_msghdr *); +static int key_spdget(struct socket *, struct mbuf *, + const struct sadb_msghdr *); +static int key_spdflush(struct socket *, struct mbuf *, + const struct sadb_msghdr *); +static int key_spddump(struct socket *, struct mbuf *, + const struct sadb_msghdr *); +static struct mbuf *key_setdumpsp(struct secpolicy *, + u_int8_t, u_int32_t, u_int32_t); +static u_int key_getspreqmsglen(struct secpolicy *); +static int key_spdexpire(struct secpolicy *); +static struct secashead *key_newsah(struct secasindex *); +static void key_delsah(struct secashead *); +static struct secasvar *key_newsav(struct mbuf *, + const struct sadb_msghdr *, struct secashead *, int *); +static void key_delsav(struct secasvar *); +static 
struct secashead *key_getsah(struct secasindex *); +static struct secasvar *key_checkspidup(struct secasindex *, u_int32_t); +static void key_setspi(struct secasvar *, u_int32_t); +static struct secasvar *key_getsavbyspi(struct secashead *, u_int32_t); +static int key_setsaval(struct secasvar *, struct mbuf *, + const struct sadb_msghdr *); +static int key_mature(struct secasvar *); +static struct mbuf *key_setdumpsa(struct secasvar *, u_int8_t, + u_int8_t, u_int32_t, u_int32_t); +static struct mbuf *key_setsadbmsg(u_int8_t, u_int16_t, u_int8_t, + u_int32_t, pid_t, u_int16_t); +static struct mbuf *key_setsadbsa(struct secasvar *); +static struct mbuf *key_setsadbaddr(u_int16_t, + struct sockaddr *, u_int8_t, u_int16_t); #if 0 -static struct mbuf *key_setsadbident __P((u_int16_t, u_int16_t, caddr_t, - int, u_int64_t)); +static struct mbuf *key_setsadbident(u_int16_t, u_int16_t, caddr_t, + int, u_int64_t); #endif -static struct mbuf *key_setsadbxsa2 __P((u_int8_t, u_int32_t, u_int32_t)); -static struct mbuf *key_setsadbxpolicy __P((u_int16_t, u_int8_t, - u_int32_t)); -static void *key_newbuf __P((const void *, u_int)); +static struct mbuf *key_setsadbxsa2(u_int8_t, u_int32_t, u_int32_t); +static struct mbuf *key_setsadbxpolicy(u_int16_t, u_int8_t, + u_int32_t); +static void *key_newbuf(const void *, u_int); #if INET6 -static int key_ismyaddr6 __P((struct sockaddr_in6 *)); +static int key_ismyaddr6(struct sockaddr_in6 *); #endif /* flags for key_cmpsaidx() */ @@ -426,79 +444,102 @@ static int key_ismyaddr6 __P((struct sockaddr_in6 *)); #define CMP_MODE_REQID 2 /* additionally HEAD, reqid, mode. */ #define CMP_REQID 3 /* additionally HEAD, reaid. */ #define CMP_EXACTLY 4 /* all elements. */ -static int key_cmpsaidx - __P((struct secasindex *, struct secasindex *, int)); - -static int key_cmpspidx_exactly - __P((struct secpolicyindex *, struct secpolicyindex *)); -static int key_cmpspidx_withmask - __P((struct secpolicyindex *, struct secpolicyindex *)); -static int key_sockaddrcmp __P((struct sockaddr *, struct sockaddr *, int)); -static int key_bbcmp __P((caddr_t, caddr_t, u_int)); -static void key_srandom __P((void)); -static u_int16_t key_satype2proto __P((u_int8_t)); -static u_int8_t key_proto2satype __P((u_int16_t)); - -static int key_getspi __P((struct socket *, struct mbuf *, - const struct sadb_msghdr *)); -static u_int32_t key_do_getnewspi __P((struct sadb_spirange *, - struct secasindex *)); -static int key_update __P((struct socket *, struct mbuf *, - const struct sadb_msghdr *)); +static int key_cmpsaidx(struct secasindex *, struct secasindex *, int); + +static int key_cmpspidx_exactly(struct secpolicyindex *, + struct secpolicyindex *); +static int key_cmpspidx_withmask(struct secpolicyindex *, + struct secpolicyindex *); +static int key_sockaddrcmp(struct sockaddr *, struct sockaddr *, int); +static int key_bbcmp(caddr_t, caddr_t, u_int); +static void key_srandom(void); +static u_int16_t key_satype2proto(u_int8_t); +static u_int8_t key_proto2satype(u_int16_t); + +static int key_getspi(struct socket *, struct mbuf *, + const struct sadb_msghdr *); +static u_int32_t key_do_getnewspi(struct sadb_spirange *, struct secasindex *); +static int key_update(struct socket *, struct mbuf *, + const struct sadb_msghdr *); #if IPSEC_DOSEQCHECK -static struct secasvar *key_getsavbyseq __P((struct secashead *, u_int32_t)); +static struct secasvar *key_getsavbyseq(struct secashead *, u_int32_t); #endif -static int key_add __P((struct socket *, struct mbuf *, - const struct sadb_msghdr *)); 
-static int key_setident __P((struct secashead *, struct mbuf *, - const struct sadb_msghdr *)); -static struct mbuf *key_getmsgbuf_x1 __P((struct mbuf *, - const struct sadb_msghdr *)); -static int key_delete __P((struct socket *, struct mbuf *, - const struct sadb_msghdr *)); -static int key_get __P((struct socket *, struct mbuf *, - const struct sadb_msghdr *)); - -static void key_getcomb_setlifetime __P((struct sadb_comb *)); +static int key_add(struct socket *, struct mbuf *, const struct sadb_msghdr *); +static int key_setident(struct secashead *, struct mbuf *, + const struct sadb_msghdr *); +static struct mbuf *key_getmsgbuf_x1(struct mbuf *, const struct sadb_msghdr *); +static int key_delete(struct socket *, struct mbuf *, + const struct sadb_msghdr *); +static int key_get(struct socket *, struct mbuf *, const struct sadb_msghdr *); + +static void key_getcomb_setlifetime(struct sadb_comb *); #if IPSEC_ESP -static struct mbuf *key_getcomb_esp __P((void)); +static struct mbuf *key_getcomb_esp(void); #endif -static struct mbuf *key_getcomb_ah __P((void)); -static struct mbuf *key_getcomb_ipcomp __P((void)); -static struct mbuf *key_getprop __P((const struct secasindex *)); +static struct mbuf *key_getcomb_ah(void); +static struct mbuf *key_getcomb_ipcomp(void); +static struct mbuf *key_getprop(const struct secasindex *); -static int key_acquire __P((struct secasindex *, struct secpolicy *)); +static int key_acquire(struct secasindex *, struct secpolicy *); #ifndef IPSEC_NONBLOCK_ACQUIRE -static struct secacq *key_newacq __P((struct secasindex *)); -static struct secacq *key_getacq __P((struct secasindex *)); -static struct secacq *key_getacqbyseq __P((u_int32_t)); +static struct secacq *key_newacq(struct secasindex *); +static struct secacq *key_getacq(struct secasindex *); +static struct secacq *key_getacqbyseq(u_int32_t); #endif -static struct secspacq *key_newspacq __P((struct secpolicyindex *)); -static struct secspacq *key_getspacq __P((struct secpolicyindex *)); -static int key_acquire2 __P((struct socket *, struct mbuf *, - const struct sadb_msghdr *)); -static int key_register __P((struct socket *, struct mbuf *, - const struct sadb_msghdr *)); -static int key_expire __P((struct secasvar *)); -static int key_flush __P((struct socket *, struct mbuf *, - const struct sadb_msghdr *)); -static int key_dump __P((struct socket *, struct mbuf *, - const struct sadb_msghdr *)); -static int key_promisc __P((struct socket *, struct mbuf *, - const struct sadb_msghdr *)); -static int key_senderror __P((struct socket *, struct mbuf *, int)); -static int key_validate_ext __P((const struct sadb_ext *, int)); -static int key_align __P((struct mbuf *, struct sadb_msghdr *)); +static struct secspacq *key_newspacq(struct secpolicyindex *); +static struct secspacq *key_getspacq(struct secpolicyindex *); +static int key_acquire2(struct socket *, struct mbuf *, + const struct sadb_msghdr *); +static int key_register(struct socket *, struct mbuf *, + const struct sadb_msghdr *); +static int key_expire(struct secasvar *); +static int key_flush(struct socket *, struct mbuf *, + const struct sadb_msghdr *); +static int key_dump(struct socket *, struct mbuf *, const struct sadb_msghdr *); +static int key_promisc(struct socket *, struct mbuf *, + const struct sadb_msghdr *); +static int key_senderror(struct socket *, struct mbuf *, int); +static int key_validate_ext(const struct sadb_ext *, int); +static int key_align(struct mbuf *, struct sadb_msghdr *); #if 0 -static const char *key_getfqdn 
__P((void)); -static const char *key_getuserfqdn __P((void)); +static const char *key_getfqdn(void); +static const char *key_getuserfqdn(void); #endif -static void key_sa_chgstate __P((struct secasvar *, u_int8_t)); -static struct mbuf *key_alloc_mbuf __P((int)); +static void key_sa_chgstate(struct secasvar *, u_int8_t); +static struct mbuf *key_alloc_mbuf(int); extern int ipsec_bypass; void ipsec_send_natt_keepalive(struct secasvar *sav); + +/* + * PF_KEY init + * setup locks and call raw_init() + * + */ +void +key_init(void) +{ + + int i; + + sadb_mutex_grp_attr = lck_grp_attr_alloc_init(); + sadb_mutex_grp = lck_grp_alloc_init("sadb", sadb_mutex_grp_attr); + sadb_mutex_attr = lck_attr_alloc_init(); + lck_attr_setdefault(sadb_mutex_attr); + + if ((sadb_mutex = lck_mtx_alloc_init(sadb_mutex_grp, sadb_mutex_attr)) == NULL) { + printf("key_init: can't alloc sadb_mutex\n"); + return; + } + + for (i = 0; i < SPIHASHSIZE; i++) + LIST_INIT(&spihash[i]); + + raw_init(); +} + + /* %%% IPsec policy management */ /* * allocating a SP for OUTBOUND or INBOUND packet. @@ -515,6 +556,7 @@ key_allocsp(spidx, dir) struct timeval tv; int s; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); /* sanity check */ if (spidx == NULL) panic("key_allocsp: NULL pointer is passed.\n"); @@ -529,7 +571,6 @@ key_allocsp(spidx, dir) } /* get a SP entry */ - s = splnet(); /*called from softclock()*/ KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("*** objects\n"); kdebug_secpolicyindex(spidx)); @@ -545,7 +586,6 @@ key_allocsp(spidx, dir) goto found; } - splx(s); return NULL; found: @@ -556,7 +596,6 @@ found: microtime(&tv); sp->lastused = tv.tv_sec; sp->refcnt++; - splx(s); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP key_allocsp cause refcnt++:%d SP:%p\n", sp->refcnt, sp)); @@ -580,13 +619,14 @@ key_gettunnel(osrc, odst, isrc, idst) struct sockaddr *os, *od, *is, *id; struct secpolicyindex spidx; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + if (isrc->sa_family != idst->sa_family) { ipseclog((LOG_ERR, "protocol family mismatched %d != %d\n.", isrc->sa_family, idst->sa_family)); return NULL; } - s = splnet(); /*called from softclock()*/ LIST_FOREACH(sp, &sptree[dir], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) continue; @@ -626,14 +666,13 @@ key_gettunnel(osrc, odst, isrc, idst) goto found; } } - splx(s); + return NULL; found: microtime(&tv); sp->lastused = tv.tv_sec; sp->refcnt++; - splx(s); return sp; } @@ -651,6 +690,8 @@ key_checkrequest(isr, saidx) u_int level; int error; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (isr == NULL || saidx == NULL) panic("key_checkrequest: NULL pointer is passed.\n"); @@ -742,6 +783,8 @@ key_allocsa_policy(saidx) const u_int *saorder_state_valid; int arraysize; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + LIST_FOREACH(sah, &sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; @@ -791,6 +834,8 @@ key_do_allocsa_policy(sah, state) { struct secasvar *sav, *nextsav, *candidate, *d; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* initilize */ candidate = NULL; @@ -909,7 +954,7 @@ key_do_allocsa_policy(sah, state) * allocating a SA entry for a *INBOUND* packet. * Must call key_freesav() later. * OUT: positive: pointer to a sav. - * NULL: not found, or error occured. + * NULL: not found, or error occurred. * * In the comparison, source address will be ignored for RFC2401 conformance. 
* To quote, from section 4.1: @@ -926,15 +971,16 @@ key_allocsa(family, src, dst, proto, spi) caddr_t src, dst; u_int32_t spi; { - struct secashead *sah; - struct secasvar *sav; - u_int stateidx, state; + struct secasvar *sav, *match; + u_int stateidx, state, tmpidx, matchidx; struct sockaddr_in sin; struct sockaddr_in6 sin6; int s; const u_int *saorder_state_valid; int arraysize; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (src == NULL || dst == NULL) panic("key_allocsa: NULL pointer is passed.\n"); @@ -957,115 +1003,118 @@ key_allocsa(family, src, dst, proto, spi) * IPsec tunnel packet is received. But ESP tunnel mode is * encrypted so we can't check internal IP header. */ - s = splnet(); /*called from softclock()*/ - LIST_FOREACH(sah, &sahtree, chain) { - /* - * search a valid state list for inbound packet. - * the search order is not important. - */ - for (stateidx = 0; stateidx < arraysize; stateidx++) { + /* + * search a valid state list for inbound packet. + * the search order is not important. + */ + match = NULL; + matchidx = arraysize; + LIST_FOREACH(sav, &spihash[SPIHASH(spi)], spihash) { + if (sav->spi != spi) + continue; + if (proto != sav->sah->saidx.proto) + continue; + if (family != sav->sah->saidx.src.ss_family || + family != sav->sah->saidx.dst.ss_family) + continue; + tmpidx = arraysize; + for (stateidx = 0; stateidx < matchidx; stateidx++) { state = saorder_state_valid[stateidx]; - LIST_FOREACH(sav, &sah->savtree[state], chain) { - /* sanity check */ - KEY_CHKSASTATE(sav->state, state, "key_allocsav"); - if (proto != sav->sah->saidx.proto) - continue; - if (spi != sav->spi) - continue; - if (family != sav->sah->saidx.src.ss_family || - family != sav->sah->saidx.dst.ss_family) - continue; + if (sav->state == state) { + tmpidx = stateidx; + break; + } + } + if (tmpidx >= matchidx) + continue; #if 0 /* don't check src */ - /* check src address */ - switch (family) { - case AF_INET: - bzero(&sin, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_len = sizeof(sin); - bcopy(src, &sin.sin_addr, - sizeof(sin.sin_addr)); - if (key_sockaddrcmp((struct sockaddr*)&sin, - (struct sockaddr *)&sav->sah->saidx.src, 0) != 0) - continue; - - break; - case AF_INET6: - bzero(&sin6, sizeof(sin6)); - sin6.sin6_family = AF_INET6; - sin6.sin6_len = sizeof(sin6); - bcopy(src, &sin6.sin6_addr, - sizeof(sin6.sin6_addr)); - if (IN6_IS_SCOPE_LINKLOCAL(&sin6.sin6_addr)) { - /* kame fake scopeid */ - sin6.sin6_scope_id = - ntohs(sin6.sin6_addr.s6_addr16[1]); - sin6.sin6_addr.s6_addr16[1] = 0; - } - if (key_sockaddrcmp((struct sockaddr*)&sin6, - (struct sockaddr *)&sav->sah->saidx.src, 0) != 0) - continue; - break; - default: - ipseclog((LOG_DEBUG, "key_allocsa: " - "unknown address family=%d.\n", - family)); - continue; - } + /* check src address */ + switch (family) { + case AF_INET: + bzero(&sin, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof(sin); + bcopy(src, &sin.sin_addr, + sizeof(sin.sin_addr)); + if (key_sockaddrcmp((struct sockaddr*)&sin, + (struct sockaddr *)&sav->sah->saidx.src, 0) != 0) + continue; + break; + case AF_INET6: + bzero(&sin6, sizeof(sin6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(sin6); + bcopy(src, &sin6.sin6_addr, + sizeof(sin6.sin6_addr)); + if (IN6_IS_SCOPE_LINKLOCAL(&sin6.sin6_addr)) { + /* kame fake scopeid */ + sin6.sin6_scope_id = + ntohs(sin6.sin6_addr.s6_addr16[1]); + sin6.sin6_addr.s6_addr16[1] = 0; + } + if (key_sockaddrcmp((struct sockaddr*)&sin6, + (struct sockaddr *)&sav->sah->saidx.src, 0) != 
0) + continue; + break; + default: + ipseclog((LOG_DEBUG, "key_allocsa: " + "unknown address family=%d.\n", + family)); + continue; + } #endif - /* check dst address */ - switch (family) { - case AF_INET: - bzero(&sin, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_len = sizeof(sin); - bcopy(dst, &sin.sin_addr, - sizeof(sin.sin_addr)); - if (key_sockaddrcmp((struct sockaddr*)&sin, - (struct sockaddr *)&sav->sah->saidx.dst, 0) != 0) - continue; - - break; - case AF_INET6: - bzero(&sin6, sizeof(sin6)); - sin6.sin6_family = AF_INET6; - sin6.sin6_len = sizeof(sin6); - bcopy(dst, &sin6.sin6_addr, - sizeof(sin6.sin6_addr)); - if (IN6_IS_SCOPE_LINKLOCAL(&sin6.sin6_addr)) { - /* kame fake scopeid */ - sin6.sin6_scope_id = - ntohs(sin6.sin6_addr.s6_addr16[1]); - sin6.sin6_addr.s6_addr16[1] = 0; - } - if (key_sockaddrcmp((struct sockaddr*)&sin6, - (struct sockaddr *)&sav->sah->saidx.dst, 0) != 0) - continue; - break; - default: - ipseclog((LOG_DEBUG, "key_allocsa: " - "unknown address family=%d.\n", - family)); - continue; - } + /* check dst address */ + switch (family) { + case AF_INET: + bzero(&sin, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof(sin); + bcopy(dst, &sin.sin_addr, + sizeof(sin.sin_addr)); + if (key_sockaddrcmp((struct sockaddr*)&sin, + (struct sockaddr *)&sav->sah->saidx.dst, 0) != 0) + continue; - goto found; - } + break; + case AF_INET6: + bzero(&sin6, sizeof(sin6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(sin6); + bcopy(dst, &sin6.sin6_addr, + sizeof(sin6.sin6_addr)); + if (IN6_IS_SCOPE_LINKLOCAL(&sin6.sin6_addr)) { + /* kame fake scopeid */ + sin6.sin6_scope_id = + ntohs(sin6.sin6_addr.s6_addr16[1]); + sin6.sin6_addr.s6_addr16[1] = 0; + } + if (key_sockaddrcmp((struct sockaddr*)&sin6, + (struct sockaddr *)&sav->sah->saidx.dst, 0) != 0) + continue; + break; + default: + ipseclog((LOG_DEBUG, "key_allocsa: " + "unknown address family=%d.\n", family)); + continue; } + + match = sav; + matchidx = tmpidx; } + if (match) + goto found; /* not found */ - splx(s); return NULL; found: - sav->refcnt++; - splx(s); + match->refcnt++; KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP allocsa cause refcnt++:%d SA:%p\n", - sav->refcnt, sav)); - return sav; + match->refcnt, match)); + return match; } /* @@ -1076,6 +1125,8 @@ void key_freesp(sp) struct secpolicy *sp; { + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (sp == NULL) panic("key_freesp: NULL pointer is passed.\n"); @@ -1091,6 +1142,7 @@ key_freesp(sp) return; } +#if 0 /* * Must be called after calling key_allocsp(). * For the packet with socket. 
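Throughout these key.c hunks the old splnet()/splx() interrupt-priority bracketing is replaced by a single PF_KEY mutex: exported entry points take sadb_mutex, and every internal helper asserts ownership with lck_mtx_assert() instead of re-locking. A minimal sketch of that convention, using only the locking calls visible in this patch (the key_example_* names are hypothetical, not part of xnu):

    #include <kern/locks.h>

    extern lck_mtx_t *sadb_mutex;

    /* Internal helper: runs with the SADB lock already held; the assert
     * documents (and, on debug kernels, enforces) the caller's obligation. */
    static void
    key_example_helper(void)
    {
            lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED);
            /* ... walk sptree/sahtree/spihash safely here ... */
    }

    /* Entry point: brackets the whole operation with the mutex, taking the
     * place of the old splnet()/splx() pair. */
    void
    key_example_entry(void)
    {
            lck_mtx_lock(sadb_mutex);
            key_example_helper();
            lck_mtx_unlock(sadb_mutex);
    }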
@@ -1099,6 +1151,8 @@ void key_freeso(so) struct socket *so; { + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (so == NULL) panic("key_freeso: NULL pointer is passed.\n"); @@ -1148,11 +1202,15 @@ key_freeso(so) return; } +#endif static void key_freesp_so(sp) struct secpolicy **sp; { + + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (sp == NULL || *sp == NULL) panic("key_freesp_so: sp == NULL\n"); @@ -1183,6 +1241,8 @@ void key_freesav(sav) struct secasvar *sav; { + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (sav == NULL) panic("key_freesav: NULL pointer is passed.\n"); @@ -1208,6 +1268,8 @@ key_delsp(sp) { int s; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (sp == NULL) panic("key_delsp: NULL pointer is passed.\n"); @@ -1258,6 +1320,8 @@ key_getsp(spidx) { struct secpolicy *sp; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (spidx == NULL) panic("key_getsp: NULL pointer is passed.\n"); @@ -1285,6 +1349,8 @@ key_getspbyid(id) { struct secpolicy *sp; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + LIST_FOREACH(sp, &sptree[IPSEC_DIR_INBOUND], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) continue; @@ -1334,6 +1400,8 @@ key_msg2sp(xpl0, len, error) { struct secpolicy *newsp; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (xpl0 == NULL) panic("key_msg2sp: NULL pointer was passed.\n"); @@ -1551,6 +1619,8 @@ key_newreqid() { static u_int32_t auto_reqid = IPSEC_MANUAL_REQID_MAX + 1; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + auto_reqid = (auto_reqid == ~0 ? IPSEC_MANUAL_REQID_MAX + 1 : auto_reqid + 1); @@ -1571,6 +1641,8 @@ key_sp2msg(sp) caddr_t p; struct mbuf *m; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check. */ if (sp == NULL) panic("key_sp2msg: NULL pointer was passed.\n"); @@ -1729,6 +1801,8 @@ key_spdadd(so, m, mhp) struct timeval tv; int error; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (so == NULL || m == NULL || mhp == NULL || mhp->msg == NULL) panic("key_spdadd: NULL pointer is passed.\n"); @@ -1949,6 +2023,8 @@ key_getnewspid() int count = key_spi_trycnt; /* XXX */ struct secpolicy *sp; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* when requesting to allocate spi ranged */ while (count--) { newid = (policy_id = (policy_id == ~0 ? 
1 : policy_id + 1)); @@ -1990,6 +2066,8 @@ key_spddelete(so, m, mhp) struct secpolicyindex spidx; struct secpolicy *sp; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (so == NULL || m == NULL || mhp == NULL || mhp->msg == NULL) panic("key_spddelete: NULL pointer is passed.\n"); @@ -2084,6 +2162,8 @@ key_spddelete2(so, m, mhp) u_int32_t id; struct secpolicy *sp; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (so == NULL || m == NULL || mhp == NULL || mhp->msg == NULL) panic("key_spddelete2: NULL pointer is passed.\n"); @@ -2181,6 +2261,8 @@ key_spdget(so, m, mhp) struct secpolicy *sp; struct mbuf *n; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (so == NULL || m == NULL || mhp == NULL || mhp->msg == NULL) panic("key_spdget: NULL pointer is passed.\n"); @@ -2213,7 +2295,7 @@ key_spdget(so, m, mhp) * send * <base, policy(*)> * to KMD, and expect to receive - * <base> with SADB_X_SPDACQUIRE if error occured, + * <base> with SADB_X_SPDACQUIRE if error occurred, * or * <base, policy> * with SADB_X_SPDUPDATE from KMD by PF_KEY. @@ -2230,6 +2312,8 @@ key_spdacquire(sp) struct secspacq *newspacq; int error; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (sp == NULL) panic("key_spdacquire: NULL pointer is passed.\n"); @@ -2302,6 +2386,8 @@ key_spdflush(so, m, mhp) struct secpolicy *sp; u_int dir; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (so == NULL || m == NULL || mhp == NULL || mhp->msg == NULL) panic("key_spdflush: NULL pointer is passed.\n"); @@ -2353,6 +2439,8 @@ key_spddump(so, m, mhp) u_int dir; struct mbuf *n; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (so == NULL || m == NULL || mhp == NULL || mhp->msg == NULL) panic("key_spddump: NULL pointer is passed.\n"); @@ -2391,6 +2479,8 @@ key_setdumpsp(sp, type, seq, pid) { struct mbuf *result = NULL, *m; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + m = key_setsadbmsg(type, 0, SADB_SATYPE_UNSPEC, seq, pid, sp->refcnt); if (!m) goto fail; @@ -2447,6 +2537,8 @@ key_getspreqmsglen(sp) { u_int tlen; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + tlen = sizeof(struct sadb_x_policy); /* if is the policy for ipsec ? */ @@ -2489,8 +2581,7 @@ key_spdexpire(sp) int error = -1; struct sadb_lifetime *lt; - /* XXX: Why do we lock ? 
*/ - s = splnet(); /*called from softclock()*/ + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); /* sanity check */ if (sp == NULL) @@ -2583,7 +2674,6 @@ key_spdexpire(sp) fail: if (result) m_freem(result); - splx(s); return error; } @@ -2599,6 +2689,8 @@ key_newsah(saidx) { struct secashead *newsah; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (saidx == NULL) panic("key_newsaidx: NULL pointer is passed.\n"); @@ -2628,6 +2720,8 @@ key_delsah(sah) int s; int zombie = 0; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (sah == NULL) panic("key_delsah: NULL pointer is passed.\n"); @@ -2706,6 +2800,8 @@ key_newsav(m, mhp, sah, errp) struct secasvar *newsav; const struct sadb_sa *xsa; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (m == NULL || mhp == NULL || mhp->msg == NULL || sah == NULL) panic("key_newsa: NULL pointer is passed.\n"); @@ -2720,7 +2816,7 @@ key_newsav(m, mhp, sah, errp) switch (mhp->msg->sadb_msg_type) { case SADB_GETSPI: - newsav->spi = 0; + key_setspi(newsav, 0); #if IPSEC_DOSEQCHECK /* sync sequence number */ @@ -2741,7 +2837,7 @@ key_newsav(m, mhp, sah, errp) return NULL; } xsa = (const struct sadb_sa *)mhp->ext[SADB_EXT_SA]; - newsav->spi = xsa->sadb_sa_spi; + key_setspi(newsav, xsa->sadb_sa_spi); newsav->seq = mhp->msg->sadb_msg_seq; break; default: @@ -2785,6 +2881,8 @@ static void key_delsav(sav) struct secasvar *sav; { + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (sav == NULL) panic("key_delsav: NULL pointer is passed.\n"); @@ -2795,6 +2893,9 @@ key_delsav(sav) /* remove from SA header */ if (__LIST_CHAINED(sav)) LIST_REMOVE(sav, chain); + + if (sav->spihash.le_prev || sav->spihash.le_next) + LIST_REMOVE(sav, spihash); if (sav->key_auth != NULL) { bzero(_KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth)); @@ -2849,6 +2950,8 @@ key_getsah(saidx) { struct secashead *sah; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + LIST_FOREACH(sah, &sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; @@ -2871,8 +2974,10 @@ key_checkspidup(saidx, spi) struct secasindex *saidx; u_int32_t spi; { - struct secashead *sah; struct secasvar *sav; + u_int stateidx, state; + + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); /* check address family */ if (saidx->src.ss_family != saidx->dst.ss_family) { @@ -2881,17 +2986,35 @@ key_checkspidup(saidx, spi) } /* check all SAD */ - LIST_FOREACH(sah, &sahtree, chain) { - if (!key_ismyaddr((struct sockaddr *)&sah->saidx.dst)) + LIST_FOREACH(sav, &spihash[SPIHASH(spi)], spihash) { + if (sav->spi != spi) continue; - sav = key_getsavbyspi(sah, spi); - if (sav != NULL) - return sav; + for (stateidx = 0; + stateidx < _ARRAYLEN(saorder_state_alive); + stateidx++) { + state = saorder_state_alive[stateidx]; + if (sav->state == state && + key_ismyaddr((struct sockaddr *)&sav->sah->saidx.dst)) + return sav; + } } return NULL; } +static void +key_setspi(sav, spi) + struct secasvar *sav; + u_int32_t spi; +{ + + sav->spi = spi; + if (sav->spihash.le_prev || sav->spihash.le_next) + LIST_REMOVE(sav, spihash); + LIST_INSERT_HEAD(&spihash[SPIHASH(spi)], sav, spihash); +} + + /* * search SAD litmited alive SA, protocol, SPI. 
* OUT: @@ -2903,31 +3026,27 @@ key_getsavbyspi(sah, spi) struct secashead *sah; u_int32_t spi; { - struct secasvar *sav; - u_int stateidx, state; - - /* search all status */ - for (stateidx = 0; - stateidx < _ARRAYLEN(saorder_state_alive); - stateidx++) { - - state = saorder_state_alive[stateidx]; - LIST_FOREACH(sav, &sah->savtree[state], chain) { - - /* sanity check */ - if (sav->state != state) { - ipseclog((LOG_DEBUG, "key_getsavbyspi: " - "invalid sav->state (queue: %d SA: %d)\n", - state, sav->state)); - continue; + struct secasvar *sav, *match; + u_int stateidx, state, matchidx; + + match = NULL; + matchidx = _ARRAYLEN(saorder_state_alive); + LIST_FOREACH(sav, &spihash[SPIHASH(spi)], spihash) { + if (sav->spi != spi) + continue; + if (sav->sah != sah) + continue; + for (stateidx = 0; stateidx < matchidx; stateidx++) { + state = saorder_state_alive[stateidx]; + if (sav->state == state) { + match = sav; + matchidx = stateidx; + break; } - - if (sav->spi == spi) - return sav; } } - return NULL; + return match; } /* @@ -2950,6 +3069,8 @@ key_setsaval(sav, m, mhp) int error = 0; struct timeval tv; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (m == NULL || mhp == NULL || mhp->msg == NULL) panic("key_setsaval: NULL pointer is passed.\n"); @@ -3235,6 +3356,8 @@ key_mature(sav) mature = 0; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* check SPI value */ switch (sav->sah->saidx.proto) { case IPPROTO_ESP: @@ -3419,6 +3542,8 @@ key_setdumpsa(sav, type, satype, seq, pid) SADB_EXT_IDENTITY_DST, SADB_EXT_SENSITIVITY, }; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + m = key_setsadbmsg(type, 0, satype, seq, pid, sav->refcnt); if (m == NULL) goto fail; @@ -3825,6 +3950,7 @@ key_ismyaddr(sa) switch (sa->sa_family) { #if INET case AF_INET: + lck_mtx_lock(rt_mtx); sin = (struct sockaddr_in *)sa; for (ia = in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) @@ -3833,9 +3959,11 @@ key_ismyaddr(sa) sin->sin_len == ia->ia_addr.sin_len && sin->sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr) { + lck_mtx_unlock(rt_mtx); return 1; } } + lck_mtx_unlock(rt_mtx); break; #endif #if INET6 @@ -3863,10 +3991,13 @@ key_ismyaddr6(sin6) struct in6_ifaddr *ia; struct in6_multi *in6m; - for (ia = in6_ifaddr; ia; ia = ia->ia_next) { + lck_mtx_lock(nd6_mutex); + for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { if (key_sockaddrcmp((struct sockaddr *)&sin6, - (struct sockaddr *)&ia->ia_addr, 0) == 0) + (struct sockaddr *)&ia->ia_addr, 0) == 0) { + lck_mtx_unlock(nd6_mutex); return 1; + } /* * XXX Multicast @@ -3876,9 +4007,12 @@ key_ismyaddr6(sin6) */ in6m = NULL; IN6_LOOKUP_MULTI(sin6->sin6_addr, ia->ia_ifp, in6m); - if (in6m) + if (in6m) { + lck_mtx_unlock(nd6_mutex); return 1; + } } + lck_mtx_unlock(nd6_mutex); /* loopback, just for safety */ if (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr)) @@ -4187,18 +4321,6 @@ key_bbcmp(p1, p2, bits) * and do to remove or to expire. * XXX: year 2038 problem may remain. 
*/ -void -key_timehandler_funnel(void) -{ -#ifdef __APPLE__ - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); -#endif - key_timehandler(); -#ifdef __APPLE__ - (void) thread_funnel_set(network_flock, FALSE); -#endif -} void key_timehandler(void) @@ -4209,8 +4331,7 @@ key_timehandler(void) microtime(&tv); - s = splnet(); /*called from softclock()*/ - + lck_mtx_lock(sadb_mutex); /* SPD */ { struct secpolicy *sp, *nextsp; @@ -4472,12 +4593,12 @@ key_timehandler(void) natt_now++; + lck_mtx_unlock(sadb_mutex); #ifndef IPSEC_DEBUG2 /* do exchange to tick time !! */ - (void)timeout((void *)key_timehandler_funnel, (void *)0, hz); + (void)timeout((void *)key_timehandler, (void *)0, hz); #endif /* IPSEC_DEBUG2 */ - splx(s); return; } @@ -4618,6 +4739,8 @@ key_getspi(so, m, mhp) u_int32_t reqid; int error; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (so == NULL || m == NULL || mhp == NULL || mhp->msg == NULL) panic("key_getspi: NULL pointer is passed.\n"); @@ -4710,7 +4833,7 @@ key_getspi(so, m, mhp) } /* set spi */ - newsav->spi = htonl(spi); + key_setspi(newsav, htonl(spi)); #ifndef IPSEC_NONBLOCK_ACQUIRE /* delete the entry in acqtree */ @@ -4811,6 +4934,8 @@ key_do_getnewspi(spirange, saidx) u_int32_t min, max; int count = key_spi_trycnt; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* set spi range to allocate */ if (spirange != NULL) { min = spirange->sadb_spirange_min; @@ -4896,6 +5021,8 @@ key_update(so, m, mhp) u_int32_t reqid; int error; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (so == NULL || m == NULL || mhp == NULL || mhp->msg == NULL) panic("key_update: NULL pointer is passed.\n"); @@ -5003,9 +5130,9 @@ key_update(so, m, mhp) } /* check SA values to be mature. */ - if ((mhp->msg->sadb_msg_errno = key_mature(sav)) != 0) { + if ((error = key_mature(sav)) != 0) { key_freesav(sav); - return key_senderror(so, m, 0); + return key_senderror(so, m, error); } { @@ -5039,6 +5166,8 @@ key_getsavbyseq(sah, seq) struct secasvar *sav; u_int state; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + state = SADB_SASTATE_LARVAL; /* search SAD with sequence number ? 
*/ @@ -5091,6 +5220,8 @@ key_add(so, m, mhp) u_int32_t reqid; int error; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (so == NULL || m == NULL || mhp == NULL || mhp->msg == NULL) panic("key_add: NULL pointer is passed.\n"); @@ -5200,6 +5331,8 @@ key_setident(sah, m, mhp) const struct sadb_ident *idsrc, *iddst; int idsrclen, iddstlen; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (sah == NULL || m == NULL || mhp == NULL || mhp->msg == NULL) panic("key_setident: NULL pointer is passed.\n"); @@ -5296,8 +5429,8 @@ key_getmsgbuf_x1(m, mhp) return n; } -static int key_delete_all __P((struct socket *, struct mbuf *, - const struct sadb_msghdr *, u_int16_t)); +static int key_delete_all(struct socket *, struct mbuf *, + const struct sadb_msghdr *, u_int16_t); /* * SADB_DELETE processing @@ -5323,6 +5456,8 @@ key_delete(so, m, mhp) struct secasvar *sav = NULL; u_int16_t proto; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (so == NULL || m == NULL || mhp == NULL || mhp->msg == NULL) panic("key_delete: NULL pointer is passed.\n"); @@ -5427,6 +5562,8 @@ key_delete_all(so, m, mhp, proto) struct secasvar *sav, *nextsav; u_int stateidx, state; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]); @@ -5513,6 +5650,8 @@ key_get(so, m, mhp) struct secasvar *sav = NULL; u_int16_t proto; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (so == NULL || m == NULL || mhp == NULL || mhp->msg == NULL) panic("key_get: NULL pointer is passed.\n"); @@ -5839,7 +5978,7 @@ key_getprop(saidx) * <base, SA, address(SD), (address(P)), x_policy, * (identity(SD),) (sensitivity,) proposal> * to KMD, and expect to receive - * <base> with SADB_ACQUIRE if error occured, + * <base> with SADB_ACQUIRE if error occurred, * or * <base, src address, dst address, (SPI range)> with SADB_GETSPI * from KMD by PF_KEY. @@ -5866,6 +6005,8 @@ key_acquire(saidx, sp) int error = -1; u_int32_t seq; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (saidx == NULL) panic("key_acquire: NULL pointer is passed.\n"); @@ -6065,6 +6206,8 @@ key_getacq(saidx) { struct secacq *acq; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + LIST_FOREACH(acq, &acqtree, chain) { if (key_cmpsaidx(saidx, &acq->saidx, CMP_EXACTLY)) return acq; @@ -6079,6 +6222,8 @@ key_getacqbyseq(seq) { struct secacq *acq; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + LIST_FOREACH(acq, &acqtree, chain) { if (acq->seq == seq) return acq; @@ -6118,6 +6263,8 @@ key_getspacq(spidx) { struct secspacq *acq; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + LIST_FOREACH(acq, &spacqtree, chain) { if (key_cmpspidx_exactly(spidx, &acq->spidx)) return acq; @@ -6152,15 +6299,17 @@ key_acquire2(so, m, mhp) u_int16_t proto; int error; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (so == NULL || m == NULL || mhp == NULL || mhp->msg == NULL) panic("key_acquire2: NULL pointer is passed.\n"); /* * Error message from KMd. - * We assume that if error was occured in IKEd, the length of PFKEY + * We assume that if an error occurred in IKEd, the length of PFKEY message is equal to the size of sadb_msg structure. - * We do not raise error even if error occured in this function. + * We do not raise an error even if an error occurs in this function.
*/ if (mhp->msg->sadb_msg_len == PFKEY_UNIT64(sizeof(struct sadb_msg))) { #ifndef IPSEC_NONBLOCK_ACQUIRE @@ -6266,6 +6415,8 @@ key_register(so, m, mhp) { struct secreg *reg, *newreg = 0; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (so == NULL || m == NULL || mhp == NULL || mhp->msg == NULL) panic("key_register: NULL pointer is passed.\n"); @@ -6294,8 +6445,10 @@ key_register(so, m, mhp) } bzero((caddr_t)newreg, sizeof(*newreg)); + socket_lock(so, 1); newreg->so = so; ((struct keycb *)sotorawcb(so))->kp_registered++; + socket_unlock(so, 1); /* add regnode to regtree. */ LIST_INSERT_HEAD(®tree[mhp->msg->sadb_msg_satype], newreg, chain); @@ -6429,6 +6582,8 @@ key_freereg(so) struct secreg *reg; int i; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (so == NULL) panic("key_freereg: NULL pointer is passed.\n"); @@ -6473,8 +6628,7 @@ key_expire(sav) int error = -1; struct sadb_lifetime *lt; - /* XXX: Why do we lock ? */ - s = splnet(); /*called from softclock()*/ + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); /* sanity check */ if (sav == NULL) @@ -6606,6 +6760,8 @@ key_flush(so, m, mhp) u_int8_t state; u_int stateidx; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (so == NULL || mhp == NULL || mhp->msg == NULL) panic("key_flush: NULL pointer is passed.\n"); @@ -6689,6 +6845,8 @@ key_dump(so, m, mhp) struct sadb_msg *newmsg; struct mbuf *n; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (so == NULL || m == NULL || mhp == NULL || mhp->msg == NULL) panic("key_dump: NULL pointer is passed.\n"); @@ -6764,6 +6922,8 @@ key_promisc(so, m, mhp) { int olen; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (so == NULL || m == NULL || mhp == NULL || mhp->msg == NULL) panic("key_promisc: NULL pointer is passed.\n"); @@ -6780,7 +6940,8 @@ key_promisc(so, m, mhp) } else if (olen == sizeof(struct sadb_msg)) { /* enable/disable promisc mode */ struct keycb *kp; - + + socket_lock(so, 1); if ((kp = (struct keycb *)sotorawcb(so)) == NULL) return key_senderror(so, m, EINVAL); mhp->msg->sadb_msg_errno = 0; @@ -6790,8 +6951,10 @@ key_promisc(so, m, mhp) kp->kp_promisc = mhp->msg->sadb_msg_satype; break; default: + socket_unlock(so, 1); return key_senderror(so, m, EINVAL); } + socket_unlock(so, 1); /* send the original message back to everyone */ mhp->msg->sadb_msg_errno = 0; @@ -6806,8 +6969,8 @@ key_promisc(so, m, mhp) } } -static int (*key_typesw[]) __P((struct socket *, struct mbuf *, - const struct sadb_msghdr *)) = { +static int (*key_typesw[])(struct socket *, struct mbuf *, + const struct sadb_msghdr *) = { NULL, /* SADB_RESERVED */ key_getspi, /* SADB_GETSPI */ key_update, /* SADB_UPDATE */ @@ -6855,6 +7018,8 @@ key_parse(m, so) int error; int target; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (m == NULL || so == NULL) panic("key_parse: NULL pointer is passed.\n"); @@ -7105,6 +7270,8 @@ key_senderror(so, m, code) { struct sadb_msg *msg; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + if (m->m_len < sizeof(struct sadb_msg)) panic("invalid mbuf passed to key_senderror"); @@ -7283,7 +7450,7 @@ key_validate_ext(ext, len) } void -key_init() +key_domain_init() { int i; @@ -7315,7 +7482,7 @@ key_init() #endif #ifndef IPSEC_DEBUG2 - timeout((void *)key_timehandler_funnel, (void *)0, hz); + timeout((void *)key_timehandler, (void *)0, hz); #endif /*IPSEC_DEBUG2*/ /* initialize key statistics */ @@ -7343,6 +7510,8 @@ 
key_checktunnelsanity(sav, family, src, dst) caddr_t src; caddr_t dst; { + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ if (sav->sah == NULL) panic("sav->sah == NULL at key_checktunnelsanity"); @@ -7422,6 +7591,8 @@ key_sa_recordxfer(sav, m) struct secasvar *sav; struct mbuf *m; { + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + if (!sav) panic("key_sa_recordxfer called with sav == NULL"); if (!m) @@ -7473,6 +7644,7 @@ key_sa_routechange(dst) struct secashead *sah; struct route *ro; + lck_mtx_lock(sadb_mutex); LIST_FOREACH(sah, &sahtree, chain) { ro = &sah->sa_route; if (ro->ro_rt && dst->sa_len == ro->ro_dst.sa_len @@ -7481,6 +7653,7 @@ key_sa_routechange(dst) ro->ro_rt = (struct rtentry *)NULL; } } + lck_mtx_unlock(sadb_mutex); return; } @@ -7490,6 +7663,8 @@ key_sa_chgstate(sav, state) struct secasvar *sav; u_int8_t state; { + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + if (sav == NULL) panic("key_sa_chgstate called with sav == NULL"); @@ -7508,6 +7683,8 @@ key_sa_stir_iv(sav) struct secasvar *sav; { + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + if (!sav->iv) panic("key_sa_stir_iv called with sav == NULL"); key_randomfill(sav->iv, sav->ivlen); diff --git a/bsd/netkey/key.h b/bsd/netkey/key.h index 99cf8bee4..ce509fa9f 100644 --- a/bsd/netkey/key.h +++ b/bsd/netkey/key.h @@ -33,8 +33,7 @@ #define _NETKEY_KEY_H_ #include <sys/appleapiopts.h> -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE extern struct key_cb key_cb; @@ -47,34 +46,30 @@ struct socket; struct sadb_msg; struct sadb_x_policy; -extern struct secpolicy *key_allocsp __P((struct secpolicyindex *, u_int)); -extern struct secpolicy *key_gettunnel __P((struct sockaddr *, - struct sockaddr *, struct sockaddr *, struct sockaddr *)); -extern int key_checkrequest - __P((struct ipsecrequest *isr, struct secasindex *)); -extern struct secasvar *key_allocsa __P((u_int, caddr_t, caddr_t, - u_int, u_int32_t)); -extern void key_freesp __P((struct secpolicy *)); -extern void key_freeso __P((struct socket *)); -extern void key_freesav __P((struct secasvar *)); -extern struct secpolicy *key_newsp __P((void)); -extern struct secpolicy *key_msg2sp __P((struct sadb_x_policy *, - size_t, int *)); -extern struct mbuf *key_sp2msg __P((struct secpolicy *)); -extern int key_ismyaddr __P((struct sockaddr *)); -extern int key_spdacquire __P((struct secpolicy *)); -extern void key_timehandler __P((void)); -extern u_long key_random __P((void)); -extern void key_randomfill __P((void *, size_t)); -extern void key_freereg __P((struct socket *)); -extern int key_parse __P((struct mbuf *, struct socket *)); -extern void key_init __P((void)); -extern int key_checktunnelsanity __P((struct secasvar *, u_int, - caddr_t, caddr_t)); -extern void key_sa_recordxfer __P((struct secasvar *, struct mbuf *)); -extern void key_sa_routechange __P((struct sockaddr *)); -extern void key_sa_stir_iv __P((struct secasvar *)); +extern struct secpolicy *key_allocsp(struct secpolicyindex *, u_int); +extern struct secpolicy *key_gettunnel(struct sockaddr *, + struct sockaddr *, struct sockaddr *, struct sockaddr *); +extern int key_checkrequest(struct ipsecrequest *isr, struct secasindex *); +extern struct secasvar *key_allocsa(u_int, caddr_t, caddr_t, + u_int, u_int32_t); +extern void key_freesp(struct secpolicy *); +extern void key_freeso(struct socket *); +extern void key_freesav(struct secasvar *); +extern struct secpolicy *key_newsp(void); +extern struct secpolicy *key_msg2sp(struct sadb_x_policy *, size_t, int *); 
+extern struct mbuf *key_sp2msg(struct secpolicy *); +extern int key_ismyaddr(struct sockaddr *); +extern int key_spdacquire(struct secpolicy *); +extern void key_timehandler(void); +extern u_long key_random(void); +extern void key_randomfill(void *, size_t); +extern void key_freereg(struct socket *); +extern int key_parse(struct mbuf *, struct socket *); +extern void key_domain_init(void); +extern int key_checktunnelsanity(struct secasvar *, u_int, caddr_t, caddr_t); +extern void key_sa_recordxfer(struct secasvar *, struct mbuf *); +extern void key_sa_routechange(struct sockaddr *); +extern void key_sa_stir_iv(struct secasvar *); -#endif /* __APPLE_API_PRIVATE */ -#endif /* defined(_KERNEL) */ +#endif /* KERNEL_PRIVATE */ #endif /* _NETKEY_KEY_H_ */ diff --git a/bsd/netkey/key_debug.c b/bsd/netkey/key_debug.c index 6db61a105..5c3acffdc 100644 --- a/bsd/netkey/key_debug.c +++ b/bsd/netkey/key_debug.c @@ -55,17 +55,17 @@ #if !defined(KERNEL) || (defined(KERNEL) && defined(IPSEC_DEBUG)) -static void kdebug_sadb_prop __P((struct sadb_ext *)); -static void kdebug_sadb_identity __P((struct sadb_ext *)); -static void kdebug_sadb_supported __P((struct sadb_ext *)); -static void kdebug_sadb_lifetime __P((struct sadb_ext *)); -static void kdebug_sadb_sa __P((struct sadb_ext *)); -static void kdebug_sadb_address __P((struct sadb_ext *)); -static void kdebug_sadb_key __P((struct sadb_ext *)); -static void kdebug_sadb_x_sa2 __P((struct sadb_ext *)); +static void kdebug_sadb_prop(struct sadb_ext *); +static void kdebug_sadb_identity(struct sadb_ext *); +static void kdebug_sadb_supported(struct sadb_ext *); +static void kdebug_sadb_lifetime(struct sadb_ext *); +static void kdebug_sadb_sa(struct sadb_ext *); +static void kdebug_sadb_address(struct sadb_ext *); +static void kdebug_sadb_key(struct sadb_ext *); +static void kdebug_sadb_x_sa2(struct sadb_ext *); #ifdef KERNEL -static void kdebug_secreplay __P((struct secreplay *)); +static void kdebug_secreplay(struct secreplay *); #endif #ifndef KERNEL diff --git a/bsd/netkey/key_debug.h b/bsd/netkey/key_debug.h index ebbf17f04..a9f823d77 100644 --- a/bsd/netkey/key_debug.h +++ b/bsd/netkey/key_debug.h @@ -33,8 +33,6 @@ #define _NETKEY_KEY_DEBUG_H_ #include <sys/appleapiopts.h> -#if !defined(KERNEL) || (defined(KERNEL) && defined(IPSEC_DEBUG)) - /* debug flags */ #define KEYDEBUG_STAMP 0x00000001 /* path */ #define KEYDEBUG_DATA 0x00000002 /* data */ @@ -59,11 +57,10 @@ struct sadb_msg; struct sadb_ext; -extern void kdebug_sadb __P((struct sadb_msg *)); -extern void kdebug_sadb_x_policy __P((struct sadb_ext *)); +extern void kdebug_sadb(struct sadb_msg *); +extern void kdebug_sadb_x_policy(struct sadb_ext *); #ifdef KERNEL -#ifdef __APPLE_API_PRIVATE extern u_int32_t key_debug_level; struct secpolicy; @@ -72,26 +69,21 @@ struct secasindex; struct secasvar; struct secreplay; struct mbuf; -extern void kdebug_secpolicy __P((struct secpolicy *)); -extern void kdebug_secpolicyindex __P((struct secpolicyindex *)); -extern void kdebug_secasindex __P((struct secasindex *)); -extern void kdebug_secasv __P((struct secasvar *)); -extern void kdebug_mbufhdr __P((struct mbuf *)); -extern void kdebug_mbuf __P((struct mbuf *)); -#endif /* __APPLE_API_PRIVATE */ -#endif /*KERNEL*/ +extern void kdebug_secpolicy(struct secpolicy *); +extern void kdebug_secpolicyindex(struct secpolicyindex *); +extern void kdebug_secasindex(struct secasindex *); +extern void kdebug_secasv(struct secasvar *); +extern void kdebug_mbufhdr(struct mbuf *); +extern void kdebug_mbuf(struct mbuf *); 
+#endif /* KERNEL */ struct sockaddr; -extern void kdebug_sockaddr __P((struct sockaddr *)); - -extern void ipsec_hexdump __P((caddr_t, int)); -extern void ipsec_bindump __P((caddr_t, int)); +extern void kdebug_sockaddr(struct sockaddr *); -#else +extern void ipsec_hexdump(caddr_t, int); +extern void ipsec_bindump(caddr_t, int); #define KEYDEBUG(lev,arg) -#endif /*!defined(KERNEL) || (defined(KERNEL) && defined(IPSEC_DEBUG))*/ - #endif /* _NETKEY_KEY_DEBUG_H_ */ diff --git a/bsd/netkey/key_var.h b/bsd/netkey/key_var.h index aa7d7f677..7f3f45c52 100644 --- a/bsd/netkey/key_var.h +++ b/bsd/netkey/key_var.h @@ -32,7 +32,6 @@ #ifndef _NETKEY_KEY_VAR_H_ #define _NETKEY_KEY_VAR_H_ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE /* sysctl */ #define KEYCTL_DEBUG_LEVEL 1 @@ -47,9 +46,11 @@ #define KEYCTL_ESP_AUTH 10 #define KEYCTL_AH_KEYMIN 11 #define KEYCTL_PREFERED_OLDSA 12 -#define KEYCTL_NATT_KEEPALIVE_INTERVAL 13 -#define KEYCTL_MAXID 14 +#define KEYCTL_NATT_KEEPALIVE_INTERVAL 13 +#define KEYCTL_PFKEYSTAT 14 +#define KEYCTL_MAXID 15 +#ifdef KERNEL_PRIVATE #define KEYCTL_NAMES { \ { 0, 0 }, \ { "debug", CTLTYPE_INT }, \ @@ -65,6 +66,7 @@ { "ah_keymin", CTLTYPE_INT }, \ { "prefered_oldsa", CTLTYPE_INT }, \ { "natt_keepalive_interval", CTLTYPE_INT }, \ + { "pfkeystat", CTLTYPE_STRUCT }, \ } #define KEYCTL_VARS { \ @@ -90,5 +92,6 @@ #define _KEYBUF(key) ((caddr_t)((caddr_t)(key) + sizeof(struct sadb_key))) #endif /*KERNEL*/ -#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL_PRIVATE */ #endif /* _NETKEY_KEY_VAR_H_ */ + diff --git a/bsd/netkey/keydb.c b/bsd/netkey/keydb.c index a96589042..8da09b877 100644 --- a/bsd/netkey/keydb.c +++ b/bsd/netkey/keydb.c @@ -51,7 +51,7 @@ MALLOC_DEFINE(M_SECA, "key mgmt", "security associations, key management"); -static void keydb_delsecasvar __P((struct secasvar *)); +static void keydb_delsecasvar(struct secasvar *); /* * secpolicy management diff --git a/bsd/netkey/keydb.h b/bsd/netkey/keydb.h index 8c70fa95a..80b16054f 100644 --- a/bsd/netkey/keydb.h +++ b/bsd/netkey/keydb.h @@ -34,7 +34,7 @@ #include <sys/appleapiopts.h> #ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE #include <netkey/key_var.h> @@ -70,7 +70,7 @@ struct secashead { /* Security Association */ struct secasvar { LIST_ENTRY(secasvar) chain; - + LIST_ENTRY(secasvar) spihash; int refcnt; /* reference count */ u_int8_t state; /* Status of this Association */ @@ -144,23 +144,23 @@ struct key_cb { }; /* secpolicy */ -extern struct secpolicy *keydb_newsecpolicy __P((void)); -extern void keydb_delsecpolicy __P((struct secpolicy *)); +extern struct secpolicy *keydb_newsecpolicy(void); +extern void keydb_delsecpolicy(struct secpolicy *); /* secashead */ -extern struct secashead *keydb_newsecashead __P((void)); -extern void keydb_delsecashead __P((struct secashead *)); +extern struct secashead *keydb_newsecashead(void); +extern void keydb_delsecashead(struct secashead *); /* secasvar */ -extern struct secasvar *keydb_newsecasvar __P((void)); -extern void keydb_refsecasvar __P((struct secasvar *)); -extern void keydb_freesecasvar __P((struct secasvar *)); +extern struct secasvar *keydb_newsecasvar(void); +extern void keydb_refsecasvar(struct secasvar *); +extern void keydb_freesecasvar(struct secasvar *); /* secreplay */ -extern struct secreplay *keydb_newsecreplay __P((size_t)); -extern void keydb_delsecreplay __P((struct secreplay *)); +extern struct secreplay *keydb_newsecreplay(size_t); +extern void keydb_delsecreplay(struct secreplay *); /* secreg */ -extern struct secreg
*keydb_newsecreg __P((void)); -extern void keydb_delsecreg __P((struct secreg *)); +extern struct secreg *keydb_newsecreg(void); +extern void keydb_delsecreg(struct secreg *); -#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL_PRIVATE */ #endif /* KERNEL */ #endif /* _NETKEY_KEYDB_H_ */ diff --git a/bsd/netkey/keysock.c b/bsd/netkey/keysock.c index 364473ea0..a65bcc690 100644 --- a/bsd/netkey/keysock.c +++ b/bsd/netkey/keysock.c @@ -44,6 +44,7 @@ #include <sys/protosw.h> #include <sys/errno.h> +#include <kern/locks.h> #include <net/raw_cb.h> #include <net/route.h> @@ -54,10 +55,14 @@ #include <netkey/keysock.h> #include <netkey/key_debug.h> +extern lck_mtx_t *raw_mtx; +extern lck_mtx_t *sadb_mutex; +extern void key_init(void); + struct sockaddr key_dst = { 2, PF_KEY, }; struct sockaddr key_src = { 2, PF_KEY, }; -static int key_sendup0 __P((struct rawcb *, struct mbuf *, int)); +static int key_sendup0(struct rawcb *, struct mbuf *, int); struct pfkeystat pfkeystat; @@ -80,7 +85,6 @@ key_output(m, va_alist) { struct sadb_msg *msg; int len, error = 0; - int s; #ifndef __APPLE__ struct socket *so; va_list ap; @@ -93,6 +97,8 @@ key_output(m, va_alist) if (m == 0) panic("key_output: NULL pointer was passed.\n"); + socket_unlock(so, 0); + lck_mtx_lock(sadb_mutex); pfkeystat.out_total++; pfkeystat.out_bytes += m->m_pkthdr.len; @@ -135,14 +141,14 @@ key_output(m, va_alist) goto end; } - /*XXX giant lock*/ - s = splnet(); error = key_parse(m, so); m = NULL; - splx(s); + end: if (m) m_freem(m); + lck_mtx_unlock(sadb_mutex); + socket_lock(so, 0); return error; } @@ -157,6 +163,7 @@ key_sendup0(rp, m, promisc) { int error; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); if (promisc) { struct sadb_msg *pmsg; @@ -184,103 +191,18 @@ key_sendup0(rp, m, promisc) } if (!sbappendaddr(&rp->rcb_socket->so_rcv, (struct sockaddr *)&key_src, - m, NULL)) { + m, NULL, &error)) { #if IPSEC_DEBUG printf("key_sendup0: sbappendaddr failed\n"); #endif pfkeystat.in_nomem++; - m_freem(m); - error = ENOBUFS; - } else - error = 0; - sorwakeup(rp->rcb_socket); + } + else { + sorwakeup(rp->rcb_socket); + } return error; } -/* XXX this interface should be obsoleted. */ -int -key_sendup(so, msg, len, target) - struct socket *so; - struct sadb_msg *msg; - u_int len; - int target; /*target of the resulting message*/ -{ - struct mbuf *m, *n, *mprev; - int tlen; - - /* sanity check */ - if (so == 0 || msg == 0) - panic("key_sendup: NULL pointer was passed.\n"); - - KEYDEBUG(KEYDEBUG_KEY_DUMP, - printf("key_sendup: \n"); - kdebug_sadb(msg)); - - /* - * we increment statistics here, just in case we have ENOBUFS - * in this function. - */ - pfkeystat.in_total++; - pfkeystat.in_bytes += len; - pfkeystat.in_msgtype[msg->sadb_msg_type]++; - - /* - * Get mbuf chain whenever possible (not clusters), - * to save socket buffer. We'll be generating many SADB_ACQUIRE - * messages to listening key sockets. If we simply allocate clusters, - * sbappendaddr() will raise ENOBUFS due to too little sbspace(). - * sbspace() computes # of actual data bytes AND mbuf region. - * - * TODO: SADB_ACQUIRE filters should be implemented. - */ - tlen = len; - m = mprev = NULL; - while (tlen > 0) { - if (tlen == len) { - MGETHDR(n, M_DONTWAIT, MT_DATA); - n->m_len = MHLEN; - } else { - MGET(n, M_DONTWAIT, MT_DATA); - n->m_len = MLEN; - } - if (!n) { - pfkeystat.in_nomem++; - return ENOBUFS; - } - if (tlen >= MCLBYTES) { /*XXX better threshold? 
*/ - MCLGET(n, M_DONTWAIT); - if ((n->m_flags & M_EXT) == 0) { - m_free(n); - m_freem(m); - pfkeystat.in_nomem++; - return ENOBUFS; - } - n->m_len = MCLBYTES; - } - - if (tlen < n->m_len) - n->m_len = tlen; - n->m_next = NULL; - if (m == NULL) - m = mprev = n; - else { - mprev->m_next = n; - mprev = n; - } - tlen -= n->m_len; - n = NULL; - } - m->m_pkthdr.len = len; - m->m_pkthdr.rcvif = NULL; - m_copyback(m, 0, len, (caddr_t)msg); - - /* avoid duplicated statistics */ - pfkeystat.in_total--; - pfkeystat.in_bytes -= len; - pfkeystat.in_msgtype[msg->sadb_msg_type]--; - - return key_sendup_mbuf(so, m, target); -} /* so can be NULL if target != KEY_SENDUP_ONE */ int @@ -295,6 +217,7 @@ key_sendup_mbuf(so, m, target) struct rawcb *rp; int error = 0; + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); if (m == NULL) panic("key_sendup_mbuf: NULL pointer was passed.\n"); if (so == NULL && target == KEY_SENDUP_ONE) @@ -318,7 +241,8 @@ key_sendup_mbuf(so, m, target) msg = mtod(m, struct sadb_msg *); pfkeystat.in_msgtype[msg->sadb_msg_type]++; } - + + lck_mtx_lock(raw_mtx); LIST_FOREACH(rp, &rawcb_list, list) { if (rp->rcb_proto.sp_family != PF_KEY) @@ -329,7 +253,8 @@ key_sendup_mbuf(so, m, target) } kp = (struct keycb *)rp; - + + socket_lock(rp->rcb_socket, 1); /* * If you are in promiscuous mode, and when you get broadcasted * reply, you'll get two PF_KEY messages. @@ -343,15 +268,15 @@ key_sendup_mbuf(so, m, target) } /* the exact target will be processed later */ - if (so && sotorawcb(so) == rp) + if (so && sotorawcb(so) == rp) { + socket_unlock(rp->rcb_socket, 1); continue; + } sendup = 0; switch (target) { case KEY_SENDUP_ONE: /* the statement has no effect */ - if (so && sotorawcb(so) == rp) - sendup++; break; case KEY_SENDUP_ALL: sendup++; @@ -363,8 +288,12 @@ key_sendup_mbuf(so, m, target) } pfkeystat.in_msgtarget[target]++; - if (!sendup) + if (!sendup) { + socket_unlock(rp->rcb_socket, 1); continue; + } + else + sendup = 0; // clear for next iteration if ((n = m_copy(m, 0, (int)M_COPYALL)) == NULL) { #if IPSEC_DEBUG @@ -372,19 +301,26 @@ key_sendup_mbuf(so, m, target) #endif m_freem(m); pfkeystat.in_nomem++; + socket_unlock(rp->rcb_socket, 1); + lck_mtx_unlock(raw_mtx); return ENOBUFS; } - if ((error = key_sendup0(rp, n, 0)) != 0) { - m_freem(m); - return error; - } - + /* + * ignore error even if queue is full. PF_KEY does not + * guarantee the delivery of the message. + * this is important when target == KEY_SENDUP_ALL. + */ + key_sendup0(rp, n, 0); + socket_unlock(rp->rcb_socket, 1); n = NULL; } + lck_mtx_unlock(raw_mtx); if (so) { + socket_lock(so, 1); error = key_sendup0(sotorawcb(so), m, 0); + socket_unlock(so, 1); m = NULL; } else { error = 0; @@ -400,10 +336,8 @@ key_sendup_mbuf(so, m, target) static int key_abort(struct socket *so) { - int s, error; - s = splnet(); + int error; error = raw_usrreqs.pru_abort(so); - splx(s); return error; } @@ -415,7 +349,7 @@ static int key_attach(struct socket *so, int proto, struct proc *p) { struct keycb *kp; - int s, error; + int error; if (sotorawcb(so) != 0) return EISCONN; /* XXX panic? */ @@ -424,36 +358,29 @@ key_attach(struct socket *so, int proto, struct proc *p) return ENOBUFS; bzero(kp, sizeof *kp); - /* - * The splnet() is necessary to block protocols from sending - * error notifications (like RTM_REDIRECT or RTM_LOSING) while - * this PCB is extant but incompletely initialized. - * Probably we should try to do more of this work beforehand and - * eliminate the spl. 
- */ - s = splnet(); so->so_pcb = (caddr_t)kp; + kp->kp_promisc = kp->kp_registered = 0; + kp->kp_raw.rcb_laddr = &key_src; + kp->kp_raw.rcb_faddr = &key_dst; + error = raw_usrreqs.pru_attach(so, proto, p); kp = (struct keycb *)sotorawcb(so); if (error) { _FREE(kp, M_PCB); so->so_pcb = (caddr_t) 0; - splx(s); + so->so_flags |= SOF_PCBCLEARING; printf("key_usrreq: key_usrreq results %d\n", error); return error; } - kp->kp_promisc = kp->kp_registered = 0; - + socket_lock(so, 1); if (kp->kp_raw.rcb_proto.sp_protocol == PF_KEY) /* XXX: AF_KEY */ key_cb.key_count++; key_cb.any_count++; - kp->kp_raw.rcb_laddr = &key_src; - kp->kp_raw.rcb_faddr = &key_dst; soisconnected(so); so->so_options |= SO_USELOOPBACK; + socket_unlock(so, 1); - splx(s); return 0; } @@ -464,10 +391,8 @@ key_attach(struct socket *so, int proto, struct proc *p) static int key_bind(struct socket *so, struct sockaddr *nam, struct proc *p) { - int s, error; - s = splnet(); + int error; error = raw_usrreqs.pru_bind(so, nam, p); /* xxx just EINVAL */ - splx(s); return error; } @@ -478,10 +403,8 @@ key_bind(struct socket *so, struct sockaddr *nam, struct proc *p) static int key_connect(struct socket *so, struct sockaddr *nam, struct proc *p) { - int s, error; - s = splnet(); + int error; error = raw_usrreqs.pru_connect(so, nam, p); /* XXX just EINVAL */ - splx(s); return error; } @@ -493,19 +416,19 @@ static int key_detach(struct socket *so) { struct keycb *kp = (struct keycb *)sotorawcb(so); - int s, error; + int error; - s = splnet(); if (kp != 0) { - if (kp->kp_raw.rcb_proto.sp_protocol - == PF_KEY) /* XXX: AF_KEY */ + if (kp->kp_raw.rcb_proto.sp_protocol == PF_KEY) /* XXX: AF_KEY */ key_cb.key_count--; key_cb.any_count--; - + socket_unlock(so, 0); + lck_mtx_lock(sadb_mutex); key_freereg(so); + lck_mtx_unlock(sadb_mutex); + socket_lock(so, 0); } error = raw_usrreqs.pru_detach(so); - splx(s); return error; } @@ -516,10 +439,8 @@ key_detach(struct socket *so) static int key_disconnect(struct socket *so) { - int s, error; - s = splnet(); + int error; error = raw_usrreqs.pru_disconnect(so); - splx(s); return error; } @@ -530,10 +451,8 @@ key_disconnect(struct socket *so) static int key_peeraddr(struct socket *so, struct sockaddr **nam) { - int s, error; - s = splnet(); + int error; error = raw_usrreqs.pru_peeraddr(so, nam); - splx(s); return error; } @@ -545,10 +464,8 @@ static int key_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct proc *p) { - int s, error; - s = splnet(); + int error; error = raw_usrreqs.pru_send(so, flags, m, nam, control, p); - splx(s); return error; } @@ -559,10 +476,8 @@ key_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, static int key_shutdown(struct socket *so) { - int s, error; - s = splnet(); + int error; error = raw_usrreqs.pru_shutdown(so); - splx(s); return error; } @@ -573,10 +488,8 @@ key_shutdown(struct socket *so) static int key_sockaddr(struct socket *so, struct sockaddr **nam) { - int s, error; - s = splnet(); + int error; error = raw_usrreqs.pru_sockaddr(so, nam); - splx(s); return error; } @@ -587,7 +500,7 @@ struct pr_usrreqs key_usrreqs = { key_disconnect, pru_listen_notsupp, key_peeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, key_send, pru_sense_null, key_shutdown, - key_sockaddr, sosend, soreceive, sopoll + key_sockaddr, sosend, soreceive, pru_sopoll_notsupp }; /* sysctl */ @@ -603,13 +516,15 @@ struct protosw keysw[] = { { SOCK_RAW, &keydomain, PF_KEY_V2, PR_ATOMIC|PR_ADDR, 0, key_output, raw_ctlinput, 0, 0, - 
raw_init, 0, 0, 0, - 0, &key_usrreqs + key_init, 0, 0, 0, + 0, + &key_usrreqs, + 0, 0, 0, } }; struct domain keydomain = - { PF_KEY, "key", key_init, 0, 0, + { PF_KEY, "key", key_domain_init, 0, 0, keysw, 0, 0,0, sizeof(struct key_cb), 0 diff --git a/bsd/netkey/keysock.h b/bsd/netkey/keysock.h index f176d1a20..c055f2488 100644 --- a/bsd/netkey/keysock.h +++ b/bsd/netkey/keysock.h @@ -62,8 +62,8 @@ struct pfkeystat { #define KEY_SENDUP_ALL 1 #define KEY_SENDUP_REGISTERED 2 +#ifdef KERNEL_PRIVATE #ifdef KERNEL -#ifdef __APPLE_API_PRIVATE struct keycb { struct rawcb kp_raw; /* rawcb */ int kp_promisc; /* promiscuous mode */ @@ -73,16 +73,16 @@ struct keycb { extern struct pfkeystat pfkeystat; #ifdef __APPLE__ -extern int key_output __P((struct mbuf *, struct socket* so)); +extern int key_output(struct mbuf *, struct socket* so); #else -extern int key_output __P((struct mbuf *, ...)); +extern int key_output(struct mbuf *, ...); #endif -extern int key_usrreq __P((struct socket *, - int, struct mbuf *, struct mbuf *, struct mbuf *)); +extern int key_usrreq(struct socket *, + int, struct mbuf *, struct mbuf *, struct mbuf *); -extern int key_sendup __P((struct socket *, struct sadb_msg *, u_int, int)); -extern int key_sendup_mbuf __P((struct socket *, struct mbuf *, int)); -#endif /* __APPLE_API_PRIVATE */ +extern int key_sendup(struct socket *, struct sadb_msg *, u_int, int); +extern int key_sendup_mbuf(struct socket *, struct mbuf *, int); #endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ #endif /*_NETKEY_KEYSOCK_H_*/ diff --git a/bsd/nfs/Makefile b/bsd/nfs/Makefile index 55a7668eb..fa4b4198d 100644 --- a/bsd/nfs/Makefile +++ b/bsd/nfs/Makefile @@ -21,8 +21,8 @@ EXPINC_SUBDIRS_I386 = \ DATAFILES = \ krpc.h nfs.h nfsdiskless.h nfsm_subs.h nfsmount.h nfsnode.h \ - nlminfo.h nfs_lock.h \ - nfsproto.h nfsrtt.h nfsrvcache.h nqnfs.h rpcv2.h xdr_subs.h + nfs_lock.h \ + nfsproto.h nfsrtt.h nfsrvcache.h rpcv2.h xdr_subs.h INSTALL_MI_LIST = ${DATAFILES} diff --git a/bsd/nfs/krpc.h b/bsd/nfs/krpc.h index 16fe898a8..5d69fd6f6 100644 --- a/bsd/nfs/krpc.h +++ b/bsd/nfs/krpc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -20,13 +20,7 @@ * @APPLE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * - * The NEXTSTEP Software License Agreement specifies the terms - * and conditions for redistribution. 
- * - */ - + #ifndef __NFS_KRPC_H__ #define __NFS_KRPC_H__ @@ -35,12 +29,12 @@ #include <sys/cdefs.h> #ifdef __APPLE_API_PRIVATE -int krpc_call __P((struct sockaddr_in *sin, +int krpc_call(struct sockaddr_in *sin, u_int sotype, u_int prog, u_int vers, u_int func, - struct mbuf **data, struct sockaddr_in **from)); + mbuf_t *data, struct sockaddr_in *from); -int krpc_portmap __P((struct sockaddr_in *sin, - u_int prog, u_int vers, u_int16_t *portp)); +int krpc_portmap(struct sockaddr_in *sin, + u_int prog, u_int vers, u_int proto, u_int16_t *portp); /* diff --git a/bsd/nfs/krpc_subr.c b/bsd/nfs/krpc_subr.c index 8726f59d9..694b152f5 100644 --- a/bsd/nfs/krpc_subr.c +++ b/bsd/nfs/krpc_subr.c @@ -64,12 +64,13 @@ #include <sys/ioctl.h> #include <sys/proc.h> #include <sys/mount.h> -#include <sys/mbuf.h> +#include <sys/kpi_mbuf.h> #include <sys/malloc.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/systm.h> #include <sys/reboot.h> +#include <sys/uio_internal.h> #include <net/if.h> #include <netinet/in.h> @@ -140,10 +141,10 @@ struct rpc_reply { * Returns non-zero error on failure. */ int -krpc_portmap(sin, prog, vers, portp) - struct sockaddr_in *sin; /* server address */ - u_int prog, vers; /* host order */ - u_int16_t *portp; /* network order */ +krpc_portmap(sin, prog, vers, proto, portp) + struct sockaddr_in *sin; /* server address */ + u_int prog, vers, proto; /* host order */ + u_int16_t *portp; /* network order */ { struct sdata { u_int32_t prog; /* call program */ @@ -155,7 +156,7 @@ krpc_portmap(sin, prog, vers, portp) u_int16_t pad; u_int16_t port; } *rdata; - struct mbuf *m; + mbuf_t m; int error; /* The portmapper port is fixed. */ @@ -164,30 +165,32 @@ krpc_portmap(sin, prog, vers, portp) return 0; } - m = m_gethdr(M_WAIT, MT_DATA); - if (m == NULL) - return ENOBUFS; - m->m_len = sizeof(*sdata); - m->m_pkthdr.len = m->m_len; - sdata = mtod(m, struct sdata *); + error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &m); + if (error) + return error; + mbuf_setlen(m, sizeof(*sdata)); + mbuf_pkthdr_setlen(m, sizeof(*sdata)); + sdata = mbuf_data(m); /* Do the RPC to get it. */ sdata->prog = htonl(prog); sdata->vers = htonl(vers); - sdata->proto = htonl(IPPROTO_UDP); + sdata->proto = htonl(proto); sdata->port = 0; sin->sin_port = htons(PMAPPORT); - error = krpc_call(sin, PMAPPROG, PMAPVERS, - PMAPPROC_GETPORT, &m, NULL); + error = krpc_call(sin, SOCK_DGRAM, PMAPPROG, PMAPVERS, PMAPPROC_GETPORT, &m, NULL); if (error) return error; - rdata = mtod(m, struct rdata *); + rdata = mbuf_data(m); *portp = rdata->port; - m_freem(m); - return 0; + if (!rdata->port) + error = EPROGUNAVAIL; + + mbuf_freem(m); + return (error); } /* @@ -196,22 +199,21 @@ krpc_portmap(sin, prog, vers, portp) * the address from whence the response came is saved there. */ int -krpc_call(sa, prog, vers, func, data, from_p) +krpc_call(sa, sotype, prog, vers, func, data, from_p) struct sockaddr_in *sa; - u_int prog, vers, func; - struct mbuf **data; /* input/output */ - struct sockaddr_in **from_p; /* output */ + u_int sotype, prog, vers, func; + mbuf_t *data; /* input/output */ + struct sockaddr_in *from_p; /* output */ { - struct socket *so; + socket_t so; struct sockaddr_in *sin; - struct mbuf *m, *nam, *mhead, *mhck; + mbuf_t m, nam, mhead; struct rpc_call *call; struct rpc_reply *reply; - struct uio auio; - int error, rcvflg, timo, secs, len; + int error, timo, secs, len; static u_int32_t xid = ~0xFF; u_int16_t tport; - struct sockopt sopt; + int maxpacket = 1<<16; /* * Validate address family. 
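The krpc_subr.c hunks that follow rewrite the RPC helpers against the opaque mbuf and socket KPIs (sys/kpi_mbuf.h, sys/kpi_socket.h) instead of dereferencing struct mbuf directly. The mapping is mechanical: m_gethdr() becomes mbuf_gethdr(), mtod() becomes mbuf_data(), and direct m_len/m_pkthdr.len stores become mbuf_setlen()/mbuf_pkthdr_setlen(). A hedged sketch of the pattern, mirroring the converted krpc_portmap() above (example_build_request is a hypothetical helper and assumes the request fits in a single packet header mbuf):

    #include <sys/systm.h>
    #include <sys/kpi_mbuf.h>

    static int
    example_build_request(size_t reqlen, mbuf_t *mp)
    {
            mbuf_t m;
            int error;

            /* was: m = m_gethdr(M_WAIT, MT_DATA); */
            error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &m);
            if (error)
                    return (error);

            /* was: m->m_len = m->m_pkthdr.len = reqlen; */
            mbuf_setlen(m, reqlen);
            mbuf_pkthdr_setlen(m, reqlen);

            /* was: p = mtod(m, struct sdata *); fill the payload in place */
            bzero(mbuf_data(m), reqlen);

            *mp = m;
            return (0);
    }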
@@ -222,13 +224,11 @@ krpc_call(sa, prog, vers, func, data, from_p) /* Free at end if not null. */ nam = mhead = NULL; - if (from_p) - *from_p = 0; /* * Create socket and set its recieve timeout. */ - if ((error = socreate(AF_INET, &so, SOCK_DGRAM, 0))) + if ((error = sock_socket(AF_INET, sotype, 0, 0, 0, &so))) goto out; { @@ -236,14 +236,8 @@ krpc_call(sa, prog, vers, func, data, from_p) tv.tv_sec = 1; tv.tv_usec = 0; - bzero(&sopt, sizeof sopt); - sopt.sopt_dir = SOPT_SET; - sopt.sopt_level = SOL_SOCKET; - sopt.sopt_name = SO_RCVTIMEO; - sopt.sopt_val = &tv; - sopt.sopt_valsize = sizeof tv; - - if (error = sosetopt(so, &sopt)) + + if ((error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))) goto out; } @@ -252,12 +246,9 @@ krpc_call(sa, prog, vers, func, data, from_p) * Enable broadcast if necessary. */ - if (from_p) { + if (from_p && (sotype == SOCK_DGRAM)) { int on = 1; - sopt.sopt_name = SO_BROADCAST; - sopt.sopt_val = &on; - sopt.sopt_valsize = sizeof on; - if (error = sosetopt(so, &sopt)) + if ((error = sock_setsockopt(so, SOL_SOCKET, SO_BROADCAST, &on, sizeof(on)))) goto out; } @@ -266,19 +257,22 @@ krpc_call(sa, prog, vers, func, data, from_p) * because some NFS servers refuse requests from * non-reserved (non-privileged) ports. */ - m = m_getclr(M_WAIT, MT_SONAME); - sin = mtod(m, struct sockaddr_in *); - sin->sin_len = m->m_len = sizeof(*sin); + if ((error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &m))) + goto out; + sin = mbuf_data(m); + bzero(sin, sizeof(*sin)); + mbuf_setlen(m, sizeof(*sin)); + sin->sin_len = sizeof(*sin); sin->sin_family = AF_INET; sin->sin_addr.s_addr = INADDR_ANY; tport = IPPORT_RESERVED; do { tport--; sin->sin_port = htons(tport); - error = sobind(so, mtod(m, struct sockaddr *)); + error = sock_bind(so, (struct sockaddr*)sin); } while (error == EADDRINUSE && tport > IPPORT_RESERVED / 2); - m_freem(m); + mbuf_freem(m); m = NULL; if (error) { printf("bind failed\n"); @@ -288,13 +282,27 @@ krpc_call(sa, prog, vers, func, data, from_p) /* * Setup socket address for the server. */ - nam = m_get(M_WAIT, MT_SONAME); - if (nam == NULL) { - error = ENOBUFS; + if ((error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &nam))) goto out; + sin = mbuf_data(nam); + mbuf_setlen(nam, sa->sin_len); + bcopy((caddr_t)sa, (caddr_t)sin, sa->sin_len); + + if (sotype == SOCK_STREAM) { + struct timeval tv; + tv.tv_sec = 60; + tv.tv_usec = 0; + error = sock_connect(so, mbuf_data(nam), MSG_DONTWAIT); + if (error && (error != EINPROGRESS)) + goto out; + error = sock_connectwait(so, &tv); + if (error) { + if (error == EINPROGRESS) + error = ETIMEDOUT; + printf("krpc_call: error waiting for TCP socket connect: %d\n", error); + goto out; + } } - sin = mtod(nam, struct sockaddr_in *); - bcopy((caddr_t)sa, (caddr_t)sin, (nam->m_len = sa->sin_len)); /* * Prepend RPC message header. 
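For SOCK_STREAM transports, the hunks below frame each message with the 4-byte record marker from RFC 1831 section 10: the high bit flags the last fragment and the low 31 bits carry the fragment length, which is why the send path writes htonl(0x80000000 | (mbuf_pkthdr_len(mhead) - 4)) and the receive path strips the flag with ntohl(len) & ~0x80000000. A small illustrative encode/decode pair (hypothetical helpers, not part of the patch):

    #include <sys/types.h>
    #include <netinet/in.h>

    /* Encode a single-fragment record marker (high bit = last fragment). */
    static u_int32_t
    rpc_recmark_encode(u_int32_t fraglen)
    {
            return (htonl(0x80000000 | fraglen));
    }

    /* Decode a marker read off the wire: returns the fragment length and
     * sets *last when the last-fragment bit is present. */
    static u_int32_t
    rpc_recmark_decode(u_int32_t wiremark, int *last)
    {
            u_int32_t v = ntohl(wiremark);

            *last = (v & 0x80000000) != 0;
            return (v & 0x7fffffff);
    }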
@@ -302,23 +310,31 @@ krpc_call(sa, prog, vers, func, data, from_p) m = *data; *data = NULL; #if DIAGNOSTIC - if ((m->m_flags & M_PKTHDR) == 0) + if ((mbuf_flags(m) & MBUF_PKTHDR) == 0) panic("krpc_call: send data w/o pkthdr"); - if (m->m_pkthdr.len < m->m_len) + if (mbuf_pkthdr_len(m) < mbuf_len(m)) panic("krpc_call: pkthdr.len not set"); #endif - mhead = m_prepend(m, sizeof(*call), M_WAIT); - if (mhead == NULL) { - error = ENOBUFS; + len = sizeof(*call); + if (sotype == SOCK_STREAM) + len += 4; /* account for RPC record marker */ + mhead = m; + if ((error = mbuf_prepend(&mhead, len, MBUF_WAITOK))) + goto out; + if ((error = mbuf_pkthdr_setrcvif(mhead, NULL))) goto out; - } - mhead->m_pkthdr.len += sizeof(*call); - mhead->m_pkthdr.rcvif = NULL; /* * Fill in the RPC header */ - call = mtod(mhead, struct rpc_call *); + if (sotype == SOCK_STREAM) { + /* first, fill in RPC record marker */ + u_long *recmark = mbuf_data(mhead); + *recmark = htonl(0x80000000 | (mbuf_pkthdr_len(mhead) - 4)); + call = (struct rpc_call *)(recmark + 1); + } else { + call = mbuf_data(mhead); + } bzero((caddr_t)call, sizeof(*call)); xid++; call->rp_xid = htonl(xid); @@ -337,13 +353,20 @@ krpc_call(sa, prog, vers, func, data, from_p) */ timo = 0; for (;;) { + struct msghdr msg; + /* Send RPC request (or re-send). */ - m = m_copym(mhead, 0, M_COPYALL, M_WAIT); - if (m == NULL) { - error = ENOBUFS; + if ((error = mbuf_copym(mhead, 0, MBUF_COPYALL, MBUF_WAITOK, &m))) goto out; + bzero(&msg, sizeof(msg)); + if (sotype == SOCK_STREAM) { + msg.msg_name = NULL; + msg.msg_namelen = 0; + } else { + msg.msg_name = mbuf_data(nam); + msg.msg_namelen = mbuf_len(nam); } - error = sosend(so, mtod(nam, struct sockaddr *), NULL, m, NULL, 0); + error = sock_sendmbuf(so, &msg, m, 0, 0); if (error) { printf("krpc_call: sosend: %d\n", error); goto out; @@ -357,31 +380,70 @@ krpc_call(sa, prog, vers, func, data, from_p) printf("RPC timeout for server " IP_FORMAT "\n", IP_LIST(&(sin->sin_addr.s_addr))); - /* - * soreceive is now conditionally using this pointer - * if present, it updates per-proc stats - */ - auio.uio_procp = NULL; - /* * Wait for up to timo seconds for a reply. * The socket receive timeout was set to 1 second. */ secs = timo; while (secs > 0) { - if ((from_p) && (*from_p)){ - FREE(*from_p, M_SONAME); - *from_p = NULL; - } - + size_t readlen; + if (m) { - m_freem(m); + mbuf_freem(m); m = NULL; } - auio.uio_resid = len = 1<<16; - rcvflg = 0; - - error = soreceive(so, (struct sockaddr **) from_p, &auio, &m, NULL, &rcvflg); + if (sotype == SOCK_STREAM) { + int maxretries = 60; + struct iovec_32 aio; + aio.iov_base = (uintptr_t) &len; + aio.iov_len = sizeof(u_long); + bzero(&msg, sizeof(msg)); + msg.msg_iov = (struct iovec *) &aio; + msg.msg_iovlen = 1; + do { + error = sock_receive(so, &msg, MSG_WAITALL, &readlen); + if ((error == EWOULDBLOCK) && (--maxretries <= 0)) + error = ETIMEDOUT; + } while (error == EWOULDBLOCK); + if (!error && readlen < aio.iov_len) { + /* only log a message if we got a partial word */ + if (readlen != 0) + printf("short receive (%d/%d) from server " IP_FORMAT "\n", + readlen, sizeof(u_long), IP_LIST(&(sin->sin_addr.s_addr))); + error = EPIPE; + } + if (error) + goto out; + len = ntohl(len) & ~0x80000000; + /* + * This is SERIOUS! We are out of sync with the sender + * and forcing a disconnect/reconnect is all I can do. 
+ */ + if (len > maxpacket) { + printf("impossible packet length (%d) from server %s\n", + len, IP_LIST(&(sin->sin_addr.s_addr))); + error = EFBIG; + goto out; + } + + do { + readlen = len; + error = sock_receivembuf(so, NULL, &m, MSG_WAITALL, &readlen); + } while (error == EWOULDBLOCK); + + if (!error && (len > (int)readlen)) { + printf("short receive (%d/%d) from server %s\n", + readlen, len, IP_LIST(&(sin->sin_addr.s_addr))); + error = EPIPE; + } + } else { + len = maxpacket; + readlen = len; + bzero(&msg, sizeof(msg)); + msg.msg_name = from_p; + msg.msg_namelen = (from_p == NULL) ? 0 : sizeof(*from_p); + error = sock_receivembuf(so, &msg, &m, 0, &readlen); + } if (error == EWOULDBLOCK) { secs--; @@ -389,14 +451,14 @@ krpc_call(sa, prog, vers, func, data, from_p) } if (error) goto out; - len -= auio.uio_resid; + len = readlen; /* Does the reply contain at least a header? */ if (len < MIN_REPLY_HDR) continue; - if (m->m_len < MIN_REPLY_HDR) + if (mbuf_len(m) < MIN_REPLY_HDR) continue; - reply = mtod(m, struct rpc_reply *); + reply = mbuf_data(m); /* Is it the right reply? */ if (reply->rp_direction != htonl(RPC_REPLY)) @@ -404,7 +466,7 @@ krpc_call(sa, prog, vers, func, data, from_p) if (reply->rp_xid != htonl(xid)) continue; - + /* Was RPC accepted? (authorization OK) */ if (reply->rp_astatus != 0) { error = ntohl(reply->rp_u.rpu_errno); @@ -463,17 +525,16 @@ krpc_call(sa, prog, vers, func, data, from_p) * contiguous (fix callers instead). -gwr */ #if DIAGNOSTIC - if ((m->m_flags & M_PKTHDR) == 0) + if ((mbuf_flags(m) & MBUF_PKTHDR) == 0) panic("krpc_call: received pkt w/o header?"); #endif - len = m->m_pkthdr.len; - if (m->m_len < len) { - m = m_pullup(m, len); - if (m == NULL) { - error = ENOBUFS; + len = mbuf_pkthdr_len(m); + if (sotype == SOCK_STREAM) + len -= 4; /* the RPC record marker was read separately */ + if (mbuf_len(m) < len) { + if ((error = mbuf_pullup(&m, len))) goto out; - } - reply = mtod(m, struct rpc_reply *); + reply = mbuf_data(m); } /* @@ -484,13 +545,13 @@ krpc_call(sa, prog, vers, func, data, from_p) len += ntohl(reply->rp_u.rpu_ok.rp_auth.rp_alen); len = (len + 3) & ~3; /* XXX? */ } - m_adj(m, len); + mbuf_adj(m, len); /* result */ *data = m; out: - if (nam) m_freem(nam); - if (mhead) m_freem(mhead); - soclose(so); + if (nam) mbuf_freem(nam); + if (mhead) mbuf_freem(mhead); + sock_close(so); return error; } diff --git a/bsd/nfs/nfs.h b/bsd/nfs/nfs.h index d9385473a..b955a56e8 100644 --- a/bsd/nfs/nfs.h +++ b/bsd/nfs/nfs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -63,6 +63,7 @@ #define _NFS_NFS_H_ #include <sys/appleapiopts.h> +#include <sys/cdefs.h> #ifdef __APPLE_API_PRIVATE /* @@ -79,6 +80,7 @@ #define NFS_MAXREXMIT 100 /* Stop counting after this many */ #define NFS_MAXWINDOW 1024 /* Max number of outstanding requests */ #define NFS_RETRANS 10 /* Num of retrans for soft mounts */ +#define NFS_TRYLATERDEL 15 /* Initial try later delay (sec) */ #define NFS_MAXGRPS 16 /* Max. size of groups list */ #ifndef NFS_MINATTRTIMO #define NFS_MINATTRTIMO 5 /* Attribute cache timeout in sec */ @@ -86,6 +88,12 @@ #ifndef NFS_MAXATTRTIMO #define NFS_MAXATTRTIMO 60 #endif +#ifndef NFS_MINDIRATTRTIMO +#define NFS_MINDIRATTRTIMO 5 /* directory attribute cache timeout in sec */ +#endif +#ifndef NFS_MAXDIRATTRTIMO +#define NFS_MAXDIRATTRTIMO 60 +#endif #define NFS_WSIZE 16384 /* Def. 
write data size <= 16K */ #define NFS_RSIZE 16384 /* Def. read data size <= 16K */ #define NFS_DGRAM_WSIZE 8192 /* UDP Def. write data size <= 8K */ @@ -111,7 +119,7 @@ #define NMOD(a) ((a) % nfs_asyncdaemons) #define NFS_CMPFH(n, f, s) \ ((n)->n_fhsize == (s) && !bcmp((caddr_t)(n)->n_fhp, (caddr_t)(f), (s))) -#define NFS_ISV3(v) (VFSTONFS((v)->v_mount)->nm_flag & NFSMNT_NFSV3) +#define NFS_ISV3(v) (VFSTONFS(vnode_mount(v))->nm_flag & NFSMNT_NFSV3) #define NFS_SRVMAXDATA(n) \ (((n)->nd_flag & ND_NFSV3) ? (((n)->nd_nam2) ? \ NFS_MAXDGRAMDATA : NFS_MAXDATA) : NFS_V2MAXDATA) @@ -153,7 +161,7 @@ /* * Arguments to mount NFS */ -#define NFS_ARGSVERSION 3 /* change when nfs_args changes */ +#define NFS_ARGSVERSION 4 /* change when nfs_args changes */ struct nfs_args { int version; /* args structure version number */ struct sockaddr *addr; /* file server address */ @@ -170,11 +178,99 @@ struct nfs_args { int retrans; /* times to retry send */ int maxgrouplist; /* Max. size of group list */ int readahead; /* # of blocks to readahead */ - int leaseterm; /* Term (sec) of lease */ - int deadthresh; /* Retrans threshold */ + int leaseterm; /* obsolete: Term (sec) of lease */ + int deadthresh; /* obsolete: Retrans threshold */ + char *hostname; /* server's name */ + /* NFS_ARGSVERSION 3 ends here */ + int acregmin; /* reg file min attr cache timeout */ + int acregmax; /* reg file max attr cache timeout */ + int acdirmin; /* dir min attr cache timeout */ + int acdirmax; /* dir max attr cache timeout */ +}; + +struct nfs_args3 { + int version; /* args structure version number */ + struct sockaddr *addr; /* file server address */ + int addrlen; /* length of address */ + int sotype; /* Socket type */ + int proto; /* and Protocol */ + u_char *fh; /* File handle to be mounted */ + int fhsize; /* Size, in bytes, of fh */ + int flags; /* flags */ + int wsize; /* write size in bytes */ + int rsize; /* read size in bytes */ + int readdirsize; /* readdir size in bytes */ + int timeo; /* initial timeout in .1 secs */ + int retrans; /* times to retry send */ + int maxgrouplist; /* Max. size of group list */ + int readahead; /* # of blocks to readahead */ + int leaseterm; /* obsolete: Term (sec) of lease */ + int deadthresh; /* obsolete: Retrans threshold */ char *hostname; /* server's name */ }; +// LP64todo - should this move? +#ifdef KERNEL +/* LP64 version of nfs_args. all pointers and longs + * grow when we're dealing with a 64-bit process. + * WARNING - keep in sync with nfs_args + */ +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_nfs_args { + int version; /* args structure version number */ + user_addr_t addr; /* file server address */ + int addrlen; /* length of address */ + int sotype; /* Socket type */ + int proto; /* and Protocol */ + user_addr_t fh; /* File handle to be mounted */ + int fhsize; /* Size, in bytes, of fh */ + int flags; /* flags */ + int wsize; /* write size in bytes */ + int rsize; /* read size in bytes */ + int readdirsize; /* readdir size in bytes */ + int timeo; /* initial timeout in .1 secs */ + int retrans; /* times to retry send */ + int maxgrouplist; /* Max. 
size of group list */ + int readahead; /* # of blocks to readahead */ + int leaseterm; /* obsolete: Term (sec) of lease */ + int deadthresh; /* obsolete: Retrans threshold */ + user_addr_t hostname; /* server's name */ + /* NFS_ARGSVERSION 3 ends here */ + int acregmin; /* reg file min attr cache timeout */ + int acregmax; /* reg file max attr cache timeout */ + int acdirmin; /* dir min attr cache timeout */ + int acdirmax; /* dir max attr cache timeout */ +}; +struct user_nfs_args3 { + int version; /* args structure version number */ + user_addr_t addr; /* file server address */ + int addrlen; /* length of address */ + int sotype; /* Socket type */ + int proto; /* and Protocol */ + user_addr_t fh; /* File handle to be mounted */ + int fhsize; /* Size, in bytes, of fh */ + int flags; /* flags */ + int wsize; /* write size in bytes */ + int rsize; /* read size in bytes */ + int readdirsize; /* readdir size in bytes */ + int timeo; /* initial timeout in .1 secs */ + int retrans; /* times to retry send */ + int maxgrouplist; /* Max. size of group list */ + int readahead; /* # of blocks to readahead */ + int leaseterm; /* obsolete: Term (sec) of lease */ + int deadthresh; /* obsolete: Retrans threshold */ + user_addr_t hostname; /* server's name */ +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +#endif // KERNEL + /* * NFS mount option flags */ @@ -186,29 +282,31 @@ struct nfs_args { #define NFSMNT_MAXGRPS 0x00000020 /* set maximum grouplist size */ #define NFSMNT_INT 0x00000040 /* allow interrupts on hard mount */ #define NFSMNT_NOCONN 0x00000080 /* Don't Connect the socket */ -#define NFSMNT_NQNFS 0x00000100 /* Use Nqnfs protocol */ #define NFSMNT_NFSV3 0x00000200 /* Use NFS Version 3 protocol */ #define NFSMNT_KERB 0x00000400 /* Use Kerberos authentication */ #define NFSMNT_DUMBTIMR 0x00000800 /* Don't estimate rtt dynamically */ -#define NFSMNT_LEASETERM 0x00001000 /* set lease term (nqnfs) */ #define NFSMNT_READAHEAD 0x00002000 /* set read ahead */ -#define NFSMNT_DEADTHRESH 0x00004000 /* set dead server retry thresh */ #define NFSMNT_RESVPORT 0x00008000 /* Allocate a reserved port */ #define NFSMNT_RDIRPLUS 0x00010000 /* Use Readdirplus for V3 */ #define NFSMNT_READDIRSIZE 0x00020000 /* Set readdir size */ #define NFSMNT_NOLOCKS 0x00040000 /* don't support file locking */ +#define NFSMNT_ACREGMIN 0x00100000 /* reg min attr cache timeout */ +#define NFSMNT_ACREGMAX 0x00200000 /* reg max attr cache timeout */ +#define NFSMNT_ACDIRMIN 0x00400000 /* dir min attr cache timeout */ +#define NFSMNT_ACDIRMAX 0x00800000 /* dir max attr cache timeout */ +/* + * NFS mount state flags (nm_state) + */ #define NFSSTA_LOCKTIMEO 0x00002000 /* experienced a lock req timeout */ #define NFSSTA_MOUNTED 0x00004000 /* completely mounted */ -#define NFSSTA_LOCKSWORK 0x00008000 /* lock ops have worked. */ +#define NFSSTA_LOCKSWORK 0x00008000 /* lock ops have worked. */ #define NFSSTA_TIMEO 0x00010000 /* experienced a timeout. */ #define NFSSTA_FORCE 0x00020000 /* doing a forced unmount. 
*/ #define NFSSTA_HASWRITEVERF 0x00040000 /* Has write verifier for V3 */ #define NFSSTA_GOTPATHCONF 0x00080000 /* Got the V3 pathconf info */ #define NFSSTA_GOTFSINFO 0x00100000 /* Got the V3 fsinfo */ #define NFSSTA_MNTD 0x00200000 /* Mnt server for mnt point */ -#define NFSSTA_DISMINPROG 0x00400000 /* Dismount in progress */ -#define NFSSTA_DISMNT 0x00800000 /* Dismounted */ #define NFSSTA_SNDLOCK 0x01000000 /* Send socket lock */ #define NFSSTA_WANTSND 0x02000000 /* Want above */ #define NFSSTA_RCVLOCK 0x04000000 /* Rcv socket lock */ @@ -218,6 +316,14 @@ struct nfs_args { #define NFSSTA_WANTAUTH 0x40000000 /* Wants an authenticator */ #define NFSSTA_AUTHERR 0x80000000 /* Authentication error */ +/* + * NFS mount pathconf info flags (nm_fsinfo.pcflags) + */ +#define NFSPCINFO_NOTRUNC 0x01 +#define NFSPCINFO_CHOWN_RESTRICTED 0x02 +#define NFSPCINFO_CASE_INSENSITIVE 0x04 +#define NFSPCINFO_CASE_PRESERVING 0x08 + /* * Structures for the nfssvc(2) syscall. Not that anyone but nfsd and mount_nfs * should ever try and use it. @@ -228,6 +334,28 @@ struct nfsd_args { int namelen; /* Length of name */ }; +// LP64todo - should this move? +#ifdef KERNEL +/* LP64 version of nfsd_args. all pointers and longs + * grow when we're dealing with a 64-bit process. + * WARNING - keep in sync with nfsd_args + */ +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_nfsd_args { + int sock; /* Socket to serve */ + user_addr_t name; /* Client addr for connection based sockets */ + int namelen; /* Length of name */ +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +#endif // KERNEL + struct nfsd_srvargs { struct nfsd *nsd_nfsd; /* Pointer to in kernel nfsd struct */ uid_t nsd_uid; /* Effective uid mapped to cred */ @@ -253,6 +381,140 @@ struct nfsd_cargs { NFSKERBKEY_T ncd_key; /* Session key */ }; +/* + * NFS Server File Handle structures + */ + +/* NFS export handle identifies which NFS export */ +#define NFS_FH_VERSION 0x4e580000 /* 'NX00' */ +struct nfs_exphandle { + uint32_t nxh_version; /* data structure version */ + uint32_t nxh_fsid; /* File System Export ID */ + uint32_t nxh_expid; /* Export ID */ + uint16_t nxh_flags; /* export handle flags */ + uint8_t nxh_reserved; /* future use */ + uint8_t nxh_fidlen; /* length of File ID */ +}; + +/* nxh_flags */ +#define NXHF_INVALIDFH 0x0001 /* file handle is invalid */ + +#define NFS_MAX_FID_SIZE (NFS_MAX_FH_SIZE - sizeof(struct nfs_exphandle)) +#define NFSV2_MAX_FID_SIZE (NFSV2_MAX_FH_SIZE - sizeof(struct nfs_exphandle)) + +/* NFS server internal view of fhandle_t */ +struct nfs_filehandle { + int nfh_len; /* total length of file handle */ + struct nfs_exphandle nfh_xh; /* export handle */ + unsigned char nfh_fid[NFS_MAX_FID_SIZE]; /* File ID */ +}; + +/* + * NFS export data structures + */ + +struct nfs_export_net_args { + uint32_t nxna_flags; /* export flags */ + struct xucred nxna_cred; /* mapped credential for root/all user */ + struct sockaddr_storage nxna_addr; /* net address to which exported */ + struct sockaddr_storage nxna_mask; /* mask for net address */ +}; + +struct nfs_export_args { + uint32_t nxa_fsid; /* export FS ID */ + uint32_t nxa_expid; /* export ID */ + char *nxa_fspath; /* export FS path */ + char *nxa_exppath; /* export sub-path */ + uint32_t nxa_flags; /* export arg flags */ + uint32_t nxa_netcount; /* #entries in ex_nets array */ + struct nfs_export_net_args *nxa_nets; /* array of net args */ +}; + +#ifdef KERNEL +/* LP64 version of export_args */ + +#if __DARWIN_ALIGN_NATURAL 
+#pragma options align=natural +#endif + +struct user_nfs_export_args { + uint32_t nxa_fsid; /* export FS ID */ + uint32_t nxa_expid; /* export ID */ + user_addr_t nxa_fspath; /* export FS path */ + user_addr_t nxa_exppath; /* export sub-path */ + uint32_t nxa_flags; /* export arg flags */ + uint32_t nxa_netcount; /* #entries in ex_nets array */ + user_addr_t nxa_nets; /* array of net args */ +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +#endif /* KERNEL */ + +/* nfs export arg flags */ +#define NXA_DELETE 0x0001 /* delete the specified export(s) */ +#define NXA_ADD 0x0002 /* add the specified export(s) */ +#define NXA_REPLACE 0x0003 /* delete and add the specified export(s) */ + +/* export option flags */ +#define NX_READONLY 0x0001 /* exported read-only */ +#define NX_DEFAULTEXPORT 0x0002 /* exported to the world */ +#define NX_MAPROOT 0x0004 /* map root access to anon credential */ +#define NX_MAPALL 0x0008 /* map all access to anon credential */ +#define NX_KERB 0x0010 /* exported with Kerberos uid mapping */ + +#ifdef KERNEL +struct nfs_exportfs; + +struct nfs_export_options { + uint32_t nxo_flags; /* export options */ + kauth_cred_t nxo_cred; /* mapped credential */ +}; + +/* Network address lookup element and individual export options */ +struct nfs_netopt { + struct radix_node no_rnodes[2]; /* radix tree glue */ + struct nfs_export_options no_opt; /* export options */ +}; + +/* Network export information */ +/* one of these for each exported directory */ +struct nfs_export { + LIST_ENTRY(nfs_export) nx_next; /* FS export list */ + LIST_ENTRY(nfs_export) nx_hash; /* export hash chain */ + struct nfs_export *nx_parent; /* parent export */ + uint32_t nx_id; /* export ID */ + uint32_t nx_flags; /* export flags */ + struct nfs_exportfs *nx_fs; /* exported file system */ + char *nx_path; /* exported file system sub-path */ + struct nfs_filehandle nx_fh; /* export root file handle */ + struct nfs_export_options nx_defopt; /* default options */ + uint32_t nx_expcnt; /* # exports in table */ + struct radix_node_head *nx_rtable[AF_MAX+1]; /* table of exports (netopts) */ +}; + +/* NFS exported file system info */ +/* one of these for each exported file system */ +struct nfs_exportfs { + LIST_ENTRY(nfs_exportfs) nxfs_next; /* exported file system list */ + uint32_t nxfs_id; /* exported file system ID */ + char *nxfs_path; /* exported file system path */ + LIST_HEAD(,nfs_export) nxfs_exports; /* list of exports for this file system */ +}; + +extern LIST_HEAD(nfsexpfslist, nfs_exportfs) nfs_exports; +extern lck_rw_t nfs_export_rwlock; // lock for export data structures +#define NFSEXPHASHVAL(FSID, EXPID) \ + (((FSID) >> 24) ^ ((FSID) >> 16) ^ ((FSID) >> 8) ^ (EXPID)) +#define NFSEXPHASH(FSID, EXPID) \ + (&nfsexphashtbl[NFSEXPHASHVAL((FSID),(EXPID)) & nfsexphash]) +extern LIST_HEAD(nfsexphashhead, nfs_export) *nfsexphashtbl; +extern u_long nfsexphash; + +#endif // KERNEL + /* * XXX to allow amd to include nfs.h without nfsproto.h */ @@ -290,9 +552,6 @@ struct nfsstats { int srvcache_idemdonehits; int srvcache_nonidemdonehits; int srvcache_misses; - int srvnqnfs_leases; - int srvnqnfs_maxleases; - int srvnqnfs_getleases; int srvvop_writes; int pageins; int pageouts; @@ -309,6 +568,7 @@ struct nfsstats { #define NFSSVC_GOTAUTH 0x040 #define NFSSVC_AUTHINFAIL 0x080 #define NFSSVC_MNTD 0x100 +#define NFSSVC_EXPORT 0x200 /* * Flags for nfsclnt() system call. @@ -345,6 +605,7 @@ struct nfsstats { * by them and break. 
*/ #ifdef KERNEL +#include <sys/kernel_types.h> #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_NFSREQ); @@ -352,36 +613,40 @@ MALLOC_DECLARE(M_NFSMNT); MALLOC_DECLARE(M_NFSDIROFF); MALLOC_DECLARE(M_NFSRVDESC); MALLOC_DECLARE(M_NFSUID); -MALLOC_DECLARE(M_NQLEASE); MALLOC_DECLARE(M_NFSD); MALLOC_DECLARE(M_NFSBIGFH); #endif -struct uio; struct vattr; struct nameidata; /* XXX */ +struct uio; struct vnode_attr; struct nameidata; /* XXX */ struct nfsbuf; +struct nfs_vattr; +struct nfsnode; #define NFSINT_SIGMASK (sigmask(SIGINT)|sigmask(SIGTERM)|sigmask(SIGKILL)| \ sigmask(SIGHUP)|sigmask(SIGQUIT)) +__private_extern__ int nfs_mbuf_mlen, nfs_mbuf_mhlen, + nfs_mbuf_minclsize, nfs_mbuf_mclbytes; + /* * Socket errors ignored for connectionless sockets?? * For now, ignore them all */ #define NFSIGNORE_SOERROR(s, e) \ ((e) != EINTR && (e) != ERESTART && (e) != EWOULDBLOCK && \ - (e) != EIO && ((s) & PR_CONNREQUIRED) == 0) + (e) != EIO && ((s)) != SOCK_STREAM) /* * Nfs outstanding request list element */ struct nfsreq { TAILQ_ENTRY(nfsreq) r_chain; - struct mbuf *r_mreq; - struct mbuf *r_mrep; - struct mbuf *r_md; + mbuf_t r_mreq; + mbuf_t r_mrep; + mbuf_t r_md; caddr_t r_dpos; struct nfsmount *r_nmp; - struct vnode *r_vp; + vnode_t r_vp; u_long r_xid; int r_flags; /* flags on request, see below */ int r_retry; /* max retransmission count */ @@ -389,7 +654,7 @@ struct nfsreq { int r_timer; /* tick counter on reply */ u_int32_t r_procnum; /* NFS procedure number */ int r_rtt; /* RTT for rpc */ - struct proc *r_procp; /* Proc that did I/O system call */ + proc_t r_procp; /* Proc that did I/O system call */ long r_lastmsg; /* time of last tprintf */ }; @@ -406,7 +671,6 @@ extern TAILQ_HEAD(nfs_reqq, nfsreq) nfs_reqq; #define R_SOCKERR 0x0010 /* Fatal error on socket */ #define R_TPRINTFMSG 0x0020 /* Did a tprintf msg. */ #define R_MUSTRESEND 0x0040 /* Must resend request */ -#define R_GETONEREP 0x0080 /* Probe for one reply only */ #define R_BUSY 0x0100 /* Locked. */ #define R_WAITING 0x0200 /* Someone waiting for lock. */ #define R_RESENDERR 0x0400 /* resend failed. */ @@ -418,7 +682,7 @@ extern TAILQ_HEAD(nfs_reqq, nfsreq) nfs_reqq; * and uid hash lists. */ #ifndef NFS_UIDHASHSIZ -#define NFS_UIDHASHSIZ 29 /* Tune the size of nfssvc_sock with this */ +#define NFS_UIDHASHSIZ 13 /* Tune the size of nfssvc_sock with this */ #endif #define NUIDHASH(sock, uid) \ (&(sock)->ns_uidhashtbl[(uid) % NFS_UIDHASHSIZ]) @@ -434,7 +698,7 @@ extern TAILQ_HEAD(nfs_reqq, nfsreq) nfs_reqq; */ union nethostaddr { u_long had_inetaddr; - struct mbuf *had_nam; + mbuf_t had_nam; }; struct nfsuid { @@ -442,7 +706,7 @@ struct nfsuid { LIST_ENTRY(nfsuid) nu_hash; /* Hash list */ int nu_flag; /* Flags */ union nethostaddr nu_haddr; /* Host addr. for dgram sockets */ - struct ucred nu_cr; /* Cred uid mapped to */ + kauth_cred_t nu_cr; /* Cred uid mapped to */ int nu_expire; /* Expiry time (sec) */ struct timeval nu_timestamp; /* Kerb. 
timestamp */ u_long nu_nickname; /* Nickname on server */ @@ -461,46 +725,50 @@ struct nfsuid { struct nfsrv_rec { STAILQ_ENTRY(nfsrv_rec) nr_link; struct sockaddr *nr_address; - struct mbuf *nr_packet; + mbuf_t nr_packet; }; #endif struct nfssvc_sock { TAILQ_ENTRY(nfssvc_sock) ns_chain; /* List of all nfssvc_sock's */ - TAILQ_HEAD(, nfsuid) ns_uidlruhead; - struct file *ns_fp; - struct socket *ns_so; - struct mbuf *ns_nam; - struct mbuf *ns_raw; - struct mbuf *ns_rawend; - struct mbuf *ns_rec; - struct mbuf *ns_recend; - struct mbuf *ns_frag; - short ns_flag; /* modified under kernel funnel */ - short ns_nflag; /* modified under network funnel */ - int ns_solock; + lck_rw_t ns_rwlock; /* lock for most fields */ + socket_t ns_so; + mbuf_t ns_nam; + mbuf_t ns_raw; + mbuf_t ns_rawend; + mbuf_t ns_rec; + mbuf_t ns_recend; + mbuf_t ns_frag; + int ns_flag; + int ns_sotype; int ns_cc; int ns_reclen; int ns_numuids; u_long ns_sref; + lck_mtx_t ns_wgmutex; /* mutex for write gather fields */ + u_quad_t ns_wgtime; /* next Write deadline (usec) */ LIST_HEAD(, nfsrv_descript) ns_tq; /* Write gather lists */ - LIST_HEAD(, nfsuid) ns_uidhashtbl[NFS_UIDHASHSIZ]; LIST_HEAD(nfsrvw_delayhash, nfsrv_descript) ns_wdelayhashtbl[NFS_WDELAYHASHSIZ]; + TAILQ_HEAD(, nfsuid) ns_uidlruhead; + LIST_HEAD(, nfsuid) ns_uidhashtbl[NFS_UIDHASHSIZ]; }; -/* Bits for "ns_*flag" */ -#define SLP_VALID 0x01 /* ns_flag */ -#define SLP_DOREC 0x02 /* ns_flag */ -#define SLPN_NEEDQ 0x04 /* ns_nflag */ -#define SLPN_DISCONN 0x08 /* ns_nflag */ -#define SLPN_GETSTREAM 0x10 /* ns_nflag */ -#define SLPN_LASTFRAG 0x20 /* ns_nflag */ -#define SLP_ALLFLAGS 0xff /* ns_flag && ns_nflag */ +/* Bits for "ns_flag" */ +#define SLP_VALID 0x01 /* nfs sock valid */ +#define SLP_DOREC 0x02 /* nfs sock has received data to process */ +#define SLP_NEEDQ 0x04 /* network socket has data to receive */ +#define SLP_DISCONN 0x08 /* socket needs to be zapped */ +#define SLP_GETSTREAM 0x10 /* currently in nfsrv_getstream() */ +#define SLP_LASTFRAG 0x20 /* on last fragment of RPC record */ +#define SLP_ALLFLAGS 0xff extern TAILQ_HEAD(nfssvc_sockhead, nfssvc_sock) nfssvc_sockhead; -extern int nfssvc_sockhead_flag; -#define SLP_INIT 0x01 -#define SLP_WANTINIT 0x02 + +/* locks for nfssvc_sock's */ +extern lck_grp_attr_t *nfs_slp_group_attr; +extern lck_attr_t *nfs_slp_lock_attr; +extern lck_grp_t *nfs_slp_rwlock_group; +extern lck_grp_t *nfs_slp_mutex_group; /* * One of these structures is allocated for each nfsd. 
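[Editor's note] The reworked nfssvc_sock above drops the kernel/network funnel split (the old ns_flag/ns_nflag pair) in favor of a per-socket read/write lock (ns_rwlock) covering most fields plus a dedicated write-gather mutex (ns_wgmutex), with the lock group and attribute globals declared at the end of the hunk. A hedged sketch of how such a socket lock would be initialized and held, assuming the standard xnu lck_rw_* API; the function itself is hypothetical and not part of the patch:

#include <kern/locks.h>

/* [sketch] one-time init plus an exclusive hold while mutating state */
static void
slp_lock_example(struct nfssvc_sock *slp)
{
	/* set up ns_rwlock with the group/attr externs declared above */
	lck_rw_init(&slp->ns_rwlock, nfs_slp_rwlock_group, nfs_slp_lock_attr);

	/* take the lock exclusive while changing ns_flag bits */
	lck_rw_lock_exclusive(&slp->ns_rwlock);
	slp->ns_flag |= SLP_DOREC;	/* e.g. mark a complete record queued */
	lck_rw_done(&slp->ns_rwlock);
}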
@@ -513,7 +781,7 @@ struct nfsd { u_char nfsd_authstr[RPCAUTH_MAXSIZ]; /* Authenticator data */ int nfsd_verflen; /* and the Verifier */ u_char nfsd_verfstr[RPCVERF_MAXSIZ]; - struct proc *nfsd_procp; /* Proc ptr */ + proc_t nfsd_procp; /* Proc ptr */ struct nfsrv_descript *nfsd_nd; /* Associated nfsrv_descript */ }; @@ -534,11 +802,11 @@ struct nfsrv_descript { LIST_ENTRY(nfsrv_descript) nd_hash; /* Hash list */ LIST_ENTRY(nfsrv_descript) nd_tq; /* and timer list */ LIST_HEAD(,nfsrv_descript) nd_coalesce; /* coalesced writes */ - struct mbuf *nd_mrep; /* Request mbuf list */ - struct mbuf *nd_md; /* Current dissect mbuf */ - struct mbuf *nd_mreq; /* Reply mbuf list */ - struct mbuf *nd_nam; /* and socket addr */ - struct mbuf *nd_nam2; /* return socket addr */ + mbuf_t nd_mrep; /* Request mbuf list */ + mbuf_t nd_md; /* Current dissect mbuf */ + mbuf_t nd_mreq; /* Reply mbuf list */ + mbuf_t nd_nam; /* and socket addr */ + mbuf_t nd_nam2; /* return socket addr */ caddr_t nd_dpos; /* Current dissect pos */ u_int32_t nd_procnum; /* RPC # */ int nd_stable; /* storage type */ @@ -546,19 +814,13 @@ struct nfsrv_descript { int nd_len; /* Length of this write */ int nd_repstat; /* Reply status */ u_long nd_retxid; /* Reply xid */ - u_long nd_duration; /* Lease duration */ struct timeval nd_starttime; /* Time RPC initiated */ - fhandle_t nd_fh; /* File handle */ - struct ucred nd_cr; /* Credentials */ + struct nfs_filehandle nd_fh; /* File handle */ + kauth_cred_t nd_cr; /* Credentials */ }; /* Bits for "nd_flag" */ -#define ND_READ LEASE_READ -#define ND_WRITE LEASE_WRITE -#define ND_CHECK 0x04 -#define ND_LEASE (ND_READ | ND_WRITE | ND_CHECK) #define ND_NFSV3 0x08 -#define ND_NQNFS 0x10 #define ND_KERBNICK 0x20 #define ND_KERBFULL 0x40 #define ND_KERBAUTH (ND_KERBNICK | ND_KERBFULL) @@ -571,14 +833,23 @@ extern int nfsd_head_flag; * These macros compare nfsrv_descript structures. */ #define NFSW_CONTIG(o, n) \ - ((o)->nd_eoff >= (n)->nd_off && \ - !bcmp((caddr_t)&(o)->nd_fh, (caddr_t)&(n)->nd_fh, NFSX_V3FH)) + (((o)->nd_eoff >= (n)->nd_off) && \ + ((o)->nd_fh.nfh_len == (n)->nd_fh.nfh_len) && \ + !bcmp((caddr_t)&(o)->nd_fh, (caddr_t)&(n)->nd_fh, (o)->nd_fh.nfh_len)) #define NFSW_SAMECRED(o, n) \ (((o)->nd_flag & ND_KERBAUTH) == ((n)->nd_flag & ND_KERBAUTH) && \ - !bcmp((caddr_t)&(o)->nd_cr, (caddr_t)&(n)->nd_cr, \ + !bcmp((caddr_t)(o)->nd_cr, (caddr_t)(n)->nd_cr, \ sizeof (struct ucred))) +/* mutex for nfs server */ +extern lck_grp_t * nfsd_lck_grp; +extern lck_grp_attr_t * nfsd_lck_grp_attr; +extern lck_attr_t * nfsd_lck_attr; +extern lck_mtx_t *nfsd_mutex; + +extern int nfs_numnfsd, nfsd_waiting; + /* * Defines for WebNFS */ @@ -596,280 +867,177 @@ extern int nfsd_head_flag; * Macro for converting escape characters in WebNFS pathnames. * Should really be in libkern. */ - +#define ISHEX(c) \ + ((((c) >= 'a') && ((c) <= 'f')) || \ + (((c) >= 'A') && ((c) <= 'F')) || \ + (((c) >= '0') && ((c) <= '9'))) #define HEXTOC(c) \ ((c) >= 'a' ? ((c) - ('a' - 10)) : \ ((c) >= 'A' ? 
((c) - ('A' - 10)) : ((c) - '0'))) #define HEXSTRTOI(p) \ ((HEXTOC(p[0]) << 4) + HEXTOC(p[1])) -#define NFSDIAG 0 -#if NFSDIAG - -extern int nfs_debug; -#define NFS_DEBUG_ASYNCIO 1 /* asynchronous i/o */ -#define NFS_DEBUG_WG 2 /* server write gathering */ -#define NFS_DEBUG_RC 4 /* server request caching */ -#define NFS_DEBUG_SILLY 8 /* nfs_sillyrename (.nfsXXX aka turd files) */ -#define NFS_DEBUG_DUP 16 /* debug duplicate requests */ -#define NFS_DEBUG_ATTR 32 - -#define NFS_DPF(cat, args) \ - do { \ - if (nfs_debug & NFS_DEBUG_##cat) kprintf args; \ - } while (0) - -#else - -#define NFS_DPF(cat, args) - -#endif /* NFSDIAG */ - -int nfs_init __P((struct vfsconf *vfsp)); -int nfs_reply __P((struct nfsreq *)); -int nfs_getreq __P((struct nfsrv_descript *,struct nfsd *,int)); -int nfs_send __P((struct socket *, struct mbuf *, struct mbuf *, - struct nfsreq *)); -int nfs_rephead __P((int, struct nfsrv_descript *, struct nfssvc_sock *, - int, int, u_quad_t *, struct mbuf **, struct mbuf **, - caddr_t *)); -int nfs_sndlock __P((struct nfsreq *)); -void nfs_sndunlock __P((struct nfsreq *)); -int nfs_slplock __P((struct nfssvc_sock *, int)); -void nfs_slpunlock __P((struct nfssvc_sock *)); -int nfs_disct __P((struct mbuf **, caddr_t *, int, int, caddr_t *)); -int nfs_vinvalbuf __P((struct vnode *, int, struct ucred *, struct proc *, - int)); -int nfs_readrpc __P((struct vnode *, struct uio *, struct ucred *)); -int nfs_writerpc __P((struct vnode *, struct uio *, struct ucred *, int *, - int *)); -int nfs_readdirrpc __P((struct vnode *, struct uio *, struct ucred *)); -int nfs_asyncio __P((struct nfsbuf *, struct ucred *)); -int nfs_doio __P((struct nfsbuf *, struct ucred *, struct proc *)); -int nfs_readlinkrpc __P((struct vnode *, struct uio *, struct ucred *)); -int nfs_sigintr __P((struct nfsmount *, struct nfsreq *, struct proc *)); -int nfs_readdirplusrpc __P((struct vnode *, struct uio *, struct ucred *)); -int nfsm_disct __P((struct mbuf **, caddr_t *, int, int, caddr_t *)); -void nfsm_srvfattr __P((struct nfsrv_descript *, struct vattr *, - struct nfs_fattr *)); -void nfsm_srvwcc __P((struct nfsrv_descript *, int, struct vattr *, int, - struct vattr *, struct mbuf **, char **)); -void nfsm_srvpostopattr __P((struct nfsrv_descript *, int, struct vattr *, - struct mbuf **, char **)); -int netaddr_match __P((int, union nethostaddr *, struct mbuf *)); -int nfs_request __P((struct vnode *, struct mbuf *, int, struct proc *, - struct ucred *, struct mbuf **, struct mbuf **, - caddr_t *, u_int64_t *)); -int nfs_loadattrcache __P((struct vnode **, struct mbuf **, caddr_t *, - struct vattr *, int, u_int64_t *)); -int nfs_namei __P((struct nameidata *, fhandle_t *, int, - struct nfssvc_sock *, struct mbuf *, struct mbuf **, - caddr_t *, struct vnode **, struct proc *, int, int)); -void nfsm_adj __P((struct mbuf *, int, int)); -int nfsm_mbuftouio __P((struct mbuf **, struct uio *, int, caddr_t *)); -void nfsrv_initcache __P((void)); -int nfs_getauth __P((struct nfsmount *, struct nfsreq *, struct ucred *, - char **, int *, char *, int *, NFSKERBKEY_T)); -int nfs_getnickauth __P((struct nfsmount *, struct ucred *, char **, - int *, char *, int)); -int nfs_savenickauth __P((struct nfsmount *, struct ucred *, int, - NFSKERBKEY_T, struct mbuf **, char **, - struct mbuf *)); -int nfs_adv __P((struct mbuf **, caddr_t *, int, int)); -void nfs_nhinit __P((void)); -void nfs_timer __P((void*)); -u_long nfs_hash __P((nfsfh_t *, int)); -int nfsrv_dorec __P((struct nfssvc_sock *, struct nfsd *, - struct 
nfsrv_descript **)); -int nfsrv_getcache __P((struct nfsrv_descript *, struct nfssvc_sock *, - struct mbuf **)); -void nfsrv_updatecache __P((struct nfsrv_descript *, int, struct mbuf *)); -void nfsrv_cleancache __P((void)); -int nfs_bind_resv_thread_wake __P((void)); -int nfs_connect __P((struct nfsmount *, struct nfsreq *)); -void nfs_disconnect __P((struct nfsmount *)); -int nfs_getattrcache __P((struct vnode *, struct vattr *)); -int nfsm_strtmbuf __P((struct mbuf **, char **, char *, long)); -int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *, - int)); -int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *)); -void nfsrv_init __P((int)); -int nfs_commit __P((struct vnode *vp, u_quad_t offset, int cnt, - struct ucred *cred, struct proc *procp)); -int nfs_flushcommits(struct vnode *, struct proc *); -void nfs_clearcommit __P((struct mount *)); -int nfsrv_errmap __P((struct nfsrv_descript *, int)); -void nfsrvw_sort __P((gid_t *, int)); -void nfsrv_setcred __P((struct ucred *, struct ucred *)); -int nfs_buf_write __P((struct nfsbuf *)); -int nfsrv_object_create __P((struct vnode *)); -void nfsrv_wakenfsd __P((struct nfssvc_sock *slp)); -int nfsrv_writegather __P((struct nfsrv_descript **, struct nfssvc_sock *, - struct proc *, struct mbuf **)); -int nfs_fsinfo __P((struct nfsmount *, struct vnode *, struct ucred *, - struct proc *p)); - -int nfsrv3_access __P((struct nfsrv_descript *nfsd, +__BEGIN_DECLS + +int nfs_init(struct vfsconf *vfsp); +void nfs_mbuf_init(void); +int nfs_reply(struct nfsreq *); +int nfs_getreq(struct nfsrv_descript *,struct nfsd *,int); +int nfs_send(socket_t, mbuf_t, mbuf_t, struct nfsreq *); +int nfs_rephead(int, struct nfsrv_descript *, struct nfssvc_sock *, + int, mbuf_t *, mbuf_t *, caddr_t *); +int nfs_sndlock(struct nfsreq *); +void nfs_sndunlock(struct nfsreq *); +int nfs_vinvalbuf(vnode_t, int, struct ucred *, proc_t, int); +int nfs_buf_page_inval(vnode_t vp, off_t offset); +int nfs_readrpc(vnode_t, struct uio *, struct ucred *, proc_t); +int nfs_writerpc(vnode_t, struct uio *, struct ucred *, proc_t, int *, int *); +int nfs_readdirrpc(vnode_t, struct uio *, struct ucred *, proc_t); +int nfs_readdirplusrpc(vnode_t, struct uio *, struct ucred *, proc_t); +int nfs_asyncio(struct nfsbuf *, struct ucred *); +int nfs_doio(struct nfsbuf *, struct ucred *, proc_t); +int nfs_readlinkrpc(vnode_t, struct uio *, struct ucred *, proc_t); +int nfs_sigintr(struct nfsmount *, struct nfsreq *, proc_t); +int nfsm_disct(mbuf_t *, caddr_t *, int, int, caddr_t *); +void nfsm_srvfattr(struct nfsrv_descript *, struct vnode_attr *, + struct nfs_fattr *); +void nfsm_srvwcc(struct nfsrv_descript *, int, struct vnode_attr *, int, + struct vnode_attr *, mbuf_t *, char **); +void nfsm_srvpostopattr(struct nfsrv_descript *, int, struct vnode_attr *, + mbuf_t *, char **); +int netaddr_match(int, union nethostaddr *, mbuf_t); +int nfs_request(vnode_t, mount_t, mbuf_t, int, proc_t, + struct ucred *, mbuf_t *, mbuf_t *, + caddr_t *, u_int64_t *); +int nfs_parsefattr(mbuf_t *, caddr_t *, int, struct nfs_vattr *); +int nfs_loadattrcache(struct nfsnode *, struct nfs_vattr *, u_int64_t *, int); +int nfsm_path_mbuftond(mbuf_t *, caddr_t *, int, int, int *, struct nameidata *); +int nfs_namei(struct nfsrv_descript *, struct vfs_context *, struct nameidata *, + struct nfs_filehandle *, mbuf_t, int, vnode_t *, + struct nfs_export **, struct nfs_export_options **); +void nfsm_adj(mbuf_t, int, int); +int nfsm_mbuftouio(mbuf_t *, struct uio *, int, caddr_t *); 
+void nfsrv_initcache(void); +int nfs_getauth(struct nfsmount *, struct nfsreq *, struct ucred *, + char **, int *, char *, int *, NFSKERBKEY_T); +int nfs_getnickauth(struct nfsmount *, struct ucred *, char **, + int *, char *, int); +int nfs_savenickauth(struct nfsmount *, struct ucred *, int, + NFSKERBKEY_T, mbuf_t *, char **, + mbuf_t); +int nfs_adv(mbuf_t *, caddr_t *, int, int); +void nfs_nhinit(void); +void nfs_timer_funnel(void*); +void nfs_timer(void*); +u_long nfs_hash(u_char *, int); +int nfsrv_dorec(struct nfssvc_sock *, struct nfsd *, + struct nfsrv_descript **); +int nfsrv_getcache(struct nfsrv_descript *, struct nfssvc_sock *, + mbuf_t *); +void nfsrv_updatecache(struct nfsrv_descript *, int, mbuf_t); +void nfsrv_cleancache(void); +int nfs_bind_resv_thread_wake(void); +int nfs_connect(struct nfsmount *, struct nfsreq *); +void nfs_disconnect(struct nfsmount *); +int nfs_getattr_no_vnode(mount_t,u_char *,int,struct ucred *,proc_t,struct nfs_vattr *,u_int64_t *); +int nfs_getattr(vnode_t vp, struct nfs_vattr *nvap, struct ucred *cred, proc_t p); +int nfs_getattrcache(vnode_t, struct nfs_vattr *); +int nfs_attrcachetimeout(vnode_t); +int nfsm_strtmbuf(mbuf_t *, char **, char *, long); +int nfs_bioread(vnode_t, struct uio *, int, struct ucred *, proc_t); +int nfsm_uiotombuf(struct uio *, mbuf_t *, int, caddr_t *); +void nfsrv_init(int); +int nfs_commit(vnode_t vp, u_quad_t offset, u_int32_t count, + struct ucred *cred, proc_t procp); +int nfs_flushcommits(vnode_t, proc_t, int); +int nfs_flush(vnode_t,int,struct ucred *,proc_t,int); +void nfs_clearcommit(mount_t); +int nfsrv_errmap(struct nfsrv_descript *, int); +void nfsrvw_sort(gid_t *, int); +void nfsrv_setcred(struct ucred *, struct ucred *); +int nfs_buf_write(struct nfsbuf *); +void nfsrv_wakenfsd(struct nfssvc_sock *slp); +int nfsrv_writegather(struct nfsrv_descript **, struct nfssvc_sock *, + proc_t, mbuf_t *); +int nfs_fsinfo(struct nfsmount *, vnode_t, struct ucred *, proc_t p); +int nfs_pathconfrpc(vnode_t, struct nfsv3_pathconf *, kauth_cred_t, proc_t); +void nfs_pathconf_cache(struct nfsmount *, struct nfsv3_pathconf *); + +int nfsrv3_access(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -int nfsrv_commit __P((struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -int nfsrv_create __P((struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -int nfsrv_fhtovp __P((fhandle_t *, int, struct vnode **, struct ucred *, - struct nfssvc_sock *, struct mbuf *, int *, - int, int)); -int nfsrv_setpublicfs __P((struct mount *, struct netexport *, - struct export_args *)); -int nfs_ispublicfh __P((fhandle_t *)); -int nfsrv_fsinfo __P((struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -int nfsrv_getattr __P((struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -int nfsrv_link __P((struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -int nfsrv_lookup __P((struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -int nfsrv_mkdir __P((struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -int nfsrv_mknod __P((struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -int nfsrv_noop __P((struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct 
proc *procp, struct mbuf **mrq)); -int nfsrv_null __P((struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -int nfsrv_pathconf __P((struct nfsrv_descript *nfsd, - struct nfssvc_sock *slp, struct proc *procp, - struct mbuf **mrq)); -int nfsrv_read __P((struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -int nfsrv_readdir __P((struct nfsrv_descript *nfsd, + proc_t procp, mbuf_t *mrq); +int nfsrv_commit(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, + proc_t procp, mbuf_t *mrq); +int nfsrv_create(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, + proc_t procp, mbuf_t *mrq); +int nfsrv_credcheck(struct nfsrv_descript *, struct nfs_export *, + struct nfs_export_options *); +int nfsrv_export(struct user_nfs_export_args *, struct vfs_context *); +int nfsrv_fhmatch(struct nfs_filehandle *fh1, struct nfs_filehandle *fh2); +int nfsrv_fhtovp(struct nfs_filehandle *, mbuf_t, int, vnode_t *, + struct nfs_export **, struct nfs_export_options **); +int nfs_ispublicfh(struct nfs_filehandle *); +int nfsrv_fsinfo(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, + proc_t procp, mbuf_t *mrq); +int nfsrv_getattr(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, + proc_t procp, mbuf_t *mrq); +int nfsrv_link(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, + proc_t procp, mbuf_t *mrq); +int nfsrv_lookup(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, + proc_t procp, mbuf_t *mrq); +int nfsrv_mkdir(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, + proc_t procp, mbuf_t *mrq); +int nfsrv_mknod(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, + proc_t procp, mbuf_t *mrq); +int nfsrv_noop(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, + proc_t procp, mbuf_t *mrq); +int nfsrv_null(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, + proc_t procp, mbuf_t *mrq); +int nfsrv_pathconf(struct nfsrv_descript *nfsd, + struct nfssvc_sock *slp, proc_t procp, + mbuf_t *mrq); +void nfsrv_rcv(socket_t, caddr_t arg, int waitflag); +void nfsrv_rcv_locked(socket_t, struct nfssvc_sock *slp, int waitflag); +int nfsrv_read(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, + proc_t procp, mbuf_t *mrq); +int nfsrv_readdir(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -int nfsrv_readdirplus __P((struct nfsrv_descript *nfsd, - struct nfssvc_sock *slp, struct proc *procp, - struct mbuf **mrq)); -int nfsrv_readlink __P((struct nfsrv_descript *nfsd, - struct nfssvc_sock *slp, struct proc *procp, - struct mbuf **mrq)); -int nfsrv_remove __P((struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -int nfsrv_rename __P((struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -int nfsrv_rmdir __P((struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -int nfsrv_setattr __P((struct nfsrv_descript *nfsd, + proc_t procp, mbuf_t *mrq); +int nfsrv_readdirplus(struct nfsrv_descript *nfsd, + struct nfssvc_sock *slp, proc_t procp, + mbuf_t *mrq); +int nfsrv_readlink(struct nfsrv_descript *nfsd, + struct nfssvc_sock *slp, proc_t procp, + mbuf_t *mrq); +int nfsrv_remove(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, + proc_t procp, mbuf_t *mrq); +int nfsrv_rename(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, + proc_t procp, mbuf_t *mrq); +int nfsrv_rmdir(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, + proc_t procp, 
mbuf_t *mrq); +int nfsrv_setattr(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -int nfsrv_statfs __P((struct nfsrv_descript *nfsd, + proc_t procp, mbuf_t *mrq); +void nfsrv_slpderef(struct nfssvc_sock *slp); +int nfsrv_statfs(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -int nfsrv_symlink __P((struct nfsrv_descript *nfsd, + proc_t procp, mbuf_t *mrq); +int nfsrv_symlink(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -int nfsrv_write __P((struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, - struct proc *procp, struct mbuf **mrq)); -void nfsrv_rcv __P((struct socket *so, caddr_t arg, int waitflag)); -void nfsrv_slpderef __P((struct nfssvc_sock *slp)); + proc_t procp, mbuf_t *mrq); +int nfsrv_write(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, + proc_t procp, mbuf_t *mrq); +int nfsrv_vptofh( struct nfs_export *, int, struct nfs_filehandle *, + vnode_t, struct vfs_context *, struct nfs_filehandle *); -void nfs_up(struct nfsreq *, struct nfsmount *, struct proc *, - const char *, int); -void nfs_down(struct nfsreq *, struct nfsmount *, struct proc *, - const char *, int, int); +void nfs_up(struct nfsmount *, proc_t, int, const char *); +void nfs_down(struct nfsmount *, proc_t, int, int, const char *); -/* - * NFSTRACE points were changed to FSDBG (KERNEL_DEBUG) - * But some of this code may prove useful someday... - */ -#undef NFSDIAG -#if NFSDIAG - -extern int nfstraceindx; -#define NFSTBUFSIZ 8912 -struct nfstracerec { uint i1, i2, i3, i4; }; -extern struct nfstracerec nfstracebuf[NFSTBUFSIZ]; -extern uint nfstracemask; /* 32 bits - trace points over 31 are unconditional */ - -/* 0x0000000f nfs_getattrcache trace points */ -#define NFSTRC_GAC_MISS 0x00 /* 0x00000001 cache miss */ -#define NFSTRC_GAC_HIT 0x01 /* 0x00000002 cache hit */ -#define NFSTRC_GAC_NP 0x02 /* 0x00000004 np size mismatch - vp... */ -/* 0x00000038 nfs_loadattrcache trace points */ -#define NFSTRC_LAC 0x03 /* 0x00000008 function entry point - vp */ -#define NFSTRC_LAC_INIT 0x04 /* 0x00000010 new vp & init n_mtime - vp */ -#define NFSTRC_LAC_NP 0x05 /* 0x00000020 np size mismatch - vp... */ -/* 0x000000c0 nfs_getattr trace points */ -#define NFSTRC_GA_INV 0x06 /* 0x00000040 times mismatch - vp */ -#define NFSTRC_GA_INV1 0x07 /* 0x00000080 invalidate ok - vp */ -/* 0x00000100 vmp_invalidate trace points */ -#define NFSTRC_VMP_INV 0x08 /* 0x00000100 function entry point - vmp */ -/* 0x00000200 nfs_request trace points */ -#define NFSTRC_REQ 0x09 /* 0x00000200 - alternates vp and procnum */ -/* 0x00000c00 vmp_push_range trace points */ -#define NFSTRC_VPR 0xa /* 0x00000400 entry point - vp... 
*/ -#define NFSTRC_VPR_DONE 0xb /* 0x00000800 tail exit - error # */ -/* 0x00003000 nfs_doio trace points */ -#define NFSTRC_DIO 0xc /* 0x00001000 entry point - vp */ -#define NFSTRC_DIO_DONE 0xd /* 0x00002000 exit points - vp */ -/* 0x000fc000 congestion window trace points */ -#define NFSTRC_CWND_INIT 0xe -#define NFSTRC_CWND_REPLY 0xf -#define NFSTRC_CWND_TIMER 0x10 -#define NFSTRC_CWND_REQ1 0x11 -#define NFSTRC_CWND_REQ2 0x12 -#define NFSTRC_CWND_SOFT 0x13 -/* 0xfff00000 nfs_rcvlock & nfs_rcvunlock trace points */ -#define NFSTRC_ECONN 0x14 -#define NFSTRC_RCVERR 0x15 -#define NFSTRC_REQFREE 0x16 -#define NFSTRC_NOTMINE 0x17 -#define NFSTRC_6 0x18 -#define NFSTRC_7 0x19 -#define NFSTRC_RCVLCKINTR 0x1a -#define NFSTRC_RCVALREADY 0x1b -#define NFSTRC_RCVLCKW 0x1c /* 0x10000000 seeking recieve lock (waiting) */ -#define NFSTRC_RCVLCK 0x1d /* 0x20000000 getting recieve lock */ -#define NFSTRC_RCVUNLW 0x1e /* 0x40000000 releasing rcv lock w/ wakeup */ -#define NFSTRC_RCVUNL 0x1f /* 0x80000000 releasing rcv lock w/o wakeup */ -/* trace points beyond 31 are on if any of above points are on */ -#define NFSTRC_GA_INV2 0x20 /* nfs_getattr invalidate - error# */ -#define NFSTRC_VBAD 0x21 -#define NFSTRC_REQERR 0x22 -#define NFSTRC_RPCERR 0x23 -#define NFSTRC_DISSECTERR 0x24 -#define NFSTRC_CONTINUE 0xff /* continuation record for previous entry */ - -#define NFSTRACEX(a1, a2, a3, a4) \ -( \ - nfstracebuf[nfstraceindx].i1 = (uint)(a1), \ - nfstracebuf[nfstraceindx].i2 = (uint)(a2), \ - nfstracebuf[nfstraceindx].i3 = (uint)(a3), \ - nfstracebuf[nfstraceindx].i4 = (uint)(a4), \ - nfstraceindx = (nfstraceindx + 1) % NFSTBUFSIZ, \ - 1 \ -) - -#define NFSTRACE(cnst, fptr) \ -( \ - (nfstracemask && ((cnst) > 31 || nfstracemask & 1<<(cnst))) ? \ - NFSTRACEX((cnst), (fptr), current_thread(), \ - clock_get_system_value().tv_nsec) : \ - 0 \ -) - -#define NFSTRACE4(cnst, fptr, a2, a3, a4) \ -( \ - NFSTRACE(cnst,fptr) ? \ - NFSTRACEX(NFSTRC_CONTINUE, a2, a3, a4) : \ - 0 \ -) - -#else /* NFSDIAG */ - -# define NFSTRACE(cnst, fptr) -# define NFSTRACE4(cnst, fptr, a2, a3, a4) - -#endif /* NFSDIAG */ +struct nfs_diskless; +int nfs_boot_init(struct nfs_diskless *nd, proc_t procp); +int nfs_boot_getfh(struct nfs_diskless *nd, proc_t procp, int v3, int sotype); + +__END_DECLS #endif /* KERNEL */ #endif /* __APPLE_API_PRIVATE */ diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index c756be676..966cf72f5 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -62,27 +62,31 @@ #include <sys/systm.h> #include <sys/resourcevar.h> #include <sys/signalvar.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> #include <sys/malloc.h> #include <sys/vnode.h> #include <sys/dirent.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> #include <sys/kernel.h> #include <sys/sysctl.h> -#include <sys/ubc.h> +#include <sys/ubc_internal.h> +#include <sys/uio_internal.h> #include <sys/vm.h> #include <sys/vmparam.h> #include <sys/time.h> #include <kern/clock.h> +#include <libkern/OSAtomic.h> +#include <kern/kalloc.h> #include <nfs/rpcv2.h> #include <nfs/nfsproto.h> #include <nfs/nfs.h> #include <nfs/nfsmount.h> -#include <nfs/nqnfs.h> #include <nfs/nfsnode.h> +#include <sys/buf_internal.h> #include <sys/kdebug.h> @@ -100,16 +104,21 @@ extern int nfs_numasync; extern int nfs_ioddelwri; extern struct nfsstats nfsstats; -#define NFSBUFHASH(dvp, lbn) \ - (&nfsbufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & nfsbufhash]) +#define NFSBUFHASH(np, lbn) \ + (&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash]) LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl; struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri; u_long nfsbufhash; -int nfsbufhashlock, nfsbufcnt, nfsbufmin, nfsbufmax; +int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax; int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer; int nfs_nbdwrite; time_t nfsbuffreeuptimestamp; +lck_grp_t *nfs_buf_lck_grp; +lck_grp_attr_t *nfs_buf_lck_grp_attr; +lck_attr_t *nfs_buf_lck_attr; +lck_mtx_t *nfs_buf_mutex; + #define NFSBUFWRITE_THROTTLE 9 #define NFSBUF_LRU_STALE 120 #define NFSBUF_META_STALE 240 @@ -130,7 +139,7 @@ time_t nfsbuffreeuptimestamp; #define META_FREEUP_MIN_FRAC 2 #define NFS_BUF_FREEUP() \ - do { \ + do { \ /* only call nfs_buf_freeup() if it has work to do: */ \ if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \ (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \ @@ -144,17 +153,28 @@ time_t nfsbuffreeuptimestamp; void nfs_nbinit(void) { - nfsbufhashlock = 0; - nfsbufhashtbl = hashinit(nbuf, M_TEMP, &nfsbufhash); - TAILQ_INIT(&nfsbuffree); - TAILQ_INIT(&nfsbuffreemeta); - TAILQ_INIT(&nfsbufdelwri); - nfsbufcnt = nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0; - nfsbufmin = 128; // XXX tune me! - nfsbufmax = 8192; // XXX tune me! + nfs_buf_lck_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(nfs_buf_lck_grp_attr); + nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", nfs_buf_lck_grp_attr); + + nfs_buf_lck_attr = lck_attr_alloc_init(); + + nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, nfs_buf_lck_attr); + + nfsbufcnt = nfsbufmetacnt = + nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0; + nfsbufmin = 128; + nfsbufmax = (sane_size >> PAGE_SHIFT) / 4; + nfsbufmetamax = (sane_size >> PAGE_SHIFT) / 16; nfsneedbuffer = 0; nfs_nbdwrite = 0; nfsbuffreeuptimestamp = 0; + + nfsbufhashtbl = hashinit(nfsbufmax/4, M_TEMP, &nfsbufhash); + TAILQ_INIT(&nfsbuffree); + TAILQ_INIT(&nfsbuffreemeta); + TAILQ_INIT(&nfsbufdelwri); + } /* @@ -166,41 +186,38 @@ nfs_buf_freeup(int timer) struct nfsbuf *fbp; struct timeval now; int count; + struct nfsbuffreehead nfsbuffreeup; + + TAILQ_INIT(&nfsbuffreeup); + + lck_mtx_lock(nfs_buf_mutex); microuptime(&now); nfsbuffreeuptimestamp = now.tv_sec; - FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, count); + FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0); + count = timer ? 
nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP; while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) { fbp = TAILQ_FIRST(&nfsbuffree); if (!fbp) break; - if ((fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec) + if (fbp->nb_refs) + break; + if (NBUFSTAMPVALID(fbp) && + (fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec) break; nfs_buf_remfree(fbp); /* disassociate buffer from any vnode */ if (fbp->nb_vp) { - struct vnode *oldvp; if (fbp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(fbp, nb_vnbufs); fbp->nb_vnbufs.le_next = NFSNOLIST; } - oldvp = fbp->nb_vp; fbp->nb_vp = NULL; - HOLDRELE(oldvp); } LIST_REMOVE(fbp, nb_hash); - /* nuke any creds */ - if (fbp->nb_rcred != NOCRED) - crfree(fbp->nb_rcred); - if (fbp->nb_wcred != NOCRED) - crfree(fbp->nb_wcred); - /* if buf was NB_META, dump buffer */ - if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) { - FREE(fbp->nb_data, M_TEMP); - } - FREE(fbp, M_TEMP); + TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free); nfsbufcnt--; } @@ -209,36 +226,54 @@ nfs_buf_freeup(int timer) fbp = TAILQ_FIRST(&nfsbuffreemeta); if (!fbp) break; - if ((fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec) + if (fbp->nb_refs) + break; + if (NBUFSTAMPVALID(fbp) && + (fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec) break; nfs_buf_remfree(fbp); /* disassociate buffer from any vnode */ if (fbp->nb_vp) { - struct vnode *oldvp; if (fbp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(fbp, nb_vnbufs); fbp->nb_vnbufs.le_next = NFSNOLIST; } - oldvp = fbp->nb_vp; fbp->nb_vp = NULL; - HOLDRELE(oldvp); } LIST_REMOVE(fbp, nb_hash); + TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free); + nfsbufcnt--; + nfsbufmetacnt--; + } + + FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0); + NFSBUFCNTCHK(1); + + lck_mtx_unlock(nfs_buf_mutex); + + while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) { + TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free); /* nuke any creds */ - if (fbp->nb_rcred != NOCRED) - crfree(fbp->nb_rcred); - if (fbp->nb_wcred != NOCRED) - crfree(fbp->nb_wcred); - /* if buf was NB_META, dump buffer */ - if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) { - FREE(fbp->nb_data, M_TEMP); + if (fbp->nb_rcred != NOCRED) { + kauth_cred_rele(fbp->nb_rcred); + fbp->nb_rcred = NOCRED; } + if (fbp->nb_wcred != NOCRED) { + kauth_cred_rele(fbp->nb_wcred); + fbp->nb_wcred = NOCRED; + } + /* if buf was NB_META, dump buffer */ + if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) + kfree(fbp->nb_data, fbp->nb_bufsize); FREE(fbp, M_TEMP); - nfsbufcnt--; } - FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, count); + } +/* + * remove a buffer from the freelist + * (must be called with nfs_buf_mutex held) + */ void nfs_buf_remfree(struct nfsbuf *bp) { @@ -247,7 +282,7 @@ nfs_buf_remfree(struct nfsbuf *bp) if (ISSET(bp->nb_flags, NB_DELWRI)) { nfsbufdelwricnt--; TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free); - } else if (ISSET(bp->nb_flags, NB_META) && !ISSET(bp->nb_flags, NB_INVAL)) { + } else if (ISSET(bp->nb_flags, NB_META)) { nfsbuffreemetacnt--; TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free); } else { @@ -255,17 +290,33 @@ nfs_buf_remfree(struct nfsbuf *bp) TAILQ_REMOVE(&nfsbuffree, bp, nb_free); } bp->nb_free.tqe_next = NFSNOLIST; - NFSBUFCNTCHK(); + NFSBUFCNTCHK(1); } /* * check for existence of nfsbuf in cache */ +boolean_t +nfs_buf_is_incore(vnode_t vp, daddr64_t blkno) +{ + boolean_t rv; + lck_mtx_lock(nfs_buf_mutex); + if (nfs_buf_incore(vp, blkno)) + rv = TRUE; + else + rv = FALSE; + lck_mtx_unlock(nfs_buf_mutex); + return (rv); +} + +/* + * return incore buffer (must be 
called with nfs_buf_mutex held) + */ struct nfsbuf * -nfs_buf_incore(struct vnode *vp, daddr_t blkno) +nfs_buf_incore(vnode_t vp, daddr64_t blkno) { /* Search hash chain */ - struct nfsbuf * bp = NFSBUFHASH(vp, blkno)->lh_first; + struct nfsbuf * bp = NFSBUFHASH(VTONFS(vp), blkno)->lh_first; for (; bp != NULL; bp = bp->nb_hash.le_next) if (bp->nb_lblkno == blkno && bp->nb_vp == vp) { if (!ISSET(bp->nb_flags, NB_INVAL)) { @@ -285,15 +336,20 @@ nfs_buf_incore(struct vnode *vp, daddr_t blkno) * later when/if we need to write the data (again). */ int -nfs_buf_page_inval(struct vnode *vp, off_t offset) +nfs_buf_page_inval(vnode_t vp, off_t offset) { struct nfsbuf *bp; + int error = 0; + + lck_mtx_lock(nfs_buf_mutex); bp = nfs_buf_incore(vp, ubc_offtoblk(vp, offset)); if (!bp) - return (0); + goto out; FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend); - if (ISSET(bp->nb_flags, NB_BUSY)) - return (EBUSY); + if (ISSET(bp->nb_lflags, NBL_BUSY)) { + error = EBUSY; + goto out; + } /* * If there's a dirty range in the buffer, check to * see if this page intersects with the dirty range. @@ -303,24 +359,39 @@ nfs_buf_page_inval(struct vnode *vp, off_t offset) int start = offset - NBOFF(bp); if (bp->nb_dirtyend <= start || bp->nb_dirtyoff >= (start + PAGE_SIZE)) - return (0); - return (EBUSY); + error = 0; + else + error = EBUSY; } - return (0); +out: + lck_mtx_unlock(nfs_buf_mutex); + return (error); } +/* + * set up the UPL for a buffer + * (must NOT be called with nfs_buf_mutex held) + */ int nfs_buf_upl_setup(struct nfsbuf *bp) { kern_return_t kret; upl_t upl; - int s; + int upl_flags; if (ISSET(bp->nb_flags, NB_PAGELIST)) return (0); + upl_flags = UPL_PRECIOUS; + if (! ISSET(bp->nb_flags, NB_READ)) { + /* + * We're doing a "write", so we intend to modify + * the pages we're gathering. 
+ */ + upl_flags |= UPL_WILL_MODIFY; + } kret = ubc_create_upl(bp->nb_vp, NBOFF(bp), bp->nb_bufsize, - &upl, NULL, UPL_PRECIOUS); + &upl, NULL, upl_flags); if (kret == KERN_INVALID_ARGUMENT) { /* vm object probably doesn't exist any more */ bp->nb_pagelist = NULL; @@ -334,13 +405,15 @@ nfs_buf_upl_setup(struct nfsbuf *bp) FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_vp); - s = splbio(); bp->nb_pagelist = upl; SET(bp->nb_flags, NB_PAGELIST); - splx(s); return (0); } +/* + * update buffer's valid/dirty info from UBC + * (must NOT be called with nfs_buf_mutex held) + */ void nfs_buf_upl_check(struct nfsbuf *bp) { @@ -390,6 +463,10 @@ nfs_buf_upl_check(struct nfsbuf *bp) FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend); } +/* + * make sure that a buffer is mapped + * (must NOT be called with nfs_buf_mutex held) + */ static int nfs_buf_map(struct nfsbuf *bp) { @@ -465,18 +542,19 @@ nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp) pg++; bp->nb_validend = pg * PAGE_SIZE; /* clip to EOF */ - if (NBOFF(bp) + bp->nb_validend > np->n_size) + if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size) bp->nb_validend = np->n_size % bp->nb_bufsize; } /* * try to push out some delayed/uncommitted writes + * ("locked" indicates whether nfs_buf_mutex is already held) */ static void -nfs_buf_delwri_push(void) +nfs_buf_delwri_push(int locked) { struct nfsbuf *bp; - int i; + int i, error; if (TAILQ_EMPTY(&nfsbufdelwri)) return; @@ -487,63 +565,99 @@ nfs_buf_delwri_push(void) /* otherwise, try to do some of the work ourselves */ i = 0; + if (!locked) + lck_mtx_lock(nfs_buf_mutex); while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) { struct nfsnode *np = VTONFS(bp->nb_vp); nfs_buf_remfree(bp); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN); + nfs_buf_refrele(bp); + if (error) + break; + if (!bp->nb_vp) { + /* buffer is no longer valid */ + nfs_buf_drop(bp); + continue; + } if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { /* put buffer at end of delwri list */ TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); nfsbufdelwricnt++; - nfs_flushcommits(np->n_vnode, (struct proc *)0); + nfs_buf_drop(bp); + lck_mtx_unlock(nfs_buf_mutex); + nfs_flushcommits(np->n_vnode, NULL, 1); } else { - SET(bp->nb_flags, (NB_BUSY | NB_ASYNC)); + SET(bp->nb_flags, NB_ASYNC); + lck_mtx_unlock(nfs_buf_mutex); nfs_buf_write(bp); } i++; + lck_mtx_lock(nfs_buf_mutex); } + if (!locked) + lck_mtx_unlock(nfs_buf_mutex); } /* - * Get an nfs cache block. - * Allocate a new one if the block isn't currently in the cache - * and return the block marked busy. If the calling process is - * interrupted by a signal for an interruptible mount point, return - * NULL. + * Get an nfs buffer. + * + * Returns errno on error, 0 otherwise. + * Any buffer is returned in *bpp. + * + * If NBLK_ONLYVALID is set, only return buffer if found in cache. + * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY. + * + * Check for existence of buffer in cache. + * Or attempt to reuse a buffer from one of the free lists. + * Or allocate a new buffer if we haven't already hit max allocation. + * Or wait for a free buffer. + * + * If available buffer found, prepare it, and return it. + * + * If the calling process is interrupted by a signal for + * an interruptible mount point, return EINTR. 
*/ -struct nfsbuf * +int nfs_buf_get( - struct vnode *vp, - daddr_t blkno, + vnode_t vp, + daddr64_t blkno, int size, - struct proc *p, - int operation) + proc_t p, + int flags, + struct nfsbuf **bpp) { struct nfsnode *np = VTONFS(vp); struct nfsbuf *bp; - int i, biosize, bufsize, rv; - struct ucred *cred; + int biosize, bufsize; + kauth_cred_t cred; int slpflag = PCATCH; + int operation = (flags & NBLK_OPMASK); + int error = 0; + struct timespec ts; - FSDBG_TOP(541, vp, blkno, size, operation); + FSDBG_TOP(541, vp, blkno, size, flags); + *bpp = NULL; bufsize = size; if (bufsize > MAXBSIZE) panic("nfs_buf_get: buffer larger than MAXBSIZE requested"); - biosize = vp->v_mount->mnt_stat.f_iosize; + biosize = vfs_statfs(vnode_mount(vp))->f_iosize; - if (UBCINVALID(vp) || !UBCINFOEXISTS(vp)) - operation = BLK_META; - else if (bufsize < biosize) + if (UBCINVALID(vp) || !UBCINFOEXISTS(vp)) { + operation = NBLK_META; + } else if (bufsize < biosize) { /* reg files should always have biosize blocks */ bufsize = biosize; + } - /* if BLK_WRITE, check for too many delayed/uncommitted writes */ - if ((operation == BLK_WRITE) && (nfs_nbdwrite > ((nfsbufcnt*3)/4))) { + /* if NBLK_WRITE, check for too many delayed/uncommitted writes */ + if ((operation == NBLK_WRITE) && (nfs_nbdwrite > ((nfsbufcnt*3)/4))) { FSDBG_TOP(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4)); /* poke the delwri list */ - nfs_buf_delwri_push(); + nfs_buf_delwri_push(0); /* sleep to let other threads run... */ tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1); @@ -551,45 +665,36 @@ nfs_buf_get( } loop: - /* - * Obtain a lock to prevent a race condition if the - * MALLOC() below happens to block. - */ - if (nfsbufhashlock) { - while (nfsbufhashlock) { - nfsbufhashlock = -1; - tsleep(&nfsbufhashlock, PCATCH, "nfsbufget", 0); - if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) - return (NULL); - } - goto loop; - } - nfsbufhashlock = 1; + lck_mtx_lock(nfs_buf_mutex); /* check for existence of nfsbuf in cache */ - if (bp = nfs_buf_incore(vp, blkno)) { + if ((bp = nfs_buf_incore(vp, blkno))) { /* if busy, set wanted and wait */ - if (ISSET(bp->nb_flags, NB_BUSY)) { + if (ISSET(bp->nb_lflags, NBL_BUSY)) { + if (flags & NBLK_NOWAIT) { + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, vp, blkno, bp, 0xbcbcbcbc); + return (0); + } FSDBG_TOP(543, vp, blkno, bp, bp->nb_flags); - SET(bp->nb_flags, NB_WANTED); - /* unlock hash */ - if (nfsbufhashlock < 0) { - nfsbufhashlock = 0; - wakeup(&nfsbufhashlock); - } else - nfsbufhashlock = 0; - tsleep(bp, slpflag|(PRIBIO+1), "nfsbufget", (slpflag == PCATCH) ? 0 : 2*hz); + SET(bp->nb_lflags, NBL_WANTED); + + ts.tv_sec = 2; + ts.tv_nsec = 0; + msleep(bp, nfs_buf_mutex, slpflag|(PRIBIO+1)|PDROP, + "nfsbufget", (slpflag == PCATCH) ? 
0 : &ts); slpflag = 0; FSDBG_BOT(543, vp, blkno, bp, bp->nb_flags); - if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) { - FSDBG_BOT(541, vp, blkno, 0, EINTR); - return (NULL); + if ((error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) { + FSDBG_BOT(541, vp, blkno, 0, error); + return (error); } goto loop; } if (bp->nb_bufsize != bufsize) panic("nfsbuf size mismatch"); - SET(bp->nb_flags, (NB_BUSY | NB_CACHE)); + SET(bp->nb_lflags, NBL_BUSY); + SET(bp->nb_flags, NB_CACHE); nfs_buf_remfree(bp); /* additional paranoia: */ if (ISSET(bp->nb_flags, NB_PAGELIST)) @@ -597,8 +702,15 @@ loop: goto buffer_setup; } + if (flags & NBLK_ONLYVALID) { + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, vp, blkno, 0, 0x0000cace); + return (0); + } + /* * where to get a free buffer: + * - if meta and maxmeta reached, must reuse meta * - alloc new if we haven't reached min bufs * - if free lists are NOT empty * - if free list is stale, use it @@ -608,19 +720,24 @@ loop: * - start clearing out delwri list and try again */ - if ((nfsbufcnt > nfsbufmin) && + if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) { + /* if we've hit max meta buffers, must reuse a meta buffer */ + bp = TAILQ_FIRST(&nfsbuffreemeta); + } else if ((nfsbufcnt > nfsbufmin) && (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) { /* try to pull an nfsbuf off a free list */ struct nfsbuf *lrubp, *metabp; struct timeval now; microuptime(&now); - /* if the next LRU or META buffer is stale, use it */ + /* if the next LRU or META buffer is invalid or stale, use it */ lrubp = TAILQ_FIRST(&nfsbuffree); - if (lrubp && ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec)) + if (lrubp && (!NBUFSTAMPVALID(lrubp) || + ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec))) bp = lrubp; metabp = TAILQ_FIRST(&nfsbuffreemeta); - if (!bp && metabp && ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec)) + if (!bp && metabp && (!NBUFSTAMPVALID(metabp) || + ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec))) bp = metabp; if (!bp && (nfsbufcnt >= nfsbufmax)) { @@ -640,58 +757,67 @@ loop: bp = metabp; } } + } - if (bp) { - /* we have a buffer to reuse */ - FSDBG(544, vp, blkno, bp, bp->nb_flags); - nfs_buf_remfree(bp); - if (ISSET(bp->nb_flags, NB_DELWRI)) - panic("nfs_buf_get: delwri"); - SET(bp->nb_flags, NB_BUSY); - /* disassociate buffer from previous vnode */ - if (bp->nb_vp) { - struct vnode *oldvp; - if (bp->nb_vnbufs.le_next != NFSNOLIST) { - LIST_REMOVE(bp, nb_vnbufs); - bp->nb_vnbufs.le_next = NFSNOLIST; - } - oldvp = bp->nb_vp; - bp->nb_vp = NULL; - HOLDRELE(oldvp); - } - LIST_REMOVE(bp, nb_hash); - /* nuke any creds we're holding */ - cred = bp->nb_rcred; - if (cred != NOCRED) { - bp->nb_rcred = NOCRED; - crfree(cred); - } - cred = bp->nb_wcred; - if (cred != NOCRED) { - bp->nb_wcred = NOCRED; - crfree(cred); + if (bp) { + /* we have a buffer to reuse */ + FSDBG(544, vp, blkno, bp, bp->nb_flags); + nfs_buf_remfree(bp); + if (ISSET(bp->nb_flags, NB_DELWRI)) + panic("nfs_buf_get: delwri"); + SET(bp->nb_lflags, NBL_BUSY); + /* disassociate buffer from previous vnode */ + if (bp->nb_vp) { + if (bp->nb_vnbufs.le_next != NFSNOLIST) { + LIST_REMOVE(bp, nb_vnbufs); + bp->nb_vnbufs.le_next = NFSNOLIST; } - /* if buf will no longer be NB_META, dump old buffer */ - if ((operation != BLK_META) && - ISSET(bp->nb_flags, NB_META) && bp->nb_data) { - FREE(bp->nb_data, M_TEMP); + bp->nb_vp = NULL; + } + LIST_REMOVE(bp, nb_hash); + /* nuke any creds we're holding */ + cred = bp->nb_rcred; + if (cred != NOCRED) { + 
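/*
 * [Editor's aside] The reuse path here detaches a recycled nfsbuf from its
 * old identity -- vnode back-pointer, hash chain, and held credentials --
 * before rebinding it to a new block.  A minimal userland sketch of that
 * pattern follows; cache_entry and cred_rele are hypothetical stand-ins
 * for the kernel structures, not the kernel API.
 */
#include <stddef.h>
#include <sys/queue.h>

struct cred { int refcnt; };

static void cred_rele(struct cred *c)
{
    if (c && --c->refcnt == 0) {
        /* last reference dropped; a real implementation frees it here */
    }
}

struct cache_entry {
    LIST_ENTRY(cache_entry) hash_link;  /* hash chain linkage */
    void        *owner;                 /* previous owner, e.g. a vnode */
    struct cred *rcred, *wcred;         /* read/write credentials */
    int          error, valid, dirty;   /* per-use state */
};

/* Strip everything tying the entry to its previous user. */
static void cache_entry_recycle(struct cache_entry *ep)
{
    LIST_REMOVE(ep, hash_link);         /* leave the old hash chain */
    ep->owner = NULL;                   /* drop the old owner */
    cred_rele(ep->rcred); ep->rcred = NULL;
    cred_rele(ep->wcred); ep->wcred = NULL;
    ep->error = ep->valid = ep->dirty = 0; /* re-init per-use fields */
}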
bp->nb_rcred = NOCRED; + kauth_cred_rele(cred); + } + cred = bp->nb_wcred; + if (cred != NOCRED) { + bp->nb_wcred = NOCRED; + kauth_cred_rele(cred); + } + /* if buf will no longer be NB_META, dump old buffer */ + if (operation == NBLK_META) { + if (!ISSET(bp->nb_flags, NB_META)) + nfsbufmetacnt++; + } else if (ISSET(bp->nb_flags, NB_META)) { + if (bp->nb_data) { + kfree(bp->nb_data, bp->nb_bufsize); bp->nb_data = NULL; } - /* re-init buf fields */ - bp->nb_error = 0; - bp->nb_validoff = bp->nb_validend = -1; - bp->nb_dirtyoff = bp->nb_dirtyend = 0; - bp->nb_valid = 0; - bp->nb_dirty = 0; + nfsbufmetacnt--; } - } - - if (!bp) { - if (nfsbufcnt < nfsbufmax) { + /* re-init buf fields */ + bp->nb_error = 0; + bp->nb_validoff = bp->nb_validend = -1; + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + bp->nb_valid = 0; + bp->nb_dirty = 0; + } else { + /* no buffer to reuse */ + if ((nfsbufcnt < nfsbufmax) && + ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) { /* just alloc a new one */ MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK); + if (!bp) { + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, vp, blkno, 0, error); + return (ENOMEM); + } nfsbufcnt++; - NFSBUFCNTCHK(); + if (operation == NBLK_META) + nfsbufmetacnt++; + NFSBUFCNTCHK(1); /* init nfsbuf */ bzero(bp, sizeof(*bp)); bp->nb_free.tqe_next = NFSNOLIST; @@ -700,46 +826,42 @@ loop: } else { /* too many bufs... wait for buffers to free up */ FSDBG_TOP(546, vp, blkno, nfsbufcnt, nfsbufmax); - /* unlock hash */ - if (nfsbufhashlock < 0) { - nfsbufhashlock = 0; - wakeup(&nfsbufhashlock); - } else - nfsbufhashlock = 0; /* poke the delwri list */ - nfs_buf_delwri_push(); + nfs_buf_delwri_push(1); nfsneedbuffer = 1; - tsleep(&nfsneedbuffer, PCATCH, "nfsbufget", 0); + msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH|PDROP, + "nfsbufget", 0); FSDBG_BOT(546, vp, blkno, nfsbufcnt, nfsbufmax); - if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) { - FSDBG_BOT(541, vp, blkno, 0, EINTR); - return (NULL); + if ((error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) { + FSDBG_BOT(541, vp, blkno, 0, error); + return (error); } goto loop; } } -setup_nfsbuf: - /* setup nfsbuf */ - bp->nb_flags = NB_BUSY; + bp->nb_lflags = NBL_BUSY; + bp->nb_flags = 0; bp->nb_lblkno = blkno; /* insert buf in hash */ - LIST_INSERT_HEAD(NFSBUFHASH(vp, blkno), bp, nb_hash); + LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash); /* associate buffer with new vnode */ - VHOLD(vp); bp->nb_vp = vp; LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); buffer_setup: + /* unlock hash */ + lck_mtx_unlock(nfs_buf_mutex); + switch (operation) { - case BLK_META: + case NBLK_META: SET(bp->nb_flags, NB_META); if ((bp->nb_bufsize != bufsize) && bp->nb_data) { - FREE(bp->nb_data, M_TEMP); + kfree(bp->nb_data, bp->nb_bufsize); bp->nb_data = NULL; bp->nb_validoff = bp->nb_validend = -1; bp->nb_dirtyoff = bp->nb_dirtyend = 0; @@ -748,37 +870,62 @@ buffer_setup: CLR(bp->nb_flags, NB_CACHE); } if (!bp->nb_data) - MALLOC(bp->nb_data, caddr_t, bufsize, M_TEMP, M_WAITOK); - if (!bp->nb_data) - panic("nfs_buf_get: null nb_data"); + bp->nb_data = kalloc(bufsize); + if (!bp->nb_data) { + /* Ack! couldn't allocate the data buffer! 
*/ + /* cleanup buffer and return error */ + lck_mtx_lock(nfs_buf_mutex); + LIST_REMOVE(bp, nb_vnbufs); + bp->nb_vnbufs.le_next = NFSNOLIST; + bp->nb_vp = NULL; + /* invalidate usage timestamp to allow immediate freeing */ + NBUFSTAMPINVALIDATE(bp); + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); + nfsbuffreecnt++; + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, vp, blkno, 0xb00, ENOMEM); + return (ENOMEM); + } bp->nb_bufsize = bufsize; break; - case BLK_READ: - case BLK_WRITE: + case NBLK_READ: + case NBLK_WRITE: + /* + * Set or clear NB_READ now to let the UPL subsystem know + * if we intend to modify the pages or not. + */ + if (operation == NBLK_READ) { + SET(bp->nb_flags, NB_READ); + } else { + CLR(bp->nb_flags, NB_READ); + } if (bufsize < PAGE_SIZE) bufsize = PAGE_SIZE; bp->nb_bufsize = bufsize; bp->nb_validoff = bp->nb_validend = -1; - if (UBCISVALID(vp)) { + if (UBCINFOEXISTS(vp)) { /* setup upl */ if (nfs_buf_upl_setup(bp)) { /* unable to create upl */ /* vm object must no longer exist */ - /* cleanup buffer and return NULL */ + /* cleanup buffer and return error */ + lck_mtx_lock(nfs_buf_mutex); LIST_REMOVE(bp, nb_vnbufs); bp->nb_vnbufs.le_next = NFSNOLIST; bp->nb_vp = NULL; - /* clear usage timestamp to allow immediate freeing */ - bp->nb_timestamp = 0; - HOLDRELE(vp); + /* invalidate usage timestamp to allow immediate freeing */ + NBUFSTAMPINVALIDATE(bp); if (bp->nb_free.tqe_next != NFSNOLIST) panic("nfsbuf on freelist"); TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); nfsbuffreecnt++; + lck_mtx_unlock(nfs_buf_mutex); FSDBG_BOT(541, vp, blkno, 0x2bc, EIO); - return (NULL); + return (EIO); } nfs_buf_upl_check(bp); } @@ -788,23 +935,19 @@ buffer_setup: panic("nfs_buf_get: %d unknown operation", operation); } - /* unlock hash */ - if (nfsbufhashlock < 0) { - nfsbufhashlock = 0; - wakeup(&nfsbufhashlock); - } else - nfsbufhashlock = 0; + *bpp = bp; FSDBG_BOT(541, vp, blkno, bp, bp->nb_flags); - return (bp); + return (0); } void nfs_buf_release(struct nfsbuf *bp, int freeup) { - struct vnode *vp = bp->nb_vp; + vnode_t vp = bp->nb_vp; struct timeval now; + int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite; FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data); FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend); @@ -830,7 +973,7 @@ nfs_buf_release(struct nfsbuf *bp, int freeup) panic("ubc_upl_unmap failed"); bp->nb_data = NULL; } - if (bp->nb_flags & (NB_ERROR | NB_INVAL)) { + if (bp->nb_flags & (NB_ERROR | NB_INVAL | NB_NOCACHE)) { if (bp->nb_flags & (NB_READ | NB_INVAL)) upl_flags = UPL_ABORT_DUMP_PAGES; else @@ -858,28 +1001,34 @@ nfs_buf_release(struct nfsbuf *bp, int freeup) } pagelist_cleanup_done: /* was this the last buffer in the file? 
*/ - if (NBOFF(bp) + bp->nb_bufsize > VTONFS(vp)->n_size) { + if (NBOFF(bp) + bp->nb_bufsize > (off_t)(VTONFS(vp)->n_size)) { /* if so, invalidate all pages of last buffer past EOF */ - int biosize = vp->v_mount->mnt_stat.f_iosize; - off_t off, size; - off = trunc_page_64(VTONFS(vp)->n_size) + PAGE_SIZE_64; - size = trunc_page_64(NBOFF(bp) + biosize) - off; - if (size) - ubc_invalidate(vp, off, size); + int biosize = vfs_statfs(vnode_mount(vp))->f_iosize; + off_t start, end; + start = trunc_page_64(VTONFS(vp)->n_size) + PAGE_SIZE_64; + end = trunc_page_64(NBOFF(bp) + biosize); + if (end > start) { + if (!(rv = ubc_sync_range(vp, start, end, UBC_INVALIDATE))) + printf("nfs_buf_release(): ubc_sync_range failed!\n"); + } } CLR(bp->nb_flags, NB_PAGELIST); bp->nb_pagelist = NULL; } + lck_mtx_lock(nfs_buf_mutex); + + wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0; + /* Wake up any processes waiting for any buffer to become free. */ if (nfsneedbuffer) { nfsneedbuffer = 0; - wakeup(&nfsneedbuffer); + wakeup_needbuffer = 1; } /* Wake up any processes waiting for _this_ buffer to become free. */ - if (ISSET(bp->nb_flags, NB_WANTED)) { - CLR(bp->nb_flags, NB_WANTED); - wakeup(bp); + if (ISSET(bp->nb_lflags, NBL_WANTED)) { + CLR(bp->nb_lflags, NBL_WANTED); + wakeup_buffer = 1; } /* If it's not cacheable, or an error, mark it invalid. */ @@ -893,31 +1042,34 @@ pagelist_cleanup_done: bp->nb_vnbufs.le_next = NFSNOLIST; } bp->nb_vp = NULL; - HOLDRELE(vp); /* if this was a delayed write, wakeup anyone */ /* waiting for delayed writes to complete */ if (ISSET(bp->nb_flags, NB_DELWRI)) { CLR(bp->nb_flags, NB_DELWRI); - nfs_nbdwrite--; - NFSBUFCNTCHK(); - wakeup((caddr_t)&nfs_nbdwrite); + OSAddAtomic(-1, (SInt32*)&nfs_nbdwrite); + NFSBUFCNTCHK(1); + wakeup_nbdwrite = 1; } - /* clear usage timestamp to allow immediate freeing */ - bp->nb_timestamp = 0; + /* invalidate usage timestamp to allow immediate freeing */ + NBUFSTAMPINVALIDATE(bp); /* put buffer at head of free list */ if (bp->nb_free.tqe_next != NFSNOLIST) panic("nfsbuf on freelist"); SET(bp->nb_flags, NB_INVAL); - TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); - nfsbuffreecnt++; - if (freeup) - NFS_BUF_FREEUP(); + if (ISSET(bp->nb_flags, NB_META)) { + TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free); + nfsbuffreemetacnt++; + } else { + TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); + nfsbuffreecnt++; + } } else if (ISSET(bp->nb_flags, NB_DELWRI)) { /* put buffer at end of delwri list */ if (bp->nb_free.tqe_next != NFSNOLIST) panic("nfsbuf on freelist"); TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); nfsbufdelwricnt++; + freeup = 0; } else { /* update usage timestamp */ microuptime(&now); @@ -932,16 +1084,26 @@ pagelist_cleanup_done: TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free); nfsbuffreecnt++; } - if (freeup) - NFS_BUF_FREEUP(); } - NFSBUFCNTCHK(); + NFSBUFCNTCHK(1); /* Unlock the buffer. 
*/ - CLR(bp->nb_flags, (NB_ASYNC | NB_BUSY | NB_NOCACHE | NB_STABLE | NB_IOD)); + CLR(bp->nb_flags, (NB_ASYNC | NB_NOCACHE | NB_STABLE | NB_IOD)); + CLR(bp->nb_lflags, NBL_BUSY); FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data); + + lck_mtx_unlock(nfs_buf_mutex); + + if (wakeup_needbuffer) + wakeup(&nfsneedbuffer); + if (wakeup_buffer) + wakeup(bp); + if (wakeup_nbdwrite) + wakeup(&nfs_nbdwrite); + if (freeup) + NFS_BUF_FREEUP(); } /* @@ -953,8 +1115,12 @@ nfs_buf_iowait(struct nfsbuf *bp) { FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); + lck_mtx_lock(nfs_buf_mutex); + while (!ISSET(bp->nb_flags, NB_DONE)) - tsleep(bp, PRIBIO + 1, "nfs_buf_iowait", 0); + msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", 0); + + lck_mtx_unlock(nfs_buf_mutex); FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); @@ -973,13 +1139,11 @@ nfs_buf_iowait(struct nfsbuf *bp) void nfs_buf_iodone(struct nfsbuf *bp) { - struct vnode *vp; FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); if (ISSET(bp->nb_flags, NB_DONE)) panic("nfs_buf_iodone already"); - SET(bp->nb_flags, NB_DONE); /* note that it's done */ /* * I/O was done, so don't believe * the DIRTY state from VM anymore @@ -988,21 +1152,20 @@ nfs_buf_iodone(struct nfsbuf *bp) if (!ISSET(bp->nb_flags, NB_READ)) { CLR(bp->nb_flags, NB_WRITEINPROG); - vpwakeup(bp->nb_vp); - } - - /* Wakeup the throttled write operations as needed */ - vp = bp->nb_vp; - if (vp && (vp->v_flag & VTHROTTLED) - && (vp->v_numoutput <= (NFSBUFWRITE_THROTTLE / 3))) { - vp->v_flag &= ~VTHROTTLED; - wakeup((caddr_t)&vp->v_numoutput); + /* + * vnode_writedone() takes care of waking up + * any throttled write operations + */ + vnode_writedone(bp->nb_vp); } - - if (ISSET(bp->nb_flags, NB_ASYNC)) /* if async, release it */ + if (ISSET(bp->nb_flags, NB_ASYNC)) { /* if async, release it */ + SET(bp->nb_flags, NB_DONE); /* note that it's done */ nfs_buf_release(bp, 1); - else { /* or just wakeup the buffer */ - CLR(bp->nb_flags, NB_WANTED); + } else { /* or just wakeup the buffer */ + lck_mtx_lock(nfs_buf_mutex); + SET(bp->nb_flags, NB_DONE); /* note that it's done */ + CLR(bp->nb_lflags, NBL_WANTED); + lck_mtx_unlock(nfs_buf_mutex); wakeup(bp); } @@ -1010,10 +1173,9 @@ nfs_buf_iodone(struct nfsbuf *bp) } void -nfs_buf_write_delayed(struct nfsbuf *bp) +nfs_buf_write_delayed(struct nfsbuf *bp, proc_t p) { - struct proc *p = current_proc(); - struct vnode *vp = bp->nb_vp; + vnode_t vp = bp->nb_vp; FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0); FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty); @@ -1028,22 +1190,21 @@ nfs_buf_write_delayed(struct nfsbuf *bp) SET(bp->nb_flags, NB_DELWRI); if (p && p->p_stats) p->p_stats->p_ru.ru_oublock++; /* XXX */ - nfs_nbdwrite++; - NFSBUFCNTCHK(); + OSAddAtomic(1, (SInt32*)&nfs_nbdwrite); + NFSBUFCNTCHK(0); /* move to dirty list */ + lck_mtx_lock(nfs_buf_mutex); if (bp->nb_vnbufs.le_next != NFSNOLIST) LIST_REMOVE(bp, nb_vnbufs); LIST_INSERT_HEAD(&VTONFS(vp)->n_dirtyblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); } /* * If the vnode has "too many" write operations in progress * wait for them to finish the IO */ - while (vp->v_numoutput >= NFSBUFWRITE_THROTTLE) { - vp->v_flag |= VTHROTTLED; - tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "nfs_buf_write_delayed", 0); - } + (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed"); /* * If we have too many delayed write buffers, @@ -1068,40 +1229,187 @@ nfs_buf_write_delayed(struct nfsbuf *bp) return; } +/* + * add a reference to a 
buffer so it doesn't disappear while being used + * (must be called with nfs_buf_mutex held) + */ +void +nfs_buf_refget(struct nfsbuf *bp) +{ + bp->nb_refs++; +} +/* + * release a reference on a buffer + * (must be called with nfs_buf_mutex held) + */ +void +nfs_buf_refrele(struct nfsbuf *bp) +{ + bp->nb_refs--; +} + +/* + * mark a particular buffer as BUSY + * (must be called with nfs_buf_mutex held) + */ +errno_t +nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo) +{ + errno_t error; + struct timespec ts; + + if (ISSET(bp->nb_lflags, NBL_BUSY)) { + /* + * since the mutex_lock may block, the buffer + * may become BUSY, so we need to recheck for + * a NOWAIT request + */ + if (flags & NBAC_NOWAIT) + return (EBUSY); + SET(bp->nb_lflags, NBL_WANTED); + + ts.tv_sec = (slptimeo/100); + /* the hz value is 100; which leads to 10ms */ + ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000; + + error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1), + "nfs_buf_acquire", &ts); + if (error) + return (error); + return (EAGAIN); + } + if (flags & NBAC_REMOVE) + nfs_buf_remfree(bp); + SET(bp->nb_lflags, NBL_BUSY); + + return (0); +} + +/* + * simply drop the BUSY status of a buffer + * (must be called with nfs_buf_mutex held) + */ +void +nfs_buf_drop(struct nfsbuf *bp) +{ + int need_wakeup = 0; + + if (!ISSET(bp->nb_lflags, NBL_BUSY)) + panic("nfs_buf_drop: buffer not busy!"); + if (ISSET(bp->nb_lflags, NBL_WANTED)) { + /* + * delay the actual wakeup until after we + * clear NBL_BUSY and we've dropped nfs_buf_mutex + */ + need_wakeup = 1; + } + /* Unlock the buffer. */ + CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED)); + + if (need_wakeup) + wakeup(bp); +} + +/* + * prepare for iterating over an nfsnode's buffer list + * this lock protects the queue manipulation + * (must be called with nfs_buf_mutex held) + */ +int +nfs_buf_iterprepare(struct nfsnode *np, struct nfsbuflists *iterheadp, int flags) +{ + struct nfsbuflists *listheadp; + + if (flags & NBI_DIRTY) + listheadp = &np->n_dirtyblkhd; + else + listheadp = &np->n_cleanblkhd; + + if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) { + LIST_INIT(iterheadp); + return(EWOULDBLOCK); + } + + while (np->n_bufiterflags & NBI_ITER) { + np->n_bufiterflags |= NBI_ITERWANT; + msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", 0); + } + if (LIST_EMPTY(listheadp)) { + LIST_INIT(iterheadp); + return(EINVAL); + } + np->n_bufiterflags |= NBI_ITER; + + iterheadp->lh_first = listheadp->lh_first; + listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first; + LIST_INIT(listheadp); + + return(0); +} + +/* + * cleanup after iterating over an nfsnode's buffer list + * this lock protects the queue manipulation + * (must be called with nfs_buf_mutex held) + */ +void +nfs_buf_itercomplete(struct nfsnode *np, struct nfsbuflists *iterheadp, int flags) +{ + struct nfsbuflists * listheadp; + struct nfsbuf *bp; + + if (flags & NBI_DIRTY) + listheadp = &np->n_dirtyblkhd; + else + listheadp = &np->n_cleanblkhd; + + while (!LIST_EMPTY(iterheadp)) { + bp = LIST_FIRST(iterheadp); + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs); + } + + np->n_bufiterflags &= ~NBI_ITER; + if (np->n_bufiterflags & NBI_ITERWANT) { + np->n_bufiterflags &= ~NBI_ITERWANT; + wakeup(&np->n_bufiterflags); + } +} + /* * Vnode op for read using bio * Any similarity to readip() is purely coincidental */ int -nfs_bioread(vp, uio, ioflag, cred, getpages) - register struct vnode *vp; - register struct uio *uio; - int ioflag; - struct 
ucred *cred; - int getpages; // XXX unused! +nfs_bioread( + vnode_t vp, + struct uio *uio, + __unused int ioflag, + kauth_cred_t cred, + proc_t p) { struct nfsnode *np = VTONFS(vp); - int biosize, i; + int biosize; off_t diff; - struct nfsbuf *bp = 0, *rabp; - struct vattr vattr; - struct proc *p; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - daddr_t lbn, rabn, lastrabn = -1; + struct nfsbuf *bp = NULL, *rabp; + struct nfs_vattr nvattr; + struct nfsmount *nmp = VFSTONFS(vnode_mount(vp)); + daddr64_t lbn, rabn, lastrabn = -1, tlbn; int bufsize; int nra, error = 0, n = 0, on = 0; - int operation = (getpages? BLK_PAGEIN : BLK_READ); caddr_t dp; - struct dirent *direntp; + struct dirent *direntp = NULL; + enum vtype vtype; + int nocachereadahead = 0; - FSDBG_TOP(514, vp, uio->uio_offset, uio->uio_resid, ioflag); + FSDBG_TOP(514, vp, uio->uio_offset, uio_uio_resid(uio), ioflag); #if DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("nfs_read mode"); #endif - if (uio->uio_resid == 0) { + if (uio_uio_resid(uio) == 0) { FSDBG_BOT(514, vp, 0xd1e0001, 0, 0); return (0); } @@ -1109,16 +1417,15 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) FSDBG_BOT(514, vp, 0xd1e0002, 0, EINVAL); return (EINVAL); } - p = uio->uio_procp; - if ((nmp->nm_flag & NFSMNT_NFSV3) && - !(nmp->nm_state & NFSSTA_GOTFSINFO)) - (void)nfs_fsinfo(nmp, vp, cred, p); - biosize = vp->v_mount->mnt_stat.f_iosize; + + if ((nmp->nm_flag & NFSMNT_NFSV3) && !(nmp->nm_state & NFSSTA_GOTFSINFO)) + nfs_fsinfo(nmp, vp, cred, p); + biosize = vfs_statfs(vnode_mount(vp))->f_iosize; + vtype = vnode_vtype(vp); /* * For nfs, cache consistency can only be maintained approximately. * Although RFC1094 does not specify the criteria, the following is * believed to be compatible with the reference port. - * For nqnfs, full cache consistency is maintained within the loop. * For nfs: * If the file's modify time on the server has changed since the * last read rpc or you have written to the file, @@ -1128,126 +1435,121 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) * attributes. * NB: This implies that cache data can be read when up to * NFS_MAXATTRTIMEO seconds out of date. If you find that you need - * current attributes this could be forced by setting n_xid to 0 - * before the VOP_GETATTR() call. + * current attributes this could be forced by setting calling + * NATTRINVALIDATE() before the nfs_getattr() call. 
*/ - if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) { - if (np->n_flag & NMODIFIED) { - if (vp->v_type != VREG) { - if (vp->v_type != VDIR) - panic("nfs: bioread, not dir"); - nfs_invaldir(vp); - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0003, 0, error); - return (error); - } - } - np->n_xid = 0; - error = VOP_GETATTR(vp, &vattr, cred, p); + if (np->n_flag & NNEEDINVALIDATE) { + np->n_flag &= ~NNEEDINVALIDATE; + nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cred, p, 1); + } + if (np->n_flag & NMODIFIED) { + if (vtype != VREG) { + if (vtype != VDIR) + panic("nfs: bioread, not dir"); + nfs_invaldir(vp); + error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) { - FSDBG_BOT(514, vp, 0xd1e0004, 0, error); + FSDBG_BOT(514, vp, 0xd1e0003, 0, error); return (error); } - if (vp->v_type == VDIR) { - /* if directory changed, purge any name cache entries */ - if (np->n_ncmtime != vattr.va_mtime.tv_sec) + } + NATTRINVALIDATE(np); + error = nfs_getattr(vp, &nvattr, cred, p); + if (error) { + FSDBG_BOT(514, vp, 0xd1e0004, 0, error); + return (error); + } + if (vtype == VDIR) { + /* if directory changed, purge any name cache entries */ + if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=)) + cache_purge(vp); + np->n_ncmtime = nvattr.nva_mtime; + } + np->n_mtime = nvattr.nva_mtime; + } else { + error = nfs_getattr(vp, &nvattr, cred, p); + if (error) { + FSDBG_BOT(514, vp, 0xd1e0005, 0, error); + return (error); + } + if (nfstimespeccmp(&np->n_mtime, &nvattr.nva_mtime, !=)) { + if (vtype == VDIR) { + nfs_invaldir(vp); + /* purge name cache entries */ + if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=)) cache_purge(vp); - np->n_ncmtime = vattr.va_mtime.tv_sec; } - np->n_mtime = vattr.va_mtime.tv_sec; - } else { - error = VOP_GETATTR(vp, &vattr, cred, p); + error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) { - FSDBG_BOT(514, vp, 0xd1e0005, 0, error); + FSDBG_BOT(514, vp, 0xd1e0006, 0, error); return (error); } - if (np->n_mtime != vattr.va_mtime.tv_sec) { - if (vp->v_type == VDIR) { - nfs_invaldir(vp); - /* purge name cache entries */ - if (np->n_ncmtime != vattr.va_mtime.tv_sec) - cache_purge(vp); - } + if (vtype == VDIR) + np->n_ncmtime = nvattr.nva_mtime; + np->n_mtime = nvattr.nva_mtime; + } + } + + if (vnode_isnocache(vp)) { + if (!(np->n_flag & NNOCACHE)) { + if (NVALIDBUFS(np)) { error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) { - FSDBG_BOT(514, vp, 0xd1e0006, 0, error); + FSDBG_BOT(514, vp, 0xd1e000a, 0, error); return (error); } - if (vp->v_type == VDIR) - np->n_ncmtime = vattr.va_mtime.tv_sec; - np->n_mtime = vattr.va_mtime.tv_sec; } + np->n_flag |= NNOCACHE; } + } else if (np->n_flag & NNOCACHE) { + np->n_flag &= ~NNOCACHE; } - do { - /* - * Get a valid lease. If cached data is stale, flush it. 
- */ - if (nmp->nm_flag & NFSMNT_NQNFS) { - if (NQNFS_CKINVALID(vp, np, ND_READ)) { - do { - error = nqnfs_getlease(vp, ND_READ, cred, p); - } while (error == NQNFS_EXPIRED); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0007, 0, error); - return (error); - } - if (np->n_lrev != np->n_brev || - (np->n_flag & NQNFSNONCACHE) || - ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) { - if (vp->v_type == VDIR) - nfs_invaldir(vp); - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0008, 0, error); - return (error); - } - np->n_brev = np->n_lrev; - } - } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) { - nfs_invaldir(vp); - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0009, 0, error); - return (error); - } - } - } - if ((np->n_flag & NQNFSNONCACHE) || (vp->v_flag & VNOCACHE_DATA)) { - if ((vp->v_flag & VNOCACHE_DATA) && - (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) { - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(514, vp, 0xd1e000a, 0, error); + do { + if (np->n_flag & NNOCACHE) { + switch (vtype) { + case VREG: + /* + * If we have only a block or so to read, + * just do the rpc directly. + * If we have a couple blocks or more to read, + * then we'll take advantage of readahead within + * this loop to try to fetch all the data in parallel + */ + if (!nocachereadahead && (uio_uio_resid(uio) < 2*biosize)) { + error = nfs_readrpc(vp, uio, cred, p); + FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error); return (error); } - } - switch (vp->v_type) { - case VREG: - error = nfs_readrpc(vp, uio, cred); - FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error); - return (error); + nocachereadahead = 1; + break; case VLNK: - error = nfs_readlinkrpc(vp, uio, cred); - FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error); + error = nfs_readlinkrpc(vp, uio, cred, p); + FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error); return (error); case VDIR: break; default: - printf(" NQNFSNONCACHE: type %x unexpected\n", vp->v_type); + printf(" NFSNOCACHE: type %x unexpected\n", vtype); }; } - switch (vp->v_type) { + switch (vtype) { case VREG: lbn = uio->uio_offset / biosize; /* * Copy directly from any cached pages without grabbing the bufs. + * + * Note: for "nocache" reads, we don't copy directly from UBC + * because any cached pages will be for readahead buffers that + * need to be invalidated anyway before we finish this request. */ - if (uio->uio_segflg == UIO_USERSPACE) { - int io_resid = uio->uio_resid; + if (!(np->n_flag & NNOCACHE) && + (uio->uio_segflg == UIO_USERSPACE32 || + uio->uio_segflg == UIO_USERSPACE64 || + uio->uio_segflg == UIO_USERSPACE)) { + // LP64todo - fix this! 
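/*
 * [Editor's aside] A sketch of the "nocache" read policy above: a request
 * smaller than about two blocks goes straight to the server with one RPC,
 * while anything larger falls through to the buffered path so the
 * readahead loop can fetch blocks in parallel.  do_read_rpc() and
 * do_buffered_read() are hypothetical stand-ins for nfs_readrpc() and
 * the cache path.
 */
#include <sys/types.h>

extern int do_read_rpc(off_t off, size_t len);      /* one synchronous RPC */
extern int do_buffered_read(off_t off, size_t len); /* cache + readahead */

static int nocache_read(off_t off, size_t resid, size_t biosize)
{
    if (resid < 2 * biosize)             /* a block or so: skip the cache */
        return do_read_rpc(off, resid);
    return do_buffered_read(off, resid); /* larger: let readahead help */
}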
+ int io_resid = uio_uio_resid(uio); diff = np->n_size - uio->uio_offset; if (diff < io_resid) io_resid = diff; @@ -1260,7 +1562,7 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) } /* count any biocache reads that we just copied directly */ if (lbn != uio->uio_offset / biosize) { - nfsstats.biocache_reads += (uio->uio_offset / biosize) - lbn; + OSAddAtomic((uio->uio_offset / biosize) - lbn, (SInt32*)&nfsstats.biocache_reads); FSDBG(514, vp, 0xcacefeed, uio->uio_offset, error); } } @@ -1280,16 +1582,23 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) continue; } lastrabn = rabn; - if ((off_t)rabn * biosize >= np->n_size) + if ((off_t)rabn * biosize >= (off_t)np->n_size) + break; + if ((np->n_flag & NNOCACHE) && + (((off_t)rabn * biosize) >= (uio->uio_offset + uio_uio_resid(uio)))) + /* for uncached readahead, don't go beyond end of request */ break; /* check if block exists and is valid. */ - rabp = nfs_buf_incore(vp, rabn); - if (rabp && nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize)) + error = nfs_buf_get(vp, rabn, biosize, p, NBLK_READ|NBLK_NOWAIT, &rabp); + if (error) { + FSDBG_BOT(514, vp, 0xd1e000b, 1, error); + return (error); + } + if (!rabp) + continue; + if (nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize)) { + nfs_buf_release(rabp, 1); continue; - rabp = nfs_buf_get(vp, rabn, biosize, p, operation); - if (!rabp) { - FSDBG_BOT(514, vp, 0xd1e000b, 0, EINTR); - return (EINTR); } if (!ISSET(rabp->nb_flags, (NB_CACHE|NB_DELWRI))) { SET(rabp->nb_flags, (NB_READ|NB_ASYNC)); @@ -1303,12 +1612,12 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) } } - if ((uio->uio_resid <= 0) || (uio->uio_offset >= np->n_size)) { - FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, 0xaaaaaaaa); + if ((uio_uio_resid(uio) <= 0) || (uio->uio_offset >= (off_t)np->n_size)) { + FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), 0xaaaaaaaa); return (0); } - nfsstats.biocache_reads++; + OSAddAtomic(1, (SInt32*)&nfsstats.biocache_reads); /* * If the block is in the cache and has the required data @@ -1318,13 +1627,14 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) */ again: bufsize = biosize; - n = min((unsigned)(bufsize - on), uio->uio_resid); + // LP64todo - fix this! 
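/*
 * [Editor's aside] The readahead loop above, reduced to its core: starting
 * at the block after the one being read, issue asynchronous reads for the
 * next few blocks, skipping blocks already cached and never reading past
 * EOF.  block_is_cached() and start_async_read() are hypothetical helpers
 * standing in for the nfs_buf_get(NBLK_READ|NBLK_NOWAIT)/nfs_asyncio pair.
 */
#include <stdint.h>
#include <sys/types.h>

extern int  block_is_cached(int64_t blk);
extern void start_async_read(int64_t blk);

static void readahead(int64_t lbn, int nra_max, off_t fsize, size_t biosize)
{
    for (int nra = 0; nra < nra_max; nra++) {
        int64_t rabn = lbn + 1 + nra;
        if ((off_t)rabn * (off_t)biosize >= fsize)
            break;                       /* don't read past EOF */
        if (block_is_cached(rabn))
            continue;                    /* already have it */
        start_async_read(rabn);          /* fire and forget */
    }
}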
+ n = min((unsigned)(bufsize - on), uio_uio_resid(uio)); diff = np->n_size - uio->uio_offset; if (diff < n) n = diff; - bp = nfs_buf_get(vp, lbn, bufsize, p, operation); - if (!bp) { + error = nfs_buf_get(vp, lbn, bufsize, p, NBLK_READ, &bp); + if (error) { FSDBG_BOT(514, vp, 0xd1e000c, 0, EINTR); return (EINTR); } @@ -1350,8 +1660,8 @@ again: if (bp->nb_validoff < 0) { /* valid range isn't set up, so */ /* set it to what we know is valid */ - bp->nb_validoff = trunc_page_32(on); - bp->nb_validend = round_page_32(on+n); + bp->nb_validoff = trunc_page(on); + bp->nb_validend = round_page(on+n); nfs_buf_normalize_valid_range(np, bp); } goto buffer_ready; @@ -1363,11 +1673,10 @@ again: /* so write the buffer out and try again */ CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); SET(bp->nb_flags, NB_ASYNC); - /* - * NFS has embedded ucred so crhold() risks zone corruption - */ - if (bp->nb_wcred == NOCRED) - bp->nb_wcred = crdup(cred); + if (bp->nb_wcred == NOCRED) { + kauth_cred_ref(cred); + bp->nb_wcred = cred; + } error = nfs_buf_write(bp); if (error) { FSDBG_BOT(514, vp, 0xd1e000d, 0, error); @@ -1382,20 +1691,22 @@ again: bp->nb_valid = 0; } else { /* read the page range in */ - struct iovec iov; - struct uio auio; - auio.uio_iov = &iov; - auio.uio_iovcnt = 1; - auio.uio_offset = NBOFF(bp) + firstpg * PAGE_SIZE_64; - auio.uio_resid = (lastpg - firstpg + 1) * PAGE_SIZE; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_READ; - auio.uio_procp = p; + uio_t auio; + char uio_buf[ UIO_SIZEOF(1) ]; + NFS_BUF_MAP(bp); - iov.iov_base = bp->nb_data + firstpg * PAGE_SIZE; - iov.iov_len = auio.uio_resid; - error = nfs_readrpc(vp, &auio, cred); + auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64), + UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); + if (!auio) { + error = ENOMEM; + } else { + uio_addiov(auio, CAST_USER_ADDR_T((bp->nb_data + firstpg * PAGE_SIZE)), + ((lastpg - firstpg + 1) * PAGE_SIZE)); + error = nfs_readrpc(vp, auio, cred, p); + } if (error) { + if (np->n_flag & NNOCACHE) + SET(bp->nb_flags, NB_NOCACHE); nfs_buf_release(bp, 1); FSDBG_BOT(514, vp, 0xd1e000e, 0, error); return (error); @@ -1404,10 +1715,10 @@ again: bp->nb_validoff = trunc_page_32(on); bp->nb_validend = round_page_32(on+n); nfs_buf_normalize_valid_range(np, bp); - if (auio.uio_resid > 0) { + if (uio_resid(auio) > 0) { /* if short read, must have hit EOF, */ /* so zero the rest of the range */ - bzero(iov.iov_base, auio.uio_resid); + bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio)); } /* mark the pages (successfully read) as valid */ for (pg=firstpg; pg <= lastpg; pg++) @@ -1420,13 +1731,14 @@ again: CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); error = nfs_doio(bp, cred, p); if (error) { + if (np->n_flag & NNOCACHE) + SET(bp->nb_flags, NB_NOCACHE); nfs_buf_release(bp, 1); FSDBG_BOT(514, vp, 0xd1e000f, 0, error); return (error); } } buffer_ready: - vp->v_lastr = lbn; /* validate read range against valid range and clip */ if (bp->nb_validend > 0) { diff = (on >= bp->nb_validend) ? 
0 : (bp->nb_validend - on); @@ -1437,11 +1749,11 @@ buffer_ready: NFS_BUF_MAP(bp); break; case VLNK: - nfsstats.biocache_readlinks++; - bp = nfs_buf_get(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation); - if (!bp) { - FSDBG_BOT(514, vp, 0xd1e0010, 0, EINTR); - return (EINTR); + OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readlinks); + error = nfs_buf_get(vp, 0, NFS_MAXPATHLEN, p, NBLK_READ, &bp); + if (error) { + FSDBG_BOT(514, vp, 0xd1e0010, 0, error); + return (error); } if (!ISSET(bp->nb_flags, NB_CACHE)) { SET(bp->nb_flags, NB_READ); @@ -1453,21 +1765,22 @@ buffer_ready: return (error); } } - n = min(uio->uio_resid, bp->nb_validend); + // LP64todo - fix this! + n = min(uio_uio_resid(uio), bp->nb_validend); on = 0; break; case VDIR: - nfsstats.biocache_readdirs++; + OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readdirs); if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) { FSDBG_BOT(514, vp, 0xde0f0001, 0, 0); return (0); } lbn = uio->uio_offset / NFS_DIRBLKSIZ; on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); - bp = nfs_buf_get(vp, lbn, NFS_DIRBLKSIZ, p, operation); - if (!bp) { - FSDBG_BOT(514, vp, 0xd1e0012, 0, EINTR); - return (EINTR); + error = nfs_buf_get(vp, lbn, NFS_DIRBLKSIZ, p, NBLK_READ, &bp); + if (error) { + FSDBG_BOT(514, vp, 0xd1e0012, 0, error); + return (error); } if (!ISSET(bp->nb_flags, NB_CACHE)) { SET(bp->nb_flags, NB_READ); @@ -1484,16 +1797,16 @@ buffer_ready: * reading from the beginning to get all the * offset cookies. */ - for (i = 0; i <= lbn && !error; i++) { + for (tlbn = 0; tlbn <= lbn && !error; tlbn++) { if (np->n_direofoffset - && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) { + && (tlbn * NFS_DIRBLKSIZ) >= np->n_direofoffset) { FSDBG_BOT(514, vp, 0xde0f0002, 0, 0); return (0); } - bp = nfs_buf_get(vp, i, NFS_DIRBLKSIZ, p, operation); - if (!bp) { - FSDBG_BOT(514, vp, 0xd1e0013, 0, EINTR); - return (EINTR); + error = nfs_buf_get(vp, tlbn, NFS_DIRBLKSIZ, p, NBLK_READ, &bp); + if (error) { + FSDBG_BOT(514, vp, 0xd1e0013, 0, error); + return (error); } if (!ISSET(bp->nb_flags, NB_CACHE)) { SET(bp->nb_flags, NB_READ); @@ -1511,7 +1824,7 @@ buffer_ready: * is not the block we want, we throw away the * block and go for the next one via the for loop. */ - if (error || i < lbn) + if (error || tlbn < lbn) nfs_buf_release(bp, 1); } } @@ -1534,10 +1847,12 @@ buffer_ready: if (nfs_numasync > 0 && nmp->nm_readahead > 0 && (np->n_direofoffset == 0 || (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && - !(np->n_flag & NQNFSNONCACHE) && - !nfs_buf_incore(vp, lbn + 1)) { - rabp = nfs_buf_get(vp, lbn + 1, NFS_DIRBLKSIZ, p, - operation); + !nfs_buf_is_incore(vp, lbn + 1)) { + error = nfs_buf_get(vp, lbn + 1, NFS_DIRBLKSIZ, p, NBLK_READ|NBLK_NOWAIT, &rabp); + if (error) { + FSDBG_BOT(514, vp, 0xd1e0015, 0, error); + return (error); + } if (rabp) { if (!ISSET(rabp->nb_flags, (NB_CACHE))) { SET(rabp->nb_flags, (NB_READ | NB_ASYNC)); @@ -1555,7 +1870,8 @@ buffer_ready: * Make sure we use a signed variant of min() since * the second term may be negative. */ - n = lmin(uio->uio_resid, bp->nb_validend - on); + // LP64todo - fix this! 
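/*
 * [Editor's aside] The cookie walk above in miniature: an NFS READDIR for
 * directory block N needs the offset cookie returned with block N-1, so a
 * cache miss on block lbn forces a sequential read from block 0 forward.
 * fetch_dir_block() is a hypothetical helper that returns the cookie to
 * use for the next call.
 */
#include <stdint.h>

extern int fetch_dir_block(int64_t blk, uint64_t cookie_in,
                           uint64_t *cookie_out);

static int read_dir_block(int64_t lbn)
{
    uint64_t cookie = 0;                 /* block 0 starts at cookie 0 */
    for (int64_t b = 0; b <= lbn; b++) {
        int error = fetch_dir_block(b, cookie, &cookie);
        if (error)
            return error;                /* caller sees the error */
    }
    return 0;                            /* block lbn and its cookie cached */
}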
+ n = lmin(uio_uio_resid(uio), bp->nb_validend - on); /* * We keep track of the directory eof in * np->n_direofoffset and chop it off as an @@ -1583,28 +1899,30 @@ buffer_ready: } break; default: - printf("nfs_bioread: type %x unexpected\n",vp->v_type); - FSDBG_BOT(514, vp, 0xd1e0015, 0, EINVAL); + printf("nfs_bioread: type %x unexpected\n", vtype); + FSDBG_BOT(514, vp, 0xd1e0016, 0, EINVAL); return (EINVAL); }; if (n > 0) { error = uiomove(bp->nb_data + on, (int)n, uio); } - switch (vp->v_type) { + switch (vtype) { case VREG: + if (np->n_flag & NNOCACHE) + SET(bp->nb_flags, NB_NOCACHE); break; case VLNK: n = 0; break; case VDIR: - if (np->n_flag & NQNFSNONCACHE) - SET(bp->nb_flags, NB_INVAL); + break; + default: break; } - nfs_buf_release(bp, 1); - } while (error == 0 && uio->uio_resid > 0 && n > 0); - FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error); + nfs_buf_release(bp, 1); + } while (error == 0 && uio_uio_resid(uio) > 0 && n > 0); + FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error); return (error); } @@ -1614,42 +1932,54 @@ buffer_ready: */ int nfs_write(ap) - struct vop_write_args /* { - struct vnode *a_vp; + struct vnop_write_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; + int a_ioflag; + vfs_context_t a_context; } */ *ap; { struct uio *uio = ap->a_uio; - struct proc *p = uio->uio_procp; - struct vnode *vp = ap->a_vp; + vnode_t vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); - struct ucred *cred = ap->a_cred; + proc_t p; + kauth_cred_t cred; int ioflag = ap->a_ioflag; struct nfsbuf *bp; - struct vattr vattr; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - daddr_t lbn; - int biosize, bufsize, writeop; - int n, on, error = 0, iomode, must_commit; + struct nfs_vattr nvattr; + struct nfsmount *nmp = VFSTONFS(vnode_mount(vp)); + daddr64_t lbn; + int biosize, bufsize; + int n, on, error = 0; off_t boff, start, end, cureof; - struct iovec iov; + struct iovec_32 iov; struct uio auio; - FSDBG_TOP(515, vp, uio->uio_offset, uio->uio_resid, ioflag); + FSDBG_TOP(515, vp, uio->uio_offset, uio_uio_resid(uio), ioflag); #if DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("nfs_write mode"); - if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc()) + if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) panic("nfs_write proc"); #endif - if (vp->v_type != VREG) + + p = vfs_context_proc(ap->a_context); + cred = vfs_context_ucred(ap->a_context); + + if (vnode_vtype(vp) != VREG) return (EIO); + + np->n_flag |= NWRBUSY; + + if (np->n_flag & NNEEDINVALIDATE) { + np->n_flag &= ~NNEEDINVALIDATE; + nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cred, p, 1); + } if (np->n_flag & NWRITEERR) { - np->n_flag &= ~NWRITEERR; - FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, np->n_error); + np->n_flag &= ~(NWRITEERR | NWRBUSY); + FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), np->n_error); return (np->n_error); } if ((nmp->nm_flag & NFSMNT_NFSV3) && @@ -1657,17 +1987,19 @@ nfs_write(ap) (void)nfs_fsinfo(nmp, vp, cred, p); if (ioflag & (IO_APPEND | IO_SYNC)) { if (np->n_flag & NMODIFIED) { - np->n_xid = 0; + NATTRINVALIDATE(np); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) { + np->n_flag &= ~NWRBUSY; FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad01, error); return (error); } } if (ioflag & IO_APPEND) { - np->n_xid = 0; - error = VOP_GETATTR(vp, &vattr, cred, p); + NATTRINVALIDATE(np); + error = nfs_getattr(vp, &nvattr, cred, p); if (error) { + np->n_flag &= ~NWRBUSY; FSDBG_BOT(515, vp, 
uio->uio_offset, 0x10bad02, error); return (error); } @@ -1675,71 +2007,40 @@ nfs_write(ap) } } if (uio->uio_offset < 0) { + np->n_flag &= ~NWRBUSY; FSDBG_BOT(515, vp, uio->uio_offset, 0xbad0ff, EINVAL); return (EINVAL); } - if (uio->uio_resid == 0) { - FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0); + if (uio_uio_resid(uio) == 0) { + np->n_flag &= ~NWRBUSY; + FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), 0); return (0); } - /* - * Maybe this should be above the vnode op call, but so long as - * file servers have no limits, i don't think it matters - */ - if (p && uio->uio_offset + uio->uio_resid > - p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { - psignal(p, SIGXFSZ); - FSDBG_BOT(515, vp, uio->uio_offset, 0x2b1f, EFBIG); - return (EFBIG); - } - biosize = vp->v_mount->mnt_stat.f_iosize; + biosize = vfs_statfs(vnode_mount(vp))->f_iosize; - do { - /* - * Check for a valid write lease. - */ - if ((nmp->nm_flag & NFSMNT_NQNFS) && - NQNFS_CKINVALID(vp, np, ND_WRITE)) { - do { - error = nqnfs_getlease(vp, ND_WRITE, cred, p); - } while (error == NQNFS_EXPIRED); - if (error) { - FSDBG_BOT(515, vp, uio->uio_offset, 0x11110001, error); - return (error); - } - if (np->n_lrev != np->n_brev || - (np->n_flag & NQNFSNONCACHE)) { + if (vnode_isnocache(vp)) { + if (!(np->n_flag & NNOCACHE)) { + if (NVALIDBUFS(np)) { error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) { - FSDBG_BOT(515, vp, uio->uio_offset, 0x11110002, error); + np->n_flag &= ~NWRBUSY; + FSDBG_BOT(515, vp, 0, 0, error); return (error); } - np->n_brev = np->n_lrev; - } - } - if (ISSET(vp->v_flag, VNOCACHE_DATA) && - (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) { - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(515, vp, 0, 0, error); - return (error); } + np->n_flag |= NNOCACHE; } - if (((np->n_flag & NQNFSNONCACHE) || - ISSET(vp->v_flag, VNOCACHE_DATA)) && - uio->uio_iovcnt == 1) { - iomode = NFSV3WRITE_FILESYNC; - error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); - if (must_commit) - nfs_clearcommit(vp->v_mount); - FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); - return (error); - } - nfsstats.biocache_writes++; + } else if (np->n_flag & NNOCACHE) { + np->n_flag &= ~NNOCACHE; + } + + do { + OSAddAtomic(1, (SInt32*)&nfsstats.biocache_writes); lbn = uio->uio_offset / biosize; on = uio->uio_offset % biosize; - n = min((unsigned)(biosize - on), uio->uio_resid); + // LP64todo - fix this + n = min((unsigned)(biosize - on), uio_uio_resid(uio)); again: bufsize = biosize; /* @@ -1748,22 +2049,22 @@ again: * either has no dirty region or that the given range is * contiguous with the existing dirty region. 
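/*
 * [Editor's aside] The rule stated above, as a tiny decision function: an
 * nfsbuf tracks a single dirty byte range (nb_dirtyoff..nb_dirtyend), so a
 * new write must either overlap/abut that range (and grow it) or the
 * buffer has to be written out first.  Illustrative only; "flush" here
 * stands in for nfs_buf_write().
 */
static int merge_or_flush(int dirtyoff, int dirtyend,  /* existing range */
                          int on, int n,               /* incoming write */
                          int *newoff, int *newend)
{
    if (dirtyend > 0 && (on > dirtyend || on + n < dirtyoff))
        return 1;                        /* disjoint: caller must flush */
    /* empty or contiguous/overlapping: grow the single dirty range */
    *newoff = (dirtyend > 0 && dirtyoff < on) ? dirtyoff : on;
    *newend = (dirtyend > on + n) ? dirtyend : on + n;
    return 0;                            /* merged */
}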
*/ - bp = nfs_buf_get(vp, lbn, bufsize, p, BLK_WRITE); - if (!bp) { - FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, EINTR); - return (EINTR); + error = nfs_buf_get(vp, lbn, bufsize, p, NBLK_WRITE, &bp); + if (error) { + np->n_flag &= ~NWRBUSY; + FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error); + return (error); } /* map the block because we know we're going to write to it */ NFS_BUF_MAP(bp); - if (ISSET(vp->v_flag, VNOCACHE_DATA)) - SET(bp->nb_flags, (NB_NOCACHE|NB_INVAL)); + if (np->n_flag & NNOCACHE) + SET(bp->nb_flags, (NB_NOCACHE|NB_STABLE)); - /* - * NFS has embedded ucred so crhold() risks zone corruption - */ - if (bp->nb_wcred == NOCRED) - bp->nb_wcred = crdup(cred); + if (bp->nb_wcred == NOCRED) { + kauth_cred_ref(cred); + bp->nb_wcred = cred; + } /* * If there's already a dirty range AND dirty pages in this block we @@ -1787,7 +2088,8 @@ again: SET(bp->nb_flags, (NB_ASYNC | NB_STABLE)); error = nfs_buf_write(bp); if (error) { - FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); + np->n_flag &= ~NWRBUSY; + FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error); return (error); } goto again; @@ -1808,7 +2110,8 @@ again: SET(bp->nb_flags, (NB_ASYNC | NB_STABLE)); error = nfs_buf_write(bp); if (error) { - FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); + np->n_flag &= ~NWRBUSY; + FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error); return (error); } goto again; @@ -1821,7 +2124,7 @@ again: if (NBPGDIRTY(bp,lastpg)) { bp->nb_dirtyend = (lastpg+1) * PAGE_SIZE; /* clip to EOF */ - if (NBOFF(bp) + bp->nb_dirtyend > np->n_size) + if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size) bp->nb_dirtyend = np->n_size - NBOFF(bp); } else bp->nb_dirtyend = on+n; @@ -1835,16 +2138,22 @@ again: * and zero the new bytes. */ cureof = (off_t)np->n_size; - if (uio->uio_offset + n > np->n_size) { + if (uio->uio_offset + n > (off_t)np->n_size) { struct nfsbuf *eofbp = NULL; - daddr_t eofbn = np->n_size / biosize; + daddr64_t eofbn = np->n_size / biosize; int eofoff = np->n_size % biosize; int neweofoff = (uio->uio_offset + n) % biosize; FSDBG(515, 0xb1ffa000, uio->uio_offset + n, eofoff, neweofoff); - if (eofoff && eofbn < lbn && nfs_buf_incore(vp, eofbn)) - eofbp = nfs_buf_get(vp, eofbn, biosize, p, BLK_WRITE); + if (eofoff && (eofbn < lbn)) { + error = nfs_buf_get(vp, eofbn, biosize, p, NBLK_WRITE|NBLK_ONLYVALID, &eofbp); + if (error) { + np->n_flag &= ~NWRBUSY; + FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error); + return (error); + } + } /* if we're extending within the same last block */ /* and the block is flagged as being cached... */ @@ -1891,8 +2200,8 @@ again: */ char *d; int i; - if (ISSET(vp->v_flag, VNOCACHE_DATA)) - SET(eofbp->nb_flags, (NB_NOCACHE|NB_INVAL)); + if (np->n_flag & NNOCACHE) + SET(eofbp->nb_flags, (NB_NOCACHE|NB_STABLE)); NFS_BUF_MAP(eofbp); FSDBG(516, eofbp, eofoff, biosize - eofoff, 0xe0fff01e); d = eofbp->nb_data; @@ -1915,7 +2224,7 @@ again: * If dirtyend exceeds file size, chop it down. This should * not occur unless there is a race. 
*/ - if (NBOFF(bp) + bp->nb_dirtyend > np->n_size) + if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size) bp->nb_dirtyend = np->n_size - NBOFF(bp); /* * UBC doesn't handle partial pages, so we need to make sure @@ -1979,34 +2288,39 @@ again: NFS_BUF_MAP(bp); /* setup uio for read(s) */ boff = NBOFF(bp); - auio.uio_iov = &iov; + auio.uio_iovs.iov32p = &iov; auio.uio_iovcnt = 1; +#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ auio.uio_segflg = UIO_SYSSPACE; +#else + auio.uio_segflg = UIO_SYSSPACE32; +#endif auio.uio_rw = UIO_READ; - auio.uio_procp = p; if (dirtypg <= (end-1)/PAGE_SIZE) { /* there's a dirty page in the way, so just do two reads */ /* we'll read the preceding data here */ auio.uio_offset = boff + start; - auio.uio_resid = iov.iov_len = on - start; - iov.iov_base = bp->nb_data + start; - error = nfs_readrpc(vp, &auio, cred); + iov.iov_len = on - start; + uio_uio_resid_set(&auio, iov.iov_len); + iov.iov_base = (uintptr_t) bp->nb_data + start; + error = nfs_readrpc(vp, &auio, cred, p); if (error) { bp->nb_error = error; SET(bp->nb_flags, NB_ERROR); printf("nfs_write: readrpc %d", error); } - if (auio.uio_resid > 0) { - FSDBG(516, bp, iov.iov_base - bp->nb_data, auio.uio_resid, 0xd00dee01); - bzero(iov.iov_base, auio.uio_resid); + if (uio_uio_resid(&auio) > 0) { + FSDBG(516, bp, iov.iov_base - bp->nb_data, uio_uio_resid(&auio), 0xd00dee01); + // LP64todo - fix this + bzero((caddr_t)iov.iov_base, uio_uio_resid(&auio)); } /* update validoff/validend if necessary */ if ((bp->nb_validoff < 0) || (bp->nb_validoff > start)) bp->nb_validoff = start; if ((bp->nb_validend < 0) || (bp->nb_validend < on)) bp->nb_validend = on; - if (np->n_size > boff + bp->nb_validend) + if ((off_t)np->n_size > boff + bp->nb_validend) bp->nb_validend = min(np->n_size - (boff + start), biosize); /* validate any pages before the write offset */ for (; start < on/PAGE_SIZE; start+=PAGE_SIZE) @@ -2037,17 +2351,19 @@ again: } else { /* now we'll read the (rest of the) data */ auio.uio_offset = boff + start; - auio.uio_resid = iov.iov_len = end - start; - iov.iov_base = bp->nb_data + start; - error = nfs_readrpc(vp, &auio, cred); + iov.iov_len = end - start; + uio_uio_resid_set(&auio, iov.iov_len); + iov.iov_base = (uintptr_t) (bp->nb_data + start); + error = nfs_readrpc(vp, &auio, cred, p); if (error) { bp->nb_error = error; SET(bp->nb_flags, NB_ERROR); printf("nfs_write: readrpc %d", error); } - if (auio.uio_resid > 0) { - FSDBG(516, bp, iov.iov_base - bp->nb_data, auio.uio_resid, 0xd00dee02); - bzero(iov.iov_base, auio.uio_resid); + if (uio_uio_resid(&auio) > 0) { + FSDBG(516, bp, iov.iov_base - bp->nb_data, uio_uio_resid(&auio), 0xd00dee02); + // LP64todo - fix this + bzero((caddr_t)iov.iov_base, uio_uio_resid(&auio)); } } /* update validoff/validend if necessary */ @@ -2055,7 +2371,7 @@ again: bp->nb_validoff = start; if ((bp->nb_validend < 0) || (bp->nb_validend < end)) bp->nb_validend = end; - if (np->n_size > boff + bp->nb_validend) + if ((off_t)np->n_size > boff + bp->nb_validend) bp->nb_validend = min(np->n_size - (boff + start), biosize); /* validate any pages before the write offset's page */ for (; start < trunc_page_32(on); start+=PAGE_SIZE) @@ -2070,44 +2386,20 @@ again: if (ISSET(bp->nb_flags, NB_ERROR)) { error = bp->nb_error; nfs_buf_release(bp, 1); - FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); + np->n_flag &= ~NWRBUSY; + FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error); return (error); } np->n_flag |= NMODIFIED; - /* - * Check for 
valid write lease and get one as required. - * In case nfs_buf_get() and/or nfs_buf_write() delayed us. - */ - if ((nmp->nm_flag & NFSMNT_NQNFS) && - NQNFS_CKINVALID(vp, np, ND_WRITE)) { - do { - error = nqnfs_getlease(vp, ND_WRITE, cred, p); - } while (error == NQNFS_EXPIRED); - if (error) { - nfs_buf_release(bp, 1); - FSDBG_BOT(515, vp, uio->uio_offset, 0x11220001, error); - return (error); - } - if (np->n_lrev != np->n_brev || - (np->n_flag & NQNFSNONCACHE)) { - nfs_buf_release(bp, 1); - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(515, vp, uio->uio_offset, 0x11220002, error); - return (error); - } - np->n_brev = np->n_lrev; - goto again; - } - } NFS_BUF_MAP(bp); error = uiomove((char *)bp->nb_data + on, n, uio); if (error) { SET(bp->nb_flags, NB_ERROR); nfs_buf_release(bp, 1); - FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); + np->n_flag &= ~NWRBUSY; + FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error); return (error); } @@ -2153,36 +2445,29 @@ again: } CLR(bp->nb_flags, NB_NEEDCOMMIT); - if ((np->n_flag & NQNFSNONCACHE) || - (ioflag & IO_SYNC) || (vp->v_flag & VNOCACHE_DATA)) { + if (ioflag & IO_SYNC) { bp->nb_proc = p; error = nfs_buf_write(bp); if (error) { + np->n_flag &= ~NWRBUSY; FSDBG_BOT(515, vp, uio->uio_offset, - uio->uio_resid, error); + uio_uio_resid(uio), error); return (error); } - if (np->n_flag & NQNFSNONCACHE) { - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(515, vp, uio->uio_offset, - uio->uio_resid, error); - return (error); - } - } - } else if ((n + on) == biosize && (nmp->nm_flag & NFSMNT_NQNFS) == 0) { - bp->nb_proc = (struct proc *)0; + } else if (((n + on) == biosize) || (np->n_flag & NNOCACHE)) { + bp->nb_proc = NULL; SET(bp->nb_flags, NB_ASYNC); nfs_buf_write(bp); } else - nfs_buf_write_delayed(bp); + nfs_buf_write_delayed(bp, p); - if (np->n_needcommitcnt > (nbuf/16)) - nfs_flushcommits(vp, p); + if (np->n_needcommitcnt > (nfsbufcnt/16)) + nfs_flushcommits(vp, p, 1); - } while (uio->uio_resid > 0 && n > 0); + } while (uio_uio_resid(uio) > 0 && n > 0); - FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0); + np->n_flag &= ~NWRBUSY; + FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), 0); return (0); } @@ -2191,60 +2476,63 @@ again: * Called with the underlying object locked. 
*/ static int -nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, slptimeo) - register struct vnode *vp; - int flags; - struct ucred *cred; - struct proc *p; - int slpflag, slptimeo; +nfs_vinvalbuf_internal( + vnode_t vp, + int flags, + kauth_cred_t cred, + proc_t p, + int slpflag, + int slptimeo) { struct nfsbuf *bp; - struct nfsbuf *nbp, *blist; - int s, error = 0; + struct nfsbuflists blist; + int list, error = 0; struct nfsnode *np = VTONFS(vp); if (flags & V_SAVE) { - if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) + if ((error = nfs_flush(vp, MNT_WAIT, cred, p, + (flags & V_IGNORE_WRITEERR)))) return (error); - if (np->n_dirtyblkhd.lh_first) + if (!LIST_EMPTY(&np->n_dirtyblkhd)) panic("nfs_vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)", - vp, np->n_dirtyblkhd.lh_first); + vp, LIST_FIRST(&np->n_dirtyblkhd)); } + lck_mtx_lock(nfs_buf_mutex); for (;;) { - blist = np->n_cleanblkhd.lh_first; - if (!blist) - blist = np->n_dirtyblkhd.lh_first; - if (!blist) - break; - - for (bp = blist; bp; bp = nbp) { - nbp = bp->nb_vnbufs.le_next; - s = splbio(); - if (ISSET(bp->nb_flags, NB_BUSY)) { - SET(bp->nb_flags, NB_WANTED); - FSDBG_TOP(556, vp, bp, NBOFF(bp), bp->nb_flags); - error = tsleep((caddr_t)bp, - slpflag | (PRIBIO + 1), "nfs_vinvalbuf", - slptimeo); - FSDBG_BOT(556, vp, bp, NBOFF(bp), bp->nb_flags); - splx(s); - if (error) { + list = NBI_CLEAN; + if (nfs_buf_iterprepare(np, &blist, list)) { + list = NBI_DIRTY; + if (nfs_buf_iterprepare(np, &blist, list)) + break; + } + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + if (list == NBI_CLEAN) + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + else + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) { + FSDBG(556, vp, bp, NBOFF(bp), bp->nb_flags); + if (error != EAGAIN) { FSDBG(554, vp, bp, -1, error); + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, list); + lck_mtx_unlock(nfs_buf_mutex); return (error); } - break; } + nfs_buf_refrele(bp); FSDBG(554, vp, bp, NBOFF(bp), bp->nb_flags); - nfs_buf_remfree(bp); - SET(bp->nb_flags, NB_BUSY); - splx(s); - if ((flags & V_SAVE) && UBCINFOEXISTS(vp) && (NBOFF(bp) < np->n_size)) { + lck_mtx_unlock(nfs_buf_mutex); + if ((flags & V_SAVE) && UBCINFOEXISTS(vp) && bp->nb_vp && + (NBOFF(bp) < (off_t)np->n_size)) { /* XXX extra paranoia: make sure we're not */ /* somehow leaving any dirty data around */ int mustwrite = 0; - int end = (NBOFF(bp) + bp->nb_bufsize >= np->n_size) ? - bp->nb_bufsize : (np->n_size - NBOFF(bp)); + int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ? 
+ ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize; if (!ISSET(bp->nb_flags, NB_PAGELIST)) { error = nfs_buf_upl_setup(bp); if (error == EINVAL) { @@ -2252,8 +2540,7 @@ nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, slptimeo) /* hopefully we don't need to do */ /* anything for this buffer */ } else if (error) - printf("nfs_vinvalbuf: upl setup failed %d\n", - error); + printf("nfs_vinvalbuf: upl setup failed %d\n", error); bp->nb_valid = bp->nb_dirty = 0; } nfs_buf_upl_check(bp); @@ -2265,8 +2552,11 @@ nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, slptimeo) mustwrite++; } bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1; - if (bp->nb_dirty) - mustwrite++; + /* also make sure we'll have a credential to do the write */ + if (mustwrite && (bp->nb_wcred == NOCRED) && (cred == NOCRED)) { + printf("nfs_vinvalbuf: found dirty buffer with no write creds\n"); + mustwrite = 0; + } if (mustwrite) { FSDBG(554, vp, bp, 0xd00dee, bp->nb_flags); if (!ISSET(bp->nb_flags, NB_PAGELIST)) @@ -2276,30 +2566,39 @@ nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, slptimeo) /* (NB_NOCACHE indicates buffer should be discarded) */ CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC)); SET(bp->nb_flags, NB_STABLE | NB_NOCACHE); - /* - * NFS has embedded ucred so crhold() risks zone corruption - */ - if (bp->nb_wcred == NOCRED) - bp->nb_wcred = crdup(cred); + if (bp->nb_wcred == NOCRED) { + kauth_cred_ref(cred); + bp->nb_wcred = cred; + } error = nfs_buf_write(bp); // Note: bp has been released if (error) { FSDBG(554, bp, 0xd00dee, 0xbad, error); np->n_error = error; np->n_flag |= NWRITEERR; + /* + * There was a write error and we need to + * invalidate attrs to sync with server. + * (if this write was extending the file, + * we may no longer know the correct size) + */ + NATTRINVALIDATE(np); error = 0; } - break; + lck_mtx_lock(nfs_buf_mutex); + continue; } } SET(bp->nb_flags, NB_INVAL); - // Note: We don't want to do FREEUPs here because - // that may modify the buffer chain we're iterating! + // hold off on FREEUPs until we're done here nfs_buf_release(bp, 0); + lck_mtx_lock(nfs_buf_mutex); } + nfs_buf_itercomplete(np, &blist, list); } + lck_mtx_unlock(nfs_buf_mutex); NFS_BUF_FREEUP(); - if (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first) + if (NVALIDBUFS(np)) panic("nfs_vinvalbuf: flush failed"); return (0); } @@ -2310,17 +2609,17 @@ nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, slptimeo) * doing the flush, just wait for completion. 
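/*
 * [Editor's aside] The NFLUSHINPROG/NFLUSHWANT handshake described above,
 * modeled in userland with a pthread mutex and condition variable (an
 * assumption -- the kernel uses flag bits with tsleep/wakeup): whoever
 * finds no flush in progress becomes the flusher; latecomers wait, then
 * take their own turn.
 */
#include <pthread.h>

static pthread_mutex_t flush_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  flush_done = PTHREAD_COND_INITIALIZER;
static int flush_in_progress;

extern void do_flush(void);              /* the actual invalidation work */

static void flush_once(void)
{
    pthread_mutex_lock(&flush_lock);
    while (flush_in_progress)            /* someone else is flushing: */
        pthread_cond_wait(&flush_done, &flush_lock);   /* wait our turn */
    flush_in_progress = 1;               /* we are now the flusher */
    pthread_mutex_unlock(&flush_lock);

    do_flush();                          /* flush without holding the lock */

    pthread_mutex_lock(&flush_lock);
    flush_in_progress = 0;
    pthread_cond_broadcast(&flush_done); /* release any waiters */
    pthread_mutex_unlock(&flush_lock);
}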
*/ int -nfs_vinvalbuf(vp, flags, cred, p, intrflg) - struct vnode *vp; - int flags; - struct ucred *cred; - struct proc *p; - int intrflg; +nfs_vinvalbuf( + vnode_t vp, + int flags, + kauth_cred_t cred, + proc_t p, + int intrflg) { - register struct nfsnode *np = VTONFS(vp); - struct nfsmount *nmp = VFSTONFS(vp->v_mount); + struct nfsnode *np = VTONFS(vp); + struct nfsmount *nmp = VFSTONFS(vnode_mount(vp)); int error = 0, slpflag, slptimeo; - int didhold = 0; + off_t size; FSDBG_TOP(554, vp, flags, intrflg, 0); @@ -2341,7 +2640,7 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) FSDBG_TOP(555, vp, flags, intrflg, np->n_flag); error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo); FSDBG_BOT(555, vp, flags, intrflg, np->n_flag); - if (error && (error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p))) { + if (error && (error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) { FSDBG_BOT(554, vp, flags, intrflg, error); return (error); } @@ -2354,7 +2653,7 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) error = nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, 0); while (error) { FSDBG(554, vp, 0, 0, error); - error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p); + error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p); if (error) { np->n_flag &= ~NFLUSHINPROG; if (np->n_flag & NFLUSHWANT) { @@ -2371,13 +2670,15 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) np->n_flag &= ~NFLUSHWANT; wakeup((caddr_t)&np->n_flag); } - didhold = ubc_hold(vp); - if (didhold) { - int rv = ubc_clean(vp, 1); /* get the pages out of vm also */ + /* + * get the pages out of vm also + */ + if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) { + int rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_INVALIDATE); if (!rv) - panic("nfs_vinvalbuf(): ubc_clean failed!"); - ubc_rele(vp); + panic("nfs_vinvalbuf(): ubc_sync_range failed!"); } + FSDBG_BOT(554, vp, flags, intrflg, 0); return (0); } @@ -2390,7 +2691,7 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) int nfs_asyncio(bp, cred) struct nfsbuf *bp; - struct ucred *cred; + kauth_cred_t cred; { struct nfsmount *nmp; int i; @@ -2398,18 +2699,22 @@ nfs_asyncio(bp, cred) int slpflag = 0; int slptimeo = 0; int error, error2; + void *wakeme = NULL; + struct timespec ts; if (nfs_numasync == 0) return (EIO); FSDBG_TOP(552, bp, bp ? NBOFF(bp) : 0, bp ? bp->nb_flags : 0, 0); - nmp = ((bp != NULL) ? VFSTONFS(bp->nb_vp->v_mount) : NULL); + nmp = ((bp != NULL) ? VFSTONFS(vnode_mount(bp->nb_vp)) : NULL); again: if (nmp && nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; gotiod = FALSE; + lck_mtx_lock(nfs_iod_mutex); + /* no nfsbuf means tell nfsiod to process delwri list */ if (!bp) nfs_ioddelwri = 1; @@ -2423,21 +2728,23 @@ again: * Found one, so wake it up and tell it which * mount to process. 
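/*
 * Aside: nfs_vinvalbuf() above retains the single-flusher gate: the first
 * caller sets NFLUSHINPROG, later callers set NFLUSHWANT and sleep on
 * &np->n_flag until the flush completes.  The same gate in portable C
 * with a condition variable (illustrative sketch, not xnu code):
 */
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t flush_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  flush_cv  = PTHREAD_COND_INITIALIZER;
static bool flush_in_progress;

static void
flush_once(void (*do_flush)(void))
{
    pthread_mutex_lock(&flush_mtx);
    while (flush_in_progress)            /* someone else is flushing */
        pthread_cond_wait(&flush_cv, &flush_mtx);   /* NFLUSHWANT analog */
    flush_in_progress = true;            /* NFLUSHINPROG analog */
    pthread_mutex_unlock(&flush_mtx);

    do_flush();                          /* the actual invalidation work */

    pthread_mutex_lock(&flush_mtx);
    flush_in_progress = false;
    pthread_cond_broadcast(&flush_cv);   /* wakeup(&np->n_flag) analog */
    pthread_mutex_unlock(&flush_mtx);
}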
*/ - NFS_DPF(ASYNCIO, - ("nfs_asyncio: waking iod %d for mount %p\n", - i, nmp)); - nfs_iodwant[i] = (struct proc *)0; + nfs_iodwant[i] = NULL; nfs_iodmount[i] = nmp; if (nmp) nmp->nm_bufqiods++; - wakeup((caddr_t)&nfs_iodwant[i]); + wakeme = &nfs_iodwant[i]; gotiod = TRUE; break; } /* if we're just poking the delwri list, we're done */ - if (!bp) + if (!bp) { + lck_mtx_unlock(nfs_iod_mutex); + if (wakeme) + wakeup(wakeme); + FSDBG_BOT(552, bp, 0x10101010, wakeme, 0); return (0); + } /* * If none are free, we may already have an iod working on this mount @@ -2445,9 +2752,6 @@ again: */ if (!gotiod) { if (nmp->nm_bufqiods > 0) { - NFS_DPF(ASYNCIO, - ("nfs_asyncio: %d iods are already processing mount %p\n", - nmp->nm_bufqiods, nmp)); gotiod = TRUE; } } @@ -2470,14 +2774,18 @@ again: goto out; } FSDBG(552, bp, nmp->nm_bufqlen, 2*nfs_numasync, -1); - NFS_DPF(ASYNCIO, - ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp)); nmp->nm_bufqwant = TRUE; - error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO, - "nfsaio", slptimeo); + + ts.tv_sec = (slptimeo/100); + /* the hz value is 100; which leads to 10ms */ + ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000; + + error = msleep(&nmp->nm_bufq, nfs_iod_mutex, slpflag | PRIBIO, + "nfsaio", &ts); if (error) { error2 = nfs_sigintr(nmp, NULL, bp->nb_proc); if (error2) { + lck_mtx_unlock(nfs_iod_mutex); FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, error2); return (error2); } @@ -2491,43 +2799,39 @@ again: * so check and loop if nescessary. */ if (nmp->nm_bufqiods == 0) { - NFS_DPF(ASYNCIO, - ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp)); + lck_mtx_unlock(nfs_iod_mutex); goto again; } } if (ISSET(bp->nb_flags, NB_READ)) { if (bp->nb_rcred == NOCRED && cred != NOCRED) { - /* - * NFS has embedded ucred. - * Can not crhold() here as that causes zone corruption - */ - bp->nb_rcred = crdup(cred); + kauth_cred_ref(cred); + bp->nb_rcred = cred; } } else { SET(bp->nb_flags, NB_WRITEINPROG); if (bp->nb_wcred == NOCRED && cred != NOCRED) { - /* - * NFS has embedded ucred. - * Can not crhold() here as that causes zone corruption - */ - bp->nb_wcred = crdup(cred); + kauth_cred_ref(cred); + bp->nb_wcred = cred; } } TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, nb_free); nmp->nm_bufqlen++; + lck_mtx_unlock(nfs_iod_mutex); + if (wakeme) + wakeup(wakeme); FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, 0); return (0); } out: + lck_mtx_unlock(nfs_iod_mutex); /* * All the iods are busy on other mounts, so return EIO to * force the caller to process the i/o synchronously. */ - NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n")); FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, EIO); return (EIO); } @@ -2537,27 +2841,29 @@ out: * synchronously or from an nfsiod. 
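/*
 * Aside: the msleep() call above converts the legacy tick-based slptimeo
 * (hz == 100, i.e. 10ms per tick, per the comment in the patch) into a
 * struct timespec.  The same arithmetic as a standalone helper:
 */
#include <time.h>

#define HZ           100            /* ticks per second, as assumed above */
#define MS_PER_TICK  (1000 / HZ)    /* 10ms */

static struct timespec
ticks_to_timespec(int ticks)
{
    struct timespec ts;

    ts.tv_sec  = ticks / HZ;
    ts.tv_nsec = (long)(ticks % HZ) * MS_PER_TICK * 1000000L;
    return ts;
}
/* e.g. ticks_to_timespec(150) yields { .tv_sec = 1, .tv_nsec = 500000000 } */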
*/ int -nfs_doio(bp, cr, p) - struct nfsbuf *bp; - struct ucred *cr; - struct proc *p; +nfs_doio(struct nfsbuf *bp, kauth_cred_t cr, proc_t p) { - register struct uio *uiop; - register struct vnode *vp; + struct uio *uiop; + vnode_t vp; struct nfsnode *np; struct nfsmount *nmp; - int error = 0, diff, len, iomode, must_commit = 0; + int error = 0, diff, len, iomode, must_commit = 0, invalidate = 0; struct uio uio; - struct iovec io; + struct iovec_32 io; + enum vtype vtype; vp = bp->nb_vp; + vtype = vnode_vtype(vp); np = VTONFS(vp); - nmp = VFSTONFS(vp->v_mount); + nmp = VFSTONFS(vnode_mount(vp)); uiop = &uio; - uiop->uio_iov = &io; + uiop->uio_iovs.iov32p = &io; uiop->uio_iovcnt = 1; +#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ uiop->uio_segflg = UIO_SYSSPACE; - uiop->uio_procp = p; +#else + uiop->uio_segflg = UIO_SYSSPACE32; +#endif /* * we've decided to perform I/O for this block, @@ -2573,31 +2879,34 @@ nfs_doio(bp, cr, p) bp->nb_dirtyend); if (ISSET(bp->nb_flags, NB_READ)) { - if (vp->v_type == VREG) + if (vtype == VREG) NFS_BUF_MAP(bp); - io.iov_len = uiop->uio_resid = bp->nb_bufsize; - io.iov_base = bp->nb_data; + io.iov_len = bp->nb_bufsize; + uio_uio_resid_set(uiop, io.iov_len); + io.iov_base = (uintptr_t) bp->nb_data; uiop->uio_rw = UIO_READ; - switch (vp->v_type) { + switch (vtype) { case VREG: uiop->uio_offset = NBOFF(bp); - nfsstats.read_bios++; - error = nfs_readrpc(vp, uiop, cr); - FSDBG(262, np->n_size, NBOFF(bp), uiop->uio_resid, error); + OSAddAtomic(1, (SInt32*)&nfsstats.read_bios); + error = nfs_readrpc(vp, uiop, cr, p); + FSDBG(262, np->n_size, NBOFF(bp), uio_uio_resid(uiop), error); if (!error) { /* update valid range */ bp->nb_validoff = 0; - if (uiop->uio_resid) { + if (uio_uio_resid(uiop) != 0) { /* * If len > 0, there is a hole in the file and * no writes after the hole have been pushed to * the server yet. * Just zero fill the rest of the valid area. 
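/*
 * Aside: nfs_doio() above switches to an iovec_32 plus the
 * uio_uio_resid_set()/uio_uio_resid() accessors.  Stripped of the kernel
 * types, the bookkeeping is just an iovec, a residual byte count, and an
 * offset (illustrative sketch, not the xnu uio implementation):
 */
#include <stdint.h>
#include <sys/types.h>
#include <sys/uio.h>

struct simple_uio {
    struct iovec iov;    /* single segment, as in the patch */
    size_t resid;        /* bytes still to transfer */
    off_t  offset;       /* file offset of the next byte */
};

static void
uio_setup(struct simple_uio *u, void *base, size_t len, off_t off)
{
    u->iov.iov_base = base;
    u->iov.iov_len  = len;
    u->resid        = len;   /* the io.iov_len / uio_uio_resid_set() pair */
    u->offset       = off;
}

/* after transferring n bytes, the RPC layer would advance the state */
static void
uio_advance(struct simple_uio *u, size_t n)
{
    u->iov.iov_base = (char *)u->iov.iov_base + n;
    u->iov.iov_len -= n;
    u->resid       -= n;
    u->offset      += (off_t)n;
}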
*/ - diff = bp->nb_bufsize - uiop->uio_resid; + // LP64todo - fix this + diff = bp->nb_bufsize - uio_uio_resid(uiop); len = np->n_size - (NBOFF(bp) + diff); if (len > 0) { - len = min(len, uiop->uio_resid); + // LP64todo - fix this + len = min(len, uio_uio_resid(uiop)); bzero((char *)bp->nb_data + diff, len); bp->nb_validend = diff + len; FSDBG(258, diff, len, 0, 1); @@ -2616,38 +2925,28 @@ nfs_doio(bp, cr, p) bp->nb_bufsize - bp->nb_validend, 0, 2); } } - if (p && (vp->v_flag & VTEXT) && - (((nmp->nm_flag & NFSMNT_NQNFS) && - NQNFS_CKINVALID(vp, np, ND_READ) && - np->n_lrev != np->n_brev) || - (!(nmp->nm_flag & NFSMNT_NQNFS) && - np->n_mtime != np->n_vattr.va_mtime.tv_sec))) { - uprintf("Process killed due to text file modification\n"); - psignal(p, SIGKILL); - p->p_flag |= P_NOSWAP; - } break; case VLNK: uiop->uio_offset = (off_t)0; - nfsstats.readlink_bios++; - error = nfs_readlinkrpc(vp, uiop, cr); + OSAddAtomic(1, (SInt32*)&nfsstats.readlink_bios); + error = nfs_readlinkrpc(vp, uiop, cr, p); if (!error) { bp->nb_validoff = 0; bp->nb_validend = uiop->uio_offset; } break; case VDIR: - nfsstats.readdir_bios++; + OSAddAtomic(1, (SInt32*)&nfsstats.readdir_bios); uiop->uio_offset = NBOFF(bp); if (!(nmp->nm_flag & NFSMNT_NFSV3)) nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */ if (nmp->nm_flag & NFSMNT_RDIRPLUS) { - error = nfs_readdirplusrpc(vp, uiop, cr); + error = nfs_readdirplusrpc(vp, uiop, cr, p); if (error == NFSERR_NOTSUPP) nmp->nm_flag &= ~NFSMNT_RDIRPLUS; } if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) - error = nfs_readdirrpc(vp, uiop, cr); + error = nfs_readdirrpc(vp, uiop, cr, p); if (!error) { bp->nb_validoff = 0; bp->nb_validend = uiop->uio_offset - NBOFF(bp); @@ -2655,7 +2954,7 @@ nfs_doio(bp, cr, p) } break; default: - printf("nfs_doio: type %x unexpected\n", vp->v_type); + printf("nfs_doio: type %x unexpected\n", vtype); break; }; if (error) { @@ -2668,7 +2967,7 @@ nfs_doio(bp, cr, p) int doff, dend = 0; /* We need to make sure the pages are locked before doing I/O. 
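/*
 * Aside: the VREG read case above zero-fills the tail of a buffer when a
 * short read lands inside a hole that has not been pushed to the server.
 * The same computation as a standalone function; parameter names mirror
 * the patch, but this is a sketch, not an xnu API:
 */
#include <stdint.h>
#include <string.h>

static int
zero_fill_short_read(char *data, int bufsize, int resid,
    int64_t bufoff /* NBOFF(bp) */, uint64_t file_size /* np->n_size */)
{
    int diff = bufsize - resid;          /* bytes actually read */
    int64_t len = (int64_t)file_size - (bufoff + diff);

    if (len > 0) {
        if (len > resid)
            len = resid;
        memset(data + diff, 0, (size_t)len);    /* the bzero() above */
        return diff + (int)len;                 /* new nb_validend */
    }
    return diff;
}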
*/ - if (!ISSET(bp->nb_flags, NB_META) && UBCISVALID(vp)) { + if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(vp)) { if (!ISSET(bp->nb_flags, NB_PAGELIST)) { error = nfs_buf_upl_setup(bp); if (error) { @@ -2716,7 +3015,7 @@ nfs_doio(bp, cr, p) np->n_needcommitcnt--; CHECK_NEEDCOMMITCNT(np); } else if (error == NFSERR_STALEWRITEVERF) - nfs_clearcommit(vp->v_mount); + nfs_clearcommit(vnode_mount(vp)); } if (!error && bp->nb_dirtyend > 0) { @@ -2724,7 +3023,7 @@ nfs_doio(bp, cr, p) u_int32_t pagemask; int firstpg, lastpg; - if (NBOFF(bp) + bp->nb_dirtyend > np->n_size) + if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size) bp->nb_dirtyend = np->n_size - NBOFF(bp); NFS_BUF_MAP(bp); @@ -2747,7 +3046,7 @@ nfs_doio(bp, cr, p) while (dend < bp->nb_bufsize && NBPGDIRTY(bp,dend/PAGE_SIZE)) dend += PAGE_SIZE; /* make sure to keep dend clipped to EOF */ - if (NBOFF(bp) + dend > np->n_size) + if (NBOFF(bp) + dend > (off_t)np->n_size) dend = np->n_size - NBOFF(bp); /* calculate range of complete pages being written */ firstpg = round_page_32(doff) / PAGE_SIZE; @@ -2766,17 +3065,18 @@ nfs_doio(bp, cr, p) iomode = NFSV3WRITE_FILESYNC; /* write the dirty range */ - io.iov_len = uiop->uio_resid = dend - doff; + io.iov_len = dend - doff; + uio_uio_resid_set(uiop, io.iov_len); uiop->uio_offset = NBOFF(bp) + doff; - io.iov_base = (char *)bp->nb_data + doff; + io.iov_base = (uintptr_t) bp->nb_data + doff; uiop->uio_rw = UIO_WRITE; - nfsstats.write_bios++; + OSAddAtomic(1, (SInt32*)&nfsstats.write_bios); SET(bp->nb_flags, NB_WRITEINPROG); - error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); + error = nfs_writerpc(vp, uiop, cr, p, &iomode, &must_commit); if (must_commit) - nfs_clearcommit(vp->v_mount); + nfs_clearcommit(vnode_mount(vp)); /* clear dirty bits for pages we've written */ if (!error) bp->nb_dirty &= ~pagemask; @@ -2812,8 +3112,8 @@ nfs_doio(bp, cr, p) CLR(bp->nb_flags, NB_INVAL | NB_NOCACHE); if (!ISSET(bp->nb_flags, NB_DELWRI)) { SET(bp->nb_flags, NB_DELWRI); - nfs_nbdwrite++; - NFSBUFCNTCHK(); + OSAddAtomic(1, (SInt32*)&nfs_nbdwrite); + NFSBUFCNTCHK(0); } FSDBG(261, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, 0); @@ -2824,11 +3124,11 @@ nfs_doio(bp, cr, p) */ if (ISSET(bp->nb_flags, NB_ASYNC)) { /* move to dirty list */ - int s = splbio(); + lck_mtx_lock(nfs_buf_mutex); if (bp->nb_vnbufs.le_next != NFSNOLIST) LIST_REMOVE(bp, nb_vnbufs); LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); - splx(s); + lck_mtx_unlock(nfs_buf_mutex); } else { SET(bp->nb_flags, NB_EINTR); } @@ -2838,6 +3138,32 @@ nfs_doio(bp, cr, p) SET(bp->nb_flags, NB_ERROR); bp->nb_error = np->n_error = error; np->n_flag |= NWRITEERR; + /* + * There was a write error and we need to + * invalidate attrs and flush buffers in + * order to sync up with the server. + * (if this write was extending the file, + * we may no longer know the correct size) + * + * But we can't call vinvalbuf while holding + * this buffer busy. Set a flag to do it after + * releasing the buffer. + * + * Note we can only invalidate in this function + * if this is an async write and so the iodone + * below will release the buffer. Also, we + * shouldn't call vinvalbuf from nfsiod because + * that may deadlock waiting for the completion + * of writes that are queued up behind this one. 
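/*
 * Aside: the write path above clips nb_dirtyend to EOF, widens the dirty
 * byte range (doff, dend) to page boundaries where neighboring pages are
 * dirty, and records the complete pages covered in a 32-bit pagemask so
 * their per-page dirty bits can be cleared after the RPC.  A standalone
 * version of that mask computation; the lastpg rounding is inferred from
 * the visible hunks, so treat this as illustrative only:
 */
#include <stdint.h>

#define PGSZ 4096                               /* illustrative page size */
#define TRUNC_PG(x)  ((x) & ~(PGSZ - 1))
#define ROUND_PG(x)  (((x) + PGSZ - 1) & ~(PGSZ - 1))

static uint32_t
pages_written_mask(int doff, int dend)
{
    int firstpg, lastpg;
    uint32_t hi;

    if (TRUNC_PG(dend) <= ROUND_PG(doff))
        return 0;                        /* no complete page is covered */
    firstpg = ROUND_PG(doff) / PGSZ;     /* first complete page */
    lastpg  = (TRUNC_PG(dend) - 1) / PGSZ;  /* last complete page */
    hi = (lastpg >= 31) ? ~0u : ((1u << (lastpg + 1)) - 1u);
    return hi & ~((1u << firstpg) - 1u); /* bits firstpg..lastpg set */
}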
+ */ + if (ISSET(bp->nb_flags, NB_ASYNC) && + !ISSET(bp->nb_flags, NB_IOD)) { + invalidate = 1; + } else { + /* invalidate later */ + np->n_flag |= NNEEDINVALIDATE; + } + NATTRINVALIDATE(np); } /* clear the dirty range */ bp->nb_dirtyoff = bp->nb_dirtyend = 0; @@ -2846,9 +3172,9 @@ nfs_doio(bp, cr, p) if (!error && bp->nb_dirty) { /* there are pages marked dirty that need to be written out */ - int pg, cnt, npages, off, len; + int pg, count, npages, off; - nfsstats.write_bios++; + OSAddAtomic(1, (SInt32*)&nfsstats.write_bios); NFS_BUF_MAP(bp); @@ -2871,31 +3197,32 @@ nfs_doio(bp, cr, p) for (pg=0; pg < npages; pg++) { if (!NBPGDIRTY(bp,pg)) continue; - cnt = 1; - while (((pg+cnt) < npages) && NBPGDIRTY(bp,pg+cnt)) - cnt++; - /* write cnt pages starting with page pg */ + count = 1; + while (((pg+count) < npages) && NBPGDIRTY(bp,pg+count)) + count++; + /* write count pages starting with page pg */ off = pg * PAGE_SIZE; - len = cnt * PAGE_SIZE; + len = count * PAGE_SIZE; /* clip writes to EOF */ - if (NBOFF(bp) + off + len > np->n_size) + if (NBOFF(bp) + off + len > (off_t)np->n_size) len -= (NBOFF(bp) + off + len) - np->n_size; if (len > 0) { - io.iov_len = uiop->uio_resid = len; + io.iov_len = len; + uio_uio_resid_set(uiop, io.iov_len); uiop->uio_offset = NBOFF(bp) + off; - io.iov_base = (char *)bp->nb_data + off; - error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); + io.iov_base = (uintptr_t) bp->nb_data + off; + error = nfs_writerpc(vp, uiop, cr, p, &iomode, &must_commit); if (must_commit) - nfs_clearcommit(vp->v_mount); + nfs_clearcommit(vnode_mount(vp)); if (error) break; } /* clear dirty bits */ - while (cnt--) { + while (count--) { bp->nb_dirty &= ~(1 << pg); /* leave pg on last page */ - if (cnt) pg++; + if (count) pg++; } } if (!error) { @@ -2919,5 +3246,25 @@ nfs_doio(bp, cr, p) FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, error); nfs_buf_iodone(bp); + + if (invalidate) { + /* + * There was a write error and we need to + * invalidate attrs and flush buffers in + * order to sync up with the server. + * (if this write was extending the file, + * we may no longer know the correct size) + * + * But we couldn't call vinvalbuf while holding + * the buffer busy. So we call vinvalbuf() after + * releasing the buffer. + * + * Note: we don't bother calling nfs_vinvalbuf() if + * there's already a flush in progress. + */ + if (!(np->n_flag & NFLUSHINPROG)) + nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cr, p, 1); + } + return (error); } diff --git a/bsd/nfs/nfs_boot.c b/bsd/nfs/nfs_boot.c index 85de9123c..2eaf21273 100644 --- a/bsd/nfs/nfs_boot.c +++ b/bsd/nfs/nfs_boot.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
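/*
 * Aside: the loop just above (cnt renamed to count) scans nb_dirty, a
 * 32-bit per-page dirty bitmap, and coalesces adjacent dirty pages into
 * one write RPC per run, clipping the last run to EOF.  The run-finding
 * on its own, as a self-contained sketch:
 */
#include <stdint.h>
#include <stdio.h>

#define PGDIRTY(map, pg) (((map) >> (pg)) & 1u)

static void
write_dirty_runs(uint32_t dirty, int npages)
{
    for (int pg = 0; pg < npages; pg++) {
        int count;

        if (!PGDIRTY(dirty, pg))
            continue;
        count = 1;
        while (pg + count < npages && PGDIRTY(dirty, pg + count))
            count++;
        /* one write RPC covers pages [pg, pg + count) */
        printf("write pages %d..%d\n", pg, pg + count - 1);
        pg += count - 1;        /* skip past the run */
    }
}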
* * @APPLE_LICENSE_HEADER_START@ * @@ -92,12 +92,11 @@ #include <sys/conf.h> #include <sys/ioctl.h> #include <sys/proc.h> -#include <sys/mount.h> -#include <sys/mbuf.h> +#include <sys/mount_internal.h> +#include <sys/kpi_mbuf.h> #include <sys/malloc.h> #include <sys/socket.h> -#include <sys/reboot.h> #include <net/if.h> #include <net/if_dl.h> @@ -119,21 +118,15 @@ #include <libkern/libkern.h> -extern char *strchr(const char *str, int ch); #if NETHER == 0 -int nfs_boot_init(nd, procp) - struct nfs_diskless *nd; - struct proc *procp; +int nfs_boot_init(struct nfs_diskless *nd, proc_t procp) { panic("nfs_boot_init: no ether"); } -int nfs_boot_getfh(nd, procp, v3) - struct nfs_diskless *nd; - struct proc *procp; - int v3; +int nfs_boot_getfh(struct nfs_diskless *nd, proc_t procp, int v3, int sotype) { panic("nfs_boot_getfh: no ether"); } @@ -161,17 +154,17 @@ int nfs_boot_getfh(nd, procp, v3) */ /* bootparam RPC */ -static int bp_whoami __P((struct sockaddr_in *bpsin, - struct in_addr *my_ip, struct in_addr *gw_ip)); -static int bp_getfile __P((struct sockaddr_in *bpsin, char *key, - struct sockaddr_in *mdsin, char *servname, char *path)); +static int bp_whoami(struct sockaddr_in *bpsin, + struct in_addr *my_ip, struct in_addr *gw_ip); +static int bp_getfile(struct sockaddr_in *bpsin, const char *key, + struct sockaddr_in *mdsin, char *servname, char *path); /* mountd RPC */ -static int md_mount __P((struct sockaddr_in *mdsin, char *path, int v3, - u_char *fhp, u_long *fhlenp)); +static int md_mount(struct sockaddr_in *mdsin, char *path, int v3, int sotype, + u_char *fhp, u_long *fhlenp); /* other helpers */ -static int get_file_handle __P((struct nfs_dlmount *ndmntp)); +static int get_file_handle(struct nfs_dlmount *ndmntp); #define IP_FORMAT "%d.%d.%d.%d" @@ -190,9 +183,7 @@ netboot_rootpath(struct in_addr * server_ip, * Called with an empty nfs_diskless struct to be filled in. 
*/ int -nfs_boot_init(nd, procp) - struct nfs_diskless *nd; - struct proc *procp; +nfs_boot_init(struct nfs_diskless *nd, __unused proc_t procp) { struct sockaddr_in bp_sin; boolean_t do_bpwhoami = TRUE; @@ -201,15 +192,24 @@ nfs_boot_init(nd, procp) struct in_addr my_ip; struct sockaddr_in * sin_p; + /* make sure mbuf constants are set up */ + if (!nfs_mbuf_mlen) + nfs_mbuf_init(); + /* by this point, networking must already have been configured */ if (netboot_iaddr(&my_ip) == FALSE) { printf("nfs_boot: networking is not initialized\n"); error = ENXIO; - goto failed_noswitch; + goto failed; } /* get the root path information */ MALLOC_ZONE(nd->nd_root.ndm_path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (!nd->nd_root.ndm_path) { + printf("nfs_boot: can't allocate root path buffer\n"); + error = ENOMEM; + goto failed; + } sin_p = &nd->nd_root.ndm_saddr; bzero((caddr_t)sin_p, sizeof(*sin_p)); sin_p->sin_len = sizeof(*sin_p); @@ -222,8 +222,6 @@ nfs_boot_init(nd, procp) } nd->nd_private.ndm_saddr.sin_addr.s_addr = 0; - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - if (do_bpwhoami) { struct in_addr router; /* @@ -261,6 +259,11 @@ nfs_boot_init(nd, procp) #if !defined(NO_MOUNT_PRIVATE) if (do_bpgetfile) { /* get private path */ MALLOC_ZONE(nd->nd_private.ndm_path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (!nd->nd_private.ndm_path) { + printf("nfs_boot: can't allocate private path buffer\n"); + error = ENOMEM; + goto failed; + } error = bp_getfile(&bp_sin, "private", &nd->nd_private.ndm_saddr, nd->nd_private.ndm_host, @@ -269,6 +272,11 @@ nfs_boot_init(nd, procp) char * check_path = NULL; MALLOC_ZONE(check_path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (!check_path) { + printf("nfs_boot: can't allocate check_path buffer\n"); + error = ENOMEM; + goto failed; + } snprintf(check_path, MAXPATHLEN, "%s/private", nd->nd_root.ndm_path); if ((nd->nd_root.ndm_saddr.sin_addr.s_addr == nd->nd_private.ndm_saddr.sin_addr.s_addr) @@ -288,8 +296,6 @@ nfs_boot_init(nd, procp) } #endif /* NO_MOUNT_PRIVATE */ failed: - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); -failed_noswitch: return (error); } @@ -298,16 +304,12 @@ failed_noswitch: * with file handles to be filled in. */ int -nfs_boot_getfh(nd, procp, v3) - struct nfs_diskless *nd; - struct proc *procp; - int v3; +nfs_boot_getfh(struct nfs_diskless *nd, __unused proc_t procp, int v3, int sotype) { int error = 0; - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - nd->nd_root.ndm_nfsv3 = v3; + nd->nd_root.ndm_sotype = sotype; error = get_file_handle(&nd->nd_root); if (error) { printf("nfs_boot: get_file_handle(v%d) root failed, %d\n", @@ -319,6 +321,7 @@ nfs_boot_getfh(nd, procp, v3) if (nd->nd_private.ndm_saddr.sin_addr.s_addr) { /* get private file handle */ nd->nd_private.ndm_nfsv3 = v3; + nd->nd_private.ndm_sotype = sotype; error = get_file_handle(&nd->nd_private); if (error) { printf("nfs_boot: get_file_handle(v%d) private failed, %d\n", @@ -327,8 +330,7 @@ nfs_boot_getfh(nd, procp, v3) } } #endif /* NO_MOUNT_PRIVATE */ - failed: - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); +failed: return (error); } @@ -344,7 +346,7 @@ get_file_handle(ndmntp) * using RPC to mountd/mount */ error = md_mount(&ndmntp->ndm_saddr, ndmntp->ndm_path, ndmntp->ndm_nfsv3, - ndmntp->ndm_fh, &ndmntp->ndm_fhlen); + ndmntp->ndm_sotype, ndmntp->ndm_fh, &ndmntp->ndm_fhlen); if (error) return (error); @@ -365,23 +367,25 @@ get_file_handle(ndmntp) * Get an mbuf with the given length, and * initialize the pkthdr length field. 
*/ -static struct mbuf * -m_get_len(int msg_len) +static int +mbuf_get_with_len(int msg_len, mbuf_t *m) { - struct mbuf *m; - m = m_gethdr(M_WAIT, MT_DATA); - if (m == NULL) - return NULL; - if (msg_len > MHLEN) { - if (msg_len > MCLBYTES) + int error; + error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, m); + if (error) + return (error); + if (msg_len > mbuf_maxlen(*m)) { + error = mbuf_mclget(MBUF_WAITOK, MBUF_TYPE_DATA, m); + if (error) { + mbuf_freem(*m); + return (error); + } + if (msg_len > mbuf_maxlen(*m)) panic("nfs_boot: msg_len > MCLBYTES"); - MCLGET(m, M_WAIT); - if (m == NULL) - return NULL; } - m->m_len = msg_len; - m->m_pkthdr.len = m->m_len; - return (m); + mbuf_setlen(*m, msg_len); + mbuf_pkthdr_setlen(*m, msg_len); + return (0); } @@ -438,8 +442,8 @@ bp_whoami(bpsin, my_ip, gw_ip) struct rpc_string *str; struct bp_inaddr *bia; - struct mbuf *m; - struct sockaddr_in *sin; + mbuf_t m; + struct sockaddr_in sin; int error, msg_len; int cn_len, dn_len; u_char *p; @@ -449,14 +453,14 @@ bp_whoami(bpsin, my_ip, gw_ip) * Get message buffer of sufficient size. */ msg_len = sizeof(*call); - m = m_get_len(msg_len); - if (m == NULL) - return ENOBUFS; + error = mbuf_get_with_len(msg_len, &m); + if (error) + return error; /* * Build request message for PMAPPROC_CALLIT. */ - call = mtod(m, struct whoami_call *); + call = mbuf_data(m); call->call_prog = htonl(BOOTPARAM_PROG); call->call_vers = htonl(BOOTPARAM_VERS); call->call_proc = htonl(BOOTPARAM_WHOAMI); @@ -474,32 +478,31 @@ bp_whoami(bpsin, my_ip, gw_ip) /* RPC: portmap/callit */ bpsin->sin_port = htons(PMAPPORT); - error = krpc_call(bpsin, PMAPPROG, PMAPVERS, - PMAPPROC_CALLIT, &m, &sin); + error = krpc_call(bpsin, SOCK_DGRAM, PMAPPROG, PMAPVERS, PMAPPROC_CALLIT, &m, &sin); if (error) return error; /* * Parse result message. */ - msg_len = m->m_len; - lp = mtod(m, long *); + msg_len = mbuf_len(m); + lp = mbuf_data(m); /* bootparam server port (also grab from address). 
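/*
 * Aside: the bp_whoami() parsing around this point, and bp_getfile()
 * below, walk an XDR-style reply by checking the remaining length before
 * each field; the (int)sizeof casts these hunks add matter because
 * msg_len is a signed count that must not be promoted to unsigned in the
 * comparison.  A distilled, self-contained version of the bounds-checked
 * string walk (the struct mirrors the patch's rpc_string; names are
 * illustrative):
 */
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>          /* ntohl() */

struct rpc_string {
    uint32_t len;               /* big-endian length */
    char data[];                /* len bytes, padded to a 4-byte boundary */
};
#define RPC_STR_PAD(n)  (((n) + 3) & ~3)
#define RPC_STR_SIZE(n) ((int)sizeof(uint32_t) + RPC_STR_PAD(n))

/* returns the string length, or -1 if the buffer is too short/corrupt */
static int
parse_rpc_string(const uint8_t **pp, int *msg_len, char *out, int outmax)
{
    const struct rpc_string *str;
    int slen;

    if (*msg_len < (int)sizeof(uint32_t))   /* the (int) cast matters: */
        return -1;                          /* msg_len may be negative */
    str = (const struct rpc_string *)*pp;
    slen = (int)ntohl(str->len);
    if (slen < 0 || slen >= outmax || *msg_len < RPC_STR_SIZE(slen))
        return -1;
    memcpy(out, str->data, (size_t)slen);
    out[slen] = '\0';
    *pp += RPC_STR_SIZE(slen);
    *msg_len -= RPC_STR_SIZE(slen);
    return slen;
}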
*/ - if (msg_len < sizeof(*lp)) + if (msg_len < (int)sizeof(*lp)) goto bad; msg_len -= sizeof(*lp); bpsin->sin_port = htons((short)ntohl(*lp++)); - bpsin->sin_addr.s_addr = sin->sin_addr.s_addr; + bpsin->sin_addr.s_addr = sin.sin_addr.s_addr; /* length of encapsulated results */ - if (msg_len < (ntohl(*lp) + sizeof(*lp))) + if (msg_len < (ntohl(*lp) + (int)sizeof(*lp))) goto bad; msg_len = ntohl(*lp++); p = (char*)lp; /* client name */ - if (msg_len < sizeof(*str)) + if (msg_len < (int)sizeof(*str)) goto bad; str = (struct rpc_string *)p; cn_len = ntohl(str->len); @@ -514,7 +517,7 @@ bp_whoami(bpsin, my_ip, gw_ip) msg_len -= RPC_STR_SIZE(cn_len); /* domain name */ - if (msg_len < sizeof(*str)) + if (msg_len < (int)sizeof(*str)) goto bad; str = (struct rpc_string *)p; dn_len = ntohl(str->len); @@ -529,7 +532,7 @@ bp_whoami(bpsin, my_ip, gw_ip) msg_len -= RPC_STR_SIZE(dn_len); /* gateway address */ - if (msg_len < sizeof(*bia)) + if (msg_len < (int)sizeof(*bia)) goto bad; bia = (struct bp_inaddr *)p; if (bia->atype != htonl(1)) @@ -546,10 +549,7 @@ bad: error = EBADRPC; out: - if (sin) - FREE(sin, M_SONAME); - - m_freem(m); + mbuf_freem(m); return(error); } @@ -564,13 +564,13 @@ out: static int bp_getfile(bpsin, key, md_sin, serv_name, pathname) struct sockaddr_in *bpsin; - char *key; + const char *key; struct sockaddr_in *md_sin; char *serv_name; char *pathname; { struct rpc_string *str; - struct mbuf *m; + mbuf_t m; struct bp_inaddr *bia; struct sockaddr_in *sin; u_char *p, *q; @@ -585,14 +585,14 @@ bp_getfile(bpsin, key, md_sin, serv_name, pathname) msg_len = 0; msg_len += RPC_STR_SIZE(cn_len); msg_len += RPC_STR_SIZE(key_len); - m = m_get_len(msg_len); - if (m == NULL) - return ENOBUFS; + error = mbuf_get_with_len(msg_len, &m); + if (error) + return error; /* * Build request message. */ - p = mtod(m, u_char *); + p = mbuf_data(m); bzero(p, msg_len); /* client name (hostname) */ str = (struct rpc_string *)p; @@ -605,7 +605,7 @@ bp_getfile(bpsin, key, md_sin, serv_name, pathname) bcopy(key, str->data, key_len); /* RPC: bootparam/getfile */ - error = krpc_call(bpsin, BOOTPARAM_PROG, BOOTPARAM_VERS, + error = krpc_call(bpsin, SOCK_DGRAM, BOOTPARAM_PROG, BOOTPARAM_VERS, BOOTPARAM_GETFILE, &m, NULL); if (error) return error; @@ -613,11 +613,11 @@ bp_getfile(bpsin, key, md_sin, serv_name, pathname) /* * Parse result message. */ - p = mtod(m, u_char *); - msg_len = m->m_len; + p = mbuf_data(m); + msg_len = mbuf_len(m); /* server name */ - if (msg_len < sizeof(*str)) + if (msg_len < (int)sizeof(*str)) goto bad; str = (struct rpc_string *)p; sn_len = ntohl(str->len); @@ -631,7 +631,7 @@ bp_getfile(bpsin, key, md_sin, serv_name, pathname) msg_len -= RPC_STR_SIZE(sn_len); /* server IP address (mountd) */ - if (msg_len < sizeof(*bia)) + if (msg_len < (int)sizeof(*bia)) goto bad; bia = (struct bp_inaddr *)p; if (bia->atype != htonl(1)) @@ -649,7 +649,7 @@ bp_getfile(bpsin, key, md_sin, serv_name, pathname) msg_len -= sizeof(*bia); /* server pathname */ - if (msg_len < sizeof(*str)) + if (msg_len < (int)sizeof(*str)) goto bad; str = (struct rpc_string *)p; path_len = ntohl(str->len); @@ -666,7 +666,7 @@ bad: error = EBADRPC; out: - m_freem(m); + mbuf_freem(m); return(0); } @@ -677,10 +677,11 @@ out: * Also, sets sin->sin_port to the NFS service port. 
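/*
 * Aside: md_mount(), just below, decodes a mount reply that is a status
 * word followed by either a fixed 32-byte NFSv2 filehandle or a
 * length-prefixed NFSv3 filehandle of at most 64 bytes.  A user-space
 * sketch of the same decoding under those size assumptions (not the xnu
 * code itself):
 */
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define FHSIZE_V2      32       /* NFSX_V2FH */
#define FHSIZE_V3_MAX  64       /* NFSX_V3FHMAX */

static int
parse_mount_reply(const uint8_t *buf, int mlen, int v3,
    uint8_t *fhp, uint32_t *fhlenp)
{
    uint32_t status, fhlen;

    if (mlen < 4)
        return -1;                      /* EBADRPC analog */
    memcpy(&status, buf, 4);
    if ((status = ntohl(status)) != 0)
        return (int)status;             /* server-reported errno */
    if (v3) {
        if (mlen < 8)
            return -1;
        memcpy(&fhlen, buf + 4, 4);
        fhlen = ntohl(fhlen);
        if (fhlen > FHSIZE_V3_MAX || mlen < (int)(8 + fhlen))
            return -1;
        memcpy(fhp, buf + 8, fhlen);
        *fhlenp = fhlen;
    } else {
        if (mlen < 4 + FHSIZE_V2)
            return -1;
        memcpy(fhp, buf + 4, FHSIZE_V2);
        *fhlenp = FHSIZE_V2;
    }
    return 0;
}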
*/ static int -md_mount(mdsin, path, v3, fhp, fhlenp) +md_mount(mdsin, path, v3, sotype, fhp, fhlenp) struct sockaddr_in *mdsin; /* mountd server address */ char *path; int v3; + int sotype; u_char *fhp; u_long *fhlenp; { @@ -690,28 +691,38 @@ md_mount(mdsin, path, v3, fhp, fhlenp) u_long errno; u_char data[NFSX_V3FHMAX + sizeof(u_long)]; } *rdata; - struct mbuf *m; + mbuf_t m; int error, mlen, slen; int mntversion = v3 ? RPCMNT_VER3 : RPCMNT_VER1; + int proto = (sotype == SOCK_STREAM) ? IPPROTO_TCP : IPPROTO_UDP; + in_port_t mntport, nfsport; /* Get port number for MOUNTD. */ - error = krpc_portmap(mdsin, RPCPROG_MNT, mntversion, - &mdsin->sin_port); - if (error) return error; + error = krpc_portmap(mdsin, RPCPROG_MNT, mntversion, proto, &mntport); + if (error) + return error; + + /* Get port number for NFS use. */ + /* (If NFS/proto unavailable, don't bother with the mount call) */ + error = krpc_portmap(mdsin, NFS_PROG, v3 ? NFS_VER3 : NFS_VER2, proto, &nfsport); + if (error) + return error; + + /* Set port number for MOUNTD */ + mdsin->sin_port = mntport; slen = strlen(path); mlen = RPC_STR_SIZE(slen); - m = m_get_len(mlen); - if (m == NULL) - return ENOBUFS; - str = mtod(m, struct rpc_string *); + error = mbuf_get_with_len(mlen, &m); + if (error) + return error; + str = mbuf_data(m); str->len = htonl(slen); bcopy(path, str->data, slen); /* Do RPC to mountd. */ - error = krpc_call(mdsin, RPCPROG_MNT, mntversion, - RPCMNT_MOUNT, &m, NULL); + error = krpc_call(mdsin, sotype, RPCPROG_MNT, mntversion, RPCMNT_MOUNT, &m, NULL); if (error) return error; /* message already freed */ @@ -720,41 +731,40 @@ md_mount(mdsin, path, v3, fhp, fhlenp) * + a v2 filehandle * + a v3 filehandle length + a v3 filehandle */ - mlen = m->m_len; - if (mlen < sizeof(u_long)) + mlen = mbuf_len(m); + if (mlen < (int)sizeof(u_long)) goto bad; - rdata = mtod(m, struct rdata *); + rdata = mbuf_data(m); error = ntohl(rdata->errno); if (error) goto out; if (v3) { u_long fhlen; u_char *fh; - if (mlen < sizeof(u_long)*2) + if (mlen < (int)sizeof(u_long)*2) goto bad; fhlen = ntohl(*(u_long*)rdata->data); fh = rdata->data + sizeof(u_long); - if (mlen < (sizeof(u_long)*2 + fhlen)) + if (mlen < (int)(sizeof(u_long)*2 + fhlen)) goto bad; bcopy(fh, fhp, fhlen); *fhlenp = fhlen; } else { - if (mlen < (sizeof(u_long) + NFSX_V2FH)) + if (mlen < ((int)sizeof(u_long) + NFSX_V2FH)) goto bad; bcopy(rdata->data, fhp, NFSX_V2FH); *fhlenp = NFSX_V2FH; } /* Set port number for NFS use. */ - error = krpc_portmap(mdsin, NFS_PROG, v3 ? NFS_VER3 : NFS_VER2, - &mdsin->sin_port); + mdsin->sin_port = nfsport; goto out; bad: error = EBADRPC; out: - m_freem(m); + mbuf_freem(m); return error; } diff --git a/bsd/nfs/nfs_lock.c b/bsd/nfs/nfs_lock.c index 398e6f868..762c140b0 100644 --- a/bsd/nfs/nfs_lock.c +++ b/bsd/nfs/nfs_lock.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2002-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -54,22 +54,20 @@ #include <sys/systm.h> #include <sys/fcntl.h> #include <sys/kernel.h> /* for hz */ -#include <sys/file.h> -#include <sys/lock.h> +#include <sys/file_internal.h> #include <sys/malloc.h> #include <sys/lockf.h> /* for hz */ /* Must come after sys/malloc.h */ -#include <sys/mbuf.h> -#include <sys/mount.h> -#include <sys/namei.h> -#include <sys/proc.h> +#include <sys/kpi_mbuf.h> +#include <sys/mount_internal.h> +#include <sys/proc_internal.h> /* for p_start */ +#include <sys/kauth.h> #include <sys/resourcevar.h> #include <sys/socket.h> -#include <sys/socket.h> #include <sys/unistd.h> #include <sys/user.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> -#include <kern/thread_act.h> +#include <kern/thread.h> #include <machine/limits.h> @@ -84,17 +82,12 @@ #define OFF_MAX QUAD_MAX -uint64_t nfsadvlocks = 0; -struct timeval nfsadvlock_longest = {0, 0}; -struct timeval nfsadvlocks_time = {0, 0}; - /* * globals for managing the lockd fifo */ -pid_t nfslockdpid = 0; -struct file *nfslockdfp = 0; +vnode_t nfslockdvnode = 0; int nfslockdwaiting = 0; -int nfslockdfifowritten = 0; +time_t nfslockdstarttimeout = 0; int nfslockdfifolock = 0; #define NFSLOCKDFIFOLOCK_LOCKED 1 #define NFSLOCKDFIFOLOCK_WANT 2 @@ -303,20 +296,21 @@ nfs_lockxid_get(void) * (Also, if adding, try to clean up some stale entries.) */ static int -nfs_lock_pid_check(struct proc *p, int addflag, struct vnode *vp) +nfs_lock_pid_check(proc_t p, int addflag, vnode_t vp) { struct nfs_lock_pid *lp, *lplru, *lplru_next; - struct proc *plru; + proc_t plru; int error = 0; struct timeval now; /* lock hash */ loop: if (nfs_lock_pid_lock) { + struct nfsmount *nmp = VFSTONFS(vnode_mount(vp)); while (nfs_lock_pid_lock) { nfs_lock_pid_lock = -1; tsleep(&nfs_lock_pid_lock, PCATCH, "nfslockpid", 0); - if ((error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p))) + if ((error = nfs_sigintr(nmp, NULL, p))) return (error); } goto loop; @@ -325,9 +319,9 @@ loop: /* Search hash chain */ error = ENOENT; - lp = NFS_LOCK_PID_HASH(p->p_pid)->lh_first; + lp = NFS_LOCK_PID_HASH(proc_pid(p))->lh_first; for (; lp != NULL; lp = lp->lp_hash.le_next) - if (lp->lp_pid == p->p_pid) { + if (lp->lp_pid == proc_pid(p)) { /* found pid... */ if (timevalcmp(&lp->lp_pid_start, &p->p_stats->p_start, ==)) { /* ...and it's valid */ @@ -390,15 +384,19 @@ loop: MALLOC(lp, struct nfs_lock_pid *, sizeof(struct nfs_lock_pid), M_TEMP, M_WAITOK | M_ZERO); } - /* (re)initialize nfs_lock_pid info */ - lp->lp_pid = p->p_pid; - lp->lp_pid_start = p->p_stats->p_start; - /* insert pid in hash */ - LIST_INSERT_HEAD(NFS_LOCK_PID_HASH(lp->lp_pid), lp, lp_hash); - lp->lp_valid = 1; - lp->lp_time = now.tv_sec; - TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru); - error = 0; + if (!lp) { + error = ENOMEM; + } else { + /* (re)initialize nfs_lock_pid info */ + lp->lp_pid = proc_pid(p); + lp->lp_pid_start = p->p_stats->p_start; + /* insert pid in hash */ + LIST_INSERT_HEAD(NFS_LOCK_PID_HASH(lp->lp_pid), lp, lp_hash); + lp->lp_valid = 1; + lp->lp_time = now.tv_sec; + TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru); + error = 0; + } } /* unlock hash */ @@ -417,42 +415,46 @@ loop: * NFS advisory byte-level locks. 
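/*
 * Aside: nfs_lock_pid_check() above identifies a lock owner by the pair
 * (pid, process start time), so a recycled pid is not mistaken for the
 * dead process that used to hold locks.  The core comparison, isolated
 * (timevalcmp written out; illustrative types):
 */
#include <stdbool.h>
#include <sys/time.h>
#include <unistd.h>

struct lock_owner {
    pid_t          pid;
    struct timeval started;     /* p_stats->p_start analog */
};

static bool
same_process(const struct lock_owner *lo, pid_t pid,
    const struct timeval *start)
{
    return lo->pid == pid &&
        lo->started.tv_sec == start->tv_sec &&
        lo->started.tv_usec == start->tv_usec;
}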
*/ int -nfs_dolock(struct vop_advlock_args *ap) -/* struct vop_advlock_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - caddr_t a_id; - int a_op; - struct flock *a_fl; - int a_flags; +nfs_dolock(struct vnop_advlock_args *ap) +/* struct vnop_advlock_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + vfs_context_t a_context; }; */ { LOCKD_MSG_REQUEST msgreq; LOCKD_MSG *msg; - struct vnode *vp, *wvp; + vnode_t vp, wvp; struct nfsnode *np; int error, error1; struct flock *fl; int fmode, ioflg; - struct proc *p; struct nfsmount *nmp; - struct vattr vattr; + struct nfs_vattr nvattr; off_t start, end; struct timeval now; int timeo, endtime, lastmsg, wentdown = 0; int lockpidcheck; + kauth_cred_t cred; + proc_t p; + struct sockaddr *saddr; - p = current_proc(); + p = vfs_context_proc(ap->a_context); + cred = vfs_context_ucred(ap->a_context); vp = ap->a_vp; fl = ap->a_fl; np = VTONFS(vp); - nmp = VFSTONFS(vp->v_mount); + nmp = VFSTONFS(vnode_mount(vp)); if (!nmp) return (ENXIO); if (nmp->nm_flag & NFSMNT_NOLOCKS) - return (EOPNOTSUPP); + return (ENOTSUP); /* * The NLM protocol doesn't allow the server to return an error @@ -474,11 +476,11 @@ nfs_dolock(struct vop_advlock_args *ap) return (EINVAL); } /* - * If daemon is running take a ref on its fifo + * If daemon is running take a ref on its fifo vnode */ - if (!nfslockdfp || !(wvp = (struct vnode *)nfslockdfp->f_data)) { - if (!nfslockdwaiting) - return (EOPNOTSUPP); + if (!(wvp = nfslockdvnode)) { + if (!nfslockdwaiting && !nfslockdstarttimeout) + return (ENOTSUP); /* * Don't wake lock daemon if it hasn't been started yet and * this is an unlock request (since we couldn't possibly @@ -486,17 +488,45 @@ nfs_dolock(struct vop_advlock_args *ap) * uninformed unlock request due to closef()'s behavior of doing * unlocks on all files if a process has had a lock on ANY file. */ - if (!nfslockdfp && (fl->l_type == F_UNLCK)) + if (!nfslockdvnode && (fl->l_type == F_UNLCK)) return (EINVAL); - /* wake up lock daemon */ - (void)wakeup((void *)&nfslockdwaiting); - /* wait on nfslockdfp for a while to allow daemon to start */ - tsleep((void *)&nfslockdfp, PCATCH | PUSER, "lockd", 60*hz); - /* check for nfslockdfp and f_data */ - if (!nfslockdfp || !(wvp = (struct vnode *)nfslockdfp->f_data)) - return (EOPNOTSUPP); + microuptime(&now); + if (nfslockdwaiting) { + /* wake up lock daemon */ + nfslockdstarttimeout = now.tv_sec + 60; + (void)wakeup((void *)&nfslockdwaiting); + } + /* wait on nfslockdvnode for a while to allow daemon to start */ + while (!nfslockdvnode && (now.tv_sec < nfslockdstarttimeout)) { + error = tsleep((void *)&nfslockdvnode, PCATCH | PUSER, "lockdstart", 2*hz); + if (error && (error != EWOULDBLOCK)) + return (error); + /* check that we still have our mount... */ + /* ...and that we still support locks */ + nmp = VFSTONFS(vnode_mount(vp)); + if (!nmp) + return (ENXIO); + if (nmp->nm_flag & NFSMNT_NOLOCKS) + return (ENOTSUP); + if (!error) + break; + microuptime(&now); + } + /* + * check for nfslockdvnode + * If it hasn't started by now, there's a problem. + */ + if (!(wvp = nfslockdvnode)) + return (ENOTSUP); + } + error = vnode_getwithref(wvp); + if (error) + return (ENOTSUP); + error = vnode_ref(wvp); + if (error) { + vnode_put(wvp); + return (ENOTSUP); } - VREF(wvp); /* * Need to check if this process has successfully acquired an NFS lock before. 
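/*
 * Aside: the fifo handling above takes both a short-term I/O reference
 * (vnode_getwithref/vnode_put) and a long-term use reference
 * (vnode_ref/vnode_rele), and every exit path in nfs_dolock() now drops
 * both.  A toy two-counter model of that pairing discipline follows; it
 * is illustrative only, the real semantics live in xnu's vnode KPI:
 */
#include <assert.h>

struct handle {
    int iocount;    /* held across each I/O */
    int usecount;   /* held while the reference stays cached */
};

static int  handle_get(struct handle *h)  { h->iocount++;  return 0; }
static void handle_put(struct handle *h)  { h->iocount--;  assert(h->iocount >= 0); }
static int  handle_ref(struct handle *h)  { h->usecount++; return 0; }
static void handle_rele(struct handle *h) { h->usecount--; assert(h->usecount >= 0); }

static int
use_fifo(struct handle *fifo, int (*do_io)(struct handle *))
{
    int error;

    if ((error = handle_get(fifo)))     /* vnode_getwithref() */
        return error;
    if ((error = handle_ref(fifo))) {   /* vnode_ref() */
        handle_put(fifo);               /* unwind on failure */
        return error;
    }
    error = do_io(fifo);
    handle_rele(fifo);                  /* vnode_rele() */
    handle_put(fifo);                   /* vnode_put() */
    return error;
}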
@@ -504,10 +534,14 @@ nfs_dolock(struct vop_advlock_args *ap) */ lockpidcheck = nfs_lock_pid_check(p, 0, vp); if (lockpidcheck) { - if (lockpidcheck != ENOENT) + if (lockpidcheck != ENOENT) { + vnode_rele(wvp); + vnode_put(wvp); return (lockpidcheck); + } if (ap->a_op == F_UNLCK) { - vrele(wvp); + vnode_rele(wvp); + vnode_put(wvp); return (0); } } @@ -532,23 +566,27 @@ nfs_dolock(struct vop_advlock_args *ap) /* need to flush, and refetch attributes to make */ /* sure we have the correct end of file offset */ if (np->n_flag & NMODIFIED) { - np->n_xid = 0; - error = nfs_vinvalbuf(vp, V_SAVE, p->p_ucred, p, 1); + NATTRINVALIDATE(np); + error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) { - vrele(wvp); + vnode_rele(wvp); + vnode_put(wvp); return (error); } } - np->n_xid = 0; - error = VOP_GETATTR(vp, &vattr, p->p_ucred, p); + NATTRINVALIDATE(np); + + error = nfs_getattr(vp, &nvattr, cred, p); if (error) { - vrele(wvp); + vnode_rele(wvp); + vnode_put(wvp); return (error); } start = np->n_size + fl->l_start; break; default: - vrele(wvp); + vnode_rele(wvp); + vnode_put(wvp); return (EINVAL); } if (fl->l_len == 0) @@ -560,12 +598,14 @@ nfs_dolock(struct vop_advlock_args *ap) start += fl->l_len; } if (start < 0) { - vrele(wvp); + vnode_rele(wvp); + vnode_put(wvp); return (EINVAL); } if (!NFS_ISV3(vp) && ((start >= 0x80000000) || (end >= 0x80000000))) { - vrele(wvp); + vnode_rele(wvp); + vnode_put(wvp); return (EINVAL); } @@ -583,37 +623,40 @@ nfs_dolock(struct vop_advlock_args *ap) msg->lm_fl.l_start = start; if (end != -1) msg->lm_fl.l_len = end - start + 1; - msg->lm_fl.l_pid = p->p_pid; + msg->lm_fl.l_pid = proc_pid(p); if (ap->a_flags & F_WAIT) msg->lm_flags |= LOCKD_MSG_BLOCK; if (ap->a_op == F_GETLK) msg->lm_flags |= LOCKD_MSG_TEST; - nmp = VFSTONFS(vp->v_mount); + nmp = VFSTONFS(vnode_mount(vp)); if (!nmp) { - vrele(wvp); + vnode_rele(wvp); + vnode_put(wvp); return (ENXIO); } - bcopy(mtod(nmp->nm_nam, struct sockaddr *), &msg->lm_addr, - min(sizeof msg->lm_addr, - mtod(nmp->nm_nam, struct sockaddr *)->sa_len)); + saddr = mbuf_data(nmp->nm_nam); + bcopy(saddr, &msg->lm_addr, min(sizeof msg->lm_addr, saddr->sa_len)); msg->lm_fh_len = NFS_ISV3(vp) ? 
VTONFS(vp)->n_fhsize : NFSX_V2FH; bcopy(VTONFS(vp)->n_fhp, msg->lm_fh, msg->lm_fh_len); if (NFS_ISV3(vp)) msg->lm_flags |= LOCKD_MSG_NFSV3; - cru2x(p->p_ucred, &msg->lm_cred); + cru2x(cred, &msg->lm_cred); microuptime(&now); lastmsg = now.tv_sec - ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay)); fmode = FFLAGS(O_WRONLY); - if ((error = VOP_OPEN(wvp, fmode, kernproc->p_ucred, p))) { - vrele(wvp); + if ((error = VNOP_OPEN(wvp, fmode, ap->a_context))) { + vnode_rele(wvp); + vnode_put(wvp); return (error); } + vnode_lock(wvp); ++wvp->v_writecount; + vnode_unlock(wvp); /* allocate unique xid */ msg->lm_xid = nfs_lockxid_get(); @@ -623,8 +666,6 @@ nfs_dolock(struct vop_advlock_args *ap) #define IO_NOMACCHECK 0; ioflg = IO_UNIT | IO_NOMACCHECK; for (;;) { - VOP_LEASE(wvp, p, kernproc->p_ucred, LEASE_WRITE); - error = 0; while (nfslockdfifolock & NFSLOCKDFIFOLOCK_LOCKED) { nfslockdfifolock |= NFSLOCKDFIFOLOCK_WANT; @@ -638,18 +679,13 @@ nfs_dolock(struct vop_advlock_args *ap) nfslockdfifolock |= NFSLOCKDFIFOLOCK_LOCKED; error = vn_rdwr(UIO_WRITE, wvp, (caddr_t)msg, sizeof(*msg), 0, - UIO_SYSSPACE, ioflg, kernproc->p_ucred, NULL, p); - - nfslockdfifowritten = 1; + UIO_SYSSPACE32, ioflg, proc_ucred(kernproc), NULL, p); nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_LOCKED; if (nfslockdfifolock & NFSLOCKDFIFOLOCK_WANT) { nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_WANT; wakeup((void *)&nfslockdfifolock); } - /* wake up lock daemon */ - if (nfslockdwaiting) - (void)wakeup((void *)&nfslockdwaiting); if (error && (((ioflg & IO_NDELAY) == 0) || error != EAGAIN)) { break; @@ -694,14 +730,14 @@ wait_for_granted: break; /* check that we still have our mount... */ /* ...and that we still support locks */ - nmp = VFSTONFS(vp->v_mount); + nmp = VFSTONFS(vnode_mount(vp)); if (!nmp || (nmp->nm_flag & NFSMNT_NOLOCKS)) break; /* * If the mount is hung and we've requested not to hang * on remote filesystems, then bail now. */ - if ((p != NULL) && ((p->p_flag & P_NOREMOTEHANG) != 0) && + if ((p != NULL) && ((proc_noremotehang(p)) != 0) && ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_LOCKTIMEO)) != 0)) { if (fl->l_type == F_UNLCK) printf("nfs_dolock: aborting unlock request " @@ -713,7 +749,7 @@ wait_for_granted: } if (error) { /* check that we still have our mount... */ - nmp = VFSTONFS(vp->v_mount); + nmp = VFSTONFS(vnode_mount(vp)); if (!nmp) { if (error == EWOULDBLOCK) error = ENXIO; @@ -722,14 +758,14 @@ wait_for_granted: /* ...and that we still support locks */ if (nmp->nm_flag & NFSMNT_NOLOCKS) { if (error == EWOULDBLOCK) - error = EOPNOTSUPP; + error = ENOTSUP; break; } - if ((error == EOPNOTSUPP) && + if ((error == ENOTSUP) && (nmp->nm_state & NFSSTA_LOCKSWORK)) { /* * We have evidence that locks work, yet lockd - * returned EOPNOTSUPP. This is probably because + * returned ENOTSUP. This is probably because * it was unable to contact the server's lockd to * send it the request. * @@ -765,7 +801,7 @@ wait_for_granted: * If the mount is hung and we've requested not to hang * on remote filesystems, then bail now. 
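/*
 * Aside: the wait loop above only emits "lockd not responding" via
 * nfs_down() once nm_tprintf_delay seconds have elapsed since the last
 * message, and the reply path just below calls nfs_up() to announce
 * recovery exactly once.  The throttling on its own (illustrative names,
 * user-space sketch):
 */
#include <stdio.h>
#include <time.h>

static time_t last_msg;
static int    went_down;

static void
maybe_warn(time_t now, time_t delay)
{
    if (delay != 0 && last_msg + delay < now) {
        last_msg = now;
        fprintf(stderr, "lockd not responding\n");  /* nfs_down() analog */
        went_down = 1;
    }
}

static void
got_reply(void)
{
    if (went_down)
        fprintf(stderr, "lockd alive again\n");     /* nfs_up() analog */
    went_down = 0;
}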
*/ - if ((p != NULL) && ((p->p_flag & P_NOREMOTEHANG) != 0) && + if ((p != NULL) && ((proc_noremotehang(p)) != 0) && ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_LOCKTIMEO)) != 0)) { if (fl->l_type == F_UNLCK) printf("nfs_dolock: aborting unlock request " @@ -779,8 +815,7 @@ wait_for_granted: (nmp->nm_tprintf_initial_delay != 0) && ((lastmsg + nmp->nm_tprintf_delay) < now.tv_sec)) { lastmsg = now.tv_sec; - nfs_down(NULL, nmp, p, "lockd not responding", - 0, NFSSTA_LOCKTIMEO); + nfs_down(nmp, p, 0, NFSSTA_LOCKTIMEO, "lockd not responding"); wentdown = 1; } if (msgreq.lmr_errno == EINPROGRESS) { @@ -818,12 +853,10 @@ wait_for_granted: continue; } - if (wentdown) { - /* we got a reponse, so the server's lockd is OK */ - nfs_up(NULL, VFSTONFS(vp->v_mount), p, "lockd alive again", - NFSSTA_LOCKTIMEO); - wentdown = 0; - } + /* we got a reponse, so the server's lockd is OK */ + nfs_up(VFSTONFS(vnode_mount(vp)), p, NFSSTA_LOCKTIMEO, + wentdown ? "lockd alive again" : NULL); + wentdown = 0; if (msgreq.lmr_errno == EINPROGRESS) { /* got NLM_BLOCKED response */ @@ -877,7 +910,7 @@ wait_for_granted: if (!error) { /* record that NFS file locking has worked on this mount */ - nmp = VFSTONFS(vp->v_mount); + nmp = VFSTONFS(vnode_mount(vp)); if (nmp && !(nmp->nm_state & NFSSTA_LOCKSWORK)) nmp->nm_state |= NFSSTA_LOCKSWORK; /* @@ -892,10 +925,12 @@ wait_for_granted: } break; } - + nfs_lockdmsg_dequeue(&msgreq); - error1 = vn_close(wvp, FWRITE, kernproc->p_ucred, p); + error1 = VNOP_CLOSE(wvp, FWRITE, ap->a_context); + vnode_rele(wvp); + vnode_put(wvp); /* prefer any previous 'error' to our vn_close 'error1'. */ return (error != 0 ? error : error1); } @@ -905,20 +940,14 @@ wait_for_granted: * NFS advisory byte-level locks answer from the lock daemon. */ int -nfslockdans(struct proc *p, struct lockd_ans *ansp) +nfslockdans(proc_t p, struct lockd_ans *ansp) { LOCKD_MSG_REQUEST *msgreq; int error; - /* - * Let root, or someone who once was root (lockd generally - * switches to the daemon uid once it is done setting up) make - * this call. - * - * XXX This authorization check is probably not right. - */ - if ((error = suser(p->p_ucred, &p->p_acflag)) != 0 && - p->p_cred->p_svuid != 0) + /* Let root make this call. */ + error = proc_suser(p); + if (error) return (error); /* the version should match, or we're out of sync */ @@ -972,28 +1001,38 @@ nfslockdans(struct proc *p, struct lockd_ans *ansp) * NFS advisory byte-level locks: fifo file# from the lock daemon. */ int -nfslockdfd(struct proc *p, int fd) +nfslockdfd(proc_t p, int fd) { int error; - struct file *fp, *ofp; + vnode_t vp, oldvp; - error = suser(p->p_ucred, &p->p_acflag); + error = proc_suser(p); if (error) return (error); if (fd < 0) { - fp = 0; + vp = NULL; } else { - error = getvnode(p, fd, &fp); + error = file_vnode(fd, &vp); if (error) return (error); - (void)fref(fp); + error = vnode_getwithref(vp); + if (error) + return (error); + error = vnode_ref(vp); + if (error) { + vnode_put(vp); + return (error); + } + } + oldvp = nfslockdvnode; + nfslockdvnode = vp; + if (oldvp) { + vnode_rele(oldvp); + } + (void)wakeup((void *)&nfslockdvnode); + if (vp) { + vnode_put(vp); } - ofp = nfslockdfp; - nfslockdfp = fp; - if (ofp) - (void)frele(ofp); - nfslockdpid = nfslockdfp ? 
p->p_pid : 0; - (void)wakeup((void *)&nfslockdfp); return (0); } @@ -1002,23 +1041,17 @@ nfslockdfd(struct proc *p, int fd) * lock daemon waiting for lock request */ int -nfslockdwait(struct proc *p) +nfslockdwait(proc_t p) { int error; - struct file *fp, *ofp; - if (p->p_pid != nfslockdpid) { - error = suser(p->p_ucred, &p->p_acflag); - if (error) - return (error); - } - if (nfslockdwaiting) + error = proc_suser(p); + if (error) + return (error); + if (nfslockdwaiting || nfslockdvnode) return (EBUSY); - if (nfslockdfifowritten) { - nfslockdfifowritten = 0; - return (0); - } + nfslockdstarttimeout = 0; nfslockdwaiting = 1; tsleep((void *)&nfslockdwaiting, PCATCH | PUSER, "lockd", 0); nfslockdwaiting = 0; diff --git a/bsd/nfs/nfs_lock.h b/bsd/nfs/nfs_lock.h index cc99a1fa8..512408454 100644 --- a/bsd/nfs/nfs_lock.h +++ b/bsd/nfs/nfs_lock.h @@ -1,3 +1,24 @@ +/* + * Copyright (c) 2002-2005 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ /*- * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. * Redistribution and use in source and binary forms, with or without @@ -60,7 +81,7 @@ typedef struct nfs_lock_msg { u_int64_t lm_xid; /* unique message transaction ID */ struct flock lm_fl; /* The lock request. */ struct sockaddr_storage lm_addr; /* The address. */ - size_t lm_fh_len; /* The file handle length. */ + int lm_fh_len; /* The file handle length. */ struct xucred lm_cred; /* user cred for lock req */ u_int8_t lm_fh[NFS_SMALLFH]; /* The file handle. */ } LOCKD_MSG; @@ -96,7 +117,7 @@ struct lockd_ans { pid_t la_pid; /* pid of lock requester/owner */ off_t la_start; /* lock starting offset */ off_t la_len; /* lock length */ - size_t la_fh_len; /* The file handle length. */ + int la_fh_len; /* The file handle length. */ u_int8_t la_fh[NFS_SMALLFH]; /* The file handle. */ }; @@ -108,9 +129,12 @@ struct lockd_ans { #ifdef KERNEL void nfs_lockinit(void); -int nfs_dolock(struct vop_advlock_args *ap); -int nfslockdans(struct proc *p, struct lockd_ans *ansp); -int nfslockdfd(struct proc *p, int fd); -int nfslockdwait(struct proc *p); +int nfs_dolock(struct vnop_advlock_args *ap); +int nfslockdans(proc_t p, struct lockd_ans *ansp); +int nfslockdfd(proc_t p, int fd); +int nfslockdwait(proc_t p); + +extern vnode_t nfslockdvnode; +extern int nfslockdwaiting; #endif #endif /* __APPLE_API_PRIVATE */ diff --git a/bsd/nfs/nfs_node.c b/bsd/nfs/nfs_node.c index b05a62cc7..36769aad4 100644 --- a/bsd/nfs/nfs_node.c +++ b/bsd/nfs/nfs_node.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -63,9 +63,10 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> -#include <sys/mount.h> -#include <sys/namei.h> +#include <sys/kauth.h> +#include <sys/mount_internal.h> #include <sys/vnode.h> +#include <sys/ubc.h> #include <sys/malloc.h> #include <nfs/rpcv2.h> @@ -77,8 +78,10 @@ LIST_HEAD(nfsnodehashhead, nfsnode) *nfsnodehashtbl; u_long nfsnodehash; -#define TRUE 1 -#define FALSE 0 +lck_grp_t * nfs_node_hash_lck_grp; +lck_grp_attr_t * nfs_node_hash_lck_grp_attr; +lck_attr_t * nfs_node_hash_lck_attr; +lck_mtx_t *nfs_node_hash_mutex; /* * Initialize hash links for nfsnodes @@ -88,24 +91,28 @@ void nfs_nhinit(void) { nfsnodehashtbl = hashinit(desiredvnodes, M_NFSNODE, &nfsnodehash); + + nfs_node_hash_lck_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(nfs_node_hash_lck_grp_attr); + nfs_node_hash_lck_grp = lck_grp_alloc_init("nfs_node_hash", nfs_node_hash_lck_grp_attr); + + nfs_node_hash_lck_attr = lck_attr_alloc_init(); + + nfs_node_hash_mutex = lck_mtx_alloc_init(nfs_node_hash_lck_grp, nfs_node_hash_lck_attr); } /* * Compute an entry in the NFS hash table structure */ u_long -nfs_hash(fhp, fhsize) - register nfsfh_t *fhp; - int fhsize; +nfs_hash(u_char *fhp, int fhsize) { - register u_char *fhpp; - register u_long fhsum; - register int i; + u_long fhsum; + int i; - fhpp = &fhp->fh_bytes[0]; fhsum = 0; for (i = 0; i < fhsize; i++) - fhsum += *fhpp++; + fhsum += *fhp++; return (fhsum); } @@ -115,22 +122,25 @@ nfs_hash(fhp, fhsize) * In all cases, a pointer to a * nfsnode structure is returned. */ -int nfs_node_hash_lock; - int -nfs_nget(mntp, fhp, fhsize, npp) - struct mount *mntp; - register nfsfh_t *fhp; - int fhsize; - struct nfsnode **npp; +nfs_nget( + mount_t mntp, + vnode_t dvp, + struct componentname *cnp, + u_char *fhp, + int fhsize, + struct nfs_vattr *nvap, + u_int64_t *xidp, + int flags, + struct nfsnode **npp) { - struct proc *p = current_proc(); /* XXX */ struct nfsnode *np; struct nfsnodehashhead *nhpp; - register struct vnode *vp; - struct vnode *nvp; + vnode_t vp, nvp; int error; - struct mount *mp; + mount_t mp; + struct vnode_fsparam vfsp; + uint32_t vid; /* Check for unmount in progress */ if (!mntp || (mntp->mnt_kern_flag & MNTK_UNMOUNT)) { @@ -140,35 +150,39 @@ nfs_nget(mntp, fhp, fhsize, npp) nhpp = NFSNOHASH(nfs_hash(fhp, fhsize)); loop: + lck_mtx_lock(nfs_node_hash_mutex); for (np = nhpp->lh_first; np != 0; np = np->n_hash.le_next) { - mp = (np->n_flag & NINIT) ? np->n_mount : NFSTOV(np)->v_mount; + mp = (np->n_flag & NINIT) ? np->n_mount : vnode_mount(NFSTOV(np)); if (mntp != mp || np->n_fhsize != fhsize || - bcmp((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize)) + bcmp(fhp, np->n_fhp, fhsize)) continue; /* if the node is still being initialized, sleep on it */ if (np->n_flag & NINIT) { np->n_flag |= NWINIT; - tsleep(np, PINOD, "nfsngt", 0); + msleep(np, nfs_node_hash_mutex, PDROP | PINOD, "nfs_nget", 0); goto loop; } vp = NFSTOV(np); - if (vget(vp, LK_EXCLUSIVE, p)) - goto loop; - *npp = np; - return(0); - } - /* - * Obtain a lock to prevent a race condition if the getnewvnode() - * or MALLOC() below happens to block. - */ - if (nfs_node_hash_lock) { - while (nfs_node_hash_lock) { - nfs_node_hash_lock = -1; - tsleep(&nfs_node_hash_lock, PVM, "nfsngt", 0); + vid = vnode_vid(vp); + lck_mtx_unlock(nfs_node_hash_mutex); + if ((error = vnode_getwithvid(vp, vid))) { + /* + * If vnode is being reclaimed or has already + * changed identity, no need to wait. 
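/*
 * Aside: nfs_nget() above samples vnode_vid(vp) while holding the hash
 * mutex, drops the mutex, and then calls vnode_getwithvid() so that a
 * vnode recycled in the window is detected (and the lookup retried)
 * rather than reused under a stale identity.  A generation-counter model
 * of the same idea (illustrative, not the xnu implementation):
 */
#include <errno.h>
#include <pthread.h>
#include <stdint.h>

struct gnode {
    uint32_t gen;               /* bumped each time the slot is recycled */
    int      busy;
};

static pthread_mutex_t hash_mtx = PTHREAD_MUTEX_INITIALIZER;

static int
get_with_gen(struct gnode *n, uint32_t expected_gen)
{
    int error = 0;

    pthread_mutex_lock(&hash_mtx);
    if (n->gen != expected_gen)
        error = ENOENT;         /* identity changed: caller retries */
    else
        n->busy++;              /* safe to use the node */
    pthread_mutex_unlock(&hash_mtx);
    return error;
}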
+ */ + return (error); + } + /* update attributes */ + error = nfs_loadattrcache(np, nvap, xidp, 0); + if (error) { + vnode_put(vp); + } else { + if (dvp && cnp && (flags & NG_MAKEENTRY)) + cache_enter(dvp, vp, cnp); + *npp = np; } - goto loop; + return(error); } - nfs_node_hash_lock = 1; /* * allocate and initialize nfsnode and stick it in the hash @@ -176,72 +190,131 @@ loop: * hash before initialization is complete will wait for it. */ MALLOC_ZONE(np, struct nfsnode *, sizeof *np, M_NFSNODE, M_WAITOK); + if (!np) { + lck_mtx_unlock(nfs_node_hash_mutex); + *npp = 0; + return (ENOMEM); + } bzero((caddr_t)np, sizeof *np); np->n_flag |= NINIT; np->n_mount = mntp; - lockinit(&np->n_lock, PINOD, "nfsnode", 0, 0); - /* lock the new nfsnode */ - lockmgr(&np->n_lock, LK_EXCLUSIVE, NULL, p); - /* Insert the nfsnode in the hash queue for its new file handle */ + /* setup node's file handle */ if (fhsize > NFS_SMALLFH) { - MALLOC_ZONE(np->n_fhp, nfsfh_t *, + MALLOC_ZONE(np->n_fhp, u_char *, fhsize, M_NFSBIGFH, M_WAITOK); - } else - np->n_fhp = &np->n_fh; - bcopy((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize); + if (!np->n_fhp) { + lck_mtx_unlock(nfs_node_hash_mutex); + FREE_ZONE(np, sizeof *np, M_NFSNODE); + *npp = 0; + return (ENOMEM); + } + } else { + np->n_fhp = &np->n_fh[0]; + } + bcopy(fhp, np->n_fhp, fhsize); np->n_fhsize = fhsize; - LIST_INSERT_HEAD(nhpp, np, n_hash); + + /* Insert the nfsnode in the hash queue for its new file handle */ np->n_flag |= NHASHED; + LIST_INSERT_HEAD(nhpp, np, n_hash); /* release lock on hash table */ - if (nfs_node_hash_lock < 0) - wakeup(&nfs_node_hash_lock); - nfs_node_hash_lock = 0; + lck_mtx_unlock(nfs_node_hash_mutex); + + /* do initial loading of attributes */ + error = nfs_loadattrcache(np, nvap, xidp, 1); + if (error) { + lck_mtx_lock(nfs_node_hash_mutex); + LIST_REMOVE(np, n_hash); + np->n_flag &= ~(NHASHED|NINIT); + if (np->n_flag & NWINIT) { + np->n_flag &= ~NWINIT; + wakeup((caddr_t)np); + } + lck_mtx_unlock(nfs_node_hash_mutex); + if (np->n_fhsize > NFS_SMALLFH) + FREE_ZONE(np->n_fhp, np->n_fhsize, M_NFSBIGFH); + FREE_ZONE(np, sizeof *np, M_NFSNODE); + *npp = 0; + return (error); + } + np->n_mtime = nvap->nva_mtime; + if (nvap->nva_type == VDIR) + np->n_ncmtime = nvap->nva_mtime; + NMODEINVALIDATE(np); /* now, attempt to get a new vnode */ - error = getnewvnode(VT_NFS, mntp, nfsv2_vnodeop_p, &nvp); + vfsp.vnfs_mp = mntp; + vfsp.vnfs_vtype = nvap->nva_type; + vfsp.vnfs_str = "nfs"; + vfsp.vnfs_dvp = dvp; + vfsp.vnfs_fsnode = np; + if (nvap->nva_type == VFIFO) + vfsp.vnfs_vops = fifo_nfsv2nodeop_p; + else if (nvap->nva_type == VBLK || nvap->nva_type == VCHR) + vfsp.vnfs_vops = spec_nfsv2nodeop_p; + else + vfsp.vnfs_vops = nfsv2_vnodeop_p; + vfsp.vnfs_markroot = (flags & NG_MARKROOT) ? 
1 : 0; + vfsp.vnfs_marksystem = 0; + vfsp.vnfs_rdev = 0; + vfsp.vnfs_filesize = nvap->nva_size; + vfsp.vnfs_cnp = cnp; + if (dvp && cnp && (flags & NG_MAKEENTRY)) + vfsp.vnfs_flags = 0; + else + vfsp.vnfs_flags = VNFS_NOCACHE; + error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &nvp); if (error) { + lck_mtx_lock(nfs_node_hash_mutex); LIST_REMOVE(np, n_hash); - np->n_flag &= ~NHASHED; + np->n_flag &= ~(NHASHED|NINIT); + if (np->n_flag & NWINIT) { + np->n_flag &= ~NWINIT; + wakeup((caddr_t)np); + } + lck_mtx_unlock(nfs_node_hash_mutex); if (np->n_fhsize > NFS_SMALLFH) - FREE_ZONE((caddr_t)np->n_fhp, np->n_fhsize, M_NFSBIGFH); + FREE_ZONE(np->n_fhp, np->n_fhsize, M_NFSBIGFH); FREE_ZONE(np, sizeof *np, M_NFSNODE); *npp = 0; return (error); } vp = nvp; - vp->v_data = np; np->n_vnode = vp; + vnode_addfsref(vp); + vnode_settag(vp, VT_NFS); // XXX shouldn't this be a vnode_create() parameter? *npp = np; + /* node is now initialized */ - /* node is now initialized, check if anyone's waiting for it */ + /* check if anyone's waiting on this node */ + lck_mtx_lock(nfs_node_hash_mutex); np->n_flag &= ~NINIT; if (np->n_flag & NWINIT) { np->n_flag &= ~NWINIT; wakeup((caddr_t)np); } + lck_mtx_unlock(nfs_node_hash_mutex); return (error); } + int nfs_inactive(ap) - struct vop_inactive_args /* { - struct vnode *a_vp; - struct proc *a_p; + struct vnop_inactive_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + vfs_context_t a_context; } */ *ap; { register struct nfsnode *np; register struct sillyrename *sp; - struct proc *p = current_proc(); /* XXX */ - extern int prtactive; - struct ucred *cred; + kauth_cred_t cred; np = VTONFS(ap->a_vp); - if (prtactive && ap->a_vp->v_usecount != 0) - vprint("nfs_inactive: pushing active", ap->a_vp); - if (ap->a_vp->v_type != VDIR) { + if (vnode_vtype(ap->a_vp) != VDIR) { sp = np->n_sillyrename; np->n_sillyrename = (struct sillyrename *)0; } else @@ -252,9 +325,11 @@ nfs_inactive(ap) * Remove the silly file that was rename'd earlier */ #if DIAGNOSTIC - kprintf("nfs_inactive removing %s, dvp=%x, a_vp=%x, ap=%x, np=%x, sp=%x\n", &sp->s_name[0], (unsigned)sp->s_dvp, (unsigned)ap->a_vp, (unsigned)ap, (unsigned)np, (unsigned)sp); + kprintf("nfs_inactive removing %s, dvp=%x, a_vp=%x, ap=%x, np=%x, sp=%x\n", + &sp->s_name[0], (unsigned)sp->s_dvp, (unsigned)ap->a_vp, (unsigned)ap, + (unsigned)np, (unsigned)sp); #endif - (void) nfs_vinvalbuf(ap->a_vp, 0, sp->s_cred, p, 1); + nfs_vinvalbuf(ap->a_vp, 0, sp->s_cred, vfs_context_proc(ap->a_context), 1); np->n_size = 0; ubc_setsize(ap->a_vp, (off_t)0); nfs_removeit(sp); @@ -263,19 +338,21 @@ nfs_inactive(ap) * again if another object gets created with the same filehandle * before this vnode gets reclaimed */ + lck_mtx_lock(nfs_node_hash_mutex); LIST_REMOVE(np, n_hash); np->n_flag &= ~NHASHED; + lck_mtx_unlock(nfs_node_hash_mutex); cred = sp->s_cred; if (cred != NOCRED) { sp->s_cred = NOCRED; - crfree(cred); + kauth_cred_rele(cred); } - vrele(sp->s_dvp); + vnode_rele(sp->s_dvp); FREE_ZONE((caddr_t)sp, sizeof (struct sillyrename), M_NFSREQ); + vnode_recycle(ap->a_vp); } - np->n_flag &= (NMODIFIED | NFLUSHINPROG | NFLUSHWANT | NQNFSEVICTED | - NQNFSNONCACHE | NQNFSWRITE | NHASHED); - VOP_UNLOCK(ap->a_vp, 0, ap->a_p); + /* clear all flags other than these */ + np->n_flag &= (NMODIFIED | NFLUSHINPROG | NFLUSHWANT | NHASHED); return (0); } @@ -284,36 +361,23 @@ nfs_inactive(ap) */ int nfs_reclaim(ap) - struct vop_reclaim_args /* { - struct vnode *a_vp; + struct vnop_reclaim_args /* { + struct vnodeop_desc *a_desc; + vnode_t 
a_vp; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp = ap->a_vp; - register struct nfsnode *np = VTONFS(vp); - register struct nfsmount *nmp; - register struct nfsdmap *dp, *dp2; - extern int prtactive; + vnode_t vp = ap->a_vp; + struct nfsnode *np = VTONFS(vp); + struct nfsdmap *dp, *dp2; - if (prtactive && vp->v_usecount != 0) - vprint("nfs_reclaim: pushing active", vp); + vnode_removefsref(vp); if (np->n_flag & NHASHED) { + lck_mtx_lock(nfs_node_hash_mutex); LIST_REMOVE(np, n_hash); np->n_flag &= ~NHASHED; - } - - /* - * In case we block during FREE_ZONEs below, get the entry out - * of tbe name cache now so subsequent lookups won't find it. - */ - cache_purge(vp); - - /* - * For nqnfs, take it off the timer queue as required. - */ - nmp = VFSTONFS(vp->v_mount); - if (nmp && (nmp->nm_flag & NFSMNT_NQNFS) && np->n_timer.cqe_next != 0) { - CIRCLEQ_REMOVE(&nmp->nm_timerhead, np, n_timer); + lck_mtx_unlock(nfs_node_hash_mutex); } /* @@ -321,7 +385,7 @@ nfs_reclaim(ap) * large file handle structures that might be associated with * this nfs node. */ - if (vp->v_type == VDIR) { + if (vnode_vtype(vp) == VDIR) { dp = np->n_cookies.lh_first; while (dp) { dp2 = dp; @@ -331,66 +395,11 @@ nfs_reclaim(ap) } } if (np->n_fhsize > NFS_SMALLFH) { - FREE_ZONE((caddr_t)np->n_fhp, np->n_fhsize, M_NFSBIGFH); + FREE_ZONE(np->n_fhp, np->n_fhsize, M_NFSBIGFH); } + vnode_clearfsnode(vp); - FREE_ZONE(vp->v_data, sizeof (struct nfsnode), M_NFSNODE); - vp->v_data = (void *)0; + FREE_ZONE(np, sizeof(struct nfsnode), M_NFSNODE); return (0); } -/* - * Lock an nfsnode - */ -int -nfs_lock(ap) - struct vop_lock_args /* { - struct vnode *a_vp; - int a_flags; - struct proc *a_p; - } */ *ap; -{ - register struct vnode *vp = ap->a_vp; - - /* - * Ugh, another place where interruptible mounts will get hung. - * If you make this call interruptible, then you have to fix all - * the VOP_LOCK() calls to expect interruptibility. - */ - if (vp->v_tag == VT_NON) - return (ENOENT); /* ??? -- got to check something and error, but what? */ - - return(lockmgr(&VTONFS(vp)->n_lock, ap->a_flags, &vp->v_interlock, - ap->a_p)); - -} - -/* - * Unlock an nfsnode - */ -int -nfs_unlock(ap) - struct vop_unlock_args /* { - struct vnode *a_vp; - int a_flags; - struct proc *a_p; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - - return (lockmgr(&VTONFS(vp)->n_lock, ap->a_flags | LK_RELEASE, - &vp->v_interlock, ap->a_p)); -} - -/* - * Check for a locked nfsnode - */ -int -nfs_islocked(ap) - struct vop_islocked_args /* { - struct vnode *a_vp; - } */ *ap; -{ - return (lockstatus(&VTONFS(ap->a_vp)->n_lock)); - -} diff --git a/bsd/nfs/nfs_nqlease.c b/bsd/nfs/nfs_nqlease.c deleted file mode 100644 index 955925a24..000000000 --- a/bsd/nfs/nfs_nqlease.c +++ /dev/null @@ -1,1353 +0,0 @@ -/* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. 
- * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1992, 1993 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Rick Macklem at The University of Guelph. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)nfs_nqlease.c 8.9 (Berkeley) 5/20/95 - * FreeBSD-Id: nfs_nqlease.c,v 1.32 1997/11/07 08:53:23 phk Exp $ - */ - - -/* - * References: - * Cary G. Gray and David R. Cheriton, "Leases: An Efficient Fault-Tolerant - * Mechanism for Distributed File Cache Consistency", - * In Proc. of the Twelfth ACM Symposium on Operating Systems - * Principals, pg. 202-210, Litchfield Park, AZ, Dec. 1989. - * Michael N. Nelson, Brent B. Welch and John K. Ousterhout, "Caching - * in the Sprite Network File System", ACM TOCS 6(1), - * pages 134-154, February 1988. - * V. Srinivasan and Jeffrey C. Mogul, "Spritely NFS: Implementation and - * Performance of Cache-Consistency Protocols", Digital - * Equipment Corporation WRL Research Report 89/5, May 1989. 
- */ -#include <sys/param.h> -#include <sys/vnode.h> -#include <sys/mount.h> -#include <sys/kernel.h> -#include <sys/proc.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/protosw.h> -#include <machine/spl.h> - -#include <netinet/in.h> -#include <nfs/rpcv2.h> -#include <nfs/nfsproto.h> -#include <nfs/nfs.h> -#include <nfs/nfsm_subs.h> -#include <nfs/xdr_subs.h> -#include <nfs/nqnfs.h> -#include <nfs/nfsnode.h> -#include <nfs/nfsmount.h> - -time_t nqnfsstarttime = (time_t)0; -int nqsrv_clockskew = NQ_CLOCKSKEW; -int nqsrv_writeslack = NQ_WRITESLACK; -int nqsrv_maxlease = NQ_MAXLEASE; -static int nqsrv_maxnumlease = NQ_MAXNUMLEASE; - -struct vop_lease_args; - -static int nqsrv_cmpnam __P((struct nfssvc_sock *, struct mbuf *, - struct nqhost *)); -extern void nqnfs_lease_updatetime __P((int deltat)); -static int nqnfs_vacated __P((struct vnode *vp, struct ucred *cred)); -static void nqsrv_addhost __P((struct nqhost *lph, struct nfssvc_sock *slp, - struct mbuf *nam)); -static void nqsrv_instimeq __P((struct nqlease *lp, u_long duration)); -static void nqsrv_locklease __P((struct nqlease *lp)); -static void nqsrv_send_eviction __P((struct vnode *vp, struct nqlease *lp, - struct nfssvc_sock *slp, - struct mbuf *nam, struct ucred *cred)); -static void nqsrv_unlocklease __P((struct nqlease *lp)); -static void nqsrv_waitfor_expiry __P((struct nqlease *lp)); - -/* - * Signifies which rpcs can have piggybacked lease requests - */ -int nqnfs_piggy[NFS_NPROCS] = { - 0, - 0, - ND_WRITE, - ND_READ, - 0, - ND_READ, - ND_READ, - ND_WRITE, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - ND_READ, - ND_READ, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, -}; - -extern nfstype nfsv2_type[9]; -extern nfstype nfsv3_type[9]; -extern struct nfssvc_sock *nfs_udpsock, *nfs_cltpsock; -extern int nfsd_waiting; -extern struct nfsstats nfsstats; -extern int nfs_mount_type; - -#define TRUE 1 -#define FALSE 0 - -#ifndef NFS_NOSERVER -/* - * Get or check for a lease for "vp", based on ND_CHECK flag. - * The rules are as follows: - * - if a current non-caching lease, reply non-caching - * - if a current lease for same host only, extend lease - * - if a read cachable lease and a read lease request - * add host to list any reply cachable - * - else { set non-cachable for read-write sharing } - * send eviction notice messages to all other hosts that have lease - * wait for lease termination { either by receiving vacated messages - * from all the other hosts or expiry - * via. timeout } - * modify lease to non-cachable - * - else if no current lease, issue new one - * - reply - * - return boolean TRUE iff nam should be m_freem()'d - * NB: Since nqnfs_serverd() is called from a timer, any potential tsleep() - * in here must be framed by nqsrv_locklease() and nqsrv_unlocklease(). - * nqsrv_locklease() is coded such that at least one of LC_LOCKED and - * LC_WANTED is set whenever a process is tsleeping in it. The exception - * is when a new lease is being allocated, since it is not in the timer - * queue yet. 
(Ditto for the splsoftclock() and splx(s) calls) - */ -int -nqsrv_getlease(vp, duration, flags, slp, procp, nam, cachablep, frev, cred) - struct vnode *vp; - u_long *duration; - int flags; - struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf *nam; - int *cachablep; - u_quad_t *frev; - struct ucred *cred; -{ - register struct nqlease *lp; - register struct nqfhhashhead *lpp = 0; - register struct nqhost *lph = 0; - struct nqlease *tlp; - struct nqm **lphp; - struct vattr vattr; - fhandle_t fh; - int i, ok, error, s; - - if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) - return (0); - if (*duration > nqsrv_maxlease) - *duration = nqsrv_maxlease; - error = VOP_GETATTR(vp, &vattr, cred, procp); - if (error) - return (error); - *frev = vattr.va_filerev; - s = splsoftclock(); - tlp = vp->v_lease; - if ((flags & ND_CHECK) == 0) - nfsstats.srvnqnfs_getleases++; - if (tlp == (struct nqlease *)0) { - - /* - * Find the lease by searching the hash list. - */ - fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; - error = VFS_VPTOFH(vp, &fh.fh_fid); - if (error) { - splx(s); - return (error); - } - lpp = NQFHHASH(fh.fh_fid.fid_data); - for (lp = lpp->lh_first; lp != 0; lp = lp->lc_hash.le_next) - if (fh.fh_fsid.val[0] == lp->lc_fsid.val[0] && - fh.fh_fsid.val[1] == lp->lc_fsid.val[1] && - !bcmp(fh.fh_fid.fid_data, lp->lc_fiddata, - fh.fh_fid.fid_len - sizeof (long))) { - /* Found it */ - lp->lc_vp = vp; - vp->v_lease = lp; - tlp = lp; - break; - } - } else - lp = tlp; - if (lp) { - if ((lp->lc_flag & LC_NONCACHABLE) || - (lp->lc_morehosts == (struct nqm *)0 && - nqsrv_cmpnam(slp, nam, &lp->lc_host))) - goto doreply; - if ((flags & ND_READ) && (lp->lc_flag & LC_WRITE) == 0) { - if (flags & ND_CHECK) - goto doreply; - if (nqsrv_cmpnam(slp, nam, &lp->lc_host)) - goto doreply; - i = 0; - if (lp->lc_morehosts) { - lph = lp->lc_morehosts->lpm_hosts; - lphp = &lp->lc_morehosts->lpm_next; - ok = 1; - } else { - lphp = &lp->lc_morehosts; - ok = 0; - } - while (ok && (lph->lph_flag & LC_VALID)) { - if (nqsrv_cmpnam(slp, nam, lph)) - goto doreply; - if (++i == LC_MOREHOSTSIZ) { - i = 0; - if (*lphp) { - lph = (*lphp)->lpm_hosts; - lphp = &((*lphp)->lpm_next); - } else - ok = 0; - } else - lph++; - } - nqsrv_locklease(lp); - if (!ok) { - MALLOC_ZONE(*lphp, struct nqm *, - sizeof(struct nqm), - M_NQMHOST, M_WAITOK); - bzero((caddr_t)*lphp, sizeof (struct nqm)); - lph = (*lphp)->lpm_hosts; - } - nqsrv_addhost(lph, slp, nam); - nqsrv_unlocklease(lp); - } else { - lp->lc_flag |= LC_NONCACHABLE; - nqsrv_locklease(lp); - nqsrv_send_eviction(vp, lp, slp, nam, cred); - nqsrv_waitfor_expiry(lp); - nqsrv_unlocklease(lp); - } -doreply: - /* - * Update the lease and return - */ - if ((flags & ND_CHECK) == 0) - nqsrv_instimeq(lp, *duration); - if (lp->lc_flag & LC_NONCACHABLE) - *cachablep = 0; - else { - *cachablep = 1; - if (flags & ND_WRITE) - lp->lc_flag |= LC_WRITTEN; - } - splx(s); - return (0); - } - splx(s); - if (flags & ND_CHECK) - return (0); - - /* - * Allocate new lease - * The value of nqsrv_maxnumlease should be set generously, so that - * the following "printf" happens infrequently. 
- */ - if (nfsstats.srvnqnfs_leases > nqsrv_maxnumlease) { - printf("Nqnfs server, too many leases\n"); - do { - (void) tsleep((caddr_t)&lbolt, PSOCK, - "nqsrvnuml", 0); - } while (nfsstats.srvnqnfs_leases > nqsrv_maxnumlease); - } - MALLOC_ZONE(lp, struct nqlease *, - sizeof (struct nqlease), M_NQLEASE, M_WAITOK); - bzero((caddr_t)lp, sizeof (struct nqlease)); - if (flags & ND_WRITE) - lp->lc_flag |= (LC_WRITE | LC_WRITTEN); - nqsrv_addhost(&lp->lc_host, slp, nam); - lp->lc_vp = vp; - lp->lc_fsid = fh.fh_fsid; - bcopy(fh.fh_fid.fid_data, lp->lc_fiddata, - fh.fh_fid.fid_len - sizeof (long)); - if(!lpp) - panic("nfs_nqlease.c: Phoney lpp"); - LIST_INSERT_HEAD(lpp, lp, lc_hash); - vp->v_lease = lp; - s = splsoftclock(); - nqsrv_instimeq(lp, *duration); - splx(s); - *cachablep = 1; - if (++nfsstats.srvnqnfs_leases > nfsstats.srvnqnfs_maxleases) - nfsstats.srvnqnfs_maxleases = nfsstats.srvnqnfs_leases; - return (0); -} - -/* - * Local lease check for server syscalls. - * Just set up args and let nqsrv_getlease() do the rest. - * nqnfs_vop_lease_check() is the VOP_LEASE() form of the same routine. - * Ifdef'd code in nfsnode.h renames these routines to whatever a particular - * OS needs. - */ -void -nqnfs_lease_check(vp, p, cred, flag) - struct vnode *vp; - struct proc *p; - struct ucred *cred; - int flag; -{ - u_long duration = 0; - int cache; - u_quad_t frev; - - (void) nqsrv_getlease(vp, &duration, ND_CHECK | flag, NQLOCALSLP, - p, (struct mbuf *)0, &cache, &frev, cred); -} - -int -nqnfs_vop_lease_check(ap) - struct vop_lease_args /* { - struct vnode *a_vp; - struct proc *a_p; - struct ucred *a_cred; - int a_flag; - } */ *ap; -{ - u_long duration = 0; - int cache; - u_quad_t frev; - - (void) nqsrv_getlease(ap->a_vp, &duration, ND_CHECK | ap->a_flag, - NQLOCALSLP, ap->a_p, (struct mbuf *)0, &cache, &frev, ap->a_cred); - return (0); -} - -#endif /* NFS_NOSERVER */ - -/* - * Add a host to an nqhost structure for a lease. - */ -static void -nqsrv_addhost(lph, slp, nam) - register struct nqhost *lph; - struct nfssvc_sock *slp; - struct mbuf *nam; -{ - register struct sockaddr_in *saddr; - - if (slp == NQLOCALSLP) - lph->lph_flag |= (LC_VALID | LC_LOCAL); - else if (slp == nfs_udpsock) { - saddr = mtod(nam, struct sockaddr_in *); - lph->lph_flag |= (LC_VALID | LC_UDP); - lph->lph_inetaddr = saddr->sin_addr.s_addr; - lph->lph_port = saddr->sin_port; - } else if (slp == nfs_cltpsock) { - lph->lph_nam = m_copym(nam, 0, M_COPYALL, M_WAIT); - lph->lph_flag |= (LC_VALID | LC_CLTP); - } else { - lph->lph_flag |= (LC_VALID | LC_SREF); - lph->lph_slp = slp; - slp->ns_sref++; - } -} - -/* - * Update the lease expiry time and position it in the timer queue correctly. - */ -static void -nqsrv_instimeq(lp, duration) - register struct nqlease *lp; - u_long duration; -{ - register struct nqlease *tlp; - time_t newexpiry; - struct timeval now; - - microtime(&now); - newexpiry = now.tv_sec + duration + nqsrv_clockskew; - if (lp->lc_expiry == newexpiry) - return; - if (lp->lc_timer.cqe_next != 0) { - CIRCLEQ_REMOVE(&nqtimerhead, lp, lc_timer); - } - lp->lc_expiry = newexpiry; - - /* - * Find where in the queue it should be. 
- */ - tlp = nqtimerhead.cqh_last; - while (tlp != (void *)&nqtimerhead && tlp->lc_expiry > newexpiry) - tlp = tlp->lc_timer.cqe_prev; -#ifdef HASNVRAM - if (tlp == nqtimerhead.cqh_last) - NQSTORENOVRAM(newexpiry); -#endif /* HASNVRAM */ - if (tlp == (void *)&nqtimerhead) { - CIRCLEQ_INSERT_HEAD(&nqtimerhead, lp, lc_timer); - } else { - CIRCLEQ_INSERT_AFTER(&nqtimerhead, tlp, lp, lc_timer); - } -} - -/* - * Compare the requesting host address with the lph entry in the lease. - * Return true iff it is the same. - * This is somewhat messy due to the union in the nqhost structure. - * The local host is indicated by the special value of NQLOCALSLP for slp. - */ -static int -nqsrv_cmpnam(slp, nam, lph) - register struct nfssvc_sock *slp; - struct mbuf *nam; - register struct nqhost *lph; -{ - register struct sockaddr_in *saddr; - struct mbuf *addr; - union nethostaddr lhaddr; - int ret; - - if (slp == NQLOCALSLP) { - if (lph->lph_flag & LC_LOCAL) - return (1); - else - return (0); - } - if (slp == nfs_udpsock || slp == nfs_cltpsock) - addr = nam; - else - addr = slp->ns_nam; - if (lph->lph_flag & LC_UDP) - ret = netaddr_match(AF_INET, &lph->lph_haddr, addr); - else if (lph->lph_flag & LC_CLTP) - ret = netaddr_match(AF_ISO, &lph->lph_claddr, addr); - else { - if ((lph->lph_slp->ns_flag & SLP_VALID) == 0) - return (0); - saddr = mtod(lph->lph_slp->ns_nam, struct sockaddr_in *); - if (saddr->sin_family == AF_INET) - lhaddr.had_inetaddr = saddr->sin_addr.s_addr; - else - lhaddr.had_nam = lph->lph_slp->ns_nam; - ret = netaddr_match(saddr->sin_family, &lhaddr, addr); - } - return (ret); -} - -/* - * Send out eviction notice messages to all other hosts for the lease. - */ -static void -nqsrv_send_eviction(vp, lp, slp, nam, cred) - struct vnode *vp; - register struct nqlease *lp; - struct nfssvc_sock *slp; - struct mbuf *nam; - struct ucred *cred; -{ - register struct nqhost *lph = &lp->lc_host; - register struct mbuf *m; - register int siz; - struct nqm *lphnext = lp->lc_morehosts; - struct mbuf *mreq, *mb, *mb2, *mheadend; - struct socket *so; - struct mbuf *nam2; - struct sockaddr_in *saddr; - nfsfh_t nfh; - fhandle_t *fhp; - caddr_t bpos, cp; - u_long xid, *tl; - int len = 1, ok = 1, i = 0; - int sotype, solock; - - while (ok && (lph->lph_flag & LC_VALID)) { - if (nqsrv_cmpnam(slp, nam, lph)) - lph->lph_flag |= LC_VACATED; - else if ((lph->lph_flag & (LC_LOCAL | LC_VACATED)) == 0) { - if (lph->lph_flag & LC_UDP) { - MGET(nam2, M_WAIT, MT_SONAME); - saddr = mtod(nam2, struct sockaddr_in *); - nam2->m_len = saddr->sin_len = - sizeof (struct sockaddr_in); - saddr->sin_family = AF_INET; - saddr->sin_addr.s_addr = lph->lph_inetaddr; - saddr->sin_port = lph->lph_port; - so = nfs_udpsock->ns_so; - } else if (lph->lph_flag & LC_CLTP) { - nam2 = lph->lph_nam; - so = nfs_cltpsock->ns_so; - } else if (lph->lph_slp->ns_flag & SLP_VALID) { - nam2 = (struct mbuf *)0; - so = lph->lph_slp->ns_so; - } else - goto nextone; - sotype = so->so_type; - solock = (so->so_proto->pr_flags & PR_CONNREQUIRED); - nfsm_reqhead((struct vnode *)0, NQNFSPROC_EVICTED, - NFSX_V3FH + NFSX_UNSIGNED); - fhp = &nfh.fh_generic; - bzero((caddr_t)fhp, sizeof(nfh)); - fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid; - VFS_VPTOFH(vp, &fhp->fh_fid); - nfsm_srvfhtom(fhp, 1); - m = mreq; - siz = 0; - while (m) { - siz += m->m_len; - m = m->m_next; - } - if (siz <= 0 || siz > NFS_MAXPACKET) { - printf("mbuf siz=%d\n",siz); - panic("Bad nfs svc reply"); - } - m = nfsm_rpchead(cred, (NFSMNT_NFSV3 | NFSMNT_NQNFS), - NQNFSPROC_EVICTED, - RPCAUTH_UNIX, 5 
* NFSX_UNSIGNED, (char *)0, - 0, (char *)NULL, mreq, siz, &mheadend, &xid); - /* - * For stream protocols, prepend a Sun RPC - * Record Mark. - */ - if (sotype == SOCK_STREAM) { - M_PREPEND(m, NFSX_UNSIGNED, M_WAIT); - *mtod(m, u_long *) = htonl(0x80000000 | - (m->m_pkthdr.len - NFSX_UNSIGNED)); - } - if (((lph->lph_flag & (LC_UDP | LC_CLTP)) == 0 && - (lph->lph_slp->ns_flag & SLP_VALID) == 0) || - (solock && nfs_slplock(lph->lph_slp, 0) == 0)) { - m_freem(m); - } else { - (void) nfs_send(so, nam2, m, - (struct nfsreq *)0); - if (solock) - nfs_slpunlock(lph->lph_slp); - } - if (lph->lph_flag & LC_UDP) - MFREE(nam2, m); - } -nextone: - if (++i == len) { - if (lphnext) { - i = 0; - len = LC_MOREHOSTSIZ; - lph = lphnext->lpm_hosts; - lphnext = lphnext->lpm_next; - } else - ok = 0; - } else - lph++; - } -} - -/* - * Wait for the lease to expire. - * This will occur when all clients have sent "vacated" messages to - * this server OR when it expires do to timeout. - */ -static void -nqsrv_waitfor_expiry(lp) - register struct nqlease *lp; -{ - register struct nqhost *lph; - register int i; - struct nqm *lphnext; - int len, ok; - struct timeval now; - -tryagain: - microtime(&now); - if (now.tv_sec > lp->lc_expiry) - return; - lph = &lp->lc_host; - lphnext = lp->lc_morehosts; - len = 1; - i = 0; - ok = 1; - while (ok && (lph->lph_flag & LC_VALID)) { - if ((lph->lph_flag & (LC_LOCAL | LC_VACATED)) == 0) { - lp->lc_flag |= LC_EXPIREDWANTED; - (void) tsleep((caddr_t)&lp->lc_flag, PSOCK, - "nqexp", 0); - goto tryagain; - } - if (++i == len) { - if (lphnext) { - i = 0; - len = LC_MOREHOSTSIZ; - lph = lphnext->lpm_hosts; - lphnext = lphnext->lpm_next; - } else - ok = 0; - } else - lph++; - } -} - -#ifndef NFS_NOSERVER - -/* - * Nqnfs server timer that maintains the server lease queue. - * Scan the lease queue for expired entries: - * - when one is found, wakeup anyone waiting for it - * else dequeue and free - */ -void -nqnfs_serverd() -{ - register struct nqlease *lp; - register struct nqhost *lph; - struct nqlease *nextlp; - struct nqm *lphnext, *olphnext; - struct mbuf *n; - int i, len, ok; - struct timeval now; - - microtime(&now); - for (lp = nqtimerhead.cqh_first; lp != (void *)&nqtimerhead; - lp = nextlp) { - if (lp->lc_expiry >= now.tv_sec) - break; - nextlp = lp->lc_timer.cqe_next; - if (lp->lc_flag & LC_EXPIREDWANTED) { - lp->lc_flag &= ~LC_EXPIREDWANTED; - wakeup((caddr_t)&lp->lc_flag); - } else if ((lp->lc_flag & (LC_LOCKED | LC_WANTED)) == 0) { - /* - * Make a best effort at keeping a write caching lease long - * enough by not deleting it until it has been explicitly - * vacated or there have been no writes in the previous - * write_slack seconds since expiry and the nfsds are not - * all busy. The assumption is that if the nfsds are not - * all busy now (no queue of nfs requests), then the client - * would have been able to do at least one write to the - * file during the last write_slack seconds if it was still - * trying to push writes to the server. - */ - if ((lp->lc_flag & (LC_WRITE | LC_VACATED)) == LC_WRITE && - ((lp->lc_flag & LC_WRITTEN) || nfsd_waiting == 0)) { - lp->lc_flag &= ~LC_WRITTEN; - nqsrv_instimeq(lp, nqsrv_writeslack); - } else { - CIRCLEQ_REMOVE(&nqtimerhead, lp, lc_timer); - LIST_REMOVE(lp, lc_hash); - /* - * This soft reference may no longer be valid, but - * no harm done. The worst case is if the vnode was - * recycled and has another valid lease reference, - * which is dereferenced prematurely. 
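/*
 * Context for the removed NQNFS code: because nqsrv_instimeq() keeps
 * the lease queue sorted by lc_expiry, the server timer only ever
 * scans the expired prefix of the queue and can stop at the first
 * unexpired entry.  A reduced sketch of that scan pattern, with the
 * per-lease teardown elided:
 */
	struct nqlease *lp, *nextlp;
	struct timeval now;

	microtime(&now);
	for (lp = nqtimerhead.cqh_first; lp != (void *)&nqtimerhead;
	    lp = nextlp) {
		if (lp->lc_expiry >= now.tv_sec)
			break;			/* rest of the queue is newer */
		nextlp = lp->lc_timer.cqe_next;
		/* ... wake waiters or dequeue and free the lease ... */
	}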
- */ - lp->lc_vp->v_lease = (struct nqlease *)0; - lph = &lp->lc_host; - lphnext = lp->lc_morehosts; - olphnext = (struct nqm *)0; - len = 1; - i = 0; - ok = 1; - while (ok && (lph->lph_flag & LC_VALID)) { - if (lph->lph_flag & LC_CLTP) - MFREE(lph->lph_nam, n); - if (lph->lph_flag & LC_SREF) - nfsrv_slpderef(lph->lph_slp); - if (++i == len) { - if (olphnext) { - FREE_ZONE((caddr_t)olphnext, - sizeof (struct nqm), - M_NQMHOST); - olphnext = (struct nqm *)0; - } - if (lphnext) { - olphnext = lphnext; - i = 0; - len = LC_MOREHOSTSIZ; - lph = lphnext->lpm_hosts; - lphnext = lphnext->lpm_next; - } else - ok = 0; - } else - lph++; - } - FREE_ZONE((caddr_t)lp, - sizeof (struct nqlease), M_NQLEASE); - if (olphnext) - FREE_ZONE((caddr_t)olphnext, - sizeof (struct nqm), M_NQMHOST); - nfsstats.srvnqnfs_leases--; - } - } - } -} - -/* - * Called from nfssvc_nfsd() for a getlease rpc request. - * Do the from/to xdr translation and call nqsrv_getlease() to - * do the real work. - */ -int -nqnfsrv_getlease(nfsd, slp, procp, mrq) - struct nfsrv_descript *nfsd; - struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; -{ - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; - caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - register struct nfs_fattr *fp; - struct vattr va; - register struct vattr *vap = &va; - struct vnode *vp; - nfsfh_t nfh; - fhandle_t *fhp; - register u_long *tl; - register long t1; - u_quad_t frev; - caddr_t bpos; - int error = 0; - char *cp2; - struct mbuf *mb, *mb2, *mreq; - int flags, rdonly, cache; - - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); - nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); - flags = fxdr_unsigned(int, *tl++); - nfsd->nd_duration = fxdr_unsigned(int, *tl); - error = nfsrv_fhtovp(fhp, 1, &vp, cred, slp, nam, &rdonly, - (nfsd->nd_flag & ND_KERBAUTH), TRUE); - if (error) - nfsm_reply(0); - if (rdonly && flags == ND_WRITE) { - error = EROFS; - vput(vp); - nfsm_reply(0); - } - (void) nqsrv_getlease(vp, &nfsd->nd_duration, flags, slp, procp, - nam, &cache, &frev, cred); - error = VOP_GETATTR(vp, vap, cred, procp); - vput(vp); - nfsm_reply(NFSX_V3FATTR + 4 * NFSX_UNSIGNED); - nfsm_build(tl, u_long *, 4 * NFSX_UNSIGNED); - *tl++ = txdr_unsigned(cache); - *tl++ = txdr_unsigned(nfsd->nd_duration); - txdr_hyper(&frev, tl); - nfsm_build(fp, struct nfs_fattr *, NFSX_V3FATTR); - nfsm_srvfillattr(vap, fp); - nfsm_srvdone; -} - -/* - * Called from nfssvc_nfsd() when a "vacated" message is received from a - * client. Find the entry and expire it. - */ -int -nqnfsrv_vacated(nfsd, slp, procp, mrq) - struct nfsrv_descript *nfsd; - struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; -{ - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; - caddr_t dpos = nfsd->nd_dpos; - register struct nqlease *lp; - register struct nqhost *lph; - struct nqlease *tlp = (struct nqlease *)0; - nfsfh_t nfh; - fhandle_t *fhp; - register u_long *tl; - register long t1; - struct nqm *lphnext; - struct mbuf *mreq, *mb; - int error = 0, i, len, ok, gotit = 0, cache = 0; - char *cp2, *bpos; - u_quad_t frev; - - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); - m_freem(mrep); - /* - * Find the lease by searching the hash list. 
- */ - for (lp = NQFHHASH(fhp->fh_fid.fid_data)->lh_first; lp != 0; - lp = lp->lc_hash.le_next) - if (fhp->fh_fsid.val[0] == lp->lc_fsid.val[0] && - fhp->fh_fsid.val[1] == lp->lc_fsid.val[1] && - !bcmp(fhp->fh_fid.fid_data, lp->lc_fiddata, - MAXFIDSZ)) { - /* Found it */ - tlp = lp; - break; - } - if (tlp) { - lp = tlp; - len = 1; - i = 0; - lph = &lp->lc_host; - lphnext = lp->lc_morehosts; - ok = 1; - while (ok && (lph->lph_flag & LC_VALID)) { - if (nqsrv_cmpnam(slp, nam, lph)) { - lph->lph_flag |= LC_VACATED; - gotit++; - break; - } - if (++i == len) { - if (lphnext) { - len = LC_MOREHOSTSIZ; - i = 0; - lph = lphnext->lpm_hosts; - lphnext = lphnext->lpm_next; - } else - ok = 0; - } else - lph++; - } - if ((lp->lc_flag & LC_EXPIREDWANTED) && gotit) { - lp->lc_flag &= ~LC_EXPIREDWANTED; - wakeup((caddr_t)&lp->lc_flag); - } -nfsmout: - return (EPERM); - } - return (EPERM); -} - -#endif /* NFS_NOSERVER */ - -/* - * Client get lease rpc function. - */ -int -nqnfs_getlease(vp, rwflag, cred, p) - register struct vnode *vp; - int rwflag; - struct ucred *cred; - struct proc *p; -{ - register u_long *tl; - register caddr_t cp; - register long t1, t2; - register struct nfsnode *np; - struct nfsmount *nmp; - caddr_t bpos, dpos, cp2; - struct timeval now; - time_t reqtime; - int error = 0; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; - int cachable; - u_quad_t frev; - u_int64_t xid; - - nmp = VFSTONFS(vp->v_mount); - if (!nmp) - return (ENXIO); - - nfsstats.rpccnt[NQNFSPROC_GETLEASE]++; - mb = mreq = nfsm_reqh(vp, NQNFSPROC_GETLEASE, NFSX_V3FH+2*NFSX_UNSIGNED, - &bpos); - nfsm_fhtom(vp, 1); - nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); - *tl++ = txdr_unsigned(rwflag); - *tl = txdr_unsigned(nmp->nm_leaseterm); - microtime(&now); - reqtime = now.tv_sec; - nfsm_request(vp, NQNFSPROC_GETLEASE, p, cred, &xid); - np = VTONFS(vp); - nfsm_dissect(tl, u_long *, 4 * NFSX_UNSIGNED); - cachable = fxdr_unsigned(int, *tl++); - reqtime += fxdr_unsigned(int, *tl++); - microtime(&now); - if (reqtime > now.tv_sec) { - nmp = VFSTONFS(vp->v_mount); - if (!nmp) { - error = ENXIO; - } else { - fxdr_hyper(tl, &frev); - nqnfs_clientlease(nmp, np, rwflag, cachable, - reqtime, frev); - nfsm_loadattr(vp, (struct vattr *)0, &xid); - } - } else - error = NQNFS_EXPIRED; - nfsm_reqdone; - return (error); -} - -/* - * Client vacated message function. 
- */ -static int -nqnfs_vacated(vp, cred) - register struct vnode *vp; - struct ucred *cred; -{ - register caddr_t cp; - register struct mbuf *m; - register int i; - register u_long *tl; - register long t2; - caddr_t bpos; - u_long xid; - int error = 0; - struct mbuf *mreq, *mb, *mb2, *mheadend; - struct nfsmount *nmp; - struct nfsreq myrep; - int connrequired; - int *flagp; - - nmp = VFSTONFS(vp->v_mount); - if (!nmp) - return (ENXIO); - nfsstats.rpccnt[NQNFSPROC_VACATED]++; - nfsm_reqhead(vp, NQNFSPROC_VACATED, NFSX_FH(1)); - nfsm_fhtom(vp, 1); - m = mreq; - i = 0; - while (m) { - i += m->m_len; - m = m->m_next; - } - m = nfsm_rpchead(cred, nmp->nm_flag, NQNFSPROC_VACATED, - RPCAUTH_UNIX, 5 * NFSX_UNSIGNED, (char *)0, - 0, (char *)NULL, mreq, i, &mheadend, &xid); - if (nmp->nm_sotype == SOCK_STREAM) { - M_PREPEND(m, NFSX_UNSIGNED, M_WAIT); - *mtod(m, u_long *) = htonl(0x80000000 | (m->m_pkthdr.len - - NFSX_UNSIGNED)); - } - myrep.r_flags = 0; - myrep.r_nmp = nmp; - - connrequired = (nmp->nm_soflags & PR_CONNREQUIRED); - if (connrequired) - (void) nfs_sndlock(&myrep); - - (void) nfs_send(nmp->nm_so, nmp->nm_nam, m, &myrep); - - if (connrequired) - nfs_sndunlock(&myrep); -nfsmout: - return (error); -} - -#ifndef NFS_NOSERVER - -/* - * Called for client side callbacks - */ -int -nqnfs_callback(nmp, mrep, md, dpos) - struct nfsmount *nmp; - struct mbuf *mrep, *md; - caddr_t dpos; -{ - register struct vnode *vp; - register u_long *tl; - register long t1; - nfsfh_t nfh; - fhandle_t *fhp; - struct nfsnode *np; - struct nfsd tnfsd; - struct nfssvc_sock *slp; - struct nfsrv_descript ndesc; - register struct nfsrv_descript *nfsd = &ndesc; - struct mbuf **mrq = (struct mbuf **)0, *mb, *mreq; - int error = 0, cache = 0; - char *cp2, *bpos; - u_quad_t frev; - -#ifndef nolint - slp = NULL; -#endif - nfsd->nd_mrep = mrep; - nfsd->nd_md = md; - nfsd->nd_dpos = dpos; - error = nfs_getreq(nfsd, &tnfsd, FALSE); - if (error) - return (error); - md = nfsd->nd_md; - dpos = nfsd->nd_dpos; - if (nfsd->nd_procnum != NQNFSPROC_EVICTED) { - m_freem(mrep); - return (EPERM); - } - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); - m_freem(mrep); - error = nfs_nget(nmp->nm_mountp, (nfsfh_t *)fhp, NFSX_V3FH, &np); - if (error) - return (error); - vp = NFSTOV(np); - if (np->n_timer.cqe_next != 0) { - np->n_expiry = 0; - np->n_flag |= NQNFSEVICTED; - if (nmp->nm_timerhead.cqh_first != np) { - CIRCLEQ_REMOVE(&nmp->nm_timerhead, np, n_timer); - CIRCLEQ_INSERT_HEAD(&nmp->nm_timerhead, np, n_timer); - } - } - vput(vp); - nfsm_srvdone; -} - - -/* - * Nqnfs client helper daemon. Runs once a second to expire leases. - * It also get authorization strings for "kerb" mounts. - * It must start at the beginning of the list again after any potential - * "sleep" since nfs_reclaim() called from vclean() can pull a node off - * the list asynchronously. - */ -int -nqnfs_clientd(nmp, cred, ncd, flag, argp, p) - register struct nfsmount *nmp; - struct ucred *cred; - struct nfsd_cargs *ncd; - int flag; - caddr_t argp; - struct proc *p; -{ - register struct nfsnode *np; - struct vnode *vp; - struct nfsreq myrep; - struct nfsuid *nuidp, *nnuidp; - int error = 0, vpid; - register struct nfsreq *rp; - struct timeval now; - - /* - * First initialize some variables - */ - microtime(&now); - - /* - * If an authorization string is being passed in, get it. 
- */ - if ((flag & NFSSVC_GOTAUTH) && - (nmp->nm_state & (NFSSTA_WAITAUTH | NFSSTA_DISMNT)) == 0) { - if (nmp->nm_state & NFSSTA_HASAUTH) - panic("cld kerb"); - if ((flag & NFSSVC_AUTHINFAIL) == 0) { - if (ncd->ncd_authlen <= nmp->nm_authlen && - ncd->ncd_verflen <= nmp->nm_verflen && - !copyin(ncd->ncd_authstr,nmp->nm_authstr,ncd->ncd_authlen)&& - !copyin(ncd->ncd_verfstr,nmp->nm_verfstr,ncd->ncd_verflen)){ - nmp->nm_authtype = ncd->ncd_authtype; - nmp->nm_authlen = ncd->ncd_authlen; - nmp->nm_verflen = ncd->ncd_verflen; -#if NFSKERB - nmp->nm_key = ncd->ncd_key; -#endif - } else - nmp->nm_state |= NFSSTA_AUTHERR; - } else - nmp->nm_state |= NFSSTA_AUTHERR; - nmp->nm_state |= NFSSTA_HASAUTH; - wakeup((caddr_t)&nmp->nm_authlen); - } else - nmp->nm_state |= NFSSTA_WAITAUTH; - - /* - * Loop every second updating queue until there is a termination sig. - */ - while ((nmp->nm_state & NFSSTA_DISMNT) == 0) { - if (nmp->nm_flag & NFSMNT_NQNFS) { - /* - * If there are no outstanding requests (and therefore no - * processes in nfs_reply) and there is data in the receive - * queue, poke for callbacks. - */ - if (nfs_reqq.tqh_first == 0 && nmp->nm_so && - nmp->nm_so->so_rcv.sb_cc > 0) { - myrep.r_flags = R_GETONEREP; - myrep.r_nmp = nmp; - myrep.r_mrep = (struct mbuf *)0; - myrep.r_procp = (struct proc *)0; - (void) nfs_reply(&myrep); - } - - /* - * Loop through the leases, updating as required. - */ - np = nmp->nm_timerhead.cqh_first; - while (np != (void *)&nmp->nm_timerhead && - (nmp->nm_state & NFSSTA_DISMINPROG) == 0) { - vp = NFSTOV(np); - vpid = vp->v_id; - if (np->n_expiry < now.tv_sec) { - if (vget(vp, LK_EXCLUSIVE, p) == 0) { - nmp->nm_inprog = vp; - if (vpid == vp->v_id) { - CIRCLEQ_REMOVE(&nmp->nm_timerhead, np, n_timer); - np->n_timer.cqe_next = 0; - if (np->n_flag & (NMODIFIED | NQNFSEVICTED)) { - if (np->n_flag & NQNFSEVICTED) { - if (vp->v_type == VDIR) - nfs_invaldir(vp); - cache_purge(vp); - (void) nfs_vinvalbuf(vp, - V_SAVE, cred, p, 0); - np->n_flag &= ~NQNFSEVICTED; - (void) nqnfs_vacated(vp, cred); - } else if (vp->v_type == VREG) { - (void) VOP_FSYNC(vp, cred, - MNT_WAIT, p); - np->n_flag &= ~NMODIFIED; - } - } - } - vrele(vp); - nmp->nm_inprog = NULLVP; - } - } else if ((np->n_expiry - NQ_RENEWAL) < now.tv_sec) { - if ((np->n_flag & (NQNFSWRITE | NQNFSNONCACHE)) - == NQNFSWRITE && np->n_dirtyblkhd.lh_first && - vget(vp, LK_EXCLUSIVE, p) == 0) { - nmp->nm_inprog = vp; - if (vpid == vp->v_id && - nqnfs_getlease(vp, ND_WRITE, cred, p)==0) - np->n_brev = np->n_lrev; - vrele(vp); - nmp->nm_inprog = NULLVP; - } - } else - break; - if (np == nmp->nm_timerhead.cqh_first) - break; - np = nmp->nm_timerhead.cqh_first; - } - } - - /* - * Get an authorization string, if required. - */ - if ((nmp->nm_state & (NFSSTA_WAITAUTH | NFSSTA_DISMNT | NFSSTA_HASAUTH)) == 0) { - ncd->ncd_authuid = nmp->nm_authuid; - if (copyout((caddr_t)ncd, argp, sizeof (struct nfsd_cargs))) - nmp->nm_state |= NFSSTA_WAITAUTH; - else - return (ENEEDAUTH); - } - - /* - * Wait a bit (no pun) and do it again. - */ - if ((nmp->nm_state & NFSSTA_DISMNT) == 0 && - (nmp->nm_state & (NFSSTA_WAITAUTH | NFSSTA_HASAUTH))) { - error = tsleep((caddr_t)&nmp->nm_authstr, PSOCK | PCATCH, - "nqnfstimr", hz / 3); - if (error == EINTR || error == ERESTART) - (void) dounmount(nmp->nm_mountp, 0, p); - } - } - - /* - * Finally, we can free up the mount structure. 
- */ - for (nuidp = nmp->nm_uidlruhead.tqh_first; nuidp != 0; nuidp = nnuidp) { - nnuidp = nuidp->nu_lru.tqe_next; - LIST_REMOVE(nuidp, nu_hash); - TAILQ_REMOVE(&nmp->nm_uidlruhead, nuidp, nu_lru); - FREE_ZONE((caddr_t)nuidp, sizeof (struct nfsuid), M_NFSUID); - } - /* - * Loop through outstanding request list and remove dangling - * references to defunct nfsmount struct - */ - for (rp = nfs_reqq.tqh_first; rp; rp = rp->r_chain.tqe_next) - if (rp->r_nmp == nmp) - rp->r_nmp = (struct nfsmount *)0; - /* Need to wake up any rcvlock waiters so they notice the unmount. */ - if (nmp->nm_state & NFSSTA_WANTRCV) { - nmp->nm_state &= ~NFSSTA_WANTRCV; - wakeup(&nmp->nm_state); - } - FREE_ZONE((caddr_t)nmp, sizeof (struct nfsmount), M_NFSMNT); - if (error == EWOULDBLOCK) - error = 0; - return (error); -} - -#endif /* NFS_NOSERVER */ - -/* - * Adjust all timer queue expiry times when the time of day clock is changed. - * Called from the settimeofday() syscall. - */ -void -nqnfs_lease_updatetime(deltat) - register int deltat; -{ - struct proc *p = current_proc(); /* XXX */ - struct nqlease *lp; - struct nfsnode *np; - struct mount *mp, *nxtmp; - struct nfsmount *nmp; - int s; - - if (nqnfsstarttime != 0) - nqnfsstarttime += deltat; - s = splsoftclock(); - for (lp = nqtimerhead.cqh_first; lp != (void *)&nqtimerhead; - lp = lp->lc_timer.cqe_next) - lp->lc_expiry += deltat; - splx(s); - - /* - * Search the mount list for all nqnfs mounts and do their timer - * queues. - */ - simple_lock(&mountlist_slock); - for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nxtmp) { - if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { - nxtmp = mp->mnt_list.cqe_next; - continue; - } - if (mp->mnt_stat.f_type == nfs_mount_type) { - nmp = VFSTONFS(mp); - if (nmp->nm_flag & NFSMNT_NQNFS) { - for (np = nmp->nm_timerhead.cqh_first; - np != (void *)&nmp->nm_timerhead; - np = np->n_timer.cqe_next) { - np->n_expiry += deltat; - } - } - } - simple_lock(&mountlist_slock); - nxtmp = mp->mnt_list.cqe_next; - vfs_unbusy(mp, p); - } - simple_unlock(&mountlist_slock); -} - -/* - * Lock a server lease. - */ -static void -nqsrv_locklease(lp) - struct nqlease *lp; -{ - - while (lp->lc_flag & LC_LOCKED) { - lp->lc_flag |= LC_WANTED; - (void) tsleep((caddr_t)lp, PSOCK, "nqlc", 0); - } - lp->lc_flag |= LC_LOCKED; - lp->lc_flag &= ~LC_WANTED; -} - -/* - * Unlock a server lease. - */ -static void -nqsrv_unlocklease(lp) - struct nqlease *lp; -{ - - lp->lc_flag &= ~LC_LOCKED; - if (lp->lc_flag & LC_WANTED) - wakeup((caddr_t)lp); -} - -/* - * Update a client lease. 
- */ -void -nqnfs_clientlease(nmp, np, rwflag, cachable, expiry, frev) - register struct nfsmount *nmp; - register struct nfsnode *np; - int rwflag, cachable; - time_t expiry; - u_quad_t frev; -{ - register struct nfsnode *tp; - - if (np->n_timer.cqe_next != 0) { - CIRCLEQ_REMOVE(&nmp->nm_timerhead, np, n_timer); - if (rwflag == ND_WRITE) - np->n_flag |= NQNFSWRITE; - } else if (rwflag == ND_READ) - np->n_flag &= ~NQNFSWRITE; - else - np->n_flag |= NQNFSWRITE; - if (cachable) - np->n_flag &= ~NQNFSNONCACHE; - else - np->n_flag |= NQNFSNONCACHE; - np->n_expiry = expiry; - np->n_lrev = frev; - tp = nmp->nm_timerhead.cqh_last; - while (tp != (void *)&nmp->nm_timerhead && tp->n_expiry > np->n_expiry) - tp = tp->n_timer.cqe_prev; - if (tp == (void *)&nmp->nm_timerhead) { - CIRCLEQ_INSERT_HEAD(&nmp->nm_timerhead, np, n_timer); - } else { - CIRCLEQ_INSERT_AFTER(&nmp->nm_timerhead, tp, np, n_timer); - } -} diff --git a/bsd/nfs/nfs_serv.c b/bsd/nfs/nfs_serv.c index 271a85b6f..7f513b015 100644 --- a/bsd/nfs/nfs_serv.c +++ b/bsd/nfs/nfs_serv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -68,7 +68,7 @@ * 3 - build the rpc reply in an mbuf list * nb: * - do not mix the phases, since the nfsm_?? macros can return failures - * on a bad rpc or similar and do not do any vrele() or vput()'s + * on a bad rpc or similar and do not do any vnode_rele()s or vnode_put()s * * - the nfsm_reply() macro generates an nfs rpc reply with the nfs * error number iff error != 0 whereas @@ -82,30 +82,31 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> -#include <sys/namei.h> +#include <sys/kauth.h> #include <sys/unistd.h> #include <sys/malloc.h> #include <sys/vnode.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> #include <sys/socket.h> #include <sys/socketvar.h> -#include <sys/mbuf.h> +#include <sys/kpi_mbuf.h> #include <sys/dirent.h> #include <sys/stat.h> #include <sys/kernel.h> #include <sys/sysctl.h> #include <sys/ubc.h> +#include <sys/vnode_internal.h> +#include <sys/uio_internal.h> +#include <libkern/OSAtomic.h> #include <sys/vm.h> #include <sys/vmparam.h> -#include <machine/spl.h> #include <nfs/nfsproto.h> #include <nfs/rpcv2.h> #include <nfs/nfs.h> #include <nfs/xdr_subs.h> #include <nfs/nfsm_subs.h> -#include <nfs/nqnfs.h> nfstype nfsv3_type[9] = { NFNON, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, NFSOCK, NFFIFO, NFNON }; @@ -127,10 +128,11 @@ int nfs_async = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, async, CTLFLAG_RW, &nfs_async, 0, ""); #endif -static int nfsrv_access __P((struct vnode *,int,struct ucred *,int, - struct proc *, int)); -static void nfsrvw_coalesce __P((struct nfsrv_descript *, - struct nfsrv_descript *)); +static int nfsrv_authorize(vnode_t,vnode_t,kauth_action_t,vfs_context_t,struct nfs_export_options*,int); +static void nfsrvw_coalesce(struct nfsrv_descript *, struct nfsrv_descript *); + +#define THREAD_SAFE_FS(VP) \ + ((VP)->v_mount ? 
(VP)->v_mount->mnt_vtable->vfc_threadsafe : 0) /* * nfs v3 access service @@ -139,64 +141,126 @@ int nfsrv3_access(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - struct vnode *vp; - nfsfh_t nfh; - fhandle_t *fhp; - register u_long *tl; - register long t1; + vnode_t vp, dvp; + struct nfs_filehandle nfh; + u_long *tl; + long t1; caddr_t bpos; - int error = 0, rdonly, cache, getret; + int error = 0, getret; char *cp2; - struct mbuf *mb, *mreq, *mb2; - struct vattr vattr, *vap = &vattr; - u_long testmode, nfsmode; - u_quad_t frev; + mbuf_t mb, mreq, mb2; + struct vnode_attr vattr, *vap = &vattr; + u_long nfsmode; + kauth_action_t testaction; + struct vfs_context context; + struct nfs_export *nx; + struct nfs_export_options *nxo; -#ifndef nolint - cache = 0; -#endif - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); + nfsm_srvmtofh(&nfh); nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); - if ((error = nfsrv_fhtovp(fhp, 1, &vp, cred, slp, nam, - &rdonly, (nfsd->nd_flag & ND_KERBAUTH), TRUE))) { + if ((error = nfsrv_fhtovp(&nfh, nam, TRUE, &vp, &nx, &nxo))) { + nfsm_reply(NFSX_UNSIGNED); + nfsm_srvpostop_attr(1, NULL); + return (0); + } + if ((error = nfsrv_credcheck(nfsd, nx, nxo))) { + vnode_put(vp); nfsm_reply(NFSX_UNSIGNED); - nfsm_srvpostop_attr(1, (struct vattr *)0); + nfsm_srvpostop_attr(1, NULL); return (0); } nfsmode = fxdr_unsigned(u_long, *tl); - if ((nfsmode & NFSV3ACCESS_READ) && - nfsrv_access(vp, VREAD, cred, rdonly, procp, 0)) - nfsmode &= ~NFSV3ACCESS_READ; - if (vp->v_type == VDIR) - testmode = (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND | - NFSV3ACCESS_DELETE); - else - testmode = (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND); - if ((nfsmode & testmode) && - nfsrv_access(vp, VWRITE, cred, rdonly, procp, 0)) - nfsmode &= ~testmode; - if (vp->v_type == VDIR) - testmode = NFSV3ACCESS_LOOKUP; - else - testmode = NFSV3ACCESS_EXECUTE; - if ((nfsmode & testmode) && - nfsrv_access(vp, VEXEC, cred, rdonly, procp, 0)) - nfsmode &= ~testmode; - getret = VOP_GETATTR(vp, vap, cred, procp); - vput(vp); + + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; + + /* + * Each NFS mode bit is tested separately. + * + * XXX this code is nominally correct, but returns a pessimistic + * rather than optimistic result. It will be necessary to add + * an NFS-specific interface to the vnode_authorize code to + * obtain good performance in the optimistic mode. 
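/*
 * The common preamble this patch adds to every NFS server routine:
 * translate the file handle to a vnode, then check the export options
 * against the request's credential.  Condensed sketch with the
 * nfsm_reply()/postop-attr error macros omitted; nfh is assumed to
 * have been decoded from the rpc by nfsm_srvmtofh().
 */
	struct nfs_filehandle nfh;
	struct nfs_export *nx;
	struct nfs_export_options *nxo;
	vnode_t vp;
	int error;

	if ((error = nfsrv_fhtovp(&nfh, nam, TRUE, &vp, &nx, &nxo)))
		return (error);		/* reply with error, no attributes */
	if ((error = nfsrv_credcheck(nfsd, nx, nxo))) {
		vnode_put(vp);		/* fhtovp returned an iocount */
		return (error);
	}
	/* ... service-specific work on vp ... */
	vnode_put(vp);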
+ */ + if (nfsmode & NFSV3ACCESS_READ) { + if (vnode_isdir(vp)) { + testaction = + KAUTH_VNODE_LIST_DIRECTORY | + KAUTH_VNODE_READ_EXTATTRIBUTES; + } else { + testaction = + KAUTH_VNODE_READ_DATA | + KAUTH_VNODE_READ_EXTATTRIBUTES; + } + if (nfsrv_authorize(vp, NULL, testaction, &context, nxo, 0)) + nfsmode &= ~NFSV3ACCESS_READ; + } + if ((nfsmode & NFSV3ACCESS_LOOKUP) && + (!vnode_isdir(vp) || + nfsrv_authorize(vp, NULL, KAUTH_VNODE_SEARCH, &context, nxo, 0))) + nfsmode &= ~NFSV3ACCESS_LOOKUP; + if (nfsmode & NFSV3ACCESS_MODIFY) { + if (vnode_isdir(vp)) { + testaction = + KAUTH_VNODE_ADD_FILE | + KAUTH_VNODE_ADD_SUBDIRECTORY | + KAUTH_VNODE_DELETE_CHILD; + } else { + testaction = + KAUTH_VNODE_WRITE_DATA | + KAUTH_VNODE_WRITE_ATTRIBUTES | + KAUTH_VNODE_WRITE_EXTATTRIBUTES | + KAUTH_VNODE_WRITE_SECURITY; + } + if (nfsrv_authorize(vp, NULL, testaction, &context, nxo, 0)) + nfsmode &= ~NFSV3ACCESS_MODIFY; + } + if (nfsmode & NFSV3ACCESS_EXTEND) { + if (vnode_isdir(vp)) { + testaction = + KAUTH_VNODE_ADD_FILE | + KAUTH_VNODE_ADD_SUBDIRECTORY; + } else { + testaction = + KAUTH_VNODE_WRITE_DATA | + KAUTH_VNODE_APPEND_DATA; + } + if (nfsrv_authorize(vp, NULL, testaction, &context, nxo, 0)) + nfsmode &= ~NFSV3ACCESS_EXTEND; + } + dvp = NULLVP; + /* + * For hard links, this answer may be wrong if the vnode + * has multiple parents with different permissions. + */ + if ((nfsmode & NFSV3ACCESS_DELETE) && + (((dvp = vnode_getparent(vp)) == NULL) || + nfsrv_authorize(vp, dvp, KAUTH_VNODE_DELETE, &context, nxo, 0))) { + nfsmode &= ~NFSV3ACCESS_DELETE; + } + if (dvp != NULLVP) + vnode_put(dvp); + + if ((nfsmode & NFSV3ACCESS_EXECUTE) && + (vnode_isdir(vp) || + nfsrv_authorize(vp, NULL, KAUTH_VNODE_EXECUTE, &context, nxo, 0))) + nfsmode &= ~NFSV3ACCESS_EXECUTE; + + nfsm_srv_vattr_init(vap, 1); + getret = vnode_getattr(vp, vap, &context); + vnode_put(vp); nfsm_reply(NFSX_POSTOPATTR(1) + NFSX_UNSIGNED); nfsm_srvpostop_attr(getret, vap); nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = txdr_unsigned(nfsmode); - nfsm_srvdone; +nfsmout: + return (error); } /* @@ -206,43 +270,51 @@ int nfsrv_getattr(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - register struct nfs_fattr *fp; - struct vattr va; - register struct vattr *vap = &va; - struct vnode *vp; - nfsfh_t nfh; - fhandle_t *fhp; - register u_long *tl; - register long t1; + struct nfs_fattr *fp; + struct vnode_attr va; + struct vnode_attr *vap = &va; + vnode_t vp; + struct nfs_filehandle nfh; + u_long *tl; + long t1; caddr_t bpos; - int error = 0, rdonly, cache; + int error = 0; char *cp2; - struct mbuf *mb, *mb2, *mreq; - u_quad_t frev; + mbuf_t mb, mb2, mreq; + struct vfs_context context; + struct nfs_export *nx; + struct nfs_export_options *nxo; + int v3 = (nfsd->nd_flag & ND_NFSV3); - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); - if ((error = nfsrv_fhtovp(fhp, 1, &vp, cred, slp, nam, - &rdonly, (nfsd->nd_flag & ND_KERBAUTH), TRUE))) { + nfsm_srvmtofh(&nfh); + if ((error = nfsrv_fhtovp(&nfh, nam, TRUE, &vp, &nx, &nxo))) { nfsm_reply(0); return (0); } - nqsrv_getl(vp, ND_READ); - error = VOP_GETATTR(vp, vap, cred, procp); - vput(vp); - nfsm_reply(NFSX_FATTR(nfsd->nd_flag & ND_NFSV3)); + if ((error = 
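/*
 * Summary of the NFSv3 ACCESS to kauth mapping implemented above
 * (KAUTH_VNODE_ prefixes elided; "directory case / file case"):
 *
 *   NFSV3ACCESS_READ     LIST_DIRECTORY|READ_EXTATTRIBUTES
 *                          / READ_DATA|READ_EXTATTRIBUTES
 *   NFSV3ACCESS_LOOKUP   SEARCH  / denied on non-directories
 *   NFSV3ACCESS_MODIFY   ADD_FILE|ADD_SUBDIRECTORY|DELETE_CHILD
 *                          / WRITE_DATA|WRITE_ATTRIBUTES|
 *                            WRITE_EXTATTRIBUTES|WRITE_SECURITY
 *   NFSV3ACCESS_EXTEND   ADD_FILE|ADD_SUBDIRECTORY
 *                          / WRITE_DATA|APPEND_DATA
 *   NFSV3ACCESS_DELETE   DELETE, checked against the parent vnode
 *                          obtained via vnode_getparent()
 *   NFSV3ACCESS_EXECUTE  denied on directories / EXECUTE
 */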
nfsrv_credcheck(nfsd, nx, nxo))) { + vnode_put(vp); + nfsm_reply(0); + return (0); + } + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; + + nfsm_srv_vattr_init(vap, v3); + error = vnode_getattr(vp, vap, &context); + vnode_put(vp); + nfsm_reply(NFSX_FATTR(v3)); if (error) return (0); - nfsm_build(fp, struct nfs_fattr *, NFSX_FATTR(nfsd->nd_flag & ND_NFSV3)); + nfsm_build(fp, struct nfs_fattr *, NFSX_FATTR(v3)); nfsm_srvfillattr(vap, fp); - nfsm_srvdone; +nfsmout: + return (error); } /* @@ -252,33 +324,36 @@ int nfsrv_setattr(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - struct vattr va, preat; - register struct vattr *vap = &va; - register struct nfsv2_sattr *sp; - register struct nfs_fattr *fp; - struct vnode *vp; - nfsfh_t nfh; - fhandle_t *fhp; - register u_long *tl; - register long t1; + struct vnode_attr preat; + struct vnode_attr postat; + struct vnode_attr va; + struct vnode_attr *vap = &va; + struct nfsv2_sattr *sp; + struct nfs_fattr *fp; + vnode_t vp; + struct nfs_filehandle nfh; + struct nfs_export *nx; + struct nfs_export_options *nxo; + u_long *tl; + long t1; caddr_t bpos; - int error = 0, rdonly, cache, preat_ret = 1, postat_ret = 1; + int error = 0, preat_ret = 1, postat_ret = 1; int v3 = (nfsd->nd_flag & ND_NFSV3), gcheck = 0; char *cp2; - struct mbuf *mb, *mb2, *mreq; - u_quad_t frev; + mbuf_t mb, mb2, mreq; struct timespec guard; + struct vfs_context context; + kauth_action_t action; + uid_t saved_uid; - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); - VATTR_NULL(vap); + nfsm_srvmtofh(&nfh); + VATTR_INIT(vap); if (v3) { nfsm_srvsattr(vap); nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); @@ -297,83 +372,99 @@ nfsrv_setattr(nfsd, slp, procp, mrq) * --> check the low order 2 bytes for 0xffff */ if ((fxdr_unsigned(int, sp->sa_mode) & 0xffff) != 0xffff) - vap->va_mode = nfstov_mode(sp->sa_mode); + VATTR_SET(vap, va_mode, nfstov_mode(sp->sa_mode)); if (sp->sa_uid != nfs_xdrneg1) - vap->va_uid = fxdr_unsigned(uid_t, sp->sa_uid); + VATTR_SET(vap, va_uid, fxdr_unsigned(uid_t, sp->sa_uid)); if (sp->sa_gid != nfs_xdrneg1) - vap->va_gid = fxdr_unsigned(gid_t, sp->sa_gid); + VATTR_SET(vap, va_gid, fxdr_unsigned(gid_t, sp->sa_gid)); if (sp->sa_size != nfs_xdrneg1) - vap->va_size = fxdr_unsigned(u_quad_t, sp->sa_size); + VATTR_SET(vap, va_data_size, fxdr_unsigned(u_quad_t, sp->sa_size)); if (sp->sa_atime.nfsv2_sec != nfs_xdrneg1) { -#ifdef notyet - fxdr_nfsv2time(&sp->sa_atime, &vap->va_atime); -#else - vap->va_atime.tv_sec = - fxdr_unsigned(long, sp->sa_atime.nfsv2_sec); - vap->va_atime.tv_nsec = 0; -#endif + fxdr_nfsv2time(&sp->sa_atime, &vap->va_access_time); + VATTR_SET_ACTIVE(vap, va_access_time); + } + if (sp->sa_mtime.nfsv2_sec != nfs_xdrneg1) { + fxdr_nfsv2time(&sp->sa_mtime, &vap->va_modify_time); + VATTR_SET_ACTIVE(vap, va_modify_time); } - if (sp->sa_mtime.nfsv2_sec != nfs_xdrneg1) - fxdr_nfsv2time(&sp->sa_mtime, &vap->va_mtime); - } + /* + * Save the original credential UID in case they are + * mapped and we need to map the IDs in the attributes. + */ + saved_uid = kauth_cred_getuid(nfsd->nd_cr); + /* * Now that we have all the fields, lets do it. 
*/ - if ((error = nfsrv_fhtovp(fhp, 1, &vp, cred, slp, nam, - &rdonly, (nfsd->nd_flag & ND_KERBAUTH), TRUE))) { + if ((error = nfsrv_fhtovp(&nfh, nam, TRUE, &vp, &nx, &nxo))) { nfsm_reply(2 * NFSX_UNSIGNED); - nfsm_srvwcc_data(preat_ret, &preat, postat_ret, vap); + nfsm_srvwcc_data(preat_ret, &preat, postat_ret, &postat); return (0); } - nqsrv_getl(vp, ND_WRITE); + if ((error = nfsrv_credcheck(nfsd, nx, nxo))) { + vnode_put(vp); + nfsm_reply(2 * NFSX_UNSIGNED); + nfsm_srvwcc_data(preat_ret, &preat, postat_ret, &postat); + return (0); + } + + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; + if (v3) { - error = preat_ret = VOP_GETATTR(vp, &preat, cred, procp); - if (!error && gcheck && - (preat.va_ctime.tv_sec != guard.tv_sec || - preat.va_ctime.tv_nsec != guard.tv_nsec)) + nfsm_srv_pre_vattr_init(&preat, v3); + error = preat_ret = vnode_getattr(vp, &preat, &context); + if (!error && gcheck && VATTR_IS_SUPPORTED(&preat, va_change_time) && + (preat.va_change_time.tv_sec != guard.tv_sec || + preat.va_change_time.tv_nsec != guard.tv_nsec)) error = NFSERR_NOT_SYNC; + if (!preat_ret && !VATTR_ALL_SUPPORTED(&preat)) + preat_ret = 1; if (error) { - vput(vp); + vnode_put(vp); nfsm_reply(NFSX_WCCDATA(v3)); - nfsm_srvwcc_data(preat_ret, &preat, postat_ret, vap); + nfsm_srvwcc_data(preat_ret, &preat, postat_ret, &postat); return (0); } } /* - * If the size is being changed write acces is required, otherwise - * just check for a read only file system. + * If the credentials were mapped, we should + * map the same values in the attributes. */ - if (vap->va_size == ((u_quad_t)((quad_t) -1))) { - if (rdonly || (vp->v_mount->mnt_flag & MNT_RDONLY)) { - error = EROFS; - goto out; - } - } else { - if (vp->v_type == VDIR) { - error = EISDIR; - goto out; - } else if ((error = nfsrv_access(vp, VWRITE, cred, rdonly, - procp, 0))) - goto out; + if ((vap->va_uid == saved_uid) && (kauth_cred_getuid(nfsd->nd_cr) != saved_uid)) { + int ismember; + VATTR_SET(vap, va_uid, kauth_cred_getuid(nfsd->nd_cr)); + if (kauth_cred_ismember_gid(nfsd->nd_cr, vap->va_gid, &ismember) || !ismember) + VATTR_SET(vap, va_gid, kauth_cred_getgid(nfsd->nd_cr)); } - error = VOP_SETATTR(vp, vap, cred, procp); - postat_ret = VOP_GETATTR(vp, vap, cred, procp); + + /* + * Authorize the attribute changes. 
+ */ + if (((error = vnode_authattr(vp, vap, &action, &context))) || + ((error = nfsrv_authorize(vp, NULL, action, &context, nxo, 0)))) + goto out; + error = vnode_setattr(vp, vap, &context); + + nfsm_srv_vattr_init(&postat, v3); + postat_ret = vnode_getattr(vp, &postat, &context); if (!error) error = postat_ret; out: - vput(vp); + vnode_put(vp); nfsm_reply(NFSX_WCCORFATTR(v3)); if (v3) { - nfsm_srvwcc_data(preat_ret, &preat, postat_ret, vap); + nfsm_srvwcc_data(preat_ret, &preat, postat_ret, &postat); return (0); } else { nfsm_build(fp, struct nfs_fattr *, NFSX_V2FATTR); - nfsm_srvfillattr(vap, fp); + nfsm_srvfillattr(&postat, fp); } - nfsm_srvdone; +nfsmout: + return (error); } /* @@ -383,70 +474,73 @@ int nfsrv_lookup(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - register struct nfs_fattr *fp; + struct nfs_fattr *fp; struct nameidata nd, *ndp = &nd; -#ifdef notdef +/* XXX Revisit when enabling WebNFS */ +#ifdef WEBNFS_ENABLED struct nameidata ind; #endif - struct vnode *vp, *dirp; - nfsfh_t nfh; - fhandle_t *fhp; - register caddr_t cp; - register u_long *tl; - register long t1; + vnode_t vp, dirp = NULL; + struct nfs_filehandle dnfh, nfh; + struct nfs_export *nx; + struct nfs_export_options *nxo; + caddr_t cp; + u_long *tl; + long t1; caddr_t bpos; - int error = 0, cache, len, dirattr_ret = 1; + int error = 0, len, dirattr_ret = 1, isdotdot; int v3 = (nfsd->nd_flag & ND_NFSV3), pubflag; char *cp2; - struct mbuf *mb, *mb2, *mreq; - struct vattr va, dirattr, *vap = &va; - u_quad_t frev; + mbuf_t mb, mb2, mreq; + struct vnode_attr va, dirattr, *vap = &va; + struct vfs_context context; + + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); - nfsm_srvnamesiz(len); + nfsm_srvmtofh(&dnfh); + nfsm_srvnamesiz(len, v3); - pubflag = nfs_ispublicfh(fhp); + pubflag = nfs_ispublicfh(&dnfh); - nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameiop = LOOKUP; - nd.ni_cnd.cn_flags = LOCKLEAF | SAVESTART; - error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos, - &dirp, procp, (nfsd->nd_flag & ND_KERBAUTH), pubflag); + nd.ni_cnd.cn_flags = LOCKLEAF; + error = nfsm_path_mbuftond(&md, &dpos, v3, pubflag, &len, &nd); + isdotdot = ((len == 2) && (nd.ni_cnd.cn_pnbuf[0] == '.') && (nd.ni_cnd.cn_pnbuf[1] == '.')); + if (!error) + error = nfs_namei(nfsd, &context, &nd, &dnfh, nam, pubflag, &dirp, &nx, &nxo); -/* XXX CSM 12/4/97 Revisit when enabling WebNFS */ -#ifdef notyet +/* XXX Revisit when enabling WebNFS */ +#ifdef WEBNFS_ENABLED if (!error && pubflag) { - if (nd.ni_vp->v_type == VDIR && nfs_pub.np_index != NULL) { + if (vnode_vtype(nd.ni_vp) == VDIR && nfs_pub.np_index != NULL) { /* * Setup call to lookup() to see if we can find * the index file. Arguably, this doesn't belong * in a kernel.. Ugh. */ ind = nd; - VOP_UNLOCK(nd.ni_vp, 0, procp); ind.ni_pathlen = strlen(nfs_pub.np_index); ind.ni_cnd.cn_nameptr = ind.ni_cnd.cn_pnbuf = nfs_pub.np_index; ind.ni_startdir = nd.ni_vp; - VREF(ind.ni_startdir); - error = lookup(&ind); - if (!error) { + ind.ni_usedvp = nd.ni_vp; + + if (!(error = lookup(&ind))) { /* * Found an index file. Get rid of * the old references. 
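/*
 * The attribute-change pipeline nfsrv_setattr() now follows, reduced
 * to its three steps: vnode_authattr() converts the requested
 * attribute set into a kauth_action_t, that action is authorized, and
 * only then is vnode_setattr() applied.  Sketch; assumes vp, vap,
 * context, and nxo are set up as in the code above.
 */
	kauth_action_t action;
	int error;

	error = vnode_authattr(vp, vap, &action, &context);	/* rights needed? */
	if (!error)
		error = nfsrv_authorize(vp, NULL, action, &context, nxo, 0);
	if (!error)
		error = vnode_setattr(vp, vap, &context);	/* apply changes */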
*/ if (dirp) - vrele(dirp); + vnode_put(dirp); dirp = nd.ni_vp; - vrele(nd.ni_startdir); + vnode_put(nd.ni_startdir); ndp = &ind; } else error = 0; @@ -457,18 +551,20 @@ nfsrv_lookup(nfsd, slp, procp, mrq) * filesystem. */ - if (!error && ndp->ni_vp->v_mount != nfs_pub.np_mount) { - vput(nd.ni_vp); + if (!error && vnode_mount(ndp->ni_vp) != nfs_pub.np_mount) { + vnode_put(nd.ni_vp); + nameidone(&nd); error = EPERM; } } #endif if (dirp) { - if (v3) - dirattr_ret = VOP_GETATTR(dirp, &dirattr, cred, - procp); - vrele(dirp); + if (v3) { + nfsm_srv_vattr_init(&dirattr, v3); + dirattr_ret = vnode_getattr(dirp, &dirattr, &context); + } + vnode_put(dirp); } if (error) { @@ -476,24 +572,21 @@ nfsrv_lookup(nfsd, slp, procp, mrq) nfsm_srvpostop_attr(dirattr_ret, &dirattr); return (0); } + nameidone(&nd); - nqsrv_getl(ndp->ni_startdir, ND_READ); - vrele(ndp->ni_startdir); - FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); - nd.ni_cnd.cn_flags &= ~HASBUF; vp = ndp->ni_vp; - bzero((caddr_t)fhp, sizeof(nfh)); - fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid; - error = VFS_VPTOFH(vp, &fhp->fh_fid); - if (!error) - error = VOP_GETATTR(vp, vap, cred, procp); - vput(vp); - nfsm_reply(NFSX_SRVFH(v3) + NFSX_POSTOPORFATTR(v3) + NFSX_POSTOPATTR(v3)); + error = nfsrv_vptofh(nx, !v3, (isdotdot ? &dnfh : NULL), vp, &context, &nfh); + if (!error) { + nfsm_srv_vattr_init(vap, v3); + error = vnode_getattr(vp, vap, &context); + } + vnode_put(vp); + nfsm_reply(NFSX_SRVFH(v3, &nfh) + NFSX_POSTOPORFATTR(v3) + NFSX_POSTOPATTR(v3)); if (error) { nfsm_srvpostop_attr(dirattr_ret, &dirattr); return (0); } - nfsm_srvfhtom(fhp, v3); + nfsm_srvfhtom(&nfh, v3); if (v3) { nfsm_srvpostop_attr(0, vap); nfsm_srvpostop_attr(dirattr_ret, &dirattr); @@ -501,7 +594,8 @@ nfsrv_lookup(nfsd, slp, procp, mrq) nfsm_build(fp, struct nfs_fattr *, NFSX_V2FATTR); nfsm_srvfillattr(vap, fp); } - nfsm_srvdone; +nfsmout: + return (error); } /* @@ -511,104 +605,160 @@ int nfsrv_readlink(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - struct iovec iv[(NFS_MAXPATHLEN+MLEN-1)/MLEN]; - register struct iovec *ivp = iv; - register struct mbuf *mp; - register u_long *tl; - register long t1; + mbuf_t mp; + u_long *tl; + long t1; caddr_t bpos; - int error = 0, rdonly, cache, i, tlen, len, getret; + int error = 0, i, tlen, len, getret = 1; int v3 = (nfsd->nd_flag & ND_NFSV3); char *cp2; - struct mbuf *mb, *mb2, *mp2, *mp3, *mreq; - struct vnode *vp; - struct vattr attr; - nfsfh_t nfh; - fhandle_t *fhp; - struct uio io, *uiop = &io; - u_quad_t frev; + mbuf_t mb, mb2, mp2, mp3, mreq; + vnode_t vp; + struct vnode_attr attr; + struct nfs_filehandle nfh; + struct nfs_export *nx; + struct nfs_export_options *nxo; + uio_t uiop = NULL; + char uio_buf[ UIO_SIZEOF(4) ]; + char *uio_bufp = &uio_buf[0]; + int uio_buflen = UIO_SIZEOF(4); + int mblen; + struct vfs_context context; -#ifndef nolint - mp2 = mp3 = (struct mbuf *)0; -#endif - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); + nfsm_srvmtofh(&nfh); len = 0; i = 0; + + mp2 = mp3 = NULL; + vp = NULL; while (len < NFS_MAXPATHLEN) { - MGET(mp, M_WAIT, MT_DATA); - MCLGET(mp, M_WAIT); - mp->m_len = NFSMSIZ(mp); + mp = NULL; + if ((error = mbuf_mclget(MBUF_WAITOK, MBUF_TYPE_DATA, 
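/*
 * Sketch of the KPI-based mbuf chain construction used by the new
 * nfsrv_readlink() above: mbuf_mclget() replaces MGET/MCLGET and
 * mbuf_setnext() replaces direct m_next assignment.  Assumes "maxlen"
 * bounds the total chain length, as NFS_MAXPATHLEN does above.
 */
	mbuf_t mp, head = NULL, tail = NULL;
	int len = 0, mblen, error = 0;

	while (!error && len < maxlen) {
		mp = NULL;
		if ((error = mbuf_mclget(MBUF_WAITOK, MBUF_TYPE_DATA, &mp)))
			break;
		mblen = mbuf_maxlen(mp);
		if (len + mblen > maxlen)
			mblen = maxlen - len;	/* trim the final buffer */
		mbuf_setlen(mp, mblen);
		len += mblen;
		if (head == NULL)
			head = tail = mp;
		else if ((error = mbuf_setnext(tail, mp)))
			mbuf_free(mp);		/* unlinked; free just this one */
		else
			tail = mp;
	}
	if (error && head)
		mbuf_freem(head);		/* free the whole chain */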
&mp))) + goto out; + mblen = mbuf_maxlen(mp); + mbuf_setlen(mp, mblen); if (len == 0) mp3 = mp2 = mp; else { - mp2->m_next = mp; + if ((error = mbuf_setnext(mp2, mp))) { + mbuf_free(mp); + goto out; + } mp2 = mp; } - if ((len+mp->m_len) > NFS_MAXPATHLEN) { - mp->m_len = NFS_MAXPATHLEN-len; + if ((len + mblen) > NFS_MAXPATHLEN) { + mbuf_setlen(mp, NFS_MAXPATHLEN - len); len = NFS_MAXPATHLEN; } else - len += mp->m_len; - ivp->iov_base = mtod(mp, caddr_t); - ivp->iov_len = mp->m_len; - i++; - ivp++; - } - uiop->uio_iov = iv; - uiop->uio_iovcnt = i; - uiop->uio_offset = 0; - uiop->uio_resid = len; - uiop->uio_rw = UIO_READ; - uiop->uio_segflg = UIO_SYSSPACE; - uiop->uio_procp = (struct proc *)0; - if ((error = nfsrv_fhtovp(fhp, 1, &vp, cred, slp, nam, - &rdonly, (nfsd->nd_flag & ND_KERBAUTH), TRUE))) { - m_freem(mp3); + len += mblen; + i++; + } + if (i > 4) { + uio_buflen = UIO_SIZEOF(i); + MALLOC(uio_bufp, char*, uio_buflen, M_TEMP, M_WAITOK); + if (!uio_bufp) { + error = ENOMEM; + mbuf_freem(mp3); + nfsm_reply(2 * NFSX_UNSIGNED); + nfsm_srvpostop_attr(1, NULL); + return (0); + } + } + uiop = uio_createwithbuffer(i, 0, UIO_SYSSPACE, UIO_READ, uio_bufp, uio_buflen); + if (!uiop) { + error = ENOMEM; + mbuf_freem(mp3); + if (uio_bufp != &uio_buf[0]) { + FREE(uio_bufp, M_TEMP); + uio_bufp = &uio_buf[0]; + } + nfsm_reply(2 * NFSX_UNSIGNED); + nfsm_srvpostop_attr(1, NULL); + return (0); + } + mp = mp3; + while (mp) { + uio_addiov(uiop, CAST_USER_ADDR_T((caddr_t)mbuf_data(mp)), mbuf_len(mp)); + mp = mbuf_next(mp); + } + + if ((error = nfsrv_fhtovp(&nfh, nam, TRUE, &vp, &nx, &nxo))) { + mbuf_freem(mp3); + if (uio_bufp != &uio_buf[0]) { + FREE(uio_bufp, M_TEMP); + uio_bufp = &uio_buf[0]; + } + nfsm_reply(2 * NFSX_UNSIGNED); + nfsm_srvpostop_attr(1, NULL); + return (0); + } + if ((error = nfsrv_credcheck(nfsd, nx, nxo))) { + vnode_put(vp); + mbuf_freem(mp3); + if (uio_bufp != &uio_buf[0]) { + FREE(uio_bufp, M_TEMP); + uio_bufp = &uio_buf[0]; + } nfsm_reply(2 * NFSX_UNSIGNED); - nfsm_srvpostop_attr(1, (struct vattr *)0); + nfsm_srvpostop_attr(1, NULL); return (0); } - if (vp->v_type != VLNK) { + if (vnode_vtype(vp) != VLNK) { if (v3) error = EINVAL; else error = ENXIO; goto out; } - nqsrv_getl(vp, ND_READ); - error = VOP_READLINK(vp, uiop, cred); + + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; + + if ((error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, &context, nxo, 0))) + goto out; + error = VNOP_READLINK(vp, uiop, &context); out: - getret = VOP_GETATTR(vp, &attr, cred, procp); - vput(vp); + if (vp) { + if (v3) { + nfsm_srv_vattr_init(&attr, v3); + getret = vnode_getattr(vp, &attr, &context); + } + vnode_put(vp); + } if (error) { - m_freem(mp3); + mbuf_freem(mp3); mp3 = NULL; } nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_UNSIGNED); if (v3) { nfsm_srvpostop_attr(getret, &attr); - if (error) + if (error) { + if (uio_bufp != &uio_buf[0]) + FREE(uio_bufp, M_TEMP); return (0); + } } if (!error) { - if (uiop->uio_resid > 0) { - len -= uiop->uio_resid; + if (uiop && (uio_resid(uiop) > 0)) { + // LP64todo - fix this + len -= uio_resid(uiop); tlen = nfsm_rndup(len); nfsm_adj(mp3, NFS_MAXPATHLEN-tlen, tlen-len); } nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = txdr_unsigned(len); - mb->m_next = mp3; + mbuf_setnext(mb, mp3); } - nfsm_srvdone; +nfsmout: + if (uio_bufp != &uio_buf[0]) + FREE(uio_bufp, M_TEMP); + return (error); } /* @@ -618,37 +768,35 @@ int nfsrv_read(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t 
procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - register struct iovec *iv; - struct iovec *iv2; - register struct mbuf *m; - register struct nfs_fattr *fp; - register u_long *tl; - register long t1; - register int i; + mbuf_t m; + struct nfs_fattr *fp; + u_long *tl; + long t1; + int i; caddr_t bpos; - int error = 0, rdonly, cache, cnt, len, left, siz, tlen, getret; + int error = 0, count, len, left, siz, tlen, getret; int v3 = (nfsd->nd_flag & ND_NFSV3), reqlen; char *cp2; - struct mbuf *mb, *mb2, *mreq; - struct mbuf *m2; - struct vnode *vp; - nfsfh_t nfh; - fhandle_t *fhp; - struct uio io, *uiop = &io; - struct vattr va, *vap = &va; + mbuf_t mb, mb2, mreq; + mbuf_t m2; + vnode_t vp; + struct nfs_filehandle nfh; + struct nfs_export *nx; + struct nfs_export_options *nxo; + uio_t uiop = NULL; + char *uio_bufp = NULL; + struct vnode_attr va, *vap = &va; off_t off; - u_quad_t frev; - int didhold = 0; + char uio_buf[ UIO_SIZEOF(0) ]; + struct vfs_context context; - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); + nfsm_srvmtofh(&nfh); if (v3) { nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); fxdr_hyper(tl, &off); @@ -657,39 +805,48 @@ nfsrv_read(nfsd, slp, procp, mrq) off = (off_t)fxdr_unsigned(u_long, *tl); } nfsm_srvstrsiz(reqlen, NFS_SRVMAXDATA(nfsd)); - if ((error = nfsrv_fhtovp(fhp, 1, &vp, cred, slp, nam, - &rdonly, (nfsd->nd_flag & ND_KERBAUTH), TRUE))) { + if ((error = nfsrv_fhtovp(&nfh, nam, TRUE, &vp, &nx, &nxo))) { nfsm_reply(2 * NFSX_UNSIGNED); - nfsm_srvpostop_attr(1, (struct vattr *)0); + nfsm_srvpostop_attr(1, NULL); return (0); } - if (vp->v_type != VREG) { + if ((error = nfsrv_credcheck(nfsd, nx, nxo))) { + vnode_put(vp); + nfsm_reply(2 * NFSX_UNSIGNED); + nfsm_srvpostop_attr(1, NULL); + return (0); + } + if (vnode_vtype(vp) != VREG) { if (v3) error = EINVAL; else - error = (vp->v_type == VDIR) ? EISDIR : EACCES; + error = (vnode_vtype(vp) == VDIR) ? 
EISDIR : EACCES; } + + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; + if (!error) { - nqsrv_getl(vp, ND_READ); - if ((error = nfsrv_access(vp, VREAD, cred, rdonly, procp, 1))) - error = nfsrv_access(vp, VEXEC, cred, rdonly, procp, 1); + if ((error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, &context, nxo, 1))) + error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_EXECUTE, &context, nxo, 1); } - getret = VOP_GETATTR(vp, vap, cred, procp); + nfsm_srv_vattr_init(vap, v3); + getret = vnode_getattr(vp, vap, &context); if (!error) error = getret; if (error) { - vput(vp); + vnode_put(vp); nfsm_reply(NFSX_POSTOPATTR(v3)); nfsm_srvpostop_attr(getret, vap); return (0); } - if (off >= vap->va_size) - cnt = 0; - else if ((off + reqlen) > vap->va_size) - cnt = nfsm_rndup(vap->va_size - off); + if ((u_quad_t)off >= vap->va_data_size) + count = 0; + else if (((u_quad_t)off + reqlen) > vap->va_data_size) + count = nfsm_rndup(vap->va_data_size - off); else - cnt = reqlen; - nfsm_reply(NFSX_POSTOPORFATTR(v3) + 3 * NFSX_UNSIGNED+nfsm_rndup(cnt)); + count = reqlen; + nfsm_reply(NFSX_POSTOPORFATTR(v3) + 3 * NFSX_UNSIGNED+nfsm_rndup(count)); if (v3) { nfsm_build(tl, u_long *, NFSX_V3FATTR + 4 * NFSX_UNSIGNED); *tl++ = nfs_true; @@ -700,56 +857,57 @@ nfsrv_read(nfsd, slp, procp, mrq) fp = (struct nfs_fattr *)tl; tl += (NFSX_V2FATTR / sizeof (u_long)); } - len = left = cnt; - if (cnt > 0) { + len = left = count; + if (count > 0) { /* * Generate the mbuf list with the uio_iov ref. to it. */ i = 0; m = m2 = mb; while (left > 0) { - siz = min(M_TRAILINGSPACE(m), left); + siz = min(mbuf_trailingspace(m), left); if (siz > 0) { left -= siz; i++; } if (left > 0) { - MGET(m, M_WAIT, MT_DATA); - MCLGET(m, M_WAIT); - m->m_len = 0; - m2->m_next = m; + m = NULL; + if ((error = mbuf_mclget(MBUF_WAITOK, MBUF_TYPE_DATA, &m))) + goto errorexit; + mbuf_setnext(m2, m); m2 = m; } } - MALLOC(iv, struct iovec *, i * sizeof (struct iovec), - M_TEMP, M_WAITOK); - uiop->uio_iov = iv2 = iv; + MALLOC(uio_bufp, char *, UIO_SIZEOF(i), M_TEMP, M_WAITOK); + if (!uio_bufp) { + error = ENOMEM; + goto errorexit; + } + uiop = uio_createwithbuffer(i, off, UIO_SYSSPACE, UIO_READ, + uio_bufp, UIO_SIZEOF(i)); + if (!uiop) { + error = ENOMEM; + goto errorexit; + } m = mb; - left = cnt; + left = count; i = 0; while (left > 0) { if (m == NULL) panic("nfsrv_read iov"); - siz = min(M_TRAILINGSPACE(m), left); + siz = min(mbuf_trailingspace(m), left); if (siz > 0) { - iv->iov_base = mtod(m, caddr_t) + m->m_len; - iv->iov_len = siz; - m->m_len += siz; + tlen = mbuf_len(m); + uio_addiov(uiop, CAST_USER_ADDR_T((char *)mbuf_data(m) + tlen), siz); + mbuf_setlen(m, tlen + siz); left -= siz; - iv++; i++; } - m = m->m_next; - } - uiop->uio_iovcnt = i; - uiop->uio_offset = off; - uiop->uio_resid = cnt; - uiop->uio_rw = UIO_READ; - uiop->uio_segflg = UIO_SYSSPACE; - didhold = ubc_hold(vp); - error = VOP_READ(vp, uiop, IO_NODELOCKED, cred); - off = uiop->uio_offset; - FREE((caddr_t)iv2, M_TEMP); + m = mbuf_next(m); + } + error = VNOP_READ(vp, uiop, IO_NODELOCKED, &context); + off = uio_offset(uiop); +errorexit: /* * This may seem a little weird that we drop the whole * successful read if we get an error on the getattr. @@ -761,31 +919,33 @@ nfsrv_read(nfsd, slp, procp, mrq) * postop attrs if the getattr fails. We might be able to * do that easier if we allocated separate mbufs for the data. 
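+	 * (a note on why this is hard to avoid with the current layout:
+	 * the v3 fattr was reserved inline in the reply chain by the
+	 * nfsm_build() above, before the data mbufs were appended, so
+	 * backing out just the attributes would mean rebuilding the
+	 * whole reply chain)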
*/ - if (error || (getret = VOP_GETATTR(vp, vap, cred, procp))) { - VOP_UNLOCK(vp, 0, procp); - if (didhold) - ubc_rele(vp); + nfsm_srv_vattr_init(vap, v3); + if (error || (getret = vnode_getattr(vp, vap, &context))) { if (!error) error = getret; - m_freem(mreq); - vrele(vp); + mbuf_freem(mreq); + vnode_put(vp); nfsm_reply(NFSX_POSTOPATTR(v3)); nfsm_srvpostop_attr(getret, vap); + if (uio_bufp != NULL) { + FREE(uio_bufp, M_TEMP); + } return (0); } - VOP_UNLOCK(vp, 0, procp); - if (didhold) - ubc_rele(vp); - vrele(vp); } else { - uiop->uio_resid = 0; - vput(vp); + uiop = uio_createwithbuffer(0, 0, UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); + if (!uiop) { + error = ENOMEM; + goto errorexit; + } } + vnode_put(vp); nfsm_srvfillattr(vap, fp); - len -= uiop->uio_resid; + // LP64todo - fix this + len -= uio_resid(uiop); tlen = nfsm_rndup(len); - if (cnt != tlen || tlen != len) - nfsm_adj(mb, cnt - tlen, tlen - len); + if (count != tlen || tlen != len) + nfsm_adj(mb, count - tlen, tlen - len); if (v3) { *tl++ = txdr_unsigned(len); if (len < reqlen) @@ -794,7 +954,11 @@ nfsrv_read(nfsd, slp, procp, mrq) *tl++ = nfs_false; } *tl = txdr_unsigned(len); - nfsm_srvdone; +nfsmout: + if (uio_bufp != NULL) { + FREE(uio_bufp, M_TEMP); + } + return (error); } /* @@ -804,43 +968,40 @@ int nfsrv_write(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - register struct iovec *ivp; - register int i, cnt; - register struct mbuf *mp; - register struct nfs_fattr *fp; - struct iovec *iv; - struct vattr va, forat; - register struct vattr *vap = &va; - register u_long *tl; - register long t1; - caddr_t bpos; - int error = 0, rdonly, cache, len, forat_ret = 1; - int ioflags, aftat_ret = 1, retlen, zeroing, adjust; + int i, count; + mbuf_t mp; + struct nfs_fattr *fp; + struct vnode_attr va, forat; + struct vnode_attr *vap = &va; + u_long *tl; + long t1; + caddr_t bpos, tpos; + int error = 0, len, forat_ret = 1; + int ioflags, aftat_ret = 1, retlen, zeroing, adjust, tlen; int stable = NFSV3WRITE_FILESYNC; int v3 = (nfsd->nd_flag & ND_NFSV3); char *cp2; - struct mbuf *mb, *mb2, *mreq; - struct vnode *vp; - nfsfh_t nfh; - fhandle_t *fhp; - struct uio io, *uiop = &io; + mbuf_t mb, mb2, mreq; + vnode_t vp; + struct nfs_filehandle nfh; + struct nfs_export *nx; + struct nfs_export_options *nxo; + uio_t uiop; off_t off; - u_quad_t frev; - int didhold = 0; + char *uio_bufp = NULL; + struct vfs_context context; if (mrep == NULL) { *mrq = NULL; return (0); } - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); + nfsm_srvmtofh(&nfh); if (v3) { nfsm_dissect(tl, u_long *, 5 * NFSX_UNSIGNED); fxdr_hyper(tl, &off); @@ -854,7 +1015,7 @@ nfsrv_write(nfsd, slp, procp, mrq) stable = NFSV3WRITE_UNSTABLE; } retlen = len = fxdr_unsigned(long, *tl); - cnt = i = 0; + count = i = 0; /* * For NFS Version 2, it is not obvious what a write of zero length @@ -867,23 +1028,32 @@ nfsrv_write(nfsd, slp, procp, mrq) while (mp) { if (mp == md) { zeroing = 0; - adjust = dpos - mtod(mp, caddr_t); - mp->m_len -= adjust; - if (mp->m_len > 0 && adjust > 0) - NFSMADV(mp, adjust); + tpos = mbuf_data(mp); + tlen = mbuf_len(mp); + adjust = dpos - tpos; + tlen -= adjust; + mbuf_setlen(mp, tlen); + if (tlen > 0 && adjust > 0) { 
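+					/* step past the request header bytes already parsed out of
+					   this mbuf (dpos is the current dissect position) */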
+ tpos += adjust; + if ((error = mbuf_setdata(mp, tpos, tlen))) { + nfsm_reply(2 * NFSX_UNSIGNED); + nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, vap); + return (0); + } + } } if (zeroing) - mp->m_len = 0; - else if (mp->m_len > 0) { - i += mp->m_len; + mbuf_setlen(mp, 0); + else if ((tlen = mbuf_len(mp)) > 0) { + i += tlen; if (i > len) { - mp->m_len -= (i - len); + mbuf_setlen(mp, tlen - (i - len)); zeroing = 1; } - if (mp->m_len > 0) - cnt++; + if (mbuf_len(mp) > 0) + count++; } - mp = mp->m_next; + mp = mbuf_next(mp); } } if (len > NFS_MAXDATA || len < 0 || i < len) { @@ -892,44 +1062,65 @@ nfsrv_write(nfsd, slp, procp, mrq) nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, vap); return (0); } - if ((error = nfsrv_fhtovp(fhp, 1, &vp, cred, slp, nam, - &rdonly, (nfsd->nd_flag & ND_KERBAUTH), TRUE))) { + if ((error = nfsrv_fhtovp(&nfh, nam, TRUE, &vp, &nx, &nxo))) { nfsm_reply(2 * NFSX_UNSIGNED); nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, vap); return (0); } - if (v3) - forat_ret = VOP_GETATTR(vp, &forat, cred, procp); - if (vp->v_type != VREG) { + if ((error = nfsrv_credcheck(nfsd, nx, nxo))) { + vnode_put(vp); + nfsm_reply(2 * NFSX_UNSIGNED); + nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, vap); + return (0); + } + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; + + if (v3) { + nfsm_srv_pre_vattr_init(&forat, v3); + forat_ret = vnode_getattr(vp, &forat, &context); + } + if (vnode_vtype(vp) != VREG) { if (v3) error = EINVAL; else - error = (vp->v_type == VDIR) ? EISDIR : EACCES; + error = (vnode_vtype(vp) == VDIR) ? EISDIR : EACCES; } if (!error) { - nqsrv_getl(vp, ND_WRITE); - error = nfsrv_access(vp, VWRITE, cred, rdonly, procp, 1); + error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_WRITE_DATA, &context, nxo, 1); } if (error) { - vput(vp); + vnode_put(vp); nfsm_reply(NFSX_WCCDATA(v3)); nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, vap); return (0); } if (len > 0) { - MALLOC(ivp, struct iovec *, cnt * sizeof (struct iovec), M_TEMP, - M_WAITOK); - uiop->uio_iov = iv = ivp; - uiop->uio_iovcnt = cnt; + MALLOC(uio_bufp, char *, UIO_SIZEOF(count), M_TEMP, M_WAITOK); + if (!uio_bufp) { + error = ENOMEM; + vnode_put(vp); + nfsm_reply(NFSX_WCCDATA(v3)); + nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, vap); + return (0); + } + uiop = uio_createwithbuffer(count, off, UIO_SYSSPACE, UIO_WRITE, uio_bufp, UIO_SIZEOF(count)); + if (!uiop) { + error = ENOMEM; + vnode_put(vp); + nfsm_reply(NFSX_WCCDATA(v3)); + nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, vap); + if (uio_bufp != NULL) { + FREE(uio_bufp, M_TEMP); + } + return (0); + } mp = mrep; while (mp) { - if (mp->m_len > 0) { - ivp->iov_base = mtod(mp, caddr_t); - ivp->iov_len = mp->m_len; - ivp++; - } - mp = mp->m_next; + if ((tlen = mbuf_len(mp)) > 0) + uio_addiov(uiop, CAST_USER_ADDR_T((caddr_t)mbuf_data(mp)), tlen); + mp = mbuf_next(mp); } /* @@ -945,29 +1136,25 @@ nfsrv_write(nfsd, slp, procp, mrq) ioflags = (IO_SYNC | IO_NODELOCKED); else ioflags = (IO_METASYNC | IO_SYNC | IO_NODELOCKED); - uiop->uio_resid = len; - uiop->uio_rw = UIO_WRITE; - uiop->uio_segflg = UIO_SYSSPACE; - uiop->uio_procp = (struct proc *)0; - uiop->uio_offset = off; - didhold = ubc_hold(vp); - error = VOP_WRITE(vp, uiop, ioflags, cred); - nfsstats.srvvop_writes++; - FREE((caddr_t)iv, M_TEMP); - } - aftat_ret = VOP_GETATTR(vp, vap, cred, procp); - VOP_UNLOCK(vp, 0, procp); - if (didhold) - ubc_rele(vp); - vrele(vp); + + error = VNOP_WRITE(vp, uiop, ioflags, &context); + OSAddAtomic(1, (SInt32*)(SInt32*)&nfsstats.srvvop_writes); + } + 
nfsm_srv_vattr_init(vap, v3); + aftat_ret = vnode_getattr(vp, vap, &context); + vnode_put(vp); if (!error) error = aftat_ret; nfsm_reply(NFSX_PREOPATTR(v3) + NFSX_POSTOPORFATTR(v3) + 2 * NFSX_UNSIGNED + NFSX_WRITEVERF(v3)); if (v3) { nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, vap); - if (error) + if (error) { + if (uio_bufp != NULL) { + FREE(uio_bufp, M_TEMP); + } return (0); + } nfsm_build(tl, u_long *, 4 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(retlen); /* @@ -982,13 +1169,17 @@ nfsrv_write(nfsd, slp, procp, mrq) * but it may make the values more human readable, * for debugging purposes. */ - *tl++ = txdr_unsigned(boottime.tv_sec); - *tl = txdr_unsigned(boottime.tv_usec); + *tl++ = txdr_unsigned(boottime_sec()); + *tl = txdr_unsigned(0); } else { nfsm_build(fp, struct nfs_fattr *, NFSX_V2FATTR); nfsm_srvfillattr(vap, fp); } - nfsm_srvdone; +nfsmout: + if (uio_bufp != NULL) { + FREE(uio_bufp, M_TEMP); + } + return (error); } /* @@ -1002,35 +1193,39 @@ int nfsrv_writegather(ndp, slp, procp, mrq) struct nfsrv_descript **ndp; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - register struct iovec *ivp; - register struct mbuf *mp; - register struct nfsrv_descript *wp, *nfsd, *owp, *swp; - register struct nfs_fattr *fp; - register int i; - struct iovec *iov; + mbuf_t mp; + struct nfsrv_descript *wp, *nfsd, *owp, *swp; + struct nfs_export *nx; + struct nfs_export_options *nxo; + struct nfs_fattr *fp; + int i; struct nfsrvw_delayhash *wpp; - struct ucred *cred; - struct vattr va, forat; - register u_long *tl; - register long t1; - caddr_t bpos, dpos; - int error = 0, rdonly, cache, len, forat_ret = 1; - int ioflags, aftat_ret = 1, s, adjust, v3, zeroing; + kauth_cred_t cred; + struct vnode_attr va, forat; + u_long *tl; + long t1; + caddr_t bpos, dpos, tpos; + int error = 0, len, forat_ret = 1; + int ioflags, aftat_ret = 1, adjust, v3, zeroing, tlen; char *cp2; - struct mbuf *mb, *mb2, *mreq, *mrep, *md; - struct vnode *vp; - struct uio io, *uiop = &io; - u_quad_t frev, cur_usec; - int didhold; + mbuf_t mb, mb2, mreq, mrep, md; + vnode_t vp; + uio_t uiop = NULL; + char *uio_bufp = NULL; + u_quad_t cur_usec; struct timeval now; + struct vfs_context context; + + context.vc_proc = procp; #ifndef nolint i = 0; len = 0; #endif + *mrq = NULL; if (*ndp) { nfsd = *ndp; @@ -1038,7 +1233,8 @@ nfsrv_writegather(ndp, slp, procp, mrq) mrep = nfsd->nd_mrep; md = nfsd->nd_md; dpos = nfsd->nd_dpos; - cred = &nfsd->nd_cr; + cred = nfsd->nd_cr; + context.vc_ucred = cred; v3 = (nfsd->nd_flag & ND_NFSV3); LIST_INIT(&nfsd->nd_coalesce); nfsd->nd_mreq = NULL; @@ -1052,6 +1248,7 @@ nfsrv_writegather(ndp, slp, procp, mrq) * Now, get the write header.. */ nfsm_srvmtofh(&nfsd->nd_fh); + /* XXX shouldn't we be checking for invalid FHs before doing any more work? 
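+	 * (as written, a bad handle is only caught when loop1 below calls
+	 * nfsrv_fhtovp(), i.e. after the request has already been queued
+	 * on the hash/time queues and possibly coalesced with other writes)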
*/ if (v3) { nfsm_dissect(tl, u_long *, 5 * NFSX_UNSIGNED); fxdr_hyper(tl, &nfsd->nd_off); @@ -1078,25 +1275,32 @@ nfsrv_writegather(ndp, slp, procp, mrq) while (mp) { if (mp == md) { zeroing = 0; - adjust = dpos - mtod(mp, caddr_t); - mp->m_len -= adjust; - if (mp->m_len > 0 && adjust > 0) - NFSMADV(mp, adjust); + tpos = mbuf_data(mp); + tlen = mbuf_len(mp); + adjust = dpos - tpos; + tlen -= adjust; + mbuf_setlen(mp, tlen); + if (tlen > 0 && adjust > 0) { + tpos += adjust; + if ((error = mbuf_setdata(mp, tpos, tlen))) + goto nfsmout; + } } if (zeroing) - mp->m_len = 0; + mbuf_setlen(mp, 0); else { - i += mp->m_len; + tlen = mbuf_len(mp); + i += tlen; if (i > len) { - mp->m_len -= (i - len); + mbuf_setlen(mp, tlen - (i - len)); zeroing = 1; } } - mp = mp->m_next; + mp = mbuf_next(mp); } if (len > NFS_MAXDATA || len < 0 || i < len) { nfsmout: - m_freem(mrep); + mbuf_freem(mrep); mrep = NULL; error = EIO; nfsm_writereply(2 * NFSX_UNSIGNED, v3); @@ -1104,36 +1308,34 @@ nfsmout: nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, &va); nfsd->nd_mreq = mreq; nfsd->nd_mrep = NULL; - nfsd->nd_time = 0; + nfsd->nd_time = 1; } /* * Add this entry to the hash and time queues. */ - s = splsoftclock(); + lck_mtx_lock(&slp->ns_wgmutex); owp = NULL; wp = slp->ns_tq.lh_first; while (wp && wp->nd_time < nfsd->nd_time) { owp = wp; wp = wp->nd_tq.le_next; } - NFS_DPF(WG, ("Q%03x", nfsd->nd_retxid & 0xfff)); if (owp) { LIST_INSERT_AFTER(owp, nfsd, nd_tq); } else { LIST_INSERT_HEAD(&slp->ns_tq, nfsd, nd_tq); } if (nfsd->nd_mrep) { - wpp = NWDELAYHASH(slp, nfsd->nd_fh.fh_fid.fid_data); + wpp = NWDELAYHASH(slp, nfsd->nd_fh.nfh_fid); owp = NULL; wp = wpp->lh_first; - while (wp && - bcmp((caddr_t)&nfsd->nd_fh,(caddr_t)&wp->nd_fh,NFSX_V3FH)) { + while (wp && !nfsrv_fhmatch(&nfsd->nd_fh, &wp->nd_fh)) { owp = wp; wp = wp->nd_hash.le_next; } - while (wp && wp->nd_off < nfsd->nd_off && - !bcmp((caddr_t)&nfsd->nd_fh,(caddr_t)&wp->nd_fh,NFSX_V3FH)) { + while (wp && (wp->nd_off < nfsd->nd_off) && + nfsrv_fhmatch(&nfsd->nd_fh, &wp->nd_fh)) { owp = wp; wp = wp->nd_hash.le_next; } @@ -1153,48 +1355,52 @@ nfsmout: LIST_INSERT_HEAD(wpp, nfsd, nd_hash); } } - splx(s); + } else { + lck_mtx_lock(&slp->ns_wgmutex); } /* - * Now, do VOP_WRITE()s for any one(s) that need to be done now + * Now, do VNOP_WRITE()s for any one(s) that need to be done now * and generate the associated reply mbuf list(s). */ loop1: microuptime(&now); cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec; - s = splsoftclock(); for (nfsd = slp->ns_tq.lh_first; nfsd; nfsd = owp) { owp = nfsd->nd_tq.le_next; if (nfsd->nd_time > cur_usec) break; if (nfsd->nd_mreq) continue; - NFS_DPF(WG, ("P%03x", nfsd->nd_retxid & 0xfff)); LIST_REMOVE(nfsd, nd_tq); LIST_REMOVE(nfsd, nd_hash); - splx(s); mrep = nfsd->nd_mrep; nfsd->nd_mrep = NULL; - cred = &nfsd->nd_cr; v3 = (nfsd->nd_flag & ND_NFSV3); forat_ret = aftat_ret = 1; - error = nfsrv_fhtovp(&nfsd->nd_fh, 1, &vp, cred, slp, - nfsd->nd_nam, &rdonly, (nfsd->nd_flag & ND_KERBAUTH), TRUE); + error = nfsrv_fhtovp(&nfsd->nd_fh, nfsd->nd_nam, TRUE, &vp, &nx, &nxo); if (!error) { - if (v3) - forat_ret = VOP_GETATTR(vp, &forat, cred, procp); - if (vp->v_type != VREG) { + error = nfsrv_credcheck(nfsd, nx, nxo); + if (error) + vnode_put(vp); + } + cred = nfsd->nd_cr; + context.vc_ucred = cred; + if (!error) { + if (v3) { + nfsm_srv_pre_vattr_init(&forat, v3); + forat_ret = vnode_getattr(vp, &forat, &context); + } + if (vnode_vtype(vp) != VREG) { if (v3) error = EINVAL; else - error = (vp->v_type == VDIR) ? 
EISDIR : EACCES; + error = (vnode_vtype(vp) == VDIR) ? EISDIR : EACCES; } } else vp = NULL; if (!error) { - nqsrv_getl(vp, ND_WRITE); - error = nfsrv_access(vp, VWRITE, cred, rdonly, procp, 1); + error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_WRITE_DATA, &context, nxo, 1); } if (nfsd->nd_stable == NFSV3WRITE_UNSTABLE) @@ -1203,48 +1409,43 @@ loop1: ioflags = (IO_SYNC | IO_NODELOCKED); else ioflags = (IO_METASYNC | IO_SYNC | IO_NODELOCKED); - uiop->uio_rw = UIO_WRITE; - uiop->uio_segflg = UIO_SYSSPACE; - uiop->uio_procp = (struct proc *)0; - uiop->uio_offset = nfsd->nd_off; - uiop->uio_resid = nfsd->nd_eoff - nfsd->nd_off; - didhold = 0; - if (uiop->uio_resid > 0) { + + if (!error && ((nfsd->nd_eoff - nfsd->nd_off) > 0)) { mp = mrep; i = 0; while (mp) { - if (mp->m_len > 0) + if (mbuf_len(mp) > 0) i++; - mp = mp->m_next; + mp = mbuf_next(mp); } - uiop->uio_iovcnt = i; - MALLOC(iov, struct iovec *, i * sizeof (struct iovec), - M_TEMP, M_WAITOK); - uiop->uio_iov = ivp = iov; - mp = mrep; - while (mp) { - if (mp->m_len > 0) { - ivp->iov_base = mtod(mp, caddr_t); - ivp->iov_len = mp->m_len; - ivp++; + + MALLOC(uio_bufp, char *, UIO_SIZEOF(i), M_TEMP, M_WAITOK); + if (uio_bufp) + uiop = uio_createwithbuffer(i, nfsd->nd_off, UIO_SYSSPACE, + UIO_WRITE, uio_bufp, UIO_SIZEOF(i)); + if (!uio_bufp || !uiop) + error = ENOMEM; + if (!error) { + mp = mrep; + while (mp) { + if ((tlen = mbuf_len(mp)) > 0) + uio_addiov(uiop, CAST_USER_ADDR_T((caddr_t)mbuf_data(mp)), tlen); + mp = mbuf_next(mp); } - mp = mp->m_next; + error = VNOP_WRITE(vp, uiop, ioflags, &context); + OSAddAtomic(1, (SInt32*)&nfsstats.srvvop_writes); } - if (!error) { - didhold = ubc_hold(vp); - error = VOP_WRITE(vp, uiop, ioflags, cred); - nfsstats.srvvop_writes++; + if (uio_bufp) { + FREE(uio_bufp, M_TEMP); + uio_bufp = NULL; } - FREE((caddr_t)iov, M_TEMP); } - m_freem(mrep); + mbuf_freem(mrep); mrep = NULL; if (vp) { - aftat_ret = VOP_GETATTR(vp, &va, cred, procp); - VOP_UNLOCK(vp, 0, procp); - if (didhold) - ubc_rele(vp); - vrele(vp); + nfsm_srv_pre_vattr_init(&va, v3); + aftat_ret = vnode_getattr(vp, &va, &context); + vnode_put(vp); } /* @@ -1253,7 +1454,6 @@ loop1: */ swp = nfsd; do { - NFS_DPF(WG, ("R%03x", nfsd->nd_retxid & 0xfff)); if (error) { nfsm_writereply(NFSX_WCCDATA(v3), v3); if (v3) { @@ -1273,8 +1473,8 @@ loop1: * but it may make the values more human readable, * for debugging purposes. */ - *tl++ = txdr_unsigned(boottime.tv_sec); - *tl = txdr_unsigned(boottime.tv_usec); + *tl++ = txdr_unsigned(boottime_sec()); + *tl = txdr_unsigned(0); } else { nfsm_build(fp, struct nfs_fattr *, NFSX_V2FATTR); nfsm_srvfillattr(&va, fp); @@ -1288,38 +1488,32 @@ loop1: * Done. Put it at the head of the timer queue so that * the final phase can return the reply. */ - s = splsoftclock(); if (nfsd != swp) { - nfsd->nd_time = 0; + nfsd->nd_time = 1; LIST_INSERT_HEAD(&slp->ns_tq, nfsd, nd_tq); } nfsd = swp->nd_coalesce.lh_first; if (nfsd) { LIST_REMOVE(nfsd, nd_tq); } - splx(s); } while (nfsd); - s = splsoftclock(); - swp->nd_time = 0; + swp->nd_time = 1; LIST_INSERT_HEAD(&slp->ns_tq, swp, nd_tq); - splx(s); goto loop1; } - splx(s); /* * Search for a reply to return. */ - s = splsoftclock(); for (nfsd = slp->ns_tq.lh_first; nfsd; nfsd = nfsd->nd_tq.le_next) if (nfsd->nd_mreq) { - NFS_DPF(WG, ("X%03x", nfsd->nd_retxid & 0xfff)); LIST_REMOVE(nfsd, nd_tq); *mrq = nfsd->nd_mreq; *ndp = nfsd; break; } - splx(s); + slp->ns_wgtime = slp->ns_tq.lh_first ? 
slp->ns_tq.lh_first->nd_time : 0; + lck_mtx_unlock(&slp->ns_wgmutex); return (0); } @@ -1329,19 +1523,16 @@ loop1: * - merge nfsd->nd_mrep into owp->nd_mrep * - update the nd_eoff and nd_stable for owp * - put nfsd on owp's nd_coalesce list - * NB: Must be called at splsoftclock(). */ static void -nfsrvw_coalesce(owp, nfsd) - register struct nfsrv_descript *owp; - register struct nfsrv_descript *nfsd; +nfsrvw_coalesce( + struct nfsrv_descript *owp, + struct nfsrv_descript *nfsd) { - register int overlap; - register struct mbuf *mp; + int overlap, error; + mbuf_t mp, mpnext; struct nfsrv_descript *p; - NFS_DPF(WG, ("C%03x-%03x", - nfsd->nd_retxid & 0xfff, owp->nd_retxid & 0xfff)); LIST_REMOVE(nfsd, nd_hash); LIST_REMOVE(nfsd, nd_tq); if (owp->nd_eoff < nfsd->nd_eoff) { @@ -1349,14 +1540,17 @@ nfsrvw_coalesce(owp, nfsd) if (overlap < 0) panic("nfsrv_coalesce: bad off"); if (overlap > 0) - m_adj(nfsd->nd_mrep, overlap); + mbuf_adj(nfsd->nd_mrep, overlap); mp = owp->nd_mrep; - while (mp->m_next) - mp = mp->m_next; - mp->m_next = nfsd->nd_mrep; + while ((mpnext = mbuf_next(mp))) + mp = mpnext; + error = mbuf_setnext(mp, nfsd->nd_mrep); + if (error) + panic("nfsrvw_coalesce: mbuf_setnext failed: %d", error); owp->nd_eoff = nfsd->nd_eoff; - } else - m_freem(nfsd->nd_mrep); + } else { + mbuf_freem(nfsd->nd_mrep); + } nfsd->nd_mrep = NULL; if (nfsd->nd_stable == NFSV3WRITE_FILESYNC) owp->nd_stable = NFSV3WRITE_FILESYNC; @@ -1380,13 +1574,15 @@ nfsrvw_coalesce(owp, nfsd) * Sort the group list in increasing numerical order. * (Insertion sort by Chris Torek, who was grossed out by the bubble sort * that used to be here.) + * + * XXX ILLEGAL */ void nfsrvw_sort(list, num) - register gid_t *list; - register int num; + gid_t *list; + int num; { - register int i, j; + int i, j; gid_t v; /* Insertion sort. */ @@ -1401,16 +1597,17 @@ nfsrvw_sort(list, num) /* * copy credentials making sure that the result can be compared with bcmp(). 
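+ * (the stores below write cr_ref, cr_uid and cr_groups straight into
+ * the kauth_cred_t instead of going through the kauth interfaces,
+ * hence the tag)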
+ * + * XXX ILLEGAL */ void -nfsrv_setcred(incred, outcred) - register struct ucred *incred, *outcred; +nfsrv_setcred(kauth_cred_t incred, kauth_cred_t outcred) { - register int i; + int i; - bzero((caddr_t)outcred, sizeof (struct ucred)); + bzero((caddr_t)outcred, sizeof (*outcred)); outcred->cr_ref = 1; - outcred->cr_uid = incred->cr_uid; + outcred->cr_uid = kauth_cred_getuid(incred); outcred->cr_ngroups = incred->cr_ngroups; for (i = 0; i < incred->cr_ngroups; i++) outcred->cr_groups[i] = incred->cr_groups[i]; @@ -1425,67 +1622,85 @@ int nfsrv_create(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - register struct nfs_fattr *fp; - struct vattr va, dirfor, diraft; - register struct vattr *vap = &va; - register struct nfsv2_sattr *sp; - register u_long *tl; + struct nfs_fattr *fp; + struct vnode_attr dirfor, diraft, postat; + struct vnode_attr va; + struct vnode_attr *vap = &va; + struct nfsv2_sattr *sp; + u_long *tl; struct nameidata nd; - register caddr_t cp; - register long t1; + caddr_t cp; + long t1; caddr_t bpos; - int error = 0, rdev, cache, len, tsize, dirfor_ret = 1, diraft_ret = 1; + int error = 0, rdev, len, tsize, dirfor_ret = 1, diraft_ret = 1; int v3 = (nfsd->nd_flag & ND_NFSV3), how, exclusive_flag = 0; char *cp2; - struct mbuf *mb, *mb2, *mreq; - struct vnode *vp, *dirp = (struct vnode *)0; - nfsfh_t nfh; - fhandle_t *fhp; - u_quad_t frev, tempsize; + mbuf_t mb, mb2, mreq; + vnode_t vp, dvp, dirp = NULL; + struct nfs_filehandle nfh; + struct nfs_export *nx; + struct nfs_export_options *nxo; + u_quad_t tempsize; u_char cverf[NFSX_V3CREATEVERF]; + struct vfs_context context; + uid_t saved_uid; + + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; + + /* + * Save the original credential UID in case they are + * mapped and we need to map the IDs in the attributes. 
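+	 * (e.g. an export that maps root: nfs_namei() can replace nd_cr,
+	 * after which kauth_cred_getuid(nfsd->nd_cr) no longer equals
+	 * saved_uid; the create path below uses exactly that comparison
+	 * to decide whether to rewrite va_uid/va_gid)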
+ */ + saved_uid = kauth_cred_getuid(nfsd->nd_cr); #ifndef nolint rdev = 0; #endif nd.ni_cnd.cn_nameiop = 0; - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); - nfsm_srvnamesiz(len); - nd.ni_cnd.cn_cred = cred; + vp = dvp = NULL; + nfsm_srvmtofh(&nfh); + nfsm_srvnamesiz(len, v3); + nd.ni_cnd.cn_nameiop = CREATE; - nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | SAVESTART; - error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos, - &dirp, procp, (nfsd->nd_flag & ND_KERBAUTH), FALSE); + nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; + error = nfsm_path_mbuftond(&md, &dpos, v3, FALSE, &len, &nd); + if (!error) + error = nfs_namei(nfsd, &context, &nd, &nfh, nam, FALSE, &dirp, &nx, &nxo); if (dirp) { - if (v3) - dirfor_ret = VOP_GETATTR(dirp, &dirfor, cred, - procp); - else { - vrele(dirp); - dirp = (struct vnode *)0; + if (v3) { + nfsm_srv_pre_vattr_init(&dirfor, v3); + dirfor_ret = vnode_getattr(dirp, &dirfor, &context); + } else { + vnode_put(dirp); + dirp = NULL; } } if (error) { + nd.ni_cnd.cn_nameiop = 0; nfsm_reply(NFSX_WCCDATA(v3)); nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); if (dirp) - vrele(dirp); + vnode_put(dirp); return (0); } - VATTR_NULL(vap); + dvp = nd.ni_dvp; + vp = nd.ni_vp; + + VATTR_INIT(vap); + if (v3) { nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); how = fxdr_unsigned(int, *tl); switch (how) { case NFSV3CREATE_GUARDED: - if (nd.ni_vp) { + if (vp) { error = EEXIST; break; } @@ -1496,179 +1711,229 @@ nfsrv_create(nfsd, slp, procp, mrq) nfsm_dissect(cp, caddr_t, NFSX_V3CREATEVERF); bcopy(cp, cverf, NFSX_V3CREATEVERF); exclusive_flag = 1; - if (nd.ni_vp == NULL) - vap->va_mode = 0; + if (vp == NULL) + VATTR_SET(vap, va_mode, 0); break; }; - vap->va_type = VREG; + VATTR_SET(vap, va_type, VREG); } else { + enum vtype v_type; + nfsm_dissect(sp, struct nfsv2_sattr *, NFSX_V2SATTR); - vap->va_type = IFTOVT(fxdr_unsigned(u_long, sp->sa_mode)); - if (vap->va_type == VNON) - vap->va_type = VREG; - vap->va_mode = nfstov_mode(sp->sa_mode); - switch (vap->va_type) { + v_type = IFTOVT(fxdr_unsigned(u_long, sp->sa_mode)); + if (v_type == VNON) + v_type = VREG; + VATTR_SET(vap, va_type, v_type); + VATTR_SET(vap, va_mode, nfstov_mode(sp->sa_mode)); + + switch (v_type) { case VREG: tsize = fxdr_unsigned(long, sp->sa_size); if (tsize != -1) - vap->va_size = (u_quad_t)tsize; + VATTR_SET(vap, va_data_size, (u_quad_t)tsize); break; case VCHR: case VBLK: case VFIFO: rdev = fxdr_unsigned(long, sp->sa_size); break; + default: + break; }; } /* - * Iff doesn't exist, create it + * If it doesn't exist, create it * otherwise just truncate to 0 length * should I set the mode too ?? */ - if (nd.ni_vp == NULL) { + if (vp == NULL) { + kauth_acl_t xacl = NULL; + + /* + * If the credentials were mapped, we should + * map the same values in the attributes. 
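+	 * (once the mapping case is detected, the uid is always rewritten;
+	 * the gid only falls back to the credential's primary group when
+	 * the mapped credential is not a member of the group the client
+	 * asked for)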
+ */ + if ((vap->va_uid == saved_uid) && (kauth_cred_getuid(nfsd->nd_cr) != saved_uid)) { + int ismember; + VATTR_SET(vap, va_uid, kauth_cred_getuid(nfsd->nd_cr)); + if (kauth_cred_ismember_gid(nfsd->nd_cr, vap->va_gid, &ismember) || !ismember) + VATTR_SET(vap, va_gid, kauth_cred_getgid(nfsd->nd_cr)); + } + + /* authorize before creating */ + error = nfsrv_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, &context, nxo, 0); + + /* construct ACL and handle inheritance */ + if (!error) { + error = kauth_acl_inherit(dvp, + NULL, + &xacl, + 0 /* !isdir */, + &context); + + if (!error && xacl != NULL) + VATTR_SET(vap, va_acl, xacl); + } + VATTR_CLEAR_ACTIVE(vap, va_data_size); + VATTR_CLEAR_ACTIVE(vap, va_access_time); + + /* validate new-file security information */ + if (!error) { + error = vnode_authattr_new(dvp, vap, 0, &context); + if (error && (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))) { + /* + * Most NFS servers just ignore the UID/GID attributes, so we + * try ignoring them if that'll help the request succeed. + */ + VATTR_CLEAR_ACTIVE(vap, va_uid); + VATTR_CLEAR_ACTIVE(vap, va_gid); + error = vnode_authattr_new(dvp, vap, 0, &context); + } + } + if (vap->va_type == VREG || vap->va_type == VSOCK) { - vrele(nd.ni_startdir); - nqsrv_getl(nd.ni_dvp, ND_WRITE); - error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap); + + if (!error) + error = VNOP_CREATE(dvp, &vp, &nd.ni_cnd, vap, &context); + + if (!error && !VATTR_ALL_SUPPORTED(vap)) + /* + * If some of the requested attributes weren't handled by the VNOP, + * use our fallback code. + */ + error = vnode_setattr_fallback(vp, vap, &context); + + if (xacl != NULL) + kauth_acl_free(xacl); + if (!error) { - nfsrv_object_create(nd.ni_vp); - FREE_ZONE(nd.ni_cnd.cn_pnbuf, - nd.ni_cnd.cn_pnlen, M_NAMEI); - nd.ni_cnd.cn_flags &= ~HASBUF; if (exclusive_flag) { exclusive_flag = 0; - VATTR_NULL(vap); - bcopy(cverf, (caddr_t)&vap->va_atime, + VATTR_INIT(vap); + bcopy(cverf, (caddr_t)&vap->va_access_time, NFSX_V3CREATEVERF); - error = VOP_SETATTR(nd.ni_vp, vap, cred, - procp); + VATTR_SET_ACTIVE(vap, va_access_time); + // skip authorization, as this is an + // NFS internal implementation detail. 
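+					// (the client's 8-byte create verifier was stashed in
+					// va_access_time above; a later CREATE with a different
+					// verifier fails the bcmp() against va_access_time and
+					// returns EEXIST, while an exact retransmission succeeds)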
+ error = vnode_setattr(vp, vap, &context); } } + } else if (vap->va_type == VCHR || vap->va_type == VBLK || vap->va_type == VFIFO) { - if (vap->va_type == VCHR && rdev == 0xffffffff) - vap->va_type = VFIFO; + if (vap->va_type == VCHR && rdev == (int)0xffffffff) + VATTR_SET(vap, va_type, VFIFO); if (vap->va_type != VFIFO && - (error = suser(cred, (u_short *)0))) { - vrele(nd.ni_startdir); - FREE_ZONE(nd.ni_cnd.cn_pnbuf, - nd.ni_cnd.cn_pnlen, M_NAMEI); - nd.ni_cnd.cn_flags &= ~HASBUF; - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - vput(nd.ni_dvp); + (error = suser(nfsd->nd_cr, (u_short *)0))) { nfsm_reply(0); - return (error); } else - vap->va_rdev = (dev_t)rdev; - nqsrv_getl(nd.ni_dvp, ND_WRITE); - if ((error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap))) { - vrele(nd.ni_startdir); + VATTR_SET(vap, va_rdev, (dev_t)rdev); + + error = VNOP_MKNOD(dvp, &vp, &nd.ni_cnd, vap, &context); + + if (xacl != NULL) + kauth_acl_free(xacl); + + if (error) { nfsm_reply(0); } + if (vp) { + vnode_recycle(vp); + vnode_put(vp); + vp = NULL; + } nd.ni_cnd.cn_nameiop = LOOKUP; - nd.ni_cnd.cn_flags &= ~(LOCKPARENT | SAVESTART); - nd.ni_cnd.cn_proc = procp; - nd.ni_cnd.cn_cred = cred; - if ((error = lookup(&nd))) { - FREE_ZONE(nd.ni_cnd.cn_pnbuf, - nd.ni_cnd.cn_pnlen, M_NAMEI); - nd.ni_cnd.cn_flags &= ~HASBUF; - nfsm_reply(0); + nd.ni_cnd.cn_flags &= ~LOCKPARENT; + nd.ni_cnd.cn_context = &context; + nd.ni_startdir = dvp; + nd.ni_usedvp = dvp; + error = lookup(&nd); + if (!error) { + if (nd.ni_cnd.cn_flags & ISSYMLINK) + error = EINVAL; } - nfsrv_object_create(nd.ni_vp); - FREE_ZONE(nd.ni_cnd.cn_pnbuf, - nd.ni_cnd.cn_pnlen, M_NAMEI); - nd.ni_cnd.cn_flags &= ~HASBUF; - if (nd.ni_cnd.cn_flags & ISSYMLINK) { - vrele(nd.ni_dvp); - vput(nd.ni_vp); - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - error = EINVAL; + if (error) nfsm_reply(0); - } } else { - vrele(nd.ni_startdir); - FREE_ZONE(nd.ni_cnd.cn_pnbuf, - nd.ni_cnd.cn_pnlen, M_NAMEI); - nd.ni_cnd.cn_flags &= ~HASBUF; - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - vput(nd.ni_dvp); error = ENXIO; } - vp = nd.ni_vp; + /* + * nameidone has to happen before we vnode_put(dvp) + * since it may need to release the fs_nodelock on the dvp + */ + nameidone(&nd); + nd.ni_cnd.cn_nameiop = 0; + + vnode_put(dvp); } else { - vrele(nd.ni_startdir); - FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); - nd.ni_cnd.cn_flags &= ~HASBUF; - vp = nd.ni_vp; - if (nd.ni_dvp == vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - if (vap->va_size != -1) { - error = nfsrv_access(vp, VWRITE, cred, - (nd.ni_cnd.cn_flags & RDONLY), procp, 0); + /* + * nameidone has to happen before we vnode_put(dvp) + * since it may need to release the fs_nodelock on the dvp + */ + nameidone(&nd); + nd.ni_cnd.cn_nameiop = 0; + + vnode_put(dvp); + + if (!error && VATTR_IS_ACTIVE(vap, va_data_size)) { + error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_WRITE_DATA, + &context, nxo, 0); if (!error) { - nqsrv_getl(vp, ND_WRITE); - tempsize = vap->va_size; - VATTR_NULL(vap); - vap->va_size = tempsize; - error = VOP_SETATTR(vp, vap, cred, - procp); + tempsize = vap->va_data_size; + VATTR_INIT(vap); + VATTR_SET(vap, va_data_size, tempsize); + error = vnode_setattr(vp, vap, &context); } - if (error) - vput(vp); - } else { - if (error) - vput(vp); /* make sure we catch the EEXIST for nfsv3 */ } } if (!error) { - bzero((caddr_t)fhp, sizeof(nfh)); - fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid; - error = VFS_VPTOFH(vp, &fhp->fh_fid); - if (!error) - error = VOP_GETATTR(vp, vap, cred, procp); - 
vput(vp); + error = nfsrv_vptofh(nx, !v3, NULL, vp, &context, &nfh); + if (!error) { + nfsm_srv_vattr_init(&postat, v3); + error = vnode_getattr(vp, &postat, &context); + } } + if (vp) + vnode_put(vp); + if (v3) { if (exclusive_flag && !error && - bcmp(cverf, (caddr_t)&vap->va_atime, NFSX_V3CREATEVERF)) + bcmp(cverf, (caddr_t)&postat.va_access_time, NFSX_V3CREATEVERF)) error = EEXIST; - diraft_ret = VOP_GETATTR(dirp, &diraft, cred, procp); - vrele(dirp); + nfsm_srv_vattr_init(&diraft, v3); + diraft_ret = vnode_getattr(dirp, &diraft, &context); + vnode_put(dirp); + dirp = NULL; } - nfsm_reply(NFSX_SRVFH(v3) + NFSX_FATTR(v3) + NFSX_WCCDATA(v3)); + nfsm_reply(NFSX_SRVFH(v3, &nfh) + NFSX_FATTR(v3) + NFSX_WCCDATA(v3)); + if (v3) { if (!error) { - nfsm_srvpostop_fh(fhp); - nfsm_srvpostop_attr(0, vap); + nfsm_srvpostop_fh(&nfh); + nfsm_srvpostop_attr(0, &postat); } nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); } else { - nfsm_srvfhtom(fhp, v3); + nfsm_srvfhtom(&nfh, v3); nfsm_build(fp, struct nfs_fattr *, NFSX_V2FATTR); - nfsm_srvfillattr(vap, fp); + nfsm_srvfillattr(&postat, fp); } return (0); nfsmout: - if (dirp) - vrele(dirp); if (nd.ni_cnd.cn_nameiop) { - vrele(nd.ni_startdir); - FREE_ZONE((caddr_t)nd.ni_cnd.cn_pnbuf, - nd.ni_cnd.cn_pnlen, M_NAMEI); - nd.ni_cnd.cn_flags &= ~HASBUF; - } - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - if (nd.ni_dvp == nd.ni_vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); - if (nd.ni_vp) - vput(nd.ni_vp); + /* + * nameidone has to happen before we vnode_put(dvp) + * since it may need to release the fs_nodelock on the dvp + */ + nameidone(&nd); + + if (vp) + vnode_put(vp); + vnode_put(dvp); + } + if (dirp) + vnode_put(dirp); return (error); } @@ -1679,156 +1944,218 @@ int nfsrv_mknod(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - struct vattr va, dirfor, diraft; - register struct vattr *vap = &va; - register u_long *tl; + struct vnode_attr dirfor, diraft, postat; + struct vnode_attr va; + struct vnode_attr *vap = &va; + u_long *tl; struct nameidata nd; - register long t1; + long t1; caddr_t bpos; - int error = 0, cache, len, dirfor_ret = 1, diraft_ret = 1; + int error = 0, len, dirfor_ret = 1, diraft_ret = 1; u_long major, minor; enum vtype vtyp; char *cp2; - struct mbuf *mb, *mb2, *mreq; - struct vnode *vp, *dirp = (struct vnode *)0; - nfsfh_t nfh; - fhandle_t *fhp; - u_quad_t frev; + mbuf_t mb, mb2, mreq; + vnode_t vp, dvp, dirp = NULL; + struct nfs_filehandle nfh; + struct nfs_export *nx; + struct nfs_export_options *nxo; + struct vfs_context hacked_context; /* XXX should we have this? */ + struct vfs_context context; + uid_t saved_uid; + kauth_acl_t xacl = NULL; + + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; + hacked_context.vc_proc = procp; + hacked_context.vc_ucred = proc_ucred(procp); + + /* + * Save the original credential UID in case they are + * mapped and we need to map the IDs in the attributes. 
+ */ + saved_uid = kauth_cred_getuid(nfsd->nd_cr); + vp = dvp = NULL; nd.ni_cnd.cn_nameiop = 0; - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); - nfsm_srvnamesiz(len); - nd.ni_cnd.cn_cred = cred; + nfsm_srvmtofh(&nfh); + nfsm_srvnamesiz(len, 1); + nd.ni_cnd.cn_nameiop = CREATE; - nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | SAVESTART; - error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos, - &dirp, procp, (nfsd->nd_flag & ND_KERBAUTH), FALSE); - if (dirp) - dirfor_ret = VOP_GETATTR(dirp, &dirfor, cred, procp); + nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; + error = nfsm_path_mbuftond(&md, &dpos, 1, FALSE, &len, &nd); + if (!error) + error = nfs_namei(nfsd, &context, &nd, &nfh, nam, FALSE, &dirp, &nx, &nxo); + if (dirp) { + nfsm_srv_pre_vattr_init(&dirfor, 1); + dirfor_ret = vnode_getattr(dirp, &dirfor, &context); + } if (error) { + nd.ni_cnd.cn_nameiop = 0; nfsm_reply(NFSX_WCCDATA(1)); nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); if (dirp) - vrele(dirp); + vnode_put(dirp); return (0); } + dvp = nd.ni_dvp; + vp = nd.ni_vp; + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); vtyp = nfsv3tov_type(*tl); if (vtyp != VCHR && vtyp != VBLK && vtyp != VSOCK && vtyp != VFIFO) { - vrele(nd.ni_startdir); - FREE_ZONE((caddr_t)nd.ni_cnd.cn_pnbuf, - nd.ni_cnd.cn_pnlen, M_NAMEI); - nd.ni_cnd.cn_flags &= ~HASBUF; error = NFSERR_BADTYPE; - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - vput(nd.ni_dvp); goto out; } - VATTR_NULL(vap); + VATTR_INIT(vap); nfsm_srvsattr(vap); + if (vtyp == VCHR || vtyp == VBLK) { nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); major = fxdr_unsigned(u_long, *tl++); minor = fxdr_unsigned(u_long, *tl); - vap->va_rdev = makedev(major, minor); + VATTR_SET(vap, va_rdev, makedev(major, minor)); } /* - * Iff doesn't exist, create it. + * If it doesn't exist, create it. */ - if (nd.ni_vp) { - vrele(nd.ni_startdir); - FREE_ZONE((caddr_t)nd.ni_cnd.cn_pnbuf, - nd.ni_cnd.cn_pnlen, M_NAMEI); - nd.ni_cnd.cn_flags &= ~HASBUF; + if (vp) { error = EEXIST; - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - vput(nd.ni_dvp); goto out; } - vap->va_type = vtyp; - if (vtyp == VSOCK) { - vrele(nd.ni_startdir); - nqsrv_getl(nd.ni_dvp, ND_WRITE); - error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap); - if (!error) - FREE_ZONE(nd.ni_cnd.cn_pnbuf, - nd.ni_cnd.cn_pnlen, M_NAMEI); - nd.ni_cnd.cn_flags &= ~HASBUF; - } else { - if (vtyp != VFIFO && (error = suser(cred, (u_short *)0))) { - vrele(nd.ni_startdir); - FREE_ZONE((caddr_t)nd.ni_cnd.cn_pnbuf, - nd.ni_cnd.cn_pnlen, M_NAMEI); - nd.ni_cnd.cn_flags &= ~HASBUF; - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - vput(nd.ni_dvp); - goto out; - } - nqsrv_getl(nd.ni_dvp, ND_WRITE); - if ((error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap))) { - vrele(nd.ni_startdir); - goto out; + VATTR_SET(vap, va_type, vtyp); + + /* + * If the credentials were mapped, we should + * map the same values in the attributes. 
+ */ + if ((vap->va_uid == saved_uid) && (kauth_cred_getuid(nfsd->nd_cr) != saved_uid)) { + int ismember; + VATTR_SET(vap, va_uid, kauth_cred_getuid(nfsd->nd_cr)); + if (kauth_cred_ismember_gid(nfsd->nd_cr, vap->va_gid, &ismember) || !ismember) + VATTR_SET(vap, va_gid, kauth_cred_getgid(nfsd->nd_cr)); + } + + /* authorize before creating */ + error = nfsrv_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, &context, nxo, 0); + + /* construct ACL and handle inheritance */ + if (!error) { + error = kauth_acl_inherit(dvp, + NULL, + &xacl, + 0 /* !isdir */, + &context); + + if (!error && xacl != NULL) + VATTR_SET(vap, va_acl, xacl); + } + VATTR_CLEAR_ACTIVE(vap, va_data_size); + VATTR_CLEAR_ACTIVE(vap, va_access_time); + + /* validate new-file security information */ + if (!error) { + error = vnode_authattr_new(dvp, vap, 0, &context); + if (error && (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))) { + /* + * Most NFS servers just ignore the UID/GID attributes, so we + * try ignoring them if that'll help the request succeed. + */ + VATTR_CLEAR_ACTIVE(vap, va_uid); + VATTR_CLEAR_ACTIVE(vap, va_gid); + error = vnode_authattr_new(dvp, vap, 0, &context); + } + } + + if (vtyp == VSOCK) { + error = VNOP_CREATE(dvp, &vp, &nd.ni_cnd, vap, &context); + + if (!error && !VATTR_ALL_SUPPORTED(vap)) + /* + * If some of the requested attributes weren't handled by the VNOP, + * use our fallback code. + */ + error = vnode_setattr_fallback(vp, vap, &context); + } else { + if (vtyp != VFIFO && (error = suser(nfsd->nd_cr, (u_short *)0))) { + goto out1; + } + if ((error = VNOP_MKNOD(dvp, &vp, &nd.ni_cnd, vap, &context))) { + goto out1; + } + if (vp) { + vnode_recycle(vp); + vnode_put(vp); + vp = NULL; } nd.ni_cnd.cn_nameiop = LOOKUP; - nd.ni_cnd.cn_flags &= ~(LOCKPARENT | SAVESTART); - nd.ni_cnd.cn_proc = procp; - nd.ni_cnd.cn_cred = procp->p_ucred; + nd.ni_cnd.cn_flags &= ~LOCKPARENT; + nd.ni_cnd.cn_context = &hacked_context; + nd.ni_startdir = dvp; + nd.ni_usedvp = dvp; error = lookup(&nd); - FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); - nd.ni_cnd.cn_flags &= ~HASBUF; - if (error) - goto out; - if (nd.ni_cnd.cn_flags & ISSYMLINK) { - vrele(nd.ni_dvp); - vput(nd.ni_vp); - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - error = EINVAL; + if (!error) { + vp = nd.ni_vp; + if (nd.ni_cnd.cn_flags & ISSYMLINK) + error = EINVAL; } } +out1: + if (xacl != NULL) + kauth_acl_free(xacl); out: - vp = nd.ni_vp; + /* + * nameidone has to happen before we vnode_put(dvp) + * since it may need to release the fs_nodelock on the dvp + */ + nameidone(&nd); + nd.ni_cnd.cn_nameiop = 0; + + vnode_put(dvp); + if (!error) { - bzero((caddr_t)fhp, sizeof(nfh)); - fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid; - error = VFS_VPTOFH(vp, &fhp->fh_fid); - if (!error) - error = VOP_GETATTR(vp, vap, cred, procp); - vput(vp); + error = nfsrv_vptofh(nx, 0, NULL, vp, &context, &nfh); + if (!error) { + nfsm_srv_vattr_init(&postat, 1); + error = vnode_getattr(vp, &postat, &context); + } } - diraft_ret = VOP_GETATTR(dirp, &diraft, cred, procp); - vrele(dirp); - nfsm_reply(NFSX_SRVFH(1) + NFSX_POSTOPATTR(1) + NFSX_WCCDATA(1)); + if (vp) + vnode_put(vp); + + nfsm_srv_vattr_init(&diraft, 1); + diraft_ret = vnode_getattr(dirp, &diraft, &context); + vnode_put(dirp); + dirp = NULL; + + nfsm_reply(NFSX_SRVFH(1, &nfh) + NFSX_POSTOPATTR(1) + NFSX_WCCDATA(1)); if (!error) { - nfsm_srvpostop_fh(fhp); - nfsm_srvpostop_attr(0, vap); + nfsm_srvpostop_fh(&nfh); + nfsm_srvpostop_attr(0, &postat); } nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); 
return (0); nfsmout: - if (dirp) - vrele(dirp); if (nd.ni_cnd.cn_nameiop) { - vrele(nd.ni_startdir); - FREE_ZONE((caddr_t)nd.ni_cnd.cn_pnbuf, - nd.ni_cnd.cn_pnlen, M_NAMEI); - nd.ni_cnd.cn_flags &= ~HASBUF; - } - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - if (nd.ni_dvp == nd.ni_vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); - if (nd.ni_vp) - vput(nd.ni_vp); + /* + * nameidone has to happen before we vnode_put(dvp) + * since it may need to release the fs_nodelock on the dvp + */ + nameidone(&nd); + + if (vp) + vnode_put(vp); + vnode_put(dvp); + } + if (dirp) + vnode_put(dirp); return (error); } @@ -1839,84 +2166,86 @@ int nfsrv_remove(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; struct nameidata nd; - register u_long *tl; - register long t1; + u_long *tl; + long t1; caddr_t bpos; - int error = 0, cache, len, dirfor_ret = 1, diraft_ret = 1; + int error = 0, len, dirfor_ret = 1, diraft_ret = 1; int v3 = (nfsd->nd_flag & ND_NFSV3); char *cp2; - struct mbuf *mb, *mreq; - struct vnode *vp, *dirp; - struct vattr dirfor, diraft; - nfsfh_t nfh; - fhandle_t *fhp; - u_quad_t frev; + mbuf_t mb, mreq; + vnode_t vp, dvp, dirp = NULL; + struct vnode_attr dirfor, diraft; + struct nfs_filehandle nfh; + struct nfs_export *nx; + struct nfs_export_options *nxo; + struct vfs_context context; + + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; + + dvp = vp = NULL; + nfsm_srvmtofh(&nfh); + nfsm_srvnamesiz(len, v3); -#ifndef nolint - vp = (struct vnode *)0; -#endif - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); - nfsm_srvnamesiz(len); - nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameiop = DELETE; nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; - error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos, - &dirp, procp, (nfsd->nd_flag & ND_KERBAUTH), FALSE); + error = nfsm_path_mbuftond(&md, &dpos, v3, FALSE, &len, &nd); + if (!error) + error = nfs_namei(nfsd, &context, &nd, &nfh, nam, FALSE, &dirp, &nx, &nxo); if (dirp) { - if (v3) - dirfor_ret = VOP_GETATTR(dirp, &dirfor, cred, - procp); - else - vrele(dirp); + if (v3) { + nfsm_srv_pre_vattr_init(&dirfor, v3); + dirfor_ret = vnode_getattr(dirp, &dirfor, &context); + } else { + vnode_put(dirp); + dirp = NULL; + } } if (!error) { + dvp = nd.ni_dvp; vp = nd.ni_vp; - if (vp->v_type == VDIR) { + + if (vnode_vtype(vp) == VDIR) error = EPERM; /* POSIX */ - goto out; - } - /* - * The root of a mounted filesystem cannot be deleted. - */ - if (vp->v_flag & VROOT) { + else if (vnode_isvroot(vp)) + /* + * The root of a mounted filesystem cannot be deleted. 
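+			 * (vnode_isvroot() covers, for example, an attempt to
+			 * remove the directory a volume is mounted on; EBUSY
+			 * matches what a local unlink of a mount point returns)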
+ */ error = EBUSY; - goto out; - } -out: - if (!error) { - nqsrv_getl(nd.ni_dvp, ND_WRITE); - nqsrv_getl(vp, ND_WRITE); + else + error = nfsrv_authorize(vp, dvp, KAUTH_VNODE_DELETE, &context, nxo, 0); - error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + if (!error) + error = VNOP_REMOVE(dvp, vp, &nd.ni_cnd, 0, &context); - } else { - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - if (nd.ni_dvp == vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); - vput(vp); - } + /* + * nameidone has to happen before we vnode_put(dvp) + * since it may need to release the fs_nodelock on the dvp + */ + nameidone(&nd); + + vnode_put(vp); + vnode_put(dvp); } - if (dirp && v3) { - diraft_ret = VOP_GETATTR(dirp, &diraft, cred, procp); - vrele(dirp); + if (dirp) { + nfsm_srv_vattr_init(&diraft, v3); + diraft_ret = vnode_getattr(dirp, &diraft, &context); + vnode_put(dirp); } nfsm_reply(NFSX_WCCDATA(v3)); if (v3) { nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); return (0); } - nfsm_srvdone; +nfsmout: + return (error); } /* @@ -1926,107 +2255,163 @@ int nfsrv_rename(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - register u_long *tl; - register long t1; + kauth_cred_t saved_cred = NULL; + u_long *tl; + long t1; caddr_t bpos; - int error = 0, cache, len, len2, fdirfor_ret = 1, fdiraft_ret = 1; + int error = 0, fromlen, tolen; + int fdirfor_ret = 1, fdiraft_ret = 1; int tdirfor_ret = 1, tdiraft_ret = 1; int v3 = (nfsd->nd_flag & ND_NFSV3); - char *cp2; - struct mbuf *mb, *mreq; + char *cp2, *frompath = NULL, *topath = NULL; + mbuf_t mb, mreq; struct nameidata fromnd, tond; - struct vnode *fvp, *tvp, *tdvp, *fdirp = (struct vnode *)0; - struct vnode *tdirp = (struct vnode *)0; - struct vattr fdirfor, fdiraft, tdirfor, tdiraft; - nfsfh_t fnfh, tnfh; - fhandle_t *ffhp, *tfhp; - u_quad_t frev; - uid_t saved_uid; + vnode_t fvp, tvp, tdvp, fdvp, fdirp = NULL; + vnode_t tdirp = NULL; + struct vnode_attr fdirfor, fdiraft, tdirfor, tdiraft; + struct nfs_filehandle fnfh, tnfh; + struct nfs_export *fnx, *tnx; + struct nfs_export_options *fnxo, *tnxo; + enum vtype fvtype, tvtype; + int holding_mntlock; + mount_t locked_mp; + struct vfs_context context; + + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; #ifndef nolint - fvp = (struct vnode *)0; + fvp = (vnode_t)0; #endif - ffhp = &fnfh.fh_generic; - tfhp = &tnfh.fh_generic; - fromnd.ni_cnd.cn_nameiop = 0; - tond.ni_cnd.cn_nameiop = 0; - nfsm_srvmtofh(ffhp); - nfsm_srvnamesiz(len); + + /* + * these need to be set before + * calling any nfsm_xxxx macros + * since they may take us out + * through the error path + */ + holding_mntlock = 0; + fvp = tvp = NULL; + fdvp = tdvp = NULL; + locked_mp = NULL; + + nfsm_srvmtofh(&fnfh); + nfsm_srvnamesiz(fromlen, v3); + error = nfsm_path_mbuftond(&md, &dpos, v3, FALSE, &fromlen, &fromnd); + if (error) { + nfsm_reply(0); + return (0); + } + frompath = fromnd.ni_cnd.cn_pnbuf; + nfsm_srvmtofh(&tnfh); + nfsm_strsiz(tolen, NFS_MAXNAMLEN, v3); + error = nfsm_path_mbuftond(&md, &dpos, v3, FALSE, &tolen, &tond); + if (error) { + nfsm_reply(0); + FREE_ZONE(frompath, MAXPATHLEN, M_NAMEI); + return (0); + } + topath = tond.ni_cnd.cn_pnbuf; + /* * Remember our original uid so that we can reset cr_uid 
before * the second nfs_namei() call, in case it is remapped. */ - saved_uid = cred->cr_uid; - fromnd.ni_cnd.cn_cred = cred; + saved_cred = nfsd->nd_cr; + kauth_cred_ref(saved_cred); +retry: fromnd.ni_cnd.cn_nameiop = DELETE; - fromnd.ni_cnd.cn_flags = WANTPARENT | SAVESTART; - error = nfs_namei(&fromnd, ffhp, len, slp, nam, &md, - &dpos, &fdirp, procp, (nfsd->nd_flag & ND_KERBAUTH), FALSE); + fromnd.ni_cnd.cn_flags = WANTPARENT; + + fromnd.ni_cnd.cn_pnbuf = frompath; + frompath = NULL; + fromnd.ni_cnd.cn_pnlen = MAXPATHLEN; + fromnd.ni_cnd.cn_flags |= HASBUF; + + error = nfs_namei(nfsd, &context, &fromnd, &fnfh, nam, FALSE, &fdirp, &fnx, &fnxo); + if (error) + goto out; + fdvp = fromnd.ni_dvp; + fvp = fromnd.ni_vp; + if (fdirp) { - if (v3) - fdirfor_ret = VOP_GETATTR(fdirp, &fdirfor, cred, - procp); - else { - vrele(fdirp); - fdirp = (struct vnode *)0; + if (v3) { + nfsm_srv_pre_vattr_init(&fdirfor, v3); + fdirfor_ret = vnode_getattr(fdirp, &fdirfor, &context); + } else { + vnode_put(fdirp); + fdirp = NULL; } } - if (error) { - nfsm_reply(2 * NFSX_WCCDATA(v3)); - nfsm_srvwcc_data(fdirfor_ret, &fdirfor, fdiraft_ret, &fdiraft); - nfsm_srvwcc_data(tdirfor_ret, &tdirfor, tdiraft_ret, &tdiraft); - if (fdirp) - vrele(fdirp); - return (0); + fvtype = vnode_vtype(fvp); + + /* reset credential if it was remapped */ + if (nfsd->nd_cr != saved_cred) { + kauth_cred_rele(nfsd->nd_cr); + nfsd->nd_cr = saved_cred; + kauth_cred_ref(nfsd->nd_cr); } - fvp = fromnd.ni_vp; - nfsm_srvmtofh(tfhp); - nfsm_strsiz(len2, NFS_MAXNAMLEN); - cred->cr_uid = saved_uid; - tond.ni_cnd.cn_cred = cred; + tond.ni_cnd.cn_nameiop = RENAME; - tond.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART; - error = nfs_namei(&tond, tfhp, len2, slp, nam, &md, - &dpos, &tdirp, procp, (nfsd->nd_flag & ND_KERBAUTH), FALSE); - if (tdirp) { - if (v3) - tdirfor_ret = VOP_GETATTR(tdirp, &tdirfor, cred, - procp); - else { - vrele(tdirp); - tdirp = (struct vnode *)0; - } - } + tond.ni_cnd.cn_flags = WANTPARENT; + + tond.ni_cnd.cn_pnbuf = topath; + topath = NULL; + tond.ni_cnd.cn_pnlen = MAXPATHLEN; + tond.ni_cnd.cn_flags |= HASBUF; + + if (fvtype == VDIR) + tond.ni_cnd.cn_flags |= WILLBEDIR; + + error = nfs_namei(nfsd, &context, &tond, &tnfh, nam, FALSE, &tdirp, &tnx, &tnxo); if (error) { - VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); - vrele(fromnd.ni_dvp); - vrele(fvp); - goto out1; + /* + * Translate error code for rename("dir1", "dir2/."). + */ + if (error == EISDIR && fvtype == VDIR) { + if (v3) + error = EINVAL; + else + error = ENOTEMPTY; + } + goto out; } tdvp = tond.ni_dvp; - tvp = tond.ni_vp; + tvp = tond.ni_vp; + + if (tdirp) { + if (v3) { + nfsm_srv_pre_vattr_init(&tdirfor, v3); + tdirfor_ret = vnode_getattr(tdirp, &tdirfor, &context); + } else { + vnode_put(tdirp); + tdirp = NULL; + } + } + if (tvp != NULL) { - if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + tvtype = vnode_vtype(tvp); + + if (fvtype == VDIR && tvtype != VDIR) { if (v3) error = EEXIST; else error = EISDIR; goto out; - } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + } else if (fvtype != VDIR && tvtype == VDIR) { if (v3) error = EEXIST; else error = ENOTDIR; goto out; } - if (tvp->v_type == VDIR && tvp->v_mountedhere) { + if (tvtype == VDIR && vnode_mountedhere(tvp)) { if (v3) error = EXDEV; else @@ -2034,95 +2419,346 @@ nfsrv_rename(nfsd, slp, procp, mrq) goto out; } } - if (fvp->v_type == VDIR && fvp->v_mountedhere) { + if (fvp == tdvp) { + if (v3) + error = EINVAL; + else + error = ENOTEMPTY; + goto out; + } + + /* + * Authorization. 
+ * + * If tvp is a directory and not the same as fdvp, or tdvp is not the same as fdvp, + * the node is moving between directories and we need rights to remove from the + * old and add to the new. + * + * If tvp already exists and is not a directory, we need to be allowed to delete it. + * + * Note that we do not inherit when renaming. XXX this needs to be revisited to + * implement the deferred-inherit bit. + */ + { + int moving = 0; + + error = 0; + if ((tvp != NULL) && vnode_isdir(tvp)) { + if (tvp != fdvp) + moving = 1; + } else if (tdvp != fdvp) { + moving = 1; + } + if (moving) { + /* moving out of fdvp, must have delete rights */ + if ((error = nfsrv_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, &context, fnxo, 0)) != 0) + goto auth_exit; + /* moving into tdvp or tvp, must have rights to add */ + if ((error = nfsrv_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp, + NULL, + vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, + &context, tnxo, 0)) != 0) + goto auth_exit; + } else { + /* node staying in same directory, must be allowed to add new name */ + if ((error = nfsrv_authorize(fdvp, NULL, + vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, + &context, fnxo, 0)) != 0) + goto auth_exit; + } + /* overwriting tvp */ + if ((tvp != NULL) && !vnode_isdir(tvp) && + ((error = nfsrv_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, &context, tnxo, 0)) != 0)) + goto auth_exit; + + /* XXX more checks? */ + +auth_exit: + /* authorization denied */ + if (error != 0) + goto out; + } + + if ((vnode_mount(fvp) != vnode_mount(tdvp)) || + (tvp && (vnode_mount(fvp) != vnode_mount(tvp)))) { if (v3) error = EXDEV; else error = ENOTEMPTY; goto out; } - if (fvp->v_mount != tdvp->v_mount) { + /* + * The following edge case is caught here: + * (to cannot be a descendant of from) + * + * o fdvp + * / + * / + * o fvp + * \ + * \ + * o tdvp + * / + * / + * o tvp + */ + if (tdvp->v_parent == fvp) { if (v3) error = EXDEV; else error = ENOTEMPTY; goto out; } - if (fvp == tdvp) + if (fvtype == VDIR && vnode_mountedhere(fvp)) { if (v3) - error = EINVAL; + error = EXDEV; else error = ENOTEMPTY; + goto out; + } /* * If source is the same as the destination (that is the - * same vnode) then there is nothing to do. - * (fixed to have POSIX semantics - CSM 3/2/98) + * same vnode) then there is nothing to do... + * EXCEPT if the underlying file system supports case + * insensitivity and is case preserving. In this case + * the file system needs to handle the special case of + * getting the same vnode as target (fvp) and source (tvp). + * + * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE + * and _PC_CASE_PRESERVING can have this exception, and they need to + * handle the special case of getting the same vnode as target and + * source. NOTE: Then the target is unlocked going into vnop_rename, + * so as not to cause locking problems. There is a single reference on tvp.
+ * + * NOTE - that fvp == tvp also occurs if they are hard linked - NOTE + * that correct behaviour then is just to remove the source (link) */ - if (fvp == tvp) - error = -1; -out: - if (!error) { - nqsrv_getl(fromnd.ni_dvp, ND_WRITE); - nqsrv_getl(tdvp, ND_WRITE); - if (tvp) - nqsrv_getl(tvp, ND_WRITE); - error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, - tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); + if ((fvp == tvp) && (fdvp == tdvp)) { + if (fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && + !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, + fromnd.ni_cnd.cn_namelen)) { + goto out; + } + } + + if (holding_mntlock && vnode_mount(fvp) != locked_mp) { + /* + * we're holding a reference and lock + * on locked_mp, but it no longer matches + * what we want to do... so drop our hold + */ + mount_unlock_renames(locked_mp); + mount_drop(locked_mp, 0); + holding_mntlock = 0; + } + if (tdvp != fdvp && fvtype == VDIR) { + /* + * serialize renames that re-shape + * the tree... if holding_mntlock is + * set, then we're ready to go... + * otherwise we + * first need to drop the iocounts + * we picked up, second take the + * lock to serialize the access, + * then finally start the lookup + * process over with the lock held + */ + if (!holding_mntlock) { + /* + * need to grab a reference on + * the mount point before we + * drop all the iocounts... once + * the iocounts are gone, the mount + * could follow + */ + locked_mp = vnode_mount(fvp); + mount_ref(locked_mp, 0); + + /* make a copy of to path to pass to nfs_namei() again */ + MALLOC_ZONE(topath, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (topath) + bcopy(tond.ni_cnd.cn_pnbuf, topath, tolen + 1); + + /* + * nameidone has to happen before we vnode_put(tdvp) + * since it may need to release the fs_nodelock on the tdvp + */ + nameidone(&tond); + + if (tvp) + vnode_put(tvp); + vnode_put(tdvp); + + /* make a copy of from path to pass to nfs_namei() again */ + MALLOC_ZONE(frompath, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (frompath) + bcopy(fromnd.ni_cnd.cn_pnbuf, frompath, fromlen + 1); + + /* + * nameidone has to happen before we vnode_put(fdvp) + * since it may need to release the fs_nodelock on the fdvp + */ + nameidone(&fromnd); + + vnode_put(fvp); + vnode_put(fdvp); + + if (fdirp) { + vnode_put(fdirp); + fdirp = NULL; + } + if (tdirp) { + vnode_put(tdirp); + tdirp = NULL; + } + mount_lock_renames(locked_mp); + holding_mntlock = 1; + + fvp = tvp = NULL; + fdvp = tdvp = NULL; + + fdirfor_ret = tdirfor_ret = 1; + + if (!topath || !frompath) { + /* we couldn't allocate a path, so bail */ + error = ENOMEM; + goto out; + } + + goto retry; + } } else { - VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd); - if (tdvp == tvp) - vrele(tdvp); - else - vput(tdvp); + /* + * when we dropped the iocounts to take + * the lock, we allowed the identity of + * the various vnodes to change... if they did, + * we may no longer be dealing with a rename + * that reshapes the tree... once we're holding + * the iocounts, the vnodes can't change type + * so we're free to drop the lock at this point + * and continue on + */ + if (holding_mntlock) { + mount_unlock_renames(locked_mp); + mount_drop(locked_mp, 0); + holding_mntlock = 0; + } + } + + // save these off so we can later verify that fvp is the same + char *oname; + vnode_t oparent; + oname = fvp->v_name; + oparent = fvp->v_parent; + + error = VNOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, + tond.ni_dvp, tond.ni_vp, &tond.ni_cnd, &context); + /* + * fix up name & parent pointers. 
note that we first + * check that fvp has the same name/parent pointers it + * had before the rename call... this is a 'weak' check + * at best... + */ + if (oname == fvp->v_name && oparent == fvp->v_parent) { + int update_flags; + update_flags = VNODE_UPDATE_NAME; + if (fdvp != tdvp) + update_flags |= VNODE_UPDATE_PARENT; + vnode_update_identity(fvp, tdvp, tond.ni_cnd.cn_nameptr, tond.ni_cnd.cn_namelen, tond.ni_cnd.cn_hash, update_flags); + } +out: + if (holding_mntlock) { + mount_unlock_renames(locked_mp); + mount_drop(locked_mp, 0); + holding_mntlock = 0; + } + if (tdvp) { + /* + * nameidone has to happen before we vnode_put(tdvp) + * since it may need to release the fs_nodelock on the tdvp + */ + nameidone(&tond); if (tvp) - vput(tvp); - VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); - vrele(fromnd.ni_dvp); - vrele(fvp); - if (error == -1) - error = 0; + vnode_put(tvp); + vnode_put(tdvp); + + tdvp = NULL; + } + if (fdvp) { + /* + * nameidone has to happen before we vnode_put(fdvp) + * since it may need to release the fs_nodelock on the fdvp + */ + nameidone(&fromnd); + + if (fvp) + vnode_put(fvp); + vnode_put(fdvp); + + fdvp = NULL; } - vrele(tond.ni_startdir); - FREE_ZONE(tond.ni_cnd.cn_pnbuf, tond.ni_cnd.cn_pnlen, M_NAMEI); - tond.ni_cnd.cn_flags &= ~HASBUF; -out1: if (fdirp) { - fdiraft_ret = VOP_GETATTR(fdirp, &fdiraft, cred, procp); - vrele(fdirp); + nfsm_srv_vattr_init(&fdiraft, v3); + fdiraft_ret = vnode_getattr(fdirp, &fdiraft, &context); + vnode_put(fdirp); + fdirp = NULL; } if (tdirp) { - tdiraft_ret = VOP_GETATTR(tdirp, &tdiraft, cred, procp); - vrele(tdirp); + nfsm_srv_vattr_init(&tdiraft, v3); + tdiraft_ret = vnode_getattr(tdirp, &tdiraft, &context); + vnode_put(tdirp); + tdirp = NULL; } - vrele(fromnd.ni_startdir); - FREE_ZONE(fromnd.ni_cnd.cn_pnbuf, fromnd.ni_cnd.cn_pnlen, M_NAMEI); - fromnd.ni_cnd.cn_flags &= ~HASBUF; nfsm_reply(2 * NFSX_WCCDATA(v3)); if (v3) { nfsm_srvwcc_data(fdirfor_ret, &fdirfor, fdiraft_ret, &fdiraft); nfsm_srvwcc_data(tdirfor_ret, &tdirfor, tdiraft_ret, &tdiraft); } + if (frompath) + FREE_ZONE(frompath, MAXPATHLEN, M_NAMEI); + if (topath) + FREE_ZONE(topath, MAXPATHLEN, M_NAMEI); + if (saved_cred) + kauth_cred_rele(saved_cred); return (0); nfsmout: + if (holding_mntlock) { + mount_unlock_renames(locked_mp); + mount_drop(locked_mp, 0); + } + if (tdvp) { + /* + * nameidone has to happen before we vnode_put(tdvp) + * since it may need to release the fs_nodelock on the tdvp + */ + nameidone(&tond); + + if (tvp) + vnode_put(tvp); + vnode_put(tdvp); + } + if (fdvp) { + /* + * nameidone has to happen before we vnode_put(fdvp) + * since it may need to release the fs_nodelock on the fdvp + */ + nameidone(&fromnd); + + if (fvp) + vnode_put(fvp); + vnode_put(fdvp); + } if (fdirp) - vrele(fdirp); + vnode_put(fdirp); if (tdirp) - vrele(tdirp); - if (tond.ni_cnd.cn_nameiop) { - vrele(tond.ni_startdir); - FREE_ZONE(tond.ni_cnd.cn_pnbuf, tond.ni_cnd.cn_pnlen, M_NAMEI); - tond.ni_cnd.cn_flags &= ~HASBUF; - } - if (fromnd.ni_cnd.cn_nameiop) { - vrele(fromnd.ni_startdir); - FREE_ZONE(fromnd.ni_cnd.cn_pnbuf, - fromnd.ni_cnd.cn_pnlen, M_NAMEI); - fromnd.ni_cnd.cn_flags &= ~HASBUF; - VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); - vrele(fromnd.ni_dvp); - vrele(fvp); - } + vnode_put(tdirp); + if (frompath) + FREE_ZONE(frompath, MAXPATHLEN, M_NAMEI); + if (topath) + FREE_ZONE(topath, MAXPATHLEN, M_NAMEI); + if (saved_cred) + kauth_cred_rele(saved_cred); return (error); } @@ -2133,96 +2769,116 @@ int nfsrv_link(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct 
nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; struct nameidata nd; - register u_long *tl; - register long t1; + u_long *tl; + long t1; caddr_t bpos; - int error = 0, rdonly, cache, len, dirfor_ret = 1, diraft_ret = 1; + int error = 0, len, dirfor_ret = 1, diraft_ret = 1; int getret = 1, v3 = (nfsd->nd_flag & ND_NFSV3); char *cp2; - struct mbuf *mb, *mreq; - struct vnode *vp, *xp, *dirp = (struct vnode *)0; - struct vattr dirfor, diraft, at; - nfsfh_t nfh, dnfh; - fhandle_t *fhp, *dfhp; - u_quad_t frev; - - fhp = &nfh.fh_generic; - dfhp = &dnfh.fh_generic; - nfsm_srvmtofh(fhp); - nfsm_srvmtofh(dfhp); - nfsm_srvnamesiz(len); - if ((error = nfsrv_fhtovp(fhp, FALSE, &vp, cred, slp, nam, - &rdonly, (nfsd->nd_flag & ND_KERBAUTH), TRUE))) { + mbuf_t mb, mreq; + vnode_t vp, xp, dvp, dirp = NULL; + struct vnode_attr dirfor, diraft, at; + struct nfs_filehandle nfh, dnfh; + struct nfs_export *nx; + struct nfs_export_options *nxo; + struct vfs_context context; + + vp = xp = dvp = NULL; + nfsm_srvmtofh(&nfh); + nfsm_srvmtofh(&dnfh); + nfsm_srvnamesiz(len, v3); + if ((error = nfsrv_fhtovp(&nfh, nam, TRUE, &vp, &nx, &nxo))) { nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_WCCDATA(v3)); nfsm_srvpostop_attr(getret, &at); nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); return (0); } - if (vp->v_type == VDIR) { + if ((error = nfsrv_credcheck(nfsd, nx, nxo))) { + vnode_put(vp); + nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_WCCDATA(v3)); + nfsm_srvpostop_attr(getret, &at); + nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); + return (0); + } + + /* we're not allowed to link to directories... */ + if (vnode_vtype(vp) == VDIR) { error = EPERM; /* POSIX */ goto out1; } - nd.ni_cnd.cn_cred = cred; + + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; + + /* ...or to anything that kauth doesn't want us to (eg. 
immutable items) */ + if ((error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, &context, nxo, 0)) != 0) + goto out1; + nd.ni_cnd.cn_nameiop = CREATE; nd.ni_cnd.cn_flags = LOCKPARENT; - error = nfs_namei(&nd, dfhp, len, slp, nam, &md, &dpos, - &dirp, procp, (nfsd->nd_flag & ND_KERBAUTH), FALSE); + error = nfsm_path_mbuftond(&md, &dpos, v3, FALSE, &len, &nd); + if (!error) + error = nfs_namei(nfsd, &context, &nd, &dnfh, nam, FALSE, &dirp, &nx, &nxo); if (dirp) { - if (v3) - dirfor_ret = VOP_GETATTR(dirp, &dirfor, cred, - procp); - else { - vrele(dirp); - dirp = (struct vnode *)0; + if (v3) { + nfsm_srv_pre_vattr_init(&dirfor, v3); + dirfor_ret = vnode_getattr(dirp, &dirfor, &context); + } else { + vnode_put(dirp); + dirp = NULL; } } if (error) goto out1; + dvp = nd.ni_dvp; xp = nd.ni_vp; - if (xp != NULL) { + + if (xp != NULL) error = EEXIST; - goto out; - } - xp = nd.ni_dvp; - if (vp->v_mount != xp->v_mount) + else if (vnode_mount(vp) != vnode_mount(dvp)) error = EXDEV; -out: - if (!error) { - nqsrv_getl(vp, ND_WRITE); - nqsrv_getl(xp, ND_WRITE); - error = VOP_LINK(vp, nd.ni_dvp, &nd.ni_cnd); - } else { - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - if (nd.ni_dvp == nd.ni_vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); - if (nd.ni_vp) - vrele(nd.ni_vp); - } + else + error = nfsrv_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, &context, nxo, 0); + + if (!error) + error = VNOP_LINK(vp, dvp, &nd.ni_cnd, &context); + + /* + * nameidone has to happen before we vnode_put(dvp) + * since it may need to release the fs_nodelock on the dvp + */ + nameidone(&nd); + + if (xp) + vnode_put(xp); + vnode_put(dvp); out1: - if (v3) - getret = VOP_GETATTR(vp, &at, cred, procp); + if (v3) { + nfsm_srv_vattr_init(&at, v3); + getret = vnode_getattr(vp, &at, &context); + } if (dirp) { - diraft_ret = VOP_GETATTR(dirp, &diraft, cred, procp); - vrele(dirp); + nfsm_srv_vattr_init(&diraft, v3); + diraft_ret = vnode_getattr(dirp, &diraft, &context); + vnode_put(dirp); } - vrele(vp); + vnode_put(vp); + nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_WCCDATA(v3)); if (v3) { nfsm_srvpostop_attr(getret, &at); nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); return (0); } - nfsm_srvdone; +nfsmout: + return (error); } /* @@ -2232,142 +2888,198 @@ int nfsrv_symlink(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - struct vattr va, dirfor, diraft; + struct vnode_attr dirfor, diraft, postat; struct nameidata nd; - register struct vattr *vap = &va; - register u_long *tl; - register long t1; + struct vnode_attr va; + struct vnode_attr *vap = &va; + u_long *tl; + long t1; struct nfsv2_sattr *sp; - char *bpos, *pathcp = (char *)0, *cp2; - struct uio io; - struct iovec iv; - int error = 0, cache, len, len2, dirfor_ret = 1, diraft_ret = 1; + char *bpos, *linkdata = NULL, *cp2; + int error = 0, len, linkdatalen; + int dirfor_ret = 1, diraft_ret = 1; int v3 = (nfsd->nd_flag & ND_NFSV3); - struct mbuf *mb, *mreq, *mb2; - struct vnode *dirp = (struct vnode *)0; - nfsfh_t nfh; - fhandle_t *fhp; - u_quad_t frev; + mbuf_t mb, mreq, mb2; + vnode_t vp, dvp, dirp = NULL; + struct nfs_filehandle nfh; + struct nfs_export *nx; + struct nfs_export_options *nxo; + uio_t auio; + char uio_buf[ UIO_SIZEOF(1) ]; + struct 
vfs_context context; + uid_t saved_uid; + + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; + + /* + * Save the original credential UID in case they are + * mapped and we need to map the IDs in the attributes. + */ + saved_uid = kauth_cred_getuid(nfsd->nd_cr); nd.ni_cnd.cn_nameiop = 0; - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); - nfsm_srvnamesiz(len); - nd.ni_cnd.cn_cred = cred; + vp = dvp = NULL; + nfsm_srvmtofh(&nfh); + nfsm_srvnamesiz(len, v3); + nd.ni_cnd.cn_nameiop = CREATE; - nd.ni_cnd.cn_flags = LOCKPARENT | SAVESTART; - error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos, - &dirp, procp, (nfsd->nd_flag & ND_KERBAUTH), FALSE); + nd.ni_cnd.cn_flags = LOCKPARENT; + error = nfsm_path_mbuftond(&md, &dpos, v3, FALSE, &len, &nd); + if (!error) + error = nfs_namei(nfsd, &context, &nd, &nfh, nam, FALSE, &dirp, &nx, &nxo); if (dirp) { - if (v3) - dirfor_ret = VOP_GETATTR(dirp, &dirfor, cred, - procp); - else { - vrele(dirp); - dirp = (struct vnode *)0; + if (v3) { + nfsm_srv_pre_vattr_init(&dirfor, v3); + dirfor_ret = vnode_getattr(dirp, &dirfor, &context); + } else { + vnode_put(dirp); + dirp = NULL; } } - if (error) - goto out; - VATTR_NULL(vap); + if (error) { + nd.ni_cnd.cn_nameiop = 0; + goto out1; + } + dvp = nd.ni_dvp; + vp = nd.ni_vp; + + VATTR_INIT(vap); if (v3) nfsm_srvsattr(vap); - nfsm_strsiz(len2, NFS_MAXPATHLEN); - MALLOC(pathcp, caddr_t, len2 + 1, M_TEMP, M_WAITOK); - iv.iov_base = pathcp; - iv.iov_len = len2; - io.uio_resid = len2; - io.uio_offset = 0; - io.uio_iov = &iv; - io.uio_iovcnt = 1; - io.uio_segflg = UIO_SYSSPACE; - io.uio_rw = UIO_READ; - io.uio_procp = (struct proc *)0; - nfsm_mtouio(&io, len2); + nfsm_strsiz(linkdatalen, NFS_MAXPATHLEN, v3); + MALLOC(linkdata, caddr_t, linkdatalen + 1, M_TEMP, M_WAITOK); + if (!linkdata) { + nameidone(&nd); + nd.ni_cnd.cn_nameiop = 0; + vnode_put(nd.ni_dvp); + vnode_put(nd.ni_vp); + error = ENOMEM; + goto out; + } + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, + &uio_buf[0], sizeof(uio_buf)); + if (!auio) { + nameidone(&nd); + nd.ni_cnd.cn_nameiop = 0; + vnode_put(nd.ni_dvp); + vnode_put(nd.ni_vp); + error = ENOMEM; + goto out; + } + uio_addiov(auio, CAST_USER_ADDR_T(linkdata), linkdatalen); + nfsm_mtouio(auio, linkdatalen); if (!v3) { nfsm_dissect(sp, struct nfsv2_sattr *, NFSX_V2SATTR); - vap->va_mode = fxdr_unsigned(u_short, sp->sa_mode); - } - *(pathcp + len2) = '\0'; - if (nd.ni_vp) { - vrele(nd.ni_startdir); - FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); - nd.ni_cnd.cn_flags &= ~HASBUF; - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - if (nd.ni_dvp == nd.ni_vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); - vrele(nd.ni_vp); + VATTR_SET(vap, va_mode, fxdr_unsigned(u_short, sp->sa_mode)); + } + *(linkdata + linkdatalen) = '\0'; + if (vp) { error = EEXIST; goto out; } - nqsrv_getl(nd.ni_dvp, ND_WRITE); - error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap, pathcp); - if (error) - vrele(nd.ni_startdir); - else { - if (v3) { - nd.ni_cnd.cn_nameiop = LOOKUP; - nd.ni_cnd.cn_flags &= ~(LOCKPARENT | SAVESTART | FOLLOW); - nd.ni_cnd.cn_flags |= (NOFOLLOW | LOCKLEAF); - nd.ni_cnd.cn_proc = procp; - nd.ni_cnd.cn_cred = cred; - error = lookup(&nd); - if (!error) { - bzero((caddr_t)fhp, sizeof(nfh)); - fhp->fh_fsid = nd.ni_vp->v_mount->mnt_stat.f_fsid; - error = VFS_VPTOFH(nd.ni_vp, &fhp->fh_fid); + + /* + * If the credentials were mapped, we should + * map the same values in the attributes. 
+ */ + if ((vap->va_uid == saved_uid) && (kauth_cred_getuid(nfsd->nd_cr) != saved_uid)) { + int ismember; + VATTR_SET(vap, va_uid, kauth_cred_getuid(nfsd->nd_cr)); + if (kauth_cred_ismember_gid(nfsd->nd_cr, vap->va_gid, &ismember) || !ismember) + VATTR_SET(vap, va_gid, kauth_cred_getgid(nfsd->nd_cr)); + } + VATTR_SET(vap, va_type, VLNK); + + /* authorize before creating */ + error = nfsrv_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, &context, nxo, 0); + + /* validate given attributes */ + if (!error) { + error = vnode_authattr_new(dvp, vap, 0, &context); + if (error && (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))) { + /* + * Most NFS servers just ignore the UID/GID attributes, so we + * try ignoring them if that'll help the request succeed. + */ + VATTR_CLEAR_ACTIVE(vap, va_uid); + VATTR_CLEAR_ACTIVE(vap, va_gid); + error = vnode_authattr_new(dvp, vap, 0, &context); + } + } + if (!error) + error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, vap, linkdata, &context); + + if (!error && v3) { + if (vp == NULL) { + nd.ni_cnd.cn_nameiop = LOOKUP; + nd.ni_cnd.cn_flags &= ~(LOCKPARENT | FOLLOW); + nd.ni_cnd.cn_flags |= (NOFOLLOW | LOCKLEAF); + nd.ni_cnd.cn_context = &context; + nd.ni_startdir = dvp; + nd.ni_usedvp = dvp; + error = lookup(&nd); if (!error) - error = VOP_GETATTR(nd.ni_vp, vap, cred, - procp); - vput(nd.ni_vp); + vp = nd.ni_vp; + } + if (!error) { + error = nfsrv_vptofh(nx, !v3, NULL, vp, &context, &nfh); + if (!error) { + nfsm_srv_vattr_init(&postat, v3); + error = vnode_getattr(vp, &postat, &context); + } } - } else - vrele(nd.ni_startdir); - FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); - nd.ni_cnd.cn_flags &= ~HASBUF; } out: - if (pathcp) - FREE(pathcp, M_TEMP); + /* + * nameidone has to happen before we vnode_put(dvp) + * since it may need to release the fs_nodelock on the dvp + */ + nameidone(&nd); + nd.ni_cnd.cn_nameiop = 0; + + if (vp) + vnode_put(vp); + vnode_put(dvp); +out1: + if (linkdata) + FREE(linkdata, M_TEMP); if (dirp) { - diraft_ret = VOP_GETATTR(dirp, &diraft, cred, procp); - vrele(dirp); + nfsm_srv_vattr_init(&diraft, v3); + diraft_ret = vnode_getattr(dirp, &diraft, &context); + vnode_put(dirp); } - nfsm_reply(NFSX_SRVFH(v3) + NFSX_POSTOPATTR(v3) + NFSX_WCCDATA(v3)); + nfsm_reply(NFSX_SRVFH(v3, &nfh) + NFSX_POSTOPATTR(v3) + NFSX_WCCDATA(v3)); if (v3) { if (!error) { - nfsm_srvpostop_fh(fhp); - nfsm_srvpostop_attr(0, vap); + nfsm_srvpostop_fh(&nfh); + nfsm_srvpostop_attr(0, &postat); } nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); } return (0); nfsmout: if (nd.ni_cnd.cn_nameiop) { - vrele(nd.ni_startdir); - FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); - nd.ni_cnd.cn_flags &= ~HASBUF; + /* + * nameidone has to happen before we vnode_put(dvp) + * since it may need to release the fs_nodelock on the dvp + */ + nameidone(&nd); + + if (vp) + vnode_put(vp); + vnode_put(dvp); } if (dirp) - vrele(dirp); - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - if (nd.ni_dvp == nd.ni_vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); - if (nd.ni_vp) - vrele(nd.ni_vp); - if (pathcp) - FREE(pathcp, M_TEMP); + vnode_put(dirp); + if (linkdata) + FREE(linkdata, M_TEMP); return (error); } @@ -2378,112 +3090,199 @@ int nfsrv_mkdir(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; 
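
The fallback above (validate the requested attributes with vnode_authattr_new(), and if that fails only because of the client-supplied owner, clear va_uid/va_gid and validate again) is the same shape nfsrv_mkdir uses below. A minimal user-space sketch of that retry pattern; struct attrs and validate_attrs() are hypothetical stand-ins for the kernel's struct vnode_attr and vnode_authattr_new(), not real APIs:

    #include <stdio.h>

    /* Toy stand-in for struct vnode_attr and its active-attribute bits. */
    struct attrs {
        int uid_active, gid_active;   /* like VATTR_IS_ACTIVE(vap, va_uid/va_gid) */
        int uid, gid;
    };

    /* Stand-in for vnode_authattr_new(): reject any explicit owner change. */
    static int validate_attrs(const struct attrs *a)
    {
        return (a->uid_active || a->gid_active) ? 1 : 0;   /* nonzero == error */
    }

    static int create_with_fallback(struct attrs *a)
    {
        int error = validate_attrs(a);
        if (error && (a->uid_active || a->gid_active)) {
            /* Most NFS servers ignore UID/GID; drop them and retry. */
            a->uid_active = a->gid_active = 0;
            error = validate_attrs(a);
        }
        return error;
    }

    int main(void)
    {
        struct attrs a = { .uid_active = 1, .uid = 501 };
        printf("result: %d\n", create_with_fallback(&a));   /* prints 0 */
        return 0;
    }

The point of the pattern is that a strict owner check should not fail the whole create when the client would be satisfied with server-chosen ownership.
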
caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - struct vattr va, dirfor, diraft; - register struct vattr *vap = &va; - register struct nfs_fattr *fp; + struct vnode_attr dirfor, diraft, postat; + struct vnode_attr va; + struct vnode_attr *vap = &va; + struct nfs_fattr *fp; struct nameidata nd; - register caddr_t cp; - register u_long *tl; - register long t1; + caddr_t cp; + u_long *tl; + long t1; caddr_t bpos; - int error = 0, cache, len, dirfor_ret = 1, diraft_ret = 1; + int error = 0, len; + int dirfor_ret = 1, diraft_ret = 1; int v3 = (nfsd->nd_flag & ND_NFSV3); char *cp2; - struct mbuf *mb, *mb2, *mreq; - struct vnode *vp, *dirp = (struct vnode *)0; - nfsfh_t nfh; - fhandle_t *fhp; - u_quad_t frev; - - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); - nfsm_srvnamesiz(len); - nd.ni_cnd.cn_cred = cred; + mbuf_t mb, mb2, mreq; + vnode_t vp, dvp, dirp = NULL; + struct nfs_filehandle nfh; + struct nfs_export *nx; + struct nfs_export_options *nxo; + struct vfs_context context; + uid_t saved_uid; + kauth_acl_t xacl = NULL; + + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; + + /* + * Save the original credential UID in case they are + * mapped and we need to map the IDs in the attributes. + */ + saved_uid = kauth_cred_getuid(nfsd->nd_cr); + + nd.ni_cnd.cn_nameiop = 0; + vp = dvp = NULL; + nfsm_srvmtofh(&nfh); + nfsm_srvnamesiz(len, v3); + nd.ni_cnd.cn_nameiop = CREATE; nd.ni_cnd.cn_flags = LOCKPARENT; - error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos, - &dirp, procp, (nfsd->nd_flag & ND_KERBAUTH), FALSE); + error = nfsm_path_mbuftond(&md, &dpos, v3, FALSE, &len, &nd); + if (!error) + error = nfs_namei(nfsd, &context, &nd, &nfh, nam, FALSE, &dirp, &nx, &nxo); if (dirp) { - if (v3) - dirfor_ret = VOP_GETATTR(dirp, &dirfor, cred, - procp); - else { - vrele(dirp); - dirp = (struct vnode *)0; + if (v3) { + nfsm_srv_pre_vattr_init(&dirfor, v3); + dirfor_ret = vnode_getattr(dirp, &dirfor, &context); + } else { + vnode_put(dirp); + dirp = NULL; } } if (error) { + nd.ni_cnd.cn_nameiop = 0; nfsm_reply(NFSX_WCCDATA(v3)); nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); if (dirp) - vrele(dirp); + vnode_put(dirp); return (0); } - VATTR_NULL(vap); + dvp = nd.ni_dvp; + vp = nd.ni_vp; + + VATTR_INIT(vap); if (v3) { nfsm_srvsattr(vap); } else { nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); - vap->va_mode = nfstov_mode(*tl++); + VATTR_SET(vap, va_mode, nfstov_mode(*tl++)); } - vap->va_type = VDIR; - vp = nd.ni_vp; + VATTR_SET(vap, va_type, VDIR); + if (vp != NULL) { - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - if (nd.ni_dvp == vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); - vrele(vp); + /* + * nameidone has to happen before we vnode_put(dvp) + * since it may need to release the fs_nodelock on the dvp + */ + nameidone(&nd); + + vnode_put(dvp); + vnode_put(vp); error = EEXIST; goto out; } - nqsrv_getl(nd.ni_dvp, ND_WRITE); - error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap); + + /* + * If the credentials were mapped, we should + * map the same values in the attributes. 
+ */ + if ((vap->va_uid == saved_uid) && (kauth_cred_getuid(nfsd->nd_cr) != saved_uid)) { + int ismember; + VATTR_SET(vap, va_uid, kauth_cred_getuid(nfsd->nd_cr)); + if (kauth_cred_ismember_gid(nfsd->nd_cr, vap->va_gid, &ismember) || !ismember) + VATTR_SET(vap, va_gid, kauth_cred_getgid(nfsd->nd_cr)); + } + + error = nfsrv_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, &context, nxo, 0); + + /* construct ACL and handle inheritance */ if (!error) { - vp = nd.ni_vp; - bzero((caddr_t)fhp, sizeof(nfh)); - fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid; - error = VFS_VPTOFH(vp, &fhp->fh_fid); - if (!error) - error = VOP_GETATTR(vp, vap, cred, procp); - vput(vp); + error = kauth_acl_inherit(dvp, + NULL, + &xacl, /* isdir */ + 1, + &context); + + if (!error && xacl != NULL) + VATTR_SET(vap, va_acl, xacl); + } + VATTR_CLEAR_ACTIVE(vap, va_data_size); + VATTR_CLEAR_ACTIVE(vap, va_access_time); + + /* validate new-file security information */ + if (!error) { + error = vnode_authattr_new(dvp, vap, 0, &context); + if (error && (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))) { + /* + * Most NFS servers just ignore the UID/GID attributes, so we + * try ignoring them if that'll help the request succeed. + */ + VATTR_CLEAR_ACTIVE(vap, va_uid); + VATTR_CLEAR_ACTIVE(vap, va_gid); + error = vnode_authattr_new(dvp, vap, 0, &context); + } + } + + if (!error) + error = VNOP_MKDIR(dvp, &vp, &nd.ni_cnd, vap, &context); + + if (!error && !VATTR_ALL_SUPPORTED(vap)) + /* + * If some of the requested attributes weren't handled by the VNOP, + * use our fallback code. + */ + error = vnode_setattr_fallback(vp, vap, &context); + + if (xacl != NULL) + kauth_acl_free(xacl); + + if (!error) { + error = nfsrv_vptofh(nx, !v3, NULL, vp, &context, &nfh); + if (!error) { + nfsm_srv_vattr_init(&postat, v3); + error = vnode_getattr(vp, &postat, &context); + } + vnode_put(vp); + vp = NULL; } + /* + * nameidone has to happen before we vnode_put(dvp) + * since it may need to release the fs_nodelock on the dvp + */ + nameidone(&nd); + + vnode_put(dvp); out: + nd.ni_cnd.cn_nameiop = 0; + if (dirp) { - diraft_ret = VOP_GETATTR(dirp, &diraft, cred, procp); - vrele(dirp); + nfsm_srv_vattr_init(&diraft, v3); + diraft_ret = vnode_getattr(dirp, &diraft, &context); + vnode_put(dirp); } - nfsm_reply(NFSX_SRVFH(v3) + NFSX_POSTOPATTR(v3) + NFSX_WCCDATA(v3)); + nfsm_reply(NFSX_SRVFH(v3, &nfh) + NFSX_POSTOPATTR(v3) + NFSX_WCCDATA(v3)); if (v3) { if (!error) { - nfsm_srvpostop_fh(fhp); - nfsm_srvpostop_attr(0, vap); + nfsm_srvpostop_fh(&nfh); + nfsm_srvpostop_attr(0, &postat); } nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); } else { - nfsm_srvfhtom(fhp, v3); + nfsm_srvfhtom(&nfh, v3); nfsm_build(fp, struct nfs_fattr *, NFSX_V2FATTR); - nfsm_srvfillattr(vap, fp); + nfsm_srvfillattr(&postat, fp); } return (0); nfsmout: + if (nd.ni_cnd.cn_nameiop) { + /* + * nameidone has to happen before we vnode_put(dvp) + * since it may need to release the fs_nodelock on the dvp + */ + nameidone(&nd); + vnode_put(dvp); + if (vp) + vnode_put(vp); + } if (dirp) - vrele(dirp); - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - if (nd.ni_dvp == nd.ni_vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); - if (nd.ni_vp) - vrele(nd.ni_vp); + vnode_put(dirp); return (error); } @@ -2494,110 +3293,120 @@ int nfsrv_rmdir(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = 
nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - register u_long *tl; - register long t1; + u_long *tl; + long t1; caddr_t bpos; - int error = 0, cache, len, dirfor_ret = 1, diraft_ret = 1; + int error = 0, len; + int dirfor_ret = 1, diraft_ret = 1; int v3 = (nfsd->nd_flag & ND_NFSV3); char *cp2; - struct mbuf *mb, *mreq; - struct vnode *vp, *dirp = (struct vnode *)0; - struct vattr dirfor, diraft; - nfsfh_t nfh; - fhandle_t *fhp; + mbuf_t mb, mreq; + vnode_t vp, dvp, dirp = NULL; + struct vnode_attr dirfor, diraft; + struct nfs_filehandle nfh; + struct nfs_export *nx; + struct nfs_export_options *nxo; struct nameidata nd; - u_quad_t frev; + struct vfs_context context; + + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; + + vp = dvp = NULL; + nfsm_srvmtofh(&nfh); + nfsm_srvnamesiz(len, v3); - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); - nfsm_srvnamesiz(len); - nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameiop = DELETE; nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; - error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos, - &dirp, procp, (nfsd->nd_flag & ND_KERBAUTH), FALSE); + error = nfsm_path_mbuftond(&md, &dpos, v3, FALSE, &len, &nd); + if (!error) + error = nfs_namei(nfsd, &context, &nd, &nfh, nam, FALSE, &dirp, &nx, &nxo); if (dirp) { - if (v3) - dirfor_ret = VOP_GETATTR(dirp, &dirfor, cred, - procp); - else { - vrele(dirp); - dirp = (struct vnode *)0; + if (v3) { + nfsm_srv_pre_vattr_init(&dirfor, v3); + dirfor_ret = vnode_getattr(dirp, &dirfor, &context); + } else { + vnode_put(dirp); + dirp = NULL; } } if (error) { nfsm_reply(NFSX_WCCDATA(v3)); nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); if (dirp) - vrele(dirp); + vnode_put(dirp); return (0); } + dvp = nd.ni_dvp; vp = nd.ni_vp; - if (vp->v_type != VDIR) { + + if (vnode_vtype(vp) != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. */ - if (nd.ni_dvp == vp) { + if (dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. */ - if (vp->v_flag & VROOT) + if (vnode_isvroot(vp)) error = EBUSY; + if (!error) + error = nfsrv_authorize(vp, dvp, KAUTH_VNODE_DELETE, &context, nxo, 0); + if (!error) + error = VNOP_RMDIR(dvp, vp, &nd.ni_cnd, &context); out: - if (!error) { - nqsrv_getl(nd.ni_dvp, ND_WRITE); - nqsrv_getl(vp, ND_WRITE); - error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); - } else { - VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); - if (nd.ni_dvp == nd.ni_vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); - vput(vp); - } + /* + * nameidone has to happen before we vnode_put(dvp) + * since it may need to release the fs_nodelock on the dvp + */ + nameidone(&nd); + + vnode_put(dvp); + vnode_put(vp); + if (dirp) { - diraft_ret = VOP_GETATTR(dirp, &diraft, cred, procp); - vrele(dirp); + nfsm_srv_vattr_init(&diraft, v3); + diraft_ret = vnode_getattr(dirp, &diraft, &context); + vnode_put(dirp); } nfsm_reply(NFSX_WCCDATA(v3)); if (v3) { nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); return (0); } - nfsm_srvdone; +nfsmout: + return (error); } /* * nfs readdir service * - mallocs what it thinks is enough to read * count rounded up to a multiple of NFS_DIRBLKSIZ <= NFS_MAXREADDIR - * - calls VOP_READDIR() + * - calls VNOP_READDIR() * - loops around building the reply * if the output generated exceeds count break out of loop * The nfsm_clget macro is used here so that the reply will be packed * tightly in mbuf clusters. 
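
The comment pattern in the rmdir path above ("nameidone has to happen before we vnode_put(dvp)") recurs in every operation this patch touches: nameidone() may still need the fs_nodelock on the directory, so the directory's iocount must stay live until it returns. A toy illustration of that release ordering, where namei_state and dir_ref are hypothetical stand-ins for struct nameidata and the vnode iocount, not kernel types:

    #include <assert.h>
    #include <stdio.h>

    struct dir_ref     { int held; };               /* stands in for the iocount on dvp */
    struct namei_state { struct dir_ref *dvp; };    /* stands in for struct nameidata */

    static void name_i_done(struct namei_state *nd)
    {
        /* May still touch the directory, so its reference must be live. */
        assert(nd->dvp->held);
        printf("released name lookup state\n");
    }

    static void vnode_put_ref(struct dir_ref *d)
    {
        d->held = 0;
        printf("dropped directory reference\n");
    }

    int main(void)
    {
        struct dir_ref dvp = { .held = 1 };
        struct namei_state nd = { .dvp = &dvp };

        name_i_done(&nd);     /* 1: finish the lookup while dvp is still valid */
        vnode_put_ref(&dvp);  /* 2: only now drop the directory reference */
        return 0;
    }

The assert models the invariant; reversing the two calls in main() trips it, just as reversing nameidone() and vnode_put() would use a stale reference.
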
- * - it only knows that it has encountered eof when the VOP_READDIR() + * - it only knows that it has encountered eof when the VNOP_READDIR() * reads nothing * - as such one readdir rpc will return eof false although you are there * and then the next will return eof * - it trims out records with d_fileno == 0 * this doesn't matter for Unix clients, but they might confuse clients * for other os'. - * NB: It is tempting to set eof to true if the VOP_READDIR() reads less + * NB: It is tempting to set eof to true if the VNOP_READDIR() reads less * than requested, but this may not apply to all filesystems. For * example, client NFS does not { although it is never remote mounted * anyhow } @@ -2608,49 +3417,44 @@ out: * the EOF flag. For readdirplus, the maxcount is the same, and the * dircount includes all that except for the entry attributes and handles. */ -struct flrep { - nfsuint64 fl_off; - u_long fl_postopok; - u_long fl_fattr[NFSX_V3FATTR / sizeof (u_long)]; - u_long fl_fhok; - u_long fl_fhsize; - u_long fl_nfh[NFSX_V3FH / sizeof (u_long)]; -}; int nfsrv_readdir(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - register char *bp, *be; - register struct mbuf *mp; - register struct dirent *dp; - register caddr_t cp; - register u_long *tl; - register long t1; + char *bp, *be; + mbuf_t mp; + struct direntry *dp; + caddr_t cp; + u_long *tl; + long t1; caddr_t bpos; - struct mbuf *mb, *mb2, *mreq, *mp2; + mbuf_t mb, mb2, mreq, mp2; char *cpos, *cend, *cp2, *rbuf; - struct vnode *vp; - struct vattr at; - nfsfh_t nfh; - fhandle_t *fhp; - struct uio io; - struct iovec iv; + vnode_t vp; + struct vnode_attr at; + struct nfs_filehandle nfh; + struct nfs_export *nx; + struct nfs_export_options *nxo; + uio_t auio; + char uio_buf[ UIO_SIZEOF(1) ]; int len, nlen, rem, xfer, tsiz, i, error = 0, getret = 1; - int siz, cnt, fullsiz, eofflag, rdonly, cache, ncookies = 0; + int siz, count, fullsiz, eofflag, nentries = 0; int v3 = (nfsd->nd_flag & ND_NFSV3); - u_quad_t frev, off, toff, verf; - u_long *cookies = NULL, *cookiep; + u_quad_t off, toff, verf; + nfsuint64 tquad; + int vnopflag; + struct vfs_context context; - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); + vnopflag = VNODE_READDIR_EXTENDED | VNODE_READDIR_REQSEEKOFF; + + nfsm_srvmtofh(&nfh); if (v3) { nfsm_dissect(tl, u_long *, 5 * NFSX_UNSIGNED); fxdr_hyper(tl, &toff); @@ -2662,99 +3466,88 @@ nfsrv_readdir(nfsd, slp, procp, mrq) toff = fxdr_unsigned(u_quad_t, *tl++); } off = toff; - cnt = fxdr_unsigned(int, *tl); - siz = ((cnt + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1)); + count = fxdr_unsigned(int, *tl); + siz = ((count + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1)); xfer = NFS_SRVMAXDATA(nfsd); if (siz > xfer) siz = xfer; fullsiz = siz; - if ((error = nfsrv_fhtovp(fhp, 1, &vp, cred, slp, nam, - &rdonly, (nfsd->nd_flag & ND_KERBAUTH), TRUE))) { + if ((error = nfsrv_fhtovp(&nfh, nam, TRUE, &vp, &nx, &nxo))) { + nfsm_reply(NFSX_UNSIGNED); + nfsm_srvpostop_attr(getret, &at); + return (0); + } + if ((error = nfsrv_credcheck(nfsd, nx, nxo))) { + vnode_put(vp); nfsm_reply(NFSX_UNSIGNED); nfsm_srvpostop_attr(getret, &at); return (0); } - nqsrv_getl(vp, ND_READ); + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; if (v3) { - 
error = getret = VOP_GETATTR(vp, &at, cred, procp); + nfsm_srv_vattr_init(&at, v3); + error = getret = vnode_getattr(vp, &at, &context); if (!error && toff && verf && verf != at.va_filerev) error = NFSERR_BAD_COOKIE; } if (!error) - error = nfsrv_access(vp, VEXEC, cred, rdonly, procp, 0); + error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_LIST_DIRECTORY, &context, nxo, 0); if (error) { - vput(vp); + vnode_put(vp); nfsm_reply(NFSX_POSTOPATTR(v3)); nfsm_srvpostop_attr(getret, &at); return (0); } - VOP_UNLOCK(vp, 0, procp); MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK); -again: - iv.iov_base = rbuf; - iv.iov_len = fullsiz; - io.uio_iov = &iv; - io.uio_iovcnt = 1; - io.uio_offset = (off_t)off; - io.uio_resid = fullsiz; - io.uio_segflg = UIO_SYSSPACE; - io.uio_rw = UIO_READ; - io.uio_procp = (struct proc *)0; - eofflag = 0; - - if (cookies) { - _FREE((caddr_t)cookies, M_TEMP); - cookies = NULL; - } - if (error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, procp)) { - FREE((caddr_t)rbuf, M_TEMP); + if (!rbuf) { + error = ENOMEM; + vnode_put(vp); nfsm_reply(NFSX_POSTOPATTR(v3)); nfsm_srvpostop_attr(getret, &at); return (0); } - error = VOP_READDIR(vp, &io, cred, &eofflag, &ncookies, &cookies); - off = (off_t)io.uio_offset; - /* - * We cannot set the error in the case where there are no cookies - * and no error, only, as FreeBSD. In the scenario the client is - * calling us back being told there were "more" entries on last readdir - * return, and we have no more entries, our VOP_READDIR can give - * cookies = NULL and no error. This is due to a zero size to MALLOC - * returning NULL unlike FreeBSD which returns a pointer. - * With FreeBSD it makes sense if the MALLOC failed and you get in that - * bind. For us, we need something more. Thus, we should make sure we - * had some cookies to return, but no pointer and no error for EPERM case. - * Otherwise, go thru normal processing of sending back the eofflag. This check - * is also legit on first call to the routine by client since . and .. - * should be returned. Make same change to nfsrv_readdirplus. 
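
With the VNODE_READDIR_EXTENDED | VNODE_READDIR_REQSEEKOFF flags used above, each struct direntry carries its own 64-bit resume cookie in d_seekoff, which is why the old parallel cookies/ncookies array disappears from this path. A user-space sketch of consuming such a buffer; struct entry is a hypothetical stand-in for struct direntry, with fields mirroring d_fileno, d_seekoff, and d_reclen:

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for the extended struct direntry. */
    struct entry {
        uint64_t fileno;     /* like d_fileno */
        uint64_t seekoff;    /* like d_seekoff: cookie to resume after this entry */
        uint16_t reclen;     /* like d_reclen: bytes to the next record */
        char     name[16];
    };

    int main(void)
    {
        struct entry buf[2] = {
            { 101, 1, sizeof(struct entry), "alpha" },
            { 102, 2, sizeof(struct entry), "beta"  },
        };
        char *cpos = (char *)buf, *cend = cpos + sizeof(buf);
        uint64_t cookie = 0;

        while (cpos < cend) {
            struct entry *dp = (struct entry *)cpos;
            if (dp->fileno != 0) {          /* skip deleted slots, as the server does */
                printf("%s cookie=%llu\n", dp->name,
                       (unsigned long long)dp->seekoff);
                cookie = dp->seekoff;       /* what a client would send back */
            }
            cpos += dp->reclen;
        }
        printf("resume at %llu\n", (unsigned long long)cookie);
        return 0;
    }

Embedding the cookie in the record removes the failure mode the deleted FreeBSD-era comment worried about, where the cookie array and the entry buffer could disagree.
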
- */ - if ((ncookies != 0) && !cookies && !error) - error = NFSERR_PERM; - + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, + &uio_buf[0], sizeof(uio_buf)); + if (!auio) { + error = ENOMEM; + FREE(rbuf, M_TEMP); + vnode_put(vp); + nfsm_reply(NFSX_POSTOPATTR(v3)); + nfsm_srvpostop_attr(getret, &at); + return (0); + } +again: + uio_reset(auio, off, UIO_SYSSPACE, UIO_READ); + uio_addiov(auio, CAST_USER_ADDR_T(rbuf), fullsiz); + + eofflag = 0; + error = VNOP_READDIR(vp, auio, vnopflag, &eofflag, &nentries, &context); + off = uio_offset(auio); + if (v3) { - getret = VOP_GETATTR(vp, &at, cred, procp); + nfsm_srv_vattr_init(&at, v3); + getret = vnode_getattr(vp, &at, &context); if (!error) error = getret; } - VOP_UNLOCK(vp, 0, procp); if (error) { - vrele(vp); - _FREE((caddr_t)rbuf, M_TEMP); - if (cookies) - _FREE((caddr_t)cookies, M_TEMP); + vnode_put(vp); + FREE(rbuf, M_TEMP); nfsm_reply(NFSX_POSTOPATTR(v3)); nfsm_srvpostop_attr(getret, &at); return (0); } - if (io.uio_resid) { - siz -= io.uio_resid; + if (uio_resid(auio) != 0) { + // LP64todo - fix this + siz -= uio_resid(auio); /* * If nothing read, return eof * rpc reply */ if (siz == 0) { - vrele(vp); + vnode_put(vp); nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_COOKIEVERF(v3) + 2 * NFSX_UNSIGNED); if (v3) { @@ -2766,8 +3559,7 @@ again: nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); *tl++ = nfs_false; *tl = nfs_true; - FREE((caddr_t)rbuf, M_TEMP); - FREE((caddr_t)cookies, M_TEMP); + FREE(rbuf, M_TEMP); return (0); } } @@ -2778,32 +3570,19 @@ again: */ cpos = rbuf; cend = rbuf + siz; - dp = (struct dirent *)cpos; - cookiep = cookies; -#ifdef __FreeBSD__ - /* - * For some reason FreeBSD's ufs_readdir() chooses to back the - * directory offset up to a block boundary, so it is necessary to - * skip over the records that preceed the requested offset. This - * requires the assumption that file offset cookies monotonically - * increase. - */ - while (cpos < cend && ncookies > 0 && - (dp->d_fileno == 0 || ((u_quad_t)(*cookiep)) <= toff)) { -#else - while (dp->d_fileno == 0 && cpos < cend && ncookies > 0) { -#endif + dp = (struct direntry *)cpos; + while (dp->d_fileno == 0 && cpos < cend && nentries > 0) { cpos += dp->d_reclen; - dp = (struct dirent *)cpos; - cookiep++; - ncookies--; + dp = (struct direntry *)cpos; + nentries--; } - if (cpos >= cend || ncookies == 0) { + if (cpos >= cend || nentries == 0) { toff = off; siz = fullsiz; goto again; } + vnode_put(vp); nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_COOKIEVERF(v3) + siz); if (v3) { len = NFSX_V3POSTOPATTR + NFSX_V3COOKIEVERF + 2 * NFSX_UNSIGNED; @@ -2814,36 +3593,42 @@ again: len = 2 * NFSX_UNSIGNED; mp = mp2 = mb; bp = bpos; - be = bp + M_TRAILINGSPACE(mp); + be = bp + mbuf_trailingspace(mp); /* Loop through the records and build reply */ - while (cpos < cend && ncookies > 0) { + while (cpos < cend && nentries > 0) { if (dp->d_fileno != 0) { nlen = dp->d_namlen; + if (!v3 && (nlen > NFS_MAXNAMLEN)) + nlen = NFS_MAXNAMLEN; rem = nfsm_rndup(nlen)-nlen; len += (4 * NFSX_UNSIGNED + nlen + rem); if (v3) len += 2 * NFSX_UNSIGNED; - if (len > cnt) { + if (len > count) { eofflag = 0; break; } /* * Build the directory record xdr from - * the dirent entry. + * the direntry entry. 
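
The v3 branch above now encodes the 64-bit d_fileno (and, below, d_seekoff) with txdr_hyper() as two 32-bit XDR words, replacing the old zero word plus 32-bit value. A minimal sketch of that split, assuming XDR's big-endian wire order with the high word first; to_wire() is a hypothetical helper, not the kernel macro:

    #include <arpa/inet.h>   /* htonl, ntohl */
    #include <stdint.h>
    #include <stdio.h>

    /* Split a 64-bit value into two big-endian 32-bit XDR words. */
    static void to_wire(uint64_t v, uint32_t out[2])
    {
        out[0] = htonl((uint32_t)(v >> 32));        /* high word first */
        out[1] = htonl((uint32_t)(v & 0xffffffffu));
    }

    int main(void)
    {
        uint32_t w[2];
        to_wire(0x0000000100000002ULL, w);
        printf("%08x %08x\n", ntohl(w[0]), ntohl(w[1]));   /* 00000001 00000002 */
        return 0;
    }
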
*/ nfsm_clget; *tl = nfs_true; bp += NFSX_UNSIGNED; + nfsm_clget; if (v3) { + txdr_hyper(&dp->d_fileno, &tquad); + *tl = tquad.nfsuquad[0]; + bp += NFSX_UNSIGNED; nfsm_clget; - *tl = 0; + *tl = tquad.nfsuquad[1]; + bp += NFSX_UNSIGNED; + } else { + *tl = txdr_unsigned(dp->d_fileno); bp += NFSX_UNSIGNED; } nfsm_clget; - *tl = txdr_unsigned(dp->d_fileno); - bp += NFSX_UNSIGNED; - nfsm_clget; *tl = txdr_unsigned(nlen); bp += NFSX_UNSIGNED; @@ -2865,23 +3650,25 @@ again: /* And null pad to a long boundary */ for (i = 0; i < rem; i++) *bp++ = '\0'; - nfsm_clget; - /* Finish off the record */ + /* Finish off the record with the cookie */ + nfsm_clget; if (v3) { - *tl = 0; + txdr_hyper(&dp->d_seekoff, &tquad); + *tl = tquad.nfsuquad[0]; bp += NFSX_UNSIGNED; nfsm_clget; + *tl = tquad.nfsuquad[1]; + bp += NFSX_UNSIGNED; + } else { + *tl = txdr_unsigned(dp->d_seekoff); + bp += NFSX_UNSIGNED; } - *tl = txdr_unsigned(*cookiep); - bp += NFSX_UNSIGNED; } cpos += dp->d_reclen; - dp = (struct dirent *)cpos; - cookiep++; - ncookies--; + dp = (struct direntry *)cpos; + nentries--; } - vrele(vp); nfsm_clget; *tl = nfs_false; bp += NFSX_UNSIGNED; @@ -2893,135 +3680,150 @@ again: bp += NFSX_UNSIGNED; if (mp != mb) { if (bp < be) - mp->m_len = bp - mtod(mp, caddr_t); + mbuf_setlen(mp, bp - (char*)mbuf_data(mp)); } else - mp->m_len += bp - bpos; - FREE((caddr_t)rbuf, M_TEMP); - FREE((caddr_t)cookies, M_TEMP); - nfsm_srvdone; + mbuf_setlen(mp, mbuf_len(mp) + (bp - bpos)); + FREE(rbuf, M_TEMP); +nfsmout: + return (error); } +struct flrep { + nfsuint64 fl_off; + u_long fl_postopok; + u_long fl_fattr[NFSX_V3FATTR / sizeof (u_long)]; + u_long fl_fhok; + u_long fl_fhsize; + u_long fl_nfh[NFSX_V3FHMAX / sizeof (u_long)]; +}; + int nfsrv_readdirplus(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - register char *bp, *be; - register struct mbuf *mp; - register struct dirent *dp; - register caddr_t cp; - register u_long *tl; - register long t1; + char *bp, *be; + mbuf_t mp; + struct direntry *dp; + caddr_t cp; + u_long *tl; + long t1; caddr_t bpos; - struct mbuf *mb, *mb2, *mreq, *mp2; + mbuf_t mb, mb2, mreq, mp2; char *cpos, *cend, *cp2, *rbuf; - struct vnode *vp, *nvp; + vnode_t vp, nvp; struct flrep fl; - nfsfh_t nfh; - fhandle_t *fhp, *nfhp = (fhandle_t *)fl.fl_nfh; - struct uio io; - struct iovec iv; - struct vattr va, at, *vap = &va; + struct nfs_filehandle dnfh, *nfhp = (struct nfs_filehandle *)&fl.fl_fhsize; + struct nfs_export *nx; + struct nfs_export_options *nxo; + uio_t auio; + char uio_buf[ UIO_SIZEOF(1) ]; + struct vnode_attr va, at, *vap = &va; struct nfs_fattr *fp; int len, nlen, rem, xfer, tsiz, i, error = 0, getret = 1; - int siz, cnt, fullsiz, eofflag, rdonly, cache, dirlen, ncookies = 0; - u_quad_t frev, off, toff, verf; - u_long *cookies = NULL, *cookiep; - void *file; + int siz, count, fullsiz, eofflag, dirlen, nentries = 0, isdotdot; + u_quad_t off, toff, verf; + nfsuint64 tquad; + int vnopflag; + struct vfs_context context; - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); + vnopflag = VNODE_READDIR_EXTENDED | VNODE_READDIR_REQSEEKOFF; + vp = NULL; + nfsm_srvmtofh(&dnfh); nfsm_dissect(tl, u_long *, 6 * NFSX_UNSIGNED); fxdr_hyper(tl, &toff); tl += 2; fxdr_hyper(tl, 
&verf); tl += 2; siz = fxdr_unsigned(int, *tl++); - cnt = fxdr_unsigned(int, *tl); + count = fxdr_unsigned(int, *tl); off = toff; siz = ((siz + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1)); xfer = NFS_SRVMAXDATA(nfsd); if (siz > xfer) siz = xfer; fullsiz = siz; - if ((error = nfsrv_fhtovp(fhp, 1, &vp, cred, slp, nam, - &rdonly, (nfsd->nd_flag & ND_KERBAUTH), TRUE))) { + if ((error = nfsrv_fhtovp(&dnfh, nam, TRUE, &vp, &nx, &nxo))) { + nfsm_reply(NFSX_UNSIGNED); + nfsm_srvpostop_attr(getret, &at); + return (0); + } + if ((error = nfsrv_credcheck(nfsd, nx, nxo))) { + vnode_put(vp); nfsm_reply(NFSX_UNSIGNED); nfsm_srvpostop_attr(getret, &at); return (0); } - error = getret = VOP_GETATTR(vp, &at, cred, procp); + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; + nfsm_srv_vattr_init(&at, 1); + error = getret = vnode_getattr(vp, &at, &context); if (!error && toff && verf && verf != at.va_filerev) error = NFSERR_BAD_COOKIE; - if (!error) { - nqsrv_getl(vp, ND_READ); - error = nfsrv_access(vp, VEXEC, cred, rdonly, procp, 0); - } + if (!error) + error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_LIST_DIRECTORY, &context, nxo, 0); if (error) { - vput(vp); + vnode_put(vp); + vp = NULL; nfsm_reply(NFSX_V3POSTOPATTR); nfsm_srvpostop_attr(getret, &at); return (0); } - VOP_UNLOCK(vp, 0, procp); MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK); + if (!rbuf) { + error = ENOMEM; + vnode_put(vp); + vp = NULL; + nfsm_reply(NFSX_V3POSTOPATTR); + nfsm_srvpostop_attr(getret, &at); + return (0); + } + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, + &uio_buf[0], sizeof(uio_buf)); + if (!auio) { + error = ENOMEM; + FREE(rbuf, M_TEMP); + vnode_put(vp); + vp = NULL; + nfsm_reply(NFSX_V3POSTOPATTR); + nfsm_srvpostop_attr(getret, &at); + return (0); + } again: - iv.iov_base = rbuf; - iv.iov_len = fullsiz; - io.uio_iov = &iv; - io.uio_iovcnt = 1; - io.uio_offset = (off_t)off; - io.uio_resid = fullsiz; - io.uio_segflg = UIO_SYSSPACE; - io.uio_rw = UIO_READ; - io.uio_procp = (struct proc *)0; + uio_reset(auio, off, UIO_SYSSPACE, UIO_READ); + uio_addiov(auio, CAST_USER_ADDR_T(rbuf), fullsiz); eofflag = 0; - if (cookies) { - _FREE((caddr_t)cookies, M_TEMP); - cookies = NULL; - } - if (error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, procp)) { - FREE((caddr_t)rbuf, M_TEMP); - nfsm_reply(NFSX_V3POSTOPATTR); - nfsm_srvpostop_attr(getret, &at); - return (0); - } - error = VOP_READDIR(vp, &io, cred, &eofflag, &ncookies, &cookies); - off = (u_quad_t)io.uio_offset; - getret = VOP_GETATTR(vp, &at, cred, procp); - VOP_UNLOCK(vp, 0, procp); - /* - * See nfsrv_readdir comment above on this - */ - if ((ncookies != 0) && !cookies && !error) - error = NFSERR_PERM; + error = VNOP_READDIR(vp, auio, vnopflag, &eofflag, &nentries, &context); + off = uio_offset(auio); + nfsm_srv_vattr_init(&at, 1); + getret = vnode_getattr(vp, &at, &context); if (!error) error = getret; if (error) { - vrele(vp); - if (cookies) - _FREE((caddr_t)cookies, M_TEMP); - _FREE((caddr_t)rbuf, M_TEMP); + vnode_put(vp); + vp = NULL; + FREE(rbuf, M_TEMP); nfsm_reply(NFSX_V3POSTOPATTR); nfsm_srvpostop_attr(getret, &at); return (0); } - if (io.uio_resid) { - siz -= io.uio_resid; + if (uio_resid(auio) != 0) { + // LP64todo - fix this + siz -= uio_resid(auio); /* * If nothing read, return eof * rpc reply */ if (siz == 0) { - vrele(vp); + vnode_put(vp); + vp = NULL; nfsm_reply(NFSX_V3POSTOPATTR + NFSX_V3COOKIEVERF + 2 * NFSX_UNSIGNED); nfsm_srvpostop_attr(getret, &at); @@ -3030,8 +3832,7 @@ again: tl += 2; *tl++ = nfs_false; *tl = nfs_true; - FREE((caddr_t)cookies, M_TEMP); - 
FREE((caddr_t)rbuf, M_TEMP); + FREE(rbuf, M_TEMP); return (0); } } @@ -3042,27 +3843,13 @@ again: */ cpos = rbuf; cend = rbuf + siz; - dp = (struct dirent *)cpos; - cookiep = cookies; -#ifdef __FreeBSD__ - /* - * For some reason FreeBSD's ufs_readdir() chooses to back the - * directory offset up to a block boundary, so it is necessary to - * skip over the records that preceed the requested offset. This - * requires the assumption that file offset cookies monotonically - * increase. - */ - while (cpos < cend && ncookies > 0 && - (dp->d_fileno == 0 || ((u_quad_t)(*cookiep)) <= toff)) { -#else - while (dp->d_fileno == 0 && cpos < cend && ncookies > 0) { -#endif + dp = (struct direntry *)cpos; + while (dp->d_fileno == 0 && cpos < cend && nentries > 0) { cpos += dp->d_reclen; - dp = (struct dirent *)cpos; - cookiep++; - ncookies--; + dp = (struct direntry *)cpos; + nentries--; } - if (cpos >= cend || ncookies == 0) { + if (cpos >= cend || nentries == 0) { toff = off; siz = fullsiz; goto again; @@ -3070,70 +3857,56 @@ again: /* * Probe one of the directory entries to see if the filesystem - * supports VGET. See later comment for VFS_VGET changes. + * supports VGET. */ - if (vp->v_tag == VT_UFS) - file = (void *) dp->d_fileno; - else { - file = &dp->d_fileno; - } - - if (error = VFS_VGET(vp->v_mount, file, &nvp)) { - if (error == EOPNOTSUPP) /* let others get passed back */ + if ((error = VFS_VGET(vnode_mount(vp), (ino64_t)dp->d_fileno, &nvp, &context))) { + if (error == ENOTSUP) /* let others get passed back */ error = NFSERR_NOTSUPP; - vrele(vp); - _FREE((caddr_t)cookies, M_TEMP); - _FREE((caddr_t)rbuf, M_TEMP); + vnode_put(vp); + vp = NULL; + FREE(rbuf, M_TEMP); nfsm_reply(NFSX_V3POSTOPATTR); nfsm_srvpostop_attr(getret, &at); return (0); } - vput(nvp); + vnode_put(nvp); dirlen = len = NFSX_V3POSTOPATTR + NFSX_V3COOKIEVERF + 2 * NFSX_UNSIGNED; - nfsm_reply(cnt); + nfsm_reply(count); nfsm_srvpostop_attr(getret, &at); nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); txdr_hyper(&at.va_filerev, tl); mp = mp2 = mb; bp = bpos; - be = bp + M_TRAILINGSPACE(mp); + be = bp + mbuf_trailingspace(mp); /* Loop through the records and build reply */ - while (cpos < cend && ncookies > 0) { + while (cpos < cend && nentries > 0) { if (dp->d_fileno != 0) { nlen = dp->d_namlen; rem = nfsm_rndup(nlen)-nlen; /* * Got to get the vnode for lookup per entry. - * HFS+/volfs and others use address of file identifier to VGET - * UFS, nullfs, umapfs use inode (u_int32_t) - * until they are consistent, we must differentiate now. - * UFS is the only one of the latter class that is exported. - * Note this will be pulled out as we resolve the VGET issue - * of which it should use u_in32_t or addresses. */ - - if (vp->v_tag == VT_UFS) - file = (void *) dp->d_fileno; - else - file = &dp->d_fileno; - - if (VFS_VGET(vp->v_mount, file, &nvp)) + if (VFS_VGET(vnode_mount(vp), (ino64_t)dp->d_fileno, &nvp, &context)) goto invalid; - bzero((caddr_t)nfhp, NFSX_V3FH); - nfhp->fh_fsid = - nvp->v_mount->mnt_stat.f_fsid; - if (VFS_VPTOFH(nvp, &nfhp->fh_fid)) { - vput(nvp); + isdotdot = ((dp->d_namlen == 2) && + (dp->d_name[0] == '.') && (dp->d_name[1] == '.')); + if (nfsrv_vptofh(nx, 0, (isdotdot ? 
&dnfh : NULL), nvp, &context, nfhp)) { + // XXX file handle is optional, so we should be able to + // XXX return this entry without the file handle + vnode_put(nvp); goto invalid; } - if (VOP_GETATTR(nvp, vap, cred, procp)) { - vput(nvp); + nfsm_srv_vattr_init(vap, 1); + if (vnode_getattr(nvp, vap, &context)) { + // XXX attributes are optional, so we should be able to + // XXX return this entry without the attributes + vnode_put(nvp); goto invalid; } - vput(nvp); + vnode_put(nvp); /* * If either the dircount or maxcount will be @@ -3141,35 +3914,37 @@ again: * are calculated conservatively, including all * XDR overheads. */ - len += (8 * NFSX_UNSIGNED + nlen + rem + NFSX_V3FH + + len += (8 * NFSX_UNSIGNED + nlen + rem + nfhp->nfh_len + NFSX_V3POSTOPATTR); dirlen += (6 * NFSX_UNSIGNED + nlen + rem); - if (len > cnt || dirlen > fullsiz) { + if (len > count || dirlen > fullsiz) { eofflag = 0; break; } /* * Build the directory record xdr from - * the dirent entry. + * the direntry entry. */ fp = (struct nfs_fattr *)&fl.fl_fattr; nfsm_srvfillattr(vap, fp); - fl.fl_fhsize = txdr_unsigned(NFSX_V3FH); + fl.fl_fhsize = txdr_unsigned(nfhp->nfh_len); fl.fl_fhok = nfs_true; fl.fl_postopok = nfs_true; - fl.fl_off.nfsuquad[0] = 0; - fl.fl_off.nfsuquad[1] = txdr_unsigned(*cookiep); + txdr_hyper(&dp->d_seekoff, &fl.fl_off); nfsm_clget; *tl = nfs_true; bp += NFSX_UNSIGNED; + nfsm_clget; - *tl = 0; + txdr_hyper(&dp->d_fileno, &tquad); + *tl = tquad.nfsuquad[0]; bp += NFSX_UNSIGNED; nfsm_clget; - *tl = txdr_unsigned(dp->d_fileno); + *tl = tquad.nfsuquad[1]; bp += NFSX_UNSIGNED; + nfsm_clget; *tl = txdr_unsigned(nlen); bp += NFSX_UNSIGNED; @@ -3196,7 +3971,7 @@ again: /* * Now copy the flrep structure out. */ - xfer = sizeof (struct flrep); + xfer = sizeof(struct flrep) - sizeof(fl.fl_nfh) + fl.fl_fhsize; cp = (caddr_t)&fl; while (xfer > 0) { nfsm_clget; @@ -3213,11 +3988,11 @@ again: } invalid: cpos += dp->d_reclen; - dp = (struct dirent *)cpos; - cookiep++; - ncookies--; + dp = (struct direntry *)cpos; + nentries--; } - vrele(vp); + vnode_put(vp); + vp = NULL; nfsm_clget; *tl = nfs_false; bp += NFSX_UNSIGNED; @@ -3229,12 +4004,14 @@ invalid: bp += NFSX_UNSIGNED; if (mp != mb) { if (bp < be) - mp->m_len = bp - mtod(mp, caddr_t); + mbuf_setlen(mp, bp - (char*)mbuf_data(mp)); } else - mp->m_len += bp - bpos; - FREE((caddr_t)cookies, M_TEMP); - FREE((caddr_t)rbuf, M_TEMP); - nfsm_srvdone; + mbuf_setlen(mp, mbuf_len(mp) + (bp - bpos)); + FREE(rbuf, M_TEMP); +nfsmout: + if (vp) + vnode_put(vp); + return (error); } /* @@ -3244,63 +4021,66 @@ int nfsrv_commit(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - struct vattr bfor, aft; - struct vnode *vp; - nfsfh_t nfh; - fhandle_t *fhp; - register u_long *tl; - register long t1; + struct vnode_attr bfor, aft; + vnode_t vp; + struct nfs_filehandle nfh; + struct nfs_export *nx; + struct nfs_export_options *nxo; + u_long *tl; + long t1; caddr_t bpos; - int error = 0, rdonly, for_ret = 1, aft_ret = 1, cnt, cache; + int error = 0, for_ret = 1, aft_ret = 1, count; char *cp2; - struct mbuf *mb, *mb2, *mreq; - u_quad_t frev, off; - int didhold; + mbuf_t mb, mb2, mreq; + u_quad_t off; + struct vfs_context context; -#ifndef nolint - cache = 0; 
-#endif - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); + nfsm_srvmtofh(&nfh); nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED); /* - * XXX At this time VOP_FSYNC() does not accept offset and byte + * XXX At this time VNOP_FSYNC() does not accept offset and byte * count parameters, so these arguments are useless (someday maybe). */ fxdr_hyper(tl, &off); tl += 2; - cnt = fxdr_unsigned(int, *tl); - if ((error = nfsrv_fhtovp(fhp, 1, &vp, cred, slp, nam, - &rdonly, (nfsd->nd_flag & ND_KERBAUTH), TRUE))) { + count = fxdr_unsigned(int, *tl); + if ((error = nfsrv_fhtovp(&nfh, nam, TRUE, &vp, &nx, &nxo))) { nfsm_reply(2 * NFSX_UNSIGNED); nfsm_srvwcc_data(for_ret, &bfor, aft_ret, &aft); return (0); } - for_ret = VOP_GETATTR(vp, &bfor, cred, procp); - didhold = ubc_hold(vp); - error = VOP_FSYNC(vp, cred, MNT_WAIT, procp); - aft_ret = VOP_GETATTR(vp, &aft, cred, procp); - VOP_UNLOCK(vp, 0, procp); - if (didhold) - ubc_rele(vp); - vrele(vp); + if ((error = nfsrv_credcheck(nfsd, nx, nxo))) { + vnode_put(vp); + nfsm_reply(2 * NFSX_UNSIGNED); + nfsm_srvwcc_data(for_ret, &bfor, aft_ret, &aft); + return (0); + } + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; + + nfsm_srv_pre_vattr_init(&bfor, 1); + for_ret = vnode_getattr(vp, &bfor, &context); + error = VNOP_FSYNC(vp, MNT_WAIT, &context); + nfsm_srv_vattr_init(&aft, 1); + aft_ret = vnode_getattr(vp, &aft, &context); + vnode_put(vp); nfsm_reply(NFSX_V3WCCDATA + NFSX_V3WRITEVERF); nfsm_srvwcc_data(for_ret, &bfor, aft_ret, &aft); if (!error) { nfsm_build(tl, u_long *, NFSX_V3WRITEVERF); - *tl++ = txdr_unsigned(boottime.tv_sec); - *tl = txdr_unsigned(boottime.tv_usec); + *tl++ = txdr_unsigned(boottime_sec()); + *tl = txdr_unsigned(0); } else return (0); - nfsm_srvdone; +nfsmout: + return (error); } /* @@ -3310,44 +4090,55 @@ int nfsrv_statfs(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - register struct statfs *sf; - register struct nfs_statfs *sfp; - register u_long *tl; - register long t1; + struct vfs_attr va; + struct nfs_statfs *sfp; + u_long *tl; + long t1; caddr_t bpos; - int error = 0, rdonly, cache, getret = 1; + int error = 0, getret = 1; int v3 = (nfsd->nd_flag & ND_NFSV3); char *cp2; - struct mbuf *mb, *mb2, *mreq; - struct vnode *vp; - struct vattr at; - nfsfh_t nfh; - fhandle_t *fhp; - struct statfs statfs; - u_quad_t frev, tval; + mbuf_t mb, mb2, mreq; + vnode_t vp; + struct vnode_attr at; + struct nfs_filehandle nfh; + struct nfs_export *nx; + struct nfs_export_options *nxo; + u_quad_t tval; + off_t blksize; + struct vfs_context context; -#ifndef nolint - cache = 0; -#endif - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); - if ((error = nfsrv_fhtovp(fhp, 1, &vp, cred, slp, nam, - &rdonly, (nfsd->nd_flag & ND_KERBAUTH), TRUE))) { + nfsm_srvmtofh(&nfh); + if ((error = nfsrv_fhtovp(&nfh, nam, TRUE, &vp, &nx, &nxo))) { + nfsm_reply(NFSX_UNSIGNED); + nfsm_srvpostop_attr(getret, &at); + return (0); + } + if ((error = nfsrv_credcheck(nfsd, nx, nxo))) { + vnode_put(vp); nfsm_reply(NFSX_UNSIGNED); nfsm_srvpostop_attr(getret, &at); return (0); } - sf = &statfs; - error = VFS_STATFS(vp->v_mount, sf, procp); - getret = VOP_GETATTR(vp, &at, cred, procp); - vput(vp); + context.vc_proc = procp; + context.vc_ucred 
= nfsd->nd_cr; + + VFSATTR_INIT(&va); + VFSATTR_WANTED(&va, f_blocks); + VFSATTR_WANTED(&va, f_bavail); + VFSATTR_WANTED(&va, f_files); + VFSATTR_WANTED(&va, f_ffree); + error = vfs_getattr(vnode_mount(vp), &va, &context); + blksize = vnode_mount(vp)->mnt_vfsstat.f_bsize; + nfsm_srv_vattr_init(&at, v3); + getret = vnode_getattr(vp, &at, &context); + vnode_put(vp); nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_STATFS(v3)); if (v3) nfsm_srvpostop_attr(getret, &at); @@ -3355,30 +4146,25 @@ nfsrv_statfs(nfsd, slp, procp, mrq) return (0); nfsm_build(sfp, struct nfs_statfs *, NFSX_STATFS(v3)); if (v3) { - tval = (u_quad_t)(unsigned long)sf->f_blocks; - tval *= (u_quad_t)(unsigned long)sf->f_bsize; + tval = (u_quad_t)(va.f_blocks * blksize); txdr_hyper(&tval, &sfp->sf_tbytes); - tval = (u_quad_t)(unsigned long)sf->f_bfree; - tval *= (u_quad_t)(unsigned long)sf->f_bsize; + tval = (u_quad_t)(va.f_bfree * blksize); txdr_hyper(&tval, &sfp->sf_fbytes); - tval = (u_quad_t)(unsigned long)sf->f_bavail; - tval *= (u_quad_t)(unsigned long)sf->f_bsize; + tval = (u_quad_t)(va.f_bavail * blksize); txdr_hyper(&tval, &sfp->sf_abytes); - sfp->sf_tfiles.nfsuquad[0] = 0; - sfp->sf_tfiles.nfsuquad[1] = txdr_unsigned(sf->f_files); - sfp->sf_ffiles.nfsuquad[0] = 0; - sfp->sf_ffiles.nfsuquad[1] = txdr_unsigned(sf->f_ffree); - sfp->sf_afiles.nfsuquad[0] = 0; - sfp->sf_afiles.nfsuquad[1] = txdr_unsigned(sf->f_ffree); + txdr_hyper(&va.f_files, &sfp->sf_tfiles); + txdr_hyper(&va.f_ffree, &sfp->sf_ffiles); + txdr_hyper(&va.f_ffree, &sfp->sf_afiles); sfp->sf_invarsec = 0; } else { sfp->sf_tsize = txdr_unsigned(NFS_V2MAXDATA); - sfp->sf_bsize = txdr_unsigned(sf->f_bsize); - sfp->sf_blocks = txdr_unsigned(sf->f_blocks); - sfp->sf_bfree = txdr_unsigned(sf->f_bfree); - sfp->sf_bavail = txdr_unsigned(sf->f_bavail); + sfp->sf_bsize = txdr_unsigned((unsigned)blksize); + sfp->sf_blocks = txdr_unsigned((unsigned)va.f_blocks); + sfp->sf_bfree = txdr_unsigned((unsigned)va.f_bfree); + sfp->sf_bavail = txdr_unsigned((unsigned)va.f_bavail); } - nfsm_srvdone; +nfsmout: + return (error); } /* @@ -3388,39 +4174,44 @@ int nfsrv_fsinfo(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - register u_long *tl; - register struct nfsv3_fsinfo *sip; - register long t1; + u_long *tl; + struct nfsv3_fsinfo *sip; + long t1; caddr_t bpos; - int error = 0, rdonly, cache, getret = 1, pref, max; + int error = 0, getret = 1, prefsize, maxsize; char *cp2; - struct mbuf *mb, *mb2, *mreq; - struct vnode *vp; - struct vattr at; - nfsfh_t nfh; - fhandle_t *fhp; - u_quad_t frev; + mbuf_t mb, mb2, mreq; + vnode_t vp; + struct vnode_attr at; + struct nfs_filehandle nfh; + struct nfs_export *nx; + struct nfs_export_options *nxo; + struct vfs_context context; -#ifndef nolint - cache = 0; -#endif - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); - if ((error = nfsrv_fhtovp(fhp, 1, &vp, cred, slp, nam, - &rdonly, (nfsd->nd_flag & ND_KERBAUTH), TRUE))) { + nfsm_srvmtofh(&nfh); + if ((error = nfsrv_fhtovp(&nfh, nam, TRUE, &vp, &nx, &nxo))) { nfsm_reply(NFSX_UNSIGNED); nfsm_srvpostop_attr(getret, &at); return (0); } - getret = VOP_GETATTR(vp, &at, cred, procp); - vput(vp); + if ((error = nfsrv_credcheck(nfsd, nx, nxo))) { + vnode_put(vp); + 
nfsm_reply(NFSX_UNSIGNED); + nfsm_srvpostop_attr(getret, &at); + return (0); + } + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; + + nfsm_srv_vattr_init(&at, 1); + getret = vnode_getattr(vp, &at, &context); + vnode_put(vp); nfsm_reply(NFSX_V3POSTOPATTR + NFSX_V3FSINFO); nfsm_srvpostop_attr(getret, &at); nfsm_build(sip, struct nfsv3_fsinfo *, NFSX_V3FSINFO); @@ -3430,17 +4221,18 @@ nfsrv_fsinfo(nfsd, slp, procp, mrq) * There should be file system VFS OP(s) to get this information. * For now, assume our usual NFS defaults. */ - if (slp->ns_so->so_type == SOCK_DGRAM) - max = pref = NFS_MAXDGRAMDATA; - else - max = pref = NFS_MAXDATA; - sip->fs_rtmax = txdr_unsigned(max); - sip->fs_rtpref = txdr_unsigned(pref); + if (slp->ns_sotype == SOCK_DGRAM) { + maxsize = NFS_MAXDGRAMDATA; + prefsize = NFS_PREFDGRAMDATA; + } else + maxsize = prefsize = NFS_MAXDATA; + sip->fs_rtmax = txdr_unsigned(maxsize); + sip->fs_rtpref = txdr_unsigned(prefsize); sip->fs_rtmult = txdr_unsigned(NFS_FABLKSIZE); - sip->fs_wtmax = txdr_unsigned(max); - sip->fs_wtpref = txdr_unsigned(pref); + sip->fs_wtmax = txdr_unsigned(maxsize); + sip->fs_wtpref = txdr_unsigned(prefsize); sip->fs_wtmult = txdr_unsigned(NFS_FABLKSIZE); - sip->fs_dtpref = txdr_unsigned(pref); + sip->fs_dtpref = txdr_unsigned(prefsize); sip->fs_maxfilesize.nfsuquad[0] = 0xffffffff; sip->fs_maxfilesize.nfsuquad[1] = 0xffffffff; sip->fs_timedelta.nfsv3_sec = 0; @@ -3448,7 +4240,8 @@ nfsrv_fsinfo(nfsd, slp, procp, mrq) sip->fs_properties = txdr_unsigned(NFSV3FSINFO_LINK | NFSV3FSINFO_SYMLINK | NFSV3FSINFO_HOMOGENEOUS | NFSV3FSINFO_CANSETTIME); - nfsm_srvdone; +nfsmout: + return (error); } /* @@ -3458,51 +4251,56 @@ int nfsrv_pathconf(nfsd, slp, procp, mrq) struct nfsrv_descript *nfsd; struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; + proc_t procp; + mbuf_t *mrq; { - struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md; - struct mbuf *nam = nfsd->nd_nam; + mbuf_t mrep = nfsd->nd_mrep, md = nfsd->nd_md; + mbuf_t nam = nfsd->nd_nam; caddr_t dpos = nfsd->nd_dpos; - struct ucred *cred = &nfsd->nd_cr; - register u_long *tl; - register struct nfsv3_pathconf *pc; - register long t1; + u_long *tl; + struct nfsv3_pathconf *pc; + long t1; caddr_t bpos; - int error = 0, rdonly, cache, getret = 1, linkmax, namemax; + int error = 0, getret = 1, linkmax, namemax; int chownres, notrunc, case_sensitive, case_preserving; char *cp2; - struct mbuf *mb, *mb2, *mreq; - struct vnode *vp; - struct vattr at; - nfsfh_t nfh; - fhandle_t *fhp; - u_quad_t frev; + mbuf_t mb, mb2, mreq; + vnode_t vp; + struct vnode_attr at; + struct nfs_filehandle nfh; + struct nfs_export *nx; + struct nfs_export_options *nxo; + struct vfs_context context; -#ifndef nolint - cache = 0; -#endif - fhp = &nfh.fh_generic; - nfsm_srvmtofh(fhp); - if ((error = nfsrv_fhtovp(fhp, 1, &vp, cred, slp, nam, - &rdonly, (nfsd->nd_flag & ND_KERBAUTH), TRUE))) { + nfsm_srvmtofh(&nfh); + if ((error = nfsrv_fhtovp(&nfh, nam, TRUE, &vp, &nx, &nxo))) { + nfsm_reply(NFSX_UNSIGNED); + nfsm_srvpostop_attr(getret, &at); + return (0); + } + if ((error = nfsrv_credcheck(nfsd, nx, nxo))) { + vnode_put(vp); nfsm_reply(NFSX_UNSIGNED); nfsm_srvpostop_attr(getret, &at); return (0); } - error = VOP_PATHCONF(vp, _PC_LINK_MAX, &linkmax); + context.vc_proc = procp; + context.vc_ucred = nfsd->nd_cr; + + error = VNOP_PATHCONF(vp, _PC_LINK_MAX, &linkmax, &context); if (!error) - error = VOP_PATHCONF(vp, _PC_NAME_MAX, &namemax); + error = VNOP_PATHCONF(vp, _PC_NAME_MAX, &namemax, &context); if (!error) - error = 
VOP_PATHCONF(vp, _PC_CHOWN_RESTRICTED, &chownres); + error = VNOP_PATHCONF(vp, _PC_CHOWN_RESTRICTED, &chownres, &context); if (!error) - error = VOP_PATHCONF(vp, _PC_NO_TRUNC, ¬runc); + error = VNOP_PATHCONF(vp, _PC_NO_TRUNC, ¬runc, &context); if (!error) - error = VOP_PATHCONF(vp, _PC_CASE_SENSITIVE, &case_sensitive); + error = VNOP_PATHCONF(vp, _PC_CASE_SENSITIVE, &case_sensitive, &context); if (!error) - error = VOP_PATHCONF(vp, _PC_CASE_PRESERVING, &case_preserving); - getret = VOP_GETATTR(vp, &at, cred, procp); - vput(vp); + error = VNOP_PATHCONF(vp, _PC_CASE_PRESERVING, &case_preserving, &context); + nfsm_srv_vattr_init(&at, 1); + getret = vnode_getattr(vp, &at, &context); + vnode_put(vp); nfsm_reply(NFSX_V3POSTOPATTR + NFSX_V3PATHCONF); nfsm_srvpostop_attr(getret, &at); if (error) @@ -3516,7 +4314,8 @@ nfsrv_pathconf(nfsd, slp, procp, mrq) pc->pc_caseinsensitive = txdr_unsigned(!case_sensitive); pc->pc_casepreserving = txdr_unsigned(case_preserving); - nfsm_srvdone; +nfsmout: + return (error); } /* @@ -3524,22 +4323,19 @@ nfsrv_pathconf(nfsd, slp, procp, mrq) */ /* ARGSUSED */ int -nfsrv_null(nfsd, slp, procp, mrq) - struct nfsrv_descript *nfsd; - struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; +nfsrv_null( + struct nfsrv_descript *nfsd, + struct nfssvc_sock *slp, + __unused proc_t procp, + mbuf_t *mrq) { - struct mbuf *mrep = nfsd->nd_mrep; + mbuf_t mrep = nfsd->nd_mrep; caddr_t bpos; - int error = NFSERR_RETVOID, cache; - struct mbuf *mb, *mreq; - u_quad_t frev; + int error = NFSERR_RETVOID; + mbuf_t mb, mreq; -#ifndef nolint - cache = 0; -#endif nfsm_reply(0); +nfsmout: return (0); } @@ -3548,83 +4344,79 @@ nfsrv_null(nfsd, slp, procp, mrq) */ /* ARGSUSED */ int -nfsrv_noop(nfsd, slp, procp, mrq) - struct nfsrv_descript *nfsd; - struct nfssvc_sock *slp; - struct proc *procp; - struct mbuf **mrq; +nfsrv_noop( + struct nfsrv_descript *nfsd, + struct nfssvc_sock *slp, + __unused proc_t procp, + mbuf_t *mrq) { - struct mbuf *mrep = nfsd->nd_mrep; + mbuf_t mrep = nfsd->nd_mrep; caddr_t bpos; - int error, cache; - struct mbuf *mb, *mreq; - u_quad_t frev; + int error; + mbuf_t mb, mreq; -#ifndef nolint - cache = 0; -#endif if (nfsd->nd_repstat) error = nfsd->nd_repstat; else error = EPROCUNAVAIL; nfsm_reply(0); +nfsmout: return (0); } /* * Perform access checking for vnodes obtained from file handles that would * refer to files already opened by a Unix client. You cannot just use - * vn_writechk() and VOP_ACCESS() for two reasons. + * vnode_authorize() for two reasons. * 1 - You must check for exported rdonly as well as MNT_RDONLY for the write case * 2 - The owner is to be given access irrespective of mode bits so that * processes that chmod after opening a file don't break. I don't like * this because it opens a security hole, but since the nfs server opens * a security hole the size of a barn door anyhow, what the heck. - - * The exception to rule 2 is EPERM. If a file is IMMUTABLE, VOP_ACCESS() + * + * The exception to rule 2 is EPERM. If a file is IMMUTABLE, vnode_authorize() * will return EPERM instead of EACCESS. EPERM is always an error. 
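(The rules above are implemented by nfsrv_authorize() just below. For reference, the owner-override logic reduces to the following user-space sketch, with POSIX access()/stat() standing in for the kernel's vnode_authorize()/vnode_getattr(); the helper name is made up and the sketch is illustrative only.)

    #include <errno.h>
    #include <sys/stat.h>
    #include <unistd.h>

    static int
    access_with_owner_override(const char *path, int mode, uid_t req_uid)
    {
    	struct stat st;

    	if (access(path, mode) == 0)
    		return (0);
    	if (errno != EACCES)	/* EPERM (e.g. immutable file) stays fatal */
    		return (errno);
    	if (stat(path, &st) == 0 && st.st_uid == req_uid)
    		return (0);	/* rule 2: the owner gets access anyway */
    	return (EACCES);
    }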
*/ static int -nfsrv_access(vp, flags, cred, rdonly, p, override) - register struct vnode *vp; - int flags; - register struct ucred *cred; - int rdonly; - struct proc *p; - int override; +nfsrv_authorize( + vnode_t vp, + vnode_t dvp, + kauth_action_t action, + vfs_context_t context, + struct nfs_export_options *nxo, + int override) { - struct vattr vattr; + struct vnode_attr vattr; int error; - if (flags & VWRITE) { - /* Just vn_writechk() changed to check rdonly */ + + if (action & KAUTH_VNODE_WRITE_RIGHTS) { /* - * Disallow write attempts on read-only file systems; + * Disallow write attempts on read-only exports; * unless the file is a socket or a block or character * device resident on the file system. */ - if (rdonly || (vp->v_mount->mnt_flag & MNT_RDONLY)) { - switch (vp->v_type) { + if (nxo->nxo_flags & NX_READONLY) { + switch (vnode_vtype(vp)) { case VREG: case VDIR: case VLNK: case VCPLX: return (EROFS); + default: + break; } } - /* - * If there's shared text associated with - * the inode, we can't allow writing. - */ - if (vp->v_flag & VTEXT) - return (ETXTBSY); } - if ((error = VOP_GETATTR(vp, &vattr, cred, p))) - return (error); - error = VOP_ACCESS(vp, flags, cred, p); + error = vnode_authorize(vp, dvp, action, context); /* * Allow certain operations for the owner (reads and writes * on files that are already open). Picking up from FreeBSD. */ - if (override && error == EACCES && cred->cr_uid == vattr.va_uid) - error = 0; + if (override && (error == EACCES)) { + VATTR_INIT(&vattr); + VATTR_WANTED(&vattr, va_uid); + if ((vnode_getattr(vp, &vattr, context) == 0) && + (kauth_cred_getuid(vfs_context_ucred(context)) == vattr.va_uid)) + error = 0; + } return error; } #endif /* NFS_NOSERVER */ diff --git a/bsd/nfs/nfs_socket.c b/bsd/nfs/nfs_socket.c index ff2f55066..3f36830fa 100644 --- a/bsd/nfs/nfs_socket.c +++ b/bsd/nfs/nfs_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -66,18 +66,19 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> -#include <sys/mount.h> +#include <sys/kauth.h> +#include <sys/mount_internal.h> #include <sys/kernel.h> -#include <sys/mbuf.h> +#include <sys/kpi_mbuf.h> #include <sys/malloc.h> #include <sys/vnode.h> #include <sys/domain.h> #include <sys/protosw.h> #include <sys/socket.h> -#include <sys/socketvar.h> #include <sys/syslog.h> #include <sys/tprintf.h> -#include <machine/spl.h> +#include <sys/uio_internal.h> +#include <libkern/OSAtomic.h> #include <sys/time.h> #include <kern/clock.h> @@ -96,7 +97,6 @@ #include <nfs/nfsmount.h> #include <nfs/nfsnode.h> #include <nfs/nfsrtt.h> -#include <nfs/nqnfs.h> #include <sys/kdebug.h> @@ -110,9 +110,6 @@ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \ (int)(B), (int)(C), (int)(D), (int)(E), 0) -#define TRUE 1 -#define FALSE 0 - /* * Estimate rto for an nfs rpc sent via. an unreliable datagram. 
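(The estimator this comment describes is the standard smoothed-RTT/mean-deviation pair. A self-contained sketch using the classic 1/8 and 1/4 gains and the srtt + 4*sdrtt rule; the kernel keeps srtt/sdrtt per RPC class in fixed point, so the exact NFS_RTO scaling differs.)

    #include <stdio.h>

    struct rtt_est {
    	double srtt;	/* smoothed round-trip time */
    	double sdrtt;	/* smoothed mean deviation */
    };

    static double
    rtt_update(struct rtt_est *e, double sample)
    {
    	double err = sample - e->srtt;

    	e->srtt += err / 8.0;			/* gain 1/8 */
    	if (err < 0)
    		err = -err;
    	e->sdrtt += (err - e->sdrtt) / 4.0;	/* gain 1/4 */
    	return (e->srtt + 4.0 * e->sdrtt);	/* the rto estimate */
    }

    int
    main(void)
    {
    	struct rtt_est e = { 0.100, 0.050 };

    	printf("rto = %.3f\n", rtt_update(&e, 0.250));
    	return (0);
    }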
* Use the mean and mean deviation of rtt for the appropriate type of rpc @@ -138,8 +135,7 @@ extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr, rpc_auth_kerb; -extern u_long nfs_prog, nqnfs_prog; -extern time_t nqnfsstarttime; +extern u_long nfs_prog; extern struct nfsstats nfsstats; extern int nfsv3_procid[NFS_NPROCS]; extern int nfs_ticks; @@ -154,8 +150,7 @@ extern u_long nfs_xidwrap; * 4 - write */ static int proct[NFS_NPROCS] = { - 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, - 0, 0, 0, + 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0 }; /* @@ -177,27 +172,23 @@ static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, }; int nfsrtton = 0; struct nfsrtt nfsrtt; -static int nfs_msg __P((struct proc *, const char *, const char *, int)); -static int nfs_rcvlock __P((struct nfsreq *)); -static void nfs_rcvunlock __P((struct nfsreq *)); -static int nfs_receive __P((struct nfsreq *rep, struct mbuf **aname, - struct mbuf **mp)); -static int nfs_reconnect __P((struct nfsreq *rep)); -static void nfs_repbusy(struct nfsreq *rep); -static struct nfsreq * nfs_repnext(struct nfsreq *rep); +static int nfs_rcvlock(struct nfsreq *); +static void nfs_rcvunlock(struct nfsreq *); +static int nfs_receive(struct nfsreq *rep, mbuf_t *mp); +static int nfs_reconnect(struct nfsreq *rep); static void nfs_repdequeue(struct nfsreq *rep); /* XXX */ boolean_t current_thread_aborted(void); -kern_return_t thread_terminate(thread_act_t); +kern_return_t thread_terminate(thread_t); #ifndef NFS_NOSERVER -static int nfsrv_getstream __P((struct nfssvc_sock *,int)); +static int nfsrv_getstream(struct nfssvc_sock *,int); -int (*nfsrv3_procs[NFS_NPROCS]) __P((struct nfsrv_descript *nd, +int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *nd, struct nfssvc_sock *slp, - struct proc *procp, - struct mbuf **mreqp)) = { + proc_t procp, + mbuf_t *mreqp) = { nfsrv_null, nfsrv_getattr, nfsrv_setattr, @@ -220,137 +211,10 @@ int (*nfsrv3_procs[NFS_NPROCS]) __P((struct nfsrv_descript *nd, nfsrv_fsinfo, nfsrv_pathconf, nfsrv_commit, - nqnfsrv_getlease, - nqnfsrv_vacated, - nfsrv_noop, nfsrv_noop }; #endif /* NFS_NOSERVER */ -/* - * NFSTRACE points were changed to FSDBG (KERNEL_DEBUG) - * But some of this code may prove useful someday... - */ -#undef NFSDIAG -#if NFSDIAG -int nfstraceindx = 0; -struct nfstracerec nfstracebuf[NFSTBUFSIZ] = {{0,0,0,0}}; - -#define NFSTRACESUSPENDERS -#ifdef NFSTRACESUSPENDERS -uint nfstracemask = 0xfff00200; -int nfstracexid = -1; -uint onfstracemask = 0; -int nfstracesuspend = -1; -#define NFSTRACE_SUSPEND \ - { \ - if (nfstracemask) { \ - onfstracemask = nfstracemask; \ - nfstracemask = 0; \ - } \ - } -#define NFSTRACE_RESUME \ - { \ - nfstracesuspend = -1; \ - if (!nfstracemask) \ - nfstracemask = onfstracemask; \ - } -#define NFSTRACE_STARTSUSPENDCOUNTDOWN \ - { \ - nfstracesuspend = (nfstraceindx+100) % NFSTBUFSIZ; \ - } -#define NFSTRACE_SUSPENDING (nfstracesuspend != -1) -#define NFSTRACE_SUSPENSEOVER \ - (nfstracesuspend > 100 ? 
\ - (nfstraceindx >= nfstracesuspend || \ - nfstraceindx < nfstracesuspend - 100) : \ - (nfstraceindx >= nfstracesuspend && \ - nfstraceindx < nfstracesuspend + 8192 - 100)) -#else -uint nfstracemask = 0; -#endif /* NFSTRACESUSPENDERS */ - -int nfsprnttimo = 1; - -int nfsodata[1024]; -int nfsoprocnum, nfsolen; -int nfsbt[32], nfsbtlen; - -#if defined(__ppc__) -int -backtrace(int *where, int size) -{ - int register sp, *fp, numsaved; - - __asm__ volatile("mr %0,r1" : "=r" (sp)); - - fp = (int *)*((int *)sp); - size /= sizeof(int); - for (numsaved = 0; numsaved < size; numsaved++) { - *where++ = fp[2]; - if ((int)fp <= 0) - break; - fp = (int *)*fp; - } - return (numsaved); -} -#elif defined(__i386__) -int -backtrace() -{ - return (0); /* Till someone implements a real routine */ -} -#else -#error architecture not implemented. -#endif - -void -nfsdup(struct nfsreq *rep) -{ - int *ip, i, first = 1, end; - char *s, b[240]; - struct mbuf *mb; - - if ((nfs_debug & NFS_DEBUG_DUP) == 0) - return; - /* last mbuf in chain will be nfs content */ - for (mb = rep->r_mreq; mb->m_next; mb = mb->m_next) - ; - if (rep->r_procnum == nfsoprocnum && mb->m_len == nfsolen && - !bcmp((caddr_t)nfsodata, mb->m_data, nfsolen)) { - s = b + sprintf(b, "nfsdup x=%x p=%d h=", rep->r_xid, - rep->r_procnum); - end = (int)(VTONFS(rep->r_vp)->n_fhp); - ip = (int *)(end & ~3); - end += VTONFS(rep->r_vp)->n_fhsize; - while ((int)ip < end) { - i = *ip++; - if (first) { /* avoid leading zeroes */ - if (i == 0) - continue; - first = 0; - s += sprintf(s, "%x", i); - } else - s += sprintf(s, "%08x", i); - } - if (first) - sprintf(s, "%x", 0); - else /* eliminate trailing zeroes */ - while (*--s == '0') - *s = 0; - /* - * set a breakpoint here and you can view the - * current backtrace and the one saved in nfsbt - */ - kprintf("%s\n", b); - } - nfsoprocnum = rep->r_procnum; - nfsolen = mb->m_len; - bcopy(mb->m_data, (caddr_t)nfsodata, mb->m_len); - nfsbtlen = backtrace(&nfsbt, sizeof(nfsbt)); -} -#endif /* NFSDIAG */ - /* * attempt to bind a socket to a reserved port @@ -358,7 +222,7 @@ nfsdup(struct nfsreq *rep) static int nfs_bind_resv(struct nfsmount *nmp) { - struct socket *so = nmp->nm_so; + socket_t so = nmp->nm_so; struct sockaddr_in sin; int error; u_short tport; @@ -372,7 +236,7 @@ nfs_bind_resv(struct nfsmount *nmp) tport = IPPORT_RESERVED - 1; sin.sin_port = htons(tport); - while (((error = sobind(so, (struct sockaddr *) &sin)) == EADDRINUSE) && + while (((error = sock_bind(so, (struct sockaddr *) &sin)) == EADDRINUSE) && (--tport > IPPORT_RESERVED / 2)) sin.sin_port = htons(tport); return (error); @@ -385,7 +249,10 @@ int nfs_resv_mounts = 0; static int nfs_bind_resv_thread_state = 0; #define NFS_BIND_RESV_THREAD_STATE_INITTED 1 #define NFS_BIND_RESV_THREAD_STATE_RUNNING 2 -static struct slock nfs_bind_resv_slock; +lck_grp_t *nfs_bind_resv_lck_grp; +lck_grp_attr_t *nfs_bind_resv_lck_grp_attr; +lck_attr_t *nfs_bind_resv_lck_attr; +lck_mtx_t *nfs_bind_resv_mutex; struct nfs_bind_resv_request { TAILQ_ENTRY(nfs_bind_resv_request) brr_chain; struct nfsmount *brr_nmp; @@ -400,28 +267,25 @@ static void nfs_bind_resv_thread(void) { struct nfs_bind_resv_request *brreq; - boolean_t funnel_state; - funnel_state = thread_funnel_set(network_flock, TRUE); nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING; while (nfs_resv_mounts > 0) { - simple_lock(&nfs_bind_resv_slock); + lck_mtx_lock(nfs_bind_resv_mutex); while ((brreq = TAILQ_FIRST(&nfs_bind_resv_request_queue))) { TAILQ_REMOVE(&nfs_bind_resv_request_queue, brreq, 
brr_chain); - simple_unlock(&nfs_bind_resv_slock); + lck_mtx_unlock(nfs_bind_resv_mutex); brreq->brr_error = nfs_bind_resv(brreq->brr_nmp); wakeup(brreq); - simple_lock(&nfs_bind_resv_slock); + lck_mtx_lock(nfs_bind_resv_mutex); } - simple_unlock(&nfs_bind_resv_slock); - (void)tsleep((caddr_t)&nfs_bind_resv_request_queue, PSOCK, + msleep((caddr_t)&nfs_bind_resv_request_queue, + nfs_bind_resv_mutex, PSOCK | PDROP, "nfs_bind_resv_request_queue", 0); } nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED; - (void) thread_funnel_set(network_flock, funnel_state); - (void) thread_terminate(current_act()); + (void) thread_terminate(current_thread()); } int @@ -445,7 +309,11 @@ nfs_bind_resv_nopriv(struct nfsmount *nmp) if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING) { if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_INITTED) { - simple_lock_init(&nfs_bind_resv_slock); + nfs_bind_resv_lck_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(nfs_bind_resv_lck_grp_attr); + nfs_bind_resv_lck_grp = lck_grp_alloc_init("nfs_bind_resv", nfs_bind_resv_lck_grp_attr); + nfs_bind_resv_lck_attr = lck_attr_alloc_init(); + nfs_bind_resv_mutex = lck_mtx_alloc_init(nfs_bind_resv_lck_grp, nfs_bind_resv_lck_attr); TAILQ_INIT(&nfs_bind_resv_request_queue); nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED; } @@ -456,9 +324,9 @@ nfs_bind_resv_nopriv(struct nfsmount *nmp) brreq.brr_nmp = nmp; brreq.brr_error = 0; - simple_lock(&nfs_bind_resv_slock); + lck_mtx_lock(nfs_bind_resv_mutex); TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue, &brreq, brr_chain); - simple_unlock(&nfs_bind_resv_slock); + lck_mtx_unlock(nfs_bind_resv_mutex); error = nfs_bind_resv_thread_wake(); if (error) { @@ -467,7 +335,7 @@ nfs_bind_resv_nopriv(struct nfsmount *nmp) return (error); } - (void) tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0); + tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0); return (brreq.brr_error); } @@ -477,30 +345,29 @@ nfs_bind_resv_nopriv(struct nfsmount *nmp) * We do not free the sockaddr if error. */ int -nfs_connect(nmp, rep) - struct nfsmount *nmp; - struct nfsreq *rep; +nfs_connect( + struct nfsmount *nmp, + __unused struct nfsreq *rep) { - struct socket *so; - int s, error, rcvreserve, sndreserve; + socket_t so; + int error, rcvreserve, sndreserve; struct sockaddr *saddr; + struct timeval timeo; - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - nmp->nm_so = (struct socket *)0; - saddr = mtod(nmp->nm_nam, struct sockaddr *); - error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype, - nmp->nm_soproto); + nmp->nm_so = 0; + saddr = mbuf_data(nmp->nm_nam); + error = sock_socket(saddr->sa_family, nmp->nm_sotype, + nmp->nm_soproto, 0, 0, &nmp->nm_so); if (error) { goto bad; } so = nmp->nm_so; - nmp->nm_soflags = so->so_proto->pr_flags; /* * Some servers require that the client port be a reserved port number. */ if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) { - struct proc *p; + proc_t p; /* * sobind() requires current_proc() to have superuser privs. * If this bind is part of a reconnect, and the current proc @@ -508,7 +375,7 @@ nfs_connect(nmp, rep) * a kernel thread to process. 
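(The hand-off above, a request queue guarded by nfs_bind_resv_mutex with msleep()/wakeup() on the queue head, is a plain producer/consumer pattern. A user-space analogue with pthreads; the done flag and names are illustrative, and a condition variable stands in for the kernel's sleep channel.)

    #include <pthread.h>
    #include <sys/queue.h>

    struct req {
    	TAILQ_ENTRY(req) chain;
    	int done, error;
    };
    static TAILQ_HEAD(, req) reqq = TAILQ_HEAD_INITIALIZER(reqq);
    static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;

    /* requester: enqueue, wake the worker, sleep until the result is in */
    static int
    submit(struct req *r)
    {
    	r->done = 0;
    	pthread_mutex_lock(&mtx);
    	TAILQ_INSERT_TAIL(&reqq, r, chain);
    	pthread_cond_broadcast(&cv);
    	while (!r->done)
    		pthread_cond_wait(&cv, &mtx);
    	pthread_mutex_unlock(&mtx);
    	return (r->error);
    }

    /* worker thread: drain the queue, waking requesters as work completes */
    static void *
    worker(void *arg)
    {
    	(void)arg;
    	pthread_mutex_lock(&mtx);
    	for (;;) {
    		struct req *r = TAILQ_FIRST(&reqq);
    		if (r == NULL) {
    			pthread_cond_wait(&cv, &mtx);
    			continue;
    		}
    		TAILQ_REMOVE(&reqq, r, chain);
    		pthread_mutex_unlock(&mtx);
    		r->error = 0;	/* the privileged bind would happen here */
    		pthread_mutex_lock(&mtx);
    		r->done = 1;
    		pthread_cond_broadcast(&cv);
    	}
    	return (NULL);
    }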
*/ if ((nmp->nm_state & NFSSTA_MOUNTED) && - (p = current_proc()) && suser(p->p_ucred, &p->p_acflag)) { + (p = current_proc()) && suser(kauth_cred_get(), 0)) { /* request nfs_bind_resv_thread() to do bind */ error = nfs_bind_resv_nopriv(nmp); } else { @@ -523,51 +390,40 @@ * unconnected for servers that reply from a port other than NFS_PORT. */ if (nmp->nm_flag & NFSMNT_NOCONN) { - if (nmp->nm_soflags & PR_CONNREQUIRED) { + if (nmp->nm_sotype == SOCK_STREAM) { error = ENOTCONN; goto bad; } } else { - error = soconnect(so, mtod(nmp->nm_nam, struct sockaddr *)); - if (error) { + struct timeval tv; + tv.tv_sec = 2; + tv.tv_usec = 0; + error = sock_connect(so, mbuf_data(nmp->nm_nam), MSG_DONTWAIT); + if (error && error != EINPROGRESS) { goto bad; } - - /* - * Wait for the connection to complete. Cribbed from the - * connect system call but with the wait timing out so - * that interruptible mounts don't hang here for a long time. - */ - s = splnet(); - while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { - (void) tsleep((caddr_t)&so->so_timeo, PSOCK, - "nfscon", 2 * hz); - if ((so->so_state & SS_ISCONNECTING) && - so->so_error == 0 && rep && - (error = nfs_sigintr(nmp, rep, rep->r_procp))) { - so->so_state &= ~SS_ISCONNECTING; - splx(s); + + while ((error = sock_connectwait(so, &tv)) == EINPROGRESS) { + if (rep && (error = nfs_sigintr(nmp, rep, rep->r_procp))) { goto bad; } } - if (so->so_error) { - error = so->so_error; - so->so_error = 0; - splx(s); - goto bad; - } - splx(s); } + /* * Always time out on receive, this allows us to reconnect the * socket to deal with network changes. */ - so->so_rcv.sb_timeo = (2 * hz); + timeo.tv_usec = 0; + timeo.tv_sec = 2; + error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) { - so->so_snd.sb_timeo = (5 * hz); + timeo.tv_sec = 5; } else { - so->so_snd.sb_timeo = 0; + timeo.tv_sec = 0; } + error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); + if (nmp->nm_sotype == SOCK_DGRAM) { sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3; rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) * @@ -577,34 +433,18 @@ nfs_connect(nmp, rep) rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) * (nmp->nm_readahead > 0 ? 
nmp->nm_readahead + 1 : 2); } else { + int proto; + int on = 1; + + sock_gettype(so, NULL, NULL, &proto); if (nmp->nm_sotype != SOCK_STREAM) panic("nfscon sotype"); - if (so->so_proto->pr_flags & PR_CONNREQUIRED) { - struct sockopt sopt; - int val; - - bzero(&sopt, sizeof sopt); - sopt.sopt_dir = SOPT_SET; - sopt.sopt_level = SOL_SOCKET; - sopt.sopt_name = SO_KEEPALIVE; - sopt.sopt_val = &val; - sopt.sopt_valsize = sizeof val; - val = 1; - sosetopt(so, &sopt); - } - if (so->so_proto->pr_protocol == IPPROTO_TCP) { - struct sockopt sopt; - int val; - - bzero(&sopt, sizeof sopt); - sopt.sopt_dir = SOPT_SET; - sopt.sopt_level = IPPROTO_TCP; - sopt.sopt_name = TCP_NODELAY; - sopt.sopt_val = &val; - sopt.sopt_valsize = sizeof val; - val = 1; - sosetopt(so, &sopt); + // Assume that SOCK_STREAM always requires a connection + sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)); + + if (proto == IPPROTO_TCP) { + sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); } sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) * 3; @@ -616,14 +456,16 @@ nfs_connect(nmp, rep) sndreserve = NFS_MAXSOCKBUF; if (rcvreserve > NFS_MAXSOCKBUF) rcvreserve = NFS_MAXSOCKBUF; - error = soreserve(so, sndreserve, rcvreserve); + error = sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &sndreserve, sizeof(sndreserve)); + if (error) { + goto bad; + } + error = sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &rcvreserve, sizeof(rcvreserve)); if (error) { goto bad; } - so->so_rcv.sb_flags |= SB_NOINTR; - so->so_snd.sb_flags |= SB_NOINTR; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + sock_nointerrupt(so, 1); /* Initialize other non-zero congestion variables */ nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] = @@ -637,7 +479,6 @@ nfs_connect(nmp, rep) return (0); bad: - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); nfs_disconnect(nmp); return (error); } @@ -652,11 +493,10 @@ bad: * nb: Must be called with the nfs_sndlock() set on the mount point. */ static int -nfs_reconnect(rep) - register struct nfsreq *rep; +nfs_reconnect(struct nfsreq *rep) { - register struct nfsreq *rp; - register struct nfsmount *nmp = rep->r_nmp; + struct nfsreq *rp; + struct nfsmount *nmp = rep->r_nmp; int error; nfs_disconnect(nmp); @@ -665,8 +505,9 @@ nfs_reconnect(rep) return (EINTR); if (error == EIO) return (EIO); - nfs_down(rep, rep->r_nmp, rep->r_procp, "can not connect", - error, NFSSTA_TIMEO); + nfs_down(rep->r_nmp, rep->r_procp, error, NFSSTA_TIMEO, + "can not connect"); + rep->r_flags |= R_TPRINTFMSG; if (!(nmp->nm_state & NFSSTA_MOUNTED)) { /* we're not yet completely mounted and */ /* we can't reconnect, so we fail */ @@ -674,10 +515,9 @@ nfs_reconnect(rep) } if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp))) return (error); - (void) tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0); + tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0); } - NFS_DPF(DUP, ("nfs_reconnect RESEND\n")); /* * Loop through outstanding request list and fix up all requests * on old socket. @@ -693,19 +533,16 @@ nfs_reconnect(rep) * NFS disconnect. Clean up and unlink. 
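(For comparison, the equivalent user-space calls; setsockopt() stands in for the sock_setsockopt() KPI. The 1024-byte header allowance is an assumption in place of NFS_MAXPKTHDR, and real code would also clamp the results to a maximum as done above with NFS_MAXSOCKBUF.)

    #include <sys/socket.h>
    #include <sys/time.h>

    static int
    configure_nfs_socket(int s, int wsize, int rsize, int readahead)
    {
    	struct timeval timeo = { 2, 0 };	/* always time out on receive */
    	int snd = (wsize + 1024) * 3;
    	int rcv = (rsize + 1024) * (readahead > 0 ? readahead + 1 : 2);

    	if (setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)) < 0 ||
    	    setsockopt(s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)) < 0 ||
    	    setsockopt(s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)) < 0)
    		return (-1);
    	return (0);
    }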
*/ void -nfs_disconnect(nmp) - register struct nfsmount *nmp; +nfs_disconnect(struct nfsmount *nmp) { - register struct socket *so; + socket_t so; - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); if (nmp->nm_so) { so = nmp->nm_so; - nmp->nm_so = (struct socket *)0; - soshutdown(so, 2); - soclose(so); + nmp->nm_so = 0; + sock_shutdown(so, 2); + sock_close(so); } - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); } /* @@ -723,65 +560,61 @@ nfs_disconnect(nmp) */ int nfs_send(so, nam, top, rep) - register struct socket *so; - struct mbuf *nam; - register struct mbuf *top; + socket_t so; + mbuf_t nam; + mbuf_t top; struct nfsreq *rep; { struct sockaddr *sendnam; - int error, error2, soflags, flags; - int xidqueued = 0; + int error, error2, sotype, flags; + u_long xidqueued = 0; struct nfsreq *rp; - char savenametolog[MNAMELEN]; + char savenametolog[MAXPATHLEN]; + struct msghdr msg; if (rep) { error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp); if (error) { - m_freem(top); + mbuf_freem(top); return (error); } if ((so = rep->r_nmp->nm_so) == NULL) { rep->r_flags |= R_MUSTRESEND; - m_freem(top); + mbuf_freem(top); return (0); } rep->r_flags &= ~R_MUSTRESEND; - soflags = rep->r_nmp->nm_soflags; TAILQ_FOREACH(rp, &nfs_reqq, r_chain) if (rp == rep) break; if (rp) xidqueued = rp->r_xid; - } else - soflags = so->so_proto->pr_flags; - if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED) || + } + sock_gettype(so, NULL, &sotype, NULL); + if ((sotype == SOCK_STREAM) || (sock_isconnected(so)) || (nam == 0)) sendnam = (struct sockaddr *)0; else - sendnam = mtod(nam, struct sockaddr *); + sendnam = mbuf_data(nam); - if (so->so_type == SOCK_SEQPACKET) + if (sotype == SOCK_SEQPACKET) flags = MSG_EOR; else flags = 0; -#if NFSDIAG - if (rep) - nfsdup(rep); -#endif /* - * Save the name here in case mount point goes away when we switch - * funnels. The name is using local stack and is large, but don't + * Save the name here in case mount point goes away if we block. + * The name is using local stack and is large, but don't * want to block if we malloc. */ if (rep) strncpy(savenametolog, - rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname, - MNAMELEN); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - error = sosend(so, sendnam, (struct uio *)0, top, - (struct mbuf *)0, flags); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname, + MAXPATHLEN - 1); + bzero(&msg, sizeof(msg)); + msg.msg_name = (caddr_t)sendnam; + msg.msg_namelen = sendnam == 0 ? 0 : sendnam->sa_len; + error = sock_sendmbuf(so, &msg, top, flags, NULL); if (error) { if (rep) { @@ -803,8 +636,6 @@ nfs_send(so, nam, top, rep) error = error2; } else { rep->r_flags |= R_MUSTRESEND; - NFS_DPF(DUP, - ("nfs_send RESEND error=%d\n", error)); } } else log(LOG_INFO, "nfsd send error %d\n", error); @@ -830,29 +661,22 @@ nfs_send(so, nam, top, rep) * we have read any of it, even if the system call has been interrupted. 
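(nfs_send() above fills in msg_name only when the socket is unconnected; connected sockets and streams send with no destination. The same decision with the user-space sendmsg() interface; send_rpc is a made-up helper.)

    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    static ssize_t
    send_rpc(int s, void *buf, size_t len, struct sockaddr *to,
        socklen_t tolen, int connected)
    {
    	struct iovec iov;
    	struct msghdr msg;

    	iov.iov_base = buf;
    	iov.iov_len = len;
    	memset(&msg, 0, sizeof(msg));
    	msg.msg_iov = &iov;
    	msg.msg_iovlen = 1;
    	if (!connected && to != NULL) {
    		msg.msg_name = to;	/* unconnected: supply the destination */
    		msg.msg_namelen = tolen;
    	}
    	return (sendmsg(s, &msg, 0));
    }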
*/ static int -nfs_receive(rep, aname, mp) - register struct nfsreq *rep; - struct mbuf **aname; - struct mbuf **mp; +nfs_receive(struct nfsreq *rep, mbuf_t *mp) { - register struct socket *so; - struct uio auio; - struct iovec aio; - register struct mbuf *m; - struct mbuf *control; - u_long len; - struct sockaddr **getnam; - struct sockaddr *tmp_nam; - struct mbuf *mhck; - struct sockaddr_in *sin; - int error, error2, sotype, rcvflg; - struct proc *p = current_proc(); /* XXX */ + socket_t so; + struct iovec_32 aio; + mbuf_t m, mlast; + u_long len, fraglen; + int error, error2, sotype; + proc_t p = current_proc(); /* XXX */ + struct msghdr msg; + size_t rcvlen; + int lastfragment; /* * Set up arguments for soreceive() */ - *mp = (struct mbuf *)0; - *aname = (struct mbuf *)0; + *mp = NULL; sotype = rep->r_nmp->nm_sotype; /* @@ -893,12 +717,11 @@ tryagain: goto tryagain; } while (rep->r_flags & R_MUSTRESEND) { - m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT); - nfsstats.rpcretries++; - NFS_DPF(DUP, - ("nfs_receive RESEND %s\n", - rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname)); - error = nfs_send(so, rep->r_nmp->nm_nam, m, rep); + error = mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_WAITOK, &m); + if (!error) { + OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries); + error = nfs_send(so, rep->r_nmp->nm_nam, m, rep); + } /* * we also hold rcv lock so rep is still * legit this point @@ -914,127 +737,115 @@ tryagain: } nfs_sndunlock(rep); if (sotype == SOCK_STREAM) { - aio.iov_base = (caddr_t) &len; - aio.iov_len = sizeof(u_long); - auio.uio_iov = &aio; - auio.uio_iovcnt = 1; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_READ; - auio.uio_offset = 0; - auio.uio_resid = sizeof(u_long); - auio.uio_procp = p; - do { - rcvflg = MSG_WAITALL; - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - error = soreceive(so, (struct sockaddr **)0, &auio, - (struct mbuf **)0, (struct mbuf **)0, &rcvflg); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - if (!rep->r_nmp) /* if unmounted then bailout */ - goto shutout; - if (error == EWOULDBLOCK && rep) { - error2 = nfs_sigintr(rep->r_nmp, rep, p); - if (error2) - error = error2; - } - } while (error == EWOULDBLOCK); - if (!error && auio.uio_resid > 0) { - log(LOG_INFO, - "short receive (%d/%d) from nfs server %s\n", - sizeof(u_long) - auio.uio_resid, - sizeof(u_long), - rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); - error = EPIPE; - } - if (error) - goto errout; - len = ntohl(len) & ~0x80000000; - /* - * This is SERIOUS! We are out of sync with the sender - * and forcing a disconnect/reconnect is all I can do. 
- */ - if (len > NFS_MAXPACKET) { - log(LOG_ERR, "%s (%d) from nfs server %s\n", - "impossible packet length", - len, - rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); - error = EFBIG; - goto errout; - } - auio.uio_resid = len; - - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - do { - rcvflg = MSG_WAITALL; - error = soreceive(so, (struct sockaddr **)0, - &auio, mp, (struct mbuf **)0, &rcvflg); - if (!rep->r_nmp) /* if unmounted then bailout */ { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - goto shutout; - } - } while (error == EWOULDBLOCK || error == EINTR || - error == ERESTART); - - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + error = 0; + len = 0; + lastfragment = 0; + mlast = NULL; + while (!error && !lastfragment) { + aio.iov_base = (uintptr_t) &fraglen; + aio.iov_len = sizeof(u_long); + bzero(&msg, sizeof(msg)); + msg.msg_iov = (struct iovec *) &aio; + msg.msg_iovlen = 1; + do { + error = sock_receive(so, &msg, MSG_WAITALL, &rcvlen); + if (!rep->r_nmp) /* if unmounted then bailout */ + goto shutout; + if (error == EWOULDBLOCK && rep) { + error2 = nfs_sigintr(rep->r_nmp, rep, p); + if (error2) + error = error2; + } + } while (error == EWOULDBLOCK); + if (!error && rcvlen < aio.iov_len) { + /* only log a message if we got a partial word */ + if (rcvlen != 0) + log(LOG_INFO, + "short receive (%d/%d) from nfs server %s\n", + rcvlen, sizeof(u_long), + vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname); + error = EPIPE; + } + if (error) + goto errout; + lastfragment = ntohl(fraglen) & 0x80000000; + fraglen = ntohl(fraglen) & ~0x80000000; + len += fraglen; + /* + * This is SERIOUS! We are out of sync with the sender + * and forcing a disconnect/reconnect is all I can do. + */ + if (len > NFS_MAXPACKET) { + log(LOG_ERR, "%s (%d) from nfs server %s\n", + "impossible RPC record length", len, + vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname); + error = EFBIG; + goto errout; + } - if (!error && auio.uio_resid > 0) { - log(LOG_INFO, - "short receive (%d/%d) from nfs server %s\n", - len - auio.uio_resid, len, - rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); - error = EPIPE; + m = NULL; + do { + rcvlen = fraglen; + error = sock_receivembuf(so, NULL, &m, MSG_WAITALL, &rcvlen); + if (!rep->r_nmp) /* if unmounted then bailout */ { + goto shutout; + } + } while (error == EWOULDBLOCK || error == EINTR || + error == ERESTART); + + if (!error && fraglen > rcvlen) { + log(LOG_INFO, + "short receive (%d/%d) from nfs server %s\n", + rcvlen, fraglen, + vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname); + error = EPIPE; + mbuf_freem(m); + } + if (!error) { + if (!*mp) { + *mp = m; + mlast = m; + } else { + error = mbuf_setnext(mlast, m); + if (error) { + printf("nfs_receive: mbuf_setnext failed %d\n", error); + mbuf_freem(m); + } + } + while (mbuf_next(mlast)) + mlast = mbuf_next(mlast); + } } } else { - /* - * NB: Since uio_resid is big, MSG_WAITALL is ignored - * and soreceive() will return when it has either a - * control msg or a data msg. - * We have no use for control msg., but must grab them - * and then throw them away so we know what is going - * on. 
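(The loop above implements standard RPC record marking over TCP, per RFC 1831: each fragment carries a 4-byte mark whose high bit flags the last fragment and whose low 31 bits give the fragment length, and an oversized total means the stream is out of sync. A self-contained user-space sketch of the same framing; MAXRECORD and the helper names are illustrative.)

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <unistd.h>

    #define MAXRECORD	(64 * 1024)	/* stands in for NFS_MAXPACKET */

    static int
    readall(int s, void *buf, size_t len)
    {
    	char *p = buf;
    	ssize_t n;

    	while (len > 0) {
    		if ((n = read(s, p, len)) <= 0)
    			return (-1);	/* EOF or error: force a reconnect */
    		p += n;
    		len -= n;
    	}
    	return (0);
    }

    /* Read one (possibly multi-fragment) RPC record; caller frees *out. */
    static int
    read_rpc_record(int s, char **out, size_t *outlen)
    {
    	uint32_t mark;
    	size_t fraglen, total = 0;
    	char *rec = NULL, *tmp;
    	int last = 0;

    	while (!last) {
    		if (readall(s, &mark, sizeof(mark)) < 0)
    			goto fail;
    		mark = ntohl(mark);
    		last = (mark & 0x80000000) != 0;	/* last-fragment bit */
    		fraglen = mark & 0x7fffffff;
    		if (fraglen == 0)
    			continue;
    		if (total + fraglen > MAXRECORD)
    			goto fail;	/* impossible record length: resync */
    		if ((tmp = realloc(rec, total + fraglen)) == NULL)
    			goto fail;
    		rec = tmp;
    		if (readall(s, rec + total, fraglen) < 0)
    			goto fail;
    		total += fraglen;
    	}
    	*out = rec;
    	*outlen = total;
    	return (0);
    fail:
    	free(rec);
    	return (-1);
    }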
- */ - auio.uio_resid = len = 100000000; /* Anything Big */ - auio.uio_procp = p; - - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + bzero(&msg, sizeof(msg)); do { - control = NULL; - rcvflg = 0; - error = soreceive(so, (struct sockaddr **)0, - &auio, mp, &control, &rcvflg); - if (control) - m_freem(control); + rcvlen = 100000000; + error = sock_receivembuf(so, &msg, mp, 0, &rcvlen); if (!rep->r_nmp) /* if unmounted then bailout */ { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); goto shutout; } if (error == EWOULDBLOCK && rep) { error2 = nfs_sigintr(rep->r_nmp, rep, p); if (error2) { - thread_funnel_switch(NETWORK_FUNNEL, - KERNEL_FUNNEL); return (error2); } } - } while (error == EWOULDBLOCK || - (!error && *mp == NULL && control)); - - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + } while (error == EWOULDBLOCK); - if ((rcvflg & MSG_EOR) == 0) + if ((msg.msg_flags & MSG_EOR) == 0) printf("Egad!!\n"); if (!error && *mp == NULL) error = EPIPE; - len -= auio.uio_resid; + len = rcvlen; } errout: if (error && error != EINTR && error != ERESTART) { - m_freem(*mp); - *mp = (struct mbuf *)0; + mbuf_freem(*mp); + *mp = NULL; if (error != EPIPE) log(LOG_INFO, - "receive error %d from nfs server %s\n", - error, - rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + "receive error %d from nfs server %s\n", error, + vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname); error = nfs_sndlock(rep); if (!error) { error = nfs_reconnect(rep); @@ -1060,35 +871,18 @@ errout: return (ENXIO); so = rep->r_nmp->nm_so; } - if (so->so_state & SS_ISCONNECTED) - getnam = (struct sockaddr **)0; - else - getnam = &tmp_nam;; - auio.uio_resid = len = 1000000; - auio.uio_procp = p; - - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + bzero(&msg, sizeof(msg)); + len = 0; do { - rcvflg = 0; - error = soreceive(so, getnam, &auio, mp, - (struct mbuf **)0, &rcvflg); - - if ((getnam) && (*getnam)) { - MGET(mhck, M_WAIT, MT_SONAME); - mhck->m_len = (*getnam)->sa_len; - sin = mtod(mhck, struct sockaddr_in *); - bcopy(*getnam, sin, sizeof(struct sockaddr_in)); - mhck->m_hdr.mh_len = sizeof(struct sockaddr_in); - FREE(*getnam, M_SONAME); - *aname = mhck; - } + rcvlen = 1000000; + error = sock_receivembuf(so, &msg, mp, 0, &rcvlen); if (!rep->r_nmp) /* if unmounted then bailout */ - goto dgramout; + goto shutout; if (error) { error2 = nfs_sigintr(rep->r_nmp, rep, p); if (error2) { error = error2; - goto dgramout; + goto shutout; } } /* Reconnect for all errors. We may be receiving @@ -1099,8 +893,6 @@ errout: * although TCP doesn't seem to. */ if (error) { - thread_funnel_switch(NETWORK_FUNNEL, - KERNEL_FUNNEL); error2 = nfs_sndlock(rep); if (!error2) { error2 = nfs_reconnect(rep); @@ -1114,19 +906,13 @@ errout: } else { error = error2; } - thread_funnel_switch(KERNEL_FUNNEL, - NETWORK_FUNNEL); } } while (error == EWOULDBLOCK); - -dgramout: - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - len -= auio.uio_resid; } shutout: if (error) { - m_freem(*mp); - *mp = (struct mbuf *)0; + mbuf_freem(*mp); + *mp = NULL; } return (error); } @@ -1141,11 +927,10 @@ int nfs_reply(myrep) struct nfsreq *myrep; { - register struct nfsreq *rep; - register struct nfsmount *nmp = myrep->r_nmp; - register long t1; - struct mbuf *mrep, *md; - struct mbuf *nam; + struct nfsreq *rep; + struct nfsmount *nmp = myrep->r_nmp; + long t1; + mbuf_t mrep, md; u_long rxid, *tl; caddr_t dpos, cp2; int error; @@ -1185,15 +970,14 @@ nfs_reply(myrep) * Get the next Rpc reply off the socket. 
Assume myrep->r_nmp * is still intact by checks done in nfs_rcvlock. */ - /* XXX why do we ask for nam here? we don't use it! */ - error = nfs_receive(myrep, &nam, &mrep); - if (nam) - m_freem(nam); + error = nfs_receive(myrep, &mrep); /* * Bailout asap if nfsmount struct gone (unmounted). */ if (!myrep->r_nmp) { FSDBG(530, myrep->r_xid, myrep, nmp, -2); + if (mrep) + mbuf_freem(mrep); return (ENXIO); } if (error) { @@ -1201,25 +985,31 @@ nfs_reply(myrep) nfs_rcvunlock(myrep); /* Bailout asap if nfsmount struct gone (unmounted). */ - if (!myrep->r_nmp) + if (!myrep->r_nmp) { + if (mrep) + mbuf_freem(mrep); return (ENXIO); + } /* * Ignore routing errors on connectionless protocols?? */ - if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) { - if (nmp->nm_so) - nmp->nm_so->so_error = 0; - if (myrep->r_flags & R_GETONEREP) - return (0); + if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) { + if (nmp->nm_so) { + int clearerror; + int optlen = sizeof(clearerror); + sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen); + } continue; } + if (mrep) + mbuf_freem(mrep); return (error); } /* * We assume all is fine, but if we did not have an error - * and mrep is 0, better not dereference it. nfs_receieve + * and mrep is 0, better not dereference it. nfs_receive * calls soreceive which carefully sets error=0 when it got * errors on sbwait (tsleep). In most cases, I assume that's * so we could go back again. In tcp case, EPIPE is returned. @@ -1240,27 +1030,15 @@ nfs_reply(myrep) * Get the xid and check that it is an rpc reply */ md = mrep; - dpos = mtod(md, caddr_t); + dpos = mbuf_data(md); nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED); rxid = *tl++; if (*tl != rpc_reply) { -#ifndef NFS_NOSERVER - if (nmp->nm_flag & NFSMNT_NQNFS) { - if (nqnfs_callback(nmp, mrep, md, dpos)) - nfsstats.rpcinvalid++; - } else { - nfsstats.rpcinvalid++; - m_freem(mrep); - } -#else - nfsstats.rpcinvalid++; - m_freem(mrep); -#endif + OSAddAtomic(1, (SInt32*)&nfsstats.rpcinvalid); + mbuf_freem(mrep); nfsmout: if (nmp->nm_state & NFSSTA_RCVLOCK) nfs_rcvunlock(myrep); - if (myrep->r_flags & R_GETONEREP) - return (0); /* this path used by NQNFS */ continue; } @@ -1291,7 +1069,7 @@ nfsmout: panic("nfs_reply: proct[%d] is zero", rep->r_procnum); rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1]; rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1]; - rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid; + rt->fsid = vfs_statfs(nmp->nm_mountp)->f_fsid; microtime(&rt->tstamp); // XXX unused if (rep->r_flags & R_TIMING) rt->rtt = rep->r_rtt; @@ -1350,8 +1128,8 @@ nfsmout: * If it's mine, get out. */ if (rep == 0) { - nfsstats.rpcunexpected++; - m_freem(mrep); + OSAddAtomic(1, (SInt32*)&nfsstats.rpcunexpected); + mbuf_freem(mrep); } else if (rep == myrep) { if (rep->r_mrep == NULL) panic("nfs_reply: nil r_mrep"); @@ -1359,8 +1137,6 @@ nfsmout: } FSDBG(530, myrep->r_xid, myrep, rep, rep ? 
rep->r_xid : myrep->r_flags); - if (myrep->r_flags & R_GETONEREP) - return (0); /* this path used by NQNFS */ } } @@ -1375,32 +1151,31 @@ nfsmout: * nb: always frees up mreq mbuf list */ int -nfs_request(vp, mrest, procnum, procp, cred, mrp, mdp, dposp, xidp) - struct vnode *vp; - struct mbuf *mrest; +nfs_request(vp, mp, mrest, procnum, procp, cred, mrp, mdp, dposp, xidp) + vnode_t vp; + mount_t mp; + mbuf_t mrest; int procnum; - struct proc *procp; - struct ucred *cred; - struct mbuf **mrp; - struct mbuf **mdp; + proc_t procp; + kauth_cred_t cred; + mbuf_t *mrp; + mbuf_t *mdp; caddr_t *dposp; u_int64_t *xidp; { - register struct mbuf *m, *mrep, *m2; - register struct nfsreq *rep, *rp; - register u_long *tl; - register int i; + mbuf_t m, mrep, m2; + struct nfsreq re, *rep; + u_long *tl; + int i; struct nfsmount *nmp; - struct mbuf *md, *mheadend; - struct nfsnode *np; + mbuf_t md, mheadend; char nickv[RPCX_NICKVERF]; - time_t reqtime, waituntil; + time_t waituntil; caddr_t dpos, cp2; - int t1, nqlflag, cachable, s, error = 0, mrest_len, auth_len, auth_type; - int trylater_delay = NQ_TRYLATERDEL, trylater_cnt = 0, failed_auth = 0; + int t1, error = 0, mrest_len, auth_len, auth_type; + int trylater_delay = NFS_TRYLATERDEL, failed_auth = 0; int verf_len, verf_type; u_long xid; - u_quad_t frev; char *auth_str, *verf_str; NFSKERBKEY_T key; /* save session key */ int nmsotype; @@ -1410,15 +1185,16 @@ nfs_request(vp, mrest, procnum, procp, cred, mrp, mdp, dposp, xidp) *mrp = NULL; if (xidp) *xidp = 0; + nmp = VFSTONFS(mp); - MALLOC_ZONE(rep, struct nfsreq *, - sizeof(struct nfsreq), M_NFSREQ, M_WAITOK); + rep = &re; - nmp = VFSTONFS(vp->v_mount); + if (vp) + nmp = VFSTONFS(vnode_mount(vp)); if (nmp == NULL || (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) == (NFSSTA_FORCE|NFSSTA_TIMEO)) { - FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); + mbuf_freem(mrest); return (ENXIO); } nmsotype = nmp->nm_sotype; @@ -1435,8 +1211,8 @@ nfs_request(vp, mrest, procnum, procp, cred, mrp, mdp, dposp, xidp) i = 0; m = mrest; while (m) { - i += m->m_len; - m = m->m_next; + i += mbuf_len(m); + m = mbuf_next(m); } mrest_len = i; @@ -1444,10 +1220,10 @@ nfs_request(vp, mrest, procnum, procp, cred, mrp, mdp, dposp, xidp) * Get the RPC header with authorization. */ kerbauth: - nmp = VFSTONFS(vp->v_mount); + nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp; if (!nmp) { FSDBG_BOT(531, error, rep->r_xid, nmp, rep); - FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); + mbuf_freem(mrest); return (ENXIO); } verf_str = auth_str = (char *)0; @@ -1458,24 +1234,20 @@ kerbauth: bzero((caddr_t)key, sizeof (key)); if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str, &auth_len, verf_str, verf_len)) { - nmp = VFSTONFS(vp->v_mount); + nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp; if (!nmp) { FSDBG_BOT(531, 2, vp, error, rep); - FREE_ZONE((caddr_t)rep, - sizeof (struct nfsreq), M_NFSREQ); - m_freem(mrest); + mbuf_freem(mrest); return (ENXIO); } error = nfs_getauth(nmp, rep, cred, &auth_str, &auth_len, verf_str, &verf_len, key); - nmp = VFSTONFS(vp->v_mount); + nmp = vp ? 
VFSTONFS(vnode_mount(vp)) : rep->r_nmp; if (!error && !nmp) error = ENXIO; if (error) { FSDBG_BOT(531, 2, vp, error, rep); - FREE_ZONE((caddr_t)rep, - sizeof (struct nfsreq), M_NFSREQ); - m_freem(mrest); + mbuf_freem(mrest); return (error); } } @@ -1487,25 +1259,35 @@ kerbauth: nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) + 5 * NFSX_UNSIGNED; } - m = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len, - auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid); - if (xidp) - *xidp = ntohl(xid) + ((u_int64_t)nfs_xidwrap << 32); + error = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len, + auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid, &m); if (auth_str) _FREE(auth_str, M_TEMP); + if (error) { + mbuf_freem(mrest); + FSDBG_BOT(531, error, rep->r_xid, nmp, rep); + return (error); + } + if (xidp) + *xidp = ntohl(xid) + ((u_int64_t)nfs_xidwrap << 32); /* * For stream protocols, insert a Sun RPC Record Mark. */ if (nmsotype == SOCK_STREAM) { - M_PREPEND(m, NFSX_UNSIGNED, M_WAIT); - *mtod(m, u_long *) = htonl(0x80000000 | - (m->m_pkthdr.len - NFSX_UNSIGNED)); + error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK); + if (error) { + mbuf_freem(m); + FSDBG_BOT(531, error, rep->r_xid, nmp, rep); + return (error); + } + *((u_long*)mbuf_data(m)) = + htonl(0x80000000 | (mbuf_pkthdr_len(m) - NFSX_UNSIGNED)); } rep->r_mreq = m; rep->r_xid = xid; tryagain: - nmp = VFSTONFS(vp->v_mount); + nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp; if (nmp && (nmp->nm_flag & NFSMNT_SOFT)) rep->r_retry = nmp->nm_retry; else @@ -1520,18 +1302,13 @@ tryagain: /* * Do the client side RPC. */ - nfsstats.rpcrequests++; + OSAddAtomic(1, (SInt32*)&nfsstats.rpcrequests); /* * Chain request into list of outstanding requests. Be sure * to put it LAST so timer finds oldest requests first. */ - s = splsoftclock(); TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain); - /* Get send time for nqnfs */ - microtime(&now); - reqtime = now.tv_sec; - /* * If backing off another request or avoiding congestion, don't * send this one now but let timer do it. If not timing a request, @@ -1540,9 +1317,8 @@ tryagain: if (nmp && nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM || (nmp->nm_flag & NFSMNT_DUMBTIMR) || nmp->nm_sent < nmp->nm_cwnd)) { - int connrequired = (nmp->nm_soflags & PR_CONNREQUIRED); + int connrequired = (nmp->nm_sotype == SOCK_STREAM); - splx(s); if (connrequired) error = nfs_sndlock(rep); @@ -1558,19 +1334,19 @@ tryagain: rep->r_flags |= R_SENT; } - m2 = m_copym(m, 0, M_COPYALL, M_WAIT); - error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep); + error = mbuf_copym(m, 0, MBUF_COPYALL, MBUF_WAITOK, &m2); + if (!error) + error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep); if (connrequired) nfs_sndunlock(rep); } - nmp = VFSTONFS(vp->v_mount); + nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp; if (error) { if (nmp) nmp->nm_sent -= NFS_CWNDSCALE; rep->r_flags &= ~R_SENT; } } else { - splx(s); rep->r_rtt = -1; } @@ -1585,7 +1361,7 @@ tryagain: */ nfs_repdequeue(rep); - nmp = VFSTONFS(vp->v_mount); + nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp; /* * Decrement the outstanding request count. @@ -1603,16 +1379,16 @@ tryagain: * tprintf a response. */ if (!error) - nfs_up(rep, nmp, procp, "is alive again", NFSSTA_TIMEO); + nfs_up(nmp, procp, NFSSTA_TIMEO, + (rep->r_flags & R_TPRINTFMSG) ? 
"is alive again" : NULL); mrep = rep->r_mrep; md = rep->r_md; dpos = rep->r_dpos; if (!error && !nmp) error = ENXIO; if (error) { - m_freem(rep->r_mreq); + mbuf_freem(rep->r_mreq); FSDBG_BOT(531, error, rep->r_xid, nmp, rep); - FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); return (error); } @@ -1626,18 +1402,19 @@ tryagain: else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) { if (!failed_auth) { failed_auth++; - mheadend->m_next = (struct mbuf *)0; - m_freem(mrep); - m_freem(rep->r_mreq); - goto kerbauth; + error = mbuf_setnext(mheadend, NULL); + mbuf_freem(mrep); + mbuf_freem(rep->r_mreq); + if (!error) + goto kerbauth; + printf("nfs_request: mbuf_setnext failed\n"); } else error = EAUTH; } else error = EACCES; - m_freem(mrep); - m_freem(rep->r_mreq); + mbuf_freem(mrep); + mbuf_freem(rep->r_mreq); FSDBG_BOT(531, error, rep->r_xid, nmp, rep); - FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); return (error); } @@ -1660,25 +1437,17 @@ tryagain: error = fxdr_unsigned(int, *tl); if ((nmp->nm_flag & NFSMNT_NFSV3) && error == NFSERR_TRYLATER) { - m_freem(mrep); + mbuf_freem(mrep); error = 0; microuptime(&now); waituntil = now.tv_sec + trylater_delay; - NFS_DPF(DUP, - ("nfs_request %s flag=%x trylater_cnt=%x waituntil=%lx trylater_delay=%x\n", - nmp->nm_mountp->mnt_stat.f_mntfromname, - nmp->nm_flag, trylater_cnt, waituntil, - trylater_delay)); while (now.tv_sec < waituntil) { - (void)tsleep((caddr_t)&lbolt, - PSOCK, "nqnfstry", 0); + tsleep((caddr_t)&lbolt, PSOCK, "nfstrylater", 0); microuptime(&now); } trylater_delay *= 2; if (trylater_delay > 60) trylater_delay = 60; - if (trylater_cnt < 7) - trylater_cnt++; goto tryagain; } @@ -1686,7 +1455,7 @@ tryagain: * If the File Handle was stale, invalidate the * lookup cache, just in case. */ - if (error == ESTALE) + if ((error == ESTALE) && vp) cache_purge(vp); if (nmp->nm_flag & NFSMNT_NFSV3) { *mrp = mrep; @@ -1694,49 +1463,26 @@ tryagain: *dposp = dpos; error |= NFSERR_RETERR; } else { - m_freem(mrep); + mbuf_freem(mrep); error &= ~NFSERR_RETERR; } - m_freem(rep->r_mreq); + mbuf_freem(rep->r_mreq); FSDBG_BOT(531, error, rep->r_xid, nmp, rep); - FREE_ZONE((caddr_t)rep, - sizeof (struct nfsreq), M_NFSREQ); return (error); } - /* - * For nqnfs, get any lease in reply - */ - if (nmp->nm_flag & NFSMNT_NQNFS) { - nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); - if (*tl) { - np = VTONFS(vp); - nqlflag = fxdr_unsigned(int, *tl); - nfsm_dissect(tl, u_long *, 4*NFSX_UNSIGNED); - cachable = fxdr_unsigned(int, *tl++); - reqtime += fxdr_unsigned(int, *tl++); - microtime(&now); - if (reqtime > now.tv_sec) { - fxdr_hyper(tl, &frev); - nqnfs_clientlease(nmp, np, nqlflag, - cachable, reqtime, frev); - } - } - } *mrp = mrep; *mdp = md; *dposp = dpos; - m_freem(rep->r_mreq); + mbuf_freem(rep->r_mreq); FSDBG_BOT(531, 0xf0f0f0f0, rep->r_xid, nmp, rep); - FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); return (0); } - m_freem(mrep); + mbuf_freem(mrep); error = EPROTONOSUPPORT; nfsmout: - m_freem(rep->r_mreq); + mbuf_freem(rep->r_mreq); FSDBG_BOT(531, error, rep->r_xid, nmp, rep); - FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); return (error); } @@ -1746,36 +1492,47 @@ nfsmout: * siz arg. 
is used to decide if adding a cluster is worthwhile */ int -nfs_rephead(siz, nd, slp, err, cache, frev, mrq, mbp, bposp) +nfs_rephead(siz, nd, slp, err, mrq, mbp, bposp) int siz; struct nfsrv_descript *nd; struct nfssvc_sock *slp; int err; - int cache; - u_quad_t *frev; - struct mbuf **mrq; - struct mbuf **mbp; + mbuf_t *mrq; + mbuf_t *mbp; caddr_t *bposp; { - register u_long *tl; - register struct mbuf *mreq; + u_long *tl; + mbuf_t mreq; caddr_t bpos; - struct mbuf *mb, *mb2; + mbuf_t mb, mb2; + int error, mlen; - MGETHDR(mreq, M_WAIT, MT_DATA); - mb = mreq; /* * If this is a big reply, use a cluster else * try and leave leading space for the lower level headers. */ siz += RPC_REPLYSIZ; - if (siz >= MINCLSIZE) { - MCLGET(mreq, M_WAIT); - } else - mreq->m_data += max_hdr; - tl = mtod(mreq, u_long *); - mreq->m_len = 6 * NFSX_UNSIGNED; - bpos = ((caddr_t)tl) + mreq->m_len; + if (siz >= nfs_mbuf_minclsize) { + error = mbuf_getpacket(MBUF_WAITOK, &mreq); + } else { + error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mreq); + } + if (error) { + /* unable to allocate packet */ + /* XXX nfsstat? */ + return (error); + } + mb = mreq; + tl = mbuf_data(mreq); + mlen = 6 * NFSX_UNSIGNED; + if (siz < nfs_mbuf_minclsize) { + /* leave space for lower level headers */ + tl += 80/sizeof(*tl); /* XXX max_hdr? XXX */ + mbuf_setdata(mreq, tl, mlen); + } else { + mbuf_setlen(mreq, mlen); + } + bpos = ((caddr_t)tl) + mlen; *tl++ = txdr_unsigned(nd->nd_retxid); *tl++ = rpc_reply; if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) { @@ -1783,7 +1540,8 @@ nfs_rephead(siz, nd, slp, err, cache, frev, mrq, mbp, bposp) if (err & NFSERR_AUTHERR) { *tl++ = rpc_autherr; *tl = txdr_unsigned(err & ~NFSERR_AUTHERR); - mreq->m_len -= NFSX_UNSIGNED; + mlen -= NFSX_UNSIGNED; + mbuf_setlen(mreq, mlen); bpos -= NFSX_UNSIGNED; } else { *tl++ = rpc_mismatch; @@ -1798,12 +1556,14 @@ nfs_rephead(siz, nd, slp, err, cache, frev, mrq, mbp, bposp) * verifier back, otherwise just RPCAUTH_NULL. */ if (nd->nd_flag & ND_KERBFULL) { - register struct nfsuid *nuidp; + struct nfsuid *nuidp; struct timeval ktvin, ktvout; + uid_t uid = kauth_cred_getuid(nd->nd_cr); - for (nuidp = NUIDHASH(slp, nd->nd_cr.cr_uid)->lh_first; + lck_rw_lock_shared(&slp->ns_rwlock); + for (nuidp = NUIDHASH(slp, uid)->lh_first; nuidp != 0; nuidp = nuidp->nu_hash.le_next) { - if (nuidp->nu_cr.cr_uid == nd->nd_cr.cr_uid && + if (kauth_cred_getuid(nuidp->nu_cr) == uid && (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp), &nuidp->nu_haddr, nd->nd_nam2))) break; @@ -1827,11 +1587,12 @@ nfs_rephead(siz, nd, slp, err, cache, frev, mrq, mbp, bposp) *tl = ktvout.tv_sec; nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED); *tl++ = ktvout.tv_usec; - *tl++ = txdr_unsigned(nuidp->nu_cr.cr_uid); + *tl++ = txdr_unsigned(kauth_cred_getuid(nuidp->nu_cr)); } else { *tl++ = 0; *tl++ = 0; } + lck_rw_done(&slp->ns_rwlock); } else { *tl++ = 0; *tl++ = 0; @@ -1843,13 +1604,9 @@ nfs_rephead(siz, nd, slp, err, cache, frev, mrq, mbp, bposp) case EPROGMISMATCH: *tl = txdr_unsigned(RPC_PROGMISMATCH); nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); - if (nd->nd_flag & ND_NQNFS) { - *tl++ = txdr_unsigned(3); - *tl = txdr_unsigned(3); - } else { - *tl++ = txdr_unsigned(2); - *tl = txdr_unsigned(3); - } + // XXX hard coded versions + *tl++ = txdr_unsigned(2); + *tl = txdr_unsigned(3); break; case EPROCUNAVAIL: *tl = txdr_unsigned(RPC_PROCUNAVAIL); @@ -1867,30 +1624,16 @@ nfs_rephead(siz, nd, slp, err, cache, frev, mrq, mbp, bposp) *tl = 0; } break; - }; - } - - /* - * For nqnfs, piggyback lease as requested. 
- */ - if ((nd->nd_flag & ND_NQNFS) && err == 0) { - if (nd->nd_flag & ND_LEASE) { - nfsm_build(tl, u_long *, 5 * NFSX_UNSIGNED); - *tl++ = txdr_unsigned(nd->nd_flag & ND_LEASE); - *tl++ = txdr_unsigned(cache); - *tl++ = txdr_unsigned(nd->nd_duration); - txdr_hyper(frev, tl); - } else { - nfsm_build(tl, u_long *, NFSX_UNSIGNED); - *tl = 0; } } + if (mrq != NULL) *mrq = mreq; *mbp = mb; *bposp = bpos; - if (err != 0 && err != NFSERR_RETVOID) - nfsstats.srvrpc_errs++; + if (err != 0 && err != NFSERR_RETVOID) { + OSAddAtomic(1, (SInt32*)&nfsstats.srvrpc_errs); + } return (0); } @@ -1918,8 +1661,7 @@ nfs_softterm(struct nfsreq *rep) } void -nfs_timer_funnel(arg) - void * arg; +nfs_timer_funnel(void * arg) { (void) thread_funnel_set(kernel_flock, TRUE); nfs_timer(arg); @@ -1930,25 +1672,22 @@ nfs_timer_funnel(arg) /* * Ensure rep isn't in use by the timer, then dequeue it. */ -void +static void nfs_repdequeue(struct nfsreq *rep) { - int s; while ((rep->r_flags & R_BUSY)) { rep->r_flags |= R_WAITING; tsleep(rep, PSOCK, "repdeq", 0); } - s = splsoftclock(); TAILQ_REMOVE(&nfs_reqq, rep, r_chain); - splx(s); } /* * Busy (lock) a nfsreq, used by the nfs timer to make sure it's not * free()'d out from under it. */ -void +static void nfs_repbusy(struct nfsreq *rep) { @@ -1960,7 +1699,7 @@ nfs_repbusy(struct nfsreq *rep) /* * Unbusy the nfsreq passed in, return the next nfsreq in the chain busied. */ -struct nfsreq * +static struct nfsreq * nfs_repnext(struct nfsreq *rep) { struct nfsreq * nextrep; @@ -1991,55 +1730,27 @@ nfs_repnext(struct nfsreq *rep) * sure to set the r_retry field to 0 (implies nm_retry == 0). */ void -nfs_timer(arg) - void *arg; /* never used */ +nfs_timer(__unused void *arg) { - register struct nfsreq *rep; - register struct mbuf *m; - register struct socket *so; - register struct nfsmount *nmp; - register int timeo; - int s, error; + struct nfsreq *rep; + mbuf_t m; + socket_t so; + struct nfsmount *nmp; + int timeo; + int error; #ifndef NFS_NOSERVER - static long lasttime = 0; - register struct nfssvc_sock *slp; + struct nfssvc_sock *slp; u_quad_t cur_usec; #endif /* NFS_NOSERVER */ -#if NFSDIAG - int rttdiag; -#endif int flags, rexmit, cwnd, sent; u_long xid; struct timeval now; - s = splnet(); - /* - * XXX If preemptable threads are implemented the spls used for the - * outstanding request queue must be replaced with mutexes. 
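(nfs_repbusy()/nfs_repnext()/nfs_repdequeue() above are the mutex-friendly discipline the old XXX comment asked for: the timer pins a request with R_BUSY while using it, and a dequeuer sets R_WAITING and sleeps until the timer lets go. A pthreads rendition of the protocol; the kernel uses tsleep()/wakeup() on the request address where this sketch uses a condition variable.)

    #include <pthread.h>

    #define R_BUSY		0x1
    #define R_WAITING	0x2

    struct nreq {
    	int flags;
    };
    static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;

    /* timer side: pin the request so it cannot be freed out from under us */
    static void
    rep_busy(struct nreq *r)
    {
    	pthread_mutex_lock(&mtx);
    	r->flags |= R_BUSY;
    	pthread_mutex_unlock(&mtx);
    }

    static void
    rep_unbusy(struct nreq *r)
    {
    	pthread_mutex_lock(&mtx);
    	r->flags &= ~R_BUSY;
    	if (r->flags & R_WAITING) {
    		r->flags &= ~R_WAITING;
    		pthread_cond_broadcast(&cv);	/* like wakeup(rep) */
    	}
    	pthread_mutex_unlock(&mtx);
    }

    /* dequeue side: wait until the timer is done with the request */
    static void
    rep_dequeue(struct nreq *r)
    {
    	pthread_mutex_lock(&mtx);
    	while (r->flags & R_BUSY) {
    		r->flags |= R_WAITING;
    		pthread_cond_wait(&cv, &mtx);	/* like tsleep(rep, ...) */
    	}
    	/* now safe to remove from the queue and free */
    	pthread_mutex_unlock(&mtx);
    }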
- */ -#ifdef NFSTRACESUSPENDERS - if (NFSTRACE_SUSPENDING) { - TAILQ_FOREACH(rep, &nfs_reqq, r_chain) - if (rep->r_xid == nfstracexid) - break; - if (!rep) { - NFSTRACE_RESUME; - } else if (NFSTRACE_SUSPENSEOVER) { - NFSTRACE_SUSPEND; - } - } -#endif rep = TAILQ_FIRST(&nfs_reqq); if (rep != NULL) nfs_repbusy(rep); microuptime(&now); for ( ; rep != NULL ; rep = nfs_repnext(rep)) { -#ifdef NFSTRACESUSPENDERS - if (rep->r_mrep && !NFSTRACE_SUSPENDING) { - nfstracexid = rep->r_xid; - NFSTRACE_STARTSUSPENDCOUNTDOWN; - } -#endif nmp = rep->r_nmp; if (!nmp) /* unmounted */ continue; @@ -2051,12 +1762,13 @@ nfs_timer(arg) (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) && rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) { rep->r_lastmsg = now.tv_sec; - nfs_down(rep, rep->r_nmp, rep->r_procp, "not responding", - 0, NFSSTA_TIMEO); + nfs_down(rep->r_nmp, rep->r_procp, 0, NFSSTA_TIMEO, + "not responding"); + rep->r_flags |= R_TPRINTFMSG; if (!(nmp->nm_state & NFSSTA_MOUNTED)) { /* we're not yet completely mounted and */ /* we can't complete an RPC, so we fail */ - nfsstats.rpctimeouts++; + OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts); nfs_softterm(rep); continue; } @@ -2083,7 +1795,7 @@ nfs_timer(arg) * and never allow r_rexmit to be more than NFS_MAXREXMIT. */ if (rep->r_rexmit >= rep->r_retry) { /* too many */ - nfsstats.rpctimeouts++; + OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts); nfs_softterm(rep); continue; } @@ -2100,29 +1812,12 @@ nfs_timer(arg) * Resend it * Set r_rtt to -1 in case we fail to send it now. */ -#if NFSDIAG - rttdiag = rep->r_rtt; -#endif rep->r_rtt = -1; - if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len && - ((nmp->nm_flag & NFSMNT_DUMBTIMR) || + if (((nmp->nm_flag & NFSMNT_DUMBTIMR) || (rep->r_flags & R_SENT) || nmp->nm_sent < nmp->nm_cwnd) && - (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){ - - struct proc *p = current_proc(); - -#if NFSDIAG - if (rep->r_flags & R_SENT && nfsprnttimo && - nmp->nm_timeouts >= nfsprnttimo) { - int t = proct[rep->r_procnum]; - if (t) - NFS_DPF(DUP, ("nfs_timer %s nmtm=%d tms=%d rtt=%d tm=%d p=%d A=%d D=%d\n", nmp->nm_mountp->mnt_stat.f_mntfromname, nmp->nm_timeo, nmp->nm_timeouts, rttdiag, timeo, rep->r_procnum, nmp->nm_srtt[t-1], nmp->nm_sdrtt[t-1])); - else - NFS_DPF(DUP, ("nfs_timer %s nmtm=%d tms=%d rtt=%d tm=%d p=%d\n", nmp->nm_mountp->mnt_stat.f_mntfromname, nmp->nm_timeo, nmp->nm_timeouts, rttdiag, timeo, rep->r_procnum)); - } - nfsdup(rep); -#endif /* NFSDIAG */ + (mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_DONTWAIT, &m) == 0)){ + struct msghdr msg; /* * Iff first send, start timing * else turn timing off, backoff timer @@ -2143,61 +1838,61 @@ nfs_timer(arg) nmp->nm_cwnd >>= 1; if (nmp->nm_cwnd < NFS_CWNDSCALE) nmp->nm_cwnd = NFS_CWNDSCALE; - nfsstats.rpcretries++; + OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries); } else { rep->r_flags |= R_SENT; nmp->nm_sent += NFS_CWNDSCALE; } FSDBG(535, xid, rep, nmp->nm_sent, nmp->nm_cwnd); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - - if ((nmp->nm_flag & NFSMNT_NOCONN) == 0) - error = (*so->so_proto->pr_usrreqs->pru_send) - (so, 0, m, 0, 0, p); - else - error = (*so->so_proto->pr_usrreqs->pru_send) - (so, 0, m, mtod(nmp->nm_nam, struct sockaddr *), 0, p); - - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + bzero(&msg, sizeof(msg)); + if ((nmp->nm_flag & NFSMNT_NOCONN) == NFSMNT_NOCONN) { + msg.msg_name = mbuf_data(nmp->nm_nam); + msg.msg_namelen = mbuf_len(nmp->nm_nam); + } + error = sock_sendmbuf(so, &msg, m, MSG_DONTWAIT, NULL); FSDBG(535, xid, 
error, sent, cwnd); if (error) { - if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) - so->so_error = 0; - rep->r_flags = flags | R_RESENDERR; - rep->r_rexmit = rexmit; - nmp->nm_cwnd = cwnd; - nmp->nm_sent = sent; - if (flags & R_SENT) - nfsstats.rpcretries--; + if (error == EWOULDBLOCK) { + rep->r_flags = flags; + rep->r_rexmit = rexmit; + nmp->nm_cwnd = cwnd; + nmp->nm_sent = sent; + rep->r_xid = xid; + } + else { + if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) { + int clearerror; + int optlen = sizeof(clearerror); + sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen); + } + rep->r_flags = flags | R_RESENDERR; + rep->r_rexmit = rexmit; + nmp->nm_cwnd = cwnd; + nmp->nm_sent = sent; + if (flags & R_SENT) + OSAddAtomic(-1, (SInt32*)&nfsstats.rpcretries); + } } else rep->r_rtt = 0; } } microuptime(&now); #ifndef NFS_NOSERVER - /* - * Call the nqnfs server timer once a second to handle leases. - */ - if (lasttime != now.tv_sec) { - lasttime = now.tv_sec; - nqnfs_serverd(); - } - /* * Scan the write gathering queues for writes that need to be * completed now. */ cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec; + lck_mtx_lock(nfsd_mutex); TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) { - if (LIST_FIRST(&slp->ns_tq) && - LIST_FIRST(&slp->ns_tq)->nd_time <= cur_usec) + if (slp->ns_wgtime && (slp->ns_wgtime <= cur_usec)) nfsrv_wakenfsd(slp); } + lck_mtx_unlock(nfsd_mutex); #endif /* NFS_NOSERVER */ - splx(s); if (nfsbuffreeuptimestamp + 30 <= now.tv_sec) { /* @@ -2224,12 +1919,12 @@ int nfs_sigintr(nmp, rep, p) struct nfsmount *nmp; struct nfsreq *rep; - struct proc *p; + proc_t p; { - struct uthread *curr_td; sigset_t pending_sigs; int context_good = 0; struct nfsmount *repnmp; + extern proc_t kernproc; if (nmp == NULL) return (ENXIO); @@ -2249,7 +1944,7 @@ nfs_sigintr(nmp, rep, p) (NFSSTA_FORCE|NFSSTA_TIMEO)) return (EIO); /* Someone is unmounting us, go soft and mark it. */ - if ((repnmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT)) { + if (repnmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) { repnmp->nm_flag |= NFSMNT_SOFT; nmp->nm_state |= NFSSTA_FORCE; } @@ -2257,7 +1952,7 @@ nfs_sigintr(nmp, rep, p) * If the mount is hung and we've requested not to hang * on remote filesystems, then bail now. */ - if (p != NULL && (p->p_flag & P_NOREMOTEHANG) != 0 && + if (p != NULL && (proc_noremotehang(p)) != 0 && (repnmp->nm_state & NFSSTA_TIMEO) != 0) return (EIO); } @@ -2265,30 +1960,13 @@ nfs_sigintr(nmp, rep, p) if (p == NULL) return (0); - /* - * XXX: Since nfs doesn't have a good shot at getting the current - * thread we take a guess. (only struct proc * are passed to VOPs) - * What we do is look at the current thread, if it belongs to the - * passed in proc pointer then we have a "good/accurate" context - * and can make an accurate guess as to what to do. - * However if we have a bad context we have to make due with what - * is in the proc struct which may not be as up to date as we'd - * like. - * This is ok because the process will call us with the correct - * context after a short timeout while waiting for a response. - */ - curr_td = (struct uthread *)get_bsdthread_info(current_act()); - if (curr_td->uu_proc == p) - context_good = 1; - if (context_good && current_thread_aborted()) + /* Is this thread belongs to kernel task; then abort check is not needed */ + if ((current_proc() != kernproc) && current_thread_aborted()) { return (EINTR); + } /* mask off thread and process blocked signals. 
*/ - if (context_good) - pending_sigs = curr_td->uu_siglist & ~curr_td->uu_sigmask; - else - pending_sigs = p->p_siglist; - /* mask off process level and NFS ignored signals. */ - pending_sigs &= ~p->p_sigignore & NFSINT_SIGMASK; + + pending_sigs = proc_pendingsignals(p, NFSINT_SIGMASK); if (pending_sigs && (nmp->nm_flag & NFSMNT_INT) != 0) return (EINTR); return (0); @@ -2304,8 +1982,8 @@ int nfs_sndlock(rep) struct nfsreq *rep; { - register int *statep; - struct proc *p; + int *statep; + proc_t p; int error, slpflag = 0, slptimeo = 0; if (rep->r_nmp == NULL) @@ -2320,10 +1998,9 @@ nfs_sndlock(rep) if (error) return (error); *statep |= NFSSTA_WANTSND; - if (p != NULL && (p->p_flag & P_NOREMOTEHANG) != 0) + if (p != NULL && (proc_noremotehang(p)) != 0) slptimeo = hz; - (void) tsleep((caddr_t)statep, slpflag | (PZERO - 1), - "nfsndlck", slptimeo); + tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsndlck", slptimeo); if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; @@ -2346,7 +2023,7 @@ void nfs_sndunlock(rep) struct nfsreq *rep; { - register int *statep; + int *statep; if (rep->r_nmp == NULL) return; @@ -2361,10 +2038,9 @@ nfs_sndunlock(rep) } static int -nfs_rcvlock(rep) - register struct nfsreq *rep; +nfs_rcvlock(struct nfsreq *rep) { - register int *statep; + int *statep; int error, slpflag, slptimeo = 0; /* make sure we still have our mountpoint */ @@ -2398,10 +2074,9 @@ nfs_rcvlock(rep) * call nfs_sigintr periodically above. */ if (rep->r_procp != NULL && - (rep->r_procp->p_flag & P_NOREMOTEHANG) != 0) + (proc_noremotehang(rep->r_procp)) != 0) slptimeo = hz; - (void) tsleep((caddr_t)statep, slpflag | (PZERO - 1), - "nfsrcvlk", slptimeo); + tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo); if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; @@ -2417,7 +2092,7 @@ nfs_rcvlock(rep) } /* * nfs_reply will handle it if reply already arrived. - * (We may have slept or been preempted while on network funnel). + * (We may have slept or been preempted). */ FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, *statep); *statep |= NFSSTA_RCVLOCK; @@ -2428,10 +2103,9 @@ nfs_rcvlock(rep) * Unlock the stream socket for others. */ static void -nfs_rcvunlock(rep) - register struct nfsreq *rep; +nfs_rcvunlock(struct nfsreq *rep) { - register int *statep; + int *statep; if (rep->r_nmp == NULL) return; @@ -2453,71 +2127,77 @@ nfs_rcvunlock(rep) * Socket upcall routine for the nfsd sockets. * The caddr_t arg is a pointer to the "struct nfssvc_sock". * Essentially do as much as possible non-blocking, else punt and it will - * be called with M_WAIT from an nfsd. - */ - /* - * Needs to run under network funnel + * be called with MBUF_WAITOK from an nfsd. 
*/ void -nfsrv_rcv(so, arg, waitflag) - struct socket *so; - caddr_t arg; - int waitflag; +nfsrv_rcv(socket_t so, caddr_t arg, int waitflag) { - register struct nfssvc_sock *slp = (struct nfssvc_sock *)arg; - register struct mbuf *m; - struct mbuf *mp, *mhck; - struct sockaddr *nam; - struct uio auio; - int flags, ns_nflag=0, error; - struct sockaddr_in *sin; + struct nfssvc_sock *slp = (struct nfssvc_sock *)arg; - if ((slp->ns_flag & SLP_VALID) == 0) + if (!nfs_numnfsd || !(slp->ns_flag & SLP_VALID)) return; + + lck_rw_lock_exclusive(&slp->ns_rwlock); + nfsrv_rcv_locked(so, slp, waitflag); + /* Note: ns_rwlock gets dropped when called with MBUF_DONTWAIT */ +} +void +nfsrv_rcv_locked(socket_t so, struct nfssvc_sock *slp, int waitflag) +{ + mbuf_t m, mp, mhck, m2; + int ns_flag=0, error; + struct msghdr msg; + size_t bytes_read; + + if ((slp->ns_flag & SLP_VALID) == 0) { + if (waitflag == MBUF_DONTWAIT) + lck_rw_done(&slp->ns_rwlock); + return; + } + #ifdef notdef /* * Define this to test for nfsds handling this under heavy load. */ - if (waitflag == M_DONTWAIT) { - ns_nflag = SLPN_NEEDQ; + if (waitflag == MBUF_DONTWAIT) { + ns_flag = SLP_NEEDQ; goto dorecs; } #endif - auio.uio_procp = NULL; - if (so->so_type == SOCK_STREAM) { + if (slp->ns_sotype == SOCK_STREAM) { /* * If there are already records on the queue, defer soreceive() * to an nfsd so that there is feedback to the TCP layer that * the nfs servers are heavily loaded. */ - if (slp->ns_rec && waitflag == M_DONTWAIT) { - ns_nflag = SLPN_NEEDQ; + if (slp->ns_rec && waitflag == MBUF_DONTWAIT) { + ns_flag = SLP_NEEDQ; goto dorecs; } /* * Do soreceive(). */ - auio.uio_resid = 1000000000; - flags = MSG_DONTWAIT; - error = soreceive(so, (struct sockaddr **) 0, &auio, &mp, (struct mbuf **)0, &flags); - if (error || mp == (struct mbuf *)0) { + bytes_read = 1000000000; + error = sock_receivembuf(so, NULL, &mp, MSG_DONTWAIT, &bytes_read); + if (error || mp == NULL) { if (error == EWOULDBLOCK) - ns_nflag = SLPN_NEEDQ; + ns_flag = SLP_NEEDQ; else - ns_nflag = SLPN_DISCONN; + ns_flag = SLP_DISCONN; goto dorecs; } m = mp; if (slp->ns_rawend) { - slp->ns_rawend->m_next = m; - slp->ns_cc += 1000000000 - auio.uio_resid; + if ((error = mbuf_setnext(slp->ns_rawend, m))) + panic("nfsrv_rcv: mbuf_setnext failed %d\n", error); + slp->ns_cc += bytes_read; } else { slp->ns_raw = m; - slp->ns_cc = 1000000000 - auio.uio_resid; + slp->ns_cc = bytes_read; } - while (m->m_next) - m = m->m_next; + while ((m2 = mbuf_next(m))) + m = m2; slp->ns_rawend = m; /* @@ -2526,48 +2206,59 @@ nfsrv_rcv(so, arg, waitflag) error = nfsrv_getstream(slp, waitflag); if (error) { if (error == EPERM) - ns_nflag = SLPN_DISCONN; + ns_flag = SLP_DISCONN; else - ns_nflag = SLPN_NEEDQ; + ns_flag = SLP_NEEDQ; } } else { + struct sockaddr_storage nam; + + bzero(&msg, sizeof(msg)); + msg.msg_name = (caddr_t)&nam; + msg.msg_namelen = sizeof(nam); + do { - auio.uio_resid = 1000000000; - flags = MSG_DONTWAIT | MSG_NEEDSA; - nam = 0; - mp = 0; - error = soreceive(so, &nam, &auio, &mp, - (struct mbuf **)0, &flags); - + bytes_read = 1000000000; + error = sock_receivembuf(so, &msg, &mp, MSG_DONTWAIT | MSG_NEEDSA, &bytes_read); if (mp) { - if (nam) { - MGET(mhck, M_WAIT, MT_SONAME); - mhck->m_len = nam->sa_len; - sin = mtod(mhck, struct sockaddr_in *); - bcopy(nam, sin, sizeof(struct sockaddr_in)); - mhck->m_hdr.mh_len = sizeof(struct sockaddr_in); - + if (msg.msg_name && (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) == 0)) { + mbuf_setlen(mhck, nam.ss_len); + bcopy(&nam, mbuf_data(mhck), 
nam.ss_len); m = mhck; - m->m_next = mp; - } else + if (mbuf_setnext(m, mp)) { + /* trouble... just drop it */ + printf("nfsrv_rcv: mbuf_setnext failed\n"); + mbuf_free(mhck); + m = mp; + } + } else { m = mp; + } if (slp->ns_recend) - slp->ns_recend->m_nextpkt = m; + mbuf_setnextpkt(slp->ns_recend, m); else slp->ns_rec = m; slp->ns_recend = m; - m->m_nextpkt = (struct mbuf *)0; - } - if (nam) { - FREE(nam, M_SONAME); + mbuf_setnextpkt(m, NULL); } +#if 0 if (error) { - if ((so->so_proto->pr_flags & PR_CONNREQUIRED) + /* + * This may be needed in the future to support + * non-byte-stream connection-oriented protocols + * such as SCTP. + */ + /* + * This (slp->ns_sotype == SOCK_STREAM) should really + * be a check for PR_CONNREQUIRED. + */ + if ((slp->ns_sotype == SOCK_STREAM) && error != EWOULDBLOCK) { - ns_nflag = SLPN_DISCONN; + ns_flag = SLP_DISCONN; goto dorecs; } } +#endif } while (mp); } @@ -2575,13 +2266,16 @@ nfsrv_rcv(so, arg, waitflag) * Now try and process the request records, non-blocking. */ dorecs: - if (ns_nflag) - slp->ns_nflag |= ns_nflag; - if (waitflag == M_DONTWAIT && - (slp->ns_rec || (slp->ns_nflag & (SLPN_NEEDQ | SLPN_DISCONN)))) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - nfsrv_wakenfsd(slp); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + if (ns_flag) + slp->ns_flag |= ns_flag; + if (waitflag == MBUF_DONTWAIT) { + int wake = (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))); + lck_rw_done(&slp->ns_rwlock); + if (wake && nfs_numnfsd) { + lck_mtx_lock(nfsd_mutex); + nfsrv_wakenfsd(slp); + lck_mtx_unlock(nfsd_mutex); + } } } @@ -2592,51 +2286,55 @@ dorecs: */ static int nfsrv_getstream(slp, waitflag) - register struct nfssvc_sock *slp; + struct nfssvc_sock *slp; int waitflag; { - register struct mbuf *m, **mpp; - register char *cp1, *cp2; - register int len; - struct mbuf *om, *m2, *recm; + mbuf_t m; + char *cp1, *cp2, *mdata; + int len, mlen, error; + mbuf_t om, m2, recm; u_long recmark; - if (slp->ns_nflag & SLPN_GETSTREAM) + if (slp->ns_flag & SLP_GETSTREAM) panic("nfs getstream"); - slp->ns_nflag |= SLPN_GETSTREAM; + slp->ns_flag |= SLP_GETSTREAM; for (;;) { if (slp->ns_reclen == 0) { if (slp->ns_cc < NFSX_UNSIGNED) { - slp->ns_nflag &= ~SLPN_GETSTREAM; + slp->ns_flag &= ~SLP_GETSTREAM; return (0); } m = slp->ns_raw; - if (m->m_len >= NFSX_UNSIGNED) { - bcopy(mtod(m, caddr_t), (caddr_t)&recmark, NFSX_UNSIGNED); - m->m_data += NFSX_UNSIGNED; - m->m_len -= NFSX_UNSIGNED; + mdata = mbuf_data(m); + mlen = mbuf_len(m); + if (mlen >= NFSX_UNSIGNED) { + bcopy(mdata, (caddr_t)&recmark, NFSX_UNSIGNED); + mdata += NFSX_UNSIGNED; + mlen -= NFSX_UNSIGNED; + mbuf_setdata(m, mdata, mlen); } else { cp1 = (caddr_t)&recmark; - cp2 = mtod(m, caddr_t); + cp2 = mdata; while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) { - while (m->m_len == 0) { - m = m->m_next; - cp2 = mtod(m, caddr_t); + while (mlen == 0) { + m = mbuf_next(m); + cp2 = mbuf_data(m); + mlen = mbuf_len(m); } *cp1++ = *cp2++; - m->m_data++; - m->m_len--; + mlen--; + mbuf_setdata(m, cp2, mlen); } } slp->ns_cc -= NFSX_UNSIGNED; recmark = ntohl(recmark); slp->ns_reclen = recmark & ~0x80000000; if (recmark & 0x80000000) - slp->ns_nflag |= SLPN_LASTFRAG; + slp->ns_flag |= SLP_LASTFRAG; else - slp->ns_nflag &= ~SLPN_LASTFRAG; + slp->ns_flag &= ~SLP_LASTFRAG; if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) { - slp->ns_nflag &= ~SLPN_GETSTREAM; + slp->ns_flag &= ~SLP_GETSTREAM; return (EPERM); } } @@ -2650,63 +2348,83 @@ nfsrv_getstream(slp, waitflag) recm = NULL; if 
(slp->ns_cc == slp->ns_reclen) { recm = slp->ns_raw; - slp->ns_raw = slp->ns_rawend = (struct mbuf *)0; + slp->ns_raw = slp->ns_rawend = NULL; slp->ns_cc = slp->ns_reclen = 0; } else if (slp->ns_cc > slp->ns_reclen) { len = 0; m = slp->ns_raw; - om = (struct mbuf *)0; + mlen = mbuf_len(m); + mdata = mbuf_data(m); + om = NULL; while (len < slp->ns_reclen) { - if ((len + m->m_len) > slp->ns_reclen) { - m2 = m_copym(m, 0, slp->ns_reclen - len, - waitflag); - if (m2) { - if (om) { - om->m_next = m2; - recm = slp->ns_raw; - } else - recm = m2; - m->m_data += slp->ns_reclen - len; - m->m_len -= slp->ns_reclen - len; - len = slp->ns_reclen; - } else { - slp->ns_nflag &= ~SLPN_GETSTREAM; + if ((len + mlen) > slp->ns_reclen) { + if (mbuf_copym(m, 0, slp->ns_reclen - len, waitflag, &m2)) { + slp->ns_flag &= ~SLP_GETSTREAM; return (EWOULDBLOCK); } - } else if ((len + m->m_len) == slp->ns_reclen) { + if (om) { + if (mbuf_setnext(om, m2)) { + /* trouble... just drop it */ + printf("nfsrv_getstream: mbuf_setnext failed\n"); + mbuf_freem(m2); + slp->ns_flag &= ~SLP_GETSTREAM; + return (EWOULDBLOCK); + } + recm = slp->ns_raw; + } else { + recm = m2; + } + mdata += slp->ns_reclen - len; + mlen -= slp->ns_reclen - len; + mbuf_setdata(m, mdata, mlen); + len = slp->ns_reclen; + } else if ((len + mlen) == slp->ns_reclen) { om = m; - len += m->m_len; - m = m->m_next; + len += mlen; + m = mbuf_next(m); recm = slp->ns_raw; - om->m_next = (struct mbuf *)0; + if (mbuf_setnext(om, NULL)) { + printf("nfsrv_getstream: mbuf_setnext failed 2\n"); + slp->ns_flag &= ~SLP_GETSTREAM; + return (EWOULDBLOCK); + } + mlen = mbuf_len(m); + mdata = mbuf_data(m); } else { om = m; - len += m->m_len; - m = m->m_next; + len += mlen; + m = mbuf_next(m); + mlen = mbuf_len(m); + mdata = mbuf_data(m); } } slp->ns_raw = m; slp->ns_cc -= len; slp->ns_reclen = 0; } else { - slp->ns_nflag &= ~SLPN_GETSTREAM; + slp->ns_flag &= ~SLP_GETSTREAM; return (0); } /* * Accumulate the fragments into a record. 
*/ - mpp = &slp->ns_frag; - while (*mpp) - mpp = &((*mpp)->m_next); - *mpp = recm; - if (slp->ns_nflag & SLPN_LASTFRAG) { + if (slp->ns_frag == NULL) { + slp->ns_frag = recm; + } else { + m = slp->ns_frag; + while ((m2 = mbuf_next(m))) + m = m2; + if ((error = mbuf_setnext(m, recm))) + panic("nfsrv_getstream: mbuf_setnext failed 3, %d\n", error); + } + if (slp->ns_flag & SLP_LASTFRAG) { if (slp->ns_recend) - slp->ns_recend->m_nextpkt = slp->ns_frag; + mbuf_setnextpkt(slp->ns_recend, slp->ns_frag); else slp->ns_rec = slp->ns_frag; slp->ns_recend = slp->ns_frag; - slp->ns_frag = (struct mbuf *)0; + slp->ns_frag = NULL; } } } @@ -2716,39 +2434,42 @@ nfsrv_getstream(slp, waitflag) */ int nfsrv_dorec(slp, nfsd, ndp) - register struct nfssvc_sock *slp; + struct nfssvc_sock *slp; struct nfsd *nfsd; struct nfsrv_descript **ndp; { - register struct mbuf *m; - register struct mbuf *nam; - register struct nfsrv_descript *nd; + mbuf_t m; + mbuf_t nam; + struct nfsrv_descript *nd; int error; *ndp = NULL; - if ((slp->ns_flag & SLP_VALID) == 0 || - (m = slp->ns_rec) == (struct mbuf *)0) + if ((slp->ns_flag & SLP_VALID) == 0 || (slp->ns_rec == NULL)) return (ENOBUFS); - slp->ns_rec = m->m_nextpkt; + MALLOC_ZONE(nd, struct nfsrv_descript *, + sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK); + if (!nd) + return (ENOMEM); + m = slp->ns_rec; + slp->ns_rec = mbuf_nextpkt(m); if (slp->ns_rec) - m->m_nextpkt = (struct mbuf *)0; + mbuf_setnextpkt(m, NULL); else - slp->ns_recend = (struct mbuf *)0; - if (m->m_type == MT_SONAME) { + slp->ns_recend = NULL; + if (mbuf_type(m) == MBUF_TYPE_SONAME) { nam = m; - m = m->m_next; - nam->m_next = NULL; + m = mbuf_next(m); + if ((error = mbuf_setnext(nam, NULL))) + panic("nfsrv_dorec: mbuf_setnext failed %d\n", error); } else nam = NULL; - MALLOC_ZONE(nd, struct nfsrv_descript *, - sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK); nd->nd_md = nd->nd_mrep = m; nd->nd_nam2 = nam; - nd->nd_dpos = mtod(m, caddr_t); + nd->nd_dpos = mbuf_data(m); error = nfs_getreq(nd, nfsd, TRUE); if (error) { if (nam) - m_freem(nam); + mbuf_freem(nam); FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC); return (error); } @@ -2764,26 +2485,32 @@ nfsrv_dorec(slp, nfsd, ndp) */ int nfs_getreq(nd, nfsd, has_header) - register struct nfsrv_descript *nd; + struct nfsrv_descript *nd; struct nfsd *nfsd; int has_header; { - register int len, i; - register u_long *tl; - register long t1; - struct uio uio; - struct iovec iov; + int len, i; + u_long *tl; + long t1; + uio_t uiop; caddr_t dpos, cp2, cp; u_long nfsvers, auth_type; uid_t nickuid; - int error = 0, nqnfs = 0, ticklen; - struct mbuf *mrep, *md; - register struct nfsuid *nuidp; + int error = 0, ticklen; + mbuf_t mrep, md; + struct nfsuid *nuidp; + uid_t user_id; + gid_t group_id; + int ngroups; + struct ucred temp_cred; struct timeval tvin, tvout, now; + char uio_buf[ UIO_SIZEOF(1) ]; #if 0 /* until encrypted keys are implemented */ NFSKERBKEYSCHED_T keys; /* stores key schedule */ #endif + nd->nd_cr = NULL; + mrep = nd->nd_mrep; md = nd->nd_md; dpos = nd->nd_dpos; @@ -2791,7 +2518,7 @@ nfs_getreq(nd, nfsd, has_header) nfsm_dissect(tl, u_long *, 10 * NFSX_UNSIGNED); nd->nd_retxid = fxdr_unsigned(u_long, *tl++); if (*tl++ != rpc_call) { - m_freem(mrep); + mbuf_freem(mrep); return (EBADRPC); } } else @@ -2804,31 +2531,23 @@ nfs_getreq(nd, nfsd, has_header) return (0); } if (*tl != nfs_prog) { - if (*tl == nqnfs_prog) - nqnfs++; - else { - nd->nd_repstat = EPROGUNAVAIL; - nd->nd_procnum = NFSPROC_NOOP; - return (0); - } + nd->nd_repstat = 
EPROGUNAVAIL; + nd->nd_procnum = NFSPROC_NOOP; + return (0); } tl++; nfsvers = fxdr_unsigned(u_long, *tl++); - if (((nfsvers < NFS_VER2 || nfsvers > NFS_VER3) && !nqnfs) || - (nfsvers != NQNFS_VER3 && nqnfs)) { + if ((nfsvers < NFS_VER2) || (nfsvers > NFS_VER3)) { nd->nd_repstat = EPROGMISMATCH; nd->nd_procnum = NFSPROC_NOOP; return (0); } - if (nqnfs) - nd->nd_flag = (ND_NFSV3 | ND_NQNFS); else if (nfsvers == NFS_VER3) nd->nd_flag = ND_NFSV3; nd->nd_procnum = fxdr_unsigned(u_long, *tl++); if (nd->nd_procnum == NFSPROC_NULL) return (0); - if (nd->nd_procnum >= NFS_NPROCS || - (!nqnfs && nd->nd_procnum >= NQNFSPROC_GETLEASE) || + if ((nd->nd_procnum >= NFS_NPROCS) || (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) { nd->nd_repstat = EPROCUNAVAIL; nd->nd_procnum = NFSPROC_NOOP; @@ -2839,7 +2558,7 @@ nfs_getreq(nd, nfsd, has_header) auth_type = *tl++; len = fxdr_unsigned(int, *tl++); if (len < 0 || len > RPCAUTH_MAXSIZ) { - m_freem(mrep); + mbuf_freem(mrep); return (EBADRPC); } @@ -2850,34 +2569,42 @@ nfs_getreq(nd, nfsd, has_header) if (auth_type == rpc_auth_unix) { len = fxdr_unsigned(int, *++tl); if (len < 0 || len > NFS_MAXNAMLEN) { - m_freem(mrep); + mbuf_freem(mrep); return (EBADRPC); } + bzero(&temp_cred, sizeof(temp_cred)); nfsm_adv(nfsm_rndup(len)); nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED); - bzero((caddr_t)&nd->nd_cr, sizeof (struct ucred)); - nd->nd_cr.cr_ref = 1; - nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++); - nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++); + user_id = fxdr_unsigned(uid_t, *tl++); + group_id = fxdr_unsigned(gid_t, *tl++); + temp_cred.cr_groups[0] = group_id; len = fxdr_unsigned(int, *tl); if (len < 0 || len > RPCAUTH_UNIXGIDS) { - m_freem(mrep); + mbuf_freem(mrep); return (EBADRPC); } nfsm_dissect(tl, u_long *, (len + 2) * NFSX_UNSIGNED); for (i = 1; i <= len; i++) if (i < NGROUPS) - nd->nd_cr.cr_groups[i] = fxdr_unsigned(gid_t, *tl++); + temp_cred.cr_groups[i] = fxdr_unsigned(gid_t, *tl++); else tl++; - nd->nd_cr.cr_ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1); - if (nd->nd_cr.cr_ngroups > 1) - nfsrvw_sort(nd->nd_cr.cr_groups, nd->nd_cr.cr_ngroups); + ngroups = (len >= NGROUPS) ? 
NGROUPS : (len + 1); + if (ngroups > 1) + nfsrvw_sort(&temp_cred.cr_groups[0], ngroups); len = fxdr_unsigned(int, *++tl); if (len < 0 || len > RPCAUTH_MAXSIZ) { - m_freem(mrep); + mbuf_freem(mrep); return (EBADRPC); } + temp_cred.cr_uid = user_id; + temp_cred.cr_ngroups = ngroups; + nd->nd_cr = kauth_cred_create(&temp_cred); + if (nd->nd_cr == NULL) { + nd->nd_repstat = ENOMEM; + nd->nd_procnum = NFSPROC_NOOP; + return (0); + } if (len > 0) nfsm_adv(nfsm_rndup(len)); } else if (auth_type == rpc_auth_kerb) { @@ -2885,19 +2612,23 @@ nfs_getreq(nd, nfsd, has_header) case RPCAKN_FULLNAME: ticklen = fxdr_unsigned(int, *tl); *((u_long *)nfsd->nfsd_authstr) = *tl; - uio.uio_resid = nfsm_rndup(ticklen) + NFSX_UNSIGNED; - nfsd->nfsd_authlen = uio.uio_resid + NFSX_UNSIGNED; - if (uio.uio_resid > (len - 2 * NFSX_UNSIGNED)) { - m_freem(mrep); + uiop = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, + &uio_buf[0], sizeof(uio_buf)); + if (!uiop) { + nd->nd_repstat = ENOMEM; + nd->nd_procnum = NFSPROC_NOOP; + return (0); + } + + // LP64todo - fix this + nfsd->nfsd_authlen = (nfsm_rndup(ticklen) + (NFSX_UNSIGNED * 2)); + if ((nfsm_rndup(ticklen) + NFSX_UNSIGNED) > (len - 2 * NFSX_UNSIGNED)) { + mbuf_freem(mrep); return (EBADRPC); } - uio.uio_offset = 0; - uio.uio_iov = &iov; - uio.uio_iovcnt = 1; - uio.uio_segflg = UIO_SYSSPACE; - iov.iov_base = (caddr_t)&nfsd->nfsd_authstr[4]; - iov.iov_len = RPCAUTH_MAXSIZ - 4; - nfsm_mtouio(&uio, uio.uio_resid); + uio_addiov(uiop, CAST_USER_ADDR_T(&nfsd->nfsd_authstr[4]), RPCAUTH_MAXSIZ - 4); + // LP64todo - fix this + nfsm_mtouio(uiop, uio_resid(uiop)); nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); if (*tl++ != rpc_auth_kerb || fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) { @@ -2942,7 +2673,7 @@ nfs_getreq(nd, nfsd, has_header) for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first; nuidp != 0; nuidp = nuidp->nu_hash.le_next) { - if (nuidp->nu_cr.cr_uid == nickuid && + if (kauth_cred_getuid(nuidp->nu_cr) == nickuid && (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp), &nuidp->nu_haddr, nd->nd_nam2))) @@ -2976,7 +2707,21 @@ nfs_getreq(nd, nfsd, has_header) nd->nd_procnum = NFSPROC_NOOP; return (0); } - nfsrv_setcred(&nuidp->nu_cr, &nd->nd_cr); + bzero(&temp_cred, sizeof(temp_cred)); + ngroups = nuidp->nu_cr->cr_ngroups; + for (i = 0; i < ngroups; i++) + temp_cred.cr_groups[i] = nuidp->nu_cr->cr_groups[i]; + if (ngroups > 1) + nfsrvw_sort(&temp_cred.cr_groups[0], ngroups); + + temp_cred.cr_uid = kauth_cred_getuid(nuidp->nu_cr); + temp_cred.cr_ngroups = ngroups; + nd->nd_cr = kauth_cred_create(&temp_cred); + if (!nd->nd_cr) { + nd->nd_repstat = ENOMEM; + nd->nd_procnum = NFSPROC_NOOP; + return (0); + } nd->nd_flag |= ND_KERBNICK; }; } else { @@ -2985,23 +2730,12 @@ nfs_getreq(nd, nfsd, has_header) return (0); } - /* - * For nqnfs, get piggybacked lease request. - */ - if (nqnfs && nd->nd_procnum != NQNFSPROC_EVICTED) { - nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); - nd->nd_flag |= fxdr_unsigned(int, *tl); - if (nd->nd_flag & ND_LEASE) { - nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); - nd->nd_duration = fxdr_unsigned(int, *tl); - } else - nd->nd_duration = NQ_MINLEASE; - } else - nd->nd_duration = NQ_MINLEASE; nd->nd_md = md; nd->nd_dpos = dpos; return (0); nfsmout: + if (nd->nd_cr) + kauth_cred_rele(nd->nd_cr); return (error); } @@ -3009,36 +2743,46 @@ nfsmout: * Search for a sleeping nfsd and wake it up. * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the * running nfsds will go look for the work in the nfssvc_sock list. 
+ * Note: Must be called with nfsd_mutex held. */ void -nfsrv_wakenfsd(slp) - struct nfssvc_sock *slp; +nfsrv_wakenfsd(struct nfssvc_sock *slp) { - register struct nfsd *nd; + struct nfsd *nd; if ((slp->ns_flag & SLP_VALID) == 0) return; - TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) { - if (nd->nfsd_flag & NFSD_WAITING) { - nd->nfsd_flag &= ~NFSD_WAITING; - if (nd->nfsd_slp) - panic("nfsd wakeup"); - slp->ns_sref++; - nd->nfsd_slp = slp; - wakeup((caddr_t)nd); - return; + + lck_rw_lock_exclusive(&slp->ns_rwlock); + + if (nfsd_waiting) { + TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) { + if (nd->nfsd_flag & NFSD_WAITING) { + nd->nfsd_flag &= ~NFSD_WAITING; + if (nd->nfsd_slp) + panic("nfsd wakeup"); + slp->ns_sref++; + nd->nfsd_slp = slp; + lck_rw_done(&slp->ns_rwlock); + wakeup((caddr_t)nd); + return; + } } } + slp->ns_flag |= SLP_DOREC; + + lck_rw_done(&slp->ns_rwlock); + nfsd_head_flag |= NFSD_CHECKSLP; } #endif /* NFS_NOSERVER */ static int -nfs_msg(p, server, msg, error) - struct proc *p; - const char *server, *msg; - int error; +nfs_msg(proc_t p, + const char *server, + const char *msg, + int error) { tpr_t tpr; @@ -3056,51 +2800,43 @@ nfs_msg(p, server, msg, error) } void -nfs_down(rep, nmp, proc, msg, error, flags) - struct nfsreq *rep; +nfs_down(nmp, proc, error, flags, msg) struct nfsmount *nmp; - struct proc *proc; - const char *msg; + proc_t proc; int error, flags; + const char *msg; { if (nmp == NULL) return; if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) { - vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid, - VQ_NOTRESP, 0); + vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 0); nmp->nm_state |= NFSSTA_TIMEO; } if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) { - vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid, - VQ_NOTRESPLOCK, 0); + vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 0); nmp->nm_state |= NFSSTA_LOCKTIMEO; } - if (rep) - rep->r_flags |= R_TPRINTFMSG; - nfs_msg(proc, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error); + nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, error); } void -nfs_up(rep, nmp, proc, msg, flags) - struct nfsreq *rep; +nfs_up(nmp, proc, flags, msg) struct nfsmount *nmp; - struct proc *proc; - const char *msg; + proc_t proc; int flags; + const char *msg; { if (nmp == NULL) return; - if ((rep == NULL) || (rep->r_flags & R_TPRINTFMSG) != 0) - nfs_msg(proc, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0); + if (msg) + nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, 0); if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) { nmp->nm_state &= ~NFSSTA_TIMEO; - vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid, - VQ_NOTRESP, 1); + vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 1); } if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) { nmp->nm_state &= ~NFSSTA_LOCKTIMEO; - vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid, - VQ_NOTRESPLOCK, 1); + vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 1); } } diff --git a/bsd/nfs/nfs_srvcache.c b/bsd/nfs/nfs_srvcache.c index 9e7007ddb..a4ce111ae 100644 --- a/bsd/nfs/nfs_srvcache.c +++ b/bsd/nfs/nfs_srvcache.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -67,14 +67,15 @@ */ #include <sys/param.h> #include <sys/vnode.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> #include <sys/kernel.h> #include <sys/systm.h> #include <sys/proc.h> -#include <sys/mbuf.h> +#include <sys/kpi_mbuf.h> #include <sys/malloc.h> #include <sys/socket.h> #include <sys/socketvar.h> /* for dup_sockaddr */ +#include <libkern/OSAtomic.h> #include <netinet/in.h> #if ISO @@ -96,8 +97,10 @@ LIST_HEAD(nfsrvhash, nfsrvcache) *nfsrvhashtbl; TAILQ_HEAD(nfsrvlru, nfsrvcache) nfsrvlruhead; u_long nfsrvhash; -#define TRUE 1 -#define FALSE 0 +lck_grp_t *nfsrv_reqcache_lck_grp; +lck_grp_attr_t *nfsrv_reqcache_lck_grp_attr; +lck_attr_t *nfsrv_reqcache_lck_attr; +lck_mtx_t *nfsrv_reqcache_mutex; #define NETFAMILY(rp) \ (((rp)->rc_flag & RC_INETADDR) ? AF_INET : AF_ISO) @@ -129,9 +132,6 @@ static int nonidempotent[NFS_NPROCS] = { FALSE, FALSE, FALSE, - FALSE, - FALSE, - FALSE, }; /* True iff the rpc reply is an nfs status ONLY! */ @@ -162,6 +162,12 @@ static int nfsv2_repstat[NFS_NPROCS] = { void nfsrv_initcache() { + /* init nfs server request cache mutex */ + nfsrv_reqcache_lck_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(nfsrv_reqcache_lck_grp_attr); + nfsrv_reqcache_lck_grp = lck_grp_alloc_init("nfsrv_reqcache", nfsrv_reqcache_lck_grp_attr); + nfsrv_reqcache_lck_attr = lck_attr_alloc_init(); + nfsrv_reqcache_mutex = lck_mtx_alloc_init(nfsrv_reqcache_lck_grp, nfsrv_reqcache_lck_attr); nfsrvhashtbl = hashinit(desirednfsrvcache, M_NFSD, &nfsrvhash); TAILQ_INIT(&nfsrvlruhead); @@ -183,15 +189,15 @@ nfsrv_initcache() */ int nfsrv_getcache(nd, slp, repp) - register struct nfsrv_descript *nd; + struct nfsrv_descript *nd; struct nfssvc_sock *slp; - struct mbuf **repp; + mbuf_t *repp; { - register struct nfsrvcache *rp; - struct mbuf *mb; + struct nfsrvcache *rp; + mbuf_t mb; struct sockaddr_in *saddr; caddr_t bpos; - int ret; + int ret, error; /* * Don't cache recent requests for reliable transport protocols. 
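The request-cache hunks that follow serialize every cache lookup and update under the new nfsrv_reqcache_mutex allocated in nfsrv_initcache() above. For readers unfamiliar with the lck_* KPI, here is a minimal sketch of the setup idiom this patch repeats for nfsd_mutex, the per-socket rwlocks, and the export rwlock; the example_* names are hypothetical stand-ins for a subsystem's own globals, not symbols from this patch.

#include <kern/locks.h>

static lck_grp_attr_t *example_lck_grp_attr;
static lck_grp_t      *example_lck_grp;
static lck_attr_t     *example_lck_attr;
static lck_mtx_t      *example_mutex;

/* One-time setup, typically called from the subsystem's init routine. */
static void
example_locks_init(void)
{
	example_lck_grp_attr = lck_grp_attr_alloc_init();
	example_lck_grp = lck_grp_alloc_init("example", example_lck_grp_attr);
	example_lck_attr = lck_attr_alloc_init();
	example_mutex = lck_mtx_alloc_init(example_lck_grp, example_lck_attr);
}

Callers then bracket shared state with lck_mtx_lock()/lck_mtx_unlock(), exactly as nfsrv_getcache() does with nfsrv_reqcache_mutex in the next hunk.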
@@ -199,12 +205,12 @@ nfsrv_getcache(nd, slp, repp) */ if (!nd->nd_nam2) return (RC_DOIT); + lck_mtx_lock(nfsrv_reqcache_mutex); loop: for (rp = NFSRCHASH(nd->nd_retxid)->lh_first; rp != 0; rp = rp->rc_hash.le_next) { if (nd->nd_retxid == rp->rc_xid && nd->nd_procnum == rp->rc_proc && netaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) { - NFS_DPF(RC, ("H%03x", rp->rc_xid & 0xfff)); if ((rp->rc_flag & RC_LOCKED) != 0) { rp->rc_flag |= RC_WANTED; (void) tsleep((caddr_t)rp, PZERO-1, "nfsrc", 0); @@ -219,20 +225,23 @@ loop: if (rp->rc_state == RC_UNUSED) panic("nfsrv cache"); if (rp->rc_state == RC_INPROG) { - nfsstats.srvcache_inproghits++; + OSAddAtomic(1, (SInt32*)&nfsstats.srvcache_inproghits); ret = RC_DROPIT; } else if (rp->rc_flag & RC_REPSTATUS) { - nfsstats.srvcache_nonidemdonehits++; - nfs_rephead(0, nd, slp, rp->rc_status, - 0, (u_quad_t *)0, repp, &mb, &bpos); + OSAddAtomic(1, (SInt32*)&nfsstats.srvcache_nonidemdonehits); + nfs_rephead(0, nd, slp, rp->rc_status, repp, &mb, &bpos); ret = RC_REPLY; } else if (rp->rc_flag & RC_REPMBUF) { - nfsstats.srvcache_nonidemdonehits++; - *repp = m_copym(rp->rc_reply, 0, M_COPYALL, - M_WAIT); - ret = RC_REPLY; + OSAddAtomic(1, (SInt32*)&nfsstats.srvcache_nonidemdonehits); + error = mbuf_copym(rp->rc_reply, 0, MBUF_COPYALL, MBUF_WAITOK, repp); + if (error) { + printf("nfsrv cache: reply copym failed for nonidem request hit\n"); + ret = RC_DROPIT; + } else { + ret = RC_REPLY; + } } else { - nfsstats.srvcache_idemdonehits++; + OSAddAtomic(1, (SInt32*)&nfsstats.srvcache_idemdonehits); rp->rc_state = RC_INPROG; ret = RC_DOIT; } @@ -241,18 +250,31 @@ loop: rp->rc_flag &= ~RC_WANTED; wakeup((caddr_t)rp); } + lck_mtx_unlock(nfsrv_reqcache_mutex); return (ret); } } - nfsstats.srvcache_misses++; - NFS_DPF(RC, ("M%03x", nd->nd_retxid & 0xfff)); + OSAddAtomic(1, (SInt32*)&nfsstats.srvcache_misses); if (numnfsrvcache < desirednfsrvcache) { + /* try to allocate a new entry */ MALLOC(rp, struct nfsrvcache *, sizeof *rp, M_NFSD, M_WAITOK); - bzero((char *)rp, sizeof *rp); - numnfsrvcache++; - rp->rc_flag = RC_LOCKED; + if (rp) { + bzero((char *)rp, sizeof *rp); + numnfsrvcache++; + rp->rc_flag = RC_LOCKED; + } } else { + rp = NULL; + } + if (!rp) { + /* try to reuse the least recently used entry */ rp = nfsrvlruhead.tqh_first; + if (!rp) { + /* no entry to reuse? 
*/ + /* OK, we just won't be able to cache this request */ + lck_mtx_unlock(nfsrv_reqcache_mutex); + return (RC_DOIT); + } while ((rp->rc_flag & RC_LOCKED) != 0) { rp->rc_flag |= RC_WANTED; (void) tsleep((caddr_t)rp, PZERO-1, "nfsrc", 0); @@ -262,15 +284,15 @@ loop: LIST_REMOVE(rp, rc_hash); TAILQ_REMOVE(&nfsrvlruhead, rp, rc_lru); if (rp->rc_flag & RC_REPMBUF) - m_freem(rp->rc_reply); + mbuf_freem(rp->rc_reply); if (rp->rc_flag & RC_NAM) - MFREE(rp->rc_nam, mb); + mbuf_freem(rp->rc_nam); rp->rc_flag &= (RC_LOCKED | RC_WANTED); } TAILQ_INSERT_TAIL(&nfsrvlruhead, rp, rc_lru); rp->rc_state = RC_INPROG; rp->rc_xid = nd->nd_retxid; - saddr = mtod(nd->nd_nam, struct sockaddr_in *); + saddr = mbuf_data(nd->nd_nam); switch (saddr->sin_family) { case AF_INET: rp->rc_flag |= RC_INETADDR; @@ -278,8 +300,11 @@ loop: break; case AF_ISO: default: - rp->rc_flag |= RC_NAM; - rp->rc_nam = m_copym(nd->nd_nam, 0, M_COPYALL, M_WAIT); + error = mbuf_copym(nd->nd_nam, 0, MBUF_COPYALL, MBUF_WAITOK, &rp->rc_nam); + if (error) + printf("nfsrv cache: nam copym failed\n"); + else + rp->rc_flag |= RC_NAM; break; }; rp->rc_proc = nd->nd_procnum; @@ -289,6 +314,7 @@ loop: rp->rc_flag &= ~RC_WANTED; wakeup((caddr_t)rp); } + lck_mtx_unlock(nfsrv_reqcache_mutex); return (RC_DOIT); } @@ -297,20 +323,21 @@ loop: */ void nfsrv_updatecache(nd, repvalid, repmbuf) - register struct nfsrv_descript *nd; + struct nfsrv_descript *nd; int repvalid; - struct mbuf *repmbuf; + mbuf_t repmbuf; { - register struct nfsrvcache *rp; + struct nfsrvcache *rp; + int error; if (!nd->nd_nam2) return; + lck_mtx_lock(nfsrv_reqcache_mutex); loop: for (rp = NFSRCHASH(nd->nd_retxid)->lh_first; rp != 0; rp = rp->rc_hash.le_next) { if (nd->nd_retxid == rp->rc_xid && nd->nd_procnum == rp->rc_proc && netaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) { - NFS_DPF(RC, ("U%03x", rp->rc_xid & 0xfff)); if ((rp->rc_flag & RC_LOCKED) != 0) { rp->rc_flag |= RC_WANTED; (void) tsleep((caddr_t)rp, PZERO-1, "nfsrc", 0); @@ -328,9 +355,9 @@ loop: rp->rc_status = nd->nd_repstat; rp->rc_flag |= RC_REPSTATUS; } else { - rp->rc_reply = m_copym(repmbuf, - 0, M_COPYALL, M_WAIT); - rp->rc_flag |= RC_REPMBUF; + error = mbuf_copym(repmbuf, 0, MBUF_COPYALL, MBUF_WAITOK, &rp->rc_reply); + if (!error) + rp->rc_flag |= RC_REPMBUF; } } rp->rc_flag &= ~RC_LOCKED; @@ -338,10 +365,11 @@ loop: rp->rc_flag &= ~RC_WANTED; wakeup((caddr_t)rp); } + lck_mtx_unlock(nfsrv_reqcache_mutex); return; } } - NFS_DPF(RC, ("L%03x", nd->nd_retxid & 0xfff)); + lck_mtx_unlock(nfsrv_reqcache_mutex); } /* @@ -350,8 +378,9 @@ loop: void nfsrv_cleancache() { - register struct nfsrvcache *rp, *nextrp; + struct nfsrvcache *rp, *nextrp; + lck_mtx_lock(nfsrv_reqcache_mutex); for (rp = nfsrvlruhead.tqh_first; rp != 0; rp = nextrp) { nextrp = rp->rc_lru.tqe_next; LIST_REMOVE(rp, rc_hash); @@ -359,6 +388,7 @@ nfsrv_cleancache() _FREE(rp, M_NFSD); } numnfsrvcache = 0; + lck_mtx_unlock(nfsrv_reqcache_mutex); } #endif /* NFS_NOSERVER */ diff --git a/bsd/nfs/nfs_subs.c b/bsd/nfs/nfs_subs.c index 836b85f0f..d0c970018 100644 --- a/bsd/nfs/nfs_subs.c +++ b/bsd/nfs/nfs_subs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -66,25 +66,25 @@ */ #include <sys/param.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <sys/systm.h> #include <sys/kernel.h> -#include <sys/mount.h> -#include <sys/vnode.h> -#include <sys/namei.h> -#include <sys/mbuf.h> +#include <sys/mount_internal.h> +#include <sys/vnode_internal.h> +#include <sys/kpi_mbuf.h> #include <sys/socket.h> #include <sys/stat.h> #include <sys/malloc.h> #include <sys/syscall.h> #include <sys/sysctl.h> -#include <sys/ubc.h> +#include <sys/ubc_internal.h> #include <sys/fcntl.h> +#include <sys/uio_internal.h> +#include <sys/domain.h> +#include <libkern/OSAtomic.h> #include <sys/vm.h> #include <sys/vmparam.h> -#include <machine/spl.h> - -#include <sys/lock.h> #include <sys/time.h> #include <kern/clock.h> @@ -96,7 +96,6 @@ #include <nfs/xdr_subs.h> #include <nfs/nfsm_subs.h> #include <nfs/nfsmount.h> -#include <nfs/nqnfs.h> #include <nfs/nfsrtt.h> #include <nfs/nfs_lock.h> @@ -129,7 +128,9 @@ u_long nfs_xdrneg1; u_long rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr, rpc_mismatch, rpc_auth_unix, rpc_msgaccepted, rpc_auth_kerb; -u_long nfs_prog, nqnfs_prog, nfs_true, nfs_false; +u_long nfs_prog, nfs_true, nfs_false; +__private_extern__ int nfs_mbuf_mlen = 0, nfs_mbuf_mhlen = 0, + nfs_mbuf_minclsize = 0, nfs_mbuf_mclbytes = 0; /* And other global data */ static u_long nfs_xid = 0; @@ -144,15 +145,28 @@ enum vtype nv3tov_type[8]= { int nfs_mount_type; int nfs_ticks; +lck_grp_t *nfsd_lck_grp; +lck_grp_attr_t *nfsd_lck_grp_attr; +lck_attr_t *nfsd_lck_attr; +lck_mtx_t *nfsd_mutex; + +lck_grp_attr_t *nfs_slp_group_attr; +lck_attr_t *nfs_slp_lock_attr; +lck_grp_t *nfs_slp_rwlock_group; +lck_grp_t *nfs_slp_mutex_group; + struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; -int nfssvc_sockhead_flag; struct nfsd_head nfsd_head; int nfsd_head_flag; -struct nfs_bufq nfs_bufq; -struct nqtimerhead nqtimerhead; -struct nqfhhashhead *nqfhhashtbl; -u_long nqfhhash; + +struct nfsexpfslist nfs_exports; +struct nfsexphashhead *nfsexphashtbl; +u_long nfsexphash; +lck_grp_attr_t *nfs_export_group_attr; +lck_attr_t *nfs_export_lock_attr; +lck_grp_t *nfs_export_rwlock_group; +lck_rw_t nfs_export_rwlock; #ifndef NFS_NOSERVER /* @@ -181,9 +195,6 @@ int nfsv3_procid[NFS_NPROCS] = { NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, - NFSPROC_NOOP, - NFSPROC_NOOP, - NFSPROC_NOOP, NFSPROC_NOOP }; @@ -214,10 +225,7 @@ int nfsv2_procid[NFS_NPROCS] = { NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, - NFSV2PROC_NOOP, - NFSV2PROC_NOOP, - NFSV2PROC_NOOP, - NFSV2PROC_NOOP, + NFSV2PROC_NOOP }; #ifndef NFS_NOSERVER @@ -581,12 +589,7 @@ static short *nfsrv_v3errmap[] = { #endif /* NFS_NOSERVER */ extern struct nfsrtt nfsrtt; -extern time_t nqnfsstarttime; -extern int nqsrv_clockskew; -extern int nqsrv_writeslack; -extern int nqsrv_maxlease; extern struct nfsstats nfsstats; -extern int nqnfs_piggy[NFS_NPROCS]; extern nfstype nfsv2_type[9]; extern nfstype nfsv3_type[9]; extern struct nfsnodehashhead *nfsnodehashtbl; @@ -600,46 +603,20 @@ LIST_HEAD(nfsnodehashhead, nfsnode); * The hsiz is the size of the rest of the nfs request header. 
* (just used to decide if a cluster is a good idea) */ -struct mbuf * -nfsm_reqh(vp, procid, hsiz, bposp) - struct vnode *vp; - u_long procid; - int hsiz; - caddr_t *bposp; +int +nfsm_reqh(int hsiz, caddr_t *bposp, mbuf_t *mbp) { - register struct mbuf *mb; - register u_long *tl; - register caddr_t bpos; - struct mbuf *mb2; - struct nfsmount *nmp; - int nqflag; - - MGET(mb, M_WAIT, MT_DATA); - if (hsiz >= MINCLSIZE) - MCLGET(mb, M_WAIT); - mb->m_len = 0; - bpos = mtod(mb, caddr_t); + int error; - /* - * For NQNFS, add lease request. - */ - if (vp) { - nmp = VFSTONFS(vp->v_mount); - if (nmp && (nmp->nm_flag & NFSMNT_NQNFS)) { - nqflag = NQNFS_NEEDLEASE(vp, procid); - if (nqflag) { - nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); - *tl++ = txdr_unsigned(nqflag); - *tl = txdr_unsigned(nmp->nm_leaseterm); - } else { - nfsm_build(tl, u_long *, NFSX_UNSIGNED); - *tl = 0; - } - } - } - /* Finally, return values */ - *bposp = bpos; - return (mb); + *mbp = NULL; + if (hsiz >= nfs_mbuf_minclsize) + error = mbuf_mclget(MBUF_WAITOK, MBUF_TYPE_DATA, mbp); + else + error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_DATA, mbp); + if (error) + return (error); + *bposp = mbuf_data(*mbp); + return (0); } /* @@ -648,10 +625,10 @@ nfsm_reqh(vp, procid, hsiz, bposp) * come from outside of the kernel. * Returns the head of the mbuf list. */ -struct mbuf * +int nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, - verf_str, mrest, mrest_len, mbp, xidp) - register struct ucred *cr; + verf_str, mrest, mrest_len, mbp, xidp, mreqp) + kauth_cred_t cr; int nmflag; int procid; int auth_type; @@ -659,31 +636,40 @@ nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, char *auth_str; int verf_len; char *verf_str; - struct mbuf *mrest; + mbuf_t mrest; int mrest_len; - struct mbuf **mbp; + mbuf_t *mbp; u_long *xidp; + mbuf_t *mreqp; { - register struct mbuf *mb; - register u_long *tl; - register caddr_t bpos; - register int i; - struct mbuf *mreq, *mb2; - int siz, grpsiz, authsiz; + mbuf_t mb; + u_long *tl; + caddr_t bpos; + int i, error, len; + mbuf_t mreq, mb2; + int siz, grpsiz, authsiz, mlen; struct timeval tv; authsiz = nfsm_rndup(auth_len); - MGETHDR(mb, M_WAIT, MT_DATA); - if ((authsiz + 10 * NFSX_UNSIGNED) >= MINCLSIZE) { - MCLGET(mb, M_WAIT); - } else if ((authsiz + 10 * NFSX_UNSIGNED) < MHLEN) { - MH_ALIGN(mb, authsiz + 10 * NFSX_UNSIGNED); + len = authsiz + 10 * NFSX_UNSIGNED; + if (len >= nfs_mbuf_minclsize) { + error = mbuf_getpacket(MBUF_WAITOK, &mb); } else { - MH_ALIGN(mb, 8 * NFSX_UNSIGNED); + error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mb); + if (!error) { + if (len < nfs_mbuf_mhlen) + mbuf_align_32(mb, len); + else + mbuf_align_32(mb, 8 * NFSX_UNSIGNED); + } + } + if (error) { + /* unable to allocate packet */ + /* XXX nfsstat? */ + return (error); } - mb->m_len = 0; mreq = mb; - bpos = mtod(mb, caddr_t); + bpos = mbuf_data(mb); /* * First the RPC header. 
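The conversion above replaces the old MGET/MGETHDR/MCLGET macros with the opaque, error-returning mbuf KPI. As a reference for the pattern, here is a minimal sketch of the size-based allocation choice nfsm_reqh() now makes; example_alloc_buf is a hypothetical helper, and it assumes nfs_mbuf_minclsize has been primed by nfs_mbuf_init() as elsewhere in this patch.

#include <sys/kpi_mbuf.h>

extern int nfs_mbuf_minclsize;	/* cached mbuf constant, see nfs_mbuf_init() */

/*
 * Hypothetical helper mirroring nfsm_reqh(): use a cluster mbuf when the
 * data is big enough to warrant one, otherwise a plain data mbuf.
 * Unlike the old macros, every KPI call reports failure via its return value.
 */
static int
example_alloc_buf(int len, mbuf_t *mp)
{
	int error;

	*mp = NULL;
	if (len >= nfs_mbuf_minclsize)
		error = mbuf_mclget(MBUF_WAITOK, MBUF_TYPE_DATA, mp);
	else
		error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_DATA, mp);
	if (error)
		return (error);
	mbuf_setlen(*mp, 0);	/* fresh buffer, no payload yet */
	return (0);
}

Note that callers no longer touch m_len or use mtod() directly; lengths and data pointers go through mbuf_setlen(), mbuf_setdata(), and mbuf_data(), as the hunks below show.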
@@ -714,16 +700,11 @@ nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, *tl++ = *xidp = txdr_unsigned(nfs_xid); *tl++ = rpc_call; *tl++ = rpc_vers; - if (nmflag & NFSMNT_NQNFS) { - *tl++ = txdr_unsigned(NQNFS_PROG); - *tl++ = txdr_unsigned(NQNFS_VER3); - } else { - *tl++ = txdr_unsigned(NFS_PROG); - if (nmflag & NFSMNT_NFSV3) - *tl++ = txdr_unsigned(NFS_VER3); - else - *tl++ = txdr_unsigned(NFS_VER2); - } + *tl++ = txdr_unsigned(NFS_PROG); + if (nmflag & NFSMNT_NFSV3) + *tl++ = txdr_unsigned(NFS_VER3); + else + *tl++ = txdr_unsigned(NFS_VER2); if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(procid); else @@ -739,7 +720,7 @@ nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, nfsm_build(tl, u_long *, auth_len); *tl++ = 0; /* stamp ?? */ *tl++ = 0; /* NULL hostname */ - *tl++ = txdr_unsigned(cr->cr_uid); + *tl++ = txdr_unsigned(kauth_cred_getuid(cr)); *tl++ = txdr_unsigned(cr->cr_groups[0]); grpsiz = (auth_len >> 2) - 5; *tl++ = txdr_unsigned(grpsiz); @@ -748,19 +729,28 @@ nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, break; case RPCAUTH_KERB4: siz = auth_len; + mlen = mbuf_len(mb); while (siz > 0) { - if (M_TRAILINGSPACE(mb) == 0) { - MGET(mb2, M_WAIT, MT_DATA); - if (siz >= MINCLSIZE) - MCLGET(mb2, M_WAIT); - mb->m_next = mb2; + if (mbuf_trailingspace(mb) == 0) { + mb2 = NULL; + if (siz >= nfs_mbuf_minclsize) + error = mbuf_mclget(MBUF_WAITOK, MBUF_TYPE_DATA, &mb2); + else + error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_DATA, &mb2); + if (!error) + error = mbuf_setnext(mb, mb2); + if (error) { + mbuf_freem(mreq); + return (error); + } mb = mb2; - mb->m_len = 0; - bpos = mtod(mb, caddr_t); + mlen = 0; + bpos = mbuf_data(mb); } - i = min(siz, M_TRAILINGSPACE(mb)); + i = min(siz, mbuf_trailingspace(mb)); bcopy(auth_str, bpos, i); - mb->m_len += i; + mlen += i; + mbuf_setlen(mb, mlen); auth_str += i; bpos += i; siz -= i; @@ -768,7 +758,8 @@ nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; - mb->m_len += siz; + mlen += siz; + mbuf_setlen(mb, mlen); } break; }; @@ -778,22 +769,31 @@ nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, */ nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); if (verf_str) { + mlen = mbuf_len(mb); *tl++ = txdr_unsigned(RPCAUTH_KERB4); *tl = txdr_unsigned(verf_len); siz = verf_len; while (siz > 0) { - if (M_TRAILINGSPACE(mb) == 0) { - MGET(mb2, M_WAIT, MT_DATA); - if (siz >= MINCLSIZE) - MCLGET(mb2, M_WAIT); - mb->m_next = mb2; + if (mbuf_trailingspace(mb) == 0) { + mb2 = NULL; + if (siz >= nfs_mbuf_minclsize) + error = mbuf_mclget(MBUF_WAITOK, MBUF_TYPE_DATA, &mb2); + else + error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_DATA, &mb2); + if (!error) + error = mbuf_setnext(mb, mb2); + if (error) { + mbuf_freem(mreq); + return (error); + } mb = mb2; - mb->m_len = 0; - bpos = mtod(mb, caddr_t); + mlen = 0; + bpos = mbuf_data(mb); } - i = min(siz, M_TRAILINGSPACE(mb)); + i = min(siz, mbuf_trailingspace(mb)); bcopy(verf_str, bpos, i); - mb->m_len += i; + mlen += i; + mbuf_setlen(mb, mlen); verf_str += i; bpos += i; siz -= i; @@ -801,17 +801,24 @@ nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, if ((siz = (nfsm_rndup(verf_len) - verf_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; - mb->m_len += siz; + mlen += siz; + mbuf_setlen(mb, mlen); } } else { *tl++ = txdr_unsigned(RPCAUTH_NULL); *tl = 0; } - mb->m_next = mrest; - mreq->m_pkthdr.len = authsiz 
+ 10 * NFSX_UNSIGNED + mrest_len; - mreq->m_pkthdr.rcvif = (struct ifnet *)0; + error = mbuf_pkthdr_setrcvif(mreq, 0); + if (!error) + error = mbuf_setnext(mb, mrest); + if (error) { + mbuf_freem(mreq); + return (error); + } + mbuf_pkthdr_setlen(mreq, authsiz + 10 * NFSX_UNSIGNED + mrest_len); *mbp = mb; - return (mreq); + *mreqp = mreq; + return (0); } /* @@ -819,62 +826,56 @@ nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, */ int nfsm_mbuftouio(mrep, uiop, siz, dpos) - struct mbuf **mrep; - register struct uio *uiop; + mbuf_t *mrep; + struct uio *uiop; int siz; caddr_t *dpos; { - register char *mbufcp, *uiocp; - register int xfer, left, len; - register struct mbuf *mp; + char *mbufcp, *uiocp; + int xfer, left, len; + mbuf_t mp; long uiosiz, rem; int error = 0; mp = *mrep; mbufcp = *dpos; - len = mtod(mp, caddr_t)+mp->m_len-mbufcp; + len = (caddr_t)mbuf_data(mp) + mbuf_len(mp) - mbufcp; rem = nfsm_rndup(siz)-siz; while (siz > 0) { - if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) + if (uiop->uio_iovcnt <= 0 || uiop->uio_iovs.iov32p == NULL) return (EFBIG); - left = uiop->uio_iov->iov_len; - uiocp = uiop->uio_iov->iov_base; + // LP64todo - fix this! + left = uio_iov_len(uiop); + uiocp = CAST_DOWN(caddr_t, uio_iov_base(uiop)); if (left > siz) left = siz; uiosiz = left; while (left > 0) { while (len == 0) { - mp = mp->m_next; + mp = mbuf_next(mp); if (mp == NULL) return (EBADRPC); - mbufcp = mtod(mp, caddr_t); - len = mp->m_len; + mbufcp = mbuf_data(mp); + len = mbuf_len(mp); } xfer = (left > len) ? len : left; -#ifdef notdef - /* Not Yet.. */ - if (uiop->uio_iov->iov_op != NULL) - (*(uiop->uio_iov->iov_op)) - (mbufcp, uiocp, xfer); + if (UIO_SEG_IS_USER_SPACE(uiop->uio_segflg)) + copyout(mbufcp, CAST_USER_ADDR_T(uiocp), xfer); else -#endif - if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(mbufcp, uiocp, xfer); - else - copyout(mbufcp, uiocp, xfer); left -= xfer; len -= xfer; mbufcp += xfer; uiocp += xfer; uiop->uio_offset += xfer; - uiop->uio_resid -= xfer; + uio_uio_resid_add(uiop, -xfer); } - if (uiop->uio_iov->iov_len <= siz) { + if (uio_iov_len(uiop) <= (size_t)siz) { uiop->uio_iovcnt--; - uiop->uio_iov++; + uio_next_iov(uiop); } else { - uiop->uio_iov->iov_base += uiosiz; - uiop->uio_iov->iov_len -= uiosiz; + uio_iov_base_add(uiop, uiosiz); + uio_iov_len_add(uiop, -uiosiz); } siz -= uiosiz; } @@ -895,78 +896,84 @@ nfsm_mbuftouio(mrep, uiop, siz, dpos) */ int nfsm_uiotombuf(uiop, mq, siz, bpos) - register struct uio *uiop; - struct mbuf **mq; + struct uio *uiop; + mbuf_t *mq; int siz; caddr_t *bpos; { - register char *uiocp; - register struct mbuf *mp, *mp2; - register int xfer, left, mlen; - int uiosiz, clflg, rem; + char *uiocp; + mbuf_t mp, mp2; + int xfer, left, mlen, mplen; + int uiosiz, clflg, rem, error; char *cp; if (uiop->uio_iovcnt != 1) panic("nfsm_uiotombuf: iovcnt != 1"); - if (siz > MLEN) /* or should it >= MCLBYTES ?? */ + if (siz > nfs_mbuf_mlen) /* or should it >= MCLBYTES ?? */ clflg = 1; else clflg = 0; rem = nfsm_rndup(siz)-siz; mp = mp2 = *mq; + mplen = mbuf_len(mp); while (siz > 0) { - left = uiop->uio_iov->iov_len; - uiocp = uiop->uio_iov->iov_base; + // LP64todo - fix this! 
+ left = uio_iov_len(uiop); + uiocp = CAST_DOWN(caddr_t, uio_iov_base(uiop)); if (left > siz) left = siz; uiosiz = left; while (left > 0) { - mlen = M_TRAILINGSPACE(mp); + mlen = mbuf_trailingspace(mp); if (mlen == 0) { - MGET(mp, M_WAIT, MT_DATA); + mp = NULL; if (clflg) - MCLGET(mp, M_WAIT); - mp->m_len = 0; - mp2->m_next = mp; + error = mbuf_mclget(MBUF_WAITOK, MBUF_TYPE_DATA, &mp); + else + error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_DATA, &mp); + if (!error) + error = mbuf_setnext(mp2, mp); + if (error) + return (error); + mplen = 0; mp2 = mp; - mlen = M_TRAILINGSPACE(mp); + mlen = mbuf_trailingspace(mp); } xfer = (left > mlen) ? mlen : left; -#ifdef notdef - /* Not Yet.. */ - if (uiop->uio_iov->iov_op != NULL) - (*(uiop->uio_iov->iov_op)) - (uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); + if (UIO_SEG_IS_USER_SPACE(uiop->uio_segflg)) + copyin(CAST_USER_ADDR_T(uiocp), (caddr_t)mbuf_data(mp) + mplen, xfer); else -#endif - if (uiop->uio_segflg == UIO_SYSSPACE) - bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); - else - copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); - mp->m_len += xfer; + bcopy(uiocp, (caddr_t)mbuf_data(mp) + mplen, xfer); + mplen += xfer; + mbuf_setlen(mp, mplen); left -= xfer; uiocp += xfer; uiop->uio_offset += xfer; - uiop->uio_resid -= xfer; + uio_uio_resid_add(uiop, -xfer); } - uiop->uio_iov->iov_base += uiosiz; - uiop->uio_iov->iov_len -= uiosiz; + uio_iov_base_add(uiop, uiosiz); + uio_iov_len_add(uiop, -uiosiz); siz -= uiosiz; } if (rem > 0) { - if (rem > M_TRAILINGSPACE(mp)) { - MGET(mp, M_WAIT, MT_DATA); - mp->m_len = 0; - mp2->m_next = mp; + if (rem > mbuf_trailingspace(mp)) { + error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_DATA, &mp); + if (!error) + error = mbuf_setnext(mp2, mp); + if (error) + return (error); + mplen = 0; } - cp = mtod(mp, caddr_t)+mp->m_len; + cp = (caddr_t)mbuf_data(mp) + mplen; for (left = 0; left < rem; left++) *cp++ = '\0'; - mp->m_len += rem; + mplen += rem; + mbuf_setlen(mp, mplen); *bpos = cp; - } else - *bpos = mtod(mp, caddr_t)+mp->m_len; + } else { + *bpos = (caddr_t)mbuf_data(mp) + mplen; + } *mq = mp; return (0); } @@ -979,60 +986,73 @@ nfsm_uiotombuf(uiop, mq, siz, bpos) */ int nfsm_disct(mdp, dposp, siz, left, cp2) - struct mbuf **mdp; + mbuf_t *mdp; caddr_t *dposp; int siz; int left; caddr_t *cp2; { - register struct mbuf *mp, *mp2; - register int siz2, xfer; - register caddr_t p; + mbuf_t mp, mp2; + int siz2, xfer, error, mp2len; + caddr_t p, mp2data; mp = *mdp; while (left == 0) { - *mdp = mp = mp->m_next; + *mdp = mp = mbuf_next(mp); if (mp == NULL) return (EBADRPC); - left = mp->m_len; - *dposp = mtod(mp, caddr_t); + left = mbuf_len(mp); + *dposp = mbuf_data(mp); } if (left >= siz) { *cp2 = *dposp; *dposp += siz; - } else if (mp->m_next == NULL) { + } else if (mbuf_next(mp) == NULL) { return (EBADRPC); - } else if (siz > MHLEN) { + } else if (siz > nfs_mbuf_mhlen) { panic("nfs S too big"); } else { - MGET(mp2, M_WAIT, MT_DATA); - mp2->m_next = mp->m_next; - mp->m_next = mp2; - mp->m_len -= left; + error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_DATA, &mp2); + if (error) + return (error); + error = mbuf_setnext(mp2, mbuf_next(mp)); + if (!error) + error = mbuf_setnext(mp, mp2); + if (error) { + mbuf_free(mp2); + return (error); + } + mbuf_setlen(mp, mbuf_len(mp) - left); mp = mp2; - *cp2 = p = mtod(mp, caddr_t); + *cp2 = p = mbuf_data(mp); bcopy(*dposp, p, left); /* Copy what was left */ siz2 = siz-left; p += left; - mp2 = mp->m_next; + mp2 = mbuf_next(mp); + mp2data = mbuf_data(mp2); + mp2len = mbuf_len(mp2); /* Loop around copying up the 
siz2 bytes */ while (siz2 > 0) { if (mp2 == NULL) return (EBADRPC); - xfer = (siz2 > mp2->m_len) ? mp2->m_len : siz2; + xfer = (siz2 > mp2len) ? mp2len : siz2; if (xfer > 0) { - bcopy(mtod(mp2, caddr_t), p, xfer); - NFSMADV(mp2, xfer); - mp2->m_len -= xfer; + bcopy(mp2data, p, xfer); + mp2data += xfer; + mp2len -= xfer; + mbuf_setdata(mp2, mp2data, mp2len); p += xfer; siz2 -= xfer; } - if (siz2 > 0) - mp2 = mp2->m_next; + if (siz2 > 0) { + mp2 = mbuf_next(mp2); + mp2data = mbuf_data(mp2); + mp2len = mbuf_len(mp2); + } } - mp->m_len = siz; + mbuf_setlen(mp, siz); *mdp = mp2; - *dposp = mtod(mp2, caddr_t); + *dposp = mp2data; } return (0); } @@ -1042,25 +1062,25 @@ nfsm_disct(mdp, dposp, siz, left, cp2) */ int nfs_adv(mdp, dposp, offs, left) - struct mbuf **mdp; + mbuf_t *mdp; caddr_t *dposp; int offs; int left; { - register struct mbuf *m; - register int s; + mbuf_t m; + int s; m = *mdp; s = left; while (s < offs) { offs -= s; - m = m->m_next; + m = mbuf_next(m); if (m == NULL) return (EBADRPC); - s = m->m_len; + s = mbuf_len(m); } *mdp = m; - *dposp = mtod(m, caddr_t)+offs; + *dposp = (caddr_t)mbuf_data(m) + offs; return (0); } @@ -1069,64 +1089,74 @@ nfs_adv(mdp, dposp, offs, left) */ int nfsm_strtmbuf(mb, bpos, cp, siz) - struct mbuf **mb; + mbuf_t *mb; char **bpos; char *cp; long siz; { - register struct mbuf *m1 = 0, *m2; - long left, xfer, len, tlen; + mbuf_t m1 = NULL, m2; + long left, xfer, len, tlen, mlen; u_long *tl; - int putsize; + int putsize, error; putsize = 1; m2 = *mb; - left = M_TRAILINGSPACE(m2); - if (left > 0) { + left = mbuf_trailingspace(m2); + if (left >= NFSX_UNSIGNED) { tl = ((u_long *)(*bpos)); *tl++ = txdr_unsigned(siz); putsize = 0; left -= NFSX_UNSIGNED; - m2->m_len += NFSX_UNSIGNED; + len = mbuf_len(m2); + len += NFSX_UNSIGNED; + mbuf_setlen(m2, len); if (left > 0) { bcopy(cp, (caddr_t) tl, left); siz -= left; cp += left; - m2->m_len += left; + len += left; + mbuf_setlen(m2, len); left = 0; } } /* Loop around adding mbufs */ while (siz > 0) { - MGET(m1, M_WAIT, MT_DATA); - if (siz > MLEN) - MCLGET(m1, M_WAIT); - m1->m_len = NFSMSIZ(m1); - m2->m_next = m1; + m1 = NULL; + if (siz > nfs_mbuf_mlen) + error = mbuf_mclget(MBUF_WAITOK, MBUF_TYPE_DATA, &m1); + else + error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_DATA, &m1); + if (!error) + error = mbuf_setnext(m2, m1); + if (error) + return (error); + mlen = mbuf_maxlen(m1); + mbuf_setlen(m1, mlen); m2 = m1; - tl = mtod(m1, u_long *); + tl = mbuf_data(m1); tlen = 0; if (putsize) { *tl++ = txdr_unsigned(siz); - m1->m_len -= NFSX_UNSIGNED; + mlen -= NFSX_UNSIGNED; + mbuf_setlen(m1, mlen); tlen = NFSX_UNSIGNED; putsize = 0; } - if (siz < m1->m_len) { + if (siz < mlen) { len = nfsm_rndup(siz); xfer = siz; if (xfer < len) *(tl+(xfer>>2)) = 0; } else { - xfer = len = m1->m_len; + xfer = len = mlen; } bcopy(cp, (caddr_t) tl, xfer); - m1->m_len = len+tlen; + mbuf_setlen(m1, len + tlen); siz -= xfer; cp += xfer; } *mb = m1; - *bpos = mtod(m1, caddr_t)+m1->m_len; + *bpos = (caddr_t)mbuf_data(m1) + mbuf_len(m1); return (0); } @@ -1134,10 +1164,9 @@ nfsm_strtmbuf(mb, bpos, cp, siz) * Called once to initialize data structures... */ int -nfs_init(vfsp) - struct vfsconf *vfsp; +nfs_init(struct vfsconf *vfsp) { - register int i; + int i; /* * Check to see if major data structures haven't bloated. 
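Throughout the hunks above, direct m_next/m_len/mtod() field access is replaced by accessor calls, since struct mbuf is opaque to KPI clients. A minimal sketch of the chain-walking idiom, under the same assumptions (example_chain_len is hypothetical, not part of this patch):

#include <sys/kpi_mbuf.h>

/*
 * Hypothetical example: total the bytes in an mbuf chain using the
 * accessors (mbuf_next/mbuf_len) this patch adopts in place of
 * direct m_next/m_len field access.
 */
static size_t
example_chain_len(mbuf_t m)
{
	size_t total = 0;

	for (; m != NULL; m = mbuf_next(m))
		total += mbuf_len(m);
	return (total);
}

nfsm_strtmbuf() and nfs_adv() above walk chains the same way, and wherever links are rewritten the patch checks mbuf_setnext() for failure instead of assigning m_next directly.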
@@ -1158,6 +1187,7 @@ nfs_init(vfsp) printf("struct nfsuid bloated (> %dbytes)\n",NFS_UIDALLOC); printf("Try unionizing the nu_nickname and nu_flag fields\n"); } + nfs_mount_type = vfsp->vfc_typenum; nfsrtt.pos = 0; rpc_vers = txdr_unsigned(RPC_VER2); @@ -1170,37 +1200,57 @@ nfs_init(vfsp) rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX); rpc_auth_kerb = txdr_unsigned(RPCAUTH_KERB4); nfs_prog = txdr_unsigned(NFS_PROG); - nqnfs_prog = txdr_unsigned(NQNFS_PROG); nfs_true = txdr_unsigned(TRUE); nfs_false = txdr_unsigned(FALSE); nfs_xdrneg1 = txdr_unsigned(-1); + nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfs_ticks < 1) nfs_ticks = 1; /* Ensure async daemons disabled */ for (i = 0; i < NFS_MAXASYNCDAEMON; i++) { - nfs_iodwant[i] = (struct proc *)0; + nfs_iodwant[i] = NULL; nfs_iodmount[i] = (struct nfsmount *)0; } + /* init nfsiod mutex */ + nfs_iod_lck_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(nfs_iod_lck_grp_attr); + nfs_iod_lck_grp = lck_grp_alloc_init("nfs_iod", nfs_iod_lck_grp_attr); + nfs_iod_lck_attr = lck_attr_alloc_init(); + nfs_iod_mutex = lck_mtx_alloc_init(nfs_iod_lck_grp, nfs_iod_lck_attr); + nfs_nbinit(); /* Init the nfsbuf table */ nfs_nhinit(); /* Init the nfsnode table */ nfs_lockinit(); /* Init the nfs lock state */ + #ifndef NFS_NOSERVER + /* init nfsd mutex */ + nfsd_lck_grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(nfsd_lck_grp_attr); + nfsd_lck_grp = lck_grp_alloc_init("nfsd", nfsd_lck_grp_attr); + nfsd_lck_attr = lck_attr_alloc_init(); + nfsd_mutex = lck_mtx_alloc_init(nfsd_lck_grp, nfsd_lck_attr); + + /* init slp rwlock */ + nfs_slp_lock_attr = lck_attr_alloc_init(); + nfs_slp_group_attr = lck_grp_attr_alloc_init(); + nfs_slp_rwlock_group = lck_grp_alloc_init("nfs-slp-rwlock", nfs_slp_group_attr); + nfs_slp_mutex_group = lck_grp_alloc_init("nfs-slp-mutex", nfs_slp_group_attr); + + /* init export data structures */ + nfsexphashtbl = hashinit(8, M_TEMP, &nfsexphash); + LIST_INIT(&nfs_exports); + nfs_export_lock_attr = lck_attr_alloc_init(); + nfs_export_group_attr = lck_grp_attr_alloc_init(); + nfs_export_rwlock_group = lck_grp_alloc_init("nfs-export-rwlock", nfs_export_group_attr); + lck_rw_init(&nfs_export_rwlock, nfs_export_rwlock_group, nfs_export_lock_attr); + + lck_mtx_lock(nfsd_mutex); nfsrv_init(0); /* Init server data structures */ nfsrv_initcache(); /* Init the server request cache */ + lck_mtx_unlock(nfsd_mutex); #endif - /* - * Initialize the nqnfs server stuff. - */ - if (nqnfsstarttime == 0) { - nqnfsstarttime = boottime.tv_sec + nqsrv_maxlease - + nqsrv_clockskew + nqsrv_writeslack; - NQLOADNOVRAM(nqnfsstarttime); - CIRCLEQ_INIT(&nqtimerhead); - nqfhhashtbl = hashinit(NQLCHSZ, M_NQLEASE, &nqfhhash); - } - /* * Initialize reply list and start timer */ @@ -1208,83 +1258,52 @@ nfs_init(vfsp) nfs_timer(0); - -/* XXX CSM 12/4/97 Where are these declared in FreeBSD? */ -#ifdef notyet - /* - * Set up lease_check and lease_updatetime so that other parts - * of the system can call us, if we are loadable. - */ -#ifndef NFS_NOSERVER - default_vnodeop_p[VOFFSET(vop_lease)] = (vop_t *)nqnfs_vop_lease_check; -#endif - lease_updatetime = nfs_lease_updatetime; -#endif vfsp->vfc_refcount++; /* make us non-unloadable */ return (0); } /* - * Attribute cache routines. 
- * nfs_loadattrcache() - loads or updates the cache contents from attributes - * that are on the mbuf list - * nfs_getattrcache() - returns valid attributes if found in cache, returns - * error otherwise + * initialize NFS's cache of mbuf constants */ +void +nfs_mbuf_init(void) +{ + struct mbuf_stat ms; + + mbuf_stats(&ms); + nfs_mbuf_mlen = ms.mlen; + nfs_mbuf_mhlen = ms.mhlen; + nfs_mbuf_minclsize = ms.minclsize; + nfs_mbuf_mclbytes = ms.mclbytes; +} /* - * Load the attribute cache (that lives in the nfsnode entry) with - * the values on the mbuf list and - * Iff vap not NULL - * copy the attributes to *vaper + * Parse the attributes that are in the mbuf list and store them in *nvap. */ int -nfs_loadattrcache(vpp, mdp, dposp, vaper, dontshrink, xidp) - struct vnode **vpp; - struct mbuf **mdp; - caddr_t *dposp; - struct vattr *vaper; - int dontshrink; - u_int64_t *xidp; +nfs_parsefattr(mbuf_t *mdp, caddr_t *dposp, int v3, struct nfs_vattr *nvap) { - register struct vnode *vp = *vpp; - register struct vattr *vap; - register struct nfs_fattr *fp; - register struct nfsnode *np; - register long t1; + struct nfs_fattr *fp; + long t1; caddr_t cp2; int error = 0, rdev; - struct mbuf *md; - enum vtype vtyp; + mbuf_t md; + enum vtype vtype; u_short vmode; - struct timespec mtime; - struct timeval now; - struct vnode *nvp; - int v3; - - FSDBG_TOP(527, vp, 0, *xidp >> 32, *xidp); - if (!VFSTONFS(vp->v_mount)) { - FSDBG_BOT(527, ENXIO, 1, 0, *xidp); - return (ENXIO); - } - - v3 = NFS_ISV3(vp); md = *mdp; - t1 = (mtod(md, caddr_t) + md->m_len) - *dposp; + t1 = ((caddr_t)mbuf_data(md) + mbuf_len(md)) - *dposp; if ((error = nfsm_disct(mdp, dposp, NFSX_FATTR(v3), t1, &cp2))) { - FSDBG_BOT(527, error, 2, 0, *xidp); return (error); } fp = (struct nfs_fattr *)cp2; if (v3) { - vtyp = nfsv3tov_type(fp->fa_type); + vtype = nfsv3tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); rdev = makedev(fxdr_unsigned(int, fp->fa3_rdev.specdata1), fxdr_unsigned(int, fp->fa3_rdev.specdata2)); - fxdr_nfsv3time(&fp->fa3_mtime, &mtime); } else { - vtyp = nfsv2tov_type(fp->fa_type); + vtype = nfsv2tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); /* * XXX @@ -1305,26 +1324,79 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper, dontshrink, xidp) * contain any type information (while also introduing sockets * and FIFOs for fa_type). */ - if (vtyp == VNON || (vtyp == VREG && (vmode & S_IFMT) != 0)) - vtyp = IFTOVT(vmode); + if (vtype == VNON || (vtype == VREG && (vmode & S_IFMT) != 0)) + vtype = IFTOVT(vmode); rdev = fxdr_unsigned(long, fp->fa2_rdev); - fxdr_nfsv2time(&fp->fa2_mtime, &mtime); - /* * Really ugly NFSv2 kludge. 
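 * (That is: by convention, servers speaking plain NFSv2 have no distinct
 * on-the-wire FIFO type and hand back a character device whose rdev is
 * all ones; the check below maps that encoding back to VFIFO.)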
*/ - if (vtyp == VCHR && rdev == 0xffffffff) - vtyp = VFIFO; + if (vtype == VCHR && rdev == (int)0xffffffff) + vtype = VFIFO; + } + + nvap->nva_type = vtype; + nvap->nva_mode = (vmode & 07777); + nvap->nva_rdev = (dev_t)rdev; + nvap->nva_nlink = (uint64_t)fxdr_unsigned(u_long, fp->fa_nlink); + nvap->nva_uid = fxdr_unsigned(uid_t, fp->fa_uid); + nvap->nva_gid = fxdr_unsigned(gid_t, fp->fa_gid); + if (v3) { + fxdr_hyper(&fp->fa3_size, &nvap->nva_size); + nvap->nva_blocksize = 16*1024; + fxdr_hyper(&fp->fa3_used, &nvap->nva_bytes); + fxdr_hyper(&fp->fa3_fileid, &nvap->nva_fileid); + fxdr_nfsv3time(&fp->fa3_atime, &nvap->nva_atime); + fxdr_nfsv3time(&fp->fa3_mtime, &nvap->nva_mtime); + fxdr_nfsv3time(&fp->fa3_ctime, &nvap->nva_ctime); + } else { + nvap->nva_size = fxdr_unsigned(u_long, fp->fa2_size); + nvap->nva_blocksize = fxdr_unsigned(long, fp->fa2_blocksize); + nvap->nva_bytes = fxdr_unsigned(long, fp->fa2_blocks) * NFS_FABLKSIZE; + nvap->nva_fileid = (uint64_t)fxdr_unsigned(u_long, fp->fa2_fileid); + fxdr_nfsv2time(&fp->fa2_atime, &nvap->nva_atime); + fxdr_nfsv2time(&fp->fa2_mtime, &nvap->nva_mtime); + fxdr_nfsv2time(&fp->fa2_ctime, &nvap->nva_ctime); + } + + return (0); +} + +/* + * Load the attribute cache (that lives in the nfsnode entry) with + * the value pointed to by nvap, unless the file type in the attribute + * cache doesn't match the file type in the nvap, in which case log a + * warning and return ESTALE. + * + * If the dontshrink flag is set, then it's not safe to call ubc_setsize() + * to shrink the size of the file. + */ +int +nfs_loadattrcache( + struct nfsnode *np, + struct nfs_vattr *nvap, + u_int64_t *xidp, + int dontshrink) +{ + mount_t mp; + vnode_t vp; + struct timeval now; + struct nfs_vattr *npnvap; + + if (np->n_flag & NINIT) { + vp = NULL; + mp = np->n_mount; + } else { + vp = NFSTOV(np); + mp = vnode_mount(vp); + } + + FSDBG_TOP(527, vp, np, *xidp >> 32, *xidp); + + if (!VFSTONFS(mp)) { + FSDBG_BOT(527, ENXIO, 1, 0, *xidp); + return (ENXIO); } - /* - * If v_type == VNON it is a new node, so fill in the v_type, - * n_mtime fields. Check to see if it represents a special - * device, and if so, check for a possible alias. Once the - * correct vnode has been obtained, fill in the rest of the - * information. - */ - np = VTONFS(vp); if (*xidp < np->n_xid) { /* * We have already updated attributes with a response from @@ -1336,407 +1408,417 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper, dontshrink, xidp) * to indicate the attributes were dropped - only getattr * cares - it needs to retry the rpc. */ - np->n_xid = 0; + NATTRINVALIDATE(np); FSDBG_BOT(527, 0, np, np->n_xid, *xidp); *xidp = 0; return (0); } - if (vp->v_type != vtyp) { - if (vp->v_type != VNON) { - /* - * The filehandle has changed type on us. This can be - * caused by either the server not having unique filehandles - * or because another client has removed the previous - * filehandle and a new object (of a different type) - * has been created with the same filehandle. - * - * We can't simply switch the type on the vnode because - * there may be type-specific fields that need to be - * cleaned up or set up. - * - * So, what should we do with this vnode? - * - * About the best we can do is log a warning and return - * an error. ESTALE is about the closest error, but it - * is a little strange that we come up with this error - * internally instead of simply passing it through from - * the server. Hopefully, the vnode will be reclaimed - * soon so the filehandle can be reincarnated as the new - * object type. 
- */ - printf("nfs loadattrcache vnode changed type, was %d now %d", vp->v_type, vtyp); - FSDBG_BOT(527, ESTALE, 3, 0, *xidp); - return (ESTALE); - } else { - vp->v_type = vtyp; - } - if (vp->v_type == VFIFO) { - vp->v_op = fifo_nfsv2nodeop_p; - } - if (vp->v_type == VCHR || vp->v_type == VBLK) { - vp->v_op = spec_nfsv2nodeop_p; - nvp = checkalias(vp, (dev_t)rdev, vp->v_mount); - if (nvp) { - /* - * Discard unneeded vnode, but save its nfsnode. - * Since the nfsnode does not have a lock, its - * vnode lock has to be carried over. - */ - nvp->v_vnlock = vp->v_vnlock; - vp->v_vnlock = NULL; - nvp->v_data = vp->v_data; - vp->v_data = NULL; - vp->v_op = spec_vnodeop_p; - vrele(vp); - vgone(vp); - /* - * Reinitialize aliased node. - */ - np->n_vnode = nvp; - *vpp = vp = nvp; - } - } - np->n_mtime = mtime.tv_sec; - if (vp->v_type == VDIR) - np->n_ncmtime = mtime.tv_sec; - FSDBG(527, vp, np->n_mtime, 0, 0); - } - np->n_xid = *xidp; - vap = &np->n_vattr; - vap->va_type = vtyp; - vap->va_mode = (vmode & 07777); - vap->va_rdev = (dev_t)rdev; - vap->va_mtime = mtime; - vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; - if (v3) { - vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); - vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); - vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); - fxdr_hyper(&fp->fa3_size, &vap->va_size); - vap->va_blocksize = 16*1024; - fxdr_hyper(&fp->fa3_used, &vap->va_bytes); - vap->va_fileid = fxdr_unsigned(int, fp->fa3_fileid.nfsuquad[1]); - fxdr_nfsv3time(&fp->fa3_atime, &vap->va_atime); - fxdr_nfsv3time(&fp->fa3_ctime, &vap->va_ctime); - vap->va_flags = 0; - vap->va_filerev = 0; - } else { - vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); - vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); - vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); - vap->va_size = fxdr_unsigned(u_long, fp->fa2_size); - vap->va_blocksize = fxdr_unsigned(long, fp->fa2_blocksize); - vap->va_bytes = fxdr_unsigned(long, fp->fa2_blocks) * NFS_FABLKSIZE; - vap->va_fileid = fxdr_unsigned(long, fp->fa2_fileid); - fxdr_nfsv2time(&fp->fa2_atime, &vap->va_atime); - vap->va_flags = 0; - vap->va_ctime.tv_sec = fxdr_unsigned(long, fp->fa2_ctime.nfsv2_sec); - vap->va_ctime.tv_nsec = 0; - vap->va_gen = fxdr_unsigned(u_long, fp->fa2_ctime.nfsv2_usec); - vap->va_filerev = 0; + if (vp && (nvap->nva_type != vnode_vtype(vp))) { + /* + * The filehandle has changed type on us. This can be + * caused by either the server not having unique filehandles + * or because another client has removed the previous + * filehandle and a new object (of a different type) + * has been created with the same filehandle. + * + * We can't simply switch the type on the vnode because + * there may be type-specific fields that need to be + * cleaned up or set up. + * + * So, what should we do with this vnode? + * + * About the best we can do is log a warning and return + * an error. ESTALE is about the closest error, but it + * is a little strange that we come up with this error + * internally instead of simply passing it through from + * the server. Hopefully, the vnode will be reclaimed + * soon so the filehandle can be reincarnated as the new + * object type. 
+ */ + printf("nfs loadattrcache vnode changed type, was %d now %d\n", + vnode_vtype(vp), nvap->nva_type); + FSDBG_BOT(527, ESTALE, 3, 0, *xidp); + return (ESTALE); } microuptime(&now); np->n_attrstamp = now.tv_sec; + np->n_xid = *xidp; - if (UBCINFOMISSING(vp) || UBCINFORECLAIMED(vp)) { - if (UBCINFORECLAIMED(vp) && ISSET(vp->v_flag, (VXLOCK|VORECLAIM))) { - // vnode is being vclean'ed, abort - FSDBG_BOT(527, ENXIO, 1, 0, *xidp); - return (ENXIO); - } - if ((error = ubc_info_init(vp))) { /* VREG */ - FSDBG_BOT(527, error, 3, 0, *xidp); - return(error); - } - } - - if (vap->va_size != np->n_size) { - FSDBG(527, vp, vap->va_size, np->n_size, - (vap->va_type == VREG) | - (np->n_flag & NMODIFIED ? 6 : 4)); - if (vap->va_type == VREG) { - int orig_size; + npnvap = &np->n_vattr; + nvap->nva_fsid = vfs_statfs(mp)->f_fsid.val[0]; + bcopy((caddr_t)nvap, (caddr_t)npnvap, sizeof(*nvap)); - orig_size = np->n_size; - if (np->n_flag & NMODIFIED) { - if (vap->va_size < np->n_size) - vap->va_size = np->n_size; - else - np->n_size = vap->va_size; + if (vp) { + if (nvap->nva_size != np->n_size) { + FSDBG(527, vp, nvap->nva_size, np->n_size, + (nvap->nva_type == VREG) | + (np->n_flag & NMODIFIED ? 6 : 4)); + if (nvap->nva_type == VREG) { + int orig_size = np->n_size; + if (np->n_flag & NMODIFIED) { + if (nvap->nva_size < np->n_size) + nvap->nva_size = np->n_size; + else + np->n_size = nvap->nva_size; + } else + np->n_size = nvap->nva_size; + if (!UBCINFOEXISTS(vp) || + (dontshrink && np->n_size < (u_quad_t)ubc_getsize(vp))) { + nvap->nva_size = np->n_size = orig_size; + NATTRINVALIDATE(np); + } else { + ubc_setsize(vp, (off_t)np->n_size); /* XXX */ + } } else - np->n_size = vap->va_size; - if (!UBCINFOEXISTS(vp) || - dontshrink && np->n_size < ubc_getsize(vp)) { - vap->va_size = np->n_size = orig_size; - np->n_xid = 0; - } else { - ubc_setsize(vp, (off_t)np->n_size); /* XXX */ - } - } else - np->n_size = vap->va_size; + np->n_size = nvap->nva_size; + } + } else { + np->n_size = nvap->nva_size; } - if (vaper != NULL) { - bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); - if (np->n_flag & NCHG) { - if (np->n_flag & NACC) - vaper->va_atime = np->n_atim; - if (np->n_flag & NUPD) - vaper->va_mtime = np->n_mtim; - } + if (np->n_flag & NCHG) { + if (np->n_flag & NACC) + nvap->nva_atime = np->n_atim; + if (np->n_flag & NUPD) + nvap->nva_mtime = np->n_mtim; } + FSDBG_BOT(527, 0, np, 0, *xidp); return (0); } /* - * Check the time stamp - * If the cache is valid, copy contents to *vap and return 0 - * otherwise return an error + * Calculate the attribute timeout based on + * how recently the file has been modified. */ int -nfs_getattrcache(vp, vaper) - register struct vnode *vp; - struct vattr *vaper; +nfs_attrcachetimeout(vnode_t vp) { - register struct nfsnode *np = VTONFS(vp); - register struct vattr *vap; - struct timeval now, nowup; - int32_t timeo; + struct nfsnode *np = VTONFS(vp); + struct nfsmount *nmp; + struct timeval now; + int isdir, timeo; - if (np->n_xid == 0) { - FSDBG(528, vp, 0, 0, 0); - nfsstats.attrcache_misses++; - return (ENOENT); - } + if (!(nmp = VFSTONFS(vnode_mount(vp)))) + return (0); + + isdir = vnode_isdir(vp); - /* Set attribute timeout based on how recently the file has been modified. */ if ((np)->n_flag & NMODIFIED) - timeo = NFS_MINATTRTIMO; + timeo = isdir ? 
nmp->nm_acdirmin : nmp->nm_acregmin;
 	else {
 		/* Note that if the client and server clocks are way out of sync, */
 		/* timeout will probably get clamped to a min or max value */
 		microtime(&now);
-		timeo = (now.tv_sec - (np)->n_mtime) / 10;
-		if (timeo < NFS_MINATTRTIMO)
-			timeo = NFS_MINATTRTIMO;
-		else if (timeo > NFS_MAXATTRTIMO)
-			timeo = NFS_MAXATTRTIMO;
+		timeo = (now.tv_sec - (np)->n_mtime.tv_sec) / 10;
+		if (isdir) {
+			if (timeo < nmp->nm_acdirmin)
+				timeo = nmp->nm_acdirmin;
+			else if (timeo > nmp->nm_acdirmax)
+				timeo = nmp->nm_acdirmax;
+		} else {
+			if (timeo < nmp->nm_acregmin)
+				timeo = nmp->nm_acregmin;
+			else if (timeo > nmp->nm_acregmax)
+				timeo = nmp->nm_acregmax;
+		}
 	}
+	return (timeo);
+}
+
+/*
+ * Check the time stamp
+ * If the cache is valid, copy contents to *nvaper and return 0
+ * otherwise return an error
+ */
+int
+nfs_getattrcache(vp, nvaper)
+	vnode_t vp;
+	struct nfs_vattr *nvaper;
+{
+	struct nfsnode *np = VTONFS(vp);
+	struct nfs_vattr *nvap;
+	struct timeval nowup;
+	int32_t timeo;
+
+	if (!NATTRVALID(np)) {
+		FSDBG(528, vp, 0, 0, 0);
+		OSAddAtomic(1, (SInt32*)&nfsstats.attrcache_misses);
+		return (ENOENT);
+	}
+
+	timeo = nfs_attrcachetimeout(vp);
+
 	microuptime(&nowup);
 	if ((nowup.tv_sec - np->n_attrstamp) >= timeo) {
 		FSDBG(528, vp, 0, 0, 1);
-		nfsstats.attrcache_misses++;
+		OSAddAtomic(1, (SInt32*)&nfsstats.attrcache_misses);
 		return (ENOENT);
 	}
 	FSDBG(528, vp, 0, 0, 2);
-	nfsstats.attrcache_hits++;
-	vap = &np->n_vattr;
+	OSAddAtomic(1, (SInt32*)&nfsstats.attrcache_hits);
+	nvap = &np->n_vattr;
-	if (vap->va_size != np->n_size) {
-		FSDBG(528, vp, vap->va_size, np->n_size,
-			(vap->va_type == VREG) |
+	if (nvap->nva_size != np->n_size) {
+		FSDBG(528, vp, nvap->nva_size, np->n_size,
+			(nvap->nva_type == VREG) |
 			(np->n_flag & NMODIFIED ? 6 : 4));
-		if (vap->va_type == VREG) {
+		if (nvap->nva_type == VREG) {
 			if (np->n_flag & NMODIFIED) {
-				if (vap->va_size < np->n_size)
-					vap->va_size = np->n_size;
+				if (nvap->nva_size < np->n_size)
+					nvap->nva_size = np->n_size;
 				else
-					np->n_size = vap->va_size;
+					np->n_size = nvap->nva_size;
 			} else
-				np->n_size = vap->va_size;
+				np->n_size = nvap->nva_size;
 			ubc_setsize(vp, (off_t)np->n_size); /* XXX */
 		} else
-			np->n_size = vap->va_size;
+			np->n_size = nvap->nva_size;
 	}
-	bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr));
+	bcopy((caddr_t)nvap, (caddr_t)nvaper, sizeof(struct nfs_vattr));
 	if (np->n_flag & NCHG) {
 		if (np->n_flag & NACC)
-			vaper->va_atime = np->n_atim;
+			nvaper->nva_atime = np->n_atim;
 		if (np->n_flag & NUPD)
-			vaper->va_mtime = np->n_mtim;
+			nvaper->nva_mtime = np->n_mtim;
 	}
 	return (0);
 }

 #ifndef NFS_NOSERVER
 /*
- * Set up nameidata for a lookup() call and do it.
- *
- * If pubflag is set, this call is done for a lookup operation on the
- * public filehandle. In that case we allow crossing mountpoints and
- * absolute pathnames. However, the caller is expected to check that
- * the lookup result is within the public fs, and deny access if
- * it is not.
+ * Extract a lookup path from the given mbufs and store it in
+ * a newly allocated buffer saved in the given nameidata structure.
+ * The expected string length is given as *lenp and the final string
+ * length (after any WebNFS processing) is returned in *lenp.
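+ * For example, with the WEBNFS_ENABLED code below compiled in and
+ * pubflag set, an escaped path such as "a%20b" arriving with
+ * *lenp == 5 is stored as "a b" and *lenp is returned as 3.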
*/ int -nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, retdirp, p, kerbflag, pubflag) - register struct nameidata *ndp; - fhandle_t *fhp; - int len; - struct nfssvc_sock *slp; - struct mbuf *nam; - struct mbuf **mdp; - caddr_t *dposp; - struct vnode **retdirp; - struct proc *p; - int kerbflag, pubflag; +nfsm_path_mbuftond( + mbuf_t *mdp, + caddr_t *dposp, + __unused int v3, + __unused int pubflag, + int* lenp, + struct nameidata *ndp) { - register int i, rem; - register struct mbuf *md; - register char *fromcp, *tocp, *cp; - struct iovec aiov; - struct uio auio; - struct vnode *dp; - int error, rdonly, linklen; + int i, len, len2, rem, error = 0; + mbuf_t md; + char *fromcp, *tocp; struct componentname *cnp = &ndp->ni_cnd; - int olen = len; - char *tmppn; - - *retdirp = (struct vnode *)0; +/* XXX Revisit when enabling WebNFS */ +#ifdef WEBNFS_ENABLED + int webcnt = 0, digitcnt = 0; + char hexdigits[2]; +#endif - if (len > MAXPATHLEN - 1) + len = *lenp; + if (len > (MAXPATHLEN - 1)) return (ENAMETOOLONG); - MALLOC_ZONE(cnp->cn_pnbuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + /* + * Get a buffer for the name to be translated, and copy the + * name into the buffer. + */ + MALLOC_ZONE(cnp->cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (!cnp->cn_pnbuf) + return (ENOMEM); cnp->cn_pnlen = MAXPATHLEN; + cnp->cn_flags |= HASBUF; /* - * Copy the name from the mbuf list to ndp->ni_pnbuf - * and set the various ndp fields appropriately. + * Copy the name from the mbuf list to the string + * + * Along the way, take note of any WebNFS characters + * and convert any % escapes. */ fromcp = *dposp; tocp = cnp->cn_pnbuf; md = *mdp; - rem = mtod(md, caddr_t) + md->m_len - fromcp; - cnp->cn_hash = 0; + rem = (caddr_t)mbuf_data(md) + mbuf_len(md) - fromcp; for (i = 1; i <= len; i++) { while (rem == 0) { - md = md->m_next; + md = mbuf_next(md); if (md == NULL) { error = EBADRPC; goto out; } - fromcp = mtod(md, caddr_t); - rem = md->m_len; + fromcp = mbuf_data(md); + rem = mbuf_len(md); } -/* XXX CSM 12/4/97 Revisit when enabling WebNFS */ -#ifdef notdef - if (*fromcp == '\0' || (!pubflag && *fromcp == '/')) { +/* XXX Revisit when enabling WebNFS */ +#ifdef WEBNFS_ENABLED + if (pubflag) { + if ((i == 1) && ((unsigned char)*fromcp >= WEBNFS_SPECCHAR_START)) { + switch ((unsigned char)*fromcp) { + case WEBNFS_NATIVE_CHAR: + /* + * 'Native' path for us is the same + * as a path according to the NFS spec, + * just skip the escape char. + */ + webcnt++; + fromcp++; + rem--; + /* next iteration of for loop */ + continue; + /* + * More may be added in the future, range 0x80-0xff. + * Don't currently support security query lookup (0x81). + */ + default: + error = EIO; + goto out; + } + } + if (digitcnt) { + /* We're expecting hex digits */ + if (!ISHEX(*fromcp)) { + error = ENOENT; + goto out; + } + digitcnt--; + hexdigits[digitcnt ? 0 : 1] = *fromcp++; + if (!digitcnt) + *tocp++ = HEXSTRTOI(hexdigits); + rem--; + /* next iteration of for loop */ + continue; + } else if (*fromcp == WEBNFS_ESC_CHAR) { + /* + * We can't really look at the next couple + * bytes here safely/easily, so we note that + * the next two characters should be hex + * digits and later save them in hexdigits[]. + * When we've got both, we'll convert it. 
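+ * (The two digits may even be split across an mbuf boundary,
+ * which is why they are collected one character at a time.)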
+ */ + digitcnt = 2; + webcnt += 2; + fromcp++; + rem--; + /* next iteration of for loop */ + continue; + } + } + if (*fromcp == '\0' || (!pubflag && *fromcp == '/')) #else - if (*fromcp == '\0' || *fromcp == '/') { + if (*fromcp == '\0' || *fromcp == '/') #endif + { error = EACCES; goto out; } - cnp->cn_hash += (unsigned char)*fromcp * i; *tocp++ = *fromcp++; rem--; } *tocp = '\0'; *mdp = md; *dposp = fromcp; - len = nfsm_rndup(len)-len; - if (len > 0) { - if (rem >= len) - *dposp += len; - else if ((error = nfs_adv(mdp, dposp, len, rem)) != 0) + len2 = nfsm_rndup(len)-len; + if (len2 > 0) { + if (rem >= len2) + *dposp += len2; + else if ((error = nfs_adv(mdp, dposp, len2, rem)) != 0) goto out; } +/* XXX Revisit when enabling WebNFS */ +#ifdef WEBNFS_ENABLED + if (pubflag) { + if (digitcnt) { + /* The string ended in the middle of an escape! */ + error = ENOENT; + goto out; + } + len -= webcnt; + } +#endif + +out: + if (error) { + if (cnp->cn_pnbuf) + FREE_ZONE(cnp->cn_pnbuf, MAXPATHLEN, M_NAMEI); + cnp->cn_flags &= ~HASBUF; + } else { + ndp->ni_pathlen = len; + *lenp = len; + } + return (error); +} + +/* + * Set up nameidata for a lookup() call and do it. + * + * If pubflag is set, this call is done for a lookup operation on the + * public filehandle. In that case we allow crossing mountpoints and + * absolute pathnames. However, the caller is expected to check that + * the lookup result is within the public fs, and deny access if + * it is not. + */ +int +nfs_namei( + struct nfsrv_descript *nfsd, + struct vfs_context *ctx, + struct nameidata *ndp, + struct nfs_filehandle *nfhp, + mbuf_t nam, + int pubflag, + vnode_t *retdirp, + struct nfs_export **nxp, + struct nfs_export_options **nxop) +{ +/* XXX Revisit when enabling WebNFS */ +#ifdef WEBNFS_ENABLED + char *cp; + uio_t auio; + char uio_buf[ UIO_SIZEOF(1) ]; + int linklen, olen = ndp->ni_pathlen; +#endif + vnode_t dp; + int error; + struct componentname *cnp = &ndp->ni_cnd; + char *tmppn; + + *retdirp = NULL; + /* * Extract and set starting directory. */ - error = nfsrv_fhtovp(fhp, FALSE, &dp, ndp->ni_cnd.cn_cred, slp, - nam, &rdonly, kerbflag, pubflag); + error = nfsrv_fhtovp(nfhp, nam, pubflag, &dp, nxp, nxop); if (error) goto out; - if (dp->v_type != VDIR) { - vrele(dp); + error = nfsrv_credcheck(nfsd, *nxp, *nxop); + if (error || (vnode_vtype(dp) != VDIR)) { + vnode_put(dp); error = ENOTDIR; goto out; } - if (rdonly) + ctx->vc_ucred = nfsd->nd_cr; + ndp->ni_cnd.cn_context = ctx; + + if (*nxop && ((*nxop)->nxo_flags & NX_READONLY)) cnp->cn_flags |= RDONLY; *retdirp = dp; -/* XXX CSM 12/4/97 Revisit when enabling WebNFS */ -#ifdef notyet - if (pubflag) { - /* - * Oh joy. For WebNFS, handle those pesky '%' escapes, - * and the 'native path' indicator. - */ - - assert(olen <= MAXPATHLEN - 1); - - MALLOC_ZONE(cp, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); - fromcp = cnp->cn_pnbuf; - tocp = cp; - if ((unsigned char)*fromcp >= WEBNFS_SPECCHAR_START) { - switch ((unsigned char)*fromcp) { - case WEBNFS_NATIVE_CHAR: - /* - * 'Native' path for us is the same - * as a path according to the NFS spec, - * just skip the escape char. - */ - fromcp++; - break; - /* - * More may be added in the future, range 0x80-0xff - */ - default: - error = EIO; - FREE_ZONE(cp, MAXPATHLEN, M_NAMEI); - goto out; - } - } - /* - * Translate the '%' escapes, URL-style. 
- */ - while (*fromcp != '\0') { - if (*fromcp == WEBNFS_ESC_CHAR) { - if (fromcp[1] != '\0' && fromcp[2] != '\0') { - fromcp++; - *tocp++ = HEXSTRTOI(fromcp); - fromcp += 2; - continue; - } else { - error = ENOENT; - FREE_ZONE(cp, MAXPATHLEN, M_NAMEI); - goto out; - } - } else - *tocp++ = *fromcp++; - } - *tocp = '\0'; - - tmppn = cnp->cn_pnbuf; - long len = cnp->cn_pnlen; - cnp->cn_pnbuf = cp; - cnp->cn_pnlen = MAXPATHLEN; - FREE_ZONE(tmppn, len, M_NAMEI); - - } -#endif - - ndp->ni_pathlen = (tocp - cnp->cn_pnbuf) + 1; - ndp->ni_segflg = UIO_SYSSPACE; - -/* XXX CSM 12/4/97 Revisit when enabling WebNFS */ -#ifdef notyet +/* XXX Revisit when enabling WebNFS */ +#ifdef WEBNFS_ENABLED if (pubflag) { ndp->ni_rootdir = rootvnode; ndp->ni_loopcnt = 0; - if (cnp->cn_pnbuf[0] == '/') + if (cnp->cn_pnbuf[0] == '/') { + vnode_put(dp); dp = rootvnode; + error = vnode_get(dp); + if (error) { + *retdirp = NULL; + goto out; + } + } } else { cnp->cn_flags |= NOCROSSMOUNT; } @@ -1744,8 +1826,7 @@ nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, retdirp, p, kerbflag, pubflag) cnp->cn_flags |= NOCROSSMOUNT; #endif - cnp->cn_proc = p; - VREF(dp); + ndp->ni_usedvp = dp; for (;;) { cnp->cn_nameptr = cnp->cn_pnbuf; @@ -1760,54 +1841,67 @@ nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, retdirp, p, kerbflag, pubflag) * Check for encountering a symbolic link */ if ((cnp->cn_flags & ISSYMLINK) == 0) { - nfsrv_object_create(ndp->ni_vp); - if (cnp->cn_flags & (SAVENAME | SAVESTART)) { - cnp->cn_flags |= HASBUF; - return (0); - } - break; + return (0); } else { - if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) - VOP_UNLOCK(ndp->ni_dvp, 0, p); -/* XXX CSM 12/4/97 Revisit when enabling WebNFS */ -#ifdef notyet + if ((cnp->cn_flags & FSNODELOCKHELD)) { + cnp->cn_flags &= ~FSNODELOCKHELD; + unlock_fsnode(ndp->ni_dvp, NULL); + } +/* XXX Revisit when enabling WebNFS */ +#ifdef WEBNFS_ENABLED if (!pubflag) { #endif - vrele(ndp->ni_dvp); - vput(ndp->ni_vp); - ndp->ni_vp = NULL; + if (cnp->cn_flags & (LOCKPARENT | WANTPARENT)) + vnode_put(ndp->ni_dvp); + if (ndp->ni_vp) { + vnode_put(ndp->ni_vp); + ndp->ni_vp = NULL; + } error = EINVAL; break; -/* XXX CSM 12/4/97 Revisit when enabling WebNFS */ -#ifdef notyet +/* XXX Revisit when enabling WebNFS */ +#ifdef WEBNFS_ENABLED } if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { + vnode_put(ndp->ni_vp); + ndp->ni_vp = NULL; error = ELOOP; break; } /* XXX assert(olen <= MAXPATHLEN - 1); */ - if (ndp->ni_pathlen > 1) + if (ndp->ni_pathlen > 1) { MALLOC_ZONE(cp, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); - else + if (!cp) { + vnode_put(ndp->ni_vp); + ndp->ni_vp = NULL; + error = ENOMEM; + break; + } + } else { cp = cnp->cn_pnbuf; - aiov.iov_base = cp; - aiov.iov_len = MAXPATHLEN; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_offset = 0; - auio.uio_rw = UIO_READ; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_procp = (struct proc *)0; - auio.uio_resid = MAXPATHLEN; - error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); + } + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, + &uio_buf[0], sizeof(uio_buf)); + if (!auio) { + vnode_put(ndp->ni_vp); + ndp->ni_vp = NULL; + if (ndp->ni_pathlen > 1) + FREE_ZONE(cp, MAXPATHLEN, M_NAMEI); + error = ENOMEM; + break; + } + uio_addiov(auio, CAST_USER_ADDR_T(cp), MAXPATHLEN); + error = VNOP_READLINK(ndp->ni_vp, auio, cnp->cn_context); if (error) { badlink: + vnode_put(ndp->ni_vp); + ndp->ni_vp = NULL; if (ndp->ni_pathlen > 1) FREE_ZONE(cp, MAXPATHLEN, M_NAMEI); break; } - linklen = MAXPATHLEN - auio.uio_resid; + linklen = MAXPATHLEN - 
uio_resid(auio); if (linklen == 0) { error = ENOENT; goto badlink; @@ -1826,15 +1920,20 @@ badlink: } else cnp->cn_pnbuf[linklen] = '\0'; ndp->ni_pathlen += linklen; - vput(ndp->ni_vp); + + vnode_put(ndp->ni_vp); dp = ndp->ni_dvp; + ndp->ni_dvp = NULL; + /* * Check if root directory should replace current directory. */ if (cnp->cn_pnbuf[0] == '/') { - vrele(dp); + vnode_put(dp); dp = ndp->ni_rootdir; - VREF(dp); + error = vnode_get(dp); + if (error) + break; } #endif } @@ -1854,13 +1953,13 @@ out: */ void nfsm_adj(mp, len, nul) - struct mbuf *mp; - register int len; + mbuf_t mp; + int len; int nul; { - register struct mbuf *m; - register int count, i; - register char *cp; + mbuf_t m, mnext; + int count, i, mlen; + char *cp; /* * Trim from tail. Scan the mbuf chain, @@ -1872,15 +1971,18 @@ nfsm_adj(mp, len, nul) count = 0; m = mp; for (;;) { - count += m->m_len; - if (m->m_next == (struct mbuf *)0) + mlen = mbuf_len(m); + count += mlen; + mnext = mbuf_next(m); + if (mnext == NULL) break; - m = m->m_next; + m = mnext; } - if (m->m_len > len) { - m->m_len -= len; + if (mlen > len) { + mlen -= len; + mbuf_setlen(m, mlen); if (nul > 0) { - cp = mtod(m, caddr_t)+m->m_len-nul; + cp = (caddr_t)mbuf_data(m) + mlen - nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } @@ -1894,20 +1996,22 @@ nfsm_adj(mp, len, nul) * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. */ - for (m = mp; m; m = m->m_next) { - if (m->m_len >= count) { - m->m_len = count; + for (m = mp; m; m = mbuf_next(m)) { + mlen = mbuf_len(m); + if (mlen >= count) { + mlen = count; + mbuf_setlen(m, count); if (nul > 0) { - cp = mtod(m, caddr_t)+m->m_len-nul; + cp = (caddr_t)mbuf_data(m) + mlen - nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } break; } - count -= m->m_len; + count -= mlen; } - for (m = m->m_next;m;m = m->m_next) - m->m_len = 0; + for (m = mbuf_next(m); m; m = mbuf_next(m)) + mbuf_setlen(m, 0); } /* @@ -1918,15 +2022,15 @@ void nfsm_srvwcc(nfsd, before_ret, before_vap, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int before_ret; - register struct vattr *before_vap; + struct vnode_attr *before_vap; int after_ret; - struct vattr *after_vap; - struct mbuf **mbp; + struct vnode_attr *after_vap; + mbuf_t *mbp; char **bposp; { - register struct mbuf *mb = *mbp, *mb2; - register char *bpos = *bposp; - register u_long *tl; + mbuf_t mb = *mbp, mb2; + char *bpos = *bposp; + u_long *tl; if (before_ret) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); @@ -1934,11 +2038,11 @@ nfsm_srvwcc(nfsd, before_ret, before_vap, after_ret, after_vap, mbp, bposp) } else { nfsm_build(tl, u_long *, 7 * NFSX_UNSIGNED); *tl++ = nfs_true; - txdr_hyper(&(before_vap->va_size), tl); + txdr_hyper(&(before_vap->va_data_size), tl); tl += 2; - txdr_nfsv3time(&(before_vap->va_mtime), tl); + txdr_nfsv3time(&(before_vap->va_modify_time), tl); tl += 2; - txdr_nfsv3time(&(before_vap->va_ctime), tl); + txdr_nfsv3time(&(before_vap->va_change_time), tl); } *bposp = bpos; *mbp = mb; @@ -1949,14 +2053,14 @@ void nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int after_ret; - struct vattr *after_vap; - struct mbuf **mbp; + struct vnode_attr *after_vap; + mbuf_t *mbp; char **bposp; { - register struct mbuf *mb = *mbp, *mb2; - register char *bpos = *bposp; - register u_long *tl; - register struct nfs_fattr *fp; + mbuf_t mb = *mbp, mb2; + char *bpos = *bposp; + u_long *tl; + struct nfs_fattr *fp; if (after_ret) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); @@ -1973,117 +2077,640 @@ 
nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp) void nfsm_srvfattr(nfsd, vap, fp) - register struct nfsrv_descript *nfsd; - register struct vattr *vap; - register struct nfs_fattr *fp; + struct nfsrv_descript *nfsd; + struct vnode_attr *vap; + struct nfs_fattr *fp; { + // XXX Should we assert here that all fields are supported? + fp->fa_nlink = txdr_unsigned(vap->va_nlink); fp->fa_uid = txdr_unsigned(vap->va_uid); fp->fa_gid = txdr_unsigned(vap->va_gid); if (nfsd->nd_flag & ND_NFSV3) { fp->fa_type = vtonfsv3_type(vap->va_type); fp->fa_mode = vtonfsv3_mode(vap->va_mode); - txdr_hyper(&vap->va_size, &fp->fa3_size); - txdr_hyper(&vap->va_bytes, &fp->fa3_used); + txdr_hyper(&vap->va_data_size, &fp->fa3_size); + txdr_hyper(&vap->va_data_alloc, &fp->fa3_used); fp->fa3_rdev.specdata1 = txdr_unsigned(major(vap->va_rdev)); fp->fa3_rdev.specdata2 = txdr_unsigned(minor(vap->va_rdev)); fp->fa3_fsid.nfsuquad[0] = 0; fp->fa3_fsid.nfsuquad[1] = txdr_unsigned(vap->va_fsid); - fp->fa3_fileid.nfsuquad[0] = 0; - fp->fa3_fileid.nfsuquad[1] = txdr_unsigned(vap->va_fileid); - txdr_nfsv3time(&vap->va_atime, &fp->fa3_atime); - txdr_nfsv3time(&vap->va_mtime, &fp->fa3_mtime); - txdr_nfsv3time(&vap->va_ctime, &fp->fa3_ctime); + txdr_hyper(&vap->va_fileid, &fp->fa3_fileid); + txdr_nfsv3time(&vap->va_access_time, &fp->fa3_atime); + txdr_nfsv3time(&vap->va_modify_time, &fp->fa3_mtime); + txdr_nfsv3time(&vap->va_change_time, &fp->fa3_ctime); } else { fp->fa_type = vtonfsv2_type(vap->va_type); fp->fa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); - fp->fa2_size = txdr_unsigned(vap->va_size); - fp->fa2_blocksize = txdr_unsigned(vap->va_blocksize); + fp->fa2_size = txdr_unsigned(vap->va_data_size); + fp->fa2_blocksize = txdr_unsigned(vap->va_iosize); if (vap->va_type == VFIFO) fp->fa2_rdev = 0xffffffff; else fp->fa2_rdev = txdr_unsigned(vap->va_rdev); - fp->fa2_blocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); + fp->fa2_blocks = txdr_unsigned(vap->va_data_alloc / NFS_FABLKSIZE); fp->fa2_fsid = txdr_unsigned(vap->va_fsid); fp->fa2_fileid = txdr_unsigned(vap->va_fileid); - txdr_nfsv2time(&vap->va_atime, &fp->fa2_atime); - txdr_nfsv2time(&vap->va_mtime, &fp->fa2_mtime); - txdr_nfsv2time(&vap->va_ctime, &fp->fa2_ctime); + txdr_nfsv2time(&vap->va_access_time, &fp->fa2_atime); + txdr_nfsv2time(&vap->va_modify_time, &fp->fa2_mtime); + txdr_nfsv2time(&vap->va_change_time, &fp->fa2_ctime); + } +} + +/* + * Build hash lists of net addresses and hang them off the NFS export. + * Called by nfsrv_export() to set up the lists of export addresses. 
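+ * A net entry with a zero-length address denotes the default (world)
+ * export; any other <address, mask> pair is added to a per-address-
+ * family radix tree that nfsrv_export_lookup() later matches client
+ * addresses against.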
+ */ +static int +nfsrv_hang_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa) +{ + struct nfs_export_net_args nxna; + struct nfs_netopt *no; + struct radix_node_head *rnh; + struct radix_node *rn; + struct sockaddr *saddr, *smask; + struct domain *dom; + int i, error; + unsigned int net; + user_addr_t uaddr; + kauth_cred_t cred; + struct ucred temp_cred; + + uaddr = unxa->nxa_nets; + for (net = 0; net < unxa->nxa_netcount; net++, uaddr += sizeof(nxna)) { + error = copyin(uaddr, &nxna, sizeof(nxna)); + if (error) + return (error); + + if (nxna.nxna_flags & (NX_MAPROOT|NX_MAPALL)) { + bzero(&temp_cred, sizeof(temp_cred)); + temp_cred.cr_uid = nxna.nxna_cred.cr_uid; + temp_cred.cr_ngroups = nxna.nxna_cred.cr_ngroups; + for (i=0; i < nxna.nxna_cred.cr_ngroups && i < NGROUPS; i++) + temp_cred.cr_groups[i] = nxna.nxna_cred.cr_groups[i]; + + cred = kauth_cred_create(&temp_cred); + if (!cred) + return (ENOMEM); + } else { + cred = NULL; + } + + if (nxna.nxna_addr.ss_len == 0) { + /* No address means this is a default/world export */ + if (nx->nx_flags & NX_DEFAULTEXPORT) + return (EEXIST); + nx->nx_flags |= NX_DEFAULTEXPORT; + nx->nx_defopt.nxo_flags = nxna.nxna_flags; + nx->nx_defopt.nxo_cred = cred; + nx->nx_expcnt++; + continue; + } + + i = sizeof(struct nfs_netopt); + i += nxna.nxna_addr.ss_len + nxna.nxna_mask.ss_len; + MALLOC(no, struct nfs_netopt *, i, M_NETADDR, M_WAITOK); + if (!no) + return (ENOMEM); + bzero(no, sizeof(struct nfs_netopt)); + no->no_opt.nxo_flags = nxna.nxna_flags; + no->no_opt.nxo_cred = cred; + + saddr = (struct sockaddr *)(no + 1); + bcopy(&nxna.nxna_addr, saddr, nxna.nxna_addr.ss_len); + if (nxna.nxna_mask.ss_len) { + smask = (struct sockaddr *)((caddr_t)saddr + nxna.nxna_addr.ss_len); + bcopy(&nxna.nxna_mask, smask, nxna.nxna_mask.ss_len); + } else { + smask = NULL; + } + i = saddr->sa_family; + if ((rnh = nx->nx_rtable[i]) == 0) { + /* + * Seems silly to initialize every AF when most are not + * used, do so on demand here + */ + for (dom = domains; dom; dom = dom->dom_next) + if (dom->dom_family == i && dom->dom_rtattach) { + dom->dom_rtattach((void **)&nx->nx_rtable[i], + dom->dom_rtoffset); + break; + } + if ((rnh = nx->nx_rtable[i]) == 0) { + kauth_cred_rele(cred); + _FREE(no, M_NETADDR); + return (ENOBUFS); + } + } + rn = (*rnh->rnh_addaddr)((caddr_t)saddr, (caddr_t)smask, rnh, no->no_rnodes); + if (rn == 0) { + /* + * One of the reasons that rnh_addaddr may fail is that + * the entry already exists. To check for this case, we + * look up the entry to see if it is there. If so, we + * do not need to make a new entry but do continue. + */ + int matched = 0; + rn = (*rnh->rnh_matchaddr)((caddr_t)saddr, rnh); + if (rn != 0 && (rn->rn_flags & RNF_ROOT) == 0 && + (((struct nfs_netopt *)rn)->no_opt.nxo_flags == nxna.nxna_flags)) { + kauth_cred_t cred2 = ((struct nfs_netopt *)rn)->no_opt.nxo_cred; + if (cred && cred2 && (cred->cr_uid == cred2->cr_uid) && + (cred->cr_ngroups == cred2->cr_ngroups)) { + for (i=0; i < cred2->cr_ngroups && i < NGROUPS; i++) + if (cred->cr_groups[i] != cred2->cr_groups[i]) + break; + if (i >= cred2->cr_ngroups || i >= NGROUPS) + matched = 1; + } + } + kauth_cred_rele(cred); + _FREE(no, M_NETADDR); + if (matched) + continue; + return (EPERM); + } + nx->nx_expcnt++; + } + + return (0); +} + +/* + * In order to properly track an export's netopt count, we need to pass + * an additional argument to nfsrv_free_netopt() so that it can decrement + * the export's netopt count. 
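+ * (rnh_walktree() passes its callback a single opaque pointer, so the
+ * struct below bundles the radix head with the count to decrement.)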
+ */ +struct nfsrv_free_netopt_arg { + uint32_t *cnt; + struct radix_node_head *rnh; +}; + +static int +nfsrv_free_netopt(struct radix_node *rn, void *w) +{ + struct nfsrv_free_netopt_arg *fna = (struct nfsrv_free_netopt_arg *)w; + struct radix_node_head *rnh = fna->rnh; + uint32_t *cnt = fna->cnt; + struct nfs_netopt *nno = (struct nfs_netopt *)rn; + + (*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh); + if (nno->no_opt.nxo_cred) + kauth_cred_rele(nno->no_opt.nxo_cred); + _FREE((caddr_t)rn, M_NETADDR); + *cnt -= 1; + return (0); +} + +/* + * Free the net address hash lists that are hanging off the mount points. + */ +static void +nfsrv_free_addrlist(struct nfs_export *nx) +{ + int i; + struct radix_node_head *rnh; + struct nfsrv_free_netopt_arg fna; + + for (i = 0; i <= AF_MAX; i++) + if ( (rnh = nx->nx_rtable[i]) ) { + fna.rnh = rnh; + fna.cnt = &nx->nx_expcnt; + (*rnh->rnh_walktree)(rnh, nfsrv_free_netopt, (caddr_t)&fna); + _FREE((caddr_t)rnh, M_RTABLE); + nx->nx_rtable[i] = 0; + } +} + +void enablequotas(struct mount *mp, vfs_context_t ctx); // XXX + +int +nfsrv_export(struct user_nfs_export_args *unxa, struct vfs_context *ctx) +{ + int error = 0, pathlen; + struct nfs_exportfs *nxfs, *nxfs2, *nxfs3; + struct nfs_export *nx, *nx2, *nx3; + struct nfs_filehandle nfh; + struct nameidata mnd, xnd; + vnode_t mvp = NULL, xvp = NULL; + mount_t mp; + char path[MAXPATHLEN]; + int expisroot; + + error = copyinstr(unxa->nxa_fspath, path, MAXPATHLEN, (size_t *)&pathlen); + if (error) + return (error); + + lck_rw_lock_exclusive(&nfs_export_rwlock); + + // first check if we've already got an exportfs with the given ID + LIST_FOREACH(nxfs, &nfs_exports, nxfs_next) { + if (nxfs->nxfs_id == unxa->nxa_fsid) + break; + } + if (nxfs) { + /* verify exported FS path matches given path */ + if (strcmp(path, nxfs->nxfs_path)) { + error = EEXIST; + goto unlock_out; + } + mp = vfs_getvfs_by_mntonname(nxfs->nxfs_path); + /* find exported FS root vnode */ + NDINIT(&mnd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + UIO_SYSSPACE, nxfs->nxfs_path, ctx); + error = namei(&mnd); + if (error) + goto unlock_out; + mvp = mnd.ni_vp; + /* make sure it's (still) the root of a file system */ + if ((mvp->v_flag & VROOT) == 0) { + error = EINVAL; + goto out; + } + /* sanity check: this should be same mount */ + if (mp != vnode_mount(mvp)) { + error = EINVAL; + goto out; + } + } else { + /* no current exported file system with that ID */ + if (!(unxa->nxa_flags & NXA_ADD)) { + error = ENOENT; + goto unlock_out; + } + + /* find exported FS root vnode */ + NDINIT(&mnd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + UIO_SYSSPACE, path, ctx); + error = namei(&mnd); + if (error) + goto unlock_out; + mvp = mnd.ni_vp; + /* make sure it's the root of a file system */ + if ((mvp->v_flag & VROOT) == 0) { + error = EINVAL; + goto out; + } + mp = vnode_mount(mvp); + + /* make sure the file system is NFS-exportable */ + nfh.nfh_len = NFS_MAX_FID_SIZE; + error = VFS_VPTOFH(mvp, &nfh.nfh_len, &nfh.nfh_fid[0], NULL); + if (!error && (nfh.nfh_len > (int)NFS_MAX_FID_SIZE)) + error = EIO; + if (error) + goto out; + + /* add an exportfs for it */ + MALLOC(nxfs, struct nfs_exportfs *, sizeof(struct nfs_exportfs), M_TEMP, M_WAITOK); + if (!nxfs) { + error = ENOMEM; + goto out; + } + bzero(nxfs, sizeof(struct nfs_exportfs)); + nxfs->nxfs_id = unxa->nxa_fsid; + MALLOC(nxfs->nxfs_path, char*, pathlen, M_TEMP, M_WAITOK); + if (!nxfs->nxfs_path) { + FREE(nxfs, M_TEMP); + error = ENOMEM; + goto out; + } + bcopy(path, nxfs->nxfs_path, pathlen); + /* insert into 
list in reverse-sorted order */ + nxfs3 = NULL; + LIST_FOREACH(nxfs2, &nfs_exports, nxfs_next) { + if (strcmp(nxfs->nxfs_path, nxfs2->nxfs_path) > 0) + break; + nxfs3 = nxfs2; + } + if (nxfs2) + LIST_INSERT_BEFORE(nxfs2, nxfs, nxfs_next); + else if (nxfs3) + LIST_INSERT_AFTER(nxfs3, nxfs, nxfs_next); + else + LIST_INSERT_HEAD(&nfs_exports, nxfs, nxfs_next); + + /* make sure any quotas are enabled before we export the file system */ + enablequotas(mp, ctx); } + + if (unxa->nxa_exppath) { + error = copyinstr(unxa->nxa_exppath, path, MAXPATHLEN, (size_t *)&pathlen); + if (error) + goto out; + LIST_FOREACH(nx, &nxfs->nxfs_exports, nx_next) { + if (nx->nx_id == unxa->nxa_expid) + break; + } + if (nx) { + /* verify exported FS path matches given path */ + if (strcmp(path, nx->nx_path)) { + error = EEXIST; + goto out; + } + } else { + /* no current export with that ID */ + if (!(unxa->nxa_flags & NXA_ADD)) { + error = ENOENT; + goto out; + } + /* add an export for it */ + MALLOC(nx, struct nfs_export *, sizeof(struct nfs_export), M_TEMP, M_WAITOK); + if (!nx) { + error = ENOMEM; + goto out1; + } + bzero(nx, sizeof(struct nfs_export)); + nx->nx_id = unxa->nxa_expid; + nx->nx_fs = nxfs; + MALLOC(nx->nx_path, char*, pathlen, M_TEMP, M_WAITOK); + if (!nx->nx_path) { + error = ENOMEM; + FREE(nx, M_TEMP); + nx = NULL; + goto out1; + } + bcopy(path, nx->nx_path, pathlen); + /* insert into list in reverse-sorted order */ + nx3 = NULL; + LIST_FOREACH(nx2, &nxfs->nxfs_exports, nx_next) { + if (strcmp(nx->nx_path, nx2->nx_path) > 0) + break; + nx3 = nx2; + } + if (nx2) + LIST_INSERT_BEFORE(nx2, nx, nx_next); + else if (nx3) + LIST_INSERT_AFTER(nx3, nx, nx_next); + else + LIST_INSERT_HEAD(&nxfs->nxfs_exports, nx, nx_next); + /* insert into hash */ + LIST_INSERT_HEAD(NFSEXPHASH(nxfs->nxfs_id, nx->nx_id), nx, nx_hash); + + /* + * We don't allow nested exports. Check if the new entry + * nests with the entries before and after or if there's an + * entry for the file system root and subdirs. 
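+ * For example, registering both "sub" and "sub/dir" of one file
+ * system is refused, as is adding any other export alongside an
+ * export of the file system root.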
+ */ + error = 0; + if ((nx3 && !strncmp(nx3->nx_path, nx->nx_path, pathlen - 1) && + (nx3->nx_path[pathlen-1] == '/')) || + (nx2 && !strncmp(nx2->nx_path, nx->nx_path, strlen(nx2->nx_path)) && + (nx->nx_path[strlen(nx2->nx_path)] == '/'))) + error = EINVAL; + if (!error) { + /* check export conflict with fs root export and vice versa */ + expisroot = !nx->nx_path[0] || + ((nx->nx_path[0] == '.') && !nx->nx_path[1]); + LIST_FOREACH(nx2, &nxfs->nxfs_exports, nx_next) { + if (expisroot) { + if (nx2 != nx) + break; + } else if (!nx2->nx_path[0]) + break; + else if ((nx2->nx_path[0] == '.') && !nx2->nx_path[1]) + break; + } + if (nx2) + error = EINVAL; + } + if (error) { + printf("nfsrv_export: attempt to register nested exports: %s/%s\n", + nxfs->nxfs_path, nx->nx_path); + goto out1; + } + + /* find export root vnode */ + if (!nx->nx_path[0] || ((nx->nx_path[0] == '.') && !nx->nx_path[1])) { + /* exporting file system's root directory */ + xvp = mvp; + vnode_get(xvp); + } else { + xnd.ni_cnd.cn_nameiop = LOOKUP; + xnd.ni_cnd.cn_flags = LOCKLEAF; + xnd.ni_pathlen = pathlen - 1; + xnd.ni_cnd.cn_nameptr = xnd.ni_cnd.cn_pnbuf = path; + xnd.ni_startdir = mvp; + xnd.ni_usedvp = mvp; + xnd.ni_cnd.cn_context = ctx; + error = lookup(&xnd); + if (error) + goto out1; + xvp = xnd.ni_vp; + } + + if (vnode_vtype(xvp) != VDIR) { + error = EINVAL; + vnode_put(xvp); + goto out1; + } + + /* grab file handle */ + nx->nx_fh.nfh_xh.nxh_version = NFS_FH_VERSION; + nx->nx_fh.nfh_xh.nxh_fsid = nx->nx_fs->nxfs_id; + nx->nx_fh.nfh_xh.nxh_expid = nx->nx_id; + nx->nx_fh.nfh_xh.nxh_flags = 0; + nx->nx_fh.nfh_xh.nxh_reserved = 0; + nx->nx_fh.nfh_len = NFS_MAX_FID_SIZE; + error = VFS_VPTOFH(xvp, &nx->nx_fh.nfh_len, &nx->nx_fh.nfh_fid[0], NULL); + if (!error && (nx->nx_fh.nfh_len > (int)NFS_MAX_FID_SIZE)) { + error = EIO; + } else { + nx->nx_fh.nfh_xh.nxh_fidlen = nx->nx_fh.nfh_len; + nx->nx_fh.nfh_len += sizeof(nx->nx_fh.nfh_xh); + } + + vnode_put(xvp); + if (error) + goto out1; + } + } else { + nx = NULL; + } + + /* perform the export changes */ + if (unxa->nxa_flags & NXA_DELETE) { + if (!nx) { + /* delete all exports on this file system */ + while ((nx = LIST_FIRST(&nxfs->nxfs_exports))) { + LIST_REMOVE(nx, nx_next); + LIST_REMOVE(nx, nx_hash); + /* delete all netopts for this export */ + nfsrv_free_addrlist(nx); + nx->nx_flags &= ~NX_DEFAULTEXPORT; + if (nx->nx_defopt.nxo_cred) { + kauth_cred_rele(nx->nx_defopt.nxo_cred); + nx->nx_defopt.nxo_cred = NULL; + } + FREE(nx->nx_path, M_TEMP); + FREE(nx, M_TEMP); + } + goto out1; + } else { + /* delete all netopts for this export */ + nfsrv_free_addrlist(nx); + nx->nx_flags &= ~NX_DEFAULTEXPORT; + if (nx->nx_defopt.nxo_cred) { + kauth_cred_rele(nx->nx_defopt.nxo_cred); + nx->nx_defopt.nxo_cred = NULL; + } + } + } + if (unxa->nxa_flags & NXA_ADD) { + error = nfsrv_hang_addrlist(nx, unxa); + if (!error) + mp->mnt_flag |= MNT_EXPORTED; + } + +out1: + if (nx && !nx->nx_expcnt) { + /* export has no export options */ + LIST_REMOVE(nx, nx_next); + LIST_REMOVE(nx, nx_hash); + FREE(nx->nx_path, M_TEMP); + FREE(nx, M_TEMP); + } + if (LIST_EMPTY(&nxfs->nxfs_exports)) { + /* exported file system has no more exports */ + LIST_REMOVE(nxfs, nxfs_next); + FREE(nxfs->nxfs_path, M_TEMP); + FREE(nxfs, M_TEMP); + mp->mnt_flag &= ~MNT_EXPORTED; + } + +out: + if (mvp) { + vnode_put(mvp); + nameidone(&mnd); + } +unlock_out: + lck_rw_done(&nfs_export_rwlock); + return (error); +} + +static struct nfs_export_options * +nfsrv_export_lookup(struct nfs_export *nx, mbuf_t nam) +{ + struct 
nfs_export_options *nxo = NULL; + struct nfs_netopt *no = NULL; + struct radix_node_head *rnh; + struct sockaddr *saddr; + + /* Lookup in the export list first. */ + if (nam != NULL) { + saddr = mbuf_data(nam); + rnh = nx->nx_rtable[saddr->sa_family]; + if (rnh != NULL) { + no = (struct nfs_netopt *) + (*rnh->rnh_matchaddr)((caddr_t)saddr, rnh); + if (no && no->no_rnodes->rn_flags & RNF_ROOT) + no = NULL; + if (no) + nxo = &no->no_opt; + } + } + /* If no address match, use the default if it exists. */ + if ((nxo == NULL) && (nx->nx_flags & NX_DEFAULTEXPORT)) + nxo = &nx->nx_defopt; + return (nxo); +} + +/* find an export for the given handle */ +static struct nfs_export * +nfsrv_fhtoexport(struct nfs_filehandle *nfhp) +{ + struct nfs_export *nx; + nx = NFSEXPHASH(nfhp->nfh_xh.nxh_fsid, nfhp->nfh_xh.nxh_expid)->lh_first; + for (; nx; nx = LIST_NEXT(nx, nx_hash)) { + if (nx->nx_fs->nxfs_id != nfhp->nfh_xh.nxh_fsid) + continue; + if (nx->nx_id != nfhp->nfh_xh.nxh_expid) + continue; + break; + } + return nx; } /* - * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) - * - look up fsid in mount list (if not found ret error) - * - get vp and export rights by calling VFS_FHTOVP() - * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon - * - if not lockflag unlock it with VOP_UNLOCK() + * nfsrv_fhtovp() - convert FH to vnode and export info */ int -nfsrv_fhtovp(fhp, lockflag, vpp, cred, slp, nam, rdonlyp, kerbflag, pubflag) - fhandle_t *fhp; - int lockflag; - struct vnode **vpp; - struct ucred *cred; - struct nfssvc_sock *slp; - struct mbuf *nam; - int *rdonlyp; - int kerbflag; - int pubflag; +nfsrv_fhtovp( + struct nfs_filehandle *nfhp, + mbuf_t nam, + __unused int pubflag, + vnode_t *vpp, + struct nfs_export **nxp, + struct nfs_export_options **nxop) { - struct proc *p = current_proc(); /* XXX */ - register struct mount *mp; - register int i; - struct ucred *credanon; - int error, exflags; + int error; + struct mount *mp; - *vpp = (struct vnode *)0; + *vpp = NULL; + *nxp = NULL; + *nxop = NULL; -/* XXX CSM 12/4/97 Revisit when enabling WebNFS */ -#ifdef notyet - if (nfs_ispublicfh(fhp)) { + if (nfhp->nfh_xh.nxh_version != NFS_FH_VERSION) { + /* file handle format not supported */ + return (ESTALE); + } + if (nfhp->nfh_len > NFS_MAX_FH_SIZE) + return (EBADRPC); + if (nfhp->nfh_len < (int)sizeof(nfhp->nfh_xh)) + return (ESTALE); + if (nfhp->nfh_xh.nxh_flags & NXHF_INVALIDFH) + return (ESTALE); + +/* XXX Revisit when enabling WebNFS */ +#ifdef WEBNFS_ENABLED + if (nfs_ispublicfh(nfhp)) { if (!pubflag || !nfs_pub.np_valid) return (ESTALE); - fhp = &nfs_pub.np_handle; + nfhp = &nfs_pub.np_handle; } #endif - mp = vfs_getvfs(&fhp->fh_fsid); + *nxp = nfsrv_fhtoexport(nfhp); + if (!*nxp) + return (ESTALE); + + /* Get the export option structure for this <export, client> tuple. */ + *nxop = nfsrv_export_lookup(*nxp, nam); + if (nam && (*nxop == NULL)) + return (EACCES); + + /* find mount structure */ + mp = vfs_getvfs_by_mntonname((*nxp)->nx_fs->nxfs_path); if (!mp) return (ESTALE); - error = VFS_FHTOVP(mp, &fhp->fh_fid, nam, vpp, &exflags, &credanon); + + error = VFS_FHTOVP(mp, nfhp->nfh_xh.nxh_fidlen, &nfhp->nfh_fid[0], vpp, NULL); if (error) return (error); /* vnode pointer should be good at this point or ... */ if (*vpp == NULL) return (ESTALE); - /* - * Check/setup credentials. 
- */ - if (exflags & MNT_EXKERB) { - if (!kerbflag) { - vput(*vpp); - return (NFSERR_AUTHERR | AUTH_TOOWEAK); - } - } else if (kerbflag) { - vput(*vpp); - return (NFSERR_AUTHERR | AUTH_TOOWEAK); - } else if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { - cred->cr_uid = credanon->cr_uid; - for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) - cred->cr_groups[i] = credanon->cr_groups[i]; - cred->cr_ngroups = i; - } - if (exflags & MNT_EXRDONLY) - *rdonlyp = 1; - else - *rdonlyp = 0; - - nfsrv_object_create(*vpp); + return (0); +} - if (!lockflag) - VOP_UNLOCK(*vpp, 0, p); +/* + * nfsrv_credcheck() - check/map credentials according to given export options + */ +int +nfsrv_credcheck( + struct nfsrv_descript *nfsd, + __unused struct nfs_export *nx, + struct nfs_export_options *nxo) +{ + if (nxo && nxo->nxo_cred) { + if ((nxo->nxo_flags & NX_MAPALL) || + ((nxo->nxo_flags & NX_MAPROOT) && !suser(nfsd->nd_cr, NULL))) { + kauth_cred_rele(nfsd->nd_cr); + nfsd->nd_cr = nxo->nxo_cred; + kauth_cred_ref(nfsd->nd_cr); + } + } return (0); } @@ -2094,17 +2721,84 @@ nfsrv_fhtovp(fhp, lockflag, vpp, cred, slp, nam, rdonlyp, kerbflag, pubflag) * transformed this to all zeroes in both cases, so check for it. */ int -nfs_ispublicfh(fhp) - fhandle_t *fhp; +nfs_ispublicfh(struct nfs_filehandle *nfhp) { - char *cp = (char *)fhp; - int i; - - for (i = 0; i < NFSX_V3FH; i++) + char *cp = (char *)nfhp; + unsigned int i; + + if (nfhp->nfh_len == 0) + return (TRUE); + if (nfhp->nfh_len != NFSX_V2FH) + return (FALSE); + for (i = 0; i < NFSX_V2FH; i++) if (*cp++ != 0) return (FALSE); return (TRUE); } + +/* + * nfsrv_vptofh() - convert vnode to file handle for given export + * + * If the caller is passing in a vnode for a ".." directory entry, + * they can pass a directory NFS file handle (dnfhp) which will be + * checked against the root export file handle. If it matches, we + * refuse to provide the file handle for the out-of-export directory. + */ +int +nfsrv_vptofh( + struct nfs_export *nx, + int v2, + struct nfs_filehandle *dnfhp, + vnode_t vp, + struct vfs_context *ctx, + struct nfs_filehandle *nfhp) +{ + int error; + + nfhp->nfh_xh.nxh_version = NFS_FH_VERSION; + nfhp->nfh_xh.nxh_fsid = nx->nx_fs->nxfs_id; + nfhp->nfh_xh.nxh_expid = nx->nx_id; + nfhp->nfh_xh.nxh_flags = 0; + nfhp->nfh_xh.nxh_reserved = 0; + + if (v2) + bzero(&nfhp->nfh_fid[0], NFSV2_MAX_FID_SIZE); + + /* if directory FH matches export root, return invalid FH */ + if (dnfhp && nfsrv_fhmatch(dnfhp, &nx->nx_fh)) { + nfhp->nfh_len = v2 ? NFSX_V2FH : sizeof(nfhp->nfh_xh); + nfhp->nfh_xh.nxh_fidlen = 0; + nfhp->nfh_xh.nxh_flags = NXHF_INVALIDFH; + return (0); + } + + nfhp->nfh_len = v2 ? NFSV2_MAX_FID_SIZE : NFS_MAX_FID_SIZE; + error = VFS_VPTOFH(vp, &nfhp->nfh_len, &nfhp->nfh_fid[0], ctx); + if (error) + return (error); + if (nfhp->nfh_len > (int)(v2 ? 
NFSV2_MAX_FID_SIZE : NFS_MAX_FID_SIZE)) + return (EOVERFLOW); + nfhp->nfh_xh.nxh_fidlen = nfhp->nfh_len; + nfhp->nfh_len += sizeof(nfhp->nfh_xh); + if (v2 && (nfhp->nfh_len < NFSX_V2FH)) + nfhp->nfh_len = NFSX_V2FH; + + return (0); +} + +int +nfsrv_fhmatch(struct nfs_filehandle *fh1, struct nfs_filehandle *fh2) +{ + int len1, len2; + + len1 = sizeof(fh1->nfh_xh) + fh1->nfh_xh.nxh_fidlen; + len2 = sizeof(fh2->nfh_xh) + fh2->nfh_xh.nxh_fidlen; + if (len1 != len2) + return (0); + if (bcmp(&fh1->nfh_xh, &fh2->nfh_xh, len1)) + return (0); + return (1); +} #endif /* NFS_NOSERVER */ /* @@ -2118,13 +2812,13 @@ int netaddr_match(family, haddr, nam) int family; union nethostaddr *haddr; - struct mbuf *nam; + mbuf_t nam; { - register struct sockaddr_in *inetaddr; + struct sockaddr_in *inetaddr; switch (family) { case AF_INET: - inetaddr = mtod(nam, struct sockaddr_in *); + inetaddr = mbuf_data(nam); if (inetaddr->sin_family == AF_INET && inetaddr->sin_addr.s_addr == haddr->had_inetaddr) return (1); @@ -2132,10 +2826,10 @@ netaddr_match(family, haddr, nam) #if ISO case AF_ISO: { - register struct sockaddr_iso *isoaddr1, *isoaddr2; + struct sockaddr_iso *isoaddr1, *isoaddr2; - isoaddr1 = mtod(nam, struct sockaddr_iso *); - isoaddr2 = mtod(haddr->had_nam, struct sockaddr_iso *); + isoaddr1 = mbuf_data(nam); + isoaddr2 = mbuf_data(haddr->had_nam); if (isoaddr1->siso_family == AF_ISO && isoaddr1->siso_nlen > 0 && isoaddr1->siso_nlen == isoaddr2->siso_nlen && @@ -2150,19 +2844,19 @@ netaddr_match(family, haddr, nam) return (0); } -static nfsuint64 nfs_nullcookie = { 0, 0 }; +static nfsuint64 nfs_nullcookie = { { 0, 0 } }; /* * This function finds the directory cookie that corresponds to the * logical byte offset given. */ nfsuint64 * nfs_getcookie(np, off, add) - register struct nfsnode *np; + struct nfsnode *np; off_t off; int add; { - register struct nfsdmap *dp, *dp2; - register int pos; + struct nfsdmap *dp, *dp2; + int pos; pos = off / NFS_DIRBLKSIZ; if (pos == 0) { @@ -2176,9 +2870,10 @@ nfs_getcookie(np, off, add) dp = np->n_cookies.lh_first; if (!dp) { if (add) { - MALLOC_ZONE(dp, struct nfsdmap *, - sizeof (struct nfsdmap), - M_NFSDIROFF, M_WAITOK); + MALLOC_ZONE(dp, struct nfsdmap *, sizeof(struct nfsdmap), + M_NFSDIROFF, M_WAITOK); + if (!dp) + return ((nfsuint64 *)0); dp->ndm_eocookie = 0; LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list); } else @@ -2192,9 +2887,10 @@ nfs_getcookie(np, off, add) return ((nfsuint64 *)0); dp = dp->ndm_list.le_next; } else if (add) { - MALLOC_ZONE(dp2, struct nfsdmap *, - sizeof (struct nfsdmap), - M_NFSDIROFF, M_WAITOK); + MALLOC_ZONE(dp2, struct nfsdmap *, sizeof(struct nfsdmap), + M_NFSDIROFF, M_WAITOK); + if (!dp2) + return ((nfsuint64 *)0); dp2->ndm_eocookie = 0; LIST_INSERT_AFTER(dp, dp2, ndm_list); dp = dp2; @@ -2217,12 +2913,12 @@ nfs_getcookie(np, off, add) */ void nfs_invaldir(vp) - register struct vnode *vp; + vnode_t vp; { - register struct nfsnode *np = VTONFS(vp); + struct nfsnode *np = VTONFS(vp); #if DIAGNOSTIC - if (vp->v_type != VDIR) + if (vnode_vtype(vp) != VDIR) panic("nfs: invaldir not dir"); #endif np->n_direofoffset = 0; @@ -2238,33 +2934,38 @@ nfs_invaldir(vp) * dirty block list as NB_DELWRI, all this takes is clearing the NB_NEEDCOMMIT * flag. Once done the new write verifier can be set for the mount point. 
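/*
 * (Illustrative note: a buffer found with both NB_DELWRI and
 * NB_NEEDCOMMIT set keeps its place on the dirty list; only
 * NB_NEEDCOMMIT is cleared, so its data is simply written again
 * under the server's new write verifier.)
 */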
*/ -void -nfs_clearcommit(mp) - struct mount *mp; +static int +nfs_clearcommit_callout(vnode_t vp, __unused void *arg) { - register struct vnode *vp, *nvp; - register struct nfsbuf *bp, *nbp; - struct nfsnode *np; - int s; - - s = splbio(); -loop: - for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { - if (vp->v_mount != mp) /* Paranoia */ - goto loop; - nvp = vp->v_mntvnodes.le_next; - np = VTONFS(vp); - for (bp = np->n_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->nb_vnbufs.le_next; - if ((bp->nb_flags & (NB_BUSY | NB_DELWRI | NB_NEEDCOMMIT)) - == (NB_DELWRI | NB_NEEDCOMMIT)) { - bp->nb_flags &= ~NB_NEEDCOMMIT; - np->n_needcommitcnt--; - CHECK_NEEDCOMMITCNT(np); - } + struct nfsnode *np = VTONFS(vp); + struct nfsbuflists blist; + struct nfsbuf *bp; + + lck_mtx_lock(nfs_buf_mutex); + if (nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) { + lck_mtx_unlock(nfs_buf_mutex); + return (VNODE_RETURNED); + } + LIST_FOREACH(bp, &blist, nb_vnbufs) { + if (nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0)) + continue; + if ((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) + == (NB_DELWRI | NB_NEEDCOMMIT)) { + bp->nb_flags &= ~NB_NEEDCOMMIT; + np->n_needcommitcnt--; } + nfs_buf_drop(bp); } - splx(s); + CHECK_NEEDCOMMITCNT(np); + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + lck_mtx_unlock(nfs_buf_mutex); + return (VNODE_RETURNED); +} + +void +nfs_clearcommit(mount_t mp) +{ + vnode_iterate(mp, VNODE_NOLOCK_INTERNAL, nfs_clearcommit_callout, NULL); } #ifndef NFS_NOSERVER @@ -2275,9 +2976,9 @@ loop: int nfsrv_errmap(nd, err) struct nfsrv_descript *nd; - register int err; + int err; { - register short *defaulterrp, *errp; + short *defaulterrp, *errp; if (nd->nd_flag & ND_NFSV3) { if (nd->nd_procnum <= NFSPROC_COMMIT) { @@ -2297,16 +2998,5 @@ nfsrv_errmap(nd, err) return (NFSERR_IO); } -/* XXX CSM 11/25/97 Revisit when Ramesh merges vm with buffer cache */ -#define vfs_object_create(v, p, c, l) (0) - -int -nfsrv_object_create(struct vnode *vp) { - struct proc *curproc = current_proc(); - - if ((vp == NULL) || (vp->v_type != VREG)) - return 1; - return vfs_object_create(vp, curproc, curproc?curproc->p_ucred:NULL, 1); -} #endif /* NFS_NOSERVER */ diff --git a/bsd/nfs/nfs_syscalls.c b/bsd/nfs/nfs_syscalls.c index bfa8f08aa..bf82ef6bc 100644 --- a/bsd/nfs/nfs_syscalls.c +++ b/bsd/nfs/nfs_syscalls.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -66,27 +66,29 @@ #include <sys/sysproto.h> #endif #include <sys/kernel.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/filedesc.h> #include <sys/stat.h> -#include <sys/vnode.h> -#include <sys/mount.h> -#include <sys/proc.h> +#include <sys/vnode_internal.h> +#include <sys/mount_internal.h> +#include <sys/proc_internal.h> /* for fdflags */ +#include <sys/kauth.h> #include <sys/sysctl.h> #include <sys/ubc.h> #include <sys/uio.h> #include <sys/malloc.h> -#include <sys/mbuf.h> +#include <sys/kpi_mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/domain.h> #include <sys/protosw.h> -#include <sys/namei.h> #include <sys/fcntl.h> #include <sys/lockf.h> #include <sys/syslog.h> #include <sys/user.h> -#include <machine/spl.h> +#include <sys/sysproto.h> +#include <sys/kpi_socket.h> +#include <libkern/OSAtomic.h> #include <bsm/audit_kernel.h> @@ -103,45 +105,40 @@ #include <nfs/nfsrvcache.h> #include <nfs/nfsmount.h> #include <nfs/nfsnode.h> -#include <nfs/nqnfs.h> #include <nfs/nfsrtt.h> #include <nfs/nfs_lock.h> +extern void unix_syscall_return(int); + /* Global defs. */ -extern int (*nfsrv3_procs[NFS_NPROCS]) __P((struct nfsrv_descript *nd, +extern int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *nd, struct nfssvc_sock *slp, - struct proc *procp, - struct mbuf **mreqp)); + proc_t procp, + mbuf_t *mreqp); extern int nfs_numasync; extern int nfs_ioddelwri; -extern time_t nqnfsstarttime; -extern int nqsrv_writeslack; extern int nfsrtton; extern struct nfsstats nfsstats; extern int nfsrvw_procrastinate; extern int nfsrvw_procrastinate_v3; + struct nfssvc_sock *nfs_udpsock, *nfs_cltpsock; static int nuidhash_max = NFS_MAXUIDHASH; -static void nfsrv_zapsock __P((struct nfssvc_sock *slp)); -static int nfssvc_iod __P((struct proc *)); - -#define TRUE 1 -#define FALSE 0 +static void nfsrv_zapsock(struct nfssvc_sock *slp); +static int nfssvc_iod(proc_t); +static int nfskerb_clientd(struct nfsmount *, struct nfsd_cargs *, int, user_addr_t, proc_t); static int nfs_asyncdaemon[NFS_MAXASYNCDAEMON]; #ifndef NFS_NOSERVER int nfsd_waiting = 0; static struct nfsdrt nfsdrt; -static int nfs_numnfsd = 0; -static int notstarted = 1; -static int modify_flag = 0; -static void nfsd_rt __P((int sotype, struct nfsrv_descript *nd, - int cacherep)); -static int nfssvc_addsock __P((struct file *, struct mbuf *, - struct proc *)); -static int nfssvc_nfsd __P((struct nfsd_srvargs *,caddr_t,struct proc *)); +int nfs_numnfsd = 0; +static void nfsd_rt(int sotype, struct nfsrv_descript *nd, int cacherep); +static int nfssvc_addsock(socket_t, mbuf_t, proc_t); +static int nfssvc_nfsd(struct nfsd_srvargs *,user_addr_t, proc_t); +static int nfssvc_export(user_addr_t, proc_t); static int nfs_privport = 0; /* XXX CSM 11/25/97 Upgrade sysctl.h someday */ @@ -159,45 +156,95 @@ SYSCTL_INT(_vfs_nfs, OID_AUTO, gatherdelay_v3, CTLFLAG_RW, &nfsrvw_procrastinate /* * Get file handle system call */ -#ifndef _SYS_SYSPROTO_H_ -struct getfh_args { - char *fname; - fhandle_t *fhp; -}; -#endif int -getfh(p, uap) - struct proc *p; - register struct getfh_args *uap; +getfh(proc_t p, struct getfh_args *uap, __unused int *retval) { - register struct vnode *vp; - fhandle_t fh; + vnode_t vp; + struct nfs_filehandle nfh; int error; struct nameidata nd; + struct vfs_context context; + char path[MAXPATHLEN], *ptr; + u_int pathlen; + struct nfs_exportfs *nxfs; + struct nfs_export *nx; + + context.vc_proc = p; + context.vc_ucred = kauth_cred_get(); /* * Must be super user */ - 
error = suser(p->p_ucred, &p->p_acflag); - if(error) + error = proc_suser(p); + if (error) + return (error); + + error = copyinstr(uap->fname, path, MAXPATHLEN, (size_t *)&pathlen); + if (error) return (error); - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, UIO_USERSPACE, uap->fname, p); + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + UIO_SYSSPACE, path, &context); error = namei(&nd); if (error) return (error); + nameidone(&nd); + vp = nd.ni_vp; - bzero((caddr_t)&fh, sizeof(fh)); - fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; - error = VFS_VPTOFH(vp, &fh.fh_fid); - vput(vp); + + // find exportfs that matches f_mntonname + lck_rw_lock_shared(&nfs_export_rwlock); + ptr = vnode_mount(vp)->mnt_vfsstat.f_mntonname; + LIST_FOREACH(nxfs, &nfs_exports, nxfs_next) { + if (!strcmp(nxfs->nxfs_path, ptr)) + break; + } + if (!nxfs || strncmp(nxfs->nxfs_path, path, strlen(nxfs->nxfs_path))) { + error = EINVAL; + goto out; + } + // find export that best matches remainder of path + ptr = path + strlen(nxfs->nxfs_path); + while (*ptr && (*ptr == '/')) + ptr++; + LIST_FOREACH(nx, &nxfs->nxfs_exports, nx_next) { + int len = strlen(nx->nx_path); + if (len == 0) // we've hit the export entry for the root directory + break; + if (!strncmp(nx->nx_path, ptr, len)) + break; + } + if (!nx) { + error = EINVAL; + goto out; + } + + bzero(&nfh, sizeof(nfh)); + nfh.nfh_xh.nxh_version = NFS_FH_VERSION; + nfh.nfh_xh.nxh_fsid = nxfs->nxfs_id; + nfh.nfh_xh.nxh_expid = nx->nx_id; + nfh.nfh_xh.nxh_flags = 0; + nfh.nfh_xh.nxh_reserved = 0; + nfh.nfh_len = NFS_MAX_FID_SIZE; + error = VFS_VPTOFH(vp, &nfh.nfh_len, &nfh.nfh_fid[0], NULL); + if (nfh.nfh_len > (int)NFS_MAX_FID_SIZE) + error = EOVERFLOW; + nfh.nfh_xh.nxh_fidlen = nfh.nfh_len; + nfh.nfh_len += sizeof(nfh.nfh_xh); + +out: + lck_rw_done(&nfs_export_rwlock); + vnode_put(vp); if (error) return (error); - error = copyout((caddr_t)&fh, (caddr_t)uap->fhp, sizeof (fh)); + error = copyout((caddr_t)&nfh, uap->fhp, sizeof(nfh)); return (error); } #endif /* NFS_NOSERVER */ +extern struct fileops vnops; + /* * syscall for the rpc.lockd to use to translate a NFS file handle into * an open descriptor. @@ -205,39 +252,30 @@ getfh(p, uap) * warning: do not remove the suser() call or this becomes one giant * security hole. */ -#ifndef _SYS_SYSPROTO_H_ -struct fhopen_args { - const struct fhandle *u_fhp; - int flags; -}; -#endif int -fhopen(p, uap, retval) - struct proc *p; - register struct fhopen_args *uap; - register_t *retval; +fhopen( proc_t p, + struct fhopen_args *uap, + register_t *retval) { - struct mount *mp; - struct vnode *vp; - struct fhandle fhp; - struct vattr vat; - struct vattr *vap = &vat; + vnode_t vp; + struct nfs_filehandle nfh; + struct nfs_export *nx; + struct nfs_export_options *nxo; struct flock lf; - struct file *fp; - register struct filedesc *fdp = p->p_fd; - int fmode, mode, error, type; - struct file *nfp; + struct fileproc *fp, *nfp; + int fmode, error, type; int indx; - struct ucred *credanon; - int exflags; - struct ucred *cred = p->p_ucred; - int didhold = 0; - extern struct fileops vnops; + kauth_cred_t cred = proc_ucred(p); + struct vfs_context context; + kauth_action_t action; + + context.vc_proc = p; + context.vc_ucred = cred; /* * Must be super user */ - error = suser(cred, &p->p_acflag); + error = suser(cred, 0); if (error) return (error); @@ -245,98 +283,76 @@ fhopen(p, uap, retval) /* why not allow a non-read/write open for our lockd? 
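* (The check below therefore requires at least one of FREAD/FWRITE
* and rejects O_CREAT with EINVAL.)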
*/ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) return (EINVAL); - error = copyin((void*)uap->u_fhp, &fhp, sizeof(fhp)); + + error = copyin(uap->u_fhp, &nfh.nfh_len, sizeof(nfh.nfh_len)); + if (error) + return (error); + if ((nfh.nfh_len < (int)sizeof(struct nfs_exphandle)) || + (nfh.nfh_len > (int)NFS_MAX_FH_SIZE)) + return (EINVAL); + error = copyin(uap->u_fhp, &nfh, sizeof(nfh.nfh_len) + nfh.nfh_len); if (error) return (error); - /* find the mount point */ - mp = vfs_getvfs(&fhp.fh_fsid); - if (mp == NULL) - return (ESTALE); - /* now give me my vnode, it gets returned to me locked */ -/* XXX CSM need to split VFS_CHECKEXP out of VFS_FHTOVP? */ - error = VFS_FHTOVP(mp, &fhp.fh_fid, NULL, &vp, &exflags, &credanon); + + lck_rw_lock_shared(&nfs_export_rwlock); + /* now give me my vnode, it gets returned to me with a reference */ + error = nfsrv_fhtovp(&nfh, NULL, TRUE, &vp, &nx, &nxo); + lck_rw_done(&nfs_export_rwlock); if (error) return (error); + /* - * from now on we have to make sure not - * to forget about the vnode - * any error that causes an abort must vput(vp) - * just set error = err and 'goto bad;'. + * From now on we have to make sure not + * to forget about the vnode. + * Any error that causes an abort must vnode_put(vp). + * Just set error = err and 'goto bad;'. */ /* * from vn_open */ - if (vp->v_type == VSOCK) { + if (vnode_vtype(vp) == VSOCK) { error = EOPNOTSUPP; goto bad; } - if (UBCINFOEXISTS(vp) && ((didhold = ubc_hold(vp)) == 0)) { - error = ENOENT; + /* disallow write operations on directories */ + if (vnode_isdir(vp) && (fmode & (FWRITE | O_TRUNC))) { + error = EISDIR; goto bad; } - if (fmode & FREAD && fmode & (FWRITE | O_TRUNC)) { - int err = 0; - if (vp->v_type == VDIR) - err = EISDIR; - else - err = vn_writechk(vp); - if (err && !(error = VOP_ACCESS(vp, VREAD, cred, p))) - error = err; - if (error || (error = VOP_ACCESS(vp, VREAD|VWRITE, cred, p))) - goto bad; - } else if (fmode & FREAD) { - if ((error = VOP_ACCESS(vp, VREAD, cred, p))) - goto bad; - } else if (fmode & (FWRITE | O_TRUNC)) { - if (vp->v_type == VDIR) { - error = EISDIR; - goto bad; - } - if ((error = vn_writechk(vp)) || - (error = VOP_ACCESS(vp, VWRITE, cred, p))) - goto bad; - } - if (fmode & O_TRUNC) { - VOP_UNLOCK(vp, 0, p); /* XXX */ - VOP_LEASE(vp, p, cred, LEASE_WRITE); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); /* XXX */ - VATTR_NULL(vap); - vap->va_size = 0; - error = VOP_SETATTR(vp, vap, cred, p); - if (error) - goto bad; - } + /* compute action to be authorized */ + action = 0; + if (fmode & FREAD) + action |= KAUTH_VNODE_READ_DATA; + if (fmode & (FWRITE | O_TRUNC)) + action |= KAUTH_VNODE_WRITE_DATA; + if ((error = vnode_authorize(vp, NULL, action, &context)) != 0) + goto bad; - error = VOP_OPEN(vp, fmode, cred, p); - if (error) + if ((error = VNOP_OPEN(vp, fmode, &context))) + goto bad; + if ((error = vnode_ref_ext(vp, fmode))) goto bad; - if (fmode & FWRITE) - if (++vp->v_writecount <= 0) - panic("fhopen: v_writecount"); /* * end of vn_open code */ + // starting here... 
error paths should call vn_close/vnode_put if ((error = falloc(p, &nfp, &indx)) != 0) { - if (fmode & FWRITE) - vp->v_writecount--; + vn_close(vp, fmode & FMASK, cred, p); goto bad; } fp = nfp; - /* - * Hold an extra reference to avoid having fp ripped out - * from under us while we block in the lock op - */ - fref(fp); - nfp->f_data = (caddr_t)vp; - nfp->f_flag = fmode & FMASK; - nfp->f_ops = &vnops; - nfp->f_type = DTYPE_VNODE; + fp->f_fglob->fg_flag = fmode & FMASK; + fp->f_fglob->fg_type = DTYPE_VNODE; + fp->f_fglob->fg_ops = &vnops; + fp->f_fglob->fg_data = (caddr_t)vp; + + // XXX do we really need to support this with fhopen()? if (fmode & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; @@ -348,34 +364,26 @@ fhopen(p, uap, retval) type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; - VOP_UNLOCK(vp, 0, p); - if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, - type)) != 0) { - (void) vn_close(vp, fp->f_flag, fp->f_cred, p); - ffree(fp); - fdrelse(p, indx); - /* - * release our private reference - */ - frele(fp); - + if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, &context))) { + vn_close(vp, fp->f_fglob->fg_flag, fp->f_fglob->fg_cred, p); + fp_free(p, indx, fp); return (error); } - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - fp->f_flag |= FHASLOCK; + fp->f_fglob->fg_flag |= FHASLOCK; } - VOP_UNLOCK(vp, 0, p); + vnode_put(vp); + + proc_fdlock(p); *fdflags(p, indx) &= ~UF_RESERVED; - frele(fp); + fp_drop(p, indx, fp, 1); + proc_fdunlock(p); + *retval = indx; return (0); bad: - VOP_UNLOCK(vp, 0, p); - if (didhold) - ubc_rele(vp); - vrele(vp); + vnode_put(vp); return (error); } @@ -386,22 +394,13 @@ bad: * - remains in the kernel as an nfsd * - remains in the kernel as an nfsiod */ -#ifndef _SYS_SYSPROTO_H_ -struct nfssvc_args { - int flag; - caddr_t argp; -}; -#endif int -nfssvc(p, uap) - struct proc *p; - register struct nfssvc_args *uap; +nfssvc(proc_t p, struct nfssvc_args *uap, __unused int *retval) { #ifndef NFS_NOSERVER struct nameidata nd; - struct file *fp; - struct mbuf *nam; - struct nfsd_args nfsdarg; + mbuf_t nam; + struct user_nfsd_args user_nfsdarg; struct nfsd_srvargs nfsd_srvargs, *nsd = &nfsd_srvargs; struct nfsd_cargs ncd; struct nfsd *nfsd; @@ -409,6 +408,9 @@ nfssvc(p, uap) struct nfsuid *nuidp; struct nfsmount *nmp; struct timeval now; + socket_t so; + struct vfs_context context; + struct ucred temp_cred; #endif /* NFS_NOSERVER */ int error; @@ -417,13 +419,9 @@ nfssvc(p, uap) /* * Must be super user */ - error = suser(p->p_ucred, &p->p_acflag); + error = proc_suser(p); if(error) return (error); - while (nfssvc_sockhead_flag & SLP_INIT) { - nfssvc_sockhead_flag |= SLP_WANTINIT; - (void) tsleep((caddr_t)&nfssvc_sockhead, PSOCK, "nfsd init", 0); - } if (uap->flag & NFSSVC_BIOD) error = nfssvc_iod(p); #ifdef NFS_NOSERVER @@ -431,18 +429,26 @@ nfssvc(p, uap) error = ENXIO; #else /* !NFS_NOSERVER */ else if (uap->flag & NFSSVC_MNTD) { + + context.vc_proc = p; + context.vc_ucred = kauth_cred_get(); + error = copyin(uap->argp, (caddr_t)&ncd, sizeof (ncd)); if (error) return (error); - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, - UIO_USERSPACE, ncd.ncd_dirp, p); + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + (proc_is64bit(p) ? 
UIO_USERSPACE64 : UIO_USERSPACE32), + CAST_USER_ADDR_T(ncd.ncd_dirp), &context); error = namei(&nd); if (error) return (error); - if ((nd.ni_vp->v_flag & VROOT) == 0) + nameidone(&nd); + + if (vnode_isvroot(nd.ni_vp) == 0) error = EINVAL; - nmp = VFSTONFS(nd.ni_vp->v_mount); - vput(nd.ni_vp); + nmp = VFSTONFS(vnode_mount(nd.ni_vp)); + vnode_put(nd.ni_vp); if (error) return (error); @@ -450,28 +456,45 @@ nfssvc(p, uap) (uap->flag & NFSSVC_GOTAUTH) == 0) return (0); nmp->nm_state |= NFSSTA_MNTD; - error = nqnfs_clientd(nmp, p->p_ucred, &ncd, uap->flag, - uap->argp, p); + error = nfskerb_clientd(nmp, &ncd, uap->flag, uap->argp, p); } else if (uap->flag & NFSSVC_ADDSOCK) { - error = copyin(uap->argp, (caddr_t)&nfsdarg, sizeof(nfsdarg)); + if (IS_64BIT_PROCESS(p)) { + error = copyin(uap->argp, (caddr_t)&user_nfsdarg, sizeof(user_nfsdarg)); + } else { + struct nfsd_args tmp_args; + error = copyin(uap->argp, (caddr_t)&tmp_args, sizeof(tmp_args)); + if (error == 0) { + user_nfsdarg.sock = tmp_args.sock; + user_nfsdarg.name = CAST_USER_ADDR_T(tmp_args.name); + user_nfsdarg.namelen = tmp_args.namelen; + } + } if (error) return (error); - error = getsock(p->p_fd, nfsdarg.sock, &fp); + /* get the socket */ + error = file_socket(user_nfsdarg.sock, &so); if (error) return (error); - /* - * Get the client address for connected sockets. - */ - if (nfsdarg.name == NULL || nfsdarg.namelen == 0) - nam = (struct mbuf *)0; - else { - error = sockargs(&nam, nfsdarg.name, nfsdarg.namelen, - MT_SONAME); - if (error) + /* Get the client address for connected sockets. */ + if (user_nfsdarg.name == USER_ADDR_NULL || user_nfsdarg.namelen == 0) { + nam = NULL; + } else { + error = sockargs(&nam, user_nfsdarg.name, user_nfsdarg.namelen, MBUF_TYPE_SONAME); + if (error) { + /* drop the iocount file_socket() grabbed on the file descriptor */ + file_drop(user_nfsdarg.sock); return (error); + } } - error = nfssvc_addsock(fp, nam, p); - } else { + /* + * nfssvc_addsock() will grab a retain count on the socket + * to keep the socket from being closed when nfsd closes its + * file descriptor for it. 
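+	 * A minimal sketch of the intended pairing (the release side
+	 * lives in nfsrv_slpfree(), later in this file):
+	 *
+	 *	sock_retain(so);	// nfssvc_addsock(): socket outlives the fd
+	 *	...
+	 *	sock_release(so);	// nfsrv_slpfree(): drop that reference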
+ */ + error = nfssvc_addsock(so, nam, p); + /* drop the iocount file_socket() grabbed on the file descriptor */ + file_drop(user_nfsdarg.sock); + } else if (uap->flag & NFSSVC_NFSD) { error = copyin(uap->argp, (caddr_t)nsd, sizeof (*nsd)); if (error) return (error); @@ -486,14 +509,14 @@ nfssvc(p, uap) */ for (nuidp = NUIDHASH(slp,nsd->nsd_cr.cr_uid)->lh_first; nuidp != 0; nuidp = nuidp->nu_hash.le_next) { - if (nuidp->nu_cr.cr_uid == nsd->nsd_cr.cr_uid && + if (kauth_cred_getuid(nuidp->nu_cr) == nsd->nsd_cr.cr_uid && (!nfsd->nfsd_nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp), &nuidp->nu_haddr, nfsd->nfsd_nd->nd_nam2))) break; } if (nuidp) { - nfsrv_setcred(&nuidp->nu_cr,&nfsd->nfsd_nd->nd_cr); + nfsrv_setcred(nuidp->nu_cr,nfsd->nfsd_nd->nd_cr); nfsd->nfsd_nd->nd_flag |= ND_KERBFULL; } else { /* @@ -507,23 +530,36 @@ nfssvc(p, uap) } else nuidp = (struct nfsuid *)0; if ((slp->ns_flag & SLP_VALID) == 0) { - if (nuidp) + if (nuidp) { FREE_ZONE((caddr_t)nuidp, sizeof (struct nfsuid), M_NFSUID); + slp->ns_numuids--; + } } else { if (nuidp == (struct nfsuid *)0) { nuidp = slp->ns_uidlruhead.tqh_first; + if (!nuidp) + return (ENOMEM); LIST_REMOVE(nuidp, nu_hash); TAILQ_REMOVE(&slp->ns_uidlruhead, nuidp, nu_lru); if (nuidp->nu_flag & NU_NAM) - m_freem(nuidp->nu_nam); + mbuf_freem(nuidp->nu_nam); + kauth_cred_rele(nuidp->nu_cr); } nuidp->nu_flag = 0; - nuidp->nu_cr = nsd->nsd_cr; - if (nuidp->nu_cr.cr_ngroups > NGROUPS) - nuidp->nu_cr.cr_ngroups = NGROUPS; - nuidp->nu_cr.cr_ref = 1; + + if (nsd->nsd_cr.cr_ngroups > NGROUPS) + nsd->nsd_cr.cr_ngroups = NGROUPS; + + nfsrv_setcred(&nsd->nsd_cr, &temp_cred); + nuidp->nu_cr = kauth_cred_create(&temp_cred); + + if (!nuidp->nu_cr) { + FREE_ZONE(nuidp, sizeof(struct nfsuid), M_NFSUID); + slp->ns_numuids--; + return (ENOMEM); + } nuidp->nu_timestamp = nsd->nsd_timestamp; microtime(&now); nuidp->nu_expire = now.tv_sec + nsd->nsd_ttl; @@ -535,8 +571,7 @@ nfssvc(p, uap) if (nfsd->nfsd_nd->nd_nam2) { struct sockaddr_in *saddr; - saddr = mtod(nfsd->nfsd_nd->nd_nam2, - struct sockaddr_in *); + saddr = mbuf_data(nfsd->nfsd_nd->nd_nam2); switch (saddr->sin_family) { case AF_INET: nuidp->nu_flag |= NU_INETADDR; @@ -546,9 +581,15 @@ nfssvc(p, uap) case AF_ISO: default: nuidp->nu_flag |= NU_NAM; - nuidp->nu_nam = m_copym( - nfsd->nfsd_nd->nd_nam2, 0, - M_COPYALL, M_WAIT); + error = mbuf_copym(nfsd->nfsd_nd->nd_nam2, 0, + MBUF_COPYALL, MBUF_WAITOK, + &nuidp->nu_nam); + if (error) { + kauth_cred_rele(nuidp->nu_cr); + FREE_ZONE(nuidp, sizeof(struct nfsuid), M_NFSUID); + slp->ns_numuids--; + return (error); + } break; }; } @@ -556,8 +597,8 @@ nfssvc(p, uap) nu_lru); LIST_INSERT_HEAD(NUIDHASH(slp, nsd->nsd_uid), nuidp, nu_hash); - nfsrv_setcred(&nuidp->nu_cr, - &nfsd->nfsd_nd->nd_cr); + nfsrv_setcred(nuidp->nu_cr, + nfsd->nfsd_nd->nd_cr); nfsd->nfsd_nd->nd_flag |= ND_KERBFULL; } } @@ -565,6 +606,10 @@ nfssvc(p, uap) if ((uap->flag & NFSSVC_AUTHINFAIL) && (nfsd = nsd->nsd_nfsd)) nfsd->nfsd_flag |= NFSD_AUTHFAIL; error = nfssvc_nfsd(nsd, uap->argp, p); + } else if (uap->flag & NFSSVC_EXPORT) { + error = nfssvc_export(uap->argp, p); + } else { + error = EINVAL; } #endif /* NFS_NOSERVER */ if (error == EINTR || error == ERESTART) @@ -572,57 +617,156 @@ nfssvc(p, uap) return (error); } +/* + * NFSKERB client helper daemon. + * Gets authorization strings for "kerb" mounts. 
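+ * Hedged sketch of the userland side this loop assumes (not part of
+ * this change): the mount helper calls nfssvc(NFSSVC_MNTD, &ncd) and,
+ * each time ENEEDAUTH comes back, builds ncd_authstr/ncd_verfstr for
+ * ncd_authuid and retries with NFSSVC_GOTAUTH set.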
+ */ +static int +nfskerb_clientd( + struct nfsmount *nmp, + struct nfsd_cargs *ncd, + int flag, + user_addr_t argp, + proc_t p) +{ + struct nfsuid *nuidp, *nnuidp; + int error = 0; + struct nfsreq *rp; + struct timeval now; + + /* + * First initialize some variables + */ + microtime(&now); + + /* + * If an authorization string is being passed in, get it. + */ + if ((flag & NFSSVC_GOTAUTH) && (nmp->nm_state & NFSSTA_MOUNTED) && + ((nmp->nm_state & NFSSTA_WAITAUTH) == 0)) { + if (nmp->nm_state & NFSSTA_HASAUTH) + panic("cld kerb"); + if ((flag & NFSSVC_AUTHINFAIL) == 0) { + if (ncd->ncd_authlen <= nmp->nm_authlen && + ncd->ncd_verflen <= nmp->nm_verflen && + !copyin(CAST_USER_ADDR_T(ncd->ncd_authstr),nmp->nm_authstr,ncd->ncd_authlen)&& + !copyin(CAST_USER_ADDR_T(ncd->ncd_verfstr),nmp->nm_verfstr,ncd->ncd_verflen)){ + nmp->nm_authtype = ncd->ncd_authtype; + nmp->nm_authlen = ncd->ncd_authlen; + nmp->nm_verflen = ncd->ncd_verflen; +#if NFSKERB + nmp->nm_key = ncd->ncd_key; +#endif + } else + nmp->nm_state |= NFSSTA_AUTHERR; + } else + nmp->nm_state |= NFSSTA_AUTHERR; + nmp->nm_state |= NFSSTA_HASAUTH; + wakeup((caddr_t)&nmp->nm_authlen); + } else { + nmp->nm_state |= NFSSTA_WAITAUTH; + } + + /* + * Loop every second updating queue until there is a termination sig. + */ + while (nmp->nm_state & NFSSTA_MOUNTED) { + /* Get an authorization string, if required. */ + if ((nmp->nm_state & (NFSSTA_WAITAUTH | NFSSTA_HASAUTH)) == 0) { + ncd->ncd_authuid = nmp->nm_authuid; + if (copyout((caddr_t)ncd, argp, sizeof (struct nfsd_cargs))) + nmp->nm_state |= NFSSTA_WAITAUTH; + else + return (ENEEDAUTH); + } + /* Wait a bit (no pun) and do it again. */ + if ((nmp->nm_state & NFSSTA_MOUNTED) && + (nmp->nm_state & (NFSSTA_WAITAUTH | NFSSTA_HASAUTH))) { + error = tsleep((caddr_t)&nmp->nm_authstr, PSOCK | PCATCH, + "nfskrbtimr", hz / 3); + if (error == EINTR || error == ERESTART) + dounmount(nmp->nm_mountp, 0, p); + } + } + + /* + * Finally, we can free up the mount structure. + */ + for (nuidp = nmp->nm_uidlruhead.tqh_first; nuidp != 0; nuidp = nnuidp) { + nnuidp = nuidp->nu_lru.tqe_next; + LIST_REMOVE(nuidp, nu_hash); + TAILQ_REMOVE(&nmp->nm_uidlruhead, nuidp, nu_lru); + kauth_cred_rele(nuidp->nu_cr); + FREE_ZONE((caddr_t)nuidp, sizeof (struct nfsuid), M_NFSUID); + } + /* + * Loop through outstanding request list and remove dangling + * references to defunct nfsmount struct + */ + for (rp = nfs_reqq.tqh_first; rp; rp = rp->r_chain.tqe_next) + if (rp->r_nmp == nmp) + rp->r_nmp = (struct nfsmount *)0; + /* Need to wake up any rcvlock waiters so they notice the unmount. */ + if (nmp->nm_state & NFSSTA_WANTRCV) { + nmp->nm_state &= ~NFSSTA_WANTRCV; + wakeup(&nmp->nm_state); + } + FREE_ZONE((caddr_t)nmp, sizeof (struct nfsmount), M_NFSMNT); + if (error == EWOULDBLOCK) + error = 0; + return (error); +} + #ifndef NFS_NOSERVER /* * Adds a socket to the list for servicing by nfsds. 
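* The socket is checked against the per-protocol singletons, given
* send/receive buffer space for two maximally-sized packets, and wired
* to the nfsrv_rcv() upcall so inbound data wakes an nfsd. Worked
* sizing example for a TCP socket (constants from nfs.h):
*
*	siz = (NFS_MAXPACKET + sizeof (u_long)) * 2;
*	if (siz > NFS_MAXSOCKBUF)
*		siz = NFS_MAXSOCKBUF;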
*/ static int -nfssvc_addsock(fp, mynam, p) - struct file *fp; - struct mbuf *mynam; - struct proc *p; +nfssvc_addsock( + socket_t so, + mbuf_t mynam, + __unused proc_t p) { - register struct mbuf *m; - register int siz; - register struct nfssvc_sock *slp; - register struct socket *so; - struct nfssvc_sock *tslp; - int error, s; - - so = (struct socket *)fp->f_data; - tslp = (struct nfssvc_sock *)0; + int siz; + struct nfssvc_sock *slp; + struct nfssvc_sock *tslp = NULL; + int error, sodomain, sotype, soprotocol, on = 1; + struct timeval timeo; + + /* make sure mbuf constants are set up */ + if (!nfs_mbuf_mlen) + nfs_mbuf_init(); + + sock_gettype(so, &sodomain, &sotype, &soprotocol); + /* * Add it to the list, as required. */ - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - if (so->so_proto->pr_protocol == IPPROTO_UDP) { + if (soprotocol == IPPROTO_UDP) { tslp = nfs_udpsock; - if (tslp->ns_flag & SLP_VALID) { - m_freem(mynam); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + if (!tslp || (tslp->ns_flag & SLP_VALID)) { + mbuf_freem(mynam); return (EPERM); } #if ISO - } else if (so->so_proto->pr_protocol == ISOPROTO_CLTP) { + } else if (soprotocol == ISOPROTO_CLTP) { tslp = nfs_cltpsock; - if (tslp->ns_flag & SLP_VALID) { - m_freem(mynam); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + if (!tslp || (tslp->ns_flag & SLP_VALID)) { + mbuf_freem(mynam); return (EPERM); } #endif /* ISO */ } /* reserve buffer space for 2 maximally-sized packets */ siz = NFS_MAXPACKET; - if (so->so_type == SOCK_STREAM) + if (sotype == SOCK_STREAM) siz += sizeof (u_long); siz *= 2; if (siz > NFS_MAXSOCKBUF) siz = NFS_MAXSOCKBUF; - error = soreserve(so, siz, siz); - if (error) { - m_freem(mynam); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + if ((error = sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &siz, sizeof(siz))) || + (error = sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &siz, sizeof(siz)))) { + mbuf_freem(mynam); return (error); } @@ -631,62 +775,54 @@ nfssvc_addsock(fp, mynam, p) * reserve some space. For datagram sockets, this can get called * repeatedly for the same socket, but that isn't harmful. 
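* SO_KEEPALIVE lets dead client connections be noticed and reclaimed;
* TCP_NODELAY keeps Nagle's algorithm from delaying the short RPC
* replies. With the socket KPI each is a one-liner (sketch):
*
*	int on = 1;
*	sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
*	sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));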
*/ - if (so->so_type == SOCK_STREAM) { - struct sockopt sopt; - int val; - - bzero(&sopt, sizeof sopt); - sopt.sopt_dir = SOPT_SET; - sopt.sopt_level = SOL_SOCKET; - sopt.sopt_name = SO_KEEPALIVE; - sopt.sopt_val = &val; - sopt.sopt_valsize = sizeof val; - val = 1; - sosetopt(so, &sopt); + if (sotype == SOCK_STREAM) { + sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)); } - if (so->so_proto->pr_domain->dom_family == AF_INET && - so->so_proto->pr_protocol == IPPROTO_TCP) { - struct sockopt sopt; - int val; - - bzero(&sopt, sizeof sopt); - sopt.sopt_dir = SOPT_SET; - sopt.sopt_level = IPPROTO_TCP; - sopt.sopt_name = TCP_NODELAY; - sopt.sopt_val = &val; - sopt.sopt_valsize = sizeof val; - val = 1; - sosetopt(so, &sopt); + if (sodomain == AF_INET && soprotocol == IPPROTO_TCP) { + sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); } - so->so_rcv.sb_flags &= ~SB_NOINTR; - so->so_rcv.sb_timeo = 0; - so->so_snd.sb_flags &= ~SB_NOINTR; - so->so_snd.sb_timeo = 0; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - if (tslp) + sock_nointerrupt(so, 0); + + timeo.tv_usec = 0; + timeo.tv_sec = 0; + error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); + error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); + + if (tslp) { slp = tslp; - else { + lck_mtx_lock(nfsd_mutex); + } else { MALLOC(slp, struct nfssvc_sock *, sizeof(struct nfssvc_sock), M_NFSSVC, M_WAITOK); + if (!slp) { + mbuf_freem(mynam); + return (ENOMEM); + } bzero((caddr_t)slp, sizeof (struct nfssvc_sock)); + lck_rw_init(&slp->ns_rwlock, nfs_slp_rwlock_group, nfs_slp_lock_attr); + lck_mtx_init(&slp->ns_wgmutex, nfs_slp_mutex_group, nfs_slp_lock_attr); TAILQ_INIT(&slp->ns_uidlruhead); + lck_mtx_lock(nfsd_mutex); TAILQ_INSERT_TAIL(&nfssvc_sockhead, slp, ns_chain); } + + sock_retain(so); /* grab a retain count on the socket */ slp->ns_so = so; + slp->ns_sotype = sotype; slp->ns_nam = mynam; - slp->ns_fp = fp; - (void)fref(fp); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - s = splnet(); + + socket_lock(so, 1); so->so_upcallarg = (caddr_t)slp; so->so_upcall = nfsrv_rcv; so->so_rcv.sb_flags |= SB_UPCALL; /* required for freebsd merge */ - slp->ns_nflag = SLPN_NEEDQ; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - slp->ns_flag = SLP_VALID; + socket_unlock(so, 1); + + slp->ns_flag = SLP_VALID | SLP_NEEDQ; + nfsrv_wakenfsd(slp); - splx(s); + lck_mtx_unlock(nfsd_mutex); + return (0); } @@ -697,86 +833,96 @@ nfssvc_addsock(fp, mynam, p) static int nfssvc_nfsd(nsd, argp, p) struct nfsd_srvargs *nsd; - caddr_t argp; - struct proc *p; + user_addr_t argp; + proc_t p; { - register struct mbuf *m; - register int siz; - register struct nfssvc_sock *slp; - register struct socket *so; + mbuf_t m, mreq; + struct nfssvc_sock *slp; struct nfsd *nfsd = nsd->nsd_nfsd; struct nfsrv_descript *nd = NULL; - struct mbuf *mreq; - int error = 0, cacherep, s, sotype, writes_todo; - int procrastinate; + int error = 0, cacherep, writes_todo; + int siz, procrastinate; u_quad_t cur_usec; struct timeval now; + boolean_t funnel_state; #ifndef nolint cacherep = RC_DOIT; writes_todo = 0; #endif - s = splnet(); if (nfsd == (struct nfsd *)0) { MALLOC(nfsd, struct nfsd *, sizeof(struct nfsd), M_NFSD, M_WAITOK); + if (!nfsd) + return (ENOMEM); nsd->nsd_nfsd = nfsd; bzero((caddr_t)nfsd, sizeof (struct nfsd)); nfsd->nfsd_procp = p; + lck_mtx_lock(nfsd_mutex); TAILQ_INSERT_TAIL(&nfsd_head, nfsd, nfsd_chain); nfs_numnfsd++; + lck_mtx_unlock(nfsd_mutex); } + + funnel_state = 
thread_funnel_set(kernel_flock, FALSE); + /* * Loop getting rpc requests until SIGKILL. */ for (;;) { if ((nfsd->nfsd_flag & NFSD_REQINPROG) == 0) { - while (nfsd->nfsd_slp == (struct nfssvc_sock *)0 && - (nfsd_head_flag & NFSD_CHECKSLP) == 0) { + lck_mtx_lock(nfsd_mutex); + while ((nfsd->nfsd_slp == NULL) && !(nfsd_head_flag & NFSD_CHECKSLP)) { nfsd->nfsd_flag |= NFSD_WAITING; nfsd_waiting++; - error = tsleep((caddr_t)nfsd, PSOCK | PCATCH, - "nfsd", 0); + error = msleep(nfsd, nfsd_mutex, PSOCK | PCATCH, "nfsd", 0); nfsd_waiting--; - if (error) + if (error) { + lck_mtx_unlock(nfsd_mutex); goto done; + } } - if (nfsd->nfsd_slp == (struct nfssvc_sock *)0 && - (nfsd_head_flag & NFSD_CHECKSLP) != 0) { - for (slp = nfssvc_sockhead.tqh_first; slp != 0; - slp = slp->ns_chain.tqe_next) { + if ((nfsd->nfsd_slp == NULL) && (nfsd_head_flag & NFSD_CHECKSLP)) { + TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) { + lck_rw_lock_shared(&slp->ns_rwlock); if ((slp->ns_flag & (SLP_VALID | SLP_DOREC)) == (SLP_VALID | SLP_DOREC)) { + if (lck_rw_lock_shared_to_exclusive(&slp->ns_rwlock)) { + /* upgrade failed and we lost the lock; take exclusive and recheck */ + lck_rw_lock_exclusive(&slp->ns_rwlock); + if ((slp->ns_flag & (SLP_VALID | SLP_DOREC)) + != (SLP_VALID | SLP_DOREC)) { + /* flags no longer set, so skip this socket */ + lck_rw_done(&slp->ns_rwlock); + continue; + } + } slp->ns_flag &= ~SLP_DOREC; slp->ns_sref++; nfsd->nfsd_slp = slp; + lck_rw_done(&slp->ns_rwlock); break; } + lck_rw_done(&slp->ns_rwlock); } if (slp == 0) nfsd_head_flag &= ~NFSD_CHECKSLP; } - if ((slp = nfsd->nfsd_slp) == (struct nfssvc_sock *)0) + lck_mtx_unlock(nfsd_mutex); + if ((slp = nfsd->nfsd_slp) == NULL) continue; + lck_rw_lock_exclusive(&slp->ns_rwlock); if (slp->ns_flag & SLP_VALID) { - nfs_slplock(slp, 1); - if (slp->ns_nflag & SLPN_DISCONN) { + if (slp->ns_flag & SLP_DISCONN) { nfsrv_zapsock(slp); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - } else if (slp->ns_nflag & SLPN_NEEDQ) { - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - slp->ns_nflag &= ~SLPN_NEEDQ; - nfsrv_rcv(slp->ns_so, (caddr_t)slp, - M_WAIT); - } else - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + } else if (slp->ns_flag & SLP_NEEDQ) { + slp->ns_flag &= ~SLP_NEEDQ; + nfsrv_rcv_locked(slp->ns_so, slp, MBUF_WAITOK); + } error = nfsrv_dorec(slp, nfsd, &nd); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - nfs_slpunlock(slp); microuptime(&now); cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec; - if (error && slp->ns_tq.lh_first && - slp->ns_tq.lh_first->nd_time <= cur_usec) { + if (error && slp->ns_wgtime && (slp->ns_wgtime <= cur_usec)) { error = 0; cacherep = RC_DOIT; writes_todo = 1; @@ -784,6 +930,7 @@ nfssvc_nfsd(nsd, argp, p) writes_todo = 0; nfsd->nfsd_flag |= NFSD_REQINPROG; } + lck_rw_done(&slp->ns_rwlock); } else { error = 0; slp = nfsd->nfsd_slp; @@ -791,19 +938,18 @@ nfssvc_nfsd(nsd, argp, p) if (error || (slp->ns_flag & SLP_VALID) == 0) { if (nd) { if (nd->nd_nam2) - m_freem(nd->nd_nam2); + mbuf_freem(nd->nd_nam2); + if (nd->nd_cr) + kauth_cred_rele(nd->nd_cr); FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC); nd = NULL; } - nfsd->nfsd_slp = (struct nfssvc_sock *)0; + nfsd->nfsd_slp = NULL; nfsd->nfsd_flag &= ~NFSD_REQINPROG; nfsrv_slpderef(slp); continue; } - splx(s); - so = slp->ns_so; - sotype = so->so_type; if (nd) { microuptime(&nd->nd_starttime); if (nd->nd_nam2) @@ -816,42 +962,22 @@ nfssvc_nfsd(nsd, argp, p) */ if (nfsd->nfsd_flag & NFSD_NEEDAUTH) { nfsd->nfsd_flag &= ~NFSD_NEEDAUTH; - 
nsd->nsd_haddr = mtod(nd->nd_nam, - struct sockaddr_in *)->sin_addr.s_addr; + nsd->nsd_haddr = ((struct sockaddr_in *)mbuf_data(nd->nd_nam))->sin_addr.s_addr; nsd->nsd_authlen = nfsd->nfsd_authlen; nsd->nsd_verflen = nfsd->nfsd_verflen; - if (!copyout(nfsd->nfsd_authstr,nsd->nsd_authstr, + if (!copyout(nfsd->nfsd_authstr,CAST_USER_ADDR_T(nsd->nsd_authstr), nfsd->nfsd_authlen) && - !copyout(nfsd->nfsd_verfstr, nsd->nsd_verfstr, + !copyout(nfsd->nfsd_verfstr, CAST_USER_ADDR_T(nsd->nsd_verfstr), nfsd->nfsd_verflen) && - !copyout((caddr_t)nsd, argp, sizeof (*nsd))) + !copyout((caddr_t)nsd, argp, sizeof (*nsd))) { + thread_funnel_set(kernel_flock, funnel_state); return (ENEEDAUTH); + } cacherep = RC_DROPIT; } else cacherep = nfsrv_getcache(nd, slp, &mreq); - /* - * Check for just starting up for NQNFS and send - * fake "try again later" replies to the NQNFS clients. - */ - microtime(&now); - if (notstarted && nqnfsstarttime <= now.tv_sec) { - if (modify_flag) { - nqnfsstarttime = now.tv_sec + nqsrv_writeslack; - modify_flag = 0; - } else - notstarted = 0; - } - if (notstarted) { - if ((nd->nd_flag & ND_NQNFS) == 0) - cacherep = RC_DROPIT; - else if (nd->nd_procnum != NFSPROC_WRITE) { - nd->nd_procnum = NFSPROC_NOOP; - nd->nd_repstat = NQNFS_TRYLATER; - cacherep = RC_DOIT; - } else - modify_flag = 1; - } else if (nfsd->nfsd_flag & NFSD_AUTHFAIL) { + if (nfsd->nfsd_flag & NFSD_AUTHFAIL) { nfsd->nfsd_flag &= ~NFSD_AUTHFAIL; nd->nd_procnum = NFSPROC_NOOP; nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK); @@ -859,18 +985,20 @@ nfssvc_nfsd(nsd, argp, p) } else if (nfs_privport) { /* Check if source port is privileged */ u_short port; - struct sockaddr *nam = mtod(nd->nd_nam, struct sockaddr*); + struct sockaddr *nam = mbuf_data(nd->nd_nam); struct sockaddr_in *sin; sin = (struct sockaddr_in *)nam; port = ntohs(sin->sin_port); if (port >= IPPORT_RESERVED && nd->nd_procnum != NFSPROC_NULL) { + char strbuf[MAX_IPv4_STR_LEN]; nd->nd_procnum = NFSPROC_NOOP; nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK); cacherep = RC_DOIT; printf("NFS request from unprivileged port (%s:%d)\n", - (char *)(inet_ntoa(sin->sin_addr)), port); + inet_ntop(AF_INET, &sin->sin_addr, strbuf, sizeof(strbuf)), + port); } } @@ -887,94 +1015,100 @@ nfssvc_nfsd(nsd, argp, p) procrastinate = nfsrvw_procrastinate_v3; else procrastinate = nfsrvw_procrastinate; - if (writes_todo || (nd->nd_procnum == NFSPROC_WRITE && - procrastinate > 0 && !notstarted)) - error = nfsrv_writegather(&nd, slp, - nfsd->nfsd_procp, &mreq); + lck_rw_lock_shared(&nfs_export_rwlock); + if (writes_todo || ((nd->nd_procnum == NFSPROC_WRITE) && (procrastinate > 0))) + error = nfsrv_writegather(&nd, slp, nfsd->nfsd_procp, &mreq); else - error = (*(nfsrv3_procs[nd->nd_procnum]))(nd, - slp, nfsd->nfsd_procp, &mreq); + error = (*(nfsrv3_procs[nd->nd_procnum]))(nd, slp, nfsd->nfsd_procp, &mreq); + lck_rw_done(&nfs_export_rwlock); if (mreq == NULL) break; if (error) { - if (nd->nd_procnum != NQNFSPROC_VACATED) - nfsstats.srv_errs++; + OSAddAtomic(1, (SInt32*)&nfsstats.srv_errs); nfsrv_updatecache(nd, FALSE, mreq); if (nd->nd_nam2) { - m_freem(nd->nd_nam2); + mbuf_freem(nd->nd_nam2); nd->nd_nam2 = NULL; } break; } - nfsstats.srvrpccnt[nd->nd_procnum]++; + OSAddAtomic(1, (SInt32*)&nfsstats.srvrpccnt[nd->nd_procnum]); nfsrv_updatecache(nd, TRUE, mreq); - nd->nd_mrep = (struct mbuf *)0; + nd->nd_mrep = NULL; case RC_REPLY: m = mreq; siz = 0; while (m) { - siz += m->m_len; - m = m->m_next; + siz += mbuf_len(m); + m = mbuf_next(m); } if (siz <= 0 || siz > NFS_MAXPACKET) { 
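				/* a reply outside (0, NFS_MAXPACKET] means the mbuf chain was mangled; better to panic than send a corrupt record */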
printf("mbuf siz=%d\n",siz); panic("Bad nfs svc reply"); } m = mreq; - m->m_pkthdr.len = siz; - m->m_pkthdr.rcvif = (struct ifnet *)0; + mbuf_pkthdr_setlen(m, siz); + error = mbuf_pkthdr_setrcvif(m, NULL); + if (error) + panic("nfsd setrcvif failed: %d", error); /* * For stream protocols, prepend a Sun RPC * Record Mark. */ - if (sotype == SOCK_STREAM) { - M_PREPEND(m, NFSX_UNSIGNED, M_WAIT); - *mtod(m, u_long *) = htonl(0x80000000 | siz); + if (slp->ns_sotype == SOCK_STREAM) { + error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK); + if (!error) + *(u_long*)mbuf_data(m) = htonl(0x80000000 | siz); } - if (so->so_proto->pr_flags & PR_CONNREQUIRED) - (void) nfs_slplock(slp, 1); - if (slp->ns_flag & SLP_VALID) - error = nfs_send(so, nd->nd_nam2, m, NULL); - else { - error = EPIPE; - m_freem(m); + if (!error) { + if (slp->ns_flag & SLP_VALID) { + error = nfs_send(slp->ns_so, nd->nd_nam2, m, NULL); + } else { + error = EPIPE; + mbuf_freem(m); + } + } else { + mbuf_freem(m); } mreq = NULL; if (nfsrtton) - nfsd_rt(sotype, nd, cacherep); + nfsd_rt(slp->ns_sotype, nd, cacherep); if (nd->nd_nam2) { - MFREE(nd->nd_nam2, m); + mbuf_freem(nd->nd_nam2); nd->nd_nam2 = NULL; } if (nd->nd_mrep) { - m_freem(nd->nd_mrep); + mbuf_freem(nd->nd_mrep); nd->nd_mrep = NULL; } - if (error == EPIPE) + if (error == EPIPE) { + lck_rw_lock_exclusive(&slp->ns_rwlock); nfsrv_zapsock(slp); - if (so->so_proto->pr_flags & PR_CONNREQUIRED) - nfs_slpunlock(slp); + lck_rw_done(&slp->ns_rwlock); + } if (error == EINTR || error == ERESTART) { - FREE_ZONE((caddr_t)nd, - sizeof *nd, M_NFSRVDESC); + if (nd->nd_cr) + kauth_cred_rele(nd->nd_cr); + FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC); nfsrv_slpderef(slp); - s = splnet(); goto done; } break; case RC_DROPIT: if (nfsrtton) - nfsd_rt(sotype, nd, cacherep); - m_freem(nd->nd_mrep); - m_freem(nd->nd_nam2); + nfsd_rt(slp->ns_sotype, nd, cacherep); + mbuf_freem(nd->nd_mrep); + mbuf_freem(nd->nd_nam2); nd->nd_mrep = nd->nd_nam2 = NULL; break; }; if (nd) { if (nd->nd_mrep) - m_freem(nd->nd_mrep); + mbuf_freem(nd->nd_mrep); if (nd->nd_nam2) - m_freem(nd->nd_nam2); + mbuf_freem(nd->nd_nam2); + if (nd->nd_cr) + kauth_cred_rele(nd->nd_cr); FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC); nd = NULL; } @@ -986,34 +1120,71 @@ nfssvc_nfsd(nsd, argp, p) microuptime(&now); cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec; - s = splsoftclock(); - if (slp->ns_tq.lh_first && - slp->ns_tq.lh_first->nd_time <= cur_usec) { + if (slp->ns_wgtime && (slp->ns_wgtime <= cur_usec)) { cacherep = RC_DOIT; writes_todo = 1; - } else + } else { writes_todo = 0; - splx(s); + } } while (writes_todo); - s = splnet(); - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + lck_rw_lock_exclusive(&slp->ns_rwlock); if (nfsrv_dorec(slp, nfsd, &nd)) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + lck_rw_done(&slp->ns_rwlock); nfsd->nfsd_flag &= ~NFSD_REQINPROG; nfsd->nfsd_slp = NULL; nfsrv_slpderef(slp); - } else - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + } else { + lck_rw_done(&slp->ns_rwlock); + } } done: + thread_funnel_set(kernel_flock, funnel_state); + lck_mtx_lock(nfsd_mutex); TAILQ_REMOVE(&nfsd_head, nfsd, nfsd_chain); - splx(s); - _FREE((caddr_t)nfsd, M_NFSD); + FREE(nfsd, M_NFSD); nsd->nsd_nfsd = (struct nfsd *)0; if (--nfs_numnfsd == 0) nfsrv_init(TRUE); /* Reinitialize everything */ + lck_mtx_unlock(nfsd_mutex); return (error); } + +static int +nfssvc_export(user_addr_t argp, proc_t p) +{ + int error = 0, is_64bit; + struct user_nfs_export_args unxa; + struct 
vfs_context context; + + context.vc_proc = p; + context.vc_ucred = kauth_cred_get(); + is_64bit = IS_64BIT_PROCESS(p); + + /* copy in pointers to path and export args */ + if (is_64bit) { + error = copyin(argp, (caddr_t)&unxa, sizeof(unxa)); + } else { + struct nfs_export_args tnxa; + error = copyin(argp, (caddr_t)&tnxa, sizeof(tnxa)); + if (error == 0) { + /* munge into LP64 version of nfs_export_args structure */ + unxa.nxa_fsid = tnxa.nxa_fsid; + unxa.nxa_expid = tnxa.nxa_expid; + unxa.nxa_fspath = CAST_USER_ADDR_T(tnxa.nxa_fspath); + unxa.nxa_exppath = CAST_USER_ADDR_T(tnxa.nxa_exppath); + unxa.nxa_flags = tnxa.nxa_flags; + unxa.nxa_netcount = tnxa.nxa_netcount; + unxa.nxa_nets = CAST_USER_ADDR_T(tnxa.nxa_nets); + } + } + if (error) + return (error); + + error = nfsrv_export(&unxa, &context); + + return (error); +} + #endif /* NFS_NOSERVER */ int nfs_defect = 0; @@ -1022,14 +1193,8 @@ int nfs_defect = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, defect, CTLFLAG_RW, &nfs_defect, 0, ""); #endif -#ifndef _SYS_SYSPROTO_H_ -struct nfsclnt_args { - int flag; - caddr_t argp; -}; -#endif int -nfsclnt(struct proc *p, struct nfsclnt_args *uap) +nfsclnt(proc_t p, struct nfsclnt_args *uap, __unused int *retval) { struct lockd_ans la; int error; @@ -1042,7 +1207,7 @@ nfsclnt(struct proc *p, struct nfsclnt_args *uap) return (error != 0 ? error : nfslockdans(p, &la)); } if (uap->flag == NFSCLNT_LOCKDFD) - return (nfslockdfd(p, (int)uap->argp)); + return (nfslockdfd(p, CAST_DOWN(int, uap->argp))); return EINVAL; } @@ -1055,12 +1220,9 @@ static int nfssvc_iod_continue(int); * Never returns unless it fails or gets killed. */ static int -nfssvc_iod(p) - struct proc *p; +nfssvc_iod(__unused proc_t p) { register int i, myiod; - struct nfsmount *nmp; - int error = 0; struct uthread *ut; /* @@ -1079,106 +1241,131 @@ nfssvc_iod(p) /* stuff myiod into uthread to get off local stack for continuation */ - ut = (struct uthread *)get_bsdthread_info(current_act()); + ut = (struct uthread *)get_bsdthread_info(current_thread()); ut->uu_state.uu_nfs_myiod = myiod; /* squirrel away for continuation */ nfssvc_iod_continue(0); /* NOTREACHED */ - + return (0); } /* * Continuation for Asynchronous I/O daemons for client nfs. */ static int -nfssvc_iod_continue(error) +nfssvc_iod_continue(int error) { register struct nfsbuf *bp; register int i, myiod; struct nfsmount *nmp; struct uthread *ut; - struct proc *p; + proc_t p; /* * real myiod is stored in uthread, recover it */ - ut = (struct uthread *)get_bsdthread_info(current_act()); + ut = (struct uthread *)get_bsdthread_info(current_thread()); myiod = ut->uu_state.uu_nfs_myiod; - p = current_proc(); + p = current_proc(); // XXX /* * Just loop around doin our stuff until SIGKILL * - actually we don't loop with continuations... 
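* (The msleep0() below names nfssvc_iod_continue itself as the
* continuation, so an idle iod gives up its kernel stack and re-enters
* this function from the top when woken, rather than returning here.)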
*/ + lck_mtx_lock(nfs_iod_mutex); for (;;) { while (((nmp = nfs_iodmount[myiod]) == NULL || nmp->nm_bufq.tqh_first == NULL) && error == 0 && nfs_ioddelwri == 0) { if (nmp) nmp->nm_bufqiods--; - nfs_iodwant[myiod] = p; + nfs_iodwant[myiod] = p; // XXX this doesn't need to be a proc_t nfs_iodmount[myiod] = NULL; - error = tsleep0((caddr_t)&nfs_iodwant[myiod], - PWAIT | PCATCH, "nfsidl", 0, nfssvc_iod_continue); - /* NOTREACHED */ + error = msleep0((caddr_t)&nfs_iodwant[myiod], nfs_iod_mutex, + PWAIT | PCATCH | PDROP, "nfsidl", 0, nfssvc_iod_continue); + lck_mtx_lock(nfs_iod_mutex); } if (error) { nfs_asyncdaemon[myiod] = 0; if (nmp) nmp->nm_bufqiods--; nfs_iodwant[myiod] = NULL; nfs_iodmount[myiod] = NULL; + lck_mtx_unlock(nfs_iod_mutex); nfs_numasync--; if (error == EINTR || error == ERESTART) error = 0; unix_syscall_return(error); } if (nmp != NULL) { - while ((bp = nmp->nm_bufq.tqh_first) != NULL) { + while ((bp = TAILQ_FIRST(&nmp->nm_bufq)) != NULL) { /* Take one off the front of the list */ TAILQ_REMOVE(&nmp->nm_bufq, bp, nb_free); bp->nb_free.tqe_next = NFSNOLIST; nmp->nm_bufqlen--; if (nmp->nm_bufqwant && nmp->nm_bufqlen < 2 * nfs_numasync) { nmp->nm_bufqwant = FALSE; + lck_mtx_unlock(nfs_iod_mutex); wakeup(&nmp->nm_bufq); + } else { + lck_mtx_unlock(nfs_iod_mutex); } + + SET(bp->nb_flags, NB_IOD); if (ISSET(bp->nb_flags, NB_READ)) - (void) nfs_doio(bp, bp->nb_rcred, (struct proc *)0); + nfs_doio(bp, bp->nb_rcred, NULL); else - (void) nfs_doio(bp, bp->nb_wcred, (struct proc *)0); + nfs_doio(bp, bp->nb_wcred, NULL); + lck_mtx_lock(nfs_iod_mutex); /* * If there are more than one iod on this mount, then defect * so that the iods can be shared out fairly between the mounts */ if (nfs_defect && nmp->nm_bufqiods > 1) { - NFS_DPF(ASYNCIO, - ("nfssvc_iod: iod %d defecting from mount %p\n", - myiod, nmp)); nfs_iodmount[myiod] = NULL; nmp->nm_bufqiods--; break; } } } + lck_mtx_unlock(nfs_iod_mutex); + if (nfs_ioddelwri) { i = 0; nfs_ioddelwri = 0; + lck_mtx_lock(nfs_buf_mutex); while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) { struct nfsnode *np = VTONFS(bp->nb_vp); nfs_buf_remfree(bp); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN); + nfs_buf_refrele(bp); + if (error) + break; + if (!bp->nb_vp) { + /* buffer is no longer valid */ + nfs_buf_drop(bp); + continue; + } if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { /* put buffer at end of delwri list */ TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); nfsbufdelwricnt++; - nfs_flushcommits(np->n_vnode, (struct proc *)0); + nfs_buf_drop(bp); + lck_mtx_unlock(nfs_buf_mutex); + nfs_flushcommits(np->n_vnode, NULL, 1); } else { - SET(bp->nb_flags, (NB_BUSY | NB_ASYNC | NB_IOD)); + SET(bp->nb_flags, (NB_ASYNC | NB_IOD)); + lck_mtx_unlock(nfs_buf_mutex); nfs_buf_write(bp); } i++; + lck_mtx_lock(nfs_buf_mutex); } + lck_mtx_unlock(nfs_buf_mutex); } + + lck_mtx_lock(nfs_iod_mutex); } } @@ -1190,52 +1377,23 @@ nfssvc_iod_continue(error) * reassigned during cleanup. 
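* Teardown ordering, as a sketch: nfsrv_zapsock() only clears the
* socket state, detaches the upcall and shuts the socket down;
* nfsrv_slpderef() drops ns_sref; and nfsrv_slpfree() finally releases
* the socket and frees the structure once the last reference is gone.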
*/ static void -nfsrv_zapsock(slp) - register struct nfssvc_sock *slp; +nfsrv_zapsock(struct nfssvc_sock *slp) { - register struct nfsuid *nuidp, *nnuidp; - register struct nfsrv_descript *nwp, *nnwp; - struct socket *so; - struct file *fp; - struct mbuf *m; - int s; + socket_t so; + if ((slp->ns_flag & SLP_VALID) == 0) + return; slp->ns_flag &= ~SLP_ALLFLAGS; - slp->ns_nflag &= ~SLP_ALLFLAGS; - fp = slp->ns_fp; - if (fp) { - slp->ns_fp = (struct file *)0; - so = slp->ns_so; - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - so->so_upcall = NULL; - so->so_rcv.sb_flags &= ~SB_UPCALL; - soshutdown(so, 2); - if (slp->ns_nam) - MFREE(slp->ns_nam, m); - m_freem(slp->ns_raw); - m_freem(slp->ns_rec); - slp->ns_nam = slp->ns_raw = slp->ns_rec = NULL; - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - closef(fp, (struct proc *)0); - for (nuidp = slp->ns_uidlruhead.tqh_first; nuidp != 0; - nuidp = nnuidp) { - nnuidp = nuidp->nu_lru.tqe_next; - LIST_REMOVE(nuidp, nu_hash); - TAILQ_REMOVE(&slp->ns_uidlruhead, nuidp, nu_lru); - if (nuidp->nu_flag & NU_NAM) - m_freem(nuidp->nu_nam); - FREE_ZONE((caddr_t)nuidp, - sizeof (struct nfsuid), M_NFSUID); - } - s = splsoftclock(); - for (nwp = slp->ns_tq.lh_first; nwp; nwp = nnwp) { - nnwp = nwp->nd_tq.le_next; - LIST_REMOVE(nwp, nd_tq); - FREE_ZONE((caddr_t)nwp, sizeof *nwp, M_NFSRVDESC); - } - LIST_INIT(&slp->ns_tq); - splx(s); - } + + so = slp->ns_so; + if (so == NULL) + return; + + socket_lock(so, 1); + so->so_upcall = NULL; + so->so_rcv.sb_flags &= ~SB_UPCALL; + socket_unlock(so, 1); + sock_shutdown(so, SHUT_RDWR); } /* @@ -1246,7 +1404,7 @@ int nfs_getauth(nmp, rep, cred, auth_str, auth_len, verf_str, verf_len, key) register struct nfsmount *nmp; struct nfsreq *rep; - struct ucred *cred; + kauth_cred_t cred; char **auth_str; int *auth_len; char *verf_str; @@ -1265,13 +1423,16 @@ nfs_getauth(nmp, rep, cred, auth_str, auth_len, verf_str, verf_len, key) return (error); } } - nmp->nm_state &= ~(NFSSTA_WAITAUTH | NFSSTA_WANTAUTH); + nmp->nm_state &= ~NFSSTA_WANTAUTH; MALLOC(*auth_str, char *, RPCAUTH_MAXSIZ, M_TEMP, M_WAITOK); + if (!*auth_str) + return (ENOMEM); nmp->nm_authstr = *auth_str; nmp->nm_authlen = RPCAUTH_MAXSIZ; nmp->nm_verfstr = verf_str; nmp->nm_verflen = *verf_len; - nmp->nm_authuid = cred->cr_uid; + nmp->nm_authuid = kauth_cred_getuid(cred); + nmp->nm_state &= ~NFSSTA_WAITAUTH; wakeup((caddr_t)&nmp->nm_authstr); /* @@ -1287,7 +1448,7 @@ nfs_getauth(nmp, rep, cred, auth_str, auth_len, verf_str, verf_len, key) error = EAUTH; } if (error) - _FREE((caddr_t)*auth_str, M_TEMP); + FREE(*auth_str, M_TEMP); else { *auth_len = nmp->nm_authlen; *verf_len = nmp->nm_verflen; @@ -1306,13 +1467,13 @@ nfs_getauth(nmp, rep, cred, auth_str, auth_len, verf_str, verf_len, key) * Get a nickname authenticator and verifier. 
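* A nickname is the cheap follow-up to a full Kerberos exchange: the
* authenticator built below is just the pair
*
*	{ txdr_unsigned(RPCAKN_NICKNAME), txdr_unsigned(nu_nickname) }
*
* plus an encrypted-timestamp verifier (per the Kerberos RPC scheme),
* usable until nu_expire.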
*/ int -nfs_getnickauth(nmp, cred, auth_str, auth_len, verf_str, verf_len) - struct nfsmount *nmp; - struct ucred *cred; - char **auth_str; - int *auth_len; - char *verf_str; - int verf_len; +nfs_getnickauth( + struct nfsmount *nmp, + kauth_cred_t cred, + char **auth_str, + int *auth_len, + char *verf_str, + __unused int verf_len) { register struct nfsuid *nuidp; register u_long *nickp, *verfp; @@ -1322,22 +1483,25 @@ nfs_getnickauth(nmp, cred, auth_str, auth_len, verf_str, verf_len) if (verf_len < (4 * NFSX_UNSIGNED)) panic("nfs_getnickauth verf too small"); #endif - for (nuidp = NMUIDHASH(nmp, cred->cr_uid)->lh_first; + for (nuidp = NMUIDHASH(nmp, kauth_cred_getuid(cred))->lh_first; nuidp != 0; nuidp = nuidp->nu_hash.le_next) { - if (nuidp->nu_cr.cr_uid == cred->cr_uid) + if (kauth_cred_getuid(nuidp->nu_cr) == kauth_cred_getuid(cred)) break; } microtime(&now); if (!nuidp || nuidp->nu_expire < now.tv_sec) return (EACCES); + MALLOC(nickp, u_long *, 2 * NFSX_UNSIGNED, M_TEMP, M_WAITOK); + if (!nickp) + return (ENOMEM); + /* * Move to the end of the lru list (end of lru == most recently used). */ TAILQ_REMOVE(&nmp->nm_uidlruhead, nuidp, nu_lru); TAILQ_INSERT_TAIL(&nmp->nm_uidlruhead, nuidp, nu_lru); - MALLOC(nickp, u_long *, 2 * NFSX_UNSIGNED, M_TEMP, M_WAITOK); *nickp++ = txdr_unsigned(RPCAKN_NICKNAME); *nickp = txdr_unsigned(nuidp->nu_nickname); *auth_str = (char *)nickp; @@ -1378,17 +1542,17 @@ nfs_getnickauth(nmp, cred, auth_str, auth_len, verf_str, verf_len) int nfs_savenickauth(nmp, cred, len, key, mdp, dposp, mrep) register struct nfsmount *nmp; - struct ucred *cred; + kauth_cred_t cred; int len; NFSKERBKEY_T key; - struct mbuf **mdp; + mbuf_t *mdp; char **dposp; - struct mbuf *mrep; + mbuf_t mrep; { register struct nfsuid *nuidp; register u_long *tl; register long t1; - struct mbuf *md = *mdp; + mbuf_t md = *mdp; struct timeval ktvin, ktvout, now; u_long nick; char *dpos = *dposp, *cp2; @@ -1422,20 +1586,27 @@ nfs_savenickauth(nmp, cred, len, key, mdp, dposp, mrep) sizeof (struct nfsuid), M_NFSUID, M_WAITOK); } else { + nuidp = NULL; + } + if (!nuidp) { nuidp = nmp->nm_uidlruhead.tqh_first; + if (!nuidp) { + error = ENOMEM; + goto nfsmout; + } LIST_REMOVE(nuidp, nu_hash); - TAILQ_REMOVE(&nmp->nm_uidlruhead, nuidp, - nu_lru); + TAILQ_REMOVE(&nmp->nm_uidlruhead, nuidp, nu_lru); + kauth_cred_rele(nuidp->nu_cr); } nuidp->nu_flag = 0; - nuidp->nu_cr.cr_uid = cred->cr_uid; + kauth_cred_ref(cred); + nuidp->nu_cr = cred; nuidp->nu_expire = now.tv_sec + NFS_KERBTTL; nuidp->nu_timestamp = ktvout; nuidp->nu_nickname = nick; bcopy(key, nuidp->nu_key, sizeof (key)); - TAILQ_INSERT_TAIL(&nmp->nm_uidlruhead, nuidp, - nu_lru); - LIST_INSERT_HEAD(NMUIDHASH(nmp, cred->cr_uid), + TAILQ_INSERT_TAIL(&nmp->nm_uidlruhead, nuidp, nu_lru); + LIST_INSERT_HEAD(NMUIDHASH(nmp, kauth_cred_getuid(cred)), nuidp, nu_hash); } } else @@ -1449,57 +1620,75 @@ nfsmout: #ifndef NFS_NOSERVER /* - * Derefence a server socket structure. If it has no more references and - * is no longer valid, you can throw it away. + * Clean up and release a server socket structure. */ -void -nfsrv_slpderef(slp) - register struct nfssvc_sock *slp; +static void +nfsrv_slpfree(struct nfssvc_sock *slp) { - if (--(slp->ns_sref) == 0 && (slp->ns_flag & SLP_VALID) == 0) { - TAILQ_REMOVE(&nfssvc_sockhead, slp, ns_chain); - _FREE((caddr_t)slp, M_NFSSVC); - } -} + struct nfsuid *nuidp, *nnuidp; + struct nfsrv_descript *nwp, *nnwp; -/* - * Lock a socket against others. 
- */ -int -nfs_slplock(slp, wait) - register struct nfssvc_sock *slp; - int wait; -{ - int *statep = &slp->ns_solock; + if (slp->ns_so) { + sock_release(slp->ns_so); + slp->ns_so = NULL; + } + if (slp->ns_nam) + mbuf_free(slp->ns_nam); + if (slp->ns_raw) + mbuf_freem(slp->ns_raw); + if (slp->ns_rec) + mbuf_freem(slp->ns_rec); + slp->ns_nam = slp->ns_raw = slp->ns_rec = NULL; + + for (nuidp = slp->ns_uidlruhead.tqh_first; nuidp != 0; + nuidp = nnuidp) { + nnuidp = nuidp->nu_lru.tqe_next; + LIST_REMOVE(nuidp, nu_hash); + TAILQ_REMOVE(&slp->ns_uidlruhead, nuidp, nu_lru); + if (nuidp->nu_flag & NU_NAM) + mbuf_freem(nuidp->nu_nam); + kauth_cred_rele(nuidp->nu_cr); + FREE_ZONE((caddr_t)nuidp, + sizeof (struct nfsuid), M_NFSUID); + } - if (!wait && (*statep & NFSSTA_SNDLOCK)) - return(0); /* already locked, fail */ - while (*statep & NFSSTA_SNDLOCK) { - *statep |= NFSSTA_WANTSND; - (void) tsleep((caddr_t)statep, PZERO - 1, "nfsslplck", 0); + for (nwp = slp->ns_tq.lh_first; nwp; nwp = nnwp) { + nnwp = nwp->nd_tq.le_next; + LIST_REMOVE(nwp, nd_tq); + if (nwp->nd_cr) + kauth_cred_rele(nwp->nd_cr); + FREE_ZONE((caddr_t)nwp, sizeof *nwp, M_NFSRVDESC); } - *statep |= NFSSTA_SNDLOCK; - return (1); + LIST_INIT(&slp->ns_tq); + + lck_rw_destroy(&slp->ns_rwlock, nfs_slp_rwlock_group); + lck_mtx_destroy(&slp->ns_wgmutex, nfs_slp_mutex_group); + FREE(slp, M_NFSSVC); } /* - * Unlock the stream socket for others. + * Dereference a server socket structure. If it has no more references and + * is no longer valid, you can throw it away. */ void -nfs_slpunlock(slp) - struct nfssvc_sock *slp; +nfsrv_slpderef(struct nfssvc_sock *slp) { - int *statep = &slp->ns_solock; - - if ((*statep & NFSSTA_SNDLOCK) == 0) - panic("nfs slpunlock"); - *statep &= ~NFSSTA_SNDLOCK; - if (*statep & NFSSTA_WANTSND) { - *statep &= ~NFSSTA_WANTSND; - wakeup((caddr_t)statep); + lck_mtx_lock(nfsd_mutex); + lck_rw_lock_exclusive(&slp->ns_rwlock); + slp->ns_sref--; + if (slp->ns_sref || (slp->ns_flag & SLP_VALID)) { + lck_rw_done(&slp->ns_rwlock); + lck_mtx_unlock(nfsd_mutex); + return; } + + TAILQ_REMOVE(&nfssvc_sockhead, slp, ns_chain); + lck_mtx_unlock(nfsd_mutex); + + nfsrv_slpfree(slp); } + /* * Initialize the data structures for the server. 
* Handshake with any new nfsds starting up to avoid any chance of @@ -1509,22 +1698,24 @@ void nfsrv_init(terminating) int terminating; { - register struct nfssvc_sock *slp, *nslp; + struct nfssvc_sock *slp, *nslp; - if (nfssvc_sockhead_flag & SLP_INIT) - panic("nfsd init"); - nfssvc_sockhead_flag |= SLP_INIT; if (terminating) { - for (slp = nfssvc_sockhead.tqh_first; slp != 0; slp = nslp) { - nslp = slp->ns_chain.tqe_next; - if (slp->ns_flag & SLP_VALID) + for (slp = TAILQ_FIRST(&nfssvc_sockhead); slp != 0; slp = nslp) { + nslp = TAILQ_NEXT(slp, ns_chain); + if (slp->ns_flag & SLP_VALID) { + lck_rw_lock_exclusive(&slp->ns_rwlock); nfsrv_zapsock(slp); + lck_rw_done(&slp->ns_rwlock); + } TAILQ_REMOVE(&nfssvc_sockhead, slp, ns_chain); - _FREE((caddr_t)slp, M_NFSSVC); + /* grab the lock one final time in case anyone's using it */ + lck_rw_lock_exclusive(&slp->ns_rwlock); + nfsrv_slpfree(slp); } nfsrv_cleancache(); /* And clear out server cache */ -/* XXX CSM 12/4/97 Revisit when enabling WebNFS */ -#ifdef notyet +/* XXX Revisit when enabling WebNFS */ +#ifdef WEBNFS_ENABLED } else nfs_pub.np_valid = 0; #else @@ -1532,26 +1723,33 @@ nfsrv_init(terminating) #endif TAILQ_INIT(&nfssvc_sockhead); - nfssvc_sockhead_flag &= ~SLP_INIT; - if (nfssvc_sockhead_flag & SLP_WANTINIT) { - nfssvc_sockhead_flag &= ~SLP_WANTINIT; - wakeup((caddr_t)&nfssvc_sockhead); - } TAILQ_INIT(&nfsd_head); nfsd_head_flag &= ~NFSD_CHECKSLP; MALLOC(nfs_udpsock, struct nfssvc_sock *, sizeof(struct nfssvc_sock), M_NFSSVC, M_WAITOK); - bzero((caddr_t)nfs_udpsock, sizeof (struct nfssvc_sock)); - TAILQ_INIT(&nfs_udpsock->ns_uidlruhead); - TAILQ_INSERT_HEAD(&nfssvc_sockhead, nfs_udpsock, ns_chain); + if (nfs_udpsock) { + bzero((caddr_t)nfs_udpsock, sizeof (struct nfssvc_sock)); + lck_rw_init(&nfs_udpsock->ns_rwlock, nfs_slp_rwlock_group, nfs_slp_lock_attr); + TAILQ_INIT(&nfs_udpsock->ns_uidlruhead); + TAILQ_INSERT_HEAD(&nfssvc_sockhead, nfs_udpsock, ns_chain); + } else { + printf("nfsrv_init() failed to allocate UDP socket\n"); + } +#if ISO MALLOC(nfs_cltpsock, struct nfssvc_sock *, sizeof(struct nfssvc_sock), M_NFSSVC, M_WAITOK); - bzero((caddr_t)nfs_cltpsock, sizeof (struct nfssvc_sock)); - TAILQ_INIT(&nfs_cltpsock->ns_uidlruhead); - TAILQ_INSERT_TAIL(&nfssvc_sockhead, nfs_cltpsock, ns_chain); + if (nfs_cltpsock) { + bzero((caddr_t)nfs_cltpsock, sizeof (struct nfssvc_sock)); + lck_rw_init(&nfs_cltpsock->ns_rwlock, nfs_slp_rwlock_group, nfs_slp_lock_attr); + TAILQ_INIT(&nfs_cltpsock->ns_uidlruhead); + TAILQ_INSERT_TAIL(&nfssvc_sockhead, nfs_cltpsock, ns_chain); + } else { + printf("nfsrv_init() failed to allocate CLTP socket\n"); + } +#endif } /* @@ -1575,13 +1773,11 @@ nfsd_rt(sotype, nd, cacherep) rt->flag = DRT_CACHEDROP; if (sotype == SOCK_STREAM) rt->flag |= DRT_TCP; - if (nd->nd_flag & ND_NQNFS) - rt->flag |= DRT_NQNFS; else if (nd->nd_flag & ND_NFSV3) rt->flag |= DRT_NFSV3; rt->proc = nd->nd_procnum; - if (mtod(nd->nd_nam, struct sockaddr *)->sa_family == AF_INET) - rt->ipadr = mtod(nd->nd_nam, struct sockaddr_in *)->sin_addr.s_addr; + if (((struct sockaddr *)mbuf_data(nd->nd_nam))->sa_family == AF_INET) + rt->ipadr = ((struct sockaddr_in *)mbuf_data(nd->nd_nam))->sin_addr.s_addr; else rt->ipadr = INADDR_ANY; microuptime(&now); diff --git a/bsd/nfs/nfs_vfsops.c b/bsd/nfs/nfs_vfsops.c index b08612aa8..6d72e61e0 100644 --- a/bsd/nfs/nfs_vfsops.c +++ b/bsd/nfs/nfs_vfsops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. 
All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -64,16 +64,18 @@ #include <sys/conf.h> #include <sys/ioctl.h> #include <sys/signal.h> -#include <sys/proc.h> -#include <sys/namei.h> -#include <sys/vnode.h> +#include <sys/proc_internal.h> /* for fs rooting to update rootdir in fdp */ +#include <sys/kauth.h> +#include <sys/vnode_internal.h> #include <sys/malloc.h> #include <sys/kernel.h> #include <sys/sysctl.h> -#include <sys/mount.h> -#include <sys/mbuf.h> +#include <sys/mount_internal.h> +#include <sys/kpi_mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <sys/fcntl.h> +#include <libkern/OSAtomic.h> #include <sys/vm.h> #include <sys/vmparam.h> @@ -94,30 +96,22 @@ #include <nfs/xdr_subs.h> #include <nfs/nfsm_subs.h> #include <nfs/nfsdiskless.h> -#include <nfs/nqnfs.h> +#include <nfs/nfs_lock.h> -extern int nfs_mountroot __P((void)); +extern int nfs_mountroot(void); extern int nfs_ticks; extern int nfs_mount_type; extern int nfs_resv_mounts; struct nfsstats nfsstats; -static int nfs_sysctl(int *, u_int, void *, size_t *, void *, size_t, - struct proc *); +static int nfs_sysctl(int *, u_int, user_addr_t, size_t *, user_addr_t, size_t, vfs_context_t); /* XXX CSM 11/25/97 Upgrade sysctl.h someday */ #ifdef notyet SYSCTL_NODE(_vfs, MOUNT_NFS, nfs, CTLFLAG_RW, 0, "NFS filesystem"); SYSCTL_STRUCT(_vfs_nfs, NFS_NFSSTATS, nfsstats, CTLFLAG_RD, &nfsstats, nfsstats, ""); #endif -#if NFSDIAG -int nfs_debug; -/* XXX CSM 11/25/97 Upgrade sysctl.h someday */ -#ifdef notyet -SYSCTL_INT(_vfs_nfs, OID_AUTO, debug, CTLFLAG_RW, &nfs_debug, 0, ""); -#endif -#endif SYSCTL_DECL(_vfs_generic_nfs); SYSCTL_NODE(_vfs_generic_nfs, OID_AUTO, client, CTLFLAG_RW, 0, @@ -131,27 +125,18 @@ static int nfs_tprintf_delay = NFS_TPRINTF_DELAY; SYSCTL_INT(_vfs_generic_nfs_client, NFS_TPRINTF_DELAY, nextdowndelay, CTLFLAG_RW, &nfs_tprintf_delay, 0, ""); -static int nfs_iosize __P((struct nfsmount *nmp)); -static int mountnfs __P((struct nfs_args *,struct mount *, - struct mbuf *,char *,char *,struct vnode **)); -static int nfs_mount __P(( struct mount *mp, char *path, caddr_t data, - struct nameidata *ndp, struct proc *p)); -static int nfs_start __P(( struct mount *mp, int flags, - struct proc *p)); -static int nfs_unmount __P(( struct mount *mp, int mntflags, - struct proc *p)); -static int nfs_root __P(( struct mount *mp, struct vnode **vpp)); -static int nfs_quotactl __P(( struct mount *mp, int cmds, uid_t uid, - caddr_t arg, struct proc *p)); -static int nfs_statfs __P(( struct mount *mp, struct statfs *sbp, - struct proc *p)); -static int nfs_sync __P(( struct mount *mp, int waitfor, - struct ucred *cred, struct proc *p)); -static int nfs_vptofh __P(( struct vnode *vp, struct fid *fhp)); -static int nfs_fhtovp __P((struct mount *mp, struct fid *fhp, - struct mbuf *nam, struct vnode **vpp, - int *exflagsp, struct ucred **credanonp)); -static int nfs_vget __P((struct mount *, void *, struct vnode **)); +static int nfs_iosize(struct nfsmount *nmp); +static int mountnfs(struct user_nfs_args *,mount_t,mbuf_t,proc_t,vnode_t *); +static int nfs_mount(mount_t mp, vnode_t vp, user_addr_t data, vfs_context_t context); +static int nfs_start(mount_t mp, int flags, vfs_context_t context); +static int nfs_unmount(mount_t mp, int mntflags, vfs_context_t context); +static int nfs_root(mount_t mp, vnode_t *vpp, vfs_context_t context); +static int nfs_statfs(mount_t mp, struct vfsstatfs *sbp, vfs_context_t context); +static int nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t context); +static int 
nfs_sync( mount_t mp, int waitfor, vfs_context_t context); +static int nfs_vptofh(vnode_t vp, int *fhlenp, unsigned char *fhp, vfs_context_t context); +static int nfs_fhtovp(mount_t mp, int fhlen, unsigned char *fhp, vnode_t *vpp, vfs_context_t context); +static int nfs_vget(mount_t , ino64_t, vnode_t *, vfs_context_t context); /* @@ -162,36 +147,24 @@ struct vfsops nfs_vfsops = { nfs_start, nfs_unmount, nfs_root, - nfs_quotactl, - nfs_statfs, + NULL, /* quotactl */ + nfs_vfs_getattr, nfs_sync, nfs_vget, nfs_fhtovp, nfs_vptofh, nfs_init, - nfs_sysctl + nfs_sysctl, + NULL /* setattr */ }; -/* XXX CSM 11/25/97 Mysterious kernel.h ld crud */ -#ifdef notyet -VFS_SET(nfs_vfsops, nfs, MOUNT_NFS, VFCF_NETWORK); -#endif -void nfsargs_ntoh __P((struct nfs_args *)); static int -nfs_mount_diskless __P((struct nfs_dlmount *, char *, int, struct vnode **, - struct mount **)); +nfs_mount_diskless(struct nfs_dlmount *, const char *, int, vnode_t *, mount_t *); #if !defined(NO_MOUNT_PRIVATE) static int -nfs_mount_diskless_private __P((struct nfs_dlmount *, char *, int, - struct vnode **, struct mount **)); +nfs_mount_diskless_private(struct nfs_dlmount *, const char *, int, vnode_t *, mount_t *); #endif /* NO_MOUNT_PRIVATE */ -static void nfs_convert_oargs __P((struct nfs_args *args, - struct onfs_args *oargs)); -#if NFSDIAG -int nfsreqqusers = 0; -extern int nfsbtlen, nfsbtcpu, nfsbtthread, nfsbt[32]; -#endif static int nfs_iosize(nmp) struct nfsmount* nmp; @@ -213,76 +186,52 @@ static int nfs_iosize(nmp) return (trunc_page_32(iosize)); } -static void nfs_convert_oargs(args,oargs) - struct nfs_args *args; - struct onfs_args *oargs; -{ - args->version = NFS_ARGSVERSION; - args->addr = oargs->addr; - args->addrlen = oargs->addrlen; - args->sotype = oargs->sotype; - args->proto = oargs->proto; - args->fh = oargs->fh; - args->fhsize = oargs->fhsize; - args->flags = oargs->flags; - args->wsize = oargs->wsize; - args->rsize = oargs->rsize; - args->readdirsize = oargs->readdirsize; - args->timeo = oargs->timeo; - args->retrans = oargs->retrans; - args->maxgrouplist = oargs->maxgrouplist; - args->readahead = oargs->readahead; - args->leaseterm = oargs->leaseterm; - args->deadthresh = oargs->deadthresh; - args->hostname = oargs->hostname; -} - /* * nfs statfs call */ int -nfs_statfs(mp, sbp, p) - struct mount *mp; - register struct statfs *sbp; - struct proc *p; +nfs_statfs(mount_t mp, struct vfsstatfs *sbp, vfs_context_t context) { - register struct vnode *vp; - register struct nfs_statfs *sfp; - register caddr_t cp; - register u_long *tl; - register long t1, t2; + proc_t p = vfs_context_proc(context); + vnode_t vp; + struct nfs_statfs *sfp; + caddr_t cp; + u_long *tl; + long t1, t2; caddr_t bpos, dpos, cp2; struct nfsmount *nmp = VFSTONFS(mp); int error = 0, v3 = (nmp->nm_flag & NFSMNT_NFSV3), retattr; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; - struct ucred *cred; - extern int nfs_mount_type; + mbuf_t mreq, mrep, md, mb, mb2; u_int64_t xid; + kauth_cred_t cred; + struct ucred temp_cred; #ifndef nolint sfp = (struct nfs_statfs *)0; #endif vp = nmp->nm_dvp; - if (error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p)) + if ((error = vnode_get(vp))) return(error); - cred = crget(); - cred->cr_ngroups = 1; + + bzero(&temp_cred, sizeof(temp_cred)); + temp_cred.cr_ngroups = 1; + cred = kauth_cred_create(&temp_cred); + if (v3 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) - (void)nfs_fsinfo(nmp, vp, cred, p); - nfsstats.rpccnt[NFSPROC_FSSTAT]++; - nfsm_reqhead(vp, NFSPROC_FSSTAT, NFSX_FH(v3)); + nfs_fsinfo(nmp, vp, cred, p); 
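	/*
	 * A minimal sketch (editorial, not part of the change itself) of the
	 * credential discipline nfs_statfs() now follows: the temporary
	 * credential built above from a zeroed template must be released with
	 * kauth_cred_rele() on every exit path, success or failure.
	 *
	 *	bzero(&temp_cred, sizeof(temp_cred));
	 *	temp_cred.cr_ngroups = 1;		// minimal template
	 *	cred = kauth_cred_create(&temp_cred);	// returns a held reference
	 *	... perform the FSSTAT RPC with cred ...
	 *	kauth_cred_rele(cred);			// balance the reference
	 */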
+ nfsm_reqhead(NFSX_FH(v3)); + if (error) { + kauth_cred_rele(cred); + vnode_put(vp); + return (error); + } + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_FSSTAT]); nfsm_fhtom(vp, v3); nfsm_request(vp, NFSPROC_FSSTAT, p, cred, &xid); if (v3 && mrep) - nfsm_postop_attr(vp, retattr, &xid); + nfsm_postop_attr_update(vp, v3, retattr, &xid); nfsm_dissect(sfp, struct nfs_statfs *, NFSX_STATFS(v3)); -/* XXX CSM 12/2/97 Cleanup when/if we integrate FreeBSD mount.h */ -#ifdef notyet - sbp->f_type = MOUNT_NFS; -#else - sbp->f_type = nfs_mount_type; -#endif sbp->f_flags = nmp->nm_flag; sbp->f_iosize = nfs_iosize(nmp); if (v3) { @@ -328,13 +277,226 @@ nfs_statfs(mp, sbp, p) sbp->f_files = 0; sbp->f_ffree = 0; } - if (sbp != &mp->mnt_stat) { - bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); - bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); - } nfsm_reqdone; - VOP_UNLOCK(vp, 0, p); - crfree(cred); + kauth_cred_rele(cred); + vnode_put(vp); + return (error); +} + +/* + * The nfs_statfs code is complicated, and used by mountnfs(), so leave it as-is + * and handle VFS_GETATTR by calling nfs_statfs and copying fields. + */ +static int +nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t context) +{ + int error = 0; + + if (VFSATTR_IS_ACTIVE(fsap, f_bsize) || + VFSATTR_IS_ACTIVE(fsap, f_iosize) || + VFSATTR_IS_ACTIVE(fsap, f_blocks) || + VFSATTR_IS_ACTIVE(fsap, f_bfree) || + VFSATTR_IS_ACTIVE(fsap, f_bavail) || + VFSATTR_IS_ACTIVE(fsap, f_bused) || + VFSATTR_IS_ACTIVE(fsap, f_files) || + VFSATTR_IS_ACTIVE(fsap, f_ffree)) { + struct vfsstatfs sb; + + error = nfs_statfs(mp, &sb, context); + if (!error) { + VFSATTR_RETURN(fsap, f_bsize, sb.f_bsize); + VFSATTR_RETURN(fsap, f_iosize, sb.f_iosize); + VFSATTR_RETURN(fsap, f_blocks, sb.f_blocks); + VFSATTR_RETURN(fsap, f_bfree, sb.f_bfree); + VFSATTR_RETURN(fsap, f_bavail, sb.f_bavail); + VFSATTR_RETURN(fsap, f_bused, sb.f_blocks - sb.f_bfree); + VFSATTR_RETURN(fsap, f_files, sb.f_files); + VFSATTR_RETURN(fsap, f_ffree, sb.f_ffree); + } + } + + if (VFSATTR_IS_ACTIVE(fsap, f_capabilities)) { + struct nfsmount *nmp; + struct nfsv3_pathconf pc; + u_int32_t caps, valid; + vnode_t vp; + int v3; + + if (!(nmp = VFSTONFS(mp))) + return (ENXIO); + vp = nmp->nm_dvp; + v3 = (nmp->nm_flag & NFSMNT_NFSV3); + + /* + * The capabilities[] array defines what this volume supports. + * + * The valid[] array defines which bits this code understands + * the meaning of (whether the volume has that capability or not). + * Any zero bits here means "I don't know what you're asking about" + * and the caller cannot tell whether that capability is + * present or not. 
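 *
 * A short example of the intended read side (editorial sketch; fsap is
 * the struct vfs_attr being filled in here, "sensitive" is illustrative):
 *
 *	u_int32_t c = fsap->f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT];
 *	u_int32_t v = fsap->f_capabilities.valid[VOL_CAPABILITIES_FORMAT];
 *
 *	if (v & VOL_CAP_FMT_CASE_SENSITIVE) {
 *		// the answer is known; the capability bit is meaningful
 *		sensitive = (c & VOL_CAP_FMT_CASE_SENSITIVE) != 0;
 *	} else {
 *		// the answer is unknown; the capability bit tells us nothing
 *	}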
+ */ + caps = valid = 0; + if (v3) { + /* try to get fsinfo if we haven't already */ + if (!(nmp->nm_state & NFSSTA_GOTFSINFO)) { + nfs_fsinfo(nmp, vp, vfs_context_ucred(context), + vfs_context_proc(context)); + if (!(nmp = VFSTONFS(vnode_mount(vp)))) + return (ENXIO); + } + if (nmp->nm_state & NFSSTA_GOTFSINFO) { + /* fsinfo indicates (non)support of links and symlinks */ + valid |= VOL_CAP_FMT_SYMBOLICLINKS | + VOL_CAP_FMT_HARDLINKS; + if (nmp->nm_fsinfo.fsproperties & NFSV3FSINFO_SYMLINK) + caps |= VOL_CAP_FMT_SYMBOLICLINKS; + if (nmp->nm_fsinfo.fsproperties & NFSV3FSINFO_LINK) + caps |= VOL_CAP_FMT_HARDLINKS; + /* if fsinfo indicates all pathconf info is the same, */ + /* we can use it to report case attributes */ + if ((nmp->nm_fsinfo.fsproperties & NFSV3FSINFO_HOMOGENEOUS) && + !(nmp->nm_state & NFSSTA_GOTPATHCONF)) { + /* no cached pathconf info, try to get now */ + error = nfs_pathconfrpc(vp, &pc, + vfs_context_ucred(context), + vfs_context_proc(context)); + if (!(nmp = VFSTONFS(vnode_mount(vp)))) + return (ENXIO); + if (!error) { + /* all files have the same pathconf info, */ + /* so cache a copy of the results */ + nfs_pathconf_cache(nmp, &pc); + } + } + if (nmp->nm_state & NFSSTA_GOTPATHCONF) { + valid |= VOL_CAP_FMT_CASE_SENSITIVE | + VOL_CAP_FMT_CASE_PRESERVING; + if (!(nmp->nm_fsinfo.pcflags & + NFSPCINFO_CASE_INSENSITIVE)) + caps |= VOL_CAP_FMT_CASE_SENSITIVE; + if (nmp->nm_fsinfo.pcflags & + NFSPCINFO_CASE_PRESERVING) + caps |= VOL_CAP_FMT_CASE_PRESERVING; + } + /* Is server's max file size at least 2TB? */ + if (nmp->nm_fsinfo.maxfilesize >= 0x20000000000ULL) + caps |= VOL_CAP_FMT_2TB_FILESIZE; + } else { + /* + * NFSv3 supports 64 bits of file size. + * Without FSINFO from the server, we'll + * just assume maxfilesize >= 2TB + */ + caps |= VOL_CAP_FMT_2TB_FILESIZE; + } + } + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] = + // VOL_CAP_FMT_PERSISTENTOBJECTIDS | + // VOL_CAP_FMT_SYMBOLICLINKS | + // VOL_CAP_FMT_HARDLINKS | + // VOL_CAP_FMT_JOURNAL | + // VOL_CAP_FMT_JOURNAL_ACTIVE | + // VOL_CAP_FMT_NO_ROOT_TIMES | + // VOL_CAP_FMT_SPARSE_FILES | + // VOL_CAP_FMT_ZERO_RUNS | + // VOL_CAP_FMT_CASE_SENSITIVE | + // VOL_CAP_FMT_CASE_PRESERVING | + // VOL_CAP_FMT_FAST_STATFS | + // VOL_CAP_FMT_2TB_FILESIZE | + caps; + fsap->f_capabilities.valid[VOL_CAPABILITIES_FORMAT] = + VOL_CAP_FMT_PERSISTENTOBJECTIDS | + // VOL_CAP_FMT_SYMBOLICLINKS | + // VOL_CAP_FMT_HARDLINKS | + // VOL_CAP_FMT_JOURNAL | + // VOL_CAP_FMT_JOURNAL_ACTIVE | + // VOL_CAP_FMT_NO_ROOT_TIMES | + // VOL_CAP_FMT_SPARSE_FILES | + // VOL_CAP_FMT_ZERO_RUNS | + // VOL_CAP_FMT_CASE_SENSITIVE | + // VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS | + VOL_CAP_FMT_2TB_FILESIZE | + valid; + + /* + * We don't support most of the interfaces. + * + * We MAY support locking, but we don't have any easy way of probing. + * We can tell if there's no lockd running or if locks have been + * disabled for a mount, so we can definitely answer NO in that case. + * Any attempt to send a request to lockd to test for locking support + * may cause the lazily-launched locking daemons to be started + * unnecessarily. So we avoid that. However, we do record if we ever + * successfully perform a lock operation on a mount point, so if it + * looks like lock ops have worked, we do report that we support them. 
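 *
 * The three-way answer this produces for VOL_CAP_INT_ADVLOCK/FLOCK
 * (editorial restatement of the code below):
 *
 *	no lockd and none waiting, or NFSMNT_NOLOCKS:	valid = both, caps = 0		// "no"
 *	NFSSTA_LOCKSWORK has been observed:		valid = both, caps = both	// "yes"
 *	anything else:					valid = 0			// "can't say"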
+ */ + caps = valid = 0; + if ((!nfslockdvnode && !nfslockdwaiting) || + (nmp->nm_flag & NFSMNT_NOLOCKS)) { + /* locks disabled on this mount, so they definitely won't work */ + valid = VOL_CAP_INT_ADVLOCK | VOL_CAP_INT_FLOCK; + } else if (nmp->nm_state & NFSSTA_LOCKSWORK) { + caps = VOL_CAP_INT_ADVLOCK | VOL_CAP_INT_FLOCK; + valid = VOL_CAP_INT_ADVLOCK | VOL_CAP_INT_FLOCK; + } + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] = + // VOL_CAP_INT_SEARCHFS | + // VOL_CAP_INT_ATTRLIST | + // VOL_CAP_INT_NFSEXPORT | + // VOL_CAP_INT_READDIRATTR | + // VOL_CAP_INT_EXCHANGEDATA | + // VOL_CAP_INT_COPYFILE | + // VOL_CAP_INT_ALLOCATE | + // VOL_CAP_INT_VOL_RENAME | + // VOL_CAP_INT_ADVLOCK | + // VOL_CAP_INT_FLOCK | + // VOL_CAP_INT_EXTENDED_SECURITY | + // VOL_CAP_INT_USERACCESS | + caps; + fsap->f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] = + VOL_CAP_INT_SEARCHFS | + VOL_CAP_INT_ATTRLIST | + VOL_CAP_INT_NFSEXPORT | + VOL_CAP_INT_READDIRATTR | + VOL_CAP_INT_EXCHANGEDATA | + VOL_CAP_INT_COPYFILE | + VOL_CAP_INT_ALLOCATE | + VOL_CAP_INT_VOL_RENAME | + // VOL_CAP_INT_ADVLOCK | + // VOL_CAP_INT_FLOCK | + // VOL_CAP_INT_EXTENDED_SECURITY | + // VOL_CAP_INT_USERACCESS | + valid; + + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_RESERVED1] = 0; + fsap->f_capabilities.valid[VOL_CAPABILITIES_RESERVED1] = 0; + + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_RESERVED2] = 0; + fsap->f_capabilities.valid[VOL_CAPABILITIES_RESERVED2] = 0; + + VFSATTR_SET_SUPPORTED(fsap, f_capabilities); + } + + if (VFSATTR_IS_ACTIVE(fsap, f_attributes)) { + fsap->f_attributes.validattr.commonattr = 0; + fsap->f_attributes.validattr.volattr = + ATTR_VOL_CAPABILITIES | ATTR_VOL_ATTRIBUTES; + fsap->f_attributes.validattr.dirattr = 0; + fsap->f_attributes.validattr.fileattr = 0; + fsap->f_attributes.validattr.forkattr = 0; + + fsap->f_attributes.nativeattr.commonattr = 0; + fsap->f_attributes.nativeattr.volattr = + ATTR_VOL_CAPABILITIES | ATTR_VOL_ATTRIBUTES; + fsap->f_attributes.nativeattr.dirattr = 0; + fsap->f_attributes.nativeattr.fileattr = 0; + fsap->f_attributes.nativeattr.forkattr = 0; + + VFSATTR_SET_SUPPORTED(fsap, f_attributes); + } + return (error); } @@ -343,55 +505,60 @@ nfs_statfs(mp, sbp, p) */ int nfs_fsinfo(nmp, vp, cred, p) - register struct nfsmount *nmp; - register struct vnode *vp; - struct ucred *cred; - struct proc *p; + struct nfsmount *nmp; + vnode_t vp; + kauth_cred_t cred; + proc_t p; { - register struct nfsv3_fsinfo *fsp; - register caddr_t cp; - register long t1, t2; - register u_long *tl, pref, max; + struct nfsv3_fsinfo *fsp; + caddr_t cp; + long t1, t2; + u_long *tl; + int prefsize, maxsize; caddr_t bpos, dpos, cp2; int error = 0, retattr; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; + mbuf_t mreq, mrep, md, mb, mb2; u_int64_t xid; - nfsstats.rpccnt[NFSPROC_FSINFO]++; - nfsm_reqhead(vp, NFSPROC_FSINFO, NFSX_FH(1)); + nfsm_reqhead(NFSX_FH(1)); + if (error) + return (error); + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_FSINFO]); nfsm_fhtom(vp, 1); nfsm_request(vp, NFSPROC_FSINFO, p, cred, &xid); if (mrep) { - nfsm_postop_attr(vp, retattr, &xid); + nfsm_postop_attr_update(vp, 1, retattr, &xid); } if (!error) { nfsm_dissect(fsp, struct nfsv3_fsinfo *, NFSX_V3FSINFO); - pref = fxdr_unsigned(u_long, fsp->fs_wtpref); - if (pref < nmp->nm_wsize) - nmp->nm_wsize = (pref + NFS_FABLKSIZE - 1) & + prefsize = fxdr_unsigned(u_long, fsp->fs_wtpref); + if (prefsize < nmp->nm_wsize) + nmp->nm_wsize = (prefsize + NFS_FABLKSIZE - 1) & ~(NFS_FABLKSIZE - 1); - max = 
fxdr_unsigned(u_long, fsp->fs_wtmax); - if (max < nmp->nm_wsize) { - nmp->nm_wsize = max & ~(NFS_FABLKSIZE - 1); + maxsize = fxdr_unsigned(u_long, fsp->fs_wtmax); + if (maxsize < nmp->nm_wsize) { + nmp->nm_wsize = maxsize & ~(NFS_FABLKSIZE - 1); if (nmp->nm_wsize == 0) - nmp->nm_wsize = max; + nmp->nm_wsize = maxsize; } - pref = fxdr_unsigned(u_long, fsp->fs_rtpref); - if (pref < nmp->nm_rsize) - nmp->nm_rsize = (pref + NFS_FABLKSIZE - 1) & + prefsize = fxdr_unsigned(u_long, fsp->fs_rtpref); + if (prefsize < nmp->nm_rsize) + nmp->nm_rsize = (prefsize + NFS_FABLKSIZE - 1) & ~(NFS_FABLKSIZE - 1); - max = fxdr_unsigned(u_long, fsp->fs_rtmax); - if (max < nmp->nm_rsize) { - nmp->nm_rsize = max & ~(NFS_FABLKSIZE - 1); + maxsize = fxdr_unsigned(u_long, fsp->fs_rtmax); + if (maxsize < nmp->nm_rsize) { + nmp->nm_rsize = maxsize & ~(NFS_FABLKSIZE - 1); if (nmp->nm_rsize == 0) - nmp->nm_rsize = max; + nmp->nm_rsize = maxsize; } - pref = fxdr_unsigned(u_long, fsp->fs_dtpref); - if (pref < nmp->nm_readdirsize) - nmp->nm_readdirsize = pref; - if (max < nmp->nm_readdirsize) { - nmp->nm_readdirsize = max; + prefsize = fxdr_unsigned(u_long, fsp->fs_dtpref); + if (prefsize < nmp->nm_readdirsize) + nmp->nm_readdirsize = prefsize; + if (maxsize < nmp->nm_readdirsize) { + nmp->nm_readdirsize = maxsize; } + fxdr_hyper(&fsp->fs_maxfilesize, &nmp->nm_fsinfo.maxfilesize); + nmp->nm_fsinfo.fsproperties = fxdr_unsigned(u_long, fsp->fs_properties); nmp->nm_state |= NFSSTA_GOTFSINFO; } nfsm_reqdone; @@ -415,17 +582,16 @@ int nfs_mountroot() { struct nfs_diskless nd; - struct vattr attr; - struct mount *mp; - struct vnode *vp; - struct proc *procp; - long n; + struct nfs_vattr nvattr; + mount_t mp; + vnode_t vp; + proc_t procp; int error; #if !defined(NO_MOUNT_PRIVATE) - struct mount *mppriv; - struct vnode *vppriv; + mount_t mppriv; + vnode_t vppriv; #endif /* NO_MOUNT_PRIVATE */ - int v3; + int v3, sotype; procp = current_proc(); /* XXX */ @@ -440,11 +606,15 @@ nfs_mountroot() panic("nfs_boot_init failed with %d\n", error); } - /* try NFSv3 first, if that fails then try NFSv2 */ + /* + * Try NFSv3 first, then fall back to NFSv2. + * Likewise, try TCP first, then fall back to UDP. + */ v3 = 1; + sotype = SOCK_STREAM; tryagain: - error = nfs_boot_getfh(&nd, procp, v3); + error = nfs_boot_getfh(&nd, procp, v3, sotype); if (error) { if (error == EHOSTDOWN || error == EHOSTUNREACH) { if (nd.nd_root.ndm_path) @@ -456,49 +626,66 @@ tryagain: return (error); } if (v3) { - printf("nfs_boot_getfh(v3) failed with %d, trying v2...\n", error); + if (sotype == SOCK_STREAM) { + printf("nfs_boot_getfh(v3,TCP) failed with %d, trying UDP...\n", error); + sotype = SOCK_DGRAM; + goto tryagain; + } + printf("nfs_boot_getfh(v3,UDP) failed with %d, trying v2...\n", error); v3 = 0; + sotype = SOCK_STREAM; + goto tryagain; + } else if (sotype == SOCK_STREAM) { + printf("nfs_boot_getfh(v2,TCP) failed with %d, trying UDP...\n", error); + sotype = SOCK_DGRAM; goto tryagain; } - panic("nfs_boot_getfh(v2) failed with %d\n", error); + panic("nfs_boot_getfh(v2,UDP) failed with %d\n", error); } /* * Create the root mount point.
*/ #if !defined(NO_MOUNT_PRIVATE) - if ((error = nfs_mount_diskless(&nd.nd_root, "/", MNT_RDONLY, &vp, &mp))) { + if ((error = nfs_mount_diskless(&nd.nd_root, "/", MNT_RDONLY|MNT_ROOTFS, &vp, &mp))) #else - if (error = nfs_mount_diskless(&nd.nd_root, "/", NULL, &vp, &mp)) { + if ((error = nfs_mount_diskless(&nd.nd_root, "/", MNT_ROOTFS, &vp, &mp))) #endif /* NO_MOUNT_PRIVATE */ + { if (v3) { - printf("nfs_mount_diskless(v3) failed with %d, trying v2...\n", error); + if (sotype == SOCK_STREAM) { + printf("nfs_mount_diskless(v3,TCP) failed with %d, trying UDP...\n", error); + sotype = SOCK_DGRAM; + goto tryagain; + } + printf("nfs_mount_diskless(v3,UDP) failed with %d, trying v2...\n", error); v3 = 0; + sotype = SOCK_STREAM; + goto tryagain; + } else if (sotype == SOCK_STREAM) { + printf("nfs_mount_diskless(v2,TCP) failed with %d, trying UDP...\n", error); + sotype = SOCK_DGRAM; goto tryagain; } - panic("nfs_mount_diskless root failed with %d\n", error); + panic("nfs_mount_diskless(v2,UDP) root failed with %d\n", error); } printf("root on %s\n", (char *)&nd.nd_root.ndm_host); - simple_lock(&mountlist_slock); - CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); - simple_unlock(&mountlist_slock); - vfs_unbusy(mp, procp); + vfs_unbusy(mp); + mount_list_add(mp); rootvp = vp; #if !defined(NO_MOUNT_PRIVATE) if (nd.nd_private.ndm_saddr.sin_addr.s_addr) { error = nfs_mount_diskless_private(&nd.nd_private, "/private", - NULL, &vppriv, &mppriv); + 0, &vppriv, &mppriv); if (error) { panic("nfs_mount_diskless private failed with %d\n", error); } printf("private on %s\n", (char *)&nd.nd_private.ndm_host); - simple_lock(&mountlist_slock); - CIRCLEQ_INSERT_TAIL(&mountlist, mppriv, mnt_list); - simple_unlock(&mountlist_slock); - vfs_unbusy(mppriv, procp); + vfs_unbusy(mppriv); + mount_list_add(mppriv); } #endif /* NO_MOUNT_PRIVATE */ @@ -509,10 +696,8 @@ tryagain: FREE_ZONE(nd.nd_private.ndm_path, MAXPATHLEN, M_NAMEI); /* Get root attributes (for the time). */ - error = VOP_GETATTR(vp, &attr, procp->p_ucred, procp); + error = nfs_getattr(vp, &nvattr, kauth_cred_get(), procp); if (error) panic("nfs_mountroot: getattr for root"); - n = attr.va_mtime.tv_sec; - inittodr(n); return (0); } @@ -520,56 +705,60 @@ tryagain: * Internal version of mount system call for diskless setup. */ static int -nfs_mount_diskless(ndmntp, mntname, mntflag, vpp, mpp) - struct nfs_dlmount *ndmntp; - char *mntname; - int mntflag; - struct vnode **vpp; - struct mount **mpp; +nfs_mount_diskless( + struct nfs_dlmount *ndmntp, + const char *mntname, + int mntflag, + vnode_t *vpp, + mount_t *mpp) { - struct nfs_args args; - struct mount *mp; - struct mbuf *m; + struct user_nfs_args args; + mount_t mp; + mbuf_t m; int error; - struct proc *procp; + proc_t procp; procp = current_proc(); /* XXX */ if ((error = vfs_rootmountalloc("nfs", ndmntp->ndm_host, &mp))) { - printf("nfs_mountroot: NFS not configured"); + printf("nfs_mount_diskless: NFS not configured"); return (error); } - mp->mnt_flag = mntflag; + + mp->mnt_flag |= mntflag; + if (!(mntflag & MNT_RDONLY)) + mp->mnt_flag &= ~MNT_RDONLY; /* Initialize mount args. 
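 *
 * Since args is now a struct user_nfs_args, kernel pointers are packed
 * into its user_addr_t fields and recovered later; a sketch of the round
 * trip this change relies on (fh shown as an illustrative example):
 *
 *	args.fh = CAST_USER_ADDR_T(&ndmntp->ndm_fh[0]);	// pack
 *	...
 *	caddr_t fh = CAST_DOWN(caddr_t, args.fh);	// unpack (see mountnfs())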
*/ bzero((caddr_t) &args, sizeof(args)); - args.addr = (struct sockaddr *)&ndmntp->ndm_saddr; - args.addrlen = args.addr->sa_len; - args.sotype = SOCK_DGRAM; - args.fh = ndmntp->ndm_fh; + args.addr = CAST_USER_ADDR_T(&ndmntp->ndm_saddr); + args.addrlen = ndmntp->ndm_saddr.sin_len; + args.sotype = ndmntp->ndm_sotype; + args.fh = CAST_USER_ADDR_T(&ndmntp->ndm_fh[0]); args.fhsize = ndmntp->ndm_fhlen; - args.hostname = ndmntp->ndm_host; + args.hostname = CAST_USER_ADDR_T(ndmntp->ndm_host); args.flags = NFSMNT_RESVPORT; if (ndmntp->ndm_nfsv3) args.flags |= NFSMNT_NFSV3; - MGET(m, M_DONTWAIT, MT_SONAME); - bcopy((caddr_t)args.addr, mtod(m, caddr_t), - (m->m_len = args.addr->sa_len)); - if ((error = mountnfs(&args, mp, m, mntname, args.hostname, vpp))) { - printf("nfs_mountroot: mount %s failed: %d", mntname, error); - mp->mnt_vfc->vfc_refcount--; - - if (mp->mnt_kern_flag & MNTK_IO_XINFO) - FREE(mp->mnt_xinfo_ptr, M_TEMP); - vfs_unbusy(mp, procp); - - FREE_ZONE(mp, sizeof (struct mount), M_MOUNT); + error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &m); + if (error) { + printf("nfs_mount_diskless: mbuf_get(soname) failed"); + return (error); + } + mbuf_setlen(m, ndmntp->ndm_saddr.sin_len); + bcopy((caddr_t)args.addr, mbuf_data(m), ndmntp->ndm_saddr.sin_len); + if ((error = mountnfs(&args, mp, m, procp, vpp))) { + printf("nfs_mountroot: mount %s failed: %d\n", mntname, error); + // XXX vfs_rootmountfailed(mp); + mount_list_lock(); + mp->mnt_vtable->vfc_refcount--; + mount_list_unlock(); + vfs_unbusy(mp); + mount_lock_destroy(mp); + FREE_ZONE(mp, sizeof(struct mount), M_MOUNT); return (error); } -#if 0 /* Causes incorrect reporting of "mounted on" */ - (void) copystr(args.hostname, mp->mnt_stat.f_mntonname, MNAMELEN - 1, 0); -#endif /* 0 */ *mpp = mp; return (0); } @@ -580,23 +769,26 @@ nfs_mount_diskless(ndmntp, mntname, mntflag, vpp, mpp) * separately in diskless setup */ static int -nfs_mount_diskless_private(ndmntp, mntname, mntflag, vpp, mpp) - struct nfs_dlmount *ndmntp; - char *mntname; - int mntflag; - struct vnode **vpp; - struct mount **mpp; +nfs_mount_diskless_private( + struct nfs_dlmount *ndmntp, + const char *mntname, + int mntflag, + vnode_t *vpp, + mount_t *mpp) { - struct nfs_args args; - struct mount *mp; - struct mbuf *m; + struct user_nfs_args args; + mount_t mp; + mbuf_t m; int error; - struct proc *procp; - struct vfsconf *vfsp; + proc_t procp; + struct vfstable *vfsp; struct nameidata nd; - struct vnode *vp; + vnode_t vp; + struct vfs_context context; procp = current_proc(); /* XXX */ + context.vc_proc = procp; + context.vc_ucred = kauth_cred_get(); { /* @@ -605,49 +797,55 @@ nfs_mount_diskless_private(ndmntp, mntname, mntflag, vpp, mpp) */ struct filedesc *fdp; /* pointer to file descriptor state */ fdp = procp->p_fd; - mountlist.cqh_first->mnt_flag |= MNT_ROOTFS; + mountlist.tqh_first->mnt_flag |= MNT_ROOTFS; /* Get the vnode for '/'. Set fdp->fd_cdir to reference it. 
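 *
 * Unlike the old VREF(), vnode_ref() can fail, so its result is checked
 * below; the reference is paired with the vnode_rele(rootvnode) that
 * follows the namei() of the cover directory (editorial note):
 *
 *	error = vnode_ref(rootvnode);	// long-term usecount++
 *	...
 *	vnode_rele(rootvnode);		// matching usecount--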
*/ - if (VFS_ROOT(mountlist.cqh_first, &rootvnode)) + if (VFS_ROOT(mountlist.tqh_first, &rootvnode, NULL)) panic("cannot find root vnode"); - VREF(rootvnode); + error = vnode_ref(rootvnode); + if (error) { + printf("nfs_mountroot: vnode_ref() failed on root vnode!\n"); + return (error); + } fdp->fd_cdir = rootvnode; - VOP_UNLOCK(rootvnode, 0, procp); fdp->fd_rdir = NULL; } /* * Get vnode to be covered */ - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, - mntname, procp); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE32, + mntname, &context); if ((error = namei(&nd))) { - printf("nfs_mountroot: private namei failed!"); + printf("nfs_mountroot: private namei failed!\n"); return (error); } { - /* undo VREF in mimic main()! */ - vrele(rootvnode); + /* undo vnode_ref() in mimic main()! */ + vnode_rele(rootvnode); } + nameidone(&nd); vp = nd.ni_vp; - if ((error = vinvalbuf(vp, V_SAVE, procp->p_ucred, procp, 0, 0))) { - vput(vp); + + if ((error = VNOP_FSYNC(vp, MNT_WAIT, &context)) || + (error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) { + vnode_put(vp); return (error); } - if (vp->v_type != VDIR) { - vput(vp); + if (vnode_vtype(vp) != VDIR) { + vnode_put(vp); return (ENOTDIR); } for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (!strcmp(vfsp->vfc_name, "nfs")) break; if (vfsp == NULL) { - printf("nfs_mountroot: private NFS not configured"); - vput(vp); + printf("nfs_mountroot: private NFS not configured\n"); + vnode_put(vp); return (ENODEV); } - if (vp->v_mountedhere != NULL) { - vput(vp); + if (vnode_mountedhere(vp) != NULL) { + vnode_put(vp); return (EBUSY); } @@ -655,51 +853,64 @@ nfs_mount_diskless_private(ndmntp, mntname, mntflag, vpp, mpp) * Allocate and initialize the filesystem. */ mp = _MALLOC_ZONE((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); + if (!mp) { + printf("nfs_mountroot: unable to allocate mount structure\n"); + vnode_put(vp); + return (ENOMEM); + } bzero((char *)mp, (u_long)sizeof(struct mount)); /* Initialize the default IO constraints */ mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS; mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32; - lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0); - (void)vfs_busy(mp, LK_NOWAIT, 0, procp); - LIST_INIT(&mp->mnt_vnodelist); - mp->mnt_op = vfsp->vfc_vfsops; - mp->mnt_vfc = vfsp; + mount_lock_init(mp); + TAILQ_INIT(&mp->mnt_vnodelist); + TAILQ_INIT(&mp->mnt_workerqueue); + TAILQ_INIT(&mp->mnt_newvnodes); + (void)vfs_busy(mp, LK_NOWAIT); + TAILQ_INIT(&mp->mnt_vnodelist); + mount_list_lock(); vfsp->vfc_refcount++; - mp->mnt_stat.f_type = vfsp->vfc_typenum; + mount_list_unlock(); + mp->mnt_vtable = vfsp; + mp->mnt_op = vfsp->vfc_vfsops; + // mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag = mntflag; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; - strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); + strncpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSNAMELEN-1); vp->v_mountedhere = mp; mp->mnt_vnodecovered = vp; - mp->mnt_stat.f_owner = procp->p_ucred->cr_uid; - (void) copystr(mntname, mp->mnt_stat.f_mntonname, MNAMELEN - 1, 0); - (void) copystr(ndmntp->ndm_host, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); + mp->mnt_vfsstat.f_owner = kauth_cred_getuid(kauth_cred_get()); + (void) copystr(mntname, mp->mnt_vfsstat.f_mntonname, MNAMELEN - 1, 0); + (void) copystr(ndmntp->ndm_host, mp->mnt_vfsstat.f_mntfromname, MNAMELEN - 1, 0); /* Initialize mount args. 
*/ bzero((caddr_t) &args, sizeof(args)); - args.addr = (struct sockaddr *)&ndmntp->ndm_saddr; - args.addrlen = args.addr->sa_len; - args.sotype = SOCK_DGRAM; - args.fh = ndmntp->ndm_fh; + args.addr = CAST_USER_ADDR_T(&ndmntp->ndm_saddr); + args.addrlen = ndmntp->ndm_saddr.sin_len; + args.sotype = ndmntp->ndm_sotype; + args.fh = CAST_USER_ADDR_T(ndmntp->ndm_fh); args.fhsize = ndmntp->ndm_fhlen; - args.hostname = ndmntp->ndm_host; + args.hostname = CAST_USER_ADDR_T(ndmntp->ndm_host); args.flags = NFSMNT_RESVPORT; if (ndmntp->ndm_nfsv3) args.flags |= NFSMNT_NFSV3; - MGET(m, M_DONTWAIT, MT_SONAME); - bcopy((caddr_t)args.addr, mtod(m, caddr_t), - (m->m_len = args.addr->sa_len)); - if ((error = mountnfs(&args, mp, m, mntname, args.hostname, &vp))) { - printf("nfs_mountroot: mount %s failed: %d", mntname, error); - mp->mnt_vfc->vfc_refcount--; - - if (mp->mnt_kern_flag & MNTK_IO_XINFO) - FREE(mp->mnt_xinfo_ptr, M_TEMP); - vfs_unbusy(mp, procp); - + error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &m); + if (error) { + printf("nfs_mount_diskless_private: mbuf_get(soname) failed"); + return (error); + } + mbuf_setlen(m, ndmntp->ndm_saddr.sin_len); + bcopy((caddr_t)args.addr, mbuf_data(m), ndmntp->ndm_saddr.sin_len); + if ((error = mountnfs(&args, mp, m, procp, &vp))) { + printf("nfs_mountroot: mount %s failed: %d\n", mntname, error); + mount_list_lock(); + vfsp->vfc_refcount--; + mount_list_unlock(); + vfs_unbusy(mp); + mount_lock_destroy(mp); FREE_ZONE(mp, sizeof (struct mount), M_MOUNT); return (error); } @@ -714,66 +925,88 @@ nfs_mount_diskless_private(ndmntp, mntname, mntflag, vpp, mpp) * VFS Operations. * * mount system call - * It seems a bit dumb to copyinstr() the host and path here and then - * bcopy() them in mountnfs(), but I wanted to detect errors before - * doing the sockargs() call because sockargs() allocates an mbuf and - * an error after that means that I have to release the mbuf. */ -/* ARGSUSED */ static int -nfs_mount(mp, path, data, ndp, p) - struct mount *mp; - char *path; - caddr_t data; - struct nameidata *ndp; - struct proc *p; +nfs_mount(mount_t mp, vnode_t vp, user_addr_t data, vfs_context_t context) { - int error; - struct nfs_args args; - struct mbuf *nam; - struct vnode *vp; - char pth[MNAMELEN], hst[MNAMELEN]; + proc_t p = vfs_context_proc(context); + int error, argsvers; + struct user_nfs_args args; + struct nfs_args tempargs; + mbuf_t nam; size_t len; u_char nfh[NFSX_V3FHMAX]; + char *mntfrom; - error = copyin(data, (caddr_t)&args, sizeof (struct nfs_args)); + error = copyin(data, (caddr_t)&argsvers, sizeof (argsvers)); if (error) return (error); - if (args.version != NFS_ARGSVERSION) { -#ifndef NO_COMPAT_PRELITE2 - /* - * If the argument version is unknown, then assume the - * caller is a pre-lite2 4.4BSD client and convert its - * arguments. 
- */ - struct onfs_args oargs; - error = copyin(data, (caddr_t)&oargs, sizeof (struct onfs_args)); - if (error) - return (error); - nfs_convert_oargs(&args,&oargs); -#else /* NO_COMPAT_PRELITE2 */ + + switch (argsvers) { + case 3: + if (vfs_context_is64bit(context)) + error = copyin(data, (caddr_t)&args, sizeof (struct user_nfs_args3)); + else + error = copyin(data, (caddr_t)&tempargs, sizeof (struct nfs_args3)); + break; + case 4: + if (vfs_context_is64bit(context)) + error = copyin(data, (caddr_t)&args, sizeof (args)); + else + error = copyin(data, (caddr_t)&tempargs, sizeof (tempargs)); + break; + default: return (EPROGMISMATCH); -#endif /* !NO_COMPAT_PRELITE2 */ } - if (args.fhsize < 0 || args.fhsize > NFSX_V3FHMAX) - return (EINVAL); - error = copyin((caddr_t)args.fh, (caddr_t)nfh, args.fhsize); if (error) return (error); - error = copyinstr(path, pth, MNAMELEN-1, &len); + + if (!vfs_context_is64bit(context)) { + args.version = tempargs.version; + args.addrlen = tempargs.addrlen; + args.sotype = tempargs.sotype; + args.proto = tempargs.proto; + args.fhsize = tempargs.fhsize; + args.flags = tempargs.flags; + args.wsize = tempargs.wsize; + args.rsize = tempargs.rsize; + args.readdirsize = tempargs.readdirsize; + args.timeo = tempargs.timeo; + args.retrans = tempargs.retrans; + args.maxgrouplist = tempargs.maxgrouplist; + args.readahead = tempargs.readahead; + args.leaseterm = tempargs.leaseterm; + args.deadthresh = tempargs.deadthresh; + args.addr = CAST_USER_ADDR_T(tempargs.addr); + args.fh = CAST_USER_ADDR_T(tempargs.fh); + args.hostname = CAST_USER_ADDR_T(tempargs.hostname); + if (argsvers >= 4) { + args.acregmin = tempargs.acregmin; + args.acregmax = tempargs.acregmax; + args.acdirmin = tempargs.acdirmin; + args.acdirmax = tempargs.acdirmax; + } + } + + if (args.fhsize > NFSX_V3FHMAX) + return (EINVAL); + error = copyin(args.fh, (caddr_t)nfh, args.fhsize); if (error) return (error); - bzero(&pth[len], MNAMELEN - len); - error = copyinstr(args.hostname, hst, MNAMELEN-1, &len); + + mntfrom = &vfs_statfs(mp)->f_mntfromname[0]; + error = copyinstr(args.hostname, mntfrom, MAXPATHLEN-1, &len); if (error) return (error); - bzero(&hst[len], MNAMELEN - len); + bzero(&mntfrom[len], MAXPATHLEN - len); + /* sockargs() call must be after above copyin() calls */ - error = sockargs(&nam, (caddr_t)args.addr, args.addrlen, MT_SONAME); + error = sockargs(&nam, args.addr, args.addrlen, MBUF_TYPE_SONAME); if (error) return (error); - args.fh = nfh; - error = mountnfs(&args, mp, nam, pth, hst, &vp); + + args.fh = CAST_USER_ADDR_T(&nfh[0]); + error = mountnfs(&args, mp, nam, p, &vp); return (error); } @@ -781,29 +1014,19 @@ nfs_mount(mp, path, data, ndp, p) * Common code for mount and mountroot */ static int -mountnfs(argp, mp, nam, pth, hst, vpp) - register struct nfs_args *argp; - register struct mount *mp; - struct mbuf *nam; - char *pth, *hst; - struct vnode **vpp; +mountnfs( + struct user_nfs_args *argp, + mount_t mp, + mbuf_t nam, + proc_t p, + vnode_t *vpp) { - register struct nfsmount *nmp; + struct nfsmount *nmp; struct nfsnode *np; int error, maxio; - struct vattr attrs; - struct proc *curproc; - - /* - * turning off NQNFS until we have further testing - * with UBC changes, in particular, nfs_pagein and nfs_pageout. - * Those have NQNFS defined out in conjunction with this - * returning an error. Remove when fully tested. - */ - if (argp->flags & NFSMNT_NQNFS) { - error = NFSERR_NOTSUPP; - goto bad2; - } + struct nfs_vattr nvattrs; + struct vfs_context context; /* XXX get from caller? 
*/ + u_int64_t xid; /* * Silently clear NFSMNT_NOCONN if it's a TCP mount, it makes @@ -812,30 +1035,25 @@ mountnfs(argp, mp, nam, pth, hst, vpp) if (argp->sotype == SOCK_STREAM) argp->flags &= ~NFSMNT_NOCONN; - if (mp->mnt_flag & MNT_UPDATE) { + if (vfs_flags(mp) & MNT_UPDATE) { nmp = VFSTONFS(mp); /* update paths, file handles, etc, here XXX */ - m_freem(nam); + mbuf_freem(nam); return (0); } else { MALLOC_ZONE(nmp, struct nfsmount *, sizeof (struct nfsmount), M_NFSMNT, M_WAITOK); + if (!nmp) { + mbuf_freem(nam); + return (ENOMEM); + } bzero((caddr_t)nmp, sizeof (struct nfsmount)); TAILQ_INIT(&nmp->nm_uidlruhead); TAILQ_INIT(&nmp->nm_bufq); - mp->mnt_data = (qaddr_t)nmp; + vfs_setfsprivate(mp, nmp); } - vfs_getnewfsid(mp); - nmp->nm_mountp = mp; - nmp->nm_flag = argp->flags; - if (nmp->nm_flag & NFSMNT_NQNFS) - /* - * We have to set mnt_maxsymlink to a non-zero value so - * that COMPAT_43 routines will know that we are setting - * the d_type field in directories (and can zero it for - * unsuspecting binaries). - */ - mp->mnt_maxsymlinklen = 1; + + /* setup defaults */ nmp->nm_timeo = NFS_TIMEO; nmp->nm_retry = NFS_RETRANS; if (argp->sotype == SOCK_DGRAM) { @@ -848,18 +1066,21 @@ mountnfs(argp, mp, nam, pth, hst, vpp) nmp->nm_readdirsize = NFS_READDIRSIZE; nmp->nm_numgrps = NFS_MAXGRPS; nmp->nm_readahead = NFS_DEFRAHEAD; - nmp->nm_leaseterm = NQ_DEFLEASE; - nmp->nm_deadthresh = NQ_DEADTHRESH; nmp->nm_tprintf_delay = nfs_tprintf_delay; if (nmp->nm_tprintf_delay < 0) nmp->nm_tprintf_delay = 0; nmp->nm_tprintf_initial_delay = nfs_tprintf_initial_delay; if (nmp->nm_tprintf_initial_delay < 0) nmp->nm_tprintf_initial_delay = 0; - CIRCLEQ_INIT(&nmp->nm_timerhead); - nmp->nm_inprog = NULLVP; - bcopy(hst, mp->mnt_stat.f_mntfromname, MNAMELEN); - bcopy(pth, mp->mnt_stat.f_mntonname, MNAMELEN); + nmp->nm_acregmin = NFS_MINATTRTIMO; + nmp->nm_acregmax = NFS_MAXATTRTIMO; + nmp->nm_acdirmin = NFS_MINDIRATTRTIMO; + nmp->nm_acdirmax = NFS_MAXDIRATTRTIMO; + + vfs_getnewfsid(mp); + nmp->nm_mountp = mp; + vfs_setauthopaque(mp); + nmp->nm_flag = argp->flags; nmp->nm_nam = nam; if ((argp->flags & NFSMNT_TIMEO) && argp->timeo > 0) { @@ -922,16 +1143,30 @@ mountnfs(argp, mp, nam, pth, hst, vpp) if ((argp->flags & NFSMNT_READAHEAD) && argp->readahead >= 0 && argp->readahead <= NFS_MAXRAHEAD) nmp->nm_readahead = argp->readahead; - if ((argp->flags & NFSMNT_LEASETERM) && argp->leaseterm >= 2 && - argp->leaseterm <= NQ_MAXLEASE) - nmp->nm_leaseterm = argp->leaseterm; - if ((argp->flags & NFSMNT_DEADTHRESH) && argp->deadthresh >= 1 && - argp->deadthresh <= NQ_NEVERDEAD) - nmp->nm_deadthresh = argp->deadthresh; + + if (argp->version >= 4) { + if ((argp->flags & NFSMNT_ACREGMIN) && argp->acregmin >= 0) + nmp->nm_acregmin = argp->acregmin; + if ((argp->flags & NFSMNT_ACREGMAX) && argp->acregmax >= 0) + nmp->nm_acregmax = argp->acregmax; + if ((argp->flags & NFSMNT_ACDIRMIN) && argp->acdirmin >= 0) + nmp->nm_acdirmin = argp->acdirmin; + if ((argp->flags & NFSMNT_ACDIRMAX) && argp->acdirmax >= 0) + nmp->nm_acdirmax = argp->acdirmax; + if (nmp->nm_acregmin > nmp->nm_acregmax) + nmp->nm_acregmin = nmp->nm_acregmax; + if (nmp->nm_acdirmin > nmp->nm_acdirmax) + nmp->nm_acdirmin = nmp->nm_acdirmax; + } + /* Set up the sockets and per-host congestion */ nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; + /* make sure mbuf constants are set up */ + if (!nfs_mbuf_mlen) + nfs_mbuf_init(); + /* * For Connection based sockets (TCP,...) defer the connect until * the first request, in case the server is not responding. 
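/*
 * Editorial sketch of the version-4 mount-argument handling above: each
 * attribute-cache timeout is accepted only when its NFSMNT_AC* flag is
 * set and the supplied value is non-negative, after which the minima are
 * clamped so they cannot exceed the maxima; the net effect is
 *
 *	nmp->nm_acregmin = MIN(nmp->nm_acregmin, nmp->nm_acregmax);
 *	nmp->nm_acdirmin = MIN(nmp->nm_acdirmin, nmp->nm_acdirmax);
 *
 * where MIN() stands in for the explicit if-statements in the change.
 */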
@@ -940,6 +1175,21 @@ mountnfs(argp, mp, nam, pth, hst, vpp) (error = nfs_connect(nmp, (struct nfsreq *)0))) goto bad; + /* + * Get file attributes for the mountpoint. These are needed + * in order to properly create the root vnode. + */ + // LP64todo - fix CAST_DOWN of argp->fh + error = nfs_getattr_no_vnode(mp, CAST_DOWN(caddr_t, argp->fh), argp->fhsize, + proc_ucred(p), p, &nvattrs, &xid); + if (error) { + /* + * we got problems... we couldn't get the attributes + * from the NFS server... so the mount fails. + */ + goto bad; + } + /* * A reference count is needed on the nfsnode representing the * remote root. If this object is not persistent, then backward @@ -948,31 +1198,24 @@ mountnfs(argp, mp, nam, pth, hst, vpp) * this problem, because one can identify root inodes by their * number == ROOTINO (2). */ - error = nfs_nget(mp, (nfsfh_t *)argp->fh, argp->fhsize, &np); + error = nfs_nget(mp, NULL, NULL, CAST_DOWN(caddr_t, argp->fh), argp->fhsize, + &nvattrs, &xid, NG_MARKROOT, &np); if (error) goto bad; /* * save this vnode pointer. That way nfs_unmount() - * does not need to call nfs_net() just get it to drop + * does not need to call nfs_nget() just to get it to drop * this vnode reference. */ nmp->nm_dvp = *vpp = NFSTOV(np); - - /* - * Get file attributes for the mountpoint. This has the side - * effect of filling in (*vpp)->v_type with the correct value. - */ - curproc = current_proc(); - error = VOP_GETATTR(*vpp, &attrs, curproc->p_ucred, curproc); + /* get usecount and drop iocount */ + error = vnode_ref(*vpp); if (error) { - /* - * we got problems... we couldn't get the attributes - * from the NFS server... so the mount fails. - */ - vput(*vpp); + vnode_put(*vpp); goto bad; } + vnode_put(*vpp); /* * Set the mount point's block I/O size. @@ -980,13 +1223,25 @@ mountnfs(argp, mp, nam, pth, hst, vpp) * the server about what its preferred I/O sizes are. */ if (nmp->nm_flag & NFSMNT_NFSV3) - nfs_fsinfo(nmp, *vpp, curproc->p_ucred, curproc); - mp->mnt_stat.f_iosize = nfs_iosize(nmp); + nfs_fsinfo(nmp, *vpp, proc_ucred(p), p); + vfs_statfs(mp)->f_iosize = nfs_iosize(nmp); /* - * Lose the lock but keep the ref. + * V3 mounts give us a (relatively) reliable remote access(2) + * call, so advertise the fact. + * + * XXX this may not be the best way to go, as the granularity + * offered isn't a good match to our needs. */ - VOP_UNLOCK(*vpp, 0, curproc); + if (nmp->nm_flag & NFSMNT_NFSV3) + vfs_setauthopaqueaccess(mp); + + /* + * Do statfs to ensure static info gets set to reasonable values. + */ + context.vc_proc = p; + context.vc_ucred = proc_ucred(p); + nfs_statfs(mp, vfs_statfs(mp), &context); if (nmp->nm_flag & NFSMNT_RESVPORT) nfs_resv_mounts++; @@ -995,8 +1250,7 @@ mountnfs(argp, mp, nam, pth, hst, vpp) bad: nfs_disconnect(nmp); FREE_ZONE((caddr_t)nmp, sizeof (struct nfsmount), M_NFSMNT); -bad2: - m_freem(nam); + mbuf_freem(nam); return (error); } @@ -1005,13 +1259,13 @@ bad2: * unmount system call */ static int -nfs_unmount(mp, mntflags, p) - struct mount *mp; - int mntflags; - struct proc *p; +nfs_unmount( + mount_t mp, + int mntflags, + __unused vfs_context_t context) { register struct nfsmount *nmp; - struct vnode *vp; + vnode_t vp; int error, flags = 0; nmp = VFSTONFS(mp); @@ -1029,19 +1283,12 @@ nfs_unmount(mp, mntflags, p) * Goes something like this.. * - Call vflush() to clear out vnodes for this file system, * except for the swap files. Deal with them in 2nd pass. - * It will do vgone making the vnode VBAD at that time.
* - Decrement reference on the vnode representing remote root. * - Close the socket * - Free up the data structures */ vp = nmp->nm_dvp; - /* - * Must handshake with nqnfs_clientd() if it is active. - */ - nmp->nm_state |= NFSSTA_DISMINPROG; - while (nmp->nm_inprog != NULLVP) - (void) tsleep((caddr_t)&lbolt, PSOCK, "nfsdism", 0); /* * vflush will check for busy vnodes on mountpoint. * Will do the right thing for MNT_FORCE. That is, we should @@ -1051,24 +1298,13 @@ nfs_unmount(mp, mntflags, p) if (mntflags & MNT_FORCE) { error = vflush(mp, NULLVP, flags); /* locks vp in the process */ } else { - if (vp->v_usecount > 1) { - nmp->nm_state &= ~NFSSTA_DISMINPROG; + if (vnode_isinuse(vp, 1)) return (EBUSY); - } error = vflush(mp, vp, flags); } - - if (error) { - nmp->nm_state &= ~NFSSTA_DISMINPROG; + if (error) return (error); - } - /* - * We are now committed to the unmount. - * For NQNFS, let the server daemon free the nfsmount structure. - */ - if (nmp->nm_flag & (NFSMNT_NQNFS | NFSMNT_KERB)) - nmp->nm_state |= NFSSTA_DISMNT; nmp->nm_state &= ~NFSSTA_MOUNTED; if (nmp->nm_flag & NFSMNT_RESVPORT) { if (--nfs_resv_mounts == 0) @@ -1077,38 +1313,24 @@ nfs_unmount(mp, mntflags, p) /* * Release the root vnode reference held by mountnfs() - * vflush did the vgone for us when we didn't skip over - * it in the MNT_FORCE case. (Thus vp can't be locked when - * called vflush in non-skip vp case.) */ - vrele(vp); - if (!(mntflags & MNT_FORCE)) - vgone(vp); - mp->mnt_data = 0; /* don't want to end up using stale vp */ + vnode_rele(vp); + + (void)vflush(mp, NULLVP, FORCECLOSE); + vfs_setfsprivate(mp, 0); /* don't want to end up using stale vp */ + nfs_disconnect(nmp); - m_freem(nmp->nm_nam); + mbuf_freem(nmp->nm_nam); - if ((nmp->nm_flag & (NFSMNT_NQNFS | NFSMNT_KERB)) == 0) { - register struct nfsreq *rp; + if ((nmp->nm_flag & NFSMNT_KERB) == 0) { + struct nfsreq *rp; /* * Loop through outstanding request list and remove dangling * references to defunct nfsmount struct */ -#if NFSDIAG && 0 - if (hw_atomic_add(&nfsreqqusers, 1) != 1) - nfsatompanic("unmount add"); - nfsbtlen = backtrace(&nfsbt, sizeof(nfsbt)); - nfsbtcpu = cpu_number(); - nfsbtthread = (int)(current_thread()); -#endif - for (rp = nfs_reqq.tqh_first; rp; rp = rp->r_chain.tqe_next) if (rp->r_nmp == nmp) rp->r_nmp = (struct nfsmount *)0; -#if NFSDIAG && 0 - if (hw_atomic_sub(&nfsreqqusers, 1) != 0) - nfsatompanic("unmount sub"); -#endif /* Need to wake up any rcvlock waiters so they notice the unmount. */ if (nmp->nm_state & NFSSTA_WANTRCV) { nmp->nm_state &= ~NFSSTA_WANTRCV; @@ -1123,163 +1345,147 @@ nfs_unmount(mp, mntflags, p) * Return root of a filesystem */ static int -nfs_root(mp, vpp) - struct mount *mp; - struct vnode **vpp; +nfs_root(mount_t mp, vnode_t *vpp, __unused vfs_context_t context) { - register struct vnode *vp; + vnode_t vp; struct nfsmount *nmp; - int error, vpid; + int error; + u_long vpid; nmp = VFSTONFS(mp); vp = nmp->nm_dvp; - vpid = vp->v_id; - while (error = vget(vp, LK_EXCLUSIVE, current_proc())) { - /* vget may return ENOENT if the dir changes while in vget */ - /* If that happens, try vget again, else return the error */ - if ((error != ENOENT) || (vp->v_id == vpid)) + vpid = vnode_vid(vp); + while ((error = vnode_getwithvid(vp, vpid))) { + /* vnode_get() may return ENOENT if the dir changes. */ + /* If that happens, just try it again, else return the error. 
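 *
 * The vid serves as a generation number: vnode_getwithvid() returns
 * ENOENT when the vnode was recycled after the vid was sampled.
 * Annotated form of the loop (editorial):
 *
 *	vpid = vnode_vid(vp);
 *	while ((error = vnode_getwithvid(vp, vpid))) {
 *		if ((error != ENOENT) || (vnode_vid(vp) == vpid))
 *			return (error);		// hard error, or same generation
 *		vpid = vnode_vid(vp);		// recycled: resample and retry
 *	}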
*/ + if ((error != ENOENT) || (vnode_vid(vp) == vpid)) return (error); - vpid = vp->v_id; + vpid = vnode_vid(vp); } - if (vp->v_type == VNON) - vp->v_type = VDIR; - vp->v_flag |= VROOT; *vpp = vp; return (0); } -extern int syncprt; - /* * Flush out the buffer cache */ -/* ARGSUSED */ + +struct nfs_sync_cargs { + vfs_context_t context; + int waitfor; + int error; +}; + static int -nfs_sync(mp, waitfor, cred, p) - struct mount *mp; - int waitfor; - struct ucred *cred; - struct proc *p; +nfs_sync_callout(vnode_t vp, void *arg) { - register struct vnode *vp; - int error, allerror = 0; + struct nfs_sync_cargs *cargs = (struct nfs_sync_cargs*)arg; + int error; - /* - * Force stale buffer cache information to be flushed. - */ -loop: - LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { - int didhold; - /* - * If the vnode that we are about to sync is no longer - * associated with this mount point, start over. - */ - if (vp->v_mount != mp) - goto loop; - if (VOP_ISLOCKED(vp) || LIST_FIRST(&VTONFS(vp)->n_dirtyblkhd) == NULL) - continue; - if (vget(vp, LK_EXCLUSIVE, p)) - goto loop; - didhold = ubc_hold(vp); - error = VOP_FSYNC(vp, cred, waitfor, p); - if (error) - allerror = error; - VOP_UNLOCK(vp, 0, p); - if (didhold) - ubc_rele(vp); - vrele(vp); - } - return (allerror); + if (LIST_EMPTY(&VTONFS(vp)->n_dirtyblkhd)) + return (VNODE_RETURNED); + if (VTONFS(vp)->n_flag & NWRBUSY) + return (VNODE_RETURNED); + + error = nfs_flush(vp, cargs->waitfor, + vfs_context_ucred(cargs->context), + vfs_context_proc(cargs->context), 0); + if (error) + cargs->error = error; + + return (VNODE_RETURNED); +} + +static int +nfs_sync(mount_t mp, int waitfor, vfs_context_t context) +{ + struct nfs_sync_cargs cargs; + + cargs.waitfor = waitfor; + cargs.context = context; + cargs.error = 0; + + vnode_iterate(mp, 0, nfs_sync_callout, &cargs); + + return (cargs.error); } /* * NFS flat namespace lookup. * Currently unsupported. */ -/* ARGSUSED */ +/*ARGSUSED*/ static int -nfs_vget(mp, ino, vpp) - struct mount *mp; - void *ino; /* XXX void* or ino_t? */ - struct vnode **vpp; +nfs_vget( + __unused mount_t mp, + __unused ino64_t ino, + __unused vnode_t *vpp, + __unused vfs_context_t context) { - return (EOPNOTSUPP); + return (ENOTSUP); } /* * At this point, this should never happen */ -/* ARGSUSED */ +/*ARGSUSED*/ static int -nfs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) - register struct mount *mp; - struct fid *fhp; - struct mbuf *nam; - struct vnode **vpp; - int *exflagsp; - struct ucred **credanonp; +nfs_fhtovp( + __unused mount_t mp, + __unused int fhlen, + __unused unsigned char *fhp, + __unused vnode_t *vpp, + __unused vfs_context_t context) { - return (EINVAL); + return (ENOTSUP); } /* * Vnode pointer to File handle, should never happen either */ -/* ARGSUSED */ +/*ARGSUSED*/ static int -nfs_vptofh(vp, fhp) - struct vnode *vp; - struct fid *fhp; +nfs_vptofh( + __unused vnode_t vp, + __unused int *fhlenp, + __unused unsigned char *fhp, + __unused vfs_context_t context) { - return (EINVAL); + return (ENOTSUP); } /* * Vfs start routine, a no-op. 
*/ -/* ARGSUSED */ +/*ARGSUSED*/ static int -nfs_start(mp, flags, p) - struct mount *mp; - int flags; - struct proc *p; +nfs_start( + __unused mount_t mp, + __unused int flags, + __unused vfs_context_t context) { return (0); } -/* - * Do operations associated with quotas, not supported - */ -/* ARGSUSED */ -static int -nfs_quotactl(mp, cmd, uid, arg, p) - struct mount *mp; - int cmd; - uid_t uid; - caddr_t arg; - struct proc *p; -{ - - return (EOPNOTSUPP); -} - /* * Do that sysctl thang... */ static int -nfs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, - size_t newlen, struct proc *p) +nfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, vfs_context_t context) { int error = 0, val; - struct sysctl_req *req; + struct sysctl_req *req = NULL; struct vfsidctl vc; - struct mount *mp; - struct nfsmount *nmp; + struct user_vfsidctl user_vc; + mount_t mp; + struct nfsmount *nmp = NULL; struct vfsquery vq; + boolean_t is_64_bit; /* * All names at this level are terminal. @@ -1287,23 +1493,41 @@ nfs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, if(namelen > 1) return ENOTDIR; /* overloaded */ + is_64_bit = vfs_context_is64bit(context); + /* common code for "new style" VFS_CTL sysctl, get the mount. */ switch (name[0]) { case VFS_CTL_TIMEO: case VFS_CTL_QUERY: case VFS_CTL_NOLOCKS: - req = oldp; - error = SYSCTL_IN(req, &vc, sizeof(vc)); - if (error) - return (error); - mp = vfs_getvfs(&vc.vc_fsid); + req = CAST_DOWN(struct sysctl_req *, oldp); + if (is_64_bit) { + error = SYSCTL_IN(req, &user_vc, sizeof(user_vc)); + if (error) + return (error); + mp = vfs_getvfs(&user_vc.vc_fsid); + } + else { + error = SYSCTL_IN(req, &vc, sizeof(vc)); + if (error) + return (error); + mp = vfs_getvfs(&vc.vc_fsid); + } if (mp == NULL) return (ENOENT); nmp = VFSTONFS(mp); if (nmp == NULL) return (ENOENT); bzero(&vq, sizeof(vq)); - VCTLTOREQ(&vc, req); + req->newidx = 0; + if (is_64_bit) { + req->newptr = user_vc.vc_ptr; + req->newlen = (size_t)user_vc.vc_len; + } + else { + req->newptr = CAST_USER_ADDR_T(vc.vc_ptr); + req->newlen = vc.vc_len; + } } switch(name[0]) { @@ -1331,12 +1555,12 @@ nfs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, return 0; case VFS_CTL_NOLOCKS: val = (nmp->nm_flag & NFSMNT_NOLOCKS) ? 
1 : 0; - if (req->oldptr != NULL) { + if (req->oldptr != USER_ADDR_NULL) { error = SYSCTL_OUT(req, &val, sizeof(val)); if (error) return (error); } - if (req->newptr != NULL) { + if (req->newptr != USER_ADDR_NULL) { error = SYSCTL_IN(req, &val, sizeof(val)); if (error) return (error); @@ -1347,7 +1571,7 @@ nfs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, } break; case VFS_CTL_QUERY: - if ((nmp->nm_state & NFSSTA_TIMEO)) + if (nmp->nm_state & NFSSTA_TIMEO) vq.vq_flags |= VQ_NOTRESP; if (!(nmp->nm_flag & NFSMNT_NOLOCKS) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) @@ -1355,13 +1579,13 @@ nfs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, error = SYSCTL_OUT(req, &vq, sizeof(vq)); break; case VFS_CTL_TIMEO: - if (req->oldptr != NULL) { + if (req->oldptr != USER_ADDR_NULL) { error = SYSCTL_OUT(req, &nmp->nm_tprintf_initial_delay, sizeof(nmp->nm_tprintf_initial_delay)); if (error) return (error); } - if (req->newptr != NULL) { + if (req->newptr != USER_ADDR_NULL) { error = SYSCTL_IN(req, &nmp->nm_tprintf_initial_delay, sizeof(nmp->nm_tprintf_initial_delay)); if (error) diff --git a/bsd/nfs/nfs_vnops.c b/bsd/nfs/nfs_vnops.c index a1f5b2a3d..c858df061 100644 --- a/bsd/nfs/nfs_vnops.c +++ b/bsd/nfs/nfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -67,26 +67,28 @@ #include <sys/kernel.h> #include <sys/systm.h> #include <sys/resourcevar.h> -#include <sys/proc.h> -#include <sys/mount.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> +#include <sys/mount_internal.h> #include <sys/malloc.h> -#include <sys/mbuf.h> +#include <sys/kpi_mbuf.h> #include <sys/conf.h> -#include <sys/namei.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/dirent.h> #include <sys/fcntl.h> #include <sys/lockf.h> -#include <sys/ubc.h> +#include <sys/ubc_internal.h> +#include <sys/attr.h> +#include <sys/signalvar.h> +#include <sys/uio_internal.h> #include <vfs/vfs_support.h> #include <sys/vm.h> -#include <machine/spl.h> -#include <vm/vm_pageout.h> #include <sys/time.h> #include <kern/clock.h> +#include <libkern/OSAtomic.h> #include <miscfs/fifofs/fifo.h> #include <miscfs/specfs/specdev.h> @@ -99,7 +101,6 @@ #include <nfs/nfs_lock.h> #include <nfs/xdr_subs.h> #include <nfs/nfsm_subs.h> -#include <nfs/nqnfs.h> #include <net/if.h> #include <netinet/in.h> @@ -121,121 +122,85 @@ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \ (int)(B), (int)(C), (int)(D), (int)(E), 0) -#define TRUE 1 -#define FALSE 0 - -#define NFS_FREE_PNBUF(CNP) \ - do { \ - char *tmp = (CNP)->cn_pnbuf; \ - (CNP)->cn_pnbuf = NULL; \ - (CNP)->cn_flags &= ~HASBUF; \ - FREE_ZONE(tmp, (CNP)->cn_pnlen, M_NAMEI); \ - } while (0) - - -static int nfsspec_read __P((struct vop_read_args *)); -static int nfsspec_write __P((struct vop_write_args *)); -static int nfsfifo_read __P((struct vop_read_args *)); -static int nfsfifo_write __P((struct vop_write_args *)); -static int nfsspec_close __P((struct vop_close_args *)); -static int nfsfifo_close __P((struct vop_close_args *)); -#define nfs_poll vop_nopoll -static int nfs_ioctl __P((struct vop_ioctl_args *)); -static int nfs_select __P((struct vop_select_args *)); -static int nfs_flush __P((struct vnode *,struct ucred *,int,struct proc *)); -static int nfs_setattrrpc __P((struct vnode *,struct vattr *,struct ucred *,struct proc *)); -static int nfs_lookup __P((struct vop_lookup_args 
*)); -static int nfs_create __P((struct vop_create_args *)); -static int nfs_mknod __P((struct vop_mknod_args *)); -static int nfs_open __P((struct vop_open_args *)); -static int nfs_close __P((struct vop_close_args *)); -static int nfs_access __P((struct vop_access_args *)); -static int nfs_getattr __P((struct vop_getattr_args *)); -static int nfs_setattr __P((struct vop_setattr_args *)); -static int nfs_read __P((struct vop_read_args *)); -static int nfs_mmap __P((struct vop_mmap_args *)); -static int nfs_fsync __P((struct vop_fsync_args *)); -static int nfs_remove __P((struct vop_remove_args *)); -static int nfs_link __P((struct vop_link_args *)); -static int nfs_rename __P((struct vop_rename_args *)); -static int nfs_mkdir __P((struct vop_mkdir_args *)); -static int nfs_rmdir __P((struct vop_rmdir_args *)); -static int nfs_symlink __P((struct vop_symlink_args *)); -static int nfs_readdir __P((struct vop_readdir_args *)); -static int nfs_bmap __P((struct vop_bmap_args *)); -static int nfs_lookitup __P((struct vnode *,char *,int,struct ucred *,struct proc *,struct nfsnode **)); -static int nfs_sillyrename __P((struct vnode *,struct vnode *,struct componentname *)); -static int nfsspec_access __P((struct vop_access_args *)); -static int nfs_readlink __P((struct vop_readlink_args *)); -static int nfs_print __P((struct vop_print_args *)); -static int nfs_pathconf __P((struct vop_pathconf_args *)); -static int nfs_advlock __P((struct vop_advlock_args *)); -static int nfs_blkatoff __P((struct vop_blkatoff_args *)); -static int nfs_valloc __P((struct vop_valloc_args *)); -static int nfs_vfree __P((struct vop_vfree_args *)); -static int nfs_truncate __P((struct vop_truncate_args *)); -static int nfs_update __P((struct vop_update_args *)); -static int nfs_pagein __P((struct vop_pagein_args *)); -static int nfs_pageout __P((struct vop_pageout_args *)); -static int nfs_blktooff __P((struct vop_blktooff_args *)); -static int nfs_offtoblk __P((struct vop_offtoblk_args *)); -static int nfs_cmap __P((struct vop_cmap_args *)); +static int nfsspec_read(struct vnop_read_args *); +static int nfsspec_write(struct vnop_write_args *); +static int nfsfifo_read(struct vnop_read_args *); +static int nfsfifo_write(struct vnop_write_args *); +static int nfsspec_close(struct vnop_close_args *); +static int nfsfifo_close(struct vnop_close_args *); +static int nfs_ioctl(struct vnop_ioctl_args *); +static int nfs_select(struct vnop_select_args *); +static int nfs_setattrrpc(vnode_t,struct vnode_attr *,kauth_cred_t,proc_t); +static int nfs_lookup(struct vnop_lookup_args *); +static int nfs_create(struct vnop_create_args *); +static int nfs_mknod(struct vnop_mknod_args *); +static int nfs_open(struct vnop_open_args *); +static int nfs_close(struct vnop_close_args *); +static int nfs_access(struct vnop_access_args *); +static int nfs_vnop_getattr(struct vnop_getattr_args *); +static int nfs_setattr(struct vnop_setattr_args *); +static int nfs_read(struct vnop_read_args *); +static int nfs_mmap(struct vnop_mmap_args *); +static int nfs_fsync(struct vnop_fsync_args *); +static int nfs_remove(struct vnop_remove_args *); +static int nfs_link(struct vnop_link_args *); +static int nfs_rename(struct vnop_rename_args *); +static int nfs_mkdir(struct vnop_mkdir_args *); +static int nfs_rmdir(struct vnop_rmdir_args *); +static int nfs_symlink(struct vnop_symlink_args *); +static int nfs_readdir(struct vnop_readdir_args *); +static int nfs_lookitup(vnode_t,char *,int,kauth_cred_t,proc_t,struct nfsnode **); +static int 
nfs_sillyrename(vnode_t,vnode_t,struct componentname *,kauth_cred_t,proc_t); +static int nfs_readlink(struct vnop_readlink_args *); +static int nfs_pathconf(struct vnop_pathconf_args *); +static int nfs_advlock(struct vnop_advlock_args *); +static int nfs_pagein(struct vnop_pagein_args *); +static int nfs_pageout(struct vnop_pageout_args *); +static int nfs_blktooff(struct vnop_blktooff_args *); +static int nfs_offtoblk(struct vnop_offtoblk_args *); +static int nfs_blockmap(struct vnop_blockmap_args *); /* * Global vfs data structures for nfs */ -vop_t **nfsv2_vnodeop_p; +vnop_t **nfsv2_vnodeop_p; static struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { - { &vop_default_desc, (vop_t *)vn_default_error }, - { &vop_lookup_desc, (vop_t *)nfs_lookup }, /* lookup */ - { &vop_create_desc, (vop_t *)nfs_create }, /* create */ - { &vop_mknod_desc, (vop_t *)nfs_mknod }, /* mknod */ - { &vop_open_desc, (vop_t *)nfs_open }, /* open */ - { &vop_close_desc, (vop_t *)nfs_close }, /* close */ - { &vop_access_desc, (vop_t *)nfs_access }, /* access */ - { &vop_getattr_desc, (vop_t *)nfs_getattr }, /* getattr */ - { &vop_setattr_desc, (vop_t *)nfs_setattr }, /* setattr */ - { &vop_read_desc, (vop_t *)nfs_read }, /* read */ - { &vop_write_desc, (vop_t *)nfs_write }, /* write */ - { &vop_lease_desc, (vop_t *)nfs_lease_check }, /* lease */ - { &vop_ioctl_desc, (vop_t *)nfs_ioctl }, /* ioctl */ - { &vop_select_desc, (vop_t *)nfs_select }, /* select */ - { &vop_revoke_desc, (vop_t *)nfs_revoke }, /* revoke */ - { &vop_mmap_desc, (vop_t *)nfs_mmap }, /* mmap */ - { &vop_fsync_desc, (vop_t *)nfs_fsync }, /* fsync */ - { &vop_seek_desc, (vop_t *)nfs_seek }, /* seek */ - { &vop_remove_desc, (vop_t *)nfs_remove }, /* remove */ - { &vop_link_desc, (vop_t *)nfs_link }, /* link */ - { &vop_rename_desc, (vop_t *)nfs_rename }, /* rename */ - { &vop_mkdir_desc, (vop_t *)nfs_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (vop_t *)nfs_rmdir }, /* rmdir */ - { &vop_symlink_desc, (vop_t *)nfs_symlink }, /* symlink */ - { &vop_readdir_desc, (vop_t *)nfs_readdir }, /* readdir */ - { &vop_readlink_desc, (vop_t *)nfs_readlink }, /* readlink */ - { &vop_abortop_desc, (vop_t *)nop_abortop }, /* abortop */ - { &vop_inactive_desc, (vop_t *)nfs_inactive }, /* inactive */ - { &vop_reclaim_desc, (vop_t *)nfs_reclaim }, /* reclaim */ - { &vop_lock_desc, (vop_t *)nfs_lock }, /* lock */ - { &vop_unlock_desc, (vop_t *)nfs_unlock }, /* unlock */ - { &vop_bmap_desc, (vop_t *)nfs_bmap }, /* bmap */ - { &vop_strategy_desc, (vop_t *)err_strategy }, /* strategy */ - { &vop_print_desc, (vop_t *)nfs_print }, /* print */ - { &vop_islocked_desc, (vop_t *)nfs_islocked }, /* islocked */ - { &vop_pathconf_desc, (vop_t *)nfs_pathconf }, /* pathconf */ - { &vop_advlock_desc, (vop_t *)nfs_advlock }, /* advlock */ - { &vop_blkatoff_desc, (vop_t *)nfs_blkatoff }, /* blkatoff */ - { &vop_valloc_desc, (vop_t *)nfs_valloc }, /* valloc */ - { &vop_reallocblks_desc, (vop_t *)nfs_reallocblks }, /* reallocblks */ - { &vop_vfree_desc, (vop_t *)nfs_vfree }, /* vfree */ - { &vop_truncate_desc, (vop_t *)nfs_truncate }, /* truncate */ - { &vop_update_desc, (vop_t *)nfs_update }, /* update */ - { &vop_bwrite_desc, (vop_t *)err_bwrite }, /* bwrite */ - { &vop_pagein_desc, (vop_t *)nfs_pagein }, /* Pagein */ - { &vop_pageout_desc, (vop_t *)nfs_pageout }, /* Pageout */ - { &vop_copyfile_desc, (vop_t *)err_copyfile }, /* Copyfile */ - { &vop_blktooff_desc, (vop_t *)nfs_blktooff }, /* blktooff */ - { &vop_offtoblk_desc, (vop_t *)nfs_offtoblk }, /* offtoblk */ - { 
&vop_cmap_desc, (vop_t *)nfs_cmap }, /* cmap */ + { &vnop_default_desc, (vnop_t *)vn_default_error }, + { &vnop_lookup_desc, (vnop_t *)nfs_lookup }, /* lookup */ + { &vnop_create_desc, (vnop_t *)nfs_create }, /* create */ + { &vnop_mknod_desc, (vnop_t *)nfs_mknod }, /* mknod */ + { &vnop_open_desc, (vnop_t *)nfs_open }, /* open */ + { &vnop_close_desc, (vnop_t *)nfs_close }, /* close */ + { &vnop_access_desc, (vnop_t *)nfs_access }, /* access */ + { &vnop_getattr_desc, (vnop_t *)nfs_vnop_getattr }, /* getattr */ + { &vnop_setattr_desc, (vnop_t *)nfs_setattr }, /* setattr */ + { &vnop_read_desc, (vnop_t *)nfs_read }, /* read */ + { &vnop_write_desc, (vnop_t *)nfs_write }, /* write */ + { &vnop_ioctl_desc, (vnop_t *)nfs_ioctl }, /* ioctl */ + { &vnop_select_desc, (vnop_t *)nfs_select }, /* select */ + { &vnop_revoke_desc, (vnop_t *)nfs_revoke }, /* revoke */ + { &vnop_mmap_desc, (vnop_t *)nfs_mmap }, /* mmap */ + { &vnop_fsync_desc, (vnop_t *)nfs_fsync }, /* fsync */ + { &vnop_remove_desc, (vnop_t *)nfs_remove }, /* remove */ + { &vnop_link_desc, (vnop_t *)nfs_link }, /* link */ + { &vnop_rename_desc, (vnop_t *)nfs_rename }, /* rename */ + { &vnop_mkdir_desc, (vnop_t *)nfs_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (vnop_t *)nfs_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (vnop_t *)nfs_symlink }, /* symlink */ + { &vnop_readdir_desc, (vnop_t *)nfs_readdir }, /* readdir */ + { &vnop_readlink_desc, (vnop_t *)nfs_readlink }, /* readlink */ + { &vnop_inactive_desc, (vnop_t *)nfs_inactive }, /* inactive */ + { &vnop_reclaim_desc, (vnop_t *)nfs_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (vnop_t *)err_strategy }, /* strategy */ + { &vnop_pathconf_desc, (vnop_t *)nfs_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (vnop_t *)nfs_advlock }, /* advlock */ + { &vnop_bwrite_desc, (vnop_t *)err_bwrite }, /* bwrite */ + { &vnop_pagein_desc, (vnop_t *)nfs_pagein }, /* Pagein */ + { &vnop_pageout_desc, (vnop_t *)nfs_pageout }, /* Pageout */ + { &vnop_copyfile_desc, (vnop_t *)err_copyfile }, /* Copyfile */ + { &vnop_blktooff_desc, (vnop_t *)nfs_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (vnop_t *)nfs_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (vnop_t *)nfs_blockmap }, /* blockmap */ { NULL, NULL } }; struct vnodeopv_desc nfsv2_vnodeop_opv_desc = @@ -247,58 +212,42 @@ VNODEOP_SET(nfsv2_vnodeop_opv_desc); /* * Special device vnode ops */ -vop_t **spec_nfsv2nodeop_p; +vnop_t **spec_nfsv2nodeop_p; static struct vnodeopv_entry_desc spec_nfsv2nodeop_entries[] = { - { &vop_default_desc, (vop_t *)vn_default_error }, - { &vop_lookup_desc, (vop_t *)spec_lookup }, /* lookup */ - { &vop_create_desc, (vop_t *)spec_create }, /* create */ - { &vop_mknod_desc, (vop_t *)spec_mknod }, /* mknod */ - { &vop_open_desc, (vop_t *)spec_open }, /* open */ - { &vop_close_desc, (vop_t *)nfsspec_close }, /* close */ - { &vop_access_desc, (vop_t *)nfsspec_access }, /* access */ - { &vop_getattr_desc, (vop_t *)nfs_getattr }, /* getattr */ - { &vop_setattr_desc, (vop_t *)nfs_setattr }, /* setattr */ - { &vop_read_desc, (vop_t *)nfsspec_read }, /* read */ - { &vop_write_desc, (vop_t *)nfsspec_write }, /* write */ - { &vop_lease_desc, (vop_t *)spec_lease_check }, /* lease */ - { &vop_ioctl_desc, (vop_t *)spec_ioctl }, /* ioctl */ - { &vop_select_desc, (vop_t *)spec_select }, /* select */ - { &vop_revoke_desc, (vop_t *)spec_revoke }, /* revoke */ - { &vop_mmap_desc, (vop_t *)spec_mmap }, /* mmap */ - { &vop_fsync_desc, (vop_t *)nfs_fsync }, /* fsync */ - { &vop_seek_desc, (vop_t *)spec_seek }, /* seek */ - { 
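All three operation tables in this file follow the same registration shape: a NULL-terminated array pairing operation descriptors with handlers, wrapped in a vnodeopv_desc that names the vector pointer the VFS fills in at VNODEOP_SET time. A trimmed sketch of that shape (the myfs_* names are invented; vn_default_error backstops any operation the table omits):

    static int myfs_lookup(struct vnop_lookup_args *);
    static int myfs_open(struct vnop_open_args *);
    static int myfs_close(struct vnop_close_args *);

    vnop_t **myfs_vnodeop_p;
    static struct vnodeopv_entry_desc myfs_vnodeop_entries[] = {
        { &vnop_default_desc, (vnop_t *)vn_default_error },
        { &vnop_lookup_desc,  (vnop_t *)myfs_lookup },   /* lookup */
        { &vnop_open_desc,    (vnop_t *)myfs_open },     /* open */
        { &vnop_close_desc,   (vnop_t *)myfs_close },    /* close */
        { NULL, NULL }                                   /* terminator */
    };
    struct vnodeopv_desc myfs_vnodeop_opv_desc =
        { &myfs_vnodeop_p, myfs_vnodeop_entries };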
&vop_remove_desc, (vop_t *)spec_remove }, /* remove */ - { &vop_link_desc, (vop_t *)spec_link }, /* link */ - { &vop_rename_desc, (vop_t *)spec_rename }, /* rename */ - { &vop_mkdir_desc, (vop_t *)spec_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (vop_t *)spec_rmdir }, /* rmdir */ - { &vop_symlink_desc, (vop_t *)spec_symlink }, /* symlink */ - { &vop_readdir_desc, (vop_t *)spec_readdir }, /* readdir */ - { &vop_readlink_desc, (vop_t *)spec_readlink }, /* readlink */ - { &vop_abortop_desc, (vop_t *)spec_abortop }, /* abortop */ - { &vop_inactive_desc, (vop_t *)nfs_inactive }, /* inactive */ - { &vop_reclaim_desc, (vop_t *)nfs_reclaim }, /* reclaim */ - { &vop_lock_desc, (vop_t *)nfs_lock }, /* lock */ - { &vop_unlock_desc, (vop_t *)nfs_unlock }, /* unlock */ - { &vop_bmap_desc, (vop_t *)spec_bmap }, /* bmap */ - { &vop_strategy_desc, (vop_t *)spec_strategy }, /* strategy */ - { &vop_print_desc, (vop_t *)nfs_print }, /* print */ - { &vop_islocked_desc, (vop_t *)nfs_islocked }, /* islocked */ - { &vop_pathconf_desc, (vop_t *)spec_pathconf }, /* pathconf */ - { &vop_advlock_desc, (vop_t *)spec_advlock }, /* advlock */ - { &vop_blkatoff_desc, (vop_t *)spec_blkatoff }, /* blkatoff */ - { &vop_valloc_desc, (vop_t *)spec_valloc }, /* valloc */ - { &vop_reallocblks_desc, (vop_t *)spec_reallocblks }, /* reallocblks */ - { &vop_vfree_desc, (vop_t *)spec_vfree }, /* vfree */ - { &vop_truncate_desc, (vop_t *)spec_truncate }, /* truncate */ - { &vop_update_desc, (vop_t *)nfs_update }, /* update */ - { &vop_bwrite_desc, (vop_t *)vn_bwrite }, /* bwrite */ - { &vop_devblocksize_desc, (vop_t *)spec_devblocksize }, /* devblocksize */ - { &vop_pagein_desc, (vop_t *)nfs_pagein }, /* Pagein */ - { &vop_pageout_desc, (vop_t *)nfs_pageout }, /* Pageout */ - { &vop_blktooff_desc, (vop_t *)nfs_blktooff }, /* blktooff */ - { &vop_offtoblk_desc, (vop_t *)nfs_offtoblk }, /* offtoblk */ - { &vop_cmap_desc, (vop_t *)nfs_cmap }, /* cmap */ + { &vnop_default_desc, (vnop_t *)vn_default_error }, + { &vnop_lookup_desc, (vnop_t *)spec_lookup }, /* lookup */ + { &vnop_create_desc, (vnop_t *)spec_create }, /* create */ + { &vnop_mknod_desc, (vnop_t *)spec_mknod }, /* mknod */ + { &vnop_open_desc, (vnop_t *)spec_open }, /* open */ + { &vnop_close_desc, (vnop_t *)nfsspec_close }, /* close */ + { &vnop_getattr_desc, (vnop_t *)nfs_vnop_getattr }, /* getattr */ + { &vnop_setattr_desc, (vnop_t *)nfs_setattr }, /* setattr */ + { &vnop_read_desc, (vnop_t *)nfsspec_read }, /* read */ + { &vnop_write_desc, (vnop_t *)nfsspec_write }, /* write */ + { &vnop_ioctl_desc, (vnop_t *)spec_ioctl }, /* ioctl */ + { &vnop_select_desc, (vnop_t *)spec_select }, /* select */ + { &vnop_revoke_desc, (vnop_t *)spec_revoke }, /* revoke */ + { &vnop_mmap_desc, (vnop_t *)spec_mmap }, /* mmap */ + { &vnop_fsync_desc, (vnop_t *)nfs_fsync }, /* fsync */ + { &vnop_remove_desc, (vnop_t *)spec_remove }, /* remove */ + { &vnop_link_desc, (vnop_t *)spec_link }, /* link */ + { &vnop_rename_desc, (vnop_t *)spec_rename }, /* rename */ + { &vnop_mkdir_desc, (vnop_t *)spec_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (vnop_t *)spec_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (vnop_t *)spec_symlink }, /* symlink */ + { &vnop_readdir_desc, (vnop_t *)spec_readdir }, /* readdir */ + { &vnop_readlink_desc, (vnop_t *)spec_readlink }, /* readlink */ + { &vnop_inactive_desc, (vnop_t *)nfs_inactive }, /* inactive */ + { &vnop_reclaim_desc, (vnop_t *)nfs_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (vnop_t *)spec_strategy }, /* strategy */ + { &vnop_pathconf_desc, (vnop_t 
*)spec_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (vnop_t *)spec_advlock }, /* advlock */ + { &vnop_bwrite_desc, (vnop_t *)vn_bwrite }, /* bwrite */ + { &vnop_pagein_desc, (vnop_t *)nfs_pagein }, /* Pagein */ + { &vnop_pageout_desc, (vnop_t *)nfs_pageout }, /* Pageout */ + { &vnop_blktooff_desc, (vnop_t *)nfs_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (vnop_t *)nfs_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (vnop_t *)nfs_blockmap }, /* blockmap */ { NULL, NULL } }; struct vnodeopv_desc spec_nfsv2nodeop_opv_desc = @@ -307,57 +256,42 @@ struct vnodeopv_desc spec_nfsv2nodeop_opv_desc = VNODEOP_SET(spec_nfsv2nodeop_opv_desc); #endif -vop_t **fifo_nfsv2nodeop_p; +vnop_t **fifo_nfsv2nodeop_p; static struct vnodeopv_entry_desc fifo_nfsv2nodeop_entries[] = { - { &vop_default_desc, (vop_t *)vn_default_error }, - { &vop_lookup_desc, (vop_t *)fifo_lookup }, /* lookup */ - { &vop_create_desc, (vop_t *)fifo_create }, /* create */ - { &vop_mknod_desc, (vop_t *)fifo_mknod }, /* mknod */ - { &vop_open_desc, (vop_t *)fifo_open }, /* open */ - { &vop_close_desc, (vop_t *)nfsfifo_close }, /* close */ - { &vop_access_desc, (vop_t *)nfsspec_access }, /* access */ - { &vop_getattr_desc, (vop_t *)nfs_getattr }, /* getattr */ - { &vop_setattr_desc, (vop_t *)nfs_setattr }, /* setattr */ - { &vop_read_desc, (vop_t *)nfsfifo_read }, /* read */ - { &vop_write_desc, (vop_t *)nfsfifo_write }, /* write */ - { &vop_lease_desc, (vop_t *)fifo_lease_check }, /* lease */ - { &vop_ioctl_desc, (vop_t *)fifo_ioctl }, /* ioctl */ - { &vop_select_desc, (vop_t *)fifo_select }, /* select */ - { &vop_revoke_desc, (vop_t *)fifo_revoke }, /* revoke */ - { &vop_mmap_desc, (vop_t *)fifo_mmap }, /* mmap */ - { &vop_fsync_desc, (vop_t *)nfs_fsync }, /* fsync */ - { &vop_seek_desc, (vop_t *)fifo_seek }, /* seek */ - { &vop_remove_desc, (vop_t *)fifo_remove }, /* remove */ - { &vop_link_desc, (vop_t *)fifo_link }, /* link */ - { &vop_rename_desc, (vop_t *)fifo_rename }, /* rename */ - { &vop_mkdir_desc, (vop_t *)fifo_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (vop_t *)fifo_rmdir }, /* rmdir */ - { &vop_symlink_desc, (vop_t *)fifo_symlink }, /* symlink */ - { &vop_readdir_desc, (vop_t *)fifo_readdir }, /* readdir */ - { &vop_readlink_desc, (vop_t *)fifo_readlink }, /* readlink */ - { &vop_abortop_desc, (vop_t *)fifo_abortop }, /* abortop */ - { &vop_inactive_desc, (vop_t *)nfs_inactive }, /* inactive */ - { &vop_reclaim_desc, (vop_t *)nfs_reclaim }, /* reclaim */ - { &vop_lock_desc, (vop_t *)nfs_lock }, /* lock */ - { &vop_unlock_desc, (vop_t *)nfs_unlock }, /* unlock */ - { &vop_bmap_desc, (vop_t *)fifo_bmap }, /* bmap */ - { &vop_strategy_desc, (vop_t *)fifo_strategy }, /* strategy */ - { &vop_print_desc, (vop_t *)nfs_print }, /* print */ - { &vop_islocked_desc, (vop_t *)nfs_islocked }, /* islocked */ - { &vop_pathconf_desc, (vop_t *)fifo_pathconf }, /* pathconf */ - { &vop_advlock_desc, (vop_t *)fifo_advlock }, /* advlock */ - { &vop_blkatoff_desc, (vop_t *)fifo_blkatoff }, /* blkatoff */ - { &vop_valloc_desc, (vop_t *)fifo_valloc }, /* valloc */ - { &vop_reallocblks_desc, (vop_t *)fifo_reallocblks }, /* reallocblks */ - { &vop_vfree_desc, (vop_t *)fifo_vfree }, /* vfree */ - { &vop_truncate_desc, (vop_t *)fifo_truncate }, /* truncate */ - { &vop_update_desc, (vop_t *)nfs_update }, /* update */ - { &vop_bwrite_desc, (vop_t *)vn_bwrite }, /* bwrite */ - { &vop_pagein_desc, (vop_t *)nfs_pagein }, /* Pagein */ - { &vop_pageout_desc, (vop_t *)nfs_pageout }, /* Pageout */ - { &vop_blktooff_desc, (vop_t *)nfs_blktooff 
}, /* blktooff */ - { &vop_offtoblk_desc, (vop_t *)nfs_offtoblk }, /* offtoblk */ - { &vop_cmap_desc, (vop_t *)nfs_cmap }, /* cmap */ + { &vnop_default_desc, (vnop_t *)vn_default_error }, + { &vnop_lookup_desc, (vnop_t *)fifo_lookup }, /* lookup */ + { &vnop_create_desc, (vnop_t *)fifo_create }, /* create */ + { &vnop_mknod_desc, (vnop_t *)fifo_mknod }, /* mknod */ + { &vnop_open_desc, (vnop_t *)fifo_open }, /* open */ + { &vnop_close_desc, (vnop_t *)nfsfifo_close }, /* close */ + { &vnop_getattr_desc, (vnop_t *)nfs_vnop_getattr }, /* getattr */ + { &vnop_setattr_desc, (vnop_t *)nfs_setattr }, /* setattr */ + { &vnop_read_desc, (vnop_t *)nfsfifo_read }, /* read */ + { &vnop_write_desc, (vnop_t *)nfsfifo_write }, /* write */ + { &vnop_ioctl_desc, (vnop_t *)fifo_ioctl }, /* ioctl */ + { &vnop_select_desc, (vnop_t *)fifo_select }, /* select */ + { &vnop_revoke_desc, (vnop_t *)fifo_revoke }, /* revoke */ + { &vnop_mmap_desc, (vnop_t *)fifo_mmap }, /* mmap */ + { &vnop_fsync_desc, (vnop_t *)nfs_fsync }, /* fsync */ + { &vnop_remove_desc, (vnop_t *)fifo_remove }, /* remove */ + { &vnop_link_desc, (vnop_t *)fifo_link }, /* link */ + { &vnop_rename_desc, (vnop_t *)fifo_rename }, /* rename */ + { &vnop_mkdir_desc, (vnop_t *)fifo_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (vnop_t *)fifo_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (vnop_t *)fifo_symlink }, /* symlink */ + { &vnop_readdir_desc, (vnop_t *)fifo_readdir }, /* readdir */ + { &vnop_readlink_desc, (vnop_t *)fifo_readlink }, /* readlink */ + { &vnop_inactive_desc, (vnop_t *)nfs_inactive }, /* inactive */ + { &vnop_reclaim_desc, (vnop_t *)nfs_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (vnop_t *)fifo_strategy }, /* strategy */ + { &vnop_pathconf_desc, (vnop_t *)fifo_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (vnop_t *)fifo_advlock }, /* advlock */ + { &vnop_bwrite_desc, (vnop_t *)vn_bwrite }, /* bwrite */ + { &vnop_pagein_desc, (vnop_t *)nfs_pagein }, /* Pagein */ + { &vnop_pageout_desc, (vnop_t *)nfs_pageout }, /* Pageout */ + { &vnop_blktooff_desc, (vnop_t *)nfs_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (vnop_t *)nfs_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (vnop_t *)nfs_blockmap }, /* blockmap */ { NULL, NULL } }; struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc = @@ -366,29 +300,35 @@ struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc = VNODEOP_SET(fifo_nfsv2nodeop_opv_desc); #endif -static int nfs_mknodrpc __P((struct vnode *dvp, struct vnode **vpp, - struct componentname *cnp, - struct vattr *vap)); -static int nfs_removerpc __P((struct vnode *dvp, char *name, int namelen, - struct ucred *cred, struct proc *proc)); -static int nfs_renamerpc __P((struct vnode *fdvp, char *fnameptr, - int fnamelen, struct vnode *tdvp, - char *tnameptr, int tnamelen, - struct ucred *cred, struct proc *proc)); -static int nfs_renameit __P((struct vnode *sdvp, - struct componentname *scnp, - struct sillyrename *sp)); +static int nfs_mknodrpc(vnode_t dvp, vnode_t *vpp, + struct componentname *cnp, + struct vnode_attr *vap, + kauth_cred_t cred, proc_t p); +static int nfs_removerpc(vnode_t dvp, char *name, int namelen, + kauth_cred_t cred, proc_t proc); +static int nfs_renamerpc(vnode_t fdvp, char *fnameptr, + int fnamelen, vnode_t tdvp, + char *tnameptr, int tnamelen, + kauth_cred_t cred, proc_t proc); /* * Global variables */ +extern u_long nfs_xdrneg1; extern u_long nfs_true, nfs_false; extern struct nfsstats nfsstats; extern nfstype nfsv3_type[9]; -struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; +proc_t 
nfs_iodwant[NFS_MAXASYNCDAEMON]; struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON]; + +lck_grp_t *nfs_iod_lck_grp; +lck_grp_attr_t *nfs_iod_lck_grp_attr; +lck_attr_t *nfs_iod_lck_attr; +lck_mtx_t *nfs_iod_mutex; + int nfs_numasync = 0; int nfs_ioddelwri = 0; + #define DIRHDSIZ (sizeof (struct dirent) - (MAXNAMLEN + 1)) static int nfsaccess_cache_timeout = NFS_MAXATTRTIMO; @@ -514,8 +454,7 @@ static const short errortooutcome[ELAST+1] = { static short -nfs_pageouterrorhandler(error) - int error; +nfs_pageouterrorhandler(int error) { if (error > ELAST) return(DUMP); @@ -524,16 +463,16 @@ nfs_pageouterrorhandler(error) } static int -nfs3_access_otw(struct vnode *vp, +nfs3_access_otw(vnode_t vp, int wmode, - struct proc *p, - struct ucred *cred) + proc_t p, + kauth_cred_t cred) { const int v3 = 1; u_long *tl; int error = 0, attrflag; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; + mbuf_t mreq, mrep, md, mb, mb2; caddr_t bpos, dpos, cp2; register long t1, t2; register caddr_t cp; @@ -542,20 +481,22 @@ nfs3_access_otw(struct vnode *vp, u_int64_t xid; struct timeval now; - nfsstats.rpccnt[NFSPROC_ACCESS]++; - nfsm_reqhead(vp, NFSPROC_ACCESS, NFSX_FH(v3) + NFSX_UNSIGNED); + nfsm_reqhead(NFSX_FH(v3) + NFSX_UNSIGNED); + if (error) + return (error); + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_ACCESS]); nfsm_fhtom(vp, v3); nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = txdr_unsigned(wmode); nfsm_request(vp, NFSPROC_ACCESS, p, cred, &xid); if (mrep) { - nfsm_postop_attr(vp, attrflag, &xid); + nfsm_postop_attr_update(vp, 1, attrflag, &xid); } if (!error) { nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); rmode = fxdr_unsigned(u_int32_t, *tl); np->n_mode = rmode; - np->n_modeuid = cred->cr_uid; + np->n_modeuid = kauth_cred_getuid(cred); microuptime(&now); np->n_modestamp = now.tv_sec; } @@ -571,19 +512,20 @@ nfs3_access_otw(struct vnode *vp, */ static int nfs_access(ap) - struct vop_access_args /* { - struct vnode *a_vp; - int a_mode; - struct ucred *a_cred; - struct proc *a_p; + struct vnop_access_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + int a_mode; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp = ap->a_vp; - int error = 0; + vnode_t vp = ap->a_vp; + int error = 0, dorpc; u_long mode, wmode; int v3 = NFS_ISV3(vp); struct nfsnode *np = VTONFS(vp); struct timeval now; + kauth_cred_t cred; /* * For nfs v3, do an access rpc, otherwise you are stuck emulating @@ -594,23 +536,50 @@ nfs_access(ap) * in the cache. */ if (v3) { - if (ap->a_mode & VREAD) - mode = NFSV3ACCESS_READ; - else - mode = 0; - if (vp->v_type == VDIR) { - if (ap->a_mode & VWRITE) - mode |= NFSV3ACCESS_MODIFY | - NFSV3ACCESS_EXTEND | NFSV3ACCESS_DELETE; - if (ap->a_mode & VEXEC) + /* + * Convert KAUTH primitives to NFS access rights. 
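The nfsiod arrays just above gain a dedicated mutex allocated through the lck_* facility: a lock group (with attributes) is created once, then the mutex is carved out of it. The initialization calls are not part of this hunk; a plausible one-time setup and use, assuming the standard xnu locking primitives:

    /* one-time initialization (sketch; the real init site is elsewhere) */
    nfs_iod_lck_grp_attr = lck_grp_attr_alloc_init();
    nfs_iod_lck_grp = lck_grp_alloc_init("nfs_iod", nfs_iod_lck_grp_attr);
    nfs_iod_lck_attr = lck_attr_alloc_init();
    nfs_iod_mutex = lck_mtx_alloc_init(nfs_iod_lck_grp, nfs_iod_lck_attr);

    /* typical use around the async-daemon state */
    lck_mtx_lock(nfs_iod_mutex);
    /* ... inspect or update nfs_iodwant[] / nfs_iodmount[] ... */
    lck_mtx_unlock(nfs_iod_mutex);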
+ */ + mode = 0; + if (vnode_isdir(vp)) { + /* directory */ + if (ap->a_action & + (KAUTH_VNODE_LIST_DIRECTORY | + KAUTH_VNODE_READ_EXTATTRIBUTES)) + mode |= NFSV3ACCESS_READ; + if (ap->a_action & KAUTH_VNODE_SEARCH) mode |= NFSV3ACCESS_LOOKUP; + if (ap->a_action & + (KAUTH_VNODE_ADD_FILE | + KAUTH_VNODE_ADD_SUBDIRECTORY)) + mode |= NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND; + if (ap->a_action & KAUTH_VNODE_DELETE_CHILD) + mode |= NFSV3ACCESS_MODIFY; } else { - if (ap->a_mode & VWRITE) + /* file */ + if (ap->a_action & + (KAUTH_VNODE_READ_DATA | + KAUTH_VNODE_READ_EXTATTRIBUTES)) + mode |= NFSV3ACCESS_READ; + if (ap->a_action & KAUTH_VNODE_WRITE_DATA) mode |= NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND; - if (ap->a_mode & VEXEC) + if (ap->a_action & KAUTH_VNODE_APPEND_DATA) + mode |= NFSV3ACCESS_EXTEND; + if (ap->a_action & KAUTH_VNODE_EXECUTE) mode |= NFSV3ACCESS_EXECUTE; } - /* XXX safety belt, only make blanket request if caching */ + /* common */ + if (ap->a_action & KAUTH_VNODE_DELETE) + mode |= NFSV3ACCESS_DELETE; + if (ap->a_action & + (KAUTH_VNODE_WRITE_ATTRIBUTES | + KAUTH_VNODE_WRITE_EXTATTRIBUTES | + KAUTH_VNODE_WRITE_SECURITY)) + mode |= NFSV3ACCESS_MODIFY; + /* XXX this is pretty dubious */ + if (ap->a_action & KAUTH_VNODE_CHANGE_OWNER) + mode |= NFSV3ACCESS_MODIFY; + + /* if caching, always ask for every right */ if (nfsaccess_cache_timeout > 0) { wmode = NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE | @@ -618,44 +587,50 @@ nfs_access(ap) } else wmode = mode; + cred = vfs_context_ucred(ap->a_context); + /* * Does our cached result allow us to give a definite yes to * this request? */ - microuptime(&now); - if (now.tv_sec < np->n_modestamp + nfsaccess_cache_timeout && - ap->a_cred->cr_uid == np->n_modeuid && - (np->n_mode & mode) == mode) { - /* nfsstats.accesscache_hits++; */ - } else { + dorpc = 1; + if (NMODEVALID(np)) { + microuptime(&now); + if ((now.tv_sec < (np->n_modestamp + nfsaccess_cache_timeout)) && + (kauth_cred_getuid(cred) == np->n_modeuid) && + ((np->n_mode & mode) == mode)) { + /* OSAddAtomic(1, (SInt32*)&nfsstats.accesscache_hits); */ + dorpc = 0; + } + } + if (dorpc) { + /* Either a no, or a don't know. Go to the wire. */ + /* OSAddAtomic(1, (SInt32*)&nfsstats.accesscache_misses); */ + error = nfs3_access_otw(vp, wmode, vfs_context_proc(ap->a_context), cred); + } + if (!error) { /* - * Either a no, or a don't know. Go to the wire. + * If we asked for DELETE but didn't get it, the server + * may simply not support returning that bit (possible + * on UNIX systems). So, we'll assume that it is OK, + * and just let any subsequent delete action fail if it + * really isn't deletable. */ - /* nfsstats.accesscache_misses++; */ - error = nfs3_access_otw(vp, wmode, ap->a_p,ap->a_cred); - if (!error) { - if ((np->n_mode & mode) != mode) - error = EACCES; - } + if ((mode & NFSV3ACCESS_DELETE) && + !(np->n_mode & NFSV3ACCESS_DELETE)) + np->n_mode |= NFSV3ACCESS_DELETE; + if ((np->n_mode & mode) != mode) + error = EACCES; } - } else - return (nfsspec_access(ap)); /* NFSv2 case checks for EROFS here */ - /* - * Disallow write attempts on filesystems mounted read-only; - * unless the file is a socket, fifo, or a block or character - * device resident on the filesystem. - * CSM - moved EROFS check down per NetBSD rev 1.71. So you - * get the correct error value with layered filesystems. - * EKN - moved the return(error) below this so it does get called. 
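With the switch from VOP access modes to kauth, nfs_access() must fold the fine-grained KAUTH_VNODE_* rights into the much coarser NFSv3 ACCESS bits before going over the wire. The directory half of that mapping, factored out of the inline code above into a hypothetical helper for clarity:

    /* Hypothetical helper mirroring the inline directory mapping above. */
    static u_long
    kauth_to_nfsv3_dir_access(kauth_action_t action)
    {
        u_long mode = 0;

        if (action & (KAUTH_VNODE_LIST_DIRECTORY |
            KAUTH_VNODE_READ_EXTATTRIBUTES))
            mode |= NFSV3ACCESS_READ;
        if (action & KAUTH_VNODE_SEARCH)
            mode |= NFSV3ACCESS_LOOKUP;
        if (action & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY))
            mode |= NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND;
        if (action & KAUTH_VNODE_DELETE_CHILD)
            mode |= NFSV3ACCESS_MODIFY;
        return (mode);
    }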
- */ - if (!error && (ap->a_mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { - switch (vp->v_type) { - case VREG: case VDIR: case VLNK: - error = EROFS; - default: - break; + } else { + /* v2 */ + if ((ap->a_action & KAUTH_VNODE_WRITE_RIGHTS) && vfs_isrdonly(vnode_mount(vp))) { + error = EROFS; + } else { + error = 0; } } + return (error); } @@ -670,82 +645,69 @@ nfs_access(ap) static int nfs_open(ap) - struct vop_open_args /* { - struct vnode *a_vp; - int a_mode; - struct ucred *a_cred; - struct proc *a_p; + struct vnop_open_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + int a_mode; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp = ap->a_vp; + vnode_t vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - struct vattr vattr; + struct nfs_vattr nvattr; + kauth_cred_t cred; + proc_t p; + enum vtype vtype; int error; - if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) { + vtype = vnode_vtype(vp); + if (vtype != VREG && vtype != VDIR && vtype != VLNK) { return (EACCES); } - /* - * Get a valid lease. If cached data is stale, flush it. - */ - if (nmp->nm_flag & NFSMNT_NQNFS) { - if (NQNFS_CKINVALID(vp, np, ND_READ)) { - do { - error = nqnfs_getlease(vp, ND_READ, ap->a_cred, - ap->a_p); - } while (error == NQNFS_EXPIRED); - if (error) + + cred = vfs_context_ucred(ap->a_context); + p = vfs_context_proc(ap->a_context); + + if (np->n_flag & NNEEDINVALIDATE) { + np->n_flag &= ~NNEEDINVALIDATE; + nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cred, p, 1); + } + if (np->n_flag & NMODIFIED) { + if ((error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1)) == EINTR) return (error); - if (np->n_lrev != np->n_brev || - (np->n_flag & NQNFSNONCACHE)) { - if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, - ap->a_p, 1)) == EINTR) - return (error); - np->n_brev = np->n_lrev; - } + NATTRINVALIDATE(np); + if (vtype == VDIR) + np->n_direofoffset = 0; + error = nfs_getattr(vp, &nvattr, cred, p); + if (error) + return (error); + if (vtype == VDIR) { + /* if directory changed, purge any name cache entries */ + if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=)) + cache_purge(vp); + np->n_ncmtime = nvattr.nva_mtime; } + np->n_mtime = nvattr.nva_mtime; } else { - if (np->n_flag & NMODIFIED) { - if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, - ap->a_p, 1)) == EINTR) - return (error); - np->n_xid = 0; - if (vp->v_type == VDIR) + error = nfs_getattr(vp, &nvattr, cred, p); + if (error) + return (error); + if (nfstimespeccmp(&np->n_mtime, &nvattr.nva_mtime, !=)) { + if (vtype == VDIR) { np->n_direofoffset = 0; - error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_p); - if (error) - return (error); - if (vp->v_type == VDIR) { - /* if directory changed, purge any name cache entries */ - if (np->n_ncmtime != vattr.va_mtime.tv_sec) + nfs_invaldir(vp); + /* purge name cache entries */ + if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=)) cache_purge(vp); - np->n_ncmtime = vattr.va_mtime.tv_sec; } - np->n_mtime = vattr.va_mtime.tv_sec; - } else { - error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_p); - if (error) + if ((error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1)) == EINTR) return (error); - if (np->n_mtime != vattr.va_mtime.tv_sec) { - if (vp->v_type == VDIR) { - np->n_direofoffset = 0; - nfs_invaldir(vp); - /* purge name cache entries */ - if (np->n_ncmtime != vattr.va_mtime.tv_sec) - cache_purge(vp); - } - if ((error = nfs_vinvalbuf(vp, V_SAVE, - ap->a_cred, ap->a_p, 1)) == EINTR) - return (error); - if (vp->v_type == 
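The dorpc logic above reduces to three cached facts: the mode bits are still within nfsaccess_cache_timeout, they were obtained for the same uid, and they cover every right now being requested. As a standalone predicate (a sketch; the code above tests this inline):

    /* May the cached ACCESS result answer this request without an RPC? */
    static int
    nfs_access_cache_ok(struct nfsnode *np, kauth_cred_t cred, u_long mode)
    {
        struct timeval now;

        if (!NMODEVALID(np))
            return (0);
        microuptime(&now);
        return ((now.tv_sec < np->n_modestamp + nfsaccess_cache_timeout) &&
            (kauth_cred_getuid(cred) == np->n_modeuid) &&
            ((np->n_mode & mode) == mode));  /* cached rights cover request */
    }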
VDIR) - np->n_ncmtime = vattr.va_mtime.tv_sec; - np->n_mtime = vattr.va_mtime.tv_sec; - } + if (vtype == VDIR) + np->n_ncmtime = nvattr.nva_mtime; + np->n_mtime = nvattr.nva_mtime; } } - if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) - np->n_xid = 0; /* For Open/Close consistency */ + NATTRINVALIDATE(np); /* For Open/Close consistency */ return (0); } @@ -770,27 +732,28 @@ nfs_open(ap) * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers * for NFS Version 3 - flush dirty buffers to the server but don't invalidate * them. - * for NQNFS - do nothing now, since 2 is dealt with via leases and - * 1 should be dealt with via an fsync() system call for - * cases where write errors are important. */ /* ARGSUSED */ static int nfs_close(ap) - struct vop_close_args /* { + struct vnop_close_args /* { struct vnodeop_desc *a_desc; - struct vnode *a_vp; - int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + vnode_t a_vp; + int a_fflag; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp = ap->a_vp; - register struct nfsnode *np = VTONFS(vp); + vnode_t vp = ap->a_vp; + struct nfsnode *np = VTONFS(vp); struct nfsmount *nmp; + kauth_cred_t cred; + proc_t p; int error = 0; - if (vp->v_type == VREG) { + cred = vfs_context_ucred(ap->a_context); + p = vfs_context_proc(ap->a_context); + + if (vnode_vtype(vp) == VREG) { #if DIAGNOSTIC register struct sillyrename *sp = np->n_sillyrename; if (sp) @@ -798,23 +761,16 @@ nfs_close(ap) &sp->s_name[0], (unsigned)(sp->s_dvp), (unsigned)vp, (unsigned)ap, (unsigned)np, (unsigned)sp); #endif - nmp = VFSTONFS(vp->v_mount); + nmp = VFSTONFS(vnode_mount(vp)); if (!nmp) return (ENXIO); - if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && - (np->n_flag & NMODIFIED)) { - int getlock = !VOP_ISLOCKED(vp); - if (getlock) { - error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p); - if (!error && !VFSTONFS(vp->v_mount)) { - VOP_UNLOCK(vp, 0, ap->a_p); - error = ENXIO; - } - if (error) - return (error); - } + if (np->n_flag & NNEEDINVALIDATE) { + np->n_flag &= ~NNEEDINVALIDATE; + nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cred, p, 1); + } + if (np->n_flag & NMODIFIED) { if (NFS_ISV3(vp)) { - error = nfs_flush(vp, ap->a_cred, MNT_WAIT, ap->a_p); + error = nfs_flush(vp, MNT_WAIT, cred, p, 0); /* * We cannot clear the NMODIFIED bit in np->n_flag due to * potential races with other processes @@ -822,11 +778,9 @@ nfs_close(ap) */ /* np->n_flag &= ~NMODIFIED; */ } else { - error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1); + error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); } - np->n_xid = 0; - if (getlock) - VOP_UNLOCK(vp, 0, ap->a_p); + NATTRINVALIDATE(np); } if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; @@ -836,31 +790,99 @@ nfs_close(ap) return (error); } + +int +nfs_getattr_no_vnode( + mount_t mp, + u_char *fhp, + int fhsize, + kauth_cred_t cred, + proc_t p, + struct nfs_vattr *nvap, + u_int64_t *xidp) +{ + mbuf_t mreq, mrep, md, mb, mb2; + caddr_t bpos, dpos; + int t2; + u_long *tl; + caddr_t cp; + struct nfsmount *nmp = VFSTONFS(mp); + int v3 = (nmp->nm_flag & NFSMNT_NFSV3); + int hsiz; + int error = 0; + + // XXX fix this to use macros once the macros get cleaned up + //nfsm_reqhead(NFSX_FH(v3)); + hsiz = NFSX_FH(v3); + mb = NULL; + if (hsiz >= nfs_mbuf_minclsize) + error = mbuf_mclget(MBUF_WAITOK, MBUF_TYPE_DATA, &mb); + else + error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_DATA, &mb); + if (error) + return (error); + bpos = mbuf_data(mb); + mreq = mb; + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_GETATTR]); + //nfsm_fhtom(vp, v3); + if (v3) { 
+ t2 = nfsm_rndup(fhsize) + NFSX_UNSIGNED; + if (t2 <= mbuf_trailingspace(mb)) { + nfsm_build(tl, u_long *, t2); + *tl++ = txdr_unsigned(fhsize); + *(tl + ((t2>>2) - 2)) = 0; + bcopy((caddr_t)fhp,(caddr_t)tl, fhsize); + } else if ((t2 = nfsm_strtmbuf(&mb, &bpos, (caddr_t)fhp, fhsize))) { + error = t2; + mbuf_freem(mreq); + goto nfsmout; + } + } else { + nfsm_build(cp, caddr_t, NFSX_V2FH); + bcopy((caddr_t)fhp, cp, NFSX_V2FH); + } + //nfsm_request(vp, NFSPROC_GETATTR, p, cred, xidp); + if ((error = nfs_request(NULL, mp, mreq, NFSPROC_GETATTR, p, cred, &mrep, &md, &dpos, xidp))) { + if (error & NFSERR_RETERR) + error &= ~NFSERR_RETERR; + else + goto nfsmout; + } + if (!error) { + //nfsm_loadattr(vp, nvap, xidp); + error = nfs_parsefattr(&md, &dpos, v3, nvap); + if (error) { + mbuf_freem(mrep); + goto nfsmout; + } + } + nfsm_reqdone; + return (error); +} + /* * nfs getattr call from vfs. */ -static int -nfs_getattr(ap) - struct vop_getattr_args /* { - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; +int +nfs_getattr( + vnode_t vp, + struct nfs_vattr *nvap, + kauth_cred_t cred, + proc_t p) { - register struct vnode *vp = ap->a_vp; - register struct nfsnode *np = VTONFS(vp); - register caddr_t cp; - register u_long *tl; - register int t1, t2; + struct nfsnode *np = VTONFS(vp); + caddr_t cp; + u_long *tl; + int t1, t2; caddr_t bpos, dpos; int error = 0; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; + mbuf_t mreq, mrep, md, mb, mb2; int v3; u_int64_t xid; int avoidfloods; - - FSDBG_TOP(513, np->n_size, np, np->n_vattr.va_size, np->n_flag); + + FSDBG_TOP(513, np->n_size, np, np->n_vattr.nva_size, np->n_flag); + /* * Update local times for special files. */ @@ -869,29 +891,35 @@ nfs_getattr(ap) /* * First look in the cache. */ - if ((error = nfs_getattrcache(vp, ap->a_vap)) == 0) { - FSDBG_BOT(513, np->n_size, 0, np->n_vattr.va_size, np->n_flag); + if ((error = nfs_getattrcache(vp, nvap)) == 0) { + FSDBG_BOT(513, np->n_size, 0, np->n_vattr.nva_size, np->n_flag); return (0); } if (error != ENOENT) { - FSDBG_BOT(513, np->n_size, error, np->n_vattr.va_size, + FSDBG_BOT(513, np->n_size, error, np->n_vattr.nva_size, np->n_flag); return (error); } - if (!VFSTONFS(vp->v_mount)) { - FSDBG_BOT(513, np->n_size, ENXIO, np->n_vattr.va_size, np->n_flag); + if (!VFSTONFS(vnode_mount(vp))) { + FSDBG_BOT(513, np->n_size, ENXIO, np->n_vattr.nva_size, np->n_flag); return (ENXIO); } v3 = NFS_ISV3(vp); error = 0; - if (v3 && nfsaccess_cache_timeout > 0) { - /* nfsstats.accesscache_misses++; */ - if (error = nfs3_access_otw(vp, NFSV3ACCESS_ALL, ap->a_p, - ap->a_cred)) + /* + * Try to get both the attributes and access info by making an + * ACCESS call and seeing if it returns updated attributes. + * But don't bother if we aren't caching access info or if the + * attributes returned wouldn't be cached. 
+ */ + if (v3 && (nfsaccess_cache_timeout > 0) && + (nfs_attrcachetimeout(vp) > 0)) { + /* OSAddAtomic(1, (SInt32*)&nfsstats.accesscache_misses); */ + if ((error = nfs3_access_otw(vp, NFSV3ACCESS_ALL, p, cred))) return (error); - if ((error = nfs_getattrcache(vp, ap->a_vap)) == 0) + if ((error = nfs_getattrcache(vp, nvap)) == 0) return (0); if (error != ENOENT) return (error); @@ -899,14 +927,18 @@ nfs_getattr(ap) } avoidfloods = 0; tryagain: - nfsstats.rpccnt[NFSPROC_GETATTR]++; - nfsm_reqhead(vp, NFSPROC_GETATTR, NFSX_FH(v3)); + nfsm_reqhead(NFSX_FH(v3)); + if (error) { + FSDBG_BOT(513, np->n_size, error, np->n_vattr.nva_size, np->n_flag); + return (error); + } + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_GETATTR]); nfsm_fhtom(vp, v3); - nfsm_request(vp, NFSPROC_GETATTR, ap->a_p, ap->a_cred, &xid); + nfsm_request(vp, NFSPROC_GETATTR, p, cred, &xid); if (!error) { - nfsm_loadattr(vp, ap->a_vap, &xid); + nfsm_loadattr(vp, v3, nvap, &xid); if (!xid) { /* out-of-order rpc - attributes were dropped */ - m_freem(mrep); + mbuf_freem(mrep); mrep = NULL; FSDBG(513, -1, np, np->n_xid << 32, np->n_xid); if (avoidfloods++ < 100) @@ -916,27 +948,72 @@ tryagain: */ panic("nfs_getattr: getattr flood\n"); } - if (np->n_mtime != ap->a_vap->va_mtime.tv_sec) { + if (nfstimespeccmp(&np->n_mtime, &nvap->nva_mtime, !=)) { + enum vtype vtype = vnode_vtype(vp); FSDBG(513, -1, np, -1, vp); - if (vp->v_type == VDIR) { + if (vtype == VDIR) { nfs_invaldir(vp); /* purge name cache entries */ - if (np->n_ncmtime != ap->a_vap->va_mtime.tv_sec) + if (nfstimespeccmp(&np->n_ncmtime, &nvap->nva_mtime, !=)) cache_purge(vp); } - error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, - ap->a_p, 1); + error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); FSDBG(513, -1, np, -2, error); if (!error) { - if (vp->v_type == VDIR) - np->n_ncmtime = ap->a_vap->va_mtime.tv_sec; - np->n_mtime = ap->a_vap->va_mtime.tv_sec; + if (vtype == VDIR) + np->n_ncmtime = nvap->nva_mtime; + np->n_mtime = nvap->nva_mtime; } } } nfsm_reqdone; - FSDBG_BOT(513, np->n_size, -1, np->n_vattr.va_size, error); + FSDBG_BOT(513, np->n_size, -1, np->n_vattr.nva_size, error); + return (error); +} + + +static int +nfs_vnop_getattr( + struct vnop_getattr_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + struct vnode_attr *a_vap; + vfs_context_t a_context; + } */ *ap) +{ + int error; + struct nfs_vattr nva; + struct vnode_attr *vap = ap->a_vap; + + error = nfs_getattr(ap->a_vp, &nva, + vfs_context_ucred(ap->a_context), + vfs_context_proc(ap->a_context)); + if (error) + return (error); + + /* copy nva to *a_vap */ + VATTR_RETURN(vap, va_type, nva.nva_type); + VATTR_RETURN(vap, va_mode, nva.nva_mode); + VATTR_RETURN(vap, va_rdev, nva.nva_rdev); + VATTR_RETURN(vap, va_uid, nva.nva_uid); + VATTR_RETURN(vap, va_gid, nva.nva_gid); + VATTR_RETURN(vap, va_nlink, nva.nva_nlink); + VATTR_RETURN(vap, va_fileid, nva.nva_fileid); + VATTR_RETURN(vap, va_data_size, nva.nva_size); + VATTR_RETURN(vap, va_data_alloc, nva.nva_bytes); + VATTR_RETURN(vap, va_iosize, nva.nva_blocksize); /* should this just be f_iosize? 
*/ + VATTR_RETURN(vap, va_fsid, nva.nva_fsid); + vap->va_access_time.tv_sec = nva.nva_atime.tv_sec; + vap->va_access_time.tv_nsec = nva.nva_atime.tv_nsec; + VATTR_SET_SUPPORTED(vap, va_access_time); + vap->va_modify_time.tv_sec = nva.nva_mtime.tv_sec; + vap->va_modify_time.tv_nsec = nva.nva_mtime.tv_nsec; + VATTR_SET_SUPPORTED(vap, va_modify_time); + vap->va_change_time.tv_sec = nva.nva_ctime.tv_sec; + vap->va_change_time.tv_nsec = nva.nva_ctime.tv_nsec; + VATTR_SET_SUPPORTED(vap, va_change_time); + return (error); } @@ -945,158 +1022,174 @@ tryagain: */ static int nfs_setattr(ap) - struct vop_setattr_args /* { + struct vnop_setattr_args /* { struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct proc *a_p; + vnode_t a_vp; + struct vnode_attr *a_vap; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp = ap->a_vp; - register struct nfsnode *np = VTONFS(vp); - register struct vattr *vap = ap->a_vap; + vnode_t vp = ap->a_vp; + struct nfsnode *np = VTONFS(vp); + struct vnode_attr *vap = ap->a_vap; int error = 0; u_quad_t tsize; + kauth_cred_t cred; + proc_t p; #ifndef nolint tsize = (u_quad_t)0; #endif -#ifdef XXX /* enable this code soon! (but test it first) */ - /* - * Setting of flags is not supported. - */ - if (vap->va_flags != VNOVAL) - return (EOPNOTSUPP); -#endif - - /* - * Disallow write attempts if the filesystem is mounted read-only. - */ - if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL || - vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || - vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) && - (vp->v_mount->mnt_flag & MNT_RDONLY)) + /* Setting of flags is not supported. */ + if (VATTR_IS_ACTIVE(vap, va_flags)) + return (ENOTSUP); + + cred = vfs_context_ucred(ap->a_context); + p = vfs_context_proc(ap->a_context); + + VATTR_SET_SUPPORTED(vap, va_mode); + VATTR_SET_SUPPORTED(vap, va_uid); + VATTR_SET_SUPPORTED(vap, va_gid); + VATTR_SET_SUPPORTED(vap, va_data_size); + VATTR_SET_SUPPORTED(vap, va_access_time); + VATTR_SET_SUPPORTED(vap, va_modify_time); + + /* Disallow write attempts if the filesystem is mounted read-only. */ + if ((VATTR_IS_ACTIVE(vap, va_flags) || VATTR_IS_ACTIVE(vap, va_mode) || + VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid) || + VATTR_IS_ACTIVE(vap, va_access_time) || + VATTR_IS_ACTIVE(vap, va_modify_time)) && + vnode_vfsisrdonly(vp)) return (EROFS); - if (vap->va_size != VNOVAL) { - switch (vp->v_type) { + + if (VATTR_IS_ACTIVE(vap, va_data_size)) { + switch (vnode_vtype(vp)) { case VDIR: return (EISDIR); case VCHR: case VBLK: case VSOCK: case VFIFO: - if (vap->va_mtime.tv_sec == VNOVAL && - vap->va_atime.tv_sec == VNOVAL && - vap->va_mode == (u_short)VNOVAL && - vap->va_uid == (uid_t)VNOVAL && - vap->va_gid == (gid_t)VNOVAL) + if (!VATTR_IS_ACTIVE(vap, va_modify_time) && + !VATTR_IS_ACTIVE(vap, va_access_time) && + !VATTR_IS_ACTIVE(vap, va_mode) && + !VATTR_IS_ACTIVE(vap, va_uid) && + !VATTR_IS_ACTIVE(vap, va_gid)) return (0); - vap->va_size = VNOVAL; + VATTR_CLEAR_ACTIVE(vap, va_data_size); break; default: /* * Disallow write attempts if the filesystem is * mounted read-only. 
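nfs_vnop_getattr() is now a thin shim over the protocol-level nfs_getattr(): it publishes each attribute through VATTR_RETURN(), which both stores the value and marks the field supported, so the VFS can tell which attributes the filesystem actually filled in. The pattern in miniature, for one scalar and one struct-valued field (helper name invented):

    /* Sketch of the vnode_attr return convention used above. */
    static void
    publish_attrs(struct vnode_attr *vap, struct nfs_vattr *nva)
    {
        /* VATTR_RETURN() assigns and flags the field as supported */
        VATTR_RETURN(vap, va_data_size, nva->nva_size);

        /* struct-valued fields: assign, then mark supported explicitly */
        vap->va_modify_time.tv_sec = nva->nva_mtime.tv_sec;
        vap->va_modify_time.tv_nsec = nva->nva_mtime.tv_nsec;
        VATTR_SET_SUPPORTED(vap, va_modify_time);
    }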
*/ - if (vp->v_mount->mnt_flag & MNT_RDONLY) + if (vnode_vfsisrdonly(vp)) return (EROFS); - FSDBG_TOP(512, np->n_size, vap->va_size, - np->n_vattr.va_size, np->n_flag); + FSDBG_TOP(512, np->n_size, vap->va_data_size, + np->n_vattr.nva_size, np->n_flag); if (np->n_flag & NMODIFIED) { - if (vap->va_size == 0) - error = nfs_vinvalbuf(vp, 0, - ap->a_cred, ap->a_p, 1); + if (vap->va_data_size == 0) + error = nfs_vinvalbuf(vp, 0, cred, p, 1); else - error = nfs_vinvalbuf(vp, V_SAVE, - ap->a_cred, ap->a_p, 1); + error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) { printf("nfs_setattr: nfs_vinvalbuf %d\n", error); - FSDBG_BOT(512, np->n_size, vap->va_size, - np->n_vattr.va_size, -1); + FSDBG_BOT(512, np->n_size, vap->va_data_size, + np->n_vattr.nva_size, -1); return (error); } - } else if (np->n_size > vap->va_size) { /* shrinking? */ - daddr_t obn, bn; - int biosize; + } else if (np->n_size > vap->va_data_size) { /* shrinking? */ + daddr64_t obn, bn; + int biosize, neweofoff, mustwrite; struct nfsbuf *bp; - biosize = vp->v_mount->mnt_stat.f_iosize; + biosize = vfs_statfs(vnode_mount(vp))->f_iosize; obn = (np->n_size - 1) / biosize; - bn = vap->va_size / biosize; - for ( ; obn >= bn; obn--) - if (nfs_buf_incore(vp, obn)) { - bp = nfs_buf_get(vp, obn, biosize, 0, BLK_READ); - if (!bp) + bn = vap->va_data_size / biosize; + for ( ; obn >= bn; obn--) { + if (!nfs_buf_is_incore(vp, obn)) + continue; + error = nfs_buf_get(vp, obn, biosize, 0, NBLK_READ, &bp); + if (error) continue; - if (obn == bn) { - int neweofoff, mustwrite; - mustwrite = 0; - neweofoff = vap->va_size - NBOFF(bp); - /* check for any dirty data before the new EOF */ - if (bp->nb_dirtyend && bp->nb_dirtyoff < neweofoff) { + if (obn != bn) { + FSDBG(512, bp, bp->nb_flags, 0, obn); + SET(bp->nb_flags, NB_INVAL); + nfs_buf_release(bp, 1); + continue; + } + mustwrite = 0; + neweofoff = vap->va_data_size - NBOFF(bp); + /* check for any dirty data before the new EOF */ + if (bp->nb_dirtyend && bp->nb_dirtyoff < neweofoff) { /* clip dirty range to EOF */ if (bp->nb_dirtyend > neweofoff) - bp->nb_dirtyend = neweofoff; + bp->nb_dirtyend = neweofoff; mustwrite++; - } - bp->nb_dirty &= (1 << round_page_32(neweofoff)/PAGE_SIZE) - 1; - if (bp->nb_dirty) + } + bp->nb_dirty &= (1 << round_page_32(neweofoff)/PAGE_SIZE) - 1; + if (bp->nb_dirty) mustwrite++; - if (mustwrite) { - /* gotta write out dirty data before invalidating */ - /* (NB_STABLE indicates that data writes should be FILESYNC) */ - /* (NB_NOCACHE indicates buffer should be discarded) */ - CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC | NB_READ)); - SET(bp->nb_flags, NB_STABLE | NB_NOCACHE); + if (!mustwrite) { + FSDBG(512, bp, bp->nb_flags, 0, obn); + SET(bp->nb_flags, NB_INVAL); + nfs_buf_release(bp, 1); + continue; + } + /* gotta write out dirty data before invalidating */ + /* (NB_STABLE indicates that data writes should be FILESYNC) */ + /* (NB_NOCACHE indicates buffer should be discarded) */ + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC | NB_READ)); + SET(bp->nb_flags, NB_STABLE | NB_NOCACHE); + if (bp->nb_wcred == NOCRED) { + kauth_cred_ref(cred); + bp->nb_wcred = cred; + } + error = nfs_buf_write(bp); + // Note: bp has been released + if (error) { + FSDBG(512, bp, 0xd00dee, 0xbad, error); + np->n_error = error; + np->n_flag |= NWRITEERR; /* - * NFS has embedded ucred so crhold() risks zone corruption + * There was a write error and we need to + * invalidate attrs and flush buffers in + * order to sync up with the server. 
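In the shrink path above, buffers wholly past the new EOF are simply invalidated, but a buffer straddling the new EOF may still hold dirty data that must reach the server first: its dirty range is clipped to the new EOF and it is written FILESYNC (NB_STABLE) and discarded (NB_NOCACHE). The per-buffer decision, condensed into a hypothetical helper (the loop above additionally handles write credentials and write errors):

    /* Does this straddling buffer still need a write before discard? */
    static int
    nfs_shrink_buf_mustwrite(struct nfsbuf *bp, off_t new_size)
    {
        int neweofoff = new_size - NBOFF(bp);
        int mustwrite = 0;

        if (bp->nb_dirtyend && bp->nb_dirtyoff < neweofoff) {
            if (bp->nb_dirtyend > neweofoff)
                bp->nb_dirtyend = neweofoff;  /* clip dirty range to EOF */
            mustwrite++;
        }
        /* discard dirty-page bits for pages past the new EOF */
        bp->nb_dirty &= (1 << round_page_32(neweofoff) / PAGE_SIZE) - 1;
        if (bp->nb_dirty)
            mustwrite++;
        return (mustwrite);
    }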
+ * (if this write was extending the file, + * we may no longer know the correct size) */ - if (bp->nb_wcred == NOCRED) - bp->nb_wcred = crdup(ap->a_cred); - error = nfs_buf_write(bp); - // Note: bp has been released - if (error) { - FSDBG(512, bp, 0xd00dee, 0xbad, error); - np->n_error = error; - np->n_flag |= NWRITEERR; - error = 0; - } - bp = NULL; - } + NATTRINVALIDATE(np); + nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cred, p, 1); + error = 0; } - if (bp) { - FSDBG(512, bp, bp->nb_flags, 0, obn); - SET(bp->nb_flags, NB_INVAL); - nfs_buf_release(bp, 1); - } - } + } } tsize = np->n_size; - np->n_size = np->n_vattr.va_size = vap->va_size; - ubc_setsize(vp, (off_t)vap->va_size); /* XXX error? */ - }; - } else if ((vap->va_mtime.tv_sec != VNOVAL || - vap->va_atime.tv_sec != VNOVAL) && - (np->n_flag & NMODIFIED) && vp->v_type == VREG) { - error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1); + np->n_size = np->n_vattr.nva_size = vap->va_data_size; + ubc_setsize(vp, (off_t)vap->va_data_size); /* XXX error? */ + } + } else if ((VATTR_IS_ACTIVE(vap, va_modify_time) || + VATTR_IS_ACTIVE(vap, va_access_time)) && + (np->n_flag & NMODIFIED) && (vnode_vtype(vp) == VREG)) { + error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error == EINTR) return (error); } - error = nfs_setattrrpc(vp, vap, ap->a_cred, ap->a_p); - FSDBG_BOT(512, np->n_size, vap->va_size, np->n_vattr.va_size, error); - if (error && vap->va_size != VNOVAL) { + if (VATTR_IS_ACTIVE(vap, va_mode)) { + NMODEINVALIDATE(np); + } + error = nfs_setattrrpc(vp, vap, cred, p); + FSDBG_BOT(512, np->n_size, vap->va_data_size, np->n_vattr.nva_size, error); + if (error && VATTR_IS_ACTIVE(vap, va_data_size)) { /* make every effort to resync file size w/ server... */ - int err = 0; /* preserve "error" for return */ + int err; /* preserve "error" for return */ - printf("nfs_setattr: nfs_setattrrpc %d\n", error); - np->n_size = np->n_vattr.va_size = tsize; + np->n_size = np->n_vattr.nva_size = tsize; ubc_setsize(vp, (off_t)np->n_size); /* XXX check error */ - vap->va_size = tsize; - err = nfs_setattrrpc(vp, vap, ap->a_cred, ap->a_p); - if (err) - printf("nfs_setattr1: nfs_setattrrpc %d\n", err); + vap->va_data_size = tsize; + err = nfs_setattrrpc(vp, vap, cred, p); + printf("nfs_setattr: nfs_setattrrpc %d %d\n", error, err); } return (error); } @@ -1106,10 +1199,10 @@ nfs_setattr(ap) */ static int nfs_setattrrpc(vp, vap, cred, procp) - register struct vnode *vp; - register struct vattr *vap; - struct ucred *cred; - struct proc *procp; + vnode_t vp; + struct vnode_attr *vap; + kauth_cred_t cred; + proc_t procp; { register struct nfsv2_sattr *sp; register caddr_t cp; @@ -1117,20 +1210,22 @@ nfs_setattrrpc(vp, vap, cred, procp) caddr_t bpos, dpos, cp2; u_long *tl; int error = 0, wccpostattr = 0; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; + mbuf_t mreq, mrep, md, mb, mb2; int v3; u_int64_t xid; struct timeval now; - if (!VFSTONFS(vp->v_mount)) + if (!VFSTONFS(vnode_mount(vp))) return (ENXIO); v3 = NFS_ISV3(vp); - nfsstats.rpccnt[NFSPROC_SETATTR]++; - nfsm_reqhead(vp, NFSPROC_SETATTR, NFSX_FH(v3) + NFSX_SATTR(v3)); + nfsm_reqhead(NFSX_FH(v3) + NFSX_SATTR(v3)); + if (error) + return (error); + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_SETATTR]); nfsm_fhtom(vp, v3); if (v3) { - if (vap->va_mode != (u_short)VNOVAL) { + if (VATTR_IS_ACTIVE(vap, va_mode)) { nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); *tl++ = nfs_true; *tl = txdr_unsigned(vap->va_mode); @@ -1138,7 +1233,7 @@ nfs_setattrrpc(vp, vap, cred, procp) nfsm_build(tl, u_long *, 
NFSX_UNSIGNED); *tl = nfs_false; } - if (vap->va_uid != (uid_t)VNOVAL) { + if (VATTR_IS_ACTIVE(vap, va_uid)) { nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); *tl++ = nfs_true; *tl = txdr_unsigned(vap->va_uid); @@ -1146,7 +1241,7 @@ nfs_setattrrpc(vp, vap, cred, procp) nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } - if (vap->va_gid != (gid_t)VNOVAL) { + if (VATTR_IS_ACTIVE(vap, va_gid)) { nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); *tl++ = nfs_true; *tl = txdr_unsigned(vap->va_gid); @@ -1154,20 +1249,20 @@ nfs_setattrrpc(vp, vap, cred, procp) nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } - if (vap->va_size != VNOVAL) { + if (VATTR_IS_ACTIVE(vap, va_data_size)) { nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED); *tl++ = nfs_true; - txdr_hyper(&vap->va_size, tl); + txdr_hyper(&vap->va_data_size, tl); } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } microtime(&now); - if (vap->va_atime.tv_sec != VNOVAL) { - if (vap->va_atime.tv_sec != now.tv_sec) { + if (VATTR_IS_ACTIVE(vap, va_access_time)) { + if (vap->va_access_time.tv_sec != now.tv_sec) { nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT); - txdr_nfsv3time(&vap->va_atime, tl); + txdr_nfsv3time(&vap->va_access_time, tl); } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV3SATTRTIME_TOSERVER); @@ -1176,11 +1271,11 @@ nfs_setattrrpc(vp, vap, cred, procp) nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV3SATTRTIME_DONTCHANGE); } - if (vap->va_mtime.tv_sec != VNOVAL) { - if (vap->va_mtime.tv_sec != now.tv_sec) { + if (VATTR_IS_ACTIVE(vap, va_modify_time)) { + if (vap->va_modify_time.tv_sec != now.tv_sec) { nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT); - txdr_nfsv3time(&vap->va_mtime, tl); + txdr_nfsv3time(&vap->va_modify_time, tl); } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV3SATTRTIME_TOSERVER); @@ -1192,42 +1287,55 @@ nfs_setattrrpc(vp, vap, cred, procp) nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } else { + struct timespec neg1time = { -1, -1 }; nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); - if (vap->va_mode == (u_short)VNOVAL) - sp->sa_mode = VNOVAL; - else - sp->sa_mode = vtonfsv2_mode(vp->v_type, vap->va_mode); - if (vap->va_uid == (uid_t)VNOVAL) - sp->sa_uid = VNOVAL; + if (VATTR_IS_ACTIVE(vap, va_mode)) + sp->sa_mode = vtonfsv2_mode(vnode_vtype(vp), vap->va_mode); else + sp->sa_mode = nfs_xdrneg1; + if (VATTR_IS_ACTIVE(vap, va_uid)) sp->sa_uid = txdr_unsigned(vap->va_uid); - if (vap->va_gid == (gid_t)VNOVAL) - sp->sa_gid = VNOVAL; else + sp->sa_uid = nfs_xdrneg1; + if (VATTR_IS_ACTIVE(vap, va_gid)) sp->sa_gid = txdr_unsigned(vap->va_gid); - sp->sa_size = txdr_unsigned(vap->va_size); - txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); - txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); + else + sp->sa_gid = nfs_xdrneg1; + if (VATTR_IS_ACTIVE(vap, va_data_size)) + sp->sa_size = txdr_unsigned(vap->va_data_size); + else + sp->sa_size = nfs_xdrneg1; + if (VATTR_IS_ACTIVE(vap, va_access_time)) { + txdr_nfsv2time(&vap->va_access_time, &sp->sa_atime); + } else { + txdr_nfsv2time(&neg1time, &sp->sa_atime); + } + if (VATTR_IS_ACTIVE(vap, va_modify_time)) { + txdr_nfsv2time(&vap->va_modify_time, &sp->sa_mtime); + } else { + txdr_nfsv2time(&neg1time, &sp->sa_mtime); + } } nfsm_request(vp, NFSPROC_SETATTR, procp, cred, &xid); if (v3) { - time_t premtime = 0; + struct timespec premtime = { 0, 0 }; if (mrep) { - nfsm_wcc_data(vp, premtime, 
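The v2/v3 split above reflects two different wire encodings for "optional" attributes. NFSv3's sattr3 prefixes each field with a boolean discriminant (nfs_true/nfs_false), so an unset field costs one word; NFSv2's sattr always carries every field and uses all-ones (the XDR encoding of -1, kept pre-encoded in nfs_xdrneg1) to mean "don't change", which is why the v2 arm fills inactive fields with nfs_xdrneg1 and { -1, -1 } timespecs. Side by side (buffer-sizing nfsm_build calls omitted):

    /* NFSv3 sattr3: boolean discriminant, then the value if present */
    if (VATTR_IS_ACTIVE(vap, va_mode)) {
        *tl++ = nfs_true;                    /* "mode follows" */
        *tl = txdr_unsigned(vap->va_mode);
    } else {
        *tl = nfs_false;                     /* "no mode" */
    }

    /* NFSv2 sattr: field always on the wire; -1 means "leave unchanged" */
    sp->sa_uid = VATTR_IS_ACTIVE(vap, va_uid)
        ? txdr_unsigned(vap->va_uid) : nfs_xdrneg1;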
wccpostattr, &xid); + nfsm_wcc_data(vp, &premtime, wccpostattr, &xid); } /* if file hadn't changed, update cached mtime */ - if (VTONFS(vp)->n_mtime == premtime) { - VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime.tv_sec; + if (nfstimespeccmp(&VTONFS(vp)->n_mtime, &premtime, ==)) { + VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.nva_mtime; } /* if directory hadn't changed, update namecache mtime */ - if ((vp->v_type == VDIR) && (VTONFS(vp)->n_ncmtime == premtime)) { - VTONFS(vp)->n_ncmtime = VTONFS(vp)->n_vattr.va_mtime.tv_sec; + if ((vnode_vtype(vp) == VDIR) && + nfstimespeccmp(&VTONFS(vp)->n_ncmtime, &premtime, ==)) { + VTONFS(vp)->n_ncmtime = VTONFS(vp)->n_vattr.nva_mtime; } if (!wccpostattr) - VTONFS(vp)->n_xid = 0; + NATTRINVALIDATE(VTONFS(vp)); } else { if (mrep) { - nfsm_loadattr(vp, (struct vattr *)0, &xid); + nfsm_loadattr(vp, v3, NULL, &xid); } } nfsm_reqdone; @@ -1241,258 +1349,234 @@ nfs_setattrrpc(vp, vap, cred, procp) */ static int nfs_lookup(ap) - struct vop_lookup_args /* { + struct vnop_lookup_args /* { struct vnodeop_desc *a_desc; - struct vnode *a_dvp; - struct vnode **a_vpp; + vnode_t a_dvp; + vnode_t *a_vpp; struct componentname *a_cnp; + vfs_context_t a_context; } */ *ap; { - register struct componentname *cnp = ap->a_cnp; - register struct vnode *dvp = ap->a_dvp; - register struct vnode **vpp = ap->a_vpp; - register int flags = cnp->cn_flags; - register struct vnode *newvp; - register u_long *tl; - register caddr_t cp; - register long t1, t2; + struct componentname *cnp = ap->a_cnp; + vnode_t dvp = ap->a_dvp; + vnode_t *vpp = ap->a_vpp; + int flags = cnp->cn_flags; + vnode_t newvp; + u_long *tl; + caddr_t cp; + long t1, t2; caddr_t bpos, dpos, cp2; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; + mbuf_t mreq, mrep, md, mb, mb2; long len; - nfsfh_t *fhp; - struct nfsnode *np; - int lockparent, wantparent, error = 0, attrflag, fhsize; + u_char *fhp; + struct nfsnode *dnp, *np; + int wantparent, error, attrflag, dattrflag, fhsize, fhisdvp; int v3 = NFS_ISV3(dvp); - struct proc *p = cnp->cn_proc; - int unlockdvp = 0; - u_int64_t xid; - struct vattr vattr; + u_int64_t xid, dxid; + struct nfs_vattr nvattr; + kauth_cred_t cred; + proc_t p; + int ngflags; - if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && - (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) - return (EROFS); *vpp = NULLVP; - if (dvp->v_type != VDIR) - return (ENOTDIR); - lockparent = flags & LOCKPARENT; + cred = vfs_context_ucred(ap->a_context); + p = vfs_context_proc(ap->a_context); + wantparent = flags & (LOCKPARENT|WANTPARENT); - np = VTONFS(dvp); + dnp = VTONFS(dvp); - if (!VOP_GETATTR(dvp, &vattr, cnp->cn_cred, p) && - (np->n_ncmtime != vattr.va_mtime.tv_sec)) { + error = nfs_getattr(dvp, &nvattr, cred, p); + if (error) + goto error_return; + if (nfstimespeccmp(&dnp->n_ncmtime, &nvattr.nva_mtime, !=)) { /* * This directory has changed on us. * Purge any name cache entries. */ cache_purge(dvp); - np->n_ncmtime = vattr.va_mtime.tv_sec; + dnp->n_ncmtime = nvattr.nva_mtime; } - if ((error = cache_lookup(dvp, vpp, cnp)) && error != ENOENT) { - int vpid; - - newvp = *vpp; - vpid = newvp->v_id; - - /* - * See the comment starting `Step through' in ufs/ufs_lookup.c - * for an explanation of the locking protocol - */ - - /* - * Note: we need to make sure to get a lock/ref on newvp - * before we possibly go off to the server in VOP_ACCESS. 
- */ - if (dvp == newvp) { - VREF(newvp); - error = 0; - } else if (flags & ISDOTDOT) { - VOP_UNLOCK(dvp, 0, p); - error = vget(newvp, LK_EXCLUSIVE, p); - if (!error) - error = vn_lock(dvp, LK_EXCLUSIVE, p); - } else { - error = vget(newvp, LK_EXCLUSIVE, p); - if (error) - VOP_UNLOCK(dvp, 0, p); - } - - if (error) - goto cache_lookup_out; - - if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, p))) { - if (dvp == newvp) - vrele(newvp); - else - vput(newvp); - *vpp = NULLVP; - goto error_return; - } - - if ((dvp != newvp) && (!lockparent || !(flags & ISLASTCN))) - VOP_UNLOCK(dvp, 0, p); + error = cache_lookup(dvp, vpp, cnp); + switch (error) { + case ENOENT: + /* negative cache entry same as cache miss */ + error = 0; + /* FALLTHROUGH */ + case 0: + /* cache miss */ + break; + case -1: + /* cache hit, not really an error */ + { + struct vnop_access_args naa; + + OSAddAtomic(1, (SInt32*)&nfsstats.lookupcache_hits); + + /* check for directory access */ + naa.a_vp = dvp; + naa.a_action = KAUTH_VNODE_SEARCH; + naa.a_context = ap->a_context; + + /* compute actual success/failure based on accessibility */ + error = nfs_access(&naa); + } + /* FALLTHROUGH */ + default: + /* unexpected error from cache_lookup */ + goto error_return; + } + + /* check for lookup of "." */ + if ((cnp->cn_nameptr[0] == '.') && (cnp->cn_namelen == 1)) { + /* skip lookup, we know who we are */ + fhisdvp = 1; + fhp = NULL; + fhsize = 0; + mrep = NULL; + goto found; + } - if (vpid == newvp->v_id) { - nfsstats.lookupcache_hits++; - if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) - cnp->cn_flags |= SAVENAME; - error = 0; /* ignore any from VOP_GETATTR */ + /* do we know this name is too long? */ + if (v3) { + /* For NFSv3: need uniform pathconf info to test pc_namemax */ + struct nfsmount *nmp = VFSTONFS(vnode_mount(dvp)); + if (!nmp) { + error = ENXIO; goto error_return; } - vput(newvp); - if ((dvp != newvp) && lockparent && (flags & ISLASTCN)) - VOP_UNLOCK(dvp, 0, p); -cache_lookup_out: - error = vn_lock(dvp, LK_EXCLUSIVE, p); - *vpp = NULLVP; - if (error) + if (((nmp->nm_state & (NFSSTA_GOTFSINFO|NFSSTA_GOTPATHCONF)) == + (NFSSTA_GOTFSINFO|NFSSTA_GOTPATHCONF)) && + (nmp->nm_fsinfo.fsproperties & NFSV3FSINFO_HOMOGENEOUS) && + (cnp->cn_namelen > (long)nmp->nm_fsinfo.namemax)) { + error = ENAMETOOLONG; goto error_return; + } + } else if (cnp->cn_namelen > NFS_MAXNAMLEN) { + error = ENAMETOOLONG; + goto error_return; } error = 0; newvp = NULLVP; - nfsstats.lookupcache_misses++; - nfsstats.rpccnt[NFSPROC_LOOKUP]++; + + OSAddAtomic(1, (SInt32*)&nfsstats.lookupcache_misses); len = cnp->cn_namelen; - nfsm_reqhead(dvp, NFSPROC_LOOKUP, - NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len)); + nfsm_reqhead(NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len)); + if (error) + goto error_return; + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_LOOKUP]); nfsm_fhtom(dvp, v3); - nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN); + nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN, v3); /* nfsm_request for NFSv2 causes you to goto to nfsmout upon errors */ - nfsm_request(dvp, NFSPROC_LOOKUP, cnp->cn_proc, cnp->cn_cred, &xid); + nfsm_request(dvp, NFSPROC_LOOKUP, p, cred, &xid); if (error) { if (mrep) { - nfsm_postop_attr(dvp, attrflag, &xid); - m_freem(mrep); + nfsm_postop_attr_update(dvp, v3, dattrflag, &xid); + mbuf_freem(mrep); } goto nfsmout; } + + /* get the filehandle */ nfsm_getfh(fhp, fhsize, v3); + /* is the file handle the same as this directory's file handle? 
*/ + fhisdvp = NFS_CMPFH(dnp, fhp, fhsize); + + /* get attributes */ + if (v3) { + dxid = xid; + nfsm_postop_attr_get(v3, attrflag, &nvattr); + nfsm_postop_attr_update(dvp, v3, dattrflag, &dxid); + if (!attrflag && (!fhisdvp || !dattrflag)) { + /* We need valid attributes in order */ + /* to call nfs_nget/vnode_create(). */ + error = nfs_getattr_no_vnode(vnode_mount(dvp), + fhp, fhsize, cred, p, &nvattr, &xid); + if (error) { + mbuf_freem(mrep); + goto error_return; + } + } + } else { + nfsm_attr_get(v3, &nvattr); + } + +found: /* * Handle RENAME case... */ if (cnp->cn_nameiop == RENAME && wantparent && (flags & ISLASTCN)) { - if (NFS_CMPFH(np, fhp, fhsize)) { - m_freem(mrep); + if (fhisdvp) { + mbuf_freem(mrep); error = EISDIR; goto error_return; } - if ((error = nfs_nget(dvp->v_mount, fhp, fhsize, &np))) { - m_freem(mrep); + error = nfs_nget(vnode_mount(dvp), dvp, cnp, fhp, fhsize, + &nvattr, &xid, 0, &np); + if (error) { + mbuf_freem(mrep); goto error_return; } - newvp = NFSTOV(np); - if (v3) { - u_int64_t dxid = xid; + *vpp = NFSTOV(np); + mbuf_freem(mrep); - nfsm_postop_attr(newvp, attrflag, &xid); - nfsm_postop_attr(dvp, attrflag, &dxid); - if (np->n_xid == 0) { - /* - * VFS currently requires that we have valid - * attributes when returning success. - */ - error = VOP_GETATTR(newvp, &vattr, cnp->cn_cred, p); - if (error) { - m_freem(mrep); - vput(newvp); - goto error_return; - } - } - } else - nfsm_loadattr(newvp, (struct vattr *)0, &xid); - *vpp = newvp; - m_freem(mrep); - cnp->cn_flags |= SAVENAME; - if (!lockparent) - VOP_UNLOCK(dvp, 0, p); - error = 0; goto error_return; } - if (NFS_CMPFH(np, fhp, fhsize)) { - VREF(dvp); - newvp = dvp; - } else if (flags & ISDOTDOT) { - VOP_UNLOCK(dvp, 0, p); - error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); + if ((cnp->cn_flags & MAKEENTRY) && + (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN))) + ngflags = NG_MAKEENTRY; + else + ngflags = 0; + + if (fhisdvp) { + error = vnode_get(dvp); if (error) { - m_freem(mrep); - vn_lock(dvp, LK_EXCLUSIVE + LK_RETRY, p); + mbuf_freem(mrep); goto error_return; } - newvp = NFSTOV(np); - if (!lockparent || !(flags & ISLASTCN)) - unlockdvp = 1; /* keep dvp locked until after postops */ - if (error = vn_lock(dvp, LK_EXCLUSIVE, p)) { - m_freem(mrep); - vput(newvp); - goto error_return; + newvp = dvp; + /* test fhp to see if we have valid attributes in nvattr */ + if (fhp && (dnp->n_xid <= xid)) { + error = nfs_loadattrcache(dnp, &nvattr, &xid, 0); + if (error) { + vnode_put(dvp); + mbuf_freem(mrep); + goto error_return; + } } } else { - if ((error = nfs_nget(dvp->v_mount, fhp, fhsize, &np))) { - m_freem(mrep); + error = nfs_nget(vnode_mount(dvp), dvp, cnp, fhp, fhsize, + &nvattr, &xid, ngflags, &np); + if (error) { + mbuf_freem(mrep); goto error_return; } - if (!lockparent || !(flags & ISLASTCN)) - unlockdvp = 1; /* keep dvp locked until after postops */ newvp = NFSTOV(np); } - if (v3) { - u_int64_t dxid = xid; - - nfsm_postop_attr(newvp, attrflag, &xid); - nfsm_postop_attr(dvp, attrflag, &dxid); - if (np->n_xid == 0) { - /* - * VFS currently requires that we have valid - * attributes when returning success. 
- */ - error = VOP_GETATTR(newvp, &vattr, cnp->cn_cred, p); - if (error) { - if (unlockdvp) - VOP_UNLOCK(dvp, 0, p); - m_freem(mrep); - vput(newvp); - goto error_return; - } - } - } else - nfsm_loadattr(newvp, (struct vattr *)0, &xid); - if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) - cnp->cn_flags |= SAVENAME; - if ((cnp->cn_flags & MAKEENTRY) && - (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN))) { - cache_enter(dvp, newvp, cnp); - } *vpp = newvp; +// if (error == 0 && *vpp != NULL && *vpp != dvp) +// nfs_unlock(VTONFS(*vpp)); + nfsm_reqdone; - if (unlockdvp) - VOP_UNLOCK(dvp, 0, p); if (error) { - if (newvp != NULLVP) { - if (newvp == dvp) - vrele(newvp); - else - vput(newvp); - *vpp = NULLVP; - } if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) && (flags & ISLASTCN) && error == ENOENT) { - if (dvp->v_mount && (dvp->v_mount->mnt_flag & MNT_RDONLY)) + if (vnode_mount(dvp) && vnode_vfsisrdonly(dvp)) error = EROFS; else error = EJUSTRETURN; - if (!lockparent) - VOP_UNLOCK(dvp, 0, p); } - if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) - cnp->cn_flags |= SAVENAME; } error_return: + if (error && *vpp) { + vnode_put(*vpp); + *vpp = NULLVP; + } return (error); } @@ -1502,18 +1586,19 @@ error_return: */ static int nfs_read(ap) - struct vop_read_args /* { - struct vnode *a_vp; + struct vnop_read_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; + int a_ioflag; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp = ap->a_vp; - - if (vp->v_type != VREG) + if (vnode_vtype(ap->a_vp) != VREG) return (EPERM); - return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred, 0)); + return (nfs_bioread(ap->a_vp, ap->a_uio, ap->a_ioflag, + vfs_context_ucred(ap->a_context), + vfs_context_proc(ap->a_context))); } @@ -1522,17 +1607,18 @@ nfs_read(ap) */ static int nfs_readlink(ap) - struct vop_readlink_args /* { - struct vnode *a_vp; + struct vnop_readlink_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; struct uio *a_uio; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp = ap->a_vp; - - if (vp->v_type != VLNK) + if (vnode_vtype(ap->a_vp) != VLNK) return (EPERM); - return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred, 0)); + return (nfs_bioread(ap->a_vp, ap->a_uio, 0, + vfs_context_ucred(ap->a_context), + vfs_context_proc(ap->a_context))); } /* @@ -1540,33 +1626,36 @@ nfs_readlink(ap) * Called by nfs_doio() from below the buffer cache. 
*/ int -nfs_readlinkrpc(vp, uiop, cred) - register struct vnode *vp; - struct uio *uiop; - struct ucred *cred; +nfs_readlinkrpc( + vnode_t vp, + struct uio *uiop, + kauth_cred_t cred, + proc_t p) { register u_long *tl; register caddr_t cp; register long t1, t2; caddr_t bpos, dpos, cp2; int error = 0, len, attrflag; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; + mbuf_t mreq, mrep, md, mb, mb2; int v3; u_int64_t xid; - if (!VFSTONFS(vp->v_mount)) + if (!VFSTONFS(vnode_mount(vp))) return (ENXIO); v3 = NFS_ISV3(vp); - nfsstats.rpccnt[NFSPROC_READLINK]++; - nfsm_reqhead(vp, NFSPROC_READLINK, NFSX_FH(v3)); + nfsm_reqhead(NFSX_FH(v3)); + if (error) + return (error); + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_READLINK]); nfsm_fhtom(vp, v3); - nfsm_request(vp, NFSPROC_READLINK, uiop->uio_procp, cred, &xid); + nfsm_request(vp, NFSPROC_READLINK, p, cred, &xid); if (v3 && mrep) - nfsm_postop_attr(vp, attrflag, &xid); + nfsm_postop_attr_update(vp, v3, attrflag, &xid); if (!error) { - nfsm_strsiz(len, NFS_MAXPATHLEN); - if (len == NFS_MAXPATHLEN) { + nfsm_strsiz(len, NFS_MAXPATHLEN, v3); + if (len >= NFS_MAXPATHLEN) { struct nfsnode *np = VTONFS(vp); #if DIAGNOSTIC if (!np) @@ -1586,37 +1675,41 @@ nfs_readlinkrpc(vp, uiop, cred) * Ditto above */ int -nfs_readrpc(vp, uiop, cred) - register struct vnode *vp; - struct uio *uiop; - struct ucred *cred; +nfs_readrpc( + vnode_t vp, + struct uio *uiop, + kauth_cred_t cred, + proc_t p) { register u_long *tl; register caddr_t cp; register long t1, t2; caddr_t bpos, dpos, cp2; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; + mbuf_t mreq, mrep, md, mb, mb2; struct nfsmount *nmp; int error = 0, len, retlen, tsiz, eof = 0, attrflag; int v3, nmrsize; u_int64_t xid; - FSDBG_TOP(536, vp, uiop->uio_offset, uiop->uio_resid, 0); - nmp = VFSTONFS(vp->v_mount); + FSDBG_TOP(536, vp, uiop->uio_offset, uio_uio_resid(uiop), 0); + nmp = VFSTONFS(vnode_mount(vp)); if (!nmp) return (ENXIO); v3 = NFS_ISV3(vp); nmrsize = nmp->nm_rsize; - tsiz = uiop->uio_resid; + // LP64todo - fix this + tsiz = uio_uio_resid(uiop); if (((u_int64_t)uiop->uio_offset + (unsigned int)tsiz > 0xffffffff) && !v3) { - FSDBG_BOT(536, vp, uiop->uio_offset, uiop->uio_resid, EFBIG); + FSDBG_BOT(536, vp, uiop->uio_offset, uio_uio_resid(uiop), EFBIG); return (EFBIG); } while (tsiz > 0) { - nfsstats.rpccnt[NFSPROC_READ]++; len = (tsiz > nmrsize) ? 
nmrsize : tsiz; - nfsm_reqhead(vp, NFSPROC_READ, NFSX_FH(v3) + NFSX_UNSIGNED * 3); + nfsm_reqhead(NFSX_FH(v3) + NFSX_UNSIGNED * 3); + if (error) + break; + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_READ]); nfsm_fhtom(vp, v3); nfsm_build(tl, u_long *, NFSX_UNSIGNED * 3); if (v3) { @@ -1628,26 +1721,26 @@ nfs_readrpc(vp, uiop, cred) *tl = 0; } FSDBG(536, vp, uiop->uio_offset, len, 0); - nfsm_request(vp, NFSPROC_READ, uiop->uio_procp, cred, &xid); + nfsm_request(vp, NFSPROC_READ, p, cred, &xid); if (v3) { if (mrep) { - nfsm_postop_attr(vp, attrflag, &xid); + nfsm_postop_attr_update(vp, v3, attrflag, &xid); } if (error) { - m_freem(mrep); + mbuf_freem(mrep); goto nfsmout; } nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); eof = fxdr_unsigned(int, *(tl + 1)); } else { if (mrep) { - nfsm_loadattr(vp, (struct vattr *)0, &xid); + nfsm_loadattr(vp, v3, NULL, &xid); } } if (mrep) { - nfsm_strsiz(retlen, nmrsize); + nfsm_strsiz(retlen, nmrsize, 0); nfsm_mtouio(uiop, retlen); - m_freem(mrep); + mbuf_freem(mrep); } else { retlen = 0; } @@ -1659,7 +1752,7 @@ nfs_readrpc(vp, uiop, cred) tsiz = 0; } nfsmout: - FSDBG_BOT(536, vp, eof, uiop->uio_resid, error); + FSDBG_BOT(536, vp, eof, uio_uio_resid(uiop), error); return (error); } @@ -1667,47 +1760,52 @@ nfsmout: * nfs write call */ int -nfs_writerpc(vp, uiop, cred, iomode, must_commit) - register struct vnode *vp; - register struct uio *uiop; - struct ucred *cred; - int *iomode, *must_commit; +nfs_writerpc( + vnode_t vp, + struct uio *uiop, + kauth_cred_t cred, + proc_t p, + int *iomode, + int *must_commit) { register u_long *tl; register caddr_t cp; register int t1, t2, backup; caddr_t bpos, dpos, cp2; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; + mbuf_t mreq, mrep, md, mb, mb2; struct nfsmount *nmp; int error = 0, len, tsiz, updatemtime = 0, wccpostattr = 0, rlen, commit; int v3, committed = NFSV3WRITE_FILESYNC; u_int64_t xid; + mount_t mp; #if DIAGNOSTIC if (uiop->uio_iovcnt != 1) panic("nfs_writerpc: iovcnt > 1"); #endif - FSDBG_TOP(537, vp, uiop->uio_offset, uiop->uio_resid, *iomode); - nmp = VFSTONFS(vp->v_mount); + FSDBG_TOP(537, vp, uiop->uio_offset, uio_uio_resid(uiop), *iomode); + nmp = VFSTONFS(vnode_mount(vp)); if (!nmp) return (ENXIO); v3 = NFS_ISV3(vp); *must_commit = 0; - tsiz = uiop->uio_resid; + // LP64todo - fix this + tsiz = uio_uio_resid(uiop); if (((u_int64_t)uiop->uio_offset + (unsigned int)tsiz > 0xffffffff) && !v3) { - FSDBG_BOT(537, vp, uiop->uio_offset, uiop->uio_resid, EFBIG); + FSDBG_BOT(537, vp, uiop->uio_offset, uio_uio_resid(uiop), EFBIG); return (EFBIG); } while (tsiz > 0) { - nmp = VFSTONFS(vp->v_mount); + nmp = VFSTONFS(vnode_mount(vp)); if (!nmp) { error = ENXIO; break; } - nfsstats.rpccnt[NFSPROC_WRITE]++; len = (tsiz > nmp->nm_wsize) ? 
nmp->nm_wsize : tsiz; - nfsm_reqhead(vp, NFSPROC_WRITE, - NFSX_FH(v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len)); + nfsm_reqhead(NFSX_FH(v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len)); + if (error) + break; + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_WRITE]); nfsm_fhtom(vp, v3); if (v3) { nfsm_build(tl, u_long *, 5 * NFSX_UNSIGNED); @@ -1723,15 +1821,15 @@ nfs_writerpc(vp, uiop, cred, iomode, must_commit) *tl = txdr_unsigned(len); FSDBG(537, vp, uiop->uio_offset, len, 0); nfsm_uiotom(uiop, len); - nfsm_request(vp, NFSPROC_WRITE, uiop->uio_procp, cred, &xid); - nmp = VFSTONFS(vp->v_mount); + nfsm_request(vp, NFSPROC_WRITE, p, cred, &xid); + nmp = VFSTONFS(vnode_mount(vp)); if (!nmp) error = ENXIO; if (v3) { if (mrep) { - time_t premtime; - nfsm_wcc_data(vp, premtime, wccpostattr, &xid); - if (VTONFS(vp)->n_mtime == premtime) + struct timespec premtime; + nfsm_wcc_data(vp, &premtime, wccpostattr, &xid); + if (nfstimespeccmp(&VTONFS(vp)->n_mtime, &premtime, ==)) updatemtime = 1; } if (!error) { @@ -1743,10 +1841,10 @@ nfs_writerpc(vp, uiop, cred, iomode, must_commit) break; } else if (rlen < len) { backup = len - rlen; - uiop->uio_iov->iov_base -= backup; - uiop->uio_iov->iov_len += backup; + uio_iov_base_add(uiop, -backup); + uio_iov_len_add(uiop, backup); uiop->uio_offset -= backup; - uiop->uio_resid += backup; + uio_uio_resid_add(uiop, backup); len = rlen; } commit = fxdr_unsigned(int, *tl++); @@ -1773,13 +1871,13 @@ nfs_writerpc(vp, uiop, cred, iomode, must_commit) } } else { if (mrep) { - nfsm_loadattr(vp, (struct vattr *)0, &xid); + nfsm_loadattr(vp, v3, NULL, &xid); } } if (updatemtime) - VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime.tv_sec; - m_freem(mrep); + VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.nva_mtime; + mbuf_freem(mrep); /* * we seem to have a case where we end up looping on shutdown * and taking down nfs servers. For V3, error cases, there is @@ -1792,12 +1890,12 @@ nfs_writerpc(vp, uiop, cred, iomode, must_commit) tsiz -= len; } nfsmout: - if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_ASYNC)) + if ((mp = vnode_mount(vp)) && (vfs_flags(mp) & MNT_ASYNC)) committed = NFSV3WRITE_FILESYNC; *iomode = committed; if (error) - uiop->uio_resid = tsiz; - FSDBG_BOT(537, vp, committed, uiop->uio_resid, error); + uio_uio_resid_set(uiop, tsiz); + FSDBG_BOT(537, vp, committed, uio_uio_resid(uiop), error); return (error); } @@ -1807,100 +1905,121 @@ nfsmout: * mode set to specify the file type and the size field for rdev. 
*/ static int -nfs_mknodrpc(dvp, vpp, cnp, vap) - register struct vnode *dvp; - register struct vnode **vpp; - register struct componentname *cnp; - register struct vattr *vap; +nfs_mknodrpc( + vnode_t dvp, + vnode_t *vpp, + struct componentname *cnp, + struct vnode_attr *vap, + kauth_cred_t cred, + proc_t p) { register struct nfsv2_sattr *sp; - register struct nfsv3_sattr *sp3; register u_long *tl; register caddr_t cp; register long t1, t2; - struct vnode *newvp = (struct vnode *)0; + vnode_t newvp = (vnode_t)0; struct nfsnode *np = (struct nfsnode *)0; - struct vattr vattr; + struct nfs_vattr nvattr; char *cp2; caddr_t bpos, dpos; int error = 0, wccpostattr = 0, gotvp = 0; - time_t premtime = 0; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct timespec premtime = { 0, 0 }; + mbuf_t mreq, mrep, md, mb, mb2; u_long rdev; u_int64_t xid; int v3 = NFS_ISV3(dvp); + int gotuid, gotgid; - if (vap->va_type == VCHR || vap->va_type == VBLK) + if (!VATTR_IS_ACTIVE(vap, va_type)) + return (EINVAL); + if (vap->va_type == VCHR || vap->va_type == VBLK) { + if (!VATTR_IS_ACTIVE(vap, va_rdev)) + return (EINVAL); rdev = txdr_unsigned(vap->va_rdev); - else if (vap->va_type == VFIFO || vap->va_type == VSOCK) + } else if (vap->va_type == VFIFO || vap->va_type == VSOCK) rdev = 0xffffffff; else { - VOP_ABORTOP(dvp, cnp); - vput(dvp); - return (EOPNOTSUPP); + return (ENOTSUP); } - if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc))) { - VOP_ABORTOP(dvp, cnp); - vput(dvp); + nfsm_reqhead(NFSX_FH(v3) + 4 * NFSX_UNSIGNED + + nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3)); + if (error) return (error); - } - nfsstats.rpccnt[NFSPROC_MKNOD]++; - nfsm_reqhead(dvp, NFSPROC_MKNOD, NFSX_FH(v3) + 4 * NFSX_UNSIGNED + - + nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3)); + + VATTR_SET_SUPPORTED(vap, va_mode); + VATTR_SET_SUPPORTED(vap, va_uid); + VATTR_SET_SUPPORTED(vap, va_gid); + VATTR_SET_SUPPORTED(vap, va_data_size); + VATTR_SET_SUPPORTED(vap, va_access_time); + VATTR_SET_SUPPORTED(vap, va_modify_time); + gotuid = VATTR_IS_ACTIVE(vap, va_uid); + gotgid = VATTR_IS_ACTIVE(vap, va_gid); + + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_MKNOD]); nfsm_fhtom(dvp, v3); - nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN, v3); if (v3) { - nfsm_build(tl, u_long *, NFSX_UNSIGNED + NFSX_V3SRVSATTR); + nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl++ = vtonfsv3_type(vap->va_type); - sp3 = (struct nfsv3_sattr *)tl; - nfsm_v3sattr(sp3, vap, cnp->cn_cred->cr_uid, vattr.va_gid); + nfsm_v3sattr(vap); if (vap->va_type == VCHR || vap->va_type == VBLK) { nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(major(vap->va_rdev)); *tl = txdr_unsigned(minor(vap->va_rdev)); } } else { + struct timespec neg1time = { -1, -1 }; nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); - sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); - sp->sa_uid = txdr_unsigned(cnp->cn_cred->cr_uid); - sp->sa_gid = txdr_unsigned(vattr.va_gid); + sp->sa_mode = vtonfsv2_mode(vap->va_type, + (VATTR_IS_ACTIVE(vap, va_mode) ? vap->va_mode : 0600)); + sp->sa_uid = gotuid ? (u_long)txdr_unsigned(vap->va_uid) : nfs_xdrneg1; + sp->sa_gid = gotgid ? 
(u_long)txdr_unsigned(vap->va_gid) : nfs_xdrneg1; sp->sa_size = rdev; - txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); - txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); - } - nfsm_request(dvp, NFSPROC_MKNOD, cnp->cn_proc, cnp->cn_cred, &xid); - if (!error) { - nfsm_mtofh(dvp, newvp, v3, gotvp, &xid); - if (!gotvp) { - if (newvp) { - vput(newvp); - newvp = (struct vnode *)0; - } + if (VATTR_IS_ACTIVE(vap, va_access_time)) { + txdr_nfsv2time(&vap->va_access_time, &sp->sa_atime); + } else { + txdr_nfsv2time(&neg1time, &sp->sa_atime); + } + if (VATTR_IS_ACTIVE(vap, va_modify_time)) { + txdr_nfsv2time(&vap->va_modify_time, &sp->sa_mtime); + } else { + txdr_nfsv2time(&neg1time, &sp->sa_mtime); + } + } + nfsm_request(dvp, NFSPROC_MKNOD, p, cred, &xid); + /* XXX no EEXIST kludge here? */ + if (!error) { + nfsm_mtofh(dvp, cnp, newvp, v3, &xid, gotvp); + if (!gotvp) { error = nfs_lookitup(dvp, cnp->cn_nameptr, - cnp->cn_namelen, cnp->cn_cred, cnp->cn_proc, &np); + cnp->cn_namelen, cred, p, &np); if (!error) newvp = NFSTOV(np); } } if (v3 && mrep) - nfsm_wcc_data(dvp, premtime, wccpostattr, &xid); + nfsm_wcc_data(dvp, &premtime, wccpostattr, &xid); + if (!error && (gotuid || gotgid) && + (!newvp || nfs_getattrcache(newvp, &nvattr) || + (gotuid && (nvattr.nva_uid != vap->va_uid)) || + (gotgid && (nvattr.nva_gid != vap->va_gid)))) { + /* clear ID bits if server didn't use them (or we can't tell) */ + VATTR_CLEAR_SUPPORTED(vap, va_uid); + VATTR_CLEAR_SUPPORTED(vap, va_gid); + } nfsm_reqdone; if (error) { if (newvp) - vput(newvp); + vnode_put(newvp); } else { - if (cnp->cn_flags & MAKEENTRY) - cache_enter(dvp, newvp, cnp); *vpp = newvp; } VTONFS(dvp)->n_flag |= NMODIFIED; /* if directory hadn't changed, update namecache mtime */ - if (VTONFS(dvp)->n_ncmtime == premtime) - VTONFS(dvp)->n_ncmtime = VTONFS(dvp)->n_vattr.va_mtime.tv_sec; + if (nfstimespeccmp(&VTONFS(dvp)->n_ncmtime, &premtime, ==)) + VTONFS(dvp)->n_ncmtime = VTONFS(dvp)->n_vattr.nva_mtime; if (!wccpostattr) - VTONFS(dvp)->n_xid = 0; - vput(dvp); - NFS_FREE_PNBUF(cnp); + NATTRINVALIDATE(VTONFS(dvp)); return (error); } @@ -1911,20 +2030,21 @@ nfs_mknodrpc(dvp, vpp, cnp, vap) /* ARGSUSED */ static int nfs_mknod(ap) - struct vop_mknod_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; + struct vnop_mknod_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; + struct vnode_attr *a_vap; + vfs_context_t a_context; } */ *ap; { - struct vnode *newvp; int error; - error = nfs_mknodrpc(ap->a_dvp, &newvp, ap->a_cnp, ap->a_vap); - if (!error && newvp) - vput(newvp); - *ap->a_vpp = 0; + error = nfs_mknodrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap, + vfs_context_ucred(ap->a_context), + vfs_context_proc(ap->a_context)); + return (error); } @@ -1934,50 +2054,66 @@ static u_long create_verf; */ static int nfs_create(ap) - struct vop_create_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; + struct vnop_create_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; + struct vnode_attr *a_vap; + vfs_context_t a_context; } */ *ap; { - register struct vnode *dvp = ap->a_dvp; - register struct vattr *vap = ap->a_vap; - register struct componentname *cnp = ap->a_cnp; - register struct nfsv2_sattr *sp; - register struct nfsv3_sattr *sp3; - register u_long *tl; - register caddr_t cp; - register long t1, t2; + vnode_t dvp = ap->a_dvp; + struct vnode_attr *vap = ap->a_vap; + struct componentname *cnp = 
ap->a_cnp; + struct nfs_vattr nvattr; + struct nfsv2_sattr *sp; + u_long *tl; + caddr_t cp; + long t1, t2; struct nfsnode *np = (struct nfsnode *)0; - struct vnode *newvp = (struct vnode *)0; + vnode_t newvp = (vnode_t)0; caddr_t bpos, dpos, cp2; int error = 0, wccpostattr = 0, gotvp = 0, fmode = 0; - time_t premtime = 0; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; - struct vattr vattr; + struct timespec premtime = { 0, 0 }; + mbuf_t mreq, mrep, md, mb, mb2; int v3 = NFS_ISV3(dvp); + int gotuid, gotgid; u_int64_t xid; + kauth_cred_t cred; + proc_t p; + + cred = vfs_context_ucred(ap->a_context); + p = vfs_context_proc(ap->a_context); + + if (!VATTR_IS_ACTIVE(vap, va_type)) + return (EINVAL); /* * Oops, not for me.. */ if (vap->va_type == VSOCK) - return (nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap)); + return (nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap, cred, p)); + + VATTR_SET_SUPPORTED(vap, va_mode); + VATTR_SET_SUPPORTED(vap, va_uid); + VATTR_SET_SUPPORTED(vap, va_gid); + VATTR_SET_SUPPORTED(vap, va_data_size); + VATTR_SET_SUPPORTED(vap, va_access_time); + VATTR_SET_SUPPORTED(vap, va_modify_time); + gotuid = VATTR_IS_ACTIVE(vap, va_uid); + gotgid = VATTR_IS_ACTIVE(vap, va_gid); - if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc))) { - VOP_ABORTOP(dvp, cnp); - vput(dvp); - return (error); - } if (vap->va_vaflags & VA_EXCLUSIVE) fmode |= O_EXCL; again: - nfsstats.rpccnt[NFSPROC_CREATE]++; - nfsm_reqhead(dvp, NFSPROC_CREATE, NFSX_FH(v3) + 2 * NFSX_UNSIGNED + + nfsm_reqhead(NFSX_FH(v3) + 2 * NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3)); + if (error) + return (error); + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_CREATE]); nfsm_fhtom(dvp, v3); - nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN, v3); if (v3) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); if (fmode & O_EXCL) { @@ -1990,35 +2126,39 @@ again: *tl = ++create_verf; } else { *tl = txdr_unsigned(NFSV3CREATE_UNCHECKED); - nfsm_build(tl, u_long *, NFSX_V3SRVSATTR); - sp3 = (struct nfsv3_sattr *)tl; - nfsm_v3sattr(sp3, vap, cnp->cn_cred->cr_uid, vattr.va_gid); + nfsm_v3sattr(vap); } } else { + struct timespec neg1time = { -1, -1 }; nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); - sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); - sp->sa_uid = txdr_unsigned(cnp->cn_cred->cr_uid); - sp->sa_gid = txdr_unsigned(vattr.va_gid); + sp->sa_mode = vtonfsv2_mode(vap->va_type, + (VATTR_IS_ACTIVE(vap, va_mode) ? vap->va_mode : 0600)); + sp->sa_uid = gotuid ? (u_long)txdr_unsigned(vap->va_uid) : nfs_xdrneg1; + sp->sa_gid = gotgid ? 
(u_long)txdr_unsigned(vap->va_gid) : nfs_xdrneg1; sp->sa_size = 0; - txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); - txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); + if (VATTR_IS_ACTIVE(vap, va_access_time)) { + txdr_nfsv2time(&vap->va_access_time, &sp->sa_atime); + } else { + txdr_nfsv2time(&neg1time, &sp->sa_atime); + } + if (VATTR_IS_ACTIVE(vap, va_modify_time)) { + txdr_nfsv2time(&vap->va_modify_time, &sp->sa_mtime); + } else { + txdr_nfsv2time(&neg1time, &sp->sa_mtime); + } } - nfsm_request(dvp, NFSPROC_CREATE, cnp->cn_proc, cnp->cn_cred, &xid); + nfsm_request(dvp, NFSPROC_CREATE, p, cred, &xid); if (!error) { - nfsm_mtofh(dvp, newvp, v3, gotvp, &xid); + nfsm_mtofh(dvp, cnp, newvp, v3, &xid, gotvp); if (!gotvp) { - if (newvp) { - vput(newvp); - newvp = (struct vnode *)0; - } error = nfs_lookitup(dvp, cnp->cn_nameptr, - cnp->cn_namelen, cnp->cn_cred, cnp->cn_proc, &np); + cnp->cn_namelen, cred, p, &np); if (!error) newvp = NFSTOV(np); } } if (v3 && mrep) - nfsm_wcc_data(dvp, premtime, wccpostattr, &xid); + nfsm_wcc_data(dvp, &premtime, wccpostattr, &xid); nfsm_reqdone; if (error) { if (v3 && (fmode & O_EXCL) && error == NFSERR_NOTSUPP) { @@ -2026,25 +2166,36 @@ again: goto again; } if (newvp) - vput(newvp); + vnode_put(newvp); } else if (v3 && (fmode & O_EXCL)) { - error = nfs_setattrrpc(newvp, vap, cnp->cn_cred, cnp->cn_proc); + error = nfs_setattrrpc(newvp, vap, cred, p); + if (error && (gotuid || gotgid)) { + /* it's possible the server didn't like our attempt to set IDs. */ + /* so, let's try it again without those */ + VATTR_CLEAR_ACTIVE(vap, va_uid); + VATTR_CLEAR_ACTIVE(vap, va_gid); + error = nfs_setattrrpc(newvp, vap, cred, p); + } if (error) - vput(newvp); + vnode_put(newvp); } if (!error) { - if (cnp->cn_flags & MAKEENTRY) - cache_enter(dvp, newvp, cnp); *ap->a_vpp = newvp; } VTONFS(dvp)->n_flag |= NMODIFIED; /* if directory hadn't changed, update namecache mtime */ - if (VTONFS(dvp)->n_ncmtime == premtime) - VTONFS(dvp)->n_ncmtime = VTONFS(dvp)->n_vattr.va_mtime.tv_sec; + if (nfstimespeccmp(&VTONFS(dvp)->n_ncmtime, &premtime, ==)) + VTONFS(dvp)->n_ncmtime = VTONFS(dvp)->n_vattr.nva_mtime; if (!wccpostattr) - VTONFS(dvp)->n_xid = 0; - vput(dvp); - NFS_FREE_PNBUF(cnp); + NATTRINVALIDATE(VTONFS(dvp)); + if (!error && (gotuid || gotgid) && + (!newvp || nfs_getattrcache(newvp, &nvattr) || + (gotuid && (nvattr.nva_uid != vap->va_uid)) || + (gotgid && (nvattr.nva_gid != vap->va_gid)))) { + /* clear ID bits if server didn't use them (or we can't tell) */ + VATTR_CLEAR_SUPPORTED(vap, va_uid); + VATTR_CLEAR_SUPPORTED(vap, va_gid); + } return (error); } @@ -2053,7 +2204,7 @@ again: * To try and make nfs semantics closer to ufs semantics, a file that has * other processes using the vnode is renamed instead of removed and then * removed later on the last close. 
- * - If v_usecount > 1 + * - If vnode_isinuse() * If a rename is not already in the works * call nfs_sillyrename() to set it up * else @@ -2061,53 +2212,35 @@ again: */ static int nfs_remove(ap) - struct vop_remove_args /* { + struct vnop_remove_args /* { struct vnodeop_desc *a_desc; - struct vnode * a_dvp; - struct vnode * a_vp; - struct componentname * a_cnp; + vnode_t a_dvp; + vnode_t a_vp; + struct componentname *a_cnp; + int a_flags; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp = ap->a_vp; - register struct vnode *dvp = ap->a_dvp; - register struct componentname *cnp = ap->a_cnp; - register struct nfsnode *np = VTONFS(vp); + vnode_t vp = ap->a_vp; + vnode_t dvp = ap->a_dvp; + struct componentname *cnp = ap->a_cnp; + struct nfsnode *np = VTONFS(vp); int error = 0, gofree = 0; - struct vattr vattr; + struct nfs_vattr nvattr; + kauth_cred_t cred; + proc_t p; -#if DIAGNOSTIC - if ((cnp->cn_flags & HASBUF) == 0) - panic("nfs_remove: no name"); - if (vp->v_usecount < 1) - panic("nfs_remove: bad v_usecount"); -#endif + cred = vfs_context_ucred(ap->a_context); + p = vfs_context_proc(ap->a_context); - if (UBCISVALID(vp)) { - /* regular files */ - if (UBCINFOEXISTS(vp)) - gofree = (ubc_isinuse(vp, 1)) ? 0 : 1; - else { - /* dead or dying vnode.With vnode locking panic instead of error */ - vput(dvp); - vput(vp); - NFS_FREE_PNBUF(cnp); - return (EIO); - } - } else { - /* UBC not in play */ - if (vp->v_usecount == 1) - gofree = 1; - } - if ((ap->a_cnp->cn_flags & NODELETEBUSY) && !gofree) { + gofree = vnode_isinuse(vp, 0) ? 0 : 1; + if ((ap->a_flags & VNODE_REMOVE_NODELETEBUSY) && !gofree) { /* Caller requested Carbon delete semantics, but file is busy */ - vput(dvp); - vput(vp); - NFS_FREE_PNBUF(cnp); return (EBUSY); } if (gofree || (np->n_sillyrename && - VOP_GETATTR(vp, &vattr, cnp->cn_cred, cnp->cn_proc) == 0 && - vattr.va_nlink > 1)) { + nfs_getattr(vp, &nvattr, cred, p) == 0 && + nvattr.nva_nlink > 1)) { /* * Purge the name cache so that the chance of a lookup for * the name succeeding while the remove is in progress is @@ -2118,13 +2251,13 @@ nfs_remove(ap) * throw away biocache buffers, mainly to avoid * unnecessary delayed writes later. */ - error = nfs_vinvalbuf(vp, 0, cnp->cn_cred, cnp->cn_proc, 1); + error = nfs_vinvalbuf(vp, 0, cred, p, 1); np->n_size = 0; ubc_setsize(vp, (off_t)0); /* XXX check error */ /* Do the rpc */ if (error != EINTR) error = nfs_removerpc(dvp, cnp->cn_nameptr, - cnp->cn_namelen, cnp->cn_cred, cnp->cn_proc); + cnp->cn_namelen, cred, p); /* * Kludge City: If the first reply to the remove rpc is lost.. 
* the reply to the retransmitted request will be ENOENT @@ -2139,19 +2272,21 @@ nfs_remove(ap) * again if another object gets created with the same filehandle * before this vnode gets reclaimed */ + lck_mtx_lock(nfs_node_hash_mutex); LIST_REMOVE(np, n_hash); np->n_flag &= ~NHASHED; + lck_mtx_unlock(nfs_node_hash_mutex); + } + if (!error && !np->n_sillyrename) { + /* clear flags now: won't get nfs_inactive for recycled vnode */ + /* clear all flags other than these */ + np->n_flag &= (NMODIFIED | NFLUSHINPROG | NFLUSHWANT | NHASHED); + vnode_recycle(vp); } } else if (!np->n_sillyrename) { - error = nfs_sillyrename(dvp, vp, cnp); + error = nfs_sillyrename(dvp, vp, cnp, cred, p); } - np->n_xid = 0; - vput(dvp); - - VOP_UNLOCK(vp, 0, cnp->cn_proc); - NFS_FREE_PNBUF(cnp); - ubc_uncache(vp); - vrele(vp); + NATTRINVALIDATE(np); return (error); } @@ -2160,12 +2295,9 @@ nfs_remove(ap) * nfs file remove rpc called from nfs_inactive */ int -nfs_removeit(sp) - register struct sillyrename *sp; +nfs_removeit(struct sillyrename *sp) { - - return (nfs_removerpc(sp->s_dvp, sp->s_name, sp->s_namlen, sp->s_cred, - (struct proc *)0)); + return (nfs_removerpc(sp->s_dvp, sp->s_name, sp->s_namlen, sp->s_cred, NULL)); } /* @@ -2173,41 +2305,42 @@ nfs_removeit(sp) */ static int nfs_removerpc(dvp, name, namelen, cred, proc) - register struct vnode *dvp; + vnode_t dvp; char *name; int namelen; - struct ucred *cred; - struct proc *proc; + kauth_cred_t cred; + proc_t proc; { register u_long *tl; register caddr_t cp; register long t1, t2; caddr_t bpos, dpos, cp2; int error = 0, wccpostattr = 0; - time_t premtime = 0; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct timespec premtime = { 0, 0 }; + mbuf_t mreq, mrep, md, mb, mb2; int v3; u_int64_t xid; - if (!VFSTONFS(dvp->v_mount)) + if (!VFSTONFS(vnode_mount(dvp))) return (ENXIO); v3 = NFS_ISV3(dvp); - nfsstats.rpccnt[NFSPROC_REMOVE]++; - nfsm_reqhead(dvp, NFSPROC_REMOVE, - NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(namelen)); + nfsm_reqhead(NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(namelen)); + if (error) + return (error); + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_REMOVE]); nfsm_fhtom(dvp, v3); - nfsm_strtom(name, namelen, NFS_MAXNAMLEN); + nfsm_strtom(name, namelen, NFS_MAXNAMLEN, v3); nfsm_request(dvp, NFSPROC_REMOVE, proc, cred, &xid); if (v3 && mrep) - nfsm_wcc_data(dvp, premtime, wccpostattr, &xid); + nfsm_wcc_data(dvp, &premtime, wccpostattr, &xid); nfsm_reqdone; VTONFS(dvp)->n_flag |= NMODIFIED; /* if directory hadn't changed, update namecache mtime */ - if (VTONFS(dvp)->n_ncmtime == premtime) - VTONFS(dvp)->n_ncmtime = VTONFS(dvp)->n_vattr.va_mtime.tv_sec; + if (nfstimespeccmp(&VTONFS(dvp)->n_ncmtime, &premtime, ==)) + VTONFS(dvp)->n_ncmtime = VTONFS(dvp)->n_vattr.nva_mtime; if (!wccpostattr) - VTONFS(dvp)->n_xid = 0; + NATTRINVALIDATE(VTONFS(dvp)); return (error); } @@ -2216,34 +2349,40 @@ nfs_removerpc(dvp, name, namelen, cred, proc) */ static int nfs_rename(ap) - struct vop_rename_args /* { - struct vnode *a_fdvp; - struct vnode *a_fvp; + struct vnop_rename_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_fdvp; + vnode_t a_fvp; struct componentname *a_fcnp; - struct vnode *a_tdvp; - struct vnode *a_tvp; + vnode_t a_tdvp; + vnode_t a_tvp; struct componentname *a_tcnp; + vfs_context_t a_context; } */ *ap; { - register struct vnode *fvp = ap->a_fvp; - register struct vnode *tvp = ap->a_tvp; - register struct vnode *fdvp = ap->a_fdvp; - register struct vnode *tdvp = ap->a_tdvp; - register struct componentname *tcnp = ap->a_tcnp; - register struct 
componentname *fcnp = ap->a_fcnp; + vnode_t fvp = ap->a_fvp; + vnode_t tvp = ap->a_tvp; + vnode_t fdvp = ap->a_fdvp; + vnode_t tdvp = ap->a_tdvp; + struct componentname *tcnp = ap->a_tcnp; + struct componentname *fcnp = ap->a_fcnp; int error, inuse=0; + mount_t fmp, tdmp, tmp; + struct nfsnode *tnp; + kauth_cred_t cred; + proc_t p; + + cred = vfs_context_ucred(ap->a_context); + p = vfs_context_proc(ap->a_context); + + tnp = tvp ? VTONFS(tvp) : NULL; -#if DIAGNOSTIC - if ((tcnp->cn_flags & HASBUF) == 0 || - (fcnp->cn_flags & HASBUF) == 0) - panic("nfs_rename: no name"); -#endif /* Check for cross-device rename */ - if ((fvp->v_mount != tdvp->v_mount) || - (tvp && (fvp->v_mount != tvp->v_mount))) { + fmp = vnode_mount(fvp); + tmp = tvp ? vnode_mount(tvp) : NULL; + tdmp = vnode_mount(tdvp); + if ((fmp != tdmp) || (tvp && (fmp != tmp))) { error = EXDEV; - if (tvp) - VOP_UNLOCK(tvp, 0, tcnp->cn_proc); goto out; } @@ -2255,66 +2394,54 @@ nfs_rename(ap) * links or case-variants) */ if (tvp && tvp != fvp) { - if (UBCISVALID(tvp)) { - /* regular files */ - if (UBCINFOEXISTS(tvp)) - inuse = (ubc_isinuse(tvp, 1)) ? 1 : 0; - else { - /* dead or dying vnode.With vnode locking panic instead of error */ - error = EIO; - VOP_UNLOCK(tvp, 0, tcnp->cn_proc); - goto out; - } - } else { - /* UBC not in play */ - if (tvp->v_usecount > 1) - inuse = 1; - } + inuse = vnode_isinuse(tvp, 0); } - if (inuse && !VTONFS(tvp)->n_sillyrename && tvp->v_type != VDIR) { - if (error = nfs_sillyrename(tdvp, tvp, tcnp)) { + if (inuse && !tnp->n_sillyrename && vnode_vtype(tvp) != VDIR) { + if ((error = nfs_sillyrename(tdvp, tvp, tcnp, cred, p))) { /* sillyrename failed. Instead of pressing on, return error */ - VOP_UNLOCK(tvp, 0, tcnp->cn_proc); goto out; /* should not be ENOENT. */ } else { /* sillyrename succeeded.*/ - VOP_UNLOCK(tvp, 0, tcnp->cn_proc); - ubc_uncache(tvp); /* get the nfs turd file to disappear */ - vrele(tvp); tvp = NULL; } } error = nfs_renamerpc(fdvp, fcnp->cn_nameptr, fcnp->cn_namelen, - tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred, - tcnp->cn_proc); + tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, cred, p); - if (!error && tvp && tvp != fvp && !VTONFS(tvp)->n_sillyrename) { + /* + * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry. + */ + if (error == ENOENT) + error = 0; + + if (!error && tvp && tvp != fvp && !tnp->n_sillyrename) { /* * remove nfsnode from hash now so we can't accidentally find it * again if another object gets created with the same filehandle * before this vnode gets reclaimed */ - LIST_REMOVE(VTONFS(tvp), n_hash); - VTONFS(tvp)->n_flag &= ~NHASHED; + lck_mtx_lock(nfs_node_hash_mutex); + LIST_REMOVE(tnp, n_hash); + tnp->n_flag &= ~NHASHED; + lck_mtx_unlock(nfs_node_hash_mutex); } + /* purge the old name cache entries and enter the new one */ cache_purge(fvp); if (tvp) { cache_purge(tvp); - VOP_UNLOCK(tvp, 0, tcnp->cn_proc); - ubc_uncache(tvp); /* get the nfs turd file to disappear */ + if (!error && !tnp->n_sillyrename) { + /* clear flags now: won't get nfs_inactive for recycled vnode */ + /* clear all flags other than these */ + tnp->n_flag &= (NMODIFIED | NFLUSHINPROG | NFLUSHWANT | NHASHED); + vnode_recycle(tvp); + } } - + if (!error) + cache_enter(tdvp, fvp, tcnp); + out: - if (tdvp == tvp) - vrele(tdvp); - else - vput(tdvp); - if (tvp) - vrele(tvp); /* already unlocked */ - vrele(fdvp); - vrele(fvp); /* * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry. 
*/ @@ -2324,74 +2451,62 @@ out: } /* - * nfs file rename rpc called from nfs_remove() above - */ -static int -nfs_renameit(sdvp, scnp, sp) - struct vnode *sdvp; - struct componentname *scnp; - register struct sillyrename *sp; -{ - return (nfs_renamerpc(sdvp, scnp->cn_nameptr, scnp->cn_namelen, - sdvp, sp->s_name, sp->s_namlen, scnp->cn_cred, scnp->cn_proc)); -} - -/* - * Do an nfs rename rpc. Called from nfs_rename() and nfs_renameit(). + * Do an nfs rename rpc. Called from nfs_rename() and nfs_sillyrename(). */ static int nfs_renamerpc(fdvp, fnameptr, fnamelen, tdvp, tnameptr, tnamelen, cred, proc) - register struct vnode *fdvp; + vnode_t fdvp; char *fnameptr; int fnamelen; - register struct vnode *tdvp; + vnode_t tdvp; char *tnameptr; int tnamelen; - struct ucred *cred; - struct proc *proc; + kauth_cred_t cred; + proc_t proc; { register u_long *tl; register caddr_t cp; register long t1, t2; caddr_t bpos, dpos, cp2; int error = 0, fwccpostattr = 0, twccpostattr = 0; - time_t fpremtime = 0, tpremtime = 0; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct timespec fpremtime = { 0, 0 }, tpremtime = { 0, 0 }; + mbuf_t mreq, mrep, md, mb, mb2; int v3; u_int64_t xid; - if (!VFSTONFS(fdvp->v_mount)) + if (!VFSTONFS(vnode_mount(fdvp))) return (ENXIO); v3 = NFS_ISV3(fdvp); - nfsstats.rpccnt[NFSPROC_RENAME]++; - nfsm_reqhead(fdvp, NFSPROC_RENAME, - (NFSX_FH(v3) + NFSX_UNSIGNED)*2 + nfsm_rndup(fnamelen) + + nfsm_reqhead((NFSX_FH(v3) + NFSX_UNSIGNED)*2 + nfsm_rndup(fnamelen) + nfsm_rndup(tnamelen)); + if (error) + return (error); + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_RENAME]); nfsm_fhtom(fdvp, v3); - nfsm_strtom(fnameptr, fnamelen, NFS_MAXNAMLEN); + nfsm_strtom(fnameptr, fnamelen, NFS_MAXNAMLEN, v3); nfsm_fhtom(tdvp, v3); - nfsm_strtom(tnameptr, tnamelen, NFS_MAXNAMLEN); + nfsm_strtom(tnameptr, tnamelen, NFS_MAXNAMLEN, v3); nfsm_request(fdvp, NFSPROC_RENAME, proc, cred, &xid); if (v3 && mrep) { u_int64_t txid = xid; - nfsm_wcc_data(fdvp, fpremtime, fwccpostattr, &xid); - nfsm_wcc_data(tdvp, tpremtime, twccpostattr, &txid); + nfsm_wcc_data(fdvp, &fpremtime, fwccpostattr, &xid); + nfsm_wcc_data(tdvp, &tpremtime, twccpostattr, &txid); } nfsm_reqdone; VTONFS(fdvp)->n_flag |= NMODIFIED; /* if directory hadn't changed, update namecache mtime */ - if (VTONFS(fdvp)->n_ncmtime == fpremtime) - VTONFS(fdvp)->n_ncmtime = VTONFS(fdvp)->n_vattr.va_mtime.tv_sec; + if (nfstimespeccmp(&VTONFS(fdvp)->n_ncmtime, &fpremtime, ==)) + VTONFS(fdvp)->n_ncmtime = VTONFS(fdvp)->n_vattr.nva_mtime; if (!fwccpostattr) - VTONFS(fdvp)->n_xid = 0; + NATTRINVALIDATE(VTONFS(fdvp)); VTONFS(tdvp)->n_flag |= NMODIFIED; /* if directory hadn't changed, update namecache mtime */ - if (VTONFS(tdvp)->n_ncmtime == tpremtime) - VTONFS(tdvp)->n_ncmtime = VTONFS(tdvp)->n_vattr.va_mtime.tv_sec; + if (nfstimespeccmp(&VTONFS(tdvp)->n_ncmtime, &tpremtime, ==)) + VTONFS(tdvp)->n_ncmtime = VTONFS(tdvp)->n_vattr.nva_mtime; if (!twccpostattr) - VTONFS(tdvp)->n_xid = 0; + NATTRINVALIDATE(VTONFS(tdvp)); return (error); } @@ -2400,44 +2515,36 @@ nfs_renamerpc(fdvp, fnameptr, fnamelen, tdvp, tnameptr, tnamelen, cred, proc) */ static int nfs_link(ap) - struct vop_link_args /* { - struct vnode *a_vp; - struct vnode *a_tdvp; + struct vnop_link_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + vnode_t a_tdvp; struct componentname *a_cnp; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp = ap->a_vp; - register struct vnode *tdvp = ap->a_tdvp; - register struct componentname *cnp = ap->a_cnp; - register u_long *tl; - 
register caddr_t cp; - register long t1, t2; + vnode_t vp = ap->a_vp; + vnode_t tdvp = ap->a_tdvp; + struct componentname *cnp = ap->a_cnp; + u_long *tl; + caddr_t cp; + long t1, t2; caddr_t bpos, dpos, cp2; int error = 0, wccpostattr = 0, attrflag = 0; - time_t premtime = 0; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; - int v3, didhold; + struct timespec premtime = { 0, 0 }; + mbuf_t mreq, mrep, md, mb, mb2; + int v3; u_int64_t xid; + kauth_cred_t cred; + proc_t p; - if (vp->v_mount != tdvp->v_mount) { - VOP_ABORTOP(vp, cnp); - vput(tdvp); + if (vnode_mount(vp) != vnode_mount(tdvp)) { return (EXDEV); } - /* need to get vnode lock for vp before calling VOP_FSYNC() */ - if (error = vn_lock(vp, LK_EXCLUSIVE, cnp->cn_proc)) { - VOP_ABORTOP(vp, cnp); - vput(tdvp); - return (error); - } + cred = vfs_context_ucred(ap->a_context); + p = vfs_context_proc(ap->a_context); - if (!VFSTONFS(vp->v_mount)) { - VOP_UNLOCK(vp, 0, cnp->cn_proc); - VOP_ABORTOP(vp, cnp); - vput(tdvp); - return (ENXIO); - } v3 = NFS_ISV3(vp); /* @@ -2445,37 +2552,32 @@ nfs_link(ap) * doesn't get "out of sync" with the server. * XXX There should be a better way! */ - didhold = ubc_hold(vp); - VOP_FSYNC(vp, cnp->cn_cred, MNT_WAIT, cnp->cn_proc); - VOP_UNLOCK(vp, 0, cnp->cn_proc); + nfs_flush(vp, MNT_WAIT, cred, p, 0); - nfsstats.rpccnt[NFSPROC_LINK]++; - nfsm_reqhead(vp, NFSPROC_LINK, - NFSX_FH(v3)*2 + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen)); + nfsm_reqhead(NFSX_FH(v3)*2 + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen)); + if (error) + return (error); + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_LINK]); nfsm_fhtom(vp, v3); nfsm_fhtom(tdvp, v3); - nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); - nfsm_request(vp, NFSPROC_LINK, cnp->cn_proc, cnp->cn_cred, &xid); + nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN, v3); + nfsm_request(vp, NFSPROC_LINK, p, cred, &xid); if (v3 && mrep) { u_int64_t txid = xid; - nfsm_postop_attr(vp, attrflag, &xid); - nfsm_wcc_data(tdvp, premtime, wccpostattr, &txid); + nfsm_postop_attr_update(vp, v3, attrflag, &xid); + nfsm_wcc_data(tdvp, &premtime, wccpostattr, &txid); } nfsm_reqdone; VTONFS(tdvp)->n_flag |= NMODIFIED; if (!attrflag) - VTONFS(vp)->n_xid = 0; + NATTRINVALIDATE(VTONFS(vp)); /* if directory hadn't changed, update namecache mtime */ - if (VTONFS(tdvp)->n_ncmtime == premtime) - VTONFS(tdvp)->n_ncmtime = VTONFS(tdvp)->n_vattr.va_mtime.tv_sec; + if (nfstimespeccmp(&VTONFS(tdvp)->n_ncmtime, &premtime, ==)) + VTONFS(tdvp)->n_ncmtime = VTONFS(tdvp)->n_vattr.nva_mtime; if (!wccpostattr) - VTONFS(tdvp)->n_xid = 0; - if (didhold) - ubc_rele(vp); - vput(tdvp); - NFS_FREE_PNBUF(cnp); + NATTRINVALIDATE(VTONFS(tdvp)); /* * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. 
*/ @@ -2489,76 +2591,127 @@ nfs_link(ap) */ static int nfs_symlink(ap) - struct vop_symlink_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; + struct vnop_symlink_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; + struct vnode_attr *a_vap; char *a_target; + vfs_context_t a_context; } */ *ap; { - register struct vnode *dvp = ap->a_dvp; - register struct vattr *vap = ap->a_vap; - register struct componentname *cnp = ap->a_cnp; - register struct nfsv2_sattr *sp; - register struct nfsv3_sattr *sp3; - register u_long *tl; - register caddr_t cp; - register long t1, t2; + vnode_t dvp = ap->a_dvp; + struct vnode_attr *vap = ap->a_vap; + struct componentname *cnp = ap->a_cnp; + struct nfs_vattr nvattr; + struct nfsv2_sattr *sp; + u_long *tl; + caddr_t cp; + long t1, t2; caddr_t bpos, dpos, cp2; - int slen, error = 0, wccpostattr = 0, gotvp; - time_t premtime = 0; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; - struct vnode *newvp = (struct vnode *)0; + int slen, error = 0, wccpostattr = 0, gotvp = 0; + struct timespec premtime = { 0, 0 }; + mbuf_t mreq, mrep, md, mb, mb2; + vnode_t newvp = (vnode_t)0; int v3 = NFS_ISV3(dvp); + int gotuid, gotgid; u_int64_t xid; + kauth_cred_t cred; + proc_t p; + struct nfsnode *np = NULL; + + cred = vfs_context_ucred(ap->a_context); + p = vfs_context_proc(ap->a_context); - nfsstats.rpccnt[NFSPROC_SYMLINK]++; slen = strlen(ap->a_target); - nfsm_reqhead(dvp, NFSPROC_SYMLINK, NFSX_FH(v3) + 2*NFSX_UNSIGNED + + nfsm_reqhead(NFSX_FH(v3) + 2*NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + nfsm_rndup(slen) + NFSX_SATTR(v3)); + if (error) + return (error); + + VATTR_SET_SUPPORTED(vap, va_mode); + VATTR_SET_SUPPORTED(vap, va_uid); + VATTR_SET_SUPPORTED(vap, va_gid); + VATTR_SET_SUPPORTED(vap, va_data_size); + VATTR_SET_SUPPORTED(vap, va_access_time); + VATTR_SET_SUPPORTED(vap, va_modify_time); + gotuid = VATTR_IS_ACTIVE(vap, va_uid); + gotgid = VATTR_IS_ACTIVE(vap, va_gid); + + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_SYMLINK]); nfsm_fhtom(dvp, v3); - nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN, v3); if (v3) { - nfsm_build(sp3, struct nfsv3_sattr *, NFSX_V3SRVSATTR); - nfsm_v3sattr(sp3, vap, cnp->cn_cred->cr_uid, - cnp->cn_cred->cr_gid); + nfsm_v3sattr(vap); } - nfsm_strtom(ap->a_target, slen, NFS_MAXPATHLEN); + nfsm_strtom(ap->a_target, slen, NFS_MAXPATHLEN, v3); if (!v3) { + struct timespec neg1time = { -1, -1 }; nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); - sp->sa_mode = vtonfsv2_mode(VLNK, vap->va_mode); - sp->sa_uid = txdr_unsigned(cnp->cn_cred->cr_uid); - sp->sa_gid = txdr_unsigned(cnp->cn_cred->cr_gid); - sp->sa_size = -1; - txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); - txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); + sp->sa_mode = vtonfsv2_mode(VLNK, + (VATTR_IS_ACTIVE(vap, va_mode) ? vap->va_mode : 0600)); + sp->sa_uid = gotuid ? (u_long)txdr_unsigned(vap->va_uid) : nfs_xdrneg1; + sp->sa_gid = gotgid ? 
(u_long)txdr_unsigned(vap->va_gid) : nfs_xdrneg1; + sp->sa_size = nfs_xdrneg1; + if (VATTR_IS_ACTIVE(vap, va_access_time)) { + txdr_nfsv2time(&vap->va_access_time, &sp->sa_atime); + } else { + txdr_nfsv2time(&neg1time, &sp->sa_atime); + } + if (VATTR_IS_ACTIVE(vap, va_modify_time)) { + txdr_nfsv2time(&vap->va_modify_time, &sp->sa_mtime); + } else { + txdr_nfsv2time(&neg1time, &sp->sa_mtime); + } } - nfsm_request(dvp, NFSPROC_SYMLINK, cnp->cn_proc, cnp->cn_cred, &xid); + nfsm_request(dvp, NFSPROC_SYMLINK, p, cred, &xid); if (v3 && mrep) { u_int64_t dxid = xid; if (!error) - nfsm_mtofh(dvp, newvp, v3, gotvp, &xid); - nfsm_wcc_data(dvp, premtime, wccpostattr, &dxid); + nfsm_mtofh(dvp, cnp, newvp, v3, &xid, gotvp); + nfsm_wcc_data(dvp, &premtime, wccpostattr, &dxid); } nfsm_reqdone; - if (newvp) - vput(newvp); VTONFS(dvp)->n_flag |= NMODIFIED; /* if directory hadn't changed, update namecache mtime */ - if (VTONFS(dvp)->n_ncmtime == premtime) - VTONFS(dvp)->n_ncmtime = VTONFS(dvp)->n_vattr.va_mtime.tv_sec; + if (nfstimespeccmp(&VTONFS(dvp)->n_ncmtime, &premtime, ==)) + VTONFS(dvp)->n_ncmtime = VTONFS(dvp)->n_vattr.nva_mtime; if (!wccpostattr) - VTONFS(dvp)->n_xid = 0; - vput(dvp); - NFS_FREE_PNBUF(cnp); + NATTRINVALIDATE(VTONFS(dvp)); + /* - * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. + * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry + * if we can succeed in looking up the symlink. */ - if (error == EEXIST) - error = 0; + if ((error == EEXIST) || (!error && !gotvp)) { + if (newvp) { + vnode_put(newvp); + newvp = NULL; + } + error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cred, p, &np); + if (!error) { + newvp = NFSTOV(np); + if (vnode_vtype(newvp) != VLNK) + error = EEXIST; + } + } + if (!error && (gotuid || gotgid) && + (!newvp || nfs_getattrcache(newvp, &nvattr) || + (gotuid && (nvattr.nva_uid != vap->va_uid)) || + (gotgid && (nvattr.nva_gid != vap->va_gid)))) { + /* clear ID bits if server didn't use them (or we can't tell) */ + VATTR_CLEAR_SUPPORTED(vap, va_uid); + VATTR_CLEAR_SUPPORTED(vap, va_gid); + } + if (error) { + if (newvp) + vnode_put(newvp); + } else { + *ap->a_vpp = newvp; + } return (error); } @@ -2567,96 +2720,121 @@ nfs_symlink(ap) */ static int nfs_mkdir(ap) - struct vop_mkdir_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; + struct vnop_mkdir_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; + struct vnode_attr *a_vap; + vfs_context_t a_context; } */ *ap; { - register struct vnode *dvp = ap->a_dvp; - register struct vattr *vap = ap->a_vap; - register struct componentname *cnp = ap->a_cnp; - register struct nfsv2_sattr *sp; - register struct nfsv3_sattr *sp3; - register u_long *tl; - register caddr_t cp; - register long t1, t2; - register int len; + vnode_t dvp = ap->a_dvp; + struct vnode_attr *vap = ap->a_vap; + struct componentname *cnp = ap->a_cnp; + struct nfs_vattr nvattr; + struct nfsv2_sattr *sp; + u_long *tl; + caddr_t cp; + long t1, t2; + int len; struct nfsnode *np = (struct nfsnode *)0; - struct vnode *newvp = (struct vnode *)0; + vnode_t newvp = (vnode_t)0; caddr_t bpos, dpos, cp2; int error = 0, wccpostattr = 0; - time_t premtime = 0; + struct timespec premtime = { 0, 0 }; int gotvp = 0; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; - struct vattr vattr; + mbuf_t mreq, mrep, md, mb, mb2; int v3 = NFS_ISV3(dvp); + int gotuid, gotgid; u_int64_t xid, dxid; + kauth_cred_t cred; + proc_t p; + + cred = 
vfs_context_ucred(ap->a_context); + p = vfs_context_proc(ap->a_context); - if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc))) { - VOP_ABORTOP(dvp, cnp); - vput(dvp); - return (error); - } len = cnp->cn_namelen; - nfsstats.rpccnt[NFSPROC_MKDIR]++; - nfsm_reqhead(dvp, NFSPROC_MKDIR, - NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len) + NFSX_SATTR(v3)); + nfsm_reqhead(NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len) + NFSX_SATTR(v3)); + if (error) + return (error); + + VATTR_SET_SUPPORTED(vap, va_mode); + VATTR_SET_SUPPORTED(vap, va_uid); + VATTR_SET_SUPPORTED(vap, va_gid); + VATTR_SET_SUPPORTED(vap, va_data_size); + VATTR_SET_SUPPORTED(vap, va_access_time); + VATTR_SET_SUPPORTED(vap, va_modify_time); + gotuid = VATTR_IS_ACTIVE(vap, va_uid); + gotgid = VATTR_IS_ACTIVE(vap, va_gid); + + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_MKDIR]); nfsm_fhtom(dvp, v3); - nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN); + nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN, v3); if (v3) { - nfsm_build(sp3, struct nfsv3_sattr *, NFSX_V3SRVSATTR); - nfsm_v3sattr(sp3, vap, cnp->cn_cred->cr_uid, vattr.va_gid); + nfsm_v3sattr(vap); } else { + struct timespec neg1time = { -1, -1 }; nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); - sp->sa_mode = vtonfsv2_mode(VDIR, vap->va_mode); - sp->sa_uid = txdr_unsigned(cnp->cn_cred->cr_uid); - sp->sa_gid = txdr_unsigned(vattr.va_gid); - sp->sa_size = -1; - txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); - txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); + sp->sa_mode = vtonfsv2_mode(VDIR, + (VATTR_IS_ACTIVE(vap, va_mode) ? vap->va_mode : 0600)); + sp->sa_uid = gotuid ? (u_long)txdr_unsigned(vap->va_uid) : nfs_xdrneg1; + sp->sa_gid = gotgid ? (u_long)txdr_unsigned(vap->va_gid) : nfs_xdrneg1; + sp->sa_size = nfs_xdrneg1; + if (VATTR_IS_ACTIVE(vap, va_access_time)) { + txdr_nfsv2time(&vap->va_access_time, &sp->sa_atime); + } else { + txdr_nfsv2time(&neg1time, &sp->sa_atime); + } + if (VATTR_IS_ACTIVE(vap, va_modify_time)) { + txdr_nfsv2time(&vap->va_modify_time, &sp->sa_mtime); + } else { + txdr_nfsv2time(&neg1time, &sp->sa_mtime); + } } - nfsm_request(dvp, NFSPROC_MKDIR, cnp->cn_proc, cnp->cn_cred, &xid); + nfsm_request(dvp, NFSPROC_MKDIR, p, cred, &xid); dxid = xid; if (!error) - nfsm_mtofh(dvp, newvp, v3, gotvp, &xid); + nfsm_mtofh(dvp, cnp, newvp, v3, &xid, gotvp); if (v3 && mrep) - nfsm_wcc_data(dvp, premtime, wccpostattr, &dxid); + nfsm_wcc_data(dvp, &premtime, wccpostattr, &dxid); nfsm_reqdone; VTONFS(dvp)->n_flag |= NMODIFIED; /* if directory hadn't changed, update namecache mtime */ - if (VTONFS(dvp)->n_ncmtime == premtime) - VTONFS(dvp)->n_ncmtime = VTONFS(dvp)->n_vattr.va_mtime.tv_sec; + if (nfstimespeccmp(&VTONFS(dvp)->n_ncmtime, &premtime, ==)) + VTONFS(dvp)->n_ncmtime = VTONFS(dvp)->n_vattr.nva_mtime; if (!wccpostattr) - VTONFS(dvp)->n_xid = 0; + NATTRINVALIDATE(VTONFS(dvp)); /* * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry * if we can succeed in looking up the directory. 
*/ if (error == EEXIST || (!error && !gotvp)) { if (newvp) { - vput(newvp); - newvp = (struct vnode *)0; + vnode_put(newvp); + newvp = NULL; } - error = nfs_lookitup(dvp, cnp->cn_nameptr, len, cnp->cn_cred, - cnp->cn_proc, &np); + error = nfs_lookitup(dvp, cnp->cn_nameptr, len, cred, p, &np); if (!error) { newvp = NFSTOV(np); - if (newvp->v_type != VDIR) + if (vnode_vtype(newvp) != VDIR) error = EEXIST; } } + if (!error && (gotuid || gotgid) && + (!newvp || nfs_getattrcache(newvp, &nvattr) || + (gotuid && (nvattr.nva_uid != vap->va_uid)) || + (gotgid && (nvattr.nva_gid != vap->va_gid)))) { + /* clear ID bits if server didn't use them (or we can't tell) */ + VATTR_CLEAR_SUPPORTED(vap, va_uid); + VATTR_CLEAR_SUPPORTED(vap, va_gid); + } if (error) { if (newvp) - vput(newvp); + vnode_put(newvp); } else { - if (cnp->cn_flags & MAKEENTRY) - cache_enter(dvp, newvp, cnp); *ap->a_vpp = newvp; } - vput(dvp); - NFS_FREE_PNBUF(cnp); return (error); } @@ -2665,40 +2843,48 @@ nfs_mkdir(ap) */ static int nfs_rmdir(ap) - struct vop_rmdir_args /* { - struct vnode *a_dvp; - struct vnode *a_vp; + struct vnop_rmdir_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t a_vp; struct componentname *a_cnp; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp = ap->a_vp; - register struct vnode *dvp = ap->a_dvp; - register struct componentname *cnp = ap->a_cnp; - register u_long *tl; - register caddr_t cp; - register long t1, t2; + vnode_t vp = ap->a_vp; + vnode_t dvp = ap->a_dvp; + struct componentname *cnp = ap->a_cnp; + u_long *tl; + caddr_t cp; + long t1, t2; caddr_t bpos, dpos, cp2; int error = 0, wccpostattr = 0; - time_t premtime = 0; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct timespec premtime = { 0, 0 }; + mbuf_t mreq, mrep, md, mb, mb2; int v3 = NFS_ISV3(dvp); u_int64_t xid; + kauth_cred_t cred; + proc_t p; - nfsstats.rpccnt[NFSPROC_RMDIR]++; - nfsm_reqhead(dvp, NFSPROC_RMDIR, - NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen)); + cred = vfs_context_ucred(ap->a_context); + p = vfs_context_proc(ap->a_context); + + nfsm_reqhead(NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen)); + if (error) + return (error); + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_RMDIR]); nfsm_fhtom(dvp, v3); - nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); - nfsm_request(dvp, NFSPROC_RMDIR, cnp->cn_proc, cnp->cn_cred, &xid); + nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN, v3); + nfsm_request(dvp, NFSPROC_RMDIR, p, cred, &xid); if (v3 && mrep) - nfsm_wcc_data(dvp, premtime, wccpostattr, &xid); + nfsm_wcc_data(dvp, &premtime, wccpostattr, &xid); nfsm_reqdone; VTONFS(dvp)->n_flag |= NMODIFIED; /* if directory hadn't changed, update namecache mtime */ - if (VTONFS(dvp)->n_ncmtime == premtime) - VTONFS(dvp)->n_ncmtime = VTONFS(dvp)->n_vattr.va_mtime.tv_sec; + if (nfstimespeccmp(&VTONFS(dvp)->n_ncmtime, &premtime, ==)) + VTONFS(dvp)->n_ncmtime = VTONFS(dvp)->n_vattr.nva_mtime; if (!wccpostattr) - VTONFS(dvp)->n_xid = 0; + NATTRINVALIDATE(VTONFS(dvp)); cache_purge(vp); /* * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry. 
@@ -2711,12 +2897,11 @@ nfs_rmdir(ap) * again if another object gets created with the same filehandle * before this vnode gets reclaimed */ + lck_mtx_lock(nfs_node_hash_mutex); LIST_REMOVE(VTONFS(vp), n_hash); VTONFS(vp)->n_flag &= ~NHASHED; + lck_mtx_unlock(nfs_node_hash_mutex); } - vput(vp); - vput(dvp); - NFS_FREE_PNBUF(cnp); return (error); } @@ -2725,36 +2910,41 @@ nfs_rmdir(ap) */ static int nfs_readdir(ap) - struct vop_readdir_args /* { - struct vnode *a_vp; + struct vnop_readdir_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; struct uio *a_uio; - struct ucred *a_cred; + int *a_eofflag; + int *a_ncookies; + u_long **a_cookies; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp = ap->a_vp; - register struct nfsnode *np = VTONFS(vp); - register struct uio *uio = ap->a_uio; + vnode_t vp = ap->a_vp; + struct nfsnode *np = VTONFS(vp); + struct uio *uio = ap->a_uio; int tresid, error; - struct vattr vattr; + struct nfs_vattr nvattr; + kauth_cred_t cred; + proc_t p; - if (vp->v_type != VDIR) + if (vnode_vtype(vp) != VDIR) return (EPERM); + + cred = vfs_context_ucred(ap->a_context); + p = vfs_context_proc(ap->a_context); + /* * First, check for hit on the EOF offset cache */ if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset && (np->n_flag & NMODIFIED) == 0) { - if (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) { - if (NQNFS_CKCACHABLE(vp, ND_READ)) { - nfsstats.direofcache_hits++; - return (0); - } - } else if (!VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_procp)) { - if (np->n_mtime == vattr.va_mtime.tv_sec) { - nfsstats.direofcache_hits++; + if (!nfs_getattr(vp, &nvattr, cred, p)) { + if (nfstimespeccmp(&np->n_mtime, &nvattr.nva_mtime, ==)) { + OSAddAtomic(1, (SInt32*)&nfsstats.direofcache_hits); return (0); } - if (np->n_ncmtime != vattr.va_mtime.tv_sec) { + if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=)) { /* directory changed, purge any name cache entries */ cache_purge(vp); } @@ -2764,11 +2954,12 @@ nfs_readdir(ap) /* * Call nfs_bioread() to do the real work. */ - tresid = uio->uio_resid; - error = nfs_bioread(vp, uio, 0, ap->a_cred, 0); + // LP64todo - fix this + tresid = uio_uio_resid(uio); + error = nfs_bioread(vp, uio, 0, cred, p); - if (!error && uio->uio_resid == tresid) - nfsstats.direofcache_misses++; + if (!error && uio_uio_resid(uio) == tresid) + OSAddAtomic(1, (SInt32*)&nfsstats.direofcache_misses); return (error); } @@ -2777,20 +2968,20 @@ nfs_readdir(ap) * Called from below the buffer cache by nfs_doio(). 
*/ int -nfs_readdirrpc(vp, uiop, cred) - struct vnode *vp; - register struct uio *uiop; - struct ucred *cred; - +nfs_readdirrpc( + vnode_t vp, + struct uio *uiop, + kauth_cred_t cred, + proc_t p) { - register int len, left; + register int len, skiplen, left; register struct dirent *dp; register u_long *tl; register caddr_t cp; register long t1, t2; register nfsuint64 *cookiep; caddr_t bpos, dpos, cp2; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; + mbuf_t mreq, mrep, md, mb, mb2; nfsuint64 cookie; struct nfsmount *nmp; struct nfsnode *dnp = VTONFS(vp); @@ -2805,10 +2996,10 @@ nfs_readdirrpc(vp, uiop, cred) #endif #if DIAGNOSTIC if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (NFS_DIRBLKSIZ - 1)) || - (uiop->uio_resid & (NFS_DIRBLKSIZ - 1))) + (uio_uio_resid(uiop) & (NFS_DIRBLKSIZ - 1))) panic("nfs_readdirrpc: bad uio"); #endif - nmp = VFSTONFS(vp->v_mount); + nmp = VFSTONFS(vnode_mount(vp)); if (!nmp) return (ENXIO); v3 = NFS_ISV3(vp); @@ -2828,9 +3019,10 @@ nfs_readdirrpc(vp, uiop, cred) * The stopping criteria is EOF or buffer full. */ while (more_dirs && bigenough) { - nfsstats.rpccnt[NFSPROC_READDIR]++; - nfsm_reqhead(vp, NFSPROC_READDIR, NFSX_FH(v3) + - NFSX_READDIR(v3)); + nfsm_reqhead(NFSX_FH(v3) + NFSX_READDIR(v3)); + if (error) + goto nfsmout; + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_READDIR]); nfsm_fhtom(vp, v3); if (v3) { nfsm_build(tl, u_long *, 5 * NFSX_UNSIGNED); @@ -2843,17 +3035,17 @@ nfs_readdirrpc(vp, uiop, cred) *tl++ = cookie.nfsuquad[0]; } *tl = txdr_unsigned(nmreaddirsize); - nfsm_request(vp, NFSPROC_READDIR, uiop->uio_procp, cred, &xid); + nfsm_request(vp, NFSPROC_READDIR, p, cred, &xid); if (v3) { if (mrep) { - nfsm_postop_attr(vp, attrflag, &xid); + nfsm_postop_attr_update(vp, v3, attrflag, &xid); } if (!error) { nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); dnp->n_cookieverf.nfsuquad[0] = *tl++; dnp->n_cookieverf.nfsuquad[1] = *tl; } else { - m_freem(mrep); + mbuf_freem(mrep); goto nfsmout; } } else if (!mrep) { @@ -2874,27 +3066,36 @@ nfs_readdirrpc(vp, uiop, cred) fileno = fxdr_unsigned(u_quad_t, *tl++); len = fxdr_unsigned(int, *tl); } - if (len <= 0 || len > NFS_MAXNAMLEN) { + /* Note: v3 supports longer names, but struct dirent doesn't */ + /* so we just truncate the names to fit */ + if (len <= 0) { error = EBADRPC; - m_freem(mrep); + mbuf_freem(mrep); goto nfsmout; } + if (len > MAXNAMLEN) { + skiplen = len - MAXNAMLEN; + len = MAXNAMLEN; + } else { + skiplen = 0; + } tlen = nfsm_rndup(len); if (tlen == len) tlen += 4; /* To ensure null termination */ left = DIRBLKSIZ - blksiz; - if ((tlen + DIRHDSIZ) > left) { + if ((tlen + (int)DIRHDSIZ) > left) { dp->d_reclen += left; - uiop->uio_iov->iov_base += left; - uiop->uio_iov->iov_len -= left; + uio_iov_base_add(uiop, left); + uio_iov_len_add(uiop, -left); uiop->uio_offset += left; - uiop->uio_resid -= left; + uio_uio_resid_add(uiop, -left); blksiz = 0; } - if ((tlen + DIRHDSIZ) > uiop->uio_resid) + if ((tlen + (int)DIRHDSIZ) > uio_uio_resid(uiop)) bigenough = 0; if (bigenough) { - dp = (struct dirent *)uiop->uio_iov->iov_base; + // LP64todo - fix this! 
+ dp = (struct dirent *) CAST_DOWN(caddr_t, uio_iov_base(uiop)); dp->d_fileno = (int)fileno; dp->d_namlen = len; dp->d_reclen = tlen + DIRHDSIZ; @@ -2903,19 +3104,28 @@ nfs_readdirrpc(vp, uiop, cred) if (blksiz == DIRBLKSIZ) blksiz = 0; uiop->uio_offset += DIRHDSIZ; - uiop->uio_resid -= DIRHDSIZ; - uiop->uio_iov->iov_base += DIRHDSIZ; - uiop->uio_iov->iov_len -= DIRHDSIZ; +#if LP64KERN + uio_uio_resid_add(uiop, -((int64_t)DIRHDSIZ)); + uio_iov_len_add(uiop, -((int64_t)DIRHDSIZ)); +#else + uio_uio_resid_add(uiop, -((int)DIRHDSIZ)); + uio_iov_len_add(uiop, -((int)DIRHDSIZ)); +#endif + uio_iov_base_add(uiop, DIRHDSIZ); nfsm_mtouio(uiop, len); - cp = uiop->uio_iov->iov_base; + // LP64todo - fix this! + cp = CAST_DOWN(caddr_t, uio_iov_base(uiop)); tlen -= len; *cp = '\0'; /* null terminate */ - uiop->uio_iov->iov_base += tlen; - uiop->uio_iov->iov_len -= tlen; + uio_iov_base_add(uiop, tlen); + uio_iov_len_add(uiop, -tlen); uiop->uio_offset += tlen; - uiop->uio_resid -= tlen; - } else + uio_uio_resid_add(uiop, -tlen); + } else { nfsm_adv(nfsm_rndup(len)); + } + if (skiplen) + nfsm_adv(nfsm_rndup(skiplen)); if (v3) { nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED); } else { @@ -2938,7 +3148,7 @@ nfs_readdirrpc(vp, uiop, cred) nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); more_dirs = (fxdr_unsigned(int, *tl) == 0); } - m_freem(mrep); + mbuf_freem(mrep); } /* * Fill last record, iff any, out to a multiple of DIRBLKSIZ @@ -2947,10 +3157,10 @@ nfs_readdirrpc(vp, uiop, cred) if (blksiz > 0) { left = DIRBLKSIZ - blksiz; dp->d_reclen += left; - uiop->uio_iov->iov_base += left; - uiop->uio_iov->iov_len -= left; + uio_iov_base_add(uiop, left); + uio_iov_len_add(uiop, -left); uiop->uio_offset += left; - uiop->uio_resid -= left; + uio_uio_resid_add(uiop, -left); } /* @@ -2960,10 +3170,11 @@ nfs_readdirrpc(vp, uiop, cred) if (bigenough) dnp->n_direofoffset = uiop->uio_offset; else { - if (uiop->uio_resid > 0) + if (uio_uio_resid(uiop) > 0) printf("EEK! readdirrpc resid > 0\n"); cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1); - *cookiep = cookie; + if (cookiep) + *cookiep = cookie; } nfsmout: return (error); @@ -2973,46 +3184,47 @@ nfsmout: * NFS V3 readdir plus RPC. Used in place of nfs_readdirrpc(). 
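Both readdir hunks replace the old EBADRPC-on-long-name behavior with truncation: NFSv3 allows names longer than struct dirent can hold, so the client keeps the first MAXNAMLEN bytes and advances past the remainder (skiplen). The arithmetic as a standalone sketch, assuming MAXNAMLEN is 255 and that nfsm_rndup rounds to XDR's 4-byte boundary:

    #include <stdio.h>

    #define MAXNAMLEN 255
    #define NFSM_RNDUP(n) (((n) + 3) & ~3)   /* XDR 4-byte alignment */

    /*
     * Given an on-the-wire name length, compute:
     *  - len:     bytes kept (truncated to what struct dirent holds)
     *  - skiplen: wire bytes to skip after copying the kept part
     *  - tlen:    space the name occupies in the dirent, padded for XDR
     *             alignment, plus 4 bytes when the rounded size would
     *             otherwise leave no room for the terminating NUL
     */
    static void
    name_lengths(int wire_len, int *len, int *skiplen, int *tlen)
    {
        *len = wire_len;
        *skiplen = 0;
        if (*len > MAXNAMLEN) {
            *skiplen = *len - MAXNAMLEN;
            *len = MAXNAMLEN;
        }
        *tlen = NFSM_RNDUP(*len);
        if (*tlen == *len)
            *tlen += 4;                  /* ensure room for '\0' */
    }

    int main(void)
    {
        int len, skiplen, tlen;
        name_lengths(300, &len, &skiplen, &tlen);
        printf("len=%d skiplen=%d tlen=%d\n", len, skiplen, tlen);
        /* prints: len=255 skiplen=45 tlen=256 */
        return 0;
    }

The +4 when the rounded length equals len matches the "To ensure null termination" comment: without it the name would exactly fill its slot, leaving no room for the NUL the code writes at *cp.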
*/ int -nfs_readdirplusrpc(vp, uiop, cred) - struct vnode *vp; - register struct uio *uiop; - struct ucred *cred; +nfs_readdirplusrpc( + vnode_t vp, + struct uio *uiop, + kauth_cred_t cred, + proc_t p) { - register int len, left; - register struct dirent *dp; - register u_long *tl; - register caddr_t cp; - register long t1, t2; - register struct vnode *newvp; - register nfsuint64 *cookiep; - caddr_t bpos, dpos, cp2, dpossav1, dpossav2; - struct mbuf *mreq, *mrep, *md, *mb, *mb2, *mdsav1, *mdsav2; - struct nameidata nami, *ndp = &nami; - struct componentname *cnp = &ndp->ni_cnd; + int len, skiplen, left; + struct dirent *dp; + u_long *tl; + caddr_t cp; + long t1, t2; + vnode_t newvp; + nfsuint64 *cookiep; + caddr_t bpos, dpos, cp2; + mbuf_t mreq, mrep, md, mb, mb2; + struct componentname cn, *cnp = &cn; nfsuint64 cookie; struct nfsmount *nmp; struct nfsnode *dnp = VTONFS(vp), *np; - nfsfh_t *fhp; + u_char *fhp; u_quad_t fileno; int error = 0, tlen, more_dirs = 1, blksiz = 0, doit, bigenough = 1, i; int attrflag, fhsize, nmreaddirsize, nmrsize; u_int64_t xid, savexid; + struct nfs_vattr nvattr; #ifndef nolint dp = (struct dirent *)0; #endif #if DIAGNOSTIC if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) || - (uiop->uio_resid & (DIRBLKSIZ - 1))) + (uio_uio_resid(uiop) & (DIRBLKSIZ - 1))) panic("nfs_readdirplusrpc: bad uio"); #endif - nmp = VFSTONFS(vp->v_mount); + nmp = VFSTONFS(vnode_mount(vp)); if (!nmp) return (ENXIO); nmreaddirsize = nmp->nm_readdirsize; nmrsize = nmp->nm_rsize; - ndp->ni_dvp = vp; + bzero(cnp, sizeof(*cnp)); newvp = NULLVP; /* @@ -3029,9 +3241,10 @@ nfs_readdirplusrpc(vp, uiop, cred) * The stopping criteria is EOF or buffer full. */ while (more_dirs && bigenough) { - nfsstats.rpccnt[NFSPROC_READDIRPLUS]++; - nfsm_reqhead(vp, NFSPROC_READDIRPLUS, - NFSX_FH(1) + 6 * NFSX_UNSIGNED); + nfsm_reqhead(NFSX_FH(1) + 6 * NFSX_UNSIGNED); + if (error) + goto nfsmout; + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_READDIRPLUS]); nfsm_fhtom(vp, 1); nfsm_build(tl, u_long *, 6 * NFSX_UNSIGNED); *tl++ = cookie.nfsuquad[0]; @@ -3040,14 +3253,13 @@ nfs_readdirplusrpc(vp, uiop, cred) *tl++ = dnp->n_cookieverf.nfsuquad[1]; *tl++ = txdr_unsigned(nmreaddirsize); *tl = txdr_unsigned(nmrsize); - nfsm_request(vp, NFSPROC_READDIRPLUS, uiop->uio_procp, cred, - &xid); + nfsm_request(vp, NFSPROC_READDIRPLUS, p, cred, &xid); savexid = xid; if (mrep) { - nfsm_postop_attr(vp, attrflag, &xid); + nfsm_postop_attr_update(vp, 1, attrflag, &xid); } if (error) { - m_freem(mrep); + mbuf_freem(mrep); goto nfsmout; } nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED); @@ -3060,27 +3272,36 @@ nfs_readdirplusrpc(vp, uiop, cred) nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED); fxdr_hyper(tl, &fileno); len = fxdr_unsigned(int, *(tl + 2)); - if (len <= 0 || len > NFS_MAXNAMLEN) { + /* Note: v3 supports longer names, but struct dirent doesn't */ + /* so we just truncate the names to fit */ + if (len <= 0) { error = EBADRPC; - m_freem(mrep); + mbuf_freem(mrep); goto nfsmout; } + if (len > MAXNAMLEN) { + skiplen = len - MAXNAMLEN; + len = MAXNAMLEN; + } else { + skiplen = 0; + } tlen = nfsm_rndup(len); if (tlen == len) tlen += 4; /* To ensure null termination*/ left = DIRBLKSIZ - blksiz; - if ((tlen + DIRHDSIZ) > left) { + if ((tlen + (int)DIRHDSIZ) > left) { dp->d_reclen += left; - uiop->uio_iov->iov_base += left; - uiop->uio_iov->iov_len -= left; + uio_iov_base_add(uiop, left); + uio_iov_len_add(uiop, -left); uiop->uio_offset += left; - uiop->uio_resid -= left; + uio_uio_resid_add(uiop, -left); blksiz = 
0; } - if ((tlen + DIRHDSIZ) > uiop->uio_resid) + if ((tlen + (int)DIRHDSIZ) > uio_uio_resid(uiop)) bigenough = 0; if (bigenough) { - dp = (struct dirent *)uiop->uio_iov->iov_base; + // LP64todo - fix this! + dp = (struct dirent *) CAST_DOWN(caddr_t, uio_iov_base(uiop)); dp->d_fileno = (int)fileno; dp->d_namlen = len; dp->d_reclen = tlen + DIRHDSIZ; @@ -3089,21 +3310,30 @@ nfs_readdirplusrpc(vp, uiop, cred) if (blksiz == DIRBLKSIZ) blksiz = 0; uiop->uio_offset += DIRHDSIZ; - uiop->uio_resid -= DIRHDSIZ; - uiop->uio_iov->iov_base += DIRHDSIZ; - uiop->uio_iov->iov_len -= DIRHDSIZ; - cnp->cn_nameptr = uiop->uio_iov->iov_base; +#if LP64KERN + uio_uio_resid_add(uiop, -((int64_t)DIRHDSIZ)); + uio_iov_len_add(uiop, -((int64_t)DIRHDSIZ)); +#else + uio_uio_resid_add(uiop, -((int)DIRHDSIZ)); + uio_iov_len_add(uiop, -((int)DIRHDSIZ)); +#endif + uio_iov_base_add(uiop, DIRHDSIZ); + // LP64todo - fix this! + cnp->cn_nameptr = CAST_DOWN(caddr_t, uio_iov_base(uiop)); cnp->cn_namelen = len; nfsm_mtouio(uiop, len); - cp = uiop->uio_iov->iov_base; + cp = CAST_DOWN(caddr_t, uio_iov_base(uiop)); tlen -= len; *cp = '\0'; - uiop->uio_iov->iov_base += tlen; - uiop->uio_iov->iov_len -= tlen; + uio_iov_base_add(uiop, tlen); + uio_iov_len_add(uiop, -tlen); uiop->uio_offset += tlen; - uiop->uio_resid -= tlen; - } else + uio_uio_resid_add(uiop, -tlen); + } else { nfsm_adv(nfsm_rndup(len)); + } + if (skiplen) + nfsm_adv(nfsm_rndup(skiplen)); nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED); if (bigenough) { cookie.nfsuquad[0] = *tl++; @@ -3118,56 +3348,54 @@ nfs_readdirplusrpc(vp, uiop, cred) */ attrflag = fxdr_unsigned(int, *tl); if (attrflag) { - dpossav1 = dpos; - mdsav1 = md; - nfsm_adv(NFSX_V3FATTR); + /* grab attributes */ + nfsm_attr_get(1, &nvattr); + dp->d_type = IFTODT(VTTOIF(nvattr.nva_type)); + /* check for file handle */ nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); doit = fxdr_unsigned(int, *tl); if (doit) { nfsm_getfh(fhp, fhsize, 1); if (NFS_CMPFH(dnp, fhp, fhsize)) { - VREF(vp); - newvp = vp; - np = dnp; + error = vnode_ref(vp); + if (error) { + doit = 0; + } else { + newvp = vp; + np = dnp; + } } else if (!bigenough || (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')) { /* + * XXXmacko I don't think this ".." thing is a problem anymore. * don't doit if we can't guarantee * that this entry is NOT ".." because * we would have to drop the lock on * the directory before getting the - * (lock on) the ".." vnode... and we + * lock on the ".." vnode... and we * don't want to drop the dvp lock in * the middle of a readdirplus. 
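With readdirplus the entry's attributes now arrive inline, so the hunk above fills d_type directly via IFTODT(VTTOIF(nvattr.nva_type)) instead of the old save/restore dance through the mbuf chain. The mapping itself is just the classic mode-bit encoding; a sketch with the traditional BSD DT_* values restated locally rather than pulled from a header:

    #include <stdio.h>

    /* classic encoding: d_type is the S_IFMT file-type field shifted down 12 */
    #define MY_IFTODT(mode) (((mode) & 0170000) >> 12)

    enum { MY_DT_FIFO = 1, MY_DT_CHR = 2, MY_DT_DIR = 4,
           MY_DT_BLK = 6, MY_DT_REG = 8, MY_DT_LNK = 10, MY_DT_SOCK = 12 };

    int main(void)
    {
        /* S_IFDIR is 0040000; 0040000 >> 12 == 4 == DT_DIR */
        printf("dir  -> %d\n", MY_IFTODT(0040000));
        /* S_IFREG is 0100000; 0100000 >> 12 == 8 == DT_REG */
        printf("file -> %d\n", MY_IFTODT(0100000));
        return 0;
    }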
*/ doit = 0; } else { - if ((error = nfs_nget(vp->v_mount, fhp, - fhsize, &np))) + cnp->cn_hash = 0; + + error = nfs_nget(vnode_mount(vp), vp, cnp, + fhp, fhsize, &nvattr, &xid, + NG_MAKEENTRY, &np); + if (error) doit = 0; else newvp = NFSTOV(np); } } - if (doit && bigenough) { - dpossav2 = dpos; - dpos = dpossav1; - mdsav2 = md; - md = mdsav1; + /* update attributes if not already updated */ + if (doit && bigenough && (np->n_xid <= savexid)) { xid = savexid; - nfsm_loadattr(newvp, (struct vattr *)0, &xid); - dpos = dpossav2; - md = mdsav2; - dp->d_type = - IFTODT(VTTOIF(np->n_vattr.va_type)); - ndp->ni_vp = newvp; - cnp->cn_hash = 0; - for (cp = cnp->cn_nameptr, i = 1; i <= len; - i++, cp++) - cnp->cn_hash += (unsigned char)*cp * i; - cache_enter(ndp->ni_dvp, ndp->ni_vp, cnp); + nfs_loadattrcache(np, &nvattr, &xid, 0); + /* any error can be ignored */ } } else { /* Just skip over the file handle */ @@ -3177,9 +3405,9 @@ nfs_readdirplusrpc(vp, uiop, cred) } if (newvp != NULLVP) { if (newvp == vp) - vrele(newvp); + vnode_rele(newvp); else - vput(newvp); + vnode_put(newvp); newvp = NULLVP; } nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); @@ -3192,7 +3420,7 @@ nfs_readdirplusrpc(vp, uiop, cred) nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); more_dirs = (fxdr_unsigned(int, *tl) == 0); } - m_freem(mrep); + mbuf_freem(mrep); } /* * Fill last record, iff any, out to a multiple of NFS_DIRBLKSIZ @@ -3201,10 +3429,10 @@ nfs_readdirplusrpc(vp, uiop, cred) if (blksiz > 0) { left = DIRBLKSIZ - blksiz; dp->d_reclen += left; - uiop->uio_iov->iov_base += left; - uiop->uio_iov->iov_len -= left; + uio_iov_base_add(uiop, left); + uio_iov_len_add(uiop, -left); uiop->uio_offset += left; - uiop->uio_resid -= left; + uio_uio_resid_add(uiop, -left); } /* @@ -3214,19 +3442,13 @@ nfs_readdirplusrpc(vp, uiop, cred) if (bigenough) dnp->n_direofoffset = uiop->uio_offset; else { - if (uiop->uio_resid > 0) + if (uio_uio_resid(uiop) > 0) printf("EEK! 
readdirplusrpc resid > 0\n"); cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1); - *cookiep = cookie; + if (cookiep) + *cookiep = cookie; } nfsmout: - if (newvp != NULLVP) { - if (newvp == vp) - vrele(newvp); - else - vput(newvp); - newvp = NULLVP; - } return (error); } @@ -3244,37 +3466,44 @@ nfsmout: static char sillyrename_name[] = ".nfsAAA%04x4.4"; static int -nfs_sillyrename(dvp, vp, cnp) - struct vnode *dvp, *vp; - struct componentname *cnp; +nfs_sillyrename( + vnode_t dvp, + vnode_t vp, + struct componentname *cnp, + kauth_cred_t cred, + proc_t p) { register struct sillyrename *sp; struct nfsnode *np; int error; short pid; - struct ucred *cred; + kauth_cred_t tmpcred; int i, j, k; cache_purge(vp); np = VTONFS(vp); #if DIAGNOSTIC - if (vp->v_type == VDIR) + if (vnode_vtype(vp) == VDIR) panic("nfs_sillyrename: dir"); #endif MALLOC_ZONE(sp, struct sillyrename *, sizeof (struct sillyrename), M_NFSREQ, M_WAITOK); - sp->s_cred = crdup(cnp->cn_cred); + if (!sp) + return (ENOMEM); + kauth_cred_ref(cred); + sp->s_cred = cred; sp->s_dvp = dvp; - VREF(dvp); + error = vnode_ref(dvp); + if (error) + goto bad_norele; /* Fudge together a funny name */ - pid = cnp->cn_proc->p_pid; + pid = proc_pid(p); sp->s_namlen = sprintf(sp->s_name, sillyrename_name, pid); /* Try lookitups until we get one that isn't there */ i = j = k = 0; - while (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, - cnp->cn_proc, (struct nfsnode **)0) == 0) { + while (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, p, NULL) == 0) { if (sp->s_name[4]++ >= 'z') sp->s_name[4] = 'A'; if (++i > ('z' - 'A' + 1)) { @@ -3302,10 +3531,11 @@ nfs_sillyrename(dvp, vp, cnp) } } /* now, do the rename */ - if ((error = nfs_renameit(dvp, cnp, sp))) + error = nfs_renamerpc(dvp, cnp->cn_nameptr, cnp->cn_namelen, + dvp, sp->s_name, sp->s_namlen, sp->s_cred, p); + if (error) goto bad; - error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, - cnp->cn_proc, &np); + error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, p, &np); #if DIAGNOSTIC kprintf("sillyrename: %s, vp=%x, np=%x, dvp=%x\n", &sp->s_name[0], (unsigned)vp, (unsigned)np, (unsigned)dvp); @@ -3313,10 +3543,11 @@ nfs_sillyrename(dvp, vp, cnp) np->n_sillyrename = sp; return (0); bad: - vrele(sp->s_dvp); - cred = sp->s_cred; + vnode_rele(sp->s_dvp); +bad_norele: + tmpcred = sp->s_cred; sp->s_cred = NOCRED; - crfree(cred); + kauth_cred_rele(tmpcred); FREE_ZONE((caddr_t)sp, sizeof (struct sillyrename), M_NFSREQ); return (error); } @@ -3331,81 +3562,119 @@ bad: */ static int nfs_lookitup(dvp, name, len, cred, procp, npp) - register struct vnode *dvp; + vnode_t dvp; char *name; int len; - struct ucred *cred; - struct proc *procp; + kauth_cred_t cred; + proc_t procp; struct nfsnode **npp; { - register u_long *tl; - register caddr_t cp; - register long t1, t2; - struct vnode *newvp = (struct vnode *)0; + u_long *tl; + caddr_t cp; + long t1, t2; + vnode_t newvp = (vnode_t)0; struct nfsnode *np, *dnp = VTONFS(dvp); caddr_t bpos, dpos, cp2; int error = 0, fhlen, attrflag; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; - nfsfh_t *nfhp; + mbuf_t mreq, mrep, md, mb, mb2; + u_char *nfhp; int v3; - u_int64_t xid; + u_int64_t xid, dxid, savedxid; + struct nfs_vattr nvattr; - if (!VFSTONFS(dvp->v_mount)) + if (!VFSTONFS(vnode_mount(dvp))) return (ENXIO); v3 = NFS_ISV3(dvp); - nfsstats.rpccnt[NFSPROC_LOOKUP]++; - nfsm_reqhead(dvp, NFSPROC_LOOKUP, - NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len)); + nfsm_reqhead(NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len)); + if (error) + 
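The sillyrename path above builds candidate names from the template ".nfsAAA%04x4.4" and the caller's pid, then probes with nfs_lookitup until it finds one the directory doesn't contain, bumping the letter at index 4 (and, in the elided part of the loop, the following letters) on each collision. A userspace sketch of the generator, with the existence probe stubbed out and a hypothetical pid:

    #include <stdbool.h>
    #include <stdio.h>

    /* stub: in the kernel this is a LOOKUP rpc via nfs_lookitup() */
    static bool name_exists(const char *name) { (void)name; return false; }

    int main(void)
    {
        char name[32];
        unsigned pid = 0x1a2b;          /* hypothetical caller pid */
        int i = 0;

        snprintf(name, sizeof(name), ".nfsAAA%04x4.4", pid);
        /* bump the first letter (name[4]) through 'A'..'z' on collisions;
         * the real code goes on to vary the next letters the same way */
        while (name_exists(name)) {
            if (name[4]++ >= 'z')
                name[4] = 'A';
            if (++i > ('z' - 'A' + 1))
                break;                  /* kernel falls through to more letters */
        }
        printf("silly name: %s\n", name);   /* .nfsAAA1a2b4.4 */
        return 0;
    }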
return (error); + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_LOOKUP]); nfsm_fhtom(dvp, v3); - nfsm_strtom(name, len, NFS_MAXNAMLEN); + nfsm_strtom(name, len, NFS_MAXNAMLEN, v3); nfsm_request(dvp, NFSPROC_LOOKUP, procp, cred, &xid); if (npp && !error) { + savedxid = xid; nfsm_getfh(nfhp, fhlen, v3); + /* get attributes */ + if (v3) { + nfsm_postop_attr_get(v3, attrflag, &nvattr); + if (!attrflag) { + /* We need valid attributes in order */ + /* to call nfs_nget/vnode_create(). */ + error = nfs_getattr_no_vnode(vnode_mount(dvp), + nfhp, fhlen, cred, procp, &nvattr, &xid); + if (error) { + mbuf_freem(mrep); + goto nfsmout; + } + } + dxid = savedxid; + nfsm_postop_attr_update(dvp, v3, attrflag, &dxid); + } else { + nfsm_attr_get(v3, &nvattr); + } if (*npp) { np = *npp; - if (np->n_fhsize > NFS_SMALLFH && fhlen <= NFS_SMALLFH) { - FREE_ZONE((caddr_t)np->n_fhp, - np->n_fhsize, M_NFSBIGFH); - np->n_fhp = &np->n_fh; - } else if (np->n_fhsize <= NFS_SMALLFH && fhlen>NFS_SMALLFH) - MALLOC_ZONE(np->n_fhp, nfsfh_t *, - fhlen, M_NFSBIGFH, M_WAITOK); - bcopy((caddr_t)nfhp, (caddr_t)np->n_fhp, fhlen); + if (fhlen != np->n_fhsize) { + u_char *oldbuf = (np->n_fhsize > NFS_SMALLFH) ? np->n_fhp : NULL; + if (fhlen > NFS_SMALLFH) { + MALLOC_ZONE(np->n_fhp, u_char *, fhlen, M_NFSBIGFH, M_WAITOK); + if (!np->n_fhp) { + np->n_fhp = oldbuf; + error = ENOMEM; + mbuf_freem(mrep); + goto nfsmout; + } + } else { + np->n_fhp = &np->n_fh[0]; + } + if (oldbuf) { + FREE_ZONE(oldbuf, np->n_fhsize, M_NFSBIGFH); + } + } + bcopy(nfhp, np->n_fhp, fhlen); np->n_fhsize = fhlen; newvp = NFSTOV(np); + error = nfs_loadattrcache(np, &nvattr, &xid, 0); + if (error) { + mbuf_freem(mrep); + goto nfsmout; + } } else if (NFS_CMPFH(dnp, nfhp, fhlen)) { - VREF(dvp); newvp = dvp; + if (dnp->n_xid <= savedxid) { + dxid = savedxid; + error = nfs_loadattrcache(dnp, &nvattr, &dxid, 0); + if (error) { + mbuf_freem(mrep); + goto nfsmout; + } + } } else { - error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np); + struct componentname cn, *cnp = &cn; + bzero(cnp, sizeof(*cnp)); + cnp->cn_nameptr = name; + cnp->cn_namelen = len; + + error = nfs_nget(vnode_mount(dvp), dvp, cnp, nfhp, fhlen, + &nvattr, &xid, NG_MAKEENTRY, &np); if (error) { - m_freem(mrep); + mbuf_freem(mrep); return (error); } newvp = NFSTOV(np); } - if (v3) { - nfsm_postop_attr(newvp, attrflag, &xid); - if (!attrflag && *npp == NULL) { - m_freem(mrep); - if (newvp == dvp) - vrele(newvp); - else - vput(newvp); - return (ENOENT); - } - } else - nfsm_loadattr(newvp, (struct vattr *)0, &xid); } nfsm_reqdone; if (npp && *npp == NULL) { if (error) { - if (newvp) + if (newvp) { if (newvp == dvp) - vrele(newvp); + vnode_rele(newvp); else - vput(newvp); + vnode_put(newvp); + } } else *npp = np; } @@ -3416,38 +3685,40 @@ nfs_lookitup(dvp, name, len, cred, procp, npp) * Nfs Version 3 commit rpc */ int -nfs_commit(vp, offset, cnt, cred, procp) - register struct vnode *vp; +nfs_commit(vp, offset, count, cred, procp) + vnode_t vp; u_quad_t offset; - int cnt; - struct ucred *cred; - struct proc *procp; + u_int32_t count; + kauth_cred_t cred; + proc_t procp; { - register caddr_t cp; - register u_long *tl; - register int t1, t2; - register struct nfsmount *nmp = VFSTONFS(vp->v_mount); + caddr_t cp; + u_long *tl; + int t1, t2; + struct nfsmount *nmp = VFSTONFS(vnode_mount(vp)); caddr_t bpos, dpos, cp2; int error = 0, wccpostattr = 0; - time_t premtime = 0; - struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct timespec premtime = { 0, 0 }; + mbuf_t mreq, mrep, md, mb, mb2; u_int64_t xid; - FSDBG(521, 
vp, offset, cnt, nmp->nm_state); + FSDBG(521, vp, offset, count, nmp->nm_state); if (!nmp) return (ENXIO); if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) return (0); - nfsstats.rpccnt[NFSPROC_COMMIT]++; - nfsm_reqhead(vp, NFSPROC_COMMIT, NFSX_FH(1)); + nfsm_reqhead(NFSX_FH(1)); + if (error) + return (error); + OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[NFSPROC_COMMIT]); nfsm_fhtom(vp, 1); nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED); txdr_hyper(&offset, tl); tl += 2; - *tl = txdr_unsigned(cnt); + *tl = txdr_unsigned(count); nfsm_request(vp, NFSPROC_COMMIT, procp, cred, &xid); if (mrep) { - nfsm_wcc_data(vp, premtime, wccpostattr, &xid); + nfsm_wcc_data(vp, &premtime, wccpostattr, &xid); /* XXX can we do anything useful with the wcc info? */ } if (!error) { @@ -3464,34 +3735,19 @@ nfs_commit(vp, offset, cnt, cred, procp) } static int -nfs_bmap(ap) - struct vop_bmap_args /* { - struct vnode *a_vp; - daddr_t a_bn; - struct vnode **a_vpp; - daddr_t *a_bnp; - int *a_runp; - int *a_runb; - } */ *ap; +nfs_blockmap( + __unused struct vnop_blockmap_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + off_t a_foffset; + size_t a_size; + daddr64_t *a_bpn; + size_t *a_run; + void *a_poff; + int a_flags; + } */ *ap) { - register struct vnode *vp = ap->a_vp; - int devBlockSize = DEV_BSIZE; - - if (ap->a_vpp != NULL) - *ap->a_vpp = vp; - if (ap->a_bnp != NULL) { - if (!vp->v_mount) - return (ENXIO); - *ap->a_bnp = ap->a_bn * btodb(vp->v_mount->mnt_stat.f_iosize, - devBlockSize); - } - if (ap->a_runp != NULL) - *ap->a_runp = 0; -#ifdef notyet - if (ap->a_runb != NULL) - *ap->a_runb = 0; -#endif - return (0); + return (ENOTSUP); } /* @@ -3499,50 +3755,55 @@ nfs_bmap(ap) * * NB Currently unsupported. */ -/* ARGSUSED */ +/*ARGSUSED*/ static int -nfs_mmap(ap) - struct vop_mmap_args /* { - struct vnode *a_vp; - int a_fflags; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; +nfs_mmap( + __unused struct vnop_mmap_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + int a_fflags; + kauth_cred_t a_cred; + proc_t a_p; + } */ *ap) { return (EINVAL); } /* - * fsync vnode op. Just call nfs_flush(). + * fsync vnode op. Just call nfs_flush() with commit == 1. */ /* ARGSUSED */ static int nfs_fsync(ap) - struct vop_fsync_args /* { + struct vnop_fsync_args /* { struct vnodeop_desc *a_desc; - struct vnode * a_vp; - struct ucred * a_cred; - int a_waitfor; - struct proc * a_p; + vnode_t a_vp; + int a_waitfor; + vfs_context_t a_context; } */ *ap; { - return (nfs_flush(ap->a_vp, ap->a_cred, ap->a_waitfor, ap->a_p)); + kauth_cred_t cred = vfs_context_ucred(ap->a_context); + proc_t p = vfs_context_proc(ap->a_context); + struct nfsnode *np = VTONFS(ap->a_vp); + int error; + + np->n_flag |= NWRBUSY; + error = nfs_flush(ap->a_vp, ap->a_waitfor, cred, p, 0); + np->n_flag &= ~NWRBUSY; + return (error); } int -nfs_flushcommits(struct vnode *vp, struct proc *p) +nfs_flushcommits(vnode_t vp, proc_t p, int nowait) { struct nfsnode *np = VTONFS(vp); - struct nfsbuf *bp, *nbp; - int i, s, error = 0, retv, bvecpos, wcred_set; + struct nfsbuf *bp; + struct nfsbuflists blist, commitlist; + int error = 0, retv, wcred_set, flags; u_quad_t off, endoff, toff; - struct ucred* wcred; - struct nfsbuf **bvec = NULL; -#define NFS_COMMITBVECSIZ 20 -#define NFS_MAXCOMMITBVECSIZ 1024 - struct nfsbuf *bvec_on_stack[NFS_COMMITBVECSIZ]; - int bvecsize = NFS_MAXCOMMITBVECSIZ; + u_int32_t count; + kauth_cred_t wcred = NULL; FSDBG_TOP(557, vp, np, 0, 0); @@ -3552,15 +3813,15 @@ nfs_flushcommits(struct vnode *vp, struct proc *p) * yet. 
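nfs_commit, now taking a 32-bit count instead of an int, marshals a 64-bit offset followed by the count; txdr_hyper and txdr_unsigned reduce to big-endian word stores, with the high word of the hyper first per the XDR rules. A sketch of the three-word argument layout built by the nfsm_build above:

    #include <arpa/inet.h>   /* htonl */
    #include <stdint.h>
    #include <stdio.h>

    /* Encode COMMIT args: offset as two big-endian 32-bit words
     * (high word first, as XDR encodes a hyper), then the count. */
    static void
    encode_commit_args(uint32_t words[3], uint64_t offset, uint32_t count)
    {
        words[0] = htonl((uint32_t)(offset >> 32));          /* offset, high */
        words[1] = htonl((uint32_t)(offset & 0xffffffffu));  /* offset, low */
        words[2] = htonl(count);                             /* count */
    }

    int main(void)
    {
        uint32_t w[3];
        encode_commit_args(w, 0x100002000ULL, 8192);
        printf("%08x %08x %08x\n", w[0], w[1], w[2]);
        return 0;
    }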
The byte range is worked out for as many nfsbufs as we can handle * and the commit rpc is done. */ - if (np->n_dirtyblkhd.lh_first) + if (!LIST_EMPTY(&np->n_dirtyblkhd)) np->n_flag |= NMODIFIED; off = (u_quad_t)-1; endoff = 0; - bvecpos = 0; wcred_set = 0; + LIST_INIT(&commitlist); - if (!VFSTONFS(vp->v_mount)) { + if (!VFSTONFS(vnode_mount(vp))) { error = ENXIO; goto done; } @@ -3568,95 +3829,96 @@ nfs_flushcommits(struct vnode *vp, struct proc *p) error = EINVAL; goto done; } - s = splbio(); - /* - * Allocate space to remember the list of bufs to commit. It is - * important to use M_NOWAIT here to avoid a race with nfs_write - */ - MALLOC(bvec, struct nfsbuf **, - bvecsize * sizeof(struct nfsbuf *), M_TEMP, - M_NOWAIT); - if (bvec == NULL) { - bvec = bvec_on_stack; - bvecsize = NFS_COMMITBVECSIZ; - } - for (bp = np->n_dirtyblkhd.lh_first; bp && bvecpos < bvecsize; bp = nbp) { - nbp = bp->nb_vnbufs.le_next; - - if (((bp->nb_flags & (NB_BUSY | NB_DELWRI | NB_NEEDCOMMIT)) - != (NB_DELWRI | NB_NEEDCOMMIT))) - continue; - - nfs_buf_remfree(bp); - SET(bp->nb_flags, NB_BUSY); - /* - * we need a upl to see if the page has been - * dirtied (think mmap) since the unstable write, and - * also to prevent vm from paging it during our commit rpc - */ - if (!ISSET(bp->nb_flags, NB_PAGELIST)) { - retv = nfs_buf_upl_setup(bp); - if (retv) { - /* unable to create upl */ - /* vm object must no longer exist */ - /* this could be fatal if we need */ - /* to write the data again, we'll see... */ - printf("nfs_flushcommits: upl create failed %d\n", retv); - bp->nb_valid = bp->nb_dirty = 0; + flags = NBI_DIRTY; + if (nowait) + flags |= NBI_NOWAIT; + lck_mtx_lock(nfs_buf_mutex); + if (!nfs_buf_iterprepare(np, &blist, flags)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0); + if (error) + continue; + if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) + != (NB_DELWRI | NB_NEEDCOMMIT))) { + nfs_buf_drop(bp); + continue; } - } - nfs_buf_upl_check(bp); + nfs_buf_remfree(bp); + lck_mtx_unlock(nfs_buf_mutex); + /* + * we need a upl to see if the page has been + * dirtied (think mmap) since the unstable write, and + * also to prevent vm from paging it during our commit rpc + */ + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + retv = nfs_buf_upl_setup(bp); + if (retv) { + /* unable to create upl */ + /* vm object must no longer exist */ + /* this could be fatal if we need */ + /* to write the data again, we'll see... */ + printf("nfs_flushcommits: upl create failed %d\n", retv); + bp->nb_valid = bp->nb_dirty = 0; + } + } + nfs_buf_upl_check(bp); + lck_mtx_lock(nfs_buf_mutex); - FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty); - FSDBG(557, bp->nb_validoff, bp->nb_validend, - bp->nb_dirtyoff, bp->nb_dirtyend); + FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty); + FSDBG(557, bp->nb_validoff, bp->nb_validend, + bp->nb_dirtyoff, bp->nb_dirtyend); - /* - * We used to check for dirty pages here; if there were any - * we'd abort the commit and force the entire buffer to be - * written again. - * - * Instead of doing that, we now go ahead and commit the dirty - * range, and then leave the buffer around with dirty pages - * that will be written out later. - */ + /* + * We used to check for dirty pages here; if there were any + * we'd abort the commit and force the entire buffer to be + * written again. 
+ * + * Instead of doing that, we now go ahead and commit the dirty + * range, and then leave the buffer around with dirty pages + * that will be written out later. + */ - /* in case blocking calls were made, re-evaluate nbp */ - nbp = bp->nb_vnbufs.le_next; + /* + * Work out if all buffers are using the same cred + * so we can deal with them all with one commit. + * + * XXX creds in bp's must be obtained by kauth_cred_ref on + * the same original cred in order for them to be equal. + */ + if (wcred_set == 0) { + wcred = bp->nb_wcred; + if (wcred == NOCRED) + panic("nfs: needcommit w/out wcred"); + wcred_set = 1; + } else if ((wcred_set == 1) && wcred != bp->nb_wcred) { + wcred_set = -1; + } + SET(bp->nb_flags, NB_WRITEINPROG); - /* - * Work out if all buffers are using the same cred - * so we can deal with them all with one commit. - */ - if (wcred_set == 0) { - wcred = bp->nb_wcred; - if (wcred == NOCRED) - panic("nfs: needcommit w/out wcred"); - wcred_set = 1; - } else if ((wcred_set == 1) && crcmp(wcred, bp->nb_wcred)) { - wcred_set = -1; + /* + * A list of these buffers is kept so that the + * second loop knows which buffers have actually + * been committed. This is necessary, since there + * may be a race between the commit rpc and new + * uncommitted writes on the file. + */ + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs); + toff = NBOFF(bp) + bp->nb_dirtyoff; + if (toff < off) + off = toff; + toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff); + if (toff > endoff) + endoff = toff; } - SET(bp->nb_flags, NB_WRITEINPROG); - - /* - * A list of these buffers is kept so that the - * second loop knows which buffers have actually - * been committed. This is necessary, since there - * may be a race between the commit rpc and new - * uncommitted writes on the file. - */ - bvec[bvecpos++] = bp; - toff = NBOFF(bp) + bp->nb_dirtyoff; - if (toff < off) - off = toff; - toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff); - if (toff > endoff) - endoff = toff; + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); } - splx(s); + lck_mtx_unlock(nfs_buf_mutex); - if (bvecpos == 0) { + if (LIST_EMPTY(&commitlist)) { error = ENOBUFS; goto done; } @@ -3667,74 +3929,78 @@ nfs_flushcommits(struct vnode *vp, struct proc *p) * one call for all of them, otherwise commit each one * separately. */ - if (wcred_set == 1) - retv = nfs_commit(vp, off, (int)(endoff - off), wcred, p); - else { + if (wcred_set == 1) { + /* + * Note, it's possible the commit range could be >2^32-1. + * If it is, we'll send one commit that covers the whole file. + */ + if ((endoff - off) > 0xffffffff) + count = 0; + else + count = (endoff - off); + retv = nfs_commit(vp, off, count, wcred, p); + } else { retv = 0; - - for (i = 0; i < bvecpos; i++) { - off_t off, size; - bp = bvec[i]; - off = NBOFF(bp) + bp->nb_dirtyoff; - size = (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff); - retv = nfs_commit(vp, off, (int)size, bp->nb_wcred, p); - if (retv) break; + LIST_FOREACH(bp, &commitlist, nb_vnbufs) { + toff = NBOFF(bp) + bp->nb_dirtyoff; + count = bp->nb_dirtyend - bp->nb_dirtyoff; + retv = nfs_commit(vp, toff, count, bp->nb_wcred, p); + if (retv) + break; } } if (retv == NFSERR_STALEWRITEVERF) - nfs_clearcommit(vp->v_mount); + nfs_clearcommit(vnode_mount(vp)); /* * Now, either mark the blocks I/O done or mark the * blocks dirty, depending on whether the commit * succeeded. 
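The single-credential case above computes one [off, endoff) range covering every gathered buffer's dirty span, then clamps: the COMMIT count is 32 bits on the wire, so a range wider than 2^32-1 is sent as count 0, which NFSv3 defines as commit-to-end-of-file. The arithmetic over a hypothetical buffer list:

    #include <stdint.h>
    #include <stdio.h>

    struct dirty_buf {            /* stands in for struct nfsbuf */
        uint64_t file_off;        /* NBOFF(bp) */
        int      dirtyoff, dirtyend;
    };

    static void
    commit_range(const struct dirty_buf *bufs, int n,
        uint64_t *offp, uint32_t *countp)
    {
        uint64_t off = UINT64_MAX, endoff = 0, t;
        for (int i = 0; i < n; i++) {
            t = bufs[i].file_off + bufs[i].dirtyoff;
            if (t < off)
                off = t;
            t += (uint64_t)(bufs[i].dirtyend - bufs[i].dirtyoff);
            if (t > endoff)
                endoff = t;
        }
        *offp = off;
        /* count is 32-bit on the wire; 0 means "to end of file" */
        *countp = (endoff - off > 0xffffffffULL) ? 0
                : (uint32_t)(endoff - off);
    }

    int main(void)
    {
        struct dirty_buf b[2] = {
            { 0,              0, 4096 },
            { 0x200000000ULL, 0, 4096 },   /* dirty data 8GiB apart */
        };
        uint64_t off; uint32_t count;
        commit_range(b, 2, &off, &count);
        printf("off=%llu count=%u\n",
            (unsigned long long)off, (unsigned)count);
        /* prints: off=0 count=0  (range too wide for 32 bits) */
        return 0;
    }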
*/ - for (i = 0; i < bvecpos; i++) { - bp = bvec[i]; + while ((bp = LIST_FIRST(&commitlist))) { + LIST_REMOVE(bp, nb_vnbufs); FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty); - CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG)); - np->n_needcommitcnt--; CHECK_NEEDCOMMITCNT(np); if (retv) { + /* move back to dirty list */ + lck_mtx_lock(nfs_buf_mutex); + LIST_INSERT_HEAD(&VTONFS(vp)->n_dirtyblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); nfs_buf_release(bp, 1); - } else { - s = splbio(); - vp->v_numoutput++; + continue; + } - if (ISSET(bp->nb_flags, NB_DELWRI)) { - nfs_nbdwrite--; - NFSBUFCNTCHK(); - wakeup((caddr_t)&nfs_nbdwrite); - } - CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); - /* if block still has dirty pages, we don't want it to */ - /* be released in nfs_buf_iodone(). So, don't set NB_ASYNC. */ - if (!bp->nb_dirty) - SET(bp->nb_flags, NB_ASYNC); + vnode_startwrite(vp); + if (ISSET(bp->nb_flags, NB_DELWRI)) { + OSAddAtomic(-1, (SInt32*)&nfs_nbdwrite); + NFSBUFCNTCHK(0); + wakeup(&nfs_nbdwrite); + } + CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); + /* if block still has dirty pages, we don't want it to */ + /* be released in nfs_buf_iodone(). So, don't set NB_ASYNC. */ + if (!bp->nb_dirty) + SET(bp->nb_flags, NB_ASYNC); - /* move to clean list */ - if (bp->nb_vnbufs.le_next != NFSNOLIST) - LIST_REMOVE(bp, nb_vnbufs); - LIST_INSERT_HEAD(&VTONFS(vp)->n_cleanblkhd, bp, nb_vnbufs); + /* move to clean list */ + lck_mtx_lock(nfs_buf_mutex); + LIST_INSERT_HEAD(&VTONFS(vp)->n_cleanblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); - bp->nb_dirtyoff = bp->nb_dirtyend = 0; - splx(s); + bp->nb_dirtyoff = bp->nb_dirtyend = 0; - nfs_buf_iodone(bp); - if (bp->nb_dirty) { - /* throw it back in as a delayed write buffer */ - CLR(bp->nb_flags, NB_DONE); - nfs_buf_write_delayed(bp); - } + nfs_buf_iodone(bp); + if (bp->nb_dirty) { + /* throw it back in as a delayed write buffer */ + CLR(bp->nb_flags, NB_DONE); + nfs_buf_write_delayed(bp, p); } } done: - if (bvec != NULL && bvec != bvec_on_stack) - _FREE(bvec, M_TEMP); FSDBG_BOT(557, vp, np, 0, error); return (error); } @@ -3744,18 +4010,20 @@ done: * Walk through the buffer pool and push any dirty pages * associated with the vnode. */ -static int -nfs_flush(vp, cred, waitfor, p) - register struct vnode *vp; - struct ucred *cred; - int waitfor; - struct proc *p; +int +nfs_flush( + vnode_t vp, + int waitfor, + __unused kauth_cred_t cred, + proc_t p, + int ignore_writeerr) { struct nfsnode *np = VTONFS(vp); - struct nfsbuf *bp, *nbp; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - int i, s, error = 0, error2, slptimeo = 0, slpflag = 0; - int passone = 1; + struct nfsbuf *bp; + struct nfsbuflists blist; + struct nfsmount *nmp = VFSTONFS(vnode_mount(vp)); + int error = 0, error2, slptimeo = 0, slpflag = 0; + int flags, passone = 1; FSDBG_TOP(517, vp, np, waitfor, 0); @@ -3774,81 +4042,92 @@ nfs_flush(vp, cred, waitfor, p) * dirty buffers. Then wait for all writes to complete. */ again: - FSDBG(518, np->n_dirtyblkhd.lh_first, np->n_flag, 0, 0); - if (np->n_dirtyblkhd.lh_first) + lck_mtx_lock(nfs_buf_mutex); + FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0); + if (!LIST_EMPTY(&np->n_dirtyblkhd)) np->n_flag |= NMODIFIED; - if (!VFSTONFS(vp->v_mount)) { + if (!VFSTONFS(vnode_mount(vp))) { + lck_mtx_unlock(nfs_buf_mutex); error = ENXIO; goto done; } /* Start/do any write(s) that are required. 
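The write loop that follows is the first half of a two-pass scheme: pass one issues asynchronous writes but deliberately skips NB_NEEDCOMMIT buffers (those are cheaper to finish with COMMIT), then after the writes drain nfs_flushcommits handles what was skipped; anything still dirty gets a second pass with NB_STABLE set, which forces FILESYNC. A compressed, runnable outline of that control flow, with the locking and buffer-list iteration stubbed out:

    #include <stdbool.h>
    #include <stdio.h>

    /* stubs standing in for the real buffer machinery */
    static void write_dirty_buffers(bool skip_needcommit, bool stable)
    {
        printf("write pass: skip_needcommit=%d stable=%d\n",
            skip_needcommit, stable);
    }
    static void wait_for_writes(void) { puts("wait for writes"); }
    static void flush_commits(void)   { puts("COMMIT the needcommit buffers"); }

    /* Outline of nfs_flush(): async pass, commit, then a FILESYNC pass. */
    static void
    flush(void)
    {
        for (int passone = 1; passone >= 0; passone--) {
            /* pass one skips NEEDCOMMIT buffers and writes async;
             * pass two forces FILESYNC via the NB_STABLE flag */
            write_dirty_buffers(/*skip_needcommit=*/passone != 0,
                /*stable=*/passone == 0);
            wait_for_writes();
            if (passone)
                flush_commits();
        }
    }

    int main(void) { flush(); return 0; }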
*/ -loop: - s = splbio(); - for (bp = np->n_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->nb_vnbufs.le_next; - if (ISSET(bp->nb_flags, NB_BUSY)) { - FSDBG(524, bp, waitfor, passone, bp->nb_flags); - if (waitfor != MNT_WAIT || passone) - continue; - SET(bp->nb_flags, NB_WANTED); - error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1), - "nfsfsync", slptimeo); - splx(s); - if (error) { - error2 = nfs_sigintr(VFSTONFS(vp->v_mount), - (struct nfsreq *)0, p); - if (error2) { - error = error2; - goto done; - } - if (slpflag == PCATCH) { - slpflag = 0; - slptimeo = 2 * hz; + if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + flags = (passone || (waitfor != MNT_WAIT)) ? NBAC_NOWAIT : 0; + if (flags != NBAC_NOWAIT) + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) { + FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags); + if (error == EBUSY) + break; + if (error) { + error2 = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p); + if (error2) { + if (flags != NBAC_NOWAIT) + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + lck_mtx_unlock(nfs_buf_mutex); + error = error2; + goto done; + } + if (slpflag == PCATCH) { + slpflag = 0; + slptimeo = 2 * hz; + } } } - goto loop; - } - if (!ISSET(bp->nb_flags, NB_DELWRI)) - panic("nfs_fsync: not dirty"); - FSDBG(525, bp, passone, 0, bp->nb_flags); - if ((passone || (waitfor != MNT_WAIT)) && ISSET(bp->nb_flags, NB_NEEDCOMMIT)) - continue; - nfs_buf_remfree(bp); - if (ISSET(bp->nb_flags, NB_ERROR)) { - np->n_error = bp->nb_error ? bp->nb_error : EIO; - np->n_flag |= NWRITEERR; - nfs_buf_release(bp, 1); - continue; - } - if (passone) - SET(bp->nb_flags, NB_BUSY|NB_ASYNC); - else { - /* the NB_STABLE forces this to be written FILESYNC */ - SET(bp->nb_flags, NB_BUSY|NB_ASYNC|NB_STABLE); + if (flags != NBAC_NOWAIT) + nfs_buf_refrele(bp); + if (error == EBUSY) + continue; + if (!bp->nb_vp) { + /* buffer is no longer valid */ + nfs_buf_drop(bp); + continue; + } + if (!ISSET(bp->nb_flags, NB_DELWRI)) + panic("nfs_flush: not dirty"); + FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags); + if ((passone || (waitfor != MNT_WAIT)) && + ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + nfs_buf_drop(bp); + continue; + } + nfs_buf_remfree(bp); + lck_mtx_unlock(nfs_buf_mutex); + if (ISSET(bp->nb_flags, NB_ERROR)) { + np->n_error = bp->nb_error ? 
bp->nb_error : EIO; + np->n_flag |= NWRITEERR; + nfs_buf_release(bp, 1); + lck_mtx_lock(nfs_buf_mutex); + continue; + } + SET(bp->nb_flags, NB_ASYNC); + if (!passone) { + /* NB_STABLE forces this to be written FILESYNC */ + SET(bp->nb_flags, NB_STABLE); + } + nfs_buf_write(bp); + lck_mtx_lock(nfs_buf_mutex); } - splx(s); - nfs_buf_write(bp); - goto loop; + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); } - splx(s); + lck_mtx_unlock(nfs_buf_mutex); if (waitfor == MNT_WAIT) { - while (vp->v_numoutput) { - vp->v_flag |= VBWAIT; - error = tsleep((caddr_t)&vp->v_numoutput, - slpflag | (PRIBIO + 1), "nfsfsync", slptimeo); - if (error) { - error2 = nfs_sigintr(VFSTONFS(vp->v_mount), - (struct nfsreq *)0, p); - if (error2) { - error = error2; + while ((error = vnode_waitforwrites(vp, 0, slpflag, slptimeo, "nfsflush"))) { + error2 = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p); + if (error2) { + error = error2; goto done; - } - if (slpflag == PCATCH) { + } + if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; - } } } } @@ -3857,7 +4136,7 @@ loop: /* loop while it looks like there are still buffers to be */ /* commited and nfs_flushcommits() seems to be handling them. */ while (np->n_needcommitcnt) - if (nfs_flushcommits(vp, p)) + if (nfs_flushcommits(vp, p, 0)) break; } @@ -3866,14 +4145,12 @@ loop: goto again; } - if (waitfor == MNT_WAIT) { - if (np->n_dirtyblkhd.lh_first) { - goto again; - } + if ((waitfor == MNT_WAIT) && !LIST_EMPTY(&np->n_dirtyblkhd)) { + goto again; } FSDBG(526, np->n_flag, np->n_error, 0, 0); - if (np->n_flag & NWRITEERR) { + if (!ignore_writeerr && (np->n_flag & NWRITEERR)) { error = np->n_error; np->n_flag &= ~NWRITEERR; } @@ -3883,155 +4160,182 @@ done: } /* - * Return POSIX pathconf information applicable to nfs. - * - * The NFS V2 protocol doesn't support this, so just return EINVAL - * for V2. + * Do an nfs pathconf rpc. */ -/* ARGSUSED */ -static int -nfs_pathconf(ap) - struct vop_pathconf_args /* { - struct vnode *a_vp; - int a_name; - int *a_retval; - } */ *ap; +int +nfs_pathconfrpc( + vnode_t vp, + struct nfsv3_pathconf *pc, + kauth_cred_t cred, + proc_t procp) { + mbuf_t mreq, mrep, md, mb, mb2; + caddr_t bpos, dpos, cp, cp2; + int32_t t1, t2; + u_long *tl; + u_int64_t xid; + int attrflag, error = 0; + struct nfsv3_pathconf *mpc; - return (EINVAL); -} + /* fetch pathconf info from server */ + nfsm_reqhead(NFSX_FH(1)); + if (error) + return (error); + nfsm_fhtom(vp, 1); + nfsm_request(vp, NFSPROC_PATHCONF, procp, cred, &xid); + nfsm_postop_attr_update(vp, 1, attrflag, &xid); + if (!error) { + nfsm_dissect(mpc, struct nfsv3_pathconf *, NFSX_V3PATHCONF); + pc->pc_linkmax = fxdr_unsigned(long, mpc->pc_linkmax); + pc->pc_namemax = fxdr_unsigned(long, mpc->pc_namemax); + pc->pc_chownrestricted = fxdr_unsigned(long, mpc->pc_chownrestricted); + pc->pc_notrunc = fxdr_unsigned(long, mpc->pc_notrunc); + pc->pc_caseinsensitive = fxdr_unsigned(long, mpc->pc_caseinsensitive); + pc->pc_casepreserving = fxdr_unsigned(long, mpc->pc_casepreserving); + } + nfsm_reqdone; -/* - * NFS advisory byte-level locks (client) - */ -static int -nfs_advlock(ap) - struct vop_advlock_args /* { - struct vnode *a_vp; - caddr_t a_id; - int a_op; - struct flock *a_fl; - int a_flags; - } */ *ap; -{ - return (nfs_dolock(ap)); + return (error); } -/* - * Print out the contents of an nfsnode. 
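The new nfs_pathconfrpc pulls six big-endian words out of the PATHCONF3 reply; fxdr_unsigned is byte-order conversion. A standalone decode sketch, assuming the RFC 1813 field order (linkmax, name_max, no_trunc, chown_restricted, case_insensitive, case_preserving):

    #include <arpa/inet.h>   /* ntohl, htonl */
    #include <stdint.h>

    struct pathconf3 {       /* decoded NFSv3 PATHCONF results */
        uint32_t linkmax, namemax;
        uint32_t notrunc, chownrestricted, caseinsensitive, casepreserving;
    };

    /* Decode the six words of a PATHCONF3 reply body. */
    static void
    decode_pathconf(const uint32_t wire[6], struct pathconf3 *pc)
    {
        pc->linkmax         = ntohl(wire[0]);
        pc->namemax         = ntohl(wire[1]);
        pc->notrunc         = ntohl(wire[2]);
        pc->chownrestricted = ntohl(wire[3]);
        pc->caseinsensitive = ntohl(wire[4]);
        pc->casepreserving  = ntohl(wire[5]);
    }

    int main(void)
    {
        uint32_t wire[6] = { htonl(32767), htonl(255), htonl(1),
                             htonl(1), htonl(0), htonl(1) };
        struct pathconf3 pc;
        decode_pathconf(wire, &pc);
        return (int)pc.namemax - 255;   /* 0 on success */
    }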
- */ -static int -nfs_print(ap) - struct vop_print_args /* { - struct vnode *a_vp; - } */ *ap; +void +nfs_pathconf_cache(struct nfsmount *nmp, struct nfsv3_pathconf *pc) { - register struct vnode *vp = ap->a_vp; - register struct nfsnode *np = VTONFS(vp); - - printf("tag VT_NFS, fileid %ld fsid 0x%lx", - np->n_vattr.va_fileid, np->n_vattr.va_fsid); - if (vp->v_type == VFIFO) - fifo_printinfo(vp); - printf("\n"); - return (0); + nmp->nm_state |= NFSSTA_GOTPATHCONF; + nmp->nm_fsinfo.linkmax = pc->pc_linkmax; + nmp->nm_fsinfo.namemax = pc->pc_namemax; + nmp->nm_fsinfo.pcflags = 0; + if (pc->pc_notrunc) + nmp->nm_fsinfo.pcflags |= NFSPCINFO_NOTRUNC; + if (pc->pc_chownrestricted) + nmp->nm_fsinfo.pcflags |= NFSPCINFO_CHOWN_RESTRICTED; + if (pc->pc_caseinsensitive) + nmp->nm_fsinfo.pcflags |= NFSPCINFO_CASE_INSENSITIVE; + if (pc->pc_casepreserving) + nmp->nm_fsinfo.pcflags |= NFSPCINFO_CASE_PRESERVING; } /* - * NFS directory offset lookup. - * Currently unsupported. + * Return POSIX pathconf information applicable to nfs. + * + * The NFS V2 protocol doesn't support this, so just return EINVAL + * for V2. */ +/* ARGSUSED */ static int -nfs_blkatoff(ap) - struct vop_blkatoff_args /* { - struct vnode *a_vp; - off_t a_offset; - char **a_res; - struct buf **a_bpp; +nfs_pathconf(ap) + struct vnop_pathconf_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + int a_name; + register_t *a_retval; + vfs_context_t a_context; } */ *ap; { + vnode_t vp = ap->a_vp; + struct nfsmount *nmp; + struct nfsv3_pathconf pc; + int error = 0, cached; -#if DIAGNOSTIC - printf("nfs_blkatoff: unimplemented!!"); -#endif - return (EOPNOTSUPP); -} + nmp = VFSTONFS(vnode_mount(vp)); + if (!nmp) + return (ENXIO); + if (!NFS_ISV3(vp)) + return (EINVAL); -/* - * NFS flat namespace allocation. - * Currently unsupported. - */ -static int -nfs_valloc(ap) - struct vop_valloc_args /* { - struct vnode *a_pvp; - int a_mode; - struct ucred *a_cred; - struct vnode **a_vpp; - } */ *ap; -{ + switch (ap->a_name) { + case _PC_LINK_MAX: + case _PC_NAME_MAX: + case _PC_CHOWN_RESTRICTED: + case _PC_NO_TRUNC: + case _PC_CASE_SENSITIVE: + case _PC_CASE_PRESERVING: + break; + default: + /* don't bother contacting the server if we know the answer */ + return (EINVAL); + } - return (EOPNOTSUPP); -} + if (!(nmp->nm_state & NFSSTA_GOTPATHCONF)) { + /* no pathconf info cached */ + kauth_cred_t cred = vfs_context_ucred(ap->a_context); + proc_t p = vfs_context_proc(ap->a_context); + error = nfs_pathconfrpc(vp, &pc, cred, p); + if (error) + return (error); + nmp = VFSTONFS(vnode_mount(vp)); + if (!nmp) + return (ENXIO); + if (!(nmp->nm_state & NFSSTA_GOTFSINFO)) { + nfs_fsinfo(nmp, vp, cred, p); + nmp = VFSTONFS(vnode_mount(vp)); + if (!nmp) + return (ENXIO); + } + if ((nmp->nm_state & NFSSTA_GOTFSINFO) && + (nmp->nm_fsinfo.fsproperties & NFSV3FSINFO_HOMOGENEOUS)) { + /* all files have the same pathconf info, */ + /* so cache a copy of the results */ + nfs_pathconf_cache(nmp, &pc); + } + } -/* - * NFS flat namespace free. - * Currently unsupported. - */ -static int -nfs_vfree(ap) - struct vop_vfree_args /* { - struct vnode *a_pvp; - ino_t a_ino; - int a_mode; - } */ *ap; -{ + cached = (nmp->nm_state & NFSSTA_GOTPATHCONF); + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = cached ? nmp->nm_fsinfo.linkmax : pc.pc_linkmax; + break; + case _PC_NAME_MAX: + *ap->a_retval = cached ? 
nmp->nm_fsinfo.namemax : pc.pc_namemax; + break; + case _PC_CHOWN_RESTRICTED: + if (cached) + *ap->a_retval = (nmp->nm_fsinfo.pcflags & NFSPCINFO_CHOWN_RESTRICTED) ? 1 : 0; + else + *ap->a_retval = pc.pc_chownrestricted; + break; + case _PC_NO_TRUNC: + if (cached) + *ap->a_retval = (nmp->nm_fsinfo.pcflags & NFSPCINFO_NOTRUNC) ? 1 : 0; + else + *ap->a_retval = pc.pc_notrunc; + break; + case _PC_CASE_SENSITIVE: + if (cached) + *ap->a_retval = (nmp->nm_fsinfo.pcflags & NFSPCINFO_CASE_INSENSITIVE) ? 0 : 1; + else + *ap->a_retval = !pc.pc_caseinsensitive; + break; + case _PC_CASE_PRESERVING: + if (cached) + *ap->a_retval = (nmp->nm_fsinfo.pcflags & NFSPCINFO_CASE_PRESERVING) ? 1 : 0; + else + *ap->a_retval = pc.pc_casepreserving; + break; + default: + error = EINVAL; + } -#if DIAGNOSTIC - printf("nfs_vfree: unimplemented!!"); -#endif - return (EOPNOTSUPP); + return (error); } /* - * NFS file truncation. + * NFS advisory byte-level locks (client) */ static int -nfs_truncate(ap) - struct vop_truncate_args /* { - struct vnode *a_vp; - off_t a_length; +nfs_advlock(ap) + struct vnop_advlock_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; int a_flags; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { - - /* Use nfs_setattr */ -#if DIAGNOSTIC - printf("nfs_truncate: unimplemented!!"); -#endif - return (EOPNOTSUPP); -} - -/* - * NFS update. - */ -static int -nfs_update(ap) - struct vop_update_args /* { - struct vnode *a_vp; - struct timeval *a_ta; - struct timeval *a_tm; - int a_waitfor; - } */ *ap; -{ - - /* Use nfs_setattr */ -#if DIAGNOSTIC - printf("nfs_update: unimplemented!!"); -#endif - return (EOPNOTSUPP); + return (nfs_dolock(ap)); } /* @@ -4040,44 +4344,43 @@ nfs_update(ap) int nfs_buf_write(struct nfsbuf *bp) { - int s; int oldflags = bp->nb_flags, rv = 0; - off_t off; - struct vnode *vp = bp->nb_vp; - struct ucred *cr; - struct proc *p = current_proc(); + vnode_t vp = bp->nb_vp; + struct nfsnode *np = VTONFS(vp); + kauth_cred_t cr; + proc_t p = current_proc(); // XXX FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0); - if (!ISSET(bp->nb_flags, NB_BUSY)) + if (!ISSET(bp->nb_lflags, NBL_BUSY)) panic("nfs_buf_write: buffer is not busy???"); - s = splbio(); CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); if (ISSET(oldflags, NB_DELWRI)) { - nfs_nbdwrite--; - NFSBUFCNTCHK(); - wakeup((caddr_t)&nfs_nbdwrite); + OSAddAtomic(-1, (SInt32*)&nfs_nbdwrite); + NFSBUFCNTCHK(0); + wakeup(&nfs_nbdwrite); } /* move to clean list */ if (ISSET(oldflags, (NB_ASYNC|NB_DELWRI))) { + lck_mtx_lock(nfs_buf_mutex); if (bp->nb_vnbufs.le_next != NFSNOLIST) LIST_REMOVE(bp, nb_vnbufs); LIST_INSERT_HEAD(&VTONFS(vp)->n_cleanblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); } + vnode_startwrite(vp); - vp->v_numoutput++; if (p && p->p_stats) p->p_stats->p_ru.ru_oublock++; - splx(s); /* * For async requests when nfsiod(s) are running, queue the request by * calling nfs_asyncio(), otherwise just all nfs_doio() to do the request. 
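Stepping back to the pathconf switch above: nfs_pathconf_cache packs the four server booleans into a pcflags bitmask, and the switch unpacks them per _PC_* name, with a sense inversion for _PC_CASE_SENSITIVE since the server reports case *insensitivity*. A sketch of the round trip, with illustrative flag values (xnu's NFSPCINFO_* constants may differ):

    #include <stdbool.h>
    #include <stdio.h>

    /* illustrative values; xnu defines its own NFSPCINFO_* constants */
    #define PC_NOTRUNC           0x01
    #define PC_CHOWN_RESTRICTED  0x02
    #define PC_CASE_INSENSITIVE  0x04
    #define PC_CASE_PRESERVING   0x08

    static int
    pack_pcflags(bool notrunc, bool chownres, bool caseins, bool casepres)
    {
        int f = 0;
        if (notrunc)  f |= PC_NOTRUNC;
        if (chownres) f |= PC_CHOWN_RESTRICTED;
        if (caseins)  f |= PC_CASE_INSENSITIVE;
        if (casepres) f |= PC_CASE_PRESERVING;
        return f;
    }

    int main(void)
    {
        int pcflags = pack_pcflags(true, true, false, true);
        /* _PC_CASE_SENSITIVE answers 1 when the INSENSITIVE bit is clear */
        printf("case sensitive: %d\n",
            (pcflags & PC_CASE_INSENSITIVE) ? 0 : 1);
        return 0;
    }

From userspace these cached answers surface through pathconf(2), e.g. pathconf(path, _PC_NAME_MAX) on a file inside the NFS mount.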
*/ if (ISSET(bp->nb_flags, NB_ASYNC)) - p = (struct proc *)0; + p = NULL; if (ISSET(bp->nb_flags, NB_READ)) cr = bp->nb_rcred; else @@ -4089,14 +4392,34 @@ nfs_buf_write(struct nfsbuf *bp) rv = nfs_buf_iowait(bp); /* move to clean list */ if (oldflags & NB_DELWRI) { - s = splbio(); + lck_mtx_lock(nfs_buf_mutex); if (bp->nb_vnbufs.le_next != NFSNOLIST) LIST_REMOVE(bp, nb_vnbufs); LIST_INSERT_HEAD(&VTONFS(vp)->n_cleanblkhd, bp, nb_vnbufs); - splx(s); + lck_mtx_unlock(nfs_buf_mutex); } + oldflags = bp->nb_flags; FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, rv); + if (cr) { + kauth_cred_ref(cr); + } nfs_buf_release(bp, 1); + if (ISSET(oldflags, NB_ERROR) && !(np->n_flag & NFLUSHINPROG)) { + /* + * There was a write error and we need to + * invalidate attrs and flush buffers in + * order to sync up with the server. + * (if this write was extending the file, + * we may no longer know the correct size) + * + * But we couldn't call vinvalbuf while holding + * the buffer busy. So we call vinvalbuf() after + * releasing the buffer. + */ + nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cr, p, 1); + } + if (cr) + kauth_cred_rele(cr); return (rv); } @@ -4104,79 +4427,17 @@ nfs_buf_write(struct nfsbuf *bp) return (rv); } -/* - * nfs special file access vnode op. - * Essentially just get vattr and then imitate iaccess() since the device is - * local to the client. - */ -static int -nfsspec_access(ap) - struct vop_access_args /* { - struct vnode *a_vp; - int a_mode; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; -{ - register struct vattr *vap; - register gid_t *gp; - register struct ucred *cred = ap->a_cred; - struct vnode *vp = ap->a_vp; - mode_t mode = ap->a_mode; - struct vattr vattr; - register int i; - int error; - - /* - * Disallow write attempts on filesystems mounted read-only; - * unless the file is a socket, fifo, or a block or character - * device resident on the filesystem. - */ - if ((mode & VWRITE) && vp->v_mount && (vp->v_mount->mnt_flag & MNT_RDONLY)) { - switch (vp->v_type) { - case VREG: case VDIR: case VLNK: - return (EROFS); - } - } - /* - * If you're the super-user, - * you always get access. - */ - if (cred->cr_uid == 0) - return (0); - vap = &vattr; - error = VOP_GETATTR(vp, vap, cred, ap->a_p); - if (error) - return (error); - /* - * Access check is based on only one of owner, group, public. - * If not owner, then check group. If not a member of the - * group, then check public access. - */ - if (cred->cr_uid != vap->va_uid) { - mode >>= 3; - gp = cred->cr_groups; - for (i = 0; i < cred->cr_ngroups; i++, gp++) - if (vap->va_gid == *gp) - goto found; - mode >>= 3; -found: - ; - } - error = (vap->va_mode & mode) == mode ? 0 : EACCES; - return (error); -} - /* * Read wrapper for special devices. 
*/ static int nfsspec_read(ap) - struct vop_read_args /* { - struct vnode *a_vp; + struct vnop_read_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; + int a_ioflag; + vfs_context_t a_context; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); @@ -4189,7 +4450,7 @@ nfsspec_read(ap) microtime(&now); np->n_atim.tv_sec = now.tv_sec; np->n_atim.tv_nsec = now.tv_usec * 1000; - return (VOCALL(spec_vnodeop_p, VOFFSET(vop_read), ap)); + return (VOCALL(spec_vnodeop_p, VOFFSET(vnop_read), ap)); } /* @@ -4197,11 +4458,12 @@ nfsspec_read(ap) */ static int nfsspec_write(ap) - struct vop_write_args /* { - struct vnode *a_vp; + struct vnop_write_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; + int a_ioflag; + vfs_context_t a_context; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); @@ -4214,7 +4476,7 @@ nfsspec_write(ap) microtime(&now); np->n_mtim.tv_sec = now.tv_sec; np->n_mtim.tv_nsec = now.tv_usec * 1000; - return (VOCALL(spec_vnodeop_p, VOFFSET(vop_write), ap)); + return (VOCALL(spec_vnodeop_p, VOFFSET(vnop_write), ap)); } /* @@ -4224,45 +4486,51 @@ nfsspec_write(ap) */ static int nfsspec_close(ap) - struct vop_close_args /* { - struct vnode *a_vp; - int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + struct vnop_close_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + int a_fflag; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp = ap->a_vp; - register struct nfsnode *np = VTONFS(vp); - struct vattr vattr; + vnode_t vp = ap->a_vp; + struct nfsnode *np = VTONFS(vp); + struct vnode_attr vattr; + mount_t mp; if (np->n_flag & (NACC | NUPD)) { np->n_flag |= NCHG; - if (vp->v_usecount == 1 && vp->v_mount && - (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { - VATTR_NULL(&vattr); - if (np->n_flag & NACC) - vattr.va_atime = np->n_atim; - if (np->n_flag & NUPD) - vattr.va_mtime = np->n_mtim; - (void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_p); + if (!vnode_isinuse(vp, 1) && (mp = vnode_mount(vp)) && !vfs_isrdonly(mp)) { + VATTR_INIT(&vattr); + if (np->n_flag & NACC) { + vattr.va_access_time = np->n_atim; + VATTR_SET_ACTIVE(&vattr, va_access_time); + } + if (np->n_flag & NUPD) { + vattr.va_modify_time = np->n_mtim; + VATTR_SET_ACTIVE(&vattr, va_modify_time); + } + vnode_setattr(vp, &vattr, ap->a_context); } } - return (VOCALL(spec_vnodeop_p, VOFFSET(vop_close), ap)); + return (VOCALL(spec_vnodeop_p, VOFFSET(vnop_close), ap)); } +extern vnop_t **fifo_vnodeop_p; + /* * Read wrapper for fifos. 
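The spec and fifo wrappers timestamp reads and writes with microtime() and, on the last close of a writable mount, push the saved times to the server through vnode_setattr. The tv_usec-to-tv_nsec conversion these hunks repeat is a plain *1000 scale, sketched here with gettimeofday standing in for microtime:

    #include <stdio.h>
    #include <sys/time.h>   /* gettimeofday, struct timeval */
    #include <time.h>       /* struct timespec */

    /* microtime() gives microseconds; n_atim/n_mtim hold nanoseconds */
    static struct timespec
    timeval_to_timespec(struct timeval tv)
    {
        struct timespec ts;
        ts.tv_sec  = tv.tv_sec;
        ts.tv_nsec = tv.tv_usec * 1000;   /* usec -> nsec */
        return ts;
    }

    int main(void)
    {
        struct timeval now;
        gettimeofday(&now, NULL);         /* userspace stand-in */
        struct timespec ts = timeval_to_timespec(now);
        printf("%ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
        return 0;
    }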
*/ static int nfsfifo_read(ap) - struct vop_read_args /* { - struct vnode *a_vp; + struct vnop_read_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; + int a_ioflag; + vfs_context_t a_context; } */ *ap; { - extern vop_t **fifo_vnodeop_p; register struct nfsnode *np = VTONFS(ap->a_vp); struct timeval now; @@ -4273,7 +4541,7 @@ nfsfifo_read(ap) microtime(&now); np->n_atim.tv_sec = now.tv_sec; np->n_atim.tv_nsec = now.tv_usec * 1000; - return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), ap)); + return (VOCALL(fifo_vnodeop_p, VOFFSET(vnop_read), ap)); } /* @@ -4281,14 +4549,14 @@ nfsfifo_read(ap) */ static int nfsfifo_write(ap) - struct vop_write_args /* { - struct vnode *a_vp; + struct vnop_write_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; + int a_ioflag; + vfs_context_t a_context; } */ *ap; { - extern vop_t **fifo_vnodeop_p; register struct nfsnode *np = VTONFS(ap->a_vp); struct timeval now; @@ -4299,7 +4567,7 @@ nfsfifo_write(ap) microtime(&now); np->n_mtim.tv_sec = now.tv_sec; np->n_mtim.tv_nsec = now.tv_usec * 1000; - return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), ap)); + return (VOCALL(fifo_vnodeop_p, VOFFSET(vnop_write), ap)); } /* @@ -4309,18 +4577,18 @@ nfsfifo_write(ap) */ static int nfsfifo_close(ap) - struct vop_close_args /* { - struct vnode *a_vp; - int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + struct vnop_close_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + int a_fflag; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp = ap->a_vp; - register struct nfsnode *np = VTONFS(vp); - struct vattr vattr; + vnode_t vp = ap->a_vp; + struct nfsnode *np = VTONFS(vp); + struct vnode_attr vattr; struct timeval now; - extern vop_t **fifo_vnodeop_p; + mount_t mp; if (np->n_flag & (NACC | NUPD)) { microtime(&now); @@ -4333,22 +4601,34 @@ nfsfifo_close(ap) np->n_mtim.tv_nsec = now.tv_usec * 1000; } np->n_flag |= NCHG; - if (vp->v_usecount == 1 && vp->v_mount && - (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { - VATTR_NULL(&vattr); - if (np->n_flag & NACC) - vattr.va_atime = np->n_atim; - if (np->n_flag & NUPD) - vattr.va_mtime = np->n_mtim; - (void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_p); + if (!vnode_isinuse(vp, 1) && (mp = vnode_mount(vp)) && !vfs_isrdonly(mp)) { + VATTR_INIT(&vattr); + if (np->n_flag & NACC) { + vattr.va_access_time = np->n_atim; + VATTR_SET_ACTIVE(&vattr, va_access_time); + } + if (np->n_flag & NUPD) { + vattr.va_modify_time = np->n_mtim; + VATTR_SET_ACTIVE(&vattr, va_modify_time); + } + vnode_setattr(vp, &vattr, ap->a_context); } } - return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), ap)); + return (VOCALL(fifo_vnodeop_p, VOFFSET(vnop_close), ap)); } +/*ARGSUSED*/ static int -nfs_ioctl(ap) - struct vop_ioctl_args *ap; +nfs_ioctl( + __unused struct vnop_ioctl_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + u_long a_command; + caddr_t a_data; + int a_fflag; + kauth_cred_t a_cred; + proc_t a_p; + } */ *ap) { /* @@ -4358,9 +4638,18 @@ nfs_ioctl(ap) return (ENOTTY); } +/*ARGSUSED*/ static int -nfs_select(ap) - struct vop_select_args *ap; +nfs_select( + __unused struct vnop_select_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + int a_which; + int a_fflags; + kauth_cred_t a_cred; + void *a_wql; + proc_t a_p; + } */ *ap) { /* @@ -4376,32 +4665,32 @@ nfs_select(ap) */ static int nfs_pagein(ap) - struct vop_pagein_args /* { - struct vnode *a_vp, - upl_t a_pl, - vm_offset_t 
a_pl_offset, - off_t a_f_offset, - size_t a_size, - struct ucred *a_cred, - int a_flags + struct vnop_pagein_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + upl_t a_pl; + vm_offset_t a_pl_offset; + off_t a_f_offset; + size_t a_size; + int a_flags; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp = ap->a_vp; + vnode_t vp = ap->a_vp; upl_t pl = ap->a_pl; size_t size= ap->a_size; off_t f_offset = ap->a_f_offset; vm_offset_t pl_offset = ap->a_pl_offset; int flags = ap->a_flags; - struct ucred *cred; + kauth_cred_t cred; + proc_t p; struct nfsnode *np = VTONFS(vp); int biosize, xsize, iosize; - struct vattr vattr; - struct proc *p = current_proc(); struct nfsmount *nmp; int error = 0; vm_offset_t ioaddr; struct uio auio; - struct iovec aiov; + struct iovec_32 aiov; struct uio * uio = &auio; int nofreeupl = flags & UPL_NOCOMMIT; upl_page_info_t *plinfo; @@ -4413,7 +4702,7 @@ nfs_pagein(ap) if (UBCINVALID(vp)) { printf("nfs_pagein: invalid vnode 0x%x", (int)vp); if (!nofreeupl) - (void) ubc_upl_abort(pl, NULL); + (void) ubc_upl_abort(pl, 0); return (EPERM); } UBCINFOCHECK("nfs_pagein", vp); @@ -4421,25 +4710,31 @@ nfs_pagein(ap) if (size <= 0) { printf("nfs_pagein: invalid size %d", size); if (!nofreeupl) - (void) ubc_upl_abort(pl, NULL); + (void) ubc_upl_abort(pl, 0); return (EINVAL); } - if (f_offset < 0 || f_offset >= np->n_size || (f_offset & PAGE_MASK_64)) { + if (f_offset < 0 || f_offset >= (off_t)np->n_size || (f_offset & PAGE_MASK_64)) { if (!nofreeupl) ubc_upl_abort_range(pl, pl_offset, size, UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY); return (EINVAL); } + cred = ubc_getcred(vp); if (cred == NOCRED) - cred = ap->a_cred; + cred = vfs_context_ucred(ap->a_context); + p = vfs_context_proc(ap->a_context); auio.uio_offset = f_offset; +#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ auio.uio_segflg = UIO_SYSSPACE; +#else + auio.uio_segflg = UIO_SYSSPACE32; +#endif auio.uio_rw = UIO_READ; - auio.uio_procp = NULL; + auio.uio_procp = p; - nmp = VFSTONFS(vp->v_mount); + nmp = VFSTONFS(vnode_mount(vp)); if (!nmp) { if (!nofreeupl) ubc_upl_abort_range(pl, pl_offset, size, @@ -4448,7 +4743,7 @@ nfs_pagein(ap) } if ((nmp->nm_flag & NFSMNT_NFSV3) && !(nmp->nm_state & NFSSTA_GOTFSINFO)) (void)nfs_fsinfo(nmp, vp, cred, p); - biosize = vp->v_mount->mnt_stat.f_iosize; + biosize = vfs_statfs(vnode_mount(vp))->f_iosize; plinfo = ubc_upl_pageinfo(pl); ubc_upl_map(pl, &ioaddr); @@ -4462,35 +4757,35 @@ nfs_pagein(ap) * before sending the next one. * XXX Should we align these requests to block boundaries? */ - iosize = min(biosize, xsize); - uio->uio_resid = iosize; + iosize = min(biosize, xsize); aiov.iov_len = iosize; - aiov.iov_base = (caddr_t)ioaddr; - auio.uio_iov = &aiov; + aiov.iov_base = (uintptr_t)ioaddr; + auio.uio_iovs.iov32p = &aiov; auio.uio_iovcnt = 1; + uio_uio_resid_set(&auio, iosize); - FSDBG(322, uio->uio_offset, uio->uio_resid, ioaddr, xsize); -// XXX #warning our nfs_pagein does not support NQNFS + FSDBG(322, uio->uio_offset, uio_uio_resid(uio), ioaddr, xsize); /* * With UBC we get here only when the file data is not in the VM * page cache, so go ahead and read in. 
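nfs_pagein walks the mapped UPL in biosize-sized chunks, rebuilding the single-entry iovec and uio for each read RPC, and zero-fills whatever a short read leaves behind (a hole in the file, or EOF), so the whole valid area of the page is defined. A userspace sketch of the chunking, with read_chunk standing in for nfs_readrpc:

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    /* stand-in for nfs_readrpc(): returns bytes read into buf */
    static size_t
    read_chunk(char *buf, size_t off, size_t len)
    {
        memset(buf, 'x', len);   /* pretend the server sent data */
        (void)off;
        return len;
    }

    static int
    pagein(char *ioaddr, size_t f_offset, size_t size, size_t biosize)
    {
        size_t xsize = size;
        while (xsize > 0) {
            size_t iosize = xsize < biosize ? xsize : biosize;
            size_t got = read_chunk(ioaddr, f_offset, iosize);
            if (got < iosize) {
                /* short read: hole in the file or EOF --
                 * zero-fill the rest of the valid area */
                memset(ioaddr + got, 0, iosize - got);
            }
            ioaddr += iosize;
            f_offset += iosize;
            xsize -= iosize;
        }
        return 0;
    }

    int main(void)
    {
        char page[16384];
        pagein(page, 0, sizeof(page), 4096);  /* four chunked "RPCs" */
        printf("done\n");
        return 0;
    }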
*/ -#ifdef UBC_DEBUG - upl_ubc_alias_set(pl, current_act(), 2); -#endif /* UBC_DEBUG */ - nfsstats.pageins++; +#ifdef UPL_DEBUG + upl_ubc_alias_set(pl, current_thread(), 2); +#endif /* UPL_DEBUG */ + OSAddAtomic(1, (SInt32*)&nfsstats.pageins); - error = nfs_readrpc(vp, uio, cred); + error = nfs_readrpc(vp, uio, cred, p); if (!error) { - if (uio->uio_resid) { + if (uio_uio_resid(uio)) { /* * If uio_resid > 0, there is a hole in the file * and no writes after the hole have been pushed * to the server yet... or we're at the EOF * Just zero fill the rest of the valid area. */ - int zcnt = uio->uio_resid; + // LP64todo - fix this + int zcnt = uio_uio_resid(uio); int zoff = iosize - zcnt; bzero((char *)ioaddr + zoff, zcnt); @@ -4499,21 +4794,11 @@ nfs_pagein(ap) } ioaddr += iosize; xsize -= iosize; - } else - FSDBG(322, uio->uio_offset, uio->uio_resid, error, -1); - - nmp = VFSTONFS(vp->v_mount); - if (p && (vp->v_flag & VTEXT) && nmp && - ((nmp->nm_flag & NFSMNT_NQNFS && - NQNFS_CKINVALID(vp, np, ND_READ) && - np->n_lrev != np->n_brev) || - (!(nmp->nm_flag & NFSMNT_NQNFS) && - np->n_mtime != np->n_vattr.va_mtime.tv_sec))) { - uprintf("Process killed due to text file modification\n"); - psignal(p, SIGKILL); - p->p_flag |= P_NOSWAP; + } else { + FSDBG(322, uio->uio_offset, uio_uio_resid(uio), error, -1); } + nmp = VFSTONFS(vnode_mount(vp)); } while (error == 0 && xsize > 0); ubc_upl_unmap(pl); @@ -4539,36 +4824,36 @@ nfs_pagein(ap) */ static int nfs_pageout(ap) - struct vop_pageout_args /* { - struct vnode *a_vp, - upl_t a_pl, - vm_offset_t a_pl_offset, - off_t a_f_offset, - size_t a_size, - struct ucred *a_cred, - int a_flags + struct vnop_pageout_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + upl_t a_pl; + vm_offset_t a_pl_offset; + off_t a_f_offset; + size_t a_size; + int a_flags; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp = ap->a_vp; + vnode_t vp = ap->a_vp; upl_t pl = ap->a_pl; size_t size= ap->a_size; off_t f_offset = ap->a_f_offset; vm_offset_t pl_offset = ap->a_pl_offset; int flags = ap->a_flags; - int ioflag = ap->a_flags; - struct proc *p = current_proc(); struct nfsnode *np = VTONFS(vp); - register struct ucred *cred; + kauth_cred_t cred; + proc_t p; struct nfsbuf *bp; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - daddr_t lbn; - int n = 0, on, error = 0, iomode, must_commit, s; + struct nfsmount *nmp = VFSTONFS(vnode_mount(vp)); + daddr64_t lbn; + int error = 0, iomode, must_commit; off_t off; vm_offset_t ioaddr; struct uio auio; - struct iovec aiov; + struct iovec_32 aiov; int nofreeupl = flags & UPL_NOCOMMIT; - int biosize, iosize, pgsize, xsize; + size_t biosize, iosize, pgsize, xsize; FSDBG(323, f_offset, size, pl, pl_offset); @@ -4595,7 +4880,7 @@ nfs_pageout(ap) ubc_upl_abort(pl, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); return (ENXIO); } - biosize = vp->v_mount->mnt_stat.f_iosize; + biosize = vfs_statfs(vnode_mount(vp))->f_iosize; /* * Check to see whether the buffer is incore. @@ -4608,10 +4893,11 @@ nfs_pageout(ap) if (off + xsize > f_offset + size) xsize = f_offset + size - off; lbn = ubc_offtoblk(vp, off); - s = splbio(); - if (bp = nfs_buf_incore(vp, lbn)) { - FSDBG(323, off, 1, bp, bp->nb_flags); - if (ISSET(bp->nb_flags, NB_BUSY)) { + lck_mtx_lock(nfs_buf_mutex); + if ((bp = nfs_buf_incore(vp, lbn))) { + FSDBG(323, off, bp, bp->nb_lflags, bp->nb_flags); + if (nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0)) { + lck_mtx_unlock(nfs_buf_mutex); /* no panic. 
just tell vm we are busy */ if (!nofreeupl) ubc_upl_abort(pl, 0); @@ -4619,8 +4905,8 @@ nfs_pageout(ap) } if (bp->nb_dirtyend > 0) { /* - * if there's a dirty range in the buffer, check to - * see if it extends beyond the pageout region + * if there's a dirty range in the buffer, check + * to see if it extends beyond the pageout region * * if the dirty region lies completely within the * pageout region, we just invalidate the buffer @@ -4638,7 +4924,7 @@ nfs_pageout(ap) start = off; end = off + xsize; /* clip end to EOF */ - if (end > np->n_size) + if (end > (off_t)np->n_size) end = np->n_size; start -= boff; end -= boff; @@ -4646,6 +4932,8 @@ nfs_pageout(ap) (bp->nb_dirtyend > end)) { /* not gonna be able to clip the dirty region */ FSDBG(323, vp, bp, 0xd00deebc, EBUSY); + nfs_buf_drop(bp); + lck_mtx_unlock(nfs_buf_mutex); if (!nofreeupl) ubc_upl_abort(pl, 0); return (EBUSY); @@ -4659,24 +4947,29 @@ nfs_pageout(ap) bp->nb_dirtyoff = max(bp->nb_dirtyoff, end); FSDBG(323, bp, bp->nb_dirtyoff, bp->nb_dirtyend, 0xd00dee00); /* we're leaving this block dirty */ + nfs_buf_drop(bp); + lck_mtx_unlock(nfs_buf_mutex); continue; } } nfs_buf_remfree(bp); - SET(bp->nb_flags, (NB_BUSY | NB_INVAL)); + lck_mtx_unlock(nfs_buf_mutex); + SET(bp->nb_flags, NB_INVAL); if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { CLR(bp->nb_flags, NB_NEEDCOMMIT); np->n_needcommitcnt--; CHECK_NEEDCOMMITCNT(np); } nfs_buf_release(bp, 1); + } else { + lck_mtx_unlock(nfs_buf_mutex); } - splx(s); } cred = ubc_getcred(vp); if (cred == NOCRED) - cred = ap->a_cred; + cred = vfs_context_ucred(ap->a_context); + p = vfs_context_proc(ap->a_context); if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; @@ -4685,11 +4978,10 @@ nfs_pageout(ap) UPL_ABORT_FREE_ON_EMPTY); return (np->n_error); } - if ((nmp->nm_flag & NFSMNT_NFSV3) && - !(nmp->nm_state & NFSSTA_GOTFSINFO)) - (void)nfs_fsinfo(nmp, vp, cred, p); + if ((nmp->nm_flag & NFSMNT_NFSV3) && !(nmp->nm_state & NFSSTA_GOTFSINFO)) + nfs_fsinfo(nmp, vp, cred, p); - if (f_offset < 0 || f_offset >= np->n_size || + if (f_offset < 0 || f_offset >= (off_t)np->n_size || f_offset & PAGE_MASK_64 || size & PAGE_MASK_64) { if (!nofreeupl) ubc_upl_abort_range(pl, pl_offset, size, @@ -4700,7 +4992,7 @@ nfs_pageout(ap) ubc_upl_map(pl, &ioaddr); ioaddr += pl_offset; - if (f_offset + size > np->n_size) + if ((u_quad_t)f_offset + size > np->n_size) xsize = np->n_size - f_offset; else xsize = size; @@ -4718,16 +5010,20 @@ nfs_pageout(ap) * contents past end of the file before * releasing it in the VM page cache */ - if (f_offset < np->n_size && f_offset + size > np->n_size) { + if ((u_quad_t)f_offset < np->n_size && (u_quad_t)f_offset + size > np->n_size) { size_t io = np->n_size - f_offset; bzero((caddr_t)(ioaddr + io), size - io); FSDBG(321, np->n_size, f_offset, f_offset + io, size - io); } auio.uio_offset = f_offset; +#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ auio.uio_segflg = UIO_SYSSPACE; +#else + auio.uio_segflg = UIO_SYSSPACE32; +#endif auio.uio_rw = UIO_READ; - auio.uio_procp = NULL; + auio.uio_procp = p; do { /* @@ -4737,23 +5033,23 @@ nfs_pageout(ap) * XXX Should we align these requests to block boundaries? 
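[Note on the dirty-range handling above: nfs_pageout() may only invalidate a cached buffer overlapping the pageout range if the buffer's dirty bytes lie entirely inside that range; otherwise it must leave the buffer dirty and report EBUSY. A standalone rendering of that decision, as a reading aid under simplified, buffer-relative offsets (the struct and function names here are invented, not part of the patch):]

/*
 * Simplified form of the clipping test in nfs_pageout().
 * Offsets are buffer-relative, like nb_dirtyoff/nb_dirtyend.
 */
#include <sys/types.h>

struct dirtyrange { off_t off, end; };  /* like nb_dirtyoff/nb_dirtyend */

/* Returns 1 if the buffer may be invalidated, 0 if the caller
 * must leave it dirty and return EBUSY to the VM system. */
static int
can_clip_dirty(const struct dirtyrange *d, off_t start, off_t end)
{
    if (d->end <= d->off)
        return 1;               /* no dirty bytes at all */
    /* dirty region must lie completely within the pageout region */
    return (d->off >= start) && (d->end <= end);
}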
*/ iosize = min(biosize, xsize); - auio.uio_resid = iosize; + uio_uio_resid_set(&auio, iosize); aiov.iov_len = iosize; - aiov.iov_base = (caddr_t)ioaddr; - auio.uio_iov = &aiov; + aiov.iov_base = (uintptr_t)ioaddr; + auio.uio_iovs.iov32p = &aiov; auio.uio_iovcnt = 1; - FSDBG(323, auio.uio_offset, auio.uio_resid, ioaddr, xsize); -// XXX #warning our nfs_pageout does not support NQNFS - nfsstats.pageouts++; + FSDBG(323, auio.uio_offset, uio_uio_resid(&auio), ioaddr, xsize); + OSAddAtomic(1, (SInt32*)&nfsstats.pageouts); + + vnode_startwrite(vp); - vp->v_numoutput++; /* NMODIFIED would be set here if doing unstable writes */ iomode = NFSV3WRITE_FILESYNC; - error = nfs_writerpc(vp, &auio, cred, &iomode, &must_commit); + error = nfs_writerpc(vp, &auio, cred, p, &iomode, &must_commit); if (must_commit) - nfs_clearcommit(vp->v_mount); - vpwakeup(vp); + nfs_clearcommit(vnode_mount(vp)); + vnode_writedone(vp); if (error) goto cleanup; /* Note: no need to check uio_resid, because */ @@ -4790,7 +5086,7 @@ cleanup: if (!nofreeupl) { /* otherwise stacked file system has to handle this */ if (error) { - int abortflags; + int abortflags = 0; short action = nfs_pageouterrorhandler(error); switch (action) { @@ -4810,8 +5106,7 @@ cleanup: case RETRYWITHSLEEP: abortflags = UPL_ABORT_FREE_ON_EMPTY; /* pri unused. PSOCK for placeholder. */ - (void) tsleep(&lbolt, PSOCK, - "nfspageout", 0); + tsleep(&lbolt, PSOCK, "nfspageout", 0); break; case SEVER: /* not implemented */ default: @@ -4833,55 +5128,47 @@ cleanup: /* Blktooff derives file offset given a logical block number */ static int nfs_blktooff(ap) - struct vop_blktooff_args /* { - struct vnode *a_vp; - daddr_t a_lblkno; - off_t *a_offset; + struct vnop_blktooff_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + daddr64_t a_lblkno; + off_t *a_offset; } */ *ap; { int biosize; - register struct vnode *vp = ap->a_vp; + vnode_t vp = ap->a_vp; + mount_t mp = vnode_mount(vp); - if (!vp->v_mount) + if (!mp) return (ENXIO); - biosize = vp->v_mount->mnt_stat.f_iosize; + biosize = vfs_statfs(mp)->f_iosize; - *ap->a_offset = (off_t)ap->a_lblkno * biosize; + *ap->a_offset = (off_t)(ap->a_lblkno * biosize); return (0); } static int nfs_offtoblk(ap) - struct vop_offtoblk_args /* { - struct vnode *a_vp; - off_t a_offset; - daddr_t *a_lblkno; + struct vnop_offtoblk_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + off_t a_offset; + daddr64_t *a_lblkno; } */ *ap; { int biosize; - register struct vnode *vp = ap->a_vp; + vnode_t vp = ap->a_vp; + mount_t mp = vnode_mount(vp); - if (!vp->v_mount) + if (!mp) return (ENXIO); - biosize = vp->v_mount->mnt_stat.f_iosize; + biosize = vfs_statfs(mp)->f_iosize; - *ap->a_lblkno = (daddr_t)(ap->a_offset / biosize); + *ap->a_lblkno = (daddr64_t)(ap->a_offset / biosize); return (0); } -static int -nfs_cmap(ap) - struct vop_cmap_args /* { - struct vnode *a_vp; - off_t a_offset; - size_t a_size; - daddr_t *a_bpn; - size_t *a_run; - void *a_poff; - } */ *ap; -{ - return (EOPNOTSUPP); -} + diff --git a/bsd/nfs/nfsdiskless.h b/bsd/nfs/nfsdiskless.h index 3fa123d7f..c5292026d 100644 --- a/bsd/nfs/nfsdiskless.h +++ b/bsd/nfs/nfsdiskless.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -111,33 +111,11 @@ struct nfs_dlmount { char ndm_host[MNAMELEN]; /* Host name for mount pt */ char *ndm_path; /* path name for mount pt */ u_long ndm_nfsv3; /* NFSv3 or NFSv2? 
*/ + u_long ndm_sotype; /* SOCK_STREAM or SOCK_DGRAM? */ u_long ndm_fhlen; /* length of file handle */ u_char ndm_fh[NFSX_V3FHMAX]; /* The file's file handle */ }; -/* - * Old arguments to mount NFS - */ -struct onfs_args { - struct sockaddr *addr; /* file server address */ - int addrlen; /* length of address */ - int sotype; /* Socket type */ - int proto; /* and Protocol */ - u_char *fh; /* File handle to be mounted */ - int fhsize; /* Size, in bytes, of fh */ - int flags; /* flags */ - int wsize; /* write size in bytes */ - int rsize; /* read size in bytes */ - int readdirsize; /* readdir size in bytes */ - int timeo; /* initial timeout in .1 secs */ - int retrans; /* times to retry send */ - int maxgrouplist; /* Max. size of group list */ - int readahead; /* # of blocks to readahead */ - int leaseterm; /* Term (sec) of lease */ - int deadthresh; /* Retrans threshold */ - char *hostname; /* server's name */ -}; - struct nfs_diskless { struct nfs_dlmount nd_root; /* Mount info for root */ struct nfs_dlmount nd_private; /* Mount info for private */ diff --git a/bsd/nfs/nfsm_subs.h b/bsd/nfs/nfsm_subs.h index cc1ac71b2..1e8de1dd5 100644 --- a/bsd/nfs/nfsm_subs.h +++ b/bsd/nfs/nfsm_subs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -75,25 +75,12 @@ /* * First define what the actual subs. return */ -struct mbuf *nfsm_reqh __P((struct vnode *vp, u_long procid, int hsiz, - caddr_t *bposp)); -struct mbuf *nfsm_rpchead __P((struct ucred *cr, int nmflag, int procid, +int nfsm_reqh(int hsiz, caddr_t *bposp, mbuf_t *mbp); +int nfsm_rpchead(struct ucred *cr, int nmflag, int procid, int auth_type, int auth_len, char *auth_str, int verf_len, char *verf_str, - struct mbuf *mrest, int mrest_len, - struct mbuf **mbp, u_long *xidp)); - -#define M_HASCL(m) ((m)->m_flags & M_EXT) -#define NFSMINOFF(m) \ - if (M_HASCL(m)) \ - (m)->m_data = (m)->m_ext.ext_buf; \ - else if ((m)->m_flags & M_PKTHDR) \ - (m)->m_data = (m)->m_pktdat; \ - else \ - (m)->m_data = (m)->m_dat -#define NFSMADV(m, s) (m)->m_data += (s) -#define NFSMSIZ(m) ((M_HASCL(m))?MCLBYTES: \ - (((m)->m_flags & M_PKTHDR)?MHLEN:MLEN)) + mbuf_t mrest, int mrest_len, + mbuf_t *mbp, u_long *xidp, mbuf_t *mreqp); /* * Now for the macros that do the simple stuff and call the functions @@ -109,27 +96,31 @@ struct mbuf *nfsm_rpchead __P((struct ucred *cr, int nmflag, int procid, */ #define nfsm_build(a,c,s) \ - { if ((s) > M_TRAILINGSPACE(mb)) { \ - MGET(mb2, M_WAIT, MT_DATA); \ - if ((s) > MLEN) \ - panic("build > MLEN"); \ - mb->m_next = mb2; \ + { if ((s) > mbuf_trailingspace(mb)) { \ + int __nfsm_error; \ + __nfsm_error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_DATA, &mb2); \ + if (__nfsm_error) \ + panic("nfsm_build mbuf_get error %d", __nfsm_error); \ + if ((s) > mbuf_maxlen(mb2)) \ + panic("nfsm_build size error"); \ + __nfsm_error = mbuf_setnext(mb, mb2); \ + if (__nfsm_error) \ + panic("nfsm_build mbuf_setnext error %d", __nfsm_error); \ mb = mb2; \ - mb->m_len = 0; \ - bpos = mtod(mb, caddr_t); \ + bpos = mbuf_data(mb); \ } \ (a) = (c)(bpos); \ - mb->m_len += (s); \ + mbuf_setlen(mb, (mbuf_len(mb) + (s))); \ bpos += (s); } #define nfsm_dissect(a, c, s) \ - { t1 = mtod(md, caddr_t)+md->m_len-dpos; \ + { t1 = ((caddr_t)mbuf_data(md)) + mbuf_len(md) - dpos; \ if (t1 >= (s)) { \ (a) = (c)(dpos); \ dpos += (s); \ } else if ((t1 = nfsm_disct(&md, &dpos, (s), t1, &cp2))) { \ error = t1; \ - m_freem(mrep); \ + 
mbuf_freem(mrep); \ goto nfsmout; \ } else { \ (a) = (c)cp2; \ @@ -138,7 +129,7 @@ struct mbuf *nfsm_rpchead __P((struct ucred *cr, int nmflag, int procid, #define nfsm_fhtom(v, v3) \ { if (v3) { \ t2 = nfsm_rndup(VTONFS(v)->n_fhsize) + NFSX_UNSIGNED; \ - if (t2 <= M_TRAILINGSPACE(mb)) { \ + if (t2 <= mbuf_trailingspace(mb)) { \ nfsm_build(tl, u_long *, t2); \ *tl++ = txdr_unsigned(VTONFS(v)->n_fhsize); \ *(tl + ((t2>>2) - 2)) = 0; \ @@ -147,7 +138,7 @@ struct mbuf *nfsm_rpchead __P((struct ucred *cr, int nmflag, int procid, } else if ((t2 = nfsm_strtmbuf(&mb, &bpos, \ (caddr_t)VTONFS(v)->n_fhp, VTONFS(v)->n_fhsize))) { \ error = t2; \ - m_freem(mreq); \ + mbuf_freem(mreq); \ goto nfsmout; \ } \ } else { \ @@ -157,47 +148,67 @@ struct mbuf *nfsm_rpchead __P((struct ucred *cr, int nmflag, int procid, #define nfsm_srvfhtom(f, v3) \ { if (v3) { \ - nfsm_build(tl, u_long *, NFSX_UNSIGNED + NFSX_V3FH); \ - *tl++ = txdr_unsigned(NFSX_V3FH); \ - bcopy((caddr_t)(f), (caddr_t)tl, NFSX_V3FH); \ + nfsm_build(tl, u_long *, NFSX_UNSIGNED + (unsigned)(f)->nfh_len); \ + *tl++ = txdr_unsigned((f)->nfh_len); \ + bcopy((caddr_t)&(f)->nfh_xh, (caddr_t)tl, (f)->nfh_len); \ } else { \ nfsm_build(cp, caddr_t, NFSX_V2FH); \ - bcopy((caddr_t)(f), cp, NFSX_V2FH); \ + bcopy((caddr_t)&(f)->nfh_xh, cp, NFSX_V2FH); \ } } #define nfsm_srvpostop_fh(f) \ - { nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED + NFSX_V3FH); \ + { nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED + (unsigned)(f)->nfh_len); \ *tl++ = nfs_true; \ - *tl++ = txdr_unsigned(NFSX_V3FH); \ - bcopy((caddr_t)(f), (caddr_t)tl, NFSX_V3FH); \ + *tl++ = txdr_unsigned((f)->nfh_len); \ + bcopy((caddr_t)&(f)->nfh_xh, (caddr_t)tl, (f)->nfh_len); \ } -#define nfsm_mtofh(d, v, v3, f, x) \ - { struct nfsnode *ttnp; nfsfh_t *ttfhp; int ttfhsize; \ - if (v3) { \ +#define nfsm_mtofh(d, cnp, v, v3, xp, f) \ + { \ + struct nfsnode *ttnp; u_char *ttfhp = NULL; \ + int ttfhsize = 0, ttgotfh = 1, ttgotattr = 1, ttgotnode = 0; \ + struct nfs_vattr ttvattr; \ + (v) = NULL; \ + /* XXX would be nice to not bail to nfsmout on error */ \ + if (v3) { /* check for file handle */ \ nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ - (f) = fxdr_unsigned(int, *tl); \ - } else \ - (f) = 1; \ - if (f) { \ + ttgotfh = fxdr_unsigned(int, *tl); \ + } \ + if (ttgotfh) { \ + /* get file handle */ \ nfsm_getfh(ttfhp, ttfhsize, (v3)); \ - if ((t1 = nfs_nget((d)->v_mount, ttfhp, ttfhsize, \ - &ttnp))) { \ - error = t1; \ - m_freem(mrep); \ - goto nfsmout; \ - } \ - (v) = NFSTOV(ttnp); \ } \ - if (v3) { \ + if (v3) { /* check for attributes */ \ nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ - if (f) \ - (f) = fxdr_unsigned(int, *tl); \ - else if (fxdr_unsigned(int, *tl)) \ + ttgotattr = fxdr_unsigned(int, *tl); \ + } \ + /* get attributes */ \ + if (ttgotattr) { \ + if (!ttgotfh) { \ nfsm_adv(NFSX_V3FATTR); \ + } else { \ + nfsm_attr_get(v3, &ttvattr); \ + } \ + } else if (ttgotfh) { \ + /* We need valid attributes in order */ \ + /* to call nfs_nget/vnode_create(). 
*/ \ + t1 = nfs_getattr_no_vnode(vnode_mount(d), \ + ttfhp, ttfhsize, cred, p, &ttvattr, xp); \ + if (t1) \ + ttgotattr = 0; \ + } \ + if (ttgotfh && ttgotattr) { \ + int ttngflags = NG_MAKEENTRY; \ + if ((t1 = nfs_nget(vnode_mount(d), d, cnp, ttfhp, ttfhsize, \ + &ttvattr, xp, ttngflags, &ttnp))) { \ + error = t1; \ + ttgotnode = 0; \ + } else { \ + ttgotnode = 1; \ + (v) = NFSTOV(ttnp); \ + } \ } \ - if (f) \ - nfsm_loadattr((v), (struct vattr *)0, (x)); \ + (f) = ttgotnode; \ } #define nfsm_getfh(f, s, v3) \ @@ -205,38 +216,72 @@ struct mbuf *nfsm_rpchead __P((struct ucred *cr, int nmflag, int procid, nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ if (((s) = fxdr_unsigned(int, *tl)) <= 0 || \ (s) > NFSX_V3FHMAX) { \ - m_freem(mrep); \ + mbuf_freem(mrep); \ error = EBADRPC; \ goto nfsmout; \ } \ - } else \ + } else { \ (s) = NFSX_V2FH; \ - nfsm_dissect((f), nfsfh_t *, nfsm_rndup(s)); } + } \ + nfsm_dissect((f), u_char *, nfsm_rndup(s)); } -#define nfsm_loadattr(v, a, x) \ - { struct vnode *ttvp = (v); \ - if ((t1 = nfs_loadattrcache(&ttvp, &md, &dpos, (a), 0, \ - (x)))) { \ +#define nfsm_loadattr(v, v3, a, x) \ + { struct nfs_vattr ttvattr; \ + if ((t1 = nfs_parsefattr(&md, &dpos, v3, &ttvattr))) { \ error = t1; \ - m_freem(mrep); \ + mbuf_freem(mrep); \ goto nfsmout; \ } \ - (v) = ttvp; } + if ((t1 = nfs_loadattrcache(VTONFS(v), &ttvattr, (x), 0))) { \ + error = t1; \ + mbuf_freem(mrep); \ + goto nfsmout; \ + } \ + if (a) { \ + bcopy(&ttvattr, (a), sizeof(ttvattr)); \ + } \ + } -#define nfsm_postop_attr(v, f, x) \ - { struct vnode *ttvp = (v); \ +#define nfsm_attr_get(v3, vap) \ + { \ + if ((t1 = nfs_parsefattr(&md, &dpos, v3, vap))) { \ + error = t1; \ + mbuf_freem(mrep); \ + goto nfsmout; \ + } \ + } + +#define nfsm_postop_attr_get(v3, f, vap) \ + { \ nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ if (((f) = fxdr_unsigned(int, *tl))) { \ - if ((t1 = nfs_loadattrcache(&ttvp, &md, &dpos, \ - (struct vattr *)0, 1, (x)))) { \ + if ((t1 = nfs_parsefattr(&md, &dpos, v3, vap))) { \ error = t1; \ (f) = 0; \ - m_freem(mrep); \ + mbuf_freem(mrep); \ + goto nfsmout; \ + } \ + } } + +#define nfsm_postop_attr_update(v, v3, f, x) \ + { \ + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ + if (((f) = fxdr_unsigned(int, *tl))) { \ + struct nfs_vattr ttvattr; \ + if ((t1 = nfs_parsefattr(&md, &dpos, v3, &ttvattr))) { \ + error = t1; \ + (f) = 0; \ + mbuf_freem(mrep); \ + goto nfsmout; \ + } \ + if ((t1 = nfs_loadattrcache(VTONFS(v), &ttvattr, (x), 1))) { \ + error = t1; \ + (f) = 0; \ + mbuf_freem(mrep); \ goto nfsmout; \ } \ if (*(x) == 0) \ (f) = 0; \ - (v) = ttvp; \ } } #define nfsm_wcc_data(v, premtime, newpostattr, x) \ @@ -244,29 +289,84 @@ struct mbuf *nfsm_rpchead __P((struct ucred *cr, int nmflag, int procid, nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ if (*tl == nfs_true) { \ nfsm_dissect(tl, u_long *, 6 * NFSX_UNSIGNED); \ - (premtime) = fxdr_unsigned(time_t, *(tl + 2)); \ + (premtime)->tv_sec = fxdr_unsigned(time_t, *(tl + 2)); \ + (premtime)->tv_nsec = fxdr_unsigned(time_t, *(tl + 3)); \ } else { \ - (premtime) = 0; \ + (premtime)->tv_sec = 0; \ + (premtime)->tv_nsec = 0; \ } \ - nfsm_postop_attr((v), (newpostattr), (x)); \ + nfsm_postop_attr_update((v), 1, (newpostattr), (x)); \ } -#define nfsm_v3sattr(s, a, u, g) \ - { (s)->sa_modetrue = nfs_true; \ - (s)->sa_mode = vtonfsv3_mode((a)->va_mode); \ - (s)->sa_uidtrue = nfs_true; \ - (s)->sa_uid = txdr_unsigned(u); \ - (s)->sa_gidtrue = nfs_true; \ - (s)->sa_gid = txdr_unsigned(g); \ - (s)->sa_sizefalse = nfs_false; \ - (s)->sa_atimetype = 
txdr_unsigned(NFSV3SATTRTIME_TOSERVER); \ - (s)->sa_mtimetype = txdr_unsigned(NFSV3SATTRTIME_TOSERVER); \ +#define nfsm_v3sattr(vap) \ + {\ + struct timeval now; \ + if (VATTR_IS_ACTIVE(vap, va_mode)) { \ + nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); \ + *tl++ = nfs_true; \ + *tl = txdr_unsigned(vap->va_mode); \ + } else { \ + nfsm_build(tl, u_long *, NFSX_UNSIGNED); \ + *tl = nfs_false; \ + } \ + if (VATTR_IS_ACTIVE(vap, va_uid)) { \ + nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); \ + *tl++ = nfs_true; \ + *tl = txdr_unsigned(vap->va_uid); \ + } else { \ + nfsm_build(tl, u_long *, NFSX_UNSIGNED); \ + *tl = nfs_false; \ + } \ + if (VATTR_IS_ACTIVE(vap, va_gid)) { \ + nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); \ + *tl++ = nfs_true; \ + *tl = txdr_unsigned(vap->va_gid); \ + } else { \ + nfsm_build(tl, u_long *, NFSX_UNSIGNED); \ + *tl = nfs_false; \ + } \ + if (VATTR_IS_ACTIVE(vap, va_data_size)) { \ + nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED); \ + *tl++ = nfs_true; \ + txdr_hyper(&vap->va_data_size, tl); \ + } else { \ + nfsm_build(tl, u_long *, NFSX_UNSIGNED); \ + *tl = nfs_false; \ + } \ + microtime(&now); \ + if (VATTR_IS_ACTIVE(vap, va_access_time)) { \ + if (vap->va_access_time.tv_sec != now.tv_sec) { \ + nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED); \ + *tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT); \ + txdr_nfsv3time(&vap->va_access_time, tl); \ + } else { \ + nfsm_build(tl, u_long *, NFSX_UNSIGNED); \ + *tl = txdr_unsigned(NFSV3SATTRTIME_TOSERVER); \ + } \ + } else { \ + nfsm_build(tl, u_long *, NFSX_UNSIGNED); \ + *tl = txdr_unsigned(NFSV3SATTRTIME_DONTCHANGE); \ + } \ + if (VATTR_IS_ACTIVE(vap, va_modify_time)) { \ + if (vap->va_modify_time.tv_sec != now.tv_sec) { \ + nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED); \ + *tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT); \ + txdr_nfsv3time(&vap->va_modify_time, tl); \ + } else { \ + nfsm_build(tl, u_long *, NFSX_UNSIGNED); \ + *tl = txdr_unsigned(NFSV3SATTRTIME_TOSERVER); \ + } \ + } else { \ + nfsm_build(tl, u_long *, NFSX_UNSIGNED); \ + *tl = txdr_unsigned(NFSV3SATTRTIME_DONTCHANGE); \ + } \ } -#define nfsm_strsiz(s,m) \ +#define nfsm_strsiz(s,m,v3) \ { nfsm_dissect(tl,u_long *,NFSX_UNSIGNED); \ - if (((s) = fxdr_unsigned(long,*tl)) > (m)) { \ - m_freem(mrep); \ + (s) = fxdr_unsigned(long,*tl); \ + if (!(v3) && ((s) > (m))) { \ + mbuf_freem(mrep); \ error = EBADRPC; \ goto nfsmout; \ } } @@ -278,9 +378,10 @@ struct mbuf *nfsm_rpchead __P((struct ucred *cr, int nmflag, int procid, nfsm_reply(0); \ } } -#define nfsm_srvnamesiz(s) \ +#define nfsm_srvnamesiz(s,v3) \ { nfsm_dissect(tl,u_long *,NFSX_UNSIGNED); \ - if (((s) = fxdr_unsigned(long,*tl)) > NFS_MAXNAMLEN) \ + (s) = fxdr_unsigned(long,*tl); \ + if (!(v3) && ((s) > NFS_MAXNAMLEN)) \ error = NFSERR_NAMETOL; \ if ((s) <= 0) \ error = EBADRPC; \ @@ -292,67 +393,50 @@ struct mbuf *nfsm_rpchead __P((struct ucred *cr, int nmflag, int procid, if ((s) > 0 && \ (t1 = nfsm_mbuftouio(&md,(p),(s),&dpos))) { \ error = t1; \ - m_freem(mrep); \ + mbuf_freem(mrep); \ goto nfsmout; \ } #define nfsm_uiotom(p,s) \ if ((t1 = nfsm_uiotombuf((p),&mb,(s),&bpos))) { \ error = t1; \ - m_freem(mreq); \ + mbuf_freem(mreq); \ goto nfsmout; \ } -#define nfsm_reqhead(v,a,s) \ - mb = mreq = nfsm_reqh((v),(a),(s),&bpos) +#define nfsm_reqhead(s) \ + error = nfsm_reqh((s), &bpos, &mreq); \ + mb = mreq; -#define nfsm_reqdone m_freem(mrep); \ +#define nfsm_reqdone mbuf_freem(mrep); \ nfsmout: #define nfsm_rndup(a) (((a)+3)&(~0x3)) -/* -* We seem to see cases mainly on shutdown where the vnode got recycled -* on use 
while waiting on server. Maybe nfs vnode locking will help if -* we implement that, but for now, check for bad vnodes and return an -* error. This call spot should catch most of them. Note that NFSv2 -* just goes to nfsmout here, while nfsV3 goes back to caller's next -* line for post-processing. It will do a nfsm_reqdone also making -* m_freem(mrep). Wondering if some of our freeing problems could be -* due to nfsv3 calling nfsm_reqdone unlike nfsv2. Separate problem. -*/ #define nfsm_request(v, t, p, c, x) \ - { \ - int nfsv3; \ - if (!VFSTONFS((v)->v_mount)) { \ - error = ENXIO; \ - goto nfsmout; \ - } \ - nfsv3 = (VFSTONFS((v)->v_mount))->nm_flag & NFSMNT_NFSV3; \ - if ((error = nfs_request((v), mreq, (t), (p), \ + if ((error = nfs_request((v), vnode_mount(v), mreq, (t), (p), \ (c), &mrep, &md, &dpos, (x)))) { \ if (error & NFSERR_RETERR) \ error &= ~NFSERR_RETERR; \ else \ goto nfsmout; \ - } \ } -#define nfsm_strtom(a,s,m) \ - if ((s) > (m)) { \ - m_freem(mreq); \ +#define nfsm_strtom(a,s,m,v3) \ + if (!(v3) && ((s) > (m))) { \ + mbuf_freem(mreq); \ error = ENAMETOOLONG; \ goto nfsmout; \ } \ t2 = nfsm_rndup(s)+NFSX_UNSIGNED; \ - if (t2 <= M_TRAILINGSPACE(mb)) { \ + if (t2 <= mbuf_trailingspace(mb)) { \ nfsm_build(tl,u_long *,t2); \ *tl++ = txdr_unsigned(s); \ *(tl+((t2>>2)-2)) = 0; \ bcopy((caddr_t)(a), (caddr_t)tl, (s)); \ } else if ((t2 = nfsm_strtmbuf(&mb, &bpos, (a), (s)))) { \ error = t2; \ - m_freem(mreq); \ + mbuf_freem(mreq); \ goto nfsmout; \ } @@ -364,68 +448,102 @@ struct mbuf *nfsm_rpchead __P((struct ucred *cr, int nmflag, int procid, { \ nfsd->nd_repstat = error; \ if (error && !(nfsd->nd_flag & ND_NFSV3)) \ - (void) nfs_rephead(0, nfsd, slp, error, cache, &frev, \ - mrq, &mb, &bpos); \ + nfs_rephead(0, nfsd, slp, error, mrq, &mb, &bpos); \ else \ - (void) nfs_rephead((s), nfsd, slp, error, cache, &frev, \ - mrq, &mb, &bpos); \ - m_freem(mrep); \ + nfs_rephead((s), nfsd, slp, error, mrq, &mb, &bpos); \ + mbuf_freem(mrep); \ mrep = NULL; \ mreq = *mrq; \ if (error && (!(nfsd->nd_flag & ND_NFSV3) || \ - error == EBADRPC)) \ - return(0); \ + error == EBADRPC)) { \ + error = 0; \ + goto nfsmout; \ + } \ } #define nfsm_writereply(s, v3) \ { \ nfsd->nd_repstat = error; \ if (error && !(v3)) \ - (void) nfs_rephead(0, nfsd, slp, error, cache, &frev, \ - &mreq, &mb, &bpos); \ + nfs_rephead(0, nfsd, slp, error, &mreq, &mb, &bpos); \ else \ - (void) nfs_rephead((s), nfsd, slp, error, cache, &frev, \ - &mreq, &mb, &bpos); \ + nfs_rephead((s), nfsd, slp, error, &mreq, &mb, &bpos); \ } #define nfsm_adv(s) \ - { t1 = mtod(md, caddr_t)+md->m_len-dpos; \ + { t1 = ((caddr_t)mbuf_data(md)) + mbuf_len(md) - dpos; \ if (t1 >= (s)) { \ dpos += (s); \ } else if ((t1 = nfs_adv(&md, &dpos, (s), t1))) { \ error = t1; \ - m_freem(mrep); \ + mbuf_freem(mrep); \ goto nfsmout; \ } } #define nfsm_srvmtofh(f) \ - { if (nfsd->nd_flag & ND_NFSV3) { \ + { \ + if (nfsd->nd_flag & ND_NFSV3) { \ nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ - if (fxdr_unsigned(int, *tl) != NFSX_V3FH) { \ + (f)->nfh_len = fxdr_unsigned(int, *tl); \ + if (((f)->nfh_len < (int)sizeof(struct nfs_exphandle)) || \ + ((f)->nfh_len > NFSX_V3FHMAX)) { \ error = EBADRPC; \ nfsm_reply(0); \ } \ + } else { \ + (f)->nfh_len = NFSX_V2FH; \ } \ - nfsm_dissect(tl, u_long *, NFSX_V3FH); \ - bcopy((caddr_t)tl, (caddr_t)(f), NFSX_V3FH); \ - if ((nfsd->nd_flag & ND_NFSV3) == 0) \ - nfsm_adv(NFSX_V2FH - NFSX_V3FH); \ + nfsm_dissect(tl, u_long *, (f)->nfh_len); \ + bcopy((caddr_t)tl, (caddr_t)&(f)->nfh_xh, (f)->nfh_len); \ } #define nfsm_clget \ 
if (bp >= be) { \ + int __nfsm_error, __nfsm_len; \ if (mp == mb) \ - mp->m_len += bp-bpos; \ - MGET(mp, M_WAIT, MT_DATA); \ - MCLGET(mp, M_WAIT); \ - mp->m_len = NFSMSIZ(mp); \ - mp2->m_next = mp; \ + mbuf_setlen(mp, mbuf_len(mp) + bp - bpos); \ + mp = NULL; \ + __nfsm_error = mbuf_mclget(MBUF_WAITOK, MBUF_TYPE_DATA, &mp); \ + if (__nfsm_error) \ + panic("nfsm_clget: mbuf_mclget error %d", __nfsm_error); \ + __nfsm_len = mbuf_maxlen(mp); \ + mbuf_setlen(mp, __nfsm_len); \ + __nfsm_error = mbuf_setnext(mp2, mp); \ + if (__nfsm_error) \ + panic("nfsm_clget: mbuf_setnext error %d", __nfsm_error); \ mp2 = mp; \ - bp = mtod(mp, caddr_t); \ - be = bp+mp->m_len; \ + bp = mbuf_data(mp); \ + be = bp + __nfsm_len; \ } \ tl = (u_long *)bp +#define nfsm_srv_vattr_init(vap, v3) \ + { \ + VATTR_INIT(vap); \ + VATTR_WANTED((vap), va_type); \ + VATTR_WANTED((vap), va_mode); \ + VATTR_WANTED((vap), va_nlink); \ + VATTR_WANTED((vap), va_uid); \ + VATTR_WANTED((vap), va_gid); \ + VATTR_WANTED((vap), va_data_size); \ + VATTR_WANTED((vap), va_data_alloc); \ + VATTR_WANTED((vap), va_rdev); \ + VATTR_WANTED((vap), va_fsid); \ + VATTR_WANTED((vap), va_fileid); \ + VATTR_WANTED((vap), va_access_time); \ + VATTR_WANTED((vap), va_modify_time); \ + VATTR_WANTED((vap), va_change_time); \ + if (!v3) VATTR_WANTED((vap), va_iosize); \ + } + +#define nfsm_srv_pre_vattr_init(vap, v3) \ + { \ + VATTR_INIT(vap); \ + VATTR_WANTED((vap), va_data_size); \ + VATTR_WANTED((vap), va_modify_time); \ + VATTR_WANTED((vap), va_change_time); \ + } + #define nfsm_srvfillattr(a, f) \ nfsm_srvfattr(nfsd, (a), (f)) @@ -437,48 +555,49 @@ struct mbuf *nfsm_rpchead __P((struct ucred *cr, int nmflag, int procid, #define nfsm_srvsattr(a) \ { \ - struct timeval now; \ + struct timespec now; \ nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ if (*tl == nfs_true) { \ nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ - (a)->va_mode = nfstov_mode(*tl); \ + VATTR_SET(a, va_mode, nfstov_mode(*tl)); \ } \ nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ if (*tl == nfs_true) { \ nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ - (a)->va_uid = fxdr_unsigned(uid_t, *tl); \ + VATTR_SET(a, va_uid, fxdr_unsigned(uid_t, *tl)); \ } \ nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ if (*tl == nfs_true) { \ nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ - (a)->va_gid = fxdr_unsigned(gid_t, *tl); \ + VATTR_SET(a, va_gid, fxdr_unsigned(gid_t, *tl)); \ } \ nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ if (*tl == nfs_true) { \ nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); \ - fxdr_hyper(tl, &(a)->va_size); \ + fxdr_hyper(tl, &(a)->va_data_size); \ + VATTR_SET_ACTIVE(a, va_data_size); \ } \ nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ - microtime(&now); \ + nanotime(&now); \ switch (fxdr_unsigned(int, *tl)) { \ case NFSV3SATTRTIME_TOCLIENT: \ nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); \ - fxdr_nfsv3time(tl, &(a)->va_atime); \ + fxdr_nfsv3time(tl, &(a)->va_access_time); \ + VATTR_SET_ACTIVE(a, va_access_time); \ break; \ case NFSV3SATTRTIME_TOSERVER: \ - (a)->va_atime.tv_sec = now.tv_sec; \ - (a)->va_atime.tv_nsec = now.tv_usec * 1000; \ + VATTR_SET(a, va_access_time, now); \ break; \ }; \ nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ switch (fxdr_unsigned(int, *tl)) { \ case NFSV3SATTRTIME_TOCLIENT: \ nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); \ - fxdr_nfsv3time(tl, &(a)->va_mtime); \ + fxdr_nfsv3time(tl, &(a)->va_modify_time); \ + VATTR_SET_ACTIVE(a, va_modify_time); \ break; \ case NFSV3SATTRTIME_TOSERVER: \ - (a)->va_mtime.tv_sec = now.tv_sec; \ - (a)->va_mtime.tv_nsec = 
now.tv_usec * 1000; \ + VATTR_SET(a, va_modify_time, now); \ break; \ }; } diff --git a/bsd/nfs/nfsmount.h b/bsd/nfs/nfsmount.h index 577dde099..0c97699ad 100644 --- a/bsd/nfs/nfsmount.h +++ b/bsd/nfs/nfsmount.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -74,14 +74,13 @@ struct nfsmount { int nm_flag; /* Flags for soft/hard... */ int nm_state; /* Internal state flags */ - struct mount *nm_mountp; /* Vfs structure for this filesystem */ + mount_t nm_mountp; /* Vfs structure for this filesystem */ int nm_numgrps; /* Max. size of groupslist */ struct vnode *nm_dvp; /* root directory vnode pointer */ - struct socket *nm_so; /* Rpc socket */ + socket_t nm_so; /* Rpc socket */ int nm_sotype; /* Type of socket */ int nm_soproto; /* and protocol */ - int nm_soflags; /* pr_flags for socket protocol */ - struct mbuf *nm_nam; /* Addr of server */ + mbuf_t nm_nam; /* Addr of server */ int nm_timeo; /* Init timer for NFSMNT_DUMBTIMR */ int nm_retry; /* Max retries */ int nm_srtt[4]; /* Timers for rpcs */ @@ -89,14 +88,14 @@ struct nfsmount { int nm_sent; /* Request send count */ int nm_cwnd; /* Request send window */ int nm_timeouts; /* Request timeouts */ - int nm_deadthresh; /* Threshold of timeouts-->dead server*/ int nm_rsize; /* Max size of read rpc */ int nm_wsize; /* Max size of write rpc */ int nm_readdirsize; /* Size of a readdir rpc */ int nm_readahead; /* Num. of blocks to readahead */ - int nm_leaseterm; /* Term (sec) for NQNFS lease */ - CIRCLEQ_HEAD(, nfsnode) nm_timerhead; /* Head of lease timer queue */ - struct vnode *nm_inprog; /* Vnode in prog by nqnfs_clientd() */ + int nm_acregmin; /* reg file min attr cache timeout */ + int nm_acregmax; /* reg file max attr cache timeout */ + int nm_acdirmin; /* dir min attr cache timeout */ + int nm_acdirmax; /* dir max attr cache timeout */ uid_t nm_authuid; /* Uid for authenticator */ int nm_authtype; /* Authenticator type */ int nm_authlen; /* and length */ @@ -114,14 +113,21 @@ struct nfsmount { int nm_bufqiods; /* number of iods processing queue */ int nm_tprintf_initial_delay; /* delay first "server down" */ int nm_tprintf_delay; /* delay between "server down" */ + struct { /* fsinfo & (homogenous) pathconf info */ + u_int64_t maxfilesize; /* max size of a file */ + u_long linkmax; /* max # hard links to an object */ + u_long namemax; /* max length of filename component */ + u_char pcflags; /* boolean pathconf properties */ + u_char fsproperties; /* fsinfo properties */ + } nm_fsinfo; }; #if defined(KERNEL) /* - * Convert mount ptr to nfsmount ptr. + * Convert mount_t to struct nfsmount* */ -#define VFSTONFS(mp) ((mp) ? ((struct nfsmount *)((mp)->mnt_data)) : NULL) +#define VFSTONFS(mp) ((mp) ? ((struct nfsmount *)vfs_fsprivate(mp)) : NULL) #ifndef NFS_TPRINTF_INITIAL_DELAY #define NFS_TPRINTF_INITIAL_DELAY 12 diff --git a/bsd/nfs/nfsnode.h b/bsd/nfs/nfsnode.h index 53f821217..ada189445 100644 --- a/bsd/nfs/nfsnode.h +++ b/bsd/nfs/nfsnode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
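[Note on the nfsmount changes above: the NQNFS lease fields (nm_leaseterm, nm_timerhead, nm_inprog, nm_deadthresh) give way to attribute-cache timeout bounds, nm_acregmin/nm_acregmax for regular files and nm_acdirmin/nm_acdirmax for directories. The sketch below shows the classic adaptive timeout these bounds usually support -- one tenth of the file's age, clamped to the mount's min/max. This is an assumption about how the fields are consumed, not a copy of Apple's implementation; struct acrange and attrcache_timeout() are invented names.]

/*
 * Common adaptive attribute-cache timeout heuristic (assumed usage
 * of nm_acregmin/nm_acregmax; not taken from this patch).
 */
#include <time.h>

struct acrange { int acmin, acmax; };   /* hypothetical holder */

static int
attrcache_timeout(time_t now, time_t mtime, const struct acrange *ac)
{
    /* Recently modified files get short timeouts; files that have
     * been stable for a long time are trusted longer. */
    int timeo = (int)((now - mtime) / 10);

    if (timeo < ac->acmin)
        timeo = ac->acmin;
    else if (timeo > ac->acmax)
        timeo = ac->acmax;
    return timeo;
}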
* * @APPLE_LICENSE_HEADER_START@ * @@ -69,8 +69,6 @@ #ifndef _NFS_NFS_H_ #include <nfs/nfs.h> #endif -#include <sys/lock.h> - /* * Silly rename structure that hangs off the nfsnode until the name @@ -78,7 +76,7 @@ */ struct sillyrename { struct ucred *s_cred; - struct vnode *s_dvp; + vnode_t s_dvp; long s_namlen; char s_name[20]; }; @@ -107,9 +105,11 @@ struct nfsbuf { LIST_ENTRY(nfsbuf) nb_vnbufs; /* vnode's nfsbuf chain */ TAILQ_ENTRY(nfsbuf) nb_free; /* free list position if not active. */ volatile long nb_flags; /* NB_* flags. */ - time_t nb_timestamp; /* buffer timestamp */ + volatile long nb_lflags; /* NBL_* flags. */ + volatile long nb_refs; /* outstanding references. */ long nb_bufsize; /* buffer size */ - daddr_t nb_lblkno; /* logical block number. */ + daddr64_t nb_lblkno; /* logical block number. */ + time_t nb_timestamp; /* buffer timestamp */ int nb_error; /* errno value. */ u_int32_t nb_valid; /* valid pages in buf */ u_int32_t nb_dirty; /* dirty pages in buf */ @@ -118,20 +118,27 @@ struct nfsbuf { int nb_dirtyoff; /* offset in buffer of dirty region. */ int nb_dirtyend; /* offset of end of dirty region. */ caddr_t nb_data; /* mapped buffer */ - struct vnode * nb_vp; /* device vnode */ - struct proc * nb_proc; /* associated proc; NULL if kernel. */ + vnode_t nb_vp; /* device vnode */ + proc_t nb_proc; /* associated proc; NULL if kernel. */ struct ucred * nb_rcred; /* read credentials reference */ struct ucred * nb_wcred; /* write credentials reference */ void * nb_pagelist; /* upl */ }; +/* + * These flags are kept in b_lflags... + * nfs_buf_mutex must be held before examining/updating + */ +#define NBL_BUSY 0x00000001 /* I/O in progress. */ +#define NBL_WANTED 0x00000002 /* Process wants this buffer. */ + /* * These flags are kept in nb_flags and they're (purposefully) * very similar to the B_* flags for struct buf. + * nfs_buf_mutex is not needed to examine/update these. */ #define NB_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ #define NB_ASYNC 0x00000004 /* Start I/O, do not wait. */ -#define NB_BUSY 0x00000010 /* I/O in progress. */ #define NB_CACHE 0x00000020 /* Bread found us in the cache. */ #define NB_STABLE 0x00000040 /* write FILESYNC not UNSTABLE. */ #define NB_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ @@ -143,19 +150,41 @@ struct nfsbuf { #define NB_NOCACHE 0x00008000 /* Do not cache block after use. */ #define NB_READ 0x00100000 /* Read buffer. */ #define NB_PAGELIST 0x00400000 /* Buffer describes pagelist I/O. */ -#define NB_WANTED 0x00800000 /* Process wants this buffer. */ #define NB_WRITE 0x00000000 /* Write buffer (pseudo flag). */ #define NB_WRITEINPROG 0x01000000 /* Write in progress. */ #define NB_META 0x40000000 /* buffer contains meta-data. */ #define NB_IOD 0x80000000 /* buffer being handled by nfsiod. */ - +/* Flags for operation type in nfs_buf_get() */ +#define NBLK_READ 0x00000001 /* buffer for read */ +#define NBLK_WRITE 0x00000002 /* buffer for write */ +#define NBLK_META 0x00000004 /* buffer for metadata */ +#define NBLK_OPMASK 0x00000007 /* operation mask */ +/* modifiers for above flags... 
*/ +#define NBLK_NOWAIT 0x40000000 /* don't wait on busy buffer */ +#define NBLK_ONLYVALID 0x80000000 /* only return cached buffer */ + +/* These flags are used for nfsbuf iterating */ +#define NBI_ITER 0x01 /* iteration in progress */ +#define NBI_ITERWANT 0x02 /* waiting to iterate */ +#define NBI_CLEAN 0x04 /* requesting clean buffers */ +#define NBI_DIRTY 0x08 /* requesting dirty buffers */ +#define NBI_NOWAIT 0x10 /* don't block on NBI_ITER */ + +/* Flags for nfs_buf_acquire */ +#define NBAC_NOWAIT 0x01 /* Don't wait if buffer is busy */ +#define NBAC_REMOVE 0x02 /* Remove from free list once buffer is acquired */ + +/* some convenience macros... */ #define NBOFF(BP) ((off_t)(BP)->nb_lblkno * (off_t)(BP)->nb_bufsize) #define NBPGVALID(BP,P) (((BP)->nb_valid >> (P)) & 0x1) #define NBPGDIRTY(BP,P) (((BP)->nb_dirty >> (P)) & 0x1) #define NBPGVALID_SET(BP,P) ((BP)->nb_valid |= (1 << (P))) #define NBPGDIRTY_SET(BP,P) ((BP)->nb_dirty |= (1 << (P))) +#define NBUFSTAMPVALID(BP) ((BP)->nb_timestamp != ~0) +#define NBUFSTAMPINVALIDATE(BP) ((BP)->nb_timestamp = ~0) + #define NFS_BUF_MAP(BP) \ do { \ if (!(BP)->nb_data && nfs_buf_map(BP)) \ @@ -167,33 +196,58 @@ TAILQ_HEAD(nfsbuffreehead, nfsbuf); #define NFSNOLIST ((struct nfsbuf *)0xdeadbeef) -extern int nfsbufhashlock, nfsbufcnt, nfsbufmin, nfsbufmax; +extern lck_mtx_t *nfs_buf_mutex; +extern int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax; extern int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer; extern int nfs_nbdwrite; extern struct nfsbuffreehead nfsbuffree, nfsbufdelwri; extern time_t nfsbuffreeuptimestamp; -#define NFSBUFCNTCHK() \ +#define NFSBUFCNTCHK(locked) \ do { \ + if (!locked) lck_mtx_lock(nfs_buf_mutex); \ if ( (nfsbufcnt < 0) || \ (nfsbufcnt > nfsbufmax) || \ + (nfsbufmetacnt < 0) || \ + (nfsbufmetacnt > nfsbufmetamax) || \ + (nfsbufmetacnt > nfsbufcnt) || \ (nfsbuffreecnt < 0) || \ (nfsbuffreecnt > nfsbufmax) || \ (nfsbuffreecnt > nfsbufcnt) || \ (nfsbuffreemetacnt < 0) || \ (nfsbuffreemetacnt > nfsbufmax) || \ (nfsbuffreemetacnt > nfsbufcnt) || \ + (nfsbuffreemetacnt > nfsbufmetamax) || \ + (nfsbuffreemetacnt > nfsbufmetacnt) || \ (nfsbufdelwricnt < 0) || \ (nfsbufdelwricnt > nfsbufmax) || \ (nfsbufdelwricnt > nfsbufcnt) || \ (nfs_nbdwrite < 0) || \ (nfs_nbdwrite > nfsbufcnt) || \ 0) \ - panic("nfsbuf count error: max %d cnt %d free %d meta %d delwr %d bdw %d\n", \ - nfsbufmax, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, \ + panic("nfsbuf count error: max %d meta %d cnt %d meta %d free %d meta %d delwr %d bdw %d\n", \ + nfsbufmax, nfsbufmetamax, nfsbufcnt, nfsbufmetacnt, nfsbuffreecnt, nfsbuffreemetacnt, \ nfsbufdelwricnt, nfs_nbdwrite); \ + if (!locked) lck_mtx_unlock(nfs_buf_mutex); \ } while (0) +struct nfs_vattr { + enum vtype nva_type; /* vnode type (for create) */ + u_short nva_mode; /* files access mode and type */ + dev_t nva_rdev; /* device the special file represents */ + uid_t nva_uid; /* owner user id */ + gid_t nva_gid; /* owner group id */ + uint32_t nva_fsid; /* file system id (dev for now) */ + uint64_t nva_nlink; /* number of references to file */ + uint64_t nva_fileid; /* file id */ + uint64_t nva_size; /* file size in bytes */ + uint64_t nva_bytes; /* bytes of disk space held by file */ + uint32_t nva_blocksize; /* blocksize preferred for i/o */ + struct timespec nva_atime; /* time of last access */ + struct timespec nva_mtime; /* time of last modification */ + struct timespec nva_ctime; /* time file changed */ +}; + /* * The nfsnode is the nfs equivalent to ufs's 
inode. Any similarity * is purely coincidental. @@ -202,30 +256,25 @@ extern time_t nfsbuffreeuptimestamp; * An nfsnode is 'named' by its file handle. (nget/nfs_node.c) * If this structure exceeds 256 bytes (it is currently 256 using 4.4BSD-Lite * type definitions), file handles of > 32 bytes should probably be split out - * into a separate MALLOC()'d data structure. (Reduce the size of nfsfh_t by - * changing the definition in sys/mount.h of NFS_SMALLFH.) + * into a separate MALLOC()'d data structure. (Reduce the size of nfsnode.n_fh + * by changing the definition in nfsproto.h of NFS_SMALLFH.) * NB: Hopefully the current order of the fields is such that everything will * be well aligned and, therefore, tightly packed. */ struct nfsnode { - struct lock__bsd__ n_lock; /* the vnode lock */ LIST_ENTRY(nfsnode) n_hash; /* Hash chain */ - CIRCLEQ_ENTRY(nfsnode) n_timer; /* Nqnfs timer chain */ u_quad_t n_size; /* Current size of file */ - u_quad_t n_brev; /* Modify rev when cached */ - u_quad_t n_lrev; /* Modify rev for lease */ - struct vattr n_vattr; /* Vnode attribute cache */ + struct nfs_vattr n_vattr; /* Vnode attribute cache */ time_t n_attrstamp; /* Attr. cache timestamp */ u_int32_t n_mode; /* ACCESS mode cache */ uid_t n_modeuid; /* credentials having mode */ time_t n_modestamp; /* mode cache timestamp */ - time_t n_mtime; /* Prev modify time. */ - time_t n_ncmtime; /* namecache modify time. */ - time_t n_expiry; /* Lease expiry time */ - nfsfh_t *n_fhp; /* NFS File Handle */ + struct timespec n_mtime; /* Prev modify time. */ + struct timespec n_ncmtime; /* namecache modify time. */ + u_char *n_fhp; /* NFS File Handle */ union { - struct vnode *n_vp; /* associated vnode */ - struct mount *n_mp; /* associated mount (NINIT) */ + vnode_t n_vp; /* associated vnode */ + mount_t n_mp; /* associated mount (NINIT) */ } n_un0; struct lockf *n_lockf; /* Locking record of file */ int n_error; /* Save write error value */ @@ -243,13 +292,19 @@ struct nfsnode { } n_un3; short n_fhsize; /* size in bytes, of fh */ short n_flag; /* Flag for locking.. */ - nfsfh_t n_fh; /* Small File Handle */ + u_char n_fh[NFS_SMALLFH];/* Small File Handle */ u_int64_t n_xid; /* last xid to loadattr */ struct nfsbuflists n_cleanblkhd; /* clean blocklist head */ struct nfsbuflists n_dirtyblkhd; /* dirty blocklist head */ int n_needcommitcnt;/* # bufs that need committing */ + int n_bufiterflags; /* buf iterator flags */ }; +#define nfstimespeccmp(tvp, uvp, cmp) \ + (((tvp)->tv_sec == (uvp)->tv_sec) ? 
\ + ((tvp)->tv_nsec cmp (uvp)->tv_nsec) : \ + ((tvp)->tv_sec cmp (uvp)->tv_sec)) + #define CHECK_NEEDCOMMITCNT(np) \ do { \ if ((np)->n_needcommitcnt < 0) { \ @@ -274,9 +329,9 @@ struct nfsnode { #define NFLUSHINPROG 0x0002 /* Avoid multiple calls to vinvalbuf() */ #define NMODIFIED 0x0004 /* Might have a modified buffer in bio */ #define NWRITEERR 0x0008 /* Flag write errors so close will know */ -#define NQNFSNONCACHE 0x0020 /* Non-cachable lease */ -#define NQNFSWRITE 0x0040 /* Write lease */ -#define NQNFSEVICTED 0x0080 /* Has been evicted */ +#define NNEEDINVALIDATE 0x0010 /* need to call vinvalbuf() */ +#define NNOCACHE 0x0020 /* all bufs are uncached */ +#define NWRBUSY 0x0040 /* node in write/fsync */ #define NACC 0x0100 /* Special file accessed */ #define NUPD 0x0200 /* Special file updated */ #define NCHG 0x0400 /* Special file times changed */ @@ -284,64 +339,86 @@ struct nfsnode { #define NINIT 0x2000 /* node is being initialized */ #define NWINIT 0x4000 /* someone waiting for init to complete */ +#define NATTRVALID(np) ((np)->n_attrstamp != ~0) +#define NATTRINVALIDATE(np) ((np)->n_attrstamp = ~0) +#define NMODEVALID(np) ((np)->n_modestamp != ~0) +#define NMODEINVALIDATE(np) ((np)->n_modestamp = ~0) + +#define NVALIDBUFS(np) (!LIST_EMPTY(&(np)->n_dirtyblkhd) || \ + !LIST_EMPTY(&(np)->n_cleanblkhd)) + +/* + * NFS-specific flags for nfs_vinvalbuf/nfs_flush + */ +#define V_IGNORE_WRITEERR 0x8000 + +/* + * Flags for nfs_nget() + */ +#define NG_MARKROOT 0x0001 /* mark vnode as root of FS */ +#define NG_MAKEENTRY 0x0002 /* add name cache entry for vnode */ + /* * Convert between nfsnode pointers and vnode pointers */ -#define VTONFS(vp) ((struct nfsnode *)(vp)->v_data) -#define NFSTOV(np) ((struct vnode *)(np)->n_vnode) +#define VTONFS(vp) ((struct nfsnode *)vnode_fsnode(vp)) +#define NFSTOV(np) ((np)->n_vnode) + +/* nfsnode hash table mutex */ +extern lck_mtx_t *nfs_node_hash_mutex; /* - * Queue head for nfsiod's + * nfsiod structures */ -extern TAILQ_HEAD(nfs_bufq, buf) nfs_bufq; -extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; +extern proc_t nfs_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON]; +extern lck_grp_t *nfs_iod_lck_grp; +extern lck_grp_attr_t *nfs_iod_lck_grp_attr; +extern lck_attr_t *nfs_iod_lck_attr; +extern lck_mtx_t *nfs_iod_mutex; #if defined(KERNEL) -typedef int vop_t __P((void *)); -extern vop_t **fifo_nfsv2nodeop_p; -extern vop_t **nfsv2_vnodeop_p; -extern vop_t **spec_nfsv2nodeop_p; +typedef int vnop_t(void *); +extern vnop_t **fifo_nfsv2nodeop_p; +extern vnop_t **nfsv2_vnodeop_p; +extern vnop_t **spec_nfsv2nodeop_p; /* * Prototypes for NFS vnode operations */ -int nfs_write __P((struct vop_write_args *)); -#define nfs_lease_check ((int (*) __P((struct vop_lease_args *)))nullop) -#define nqnfs_vop_lease_check lease_check -int nqnfs_vop_lease_check __P((struct vop_lease_args *)); -#define nfs_revoke vop_revoke -#define nfs_seek ((int (*) __P((struct vop_seek_args *)))nullop) -int nfs_inactive __P((struct vop_inactive_args *)); -int nfs_reclaim __P((struct vop_reclaim_args *)); -int nfs_lock __P((struct vop_lock_args *)); -int nfs_unlock __P((struct vop_unlock_args *)); -int nfs_islocked __P((struct vop_islocked_args *)); - -#define nfs_reallocblks \ - ((int (*) __P((struct vop_reallocblks_args *)))eopnotsupp) +int nfs_write(struct vnop_write_args *); +#define nfs_revoke nop_revoke +#define nfs_seek ((int (*)(struct vnop_seek_args *))nullop) //XXXdead? 
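[Note on the nfsnode macros above: the new NATTRVALID/NATTRINVALIDATE and NMODEVALID/NMODEINVALIDATE macros use ~0 as an "invalid" sentinel in the timestamp fields, and nfstimespeccmp compares struct timespec values with full nanosecond precision now that n_mtime/n_ncmtime are timespecs rather than time_t. A small standalone rendering of both patterns (struct attrcache and the function names are illustrative only):]

/*
 * Sentinel-timestamp and timespec-comparison patterns, as used by
 * NATTRVALID()/NATTRINVALIDATE() and nfstimespeccmp() above.
 */
#include <time.h>

#define STAMP_INVALID  ((time_t)~0)

struct attrcache {
    time_t          stamp;      /* like n_attrstamp */
    struct timespec mtime;      /* like n_mtime */
};

static int
timespec_lt(const struct timespec *a, const struct timespec *b)
{
    /* same shape as nfstimespeccmp(a, b, <) */
    return (a->tv_sec == b->tv_sec) ? (a->tv_nsec < b->tv_nsec)
                                    : (a->tv_sec < b->tv_sec);
}

static int
attrs_usable(const struct attrcache *c, time_t now, int timeo)
{
    if (c->stamp == STAMP_INVALID)      /* NATTRVALID() is false */
        return 0;
    return (now - c->stamp) < timeo;
}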
+int nfs_inactive(struct vnop_inactive_args *); +int nfs_reclaim(struct vnop_reclaim_args *); -/* other stuff */ -int nfs_removeit __P((struct sillyrename *)); -int nfs_nget __P((struct mount *,nfsfh_t *,int,struct nfsnode **)); -nfsuint64 *nfs_getcookie __P((struct nfsnode *, off_t, int)); -void nfs_invaldir __P((struct vnode *)); -#define nqnfs_lease_updatetime lease_updatetime +/* other stuff */ +int nfs_removeit(struct sillyrename *); +int nfs_nget(mount_t,vnode_t,struct componentname *,u_char *,int,struct nfs_vattr *,u_int64_t *,int,struct nfsnode **); +nfsuint64 *nfs_getcookie(struct nfsnode *, off_t, int); +void nfs_invaldir(vnode_t); /* nfsbuf functions */ void nfs_nbinit(void); void nfs_buf_remfree(struct nfsbuf *); -struct nfsbuf * nfs_buf_incore(struct vnode *, daddr_t); -struct nfsbuf * nfs_buf_get(struct vnode *, daddr_t, int, struct proc *, int); +boolean_t nfs_buf_is_incore(vnode_t, daddr64_t); +struct nfsbuf * nfs_buf_incore(vnode_t, daddr64_t); +int nfs_buf_get(vnode_t, daddr64_t, int, proc_t, int, struct nfsbuf **); int nfs_buf_upl_setup(struct nfsbuf *bp); void nfs_buf_upl_check(struct nfsbuf *bp); void nfs_buf_release(struct nfsbuf *, int); int nfs_buf_iowait(struct nfsbuf *); void nfs_buf_iodone(struct nfsbuf *); -void nfs_buf_write_delayed(struct nfsbuf *); +void nfs_buf_write_delayed(struct nfsbuf *, proc_t); void nfs_buf_freeup(int); +void nfs_buf_refget(struct nfsbuf *bp); +void nfs_buf_refrele(struct nfsbuf *bp); +void nfs_buf_drop(struct nfsbuf *); +errno_t nfs_buf_acquire(struct nfsbuf *, int, int, int); +int nfs_buf_iterprepare(struct nfsnode *, struct nfsbuflists *, int); +void nfs_buf_itercomplete(struct nfsnode *, struct nfsbuflists *, int); #endif /* KERNEL */ diff --git a/bsd/nfs/nfsproto.h b/bsd/nfs/nfsproto.h index eebd0ee26..d44115245 100644 --- a/bsd/nfs/nfsproto.h +++ b/bsd/nfs/nfsproto.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -85,6 +85,7 @@ #define NFS_VER3 3 #define NFS_V2MAXDATA 8192 #define NFS_MAXDGRAMDATA 16384 +#define NFS_PREFDGRAMDATA 8192 #define NFS_MAXDATA (60*1024) // XXX not ready for 64K-128K #define NFS_MAXPATHLEN 1024 #define NFS_MAXNAMLEN 255 @@ -145,11 +146,9 @@ #define NFSX_V2STATFS 20 /* specific to NFS Version 3 */ -#define NFSX_V3FH (sizeof (fhandle_t)) /* size this server uses */ #define NFSX_V3FHMAX 64 /* max. allowed by protocol */ #define NFSX_V3FATTR 84 #define NFSX_V3SATTR 60 /* max. all fields filled in */ -#define NFSX_V3SRVSATTR (sizeof (struct nfsv3_sattr)) #define NFSX_V3POSTOPATTR (NFSX_V3FATTR + NFSX_UNSIGNED) #define NFSX_V3WCCDATA (NFSX_V3POSTOPATTR + 8 * NFSX_UNSIGNED) #define NFSX_V3COOKIEVERF 8 @@ -162,7 +161,7 @@ /* variants for both versions */ #define NFSX_FH(v3) ((v3) ? (NFSX_V3FHMAX + NFSX_UNSIGNED) : \ NFSX_V2FH) -#define NFSX_SRVFH(v3) ((v3) ? NFSX_V3FH : NFSX_V2FH) +#define NFSX_SRVFH(v3,FH) ((v3) ? (FH)->nfh_len : NFSX_V2FH) #define NFSX_FATTR(v3) ((v3) ? NFSX_V3FATTR : NFSX_V2FATTR) #define NFSX_PREOPATTR(v3) ((v3) ? (7 * NFSX_UNSIGNED) : 0) #define NFSX_POSTOPATTR(v3) ((v3) ? 
(NFSX_V3FATTR + NFSX_UNSIGNED) : 0) @@ -206,13 +205,8 @@ #endif /* !NFS_PROGRAM */ -/* And leasing (nqnfs) procedure numbers (must be last) */ -#define NQNFSPROC_GETLEASE 22 -#define NQNFSPROC_VACATED 23 -#define NQNFSPROC_EVICTED 24 - -#define NFSPROC_NOOP 25 -#define NFS_NPROCS 26 +#define NFSPROC_NOOP 22 +#define NFS_NPROCS 23 /* Actual Version 2 procedure numbers */ #define NFSV2PROC_NULL 0 @@ -264,8 +258,8 @@ /* Conversion macros */ #define vtonfsv2_mode(t,m) \ - txdr_unsigned(((t) == VFIFO) ? MAKEIMODE(VCHR, (m)) : \ - MAKEIMODE((t), (m))) + txdr_unsigned(((t) == VFIFO) ? vnode_makeimode(VCHR, (m)) : \ + vnode_makeimode((t), (m))) #define vtonfsv3_mode(m) txdr_unsigned((m) & 07777) #define nfstov_mode(a) (fxdr_unsigned(u_short, (a))&07777) #define vtonfsv2_type(a) txdr_unsigned(nfsv2_type[((long)(a))]) @@ -292,11 +286,6 @@ typedef enum { NFNON=0, NFREG=1, NFDIR=2, NFBLK=3, NFCHR=4, NFLNK=5, #ifndef NFS_SMALLFH #define NFS_SMALLFH 64 #endif -union nfsfh { - fhandle_t fh_generic; - u_char fh_bytes[NFS_SMALLFH]; -}; -typedef union nfsfh nfsfh_t; struct nfsv2_time { u_long nfsv2_sec; @@ -405,21 +394,6 @@ struct nfsv2_sattr { nfstime2 sa_mtime; }; -/* - * NFS Version 3 sattr structure for the new node creation case. - */ -struct nfsv3_sattr { - u_long sa_modetrue; - u_long sa_mode; - u_long sa_uidtrue; - u_long sa_uid; - u_long sa_gidtrue; - u_long sa_gid; - u_long sa_sizefalse; - u_long sa_atimetype; - u_long sa_mtimetype; -}; - struct nfs_statfs { union { struct { diff --git a/bsd/nfs/nfsrtt.h b/bsd/nfs/nfsrtt.h index 1cebaf787..8e00d78ca 100644 --- a/bsd/nfs/nfsrtt.h +++ b/bsd/nfs/nfsrtt.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -104,7 +104,6 @@ struct nfsrtt { /* * Bits for the flags field. */ -#define DRT_NQNFS 0x01 /* Rpc used Nqnfs protocol */ #define DRT_TCP 0x02 /* Client used TCP transport */ #define DRT_CACHEREPLY 0x04 /* Reply was from recent request cache */ #define DRT_CACHEDROP 0x08 /* Rpc request dropped, due to recent reply */ diff --git a/bsd/nfs/nfsrvcache.h b/bsd/nfs/nfsrvcache.h index 1e7f97766..b18671041 100644 --- a/bsd/nfs/nfsrvcache.h +++ b/bsd/nfs/nfsrvcache.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -77,7 +77,7 @@ struct nfsrvcache { LIST_ENTRY(nfsrvcache) rc_hash; /* Hash chain */ u_long rc_xid; /* rpc id number */ union { - struct mbuf *ru_repmb; /* Reply mbuf list OR */ + mbuf_t ru_repmb; /* Reply mbuf list OR */ int ru_repstat; /* Reply status */ } rc_un; union nethostaddr rc_haddr; /* Host address */ @@ -107,7 +107,6 @@ struct nfsrvcache { #define RC_WANTED 0x02 #define RC_REPSTATUS 0x04 #define RC_REPMBUF 0x08 -#define RC_NQNFS 0x10 #define RC_INETADDR 0x20 #define RC_NAM 0x40 diff --git a/bsd/nfs/nlminfo.h b/bsd/nfs/nlminfo.h deleted file mode 100644 index d149664da..000000000 --- a/bsd/nfs/nlminfo.h +++ /dev/null @@ -1,52 +0,0 @@ -/*- - * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Berkeley Software Design Inc's name may not be used to endorse or - * promote products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * from BSDI nlminfo.h,v 2.1 1998/03/18 01:30:38 don Exp - * $FreeBSD: src/sys/nfsclient/nlminfo.h,v 1.1 2001/04/17 20:45:22 alfred Exp $ - */ - -#include <sys/appleapiopts.h> - -#ifdef __APPLE_API_PRIVATE - -/* - * Misc NLM information, some needed for the master lockd process, and some - * needed by every process doing nlm based locking. - */ -struct nlminfo { - /* these are used by any process doing nlm locking */ - int msg_seq; /* sequence counter for lock requests */ - int retcode; /* return code for lock requests */ - int set_getlk; - int getlk_pid; - off_t getlk_start; - off_t getlk_len; - struct timeval pid_start; /* process starting time */ - struct timeval nlm_lockstart; /* XXX debug */ -}; - -extern void nlminfo_release(struct proc *p); -#endif /* __APPLE_API_PRIVATE */ diff --git a/bsd/nfs/nqnfs.h b/bsd/nfs/nqnfs.h deleted file mode 100644 index bb432511a..000000000 --- a/bsd/nfs/nqnfs.h +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1992, 1993 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Rick Macklem at The University of Guelph. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)nqnfs.h 8.3 (Berkeley) 3/30/95 - * FreeBSD-Id: nqnfs.h,v 1.14 1997/08/16 19:16:05 wollman Exp $ - */ - - -#ifndef _NFS_NQNFS_H_ -#define _NFS_NQNFS_H_ - -#include <sys/appleapiopts.h> - -#ifdef __APPLE_API_PRIVATE -/* - * Definitions for NQNFS (Not Quite NFS) cache consistency protocol. - */ - -/* Tunable constants */ -#define NQ_CLOCKSKEW 3 /* Clock skew factor (sec) */ -#define NQ_WRITESLACK 5 /* Delay for write cache flushing */ -#define NQ_MAXLEASE 60 /* Max lease duration (sec) */ -#define NQ_MINLEASE 5 /* Min lease duration (sec) */ -#define NQ_DEFLEASE 30 /* Default lease duration (sec) */ -#define NQ_RENEWAL 3 /* Time before expiry (sec) to renew */ -#define NQ_TRYLATERDEL 15 /* Initial try later delay (sec) */ -#define NQ_MAXNUMLEASE 2048 /* Upper bound on number of server leases */ -#define NQ_DEADTHRESH NQ_NEVERDEAD /* Default nm_deadthresh */ -#define NQ_NEVERDEAD 9 /* Greater than max. nm_timeouts */ -#define NQLCHSZ 256 /* Server hash table size */ - -#define NQNFS_PROG 300105 /* As assigned by Sun */ -#define NQNFS_VER3 3 -#define NQNFS_EVICTSIZ 156 /* Size of eviction request in bytes */ - -/* - * Definitions used for saving the "last lease expires" time in Non-volatile - * RAM on the server. The default definitions below assume that NOVRAM is not - * available. - */ -#ifdef HASNVRAM -# undef HASNVRAM -#endif -#define NQSTORENOVRAM(t) -#define NQLOADNOVRAM(t) - -/* - * Defn and structs used on the server to maintain state for current leases. - * The list of host(s) that hold the lease are kept as nqhost structures. - * The first one lives in nqlease and any others are held in a linked - * list of nqm structures hanging off of nqlease. - * - * Each nqlease structure is chained into two lists. The first is a list - * ordered by increasing expiry time for nqsrv_timer() and the second is a chain - * hashed on lc_fh. 
- */ -#define LC_MOREHOSTSIZ 10 - -struct nqhost { - union { - struct { - u_short udp_flag; - u_short udp_port; - union nethostaddr udp_haddr; - } un_udp; - struct { - u_short connless_flag; - u_short connless_spare; - union nethostaddr connless_haddr; - } un_connless; - struct { - u_short conn_flag; - u_short conn_spare; - struct nfssvc_sock *conn_slp; - } un_conn; - } lph_un; -}; -#define lph_flag lph_un.un_udp.udp_flag -#define lph_port lph_un.un_udp.udp_port -#define lph_haddr lph_un.un_udp.udp_haddr -#define lph_inetaddr lph_un.un_udp.udp_haddr.had_inetaddr -#define lph_claddr lph_un.un_connless.connless_haddr -#define lph_nam lph_un.un_connless.connless_haddr.had_nam -#define lph_slp lph_un.un_conn.conn_slp - -struct nqlease { - LIST_ENTRY(nqlease) lc_hash; /* Fhandle hash list */ - CIRCLEQ_ENTRY(nqlease) lc_timer; /* Timer queue list */ - time_t lc_expiry; /* Expiry time (sec) */ - struct nqhost lc_host; /* Host that got lease */ - struct nqm *lc_morehosts; /* Other hosts that share read lease */ - fsid_t lc_fsid; /* Fhandle */ - char lc_fiddata[MAXFIDSZ]; - struct vnode *lc_vp; /* Soft reference to associated vnode */ -}; -#define lc_flag lc_host.lph_un.un_udp.udp_flag - -/* lc_flag bits */ -#define LC_VALID 0x0001 /* Host address valid */ -#define LC_WRITE 0x0002 /* Write cache */ -#define LC_NONCACHABLE 0x0004 /* Non-cachable lease */ -#define LC_LOCKED 0x0008 /* Locked */ -#define LC_WANTED 0x0010 /* Lock wanted */ -#define LC_EXPIREDWANTED 0x0020 /* Want lease when expired */ -#define LC_UDP 0x0040 /* Host address for udp socket */ -#define LC_CLTP 0x0080 /* Host address for other connectionless */ -#define LC_LOCAL 0x0100 /* Host is server */ -#define LC_VACATED 0x0200 /* Host has vacated lease */ -#define LC_WRITTEN 0x0400 /* Recently wrote to the leased file */ -#define LC_SREF 0x0800 /* Holds a nfssvc_sock reference */ - -struct nqm { - struct nqm *lpm_next; - struct nqhost lpm_hosts[LC_MOREHOSTSIZ]; -}; - -/* - * Special value for slp for local server calls. - */ -#define NQLOCALSLP ((struct nfssvc_sock *) -1) - -/* - * Server side macros. - */ -#define nqsrv_getl(v, l) \ - (void) nqsrv_getlease((v), &nfsd->nd_duration, \ - ((nfsd->nd_flag & ND_LEASE) ? (nfsd->nd_flag & ND_LEASE) : \ - ((l) | ND_CHECK)), \ - slp, procp, nfsd->nd_nam, &cache, &frev, cred) - -/* - * Client side macros that check for a valid lease. - */ -#define NQNFS_CKINVALID(v, n, f) \ - ((time.tv_sec > (n)->n_expiry && \ - VFSTONFS((v)->v_mount)->nm_timeouts < VFSTONFS((v)->v_mount)->nm_deadthresh) \ - || ((f) == ND_WRITE && ((n)->n_flag & NQNFSWRITE) == 0)) - -#define NQNFS_CKCACHABLE(v, f) \ - ((time.tv_sec <= VTONFS(v)->n_expiry || \ - VFSTONFS((v)->v_mount)->nm_timeouts >= VFSTONFS((v)->v_mount)->nm_deadthresh) \ - && (VTONFS(v)->n_flag & NQNFSNONCACHE) == 0 && \ - ((f) == ND_READ || (VTONFS(v)->n_flag & NQNFSWRITE))) - -#define NQNFS_NEEDLEASE(v, p) \ - (time.tv_sec > VTONFS(v)->n_expiry ? \ - ((VTONFS(v)->n_flag & NQNFSEVICTED) ? 0 : nqnfs_piggy[p]) : \ - (((time.tv_sec + NQ_RENEWAL) > VTONFS(v)->n_expiry && \ - nqnfs_piggy[p]) ? \ - ((VTONFS(v)->n_flag & NQNFSWRITE) ? \ - ND_WRITE : nqnfs_piggy[p]) : 0)) - -/* - * List head for timer queue. - */ -extern CIRCLEQ_HEAD(nqtimerhead, nqlease) nqtimerhead; - -/* - * List head for the file handle hash table. - */ -#define NQFHHASH(f) \ - (&nqfhhashtbl[(*((u_long *)(f))) & nqfhhash]) -extern LIST_HEAD(nqfhhashhead, nqlease) *nqfhhashtbl; -extern u_long nqfhhash; - -/* - * Nqnfs return status numbers. 
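For context on the server-side machinery this deletion removes: a lease was located by hashing the file handle into nqfhhashtbl and walking the short lc_hash chain, while expiry processing consumed the time-ordered lc_timer queue from the front. Below is a minimal sketch of that lookup, assuming kernel context and the 4.4BSD fhandle_t layout (fh_fsid/fh_fid); the helper name is hypothetical, not from xnu, though the loop mirrors what nqsrv_getlease() did.

/*
 * Hypothetical sketch of the lc_hash lookup described above.
 */
static struct nqlease *
nqsrv_lookup_lease(fhandle_t *fhp)
{
	struct nqlease *lp;

	for (lp = NQFHHASH(fhp->fh_fid.fid_data)->lh_first;
	    lp != NULL; lp = lp->lc_hash.le_next) {
		if (fhp->fh_fsid.val[0] == lp->lc_fsid.val[0] &&
		    fhp->fh_fsid.val[1] == lp->lc_fsid.val[1] &&
		    !bcmp(fhp->fh_fid.fid_data, lp->lc_fiddata,
			fhp->fh_fid.fid_len - sizeof(int32_t)))
			return (lp);	/* a lease exists for this file */
	}
	return (NULL);
}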
- */ -#define NQNFS_EXPIRED 500 -#define NQNFS_TRYLATER 501 - -#if defined(KERNEL) -void nqnfs_lease_check __P((struct vnode *, struct proc *, struct ucred *, int)); -void nqnfs_lease_updatetime __P((int)); -int nqsrv_getlease __P((struct vnode *, u_long *, int, - struct nfssvc_sock *, struct proc *, - struct mbuf *, int *, u_quad_t *, - struct ucred *)); -int nqnfs_getlease __P((struct vnode *,int,struct ucred *,struct proc *)); -int nqnfs_callback __P((struct nfsmount *,struct mbuf *,struct mbuf *,caddr_t)); -int nqnfs_clientd __P((struct nfsmount *,struct ucred *,struct nfsd_cargs *,int,caddr_t,struct proc *)); -struct nfsnode; -void nqnfs_clientlease __P((struct nfsmount *, struct nfsnode *, int, int, time_t, u_quad_t)); -void nqnfs_serverd __P((void)); -int nqnfsrv_getlease __P((struct nfsrv_descript *, struct nfssvc_sock *, struct proc *, struct mbuf **)); -int nqnfsrv_vacated __P((struct nfsrv_descript *, struct nfssvc_sock *, struct proc *, struct mbuf **)); -#endif - -#endif /* __APPLE_API_PRIVATE */ -#endif /* _NFS_NQNFS_H_ */ diff --git a/bsd/nfs/rpcv2.h b/bsd/nfs/rpcv2.h index d7a7a7df3..7bd0cca92 100644 --- a/bsd/nfs/rpcv2.h +++ b/bsd/nfs/rpcv2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -79,7 +79,6 @@ #define RPCAUTH_UNIX 1 #define RPCAUTH_SHORT 2 #define RPCAUTH_KERB4 4 -#define RPCAUTH_NQNFS 300000 #define RPCAUTH_MAXSIZ 400 #define RPCVERF_MAXSIZ 12 /* For Kerb, can actually be 400 */ #define RPCAUTH_UNIXGIDS 16 diff --git a/bsd/nfs/xdr_subs.h b/bsd/nfs/xdr_subs.h index e786cb8c7..f6bc3748e 100644 --- a/bsd/nfs/xdr_subs.h +++ b/bsd/nfs/xdr_subs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -22,10 +22,7 @@ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ /* * Copyright (c) 1989, 1993 - * The Regents of the University of California. All rights reserved. - * - * The NEXTSTEP Software License Agreement specifies the terms - * and conditions for redistribution. + * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. @@ -40,8 +37,8 @@ * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. + * This product includes software developed by the University of + * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. @@ -58,9 +55,10 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - @(#)xdr_subs.h 8.3 (Berkeley) 3/30/95 + @(#)xdr_subs.h 8.3 (Berkeley) 3/30/95 * FreeBSD-Id: xdr_subs.h,v 1.9 1997/02/22 09:42:53 peter Exp $ */ + #ifndef _NFS_XDR_SUBS_H_ diff --git a/bsd/ppc/Makefile b/bsd/ppc/Makefile index 790667692..633b7a521 100644 --- a/bsd/ppc/Makefile +++ b/bsd/ppc/Makefile @@ -8,15 +8,21 @@ include $(MakeInc_cmd) include $(MakeInc_def) DATAFILES = \ - cpu.h disklabel.h endian.h exec.h label_t.h param.h profile.h \ - psl.h ptrace.h reboot.h reg.h setjmp.h signal.h spl.h \ - table.h types.h ucontext.h user.h vmparam.h + endian.h param.h profile.h \ + setjmp.h signal.h \ + types.h ucontext.h vmparam.h _types.h + +KERNELFILES = \ + endian.h param.h profile.h \ + signal.h \ + types.h vmparam.h _types.h INSTALL_MD_LIST = ${DATAFILES} +INSTALL_MD_LCL_LIST = ${DATAFILES} disklabel.h INSTALL_MD_DIR = ppc -EXPORT_MD_LIST = ${DATAFILES} +EXPORT_MD_LIST = ${KERNELFILES} EXPORT_MD_DIR = ppc diff --git a/bsd/ppc/_types.h b/bsd/ppc/_types.h new file mode 100644 index 000000000..337362194 --- /dev/null +++ b/bsd/ppc/_types.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#ifndef _BSD_PPC__TYPES_H_ +#define _BSD_PPC__TYPES_H_ + +/* + * This header file contains integer types. It's intended to also contain + * floating point and other arithmetic types, as needed, later. + */ + +#ifdef __GNUC__ +typedef __signed char __int8_t; +#else /* !__GNUC__ */ +typedef char __int8_t; +#endif /* !__GNUC__ */ +typedef unsigned char __uint8_t; +typedef short __int16_t; +typedef unsigned short __uint16_t; +typedef int __int32_t; +typedef unsigned int __uint32_t; +typedef long long __int64_t; +typedef unsigned long long __uint64_t; + +typedef long __darwin_intptr_t; +typedef unsigned int __darwin_natural_t; + +/* + * The rune type below is declared to be an ``int'' instead of the more natural + * ``unsigned long'' or ``long''. Two things are happening here. It is not + * unsigned so that EOF (-1) can be naturally assigned to it and used. Also, + * it looks like 10646 will be a 31 bit standard. This means that if your + * ints cannot hold 32 bits, you will be in trouble. The reason an int was + * chosen over a long is that the is*() and to*() routines take ints (says + * ANSI C), but they use __darwin_ct_rune_t instead of int. By changing it + * here, you lose a bit of ANSI conformance, but your programs will still + * work. + * + * NOTE: rune_t is not covered by ANSI nor other standards, and should not + * be instantiated outside of lib/libc/locale. Use wchar_t. wchar_t and + * rune_t must be the same type.
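The typedef names above encode both width and signedness, so a slip such as an unsigned __int16_t is worth guarding against. The following compile-time checks are purely illustrative (hypothetical, not part of the header); each array type is ill-formed if its condition is false:

typedef char __chk_int16_width[sizeof(__int16_t) == 2 ? 1 : -1];
typedef char __chk_int16_signed[(__int16_t)-1 < 0 ? 1 : -1];
typedef char __chk_int64_width[sizeof(__int64_t) == 8 ? 1 : -1];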
Also wint_t must be no narrower than + * wchar_t, and should also be able to hold all members of the largest + * character set plus one extra value (WEOF). wint_t must be at least 16 bits. + */ + +typedef int __darwin_ct_rune_t; /* ct_rune_t */ + +/* + * mbstate_t is an opaque object to keep conversion state, during multibyte + * stream conversions. The content must not be referenced by user programs. + */ +typedef union { + char __mbstate8[128]; + long long _mbstateL; /* for alignment */ +} __mbstate_t; + +typedef __mbstate_t __darwin_mbstate_t; /* mbstate_t */ + +#if defined(__GNUC__) && defined(__PTRDIFF_TYPE__) +typedef __PTRDIFF_TYPE__ __darwin_ptrdiff_t; /* ptr1 - ptr2 */ +#else +typedef int __darwin_ptrdiff_t; /* ptr1 - ptr2 */ +#endif /* __GNUC__ */ + +#if defined(__GNUC__) && defined(__SIZE_TYPE__) +typedef __SIZE_TYPE__ __darwin_size_t; /* sizeof() */ +#else +typedef unsigned long __darwin_size_t; /* sizeof() */ +#endif + +#ifdef KERNEL +typedef char * __darwin_va_list; /* va_list */ +#else /* !KERNEL */ +#if (__GNUC__ > 2) +typedef __builtin_va_list __darwin_va_list; /* va_list */ +#else +typedef char * __darwin_va_list; /* va_list */ +#endif +#endif /* KERNEL */ + +#if defined(__GNUC__) && defined(__WCHAR_TYPE__) +typedef __WCHAR_TYPE__ __darwin_wchar_t; /* wchar_t */ +#else +typedef __darwin_ct_rune_t __darwin_wchar_t; /* wchar_t */ +#endif + +typedef __darwin_wchar_t __darwin_rune_t; /* rune_t */ + +#if defined(__GNUC__) && defined(__WINT_TYPE__) +typedef __WINT_TYPE__ __darwin_wint_t; /* wint_t */ +#else +typedef __darwin_ct_rune_t __darwin_wint_t; /* wint_t */ +#endif + +typedef unsigned long __darwin_clock_t; /* clock() */ +typedef __uint32_t __darwin_socklen_t; /* socklen_t (duh) */ +typedef long __darwin_ssize_t; /* byte count or error */ +typedef long __darwin_time_t; /* time() */ + +#endif /* _BSD_PPC__TYPES_H_ */ diff --git a/bsd/ppc/disklabel.h b/bsd/ppc/disklabel.h index 02a84a604..9d97865f0 100644 --- a/bsd/ppc/disklabel.h +++ b/bsd/ppc/disklabel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -20,13 +20,7 @@ * @APPLE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* -* - * The NEXTSTEP Software License Agreement specifies the terms - * and conditions for redistribution. - * - */ - + #ifndef _BSD_PPC_DISKLABEL_H_ #define _BSD_PPC_DISKLABEL_H_ diff --git a/bsd/ppc/endian.h b/bsd/ppc/endian.h index 984cdb588..72808459e 100644 --- a/bsd/ppc/endian.h +++ b/bsd/ppc/endian.h @@ -1,3 +1,25 @@ +/* + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. 
Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + /* * Copyright (c) 1995 NeXT Computer, Inc. All rights reserved. * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. @@ -64,55 +86,25 @@ #define _QUAD_HIGHWORD 0 #define _QUAD_LOWWORD 1 -#if defined(KERNEL) || !defined(_POSIX_SOURCE) /* * Definitions for byte order, according to byte significance from low * address to high. */ -#define LITTLE_ENDIAN 1234 /* LSB first: i386, vax */ -#define BIG_ENDIAN 4321 /* MSB first: 68000, ibm, net, ppc */ -#define PDP_ENDIAN 3412 /* LSB first in word, MSW first in long */ - -#define BYTE_ORDER BIG_ENDIAN +#define __DARWIN_LITTLE_ENDIAN 1234 /* LSB first: i386, vax */ +#define __DARWIN_BIG_ENDIAN 4321 /* MSB first: 68000, ibm, net, ppc */ +#define __DARWIN_PDP_ENDIAN 3412 /* LSB first in word, MSW first in long */ -#include <sys/cdefs.h> +#define __DARWIN_BYTE_ORDER __DARWIN_BIG_ENDIAN -#ifndef __ASSEMBLER__ -__BEGIN_DECLS -unsigned long htonl __P((unsigned long)); -unsigned short htons __P((unsigned short)); -unsigned long ntohl __P((unsigned long)); -unsigned short ntohs __P((unsigned short)); -__END_DECLS -#endif /* __ASSEMBLER__ */ - -/* - * Macros for network/external number representation conversion. - */ -#if BYTE_ORDER == BIG_ENDIAN && !defined(lint) -#define ntohl(x) (x) -#define ntohs(x) (x) -#define htonl(x) (x) -#define htons(x) (x) +#if defined(KERNEL) || !defined(_POSIX_C_SOURCE) -#define NTOHL(x) (x) -#define NTOHS(x) (x) -#define HTONL(x) (x) -#define HTONS(x) (x) +#define LITTLE_ENDIAN __DARWIN_LITTLE_ENDIAN +#define BIG_ENDIAN __DARWIN_BIG_ENDIAN +#define PDP_ENDIAN __DARWIN_PDP_ENDIAN -#else +#define BYTE_ORDER __DARWIN_BYTE_ORDER -#include <machine/byte_order.h> - -#define ntohl(x) NXSwapBigLongToHost(x) -#define ntohs(x) NXSwapBigShortToHost(x) -#define htonl(x) NXSwapHostLongToBig(x) -#define htons(x) NXSwapHostShortToBig(x) +#include <sys/_endian.h> -#define NTOHL(x) (x) = ntohl((u_long)x) -#define NTOHS(x) (x) = ntohs((u_short)x) -#define HTONL(x) (x) = htonl((u_long)x) -#define HTONS(x) (x) = htons((u_short)x) -#endif -#endif /* defined(KERNEL) || !defined(_POSIX_SOURCE) */ +#endif /* defined(KERNEL) || !defined(_POSIX_C_SOURCE) */ #endif /* !_PPC_ENDIAN_H_ */ diff --git a/bsd/ppc/exec.h b/bsd/ppc/exec.h index 339bac2c7..86024c6d6 100644 --- a/bsd/ppc/exec.h +++ b/bsd/ppc/exec.h @@ -45,7 +45,7 @@ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_OBSOLETE +#ifdef BSD_KERNEL_PRIVATE /* Size of a page in an object file. */ #define __LDPGSZ 4096 @@ -96,13 +96,7 @@ struct exec { unsigned int a_drsize; /* data relocation size */ }; -/* - * Address of ps_strings structure (in user space). - */ -#define PS_STRINGS \ - ((struct ps_strings *)(USRSTACK - sizeof(struct ps_strings))) - -#endif /* __APPLE_API_OBSOLETE */ +#endif /* BSD_KERNEL_PRIVATE */ #endif /* _BSD_PPC_EXEC_H_ */ diff --git a/bsd/ppc/param.h b/bsd/ppc/param.h index fa8f2cf46..ece487f07 100644 --- a/bsd/ppc/param.h +++ b/bsd/ppc/param.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -20,22 +20,17 @@ * @APPLE_LICENSE_HEADER_END@ */ /* Copyright (c) 1993,1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * The NEXTSTEP Software License Agreement specifies the terms - * and conditions for redistribution. 
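The endian.h rework above keeps the traditional BSD names but derives them from __DARWIN_-prefixed constants, so the plain names can be withheld under _POSIX_C_SOURCE while <sys/_endian.h> supplies ntohl() and friends. A minimal user-space sketch of consuming the constants (hypothetical program, not part of the patch):

#include <machine/endian.h>
#include <stdio.h>

int
main(void)
{
#if BYTE_ORDER == BIG_ENDIAN
	/* On ppc the conversion macros are identities, so this is free. */
	printf("big-endian host: ntohl(0x11223344) = 0x%lx\n",
	    (unsigned long)ntohl(0x11223344UL));
#else
	printf("little-endian host: ntohl() byte-swaps\n");
#endif
	return 0;
}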
- * - */ #ifndef _PPC_PARAM_H_ #define _PPC_PARAM_H_ /* * Round p (pointer or byte index) up to a correctly-aligned value for all - * data types (int, long, ...). The result is u_int and must be cast to - * any desired pointer type. + * data types (int, long, ...). The result is unsigned int and must be + * cast to any desired pointer type. */ #define ALIGNBYTES 3 -#define ALIGN(p) (((u_int)(p) + ALIGNBYTES) &~ ALIGNBYTES) +#define ALIGN(p) (((unsigned int)(p) + ALIGNBYTES) &~ ALIGNBYTES) #define NBPG 4096 /* bytes/page */ #define PGOFSET (NBPG-1) /* byte offset into page */ diff --git a/bsd/ppc/reboot.h b/bsd/ppc/reboot.h index 0a47e6a49..576b8658c 100644 --- a/bsd/ppc/reboot.h +++ b/bsd/ppc/reboot.h @@ -28,8 +28,7 @@ /* * Empty file (publicly) */ -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef BSD_KERNEL_PRIVATE /* * Use most significant 16 bits to avoid collisions with * machine independent flags. @@ -43,9 +42,8 @@ #define RB_BOOTNEXT 0x00400000 /* reboot into NeXT */ #define RB_BOOTDOS 0x00800000 /* reboot into DOS */ -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +#endif /* BSD_KERNEL_PRIVATE */ #endif /* _BSD_PPC_REBOOT_H_ */ diff --git a/bsd/ppc/reg.h b/bsd/ppc/reg.h index 58f1be653..b45306d2c 100644 --- a/bsd/ppc/reg.h +++ b/bsd/ppc/reg.h @@ -26,17 +26,13 @@ #ifndef _BSD_PPC_REG_H_ #define _BSD_PPC_REG_H_ -#include <sys/appleapiopts.h> -#ifdef KERNEL_PRIVATE -#ifdef __APPLE_API_PRIVATE +#ifdef BSD_KERNEL_PRIVATE /* Index into the thread_state */ #define SP 3 #define PC 0 -#endif /* __APPLE_API_PRIVATE */ - #endif /* KERNEL_PRIVATE */ #endif /* _BSD_PPC_REG_H_ */ diff --git a/bsd/ppc/setjmp.h b/bsd/ppc/setjmp.h index f7b318d92..cb9c7cd33 100644 --- a/bsd/ppc/setjmp.h +++ b/bsd/ppc/setjmp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -33,44 +33,69 @@ struct _jmp_buf { struct sigcontext sigcontext; /* kernel state preserved by set/longjmp */ - unsigned long vmask __attribute__((aligned(8))); /* vector mask register */ - unsigned long vreg[32 * 4] __attribute__((aligned(16))); + unsigned int vmask __attribute__((aligned(8))); /* vector mask register */ + unsigned int vreg[32 * 4] __attribute__((aligned(16))); /* 32 128-bit vector registers */ }; /* * _JBLEN is number of ints required to save the following: - * r1, r2, r13-r31, lr, cr, ctr, xer, sig == 26 ints - * fr14 - fr31 = 18 doubles = 36 ints + * r1, r2, r13-r31, lr, cr, ctr, xer, sig == 26 register_t sized + * fr14 - fr31 = 18 doubles * vmask, 32 vector registers = 129 ints * 2 ints to get all the elements aligned + * + * register_t is 2 ints for ppc64 threads */ +#define _JBLEN64 (26*2 + 18*2 + 129 + 1) +#define _JBLEN32 (26 + 18*2 + 129 + 1) +#define _JBLEN_MAX _JBLEN64 -#define _JBLEN (26 + 36 + 129 + 1) +/* + * Locally scoped sizes + */ +#if defined(__ppc64__) +#define _JBLEN _JBLEN64 +#else +#define _JBLEN _JBLEN32 +#endif #if defined(KERNEL) -typedef struct sigcontext jmp_buf[1]; -typedef struct __sigjmp_buf { - int __storage[_JBLEN + 1] __attribute__((aligned(8))); - } sigjmp_buf[1]; +typedef struct sigcontext32 jmp_buf32[1]; +typedef struct __sigjmp_buf32 { + int __storage[_JBLEN32 + 1] __attribute__((aligned(8))); + } sigjmp_buf32[1]; + +typedef struct sigcontext64 jmp_buf64[1]; +typedef struct __sigjmp_buf64 { + int __storage[_JBLEN64 + 1] __attribute__((aligned(8))); + } sigjmp_buf64[1]; + +/* + * JMM - have to decide how the kernel will deal with this. 
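Spelling out the arithmetic behind the _JBLEN defines above: _JBLEN32 = 26 + 18*2 + 129 + 1 = 192 ints, and _JBLEN64 = 26*2 + 18*2 + 129 + 1 = 218 ints, since each 64-bit register_t (like each double) occupies two ints while the 129 vector words are ints either way. Hypothetical compile-time checks of those totals (not part of the header):

typedef char __chk_jblen32[_JBLEN32 == 192 ? 1 : -1];
typedef char __chk_jblen64[_JBLEN64 == 218 ? 1 : -1];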
+ * For now, hard-code the 32-bit types. + */ +typedef struct sigcontext32 jmp_buf[1]; +typedef struct __sigjmp_buf32 sigjmp_buf[1]; + #else typedef int jmp_buf[_JBLEN]; typedef int sigjmp_buf[_JBLEN + 1]; #endif __BEGIN_DECLS -extern int setjmp __P((jmp_buf env)); -extern void longjmp __P((jmp_buf env, int val)); +extern int setjmp(jmp_buf env); +extern void longjmp(jmp_buf env, int val); #ifndef _ANSI_SOURCE -int sigsetjmp __P((sigjmp_buf env, int val)); -void siglongjmp __P((sigjmp_buf env, int val)); +int _setjmp(jmp_buf env); +void _longjmp(jmp_buf, int val); +int sigsetjmp(sigjmp_buf env, int val); +void siglongjmp(sigjmp_buf env, int val); #endif /* _ANSI_SOURCE */ -#if !defined(_ANSI_SOURCE) && !defined(_POSIX_SOURCE) -int _setjmp __P((jmp_buf env)); -void _longjmp __P((jmp_buf, int val)); -void longjmperror __P((void)); +#if !defined(_ANSI_SOURCE) && !defined(_POSIX_C_SOURCE) +void longjmperror(void); #endif /* neither ANSI nor POSIX */ __END_DECLS diff --git a/bsd/ppc/signal.h b/bsd/ppc/signal.h index ef4138630..fee82c365 100644 --- a/bsd/ppc/signal.h +++ b/bsd/ppc/signal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -63,13 +63,35 @@ typedef enum { * to the handler to allow it to properly restore state if * a non-standard exit is performed. */ +struct sigcontext32 { + int sc_onstack; /* sigstack state to restore */ + int sc_mask; /* signal mask to restore */ + int sc_ir; /* pc */ + int sc_psw; /* processor status word */ + int sc_sp; /* stack pointer if sc_regs == NULL */ + void *sc_regs; /* (kernel private) saved state */ +}; + +struct sigcontext64 { + int sc_onstack; /* sigstack state to restore */ + int sc_mask; /* signal mask to restore */ + long long sc_ir; /* pc */ + long long sc_psw; /* processor status word */ + long long sc_sp; /* stack pointer if sc_regs == NULL */ + void *sc_regs; /* (kernel private) saved state */ +}; + +/* + * LP64todo - Have to decide how to handle this. + * For now, just duplicate the 32-bit context as the generic one. + */ struct sigcontext { int sc_onstack; /* sigstack state to restore */ int sc_mask; /* signal mask to restore */ - int sc_ir; /* pc */ + int sc_ir; /* pc */ int sc_psw; /* processor status word */ int sc_sp; /* stack pointer if sc_regs == NULL */ - void *sc_regs; /* (kernel private) saved state */ + void *sc_regs; /* (kernel private) saved state */ }; #endif /* __APPLE_API_OBSOLETE */ diff --git a/bsd/ppc/spl.h b/bsd/ppc/spl.h deleted file mode 100644 index 01d0c0b21..000000000 --- a/bsd/ppc/spl.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. 
Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -#ifndef _BSD_PPC_SPL_H_ -#define _BSD_PPC_SPL_H_ - -#ifdef KERNEL -#ifndef __ASSEMBLER__ -/* - * Machine-dependent SPL definitions. - * - */ -typedef unsigned spl_t; - -extern unsigned sploff(void); -extern unsigned splhigh(void); -extern unsigned splsched(void); -extern unsigned splclock(void); -extern unsigned splpower(void); -extern unsigned splvm(void); -extern unsigned splbio(void); -extern unsigned splimp(void); -extern unsigned spltty(void); -extern unsigned splnet(void); -extern unsigned splsoftclock(void); - -extern void spllo(void); -extern void splon(unsigned level); -extern void splx(unsigned level); -extern void spln(unsigned level); -#define splstatclock() splhigh() - -#endif /* __ASSEMBLER__ */ - -#endif - -#endif /* _BSD_PPC_SPL_H_ */ diff --git a/bsd/ppc/types.h b/bsd/ppc/types.h index f370e9bf1..58b77b5a3 100644 --- a/bsd/ppc/types.h +++ b/bsd/ppc/types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -61,26 +61,66 @@ #define _MACHTYPES_H_ #ifndef __ASSEMBLER__ +#include <ppc/_types.h> #include <sys/cdefs.h> /* * Basic integral types. Omit the typedef if * not possible for a machine/compiler combination. */ +#ifndef _INT8_T +#define _INT8_T typedef __signed char int8_t; +#endif typedef unsigned char u_int8_t; +#ifndef _INT16_T +#define _INT16_T typedef short int16_t; +#endif typedef unsigned short u_int16_t; +#ifndef _INT32_T +#define _INT32_T typedef int int32_t; +#endif typedef unsigned int u_int32_t; +#ifndef _INT64_T +#define _INT64_T typedef long long int64_t; +#endif typedef unsigned long long u_int64_t; +#if defined(__ppc64__) +typedef int64_t register_t; +#else typedef int32_t register_t; +#endif -typedef long int intptr_t; -typedef unsigned long int uintptr_t; +#ifndef _INTPTR_T +#define _INTPTR_T +typedef __darwin_intptr_t intptr_t; +#endif +#ifndef _UINTPTR_T +#define _UINTPTR_T +typedef unsigned long uintptr_t; +#endif +/* with LP64 support pointers and longs from user address space may vary */ +/* in size depending on the type of process (currently 32 or 64-bit, but */ +/* may change in the future). These types are used for reserving the largest */ +/* possible size. 
*/ +// LP64todo - typedef mach_vm_address_t user_addr_t; /* varying length pointers from user space */ +// LP64todo - typedef mach_vm_size_t user_size_t; /* varying length values from user space (unsigned) */ +typedef u_int64_t user_addr_t; +typedef u_int64_t user_size_t; +typedef int64_t user_ssize_t; +typedef int64_t user_long_t; +typedef u_int64_t user_ulong_t; +typedef int64_t user_time_t; +#define USER_ADDR_NULL ((user_addr_t) 0) +#define CAST_USER_ADDR_T(a_ptr) ((user_addr_t)((uintptr_t)(a_ptr))) + +#ifndef __offsetof #define __offsetof(type, field) ((size_t)(&((type *)0)->field)) +#endif #endif /* __ASSEMBLER__ */ #endif /* _MACHTYPES_H_ */ diff --git a/bsd/ppc/ucontext.h b/bsd/ppc/ucontext.h index 77c459244..ab434a1d4 100644 --- a/bsd/ppc/ucontext.h +++ b/bsd/ppc/ucontext.h @@ -24,27 +24,44 @@ #define _PPC_UCONTEXT_H_ -#include <mach/thread_status.h> +#include <mach/ppc/_types.h> +#ifndef _POSIX_C_SOURCE struct mcontext { - ppc_exception_state_t es; - ppc_thread_state_t ss; - ppc_float_state_t fs; - ppc_vector_state_t vs; + struct ppc_exception_state es; + struct ppc_thread_state ss; + struct ppc_float_state fs; + struct ppc_vector_state vs; }; - #define PPC_MCONTEXT_SIZE (PPC_THREAD_STATE_COUNT + PPC_FLOAT_STATE_COUNT + PPC_EXCEPTION_STATE_COUNT + PPC_VECTOR_STATE_COUNT) * sizeof(int) +#else /* _POSIX_C_SOURCE */ +struct __darwin_mcontext { + struct __darwin_ppc_exception_state es; + struct __darwin_ppc_thread_state ss; + struct __darwin_ppc_float_state fs; + struct __darwin_ppc_vector_state vs; +}; +#endif /* _POSIX_C_SOURCE */ -typedef struct mcontext * mcontext_t; +#ifndef _MCONTEXT_T +#define _MCONTEXT_T +typedef __darwin_mcontext_t mcontext_t; +#endif +#ifndef _POSIX_C_SOURCE struct mcontext64 { - ppc_exception_state_t es; - ppc_thread_state64_t ss; - ppc_float_state_t fs; - ppc_vector_state_t vs; + struct ppc_exception_state64 es; + struct ppc_thread_state64 ss; + struct ppc_float_state fs; + struct ppc_vector_state vs; }; #define PPC_MCONTEXT64_SIZE (PPC_THREAD_STATE64_COUNT + PPC_FLOAT_STATE_COUNT + PPC_EXCEPTION_STATE_COUNT + PPC_VECTOR_STATE_COUNT) * sizeof(int) +#ifndef _MCONTEXT64_T +#define _MCONTEXT64_T typedef struct mcontext64 * mcontext64_t; +#endif + +#endif /* _POSIX_C_SOURCE */ #endif /* _PPC_UCONTEXT_H_ */ diff --git a/bsd/ppc/user.h b/bsd/ppc/user.h deleted file mode 100644 index 5914cf757..000000000 --- a/bsd/ppc/user.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * Copyright (C) 1989, NeXT, Inc. - * - * bsd/ppc/user.h - * - * We can use the default definition of u, so this file is empty. 
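A note on CAST_USER_ADDR_T from the bsd/ppc/types.h hunk above: user_addr_t is always 64 bits wide, so a kernel pointer is first converted through uintptr_t; going via the unsigned pointer-width integer avoids any sign-extension a direct pointer-to-64-bit cast might introduce in a 32-bit kernel. A minimal sketch (hypothetical helper, not from the patch):

#include <sys/types.h>

/* Hypothetical helper: publish a kernel-visible address in a field
 * wide enough for the largest (64-bit) user address space. */
static user_addr_t
ptr_to_user_addr(void *p)
{
	return CAST_USER_ADDR_T(p);	/* == (user_addr_t)(uintptr_t)p */
}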
- */ - -#warning ---- Empty bsd/ppc/user.h ---- diff --git a/bsd/ppc/vmparam.h b/bsd/ppc/vmparam.h index 1f762947f..013608b64 100644 --- a/bsd/ppc/vmparam.h +++ b/bsd/ppc/vmparam.h @@ -25,7 +25,13 @@ #include <sys/resource.h> -#define USRSTACK 0xc0000000 +#define USRSTACK (0xc0000000) + +/* + * put the default 64-bit stack at the max address + * (minus one 32-bit address space for other incidentals) + */ +#define USRSTACK64 (MACH_VM_MAX_ADDRESS - VM_MAX_ADDRESS) /* * Virtual memory related constants, all in bytes diff --git a/bsd/sys/Makefile b/bsd/sys/Makefile index 9505779d9..946ec4094 100644 --- a/bsd/sys/Makefile +++ b/bsd/sys/Makefile @@ -19,30 +19,94 @@ EXPINC_SUBDIRS_PPC = \ EXPINC_SUBDIRS_I386 = \ +# In both the framework PrivateHeader area and /usr/include/sys DATAFILES = \ appleapiopts.h acct.h aio.h attr.h \ - buf.h callout.h cdefs.h clist.h conf.h \ - dir.h dirent.h disk.h disklabel.h disktab.h dkstat.h dmap.h domain.h \ - errno.h ev.h event.h exec.h fcntl.h file.h filedesc.h filio.h gmon.h ioccom.h ioctl.h \ - ioctl_compat.h ipc.h kernel.h kern_event.h ktrace.h loadable_fs.h lock.h lockf.h mach_swapon.h malloc.h \ - kdebug.h linker_set.h md5.h kern_control.h \ - mbuf.h mman.h mount.h msgbuf.h mtio.h namei.h netport.h param.h paths.h \ - proc.h protosw.h ptrace.h queue.h quota.h random.h reboot.h resource.h resourcevar.h \ + buf.h cdefs.h conf.h \ + dir.h dirent.h disk.h dkstat.h \ + errno.h ev.h event.h fcntl.h file.h filedesc.h filio.h gmon.h \ + ioccom.h ioctl.h \ + ioctl_compat.h ipc.h kernel.h kernel_types.h kern_event.h loadable_fs.h lock.h lockf.h \ + kauth.h kdebug.h md5.h kern_control.h malloc.h \ + mbuf.h mman.h mount.h msg.h msgbuf.h mtio.h netport.h param.h paths.h pipe.h poll.h \ + proc.h ptrace.h queue.h quota.h random.h reboot.h resource.h resourcevar.h \ select.h sem.h semaphore.h shm.h signal.h signalvar.h socket.h socketvar.h sockio.h stat.h \ - syscall.h sysctl.h syslimits.h syslog.h systm.h sys_domain.h termios.h time.h \ + syscall.h sysctl.h syslimits.h syslog.h sys_domain.h termios.h time.h \ timeb.h times.h tprintf.h trace.h tty.h ttychars.h ttycom.h \ ttydefaults.h ttydev.h types.h ubc.h ucontext.h ucred.h uio.h un.h unistd.h unpcb.h \ - user.h utfconv.h utsname.h ux_exception.h vadvise.h vcmd.h version.h \ - vm.h vmmeter.h vmparam.h vnioctl.h vnode.h vnode_if.h vstat.h wait.h + user.h utfconv.h utsname.h vadvise.h vcmd.h version.h \ + vm.h vmmeter.h vmparam.h vnioctl.h vnode.h vnode_if.h vstat.h wait.h xattr.h \ + _types.h _endian.h domain.h protosw.h + +# Only in the framework PrivateHeader area +PRIVATE_DATAFILES = \ + disklabel.h \ + ipcs.h \ + sem_internal.h \ + shm_internal.h \ + ux_exception.h \ + ktrace.h \ + vnioctl.h + +# KERNELFILES will appear only in the kernel framework +KERNELFILES = \ + appleapiopts.h attr.h \ + buf.h cdefs.h conf.h \ + dir.h dirent.h disk.h dkstat.h \ + errno.h ev.h event.h fcntl.h file.h filedesc.h filio.h \ + ioccom.h ioctl.h \ + ioctl_compat.h kernel.h kernel_types.h kern_event.h lock.h lockf.h \ + kauth.h kdebug.h md5.h kern_control.h malloc.h namei.h \ + mman.h mbuf.h mount.h mtio.h netport.h param.h paths.h \ + proc.h queue.h quota.h random.h resource.h resourcevar.h \ + select.h signal.h socket.h socketvar.h sockio.h stat.h \ + sysctl.h syslimits.h syslog.h systm.h sys_domain.h time.h \ + types.h ubc.h ucontext.h ucred.h uio.h un.h unistd.h unpcb.h \ + utfconv.h version.h \ + vm.h vmparam.h vnode.h vnode_if.h xattr.h \ + _types.h _endian.h protosw.h domain.h \ + kpi_mbuf.h kpi_socket.h kpi_socketfilter.h \ + 
ttycom.h termios.h + + +# Only in the private kernel framework +PRIVATE_KERNELFILES = \ + disktab.h \ + file_internal.h \ + mach_swapon.h \ + msgbuf.h \ + eventvar.h \ + mount_internal.h \ + proc_internal.h \ + ptrace_internal.h \ + vnode_internal.h \ + signalvar.h \ + tty.h ttychars.h \ + ttydefaults.h ttydev.h \ + user.h \ + ubc_internal.h \ + uio_internal.h \ + vfs_context.h + INSTALL_MI_LIST = ${DATAFILES} INSTALL_MI_DIR = sys -EXPORT_MI_LIST = ${DATAFILES} +EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} syscall.h ktrace.h linker_set.h EXPORT_MI_DIR = sys +# /System/Library/Frameworks/System.framework/PrivateHeaders +INSTALL_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} + +# /System/Library/Frameworks/Kernel.framework/PrivateHeaders + +INSTALL_KF_MI_LCL_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} + +# /System/Library/Frameworks/Kernel.framework/Headers + +INSTALL_KF_MI_LIST = ${KERNELFILES} include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/miscfs/specfs/lockf.h b/bsd/sys/_endian.h similarity index 53% rename from bsd/miscfs/specfs/lockf.h rename to bsd/sys/_endian.h index c10b0b2d8..8d0c683b8 100644 --- a/bsd/miscfs/specfs/lockf.h +++ b/bsd/sys/_endian.h @@ -1,4 +1,27 @@ /* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +/* + * Copyright (c) 1995 NeXT Computer, Inc. All rights reserved. * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ @@ -19,14 +42,10 @@ * * @APPLE_LICENSE_HEADER_END@ */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ /* - * Copyright (c) 1991, 1993 + * Copyright (c) 1987, 1991, 1993 * The Regents of the University of California. All rights reserved. * - * This code is derived from software contributed to Berkeley by - * Scooter Morris at Genentech Inc. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -54,61 +73,60 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * @(#)lockf.h 8.2 (Berkeley) 10/26/94 */ -#ifndef _UFS_LOCKF_H_ -#define _UFS_LOCKF_H_ -#include <sys/appleapiopts.h> +#ifndef _SYS__ENDIAN_H_ +#define _SYS__ENDIAN_H_ + +#include <sys/cdefs.h> -#ifdef __APPLE_API_PRIVATE /* - * The lockf structure is a kernel structure which contains the information - * associated with a byte range lock. The lockf structures are linked into - * the specinfo structure. Locks are sorted by the starting byte of the lock for - * efficiency. 
+ * Macros for network/external number representation conversion. */ -TAILQ_HEAD(locklist, lockf); +#if __DARWIN_BYTE_ORDER == __DARWIN_BIG_ENDIAN && !defined(lint) +#define ntohl(x) (x) +#define ntohs(x) (x) +#define htonl(x) (x) +#define htons(x) (x) -struct specinfo; -struct lockf { - short lf_flags; /* Semantics: F_POSIX, F_FLOCK, F_WAIT */ - short lf_type; /* Lock type: F_RDLCK, F_WRLCK */ - off_t lf_start; /* Byte # of the start of the lock */ - off_t lf_end; /* Byte # of the end of the lock (-1=EOF) */ - caddr_t lf_id; /* Id of the resource holding the lock */ - struct specinfo *lf_specinfo; /* Back pointer to the specinfo */ - struct lockf *lf_next; /* Pointer to the next lock on this info */ - struct locklist lf_blkhd; /* List of requests blocked on this lock */ - TAILQ_ENTRY(lockf) lf_block;/* A request waiting for a lock */ -}; +#if defined(KERNEL) || !defined(_POSIX_C_SOURCE) +#define NTOHL(x) (x) +#define NTOHS(x) (x) +#define HTONL(x) (x) +#define HTONS(x) (x) +#endif /* defined(KERNEL) || !defined(_POSIX_C_SOURCE) */ -/* Maximum length of sleep chains to traverse to try and detect deadlock. */ -#define MAXDEPTH 50 +#else -__BEGIN_DECLS -void spec_lf_addblock __P((struct lockf *, struct lockf *)); -int spec_lf_clearlock __P((struct lockf *)); -int spec_lf_findoverlap __P((struct lockf *, - struct lockf *, int, struct lockf ***, struct lockf **)); -struct lockf * - spec_lf_getblock __P((struct lockf *)); -int spec_lf_getlock __P((struct lockf *, struct flock *)); -int spec_lf_setlock __P((struct lockf *)); -void spec_lf_split __P((struct lockf *, struct lockf *)); -void spec_lf_wakelock __P((struct lockf *)); -__END_DECLS - -#ifdef LOCKF_DEBUG -extern int lockf_debug; +#if !defined(__ASSEMBLER__) +#include <stdint.h> +#include <machine/byte_order.h> + __BEGIN_DECLS -void spec_lf_print __P((char *, struct lockf *)); -void spec_lf_printlist __P((char *, struct lockf *)); +uint16_t ntohs(uint16_t); +uint16_t htons(uint16_t); +uint32_t ntohl(uint32_t); +uint32_t htonl(uint32_t); __END_DECLS -#endif +#endif /* !defined(__ASSEMBLER__) */ + +#define ntohs(x) NXSwapBigShortToHost(x) +#define htons(x) NXSwapHostShortToBig(x) -#endif /* __APPLE_API_PRIVATE */ -#endif /* ! _UFS_LOCKF_H_ */ +#if defined(__LP64__) +#define ntohl(x) NXSwapBigIntToHost(x) +#define htonl(x) NXSwapHostIntToBig(x) +#else +#define ntohl(x) NXSwapBigLongToHost(x) +#define htonl(x) NXSwapHostLongToBig(x) +#endif /* defined(__LP64__) */ +#if defined(KERNEL) || !defined(_POSIX_C_SOURCE) +#define NTOHL(x) (x) = ntohl((u_long)x) +#define NTOHS(x) (x) = ntohs((u_short)x) +#define HTONL(x) (x) = htonl((u_long)x) +#define HTONS(x) (x) = htons((u_short)x) +#endif /* defined(KERNEL) || !defined(_POSIX_C_SOURCE) */ +#endif /* __DARWIN_BYTE_ORDER != __DARWIN_BIG_ENDIAN || defined(lint) */ +#endif /* !_SYS__ENDIAN_H_ */ diff --git a/bsd/sys/_types.h b/bsd/sys/_types.h new file mode 100644 index 000000000..addbb39b1 --- /dev/null +++ b/bsd/sys/_types.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. 
+ * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef _SYS__TYPES_H_ +#define _SYS__TYPES_H_ + +#include <sys/cdefs.h> +#include <machine/_types.h> + +/* Forward references */ +#ifndef _POSIX_C_SOURCE +struct mcontext; +struct mcontext64; +#else /* _POSIX_C_SOURCE */ +struct __darwin_mcontext; +#endif /* _POSIX_C_SOURCE */ + +/* pthread opaque structures */ +#if defined(__LP64__) +#define __PTHREAD_SIZE__ 1168 +#define __PTHREAD_ATTR_SIZE__ 56 +#define __PTHREAD_MUTEXATTR_SIZE__ 8 +#define __PTHREAD_MUTEX_SIZE__ 56 +#define __PTHREAD_CONDATTR_SIZE__ 8 +#define __PTHREAD_COND_SIZE__ 40 +#define __PTHREAD_ONCE_SIZE__ 8 +#define __PTHREAD_RWLOCK_SIZE__ 192 +#define __PTHREAD_RWLOCKATTR_SIZE__ 16 +#else /* __LP64__ */ +#define __PTHREAD_SIZE__ 596 +#define __PTHREAD_ATTR_SIZE__ 36 +#define __PTHREAD_MUTEXATTR_SIZE__ 8 +#define __PTHREAD_MUTEX_SIZE__ 40 +#define __PTHREAD_CONDATTR_SIZE__ 4 +#define __PTHREAD_COND_SIZE__ 24 +#define __PTHREAD_ONCE_SIZE__ 4 +#define __PTHREAD_RWLOCK_SIZE__ 124 +#define __PTHREAD_RWLOCKATTR_SIZE__ 12 +#endif /* __LP64__ */ + +struct __darwin_pthread_handler_rec +{ + void (*__routine)(void *); /* Routine to call */ + void *__arg; /* Argument to pass */ + struct __darwin_pthread_handler_rec *__next; +}; +struct _opaque_pthread_attr_t { long __sig; char __opaque[__PTHREAD_ATTR_SIZE__]; }; +struct _opaque_pthread_cond_t { long __sig; char __opaque[__PTHREAD_COND_SIZE__]; }; +struct _opaque_pthread_condattr_t { long __sig; char __opaque[__PTHREAD_CONDATTR_SIZE__]; }; +struct _opaque_pthread_mutex_t { long __sig; char __opaque[__PTHREAD_MUTEX_SIZE__]; }; +struct _opaque_pthread_mutexattr_t { long __sig; char __opaque[__PTHREAD_MUTEXATTR_SIZE__]; }; +struct _opaque_pthread_once_t { long __sig; char __opaque[__PTHREAD_ONCE_SIZE__]; }; +struct _opaque_pthread_rwlock_t { long __sig; char __opaque[__PTHREAD_RWLOCK_SIZE__]; }; +struct _opaque_pthread_rwlockattr_t { long __sig; char __opaque[__PTHREAD_RWLOCKATTR_SIZE__]; }; +struct _opaque_pthread_t { long __sig; struct __darwin_pthread_handler_rec *__cleanup_stack; char __opaque[__PTHREAD_SIZE__]; }; + +/* + * Type definitions; takes common type definitions that must be used + * in multiple header files due to [XSI], removes them from the system + * space, and puts them in the implementation space. + */ + +#ifdef __cplusplus +#ifdef __GNUG__ +#define __DARWIN_NULL __null +#else /* ! __GNUG__ */ +#ifdef __LP64__ +#define __DARWIN_NULL (0L) +#else /* !__LP64__ */ +#define __DARWIN_NULL 0 +#endif /* __LP64__ */ +#endif /* __GNUG__ */ +#else /* ! __cplusplus */ +#define __DARWIN_NULL ((void *)0) +#endif /* __cplusplus */ + +typedef __int64_t __darwin_blkcnt_t; /* total blocks */ +typedef __int32_t __darwin_blksize_t; /* preferred block size */ +typedef __int32_t __darwin_dev_t; /* dev_t */ +typedef unsigned int __darwin_fsblkcnt_t; /* Used by statvfs and fstatvfs */ +typedef unsigned int __darwin_fsfilcnt_t; /* Used by statvfs and fstatvfs */ +typedef __uint32_t __darwin_gid_t; /* [???] 
process and group IDs */ +typedef __uint32_t __darwin_id_t; /* [XSI] pid_t, uid_t, or gid_t*/ +typedef __uint32_t __darwin_ino_t; /* [???] Used for inodes */ +typedef __darwin_natural_t __darwin_mach_port_name_t; /* Used by mach */ +typedef __darwin_mach_port_name_t __darwin_mach_port_t; /* Used by mach */ +#ifndef _POSIX_C_SOURCE +typedef struct mcontext *__darwin_mcontext_t; /* [???] machine context */ +typedef struct mcontext64 *__darwin_mcontext64_t; /* [???] machine context */ +#else /* _POSIX_C_SOURCE */ +typedef struct __darwin_mcontext *__darwin_mcontext_t; /* [???] machine context */ +#endif /* _POSIX_C_SOURCE */ +typedef __uint16_t __darwin_mode_t; /* [???] Some file attributes */ +typedef __int64_t __darwin_off_t; /* [???] Used for file sizes */ +typedef __int32_t __darwin_pid_t; /* [???] process and group IDs */ +typedef struct _opaque_pthread_attr_t + __darwin_pthread_attr_t; /* [???] Used for pthreads */ +typedef struct _opaque_pthread_cond_t + __darwin_pthread_cond_t; /* [???] Used for pthreads */ +typedef struct _opaque_pthread_condattr_t + __darwin_pthread_condattr_t; /* [???] Used for pthreads */ +typedef unsigned long __darwin_pthread_key_t; /* [???] Used for pthreads */ +typedef struct _opaque_pthread_mutex_t + __darwin_pthread_mutex_t; /* [???] Used for pthreads */ +typedef struct _opaque_pthread_mutexattr_t + __darwin_pthread_mutexattr_t; /* [???] Used for pthreads */ +typedef struct _opaque_pthread_once_t + __darwin_pthread_once_t; /* [???] Used for pthreads */ +typedef struct _opaque_pthread_rwlock_t + __darwin_pthread_rwlock_t; /* [???] Used for pthreads */ +typedef struct _opaque_pthread_rwlockattr_t + __darwin_pthread_rwlockattr_t; /* [???] Used for pthreads */ +typedef struct _opaque_pthread_t + *__darwin_pthread_t; /* [???] Used for pthreads */ +typedef __uint32_t __darwin_sigset_t; /* [???] signal set */ +typedef __int32_t __darwin_suseconds_t; /* [???] microseconds */ +typedef __uint32_t __darwin_uid_t; /* [???] user IDs */ +typedef __uint32_t __darwin_useconds_t; /* [???] microseconds */ +typedef unsigned char __darwin_uuid_t[16]; + +/* Structure used in sigaltstack call. */ +#ifndef _POSIX_C_SOURCE +struct sigaltstack +#else /* _POSIX_C_SOURCE */ +struct __darwin_sigaltstack +#endif /* _POSIX_C_SOURCE */ +{ + void *ss_sp; /* signal stack base */ + __darwin_size_t ss_size; /* signal stack length */ + int ss_flags; /* SA_DISABLE and/or SA_ONSTACK */ +}; +#ifndef _POSIX_C_SOURCE +typedef struct sigaltstack __darwin_stack_t; /* [???] signal stack */ +#else /* _POSIX_C_SOURCE */ +typedef struct __darwin_sigaltstack __darwin_stack_t; /* [???] signal stack */ +#endif /* _POSIX_C_SOURCE */ + +/* user context */ +#ifndef _POSIX_C_SOURCE +struct ucontext +#else /* _POSIX_C_SOURCE */ +struct __darwin_ucontext +#endif /* _POSIX_C_SOURCE */ +{ + int uc_onstack; + __darwin_sigset_t uc_sigmask; /* signal mask used by this context */ + __darwin_stack_t uc_stack; /* stack used by this context */ +#ifndef _POSIX_C_SOURCE + struct ucontext *uc_link; /* pointer to resuming context */ +#else /* _POSIX_C_SOURCE */ + struct __darwin_ucontext *uc_link; /* pointer to resuming context */ +#endif /* _POSIX_C_SOURCE */ + __darwin_size_t uc_mcsize; /* size of the machine context passed in */ + __darwin_mcontext_t uc_mcontext; /* pointer to machine specific context */ +}; +#ifndef _POSIX_C_SOURCE +typedef struct ucontext __darwin_ucontext_t; /* [???] user context */ +#else /* _POSIX_C_SOURCE */ +typedef struct __darwin_ucontext __darwin_ucontext_t; /* [???] 
user context */ +#endif /* _POSIX_C_SOURCE */ + +#ifndef _POSIX_C_SOURCE +struct ucontext64 { + int uc_onstack; + __darwin_sigset_t uc_sigmask; /* signal mask used by this context */ + __darwin_stack_t uc_stack; /* stack used by this context */ + struct ucontext64 *uc_link; /* pointer to resuming context */ + __darwin_size_t uc_mcsize; /* size of the machine context passed in */ + __darwin_mcontext64_t uc_mcontext64; /* pointer to machine specific context */ +}; +typedef struct ucontext64 __darwin_ucontext64_t; /* [???] user context */ +#endif /* _POSIX_C_SOURCE */ + +#ifdef KERNEL +#ifndef offsetof +#define offsetof(type, member) ((size_t)(&((type *)0)->member)) +#endif /* offsetof */ +#endif /* KERNEL */ +#endif /* _SYS__TYPES_H_ */ diff --git a/bsd/sys/acct.h b/bsd/sys/acct.h index 01aa44369..1cd61259b 100644 --- a/bsd/sys/acct.h +++ b/bsd/sys/acct.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -63,7 +63,7 @@ #define _SYS_ACCT_H_ #include <sys/appleapiopts.h> - +#include <sys/cdefs.h> /* * Accounting structures; these use a comp_t type which is a 3 bits base 8 * exponent, 13 bit fraction ``floating point'' number. Units are 1/AHZ @@ -100,7 +100,11 @@ struct acct { #ifdef KERNEL #ifdef __APPLE_API_PRIVATE extern struct vnode *acctp; -int acct_process __P((struct proc *p)); + +__BEGIN_DECLS +int acct_process(struct proc *p); +__END_DECLS + #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ diff --git a/bsd/sys/aio.h b/bsd/sys/aio.h index 57b3e3d17..f2d41b32c 100644 --- a/bsd/sys/aio.h +++ b/bsd/sys/aio.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2003-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -32,6 +32,7 @@ #define _SYS_AIO_H_ #include <sys/signal.h> +#include <sys/cdefs.h> struct aiocb { int aio_fildes; /* File descriptor */ @@ -43,6 +44,29 @@ struct aiocb { int aio_lio_opcode; /* Operation to be performed */ }; +// LP64todo - should this move? +#ifdef KERNEL + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_aiocb { + int aio_fildes; /* File descriptor */ + off_t aio_offset; /* File offset */ + user_addr_t aio_buf; /* Location of buffer */ + user_size_t aio_nbytes; /* Length of transfer */ + int aio_reqprio; /* Request priority offset */ + struct user_sigevent aio_sigevent; /* Signal number and value */ + int aio_lio_opcode; /* Operation to be performed */ +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +#endif // KERNEL + /* * aio_cancel() return values */ @@ -102,7 +126,9 @@ struct aiocb { * notification is given when the operation is complete */ -#define O_SYNC 0x0 /* queued IO is completed as if by fsync() */ +#ifndef O_SYNC /* XXX investigate documentation error */ +#define O_SYNC 0x0080 /* queued IO is completed as if by fsync() */ +#endif #if 0 /* O_DSYNC - NOT SUPPORTED */ #define O_DSYNC 0x1 /* queued async IO is completed as if by fdatasync() */ #endif @@ -112,6 +138,8 @@ struct aiocb { * Prototypes */ +__BEGIN_DECLS + /* * Attempt to cancel one or more asynchronous I/O requests currently outstanding * against file descriptor fd. 
The aiocbp argument points to the asynchronous I/O @@ -223,5 +251,7 @@ int lio_listio( int mode, struct aiocb *const aiocblist[], int nent, struct sigevent *sigp ); +__END_DECLS + #endif /* KERNEL */ #endif /* _SYS_AIO_H_ */ diff --git a/bsd/sys/aio_kern.h b/bsd/sys/aio_kern.h index 6c113a744..09401f0b3 100644 --- a/bsd/sys/aio_kern.h +++ b/bsd/sys/aio_kern.h @@ -38,15 +38,15 @@ struct aio_workq_entry { TAILQ_ENTRY( aio_workq_entry ) aio_workq_link; struct proc *procp; /* user proc that queued this request */ - struct aiocb *uaiocbp; /* pointer passed in from user land */ - struct aiocb *fsyncp; /* not NULL means this request must complete */ + user_addr_t uaiocbp; /* pointer passed in from user land */ + user_addr_t fsyncp; /* not NULL means this request must complete */ /* before an aio_fsync call can proceed. */ vm_map_t aio_map; /* user land map we have a reference to */ - ssize_t returnval; /* return value from read / write request */ + user_ssize_t returnval; /* return value from read / write request */ int errorval; /* error value from read / write request */ int flags; long group_tag; /* identifier used to group IO requests */ - struct aiocb aiocb; /* copy of aiocb from user land */ + struct user_aiocb aiocb; /* copy of aiocb from user land */ }; typedef struct aio_workq_entry aio_workq_entry; @@ -66,11 +66,27 @@ typedef struct aio_workq_entry aio_workq_entry; /* waiting for one or more active IO requests to */ /* complete */ +/* + * Prototypes + */ + +__private_extern__ void +_aio_close(struct proc *p, int fd); + +__private_extern__ void +_aio_exit(struct proc *p); + +__private_extern__ void +_aio_exec(struct proc *p); + +__private_extern__ void +_aio_create_worker_threads(int num); + +__private_extern__ void +aio_init(void); -__private_extern__ void _aio_close( struct proc *p, int fd ); -__private_extern__ void _aio_exit( struct proc *p ); -__private_extern__ void _aio_exec( struct proc *p ); -__private_extern__ void _aio_create_worker_threads( int num ); +task_t +get_aiotask(void); #endif /* KERNEL */ diff --git a/bsd/sys/attr.h b/bsd/sys/attr.h index 40a4a586c..80a7512c2 100644 --- a/bsd/sys/attr.h +++ b/bsd/sys/attr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -32,21 +32,14 @@ #include <sys/appleapiopts.h> #ifdef __APPLE_API_UNSTABLE -#ifndef _SYS_TYPES_H_ #include <sys/types.h> -#endif -#ifndef _SYS_UCRED_H #include <sys/ucred.h> -#endif -#ifndef _SYS_MOUNT_H_ -#include <sys/mount.h> -#endif -#ifndef _SYS_TIME_H_ #include <sys/time.h> -#endif +#include <sys/cdefs.h> #define FSOPT_NOFOLLOW 0x00000001 -#define FSOPT_NOINMEMUPDATE 0x00000002 +#define FSOPT_NOINMEMUPDATE 0x00000002 +#define FSOPT_REPORT_FULLSIZE 0x00000004 /* we currently aren't anywhere near this amount for a valid * fssearchblock.sizeofsearchparams1 or fssearchblock.sizeofsearchparams2 @@ -92,8 +85,8 @@ typedef struct attribute_set { } attribute_set_t; typedef struct attrreference { - long attr_dataoffset; - size_t attr_length; + int32_t attr_dataoffset; + u_int32_t attr_length; } attrreference_t; /* XXX PPD This is derived from HFSVolumePriv.h and should perhaps be referenced from there? 
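attrreference_t above now uses fixed-width fields because the structure is exchanged with user space, so its layout cannot be allowed to track the kernel's long/size_t widths. For reference, here is a user-space sketch of consuming such a reference when fetching a variable-length attribute; this is a hypothetical program using the documented getattrlist(2) interface, not code from the patch:

#include <sys/attr.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct attrlist al;
	struct {
		u_int32_t	size;	/* total length of returned data */
		attrreference_t	name;	/* locates name_buf */
		char		name_buf[256];	/* generous fixed buffer */
	} __attribute__((packed)) ab;

	memset(&al, 0, sizeof(al));
	al.bitmapcount = ATTR_BIT_MAP_COUNT;
	al.commonattr = ATTR_CMN_NAME;
	if (getattrlist("/", &al, &ab, sizeof(ab), 0) != 0)
		return 1;
	/* attr_dataoffset is relative to the attrreference itself */
	printf("name: %s\n", (char *)&ab.name + ab.name.attr_dataoffset);
	return 0;
}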
*/ @@ -117,6 +110,11 @@ typedef struct vol_capabilities_attr { vol_capabilities_set_t valid; } vol_capabilities_attr_t; +/* + * XXX this value needs to be raised - 3893388 + */ +#define ATTR_MAX_BUFFER 8192 + /* * VOL_CAP_FMT_PERSISTENTOBJECTIDS: When set, the volume has object IDs * that are persistent (retain their values even when the volume is @@ -174,27 +172,33 @@ typedef struct vol_capabilities_attr { * the statfs information in its in-memory structures should set this bit. * A volume that must always read from disk or always perform a network * transaction should not set this bit. + * + * VOL_CAP_FMT_2TB_FILESIZE: When set, the volume format supports file + * sizes up to 2TB. This bit does not necessarily mean that the + * file system cannot support files larger than 2TB, nor does it + * mean that 2TB of space is currently available on the volume. */ -#define VOL_CAP_FMT_PERSISTENTOBJECTIDS 0x00000001 -#define VOL_CAP_FMT_SYMBOLICLINKS 0x00000002 -#define VOL_CAP_FMT_HARDLINKS 0x00000004 -#define VOL_CAP_FMT_JOURNAL 0x00000008 -#define VOL_CAP_FMT_JOURNAL_ACTIVE 0x00000010 -#define VOL_CAP_FMT_NO_ROOT_TIMES 0x00000020 -#define VOL_CAP_FMT_SPARSE_FILES 0x00000040 -#define VOL_CAP_FMT_ZERO_RUNS 0x00000080 -#define VOL_CAP_FMT_CASE_SENSITIVE 0x00000100 -#define VOL_CAP_FMT_CASE_PRESERVING 0x00000200 -#define VOL_CAP_FMT_FAST_STATFS 0x00000400 +#define VOL_CAP_FMT_PERSISTENTOBJECTIDS 0x00000001 +#define VOL_CAP_FMT_SYMBOLICLINKS 0x00000002 +#define VOL_CAP_FMT_HARDLINKS 0x00000004 +#define VOL_CAP_FMT_JOURNAL 0x00000008 +#define VOL_CAP_FMT_JOURNAL_ACTIVE 0x00000010 +#define VOL_CAP_FMT_NO_ROOT_TIMES 0x00000020 +#define VOL_CAP_FMT_SPARSE_FILES 0x00000040 +#define VOL_CAP_FMT_ZERO_RUNS 0x00000080 +#define VOL_CAP_FMT_CASE_SENSITIVE 0x00000100 +#define VOL_CAP_FMT_CASE_PRESERVING 0x00000200 +#define VOL_CAP_FMT_FAST_STATFS 0x00000400 +#define VOL_CAP_FMT_2TB_FILESIZE 0x00000800 /* * VOL_CAP_INT_SEARCHFS: When set, the volume implements the - * searchfs() system call (the VOP_SEARCHFS vnode operation). + * searchfs() system call (the vnop_searchfs vnode operation). * * VOL_CAP_INT_ATTRLIST: When set, the volume implements the - * getattrlist() and setattrlist() system calls (VOP_GETATTRLIST - * and VOP_SETATTRLIST vnode operations) for the volume, files, + * getattrlist() and setattrlist() system calls (vnop_getattrlist + * and vnop_setattrlist vnode operations) for the volume, files, * and directories. The volume may or may not implement the * readdirattr() system call. XXX Is there any minimum set * of attributes that should be supported? To determine the @@ -205,17 +209,17 @@ typedef struct vol_capabilities_attr { * of NFS volumes. * * VOL_CAP_INT_READDIRATTR: When set, the volume implements the - * readdirattr() system call (VOP_READDIRATTR vnode operation). + * readdirattr() system call (vnop_readdirattr vnode operation). * * VOL_CAP_INT_EXCHANGEDATA: When set, the volume implements the - * exchangedata() system call (VOP_EXCHANGE vnode operation). + * exchangedata() system call (VNOP_EXCHANGE vnode operation). * * VOL_CAP_INT_COPYFILE: When set, the volume implements the * VOP_COPYFILE vnode operation. (XXX There should be a copyfile() * system call in <unistd.h>.) * * VOL_CAP_INT_ALLOCATE: When set, the volume implements the - * VOP_ALLOCATE vnode operation, which means it implements the + * VNOP_ALLOCATE vnode operation, which means it implements the * F_PREALLOCATE selector of fcntl(2).
* * VOL_CAP_INT_VOL_RENAME: When set, the volume implements the @@ -223,38 +227,38 @@ typedef struct vol_capabilities_attr { * The volume can be renamed by setting ATTR_VOL_NAME with setattrlist(). * * VOL_CAP_INT_ADVLOCK: When set, the volume implements POSIX style - * byte range locks via VOP_ADVLOCK (accessible from fcntl(2)). + * byte range locks via vnop_advlock (accessible from fcntl(2)). * * VOL_CAP_INT_FLOCK: When set, the volume implements whole-file flock(2) - * style locks via VOP_ADVLOCK. This includes the O_EXLOCK and O_SHLOCK + * style locks via vnop_advlock. This includes the O_EXLOCK and O_SHLOCK * flags of the open(2) call. * */ -#define VOL_CAP_INT_SEARCHFS 0x00000001 -#define VOL_CAP_INT_ATTRLIST 0x00000002 -#define VOL_CAP_INT_NFSEXPORT 0x00000004 -#define VOL_CAP_INT_READDIRATTR 0x00000008 -#define VOL_CAP_INT_EXCHANGEDATA 0x00000010 -#define VOL_CAP_INT_COPYFILE 0x00000020 -#define VOL_CAP_INT_ALLOCATE 0x00000040 -#define VOL_CAP_INT_VOL_RENAME 0x00000080 -#define VOL_CAP_INT_ADVLOCK 0x00000100 -#define VOL_CAP_INT_FLOCK 0x00000200 +#define VOL_CAP_INT_SEARCHFS 0x00000001 +#define VOL_CAP_INT_ATTRLIST 0x00000002 +#define VOL_CAP_INT_NFSEXPORT 0x00000004 +#define VOL_CAP_INT_READDIRATTR 0x00000008 +#define VOL_CAP_INT_EXCHANGEDATA 0x00000010 +#define VOL_CAP_INT_COPYFILE 0x00000020 +#define VOL_CAP_INT_ALLOCATE 0x00000040 +#define VOL_CAP_INT_VOL_RENAME 0x00000080 +#define VOL_CAP_INT_ADVLOCK 0x00000100 +#define VOL_CAP_INT_FLOCK 0x00000200 +#define VOL_CAP_INT_EXTENDED_SECURITY 0x00000400 +#define VOL_CAP_INT_USERACCESS 0x00000800 typedef struct vol_attributes_attr { attribute_set_t validattr; attribute_set_t nativeattr; } vol_attributes_attr_t; -#define DIR_MNTSTATUS_MNTPOINT 0x00000001 - #define ATTR_CMN_NAME 0x00000001 #define ATTR_CMN_DEVID 0x00000002 #define ATTR_CMN_FSID 0x00000004 #define ATTR_CMN_OBJTYPE 0x00000008 #define ATTR_CMN_OBJTAG 0x00000010 #define ATTR_CMN_OBJID 0x00000020 -#define ATTR_CMN_OBJPERMANENTID 0x00000040 +#define ATTR_CMN_OBJPERMANENTID 0x00000040 #define ATTR_CMN_PAROBJID 0x00000080 #define ATTR_CMN_SCRIPT 0x00000100 #define ATTR_CMN_CRTIME 0x00000200 @@ -267,12 +271,13 @@ typedef struct vol_attributes_attr { #define ATTR_CMN_GRPID 0x00010000 #define ATTR_CMN_ACCESSMASK 0x00020000 #define ATTR_CMN_FLAGS 0x00040000 -#define ATTR_CMN_NAMEDATTRCOUNT 0x00080000 -#define ATTR_CMN_NAMEDATTRLIST 0x00100000 #define ATTR_CMN_USERACCESS 0x00200000 +#define ATTR_CMN_EXTENDED_SECURITY 0x00400000 +#define ATTR_CMN_UUID 0x00800000 +#define ATTR_CMN_GRPUUID 0x01000000 #define ATTR_CMN_VALIDMASK 0x003FFFFF -#define ATTR_CMN_SETMASK 0x0007FF00 +#define ATTR_CMN_SETMASK 0x01C7FF00 #define ATTR_CMN_VOLSETMASK 0x00006700 #define ATTR_VOL_FSTYPE 0x00000001 @@ -280,19 +285,19 @@ typedef struct vol_attributes_attr { #define ATTR_VOL_SIZE 0x00000004 #define ATTR_VOL_SPACEFREE 0x00000008 #define ATTR_VOL_SPACEAVAIL 0x00000010 -#define ATTR_VOL_MINALLOCATION 0x00000020 -#define ATTR_VOL_ALLOCATIONCLUMP 0x00000040 -#define ATTR_VOL_IOBLOCKSIZE 0x00000080 +#define ATTR_VOL_MINALLOCATION 0x00000020 +#define ATTR_VOL_ALLOCATIONCLUMP 0x00000040 +#define ATTR_VOL_IOBLOCKSIZE 0x00000080 #define ATTR_VOL_OBJCOUNT 0x00000100 #define ATTR_VOL_FILECOUNT 0x00000200 #define ATTR_VOL_DIRCOUNT 0x00000400 -#define ATTR_VOL_MAXOBJCOUNT 0x00000800 +#define ATTR_VOL_MAXOBJCOUNT 0x00000800 #define ATTR_VOL_MOUNTPOINT 0x00001000 #define ATTR_VOL_NAME 0x00002000 #define ATTR_VOL_MOUNTFLAGS 0x00004000 -#define ATTR_VOL_MOUNTEDDEVICE 0x00008000 -#define ATTR_VOL_ENCODINGSUSED 0x00010000 
-#define ATTR_VOL_CAPABILITIES 0x00020000 +#define ATTR_VOL_MOUNTEDDEVICE 0x00008000 +#define ATTR_VOL_ENCODINGSUSED 0x00010000 +#define ATTR_VOL_CAPABILITIES 0x00020000 #define ATTR_VOL_ATTRIBUTES 0x40000000 #define ATTR_VOL_INFO 0x80000000 @@ -303,7 +308,8 @@ typedef struct vol_attributes_attr { /* File/directory attributes: */ #define ATTR_DIR_LINKCOUNT 0x00000001 #define ATTR_DIR_ENTRYCOUNT 0x00000002 -#define ATTR_DIR_MOUNTSTATUS 0x00000004 +#define ATTR_DIR_MOUNTSTATUS 0x00000004 +#define DIR_MNTSTATUS_MNTPOINT 0x00000001 #define ATTR_DIR_VALIDMASK 0x00000007 #define ATTR_DIR_SETMASK 0x00000000 @@ -311,18 +317,14 @@ typedef struct vol_attributes_attr { #define ATTR_FILE_LINKCOUNT 0x00000001 #define ATTR_FILE_TOTALSIZE 0x00000002 #define ATTR_FILE_ALLOCSIZE 0x00000004 -#define ATTR_FILE_IOBLOCKSIZE 0x00000008 -#define ATTR_FILE_CLUMPSIZE 0x00000010 +#define ATTR_FILE_IOBLOCKSIZE 0x00000008 #define ATTR_FILE_DEVTYPE 0x00000020 -#define ATTR_FILE_FILETYPE 0x00000040 #define ATTR_FILE_FORKCOUNT 0x00000080 #define ATTR_FILE_FORKLIST 0x00000100 -#define ATTR_FILE_DATALENGTH 0x00000200 -#define ATTR_FILE_DATAALLOCSIZE 0x00000400 -#define ATTR_FILE_DATAEXTENTS 0x00000800 -#define ATTR_FILE_RSRCLENGTH 0x00001000 -#define ATTR_FILE_RSRCALLOCSIZE 0x00002000 -#define ATTR_FILE_RSRCEXTENTS 0x00004000 +#define ATTR_FILE_DATALENGTH 0x00000200 +#define ATTR_FILE_DATAALLOCSIZE 0x00000400 +#define ATTR_FILE_RSRCLENGTH 0x00001000 +#define ATTR_FILE_RSRCALLOCSIZE 0x00002000 #define ATTR_FILE_VALIDMASK 0x00007FFF #define ATTR_FILE_SETMASK 0x00000020 @@ -333,14 +335,25 @@ typedef struct vol_attributes_attr { #define ATTR_FORK_VALIDMASK 0x00000003 #define ATTR_FORK_SETMASK 0x00000000 -#define SRCHFS_START 0x00000001 +/* Obsolete, implemented, not supported */ +#define ATTR_CMN_NAMEDATTRCOUNT 0x00080000 /* not implemented */ +#define ATTR_CMN_NAMEDATTRLIST 0x00100000 /* not implemented */ +#define ATTR_FILE_CLUMPSIZE 0x00000010 /* obsolete */ +#define ATTR_FILE_FILETYPE 0x00000040 /* always zero */ +#define ATTR_FILE_DATAEXTENTS 0x00000800 /* obsolete, HFS-specific */ +#define ATTR_FILE_RSRCEXTENTS 0x00004000 /* obsolete, HFS-specific */ + +/* + * Searchfs + */ +#define SRCHFS_START 0x00000001 #define SRCHFS_MATCHPARTIALNAMES 0x00000002 -#define SRCHFS_MATCHDIRS 0x00000004 -#define SRCHFS_MATCHFILES 0x00000008 -#define SRCHFS_SKIPLINKS 0x00000010 -#define SRCHFS_SKIPINVISIBLE 0x00000020 -#define SRCHFS_SKIPPACKAGES 0x00000040 -#define SRCHFS_SKIPINAPPROPRIATE 0x00000080 +#define SRCHFS_MATCHDIRS 0x00000004 +#define SRCHFS_MATCHFILES 0x00000008 +#define SRCHFS_SKIPLINKS 0x00000010 +#define SRCHFS_SKIPINVISIBLE 0x00000020 +#define SRCHFS_SKIPPACKAGES 0x00000040 +#define SRCHFS_SKIPINAPPROPRIATE 0x00000080 #define SRCHFS_NEGATEPARAMS 0x80000000 #define SRCHFS_VALIDOPTIONSMASK 0x800000FF @@ -358,6 +371,37 @@ struct fssearchblock { struct attrlist searchattrs; }; +#ifdef KERNEL +/* LP64 version of fssearchblock. all pointers and longs + * grow when we're dealing with a 64-bit process. + * WARNING - keep in sync with fssearchblock + */ +// LP64todo - should this move? 
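/*
 * Illustrative sketch (an assumption, not original source): for a
 * 32-bit caller, the syscall layer would typically copyin the ILP32
 * fssearchblock and widen it field by field into a user_fssearchblock,
 * e.g. using the CAST_USER_ADDR_T() macro for the pointer fields:
 *
 *	struct fssearchblock sb;
 *	struct user_fssearchblock usb;
 *
 *	error = copyin(uap->searchblock, (caddr_t)&sb, sizeof(sb));
 *	usb.returnattrs = CAST_USER_ADDR_T(sb.returnattrs);
 *	usb.returnbuffersize = sb.returnbuffersize;
 *	and so on for the remaining fields
 */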
+ +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_fssearchblock { + user_addr_t returnattrs; + user_addr_t returnbuffer; + user_size_t returnbuffersize; + user_ulong_t maxmatches; + struct timeval timelimit; + user_addr_t searchparams1; + user_size_t sizeofsearchparams1; + user_addr_t searchparams2; + user_size_t sizeofsearchparams2; + struct attrlist searchattrs; +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + + +#endif // KERNEL + struct searchstate { u_char reserved[556]; // sizeof( SearchState ) diff --git a/bsd/sys/audit.h b/bsd/sys/audit.h deleted file mode 100644 index 5b53aa206..000000000 --- a/bsd/sys/audit.h +++ /dev/null @@ -1,208 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ - -#ifndef _SYS_AUDIT_H -#define _SYS_AUDIT_H - -#include <sys/queue.h> -#include <sys/ucred.h> -#include <sys/param.h> -#include <sys/ipc.h> -#include <sys/socket.h> - -#define AUDIT_RECORD_MAGIC 0x828a0f1b -#define MAX_AUDIT_RECORDS 20 -#define MAX_AUDIT_RECORD_SIZE 4096 - -/* - * Define the masks for the classes of audit events. - */ -#define AU_NULL 0x00000000 -#define AU_FREAD 0x00000001 -#define AU_FWRITE 0x00000002 -#define AU_FACCESS 0x00000004 -#define AU_FMODIFY 0x00000008 -#define AU_FCREATE 0x00000010 -#define AU_FDELETE 0x00000020 -#define AU_CLOSE 0x00000040 -#define AU_PROCESS 0x00000080 -#define AU_NET 0x00000100 -#define AU_IPC 0x00000200 -#define AU_NONAT 0x00000400 -#define AU_ADMIN 0x00000800 -#define AU_LOGIN 0x00001000 -#define AU_TFM 0x00002000 -#define AU_APPL 0x00004000 -#define AU_SETL 0x00008000 -#define AU_IFLOAT 0x00010000 -#define AU_PRIV 0x00020000 -#define AU_MAC_RW 0x00040000 -#define AU_XCONN 0x00080000 -#define AU_XCREATE 0x00100000 -#define AU_XDELETE 0x00200000 -#define AU_XIFLOAT 0x00400000 -#define AU_XPRIVS 0x00800000 -#define AU_XPRIVF 0x01000000 -#define AU_XMOVE 0x02000000 -#define AU_XDACF 0x04000000 -#define AU_XMACF 0x08000000 -#define AU_XSECATTR 0x10000000 -#define AU_IOCTL 0x20000000 -#define AU_EXEC 0x40000000 -#define AU_OTHER 0x80000000 -#define AU_ALL 0xffffffff - -/* - * IPC types - */ -#define AT_IPC_MSG ((u_char)1) /* message IPC id */ -#define AT_IPC_SEM ((u_char)2) /* semaphore IPC id */ -#define AT_IPC_SHM ((u_char)3) /* shared mem IPC id */ - -/* - * Audit conditions. - */ -#define AUC_UNSET 0 -#define AUC_AUDITING 1 -#define AUC_NOAUDIT 2 -#define AUC_DISABLED -1 - -/* - * auditon(2) commands. 
- */ -#define A_GETPOLICY 2 -#define A_SETPOLICY 3 -#define A_GETKMASK 4 -#define A_SETKMASK 5 -#define A_GETQCTRL 6 -#define A_SETQCTRL 7 -#define A_GETCWD 8 -#define A_GETCAR 9 -#define A_GETSTAT 12 -#define A_SETSTAT 13 -#define A_SETUMASK 14 -#define A_SETSMASK 15 -#define A_GETCOND 20 -#define A_SETCOND 21 -#define A_GETCLASS 22 -#define A_SETCLASS 23 -#define A_GETPINFO 24 -#define A_SETPMASK 25 -#define A_SETFSIZE 26 -#define A_GETFSIZE 27 -#define A_GETPINFO_ADDR 28 -#define A_GETKAUDIT 29 -#define A_SETKAUDIT 30 - -/* - * Audit policy controls. - */ -#define AUDIT_CNT 0x0001 -#define AUDIT_AHLT 0x0002 -#define AUDIT_ARGV 0x0004 -#define AUDIT_ARGE 0x0008 -#define AUDIT_PASSWD 0x0010 -#define AUDIT_SEQ 0x0020 -#define AUDIT_WINDATA 0x0040 -#define AUDIT_USER 0x0080 -#define AUDIT_GROUP 0x0100 -#define AUDIT_TRAIL 0x0200 -#define AUDIT_PATH 0x0400 - -typedef uid_t au_id_t; -typedef pid_t au_asid_t; -typedef u_int16_t au_event_t; -typedef u_int16_t au_emod_t; -typedef u_int32_t au_class_t; - -struct au_tid { - dev_t port; - u_int32_t machine; -}; -typedef struct au_tid au_tid_t; - -struct au_tid_addr { - dev_t at_port; - u_int32_t at_type; - u_int32_t at_addr[4]; -}; -typedef struct au_tid_addr au_tid_addr_t; - -struct au_mask { - unsigned int am_success; /* success bits */ - unsigned int am_failure; /* failure bits */ -}; -typedef struct au_mask au_mask_t; - -struct auditinfo { - au_id_t ai_auid; /* Audit user ID */ - au_mask_t ai_mask; /* Audit masks */ - au_tid_t ai_termid; /* Terminal ID */ - au_asid_t ai_asid; /* Audit session ID */ -}; -typedef struct auditinfo auditinfo_t; - -struct auditinfo_addr { - au_id_t ai_auid; /* Audit user ID */ - au_mask_t ai_mask; /* Audit masks */ - au_tid_addr_t ai_termid; /* Terminal ID */ - au_asid_t ai_asid; /* Audit session ID */ -}; -typedef struct auditinfo_addr auditinfo_addr_t; - -/* Token and record structures */ - -struct au_token { - u_char *t_data; - size_t len; - TAILQ_ENTRY(au_token) tokens; -}; -typedef struct au_token token_t; - -struct au_record { - char used; /* Is this record currently being used */ - int desc; /* The descriptor associated with this record */ - TAILQ_HEAD(, au_token) token_q; /* queue of BSM tokens */ - u_char *data; - size_t len; - LIST_ENTRY(au_record) au_rec_q; -}; -typedef struct au_record au_record_t; - -#ifndef KERNEL -#include <sys/cdefs.h> - -__BEGIN_DECLS -int audit (const void *, int); -int auditon (int, void *, int); -int auditsvc (int, int); -int auditctl (const char *); -int getauid (au_id_t *); -int setauid (const au_id_t *); -int getaudit (struct auditinfo *); -int setaudit (const struct auditinfo *); -int getaudit_addr (struct auditinfo_addr *, int); -int setaudit_addr (const struct auditinfo_addr *, int); -__END_DECLS -#endif /* !KERNEL */ - -#endif /* !_SYS_AUDIT_H */ diff --git a/bsd/sys/buf.h b/bsd/sys/buf.h index 86ceeeb96..91aa77cf7 100644 --- a/bsd/sys/buf.h +++ b/bsd/sys/buf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -63,226 +63,388 @@ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ -#include <sys/appleapiopts.h> - -#ifdef KERNEL -#include <sys/queue.h> -#include <sys/errno.h> -#include <sys/vm.h> #include <sys/cdefs.h> +#include <sys/kernel_types.h> +#include <mach/memory_object_types.h> + -#ifdef __APPLE_API_PRIVATE - -#define NOLIST ((struct buf *)0x87654321) - -/* - * The buffer header describes an I/O operation in the kernel. 
- */ -struct buf { - LIST_ENTRY(buf) b_hash; /* Hash chain. */ - LIST_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */ - TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */ - struct proc *b_proc; /* Associated proc; NULL if kernel. */ - volatile long b_flags; /* B_* flags. */ - int b_error; /* Errno value. */ - long b_bufsize; /* Allocated buffer size. */ - long b_bcount; /* Valid bytes in buffer. */ - long b_resid; /* Remaining I/O. */ - dev_t b_dev; /* Device associated with buffer. */ - struct { - caddr_t b_addr; /* Memory, superblocks, indirect etc.*/ - } b_un; - void *b_saveaddr; /* Original b_addr for physio. */ - daddr_t b_lblkno; /* Logical block number. */ - daddr_t b_blkno; /* Underlying physical block number. */ - /* Function to call upon completion. */ - void (*b_iodone) __P((struct buf *)); - struct vnode *b_vp; /* Device vnode. */ - int b_dirtyoff; /* Offset in buffer of dirty region. */ - int b_dirtyend; /* Offset of end of dirty region. */ - int b_validoff; /* Offset in buffer of valid region. */ - int b_validend; /* Offset of end of valid region. */ - struct ucred *b_rcred; /* Read credentials reference. */ - struct ucred *b_wcred; /* Write credentials reference. */ - int b_timestamp; /* timestamp for queuing operation */ - long b_vectorcount; /* number of vectors in b_vectorlist */ - void *b_vectorlist; /* vector list for I/O */ - void *b_pagelist; /* to save pagelist info */ - long b_vects[2]; /* vectorlist when b_vectorcount is 1 */ - long b_whichq; /* the free list the buffer belongs to */ - TAILQ_ENTRY(buf) b_act; /* Device driver queue when active */ - void *b_drvdata; /* Device driver private use */ -}; - -/* - * For portability with historic industry practice, the cylinder number has - * to be maintained in the `b_resid' field. - */ -#define b_cylinder b_resid /* Cylinder number for disksort(). */ - -/* Device driver compatibility definitions. */ -#define b_active b_bcount /* Driver queue head: drive active. */ -#define b_data b_un.b_addr /* b_un.b_addr is not changeable. */ -#define b_errcnt b_resid /* Retry count while I/O in progress. */ -#define iodone biodone /* Old name for biodone. */ -#define iowait biowait /* Old name for biowait. */ - -/* cluster_io definitions for use with io bufs */ -#define b_uploffset b_bufsize -#define b_trans_head b_freelist.tqe_prev -#define b_trans_next b_freelist.tqe_next -#define b_real_bp b_saveaddr -#define b_iostate b_rcred - -/* journaling uses this cluster i/o field for its own - * purposes because meta data buf's should never go - * through the clustering code. - */ -#define b_transaction b_vectorlist - - - -/* - * These flags are kept in b_flags. - */ -#define B_AGE 0x00000001 /* Move to age queue when I/O done. */ -#define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ -#define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ -#define B_BAD 0x00000008 /* Bad block revectoring in progress. */ -#define B_BUSY 0x00000010 /* I/O in progress. */ -#define B_CACHE 0x00000020 /* Bread found us in the cache. */ -#define B_CALL 0x00000040 /* Call b_iodone from biodone. */ -#define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ -#define B_DIRTY 0x00000100 /* Dirty page to be pushed out async. */ -#define B_DONE 0x00000200 /* I/O completed. */ -#define B_EINTR 0x00000400 /* I/O was interrupted */ -#define B_ERROR 0x00000800 /* I/O error occurred. */ -#define B_WASDIRTY 0x00001000 /* page was found dirty in the VM cache */ -#define B_INVAL 0x00002000 /* Does not contain valid info. 
*/ -#define B_LOCKED 0x00004000 /* Locked in core (not reusable). */ -#define B_NOCACHE 0x00008000 /* Do not cache block after use. */ -#define B_PAGEOUT 0x00010000 /* Page out indicator... */ -#define B_PGIN 0x00020000 /* Pagein op, so swap() can count it. */ -#define B_PHYS 0x00040000 /* I/O to user memory. */ -#define B_RAW 0x00080000 /* Set by physio for raw transfers. */ -#define B_READ 0x00100000 /* Read buffer. */ -#define B_TAPE 0x00200000 /* Magnetic tape I/O. */ -#define B_PAGELIST 0x00400000 /* Buffer describes pagelist I/O. */ -#define B_WANTED 0x00800000 /* Process wants this buffer. */ #define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ -#define B_WRITEINPROG 0x01000000 /* Write in progress. */ -#define B_HDRALLOC 0x02000000 /* zone allocated buffer header */ -#define B_NORELSE 0x04000000 /* don't brelse() in bwrite() */ -#define B_NEED_IODONE 0x08000000 - /* need to do a biodone on the */ - /* real_bp associated with a cluster_io */ -#define B_COMMIT_UPL 0x10000000 - /* commit pages in upl when */ - /* I/O completes/fails */ -#define B_ZALLOC 0x20000000 /* b_data is zalloc()ed */ -#define B_META 0x40000000 /* buffer contains meta-data. */ -#define B_VECTORLIST 0x80000000 /* Used by device drivers. */ - - -/* - * Zero out the buffer's data area. - */ -#define clrbuf(bp) { \ - bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ - (bp)->b_resid = 0; \ -} - -/* Flags to low-level allocation routines. */ -#define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ -#define B_SYNC 0x02 /* Do all allocations synchronously. */ -#define B_NOBUFF 0x04 /* Do not allocate struct buf */ - -/* Flags for operation type in getblk() */ -#define BLK_READ 0x01 /* buffer for read */ -#define BLK_WRITE 0x02 /* buffer for write */ -#define BLK_PAGEIN 0x04 /* buffer for pagein */ -#define BLK_PAGEOUT 0x08 /* buffer for pageout */ -#define BLK_META 0x10 /* buffer for metadata */ -#define BLK_CLREAD 0x20 /* buffer for cluster read */ -#define BLK_CLWRITE 0x40 /* buffer for cluster write */ +#define B_READ 0x00000001 /* Read buffer. */ +#define B_ASYNC 0x00000002 /* Start I/O, do not wait. */ +#define B_NOCACHE 0x00000004 /* Do not cache block after use. */ +#define B_DELWRI 0x00000008 /* Delay I/O until buffer reused. */ +#define B_LOCKED 0x00000010 /* Locked in core (not reusable). */ +#define B_PHYS 0x00000020 /* I/O to user memory. */ +#define B_CLUSTER 0x00000040 /* UPL based I/O generated by cluster layer */ +#define B_PAGEIO 0x00000080 /* Page in/out */ +#define B_META 0x00000100 /* buffer contains meta-data. */ +/* + * make sure to check when adding flags + * that the new flags don't overlap the definitions + * in buf_internal.h + */ -extern int nbuf; /* The number of buffer headers */ -extern struct buf *buf; /* The buffer headers. */ +__BEGIN_DECLS -#endif /* __APPLE_API_PRIVATE */ +/* + * mark the buffer associated with buf_t + * as AGED with respect to the LRU cache + */ +void buf_markaged(buf_t); +/* + * mark the buffer associated with buf_t + * as invalid... on release, it will go + * directly to the free list + */ +void buf_markinvalid(buf_t); -#ifdef __APPLE_API_UNSTABLE -/* Macros to clear/set/test flags. */ -#define SET(t, f) (t) |= (f) -#define CLR(t, f) (t) &= ~(f) -#define ISSET(t, f) ((t) & (f)) -#endif /* __APPLE_API_UNSTABLE */ +/* + * mark the buffer associated with buf_t + * as a delayed write... + */ +void buf_markdelayed(buf_t); -#ifdef __APPLE_API_PRIVATE /*
+ * mark the buffer associated with buf_t + * as having been interrupted... EINTR */ -#define BQUEUES 6 /* number of free buffer queues */ +void buf_markeintr(buf_t); -#define BQ_LOCKED 0 /* super-blocks &c */ -#define BQ_LRU 1 /* lru, useful buffers */ -#define BQ_AGE 2 /* rubbish */ -#define BQ_EMPTY 3 /* buffer headers with no memory */ -#define BQ_META 4 /* buffer containing metadata */ -#define BQ_LAUNDRY 5 /* buffers that need cleaning */ -#endif /* __APPLE_API_PRIVATE */ +/* + * returns 1 if the buffer associated with buf_t + * contains valid data... 0 if it does not + */ +int buf_valid(buf_t); -__BEGIN_DECLS -#ifdef __APPLE_API_UNSTABLE -int allocbuf __P((struct buf *, int)); -void bawrite __P((struct buf *)); -void bdwrite __P((struct buf *)); -void biodone __P((struct buf *)); -int biowait __P((struct buf *)); -int bread __P((struct vnode *, daddr_t, int, - struct ucred *, struct buf **)); -int meta_bread __P((struct vnode *, daddr_t, int, - struct ucred *, struct buf **)); -int breada __P((struct vnode *, daddr_t, int, daddr_t, int, - struct ucred *, struct buf **)); -int breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int, - struct ucred *, struct buf **)); -int meta_breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int, - struct ucred *, struct buf **)); -void brelse __P((struct buf *)); -void bremfree __P((struct buf *)); -void bufinit __P((void)); -void bwillwrite __P((void)); -int bwrite __P((struct buf *)); -struct buf *getblk __P((struct vnode *, daddr_t, int, int, int, int)); -struct buf *geteblk __P((int)); -struct buf *incore __P((struct vnode *, daddr_t)); -u_int minphys __P((struct buf *bp)); -int physio __P((void (*)(struct buf *), struct buf *, dev_t, int , u_int (*)(struct buf *), struct uio *, int )); -int count_busy_buffers __P((void)); -struct buf *alloc_io_buf __P((struct vnode *, int)); -void free_io_buf __P((struct buf *)); -void reassignbuf __P((struct buf *, struct vnode *)); -#endif /* __APPLE_API_UNSTABLE */ -__END_DECLS +/* + * returns 1 if the buffer was already valid + * in the cache... i.e. 
no I/O was performed + * returns 0 otherwise + */ +int buf_fromcache(buf_t); + +/* + * returns the UPL associated with buf_t + */ +void * buf_upl(buf_t); + +/* + * returns the offset into the UPL + * associated with buf_t which is to be + * used as the base offset for this I/O + */ +uint32_t buf_uploffset(buf_t); + +/* + * returns read credential associated with buf_t + * a reference is taken which must be explicitly dropped + */ +ucred_t buf_rcred(buf_t); + +/* + * returns write credential associated with buf_t + * a reference is taken which must be explicitly dropped + */ +ucred_t buf_wcred(buf_t); + +/* + * returns process handle associated with buf_t + * i.e. identity of task that issued the I/O + */ +proc_t buf_proc(buf_t); + +uint32_t buf_dirtyoff(buf_t); +uint32_t buf_dirtyend(buf_t); +void buf_setdirtyoff(buf_t, uint32_t); +void buf_setdirtyend(buf_t, uint32_t); + +/* + * return the errno value associated with buf_t + */ +errno_t buf_error(buf_t); + +/* + * set errno on buf_t + */ +void buf_seterror(buf_t, errno_t); + +/* + * set specified flags on buf_t + * B_LOCKED/B_NOCACHE/B_ASYNC/B_READ/B_WRITE/B_PAGEIO + */ +void buf_setflags(buf_t, int32_t); + +/* + * clear specified flags on buf_t + * B_LOCKED/B_NOCACHE/B_ASYNC/B_READ/B_WRITE/B_PAGEIO + */ +void buf_clearflags(buf_t, int32_t); + +/* + * return external flags associated with buf_t + * B_CLUSTER/B_PHYS/B_LOCKED/B_DELWRI/B_ASYNC/B_READ/B_WRITE/B_META/B_PAGEIO + */ +int32_t buf_flags(buf_t); + +/* + * clears I/O related flags (both internal and + * external) associated with buf_t and allows + * the following to be set... + * B_READ/B_WRITE/B_ASYNC/B_NOCACHE + */ +void buf_reset(buf_t, int32_t); + +/* + * ensure that the data storage associated with buf_t + * is addressable + */ +errno_t buf_map(buf_t, caddr_t *); + +/* + * release our need to have the storage associated + * with buf_t in an addressable state + */ +errno_t buf_unmap(buf_t); + +/* + * set driver specific data for buf_t + */ +void buf_setdrvdata(buf_t, void *); + +/* + * retrieve driver specific data associated with buf_t + */ +void * buf_drvdata(buf_t); + +/* + * set fs specific data for buf_t + */ +void buf_setfsprivate(buf_t, void *); + +/* + * retrieve fs specific data associated with buf_t + */ +void * buf_fsprivate(buf_t); + +/* + * retrieve the physical block number associated with buf_t + */ +daddr64_t buf_blkno(buf_t); + +/* + * retrieve the logical block number associated with buf_t + * i.e. the block number derived from the file offset + */ +daddr64_t buf_lblkno(buf_t); + +/* + * set the physical block number associated with buf_t + */ +void buf_setblkno(buf_t, daddr64_t); + +/* + * set the logical block number associated with buf_t + * i.e. the block number derived from the file offset + */ +void buf_setlblkno(buf_t, daddr64_t); + +/* + * retrieve the count of valid bytes associated with buf_t + */ +uint32_t buf_count(buf_t); + +/* + * retrieve the size of the data store associated with buf_t + */ +uint32_t buf_size(buf_t); + +/* + * retrieve the residual I/O count associated with buf_t + * i.e. number of bytes that have not yet been completed + */ +uint32_t buf_resid(buf_t);
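A usage sketch of the mapping accessors above (an illustration, not taken from the original sources): a client that wants to zero a buffer's valid bytes without assuming buf_dataptr() is non-NULL might do, roughly:

	static errno_t
	zero_valid_bytes(buf_t bp)
	{
		caddr_t va;
		errno_t error;

		/* make the underlying storage addressable; this may map a UPL */
		if ((error = buf_map(bp, &va)) != 0)
			return (error);
		bzero(va, buf_count(bp));	/* valid bytes, not allocation size */
		return (buf_unmap(bp));
	}
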
+ +/* + * set the count of bytes associated with buf_t + * typically used to set the size of the I/O to be performed + */ +void buf_setcount(buf_t, uint32_t); + +/* + * set the size of the buffer store associated with buf_t + * typically used when providing private storage to buf_t + */ +void buf_setsize(buf_t, uint32_t); + +/* + * set the size in bytes of the unfinished I/O associated with buf_t + */ +void buf_setresid(buf_t, uint32_t); + +/* + * associate kernel addressable storage with buf_t + */ +void buf_setdataptr(buf_t, uintptr_t); + +/* + * retrieve pointer to buffer associated with buf_t + * if non-null, then guaranteed to be kernel addressable + * size of buffer can be retrieved via buf_size + * size of valid data can be retrieved via buf_count + * if NULL, then use buf_map/buf_unmap to manage access to the underlying storage + */ +uintptr_t buf_dataptr(buf_t); -#ifdef __APPLE_API_PRIVATE /* - * Stats on usefulness of the buffer cache + * return the vnode_t associated with buf_t */ -struct bufstats { - long bufs_incore; /* found incore */ - long bufs_busyincore; /* found incore. was busy */ - long bufs_vmhits; /* not incore. found in VM */ - long bufs_miss; /* not incore. not in VM */ - long bufs_sleeps; /* buffer starvation */ - long bufs_eblk; /* Calls to geteblk */ - long bufs_iobufmax; /* Max. number of IO buffers used */ - long bufs_iobufinuse; /* number of IO buffers in use */ - long bufs_iobufsleeps; /* IO buffer starvation */ -}; -#endif /* __APPLE_API_PRIVATE */ +vnode_t buf_vnode(buf_t); + +/* + * assign vnode_t to buf_t... the + * device currently associated with + * buf_t is not changed. + */ +void buf_setvnode(buf_t, vnode_t); + +/* + * return the dev_t associated with buf_t + */ +dev_t buf_device(buf_t); + +/* + * assign the dev_t associated with vnode_t + * to buf_t + */ +errno_t buf_setdevice(buf_t, vnode_t); + +errno_t buf_strategy(vnode_t, void *); + +/* + * flags for buf_invalblkno + */ +#define BUF_WAIT 0x01 + +errno_t buf_invalblkno(vnode_t, daddr64_t, int); + + +/* + * return the callback function pointer + * if the callback is still valid + * returns NULL if a buffer that was not + * allocated via buf_alloc is specified + * or if a callback has not been set or + * it has already fired... + */ +void * buf_callback(buf_t); + +/* + * assign a one-shot callback function (driven from biodone) + * to a buf_t allocated via buf_alloc... a caller specified + * arg is passed to the callback function + */ +errno_t buf_setcallback(buf_t, void (*)(buf_t, void *), void *); + +/* + * add a upl_t to a buffer allocated via buf_alloc + * and set the offset into the upl_t (must be page + * aligned). + */ +errno_t buf_setupl(buf_t, upl_t, uint32_t);
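A short sketch of the one-shot callback mechanism (illustrative; my_done and the done_event channel are hypothetical names, not from these sources):

	static void
	my_done(buf_t bp, void *arg)
	{
		/* runs once, from buf_biodone(), when the I/O completes */
		if (buf_error(bp))
			printf("async I/O failed: %d\n", buf_error(bp));
		wakeup(arg);	/* wake the thread that issued the I/O */
	}

	/* on the issuing side, before handing bp to the driver: */
	buf_setcallback(bp, my_done, &done_event);
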
+ +/* + * allocate a buf_t that is a clone of the buf_t + * passed in, but whose I/O range is a subset... + * if a callback routine is specified, it will + * be called from buf_biodone with the bp and + * arg specified. + * it must be freed via buf_free + */ +buf_t buf_clone(buf_t, int, int, void (*)(buf_t, void *), void *); + +/* + * allocate a buf_t associated with vnode_t + * that has NO storage associated with it + * but is suitable for use in issuing I/Os + * after storage has been assigned via buf_setdataptr + * or buf_setupl + */ +buf_t buf_alloc(vnode_t); + +/* + * free a buf_t that was allocated via buf_alloc + * any private storage associated with buf_t is the + * responsibility of the caller to release + */ +void buf_free(buf_t); + +/* + * flags for buf_invalidateblks + */ +#define BUF_WRITE_DATA 0x0001 /* write data blocks first */ +#define BUF_SKIP_META 0x0002 /* skip over metadata blocks */ + +int buf_invalidateblks(vnode_t, int, int, int); +/* + * flags for buf_flushdirtyblks and buf_iterate + */ +#define BUF_SKIP_NONLOCKED 0x01 +#define BUF_SKIP_LOCKED 0x02 + +void buf_flushdirtyblks(vnode_t, int, int, char *); +void buf_iterate(vnode_t, int (*)(buf_t, void *), int, void *); + +#define BUF_RETURNED 0 +#define BUF_RETURNED_DONE 1 +#define BUF_CLAIMED 2 +#define BUF_CLAIMED_DONE 3 + +/* + * zero the storage associated with buf_t + */ +void buf_clear(buf_t); + +errno_t buf_bawrite(buf_t); +errno_t buf_bdwrite(buf_t); +errno_t buf_bwrite(buf_t); + +void buf_biodone(buf_t); +errno_t buf_biowait(buf_t); +void buf_brelse(buf_t); + +errno_t buf_bread(vnode_t, daddr64_t, int, ucred_t, buf_t *); +errno_t buf_breadn(vnode_t, daddr64_t, int, daddr64_t *, int *, int, ucred_t, buf_t *); +errno_t buf_meta_bread(vnode_t, daddr64_t, int, ucred_t, buf_t *); +errno_t buf_meta_breadn(vnode_t, daddr64_t, int, daddr64_t *, int *, int, ucred_t, buf_t *); + +u_int minphys(buf_t bp); +int physio(void (*)(buf_t), buf_t, dev_t, int, u_int (*)(buf_t), struct uio *, int); + + +/* + * Flags for operation type in getblk() + */ +#define BLK_READ 0x01 /* buffer for read */ +#define BLK_WRITE 0x02 /* buffer for write */ +#define BLK_META 0x10 /* buffer for metadata */ +/* + * modifier for above flags... if set, getblk will only return + * a bp that is already valid... i.e. found in the cache + */ +#define BLK_ONLYVALID 0x80000000 + +/* timeout is in msecs */ +buf_t buf_getblk(vnode_t, daddr64_t, int, int, int, int); +buf_t buf_geteblk(int); + +__END_DECLS + + +/* Macros to clear/set/test flags. */ +#define SET(t, f) (t) |= (f) +#define CLR(t, f) (t) &= ~(f) +#define ISSET(t, f) ((t) & (f)) + -#endif /* KERNEL */ #endif /* !_SYS_BUF_H_ */ diff --git a/bsd/sys/buf_internal.h b/bsd/sys/buf_internal.h new file mode 100644 index 000000000..e06f99253 --- /dev/null +++ b/bsd/sys/buf_internal.h @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License.
+ * + * @APPLE_LICENSE_HEADER_END@ + */ +/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)buf.h 8.9 (Berkeley) 3/30/95 + */ + +#ifndef _SYS_BUF_INTERNAL_H_ +#define _SYS_BUF_INTERNAL_H_ + +#include <sys/appleapiopts.h> + +#ifdef KERNEL +#include <sys/queue.h> +#include <sys/errno.h> +#include <sys/vm.h> +#include <sys/cdefs.h> +#include <sys/buf.h> +#include <sys/lock.h> + + +extern lck_mtx_t *buf_mtxp; +#define NOLIST ((struct buf *)0x87654321) + +/* + * The buffer header describes an I/O operation in the kernel. + */ +struct buf { + LIST_ENTRY(buf) b_hash; /* Hash chain. */ + LIST_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */ + TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */ + int b_timestamp; /* timestamp for queuing operation */ + long b_whichq; /* the free list the buffer belongs to */ + volatile long b_flags; /* B_* flags. */ + volatile long b_lflags; /* BL_BUSY | BL_WANTED flags... protected by buf_mtx */ + int b_error; /* errno value. */ + long b_bufsize; /* Allocated buffer size. */ + long b_bcount; /* Valid bytes in buffer. */ + long b_resid; /* Remaining I/O. */ + dev_t b_dev; /* Device associated with buffer. */ + uintptr_t b_datap; /* Memory, superblocks, indirect etc.*/ + daddr64_t b_lblkno; /* Logical block number. */ + daddr64_t b_blkno; /* Underlying physical block number. 
*/ + void (*b_iodone)(buf_t, void *); /* Function to call upon completion. */ + vnode_t b_vp; /* Device vnode. */ + struct ucred *b_rcred; /* Read credentials reference. */ + struct ucred *b_wcred; /* Write credentials reference. */ + void * b_upl; /* Pointer to UPL */ + buf_t b_real_bp; /* used to track bp generated through cluster_bp */ + TAILQ_ENTRY(buf) b_act; /* Device driver queue when active */ + void * b_drvdata; /* Device driver private use */ + void * b_fsprivate; /* filesystem private use */ + void * b_transaction; /* journal private use */ + int b_dirtyoff; /* Offset in buffer of dirty region. */ + int b_dirtyend; /* Offset of end of dirty region. */ + int b_validoff; /* Offset in buffer of valid region. */ + int b_validend; /* Offset of end of valid region. */ + proc_t b_proc; /* Associated proc; NULL if kernel. */ +#ifdef JOE_DEBUG + void * b_owner; + int b_tag; + void * b_lastbrelse; + int b_stackbrelse[6]; + int b_stackgetblk[6]; +#endif +}; + + +/* cluster_io definitions for use with io bufs */ +#define b_uploffset b_bufsize +#define b_trans_head b_freelist.tqe_prev +#define b_trans_next b_freelist.tqe_next +#define b_iostate b_rcred + +/* + * These flags are kept in b_lflags... + * buf_mtxp must be held before examining/updating + */ +#define BL_BUSY 0x00000001 /* I/O in progress. */ +#define BL_WANTED 0x00000002 /* Process wants this buffer. */ +#define BL_IOBUF 0x00000004 /* buffer allocated via 'buf_alloc' */ + + +/* + * mask used by buf_flags... these are the readable external flags + */ +#define BUF_X_RDFLAGS (B_CLUSTER | B_PHYS | B_LOCKED | B_DELWRI | B_ASYNC |\ + B_READ | B_WRITE | B_META | B_PAGEIO) /* + * mask used by buf_clearflags/buf_setflags... these are the writable external flags + */ +#define BUF_X_WRFLAGS (B_LOCKED | B_NOCACHE | B_ASYNC | B_READ | B_WRITE | B_PAGEIO) + +/* + * These flags are kept in b_flags... access is lockless + * External flags are defined in buf.h and cannot overlap + * the internal flags + * + * these flags are internal... their definition may change + */ +#define B_CACHE 0x00010000 /* getblk found us in the cache. */ +#define B_DONE 0x00020000 /* I/O completed. */ +#define B_INVAL 0x00040000 /* Does not contain valid info. */ +#define B_ERROR 0x00080000 /* I/O error occurred. */ +#define B_EINTR 0x00100000 /* I/O was interrupted */ +#define B_AGE 0x00200000 /* Move to age queue when I/O done. */ +#define B_FILTER 0x00400000 /* call b_iodone from biodone as an in-line filter */ +#define B_CALL 0x00800000 /* Call b_iodone from biodone, assumes b_iodone consumes bp */ +#define B_RAW 0x01000000 /* Set by physio for raw transfers. */ +#define B_WASDIRTY 0x02000000 /* page was found dirty in the VM cache */ +#define B_HDRALLOC 0x04000000 /* zone allocated buffer header */ +#define B_ZALLOC 0x08000000 /* b_datap is zalloc()ed */ +/* + * private flags used by the journal layer + */ +#define B_NORELSE 0x10000000 /* don't brelse() in bwrite() */ +/* + * private flags used by the cluster layer + */ +#define B_NEED_IODONE 0x20000000 /* need biodone on the real_bp associated with a cluster_io */ +#define B_COMMIT_UPL 0x40000000 /* commit/abort the UPL on I/O success/failure */ +/* + * can we deprecate? + */ +#define B_TAPE 0x80000000 /* Magnetic tape I/O. */ + + +/* Flags to low-level allocation routines. */ +#define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ +#define B_SYNC 0x02 /* Do all allocations synchronously.
*/ +#define B_NOBUFF 0x04 /* Do not allocate struct buf */ + + +extern int niobuf; /* The number of IO buffer headers for cluster IO */ +extern int nbuf; /* The number of buffer headers */ +extern struct buf *buf; /* The buffer headers. */ + + +/* + * Definitions for the buffer free lists. + */ +#define BQUEUES 6 /* number of free buffer queues */ + +#define BQ_LOCKED 0 /* super-blocks &c */ +#define BQ_LRU 1 /* lru, useful buffers */ +#define BQ_AGE 2 /* rubbish */ +#define BQ_EMPTY 3 /* buffer headers with no memory */ +#define BQ_META 4 /* buffer containing metadata */ +#define BQ_LAUNDRY 5 /* buffers that need cleaning */ + + +__BEGIN_DECLS + +buf_t alloc_io_buf(vnode_t, int); +void free_io_buf(buf_t); + +int allocbuf(struct buf *, int); +void bufinit(void); + +void buf_setfilter(buf_t, void (*)(buf_t, void *), void *, void **, void **); + +/* + * Flags for buf_acquire + */ +#define BAC_NOWAIT 0x01 /* Don't wait if buffer is busy */ +#define BAC_REMOVE 0x02 /* Remove from free list once buffer is acquired */ +#define BAC_SKIP_NONLOCKED 0x04 /* Don't return LOCKED buffers */ +#define BAC_SKIP_LOCKED 0x08 /* Only return LOCKED buffers */ + +void cluster_init(void); +void buf_drop(buf_t); +errno_t buf_acquire(buf_t, int, int, int); + +int count_busy_buffers(void); +int count_lock_queue(void); + + +__END_DECLS + + +/* + * Stats on usefulness of the buffer cache + */ +struct bufstats { + long bufs_incore; /* found incore */ + long bufs_busyincore; /* found incore. was busy */ + long bufs_vmhits; /* not incore. found in VM */ + long bufs_miss; /* not incore. not in VM */ + long bufs_sleeps; /* buffer starvation */ + long bufs_eblk; /* Calls to geteblk */ + long bufs_iobufmax; /* Max. number of IO buffers used */ + long bufs_iobufinuse; /* number of IO buffers in use */ + long bufs_iobufsleeps; /* IO buffer starvation */ +}; + +#endif /* KERNEL */ +#endif /* !_SYS_BUF_INTERNAL_H_ */ diff --git a/bsd/sys/cdefs.h b/bsd/sys/cdefs.h index 3ec2547df..46dc8ec7f 100644 --- a/bsd/sys/cdefs.h +++ b/bsd/sys/cdefs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -158,6 +158,38 @@ #define __unused #endif +/* + * GCC 2.95 provides `__restrict' as an extension to C90 to support the + * C99-specific `restrict' type qualifier. We happen to use `__restrict' as + * a way to define the `restrict' type qualifier without disturbing older + * software that is unaware of C99 keywords. + */ +#if !(__GNUC__ == 2 && __GNUC_MINOR__ == 95) +#if __STDC_VERSION__ < 199901 +#define __restrict +#else +#define __restrict restrict +#endif +#endif + +/* + * Compiler-dependent macros to declare that functions take printf-like + * or scanf-like arguments. They are null except for versions of gcc + * that are known to support the features properly. Functions declared + * with these attributes will cause compilation warnings if there is a + * mismatch between the format string and subsequent function parameter + * types.
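 *
 * A minimal illustration (mylog is a hypothetical function, not part
 * of this header):
 *
 *	void mylog(int pri, const char *fmt, ...) __printflike(2, 3);
 *
 * With this declaration, gcc can warn about a call such as
 * mylog(1, "%s", 42), where the argument type does not match the
 * format string.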
*/ +#if __GNUC__ > 2 || __GNUC__ == 2 && __GNUC_MINOR__ >= 7 +#define __printflike(fmtarg, firstvararg) \ + __attribute__((__format__ (__printf__, fmtarg, firstvararg))) +#define __scanflike(fmtarg, firstvararg) \ + __attribute__((__format__ (__scanf__, fmtarg, firstvararg))) +#else +#define __printflike(fmtarg, firstvararg) +#define __scanflike(fmtarg, firstvararg) +#endif + #define __IDSTRING(name,string) static const char name[] __unused = string #ifndef __COPYRIGHT @@ -176,4 +208,153 @@ #define __PROJECT_VERSION(s) __IDSTRING(project_version,s) #endif +/* + * The __DARWIN_ALIAS macros are used for symbol renaming: + * they allow old code to keep using the old symbol, maintaining binary + * compatibility, while new code can use a new, improved version of the + * same function. + * + * By default, newly compiled code actually gets the same symbols + * that the old code did. Defining any of _APPLE_C_SOURCE, _XOPEN_SOURCE, + * or _POSIX_C_SOURCE will give you the new symbols. Defining _XOPEN_SOURCE + * or _POSIX_C_SOURCE also restricts the available symbols to a subset of + * Apple's APIs. + * + * __DARWIN_ALIAS is used by itself if the function signature has not + * changed; it is used along with an #ifdef check for __DARWIN_UNIX03 + * if the signature has changed. Because the __LP64__ environment + * supports only UNIX03 semantics, it causes __DARWIN_UNIX03 to be + * defined, but causes __DARWIN_ALIAS to do no symbol mangling. + */ + +#if !defined(__DARWIN_UNIX03) +#if defined(_APPLE_C_SOURCE) || defined(_XOPEN_SOURCE) || defined(_POSIX_C_SOURCE) || defined(__LP64__) +#if defined(_NONSTD_SOURCE) +#error "Can't define both _NONSTD_SOURCE and any of _APPLE_C_SOURCE, _XOPEN_SOURCE, _POSIX_C_SOURCE, or __LP64__" +#endif /* _NONSTD_SOURCE */ +#define __DARWIN_UNIX03 1 +#elif defined(_NONSTD_SOURCE) +#define __DARWIN_UNIX03 0 +#else /* default */ +#define __DARWIN_UNIX03 0 +#endif /* _APPLE_C_SOURCE || _XOPEN_SOURCE || _POSIX_C_SOURCE || __LP64__ */ +#endif /* !__DARWIN_UNIX03 */ + +#if __DARWIN_UNIX03 && !defined(__LP64__) +#define __DARWIN_ALIAS(sym) __asm("_" __STRING(sym) "$UNIX2003") +#else +#define __DARWIN_ALIAS(sym) +#endif
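As an illustration (a sketch with a hypothetical declaration, not taken from this header), a prototype tagged as

	ssize_t write(int fd, const void *buf, size_t nbyte) __DARWIN_ALIAS(write);

binds to the symbol _write$UNIX2003 when compiled with __DARWIN_UNIX03 set in a 32-bit environment, and to plain _write otherwise, which is how both behaviors can coexist in one binary interface.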
+ +/* + * POSIX.1 requires that the macros we test be defined before any standard + * header file is included. This permits us to convert values for feature + * testing, as necessary, using only _POSIX_C_SOURCE. + * + * Here's a quick run-down of the versions: + * defined(_POSIX_SOURCE) 1003.1-1988 + * _POSIX_C_SOURCE == 1L 1003.1-1990 + * _POSIX_C_SOURCE == 2L 1003.2-1992 C Language Binding Option + * _POSIX_C_SOURCE == 199309L 1003.1b-1993 + * _POSIX_C_SOURCE == 199506L 1003.1c-1995, 1003.1i-1995, + * and the omnibus ISO/IEC 9945-1: 1996 + * _POSIX_C_SOURCE == 200112L 1003.1-2001 + * + * In addition, the X/Open Portability Guide, which is now the Single UNIX + * Specification, defines a feature-test macro which indicates the version of + * that specification, and which subsumes _POSIX_C_SOURCE. + */ + +/* Deal with IEEE Std. 1003.1-1990, in which _POSIX_C_SOURCE == 1L. */ +#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE == 1L +#undef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 199009L +#endif + +/* Deal with IEEE Std. 1003.2-1992, in which _POSIX_C_SOURCE == 2L. */ +#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE == 2L +#undef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 199209L +#endif + +/* Deal with various X/Open Portability Guides and Single UNIX Spec. */ +#ifdef _XOPEN_SOURCE +#if _XOPEN_SOURCE - 0L >= 600L +#undef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 200112L +#elif _XOPEN_SOURCE - 0L >= 500L +#undef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 199506L +#endif +#endif + +/* + * Deal with all versions of POSIX. The ordering relative to the tests above is + * important. + */ +#if defined(_POSIX_SOURCE) && !defined(_POSIX_C_SOURCE) +#define _POSIX_C_SOURCE 198808L +#endif + +/* + * long long is not supported in c89 (__STRICT_ANSI__), but g++ -ansi and + * c99 still want long longs. While not perfect, we allow long longs for + * g++. + */ +#define __DARWIN_NO_LONG_LONG (defined(__STRICT_ANSI__) \ + && (__STDC_VERSION__-0 < 199901L) \ + && !defined(__GNUG__)) + +/* + * Long double compatibility macros allow selecting variant symbols based + * on the old (compatible) 64-bit long doubles, or the new 128-bit + * long doubles. This applies only to ppc; i386 already has long double + * support, while ppc64 doesn't have any backwards history. + */ +#if defined(__ppc__) +# if defined(__LDBL_MANT_DIG__) && defined(__DBL_MANT_DIG__) && \ + __LDBL_MANT_DIG__ > __DBL_MANT_DIG__ +# if __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__-0 < 1040 +# define __DARWIN_LDBL_COMPAT(x) __asm("_" __STRING(x) "$LDBLStub") +# else +# define __DARWIN_LDBL_COMPAT(x) __asm("_" __STRING(x) "$LDBL128") +# endif +# define __DARWIN_LDBL_COMPAT2(x) __asm("_" __STRING(x) "$LDBL128") +# define __DARWIN_LONG_DOUBLE_IS_DOUBLE 0 +# else +# define __DARWIN_LDBL_COMPAT(x) /* nothing */ +# define __DARWIN_LDBL_COMPAT2(x) /* nothing */ +# define __DARWIN_LONG_DOUBLE_IS_DOUBLE 1 +# endif +#elif defined(__i386__) || defined(__ppc64__) +# define __DARWIN_LDBL_COMPAT(x) /* nothing */ +# define __DARWIN_LDBL_COMPAT2(x) /* nothing */ +# define __DARWIN_LONG_DOUBLE_IS_DOUBLE 0 +#else +# error Unknown architecture +#endif + +/* + * Structure alignment control macros. These specify how certain + * shared structures should be aligned. Some may need backward + * compatible legacy (POWER) alignment, while others may need + * forward compatible (NATURAL) alignment. + */ +#if !defined(__DARWIN_ALIGN_POWER) +#if defined(__ppc64__) +#define __DARWIN_ALIGN_POWER 1 +#else +#define __DARWIN_ALIGN_POWER 0 +#endif +#endif /* __DARWIN_ALIGN_POWER */ + +#if !defined(__DARWIN_ALIGN_NATURAL) +#if defined(__ppc__) && defined(KERNEL) +#define __DARWIN_ALIGN_NATURAL 1 +#else +#define __DARWIN_ALIGN_NATURAL 0 +#endif +#endif /* __DARWIN_ALIGN_NATURAL */ + #endif /* !_CDEFS_H_ */ diff --git a/bsd/sys/clist.h b/bsd/sys/clist.h index 64cb2eaea..5503f994a 100644 --- a/bsd/sys/clist.h +++ b/bsd/sys/clist.h @@ -58,10 +58,8 @@ #ifndef _SYS_CLIST_H_ #define _SYS_CLIST_H_ -#include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE -#ifdef KERNEL +#ifdef KERNEL_PRIVATE struct cblock { struct cblock *c_next; /* next cblock in queue */ @@ -71,8 +69,7 @@ struct cblock { extern struct cblock *cfree, *cfreelist; extern int cfreecount, nclist; -#endif /* KERNEL */ -#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL_PRIVATE */ #endif /* _SYS_CLIST_H_ */ diff --git a/bsd/sys/conf.h b/bsd/sys/conf.h index 9e7e80687..7dd5ad281 100644 --- a/bsd/sys/conf.h +++ b/bsd/sys/conf.h @@ -76,28 +76,36 @@ struct tty; struct uio; struct vnode; -#ifdef __APPLE_API_UNSTABLE +/* + * Types for d_type. + * These are returned by ioctl FIODTYPE + */ +#define D_TAPE 1 +#define D_DISK 2 +#define D_TTY 3 + +#ifdef KERNEL /* * Device switch function types.
*/ -typedef int open_close_fcn_t __P((dev_t dev, int flags, int devtype, - struct proc *p)); - -typedef struct tty *d_devtotty_t __P((dev_t dev)); - -typedef void strategy_fcn_t __P((struct buf *bp)); -typedef int ioctl_fcn_t __P((dev_t dev, u_long cmd, caddr_t data, - int fflag, struct proc *p)); -typedef int dump_fcn_t (); /* parameters vary by architecture */ -typedef int psize_fcn_t __P((dev_t dev)); -typedef int read_write_fcn_t __P((dev_t dev, struct uio *uio, int ioflag)); -typedef int stop_fcn_t __P((struct tty *tp, int rw)); -typedef int reset_fcn_t __P((int uban)); -typedef int select_fcn_t __P((dev_t dev, int which, void * wql, struct proc *p)); -typedef int mmap_fcn_t __P(()); -typedef int getc_fcn_t __P((dev_t dev)); -typedef int putc_fcn_t __P((dev_t dev, char c)); -typedef int d_poll_t __P((dev_t dev, int events, struct proc *p)); +typedef int open_close_fcn_t(dev_t dev, int flags, int devtype, + struct proc *p); + +typedef struct tty *d_devtotty_t(dev_t dev); + +typedef void strategy_fcn_t(struct buf *bp); +typedef int ioctl_fcn_t(dev_t dev, u_long cmd, caddr_t data, + int fflag, struct proc *p); +typedef int dump_fcn_t(void); /* parameters vary by architecture */ +typedef int psize_fcn_t(dev_t dev); +typedef int read_write_fcn_t(dev_t dev, struct uio *uio, int ioflag); +typedef int stop_fcn_t(struct tty *tp, int rw); +typedef int reset_fcn_t(int uban); +typedef int select_fcn_t(dev_t dev, int which, void * wql, struct proc *p); +typedef int mmap_fcn_t(void); +typedef int getc_fcn_t(dev_t dev); +typedef int putc_fcn_t(dev_t dev, char c); +typedef int d_poll_t(dev_t dev, int events, struct proc *p); #define d_open_t open_close_fcn_t #define d_close_t open_close_fcn_t @@ -113,8 +121,8 @@ typedef int d_poll_t __P((dev_t dev, int events, struct proc *p)); #define d_putc_t putc_fcn_t __BEGIN_DECLS -int enodev (); /* avoid actual prototype for multiple use */ -void enodev_strat(); +int enodev(void); +void enodev_strat(void); __END_DECLS /* @@ -134,12 +142,6 @@ __END_DECLS #define eno_putc ((putc_fcn_t *)&enodev) #define eno_select ((select_fcn_t *)&enodev) -/* - * Types for d_type. - */ -#define D_TAPE 1 -#define D_DISK 2 -#define D_TTY 3 /* * Block device switch table @@ -154,14 +156,13 @@ struct bdevsw { int d_type; }; -#ifdef KERNEL d_devtotty_t nodevtotty; d_write_t nowrite; -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE extern struct bdevsw bdevsw[]; -#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL_PRIVATE */ /* * Contents of empty bdevsw slot. @@ -170,7 +171,6 @@ extern struct bdevsw bdevsw[]; { eno_opcl, eno_opcl, eno_strat, eno_ioctl, \ eno_dump, eno_psize, 0 } -#endif /* KERNEL */ /* * Character device switch table @@ -180,23 +180,22 @@ struct cdevsw { open_close_fcn_t *d_close; read_write_fcn_t *d_read; read_write_fcn_t *d_write; - ioctl_fcn_t *d_ioctl; - stop_fcn_t *d_stop; - reset_fcn_t *d_reset; + ioctl_fcn_t *d_ioctl; + stop_fcn_t *d_stop; + reset_fcn_t *d_reset; struct tty **d_ttys; select_fcn_t *d_select; - mmap_fcn_t *d_mmap; + mmap_fcn_t *d_mmap; strategy_fcn_t *d_strategy; - getc_fcn_t *d_getc; - putc_fcn_t *d_putc; - int d_type; + getc_fcn_t *d_getc; + putc_fcn_t *d_putc; + int d_type; }; -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE extern struct cdevsw cdevsw[]; -#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL_PRIVATE */ /* * Contents of empty cdevsw slot. 
@@ -209,39 +208,45 @@ extern struct cdevsw cdevsw[]; (select_fcn_t *)seltrue, eno_mmap, eno_strat, eno_getc, \ eno_putc, 0 \ } + #endif /* KERNEL */ - + +#ifdef KERNEL_PRIVATE +typedef int l_open_t (dev_t dev, struct tty *tp); +typedef int l_close_t(struct tty *tp, int flags); +typedef int l_read_t (struct tty *tp, struct uio *uio, int flag); +typedef int l_write_t(struct tty *tp, struct uio *uio, int flag); +typedef int l_ioctl_t(struct tty *tp, u_long cmd, caddr_t data, int flag, + struct proc *p); +typedef int l_rint_t (int c, struct tty *tp); +typedef void l_start_t(struct tty *tp); +typedef int l_modem_t(struct tty *tp, int flag); + /* * Line discipline switch table */ struct linesw { - int (*l_open) __P((dev_t dev, struct tty *tp)); - int (*l_close) __P((struct tty *tp, int flags)); - int (*l_read) __P((struct tty *tp, struct uio *uio, - int flag)); - int (*l_write) __P((struct tty *tp, struct uio *uio, - int flag)); - int (*l_ioctl) __P((struct tty *tp, u_long cmd, caddr_t data, - int flag, struct proc *p)); - int (*l_rint) __P((int c, struct tty *tp)); - int (*l_start) __P((struct tty *tp)); - int (*l_modem) __P((struct tty *tp, int flag)); + l_open_t *l_open; + l_close_t *l_close; + l_read_t *l_read; + l_write_t *l_write; + l_ioctl_t *l_ioctl; + l_rint_t *l_rint; + l_start_t *l_start; + l_modem_t *l_modem; }; -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE extern struct linesw linesw[]; extern int nlinesw; -#endif /* __APPLE_API_PRIVATE */ -int ldisc_register __P((int , struct linesw *)); -void ldisc_deregister __P((int)); +int ldisc_register(int , struct linesw *); +void ldisc_deregister(int); #define LDISC_LOAD -1 /* Loadable line discipline */ -#endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ -#ifdef __APPLE_API_OBSOLETE +#ifdef BSD_KERNEL_PRIVATE /* * Swap device table */ @@ -255,11 +260,9 @@ struct swdevt { #define SW_SEQUENTIAL 0x02 #define sw_freed sw_flags /* XXX compat */ -#ifdef KERNEL extern struct swdevt swdevt[]; -#endif /* KERNEL */ -#endif /* __APPLE_API_OBSOLETE */ +#endif /* BSD_KERNEL_PRIVATE */ #ifdef KERNEL @@ -271,15 +274,14 @@ extern struct swdevt swdevt[]; * else -1 */ __BEGIN_DECLS -int bdevsw_isfree __P((int)); -int bdevsw_add __P((int, struct bdevsw *)); -int bdevsw_remove __P((int, struct bdevsw *)); -int cdevsw_isfree __P((int)); -int cdevsw_add __P((int, struct cdevsw *)); -int cdevsw_remove __P((int, struct cdevsw *)); +int bdevsw_isfree(int); +int bdevsw_add(int, struct bdevsw *); +int bdevsw_remove(int, struct bdevsw *); +int cdevsw_isfree(int); +int cdevsw_add(int, struct cdevsw *); +int cdevsw_add_with_bdev(int index, struct cdevsw * csw, int bdev); +int cdevsw_remove(int, struct cdevsw *); __END_DECLS #endif /* KERNEL */ -#endif /* __APPLE_API_UNSTABLE */ - #endif /* _SYS_CONF_H_ */ diff --git a/bsd/sys/dirent.h b/bsd/sys/dirent.h index 8dc0359cc..1b4d5e501 100644 --- a/bsd/sys/dirent.h +++ b/bsd/sys/dirent.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -69,19 +69,50 @@ #ifndef _SYS_DIRENT_H #define _SYS_DIRENT_H +#include <sys/_types.h> +#include <sys/cdefs.h> + +#ifndef _INO_T +typedef __darwin_ino_t ino_t; /* inode number */ +#define _INO_T +#endif + +#define __DARWIN_MAXNAMLEN 255 + +#if __DARWIN_ALIGN_POWER +#pragma options align=power +#endif + struct dirent { - u_int32_t d_fileno; /* file number of entry */ - u_int16_t d_reclen; /* length of this record */ - u_int8_t d_type; /* file type, see below */ - u_int8_t d_namlen; /* length of string in d_name */ -#ifdef _POSIX_SOURCE - char d_name[255 + 1]; /* name must be no longer than this */ -#else -#define MAXNAMLEN 255 - char d_name[MAXNAMLEN + 1]; /* name must be no longer than this */ + ino_t d_ino; /* file number of entry */ + __uint16_t d_reclen; /* length of this record */ + __uint8_t d_type; /* file type, see below */ + __uint8_t d_namlen; /* length of string in d_name */ + char d_name[__DARWIN_MAXNAMLEN + 1]; /* name must be no longer than this */ +}; + +#if __DARWIN_ALIGN_POWER +#pragma options align=reset #endif + +#ifdef KERNEL +#include <sys/kernel_types.h> + +/* Extended directory entry */ +struct direntry{ + ino64_t d_ino; /* file number of entry */ + __uint64_t d_seekoff; /* seek offset (optional, used by servers) */ + __uint16_t d_reclen; /* length of this record */ + __uint16_t d_namlen; /* length of string in d_name */ + __uint8_t d_type; /* file type, see below */ + u_char d_name[MAXPATHLEN - 1]; /* entry name (up to MAXPATHLEN - 1 bytes) */ }; +#endif + +#ifndef _POSIX_C_SOURCE +#define d_fileno d_ino /* backward compatibility */ +#define MAXNAMLEN __DARWIN_MAXNAMLEN /* * File types */ @@ -100,5 +131,6 @@ struct dirent { */ #define IFTODT(mode) (((mode) & 0170000) >> 12) #define DTTOIF(dirtype) ((dirtype) << 12) +#endif #endif /* _SYS_DIRENT_H */ diff --git a/bsd/sys/disk.h b/bsd/sys/disk.h index 0d5dc53bf..6e3d535ef 100644 --- a/bsd/sys/disk.h +++ b/bsd/sys/disk.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -25,6 +25,7 @@ #include <sys/ioctl.h> #include <sys/types.h> +#include <sys/cdefs.h> /* * Definitions @@ -54,6 +55,10 @@ * DKIOCGETMAXSEGMENTBYTECOUNTWRITE get maximum segment byte count for writes */ +#if __DARWIN_ALIGN_POWER +#pragma options align=power +#endif + typedef struct { char path[128]; @@ -67,6 +72,7 @@ typedef struct u_int8_t reserved0096[4]; /* reserved, clear to zero */ } dk_format_capacity_t; +/* LP64todo: not 64-bit clean */ typedef struct { dk_format_capacity_t * capacities; @@ -75,6 +81,10 @@ typedef struct u_int8_t reserved0064[8]; /* reserved, clear to zero */ } dk_format_capacities_t; +#if __DARWIN_ALIGN_POWER +#pragma options align=reset +#endif + #define DKIOCEJECT _IO('d', 21) #define DKIOCSYNCHRONIZECACHE _IO('d', 22) @@ -98,10 +108,11 @@ typedef struct #define DKIOCGETMAXSEGMENTBYTECOUNTWRITE _IOR('d', 69, u_int64_t) #ifdef KERNEL -#define DKIOCGETISVIRTUAL _IOR('d', 72, u_int32_t) #define DKIOCGETBLOCKCOUNT32 _IOR('d', 25, u_int32_t) #define DKIOCSETBLOCKSIZE _IOW('d', 24, u_int32_t) #define DKIOCGETBSDUNIT _IOR('d', 27, u_int32_t) +#define DKIOCISVIRTUAL _IOR('d', 72, u_int32_t) +#define DKIOCGETBASE _IOR('d', 73, u_int64_t) #endif /* KERNEL */ #endif /* _SYS_DISK_H_ */ diff --git a/bsd/sys/disklabel.h b/bsd/sys/disklabel.h index 4fa09f226..14eb3d746 100644 --- a/bsd/sys/disklabel.h +++ b/bsd/sys/disklabel.h @@ -58,6 +58,7 @@ #define _SYS_DISKLABEL_H_ #include <sys/appleapiopts.h> +#include <sys/types.h> /* for daddr_t */ #ifdef __APPLE_API_OBSOLETE @@ -357,7 +358,7 @@ struct partinfo { #include <sys/cdefs.h> __BEGIN_DECLS -struct disklabel *getdiskbyname __P((const char *)); +struct disklabel *getdiskbyname(const char *); __END_DECLS #endif diff --git a/bsd/sys/dkstat.h b/bsd/sys/dkstat.h index b0b256936..fa0060f6f 100644 --- a/bsd/sys/dkstat.h +++ b/bsd/sys/dkstat.h @@ -63,17 +63,11 @@ #ifndef _SYS_DKSTAT_H_ #define _SYS_DKSTAT_H_ -#include <sys/appleapiopts.h> - -#ifdef __APPLE_API_PRIVATE - -#ifdef KERNEL +#ifdef KERNEL_PRIVATE extern long tk_cancc; extern long tk_nin; extern long tk_nout; extern long tk_rawcc; #endif -#endif /* __APPLE_API_PRIVATE */ - #endif /* _SYS_DKSTAT_H_ */ diff --git a/bsd/sys/domain.h b/bsd/sys/domain.h index e6a75966c..c55eaeccd 100644 --- a/bsd/sys/domain.h +++ b/bsd/sys/domain.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -60,51 +60,68 @@ #ifndef _SYS_DOMAIN_H_ #define _SYS_DOMAIN_H_ -#include <sys/appleapiopts.h> +#ifdef PRIVATE +#include <sys/appleapiopts.h> +#ifdef KERNEL +#include <kern/locks.h> +#endif /* KERNEL */ /* * Structure per communications domain. */ +#include <sys/cdefs.h> + /* * Forward structure declarations for function prototypes [sic]. 
*/ -#ifdef __APPLE_API_UNSTABLE struct mbuf; +#define DOM_REENTRANT 0x01 + +#if __DARWIN_ALIGN_POWER +#pragma options align=power +#endif struct domain { int dom_family; /* AF_xxx */ char *dom_name; - void (*dom_init) /* initialize domain data structures */ - __P((void)); - int (*dom_externalize) /* externalize access rights */ - __P((struct mbuf *)); - void (*dom_dispose) /* dispose of internalized rights */ - __P((struct mbuf *)); + void (*dom_init)(void); /* initialize domain data structures */ + int (*dom_externalize)(struct mbuf *); + /* externalize access rights */ + void (*dom_dispose)(struct mbuf *); + /* dispose of internalized rights */ struct protosw *dom_protosw; /* Chain of protosw's for AF */ struct domain *dom_next; - int (*dom_rtattach) /* initialize routing table */ - __P((void **, int)); + int (*dom_rtattach)(void **, int); + /* initialize routing table */ int dom_rtoffset; /* an arg to rtattach, in bits */ int dom_maxrtkey; /* for routing layer */ int dom_protohdrlen; /* Let the protocol tell us */ int dom_refs; /* # socreates outstanding */ - u_long reserved[4]; +#ifdef _KERN_LOCKS_H_ + lck_mtx_t *dom_mtx; /* domain global mutex */ +#else + void *dom_mtx; /* domain global mutex */ +#endif + u_long dom_flags; + u_long reserved[2]; }; +#if __DARWIN_ALIGN_POWER +#pragma options align=reset +#endif + #ifdef KERNEL extern struct domain *domains; extern struct domain localdomain; + +__BEGIN_DECLS extern void net_add_domain(struct domain *dp); extern int net_del_domain(struct domain *); +__END_DECLS #define DOMAIN_SET(domain_set) -/* -#define DOMAIN_SET(name) \ - DATA_SET(domain_set, name ## domain) -*/ - -#endif -#endif /* __APPLE_API_UNSTABLE */ +#endif /* KERNEL */ +#endif /* PRIVATE */ #endif /* _SYS_DOMAIN_H_ */ diff --git a/bsd/sys/errno.h b/bsd/sys/errno.h index f108d3121..b19b04da4 100644 --- a/bsd/sys/errno.h +++ b/bsd/sys/errno.h @@ -66,7 +66,7 @@ #if !defined(KERNEL) && !defined(KERNEL_PRIVATE) #include <sys/cdefs.h> __BEGIN_DECLS -extern int * __error __P((void)); +extern int * __error(void); #define errno (*__error()) __END_DECLS #endif @@ -90,7 +90,7 @@ __END_DECLS #define ENOMEM 12 /* Cannot allocate memory */ #define EACCES 13 /* Permission denied */ #define EFAULT 14 /* Bad address */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define ENOTBLK 15 /* Block device required */ #endif #define EBUSY 16 /* Device busy */ @@ -103,9 +103,7 @@ __END_DECLS #define ENFILE 23 /* Too many open files in system */ #define EMFILE 24 /* Too many open files */ #define ENOTTY 25 /* Inappropriate ioctl for device */ -#ifndef _POSIX_SOURCE #define ETXTBSY 26 /* Text file busy */ -#endif #define EFBIG 27 /* File too large */ #define ENOSPC 28 /* No space left on device */ #define ESPIPE 29 /* Illegal seek */ @@ -119,7 +117,6 @@ __END_DECLS /* non-blocking and interrupt i/o */ #define EAGAIN 35 /* Resource temporarily unavailable */ -#ifndef _POSIX_SOURCE #define EWOULDBLOCK EAGAIN /* Operation would block */ #define EINPROGRESS 36 /* Operation now in progress */ #define EALREADY 37 /* Operation already in progress */ @@ -131,12 +128,25 @@ __END_DECLS #define EPROTOTYPE 41 /* Protocol wrong type for socket */ #define ENOPROTOOPT 42 /* Protocol not available */ #define EPROTONOSUPPORT 43 /* Protocol not supported */ +#ifndef _POSIX_C_SOURCE #define ESOCKTNOSUPPORT 44 /* Socket type not supported */ -#endif /* ! _POSIX_SOURCE */ -#define ENOTSUP 45 /* Operation not supported */ -#ifndef _POSIX_SOURCE -#define EOPNOTSUPP ENOTSUP /* Operation not supported */ +#endif /* ! 
_POSIX_C_SOURCE */
+#define	ENOTSUP		45		/* Operation not supported */
+#if !__DARWIN_UNIX03 && !defined(KERNEL)
+/*
+ * This is the same for binary and source compatibility, unless compiling
+ * the kernel itself, or compiling __DARWIN_UNIX03; if compiling for the
+ * kernel, the correct value will be returned.  If compiling non-POSIX
+ * source, the kernel return value will be converted by a stub in libc, and
+ * if compiling source with __DARWIN_UNIX03, the conversion in libc is not
+ * done, and the caller gets the expected (discrete) value.
+ */
+#define	EOPNOTSUPP	ENOTSUP		/* Operation not supported on socket */
+#endif /* !__DARWIN_UNIX03 && !KERNEL */
+
+#ifndef _POSIX_C_SOURCE
 #define	EPFNOSUPPORT	46		/* Protocol family not supported */
+#endif /* _POSIX_C_SOURCE */
 #define	EAFNOSUPPORT	47		/* Address family not supported by protocol family */
 #define	EADDRINUSE	48		/* Address already in use */
 #define	EADDRNOTAVAIL	49		/* Can't assign requested address */
@@ -150,73 +160,94 @@ __END_DECLS
 #define	ENOBUFS		55		/* No buffer space available */
 #define	EISCONN		56		/* Socket is already connected */
 #define	ENOTCONN	57		/* Socket is not connected */
+#ifndef _POSIX_C_SOURCE
 #define	ESHUTDOWN	58		/* Can't send after socket shutdown */
 #define	ETOOMANYREFS	59		/* Too many references: can't splice */
+#endif /* _POSIX_C_SOURCE */
 #define	ETIMEDOUT	60		/* Operation timed out */
 #define	ECONNREFUSED	61		/* Connection refused */
 #define	ELOOP		62		/* Too many levels of symbolic links */
-#endif /* _POSIX_SOURCE */
 #define	ENAMETOOLONG	63		/* File name too long */
 
 /* should be rearranged */
-#ifndef _POSIX_SOURCE
+#ifndef _POSIX_C_SOURCE
 #define	EHOSTDOWN	64		/* Host is down */
+#endif /* _POSIX_C_SOURCE */
 #define	EHOSTUNREACH	65		/* No route to host */
-#endif /* _POSIX_SOURCE */
 #define	ENOTEMPTY	66		/* Directory not empty */
 
 /* quotas & mush */
-#ifndef _POSIX_SOURCE
+#ifndef _POSIX_C_SOURCE
 #define	EPROCLIM	67		/* Too many processes */
 #define	EUSERS		68		/* Too many users */
+#endif /* _POSIX_C_SOURCE */
 #define	EDQUOT		69		/* Disc quota exceeded */
 
 /* Network File System */
 #define	ESTALE		70		/* Stale NFS file handle */
+#ifndef _POSIX_C_SOURCE
 #define	EREMOTE		71		/* Too many levels of remote in path */
 #define	EBADRPC		72		/* RPC struct is bad */
 #define	ERPCMISMATCH	73		/* RPC version wrong */
 #define	EPROGUNAVAIL	74		/* RPC prog. not avail */
 #define	EPROGMISMATCH	75		/* Program version wrong */
 #define	EPROCUNAVAIL	76		/* Bad procedure for program */
-#endif /* _POSIX_SOURCE */
+#endif /* _POSIX_C_SOURCE */
 
 #define	ENOLCK		77		/* No locks available */
 #define	ENOSYS		78		/* Function not implemented */
 
-#ifndef _POSIX_SOURCE
+#ifndef _POSIX_C_SOURCE
 #define	EFTYPE		79		/* Inappropriate file type or format */
 #define	EAUTH		80		/* Authentication error */
 #define	ENEEDAUTH	81		/* Need authenticator */
-#endif /* _POSIX_SOURCE */
 
 /* Intelligent device errors */
 #define	EPWROFF		82	/* Device power is off */
 #define	EDEVERR		83	/* Device error, e.g.
paper out */ +#endif /* _POSIX_C_SOURCE */ -#ifndef _POSIX_SOURCE #define EOVERFLOW 84 /* Value too large to be stored in data type */ /* Program loading errors */ +#ifndef _POSIX_C_SOURCE #define EBADEXEC 85 /* Bad executable */ #define EBADARCH 86 /* Bad CPU type in executable */ #define ESHLIBVERS 87 /* Shared library version mismatch */ #define EBADMACHO 88 /* Malformed Macho file */ +#endif /* _POSIX_C_SOURCE */ #define ECANCELED 89 /* Operation canceled */ #define EIDRM 90 /* Identifier removed */ #define ENOMSG 91 /* No message of desired type */ #define EILSEQ 92 /* Illegal byte sequence */ +#ifndef _POSIX_C_SOURCE #define ENOATTR 93 /* Attribute not found */ - -#define ELAST 93 /* Must be equal largest errno */ -#endif /* _POSIX_SOURCE */ +#endif /* _POSIX_C_SOURCE */ + +#define EBADMSG 94 /* Bad message */ +#define EMULTIHOP 95 /* Reserved */ +#define ENODATA 96 /* No message available on STREAM */ +#define ENOLINK 97 /* Reserved */ +#define ENOSR 98 /* No STREAM resources */ +#define ENOSTR 99 /* Not a STREAM */ +#define EPROTO 100 /* Protocol error */ +#define ETIME 101 /* STREAM ioctl timeout */ + +#if __DARWIN_UNIX03 || defined(KERNEL) +/* This value is only discrete when compiling __DARWIN_UNIX03, or KERNEL */ +#define EOPNOTSUPP 102 /* Operation not supported on socket */ +#endif /* __DARWIN_UNIX03 || KERNEL */ + +#ifndef _POSIX_C_SOURCE +#define ELAST 102 /* Must be equal largest errno */ +#endif /* _POSIX_C_SOURCE */ #ifdef KERNEL /* pseudo-errors returned inside kernel to modify return to process */ -#define ERESTART -1 /* restart syscall */ -#define EJUSTRETURN -2 /* don't modify regs, just return */ +#define ERESTART (-1) /* restart syscall */ +#define EJUSTRETURN (-2) /* don't modify regs, just return */ #endif #endif /* _SYS_ERRNO_H_ */ diff --git a/bsd/sys/ev.h b/bsd/sys/ev.h index 16757b77f..39c4aeb61 100644 --- a/bsd/sys/ev.h +++ b/bsd/sys/ev.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -24,9 +24,12 @@ #ifndef _SYS_EV_H_ #define _SYS_EV_H_ +#if !defined(__LP64__) + #include <sys/appleapiopts.h> #include <sys/queue.h> +#include <sys/cdefs.h> struct eventreq { int er_type; @@ -59,8 +62,7 @@ typedef struct eventreq *er_t; #define EV_TIMEOUT 0x20000 #define EV_DMASK 0xffffff00 -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef BSD_KERNEL_PRIVATE struct eventqelt { TAILQ_ENTRY(eventqelt) ee_slist; @@ -68,12 +70,13 @@ struct eventqelt { struct eventreq ee_req; struct proc * ee_proc; u_int ee_flags; -#define EV_QUEUED 1 +#define EV_QUEUED 0x01 u_int ee_eventmask; - struct socket *ee_sp; }; -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +int waitevent_close(struct proc *p, struct fileproc *); +#endif /* BSD_KERNEL_PRIVATE */ + +#endif /* !defined(__LP64__) */ #endif /* _SYS_EV_H_ */ diff --git a/bsd/sys/event.h b/bsd/sys/event.h index a01242ab1..9f8d6c00a 100644 --- a/bsd/sys/event.h +++ b/bsd/sys/event.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2003-2005 Apple Computer, Inc. All rights reserved. 
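
A small user-space check of the errno plumbing above: errno expands to (*__error()), so its address must agree with the per-thread cell that __error() returns. A sketch using only the declarations shown:

    #include <errno.h>
    #include <assert.h>
    #include <fcntl.h>

    int
    main(void)
    {
        /* errno is a macro over the per-thread __error() cell */
        assert(&errno == __error());

        (void)open("/nonexistent-path", O_RDONLY);
        assert(errno == ENOENT);    /* set through that same cell */
        return (0);
    }
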
 *
 * @APPLE_LICENSE_HEADER_START@
 *
@@ -50,6 +50,10 @@
 #ifndef _SYS_EVENT_H_
 #define _SYS_EVENT_H_
 
+#include <machine/types.h>
+#include <sys/cdefs.h>
+#include <stdint.h>
+
 #define EVFILT_READ		(-1)
 #define EVFILT_WRITE		(-2)
 #define EVFILT_AIO		(-3)	/* attached to aio requests */
@@ -61,16 +65,42 @@
 #define EVFILT_FS		(-9)	/* Filesystem events */
 #define EVFILT_SYSCOUNT	9
 
+#define EVFILT_THREADMARKER	EVFILT_SYSCOUNT /* Internal use only */
+
+#if __DARWIN_ALIGN_POWER
+#pragma options align=power
+#endif
 
 struct kevent {
 	uintptr_t	ident;		/* identifier for this event */
 	short		filter;		/* filter for event */
-	u_short		flags;
-	u_int		fflags;
-	intptr_t	data;
+	unsigned short	flags;		/* general flags */
+	unsigned int	fflags;		/* filter-specific flags */
+	intptr_t	data;		/* filter-specific data */
+#ifdef KERNEL_PRIVATE
+	user_addr_t	udata;		/* opaque user data identifier */
+#else
 	void		*udata;		/* opaque user data identifier */
+#endif
+};
+
+#ifdef KERNEL_PRIVATE
+
+struct user_kevent {
+	uint64_t	ident;		/* identifier for this event */
+	short		filter;		/* filter for event */
+	unsigned short	flags;		/* general flags */
+	unsigned int	fflags;		/* filter-specific flags */
+	int64_t		data;		/* filter-specific data */
+	user_addr_t	udata;		/* opaque user data identifier */
 };
+#endif
+
+#if __DARWIN_ALIGN_POWER
+#pragma options align=reset
+#endif
+
 #define EV_SET(kevp, a, b, c, d, e, f) do {	\
 	struct kevent *__kevp__ = (kevp);	\
 	__kevp__->ident = (a);			\
@@ -92,6 +122,7 @@ struct kevent {
 #define EV_CLEAR	0x0020		/* clear event state after reporting */
 
 #define EV_SYSFLAGS	0xF000		/* reserved by system */
+#define EV_FLAG0	0x1000		/* filter-specific flag */
 #define EV_FLAG1	0x2000		/* filter-specific flag */
 
 /* returned values */
@@ -99,23 +130,46 @@ struct kevent {
 #define EV_ERROR	0x4000		/* error, data contains errno */
 
 /*
- * data/hint flags for EVFILT_{READ|WRITE}, shared with userspace
+ * Filter-specific flags for EVFILT_READ
+ *
+ * The default behavior for EVFILT_READ is to make the "read" determination
+ * relative to the current file descriptor read pointer.  The EV_POLL
+ * flag indicates the determination should be made via poll(2) semantics
+ * (which always returns true for regular files - regardless of the amount
+ * of unread data in the file).
+ *
+ * On input, EV_OOBAND specifies that only OOB data should be looked for.
+ * The returned data count is the number of bytes beyond the current OOB marker.
+ *
+ * On output, EV_OOBAND indicates that OOB data is present.
+ * If it was not specified as an input parameter, then the data count is the
+ * number of bytes before the current OOB marker.  If at the marker, the
+ * data count indicates the number of bytes available after it.  In either
+ * case, it's the amount of data one could expect to receive next.
 */
-#define NOTE_LOWAT	0x0001		/* low water mark */
+#define EV_POLL		EV_FLAG0
+#define EV_OOBAND	EV_FLAG1
 
 /*
- * data/hint flags for EVFILT_VNODE, shared with userspace
+ * data/hint fflags for EVFILT_{READ|WRITE}, shared with userspace
+ *
+ * The default behavior for EVFILT_READ is to make the determination
+ * relative to the current file descriptor read pointer.
+ */
+#define NOTE_LOWAT	0x00000001	/* low water mark */
+/*
+ * data/hint fflags for EVFILT_VNODE, shared with userspace
 */
-#define	NOTE_DELETE	0x0001			/* vnode was removed */
-#define	NOTE_WRITE	0x0002			/* data contents changed */
-#define	NOTE_EXTEND	0x0004			/* size increased */
-#define	NOTE_ATTRIB	0x0008			/* attributes changed */
-#define	NOTE_LINK	0x0010			/* link count changed */
-#define	NOTE_RENAME	0x0020			/* vnode was renamed */
-#define	NOTE_REVOKE	0x0040			/* vnode access was revoked */
+#define	NOTE_DELETE	0x00000001		/* vnode was removed */
+#define	NOTE_WRITE	0x00000002		/* data contents changed */
+#define	NOTE_EXTEND	0x00000004		/* size increased */
+#define	NOTE_ATTRIB	0x00000008		/* attributes changed */
+#define	NOTE_LINK	0x00000010		/* link count changed */
+#define	NOTE_RENAME	0x00000020		/* vnode was renamed */
+#define	NOTE_REVOKE	0x00000040		/* vnode access was revoked */
 
 /*
- * data/hint flags for EVFILT_PROC, shared with userspace
+ * data/hint fflags for EVFILT_PROC, shared with userspace
 */
 #define	NOTE_EXIT	0x80000000		/* process exited */
 #define	NOTE_FORK	0x40000000		/* process forked */
@@ -123,14 +177,36 @@ struct kevent {
 #define	NOTE_PCTRLMASK	0xf0000000		/* mask for hint bits */
 #define	NOTE_PDATAMASK	0x000fffff		/* mask for pid */
 
+/*
+ * data/hint fflags for EVFILT_TIMER, shared with userspace.
+ * The default is a (repeating) interval timer with the data
+ * specifying the timeout interval in milliseconds.
+ *
+ * All timeouts are implicitly EV_CLEAR events.
+ */
+#define NOTE_SECONDS	0x00000001		/* data is seconds */
+#define NOTE_USECONDS	0x00000002		/* data is microseconds */
+#define NOTE_NSECONDS	0x00000004		/* data is nanoseconds */
+#define NOTE_ABSOLUTE	0x00000008		/* absolute timeout */
+						/* ... implicit EV_ONESHOT */
+
 /* additional flags for EVFILT_PROC */
 #define	NOTE_TRACK	0x00000001		/* follow across forks */
 #define	NOTE_TRACKERR	0x00000002		/* could not track child */
 #define	NOTE_CHILD	0x00000004		/* am a child process */
 
-#ifdef KERNEL_PRIVATE
+#ifndef KERNEL
+/* Temporary solution for BootX to use inode.h until kqueue moves to the vfs layer */
+#include <sys/queue.h>
+struct knote;
+SLIST_HEAD(klist, knote);
+#endif
+
+#ifdef KERNEL
+
+#ifdef KERNEL_PRIVATE
 #include <sys/queue.h>
 
 #ifdef MALLOC_DECLARE
@@ -143,32 +219,33 @@ MALLOC_DECLARE(M_KQUEUE);
 */
 #define NOTE_SIGNAL	0x08000000
 
+TAILQ_HEAD(kqtailq, knote);	/* a list of "queued" events */
+
 struct knote {
-	/* JMM - line these up with wait_queue_link */
-#if 0
-	struct wait_queue_link	kn_wql;		/* wait queue linkage */
-#else
+	int		kn_inuse;	/* inuse count */
+	struct kqtailq	*kn_tq;		/* pointer to tail queue */
+	TAILQ_ENTRY(knote)	kn_tqe;		/* linkage for tail queue */
+	struct kqueue	*kn_kq;		/* which kqueue we are on */
+	SLIST_ENTRY(knote)	kn_link;	/* linkage for search list */
 	SLIST_ENTRY(knote)	kn_selnext;	/* klist element chain */
-	void			*kn_type;	/* knote vs.
thread */ - struct klist *kn_list; /* pointer to list we are on */ - SLIST_ENTRY(knote) kn_link; /* members of kqueue */ - struct kqueue *kn_kq; /* which kqueue we are on */ -#endif - TAILQ_ENTRY(knote) kn_tqe; /* ...ready to process */ union { - struct file *p_fp; /* file data pointer */ + struct fileproc *p_fp; /* file data pointer */ struct proc *p_proc; /* proc pointer */ } kn_ptr; struct filterops *kn_fop; - int kn_status; + int kn_status; /* status bits */ int kn_sfflags; /* saved filter flags */ struct kevent kn_kevent; - intptr_t kn_sdata; /* saved data field */ caddr_t kn_hook; + int kn_hookid; + int64_t kn_sdata; /* saved data field */ + #define KN_ACTIVE 0x01 /* event has been triggered */ #define KN_QUEUED 0x02 /* event is on queue */ #define KN_DISABLED 0x04 /* event is disabled */ -#define KN_DETACHED 0x08 /* knote is detached */ +#define KN_DROPPING 0x08 /* knote is being dropped */ +#define KN_USEWAIT 0x10 /* wait for knote use */ +#define KN_DROPWAIT 0x20 /* wait for knote drop */ #define kn_id kn_kevent.ident #define kn_filter kn_kevent.filter @@ -180,9 +257,9 @@ struct knote { struct filterops { int f_isfd; /* true if ident == filedescriptor */ - int (*f_attach) __P((struct knote *kn)); - void (*f_detach) __P((struct knote *kn)); - int (*f_event) __P((struct knote *kn, long hint)); + int (*f_attach)(struct knote *kn); + void (*f_detach)(struct knote *kn); + int (*f_event)(struct knote *kn, long hint); }; struct proc; @@ -198,42 +275,33 @@ extern void klist_init(struct klist *list); extern void knote(struct klist *list, long hint); extern int knote_attach(struct klist *list, struct knote *kn); extern int knote_detach(struct klist *list, struct knote *kn); -extern void knote_remove(struct proc *p, struct klist *list); extern void knote_fdclose(struct proc *p, int fd); -extern int kqueue_register(struct kqueue *kq, - struct kevent *kev, struct proc *p); -#else /* !KERNEL_PRIVATE */ +#endif /* !KERNEL_PRIVATE */ + +#else /* KERNEL */ -/* - * This is currently visible to userland to work around broken - * programs which pull in <sys/proc.h> or <sys/select.h>. 
- */ -#include <sys/queue.h> -struct knote; -SLIST_HEAD(klist, knote); -#include <sys/cdefs.h> struct timespec; __BEGIN_DECLS -int kqueue __P((void)); -int kevent __P((int kq, const struct kevent *changelist, int nchanges, +int kqueue(void); +int kevent(int kq, const struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, - const struct timespec *timeout)); + const struct timespec *timeout); __END_DECLS -#include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE -#include <mach/mach.h> +#ifdef PRIVATE +#include <mach/port.h> __BEGIN_DECLS -mach_port_t kqueue_portset_np __P((int kq)); -int kqueue_from_portset_np __P((mach_port_t portset)); +mach_port_t kqueue_portset_np(int kq); +int kqueue_from_portset_np(mach_port_t portset); __END_DECLS -#endif /* __APPLE_API_PRIVATE */ +#endif /* PRIVATE */ + +#endif /* KERNEL */ -#endif /* !KERNEL_PRIVATE */ #endif /* !_SYS_EVENT_H_ */ diff --git a/bsd/sys/eventvar.h b/bsd/sys/eventvar.h index c259f7a8c..8206a7201 100644 --- a/bsd/sys/eventvar.h +++ b/bsd/sys/eventvar.h @@ -50,6 +50,7 @@ #ifndef _SYS_EVENTVAR_H_ #define _SYS_EVENTVAR_H_ +#include <sys/event.h> #include <sys/select.h> #include <kern/kern_types.h> @@ -57,19 +58,27 @@ #define KQEXTENT 256 /* linear growth by this amount */ struct kqueue { -#if 0 - /* threads, member notes, and notes for us in parent sets */ - struct wait_queue_set kq_wqs; -#else + decl_lck_spin_data( ,kq_lock) /* kqueue lock */ int kq_state; - int kq_lock; /* space for a lock */ - TAILQ_HEAD(kqlist, knote) kq_head; /* list of pending events */ - int kq_count; /* number of pending events */ -#endif - struct selinfo kq_sel; /* JMM - parent set at some point */ - struct filedesc *kq_fdp; + int kq_count; /* number of queued events */ + struct kqtailq kq_head; /* list of queued events */ + struct kqtailq kq_inprocess; /* list of in-process events */ + struct selinfo kq_sel; /* parent select/kqueue info */ + struct filedesc *kq_fdp; + #define KQ_SEL 0x01 #define KQ_SLEEP 0x02 +#define KQ_PROCWAIT 0x04 }; +extern struct kqueue *kqueue_alloc(struct proc *); +extern void kqueue_dealloc(struct kqueue *, struct proc *); + +typedef int (*kevent_callback_t)(struct kqueue *, struct kevent *, void *); +typedef void (*kevent_continue_t)(struct kqueue *, void *, int); + +extern int kevent_register(struct kqueue *, struct kevent *, struct proc *); +extern int kevent_scan(struct kqueue *, kevent_callback_t, kevent_continue_t, + void *, struct timeval *, struct proc *); + #endif /* !_SYS_EVENTVAR_H_ */ diff --git a/bsd/sys/exec.h b/bsd/sys/exec.h index 5d4fb571e..e3b9d6e5b 100644 --- a/bsd/sys/exec.h +++ b/bsd/sys/exec.h @@ -65,38 +65,13 @@ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_OBSOLETE /* - * The following structure is found at the top of the user stack of each - * user process. The ps program uses it to locate argv and environment - * strings. Programs that wish ps to display other information may modify - * it; normally ps_argvstr points to the text for argv[0], and ps_nargvstr - * is the same as the program's argc. The fields ps_envstr and ps_nenvstr - * are the equivalent for the environment. + * XXX at this point, this file only exists for backward compatability with + * XXX software which includes <sys/exec.h> instead of the more correct + * XXX <machine/exec.h> and/or need the inclusion of <sys/appleapiopts.h> + * XXX as a side effect. 
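
The kqueue()/kevent() prototypes above pair naturally with the EVFILT_TIMER fflags introduced earlier; a minimal user-space sketch of a repeating two-second timer (error handling abbreviated):

    #include <sys/event.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
        struct kevent change, event;
        int kq = kqueue();

        if (kq < 0)
            return (1);
        /* NOTE_SECONDS: interpret data (2) in seconds; timers auto-EV_CLEAR */
        EV_SET(&change, 1, EVFILT_TIMER, EV_ADD, NOTE_SECONDS, 2, NULL);
        if (kevent(kq, &change, 1, NULL, 0, NULL) < 0)
            return (1);
        for (;;) {
            if (kevent(kq, NULL, 0, &event, 1, NULL) < 1)
                break;
            /* data reports firings coalesced since the last fetch */
            printf("timer %lu fired (%ld)\n",
                (unsigned long)event.ident, (long)event.data);
        }
        close(kq);
        return (0);
    }
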
 */
-struct ps_strings {
-	char	*ps_argvstr;	/* first of 0 or more argument strings */
-	int	ps_nargvstr;	/* the number of argument strings */
-	char	*ps_envstr;	/* first of 0 or more environment strings */
-	int	ps_nenvstr;	/* the number of environment strings */
-};
-
-#endif /* __APPLE_API_OBSOLETE */
-
 #include <machine/exec.h>
 
-#ifdef KERNEL
-#ifdef __APPLE_API_PRIVATE
-/*
- * Arguments to the exec system call.
- */
-struct execve_args {
-	char	*fname;
-	char	**argp;
-	char	**envp;
-};
-#endif /*__APPLE_API_PRIVATE */
-#endif /* KERNEL */
-
 #endif /* !_SYS_EXEC_H_ */
diff --git a/bsd/sys/fcntl.h b/bsd/sys/fcntl.h
index 297c7791c..0519d9522 100644
--- a/bsd/sys/fcntl.h
+++ b/bsd/sys/fcntl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
@@ -69,9 +69,28 @@
 * described by POSIX for <fcntl.h>; it also includes
 * related kernel definitions.
 */
+#include <sys/_types.h>
+#include <sys/cdefs.h>
 
-#ifndef KERNEL
-#include <sys/types.h>
+/* We should not be exporting size_t here.  Temporary for gcc bootstrapping. */
+#ifndef _SIZE_T
+#define _SIZE_T
+typedef __darwin_size_t	size_t;
+#endif
+
+#ifndef	_MODE_T
+typedef	__darwin_mode_t	mode_t;
+#define _MODE_T
+#endif
+
+#ifndef _OFF_T
+typedef __darwin_off_t	off_t;
+#define _OFF_T
+#endif
+
+#ifndef _PID_T
+typedef __darwin_pid_t	pid_t;
+#define _PID_T
 #endif
 
 /*
@@ -95,19 +114,20 @@
 * FREAD and FWRITE are excluded from the #ifdef KERNEL so that TIOCFLUSH,
 * which was documented to use FREAD/FWRITE, continues to work.
 */
-#ifndef _POSIX_SOURCE
+#ifndef _POSIX_C_SOURCE
 #define	FREAD		0x0001
 #define	FWRITE		0x0002
 #endif
 #define	O_NONBLOCK	0x0004		/* no delay */
 #define	O_APPEND	0x0008		/* set append mode */
-#ifndef _POSIX_SOURCE
+#define	O_SYNC		0x0080		/* synchronous writes */
+#ifndef _POSIX_C_SOURCE
 #define	O_SHLOCK	0x0010		/* open with shared file lock */
 #define	O_EXLOCK	0x0020		/* open with exclusive file lock */
 #define	O_ASYNC		0x0040		/* signal pgrp when data ready */
-#define	O_FSYNC		0x0080		/* synchronous writes */
+#define	O_FSYNC		O_SYNC		/* source compatibility: do not use */
 #define	O_NOFOLLOW	0x0100		/* don't follow symlinks */
-#endif
+#endif /* _POSIX_C_SOURCE */
 #define	O_CREAT		0x0200		/* create if nonexistent */
 #define	O_TRUNC		0x0400		/* truncate to zero length */
 #define	O_EXCL		0x0800		/* error if already exists */
@@ -116,12 +136,17 @@
 #define	FDEFER		0x2000		/* defer for next gc pass */
 #define	FHASLOCK	0x4000		/* descriptor holds advisory lock */
 #endif
-#ifndef _POSIX_SOURCE
+#ifndef _POSIX_C_SOURCE
 #define	O_EVTONLY	0x8000		/* descriptor requested for event notifications only */
 #endif
 
+#ifdef KERNEL
+#define FWASWRITTEN	0x10000		/* descriptor was written */
+#endif
+
 /* defined by POSIX 1003.1; BSD default, so no bit required */
 #define	O_NOCTTY	0		/* don't assign controlling terminal */
+//#define O_SYNC		/* ??? POSIX: Write according to synchronized I/O file integrity completion */
 
 #ifdef KERNEL
 /* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */
@@ -139,7 +164,7 @@
 * and by fcntl.  We retain the F* names for the kernel f_flags field
 * and for backward compatibility for fcntl.
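
A quick illustration of the flag shuffle above: O_SYNC is now the primary name, O_FSYNC is only a source-compatibility alias, and the BSD lock flags still combine with O_NONBLOCK so a busy lock fails fast instead of blocking (conventionally with EWOULDBLOCK, i.e. EAGAIN). The path below is hypothetical:

    #include <fcntl.h>
    #include <errno.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* exclusive-lock the file at open; fail rather than block */
        int fd = open("/tmp/example.lock",
            O_CREAT | O_WRONLY | O_SYNC | O_EXLOCK | O_NONBLOCK, 0644);

        if (fd < 0 && errno == EAGAIN)
            fprintf(stderr, "already locked by someone else\n");
        return (fd < 0);
    }
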
*/ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define FAPPEND O_APPEND /* kernel/compat */ #define FASYNC O_ASYNC /* kernel/compat */ #define FFSYNC O_FSYNC /* kernel */ @@ -152,7 +177,7 @@ * Flags used for copyfile(2) */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define CPF_OVERWRITE 1 #define CPF_IGNORE_MODE 2 #define CPF_MASK (CPF_OVERWRITE|CPF_IGNORE_MODE) @@ -168,13 +193,12 @@ #define F_SETFD 2 /* set file descriptor flags */ #define F_GETFL 3 /* get file status flags */ #define F_SETFL 4 /* set file status flags */ -#ifndef _POSIX_SOURCE #define F_GETOWN 5 /* get SIGIO/SIGURG proc/pgrp */ #define F_SETOWN 6 /* set SIGIO/SIGURG proc/pgrp */ -#endif #define F_GETLK 7 /* get record locking information */ #define F_SETLK 8 /* set record locking information */ #define F_SETLKW 9 /* F_SETLK; wait if blocked */ +#ifndef _POSIX_C_SOURCE #define F_CHKCLEAN 41 /* Used for regression test */ #define F_PREALLOCATE 42 /* Preallocate storage */ #define F_SETSIZE 43 /* Truncate a file without zeroing space */ @@ -186,6 +210,13 @@ #define F_LOG2PHYS 49 /* file offset to device offset */ #define F_GETPATH 50 /* return the full path of the fd */ #define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */ +#define F_PATHPKG_CHECK 52 /* find which component (if any) is a package */ +#define F_FREEZE_FS 53 /* "freeze" all fs operations */ +#define F_THAW_FS 54 /* "thaw" all fs operations */ + +// FS-specific fcntl()'s numbers begin at 0x00010000 and go up +#define FCNTL_FS_SPECIFIC_BASE 0x00010000 +#endif /* _POSIX_C_SOURCE */ /* file descriptor flags (F_GETFD, F_SETFD) */ #define FD_CLOEXEC 1 /* close-on-exec flag */ @@ -200,6 +231,65 @@ #define F_POSIX 0x040 /* Use POSIX semantics for lock */ #endif +/* + * [XSI] The values used for l_whence shall be defined as described + * in <unistd.h> + */ +#ifndef SEEK_SET +#define SEEK_SET 0 /* set file offset to offset */ +#define SEEK_CUR 1 /* set file offset to current plus offset */ +#define SEEK_END 2 /* set file offset to EOF plus offset */ +#endif /* !SEEK_SET */ + +/* + * [XSI] The symbolic names for file modes for use as values of mode_t + * shall be defined as described in <sys/stat.h> + */ +#ifndef S_IFMT +/* File type */ +#define S_IFMT 0170000 /* [XSI] type of file mask */ +#define S_IFIFO 0010000 /* [XSI] named pipe (fifo) */ +#define S_IFCHR 0020000 /* [XSI] character special */ +#define S_IFDIR 0040000 /* [XSI] directory */ +#define S_IFBLK 0060000 /* [XSI] block special */ +#define S_IFREG 0100000 /* [XSI] regular */ +#define S_IFLNK 0120000 /* [XSI] symbolic link */ +#define S_IFSOCK 0140000 /* [XSI] socket */ +#ifndef _POSIX_C_SOURCE +#define S_IFWHT 0160000 /* whiteout */ +#define S_IFXATTR 0200000 /* extended attribute */ +#endif + +/* File mode */ +/* Read, write, execute/search by owner */ +#define S_IRWXU 0000700 /* [XSI] RWX mask for owner */ +#define S_IRUSR 0000400 /* [XSI] R for owner */ +#define S_IWUSR 0000200 /* [XSI] W for owner */ +#define S_IXUSR 0000100 /* [XSI] X for owner */ +/* Read, write, execute/search by group */ +#define S_IRWXG 0000070 /* [XSI] RWX mask for group */ +#define S_IRGRP 0000040 /* [XSI] R for group */ +#define S_IWGRP 0000020 /* [XSI] W for group */ +#define S_IXGRP 0000010 /* [XSI] X for group */ +/* Read, write, execute/search by others */ +#define S_IRWXO 0000007 /* [XSI] RWX mask for other */ +#define S_IROTH 0000004 /* [XSI] R for other */ +#define S_IWOTH 0000002 /* [XSI] W for other */ +#define S_IXOTH 0000001 /* [XSI] X for other */ + +#define S_ISUID 0004000 /* [XSI] set 
user id on execution */
+#define	S_ISGID		0002000		/* [XSI] set group id on execution */
+#define	S_ISVTX		0001000		/* [XSI] directory restricted delete */
+
+#ifndef _POSIX_C_SOURCE
+#define	S_ISTXT		S_ISVTX		/* sticky bit: not supported */
+#define	S_IREAD		S_IRUSR		/* backward compatibility */
+#define	S_IWRITE	S_IWUSR		/* backward compatibility */
+#define	S_IEXEC		S_IXUSR		/* backward compatibility */
+#endif
+#endif	/* !S_IFMT */
+
+#ifndef _POSIX_C_SOURCE
 
 /* allocate flags (F_PREALLOCATE) */
 
 #define	F_ALLOCATECONTIG	0x00000002	/* allocate contiguous space */
@@ -210,6 +300,7 @@
 #define	F_PEOFPOSMODE	3	/* Make it past all of the SEEK pos modes so that */
 				/* we can keep them in sync should we desire */
 #define	F_VOLPOSMODE	4	/* specify volume starting position */
+#endif /* _POSIX_C_SOURCE */
 
 /*
 * Advisory file segment locking data type -
@@ -224,6 +315,7 @@ struct flock {
 };
 
+#ifndef _POSIX_C_SOURCE
 /*
 * advisory file read data type -
 * information passed by user to system
@@ -234,18 +326,16 @@ struct radvisory {
 };
 
-#ifndef _POSIX_SOURCE
 /* lock operations for flock(2) */
 #define	LOCK_SH		0x01		/* shared file lock */
 #define	LOCK_EX		0x02		/* exclusive file lock */
 #define	LOCK_NB		0x04		/* don't block when locking */
 #define	LOCK_UN		0x08		/* unlock file */
-#endif
 
 /* fstore_t type used by F_DEALLOCATE and F_PREALLOCATE commands */
 
 typedef struct fstore {
-	u_int32_t fst_flags;	/* IN: flags word */
+	unsigned int fst_flags;	/* IN: flags word */
 	int 	fst_posmode;	/* IN: indicates use of offset field */
 	off_t	fst_offset;	/* IN: start of the region */
 	off_t	fst_length;	/* IN: size of the region */
@@ -256,10 +346,34 @@ typedef struct fstore {
 
 typedef struct fbootstraptransfer {
 	off_t fbt_offset;	/* IN: offset to start read/write */
-	size_t fbt_length;	/* IN: number of bytes to transfer */
+	size_t fbt_length;	/* IN: number of bytes to transfer */
 	void *fbt_buffer;	/* IN: buffer to be read/written */
 } fbootstraptransfer_t;
 
+
+// LP64todo - should this move?
+#ifdef KERNEL
+/* LP64 version of fbootstraptransfer.  all pointers
+ * grow when we're dealing with a 64-bit process.
+ * WARNING - keep in sync with fbootstraptransfer
+ */
+
+#if __DARWIN_ALIGN_NATURAL
+#pragma options align=natural
+#endif
+
+typedef struct user_fbootstraptransfer {
+	off_t fbt_offset;	/* IN: offset to start read/write */
+	user_size_t fbt_length;	/* IN: number of bytes to transfer */
+	user_addr_t fbt_buffer;	/* IN: buffer to be read/written */
+} user_fbootstraptransfer_t;
+
+#if __DARWIN_ALIGN_NATURAL
+#pragma options align=reset
+#endif
+
+#endif // KERNEL
+
 /*
 * For F_LOG2PHYS this information is passed back to user
 * Currently only devoffset is returned - that is the VOP_BMAP
@@ -276,27 +390,66 @@ typedef struct fbootstraptransfer {
 * and a per filesystem type flag will be needed to interpret the
 * contiguous bytes count result from CMAP.
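
Tying the fcntl(2) commands above together, a hedged user-space sketch: reserve space through the fstore_t just defined (falling back from contiguous allocation), then push the data through the drive cache with F_FULLFSYNC. F_ALLOCATEALL and the fst_bytesalloc output field belong to the same flag set but are not quoted above:

    #include <fcntl.h>
    #include <unistd.h>

    int
    main(void)
    {
        int fd = open("/tmp/prealloc.dat", O_CREAT | O_RDWR, 0644);
        fstore_t fst;

        if (fd < 0)
            return (1);
        fst.fst_flags = F_ALLOCATECONTIG;   /* try contiguous first */
        fst.fst_posmode = F_PEOFPOSMODE;    /* relative to end of file */
        fst.fst_offset = 0;
        fst.fst_length = 1024 * 1024;       /* ask for 1 MB */
        if (fcntl(fd, F_PREALLOCATE, &fst) < 0) {
            fst.fst_flags = F_ALLOCATEALL;  /* fall back: any layout */
            (void)fcntl(fd, F_PREALLOCATE, &fst);
        }
        /* fst.fst_bytesalloc now reports what was actually obtained */
        (void)fcntl(fd, F_FULLFSYNC);       /* flush through the drive cache */
        close(fd);
        return (0);
    }
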
*/ +#if __DARWIN_ALIGN_POWER +#pragma options align=power +#endif + struct log2phys { - u_int32_t l2p_flags; /* unused so far */ + unsigned int l2p_flags; /* unused so far */ off_t l2p_contigbytes; /* unused so far */ off_t l2p_devoffset; /* bytes into device */ }; -#ifndef _POSIX_SOURCE +#if __DARWIN_ALIGN_POWER +#pragma options align=reset +#endif + #define O_POPUP 0x80000000 /* force window to popup on open */ #define O_ALERT 0x20000000 /* small, clean popup window */ -#endif +#endif /* _POSIX_C_SOURCE */ #ifndef KERNEL -#include <sys/cdefs.h> + +#ifndef _POSIX_C_SOURCE +#ifndef _FILESEC_T +struct _filesec; +typedef struct _filesec *filesec_t; +#define _FILESEC_T +#endif +typedef enum { + FILESEC_OWNER = 1, + FILESEC_GROUP = 2, + FILESEC_UUID = 3, + FILESEC_MODE = 4, + FILESEC_ACL = 5, + FILESEC_GRPUUID = 6, + +/* XXX these are private to the implementation */ + FILESEC_ACL_RAW = 100, + FILESEC_ACL_ALLOCSIZE = 101 +} filesec_property_t; + +/* XXX backwards compatibility */ +#define FILESEC_GUID FILESEC_UUID +#endif /* _POSIX_C_SOURCE */ __BEGIN_DECLS -int open __P((const char *, int, ...)); -int creat __P((const char *, mode_t)); -int fcntl __P((int, int, ...)); -#ifndef _POSIX_SOURCE -int flock __P((int, int)); -#endif /* !_POSIX_SOURCE */ +int open(const char *, int, ...); +int creat(const char *, mode_t); +int fcntl(int, int, ...); +#ifndef _POSIX_C_SOURCE +int openx_np(const char *, int, filesec_t); +int flock(int, int); +filesec_t filesec_init(void); +filesec_t filesec_dup(filesec_t); +void filesec_free(filesec_t); +int filesec_get_property(filesec_t, filesec_property_t, void *); +int filesec_set_property(filesec_t, filesec_property_t, const void *); +int filesec_unset_property(filesec_t, filesec_property_t); +int filesec_query_property(filesec_t, filesec_property_t, int *); +#define _FILESEC_UNSET_PROPERTY ((void *)0) +#define _FILESEC_REMOVE_ACL ((void *)1) +#endif /* !_POSIX_C_SOURCE */ __END_DECLS #endif diff --git a/bsd/sys/file.h b/bsd/sys/file.h index 142529b92..710159af8 100644 --- a/bsd/sys/file.h +++ b/bsd/sys/file.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -59,141 +59,46 @@ #define _SYS_FILE_H_ #include <sys/appleapiopts.h> +#include <sys/types.h> #include <sys/fcntl.h> #include <sys/unistd.h> +#include <sys/queue.h> +#include <sys/cdefs.h> #ifdef KERNEL -#include <sys/errno.h> #include <sys/queue.h> -#include <sys/cdefs.h> +#include <sys/kernel_types.h> +#endif -struct proc; -struct uio; -struct knote; -#ifdef __APPLE_API_UNSTABLE +#if __DARWIN_ALIGN_POWER +#pragma options align=power +#endif -/* - * Kernel descriptor table. - * One entry for each open kernel vnode and socket. 
- */ -struct file { - LIST_ENTRY(file) f_list;/* list of active files */ +/* for the compat sake; */ +struct extern_file { + LIST_ENTRY(extern_file) f_list; /* list of active files */ short f_flag; /* see fcntl.h */ -#define DTYPE_VNODE 1 /* file */ -#define DTYPE_SOCKET 2 /* communications endpoint */ -#define DTYPE_PSXSHM 3 /* POSIX Shared memory */ -#define DTYPE_PSXSEM 4 /* POSIX Semaphores */ -#define DTYPE_KQUEUE 5 /* kqueue */ short f_type; /* descriptor type */ short f_count; /* reference count */ short f_msgcount; /* references from message queue */ struct ucred *f_cred; /* credentials associated with descriptor */ - struct fileops { - int (*fo_read) __P((struct file *fp, struct uio *uio, - struct ucred *cred, int flags, - struct proc *p)); - int (*fo_write) __P((struct file *fp, struct uio *uio, - struct ucred *cred, int flags, - struct proc *p)); -#define FOF_OFFSET 1 - int (*fo_ioctl) __P((struct file *fp, u_long com, - caddr_t data, struct proc *p)); - int (*fo_select) __P((struct file *fp, int which, - void *wql, struct proc *p)); - int (*fo_close) __P((struct file *fp, struct proc *p)); - int (*fo_kqfilter) __P((struct file *fp, struct knote *kn, - struct proc *p)); - } *f_ops; + void * f_ops; off_t f_offset; caddr_t f_data; /* vnode or socket or SHM or semaphore */ }; -#ifdef __APPLE_API_PRIVATE -LIST_HEAD(filelist, file); -extern struct filelist filehead; /* head of list of open files */ -extern int maxfiles; /* kernel limit on number of open files */ -extern int nfiles; /* actual number of open files */ -#endif /* __APPLE_API_PRIVATE */ +#if __DARWIN_ALIGN_POWER +#pragma options align=reset +#endif +#ifdef KERNEL __BEGIN_DECLS -int fref __P((struct file *)); /* take a reference on file pointer */ -int frele __P((struct file *)); /* release a reference on file pointer */ -int fcount __P((struct file *)); /* returns the reference count */ - -static __inline int fo_read __P((struct file *fp, struct uio *uio, - struct ucred *cred, int flags, struct proc *p)); -static __inline int fo_write __P((struct file *fp, struct uio *uio, - struct ucred *cred, int flags, struct proc *p)); -static __inline int fo_ioctl __P((struct file *fp, u_long com, caddr_t data, - struct proc *p)); -static __inline int fo_select __P((struct file *fp, int which, void *wql, - struct proc *p)); -static __inline int fo_close __P((struct file *fp, struct proc *p)); -static __inline int fo_kqfilter __P((struct file *fp, struct knote *kn, - struct proc *p)); - -static __inline int -fo_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct proc *p) -{ - int error; - - if ((error = fref(fp)) == -1) - return (EBADF); - error = (*fp->f_ops->fo_read)(fp, uio, cred, flags, p); - frele(fp); - return (error); -} - -static __inline int -fo_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct proc *p) -{ - int error; - - if ((error = fref(fp)) == -1) - return (EBADF); - error = (*fp->f_ops->fo_write)(fp, uio, cred, flags, p); - frele(fp); - return (error); -} - -static __inline int -fo_ioctl(struct file *fp, u_long com, caddr_t data, struct proc *p) -{ - int error; - - if ((error = fref(fp)) == -1) - return (EBADF); - error = (*fp->f_ops->fo_ioctl)(fp, com, data, p); - frele(fp); - return (error); -} - -static __inline int -fo_select(struct file *fp, int which, void *wql, struct proc *p) -{ - int error; - - error = (*fp->f_ops->fo_select)(fp, which, wql, p); - return (error); -} - -static __inline int -fo_close(struct file *fp, struct proc *p) -{ - - return 
((*fp->f_ops->fo_close)(fp, p)); -} - -static __inline int -fo_kqfilter(struct file *fp, struct knote *kn, struct proc *p) -{ - return ((*fp->f_ops->fo_kqfilter)(fp, kn, p)); -} - +int file_socket(int, socket_t *); +int file_vnode(int, vnode_t *); +int file_flags(int, int *); +int file_drop(int); __END_DECLS -#endif /* __APPLE_API_UNSTABLE */ - #endif /* KERNEL */ #endif /* !_SYS_FILE_H_ */ diff --git a/bsd/sys/file_internal.h b/bsd/sys/file_internal.h new file mode 100644 index 000000000..76dd19505 --- /dev/null +++ b/bsd/sys/file_internal.h @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* Copyright (c) 1995, 1997 Apple Computer, Inc. All Rights Reserved */ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
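
The file_*() accessors above are the new externally supported route from a file descriptor to its backing object; a kernel-side sketch, assuming (per the declaration pairing) that a successful file_vnode() takes a reference which file_drop() releases:

    #include <sys/file.h>
    #include <sys/kernel_types.h>

    /* hypothetical helper: report whether an fd refers to a vnode */
    static int
    fd_is_vnode(int fd)
    {
        vnode_t vp;

        if (file_vnode(fd, &vp) != 0)
            return (0);     /* not a vnode-backed descriptor */
        /* ... safe to use vp here ... */
        file_drop(fd);      /* release the reference we took */
        return (1);
    }
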
+ * + * @(#)file.h 8.3 (Berkeley) 1/9/95 + */ + +#ifndef _SYS_FILE_INTERNAL_H_ +#define _SYS_FILE_INTERNAL_H_ + +#include <sys/appleapiopts.h> +#include <sys/fcntl.h> +#include <sys/unistd.h> + +#ifdef KERNEL +#include <sys/errno.h> +#include <sys/queue.h> +#include <sys/cdefs.h> +#include <sys/lock.h> +#include <sys/file.h> + +struct proc; +struct uio; +struct knote; +#ifdef __APPLE_API_UNSTABLE + +struct file; + + +/* + * Kernel descriptor table. + * One entry for each open kernel vnode and socket. + */ +struct fileproc { + int32_t f_flags; + int32_t f_iocount; + struct fileglob * f_fglob; + void * f_waddr; +}; + +#define FILEPROC_NULL (struct fileproc *)0 + +#define FP_INCREATE 0x0001 +#define FP_INCLOSE 0x0002 +#define FP_INSELECT 0x0004 +#define FP_INCHRREAD 0x0008 +#define FP_WRITTEN 0x0010 +#define FP_CLOSING 0x0020 +#define FP_WAITCLOSE 0x0040 +#define FP_AIOISSUED 0x0080 +#define FP_WAITEVENT 0x0100 + + +/* defns of close_internal */ +#define CLOSEINT_LOCKED 1 +#define CLOSEINT_WAITONCLOSE 2 +#define CLOSEINT_NOFDRELSE 4 +#define CLOSEINT_NOFDNOREF 8 + +struct fileglob { + LIST_ENTRY(fileglob) f_list;/* list of active files */ + LIST_ENTRY(fileglob) f_msglist;/* list of active files */ + int32_t fg_flag; /* see fcntl.h */ + int32_t fg_type; /* descriptor type */ + int32_t fg_count; /* reference count */ + int32_t fg_msgcount; /* references from message queue */ + struct ucred *fg_cred; /* credentials associated with descriptor */ + struct fileops { + int (*fo_read) __P((struct fileproc *fp, struct uio *uio, + struct ucred *cred, int flags, + struct proc *p)); + int (*fo_write) __P((struct fileproc *fp, struct uio *uio, + struct ucred *cred, int flags, + struct proc *p)); +#define FOF_OFFSET 1 + int (*fo_ioctl) __P((struct fileproc *fp, u_long com, + caddr_t data, struct proc *p)); + int (*fo_select) __P((struct fileproc *fp, int which, + void *wql, struct proc *p)); + int (*fo_close) __P((struct fileglob *fg, struct proc *p)); + int (*fo_kqfilter) __P((struct fileproc *fp, struct knote *kn, + struct proc *p)); + int (*fo_drain) (struct fileproc *fp, struct proc *p); + } *fg_ops; + off_t fg_offset; + caddr_t fg_data; /* vnode or socket or SHM or semaphore */ + lck_mtx_t fg_lock; + int32_t fg_lflags; /* file global flags */ + unsigned int fg_lockpc[4]; + unsigned int fg_unlockpc[4]; +}; + +/* file types */ +#define DTYPE_VNODE 1 /* file */ +#define DTYPE_SOCKET 2 /* communications endpoint */ +#define DTYPE_PSXSHM 3 /* POSIX Shared memory */ +#define DTYPE_PSXSEM 4 /* POSIX Semaphores */ +#define DTYPE_KQUEUE 5 /* kqueue */ +#define DTYPE_PIPE 6 /* pipe */ +#define DTYPE_FSEVENTS 7 /* fsevents */ + +/* defines for fg_lflags */ +#define FG_TERM 0x01 /* the fileglob is terminating .. */ +#define FG_INSMSGQ 0x02 /* insert to msgqueue pending .. 
 */
+#define FG_WINSMSGQ	0x04	/* wait until the fileglob is in the msgqueue */
+#define FG_RMMSGQ	0x08	/* the fileglob is being removed from msgqueue */
+#define FG_WRMMSGQ	0x10	/* wait for the fileglob to be removed from msgqueue */
+
+
+#ifdef __APPLE_API_PRIVATE
+LIST_HEAD(filelist, fileglob);
+LIST_HEAD(fmsglist, fileglob);
+extern struct filelist filehead;	/* head of list of open files */
+extern struct fmsglist fmsghead;	/* head of list of open files */
+extern int maxfiles;			/* kernel limit on number of open files */
+extern int nfiles;			/* actual number of open files */
+#endif /* __APPLE_API_PRIVATE */
+
+
+__BEGIN_DECLS
+int fo_read(struct fileproc *fp, struct uio *uio,
+	    struct ucred *cred, int flags, struct proc *p);
+int fo_write(struct fileproc *fp, struct uio *uio,
+	     struct ucred *cred, int flags, struct proc *p);
+int fo_ioctl(struct fileproc *fp, u_long com, caddr_t data,
+	     struct proc *p);
+int fo_select(struct fileproc *fp, int which, void *wql,
+	      struct proc *p);
+int fo_close(struct fileglob *fg, struct proc *p);
+int fo_kqfilter(struct fileproc *fp, struct knote *kn,
+		struct proc *p);
+void fileproc_drain(proc_t, struct fileproc *);
+void fp_setflags(proc_t, struct fileproc *, int);
+void fp_clearflags(proc_t, struct fileproc *, int);
+int fp_drop(struct proc *p, int fd, struct fileproc *fp, int locked);
+int fp_drop_written(proc_t p, int fd, struct fileproc *fp);
+int fp_drop_event(proc_t p, int fd, struct fileproc *fp);
+int fp_free(struct proc * p, int fd, struct fileproc * fp);
+struct kqueue;
+int fp_getfkq(struct proc *p, int fd, struct fileproc **resultfp, struct kqueue **resultkq);
+struct psemnode;
+int fp_getfpsem(struct proc *p, int fd, struct fileproc **resultfp, struct psemnode **resultpsem);
+struct vnode;
+int fp_getfvp(struct proc *p, int fd, struct fileproc **resultfp, struct vnode **resultvp);
+struct socket;
+int fp_getfsock(struct proc *p, int fd, struct fileproc **resultfp, struct socket **results);
+int fp_lookup(struct proc *p, int fd, struct fileproc **resultfp, int locked);
+int close_internal(struct proc *p, int fd, struct fileproc *fp, int flags);
+int closef_locked(struct fileproc *fp, struct fileglob *fg, struct proc *p);
+void fg_insertuipc(struct fileglob * fg);
+void fg_removeuipc(struct fileglob * fg);
+__END_DECLS
+
+#endif /* __APPLE_API_UNSTABLE */
+
+#endif /* KERNEL */
+
+#endif /* !_SYS_FILE_INTERNAL_H_ */
diff --git a/bsd/sys/filedesc.h b/bsd/sys/filedesc.h
index f1f2fb4fd..5212cc45c 100644
--- a/bsd/sys/filedesc.h
+++ b/bsd/sys/filedesc.h
@@ -81,7 +81,7 @@ struct klist;
 
 struct filedesc {
-	struct	file **fd_ofiles;	/* file structures for open files */
+	struct	fileproc **fd_ofiles;	/* file structures for open files */
 	char	*fd_ofileflags;		/* per-process open file flags */
 	struct	vnode *fd_cdir;		/* current directory */
 	struct	vnode *fd_rdir;		/* root directory */
@@ -95,14 +95,22 @@ struct filedesc {
 	struct	klist *fd_knlist;	/* list of attached knotes */
 	u_long	fd_knhashmask;		/* size of knhash */
 	struct	klist *fd_knhash;	/* hash table for attached knotes */
+	int	fd_flags;
 };
 
+/*
+ * definitions for fd_flags;
+ */
+#define	FD_CHROOT	0x01	/* process was chrooted... keep track even */
+				/* if we're force unmounted and unable to */
+				/* take a vnode_ref on fd_rdir during a fork */
+
 /*
 * Per-process open flags.
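
Inside BSD proper, the fileproc prototypes above follow a lookup/drop discipline; a sketch of the presumed pattern, with field use inferred from the fileproc/fileglob definitions earlier:

    #include <sys/file_internal.h>
    #include <sys/proc.h>

    /* hypothetical: fetch the fg_data of a socket-type descriptor */
    static caddr_t
    fd_socket_data(struct proc *p, int fd)
    {
        struct fileproc *fp;
        caddr_t data = NULL;

        if (fp_lookup(p, fd, &fp, 0) != 0)      /* takes an f_iocount ref */
            return (NULL);
        if (fp->f_fglob->fg_type == DTYPE_SOCKET)
            data = fp->f_fglob->fg_data;
        fp_drop(p, fd, fp, 0);                  /* balance the lookup */
        return (data);
    }
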
*/ #define UF_EXCLOSE 0x01 /* auto-close on exec */ -#define UF_MAPPED 0x02 /* mapped from device */ #define UF_RESERVED 0x04 /* open pending / in progress */ +#define UF_CLOSING 0x08 /* close in progress */ /* * Storage required per open file descriptor. @@ -113,24 +121,23 @@ struct filedesc { /* * Kernel global variables and routines. */ -extern int dupfdopen __P((struct filedesc *fdp, - int indx, int dfd, int mode, int error)); -extern int fdalloc __P((struct proc *p, int want, int *result)); -extern void fdrelse __P((struct proc *p, int fd)); -extern int fdavail __P((struct proc *p, int n)); -extern int fdgetf __P((struct proc *p, int fd, struct file **resultfp)); +extern int dupfdopen(struct filedesc *fdp, + int indx, int dfd, int mode, int error); +extern int fdalloc(struct proc *p, int want, int *result); +extern void fdrelse(struct proc *p, int fd); +extern int fdavail(struct proc *p, int n); #define fdfile(p, fd) \ (&(p)->p_fd->fd_ofiles[(fd)]) #define fdflags(p, fd) \ (&(p)->p_fd->fd_ofileflags[(fd)]) -extern int falloc __P((struct proc *p, - struct file **resultfp, int *resultfd)); -extern void ffree __P((struct file *fp)); +extern int falloc(struct proc *p, + struct fileproc **resultfp, int *resultfd); +extern void ffree(struct file *fp); #ifdef __APPLE_API_PRIVATE -extern struct filedesc *fdcopy __P((struct proc *p)); -extern void fdfree __P((struct proc *p)); -extern void fdexec __P((struct proc *p)); +extern struct filedesc *fdcopy(struct proc *p); +extern void fdfree(struct proc *p); +extern void fdexec(struct proc *p); #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ diff --git a/bsd/sys/fsctl.h b/bsd/sys/fsctl.h index 63198c7ec..0d2033f06 100644 --- a/bsd/sys/fsctl.h +++ b/bsd/sys/fsctl.h @@ -65,15 +65,12 @@ #include <sys/ioccom.h> -/* get size of mount info struct: */ -#define FSGETMOUNTINFOSIZE _IOR('m', 1, long) - #ifndef KERNEL #include <sys/cdefs.h> __BEGIN_DECLS -int fsctl __P((const char *, unsigned long, void *, unsigned long)); +int fsctl(const char *, unsigned long, void *, unsigned long); __END_DECLS #endif /* !KERNEL */ diff --git a/bsd/sys/fsevents.h b/bsd/sys/fsevents.h new file mode 100644 index 000000000..91b9f5958 --- /dev/null +++ b/bsd/sys/fsevents.h @@ -0,0 +1,88 @@ +#ifndef FSEVENT_H +#define FSEVENT_H 1 + +// Event types that you can ask to listen for +#define FSE_INVALID -1 +#define FSE_CREATE_FILE 0 +#define FSE_DELETE 1 +#define FSE_STAT_CHANGED 2 +#define FSE_RENAME 3 +#define FSE_CONTENT_MODIFIED 4 +#define FSE_EXCHANGE 5 +#define FSE_FINDER_INFO_CHANGED 6 +#define FSE_CREATE_DIR 7 +#define FSE_CHOWN 8 + +#define FSE_MAX_EVENTS 9 +#define FSE_ALL_EVENTS 998 + +#define FSE_EVENTS_DROPPED 999 + +// Actions for each event type +#define FSE_IGNORE 0 +#define FSE_REPORT 1 +#define FSE_ASK 2 // Not implemented yet + +// The types of each of the arguments for an event +// Each type is followed by the size and then the +// data. 
FSE_ARG_VNODE is just a path string +#define FSE_ARG_VNODE 0x0001 // next arg is a vnode pointer +#define FSE_ARG_STRING 0x0002 // next arg is length followed by string ptr +#define FSE_ARG_PATH 0x0003 // next arg is a full path +#define FSE_ARG_INT32 0x0004 // next arg is a 32-bit int +#define FSE_ARG_INT64 0x0005 // next arg is a 64-bit int +#define FSE_ARG_RAW 0x0006 // next arg is a length followed by a void ptr +#define FSE_ARG_INO 0x0007 // next arg is the inode number (ino_t) +#define FSE_ARG_UID 0x0008 // next arg is the file's uid (uid_t) +#define FSE_ARG_DEV 0x0009 // next arg is the file's dev_t +#define FSE_ARG_MODE 0x000a // next arg is the file's mode (as an int32, file type only) +#define FSE_ARG_GID 0x000b // next arg is the file's gid (gid_t) +#define FSE_ARG_FINFO 0x000c // kernel internal only +#define FSE_ARG_DONE 0xb33f // no more arguments + +#define FSE_MAX_ARGS 12 + + +// ioctl's on /dev/fsevents +typedef struct fsevent_clone_args { + int8_t *event_list; + int32_t num_events; + int32_t event_queue_depth; + int32_t *fd; +} fsevent_clone_args; + +#define FSEVENTS_CLONE _IOW('s', 1, fsevent_clone_args) + + +// ioctl's on the cloned fd +typedef struct fsevent_dev_filter_args { + uint32_t num_devices; + dev_t *devices; +} fsevent_dev_filter_args; + +#define FSEVENTS_DEVICE_FILTER _IOW('s', 100, fsevent_dev_filter_args) + + +#ifdef KERNEL + +int need_fsevent(int type, vnode_t vp); +int add_fsevent(int type, vfs_context_t, ...); +void fsevent_unmount(struct mount *mp); + +// misc utility functions for fsevent info and pathbuffers... +typedef struct fse_info { + dev_t dev; + ino_t ino; + int32_t mode; // note: this is not a mode_t (it's 32-bits, not 16) + uid_t uid; + gid_t gid; +} fse_info; + +int get_fse_info(struct vnode *vp, fse_info *fse, vfs_context_t ctx); + +char *get_pathbuff(void); +void release_pathbuff(char *path); + +#endif /* KERNEL */ + +#endif /* FSEVENT_H */ diff --git a/bsd/net/if_slvar.h b/bsd/sys/imgact.h similarity index 54% rename from bsd/net/if_slvar.h rename to bsd/sys/imgact.h index cb3132ef1..7a6920171 100644 --- a/bsd/net/if_slvar.h +++ b/bsd/sys/imgact.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -19,9 +19,9 @@ * * @APPLE_LICENSE_HEADER_END@ */ -/*- - * Copyright (c) 1991, 1993 - * The Regents of the University of California. All rights reserved. +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -39,7 +39,7 @@ * may be used to endorse or promote products derived from this software * without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE @@ -50,66 +50,50 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
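
For the /dev/fsevents clone interface defined above, a heavily hedged user-space sketch; this is a private interface, and the device path and read loop are assumptions built only from the structures shown:

    #include <sys/fsevents.h>
    #include <sys/ioctl.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    int
    main(void)
    {
        int8_t events[FSE_MAX_EVENTS];
        int32_t clone_fd = -1;
        fsevent_clone_args clone_args;
        char buf[4096];
        int dev_fd;

        dev_fd = open("/dev/fsevents", O_RDONLY);
        if (dev_fd < 0)
            return (1);
        memset(events, FSE_REPORT, sizeof(events)); /* report everything */
        clone_args.event_list = events;
        clone_args.num_events = FSE_MAX_EVENTS;
        clone_args.event_queue_depth = 64;
        clone_args.fd = &clone_fd;                  /* out: cloned fd */
        if (ioctl(dev_fd, FSEVENTS_CLONE, &clone_args) < 0)
            return (1);
        /* events stream from the cloned fd as FSE_ARG_* records */
        (void)read(clone_fd, buf, sizeof(buf));
        close(clone_fd);
        close(dev_fd);
        return (0);
    }
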
- * - * @(#)if_slvar.h 8.3 (Berkeley) 2/1/94 - * */ +#ifndef _SYS_IMGACT_H_ +#define _SYS_IMGACT_H_ -#ifndef _NET_IF_SLVAR_H_ -#define _NET_IF_SLVAR_H_ -#include <sys/appleapiopts.h> +#define IMG_SHSIZE 512 /* largest shell interpreter, in bytes */ -#ifndef DONT_WARN_OBSOLETE -#warning if_slvar.h is not used by the darwin kernel -#endif +struct proc; +struct nameidata; -#include <sys/callout.h> -#include <sys/mbuf.h> -#include <net/slcompress.h> +struct image_params { + user_addr_t ip_user_fname; /* argument */ + user_addr_t ip_user_argv; /* argument */ + user_addr_t ip_user_envv; /* argument */ + struct vnode *ip_vp; /* file */ + struct vnode_attr *ip_vattr; /* run file attributes */ + struct vnode_attr *ip_origvattr; /* invocation file attributes */ + char *ip_vdata; /* file data (up to one page) */ + int ip_flags; /* image flags */ + int ip_argc; /* argument count */ + char *ip_argv; /* argument vector beginning */ + int ip_envc; /* environment count */ + char *ip_strings; /* base address for strings */ + char *ip_strendp; /* current end pointer */ + char *ip_strendargvp; /* end of argv/start of envp */ + int ip_strspace; /* remaining space */ + user_size_t ip_arch_offset; /* subfile offset in ip_vp */ + user_size_t ip_arch_size; /* subfile length in ip_vp */ + char ip_interp_name[IMG_SHSIZE]; /* interpreter name */ -#ifdef __APPLE_API_PRIVATE + /* Next two fields are for support of Classic... */ + char *ip_p_comm; /* optional alt p->p_comm */ + char *ip_tws_cache_name; /* task working set cache */ + struct vfs_context *ip_vfs_context; /* VFS context */ + struct nameidata *ip_ndp; /* current nameidata */ + thread_t ip_vfork_thread; /* thread created, if vfork */ +}; /* - * Definitions for SLIP interface data structures - * - * (This exists so programs like slstats can get at the definition - * of sl_softc.) + * Image flags */ -struct sl_softc { - struct ifnet sc_if; /* network-visible interface */ - struct ifqueue sc_fastq; /* interactive output queue */ - struct tty *sc_ttyp; /* pointer to tty structure */ - u_char *sc_mp; /* pointer to next available buf char */ - u_char *sc_ep; /* pointer to last available buf char */ - u_char *sc_buf; /* input buffer */ - u_int sc_flags; /* see below */ - u_int sc_escape; /* =1 if last char input was FRAME_ESCAPE */ - long sc_lasttime; /* last time a char arrived */ - long sc_abortcount; /* number of abort escape chars */ - long sc_starttime; /* time of first abort in window */ - u_int sc_keepalive; /* time to decide link hang */ - u_int sc_outfill; /* time to send FRAME_END when output idle */ - /* - * Handles for scheduling outfill and - * keepalive timeouts. 
- */ -#if FB3x - struct callout_handle sc_ofhandle; - struct callout_handle sc_kahandle; -#endif - struct slcompress sc_comp; /* tcp compression data */ -}; - -/* internal flags */ -#define SC_ERROR 0x0001 /* had an input error */ -#define SC_OUTWAIT 0x0002 /* waiting for output fill */ -#define SC_KEEPALIVE 0x0004 /* input keepalive */ -#define SC_STATIC 0x0008 /* it is static unit */ - -/* visible flags */ -#define SC_COMPRESS IFF_LINK0 /* compress TCP traffic */ -#define SC_NOICMP IFF_LINK1 /* suppress ICMP traffic */ -#define SC_AUTOCOMP IFF_LINK2 /* auto-enable TCP compression */ +#define IMGPF_NONE 0x00000000 /* No flags */ +#define IMGPF_INTERPRET 0x00000001 /* Interpreter invoked */ +#define IMGPF_RESERVED1 0x00000002 /* reserved */ +#define IMGPF_WAS_64BIT 0x00000004 /* exec from a 64Bit binary */ +#define IMGPF_IS_64BIT 0x00000008 /* exec to a 64Bit binary */ -#endif /* __APPLE_API_PRIVATE */ -#endif +#endif /* !_SYS_IMGACT */ diff --git a/bsd/sys/ioctl.h b/bsd/sys/ioctl.h index a9dfad6b3..006b04d44 100644 --- a/bsd/sys/ioctl.h +++ b/bsd/sys/ioctl.h @@ -89,7 +89,7 @@ struct ttysize { #include <sys/cdefs.h> __BEGIN_DECLS -int ioctl __P((int, unsigned long, ...)); +int ioctl(int, unsigned long, ...); __END_DECLS #endif /* !KERNEL */ #endif /* !_SYS_IOCTL_H_ */ @@ -99,9 +99,9 @@ __END_DECLS * Compatability with old terminal driver * * Source level -> #define USE_OLD_TTY - * Kernel level -> options COMPAT_43 or COMPAT_SUNOS + * Kernel level -> options COMPAT_SUNOS */ #if defined(USE_OLD_TTY) || COMPAT_43 || defined(COMPAT_SUNOS) || \ - defined(COMPAT_SVR4) || defined(COMPAT_NEXT_3X) + defined(COMPAT_SVR4) || defined(COMPAT_NEXT_3X) || COMPAT_43_TTY #include <sys/ioctl_compat.h> #endif /* !_SYS_IOCTL_H_ */ diff --git a/bsd/sys/ioctl_compat.h b/bsd/sys/ioctl_compat.h index 9ff3a2bc1..11d837db0 100644 --- a/bsd/sys/ioctl_compat.h +++ b/bsd/sys/ioctl_compat.h @@ -123,26 +123,33 @@ struct sgttyb { #define EVENP 0x00000080 /* get/send even parity */ #define ANYP 0x000000c0 /* get any parity/send none */ #define NLDELAY 0x00000300 /* \n delay */ +#define TBDELAY 0x00000c00 /* horizontal tab delay */ +#define XTABS 0x00000c00 /* expand tabs on output */ +#define CRDELAY 0x00003000 /* \r delay */ +#define VTDELAY 0x00004000 /* vertical tab delay */ +#define BSDELAY 0x00008000 /* \b delay */ +#ifndef _SYS_TERMIOS_H_ +/* + * These manifest constants have the same names as those in <sys/termios.h>, + * so you are not permitted to have both definitions in scope simultaneously + * in the same compilation unit. + */ #define NL0 0x00000000 #define NL1 0x00000100 /* tty 37 */ #define NL2 0x00000200 /* vt05 */ #define NL3 0x00000300 -#define TBDELAY 0x00000c00 /* horizontal tab delay */ #define TAB0 0x00000000 #define TAB1 0x00000400 /* tty 37 */ #define TAB2 0x00000800 -#define XTABS 0x00000c00 /* expand tabs on output */ -#define CRDELAY 0x00003000 /* \r delay */ #define CR0 0x00000000 #define CR1 0x00001000 /* tn 300 */ #define CR2 0x00002000 /* tty 37 */ #define CR3 0x00003000 /* concept 100 */ -#define VTDELAY 0x00004000 /* vertical tab delay */ #define FF0 0x00000000 #define FF1 0x00004000 /* tty 37 */ -#define BSDELAY 0x00008000 /* \b delay */ #define BS0 0x00000000 #define BS1 0x00008000 +#endif /* !_SYS_TERMIOS_H_ */ #define ALLDELAY (NLDELAY|TBDELAY|CRDELAY|VTDELAY|BSDELAY) #define CRTBS 0x00010000 /* do backspacing for crt */ #define PRTERA 0x00020000 /* \ ... 
/ erase */ diff --git a/bsd/sys/ipc.h b/bsd/sys/ipc.h index 5c642955e..13479cf6a 100644 --- a/bsd/sys/ipc.h +++ b/bsd/sys/ipc.h @@ -64,42 +64,116 @@ * @(#)ipc.h 8.4 (Berkeley) 2/19/95 */ -#include <sys/appleapiopts.h> - /* * SVID compatible ipc.h file */ #ifndef _SYS_IPC_H_ #define _SYS_IPC_H_ -struct ipc_perm { - ushort cuid; /* creator user id */ - ushort cgid; /* creator group id */ - ushort uid; /* user id */ - ushort gid; /* group id */ - ushort mode; /* r/w permission */ - ushort seq; /* sequence # (to generate unique msg/sem/shm id) */ - key_t key; /* user specified msg/sem/shm key */ +#include <sys/appleapiopts.h> +#include <sys/cdefs.h> + +#include <sys/_types.h> + +/* + * [XSI] The uid_t, gid_t, mode_t, and key_t types SHALL be defined as + * described in <sys/types.h>. + */ +#ifndef _UID_T +typedef __darwin_uid_t uid_t; /* user id */ +#define _UID_T +#endif + +#ifndef _GID_T +typedef __darwin_gid_t gid_t; +#define _GID_T +#endif + +#ifndef _MODE_T +typedef __darwin_mode_t mode_t; +#define _MODE_T +#endif + +#ifndef _KEY_T +#define _KEY_T +typedef __int32_t key_t; +#endif + +/* + * Technically, we should force all code references to the new structure + * definition, not in just the standards conformance case, and leave the + * legacy interface there for binary compatibility only. Currently, we + * are only forcing this for programs requesting standards conformance. + */ +#if defined(__POSIX_C_SOURCE) || defined(kernel) || defined(__LP64__) +/* + * [XSI] Information used in determining permission to perform an IPC + * operation + */ +struct __ipc_perm_new { + uid_t uid; /* [XSI] Owner's user ID */ + gid_t gid; /* [XSI] Owner's group ID */ + uid_t cuid; /* [XSI] Creator's user ID */ + gid_t cgid; /* [XSI] Creator's group ID */ + mode_t mode; /* [XSI] Read/write permission */ + unsigned short _seq; /* Reserved for internal use */ + key_t _key; /* Reserved for internal use */ }; +#define ipc_perm __ipc_perm_new +#else /* !_POSIX_C_SOURCE */ +#define ipc_perm __ipc_perm_old +#endif /* !_POSIX_C_SOURCE */ -/* common mode bits */ -#define IPC_R 000400 /* read permission */ -#define IPC_W 000200 /* write/alter permission */ -#define IPC_M 010000 /* permission to change control info */ +#if !defined(__POSIX_C_SOURCE) && !defined(__LP64__) +/* + * Legacy structure; this structure is maintained for binary backward + * compatability with previous versions of the interface. New code + * should not use this interface, since ID values may be truncated. 
+ */ +struct __ipc_perm_old { + __uint16_t cuid; /* Creator's user ID */ + __uint16_t cgid; /* Creator's group ID */ + __uint16_t uid; /* Owner's user ID */ + __uint16_t gid; /* Owner's group ID */ + mode_t mode; /* Read/Write permission */ + __uint16_t seq; /* Reserved for internal use */ + key_t key; /* Reserved for internal use */ +}; +#endif /* !_POSIX_C_SOURCE */ + +/* + * [XSI] Definitions shall be provided for the following constants: + */ + +/* Mode bits */ +#define IPC_CREAT 001000 /* Create entry if key does not exist */ +#define IPC_EXCL 002000 /* Fail if key exists */ +#define IPC_NOWAIT 004000 /* Error if request must wait */ -/* SVID required constants (same values as system 5) */ -#define IPC_CREAT 001000 /* create entry if key does not exist */ -#define IPC_EXCL 002000 /* fail if key exists */ -#define IPC_NOWAIT 004000 /* error if request must wait */ +/* Keys */ +#define IPC_PRIVATE ((key_t)0) /* Private key */ -#define IPC_PRIVATE (key_t)0 /* private key */ +/* Control commands */ +#define IPC_RMID 0 /* Remove identifier */ +#define IPC_SET 1 /* Set options */ +#define IPC_STAT 2 /* Get options */ -#define IPC_RMID 0 /* remove identifier */ -#define IPC_SET 1 /* set options */ -#define IPC_STAT 2 /* get options */ -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifndef _POSIX_C_SOURCE + +/* common mode bits */ +#define IPC_R 000400 /* Read permission */ +#define IPC_W 000200 /* Write/alter permission */ +#define IPC_M 010000 /* Modify control info permission */ + +#endif /* !_POSIX_C_SOURCE */ + + +#ifdef BSD_KERNEL_PRIVATE +/* + * Kernel implementation details which should not be utilized by user + * space programs. + */ /* Macros to convert between ipc ids and array indices or sequence ids */ #define IPCID_TO_IX(id) ((id) & 0xffff) @@ -108,18 +182,16 @@ struct ipc_perm { struct ucred; -int ipcperm __P((struct ucred *, struct ipc_perm *, int)); -#endif /* __APPLE_API_PRIVATE */ -#else /* ! KERNEL */ - -/* XXX doesn't really belong here, but has been historical practice in SysV. */ +int ipcperm(struct ucred *, struct ipc_perm *, int); +#endif /* BSD_KERNEL_PRIVATE */ -#include <sys/cdefs.h> +#ifndef KERNEL __BEGIN_DECLS -key_t ftok __P((const char *, int)); +/* [XSI] */ +key_t ftok(const char *, int); __END_DECLS -#endif /* KERNEL */ +#endif /* !KERNEL */ #endif /* !_SYS_IPC_H_ */ diff --git a/bsd/sys/ipcs.h b/bsd/sys/ipcs.h new file mode 100644 index 000000000..e4a6e23f4 --- /dev/null +++ b/bsd/sys/ipcs.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2004-2005 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. 
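With the __P macros gone, the user-visible surface of <sys/ipc.h> above reduces to ftok() plus the IPC_* mode bits, keys, and control commands. A minimal sketch of the round trip follows; it leans on the standard SysV semaphore calls from <sys/sem.h>, which are outside this header, and is illustrative only.

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/sem.h>    /* semget/semctl, standard SysV interfaces */

/* Sketch: derive a key with ftok(), create a semaphore set with the
 * IPC_CREAT/IPC_EXCL bits defined above, then remove it with IPC_RMID. */
int
main(void)
{
    key_t key = ftok("/tmp", 'k');      /* the path must already exist */
    int semid;

    if (key == (key_t)-1) {
        perror("ftok");
        return (1);
    }
    semid = semget(key, 1, IPC_CREAT | IPC_EXCL | 0600);
    if (semid < 0) {
        perror("semget");
        return (1);
    }
    (void)semctl(semid, 0, IPC_RMID);   /* "remove identifier" */
    return (0);
}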
+ * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * NOTE: Internal ipcs.h header; all interfaces are private; if you want this + * same information from your own program, popen(3) the ipcs(2) command and + * parse its output, or your program may not work on future OS releases. + */ + +#ifndef _SYS_IPCS_H_ +#define _SYS_IPCS_H_ + +#include <sys/appleapiopts.h> +#include <sys/cdefs.h> + +#define IPCS_MAGIC 0x00000001 /* Version */ + +/* + * IPCS_command + * + * This is the IPCS command structure used for obtaining status about the + * System V IPC mechanisms. All other operations are based on the per + * subsystem (shm, msg, ipc) *ctl entry point, which can be called once + * this information is known. + */ + +struct IPCS_command { + int ipcs_magic; /* Magic number for struct layout */ + int ipcs_op; /* Operation to perform */ + int ipcs_cursor; /* Cursor for iteration functions */ + int ipcs_datalen; /* Length of ipcs_data area */ + void *ipcs_data; /* OP specific data */ +}; + +#ifdef KERNEL_PRIVATE +#include <machine/types.h> + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_IPCS_command { + int ipcs_magic; /* Magic number for struct layout */ + int ipcs_op; /* Operation to perform */ + int ipcs_cursor; /* Cursor for iteration functions */ + int ipcs_datalen; /* Length of ipcs_data area */ + user_addr_t ipcs_data; /* OP specific data */ +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +#endif /* KERNEL_PRIVATE */ + +/* + * OP code values for 'ipcs_op' + */ +#define IPCS_SHM_CONF 0x00000001 /* Obtain shared memory config */ +#define IPCS_SHM_ITER 0x00000002 /* Iterate shared memory info */ + +#define IPCS_SEM_CONF 0x00000010 /* Obtain semaphore config */ +#define IPCS_SEM_ITER 0x00000020 /* Iterate semaphore info */ + +#define IPCS_MSG_CONF 0x00000100 /* Obtain message queue config */ +#define IPCS_MSG_ITER 0x00000200 /* Iterate message queue info */ + +/* + * Sysctl oid name values + */ +#define IPCS_SHM_SYSCTL "kern.sysv.ipcs.shm" +#define IPCS_SEM_SYSCTL "kern.sysv.ipcs.sem" +#define IPCS_MSG_SYSCTL "kern.sysv.ipcs.msg" + + +#endif /* _SYS_IPCS_H_ */ diff --git a/bsd/sys/kauth.h b/bsd/sys/kauth.h new file mode 100644 index 000000000..eb87187e9 --- /dev/null +++ b/bsd/sys/kauth.h @@ -0,0 +1,652 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef _SYS_KAUTH_H +#define _SYS_KAUTH_H + +#include <sys/appleapiopts.h> +#include <sys/cdefs.h> + +#ifdef __APPLE_API_EVOLVING + +/* + * Identities. 
+ */ + +#define KAUTH_UID_NONE (~(uid_t)0 - 100) /* not a valid UID */ +#define KAUTH_GID_NONE (~(gid_t)0 - 100) /* not a valid GID */ + +#ifndef _KAUTH_GUID +#define _KAUTH_GUID +/* Apple-style globally unique identifier */ +typedef struct { +#define KAUTH_GUID_SIZE 16 /* 128-bit identifier */ + unsigned char g_guid[KAUTH_GUID_SIZE]; +} guid_t; +#define _GUID_T +#endif /* _KAUTH_GUID */ + +/* NT Security Identifier, structure as defined by Microsoft */ +#pragma pack(1) /* push packing of 1 byte */ +typedef struct { + u_int8_t sid_kind; + u_int8_t sid_authcount; + u_int8_t sid_authority[6]; +#define KAUTH_NTSID_MAX_AUTHORITIES 16 + u_int32_t sid_authorities[KAUTH_NTSID_MAX_AUTHORITIES]; +} ntsid_t; +#pragma pack() /* pop packing to previous packing level */ +#define _NTSID_T + +/* valid byte count inside a SID structure */ +#define KAUTH_NTSID_HDRSIZE (8) +#define KAUTH_NTSID_SIZE(_s) (KAUTH_NTSID_HDRSIZE + ((_s)->sid_authcount * sizeof(u_int32_t))) + +/* + * External lookup message payload + */ +struct kauth_identity_extlookup { + u_int32_t el_seqno; /* request sequence number */ + u_int32_t el_result; /* lookup result */ +#define KAUTH_EXTLOOKUP_SUCCESS 0 /* results here are good */ +#define KAUTH_EXTLOOKUP_BADRQ 1 /* request badly formatted */ +#define KAUTH_EXTLOOKUP_FAILURE 2 /* transient failure during lookup */ +#define KAUTH_EXTLOOKUP_FATAL 3 /* permanent failure during lookup */ +#define KAUTH_EXTLOOKUP_INPROG 100 /* request in progress */ + u_int32_t el_flags; +#define KAUTH_EXTLOOKUP_VALID_UID (1<<0) +#define KAUTH_EXTLOOKUP_VALID_UGUID (1<<1) +#define KAUTH_EXTLOOKUP_VALID_USID (1<<2) +#define KAUTH_EXTLOOKUP_VALID_GID (1<<3) +#define KAUTH_EXTLOOKUP_VALID_GGUID (1<<4) +#define KAUTH_EXTLOOKUP_VALID_GSID (1<<5) +#define KAUTH_EXTLOOKUP_WANT_UID (1<<6) +#define KAUTH_EXTLOOKUP_WANT_UGUID (1<<7) +#define KAUTH_EXTLOOKUP_WANT_USID (1<<8) +#define KAUTH_EXTLOOKUP_WANT_GID (1<<9) +#define KAUTH_EXTLOOKUP_WANT_GGUID (1<<10) +#define KAUTH_EXTLOOKUP_WANT_GSID (1<<11) +#define KAUTH_EXTLOOKUP_WANT_MEMBERSHIP (1<<12) +#define KAUTH_EXTLOOKUP_VALID_MEMBERSHIP (1<<13) +#define KAUTH_EXTLOOKUP_ISMEMBER (1<<14) + uid_t el_uid; /* user ID */ + guid_t el_uguid; /* user GUID */ + u_int32_t el_uguid_valid; /* TTL on translation result (seconds) */ + ntsid_t el_usid; /* user NT SID */ + u_int32_t el_usid_valid; /* TTL on translation result (seconds) */ + gid_t el_gid; /* group ID */ + guid_t el_gguid; /* group GUID */ + u_int32_t el_gguid_valid; /* TTL on translation result (seconds) */ + ntsid_t el_gsid; /* group SID */ + u_int32_t el_gsid_valid; /* TTL on translation result (seconds) */ + u_int32_t el_member_valid; /* TTL on group lookup result */ +}; + +#define KAUTH_EXTLOOKUP_REGISTER (0) +#define KAUTH_EXTLOOKUP_RESULT (1<<0) +#define KAUTH_EXTLOOKUP_WORKER (1<<1) + + +#ifdef KERNEL +/* + * Credentials. + */ + +#if 0 +/* + * Supplemental credential data. + * + * This interface allows us to associate arbitrary data with a credential. + * As with the credential, the data is considered immutable. 
+ */ +struct kauth_cred_supplement { + TAILQ_ENTRY(kauth_cred_supplement) kcs_link; + + int kcs_ref; /* reference count */ + int kcs_id; /* vended identifier */ + size_t kcs_size; /* size of data field */ + char kcs_data[0]; +}; + +typedef struct kauth_cred_supplement *kauth_cred_supplement_t; + +struct kauth_cred { + TAILQ_ENTRY(kauth_cred) kc_link; + + int kc_ref; /* reference count */ + uid_t kc_uid; /* effective user id */ + uid_t kc_ruid; /* real user id */ + uid_t kc_svuid; /* saved user id */ + gid_t kc_gid; /* effective group id */ + gid_t kc_rgid; /* real group id */ + gid_t kc_svgid; /* saved group id */ + + int kc_flags; +#define KAUTH_CRED_GRPOVERRIDE (1<<0) /* private group list is authoritative */ + + int kc_npvtgroups; /* private group list, advisory or authoritative */ + gid_t kc_pvtgroups[NGROUPS]; /* based on KAUTH_CRED_GRPOVERRIDE flag */ + + int kc_nsuppgroups; /* supplementary group list */ + gid_t *kc_suppgroups; + + int kc_nwhtgroups; /* whiteout group list */ + gid_t *kc_whtgroups; + + struct auditinfo cr_au; /* user auditing data */ + + int kc_nsupplement; /* entry count in supplemental data pointer array */ + kauth_cred_supplement_t *kc_supplement; +}; +#else + +/* XXX just for now */ +#include <sys/ucred.h> +// typedef struct ucred *kauth_cred_t; +#endif + +/* Kernel SPI for now */ +__BEGIN_DECLS +extern uid_t kauth_getuid(void); +extern uid_t kauth_getruid(void); +extern gid_t kauth_getgid(void); +extern gid_t kauth_getrgid(void); +extern kauth_cred_t kauth_cred_get(void); +extern kauth_cred_t kauth_cred_get_with_ref(void); +extern kauth_cred_t kauth_cred_proc_ref(proc_t procp); +extern kauth_cred_t kauth_cred_alloc(void); +extern kauth_cred_t kauth_cred_create(kauth_cred_t cred); +extern void kauth_cred_ref(kauth_cred_t _cred); +extern void kauth_cred_rele(kauth_cred_t _cred); +extern kauth_cred_t kauth_cred_dup(kauth_cred_t cred); +extern kauth_cred_t kauth_cred_copy_real(kauth_cred_t cred); +extern void kauth_cred_unref(kauth_cred_t _cred); +extern kauth_cred_t kauth_cred_setuid(kauth_cred_t cred, uid_t uid); +extern kauth_cred_t kauth_cred_seteuid(kauth_cred_t cred, uid_t euid); +extern kauth_cred_t kauth_cred_setgid(kauth_cred_t cred, gid_t gid); +extern kauth_cred_t kauth_cred_setegid(kauth_cred_t cred, gid_t egid); +extern kauth_cred_t kauth_cred_setuidgid(kauth_cred_t cred, uid_t uid, gid_t gid); +extern kauth_cred_t kauth_cred_setsvuidgid(kauth_cred_t cred, uid_t uid, gid_t gid); +extern kauth_cred_t kauth_cred_setgroups(kauth_cred_t cred, gid_t *groups, int groupcount, uid_t gmuid); +extern kauth_cred_t kauth_cred_find(kauth_cred_t cred); +extern int kauth_cred_getgroups(gid_t *_groups, int *_groupcount); +extern int kauth_cred_assume(uid_t _uid); +extern uid_t kauth_cred_getuid(kauth_cred_t _cred); +extern gid_t kauth_cred_getgid(kauth_cred_t _cred); +extern int kauth_cred_guid2uid(guid_t *_guid, uid_t *_uidp); +extern int kauth_cred_guid2gid(guid_t *_guid, gid_t *_gidp); +extern int kauth_cred_ntsid2uid(ntsid_t *_sid, uid_t *_uidp); +extern int kauth_cred_ntsid2gid(ntsid_t *_sid, gid_t *_gidp); +extern int kauth_cred_ntsid2guid(ntsid_t *_sid, guid_t *_guidp); +extern int kauth_cred_uid2guid(uid_t _uid, guid_t *_guidp); +extern int kauth_cred_getguid(kauth_cred_t _cred, guid_t *_guidp); +extern int kauth_cred_gid2guid(gid_t _gid, guid_t *_guidp); +extern int kauth_cred_uid2ntsid(uid_t _uid, ntsid_t *_sidp); +extern int kauth_cred_getntsid(kauth_cred_t _cred, ntsid_t *_sidp); +extern int kauth_cred_gid2ntsid(gid_t _gid, ntsid_t *_sidp); +extern int 
kauth_cred_guid2ntsid(guid_t *_guid, ntsid_t *_sidp); +extern int kauth_cred_ismember_gid(kauth_cred_t _cred, gid_t _gid, int *_resultp); +extern int kauth_cred_ismember_guid(kauth_cred_t _cred, guid_t *_guidp, int *_resultp); + +extern int kauth_cred_supplementary_register(const char *name, int *ident); +extern int kauth_cred_supplementary_add(kauth_cred_t cred, int ident, const void *data, size_t datasize); +extern int kauth_cred_supplementary_remove(kauth_cred_t cred, int ident); + +/* NOT KPI - fast path for in-kernel code only */ +extern int kauth_cred_issuser(kauth_cred_t _cred); + + +/* GUID, NTSID helpers */ +extern guid_t kauth_null_guid; +extern int kauth_guid_equal(guid_t *_guid1, guid_t *_guid2); +extern int kauth_ntsid_equal(ntsid_t *_sid1, ntsid_t *_sid2); + +extern int kauth_wellknown_guid(guid_t *_guid); +#define KAUTH_WKG_NOT 0 /* not a well-known GUID */ +#define KAUTH_WKG_OWNER 1 +#define KAUTH_WKG_GROUP 2 +#define KAUTH_WKG_NOBODY 3 +#define KAUTH_WKG_EVERYBODY 4 + +extern int cantrace(proc_t cur_procp, kauth_cred_t creds, proc_t traced_procp, int *errp); + +__END_DECLS + +#endif /* KERNEL */ + +/* + * Generic Access Control Lists. + */ +#if defined(KERNEL) || defined (_SYS_ACL_H) + +typedef u_int32_t kauth_ace_rights_t; + +/* Access Control List Entry (ACE) */ +struct kauth_ace { + guid_t ace_applicable; + u_int32_t ace_flags; +#define KAUTH_ACE_KINDMASK 0xf +#define KAUTH_ACE_PERMIT 1 +#define KAUTH_ACE_DENY 2 +#define KAUTH_ACE_AUDIT 3 /* not implemented */ +#define KAUTH_ACE_ALARM 4 /* not implemented */ +#define KAUTH_ACE_INHERITED (1<<4) +#define KAUTH_ACE_FILE_INHERIT (1<<5) +#define KAUTH_ACE_DIRECTORY_INHERIT (1<<6) +#define KAUTH_ACE_LIMIT_INHERIT (1<<7) +#define KAUTH_ACE_ONLY_INHERIT (1<<8) +#define KAUTH_ACE_SUCCESS (1<<9) /* not implemented (AUDIT/ALARM) */ +#define KAUTH_ACE_FAILURE (1<<10) /* not implemented (AUDIT/ALARM) */ + kauth_ace_rights_t ace_rights; /* scope specific */ + /* These rights are never tested, but may be present in an ACL */ +#define KAUTH_ACE_GENERIC_ALL (1<<21) +#define KAUTH_ACE_GENERIC_EXECUTE (1<<22) +#define KAUTH_ACE_GENERIC_WRITE (1<<23) +#define KAUTH_ACE_GENERIC_READ (1<<24) + +}; + +#ifndef _KAUTH_ACE +#define _KAUTH_ACE +typedef struct kauth_ace *kauth_ace_t; +#endif + + +/* Access Control List */ +struct kauth_acl { + u_int32_t acl_entrycount; + u_int32_t acl_flags; + + struct kauth_ace acl_ace[]; +}; + +/* + * XXX this value needs to be raised - 3893388 + */ +#define KAUTH_ACL_MAX_ENTRIES 128 + +/* + * The low 16 bits of the flags field are reserved for filesystem + * internal use and must be preserved by all APIs. This includes + * round-tripping flags through user-space interfaces. + */ +#define KAUTH_ACL_FLAGS_PRIVATE (0xffff) + +/* + * The high 16 bits of the flags are used to store attributes and + * to request specific handling of the ACL. + */ + +/* inheritance will be deferred until the first rename operation */ +#define KAUTH_ACL_DEFER_INHERIT (1<<16) +/* this ACL must not be overwritten as part of an inheritance operation */ +#define KAUTH_ACL_NO_INHERIT (1<<17) + +#define KAUTH_ACL_SIZE(c) (sizeof(struct kauth_acl) + (c) * sizeof(struct kauth_ace)) +#define KAUTH_ACL_COPYSIZE(p) KAUTH_ACL_SIZE((p)->acl_entrycount) + + +#ifndef _KAUTH_ACL +#define _KAUTH_ACL +typedef struct kauth_acl *kauth_acl_t; +#endif + +#ifdef KERNEL +__BEGIN_DECLS +kauth_acl_t kauth_acl_alloc(int size); +void kauth_acl_free(kauth_acl_t fsp); +__END_DECLS +#endif + + +/* + * Extended File Security. 
+ */ + +/* File Security information */ +struct kauth_filesec { + u_int32_t fsec_magic; +#define KAUTH_FILESEC_MAGIC 0x012cc16d + guid_t fsec_owner; + guid_t fsec_group; + + struct kauth_acl fsec_acl; + /* acl_entrycount that tells us the ACL is not valid */ +#define KAUTH_FILESEC_NOACL ((u_int32_t)(-1)) +}; + +/* backwards compatibility */ +#define fsec_entrycount fsec_acl.acl_entrycount +#define fsec_flags fsec_acl.acl_flags +#define fsec_ace fsec_acl.acl_ace +#define KAUTH_FILESEC_FLAGS_PRIVATE KAUTH_ACL_FLAGS_PRIVATE +#define KAUTH_FILESEC_DEFER_INHERIT KAUTH_ACL_DEFER_INHERIT +#define KAUTH_FILESEC_NO_INHERIT KAUTH_ACL_NO_INHERIT +#define KAUTH_FILESEC_NONE ((kauth_filesec_t)0) +#define KAUTH_FILESEC_WANTED ((kauth_filesec_t)1) + +#ifndef _KAUTH_FILESEC +#define _KAUTH_FILESEC +typedef struct kauth_filesec *kauth_filesec_t; +#endif + +#define KAUTH_FILESEC_SIZE(c) (sizeof(struct kauth_filesec) + (c) * sizeof(struct kauth_ace)) +#define KAUTH_FILESEC_COPYSIZE(p) KAUTH_FILESEC_SIZE(((p)->fsec_entrycount == KAUTH_FILESEC_NOACL) ? 0 : (p)->fsec_entrycount) +#define KAUTH_FILESEC_COUNT(s) ((s - sizeof(struct kauth_filesec)) / sizeof(struct kauth_ace)) + +#define KAUTH_FILESEC_XATTR "com.apple.system.Security" + +__BEGIN_DECLS +kauth_filesec_t kauth_filesec_alloc(int size); +void kauth_filesec_free(kauth_filesec_t fsp); +int kauth_copyinfilesec(user_addr_t xsecurity, kauth_filesec_t *xsecdestpp); +__END_DECLS + +#endif /* KERNEL || <sys/acl.h> */ + + +#ifdef KERNEL +/* + * Scope management. + */ +struct kauth_scope; +typedef struct kauth_scope *kauth_scope_t; +struct kauth_listener; +typedef struct kauth_listener *kauth_listener_t; +#ifndef _KAUTH_ACTION_T +typedef int kauth_action_t; +# define _KAUTH_ACTION_T +#endif + +typedef int (* kauth_scope_callback_t)(kauth_cred_t _credential, + void *_idata, + kauth_action_t _action, + uintptr_t _arg0, + uintptr_t _arg1, + uintptr_t _arg2, + uintptr_t _arg3); + +#define KAUTH_RESULT_ALLOW (1) +#define KAUTH_RESULT_DENY (2) +#define KAUTH_RESULT_DEFER (3) + +struct kauth_acl_eval { + kauth_ace_t ae_acl; + int ae_count; + kauth_ace_rights_t ae_requested; + kauth_ace_rights_t ae_residual; + int ae_result; + int ae_options; +#define KAUTH_AEVAL_IS_OWNER (1<<0) /* authorizing operation for owner */ +#define KAUTH_AEVAL_IN_GROUP (1<<1) /* authorizing operation for groupmember */ + /* expansions for 'generic' rights bits */ + kauth_ace_rights_t ae_exp_gall; + kauth_ace_rights_t ae_exp_gread; + kauth_ace_rights_t ae_exp_gwrite; + kauth_ace_rights_t ae_exp_gexec; +}; + +typedef struct kauth_acl_eval *kauth_acl_eval_t; + +__BEGIN_DECLS +extern kauth_scope_t kauth_register_scope(const char *_identifier, kauth_scope_callback_t _callback, void *_idata); +extern void kauth_deregister_scope(kauth_scope_t _scope); +extern kauth_listener_t kauth_listen_scope(const char *_identifier, kauth_scope_callback_t _callback, void *_idata); +extern void kauth_unlisten_scope(kauth_listener_t _scope); +extern int kauth_authorize_action(kauth_scope_t _scope, kauth_cred_t _credential, kauth_action_t _action, + uintptr_t _arg0, uintptr_t _arg1, uintptr_t _arg2, uintptr_t _arg3); +extern int kauth_acl_evaluate(kauth_cred_t _credential, kauth_acl_eval_t _eval); +extern int kauth_acl_inherit(vnode_t _dvp, kauth_acl_t _initial, kauth_acl_t *_product, int _isdir, vfs_context_t _ctx); + +/* default scope handlers */ +extern int kauth_authorize_allow(kauth_cred_t _credential, void *_idata, kauth_action_t _action, + uintptr_t _arg0, uintptr_t _arg1, uintptr_t _arg2, uintptr_t _arg3); 
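The declarations above are enough to sketch the full scope life cycle. The sketch below is hypothetical: the scope identifier string and action number are invented for illustration, and it only uses interfaces declared in this hunk (kauth_register_scope, kauth_authorize_action, kauth_deregister_scope, kauth_cred_get, kauth_cred_getuid, and the KAUTH_RESULT_* constants). Per the constants above, a callback returns KAUTH_RESULT_DEFER when it has no opinion.

#include <sys/kauth.h>

#define DEMO_SCOPE  "com.example.kauth.demo"   /* hypothetical identifier */
#define DEMO_ACTION 1                          /* hypothetical action */

static kauth_scope_t demo_scope;

/* Scope callback: allow root outright, otherwise express no opinion. */
static int
demo_cb(kauth_cred_t cred, void *idata, kauth_action_t action,
    uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
{
	if (action == DEMO_ACTION && kauth_cred_getuid(cred) == 0)
		return (KAUTH_RESULT_ALLOW);
	return (KAUTH_RESULT_DEFER);
}

void
demo_start(void)
{
	demo_scope = kauth_register_scope(DEMO_SCOPE, demo_cb, NULL);
}

int
demo_allowed(void)
{
	/* Zero from kauth_authorize_action means the action was allowed. */
	return (kauth_authorize_action(demo_scope, kauth_cred_get(),
	    DEMO_ACTION, 0, 0, 0, 0) == 0);
}

void
demo_stop(void)
{
	kauth_deregister_scope(demo_scope);
}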
+__END_DECLS + +/* + * Generic scope. + */ +#define KAUTH_SCOPE_GENERIC "com.apple.kauth.generic" + +/* Actions */ +#define KAUTH_GENERIC_ISSUSER 1 + +__BEGIN_DECLS +extern int kauth_authorize_generic(kauth_cred_t credential, kauth_action_t action); +__END_DECLS + +/* + * Process/task scope. + */ +#define KAUTH_SCOPE_PROCESS "com.apple.kauth.process" + +/* Actions */ +#define KAUTH_PROCESS_CANSIGNAL 1 +#define KAUTH_PROCESS_CANTRACE 2 + +__BEGIN_DECLS +extern int kauth_authorize_process(kauth_cred_t _credential, kauth_action_t _action, + struct proc *_process, uintptr_t _arg1, uintptr_t _arg2, uintptr_t _arg3); +__END_DECLS + +/* + * Vnode operation scope. + * + * Prototype for vnode_authorize is in vnode.h + */ +#define KAUTH_SCOPE_VNODE "com.apple.kauth.vnode" + +/* + * File system operation scope. + * + */ +#define KAUTH_SCOPE_FILEOP "com.apple.kauth.fileop" + +/* Actions */ +#define KAUTH_FILEOP_OPEN 1 +#define KAUTH_FILEOP_CLOSE 2 +#define KAUTH_FILEOP_RENAME 3 +#define KAUTH_FILEOP_EXCHANGE 4 +#define KAUTH_FILEOP_LINK 5 +#define KAUTH_FILEOP_EXEC 6 + +/* + * arguments passed to KAUTH_FILEOP_OPEN listeners + * arg0 is pointer to vnode (vnode *) for given user path. + * arg1 is pointer to path (char *) passed in to open. + * arguments passed to KAUTH_FILEOP_CLOSE listeners + * arg0 is pointer to vnode (vnode *) for file to be closed. + * arg1 is pointer to path (char *) of file to be closed. + * arg2 is close flags. + * arguments passed to KAUTH_FILEOP_RENAME listeners + * arg0 is pointer to "from" path (char *). + * arg1 is pointer to "to" path (char *). + * arguments passed to KAUTH_FILEOP_EXCHANGE listeners + * arg0 is pointer to file 1 path (char *). + * arg1 is pointer to file 2 path (char *). + * arguments passed to KAUTH_FILEOP_LINK listeners + * arg0 is pointer to path to file we are linking to (char *). + * arg1 is pointer to path to the new link file (char *). + * arguments passed to KAUTH_FILEOP_EXEC listeners + * arg0 is pointer to vnode (vnode *) for executable. + * arg1 is pointer to path (char *) to executable. + */ + +/* Flag values returned to close listeners. 
*/ +#define KAUTH_FILEOP_CLOSE_MODIFIED (1<<1) + +__BEGIN_DECLS +extern int kauth_authorize_fileop_has_listeners(void); +extern int kauth_authorize_fileop(kauth_cred_t _credential, kauth_action_t _action, + uintptr_t _arg0, uintptr_t _arg1); +__END_DECLS + +#endif /* KERNEL */ + +/* Actions, also rights bits in an ACE */ + +#if defined(KERNEL) || defined (_SYS_ACL_H) +#define KAUTH_VNODE_READ_DATA (1<<1) +#define KAUTH_VNODE_LIST_DIRECTORY KAUTH_VNODE_READ_DATA +#define KAUTH_VNODE_WRITE_DATA (1<<2) +#define KAUTH_VNODE_ADD_FILE KAUTH_VNODE_WRITE_DATA +#define KAUTH_VNODE_EXECUTE (1<<3) +#define KAUTH_VNODE_SEARCH KAUTH_VNODE_EXECUTE +#define KAUTH_VNODE_DELETE (1<<4) +#define KAUTH_VNODE_APPEND_DATA (1<<5) +#define KAUTH_VNODE_ADD_SUBDIRECTORY KAUTH_VNODE_APPEND_DATA +#define KAUTH_VNODE_DELETE_CHILD (1<<6) +#define KAUTH_VNODE_READ_ATTRIBUTES (1<<7) +#define KAUTH_VNODE_WRITE_ATTRIBUTES (1<<8) +#define KAUTH_VNODE_READ_EXTATTRIBUTES (1<<9) +#define KAUTH_VNODE_WRITE_EXTATTRIBUTES (1<<10) +#define KAUTH_VNODE_READ_SECURITY (1<<11) +#define KAUTH_VNODE_WRITE_SECURITY (1<<12) +#define KAUTH_VNODE_TAKE_OWNERSHIP (1<<13) + +/* backwards compatibility only */ +#define KAUTH_VNODE_CHANGE_OWNER KAUTH_VNODE_TAKE_OWNERSHIP + +/* For Windows interoperability only */ +#define KAUTH_VNODE_SYNCHRONIZE (1<<20) + +/* (1<<21) - (1<<24) are reserved for generic rights bits */ + +/* Actions not expressed as rights bits */ +/* + * Authorizes the vnode as the target of a hard link. + */ +#define KAUTH_VNODE_LINKTARGET (1<<25) + +/* + * Indicates that other steps have been taken to authorise the action, + * but authorisation should be denied for immutable objects. + */ +#define KAUTH_VNODE_CHECKIMMUTABLE (1<<26) + +/* Action modifiers */ +/* + * The KAUTH_VNODE_ACCESS bit is passed to the callback if the authorisation + * request in progress is advisory, rather than authoritative. Listeners + * performing consequential work (i.e. not strictly checking authorisation) + * may test this flag to avoid performing unnecessary work. + * + * This bit will never be present in an ACE. + */ +#define KAUTH_VNODE_ACCESS (1<<31) + +/* + * The KAUTH_VNODE_NOIMMUTABLE bit is passed to the callback along with the + * KAUTH_VNODE_WRITE_SECURITY bit (and no others) to indicate that the + * caller wishes to change one or more of the immutable flags, and the + * state of these flags should not be considered when authorizing the request. + * The system immutable flags are only ignored when the system securelevel + * is low enough to allow their removal. + */ +#define KAUTH_VNODE_NOIMMUTABLE (1<<30) + +/* The expansions of the GENERIC bits at evaluation time */ +#define KAUTH_VNODE_GENERIC_READ_BITS (KAUTH_VNODE_READ_DATA | \ + KAUTH_VNODE_READ_ATTRIBUTES | \ + KAUTH_VNODE_READ_EXTATTRIBUTES | \ + KAUTH_VNODE_READ_SECURITY) + +#define KAUTH_VNODE_GENERIC_WRITE_BITS (KAUTH_VNODE_WRITE_DATA | \ + KAUTH_VNODE_APPEND_DATA | \ + KAUTH_VNODE_DELETE | \ + KAUTH_VNODE_DELETE_CHILD | \ + KAUTH_VNODE_WRITE_ATTRIBUTES | \ + KAUTH_VNODE_WRITE_EXTATTRIBUTES | \ + KAUTH_VNODE_WRITE_SECURITY) + +#define KAUTH_VNODE_GENERIC_EXECUTE_BITS (KAUTH_VNODE_EXECUTE) + +#define KAUTH_VNODE_GENERIC_ALL_BITS (KAUTH_VNODE_GENERIC_READ_BITS | \ + KAUTH_VNODE_GENERIC_WRITE_BITS | \ + KAUTH_VNODE_GENERIC_EXECUTE_BITS) + +/* + * Some sets of bits, defined here for convenience. 
+ */ +#define KAUTH_VNODE_WRITE_RIGHTS (KAUTH_VNODE_ADD_FILE | \ + KAUTH_VNODE_ADD_SUBDIRECTORY | \ + KAUTH_VNODE_DELETE_CHILD | \ + KAUTH_VNODE_WRITE_DATA | \ + KAUTH_VNODE_APPEND_DATA | \ + KAUTH_VNODE_DELETE | \ + KAUTH_VNODE_WRITE_ATTRIBUTES | \ + KAUTH_VNODE_WRITE_EXTATTRIBUTES | \ + KAUTH_VNODE_WRITE_SECURITY | \ + KAUTH_VNODE_TAKE_OWNERSHIP | \ + KAUTH_VNODE_LINKTARGET | \ + KAUTH_VNODE_CHECKIMMUTABLE) + + +#endif /* KERNEL || <sys/acl.h> */ + +#ifdef KERNEL +#include <sys/lock.h> /* lck_grp_t */ + +/* + * Debugging + * + * XXX this wouldn't be necessary if we had a *real* debug-logging system. + */ +#if 0 +# ifndef _FN_KPRINTF +# define _FN_KPRINTF +void kprintf(const char *fmt, ...); +# endif +# define KAUTH_DEBUG_ENABLE +# define K_UUID_FMT "%08x:%08x:%08x:%08x" +# define K_UUID_ARG(_u) *(int *)&_u.g_guid[0],*(int *)&_u.g_guid[4],*(int *)&_u.g_guid[8],*(int *)&_u.g_guid[12] +# define KAUTH_DEBUG(fmt, args...) do { kprintf("%s:%d: " fmt "\n", __PRETTY_FUNCTION__, __LINE__ , ##args); } while (0) +# define KAUTH_DEBUG_CTX(_c) KAUTH_DEBUG("p = %p c = %p", _c->vc_proc, _c->vc_ucred) +# define VFS_DEBUG(_ctx, _vp, fmt, args...) \ + do { \ + kprintf("%p '%s' %s:%d " fmt "\n", \ + _ctx, \ + (_vp != NULL && _vp->v_name != NULL) ? _vp->v_name : "????", \ + __PRETTY_FUNCTION__, __LINE__ , \ + ##args); \ + } while(0) +#else +# define KAUTH_DEBUG(fmt, args...) do { } while (0) +# define VFS_DEBUG(ctx, vp, fmt, args...) do { } while(0) +#endif + +/* + * Initialisation. + */ +extern lck_grp_t *kauth_lck_grp; +__BEGIN_DECLS +extern void kauth_init(void); +extern void kauth_identity_init(void); +extern void kauth_groups_init(void); +extern void kauth_cred_init(void); +extern void kauth_resolver_init(void); +__END_DECLS +#endif + +#endif /* __APPLE_API_EVOLVING */ +#endif /* _SYS_KAUTH_H */ + diff --git a/bsd/sys/kdebug.h b/bsd/sys/kdebug.h index 0f00781f6..28f6456c7 100644 --- a/bsd/sys/kdebug.h +++ b/bsd/sys/kdebug.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -75,9 +75,11 @@ __BEGIN_DECLS #define DBG_DRIVERS 6 #define DBG_TRACE 7 #define DBG_DLIL 8 +#define DBG_SECURITY 9 #define DBG_MISC 20 #define DBG_DYLD 31 #define DBG_QT 32 +#define DBG_APPS 33 #define DBG_MIG 255 /* **** The Kernel Debug Sub Classes for Mach (DBG_MACH) **** */ @@ -96,6 +98,7 @@ __BEGIN_DECLS #define DBG_MACH_VM 0x30 /* Virtual Memory */ #define DBG_MACH_SCHED 0x40 /* Scheduler */ #define DBG_MACH_MSGID_INVALID 0x50 /* Messages - invalid */ +#define DBG_MACH_LOCKS 0x60 /* new lock APIs */ /* Codes for Scheduler (DBG_MACH_SCHED) */ #define MACH_SCHED 0x0 /* Scheduler */ @@ -211,8 +214,10 @@ __BEGIN_DECLS #define TRACEDBG_CODE(SubClass,code) KDBG_CODE(DBG_TRACE, SubClass, code) #define MISCDBG_CODE(SubClass,code) KDBG_CODE(DBG_MISC, SubClass, code) #define DLILDBG_CODE(SubClass,code) KDBG_CODE(DBG_DLIL, SubClass, code) +#define SECURITYDBG_CODE(SubClass,code) KDBG_CODE(DBG_SECURITY, SubClass, code) #define DYLDDBG_CODE(SubClass,code) KDBG_CODE(DBG_DYLD, SubClass, code) #define QTDBG_CODE(SubClass,code) KDBG_CODE(DBG_QT, SubClass, code) +#define APPSDBG_CODE(SubClass,code) KDBG_CODE(DBG_APPS, SubClass, code) /* Usage: * kernel_debug((KDBG_CODE(DBG_NETWORK, DNET_PROTOCOL, 51) | DBG_FUNC_START), @@ -262,6 +267,13 @@ extern void kernel_debug(unsigned int debugid, unsigned int arg1, unsigned int a extern void kernel_debug1(unsigned int debugid, unsigned int arg1, unsigned int arg2, unsigned int arg3, unsigned int arg4, unsigned int arg5); +/* + * LP64todo - for some reason these are problematic + */ +extern void kdbg_trace_data(struct proc *proc, long *arg_pid); + +extern void kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *arg3, long *arg4); + #if KDEBUG #define KERNEL_DEBUG(x,a,b,c,d,e) \ @@ -276,18 +288,21 @@ do { \ kernel_debug1(x,a,b,c,d,e); \ } while(0) +#define __kdebug_only + #else #define KERNEL_DEBUG(x,a,b,c,d,e) #define KERNEL_DEBUG1(x,a,b,c,d,e) +#define __kdebug_only __unused #endif #endif /* __APPLE_API_UNSTABLE */ __END_DECLS -#ifdef KERNEL_PRIVATE +#ifdef PRIVATE #ifdef __APPLE_API_PRIVATE /* * private kernel_debug definitions @@ -303,8 +318,9 @@ unsigned int arg5; /* will hold current thread */ unsigned int debugid; } kd_buf; -#define KDBG_THREAD_MASK 0x7fffffff -#define KDBG_CPU_MASK 0x80000000 +#define KDBG_TIMESTAMP_MASK 0x00ffffffffffffffULL +#define KDBG_CPU_MASK 0x0f00000000000000ULL +#define KDBG_CPU_SHIFT 56 /* Debug Flags */ #define KDBG_INIT 0x1 @@ -315,6 +331,7 @@ unsigned int debugid; #define KDBG_PIDCHECK 0x10 #define KDBG_MAPINIT 0x20 #define KDBG_PIDEXCLUDE 0x40 +#define KDBG_LOCKINIT 0x80 typedef struct { unsigned int type; @@ -393,11 +410,11 @@ typedef struct int npcbufs; int bufsize; int enable; - unsigned long pcsample_beg; - unsigned long pcsample_end; + unsigned int pcsample_beg; + unsigned int pcsample_end; } pcinfo_t; #endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL_PRIVATE */ +#endif /* PRIVATE */ #endif /* !BSD_SYS_KDEBUG_H */ diff --git a/bsd/sys/kern_audit.h b/bsd/sys/kern_audit.h deleted file mode 100644 index 7475e299e..000000000 --- a/bsd/sys/kern_audit.h +++ /dev/null @@ -1,285 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. 
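One consequence of the kdebug change above is the new packed timestamp layout: KDBG_TIMESTAMP_MASK and KDBG_CPU_MASK replace the old thread/CPU bit split, putting the CPU number in the top byte of a 64-bit word. A small sketch of unpacking it, assuming (from the unchanged part of the header, not shown in this hunk) that these masks apply to the kd_buf timestamp field:

#include <stdint.h>

/* Sketch: split the packed 64-bit kdebug timestamp into its parts,
 * using the masks defined in the hunk above. */
static inline unsigned int
kdbg_get_cpu(uint64_t ts)
{
	return ((unsigned int)((ts & KDBG_CPU_MASK) >> KDBG_CPU_SHIFT));
}

static inline uint64_t
kdbg_get_time(uint64_t ts)
{
	return (ts & KDBG_TIMESTAMP_MASK);
}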
Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ - -#ifndef _SYS_KERN_AUDIT_H -#define _SYS_KERN_AUDIT_H - -#ifdef KERNEL - -/* - * Audit subsystem condition flags. The audit_enabled flag is set and - * removed automatically as a result of configuring log files, and - * can be observed but should not be directly manipulated. The audit - * suspension flag permits audit to be temporarily disabled without - * reconfiguring the audit target. - */ -extern int audit_enabled; -extern int audit_suspended; - -#define BSM_SUCCESS 0 -#define BSM_FAILURE 1 -#define BSM_NOAUDIT 2 - -/* - * Define the masks for the audited arguments. - */ -#define ARG_EUID 0x0000000000000001ULL -#define ARG_RUID 0x0000000000000002ULL -#define ARG_SUID 0x0000000000000004ULL -#define ARG_EGID 0x0000000000000008ULL -#define ARG_RGID 0x0000000000000010ULL -#define ARG_SGID 0x0000000000000020ULL -#define ARG_PID 0x0000000000000040ULL -#define ARG_UID 0x0000000000000080ULL -#define ARG_AUID 0x0000000000000100ULL -#define ARG_GID 0x0000000000000200ULL -#define ARG_FD 0x0000000000000400ULL -#define UNUSED 0x0000000000000800ULL -#define ARG_FFLAGS 0x0000000000001000ULL -#define ARG_MODE 0x0000000000002000ULL -#define ARG_DEV 0x0000000000004000ULL -#define ARG_ACCMODE 0x0000000000008000ULL -#define ARG_CMODE 0x0000000000010000ULL -#define ARG_MASK 0x0000000000020000ULL -#define ARG_SIGNUM 0x0000000000040000ULL -#define ARG_LOGIN 0x0000000000080000ULL -#define ARG_SADDRINET 0x0000000000100000ULL -#define ARG_SADDRINET6 0x0000000000200000ULL -#define ARG_SADDRUNIX 0x0000000000400000ULL -#define ARG_KPATH1 0x0000000000800000ULL -#define ARG_KPATH2 0x0000000001000000ULL -#define ARG_UPATH1 0x0000000002000000ULL -#define ARG_UPATH2 0x0000000004000000ULL -#define ARG_TEXT 0x0000000008000000ULL -#define ARG_VNODE1 0x0000000010000000ULL -#define ARG_VNODE2 0x0000000020000000ULL -#define ARG_SVIPC_CMD 0x0000000040000000ULL -#define ARG_SVIPC_PERM 0x0000000080000000ULL -#define ARG_SVIPC_ID 0x0000000100000000ULL -#define ARG_SVIPC_ADDR 0x0000000200000000ULL -#define ARG_GROUPSET 0x0000000400000000ULL -#define ARG_CMD 0x0000000800000000ULL -#define ARG_SOCKINFO 0x0000001000000000ULL -#define ARG_NONE 0x0000000000000000ULL -#define ARG_ALL 0xFFFFFFFFFFFFFFFFULL - -struct vnode_au_info { - mode_t vn_mode; - uid_t vn_uid; - gid_t vn_gid; - dev_t vn_dev; - long vn_fsid; - long vn_fileid; - long vn_gen; -}; - -struct groupset { - gid_t gidset[NGROUPS]; - u_int gidset_size; -}; - -struct socket_info { - int sodomain; - int sotype; - int soprotocol; -}; - -struct audit_record { - /* Audit record header. */ - u_int32_t ar_magic; - int ar_event; - int ar_retval; /* value returned to the process */ - int ar_errno; /* return status of system call */ - struct timespec ar_starttime; - struct timespec ar_endtime; - u_int64_t ar_valid_arg; /* Bitmask of valid arguments */ - - /* Audit subject information. 
*/ - struct xucred ar_subj_cred; - uid_t ar_subj_ruid; - gid_t ar_subj_rgid; - gid_t ar_subj_egid; - uid_t ar_subj_auid; /* Audit user ID */ - pid_t ar_subj_asid; /* Audit session ID */ - pid_t ar_subj_pid; - struct au_tid ar_subj_term; - char ar_subj_comm[MAXCOMLEN + 1]; - struct au_mask ar_subj_amask; - - /* Operation arguments. */ - uid_t ar_arg_euid; - uid_t ar_arg_ruid; - uid_t ar_arg_suid; - gid_t ar_arg_egid; - gid_t ar_arg_rgid; - gid_t ar_arg_sgid; - pid_t ar_arg_pid; - uid_t ar_arg_uid; - uid_t ar_arg_auid; - gid_t ar_arg_gid; - struct groupset ar_arg_groups; - int ar_arg_fd; - int ar_arg_fflags; - mode_t ar_arg_mode; - int ar_arg_dev; - int ar_arg_accmode; - int ar_arg_cmode; - int ar_arg_mask; - u_int ar_arg_signum; - char ar_arg_login[MAXLOGNAME]; - struct sockaddr ar_arg_sockaddr; - struct socket_info ar_arg_sockinfo; - char *ar_arg_upath1; - char *ar_arg_upath2; - char *ar_arg_kpath1; - char *ar_arg_kpath2; - char *ar_arg_text; - struct au_mask ar_arg_amask; - struct vnode_au_info ar_arg_vnode1; - struct vnode_au_info ar_arg_vnode2; - int ar_arg_cmd; - int ar_arg_svipc_cmd; - struct ipc_perm ar_arg_svipc_perm; - int ar_arg_svipc_id; - void * ar_arg_svipc_addr; -}; - -/* - * In-kernel version of audit record; the basic record plus queue meta-data. - * This record can also have a pointer set to some opaque data that will - * be passed through to the audit writing mechanism. - */ -struct kaudit_record { - struct audit_record k_ar; - caddr_t k_udata; /* user data */ - u_int k_ulen; /* user data length */ - struct uthread *k_uthread; /* thread we are auditing */ - TAILQ_ENTRY(kaudit_record) k_q; -}; - -struct proc; -struct vnode; -struct componentname; - -void audit_abort(struct kaudit_record *ar); -void audit_commit(struct kaudit_record *ar, int error, - int retval); -void audit_init(void); -void audit_shutdown(void); - -struct kaudit_record *audit_new(int event, struct proc *p, - struct uthread *uthread); - -void audit_syscall_enter(unsigned short code, struct proc *proc, struct uthread *uthread); -void audit_syscall_exit(int error, struct proc *proc, - struct uthread *uthread); - -int kaudit_to_bsm(struct kaudit_record *kar, - struct au_record **pau); - -int bsm_rec_verify(caddr_t rec); - -/* - * Kernel versions of the BSM audit record functions. - */ -struct au_record *kau_open(void); -int kau_write(struct au_record *rec, token_t *m); -int kau_close(struct au_record *rec, - struct timespec *endtime, short event); -void kau_free(struct au_record *rec); -void kau_init(void); -token_t *kau_to_file(char *file, struct timeval *tv); -token_t *kau_to_header(struct timespec *ctime, int rec_size, - au_event_t e_type, au_emod_t e_mod); -token_t *kau_to_header32(struct timespec *ctime, int rec_size, - au_event_t e_type, au_emod_t e_mod); -token_t *kau_to_header64(struct timespec *ctime, int rec_size, - au_event_t e_type, au_emod_t e_mod); -/* - * The remaining kernel functions are conditionally compiled in as they - * are wrapped by a macro, and the macro should be the only place in - * the source tree where these functions are referenced. 
- */ -#ifdef AUDIT -void audit_arg_accmode(int mode); -void audit_arg_cmode(int cmode); -void audit_arg_fd(int fd); -void audit_arg_fflags(int fflags); -void audit_arg_gid(gid_t gid, gid_t egid, gid_t rgid, - gid_t sgid); -void audit_arg_uid(uid_t uid, uid_t euid, uid_t ruid, - uid_t suid); -void audit_arg_groupset(gid_t *gidset, u_int gidset_size); -void audit_arg_login(char[MAXLOGNAME]); -void audit_arg_mask(int mask); -void audit_arg_mode(mode_t mode); -void audit_arg_dev(int dev); -void audit_arg_owner(uid_t uid, gid_t gid); -void audit_arg_pid(pid_t pid); -void audit_arg_signum(u_int signum); -void audit_arg_socket(int sodomain, int sotype, - int soprotocol); -void audit_arg_sockaddr(struct proc *p, - struct sockaddr *so); -void audit_arg_auid(uid_t auid); -void audit_arg_upath(struct proc *p, char *upath, - u_int64_t flags); -void audit_arg_vnpath(struct vnode *vp, u_int64_t flags); -void audit_arg_text(char *text); -void audit_arg_cmd(int cmd); -void audit_arg_svipc_cmd(int cmd); -void audit_arg_svipc_perm(struct ipc_perm *perm); -void audit_arg_svipc_id(int id); -void audit_arg_svipc_addr(void *addr); - -void audit_proc_init(struct proc *p); -void audit_proc_fork(struct proc *parent, - struct proc *child); -void audit_proc_free(struct proc *p); - -/* - * Define a macro to wrap the audit_arg_* calls by checking the global - * audit_enabled flag before performing the actual call. - */ -#define AUDIT_ARG(op, args...) do { \ - if (audit_enabled) \ - audit_arg_ ## op (args); \ - } while (0) - -#define AUDIT_CMD(audit_cmd) do { \ - if (audit_enabled) { \ - audit_cmd; \ - } \ - } while (0) - -#else /* !AUDIT */ -#define AUDIT_ARG(op, args...) do { \ - } while (0) - -#define AUDIT_CMD(audit_cmd) do { \ - } while (0) - -#endif /* AUDIT */ - -#endif /* KERNEL */ - -#endif /* !_SYS_KERN_AUDIT_H */ diff --git a/bsd/sys/kern_control.h b/bsd/sys/kern_control.h index 04b37c06a..f032e79a2 100644 --- a/bsd/sys/kern_control.h +++ b/bsd/sys/kern_control.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -19,220 +19,409 @@ * * @APPLE_LICENSE_HEADER_END@ */ +/*! + @header kern_control.h + This header defines an API to communicate between a kernel + extension and a process outside of the kernel. + */ +#ifndef KPI_KERN_CONTROL_H +#define KPI_KERN_CONTROL_H -#ifndef SYS_KERN_CONTROL_H -#define SYS_KERN_CONTROL_H #include <sys/appleapiopts.h> -#ifdef __APPLE_API_UNSTABLE /* * Define Controller event subclass, and associated events. + * Subclass of KEV_SYSTEM_CLASS */ -/* Subclass of KEV_SYSTEM_CLASS */ -#define KEV_CTL_SUBCLASS 1 +/*! + @defined KEV_CTL_SUBCLASS + @discussion The kernel event subclass for kernel control events. +*/ +#define KEV_CTL_SUBCLASS 2 + +/*! + @defined KEV_CTL_REGISTERED + @discussion The event code indicating a new controller was + registered. The data portion will contain a ctl_event_data. +*/ +#define KEV_CTL_REGISTERED 1 /* a new controller appears */ -#define KEV_CTL_REGISTERED 1 /* a new controller appears */ -#define KEV_CTL_DEREGISTERED 2 /* a controller disappears */ +/*! + @defined KEV_CTL_DEREGISTERED + @discussion The event code indicating a controller was unregistered. + The data portion will contain a ctl_event_data. +*/ +#define KEV_CTL_DEREGISTERED 2 /* a controller disappears */ -/* All KEV_CTL_SUBCLASS events share the same header */ +/*! 
+ @struct ctl_event_data + @discussion This structure is used for KEV_CTL_SUBCLASS kernel + events. + @field ctl_id The kernel control id. + @field ctl_unit The kernel control unit. +*/ struct ctl_event_data { - u_int32_t ctl_id; + u_int32_t ctl_id; /* Kernel Controller ID */ u_int32_t ctl_unit; }; - /* * Controls destined to the Controller Manager. */ -#define CTLIOCGCOUNT _IOR('N', 1, int) /* get number of control structures registered */ +/*! + @defined CTLIOCGCOUNT + @discussion The CTLIOCGCOUNT ioctl can be used to determine the + number of kernel controllers registered. +*/ +#define CTLIOCGCOUNT _IOR('N', 2, int) /* get number of control structures registered */ + +/*! + @defined CTLIOCGINFO + @discussion The CTLIOCGINFO ioctl can be used to convert a kernel + control name to a kernel control id. +*/ +#define CTLIOCGINFO _IOWR('N', 3, struct ctl_info) /* get id from name */ + + +/*! + @defined MAX_KCTL_NAME + @discussion Kernel control names must be no longer than + MAX_KCTL_NAME. +*/ +#define MAX_KCTL_NAME 96 /* - * Controller address structure - * used to establish contact between user client and kernel controller - * sc_id/sc_unit uniquely identify each controller - * sc_id is a 32-bit "signature" obtained by developers from Apple Computer - * sc_unit is a unit number for this sc_id, and is privately used - * by the developper to identify several instances to control + * Controls destined to the Controller Manager. */ -struct sockaddr_ctl -{ - u_char sc_len; /* sizeof(struct sockaddr_ctl) */ - u_char sc_family; /* AF_SYSTEM */ - u_int16_t ss_sysaddr; /* AF_SYS_CONTROL */ - u_int32_t sc_id; /* 32-bit "signature" managed by Apple */ +/*! + @struct ctl_info + @discussion This structure is used with the CTLIOCGINFO ioctl to + translate from a kernel control name to a control id. + @field ctl_id The kernel control id, filled out upon return. + @field ctl_name The kernel control name to find. +*/ +struct ctl_info { + u_int32_t ctl_id; /* Kernel Controller ID */ + char ctl_name[MAX_KCTL_NAME]; /* Kernel Controller Name (a C string) */ +}; + + +/*! + @struct sockaddr_ctl + @discussion The controller address structure is used to establish + contact between a user client and a kernel controller. The + sc_id/sc_unit uniquely identify each controller. sc_id is a + unique identifier assigned to the controller. The identifier can + be assigned by the system at registration time or be a 32-bit + creator code obtained from Apple Computer. sc_unit is a unit + number for this sc_id, and is privately used by the kernel + controller to identify several instances of the controller. + @field sc_len The length of the structure. + @field sc_family AF_SYSTEM. + @field ss_sysaddr AF_SYS_KERNCONTROL. + @field sc_id Controller unique identifier. + @field sc_unit Kernel controller private unit number. + @field sc_reserved Reserved, must be set to zero. +*/ +struct sockaddr_ctl { + u_char sc_len; /* depends on size of bundle ID string */ + u_char sc_family; /* AF_SYSTEM */ + u_int16_t ss_sysaddr; /* AF_SYS_KERNCONTROL */ + u_int32_t sc_id; /* Controller unique identifier */ u_int32_t sc_unit; /* Developer private unit number */ u_int32_t sc_reserved[5]; }; -#endif /* __APPLE_API_UNSTABLE */ #ifdef KERNEL -#ifdef __APPLE_API_UNSTABLE -/* Reference to a controller object */ +#include <sys/kpi_mbuf.h> + +/*! + @typedef kern_ctl_ref + @discussion A control reference is used to track an attached kernel + control. Registering a kernel control will create a kernel + control reference. 
This reference is required for sending data + or removing the kernel control. This reference will be passed to + callbacks for that kernel control. +*/ typedef void * kern_ctl_ref; -/* Support flags for controllers */ -#define CTL_FLAG_PRIVILEGED 0x1 /* user must be root to contact controller */ +/*! + @defined CTL_FLAG_PRIVILEGED + @discussion The CTL_FLAG_PRIVILEGED flag is passed in ctl_flags. If + this flag is set, only privileged processes may attach to this + kernel control. +*/ +#define CTL_FLAG_PRIVILEGED 0x1 +/*! + @defined CTL_FLAG_REG_ID_UNIT + @discussion The CTL_FLAG_REG_ID_UNIT flag is passed to indicate that + the ctl_id specified should be used. If this flag is not + present, a unique ctl_id will be dynamically assigned to your + kernel control. The CTLIOCGINFO ioctl can be used by the client + to find the dynamically assigned id based on the control name + specified in ctl_name. +*/ +#define CTL_FLAG_REG_ID_UNIT 0x2 +/*! + @defined CTL_FLAG_REG_SOCK_STREAM + @discussion Use the CTL_FLAG_REG_SOCK_STREAM flag when a client needs to open + a socket of type SOCK_STREAM to communicate with the kernel control. + By default kernel control sockets are of type SOCK_DGRAM. +*/ +#define CTL_FLAG_REG_SOCK_STREAM 0x4 /* Data flags for controllers */ -#define CTL_DATA_NOWAKEUP 0x1 /* don't wake up client yet */ +/*! + @defined CTL_DATA_NOWAKEUP + @discussion The CTL_DATA_NOWAKEUP flag can be used for the enqueue + data and enqueue mbuf functions to indicate that the process + should not be woken up yet. This is useful when you want to + enqueue data using more than one call but only want to wake up + the client after all of the data has been enqueued. +*/ +#define CTL_DATA_NOWAKEUP 0x1 +/*! + @defined CTL_DATA_EOR + @discussion The CTL_DATA_EOR flag can be used for the enqueue + data and enqueue mbuf functions to mark the end of a record. +*/ +#define CTL_DATA_EOR 0x2 +/*! + @typedef ctl_connect_func + @discussion The ctl_connect_func is used to receive + notification of a client connecting to the kernel control. + @param kctlref The control ref for the kernel control the client is + connecting to. + @param sac The address used to connect to this control. The field sc_unit + contains the unit number of the kernel control instance the client is + connecting to. If CTL_FLAG_REG_ID_UNIT was set when the kernel control + was registered, sc_unit is the ctl_unit of the kern_ctl_reg structure. + If CTL_FLAG_REG_ID_UNIT was not set when the kernel control was + registered, sc_unit is the dynamically allocated unit number of + the new kernel control instance that is used for this connection. + @param unitinfo A place for the kernel control to store a pointer to + per-connection data. + */ +typedef errno_t (*ctl_connect_func)(kern_ctl_ref kctlref, + struct sockaddr_ctl *sac, + void **unitinfo); + +/*! + @typedef ctl_disconnect_func + @discussion The ctl_disconnect_func is used to receive notification + that a client has disconnected from the kernel control. This + usually happens when the socket is closed. If this is the last + socket attached to your kernel control, you may unregister your + kernel control from this callback. + @param kctlref The control ref for the kernel control instance the client has + disconnected from. + @param unit The unit number of the kernel control instance the client has + disconnected from. + @param unitinfo The unitinfo value specified by the connect function + when the client connected. 
*/ +typedef errno_t (*ctl_disconnect_func)(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo); + +/*! + @typedef ctl_send_func + @discussion The ctl_send_func is used to receive data sent from + the client to the kernel control. + @param kctlref The control ref of the kernel control. + @param unit The unit number of the kernel control instance the client has + connected to. + @param unitinfo The unitinfo value specified by the connect function + when the client connected. + @param m The data sent by the client to the kernel control in an + mbuf chain. + @param flags The flags specified by the client when calling + send/sendto/sendmsg (MSG_OOB/MSG_DONTROUTE). + */ +typedef errno_t (*ctl_send_func)(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo, + mbuf_t m, int flags); + +/*! + @typedef ctl_setopt_func + @discussion The ctl_setopt_func is used to handle set socket option + calls for the SYSPROTO_CONTROL option level. + @param kctlref The control ref of the kernel control. + @param unit The unit number of the kernel control instance. + @param unitinfo The unitinfo value specified by the connect function + when the client connected. + @param opt The socket option. + @param data A pointer to the socket option data. The data has + already been copied in to the kernel for you. + @param len The length of the socket option data. + */ +typedef errno_t (*ctl_setopt_func)(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo, + int opt, void *data, size_t len); + +/*! + @typedef ctl_getopt_func + @discussion The ctl_getopt_func is used to handle client get socket + option requests for the SYSPROTO_CONTROL option level. A buffer + is allocated for storage and passed to your function. The length + of that buffer is also passed. Upon return, you should set *len + to length of the buffer used. In some cases, data may be NULL. + When this happens, *len should be set to the length you would + have returned had data not been NULL. If the buffer is too small, + return an error. + @param kctlref The control ref of the kernel control. + @param unit The unit number of the kernel control instance. + @param unitinfo The unitinfo value specified by the connect function + when the client connected. + @param opt The socket option. + @param data A buffer to copy the results in to. May be NULL, see + discussion. + @param len A pointer to the length of the buffer. This should be set + to the length of the buffer used before returning. + */ +typedef errno_t (*ctl_getopt_func)(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo, + int opt, void *data, size_t *len); + +/*! + @struct kern_ctl_reg + @discussion This structure defines the properties of a kernel + control being registered. + @field ctl_name A Bundle ID string of up to MAX_KCTL_NAME bytes (including the ending zero). + This string should not be empty. + @field ctl_id The control ID may be dynamically assigned or it can be a + 32-bit creator code assigned by DTS. + For a DTS assigned creator code the CTL_FLAG_REG_ID_UNIT flag must be set. + For a dynamically assigned control ID, do not set the CTL_FLAG_REG_ID_UNIT flag. + The value of the dynamically assigned control ID is set to this field + when the registration succeeds. + @field ctl_unit A separate unit number to register multiple units that + share the same control ID with DTS assigned creator code when + the CTL_FLAG_REG_ID_UNIT flag is set. + This field is ignored for a dynamically assigned control ID. + @field ctl_flags CTL_FLAG_PRIVILEGED and/or CTL_FLAG_REG_ID_UNIT. 
+	@field ctl_sendsize Override the default send size. If set to zero,
+		the default send size will be used, and this default value
+		is set to this field to be retrieved by the caller.
+	@field ctl_recvsize Override the default receive size. If set to
+		zero, the default receive size will be used, and this default value
+		is set to this field to be retrieved by the caller.
+	@field ctl_connect Specify the function to be called whenever a client
+		connects to the kernel control. This field must be specified.
+	@field ctl_disconnect Specify a function to be called whenever a
+		client disconnects from the kernel control.
+	@field ctl_send Specify a function to handle data sent from the
+		client to the kernel control.
+	@field ctl_setopt Specify a function to handle set socket option
+		operations for the kernel control.
+	@field ctl_getopt Specify a function to handle get socket option
+		operations for the kernel control.
+*/
 struct kern_ctl_reg
 {
-	/* control information */
-	u_int32_t	ctl_id;			/* unique id of the controller, provided by DTS */
-	u_int32_t	ctl_unit;		/* unit number for the controller, for the specified id */
-					/* a controller can be registered several times with the same id */
-					/* but must have a different unit number */
-
+	/* control information */
+	char		ctl_name[MAX_KCTL_NAME];
+	u_int32_t	ctl_id;
+	u_int32_t	ctl_unit;
+
 	/* control settings */
-	u_int32_t	ctl_flags;		/* support flags */
-	u_int32_t	ctl_sendsize;		/* override send/receive buffer size */
-	u_int32_t	ctl_recvsize;		/* 0 = use default values */
+	u_int32_t	ctl_flags;
+	u_int32_t	ctl_sendsize;
+	u_int32_t	ctl_recvsize;
 
 	/* Dispatch functions */
-
-	int	(*ctl_connect)
-			(kern_ctl_ref ctlref, void *userdata);
-			/* Make contact, called when user client calls connect */
-			/* the socket with the id/unit of the controller */
-
-	void	(*ctl_disconnect)
-			(kern_ctl_ref ctlref, void *userdata);
-			/* Break contact, called when user client */
-			/* closes the control socket */
-
-	int	(*ctl_write)
-			(kern_ctl_ref ctlref, void *userdata, struct mbuf *m);
-			/* Send data to the controller, called when user client */
-			/* writes data to the socket */
-
-	int	(*ctl_set)
-			(kern_ctl_ref ctlref, void *userdata, int opt, void *data, size_t len);
-			/* set controller configuration, called when user client */
-			/* calls setsockopt() for the socket */
-			/* opt is the option number */
-			/* data points to the data, already copied in kernel space */
-			/* len is the lenght of the data buffer */
-
-	int	(*ctl_get)
-			(kern_ctl_ref ctlref, void *userdata, int opt, void *data, size_t *len);
-			/* get controller configuration, called when user client */
-			/* calls getsockopt() for the socket */
-			/* opt is the option number */
-			/* data points to the data buffer of max lenght len */
-			/* the controller can directly copy data in the buffer space */
-			/* and does not need to worry about copying out the data */
-			/* as long as it respects the max buffer lenght */
-			/* on input, len contains the maximum buffer length */
-			/* on output, len contains the actual buffer lenght */
-			/* if data is NULL on input, then, by convention, the controller */
-			/* should return in len the lenght of the data it would like */
-			/* to return in the subsequent call for that option */
-
-	/* prepare the future */
-	u_int32_t	ctl_reserved[4];	/* for future use if needed */
+	ctl_connect_func	ctl_connect;
+	ctl_disconnect_func	ctl_disconnect;
+	ctl_send_func		ctl_send;
+	ctl_setopt_func		ctl_setopt;
+	ctl_getopt_func		ctl_getopt;
 };
 
+/*!
+	@function ctl_register
+	@discussion Register a kernel control. This will enable clients to
+		connect to the kernel control using a PF_SYSTEM socket.
+	@param userkctl A structure defining the kernel control to be
+		attached. The ctl_connect callback must be specified, the other callbacks
+		are optional. If ctl_connect is set to zero, ctl_register fails with
+		the error code EINVAL.
+	@param kctlref Upon successful return, the kctlref will contain a
+		reference to the attached kernel control. This reference is used
+		to unregister the kernel control. This reference will also be
+		passed in to the callbacks each time they are called.
+	@result 0 - Kernel control was registered.
+		EINVAL - The registration structure was not valid.
+		ENOMEM - There was insufficient memory.
+		EEXIST - A controller with that id/unit is already registered.
+ */
+errno_t
+ctl_register(struct kern_ctl_reg *userkctl, kern_ctl_ref *kctlref);
 
-/*
- * FUNCTION :
- *	Register the controller to the controller manager
- *	For example, can be called from a Kernel Extension Start routine
- *
- * PARAMETERS :
- *	userctl :	Registration structure containing control information
- *			and callback functions for the controller.
- *			Callbacks are optional and can be null.
- *			A controller with all callbacks set to null would not be very useful.
- *	userdata :	This parameter is for use by the controller and
- *			will be passed to every callback function
- *
- * RETURN CODE :
- *	0 :		No error
- *			ctlref will be filled with a control reference,
- *			to use in subsequent call to the controller manager
- *	EINVAL :	Invalid registration structure
- *	ENOMEM :	Not enough memory available to register the controller
- *	EEXIST :	Controller id/unit already registered
+/*!
+	@function ctl_deregister
+	@discussion Unregister a kernel control. A kernel extension must
+		unregister its kernel control(s) before unloading. If a kernel
+		control has clients attached, this call will fail.
+	@param kctlref The control reference of the control to unregister.
+	@result 0 - Kernel control was unregistered.
+		EINVAL - The kernel control reference was invalid.
+		EBUSY - The kernel control has clients still attached.
  */
-
-int
-ctl_register(struct kern_ctl_reg *userctl, void *userdata, kern_ctl_ref *ctlref);
+errno_t
+ctl_deregister(kern_ctl_ref kctlref);
 
-/*
- * FUNCTION :
- *	Deregister the controller
- *	For example, can be called from a Kernel Extension Stop routine
- *
- * PARAMETERS :
- *	ctlref :	Reference to the controller previously registered
- *
- * RETURN CODE :
- *	0 :		No error,
- *			The controller manager no longer knows about the controller
- *	EINVAL :	Invalid reference
+/*!
+	@function ctl_enqueuedata
+	@discussion Send data from the kernel control to the client.
+	@param kctlref The control reference of the kernel control.
+	@param unit The unit number of the kernel control instance.
+	@param data A pointer to the data to send.
+	@param len The length of data to send.
+	@param flags Send flags. CTL_DATA_NOWAKEUP is currently the only
+		supported flag.
+	@result 0 - Data was enqueued to be read by the client.
+		EINVAL - Invalid parameters.
+		EMSGSIZE - The buffer is too large.
+		ENOBUFS - The queue is full or there are no free mbufs.
  */
-
-int
-ctl_deregister(kern_ctl_ref ctlref);
+errno_t
+ctl_enqueuedata(kern_ctl_ref kctlref, u_int32_t unit, void *data, size_t len, u_int32_t flags);
 
-/*
- * FUNCTION :
- *	Send data to the application in contact with the controller
- *	ctl_enqueuedata will allocate a mbuf, copy data and enqueue it.
- *
- * PARAMETERS :
- *	ctlref :	Reference to the controller previously registered
- *	data :		Data to send
- *	len :		Length of the data (maximum lenght of MCLBYTES)
- *	flags :		Flags used when enqueing
- *			CTL_DATA_NOWAKEUP = just enqueue, don't wake up client
- *
- * RETURN CODE :
- *	0 :		No error
- *	EINVAL:		Invalid reference
- *	EMSGSIZE:	The buffer is too large
- *	ENOTCONN :	No user client is connected
- *	ENOBUFS :	Socket buffer is full, or can't get a new mbuf
- *			The controller should re-enqueue later
+/*!
+	@function ctl_enqueuembuf
+	@discussion Send data stored in an mbuf chain from the kernel
+		control to the client. The caller is responsible for freeing
+		the mbuf chain if ctl_enqueuembuf returns an error.
+	@param kctlref The control reference of the kernel control.
+	@param unit The unit number of the kernel control instance.
+	@param m An mbuf chain containing the data to send to the client.
+	@param flags Send flags. CTL_DATA_NOWAKEUP is currently the only
+		supported flag.
+	@result 0 - Data was enqueued to be read by the client.
+		EINVAL - Invalid parameters.
+		ENOBUFS - The queue is full.
  */
-
-int
-ctl_enqueuedata(kern_ctl_ref ctlref, void *data, size_t len, u_int32_t flags);
+errno_t
+ctl_enqueuembuf(kern_ctl_ref kctlref, u_int32_t unit, mbuf_t m, u_int32_t flags);
 
-/*
- * FUNCTION :
- *	Send data to the application in contact with the controller
- *
- * PARAMETERS :
- *	ctlref :	Reference to the controller previously registered
- *	m :		mbuf containing the data to send
- *	flags :		Flags used when enqueing
- *			CTL_DATA_NOWAKEUP = just enqueue, don't wake up client
- *
- * RETURN CODE :
- *	0 :		No error
- *	EINVAL:		Invalid reference
- *	ENOTCONN :	No user client is connected
- *	ENOBUFS :	Socket buffer is full,
- *			The controller should either free the mbuf or re-enqueue later
+
+/*!
+	@function ctl_getenqueuespace
+	@discussion Retrieve the amount of space currently available for data to be sent
+		from the kernel control to the client.
+	@param kctlref The control reference of the kernel control.
+	@param unit The unit number of the kernel control instance.
+	@param space The address at which to return the currently available space.
+	@result 0 - The available space was returned.
+		EINVAL - Invalid parameters.
  */
-
-int
-ctl_enqueuembuf(kern_ctl_ref ctlref, struct mbuf *m, u_int32_t flags);
+errno_t
+ctl_getenqueuespace(kern_ctl_ref kctlref, u_int32_t unit, size_t *space);
+
-#endif /* __APPLE_API_UNSTABLE */
 #endif /* KERNEL */
-#endif /* SYS_KERN_CONTROL_H */
+#endif /* KPI_KERN_CONTROL_H */
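For orientation, here is a minimal sketch of how a kext might adopt the revised kernel control KPI above. The control name, the callback bodies, and the helper names (gCtlRef, hello_connect, and so on) are illustrative, not part of the header; error handling is trimmed:

    #include <sys/kern_control.h>
    #include <sys/errno.h>

    static kern_ctl_ref gCtlRef = NULL;

    /* Required callback: called once per client connect. Per-connection
     * state can be stashed in *unitinfo for later callbacks. */
    static errno_t
    hello_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac, void **unitinfo)
    {
        *unitinfo = NULL;   /* no per-connection state in this sketch */
        return 0;
    }

    static errno_t
    hello_disconnect(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo)
    {
        return 0;
    }

    /* Register with a dynamically assigned ctl_id, i.e. without
     * CTL_FLAG_REG_ID_UNIT; clients look the id up via CTLIOCGINFO. */
    static errno_t
    hello_start(void)
    {
        struct kern_ctl_reg reg;

        bzero(&reg, sizeof(reg));
        strncpy(reg.ctl_name, "com.example.hellokctl", sizeof(reg.ctl_name));
        reg.ctl_flags      = CTL_FLAG_PRIVILEGED;   /* root-only clients */
        reg.ctl_connect    = hello_connect;         /* required */
        reg.ctl_disconnect = hello_disconnect;      /* optional */

        return ctl_register(&reg, &gCtlRef);
    }

A matching ctl_deregister(gCtlRef) in the kext's stop routine, once all clients have detached, releases the registration.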
diff --git a/bsd/sys/kern_event.h b/bsd/sys/kern_event.h
index 5a232c02a..ca40cc713 100644
--- a/bsd/sys/kern_event.h
+++ b/bsd/sys/kern_event.h
@@ -20,7 +20,11 @@
  * @APPLE_LICENSE_HEADER_END@
  */
 /* Copyright (c) 1998, 1999 Apple Computer, Inc. All Rights Reserved */
-
+/*!
+	@header kern_event.h
+	This header defines in-kernel functions for generating kernel events as well
+	as functions for receiving kernel events using a kernel event socket.
+ */
 #ifndef SYS_KERN_EVENT_H
 #define SYS_KERN_EVENT_H
 
@@ -41,54 +45,184 @@
  * Vendor Code
  */
 
+/*!
+	@defined KEV_VENDOR_APPLE
+	@discussion Apple-generated kernel events use the hard-coded vendor code
+		value of 1. Third party kernel events use a dynamically allocated vendor
+		code. The vendor code can be found using the SIOCGKEVVENDOR ioctl.
+*/
 #define KEV_VENDOR_APPLE	1
 
 /*
- * Definition of top-level classifications
+ * Definition of top-level classifications for KEV_VENDOR_APPLE
  */
-#define KEV_NETWORK_CLASS	1
-#define KEV_IOKIT_CLASS		2
-#define KEV_SYSTEM_CLASS	3
+/*!
+	@defined KEV_NETWORK_CLASS
+	@discussion Network kernel event class.
+*/
+#define KEV_NETWORK_CLASS	1
+
+/*!
+	@defined KEV_IOKIT_CLASS
+	@discussion IOKit kernel event class.
+*/
+#define KEV_IOKIT_CLASS		2
+
+/*!
+	@defined KEV_SYSTEM_CLASS
+	@discussion System kernel event class.
+*/
+#define KEV_SYSTEM_CLASS	3
+/*!
+	@defined KEV_APPLESHARE_CLASS
+	@discussion AppleShare kernel event class.
+*/
+#define KEV_APPLESHARE_CLASS	4
+/*!
+	@struct kern_event_msg
+	@discussion This structure is prepended to all kernel events. This structure
+		is used to determine the format of the remainder of the kernel event.
+		This structure will appear on all messages received on a kernel event
+		socket. To post a kernel event, a slightly different structure is used.
+	@field total_size Total size of the kernel event message including the
+		header.
+	@field vendor_code The vendor code indicates which vendor generated the
+		kernel event. This gives every vendor a unique set of classes and
+		subclasses to use. Use the SIOCGKEVVENDOR ioctl to look up vendor codes
+		for vendors other than Apple. Apple uses KEV_VENDOR_APPLE.
+	@field kev_class The class of the kernel event.
+	@field kev_subclass The subclass of the kernel event.
+	@field id Monotonically increasing value.
+	@field event_code The event code.
+	@field event_data Any additional data about this event. Format will depend
+		on the vendor_code, kev_class, kev_subclass, and event_code. The length
+		of the event_data can be determined using total_size -
+		KEV_MSG_HEADER_SIZE.
+*/
 struct kern_event_msg {
-	u_long	total_size;	/* Size of entire event msg */
-	u_long	vendor_code;	/* For non-Apple extensibility */
-	u_long	kev_class;	/* Layer of event source */
-	u_long	kev_subclass;	/* Component within layer */
-	u_long	id;		/* Monotonically increasing value */
-	u_long	event_code;	/* unique code */
-	u_long	event_data[1];	/* One or more data longwords */
+	u_long	total_size;	/* Size of entire event msg */
+	u_long	vendor_code;	/* For non-Apple extensibility */
+	u_long	kev_class;	/* Layer of event source */
+	u_long	kev_subclass;	/* Component within layer */
+	u_long	id;		/* Monotonically increasing value */
+	u_long	event_code;	/* unique code */
+	u_long	event_data[1];	/* One or more data longwords */
 };
 
-#define KEV_MSG_HEADER_SIZE (6 * sizeof(u_long))
-
+/*!
+	@defined KEV_MSG_HEADER_SIZE
+	@discussion Size of the header portion of the kern_event_msg structure. This
+		accounts for everything right up to event_data. The size of the data can
+		be found by subtracting KEV_MSG_HEADER_SIZE from the total size from the
+		kern_event_msg.
+*/
+#define KEV_MSG_HEADER_SIZE (offsetof(struct kern_event_msg, event_data[0]))
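As a point of reference, a user-space client would typically consume these messages through a PF_SYSTEM socket. The sketch below is illustrative only; it assumes SYSPROTO_EVENT from <sys/sys_domain.h> and the KEV_ANY_* wildcards referenced in the filter documentation that follows:

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <sys/ioctl.h>
    #include <sys/sys_domain.h>
    #include <sys/kern_event.h>
    #include <unistd.h>

    int
    watch_network_events(void)
    {
        struct kev_request req;
        char buf[1024];
        int fd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);

        if (fd < 0)
            return -1;

        /* Only Apple network-class events; any subclass. */
        req.vendor_code  = KEV_VENDOR_APPLE;
        req.kev_class    = KEV_NETWORK_CLASS;
        req.kev_subclass = KEV_ANY_SUBCLASS;
        if (ioctl(fd, SIOCSKEVFILT, &req) < 0) {
            close(fd);
            return -1;
        }

        for (;;) {
            struct kern_event_msg *ev = (struct kern_event_msg *)buf;
            ssize_t n = recv(fd, buf, sizeof(buf), 0);
            if (n < (ssize_t)KEV_MSG_HEADER_SIZE)
                break;
            /* ev->event_data holds total_size - KEV_MSG_HEADER_SIZE bytes */
        }
        close(fd);
        return 0;
    }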
+/*!
+	@struct kev_request
+	@discussion This structure is used with the SIOCSKEVFILT and SIOCGKEVFILT
+		ioctls to set and get the event filter setting for a kernel event socket.
+	@field vendor_code All kernel events that don't match this vendor code will
+		be ignored. KEV_ANY_VENDOR can be used to receive kernel events with any
+		vendor code.
+	@field kev_class All kernel events that don't match this class will be
+		ignored. KEV_ANY_CLASS can be used to receive kernel events with any
+		class.
+	@field kev_subclass All kernel events that don't match this subclass will be
+		ignored. KEV_ANY_SUBCLASS can be used to receive kernel events with any
+		subclass.
+*/
 struct kev_request {
 	u_long	vendor_code;
 	u_long	kev_class;
 	u_long	kev_subclass;
 };
 
+/*!
+	@defined KEV_VENDOR_CODE_MAX_STR_LEN
+	@discussion This define sets the maximum length of a string that can be used
+		to identify a vendor or kext when looking up a vendor code.
+*/
+#define KEV_VENDOR_CODE_MAX_STR_LEN	200
+
+/*!
+	@struct kev_vendor_code
+	@discussion This structure is used with the SIOCGKEVVENDOR ioctl to convert
+		from a string identifying a kext or vendor, in the form of a bundle
+		identifier, to a vendor code.
+	@field vendor_code After making the SIOCGKEVVENDOR ioctl call, this will
+		be filled in with the vendor code if there is one.
+	@field vendor_string A bundle style identifier.
+*/
+struct kev_vendor_code {
+	u_long	vendor_code;
+	char	vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN];
+};
+
+
+/*!
+	@defined SIOCGKEVID
+	@discussion Retrieve the current event id. Each event generated will have
+		a new id. The next event to be generated will have an id of id+1.
+*/
 #define SIOCGKEVID	_IOR('e', 1, u_long)
+
+/*!
+	@defined SIOCSKEVFILT
+	@discussion Set the kernel event filter for this socket. Kernel events not
+		matching this filter will not be received on this socket.
+*/
 #define SIOCSKEVFILT	_IOW('e', 2, struct kev_request)
+
+/*!
+	@defined SIOCGKEVFILT
+	@discussion Retrieve the kernel event filter for this socket. Kernel events
+		not matching this filter will not be received on this socket.
+*/
 #define SIOCGKEVFILT	_IOR('e', 3, struct kev_request)
 
-#ifdef KERNEL
-#ifdef __APPLE_API_UNSTABLE
+/*!
+	@defined SIOCGKEVVENDOR
+	@discussion Lookup the vendor code for the specified vendor. ENOENT will be
+		returned if a vendor code for that vendor string does not exist.
+*/
+#define SIOCGKEVVENDOR	_IOWR('e', 4, struct kev_vendor_code)
 
+#ifdef KERNEL
+/*!
+	@define N_KEV_VECTORS
+	@discussion The maximum number of kev_d_vectors for a kernel event.
+*/
 #define N_KEV_VECTORS	5
 
+/*!
+	@struct kev_d_vectors
+	@discussion This structure is used to append some data to a kernel event.
+	@field data_length The length of data.
+	@field data_ptr A pointer to data.
+*/
 struct kev_d_vectors {
-	u_long	data_length;	/* Length of the event data */
	void	*data_ptr;	/* Pointer to event data */
-};
-
+};
 
+/*!
+	@struct kev_msg
+	@discussion This structure is used when posting a kernel event.
+	@field vendor_code The vendor code assigned by kev_vendor_code_find.
+	@field kev_class The event's class.
+	@field kev_subclass The event's subclass.
+	@field event_code The event's code.
+	@field dv An array of vectors describing additional data to be appended to
+		the kernel event.
+*/
 struct kev_msg {
 	u_long	vendor_code;	/* For non-Apple extensibility */
 	u_long	kev_class;	/* Layer of event source */
@@ -97,10 +231,38 @@ struct kev_msg {
 	struct kev_d_vectors	dv[N_KEV_VECTORS];	/* Up to n data vectors */
 };
 
-int	kev_post_msg(struct kev_msg *event);
+/*!
+	@function kev_vendor_code_find
+	@discussion Lookup a vendor_code given a unique string. If the vendor code
+		has not been used since launch, a unique integer will be assigned for
+		that string. Vendor codes will remain the same until the machine is
+		rebooted.
+	@param vendor_string A bundle style vendor identifier (e.g. com.apple).
+	@param vendor_code Upon return, a unique vendor code for use when posting
+		kernel events.
+	@result May return ENOMEM if memory constraints prevent allocation of a new
+		vendor code.
+ */
+errno_t	kev_vendor_code_find(const char *vendor_string, u_long *vendor_code);
+
+/*!
+	@function kev_msg_post
+	@discussion Post a kernel event message.
+	@param event_msg A structure defining the kernel event message to post.
+	@result Will return zero upon success. May return a number of errors
+		depending on the type of failure. EINVAL indicates that there was
+		something wrong with the kernel event. The vendor code of the kernel
+		event must be assigned using kev_vendor_code_find. If the message is
+		too large, EMSGSIZE will be returned.
+ */
+errno_t	kev_msg_post(struct kev_msg *event_msg);
 
-#endif /* ___APPLE_API_UNSTABLE */
-#ifdef __APPLE_API_PRIVATE
+#ifdef PRIVATE
+/*
+ * Internal version of kev_post_msg. Allows posting Apple vendor code kernel
+ * events.
+ */
+int	kev_post_msg(struct kev_msg *event);
 
 LIST_HEAD(kern_event_head, kern_event_pcb);
 
@@ -114,7 +276,7 @@ struct kern_event_pcb {
 
 #define sotoevpcb(so)	((struct kern_event_pcb *)((so)->so_pcb))
 
-#endif /* __APPLE_API_PRIVATE */
-#endif
-#endif
+#endif /* PRIVATE */
+#endif /* KERNEL */
+#endif /* SYS_KERN_EVENT_H */
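To illustrate the posting side, a sketch of how a kext might obtain a vendor code and post an event using the two functions above. The bundle id, subclass, code, and payload are hypothetical, and the field names follow the kev_msg documentation above:

    #include <sys/kern_event.h>

    static errno_t
    post_example_event(void)
    {
        u_long vendor;
        errno_t err;
        struct kev_msg ev;
        u_int32_t payload = 42;             /* arbitrary example data */

        err = kev_vendor_code_find("com.example.driver", &vendor);
        if (err != 0)
            return err;

        bzero(&ev, sizeof(ev));
        ev.vendor_code  = vendor;
        ev.kev_class    = KEV_NETWORK_CLASS;    /* example class */
        ev.kev_subclass = 1;                    /* vendor-defined subclass */
        ev.event_code   = 1;                    /* vendor-defined code */
        ev.dv[0].data_length = sizeof(payload); /* dv[1] left zeroed ends the list */
        ev.dv[0].data_ptr    = &payload;

        return kev_msg_post(&ev);
    }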
diff --git a/bsd/sys/kernel.h b/bsd/sys/kernel.h
index 420f56e2d..c7a2c0b77 100644
--- a/bsd/sys/kernel.h
+++ b/bsd/sys/kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
@@ -68,7 +68,7 @@
 
 #include <sys/param.h>
 
-#ifdef __APPLE_API_PRIVATE
+#ifdef BSD_KERNEL_PRIVATE
 /* Global variables for the kernel. */
 
 /* 1.1 */
@@ -79,19 +79,20 @@ extern char domainname[MAXHOSTNAMELEN];
 extern int domainnamelen;
 
 /* 1.2 */
-extern struct timeval boottime;
 
-#ifdef __APPLE_API_OBSOLETE
-extern volatile struct timeval time;
-extern struct timezone tz;			/* XXX */
+extern int stathz;			/* statistics clock's frequency */
+extern int profhz;			/* profiling clock's frequency */
+#endif /* BSD_KERNEL_PRIVATE */
+
+
+#ifdef KERNEL_PRIVATE
 
 extern int lbolt;			/* once a second sleep address */
+
+extern struct timezone tz;		/* XXX */
+
 extern int tick;			/* usec per tick (1000000 / hz) */
 extern int hz;				/* system clock's frequency */
-extern int stathz;			/* statistics clock's frequency */
-extern int profhz;			/* profiling clock's frequency */
-#endif /* __APPLE_API_OBSOLETE */
-
-#endif /* __APPLE_API_PRIVATE */
+#endif /* KERNEL_PRIVATE */
 
 #endif /* KERNEL */
diff --git a/bsd/sys/kernel_types.h b/bsd/sys/kernel_types.h
new file mode 100644
index 000000000..89eb0ed0b
--- /dev/null
+++ b/bsd/sys/kernel_types.h
@@ -0,0 +1,127 @@
+#ifndef _KERN_SYS_KERNELTYPES_H_
+#define _KERN_SYS_KERNELTYPES_H_
+
+#include <sys/cdefs.h>
+#include <sys/types.h>
+#include <stdint.h>
+
+#ifdef BSD_BUILD
+/* Macros to clear/set/test flags. */
+#define SET(t, f)	(t) |= (f)
+#define CLR(t, f)	(t) &= ~(f)
+#define ISSET(t, f)	((t) & (f))
+#endif
+
+
+typedef int errno_t;
+typedef int64_t daddr64_t;
+
+typedef int64_t ino64_t;
+
+#ifndef BSD_BUILD
+struct buf;
+typedef struct buf * buf_t;
+
+struct file;
+typedef struct file * file_t;
+
+struct ucred;
+typedef struct ucred * ucred_t;
+
+struct mount;
+typedef struct mount * mount_t;
+
+#ifdef TBDDDD
+typedef struct fsid { int32_t val[2]; } fsid_t;	/* file system id type */
+#endif /* TBDDDD */
+
+struct vnode;
+typedef struct vnode * vnode_t;
+
+struct proc;
+typedef struct proc * proc_t;
+
+struct uio;
+typedef struct uio * uio_t;
+
+struct vfs_context;
+typedef struct vfs_context * vfs_context_t;
+
+struct vfstable;
+typedef struct vfstable * vfstable_t;
+
+struct __ifnet;
+struct __mbuf;
+struct __pkthdr;
+struct __socket;
+struct __sockopt;
+struct __ifaddr;
+struct __ifmultiaddr;
+struct __ifnet_filter;
+struct __rtentry;
+
+typedef struct __ifnet*			ifnet_t;
+typedef struct __mbuf*			mbuf_t;
+typedef struct __pkthdr*		pkthdr_t;
+typedef struct __socket*		socket_t;
+typedef struct __sockopt*		sockopt_t;
+typedef struct __ifaddr*		ifaddr_t;
+typedef struct __ifmultiaddr*	ifmultiaddr_t;
+typedef struct __ifnet_filter*	interface_filter_t;
+typedef struct __rtentry*		route_t;
+
+#else /* BSD_BUILD */
+
+typedef struct buf * buf_t;
+typedef struct file * file_t;
+typedef struct ucred * ucred_t;
+typedef struct mount * mount_t;
+typedef struct vnode * vnode_t;
+typedef struct proc * proc_t;
+typedef struct uio * uio_t;
+typedef struct user_iovec * user_iovec_t;
+typedef struct vfs_context * vfs_context_t;
+typedef struct vfstable * vfstable_t;
+
+#if KERNEL_PRIVATE
+typedef struct kern_iovec * kern_iovec_t;
+typedef struct ifnet*			ifnet_t;
+typedef struct mbuf*			mbuf_t;
+typedef struct pkthdr*			pkthdr_t;
+typedef struct socket*			socket_t;
+typedef struct sockopt*			sockopt_t;
+typedef struct ifaddr*			ifaddr_t;
+typedef struct ifmultiaddr*		ifmultiaddr_t;
+typedef struct ifnet_filter*	interface_filter_t;
+typedef struct rtentry*			route_t;
+#endif /* KERNEL_PRIVATE */
+
+#endif /* !BSD_BUILD */
+
+#ifndef _KAUTH_GUID
+#define _KAUTH_GUID
+/* Apple-style globally unique identifier */
+typedef struct {
+#define KAUTH_GUID_SIZE 16	/* 128-bit identifier */
+	unsigned char g_guid[KAUTH_GUID_SIZE];
+} guid_t;
+#define _GUID_T
+#endif /* _KAUTH_GUID */
+
+#ifndef _KAUTH_ACE
+#define _KAUTH_ACE
+struct kauth_ace;
+typedef struct kauth_ace * kauth_ace_t;
+#endif
+#ifndef _KAUTH_ACL
+#define _KAUTH_ACL
+struct kauth_acl;
+typedef struct kauth_acl * kauth_acl_t;
+#endif
+#ifndef _KAUTH_FILESEC
+#define _KAUTH_FILESEC
+struct kauth_filesec;
+typedef struct kauth_filesec * kauth_filesec_t;
+#endif
+
+#endif /* !_KERN_SYS_KERNELTYPES_H_ */
diff --git a/bsd/sys/kpi_mbuf.h b/bsd/sys/kpi_mbuf.h
new file mode 100644
index 000000000..5a1d26d8e
--- /dev/null
+++ b/bsd/sys/kpi_mbuf.h
@@ -0,0 +1,1127 @@
+/*
+ * Copyright (c) 2004 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+/*!
+	@header kpi_mbuf.h
+	This header defines an API for interacting with mbufs. mbufs are the
+	primary method of storing packets in the networking stack.
+
+	mbufs are used to store various items in the networking stack. The
+	most common usage of an mbuf is to store a packet or data on a
+	socket waiting to be sent or received. The mbuf is a contiguous
+	structure with some header followed by some data. To store more data
+	than would fit in an mbuf, external data is used. Most mbufs with
+	external data use clusters to store the external data.
+
+	mbufs can be chained; contiguous data in a packet can be found by
+	following the m_next chain. Packets may be bundled together using
+	m_nextpacket. Many parts of the stack do not properly handle chains
+	of packets. When in doubt, don't chain packets.
+ */
+
+#ifndef __KPI_MBUF__
+#define __KPI_MBUF__
+#include <sys/kernel_types.h>
+#include <mach/vm_types.h>
+
+/*!
+	@enum mbuf_flags_t
+	@abstract Constants defining mbuf flags. Only the flags listed below
+		can be set or retrieved.
+	@constant MBUF_EXT Indicates this mbuf has external data.
+	@constant MBUF_PKTHDR Indicates this mbuf has a packet header.
+	@constant MBUF_EOR Indicates this mbuf is the end of a record.
+	@constant MBUF_BCAST Indicates this packet will be sent or was
+		received as a broadcast.
+	@constant MBUF_MCAST Indicates this packet will be sent or was
+		received as a multicast.
+	@constant MBUF_FRAG Indicates this packet is a fragment of a larger
+		packet.
+	@constant MBUF_FIRSTFRAG Indicates this packet is the first fragment.
+	@constant MBUF_LASTFRAG Indicates this packet is the last fragment.
+	@constant MBUF_PROMISC Indicates this packet was only received
+		because the interface is in promiscuous mode. This should be set
+		by the demux function. These packets will be discarded after
+		being passed to any interface filters.
+*/
+enum {
+	MBUF_EXT	= 0x0001,	/* has associated external storage */
+	MBUF_PKTHDR	= 0x0002,	/* start of record */
+	MBUF_EOR	= 0x0004,	/* end of record */
+
+	MBUF_BCAST	= 0x0100,	/* send/received as link-level broadcast */
+	MBUF_MCAST	= 0x0200,	/* send/received as link-level multicast */
+	MBUF_FRAG	= 0x0400,	/* packet is a fragment of a larger packet */
+	MBUF_FIRSTFRAG	= 0x0800,	/* packet is first fragment */
+	MBUF_LASTFRAG	= 0x1000,	/* packet is last fragment */
+	MBUF_PROMISC	= 0x2000	/* packet is promiscuous */
+};
+typedef u_int32_t mbuf_flags_t;
+
+/*!
+	@enum mbuf_type_t
+	@abstract Types of mbufs.
+	@discussion Some mbufs represent packets, some represent data waiting
+		on sockets. Other mbufs store control data or other various
+		structures. The mbuf type is used to store what sort of data the
+		mbuf contains.
+	@constant MBUF_TYPE_FREE Indicates the mbuf is free and is
+		sitting on the queue of free mbufs. If you find that an mbuf you
+		have a reference to has this type, something has gone terribly
+		wrong.
+	@constant MBUF_TYPE_DATA Indicates this mbuf is being used to store
+		data.
+	@constant MBUF_TYPE_HEADER Indicates this mbuf has a packet header,
+		this is probably a packet.
+	@constant MBUF_TYPE_SOCKET Socket structure.
+	@constant MBUF_TYPE_PCB Protocol control block.
+	@constant MBUF_TYPE_RTABLE Routing table entry.
+	@constant MBUF_TYPE_HTABLE IMP host tables.
+	@constant MBUF_TYPE_ATABLE Address resolution table data.
+	@constant MBUF_TYPE_SONAME Socket name, usually a sockaddr of some
+		sort.
+	@constant MBUF_TYPE_SOOPTS Socket options.
+	@constant MBUF_TYPE_FTABLE Fragment reassembly header.
+	@constant MBUF_TYPE_RIGHTS Access rights.
+	@constant MBUF_TYPE_IFADDR Interface address.
+	@constant MBUF_TYPE_CONTROL Extra-data protocol message (control
+		message).
+	@constant MBUF_TYPE_OOBDATA Out of band data.
+*/
+enum {
+	MBUF_TYPE_FREE		= 0,	/* should be on free list */
+	MBUF_TYPE_DATA		= 1,	/* dynamic (data) allocation */
+	MBUF_TYPE_HEADER	= 2,	/* packet header */
+	MBUF_TYPE_SOCKET	= 3,	/* socket structure */
+	MBUF_TYPE_PCB		= 4,	/* protocol control block */
+	MBUF_TYPE_RTABLE	= 5,	/* routing tables */
+	MBUF_TYPE_HTABLE	= 6,	/* IMP host tables */
+	MBUF_TYPE_ATABLE	= 7,	/* address resolution tables */
+	MBUF_TYPE_SONAME	= 8,	/* socket name */
+	MBUF_TYPE_SOOPTS	= 10,	/* socket options */
+	MBUF_TYPE_FTABLE	= 11,	/* fragment reassembly header */
+	MBUF_TYPE_RIGHTS	= 12,	/* access rights */
+	MBUF_TYPE_IFADDR	= 13,	/* interface address */
+	MBUF_TYPE_CONTROL	= 14,	/* extra-data protocol message */
+	MBUF_TYPE_OOBDATA	= 15	/* expedited data */
+};
+typedef u_int32_t mbuf_type_t;
+
+/*!
+	@enum mbuf_csum_request_flags_t
+	@abstract Checksum performed/requested flags.
+	@discussion Mbufs often contain packets. Some hardware supports
+		performing checksums in hardware. The stack uses these flags to
+		indicate to the driver what sort of checksumming should be
+		handled by the driver/hardware. These flags will only be set
+		if the driver indicates that it supports the corresponding
+		checksums using ifnet_set_offload.
+	@constant MBUF_CSUM_REQ_IP Indicates the IP checksum has not been
+		calculated yet.
+	@constant MBUF_CSUM_REQ_TCP Indicates the TCP checksum has not been
+		calculated yet.
+	@constant MBUF_CSUM_REQ_UDP Indicates the UDP checksum has not been
+		calculated yet.
+*/
+enum {
+#ifdef KERNEL_PRIVATE
+	MBUF_CSUM_REQ_SUM16	= 0x1000,	/* Weird apple hardware checksum */
+#endif /* KERNEL_PRIVATE */
+	MBUF_CSUM_REQ_IP	= 0x0001,
+	MBUF_CSUM_REQ_TCP	= 0x0002,
+	MBUF_CSUM_REQ_UDP	= 0x0004
+};
+typedef u_int32_t mbuf_csum_request_flags_t;
+
+/*!
+	@enum mbuf_csum_performed_flags_t
+	@abstract Checksum performed/requested flags.
+	@discussion Mbufs often contain packets. Some hardware supports
+		performing checksums in hardware. The driver uses these flags to
+		communicate to the stack the checksums that were calculated in
+		hardware.
+	@constant MBUF_CSUM_DID_IP Indicates that the driver/hardware verified
+		the IP checksum in hardware.
+	@constant MBUF_CSUM_IP_GOOD Indicates whether or not the IP checksum
+		was good or bad. Only valid when MBUF_CSUM_DID_IP is set.
+	@constant MBUF_CSUM_DID_DATA Indicates that the TCP or UDP checksum
+		was calculated. The value for the checksum calculated in
+		hardware should be passed as the second parameter of
+		mbuf_set_csum_performed. The hardware calculated checksum value
+		can be retrieved using the second parameter passed to
+		mbuf_get_csum_performed.
+	@constant MBUF_CSUM_PSEUDO_HDR If set, this indicates that the
+		checksum value for MBUF_CSUM_DID_DATA includes the pseudo header
+		value. If this is not set, the stack will calculate the pseudo
+		header value and add that to the checksum.
	The value of this bit
+		is only valid when MBUF_CSUM_DID_DATA is set.
+*/
+enum {
+#ifdef KERNEL_PRIVATE
+	MBUF_CSUM_TCP_SUM16	= MBUF_CSUM_REQ_SUM16,	/* Weird apple hardware checksum */
+#endif
+	MBUF_CSUM_DID_IP	= 0x0100,
+	MBUF_CSUM_IP_GOOD	= 0x0200,
+	MBUF_CSUM_DID_DATA	= 0x0400,
+	MBUF_CSUM_PSEUDO_HDR	= 0x0800
+};
+typedef u_int32_t mbuf_csum_performed_flags_t;
+
+/*!
+	@enum mbuf_how_t
+	@abstract Method of allocating an mbuf.
+	@discussion Blocking will cause the funnel to be dropped. If the
+		funnel is dropped, other threads may make changes to networking
+		data structures. This can lead to very bad things happening.
+		Blocking on the input or output path can also impact
+		performance. There are some cases where making a blocking call
+		is acceptable. When in doubt, use MBUF_DONTWAIT.
+	@constant MBUF_WAITOK Allow a call to allocate an mbuf to block.
+	@constant MBUF_DONTWAIT Don't allow the mbuf allocation call to
+		block; if blocking is necessary, fail and return immediately.
+*/
+enum {
+	MBUF_WAITOK	= 0,	/* Ok to block to get memory */
+	MBUF_DONTWAIT	= 1	/* Don't block, fail if blocking would be required */
+};
+typedef u_int32_t mbuf_how_t;
+
+typedef u_int32_t mbuf_tag_id_t;
+typedef u_int16_t mbuf_tag_type_t;
+
+/*!
+	@struct mbuf_stat
+	@discussion The mbuf_stat contains mbuf statistics.
+	@field mbufs Number of mbufs (free or otherwise).
+	@field clusters Number of clusters (free or otherwise).
+	@field clfree Number of free clusters.
+	@field drops Number of times allocation failed.
+	@field wait Number of times allocation blocked.
+	@field drain Number of times protocol drain functions were called.
+	@field mtypes An array of counts of each type of mbuf allocated.
+	@field mcfail Number of times m_copym failed.
+	@field mpfail Number of times m_pullup failed.
+	@field msize Length of an mbuf.
+	@field mclbytes Length of an mbuf cluster.
+	@field minclsize Minimum length of data to allocate a cluster.
+		Anything smaller than this should be placed in chained mbufs.
+	@field mlen Length of data in an mbuf.
+	@field mhlen Length of data in an mbuf with a packet header.
+	@field bigclusters Number of big clusters.
+	@field bigclfree Number of unused big clusters.
+	@field bigmclbytes Length of a big mbuf cluster.
+*/
+struct mbuf_stat {
+	u_long	mbufs;		/* mbufs obtained from page pool */
+	u_long	clusters;	/* clusters obtained from page pool */
+	u_long	clfree;		/* free clusters */
+	u_long	drops;		/* times failed to find space */
+	u_long	wait;		/* times waited for space */
+	u_long	drain;		/* times drained protocols for space */
+	u_short	mtypes[256];	/* type specific mbuf allocations */
+	u_long	mcfail;		/* times m_copym failed */
+	u_long	mpfail;		/* times m_pullup failed */
+	u_long	msize;		/* length of an mbuf */
+	u_long	mclbytes;	/* length of an mbuf cluster */
+	u_long	minclsize;	/* min length of data to allocate a cluster */
+	u_long	mlen;		/* length of data in an mbuf */
+	u_long	mhlen;		/* length of data in a header mbuf */
+	u_long	bigclusters;	/* number of big clusters */
+	u_long	bigclfree;	/* number of big clusters free */
+	u_long	bigmclbytes;	/* length of data in a big cluster */
+};
+
+/* Parameter for m_copym to copy all bytes */
+#define MBUF_COPYALL	1000000000
+
+/* Data access */
+/*!
+	@function mbuf_data
+	@discussion Returns a pointer to the start of data in this mbuf.
+		There may be additional data on chained mbufs. The data you're
+		looking for may not be contiguous if it spans more than one
+		mbuf.
	Use mbuf_len to determine the length of data available in
+		this mbuf. If a data structure you want to access straddles two
+		mbufs in a chain, either use mbuf_pullup to get the data
+		contiguous in one mbuf or copy the pieces of data from each mbuf
+		into a contiguous buffer. Using mbuf_pullup has the advantage
+		of not having to copy the data. On the other hand, if you don't
+		make sure there is space in the mbuf, mbuf_pullup may fail and
+		free the mbuf.
+	@param mbuf The mbuf.
+	@result A pointer to the data in the mbuf.
+ */
+void*		mbuf_data(mbuf_t mbuf);
+
+/*!
+	@function mbuf_datastart
+	@discussion Returns the start of the space set aside for storing
+		data in an mbuf. An mbuf's data may come from a cluster or be
+		embedded in the mbuf structure itself. The data pointer
+		retrieved by mbuf_data may not be at the start of the data
+		(mbuf_leadingspace will be non-zero). This function will return to
+		you a pointer that matches mbuf_data() - mbuf_leadingspace().
+	@param mbuf The mbuf.
+	@result A pointer to the smallest possible value for data.
+ */
+void*		mbuf_datastart(mbuf_t mbuf);
+
+/*!
+	@function mbuf_setdata
+	@discussion Sets the data and length values for an mbuf. The data
+		value must be in a valid range. In the case of an mbuf with a cluster,
+		the data value must point to a location in the cluster and the data
+		value plus the length must not extend past the end of the cluster. For
+		data embedded directly in an mbuf (no cluster), the data value must
+		fall somewhere between the start and end of the data area in the
+		mbuf and the data + length must also be in the same range.
+	@param mbuf The mbuf.
+	@param data The new pointer value for data.
+	@param len The new length of data in the mbuf.
+	@result 0 on success, errno error on failure.
+ */
+errno_t		mbuf_setdata(mbuf_t mbuf, void *data, size_t len);
+
+/*!
+	@function mbuf_align_32
+	@discussion mbuf_align_32 is a replacement for M_ALIGN and MH_ALIGN.
+		mbuf_align_32 will set the data pointer to a location aligned on
+		a four-byte boundary with at least 'len' bytes between the data
+		pointer and the end of the data block.
+	@param mbuf The mbuf.
+	@param len The minimum length of space that should follow the new
+		data location.
+	@result 0 on success, errno error on failure.
+ */
+errno_t		mbuf_align_32(mbuf_t mbuf, size_t len);
+
+/*!
+	@function mbuf_data_to_physical
+	@discussion mbuf_data_to_physical is a replacement for mcl_to_paddr.
+		Given a pointer returned from mbuf_data or mbuf_datastart,
+		mbuf_data_to_physical will return the physical address for that
+		block of data.
+	@param ptr A pointer to data stored in an mbuf.
+	@result The 64 bit physical address of the mbuf data or NULL if ptr
+		does not point to data stored in an mbuf.
+ */
+addr64_t	mbuf_data_to_physical(void* ptr);
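As a small illustration of the accessors above, a sketch that reads the first data byte of a single mbuf; the helper name is hypothetical, and mbuf_len is documented further below:

    #include <sys/kpi_mbuf.h>

    /* Returns the first byte of the mbuf's data, or 0 if it is empty. */
    static unsigned char
    first_byte(mbuf_t m)
    {
        unsigned char *p = (unsigned char *)mbuf_data(m);

        if (mbuf_len(m) == 0)
            return 0;
        /* Per the documentation above, mbuf_data() always satisfies:
         *   mbuf_data(m) == (char *)mbuf_datastart(m) + mbuf_leadingspace(m) */
        return p[0];
    }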
+
+
+/* Allocation */
+
+/*!
+	@function mbuf_get
+	@discussion Allocates an mbuf without a cluster for external data.
+	@param how Blocking or non-blocking.
+	@param type The type of the mbuf.
+	@param mbuf The mbuf.
+	@result 0 on success, errno error on failure.
+ */
+errno_t		mbuf_get(mbuf_how_t how, mbuf_type_t type, mbuf_t* mbuf);
+
+/*!
+	@function mbuf_gethdr
+	@discussion Allocates an mbuf without a cluster for external data.
+		Sets a flag to indicate there is a packet header and initializes
+		the packet header.
+	@param how Blocking or non-blocking.
+	@param type The type of the mbuf.
+	@param mbuf The mbuf.
+	@result 0 on success, errno error on failure.
+ */
+errno_t		mbuf_gethdr(mbuf_how_t how, mbuf_type_t type, mbuf_t* mbuf);
+
+
+/*!
+	@function mbuf_getcluster
+	@discussion Allocate a cluster of the requested size and attach it to
+		an mbuf for use as external data. If mbuf points to a NULL mbuf_t,
+		an mbuf will be allocated for you. If mbuf points to a non-NULL mbuf_t,
+		mbuf_getcluster may return a different mbuf_t than the one you
+		passed in.
+	@param how Blocking or non-blocking.
+	@param type The type of the mbuf.
+	@param size The size of the cluster to be allocated. Supported sizes for a
+		cluster are 2048 or 4096. Any other value will return EINVAL.
+	@param mbuf The mbuf the cluster will be attached to.
+	@result 0 on success, errno error on failure. If you specified NULL
+		for the mbuf, any intermediate mbuf that may have been allocated
+		will be freed. If you specify an mbuf value in *mbuf,
+		mbuf_getcluster will not free it.
+		EINVAL - Invalid parameter
+		ENOMEM - Not enough memory available
+ */
+errno_t		mbuf_getcluster(mbuf_how_t how, mbuf_type_t type, size_t size, mbuf_t* mbuf);
+
+/*!
+	@function mbuf_mclget
+	@discussion Allocate a cluster and attach it to an mbuf for use as
+		external data. If mbuf points to a NULL mbuf_t, an mbuf will be
+		allocated for you. If mbuf points to a non-NULL mbuf_t,
+		mbuf_mclget may return a different mbuf_t than the one you
+		passed in.
+	@param how Blocking or non-blocking.
+	@param type The type of the mbuf.
+	@param mbuf The mbuf the cluster will be attached to.
+	@result 0 on success, errno error on failure. If you specified NULL
+		for the mbuf, any intermediate mbuf that may have been allocated
+		will be freed. If you specify an mbuf value in *mbuf,
+		mbuf_mclget will not free it.
+ */
+errno_t		mbuf_mclget(mbuf_how_t how, mbuf_type_t type, mbuf_t* mbuf);
+
+/*!
+	@function mbuf_allocpacket
+	@discussion Allocate an mbuf chain to store a single packet of the requested length.
+		According to the requested length, a chain of mbufs will be created. The mbuf type
+		will be set to MBUF_TYPE_DATA. The caller may specify the maximum number of
+		buffer segments using maxchunks.
+	@param how Blocking or non-blocking
+	@param packetlen The total length of the packet mbuf to be allocated.
+		The length must be greater than zero.
+	@param maxchunks An input/output pointer to the maximum number of mbufs segments making up the chain.
+		On input if maxchunks is zero, or the value pointed to by maxchunks is zero,
+		the packet will be made of as many buffer segments as necessary to fit the length.
+		The allocation will fail with ENOBUFS if the number of segments requested is too small and
+		the sum of the maximum size of each individual segment is less than the packet length.
+		On output, if the allocation succeeds and maxchunks is non-zero, it will point to
+		the actual number of segments allocated.
+	@param mbuf Upon success, *mbuf will be a reference to the new mbuf.
+	@result Returns 0 upon success or the following error code:
+		EINVAL - Invalid parameter
+		ENOMEM - Not enough memory available
+		ENOBUFS - Buffers not big enough for the maximum number of chunks requested
+*/
+errno_t		mbuf_allocpacket(mbuf_how_t how, size_t packetlen, unsigned int * maxchunks, mbuf_t *mbuf);
+
+/*!
+	@function mbuf_getpacket
+	@discussion Allocate an mbuf, allocate and attach a cluster, and set
+		the packet header flag.
+	@param how Blocking or non-blocking.
+	@param mbuf Upon success, *mbuf will be a reference to the new mbuf.
+	@result 0 on success, errno error on failure.
+ */
+errno_t		mbuf_getpacket(mbuf_how_t how, mbuf_t* mbuf);
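A sketch of a typical allocation path using the calls above, assuming a payload that fits in a standard 2048-byte cluster; the helper name and error handling are illustrative:

    #include <sys/kpi_mbuf.h>

    /* Build a packet mbuf holding 'len' bytes copied from 'src'
     * (this sketch assumes len <= 2048, one cluster). */
    static errno_t
    make_packet(const void *src, size_t len, mbuf_t *out)
    {
        mbuf_t m = NULL;
        errno_t err;

        err = mbuf_getpacket(MBUF_DONTWAIT, &m);   /* mbuf + cluster + pkthdr */
        if (err != 0)
            return err;

        bcopy(src, mbuf_data(m), len);
        mbuf_setlen(m, len);            /* data length of this mbuf */
        mbuf_pkthdr_setlen(m, len);     /* total packet length (see below) */
        *out = m;
        return 0;
    }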
+
+/*!
+	@function mbuf_free
+	@discussion Frees a single mbuf. Not commonly used because it
+		doesn't touch the rest of the mbufs on the chain.
+	@param mbuf The mbuf to free.
+	@result The next mbuf in the chain.
+ */
+mbuf_t		mbuf_free(mbuf_t mbuf);
+
+/*!
+	@function mbuf_freem
+	@discussion Frees a chain of mbufs linked through m_next.
+	@param mbuf The first mbuf in the chain to free.
+ */
+void		mbuf_freem(mbuf_t mbuf);
+
+/*!
+	@function mbuf_freem_list
+	@discussion Frees a linked list of mbuf chains. Walks through
+		m_nextpacket and does the equivalent of mbuf_freem to each.
+	@param mbuf The first mbuf in the linked list to free.
+	@result The number of mbufs freed.
+ */
+int		mbuf_freem_list(mbuf_t mbuf);
+
+/*!
+	@function mbuf_leadingspace
+	@discussion Determines the space available in the mbuf preceding
+		the current data.
+	@param mbuf The mbuf.
+	@result The number of unused bytes at the start of the mbuf.
+ */
+size_t		mbuf_leadingspace(mbuf_t mbuf);
+
+/*!
+	@function mbuf_trailingspace
+	@discussion Determines the space available in the mbuf following
+		the current data.
+	@param mbuf The mbuf.
+	@result The number of unused bytes following the current data.
+ */
+size_t		mbuf_trailingspace(mbuf_t mbuf);
+
+/* Manipulation */
+
+/*!
+	@function mbuf_copym
+	@discussion Copies len bytes from offset from src to a new mbuf.
+	@param src The source mbuf.
+	@param offset The offset in the mbuf to start copying from.
+	@param len The number of bytes to copy.
+	@param how To block or not to block, that is a question.
+	@param new_mbuf Upon success, the newly allocated mbuf.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t		mbuf_copym(mbuf_t src, size_t offset, size_t len,
+				mbuf_how_t how, mbuf_t* new_mbuf);
+
+/*!
+	@function mbuf_dup
+	@discussion Exactly duplicates an mbuf chain.
+	@param src The source mbuf.
+	@param how Blocking or non-blocking.
+	@param new_mbuf Upon success, the newly allocated mbuf.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t		mbuf_dup(mbuf_t src, mbuf_how_t how, mbuf_t* new_mbuf);
+
+/*!
+	@function mbuf_prepend
+	@discussion Prepend len bytes to an mbuf. If there is space
+		(mbuf_leadingspace >= len), the mbuf's data ptr is changed and
+		the same mbuf is returned. If there is no space, a new mbuf may
+		be allocated and prepended to the mbuf chain. If the operation
+		fails, the mbuf may be freed (*mbuf will be NULL).
+	@param mbuf The mbuf to prepend data to. This may change if a new
+		mbuf must be allocated or may be NULL if the operation fails.
+	@param len The length, in bytes, to be prepended to the mbuf.
+	@param how Blocking or non-blocking.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t		mbuf_prepend(mbuf_t* mbuf, size_t len, mbuf_how_t how);
+
+/*!
+	@function mbuf_split
+	@discussion Split an mbuf chain at a specific offset.
+	@param src The mbuf to be split.
+	@param offset The offset in the buffer where the mbuf should be
+		split.
+	@param how Blocking or non-blocking.
+	@param new_mbuf Upon success, the second half of the split mbuf
+		chain.
+	@result 0 upon success otherwise the errno error. In the case of
+		failure, the original mbuf chain passed in to src will be
+		preserved.
+ */
+errno_t		mbuf_split(mbuf_t src, size_t offset,
+				mbuf_how_t how, mbuf_t* new_mbuf);
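For reference, a common pattern over chains built with these primitives, walking m_next to total the bytes in one packet (a sketch; the helper name is hypothetical, and mbuf_next is documented further below):

    #include <sys/kpi_mbuf.h>

    /* Sum of mbuf_len() over every mbuf in one packet's chain. */
    static size_t
    chain_length(mbuf_t m)
    {
        size_t total = 0;

        for (; m != NULL; m = mbuf_next(m))
            total += mbuf_len(m);
        return total;
    }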
+
+/*!
+	@function mbuf_pullup
+	@discussion Move the next len bytes into the mbuf from other mbufs in
+		the chain. This is commonly used to get the IP and TCP or UDP
+		header contiguous in the first mbuf. If mbuf_pullup fails, the
+		entire mbuf chain will be freed.
+	@param mbuf The mbuf in the chain the data should be contiguous in.
+	@param len The number of bytes to pull from the next mbuf(s).
+	@result 0 upon success otherwise the errno error. In the case of an
+		error, the mbuf chain has been freed.
+ */
+errno_t		mbuf_pullup(mbuf_t* mbuf, size_t len);
+
+/*!
+	@function mbuf_pulldown
+	@discussion Make length bytes at offset in the mbuf chain
+		contiguous. Nothing before offset bytes in the chain will be
+		modified. Upon return, location will be the mbuf the data is
+		contiguous in and offset will be the offset in that mbuf at
+		which the data is located. In the case of a failure, the mbuf
+		chain will be freed.
+	@param src The start of the mbuf chain.
+	@param offset Pass in a pointer to a value with the offset of the
+		data you're interested in making contiguous. Upon success, this
+		will be overwritten with the offset from the mbuf returned in
+		location.
+	@param length The length of data that should be made contiguous.
+	@param location Upon success, *location will be the mbuf the data is
+		in.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t		mbuf_pulldown(mbuf_t src, size_t *offset, size_t length, mbuf_t *location);
+
+/*!
+	@function mbuf_adj
+	@discussion Trims len bytes from the mbuf. If the length is greater
+		than zero, the bytes are trimmed from the front of the mbuf. If
+		the length is less than zero, the bytes are trimmed from the end
+		of the mbuf chain.
+	@param mbuf The mbuf chain to trim.
+	@param len The number of bytes to trim from the mbuf chain.
+ */
+void		mbuf_adj(mbuf_t mbuf, int len);
+
+/*!
+	@function mbuf_copydata
+	@discussion Copies data out of an mbuf into a specified buffer. If
+		the data is stored in a chain of mbufs, the data will be copied
+		from each mbuf in the chain until length bytes have been copied.
+	@param mbuf The mbuf chain to copy data out of.
+	@param offset The offset into the mbuf to start copying.
+	@param length The number of bytes to copy.
+	@param out_data A pointer to the location where the data will be
+		copied.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t		mbuf_copydata(mbuf_t mbuf, size_t offset, size_t length, void* out_data);
+
+/*!
+	@function mbuf_copyback
+	@discussion Copies data from a buffer to an mbuf chain.
+		mbuf_copyback will grow the chain to fit the specified buffer.
+
+		If mbuf_copyback is unable to allocate enough mbufs to grow the
+		chain, ENOBUFS will be returned. The mbuf chain will be shorter
+		than expected but all of the data up to the end of the mbuf
+		chain will be valid.
+
+		If an offset is specified, mbuf_copyback will skip that many
+		bytes in the mbuf chain before starting to write the buffer in
+		to the chain. If the mbuf chain does not contain this many
+		bytes, mbufs will be allocated to create the space.
+	@param mbuf The first mbuf in the chain to copy the data in to.
+	@param offset Offset in bytes to skip before copying data.
+	@param length The length, in bytes, of the data to copy in to the mbuf
+		chain.
+	@param data A pointer to data in the kernel's address space.
+	@param how Blocking or non-blocking.
+	@result 0 upon success, EINVAL or ENOBUFS upon failure.
+ */
+errno_t		mbuf_copyback(mbuf_t mbuf, size_t offset, size_t length,
+				const void *data, mbuf_how_t how);
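Tying mbuf_pullup and the copy routines together, a sketch of reading a fixed-size header that may straddle mbufs; struct ip comes from <netinet/ip.h> and the helper name is hypothetical:

    #include <sys/kpi_mbuf.h>
    #include <netinet/ip.h>

    static errno_t
    read_ip_header(mbuf_t *m, struct ip *out)
    {
        errno_t err;

        /* Make the first sizeof(struct ip) bytes contiguous. On failure
         * the chain has already been freed, so *m must not be touched. */
        err = mbuf_pullup(m, sizeof(struct ip));
        if (err != 0)
            return err;

        /* mbuf_copydata would also work here and never reshapes the chain. */
        bcopy(mbuf_data(*m), out, sizeof(struct ip));
        return 0;
    }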
+
+#ifdef KERNEL_PRIVATE
+/*!
+	@function mbuf_mclref
+	@discussion Increment the reference count of the cluster.
+	@param mbuf The mbuf with the cluster to increment the refcount of.
+	@result 0 upon success otherwise the errno error.
+ */
+int		mbuf_mclref(mbuf_t mbuf);
+
+/*!
+	@function mbuf_mclunref
+	@discussion Decrement the reference count of the cluster.
+	@param mbuf The mbuf with the cluster to decrement the refcount of.
+	@result 0 upon success otherwise the errno error.
+ */
+int		mbuf_mclunref(mbuf_t mbuf);
+#endif
+
+/*!
+	@function mbuf_mclhasreference
+	@discussion Check if a cluster of an mbuf is referenced by another mbuf.
+		References may be taken, for example, as a result of a call to
+		mbuf_split or mbuf_copym.
+	@param mbuf The mbuf with the cluster to test.
+	@result 0 if there is no reference by another mbuf, 1 otherwise.
+ */
+int		mbuf_mclhasreference(mbuf_t mbuf);
+
+
+/* mbuf header */
+
+/*!
+	@function mbuf_next
+	@discussion Returns the next mbuf in the chain.
+	@param mbuf The mbuf.
+	@result The next mbuf in the chain.
+ */
+mbuf_t		mbuf_next(mbuf_t mbuf);
+
+/*!
+	@function mbuf_setnext
+	@discussion Sets the next mbuf in the chain.
+	@param mbuf The mbuf.
+	@param next The new next mbuf.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t		mbuf_setnext(mbuf_t mbuf, mbuf_t next);
+
+/*!
+	@function mbuf_nextpkt
+	@discussion Gets the next packet from the mbuf.
+	@param mbuf The mbuf.
+	@result The next packet.
+ */
+mbuf_t		mbuf_nextpkt(mbuf_t mbuf);
+
+/*!
+	@function mbuf_setnextpkt
+	@discussion Sets the next packet attached to this mbuf.
+	@param mbuf The mbuf.
+	@param nextpkt The new next packet.
+ */
+void		mbuf_setnextpkt(mbuf_t mbuf, mbuf_t nextpkt);
+
+/*!
+	@function mbuf_len
+	@discussion Gets the length of data in this mbuf.
+	@param mbuf The mbuf.
+	@result The length.
+ */
+size_t		mbuf_len(mbuf_t mbuf);
+
+/*!
+	@function mbuf_setlen
+	@discussion Sets the length of data in this packet. Be careful to
+		not set the length over the space available in the mbuf.
+	@param mbuf The mbuf.
+	@param len The new length.
+ */
+void		mbuf_setlen(mbuf_t mbuf, size_t len);
+
+/*!
+	@function mbuf_maxlen
+	@discussion Retrieves the maximum length of data that may be stored
+		in this mbuf. This value assumes that the data pointer was set
+		to the start of the possible range for that pointer
+		(mbuf_datastart).
+	@param mbuf The mbuf.
+	@result The maximum length of data for this mbuf.
+ */
+size_t		mbuf_maxlen(mbuf_t mbuf);
+
+/*!
+	@function mbuf_type
+	@discussion Gets the type of mbuf.
+	@param mbuf The mbuf.
+	@result The type.
+ */
+mbuf_type_t	mbuf_type(mbuf_t mbuf);
+
+/*!
+	@function mbuf_settype
+	@discussion Sets the type of mbuf.
+	@param mbuf The mbuf.
+	@param new_type The new type.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t		mbuf_settype(mbuf_t mbuf, mbuf_type_t new_type);
+
+/*!
+	@function mbuf_flags
+	@discussion Returns the set flags.
+	@param mbuf The mbuf.
+	@result The flags.
+ */
+mbuf_flags_t	mbuf_flags(mbuf_t mbuf);
+
+/*!
+	@function mbuf_setflags
+	@discussion Sets the flags on the mbuf.
+	@param mbuf The mbuf.
+	@param flags The flags that should be set, all other flags will be cleared.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t		mbuf_setflags(mbuf_t mbuf, mbuf_flags_t flags);
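A one-flag illustration of mbuf_setflags_mask (documented just below), which avoids the explicit read-modify-write that mbuf_setflags would require; the helper name is hypothetical:

    #include <sys/kpi_mbuf.h>

    /* Set or clear MBUF_BCAST without disturbing the other flags. */
    static errno_t
    mark_broadcast(mbuf_t m, int is_bcast)
    {
        return mbuf_setflags_mask(m, is_bcast ? MBUF_BCAST : 0, MBUF_BCAST);
    }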
+
+/*!
+	@function mbuf_setflags_mask
+	@discussion Useful for setting or clearing individual flags. Easier
+		than calling mbuf_setflags(m, mbuf_flags(m) | M_FLAG).
+	@param mbuf The mbuf.
+	@param flags The flags that should be set or cleared.
+	@param mask The mask controlling which flags will be modified.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t		mbuf_setflags_mask(mbuf_t mbuf, mbuf_flags_t flags,
+				mbuf_flags_t mask);
+
+/*!
+	@function mbuf_copy_pkthdr
+	@discussion Copies the packet header from src to dest.
+	@param src The mbuf from which the packet header will be copied.
+	@param dest The mbuf to which the packet header will be copied.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t		mbuf_copy_pkthdr(mbuf_t dest, mbuf_t src);
+
+/*!
+	@function mbuf_pkthdr_len
+	@discussion Returns the length as reported by the packet header.
+	@param mbuf The mbuf containing the packet header whose length will
+		be returned.
+	@result The length, in bytes, of the packet.
+ */
+size_t		mbuf_pkthdr_len(mbuf_t mbuf);
+
+/*!
+	@function mbuf_pkthdr_setlen
+	@discussion Sets the length of the packet in the packet header.
+	@param mbuf The mbuf containing the packet header.
+	@param len The new length of the packet.
+ */
+void		mbuf_pkthdr_setlen(mbuf_t mbuf, size_t len);
+
+/*!
+	@function mbuf_pkthdr_rcvif
+	@discussion Returns a reference to the interface the packet was
+		received on. Increments the reference count of the interface
+		before returning. Caller is responsible for releasing
+		the reference by calling ifnet_release.
+	@param mbuf The mbuf containing the packet header.
+	@result A reference to the interface.
+ */
+ifnet_t		mbuf_pkthdr_rcvif(mbuf_t mbuf);
+
+/*!
+	@function mbuf_pkthdr_setrcvif
+	@discussion Sets the interface the packet was received on.
+	@param mbuf The mbuf containing the packet header.
+	@param ifnet A reference to an interface.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t		mbuf_pkthdr_setrcvif(mbuf_t mbuf, ifnet_t ifnet);
+
+/*!
+	@function mbuf_pkthdr_header
+	@discussion Returns a pointer to the packet header.
+	@param mbuf The mbuf containing the packet header.
+	@result A pointer to the packet header.
+ */
+void*		mbuf_pkthdr_header(mbuf_t mbuf);
+
+/*!
+	@function mbuf_pkthdr_setheader
+	@discussion Sets the pointer to the packet header.
+	@param mbuf The mbuf containing the packet header.
+	@param header A pointer to the header.
+ */
+void		mbuf_pkthdr_setheader(mbuf_t mbuf, void* header);
+#ifdef KERNEL_PRIVATE
+
+/* mbuf aux data */
+
+/*!
+	@function mbuf_aux_add
+	@discussion Adds auxiliary data in the form of an mbuf.
+	@param mbuf The mbuf to add aux data to.
+	@param family The protocol family of the aux data to add.
+	@param type The mbuf type of the aux data to add.
+	@param aux_mbuf The aux mbuf allocated for you.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t		mbuf_aux_add(mbuf_t mbuf, int family, mbuf_type_t type, mbuf_t *aux_mbuf);
+
+/*!
+	@function mbuf_aux_find
+	@discussion Finds auxiliary data attached to an mbuf.
+	@param mbuf The mbuf to find aux data on.
+	@param family The protocol family of the aux data to find.
+	@param type The mbuf type of the aux data to find.
+	@result The aux data mbuf or NULL if there isn't one.
+ */
+mbuf_t		mbuf_aux_find(mbuf_t mbuf, int family, mbuf_type_t type);
+
+/*!
+	@function mbuf_aux_delete
+	@discussion Free an mbuf used as aux data and disassociate it from
+		the mbuf.
+	@param mbuf The mbuf the aux data is attached to.
+	@param aux The aux data to free.
+ */
+void		mbuf_aux_delete(mbuf_t mbuf, mbuf_t aux);
+#endif /* KERNEL_PRIVATE */
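As a sketch of the packet-header accessors in use, trimming a packet to a new length; the helper name is hypothetical, and whether mbuf_adj already keeps the packet header length in sync is not stated here, so it is set explicitly (harmless either way):

    #include <sys/kpi_mbuf.h>

    /* Trim a packet to at most 'max' bytes. */
    static void
    truncate_packet(mbuf_t m, size_t max)
    {
        size_t len = mbuf_pkthdr_len(m);

        if (len > max) {
            /* A negative length trims from the end of the chain. */
            mbuf_adj(m, -(int)(len - max));
            mbuf_pkthdr_setlen(m, max);
        }
    }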
+
+/* Checksums */
+
+/*!
+	@function mbuf_inbound_modified
+	@discussion This function will clear the checksum flags to indicate
+		that a hardware checksum should not be used. Any filter
+		modifying data should call this function on an mbuf before
+		passing the packet up the stack. If a filter modifies a packet
+		in a way that affects any checksum, the filter is responsible
+		for either modifying the checksum to compensate for the changes
+		or verifying the checksum before making the changes and then
+		modifying the data and calculating a new checksum only if the
+		original checksum was valid.
+	@param mbuf The mbuf that has been modified.
+ */
+void		mbuf_inbound_modified(mbuf_t mbuf);
+
+/*!
+	@function mbuf_outbound_finalize
+	@discussion This function will "finalize" the packet allowing your
+		code to inspect the final packet.
+
+		There are a number of operations that are performed in hardware,
+		such as calculating checksums. This function will perform in
+		software the various operations that were scheduled to be done
+		in hardware. Future operations may include IPSec processing or
+		vlan support. If you are redirecting a packet to a new interface
+		which may not have the same hardware support or encapsulating
+		the packet, you should call this function to force the stack to
+		calculate and fill out the checksums. This will bypass hardware
+		checksums but give you a complete packet to work with. If you
+		need to inspect aspects of the packet which may be generated by
+		hardware, you must call this function to get an approximate final
+		packet. If you plan to modify the packet in any way, you should
+		call this function.
+
+		This function should be called before modifying any outbound
+		packets.
+
+		This function may be called at various levels, in some cases
+		additional headers may have already been prepended, such as the
+		case of a packet seen by an interface filter. To handle this,
+		the caller must pass the protocol family of the packet as well
+		as the offset from the start of the packet to the protocol
+		header.
+	@param mbuf The mbuf that should be finalized.
+	@param protocol_family The protocol family of the packet in the
+		mbuf.
+	@param protocol_offset The offset from the start of the mbuf to the
+		protocol header. For an IP packet with an ethernet header, this
+		would be the length of an ethernet header.
+ */
+void		mbuf_outbound_finalize(mbuf_t mbuf, u_long protocol_family,
+				size_t protocol_offset);
+
+/*!
+	@function mbuf_set_vlan_tag
+	@discussion This function is used by interfaces that support vlan
+		tagging in hardware. This function will set properties in the
+		mbuf to indicate which vlan the packet was received for.
+	@param mbuf The mbuf containing the packet.
+	@param vlan The vlan tag for this packet.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t		mbuf_set_vlan_tag(mbuf_t mbuf, u_int16_t vlan);
+
+/*!
+	@function mbuf_get_vlan_tag
+	@discussion This function is used by drivers that support hardware
+		vlan tagging to determine which vlan this packet belongs to. To
+		differentiate between the case where the vlan tag is zero and
+		the case where there is no vlan tag, this function will return
+		ENXIO when there is no vlan.
+	@param mbuf The mbuf containing the packet.
+	@param vlan Upon success, filled in with the vlan tag for this packet.
+	@result 0 upon success otherwise the errno error. ENXIO indicates
+		that the vlan tag is not set.
+ */
+errno_t		mbuf_get_vlan_tag(mbuf_t mbuf, u_int16_t *vlan);
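A driver-side sketch of the vlan accessors, distinguishing "no tag" from "tag of zero" via the ENXIO convention described above; the helper name is hypothetical:

    #include <sys/kpi_mbuf.h>
    #include <sys/errno.h>

    /* Returns 1 and fills *tag if the packet carries a vlan tag, else 0. */
    static int
    packet_vlan(mbuf_t m, u_int16_t *tag)
    {
        errno_t err = mbuf_get_vlan_tag(m, tag);

        if (err == ENXIO)
            return 0;       /* no vlan tag on this packet */
        return (err == 0);
    }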
+/*!
+	@function mbuf_clear_vlan_tag
+	@discussion This function will clear any vlan tag associated with
+		the mbuf.
+	@param mbuf The mbuf containing the packet.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t mbuf_clear_vlan_tag(mbuf_t mbuf);
+
+#ifdef KERNEL_PRIVATE
+/*!
+	@function mbuf_set_csum_requested
+	@discussion This function is used by the stack to indicate which
+		checksums should be calculated in hardware. The stack normally
+		sets these flags as the packet is processed in the outbound
+		direction. Just before sending the packet to the interface, the
+		stack will look at these flags and perform any checksums in
+		software that are not supported by the interface.
+	@param mbuf The mbuf containing the packet.
+	@param request Flags indicating which checksums are being requested
+		for this packet.
+	@param value This parameter is currently unsupported.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t mbuf_set_csum_requested(mbuf_t mbuf,
+				mbuf_csum_request_flags_t request, u_int32_t value);
+#endif /* KERNEL_PRIVATE */
+
+/*!
+	@function mbuf_get_csum_requested
+	@discussion This function is used by the driver to determine which
+		checksum operations should be performed in hardware.
+	@param mbuf The mbuf containing the packet.
+	@param request Upon return, flags indicating which checksums are
+		requested for this packet.
+	@param value This parameter is currently unsupported.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t mbuf_get_csum_requested(mbuf_t mbuf,
+				mbuf_csum_request_flags_t *request, u_int32_t *value);
+
+/*!
+	@function mbuf_clear_csum_requested
+	@discussion This function clears the checksum request flags.
+	@param mbuf The mbuf containing the packet.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t mbuf_clear_csum_requested(mbuf_t mbuf);
+
+/*!
+	@function mbuf_set_csum_performed
+	@discussion This is used by the driver to indicate to the stack which
+		checksum operations were performed in hardware.
+	@param mbuf The mbuf containing the packet.
+	@param flags Flags indicating which hardware checksum operations
+		were performed.
+	@param value If the MBUF_CSUM_DID_DATA flag is set, value should be
+		set to the value of the TCP or UDP checksum as calculated by
+		the hardware.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t mbuf_set_csum_performed(mbuf_t mbuf,
+				mbuf_csum_performed_flags_t flags, u_int32_t value);
+
+#ifdef KERNEL_PRIVATE
+/*!
+	@function mbuf_get_csum_performed
+	@discussion This is used by the stack to determine which checksums
+		were calculated in hardware on the inbound path.
+	@param mbuf The mbuf containing the packet.
+	@param flags Upon return, flags indicating which hardware checksum
+		operations were performed.
+	@param value If the MBUF_CSUM_DID_DATA flag is set, value will be
+		set to the value of the TCP or UDP checksum as calculated by
+		the hardware.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t mbuf_get_csum_performed(mbuf_t mbuf,
+				mbuf_csum_performed_flags_t *flags, u_int32_t *value);
+#endif /* KERNEL_PRIVATE */
+
+/*!
+	@function mbuf_clear_csum_performed
+	@discussion Clears the hardware checksum flags and values.
+	@param mbuf The mbuf containing the packet.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t mbuf_clear_csum_performed(mbuf_t mbuf);
+
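+/*
+ * Example: a driver's receive path reporting hardware checksum results
+ * to the stack. A minimal sketch; it assumes the controller verified
+ * the IP header checksum and computed the TCP/UDP checksum into
+ * hw_csum (a placeholder for a driver-specific value), and that the
+ * flag names follow the mbuf_csum_performed_flags_t constants defined
+ * earlier in this header.
+ *
+ *	mbuf_set_csum_performed(m,
+ *	    MBUF_CSUM_DID_IP | MBUF_CSUM_IP_GOOD | MBUF_CSUM_DID_DATA,
+ *	    hw_csum);
+ */
+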
+/* mbuf tags */
+
+/*!
+	@function mbuf_tag_id_find
+	@discussion Looks up the module id for a string. If there is no
+		module id assigned to this string, a new module id will be
+		assigned. The string should be the bundle id of the kext. In
+		the case of a tag that will be shared across multiple kexts, a
+		common bundle id style string should be used.
+
+		The lookup operation is not optimized. A module should call this
+		function once during startup and cache the module id. The module
+		id will not be reassigned until the machine reboots.
+	@param module_string A unique string identifying your module.
+		Example: com.apple.nke.SharedIP.
+	@param module_id Upon return, a unique identifier for use with
+		mbuf_tag_* functions. This identifier is valid until the machine
+		is rebooted.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t mbuf_tag_id_find(const char *module_string,
+			 mbuf_tag_id_t *module_id);
+
+/*!
+	@function mbuf_tag_allocate
+	@discussion Allocate an mbuf tag. Mbuf tags allow various portions
+		of the stack to tag mbufs with data that will travel with the
+		mbuf through the stack.
+
+		Tags may only be added to mbufs with packet headers
+		(MBUF_PKTHDR flag is set). Mbuf tags are freed when the mbuf is
+		freed or when mbuf_tag_free is called.
+	@param mbuf The mbuf to attach this tag to.
+	@param module_id A module identifier returned by mbuf_tag_id_find.
+	@param type A 16 bit type value. For a given module_id, you can use
+		a number of different tag types.
+	@param length The length, in bytes, to allocate for storage that
+		will be associated with this tag on this mbuf.
+	@param how Indicates whether you want to block and wait for memory
+		if memory is not immediately available.
+	@param data_p Upon successful return, *data_p will point to the
+		buffer allocated for the mtag.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t mbuf_tag_allocate(mbuf_t mbuf, mbuf_tag_id_t module_id,
+			  mbuf_tag_type_t type, size_t length,
+			  mbuf_how_t how, void** data_p);
+
+/*!
+	@function mbuf_tag_find
+	@discussion Find the data associated with an mbuf tag.
+	@param mbuf The mbuf the tag is attached to.
+	@param module_id A module identifier returned by mbuf_tag_id_find.
+	@param type The 16 bit type of the tag to find.
+	@param length Upon success, the length of data will be stored in
+		*length.
+	@param data_p Upon successful return, *data_p will point to the
+		buffer allocated for the mtag.
+	@result 0 upon success otherwise the errno error.
+ */
+errno_t mbuf_tag_find(mbuf_t mbuf, mbuf_tag_id_t module_id,
+		      mbuf_tag_type_t type, size_t *length, void** data_p);
+
+/*!
+	@function mbuf_tag_free
+	@discussion Frees a previously allocated mbuf tag.
+	@param mbuf The mbuf the tag was allocated on.
+	@param module_id The ID of the tag to free.
+	@param type The type of the tag to free.
+ */
+void mbuf_tag_free(mbuf_t mbuf, mbuf_tag_id_t module_id,
+		   mbuf_tag_type_t type);
+
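+/*
+ * Example: tagging a packet and recovering the tag later. A minimal
+ * sketch; "com.example.mytag" and MYTAG_TYPE_STATE are placeholder
+ * names, the payload layout is arbitrary, and MBUF_WAITOK is assumed
+ * to be the blocking mbuf_how_t value.
+ *
+ * Look up the module id once, e.g. at kext start, and cache it:
+ *
+ *	static mbuf_tag_id_t my_tag_id;
+ *	errno_t err = mbuf_tag_id_find("com.example.mytag", &my_tag_id);
+ *
+ * Attach a tag to a packet header mbuf and store data in it:
+ *
+ *	u_int32_t *state;
+ *	err = mbuf_tag_allocate(m, my_tag_id, MYTAG_TYPE_STATE,
+ *	    sizeof(*state), MBUF_WAITOK, (void**)&state);
+ *	if (err == 0)
+ *		*state = 1;
+ *
+ * Later, find the same data on the mbuf:
+ *
+ *	size_t length;
+ *	err = mbuf_tag_find(m, my_tag_id, MYTAG_TYPE_STATE, &length,
+ *	    (void**)&state);
+ */
+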
+/* mbuf stats */
+
+/*!
+	@function mbuf_stats
+	@discussion Get the mbuf statistics.
+	@param stats Storage into which the stats will be copied.
+ */
+void mbuf_stats(struct mbuf_stat* stats);
+
+
+
+/* IF_QUEUE interaction */
+
+#define IF_ENQUEUE_MBUF(ifq, m) {					\
+	mbuf_setnextpkt((m), 0);					\
+	if ((ifq)->ifq_tail == 0)					\
+		(ifq)->ifq_head = (m);					\
+	else								\
+		mbuf_setnextpkt((mbuf_t)(ifq)->ifq_tail, (m));		\
+	(ifq)->ifq_tail = (m);						\
+	(ifq)->ifq_len++;						\
+}
+#define IF_PREPEND_MBUF(ifq, m) {					\
+	mbuf_setnextpkt((m), (ifq)->ifq_head);				\
+	if ((ifq)->ifq_tail == 0)					\
+		(ifq)->ifq_tail = (m);					\
+	(ifq)->ifq_head = (m);						\
+	(ifq)->ifq_len++;						\
+}
+#define IF_DEQUEUE_MBUF(ifq, m) {					\
+	(m) = (ifq)->ifq_head;						\
+	if (m) {							\
+		if (((ifq)->ifq_head = mbuf_nextpkt((m))) == 0)		\
+			(ifq)->ifq_tail = 0;				\
+		mbuf_setnextpkt((m), 0);				\
+		(ifq)->ifq_len--;					\
+	}								\
+}
+
+
+#endif
diff --git a/bsd/sys/kpi_socket.h b/bsd/sys/kpi_socket.h
new file mode 100644
index 000000000..13c56414f
--- /dev/null
+++ b/bsd/sys/kpi_socket.h
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2004 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+/*!
+	@header kpi_socket.h
+	This header defines an API for creating and interacting with sockets
+	in the kernel. It is possible to create sockets in the kernel
+	without an associated file descriptor. In some cases, a reference to
+	the socket may be known while the file descriptor is not. These
+	functions can be used for interacting with sockets in the kernel.
+	The API is similar to the user space socket API.
+ */
+#ifndef __KPI_SOCKET__
+#define __KPI_SOCKET__
+
+#include <sys/types.h>
+#include <sys/kernel_types.h>
+
+struct timeval;
+
+/*!
+	@typedef sock_upcall
+
+	@discussion sock_upcall is used by a socket to notify an in kernel
+		client that data is waiting. Instead of making blocking calls in
+		the kernel, a client can specify an upcall which will be called
+		when data is available or the socket is ready for sending.
+
+		Calls to your upcall function are not serialized and may be
+		called concurrently from multiple threads in the kernel.
+
+		Your upcall function will be called when data arrives on the
+		socket or when the socket becomes ready for sending.
+	@param so A reference to the socket that's ready.
+	@param cookie The cookie passed in when the socket was created.
+	@param waitf Indicates whether or not it's safe to block.
+*/
+typedef void (*sock_upcall)(socket_t so, void* cookie, int waitf);
+
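+/*
+ * Example: a minimal upcall. A sketch only; my_upcall and struct
+ * my_state are placeholder names. Upcalls may run concurrently and
+ * waitf may indicate that blocking is unsafe, so the handler should do
+ * little more than record the event and wake a worker thread:
+ *
+ *	static void
+ *	my_upcall(socket_t so, void* cookie, int waitf)
+ *	{
+ *		struct my_state *state = (struct my_state*)cookie;
+ *
+ *		state->ready = 1;
+ *		wakeup(&state->ready);
+ *	}
+ */
+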
+/*!
+	@function sock_accept
+	@discussion Accepts an incoming connection on a socket. See 'man 2
+		accept' for more information. Allocating a socket in this manner
+		creates a socket with no associated file descriptor.
+	@param so The listening socket you'd like to accept a connection on.
+	@param from A pointer to a socket address that will be filled in
+		with the address the connection is from.
+	@param fromlen Maximum length of from.
+	@param flags Supports MSG_DONTWAIT and MSG_USEUPCALL. If
+		MSG_DONTWAIT is set, accept will return EWOULDBLOCK if there are
+		no connections ready to be accepted. If MSG_USEUPCALL is set,
+		the created socket will use the same upcall function attached to
+		the original socket.
+	@param callback A notifier function to be called when an event
+		occurs on the socket. This may be NULL.
+	@param cookie A cookie passed directly to the callback.
+	@param new_so Upon success, *new_so will be a reference to a new
+		socket for tracking the connection.
+	@result 0 on success otherwise the errno error.
+ */
+errno_t sock_accept(socket_t so, struct sockaddr *from, int fromlen,
+		    int flags, sock_upcall callback, void* cookie,
+		    socket_t *new_so);
+
+/*!
+	@function sock_bind
+	@discussion Binds a socket to a specific address. See 'man 2 bind'
+		for more information.
+	@param so The socket to be bound.
+	@param to The local address the socket should be bound to.
+	@result 0 on success otherwise the errno error.
+ */
+errno_t sock_bind(socket_t so, const struct sockaddr *to);
+
+/*!
+	@function sock_connect
+	@discussion Initiates a connection on the socket. See 'man 2
+		connect' for more information.
+	@param so The socket to be connected.
+	@param to The remote address the socket should connect to.
+	@param flags Flags for connecting. The only flag supported so far is
+		MSG_DONTWAIT. MSG_DONTWAIT will perform a non-blocking connect;
+		sock_connect will return immediately with EINPROGRESS. The
+		upcall, if supplied, will be called when the connection is
+		completed.
+	@result 0 on success, EINPROGRESS for a non-blocking connect that
+		has not completed, otherwise the errno error.
+ */
+errno_t sock_connect(socket_t so, const struct sockaddr *to, int flags);
+
+#ifdef KERNEL_PRIVATE
+/*!
+	@function sock_connectwait
+	@discussion Allows a caller to wait on a socket connect.
+
+		This function was added to support NFS. NFS does something funny,
+		setting a short timeout and checking to see if it should abort the
+		connect every two seconds. Ideally, NFS would use the upcall to be
+		notified when the connect is complete.
+
+		If you feel you need to use this function, please contact us to
+		explain why.
+	@param so The socket being connected.
+	@param tv The amount of time to wait.
+	@result 0 on success otherwise the errno error. EINPROGRESS will be
+		returned if the connection did not complete in the timeout
+		specified.
+ */
+errno_t sock_connectwait(socket_t so, const struct timeval *tv);
+#endif /* KERNEL_PRIVATE */
+
+/*!
+	@function sock_getpeername
+	@discussion Retrieves the remote address of a connected socket. See
+		'man 2 getpeername'.
+	@param so The socket.
+	@param peername Storage for the peer name.
+	@param peernamelen Length of storage for the peer name.
+	@result 0 on success otherwise the errno error.
+ */
+errno_t sock_getpeername(socket_t so, struct sockaddr *peername, int peernamelen);
+
+/*!
+	@function sock_getsockname
+	@discussion Retrieves the local address of a socket. See 'man 2
+		getsockname'.
+	@param so The socket.
+	@param sockname Storage for the local name.
+	@param socknamelen Length of storage for the socket name.
+	@result 0 on success otherwise the errno error.
+ */
+errno_t sock_getsockname(socket_t so, struct sockaddr *sockname, int socknamelen);
+
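+/*
+ * Example: a non-blocking connect. A minimal sketch; so was created
+ * with sock_socket (with an upcall) and addr is a placeholder for a
+ * filled-in sockaddr_in.
+ *
+ *	errno_t err = sock_connect(so, (struct sockaddr*)&addr,
+ *	    MSG_DONTWAIT);
+ *
+ * A result of 0 means the connection completed immediately;
+ * EINPROGRESS means it is still in flight and the upcall will fire
+ * when it completes; anything else is a hard failure.
+ */
+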
+/*!
+	@function sock_getsockopt
+	@discussion Retrieves a socket option. See 'man 2 getsockopt'.
+	@param so The socket.
+	@param level Level of the socket option.
+	@param optname The option name.
+	@param optval The option value.
+	@param optlen The length of optval; upon return, the actual length
+		of the option value.
+	@result 0 on success otherwise the errno error.
+ */
+errno_t sock_getsockopt(socket_t so, int level, int optname, void *optval, int *optlen);
+
+/*!
+	@function sock_ioctl
+	@discussion Performs an ioctl operation on a socket. See 'man 2 ioctl'.
+	@param so The socket.
+	@param request The ioctl name.
+	@param argp The argument.
+	@result 0 on success otherwise the errno error.
+ */
+errno_t sock_ioctl(socket_t so, unsigned long request, void *argp);
+
+/*!
+	@function sock_setsockopt
+	@discussion Sets a socket option. See 'man 2 setsockopt'.
+	@param so The socket.
+	@param level Level of the socket option.
+	@param optname The option name.
+	@param optval The option value.
+	@param optlen The length of optval.
+	@result 0 on success otherwise the errno error.
+ */
+errno_t sock_setsockopt(socket_t so, int level, int optname, const void *optval, int optlen);
+
+/*!
+	@function sock_listen
+	@discussion Indicates that the socket should start accepting incoming
+		connections. See 'man 2 listen'.
+	@param so The socket.
+	@param backlog The maximum length of the queue of pending connections.
+	@result 0 on success otherwise the errno error.
+ */
+errno_t sock_listen(socket_t so, int backlog);
+
+/*!
+	@function sock_receive
+	@discussion Receive data from a socket. Similar to recvmsg. See 'man
+		2 recvmsg' for more information about receiving data.
+	@param so The socket.
+	@param msg The msg describing how the data should be received.
+	@param flags See 'man 2 recvmsg'.
+	@param recvdlen Number of bytes received, same as the return value
+		of userland recvmsg.
+	@result 0 on success, EWOULDBLOCK if non-blocking and the operation
+		would cause the thread to block, otherwise the errno error.
+ */
+errno_t sock_receive(socket_t so, struct msghdr *msg, int flags, size_t *recvdlen);
+
+/*!
+	@function sock_receivembuf
+	@discussion Receive data from a socket. Similar to sock_receive
+		except that the data is returned as a chain of mbufs. See 'man 2
+		recvmsg' for more information about receiving data.
+	@param so The socket.
+	@param msg The msg describing how the data should be received. May
+		be NULL. The msg_iov is ignored.
+	@param data Upon return, *data will be a reference to an mbuf chain
+		containing the data received. This eliminates copying the data
+		out of the mbufs. The caller is responsible for freeing the
+		mbufs.
+	@param flags See 'man 2 recvmsg'.
+	@param recvlen Maximum number of bytes to receive in the mbuf chain.
+		Upon return, this value will be set to the number of bytes
+		received, same as the return value of userland recvmsg.
+	@result 0 on success, EWOULDBLOCK if non-blocking and the operation
+		would cause the thread to block, otherwise the errno error.
+ */
+errno_t sock_receivembuf(socket_t so, struct msghdr *msg, mbuf_t *data, int flags, size_t *recvlen);
+
+/*!
+	@function sock_send
+	@discussion Send data on a socket. Similar to sendmsg. See 'man 2
+		sendmsg' for more information about sending data.
+	@param so The socket.
+	@param msg The msg describing how the data should be sent. Any
+		pointers must point to data in the kernel.
+	@param flags See 'man 2 sendmsg'.
+	@param sentlen The number of bytes sent.
+	@result 0 on success, EWOULDBLOCK if non-blocking and the operation
+		would cause the thread to block, otherwise the errno error.
+ */
+errno_t sock_send(socket_t so, const struct msghdr *msg, int flags, size_t *sentlen);
+
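+/*
+ * Example: receiving into an mbuf chain without copying. A minimal
+ * sketch; so is assumed to have data waiting (e.g. the upcall fired)
+ * and the 4096 byte limit is an arbitrary placeholder.
+ *
+ *	mbuf_t data = NULL;
+ *	size_t len = 4096;
+ *	errno_t err = sock_receivembuf(so, NULL, &data, MSG_DONTWAIT, &len);
+ *
+ * On success, len holds the number of bytes received and the caller
+ * must eventually free the chain, e.g. with mbuf_freem.
+ */
+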
+/*!
+	@function sock_sendmbuf
+	@discussion Send data in an mbuf on a socket. Similar to sock_send
+		except that the data to be sent is taken from the mbuf chain.
+	@param so The socket.
+	@param msg The msg describing how the data should be sent. The
+		msg_iov is ignored. msg may be NULL.
+	@param data The mbuf chain of data to send.
+	@param flags See 'man 2 sendmsg'.
+	@param sentlen The number of bytes sent.
+	@result 0 on success, EWOULDBLOCK if non-blocking and the operation
+		would cause the thread to block, otherwise the errno error.
+		Regardless of return value, the mbuf chain 'data' will be freed.
+ */
+errno_t sock_sendmbuf(socket_t so, const struct msghdr *msg, mbuf_t data, int flags, size_t *sentlen);
+
+/*!
+	@function sock_shutdown
+	@discussion Shuts down one or both directions of a connection. See
+		'man 2 shutdown' for more information.
+	@param so The socket.
+	@param how SHUT_RD - shutdown receive. SHUT_WR - shutdown send.
+		SHUT_RDWR - shutdown both.
+	@result 0 on success otherwise the errno error.
+ */
+errno_t sock_shutdown(socket_t so, int how);
+
+/*!
+	@function sock_socket
+	@discussion Allocates a socket. Allocating a socket in this manner
+		creates a socket with no associated file descriptor. For more
+		information, see 'man 2 socket'.
+	@param domain The socket domain (PF_INET, etc...).
+	@param type The socket type (SOCK_STREAM, SOCK_DGRAM, etc...).
+	@param protocol The socket protocol.
+	@param callback A notifier function to be called when an event
+		occurs on the socket. This may be NULL.
+	@param cookie A cookie passed directly to the callback.
+	@param new_so Upon success, a reference to the new socket.
+	@result 0 on success otherwise the errno error.
+ */
+errno_t sock_socket(int domain, int type, int protocol, sock_upcall callback,
+		    void* cookie, socket_t *new_so);
+
+/*!
+	@function sock_close
+	@discussion Closes the socket.
+	@param so The socket to close. This should only ever be a socket
+		created with sock_socket. Closing a socket created in user space
+		using sock_close may leave a file descriptor pointing to the
+		closed socket, resulting in undefined behavior.
+ */
+void sock_close(socket_t so);
+
+/*!
+	@function sock_retain
+	@discussion Prevents the socket from closing.
+	@param so The socket to retain. Increments a retain count on the
+		socket, preventing it from being closed when sock_close is
+		called. This is used when a file descriptor is passed (and
+		closed) from user space and the kext wants to keep ownership of
+		that socket. It is used in conjunction with
+		sock_release(socket_t so).
+ */
+void sock_retain(socket_t so);
+
+/*!
+	@function sock_release
+	@discussion Decrements the retain count and closes the socket if the
+		retain count reaches zero.
+	@param so The socket to release. This is used to release ownership
+		of a socket acquired with sock_retain. When the last retain
+		count is released, this will call sock_close to close the
+		socket.
+ */
+void sock_release(socket_t so);
+
+/*!
+	@function sock_setpriv
+	@discussion Sets the privileged bit in the socket. Allows for
+		operations that require root privileges.
+	@param so The socket on which to modify the SS_PRIV flag.
+	@param on Indicates whether or not the SS_PRIV flag should be set.
+	@result 0 on success otherwise the errno error.
+ */
+errno_t sock_setpriv(socket_t so, int on);
+
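+/*
+ * Example: sending an mbuf chain on a kernel socket. A minimal sketch;
+ * so is assumed connected and building the chain in data is elided.
+ * Note that sock_sendmbuf consumes the chain even on failure, so the
+ * caller must not free it again.
+ *
+ *	size_t sent = 0;
+ *	errno_t err = sock_sendmbuf(so, NULL, data, 0, &sent);
+ */
+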
+/*!
+	@function sock_isconnected
+	@discussion Returns whether or not the socket is connected.
+	@param so The socket to check.
+	@result 0 - socket is not connected. 1 - socket is connected.
+ */
+int sock_isconnected(socket_t so);
+
+/*!
+	@function sock_isnonblocking
+	@discussion Returns whether or not the socket is non-blocking. In
+		the context of this KPI, non-blocking means that functions to
+		perform operations on a socket will not wait for completion.
+
+		To enable or disable blocking, use the FIONBIO ioctl. The
+		parameter is an int. If the int is zero, the socket will block.
+		If the parameter is non-zero, the socket will not block.
+	@param so The socket to check.
+	@result 0 - socket will block. 1 - socket will not block.
+ */
+int sock_isnonblocking(socket_t so);
+
+/*!
+	@function sock_gettype
+	@discussion Retrieves information about the socket. This is the same
+		information that was used to create the socket. If any of the
+		parameters following so are NULL, that information is not
+		retrieved.
+	@param so The socket to check.
+	@param domain The domain of the socket (PF_INET, etc...). May be NULL.
+	@param type The socket type (SOCK_STREAM, SOCK_DGRAM, etc...). May be NULL.
+	@param protocol The socket protocol. May be NULL.
+	@result 0 on success otherwise the errno error.
+ */
+errno_t sock_gettype(socket_t so, int *domain, int *type, int *protocol);
+
+#ifdef KERNEL_PRIVATE
+/*!
+	@function sock_nointerrupt
+	@discussion Disables interrupt on socket buffers (sets SB_NOINTR on
+		send and receive socket buffers).
+	@param so The socket to modify.
+	@param on Indicates whether or not the SB_NOINTR flag should be set.
+	@result 0 on success otherwise the errno error.
+ */
+errno_t sock_nointerrupt(socket_t so, int on);
+#endif /* KERNEL_PRIVATE */
+#endif /* __KPI_SOCKET__ */
diff --git a/bsd/sys/kpi_socketfilter.h b/bsd/sys/kpi_socketfilter.h
new file mode 100644
index 000000000..efc3f75a2
--- /dev/null
+++ b/bsd/sys/kpi_socketfilter.h
@@ -0,0 +1,604 @@
+/*
+ * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+/*!
+	@header kpi_socketfilter.h
+	This header defines an API for intercepting communications at the
+	socket layer.
+
+	For the most part, socket filters want to do three things: filter
+	data in and out, watch for state changes, and intercept a few calls
+	for security. The number of function pointers supplied by a socket
+	filter has been significantly reduced. The filter no longer has any
+	knowledge of socket buffers. The filter no longer intercepts nearly
+	every internal socket call. There are two data filters: an in
+	filter and an out filter. The in filter occurs before data is
+	placed in the receive socket buffer. This is done to avoid waking
+	the process unnecessarily. The out filter occurs before the data is
+	appended to the send socket buffer. This should cover inbound and
+	outbound data.
+	For monitoring state changes, we've added a notify
+	function that will be called when various events that the filter
+	cannot intercept occur. In addition, we've added a few functions
+	that a filter may use to intercept common operations. These
+	functions are: connect (inbound), connect (outbound), bind, set
+	socket option, get socket option, and listen. Bind, listen, connect
+	in, and connect out could be used together to build a fairly
+	comprehensive firewall without having to do much with individual
+	packets.
+ */
+#ifndef __KPI_SOCKETFILTER__
+#define __KPI_SOCKETFILTER__
+
+#include <sys/kernel_types.h>
+#include <sys/kpi_socket.h>
+
+struct sockaddr;
+
+/*!
+	@enum sflt_flags
+	@abstract Constants defining socket filter flags. Only the flags
+		listed below can be set or retrieved.
+	@constant SFLT_GLOBAL Indicates this socket filter should be
+		attached to all new sockets when they're created.
+	@constant SFLT_PROG Indicates this socket filter should be attached
+		only when requested by the application using the SO_NKE socket
+		option.
+*/
+enum {
+	SFLT_GLOBAL	= 0x01,
+	SFLT_PROG	= 0x02
+};
+typedef u_int32_t sflt_flags;
+
+/*!
+	@typedef sflt_handle
+	@abstract A 4 byte identifier used with the SO_NKE socket option to
+		identify the socket filter to be attached.
+*/
+typedef u_int32_t sflt_handle;
+
+/*!
+	@enum sflt_event_t
+	@abstract Events notify a filter of state changes and other various
+		events related to the socket. These events cannot be prevented
+		or intercepted, only observed.
+	@constant sock_evt_connecting Indicates this socket is moving to the
+		connected state.
+	@constant sock_evt_connected Indicates this socket has moved to the
+		connected state.
+	@constant sock_evt_disconnecting Indicates this socket is moving to
+		the disconnected state.
+	@constant sock_evt_disconnected Indicates this socket has moved to
+		the disconnected state.
+	@constant sock_evt_flush_read The read socket buffer has been
+		flushed.
+	@constant sock_evt_shutdown The read and/or write side(s) of the
+		connection have been shutdown. The param will point to an
+		integer that indicates the direction that has been shutdown. See
+		'man 2 shutdown' for more information.
+	@constant sock_evt_cantrecvmore Indicates the socket cannot receive
+		more data.
+	@constant sock_evt_cantsendmore Indicates the socket cannot send
+		more data.
+	@constant sock_evt_closing Indicates the socket is closing.
+*/
+enum {
+	sock_evt_connecting	= 1,
+	sock_evt_connected	= 2,
+	sock_evt_disconnecting	= 3,
+	sock_evt_disconnected	= 4,
+	sock_evt_flush_read	= 5,
+	sock_evt_shutdown	= 6,	/* param points to an integer specifying how (read, write, or both) see man 2 shutdown */
+	sock_evt_cantrecvmore	= 7,
+	sock_evt_cantsendmore	= 8,
+	sock_evt_closing	= 9
+};
+typedef u_int32_t sflt_event_t;
+
+/*!
+	@enum sflt_data_flag_t
+	@abstract Inbound and outbound data filters may handle many
+		different types of incoming and outgoing data. These flags help
+		distinguish between normal data, out-of-band data, and records.
+	@constant sock_data_filt_flag_oob Indicates this data is out-of-band
+		data.
+	@constant sock_data_filt_flag_record Indicates this data is a
+		record. This flag is only ever seen on inbound data.
+*/
+enum {
+	sock_data_filt_flag_oob		= 1,
+	sock_data_filt_flag_record	= 2
+};
+typedef u_int32_t sflt_data_flag_t;
+
+/*!
+	@typedef sf_unregistered_func
+
+	@discussion sf_unregistered_func is called to notify the filter it
+		has been unregistered. This is the last function the stack will
+		call, and it will only be called once all other function calls
+		in to your filter have completed. Once this function has been
+		called, your kext may safely unload.
+	@param handle The socket filter handle used to identify this filter.
+*/
+typedef void (*sf_unregistered_func)(sflt_handle handle);
+
+/*!
+	@typedef sf_attach_func
+
+	@discussion sf_attach_func is called to notify the filter it has
+		been attached to a socket. The filter may allocate memory for
+		this attachment and use the cookie to track it. This function is
+		called in one of two cases:
+		1) You've installed a global filter and a new socket was created.
+		2) Your non-global socket filter is being attached using the SO_NKE
+		socket option.
+	@param cookie Used to allow the socket filter to set the cookie for
+		this attachment.
+	@param so The socket the filter is being attached to.
+	@result If you return a non-zero value, your filter will not be
+		attached to this socket.
+*/
+typedef errno_t (*sf_attach_func)(void **cookie, socket_t so);
+
+/*!
+	@typedef sf_detach_func
+
+	@discussion sf_detach_func is called to notify the filter it has
+		been detached from a socket. If the filter allocated any memory
+		for this attachment, it should be freed. This function will
+		be called when the socket is disposed of.
+	@param cookie Cookie value specified when the filter attach was
+		called.
+	@param so The socket the filter is attached to.
+*/
+typedef void (*sf_detach_func)(void *cookie, socket_t so);
+
+/*!
+	@typedef sf_notify_func
+
+	@discussion sf_notify_func is called to notify the filter of various
+		state changes and other events occurring on the socket.
+	@param cookie Cookie value specified when the filter attach was
+		called.
+	@param so The socket the filter is attached to.
+	@param event The type of event that has occurred.
+	@param param Additional information about the event.
+*/
+typedef void (*sf_notify_func)(void *cookie, socket_t so,
+			       sflt_event_t event, void *param);
+
+/*!
+	@typedef sf_getpeername_func
+
+	@discussion sf_getpeername_func is called to allow a filter to
+		intercept the getpeername function. When called, sa will
+		point to a pointer to a socket address that was malloced
+		in zone M_SONAME. If you want to replace this address, either
+		modify the current copy or allocate a new one and free the
+		old one.
+	@param cookie Cookie value specified when the filter attach was
+		called.
+	@param so The socket the filter is attached to.
+	@param sa A pointer to a socket address pointer.
+	@result If you return a non-zero value, processing will stop. If
+		you return EJUSTRETURN, no further filters will be called
+		but a result of zero will be returned to the caller of
+		getpeername.
+*/
+typedef int (*sf_getpeername_func)(void *cookie, socket_t so,
+				   struct sockaddr **sa);
+
+/*!
+	@typedef sf_getsockname_func
+
+	@discussion sf_getsockname_func is called to allow a filter to
+		intercept the getsockname function. When called, sa will
+		point to a pointer to a socket address that was malloced
+		in zone M_SONAME. If you want to replace this address, either
+		modify the current copy or allocate a new one and free the
+		old one.
+	@param cookie Cookie value specified when the filter attach was
+		called.
+	@param so The socket the filter is attached to.
+	@param sa A pointer to a socket address pointer.
+	@result If you return a non-zero value, processing will stop. If
+		you return EJUSTRETURN, no further filters will be called
+		but a result of zero will be returned to the caller of
+		getsockname.
+*/
+typedef int (*sf_getsockname_func)(void *cookie, socket_t so,
+				   struct sockaddr **sa);
+
+/*!
+	@typedef sf_data_in_func
+
+	@discussion sf_data_in_func is called to filter incoming data. If your
+		filter intercepts data for later reinjection, it must queue all incoming
+		data to preserve the order of the data. Use sock_inject_data_in to later
+		reinject this data if you return EJUSTRETURN. Warning: This filter is on
+		the data path. Do not spend excessive time. Do not wait for data on
+		another socket.
+	@param cookie Cookie value specified when the filter attach was
+		called.
+	@param so The socket the filter is attached to.
+	@param from The address the data is from; may be NULL if the socket
+		is connected.
+	@param data The data being received. Control data may appear in the
+		mbuf chain; be sure to check the mbuf types to find control
+		data.
+	@param control Control data being passed separately from the data.
+	@param flags Flags to indicate if this is out of band data or a
+		record.
+	@result Return:
+		0 - The caller will continue with normal processing of the data.
+		EJUSTRETURN - The caller will stop processing the data, the data will not be freed.
+		Anything Else - The caller will free the data and stop processing.
+*/
+typedef errno_t (*sf_data_in_func)(void *cookie, socket_t so,
+				   const struct sockaddr *from, mbuf_t *data,
+				   mbuf_t *control, sflt_data_flag_t flags);
+
+/*!
+	@typedef sf_data_out_func
+
+	@discussion sf_data_out_func is called to filter outbound data. If
+		your filter intercepts data for later reinjection, it must queue
+		all outbound data to preserve the order of the data when
+		reinjecting. Use sock_inject_data_out to later reinject this
+		data.
+	@param cookie Cookie value specified when the filter attach was
+		called.
+	@param so The socket the filter is attached to.
+	@param to The address the data is being sent to; may be NULL if the
+		socket is connected.
+	@param data The data being sent. Control data may appear in the
+		mbuf chain; be sure to check the mbuf types to find control
+		data.
+	@param control Control data being passed separately from the data.
+	@param flags Flags to indicate if this is out of band data or a
+		record.
+	@result Return:
+		0 - The caller will continue with normal processing of the data.
+		EJUSTRETURN - The caller will stop processing the data, the data will not be freed.
+		Anything Else - The caller will free the data and stop processing.
+*/
+typedef errno_t (*sf_data_out_func)(void *cookie, socket_t so,
+				    const struct sockaddr *to, mbuf_t *data,
+				    mbuf_t *control, sflt_data_flag_t flags);
+
+/*!
+	@typedef sf_connect_in_func
+
+	@discussion sf_connect_in_func is called to filter inbound connections. A
+		protocol will call this before accepting an incoming connection and
+		placing it on the queue of completed connections. Warning: This filter
+		is on the data path. Do not spend excessive time. Do not wait for data
+		on another socket.
+	@param cookie Cookie value specified when the filter attach was
+		called.
+	@param so The socket the filter is attached to.
+	@param from The address the incoming connection is from.
+	@result Return:
+		0 - The caller will continue with normal processing of the connection.
+		Anything Else - The caller will reject the incoming connection.
+*/
+typedef errno_t (*sf_connect_in_func)(void *cookie, socket_t so,
+				      const struct sockaddr *from);
+
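+/*
+ * Example: an inbound connection filter that refuses connections from
+ * a blocked address. A minimal sketch; my_connect_in and
+ * addr_is_blocked are placeholder names. Any non-zero return rejects
+ * the connection.
+ *
+ *	static errno_t
+ *	my_connect_in(void *cookie, socket_t so,
+ *	    const struct sockaddr *from)
+ *	{
+ *		if (addr_is_blocked(from))
+ *			return (ECONNREFUSED);
+ *		return (0);
+ *	}
+ */
+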
+/*!
+	@typedef sf_connect_out_func
+
+	@discussion sf_connect_out_func is called to filter outbound
+		connections. A protocol will call this before initiating an
+		outbound connection.
+	@param cookie Cookie value specified when the filter attach was
+		called.
+	@param so The socket the filter is attached to.
+	@param to The remote address of the outbound connection.
+	@result Return:
+		0 - The caller will continue with normal processing of the connection.
+		Anything Else - The caller will reject the outbound connection.
+*/
+typedef errno_t (*sf_connect_out_func)(void *cookie, socket_t so,
+				       const struct sockaddr *to);
+
+/*!
+	@typedef sf_bind_func
+
+	@discussion sf_bind_func is called before performing a bind
+		operation on a socket.
+	@param cookie Cookie value specified when the filter attach was
+		called.
+	@param so The socket the filter is attached to.
+	@param to The local address the socket will be bound to.
+	@result Return:
+		0 - The caller will continue with normal processing of the bind.
+		Anything Else - The caller will reject the bind.
+*/
+typedef errno_t (*sf_bind_func)(void *cookie, socket_t so,
+				const struct sockaddr *to);
+
+/*!
+	@typedef sf_setoption_func
+
+	@discussion sf_setoption_func is called before performing setsockopt
+		on a socket.
+	@param cookie Cookie value specified when the filter attach was
+		called.
+	@param so The socket the filter is attached to.
+	@param opt The socket option to set.
+	@result Return:
+		0 - The caller will continue with normal processing of the setsockopt.
+		Anything Else - The caller will stop processing and return this error.
+*/
+typedef errno_t (*sf_setoption_func)(void *cookie, socket_t so,
+				     sockopt_t opt);
+
+/*!
+	@typedef sf_getoption_func
+
+	@discussion sf_getoption_func is called before performing getsockopt
+		on a socket.
+	@param cookie Cookie value specified when the filter attach was
+		called.
+	@param so The socket the filter is attached to.
+	@param opt The socket option to get.
+	@result Return:
+		0 - The caller will continue with normal processing of the getsockopt.
+		Anything Else - The caller will stop processing and return this error.
+*/
+typedef errno_t (*sf_getoption_func)(void *cookie, socket_t so,
+				     sockopt_t opt);
+
+/*!
+	@typedef sf_listen_func
+
+	@discussion sf_listen_func is called before performing listen
+		on a socket.
+	@param cookie Cookie value specified when the filter attach was
+		called.
+	@param so The socket the filter is attached to.
+	@result Return:
+		0 - The caller will continue with normal processing of listen.
+		Anything Else - The caller will stop processing and return this error.
+*/
+typedef errno_t (*sf_listen_func)(void *cookie, socket_t so);
+
+/*!
+	@typedef sf_ioctl_func
+
+	@discussion sf_ioctl_func is called before performing an ioctl
+		on a socket.
+	@param cookie Cookie value specified when the filter attach was
+		called.
+	@param so The socket the filter is attached to.
+	@param request The ioctl name.
+	@param argp A pointer to the ioctl parameter.
+	@result Return:
+		0 - The caller will continue with normal processing of this ioctl.
+		Anything Else - The caller will stop processing and return this error.
+*/
+typedef errno_t (*sf_ioctl_func)(void *cookie, socket_t so,
+				 u_int32_t request, const char* argp);
+
+/*!
+	@struct sflt_filter
+	@discussion This structure is used to define a socket filter.
+	@field sf_handle A value used to find socket filters by
+		applications. An application can use this value to specify that
+		this filter should be attached when using the SO_NKE socket
+		option.
+	@field sf_flags Indicates whether this filter should be attached to
+		all new sockets or just those that request the filter be
+		attached using the SO_NKE socket option.
+	@field sf_name A name used for debug purposes.
+	@field sf_unregistered Your function for being notified when your
+		filter has been unregistered.
+	@field sf_attach Your function for handling attaches to sockets.
+	@field sf_detach Your function for handling detaches from sockets.
+	@field sf_notify Your function for handling events. May be null.
+	@field sf_data_in Your function for handling incoming data. May be
+		null.
+	@field sf_data_out Your function for handling outgoing data. May be
+		null.
+	@field sf_connect_in Your function for handling inbound
+		connections. May be null.
+	@field sf_connect_out Your function for handling outbound
+		connections. May be null.
+	@field sf_bind Your function for handling binds. May be null.
+	@field sf_setoption Your function for handling setsockopt. May be null.
+	@field sf_getoption Your function for handling getsockopt. May be null.
+	@field sf_listen Your function for handling listen. May be null.
+	@field sf_ioctl Your function for handling ioctls. May be null.
+*/
+struct sflt_filter {
+	sflt_handle			sf_handle;
+	int				sf_flags;
+	char*				sf_name;
+
+	sf_unregistered_func		sf_unregistered;
+	sf_attach_func			sf_attach;
+	sf_detach_func			sf_detach;
+
+	sf_notify_func			sf_notify;
+	sf_getpeername_func		sf_getpeername;
+	sf_getsockname_func		sf_getsockname;
+	sf_data_in_func			sf_data_in;
+	sf_data_out_func		sf_data_out;
+	sf_connect_in_func		sf_connect_in;
+	sf_connect_out_func		sf_connect_out;
+	sf_bind_func			sf_bind;
+	sf_setoption_func		sf_setoption;
+	sf_getoption_func		sf_getoption;
+	sf_listen_func			sf_listen;
+	sf_ioctl_func			sf_ioctl;
+};
+
+/*!
+	@function sflt_register
+	@discussion Registers a socket filter. See 'man 2 socket' for a
+		description of domain, type, and protocol.
+	@param filter A structure describing the filter.
+	@param domain The protocol domain these filters will be attached to.
+	@param type The socket type these filters will be attached to.
+	@param protocol The protocol these filters will be attached to.
+	@result 0 on success otherwise the errno error.
+ */
+errno_t sflt_register(const struct sflt_filter *filter, int domain,
+		      int type, int protocol);
+
+/*!
+	@function sflt_unregister
+	@discussion Unregisters a socket filter. This will not detach the
+		socket filter from all sockets it may be attached to at the
+		time; it will just prevent the socket filter from being attached
+		to any new sockets.
+	@param handle The sf_handle of the socket filter to unregister.
+	@result 0 on success otherwise the errno error.
+ */
+errno_t sflt_unregister(sflt_handle handle);
+
+/*!
+	@function sflt_attach
+	@discussion Attaches a socket filter to the specified socket. A
+		filter must be registered before it can be attached.
+	@param socket The socket the filter should be attached to.
+	@param handle The handle of the registered filter to be attached.
+	@result 0 on success otherwise the errno error.
+ */
+errno_t sflt_attach(socket_t socket, sflt_handle handle);
+
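+/*
+ * Example: registering a global TCP filter. A minimal sketch; the
+ * handle value, name, and callback names are placeholders. Callbacks
+ * that are not needed are simply left NULL (my_filter has static
+ * storage, so unset fields are zeroed).
+ *
+ *	static struct sflt_filter my_filter;
+ *
+ *	my_filter.sf_handle = 0x2e784d46;
+ *	my_filter.sf_flags = SFLT_GLOBAL;
+ *	my_filter.sf_name = "com.example.myfilter";
+ *	my_filter.sf_unregistered = my_unregistered;
+ *	my_filter.sf_attach = my_attach;
+ *	my_filter.sf_detach = my_detach;
+ *	my_filter.sf_connect_in = my_connect_in;
+ *
+ *	errno_t err = sflt_register(&my_filter, PF_INET, SOCK_STREAM,
+ *	    IPPROTO_TCP);
+ */
+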
+/*!
+	@function sflt_detach
+	@discussion Detaches a socket filter from a specified socket.
+	@param socket The socket the filter should be detached from.
+	@param handle The handle of the registered filter to be detached.
+	@result 0 on success otherwise the errno error.
+ */
+errno_t sflt_detach(socket_t socket, sflt_handle handle);
+
+/* Functions for manipulating sockets */
+
+/*!
+	@function sock_inject_data_in
+	@discussion Inject data into the receive buffer of the socket as if
+		it had come from the network.
+	@param so The socket to inject the data on.
+	@param from The address the data is from; only necessary on
+		unconnected sockets. A copy of the address will be made; the
+		caller is responsible for freeing the address after calling
+		this function.
+	@param data The data and possibly control mbufs.
+	@param control The separate control mbufs.
+	@param flags Flags indicating the type of data. See sflt_data_flag_t.
+	@result 0 on success otherwise the errno error. If the function
+		returns an error, the caller is responsible for freeing the
+		mbuf.
+ */
+errno_t sock_inject_data_in(socket_t so, const struct sockaddr* from,
+			    mbuf_t data, mbuf_t control, sflt_data_flag_t flags);
+
+/*!
+	@function sock_inject_data_out
+	@discussion Inject data into the send buffer of the socket as if it
+		had come from the client.
+	@param so The socket to inject the data on.
+	@param to The address the data should be sent to; only necessary on
+		unconnected sockets. The caller is responsible for freeing the
+		to address after sock_inject_data_out returns.
+	@param data The data and possibly control mbufs.
+	@param control The separate control mbufs.
+	@param flags Flags indicating the type of data. See sflt_data_flag_t.
+	@result 0 on success otherwise the errno error. The data and control
+		values are always freed regardless of return value.
+ */
+errno_t sock_inject_data_out(socket_t so, const struct sockaddr* to,
+			     mbuf_t data, mbuf_t control, sflt_data_flag_t flags);
+
+
+/*
+ * sockopt_t accessors
+ */
+
+enum {
+	sockopt_get = 1,
+	sockopt_set = 2
+};
+typedef u_int8_t sockopt_dir;
+
+/*!
+	@function sockopt_direction
+	@discussion Retrieves the direction of the socket option (get or
+		set).
+	@param sopt The socket option.
+	@result sockopt_get or sockopt_set.
+ */
+sockopt_dir sockopt_direction(sockopt_t sopt);
+
+/*!
+	@function sockopt_level
+	@discussion Retrieves the socket option level (SOL_SOCKET, etc).
+	@param sopt The socket option.
+	@result The socket option level. See 'man 2 setsockopt'.
+ */
+int sockopt_level(sockopt_t sopt);
+
+/*!
+	@function sockopt_name
+	@discussion Retrieves the socket option name (SO_SNDBUF, etc).
+	@param sopt The socket option.
+	@result The socket option name. See 'man 2 setsockopt'.
+ */
+int sockopt_name(sockopt_t sopt);
+
+/*!
+	@function sockopt_valsize
+	@discussion Retrieves the size of the socket option data.
+	@param sopt The socket option.
+	@result The length, in bytes, of the data.
+ */
+size_t sockopt_valsize(sockopt_t sopt);
+
+/*!
+	@function sockopt_copyin
+	@discussion Copies the data from the socket option to a buffer.
+	@param sopt The socket option.
+	@param data A pointer to the buffer to copy the data in to.
+	@param length The number of bytes to copy.
+	@result An errno error or zero upon success.
+ */
+errno_t sockopt_copyin(sockopt_t sopt, void *data, size_t length);
+
+/*!
+	@function sockopt_copyout
+	@discussion Copies the data from a buffer to a socket option.
+	@param sopt The socket option.
+	@param data A pointer to the buffer to copy the data out of.
+	@param length The number of bytes to copy.
+	@result An errno error or zero upon success.
+ */ +errno_t sockopt_copyout(sockopt_t sopt, void *data, size_t length); + +#endif diff --git a/bsd/sys/ktrace.h b/bsd/sys/ktrace.h index ce39adb2d..f07a9a8d1 100644 --- a/bsd/sys/ktrace.h +++ b/bsd/sys/ktrace.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -61,15 +61,12 @@ #include <sys/appleapiopts.h> -#if defined(MACH_KERNEL_PRIVATE) +#ifdef MACH_KERNEL_PRIVATE -#ifdef __APPLE_API_PRIVATE -void ktrsyscall(void *, int, int, void *, int); +void ktrsyscall(void *, int, int, u_int64_t *, int); void ktrsysret(void *, int, int, int, int); -#endif /* __APPLE_API_PRIVATE */ #else - #ifdef __APPLE_API_UNSTABLE /* * operations to ktrace system call (KTROP(op)) @@ -85,6 +82,8 @@ void ktrsysret(void *, int, int, int, int); /* * ktrace record header + * + * LP64todo: not 64-bit safe */ struct ktr_header { int ktr_len; /* length of buf */ @@ -115,7 +114,7 @@ struct ktr_syscall { /* * followed by ktr_narg register_t */ - register_t ktr_args[1]; + u_int64_t ktr_args[1]; }; /* @@ -194,21 +193,20 @@ struct ktr_csw { #ifdef KERNEL #ifdef __APPLE_API_PRIVATE -void ktrnamei __P((struct vnode *,char *)); -void ktrcsw __P((struct vnode *, int, int, int)); -void ktrpsig __P((struct vnode *, int, sig_t, sigset_t *, int, int)); -void ktrgenio __P((struct vnode *, int, enum uio_rw, - struct uio *, int, int)); -void ktrsyscall __P((struct proc *, int, int, register_t args[], int)); -void ktrsysret __P((struct proc *, int, int, register_t, int)); +void ktrnamei(struct vnode *,char *); +void ktrcsw(struct vnode *, int, int); +void ktrpsig(struct vnode *, int, sig_t, sigset_t *, int); +void ktrgenio(struct vnode *, int, enum uio_rw, struct uio *, int); +void ktrsyscall(struct proc *, int, int, u_int64_t args[]); +void ktrsysret(struct proc *, int, int, register_t); #endif /* __APPLE_API_PRIVATE */ #else #include <sys/cdefs.h> __BEGIN_DECLS -int ktrace __P((const char *, int, int, pid_t)); -int utrace __P((const void *, size_t)); +int ktrace(const char *, int, int, pid_t); +int utrace(const void *, size_t); __END_DECLS #endif /* !KERNEL */ diff --git a/bsd/sys/loadable_fs.h b/bsd/sys/loadable_fs.h index 1d198c74d..95f1c727d 100644 --- a/bsd/sys/loadable_fs.h +++ b/bsd/sys/loadable_fs.h @@ -37,9 +37,7 @@ #ifndef _SYS_LOADABLE_FS_ #define _SYS_LOADABLE_FS_ -#include <sys/appleapiopts.h> -#ifdef __APPLE_API_UNSTABLE /* * Constants for Loadabls FS Utilities (in "/System/Library/Filesystems") * @@ -116,5 +114,4 @@ #define MNTOPT_FS "filesystem=" /* e.g. "filesystem=DOS" */ #define MNTOPT_REMOVABLE "removable" -#endif /* __APPLE_API_UNSTABLE */ #endif /* _SYS_LOADABLE_FS_ */ diff --git a/bsd/sys/lock.h b/bsd/sys/lock.h index c866f9570..5364f66e1 100644 --- a/bsd/sys/lock.h +++ b/bsd/sys/lock.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -63,76 +63,69 @@ #define _SYS_LOCK_H_ #include <sys/appleapiopts.h> +#include <sys/types.h> +#include <sys/cdefs.h> #ifdef KERNEL -#ifdef __APPLE_API_UNSTABLE -#include <kern/simple_lock.h> -#include <kern/simple_lock_types.h> -#if defined(simple_lock_init) -#undef simple_lock_init -#endif -#define simple_lock_init(l) usimple_lock_init((l),0) - -#if defined(simple_lock) -#undef simple_lock -#endif -#define simple_lock(l) ((void) 1) +#include <kern/locks.h> -#if defined(simple_unlock) -#undef simple_unlock -#endif -#define simple_unlock(l) ((void) 1) - -#if defined(simple_lock_try) -#undef simple_lock_try -#endif -#define simple_lock_try(l) 1 #if defined(thread_sleep_simple_lock) #undef thread_sleep_simple_lock #endif #define thread_sleep_simple_lock(l, e, i) thread_sleep_funnel((e), (i)) -#endif /* __APPLE_API_UNSTABLE */ -#else /* KERNEL */ - -#ifndef _MACHINE_SIMPLE_LOCK_DATA_ -#define _MACHINE_SIMPLE_LOCK_DATA_ - -#include <mach/boolean.h> - -struct slock{ - volatile unsigned int lock_data[10]; -}; -typedef struct slock simple_lock_data_t; -typedef struct slock *simple_lock_t; -#define decl_simple_lock_data(class,name) \ -class simple_lock_data_t name; - -#endif /* _MACHINE_SIMPLE_LOCK_DATA_ */ #endif /* KERNEL */ -#ifdef __APPLE_API_UNSTABLE +#ifdef BSD_KERNEL_PRIVATE /* * The general lock structure. Provides for multiple shared locks, * upgrading from shared to exclusive, and sleeping until the lock * can be gained. The simple locks are defined in <machine/param.h>. */ struct lock__bsd__ { - simple_lock_data_t - lk_interlock; /* lock on remaining fields */ + void * lk_interlock[10]; /* lock on remaining fields */ u_int lk_flags; /* see below */ int lk_sharecount; /* # of accepted shared locks */ int lk_waitcount; /* # of processes sleeping for lock */ short lk_exclusivecount; /* # of recursive exclusive locks */ short lk_prio; /* priority at which to sleep */ - char *lk_wmesg; /* resource sleeping (for tsleep) */ + const char *lk_wmesg; /* resource sleeping (for tsleep) */ int lk_timo; /* maximum sleep time (for tsleep) */ pid_t lk_lockholder; /* pid of exclusive lock holder */ void *lk_lockthread; /* thread which acquired excl lock */ }; + +// LP64todo - should this move? + +/* LP64 version of lock__bsd__. all pointers + * grow when we're dealing with a 64-bit process. + * WARNING - keep in sync with lock__bsd__ + */ + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_lock__bsd__ { + user_addr_t lk_interlock[10]; /* lock on remaining fields */ + u_int lk_flags; /* see below */ + int lk_sharecount; /* # of accepted shared locks */ + int lk_waitcount; /* # of processes sleeping for lock */ + short lk_exclusivecount; /* # of recursive exclusive locks */ + short lk_prio; /* priority at which to sleep */ + user_addr_t lk_wmesg; /* resource sleeping (for tsleep) */ + int lk_timo; /* maximum sleep time (for tsleep) */ + pid_t lk_lockholder; /* pid of exclusive lock holder */ + user_addr_t lk_lockthread; /* thread which acquired excl lock */ +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + /* * Lock request types: * LK_SHARED - get one of many possible shared locks. 
If a process @@ -231,12 +224,12 @@ struct lock__bsd__ { struct proc; -void lockinit __P((struct lock__bsd__ *, int prio, char *wmesg, int timo, - int flags)); -int lockmgr __P((struct lock__bsd__ *, u_int flags, - simple_lock_t, struct proc *p)); -int lockstatus __P((struct lock__bsd__ *)); +void lockinit(struct lock__bsd__ *, int prio, const char *wmesg, int timo, + int flags); +int lockmgr(struct lock__bsd__ *, u_int flags, + void *, struct proc *p); +int lockstatus(struct lock__bsd__ *); -#endif /* __APPLE_API_UNSTABLE */ +#endif /* BSD_KERNEL_PRIVATE */ #endif /* _SYS_LOCK_H_ */ diff --git a/bsd/sys/lockf.h b/bsd/sys/lockf.h index 6461cea8e..7c3e814d0 100644 --- a/bsd/sys/lockf.h +++ b/bsd/sys/lockf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -19,7 +19,6 @@ * * @APPLE_LICENSE_HEADER_END@ */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. @@ -35,10 +34,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. @@ -56,59 +51,63 @@ * SUCH DAMAGE. * * @(#)lockf.h 8.1 (Berkeley) 6/11/93 + * $FreeBSD: src/sys/sys/lockf.h,v 1.16 2004/04/07 04:19:49 imp Exp $ */ #ifndef _SYS_LOCKF_H_ #define _SYS_LOCKF_H_ -#include <sys/appleapiopts.h> -#include <sys/cdefs.h> +#include <sys/queue.h> +#include <sys/cdefs.h> + +struct vnop_advlock_args; +struct vnode; + +#ifdef MALLOC_DECLARE +MALLOC_DECLARE(M_LOCKF); +#endif -#ifdef __APPLE_API_PRIVATE /* * The lockf structure is a kernel structure which contains the information * associated with a byte range lock. The lockf structures are linked into * the inode structure. Locks are sorted by the starting byte of the lock for * efficiency. 
 */
+TAILQ_HEAD(locklist, lockf);
+
+#if __DARWIN_ALIGN_POWER
+#pragma options align=power
+#endif
+
 struct lockf {
-	short	lf_flags;	    /* Lock semantics: F_POSIX, F_FLOCK, F_WAIT */
+	short	lf_flags;	    /* Semantics: F_POSIX, F_FLOCK, F_WAIT */
 	short	lf_type;	    /* Lock type: F_RDLCK, F_WRLCK */
-	off_t	lf_start;	    /* The byte # of the start of the lock */
-	off_t	lf_end;		    /* The byte # of the end of the lock (-1=EOF)*/
-	caddr_t	lf_id;		    /* The id of the resource holding the lock */
-	struct	lockf **lf_head;    /* Back pointer to the head of lockf list */
-	struct	lockf *lf_next;	    /* A pointer to the next lock on this inode */
-	struct	lockf *lf_block;    /* The list of blocked locks */
+	off_t	lf_start;	    /* Byte # of the start of the lock */
+	off_t	lf_end;		    /* Byte # of the end of the lock (-1=EOF) */
+	caddr_t	lf_id;		    /* Id of the resource holding the lock */
+	struct	lockf **lf_head;    /* Back pointer to the head of the lockf list */
+	struct	vnode *lf_vnode;    /* Back pointer to the inode */
+	struct	lockf *lf_next;	    /* Pointer to the next lock on this inode */
+	struct	locklist lf_blkhd;  /* List of requests blocked on this lock */
+	TAILQ_ENTRY(lockf) lf_block;/* A request waiting for a lock */
 };
+
+#if __DARWIN_ALIGN_POWER
+#pragma options align=reset
+#endif
+
 /* Maximum length of sleep chains to traverse to try and detect deadlock. */
 #define MAXDEPTH 50
 
 __BEGIN_DECLS
-void	lf_addblock __P((struct lockf *, struct lockf *));
-int	lf_advlock __P((struct lockf **,
-	    off_t, caddr_t, int, struct flock *, int));
-int	lf_clearlock __P((struct lockf *));
-int	lf_findoverlap __P((struct lockf *,
-	    struct lockf *, int, struct lockf ***, struct lockf **));
-struct lockf *
-	lf_getblock __P((struct lockf *));
-int	lf_getlock __P((struct lockf *, struct flock *));
-int	lf_setlock __P((struct lockf *));
-void	lf_split __P((struct lockf *, struct lockf *));
-void	lf_wakelock __P((struct lockf *));
-__END_DECLS
-#if LOCKF_DEBUG
-extern int lockf_debug;
+int	lf_advlock(struct vnop_advlock_args *);
 
-__BEGIN_DECLS
-void	lf_print __P((char *, struct lockf *));
-void	lf_printlist __P((char *, struct lockf *));
-__END_DECLS
-#endif /* LOCKF_DEBUG */
+#ifdef LOCKF_DEBUG
+void	lf_print(char *, struct lockf *);
+void	lf_printlist(char *, struct lockf *);
+#endif
 
-#endif /* __APPLE_API_PRIVATE */
+__END_DECLS
 
 #endif /* !_SYS_LOCKF_H_ */
diff --git a/bsd/sys/mach_swapon.h b/bsd/sys/mach_swapon.h
index fcba2d61b..152a9265b 100644
--- a/bsd/sys/mach_swapon.h
+++ b/bsd/sys/mach_swapon.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
@@ -23,11 +23,7 @@
 * Copyright (c) 1989,1995 NeXT, Inc.
 * All rights reserved.
 *
- * The NEXTSTEP Software License Agreement specifies the terms
- * and conditions for redistribution.
- *
 */
-
 #ifndef _MACH_SWAPON_H
 #define _MACH_SWAPON_H
diff --git a/bsd/sys/malloc.h b/bsd/sys/malloc.h
index 0e68c6663..1bb501e9c 100644
--- a/bsd/sys/malloc.h
+++ b/bsd/sys/malloc.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
* * @APPLE_LICENSE_HEADER_START@ * @@ -61,8 +61,8 @@ #include <sys/appleapiopts.h> -#define KMEMSTATS +#ifdef KERNEL /* * flags to malloc */ @@ -70,6 +70,11 @@ #define M_NOWAIT 0x0001 #define M_ZERO 0x0004 /* bzero the allocation */ + +#ifdef BSD_KERNEL_PRIVATE + +#define KMEMSTATS + /* * Types of memory to be allocated (not all are used by us) */ @@ -92,7 +97,7 @@ #define M_CRED 16 /* credentials */ #define M_PGRP 17 /* process group header */ #define M_SESSION 18 /* session header */ -#define M_IOV 19 /* large iov's */ +#define M_IOV32 19 /* large iov's for 32 bit process */ #define M_MOUNT 20 /* vfs mount struct */ #define M_FHANDLE 21 /* network file handle */ #define M_NFSREQ 22 /* NFS request header */ @@ -111,7 +116,7 @@ #define M_VMPVENT 35 /* VM phys-virt mapping entry */ #define M_VMPAGER 36 /* XXX: VM pager struct */ #define M_VMPGDATA 37 /* XXX: VM pager private data */ -#define M_FILE 38 /* Open file structure */ +#define M_FILEPROC 38 /* Open file structure */ #define M_FILEDESC 39 /* Open file descriptor table */ #define M_LOCKF 40 /* Byte-range locking structures */ #define M_PROC 41 /* Proc structures */ @@ -120,8 +125,8 @@ #define M_LFSNODE 44 /* LFS vnode private part */ #define M_FFSNODE 45 /* FFS vnode private part */ #define M_MFSNODE 46 /* MFS vnode private part */ -#define M_NQLEASE 47 /* Nqnfs lease */ -#define M_NQMHOST 48 /* Nqnfs host address table */ +#define M_NQLEASE 47 /* XXX: Nqnfs lease */ +#define M_NQMHOST 48 /* XXX: Nqnfs host address table */ #define M_NETADDR 49 /* Export host address structure */ #define M_NFSSVC 50 /* Nfs server structure */ #define M_NFSUID 51 /* Nfs uid mapping structure */ @@ -169,8 +174,34 @@ #define M_JNL_TR 92 /* Journaling: "struct transaction" */ #define M_SPECINFO 93 /* special file node */ #define M_KQUEUE 94 /* kqueue */ +#define M_HFSDIRHINT 95 /* HFS directory hint */ +#define M_CLRDAHEAD 96 /* storage for cluster read-ahead state */ +#define M_CLWRBEHIND 97 /* storage for cluster write-behind state */ +#define M_IOV64 98 /* large iov's for 64 bit process */ +#define M_FILEGLOB 99 /* fileglobal */ +#define M_KAUTH 100 /* kauth subsystem */ +#define M_DUMMYNET 101 /* dummynet */ +#define M_UNSAFEFS 102 /* storage for vnode lock state for unsafe FS */ -#define M_LAST 95 /* Must be last type + 1 */ +#else /* BSD_KERNEL_PRIVATE */ + +#define M_RTABLE 5 /* routing tables */ +#define M_IFADDR 9 /* interface address (IOFireWireIP)*/ +#define M_LOCKF 40 /* Byte-range locking structures (msdos) */ +#define M_TEMP 80 /* misc temporary data buffers */ +#define M_HFSMNT 75 /* HFS mount structure (afpfs) */ +#define M_KAUTH 100 /* kauth subsystem (smb) */ +#define M_SONAME 11 /* socket name (smb) */ +#define M_PCB 4 /* protocol control block (smb) */ +#define M_UDFNODE 84 /* UDF inodes (udf)*/ +#define M_UDFMNT 85 /* UDF mount structures (udf)*/ + +#endif /* BSD_KERNEL_PRIVATE */ + +#ifdef BSD_KERNEL_PRIVATE + + +#define M_LAST 103 /* Must be last type + 1 */ /* Strings corresponding to types of memory */ /* Must be in synch with the #defines above */ @@ -194,7 +225,7 @@ "cred", /* 16 M_CRED */ \ "pgrp", /* 17 M_PGRP */ \ "session", /* 18 M_SESSION */ \ - "iov", /* 19 M_IOV */ \ + "iov32", /* 19 M_IOV32 */ \ "mount", /* 20 M_MOUNT */ \ "fhandle", /* 21 M_FHANDLE */ \ "NFS req", /* 22 M_NFSREQ */ \ @@ -213,7 +244,7 @@ "VM pvmap", /* 35 M_VMPVENT */ \ "VM pager", /* 36 M_VMPAGER */ \ "VM pgdata", /* 37 M_VMPGDATA */ \ - "file", /* 38 M_FILE */ \ + "fileproc", /* 38 M_FILEPROC */ \ "file desc", /* 39 M_FILEDESC */ \ "lockf", /* 40 
M_LOCKF */ \
 "proc", /* 41 M_PROC */ \
@@ -269,7 +300,15 @@
 "Journal", /* 91 M_JNL_JNL */\
 "Transaction", /* 92 M_JNL_TR */\
 "specinfo", /* 93 M_SPECINFO */\
- "kqueue" /* 94 M_KQUEUE */\
+ "kqueue", /* 94 M_KQUEUE */\
+ "HFS dirhint", /* 95 M_HFSDIRHINT */ \
+ "cluster_read", /* 96 M_CLRDAHEAD */ \
+ "cluster_write",/* 97 M_CLWRBEHIND */ \
+ "iov64", /* 98 M_IOV64 */ \
+ "fileglob", /* 99 M_FILEGLOB */ \
+ "kauth", /* 100 M_KAUTH */ \
+ "dummynet", /* 101 M_DUMMYNET */ \
+ "unsafe_fsnode" /* 102 M_UNSAFEFS */ \
 }
 struct kmemstats {
@@ -285,10 +324,9 @@ struct kmemstats {
 long ks_spare;
 };
-#ifdef KERNEL
-#ifdef __APPLE_API_PRIVATE
 extern struct kmemstats kmemstats[];
-#endif /* __APPLE_API_PRIVATE */
+
+#endif /* BSD_KERNEL_PRIVATE */
 /*
 * The malloc/free primitives used
@@ -306,24 +344,24 @@ extern struct kmemstats kmemstats[];
 #define FREE_ZONE(addr, size, type) \
 _FREE_ZONE((void *)addr, size, type)
-extern void *_MALLOC __P((
+extern void *_MALLOC(
 size_t size,
 int type,
- int flags));
+ int flags);
-extern void _FREE __P((
+extern void _FREE(
 void *addr,
- int type));
+ int type);
-extern void *_MALLOC_ZONE __P((
+extern void *_MALLOC_ZONE(
 size_t size,
 int type,
- int flags));
+ int flags);
-extern void _FREE_ZONE __P((
+extern void _FREE_ZONE(
 void *elem,
 size_t size,
- int type));
+ int type);
 #endif /* KERNEL */
diff --git a/bsd/sys/mbuf.h b/bsd/sys/mbuf.h
index 94efd165a..d7a417160 100644
--- a/bsd/sys/mbuf.h
+++ b/bsd/sys/mbuf.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
@@ -72,8 +72,13 @@
 #ifndef _SYS_MBUF_H_
 #define _SYS_MBUF_H_
+#include <sys/cdefs.h>
 #include <sys/appleapiopts.h>
+
+#ifdef KERNEL_PRIVATE
+
 #include <sys/lock.h>
+#include <sys/queue.h>
 /*
 * Mbufs are of a single size, MSIZE (machine/param.h), which
 * at least MINCLSIZE of data must be stored.
 */
-#ifdef __APPLE_API_UNSTABLE
 #define MLEN (MSIZE - sizeof(struct m_hdr)) /* normal data len */
 #define MHLEN (MLEN - sizeof(struct pkthdr)) /* data len w/pkthdr */
@@ -117,6 +121,16 @@ struct m_hdr {
 short mh_flags; /* flags; see below */
 };
+/*
+ * Packet tag structure (see below for details).
+ */ +struct m_tag { + SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ + u_int16_t m_tag_type; /* Module specific type */ + u_int16_t m_tag_len; /* Length of data */ + u_int32_t m_tag_id; /* Module ID */ +}; + /* record/packet header in first mbuf of chain; valid if M_PKTHDR set */ struct pkthdr { int len; /* total packet length */ @@ -133,11 +147,11 @@ struct pkthdr { struct mbuf *aux; /* extra data buffer; ipsec/others */ #ifdef KERNEL_PRIVATE u_short vlan_tag; /* VLAN tag, host byte order */ - u_short reserved_1; /* for future use */ + u_short socket_id; /* socket id */ #else KERNEL_PRIVATE - void *reserved1; /* for future use */ + u_int reserved1; /* for future use */ #endif KERNEL_PRIVATE - void *reserved2; /* for future use */ + SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ }; @@ -194,10 +208,12 @@ struct mbuf { #define M_FRAG 0x0400 /* packet is a fragment of a larger packet */ #define M_FIRSTFRAG 0x0800 /* packet is first fragment */ #define M_LASTFRAG 0x1000 /* packet is last fragment */ +#define M_PROMISC 0x2000 /* packet is promiscuous (shouldn't go to stack) */ /* flags copied when copying m_pkthdr */ -#define M_COPYFLAGS (M_PKTHDR|M_EOR|M_PROTO1|M_PROTO1|M_PROTO2|M_PROTO3 | \ - M_PROTO4|M_PROTO5|M_BCAST|M_MCAST|M_FRAG) +#define M_COPYFLAGS (M_PKTHDR|M_EOR|M_PROTO1|M_PROTO2|M_PROTO3 | \ + M_PROTO4|M_PROTO5|M_BCAST|M_MCAST|M_FRAG | \ + M_FIRSTFRAG|M_LASTFRAG|M_PROMISC) /* flags indicating hw checksum support and sw checksum requirements [freebsd4.1]*/ #define CSUM_IP 0x0001 /* will csum IP */ @@ -214,7 +230,6 @@ struct mbuf { #define CSUM_DELAY_DATA (CSUM_TCP | CSUM_UDP) #define CSUM_DELAY_IP (CSUM_IP) /* XXX add ipv6 here too? */ -#ifdef KERNEL_PRIVATE /* * Note: see also IF_HWASSIST_CSUM defined in <net/if_var.h> */ @@ -242,8 +257,11 @@ struct mbuf { #define MT_IFADDR 13 /* interface address */ #define MT_CONTROL 14 /* extra-data protocol message */ #define MT_OOBDATA 15 /* expedited data */ +#define MT_TAG 16 /* volatile metadata associated to pkts */ #define MT_MAX 32 /* enough? */ +#ifdef KERNEL_PRIVATE + /* flags to m_get/MGET */ /* Need to include malloc.h to get right options for malloc */ #include <sys/malloc.h> @@ -251,8 +269,6 @@ struct mbuf { #define M_DONTWAIT M_NOWAIT #define M_WAIT M_WAITOK -#ifdef __APPLE_API_PRIVATE - /* * mbuf utility macros: * @@ -261,14 +277,14 @@ struct mbuf { * drivers. 
 */
+#ifdef _KERN_LOCKS_H_
+extern lck_mtx_t * mbuf_mlock;
+#else
+extern void * mbuf_mlock;
+#endif
-extern
-decl_simple_lock_data(, mbuf_slock);
-#define MBUF_LOCK() usimple_lock(&mbuf_slock);
-#define MBUF_UNLOCK() usimple_unlock(&mbuf_slock);
-#define MBUF_LOCKINIT() simple_lock_init(&mbuf_slock);
-
-#endif /* __APPLE_API_PRIVATE */
+#define MBUF_LOCK() lck_mtx_lock(mbuf_mlock);
+#define MBUF_UNLOCK() lck_mtx_unlock(mbuf_mlock);
 /*
 * mbuf allocation/deallocation macros:
@@ -287,11 +303,7 @@ decl_simple_lock_data(, mbuf_slock);
 #define MCHECK(m)
 #endif
-#ifdef __APPLE_API_PRIVATE
 extern struct mbuf *mfree; /* mbuf free list */
-extern simple_lock_data_t mbuf_slock;
-#endif /* __APPLE_API_PRIVATE */
-
 #define MGET(m, how, type) ((m) = m_get((how), (type)))
@@ -320,6 +332,16 @@ union mcluster {
 #define MCLGET(m, how) ((m) = m_mclget(m, how))
+/*
+ * Mbuf big cluster
+ */
+
+union mbigcluster {
+ union mbigcluster *mbc_next;
+ char mbc_buf[NBPG];
+};
+
+
 #define MCLHASREFERENCE(m) m_mclhasreference(m)
 /*
@@ -381,10 +403,35 @@ union mcluster {
 /* compatibility with 4.3 */
 #define m_copy(m, o, l) m_copym((m), (o), (l), M_DONTWAIT)
+#endif /* KERNEL_PRIVATE */
+
 /*
 * Mbuf statistics.
 */
+/* LP64todo - not 64-bit safe */
 struct mbstat {
+ u_long m_mbufs; /* mbufs obtained from page pool */
+ u_long m_clusters; /* clusters obtained from page pool */
+ u_long m_spare; /* spare field */
+ u_long m_clfree; /* free clusters */
+ u_long m_drops; /* times failed to find space */
+ u_long m_wait; /* times waited for space */
+ u_long m_drain; /* times drained protocols for space */
+ u_short m_mtypes[256]; /* type specific mbuf allocations */
+ u_long m_mcfail; /* times m_copym failed */
+ u_long m_mpfail; /* times m_pullup failed */
+ u_long m_msize; /* length of an mbuf */
+ u_long m_mclbytes; /* length of an mbuf cluster */
+ u_long m_minclsize; /* min length of data to allocate a cluster */
+ u_long m_mlen; /* length of data in an mbuf */
+ u_long m_mhlen; /* length of data in a header mbuf */
+ u_long m_bigclusters; /* clusters obtained from page pool */
+ u_long m_bigclfree; /* free clusters */
+ u_long m_bigmclbytes; /* length of an mbuf cluster */
+};
+
+/* Compatibility with 10.3 */
+struct ombstat {
 u_long m_mbufs; /* mbufs obtained from page pool */
 u_long m_clusters; /* clusters obtained from page pool */
 u_long m_spare; /* spare field */
@@ -401,6 +448,7 @@ struct mbstat {
 u_long m_mlen; /* length of data in an mbuf */
 u_long m_mhlen; /* length of data in a header mbuf */
 };
+#ifdef KERNEL_PRIVATE
 /*
 * pkthdr.aux type tags.
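For context between these two mbuf.h hunks: the allocation macros above are thin wrappers (MGET around m_get(), MCLGET around m_mclget()), and callers are expected to check the result themselves. A minimal sketch of conventional usage under that assumption; the helper name alloc_mbuf_space is hypothetical, not part of this header:

    #include <sys/mbuf.h>

    /* Sketch: allocate an mbuf and attach a cluster when the payload will
     * not fit in MLEN. MCLGET signals failure by leaving M_EXT clear in
     * m_flags rather than by returning NULL. */
    static struct mbuf *
    alloc_mbuf_space(int len)
    {
        struct mbuf *m;

        MGET(m, M_DONTWAIT, MT_DATA);           /* (m) = m_get(how, type) */
        if (m == NULL)
            return (NULL);
        if (len > MLEN) {
            MCLGET(m, M_DONTWAIT);              /* (m) = m_mclget(m, how) */
            if ((m->m_flags & M_EXT) == 0) {
                (void) m_free(m);               /* cluster attach failed */
                return (NULL);
            }
        }
        m->m_len = len;
        return (m);
    }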
@@ -423,50 +471,136 @@ extern int max_protohdr; /* largest protocol header */ extern int max_hdr; /* largest link+protocol header */ extern int max_datalen; /* MHLEN - max_hdr */ -struct mbuf *m_copym __P((struct mbuf *, int, int, int)); -struct mbuf *m_split __P((struct mbuf *, int, int)); -struct mbuf *m_free __P((struct mbuf *)); -struct mbuf *m_get __P((int, int)); -struct mbuf *m_getpacket __P((void)); -struct mbuf *m_getclr __P((int, int)); -struct mbuf *m_gethdr __P((int, int)); -struct mbuf *m_prepend __P((struct mbuf *, int, int)); -struct mbuf *m_prepend_2 __P((struct mbuf *, int, int)); -struct mbuf *m_pullup __P((struct mbuf *, int)); -struct mbuf *m_retry __P((int, int)); -struct mbuf *m_retryhdr __P((int, int)); -void m_adj __P((struct mbuf *, int)); -int m_clalloc __P((int, int)); -void m_freem __P((struct mbuf *)); -int m_freem_list __P((struct mbuf *)); -struct mbuf *m_devget __P((char *, int, int, struct ifnet *, void (*)())); -char *mcl_to_paddr __P((char *)); -struct mbuf *m_pulldown __P((struct mbuf*, int, int, int*)); -struct mbuf *m_aux_add __P((struct mbuf *, int, int)); -struct mbuf *m_aux_find __P((struct mbuf *, int, int)); -void m_aux_delete __P((struct mbuf *, struct mbuf *)); - -struct mbuf *m_mclget __P((struct mbuf *, int)); -caddr_t m_mclalloc __P((int)); -void m_mclfree __P((caddr_t p)); -int m_mclhasreference __P((struct mbuf *)); -void m_copy_pkthdr __P((struct mbuf *, struct mbuf*)); - -int m_mclref __P((struct mbuf *)); -int m_mclunref __P((struct mbuf *)); - -void * m_mtod __P((struct mbuf *)); -struct mbuf * m_dtom __P((void *)); -int m_mtocl __P((void *)); -union mcluster *m_cltom __P((int )); - -int m_trailingspace __P((struct mbuf *)); -int m_leadingspace __P((struct mbuf *)); - -void m_mchtype __P((struct mbuf *m, int t)); - -void m_mcheck __P((struct mbuf*)); +__BEGIN_DECLS +struct mbuf *m_copym(struct mbuf *, int, int, int); +struct mbuf *m_split(struct mbuf *, int, int); +struct mbuf *m_free(struct mbuf *); +struct mbuf *m_get(int, int); +struct mbuf *m_getpacket(void); +struct mbuf *m_getclr(int, int); +struct mbuf *m_gethdr(int, int); +struct mbuf *m_prepend(struct mbuf *, int, int); +struct mbuf *m_prepend_2(struct mbuf *, int, int); +struct mbuf *m_pullup(struct mbuf *, int); +struct mbuf *m_retry(int, int); +struct mbuf *m_retryhdr(int, int); +void m_adj(struct mbuf *, int); +void m_freem(struct mbuf *); +int m_freem_list(struct mbuf *); +struct mbuf *m_devget(char *, int, int, struct ifnet *, void (*)(const void *, void *, size_t)); +char *mcl_to_paddr(char *); +struct mbuf *m_pulldown(struct mbuf*, int, int, int*); +struct mbuf *m_aux_add(struct mbuf *, int, int); +struct mbuf *m_aux_find(struct mbuf *, int, int); +void m_aux_delete(struct mbuf *, struct mbuf *); + +struct mbuf *m_mclget(struct mbuf *, int); +caddr_t m_mclalloc(int); +void m_mclfree(caddr_t p); +int m_mclhasreference(struct mbuf *); +void m_copy_pkthdr(struct mbuf *, struct mbuf*); + +int m_mclref(struct mbuf *); +int m_mclunref(struct mbuf *); + +void * m_mtod(struct mbuf *); +struct mbuf * m_dtom(void *); +int m_mtocl(void *); +union mcluster *m_cltom(int ); + +int m_trailingspace(struct mbuf *); +int m_leadingspace(struct mbuf *); + +void m_mchtype(struct mbuf *m, int t); +void m_mcheck(struct mbuf*); + +void m_copyback(struct mbuf *, int , int , caddr_t); +void m_copydata(struct mbuf *, int , int , caddr_t); +struct mbuf* m_dup(struct mbuf *m, int how); +void m_cat(struct mbuf *, struct mbuf *); +struct mbuf *m_copym_with_hdrs(struct mbuf*, int, int, int, struct 
mbuf**, int*); +struct mbuf *m_getpackets(int, int, int); +struct mbuf * m_getpackethdrs(int , int ); +struct mbuf* m_getpacket_how(int ); +struct mbuf * m_getpackets_internal(unsigned int *, int , int , int , size_t); +struct mbuf * m_allocpacket_internal(unsigned int * , size_t , unsigned int *, int , int , size_t ); + +__END_DECLS +/* + Packets may have annotations attached by affixing a list of "packet + tags" to the pkthdr structure. Packet tags are dynamically allocated + semi-opaque data structures that have a fixed header (struct m_tag) + that specifies the size of the memory block and an <id,type> pair that + identifies it. The id identifies the module and the type identifies the + type of data for that module. The id of zero is reserved for the kernel. + + Note that the packet tag returned by m_tag_allocate has the default + memory alignment implemented by malloc. To reference private data one + can use a construct like: + + struct m_tag *mtag = m_tag_allocate(...); + struct foo *p = (struct foo *)(mtag+1); + + if the alignment of struct m_tag is sufficient for referencing members + of struct foo. Otherwise it is necessary to embed struct m_tag within + the private data structure to insure proper alignment; e.g. + + struct foo { + struct m_tag tag; + ... + }; + struct foo *p = (struct foo *) m_tag_allocate(...); + struct m_tag *mtag = &p->tag; + */ + +#define KERNEL_MODULE_TAG_ID 0 + +enum { + KERNEL_TAG_TYPE_NONE = 0, + KERNEL_TAG_TYPE_DUMMYNET = 1, + KERNEL_TAG_TYPE_DIVERT = 2, + KERNEL_TAG_TYPE_IPFORWARD = 3, + KERNEL_TAG_TYPE_IPFILT = 4 +}; + +/* + * As a temporary and low impact solution to replace the even uglier + * approach used so far in some parts of the network stack (which relies + * on global variables), packet tag-like annotations are stored in MT_TAG + * mbufs (or lookalikes) prepended to the actual mbuf chain. + * + * m_type = MT_TAG + * m_flags = m_tag_id + * m_next = next buffer in chain. + * + * BE VERY CAREFUL not to pass these blocks to the mbuf handling routines. 
+ */ +#define _m_tag_id m_hdr.mh_flags + +__BEGIN_DECLS + +/* Packet tag routines */ +struct m_tag *m_tag_alloc(u_int32_t id, u_int16_t type, int len, int wait); +void m_tag_free(struct m_tag *); +void m_tag_prepend(struct mbuf *, struct m_tag *); +void m_tag_unlink(struct mbuf *, struct m_tag *); +void m_tag_delete(struct mbuf *, struct m_tag *); +void m_tag_delete_chain(struct mbuf *, struct m_tag *); +struct m_tag *m_tag_locate(struct mbuf *,u_int32_t id, u_int16_t type, + struct m_tag *); +struct m_tag *m_tag_copy(struct m_tag *, int wait); +int m_tag_copy_chain(struct mbuf *to, struct mbuf *from, int wait); +void m_tag_init(struct mbuf *); +struct m_tag *m_tag_first(struct mbuf *); +struct m_tag *m_tag_next(struct mbuf *, struct m_tag *); + +__END_DECLS + +#endif /* KERNEL */ + +#endif /* KERNEL_PRIVATE */ +#ifdef KERNEL +#include <sys/kpi_mbuf.h> #endif -#endif /* __APPLE_API_UNSTABLE */ #endif /* !_SYS_MBUF_H_ */ diff --git a/bsd/sys/md5.h b/bsd/sys/md5.h index 5ae59a40b..f825f0aaa 100644 --- a/bsd/sys/md5.h +++ b/bsd/sys/md5.h @@ -47,7 +47,7 @@ char * MD5End(MD5_CTX *, char *); char * MD5File(const char *, char *); char * MD5Data(const unsigned char *, unsigned int, char *); #ifdef KERNEL -void MD5Transform __P((u_int32_t [4], const unsigned char [64])); +void MD5Transform(u_int32_t [4], const unsigned char [64]); #endif __END_DECLS #endif /* !KERNEL || __APPLE_API_PRIVATE */ diff --git a/bsd/sys/mman.h b/bsd/sys/mman.h index 7907a55b7..aeaab5a7f 100644 --- a/bsd/sys/mman.h +++ b/bsd/sys/mman.h @@ -55,127 +55,185 @@ * @(#)mman.h 8.1 (Berkeley) 6/2/93 */ +/* + * Currently unsupported: + * + * [TYM] POSIX_TYPED_MEM_ALLOCATE + * [TYM] POSIX_TYPED_MEM_ALLOCATE_CONTIG + * [TYM] POSIX_TYPED_MEM_MAP_ALLOCATABLE + * [TYM] struct posix_typed_mem_info + * [TYM] posix_mem_offset() + * [TYM] posix_typed_mem_get_info() + * [TYM] posix_typed_mem_open() + */ + #ifndef _SYS_MMAN_H_ #define _SYS_MMAN_H_ #include <sys/appleapiopts.h> -#include <mach/shared_memory_server.h> +#include <sys/cdefs.h> + +#include <sys/_types.h> + +/* + * [various] The mode_t, off_t, and size_t types shall be defined as + * described in <sys/types.h> + */ +#ifndef _MODE_T +typedef __darwin_mode_t mode_t; +#define _MODE_T +#endif + +#ifndef _OFF_T +typedef __darwin_off_t off_t; +#define _OFF_T +#endif + +#ifndef _SIZE_T +#define _SIZE_T +typedef __darwin_size_t size_t; +#endif + /* * Protections are chosen from these bits, or-ed together */ -#define PROT_NONE 0x00 /* no permissions */ -#define PROT_READ 0x01 /* pages can be read */ -#define PROT_WRITE 0x02 /* pages can be written */ -#define PROT_EXEC 0x04 /* pages can be executed */ +#define PROT_NONE 0x00 /* [MC2] no permissions */ +#define PROT_READ 0x01 /* [MC2] pages can be read */ +#define PROT_WRITE 0x02 /* [MC2] pages can be written */ +#define PROT_EXEC 0x04 /* [MC2] pages can be executed */ /* * Flags contain sharing type and options. * Sharing types; choose one. 
*/ -#define MAP_SHARED 0x0001 /* share changes */ -#define MAP_PRIVATE 0x0002 /* changes are private */ +#define MAP_SHARED 0x0001 /* [MF|SHM] share changes */ +#define MAP_PRIVATE 0x0002 /* [MF|SHM] changes are private */ +#ifndef _POSIX_C_SOURCE #define MAP_COPY MAP_PRIVATE /* Obsolete */ +#endif /* !_POSIX_C_SOURCE */ /* * Other flags */ -#define MAP_FIXED 0x0010 /* map addr must be exactly as requested */ +#define MAP_FIXED 0x0010 /* [MF|SHM] interpret addr exactly */ +#ifndef _POSIX_C_SOURCE #define MAP_RENAME 0x0020 /* Sun: rename private pages to file */ #define MAP_NORESERVE 0x0040 /* Sun: don't reserve needed swap area */ #define MAP_RESERVED0080 0x0080 /* previously unimplemented MAP_INHERIT */ #define MAP_NOEXTEND 0x0100 /* for MAP_FILE, don't change file size */ #define MAP_HASSEMAPHORE 0x0200 /* region may contain semaphores */ +#endif /* !_POSIX_C_SOURCE */ -#ifdef _P1003_1B_VISIBLE /* * Process memory locking */ -#define MCL_CURRENT 0x0001 /* Lock only current memory */ -#define MCL_FUTURE 0x0002 /* Lock all future memory as well */ - -#endif /* _P1003_1B_VISIBLE */ +#define MCL_CURRENT 0x0001 /* [ML] Lock only current memory */ +#define MCL_FUTURE 0x0002 /* [ML] Lock all future memory as well */ /* * Error return from mmap() */ -#define MAP_FAILED ((void *)-1) +#define MAP_FAILED ((void *)-1) /* [MF|SHM] mmap failed */ /* * msync() flags */ -#define MS_SYNC 0x0000 /* msync synchronously */ -#define MS_ASYNC 0x0001 /* return immediately */ -#define MS_INVALIDATE 0x0002 /* invalidate all cached data */ +#define MS_ASYNC 0x0001 /* [MF|SIO] return immediately */ +#define MS_INVALIDATE 0x0002 /* [MF|SIO] invalidate all cached data */ +#define MS_SYNC 0x0010 /* [MF|SIO] msync synchronously */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define MS_KILLPAGES 0x0004 /* invalidate pages, leave mapped */ #define MS_DEACTIVATE 0x0008 /* deactivate pages, leave mapped */ -#endif /* * Mapping type */ #define MAP_FILE 0x0000 /* map from file (default) */ #define MAP_ANON 0x1000 /* allocated from memory, swap space */ +#endif /* !_POSIX_C_SOURCE */ + /* * Advice to madvise */ -#define MADV_NORMAL 0 /* no further special treatment */ -#define MADV_RANDOM 1 /* expect random page references */ -#define MADV_SEQUENTIAL 2 /* expect sequential page references */ -#define MADV_WILLNEED 3 /* will need these pages */ -#define MADV_DONTNEED 4 /* dont need these pages */ -#define MADV_FREE 5 /* dont need these pages, and junk contents */ -#define POSIX_MADV_NORMAL MADV_NORMAL -#define POSIX_MADV_RANDOM MADV_RANDOM -#define POSIX_MADV_SEQUENTIAL MADV_SEQUENTIAL -#define POSIX_MADV_WILLNEED MADV_WILLNEED -#define POSIX_MADV_DONTNEED MADV_DONTNEED +#define POSIX_MADV_NORMAL 0 /* [MC1] no further special treatment */ +#define POSIX_MADV_RANDOM 1 /* [MC1] expect random page refs */ +#define POSIX_MADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */ +#define POSIX_MADV_WILLNEED 3 /* [MC1] will need these pages */ +#define POSIX_MADV_DONTNEED 4 /* [MC1] dont need these pages */ + +#ifndef _POSIX_C_SOURCE +#define MADV_NORMAL POSIX_MADV_NORMAL +#define MADV_RANDOM POSIX_MADV_RANDOM +#define MADV_SEQUENTIAL POSIX_MADV_SEQUENTIAL +#define MADV_WILLNEED POSIX_MADV_WILLNEED +#define MADV_DONTNEED POSIX_MADV_DONTNEED +#define MADV_FREE 5 /* pages unneeded, discard contents */ /* * Return bits from mincore */ -#define MINCORE_INCORE 0x1 /* Page is incore */ -#define MINCORE_REFERENCED 0x2 /* Page has been referenced by us */ -#define MINCORE_MODIFIED 0x4 /* Page has been modified by us */ -#define 
MINCORE_REFERENCED_OTHER 0x8 /* Page has been referenced */ -#define MINCORE_MODIFIED_OTHER 0x10 /* Page has been modified */ +#define MINCORE_INCORE 0x1 /* Page is incore */ +#define MINCORE_REFERENCED 0x2 /* Page has been referenced by us */ +#define MINCORE_MODIFIED 0x4 /* Page has been modified by us */ +#define MINCORE_REFERENCED_OTHER 0x8 /* Page has been referenced */ +#define MINCORE_MODIFIED_OTHER 0x10 /* Page has been modified */ +#endif /* !_POSIX_C_SOURCE */ -#ifndef KERNEL -#include <sys/cdefs.h> +#ifndef KERNEL __BEGIN_DECLS -#ifdef _P1003_1B_VISIBLE -int mlockall __P((int)); -int munlockall __P((void)); -#endif /* _P1003_1B_VISIBLE */ -int mlock __P((const void *, size_t)); -#ifndef _MMAP_DECLARED -#define _MMAP_DECLARED -void * mmap __P((void *, size_t, int, int, int, off_t)); +/* [ML] */ +int mlockall(int); +int munlockall(void); +/* [MR] */ +int mlock(const void *, size_t); +#ifndef _MMAP +#define _MMAP +/* [MC3]*/ +void * mmap(void *, size_t, int, int, int, off_t) __DARWIN_ALIAS(mmap); #endif -int mprotect __P((const void *, size_t, int)); -int msync __P((void *, size_t, int)); -int munlock __P((const void *, size_t)); -int munmap __P((void *, size_t)); -int shm_open __P((const char *, int, ...)); -int shm_unlink __P((const char *)); -int posix_madvise __P((void *, size_t, int)); -#ifndef _POSIX_SOURCE -#ifdef __APPLE_API_PRIVATE -int load_shared_file __P((char *, caddr_t, u_long, - caddr_t *, int, sf_mapping_t *, int *)); -int reset_shared_file __P((caddr_t *, int, sf_mapping_t *)); -int new_system_shared_regions __P((void)); -#endif /* __APPLE_API_PRIVATE */ -int madvise __P((void *, size_t, int)); -int mincore __P((const void *, size_t, char *)); -int minherit __P((void *, size_t, int)); +/* [MPR] */ +int mprotect(void *, size_t, int) __DARWIN_ALIAS(mprotect); +/* [MF|SIO] */ +int msync(void *, size_t, int) __DARWIN_ALIAS(msync); +/* [MR] */ +int munlock(const void *, size_t); +/* [MC3]*/ +int munmap(void *, size_t) __DARWIN_ALIAS(munmap); +/* [SHM] */ +int shm_open(const char *, int, ...); +int shm_unlink(const char *); +/* [ADV] */ +int posix_madvise(void *, size_t, int); + +#ifndef _POSIX_C_SOURCE +int madvise(void *, size_t, int); +int mincore(const void *, size_t, char *); +int minherit(void *, size_t, int); #endif __END_DECLS -#endif /* !KERNEL */ +#else /* KERNEL */ + +void pshm_cache_init(void); /* for bsd_init() */ + +/* + * XXX routine exported by posix_shm.c, but never used there, only used in + * XXX kern_mman.c in the implementation of mmap(). + */ +struct mmap_args; +struct fileproc; +int pshm_mmap(struct proc *p, struct mmap_args *uap, user_addr_t *retval, + struct fileproc *fp, off_t pageoff); +/* Really need to overhaul struct fileops to avoid this... */ +struct pshmnode; +int pshm_stat(struct pshmnode *pnode, struct stat *sb); +struct fileproc; +int pshm_truncate(struct proc *p, struct fileproc *fp, int fd, off_t length, register_t *retval); + +#endif /* KERNEL */ #endif /* !_SYS_MMAN_H_ */ diff --git a/bsd/sys/mount.h b/bsd/sys/mount.h index 539742c58..58d5cc0ea 100644 --- a/bsd/sys/mount.h +++ b/bsd/sys/mount.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
 *
 * @APPLE_LICENSE_HEADER_START@
 *
@@ -59,28 +59,19 @@ #define _SYS_MOUNT_H_
 #include <sys/appleapiopts.h>
+#include <sys/cdefs.h>
+#include <sys/attr.h> /* needed for vol_capabilities_attr_t */
+
 #ifndef KERNEL
+#include <stdint.h>
 #include <sys/ucred.h>
+#include <sys/queue.h> /* XXX needed for user builds */
+#else
+#include <sys/kernel_types.h>
 #endif
-#include <sys/queue.h>
-#include <sys/lock.h>
-#include <net/radix.h>
-#include <sys/socket.h> /* XXX for AF_MAX */
 typedef struct fsid { int32_t val[2]; } fsid_t; /* file system id type */
-/*
- * File identifier.
- * These are unique per filesystem on a single machine.
- */
-#define MAXFIDSZ 16
-
-struct fid {
- u_short fid_len; /* length of data in bytes */
- u_short fid_reserved; /* force longword alignment */
- char fid_data[MAXFIDSZ]; /* data (variable length) */
-};
-
 /*
 * file system statistics
 */
@@ -88,6 +79,9 @@ struct fid {
 #define MFSNAMELEN 15 /* length of fs type name, not inc. null */
 #define MNAMELEN 90 /* length of buffer for returned name */
+/*
+ * LP64 - WARNING - must be kept in sync with struct user_statfs in mount_internal.h.
+ */
 struct statfs {
 short f_otype; /* TEMPORARY SHADOW COPY OF f_type */
 short f_oflags; /* TEMPORARY SHADOW COPY OF f_flags */
@@ -116,38 +110,116 @@ struct statfs {
 #endif
 };
-#ifdef __APPLE_API_PRIVATE
+
+#define MFSTYPENAMELEN 16 /* length of fs type name including null */
+
+#if __DARWIN_ALIGN_POWER
+#pragma options align=power
+#endif
+
+struct vfsstatfs {
+ uint32_t f_bsize; /* fundamental file system block size */
+ size_t f_iosize; /* optimal transfer block size */
+ uint64_t f_blocks; /* total data blocks in file system */
+ uint64_t f_bfree; /* free blocks in fs */
+ uint64_t f_bavail; /* free blocks avail to non-superuser */
+ uint64_t f_bused; /* blocks in use */
+ uint64_t f_files; /* total file nodes in file system */
+ uint64_t f_ffree; /* free file nodes in fs */
+ fsid_t f_fsid; /* file system id */
+ uid_t f_owner; /* user that mounted the filesystem */
+ uint64_t f_flags; /* copy of mount exported flags */
+ char f_fstypename[MFSTYPENAMELEN];/* fs type name (including null) */
+ char f_mntonname[MAXPATHLEN];/* directory on which mounted */
+ char f_mntfromname[MAXPATHLEN];/* mounted filesystem */
+ uint32_t f_fssubtype; /* fs sub-type (flavor) */
+ void *f_reserved[2]; /* For future use == 0 */
+};
+
+#if __DARWIN_ALIGN_POWER
+#pragma options align=reset
+#endif
+
+#define VFSATTR_INIT(s) ((s)->f_supported = (s)->f_active = 0LL)
+#define VFSATTR_SET_SUPPORTED(s, a) ((s)->f_supported |= VFSATTR_ ## a)
+#define VFSATTR_IS_SUPPORTED(s, a) ((s)->f_supported & VFSATTR_ ## a)
+#define VFSATTR_CLEAR_ACTIVE(s, a) ((s)->f_active &= ~VFSATTR_ ## a)
+#define VFSATTR_IS_ACTIVE(s, a) ((s)->f_active & VFSATTR_ ## a)
+#define VFSATTR_ALL_SUPPORTED(s) (((s)->f_active & (s)->f_supported) == (s)->f_active)
+#define VFSATTR_WANTED(s, a) ((s)->f_active |= VFSATTR_ ## a)
+#define VFSATTR_RETURN(s, a, x) do { (s)-> a = (x); VFSATTR_SET_SUPPORTED(s, a);} while(0)
+
+#define VFSATTR_f_objcount (1LL<< 0)
+#define VFSATTR_f_filecount (1LL<< 1)
+#define VFSATTR_f_dircount (1LL<< 2)
+#define VFSATTR_f_maxobjcount (1LL<< 3)
+#define VFSATTR_f_bsize (1LL<< 4)
+#define VFSATTR_f_iosize (1LL<< 5)
+#define VFSATTR_f_blocks (1LL<< 6)
+#define VFSATTR_f_bfree (1LL<< 7)
+#define VFSATTR_f_bavail (1LL<< 8)
+#define VFSATTR_f_bused (1LL<< 9)
+#define VFSATTR_f_files (1LL<< 10)
+#define VFSATTR_f_ffree (1LL<< 11)
+#define VFSATTR_f_fsid (1LL<< 12)
+#define VFSATTR_f_owner (1LL<< 13)
+#define
VFSATTR_f_capabilities (1LL<< 14) +#define VFSATTR_f_attributes (1LL<< 15) +#define VFSATTR_f_create_time (1LL<< 16) +#define VFSATTR_f_modify_time (1LL<< 17) +#define VFSATTR_f_access_time (1LL<< 18) +#define VFSATTR_f_backup_time (1LL<< 19) +#define VFSATTR_f_fssubtype (1LL<< 20) +#define VFSATTR_f_vol_name (1LL<< 21) +#define VFSATTR_f_signature (1LL<< 22) +#define VFSATTR_f_carbon_fsid (1LL<< 23) + /* - * Structure per mounted file system. Each mounted file system has an - * array of operations and an instance record. The file systems are - * put on a doubly linked list. + * New VFS_STAT argument structure. */ -LIST_HEAD(vnodelst, vnode); - -struct mount { - CIRCLEQ_ENTRY(mount) mnt_list; /* mount list */ - struct vfsops *mnt_op; /* operations on fs */ - struct vfsconf *mnt_vfc; /* configuration info */ - struct vnode *mnt_vnodecovered; /* vnode we mounted on */ - struct vnodelst mnt_vnodelist; /* list of vnodes this mount */ - struct lock__bsd__ mnt_lock; /* mount structure lock */ - int mnt_flag; /* flags */ - int mnt_kern_flag; /* kernel only flags */ - int mnt_maxsymlinklen; /* max size of short symlink */ - struct statfs mnt_stat; /* cache of filesystem stats */ - qaddr_t mnt_data; /* private data */ - /* Cached values of the IO constraints for the device */ - union { - u_int32_t mntu_maxreadcnt; /* Max. byte count for read */ - void *mntu_xinfo_ptr; /* points at extended IO constraints */ - } mnt_un; /* if MNTK_IO_XINFO is set */ -#define mnt_maxreadcnt mnt_un.mntu_maxreadcnt -#define mnt_xinfo_ptr mnt_un.mntu_xinfo_ptr - u_int32_t mnt_maxwritecnt; /* Max. byte count for write */ - u_int16_t mnt_segreadcnt; /* Max. segment count for read */ - u_int16_t mnt_segwritecnt; /* Max. segment count for write */ +#if __DARWIN_ALIGN_POWER +#pragma options align=power +#endif + +struct vfs_attr { + uint64_t f_supported; + uint64_t f_active; + + uint64_t f_objcount; /* number of filesystem objects in volume */ + uint64_t f_filecount; /* ... files */ + uint64_t f_dircount; /* ... directories */ + uint64_t f_maxobjcount; /* maximum number of filesystem objects */ + + uint32_t f_bsize; /* block size for the below size values */ + size_t f_iosize; /* optimal transfer block size */ + uint64_t f_blocks; /* total data blocks in file system */ + uint64_t f_bfree; /* free blocks in fs */ + uint64_t f_bavail; /* free blocks avail to non-superuser */ + uint64_t f_bused; /* blocks in use */ + uint64_t f_files; /* total file nodes in file system */ + uint64_t f_ffree; /* free file nodes in fs */ + fsid_t f_fsid; /* file system id */ + uid_t f_owner; /* user that mounted the filesystem */ + + vol_capabilities_attr_t f_capabilities; + vol_attributes_attr_t f_attributes; + + struct timespec f_create_time; /* creation time */ + struct timespec f_modify_time; /* last modification time */ + struct timespec f_access_time; /* time of last access */ + struct timespec f_backup_time; /* last backup time */ + + uint32_t f_fssubtype; /* filesystem subtype */ + + char *f_vol_name; /* volume name */ + + uint16_t f_signature; /* used for ATTR_VOL_SIGNATURE, Carbon's FSVolumeInfo.signature */ + uint16_t f_carbon_fsid; /* same as Carbon's FSVolumeInfo.filesystemID */ }; -#endif /* __APPLE_API_PRIVATE */ + +#if __DARWIN_ALIGN_POWER +#pragma options align=reset +#endif /* * User specifiable flags. 
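Between the two mount.h hunks, it is worth spelling out the request/response protocol the VFSATTR_* macros above encode: the VFS marks the attributes it wants in f_active, and the filesystem answers with VFSATTR_RETURN, which stores the value and sets the matching f_supported bit. A hedged sketch of the filesystem side, using the vfs_getattr signature from the vfsops table later in this header; the myfs name and the constants are hypothetical:

    /* Sketch: answer only the attributes the caller marked active; each
     * VFSATTR_RETURN records the answer in f_supported, so the VFS can
     * check VFSATTR_ALL_SUPPORTED() afterwards. */
    static int
    myfs_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, vfs_context_t ctx)
    {
        if (VFSATTR_IS_ACTIVE(fsap, f_bsize))
            VFSATTR_RETURN(fsap, f_bsize, 4096);    /* made-up block size */
        if (VFSATTR_IS_ACTIVE(fsap, f_objcount))
            VFSATTR_RETURN(fsap, f_objcount, 1000); /* made-up object count */
        /* anything left unanswered keeps its f_supported bit clear */
        return (0);
    }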
@@ -162,18 +234,20 @@ struct mount { #define MNT_UNION 0x00000020 /* union with underlying filesystem */ #define MNT_ASYNC 0x00000040 /* file system written asynchronously */ #define MNT_DONTBROWSE 0x00100000 /* file system is not appropriate path to user data */ -#define MNT_UNKNOWNPERMISSIONS 0x00200000 /* no known mapping for uid/gid in permissions information on disk */ +#define MNT_IGNORE_OWNERSHIP 0x00200000 /* VFS will ignore ownership information on filesystem + * objects */ #define MNT_AUTOMOUNTED 0x00400000 /* filesystem was mounted by automounter */ #define MNT_JOURNALED 0x00800000 /* filesystem is journaled */ +#define MNT_NOUSERXATTR 0x01000000 /* Don't allow user extended attributes */ +#define MNT_DEFWRITE 0x02000000 /* filesystem should defer writes */ + +/* backwards compatibility only */ +#define MNT_UNKNOWNPERMISSIONS MNT_IGNORE_OWNERSHIP /* * NFS export related mount flags. */ -#define MNT_EXRDONLY 0x00000080 /* exported read only */ #define MNT_EXPORTED 0x00000100 /* file system is exported */ -#define MNT_DEFEXPORTED 0x00000200 /* exported to the world */ -#define MNT_EXPORTANON 0x00000400 /* use anon uid mapping for everyone */ -#define MNT_EXKERB 0x00000800 /* exported with Kerberos uid mapping */ /* * Flags set by internal operations. @@ -182,7 +256,6 @@ struct mount { #define MNT_QUOTA 0x00002000 /* quotas are enabled on filesystem */ #define MNT_ROOTFS 0x00004000 /* identifies the root filesystem */ #define MNT_DOVOLFS 0x00008000 /* FS supports volfs */ -#define MNT_FIXEDSCRIPTENCODING 0x10000000 /* FS supports only fixed script encoding [HFS] */ /* * XXX I think that this could now become (~(MNT_CMDFLAGS)) @@ -190,11 +263,11 @@ struct mount { */ #define MNT_VISFLAGMASK (MNT_RDONLY | MNT_SYNCHRONOUS | MNT_NOEXEC | \ MNT_NOSUID | MNT_NODEV | MNT_UNION | \ - MNT_ASYNC | MNT_EXRDONLY | MNT_EXPORTED | \ - MNT_DEFEXPORTED | MNT_EXPORTANON| MNT_EXKERB | \ + MNT_ASYNC | MNT_EXPORTED | \ MNT_LOCAL | MNT_QUOTA | \ MNT_ROOTFS | MNT_DOVOLFS | MNT_DONTBROWSE | \ - MNT_UNKNOWNPERMISSIONS | MNT_AUTOMOUNTED | MNT_JOURNALED | MNT_FIXEDSCRIPTENCODING ) + MNT_UNKNOWNPERMISSIONS | MNT_AUTOMOUNTED | MNT_JOURNALED | \ + MNT_DEFWRITE) /* * External filesystem command modifier flags. * Unmount can use the MNT_FORCE flag. @@ -202,28 +275,12 @@ struct mount { * External filesystem control flags. */ #define MNT_UPDATE 0x00010000 /* not a real mount, just an update */ -#define MNT_DELEXPORT 0x00020000 /* delete export host lists */ #define MNT_RELOAD 0x00040000 /* reload filesystem data */ #define MNT_FORCE 0x00080000 /* force unmount or readonly change */ -#define MNT_CMDFLAGS (MNT_UPDATE|MNT_DELEXPORT|MNT_RELOAD|MNT_FORCE) +#define MNT_CMDFLAGS (MNT_UPDATE|MNT_RELOAD|MNT_FORCE) + + -/* - * Internal filesystem control flags stored in mnt_kern_flag. - * - * MNTK_UNMOUNT locks the mount entry so that name lookup cannot proceed - * past the mount point. This keeps the subtree stable during mounts - * and unmounts. - */ -#define MNTK_VIRTUALDEV 0x00200000 /* mounted on a virtual device i.e. 
a disk image */ -#define MNTK_ROOTDEV 0x00400000 /* this filesystem resides on the same device as the root */ -#define MNTK_IO_XINFO 0x00800000 /* mnt_un.mntu_ioptr has a malloc associated with it */ -#define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ -#define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ -#define MNTK_WANTRDWR 0x04000000 /* upgrade to read/write requested */ -#if REV_ENDIAN_FS -#define MNT_REVEND 0x08000000 /* Reverse endian FS */ -#endif /* REV_ENDIAN_FS */ -#define MNTK_FRCUNMOUNT 0x10000000 /* Forced unmount wanted. */ /* * Sysctl CTL_VFS definitions. * @@ -240,8 +297,7 @@ struct mount { #define VFS_MAXTYPENUM 1 /* int: highest defined filesystem type */ #define VFS_CONF 2 /* struct: vfsconf for filesystem given as next argument */ -#define VFS_FMOD_WATCH 3 /* block waiting for the next modified file */ -#define VFS_FMOD_WATCH_ENABLE 4 /* 1==enable, 0==disable */ +#define VFS_SET_PACKAGE_EXTS 3 /* set package extension list */ /* * Flags for various system call interfaces. @@ -251,46 +307,24 @@ struct mount { #define MNT_WAIT 1 /* synchronously wait for I/O to complete */ #define MNT_NOWAIT 2 /* start all I/O, but do not wait for it */ -/* - * Generic file handle - */ -struct fhandle { - fsid_t fh_fsid; /* File system id of mount point */ - struct fid fh_fid; /* File sys specific id */ -}; -typedef struct fhandle fhandle_t; -/* - * Export arguments for local filesystem mount calls. - */ -struct export_args { - int ex_flags; /* export related flags */ - uid_t ex_root; /* mapping for root uid */ - struct ucred ex_anon; /* mapping for anonymous user */ - struct sockaddr *ex_addr; /* net address to which exported */ - int ex_addrlen; /* and the net address length */ - struct sockaddr *ex_mask; /* mask of valid bits in saddr */ - int ex_masklen; /* and the smask length */ -}; +#ifndef KERNEL +struct mount; +typedef struct mount * mount_t; +struct vnode; +typedef struct vnode * vnode_t; +#endif -#ifdef __APPLE_API_UNSTABLE -/* - * Filesystem configuration information. One of these exists for each - * type of filesystem supported by the kernel. These are searched at - * mount time to identify the requested filesystem. - */ struct vfsconf { struct vfsops *vfc_vfsops; /* filesystem operations vector */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ - int (*vfc_mountroot)(void); /* if != NULL, routine to mount root */ + int (*vfc_mountroot)(mount_t, vnode_t); /* if != NULL, routine to mount root */ struct vfsconf *vfc_next; /* next in list */ }; -#endif /*__APPLE_API_UNSTABLE */ - struct vfsidctl { int vc_vers; /* should be VFSIDCTL_VERS1 (below) */ fsid_t vc_fsid; /* fsid to operate on. */ @@ -299,9 +333,45 @@ struct vfsidctl { u_int32_t vc_spare[12]; /* spare (must be zero). */ }; + /* vfsidctl API version. */ #define VFS_CTL_VERS1 0x01 +#ifdef KERNEL +// LP64todo - should this move? + +/* LP64 version of vfsconf. all pointers + * grow when we're dealing with a 64-bit process. 
+ * WARNING - keep in sync with vfsconf + */ +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_vfsconf { + user_addr_t vfc_vfsops; /* filesystem operations vector */ + char vfc_name[MFSNAMELEN]; /* filesystem type name */ + int vfc_typenum; /* historic filesystem type number */ + int vfc_refcount; /* number mounted of this type */ + int vfc_flags; /* permanent flags */ + user_addr_t vfc_mountroot; /* if != NULL, routine to mount root */ + user_addr_t vfc_next; /* next in list */ +}; + +struct user_vfsidctl { + int vc_vers; /* should be VFSIDCTL_VERS1 (below) */ + fsid_t vc_fsid; /* fsid to operate on. */ + user_addr_t vc_ptr; /* pointer to data structure. */ + user_size_t vc_len; /* sizeof said structure. */ + u_int32_t vc_spare[12]; /* spare (must be zero). */ +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +#endif /* KERNEL */ + /* * New style VFS sysctls, do not reuse/conflict with the namespace for * private sysctls. @@ -325,10 +395,9 @@ struct vfsquery { #define VQ_MOUNT 0x0008 /* new filesystem arrived */ #define VQ_UNMOUNT 0x0010 /* filesystem has left */ #define VQ_DEAD 0x0020 /* filesystem is dead, needs force unmount */ -#define VQ_ASSIST 0x0040 /* filesystem needs assistance from external - program */ +#define VQ_ASSIST 0x0040 /* filesystem needs assistance from external program */ #define VQ_NOTRESPLOCK 0x0080 /* server lockd down */ -#define VQ_FLAG0100 0x0100 /* placeholder */ +#define VQ_UPDATE 0x0100 /* filesystem information has changed */ #define VQ_FLAG0200 0x0200 /* placeholder */ #define VQ_FLAG0400 0x0400 /* placeholder */ #define VQ_FLAG0800 0x0800 /* placeholder */ @@ -337,128 +406,176 @@ struct vfsquery { #define VQ_FLAG4000 0x4000 /* placeholder */ #define VQ_FLAG8000 0x8000 /* placeholder */ -#ifdef KERNEL -/* Point a sysctl request at a vfsidctl's data. */ -#define VCTLTOREQ(vc, req) \ - do { \ - (req)->newptr = (vc)->vc_ptr; \ - (req)->newlen = (vc)->vc_len; \ - (req)->newidx = 0; \ - } while (0) -#endif #ifdef KERNEL -#ifdef __APPLE_API_UNSTABLE -extern int maxvfsconf; /* highest defined filesystem type */ -extern struct vfsconf *vfsconf; /* head of list of filesystem types */ -extern int maxvfsslots; /* Maximum slots available to be used */ -extern int numused_vfsslots; /* number of slots already used */ -int vfsconf_add __P((struct vfsconf *)); -int vfsconf_del __P((char *)); +/* Structure for setting device IO parameters per mount point */ +struct vfsioattr { + u_int32_t io_maxreadcnt; /* Max. byte count for read */ + u_int32_t io_maxwritecnt; /* Max. byte count for write */ + u_int32_t io_segreadcnt; /* Max. segment count for read */ + u_int32_t io_segwritecnt; /* Max. segment count for write */ + u_int32_t io_maxsegreadsize; /* Max. segment read size */ + u_int32_t io_maxsegwritesize; /* Max. segment write size */ + u_int32_t io_devblocksize; /* the underlying device block size */ + void * io_reserved[3]; /* extended attribute information */ +}; + /* - * Operations supported on mounted file system. + * Filesystem Registration information */ -#ifdef __STDC__ -struct nameidata; -struct mbuf; -#endif + +#define VFS_TBLTHREADSAFE 0x01 +#define VFS_TBLFSNODELOCK 0x02 +#define VFS_TBLNOTYPENUM 0x08 +#define VFS_TBLLOCALVOL 0x10 +#define VFS_TBL64BITREADY 0x20 + +struct vfs_fsentry { + struct vfsops * vfe_vfsops; /* vfs operations */ + int vfe_vopcnt; /* # of vnodeopv_desc being registered (reg, spec, fifo ...) 
*/ + struct vnodeopv_desc ** vfe_opvdescs; /* null terminated; */ + int vfe_fstypenum; /* historic filesystem type number */ + char vfe_fsname[MFSNAMELEN]; /* filesystem type name */ + uint32_t vfe_flags; /* defines the FS capabilities */ + void * vfe_reserv[2]; /* reserved for future use; set this to zero*/ + }; + + struct vfsops { - int (*vfs_mount) __P((struct mount *mp, char *path, caddr_t data, - struct nameidata *ndp, struct proc *p)); - int (*vfs_start) __P((struct mount *mp, int flags, - struct proc *p)); - int (*vfs_unmount) __P((struct mount *mp, int mntflags, - struct proc *p)); - int (*vfs_root) __P((struct mount *mp, struct vnode **vpp)); - int (*vfs_quotactl) __P((struct mount *mp, int cmds, uid_t uid, - caddr_t arg, struct proc *p)); - int (*vfs_statfs) __P((struct mount *mp, struct statfs *sbp, - struct proc *p)); - int (*vfs_sync) __P((struct mount *mp, int waitfor, - struct ucred *cred, struct proc *p)); - int (*vfs_vget) __P((struct mount *mp, void *ino, - struct vnode **vpp)); - int (*vfs_fhtovp) __P((struct mount *mp, struct fid *fhp, - struct mbuf *nam, struct vnode **vpp, - int *exflagsp, struct ucred **credanonp)); - int (*vfs_vptofh) __P((struct vnode *vp, struct fid *fhp)); - int (*vfs_init) __P((struct vfsconf *)); - int (*vfs_sysctl) __P((int *, u_int, void *, size_t *, void *, - size_t, struct proc *)); + int (*vfs_mount)(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context); + int (*vfs_start)(struct mount *mp, int flags, vfs_context_t context); + int (*vfs_unmount)(struct mount *mp, int mntflags, vfs_context_t context); + int (*vfs_root)(struct mount *mp, struct vnode **vpp, vfs_context_t context); + int (*vfs_quotactl)(struct mount *mp, int cmds, uid_t uid, caddr_t arg, vfs_context_t context); + int (*vfs_getattr)(struct mount *mp, struct vfs_attr *, vfs_context_t context); +/* int (*vfs_statfs)(struct mount *mp, struct vfsstatfs *sbp, vfs_context_t context);*/ + int (*vfs_sync)(struct mount *mp, int waitfor, vfs_context_t context); + int (*vfs_vget)(struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context); + int (*vfs_fhtovp)(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, + vfs_context_t context); + int (*vfs_vptofh)(struct vnode *vp, int *fhlen, unsigned char *fhp, vfs_context_t context); + int (*vfs_init)(struct vfsconf *); + int (*vfs_sysctl)(int *, u_int, user_addr_t, size_t *, user_addr_t, size_t, vfs_context_t context); + int (*vfs_setattr)(struct mount *mp, struct vfs_attr *, vfs_context_t context); + void *vfs_reserved[7]; }; -#define VFS_MOUNT(MP, PATH, DATA, NDP, P) \ - (*(MP)->mnt_op->vfs_mount)(MP, PATH, DATA, NDP, P) -#define VFS_START(MP, FLAGS, P) (*(MP)->mnt_op->vfs_start)(MP, FLAGS, P) -#define VFS_UNMOUNT(MP, FORCE, P) (*(MP)->mnt_op->vfs_unmount)(MP, FORCE, P) -#define VFS_ROOT(MP, VPP) (*(MP)->mnt_op->vfs_root)(MP, VPP) -#define VFS_QUOTACTL(MP,C,U,A,P) (*(MP)->mnt_op->vfs_quotactl)(MP, C, U, A, P) -#define VFS_STATFS(MP, SBP, P) (*(MP)->mnt_op->vfs_statfs)(MP, SBP, P) -#define VFS_SYNC(MP, WAIT, C, P) (*(MP)->mnt_op->vfs_sync)(MP, WAIT, C, P) -#define VFS_VGET(MP, INO, VPP) (*(MP)->mnt_op->vfs_vget)(MP, INO, VPP) -#define VFS_FHTOVP(MP, FIDP, NAM, VPP, EXFLG, CRED) \ - (*(MP)->mnt_op->vfs_fhtovp)(MP, FIDP, NAM, VPP, EXFLG, CRED) -#define VFS_VPTOFH(VP, FIDP) (*(VP)->v_mount->mnt_op->vfs_vptofh)(VP, FIDP) /* - * Network address lookup element + * flags passed into vfs_iterate */ -struct netcred { - struct radix_node netc_rnodes[2]; - int netc_exflags; - struct ucred netc_anon; 
-};
 /*
- * Network export information
+ * return values from callback
 */
-struct netexport {
- struct netcred ne_defexported; /* Default export */
- struct radix_node_head *ne_rtable[AF_MAX+1]; /* Individual exports */
-};
+#define VFS_RETURNED 0 /* done with vnode, reference can be dropped */
+#define VFS_RETURNED_DONE 1 /* done with vnode, reference can be dropped, terminate iteration */
+#define VFS_CLAIMED 2 /* don't drop reference */
+#define VFS_CLAIMED_DONE 3 /* don't drop reference, terminate iteration */
+
+__BEGIN_DECLS
 /*
- * exported vnode operations
+ * prototypes for exported VFS operations
 */
-int vfs_busy __P((struct mount *, int, struct slock *, struct proc *));
-int vfs_export __P((struct mount *, struct netexport *,
- struct export_args *));
-struct netcred *vfs_export_lookup __P((struct mount *, struct netexport *,
- struct mbuf *));
-void vfs_getnewfsid __P((struct mount *));
-struct mount *vfs_getvfs __P((fsid_t *));
-int vfs_mountedon __P((struct vnode *));
-void vfs_unbusy __P((struct mount *, struct proc *));
-#ifdef __APPLE_API_PRIVATE
-int vfs_mountroot __P((void));
-int vfs_rootmountalloc __P((char *, char *, struct mount **));
-void vfs_unmountall __P((void));
-int safedounmount(struct mount *, int, struct proc *);
-int dounmount(struct mount *, int, struct proc *);
+extern int VFS_MOUNT(mount_t, vnode_t, user_addr_t, vfs_context_t);
+extern int VFS_START(mount_t, int, vfs_context_t);
+extern int VFS_UNMOUNT(mount_t, int, vfs_context_t);
+extern int VFS_ROOT(mount_t, vnode_t *, vfs_context_t);
+extern int VFS_QUOTACTL(mount_t, int, uid_t, caddr_t, vfs_context_t);
+extern int VFS_SYNC(mount_t, int, vfs_context_t);
+extern int VFS_VGET(mount_t, ino64_t, vnode_t *, vfs_context_t);
+extern int VFS_FHTOVP(mount_t, int, unsigned char *, vnode_t *, vfs_context_t);
+extern int VFS_VPTOFH(vnode_t, int *, unsigned char *, vfs_context_t);
+
+/* The file system registration KPI */
+int vfs_fsadd(struct vfs_fsentry *, vfstable_t *);
+int vfs_fsremove(vfstable_t);
+int vfs_iterate(int, int (*)(struct mount *, void *), void *);
+
+uint64_t vfs_flags(mount_t);
+void vfs_setflags(mount_t, uint64_t);
+void vfs_clearflags(mount_t, uint64_t);
+
+int vfs_issynchronous(mount_t);
+int vfs_iswriteupgrade(mount_t);
+int vfs_isupdate(mount_t);
+int vfs_isreload(mount_t);
+int vfs_isforce(mount_t);
+int vfs_isrdonly(mount_t);
+int vfs_isrdwr(mount_t);
+int vfs_authopaque(mount_t);
+int vfs_authopaqueaccess(mount_t);
+void vfs_setauthopaque(mount_t);
+void vfs_setauthopaqueaccess(mount_t);
+void vfs_clearauthopaque(mount_t);
+void vfs_clearauthopaqueaccess(mount_t);
+int vfs_extendedsecurity(mount_t);
+void vfs_setextendedsecurity(mount_t);
+void vfs_clearextendedsecurity(mount_t);
+void vfs_setlocklocal(mount_t);
+
+
+
+uint32_t vfs_maxsymlen(mount_t);
+void vfs_setmaxsymlen(mount_t, uint32_t);
+void * vfs_fsprivate(mount_t);
+void vfs_setfsprivate(mount_t, void *mntdata);
+
+struct vfsstatfs * vfs_statfs(mount_t);
+int vfs_update_vfsstat(mount_t, vfs_context_t);
+int vfs_getattr(mount_t mp, struct vfs_attr *vfa, vfs_context_t ctx);
+int vfs_setattr(mount_t mp, struct vfs_attr *vfa, vfs_context_t ctx);
+
+int vfs_typenum(mount_t);
+void vfs_name(mount_t, char *);
+int vfs_devblocksize(mount_t);
+void vfs_ioattr(mount_t, struct vfsioattr *);
+void vfs_setioattr(mount_t, struct vfsioattr *);
+int vfs_64bitready(mount_t);
+
+
+int vfs_busy(mount_t, int);
+void vfs_unbusy(mount_t);
+
+void vfs_getnewfsid(struct mount *);
+mount_t vfs_getvfs(fsid_t *);
+mount_t vfs_getvfs_by_mntonname(u_char
*); +int vfs_mountedon(struct vnode *); + void vfs_event_signal(fsid_t *, u_int32_t, intptr_t); void vfs_event_init(void); -#endif /* __APPLE_API_PRIVATE */ -extern CIRCLEQ_HEAD(mntlist, mount) mountlist; -extern struct slock mountlist_slock; +__END_DECLS + +#endif /* KERNEL */ -#endif /* __APPLE_API_UNSTABLE */ -#else /* !KERNEL */ +#ifndef KERNEL + +/* + * Generic file handle + */ +#define NFS_MAX_FH_SIZE 64 +#define NFSV2_MAX_FH_SIZE 32 +struct fhandle { + int fh_len; /* length of file handle */ + unsigned char fh_data[NFS_MAX_FH_SIZE]; /* file handle value */ +}; +typedef struct fhandle fhandle_t; -#include <sys/cdefs.h> __BEGIN_DECLS -int fhopen __P((const struct fhandle *, int)); -int fstatfs __P((int, struct statfs *)); -int getfh __P((const char *, fhandle_t *)); -int getfsstat __P((struct statfs *, long, int)); -int getmntinfo __P((struct statfs **, int)); -int mount __P((const char *, const char *, int, void *)); -int statfs __P((const char *, struct statfs *)); -int unmount __P((const char *, int)); -int getvfsbyname __P((const char *, struct vfsconf *)); +int fhopen(const struct fhandle *, int); +int fstatfs(int, struct statfs *); +int getfh(const char *, fhandle_t *); +int getfsstat(struct statfs *, int, int); +int getmntinfo(struct statfs **, int); +int mount(const char *, const char *, int, void *); +int statfs(const char *, struct statfs *); +int unmount(const char *, int); +int getvfsbyname(const char *, struct vfsconf *); __END_DECLS #endif /* KERNEL */ diff --git a/bsd/sys/mount_internal.h b/bsd/sys/mount_internal.h new file mode 100644 index 000000000..8eacce4ea --- /dev/null +++ b/bsd/sys/mount_internal.h @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ +/* + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. 
+ * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)mount.h 8.21 (Berkeley) 5/20/95 + */ + +#ifndef _SYS_MOUNT_INTERNAL_H_ +#define _SYS_MOUNT_INTERNAL_H_ + +#include <sys/appleapiopts.h> +#ifndef KERNEL +#include <sys/ucred.h> +#else +#include <sys/kernel_types.h> +#include <sys/namei.h> +#endif +#include <sys/queue.h> +#include <sys/lock.h> +#include <net/radix.h> +#include <sys/socket.h> /* XXX for AF_MAX */ +#include <sys/vfs_context.h> /* XXX for AF_MAX */ +#include <sys/mount.h> +#include <sys/cdefs.h> + +/* + * Structure per mounted file system. Each mounted file system has an + * array of operations and an instance record. The file systems are + * put on a doubly linked list. + */ +TAILQ_HEAD(vnodelst, vnode); + +struct mount { + TAILQ_ENTRY(mount) mnt_list; /* mount list */ + int32_t mnt_count; /* reference on the mount */ + lck_mtx_t mnt_mlock; /* mutex that protects mount point */ + struct vfsops *mnt_op; /* operations on fs */ + struct vfstable *mnt_vtable; /* configuration info */ + struct vnode *mnt_vnodecovered; /* vnode we mounted on */ + struct vnodelst mnt_vnodelist; /* list of vnodes this mount */ + struct vnodelst mnt_workerqueue; /* list of vnodes this mount */ + struct vnodelst mnt_newvnodes; /* list of vnodes this mount */ + int mnt_flag; /* flags */ + int mnt_kern_flag; /* kernel only flags */ + int mnt_lflag; /* mount life cycle flags */ + int mnt_maxsymlinklen; /* max size of short symlink */ + struct vfsstatfs mnt_vfsstat; /* cache of filesystem stats */ + qaddr_t mnt_data; /* private data */ + /* Cached values of the IO constraints for the device */ + u_int32_t mnt_maxreadcnt; /* Max. byte count for read */ + u_int32_t mnt_maxwritecnt; /* Max. byte count for write */ + u_int32_t mnt_segreadcnt; /* Max. segment count for read */ + u_int32_t mnt_segwritecnt; /* Max. segment count for write */ + u_int32_t mnt_maxsegreadsize; /* Max. segment read size */ + u_int32_t mnt_maxsegwritesize; /* Max. 
segment write size */ + u_int32_t mnt_devblocksize; /* the underlying device block size */ + lck_rw_t mnt_rwlock; /* mutex readwrite lock */ + lck_mtx_t mnt_renamelock; /* mutex that serializes renames that change shape of tree */ + vnode_t mnt_devvp; /* the device mounted on for local file systems */ + int32_t mnt_crossref; /* refernces to cover lookups crossing into mp */ + int32_t mnt_iterref; /* refernces to cover iterations; drained makes it -ve */ + + /* XXX 3762912 hack to support HFS filesystem 'owner' */ + uid_t mnt_fsowner; + gid_t mnt_fsgroup; +}; + +/* XXX 3762912 hack to support HFS filesystem 'owner' */ +#define vfs_setowner(_mp, _uid, _gid) do {(_mp)->mnt_fsowner = (_uid); (_mp)->mnt_fsgroup = (_gid); } while (0) + + +/* mount point to which dead vps point to */ +extern struct mount * dead_mountp; + +/* + * Internal filesystem control flags stored in mnt_kern_flag. + * + * MNTK_UNMOUNT locks the mount entry so that name lookup cannot proceed + * past the mount point. This keeps the subtree stable during mounts + * and unmounts. + * + * Note: We are counting down on new bit assignments. This is + * because the bits here were broken out from the high bits + * of the mount flags. + */ +#define MNTK_LOCK_LOCAL 0x00100000 /* advisory locking is done above the VFS itself */ +#define MNTK_VIRTUALDEV 0x00200000 /* mounted on a virtual device i.e. a disk image */ +#define MNTK_ROOTDEV 0x00400000 /* this filesystem resides on the same device as the root */ +#define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ +#define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ +#define MNTK_WANTRDWR 0x04000000 /* upgrade to read/write requested */ +#if REV_ENDIAN_FS +#define MNT_REVEND 0x08000000 /* Reverse endian FS */ +#endif /* REV_ENDIAN_FS */ +#define MNTK_FRCUNMOUNT 0x10000000 /* Forced unmount wanted. */ +#define MNTK_AUTH_OPAQUE 0x20000000 /* authorisation decisions are not made locally */ +#define MNTK_AUTH_OPAQUE_ACCESS 0x40000000 /* VNOP_ACCESS is reliable for remote auth */ +#define MNTK_EXTENDED_SECURITY 0x80000000 /* extended security supported */ + +#define MNT_LBUSY 0x00000001 /* mount is busy */ +#define MNT_LUNMOUNT 0x00000002 /* mount in unmount */ +#define MNT_LFORCE 0x00000004 /* mount in forced unmount */ +#define MNT_LDRAIN 0x00000008 /* mount in drain */ +#define MNT_LITER 0x00000010 /* mount in iteration */ +#define MNT_LNEWVN 0x00000020 /* mount has new vnodes created */ +#define MNT_LWAIT 0x00000040 /* wait for unmount op */ +#define MNT_LITERWAIT 0x00000080 /* mount in iteration */ +#define MNT_LDEAD 0x00000100 /* mount already unmounted*/ + + +/* + * Generic file handle + */ +#define NFS_MAX_FH_SIZE 64 +#define NFSV2_MAX_FH_SIZE 32 +struct fhandle { + int fh_len; /* length of file handle */ + unsigned char fh_data[NFS_MAX_FH_SIZE]; /* file handle value */ +}; +typedef struct fhandle fhandle_t; + + + +/* + * Filesystem configuration information. One of these exists for each + * type of filesystem supported by the kernel. These are searched at + * mount time to identify the requested filesystem. 
+ */ +struct vfstable { +/* THE FOLLOWING SHOULD KEEP THE SAME FOR user compat with sysctl */ + struct vfsops *vfc_vfsops; /* filesystem operations vector */ + char vfc_name[MFSNAMELEN]; /* filesystem type name */ + int vfc_typenum; /* historic filesystem type number */ + int vfc_refcount; /* number mounted of this type */ + int vfc_flags; /* permanent flags */ + int (*vfc_mountroot)(mount_t, vnode_t, vfs_context_t); /* if != NULL, routine to mount root */ + struct vfstable *vfc_next; /* next in list */ +/* Till the above we SHOULD KEEP THE SAME FOR user compat with sysctl */ + int vfc_threadsafe; /* FS is thread & premeption safe */ + lck_mtx_t vfc_lock; /* for non-threaded file systems */ + int vfc_vfsflags; /* for optional types */ + void * vfc_descptr; /* desc table allocated address */ + int vfc_descsize; /* size allocated for desc table */ + int vfc_64bitready; /* The file system is ready for 64bit */ +}; + +#define VFC_VFSLOCALARGS 0x02 +#define VFC_VFSGENERICARGS 0x04 +#define VFC_VFSNATIVEXATTR 0x10 + + +extern int maxvfsconf; /* highest defined filesystem type */ +extern struct vfstable *vfsconf; /* head of list of filesystem types */ +extern int maxvfsslots; /* Maximum slots available to be used */ +extern int numused_vfsslots; /* number of slots already used */ + +/* the following two are xnu private */ +struct vfstable * vfstable_add(struct vfstable *); +int vfstable_del(struct vfstable *); + + +struct vfsmount_args { + union { + struct { + char * mnt_fspec; + void * mnt_fsdata; + } mnt_localfs_args; + struct { + void * mnt_fsdata; /* FS specific */ + } mnt_remotefs_args; + } mountfs_args; +}; + + +/* + * LP64 version of statfs structure. + * NOTE - must be kept in sync with struct statfs in mount.h + */ +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_statfs { + short f_otype; /* TEMPORARY SHADOW COPY OF f_type */ + short f_oflags; /* TEMPORARY SHADOW COPY OF f_flags */ + user_long_t f_bsize; /* fundamental file system block size */ + user_long_t f_iosize; /* optimal transfer block size */ + user_long_t f_blocks; /* total data blocks in file system */ + user_long_t f_bfree; /* free blocks in fs */ + user_long_t f_bavail; /* free blocks avail to non-superuser */ + user_long_t f_files; /* total file nodes in file system */ + user_long_t f_ffree; /* free file nodes in fs */ + fsid_t f_fsid; /* file system id */ + uid_t f_owner; /* user that mounted the filesystem */ + short f_reserved1; /* spare for later */ + short f_type; /* type of filesystem */ + user_long_t f_flags; /* copy of mount exported flags */ + user_long_t f_reserved2[2]; /* reserved for future use */ + char f_fstypename[MFSNAMELEN]; /* fs type name */ + char f_mntonname[MNAMELEN]; /* directory on which mounted */ + char f_mntfromname[MNAMELEN];/* mounted filesystem */ +#if COMPAT_GETFSSTAT + char f_reserved3[0]; /* For alignment */ + user_long_t f_reserved4[0]; /* For future use */ +#else + char f_reserved3; /* For alignment */ + user_long_t f_reserved4[4]; /* For future use */ +#endif +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +__BEGIN_DECLS + +extern TAILQ_HEAD(mntlist, mount) mountlist; +void mount_list_lock(void); +void mount_list_unlock(void); +void mount_lock_init(mount_t); +void mount_lock_destroy(mount_t); +void mount_lock(mount_t); +void mount_unlock(mount_t); +void mount_lock_renames(mount_t); +void mount_unlock_renames(mount_t); +void mount_ref(mount_t, int); +void mount_drop(mount_t, int); + +/* vfs_rootmountalloc should be kept as a private 
+
+/* vfs_rootmountalloc should be kept as a private api */
+errno_t vfs_rootmountalloc(const char *, const char *, mount_t *mpp);
+errno_t vfs_init_io_attributes(vnode_t, mount_t);
+
+int vfs_mountroot(void);
+void vfs_unmountall(void);
+int safedounmount(struct mount *, int, struct proc *);
+int dounmount(struct mount *, int, struct proc *);
+
+/* xnu internal api */
+void mount_dropcrossref(mount_t, vnode_t, int);
+int validfsnode(mount_t);
+mount_t mount_lookupby_volfsid(int, int);
+mount_t mount_list_lookupby_fsid(fsid_t *, int, int);
+int mount_iterref(mount_t, int);
+int mount_isdrained(mount_t, int);
+void mount_iterdrop(mount_t);
+void mount_iterdrain(mount_t);
+void mount_iterreset(mount_t);
+
+__END_DECLS
+
+#endif /* !_SYS_MOUNT_INTERNAL_H_ */
diff --git a/bsd/sys/msg.h b/bsd/sys/msg.h
index 76e3f1c27..d53ebd98c 100644
--- a/bsd/sys/msg.h
+++ b/bsd/sys/msg.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
@@ -45,53 +45,174 @@
 #include <sys/appleapiopts.h>
 
-#ifdef __APPLE_API_UNSTABLE
+#include <sys/_types.h>
+#include <sys/cdefs.h>
+
+/*
+ * [XSI] All of the symbols from <sys/ipc.h> SHALL be defined when this
+ * header is included
+ */
 #include <sys/ipc.h>
 
 /*
- * The MSG_NOERROR identifier value, the msqid_ds struct and the msg struct
- * are as defined by the SV API Intel 386 Processor Supplement.
+ * [XSI] The pid_t, time_t, key_t, size_t, and ssize_t types shall be
+ * defined as described in <sys/types.h>.
+ *
+ * NOTE: The definition of the key_t type is implicit from the
+ * inclusion of <sys/ipc.h>
+ */
+#ifndef _PID_T
+typedef __darwin_pid_t pid_t;
+#define _PID_T
+#endif
+
+#ifndef _TIME_T
+#define _TIME_T
+typedef __darwin_time_t time_t;
+#endif
+
+#ifndef _SIZE_T
+#define _SIZE_T
+typedef __darwin_size_t size_t;
+#endif
+
+#ifndef _SSIZE_T
+#define _SSIZE_T
+typedef __darwin_ssize_t ssize_t;
+#endif
+
+/* [XSI] Used for the number of messages in the message queue */
+typedef unsigned long msgqnum_t;
+
+/* [XSI] Used for the number of bytes allowed in a message queue */
+typedef unsigned long msglen_t;
+
+
+/*
+ * Possible values for the fifth parameter to msgrcv(), in addition to the
+ * IPC_NOWAIT flag, which is permitted.
 */
+#define MSG_NOERROR 010000 /* [XSI] No error if big message */
+
-#define MSG_NOERROR 010000 /* don't complain about too long msgs */
-
-struct msqid_ds {
- struct ipc_perm msg_perm; /* msg queue permission bits */
- struct msg *msg_first; /* first message in the queue */
- struct msg *msg_last; /* last message in the queue */
- u_long msg_cbytes; /* number of bytes in use on the queue */
- u_long msg_qnum; /* number of msgs in the queue */
- u_long msg_qbytes; /* max # of bytes on the queue */
- pid_t msg_lspid; /* pid of last msgsnd() */
- pid_t msg_lrpid; /* pid of last msgrcv() */
- time_t msg_stime; /* time of last msgsnd() */
- long msg_pad1;
- time_t msg_rtime; /* time of last msgrcv() */
- long msg_pad2;
- time_t msg_ctime; /* time of last msgctl() */
- long msg_pad3;
- long msg_pad4[4];
+/*
+ * Technically, we should force all code references to the new structure
+ * definition, not in just the standards conformance case, and leave the
+ * legacy interface there for binary compatibility only. Currently, we
+ * are only forcing this for programs requesting standards conformance.
+ */
+#if defined(_POSIX_C_SOURCE) || defined(KERNEL) || defined(__LP64__)
+/*
+ * Structure used internally.
+ *
+ * Structure whose address is passed as the third parameter to msgctl()
+ * when the second parameter is IPC_SET or IPC_STAT. In the case of the
+ * IPC_SET command, only the msg_perm.{uid|gid|perm} and msg_qbytes are
+ * honored. In the case of IPC_STAT, only the fields indicated as [XSI]
+ * mandated fields are guaranteed to be meaningful: DO NOT depend on the
+ * contents of the other fields.
+ *
+ * NOTES: Reserved fields are not preserved across IPC_SET/IPC_STAT.
+ */
+struct __msqid_ds_new {
+ struct __ipc_perm_new msg_perm; /* [XSI] msg queue permissions */
+ __int32_t msg_first; /* RESERVED: kernel use only */
+ __int32_t msg_last; /* RESERVED: kernel use only */
+ msglen_t msg_cbytes; /* # of bytes on the queue */
+ msgqnum_t msg_qnum; /* [XSI] number of msgs on the queue */
+ msglen_t msg_qbytes; /* [XSI] max bytes on the queue */
+ pid_t msg_lspid; /* [XSI] pid of last msgsnd() */
+ pid_t msg_lrpid; /* [XSI] pid of last msgrcv() */
+ time_t msg_stime; /* [XSI] time of last msgsnd() */
+ __int32_t msg_pad1; /* RESERVED: DO NOT USE */
+ time_t msg_rtime; /* [XSI] time of last msgrcv() */
+ __int32_t msg_pad2; /* RESERVED: DO NOT USE */
+ time_t msg_ctime; /* [XSI] time of last msgctl() */
+ __int32_t msg_pad3; /* RESERVED: DO NOT USE */
+ __int32_t msg_pad4[4]; /* RESERVED: DO NOT USE */
+};
+#define msqid_ds __msqid_ds_new
+#else /* !_POSIX_C_SOURCE */
+#define msqid_ds __msqid_ds_old
+#endif /* !_POSIX_C_SOURCE */
+
+#if !defined(_POSIX_C_SOURCE) && !defined(__LP64__)
+struct __msqid_ds_old {
+ struct __ipc_perm_old msg_perm; /* [XSI] msg queue permissions */
+ __int32_t msg_first; /* RESERVED: kernel use only */
+ __int32_t msg_last; /* RESERVED: kernel use only */
+ msglen_t msg_cbytes; /* # of bytes on the queue */
+ msgqnum_t msg_qnum; /* [XSI] number of msgs on the queue */
+ msglen_t msg_qbytes; /* [XSI] max bytes on the queue */
+ pid_t msg_lspid; /* [XSI] pid of last msgsnd() */
+ pid_t msg_lrpid; /* [XSI] pid of last msgrcv() */
+ time_t msg_stime; /* [XSI] time of last msgsnd() */
+ __int32_t msg_pad1; /* RESERVED: DO NOT USE */
+ time_t msg_rtime; /* [XSI] time of last msgrcv() */
+ __int32_t msg_pad2; /* RESERVED: DO NOT USE */
+ time_t msg_ctime; /* [XSI] time of last msgctl() */
+ __int32_t msg_pad3; /* RESERVED: DO NOT USE */
+ __int32_t msg_pad4[4]; /* RESERVED: DO NOT USE */
+};
+#endif /* !_POSIX_C_SOURCE */
+
+#ifdef KERNEL
+#ifdef __APPLE_API_PRIVATE
+#include <machine/types.h>
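Stepping back to the user-visible structures above: because the public name msqid_ds is remapped by the #define, existing source simply recompiles against whichever layout its feature macros select. A hedged, editor-added user-space sketch of the IPC_STAT usage those comments describe (show_queue_depth() is hypothetical; error handling is minimal):

#include <sys/msg.h>
#include <stdio.h>

void
show_queue_depth(key_t key)
{
	struct msqid_ds ds;	/* resolves to __msqid_ds_new or _old */
	int id = msgget(key, 0);

	if (id != -1 && msgctl(id, IPC_STAT, &ds) == 0)
		printf("%lu message(s) queued\n",
		    (unsigned long)ds.msg_qnum);	/* [XSI] field */
}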
+
+// LP64todo - should this move?
+
+#if __DARWIN_ALIGN_NATURAL
+#pragma options align=natural
+#endif
+
+struct user_msqid_ds {
+ struct ipc_perm msg_perm; /* [XSI] msg queue permissions */
+ struct msg *msg_first; /* first message in the queue */
+ struct msg *msg_last; /* last message in the queue */
+ msglen_t msg_cbytes; /* # of bytes on the queue */
+ msgqnum_t msg_qnum; /* [XSI] number of msgs on the queue */
+ msglen_t msg_qbytes; /* [XSI] max bytes on the queue */
+ pid_t msg_lspid; /* [XSI] pid of last msgsnd() */
+ pid_t msg_lrpid; /* [XSI] pid of last msgrcv() */
+ user_time_t msg_stime; /* [XSI] time of last msgsnd() */
+ __int32_t msg_pad1; /* RESERVED: DO NOT USE */
+ user_time_t msg_rtime; /* [XSI] time of last msgrcv() */
+ __int32_t msg_pad2; /* RESERVED: DO NOT USE */
+ user_time_t msg_ctime; /* [XSI] time of last msgctl() */
+ __int32_t msg_pad3; /* RESERVED: DO NOT USE */
+ __int32_t msg_pad4[4];
+};
+
+#if __DARWIN_ALIGN_NATURAL
+#pragma options align=reset
+#endif
+
+#endif /* __APPLE_API_PRIVATE */
+#endif /* KERNEL */
+
+
+#ifndef _POSIX_C_SOURCE
+#ifdef __APPLE_API_UNSTABLE
+/* XXX kernel only; protect with macro later */
 struct msg {
- struct msg *msg_next; /* next msg in the chain */
- long msg_type; /* type of this message */
- /* >0 -> type of this message */
- /* 0 -> free header */
- u_short msg_ts; /* size of this message */
- short msg_spot; /* location of start of msg in buffer */
+ struct msg *msg_next; /* next msg in the chain */
+ long msg_type; /* type of this message */
+ /* >0 -> type of this message */
+ /* 0 -> free header */
+ unsigned short msg_ts; /* size of this message */
+ short msg_spot; /* location of msg start in buffer */
 };
 
 /*
- * Structure describing a message. The SVID doesn't suggest any
- * particular name for this structure. There is a reference in the
- * msgop man page that reads "The structure mymsg is an example of what
- * this user defined buffer might look like, and includes the following
- * members:". This sentence is followed by two lines equivalent
- * to the mtype and mtext field declarations below. It isn't clear
- * if "mymsg" refers to the naem of the structure type or the name of an
- * instance of the structure...
+ * Example structure describing a message whose address is to be passed as
+ * the second argument to the functions msgrcv() and msgsnd(). The only
+ * actual hard requirement is that the first field be of type long, and
+ * contain the message type. The user is encouraged to define their own
+ * application-specific structure; this definition is included solely for
+ * backward compatibility with existing source code.
 */
 struct mymsg {
 long mtype; /* message type (+ve integer) */
@@ -158,26 +279,30 @@ struct msgmap {
 /* 0..(MSGSEG-1) -> index of next segment */
 };
 
+/* The following four externs really, really need to die; should be static */
 extern char *msgpool; /* MSGMAX byte long msg buffer pool */
 extern struct msgmap *msgmaps; /* MSGSEG msgmap structures */
 extern struct msg *msghdrs; /* MSGTQL msg headers */
-extern struct msqid_ds *msqids; /* MSGMNI msqid_ds struct's */
+extern struct user_msqid_ds *msqids; /* MSGMNI user_msqid_ds struct's */
 
 #define MSG_LOCKED 01000 /* Is this msqid_ds locked? 
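*/

Since struct mymsg above is explicitly only an example layout, callers normally declare their own message type whose first field is a long. A hedged, editor-added sketch of a send/receive round trip against the user-level prototypes in the next hunk (job_msg and the flags/sizes are arbitrary):

#include <sys/msg.h>
#include <string.h>

struct job_msg {
	long mtype;	/* must be first and > 0 */
	char mtext[64];
};

int
send_and_receive(void)
{
	struct job_msg m = { 1, "hello" };
	int id = msgget(IPC_PRIVATE, IPC_CREAT | 0600);

	if (id == -1)
		return (-1);
	if (msgsnd(id, &m, strlen(m.mtext) + 1, 0) == -1)
		return (-1);
	/* MSG_NOERROR truncates instead of failing on an oversized message */
	return ((int)msgrcv(id, &m, sizeof(m.mtext), 1, MSG_NOERROR));
}

/* (kernel-internal msg definitions end here)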
*/ -#endif /* KERNEL */ +#endif /* KERNEL */ +#endif /* __APPLE_API_UNSTABLE */ +#endif /* !_POSIX_C_SOURCE */ #ifndef KERNEL -#include <sys/cdefs.h> __BEGIN_DECLS -int msgsys __P((int, ...)); -int msgctl __P((int, int, struct msqid_ds *)); -int msgget __P((key_t, int)); -int msgsnd __P((int, void *, size_t, int)); -int msgrcv __P((int, void*, size_t, long, int)); +#ifndef _POSIX_C_SOURCE +int msgsys(int, ...); +#endif /* !_POSIX_C_SOURCE */ +int msgctl(int, int, struct msqid_ds *) __DARWIN_ALIAS(msgctl); +int msgget(key_t, int); +ssize_t msgrcv(int, void *, size_t, long, int); +int msgsnd(int, const void *, size_t, int); __END_DECLS + #endif /* !KERNEL */ -#endif /* __APPLE_API_UNSTABLE */ #endif /* !_SYS_MSG_H_ */ diff --git a/bsd/sys/mtio.h b/bsd/sys/mtio.h index ab2f39e65..e6516a033 100644 --- a/bsd/sys/mtio.h +++ b/bsd/sys/mtio.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -59,6 +59,7 @@ #define _SYS_MTIO_H_ #include <sys/appleapiopts.h> +#include <sys/cdefs.h> #ifdef __APPLE_API_OBSOLETE @@ -67,6 +68,7 @@ */ /* structure for MTIOCTOP - mag tape op command */ +/* LP64todo - not 64-bit safe */ struct mtop { short mt_op; /* operations defined below */ daddr_t mt_count; /* how many of them */ @@ -91,7 +93,7 @@ struct mtop { #define MTSETDNSTY 15 /* set density code for current mode */ /* structure for MTIOCGET - mag tape get status command */ - +/* LP64todo - not 64-bit safe */ struct mtget { short mt_type; /* type of magtape device */ /* the following two registers are grossly device dependent */ diff --git a/bsd/sys/namei.h b/bsd/sys/namei.h index 1c5ce5991..775a41030 100644 --- a/bsd/sys/namei.h +++ b/bsd/sys/namei.h @@ -60,34 +60,22 @@ #include <sys/appleapiopts.h> -#ifdef __APPLE_API_UNSTABLE +#ifdef KERNEL +#define LOCKLEAF 0x0004 /* lock inode on return */ +#define LOCKPARENT 0x0008 /* want parent vnode returned */ +#define WANTPARENT 0x0010 /* want parent vnode returned */ +#endif + + +#ifdef BSD_KERNEL_PRIVATE #include <sys/queue.h> #include <sys/uio.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/filedesc.h> -/* - * Lookup parameters: this structure describes the subset of - * information from the nameidata structure that is passed - * through the VOP interface. - */ -struct componentname { - /* - * Arguments to lookup. - */ - u_long cn_nameiop; /* namei operation */ - u_long cn_flags; /* flags to namei */ - struct proc *cn_proc; /* process requesting lookup */ - struct ucred *cn_cred; /* credentials */ - /* - * Shared between lookup and commit routines. - */ - char *cn_pnbuf; /* pathname buffer */ - long cn_pnlen; /* length of allocated buffer */ - char *cn_nameptr; /* pointer to looked up name */ - long cn_namelen; /* length of looked up component */ - u_long cn_hash; /* hash value of looked up name */ - long cn_consume; /* chars to consume in lookup() */ -}; +#define PATHBUFLEN 256 /* * Encapsulation of namei parameters. @@ -96,17 +84,14 @@ struct nameidata { /* * Arguments to namei/lookup. */ - caddr_t ni_dirp; /* pathname pointer */ + user_addr_t ni_dirp; /* pathname pointer */ enum uio_seg ni_segflg; /* location of pathname */ - /* u_long ni_nameiop; namei operation */ - /* u_long ni_flags; flags to namei */ - /* struct proc *ni_proc; process requesting lookup */ /* * Arguments to lookup. 
*/ - /* struct ucred *ni_cred; credentials */ struct vnode *ni_startdir; /* starting directory */ struct vnode *ni_rootdir; /* logical root directory */ + struct vnode *ni_usedvp; /* directory passed in via USEDVP */ /* * Results: returned from/manipulated by lookup */ @@ -117,116 +102,106 @@ struct nameidata { */ u_int ni_pathlen; /* remaining chars in path */ char *ni_next; /* next location in pathname */ + char ni_pathbuf[PATHBUFLEN]; u_long ni_loopcnt; /* count of symlinks encountered */ + struct componentname ni_cnd; }; #ifdef KERNEL -/* - * namei operations - */ -#define LOOKUP 0 /* perform name lookup only */ -#define CREATE 1 /* setup for file creation */ -#define DELETE 2 /* setup for file deletion */ -#define RENAME 3 /* setup for file renaming */ -#define OPMASK 3 /* mask for operation */ /* * namei operational modifier flags, stored in ni_cnd.flags */ -#define LOCKLEAF 0x0004 /* lock inode on return */ -#define LOCKPARENT 0x0008 /* want parent vnode returned locked */ -#define WANTPARENT 0x0010 /* want parent vnode returned unlocked */ #define NOCACHE 0x0020 /* name must not be left in cache */ -#define FOLLOW 0x0040 /* follow symbolic links */ #define NOFOLLOW 0x0000 /* do not follow symbolic links (pseudo) */ #define SHAREDLEAF 0x0080 /* OK to have shared leaf lock */ #define MODMASK 0x00fc /* mask of operational modifiers */ /* * Namei parameter descriptors. * - * SAVENAME may be set by either the callers of namei or by VOP_LOOKUP. - * If the caller of namei sets the flag (for example execve wants to - * know the name of the program that is being executed), then it must - * free the buffer. If VOP_LOOKUP sets the flag, then the buffer must - * be freed by either the commit routine or the VOP_ABORT routine. * SAVESTART is set only by the callers of namei. It implies SAVENAME * plus the addition of saving the parent directory that contains the * name in ni_startdir. It allows repeated calls to lookup for the * name being sought. The caller is responsible for releasing the * buffer and for vrele'ing ni_startdir. */ -#define NOCROSSMOUNT 0x000100 /* do not cross mount points */ -#define RDONLY 0x000200 /* lookup with read-only semantics */ -#define HASBUF 0x000400 /* has allocated pathname buffer */ -#define SAVENAME 0x000800 /* save pathanme buffer */ -#define SAVESTART 0x001000 /* save starting directory */ -#define ISDOTDOT 0x002000 /* current component name is .. 
*/
-#define MAKEENTRY 0x004000 /* entry is to be added to name cache */
-#define ISLASTCN 0x008000 /* this is last component of pathname */
-#define ISSYMLINK 0x010000 /* symlink needs interpretation */
-#define ISWHITEOUT 0x020000 /* found whiteout */
-#define DOWHITEOUT 0x040000 /* do whiteouts */
-#define WILLBEDIR 0x080000 /* new files will be dirs; allow trailing / */
-#define AUDITVNPATH1 0x100000 /* audit the path/vnode info */
-#define AUDITVNPATH2 0x200000 /* audit the path/vnode info */
-#define USEDVP 0x400000 /* start the lookup at ndp.ni_dvp */
-#define NODELETEBUSY 0x800000 /* donot delete busy files (Carbon semantic) */
-#define PARAMASK 0x3fff00 /* mask of parameter descriptors */
+#define NOCROSSMOUNT 0x00000100 /* do not cross mount points */
+#define RDONLY 0x00000200 /* lookup with read-only semantics */
+#define HASBUF 0x00000400 /* has allocated pathname buffer */
+#define SAVENAME 0x00000800 /* save pathname buffer */
+#define SAVESTART 0x00001000 /* save starting directory */
+#define ISSYMLINK 0x00010000 /* symlink needs interpretation */
+#define DONOTAUTH 0x00020000 /* do not authorize during lookup */
+#define WILLBEDIR 0x00080000 /* new files will be dirs; allow trailing / */
+#define AUDITVNPATH1 0x00100000 /* audit the path/vnode info */
+#define AUDITVNPATH2 0x00200000 /* audit the path/vnode info */
+#define USEDVP 0x00400000 /* start the lookup at ndp.ni_dvp */
+#define PARAMASK 0x003fff00 /* mask of parameter descriptors */
+#define FSNODELOCKHELD 0x01000000
+
 /*
 * Initialization of a nameidata structure.
 */
-#define NDINIT(ndp, op, flags, segflg, namep, p) { \
+#define NDINIT(ndp, op, flags, segflg, namep, ctx) { \
 (ndp)->ni_cnd.cn_nameiop = op; \
 (ndp)->ni_cnd.cn_flags = flags; \
- (ndp)->ni_segflg = segflg; \
+ if ((segflg) == UIO_USERSPACE) { \
+ (ndp)->ni_segflg = ((IS_64BIT_PROCESS(vfs_context_proc(ctx))) ? UIO_USERSPACE64 : UIO_USERSPACE32); \
+ } \
+ else if ((segflg) == UIO_SYSSPACE) { \
+ (ndp)->ni_segflg = UIO_SYSSPACE32; \
+ } \
+ else { \
+ (ndp)->ni_segflg = segflg; \
+ } \
 (ndp)->ni_dirp = namep; \
- (ndp)->ni_cnd.cn_proc = p; \
+ (ndp)->ni_cnd.cn_context = ctx; \
 }
 #endif /* KERNEL */
 
 /*
 * This structure describes the elements in the cache of recent
- * names looked up by namei. NCHNAMLEN is sized to make structure
- * size a power of two to optimize malloc's. Minimum reasonable
- * size is 15.
+ * names looked up by namei. 
*/ #define NCHNAMLEN 31 /* maximum name segment length we bother with */ +#define NCHASHMASK 0x7fffffff struct namecache { - LIST_ENTRY(namecache) nc_hash; /* hash chain */ - TAILQ_ENTRY(namecache) nc_lru; /* LRU chain */ - struct vnode *nc_dvp; /* vnode of parent of name */ - u_long nc_dvpid; /* capability number of nc_dvp */ - struct vnode *nc_vp; /* vnode the name refers to */ - u_long nc_vpid; /* capability number of nc_vp */ - char *nc_name; /* segment name */ + TAILQ_ENTRY(namecache) nc_entry; /* chain of all entries */ + LIST_ENTRY(namecache) nc_hash; /* hash chain */ + LIST_ENTRY(namecache) nc_child; /* chain of ncp's that are children of a vp */ + union { + LIST_ENTRY(namecache) nc_link; /* chain of ncp's that 'name' a vp */ + TAILQ_ENTRY(namecache) nc_negentry; /* chain of ncp's that 'name' a vp */ + } nc_un; + vnode_t nc_dvp; /* vnode of parent of name */ + vnode_t nc_vp; /* vnode the name refers to */ + unsigned int nc_whiteout:1, /* name has whiteout applied */ + nc_hashval:31; /* hashval of stringname */ + char * nc_name; /* pointer to segment name in string cache */ }; + #ifdef KERNEL -struct mount; -extern u_long nextvnodeid; -int namei __P((struct nameidata *ndp)); -int lookup __P((struct nameidata *ndp)); -int relookup __P((struct vnode *dvp, struct vnode **vpp, - struct componentname *cnp)); - -/* namecache function prototypes */ -int cache_lookup __P((struct vnode *dvp, struct vnode **vpp, - struct componentname *cnp)); -void cache_enter __P((struct vnode *dvp, struct vnode *vpp, - struct componentname *cnp)); -void cache_purge __P((struct vnode *vp)); -void cache_purgevfs __P((struct mount *mp)); - -// -// Global string-cache routines. You can pass zero for nc_hash -// if you don't know it (add_name() will then compute the hash). -// There are no flags for now but maybe someday. -// -char *add_name(const char *name, size_t len, u_int nc_hash, u_int flags); -int remove_name(const char *name); +int namei(struct nameidata *ndp); +void nameidone(struct nameidata *); +int lookup(struct nameidata *ndp); +int relookup(struct vnode *dvp, struct vnode **vpp, + struct componentname *cnp); + +/* + * namecache function prototypes + */ +void cache_purgevfs(mount_t mp); +int cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, + vfs_context_t context, int *trailing_slash, int *dp_authorized); + +void vnode_cache_credentials(vnode_t vp, vfs_context_t context); +void vnode_uncache_credentials(vnode_t vp); +int reverse_lookup(vnode_t start_vp, vnode_t *lookup_vpp, + struct filedesc *fdp, vfs_context_t context, int *dp_authorized); #endif /* KERNEL */ @@ -234,15 +209,18 @@ int remove_name(const char *name); * Stats on usefulness of namei caches. 
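*/

A hedged, editor-added sketch of driving the lookup machinery declared above (lookup_vnode() is hypothetical; LOOKUP and FOLLOW are assumed to remain visible to kernel code from another header after this patch, and ni_vp is the traditional result field elided from this hunk):

static int
lookup_vnode(user_addr_t path, vfs_context_t ctx, vnode_t *vpp)
{
	struct nameidata nd;
	int error;

	/* NDINIT picks the 32/64-bit user segment flag from ctx */
	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, path, ctx);
	error = namei(&nd);
	if (error)
		return (error);
	*vpp = nd.ni_vp;	/* result vnode */
	nameidone(&nd);		/* release lookup-time state */
	return (0);
}

/*
 * Stats on usefulness of namei caches: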
*/ struct nchstats { + long ncs_negtotal; long ncs_goodhits; /* hits that we can really use */ long ncs_neghits; /* negative hits that we can use */ long ncs_badhits; /* hits we must drop */ - long ncs_falsehits; /* hits with id mismatch */ long ncs_miss; /* misses */ - long ncs_long; /* long names that ignore cache */ long ncs_pass2; /* names found with passes == 2 */ long ncs_2passes; /* number of times we attempt it */ + long ncs_stolen; + long ncs_enters; + long ncs_deletes; + long ncs_badvid; }; -#endif /* __APPLE_API_UNSTABLE */ +#endif /* BSD_KERNEL_PRIVATE */ #endif /* !_SYS_NAMEI_H_ */ diff --git a/bsd/sys/param.h b/bsd/sys/param.h index 61d1806c3..f9b6aafc2 100644 --- a/bsd/sys/param.h +++ b/bsd/sys/param.h @@ -70,9 +70,11 @@ #define NeXTBSD 1995064 /* NeXTBSD version (year, month, release) */ #define NeXTBSD4_0 0 /* NeXTBSD 4.0 */ +#include <sys/_types.h> + #ifndef NULL -#define NULL 0 -#endif +#define NULL __DARWIN_NULL +#endif /* ! NULL */ #ifndef LOCORE #include <sys/types.h> @@ -137,6 +139,7 @@ #define PRIMASK 0x0ff #define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ #define PTTYBLOCK 0x200 /* for tty SIGTTOU and SIGTTIN blocking */ +#define PDROP 0x400 /* OR'd with pri to stop re-entry of interlock mutex */ #define NZERO 0 /* default "nice" */ @@ -175,16 +178,20 @@ /* * File system parameters and macros. * - * The file system is made out of blocks of at most MAXBSIZE units, with + * The file system is made out of blocks of at most MAXPHYS units, with * smaller units (fragments) only in the last direct block. MAXBSIZE * primarily determines the size of buffers in the buffer pool. It may be - * made larger without any effect on existing file systems; however making - * it smaller make make some file systems unmountable. + * made larger than MAXPHYS without any effect on existing file systems; + * however making it smaller may make some file systems unmountable. + * We set this to track the value of (MAX_UPL_TRANSFER*PAGE_SIZE) from + * osfmk/mach/memory_object_types.h to bound it at the maximum UPL size. */ -#define MAXBSIZE MAXPHYS +#define MAXBSIZE (256 * 4096) #define MAXPHYSIO MAXPHYS #define MAXFRAG 8 +#define MAXPHYSIO_WIRED (16 * 1024 * 1024) + /* * MAXPATHLEN defines the longest permissable path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer diff --git a/bsd/sys/pipe.h b/bsd/sys/pipe.h new file mode 100644 index 000000000..c999c7dbf --- /dev/null +++ b/bsd/sys/pipe.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2004-2005 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 1996 John S. Dyson + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * 4. This work was done expressly for inclusion into FreeBSD. Other use + * is allowed if this notation is included. + * 5. Modifications may be freely made to this file if the above conditions + * are met. + * + * $FreeBSD: src/sys/sys/pipe.h,v 1.24 2003/08/13 20:01:38 alc Exp $ + */ + +#ifndef _SYS_PIPE_H_ +#define _SYS_PIPE_H_ + +#ifdef KERNEL +#include <libkern/locks.h> +#endif +#include <sys/queue.h> /* for TAILQ macros */ +#include <sys/ev.h> +#include <sys/cdefs.h> + +/* + * Pipe buffer size, keep moderate in value, pipes take kva space. + */ +#ifndef PIPE_SIZE +#define PIPE_SIZE 16384 +#endif + +#ifndef BIG_PIPE_SIZE +#define BIG_PIPE_SIZE (64*1024) +#endif + +#ifndef SMALL_PIPE_SIZE +#define SMALL_PIPE_SIZE PAGE_SIZE +#endif + +/* + * PIPE_MINDIRECT MUST be smaller than PIPE_SIZE and MUST be bigger + * than PIPE_BUF. + */ +#ifndef PIPE_MINDIRECT +#define PIPE_MINDIRECT 8192 +#endif + +#define PIPENPAGES (BIG_PIPE_SIZE / PAGE_SIZE + 1) + +/* + * Pipe buffer information. + * Separate in, out, cnt are used to simplify calculations. + * Buffered write is active when the buffer.cnt field is set. + */ +struct pipebuf { + u_int cnt; /* number of chars currently in buffer */ + u_int in; /* in pointer */ + u_int out; /* out pointer */ + u_int size; /* size of buffer */ + caddr_t buffer; /* kva of buffer */ +}; + + +#ifdef PIPE_DIRECT +/* + * Information to support direct transfers between processes for pipes. + */ +/* LP64todo - not 64bit safe */ +struct pipemapping { + vm_offset_t kva; /* kernel virtual address */ + vm_size_t cnt; /* number of chars in buffer */ + vm_size_t pos; /* current position of transfer */ + int npages; /* number of pages */ + vm_page_t ms[PIPENPAGES]; /* pages in source process */ +}; +#endif + +/* + * Bits in pipe_state. + */ +#define PIPE_ASYNC 0x004 /* Async? I/O. */ +#define PIPE_WANTR 0x008 /* Reader wants some characters. */ +#define PIPE_WANTW 0x010 /* Writer wants space to put characters. */ +#define PIPE_WANT 0x020 /* Pipe is wanted to be run-down. */ +#define PIPE_SEL 0x040 /* Pipe has a select active. */ +#define PIPE_EOF 0x080 /* Pipe is in EOF condition. */ +#define PIPE_LOCKFL 0x100 /* Process has exclusive access to pointers/data. */ +#define PIPE_LWANT 0x200 /* Process wants exclusive access to pointers/data. */ +#define PIPE_DIRECTW 0x400 /* Pipe direct write active. */ +#define PIPE_DIRECTOK 0x800 /* Direct mode ok. */ +#define PIPE_KNOTE 0x1000 /* Pipe has kernel events activated */ + +#ifdef KERNEL +/* + * Per-pipe data structure. + * Two of these are linked together to produce bi-directional pipes. 
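+ */

The separate in/out/cnt fields of struct pipebuf above describe a ring buffer. An editor-added sketch of the usual space arithmetic (pipe_space() and pipe_contig_write() are hypothetical helpers, not part of this patch; the real read/write paths also wrap in/out against size):

static u_int
pipe_space(const struct pipebuf *pb)
{
	return (pb->size - pb->cnt);	/* bytes writable before full */
}

static u_int
pipe_contig_write(const struct pipebuf *pb)
{
	u_int space = pb->size - pb->cnt;
	u_int to_end = pb->size - pb->in;	/* bytes before the wrap */

	return (space < to_end ? space : to_end);
}

+/*
+ * Per-pipe data structure (described above):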
+ */ +struct pipe { + struct pipebuf pipe_buffer; /* data storage */ +#ifdef PIPE_DIRECT + struct pipemapping pipe_map; /* pipe mapping for direct I/O */ +#endif + struct selinfo pipe_sel; /* for compat with select */ + pid_t pipe_pgid; /* information for async I/O */ + struct pipe *pipe_peer; /* link with other direction */ + u_int pipe_state; /* pipe status info */ + int pipe_busy; /* busy flag, mostly to handle rundown sanely */ +#ifdef MAC + struct label *pipe_label; /* pipe MAC label - shared */ +#endif + TAILQ_HEAD(,eventqelt) pipe_evlist; + lck_mtx_t *pipe_mtxp; /* shared mutex between both pipes */ +}; + +#define PIPE_MTX(pipe) ((pipe)->pipe_mtxp) + +#define PIPE_LOCK(pipe) lck_mtx_lock(PIPE_MTX(pipe)) +#define PIPE_UNLOCK(pipe) lck_mtx_unlock(PIPE_MTX(pipe)) +#define PIPE_LOCK_ASSERT(pipe, type) lck_mtx_assert(PIPE_MTX(pipe), (type)) + +__BEGIN_DECLS +extern int pipe_stat(struct pipe *, struct stat *); +__END_DECLS + +#endif /* KERNEL */ + +#endif /* !_SYS_PIPE_H_ */ diff --git a/bsd/sys/poll.h b/bsd/sys/poll.h index 443469915..7c1077722 100644 --- a/bsd/sys/poll.h +++ b/bsd/sys/poll.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -51,9 +51,6 @@ #ifndef _SYS_POLL_H_ #define _SYS_POLL_H_ -#include <sys/appleapiopts.h> - -#ifdef __APPLE_API_PRIVATE /* * This file is intended to be compatable with the traditional poll.h. */ @@ -61,12 +58,6 @@ /* * Requestable events. If poll(2) finds any of these set, they are * copied to revents on return. - * XXX Note that FreeBSD doesn't make much distinction between POLLPRI - * and POLLRDBAND since none of the file types have distinct priority - * bands - and only some have an urgent "mode". - * XXX Note POLLIN isn't really supported in true SVSV terms. Under SYSV - * POLLIN includes all of normal, band and urgent data. Most poll handlers - * on FreeBSD only treat it as "normal" data. */ #define POLLIN 0x0001 /* any readable data available */ #define POLLPRI 0x0002 /* OOB/Urgent readable data */ @@ -78,7 +69,7 @@ /* * FreeBSD extensions: polling on a regular file might return one - * of these events (currently only supported on UFS). + * of these events (currently only supported on local filesystems). */ #define POLLEXTEND 0x0200 /* file may have been extended */ #define POLLATTRIB 0x0400 /* file attributes may have changed */ @@ -96,6 +87,29 @@ #define POLLSTANDARD (POLLIN|POLLPRI|POLLOUT|POLLRDNORM|POLLRDBAND|\ POLLWRBAND|POLLERR|POLLHUP|POLLNVAL) -#endif /* __APPLE_API_PRIVATE */ +struct pollfd +{ + int fd; + short events; + short revents; +}; + +typedef unsigned int nfds_t; + +#if !defined(KERNEL) + +#include <sys/cdefs.h> + +__BEGIN_DECLS + +/* + * This is defined here (instead of <poll.h>) because this is where + * traditional SVR4 code will look to find it. + */ +extern int poll (struct pollfd *, nfds_t, int); + +__END_DECLS + +#endif /* !KERNEL */ #endif /* !_SYS_POLL_H_ */ diff --git a/bsd/sys/proc.h b/bsd/sys/proc.h index a7a500a5c..cbf1b3a80 100644 --- a/bsd/sys/proc.h +++ b/bsd/sys/proc.h @@ -70,168 +70,25 @@ #include <sys/lock.h> #include <sys/param.h> #include <sys/event.h> +#ifdef KERNEL +#include <sys/kernel_types.h> +#endif +#include <mach/boolean.h> -#ifdef __APPLE_API_PRIVATE - -/* - * One structure allocated per session. - */ -struct session { - int s_count; /* Ref cnt; pgrps in session. */ - struct proc *s_leader; /* Session leader. 
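*/

Stepping back to the pollfd interface exported a few hunks above, a minimal, editor-added user-space sketch (wait_readable() is hypothetical):

#include <sys/poll.h>

int
wait_readable(int fd, int timeout_ms)
{
	struct pollfd pfd;

	pfd.fd = fd;
	pfd.events = POLLIN;
	pfd.revents = 0;

	/* > 0: ready descriptors; 0: timeout; -1: error */
	return (poll(&pfd, 1, timeout_ms));
}

/*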
*/ - struct vnode *s_ttyvp; /* Vnode of controlling terminal. */ - struct tty *s_ttyp; /* Controlling terminal. */ - pid_t s_sid; /* Session ID */ - char s_login[MAXLOGNAME]; /* Setlogin() name. */ -}; - -/* - * One structure allocated per process group. - */ -struct pgrp { - LIST_ENTRY(pgrp) pg_hash; /* Hash chain. */ - LIST_HEAD(, proc) pg_members; /* Pointer to pgrp members. */ - struct session *pg_session; /* Pointer to session. */ - pid_t pg_id; /* Pgrp id. */ - int pg_jobc; /* # procs qualifying pgrp for job control */ -}; - -/* - * Description of a process. - * - * This structure contains the information needed to manage a thread of - * control, known in UN*X as a process; it has references to substructures - * containing descriptions of things that the process uses, but may share - * with related processes. The process structure and the substructures - * are always addressible except for those marked "(PROC ONLY)" below, - * which might be addressible only on a processor on which the process - * is running. - */ -struct proc { - LIST_ENTRY(proc) p_list; /* List of all processes. */ - - /* substructures: */ - struct pcred *p_cred; /* Process owner's identity. */ - struct filedesc *p_fd; /* Ptr to open files structure. */ - struct pstats *p_stats; /* Accounting/statistics (PROC ONLY). */ - struct plimit *p_limit; /* Process limits. */ - struct sigacts *p_sigacts; /* Signal actions, state (PROC ONLY). */ - -#define p_ucred p_cred->pc_ucred -#define p_rlimit p_limit->pl_rlimit - - int p_flag; /* P_* flags. */ - char p_stat; /* S* process status. */ - char p_shutdownstate; - char p_pad1[2]; - - pid_t p_pid; /* Process identifier. */ - LIST_ENTRY(proc) p_pglist; /* List of processes in pgrp. */ - struct proc *p_pptr; /* Pointer to parent process. */ - LIST_ENTRY(proc) p_sibling; /* List of sibling processes. */ - LIST_HEAD(, proc) p_children; /* Pointer to list of children. */ - -/* The following fields are all zeroed upon creation in fork. */ -#define p_startzero p_oppid - - pid_t p_oppid; /* Save parent pid during ptrace. XXX */ - int p_dupfd; /* Sideways return value from fdopen. XXX */ - - /* scheduling */ - u_int p_estcpu; /* Time averaged value of p_cpticks. */ - int p_cpticks; /* Ticks of cpu time. */ - fixpt_t p_pctcpu; /* %cpu for this process during p_swtime */ - void *p_wchan; /* Sleep address. */ - char *p_wmesg; /* Reason for sleep. */ - u_int p_swtime; /* DEPRECATED (Time swapped in or out.) */ -#define p_argslen p_swtime /* Length of process arguments. */ - u_int p_slptime; /* Time since last blocked. */ - - struct itimerval p_realtimer; /* Alarm timer. */ - struct timeval p_rtime; /* Real time. */ - u_quad_t p_uticks; /* Statclock hits in user mode. */ - u_quad_t p_sticks; /* Statclock hits in system mode. */ - u_quad_t p_iticks; /* Statclock hits processing intr. */ - - int p_traceflag; /* Kernel trace points. */ - struct vnode *p_tracep; /* Trace to vnode. */ - - sigset_t p_siglist; /* DEPRECATED. */ - - struct vnode *p_textvp; /* Vnode of executable. */ - -/* End area that is zeroed on creation. */ -#define p_endzero p_hash.le_next - - /* - * Not copied, not zero'ed. - * Belongs after p_pid, but here to avoid shifting proc elements. - */ - LIST_ENTRY(proc) p_hash; /* Hash chain. */ - TAILQ_HEAD( ,eventqelt) p_evlist; - -/* The following fields are all copied upon creation in fork. */ -#define p_startcopy p_sigmask - - sigset_t p_sigmask; /* DEPRECATED */ - sigset_t p_sigignore; /* Signals being ignored. */ - sigset_t p_sigcatch; /* Signals being caught by user. 
*/ - - u_char p_priority; /* Process priority. */ - u_char p_usrpri; /* User-priority based on p_cpu and p_nice. */ - char p_nice; /* Process "nice" value. */ - char p_comm[MAXCOMLEN+1]; - - struct pgrp *p_pgrp; /* Pointer to process group. */ - -/* End area that is copied on creation. */ -#define p_endcopy p_xstat - - u_short p_xstat; /* Exit status for wait; also stop signal. */ - u_short p_acflag; /* Accounting flags. */ - struct rusage *p_ru; /* Exit information. XXX */ +#ifdef XNU_KERNEL_PRIVATE +#define PROC_DEF_ENABLED +#else +#ifndef KERNEL +#define PROC_DEF_ENABLED +#endif +#endif - int p_debugger; /* 1: can exec set-bit programs if suser */ - - void *task; /* corresponding task */ - void *sigwait_thread; /* 'thread' holding sigwait */ - struct lock__bsd__ signal_lock; /* multilple thread prot for signals*/ - boolean_t sigwait; /* indication to suspend */ - void *exit_thread; /* Which thread is exiting? */ - caddr_t user_stack; /* where user stack was allocated */ - void * exitarg; /* exit arg for proc terminate */ - void * vm_shm; /* for sysV shared memory */ - int p_argc; /* saved argc for sysctl_procargs() */ - int p_vforkcnt; /* number of outstanding vforks */ - void * p_vforkact; /* activation running this vfork proc */ - TAILQ_HEAD( , uthread) p_uthlist; /* List of uthreads */ - /* Following fields are info from SIGCHLD */ - pid_t si_pid; - u_short si_status; - u_short si_code; - uid_t si_uid; - TAILQ_HEAD( , aio_workq_entry ) aio_activeq; /* active async IO requests */ - int aio_active_count; /* entries on aio_activeq */ - TAILQ_HEAD( , aio_workq_entry ) aio_doneq; /* completed async IO requests */ - int aio_done_count; /* entries on aio_doneq */ - - struct klist p_klist; /* knote list */ - struct auditinfo *p_au; /* User auditing data */ -#if DIAGNOSTIC -#if SIGNAL_DEBUG - unsigned int lockpc[8]; - unsigned int unlockpc[8]; -#endif /* SIGNAL_DEBUG */ -#endif /* DIAGNOSTIC */ -}; +#ifdef PROC_DEF_ENABLED -#else /* !__APPLE_API_PRIVATE */ struct session; struct pgrp; struct proc; -#endif /* !__APPLE_API_PRIVATE */ -#ifdef __APPLE_API_UNSTABLE /* Exported fields for kern sysctls */ struct extern_proc { union { @@ -288,8 +145,6 @@ struct extern_proc { struct rusage *p_ru; /* Exit information. XXX */ }; -#define p_session p_pgrp->pg_session -#define p_pgid p_pgrp->pg_id /* Status values. */ #define SIDL 1 /* Process being created by fork. */ @@ -299,146 +154,129 @@ struct extern_proc { #define SZOMB 5 /* Awaiting collection by parent. */ /* These flags are kept in p_flags. */ -#define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ -#define P_CONTROLT 0x00002 /* Has a controlling terminal. */ -#define P_INMEM 0x00004 /* Loaded into memory. */ -#define P_NOCLDSTOP 0x00008 /* No SIGCHLD when children stop. */ -#define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. */ -#define P_PROFIL 0x00020 /* Has started profiling. */ -#define P_SELECT 0x00040 /* Selecting; wakeup/waiting danger. */ -#define P_SINTR 0x00080 /* Sleep is interruptible. */ -#define P_SUGID 0x00100 /* Had set id privileges since last exec. */ -#define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ -#define P_TIMEOUT 0x00400 /* Timing out during sleep. */ -#define P_TRACED 0x00800 /* Debugged process being traced. */ -#define P_WAITED 0x01000 /* Debugging process has waited for child. */ -#define P_WEXIT 0x02000 /* Working on exiting. */ -#define P_EXEC 0x04000 /* Process called exec. */ +#define P_ADVLOCK 0x00000001 /* Process may hold POSIX adv. 
lock */ +#define P_CONTROLT 0x00000002 /* Has a controlling terminal */ +#define P_LP64 0x00000004 /* Process is LP64 */ +#define P_NOCLDSTOP 0x00000008 /* No SIGCHLD when children stop */ + +#define P_PPWAIT 0x00000010 /* Parent waiting for chld exec/exit */ +#define P_PROFIL 0x00000020 /* Has started profiling */ +#define P_SELECT 0x00000040 /* Selecting; wakeup/waiting danger */ +#define P_CONTINUED 0x00000080 /* Process was stopped and continued */ + +#define P_SUGID 0x00000100 /* Has set privileges since last exec */ +#define P_SYSTEM 0x00000200 /* Sys proc: no sigs, stats or swap */ +#define P_TIMEOUT 0x00000400 /* Timing out during sleep */ +#define P_TRACED 0x00000800 /* Debugged process being traced */ + +#define P_WAITED 0x00001000 /* Debugging prc has waited for child */ +#define P_WEXIT 0x00002000 /* Working on exiting. */ +#define P_EXEC 0x00004000 /* Process called exec. */ /* Should be moved to machine-dependent areas. */ -#define P_OWEUPC 0x08000 /* Owe process an addupc() call at next ast. */ +#define P_OWEUPC 0x00008000 /* Owe process an addupc() call at next ast. */ -#define P_AFFINITY 0x0010000 /* xxx */ -#define P_CLASSIC 0x0020000 /* xxx */ +#define P_AFFINITY 0x00010000 /* xxx */ +#define P_CLASSIC 0x00020000 /* xxx */ /* -#define P_FSTRACE 0x10000 / * tracing via file system (elsewhere?) * / -#define P_SSTEP 0x20000 / * process needs single-step fixup ??? * / +#define P_FSTRACE 0x10000 / * tracing via file system (elsewhere?) * / +#define P_SSTEP 0x20000 / * process needs single-step fixup ??? * / */ -#define P_WAITING 0x0040000 /* process has a wait() in progress */ -#define P_KDEBUG 0x0080000 /* kdebug tracing is on for this process */ -#define P_TTYSLEEP 0x0100000 /* blocked due to SIGTTOU or SIGTTIN */ -#define P_REBOOT 0x0200000 /* Process called reboot() */ -#define P_TBE 0x0400000 /* Process is TBE */ -#define P_SIGEXC 0x0800000 /* signal exceptions */ -#define P_BTRACE 0x1000000 /* process is being branch traced */ -#define P_VFORK 0x2000000 /* process has vfork children */ -#define P_NOATTACH 0x4000000 -#define P_INVFORK 0x8000000 /* proc in vfork */ +#define P_WAITING 0x00040000 /* process has a wait() in progress */ +#define P_KDEBUG 0x00080000 /* kdebug tracing on for this process */ + +#define P_TTYSLEEP 0x00100000 /* blocked due to SIGTTOU or SIGTTIN */ +#define P_REBOOT 0x00200000 /* Process called reboot() */ +#define P_TBE 0x00400000 /* Process is TBE */ +#define P_SIGEXC 0x00800000 /* signal exceptions */ + +#define P_BTRACE 0x01000000 /* process is being branch traced */ +#define P_VFORK 0x02000000 /* process has vfork children */ +#define P_NOATTACH 0x04000000 +#define P_INVFORK 0x08000000 /* proc in vfork */ + #define P_NOSHLIB 0x10000000 /* no shared libs are in use for proc */ /* flag set on exec */ #define P_FORCEQUOTA 0x20000000 /* Force quota for root */ #define P_NOCLDWAIT 0x40000000 /* No zombies when chil procs exit */ #define P_NOREMOTEHANG 0x80000000 /* Don't hang on remote FS ops */ -#define P_NOSWAP 0 /* Obsolete: retained so that nothing breaks */ -#define P_PHYSIO 0 /* Obsolete: retained so that nothing breaks */ -#define P_FSTRACE 0 /* Obsolete: retained so that nothing breaks */ -#define P_SSTEP 0 /* Obsolete: retained so that nothing breaks */ - -/* - * Shareable process credentials (always resident). This includes a reference - * to the current user credentials as well as real and saved ids that may be - * used to change ids. - */ -struct pcred { - struct lock__bsd__ pc_lock; - struct ucred *pc_ucred; /* Current credentials. 
*/ - uid_t p_ruid; /* Real user id. */ - uid_t p_svuid; /* Saved effective user id. */ - gid_t p_rgid; /* Real group id. */ - gid_t p_svgid; /* Saved effective group id. */ - int p_refcnt; /* Number of references. */ -}; +#define P_INMEM 0 /* Obsolete: retained for compilation */ +#define P_NOSWAP 0 /* Obsolete: retained for compilation */ +#define P_PHYSIO 0 /* Obsolete: retained for compilation */ +#define P_FSTRACE 0 /* Obsolete: retained for compilation */ +#define P_SSTEP 0 /* Obsolete: retained for compilation */ -#define pcred_readlock(p) lockmgr(&(p)->p_cred->pc_lock, \ - LK_SHARED, 0, (p)) -#define pcred_writelock(p) lockmgr(&(p)->p_cred->pc_lock, \ - LK_EXCLUSIVE, 0, (p)) -#define pcred_unlock(p) lockmgr(&(p)->p_cred->pc_lock, \ - LK_RELEASE, 0, (p)) -#endif /* __APPLE_API_UNSTABLE */ +#endif /* PROC_DEF_ENABLED */ #ifdef KERNEL - __BEGIN_DECLS -#ifdef __APPLE_API_PRIVATE -/* - * We use process IDs <= PID_MAX; PID_MAX + 1 must also fit in a pid_t, - * as it is used to represent "no process group". - */ -extern int nprocs, maxproc; /* Current and max number of procs. */ -__private_extern__ int hard_maxproc; /* hard limit */ - -#define PID_MAX 30000 -#define NO_PID 30001 - -#define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) -#define SESSHOLD(s) ((s)->s_count++) -#define SESSRELE(s) sessrele(s) - -#define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) -extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; -extern u_long pidhash; - -#define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) -extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; -extern u_long pgrphash; - -LIST_HEAD(proclist, proc); -extern struct proclist allproc; /* List of all processes. */ -extern struct proclist zombproc; /* List of zombie processes. */ -extern struct proc *initproc, *kernproc; -extern void pgdelete __P((struct pgrp *pgrp)); -extern void sessrele __P((struct session *sess)); -extern void procinit __P((void)); -__private_extern__ char *proc_core_name(const char *name, uid_t uid, pid_t pid); + +extern proc_t kernproc; + extern int proc_is_classic(struct proc *p); struct proc *current_proc_EXTERNAL(void); -#endif /* __APPLE_API_PRIVATE */ - -#ifdef __APPLE_API_UNSTABLE - -extern int isinferior(struct proc *, struct proc *); -extern struct proc *pfind __P((pid_t)); /* Find process by id. */ -__private_extern__ struct proc *pzfind(pid_t); /* Find zombie by id. */ -extern struct pgrp *pgfind __P((pid_t)); /* Find process group by id. 
*/
-
-extern int chgproccnt __P((uid_t uid, int diff));
-extern int enterpgrp __P((struct proc *p, pid_t pgid, int mksess));
-extern void fixjobc __P((struct proc *p, struct pgrp *pgrp, int entering));
-extern int inferior __P((struct proc *p));
-extern int leavepgrp __P((struct proc *p));
-#ifdef __APPLE_API_OBSOLETE
-extern void mi_switch __P((void));
-#endif /* __APPLE_API_OBSOLETE */
-extern void resetpriority __P((struct proc *));
-extern void setrunnable __P((struct proc *));
-extern void setrunqueue __P((struct proc *));
-extern int sleep __P((void *chan, int pri));
-extern int tsleep __P((void *chan, int pri, char *wmesg, int timo));
-extern int tsleep0 __P((void *chan, int pri, char *wmesg, int timo, int (*continuation)(int)));
-extern int tsleep1 __P((void *chan, int pri, char *wmesg, u_int64_t abstime, int (*continuation)(int)));
-extern void unsleep __P((struct proc *));
-extern void wakeup __P((void *chan));
-#endif /* __APPLE_API_UNSTABLE */
-__END_DECLS
+extern int msleep(void *chan, lck_mtx_t *mtx, int pri, const char *wmesg, struct timespec * ts );
+extern void unsleep(struct proc *);
+extern void wakeup(void *chan);
+extern void wakeup_one(caddr_t chan);
+
+/* proc kpis */
+/* this routine returns the pid of the current process */
+extern int proc_selfpid(void);
+/* this routine returns the pid of the parent of the current process */
+extern int proc_selfppid(void);
+/* this routine sends the signal signum to the process identified by the pid */
+extern void proc_signal(int pid, int signum);
+/* this routine checks whether any of the signals identified by the mask are pending in the process identified by the pid. The check is on all threads of the process. */
+extern int proc_issignal(int pid, sigset_t mask);
+/* this routine returns 1 if pid1 is an inferior of pid2 */
+extern int proc_isinferior(int pid1, int pid2);
+/* this routine copies the name of the process's executable to the passed-in buffer. The
+ * result is always null terminated. The size of the buffer is to be passed in as well. This
+ * routine is typically to be used for debugging
+ */
+void proc_name(int pid, char * buf, int size);
+/* This routine is similar to proc_name except that it operates on the current process */
+void proc_selfname(char * buf, int size);
+
+/* find a process with a given pid. This comes with a reference which needs to be dropped by proc_rele */
+extern proc_t proc_find(int pid);
+/* returns a handle to the current process which is referenced. The reference needs to be dropped with proc_rele */
+extern proc_t proc_self(void);
+/* releases the held reference on the process */
+extern int proc_rele(proc_t p);
+/* returns the pid of the given process */
+extern int proc_pid(proc_t);
+/* returns the pid of the parent of a given process */
+extern int proc_ppid(proc_t);
+/* returns 1 if the process is marked for no remote hangs */
+extern int proc_noremotehang(proc_t);
+/* returns 1 if the process is marked for force quota */
+extern int proc_forcequota(proc_t);
+
+/* this routine returns 1 if the process is running with a 64-bit address space, else 0 */
+extern int proc_is64bit(proc_t);
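The find/self/rele KPIs above are reference counted: every successful proc_find() must be balanced by a proc_rele(). An illustrative, editor-added sketch (log_parent_of() is hypothetical; proc_find() is assumed to return NULL when no such pid exists, and the buffer size is arbitrary):

static void
log_parent_of(int pid)
{
	proc_t p = proc_find(pid);	/* takes a reference on success */
	char name[32];

	if (p == NULL)
		return;
	proc_name(pid, name, sizeof(name));
	printf("%s: pid %d, parent %d\n", name, proc_pid(p), proc_ppid(p));
	proc_rele(p);			/* drop the reference */
}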
+/* is this process exiting? */
+extern int proc_exiting(proc_t);
+/* this routine returns an error if the process is not one with superuser privileges */
+int proc_suser(struct proc *p);
+/* returns the ucred associated with the process; temporary api */
+struct ucred * proc_ucred(struct proc *p);
+
+/* LP64todo - figure out how to identify 64-bit processes if NULL procp */
+extern int IS_64BIT_PROCESS(proc_t);
+extern int proc_pendingsignals(struct proc *, sigset_t);
+extern int proc_tbe(struct proc *);
+
+#ifdef KERNEL_PRIVATE
+extern int tsleep(void *chan, int pri, const char *wmesg, int timo);
+extern int msleep1(void *chan, lck_mtx_t *mtx, int pri, const char *wmesg, u_int64_t timo);
+#endif
-#ifdef __APPLE_API_OBSOLETE
-/* FreeBSD source compatibility macro */
-#define PRISON_CHECK(p1, p2) (1)
-#endif /* __APPLE_API_OBSOLETE */
+__END_DECLS
 
 #endif /* KERNEL */
diff --git a/bsd/sys/proc_internal.h b/bsd/sys/proc_internal.h
new file mode 100644
index 000000000..6d7c06111
--- /dev/null
+++ b/bsd/sys/proc_internal.h
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+/* Copyright (c) 1995, 1997 Apple Computer, Inc. All Rights Reserved */
+/*-
+ * Copyright (c) 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)proc_internal.h 8.15 (Berkeley) 5/19/95
+ */
+
+#ifndef _SYS_PROC_INTERNAL_H_
+#define _SYS_PROC_INTERNAL_H_
+
+#include <sys/proc.h>
+__BEGIN_DECLS
+#include <kern/locks.h>
+__END_DECLS
+
+/*
+ * One structure allocated per session.
+ */
+struct session {
+ int s_count; /* Ref cnt; pgrps in session. */
+ struct proc *s_leader; /* Session leader. */
+ struct vnode *s_ttyvp; /* Vnode of controlling terminal. */
+ struct tty *s_ttyp; /* Controlling terminal. */
+ pid_t s_sid; /* Session ID */
+ char s_login[MAXLOGNAME]; /* Setlogin() name. */
+};
+
+/*
+ * One structure allocated per process group.
+ */
+struct pgrp {
+ LIST_ENTRY(pgrp) pg_hash; /* Hash chain. */
+ LIST_HEAD(, proc) pg_members; /* Pointer to pgrp members. */
+ struct session *pg_session; /* Pointer to session. */
+ pid_t pg_id; /* Pgrp id. */
+ int pg_jobc; /* # procs qualifying pgrp for job control */
+};
+
+struct proc;
+
+#define PROC_NULL (struct proc *)0
+
+#define p_session p_pgrp->pg_session
+#define p_pgid p_pgrp->pg_id
+
+/*
+ * Description of a process.
+ *
+ * This structure contains the information needed to manage a thread of
+ * control, known in UN*X as a process; it has references to substructures
+ * containing descriptions of things that the process uses, but may share
+ * with related processes. The process structure and the substructures
+ * are always addressable except for those marked "(PROC ONLY)" below,
+ * which might be addressable only on a processor on which the process
+ * is running.
+ */
+struct proc {
+ LIST_ENTRY(proc) p_list; /* List of all processes. */
+
+ /* substructures: */
+ struct ucred *p_ucred; /* Process owner's identity. */
+ struct filedesc *p_fd; /* Ptr to open files structure. */
+ struct pstats *p_stats; /* Accounting/statistics (PROC ONLY). */
+ struct plimit *p_limit; /* Process limits. */
+ struct sigacts *p_sigacts; /* Signal actions, state (PROC ONLY). */
+
+#define p_rlimit p_limit->pl_rlimit
+
+ int p_flag; /* P_* flags. */
+ char p_stat; /* S* process status. */
+ char p_shutdownstate;
+ char p_pad1[2];
+
+ pid_t p_pid; /* Process identifier. */
+ LIST_ENTRY(proc) p_pglist; /* List of processes in pgrp. */
+ struct proc *p_pptr; /* Pointer to parent process. */
+ LIST_ENTRY(proc) p_sibling; /* List of sibling processes. */
+ LIST_HEAD(, proc) p_children; /* Pointer to list of children. */
+
+/* The following fields are all zeroed upon creation in fork. */
+#define p_startzero p_oppid
+
+ pid_t p_oppid; /* Save parent pid during ptrace. XXX */
+ int p_dupfd; /* Sideways return value from fdopen. XXX */
+
+ /* scheduling */
+ u_int p_estcpu; /* Time averaged value of p_cpticks. */
+ int p_cpticks; /* Ticks of cpu time. */
+ fixpt_t p_pctcpu; /* %cpu for this process during p_swtime */
+ void *p_wchan; /* Sleep address. */
+ char *p_wmesg; /* Reason for sleep. */
+ u_int p_swtime; /* DEPRECATED (Time swapped in or out.) */
+#define p_argslen p_swtime /* Length of process arguments. */
+ u_int p_slptime; /* Time since last blocked. */
+
+ struct itimerval p_realtimer; /* Alarm timer. */
+ struct timeval p_rtime; /* Real time. */
+ u_quad_t p_uticks; /* Statclock hits in user mode. */
+ u_quad_t p_sticks; /* Statclock hits in system mode. */
+ u_quad_t p_iticks; /* Statclock hits processing intr. */
+
+ int p_traceflag; /* Kernel trace points. */
+ struct vnode *p_tracep; /* Trace to vnode. */
+
+ sigset_t p_siglist; /* DEPRECATED. */
+
+ struct vnode *p_textvp; /* Vnode of executable. */
+
+/* End area that is zeroed on creation. */
+#define p_endzero p_hash.le_next
+
+ /*
+ * Not copied, not zero'ed.
+ * Belongs after p_pid, but here to avoid shifting proc elements.
+ */
+ LIST_ENTRY(proc) p_hash; /* Hash chain. */
+ TAILQ_HEAD( ,eventqelt) p_evlist;
+
+/* The following fields are all copied upon creation in fork. */
+#define p_startcopy p_sigmask
+
+ sigset_t p_sigmask; /* DEPRECATED */
+ sigset_t p_sigignore; /* Signals being ignored. */
+ sigset_t p_sigcatch; /* Signals being caught by user. */
+
+ u_char p_priority; /* Process priority. */
+ u_char p_usrpri; /* User-priority based on p_cpu and p_nice. */
+ char p_nice; /* Process "nice" value. */
+ char p_comm[MAXCOMLEN+1];
+
+ struct pgrp *p_pgrp; /* Pointer to process group. */
+
+/* End area that is copied on creation. */
+#define p_endcopy p_xstat
+
+ u_short p_xstat; /* Exit status for wait; also stop signal. */
+ u_short p_acflag; /* Accounting flags. */
+ struct rusage *p_ru; /* Exit information. XXX */
+
+ int p_debugger; /* 1: can exec set-bit programs if suser */
+
+ void *task; /* corresponding task */
+ void *sigwait_thread; /* 'thread' holding sigwait */
+ char signal_lock[72];
+ boolean_t sigwait; /* indication to suspend */
+ void *exit_thread; /* Which thread is exiting? */
+ user_addr_t user_stack; /* where user stack was allocated */
+ void * exitarg; /* exit arg for proc terminate */
+ void * vm_shm; /* for sysV shared memory */
+ int p_argc; /* saved argc for sysctl_procargs() */
+ int p_vforkcnt; /* number of outstanding vforks */
+ void * p_vforkact; /* activation running this vfork proc */
+ TAILQ_HEAD( , uthread) p_uthlist; /* List of uthreads */
+ /* Following fields are info from SIGCHLD */
+ pid_t si_pid;
+ u_short si_status;
+ u_short si_code;
+ uid_t si_uid;
+ TAILQ_HEAD( , aio_workq_entry ) aio_activeq; /* active async IO requests */
+ int aio_active_count; /* entries on aio_activeq */
+ TAILQ_HEAD( , aio_workq_entry ) aio_doneq; /* completed async IO requests */
+ int aio_done_count; /* entries on aio_doneq */
+
+ struct klist p_klist; /* knote list */
+ lck_mtx_t p_mlock; /* proc lock to protect evques */
+ lck_mtx_t p_fdmlock; /* proc lock to protect fd table modification */
+ unsigned int p_fdlock_pc[4];
+ unsigned int p_fdunlock_pc[4];
+ int p_fpdrainwait;
+ int p_lflag; /* local flags */
+#if DIAGNOSTIC
+#if SIGNAL_DEBUG
+ unsigned int lockpc[8];
+ unsigned int unlockpc[8];
+#endif /* SIGNAL_DEBUG */
+#endif /* DIAGNOSTIC */
+};
+
+
+#define P_LDELAYTERM 0x1 /* */
+#define P_LNOZOMB 0x2 /* */
+#define P_LLOW_PRI_IO 0x4
+#define P_LPEXIT 0x8
+#define P_LBACKGROUND_IO 0x10
+
+// LP64todo - should this move?
+/* LP64 version of extern_proc. All pointers
+ * grow when we're dealing with a 64-bit process. 
+ * WARNING - keep in sync with extern_proc + * but use native alignment of 64-bit process. + */ + +#ifdef KERNEL +#include <sys/time.h> /* user_timeval, user_itimerval */ + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_extern_proc { + union { + struct { + user_addr_t __p_forw; /* Doubly-linked run/sleep queue. */ + user_addr_t __p_back; + } p_st1; + struct user_timeval __p_starttime; /* process start time */ + } p_un; + user_addr_t p_vmspace; /* Address space. */ + user_addr_t p_sigacts; /* Signal actions, state (PROC ONLY). */ + int p_flag; /* P_* flags. */ + char p_stat; /* S* process status. */ + pid_t p_pid; /* Process identifier. */ + pid_t p_oppid; /* Save parent pid during ptrace. XXX */ + int p_dupfd; /* Sideways return value from fdopen. XXX */ + /* Mach related */ + user_addr_t user_stack; /* where user stack was allocated */ + user_addr_t exit_thread; /* XXX Which thread is exiting? */ + int p_debugger; /* allow to debug */ + boolean_t sigwait; /* indication to suspend */ + /* scheduling */ + u_int p_estcpu; /* Time averaged value of p_cpticks. */ + int p_cpticks; /* Ticks of cpu time. */ + fixpt_t p_pctcpu; /* %cpu for this process during p_swtime */ + user_addr_t p_wchan; /* Sleep address. */ + user_addr_t p_wmesg; /* Reason for sleep. */ + u_int p_swtime; /* Time swapped in or out. */ + u_int p_slptime; /* Time since last blocked. */ + struct user_itimerval p_realtimer; /* Alarm timer. */ + struct user_timeval p_rtime; /* Real time. */ + u_quad_t p_uticks; /* Statclock hits in user mode. */ + u_quad_t p_sticks; /* Statclock hits in system mode. */ + u_quad_t p_iticks; /* Statclock hits processing intr. */ + int p_traceflag; /* Kernel trace points. */ + user_addr_t p_tracep; /* Trace to vnode. */ + int p_siglist; /* DEPRECATED */ + user_addr_t p_textvp; /* Vnode of executable. */ + int p_holdcnt; /* If non-zero, don't swap. */ + sigset_t p_sigmask; /* DEPRECATED. */ + sigset_t p_sigignore; /* Signals being ignored. */ + sigset_t p_sigcatch; /* Signals being caught by user. */ + u_char p_priority; /* Process priority. */ + u_char p_usrpri; /* User-priority based on p_cpu and p_nice. */ + char p_nice; /* Process "nice" value. */ + char p_comm[MAXCOMLEN+1]; + user_addr_t p_pgrp; /* Pointer to process group. */ + user_addr_t p_addr; /* Kernel virtual addr of u-area (PROC ONLY). */ + u_short p_xstat; /* Exit status for wait; also stop signal. */ + u_short p_acflag; /* Accounting flags. */ + user_addr_t p_ru; /* Exit information. XXX */ +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif +#endif /* KERNEL */ + +/* + * We use process IDs <= PID_MAX; PID_MAX + 1 must also fit in a pid_t, + * as it is used to represent "no process group". + */ +extern int nprocs, maxproc; /* Current and max number of procs. */ +__private_extern__ int hard_maxproc; /* hard limit */ + +#define PID_MAX 30000 +#define NO_PID 30001 + +#define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) +#define SESSHOLD(s) ((s)->s_count++) +#define SESSRELE(s) sessrele(s) + +#define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) +extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; +extern u_long pidhash; + +#define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) +extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; +extern u_long pgrphash; +extern lck_grp_t * proc_lck_grp; +extern lck_grp_attr_t * proc_lck_grp_attr; +extern lck_attr_t * proc_lck_attr; + +LIST_HEAD(proclist, proc); +extern struct proclist allproc; /* List of all processes. 
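
As a reading aid, here is a minimal sketch (hypothetical code, not from this patch) of how a pid lookup in the style of pfind() falls out of the PIDHASH macro and the <sys/queue.h> hash chains declared above; it assumes procinit() has already populated pidhashtbl:

    struct proc *
    example_pfind(pid_t pid)
    {
            struct proc *p;

            /* PIDHASH masks the pid down to one bucket of pidhashtbl. */
            LIST_FOREACH(p, PIDHASH(pid), p_hash)
                    if (p->p_pid == pid)
                            return (p);
            return (PROC_NULL);
    }
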
*/ +extern struct proclist zombproc; /* List of zombie processes. */ +extern struct proc *initproc; +extern void pgdelete(struct pgrp *pgrp); +extern void sessrele(struct session *sess); +extern void procinit(void); +extern void proc_lock(struct proc *); +extern void proc_unlock(struct proc *); +extern void proc_fdlock(struct proc *); +extern void proc_fdunlock(struct proc *); +__private_extern__ char *proc_core_name(const char *name, uid_t uid, pid_t pid); +extern int isinferior(struct proc *, struct proc *); +extern struct proc *pfind(pid_t); /* Find process by id. */ +__private_extern__ struct proc *pzfind(pid_t); /* Find zombie by id. */ +extern struct pgrp *pgfind(pid_t); /* Find process group by id. */ + +extern int chgproccnt(uid_t uid, int diff); +extern int enterpgrp(struct proc *p, pid_t pgid, int mksess); +extern void fixjobc(struct proc *p, struct pgrp *pgrp, int entering); +extern int inferior(struct proc *p); +extern int leavepgrp(struct proc *p); +extern void resetpriority(struct proc *); +extern void setrunnable(struct proc *); +extern void setrunqueue(struct proc *); +extern int sleep(void *chan, int pri); +extern int tsleep0(void *chan, int pri, const char *wmesg, int timo, int (*continuation)(int)); +extern int tsleep1(void *chan, int pri, const char *wmesg, u_int64_t abstime, int (*continuation)(int)); +extern int msleep0(void *chan, lck_mtx_t *mtx, int pri, const char *wmesg, int timo, int (*continuation)(int)); +extern void vfork_return(thread_t th_act, struct proc *p, struct proc *p2, register_t *retval); + + +#endif /* !_SYS_PROC_INTERNAL_H_ */ diff --git a/bsd/sys/protosw.h b/bsd/sys/protosw.h index 80c1a3120..693c71733 100644 --- a/bsd/sys/protosw.h +++ b/bsd/sys/protosw.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -60,12 +60,21 @@ #ifndef _SYS_PROTOSW_H_ #define _SYS_PROTOSW_H_ +#include <sys/appleapiopts.h> +#include <sys/cdefs.h> + +#define PR_SLOWHZ 2 /* 2 slow timeouts per second */ +#define PR_FASTHZ 5 /* 5 fast timeouts per second */ + +#ifdef PRIVATE + /* Forward declare these structures referenced from prototypes below. */ struct mbuf; struct proc; struct sockaddr; struct socket; struct sockopt; +struct socket_filter; /*#ifdef _KERNEL*/ /* @@ -91,49 +100,65 @@ struct sockopt; * described below. 
*/ -#include <sys/appleapiopts.h> #include <sys/socketvar.h> #include <sys/queue.h> +#ifdef KERNEL +#include <kern/locks.h> +#endif /* KERNEL */ + +#if __DARWIN_ALIGN_POWER +#pragma options align=power +#endif -#ifdef __APPLE_API_UNSTABLE struct protosw { short pr_type; /* socket type used for */ struct domain *pr_domain; /* domain protocol a member of */ short pr_protocol; /* protocol number */ unsigned int pr_flags; /* see below */ /* protocol-protocol hooks */ - void (*pr_input) __P((struct mbuf *, int len)); + void (*pr_input)(struct mbuf *, int len); /* input to protocol (from below) */ - int (*pr_output) __P((struct mbuf *m, struct socket *so)); + int (*pr_output)(struct mbuf *m, struct socket *so); /* output to protocol (from above) */ - void (*pr_ctlinput)__P((int, struct sockaddr *, void *)); + void (*pr_ctlinput)(int, struct sockaddr *, void *); /* control input (from below) */ - int (*pr_ctloutput)__P((struct socket *, struct sockopt *)); + int (*pr_ctloutput)(struct socket *, struct sockopt *); /* control output (from above) */ /* user-protocol hook */ void *pr_ousrreq; /* utility hooks */ - void (*pr_init) __P((void)); /* initialization hook */ - void (*pr_fasttimo) __P((void)); + void (*pr_init)(void); /* initialization hook */ + void (*pr_fasttimo)(void); /* fast timeout (200ms) */ - void (*pr_slowtimo) __P((void)); + void (*pr_slowtimo)(void); /* slow timeout (500ms) */ - void (*pr_drain) __P((void)); + void (*pr_drain)(void); /* flush any excess space possible */ #if __APPLE__ - int (*pr_sysctl)(); /* sysctl for protocol */ + int (*pr_sysctl)(int *, u_int, void *, size_t *, void *, size_t); + /* sysctl for protocol */ #endif struct pr_usrreqs *pr_usrreqs; /* supersedes pr_usrreq() */ +#if __APPLE__ + int (*pr_lock) (struct socket *so, int locktype, int debug); /* lock function for protocol */ + int (*pr_unlock) (struct socket *so, int locktype, int debug); /* unlock for protocol */ +#ifdef _KERN_LOCKS_H_ + lck_mtx_t * (*pr_getlock) (struct socket *so, int locktype); +#else + void * (*pr_getlock) (struct socket *so, int locktype); +#endif +#endif #if __APPLE__ /* Implant hooks */ - TAILQ_HEAD(pr_sfilter, NFDescriptor) pr_sfilter; + TAILQ_HEAD(, socket_filter) pr_filter_head; struct protosw *pr_next; /* Chain for domain */ - u_long reserved[4]; /* Padding for future use */ + u_long reserved[1]; /* Padding for future use */ #endif }; -#define PR_SLOWHZ 2 /* 2 slow timeouts per second */ -#define PR_FASTHZ 5 /* 5 fast timeouts per second */ +#if __DARWIN_ALIGN_POWER +#pragma options align=reset +#endif /* * Values for pr_flags. @@ -144,13 +169,16 @@ struct protosw { * is only relevant if PR_CONNREQUIRED is set (otherwise sendto is allowed * anyhow). 
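
To make the new-style table concrete, a protocol might be registered roughly as follows. This is a hypothetical sketch: my_input(), my_init(), my_domain, and my_usrreqs are stand-ins, SOCK_DGRAM comes from <sys/socket.h>, and only struct protosw, the PR_* flags, and net_add_proto() come from this header:

    static void my_input(struct mbuf *m, int len);
    static void my_init(void);
    extern struct domain my_domain;          /* assumed to exist elsewhere */
    extern struct pr_usrreqs my_usrreqs;     /* assumed to exist elsewhere */

    static struct protosw my_proto = {
            .pr_type     = SOCK_DGRAM,
            .pr_domain   = &my_domain,
            .pr_protocol = 0,
            .pr_flags    = PR_ATOMIC | PR_ADDR,
            .pr_input    = my_input,
            .pr_init     = my_init,
            .pr_usrreqs  = &my_usrreqs,
    };

    /* Registration links my_proto into my_domain's protocol chain:
     *      int err = net_add_proto(&my_proto, &my_domain);
     */
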
*/ -#define PR_ATOMIC 0x01 /* exchange atomic messages only */ -#define PR_ADDR 0x02 /* addresses given with messages */ +#define PR_ATOMIC 0x01 /* exchange atomic messages only */ +#define PR_ADDR 0x02 /* addresses given with messages */ #define PR_CONNREQUIRED 0x04 /* connection required by protocol */ -#define PR_WANTRCVD 0x08 /* want PRU_RCVD calls */ -#define PR_RIGHTS 0x10 /* passes capabilities */ -#define PR_IMPLOPCL 0x20 /* implied open/close */ -#define PR_LASTHDR 0x40 /* enforce ipsec policy; last header */ +#define PR_WANTRCVD 0x08 /* want PRU_RCVD calls */ +#define PR_RIGHTS 0x10 /* passes capabilities */ +#define PR_IMPLOPCL 0x20 /* implied open/close */ +#define PR_LASTHDR 0x40 /* enforce ipsec policy; last header */ +#define PR_PROTOLOCK 0x80 /* protocol takes care of its own locking */ +#define PR_PCBLOCK 0x100 /* protocol supports per pcb finer grain locking */ +#define PR_DISPOSE 0x200 /* protocol requires late lists disposal */ /* * The arguments to usrreq are: @@ -217,35 +245,31 @@ struct uio; * migrate this stuff back into the main structure. */ struct pr_usrreqs { - int (*pru_abort) __P((struct socket *so)); - int (*pru_accept) __P((struct socket *so, struct sockaddr **nam)); - int (*pru_attach) __P((struct socket *so, int proto, - struct proc *p)); - int (*pru_bind) __P((struct socket *so, struct sockaddr *nam, - struct proc *p)); - int (*pru_connect) __P((struct socket *so, struct sockaddr *nam, - struct proc *p)); - int (*pru_connect2) __P((struct socket *so1, struct socket *so2)); - int (*pru_control) __P((struct socket *so, u_long cmd, caddr_t data, - struct ifnet *ifp, struct proc *p)); - int (*pru_detach) __P((struct socket *so)); - int (*pru_disconnect) __P((struct socket *so)); - int (*pru_listen) __P((struct socket *so, struct proc *p)); - int (*pru_peeraddr) __P((struct socket *so, - struct sockaddr **nam)); - int (*pru_rcvd) __P((struct socket *so, int flags)); - int (*pru_rcvoob) __P((struct socket *so, struct mbuf *m, - int flags)); - int (*pru_send) __P((struct socket *so, int flags, struct mbuf *m, + int (*pru_abort)(struct socket *so); + int (*pru_accept)(struct socket *so, struct sockaddr **nam); + int (*pru_attach)(struct socket *so, int proto, struct proc *p); + int (*pru_bind)(struct socket *so, struct sockaddr *nam, + struct proc *p); + int (*pru_connect)(struct socket *so, struct sockaddr *nam, + struct proc *p); + int (*pru_connect2)(struct socket *so1, struct socket *so2); + int (*pru_control)(struct socket *so, u_long cmd, caddr_t data, + struct ifnet *ifp, struct proc *p); + int (*pru_detach)(struct socket *so); + int (*pru_disconnect)(struct socket *so); + int (*pru_listen)(struct socket *so, struct proc *p); + int (*pru_peeraddr)(struct socket *so, struct sockaddr **nam); + int (*pru_rcvd)(struct socket *so, int flags); + int (*pru_rcvoob)(struct socket *so, struct mbuf *m, int flags); + int (*pru_send)(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, - struct proc *p)); + struct proc *p); #define PRUS_OOB 0x1 #define PRUS_EOF 0x2 #define PRUS_MORETOCOME 0x4 - int (*pru_sense) __P((struct socket *so, struct stat *sb)); - int (*pru_shutdown) __P((struct socket *so)); - int (*pru_sockaddr) __P((struct socket *so, - struct sockaddr **nam)); + int (*pru_sense)(struct socket *so, struct stat *sb); + int (*pru_shutdown)(struct socket *so); + int (*pru_sockaddr)(struct socket *so, struct sockaddr **nam); /* * These three added later, so they are out of order.
They are used @@ -255,17 +279,19 @@ struct pr_usrreqs { * through these entry points. For protocols which still use * the generic code, these just point to those routines. */ - int (*pru_sosend) __P((struct socket *so, struct sockaddr *addr, + int (*pru_sosend)(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, - struct mbuf *control, int flags)); - int (*pru_soreceive) __P((struct socket *so, + struct mbuf *control, int flags); + int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, - struct mbuf **controlp, int *flagsp)); - int (*pru_sopoll) __P((struct socket *so, int events, - struct ucred *cred, void *)); + struct mbuf **controlp, int *flagsp); + int (*pru_sopoll)(struct socket *so, int events, + struct ucred *cred, void *); }; +__BEGIN_DECLS + extern int pru_abort_notsupp(struct socket *so); extern int pru_accept_notsupp(struct socket *so, struct sockaddr **nam); extern int pru_attach_notsupp(struct socket *so, int proto, @@ -300,8 +326,9 @@ extern int pru_soreceive_notsupp(struct socket *so, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); extern int pru_sopoll_notsupp(struct socket *so, int events, - struct ucred *cred); + struct ucred *cred, void *); +__END_DECLS #endif /* KERNEL */ @@ -375,14 +402,20 @@ char *prcorequests[] = { #endif #ifdef KERNEL -void pfctlinput __P((int, struct sockaddr *)); -void pfctlinput2 __P((int, struct sockaddr *, void *)); -struct protosw *pffindproto __P((int family, int protocol, int type)); -struct protosw *pffindtype __P((int family, int type)); + +__BEGIN_DECLS + +void pfctlinput(int, struct sockaddr *); +void pfctlinput2(int, struct sockaddr *, void *); +struct protosw *pffindproto(int family, int protocol, int type); +struct protosw *pffindproto_locked(int family, int protocol, int type); +struct protosw *pffindtype(int family, int type); extern int net_add_proto(struct protosw *, struct domain *); extern int net_del_proto(int, int, struct domain *); +__END_DECLS + /* Temp hack to link static domains together */ #define LINK_PROTOS(psw) \ @@ -395,5 +428,6 @@ static void link_ ## psw ## _protos() \ } #endif -#endif /* __APPLE_API_UNSTABLE */ + +#endif /* PRIVATE */ #endif /* !_SYS_PROTOSW_H_ */ diff --git a/bsd/sys/ptrace.h b/bsd/sys/ptrace.h index 3ae7cdadb..61ae0448a 100644 --- a/bsd/sys/ptrace.h +++ b/bsd/sys/ptrace.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -59,6 +59,7 @@ #define _SYS_PTRACE_H_ #include <sys/appleapiopts.h> +#include <sys/cdefs.h> #define PT_TRACE_ME 0 /* child declares it's being traced */ #define PT_READ_I 1 /* read word in child's I space */ @@ -80,21 +81,21 @@ #define PT_DENY_ATTACH 31 #define PT_FIRSTMACH 32 /* for machine-specific requests */ -#include <machine/ptrace.h> /* machine-specific requests, if any */ + +__BEGIN_DECLS #ifdef KERNEL #ifdef __APPLE_API_PRIVATE -void proc_reparent __P((struct proc *child, struct proc *newparent)); + +void proc_reparent(struct proc *child, struct proc *newparent); #endif /* __APPLE_API_PRIVATE */ #else /* !KERNEL */ -#include <sys/cdefs.h> - -__BEGIN_DECLS -int ptrace __P((int _request, pid_t _pid, caddr_t _addr, int _data)); -__END_DECLS +int ptrace(int _request, pid_t _pid, caddr_t _addr, int _data); #endif /* !KERNEL */ +__END_DECLS + #endif /* !_SYS_PTRACE_H_ */ diff --git a/bsd/net/netisr.h b/bsd/sys/ptrace_internal.h similarity index 63% rename from bsd/net/netisr.h rename to bsd/sys/ptrace_internal.h index f8db86f47..6d2f11907 100644 --- a/bsd/net/netisr.h +++ b/bsd/sys/ptrace_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -21,8 +21,8 @@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ /* - * Copyright (c) 1980, 1986, 1989, 1993 - * The Regents of the University of California. All rights reserved. + * Copyright (c) 1982, 1986, 1993, 1994 + * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -34,8 +34,8 @@ * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. + * This product includes software developed by the University of + * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. @@ -52,40 +52,46 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)netisr.h 8.1 (Berkeley) 6/10/93 + * @(#)uio.h 8.5 (Berkeley) 2/22/94 */ + +#ifndef _SYS_PTRACE_INTERNAL_H_ +#define _SYS_PTRACE_INTERNAL_H_ + #include <sys/appleapiopts.h> +#ifdef KERNEL_PRIVATE +#include <sys/ptrace.h> + /* - * The networking code runs as a seperate kernel task. + * Additional request flags used by shipping 3rd party products that have been + * patching ptrace. We should be able to remove these additional requests once + * the 3rd party products move to the KPIs introduced in Tiger. */ +#define PT_VENDOR_REQUEST1 5561 /* reserved for 3rd party vendor */ + + +__BEGIN_DECLS + /* - * Each ``pup-level-1'' input queue has a bit in a ``netisr'' status - * word which is used to de-multiplex a single software - * interrupt used for scheduling the network code to calls - * on the lowest level routine of each protocol. + * WARNING - these are temporary KPI that allow binary compatibility with + * shipping product that must patch ptrace.
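
The temporary patching KPI declared just below (temp_patch_ptrace() and temp_unpatch_ptrace()) is meant to be used along these lines. This is a hypothetical kext sketch, not part of the patch; in particular the handler signature shown is an assumption:

    static uintptr_t orig_ptrace;           /* saved original entry point */

    /* Assumed handler shape; the real syscall signature may differ. */
    typedef int (*ptrace_fn_t)(struct proc *, void *uap, int *retval);

    static int
    my_ptrace(struct proc *p, void *uap, int *retval)
    {
            /* ... vendor-specific filtering would happen here ... */
            return ((ptrace_fn_t)orig_ptrace)(p, uap, retval);
    }

    static void
    my_kext_start(void)
    {
            orig_ptrace = temp_patch_ptrace((uintptr_t)my_ptrace);
    }

    static void
    my_kext_stop(void)
    {
            temp_unpatch_ptrace();  /* only after in-flight requests drain */
    }
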
These KPI will be removed in the + next system release that follows Tiger. radar - 3928003 + * + * temp_patch_ptrace - patch ptrace using new_ptrace as current implementation. + * Returns the address of the original ptrace implementation. + * + * temp_unpatch_ptrace - restore ptrace to the original implementation. Caller + * must ensure all in-flight ptrace requests have drained before their kext + * is unloaded. */ -#define NETISR_IP 2 /* same as AF_INET */ -#define NETISR_IMP 3 /* same as AF_IMPLINK */ -#define NETISR_NS 6 /* same as AF_NS */ -#define NETISR_ISO 7 /* same as AF_ISO */ -#define NETISR_CCITT 10 /* same as AF_CCITT */ -#define NETISR_APPLETALK 16 /* same as AF_APPLETALK */ -#define NETISR_ARP 18 /* same as AF_LINK */ -#define NETISR_BLUE 26 /* same as psuedo_AF_BLUE */ -#define NETISR_IPV6 30 /* same as AF_INET6 */ +uintptr_t temp_patch_ptrace(uintptr_t new_ptrace); +void temp_unpatch_ptrace(void); -#define NETISR_SET(a,b) +__END_DECLS -#ifdef __APPLE_API_PRIVATE -#if defined(KERNEL) && !defined(LOCORE) -extern volatile int netisr; /* scheduling bits for network */ -void wakeup(void *); -extern int dlil_input_thread_wakeup; -#define setsoftnet() (wakeup((caddr_t)&dlil_input_thread_wakeup)) -#endif /* defined(KERNEL) && !defined(LOCORE) */ -#define schednetisr(anisr) { netisr |= 1<<(anisr); setsoftnet(); } -#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL_PRIVATE */ +#endif /* !_SYS_PTRACE_INTERNAL_H_ */ diff --git a/bsd/sys/queue.h b/bsd/sys/queue.h index bd4b21341..feb16f76e 100644 --- a/bsd/sys/queue.h +++ b/bsd/sys/queue.h @@ -539,8 +539,8 @@ remque(void *a) #else /* !__GNUC__ */ -void insque __P((void *a, void *b)); -void remque __P((void *a)); +void insque(void *a, void *b); +void remque(void *a); #endif /* __GNUC__ */ diff --git a/bsd/sys/quota.h b/bsd/sys/quota.h index 2e9aa0804..691eecc88 100644 --- a/bsd/sys/quota.h +++ b/bsd/sys/quota.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -62,6 +62,12 @@ #define _SYS_QUOTA_H #include <sys/appleapiopts.h> +#include <sys/cdefs.h> +#ifdef KERNEL_PRIVATE +#include <kern/locks.h> +#endif + +#include <mach/boolean.h> #ifdef __APPLE_API_UNSTABLE /* @@ -159,6 +165,34 @@ struct dqblk { u_int32_t dqb_spare[4]; /* pad struct to power of 2 */ }; +#ifdef KERNEL_PRIVATE +#include <machine/types.h> /* user_time_t */ +/* LP64 version of struct dqblk. time_t is a long and must grow when + * we're dealing with a 64-bit process.
+ * WARNING - keep in sync with struct dqblk + */ + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_dqblk { + u_int64_t dqb_bhardlimit; /* absolute limit on disk bytes alloc */ + u_int64_t dqb_bsoftlimit; /* preferred limit on disk bytes */ + u_int64_t dqb_curbytes; /* current byte count */ + u_int32_t dqb_ihardlimit; /* maximum # allocated inodes + 1 */ + u_int32_t dqb_isoftlimit; /* preferred inode limit */ + u_int32_t dqb_curinodes; /* current # allocated inodes */ + user_time_t dqb_btime; /* time limit for excessive disk use */ + user_time_t dqb_itime; /* time limit for excessive files */ + u_int32_t dqb_id; /* identifier (0 for empty entries) */ + u_int32_t dqb_spare[4]; /* pad struct to power of 2 */ +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif +#endif /* KERNEL_PRIVATE */ #define INITQMAGICS { \ 0xff31ff35, /* USRQUOTA */ \ @@ -211,46 +245,41 @@ dqhashshift(u_long size) #ifndef KERNEL - -#include <sys/cdefs.h> - __BEGIN_DECLS -int quotactl __P((char *, int, int, caddr_t)); +int quotactl(char *, int, int, caddr_t); __END_DECLS #endif /* !KERNEL */ -#ifdef KERNEL +#ifdef KERNEL_PRIVATE #include <sys/queue.h> -/* - * Macros to avoid subroutine calls to trivial functions. - */ -#if DIAGNOSTIC -#define DQREF(dq) dqref(dq) -#else -#define DQREF(dq) (dq)->dq_cnt++ -#endif /* Quota file info */ struct quotafile { + lck_mtx_t qf_lock; /* quota file mutex */ struct vnode *qf_vp; /* quota file vnode */ struct ucred *qf_cred; /* quota file access cred */ int qf_shift; /* primary hash shift */ int qf_maxentries; /* size of hash table (power of 2) */ - int qf_entrycnt; /* count of active entries */ + int qf_entrycnt; /* count of active entries */ time_t qf_btime; /* block quota time limit */ time_t qf_itime; /* inode quota time limit */ + + /* the following 2 fields are protected */ + /* by the quota list lock */ char qf_qflags; /* quota specific flags */ + int qf_refcnt; /* count of dquot refs on this file */ }; /* * Flags describing the runtime state of quotas. * (in qf_qflags) */ -#define QTF_OPENING 0x01 /* Q_QUOTAON in progress */ +#define QTF_OPENING 0x01 /* Q_QUOTAON in progress */ #define QTF_CLOSING 0x02 /* Q_QUOTAOFF in progress */ +#define QTF_WANTED 0x04 /* waiting for change of state */ /* @@ -264,22 +293,28 @@ struct dquot { TAILQ_ENTRY(dquot) dq_freelist; /* free list */ u_int16_t dq_flags; /* flags, see below */ u_int16_t dq_cnt; /* count of active references */ - u_int16_t dq_spare; /* unused spare padding */ + u_int16_t dq_lflags; /* protected by the quota list lock */ u_int16_t dq_type; /* quota type of this dquot */ u_int32_t dq_id; /* identifier this applies to */ u_int32_t dq_index; /* index into quota file */ struct quotafile *dq_qfile; /* quota file that this is taken from */ struct dqblk dq_dqb; /* actual usage & quotas */ }; + +/* + * dq_lflags values + */ +#define DQ_LLOCK 0x01 /* this quota locked (no MODS) */ +#define DQ_LWANT 0x02 /* wakeup on unlock */ + /* - * Flag values. 
+ * dq_flags values */ -#define DQ_LOCK 0x01 /* this quota locked (no MODS) */ -#define DQ_WANT 0x02 /* wakeup on unlock */ -#define DQ_MOD 0x04 /* this quota modified since read */ -#define DQ_FAKE 0x08 /* no limits here, just usage */ -#define DQ_BLKS 0x10 /* has been warned about blk limit */ -#define DQ_INODS 0x20 /* has been warned about inode limit */ +#define DQ_MOD 0x01 /* this quota modified since read */ +#define DQ_FAKE 0x02 /* no limits here, just usage */ +#define DQ_BLKS 0x04 /* has been warned about blk limit */ +#define DQ_INODS 0x08 /* has been warned about inode limit */ + /* * Shorthand notation. */ @@ -311,19 +346,27 @@ struct dquot { * on-disk dqblk data structures. */ __BEGIN_DECLS +void dqfileinit(struct quotafile *); int dqfileopen(struct quotafile *, int); void dqfileclose(struct quotafile *, int); void dqflush(struct vnode *); -int dqget(struct vnode *, u_long, struct quotafile *, int, struct dquot **); +int dqget(u_long, struct quotafile *, int, struct dquot **); void dqinit(void); void dqref(struct dquot *); -void dqrele(struct vnode *, struct dquot *); -void dqreclaim(struct vnode *, struct dquot *); -int dqsync(struct vnode *, struct dquot *); +void dqrele(struct dquot *); +void dqreclaim(struct dquot *); +int dqsync(struct dquot *); void dqsync_orphans(struct quotafile *); +void dqlock(struct dquot *); +void dqunlock(struct dquot *); + +int qf_get(struct quotafile *, int type); +void qf_put(struct quotafile *, int type); + +__private_extern__ void munge_dqblk(struct dqblk *dqblkp, struct user_dqblk *user_dqblkp, boolean_t to64); __END_DECLS -#endif /* KERNEL */ +#endif /* KERNEL_PRIVATE */ #endif /* __APPLE_API_UNSTABLE */ diff --git a/bsd/sys/random.h b/bsd/sys/random.h index c8976a552..db6e0f701 100644 --- a/bsd/sys/random.h +++ b/bsd/sys/random.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999, 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999, 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -24,9 +24,12 @@ #define __SYS_RANDOM_H__ #include <sys/appleapiopts.h> +#include <sys/cdefs.h> #ifdef __APPLE_API_UNSTABLE +__BEGIN_DECLS void read_random(void* buffer, u_int numBytes); +__END_DECLS #endif /* __APPLE_API_UNSTABLE */ #endif /* __SYS_RANDOM_H__ */ diff --git a/bsd/sys/reboot.h b/bsd/sys/reboot.h index a312ed635..12eafdc11 100644 --- a/bsd/sys/reboot.h +++ b/bsd/sys/reboot.h @@ -122,6 +122,9 @@ ((partition) << B_PARTITIONSHIFT) | B_DEVMAGIC) #endif /* __APPLE_API_OBSOLETE */ + +#ifdef BSD_KERNEL_PRIVATE #include <machine/reboot.h> +#endif /* BSD_KERNEL_PRIVATE */ #endif /* _SYS_REBOOT_H_ */ diff --git a/bsd/sys/resource.h b/bsd/sys/resource.h index e2f12be0f..823fcc738 100644 --- a/bsd/sys/resource.h +++ b/bsd/sys/resource.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -59,29 +59,92 @@ #define _SYS_RESOURCE_H_ #include <sys/appleapiopts.h> +#include <sys/cdefs.h> +#include <sys/_types.h> + + +/* [XSI] The timeval structure shall be defined as described in + * <sys/time.h> + * + * NB: We use __darwin_time_t and __darwin_suseconds_t here to avoid + * improperly exposing time_t and suseconds_t into the namespace. 
+ */ +#ifndef _TIMEVAL +#define _TIMEVAL +struct timeval { + __darwin_time_t tv_sec; /* seconds */ + __darwin_suseconds_t tv_usec; /* and microseconds */ +}; +#endif + +/* The id_t type shall be defined as described in <sys/types.h> */ +#ifndef _ID_T +#define _ID_T +typedef __darwin_id_t id_t; /* can hold pid_t, gid_t, or uid_t */ +#endif + /* - * Process priority specifications to get/setpriority. + * Resource limit type (low 63 bits, excluding the sign bit) + */ +typedef __int64_t rlim_t; + + +/***** + * PRIORITY + */ + +/* + * Possible values of the first parameter to getpriority()/setpriority(), + * used to indicate the type of the second parameter. + */ +#define PRIO_PROCESS 0 /* Second argument is a PID */ +#define PRIO_PGRP 1 /* Second argument is a GID */ +#define PRIO_USER 2 /* Second argument is a UID */ + +#ifndef _POSIX_C_SOURCE +/* + * Range limitations for the value of the third parameter to setpriority(). */ #define PRIO_MIN -20 #define PRIO_MAX 20 +#endif /* !_POSIX_C_SOURCE */ -#define PRIO_PROCESS 0 -#define PRIO_PGRP 1 -#define PRIO_USER 2 -/* - * Resource utilization information. + + +/***** + * RESOURCE USAGE */ -#define RUSAGE_SELF 0 -#define RUSAGE_CHILDREN -1 +/* + * Possible values of the first parameter to getrusage(), used to indicate + * the scope of the information to be returned. + */ +#define RUSAGE_SELF 0 /* Current process information */ +#define RUSAGE_CHILDREN -1 /* Current process' children */ +/* + * A structure representing an accounting of resource utilization. The + * address of an instance of this structure is the second parameter to + * getrusage(). + * + * Note: All values other than ru_utime and ru_stime are implementation + * defined and subject to change in a future release. Their use + * is discouraged for standards compliant programs. + */ struct rusage { struct timeval ru_utime; /* user time used */ struct timeval ru_stime; /* system time used */ +#ifdef _POSIX_C_SOURCE + long ru_opaque[14]; /* implementation defined */ +#else /* !_POSIX_C_SOURCE */ + /* + * Informational aliases for source compatibility with programs + * that need more information than that provided by standards, + * and which do not mind being OS-dependent. + */ long ru_maxrss; /* max resident set size */ -#define ru_first ru_ixrss +#define ru_first ru_ixrss /* internal: ruadd() range start */ long ru_ixrss; /* integral shared memory size */ long ru_idrss; /* integral unshared data " */ long ru_isrss; /* integral unshared stack " */ @@ -95,57 +158,107 @@ struct rusage { long ru_nsignals; /* signals received */ long ru_nvcsw; /* voluntary context switches */ long ru_nivcsw; /* involuntary " */ -#define ru_last ru_nivcsw +#define ru_last ru_nivcsw /* internal: ruadd() range end */ +#endif /* !_POSIX_C_SOURCE */ +}; + + + +// LP64todo - should this move? +#ifdef KERNEL +#include <machine/types.h> /* user_time_t */ + +/* LP64 version of struct timeval. time_t is a long and must grow when + * we're dealing with a 64-bit process.
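
For reference, the standards-visible portion of struct rusage is exercised from ordinary code like this (user-space illustration, not part of the patch):

    #include <sys/resource.h>
    #include <stdio.h>

    int main(void)
    {
            struct rusage ru;

            /* Report CPU time consumed by the current process so far. */
            if (getrusage(RUSAGE_SELF, &ru) == 0)
                    printf("user %ld.%06ds, sys %ld.%06ds\n",
                        (long)ru.ru_utime.tv_sec, (int)ru.ru_utime.tv_usec,
                        (long)ru.ru_stime.tv_sec, (int)ru.ru_stime.tv_usec);
            return 0;
    }
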
+ * WARNING - keep in sync with struct timeval + */ + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_rusage_timeval { + user_time_t tv_sec; /* seconds */ + __darwin_suseconds_t tv_usec; /* and microseconds */ +}; +struct user_rusage { + struct user_rusage_timeval ru_utime; /* user time used */ + struct user_rusage_timeval ru_stime; /* system time used */ + user_long_t ru_maxrss; /* max resident set size */ + user_long_t ru_ixrss; /* integral shared memory size */ + user_long_t ru_idrss; /* integral unshared data " */ + user_long_t ru_isrss; /* integral unshared stack " */ + user_long_t ru_minflt; /* page reclaims */ + user_long_t ru_majflt; /* page faults */ + user_long_t ru_nswap; /* swaps */ + user_long_t ru_inblock; /* block input operations */ + user_long_t ru_oublock; /* block output operations */ + user_long_t ru_msgsnd; /* messages sent */ + user_long_t ru_msgrcv; /* messages received */ + user_long_t ru_nsignals; /* signals received */ + user_long_t ru_nvcsw; /* voluntary context switches */ + user_long_t ru_nivcsw; /* involuntary " */ }; +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +#endif // KERNEL + + +/***** + * RESOURCE LIMITS + */ + /* - * Resource limits + * Symbolic constants for resource limits; since all limits are representable + * as a type rlim_t, we are permitted to define RLIM_SAVED_* in terms of + * RLIM_INFINITY. */ -#define RLIMIT_CPU 0 /* cpu time in milliseconds */ -#define RLIMIT_FSIZE 1 /* maximum file size */ -#define RLIMIT_DATA 2 /* data size */ +#define RLIM_INFINITY (((__uint64_t)1 << 63) - 1) /* no limit */ +#define RLIM_SAVED_MAX RLIM_INFINITY /* Unrepresentable hard limit */ +#define RLIM_SAVED_CUR RLIM_INFINITY /* Unrepresentable soft limit */ + +/* + * Possible values of the first parameter to getrlimit()/setrlimit(), to + * indicate for which resource the operation is being performed. + */ +#define RLIMIT_CPU 0 /* cpu time per process, in ms */ +#define RLIMIT_FSIZE 1 /* file size */ +#define RLIMIT_DATA 2 /* data segment size */ #define RLIMIT_STACK 3 /* stack size */ #define RLIMIT_CORE 4 /* core file size */ -#define RLIMIT_RSS 5 /* resident set size */ +#define RLIMIT_AS 5 /* address space (resident set size) */ +#ifndef _POSIX_C_SOURCE +#define RLIMIT_RSS RLIMIT_AS /* source compatibility alias */ #define RLIMIT_MEMLOCK 6 /* locked-in-memory address space */ #define RLIMIT_NPROC 7 /* number of processes */ +#endif /* !_POSIX_C_SOURCE */ #define RLIMIT_NOFILE 8 /* number of open files */ +#ifndef _POSIX_C_SOURCE +#define RLIM_NLIMITS 9 /* total number of resource limits */ +#endif /* !_POSIX_C_SOURCE */ -#define RLIM_NLIMITS 9 /* number of resource limits */ - -#define RLIM_INFINITY (((u_quad_t)1 << 63) - 1) - -struct orlimit { - int32_t rlim_cur; /* current (soft) limit */ - int32_t rlim_max; /* maximum value for rlim_cur */ -}; - +/* + * A structure representing a resource limit. The address of an instance + * of this structure is the second parameter to getrlimit()/setrlimit(). + */ struct rlimit { rlim_t rlim_cur; /* current (soft) limit */ rlim_t rlim_max; /* maximum value for rlim_cur */ }; -/* Load average structure. 
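
A typical use of the limit interfaces declared below is to raise a soft limit up to the current hard limit (user-space illustration, not part of the patch):

    #include <sys/resource.h>

    int raise_nofile(void)
    {
            struct rlimit rl;

            if (getrlimit(RLIMIT_NOFILE, &rl) != 0)
                    return -1;
            rl.rlim_cur = rl.rlim_max;      /* soft may not exceed hard */
            return setrlimit(RLIMIT_NOFILE, &rl);
    }
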
*/ -struct loadavg { - fixpt_t ldavg[3]; - long fscale; -}; -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -extern struct loadavg averunnable; -#define LSCALE 1000 /* scaling for "fixed point" arithmetic */ -#endif /* __APPLE_API_PRIVATE */ -#else -#include <sys/cdefs.h> +#ifndef KERNEL __BEGIN_DECLS -int getpriority __P((int, int)); -int getrlimit __P((int, struct rlimit *)); -int getrusage __P((int, struct rusage *)); -int setpriority __P((int, int, int)); -int setrlimit __P((int, const struct rlimit *)); +int getpriority(int, id_t); +int getrlimit(int, struct rlimit *); +int getrusage(int, struct rusage *); +int setpriority(int, id_t, int); +int setrlimit(int, const struct rlimit *); __END_DECLS -#endif /* KERNEL */ +#endif /* !KERNEL */ #endif /* !_SYS_RESOURCE_H_ */ diff --git a/bsd/sys/resourcevar.h b/bsd/sys/resourcevar.h index 77b30d6fd..8593ed6ef 100644 --- a/bsd/sys/resourcevar.h +++ b/bsd/sys/resourcevar.h @@ -85,6 +85,17 @@ struct pstats { struct itimerval p_timer[3]; /* virtual-time timers */ #define pstat_endcopy p_start struct timeval p_start; /* starting time */ +#ifdef KERNEL + struct user_uprof { /* profile arguments */ + struct user_uprof *pr_next; /* multiple prof buffers allowed */ + user_addr_t pr_base; /* buffer base */ + user_size_t pr_size; /* buffer size */ + user_ulong_t pr_off; /* pc offset */ + user_ulong_t pr_scale; /* pc scaling */ + user_ulong_t pr_addr; /* temp storage for addr until AST */ + user_ulong_t pr_ticks; /* temp storage for ticks until AST */ + } user_p_prof; +#endif // KERNEL }; /* @@ -102,18 +113,21 @@ struct plimit { int p_refcnt; /* number of references */ }; +#ifdef KERNEL /* add user profiling from AST */ #define ADDUPROF(p) \ - addupc_task(p, \ - (p)->p_stats->p_prof.pr_addr, (p)->p_stats->p_prof.pr_ticks) + addupc_task(p, \ + (proc_is64bit((p)) ? (p)->p_stats->user_p_prof.pr_addr \ + : CAST_USER_ADDR_T((p)->p_stats->p_prof.pr_addr)), \ + (proc_is64bit((p)) ? (p)->p_stats->user_p_prof.pr_ticks \ + : (p)->p_stats->p_prof.pr_ticks)) -#ifdef KERNEL -void addupc_intr __P((struct proc *p, u_long pc, u_int ticks)); -void addupc_task __P((struct proc *p, u_long pc, u_int ticks)); -void calcru __P((struct proc *p, struct timeval *up, struct timeval *sp, - struct timeval *ip)); -void ruadd __P((struct rusage *ru, struct rusage *ru2)); -struct plimit *limcopy __P((struct plimit *lim)); +void addupc_intr(struct proc *p, u_long pc, u_int ticks); +void addupc_task(struct proc *p, user_addr_t pc, u_int ticks); +void calcru(struct proc *p, struct timeval *up, struct timeval *sp, + struct timeval *ip); +void ruadd(struct rusage *ru, struct rusage *ru2); +struct plimit *limcopy(struct plimit *lim); #endif /* KERNEL */ #endif /* __APPLE_API_PRIVATE */ diff --git a/bsd/sys/select.h b/bsd/sys/select.h index 6b0ea8e63..1bcd93484 100644 --- a/bsd/sys/select.h +++ b/bsd/sys/select.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -59,14 +59,90 @@ #include <sys/appleapiopts.h> #include <sys/cdefs.h> +#include <sys/_types.h> -#ifdef __APPLE_API_UNSTABLE +/* + * The time_t and suseconds_t types shall be defined as described in + * <sys/types.h> + * The sigset_t type shall be defined as described in <signal.h> + * The timespec structure shall be defined as described in <time.h> + */ +#ifndef _TIME_T +#define _TIME_T +typedef __darwin_time_t time_t; +#endif -__BEGIN_DECLS +#ifndef _SUSECONDS_T +#define _SUSECONDS_T +typedef __darwin_suseconds_t suseconds_t; +#endif + +#ifndef _SIGSET_T +#define _SIGSET_T +typedef __darwin_sigset_t sigset_t; +#endif + +#ifndef _TIMESPEC +#define _TIMESPEC +struct timespec { + time_t tv_sec; + long tv_nsec; +}; +#endif + +/* + * [XSI] The <sys/select.h> header shall define the fd_set type as a structure. + * [XSI] FD_CLR, FD_ISSET, FD_SET, FD_ZERO may be declared as a function, or + * defined as a macro, or both + * [XSI] FD_SETSIZE shall be defined as a macro + * + * Note: We use _FD_SET to protect all select related + * types and macros + */ +#ifndef _FD_SET +#define _FD_SET + +/* + * Select uses bit masks of file descriptors in longs. These macros + * manipulate such bit fields (the filesystem macros use chars). The + * extra protection here is to permit application redefinition above + * the default size. + */ +#ifndef FD_SETSIZE +#define FD_SETSIZE 1024 +#endif + +#define __DARWIN_NBBY 8 /* bits in a byte */ +#define __DARWIN_NFDBITS (sizeof(__int32_t) * __DARWIN_NBBY) /* bits per mask */ +#define __DARWIN_howmany(x, y) (((x) + ((y) - 1)) / (y)) /* # y's == x bits? */ + +typedef struct fd_set { + __int32_t fds_bits[__DARWIN_howmany(FD_SETSIZE, __DARWIN_NFDBITS)]; +} fd_set; + +#define FD_SET(n, p) ((p)->fds_bits[(n)/__DARWIN_NFDBITS] |= (1<<((n) % __DARWIN_NFDBITS))) +#define FD_CLR(n, p) ((p)->fds_bits[(n)/__DARWIN_NFDBITS] &= ~(1<<((n) % __DARWIN_NFDBITS))) +#define FD_ISSET(n, p) ((p)->fds_bits[(n)/__DARWIN_NFDBITS] & (1<<((n) % __DARWIN_NFDBITS))) +#if __GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 3 +/* + * Use the built-in bzero function instead of the library version so that + * we do not pollute the namespace or introduce prototype warnings. + */ +#define FD_ZERO(p) __builtin_bzero(p, sizeof(*(p))) +#else +#define FD_ZERO(p) bzero(p, sizeof(*(p))) +#endif +#ifndef _POSIX_C_SOURCE +#define FD_COPY(f, t) bcopy(f, t, sizeof(*(f))) +#endif /* !_POSIX_C_SOURCE */ + +#endif /* !_FD_SET */ #ifdef KERNEL +#ifdef KERNEL_PRIVATE #include <kern/wait_queue.h> #endif +#include <sys/kernel_types.h> #include <sys/event.h> @@ -74,17 +150,10 @@ __BEGIN_DECLS * Used to maintain information about processes that wish to be * notified when I/O becomes possible. 
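
As a usage illustration (user-space code, not part of the patch), the fd_set machinery defined in this header drives select() like so:

    #include <sys/select.h>

    /* Wait up to 5 seconds for fd to become readable. */
    int wait_readable(int fd)
    {
            fd_set rfds;
            struct timeval tv = { 5, 0 };

            FD_ZERO(&rfds);
            FD_SET(fd, &rfds);
            /* nfds is the highest descriptor of interest plus one. */
            return select(fd + 1, &rfds, 0, 0, &tv);
    }
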
*/ +#ifdef KERNEL_PRIVATE struct selinfo { -#ifdef KERNEL - union { - struct wait_queue wait_queue; /* wait_queue for wait/wakeup */ - struct klist note; /* JMM - temporary separation */ - } si_u; -#define si_wait_queue si_u.wait_queue -#define si_note si_u.note -#else - char si_wait_queue[16]; -#endif + struct wait_queue si_wait_queue; /* wait_queue for wait/wakeup */ + struct klist si_note; /* JMM - temporary separation */ u_int si_flags; /* see below */ }; @@ -93,31 +162,38 @@ struct selinfo { #define SI_INITED 0x0008 /* selinfo has been inited */ #define SI_CLEAR 0x0010 /* selinfo has been cleared */ -#ifdef KERNEL -struct proc; +#else +struct selinfo; +#endif -void selrecord __P((struct proc *selector, struct selinfo *, void *)); -void selwakeup __P((struct selinfo *)); -void selthreadclear __P((struct selinfo *)); -#endif /* KERNEL */ +__BEGIN_DECLS + +void selrecord(proc_t selector, struct selinfo *, void *); +void selwakeup(struct selinfo *); +void selthreadclear(struct selinfo *); __END_DECLS -#endif /* __APPLE_API_UNSTABLE */ +#endif /* KERNEL */ + #ifndef KERNEL +#ifndef _POSIX_C_SOURCE #include <sys/types.h> #ifndef __MWERKS__ #include <signal.h> #endif /* __MWERKS__ */ #include <sys/time.h> +#endif /* !_POSIX_C_SOURCE */ __BEGIN_DECLS #ifndef __MWERKS__ -int pselect(int, fd_set *, fd_set *, fd_set *, - const struct timespec *, const sigset_t *); +int pselect(int, fd_set * __restrict, fd_set * __restrict, + fd_set * __restrict, const struct timespec * __restrict, + const sigset_t * __restrict); #endif /* __MWERKS__ */ -int select(int, fd_set *, fd_set *, fd_set *, struct timeval *); +int select(int, fd_set * __restrict, fd_set * __restrict, + fd_set * __restrict, struct timeval * __restrict); __END_DECLS #endif /* ! KERNEL */ diff --git a/bsd/sys/sem.h b/bsd/sys/sem.h index 8db16a7a6..c31acf9ac 100644 --- a/bsd/sys/sem.h +++ b/bsd/sys/sem.h @@ -25,224 +25,191 @@ * SVID compatible sem.h file * * Author: Daniel Boulet - */ -/* * John Bellardo modified the implementation for Darwin. 12/2000 */ #ifndef _SYS_SEM_H_ #define _SYS_SEM_H_ -#include <sys/appleapiopts.h> -#include <sys/ipc.h> -struct sem { - u_short semval; /* semaphore value */ - pid_t sempid; /* pid of last operation */ - u_short semncnt; /* # awaiting semval > cval */ - u_short semzcnt; /* # awaiting semval = 0 */ -}; - -struct semid_ds { - struct ipc_perm sem_perm; /* operation permission struct */ - struct sem *sem_base; /* pointer to first semaphore in set */ - u_short sem_nsems; /* number of sems in set */ - time_t sem_otime; /* last operation time */ - long sem_pad1; /* SVABI/386 says I need this here */ - time_t sem_ctime; /* last change time */ - /* Times measured in secs since */ - /* 00:00:00 GMT, Jan. 
1, 1970 */ - long sem_pad2; /* SVABI/386 says I need this here */ - long sem_pad3[4]; /* SVABI/386 says I need this here */ -}; - -/* - * semop's sops parameter structure - */ -struct sembuf { - u_short sem_num; /* semaphore # */ - short sem_op; /* semaphore operation */ - short sem_flg; /* operation flags */ -}; -#define SEM_UNDO 010000 - -#define MAX_SOPS 5 /* maximum # of sembuf's per semop call */ - -/* - * semctl's arg parameter structure - */ -union semun { - int val; /* value for SETVAL */ - struct semid_ds *buf; /* buffer for IPC_STAT & IPC_SET */ - u_short *array; /* array for GETALL & SETALL */ -}; - -/* - * commands for semctl - */ -#define GETNCNT 3 /* Return the value of semncnt {READ} */ -#define GETPID 4 /* Return the value of sempid {READ} */ -#define GETVAL 5 /* Return the value of semval {READ} */ -#define GETALL 6 /* Return semvals into arg.array {READ} */ -#define GETZCNT 7 /* Return the value of semzcnt {READ} */ -#define SETVAL 8 /* Set the value of semval to arg.val {ALTER} */ -#define SETALL 9 /* Set semvals from arg.array {ALTER} */ +#include <sys/cdefs.h> +#include <sys/_types.h> /* - * Permissions + * [XSI] All of the symbols from <sys/ipc.h> SHALL be defined + * when this header is included */ -#define SEM_A 0200 /* alter permission */ -#define SEM_R 0400 /* read permission */ +#include <sys/ipc.h> -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -/* - * Kernel implementation stuff - */ -#define SEMVMX 32767 /* semaphore maximum value */ -#define SEMAEM 16384 /* adjust on exit max value */ /* - * Configuration parameters. SEMMNI, SEMMNS, and SEMMNU are hard limits. - * The code dynamically allocates enough memory to satisfy the current - * demand in even increments of SEMMNI_INC, SEMMNS_INC, and SEMMNU_INC. - * The code will never allocate more than the hard limits. The *_INC's - * are defined in the kernel section of the header. - */ -/* - * Configuration parameters + * [XSI] The pid_t, time_t, key_t, and size_t types shall be defined as + * described in <sys/types.h>. + * + * NOTE: The definition of the key_t type is implicit from the + * inclusion of <sys/ipc.h> */ -#ifndef SEMMNS /* # of semaphores in system */ -#define SEMMNS (1048576/sizeof(struct sem)) -#endif /* no more than 1M of semaphore data */ -#ifndef SEMMNI /* # of semaphore identifiers */ -#define SEMMNI SEMMNS /* max of 1 for each semaphore */ -#endif -#ifndef SEMUME -#define SEMUME 10 /* max # of undo entries per process */ +#ifndef _PID_T +typedef __darwin_pid_t pid_t; +#define _PID_T #endif -#ifndef SEMMNU /* # of undo structures in system */ -#define SEMMNU SEMMNS /* 1 for each semaphore. This is quite large */ -#endif /* This should be max 1 for each process */ -/* shouldn't need tuning */ -#ifndef SEMMAP -#define SEMMAP 30 /* # of entries in semaphore map */ -#endif -#ifndef SEMMSL -#define SEMMSL SEMMNS /* max # of semaphores per id */ -#endif -#ifndef SEMOPM -#define SEMOPM 100 /* max # of operations per semop call */ +#ifndef _TIME_T +#define _TIME_T +typedef __darwin_time_t time_t; #endif +#ifndef _SIZE_T +#define _SIZE_T +typedef __darwin_size_t size_t; +#endif /* - * Undo structure (one per process) + * Technically, we should force all code references to the new structure + * definition, not in just the standards conformance case, and leave the + * legacy interface there for binary compatibility only. Currently, we + * are only forcing this for programs requesting standards conformance. 
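
For orientation, the XSI interfaces this header declares (semget(), semop(), semctl(), and union semun, all further down) combine as in this user-space sketch, which is illustrative only:

    #include <sys/sem.h>

    /* One binary semaphore used as a lock. */
    int sem_lock_demo(key_t key)
    {
            union semun arg;
            struct sembuf op;
            int id = semget(key, 1, IPC_CREAT | 0600);

            if (id < 0)
                    return -1;
            arg.val = 1;                    /* start unlocked */
            semctl(id, 0, SETVAL, arg);

            op.sem_num = 0;
            op.sem_op  = -1;                /* P(): acquire */
            op.sem_flg = SEM_UNDO;          /* release if we exit */
            return semop(id, &op, 1);
    }
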
*/ -struct sem_undo { - struct sem_undo *un_next; /* ptr to next active undo structure */ - struct proc *un_proc; /* owner of this structure */ - short un_cnt; /* # of active entries */ - struct undo { - short un_adjval; /* adjust on exit values */ - short un_num; /* semaphore # */ - int un_id; /* semid */ - } un_ent[SEMUME]; /* undo entries */ +#if defined(__POSIX_C_SOURCE) || defined(kernel) || defined(__LP64__) +/* + * Structure used internally. + * + * This structure is exposed because standards dictate that it is used as + * the semun union member 'buf' as the fourth argment to semctl() when the + * third argument is IPC_STAT or IPC_SET. + * + * Note: only the fields sem_perm, sem_nsems, sem_otime, and sem_ctime + * are meaningful in user space. + */ +struct __semid_ds_new { + struct __ipc_perm_new sem_perm; /* [XSI] operation permission struct */ + __int32_t sem_base; /* 32 bit base ptr for semaphore set */ + unsigned short sem_nsems; /* [XSI] number of sems in set */ + time_t sem_otime; /* [XSI] last operation time */ + __int32_t sem_pad1; /* RESERVED: DO NOT USE! */ + time_t sem_ctime; /* [XSI] last change time */ + /* Times measured in secs since */ + /* 00:00:00 GMT, Jan. 1, 1970 */ + __int32_t sem_pad2; /* RESERVED: DO NOT USE! */ + __int32_t sem_pad3[4]; /* RESERVED: DO NOT USE! */ +}; +#define semid_ds __semid_ds_new +#else /* !_POSIX_C_SOURCE */ +#define semid_ds __semid_ds_old +#endif /* !_POSIX_C_SOURCE */ + +#if !defined(__POSIX_C_SOURCE) && !defined(__LP64__) +struct __semid_ds_old { + struct __ipc_perm_old sem_perm; /* [XSI] operation permission struct */ + __int32_t sem_base; /* 32 bit base ptr for semaphore set */ + unsigned short sem_nsems; /* [XSI] number of sems in set */ + time_t sem_otime; /* [XSI] last operation time */ + __int32_t sem_pad1; /* RESERVED: DO NOT USE! */ + time_t sem_ctime; /* [XSI] last change time */ + /* Times measured in secs since */ + /* 00:00:00 GMT, Jan. 1, 1970 */ + __int32_t sem_pad2; /* RESERVED: DO NOT USE! */ + __int32_t sem_pad3[4]; /* RESERVED: DO NOT USE! 
*/ }; +#endif /* !_POSIX_C_SOURCE */ /* - * semaphore info struct + * Possible values for the third argument to semctl() */ -struct seminfo { - int semmap, /* # of entries in semaphore map */ - semmni, /* # of semaphore identifiers */ - semmns, /* # of semaphores in system */ - semmnu, /* # of undo structures in system */ - semmsl, /* max # of semaphores per id */ - semopm, /* max # of operations per semop call */ - semume, /* max # of undo entries per process */ - semusz, /* size in bytes of undo structure */ - semvmx, /* semaphore maximum value */ - semaem; /* adjust on exit max value */ -}; -extern struct seminfo seminfo; +#define GETNCNT 3 /* [XSI] Return the value of semncnt {READ} */ +#define GETPID 4 /* [XSI] Return the value of sempid {READ} */ +#define GETVAL 5 /* [XSI] Return the value of semval {READ} */ +#define GETALL 6 /* [XSI] Return semvals into arg.array {READ} */ +#define GETZCNT 7 /* [XSI] Return the value of semzcnt {READ} */ +#define SETVAL 8 /* [XSI] Set the value of semval to arg.val {ALTER} */ +#define SETALL 9 /* [XSI] Set semvals from arg.array {ALTER} */ -/* internal "mode" bits */ -#define SEM_ALLOC 01000 /* semaphore is allocated */ -#define SEM_DEST 02000 /* semaphore will be destroyed on last detach */ -#define SEMMNI_INC 8 /* increment value for semaphore identifiers */ -#define SEMMNS_INC 64 /* increment value for semaphores */ -#define SEMMNU_INC 32 /* increment value for undo structures */ +/* A semaphore; this is an anonymous structure, not for external use */ +struct sem { + unsigned short semval; /* semaphore value */ + pid_t sempid; /* pid of last operation */ + unsigned short semncnt; /* # awaiting semval > cval */ + unsigned short semzcnt; /* # awaiting semval == 0 */ +}; + /* - * Due to the way semaphore memory is allocated, we have to ensure that - * SEMUSZ is properly aligned. - * - * We are not doing strange semaphore memory allocation anymore, so - * these macros are no longer needed. + * Structure of array element for second argument to semop() */ +struct sembuf { + unsigned short sem_num; /* [XSI] semaphore # */ + short sem_op; /* [XSI] semaphore operation */ + short sem_flg; /* [XSI] operation flags */ +}; /* - * #define SEM_ALIGN(bytes) (((bytes) + (sizeof(long) - 1)) & ~(sizeof(long) - 1)) + * Possible flag values for sem_flg */ +#define SEM_UNDO 010000 /* [XSI] Set up adjust on exit entry */ + + +#ifndef _POSIX_C_SOURCE -/* actual size of an undo structure */ /* - * #define SEMUSZ SEM_ALIGN(offsetof(struct sem_undo, un_ent[SEMUME])) + * System imposed limit on the value of the third parameter to semop(). + * This is arbitrary, and the standards unfortunately do not provide a + * way for user applications to retrieve this value (e.g. via sysconf() + * or from a manifest value in <unistd.h>). The value shown here is + * informational, and subject to change in future revisions. */ -#define SEMUSZ sizeof(struct sem_undo) +#define MAX_SOPS 5 /* maximum # of sembuf's per semop call */ -extern struct semid_ds *sema; /* semaphore id pool */ -extern struct sem *sem; /* semaphore pool */ -/* This is now a struct sem_undo with the new memory allocation - * extern int *semu; /* undo structure pool - */ -extern struct sem_undo *semu; /* undo structure pool */ /* - * Macro to find a particular sem_undo vector - */ -/* Until we can initialize seminfo.semusz to SEMUSZ, we hard code the size macro - * in SEMU. This should be fixed when (if) we implement dynamic pool sizes + * Union used as the fourth argment to semctl() in all cases. 
Specific + * member values are used for different values of the third parameter: + * + * Command Member + * ------------------------------------------- ------ + * GETALL, SETALL array + * SETVAL val + * IPC_STAT, IPC_SET buf * - * #define SEMU(ix) ((struct sem_undo *)(((intptr_t)semu)+ix * seminfo.semusz)) - */ -/* - * This macro doesn't work because we are using a staticly allocated array - * for semu now. - * #define SEMU(ix) ((struct sem_undo *)(((intptr_t)semu)+ix * SEMUSZ)) + * The union definition is intended to be defined by the user application + * in conforming applications; it is provided here for two reasons: + * + * 1) Historical source compatability for non-conforming applications + * expecting this header to declare the union type on their behalf + * + * 2) Documentation; specifically, 64 bit applications that do not pass + * this structure for 'val', or, alternately, a 64 bit type, will + * not function correctly */ -#define SEMU(ix) (&semu[ix]) +union semun { + int val; /* value for SETVAL */ + struct semid_ds *buf; /* buffer for IPC_STAT & IPC_SET */ + unsigned short *array; /* array for GETALL & SETALL */ +}; +typedef union semun semun_t; /* - * Process sem_undo vectors at proc exit. + * Permissions */ -void semexit __P((struct proc *p)); +#define SEM_A 0200 /* alter permission */ +#define SEM_R 0400 /* read permission */ -/* - * Parameters to the semconfig system call - */ -typedef enum { - SEM_CONFIG_FREEZE, /* Freeze the semaphore facility. */ - SEM_CONFIG_THAW /* Thaw the semaphore facility. */ -} semconfig_ctl_t; +#endif /* !_POSIX_C_SOURCE */ -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ #ifndef KERNEL -#include <sys/cdefs.h> __BEGIN_DECLS -int semsys __P((int, ...)); -int semctl __P((int, int, int, ...)); -int semget __P((key_t, int, int)); -int semop __P((int, struct sembuf *,unsigned)); +#ifndef _POSIX_C_SOURCE +int semsys(int, ...); +#endif /* !_POSIX_C_SOURCE */ +int semctl(int, int, int, ...) __DARWIN_ALIAS(semctl); +int semget(key_t, int, int); +int semop(int, struct sembuf *, size_t); __END_DECLS + #endif /* !KERNEL */ #endif /* !_SEM_H_ */ diff --git a/bsd/sys/sem_internal.h b/bsd/sys/sem_internal.h new file mode 100644 index 000000000..ed17b0e9b --- /dev/null +++ b/bsd/sys/sem_internal.h @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* $NetBSD: sem.h,v 1.5 1994/06/29 06:45:15 cgd Exp $ */ + +/* + * SVID compatible sem_internal.h file + * + * Author: Daniel Boulet + */ +/* + * John Bellardo modified the implementation for Darwin. 
12/2000 + */ + +#ifndef _SYS_SEM__INTERNALH_ +#define _SYS_SEM__INTERNALH_ + +#include <sys/sem.h> +#include <sys/cdefs.h> + + +/* + * This structure is variant for 64 bits because of sem_otime and sem_ctime. + */ + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_semid_ds { + struct ipc_perm sem_perm; /* [XSI] operation permission struct */ + struct sem *sem_base; /* 32 bit base ptr for semaphore set */ + unsigned short sem_nsems; /* [XSI] number of sems in set */ + user_time_t sem_otime; /* [XSI] last operation time */ + __int32_t sem_pad1; /* RESERVED: DO NOT USE! */ + user_time_t sem_ctime; /* [XSI] last change time */ + /* Times measured in secs since */ + /* 00:00:00 GMT, Jan. 1, 1970 */ + __int32_t sem_pad2; /* RESERVED: DO NOT USE! */ + __int32_t sem_pad3[4]; /* RESERVED: DO NOT USE! */ +}; + +union user_semun { + user_addr_t buf; /* buffer for IPC_STAT & IPC_SET */ + user_addr_t array; /* array for GETALL & SETALL */ +}; +typedef union user_semun user_semun_t; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + + +/* + * Kernel implementation stuff + */ +#define SEMVMX 32767 /* semaphore maximum value */ +#define SEMAEM 16384 /* adjust on exit max value */ + +/* + * Configuration parameters. SEMMNI, SEMMNS, and SEMMNU are hard limits. + * The code dynamically allocates enough memory to satisfy the current + * demand in even increments of SEMMNI_INC, SEMMNS_INC, and SEMMNU_INC. + * The code will never allocate more than the hard limits. The *_INC's + * are defined in the kernel section of the header. + */ +/* + * Configuration parameters + */ +#ifndef SEMMNS /* # of semaphores in system */ +#define SEMMNS (1048576/sizeof(struct sem)) +#endif /* no more than 1M of semaphore data */ +#ifndef SEMMNI /* # of semaphore identifiers */ +#define SEMMNI SEMMNS /* max of 1 for each semaphore */ +#endif +#ifndef SEMUME +#define SEMUME 10 /* max # of undo entries per process */ +#endif +#ifndef SEMMNU /* # of undo structures in system */ +#define SEMMNU SEMMNS /* 1 for each semaphore. 
This is quite large */ +#endif /* This should be max 1 for each process */ + +/* shouldn't need tuning */ +#ifndef SEMMAP +#define SEMMAP 30 /* # of entries in semaphore map */ +#endif +#ifndef SEMMSL +#define SEMMSL SEMMNS /* max # of semaphores per id */ +#endif +#ifndef SEMOPM +#define SEMOPM 100 /* max # of operations per semop call */ +#endif + + +/* + * Undo structure (internal: one per process) + */ +struct sem_undo { + struct sem_undo *un_next; /* ptr to next active undo structure */ + struct proc *un_proc; /* owner of this structure */ + short un_cnt; /* # of active entries */ + struct undo { + short une_adjval; /* adjust on exit values */ + short une_num; /* semaphore # */ + int une_id; /* semid */ + struct undo *une_next; /* next undo entry */ + } *un_ent; /* undo entries */ +}; + +/* + * semaphore info struct (internal; for administrative limits and ipcs) + */ +struct seminfo { + int semmap, /* # of entries in semaphore map */ + semmni, /* # of semaphore identifiers */ + semmns, /* # of semaphores in system */ + semmnu, /* # of undo structures in system */ + semmsl, /* max # of semaphores per id */ + semopm, /* max # of operations per semop call */ + semume, /* max # of undo entries per process */ + semusz, /* size in bytes of undo structure */ + semvmx, /* semaphore maximum value */ + semaem; /* adjust on exit max value */ +}; +extern struct seminfo seminfo; + +/* internal "mode" bits */ +#define SEM_ALLOC 01000 /* semaphore is allocated */ +#define SEM_DEST 02000 /* semaphore will be destroyed on last detach */ + +#define SEMMNI_INC 8 /* increment value for semaphore identifiers */ +#define SEMMNS_INC 64 /* increment value for semaphores */ +#define SEMMNU_INC 32 /* increment value for undo structures */ + +/* + * Due to the way semaphore memory is allocated, we have to ensure that + * SEMUSZ is properly aligned. + * + * We are not doing strange semaphore memory allocation anymore, so + * these macros are no longer needed. + */ + +/* + * #define SEM_ALIGN(bytes) (((bytes) + (sizeof(long) - 1)) & ~(sizeof(long) - 1)) + */ + +/* actual size of an undo structure */ +/* + * #define SEMUSZ SEM_ALIGN(offsetof(struct sem_undo, un_ent[SEMUME])) + */ +#define SEMUSZ sizeof(struct sem_undo) + +extern struct user_semid_ds *sema; /* semaphore id pool */ +extern struct sem *sem_pool; /* semaphore pool */ /* This is now a struct sem_undo with the new memory allocation + * extern int *semu; // undo structure pool + */ +extern struct sem_undo *semu; /* undo structure pool */ + +/* + * Macro to find a particular sem_undo vector + */ +/* Until we can initialize seminfo.semusz to SEMUSZ, we hard code the size macro + * in SEMU. This should be fixed when (if) we implement dynamic pool sizes + * + * #define SEMU(ix) ((struct sem_undo *)(((intptr_t)semu)+ix * seminfo.semusz)) + */ +/* + * This macro doesn't work because we are using a statically allocated array + * for semu now. + * #define SEMU(ix) ((struct sem_undo *)(((intptr_t)semu)+ix * SEMUSZ)) + */ +#define SEMU(ix) (&semu[ix]) + + +/* + * Process sem_undo vectors at proc exit. + */ +void semexit(struct proc *p); + +/* + * Parameters to the semconfig system call + */ +typedef enum { + SEM_CONFIG_FREEZE, /* Freeze the semaphore facility. */ + SEM_CONFIG_THAW /* Thaw the semaphore facility.
diff --git a/bsd/sys/semaphore.h b/bsd/sys/semaphore.h
index 7a5ea7ed4..10ba6378b 100644
--- a/bsd/sys/semaphore.h
+++ b/bsd/sys/semaphore.h
@@ -54,6 +54,8 @@ int sem_unlink(const char *);
 int sem_wait(sem_t *);
 __END_DECLS
 
-#endif	/* KERNEL */
+#else	/* KERNEL */
+void psem_cache_init(void);
+#endif	/* KERNEL */
 
 #endif	/* _SYS_SEMAPHORE_H_ */
diff --git a/bsd/sys/shm.h b/bsd/sys/shm.h
index f86d8aae8..dc3fc2b56 100644
--- a/bsd/sys/shm.h
+++ b/bsd/sys/shm.h
@@ -59,64 +59,124 @@
 #ifndef _SYS_SHM_H_
 #define _SYS_SHM_H_
 
-#include <sys/appleapiopts.h>
-#include <sys/param.h>
+#include <sys/cdefs.h>
+#include <sys/_types.h>
+
+/*
+ * [XSI] All of the symbols from <sys/ipc.h> SHALL be defined
+ * when this header is included
+ */
 #include <sys/ipc.h>
 
-#define SHM_RDONLY	010000	/* Attach read-only (else read-write) */
-#define SHM_RND		020000	/* Round attach address to SHMLBA */
-#define SHMLBA		NBPG	/* Segment low boundary address multiple */
+/*
+ * [XSI] The pid_t, time_t, key_t, and size_t types shall be defined as
+ * described in <sys/types.h>.
+ *
+ * NOTE:	The definition of the key_t type is implicit from the
+ *		inclusion of <sys/ipc.h>
+ */
+#ifndef _PID_T
+typedef __darwin_pid_t	pid_t;
+#define _PID_T
+#endif
+
+#ifndef	_TIME_T
+#define	_TIME_T
+typedef	__darwin_time_t	time_t;
+#endif
+
+#ifndef _SIZE_T
+#define _SIZE_T
+typedef __darwin_size_t	size_t;
+#endif
+
+/*
+ * [XSI] The unsigned integer type used for the number of current attaches
+ * that MUST be able to store values at least as large as a type unsigned
+ * short.
+ */
+typedef unsigned short	shmatt_t;
+
+
+/*
+ * Possible flag values which may be OR'ed into the third argument to
+ * shmat()
+ */
+#define SHM_RDONLY	010000	/* [XSI] Attach read-only (else read-write) */
+#define SHM_RND		020000	/* [XSI] Round attach address to SHMLBA */
+
+/*
+ * This value is symbolic, and generally not expected to be used by user
+ * programs directly, although such use is permitted by the standard.  Its
+ * value in our implementation is equal to the number of bytes per page.
+ *
+ * NOTE:	We DO NOT obtain this value from the appropriate system
+ *		headers at this time, to avoid the resulting namespace
+ *		pollution, which is why we discourage its use.
+ */
+#define SHMLBA		4096	/* [XSI] Segment low boundary address multiple*/
 
 /* "official" access mode definitions; somewhat braindead since you have
    to specify (SHM_* >> 3) for group and (SHM_* >> 6) for world permissions */
 #define SHM_R	(IPC_R)
 #define SHM_W	(IPC_W)
 
-struct shmid_ds {
-	struct ipc_perm	shm_perm;	/* operation permission structure */
-	int		shm_segsz;	/* size of segment in bytes */
-	pid_t		shm_lpid;	/* process ID of last shared memory op */
-	pid_t		shm_cpid;	/* process ID of creator */
-	short		shm_nattch;	/* number of current attaches */
-	time_t		shm_atime;	/* time of last shmat() */
-	time_t		shm_dtime;	/* time of last shmdt() */
-	time_t		shm_ctime;	/* time of last change by shmctl() */
-	void		*shm_internal;	/* sysv stupidity */
-};
-
-#ifdef KERNEL
-#ifdef __APPLE_API_PRIVATE
 /*
- * System 5 style catch-all structure for shared memory constants that
- * might be of interest to user programs.  Do we really want/need this?
+ * Technically, we should force all code references to the new structure
+ * definition, not in just the standards conformance case, and leave the
+ * legacy interface there for binary compatibility only.  Currently, we
+ * are only forcing this for programs requesting standards conformance.
 */
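The [XSI] flags and limits above feed the shmget(2)/shmat(2) family whose prototypes appear later in this header. A short sketch of typical use (illustrative only, not part of this patch):

    #include <sys/types.h>
    #include <sys/ipc.h>
    #include <sys/shm.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        /* One SHMLBA-sized segment; SHMLBA is also the granularity to
         * which SHM_RND rounds an explicit attach address. */
        int id = shmget(IPC_PRIVATE, SHMLBA, IPC_CREAT | 0600);
        if (id < 0) { perror("shmget"); return 1; }

        /* NULL lets the kernel choose the address; pass SHM_RDONLY in
         * the third argument for a read-only mapping. */
        void *p = shmat(id, NULL, 0);
        if (p == (void *)-1) { perror("shmat"); return 1; }

        memset(p, 0, SHMLBA);            /* read-write by default */

        (void)shmdt(p);
        (void)shmctl(id, IPC_RMID, NULL);
        return 0;
    }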
-struct shminfo {
-	int	shmmax,		/* max shared memory segment size (bytes) */
-		shmmin,		/* min shared memory segment size (bytes) */
-		shmmni,		/* max number of shared memory identifiers */
-		shmseg,		/* max shared memory segments per process */
-		shmall;		/* max amount of shared memory (pages) */
+#if defined(_POSIX_C_SOURCE) || defined(KERNEL) || defined(__LP64__)
+/*
+ * Structure used internally.
+ *
+ * This structure is exposed because standards dictate that it is used as
+ * the third argument to shmctl().
+ *
+ * NOTE:	The field shm_internal is not meaningful in user space,
+ *		and must not be used there.
+ */
+struct __shmid_ds_new {
+	struct __ipc_perm_new shm_perm;	/* [XSI] Operation permission value */
+	size_t		shm_segsz;	/* [XSI] Size of segment in bytes */
+	pid_t		shm_lpid;	/* [XSI] PID of last shared memory op */
+	pid_t		shm_cpid;	/* [XSI] PID of creator */
+	short		shm_nattch;	/* [XSI] Number of current attaches */
+	time_t		shm_atime;	/* [XSI] Time of last shmat() */
+	time_t		shm_dtime;	/* [XSI] Time of last shmdt() */
+	time_t		shm_ctime;	/* [XSI] Time of last shmctl() change */
+	void		*shm_internal;	/* reserved for kernel use */
 };
-extern struct shminfo	shminfo;
-extern struct shmid_ds	*shmsegs;
+#define	shmid_ds	__shmid_ds_new
+#else	/* !_POSIX_C_SOURCE */
+#define	shmid_ds	__shmid_ds_old
+#endif	/* !_POSIX_C_SOURCE */
 
-struct proc;
-
-void	shmexit __P((struct proc *));
-void	shmfork __P((struct proc *, struct proc *));
-__private_extern__ void shmexec __P((struct proc *));
-#endif /* __APPLE_API_PRIVATE */
-#else /* !KERNEL */
+#if !defined(_POSIX_C_SOURCE) && !defined(__LP64__)
+struct __shmid_ds_old {
+	struct __ipc_perm_old shm_perm;	/* [XSI] Operation permission value */
+	size_t		shm_segsz;	/* [XSI] Size of segment in bytes */
+	pid_t		shm_lpid;	/* [XSI] PID of last shared memory op */
+	pid_t		shm_cpid;	/* [XSI] PID of creator */
+	short		shm_nattch;	/* [XSI] Number of current attaches */
+	time_t		shm_atime;	/* [XSI] Time of last shmat() */
+	time_t		shm_dtime;	/* [XSI] Time of last shmdt() */
+	time_t		shm_ctime;	/* [XSI] Time of last shmctl() change */
+	void		*shm_internal;	/* reserved for kernel use */
+};
+#endif	/* !_POSIX_C_SOURCE */
 
-#include <sys/cdefs.h>
+#ifndef KERNEL
 
 __BEGIN_DECLS
-int	shmsys __P((int, ...));
-void	*shmat __P((int, void *, int));
-int	shmget __P((key_t, int, int));
-int	shmctl __P((int, int, struct shmid_ds *));
-int	shmdt __P((void *));
+#ifndef _POSIX_C_SOURCE
+int	shmsys(int, ...);
+#endif /* !_POSIX_C_SOURCE */
+void	*shmat(int, const void *, int);
+int	shmctl(int, int, struct shmid_ds *) __DARWIN_ALIAS(shmctl);
+int	shmdt(const void *);
+int	shmget(key_t, size_t, int);
 __END_DECLS
 
 #endif /* !KERNEL */
diff --git a/bsd/sys/shm_internal.h b/bsd/sys/shm_internal.h
new file mode 100644
index 000000000..e0bd76189
--- /dev/null
+++ b/bsd/sys/shm_internal.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License").  You may not use this file except in compliance with the
+ * License.  Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* $NetBSD: shm.h,v 1.15 1994/06/29 06:45:17 cgd Exp $ */ + +/* + * Copyright (c) 1994 Adam Glass + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Adam Glass. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * As defined+described in "X/Open System Interfaces and Headers" + * Issue 4, p. XXX + */ + +#ifndef _SYS_SHM_INTERNALH_ +#define _SYS_SHM_INTERNALH_ + +#include <sys/shm.h> +#include <sys/cdefs.h> + +#include <machine/types.h> + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_shmid_ds { + struct ipc_perm shm_perm; /* operation permission structure */ + user_size_t shm_segsz; /* size of segment in bytes */ + pid_t shm_lpid; /* PID of last shared memory op */ + pid_t shm_cpid; /* PID of creator */ + short shm_nattch; /* number of current attaches */ + time_t shm_atime; /* time of last shmat() */ + time_t shm_dtime; /* time of last shmdt() */ + time_t shm_ctime; /* time of last change by shmctl() */ + user_addr_t shm_internal; /* reserved for kernel use */ +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +/* + * System 5 style catch-all structure for shared memory constants that + * might be of interest to user programs. Also part of the ipcs interface. + * Note: use of user_ssize_t intentional: permits 32 bit ipcs to provide + * information about 64 bit programs shared segments. 
+ */ +struct shminfo { + user_ssize_t shmmax; /* max shm segment size (bytes) */ + user_ssize_t shmmin; /* min shm segment size (bytes) */ + user_ssize_t shmmni; /* max number of shm identifiers */ + user_ssize_t shmseg; /* max shm segments per process */ + user_ssize_t shmall; /* max amount of shm (pages) */ +}; + +#ifdef KERNEL +extern struct shminfo shminfo; +extern struct user_shmid_ds *shmsegs; + +struct proc; + +__BEGIN_DECLS + +void shmexit(struct proc *); +int shmfork(struct proc *, struct proc *); +__private_extern__ void shmexec(struct proc *); + +__END_DECLS + +#endif /* kernel */ + +#endif /* !_SYS_SHM_INTERNALH_ */ diff --git a/bsd/sys/signal.h b/bsd/sys/signal.h index b074ab459..0fa8fb1cb 100644 --- a/bsd/sys/signal.h +++ b/bsd/sys/signal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -63,9 +63,10 @@ #ifndef _SYS_SIGNAL_H_ #define _SYS_SIGNAL_H_ +#include <sys/cdefs.h> #include <sys/appleapiopts.h> -#if !defined(_ANSI_SOURCE) && !defined(_POSIX_SOURCE) +#if !defined(_ANSI_SOURCE) && !defined(_POSIX_C_SOURCE) #define NSIG 32 /* counting 0; could be 33 (mask is 1-32) */ #endif @@ -75,77 +76,122 @@ #define SIGINT 2 /* interrupt */ #define SIGQUIT 3 /* quit */ #define SIGILL 4 /* illegal instruction (not reset when caught) */ -#if !defined(_POSIX_SOURCE) #define SIGTRAP 5 /* trace trap (not reset when caught) */ -#endif #define SIGABRT 6 /* abort() */ -#if !defined(_POSIX_SOURCE) +#if defined(_POSIX_C_SOURCE) +#define SIGPOLL 7 /* pollable event ([XSR] generated, not supported) */ +#else /* !_POSIX_C_SOURCE */ #define SIGIOT SIGABRT /* compatibility */ #define SIGEMT 7 /* EMT instruction */ -#endif +#endif /* !_POSIX_C_SOURCE */ #define SIGFPE 8 /* floating point exception */ #define SIGKILL 9 /* kill (cannot be caught or ignored) */ -#if !defined(_POSIX_SOURCE) #define SIGBUS 10 /* bus error */ -#endif #define SIGSEGV 11 /* segmentation violation */ -#if !defined(_POSIX_SOURCE) #define SIGSYS 12 /* bad argument to system call */ -#endif #define SIGPIPE 13 /* write on a pipe with no one to read it */ #define SIGALRM 14 /* alarm clock */ #define SIGTERM 15 /* software termination signal from kill */ -#if !defined(_POSIX_SOURCE) #define SIGURG 16 /* urgent condition on IO channel */ -#endif #define SIGSTOP 17 /* sendable stop signal not from tty */ #define SIGTSTP 18 /* stop signal from tty */ #define SIGCONT 19 /* continue a stopped process */ #define SIGCHLD 20 /* to parent on child stop or exit */ #define SIGTTIN 21 /* to readers pgrp upon background tty read */ #define SIGTTOU 22 /* like TTIN for output if (tp->t_local<OSTOP) */ -#if !defined(_POSIX_SOURCE) +#if !defined(_POSIX_C_SOURCE) #define SIGIO 23 /* input/output possible signal */ +#endif #define SIGXCPU 24 /* exceeded CPU time limit */ #define SIGXFSZ 25 /* exceeded file size limit */ #define SIGVTALRM 26 /* virtual time alarm */ #define SIGPROF 27 /* profiling time alarm */ +#if !defined(_POSIX_C_SOURCE) #define SIGWINCH 28 /* window size changes */ #define SIGINFO 29 /* information request */ #endif #define SIGUSR1 30 /* user defined signal 1 */ #define SIGUSR2 31 /* user defined signal 2 */ -#if defined(_ANSI_SOURCE) || defined(__cplusplus) +#if defined(_ANSI_SOURCE) || defined(_POSIX_C_SOURCE) || defined(__cplusplus) /* * Language spec sez we must list exactly one parameter, even though we * actually supply three. Ugh! 
+ * SIG_HOLD is chosen to avoid KERN_SIG_* values in <sys/signalvar.h> */ #define SIG_DFL (void (*)(int))0 #define SIG_IGN (void (*)(int))1 -#define SIG_ERR (void (*)(int))-1 +#define SIG_HOLD (void (*)(int))5 +#define SIG_ERR ((void (*)(int))-1) #else -#define SIG_DFL (void (*)())0 -#define SIG_IGN (void (*)())1 -#define SIG_ERR (void (*)())-1 +/* DO NOT REMOVE THE COMMENTED OUT int: fixincludes needs to see them */ +#define SIG_DFL (void (*)(/*int*/))0 +#define SIG_IGN (void (*)(/*int*/))1 +#define SIG_HOLD (void (*)(/*int*/))5 +#define SIG_ERR ((void (*)(/*int*/))-1) #endif #ifndef _ANSI_SOURCE -#include <sys/types.h> +#include <sys/_types.h> + +#ifndef _MCONTEXT_T +#define _MCONTEXT_T +typedef __darwin_mcontext_t mcontext_t; +#endif + +#ifndef _POSIX_C_SOURCE +#ifndef _MCONTEXT64_T +#define _MCONTEXT64_T +typedef __darwin_mcontext64_t mcontext64_t; +#endif +#endif /* _POSIX_C_SOURCE */ + +#ifndef _PID_T +#define _PID_T +typedef __darwin_pid_t pid_t; +#endif + +#ifndef _PTHREAD_ATTR_T +#define _PTHREAD_ATTR_T +typedef __darwin_pthread_attr_t pthread_attr_t; +#endif + +#ifndef _SIGSET_T +#define _SIGSET_T +typedef __darwin_sigset_t sigset_t; +#endif + +#ifndef _SIZE_T +#define _SIZE_T +typedef __darwin_size_t size_t; +#endif + +#ifndef _UCONTEXT_T +#define _UCONTEXT_T +typedef __darwin_ucontext_t ucontext_t; +#endif -typedef unsigned int sigset_t; +#ifndef _POSIX_C_SOURCE +#ifndef _UCONTEXT64_T +#define _UCONTEXT64_T +typedef __darwin_ucontext64_t ucontext64_t; +#endif +#endif /* _POSIX_C_SOURCE */ + +#ifndef _UID_T +#define _UID_T +typedef __darwin_uid_t uid_t; +#endif union sigval { /* Members as suggested by Annex C of POSIX 1003.1b. */ - int sigval_int; - void *sigval_ptr; + int sival_int; + void *sival_ptr; }; -#define SIGEV_NONE 0 /* No async notification */ +#define SIGEV_NONE 0 /* No async notification */ #define SIGEV_SIGNAL 1 /* aio - completion notification */ -#ifdef __APPLE_API_PRIVATE #define SIGEV_THREAD 3 /* A notification function will be called to perform notification */ -#endif /*__APPLE_API_PRIVATE */ struct sigevent { int sigev_notify; /* Notification type */ @@ -155,19 +201,73 @@ struct sigevent { pthread_attr_t *sigev_notify_attributes; /* Notification attributes */ }; +// LP64todo - should this move? 
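The sival_int/sival_ptr member names above follow the POSIX spelling; a hedged sketch of how user code fills a struct sigevent with them (struct request is hypothetical, purely for illustration):

    #include <signal.h>
    #include <string.h>

    struct request { int slot; };   /* hypothetical per-request context */

    /* Request SIGUSR1 delivery carrying a pointer back to req; a
     * SA_SIGINFO handler would recover it as info->si_value.sival_ptr. */
    static void setup_notify(struct sigevent *ev, struct request *req)
    {
        memset(ev, 0, sizeof(*ev));
        ev->sigev_notify = SIGEV_SIGNAL;
        ev->sigev_signo  = SIGUSR1;
        ev->sigev_value.sival_ptr = req;
    }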
+#ifdef BSD_KERNEL_PRIVATE
+
+#if __DARWIN_ALIGN_NATURAL
+#pragma options align=natural
+#endif
+
+union user_sigval {
+	struct {
+		int	pad;		/* assumes Motorola byte order */
+		int	sival_int;
+	} size_equivalent;
+	user_addr_t	sival_ptr;
+};
+
+struct user_sigevent {
+	int	sigev_notify;			/* Notification type */
+	int	sigev_signo;			/* Signal number */
+	union user_sigval sigev_value;		/* Signal value */
+	user_addr_t	sigev_notify_function;	 /* Notify function */
+	user_addr_t	sigev_notify_attributes; /* Notify attributes */
+};
+
+#if __DARWIN_ALIGN_NATURAL
+#pragma options align=reset
+#endif
+
+#endif /* BSD_KERNEL_PRIVATE */
+
 typedef struct __siginfo {
 	int	si_signo;		/* signal number */
 	int	si_errno;		/* errno association */
 	int	si_code;		/* signal code */
-	int	si_pid;			/* sending process */
-	unsigned int	si_uid;		/* sender's ruid */
+	pid_t	si_pid;			/* sending process */
+	uid_t	si_uid;			/* sender's ruid */
 	int	si_status;		/* exit value */
 	void	*si_addr;		/* faulting instruction */
 	union sigval si_value;		/* signal value */
 	long	si_band;		/* band event for SIGPOLL */
-	unsigned int	pad[7];		/* Reserved for Future Use */
+	unsigned long	pad[7];		/* Reserved for Future Use */
 } siginfo_t;
 
+#ifdef BSD_KERNEL_PRIVATE
+
+#if __DARWIN_ALIGN_NATURAL
+#pragma options align=natural
+#endif
+
+typedef struct __user_siginfo {
+	int	si_signo;		/* signal number */
+	int	si_errno;		/* errno association */
+	int	si_code;		/* signal code */
+	pid_t	si_pid;			/* sending process */
+	uid_t	si_uid;			/* sender's ruid */
+	int	si_status;		/* exit value */
+	user_addr_t	si_addr;	/* faulting instruction */
+	union user_sigval si_value;	/* signal value */
+	user_long_t	si_band;	/* band event for SIGPOLL */
+	user_ulong_t	pad[7];		/* Reserved for Future Use */
+} user_siginfo_t;
+
+#if __DARWIN_ALIGN_NATURAL
+#pragma options align=reset
+#endif
+
+#endif /* BSD_KERNEL_PRIVATE */
+
 /*
 * In case of SIGILL and SIGFPE, si_addr contains the address of
 * faulting instruction.
@@ -181,38 +281,68 @@ typedef struct __siginfo {
 
 /* Values for si_code */
 
 /* Codes for SIGILL */
+#ifndef _POSIX_C_SOURCE
 #define	ILL_NOOP	0	/* if only I knew... */
-#define	ILL_ILLOPC	1	/* illegal opcode */
-#define	ILL_ILLTRP	2	/* illegal trap */
-#define	ILL_PRVOPC	3	/* privileged opcode */
+#endif
+#define	ILL_ILLOPC	1	/* [XSI] illegal opcode */
+#define	ILL_ILLTRP	2	/* [XSI] illegal trap */
+#define	ILL_PRVOPC	3	/* [XSI] privileged opcode */
+#define	ILL_ILLOPN	4	/* [XSI] illegal operand -NOTIMP */
+#define	ILL_ILLADR	5	/* [XSI] illegal addressing mode -NOTIMP */
+#define	ILL_PRVREG	6	/* [XSI] privileged register -NOTIMP */
+#define	ILL_COPROC	7	/* [XSI] coprocessor error -NOTIMP */
+#define	ILL_BADSTK	8	/* [XSI] internal stack error -NOTIMP */
 
 /* Codes for SIGFPE */
+#ifndef _POSIX_C_SOURCE
 #define	FPE_NOOP	0	/* if only I knew...
*/ -#define FPE_FLTDIV 1 /* floating point divide by zero */ -#define FPE_FLTOVF 2 /* floating point overflow */ -#define FPE_FLTUND 3 /* floating point underflow */ -#define FPE_FLTRES 4 /* floating point inexact result */ -#define FPE_FLTINV 5 /* invalid floating point operation */ +#endif +#define FPE_FLTDIV 1 /* [XSI] floating point divide by zero */ +#define FPE_FLTOVF 2 /* [XSI] floating point overflow */ +#define FPE_FLTUND 3 /* [XSI] floating point underflow */ +#define FPE_FLTRES 4 /* [XSI] floating point inexact result */ +#define FPE_FLTINV 5 /* [XSI] invalid floating point operation */ +#define FPE_FLTSUB 6 /* [XSI] subscript out of range -NOTIMP */ +#define FPE_INTDIV 7 /* [XSI] integer divide by zero -NOTIMP */ +#define FPE_INTOVF 8 /* [XSI] integer overflow -NOTIMP */ /* Codes for SIGSEGV */ +#ifndef _POSIX_C_SOURCE #define SEGV_NOOP 0 /* if only I knew... */ -#define SEGV_MAPERR 1 /* address not mapped to object */ -#define SEGV_ACCERR 2 /* invalid permissions for mapped to object */ +#endif +#define SEGV_MAPERR 1 /* [XSI] address not mapped to object */ +#define SEGV_ACCERR 2 /* [XSI] invalid permission for mapped object */ /* Codes for SIGBUS */ +#ifndef _POSIX_C_SOURCE #define BUS_NOOP 0 /* if only I knew... */ -#define BUS_ADRALN 1 /* invalid address alignment */ +#endif +#define BUS_ADRALN 1 /* [XSI] Invalid address alignment */ +#define BUS_ADRERR 2 /* [XSI] Nonexistent physical address -NOTIMP */ +#define BUS_OBJERR 3 /* [XSI] Object-specific HW error - NOTIMP */ + +/* Codes for SIGTRAP */ +#define TRAP_BRKPT 1 /* [XSI] Process breakpoint -NOTIMP */ +#define TRAP_TRACE 2 /* [XSI] Process trace trap -NOTIMP */ /* Codes for SIGCHLD */ +#ifndef _POSIX_C_SOURCE #define CLD_NOOP 0 /* if only I knew... */ -#define CLD_EXITED 1 /* child has exited */ -#define CLD_KILLED 2 - /* child has terminated abnormally and did not create a core file */ -#define CLD_DUMPED 3 - /* child has terminated abnormally and create a core file */ -#define CLD_TRAPPED 4 /* traced child has trapped */ -#define CLD_STOPPED 5 /* child has stopped */ -#define CLD_CONTINUED 6 /* stopped child has continued */ +#endif +#define CLD_EXITED 1 /* [XSI] child has exited */ +#define CLD_KILLED 2 /* [XSI] terminated abnormally, no core file */ +#define CLD_DUMPED 3 /* [XSI] terminated abnormally, core file */ +#define CLD_TRAPPED 4 /* [XSI] traced child has trapped */ +#define CLD_STOPPED 5 /* [XSI] child has stopped */ +#define CLD_CONTINUED 6 /* [XSI] stopped child has continued */ + +/* Codes for SIGPOLL */ +#define POLL_IN 1 /* [XSR] Data input available */ +#define POLL_OUT 2 /* [XSR] Output buffers available */ +#define POLL_MSG 3 /* [XSR] Input message available */ +#define POLL_ERR 4 /* [XSR] I/O error */ +#define POLL_PRI 5 /* [XSR] High priority input available */ +#define POLL_HUP 6 /* [XSR] Device disconnected */ /* union for signal handlers */ union __sigaction_u { @@ -237,24 +367,63 @@ struct sigaction { sigset_t sa_mask; /* signal mask to apply */ int sa_flags; /* see signal options below */ }; + +#ifdef BSD_KERNEL_PRIVATE +#include <machine/types.h> + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +union __user_sigaction_u { + user_addr_t __sa_handler; + user_addr_t __sa_sigaction; +}; + +struct user_sigaction { + union __user_sigaction_u __sigaction_u; /* signal handler */ + sigset_t sa_mask; /* signal mask to apply */ + int sa_flags; /* see signal options below */ +}; + +struct __user_sigaction { + union __user_sigaction_u __sigaction_u; /* signal handler */ + 
+	user_addr_t	sa_tramp;	/* signal trampoline */
+	sigset_t sa_mask;		/* signal mask to apply */
+	int	sa_flags;		/* see signal options below */
+};
+
+#if __DARWIN_ALIGN_NATURAL
+#pragma options align=reset
+#endif
+
+#undef SIG_DFL
+#undef SIG_IGN
+#undef SIG_ERR
+#define SIG_DFL	((user_addr_t)0LL)
+#define SIG_IGN	((user_addr_t)1LL)
+#define SIG_ERR	((user_addr_t)-1LL)
+
+#endif	/* BSD_KERNEL_PRIVATE */
+
+
 /* if SA_SIGINFO is set, sa_sigaction is to be used instead of sa_handler. */
 #define	sa_handler	__sigaction_u.__sa_handler
 #define	sa_sigaction	__sigaction_u.__sa_sigaction
-
-#if !defined(_POSIX_SOURCE)
 #define SA_ONSTACK	0x0001	/* take signal on signal stack */
 #define SA_RESTART	0x0002	/* restart system on signal return */
 #define	SA_DISABLE	0x0004	/* disable taking signals on alternate stack */
 #define	SA_RESETHAND	0x0004	/* reset to SIG_DFL when taking signal */
+#define	SA_NOCLDSTOP	0x0008	/* do not generate SIGCHLD on child stop */
 #define SA_NODEFER	0x0010	/* don't mask the signal we're delivering */
 #define SA_NOCLDWAIT	0x0020	/* don't keep zombies around */
 #define	SA_SIGINFO	0x0040	/* signal handler with SA_SIGINFO args */
+#ifndef _POSIX_C_SOURCE
 #define	SA_USERTRAMP	0x0100	/* do not bounce off kernel's sigtramp */
 /* This will provide 64bit register set in a 32bit user address space */
 #define	SA_64REGSET	0x0200	/* signal handler with SA_SIGINFO args with 64bit regs information */
-#endif
-#define SA_NOCLDSTOP	0x0008	/* do not generate SIGCHLD on child stop */
+#endif	/* !_POSIX_C_SOURCE */
 
 /*
 * Flags for sigprocmask:
@@ -264,32 +433,48 @@ struct sigaction {
 #define	SIG_SETMASK	3	/* set specified signal set */
 
 /* POSIX 1003.1b required values. */
-#define SI_USER		0x10001
-#define SI_QUEUE	0x10002
-#define SI_TIMER	0x10003
-#define SI_ASYNCIO	0x10004
-#define SI_MESGQ	0x10005
-
-#if !defined(_POSIX_SOURCE)
-#include <sys/cdefs.h>
-typedef void (*sig_t) __P((int));	/* type of signal function */
+#define SI_USER		0x10001	/* [CX] signal from kill() */
+#define SI_QUEUE	0x10002	/* [CX] signal from sigqueue() */
+#define SI_TIMER	0x10003	/* [CX] timer expiration */
+#define SI_ASYNCIO	0x10004	/* [CX] aio request completion */
+#define SI_MESGQ	0x10005	/* [CX] from message arrival on empty queue */
+
+#ifndef _POSIX_C_SOURCE
+typedef	void (*sig_t)(int);	/* type of signal function */
+#endif
 
 /*
 * Structure used in sigaltstack call.
*/
-struct	sigaltstack {
-	char	*ss_sp;		/* signal stack base */
-	int	ss_size;	/* signal stack length */
-	int	ss_flags;	/* SA_DISABLE and/or SA_ONSTACK */
+#ifdef BSD_KERNEL_PRIVATE
+
+#if __DARWIN_ALIGN_NATURAL
+#pragma options align=natural
+#endif
+
+struct user_sigaltstack {
+	user_addr_t	ss_sp;		/* signal stack base */
+	user_size_t	ss_size;	/* signal stack length */
+	int		ss_flags;	/* SA_DISABLE and/or SA_ONSTACK */
 };
-typedef struct sigaltstack stack_t;
 
+#if __DARWIN_ALIGN_NATURAL
+#pragma options align=reset
+#endif
+
+#endif	/* BSD_KERNEL_PRIVATE */
+
+#ifndef _STACK_T
+#define _STACK_T
+typedef __darwin_stack_t stack_t;
+#endif
 
 #define SS_ONSTACK	0x0001	/* take signal on signal stack */
 #define	SS_DISABLE	0x0004	/* disable taking signals on alternate stack */
 #define	MINSIGSTKSZ	32768	/* (32K)minimum allowable stack */
 #define	SIGSTKSZ	131072	/* (128K)recommended stack size */
 
+#ifndef _POSIX_C_SOURCE
 /*
 * 4.3 compatibility:
 * Signal vector "template" used in sigvec call.
@@ -308,6 +493,7 @@ struct sigvec {
 #define SV_SIGINFO	SA_SIGINFO
 
 #define sv_onstack	sv_flags	/* isn't compatibility wonderful!
 */
+#endif	/* !_POSIX_C_SOURCE */
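SA_SIGINFO, SA_ONSTACK, the alternate-stack types, and MINSIGSTKSZ/SIGSTKSZ above combine as in the following user-space sketch (illustrative only; printf(3) is not async-signal-safe and is used in the handler purely for brevity):

    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    static void on_segv(int sig, siginfo_t *info, void *ctx)
    {
        (void)ctx;
        printf("signal %d, fault address %p\n", sig, info->si_addr);
        _exit(1);
    }

    int main(void)
    {
        stack_t ss;
        struct sigaction sa;

        /* Dedicated handler stack, so SA_ONSTACK delivery still works
         * when the main stack is exhausted. */
        ss.ss_sp = malloc(SIGSTKSZ);
        ss.ss_size = SIGSTKSZ;
        ss.ss_flags = 0;
        if (ss.ss_sp == NULL || sigaltstack(&ss, NULL) < 0)
            return 1;

        memset(&sa, 0, sizeof(sa));
        sigemptyset(&sa.sa_mask);
        sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
        sa.sa_sigaction = on_segv;
        if (sigaction(SIGSEGV, &sa, NULL) < 0) { perror("sigaction"); return 1; }

        raise(SIGSEGV);             /* handler reports si_addr, then exits */
        return 0;
    }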
 /*
 * Structure used in sigstack call.
@@ -317,14 +503,14 @@ struct sigstack {
 	int	ss_onstack;	/* current status */
 };
 
+#ifndef _POSIX_C_SOURCE
 /*
 * Macro for converting signal number to a mask suitable for
 * sigblock().
*/
 #define sigmask(m)	(1 << ((m)-1))
 
-#ifdef KERNEL
-#ifdef __APPLE_API_PRIVATE
+#ifdef BSD_KERNEL_PRIVATE
 /*
 *	signals delivered on a per-thread basis.
*/
@@ -333,12 +519,11 @@ struct sigstack {
 			sigmask(SIGFPE)|sigmask(SIGBUS)|\
 			sigmask(SIGSEGV)|sigmask(SIGSYS)|\
 			sigmask(SIGPIPE))
-#endif /* __APPLE_API_PRIVATE */
-#endif /* KERNEL */
+#endif /* BSD_KERNEL_PRIVATE */
 
 #define	BADSIG		SIG_ERR
 
-#endif	/* !_POSIX_SOURCE */
+#endif	/* !_POSIX_C_SOURCE */
 #endif	/* !_ANSI_SOURCE */
 
 /*
@@ -346,6 +531,6 @@ struct sigstack {
 * defined by <sys/signal.h>.
*/
 __BEGIN_DECLS
-void	(*signal __P((int, void (*) __P((int))))) __P((int));
+void	(*signal(int, void (*)(int)))(int);
 __END_DECLS
 
 #endif	/* !_SYS_SIGNAL_H_ */
diff --git a/bsd/sys/signalvar.h b/bsd/sys/signalvar.h
index 1e6e46ed8..cd0f01880 100644
--- a/bsd/sys/signalvar.h
+++ b/bsd/sys/signalvar.h
@@ -60,7 +60,7 @@
 
 #include <sys/appleapiopts.h>
 
-#ifdef __APPLE_API_PRIVATE
+#ifdef BSD_KERNEL_PRIVATE
 /*
 * Kernel signal definitions and data structures,
 * not exported to user programs.
@@ -71,8 +71,8 @@
 * (not necessarily resident).
*/
 struct	sigacts {
-	sig_t	ps_sigact[NSIG];	/* disposition of signals */
-	sig_t	ps_trampact[NSIG];	/* disposition of signals */
+	user_addr_t	ps_sigact[NSIG];	/* disposition of signals */
+	user_addr_t	ps_trampact[NSIG];	/* signal trampolines */
 	sigset_t ps_catchmask[NSIG];	/* signals to be blocked */
 	sigset_t ps_sigonstack;		/* signals to take on sigstack */
 	sigset_t ps_sigintr;		/* signals that interrupt syscalls */
@@ -81,7 +81,7 @@ struct sigacts {
 	sigset_t ps_siginfo;		/* signals that want SA_SIGINFO args */
 	sigset_t ps_oldmask;		/* saved mask from before sigpause */
 	int	ps_flags;		/* signal flags, below */
-	struct sigaltstack ps_sigstk;	/* sp & on stack state variable */
+	struct user_sigaltstack ps_sigstk; /* sp, length & flags */
 	int	ps_sig;			/* for core dump/debugger XXX */
 	int	ps_code;		/* for core dump/debugger XXX */
 	int	ps_addr;		/* for core dump/debugger XXX */
@@ -93,10 +93,14 @@ struct sigacts {
 #define	SAS_OLDMASK	0x01	/* need to restore mask before pause */
 #define	SAS_ALTSTACK	0x02	/* have alternate signal stack */
 
-/* additional signal action values, used only temporarily/internally */
-#define	SIG_CATCH	(void (*)())2
-#define	SIG_HOLD	(void (*)())3
-#define	SIG_WAIT	(void (*)())4
+/*
+ * Additional signal action values, used only temporarily/internally; these
+ * values should be non-intersecting with values defined in signal.h, e.g.:
+ * SIG_IGN, SIG_DFL, SIG_ERR, SIG_HOLD.
+ */ +#define KERN_SIG_CATCH (void (*)(int))2 +#define KERN_SIG_HOLD (void (*)(int))3 +#define KERN_SIG_WAIT (void (*)(int))4 #define pgsigio(pgid, sig, notused) \ { \ @@ -187,37 +191,47 @@ int sigprop[NSIG + 1] = { #define sigcantmask (sigmask(SIGKILL) | sigmask(SIGSTOP)) -#ifdef KERNEL /* * Machine-independent functions: */ -int coredump __P((struct proc *p)); -void execsigs __P((struct proc *p, thread_act_t thr_act)); -void gsignal __P((int pgid, int sig)); -int issignal __P((struct proc *p)); -int CURSIG __P((struct proc *p)); -int clear_procsiglist __P((struct proc *p, int bit)); -int clear_procsigmask __P((struct proc *p, int bit)); -int set_procsigmask __P((struct proc *p, int bit)); -void tty_pgsignal __P((struct pgrp *pgrp, int sig)); -void postsig __P((int sig)); -void siginit __P((struct proc *p)); -void trapsignal __P((struct proc *p, int sig, unsigned code)); -void pt_setrunnable __P((struct proc *p)); +int signal_lock(struct proc *); +int signal_unlock(struct proc *); +int coredump(struct proc *p); +void execsigs(struct proc *p, thread_t thread); +void gsignal(int pgid, int sig); +int issignal(struct proc *p); +int CURSIG(struct proc *p); +int clear_procsiglist(struct proc *p, int bit); +int clear_procsigmask(struct proc *p, int bit); +int set_procsigmask(struct proc *p, int bit); +void tty_pgsignal(struct pgrp *pgrp, int sig); +void postsig(int sig); +void siginit(struct proc *p); +void trapsignal(struct proc *p, int sig, unsigned code); +void pt_setrunnable(struct proc *p); /* * Machine-dependent functions: */ -void sendsig __P((struct proc *, sig_t action, int sig, - int returnmask, u_long code)); - -#ifdef __APPLE_API_UNSTABLE -void psignal __P((struct proc *p, int sig)); -void pgsignal __P((struct pgrp *pgrp, int sig, int checkctty)); -#endif /* __APPLE_API_UNSTABLE */ - -#endif /* KERNEL */ - -#endif /* __APPLE_API_PRIVATE */ +void sendsig(struct proc *, /*sig_t*/ user_addr_t action, int sig, + int returnmask, u_long code); + +void psignal(struct proc *p, int sig); +void pgsignal(struct pgrp *pgrp, int sig, int checkctty); +void threadsignal(thread_t sig_actthread, int signum, u_long code); +int thread_issignal(proc_t p, thread_t th, sigset_t mask); +void psignal_vfork(struct proc *p, task_t new_task, thread_t thr_act, + int signum); +void psignal_vtalarm(struct proc *); +void psignal_xcpu(struct proc *); +void psignal_sigprof(struct proc *); +void psignal_lock(struct proc *, int, int); +void signal_setast(thread_t sig_actthread); + +/* XXX not really very "inline"... */ +__inline__ void sig_lock_to_exit(struct proc *p); +__inline__ int sig_try_locked(struct proc *p); + +#endif /* BSD_KERNEL_PRIVATE */ #endif /* !_SYS_SIGNALVAR_H_ */ diff --git a/bsd/sys/socket.h b/bsd/sys/socket.h index 957cabb8d..65567cc12 100644 --- a/bsd/sys/socket.h +++ b/bsd/sys/socket.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -60,9 +60,9 @@ #ifndef _SYS_SOCKET_H_ #define _SYS_SOCKET_H_ -#ifndef __APPLE__ -#include <machine/ansi.h> -#endif +#include <sys/_types.h> +#include <sys/cdefs.h> + #define _NO_NAMESPACE_POLLUTION #include <machine/param.h> #undef _NO_NAMESPACE_POLLUTION @@ -74,10 +74,52 @@ /* * Data types. 
*/ -typedef u_char sa_family_t; -#ifdef _BSD_SOCKLEN_T_ -typedef _BSD_SOCKLEN_T_ socklen_t; -#undef _BSD_SOCKLEN_T_ +#ifndef _GID_T +typedef __darwin_gid_t gid_t; +#define _GID_T +#endif + +#ifndef _OFF_T +typedef __darwin_off_t off_t; +#define _OFF_T +#endif + +#ifndef _PID_T +typedef __darwin_pid_t pid_t; +#define _PID_T +#endif + +#ifndef _SA_FAMILY_T +#define _SA_FAMILY_T +typedef __uint8_t sa_family_t; +#endif + +#ifndef _SOCKLEN_T +#define _SOCKLEN_T +typedef __darwin_socklen_t socklen_t; +#endif + +/* XXX Not explicitly defined by POSIX, but function return types are */ +#ifndef _SIZE_T +#define _SIZE_T +typedef __darwin_size_t size_t; +#endif + +/* XXX Not explicitly defined by POSIX, but function return types are */ +#ifndef _SSIZE_T +#define _SSIZE_T +typedef __darwin_ssize_t ssize_t; +#endif + +/* + * [XSI] The iovec structure shall be defined as described in <sys/uio.h>. + */ +#ifndef _STRUCT_IOVEC +#define _STRUCT_IOVEC +struct iovec { + void * iov_base; /* [XSI] Base address of I/O memory region */ + size_t iov_len; /* [XSI] Size of region iov_base points to */ +}; #endif /* @@ -86,7 +128,9 @@ typedef _BSD_SOCKLEN_T_ socklen_t; #define SOCK_STREAM 1 /* stream socket */ #define SOCK_DGRAM 2 /* datagram socket */ #define SOCK_RAW 3 /* raw-protocol interface */ +#ifndef _POSIX_C_SOURCE #define SOCK_RDM 4 /* reliably-delivered message */ +#endif /* !_POSIX_C_SOURCE */ #define SOCK_SEQPACKET 5 /* sequenced packet stream */ /* @@ -98,9 +142,14 @@ typedef _BSD_SOCKLEN_T_ socklen_t; #define SO_KEEPALIVE 0x0008 /* keep connections alive */ #define SO_DONTROUTE 0x0010 /* just use interface addresses */ #define SO_BROADCAST 0x0020 /* permit sending of broadcast msgs */ +#ifndef _POSIX_C_SOURCE #define SO_USELOOPBACK 0x0040 /* bypass hardware when possible */ -#define SO_LINGER 0x0080 /* linger on close if data present */ +#define SO_LINGER 0x0080 /* linger on close if data present (in ticks) */ +#else +#define SO_LINGER 0x1080 /* linger on close if data present (in seconds) */ +#endif /* !_POSIX_C_SOURCE */ #define SO_OOBINLINE 0x0100 /* leave received OOB data in line */ +#ifndef _POSIX_C_SOURCE #define SO_REUSEPORT 0x0200 /* allow local address & port reuse */ #define SO_TIMESTAMP 0x0400 /* timestamp received dgram traffic */ #ifndef __APPLE__ @@ -111,6 +160,7 @@ typedef _BSD_SOCKLEN_T_ socklen_t; #define SO_WANTMORE 0x4000 /* APPLE: Give hint when more data ready */ #define SO_WANTOOBFLAG 0x8000 /* APPLE: Want OOB in MSG_FLAG on receive */ #endif +#endif /* !_POSIX_C_SOURCE */ /* * Additional options, not kept in so_options. @@ -123,13 +173,18 @@ typedef _BSD_SOCKLEN_T_ socklen_t; #define SO_RCVTIMEO 0x1006 /* receive timeout */ #define SO_ERROR 0x1007 /* get error status and clear */ #define SO_TYPE 0x1008 /* get socket type */ +#ifndef _POSIX_C_SOURCE /*efine SO_PRIVSTATE 0x1009 get/deny privileged state */ #ifdef __APPLE__ #define SO_NREAD 0x1020 /* APPLE: get 1st-packet byte count */ #define SO_NKE 0x1021 /* APPLE: Install socket-level NKE */ #define SO_NOSIGPIPE 0x1022 /* APPLE: No SIGPIPE on EPIPE */ #define SO_NOADDRERR 0x1023 /* APPLE: Returns EADDRNOTAVAIL when src is not available anymore */ +#define SO_NWRITE 0x1024 /* APPLE: Get number of bytes currently in send socket buffer */ +#define SO_LINGER_SEC 0x1080 /* linger on close if data present (in seconds) */ #endif +#endif /* !_POSIX_C_SOURCE */ + /* * Structure used for manipulating linger option. */ @@ -154,9 +209,12 @@ struct accept_filter_arg { * Address families. 
*/ #define AF_UNSPEC 0 /* unspecified */ -#define AF_LOCAL 1 /* local to host (pipes) */ -#define AF_UNIX AF_LOCAL /* backward compatibility */ +#define AF_UNIX 1 /* local to host (pipes) */ +#ifndef _POSIX_C_SOURCE +#define AF_LOCAL AF_UNIX /* backward compatibility */ +#endif /* !_POSIX_C_SOURCE */ #define AF_INET 2 /* internetwork: UDP, TCP, etc. */ +#ifndef _POSIX_C_SOURCE #define AF_IMPLINK 3 /* arpanet imp addresses */ #define AF_PUP 4 /* pup protocols: e.g. BSP */ #define AF_CHAOS 5 /* mit CHAOS protocols */ @@ -188,7 +246,9 @@ struct accept_filter_arg { #define AF_ISDN 28 /* Integrated Services Digital Network*/ #define AF_E164 AF_ISDN /* CCITT E.164 recommendation */ #define pseudo_AF_KEY 29 /* Internal key-management function */ +#endif /* !_POSIX_C_SOURCE */ #define AF_INET6 30 /* IPv6 */ +#ifndef _POSIX_C_SOURCE #define AF_NATM 31 /* native ATM access */ #ifdef __APPLE__ #define AF_SYSTEM 32 /* Kernel event messages */ @@ -200,20 +260,28 @@ struct accept_filter_arg { #define pseudo_AF_HDRCMPLT 35 /* Used by BPF to not rewrite headers * in interface output routine */ +#ifdef PRIVATE +#define AF_AFP 36 /* Used by AFP */ +#else +#define AF_RESERVED_36 36 /* Reserved for internal usage */ +#endif + #ifndef __APPLE__ #define AF_NETGRAPH 32 /* Netgraph sockets */ #endif -#define AF_MAX 36 +#define AF_MAX 37 +#endif /* !_POSIX_C_SOURCE */ /* - * Structure used by kernel to store most - * addresses. + * [XSI] Structure used by kernel to store most addresses. */ struct sockaddr { - u_char sa_len; /* total length */ - u_char sa_family; /* address family */ - char sa_data[14]; /* actually longer; address value */ + __uint8_t sa_len; /* total length */ + sa_family_t sa_family; /* [XSI] address family */ + char sa_data[14]; /* [XSI] addr value (actually larger) */ }; + +#ifndef _POSIX_C_SOURCE #define SOCK_MAXADDRLEN 255 /* longest possible addresses */ /* @@ -221,24 +289,30 @@ struct sockaddr { * information in raw sockets. 
 */
 struct sockproto {
-	u_short	sp_family;		/* address family */
-	u_short	sp_protocol;		/* protocol */
+	__uint16_t	sp_family;	/* address family */
+	__uint16_t	sp_protocol;	/* protocol */
 };
+#endif /* !_POSIX_C_SOURCE */
 
 /*
 * RFC 2553: protocol-independent placeholder for socket addresses
*/
 #define	_SS_MAXSIZE	128
-#define	_SS_ALIGNSIZE	(sizeof(int64_t))
-#define	_SS_PAD1SIZE	(_SS_ALIGNSIZE - sizeof(u_char) - sizeof(sa_family_t))
-#define	_SS_PAD2SIZE	(_SS_MAXSIZE - sizeof(u_char) - sizeof(sa_family_t) - \
+#define	_SS_ALIGNSIZE	(sizeof(__int64_t))
+#define	_SS_PAD1SIZE	\
+		(_SS_ALIGNSIZE - sizeof(__uint8_t) - sizeof(sa_family_t))
+#define	_SS_PAD2SIZE	\
+		(_SS_MAXSIZE - sizeof(__uint8_t) - sizeof(sa_family_t) - \
 				_SS_PAD1SIZE - _SS_ALIGNSIZE)
 
+/*
+ * [XSI] sockaddr_storage
+ */
 struct sockaddr_storage {
-	u_char	ss_len;		/* address length */
-	sa_family_t	ss_family;	/* address family */
+	__uint8_t	ss_len;		/* address length */
+	sa_family_t	ss_family;	/* [XSI] address family */
 	char			__ss_pad1[_SS_PAD1SIZE];
-	int64_t	__ss_align;	/* force desired structure storage alignment */
+	__int64_t	__ss_align;	/* force structure storage alignment */
 	char			__ss_pad2[_SS_PAD2SIZE];
 };
 
@@ -284,6 +358,12 @@ struct sockaddr_storage {
 #define	PF_SYSTEM	AF_SYSTEM
 #define PF_NETBIOS	AF_NETBIOS
 #define PF_PPP		AF_PPP
+#ifdef PRIVATE
+#define PF_AFP		AF_AFP
+#else
+#define PF_RESERVED_36	AF_RESERVED_36
+#endif
+
 #else
 #define	PF_ATM		AF_ATM
 #define	PF_NETGRAPH	AF_NETGRAPH
@@ -291,6 +371,12 @@ struct sockaddr_storage {
 
 #define	PF_MAX		AF_MAX
 
+/*
+ * These do not have socket-layer support:
+ */
+#define	PF_VLAN		((uint32_t)0x766c616e)	/* 'vlan' */
+#define PF_BOND		((uint32_t)0x626f6e64)	/* 'bond' */
+
 /*
 * Definitions for network related sysctl, CTL_NET.
*
@@ -301,6 +387,8 @@ struct sockaddr_storage {
*/
 #define NET_MAXID	AF_MAX
 
+
+#ifdef KERNEL_PRIVATE
 #define CTL_NET_NAMES { \
	{ 0, 0 }, \
	{ "local", CTLTYPE_NODE }, \
@@ -339,6 +427,7 @@ struct sockaddr_storage {
	{ "ppp", CTLTYPE_NODE }, \
	{ "hdrcomplete", CTLTYPE_NODE }, \
 }
+#endif /* KERNEL_PRIVATE */
 
 /*
 * PF_ROUTE - Routing table
@@ -348,37 +437,74 @@ struct sockaddr_storage {
 * Fifth: type of info, defined below
 * Sixth: flag(s) to mask with for NET_RT_FLAGS
*/
-#define NET_RT_DUMP	1	/* dump; may limit to a.f. */
-#define NET_RT_FLAGS	2	/* by flags, e.g. RESOLVING */
-#define NET_RT_IFLIST	3	/* survey interface list */
-#define	NET_RT_MAXID	4
-
+#define NET_RT_DUMP	1	/* dump; may limit to a.f. */
+#define NET_RT_FLAGS	2	/* by flags, e.g. RESOLVING */
+#define NET_RT_IFLIST	3	/* survey interface list */
+#define NET_RT_STAT	4	/* routing statistics */
+#define NET_RT_TRASH	5	/* routes not in table but not freed */
+#define NET_RT_IFLIST2	6	/* interface list with addresses */
+#define NET_RT_DUMP2	7	/* dump; may limit to a.f. */
+#define	NET_RT_MAXID	8
+
+#ifdef KERNEL_PRIVATE
 #define CTL_NET_RT_NAMES { \
	{ 0, 0 }, \
	{ "dump", CTLTYPE_STRUCT }, \
	{ "flags", CTLTYPE_STRUCT }, \
	{ "iflist", CTLTYPE_STRUCT }, \
+	{ "stat", CTLTYPE_STRUCT }, \
+	{ "trash", CTLTYPE_INT }, \
+	{ "iflist2", CTLTYPE_STRUCT }, \
+	{ "dump2", CTLTYPE_STRUCT }, \
 }
+#endif /* KERNEL_PRIVATE */
+
 /*
 * Maximum queue length specifiable by listen.
*/
 #define	SOMAXCONN	128
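The struct msghdr defined just below, the iovec declared earlier in this header, and sockaddr_storage come together in recvmsg(2); a minimal sketch (illustrative only, not part of this patch):

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        int s = socket(AF_INET, SOCK_DGRAM, 0);
        if (s < 0) { perror("socket"); return 1; }

        struct sockaddr_in sin;
        memset(&sin, 0, sizeof(sin));
        sin.sin_family = AF_INET;
        sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
        sin.sin_port = 0;                        /* any free port */
        if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
            perror("bind"); return 1;
        }

        /* Scatter one datagram into two buffers; sockaddr_storage can
         * hold whatever address family the kernel hands back. */
        char hdr[8], body[128];
        struct iovec iov[2] = { { hdr, sizeof(hdr) }, { body, sizeof(body) } };
        struct sockaddr_storage from;
        struct msghdr msg;
        memset(&msg, 0, sizeof(msg));
        msg.msg_name    = &from;
        msg.msg_namelen = sizeof(from);
        msg.msg_iov     = iov;
        msg.msg_iovlen  = 2;

        ssize_t n = recvmsg(s, &msg, MSG_DONTWAIT);  /* nonblocking poll */
        if (n < 0)
            perror("recvmsg");                       /* EAGAIN: queue empty */

        close(s);
        return 0;
    }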
 
 /*
- * Message header for recvmsg and sendmsg calls.
+ * [XSI] Message header for recvmsg and sendmsg calls.
 * Used value-result for recvmsg, value only for sendmsg.
*/
 struct msghdr {
-	caddr_t	msg_name;		/* optional address */
+	void		*msg_name;	/* [XSI] optional address */
+	socklen_t	msg_namelen;	/* [XSI] size of address */
+	struct		iovec *msg_iov;	/* [XSI] scatter/gather array */
+	int		msg_iovlen;	/* [XSI] # elements in msg_iov */
+	void		*msg_control;	/* [XSI] ancillary data, see below */
+	socklen_t	msg_controllen;	/* [XSI] ancillary data buffer len */
+	int		msg_flags;	/* [XSI] flags on received message */
+};
+
+// LP64todo - should this move?
+#ifdef KERNEL
+/* LP64 version of struct msghdr.  all pointers
+ * grow when we're dealing with a 64-bit process.
+ * WARNING - keep in sync with struct msghdr
+ */
+#if __DARWIN_ALIGN_NATURAL
+#pragma options align=natural
+#endif
+
+struct user_msghdr {
+	user_addr_t	msg_name;	/* optional address */
 	socklen_t	msg_namelen;	/* size of address */
-	struct	iovec *msg_iov;		/* scatter/gather array */
-	u_int	msg_iovlen;		/* # elements in msg_iov */
-	caddr_t	msg_control;		/* ancillary data, see below */
+	user_addr_t	msg_iov;	/* scatter/gather array */
+	int		msg_iovlen;	/* # elements in msg_iov */
+	user_addr_t	msg_control;	/* ancillary data, see below */
 	socklen_t	msg_controllen;	/* ancillary data buffer len */
 	int		msg_flags;	/* flags on received message */
 };
 
+#if __DARWIN_ALIGN_NATURAL
+#pragma options align=reset
+#endif
+
+#endif // KERNEL
+
 #define	MSG_OOB		0x1		/* process out-of-band data */
 #define	MSG_PEEK	0x2		/* peek at incoming message */
 #define	MSG_DONTROUTE	0x4		/* send without using routing tables */
@@ -386,6 +512,7 @@ struct msghdr {
 #define	MSG_TRUNC	0x10		/* data discarded before delivery */
 #define	MSG_CTRUNC	0x20		/* control data lost before delivery */
 #define	MSG_WAITALL	0x40		/* wait for full request or error */
+#ifndef _POSIX_C_SOURCE
 #define	MSG_DONTWAIT	0x80		/* this message should be nonblocking */
 #define	MSG_EOF		0x100		/* data completes connection */
 #ifdef __APPLE__
@@ -398,6 +525,13 @@ struct msghdr {
 #endif
 #define MSG_COMPAT	0x8000		/* used in sendit() */
 #define	MSG_NEEDSA	0x10000		/* Fail receive if socket address cannot be allocated */
+#ifdef KERNEL_PRIVATE
+#define MSG_NBIO	0x20000		/* FIONBIO mode, used by fifofs */
+#endif
+#ifdef KERNEL
+#define MSG_USEUPCALL	0x80000000	/* Inherit upcall in sock_accept */
+#endif
+#endif	/* !_POSIX_C_SOURCE */
 
 /*
 * Header for ancillary data objects in msg_control buffer.
@@ -406,12 +540,13 @@ struct msghdr {
* of message elements headed by cmsghdr structures.
*/ struct cmsghdr { - socklen_t cmsg_len; /* data byte count, including hdr */ - int cmsg_level; /* originating protocol */ - int cmsg_type; /* protocol-specific type */ -/* followed by u_char cmsg_data[]; */ + socklen_t cmsg_len; /* [XSI] data byte count, including hdr */ + int cmsg_level; /* [XSI] originating protocol */ + int cmsg_type; /* [XSI] protocol-specific type */ +/* followed by unsigned char cmsg_data[]; */ }; +#ifndef _POSIX_C_SOURCE #ifndef __APPLE__ /* * While we may have more groups than this, the cmsgcred struct must @@ -436,32 +571,35 @@ struct cmsgcred { gid_t cmcred_groups[CMGROUP_MAX]; /* groups */ }; #endif +#endif /* !_POSIX_C_SOURCE */ /* given pointer to struct cmsghdr, return pointer to data */ -#define CMSG_DATA(cmsg) ((u_char *)(cmsg) + \ +#define CMSG_DATA(cmsg) ((unsigned char *)(cmsg) + \ ALIGN(sizeof(struct cmsghdr))) /* given pointer to struct cmsghdr, return pointer to next cmsghdr */ #define CMSG_NXTHDR(mhdr, cmsg) \ - (((caddr_t)(cmsg) + ALIGN((cmsg)->cmsg_len) + \ + (((unsigned char *)(cmsg) + ALIGN((cmsg)->cmsg_len) + \ ALIGN(sizeof(struct cmsghdr)) > \ - (caddr_t)(mhdr)->msg_control + (mhdr)->msg_controllen) ? \ - (struct cmsghdr *)NULL : \ - (struct cmsghdr *)((caddr_t)(cmsg) + ALIGN((cmsg)->cmsg_len))) + (unsigned char *)(mhdr)->msg_control + (mhdr)->msg_controllen) ? \ + (struct cmsghdr *)0 /* NULL */ : \ + (struct cmsghdr *)((unsigned char *)(cmsg) + ALIGN((cmsg)->cmsg_len))) #define CMSG_FIRSTHDR(mhdr) ((struct cmsghdr *)(mhdr)->msg_control) +#ifndef _POSIX_C_SOURCE /* RFC 2292 additions */ - #define CMSG_SPACE(l) (ALIGN(sizeof(struct cmsghdr)) + ALIGN(l)) #define CMSG_LEN(l) (ALIGN(sizeof(struct cmsghdr)) + (l)) #ifdef KERNEL #define CMSG_ALIGN(n) ALIGN(n) #endif +#endif /* !_POSIX_C_SOURCE */ /* "Socket"-level control message types: */ #define SCM_RIGHTS 0x01 /* access rights (array of int) */ +#ifndef _POSIX_C_SOURCE #define SCM_TIMESTAMP 0x02 /* timestamp (struct timeval) */ #define SCM_CREDS 0x03 /* process creds (struct cmsgcred) */ @@ -469,21 +607,23 @@ struct cmsgcred { * 4.3 compat sockaddr, move to compat file later */ struct osockaddr { - u_short sa_family; /* address family */ + __uint16_t sa_family; /* address family */ char sa_data[14]; /* up to 14 bytes of direct address */ }; /* * 4.3-compat message header (move to compat file later). */ + // LP64todo - fix this. should msg_iov be * iovec_64? struct omsghdr { - caddr_t msg_name; /* optional address */ - int msg_namelen; /* size of address */ - struct iovec *msg_iov; /* scatter/gather array */ - int msg_iovlen; /* # elements in msg_iov */ - caddr_t msg_accrights; /* access rights sent/received */ - int msg_accrightslen; + void *msg_name; /* optional address */ + socklen_t msg_namelen; /* size of address */ + struct iovec *msg_iov; /* scatter/gather array */ + int msg_iovlen; /* # elements in msg_iov */ + void *msg_accrights; /* access rights sent/rcvd */ + int msg_accrightslen; }; +#endif /* !_POSIX_C_SOURCE */ /* * howto arguments for shutdown(2), specified by Posix.1g. 
@@ -492,6 +632,7 @@ struct omsghdr { #define SHUT_WR 1 /* shut down the writing side */ #define SHUT_RDWR 2 /* shut down both sides */ +#ifndef _POSIX_C_SOURCE #if SENDFILE /* * sendfile(2) header/trailer struct @@ -503,37 +644,49 @@ struct sf_hdtr { int trl_cnt; /* number of trailer iovec's */ }; #endif +#endif /* !_POSIX_C_SOURCE */ #ifndef KERNEL - -#include <sys/cdefs.h> - __BEGIN_DECLS -int accept __P((int, struct sockaddr *, socklen_t *)); -int bind __P((int, const struct sockaddr *, socklen_t)); -int connect __P((int, const struct sockaddr *, socklen_t)); -int getpeername __P((int, struct sockaddr *, socklen_t *)); -int getsockname __P((int, struct sockaddr *, socklen_t *)); -int getsockopt __P((int, int, int, void *, int *)); -int listen __P((int, int)); -ssize_t recv __P((int, void *, size_t, int)); -ssize_t recvfrom __P((int, void *, size_t, int, struct sockaddr *, socklen_t *)); -ssize_t recvmsg __P((int, struct msghdr *, int)); -ssize_t send __P((int, const void *, size_t, int)); -ssize_t sendto __P((int, const void *, - size_t, int, const struct sockaddr *, socklen_t)); -ssize_t sendmsg __P((int, const struct msghdr *, int)); +int accept(int, struct sockaddr * __restrict, socklen_t * __restrict) + __DARWIN_ALIAS(accept); +int bind(int, const struct sockaddr *, socklen_t) __DARWIN_ALIAS(bind); +int connect(int, const struct sockaddr *, socklen_t) __DARWIN_ALIAS( connect); +int getpeername(int, struct sockaddr * __restrict, socklen_t * __restrict) + __DARWIN_ALIAS(getpeername); +int getsockname(int, struct sockaddr * __restrict, socklen_t * __restrict) + __DARWIN_ALIAS(getsockname); +int getsockopt(int, int, int, void * __restrict, socklen_t * __restrict); +int listen(int, int) __DARWIN_ALIAS(listen); +ssize_t recv(int, void *, size_t, int) __DARWIN_ALIAS(recv); +ssize_t recvfrom(int, void *, size_t, int, struct sockaddr * __restrict, + socklen_t * __restrict) __DARWIN_ALIAS(recvfrom); +ssize_t recvmsg(int, struct msghdr *, int) __DARWIN_ALIAS(recvmsg); +ssize_t send(int, const void *, size_t, int) __DARWIN_ALIAS(send); +ssize_t sendmsg(int, const struct msghdr *, int) __DARWIN_ALIAS(sendmsg); +ssize_t sendto(int, const void *, size_t, + int, const struct sockaddr *, socklen_t) __DARWIN_ALIAS(sendto); +int setsockopt(int, int, int, const void *, socklen_t); +int shutdown(int, int); +int socket(int, int, int); +int socketpair(int, int, int, int *) __DARWIN_ALIAS(socketpair); +/* + * NOTIMP: + * int sockatmark(int s); + */ + +#ifndef _POSIX_C_SOURCE #if SENDFILE -int sendfile __P((int, int, off_t, size_t, struct sf_hdtr *, off_t *, int)); +int sendfile(int, int, off_t, size_t, struct sf_hdtr *, off_t *, int); #endif -int setsockopt __P((int, int, int, const void *, socklen_t)); -int shutdown __P((int, int)); -int socket __P((int, int, int)); -int socketpair __P((int, int, int, int *)); - -void pfctlinput __P((int, struct sockaddr *)); +void pfctlinput(int, struct sockaddr *); +#endif /* !_POSIX_C_SOURCE */ __END_DECLS #endif /* !KERNEL */ +#ifdef KERNEL +#include <sys/kpi_socket.h> +#endif + #endif /* !_SYS_SOCKET_H_ */ diff --git a/bsd/sys/socketvar.h b/bsd/sys/socketvar.h index b3d700a9f..f069bf4ac 100644 --- a/bsd/sys/socketvar.h +++ b/bsd/sys/socketvar.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
 *
 * @APPLE_LICENSE_HEADER_START@
 *
@@ -65,17 +65,19 @@
 #include <sys/select.h>			/* for struct selinfo */
 #include <net/kext_net.h>
 #include <sys/ev.h>
+#include <sys/cdefs.h>
+
 /*
 * Hacks to get around compiler complaints
*/
 struct mbuf;
-struct kextcb;
+struct socket_filter_entry;
 struct protosw;
 struct sockif;
 struct sockutil;
 
+#ifdef KERNEL_PRIVATE
 /* strings for sleep message: */
-#ifdef __APPLE_API_UNSTABLE
 extern	char netio[], netcon[], netcls[];
 #define SOCKET_CACHE_ON
 #define SO_CACHE_FLUSH_INTERVAL 1	/* Seconds */
@@ -90,8 +92,11 @@ extern char netio[], netcon[], netcls[];
 * handle on protocol and pointer to protocol
 * private data and error information.
*/
+#endif /* KERNEL_PRIVATE */
+
 typedef	u_quad_t so_gen_t;
 
+#ifdef KERNEL_PRIVATE
 #ifndef __APPLE__
 /* We don't support BSD style socket filters */
 struct accept_filter;
@@ -147,9 +152,9 @@ struct socket {
 #endif
		struct	selinfo sb_sel;	/* process selecting read/write */
		short	sb_flags;	/* flags, see below */
-		short	sb_timeo;	/* timeout for read/write */
-		void *reserved1;	/* for future use if needed */
-		void *reserved2;
+		struct timeval sb_timeo;	/* timeout for read/write */
+		void	*reserved1;	/* for future use if needed */
+		void	*reserved2;
	} so_rcv, so_snd;
 #define	SB_MAX		(256*1024)	/* default for max chars in sockbuf */
 #define	SB_LOCK		0x01		/* lock on data queue */
@@ -169,7 +174,7 @@ struct socket {
	caddr_t	so_tpcb;	/* Wisc. protocol control block - XXX unused? */
 #endif
 
-	void	(*so_upcall) __P((struct socket *so, caddr_t arg, int waitf));
+	void	(*so_upcall)(struct socket *so, caddr_t arg, int waitf);
	caddr_t	so_upcallarg;		/* Arg for above */
	uid_t	so_uid;			/* who opened the socket */
	/* NB: generation count must not be first; easiest to make it last. */
@@ -191,16 +196,20 @@ struct socket {
	struct	mbuf *so_temp;		/* Holding area for outbound frags */
	/* Plug-in support - make the socket interface overridable */
	struct	mbuf *so_tail;
-	struct	kextcb *so_ext;		/* NKE hook */
+	struct socket_filter_entry *so_filt;	/* NKE hook */
	u_long	so_flags;		/* Flags */
-#define	SOF_NOSIGPIPE	0x00000001
-#define	SOF_NOADDRAVAIL	0x00000002	/* returns EADDRNOTAVAIL if src address is gone */
-	void	*reserved2;
-	void	*reserved3;
-	void	*reserved4;
+#define	SOF_NOSIGPIPE	0x00000001
+#define	SOF_NOADDRAVAIL	0x00000002	/* returns EADDRNOTAVAIL if src address is gone */
+#define	SOF_PCBCLEARING	0x00000004	/* pru_disconnect done, no need to call pru_detach */
+	int	so_usecount;	/* refcounting of socket use */
+	int	so_retaincnt;
+	u_int32_t so_filteruse;	/* usecount for the socket filters */
+	void	*reserved3;	/* Temporarily in use/debug: last socket lock LR */
+	void	*reserved4;	/* Temporarily in use/debug: last socket unlock LR */
+
 #endif
 };
-#endif /* __APPLE_API_UNSTABLE */
+#endif /* KERNEL_PRIVATE */
 
 /*
 * Socket state bits.
@@ -220,6 +229,7 @@ struct socket {
 #define	SS_INCOMP	0x800	/* Unaccepted, incomplete connection */
 #define	SS_COMP		0x1000	/* unaccepted, complete connection */
 #define	SS_ISDISCONNECTED	0x2000	/* socket disconnected from peer */
+#define	SS_DRAINING	0x4000	/* close waiting for blocked system calls to drain */
 
 /*
 * Externalized form of struct socket used by the sysctl(3) interface.
@@ -253,11 +263,11 @@ struct xsocket {
	uid_t	so_uid;		/* XXX */
 };
 
+#ifdef KERNEL_PRIVATE
 /*
 * Macros for sockets and socket buffering.
*/
-#ifdef __APPLE__
-#ifdef __APPLE_API_UNSTABLE
+
 #define	sbtoso(sb) (sb->sb_so)
 
 /*
 * Functions to operate on socket buffers.  These are macros on FreeBSD.
On Darwin the * implementation is in bsd/kern/uipc_socket2.c */ -int sb_notify __P((struct sockbuf *sb)); -long sbspace __P((struct sockbuf *sb)); -int sosendallatonce __P((struct socket *so)); -int soreadable __P((struct socket *so)); -int sowriteable __P((struct socket *so)); -void sballoc __P((struct sockbuf *sb, struct mbuf *m)); -void sbfree __P((struct sockbuf *sb, struct mbuf *m)); -int sblock __P((struct sockbuf *sb, int wf)); -void sbunlock __P((struct sockbuf *sb)); -void sorwakeup __P((struct socket * so)); -void sowwakeup __P((struct socket * so)); + +__BEGIN_DECLS +int sb_notify(struct sockbuf *sb); +long sbspace(struct sockbuf *sb); +int sosendallatonce(struct socket *so); +int soreadable(struct socket *so); +int sowriteable(struct socket *so); +void sballoc(struct sockbuf *sb, struct mbuf *m); +void sbfree(struct sockbuf *sb, struct mbuf *m); +int sblock(struct sockbuf *sb, int wf); +void sbunlock(struct sockbuf *sb, int locked); +void sorwakeup(struct socket * so); +void sowwakeup(struct socket * so); +__END_DECLS /* * Socket extension mechanism: control block hooks: @@ -294,10 +307,11 @@ struct kextcb }; #define EXT_NULL 0x0 /* STATE: Not in use */ #define sotokextcb(so) (so ? so->so_ext : 0) -#endif /* __APPLE___ */ #ifdef KERNEL +#define SO_FILT_HINT_LOCKED 0x1 + /* * Argument structure for sosetopt et seq. This is in the KERNEL * section because it will never be visible to user code. @@ -307,7 +321,7 @@ struct sockopt { enum sopt_dir sopt_dir; /* is this a get or a set? */ int sopt_level; /* second arg of [gs]etsockopt */ int sopt_name; /* third arg of [gs]etsockopt */ - void *sopt_val; /* fourth arg of [gs]etsockopt */ + user_addr_t sopt_val; /* fourth arg of [gs]etsockopt */ size_t sopt_valsize; /* (almost) fifth arg of [gs]etsockopt */ struct proc *sopt_p; /* calling process or null if kernel */ }; @@ -340,109 +354,101 @@ struct ucred; struct uio; struct knote; -/* - * File operations on sockets. 
- */ -int soo_read __P((struct file *fp, struct uio *uio, struct ucred *cred, - int flags, struct proc *p)); -int soo_write __P((struct file *fp, struct uio *uio, struct ucred *cred, - int flags, struct proc *p)); -int soo_close __P((struct file *fp, struct proc *p)); -int soo_ioctl __P((struct file *fp, u_long cmd, caddr_t data, - struct proc *p)); -int soo_stat __P((struct socket *so, struct stat *ub)); -int soo_select __P((struct file *fp, int which, void * wql, struct proc *p)); -int soo_kqfilter __P((struct file *fp, struct knote *kn, struct proc *p)); - - /* * From uipc_socket and friends */ -struct sockaddr *dup_sockaddr __P((struct sockaddr *sa, int canwait)); -int getsock __P((struct filedesc *fdp, int fd, struct file **fpp)); -int sockargs __P((struct mbuf **mp, caddr_t buf, int buflen, int type)); -int getsockaddr __P((struct sockaddr **namp, caddr_t uaddr, size_t len)); -void sbappend __P((struct sockbuf *sb, struct mbuf *m)); -int sbappendaddr __P((struct sockbuf *sb, struct sockaddr *asa, - struct mbuf *m0, struct mbuf *control)); -int sbappendcontrol __P((struct sockbuf *sb, struct mbuf *m0, - struct mbuf *control)); -void sbappendrecord __P((struct sockbuf *sb, struct mbuf *m0)); -void sbcheck __P((struct sockbuf *sb)); -void sbcompress __P((struct sockbuf *sb, struct mbuf *m, struct mbuf *n)); +__BEGIN_DECLS +struct sockaddr *dup_sockaddr(struct sockaddr *sa, int canwait); +int getsock(struct filedesc *fdp, int fd, struct file **fpp); +int sockargs(struct mbuf **mp, user_addr_t data, int buflen, int type); +int getsockaddr(struct sockaddr **namp, user_addr_t uaddr, size_t len); +int sbappend(struct sockbuf *sb, struct mbuf *m); +int sbappendaddr(struct sockbuf *sb, struct sockaddr *asa, + struct mbuf *m0, struct mbuf *control, int *error_out); +int sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, + struct mbuf *control, int *error_out); +int sbappendrecord(struct sockbuf *sb, struct mbuf *m0); +void sbcheck(struct sockbuf *sb); +int sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n); struct mbuf * - sbcreatecontrol __P((caddr_t p, int size, int type, int level)); -void sbdrop __P((struct sockbuf *sb, int len)); -void sbdroprecord __P((struct sockbuf *sb)); -void sbflush __P((struct sockbuf *sb)); -void sbinsertoob __P((struct sockbuf *sb, struct mbuf *m0)); -void sbrelease __P((struct sockbuf *sb)); -int sbreserve __P((struct sockbuf *sb, u_long cc)); -void sbtoxsockbuf __P((struct sockbuf *sb, struct xsockbuf *xsb)); -int sbwait __P((struct sockbuf *sb)); -int sb_lock __P((struct sockbuf *sb)); -int soabort __P((struct socket *so)); -int soaccept __P((struct socket *so, struct sockaddr **nam)); -struct socket *soalloc __P((int waitok, int dom, int type)); -int sobind __P((struct socket *so, struct sockaddr *nam)); -void socantrcvmore __P((struct socket *so)); -void socantsendmore __P((struct socket *so)); -int soclose __P((struct socket *so)); -int soconnect __P((struct socket *so, struct sockaddr *nam)); -int soconnect2 __P((struct socket *so1, struct socket *so2)); -int socreate __P((int dom, struct socket **aso, int type, int proto)); -void sodealloc __P((struct socket *so)); -int sodisconnect __P((struct socket *so)); -void sofree __P((struct socket *so)); -int sogetopt __P((struct socket *so, struct sockopt *sopt)); -void sohasoutofband __P((struct socket *so)); -void soisconnected __P((struct socket *so)); -void soisconnecting __P((struct socket *so)); -void soisdisconnected __P((struct socket *so)); -void soisdisconnecting __P((struct socket *so)); 
-int solisten __P((struct socket *so, int backlog)); + sbcreatecontrol(caddr_t p, int size, int type, int level); +void sbdrop(struct sockbuf *sb, int len); +void sbdroprecord(struct sockbuf *sb); +void sbflush(struct sockbuf *sb); +int sbinsertoob(struct sockbuf *sb, struct mbuf *m0); +void sbrelease(struct sockbuf *sb); +int sbreserve(struct sockbuf *sb, u_long cc); +void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb); +int sbwait(struct sockbuf *sb); +int sb_lock(struct sockbuf *sb); +int soabort(struct socket *so); +int soaccept(struct socket *so, struct sockaddr **nam); +int soacceptlock (struct socket *so, struct sockaddr **nam, int dolock); +struct socket *soalloc(int waitok, int dom, int type); +int sobind(struct socket *so, struct sockaddr *nam); +void socantrcvmore(struct socket *so); +void socantsendmore(struct socket *so); +int soclose(struct socket *so); +int soconnect(struct socket *so, struct sockaddr *nam); +int soconnectlock (struct socket *so, struct sockaddr *nam, int dolock); +int soconnect2(struct socket *so1, struct socket *so2); +int socreate(int dom, struct socket **aso, int type, int proto); +void sodealloc(struct socket *so); +int sodisconnect(struct socket *so); +void sofree(struct socket *so); +int sogetopt(struct socket *so, struct sockopt *sopt); +void sohasoutofband(struct socket *so); +void soisconnected(struct socket *so); +void soisconnecting(struct socket *so); +void soisdisconnected(struct socket *so); +void soisdisconnecting(struct socket *so); +int solisten(struct socket *so, int backlog); struct socket * - sodropablereq __P((struct socket *head)); + sodropablereq(struct socket *head); struct socket * - sonewconn __P((struct socket *head, int connstatus)); -int sooptcopyin __P((struct sockopt *sopt, void *buf, size_t len, - size_t minlen)); -int sooptcopyout __P((struct sockopt *sopt, void *buf, size_t len)); + sonewconn(struct socket *head, int connstatus, const struct sockaddr* from); +int sooptcopyin(struct sockopt *sopt, void *data, size_t len, size_t minlen); +int sooptcopyout(struct sockopt *sopt, void *data, size_t len); +int socket_lock(struct socket *so, int refcount); +int socket_unlock(struct socket *so, int refcount); /* * XXX; prepare mbuf for (__FreeBSD__ < 3) routines. * Used primarily in IPSec and IPv6 code. 
*/ -int soopt_getm __P((struct sockopt *sopt, struct mbuf **mp)); -int soopt_mcopyin __P((struct sockopt *sopt, struct mbuf *m)); -int soopt_mcopyout __P((struct sockopt *sopt, struct mbuf *m)); +int soopt_getm(struct sockopt *sopt, struct mbuf **mp); +int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m); +int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m); -int sopoll __P((struct socket *so, int events, struct ucred *cred, void *wql)); -int soreceive __P((struct socket *so, struct sockaddr **paddr, +int sopoll(struct socket *so, int events, struct ucred *cred, void *wql); +int soreceive(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, - struct mbuf **controlp, int *flagsp)); -int soreserve __P((struct socket *so, u_long sndcc, u_long rcvcc)); -void sorflush __P((struct socket *so)); -int sosend __P((struct socket *so, struct sockaddr *addr, struct uio *uio, - struct mbuf *top, struct mbuf *control, int flags)); + struct mbuf **controlp, int *flagsp); +int soreserve(struct socket *so, u_long sndcc, u_long rcvcc); +void sorflush(struct socket *so); +int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, + struct mbuf *top, struct mbuf *control, int flags); -int sosetopt __P((struct socket *so, struct sockopt *sopt)); -int soshutdown __P((struct socket *so, int how)); -void sotoxsocket __P((struct socket *so, struct xsocket *xso)); -void sowakeup __P((struct socket *so, struct sockbuf *sb)); +int sosetopt(struct socket *so, struct sockopt *sopt); +int soshutdown(struct socket *so, int how); +void sotoxsocket(struct socket *so, struct xsocket *xso); +void sowakeup(struct socket *so, struct sockbuf *sb); +int soioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p); #ifndef __APPLE__ /* accept filter functions */ -int accept_filt_add __P((struct accept_filter *filt)); -int accept_filt_del __P((char *name)); -struct accept_filter * accept_filt_get __P((char *name)); +int accept_filt_add(struct accept_filter *filt); +int accept_filt_del(char *name); +struct accept_filter * accept_filt_get(char *name); #ifdef ACCEPT_FILTER_MOD -int accept_filt_generic_mod_event __P((module_t mod, int event, void *data)); +int accept_filt_generic_mod_event(module_t mod, int event, void *data); SYSCTL_DECL(_net_inet_accf); #endif /* ACCEPT_FILTER_MOD */ #endif /* !defined(__APPLE__) */ +__END_DECLS + #endif /* KERNEL */ -#endif /* __APPLE_API_UNSTABLE */ +#endif /* KERNEL_PRIVATE */ #endif /* !_SYS_SOCKETVAR_H_ */ diff --git a/bsd/sys/sockio.h b/bsd/sys/sockio.h index 5a96e324b..701b94c1b 100644 --- a/bsd/sys/sockio.h +++ b/bsd/sys/sockio.h @@ -58,6 +58,8 @@ #ifndef _SYS_SOCKIO_H_ #define _SYS_SOCKIO_H_ +#include <sys/appleapiopts.h> + #include <sys/ioccom.h> /* Socket ioctl's. 
*/ @@ -69,8 +71,10 @@ #define SIOCSPGRP _IOW('s', 8, int) /* set process group */ #define SIOCGPGRP _IOR('s', 9, int) /* get process group */ -#define SIOCADDRT _IOW('r', 10, struct ortentry) /* add route */ -#define SIOCDELRT _IOW('r', 11, struct ortentry) /* delete route */ +#if 0 +#define SIOCADDRT _IOW('r', 10, struct ortentry) /* add route */ +#define SIOCDELRT _IOW('r', 11, struct ortentry) /* delete route */ +#endif #define SIOCSIFADDR _IOW('i', 12, struct ifreq) /* set ifnet address */ #define OSIOCGIFADDR _IOWR('i', 13, struct ifreq) /* get ifnet address */ @@ -98,9 +102,13 @@ #define SIOCGIFDSTADDR _IOWR('i', 34, struct ifreq) /* get p-p address */ #define SIOCGIFBRDADDR _IOWR('i', 35, struct ifreq) /* get broadcast addr */ #define SIOCGIFCONF _IOWR('i', 36, struct ifconf) /* get ifnet list */ +#ifdef KERNEL_PRIVATE +#define SIOCGIFCONF64 _IOWR('i', 36, struct ifconf64) /* get ifnet list */ +#endif /* KERNEL_PRIVATE */ #define SIOCGIFNETMASK _IOWR('i', 37, struct ifreq) /* get net addr mask */ #define SIOCAUTOADDR _IOWR('i', 38, struct ifreq) /* autoconf address */ #define SIOCAUTONETMASK _IOW('i', 39, struct ifreq) /* autoconf netmask */ +#define SIOCARPIPLL _IOWR('i', 40, struct ifreq) /* arp for IPv4LL address */ #define SIOCADDMULTI _IOW('i', 49, struct ifreq) /* add m'cast addr */ @@ -111,6 +119,9 @@ #define SIOCSIFPHYS _IOW('i', 54, struct ifreq) /* set IF wire */ #define SIOCSIFMEDIA _IOWR('i', 55, struct ifreq) /* set net media */ #define SIOCGIFMEDIA _IOWR('i', 56, struct ifmediareq) /* get net media */ +#ifdef KERNEL_PRIVATE +#define SIOCGIFMEDIA64 _IOWR('i', 56, struct ifmediareq64) /* get net media (64-bit) */ +#endif /* KERNEL_PRIVATE */ #define SIOCSIFGENERIC _IOW('i', 57, struct ifreq) /* generic IF set op */ #define SIOCGIFGENERIC _IOWR('i', 58, struct ifreq) /* generic IF get op */ #define SIOCRSLVMULTI _IOWR('i', 59, struct rslvmulti_req) @@ -124,33 +135,40 @@ #define SIOCSLIFPHYADDR _IOW('i', 66, struct if_laddrreq) /* set gif addrs */ #define SIOCGLIFPHYADDR _IOWR('i', 67, struct if_laddrreq) /* get gif addrs */ - - - - - - -#ifdef KERNEL_PRIVATE +#define SIOCGIFDEVMTU _IOWR('i', 68, struct ifreq) /* get if ifdevmtu */ +#define SIOCSIFALTMTU _IOW('i', 69, struct ifreq) /* set if alternate mtu */ +#define SIOCGIFALTMTU _IOWR('i', 72, struct ifreq) /* get if alternate mtu */ +#define SIOCSIFBOND _IOW('i', 70, struct ifreq) /* set bond if config */ +#define SIOCGIFBOND _IOWR('i', 71, struct ifreq) /* get bond if config */ #define SIOCIFCREATE _IOWR('i', 120, struct ifreq) /* create clone if */ #define SIOCIFDESTROY _IOW('i', 121, struct ifreq) /* destroy clone if */ -#if 0 +#define SIOCSIFVLAN _IOW('i', 126, struct ifreq) /* set VLAN config */ +#define SIOCGIFVLAN _IOWR('i', 127, struct ifreq) /* get VLAN config */ +#define SIOCSETVLAN SIOCSIFVLAN +#define SIOCGETVLAN SIOCGIFVLAN +#ifdef KERNEL_PRIVATE +#define SIOCSIFDEVMTU SIOCSIFALTMTU /* deprecated */ +#endif /* KERNEL_PRIVATE */ + +#ifdef PRIVATE +#ifdef KERNEL #define SIOCIFGCLONERS _IOWR('i', 129, struct if_clonereq) /* get cloners */ -#endif 0 -#define SIOCSETVLAN _IOW('i', 126, struct ifreq) /* set VLAN config */ -#define SIOCGETVLAN _IOWR('i', 127, struct ifreq) /* get VLAN config */ +#define SIOCIFGCLONERS64 _IOWR('i', 129, struct if_clonereq64) /* get cloners */ +#endif /* KERNEL */ /* * temporary control calls to attach/detach IP to/from an ethernet interface */ #define SIOCPROTOATTACH _IOWR('i', 80, struct ifreq) /* attach proto to interface */ #define SIOCPROTODETACH _IOWR('i', 81, struct ifreq) /* detach proto from interface */
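The SIOC* requests above are issued against any socket descriptor with ioctl(2). A minimal user-space sketch using SIOCGIFCONF to size the interface list (an editorial example, not part of the original header; production code must walk the returned ifreq records, whose length varies with sa_len):

#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
    char buf[4096];
    struct ifconf ifc;
    int s;

    if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0)
        return (1);
    memset(&ifc, 0, sizeof(ifc));
    ifc.ifc_len = sizeof(buf);      /* size of caller-supplied buffer */
    ifc.ifc_buf = buf;              /* kernel fills this with ifreqs */
    if (ioctl(s, SIOCGIFCONF, &ifc) < 0) {
        close(s);
        return (1);
    }
    printf("%d bytes of ifreq records returned\n", ifc.ifc_len);
    close(s);
    return (0);
}

SOCK_DGRAM is used only because the request needs some socket to act on; SIOCGIFCONF itself is interface-global.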
-#endif /* KERNEL_PRIVATE */ +#endif /* PRIVATE */ #define SIOCGIFASYNCMAP _IOWR('i', 124, struct ifreq) /* get ppp asyncmap */ #define SIOCSIFASYNCMAP _IOW('i', 125, struct ifreq) /* set ppp asyncmap */ +#ifdef PRIVATE #define SIOCSETOT _IOW('s', 128, int) /* set socket for LibOT */ - +#endif /* PRIVATE */ #endif /* !_SYS_SOCKIO_H_ */ diff --git a/bsd/sys/stat.h b/bsd/sys/stat.h index ffba038f3..3acb7c4a9 100644 --- a/bsd/sys/stat.h +++ b/bsd/sys/stat.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -64,116 +64,305 @@ #ifndef _SYS_STAT_H_ #define _SYS_STAT_H_ -#include <sys/time.h> +#include <sys/_types.h> +#include <sys/cdefs.h> + +/* + * [XSI] The blkcnt_t, blksize_t, dev_t, ino_t, mode_t, nlink_t, uid_t, + * gid_t, off_t, and time_t types shall be defined as described in + * <sys/types.h>. + */ +#ifndef _BLKCNT_T +typedef __darwin_blkcnt_t blkcnt_t; +#define _BLKCNT_T +#endif + +#ifndef _BLKSIZE_T +typedef __darwin_blksize_t blksize_t; +#define _BLKSIZE_T +#endif + +#ifndef _DEV_T +typedef __darwin_dev_t dev_t; /* device number */ +#define _DEV_T +#endif + +#ifndef _INO_T +typedef __darwin_ino_t ino_t; /* inode number */ +#define _INO_T +#endif + +#ifndef _MODE_T +typedef __darwin_mode_t mode_t; +#define _MODE_T +#endif + +#ifndef _NLINK_T +typedef __uint16_t nlink_t; /* link count */ +#define _NLINK_T +#endif + +#ifndef _UID_T +typedef __darwin_uid_t uid_t; /* user id */ +#define _UID_T +#endif + +#ifndef _GID_T +typedef __darwin_gid_t gid_t; +#define _GID_T +#endif -#ifndef _POSIX_SOURCE +#ifndef _OFF_T +typedef __darwin_off_t off_t; +#define _OFF_T +#endif + +#ifndef _TIME_T +#define _TIME_T +typedef __darwin_time_t time_t; +#endif + +/* [XSI] The timespec structure may be defined as described in <time.h> */ +#ifndef _TIMESPEC +#define _TIMESPEC +struct timespec { + time_t tv_sec; /* seconds */ + long tv_nsec; /* and nanoseconds */ +}; +// LP64todo - should this move? +#ifdef KERNEL +/* LP64 version of struct timespec. time_t is a long and must grow when + * we're dealing with a 64-bit process. + * WARNING - keep in sync with struct timespec + */ +struct user_timespec { + user_time_t tv_sec; /* seconds */ + __int64_t tv_nsec; /* and nanoseconds */ +}; +#endif // KERNEL +#endif /* _TIMESPEC */ + + +#ifndef _POSIX_C_SOURCE +/* + * XXX So deprecated, it would make your head spin + * + * The old stat structure. In fact, this is not used by the kernel at all, + * and should not be used by user space, and should be removed from this + * header file entirely (along with the unused cvtstat() prototype in + * vnode_internal.h).
+ */ struct ostat { - u_int16_t st_dev; /* inode's device */ - ino_t st_ino; /* inode's number */ - mode_t st_mode; /* inode protection mode */ - nlink_t st_nlink; /* number of hard links */ - u_int16_t st_uid; /* user ID of the file's owner */ - u_int16_t st_gid; /* group ID of the file's group */ - u_int16_t st_rdev; /* device type */ - int32_t st_size; /* file size, in bytes */ + __uint16_t st_dev; /* inode's device */ + ino_t st_ino; /* inode's number */ + mode_t st_mode; /* inode protection mode */ + nlink_t st_nlink; /* number of hard links */ + __uint16_t st_uid; /* user ID of the file's owner */ + __uint16_t st_gid; /* group ID of the file's group */ + __uint16_t st_rdev; /* device type */ + __int32_t st_size; /* file size, in bytes */ struct timespec st_atimespec; /* time of last access */ struct timespec st_mtimespec; /* time of last data modification */ struct timespec st_ctimespec; /* time of last file status change */ - int32_t st_blksize; /* optimal blocksize for I/O */ - int32_t st_blocks; /* blocks allocated for file */ - u_int32_t st_flags; /* user defined flags for file */ - u_int32_t st_gen; /* file generation number */ + __int32_t st_blksize; /* optimal blocksize for I/O */ + __int32_t st_blocks; /* blocks allocated for file */ + __uint32_t st_flags; /* user defined flags for file */ + __uint32_t st_gen; /* file generation number */ }; -#endif /* !_POSIX_SOURCE */ +#endif /* !_POSIX_C_SOURCE */ +/* + * [XSI] This structure is used as the second parameter to the fstat(), + * lstat(), and stat() functions. + */ struct stat { - dev_t st_dev; /* inode's device */ - ino_t st_ino; /* inode's number */ - mode_t st_mode; /* inode protection mode */ - nlink_t st_nlink; /* number of hard links */ - uid_t st_uid; /* user ID of the file's owner */ - gid_t st_gid; /* group ID of the file's group */ - dev_t st_rdev; /* device type */ -#ifndef _POSIX_SOURCE + dev_t st_dev; /* [XSI] ID of device containing file */ + ino_t st_ino; /* [XSI] File serial number */ + mode_t st_mode; /* [XSI] Mode of file (see below) */ + nlink_t st_nlink; /* [XSI] Number of hard links */ + uid_t st_uid; /* [XSI] User ID of the file */ + gid_t st_gid; /* [XSI] Group ID of the file */ + dev_t st_rdev; /* [XSI] Device ID */ +#ifndef _POSIX_C_SOURCE struct timespec st_atimespec; /* time of last access */ struct timespec st_mtimespec; /* time of last data modification */ - struct timespec st_ctimespec; /* time of last file status change */ + struct timespec st_ctimespec; /* time of last status change */ #else - time_t st_atime; /* time of last access */ - long st_atimensec; /* nsec of last access */ - time_t st_mtime; /* time of last data modification */ - long st_mtimensec; /* nsec of last data modification */ - time_t st_ctime; /* time of last file status change */ - long st_ctimensec; /* nsec of last file status change */ -#endif - off_t st_size; /* file size, in bytes */ - int64_t st_blocks; /* blocks allocated for file */ - u_int32_t st_blksize; /* optimal blocksize for I/O */ - u_int32_t st_flags; /* user defined flags for file */ - u_int32_t st_gen; /* file generation number */ - int32_t st_lspare; - int64_t st_qspare[2]; + time_t st_atime; /* [XSI] Time of last access */ + long st_atimensec; /* nsec of last access */ + time_t st_mtime; /* [XSI] Last data modification time */ + long st_mtimensec; /* last data modification nsec */ + time_t st_ctime; /* [XSI] Time of last status change */ + long st_ctimensec; /* nsec of last status change */ +#endif + off_t st_size; /* [XSI] file size, in bytes */ + 
blkcnt_t st_blocks; /* [XSI] blocks allocated for file */ + blksize_t st_blksize; /* [XSI] optimal blocksize for I/O */ + __uint32_t st_flags; /* user defined flags for file */ + __uint32_t st_gen; /* file generation number */ + __int32_t st_lspare; /* RESERVED: DO NOT USE! */ + __int64_t st_qspare[2]; /* RESERVED: DO NOT USE! */ }; +// LP64todo - should this move? +#ifdef KERNEL +#include <machine/types.h> -#ifndef _POSIX_SOURCE -#define st_atime st_atimespec.tv_sec -#define st_mtime st_mtimespec.tv_sec -#define st_ctime st_ctimespec.tv_sec +/* LP64 version of struct stat. time_t (see timespec) is a long and must + * grow when we're dealing with a 64-bit process. + * WARNING - keep in sync with struct stat + */ +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural #endif -#define S_ISUID 0004000 /* set user id on execution */ -#define S_ISGID 0002000 /* set group id on execution */ -#ifndef _POSIX_SOURCE -#define S_ISTXT 0001000 /* sticky bit */ +struct user_stat { + dev_t st_dev; /* [XSI] ID of device containing file */ + ino_t st_ino; /* [XSI] File serial number */ + mode_t st_mode; /* [XSI] Mode of file (see below) */ + nlink_t st_nlink; /* [XSI] Number of hard links */ + uid_t st_uid; /* [XSI] User ID of the file */ + gid_t st_gid; /* [XSI] Group ID of the file */ + dev_t st_rdev; /* [XSI] Device ID */ +#ifndef _POSIX_C_SOURCE + struct user_timespec st_atimespec; /* time of last access */ + struct user_timespec st_mtimespec; /* time of last data modification */ + struct user_timespec st_ctimespec; /* time of last status change */ +#else + user_time_t st_atime; /* [XSI] Time of last access */ + __int64_t st_atimensec; /* nsec of last access */ + user_time_t st_mtime; /* [XSI] Last data modification */ + __int64_t st_mtimensec; /* last data modification nsec */ + user_time_t st_ctime; /* [XSI] Time of last status change */ + __int64_t st_ctimensec; /* nsec of last status change */ #endif + off_t st_size; /* [XSI] File size, in bytes */ + blkcnt_t st_blocks; /* [XSI] Blocks allocated for file */ + blksize_t st_blksize; /* [XSI] Optimal blocksize for I/O */ + __uint32_t st_flags; /* user defined flags for file */ + __uint32_t st_gen; /* file generation number */ + __int32_t st_lspare; /* RESERVED: DO NOT USE! */ + __int64_t st_qspare[2]; /* RESERVED: DO NOT USE! */ +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +extern void munge_stat(struct stat *sbp, struct user_stat *usbp); -#define S_IRWXU 0000700 /* RWX mask for owner */ -#define S_IRUSR 0000400 /* R for owner */ -#define S_IWUSR 0000200 /* W for owner */ -#define S_IXUSR 0000100 /* X for owner */ +#endif // KERNEL -#ifndef _POSIX_SOURCE -#define S_IREAD S_IRUSR -#define S_IWRITE S_IWUSR -#define S_IEXEC S_IXUSR + +#ifndef _POSIX_C_SOURCE +#define st_atime st_atimespec.tv_sec +#define st_mtime st_mtimespec.tv_sec +#define st_ctime st_ctimespec.tv_sec #endif -#define S_IRWXG 0000070 /* RWX mask for group */ -#define S_IRGRP 0000040 /* R for group */ -#define S_IWGRP 0000020 /* W for group */ -#define S_IXGRP 0000010 /* X for group */ +/* + * [XSI] The following are symbolic names for the values of type mode_t. They + * are bitmap values. 
+ */ +#ifndef S_IFMT +/* File type */ +#define S_IFMT 0170000 /* [XSI] type of file mask */ +#define S_IFIFO 0010000 /* [XSI] named pipe (fifo) */ +#define S_IFCHR 0020000 /* [XSI] character special */ +#define S_IFDIR 0040000 /* [XSI] directory */ +#define S_IFBLK 0060000 /* [XSI] block special */ +#define S_IFREG 0100000 /* [XSI] regular */ +#define S_IFLNK 0120000 /* [XSI] symbolic link */ +#define S_IFSOCK 0140000 /* [XSI] socket */ +#ifndef _POSIX_C_SOURCE +#define S_IFWHT 0160000 /* whiteout */ +#define S_IFXATTR 0200000 /* extended attribute */ +#endif -#define S_IRWXO 0000007 /* RWX mask for other */ -#define S_IROTH 0000004 /* R for other */ -#define S_IWOTH 0000002 /* W for other */ -#define S_IXOTH 0000001 /* X for other */ +/* File mode */ +/* Read, write, execute/search by owner */ +#define S_IRWXU 0000700 /* [XSI] RWX mask for owner */ +#define S_IRUSR 0000400 /* [XSI] R for owner */ +#define S_IWUSR 0000200 /* [XSI] W for owner */ +#define S_IXUSR 0000100 /* [XSI] X for owner */ +/* Read, write, execute/search by group */ +#define S_IRWXG 0000070 /* [XSI] RWX mask for group */ +#define S_IRGRP 0000040 /* [XSI] R for group */ +#define S_IWGRP 0000020 /* [XSI] W for group */ +#define S_IXGRP 0000010 /* [XSI] X for group */ +/* Read, write, execute/search by others */ +#define S_IRWXO 0000007 /* [XSI] RWX mask for other */ +#define S_IROTH 0000004 /* [XSI] R for other */ +#define S_IWOTH 0000002 /* [XSI] W for other */ +#define S_IXOTH 0000001 /* [XSI] X for other */ -#ifndef _POSIX_SOURCE -#define S_IFMT 0170000 /* type of file mask */ -#define S_IFIFO 0010000 /* named pipe (fifo) */ -#define S_IFCHR 0020000 /* character special */ -#define S_IFDIR 0040000 /* directory */ -#define S_IFBLK 0060000 /* block special */ -#define S_IFREG 0100000 /* regular */ -#define S_IFLNK 0120000 /* symbolic link */ -#define S_IFSOCK 0140000 /* socket */ -#define S_IFWHT 0160000 /* whiteout */ -#define S_ISVTX 0001000 /* save swapped text even after use */ +#define S_ISUID 0004000 /* [XSI] set user id on execution */ +#define S_ISGID 0002000 /* [XSI] set group id on execution */ +#define S_ISVTX 0001000 /* [XSI] directory restricted delete */ + +#ifndef _POSIX_C_SOURCE +#define S_ISTXT S_ISVTX /* sticky bit: not supported */ +#define S_IREAD S_IRUSR /* backward compatibility */ +#define S_IWRITE S_IWUSR /* backward compatibility */ +#define S_IEXEC S_IXUSR /* backward compatibility */ #endif +#endif /* !S_IFMT */ -#define S_ISDIR(m) (((m) & 0170000) == 0040000) /* directory */ -#define S_ISCHR(m) (((m) & 0170000) == 0020000) /* char special */ +/* + * [XSI] The following macros shall be provided to test whether a file is + * of the specified type. The value m supplied to the macros is the value + * of st_mode from a stat structure. The macro shall evaluate to a non-zero + * value if the test is true; 0 if the test is false. + */
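A usage sketch of the test macros defined just below, applied to st_mode from a stat structure (an editorial example, not part of the original header):

#include <sys/stat.h>
#include <stdio.h>

int
main(int argc, char *argv[])
{
    struct stat sb;

    if (argc < 2 || lstat(argv[1], &sb) != 0)
        return (1);
    if (S_ISDIR(sb.st_mode))
        printf("%s: directory\n", argv[1]);
    else if (S_ISLNK(sb.st_mode))
        printf("%s: symbolic link\n", argv[1]);
    else if (S_ISREG(sb.st_mode))
        printf("%s: regular file\n", argv[1]);
    return (0);
}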
#define S_ISBLK(m) (((m) & 0170000) == 0060000) /* block special */ -#define S_ISREG(m) (((m) & 0170000) == 0100000) /* regular file */ +#define S_ISCHR(m) (((m) & 0170000) == 0020000) /* char special */ +#define S_ISDIR(m) (((m) & 0170000) == 0040000) /* directory */ #define S_ISFIFO(m) (((m) & 0170000) == 0010000) /* fifo or socket */ -#ifndef _POSIX_SOURCE +#define S_ISREG(m) (((m) & 0170000) == 0100000) /* regular file */ #define S_ISLNK(m) (((m) & 0170000) == 0120000) /* symbolic link */ #define S_ISSOCK(m) (((m) & 0170000) == 0140000) /* socket */ +#ifndef _POSIX_C_SOURCE #define S_ISWHT(m) (((m) & 0170000) == 0160000) /* whiteout */ +#define S_ISXATTR(m) (((m) & 0200000) == 0200000) /* extended attribute */ #endif -#ifndef _POSIX_SOURCE +/* + * [XSI] The implementation may implement message queues, semaphores, or + * shared memory objects as distinct file types. The following macros + * shall be provided to test whether a file is of the specified type. + * The value of the buf argument supplied to the macros is a pointer to + * a stat structure. The macro shall evaluate to a non-zero value if + * the specified object is implemented as a distinct file type and the + * specified file type is contained in the stat structure referenced by + * buf. Otherwise, the macro shall evaluate to zero. + * + * NOTE: The current implementation does not do this, although + * this may change in future revisions, and so currently only + * provides these macros to ensure source compatibility with + * implementations which do. + */ +#define S_TYPEISMQ(buf) (0) /* Test for a message queue */ +#define S_TYPEISSEM(buf) (0) /* Test for a semaphore */ +#define S_TYPEISSHM(buf) (0) /* Test for a shared memory object */ + +/* + * [TYM] The implementation may implement typed memory objects as distinct + * file types, and the following macro shall test whether a file is of the + * specified type. The value of the buf argument supplied to the macros is + * a pointer to a stat structure. The macro shall evaluate to a non-zero + * value if the specified object is implemented as a distinct file type and + * the specified file type is contained in the stat structure referenced by + * buf. Otherwise, the macro shall evaluate to zero. + * + * NOTE: The current implementation does not do this, although + * this may change in future revisions, and so currently only + * provides this macro to ensure source compatibility with + * implementations which do.
+ */ +#define S_TYPEISTMO(buf) (0) /* Test for a typed memory object */ + + +#ifndef _POSIX_C_SOURCE #define ACCESSPERMS (S_IRWXU|S_IRWXG|S_IRWXO) /* 0777 */ /* 7777 */ #define ALLPERMS (S_ISUID|S_ISGID|S_ISTXT|S_IRWXU|S_IRWXG|S_IRWXO) @@ -211,21 +400,36 @@ struct stat { #endif #ifndef KERNEL -#include <sys/cdefs.h> __BEGIN_DECLS -int chmod __P((const char *, mode_t)); -int fstat __P((int, struct stat *)); -int mkdir __P((const char *, mode_t)); -int mkfifo __P((const char *, mode_t)); -int stat __P((const char *, struct stat *)); -mode_t umask __P((mode_t)); -#ifndef _POSIX_SOURCE -int chflags __P((const char *, u_long)); -int fchflags __P((int, u_long)); -int fchmod __P((int, mode_t)); -int lstat __P((const char *, struct stat *)); +/* [XSI] */ +int chmod(const char *, mode_t); +int fchmod(int, mode_t); +int fstat(int, struct stat *); +int lstat(const char *, struct stat *); +int mkdir(const char *, mode_t); +int mkfifo(const char *, mode_t); +int stat(const char *, struct stat *); +int mknod(const char *, mode_t, dev_t); +mode_t umask(mode_t); + +#ifndef _POSIX_C_SOURCE +#ifndef _FILESEC_T +struct _filesec; +typedef struct _filesec *filesec_t; +#define _FILESEC_T #endif +int chflags(const char *, __uint32_t); +int chmodx_np(const char *, filesec_t); +int fchflags(int, __uint32_t); +int fchmodx_np(int, filesec_t); +int fstatx_np(int, struct stat *, filesec_t); +int lstatx_np(const char *, struct stat *, filesec_t); +int mkdirx_np(const char *, filesec_t); +int mkfifox_np(const char *, filesec_t); +int statx_np(const char *, struct stat *, filesec_t); +int umaskx_np(filesec_t); +#endif /* POSIX_C_SOURCE */ __END_DECLS #endif #endif /* !_SYS_STAT_H_ */ diff --git a/bsd/sys/sys_domain.h b/bsd/sys/sys_domain.h index b9582eca3..788b71d96 100644 --- a/bsd/sys/sys_domain.h +++ b/bsd/sys/sys_domain.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -26,6 +26,7 @@ #include <sys/appleapiopts.h> +#include <sys/cdefs.h> /* Kernel Events Protocol */ #define SYSPROTO_EVENT 1 /* kernel events protocol */ @@ -45,15 +46,17 @@ struct sockaddr_sys #ifdef KERNEL -#ifdef __APPLE_API_PRIVATE +#ifdef KERNEL_PRIVATE extern struct domain systemdomain; /* built in system domain protocols init function */ -int kern_event_init(); -int kern_control_init(); +__BEGIN_DECLS +int kern_event_init(void); +int kern_control_init(void); +__END_DECLS -#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL_PRIVATE */ #endif /* KERNEL */ #endif /* _SYSTEM_DOMAIN_H_ */ diff --git a/bsd/sys/syscall.h b/bsd/sys/syscall.h index 120dee023..d23b0ecf1 100644 --- a/bsd/sys/syscall.h +++ b/bsd/sys/syscall.h @@ -1,341 +1,456 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. 
+ * @APPLE_LICENSE_HEADER_START@ * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. * - * @APPLE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1992, 1995-1999 Apple Computer, Inc. All Rights Reserved */ -/* - * - * The NEXTSTEP Software License Agreement specifies the terms - * and conditions for redistribution. + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + * + * + * System call switch table. * + * DO NOT EDIT-- this file is automatically generated. + * created from syscalls.master */ -#include <sys/appleapiopts.h> -#ifdef __APPLE_API_PRIVATE +#ifndef _SYS_SYSCALL_H_ +#define _SYS_SYSCALL_H_ -#define SYS_syscall 0 -#define SYS_exit 1 -#define SYS_fork 2 -#define SYS_read 3 -#define SYS_write 4 -#define SYS_open 5 -#define SYS_close 6 -#define SYS_wait4 7 - /* 8 is old creat */ -#define SYS_link 9 -#define SYS_unlink 10 - /* 11 is obsolete execv */ -#define SYS_chdir 12 -#define SYS_fchdir 13 -#define SYS_mknod 14 -#define SYS_chmod 15 -#define SYS_chown 16 - /* 17 is obsolete sbreak */ +#include <sys/appleapiopts.h> +#ifdef __APPLE_API_PRIVATE +#define SYS_syscall 0 +#define SYS_exit 1 +#define SYS_fork 2 +#define SYS_read 3 +#define SYS_write 4 +#define SYS_open 5 +#define SYS_close 6 +#define SYS_wait4 7 + /* 8 old creat */ +#define SYS_link 9 +#define SYS_unlink 10 + /* 11 old execv */ +#define SYS_chdir 12 +#define SYS_fchdir 13 +#define SYS_mknod 14 +#define SYS_chmod 15 +#define SYS_chown 16 +#define SYS_obreak 17 #if COMPAT_GETFSSTAT - /* 18 is old getfsstat */ +#define SYS_ogetfsstat 18 +#else +#define SYS_getfsstat 18 +#endif + /* 19 old lseek */ +#define SYS_getpid 20 + /* 21 old mount */ + /* 22 old umount */ +#define SYS_setuid 23 +#define SYS_getuid 24 +#define SYS_geteuid 25 +#define SYS_ptrace 26 +#define SYS_recvmsg 27 +#define SYS_sendmsg 28 +#define SYS_recvfrom 29 +#define SYS_accept 30 +#define SYS_getpeername 31 +#define SYS_getsockname 32 +#define SYS_access 33 +#define SYS_chflags 34 +#define SYS_fchflags 35 +#define SYS_sync 36 +#define SYS_kill 37 + /* 38 old stat */ +#define SYS_getppid 39 + /* 40 old lstat */ +#define SYS_dup 41 +#define SYS_pipe 42 +#define SYS_getegid 43 +#define SYS_profil 44 +#define SYS_ktrace 45 +#define SYS_sigaction 46 +#define SYS_getgid 47 +#define SYS_sigprocmask 48 +#define SYS_getlogin 49 +#define SYS_setlogin 50 +#define SYS_acct 51 +#define SYS_sigpending 52 
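These SYS_ numbers are the values the libc syscall(2) trap interface takes as its first argument; a minimal user-space sketch (an editorial example, not part of the generated header):

#include <sys/syscall.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    /* getpid(2) invoked by number; equivalent to calling getpid() */
    long pid = syscall(SYS_getpid);

    printf("SYS_getpid is %d; pid is %ld\n", SYS_getpid, pid);
    return (0);
}

Invoking syscalls by number is rarely necessary; the libc wrappers are the supported route.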
+#define SYS_sigaltstack 53 +#define SYS_ioctl 54 +#define SYS_reboot 55 +#define SYS_revoke 56 +#define SYS_symlink 57 +#define SYS_readlink 58 +#define SYS_execve 59 +#define SYS_umask 60 +#define SYS_chroot 61 + /* 62 old fstat */ + /* 63 used internally , reserved */ + /* 64 old getpagesize */ +#define SYS_msync 65 +#define SYS_vfork 66 + /* 67 old vread */ + /* 68 old vwrite */ +#define SYS_sbrk 69 +#define SYS_sstk 70 + /* 71 old mmap */ +#define SYS_ovadvise 72 +#define SYS_munmap 73 +#define SYS_mprotect 74 +#define SYS_madvise 75 + /* 76 old vhangup */ + /* 77 old vlimit */ +#define SYS_mincore 78 +#define SYS_getgroups 79 +#define SYS_setgroups 80 +#define SYS_getpgrp 81 +#define SYS_setpgid 82 +#define SYS_setitimer 83 + /* 84 old wait */ +#define SYS_swapon 85 +#define SYS_getitimer 86 + /* 87 old gethostname */ + /* 88 old sethostname */ +#define SYS_getdtablesize 89 +#define SYS_dup2 90 + /* 91 old getdopt */ +#define SYS_fcntl 92 +#define SYS_select 93 + /* 94 old setdopt */ +#define SYS_fsync 95 +#define SYS_setpriority 96 +#define SYS_socket 97 +#define SYS_connect 98 + /* 99 old accept */ +#define SYS_getpriority 100 + /* 101 old send */ + /* 102 old recv */ +#ifdef __ppc__ + /* 103 old sigreturn */ +#else +#define SYS_sigreturn 103 +#endif +#define SYS_bind 104 +#define SYS_setsockopt 105 +#define SYS_listen 106 + /* 107 old vtimes */ + /* 108 old sigvec */ + /* 109 old sigblock */ + /* 110 old sigsetmask */ +#define SYS_sigsuspend 111 + /* 112 old sigstack */ + /* 113 old recvmsg */ + /* 114 old sendmsg */ + /* 115 old vtrace */ +#ifdef __ppc__ +#define SYS_ppc_gettimeofday 116 +#define SYS_gettimeofday 116 +#else +#define SYS_gettimeofday 116 +#endif +#define SYS_getrusage 117 +#define SYS_getsockopt 118 + /* 119 old resuba */ +#define SYS_readv 120 +#define SYS_writev 121 +#define SYS_settimeofday 122 +#define SYS_fchown 123 +#define SYS_fchmod 124 + /* 125 old recvfrom */ + /* 126 old setreuid */ + /* 127 old setregid */ +#define SYS_rename 128 + /* 129 old truncate */ + /* 130 old ftruncate */ +#define SYS_flock 131 +#define SYS_mkfifo 132 +#define SYS_sendto 133 +#define SYS_shutdown 134 +#define SYS_socketpair 135 +#define SYS_mkdir 136 +#define SYS_rmdir 137 +#define SYS_utimes 138 +#define SYS_futimes 139 +#define SYS_adjtime 140 + /* 141 old getpeername */ + /* 142 old gethostid */ + /* 143 old sethostid */ + /* 144 old getrlimit */ + /* 145 old setrlimit */ + /* 146 old killpg */ +#define SYS_setsid 147 + /* 148 old setquota */ + /* 149 old qquota */ + /* 150 old getsockname */ +#define SYS_getpgid 151 +#define SYS_setprivexec 152 +#define SYS_pread 153 +#define SYS_pwrite 154 +#if NFSSERVER +#define SYS_nfssvc 155 #else -#define SYS_getfsstat 18 + /* 155 */ #endif - /* 19 is old lseek */ -#define SYS_getpid 20 - /* 21 is obsolete mount */ - /* 22 is obsolete umount */ -#define SYS_setuid 23 -#define SYS_getuid 24 -#define SYS_geteuid 25 -#define SYS_ptrace 26 -#define SYS_recvmsg 27 -#define SYS_sendmsg 28 -#define SYS_recvfrom 29 -#define SYS_accept 30 -#define SYS_getpeername 31 -#define SYS_getsockname 32 -#define SYS_access 33 -#define SYS_chflags 34 -#define SYS_fchflags 35 -#define SYS_sync 36 -#define SYS_kill 37 - /* 38 is old stat */ -#define SYS_getppid 39 - /* 40 is old lstat */ -#define SYS_dup 41 -#define SYS_pipe 42 -#define SYS_getegid 43 -#define SYS_profil 44 -#define SYS_ktrace 45 -#define SYS_sigaction 46 -#define SYS_getgid 47 -#define SYS_sigprocmask 48 -#define SYS_getlogin 49 -#define SYS_setlogin 50 -#define SYS_acct 51 -#define 
SYS_sigpending 52 -#define SYS_sigaltstack 53 -#define SYS_ioctl 54 -#define SYS_reboot 55 -#define SYS_revoke 56 -#define SYS_symlink 57 -#define SYS_readlink 58 -#define SYS_execve 59 -#define SYS_umask 60 -#define SYS_chroot 61 - /* 62 is old fstat */ - /* 63 is unused */ - /* 64 is old getpagesize */ -#define SYS_msync 65 -#define SYS_vfork 66 - /* 67 is obsolete vread */ - /* 68 is obsolete vwrite */ -#define SYS_sbrk 69 -#define SYS_sstk 70 - /* 71 is old mmap */ - /* 72 is obsolete vadvise */ -#define SYS_munmap 73 -#define SYS_mprotect 74 -#define SYS_madvise 75 - /* 76 is obsolete vhangup */ - /* 77 is obsolete vlimit */ -#define SYS_mincore 78 -#define SYS_getgroups 79 -#define SYS_setgroups 80 -#define SYS_getpgrp 81 -#define SYS_setpgid 82 -#define SYS_setitimer 83 - /* 84 is old wait */ -#define SYS_swapon 85 -#define SYS_getitimer 86 - /* 87 is old gethostname */ - /* 88 is old sethostname */ -#define SYS_getdtablesize 89 -#define SYS_dup2 90 -#define SYS_fcntl 92 -#define SYS_select 93 - /* 94 is obsolete setdopt */ -#define SYS_fsync 95 -#define SYS_setpriority 96 -#define SYS_socket 97 -#define SYS_connect 98 - /* 99 is old accept */ -#define SYS_getpriority 100 - /* 101 is old send */ - /* 102 is old recv */ -#ifndef __ppc__ -#define SYS_sigreturn 103 + /* 156 old getdirentries */ +#define SYS_statfs 157 +#define SYS_fstatfs 158 +#define SYS_unmount 159 + /* 160 old async_daemon */ +#if NFSCLIENT +#define SYS_getfh 161 +#else + /* 161 */ #endif -#define SYS_bind 104 -#define SYS_setsockopt 105 -#define SYS_listen 106 - /* 107 is obsolete vtimes */ - /* 108 is old sigvec */ - /* 109 is old sigblock */ - /* 110 is old sigsetmask */ -#define SYS_sigsuspend 111 - /* 112 is old sigstack */ - /* 113 is old recvmsg */ - /* 114 is old sendmsg */ - /* 115 is obsolete vtrace */ -#define SYS_gettimeofday 116 -#define SYS_getrusage 117 -#define SYS_getsockopt 118 - /* 119 is obsolete resuba */ -#define SYS_readv 120 -#define SYS_writev 121 -#define SYS_settimeofday 122 -#define SYS_fchown 123 -#define SYS_fchmod 124 - /* 125 is old recvfrom */ - /* 126 is old setreuid */ - /* 127 is old setregid */ -#define SYS_rename 128 - /* 129 is old truncate */ - /* 130 is old ftruncate */ -#define SYS_flock 131 -#define SYS_mkfifo 132 -#define SYS_sendto 133 -#define SYS_shutdown 134 -#define SYS_socketpair 135 -#define SYS_mkdir 136 -#define SYS_rmdir 137 -#define SYS_utimes 138 -#define SYS_futimes 139 -#define SYS_adjtime 140 - /* 141 is old getpeername */ - /* 142 is old gethostid */ - /* 143 is old sethostid */ - /* 144 is old getrlimit */ - /* 145 is old setrlimit */ - /* 146 is old killpg */ -#define SYS_setsid 147 - /* 148 is obsolete setquota */ - /* 149 is obsolete quota */ - /* 150 is old getsockname */ -#define SYS_getpgid 151 -#define SYS_setprivexec 152 -#define SYS_pread 153 -#define SYS_pwrite 154 -#define SYS_nfssvc 155 - /* 156 is old getdirentries */ -#define SYS_statfs 157 -#define SYS_fstatfs 158 -#define SYS_unmount 159 - /* 160 is obsolete async_daemon */ -#define SYS_getfh 161 - /* 162 is old getdomainname */ - /* 163 is old setdomainname */ - /* 164 is obsolete pcfs_mount */ -#define SYS_quotactl 165 - /* 166 is obsolete exportfs */ -#define SYS_mount 167 - /* 168 is obsolete ustat */ - /* 169 is unused */ -#define SYS_table 170 - /* 171 is old wait_3 */ - /* 172 is obsolete rpause */ - /* 173 is unused */ - /* 174 is obsolete getdents */ -#define SYS_gc_control 175 -#define SYS_add_profil 176 - /* 177 is unused */ - /* 178 is unused */ - /* 179 is unused */ -#define 
SYS_kdebug_trace 180 -#define SYS_setgid 181 -#define SYS_setegid 182 -#define SYS_seteuid 183 + /* 162 old getdomainname */ + /* 163 old setdomainname */ + /* 164 */ +#define SYS_quotactl 165 + /* 166 old exportfs */ +#define SYS_mount 167 + /* 168 old ustat */ + /* 169 */ +#define SYS_table 170 + /* 171 old wait3 */ + /* 172 old rpause */ +#define SYS_waitid 173 + /* 174 old getdents */ + /* 175 old gc_control */ +#define SYS_add_profil 176 + /* 177 */ + /* 178 */ + /* 179 */ +#define SYS_kdebug_trace 180 +#define SYS_setgid 181 +#define SYS_setegid 182 +#define SYS_seteuid 183 #ifdef __ppc__ -#define SYS_sigreturn 184 +#define SYS_sigreturn 184 +#else + /* 184 */ #endif - /* 185 is unused */ - /* 186 is unused */ - /* 187 is unused */ -#define SYS_stat 188 -#define SYS_fstat 189 -#define SYS_lstat 190 -#define SYS_pathconf 191 -#define SYS_fpathconf 192 + /* 185 */ + /* 186 */ + /* 187 */ +#define SYS_stat 188 +#define SYS_fstat 189 +#define SYS_lstat 190 +#define SYS_pathconf 191 +#define SYS_fpathconf 192 #if COMPAT_GETFSSTAT -#define SYS_getfsstat 193 +#define SYS_getfsstat 193 +#else + /* 193 */ #endif -#define SYS_getrlimit 194 -#define SYS_setrlimit 195 -#define SYS_getdirentries 196 -#define SYS_mmap 197 -#define SYS___syscall 198 -#define SYS_lseek 199 -#define SYS_truncate 200 -#define SYS_ftruncate 201 -#define SYS___sysctl 202 -#define SYS_mlock 203 -#define SYS_munlock 204 -#define SYS_undelete 205 -#define SYS_ATsocket 206 -#define SYS_ATgetmsg 207 -#define SYS_ATputmsg 208 -#define SYS_ATPsndreq 209 -#define SYS_ATPsndrsp 210 -#define SYS_ATPgetreq 211 -#define SYS_ATPgetrsp 212 - /* 213 is reserved for AppleTalk */ -#define SYS_kqueue_from_portset_np 214 -#define SYS_kqueue_portset_np 215 -#define SYS_mkcomplex 216 -#define SYS_statv 217 -#define SYS_lstatv 218 -#define SYS_fstatv 219 -#define SYS_getattrlist 220 -#define SYS_setattrlist 221 -#define SYS_getdirentriesattr 222 -#define SYS_exchangedata 223 -#define SYS_checkuseraccess 224 -#define SYS_searchfs 225 - - /* 226 - 230 are reserved for HFS expansion */ - /* 231 - 241 are reserved */ -#define SYS_fsctl 242 - /* 243 - 246 are reserved */ -#define SYS_nfsclnt 247 /* from freebsd, for lockd */ -#define SYS_fhopen 248 /* from freebsd, for lockd */ - /* 249 is reserved */ -#define SYS_minherit 250 -#define SYS_semsys 251 -#define SYS_msgsys 252 -#define SYS_shmsys 253 -#define SYS_semctl 254 -#define SYS_semget 255 -#define SYS_semop 256 -#define SYS_semconfig 257 -#define SYS_msgctl 258 -#define SYS_msgget 259 -#define SYS_msgsnd 260 -#define SYS_msgrcv 261 -#define SYS_shmat 262 -#define SYS_shmctl 263 -#define SYS_shmdt 264 -#define SYS_shmget 265 -#define SYS_shm_open 266 -#define SYS_shm_unlink 267 -#define SYS_sem_open 268 -#define SYS_sem_close 269 -#define SYS_sem_unlink 270 -#define SYS_sem_wait 271 -#define SYS_sem_trywait 272 -#define SYS_sem_post 273 -#define SYS_sem_getvalue 274 -#define SYS_sem_init 275 -#define SYS_sem_destroy 276 - /* 277 - 295 are reserved */ -#define SYS_load_shared_file 296 -#define SYS_reset_shared_file 297 -#define SYS_new_system_shared_regions 298 - /* 299 - 309 are reserved */ -#define SYS_getsid 310 - /* 311 - 312 are reserved */ -#define SYS_aio_fsync 313 -#define SYS_aio_return 314 -#define SYS_aio_suspend 315 -#define SYS_aio_cancel 316 -#define SYS_aio_error 317 -#define SYS_aio_read 318 -#define SYS_aio_write 319 -#define SYS_lio_listio 320 - /* 321 - 323 are reserved */ -#define SYS_mlockall 324 -#define SYS_munlockall 325 - /* 326 is reserved */ -#define SYS_issetugid 
327 -#define SYS___pthread_kill 328 -#define SYS_pthread_sigmask 329 -#define SYS_sigwait 330 - -#define SYS_audit 350 /* submit user space audit records */ -#define SYS_auditon 351 /* audit subsystem control */ - /* 352 is unused; used to be auditsvc */ -#define SYS_getauid 353 -#define SYS_setauid 354 -#define SYS_getaudit 355 -#define SYS_setaudit 356 -#define SYS_getaudit_addr 357 -#define SYS_setaudit_addr 358 -#define SYS_auditctl 359 /* audit file control */ +#define SYS_getrlimit 194 +#define SYS_setrlimit 195 +#define SYS_getdirentries 196 +#define SYS_mmap 197 + /* 198 __syscall */ +#define SYS_lseek 199 +#define SYS_truncate 200 +#define SYS_ftruncate 201 +#define SYS___sysctl 202 +#define SYS_mlock 203 +#define SYS_munlock 204 +#define SYS_undelete 205 +#ifdef __ppc__ +#define SYS_ATsocket 206 +#define SYS_ATgetmsg 207 +#define SYS_ATputmsg 208 +#define SYS_ATPsndreq 209 +#define SYS_ATPsndrsp 210 +#define SYS_ATPgetreq 211 +#define SYS_ATPgetrsp 212 + /* 213 Reserved for AppleTalk */ +#else +#define SYS_ATsocket 206 +#define SYS_ATgetmsg 207 +#define SYS_ATputmsg 208 +#define SYS_ATPsndreq 209 +#define SYS_ATPsndrsp 210 +#define SYS_ATPgetreq 211 +#define SYS_ATPgetrsp 212 + /* 213 Reserved for AppleTalk */ +#endif /* __ppc__ */ +#define SYS_kqueue_from_portset_np 214 +#define SYS_kqueue_portset_np 215 +#define SYS_mkcomplex 216 +#define SYS_statv 217 +#define SYS_lstatv 218 +#define SYS_fstatv 219 +#define SYS_getattrlist 220 +#define SYS_setattrlist 221 +#define SYS_getdirentriesattr 222 +#define SYS_exchangedata 223 +#ifdef __APPLE_API_OBSOLETE +#define SYS_checkuseraccess 224 +#else + /* 224 HFS checkuseraccess check access to a file */ +#endif /* __APPLE_API_OBSOLETE */ +#define SYS_searchfs 225 +#define SYS_delete 226 +#define SYS_copyfile 227 + /* 228 */ + /* 229 */ +#define SYS_poll 230 +#define SYS_watchevent 231 +#define SYS_waitevent 232 +#define SYS_modwatch 233 +#define SYS_getxattr 234 +#define SYS_fgetxattr 235 +#define SYS_setxattr 236 +#define SYS_fsetxattr 237 +#define SYS_removexattr 238 +#define SYS_fremovexattr 239 +#define SYS_listxattr 240 +#define SYS_flistxattr 241 +#define SYS_fsctl 242 +#define SYS_initgroups 243 + /* 244 */ + /* 245 */ + /* 246 */ +#if NFSCLIENT +#define SYS_nfsclnt 247 +#define SYS_fhopen 248 +#else + /* 247 */ + /* 248 */ +#endif + /* 249 */ +#define SYS_minherit 250 +#define SYS_semsys 251 +#define SYS_msgsys 252 +#define SYS_shmsys 253 +#define SYS_semctl 254 +#define SYS_semget 255 +#define SYS_semop 256 +#define SYS_semconfig 257 +#define SYS_msgctl 258 +#define SYS_msgget 259 +#define SYS_msgsnd 260 +#define SYS_msgrcv 261 +#define SYS_shmat 262 +#define SYS_shmctl 263 +#define SYS_shmdt 264 +#define SYS_shmget 265 +#define SYS_shm_open 266 +#define SYS_shm_unlink 267 +#define SYS_sem_open 268 +#define SYS_sem_close 269 +#define SYS_sem_unlink 270 +#define SYS_sem_wait 271 +#define SYS_sem_trywait 272 +#define SYS_sem_post 273 +#define SYS_sem_getvalue 274 +#define SYS_sem_init 275 +#define SYS_sem_destroy 276 +#define SYS_open_extended 277 +#define SYS_umask_extended 278 +#define SYS_stat_extended 279 +#define SYS_lstat_extended 280 +#define SYS_fstat_extended 281 +#define SYS_chmod_extended 282 +#define SYS_fchmod_extended 283 +#define SYS_access_extended 284 +#define SYS_settid 285 +#define SYS_gettid 286 +#define SYS_setsgroups 287 +#define SYS_getsgroups 288 +#define SYS_setwgroups 289 +#define SYS_getwgroups 290 +#define SYS_mkfifo_extended 291 +#define SYS_mkdir_extended 292 +#define SYS_identitysvc 293 + /* 294 */ + 
/* 295 */ +#define SYS_load_shared_file 296 +#define SYS_reset_shared_file 297 +#define SYS_new_system_shared_regions 298 +#define SYS_shared_region_map_file_np 299 +#define SYS_shared_region_make_private_np 300 + /* 301 */ + /* 302 */ + /* 303 */ + /* 304 */ + /* 305 */ + /* 306 */ + /* 307 */ + /* 308 */ + /* 309 */ +#define SYS_getsid 310 +#define SYS_settid_with_pid 311 + /* 312 */ +#define SYS_aio_fsync 313 +#define SYS_aio_return 314 +#define SYS_aio_suspend 315 +#define SYS_aio_cancel 316 +#define SYS_aio_error 317 +#define SYS_aio_read 318 +#define SYS_aio_write 319 +#define SYS_lio_listio 320 + /* 321 */ + /* 322 */ + /* 323 */ +#define SYS_mlockall 324 +#define SYS_munlockall 325 + /* 326 */ +#define SYS_issetugid 327 +#define SYS___pthread_kill 328 +#define SYS_pthread_sigmask 329 +#define SYS_sigwait 330 +#define SYS___disable_threadsignal 331 +#define SYS___pthread_markcancel 332 +#define SYS___pthread_canceled 333 +#define SYS___semwait_signal 334 +#define SYS_utrace 335 + /* 336 */ + /* 337 */ + /* 338 */ + /* 339 */ + /* 340 */ + /* 341 */ + /* 342 */ + /* 343 */ + /* 344 */ + /* 345 */ + /* 346 */ + /* 347 */ + /* 348 */ + /* 349 */ +#define SYS_audit 350 +#define SYS_auditon 351 + /* 352 */ +#define SYS_getauid 353 +#define SYS_setauid 354 +#define SYS_getaudit 355 +#define SYS_setaudit 356 +#define SYS_getaudit_addr 357 +#define SYS_setaudit_addr 358 +#define SYS_auditctl 359 + /* 360 */ + /* 361 */ +#define SYS_kqueue 362 +#define SYS_kevent 363 +#define SYS_lchown 364 + /* 365 */ + /* 366 */ + /* 367 */ + /* 368 */ + /* 369 */ +#define SYS_MAXSYSCALL 370 -#define SYS_kqueue 362 -#define SYS_kevent 363 #endif /* __APPLE_API_PRIVATE */ - +#endif /* !_SYS_SYSCALL_H_ */ diff --git a/bsd/sys/sysctl.h b/bsd/sys/sysctl.h index 241f9c132..e6fb7b1b6 100644 --- a/bsd/sys/sysctl.h +++ b/bsd/sys/sysctl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -64,15 +64,21 @@ /* * These are for the eproc structure defined below. */ +#include <sys/cdefs.h> + #include <sys/appleapiopts.h> #ifndef KERNEL #include <sys/time.h> #include <sys/ucred.h> +#else +#include <libkern/sysctl.h> #endif - -#include <sys/vm.h> #include <sys/proc.h> +#include <sys/vm.h> + +#ifdef BSD_KERNEL_PRIVATE #include <sys/linker_set.h> +#endif /* * Definitions for sysctl call. 
The sysctl call uses a hierarchical name @@ -125,7 +131,6 @@ struct ctlname { #define OID_AUTO (-1) #ifdef KERNEL -#ifdef __APPLE_API_UNSTABLE #define SYSCTL_HANDLER_ARGS (struct sysctl_oid *oidp, void *arg1, int arg2, \ struct sysctl_req *req) @@ -135,15 +140,15 @@ struct ctlname { */ struct sysctl_req { struct proc *p; - int lock; - void *oldptr; + int lock; + user_addr_t oldptr; size_t oldlen; size_t oldidx; - int (*oldfunc)(struct sysctl_req *, const void *, size_t); - void *newptr; + int (*oldfunc)(struct sysctl_req *, const void *, size_t); + user_addr_t newptr; size_t newlen; size_t newidx; - int (*newfunc)(struct sysctl_req *, void *, size_t); + int (*newfunc)(struct sysctl_req *, void *, size_t); }; SLIST_HEAD(sysctl_oid_list, sysctl_oid); @@ -167,6 +172,8 @@ struct sysctl_oid { #define SYSCTL_IN(r, p, l) (r->newfunc)(r, p, l) #define SYSCTL_OUT(r, p, l) (r->oldfunc)(r, p, l) +__BEGIN_DECLS + int sysctl_handle_int SYSCTL_HANDLER_ARGS; int sysctl_handle_long SYSCTL_HANDLER_ARGS; int sysctl_handle_quad SYSCTL_HANDLER_ARGS; @@ -181,6 +188,8 @@ int sysctl_handle_opaque SYSCTL_HANDLER_ARGS; void sysctl_register_oid(struct sysctl_oid *oidp); void sysctl_unregister_oid(struct sysctl_oid *oidp); +__END_DECLS + /* Declare an oid to allow child oids to be added to it. */ #define SYSCTL_DECL(name) \ extern struct sysctl_oid_list sysctl_##name##_children @@ -244,9 +253,30 @@ void sysctl_unregister_oid(struct sysctl_oid *oidp); #define SYSCTL_PROC(parent, nbr, name, access, ptr, arg, handler, fmt, descr) \ SYSCTL_OID(parent, nbr, name, access, \ ptr, arg, handler, fmt, descr) -#endif /* __APPLE_API_UNSTABLE */ + + +extern struct sysctl_oid_list sysctl__children; +SYSCTL_DECL(_kern); +SYSCTL_DECL(_sysctl); +SYSCTL_DECL(_vm); +SYSCTL_DECL(_vfs); +SYSCTL_DECL(_net); +SYSCTL_DECL(_debug); +SYSCTL_DECL(_hw); +SYSCTL_DECL(_machdep); +SYSCTL_DECL(_user); + #endif /* KERNEL */ +#ifdef XNU_KERNEL_PRIVATE +#define SYSCTL_DEF_ENABLED +#else +#ifndef KERNEL +#define SYSCTL_DEF_ENABLED +#endif +#endif + +#ifdef SYSCTL_DEF_ENABLED /* * Top-level identifiers */ @@ -308,21 +338,21 @@ void sysctl_unregister_oid(struct sysctl_oid *oidp); #define KERN_MAXFILESPERPROC 29 /* int: max open files per proc */ #define KERN_MAXPROCPERUID 30 /* int: max processes per uid */ #define KERN_DUMPDEV 31 /* dev_t: device to dump on */ -#define KERN_IPC 32 /* node: anything related to IPC */ -#define KERN_DUMMY 33 /* unused */ -#define KERN_PS_STRINGS 34 /* int: address of PS_STRINGS */ -#define KERN_USRSTACK 35 /* int: address of USRSTACK */ -#define KERN_LOGSIGEXIT 36 /* int: do we log sigexit procs? */ +#define KERN_IPC 32 /* node: anything related to IPC */ +#define KERN_DUMMY 33 /* unused */ +#define KERN_PS_STRINGS 34 /* int: address of PS_STRINGS */ +#define KERN_USRSTACK32 35 /* int: address of USRSTACK */ +#define KERN_LOGSIGEXIT 36 /* int: do we log sigexit procs? */ #define KERN_SYMFILE 37 /* string: kernel symbol filename */ #define KERN_PROCARGS 38 #define KERN_PCSAMPLES 39 /* node: pc sampling */ #define KERN_NETBOOT 40 /* int: are we netbooted? 
1=yes,0=no */ #define KERN_PANICINFO 41 /* node: panic UI information */ -#define KERN_SYSV 42 /* node: panic UI information */ +#define KERN_SYSV 42 /* node: System V IPC information */ #define KERN_AFFINITY 43 /* xxx */ #define KERN_CLASSIC 44 /* xxx */ #define KERN_CLASSICHANDLER 45 /* xxx */ -#define KERN_AIOMAX 46 /* int: max aio requests */ +#define KERN_AIOMAX 46 /* int: max aio requests */ #define KERN_AIOPROCMAX 47 /* int: max aio requests per process */ #define KERN_AIOTHREADS 48 /* int: max aio worker threads */ #ifdef __APPLE_API_UNSTABLE @@ -331,8 +361,20 @@ void sysctl_unregister_oid(struct sysctl_oid *oidp); #define KERN_COREFILE 50 /* string: corefile format string */ #define KERN_COREDUMP 51 /* int: whether to coredump at all */ #define KERN_SUGID_COREDUMP 52 /* int: whether to dump SUGID cores */ -#define KERN_MAXID 53 /* number of valid kern ids */ - +#define KERN_PROCDELAYTERM 53 /* int: set/reset current proc for delayed termination during shutdown */ +#define KERN_SHREG_PRIVATIZABLE 54 /* int: can shared regions be privatized? */ +#define KERN_PROC_LOW_PRI_IO 55 /* int: set/reset current proc for low priority I/O */ +#define KERN_LOW_PRI_WINDOW 56 /* int: set/reset throttle window - milliseconds */ +#define KERN_LOW_PRI_DELAY 57 /* int: set/reset throttle delay - milliseconds */ +#define KERN_POSIX 58 /* node: posix tunables */ +#define KERN_USRSTACK64 59 /* LP64 user stack query */ +#define KERN_MAXID 60 /* number of valid kern ids */ + +#if defined(__LP64__) +#define KERN_USRSTACK KERN_USRSTACK64 +#else +#define KERN_USRSTACK KERN_USRSTACK32 +#endif /* KERN_KDEBUG types */ #define KERN_KDEFLAGS 1 @@ -364,8 +406,7 @@ void sysctl_unregister_oid(struct sysctl_oid *oidp); /* KERN_PANICINFO types */ #define KERN_PANICINFO_MAXSIZE 1 /* quad: panic UI image size limit */ -#define KERN_PANICINFO_IMAGE16 2 /* string: path to the panic UI (16 bit) */ -#define KERN_PANICINFO_IMAGE32 3 /* string: path to the panic UI (32 bit) */ +#define KERN_PANICINFO_IMAGE 2 /* panic UI in 8-bit kraw format */ /* * KERN_SYSV identifiers @@ -417,7 +458,7 @@ void sysctl_unregister_oid(struct sysctl_oid *oidp); { "dumpdev", CTLTYPE_STRUCT }, /* we lie; don't print as int */ \ { "ipc", CTLTYPE_NODE }, \ { "dummy", CTLTYPE_INT }, \ - { "ps_strings", CTLTYPE_INT }, \ + { "dummy", CTLTYPE_INT }, \ { "usrstack", CTLTYPE_INT }, \ { "logsigexit", CTLTYPE_INT }, \ { "symfile",CTLTYPE_STRING },\ @@ -435,7 +476,13 @@ void sysctl_unregister_oid(struct sysctl_oid *oidp); { "procargs2",CTLTYPE_STRUCT }, \ { "corefile",CTLTYPE_STRING }, \ { "coredump", CTLTYPE_INT }, \ - { "sugid_coredump", CTLTYPE_INT } \ + { "sugid_coredump", CTLTYPE_INT }, \ + { "delayterm", CTLTYPE_INT }, \ + { "shreg_private", CTLTYPE_INT }, \ + { "proc_low_pri_io", CTLTYPE_INT }, \ + { "low_pri_window", CTLTYPE_INT }, \ + { "low_pri_delay", CTLTYPE_INT }, \ + { "posix", CTLTYPE_NODE } \ } /* @@ -460,13 +507,31 @@ void sysctl_unregister_oid(struct sysctl_oid *oidp); * KERN_PROC subtype ops return arrays of augmented proc structures: */ #ifdef __APPLE_API_UNSTABLE + +struct _pcred { + char pc_lock[72]; /* opaque content */ + struct ucred *pc_ucred; /* Current credentials. */ + uid_t p_ruid; /* Real user id. */ + uid_t p_svuid; /* Saved effective user id. */ + gid_t p_rgid; /* Real group id. */ + gid_t p_svgid; /* Saved effective group id. */ + int p_refcnt; /* Number of references.
*/ +}; + +struct _ucred { + int32_t cr_ref; /* reference count */ + uid_t cr_uid; /* effective user id */ + short cr_ngroups; /* number of groups */ + gid_t cr_groups[NGROUPS]; /* groups */ +}; + struct kinfo_proc { struct extern_proc kp_proc; /* proc structure */ struct eproc { struct proc *e_paddr; /* address of proc */ struct session *e_sess; /* session pointer */ - struct pcred e_pcred; /* process credentials */ - struct ucred e_ucred; /* current credentials */ + struct _pcred e_pcred; /* process credentials */ + struct _ucred e_ucred; /* current credentials */ struct vmspace e_vm; /* address space */ pid_t e_ppid; /* parent process id */ pid_t e_pgid; /* process group id */ @@ -480,14 +545,74 @@ struct kinfo_proc { short e_xrssize; /* text rss */ short e_xccount; /* text references */ short e_xswrss; - long e_flag; + int32_t e_flag; #define EPROC_CTTY 0x01 /* controlling tty vnode active */ #define EPROC_SLEADER 0x02 /* session leader */ #define COMAPT_MAXLOGNAME 12 char e_login[COMAPT_MAXLOGNAME]; /* short setlogin() name */ - long e_spare[4]; + int32_t e_spare[4]; } kp_eproc; }; + +#ifdef BSD_KERNEL_PRIVATE +#include <sys/proc_internal.h> + +// LP64todo - should this move? + +/* LP64 version of _pcred. all pointers + * grow when we're dealing with a 64-bit process. + * WARNING - keep in sync with _pcred + */ + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_pcred { + char pc_lock[72]; /* opaque content */ + user_addr_t pc_ucred; /* Current credentials. */ + uid_t p_ruid; /* Real user id. */ + uid_t p_svuid; /* Saved effective user id. */ + gid_t p_rgid; /* Real group id. */ + gid_t p_svgid; /* Saved effective group id. */ + int p_refcnt; /* Number of references. */ +}; + +/* LP64 version of kinfo_proc. all pointers + * grow when we're dealing with a 64-bit process. + * WARNING - keep in sync with kinfo_proc + */ +struct user_kinfo_proc { + struct user_extern_proc kp_proc; /* proc structure */ + struct user_eproc { + user_addr_t e_paddr; /* address of proc */ + user_addr_t e_sess; /* session pointer */ + struct user_pcred e_pcred; /* process credentials */ + struct _ucred e_ucred; /* current credentials */ + struct user_vmspace e_vm; /* address space */ + pid_t e_ppid; /* parent process id */ + pid_t e_pgid; /* process group id */ + short e_jobc; /* job control counter */ + dev_t e_tdev; /* controlling tty dev */ + pid_t e_tpgid; /* tty process group id */ + user_addr_t e_tsess; /* tty session pointer */ + char e_wmesg[WMESGLEN+1]; /* wchan message */ + segsz_t e_xsize; /* text size */ + short e_xrssize; /* text rss */ + short e_xccount; /* text references */ + short e_xswrss; + int32_t e_flag; + char e_login[COMAPT_MAXLOGNAME]; /* short setlogin() name */ + int32_t e_spare[4]; + } kp_eproc; }; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +#endif /* BSD_KERNEL_PRIVATE */ + #endif /* __APPLE_API_UNSTABLE */ /* @@ -508,15 +633,60 @@ struct kinfo_proc { */ #define VM_METER 1 /* struct vmmeter */ #define VM_LOADAVG 2 /* struct loadavg */ -#define VM_MAXID 3 /* number of valid vm ids */ +/* + * Note: "3" was skipped some time ago and should probably remain unused + * to avoid any new entry from being accepted by older kernels...
+ */ #define VM_MACHFACTOR 4 /* struct loadavg with mach factor */ +#define VM_SWAPUSAGE 5 /* total swap usage */ +#define VM_MAXID 6 /* number of valid vm ids */ #define CTL_VM_NAMES { \ { 0, 0 }, \ { "vmmeter", CTLTYPE_STRUCT }, \ - { "loadavg", CTLTYPE_STRUCT } \ + { "loadavg", CTLTYPE_STRUCT }, \ + { 0, 0 }, /* placeholder for "3" (see comment above) */ \ + { "machfactor", CTLTYPE_STRUCT }, \ + { "swapusage", CTLTYPE_STRUCT } \ } +struct xsw_usage { + u_int64_t xsu_total; + u_int64_t xsu_avail; + u_int64_t xsu_used; + u_int32_t xsu_pagesize; + boolean_t xsu_encrypted; +}; + +#ifdef __APPLE_API_PRIVATE +/* Load average structure. Use of fixpt_t assumes <sys/types.h> in scope. */ +/* XXX perhaps we should protect fixpt_t, and define it here (or discard it) */ +struct loadavg { + fixpt_t ldavg[3]; + long fscale; +}; +extern struct loadavg averunnable; +#define LSCALE 1000 /* scaling for "fixed point" arithmetic */ + +// LP64todo - should this move? +#ifdef BSD_KERNEL_PRIVATE + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif +struct user_loadavg { + fixpt_t ldavg[3]; + user_long_t fscale; +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +#endif /* BSD_KERNEL_PRIVATE */ +#endif /* __APPLE_API_PRIVATE */ + + /* * CTL_HW identifiers */ @@ -577,18 +747,24 @@ struct kinfo_proc { } /* - * These are the support HW selectors for sysctlbyname. Parameters that are byte count or frequencies are 64 bit numbers. + * These are the support HW selectors for sysctlbyname. Parameters that are byte counts or frequencies are 64 bit numbers. * All other parameters are 32 bit numbers. * * hw.memsize - The number of bytes of physical memory in the system. * - * hw.ncpu - The number maximum number of processor that could be available this boot. + * hw.ncpu - The maximum number of processors that could be available this boot. * Use this value for sizing of static per processor arrays; i.e. processor load statistics. * - * hw.activecpu - The number of cpus currently available for executing threads. + * hw.activecpu - The number of processors currently available for executing threads. * Use this number to determine the number of threads to create in SMP aware applications. * This number can change when power management modes are changed. - * + * + * hw.physicalcpu - The number of physical processors available in the current power management mode. + * hw.physicalcpu_max - The maximum number of physical processors that could be available this boot. + * + * hw.logicalcpu - The number of logical processors available in the current power management mode. + * hw.logicalcpu_max - The maximum number of logical processors that could be available this boot. + * * hw.tbfrequency - This gives the time base frequency used by the OS and is the basis of all timing services. * In general it is better to use mach's or higher level timing services, but this value * is needed to convert the PPC Time Base registers to real time. @@ -605,6 +781,9 @@ struct kinfo_proc { * hw.cpusubtype - These values should be used to determine what processor family the running cpu is from so that * the best binary can be chosen, or the best dynamic code generated. They should not be used * to determine if a given processor feature is available. + * hw.cputhreadtype - This value will be present if the processor supports threads. Like hw.cpusubtype this selector + * should not be used to infer features, and only used to name the processor's thread architecture.
+ * The values are defined in <mach/machine.h> * * hw.byteorder - Gives the byte order of the processor. 4321 for big endian, 1234 for little. * @@ -699,20 +878,6 @@ struct kinfo_proc { #define CTL_DEBUG_MAXID 20 #ifdef KERNEL -#ifdef __APPLE_API_UNSTABLE - -extern struct sysctl_oid_list sysctl__children; -SYSCTL_DECL(_kern); -SYSCTL_DECL(_sysctl); -SYSCTL_DECL(_vm); -SYSCTL_DECL(_vfs); -SYSCTL_DECL(_net); -SYSCTL_DECL(_debug); -SYSCTL_DECL(_hw); -SYSCTL_DECL(_machdep); -SYSCTL_DECL(_user); - - #ifdef DEBUG /* * CTL_DEBUG variables. @@ -736,6 +901,7 @@ extern struct ctldebug debug10, debug11, debug12, debug13, debug14; extern struct ctldebug debug15, debug16, debug17, debug18, debug19; #endif /* DEBUG */ +#ifdef BSD_KERNEL_PRIVATE extern char machine[]; extern char osrelease[]; extern char ostype[]; @@ -747,18 +913,10 @@ void sysctl_unregister_set(struct linker_set *lsp); void sysctl_mib_init(void); int kernel_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldlenp, void *newp, size_t newlen); -int userland_sysctl(struct proc *p, int *name, u_int namelen, void *old, - size_t *oldlenp, int inkernel, void *newp, size_t newlen, +int userland_sysctl(struct proc *p, int *name, u_int namelen, user_addr_t old, + size_t *oldlenp, int inkernel, user_addr_t newp, size_t newlen, size_t *retval); -/* - * Sysctl handling within the kernel. - * - * May be called with either or no funnel held; will take and - * switch funnels as required. - */ -int sysctlbyname __P((const char *, void *, size_t *, void *, size_t)); - /* * Internal sysctl function calling convention: * @@ -769,24 +927,30 @@ int sysctlbyname __P((const char *, void *, size_t *, void *, size_t)); * the name. */ typedef int (sysctlfn) - __P((int *, u_int, void *, size_t *, void *, size_t, struct proc *)); - -int sysctl_int __P((void *, size_t *, void *, size_t, int *)); -int sysctl_rdint __P((void *, size_t *, void *, int)); -int sysctl_quad __P((void *, size_t *, void *, size_t, quad_t *)); -int sysctl_rdquad __P((void *, size_t *, void *, quad_t)); -int sysctl_string __P((void *, size_t *, void *, size_t, char *, int)); -int sysctl_rdstring __P((void *, size_t *, void *, char *)); -int sysctl_rdstruct __P((void *, size_t *, void *, void *, int)); - -#endif /* __APPLE_API_UNSTABLE */ + (int *, u_int, user_addr_t, size_t *, user_addr_t, size_t, struct proc *); + +int sysctl_int(user_addr_t, size_t *, user_addr_t, size_t, int *); +int sysctl_rdint(user_addr_t, size_t *, user_addr_t, int); +int sysctl_quad(user_addr_t, size_t *, user_addr_t, size_t, quad_t *); +int sysctl_rdquad(void *, size_t *, void *, quad_t); +int sysctl_string(user_addr_t, size_t *, user_addr_t, size_t, char *, int); +int sysctl_trstring(user_addr_t, size_t *, user_addr_t, size_t, char *, int); +int sysctl_rdstring(user_addr_t, size_t *, user_addr_t, char *); +int sysctl_rdstruct(user_addr_t, size_t *, user_addr_t, void *, int); + +#endif /* BSD_KERNEL_PRIVATE */ #else /* !KERNEL */ -#include <sys/cdefs.h> __BEGIN_DECLS -int sysctl __P((int *, u_int, void *, size_t *, void *, size_t)); -int sysctlbyname __P((const char *, void *, size_t *, void *, size_t)); -int sysctlnametomib __P((const char *, int *, size_t *)); +int sysctl(int *, u_int, void *, size_t *, void *, size_t); +int sysctlbyname(const char *, void *, size_t *, void *, size_t); +int sysctlnametomib(const char *, int *, size_t *); __END_DECLS + #endif /* KERNEL */ + + +#endif /* SYSCTL_DEF_ENABLED */ + + #endif /* !_SYS_SYSCTL_H_ */ diff --git a/bsd/sys/sysent.h b/bsd/sys/sysent.h new file 
mode 100644
index 000000000..b8d73190d
--- /dev/null
+++ b/bsd/sys/sysent.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2004 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+#ifndef _SYS_SYSENT_H_
+#define _SYS_SYSENT_H_
+
+#include <sys/appleapiopts.h>
+#include <sys/cdefs.h>
+#ifdef __ppc__
+#include <sys/types.h>
+#endif
+
+#ifdef KERNEL_PRIVATE
+#ifdef __APPLE_API_PRIVATE
+
+typedef int32_t sy_call_t(struct proc *, void *, int *);
+typedef void	sy_munge_t(const void *, void *);
+
+extern struct sysent {		/* system call table */
+	int16_t		sy_narg;	/* number of args */
+	int8_t		sy_cancel;	/* cancellation type */
+	int8_t		sy_funnel;	/* funnel type */
+	sy_call_t	*sy_call;	/* implementing function */
+	sy_munge_t	*sy_arg_munge32; /* system call arguments munger for 32-bit process */
+	sy_munge_t	*sy_arg_munge64; /* system call arguments munger for 64-bit process */
+	int32_t		sy_return_type;	/* system call return types */
+} sysent[];
+
+/* sy_funnel flags bits */
+#define FUNNEL_MASK	0x00ff
+#define UNSAFE_64BIT	0x0100
+
+/*
+ * Valid values for sy_cancel
+ */
+#define _SYSCALL_CANCEL_NONE	0	/* Not a cancellation point */
+#define _SYSCALL_CANCEL_PRE	1	/* Can be cancelled on entry itself */
+#define _SYSCALL_CANCEL_POST	2	/* Can only be cancelled after syscall is run */
+
+/*
+ * Valid values for sy_return_type
+ */
+#define _SYSCALL_RET_NONE	0
+#define _SYSCALL_RET_INT_T	1
+#define _SYSCALL_RET_UINT_T	2
+#define _SYSCALL_RET_OFF_T	3
+#define _SYSCALL_RET_ADDR_T	4
+#define _SYSCALL_RET_SIZE_T	5
+#define _SYSCALL_RET_SSIZE_T	6
+
+extern int nsysent;
+
+#endif /* __APPLE_API_PRIVATE */
+#endif /* KERNEL_PRIVATE */
+
+#endif /* !_SYS_SYSENT_H_ */
diff --git a/bsd/sys/syslimits.h b/bsd/sys/syslimits.h
index dcf4cf403..187fd0791 100644
--- a/bsd/sys/syslimits.h
+++ b/bsd/sys/syslimits.h
@@ -77,10 +77,10 @@
 #define PATH_MAX		 1024	/* max bytes in pathname */
 #define PIPE_BUF		  512	/* max bytes for atomic pipe writes */
-#define BC_BASE_MAX	      INT_MAX	/* max ibase/obase values in bc(1) */
-#define BC_DIM_MAX		65535	/* max array elements in bc(1) */
-#define BC_SCALE_MAX	      INT_MAX	/* max scale value in bc(1) */
-#define BC_STRING_MAX	      INT_MAX	/* max const string length in bc(1) */
+#define BC_BASE_MAX		   99	/* max ibase/obase values in bc(1) */
+#define BC_DIM_MAX		 2048	/* max array elements in bc(1) */
+#define BC_SCALE_MAX		   99	/* max scale value in bc(1) */
+#define BC_STRING_MAX		 1000	/* max const string length in bc(1) */
 #define COLL_WEIGHTS_MAX	    2	/* max weights for order keyword */
 #define EQUIV_CLASS_MAX		    2
 #define EXPR_NEST_MAX		   32	/* max expressions nested in expr(1) */
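
The sysent table added above drives syscall dispatch: sy_narg gives the argument count, sy_call the handler, the two mungers rewrite the saved user registers into the handler's argument structure, and sy_return_type tells the trap code how to copy the result back out. A stand-alone sketch of the sy_call convention follows; every name in it is a toy stand-in, not part of this patch (the real dispatch path is bsd/dev/ppc/systemcalls.c, which also runs the munger and honors sy_cancel and sy_funnel):

/* Toy model of dispatch through a sysent-style table. */
#include <stdint.h>
#include <stdio.h>

struct proc;                                    /* opaque in this sketch */
typedef int32_t sy_call_t(struct proc *, void *, int *);

struct toy_getpid_args { int32_t dummy; };

static int32_t toy_getpid(struct proc *p, void *uap, int *retval)
{
	(void)p; (void)uap;
	*retval = 42;                           /* pretend pid */
	return 0;                               /* 0 = success, else an errno */
}

static struct toy_sysent {
	int16_t sy_narg;
	sy_call_t *sy_call;
} toy_table[] = {
	{ 0, toy_getpid },                      /* slot 0 stands in for getpid */
};

int main(void)
{
	struct toy_getpid_args uap = { 0 };
	int rv = 0;
	int err = toy_table[0].sy_call(NULL, &uap, &rv);
	printf("err=%d retval=%d\n", err, rv);
	return 0;
}

diff --git a/bsd/sys/syslog.h b/bsd/sys/syslog.h
index aa0564e6e..4b8618692 100644
---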
a/bsd/sys/syslog.h +++ b/bsd/sys/syslog.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -59,6 +59,7 @@ #define _SYS_SYSLOG_H_ #include <sys/appleapiopts.h> +#include <sys/cdefs.h> #define _PATH_LOG "/var/run/syslog" @@ -95,40 +96,40 @@ typedef struct _code { } CODE; CODE prioritynames[] = { - "alert", LOG_ALERT, - "crit", LOG_CRIT, - "debug", LOG_DEBUG, - "emerg", LOG_EMERG, - "err", LOG_ERR, - "error", LOG_ERR, /* DEPRECATED */ - "info", LOG_INFO, - "none", INTERNAL_NOPRI, /* INTERNAL */ - "notice", LOG_NOTICE, - "panic", LOG_EMERG, /* DEPRECATED */ - "warn", LOG_WARNING, /* DEPRECATED */ - "warning", LOG_WARNING, - NULL, -1, + { "alert", LOG_ALERT }, + { "crit", LOG_CRIT }, + { "debug", LOG_DEBUG }, + { "emerg", LOG_EMERG }, + { "err", LOG_ERR }, + { "error", LOG_ERR }, /* DEPRECATED */ + { "info", LOG_INFO }, + { "none", INTERNAL_NOPRI }, /* INTERNAL */ + { "notice", LOG_NOTICE }, + { "panic", LOG_EMERG }, /* DEPRECATED */ + { "warn", LOG_WARNING }, /* DEPRECATED */ + { "warning", LOG_WARNING }, + { 0, -1 } }; #endif /* facility codes */ -#define LOG_KERN (0<<3) /* kernel messages */ -#define LOG_USER (1<<3) /* random user-level messages */ -#define LOG_MAIL (2<<3) /* mail system */ -#define LOG_DAEMON (3<<3) /* system daemons */ -#define LOG_AUTH (4<<3) /* security/authorization messages */ -#define LOG_SYSLOG (5<<3) /* messages generated internally by syslogd */ -#define LOG_LPR (6<<3) /* line printer subsystem */ -#define LOG_NEWS (7<<3) /* network news subsystem */ -#define LOG_UUCP (8<<3) /* UUCP subsystem */ -#define LOG_CRON (9<<3) /* clock daemon */ -#define LOG_AUTHPRIV (10<<3) /* security/authorization messages (private) */ -#define LOG_FTP (11<<3) /* ftp daemon */ -#define LOG_NETINFO (12<<3) /* NetInfo */ +#define LOG_KERN (0<<3) /* kernel messages */ +#define LOG_USER (1<<3) /* random user-level messages */ +#define LOG_MAIL (2<<3) /* mail system */ +#define LOG_DAEMON (3<<3) /* system daemons */ +#define LOG_AUTH (4<<3) /* security/authorization messages */ +#define LOG_SYSLOG (5<<3) /* messages generated internally by syslogd */ +#define LOG_LPR (6<<3) /* line printer subsystem */ +#define LOG_NEWS (7<<3) /* network news subsystem */ +#define LOG_UUCP (8<<3) /* UUCP subsystem */ +#define LOG_CRON (9<<3) /* clock daemon */ +#define LOG_AUTHPRIV (10<<3) /* security/authorization messages (private) */ +#define LOG_FTP (11<<3) /* ftp daemon */ +#define LOG_NETINFO (12<<3) /* NetInfo */ #define LOG_REMOTEAUTH (13<<3) /* remote authentication/authorization */ -#define LOG_INSTALL (14<<3) /* installer subsystem */ +#define LOG_INSTALL (14<<3) /* installer subsystem */ +#define LOG_RAS (15<<3) /* Remote Access Service (VPN / PPP) */ - /* other codes through 15 reserved for system use */ #define LOG_LOCAL0 (16<<3) /* reserved for local use */ #define LOG_LOCAL1 (17<<3) /* reserved for local use */ #define LOG_LOCAL2 (18<<3) /* reserved for local use */ @@ -138,39 +139,43 @@ CODE prioritynames[] = { #define LOG_LOCAL6 (22<<3) /* reserved for local use */ #define LOG_LOCAL7 (23<<3) /* reserved for local use */ -#define LOG_NFACILITIES 24 /* current number of facilities */ +#define LOG_LAUNCHD (24<<3) /* launchd - general bootstrap daemon */ + +#define LOG_NFACILITIES 25 /* current number of facilities */ #define LOG_FACMASK 0x03f8 /* mask to extract facility part */ /* facility of pri */ #define LOG_FAC(p) (((p) & LOG_FACMASK) >> 3) 
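
Since the facility macros above are pre-shifted (LOG_DAEMON is 3<<3), a complete priority value is simply facility | level, and LOG_FAC() recovers the facility index again. A minimal user-space sketch, assuming the standard LOG_PRIMASK (0x07) definition from the part of this header not shown in the hunk:

/* Compose a priority from facility and level, then take it apart. */
#include <stdio.h>
#include <syslog.h>

int main(void)
{
	int pri = LOG_DAEMON | LOG_WARNING;     /* (3<<3) | 4 == 28 */

	openlog("example", LOG_PID, LOG_DAEMON);
	syslog(LOG_WARNING, "disk usage at %d%%", 91);
	closelog();

	/* LOG_FAC() masks with LOG_FACMASK and shifts down by 3. */
	printf("facility index = %d, level = %d\n",
	       LOG_FAC(pri), pri & LOG_PRIMASK);
	return 0;
}
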
#ifdef SYSLOG_NAMES CODE facilitynames[] = { - "auth", LOG_AUTH, - "authpriv", LOG_AUTHPRIV, - "cron", LOG_CRON, - "daemon", LOG_DAEMON, - "ftp", LOG_FTP, - "install", LOG_INSTALL, - "kern", LOG_KERN, - "lpr", LOG_LPR, - "mail", LOG_MAIL, - "mark", INTERNAL_MARK, /* INTERNAL */ - "netinfo", LOG_NETINFO, - "remoteauth", LOG_REMOTEAUTH, - "news", LOG_NEWS, - "security", LOG_AUTH, /* DEPRECATED */ - "syslog", LOG_SYSLOG, - "user", LOG_USER, - "uucp", LOG_UUCP, - "local0", LOG_LOCAL0, - "local1", LOG_LOCAL1, - "local2", LOG_LOCAL2, - "local3", LOG_LOCAL3, - "local4", LOG_LOCAL4, - "local5", LOG_LOCAL5, - "local6", LOG_LOCAL6, - "local7", LOG_LOCAL7, - NULL, -1, + { "auth", LOG_AUTH }, + { "authpriv", LOG_AUTHPRIV }, + { "cron", LOG_CRON }, + { "daemon", LOG_DAEMON }, + { "ftp", LOG_FTP }, + { "install", LOG_INSTALL }, + { "kern", LOG_KERN }, + { "lpr", LOG_LPR }, + { "mail", LOG_MAIL }, + { "mark", INTERNAL_MARK }, /* INTERNAL */ + { "netinfo", LOG_NETINFO }, + { "ras", LOG_RAS }, + { "remoteauth", LOG_REMOTEAUTH }, + { "news", LOG_NEWS }, + { "security", LOG_AUTH }, /* DEPRECATED */ + { "syslog", LOG_SYSLOG }, + { "user", LOG_USER }, + { "uucp", LOG_UUCP }, + { "local0", LOG_LOCAL0 }, + { "local1", LOG_LOCAL1 }, + { "local2", LOG_LOCAL2 }, + { "local3", LOG_LOCAL3 }, + { "local4", LOG_LOCAL4 }, + { "local5", LOG_LOCAL5 }, + { "local6", LOG_LOCAL6 }, + { "local7", LOG_LOCAL7 }, + { "launchd", LOG_LAUNCHD }, + { 0, -1 } }; #endif @@ -199,25 +204,19 @@ CODE facilitynames[] = { #define LOG_NOWAIT 0x10 /* don't wait for console forks: DEPRECATED */ #define LOG_PERROR 0x20 /* log to stderr as well */ -#include <sys/cdefs.h> - #ifndef KERNEL - -/* - * Don't use va_list in the vsyslog() prototype. Va_list is typedef'd in two - * places (<machine/varargs.h> and <machine/stdarg.h>), so if we include one - * of them here we may collide with the utility's includes. It's unreasonable - * for utilities to have to include one of them to include syslog.h, so we get - * _BSD_VA_LIST_ from <machine/ansi.h> and use it. - */ -#include <machine/ansi.h> +#ifndef _POSIX_C_SOURCE +#include <sys/_types.h> /* for __darwin_va_list */ +#endif /* _POSIX_C_SOURCE */ __BEGIN_DECLS -void closelog __P((void)); -void openlog __P((const char *, int, int)); -int setlogmask __P((int)); -void syslog __P((int, const char *, ...)); -void vsyslog __P((int, const char *, _BSD_VA_LIST_)); +void closelog(void); +void openlog(const char *, int, int); +int setlogmask(int); +void syslog(int, const char *, ...) __DARWIN_LDBL_COMPAT(syslog); +#ifndef _POSIX_C_SOURCE +void vsyslog(int, const char *, __darwin_va_list) __DARWIN_LDBL_COMPAT(vsyslog); +#endif /* _POSIX_C_SOURCE */ __END_DECLS #else /* !KERNEL */ @@ -303,9 +302,11 @@ struct reg_desc { #endif /* __APPLE_API_OBSOLETE */ -void logpri __P((int)); -void log __P((int, const char *, ...)); -void addlog __P((const char *, ...)); +__BEGIN_DECLS +void logpri(int); +void log(int, const char *, ...); +void addlog(const char *, ...); +__END_DECLS #endif /* !KERNEL */ #endif /* !_SYS_SYSLOG_H_ */ diff --git a/bsd/sys/sysproto.h b/bsd/sys/sysproto.h new file mode 100644 index 000000000..9bc5c5f86 --- /dev/null +++ b/bsd/sys/sysproto.h @@ -0,0 +1,1610 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. 
Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + * + * + * System call switch table. + * + * DO NOT EDIT-- this file is automatically generated. + * created from syscalls.master + */ + +#ifndef _SYS_SYSPROTO_H_ +#define _SYS_SYSPROTO_H_ + +#include <sys/appleapiopts.h> +#include <sys/cdefs.h> +#include <sys/mount_internal.h> +#include <sys/types.h> +#include <sys/sem_internal.h> +#include <sys/semaphore.h> +#include <sys/wait.h> +#include <mach/shared_memory_server.h> + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE +#ifdef __ppc__ +#define PAD_(t) (sizeof(uint64_t) <= sizeof(t) \ + ? 0 : sizeof(uint64_t) - sizeof(t)) +#else +#define PAD_(t) (sizeof(register_t) <= sizeof(t) \ + ? 0 : sizeof(register_t) - sizeof(t)) +#endif +#if BYTE_ORDER == LITTLE_ENDIAN +#define PADL_(t) 0 +#define PADR_(t) PAD_(t) +#else +#define PADL_(t) PAD_(t) +#define PADR_(t) 0 +#endif + +__BEGIN_DECLS +#ifndef __MUNGE_ONCE +#define __MUNGE_ONCE +#ifdef __ppc__ +void munge_w(const void *, void *); +void munge_ww(const void *, void *); +void munge_www(const void *, void *); +void munge_wwww(const void *, void *); +void munge_wwwww(const void *, void *); +void munge_wwwwww(const void *, void *); +void munge_wwwwwww(const void *, void *); +void munge_wwwwwwww(const void *, void *); +void munge_d(const void *, void *); +void munge_dd(const void *, void *); +void munge_ddd(const void *, void *); +void munge_dddd(const void *, void *); +void munge_ddddd(const void *, void *); +void munge_dddddd(const void *, void *); +void munge_ddddddd(const void *, void *); +void munge_dddddddd(const void *, void *); +void munge_wl(const void *, void *); +void munge_wlw(const void *, void *); +void munge_wwwl(const void *, void *); +void munge_wwwwl(const void *, void *); +void munge_wwwwwl(const void *, void *); +void munge_wsw(const void *, void *); +void munge_wws(const void *, void *); +void munge_wwwsw(const void *, void *); +#else +#define munge_w NULL +#define munge_ww NULL +#define munge_www NULL +#define munge_wwww NULL +#define munge_wwwww NULL +#define munge_wwwwww NULL +#define munge_wwwwwww NULL +#define munge_wwwwwwww NULL +#define munge_d NULL +#define munge_dd NULL +#define munge_ddd NULL +#define munge_dddd NULL +#define munge_ddddd NULL +#define munge_dddddd NULL +#define munge_ddddddd NULL +#define munge_dddddddd NULL +#define munge_wl NULL +#define munge_wlw NULL +#define munge_wwwl NULL +#define munge_wwwwl NULL +#define munge_wwwwwl NULL +#define munge_wsw NULL +#define munge_wws NULL +#define munge_wwwsw NULL +#endif // __ppc__ +#endif /* !__MUNGE_ONCE */ + +struct nosys_args { + register_t dummy; +}; +struct exit_args { + char rval_l_[PADL_(int)]; int rval; char rval_r_[PADR_(int)]; +}; +struct fork_args { + register_t dummy; +}; +struct read_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char cbuf_l_[PADL_(user_addr_t)]; user_addr_t cbuf; char cbuf_r_[PADR_(user_addr_t)]; + char nbyte_l_[PADL_(user_size_t)]; user_size_t nbyte; char 
nbyte_r_[PADR_(user_size_t)]; +}; +struct write_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char cbuf_l_[PADL_(user_addr_t)]; user_addr_t cbuf; char cbuf_r_[PADR_(user_addr_t)]; + char nbyte_l_[PADL_(user_size_t)]; user_size_t nbyte; char nbyte_r_[PADR_(user_size_t)]; +}; +struct open_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; + char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; +}; +struct close_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; +}; +struct wait4_args { + char pid_l_[PADL_(int)]; int pid; char pid_r_[PADR_(int)]; + char status_l_[PADL_(user_addr_t)]; user_addr_t status; char status_r_[PADR_(user_addr_t)]; + char options_l_[PADL_(int)]; int options; char options_r_[PADR_(int)]; + char rusage_l_[PADL_(user_addr_t)]; user_addr_t rusage; char rusage_r_[PADR_(user_addr_t)]; +}; +struct link_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char link_l_[PADL_(user_addr_t)]; user_addr_t link; char link_r_[PADR_(user_addr_t)]; +}; +struct unlink_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; +}; +struct chdir_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; +}; +struct fchdir_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; +}; +struct mknod_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; + char dev_l_[PADL_(int)]; int dev; char dev_r_[PADR_(int)]; +}; +struct chmod_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; +}; +struct chown_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char uid_l_[PADL_(int)]; int uid; char uid_r_[PADR_(int)]; + char gid_l_[PADL_(int)]; int gid; char gid_r_[PADR_(int)]; +}; +struct obreak_args { + char nsize_l_[PADL_(char *)]; char * nsize; char nsize_r_[PADR_(char *)]; +}; +#if COMPAT_GETFSSTAT +struct ogetfsstat_args { + char buf_l_[PADL_(user_addr_t)]; user_addr_t buf; char buf_r_[PADR_(user_addr_t)]; + char bufsize_l_[PADL_(int)]; int bufsize; char bufsize_r_[PADR_(int)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; +}; +#else +struct getfsstat_args { + char buf_l_[PADL_(user_addr_t)]; user_addr_t buf; char buf_r_[PADR_(user_addr_t)]; + char bufsize_l_[PADL_(int)]; int bufsize; char bufsize_r_[PADR_(int)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; +}; +#endif +struct getpid_args { + register_t dummy; +}; +struct setuid_args { + char uid_l_[PADL_(uid_t)]; uid_t uid; char uid_r_[PADR_(uid_t)]; +}; +struct getuid_args { + register_t dummy; +}; +struct geteuid_args { + register_t dummy; +}; +struct ptrace_args { + char req_l_[PADL_(int)]; int req; char req_r_[PADR_(int)]; + char pid_l_[PADL_(pid_t)]; pid_t pid; char pid_r_[PADR_(pid_t)]; + char addr_l_[PADL_(user_addr_t)]; user_addr_t addr; char addr_r_[PADR_(user_addr_t)]; + char data_l_[PADL_(int)]; int data; char data_r_[PADR_(int)]; +}; +struct recvmsg_args { + char s_l_[PADL_(int)]; int s; char s_r_[PADR_(int)]; + char msg_l_[PADL_(user_addr_t)]; user_addr_t msg; char msg_r_[PADR_(user_addr_t)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; +}; 
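
Every generated *_args structure in this file widens each argument to a fixed 64-bit slot: PAD_() computes the pad bytes, and PADL_()/PADR_() place them before or after the value so that it lands in the half of the slot the trap code actually filled on this byte order (pad on the left for big-endian, on the right for little-endian). A stand-alone mock-up of the scheme with a toy struct; the macros are re-derived here so the sketch compiles outside the kernel, and, like the generated header, it relies on the zero-length-array extension:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define PAD_(t)  (sizeof(uint64_t) <= sizeof(t) ? 0 : sizeof(uint64_t) - sizeof(t))
#if defined(__BIG_ENDIAN__)
#define PADL_(t) PAD_(t)  /* value occupies the high-address half of the slot */
#define PADR_(t) 0
#else
#define PADL_(t) 0        /* value occupies the low-address half of the slot */
#define PADR_(t) PAD_(t)
#endif

struct toy_close_args {
	char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
};

int main(void)
{
	/* One 8-byte slot per argument, regardless of byte order. */
	printf("sizeof = %zu, offsetof(fd) = %zu\n",
	       sizeof(struct toy_close_args),
	       offsetof(struct toy_close_args, fd));
	return 0;
}
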
+struct sendmsg_args { + char s_l_[PADL_(int)]; int s; char s_r_[PADR_(int)]; + char msg_l_[PADL_(user_addr_t)]; user_addr_t msg; char msg_r_[PADR_(user_addr_t)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; +}; +struct recvfrom_args { + char s_l_[PADL_(int)]; int s; char s_r_[PADR_(int)]; + char buf_l_[PADL_(user_addr_t)]; user_addr_t buf; char buf_r_[PADR_(user_addr_t)]; + char len_l_[PADL_(user_size_t)]; user_size_t len; char len_r_[PADR_(user_size_t)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; + char from_l_[PADL_(user_addr_t)]; user_addr_t from; char from_r_[PADR_(user_addr_t)]; + char fromlenaddr_l_[PADL_(user_addr_t)]; user_addr_t fromlenaddr; char fromlenaddr_r_[PADR_(user_addr_t)]; +}; +struct accept_args { + char s_l_[PADL_(int)]; int s; char s_r_[PADR_(int)]; + char name_l_[PADL_(user_addr_t)]; user_addr_t name; char name_r_[PADR_(user_addr_t)]; + char anamelen_l_[PADL_(user_addr_t)]; user_addr_t anamelen; char anamelen_r_[PADR_(user_addr_t)]; +}; +struct getpeername_args { + char fdes_l_[PADL_(int)]; int fdes; char fdes_r_[PADR_(int)]; + char asa_l_[PADL_(user_addr_t)]; user_addr_t asa; char asa_r_[PADR_(user_addr_t)]; + char alen_l_[PADL_(user_addr_t)]; user_addr_t alen; char alen_r_[PADR_(user_addr_t)]; +}; +struct getsockname_args { + char fdes_l_[PADL_(int)]; int fdes; char fdes_r_[PADR_(int)]; + char asa_l_[PADL_(user_addr_t)]; user_addr_t asa; char asa_r_[PADR_(user_addr_t)]; + char alen_l_[PADL_(user_addr_t)]; user_addr_t alen; char alen_r_[PADR_(user_addr_t)]; +}; +struct access_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; +}; +struct chflags_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; +}; +struct fchflags_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; +}; +struct sync_args { + register_t dummy; +}; +struct kill_args { + char pid_l_[PADL_(int)]; int pid; char pid_r_[PADR_(int)]; + char signum_l_[PADL_(int)]; int signum; char signum_r_[PADR_(int)]; +}; +struct getppid_args { + register_t dummy; +}; +struct dup_args { + char fd_l_[PADL_(u_int)]; u_int fd; char fd_r_[PADR_(u_int)]; +}; +struct pipe_args { + register_t dummy; +}; +struct getegid_args { + register_t dummy; +}; +struct profil_args { + char bufbase_l_[PADL_(user_addr_t)]; user_addr_t bufbase; char bufbase_r_[PADR_(user_addr_t)]; + char bufsize_l_[PADL_(user_size_t)]; user_size_t bufsize; char bufsize_r_[PADR_(user_size_t)]; + char pcoffset_l_[PADL_(user_ulong_t)]; user_ulong_t pcoffset; char pcoffset_r_[PADR_(user_ulong_t)]; + char pcscale_l_[PADL_(u_int)]; u_int pcscale; char pcscale_r_[PADR_(u_int)]; +}; +struct ktrace_args { + char fname_l_[PADL_(user_addr_t)]; user_addr_t fname; char fname_r_[PADR_(user_addr_t)]; + char ops_l_[PADL_(int)]; int ops; char ops_r_[PADR_(int)]; + char facs_l_[PADL_(int)]; int facs; char facs_r_[PADR_(int)]; + char pid_l_[PADL_(int)]; int pid; char pid_r_[PADR_(int)]; +}; +struct sigaction_args { + char signum_l_[PADL_(int)]; int signum; char signum_r_[PADR_(int)]; + char nsa_l_[PADL_(user_addr_t)]; user_addr_t nsa; char nsa_r_[PADR_(user_addr_t)]; + char osa_l_[PADL_(user_addr_t)]; user_addr_t osa; char osa_r_[PADR_(user_addr_t)]; +}; +struct getgid_args { + register_t dummy; +}; +struct sigprocmask_args { + char 
how_l_[PADL_(int)]; int how; char how_r_[PADR_(int)]; + char mask_l_[PADL_(user_addr_t)]; user_addr_t mask; char mask_r_[PADR_(user_addr_t)]; + char omask_l_[PADL_(user_addr_t)]; user_addr_t omask; char omask_r_[PADR_(user_addr_t)]; +}; +struct getlogin_args { + char namebuf_l_[PADL_(user_addr_t)]; user_addr_t namebuf; char namebuf_r_[PADR_(user_addr_t)]; + char namelen_l_[PADL_(u_int)]; u_int namelen; char namelen_r_[PADR_(u_int)]; +}; +struct setlogin_args { + char namebuf_l_[PADL_(user_addr_t)]; user_addr_t namebuf; char namebuf_r_[PADR_(user_addr_t)]; +}; +struct acct_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; +}; +struct sigpending_args { + char osv_l_[PADL_(user_addr_t)]; user_addr_t osv; char osv_r_[PADR_(user_addr_t)]; +}; +struct sigaltstack_args { + char nss_l_[PADL_(user_addr_t)]; user_addr_t nss; char nss_r_[PADR_(user_addr_t)]; + char oss_l_[PADL_(user_addr_t)]; user_addr_t oss; char oss_r_[PADR_(user_addr_t)]; +}; +struct ioctl_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char com_l_[PADL_(user_ulong_t)]; user_ulong_t com; char com_r_[PADR_(user_ulong_t)]; + char data_l_[PADL_(user_addr_t)]; user_addr_t data; char data_r_[PADR_(user_addr_t)]; +}; +struct reboot_args { + char opt_l_[PADL_(int)]; int opt; char opt_r_[PADR_(int)]; + char command_l_[PADL_(user_addr_t)]; user_addr_t command; char command_r_[PADR_(user_addr_t)]; +}; +struct revoke_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; +}; +struct symlink_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char link_l_[PADL_(user_addr_t)]; user_addr_t link; char link_r_[PADR_(user_addr_t)]; +}; +struct readlink_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char buf_l_[PADL_(user_addr_t)]; user_addr_t buf; char buf_r_[PADR_(user_addr_t)]; + char count_l_[PADL_(int)]; int count; char count_r_[PADR_(int)]; +}; +struct execve_args { + char fname_l_[PADL_(user_addr_t)]; user_addr_t fname; char fname_r_[PADR_(user_addr_t)]; + char argp_l_[PADL_(user_addr_t)]; user_addr_t argp; char argp_r_[PADR_(user_addr_t)]; + char envp_l_[PADL_(user_addr_t)]; user_addr_t envp; char envp_r_[PADR_(user_addr_t)]; +}; +struct umask_args { + char newmask_l_[PADL_(int)]; int newmask; char newmask_r_[PADR_(int)]; +}; +struct chroot_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; +}; +struct msync_args { + char addr_l_[PADL_(user_addr_t)]; user_addr_t addr; char addr_r_[PADR_(user_addr_t)]; + char len_l_[PADL_(user_size_t)]; user_size_t len; char len_r_[PADR_(user_size_t)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; +}; +struct vfork_args { + register_t dummy; +}; +struct sbrk_args { + char incr_l_[PADL_(int)]; int incr; char incr_r_[PADR_(int)]; +}; +struct sstk_args { + char incr_l_[PADL_(int)]; int incr; char incr_r_[PADR_(int)]; +}; +struct ovadvise_args { + register_t dummy; +}; +struct munmap_args { + char addr_l_[PADL_(user_addr_t)]; user_addr_t addr; char addr_r_[PADR_(user_addr_t)]; + char len_l_[PADL_(user_size_t)]; user_size_t len; char len_r_[PADR_(user_size_t)]; +}; +struct mprotect_args { + char addr_l_[PADL_(user_addr_t)]; user_addr_t addr; char addr_r_[PADR_(user_addr_t)]; + char len_l_[PADL_(user_size_t)]; user_size_t len; char len_r_[PADR_(user_size_t)]; + char prot_l_[PADL_(int)]; int prot; char prot_r_[PADR_(int)]; +}; +struct madvise_args { + char 
addr_l_[PADL_(user_addr_t)]; user_addr_t addr; char addr_r_[PADR_(user_addr_t)]; + char len_l_[PADL_(user_size_t)]; user_size_t len; char len_r_[PADR_(user_size_t)]; + char behav_l_[PADL_(int)]; int behav; char behav_r_[PADR_(int)]; +}; +struct mincore_args { + char addr_l_[PADL_(user_addr_t)]; user_addr_t addr; char addr_r_[PADR_(user_addr_t)]; + char len_l_[PADL_(user_size_t)]; user_size_t len; char len_r_[PADR_(user_size_t)]; + char vec_l_[PADL_(user_addr_t)]; user_addr_t vec; char vec_r_[PADR_(user_addr_t)]; +}; +struct getgroups_args { + char gidsetsize_l_[PADL_(u_int)]; u_int gidsetsize; char gidsetsize_r_[PADR_(u_int)]; + char gidset_l_[PADL_(user_addr_t)]; user_addr_t gidset; char gidset_r_[PADR_(user_addr_t)]; +}; +struct setgroups_args { + char gidsetsize_l_[PADL_(u_int)]; u_int gidsetsize; char gidsetsize_r_[PADR_(u_int)]; + char gidset_l_[PADL_(user_addr_t)]; user_addr_t gidset; char gidset_r_[PADR_(user_addr_t)]; +}; +struct getpgrp_args { + register_t dummy; +}; +struct setpgid_args { + char pid_l_[PADL_(int)]; int pid; char pid_r_[PADR_(int)]; + char pgid_l_[PADL_(int)]; int pgid; char pgid_r_[PADR_(int)]; +}; +struct setitimer_args { + char which_l_[PADL_(u_int)]; u_int which; char which_r_[PADR_(u_int)]; + char itv_l_[PADL_(user_addr_t)]; user_addr_t itv; char itv_r_[PADR_(user_addr_t)]; + char oitv_l_[PADL_(user_addr_t)]; user_addr_t oitv; char oitv_r_[PADR_(user_addr_t)]; +}; +struct swapon_args { + register_t dummy; +}; +struct getitimer_args { + char which_l_[PADL_(u_int)]; u_int which; char which_r_[PADR_(u_int)]; + char itv_l_[PADL_(user_addr_t)]; user_addr_t itv; char itv_r_[PADR_(user_addr_t)]; +}; +struct getdtablesize_args { + register_t dummy; +}; +struct dup2_args { + char from_l_[PADL_(u_int)]; u_int from; char from_r_[PADR_(u_int)]; + char to_l_[PADL_(u_int)]; u_int to; char to_r_[PADR_(u_int)]; +}; +struct fcntl_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char cmd_l_[PADL_(int)]; int cmd; char cmd_r_[PADR_(int)]; + char arg_l_[PADL_(user_long_t)]; user_long_t arg; char arg_r_[PADR_(user_long_t)]; +}; +struct select_args { + char nd_l_[PADL_(int)]; int nd; char nd_r_[PADR_(int)]; + char in_l_[PADL_(user_addr_t)]; user_addr_t in; char in_r_[PADR_(user_addr_t)]; + char ou_l_[PADL_(user_addr_t)]; user_addr_t ou; char ou_r_[PADR_(user_addr_t)]; + char ex_l_[PADL_(user_addr_t)]; user_addr_t ex; char ex_r_[PADR_(user_addr_t)]; + char tv_l_[PADL_(user_addr_t)]; user_addr_t tv; char tv_r_[PADR_(user_addr_t)]; +}; +struct fsync_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; +}; +struct setpriority_args { + char which_l_[PADL_(int)]; int which; char which_r_[PADR_(int)]; + char who_l_[PADL_(int)]; int who; char who_r_[PADR_(int)]; + char prio_l_[PADL_(int)]; int prio; char prio_r_[PADR_(int)]; +}; +struct socket_args { + char domain_l_[PADL_(int)]; int domain; char domain_r_[PADR_(int)]; + char type_l_[PADL_(int)]; int type; char type_r_[PADR_(int)]; + char protocol_l_[PADL_(int)]; int protocol; char protocol_r_[PADR_(int)]; +}; +struct connect_args { + char s_l_[PADL_(int)]; int s; char s_r_[PADR_(int)]; + char name_l_[PADL_(user_addr_t)]; user_addr_t name; char name_r_[PADR_(user_addr_t)]; + char namelen_l_[PADL_(socklen_t)]; socklen_t namelen; char namelen_r_[PADR_(socklen_t)]; +}; +struct getpriority_args { + char which_l_[PADL_(int)]; int which; char which_r_[PADR_(int)]; + char who_l_[PADL_(int)]; int who; char who_r_[PADR_(int)]; +}; +#ifdef __ppc__ +#else +struct sigreturn_args { + char sigcntxp_l_[PADL_(struct 
sigcontext *)]; struct sigcontext * sigcntxp; char sigcntxp_r_[PADR_(struct sigcontext *)]; +}; +#endif +struct bind_args { + char s_l_[PADL_(int)]; int s; char s_r_[PADR_(int)]; + char name_l_[PADL_(user_addr_t)]; user_addr_t name; char name_r_[PADR_(user_addr_t)]; + char namelen_l_[PADL_(socklen_t)]; socklen_t namelen; char namelen_r_[PADR_(socklen_t)]; +}; +struct setsockopt_args { + char s_l_[PADL_(int)]; int s; char s_r_[PADR_(int)]; + char level_l_[PADL_(int)]; int level; char level_r_[PADR_(int)]; + char name_l_[PADL_(int)]; int name; char name_r_[PADR_(int)]; + char val_l_[PADL_(user_addr_t)]; user_addr_t val; char val_r_[PADR_(user_addr_t)]; + char valsize_l_[PADL_(socklen_t)]; socklen_t valsize; char valsize_r_[PADR_(socklen_t)]; +}; +struct listen_args { + char s_l_[PADL_(int)]; int s; char s_r_[PADR_(int)]; + char backlog_l_[PADL_(int)]; int backlog; char backlog_r_[PADR_(int)]; +}; +struct sigsuspend_args { + char mask_l_[PADL_(sigset_t)]; sigset_t mask; char mask_r_[PADR_(sigset_t)]; +}; +#ifdef __ppc__ +struct ppc_gettimeofday_args { + char tp_l_[PADL_(user_addr_t)]; user_addr_t tp; char tp_r_[PADR_(user_addr_t)]; + char tzp_l_[PADL_(user_addr_t)]; user_addr_t tzp; char tzp_r_[PADR_(user_addr_t)]; +}; +#else +struct gettimeofday_args { + char tp_l_[PADL_(user_addr_t)]; user_addr_t tp; char tp_r_[PADR_(user_addr_t)]; + char tzp_l_[PADL_(user_addr_t)]; user_addr_t tzp; char tzp_r_[PADR_(user_addr_t)]; +}; +#endif +struct getrusage_args { + char who_l_[PADL_(int)]; int who; char who_r_[PADR_(int)]; + char rusage_l_[PADL_(user_addr_t)]; user_addr_t rusage; char rusage_r_[PADR_(user_addr_t)]; +}; +struct getsockopt_args { + char s_l_[PADL_(int)]; int s; char s_r_[PADR_(int)]; + char level_l_[PADL_(int)]; int level; char level_r_[PADR_(int)]; + char name_l_[PADL_(int)]; int name; char name_r_[PADR_(int)]; + char val_l_[PADL_(user_addr_t)]; user_addr_t val; char val_r_[PADR_(user_addr_t)]; + char avalsize_l_[PADL_(user_addr_t)]; user_addr_t avalsize; char avalsize_r_[PADR_(user_addr_t)]; +}; +struct readv_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char iovp_l_[PADL_(user_addr_t)]; user_addr_t iovp; char iovp_r_[PADR_(user_addr_t)]; + char iovcnt_l_[PADL_(u_int)]; u_int iovcnt; char iovcnt_r_[PADR_(u_int)]; +}; +struct writev_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char iovp_l_[PADL_(user_addr_t)]; user_addr_t iovp; char iovp_r_[PADR_(user_addr_t)]; + char iovcnt_l_[PADL_(u_int)]; u_int iovcnt; char iovcnt_r_[PADR_(u_int)]; +}; +struct settimeofday_args { + char tv_l_[PADL_(user_addr_t)]; user_addr_t tv; char tv_r_[PADR_(user_addr_t)]; + char tzp_l_[PADL_(user_addr_t)]; user_addr_t tzp; char tzp_r_[PADR_(user_addr_t)]; +}; +struct fchown_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char uid_l_[PADL_(int)]; int uid; char uid_r_[PADR_(int)]; + char gid_l_[PADL_(int)]; int gid; char gid_r_[PADR_(int)]; +}; +struct fchmod_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; +}; +struct rename_args { + char from_l_[PADL_(user_addr_t)]; user_addr_t from; char from_r_[PADR_(user_addr_t)]; + char to_l_[PADL_(user_addr_t)]; user_addr_t to; char to_r_[PADR_(user_addr_t)]; +}; +struct flock_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char how_l_[PADL_(int)]; int how; char how_r_[PADR_(int)]; +}; +struct mkfifo_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char 
mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; +}; +struct sendto_args { + char s_l_[PADL_(int)]; int s; char s_r_[PADR_(int)]; + char buf_l_[PADL_(user_addr_t)]; user_addr_t buf; char buf_r_[PADR_(user_addr_t)]; + char len_l_[PADL_(user_size_t)]; user_size_t len; char len_r_[PADR_(user_size_t)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; + char to_l_[PADL_(user_addr_t)]; user_addr_t to; char to_r_[PADR_(user_addr_t)]; + char tolen_l_[PADL_(socklen_t)]; socklen_t tolen; char tolen_r_[PADR_(socklen_t)]; +}; +struct shutdown_args { + char s_l_[PADL_(int)]; int s; char s_r_[PADR_(int)]; + char how_l_[PADL_(int)]; int how; char how_r_[PADR_(int)]; +}; +struct socketpair_args { + char domain_l_[PADL_(int)]; int domain; char domain_r_[PADR_(int)]; + char type_l_[PADL_(int)]; int type; char type_r_[PADR_(int)]; + char protocol_l_[PADL_(int)]; int protocol; char protocol_r_[PADR_(int)]; + char rsv_l_[PADL_(user_addr_t)]; user_addr_t rsv; char rsv_r_[PADR_(user_addr_t)]; +}; +struct mkdir_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; +}; +struct rmdir_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; +}; +struct utimes_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char tptr_l_[PADL_(user_addr_t)]; user_addr_t tptr; char tptr_r_[PADR_(user_addr_t)]; +}; +struct futimes_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char tptr_l_[PADL_(user_addr_t)]; user_addr_t tptr; char tptr_r_[PADR_(user_addr_t)]; +}; +struct adjtime_args { + char delta_l_[PADL_(user_addr_t)]; user_addr_t delta; char delta_r_[PADR_(user_addr_t)]; + char olddelta_l_[PADL_(user_addr_t)]; user_addr_t olddelta; char olddelta_r_[PADR_(user_addr_t)]; +}; +struct setsid_args { + register_t dummy; +}; +struct getpgid_args { + char pid_l_[PADL_(pid_t)]; pid_t pid; char pid_r_[PADR_(pid_t)]; +}; +struct setprivexec_args { + char flag_l_[PADL_(int)]; int flag; char flag_r_[PADR_(int)]; +}; +struct pread_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char buf_l_[PADL_(user_addr_t)]; user_addr_t buf; char buf_r_[PADR_(user_addr_t)]; + char nbyte_l_[PADL_(user_size_t)]; user_size_t nbyte; char nbyte_r_[PADR_(user_size_t)]; + char offset_l_[PADL_(off_t)]; off_t offset; char offset_r_[PADR_(off_t)]; +}; +struct pwrite_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char buf_l_[PADL_(user_addr_t)]; user_addr_t buf; char buf_r_[PADR_(user_addr_t)]; + char nbyte_l_[PADL_(user_size_t)]; user_size_t nbyte; char nbyte_r_[PADR_(user_size_t)]; + char offset_l_[PADL_(off_t)]; off_t offset; char offset_r_[PADR_(off_t)]; +}; +#if NFSSERVER +struct nfssvc_args { + char flag_l_[PADL_(int)]; int flag; char flag_r_[PADR_(int)]; + char argp_l_[PADL_(user_addr_t)]; user_addr_t argp; char argp_r_[PADR_(user_addr_t)]; +}; +#else +#endif +struct statfs_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char buf_l_[PADL_(user_addr_t)]; user_addr_t buf; char buf_r_[PADR_(user_addr_t)]; +}; +struct fstatfs_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char buf_l_[PADL_(user_addr_t)]; user_addr_t buf; char buf_r_[PADR_(user_addr_t)]; +}; +struct unmount_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char flags_l_[PADL_(int)]; int flags; char 
flags_r_[PADR_(int)]; +}; +#if NFSCLIENT +struct getfh_args { + char fname_l_[PADL_(user_addr_t)]; user_addr_t fname; char fname_r_[PADR_(user_addr_t)]; + char fhp_l_[PADL_(user_addr_t)]; user_addr_t fhp; char fhp_r_[PADR_(user_addr_t)]; +}; +#else +#endif +struct quotactl_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char cmd_l_[PADL_(int)]; int cmd; char cmd_r_[PADR_(int)]; + char uid_l_[PADL_(int)]; int uid; char uid_r_[PADR_(int)]; + char arg_l_[PADL_(user_addr_t)]; user_addr_t arg; char arg_r_[PADR_(user_addr_t)]; +}; +struct mount_args { + char type_l_[PADL_(user_addr_t)]; user_addr_t type; char type_r_[PADR_(user_addr_t)]; + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; + char data_l_[PADL_(user_addr_t)]; user_addr_t data; char data_r_[PADR_(user_addr_t)]; +}; +struct waitid_args { + char idtype_l_[PADL_(idtype_t)]; idtype_t idtype; char idtype_r_[PADR_(idtype_t)]; + char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)]; + char infop_l_[PADL_(user_addr_t)]; user_addr_t infop; char infop_r_[PADR_(user_addr_t)]; + char options_l_[PADL_(int)]; int options; char options_r_[PADR_(int)]; +}; +struct add_profil_args { + char bufbase_l_[PADL_(user_addr_t)]; user_addr_t bufbase; char bufbase_r_[PADR_(user_addr_t)]; + char bufsize_l_[PADL_(user_size_t)]; user_size_t bufsize; char bufsize_r_[PADR_(user_size_t)]; + char pcoffset_l_[PADL_(user_ulong_t)]; user_ulong_t pcoffset; char pcoffset_r_[PADR_(user_ulong_t)]; + char pcscale_l_[PADL_(u_int)]; u_int pcscale; char pcscale_r_[PADR_(u_int)]; +}; +struct kdebug_trace_args { + char code_l_[PADL_(int)]; int code; char code_r_[PADR_(int)]; + char arg1_l_[PADL_(int)]; int arg1; char arg1_r_[PADR_(int)]; + char arg2_l_[PADL_(int)]; int arg2; char arg2_r_[PADR_(int)]; + char arg3_l_[PADL_(int)]; int arg3; char arg3_r_[PADR_(int)]; + char arg4_l_[PADL_(int)]; int arg4; char arg4_r_[PADR_(int)]; + char arg5_l_[PADL_(int)]; int arg5; char arg5_r_[PADR_(int)]; +}; +struct setgid_args { + char gid_l_[PADL_(gid_t)]; gid_t gid; char gid_r_[PADR_(gid_t)]; +}; +struct setegid_args { + char egid_l_[PADL_(gid_t)]; gid_t egid; char egid_r_[PADR_(gid_t)]; +}; +struct seteuid_args { + char euid_l_[PADL_(uid_t)]; uid_t euid; char euid_r_[PADR_(uid_t)]; +}; +#ifdef __ppc__ +struct sigreturn_args { + char uctx_l_[PADL_(user_addr_t)]; user_addr_t uctx; char uctx_r_[PADR_(user_addr_t)]; + char infostyle_l_[PADL_(int)]; int infostyle; char infostyle_r_[PADR_(int)]; +}; +#else +#endif +struct stat_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char ub_l_[PADL_(user_addr_t)]; user_addr_t ub; char ub_r_[PADR_(user_addr_t)]; +}; +struct fstat_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char ub_l_[PADL_(user_addr_t)]; user_addr_t ub; char ub_r_[PADR_(user_addr_t)]; +}; +struct lstat_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char ub_l_[PADL_(user_addr_t)]; user_addr_t ub; char ub_r_[PADR_(user_addr_t)]; +}; +struct pathconf_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char name_l_[PADL_(int)]; int name; char name_r_[PADR_(int)]; +}; +struct fpathconf_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char name_l_[PADL_(int)]; int name; char name_r_[PADR_(int)]; +}; +#if COMPAT_GETFSSTAT +struct getfsstat_args { + char 
buf_l_[PADL_(user_addr_t)]; user_addr_t buf; char buf_r_[PADR_(user_addr_t)]; + char bufsize_l_[PADL_(user_long_t)]; user_long_t bufsize; char bufsize_r_[PADR_(user_long_t)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; +}; +#else +#endif +struct getrlimit_args { + char which_l_[PADL_(u_int)]; u_int which; char which_r_[PADR_(u_int)]; + char rlp_l_[PADL_(user_addr_t)]; user_addr_t rlp; char rlp_r_[PADR_(user_addr_t)]; +}; +struct setrlimit_args { + char which_l_[PADL_(u_int)]; u_int which; char which_r_[PADR_(u_int)]; + char rlp_l_[PADL_(user_addr_t)]; user_addr_t rlp; char rlp_r_[PADR_(user_addr_t)]; +}; +struct getdirentries_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char buf_l_[PADL_(user_addr_t)]; user_addr_t buf; char buf_r_[PADR_(user_addr_t)]; + char count_l_[PADL_(u_int)]; u_int count; char count_r_[PADR_(u_int)]; + char basep_l_[PADL_(user_addr_t)]; user_addr_t basep; char basep_r_[PADR_(user_addr_t)]; +}; +struct mmap_args { + char addr_l_[PADL_(user_addr_t)]; user_addr_t addr; char addr_r_[PADR_(user_addr_t)]; + char len_l_[PADL_(user_size_t)]; user_size_t len; char len_r_[PADR_(user_size_t)]; + char prot_l_[PADL_(int)]; int prot; char prot_r_[PADR_(int)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char pos_l_[PADL_(off_t)]; off_t pos; char pos_r_[PADR_(off_t)]; +}; +struct lseek_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char offset_l_[PADL_(off_t)]; off_t offset; char offset_r_[PADR_(off_t)]; + char whence_l_[PADL_(int)]; int whence; char whence_r_[PADR_(int)]; +}; +struct truncate_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char length_l_[PADL_(off_t)]; off_t length; char length_r_[PADR_(off_t)]; +}; +struct ftruncate_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char length_l_[PADL_(off_t)]; off_t length; char length_r_[PADR_(off_t)]; +}; +struct __sysctl_args { + char name_l_[PADL_(user_addr_t)]; user_addr_t name; char name_r_[PADR_(user_addr_t)]; + char namelen_l_[PADL_(u_int)]; u_int namelen; char namelen_r_[PADR_(u_int)]; + char old_l_[PADL_(user_addr_t)]; user_addr_t old; char old_r_[PADR_(user_addr_t)]; + char oldlenp_l_[PADL_(user_addr_t)]; user_addr_t oldlenp; char oldlenp_r_[PADR_(user_addr_t)]; + char new_l_[PADL_(user_addr_t)]; user_addr_t new; char new_r_[PADR_(user_addr_t)]; + char newlen_l_[PADL_(user_size_t)]; user_size_t newlen; char newlen_r_[PADR_(user_size_t)]; +}; +struct mlock_args { + char addr_l_[PADL_(user_addr_t)]; user_addr_t addr; char addr_r_[PADR_(user_addr_t)]; + char len_l_[PADL_(user_size_t)]; user_size_t len; char len_r_[PADR_(user_size_t)]; +}; +struct munlock_args { + char addr_l_[PADL_(user_addr_t)]; user_addr_t addr; char addr_r_[PADR_(user_addr_t)]; + char len_l_[PADL_(user_size_t)]; user_size_t len; char len_r_[PADR_(user_size_t)]; +}; +struct undelete_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; +}; +#ifdef __ppc__ +struct ATsocket_args { + char proto_l_[PADL_(int)]; int proto; char proto_r_[PADR_(int)]; +}; +struct ATgetmsg_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char ctlptr_l_[PADL_(void *)]; void * ctlptr; char ctlptr_r_[PADR_(void *)]; + char datptr_l_[PADL_(void *)]; void * datptr; char datptr_r_[PADR_(void *)]; + char flags_l_[PADL_(int *)]; int * flags; char flags_r_[PADR_(int *)]; +}; +struct ATputmsg_args { + char 
fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char ctlptr_l_[PADL_(void *)]; void * ctlptr; char ctlptr_r_[PADR_(void *)]; + char datptr_l_[PADL_(void *)]; void * datptr; char datptr_r_[PADR_(void *)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; +}; +struct ATPsndreq_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char buf_l_[PADL_(unsigned char *)]; unsigned char * buf; char buf_r_[PADR_(unsigned char *)]; + char len_l_[PADL_(int)]; int len; char len_r_[PADR_(int)]; + char nowait_l_[PADL_(int)]; int nowait; char nowait_r_[PADR_(int)]; +}; +struct ATPsndrsp_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char respbuff_l_[PADL_(unsigned char *)]; unsigned char * respbuff; char respbuff_r_[PADR_(unsigned char *)]; + char resplen_l_[PADL_(int)]; int resplen; char resplen_r_[PADR_(int)]; + char datalen_l_[PADL_(int)]; int datalen; char datalen_r_[PADR_(int)]; +}; +struct ATPgetreq_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char buf_l_[PADL_(unsigned char *)]; unsigned char * buf; char buf_r_[PADR_(unsigned char *)]; + char buflen_l_[PADL_(int)]; int buflen; char buflen_r_[PADR_(int)]; +}; +struct ATPgetrsp_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char bdsp_l_[PADL_(unsigned char *)]; unsigned char * bdsp; char bdsp_r_[PADR_(unsigned char *)]; +}; +#else +#endif /* __ppc__ */ +struct kqueue_from_portset_np_args { + char portset_l_[PADL_(int)]; int portset; char portset_r_[PADR_(int)]; +}; +struct kqueue_portset_np_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; +}; +struct getattrlist_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char alist_l_[PADL_(user_addr_t)]; user_addr_t alist; char alist_r_[PADR_(user_addr_t)]; + char attributeBuffer_l_[PADL_(user_addr_t)]; user_addr_t attributeBuffer; char attributeBuffer_r_[PADR_(user_addr_t)]; + char bufferSize_l_[PADL_(user_size_t)]; user_size_t bufferSize; char bufferSize_r_[PADR_(user_size_t)]; + char options_l_[PADL_(user_ulong_t)]; user_ulong_t options; char options_r_[PADR_(user_ulong_t)]; +}; +struct setattrlist_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char alist_l_[PADL_(user_addr_t)]; user_addr_t alist; char alist_r_[PADR_(user_addr_t)]; + char attributeBuffer_l_[PADL_(user_addr_t)]; user_addr_t attributeBuffer; char attributeBuffer_r_[PADR_(user_addr_t)]; + char bufferSize_l_[PADL_(user_size_t)]; user_size_t bufferSize; char bufferSize_r_[PADR_(user_size_t)]; + char options_l_[PADL_(user_ulong_t)]; user_ulong_t options; char options_r_[PADR_(user_ulong_t)]; +}; +struct getdirentriesattr_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char alist_l_[PADL_(user_addr_t)]; user_addr_t alist; char alist_r_[PADR_(user_addr_t)]; + char buffer_l_[PADL_(user_addr_t)]; user_addr_t buffer; char buffer_r_[PADR_(user_addr_t)]; + char buffersize_l_[PADL_(user_size_t)]; user_size_t buffersize; char buffersize_r_[PADR_(user_size_t)]; + char count_l_[PADL_(user_addr_t)]; user_addr_t count; char count_r_[PADR_(user_addr_t)]; + char basep_l_[PADL_(user_addr_t)]; user_addr_t basep; char basep_r_[PADR_(user_addr_t)]; + char newstate_l_[PADL_(user_addr_t)]; user_addr_t newstate; char newstate_r_[PADR_(user_addr_t)]; + char options_l_[PADL_(user_ulong_t)]; user_ulong_t options; char options_r_[PADR_(user_ulong_t)]; +}; +struct exchangedata_args { + char path1_l_[PADL_(user_addr_t)]; user_addr_t path1; char 
path1_r_[PADR_(user_addr_t)]; + char path2_l_[PADL_(user_addr_t)]; user_addr_t path2; char path2_r_[PADR_(user_addr_t)]; + char options_l_[PADL_(user_ulong_t)]; user_ulong_t options; char options_r_[PADR_(user_ulong_t)]; +}; +#ifdef __APPLE_API_OBSOLETE +struct checkuseraccess_args { + char path_l_[PADL_(const char *)]; const char * path; char path_r_[PADR_(const char *)]; + char userid_l_[PADL_(uid_t)]; uid_t userid; char userid_r_[PADR_(uid_t)]; + char groups_l_[PADL_(gid_t *)]; gid_t * groups; char groups_r_[PADR_(gid_t *)]; + char ngroups_l_[PADL_(int)]; int ngroups; char ngroups_r_[PADR_(int)]; + char accessrequired_l_[PADL_(int)]; int accessrequired; char accessrequired_r_[PADR_(int)]; + char options_l_[PADL_(u_long)]; u_long options; char options_r_[PADR_(u_long)]; +}; +#else +#endif /* __APPLE_API_OBSOLETE */ +struct searchfs_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char searchblock_l_[PADL_(user_addr_t)]; user_addr_t searchblock; char searchblock_r_[PADR_(user_addr_t)]; + char nummatches_l_[PADL_(user_addr_t)]; user_addr_t nummatches; char nummatches_r_[PADR_(user_addr_t)]; + char scriptcode_l_[PADL_(user_ulong_t)]; user_ulong_t scriptcode; char scriptcode_r_[PADR_(user_ulong_t)]; + char options_l_[PADL_(user_ulong_t)]; user_ulong_t options; char options_r_[PADR_(user_ulong_t)]; + char state_l_[PADL_(user_addr_t)]; user_addr_t state; char state_r_[PADR_(user_addr_t)]; +}; +struct delete_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; +}; +struct copyfile_args { + char from_l_[PADL_(user_addr_t)]; user_addr_t from; char from_r_[PADR_(user_addr_t)]; + char to_l_[PADL_(user_addr_t)]; user_addr_t to; char to_r_[PADR_(user_addr_t)]; + char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; +}; +struct poll_args { + char fds_l_[PADL_(user_addr_t)]; user_addr_t fds; char fds_r_[PADR_(user_addr_t)]; + char nfds_l_[PADL_(u_int)]; u_int nfds; char nfds_r_[PADR_(u_int)]; + char timeout_l_[PADL_(int)]; int timeout; char timeout_r_[PADR_(int)]; +}; +struct watchevent_args { + char u_req_l_[PADL_(struct eventreq *)]; struct eventreq * u_req; char u_req_r_[PADR_(struct eventreq *)]; + char u_eventmask_l_[PADL_(int)]; int u_eventmask; char u_eventmask_r_[PADR_(int)]; +}; +struct waitevent_args { + char u_req_l_[PADL_(struct eventreq *)]; struct eventreq * u_req; char u_req_r_[PADR_(struct eventreq *)]; + char tv_l_[PADL_(struct timeval *)]; struct timeval * tv; char tv_r_[PADR_(struct timeval *)]; +}; +struct modwatch_args { + char u_req_l_[PADL_(struct eventreq *)]; struct eventreq * u_req; char u_req_r_[PADR_(struct eventreq *)]; + char u_eventmask_l_[PADL_(int)]; int u_eventmask; char u_eventmask_r_[PADR_(int)]; +}; +struct getxattr_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char attrname_l_[PADL_(user_addr_t)]; user_addr_t attrname; char attrname_r_[PADR_(user_addr_t)]; + char value_l_[PADL_(user_addr_t)]; user_addr_t value; char value_r_[PADR_(user_addr_t)]; + char size_l_[PADL_(user_size_t)]; user_size_t size; char size_r_[PADR_(user_size_t)]; + char position_l_[PADL_(uint32_t)]; uint32_t position; char position_r_[PADR_(uint32_t)]; + char options_l_[PADL_(int)]; int options; char options_r_[PADR_(int)]; +}; +struct fgetxattr_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char attrname_l_[PADL_(user_addr_t)]; user_addr_t attrname; char 
attrname_r_[PADR_(user_addr_t)]; + char value_l_[PADL_(user_addr_t)]; user_addr_t value; char value_r_[PADR_(user_addr_t)]; + char size_l_[PADL_(user_size_t)]; user_size_t size; char size_r_[PADR_(user_size_t)]; + char position_l_[PADL_(uint32_t)]; uint32_t position; char position_r_[PADR_(uint32_t)]; + char options_l_[PADL_(int)]; int options; char options_r_[PADR_(int)]; +}; +struct setxattr_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char attrname_l_[PADL_(user_addr_t)]; user_addr_t attrname; char attrname_r_[PADR_(user_addr_t)]; + char value_l_[PADL_(user_addr_t)]; user_addr_t value; char value_r_[PADR_(user_addr_t)]; + char size_l_[PADL_(user_size_t)]; user_size_t size; char size_r_[PADR_(user_size_t)]; + char position_l_[PADL_(uint32_t)]; uint32_t position; char position_r_[PADR_(uint32_t)]; + char options_l_[PADL_(int)]; int options; char options_r_[PADR_(int)]; +}; +struct fsetxattr_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char attrname_l_[PADL_(user_addr_t)]; user_addr_t attrname; char attrname_r_[PADR_(user_addr_t)]; + char value_l_[PADL_(user_addr_t)]; user_addr_t value; char value_r_[PADR_(user_addr_t)]; + char size_l_[PADL_(user_size_t)]; user_size_t size; char size_r_[PADR_(user_size_t)]; + char position_l_[PADL_(uint32_t)]; uint32_t position; char position_r_[PADR_(uint32_t)]; + char options_l_[PADL_(int)]; int options; char options_r_[PADR_(int)]; +}; +struct removexattr_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char attrname_l_[PADL_(user_addr_t)]; user_addr_t attrname; char attrname_r_[PADR_(user_addr_t)]; + char options_l_[PADL_(int)]; int options; char options_r_[PADR_(int)]; +}; +struct fremovexattr_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char attrname_l_[PADL_(user_addr_t)]; user_addr_t attrname; char attrname_r_[PADR_(user_addr_t)]; + char options_l_[PADL_(int)]; int options; char options_r_[PADR_(int)]; +}; +struct listxattr_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char namebuf_l_[PADL_(user_addr_t)]; user_addr_t namebuf; char namebuf_r_[PADR_(user_addr_t)]; + char bufsize_l_[PADL_(user_size_t)]; user_size_t bufsize; char bufsize_r_[PADR_(user_size_t)]; + char options_l_[PADL_(int)]; int options; char options_r_[PADR_(int)]; +}; +struct flistxattr_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char namebuf_l_[PADL_(user_addr_t)]; user_addr_t namebuf; char namebuf_r_[PADR_(user_addr_t)]; + char bufsize_l_[PADL_(user_size_t)]; user_size_t bufsize; char bufsize_r_[PADR_(user_size_t)]; + char options_l_[PADL_(int)]; int options; char options_r_[PADR_(int)]; +}; +struct fsctl_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char cmd_l_[PADL_(user_ulong_t)]; user_ulong_t cmd; char cmd_r_[PADR_(user_ulong_t)]; + char data_l_[PADL_(user_addr_t)]; user_addr_t data; char data_r_[PADR_(user_addr_t)]; + char options_l_[PADL_(user_ulong_t)]; user_ulong_t options; char options_r_[PADR_(user_ulong_t)]; +}; +struct initgroups_args { + char gidsetsize_l_[PADL_(u_int)]; u_int gidsetsize; char gidsetsize_r_[PADR_(u_int)]; + char gidset_l_[PADL_(user_addr_t)]; user_addr_t gidset; char gidset_r_[PADR_(user_addr_t)]; + char gmuid_l_[PADL_(int)]; int gmuid; char gmuid_r_[PADR_(int)]; +}; +#if NFSCLIENT +struct nfsclnt_args { + char flag_l_[PADL_(int)]; int flag; char flag_r_[PADR_(int)]; + char 
argp_l_[PADL_(user_addr_t)]; user_addr_t argp; char argp_r_[PADR_(user_addr_t)]; +}; +struct fhopen_args { + char u_fhp_l_[PADL_(user_addr_t)]; user_addr_t u_fhp; char u_fhp_r_[PADR_(user_addr_t)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; +}; +#else +#endif +struct minherit_args { + char addr_l_[PADL_(user_addr_t)]; user_addr_t addr; char addr_r_[PADR_(user_addr_t)]; + char len_l_[PADL_(user_size_t)]; user_size_t len; char len_r_[PADR_(user_size_t)]; + char inherit_l_[PADL_(int)]; int inherit; char inherit_r_[PADR_(int)]; +}; +struct semsys_args { + char which_l_[PADL_(u_int)]; u_int which; char which_r_[PADR_(u_int)]; + char a2_l_[PADL_(int)]; int a2; char a2_r_[PADR_(int)]; + char a3_l_[PADL_(int)]; int a3; char a3_r_[PADR_(int)]; + char a4_l_[PADL_(int)]; int a4; char a4_r_[PADR_(int)]; + char a5_l_[PADL_(int)]; int a5; char a5_r_[PADR_(int)]; +}; +struct msgsys_args { + char which_l_[PADL_(u_int)]; u_int which; char which_r_[PADR_(u_int)]; + char a2_l_[PADL_(int)]; int a2; char a2_r_[PADR_(int)]; + char a3_l_[PADL_(int)]; int a3; char a3_r_[PADR_(int)]; + char a4_l_[PADL_(int)]; int a4; char a4_r_[PADR_(int)]; + char a5_l_[PADL_(int)]; int a5; char a5_r_[PADR_(int)]; +}; +struct shmsys_args { + char which_l_[PADL_(u_int)]; u_int which; char which_r_[PADR_(u_int)]; + char a2_l_[PADL_(int)]; int a2; char a2_r_[PADR_(int)]; + char a3_l_[PADL_(int)]; int a3; char a3_r_[PADR_(int)]; + char a4_l_[PADL_(int)]; int a4; char a4_r_[PADR_(int)]; +}; +struct semctl_args { + char semid_l_[PADL_(int)]; int semid; char semid_r_[PADR_(int)]; + char semnum_l_[PADL_(int)]; int semnum; char semnum_r_[PADR_(int)]; + char cmd_l_[PADL_(int)]; int cmd; char cmd_r_[PADR_(int)]; + char arg_l_[PADL_(user_addr_t)]; user_addr_t arg; char arg_r_[PADR_(user_addr_t)]; +}; +struct semget_args { + char key_l_[PADL_(key_t)]; key_t key; char key_r_[PADR_(key_t)]; + char nsems_l_[PADL_(int)]; int nsems; char nsems_r_[PADR_(int)]; + char semflg_l_[PADL_(int)]; int semflg; char semflg_r_[PADR_(int)]; +}; +struct semop_args { + char semid_l_[PADL_(int)]; int semid; char semid_r_[PADR_(int)]; + char sops_l_[PADL_(user_addr_t)]; user_addr_t sops; char sops_r_[PADR_(user_addr_t)]; + char nsops_l_[PADL_(int)]; int nsops; char nsops_r_[PADR_(int)]; +}; +struct semconfig_args { + char flag_l_[PADL_(semconfig_ctl_t)]; semconfig_ctl_t flag; char flag_r_[PADR_(semconfig_ctl_t)]; +}; +struct msgctl_args { + char msqid_l_[PADL_(int)]; int msqid; char msqid_r_[PADR_(int)]; + char cmd_l_[PADL_(int)]; int cmd; char cmd_r_[PADR_(int)]; + char buf_l_[PADL_(user_addr_t)]; user_addr_t buf; char buf_r_[PADR_(user_addr_t)]; +}; +struct msgget_args { + char key_l_[PADL_(key_t)]; key_t key; char key_r_[PADR_(key_t)]; + char msgflg_l_[PADL_(int)]; int msgflg; char msgflg_r_[PADR_(int)]; +}; +struct msgsnd_args { + char msqid_l_[PADL_(int)]; int msqid; char msqid_r_[PADR_(int)]; + char msgp_l_[PADL_(user_addr_t)]; user_addr_t msgp; char msgp_r_[PADR_(user_addr_t)]; + char msgsz_l_[PADL_(user_size_t)]; user_size_t msgsz; char msgsz_r_[PADR_(user_size_t)]; + char msgflg_l_[PADL_(int)]; int msgflg; char msgflg_r_[PADR_(int)]; +}; +struct msgrcv_args { + char msqid_l_[PADL_(int)]; int msqid; char msqid_r_[PADR_(int)]; + char msgp_l_[PADL_(user_addr_t)]; user_addr_t msgp; char msgp_r_[PADR_(user_addr_t)]; + char msgsz_l_[PADL_(user_size_t)]; user_size_t msgsz; char msgsz_r_[PADR_(user_size_t)]; + char msgtyp_l_[PADL_(user_long_t)]; user_long_t msgtyp; char msgtyp_r_[PADR_(user_long_t)]; + char msgflg_l_[PADL_(int)]; int 
msgflg; char msgflg_r_[PADR_(int)]; +}; +struct shmat_args { + char shmid_l_[PADL_(int)]; int shmid; char shmid_r_[PADR_(int)]; + char shmaddr_l_[PADL_(user_addr_t)]; user_addr_t shmaddr; char shmaddr_r_[PADR_(user_addr_t)]; + char shmflg_l_[PADL_(int)]; int shmflg; char shmflg_r_[PADR_(int)]; +}; +struct shmctl_args { + char shmid_l_[PADL_(int)]; int shmid; char shmid_r_[PADR_(int)]; + char cmd_l_[PADL_(int)]; int cmd; char cmd_r_[PADR_(int)]; + char buf_l_[PADL_(user_addr_t)]; user_addr_t buf; char buf_r_[PADR_(user_addr_t)]; +}; +struct shmdt_args { + char shmaddr_l_[PADL_(user_addr_t)]; user_addr_t shmaddr; char shmaddr_r_[PADR_(user_addr_t)]; +}; +struct shmget_args { + char key_l_[PADL_(key_t)]; key_t key; char key_r_[PADR_(key_t)]; + char size_l_[PADL_(user_size_t)]; user_size_t size; char size_r_[PADR_(user_size_t)]; + char shmflg_l_[PADL_(int)]; int shmflg; char shmflg_r_[PADR_(int)]; +}; +struct shm_open_args { + char name_l_[PADL_(user_addr_t)]; user_addr_t name; char name_r_[PADR_(user_addr_t)]; + char oflag_l_[PADL_(int)]; int oflag; char oflag_r_[PADR_(int)]; + char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; +}; +struct shm_unlink_args { + char name_l_[PADL_(user_addr_t)]; user_addr_t name; char name_r_[PADR_(user_addr_t)]; +}; +struct sem_open_args { + char name_l_[PADL_(user_addr_t)]; user_addr_t name; char name_r_[PADR_(user_addr_t)]; + char oflag_l_[PADL_(int)]; int oflag; char oflag_r_[PADR_(int)]; + char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; + char value_l_[PADL_(int)]; int value; char value_r_[PADR_(int)]; +}; +struct sem_close_args { + char sem_l_[PADL_(user_addr_t)]; user_addr_t sem; char sem_r_[PADR_(user_addr_t)]; +}; +struct sem_unlink_args { + char name_l_[PADL_(user_addr_t)]; user_addr_t name; char name_r_[PADR_(user_addr_t)]; +}; +struct sem_wait_args { + char sem_l_[PADL_(user_addr_t)]; user_addr_t sem; char sem_r_[PADR_(user_addr_t)]; +}; +struct sem_trywait_args { + char sem_l_[PADL_(user_addr_t)]; user_addr_t sem; char sem_r_[PADR_(user_addr_t)]; +}; +struct sem_post_args { + char sem_l_[PADL_(user_addr_t)]; user_addr_t sem; char sem_r_[PADR_(user_addr_t)]; +}; +struct sem_getvalue_args { + char sem_l_[PADL_(user_addr_t)]; user_addr_t sem; char sem_r_[PADR_(user_addr_t)]; + char sval_l_[PADL_(user_addr_t)]; user_addr_t sval; char sval_r_[PADR_(user_addr_t)]; +}; +struct sem_init_args { + char sem_l_[PADL_(user_addr_t)]; user_addr_t sem; char sem_r_[PADR_(user_addr_t)]; + char phsared_l_[PADL_(int)]; int phsared; char phsared_r_[PADR_(int)]; + char value_l_[PADL_(u_int)]; u_int value; char value_r_[PADR_(u_int)]; +}; +struct sem_destroy_args { + char sem_l_[PADL_(user_addr_t)]; user_addr_t sem; char sem_r_[PADR_(user_addr_t)]; +}; +struct open_extended_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; + char uid_l_[PADL_(uid_t)]; uid_t uid; char uid_r_[PADR_(uid_t)]; + char gid_l_[PADL_(gid_t)]; gid_t gid; char gid_r_[PADR_(gid_t)]; + char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; + char xsecurity_l_[PADL_(user_addr_t)]; user_addr_t xsecurity; char xsecurity_r_[PADR_(user_addr_t)]; +}; +struct umask_extended_args { + char newmask_l_[PADL_(int)]; int newmask; char newmask_r_[PADR_(int)]; + char xsecurity_l_[PADL_(user_addr_t)]; user_addr_t xsecurity; char xsecurity_r_[PADR_(user_addr_t)]; +}; +struct stat_extended_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char 
path_r_[PADR_(user_addr_t)]; + char ub_l_[PADL_(user_addr_t)]; user_addr_t ub; char ub_r_[PADR_(user_addr_t)]; + char xsecurity_l_[PADL_(user_addr_t)]; user_addr_t xsecurity; char xsecurity_r_[PADR_(user_addr_t)]; + char xsecurity_size_l_[PADL_(user_addr_t)]; user_addr_t xsecurity_size; char xsecurity_size_r_[PADR_(user_addr_t)]; +}; +struct lstat_extended_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char ub_l_[PADL_(user_addr_t)]; user_addr_t ub; char ub_r_[PADR_(user_addr_t)]; + char xsecurity_l_[PADL_(user_addr_t)]; user_addr_t xsecurity; char xsecurity_r_[PADR_(user_addr_t)]; + char xsecurity_size_l_[PADL_(user_addr_t)]; user_addr_t xsecurity_size; char xsecurity_size_r_[PADR_(user_addr_t)]; +}; +struct fstat_extended_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char ub_l_[PADL_(user_addr_t)]; user_addr_t ub; char ub_r_[PADR_(user_addr_t)]; + char xsecurity_l_[PADL_(user_addr_t)]; user_addr_t xsecurity; char xsecurity_r_[PADR_(user_addr_t)]; + char xsecurity_size_l_[PADL_(user_addr_t)]; user_addr_t xsecurity_size; char xsecurity_size_r_[PADR_(user_addr_t)]; +}; +struct chmod_extended_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char uid_l_[PADL_(uid_t)]; uid_t uid; char uid_r_[PADR_(uid_t)]; + char gid_l_[PADL_(gid_t)]; gid_t gid; char gid_r_[PADR_(gid_t)]; + char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; + char xsecurity_l_[PADL_(user_addr_t)]; user_addr_t xsecurity; char xsecurity_r_[PADR_(user_addr_t)]; +}; +struct fchmod_extended_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char uid_l_[PADL_(uid_t)]; uid_t uid; char uid_r_[PADR_(uid_t)]; + char gid_l_[PADL_(gid_t)]; gid_t gid; char gid_r_[PADR_(gid_t)]; + char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; + char xsecurity_l_[PADL_(user_addr_t)]; user_addr_t xsecurity; char xsecurity_r_[PADR_(user_addr_t)]; +}; +struct access_extended_args { + char entries_l_[PADL_(user_addr_t)]; user_addr_t entries; char entries_r_[PADR_(user_addr_t)]; + char size_l_[PADL_(user_size_t)]; user_size_t size; char size_r_[PADR_(user_size_t)]; + char results_l_[PADL_(user_addr_t)]; user_addr_t results; char results_r_[PADR_(user_addr_t)]; + char uid_l_[PADL_(uid_t)]; uid_t uid; char uid_r_[PADR_(uid_t)]; +}; +struct settid_args { + char uid_l_[PADL_(uid_t)]; uid_t uid; char uid_r_[PADR_(uid_t)]; + char gid_l_[PADL_(gid_t)]; gid_t gid; char gid_r_[PADR_(gid_t)]; +}; +struct gettid_args { + char uidp_l_[PADL_(user_addr_t)]; user_addr_t uidp; char uidp_r_[PADR_(user_addr_t)]; + char gidp_l_[PADL_(user_addr_t)]; user_addr_t gidp; char gidp_r_[PADR_(user_addr_t)]; +}; +struct setsgroups_args { + char setlen_l_[PADL_(int)]; int setlen; char setlen_r_[PADR_(int)]; + char guidset_l_[PADL_(user_addr_t)]; user_addr_t guidset; char guidset_r_[PADR_(user_addr_t)]; +}; +struct getsgroups_args { + char setlen_l_[PADL_(user_addr_t)]; user_addr_t setlen; char setlen_r_[PADR_(user_addr_t)]; + char guidset_l_[PADL_(user_addr_t)]; user_addr_t guidset; char guidset_r_[PADR_(user_addr_t)]; +}; +struct setwgroups_args { + char setlen_l_[PADL_(int)]; int setlen; char setlen_r_[PADR_(int)]; + char guidset_l_[PADL_(user_addr_t)]; user_addr_t guidset; char guidset_r_[PADR_(user_addr_t)]; +}; +struct getwgroups_args { + char setlen_l_[PADL_(user_addr_t)]; user_addr_t setlen; char setlen_r_[PADR_(user_addr_t)]; + char guidset_l_[PADL_(user_addr_t)]; user_addr_t guidset; char guidset_r_[PADR_(user_addr_t)]; +}; 
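A note on the layout convention above: these _args structures are machine-generated from the syscall table, and every argument is widened to a full 64-bit slot so that 32-bit and 64-bit user processes can share one kernel-side argument structure. The PADL_()/PADR_() macros emit the left- and right-hand byte padding around each member; their authoritative definitions sit near the top of sysproto.h, outside this hunk. A minimal sketch of the convention, assuming the uint64_t slot width used elsewhere in this patch (a reconstruction, not quoted from the header):

    /* Bytes needed to widen type t to a 64-bit argument slot. */
    #define PAD_(t) (sizeof(uint64_t) <= sizeof(t) ? \
                     0 : sizeof(uint64_t) - sizeof(t))

    #if BYTE_ORDER == LITTLE_ENDIAN
    #define PADL_(t) 0        /* value occupies the low-order bytes */
    #define PADR_(t) PAD_(t)  /* pad out the rest of the slot */
    #else
    #define PADL_(t) PAD_(t)  /* big-endian ppc: pad on the left */
    #define PADR_(t) 0        /* so the value ends the slot */
    #endif

Under this scheme, on big-endian ppc a member written as "char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];" expands to four bytes of leading pad followed by the int, leaving fd in the low-order half of its 64-bit slot, which is where the argument munger deposits it.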
+struct mkfifo_extended_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char uid_l_[PADL_(uid_t)]; uid_t uid; char uid_r_[PADR_(uid_t)]; + char gid_l_[PADL_(gid_t)]; gid_t gid; char gid_r_[PADR_(gid_t)]; + char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; + char xsecurity_l_[PADL_(user_addr_t)]; user_addr_t xsecurity; char xsecurity_r_[PADR_(user_addr_t)]; +}; +struct mkdir_extended_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char uid_l_[PADL_(uid_t)]; uid_t uid; char uid_r_[PADR_(uid_t)]; + char gid_l_[PADL_(gid_t)]; gid_t gid; char gid_r_[PADR_(gid_t)]; + char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; + char xsecurity_l_[PADL_(user_addr_t)]; user_addr_t xsecurity; char xsecurity_r_[PADR_(user_addr_t)]; +}; +struct identitysvc_args { + char opcode_l_[PADL_(int)]; int opcode; char opcode_r_[PADR_(int)]; + char message_l_[PADL_(user_addr_t)]; user_addr_t message; char message_r_[PADR_(user_addr_t)]; +}; +struct load_shared_file_args { + char filename_l_[PADL_(char *)]; char * filename; char filename_r_[PADR_(char *)]; + char mfa_l_[PADL_(caddr_t)]; caddr_t mfa; char mfa_r_[PADR_(caddr_t)]; + char mfs_l_[PADL_(u_long)]; u_long mfs; char mfs_r_[PADR_(u_long)]; + char ba_l_[PADL_(caddr_t *)]; caddr_t * ba; char ba_r_[PADR_(caddr_t *)]; + char map_cnt_l_[PADL_(int)]; int map_cnt; char map_cnt_r_[PADR_(int)]; + char mappings_l_[PADL_(sf_mapping_t *)]; sf_mapping_t * mappings; char mappings_r_[PADR_(sf_mapping_t *)]; + char flags_l_[PADL_(int *)]; int * flags; char flags_r_[PADR_(int *)]; +}; +struct reset_shared_file_args { + char ba_l_[PADL_(caddr_t *)]; caddr_t * ba; char ba_r_[PADR_(caddr_t *)]; + char map_cnt_l_[PADL_(int)]; int map_cnt; char map_cnt_r_[PADR_(int)]; + char mappings_l_[PADL_(sf_mapping_t *)]; sf_mapping_t * mappings; char mappings_r_[PADR_(sf_mapping_t *)]; +}; +struct new_system_shared_regions_args { + register_t dummy; +}; +struct shared_region_map_file_np_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char mappingCount_l_[PADL_(uint32_t)]; uint32_t mappingCount; char mappingCount_r_[PADR_(uint32_t)]; + char mappings_l_[PADL_(user_addr_t)]; user_addr_t mappings; char mappings_r_[PADR_(user_addr_t)]; + char slide_p_l_[PADL_(user_addr_t)]; user_addr_t slide_p; char slide_p_r_[PADR_(user_addr_t)]; +}; +struct shared_region_make_private_np_args { + char rangeCount_l_[PADL_(uint32_t)]; uint32_t rangeCount; char rangeCount_r_[PADR_(uint32_t)]; + char ranges_l_[PADL_(user_addr_t)]; user_addr_t ranges; char ranges_r_[PADR_(user_addr_t)]; +}; +struct getsid_args { + char pid_l_[PADL_(pid_t)]; pid_t pid; char pid_r_[PADR_(pid_t)]; +}; +struct settid_with_pid_args { + char pid_l_[PADL_(pid_t)]; pid_t pid; char pid_r_[PADR_(pid_t)]; + char assume_l_[PADL_(int)]; int assume; char assume_r_[PADR_(int)]; +}; +struct aio_fsync_args { + char op_l_[PADL_(int)]; int op; char op_r_[PADR_(int)]; + char aiocbp_l_[PADL_(user_addr_t)]; user_addr_t aiocbp; char aiocbp_r_[PADR_(user_addr_t)]; +}; +struct aio_return_args { + char aiocbp_l_[PADL_(user_addr_t)]; user_addr_t aiocbp; char aiocbp_r_[PADR_(user_addr_t)]; +}; +struct aio_suspend_args { + char aiocblist_l_[PADL_(user_addr_t)]; user_addr_t aiocblist; char aiocblist_r_[PADR_(user_addr_t)]; + char nent_l_[PADL_(int)]; int nent; char nent_r_[PADR_(int)]; + char timeoutp_l_[PADL_(user_addr_t)]; user_addr_t timeoutp; char timeoutp_r_[PADR_(user_addr_t)]; +}; +struct aio_cancel_args { + char 
fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char aiocbp_l_[PADL_(user_addr_t)]; user_addr_t aiocbp; char aiocbp_r_[PADR_(user_addr_t)]; +}; +struct aio_error_args { + char aiocbp_l_[PADL_(user_addr_t)]; user_addr_t aiocbp; char aiocbp_r_[PADR_(user_addr_t)]; +}; +struct aio_read_args { + char aiocbp_l_[PADL_(user_addr_t)]; user_addr_t aiocbp; char aiocbp_r_[PADR_(user_addr_t)]; +}; +struct aio_write_args { + char aiocbp_l_[PADL_(user_addr_t)]; user_addr_t aiocbp; char aiocbp_r_[PADR_(user_addr_t)]; +}; +struct lio_listio_args { + char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)]; + char aiocblist_l_[PADL_(user_addr_t)]; user_addr_t aiocblist; char aiocblist_r_[PADR_(user_addr_t)]; + char nent_l_[PADL_(int)]; int nent; char nent_r_[PADR_(int)]; + char sigp_l_[PADL_(user_addr_t)]; user_addr_t sigp; char sigp_r_[PADR_(user_addr_t)]; +}; +struct mlockall_args { + char how_l_[PADL_(int)]; int how; char how_r_[PADR_(int)]; +}; +struct munlockall_args { + char how_l_[PADL_(int)]; int how; char how_r_[PADR_(int)]; +}; +struct issetugid_args { + register_t dummy; +}; +struct __pthread_kill_args { + char thread_port_l_[PADL_(int)]; int thread_port; char thread_port_r_[PADR_(int)]; + char sig_l_[PADL_(int)]; int sig; char sig_r_[PADR_(int)]; +}; +struct pthread_sigmask_args { + char how_l_[PADL_(int)]; int how; char how_r_[PADR_(int)]; + char set_l_[PADL_(user_addr_t)]; user_addr_t set; char set_r_[PADR_(user_addr_t)]; + char oset_l_[PADL_(user_addr_t)]; user_addr_t oset; char oset_r_[PADR_(user_addr_t)]; +}; +struct sigwait_args { + char set_l_[PADL_(user_addr_t)]; user_addr_t set; char set_r_[PADR_(user_addr_t)]; + char sig_l_[PADL_(user_addr_t)]; user_addr_t sig; char sig_r_[PADR_(user_addr_t)]; +}; +struct __disable_threadsignal_args { + char value_l_[PADL_(int)]; int value; char value_r_[PADR_(int)]; +}; +struct __pthread_markcancel_args { + char thread_port_l_[PADL_(int)]; int thread_port; char thread_port_r_[PADR_(int)]; +}; +struct __pthread_canceled_args { + char action_l_[PADL_(int)]; int action; char action_r_[PADR_(int)]; +}; +struct __semwait_signal_args { + char cond_sem_l_[PADL_(int)]; int cond_sem; char cond_sem_r_[PADR_(int)]; + char mutex_sem_l_[PADL_(int)]; int mutex_sem; char mutex_sem_r_[PADR_(int)]; + char timeout_l_[PADL_(int)]; int timeout; char timeout_r_[PADR_(int)]; + char relative_l_[PADL_(int)]; int relative; char relative_r_[PADR_(int)]; + char tv_sec_l_[PADL_(time_t)]; time_t tv_sec; char tv_sec_r_[PADR_(time_t)]; + char tv_nsec_l_[PADL_(int32_t)]; int32_t tv_nsec; char tv_nsec_r_[PADR_(int32_t)]; +}; +struct utrace_args { + char addr_l_[PADL_(user_addr_t)]; user_addr_t addr; char addr_r_[PADR_(user_addr_t)]; + char len_l_[PADL_(user_size_t)]; user_size_t len; char len_r_[PADR_(user_size_t)]; +}; +struct audit_args { + char record_l_[PADL_(user_addr_t)]; user_addr_t record; char record_r_[PADR_(user_addr_t)]; + char length_l_[PADL_(int)]; int length; char length_r_[PADR_(int)]; +}; +struct auditon_args { + char cmd_l_[PADL_(int)]; int cmd; char cmd_r_[PADR_(int)]; + char data_l_[PADL_(user_addr_t)]; user_addr_t data; char data_r_[PADR_(user_addr_t)]; + char length_l_[PADL_(int)]; int length; char length_r_[PADR_(int)]; +}; +struct getauid_args { + char auid_l_[PADL_(user_addr_t)]; user_addr_t auid; char auid_r_[PADR_(user_addr_t)]; +}; +struct setauid_args { + char auid_l_[PADL_(user_addr_t)]; user_addr_t auid; char auid_r_[PADR_(user_addr_t)]; +}; +struct getaudit_args { + char auditinfo_l_[PADL_(user_addr_t)]; user_addr_t auditinfo; char 
auditinfo_r_[PADR_(user_addr_t)]; +}; +struct setaudit_args { + char auditinfo_l_[PADL_(user_addr_t)]; user_addr_t auditinfo; char auditinfo_r_[PADR_(user_addr_t)]; +}; +struct getaudit_addr_args { + char auditinfo_addr_l_[PADL_(user_addr_t)]; user_addr_t auditinfo_addr; char auditinfo_addr_r_[PADR_(user_addr_t)]; + char length_l_[PADL_(int)]; int length; char length_r_[PADR_(int)]; +}; +struct setaudit_addr_args { + char auditinfo_addr_l_[PADL_(user_addr_t)]; user_addr_t auditinfo_addr; char auditinfo_addr_r_[PADR_(user_addr_t)]; + char length_l_[PADL_(int)]; int length; char length_r_[PADR_(int)]; +}; +struct auditctl_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; +}; +struct kqueue_args { + register_t dummy; +}; +struct kevent_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char changelist_l_[PADL_(user_addr_t)]; user_addr_t changelist; char changelist_r_[PADR_(user_addr_t)]; + char nchanges_l_[PADL_(int)]; int nchanges; char nchanges_r_[PADR_(int)]; + char eventlist_l_[PADL_(user_addr_t)]; user_addr_t eventlist; char eventlist_r_[PADR_(user_addr_t)]; + char nevents_l_[PADL_(int)]; int nevents; char nevents_r_[PADR_(int)]; + char timeout_l_[PADL_(user_addr_t)]; user_addr_t timeout; char timeout_r_[PADR_(user_addr_t)]; +}; +struct lchown_args { + char path_l_[PADL_(user_addr_t)]; user_addr_t path; char path_r_[PADR_(user_addr_t)]; + char owner_l_[PADL_(uid_t)]; uid_t owner; char owner_r_[PADR_(uid_t)]; + char group_l_[PADL_(gid_t)]; gid_t group; char group_r_[PADR_(gid_t)]; +}; +int nosys(struct proc *, struct nosys_args *, int *); +void exit(struct proc *, struct exit_args *, int *); +int fork(struct proc *, struct fork_args *, int *); +int read(struct proc *, struct read_args *, user_ssize_t *); +int write(struct proc *, struct write_args *, user_ssize_t *); +int open(struct proc *, struct open_args *, int *); +int close(struct proc *, struct close_args *, int *); +int wait4(struct proc *, struct wait4_args *, int *); +int link(struct proc *, struct link_args *, int *); +int unlink(struct proc *, struct unlink_args *, int *); +int chdir(struct proc *, struct chdir_args *, int *); +int fchdir(struct proc *, struct fchdir_args *, int *); +int mknod(struct proc *, struct mknod_args *, int *); +int chmod(struct proc *, struct chmod_args *, int *); +int chown(struct proc *, struct chown_args *, int *); +int obreak(struct proc *, struct obreak_args *, int *); +#if COMPAT_GETFSSTAT +int ogetfsstat(struct proc *, struct ogetfsstat_args *, int *); +#else +int getfsstat(struct proc *, struct getfsstat_args *, int *); +#endif +int getpid(struct proc *, struct getpid_args *, int *); +int setuid(struct proc *, struct setuid_args *, int *); +int getuid(struct proc *, struct getuid_args *, int *); +int geteuid(struct proc *, struct geteuid_args *, int *); +int ptrace(struct proc *, struct ptrace_args *, int *); +int recvmsg(struct proc *, struct recvmsg_args *, int *); +int sendmsg(struct proc *, struct sendmsg_args *, int *); +int recvfrom(struct proc *, struct recvfrom_args *, int *); +int accept(struct proc *, struct accept_args *, int *); +int getpeername(struct proc *, struct getpeername_args *, int *); +int getsockname(struct proc *, struct getsockname_args *, int *); +int access(struct proc *, struct access_args *, int *); +int chflags(struct proc *, struct chflags_args *, int *); +int fchflags(struct proc *, struct fchflags_args *, int *); +int sync(struct proc *, struct sync_args *, int *); +int kill(struct proc *, struct 
kill_args *, int *); +int getppid(struct proc *, struct getppid_args *, int *); +int dup(struct proc *, struct dup_args *, int *); +int pipe(struct proc *, struct pipe_args *, int *); +int getegid(struct proc *, struct getegid_args *, int *); +int profil(struct proc *, struct profil_args *, int *); +int ktrace(struct proc *, struct ktrace_args *, int *); +int sigaction(struct proc *, struct sigaction_args *, int *); +int getgid(struct proc *, struct getgid_args *, int *); +int sigprocmask(struct proc *, struct sigprocmask_args *, int *); +int getlogin(struct proc *, struct getlogin_args *, int *); +int setlogin(struct proc *, struct setlogin_args *, int *); +int acct(struct proc *, struct acct_args *, int *); +int sigpending(struct proc *, struct sigpending_args *, int *); +int sigaltstack(struct proc *, struct sigaltstack_args *, int *); +int ioctl(struct proc *, struct ioctl_args *, int *); +int reboot(struct proc *, struct reboot_args *, int *); +int revoke(struct proc *, struct revoke_args *, int *); +int symlink(struct proc *, struct symlink_args *, int *); +int readlink(struct proc *, struct readlink_args *, int *); +int execve(struct proc *, struct execve_args *, int *); +int umask(struct proc *, struct umask_args *, int *); +int chroot(struct proc *, struct chroot_args *, int *); +int msync(struct proc *, struct msync_args *, int *); +int vfork(struct proc *, struct vfork_args *, int *); +int sbrk(struct proc *, struct sbrk_args *, int *); +int sstk(struct proc *, struct sstk_args *, int *); +int ovadvise(struct proc *, struct ovadvise_args *, int *); +int munmap(struct proc *, struct munmap_args *, int *); +int mprotect(struct proc *, struct mprotect_args *, int *); +int madvise(struct proc *, struct madvise_args *, int *); +int mincore(struct proc *, struct mincore_args *, int *); +int getgroups(struct proc *, struct getgroups_args *, int *); +int setgroups(struct proc *, struct setgroups_args *, int *); +int getpgrp(struct proc *, struct getpgrp_args *, int *); +int setpgid(struct proc *, struct setpgid_args *, int *); +int setitimer(struct proc *, struct setitimer_args *, int *); +int swapon(struct proc *, struct swapon_args *, int *); +int getitimer(struct proc *, struct getitimer_args *, int *); +int getdtablesize(struct proc *, struct getdtablesize_args *, int *); +int dup2(struct proc *, struct dup2_args *, int *); +int fcntl(struct proc *, struct fcntl_args *, int *); +int select(struct proc *, struct select_args *, int *); +int fsync(struct proc *, struct fsync_args *, int *); +int setpriority(struct proc *, struct setpriority_args *, int *); +int socket(struct proc *, struct socket_args *, int *); +int connect(struct proc *, struct connect_args *, int *); +int getpriority(struct proc *, struct getpriority_args *, int *); +#ifdef __ppc__ +#else +int sigreturn(struct proc *, struct sigreturn_args *, int *); +#endif +int bind(struct proc *, struct bind_args *, int *); +int setsockopt(struct proc *, struct setsockopt_args *, int *); +int listen(struct proc *, struct listen_args *, int *); +int sigsuspend(struct proc *, struct sigsuspend_args *, int *); +#ifdef __ppc__ +int ppc_gettimeofday(struct proc *, struct ppc_gettimeofday_args *, int *); +#else +int gettimeofday(struct proc *, struct gettimeofday_args *, int *); +#endif +int getrusage(struct proc *, struct getrusage_args *, int *); +int getsockopt(struct proc *, struct getsockopt_args *, int *); +int readv(struct proc *, struct readv_args *, user_ssize_t *); +int writev(struct proc *, struct writev_args *, user_ssize_t 
*); +int settimeofday(struct proc *, struct settimeofday_args *, int *); +int fchown(struct proc *, struct fchown_args *, int *); +int fchmod(struct proc *, struct fchmod_args *, int *); +int rename(struct proc *, struct rename_args *, int *); +int flock(struct proc *, struct flock_args *, int *); +int mkfifo(struct proc *, struct mkfifo_args *, int *); +int sendto(struct proc *, struct sendto_args *, int *); +int shutdown(struct proc *, struct shutdown_args *, int *); +int socketpair(struct proc *, struct socketpair_args *, int *); +int mkdir(struct proc *, struct mkdir_args *, int *); +int rmdir(struct proc *, struct rmdir_args *, int *); +int utimes(struct proc *, struct utimes_args *, int *); +int futimes(struct proc *, struct futimes_args *, int *); +int adjtime(struct proc *, struct adjtime_args *, int *); +int setsid(struct proc *, struct setsid_args *, int *); +int getpgid(struct proc *, struct getpgid_args *, int *); +int setprivexec(struct proc *, struct setprivexec_args *, int *); +int pread(struct proc *, struct pread_args *, user_ssize_t *); +int pwrite(struct proc *, struct pwrite_args *, user_ssize_t *); +#if NFSSERVER +int nfssvc(struct proc *, struct nfssvc_args *, int *); +#else +#endif +int statfs(struct proc *, struct statfs_args *, int *); +int fstatfs(struct proc *, struct fstatfs_args *, int *); +int unmount(struct proc *, struct unmount_args *, int *); +#if NFSCLIENT +int getfh(struct proc *, struct getfh_args *, int *); +#else +#endif +int quotactl(struct proc *, struct quotactl_args *, int *); +int mount(struct proc *, struct mount_args *, int *); +int waitid(struct proc *, struct waitid_args *, int *); +int add_profil(struct proc *, struct add_profil_args *, int *); +int kdebug_trace(struct proc *, struct kdebug_trace_args *, int *); +int setgid(struct proc *, struct setgid_args *, int *); +int setegid(struct proc *, struct setegid_args *, int *); +int seteuid(struct proc *, struct seteuid_args *, int *); +#ifdef __ppc__ +int sigreturn(struct proc *, struct sigreturn_args *, int *); +#else +#endif +int stat(struct proc *, struct stat_args *, int *); +int fstat(struct proc *, struct fstat_args *, int *); +int lstat(struct proc *, struct lstat_args *, int *); +int pathconf(struct proc *, struct pathconf_args *, int *); +int fpathconf(struct proc *, struct fpathconf_args *, int *); +#if COMPAT_GETFSSTAT +int getfsstat(struct proc *, struct getfsstat_args *, int *); +#else +#endif +int getrlimit(struct proc *, struct getrlimit_args *, int *); +int setrlimit(struct proc *, struct setrlimit_args *, int *); +int getdirentries(struct proc *, struct getdirentries_args *, int *); +int mmap(struct proc *, struct mmap_args *, user_addr_t *); +int lseek(struct proc *, struct lseek_args *, off_t *); +int truncate(struct proc *, struct truncate_args *, int *); +int ftruncate(struct proc *, struct ftruncate_args *, int *); +int __sysctl(struct proc *, struct __sysctl_args *, int *); +int mlock(struct proc *, struct mlock_args *, int *); +int munlock(struct proc *, struct munlock_args *, int *); +int undelete(struct proc *, struct undelete_args *, int *); +#ifdef __ppc__ +int ATsocket(struct proc *, struct ATsocket_args *, int *); +int ATgetmsg(struct proc *, struct ATgetmsg_args *, int *); +int ATputmsg(struct proc *, struct ATputmsg_args *, int *); +int ATPsndreq(struct proc *, struct ATPsndreq_args *, int *); +int ATPsndrsp(struct proc *, struct ATPsndrsp_args *, int *); +int ATPgetreq(struct proc *, struct ATPgetreq_args *, int *); +int ATPgetrsp(struct proc *, struct 
ATPgetrsp_args *, int *); +#else +#endif /* __ppc__ */ +int kqueue_from_portset_np(struct proc *, struct kqueue_from_portset_np_args *, int *); +int kqueue_portset_np(struct proc *, struct kqueue_portset_np_args *, int *); +int getattrlist(struct proc *, struct getattrlist_args *, int *); +int setattrlist(struct proc *, struct setattrlist_args *, int *); +int getdirentriesattr(struct proc *, struct getdirentriesattr_args *, int *); +int exchangedata(struct proc *, struct exchangedata_args *, int *); +#ifdef __APPLE_API_OBSOLETE +int checkuseraccess(struct proc *, struct checkuseraccess_args *, int *); +#else +#endif /* __APPLE_API_OBSOLETE */ +int searchfs(struct proc *, struct searchfs_args *, int *); +int delete(struct proc *, struct delete_args *, int *); +int copyfile(struct proc *, struct copyfile_args *, int *); +int poll(struct proc *, struct poll_args *, int *); +int watchevent(struct proc *, struct watchevent_args *, int *); +int waitevent(struct proc *, struct waitevent_args *, int *); +int modwatch(struct proc *, struct modwatch_args *, int *); +int getxattr(struct proc *, struct getxattr_args *, user_ssize_t *); +int fgetxattr(struct proc *, struct fgetxattr_args *, user_ssize_t *); +int setxattr(struct proc *, struct setxattr_args *, int *); +int fsetxattr(struct proc *, struct fsetxattr_args *, int *); +int removexattr(struct proc *, struct removexattr_args *, int *); +int fremovexattr(struct proc *, struct fremovexattr_args *, int *); +int listxattr(struct proc *, struct listxattr_args *, user_ssize_t *); +int flistxattr(struct proc *, struct flistxattr_args *, user_ssize_t *); +int fsctl(struct proc *, struct fsctl_args *, int *); +int initgroups(struct proc *, struct initgroups_args *, int *); +#if NFSCLIENT +int nfsclnt(struct proc *, struct nfsclnt_args *, int *); +int fhopen(struct proc *, struct fhopen_args *, int *); +#else +#endif +int minherit(struct proc *, struct minherit_args *, int *); +int semsys(struct proc *, struct semsys_args *, int *); +int msgsys(struct proc *, struct msgsys_args *, int *); +int shmsys(struct proc *, struct shmsys_args *, int *); +int semctl(struct proc *, struct semctl_args *, int *); +int semget(struct proc *, struct semget_args *, int *); +int semop(struct proc *, struct semop_args *, int *); +int semconfig(struct proc *, struct semconfig_args *, int *); +int msgctl(struct proc *, struct msgctl_args *, int *); +int msgget(struct proc *, struct msgget_args *, int *); +int msgsnd(struct proc *, struct msgsnd_args *, int *); +int msgrcv(struct proc *, struct msgrcv_args *, user_ssize_t *); +int shmat(struct proc *, struct shmat_args *, int *); +int shmctl(struct proc *, struct shmctl_args *, int *); +int shmdt(struct proc *, struct shmdt_args *, int *); +int shmget(struct proc *, struct shmget_args *, int *); +int shm_open(struct proc *, struct shm_open_args *, int *); +int shm_unlink(struct proc *, struct shm_unlink_args *, int *); +int sem_open(struct proc *, struct sem_open_args *, user_addr_t *); +int sem_close(struct proc *, struct sem_close_args *, int *); +int sem_unlink(struct proc *, struct sem_unlink_args *, int *); +int sem_wait(struct proc *, struct sem_wait_args *, int *); +int sem_trywait(struct proc *, struct sem_trywait_args *, int *); +int sem_post(struct proc *, struct sem_post_args *, int *); +int sem_getvalue(struct proc *, struct sem_getvalue_args *, int *); +int sem_init(struct proc *, struct sem_init_args *, int *); +int sem_destroy(struct proc *, struct sem_destroy_args *, int *); +int open_extended(struct proc *, 
struct open_extended_args *, int *); +int umask_extended(struct proc *, struct umask_extended_args *, int *); +int stat_extended(struct proc *, struct stat_extended_args *, int *); +int lstat_extended(struct proc *, struct lstat_extended_args *, int *); +int fstat_extended(struct proc *, struct fstat_extended_args *, int *); +int chmod_extended(struct proc *, struct chmod_extended_args *, int *); +int fchmod_extended(struct proc *, struct fchmod_extended_args *, int *); +int access_extended(struct proc *, struct access_extended_args *, int *); +int settid(struct proc *, struct settid_args *, int *); +int gettid(struct proc *, struct gettid_args *, int *); +int setsgroups(struct proc *, struct setsgroups_args *, int *); +int getsgroups(struct proc *, struct getsgroups_args *, int *); +int setwgroups(struct proc *, struct setwgroups_args *, int *); +int getwgroups(struct proc *, struct getwgroups_args *, int *); +int mkfifo_extended(struct proc *, struct mkfifo_extended_args *, int *); +int mkdir_extended(struct proc *, struct mkdir_extended_args *, int *); +int identitysvc(struct proc *, struct identitysvc_args *, int *); +int load_shared_file(struct proc *, struct load_shared_file_args *, int *); +int reset_shared_file(struct proc *, struct reset_shared_file_args *, int *); +int new_system_shared_regions(struct proc *, struct new_system_shared_regions_args *, int *); +int shared_region_map_file_np(struct proc *, struct shared_region_map_file_np_args *, int *); +int shared_region_make_private_np(struct proc *, struct shared_region_make_private_np_args *, int *); +int getsid(struct proc *, struct getsid_args *, int *); +int settid_with_pid(struct proc *, struct settid_with_pid_args *, int *); +int aio_fsync(struct proc *, struct aio_fsync_args *, int *); +int aio_return(struct proc *, struct aio_return_args *, user_ssize_t *); +int aio_suspend(struct proc *, struct aio_suspend_args *, int *); +int aio_cancel(struct proc *, struct aio_cancel_args *, int *); +int aio_error(struct proc *, struct aio_error_args *, int *); +int aio_read(struct proc *, struct aio_read_args *, int *); +int aio_write(struct proc *, struct aio_write_args *, int *); +int lio_listio(struct proc *, struct lio_listio_args *, int *); +int mlockall(struct proc *, struct mlockall_args *, int *); +int munlockall(struct proc *, struct munlockall_args *, int *); +int issetugid(struct proc *, struct issetugid_args *, int *); +int __pthread_kill(struct proc *, struct __pthread_kill_args *, int *); +int pthread_sigmask(struct proc *, struct pthread_sigmask_args *, int *); +int sigwait(struct proc *, struct sigwait_args *, int *); +int __disable_threadsignal(struct proc *, struct __disable_threadsignal_args *, int *); +int __pthread_markcancel(struct proc *, struct __pthread_markcancel_args *, int *); +int __pthread_canceled(struct proc *, struct __pthread_canceled_args *, int *); +int __semwait_signal(struct proc *, struct __semwait_signal_args *, int *); +int utrace(struct proc *, struct utrace_args *, int *); +int audit(struct proc *, struct audit_args *, int *); +int auditon(struct proc *, struct auditon_args *, int *); +int getauid(struct proc *, struct getauid_args *, int *); +int setauid(struct proc *, struct setauid_args *, int *); +int getaudit(struct proc *, struct getaudit_args *, int *); +int setaudit(struct proc *, struct setaudit_args *, int *); +int getaudit_addr(struct proc *, struct getaudit_addr_args *, int *); +int setaudit_addr(struct proc *, struct setaudit_addr_args *, int *); +int auditctl(struct proc *, 
struct auditctl_args *, int *); +int kqueue(struct proc *, struct kqueue_args *, int *); +int kevent(struct proc *, struct kevent_args *, int *); +int lchown(struct proc *, struct lchown_args *, int *); + +__END_DECLS +#undef PAD_ +#undef PADL_ +#undef PADR_ + +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ + +#endif /* !_SYS_SYSPROTO_H_ */ diff --git a/bsd/sys/systm.h b/bsd/sys/systm.h index 3110e2679..e81bed439 100644 --- a/bsd/sys/systm.h +++ b/bsd/sys/systm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -95,46 +95,44 @@ #include <sys/cdefs.h> #include <sys/types.h> #include <sys/time.h> +#include <sys/ioctl.h> +#include <sys/malloc.h> +#ifdef BSD_KERNEL_PRIVATE #include <sys/tty.h> #include <sys/vm.h> -#include <sys/proc.h> #include <sys/linker_set.h> +#endif +#include <sys/proc.h> __BEGIN_DECLS +#ifdef KERNEL +#include <libkern/libkern.h> +#endif #include <kern/thread.h> +#include <kern/debug.h> __END_DECLS -#ifdef __APPLE_API_PRIVATE -extern int securelevel; /* system security level */ -extern const char *panicstr; /* panic message */ +#ifdef BSD_KERNEL_PRIVATE extern char version[]; /* system version */ extern char copyright[]; /* system copyright */ -extern struct sysent { /* system call table */ - int16_t sy_narg; /* number of args */ - int8_t sy_parallel;/* can execute in parallel */ - int8_t sy_funnel; /* funnel type */ - int32_t (*sy_call)(); /* implementing function */ -} sysent[]; -extern int nsysent; - extern int boothowto; /* reboot flags, from console subsystem */ extern int show_space; extern int nblkdev; /* number of entries in bdevsw */ extern int nchrdev; /* number of entries in cdevsw */ -extern dev_t rootdev; /* root device */ -extern struct vnode *rootvp; /* vnode equivalent to above */ -#endif /* __APPLE_API_PRIVATE */ +#endif /* BSD_KERNEL_PRIVATE */ -#ifdef __APPLE_API_UNSTABLE +#ifdef KERNEL_PRIVATE #define NO_FUNNEL 0 #define KERNEL_FUNNEL 1 -#define NETWORK_FUNNEL 2 +extern int securelevel; /* system security level */ +extern dev_t rootdev; /* root device */ +extern struct vnode *rootvp; /* vnode equivalent to above */ extern funnel_t * kernel_flock; -extern funnel_t * network_flock; -#endif /* __APPLE_API_UNSTABLE */ + +#endif /* KERNEL_PRIVATE */ #define SYSINIT(a,b,c,d,e) #define MALLOC_DEFINE(a,b,c) @@ -146,95 +144,85 @@ extern funnel_t * network_flock; * General function declarations. 
*/ __BEGIN_DECLS -int nullop __P((void)); -int enodev (); /* avoid actual prototype for multiple use */ -void enodev_strat(); -int nulldev(); -int enoioctl __P((void)); -int enxio __P((void)); -int eopnotsupp __P((void)); -int einval __P((void)); - -#ifdef __APPLE_API_UNSTABLE -int seltrue __P((dev_t dev, int which, struct proc *p)); +int nullop(void); +int nulldev(void); +int enoioctl(void); +int enxio(void); +int eopnotsupp(void); +int einval(void); + +#ifdef BSD_KERNEL_PRIVATE +int seltrue(dev_t dev, int which, struct proc *p); +void ttyprintf(struct tty *, const char *, ...); +void realitexpire(void *); +int hzto(struct timeval *tv); #endif /* __APPLE_API_UNSTABLE */ -void *hashinit __P((int count, int type, u_long *hashmask)); -int nosys __P((struct proc *, void *, register_t *)); +void *hashinit(int count, int type, u_long *hashmask); + +void tablefull(const char *); + +int kvprintf(char const *, void (*)(int, void*), void *, int, + __darwin_va_list); + +void uprintf(const char *, ...); + + +void ovbcopy(const void *from, void *to, size_t len); +int copywithin(void *saddr, void *daddr, size_t len); + +int fubyte(user_addr_t addr); +int fuibyte(user_addr_t addr); +int subyte(user_addr_t addr, int byte); +int suibyte(user_addr_t addr, int byte); +long fuword(user_addr_t addr); +long fuiword(user_addr_t addr); +int suword(user_addr_t addr, long word); +int suiword(user_addr_t addr, long word); +int64_t fulong(user_addr_t addr); +int sulong(user_addr_t addr, int64_t longword); +uint64_t fuulong(user_addr_t addr); +int suulong(user_addr_t addr, uint64_t ulongword); +#define fusize(_a) ((user_size_t)fulong(_a)) +#define susize(_a, _s) sulong((_a), (_s)) +#define fuptr(_a) ((user_addr_t)fulong(_a)) +#define suptr(_a, _p) sulong((_a), (_p)) +int useracc(user_addr_t addr, user_size_t len, int prot); -#ifdef __GNUC__ -volatile void panic __P((const char *, ...)); -#else -void panic __P((const char *, ...)); -#endif -void tablefull __P((const char *)); -void log __P((int, const char *, ...)); -void kprintf __P((const char *, ...)); -void ttyprintf __P((struct tty *, const char *, ...)); - -int kvprintf __P((char const *, void (*)(int, void*), void *, int, - _BSD_VA_LIST_)); - -int snprintf __P((char *, size_t, const char *, ...)); -int sprintf __P((char *buf, const char *, ...)); -void uprintf __P((const char *, ...)); -void vprintf __P((const char *, _BSD_VA_LIST_)); -int vsnprintf __P((char *, size_t, const char *, _BSD_VA_LIST_)); -int vsprintf __P((char *buf, const char *, _BSD_VA_LIST_)); - -void bcopy __P((const void *from, void *to, size_t len)); -void ovbcopy __P((const void *from, void *to, size_t len)); -void bzero __P((void *buf, size_t len)); - -int copystr __P((void *kfaddr, void *kdaddr, size_t len, size_t *done)); -int copyinstr __P((void *udaddr, void *kaddr, size_t len, size_t *done)); -int copyoutstr __P((void *kaddr, void *udaddr, size_t len, size_t *done)); -int copyin __P((void *udaddr, void *kaddr, size_t len)); -int copyout __P((void *kaddr, void *udaddr, size_t len)); -int copywithin __P((void *saddr, void *daddr, size_t len)); - -int fubyte __P((void *base)); -#ifdef notdef -int fuibyte __P((void *base)); -#endif -int subyte __P((void *base, int byte)); -int suibyte __P((void *base, int byte)); -long fuword __P((void *base)); -long fuiword __P((void *base)); -int suword __P((void *base, long word)); -int suiword __P((void *base, long word)); - -#ifdef __APPLE_API_UNSTABLE -int hzto __P((struct timeval *tv)); typedef void (*timeout_fcn_t)(void *); -void timeout __P((void
(*)(void *), void *arg, int ticks)); -void untimeout __P((void (*)(void *), void *arg)); -void realitexpire __P((void *)); -#endif /* __APPLE_API_UNSTABLE */ +#ifdef KERNEL_PRIVATE +void timeout(void (*)(void *), void *arg, int ticks); +void untimeout(void (*)(void *), void *arg); +#endif /* KERNEL_PRIVATE */ +void bsd_timeout(void (*)(void *), void *arg, struct timespec * ts); +void bsd_untimeout(void (*)(void *), void *arg); -#ifdef __APPLE_API_PRIVATE -void bsd_hardclock __P((boolean_t usermode, caddr_t pc, int numticks)); -void gatherstats __P((boolean_t usermode, caddr_t pc)); +void set_fsblocksize(struct vnode *); -void initclocks __P((void)); +#ifdef BSD_KERNEL_PRIVATE +int vslock(user_addr_t addr, user_size_t len); +int vsunlock(user_addr_t addr, user_size_t len, int dirtied); +int clone_system_shared_regions(int shared_regions_active, + int chain_regions, + int base_vnode); -void startprofclock __P((struct proc *)); -void stopprofclock __P((struct proc *)); -void setstatclockrate __P((int hzrate)); -#ifdef DDB -/* debugger entry points */ -int Debugger __P((void)); /* in DDB only */ -#endif +extern kern_return_t bsd_exception(int, exception_data_type_t codes[], int); +extern void bsdinit_task(void); +void bsd_hardclock(boolean_t usermode, caddr_t pc, int numticks); +void gatherstats(boolean_t usermode, caddr_t pc); -void set_fsblocksize __P((struct vnode *)); -#endif /* __APPLE_API_PRIVATE */ +void initclocks(void); -void addlog __P((const char *, ...)); -void printf __P((const char *, ...)); +void startprofclock(struct proc *); +void stopprofclock(struct proc *); +void setstatclockrate(int hzrate); -extern boolean_t thread_funnel_switch(int oldfnl, int newfnl); +struct time_value; +void get_procrustime(struct time_value *tv); + +void load_init_program(struct proc *p); +#endif /* BSD_KERNEL_PRIVATE */ -#include <libkern/libkern.h> __END_DECLS diff --git a/bsd/sys/table.h b/bsd/sys/table.h deleted file mode 100644 index a59713b4e..000000000 --- a/bsd/sys/table.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * Mach Operating System - * Copyright (c) 1986 Carnegie-Mellon University - * All rights reserved. The CMU software License Agreement specifies - * the terms and conditions for use and redistribution. - */ - -#ifndef _SYS_TABLE_ -#define _SYS_TABLE_ - -#include <sys/appleapiopts.h> - -#warning obsolete header! Please delete the include from your sources. 
- -#ifdef KERNEL_PRIVATE - -#ifdef __APPLE_API_OBSOLETE -#include <sys/dkstat.h> -#include <machine/table.h> - -#define TBL_LOADAVG 3 /* (no index) */ -#define TBL_ARGUMENTS 6 /* index by process ID */ -#define TBL_PROCINFO 10 /* index by proc table slot */ -#define TBL_MACHFACTOR 11 /* index by cpu number */ -#define TBL_CPUINFO 12 /* (no index), generic CPU info */ - -/* - * Machine specific table id base - */ -#define TBL_MACHDEP_BASE 0x4000 /* Machine dependent codes start here */ - -/* - * Return codes from machine dependent calls - */ -#define TBL_MACHDEP_NONE 0 /* Not handled by machdep code */ -#define TBL_MACHDEP_OKAY 1 /* Handled by machdep code */ -#define TBL_MACHDEP_BAD -1 /* Bad status from machdep code */ - - - -/* - * TBL_LOADAVG data layout - * (used by TBL_MACHFACTOR too) - */ -struct tbl_loadavg -{ - long tl_avenrun[3]; - int tl_lscale; /* 0 scale when floating point */ -}; - -/* - * TBL_PROCINFO data layout - */ -#define PI_COMLEN 19 /* length of command string */ -struct tbl_procinfo -{ - int pi_uid; /* user ID */ - int pi_pid; /* proc ID */ - int pi_ppid; /* parent proc ID */ - int pi_pgrp; /* proc group ID */ - int pi_ttyd; /* controlling terminal number */ - int pi_status; /* process status: */ -#define PI_EMPTY 0 /* no process */ -#define PI_ACTIVE 1 /* active process */ -#define PI_EXITING 2 /* exiting */ -#define PI_ZOMBIE 3 /* zombie */ - int pi_flag; /* other random flags */ - char pi_comm[PI_COMLEN+1]; - /* short command name */ -}; - -/* - * TBL_CPUINFO data layout - */ -struct tbl_cpuinfo -{ - int ci_swtch; /* # context switches */ - int ci_intr; /* # interrupts */ - int ci_syscall; /* # system calls */ - int ci_traps; /* # system traps */ - int ci_hz; /* # ticks per second */ - int ci_phz; /* profiling hz */ - int ci_cptime[CPUSTATES]; /* cpu state times */ -}; - - - -#ifdef KERNEL -/* - * Machine specific procedure prototypes. - */ -int machine_table(int id, int index, caddr_t addr, int nel, u_int lel, int set); -int machine_table_setokay(int id); -#endif /* KERNEL */ - -#endif /* __APPLE_API_OBSOLETE */ - -#endif /* KERNEL_PRIVATE */ -#endif /* _SYS_TABLE_ */ - diff --git a/bsd/sys/termios.h b/bsd/sys/termios.h index 71afa6606..47ea1c9ec 100644 --- a/bsd/sys/termios.h +++ b/bsd/sys/termios.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -58,6 +58,8 @@ #ifndef _SYS_TERMIOS_H_ #define _SYS_TERMIOS_H_ +#include <sys/cdefs.h> + /* * Special Control Characters * @@ -67,33 +69,33 @@ */ #define VEOF 0 /* ICANON */ #define VEOL 1 /* ICANON */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define VEOL2 2 /* ICANON together with IEXTEN */ #endif #define VERASE 3 /* ICANON */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define VWERASE 4 /* ICANON together with IEXTEN */ #endif #define VKILL 5 /* ICANON */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define VREPRINT 6 /* ICANON together with IEXTEN */ #endif /* 7 spare 1 */ #define VINTR 8 /* ISIG */ #define VQUIT 9 /* ISIG */ #define VSUSP 10 /* ISIG */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define VDSUSP 11 /* ISIG together with IEXTEN */ #endif #define VSTART 12 /* IXON, IXOFF */ #define VSTOP 13 /* IXON, IXOFF */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define VLNEXT 14 /* IEXTEN */ #define VDISCARD 15 /* IEXTEN */ #endif #define VMIN 16 /* !ICANON */ #define VTIME 17 /* !ICANON */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define VSTATUS 18 /* ICANON together with IEXTEN */ /* 19 spare 2 */ #endif @@ -103,7 +105,7 @@ #define _POSIX_VDISABLE 0xff #endif -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define CCEQ(val, c) ((c) == (val) ? (val) != _POSIX_VDISABLE : 0) #endif @@ -121,25 +123,75 @@ #define ICRNL 0x00000100 /* map CR to NL (ala CRMOD) */ #define IXON 0x00000200 /* enable output flow control */ #define IXOFF 0x00000400 /* enable input flow control */ -#ifndef _POSIX_SOURCE #define IXANY 0x00000800 /* any char will restart after stop */ +#ifndef _POSIX_C_SOURCE #define IMAXBEL 0x00002000 /* ring bell on input queue full */ -#endif /*_POSIX_SOURCE */ +#endif /*_POSIX_C_SOURCE */ /* * Output flags - software output processing */ #define OPOST 0x00000001 /* enable following output processing */ -#ifndef _POSIX_SOURCE #define ONLCR 0x00000002 /* map NL to CR-NL (ala CRMOD) */ +#ifndef _POSIX_C_SOURCE #define OXTABS 0x00000004 /* expand tabs to spaces */ #define ONOEOT 0x00000008 /* discard EOT's (^D) on output */ -#endif /*_POSIX_SOURCE */ +#endif /*_POSIX_C_SOURCE */ +/* + * The following block of features is unimplemented. Use of these flags in + * programs will currently result in unexpected behaviour. + * + * - Begin unimplemented features + */ +#define OCRNL 0x00000010 /* map CR to NL on output */ +#define ONOCR 0x00000020 /* no CR output at column 0 */ +#define ONLRET 0x00000040 /* NL performs CR function */ +#define OFILL 0x00000080 /* use fill characters for delay */ +#define NLDLY 0x00000300 /* \n delay */ +#define TABDLY 0x00000c00 /* horizontal tab delay */ +#define CRDLY 0x00003000 /* \r delay */ +#define FFDLY 0x00004000 /* form feed delay */ +#define BSDLY 0x00008000 /* \b delay */ +#define VTDLY 0x00010000 /* vertical tab delay */ +#define OFDEL 0x00020000 /* fill is DEL, else NUL */ +#if !defined(_SYS_IOCTL_COMPAT_H_) || defined(_POSIX_C_SOURCE) +/* + * These manifest constants have the same names as those in the header + * <sys/ioctl_compat.h>, so you are not permitted to have both definitions + * in scope simultaneously in the same compilation unit. Nevertheless, + * they are required to be in scope when _POSIX_C_SOURCE is requested; + * this means that including the <sys/ioctl_compat.h> header before this + * one when _POSIX_C_SOURCE is in scope will result in redefinitions.
We + * attempt to maintain these as the same values so as to avoid this being + * an outright error in most compilers. + */ +#define NL0 0x00000000 +#define NL1 0x00000100 +#define NL2 0x00000200 +#define NL3 0x00000300 +#define TAB0 0x00000000 +#define TAB1 0x00000400 +#define TAB2 0x00000800 +#define TAB3 0x00000c00 +#define CR0 0x00000000 +#define CR1 0x00001000 +#define CR2 0x00002000 +#define CR3 0x00003000 +#define FF0 0x00000000 +#define FF1 0x00004000 +#define BS0 0x00000000 +#define BS1 0x00008000 +#define VT0 0x00000000 +#define VT1 0x00010000 +#endif /* !_SYS_IOCTL_COMPAT_H_ */ +/* + * + End unimplemented features + */ /* * Control flags - hardware control of terminal */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define CIGNORE 0x00000001 /* ignore control flags */ #endif #define CSIZE 0x00000300 /* character size mask */ @@ -153,7 +205,7 @@ #define PARODD 0x00002000 /* odd parity, else even */ #define HUPCL 0x00004000 /* hang up on last close */ #define CLOCAL 0x00008000 /* ignore modem status lines */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define CCTS_OFLOW 0x00010000 /* CTS flow control of output */ #define CRTSCTS (CCTS_OFLOW | CRTS_IFLOW) #define CRTS_IFLOW 0x00020000 /* RTS flow control of input */ @@ -172,30 +224,30 @@ * input flag. */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define ECHOKE 0x00000001 /* visual erase for line kill */ -#endif /*_POSIX_SOURCE */ +#endif /*_POSIX_C_SOURCE */ #define ECHOE 0x00000002 /* visually erase chars */ #define ECHOK 0x00000004 /* echo NL after line kill */ #define ECHO 0x00000008 /* enable echoing */ #define ECHONL 0x00000010 /* echo NL even if ECHO is off */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define ECHOPRT 0x00000020 /* visual erase mode for hardcopy */ #define ECHOCTL 0x00000040 /* echo control chars as ^(Char) */ -#endif /*_POSIX_SOURCE */ +#endif /*_POSIX_C_SOURCE */ #define ISIG 0x00000080 /* enable signals INTR, QUIT, [D]SUSP */ #define ICANON 0x00000100 /* canonicalize input lines */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define ALTWERASE 0x00000200 /* use alternate WERASE algorithm */ -#endif /*_POSIX_SOURCE */ +#endif /*_POSIX_C_SOURCE */ #define IEXTEN 0x00000400 /* enable DISCARD and LNEXT */ #define EXTPROC 0x00000800 /* external processing */ #define TOSTOP 0x00400000 /* stop background jobs from output */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define FLUSHO 0x00800000 /* output being flushed (state) */ #define NOKERNINFO 0x02000000 /* no kernel output from VSTATUS */ #define PENDIN 0x20000000 /* XXX retype pending input (state) */ -#endif /*_POSIX_SOURCE */ +#endif /*_POSIX_C_SOURCE */ #define NOFLSH 0x80000000 /* don't flush after interrupt */ typedef unsigned long tcflag_t; @@ -212,13 +264,43 @@ struct termios { speed_t c_ospeed; /* output speed */ }; +#ifdef KERNEL +typedef unsigned long long user_tcflag_t; +typedef unsigned long long user_speed_t; + +/* + * LP64 version of struct termios. tcflag_t and speed_t are long and must + * grow when we're dealing with a 64-bit process. 
+ * WARNING - keep in sync with struct termios + */ + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_termios { + user_tcflag_t c_iflag; /* input flags */ + user_tcflag_t c_oflag; /* output flags */ + user_tcflag_t c_cflag; /* control flags */ + user_tcflag_t c_lflag; /* local flags */ + cc_t c_cc[NCCS]; /* control chars */ + user_speed_t c_ispeed; /* input speed */ + user_speed_t c_ospeed; /* output speed */ +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +#endif /* KERNEL */ + /* * Commands passed to tcsetattr() for setting the termios structure. */ #define TCSANOW 0 /* make change immediate */ #define TCSADRAIN 1 /* drain output, then change */ #define TCSAFLUSH 2 /* drain output, flush input */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define TCSASOFT 0x10 /* flag - don't alter h.w. state */ #endif @@ -241,7 +323,7 @@ struct termios { #define B9600 9600 #define B19200 19200 #define B38400 38400 -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define B7200 7200 #define B14400 14400 #define B28800 28800 @@ -251,7 +333,7 @@ struct termios { #define B230400 230400 #define EXTA 19200 #define EXTB 38400 -#endif /* !_POSIX_SOURCE */ +#endif /* !_POSIX_C_SOURCE */ #ifndef KERNEL @@ -266,26 +348,26 @@ struct termios { #include <sys/cdefs.h> __BEGIN_DECLS -speed_t cfgetispeed __P((const struct termios *)); -speed_t cfgetospeed __P((const struct termios *)); -int cfsetispeed __P((struct termios *, speed_t)); -int cfsetospeed __P((struct termios *, speed_t)); -int tcgetattr __P((int, struct termios *)); -int tcsetattr __P((int, int, const struct termios *)); -int tcdrain __P((int)); -int tcflow __P((int, int)); -int tcflush __P((int, int)); -int tcsendbreak __P((int, int)); +speed_t cfgetispeed(const struct termios *); +speed_t cfgetospeed(const struct termios *); +int cfsetispeed(struct termios *, speed_t); +int cfsetospeed(struct termios *, speed_t); +int tcgetattr(int, struct termios *); +int tcsetattr(int, int, const struct termios *); +int tcdrain(int); +int tcflow(int, int); +int tcflush(int, int); +int tcsendbreak(int, int); -#ifndef _POSIX_SOURCE -void cfmakeraw __P((struct termios *)); -int cfsetspeed __P((struct termios *, speed_t)); -#endif /* !_POSIX_SOURCE */ +#ifndef _POSIX_C_SOURCE +void cfmakeraw(struct termios *); +int cfsetspeed(struct termios *, speed_t); +#endif /* !_POSIX_C_SOURCE */ __END_DECLS #endif /* !KERNEL */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE /* * Include tty ioctl's that aren't just for backwards compatibility @@ -300,6 +382,6 @@ __END_DECLS */ #endif /* !_SYS_TERMIOS_H_ */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #include <sys/ttydefaults.h> #endif diff --git a/bsd/sys/time.h b/bsd/sys/time.h index 49bdecbbc..a7791b3c3 100644 --- a/bsd/sys/time.h +++ b/bsd/sys/time.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -58,27 +58,145 @@ #ifndef _SYS_TIME_H_ #define _SYS_TIME_H_ -#include <sys/appleapiopts.h> -#include <sys/types.h> +#include <sys/cdefs.h> +#include <sys/_types.h> + +#ifndef _TIME_T +#define _TIME_T +typedef __darwin_time_t time_t; +#endif + +#ifndef _SUSECONDS_T +#define _SUSECONDS_T +typedef __darwin_suseconds_t suseconds_t; +#endif + /* * Structure returned by gettimeofday(2) system call, * and used in other calls. 
*/ +#ifndef _TIMEVAL +#define _TIMEVAL struct timeval { - int32_t tv_sec; /* seconds */ - int32_t tv_usec; /* and microseconds */ + time_t tv_sec; /* seconds */ + suseconds_t tv_usec; /* and microseconds */ +}; +#endif /* _TIMEVAL */ + +/* + * Structure used as a parameter by getitimer(2) and setitimer(2) system + * calls. + */ +struct itimerval { + struct timeval it_interval; /* timer interval */ + struct timeval it_value; /* current value */ }; +/* + * Names of the interval timers, and structure + * defining a timer setting. + */ +#define ITIMER_REAL 0 +#define ITIMER_VIRTUAL 1 +#define ITIMER_PROF 2 + + +/* + * [XSI] The fd_set type shall be defined as described in <sys/select.h>. + * + * Note: We use _FD_SET to protect all select related + * types and macros + */ +#ifndef _FD_SET +#define _FD_SET + +/* + * Select uses bit masks of file descriptors in longs. These macros + * manipulate such bit fields (the filesystem macros use chars). The + * extra protection here is to permit application redefinition above + * the default size. + */ +#ifndef FD_SETSIZE +#define FD_SETSIZE 1024 +#endif + +#define __DARWIN_NBBY 8 /* bits in a byte */ +#define __DARWIN_NFDBITS (sizeof(__int32_t) * __DARWIN_NBBY) /* bits per mask */ +#define __DARWIN_howmany(x, y) (((x) + ((y) - 1)) / (y)) /* # y's == x bits? */ + +__BEGIN_DECLS +typedef struct fd_set { + __int32_t fds_bits[__DARWIN_howmany(FD_SETSIZE, __DARWIN_NFDBITS)]; +} fd_set; +__END_DECLS + +#define FD_SET(n, p) ((p)->fds_bits[(n)/__DARWIN_NFDBITS] |= (1<<((n) % __DARWIN_NFDBITS))) +#define FD_CLR(n, p) ((p)->fds_bits[(n)/__DARWIN_NFDBITS] &= ~(1<<((n) % __DARWIN_NFDBITS))) +#define FD_ISSET(n, p) ((p)->fds_bits[(n)/__DARWIN_NFDBITS] & (1<<((n) % __DARWIN_NFDBITS))) +#if __GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 3 +/* + * Use the built-in bzero function instead of the library version so that + * we do not pollute the namespace or introduce prototype warnings. + */ +#define FD_ZERO(p) __builtin_bzero(p, sizeof(*(p))) +#else +#define FD_ZERO(p) bzero(p, sizeof(*(p))) +#endif +#ifndef _POSIX_C_SOURCE +#define FD_COPY(f, t) bcopy(f, t, sizeof(*(f))) +#endif /* !_POSIX_C_SOURCE */ + +#endif /* !_FD_SET */ + + +#ifndef _POSIX_C_SOURCE /* * Structure defined by POSIX.4 to be like a timeval. */ -#ifndef _TIMESPEC_DECLARED -#define _TIMESPEC_DECLARED +#ifndef _TIMESPEC +#define _TIMESPEC struct timespec { time_t tv_sec; /* seconds */ + long tv_nsec; /* and nanoseconds */ +}; + +#ifdef KERNEL +// LP64todo - should this move? +#include <machine/types.h> /* user_time_t */ + +/* LP64 version of struct timeval. time_t is a long and must grow when + * we're dealing with a 64-bit process. + * WARNING - keep in sync with struct timeval + */ +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_timeval { + user_time_t tv_sec; /* seconds */ + suseconds_t tv_usec; /* and microseconds */ +}; + +struct user_itimerval { + struct user_timeval it_interval; /* timer interval */ + struct user_timeval it_value; /* current value */ +}; + +/* LP64 version of struct timespec. time_t is a long and must grow when + * we're dealing with a 64-bit process. 
+ * WARNING - keep in sync with struct timespec + */ +struct user_timespec { + user_time_t tv_sec; /* seconds */ int32_t tv_nsec; /* and nanoseconds */ }; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +#endif // KERNEL #endif #define TIMEVAL_TO_TIMESPEC(tv, ts) { \ @@ -102,8 +220,6 @@ struct timezone { #define DST_EET 5 /* Eastern European dst */ #define DST_CAN 6 /* Canada */ -#define time_second time.tv_sec - /* Operations on timevals. */ #define timerclear(tvp) (tvp)->tv_sec = (tvp)->tv_usec = 0 #define timerisset(tvp) ((tvp)->tv_sec || (tvp)->tv_usec) @@ -132,19 +248,6 @@ struct timezone { #define timevalcmp(l, r, cmp) timercmp(l, r, cmp) /* freebsd */ -/* - * Names of the interval timers, and structure - * defining a timer setting. - */ -#define ITIMER_REAL 0 -#define ITIMER_VIRTUAL 1 -#define ITIMER_PROF 2 - -struct itimerval { - struct timeval it_interval; /* timer interval */ - struct timeval it_value; /* current value */ -}; - /* * Getkerninfo clock information structure */ @@ -155,39 +258,56 @@ struct clockinfo { int stathz; /* statistics clock frequency */ int profhz; /* profiling clock frequency */ }; +#endif /* ! _POSIX_C_SOURCE */ -#include <sys/cdefs.h> #ifdef KERNEL -void microtime __P((struct timeval *tv)); -void microuptime __P((struct timeval *tv)); + +#ifndef _POSIX_C_SOURCE +__BEGIN_DECLS +void microtime(struct timeval *tv); +void microuptime(struct timeval *tv); #define getmicrotime(a) microtime(a) #define getmicrouptime(a) microuptime(a) -void nanotime __P((struct timespec *ts)); -void nanouptime __P((struct timespec *ts)); +void nanotime(struct timespec *ts); +void nanouptime(struct timespec *ts); #define getnanotime(a) nanotime(a) #define getnanouptime(a) nanouptime(a) -#ifdef __APPLE_API_PRIVATE -int itimerfix __P((struct timeval *tv)); -int itimerdecr __P((struct itimerval *itp, int usec)); -#endif /* __APPLE_API_PRIVATE */ +void timevaladd(struct timeval *t1, struct timeval *t2); +void timevalsub(struct timeval *t1, struct timeval *t2); +void timevalfix(struct timeval *t1); +#ifdef BSD_KERNEL_PRIVATE +time_t boottime_sec(void); +void inittodr(time_t base); +int itimerfix(struct timeval *tv); +int itimerdecr(struct itimerval *itp, int usec); +#endif /* BSD_KERNEL_PRIVATE */ + +__END_DECLS + +#endif /* ! _POSIX_C_SOURCE */ #else /* !KERNEL */ + +__BEGIN_DECLS + +#ifndef _POSIX_C_SOURCE #include <time.h> -#ifndef _POSIX_SOURCE -#include <sys/cdefs.h> +int adjtime(const struct timeval *, struct timeval *); +int futimes(int, const struct timeval *); +int settimeofday(const struct timeval *, const struct timezone *); +#endif /* ! 
_POSIX_C_SOURCE */ + +int getitimer(int, struct itimerval *); +int gettimeofday(struct timeval * __restrict, struct timezone * __restrict); +int select(int, fd_set * __restrict, fd_set * __restrict, + fd_set * __restrict, struct timeval * __restrict); +int setitimer(int, const struct itimerval * __restrict, + struct itimerval * __restrict); +int utimes(const char *, const struct timeval *); -__BEGIN_DECLS -int adjtime __P((const struct timeval *, struct timeval *)); -int futimes __P((int, const struct timeval *)); -int getitimer __P((int, struct itimerval *)); -int gettimeofday __P((struct timeval *, struct timezone *)); -int setitimer __P((int, const struct itimerval *, struct itimerval *)); -int settimeofday __P((const struct timeval *, const struct timezone *)); -int utimes __P((const char *, const struct timeval *)); __END_DECLS -#endif /* !POSIX */ #endif /* !KERNEL */ diff --git a/bsd/sys/timeb.h b/bsd/sys/timeb.h index 9277d37d7..604a4fabd 100644 --- a/bsd/sys/timeb.h +++ b/bsd/sys/timeb.h @@ -64,17 +64,30 @@ #define _SYS_TIMEB_H_ #include <sys/appleapiopts.h> +#include <sys/cdefs.h> +#include <sys/_types.h> -#ifdef __APPLE_API_OBSOLETE +/* [XSI] The time_t type shall be defined as described in <sys/types.h> */ +#ifndef _TIME_T +#define _TIME_T +typedef __darwin_time_t time_t; +#endif -/* The ftime(2) system call structure -- deprecated. */ +/* + * [XSI] Structure whose address is passed as the first parameter to ftime() + */ struct timeb { - time_t time; /* seconds since the Epoch */ - unsigned short millitm; /* + milliseconds since the Epoch */ - short timezone; /* minutes west of CUT */ - short dstflag; /* DST == non-zero */ + time_t time; /* [XSI] Seconds since the Epoch */ + unsigned short millitm; /* [XSI] Milliseconds since the Epoch */ + short timezone; /* [XSI] Minutes west of CUT */ + short dstflag; /* [XSI] non-zero if DST in effect */ }; -#endif /* __APPLE_API_OBSOLETE */ +#ifndef KERNEL +__BEGIN_DECLS +/* [XSI] Legacy interface */ +int ftime(struct timeb *); +__END_DECLS +#endif /* !KERNEL */ #endif /* !_SYS_TIMEB_H_ */ diff --git a/bsd/sys/times.h b/bsd/sys/times.h index 01d0a3734..0cd072e49 100644 --- a/bsd/sys/times.h +++ b/bsd/sys/times.h @@ -63,25 +63,29 @@ #ifndef _SYS_TIMES_H_ #define _SYS_TIMES_H_ -#include <machine/ansi.h> +#include <sys/appleapiopts.h> +#include <sys/cdefs.h> +#include <sys/_types.h> -#ifndef _BSD_CLOCK_T_DEFINED_ -#define _BSD_CLOCK_T_DEFINED_ -typedef _BSD_CLOCK_T_ clock_t; +/* [XSI] The clock_t type shall be defined as described in <sys/types.h> */ +#ifndef _CLOCK_T +#define _CLOCK_T +typedef __darwin_clock_t clock_t; #endif +/* + * [XSI] Structure whose address is passed as the first parameter to times() + */ struct tms { - clock_t tms_utime; /* User CPU time */ - clock_t tms_stime; /* System CPU time */ - clock_t tms_cutime; /* User CPU time of terminated child procs */ - clock_t tms_cstime; /* System CPU time of terminated child procs */ + clock_t tms_utime; /* [XSI] User CPU time */ + clock_t tms_stime; /* [XSI] System CPU time */ + clock_t tms_cutime; /* [XSI] Terminated children user CPU time */ + clock_t tms_cstime; /* [XSI] Terminated children System CPU time */ }; #ifndef KERNEL -#include <sys/cdefs.h> - __BEGIN_DECLS -clock_t times __P((struct tms *)); +clock_t times(struct tms *); __END_DECLS #endif #endif /* !_SYS_TIMES_H_ */ diff --git a/bsd/sys/tprintf.h b/bsd/sys/tprintf.h index 8eaa93748..16f35aa38 100644 --- a/bsd/sys/tprintf.h +++ b/bsd/sys/tprintf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. 
All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -59,15 +59,18 @@ #define _SYS_TPRINTF_H_ #include <sys/appleapiopts.h> +#include <sys/cdefs.h> #ifdef __APPLE_API_UNSTABLE typedef struct session *tpr_t; -tpr_t tprintf_open __P((struct proc *)); -void tprintf_close __P((tpr_t)); +__BEGIN_DECLS +tpr_t tprintf_open(struct proc *); +void tprintf_close(tpr_t); +void tprintf(tpr_t, const char *fmt, ...); +__END_DECLS -void tprintf __P((tpr_t, const char *fmt, ...)); #endif /* __APPLE_API_UNSTABLE */ #endif /* !_SYS_TPRINTF_H_ */ diff --git a/bsd/sys/trace.h b/bsd/sys/trace.h index c77ef1ca8..04d279aa5 100644 --- a/bsd/sys/trace.h +++ b/bsd/sys/trace.h @@ -133,7 +133,7 @@ extern struct proc *traceproc; extern int tracewhich, tracebuf[TRCSIZ]; extern u_int tracex; extern char traceflags[TR_NFLAGS]; -#define pack(v,b) (((v)->v_mount->mnt_stat.f_fsid.val[0])<<16)|(b) +#define pack(v,b) (((v)->v_mount->mnt_vfsstat.f_fsid.val[0])<<16)|(b) #define trace(a,b,c) { \ if (traceflags[a]) \ trace1(a,b,c); \ diff --git a/bsd/sys/tty.h b/bsd/sys/tty.h index c24e3083b..b74c3f510 100644 --- a/bsd/sys/tty.h +++ b/bsd/sys/tty.h @@ -68,22 +68,8 @@ #include <sys/termios.h> #include <sys/select.h> /* For struct selinfo. */ -#ifdef __APPLE_API_UNSTABLE -#ifndef __APPLE__ -/* - * Clists are character lists, which is a variable length linked list - * of cblocks, with a count of the number of characters in the list. - */ -struct clist { - int c_cc; /* Number of characters in the clist. */ - int c_cbcount; /* Number of cblocks. */ - int c_cbmax; /* Max # cblocks allowed for this clist. */ - int c_cbreserved; /* # cblocks reserved for this clist. */ - char *c_cf; /* Pointer to the first cblock. */ - char *c_cl; /* Pointer to the last cblock. */ -}; -#else /* __APPLE__ */ +#ifdef KERNEL /* * NetBSD Clists are actually ring buffers. The c_cc, c_cf, c_cl fields have * exactly the same behaviour as in true clists. @@ -106,7 +92,6 @@ struct clist { #define TTYCLSIZE 1024 #endif -#endif /* __APPLE__ */ /* * Per-tty structure. @@ -134,11 +119,11 @@ struct tty { struct termios t_termios; /* Termios state. */ struct winsize t_winsize; /* Window size. */ /* Start output. */ - void (*t_oproc) __P((struct tty *)); + void (*t_oproc)(struct tty *); /* Stop output. */ - void (*t_stop) __P((struct tty *, int)); + void (*t_stop)(struct tty *, int); /* Set hardware state. */ - int (*t_param) __P((struct tty *, struct termios *)); + int (*t_param)(struct tty *, struct termios *); void *t_sc; /* XXX: net/if_sl.c:sl_softc. */ int t_column; /* Tty output column. */ int t_rocount, t_rocol; /* Tty. */ @@ -173,11 +158,13 @@ struct tty { #define TTYHOG 1024 #endif -#ifdef KERNEL #define TTMAXHIWAT roundup(2048, CBSIZE) #define TTMINHIWAT roundup(100, CBSIZE) #define TTMAXLOWAT 256 #define TTMINLOWAT 32 +#else +struct tty; +struct clist; #endif /* KERNEL */ /* These flags are kept in t_state. */ @@ -247,6 +234,7 @@ struct speedtab { #define TTY_OE 0x04000000 /* Overrun error */ #define TTY_BI 0x08000000 /* Break condition */ +#ifdef KERNEL /* Is tp controlling terminal for p? */ #define isctty(p, tp) \ ((p)->p_session == (tp)->t_session && (p)->p_flag & P_CONTROLT) @@ -265,87 +253,63 @@ struct speedtab { #define TSA_PTS_READ(tp) ((void *)&(tp)->t_canq) -#ifdef KERNEL __BEGIN_DECLS -#ifndef __APPLE__ -extern struct tty *constty; /* Temporary virtual console. 
*/ - -int b_to_q __P((char *cp, int cc, struct clist *q)); -void catq __P((struct clist *from, struct clist *to)); -void clist_alloc_cblocks __P((struct clist *q, int ccmax, int ccres)); -void clist_free_cblocks __P((struct clist *q)); -/* void clist_init __P((void)); */ /* defined in systm.h for main() */ -int getc __P((struct clist *q)); -void ndflush __P((struct clist *q, int cc)); -int ndqb __P((struct clist *q, int flag)); -char *nextc __P((struct clist *q, char *cp, int *c)); -int putc __P((int c, struct clist *q)); -int q_to_b __P((struct clist *q, char *cp, int cc)); -int unputc __P((struct clist *q)); - -int ttcompat __P((struct tty *tp, int com, caddr_t data, int flag)); -int ttsetcompat __P((struct tty *tp, int *com, caddr_t data, struct termios *term)); -#else /* __APPLE__ */ -int b_to_q __P((u_char *cp, int cc, struct clist *q)); -void catq __P((struct clist *from, struct clist *to)); -void clist_init __P((void)); -int getc __P((struct clist *q)); -void ndflush __P((struct clist *q, int cc)); -int ndqb __P((struct clist *q, int flag)); -u_char *firstc __P((struct clist *clp, int *c)); -u_char *nextc __P((struct clist *q, u_char *cp, int *c)); -int putc __P((int c, struct clist *q)); -int q_to_b __P((struct clist *q, u_char *cp, int cc)); -int unputc __P((struct clist *q)); -int clalloc __P((struct clist *clp, int size, int quot)); -void clfree __P((struct clist *clp)); +int b_to_q(const u_char *cp, int cc, struct clist *q); +void catq(struct clist *from, struct clist *to); +void clist_init(void); +int getc(struct clist *q); +void ndflush(struct clist *q, int cc); +int ndqb(struct clist *q, int flag); +u_char *firstc (struct clist *clp, int *c); +u_char *nextc(struct clist *q, u_char *cp, int *c); +int putc(int c, struct clist *q); +int q_to_b(struct clist *q, u_char *cp, int cc); +int unputc(struct clist *q); +int clalloc(struct clist *clp, int size, int quot); +void clfree(struct clist *clp); +void cinit(void); +void clrbits(u_char *cp, int off, int len); #ifdef KERNEL_PRIVATE -int ttcompat __P((struct tty *tp, u_long com, caddr_t data, int flag, - struct proc *p)); -int ttsetcompat __P((struct tty *tp, u_long *com, caddr_t data, struct termios *term)); +int ttcompat(struct tty *tp, u_long com, caddr_t data, int flag, + struct proc *p); +int ttsetcompat(struct tty *tp, u_long *com, caddr_t data, struct termios *term); #endif /* KERNEL_PRIVATE */ -#endif /* __APPLE__ */ -void termioschars __P((struct termios *t)); -int tputchar __P((int c, struct tty *tp)); -#ifndef __APPLE__ -int ttioctl __P((struct tty *tp, int com, void *data, int flag)); -#else -int ttioctl __P((struct tty *tp, u_long com, caddr_t data, int flag, - struct proc *p)); -#endif -int ttread __P((struct tty *tp, struct uio *uio, int flag)); -void ttrstrt __P((void *tp)); -int ttyselect __P((struct tty *tp, int rw, void * wql, struct proc *p)); -int ttselect __P((dev_t dev, int rw, void * wql, struct proc *p)); -void ttsetwater __P((struct tty *tp)); -int ttspeedtab __P((int speed, struct speedtab *table)); -int ttstart __P((struct tty *tp)); -void ttwakeup __P((struct tty *tp)); -int ttwrite __P((struct tty *tp, struct uio *uio, int flag)); -void ttwwakeup __P((struct tty *tp)); -void ttyblock __P((struct tty *tp)); -void ttychars __P((struct tty *tp)); -int ttycheckoutq __P((struct tty *tp, int wait)); -int ttyclose __P((struct tty *tp)); -void ttyflush __P((struct tty *tp, int rw)); -void ttyinfo __P((struct tty *tp)); -int ttyinput __P((int c, struct tty *tp)); -int ttylclose __P((struct tty *tp, int flag)); 
-int ttymodem __P((struct tty *tp, int flag)); -int ttyopen __P((dev_t device, struct tty *tp)); -int ttysleep __P((struct tty *tp, - void *chan, int pri, char *wmesg, int timeout)); -int ttywait __P((struct tty *tp)); -struct tty *ttymalloc __P((void)); -void ttyfree __P((struct tty *)); +void termioschars(struct termios *t); +int tputchar(int c, struct tty *tp); +int ttioctl(struct tty *tp, u_long com, caddr_t data, int flag, + struct proc *p); +int ttread(struct tty *tp, struct uio *uio, int flag); +void ttrstrt(void *tp); +int ttyselect(struct tty *tp, int rw, void * wql, struct proc *p); +int ttselect(dev_t dev, int rw, void * wql, struct proc *p); +void ttsetwater(struct tty *tp); +int ttspeedtab(int speed, struct speedtab *table); +int ttstart(struct tty *tp); +void ttwakeup(struct tty *tp); +int ttwrite(struct tty *tp, struct uio *uio, int flag); +void ttwwakeup(struct tty *tp); +void ttyblock(struct tty *tp); +void ttychars(struct tty *tp); +int ttycheckoutq(struct tty *tp, int wait); +int ttyclose(struct tty *tp); +void ttyflush(struct tty *tp, int rw); +void ttyinfo(struct tty *tp); +int ttyinput(int c, struct tty *tp); +int ttylclose(struct tty *tp, int flag); +int ttymodem(struct tty *tp, int flag); +int ttyopen(dev_t device, struct tty *tp); +int ttysleep(struct tty *tp, + void *chan, int pri, const char *wmesg, int timeout); +int ttywait(struct tty *tp); +struct tty *ttymalloc(void); +void ttyfree(struct tty *); __END_DECLS #endif /* KERNEL */ -#endif /* __APPLE_API_UNSTABLE */ #endif /* !_SYS_TTY_H_ */ diff --git a/bsd/sys/ttycom.h b/bsd/sys/ttycom.h index 5baac4b80..b7c4d8b3c 100644 --- a/bsd/sys/ttycom.h +++ b/bsd/sys/ttycom.h @@ -104,6 +104,12 @@ struct winsize { #define TIOCSETA _IOW('t', 20, struct termios) /* set termios struct */ #define TIOCSETAW _IOW('t', 21, struct termios) /* drain output, set */ #define TIOCSETAF _IOW('t', 22, struct termios) /* drn out, fls in, set */ +#ifdef KERNEL +#define TIOCGETA_64 _IOR('t', 19, struct user_termios) +#define TIOCSETA_64 _IOW('t', 20, struct user_termios) +#define TIOCSETAW_64 _IOW('t', 21, struct user_termios) +#define TIOCSETAF_64 _IOW('t', 22, struct user_termios) +#endif /* KERNEL */ #define TIOCGETD _IOR('t', 26, int) /* get line discipline */ #define TIOCSETD _IOW('t', 27, int) /* set line discipline */ /* 127-124 compat */ diff --git a/bsd/sys/ttydefaults.h b/bsd/sys/ttydefaults.h index b11fbb6d3..078882c79 100644 --- a/bsd/sys/ttydefaults.h +++ b/bsd/sys/ttydefaults.h @@ -83,7 +83,7 @@ #define CEOL 0xff /* XXX avoid _POSIX_VDISABLE */ #define CERASE 0177 #define CINTR CTRL('c') -#define CSTATUS 0xff /* XXX avoid _POSIX_VDISABLE */ +#define CSTATUS CTRL('t') #define CKILL CTRL('u') #define CMIN 1 #define CQUIT 034 /* FS, ^\ */ diff --git a/bsd/sys/types.h b/bsd/sys/types.h index 4a05f931d..d9f9d810a 100644 --- a/bsd/sys/types.h +++ b/bsd/sys/types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -70,96 +70,205 @@ /* Machine type dependent parameters. 
*/ #include <machine/types.h> +#include <sys/_types.h> -#include <machine/ansi.h> #include <machine/endian.h> -#ifndef _POSIX_SOURCE -typedef unsigned char u_char; -typedef unsigned short u_short; -typedef unsigned int u_int; -typedef unsigned long u_long; -typedef unsigned short ushort; /* Sys V compatibility */ -typedef unsigned int uint; /* Sys V compatibility */ -#endif - -typedef u_int64_t u_quad_t; /* quads */ -typedef int64_t quad_t; -typedef quad_t * qaddr_t; - -typedef char * caddr_t; /* core address */ -typedef int32_t daddr_t; /* disk address */ -typedef int32_t dev_t; /* device number */ -typedef u_int32_t fixpt_t; /* fixed point number */ -typedef u_int32_t gid_t; /* group id */ -typedef u_int32_t in_addr_t; /* base type for internet address */ -typedef u_int16_t in_port_t; -typedef u_int32_t ino_t; /* inode number */ -typedef long key_t; /* IPC key (for Sys V IPC) */ -typedef u_int16_t mode_t; /* permissions */ -typedef u_int16_t nlink_t; /* link count */ -typedef quad_t off_t; /* file offset */ -typedef int32_t pid_t; /* process id */ -typedef quad_t rlim_t; /* resource limit */ -typedef int32_t segsz_t; /* segment size */ -typedef int32_t swblk_t; /* swap offset */ -typedef u_int32_t uid_t; /* user id */ -typedef u_int32_t useconds_t; /* microseconds (unsigned) */ - -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE +typedef unsigned char u_char; +typedef unsigned short u_short; +typedef unsigned int u_int; +#ifndef _U_LONG +typedef unsigned long u_long; +#define _U_LONG +#endif +typedef unsigned short ushort; /* Sys V compatibility */ +typedef unsigned int uint; /* Sys V compatibility */ +#endif + +typedef u_int64_t u_quad_t; /* quads */ +typedef int64_t quad_t; +typedef quad_t * qaddr_t; + +typedef char * caddr_t; /* core address */ +typedef int32_t daddr_t; /* disk address */ + +#ifndef _DEV_T +typedef __darwin_dev_t dev_t; /* device number */ +#define _DEV_T +#endif + +typedef u_int32_t fixpt_t; /* fixed point number */ + +#ifndef _BLKCNT_T +typedef __darwin_blkcnt_t blkcnt_t; +#define _BLKCNT_T +#endif + +#ifndef _BLKSIZE_T +typedef __darwin_blksize_t blksize_t; +#define _BLKSIZE_T +#endif + +#ifndef _GID_T +typedef __darwin_gid_t gid_t; +#define _GID_T +#endif + +#ifndef _IN_ADDR_T +#define _IN_ADDR_T +typedef __uint32_t in_addr_t; /* base type for internet address */ +#endif + +#ifndef _IN_PORT_T +#define _IN_PORT_T +typedef __uint16_t in_port_t; +#endif + +#ifndef _INO_T +typedef __darwin_ino_t ino_t; /* inode number */ +#define _INO_T +#endif + +#ifndef _KEY_T +#define _KEY_T +typedef __int32_t key_t; /* IPC key (for Sys V IPC) */ +#endif + +#ifndef _MODE_T +typedef __darwin_mode_t mode_t; +#define _MODE_T +#endif + +#ifndef _NLINK_T +typedef __uint16_t nlink_t; /* link count */ +#define _NLINK_T +#endif + +#ifndef _ID_T +#define _ID_T +typedef __darwin_id_t id_t; /* can hold pid_t, gid_t, or uid_t */ +#endif + +#ifndef _PID_T +typedef __darwin_pid_t pid_t; +#define _PID_T +#endif + +#ifndef _OFF_T +typedef __darwin_off_t off_t; +#define _OFF_T +#endif + +typedef int32_t segsz_t; /* segment size */ +typedef int32_t swblk_t; /* swap offset */ + +#ifndef _UID_T +typedef __darwin_uid_t uid_t; /* user id */ +#define _UID_T +#endif + +#ifndef _ID_T +typedef __darwin_id_t id_t; +#define _ID_T +#endif + +#ifndef _POSIX_C_SOURCE /* Major, minor numbers, dev_t's. 
*/
#define major(x) ((int32_t)(((u_int32_t)(x) >> 24) & 0xff))
#define minor(x) ((int32_t)((x) & 0xffffff))
#define makedev(x,y) ((dev_t)(((x) << 24) | (y)))
#endif
-#ifndef _BSD_CLOCK_T_DEFINED_
-#define _BSD_CLOCK_T_DEFINED_
-typedef _BSD_CLOCK_T_ clock_t;
+#ifndef _CLOCK_T
+#define _CLOCK_T
+typedef __darwin_clock_t clock_t;
#endif
-#ifndef _BSD_SIZE_T_DEFINED_
-#define _BSD_SIZE_T_DEFINED_
-typedef _BSD_SIZE_T_ size_t;
+#ifndef _SIZE_T
+#define _SIZE_T
+/* DO NOT REMOVE THIS COMMENT: fixincludes needs to see
+ * _GCC_SIZE_T */
+typedef __darwin_size_t size_t;
#endif
-#ifndef _BSD_SSIZE_T_DEFINED_
-#define _BSD_SSIZE_T_DEFINED_
-typedef _BSD_SSIZE_T_ ssize_t;
+#ifndef _SSIZE_T
+#define _SSIZE_T
+typedef __darwin_ssize_t ssize_t;
#endif
-#ifndef _BSD_TIME_T_DEFINED_
-#define _BSD_TIME_T_DEFINED_
-typedef _BSD_TIME_T_ time_t;
+#ifndef _TIME_T
+#define _TIME_T
+typedef __darwin_time_t time_t;
#endif
-#ifndef _POSIX_SOURCE
-#define NBBY 8 /* number of bits in a byte */
+#ifndef _USECONDS_T
+#define _USECONDS_T
+typedef __darwin_useconds_t useconds_t;
+#endif
+
+#ifndef _SUSECONDS_T
+#define _SUSECONDS_T
+typedef __darwin_suseconds_t suseconds_t;
+#endif
+
+#ifndef _POSIX_C_SOURCE
+/*
+ * This code is present here in order to maintain historical backward
+ * compatibility, and is intended to be removed at some point in the
+ * future; please include <sys/select.h> instead.
+ */
+#define NBBY 8 /* bits in a byte */
+#define NFDBITS (sizeof(__int32_t) * NBBY) /* bits per mask */
+#define howmany(x, y) (((x) + ((y) - 1)) / (y)) /* # y's == x bits? */
+typedef __int32_t fd_mask;
+
+
+/*
+ * Note: We use _FD_SET to protect all select related
+ * types and macros
+ */
+#ifndef _FD_SET
+#define _FD_SET
/*
 * Select uses bit masks of file descriptors in longs. These macros
- * manipulate such bit fields (the filesystem macros use chars).
+ * manipulate such bit fields (the filesystem macros use chars). The
+ * extra protection here is to permit application redefinition above
+ * the default size.
 */
#ifndef FD_SETSIZE
#define FD_SETSIZE 1024
#endif
-typedef int32_t fd_mask;
-#define NFDBITS (sizeof(fd_mask) * NBBY) /* bits per mask */
-
-#ifndef howmany
-#define howmany(x, y) (((x) + ((y) - 1)) / (y))
-#endif
+#define __DARWIN_NBBY 8 /* bits in a byte */
+#define __DARWIN_NFDBITS (sizeof(__int32_t) * __DARWIN_NBBY) /* bits per mask */
+#define __DARWIN_howmany(x, y) (((x) + ((y) - 1)) / (y)) /* # y's == x bits? */
+__BEGIN_DECLS
typedef struct fd_set {
- fd_mask fds_bits[howmany(FD_SETSIZE, NFDBITS)];
+ __int32_t fds_bits[__DARWIN_howmany(FD_SETSIZE, __DARWIN_NFDBITS)];
} fd_set;
+__END_DECLS
-#define FD_SET(n, p) ((p)->fds_bits[(n)/NFDBITS] |= (1 << ((n) % NFDBITS)))
-#define FD_CLR(n, p) ((p)->fds_bits[(n)/NFDBITS] &= ~(1 << ((n) % NFDBITS)))
-#define FD_ISSET(n, p) ((p)->fds_bits[(n)/NFDBITS] & (1 << ((n) % NFDBITS)))
-#define FD_COPY(f, t) bcopy(f, t, sizeof(*(f)))
+#define FD_SET(n, p) ((p)->fds_bits[(n)/__DARWIN_NFDBITS] |= (1<<((n) % __DARWIN_NFDBITS)))
+#define FD_CLR(n, p) ((p)->fds_bits[(n)/__DARWIN_NFDBITS] &= ~(1<<((n) % __DARWIN_NFDBITS)))
+#define FD_ISSET(n, p) ((p)->fds_bits[(n)/__DARWIN_NFDBITS] & (1<<((n) % __DARWIN_NFDBITS)))
+#if __GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 3
+/*
+ * Use the built-in bzero function instead of the library version so that
+ * we do not pollute the namespace or introduce prototype warnings.
+ */ +#define FD_ZERO(p) __builtin_bzero(p, sizeof(*(p))) +#else #define FD_ZERO(p) bzero(p, sizeof(*(p))) +#endif +#ifndef _POSIX_C_SOURCE +#define FD_COPY(f, t) bcopy(f, t, sizeof(*(f))) +#endif /* !_POSIX_C_SOURCE */ + +#endif /* !_FD_SET */ + #if defined(__STDC__) && defined(KERNEL) /* @@ -177,49 +286,64 @@ struct tty; struct uio; #endif -#endif /* !_POSIX_SOURCE */ +#endif /* !_POSIX_C_SOURCE */ #endif /* __ASSEMBLER__ */ -struct _pthread_handler_rec -{ - void (*routine)(void *); /* Routine to call */ - void *arg; /* Argument to pass */ - struct _pthread_handler_rec *next; -}; - #ifndef __POSIX_LIB__ -#define __PTHREAD_SIZE__ 596 -#define __PTHREAD_ATTR_SIZE__ 36 -#define __PTHREAD_MUTEXATTR_SIZE__ 8 -#define __PTHREAD_MUTEX_SIZE__ 40 -#define __PTHREAD_CONDATTR_SIZE__ 4 -#define __PTHREAD_COND_SIZE__ 24 -#define __PTHREAD_ONCE_SIZE__ 4 -#define __PTHREAD_RWLOCK_SIZE__ 124 -#define __PTHREAD_RWLOCKATTR_SIZE__ 12 - - -typedef struct _opaque_pthread_t { long sig; struct _pthread_handler_rec *cleanup_stack; char opaque[__PTHREAD_SIZE__];} *pthread_t; - -typedef struct _opaque_pthread_attr_t { long sig; char opaque[__PTHREAD_ATTR_SIZE__]; } pthread_attr_t; - -typedef struct _opaque_pthread_mutexattr_t { long sig; char opaque[__PTHREAD_MUTEXATTR_SIZE__]; } pthread_mutexattr_t; - -typedef struct _opaque_pthread_mutex_t { long sig; char opaque[__PTHREAD_MUTEX_SIZE__]; } pthread_mutex_t; - -typedef struct _opaque_pthread_condattr_t { long sig; char opaque[__PTHREAD_CONDATTR_SIZE__]; } pthread_condattr_t; - -typedef struct _opaque_pthread_cond_t { long sig; char opaque[__PTHREAD_COND_SIZE__]; } pthread_cond_t; - -typedef struct _opaque_pthread_rwlockattr_t { long sig; char opaque[__PTHREAD_RWLOCKATTR_SIZE__]; } pthread_rwlockattr_t; +#ifndef _PTHREAD_ATTR_T +#define _PTHREAD_ATTR_T +typedef __darwin_pthread_attr_t pthread_attr_t; +#endif +#ifndef _PTHREAD_COND_T +#define _PTHREAD_COND_T +typedef __darwin_pthread_cond_t pthread_cond_t; +#endif +#ifndef _PTHREAD_CONDATTR_T +#define _PTHREAD_CONDATTR_T +typedef __darwin_pthread_condattr_t pthread_condattr_t; +#endif +#ifndef _PTHREAD_MUTEX_T +#define _PTHREAD_MUTEX_T +typedef __darwin_pthread_mutex_t pthread_mutex_t; +#endif +#ifndef _PTHREAD_MUTEXATTR_T +#define _PTHREAD_MUTEXATTR_T +typedef __darwin_pthread_mutexattr_t pthread_mutexattr_t; +#endif +#ifndef _PTHREAD_ONCE_T +#define _PTHREAD_ONCE_T +typedef __darwin_pthread_once_t pthread_once_t; +#endif +#ifndef _PTHREAD_RWLOCK_T +#define _PTHREAD_RWLOCK_T +typedef __darwin_pthread_rwlock_t pthread_rwlock_t; +#endif +#ifndef _PTHREAD_RWLOCKATTR_T +#define _PTHREAD_RWLOCKATTR_T +typedef __darwin_pthread_rwlockattr_t pthread_rwlockattr_t; +#endif +#ifndef _PTHREAD_T +#define _PTHREAD_T +typedef __darwin_pthread_t pthread_t; +#endif -typedef struct _opaque_pthread_rwlock_t { long sig; char opaque[__PTHREAD_RWLOCK_SIZE__]; } pthread_rwlock_t; +#endif /* __POSIX_LIB__ */ -typedef struct { long sig; char opaque[__PTHREAD_ONCE_SIZE__]; } pthread_once_t; +#ifndef _PTHREAD_KEY_T +#define _PTHREAD_KEY_T +typedef __darwin_pthread_key_t pthread_key_t; +#endif -#endif /* __POSIX_LIB__ */ +/* statvfs and fstatvfs */ +#ifndef _FSBLKCNT_T +#define _FSBLKCNT_T +typedef __darwin_fsblkcnt_t fsblkcnt_t; +#endif -typedef unsigned long pthread_key_t; /* Opaque 'pointer' */ +#ifndef _FSFILCNT_T +#define _FSFILCNT_T +typedef __darwin_fsfilcnt_t fsfilcnt_t; +#endif #endif /* !_SYS_TYPES_H_ */ diff --git a/bsd/sys/ubc.h b/bsd/sys/ubc.h index 0e401f6f7..46be17aa7 100644 --- a/bsd/sys/ubc.h +++ b/bsd/sys/ubc.h @@ -1,5 +1,5 
@@ /* - * Copyright (c) 1999, 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -20,10 +20,6 @@ * @APPLE_LICENSE_HEADER_END@ */ /* - * File: ubc.h - * Author: Umesh Vaishampayan [umeshv@apple.com] - * 05-Aug-1999 umeshv Created. - * * Header file for Unified Buffer Cache. * */ @@ -32,141 +28,58 @@ #define _SYS_UBC_H_ #include <sys/appleapiopts.h> -#include <sys/types.h> -#include <sys/ucred.h> -#include <sys/vnode.h> - #include <sys/cdefs.h> - +#include <sys/kernel_types.h> +#include <kern/locks.h> #include <mach/memory_object_types.h> -#define UBC_INFO_NULL ((struct ubc_info *) 0) -#define UBC_NOINFO ((struct ubc_info *)0xDEADD1ED) -#ifdef __APPLE_API_PRIVATE -extern struct zone *ubc_info_zone; +/* defns for ubc_sync_range() and ubc_msync */ -/* - * The following data structure keeps the information to associate - * a vnode to the correspondig VM objects. - */ - -struct ubc_info { - memory_object_t ui_pager; /* pager */ - memory_object_control_t ui_control; /* VM control for the pager */ - long ui_flags; /* flags */ - struct vnode *ui_vnode; /* The vnode for this ubc_info */ - struct ucred *ui_ucred; /* holds credentials for NFS paging */ - int ui_refcount;/* ref count on the ubc_info */ - off_t ui_size; /* file size for the vnode */ - long ui_mapped; /* is it currently mapped */ - void *ui_owner; /* for recursive ubc_busy */ -}; - -/* Defines for ui_flags */ -#define UI_NONE 0x00000000 /* none */ -#define UI_HASPAGER 0x00000001 /* has a pager associated */ -#define UI_INITED 0x00000002 /* newly initialized vnode */ -#define UI_HASOBJREF 0x00000004 /* hold a reference on object */ -#define UI_WASMAPPED 0x00000008 /* vnode was mapped */ -#define UI_DONTCACHE 0x00000010 /* do not cache object */ -#define UI_BUSY 0x00000020 /* for VM synchronization */ -#define UI_WANTED 0x00000040 /* for VM synchronization */ - -#endif /* __APPLE_API_PRIVATE */ - -#ifdef __APPLE_API_EVOLVING -/* - * exported primitives for loadable file systems. - */ +#define UBC_PUSHDIRTY 0x01 /* clean any dirty pages in the specified range to the backing store */ +#define UBC_PUSHALL 0x02 /* push both dirty and precious pages to the backing store */ +#define UBC_INVALIDATE 0x04 /* invalidate pages in the specified range... 
may be used with UBC_PUSHDIRTY/ALL */ +#define UBC_SYNC 0x08 /* wait for I/Os generated by UBC_PUSHDIRTY to complete */ __BEGIN_DECLS -int ubc_info_init __P((struct vnode *)); -void ubc_info_deallocate __P((struct ubc_info *)); -int ubc_setsize __P((struct vnode *, off_t)); -off_t ubc_getsize __P((struct vnode *)); -int ubc_uncache __P((struct vnode *)); -int ubc_umount __P((struct mount *)); -void ubc_unmountall __P(()); -int ubc_setcred __P((struct vnode *, struct proc *)); -struct ucred *ubc_getcred __P((struct vnode *)); -memory_object_t ubc_getpager __P((struct vnode *)); -memory_object_control_t ubc_getobject __P((struct vnode *, int)); -int ubc_setpager __P((struct vnode *, memory_object_t)); -int ubc_setflags __P((struct vnode *, int)); -int ubc_clearflags __P((struct vnode *, int)); -int ubc_issetflags __P((struct vnode *, int)); -off_t ubc_blktooff __P((struct vnode *, daddr_t)); -daddr_t ubc_offtoblk __P((struct vnode *, off_t)); -int ubc_clean __P((struct vnode *, int)); -int ubc_pushdirty __P((struct vnode *)); -int ubc_pushdirty_range __P((struct vnode *, off_t, off_t)); -int ubc_hold __P((struct vnode *)); -void ubc_rele __P((struct vnode *)); -void ubc_map __P((struct vnode *)); -int ubc_destroy_named __P((struct vnode *)); -int ubc_release_named __P((struct vnode *)); -int ubc_invalidate __P((struct vnode *, off_t, size_t)); -int ubc_isinuse __P((struct vnode *, int)); - -int ubc_page_op __P((struct vnode *, off_t, int, ppnum_t *, int *)); -/* cluster IO routines */ -int cluster_read __P((struct vnode *, struct uio *, off_t, int, int)); -int advisory_read __P((struct vnode *, off_t, off_t, int, int)); -int cluster_write __P((struct vnode *, struct uio*, off_t, off_t, - off_t, off_t, int, int)); -int cluster_push __P((struct vnode *)); -int cluster_release __P((struct vnode *)); -int cluster_pageout __P((struct vnode *, upl_t, vm_offset_t, off_t, int, - off_t, int, int)); -int cluster_pagein __P((struct vnode *, upl_t, vm_offset_t, off_t, int, - off_t, int, int)); -int cluster_bp __P((struct buf *)); -int cluster_copy_upl_data __P((struct uio *, upl_t, int, int)); -int cluster_copy_ubc_data __P((struct vnode *, struct uio *, int *, int)); +off_t ubc_blktooff(struct vnode *, daddr64_t); +daddr64_t ubc_offtoblk(struct vnode *, off_t); +off_t ubc_getsize(struct vnode *); +int ubc_setsize(struct vnode *, off_t); -/* UPL routines */ -int ubc_create_upl __P((struct vnode *, off_t, long, upl_t *, - upl_page_info_t **, int)); -int ubc_upl_map __P((upl_t, vm_offset_t *)); -int ubc_upl_unmap __P((upl_t)); -int ubc_upl_commit __P((upl_t)); -int ubc_upl_commit_range __P((upl_t, vm_offset_t, vm_size_t, int)); -int ubc_upl_abort __P((upl_t, int)); -int ubc_upl_abort_range __P((upl_t, vm_offset_t, vm_size_t, int)); -upl_page_info_t *ubc_upl_pageinfo __P((upl_t)); -__END_DECLS +struct ucred *ubc_getcred(struct vnode *); +int ubc_setcred(struct vnode *, struct proc *); -#define UBCINFOMISSING(vp) \ - ((vp) && ((vp)->v_type == VREG) && ((vp)->v_ubcinfo == UBC_INFO_NULL)) +int ubc_sync_range(vnode_t, off_t, off_t, int); +errno_t ubc_msync(vnode_t, off_t, off_t, off_t *, int); +int ubc_pages_resident(vnode_t); -#define UBCINFORECLAIMED(vp) \ - ((vp) && ((vp)->v_type == VREG) && ((vp)->v_ubcinfo == UBC_NOINFO)) -#define UBCINFOEXISTS(vp) \ - ((vp) && ((vp)->v_type == VREG) && \ - ((vp)->v_ubcinfo) && ((vp)->v_ubcinfo != UBC_NOINFO)) +/* cluster IO routines */ +int advisory_read(vnode_t, off_t, off_t, int); -#define UBCISVALID(vp) \ - ((vp) && ((vp)->v_type == VREG) && !((vp)->v_flag & VSYSTEM)) 
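To make the ubc_msync() contract declared above concrete, here is a minimal sketch of a synchronous flush of a dirty range; the helper name, the vnode vp, and the range bounds are illustrative assumptions, and the residual-offset out parameter is interpreted from the prototype above rather than from any code in this patch:

/*
 * Sketch only: push any dirty pages of vp in the byte range
 * [start, end) to the backing store and wait for the I/O to
 * complete.  UBC_PUSHDIRTY and UBC_SYNC are the flags defined
 * above; resid_off is assumed to receive the offset at which
 * the sync stopped.
 */
static errno_t
flush_dirty_range(vnode_t vp, off_t start, off_t end)
{
	off_t	resid_off = 0;

	return ubc_msync(vp, start, end, &resid_off,
	    UBC_PUSHDIRTY | UBC_SYNC);
}

The cluster I/O prototypes continue below.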
+int cluster_read(vnode_t, struct uio *, off_t, int); +int cluster_write(vnode_t, struct uio *, off_t, off_t, off_t, off_t, int); +int cluster_pageout(vnode_t, upl_t, vm_offset_t, off_t, int, off_t, int); +int cluster_pagein(vnode_t, upl_t, vm_offset_t, off_t, int, off_t, int); +int cluster_push(vnode_t, int); +int cluster_bp(buf_t); +void cluster_zero(upl_t, vm_offset_t, int, buf_t); -#define UBCINVALID(vp) \ - (((vp) == NULL) || ((vp) && ((vp)->v_type != VREG)) \ - || ((vp) && ((vp)->v_flag & VSYSTEM))) -#define UBCINFOCHECK(fun, vp) \ - if ((vp) && ((vp)->v_type == VREG) && \ - (((vp)->v_ubcinfo == UBC_INFO_NULL) \ - || ((vp)->v_ubcinfo == UBC_NOINFO))) \ - panic("%s: lost ubc_info", (fun)); +/* UPL routines */ +int ubc_create_upl(vnode_t, off_t, long, upl_t *, upl_page_info_t **, int); +int ubc_upl_map(upl_t, upl_offset_t *); +int ubc_upl_unmap(upl_t); +int ubc_upl_commit(upl_t); +int ubc_upl_commit_range(upl_t, upl_offset_t, upl_size_t, int); +int ubc_upl_abort(upl_t, int); +int ubc_upl_abort_range(upl_t, upl_offset_t, upl_size_t, int); -/* Flags for ubc_getobject() */ -#define UBC_FLAGS_NONE 0x0000 -#define UBC_HOLDOBJECT 0x0001 -#define UBC_FOR_PAGEOUT 0x0002 +upl_page_info_t *ubc_upl_pageinfo(upl_t); -#endif /* __APPLE_API_EVOLVING */ +__END_DECLS #endif /* _SYS_UBC_H_ */ diff --git a/bsd/sys/ubc_internal.h b/bsd/sys/ubc_internal.h new file mode 100644 index 000000000..1362f30ee --- /dev/null +++ b/bsd/sys/ubc_internal.h @@ -0,0 +1,154 @@ +/* + * Copyright (c) 1999-2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * File: ubc.h + * Author: Umesh Vaishampayan [umeshv@apple.com] + * 05-Aug-1999 umeshv Created. + * + * Header file for Unified Buffer Cache. 
+ *
+ */
+
+#ifndef _SYS_UBC_INTERNAL_H_
+#define _SYS_UBC_INTERNAL_H_
+
+#include <sys/appleapiopts.h>
+#include <sys/types.h>
+#include <sys/kernel_types.h>
+#include <sys/ucred.h>
+#include <sys/vnode.h>
+#include <sys/ubc.h>
+#include <sys/mman.h>
+
+#include <sys/cdefs.h>
+
+#include <kern/locks.h>
+#include <mach/memory_object_types.h>
+
+
+#define UBC_INFO_NULL ((struct ubc_info *) 0)
+
+
+extern struct zone *ubc_info_zone;
+
+
+#define MAX_CLUSTERS 4 /* maximum number of vfs clusters per vnode */
+
+struct cl_extent {
+ daddr64_t b_addr;
+ daddr64_t e_addr;
+};
+
+struct cl_wextent {
+ daddr64_t b_addr;
+ daddr64_t e_addr;
+ int io_nocache;
+};
+
+struct cl_readahead {
+ lck_mtx_t cl_lockr;
+ daddr64_t cl_lastr; /* last block read by client */
+ daddr64_t cl_maxra; /* last block prefetched by the read ahead */
+ int cl_ralen; /* length of last prefetch */
+};
+
+struct cl_writebehind {
+ lck_mtx_t cl_lockw;
+ int cl_hasbeenpaged; /* if set, indicates pager has cleaned pages associated with this file */
+ void * cl_scmap; /* pointer to sparse cluster map */
+ int cl_scdirty; /* number of dirty pages in the sparse cluster map */
+ int cl_number; /* number of packed write behind clusters currently valid */
+ struct cl_wextent cl_clusters[MAX_CLUSTERS]; /* packed write behind clusters */
+};
+
+
+/*
+ * The following data structure keeps the information to associate
+ * a vnode to the corresponding VM objects.
+ */
+struct ubc_info {
+ memory_object_t ui_pager; /* pager */
+ memory_object_control_t ui_control; /* VM control for the pager */
+ long ui_flags; /* flags */
+ vnode_t *ui_vnode; /* The vnode for this ubc_info */
+ ucred_t *ui_ucred; /* holds credentials for NFS paging */
+ off_t ui_size; /* file size for the vnode */
+
+ struct cl_readahead *cl_rahead; /* cluster read ahead context */
+ struct cl_writebehind *cl_wbehind; /* cluster write behind context */
+};
+
+/* Defines for ui_flags */
+#define UI_NONE 0x00000000 /* none */
+#define UI_HASPAGER 0x00000001 /* has a pager associated */
+#define UI_INITED 0x00000002 /* newly initialized vnode */
+#define UI_HASOBJREF 0x00000004 /* hold a reference on object */
+#define UI_WASMAPPED 0x00000008 /* vnode was mapped */
+#define UI_ISMAPPED 0x00000010 /* vnode is currently mapped */
+
+/*
+ * exported primitives for loadable file systems.
+ */ + +__BEGIN_DECLS +__private_extern__ int ubc_umount(struct mount *mp); +__private_extern__ void ubc_unmountall(void); +__private_extern__ memory_object_t ubc_getpager(struct vnode *); +__private_extern__ int ubc_map(struct vnode *, int); +__private_extern__ int ubc_destroy_named(struct vnode *); + +/* internal only */ +__private_extern__ void cluster_release(struct ubc_info *); + + +/* Flags for ubc_getobject() */ +#define UBC_FLAGS_NONE 0x0000 +#define UBC_HOLDOBJECT 0x0001 +#define UBC_FOR_PAGEOUT 0x0002 + +memory_object_control_t ubc_getobject(struct vnode *, int); + +int ubc_info_init(struct vnode *); +void ubc_info_deallocate (struct ubc_info *); + +int ubc_isinuse(struct vnode *, int); + +int ubc_page_op(vnode_t, off_t, int, ppnum_t *, int *); +int ubc_range_op(vnode_t, off_t, off_t, int, int *); + + +int cluster_copy_upl_data(struct uio *, upl_t, int, int); +int cluster_copy_ubc_data(vnode_t, struct uio *, int *, int); + + +int UBCINFOMISSING(vnode_t); +int UBCINFORECLAIMED(vnode_t); +int UBCINFOEXISTS(vnode_t); +int UBCISVALID(vnode_t); +int UBCINVALID(vnode_t); +int UBCINFOCHECK(const char *, vnode_t); + +__END_DECLS + + +#endif /* _SYS_UBC_INTERNAL_H_ */ + diff --git a/bsd/sys/ucontext.h b/bsd/sys/ucontext.h index dd97a8cef..f231226a2 100644 --- a/bsd/sys/ucontext.h +++ b/bsd/sys/ucontext.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2002-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -23,29 +23,58 @@ #ifndef _SYS_UCONTEXT_H_ #define _SYS_UCONTEXT_H_ +#include <sys/cdefs.h> +#include <sys/_types.h> #include <machine/ucontext.h> -struct ucontext { - int uc_onstack; - sigset_t uc_sigmask; /* signal mask used by this context */ - stack_t uc_stack; /* stack used by this context */ - struct ucontext *uc_link; /* pointer to resuming context */ - size_t uc_mcsize; /* size of the machine context passed in */ - mcontext_t uc_mcontext; /* machine specific context */ -}; +#ifndef _SIGSET_T +#define _SIGSET_T +typedef __darwin_sigset_t sigset_t; +#endif + +#ifndef _STACK_T +#define _STACK_T +typedef __darwin_stack_t stack_t; +#endif + +#ifndef _UCONTEXT_T +#define _UCONTEXT_T +#ifndef _POSIX_C_SOURCE +typedef struct ucontext ucontext_t; +#else /* _POSIX_C_SOURCE */ +typedef struct __darwin_ucontext ucontext_t; +#endif /* _POSIX_C_SOURCE */ +#endif +#ifndef _POSIX_C_SOURCE +#ifndef _UCONTEXT64_T +#define _UCONTEXT64_T +typedef struct ucontext64 ucontext64_t; +#endif +#endif /* _POSIX_C_SOURCE */ -typedef struct ucontext ucontext_t; +#ifdef KERNEL +#include <machine/types.h> /* user_addr_t, user_size_t */ -struct ucontext64 { - int uc_onstack; - sigset_t uc_sigmask; /* signal mask used by this context */ - stack_t uc_stack; /* stack used by this context */ - struct ucontext *uc_link; /* pointer to resuming context */ - size_t uc_mcsize; /* size of the machine context passed in */ - mcontext64_t uc_mcontext64; /* machine specific context */ +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +/* kernel representation of struct ucontext64 for 64 bit processes */ +struct user_ucontext64 { + int uc_onstack; + sigset_t uc_sigmask; /* signal mask */ + struct user_sigaltstack uc_stack; /* stack */ + user_addr_t uc_link; /* ucontext pointer */ + user_size_t uc_mcsize; /* mcontext size */ + user_addr_t uc_mcontext64; /* machine context */ }; -typedef struct ucontext64 ucontext64_t; +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +typedef struct user_ucontext64 
user_ucontext64_t; +#endif /* KERNEL */ #endif /* _SYS_UCONTEXT_H_ */ diff --git a/bsd/sys/ucred.h b/bsd/sys/ucred.h index b7f6be44d..7da8c28eb 100644 --- a/bsd/sys/ucred.h +++ b/bsd/sys/ucred.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -59,48 +59,59 @@ #define _SYS_UCRED_H_ #include <sys/appleapiopts.h> +#include <sys/cdefs.h> #include <sys/param.h> +#include <bsm/audit.h> #ifdef __APPLE_API_UNSTABLE + /* - * Credentials. + * In-kernel credential structure. + * + * Note that this structure should not be used outside the kernel, nor should + * it or copies of it be exported outside. */ struct ucred { + TAILQ_ENTRY(ucred) cr_link; /* never modify this without KAUTH_CRED_HASH_LOCK */ u_long cr_ref; /* reference count */ + + /* credential hash depends on everything from this point on (see kauth_cred_get_hashkey) */ uid_t cr_uid; /* effective user id */ - short cr_ngroups; /* number of groups */ - gid_t cr_groups[NGROUPS]; /* groups */ + uid_t cr_ruid; /* real user id */ + uid_t cr_svuid; /* saved user id */ + short cr_ngroups; /* number of groups in advisory list */ + gid_t cr_groups[NGROUPS]; /* advisory group list */ + gid_t cr_rgid; /* real group id */ + gid_t cr_svgid; /* saved group id */ + uid_t cr_gmuid; /* user id for group membership purposes */ + struct auditinfo cr_au; /* user auditing data */ }; +typedef struct ucred *kauth_cred_t; + /* * This is the external representation of struct ucred. */ struct xucred { u_int cr_version; /* structure layout version */ uid_t cr_uid; /* effective user id */ - short cr_ngroups; /* number of groups */ - gid_t cr_groups[NGROUPS]; /* groups */ + short cr_ngroups; /* number of advisory groups */ + gid_t cr_groups[NGROUPS]; /* advisory group list */ }; #define XUCRED_VERSION 0 #define cr_gid cr_groups[0] -#define NOCRED ((struct ucred *)0) /* no credential available */ -#define FSCRED ((struct ucred *)-1) /* filesystem credential */ +#define NOCRED ((kauth_cred_t )0) /* no credential available */ +#define FSCRED ((kauth_cred_t )-1) /* filesystem credential */ #ifdef KERNEL -#define crhold(cr) \ -{ \ - if (++(cr)->cr_ref == 0) \ - panic("crhold"); \ -} - -struct ucred *crcopy __P((struct ucred *cr)); -struct ucred *crdup __P((struct ucred *cr)); -void crfree __P((struct ucred *cr)); -struct ucred *crget __P((void)); -int crcmp __P((struct ucred *cr1, struct ucred *cr2)); -int suser __P((struct ucred *cred, u_short *acflag)); -void cru2x __P((struct ucred *cr, struct xucred *xcr)); - +#ifdef __APPLE_API_OBSOLETE +__BEGIN_DECLS +int crcmp(kauth_cred_t cr1, kauth_cred_t cr2); +int suser(kauth_cred_t cred, u_short *acflag); +int set_security_token(struct proc * p); +void cru2x(kauth_cred_t cr, struct xucred *xcr); +__END_DECLS +#endif /* __APPLE_API_OBSOLETE */ #endif /* KERNEL */ #endif /* __APPLE_API_UNSTABLE */ diff --git a/bsd/sys/uio.h b/bsd/sys/uio.h index 6e61edc6a..b133e43a0 100644 --- a/bsd/sys/uio.h +++ b/bsd/sys/uio.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -58,56 +58,213 @@ #ifndef _SYS_UIO_H_ #define _SYS_UIO_H_ +#include <sys/cdefs.h> +#include <sys/_types.h> + +/* + * [XSI] The ssize_t and size_t types shall be defined as described + * in <sys/types.h>. 
+ */
+#ifndef _SIZE_T
+#define _SIZE_T
+typedef __darwin_size_t size_t;
+#endif
+
+#ifndef _SSIZE_T
+#define _SSIZE_T
+typedef __darwin_ssize_t ssize_t;
+#endif
+
 /*
- * XXX
- * iov_base should be a void *.
+ * [XSI] Structure whose address is passed as the second parameter to the
+ * readv() and writev() functions.
 */
+#ifndef _STRUCT_IOVEC
+#define _STRUCT_IOVEC
struct iovec {
- char *iov_base; /* Base address. */
- size_t iov_len; /* Length. */
+ void * iov_base; /* [XSI] Base address of I/O memory region */
+ size_t iov_len; /* [XSI] Size of region iov_base points to */
};
+#endif
-enum uio_rw { UIO_READ, UIO_WRITE };
-/* Segment flag values. */
-enum uio_seg {
- UIO_USERSPACE, /* kernel address is virtual, to/from user virtual */
- UIO_USERISPACE, /* kernel address is virtual, to/from user virtual */
- UIO_SYSSPACE, /* kernel address is virtual, to/from system virtual */
- UIO_PHYS_USERSPACE, /* kernel address is physical, to/from user virtual */
- UIO_PHYS_SYSSPACE, /* kernel address is physical, to/from system virtual */
-};
+#ifndef _POSIX_C_SOURCE
+/*
+ * IO direction for uio_t.
+ * UIO_READ - data moves into iovec(s) associated with uio_t
+ * UIO_WRITE - data moves out of iovec(s) associated with uio_t
+ */
+enum uio_rw { UIO_READ, UIO_WRITE };
+#endif
#ifdef KERNEL
-struct uio {
- struct iovec *uio_iov;
- int uio_iovcnt;
- off_t uio_offset;
- int uio_resid;
- enum uio_seg uio_segflg;
- enum uio_rw uio_rw;
- struct proc *uio_procp;
+
+/*
+ * XXX This all really wants a uio_internal.h
+ */
+
+#include <sys/kernel_types.h>
+
+
+/*
+ * user / kernel address space type flags.
+ * WARNING - make sure to check when adding flags! Be sure new flags
+ * don't overlap the definitions in uio_internal.h
+ * NOTES -
+ * UIO_USERSPACE is equivalent to UIO_USERSPACE32, but UIO_USERSPACE32
+ * is preferred. UIO_USERSPACE remains for backwards compatibility.
+ * UIO_SYSSPACE is equivalent to UIO_SYSSPACE32, but UIO_SYSSPACE32
+ * is preferred. UIO_SYSSPACE remains for backwards compatibility.
+ */
+enum uio_seg {
+ UIO_USERSPACE = 0, /* kernel address is virtual, to/from user virtual */
+ UIO_SYSSPACE = 2, /* kernel address is virtual, to/from system virtual */
+ UIO_USERSPACE32 = 5, /* kernel address is virtual, to/from user 32-bit virtual */
+ UIO_USERSPACE64 = 8, /* kernel address is virtual, to/from user 64-bit virtual */
+ UIO_SYSSPACE32 = 11 /* kernel address is virtual, to/from system virtual */
};
+#define UIO_SEG_IS_USER_SPACE( a_uio_seg ) \
+ ( (a_uio_seg) == UIO_USERSPACE64 || (a_uio_seg) == UIO_USERSPACE32 || \
+ (a_uio_seg) == UIO_USERSPACE )
+
+
+__BEGIN_DECLS
+
+/*
+ * uio_create - create an uio_t.
+ * Space is allocated to hold up to a_iovcount number of iovecs. The uio_t
+ * is not fully initialized until all iovecs are added using uio_addiov calls.
+ * a_iovcount is the maximum number of iovecs you may add.
+ */
+uio_t uio_create( int a_iovcount, /* max number of iovecs */
+ off_t a_offset, /* current offset */
+ int a_spacetype, /* type of address space */
+ int a_iodirection ); /* read or write flag */
+
+/*
+ * uio_reset - reset an uio_t.
+ * Reset the given uio_t to initial values. The uio_t is not fully initialized
+ * until all iovecs are added using uio_addiov calls.
+ * The a_iovcount value passed in the uio_create is the maximum number of
+ * iovecs you may add.
+ */
+void uio_reset( uio_t a_uio,
+ off_t a_offset, /* current offset */
+ int a_spacetype, /* type of address space */
+ int a_iodirection ); /* read or write flag */
+
+/*
+ * uio_duplicate - allocate a new uio and make a copy of the given uio_t.
+ * may return NULL.
+ */
+uio_t uio_duplicate( uio_t a_uio );
+
+
+/*
+ * uio_free - free a uio_t allocated via uio_create.
+ */
+void uio_free( uio_t a_uio );
+
+/*
+ * uio_addiov - add an iovec to the given uio_t. You may call this up to
+ * the a_iovcount number that was passed to uio_create.
+ * returns 0 if add was successful else non zero.
+ */
+int uio_addiov( uio_t a_uio, user_addr_t a_baseaddr, user_size_t a_length );
+
+/*
+ * uio_getiov - get iovec data associated with the given uio_t. Use
+ * a_index to iterate over each iovec (0 to (uio_iovcnt(uio_t) - 1)).
+ * a_baseaddr_p and a_length_p may be NULL.
+ * returns -1 when a_index is out of range or invalid uio_t.
+ * returns 0 when data is returned.
+ */
+int uio_getiov( uio_t a_uio,
+ int a_index,
+ user_addr_t * a_baseaddr_p,
+ user_size_t * a_length_p );
+
+/*
+ * uio_update - update the given uio_t for a_count of completed IO.
+ * This call decrements the current iovec length and residual IO,
+ * and increments the current iovec base address and offset value.
+ */
+void uio_update( uio_t a_uio, user_size_t a_count );
+
+/*
+ * uio_resid - return the residual IO value for the given uio_t
+ */
+user_ssize_t uio_resid( uio_t a_uio );
+
+/*
+ * uio_setresid - set the residual IO value for the given uio_t
+ */
+void uio_setresid( uio_t a_uio, user_ssize_t a_value );
+
+/*
+ * uio_iovcnt - return count of active iovecs for the given uio_t
+ */
+int uio_iovcnt( uio_t a_uio );
+
+/*
+ * uio_offset - return the current offset value for the given uio_t
+ */
+off_t uio_offset( uio_t a_uio );
+
+/*
+ * uio_setoffset - set the current offset value for the given uio_t
+ */
+void uio_setoffset( uio_t a_uio, off_t a_offset );
+
+/*
+ * uio_rw - return the read / write flag for the given uio_t
+ */
+int uio_rw( uio_t a_uio );
+
+/*
+ * uio_setrw - set the read / write flag for the given uio_t
+ */
+void uio_setrw( uio_t a_uio, int a_value );
+
+/*
+ * uio_isuserspace - return non zero value if the address space
+ * flag is for a user address space (could be 32 or 64 bit).
+ */
+int uio_isuserspace( uio_t a_uio );
+
+/*
+ * uio_curriovbase - return the base address of the current iovec associated
+ * with the given uio_t. May return 0.
+ */
+user_addr_t uio_curriovbase( uio_t a_uio );
+
+/*
+ * uio_curriovlen - return the length value of the current iovec associated
+ * with the given uio_t.
+ */ +user_size_t uio_curriovlen( uio_t a_uio ); + /* * Limits */ #define UIO_MAXIOV 1024 /* max 1K of iov's */ #define UIO_SMALLIOV 8 /* 8 on stack, else malloc */ -extern int uiomove __P((caddr_t cp, int n, struct uio *uio)); -extern int uiomove64 __P((unsigned long long cp, int n, struct uio *uio)); -extern int ureadc __P((int c, struct uio *uio)); -extern int uwritec __P((struct uio *uio)); +extern int uiomove(caddr_t cp, int n, struct uio *uio); +extern int uiomove64(unsigned long long cp, int n, struct uio *uio); +extern int ureadc(int c, struct uio *uio); +extern int uwritec(struct uio *uio); +__END_DECLS #endif /* KERNEL */ #ifndef KERNEL -#include <sys/cdefs.h> __BEGIN_DECLS -ssize_t readv __P((int, const struct iovec *, int)); -ssize_t writev __P((int, const struct iovec *, int)); +ssize_t readv(int, const struct iovec *, int); +ssize_t writev(int, const struct iovec *, int); __END_DECLS #endif /* !KERNEL */ + #endif /* !_SYS_UIO_H_ */ diff --git a/bsd/sys/uio_internal.h b/bsd/sys/uio_internal.h new file mode 100644 index 000000000..a2a7cc0f5 --- /dev/null +++ b/bsd/sys/uio_internal.h @@ -0,0 +1,445 @@ +/* + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ +/* + * Copyright (c) 1982, 1986, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uio.h 8.5 (Berkeley) 2/22/94
+ */
+
+#ifndef _SYS_UIO_INTERNAL_H_
+#define _SYS_UIO_INTERNAL_H_
+
+#include <sys/appleapiopts.h>
+
+#ifdef KERNEL_PRIVATE
+#include <sys/uio.h>
+#include <sys/malloc.h>
+
+/*
+ * user / kernel address space type flags.
+ * WARNING - make sure to check when adding flags! Be sure new flags
+ * don't overlap the definitions in uio.h
+ */
+// UIO_USERSPACE 0 defined in uio.h
+#define UIO_USERISPACE 1
+// UIO_SYSSPACE 2 defined in uio.h
+#define UIO_PHYS_USERSPACE 3
+#define UIO_PHYS_SYSSPACE 4
+// UIO_USERSPACE32 5 defined in uio.h
+#define UIO_USERISPACE32 6
+#define UIO_PHYS_USERSPACE32 7
+// UIO_USERSPACE64 8 defined in uio.h
+#define UIO_USERISPACE64 9
+#define UIO_PHYS_USERSPACE64 10
+// UIO_SYSSPACE32 11 defined in uio.h
+#define UIO_PHYS_SYSSPACE32 12
+#define UIO_SYSSPACE64 13
+#define UIO_PHYS_SYSSPACE64 14
+
+__BEGIN_DECLS
+struct user_iovec;
+
+// uio_iovsaddr was __private_extern__ temporary change for 3777436
+struct user_iovec * uio_iovsaddr( uio_t a_uio );
+__private_extern__ void uio_calculateresid( uio_t a_uio );
+__private_extern__ void uio_setcurriovlen( uio_t a_uio, user_size_t a_value );
+// uio_spacetype was __private_extern__ temporary change for 3777436
+int uio_spacetype( uio_t a_uio );
+__private_extern__ uio_t
+ uio_createwithbuffer( int a_iovcount, off_t a_offset, int a_spacetype,
+ int a_iodirection, void *a_buf_p, int a_buffer_size );
+
+/* use kern_iovec for system space requests */
+struct kern_iovec {
+ u_int32_t iov_base; /* Base address. */
+ u_int32_t iov_len; /* Length. */
+};
+
+/* use user_iovec for user space requests */
+struct user_iovec {
+ user_addr_t iov_base; /* Base address. */
+ user_size_t iov_len; /* Length. */
+};
+
+#if 1 // LP64todo - remove this after kext adopt new KPI
+#define uio_iov uio_iovs.iovp
+#define iovec_32 kern_iovec
+#define iovec_64 user_iovec
+#define iov32p kiovp
+#define iov64p uiovp
+#endif
+
+union iovecs {
+ struct iovec *iovp;
+ struct kern_iovec *kiovp;
+ struct user_iovec *uiovp;
+};
+
+/* WARNING - use accessor calls for uio_iov and uio_resid since these */
+/* fields vary depending on the originating address space. */
+struct uio {
+ union iovecs uio_iovs; /* current iovec */
+ int uio_iovcnt; /* active iovecs */
+ off_t uio_offset;
+ int uio_resid; /* compatibility uio_resid (pre-LP64) */
+ enum uio_seg uio_segflg;
+ enum uio_rw uio_rw;
+ proc_t uio_procp; /* obsolete - not used! */
+ user_ssize_t uio_resid_64;
+ int uio_size; /* size for use with kfree */
+ int uio_max_iovs; /* max number of iovecs this uio_t can hold */
+ u_int32_t uio_flags;
+};
+
+/* values for uio_flags */
+#define UIO_FLAGS_INITED 0x00000001
+#define UIO_FLAGS_WE_ALLOCED 0x00000002
+
+__END_DECLS
+
+/*
+ * UIO_SIZEOF - return the amount of space a uio_t requires to
+ * contain the given number of iovecs. Use this macro to
+ * create a stack buffer that can be passed to uio_createwithbuffer.
+ */ +#define UIO_SIZEOF( a_iovcount ) \ + ( sizeof(struct uio) + (sizeof(struct user_iovec) * (a_iovcount)) ) + +#define UIO_IS_64_BIT_SPACE( a_uio_t ) \ + ( (a_uio_t)->uio_segflg == UIO_USERSPACE64 || (a_uio_t)->uio_segflg == UIO_USERISPACE64 || \ + (a_uio_t)->uio_segflg == UIO_PHYS_USERSPACE64 || (a_uio_t)->uio_segflg == UIO_SYSSPACE64 || \ + (a_uio_t)->uio_segflg == UIO_PHYS_SYSSPACE64 ) + +#define UIO_IS_32_BIT_SPACE( a_uio_t ) \ + ( (a_uio_t)->uio_segflg == UIO_USERSPACE || (a_uio_t)->uio_segflg == UIO_USERISPACE || \ + (a_uio_t)->uio_segflg == UIO_SYSSPACE || (a_uio_t)->uio_segflg == UIO_PHYS_USERSPACE || \ + (a_uio_t)->uio_segflg == UIO_USERISPACE32 || (a_uio_t)->uio_segflg == UIO_PHYS_USERSPACE32 || \ + (a_uio_t)->uio_segflg == UIO_SYSSPACE32 || (a_uio_t)->uio_segflg == UIO_PHYS_SYSSPACE32 || \ + (a_uio_t)->uio_segflg == UIO_PHYS_SYSSPACE || (a_uio_t)->uio_segflg == UIO_USERSPACE32 ) + +#define UIO_IS_USER_SPACE32( a_uio_t ) \ + ( (a_uio_t)->uio_segflg == UIO_USERSPACE32 || (a_uio_t)->uio_segflg == UIO_PHYS_USERSPACE32 || \ + (a_uio_t)->uio_segflg == UIO_USERISPACE32 ) +#define UIO_IS_USER_SPACE64( a_uio_t ) \ + ( (a_uio_t)->uio_segflg == UIO_USERSPACE64 || (a_uio_t)->uio_segflg == UIO_PHYS_USERSPACE64 || \ + (a_uio_t)->uio_segflg == UIO_USERISPACE64 ) +#define UIO_IS_USER_SPACE( a_uio_t ) \ + ( UIO_IS_USER_SPACE32((a_uio_t)) || UIO_IS_USER_SPACE64((a_uio_t)) || \ + (a_uio_t)->uio_segflg == UIO_USERSPACE || (a_uio_t)->uio_segflg == UIO_USERISPACE || \ + (a_uio_t)->uio_segflg == UIO_PHYS_USERSPACE ) + + +/* + * W A R N I N G!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + * anything in this section will be removed. please move to the uio KPI + */ + +#if 1 // UIO_KPI - WARNING OBSOLETE!!!! LP64todo - remove these!!!! +// DO NOT USE THESE +#define IS_UIO_USER_SPACE32( segflg ) \ + ( (segflg) == UIO_USERSPACE32 || (segflg) == UIO_PHYS_USERSPACE32 || \ + (segflg) == UIO_USERISPACE32 ) +#define IS_UIO_USER_SPACE64( segflg ) \ + ( (segflg) == UIO_USERSPACE64 || (segflg) == UIO_PHYS_USERSPACE64 || \ + (segflg) == UIO_USERISPACE64 ) +#define IS_UIO_USER_SPACE( segflg ) \ + ( IS_UIO_USER_SPACE32((segflg)) || IS_UIO_USER_SPACE64((segflg)) || \ + (segflg) == UIO_USERSPACE || (segflg) == UIO_USERISPACE || \ + (segflg) == UIO_PHYS_USERSPACE ) + +#define IS_UIO_SYS_SPACE32( segflg ) \ + ( (segflg) == UIO_SYSSPACE32 || (segflg) == UIO_PHYS_SYSSPACE32 || \ + (segflg) == UIO_SYSSPACE || (segflg) == UIO_PHYS_SYSSPACE ) +#define IS_UIO_SYS_SPACE64( segflg ) \ + ( (segflg) == UIO_SYSSPACE64 || (segflg) == UIO_PHYS_SYSSPACE64 ) +#define IS_UIO_SYS_SPACE( segflg ) \ + ( IS_UIO_SYS_SPACE32((segflg)) || IS_UIO_SYS_SPACE64((segflg)) ) + +#define IS_OBSOLETE_UIO_SEGFLG(segflg) \ + ( (segflg) == UIO_USERSPACE || (segflg) == UIO_USERISPACE || \ + (segflg) == UIO_SYSSPACE || (segflg) == UIO_PHYS_USERSPACE || \ + (segflg) == UIO_PHYS_SYSSPACE ) +#define IS_VALID_UIO_SEGFLG(segflg) \ + ( IS_UIO_USER_SPACE((segflg)) || IS_UIO_SYS_SPACE((segflg)) ) + +/* accessor routines for uio and embedded iovecs */ +// WARNING all these are OBSOLETE!!!! 
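+/*
+ * (Editorial migration sketch, not part of the original patch: code that
+ * walks a uio with the obsolete accessors below, e.g.
+ *
+ *	while (uio_uio_resid(uiop) > 0) {
+ *		base = uio_iov_base(uiop);
+ *		len = uio_iov_len(uiop);
+ *		... consume base/len ...
+ *		uio_next_iov(uiop);
+ *	}
+ *
+ * should move to the uio KPI named in the #warning comments below --
+ * uio_resid(), uio_curriovbase(), uio_curriovlen() and uio_update() --
+ * which hides the 32-bit/64-bit iovec layouts these helpers switch on
+ * by hand.)
+ */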
+static inline int64_t uio_uio_resid( struct uio *a_uiop ); +static inline void uio_uio_resid_add( struct uio *a_uiop, int64_t a_amount ); +static inline void uio_uio_resid_set( struct uio *a_uiop, int64_t a_value ); + +static inline void uio_iov_base_add( struct uio *a_uiop, int64_t a_amount ); +static inline void uio_iov_base_add_at( struct uio *a_uiop, int64_t a_amount, int a_index ); +static inline void uio_iov_len_add( struct uio *a_uiop, int64_t a_amount ); +static inline void uio_iov_len_add_at( struct uio *a_uiop, int64_t a_amount, int a_index ); +static inline u_int64_t uio_iov_len( struct uio *a_uiop ); +static inline u_int64_t uio_iov_len_at( struct uio *a_uiop, int a_index ); +static inline u_int64_t uio_iov_base( struct uio *a_uiop ); +static inline u_int64_t uio_iov_base_at( struct uio *a_uiop, int a_index ); +static inline void uio_next_iov( struct uio *a_uiop ); +static inline void uio_iov_len_set( struct uio *a_uiop, u_int64_t a_value ); +static inline void uio_iov_len_set_at( struct uio *a_uiop, u_int64_t a_value, int a_index ); + + +static inline int64_t uio_uio_resid( struct uio *a_uiop ) +{ +//#warning obsolete - use uio_resid call + return( (int64_t)a_uiop->uio_resid ); +} + +static inline void uio_uio_resid_add( struct uio *a_uiop, int64_t a_amount ) +{ +//#warning obsolete - use uio_update or uio_addiov or uio_setresid if in kernel and you must + a_uiop->uio_resid += ((int32_t) a_amount); +} + +static inline void uio_uio_resid_set( struct uio *a_uiop, int64_t a_value ) +{ +//#warning obsolete - use uio_update or uio_addiov or uio_setresid if in kernel and you must + a_uiop->uio_resid = a_value; +} + +static inline u_int64_t uio_iov_base( struct uio *a_uiop ) +{ +//#warning obsolete - use uio_curriovbase call + return(uio_iov_base_at(a_uiop, 0)); +} + +static inline u_int64_t uio_iov_base_at( struct uio *a_uiop, int a_index ) +{ +//#warning obsolete - use uio_curriovbase call + if (IS_UIO_USER_SPACE32(a_uiop->uio_segflg) || IS_OBSOLETE_UIO_SEGFLG(a_uiop->uio_segflg)) { + /* user space iovec was most likely a struct iovec so we must cast to uintptr_t first */ + return((u_int64_t)((uintptr_t)a_uiop->uio_iovs.iov32p[a_index].iov_base)); + } + if (IS_UIO_SYS_SPACE32(a_uiop->uio_segflg)) { + return((u_int64_t)a_uiop->uio_iovs.iov32p[a_index].iov_base); + } + if (IS_UIO_USER_SPACE64(a_uiop->uio_segflg) || IS_UIO_SYS_SPACE64(a_uiop->uio_segflg)) { + return(a_uiop->uio_iovs.iov64p[a_index].iov_base); + } + return(0); +} + +static inline u_int64_t uio_iov_len( struct uio *a_uiop ) +{ +//#warning obsolete - use uio_curriovlen call + return(uio_iov_len_at(a_uiop, 0)); +} + +static inline u_int64_t uio_iov_len_at( struct uio *a_uiop, int a_index ) +{ +//#warning obsolete - use uio_curriovlen call + if (IS_UIO_USER_SPACE32(a_uiop->uio_segflg) || + IS_UIO_SYS_SPACE32(a_uiop->uio_segflg) || + IS_OBSOLETE_UIO_SEGFLG(a_uiop->uio_segflg)) { + return((u_int64_t)a_uiop->uio_iovs.iov32p[a_index].iov_len); + } + if (IS_UIO_USER_SPACE64(a_uiop->uio_segflg) || IS_UIO_SYS_SPACE64(a_uiop->uio_segflg)) { + return(a_uiop->uio_iovs.iov64p[a_index].iov_len); + } + return(0); +} + +static inline void uio_iov_len_set_at( struct uio *a_uiop, u_int64_t a_value, int a_index ) +{ +//#warning obsolete - use uio_addiov call + if (IS_UIO_USER_SPACE32(a_uiop->uio_segflg) || + IS_UIO_SYS_SPACE32(a_uiop->uio_segflg) || + IS_OBSOLETE_UIO_SEGFLG(a_uiop->uio_segflg)) { + a_uiop->uio_iovs.iov32p[a_index].iov_len = a_value; + } + else if (IS_UIO_USER_SPACE64(a_uiop->uio_segflg) || 
IS_UIO_SYS_SPACE64(a_uiop->uio_segflg)) { + a_uiop->uio_iovs.iov64p[a_index].iov_len = a_value; + } + return; +} + +static inline void uio_iov_len_set( struct uio *a_uiop, u_int64_t a_value ) +{ +//#warning obsolete - use uio_addiov call + return(uio_iov_len_set_at(a_uiop, a_value, 0)); +} + +static inline void uio_iov_len_add_at( struct uio *a_uiop, int64_t a_amount, int a_index ) +{ +//#warning obsolete - use uio_addiov call + if (IS_UIO_USER_SPACE32(a_uiop->uio_segflg) || + IS_UIO_SYS_SPACE32(a_uiop->uio_segflg) || + IS_OBSOLETE_UIO_SEGFLG(a_uiop->uio_segflg)) { + a_uiop->uio_iovs.iov32p[a_index].iov_len += ((int32_t) a_amount); + } + else if (IS_UIO_USER_SPACE64(a_uiop->uio_segflg) || IS_UIO_SYS_SPACE64(a_uiop->uio_segflg)) { + a_uiop->uio_iovs.iov64p[a_index].iov_len += a_amount; + } + return; +} + +static inline void uio_iov_len_add( struct uio *a_uiop, int64_t a_amount ) +{ +//#warning obsolete - use uio_addiov call + return(uio_iov_len_add_at(a_uiop, a_amount, 0)); +} + +static inline void uio_iov_base_add_at( struct uio *a_uiop, int64_t a_amount, int a_index ) +{ +//#warning obsolete - use uio_addiov call + if (IS_UIO_USER_SPACE32(a_uiop->uio_segflg) || + IS_UIO_SYS_SPACE32(a_uiop->uio_segflg) || + IS_OBSOLETE_UIO_SEGFLG(a_uiop->uio_segflg)) { + a_uiop->uio_iovs.iov32p[a_index].iov_base += ((int32_t) a_amount); + } + else if (IS_UIO_USER_SPACE64(a_uiop->uio_segflg) || IS_UIO_SYS_SPACE64(a_uiop->uio_segflg)) { + a_uiop->uio_iovs.iov64p[a_index].iov_base += a_amount; + } + return; +} + +static inline void uio_iov_base_add( struct uio *a_uiop, int64_t a_amount ) +{ +//#warning obsolete - use uio_addiov call + return(uio_iov_base_add_at(a_uiop, a_amount, 0)); +} + +static inline void uio_next_iov( struct uio *a_uiop ) +{ +//#warning obsolete - use uio_update call + if (IS_UIO_USER_SPACE32(a_uiop->uio_segflg) || + IS_UIO_SYS_SPACE32(a_uiop->uio_segflg) || + IS_OBSOLETE_UIO_SEGFLG(a_uiop->uio_segflg)) { + a_uiop->uio_iovs.iov32p++; + } + else if (IS_UIO_USER_SPACE64(a_uiop->uio_segflg) || IS_UIO_SYS_SPACE64(a_uiop->uio_segflg)) { + a_uiop->uio_iovs.iov64p++; + } + return; +} + +/* + * WARNING - this routine relies on iovec_64 being larger than iovec_32 and will + * not work if you are going to initialize an array of iovec_64 as an array of + * iovec_32 then pass that array in a uio (since uio_iov is always expected to + * be an array of like sized iovecs - see how uio_next_iov gets to the next iovec) + */ +static inline void init_iovec( u_int64_t a_base, + u_int64_t a_len, + struct iovec_64 *a_iovp, + int is_64bit_process ) +{ +//#warning obsolete - use uio_create call + if (is_64bit_process) { + a_iovp->iov_base = a_base; + a_iovp->iov_len = a_len; + } + else { + struct iovec_32 *a_iov32p = (struct iovec_32 *) a_iovp; + a_iov32p->iov_base = a_base; + a_iov32p->iov_len = a_len; + } + return; +} + +#define INIT_UIO_BASE( uiop, iovcnt, offset, resid, rw, procp ) \ +{ \ + (uiop)->uio_iovcnt = (iovcnt); \ + (uiop)->uio_offset = (offset); \ + (uiop)->uio_resid = (resid); \ + (uiop)->uio_rw = (rw); \ + (uiop)->uio_procp = (procp); \ +} +#define INIT_UIO_USER32( uiop, iovp, iovcnt, offset, resid, rw, procp ) \ +{ \ + (uiop)->uio_iovs.iov32p = (iovp); \ + (uiop)->uio_segflg = UIO_USERSPACE; \ + INIT_UIO_BASE((uiop), (iovcnt), (offset), (resid), (rw), (procp)); \ +} +#define INIT_UIO_USER64( uiop, iovp, iovcnt, offset, resid, rw, procp ) \ +{ \ + (uiop)->uio_iovs.iov64p = (iovp); \ + (uiop)->uio_segflg = UIO_USERSPACE64; \ + INIT_UIO_BASE((uiop), (iovcnt), (offset), (resid), (rw), (procp)); \ +} 
+#define INIT_UIO_SYS32( uiop, iovp, iovcnt, offset, resid, rw, procp ) \ +{ \ + (uiop)->uio_iovs.iov32p = (iovp); \ + (uiop)->uio_segflg = UIO_SYSSPACE; \ + INIT_UIO_BASE((uiop), (iovcnt), (offset), (resid), (rw), (procp)); \ +} +#define INIT_UIO_USERSPACE( uiop, iovp, iovcnt, offset, resid, rw, procp ) \ +{ \ + if (IS_64BIT_PROCESS((procp))) { \ + (uiop)->uio_iovs.iov64p = (iovp); \ + (uiop)->uio_segflg = UIO_USERSPACE64; \ + } \ + else { \ + (uiop)->uio_iovs.iov32p = (struct iovec_32 *)(iovp); \ + (uiop)->uio_segflg = UIO_USERSPACE; \ + } \ + INIT_UIO_BASE((uiop), (iovcnt), (offset), (resid), (rw), (procp)); \ +} +#define INIT_UIO_SYSSPACE( uiop, iovp, iovcnt, offset, resid, rw, procp ) \ +{ \ + if (0) { /* we do not support 64-bit system space yet */ \ + (uiop)->uio_iovs.iov64p = (iovp); \ + (uiop)->uio_segflg = UIO_SYSSPACE64; \ + } \ + else { \ + (uiop)->uio_iovs.iov32p = (struct iovec_32 *)(iovp); \ + (uiop)->uio_segflg = UIO_SYSSPACE; \ + } \ + INIT_UIO_BASE((uiop), (iovcnt), (offset), (resid), (rw), (procp)); \ +} +#endif // UIO_KPI - WARNING OBSOLETE!!!! + + +#endif /* KERNEL */ +#endif /* !_SYS_UIO_INTERNAL_H_ */ diff --git a/bsd/sys/un.h b/bsd/sys/un.h index e3e4cdb5c..d200eff87 100644 --- a/bsd/sys/un.h +++ b/bsd/sys/un.h @@ -58,34 +58,53 @@ #define _SYS_UN_H_ #include <sys/appleapiopts.h> +#include <sys/cdefs.h> +#include <sys/_types.h> + +/* [XSI] The sa_family_t type shall be defined as described in <sys/socket.h> */ +#ifndef _SA_FAMILY_T +#define _SA_FAMILY_T +typedef __uint8_t sa_family_t; +#endif /* - * Definitions for UNIX IPC domain. + * [XSI] Definitions for UNIX IPC domain. */ struct sockaddr_un { - u_char sun_len; /* sockaddr len including null */ - u_char sun_family; /* AF_UNIX */ - char sun_path[104]; /* path name (gag) */ + unsigned char sun_len; /* sockaddr len including null */ + sa_family_t sun_family; /* [XSI] AF_UNIX */ + char sun_path[104]; /* [XSI] path name (gag) */ }; +#ifndef _POSIX_C_SOURCE +/* Socket options. 
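+ *
+ * (Editorial usage sketch, not part of the original patch: a process can
+ * fetch the credentials of the peer on a connected PF_LOCAL socket with
+ *
+ *	struct xucred cr;
+ *	socklen_t len = sizeof(cr);
+ *	getsockopt(s, 0, LOCAL_PEERCRED, &cr, &len);
+ *
+ * where option level 0 addresses the UNIX IPC domain itself and
+ * struct xucred comes from <sys/ucred.h>.)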
*/
+#define LOCAL_PEERCRED 0x001 /* retrieve peer credentials */
+#endif /* !_POSIX_C_SOURCE */
+
+
 #ifdef KERNEL
-#ifdef __APPLE_API_PRIVATE
+#ifdef PRIVATE
+__BEGIN_DECLS
 struct mbuf;
 struct socket;
-int uipc_usrreq __P((struct socket *so, int req, struct mbuf *m,
- struct mbuf *nam, struct mbuf *control));
-int unp_connect2 __P((struct socket *so, struct socket *so2));
-void unp_dispose __P((struct mbuf *m));
-int unp_externalize __P((struct mbuf *rights));
-void unp_init __P((void));
+int uipc_usrreq(struct socket *so, int req, struct mbuf *m,
+ struct mbuf *nam, struct mbuf *control);
+int uipc_ctloutput (struct socket *so, struct sockopt *sopt);
+int unp_connect2(struct socket *so, struct socket *so2);
+void unp_dispose(struct mbuf *m);
+int unp_externalize(struct mbuf *rights);
+void unp_init(void);
 extern struct pr_usrreqs uipc_usrreqs;
-#endif /* __APPLE_API_PRIVATE */
+__END_DECLS
+#endif /* PRIVATE */
 #else /* !KERNEL */
+#ifndef _POSIX_C_SOURCE
 /* actual length of an initialized sockaddr_un */
 #define SUN_LEN(su) \
 (sizeof(*(su)) - sizeof((su)->sun_path) + strlen((su)->sun_path))
+#endif /* !_POSIX_C_SOURCE */
 #endif /* KERNEL */
diff --git a/bsd/sys/unistd.h b/bsd/sys/unistd.h
index d455b4db7..74fcc4411 100644
--- a/bsd/sys/unistd.h
+++ b/bsd/sys/unistd.h
@@ -58,9 +58,6 @@
 #ifndef _SYS_UNISTD_H_
 #define _SYS_UNISTD_H_
-/* compile-time symbolic constants */
-#define _POSIX_JOB_CONTROL /* implementation supports job control */
-
 /*
  * Although we have saved user/group IDs, we do not use them in setuid
  * as described in POSIX 1003.1, because the feature does not work for
@@ -71,46 +68,74 @@
 #define _POSIX_SAVED_IDS /* saved set-user-ID and set-group-ID */
 #endif
-#define _POSIX_VERSION 198808L
-#define _POSIX2_VERSION 199212L
+#define _POSIX_VERSION 200112L
+#define _POSIX2_VERSION 200112L
 /* execution-time symbolic constants */
- /* chown requires appropriate privileges */
-#define _POSIX_CHOWN_RESTRICTED 1
- /* too-long path components generate errors */
-#define _POSIX_NO_TRUNC 1
 /* may disable terminal special characters */
 #ifndef _POSIX_VDISABLE
 #define _POSIX_VDISABLE ((unsigned char)'\377')
 #endif
-#define _POSIX_THREADS
-#define _POSIX_THREAD_ATTR_STACKADDR
-#define _POSIX_THREAD_ATTR_STACKSIZE
-#define _POSIX_THREAD_PRIORITY_SCHEDULING
-#define _POSIX_THREAD_PRIO_INHERIT
-#define _POSIX_THREAD_PRIO_PROTECT
-
 #define _POSIX_THREAD_KEYS_MAX 128
 /* access function */
 #define F_OK 0 /* test for existence of file */
-#define X_OK 0x01 /* test for execute or search permission */
-#define W_OK 0x02 /* test for write permission */
-#define R_OK 0x04 /* test for read permission */
+#define X_OK (1<<0) /* test for execute or search permission */
+#define W_OK (1<<1) /* test for write permission */
+#define R_OK (1<<2) /* test for read permission */
+
+#ifndef _POSIX_C_SOURCE
+/*
+ * Extended access functions.
+ * Note that we depend on these matching the definitions in sys/kauth.h,
+ * but with the bits shifted left by 8.
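+ *
+ * (For example, _READ_OK (1<<9) is KAUTH_VNODE_READ_DATA (1<<1) shifted
+ * left by 8 -- an editorial illustration of the dependency noted above,
+ * not part of the original patch.)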
+ */ +#define _READ_OK (1<<9) /* read file data / read directory */ +#define _WRITE_OK (1<<10) /* write file data / add file to directory */ +#define _EXECUTE_OK (1<<11) /* execute file / search in directory*/ +#define _DELETE_OK (1<<12) /* delete file / delete directory */ +#define _APPEND_OK (1<<13) /* append to file / add subdirectory to directory */ +#define _RMFILE_OK (1<<14) /* - / remove file from directory */ +#define _RATTR_OK (1<<15) /* read basic attributes */ +#define _WATTR_OK (1<<16) /* write basic attributes */ +#define _REXT_OK (1<<17) /* read extended attributes */ +#define _WEXT_OK (1<<18) /* write extended attributes */ +#define _RPERM_OK (1<<19) /* read permissions */ +#define _WPERM_OK (1<<20) /* write permissions */ +#define _CHOWN_OK (1<<21) /* change ownership */ + +#define _ACCESS_EXTENDED_MASK (_READ_OK | _WRITE_OK | _EXECUTE_OK | \ + _DELETE_OK | _APPEND_OK | \ + _RMFILE_OK | _REXT_OK | \ + _WEXT_OK | _RATTR_OK | _WATTR_OK | _RPERM_OK | \ + _WPERM_OK | _CHOWN_OK) +#endif /* whence values for lseek(2) */ +#ifndef SEEK_SET #define SEEK_SET 0 /* set file offset to offset */ #define SEEK_CUR 1 /* set file offset to current plus offset */ #define SEEK_END 2 /* set file offset to EOF plus offset */ +#endif /* !SEEK_SET */ -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE /* whence values for lseek(2); renamed by POSIX 1003.1 */ #define L_SET SEEK_SET #define L_INCR SEEK_CUR #define L_XTND SEEK_END #endif +#ifndef _POSIX_C_SOURCE +struct accessx_descriptor { + unsigned ad_name_offset; + int ad_flags; + int ad_pad[2]; +}; +#define ACCESSX_MAX_DESCRIPTORS 100 +#define ACCESSX_MAX_TABLESIZE (16 * 1024) +#endif + /* configurable pathname variables */ #define _PC_LINK_MAX 1 #define _PC_MAX_CANON 2 @@ -122,16 +147,15 @@ #define _PC_NO_TRUNC 8 #define _PC_VDISABLE 9 -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE #define _PC_NAME_CHARS_MAX 10 #define _PC_CASE_SENSITIVE 11 #define _PC_CASE_PRESERVING 12 +#define _PC_EXTENDED_SECURITY_NP 13 +#define _PC_AUTH_OPAQUE_NP 14 #endif /* configurable system strings */ #define _CS_PATH 1 -/* async IO support */ -#define _POSIX_ASYNCHRONOUS_IO - #endif /* !_SYS_UNISTD_H_ */ diff --git a/bsd/sys/unpcb.h b/bsd/sys/unpcb.h index 408cc93d1..2a5f8af33 100644 --- a/bsd/sys/unpcb.h +++ b/bsd/sys/unpcb.h @@ -60,6 +60,7 @@ #include <sys/appleapiopts.h> #include <sys/queue.h> #include <sys/un.h> +#include <sys/ucred.h> /* * Protocol control block for an active @@ -86,9 +87,13 @@ * so that changes in the sockbuf may be computed to modify * back pressure on the sender accordingly. */ -#ifdef __APPLE_API_PRIVATE -typedef u_quad_t unp_gen_t; + +typedef u_quad_t unp_gen_t; + +#ifdef PRIVATE LIST_HEAD(unp_head, unpcb); +#ifdef KERNEL +#define sotounpcb(so) ((struct unpcb *)((so)->so_pcb)) struct unpcb { LIST_ENTRY(unpcb) unp_link; /* glue on list of all PCBs */ @@ -102,31 +107,67 @@ struct unpcb { int unp_cc; /* copy of rcv.sb_cc */ int unp_mbcnt; /* copy of rcv.sb_mbcnt */ unp_gen_t unp_gencnt; /* generation count of this instance */ + int unp_flags; /* flags */ + struct xucred unp_peercred; /* peer credentials, if applicable */ }; +#endif /* KERNEL */ -#define sotounpcb(so) ((struct unpcb *)((so)->so_pcb)) -#endif /* __APPLE_API_PRIVATE */ +/* + * Flags in unp_flags. + * + * UNP_HAVEPC - indicates that the unp_peercred member is filled in + * and is really the credentials of the connected peer. This is used + * to determine whether the contents should be sent to the user or + * not. 
+ * + * UNP_HAVEPCCACHED - indicates that the unp_peercred member is filled + * in, but does *not* contain the credentials of the connected peer + * (there may not even be a peer). This is set in unp_listen() when + * it fills in unp_peercred for later consumption by unp_connect(). + */ +#define UNP_HAVEPC 0x001 +#define UNP_HAVEPCCACHED 0x002 + +#ifdef KERNEL +struct unpcb_compat { +#else /* KERNEL */ +#define unpcb_compat unpcb +struct unpcb { +#endif /* KERNEL */ + LIST_ENTRY(unpcb_compat) unp_link; /* glue on list of all PCBs */ + struct socket *unp_socket; /* pointer back to socket */ + struct vnode *unp_vnode; /* if associated with file */ + ino_t unp_ino; /* fake inode number */ + struct unpcb_compat *unp_conn; /* control block of connected socket */ + struct unp_head unp_refs; /* referencing socket linked list */ + LIST_ENTRY(unpcb_compat) unp_reflink; /* link in unp_refs list */ + struct sockaddr_un *unp_addr; /* bound address of socket */ + int unp_cc; /* copy of rcv.sb_cc */ + int unp_mbcnt; /* copy of rcv.sb_mbcnt */ + unp_gen_t unp_gencnt; /* generation count of this instance */ +}; /* Hack alert -- this structure depends on <sys/socketvar.h>. */ -#ifdef _SYS_SOCKETVAR_H_ -#ifdef __APPLE_API_UNSTABLE -struct xunpcb { - size_t xu_len; /* length of this structure */ - struct unpcb *xu_unpp; /* to help netstat, fstat */ - struct unpcb xu_unp; /* our information */ - union { - struct sockaddr_un xuu_addr; /* our bound address */ - char xu_dummy1[256]; - } xu_au; -#define xu_addr xu_au.xuu_addr - union { - struct sockaddr_un xuu_caddr; /* their bound address */ - char xu_dummy2[256]; - } xu_cau; -#define xu_caddr xu_cau.xuu_caddr - struct xsocket xu_socket; - u_quad_t xu_alignment_hack; +#ifdef _SYS_SOCKETVAR_H_ +struct xunpcb { + size_t xu_len; /* length of this structure */ + struct unpcb_compat *xu_unpp; /* to help netstat, fstat */ + struct unpcb_compat xu_unp; /* our information */ + union { + struct sockaddr_un xuu_addr; /* our bound address */ + char xu_dummy1[256]; + } xu_au; +#define xu_addr xu_au.xuu_addr + union { + struct sockaddr_un xuu_caddr; /* their bound address */ + char xu_dummy2[256]; + } xu_cau; +#define xu_caddr xu_cau.xuu_caddr + struct xsocket xu_socket; + u_quad_t xu_alignment_hack; }; +#endif /* _SYS_SOCKETVAR_H_ */ +#endif /* PRIVATE */ struct xunpgen { size_t xug_len; @@ -134,7 +175,5 @@ struct xunpgen { unp_gen_t xug_gen; so_gen_t xug_sogen; }; -#endif /* __APPLE_API_UNSTABLE */ -#endif /* _SYS_SOCKETVAR_H_ */ #endif /* _SYS_UNPCB_H_ */ diff --git a/bsd/sys/user.h b/bsd/sys/user.h index 8c1a005ab..b90a52481 100644 --- a/bsd/sys/user.h +++ b/bsd/sys/user.h @@ -68,14 +68,16 @@ #include <sys/uio.h> #endif #include <sys/resourcevar.h> +#ifdef KERNEL_PRIVATE #include <sys/signalvar.h> +#endif #include <sys/vm.h> /* XXX */ #include <sys/sysctl.h> #ifdef KERNEL - #ifdef __APPLE_API_PRIVATE -struct nlminfo; +#include <sys/eventvar.h> + /* * Per-thread U area. */ @@ -83,39 +85,44 @@ struct uthread { int *uu_ar0; /* address of users saved R0 */ /* syscall parameters, results and catches */ - int uu_arg[8]; /* arguments to current system call */ + u_int64_t uu_arg[8]; /* arguments to current system call */ int *uu_ap; /* pointer to arglist */ int uu_rval[2]; /* thread exception handling */ int uu_code; /* ``code'' to trap */ - char uu_cursig; /* p_cursig for exc. */ - struct nlminfo *uu_nlminfo; /* for rpc.lockd */ - /* support for syscalls which use continuations */ + char uu_cursig; /* p_cursig for exc. 
*/ + /* support for select - across system calls */ + struct _select { + u_int32_t *ibits, *obits; /* bits to select on */ + uint nbytes; /* number of bytes in ibits and obits */ + wait_queue_set_t wqset; /* cached across select calls */ + size_t allocsize; /* ...size of select cache */ + u_int64_t abstime; + int poll; + int error; + int count; + char * wql; + } uu_select; /* saved state for select() */ + /* to support continuations */ union { - struct _select { - u_int32_t *ibits, *obits; /* bits to select on */ - uint nbytes; /* number of bytes in ibits and obits */ - u_int64_t abstime; - int poll; - int error; - int count; - int nfcount; - char * wql; - int allocsize; /* select allocated size */ - } ss_select; /* saved state for select() */ - struct _wait { - int f; - } ss_wait; /* saved state for wait?() */ - struct _owait { - int pid; - int *status; - int options; - struct rusage *rusage; - } ss_owait; - int uu_nfs_myiod; /* saved state for nfsd */ + int uu_nfs_myiod; /* saved state for nfsd */ + struct _kevent_scan { + kevent_callback_t call; /* per-event callback */ + kevent_continue_t cont; /* whole call continuation */ + uint64_t deadline; /* computed deadline for operation */ + void *data; /* caller's private data */ + } ss_kevent_scan; /* saved state for kevent_scan() */ + struct _kevent { + struct _kevent_scan scan;/* space for the generic data */ + struct fileproc *fp; /* fileproc we hold iocount on */ + int fd; /* filedescriptor for kq */ + register_t *retval; /* place to store return val */ + user_addr_t eventlist; /* user-level event list address */ + int eventcount; /* user-level event count */ + int eventout; /* number of events output */ + } ss_kevent; /* saved state for kevent() */ } uu_state; - /* internal support for continuation framework */ int (*uu_continuation)(int); int uu_pri; @@ -123,27 +130,48 @@ struct uthread { int uu_flag; struct proc * uu_proc; void * uu_userstate; - wait_queue_sub_t uu_wqsub; sigset_t uu_siglist; /* signals pending for the thread */ sigset_t uu_sigwait; /* sigwait on this thread*/ sigset_t uu_sigmask; /* signal mask for the thread */ sigset_t uu_oldmask; /* signal mask saved before sigpause */ - thread_act_t uu_act; + thread_t uu_act; sigset_t uu_vforkmask; /* saved signal mask during vfork */ TAILQ_ENTRY(uthread) uu_list; /* List of uthreads in proc */ struct kaudit_record *uu_ar; /* audit record */ struct task* uu_aio_task; /* target task for async io */ + + /* network support for dlil layer locking */ + u_int32_t dlil_incremented_read; + lck_mtx_t *uu_mtx; + + int uu_lowpri_delay; + + struct ucred *uu_ucred; /* per thread credential */ + int uu_defer_reclaims; + vnode_t uu_vreclaims; + +#ifdef JOE_DEBUG + int uu_iocount; + int uu_vpindex; + void * uu_vps[32]; +#endif }; typedef struct uthread * uthread_t; /* Definition of uu_flag */ -#define USAS_OLDMASK 0x1 /* need to restore mask before pause */ -#define UNO_SIGMASK 0x2 /* exited thread; invalid sigmask */ -/* Kept same as in proc */ -#define P_VFORK 0x2000000 /* process has vfork children */ +#define UT_SAS_OLDMASK 0x00000001 /* need to restore mask before pause */ +#define UT_NO_SIGMASK 0x00000002 /* exited thread; invalid sigmask */ +#define UT_NOTCANCELPT 0x00000004 /* not a cancelation point */ +#define UT_CANCEL 0x00000008 /* thread marked for cancel */ +#define UT_CANCELED 0x00000010 /* thread cancelled */ +#define UT_CANCELDISABLE 0x00000020 /* thread cancel disabled */ + +#define UT_VFORK 0x02000000 /* thread has vfork children */ +#define UT_SETUID 0x04000000 /* thread is 
settugid() */ +#define UT_WASSETUID 0x08000000 /* thread was settugid() (in vfork) */ #endif /* __APPLE_API_PRIVATE */ diff --git a/bsd/sys/utfconv.h b/bsd/sys/utfconv.h index 18db6b179..1feafbf5b 100644 --- a/bsd/sys/utfconv.h +++ b/bsd/sys/utfconv.h @@ -37,13 +37,16 @@ #define UTF_PRECOMPOSED 0x08 /* generate precomposed UCS-2 */ __BEGIN_DECLS -size_t utf8_encodelen __P((const u_int16_t *, size_t, u_int16_t, int)); +size_t utf8_encodelen(const u_int16_t *, size_t, u_int16_t, int); -int utf8_encodestr __P((const u_int16_t *, size_t, u_int8_t *, size_t *, - size_t, u_int16_t, int)); +int utf8_encodestr(const u_int16_t *, size_t, u_int8_t *, size_t *, + size_t, u_int16_t, int); + +int utf8_decodestr(const u_int8_t *, size_t, u_int16_t *,size_t *, + size_t, u_int16_t, int); + +int utf8_validatestr(const u_int8_t*, size_t); -int utf8_decodestr __P((const u_int8_t *, size_t, u_int16_t *,size_t *, - size_t, u_int16_t, int)); __END_DECLS #endif /* __APPLE_API_UNSTABLE */ diff --git a/bsd/sys/utsname.h b/bsd/sys/utsname.h index 35779be0d..8f3d2e2f8 100644 --- a/bsd/sys/utsname.h +++ b/bsd/sys/utsname.h @@ -61,20 +61,20 @@ #ifndef _SYS_UTSNAME_H #define _SYS_UTSNAME_H +#include <sys/cdefs.h> + #define _SYS_NAMELEN 256 struct utsname { - char sysname[_SYS_NAMELEN]; /* Name of OS */ - char nodename[_SYS_NAMELEN]; /* Name of this network node */ - char release[_SYS_NAMELEN]; /* Release level */ - char version[_SYS_NAMELEN]; /* Version level */ - char machine[_SYS_NAMELEN]; /* Hardware type */ + char sysname[_SYS_NAMELEN]; /* [XSI] Name of OS */ + char nodename[_SYS_NAMELEN]; /* [XSI] Name of this network node */ + char release[_SYS_NAMELEN]; /* [XSI] Release level */ + char version[_SYS_NAMELEN]; /* [XSI] Version level */ + char machine[_SYS_NAMELEN]; /* [XSI] Hardware type */ }; -#include <sys/cdefs.h> - __BEGIN_DECLS -int uname __P((struct utsname *)); +int uname(struct utsname *); __END_DECLS #endif /* !_SYS_UTSNAME_H */ diff --git a/bsd/sys/ux_exception.h b/bsd/sys/ux_exception.h index 2b55b6cb5..2f4372642 100644 --- a/bsd/sys/ux_exception.h +++ b/bsd/sys/ux_exception.h @@ -57,6 +57,10 @@ extern mach_port_name_t ux_exception_port; +boolean_t machine_exception(int exception, int code, int subcode, + int *unix_signal, int *unix_code); +void ux_handler_init(void); + #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ diff --git a/bsd/sys/version.h b/bsd/sys/version.h index 308e5f5f4..a11538fb6 100644 --- a/bsd/sys/version.h +++ b/bsd/sys/version.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * diff --git a/bsd/sys/vfs_context.h b/bsd/sys/vfs_context.h new file mode 100644 index 000000000..931271de2 --- /dev/null +++ b/bsd/sys/vfs_context.h @@ -0,0 +1,14 @@ +#ifndef _BSD_SYS_VFS_CONTEXT_H_ +#define _BSD_SYS_VFS_CONTEXT_H_ + +#include <sys/cdefs.h> +#include <sys/types.h> +#include <sys/kernel_types.h> +#include <stdint.h> + +struct vfs_context { + proc_t vc_proc; + ucred_t vc_ucred; +}; + +#endif /* !_BSD_SYS_VFS_CONTEXT_H_ */ diff --git a/bsd/sys/vm.h b/bsd/sys/vm.h index 1718e1369..2ff69a04b 100644 --- a/bsd/sys/vm.h +++ b/bsd/sys/vm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -63,6 +63,9 @@ #define _SYS_VM_H #include <sys/appleapiopts.h> +#include <sys/cdefs.h> + +#ifdef BSD_KERNEL_PRIVATE /* Machine specific config stuff */ #if defined(KERNEL) && !defined(MACH_USER_API) @@ -71,7 +74,6 @@ #include <mach/vm_param.h> #endif -#ifdef __APPLE_API_OBSOLETE /* * Shareable process virtual address space. * May eventually be merged with vm_map. @@ -92,22 +94,55 @@ struct vmspace { caddr_t vm_maxsaddr; /* user VA at max stack growth */ }; -#else /* __APPLE_API_OBSOLETE */ +#ifdef KERNEL +// LP64todo - should this move? +/* LP64 version of vmspace. all pointers + * grow when we're dealing with a 64-bit process. + * WARNING - keep in sync with vmspace + */ + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_vmspace { + int vm_refcnt; /* number of references */ + user_addr_t vm_shm; /* SYS5 shared memory private data XXX */ + segsz_t vm_rssize; /* current resident set size in pages */ + segsz_t vm_swrss; /* resident set size before last swap */ + segsz_t vm_tsize; /* text size (pages) XXX */ + segsz_t vm_dsize; /* data size (pages) XXX */ + segsz_t vm_ssize; /* stack size (pages) */ + user_addr_t vm_taddr; /* user virtual address of text XXX */ + user_addr_t vm_daddr; /* user virtual address of data XXX */ + user_addr_t vm_maxsaddr; /* user VA at max stack growth */ +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +#endif /* KERNEL */ + +#include <kern/thread.h> + +#else /* BSD_KERNEL_PRIVATE */ /* just to keep kinfo_proc happy */ +/* NOTE: Pointer fields are size variant for LP64 */ struct vmspace { - int32_t dummy[10]; + int32_t dummy; + caddr_t dummy2; + int32_t dummy3[5]; + caddr_t dummy4[3]; }; -#endif /* __APPLE_API_OBSOLETE */ -#ifdef KERNEL +#endif /* BSD_KERNEL_PRIVATE */ -#ifdef __APPLE_API_PRIVATE -#ifdef BSD_BUILD -#include <kern/thread.h> -#endif /* BSD_BUILD */ -#endif /* __APPLE_API_PRIVATE */ +#ifdef KERNEL +__BEGIN_DECLS struct proc *current_proc(void); +__END_DECLS #endif /* KERNEL */ diff --git a/bsd/sys/vmmeter.h b/bsd/sys/vmmeter.h index 0564670bc..caa2fbc15 100644 --- a/bsd/sys/vmmeter.h +++ b/bsd/sys/vmmeter.h @@ -110,9 +110,6 @@ struct vmmeter { u_int v_inactive_target; /* number of pages desired inactive */ u_int v_inactive_count; /* number of pages inactive */ }; -#ifdef KERNEL -extern struct vmmeter cnt; -#endif /* systemwide totals computed every five seconds */ struct vmtotal diff --git a/bsd/sys/vnioctl.h b/bsd/sys/vnioctl.h index e3a3729a6..37bb0de23 100644 --- a/bsd/sys/vnioctl.h +++ b/bsd/sys/vnioctl.h @@ -1,3 +1,24 @@ +/* + * Copyright (c) 2004-2005 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1990, 1993 @@ -46,10 +67,9 @@ #define _SYS_VNIOCTL_H_ #include <sys/appleapiopts.h> +#include <sys/cdefs.h> -#ifdef KERNEL_PRIVATE -#ifdef __APPLE_API_PRIVATE /* * Ioctl definitions for file (vnode) disk pseudo-device. */ @@ -62,10 +82,28 @@ typedef enum { struct vn_ioctl { char * vn_file; /* pathname of file to mount */ - int vn_size; /* (returned) size of disk */ + int vn_size; /* (returned) size of disk */ vncontrol_t vn_control; }; +#ifdef KERNEL_PRIVATE + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=natural +#endif + +struct user_vn_ioctl { + u_int64_t vn_file; /* pathname of file to mount */ + int vn_size; /* (returned) size of disk */ + vncontrol_t vn_control; +}; + +#if __DARWIN_ALIGN_NATURAL +#pragma options align=reset +#endif + +#endif /* KERNEL_PRIVATE */ + /* * Before you can use a unit, it must be configured with VNIOCSET. * The configuration persists across opens and closes of the device; @@ -74,11 +112,16 @@ struct vn_ioctl { */ #define VNIOCATTACH _IOWR('F', 0, struct vn_ioctl) /* attach file */ #define VNIOCDETACH _IOWR('F', 1, struct vn_ioctl) /* detach disk */ -#define VNIOCGSET _IOWR('F', 2, u_long ) /* set global option */ -#define VNIOCGCLEAR _IOWR('F', 3, u_long ) /* reset --//-- */ -#define VNIOCUSET _IOWR('F', 4, u_long ) /* set unit option */ -#define VNIOCUCLEAR _IOWR('F', 5, u_long ) /* reset --//-- */ +#define VNIOCGSET _IOWR('F', 2, u_int32_t ) /* set global option */ +#define VNIOCGCLEAR _IOWR('F', 3, u_int32_t ) /* reset --//-- */ +#define VNIOCUSET _IOWR('F', 4, u_int32_t ) /* set unit option */ +#define VNIOCUCLEAR _IOWR('F', 5, u_int32_t ) /* reset --//-- */ #define VNIOCSHADOW _IOWR('F', 6, struct vn_ioctl) /* attach shadow */ +#ifdef KERNEL_PRIVATE +#define VNIOCATTACH64 _IOWR('F', 0, struct user_vn_ioctl) /* attach file - LP64 */ +#define VNIOCDETACH64 _IOWR('F', 1, struct user_vn_ioctl) /* detach disk - LP64 */ +#define VNIOCSHADOW64 _IOWR('F', 6, struct user_vn_ioctl) /* attach shadow - LP64 */ +#endif /* KERNEL_PRIVATE */ #define VN_LABELS 0x1 /* Use disk(/slice) labels */ #define VN_FOLLOW 0x2 /* Debug flow in vn driver */ @@ -87,8 +130,5 @@ struct vn_ioctl { #define VN_DONTCLUSTER 0x10 /* Don't cluster */ #define VN_RESERVE 0x20 /* Pre-reserve swap */ -#endif /* __APPLE_API_PRIVATE */ - -#endif /* KERNEL_PRIVATE */ #endif /* _SYS_VNIOCTL_H_*/ diff --git a/bsd/sys/vnode.h b/bsd/sys/vnode.h index a6f13a11b..9bac1aec0 100644 --- a/bsd/sys/vnode.h +++ b/bsd/sys/vnode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -60,19 +60,11 @@ #include <sys/appleapiopts.h> #include <sys/cdefs.h> -#include <sys/queue.h> -#include <sys/lock.h> - -#include <sys/time.h> -#include <sys/uio.h> - -#include <sys/vm.h> #ifdef KERNEL -#include <sys/systm.h> -#include <vm/vm_pageout.h> -#endif /* KERNEL */ +#include <sys/kernel_types.h> +#include <sys/signal.h> +#endif -#ifdef __APPLE_API_PRIVATE /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, @@ -96,177 +88,300 @@ enum vtagtype { VT_UNION, VT_HFS, VT_VOLFS, VT_DEVFS, VT_WEBDAV, VT_UDF, VT_AFP, VT_CDDA, VT_CIFS,VT_OTHER}; + /* - * Each underlying filesystem allocates its own private area and hangs - * it from v_data. If non-null, this area is freed in getnewvnode(). 
+ * flags for VNOP_BLOCKMAP
 */
-LIST_HEAD(buflists, buf);
+#define VNODE_READ 0x01
+#define VNODE_WRITE 0x02
-#define MAX_CLUSTERS 4 /* maximum number of vfs clusters per vnode */
-struct v_cluster {
- unsigned int start_pg;
- unsigned int last_pg;
-};
-struct v_padded_clusters {
- long v_pad;
- struct v_cluster v_c[MAX_CLUSTERS];
-};
+/* flags for VNOP_ALLOCATE */
+#define PREALLOCATE 0x00000001 /* preallocate allocation blocks */
+#define ALLOCATECONTIG 0x00000002 /* allocate contiguous space */
+#define ALLOCATEALL 0x00000004 /* allocate all requested space */
+ /* or no space at all */
+#define FREEREMAINDER 0x00000008 /* deallocate allocated but */
+ /* unfilled blocks */
+#define ALLOCATEFROMPEOF 0x00000010 /* allocate from the physical eof */
+#define ALLOCATEFROMVOL 0x00000020 /* allocate from the volume offset */
 /*
- * Reading or writing any of these items requires holding the appropriate lock.
- * v_freelist is locked by the global vnode_free_list simple lock.
- * v_mntvnodes is locked by the global mntvnodes simple lock.
- * v_flag, v_usecount, v_holdcount and v_writecount are
- * locked by the v_interlock simple lock.
+ * Token indicating no attribute value yet assigned. Some user source uses this
 */
-struct vnode {
- u_long v_flag; /* vnode flags (see below) */
- long v_usecount; /* reference count of users */
- long v_holdcnt; /* page & buffer references */
- daddr_t v_lastr; /* last read (read-ahead) */
- u_long v_id; /* capability identifier */
- struct mount *v_mount; /* ptr to vfs we are in */
- int (**v_op)(void *); /* vnode operations vector */
- TAILQ_ENTRY(vnode) v_freelist; /* vnode freelist */
- LIST_ENTRY(vnode) v_mntvnodes; /* vnodes for mount point */
- struct buflists v_cleanblkhd; /* clean blocklist head */
- struct buflists v_dirtyblkhd; /* dirty blocklist head */
- long v_numoutput; /* num of writes in progress */
- enum vtype v_type; /* vnode type */
- union {
- struct mount *vu_mountedhere;/* ptr to mounted vfs (VDIR) */
- struct socket *vu_socket; /* unix ipc (VSOCK) */
- struct specinfo *vu_specinfo; /* device (VCHR, VBLK) */
- struct fifoinfo *vu_fifoinfo; /* fifo (VFIFO) */
- char *vu_name; /* name (only for VREG) */
- } v_un;
- struct ubc_info *v_ubcinfo; /* valid for (VREG) */
- struct nqlease *v_lease; /* Soft reference to lease */
- void *v_scmap; /* pointer to sparse cluster map */
- int v_scdirty; /* number of dirty pages in the sparse cluster map */
- daddr_t v_ciosiz; /* real size of I/O for cluster */
- int v_clen; /* length of current cluster */
- int v_ralen; /* Read-ahead length */
- daddr_t v_maxra; /* last readahead block */
- union {
- simple_lock_data_t v_ilk; /* lock on usecount and flag */
- struct v_padded_clusters v_cl; /* vfs cluster IO */
- } v_un1;
-#define v_clusters v_un1.v_cl.v_c
-#define v_interlock v_un1.v_ilk
-
- struct lock__bsd__ *v_vnlock; /* used for non-locking fs's */
- long v_writecount; /* reference count of writers */
- enum vtagtype v_tag; /* type of underlying data */
- void *v_data; /* private data for fs */
-};
-#define v_mountedhere v_un.vu_mountedhere
-#define v_socket v_un.vu_socket
-#define v_specinfo v_un.vu_specinfo
-#define v_fifoinfo v_un.vu_fifoinfo
+#define VNOVAL (-1)
-// NOTE: Do not use these macros. They are for vfs internal use only.
-#define VNAME(vp) ((char *)((vp)->v_type == VREG ? (vp)->v_un.vu_name : (vp)->v_scmap))
-#define VPARENT(vp) ((struct vnode *)((vp)->v_type == VREG ? (vp)->v_un1.v_cl.v_pad : (vp)->v_scdirty))
+#ifdef KERNEL
+/*
+ * Flags for ioflag.
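+ * (Editorial note, not part of the original patch: these are the values
+ * carried in the ioflag argument of VNOP_READ/VNOP_WRITE and of helpers
+ * such as vn_rdwr(), e.g. IO_UNIT | IO_SYNC for an atomic synchronous
+ * write.)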
+ */ +#define IO_UNIT 0x0001 /* do I/O as atomic unit */ +#define IO_APPEND 0x0002 /* append write to end */ +#define IO_SYNC 0x0004 /* do I/O synchronously */ +#define IO_NODELOCKED 0x0008 /* underlying node already locked */ +#define IO_NDELAY 0x0010 /* FNDELAY flag set in file table */ +#define IO_NOZEROFILL 0x0020 /* F_SETSIZE fcntl uses to prevent zero filling */ +#define IO_TAILZEROFILL 0x0040 /* zero fills at the tail of write */ +#define IO_HEADZEROFILL 0x0080 /* zero fills at the head of write */ +#define IO_NOZEROVALID 0x0100 /* do not zero fill if valid page */ +#define IO_NOZERODIRTY 0x0200 /* do not zero fill if page is dirty */ +#define IO_CLOSE 0x0400 /* I/O issued from close path */ +#define IO_NOCACHE 0x0800 /* same effect as VNOCACHE_DATA, but only for this 1 I/O */ +#define IO_RAOFF 0x1000 /* same effect as VRAOFF, but only for this 1 I/O */ +#define IO_DEFWRITE 0x2000 /* defer write if vfs.defwrite is set */ /* - * Vnode flags. + * Component Name: this structure describes the pathname + * information that is passed through the VNOP interface. */ -#define VROOT 0x000001 /* root of its file system */ -#define VTEXT 0x000002 /* vnode is a pure text prototype */ -#define VSYSTEM 0x000004 /* vnode being used by kernel */ -#define VISTTY 0x000008 /* vnode represents a tty */ -#define VWASMAPPED 0x000010 /* vnode was mapped before */ -#define VTERMINATE 0x000020 /* terminating memory object */ -#define VTERMWANT 0x000040 /* wating for memory object death */ -#define VMOUNT 0x000080 /* mount operation in progress */ -#define VXLOCK 0x000100 /* vnode is locked to change underlying type */ -#define VXWANT 0x000200 /* process is waiting for vnode */ -#define VBWAIT 0x000400 /* waiting for output to complete */ -#define VALIASED 0x000800 /* vnode has an alias */ -#define VORECLAIM 0x001000 /* vm object is being reclaimed */ -#define VNOCACHE_DATA 0x002000 /* don't keep data cached once it's been consumed */ -#define VSTANDARD 0x004000 /* vnode obtained from common pool */ -#define VAGE 0x008000 /* Insert vnode at head of free list */ -#define VRAOFF 0x010000 /* read ahead disabled */ -#define VUINIT 0x020000 /* ubc_info being initialized */ -#define VUWANT 0x040000 /* process is wating for VUINIT */ -#define VUINACTIVE 0x080000 /* UBC vnode is on inactive list */ -#define VHASDIRTY 0x100000 /* UBC vnode may have 1 or more */ - /* delayed dirty pages that need to be flushed at the next 'sync' */ -#define VSWAP 0x200000 /* vnode is being used as swapfile */ -#define VTHROTTLED 0x400000 /* writes or pageouts have been throttled */ - /* wakeup tasks waiting when count falls below threshold */ -#define VNOFLUSH 0x800000 /* don't vflush() if SKIPSYSTEM */ -#define VDELETED 0x1000000 /* this vnode is being deleted */ -#define VFULLFSYNC 0x2000000 /* ask the drive to write the data to the media */ -#define VHASBEENPAGED 0x4000000 /* vnode has been recently paged to */ +struct componentname { + /* + * Arguments to lookup. + */ + u_long cn_nameiop; /* lookup operation */ + u_long cn_flags; /* flags (see below) */ +#ifdef BSD_KERNEL_PRIVATE + vfs_context_t cn_context; + void * pad_obsolete2; + +/* XXX use of these defines are deprecated */ +#define cn_proc (cn_context->vc_proc + 0) /* non-lvalue */ +#define cn_cred (cn_context->vc_ucred + 0) /* non-lvalue */ + +#else + void * obsolete1; /* use vfs_context_t */ + void * obsolete2; /* use vfs_context_t */ +#endif + /* + * Shared between lookup and commit routines. 
+ */ + char *cn_pnbuf; /* pathname buffer */ + long cn_pnlen; /* length of allocated buffer */ + char *cn_nameptr; /* pointer to looked up name */ + long cn_namelen; /* length of looked up component */ + u_long cn_hash; /* hash value of looked up name */ + long cn_consume; /* chars to consume in lookup() */ +}; /* - * Vnode attributes. A field value of VNOVAL represents a field whose value - * is unavailable (getattr) or which is not to be changed (setattr). + * component name operations (for VNOP_LOOKUP) */ -struct vattr { - enum vtype va_type; /* vnode type (for create) */ - u_short va_mode; /* files access mode and type */ - short va_nlink; /* number of references to file */ - uid_t va_uid; /* owner user id */ - gid_t va_gid; /* owner group id */ - long va_fsid; /* file system id (dev for now) */ - long va_fileid; /* file id */ - u_quad_t va_size; /* file size in bytes */ - long va_blocksize; /* blocksize preferred for i/o */ - struct timespec va_atime; /* time of last access */ - struct timespec va_mtime; /* time of last modification */ - struct timespec va_ctime; /* time file changed */ - u_long va_gen; /* generation number of file */ - u_long va_flags; /* flags defined for file */ - dev_t va_rdev; /* device the special file represents */ - u_quad_t va_bytes; /* bytes of disk space held by file */ - u_quad_t va_filerev; /* file modification number */ - u_int va_vaflags; /* operations flags, see below */ - long va_spare; /* remain quad aligned */ +#define LOOKUP 0 /* perform name lookup only */ +#define CREATE 1 /* setup for file creation */ +#define DELETE 2 /* setup for file deletion */ +#define RENAME 3 /* setup for file renaming */ +#define OPMASK 3 /* mask for operation */ + +/* + * component name operational modifier flags + */ +#define FOLLOW 0x0040 /* follow symbolic links */ + +/* + * component name parameter descriptors. + */ +#define ISDOTDOT 0x002000 /* current component name is .. */ +#define MAKEENTRY 0x004000 /* entry is to be added to name cache */ +#define ISLASTCN 0x008000 /* this is last component of pathname */ +#define ISWHITEOUT 0x020000 /* found whiteout */ +#define DOWHITEOUT 0x040000 /* do whiteouts */ + + + +/* The following structure specifies a vnode for creation */ +struct vnode_fsparam { + struct mount * vnfs_mp; /* mount point to which this vnode_t is part of */ + enum vtype vnfs_vtype; /* vnode type */ + const char * vnfs_str; /* File system Debug aid */ + struct vnode * vnfs_dvp; /* The parent vnode */ + void * vnfs_fsnode; /* inode */ + int (**vnfs_vops)(void *); /* vnode dispatch table */ + int vnfs_markroot; /* is this a root vnode in FS (not a system wide one) */ + int vnfs_marksystem; /* is a system vnode */ + dev_t vnfs_rdev; /* dev_t for block or char vnodes */ + off_t vnfs_filesize; /* that way no need for getattr in UBC */ + struct componentname * vnfs_cnp; /* component name to add to namecache */ + uint32_t vnfs_flags; /* flags */ }; +#define VNFS_NOCACHE 0x01 /* do not add to name cache at this time */ +#define VNFS_CANTCACHE 0x02 /* never add this instance to the name cache */ + +#define VNCREATE_FLAVOR 0 +#define VCREATESIZE sizeof(struct vnode_fsparam) + /* - * Flags for va_vaflags. + * Vnode attributes, new-style. + * + * The vnode_attr structure is used to transact attribute changes and queries + * with the filesystem. + * + * Note that this structure may be extended, but existing fields must not move. 
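+ *
+ * (Editorial sketch of the calling convention, not part of the original
+ * patch: a consumer marks the attributes it wants, calls vnode_getattr(),
+ * and then checks what the filesystem actually returned:
+ *
+ *	struct vnode_attr va;
+ *
+ *	VATTR_INIT(&va);
+ *	VATTR_WANTED(&va, va_data_size);
+ *	error = vnode_getattr(vp, &va, ctx);
+ *	if (error == 0 && VATTR_IS_SUPPORTED(&va, va_data_size))
+ *		size = va.va_data_size;
+ *
+ * Filesystems answer with VATTR_RETURN(), which both stores the value
+ * and marks it supported.)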
*/ -#define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ -#define VA_EXCLUSIVE 0x02 /* exclusive create request */ + +#define VATTR_INIT(v) do {(v)->va_supported = (v)->va_active = 0ll; (v)->va_vaflags = 0;} while(0) +#define VATTR_SET_ACTIVE(v, a) ((v)->va_active |= VNODE_ATTR_ ## a) +#define VATTR_SET_SUPPORTED(v, a) ((v)->va_supported |= VNODE_ATTR_ ## a) +#define VATTR_IS_SUPPORTED(v, a) ((v)->va_supported & VNODE_ATTR_ ## a) +#define VATTR_CLEAR_ACTIVE(v, a) ((v)->va_active &= ~VNODE_ATTR_ ## a) +#define VATTR_CLEAR_SUPPORTED(v, a) ((v)->va_supported &= ~VNODE_ATTR_ ## a) +#define VATTR_IS_ACTIVE(v, a) ((v)->va_active & VNODE_ATTR_ ## a) +#define VATTR_ALL_SUPPORTED(v) (((v)->va_active & (v)->va_supported) == (v)->va_active) +#define VATTR_INACTIVE_SUPPORTED(v) do {(v)->va_active &= ~(v)->va_supported; (v)->va_supported = 0;} while(0) +#define VATTR_SET(v, a, x) do { (v)-> a = (x); VATTR_SET_ACTIVE(v, a);} while(0) +#define VATTR_WANTED(v, a) VATTR_SET_ACTIVE(v, a) +#define VATTR_RETURN(v, a, x) do { (v)-> a = (x); VATTR_SET_SUPPORTED(v, a);} while(0) +#define VATTR_NOT_RETURNED(v, a) (VATTR_IS_ACTIVE(v, a) && !VATTR_IS_SUPPORTED(v, a)) /* - * Flags for ioflag. + * Two macros to simplify conditional checking in kernel code. + */ +#define VATTR_IS(v, a, x) (VATTR_IS_SUPPORTED(v, a) && (v)-> a == (x)) +#define VATTR_IS_NOT(v, a, x) (VATTR_IS_SUPPORTED(v, a) && (v)-> a != (x)) + +#define VNODE_ATTR_va_rdev (1LL<< 0) /* 00000001 */ +#define VNODE_ATTR_va_nlink (1LL<< 1) /* 00000002 */ +#define VNODE_ATTR_va_total_size (1LL<< 2) /* 00000004 */ +#define VNODE_ATTR_va_total_alloc (1LL<< 3) /* 00000008 */ +#define VNODE_ATTR_va_data_size (1LL<< 4) /* 00000010 */ +#define VNODE_ATTR_va_data_alloc (1LL<< 5) /* 00000020 */ +#define VNODE_ATTR_va_iosize (1LL<< 6) /* 00000040 */ +#define VNODE_ATTR_va_uid (1LL<< 7) /* 00000080 */ +#define VNODE_ATTR_va_gid (1LL<< 8) /* 00000100 */ +#define VNODE_ATTR_va_mode (1LL<< 9) /* 00000200 */ +#define VNODE_ATTR_va_flags (1LL<<10) /* 00000400 */ +#define VNODE_ATTR_va_acl (1LL<<11) /* 00000800 */ +#define VNODE_ATTR_va_create_time (1LL<<12) /* 00001000 */ +#define VNODE_ATTR_va_access_time (1LL<<13) /* 00002000 */ +#define VNODE_ATTR_va_modify_time (1LL<<14) /* 00004000 */ +#define VNODE_ATTR_va_change_time (1LL<<15) /* 00008000 */ +#define VNODE_ATTR_va_backup_time (1LL<<16) /* 00010000 */ +#define VNODE_ATTR_va_fileid (1LL<<17) /* 00020000 */ +#define VNODE_ATTR_va_linkid (1LL<<18) /* 00040000 */ +#define VNODE_ATTR_va_parentid (1LL<<19) /* 00080000 */ +#define VNODE_ATTR_va_fsid (1LL<<20) /* 00100000 */ +#define VNODE_ATTR_va_filerev (1LL<<21) /* 00200000 */ +#define VNODE_ATTR_va_gen (1LL<<22) /* 00400000 */ +#define VNODE_ATTR_va_encoding (1LL<<23) /* 00800000 */ +#define VNODE_ATTR_va_type (1LL<<24) /* 01000000 */ +#define VNODE_ATTR_va_name (1LL<<25) /* 02000000 */ +#define VNODE_ATTR_va_uuuid (1LL<<26) /* 04000000 */ +#define VNODE_ATTR_va_guuid (1LL<<27) /* 08000000 */ +#define VNODE_ATTR_va_nchildren (1LL<<28) /* 10000000 */ + +#define VNODE_ATTR_BIT(n) (VNODE_ATTR_ ## n) +/* + * Read-only attributes. 
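+ * These may be reported by the filesystem but are not settable by
+ * callers; an attribute-change request that marks any of them active
+ * should be refused. (Editorial clarification, not part of the
+ * original patch.)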
+ */ +#define VNODE_ATTR_RDONLY (VNODE_ATTR_BIT(va_rdev) | \ + VNODE_ATTR_BIT(va_nlink) | \ + VNODE_ATTR_BIT(va_total_size) | \ + VNODE_ATTR_BIT(va_total_alloc) | \ + VNODE_ATTR_BIT(va_data_alloc) | \ + VNODE_ATTR_BIT(va_iosize) | \ + VNODE_ATTR_BIT(va_fileid) | \ + VNODE_ATTR_BIT(va_linkid) | \ + VNODE_ATTR_BIT(va_parentid) | \ + VNODE_ATTR_BIT(va_fsid) | \ + VNODE_ATTR_BIT(va_filerev) | \ + VNODE_ATTR_BIT(va_gen) | \ + VNODE_ATTR_BIT(va_name) | \ + VNODE_ATTR_BIT(va_type) | \ + VNODE_ATTR_BIT(va_nchildren)) +/* + * Attributes that can be applied to a new file object. */ -#define IO_UNIT 0x01 /* do I/O as atomic unit */ -#define IO_APPEND 0x02 /* append write to end */ -#define IO_SYNC 0x04 /* do I/O synchronously */ -#define IO_NODELOCKED 0x08 /* underlying node already locked */ -#define IO_NDELAY 0x10 /* FNDELAY flag set in file table */ -#define IO_NOZEROFILL 0x20 /* F_SETSIZE fcntl uses to prevent zero filling */ -#define IO_TAILZEROFILL 0x40 /* zero fills at the tail of write */ -#define IO_HEADZEROFILL 0x80 /* zero fills at the head of write */ -#define IO_NOZEROVALID 0x100 /* do not zero fill if valid page */ -#define IO_NOZERODIRTY 0x200 /* do not zero fill if page is dirty */ +#define VNODE_ATTR_NEWOBJ (VNODE_ATTR_BIT(va_rdev) | \ + VNODE_ATTR_BIT(va_uid) | \ + VNODE_ATTR_BIT(va_gid) | \ + VNODE_ATTR_BIT(va_mode) | \ + VNODE_ATTR_BIT(va_flags) | \ + VNODE_ATTR_BIT(va_acl) | \ + VNODE_ATTR_BIT(va_create_time) | \ + VNODE_ATTR_BIT(va_modify_time) | \ + VNODE_ATTR_BIT(va_change_time) | \ + VNODE_ATTR_BIT(va_encoding) | \ + VNODE_ATTR_BIT(va_type) | \ + VNODE_ATTR_BIT(va_uuuid) | \ + VNODE_ATTR_BIT(va_guuid)) + +struct vnode_attr { + /* bitfields */ + uint64_t va_supported; + uint64_t va_active; + + /* + * Control flags. The low 16 bits are reserved for the + * ioflags being passed for truncation operations. + */ + int va_vaflags; + + /* traditional stat(2) parameter fields */ + dev_t va_rdev; /* device id (device nodes only) */ + uint64_t va_nlink; /* number of references to this file */ + uint64_t va_total_size; /* size in bytes of all forks */ + uint64_t va_total_alloc; /* disk space used by all forks */ + uint64_t va_data_size; /* size in bytes of the main(data) fork */ + uint64_t va_data_alloc; /* disk space used by the main(data) fork */ + uint32_t va_iosize; /* optimal I/O blocksize */ + + /* file security information */ + uid_t va_uid; /* owner UID */ + gid_t va_gid; /* owner GID */ + mode_t va_mode; /* posix permissions */ + uint32_t va_flags; /* file flags */ + struct kauth_acl *va_acl; /* access control list */ + + /* timestamps */ + struct timespec va_create_time; /* time of creation */ + struct timespec va_access_time; /* time of last access */ + struct timespec va_modify_time; /* time of last data modification */ + struct timespec va_change_time; /* time of last metadata change */ + struct timespec va_backup_time; /* time of last backup */ + + /* file parameters */ + uint64_t va_fileid; /* file unique ID in filesystem */ + uint64_t va_linkid; /* file link unique ID */ + uint64_t va_parentid; /* parent ID */ + uint32_t va_fsid; /* filesystem ID */ + uint64_t va_filerev; /* file revision counter */ /* XXX */ + uint32_t va_gen; /* file generation count */ /* XXX - relationship of + * these two? 
*/ + /* misc parameters */ + uint32_t va_encoding; /* filename encoding script */ + + enum vtype va_type; /* file type (create only) */ + char * va_name; /* Name for ATTR_CMN_NAME; MAXPATHLEN bytes */ + guid_t va_uuuid; /* file owner UUID */ + guid_t va_guuid; /* file group UUID */ + + uint64_t va_nchildren; /* Number of items in a directory */ + /* Meaningful for directories only */ + + /* add new fields here only */ +}; /* - * Modes. Some values same as Ixxx entries from inode.h for now. + * Flags for va_vaflags. */ -#define VSUID 04000 /* set user id on execution */ -#define VSGID 02000 /* set group id on execution */ -#define VSVTX 01000 /* save swapped text even after use */ -#define VREAD 00400 /* read, write, execute permissions */ -#define VWRITE 00200 -#define VEXEC 00100 +#define VA_UTIMES_NULL 0x010000 /* utimes argument was NULL */ +#define VA_EXCLUSIVE 0x020000 /* exclusive create request */ + + /* - * Token indicating no attribute value yet assigned. + * Modes. Some values same as Ixxx entries from inode.h for now. */ -#define VNOVAL (-1) +#define VSUID 0x800 /*04000*/ /* set user id on execution */ +#define VSGID 0x400 /*02000*/ /* set group id on execution */ +#define VSVTX 0x200 /*01000*/ /* save swapped text even after use */ +#define VREAD 0x100 /*00400*/ /* read, write, execute permissions */ +#define VWRITE 0x080 /*00200*/ +#define VEXEC 0x040 /*00100*/ + -#ifdef KERNEL /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). @@ -277,6 +392,7 @@ extern int vttoif_tab[]; #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) + /* * Flags to various vnode functions. */ @@ -284,146 +400,37 @@ extern int vttoif_tab[]; #define FORCECLOSE 0x0002 /* vflush: force file closeure */ #define WRITECLOSE 0x0004 /* vflush: only close writeable files */ #define SKIPSWAP 0x0008 /* vflush: skip vnodes marked VSWAP */ +#define SKIPROOT 0x0010 /* vflush: skip root vnodes marked VROOT */ #define DOCLOSE 0x0008 /* vclean: close active files */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define V_SAVEMETA 0x0002 /* vinvalbuf: leave indirect blocks */ -#define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ +#define REVOKEALL 0x0001 /* vnop_revoke: revoke all aliases */ -/* flags for vop_allocate */ -#define PREALLOCATE 0x00000001 /* preallocate allocation blocks */ -#define ALLOCATECONTIG 0x00000002 /* allocate contigious space */ -#define ALLOCATEALL 0x00000004 /* allocate all requested space */ - /* or no space at all */ -#define FREEREMAINDER 0x00000008 /* deallocate allocated but */ - /* unfilled blocks */ -#define ALLOCATEFROMPEOF 0x00000010 /* allocate from the physical eof */ -#define ALLOCATEFROMVOL 0x00000020 /* allocate from the volume offset */ +/* VNOP_REMOVE: do not delete busy files (Carbon remove file semantics) */ +#define VNODE_REMOVE_NODELETEBUSY 0x0001 -#if DIAGNOSTIC -#define VATTR_NULL(vap) vattr_null(vap) -#define HOLDRELE(vp) holdrele(vp) -#define VHOLD(vp) vhold(vp) +/* VNOP_READDIR flags: */ +#define VNODE_READDIR_EXTENDED 0x0001 /* use extended directory entries */ +#define VNODE_READDIR_REQSEEKOFF 0x0002 /* requires seek offset (cookies) */ -void holdrele __P((struct vnode *)); -void vattr_null __P((struct vattr *)); -void vhold __P((struct vnode *)); -#else -#define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ -#define HOLDRELE(vp) holdrele(vp) /* decrease buf or page ref */ -extern __inline void 
holdrele(struct vnode *vp) -{ - simple_lock(&vp->v_interlock); - vp->v_holdcnt--; - simple_unlock(&vp->v_interlock); -} -#define VHOLD(vp) vhold(vp) /* increase buf or page ref */ -extern __inline void vhold(struct vnode *vp) -{ - simple_lock(&vp->v_interlock); - if (++vp->v_holdcnt <= 0) - panic("vhold: v_holdcnt"); - simple_unlock(&vp->v_interlock); -} -#endif /* DIAGNOSTIC */ - -#define VREF(vp) vref(vp) -void vref __P((struct vnode *)); -#define NULLVP ((struct vnode *)NULL) -/* - * Global vnode data. - */ -extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ -extern int desiredvnodes; /* number of vnodes desired */ -extern struct vattr va_null; /* predefined null vattr structure */ +#define NULLVP ((struct vnode *)NULL) /* * Macro/function to check for client cache inconsistency w.r.t. leasing. */ #define LEASE_READ 0x1 /* Check lease for readers */ #define LEASE_WRITE 0x2 /* Check lease for modifiers */ -#endif /* KERNEL */ - -/* - * Mods for exensibility. - */ - -/* - * Flags for vdesc_flags: - */ -#define VDESC_MAX_VPS 16 -/* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ -#define VDESC_VP0_WILLRELE 0x0001 -#define VDESC_VP1_WILLRELE 0x0002 -#define VDESC_VP2_WILLRELE 0x0004 -#define VDESC_VP3_WILLRELE 0x0008 -#define VDESC_NOMAP_VPP 0x0100 -#define VDESC_VPP_WILLRELE 0x0200 -/* - * VDESC_NO_OFFSET is used to identify the end of the offset list - * and in places where no such field exists. - */ -#define VDESC_NO_OFFSET -1 -/* - * This structure describes the vnode operation taking place. - */ -struct vnodeop_desc { - int vdesc_offset; /* offset in vector--first for speed */ - char *vdesc_name; /* a readable name for debugging */ - int vdesc_flags; /* VDESC_* flags */ - - /* - * These ops are used by bypass routines to map and locate arguments. - * Creds and procs are not needed in bypass routines, but sometimes - * they are useful to (for example) transport layers. - * Nameidata is useful because it has a cred in it. - */ - int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ - int vdesc_vpp_offset; /* return vpp location */ - int vdesc_cred_offset; /* cred location, if any */ - int vdesc_proc_offset; /* proc location, if any */ - int vdesc_componentname_offset; /* if any */ - /* - * Finally, we've got a list of private data (about each operation) - * for each transport layer. (Support to manage this list is not - * yet part of BSD.) - */ - caddr_t *vdesc_transports; -}; +#ifndef BSD_KERNEL_PRIVATE +struct vnodeop_desc; +#endif -#endif /* __APPLE_API_PRIVATE */ - -#ifdef KERNEL - -#ifdef __APPLE_API_PRIVATE -/* - * A list of all the operation descs. - */ -extern struct vnodeop_desc *vnodeop_descs[]; - -/* - * Interlock for scanning list of vnodes attached to a mountpoint - */ -extern struct slock mntvnode_slock; - -/* - * This macro is very helpful in defining those offsets in the vdesc struct. - * - * This is stolen from X11R4. I ingored all the fancy stuff for - * Crays, so if you decide to port this to such a serious machine, - * you might want to consult Intrisics.h's XtOffset{,Of,To}. - */ -#define VOPARG_OFFSET(p_type,field) \ - ((int) (((char *) (&(((p_type)NULL)->field))) - ((char *) NULL))) -#define VOPARG_OFFSETOF(s_type,field) \ - VOPARG_OFFSET(s_type*,field) -#define VOPARG_OFFSETTO(S_TYPE,S_OFFSET,STRUCT_P) \ - ((S_TYPE)(((char*)(STRUCT_P))+(S_OFFSET))) +extern int desiredvnodes; /* number of vnodes desired */ /* @@ -442,94 +449,195 @@ struct vnodeopv_desc { /* * A default routine which just returns an error. 
*/
-int vn_default_error __P((void));
+int vn_default_error(void);
/*
 * A generic structure.
 * This can be used by bypass routines to identify generic arguments.
 */
-struct vop_generic_args {
+struct vnop_generic_args {
	struct vnodeop_desc *a_desc;
	/* other random data follows, presumably */
};
+#ifndef _KAUTH_ACTION_T
+typedef int kauth_action_t;
+# define _KAUTH_ACTION_T
+#endif
+
+#include <sys/vnode_if.h>
+
+__BEGIN_DECLS
+
+errno_t vnode_create(int, size_t, void *, vnode_t *);
+int vnode_addfsref(vnode_t);
+int vnode_removefsref(vnode_t);
+
+int vnode_hasdirtyblks(vnode_t);
+int vnode_hascleanblks(vnode_t);
+#define VNODE_ASYNC_THROTTLE 18
+/* timeout is in units of 10 msec, not hz ticks */
+int vnode_waitforwrites(vnode_t, int, int, int, char *);
+void vnode_startwrite(vnode_t);
+void vnode_writedone(vnode_t);
+
+enum vtype vnode_vtype(vnode_t);
+uint32_t vnode_vid(vnode_t);
+mount_t vnode_mountedhere(vnode_t vp);
+mount_t vnode_mount(vnode_t);
+dev_t vnode_specrdev(vnode_t);
+void * vnode_fsnode(vnode_t);
+void vnode_clearfsnode(vnode_t);
+
+int vnode_isvroot(vnode_t);
+int vnode_issystem(vnode_t);
+int vnode_ismount(vnode_t);
+int vnode_isreg(vnode_t);
+int vnode_isdir(vnode_t);
+int vnode_islnk(vnode_t);
+int vnode_isfifo(vnode_t);
+int vnode_isblk(vnode_t);
+int vnode_ischr(vnode_t);
+
+int vnode_ismountedon(vnode_t);
+void vnode_setmountedon(vnode_t);
+void vnode_clearmountedon(vnode_t);
+
+int vnode_isnocache(vnode_t);
+void vnode_setnocache(vnode_t);
+void vnode_clearnocache(vnode_t);
+int vnode_isnoreadahead(vnode_t);
+void vnode_setnoreadahead(vnode_t);
+void vnode_clearnoreadahead(vnode_t);
+/* left only for compatibility; user code (e.g. getattrlist consumers) depends on the tag */
+void vnode_settag(vnode_t, int);
+int vnode_tag(vnode_t);
+int vnode_getattr(vnode_t vp, struct vnode_attr *vap, vfs_context_t ctx);
+int vnode_setattr(vnode_t vp, struct vnode_attr *vap, vfs_context_t ctx);
+
+#ifdef BSD_KERNEL_PRIVATE
+
/*
- * VOCALL calls an op given an ops vector. We break it out because BSD's
- * vclean changes the ops vector and then wants to call ops with the old
- * vector.
+ * Indicate that a file has multiple hard links. VFS will always call
+ * VNOP_LOOKUP on this vnode. Volfs will always ask for its parent
+ * object ID (instead of using the v_parent pointer).
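+ *
+ * Illustrative sketch (hypothetical names, not part of the original
+ * note): a filesystem that finds an on-disk link count above one while
+ * setting up a vnode might flag it as follows:
+ *
+ *	if (ip->i_nlink > 1)		/* ip: hypothetical inode */
+ *		vnode_set_hard_link(vp);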
*/ -#define VOCALL(OPSV,OFF,AP) (( *((OPSV)[(OFF)])) (AP)) +void vnode_set_hard_link(vnode_t vp); + +vnode_t vnode_parent(vnode_t); +void vnode_setparent(vnode_t, vnode_t); +char * vnode_name(vnode_t); +void vnode_setname(vnode_t, char *); +int vnode_isnoflush(vnode_t); +void vnode_setnoflush(vnode_t); +void vnode_clearnoflush(vnode_t); +#endif + +uint32_t vnode_vfsmaxsymlen(vnode_t); +int vnode_vfsisrdonly(vnode_t); +int vnode_vfstypenum(vnode_t); +void vnode_vfsname(vnode_t, char *); +int vnode_vfs64bitready(vnode_t); + +proc_t vfs_context_proc(vfs_context_t); +ucred_t vfs_context_ucred(vfs_context_t); +int vfs_context_issuser(vfs_context_t); +int vfs_context_pid(vfs_context_t); +int vfs_context_issignal(vfs_context_t, sigset_t); +int vfs_context_suser(vfs_context_t); +int vfs_context_is64bit(vfs_context_t); +vfs_context_t vfs_context_create(vfs_context_t); +int vfs_context_rele(vfs_context_t); + + +int vflush(struct mount *mp, struct vnode *skipvp, int flags); +int vnode_get(vnode_t); +int vnode_getwithvid(vnode_t, int); +int vnode_put(vnode_t); +int vnode_ref(vnode_t); +void vnode_rele(vnode_t); +int vnode_isinuse(vnode_t, int); +void vnode_lock(vnode_t); +void vnode_unlock(vnode_t); +int vnode_recycle(vnode_t); +void vnode_reclaim(vnode_t); + +#define VNODE_UPDATE_PARENT 0x01 +#define VNODE_UPDATE_NAME 0x02 +#define VNODE_UPDATE_CACHE 0x04 +void vnode_update_identity(vnode_t vp, vnode_t dvp, char *name, int name_len, int name_hashval, int flags); + +int vn_bwrite(struct vnop_bwrite_args *ap); + +int vnode_authorize(vnode_t /*vp*/, vnode_t /*dvp*/, kauth_action_t, vfs_context_t); +int vnode_authattr(vnode_t, struct vnode_attr *, kauth_action_t *, vfs_context_t); +int vnode_authattr_new(vnode_t /*dvp*/, struct vnode_attr *, int /*noauth*/, vfs_context_t); +errno_t vnode_close(vnode_t, int, vfs_context_t); + +int vn_getpath(struct vnode *vp, char *pathbuf, int *len); /* - * This call works for vnodes in the kernel. + * Flags for the vnode_lookup and vnode_open */ -#define VCALL(VP,OFF,AP) VOCALL((VP)->v_op,(OFF),(AP)) -#define VDESC(OP) (& __CONCAT(OP,_desc)) -#define VOFFSET(OP) (VDESC(OP)->vdesc_offset) +#define VNODE_LOOKUP_NOFOLLOW 0x01 +#define VNODE_LOOKUP_NOCROSSMOUNT 0x02 +#define VNODE_LOOKUP_DOWHITEOUT 0x04 -#endif /* __APPLE_API_PRIVATE */ +errno_t vnode_lookup(const char *, int, vnode_t *, vfs_context_t); +errno_t vnode_open(const char *, int, int, int, vnode_t *, vfs_context_t); /* - * Finally, include the default set of vnode operations. + * exported vnode operations */ -#include <sys/vnode_if.h> +int vnode_iterate(struct mount *, int, int (*)(struct vnode *, void *), void *); /* - * vnode manipulation functions. 
+ * flags passed into vnode_iterate */ -struct file; -struct mount; -struct nameidata; -struct ostat; -struct proc; +#define VNODE_RELOAD 0x01 +#define VNODE_WAIT 0x02 +#define VNODE_WRITEABLE 0x04 +#define VNODE_WITHID 0x08 +#define VNODE_NOLOCK_INTERNAL 0x10 +#define VNODE_NODEAD 0x20 +#define VNODE_NOSUSPEND 0x40 +#define VNODE_ITERATE_ALL 0x80 +#define VNODE_ITERATE_ACTIVE 0x100 +#define VNODE_ITERATE_INACTIVE 0x200 + +/* + * return values from callback + */ +#define VNODE_RETURNED 0 /* done with vnode, reference can be dropped */ +#define VNODE_RETURNED_DONE 1 /* done with vnode, reference can be dropped, terminate iteration */ +#define VNODE_CLAIMED 2 /* don't drop reference */ +#define VNODE_CLAIMED_DONE 3 /* don't drop reference, terminate iteration */ + + struct stat; -struct ucred; -struct uio; -struct vattr; -struct vnode; -struct vop_bwrite_args; - -#ifdef __APPLE_API_EVOLVING -int bdevvp __P((dev_t dev, struct vnode **vpp)); -void cvtstat __P((struct stat *st, struct ostat *ost)); -int getnewvnode __P((enum vtagtype tag, - struct mount *mp, int (**vops)(void *), struct vnode **vpp)); -void insmntque __P((struct vnode *vp, struct mount *mp)); -void vattr_null __P((struct vattr *vap)); -int vcount __P((struct vnode *vp)); -int vflush __P((struct mount *mp, struct vnode *skipvp, int flags)); -int vget __P((struct vnode *vp, int lockflag, struct proc *p)); -void vgone __P((struct vnode *vp)); -int vinvalbuf __P((struct vnode *vp, int save, struct ucred *cred, - struct proc *p, int slpflag, int slptimeo)); -void vprint __P((char *label, struct vnode *vp)); -int vrecycle __P((struct vnode *vp, struct slock *inter_lkp, - struct proc *p)); -int vn_bwrite __P((struct vop_bwrite_args *ap)); -int vn_close __P((struct vnode *vp, - int flags, struct ucred *cred, struct proc *p)); -int vn_lock __P((struct vnode *vp, int flags, struct proc *p)); -int vn_open __P((struct nameidata *ndp, int fmode, int cmode)); -#ifndef __APPLE_API_PRIVATE -__private_extern__ int - vn_open_modflags __P((struct nameidata *ndp, int *fmode, int cmode)); -#endif /* __APPLE_API_PRIVATE */ -int vn_rdwr __P((enum uio_rw rw, struct vnode *vp, caddr_t base, - int len, off_t offset, enum uio_seg segflg, int ioflg, - struct ucred *cred, int *aresid, struct proc *p)); -int vn_stat __P((struct vnode *vp, struct stat *sb, struct proc *p)); -int vop_noislocked __P((struct vop_islocked_args *)); -int vop_nolock __P((struct vop_lock_args *)); -int vop_nounlock __P((struct vop_unlock_args *)); -int vop_revoke __P((struct vop_revoke_args *)); -struct vnode * - checkalias __P((struct vnode *vp, dev_t nvp_rdev, struct mount *mp)); -void vput __P((struct vnode *vp)); -void vrele __P((struct vnode *vp)); -int vaccess __P((mode_t file_mode, uid_t uid, gid_t gid, - mode_t acc_mode, struct ucred *cred)); -int getvnode __P((struct proc *p, int fd, struct file **fpp)); -#endif /* __APPLE_API_EVOLVING */ +int vn_stat(struct vnode *vp, struct stat *sb, kauth_filesec_t *xsec, vfs_context_t ctx); +int vn_stat_noauth(struct vnode *vp, struct stat *sb, kauth_filesec_t *xsec, vfs_context_t ctx); +int vn_revoke(vnode_t vp, int flags, vfs_context_t); +/* XXX BOGUS */ +int vaccess(mode_t file_mode, uid_t uid, gid_t gid, + mode_t acc_mode, struct ucred *cred); + + +/* namecache function prototypes */ +int cache_lookup(vnode_t dvp, vnode_t *vpp, struct componentname *cnp); +void cache_enter(vnode_t dvp, vnode_t vp, struct componentname *cnp); +void cache_purge(vnode_t vp); +void cache_purge_negatives(vnode_t vp); + +/* + * Global string-cache 
routines. You can pass zero for nc_hash + * if you don't know it (add_name() will then compute the hash). + * There are no flags for now but maybe someday. + */ +char *vfs_addname(const char *name, size_t len, u_int nc_hash, u_int flags); +int vfs_removename(const char *name); + +__END_DECLS #endif /* KERNEL */ diff --git a/bsd/sys/vnode_if.h b/bsd/sys/vnode_if.h index abfc5beb5..aa1201b5f 100644 --- a/bsd/sys/vnode_if.h +++ b/bsd/sys/vnode_if.h @@ -68,685 +68,507 @@ #define _SYS_VNODE_IF_H_ #include <sys/appleapiopts.h> - -#ifdef __APPLE_API_UNSTABLE -extern struct vnodeop_desc vop_default_desc; - - -struct vop_lookup_args { - struct vnodeop_desc *a_desc; - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; -}; -extern struct vnodeop_desc vop_lookup_desc; -#define VOP_LOOKUP(dvp, vpp, cnp) _VOP_LOOKUP(dvp, vpp, cnp) -static __inline int _VOP_LOOKUP(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) -{ - struct vop_lookup_args a; - a.a_desc = VDESC(vop_lookup); - a.a_dvp = dvp; - a.a_vpp = vpp; - a.a_cnp = cnp; - return (VCALL(dvp, VOFFSET(vop_lookup), &a)); -} - -struct vop_cachedlookup_args { +#include <sys/cdefs.h> +#include <sys/kernel_types.h> +#include <sys/buf.h> +#ifdef BSD_KERNEL_PRIVATE +#include <sys/vm.h> +#endif +#include <mach/memory_object_types.h> + + +#ifdef KERNEL + +extern struct vnodeop_desc vnop_default_desc; +extern struct vnodeop_desc vnop_lookup_desc; +extern struct vnodeop_desc vnop_create_desc; +extern struct vnodeop_desc vnop_whiteout_desc; +extern struct vnodeop_desc vnop_mknod_desc; +extern struct vnodeop_desc vnop_open_desc; +extern struct vnodeop_desc vnop_close_desc; +extern struct vnodeop_desc vnop_access_desc; +extern struct vnodeop_desc vnop_getattr_desc; +extern struct vnodeop_desc vnop_setattr_desc; +extern struct vnodeop_desc vnop_getattrlist_desc; +extern struct vnodeop_desc vnop_setattrlist_desc; +extern struct vnodeop_desc vnop_read_desc; +extern struct vnodeop_desc vnop_write_desc; +extern struct vnodeop_desc vnop_ioctl_desc; +extern struct vnodeop_desc vnop_select_desc; +extern struct vnodeop_desc vnop_exchange_desc; +extern struct vnodeop_desc vnop_revoke_desc; +extern struct vnodeop_desc vnop_mmap_desc; +extern struct vnodeop_desc vnop_mnomap_desc; +extern struct vnodeop_desc vnop_fsync_desc; +extern struct vnodeop_desc vnop_remove_desc; +extern struct vnodeop_desc vnop_link_desc; +extern struct vnodeop_desc vnop_rename_desc; +extern struct vnodeop_desc vnop_mkdir_desc; +extern struct vnodeop_desc vnop_rmdir_desc; +extern struct vnodeop_desc vnop_symlink_desc; +extern struct vnodeop_desc vnop_readdir_desc; +extern struct vnodeop_desc vnop_readdirattr_desc; +extern struct vnodeop_desc vnop_readlink_desc; +extern struct vnodeop_desc vnop_inactive_desc; +extern struct vnodeop_desc vnop_reclaim_desc; +extern struct vnodeop_desc vnop_print_desc; +extern struct vnodeop_desc vnop_pathconf_desc; +extern struct vnodeop_desc vnop_advlock_desc; +extern struct vnodeop_desc vnop_truncate_desc; +extern struct vnodeop_desc vnop_allocate_desc; +extern struct vnodeop_desc vnop_pagein_desc; +extern struct vnodeop_desc vnop_pageout_desc; +extern struct vnodeop_desc vnop_devblocksize_desc; +extern struct vnodeop_desc vnop_searchfs_desc; +extern struct vnodeop_desc vnop_copyfile_desc; +extern struct vnodeop_desc vnop_blktooff_desc; +extern struct vnodeop_desc vnop_offtoblk_desc; +extern struct vnodeop_desc vnop_blockmap_desc; +extern struct vnodeop_desc vnop_strategy_desc; +extern struct vnodeop_desc vnop_bwrite_desc; + 
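Taken together, these descriptors are what a filesystem plugs its own handlers into when it registers an operations vector (the struct vnodeopv_entry_desc / struct vnodeopv_desc pairing declared in vnode.h). A minimal sketch of that pattern follows; the myfs_* handler names and the local VOPFUNC typedef are hypothetical, shown only to illustrate the shape of the table:

	/* Hypothetical vnode operations vector for a filesystem "myfs". */
	typedef int (*VOPFUNC)(void *);	/* illustration only; xnu supplies an equivalent cast type */

	int myfs_lookup(struct vnop_lookup_args *);	/* hypothetical handlers */
	int myfs_open(struct vnop_open_args *);
	int myfs_close(struct vnop_close_args *);

	int (**myfs_vnodeop_p)(void *);	/* filled in when the vector is registered */

	struct vnodeopv_entry_desc myfs_vnodeop_entries[] = {
		{ &vnop_default_desc, (VOPFUNC)vn_default_error },	/* fallback for unimplemented ops */
		{ &vnop_lookup_desc, (VOPFUNC)myfs_lookup },
		{ &vnop_open_desc, (VOPFUNC)myfs_open },
		{ &vnop_close_desc, (VOPFUNC)myfs_close },
		{ (struct vnodeop_desc *)NULL, (VOPFUNC)NULL }	/* table terminator */
	};

	struct vnodeopv_desc myfs_vnodeop_opv_desc =
		{ &myfs_vnodeop_p, myfs_vnodeop_entries };

Operations left out of such a table fall through to vn_default_error, which just returns an error.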
+__BEGIN_DECLS +/* + *# + *#% lookup dvp L ? ? + *#% lookup vpp - L - + */ +struct vnop_lookup_args { struct vnodeop_desc *a_desc; - struct vnode *a_dvp; - struct vnode **a_vpp; + vnode_t a_dvp; + vnode_t *a_vpp; struct componentname *a_cnp; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_cachedlookup_desc; -#define VOP_CACHEDLOOKUP(dvp, vpp, cnp) _VOP_CACHEDLOOKUP(dvp, vpp, cnp) -static __inline int _VOP_CACHEDLOOKUP(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) -{ - struct vop_cachedlookup_args a; - a.a_desc = VDESC(vop_cachedlookup); - a.a_dvp = dvp; - a.a_vpp = vpp; - a.a_cnp = cnp; - return (VCALL(dvp, VOFFSET(vop_cachedlookup), &a)); -} - -struct vop_create_args { +extern errno_t VNOP_LOOKUP(vnode_t, vnode_t *, struct componentname *, vfs_context_t); + + +/* + *# + *#% create dvp L L L + *#% create vpp - L - + *# + */ + +struct vnop_create_args { struct vnodeop_desc *a_desc; - struct vnode *a_dvp; - struct vnode **a_vpp; + vnode_t a_dvp; + vnode_t *a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; + struct vnode_attr *a_vap; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_create_desc; -#define VOP_CREATE(dvp, vpp, cnp, vap) _VOP_CREATE(dvp, vpp, cnp, vap) -static __inline int _VOP_CREATE(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap) -{ - struct vop_create_args a; - a.a_desc = VDESC(vop_create); - a.a_dvp = dvp; - a.a_vpp = vpp; - a.a_cnp = cnp; - a.a_vap = vap; - return (VCALL(dvp, VOFFSET(vop_create), &a)); -} - -struct vop_whiteout_args { +extern errno_t VNOP_CREATE(vnode_t, vnode_t *, struct componentname *, struct vnode_attr *, vfs_context_t); + +/* + *# + *#% whiteout dvp L L L + *#% whiteout cnp - - - + *#% whiteout flag - - - + *# + */ +struct vnop_whiteout_args { struct vnodeop_desc *a_desc; - struct vnode *a_dvp; + vnode_t a_dvp; struct componentname *a_cnp; int a_flags; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_whiteout_desc; -#define VOP_WHITEOUT(dvp, cnp, flags) _VOP_WHITEOUT(dvp, cnp, flags) -static __inline int _VOP_WHITEOUT(struct vnode *dvp, struct componentname *cnp, int flags) -{ - struct vop_whiteout_args a; - a.a_desc = VDESC(vop_whiteout); - a.a_dvp = dvp; - a.a_cnp = cnp; - a.a_flags = flags; - return (VCALL(dvp, VOFFSET(vop_whiteout), &a)); -} - -struct vop_mknod_args { - struct vnodeop_desc *a_desc; - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vattr *a_vap; -}; -extern struct vnodeop_desc vop_mknod_desc; -#define VOP_MKNOD(dvp, vpp, cnp, vap) _VOP_MKNOD(dvp, vpp, cnp, vap) -static __inline int _VOP_MKNOD(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap) -{ - struct vop_mknod_args a; - a.a_desc = VDESC(vop_mknod); - a.a_dvp = dvp; - a.a_vpp = vpp; - a.a_cnp = cnp; - a.a_vap = vap; - return (VCALL(dvp, VOFFSET(vop_mknod), &a)); -} - -struct vop_mkcomplex_args { - struct vnodeop_desc *a_desc; - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vattr *a_vap; - u_long a_type; +extern errno_t VNOP_WHITEOUT(vnode_t, struct componentname *, int, vfs_context_t); + +/* + *# + *#% mknod dvp L U U + *#% mknod vpp - X - + *# + */ +struct vnop_mknod_args { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; + struct componentname *a_cnp; + struct vnode_attr *a_vap; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_mkcomplex_desc; -#define VOP_MKCOMPLEX(dvp, vpp, cnp, vap, type) _VOP_MKCOMPLEX(dvp, vpp, cnp, vap, type) 
-static __inline int _VOP_MKCOMPLEX(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap, u_long type) -{ - struct vop_mkcomplex_args a; - a.a_desc = VDESC(vop_mkcomplex); - a.a_dvp = dvp; - a.a_vpp = vpp; - a.a_cnp = cnp; - a.a_vap = vap; - a.a_type = type; - return (VCALL(dvp, VOFFSET(vop_mkcomplex), &a)); -} - -struct vop_open_args { +extern errno_t VNOP_MKNOD(vnode_t, vnode_t *, struct componentname *, struct vnode_attr *, vfs_context_t); + +/* + *# + *#% open vp L L L + *# + */ +struct vnop_open_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; int a_mode; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_open_desc; -#define VOP_OPEN(vp, mode, cred, p) _VOP_OPEN(vp, mode, cred, p) -static __inline int _VOP_OPEN(struct vnode *vp, int mode, struct ucred *cred, struct proc *p) -{ - struct vop_open_args a; - a.a_desc = VDESC(vop_open); - a.a_vp = vp; - a.a_mode = mode; - a.a_cred = cred; - a.a_p = p; - return (VCALL(vp, VOFFSET(vop_open), &a)); -} - -struct vop_close_args { +extern errno_t VNOP_OPEN(vnode_t, int, vfs_context_t); + +/* + *# + *#% close vp U U U + *# + */ +struct vnop_close_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_close_desc; -#define VOP_CLOSE(vp, fflag, cred, p) _VOP_CLOSE(vp, fflag, cred, p) -static __inline int _VOP_CLOSE(struct vnode *vp, int fflag, struct ucred *cred, struct proc *p) -{ - struct vop_close_args a; - a.a_desc = VDESC(vop_close); - a.a_vp = vp; - a.a_fflag = fflag; - a.a_cred = cred; - a.a_p = p; - return (VCALL(vp, VOFFSET(vop_close), &a)); -} - -struct vop_access_args { +extern errno_t VNOP_CLOSE(vnode_t, int, vfs_context_t); + +/* + *# + *#% access vp L L L + *# + */ +struct vnop_access_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; - int a_mode; - struct ucred *a_cred; - struct proc *a_p; + vnode_t a_vp; + int a_action; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_access_desc; -#define VOP_ACCESS(vp, mode, cred, p) _VOP_ACCESS(vp, mode, cred, p) -static __inline int _VOP_ACCESS(struct vnode *vp, int mode, struct ucred *cred, struct proc *p) -{ - struct vop_access_args a; - a.a_desc = VDESC(vop_access); - a.a_vp = vp; - a.a_mode = mode; - a.a_cred = cred; - a.a_p = p; - return (VCALL(vp, VOFFSET(vop_access), &a)); -} - -struct vop_getattr_args { +extern errno_t VNOP_ACCESS(vnode_t, int, vfs_context_t); + + +/* + *# + *#% getattr vp = = = + *# + */ +struct vnop_getattr_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct proc *a_p; + vnode_t a_vp; + struct vnode_attr *a_vap; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_getattr_desc; -#define VOP_GETATTR(vp, vap, cred, p) _VOP_GETATTR(vp, vap, cred, p) -static __inline int _VOP_GETATTR(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct proc *p) -{ - struct vop_getattr_args a; - a.a_desc = VDESC(vop_getattr); - a.a_vp = vp; - a.a_vap = vap; - a.a_cred = cred; - a.a_p = p; - return (VCALL(vp, VOFFSET(vop_getattr), &a)); -} - -struct vop_setattr_args { +extern errno_t VNOP_GETATTR(vnode_t, struct vnode_attr *, vfs_context_t); + +/* + *# + *#% setattr vp L L L + *# + */ +struct vnop_setattr_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct proc *a_p; + vnode_t a_vp; + 
struct vnode_attr *a_vap; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_setattr_desc; -#define VOP_SETATTR(vp, vap, cred, p) _VOP_SETATTR(vp, vap, cred, p) -static __inline int _VOP_SETATTR(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct proc *p) -{ - struct vop_setattr_args a; - a.a_desc = VDESC(vop_setattr); - a.a_vp = vp; - a.a_vap = vap; - a.a_cred = cred; - a.a_p = p; - return (VCALL(vp, VOFFSET(vop_setattr), &a)); -} - -struct vop_getattrlist_args { +extern errno_t VNOP_SETATTR(vnode_t, struct vnode_attr *, vfs_context_t); + +/* + *# + *#% getattrlist vp = = = + *# + */ +struct vnop_getattrlist_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; struct attrlist *a_alist; struct uio *a_uio; - struct ucred *a_cred; - struct proc *a_p; + int a_options; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_getattrlist_desc; -#define VOP_GETATTRLIST(vp, alist, uio, cred, p) _VOP_GETATTRLIST(vp, alist, uio, cred, p) -static __inline int _VOP_GETATTRLIST(struct vnode *vp, struct attrlist *alist, struct uio *uio, struct ucred *cred, struct proc *p) -{ - struct vop_getattrlist_args a; - a.a_desc = VDESC(vop_getattrlist); - a.a_vp = vp; - a.a_alist = alist; - a.a_uio = uio; - a.a_cred = cred; - a.a_p = p; - return (VCALL(vp, VOFFSET(vop_getattrlist), &a)); -} - -struct vop_setattrlist_args { +extern errno_t VNOP_GETATTRLIST(vnode_t, struct attrlist *, struct uio *, int, vfs_context_t); + + +/* + *# + *#% setattrlist vp L L L + *# + */ +struct vnop_setattrlist_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; struct attrlist *a_alist; struct uio *a_uio; - struct ucred *a_cred; - struct proc *a_p; + int a_options; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_setattrlist_desc; -#define VOP_SETATTRLIST(vp, alist, uio, cred, p) _VOP_SETATTRLIST(vp, alist, uio, cred, p) -static __inline int _VOP_SETATTRLIST(struct vnode *vp, struct attrlist *alist, struct uio *uio, struct ucred *cred, struct proc *p) -{ - struct vop_setattrlist_args a; - a.a_desc = VDESC(vop_setattrlist); - a.a_vp = vp; - a.a_alist = alist; - a.a_uio = uio; - a.a_cred = cred; - a.a_p = p; - return (VCALL(vp, VOFFSET(vop_setattrlist), &a)); -} - -struct vop_read_args { +extern errno_t VNOP_SETATTRLIST(vnode_t, struct attrlist *, struct uio *, int, vfs_context_t); + + +/* + *# + *#% read vp L L L + *# + */ +struct vnop_read_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_read_desc; -#define VOP_READ(vp, uio, ioflag, cred) _VOP_READ(vp, uio, ioflag, cred) -static __inline int _VOP_READ(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred) -{ - struct vop_read_args a; - a.a_desc = VDESC(vop_read); - a.a_vp = vp; - a.a_uio = uio; - a.a_ioflag = ioflag; - a.a_cred = cred; - { - int _err; - extern int ubc_hold(struct vnode *vp); - extern void ubc_rele(struct vnode *vp); - int _didhold = ubc_hold(vp); - _err = VCALL(vp, VOFFSET(vop_read), &a); - if (_didhold) - ubc_rele(vp); - return (_err); - } -} - -struct vop_write_args { +extern errno_t VNOP_READ(vnode_t, struct uio *, int, vfs_context_t); + + +/* + *# + *#% write vp L L L + *# + */ +struct vnop_write_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_write_desc; -#define VOP_WRITE(vp, uio, 
ioflag, cred) _VOP_WRITE(vp, uio, ioflag, cred) -static __inline int _VOP_WRITE(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred) -{ - struct vop_write_args a; - a.a_desc = VDESC(vop_write); - a.a_vp = vp; - a.a_uio = uio; - a.a_ioflag = ioflag; - a.a_cred = cred; - { - int _err; - extern int ubc_hold(struct vnode *vp); - extern void ubc_rele(struct vnode *vp); - int _didhold = ubc_hold(vp); - _err = VCALL(vp, VOFFSET(vop_write), &a); - if (_didhold) - ubc_rele(vp); - return (_err); - } -} - -struct vop_lease_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct proc *a_p; - struct ucred *a_cred; - int a_flag; -}; -extern struct vnodeop_desc vop_lease_desc; -#define VOP_LEASE(vp, p, cred, flag) _VOP_LEASE(vp, p, cred, flag) -static __inline int _VOP_LEASE(struct vnode *vp, struct proc *p, struct ucred *cred, int flag) -{ - struct vop_lease_args a; - a.a_desc = VDESC(vop_lease); - a.a_vp = vp; - a.a_p = p; - a.a_cred = cred; - a.a_flag = flag; - return (VCALL(vp, VOFFSET(vop_lease), &a)); -} - -struct vop_ioctl_args { +extern errno_t VNOP_WRITE(vnode_t, struct uio *, int, vfs_context_t); + + +/* + *# + *#% ioctl vp U U U + *# + */ +struct vnop_ioctl_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; u_long a_command; caddr_t a_data; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_ioctl_desc; -#define VOP_IOCTL(vp, command, data, fflag, cred, p) _VOP_IOCTL(vp, command, data, fflag, cred, p) -static __inline int _VOP_IOCTL(struct vnode *vp, u_long command, caddr_t data, int fflag, struct ucred *cred, struct proc *p) -{ - struct vop_ioctl_args a; - a.a_desc = VDESC(vop_ioctl); - a.a_vp = vp; - a.a_command = command; - a.a_data = data; - a.a_fflag = fflag; - a.a_cred = cred; - a.a_p = p; - return (VCALL(vp, VOFFSET(vop_ioctl), &a)); -} - -struct vop_select_args { +extern errno_t VNOP_IOCTL(vnode_t, u_long, caddr_t, int, vfs_context_t); + + +/* + *# + *#% select vp U U U + *# + */ +struct vnop_select_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; int a_which; int a_fflags; - struct ucred *a_cred; void *a_wql; - struct proc *a_p; -}; -extern struct vnodeop_desc vop_select_desc; -#define VOP_SELECT(vp, which, fflags, cred, wql, p) _VOP_SELECT(vp, which, fflags, cred, wql, p) -static __inline int _VOP_SELECT(struct vnode *vp, int which, int fflags, struct ucred *cred, void *wql, struct proc *p) -{ - struct vop_select_args a; - a.a_desc = VDESC(vop_select); - a.a_vp = vp; - a.a_which = which; - a.a_fflags = fflags; - a.a_cred = cred; - a.a_wql = wql; - a.a_p = p; - return (VCALL(vp, VOFFSET(vop_select), &a)); -} - -struct vop_exchange_args { - struct vnodeop_desc *a_desc; - struct vnode *a_fvp; - struct vnode *a_tvp; - struct ucred *a_cred; - struct proc *a_p; -}; -extern struct vnodeop_desc vop_exchange_desc; -#define VOP_EXCHANGE(fvp, tvp, cred, p) _VOP_EXCHANGE(fvp, tvp, cred, p) -static __inline int _VOP_EXCHANGE(struct vnode *fvp, struct vnode *tvp, struct ucred *cred, struct proc *p) -{ - struct vop_exchange_args a; - a.a_desc = VDESC(vop_exchange); - a.a_fvp = fvp; - a.a_tvp = tvp; - a.a_cred = cred; - a.a_p = p; - return (VCALL(fvp, VOFFSET(vop_exchange), &a)); -} - -struct vop_kqfilt_add_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct knote *a_kn; - struct proc *a_p; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_kqfilt_add_desc; -#define VOP_KQFILT_ADD(vp, kn, p) _VOP_KQFILT_ADD(vp, kn, p) -static 
__inline int _VOP_KQFILT_ADD(struct vnode *vp, struct knote *kn, struct proc *p) -{ - struct vop_kqfilt_add_args a; - a.a_desc = VDESC(vop_kqfilt_add); - a.a_vp = vp; - a.a_kn = kn; - a.a_p = p; - return (VCALL(vp, VOFFSET(vop_kqfilt_add), &a)); -} - -struct vop_kqfilt_remove_args { +extern errno_t VNOP_SELECT(vnode_t, int, int, void *, vfs_context_t); + + +/* + *# + *#% exchange fvp L L L + *#% exchange tvp L L L + *# + */ +struct vnop_exchange_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; - uintptr_t a_ident; - struct proc *a_p; + vnode_t a_fvp; + vnode_t a_tvp; + int a_options; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_kqfilt_remove_desc; -#define VOP_KQFILT_REMOVE(vp, ident, p) _VOP_KQFILT_REMOVE(vp, ident, p) -static __inline int _VOP_KQFILT_REMOVE(struct vnode *vp, uintptr_t ident, struct proc *p) -{ - struct vop_kqfilt_remove_args a; - a.a_desc = VDESC(vop_kqfilt_remove); - a.a_vp = vp; - a.a_ident = ident; - a.a_p = p; - return (VCALL(vp, VOFFSET(vop_kqfilt_remove), &a)); -} - -struct vop_revoke_args { +extern errno_t VNOP_EXCHANGE(vnode_t, vnode_t, int, vfs_context_t); + + +/* + *# + *#% revoke vp U U U + *# + */ +struct vnop_revoke_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; int a_flags; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_revoke_desc; -#define VOP_REVOKE(vp, flags) _VOP_REVOKE(vp, flags) -static __inline int _VOP_REVOKE(struct vnode *vp, int flags) -{ - struct vop_revoke_args a; - a.a_desc = VDESC(vop_revoke); - a.a_vp = vp; - a.a_flags = flags; - return (VCALL(vp, VOFFSET(vop_revoke), &a)); -} - -struct vop_mmap_args { +extern errno_t VNOP_REVOKE(vnode_t, int, vfs_context_t); + + +/* + *# + *# mmap - vp U U U + *# + */ +struct vnop_mmap_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; int a_fflags; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_mmap_desc; -#define VOP_MMAP(vp, fflags, cred, p) _VOP_MMAP(vp, fflags, cred, p) -static __inline int _VOP_MMAP(struct vnode *vp, int fflags, struct ucred *cred, struct proc *p) -{ - struct vop_mmap_args a; - a.a_desc = VDESC(vop_mmap); - a.a_vp = vp; - a.a_fflags = fflags; - a.a_cred = cred; - a.a_p = p; - return (VCALL(vp, VOFFSET(vop_mmap), &a)); -} - -struct vop_fsync_args { +extern errno_t VNOP_MMAP(vnode_t, int, vfs_context_t); + +/* + *# + *# mnomap - vp U U U + *# + */ +struct vnop_mnomap_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct ucred *a_cred; - int a_waitfor; - struct proc *a_p; + vnode_t a_vp; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_fsync_desc; -#define VOP_FSYNC(vp, cred, waitfor, p) _VOP_FSYNC(vp, cred, waitfor, p) -static __inline int _VOP_FSYNC(struct vnode *vp, struct ucred *cred, int waitfor, struct proc *p) -{ - struct vop_fsync_args a; - a.a_desc = VDESC(vop_fsync); - a.a_vp = vp; - a.a_cred = cred; - a.a_waitfor = waitfor; - a.a_p = p; - { - int _err; - extern int ubc_hold(struct vnode *vp); - extern void ubc_rele(struct vnode *vp); - int _didhold = ubc_hold(vp); - _err = VCALL(vp, VOFFSET(vop_fsync), &a); - if (_didhold) - ubc_rele(vp); - return (_err); - } -} - -struct vop_seek_args { +extern errno_t VNOP_MNOMAP(vnode_t, vfs_context_t); + + +/* + *# + *#% fsync vp L L L + *# + */ +struct vnop_fsync_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; - off_t a_oldoff; - off_t a_newoff; - struct ucred *a_cred; + vnode_t a_vp; + int a_waitfor; + vfs_context_t a_context; }; -extern struct 
vnodeop_desc vop_seek_desc; -#define VOP_SEEK(vp, oldoff, newoff, cred) _VOP_SEEK(vp, oldoff, newoff, cred) -static __inline int _VOP_SEEK(struct vnode *vp, off_t oldoff, off_t newoff, struct ucred *cred) -{ - struct vop_seek_args a; - a.a_desc = VDESC(vop_seek); - a.a_vp = vp; - a.a_oldoff = oldoff; - a.a_newoff = newoff; - a.a_cred = cred; - return (VCALL(vp, VOFFSET(vop_seek), &a)); -} - -struct vop_remove_args { +extern errno_t VNOP_FSYNC(vnode_t, int, vfs_context_t); + + +/* + *# + *#% remove dvp L U U + *#% remove vp L U U + *# + */ +struct vnop_remove_args { struct vnodeop_desc *a_desc; - struct vnode *a_dvp; - struct vnode *a_vp; + vnode_t a_dvp; + vnode_t a_vp; struct componentname *a_cnp; + int a_flags; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_remove_desc; -#define VOP_REMOVE(dvp, vp, cnp) _VOP_REMOVE(dvp, vp, cnp) -static __inline int _VOP_REMOVE(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) -{ - struct vop_remove_args a; - a.a_desc = VDESC(vop_remove); - a.a_dvp = dvp; - a.a_vp = vp; - a.a_cnp = cnp; - return (VCALL(dvp, VOFFSET(vop_remove), &a)); -} - -struct vop_link_args { +extern errno_t VNOP_REMOVE(vnode_t, vnode_t, struct componentname *, int, vfs_context_t); + + +/* + *# + *#% link vp U U U + *#% link tdvp L U U + *# + */ +struct vnop_link_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct vnode *a_tdvp; + vnode_t a_vp; + vnode_t a_tdvp; struct componentname *a_cnp; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_link_desc; -#define VOP_LINK(vp, tdvp, cnp) _VOP_LINK(vp, tdvp, cnp) -static __inline int _VOP_LINK(struct vnode *vp, struct vnode *tdvp, struct componentname *cnp) -{ - struct vop_link_args a; - a.a_desc = VDESC(vop_link); - a.a_vp = vp; - a.a_tdvp = tdvp; - a.a_cnp = cnp; - return (VCALL(vp, VOFFSET(vop_link), &a)); -} - -struct vop_rename_args { +extern errno_t VNOP_LINK(vnode_t, vnode_t, struct componentname *, vfs_context_t); + + +/* + *# + *#% rename fdvp U U U + *#% rename fvp U U U + *#% rename tdvp L U U + *#% rename tvp X U U + *# + */ +struct vnop_rename_args { struct vnodeop_desc *a_desc; - struct vnode *a_fdvp; - struct vnode *a_fvp; + vnode_t a_fdvp; + vnode_t a_fvp; struct componentname *a_fcnp; - struct vnode *a_tdvp; - struct vnode *a_tvp; + vnode_t a_tdvp; + vnode_t a_tvp; struct componentname *a_tcnp; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_rename_desc; -#define VOP_RENAME(fdvp, fvp, fcnp, tdvp, tvp, tcnp) _VOP_RENAME(fdvp, fvp, fcnp, tdvp, tvp, tcnp) -static __inline int _VOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp) -{ - struct vop_rename_args a; - a.a_desc = VDESC(vop_rename); - a.a_fdvp = fdvp; - a.a_fvp = fvp; - a.a_fcnp = fcnp; - a.a_tdvp = tdvp; - a.a_tvp = tvp; - a.a_tcnp = tcnp; - return (VCALL(fdvp, VOFFSET(vop_rename), &a)); -} - -struct vop_mkdir_args { +extern errno_t VNOP_RENAME(vnode_t, vnode_t, struct componentname *, vnode_t, vnode_t, struct componentname *, vfs_context_t); + + +/* + *# + *#% mkdir dvp L U U + *#% mkdir vpp - L - + *# + */ +struct vnop_mkdir_args { struct vnodeop_desc *a_desc; - struct vnode *a_dvp; - struct vnode **a_vpp; + vnode_t a_dvp; + vnode_t *a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; -}; -extern struct vnodeop_desc vop_mkdir_desc; -#define VOP_MKDIR(dvp, vpp, cnp, vap) _VOP_MKDIR(dvp, vpp, cnp, vap) -static __inline int _VOP_MKDIR(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 
struct vattr *vap)
-{
-	struct vop_mkdir_args a;
-	a.a_desc = VDESC(vop_mkdir);
-	a.a_dvp = dvp;
-	a.a_vpp = vpp;
-	a.a_cnp = cnp;
-	a.a_vap = vap;
-	return (VCALL(dvp, VOFFSET(vop_mkdir), &a));
-}
-
-struct vop_rmdir_args {
+	struct vnode_attr *a_vap;
+	vfs_context_t a_context;
+	};
+extern errno_t VNOP_MKDIR(vnode_t, vnode_t *, struct componentname *, struct vnode_attr *, vfs_context_t);
+
+
+/*
+ *#
+ *#% rmdir dvp L U U
+ *#% rmdir vp L U U
+ *#
+ */
+struct vnop_rmdir_args {
	struct vnodeop_desc *a_desc;
-	struct vnode *a_dvp;
-	struct vnode *a_vp;
+	vnode_t a_dvp;
+	vnode_t a_vp;
	struct componentname *a_cnp;
+	vfs_context_t a_context;
};
-extern struct vnodeop_desc vop_rmdir_desc;
-#define VOP_RMDIR(dvp, vp, cnp) _VOP_RMDIR(dvp, vp, cnp)
-static __inline int _VOP_RMDIR(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
-{
-	struct vop_rmdir_args a;
-	a.a_desc = VDESC(vop_rmdir);
-	a.a_dvp = dvp;
-	a.a_vp = vp;
-	a.a_cnp = cnp;
-	return (VCALL(dvp, VOFFSET(vop_rmdir), &a));
-}
-
-struct vop_symlink_args {
-	struct vnodeop_desc *a_desc;
-	struct vnode *a_dvp;
-	struct vnode **a_vpp;
-	struct componentname *a_cnp;
-	struct vattr *a_vap;
-	char *a_target;
+extern errno_t VNOP_RMDIR(vnode_t, vnode_t, struct componentname *, vfs_context_t);
+
+
+/*
+ *#
+ *#% symlink dvp L U U
+ *#% symlink vpp - U -
+ *#
+ */
+struct vnop_symlink_args {
+	struct vnodeop_desc *a_desc;
+	vnode_t a_dvp;
+	vnode_t *a_vpp;
+	struct componentname *a_cnp;
+	struct vnode_attr *a_vap;
+	char *a_target;
+	vfs_context_t a_context;
};
-extern struct vnodeop_desc vop_symlink_desc;
-#define VOP_SYMLINK(dvp, vpp, cnp, vap, target) _VOP_SYMLINK(dvp, vpp, cnp, vap, target)
-static __inline int _VOP_SYMLINK(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap, char *target)
-{
-	struct vop_symlink_args a;
-	a.a_desc = VDESC(vop_symlink);
-	a.a_dvp = dvp;
-	a.a_vpp = vpp;
-	a.a_cnp = cnp;
-	a.a_vap = vap;
-	a.a_target = target;
-	return (VCALL(dvp, VOFFSET(vop_symlink), &a));
-}
-
-struct vop_readdir_args {
+extern errno_t VNOP_SYMLINK(vnode_t, vnode_t *, struct componentname *, struct vnode_attr *, char *, vfs_context_t);
+
+
+/*
+ *#
+ *#% readdir vp L L L
+ *#
+ *
+ * When VNOP_READDIR is called from the NFS Server, the nfs_data
+ * argument is non-NULL.
+ *
+ * The value of nfs_eofflag should be set to TRUE if the end of
+ * the directory was reached while reading.
+ *
+ * The directory seek offsets (cookies) are returned to the NFS client and
+ * may be used later to restart a directory read partway through
+ * the directory. There is one cookie returned for each directory
+ * entry returned and its size is determined from nfs_sizeofcookie.
+ * The value of the cookie should be the logical offset within the
+ * directory where the on-disk version of the appropriate directory
+ * entry starts. Memory for the cookies is allocated from M_TEMP
+ * and it is freed by the caller of VNOP_READDIR.
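+ *
+ * Illustrative sketch (hypothetical variable names): while walking the
+ * directory, a filesystem might fill these out roughly as follows:
+ *
+ *	cookies[n++] = (u_long)entry_start_offset;	/* one cookie per entry */
+ *	if (no_more_entries)
+ *		*nfs_eofflag = TRUE;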
+ * + */ + +struct vnop_readdir_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; struct uio *a_uio; - struct ucred *a_cred; + int a_flags; int *a_eofflag; - int *a_ncookies; - u_long **a_cookies; + int *a_numdirent; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_readdir_desc; -#define VOP_READDIR(vp, uio, cred, eofflag, ncookies, cookies) _VOP_READDIR(vp, uio, cred, eofflag, ncookies, cookies) -static __inline int _VOP_READDIR(struct vnode *vp, struct uio *uio, struct ucred *cred, int *eofflag, int *ncookies, u_long **cookies) -{ - struct vop_readdir_args a; - a.a_desc = VDESC(vop_readdir); - a.a_vp = vp; - a.a_uio = uio; - a.a_cred = cred; - a.a_eofflag = eofflag; - a.a_ncookies = ncookies; - a.a_cookies = cookies; - return (VCALL(vp, VOFFSET(vop_readdir), &a)); -} - -struct vop_readdirattr_args { +extern errno_t VNOP_READDIR(vnode_t, struct uio *, int, int *, int *, vfs_context_t); + + +/* + *# + *#% readdirattr vp L L L + *# + */ +struct vnop_readdirattr_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; struct attrlist *a_alist; struct uio *a_uio; u_long a_maxcount; @@ -754,480 +576,154 @@ struct vop_readdirattr_args { u_long *a_newstate; int *a_eofflag; u_long *a_actualcount; - u_long **a_cookies; - struct ucred *a_cred; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_readdirattr_desc; -#define VOP_READDIRATTR(vp, alist, uio, maxcount, options, newstate, eofflag, actualcount, cookies, cred) _VOP_READDIRATTR(vp, alist, uio, maxcount, options, newstate, eofflag, actualcount, cookies, cred) -static __inline int _VOP_READDIRATTR(struct vnode *vp, struct attrlist *alist, struct uio *uio, u_long maxcount, u_long options, u_long *newstate, int *eofflag, u_long *actualcount, u_long **cookies, struct ucred *cred) -{ - struct vop_readdirattr_args a; - a.a_desc = VDESC(vop_readdirattr); - a.a_vp = vp; - a.a_alist = alist; - a.a_uio = uio; - a.a_maxcount = maxcount; - a.a_options = options; - a.a_newstate = newstate; - a.a_eofflag = eofflag; - a.a_actualcount = actualcount; - a.a_cookies = cookies; - a.a_cred = cred; - return (VCALL(vp, VOFFSET(vop_readdirattr), &a)); -} - -struct vop_readlink_args { +extern errno_t VNOP_READDIRATTR(vnode_t, struct attrlist *, struct uio *, u_long, u_long, u_long *, int *, u_long *, vfs_context_t); + + +/* + *# + *#% readlink vp L L L + *# + */ +struct vnop_readlink_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; struct uio *a_uio; - struct ucred *a_cred; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_readlink_desc; -#define VOP_READLINK(vp, uio, cred) _VOP_READLINK(vp, uio, cred) -static __inline int _VOP_READLINK(struct vnode *vp, struct uio *uio, struct ucred *cred) -{ - struct vop_readlink_args a; - a.a_desc = VDESC(vop_readlink); - a.a_vp = vp; - a.a_uio = uio; - a.a_cred = cred; - return (VCALL(vp, VOFFSET(vop_readlink), &a)); -} - -struct vop_abortop_args { - struct vnodeop_desc *a_desc; - struct vnode *a_dvp; - struct componentname *a_cnp; -}; -extern struct vnodeop_desc vop_abortop_desc; -#define VOP_ABORTOP(dvp, cnp) _VOP_ABORTOP(dvp, cnp) -static __inline int _VOP_ABORTOP(struct vnode *dvp, struct componentname *cnp) -{ - struct vop_abortop_args a; - a.a_desc = VDESC(vop_abortop); - a.a_dvp = dvp; - a.a_cnp = cnp; - return (VCALL(dvp, VOFFSET(vop_abortop), &a)); -} - -struct vop_inactive_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct proc *a_p; -}; -extern struct vnodeop_desc vop_inactive_desc; -#define 
VOP_INACTIVE(vp, p) _VOP_INACTIVE(vp, p) -static __inline int _VOP_INACTIVE(struct vnode *vp, struct proc *p) -{ - struct vop_inactive_args a; - a.a_desc = VDESC(vop_inactive); - a.a_vp = vp; - a.a_p = p; - return (VCALL(vp, VOFFSET(vop_inactive), &a)); -} - -struct vop_reclaim_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct proc *a_p; -}; -extern struct vnodeop_desc vop_reclaim_desc; -#define VOP_RECLAIM(vp, p) _VOP_RECLAIM(vp, p) -static __inline int _VOP_RECLAIM(struct vnode *vp, struct proc *p) -{ - struct vop_reclaim_args a; - a.a_desc = VDESC(vop_reclaim); - a.a_vp = vp; - a.a_p = p; - return (VCALL(vp, VOFFSET(vop_reclaim), &a)); -} - -struct vop_lock_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - int a_flags; - struct proc *a_p; -}; -extern struct vnodeop_desc vop_lock_desc; -#define VOP_LOCK(vp, flags, p) _VOP_LOCK(vp, flags, p) -static __inline int _VOP_LOCK(struct vnode *vp, int flags, struct proc *p) -{ - struct vop_lock_args a; - a.a_desc = VDESC(vop_lock); - a.a_vp = vp; - a.a_flags = flags; - a.a_p = p; - return (VCALL(vp, VOFFSET(vop_lock), &a)); -} - -struct vop_unlock_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - int a_flags; - struct proc *a_p; -}; -extern struct vnodeop_desc vop_unlock_desc; -#define VOP_UNLOCK(vp, flags, p) _VOP_UNLOCK(vp, flags, p) -static __inline int _VOP_UNLOCK(struct vnode *vp, int flags, struct proc *p) -{ - struct vop_unlock_args a; - a.a_desc = VDESC(vop_unlock); - a.a_vp = vp; - a.a_flags = flags; - a.a_p = p; - return (VCALL(vp, VOFFSET(vop_unlock), &a)); -} - -struct vop_bmap_args { +extern errno_t VNOP_READLINK(vnode_t, struct uio *, vfs_context_t); + + +/* + *# + *#% inactive vp L U U + *# + */ +struct vnop_inactive_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; - daddr_t a_bn; - struct vnode **a_vpp; - daddr_t *a_bnp; - int *a_runp; + vnode_t a_vp; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_bmap_desc; -#define VOP_BMAP(vp, bn, vpp, bnp, runp) _VOP_BMAP(vp, bn, vpp, bnp, runp) -static __inline int _VOP_BMAP(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr_t *bnp, int *runp) -{ - struct vop_bmap_args a; - a.a_desc = VDESC(vop_bmap); - a.a_vp = vp; - a.a_bn = bn; - a.a_vpp = vpp; - a.a_bnp = bnp; - a.a_runp = runp; - return (VCALL(vp, VOFFSET(vop_bmap), &a)); -} - -struct vop_print_args { +extern errno_t VNOP_INACTIVE(vnode_t, vfs_context_t); + + +/* + *# + *#% reclaim vp U U U + *# + */ +struct vnop_reclaim_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_print_desc; -#define VOP_PRINT(vp) _VOP_PRINT(vp) -static __inline int _VOP_PRINT(struct vnode *vp) -{ - struct vop_print_args a; - a.a_desc = VDESC(vop_print); - a.a_vp = vp; - return (VCALL(vp, VOFFSET(vop_print), &a)); -} - -struct vop_islocked_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; -}; -extern struct vnodeop_desc vop_islocked_desc; -#define VOP_ISLOCKED(vp) _VOP_ISLOCKED(vp) -static __inline int _VOP_ISLOCKED(struct vnode *vp) -{ - struct vop_islocked_args a; - a.a_desc = VDESC(vop_islocked); - a.a_vp = vp; - return (VCALL(vp, VOFFSET(vop_islocked), &a)); -} - -struct vop_pathconf_args { +extern errno_t VNOP_RECLAIM(vnode_t, vfs_context_t); + + +/* + *# + *#% pathconf vp L L L + *# + */ +struct vnop_pathconf_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; int a_name; register_t *a_retval; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_pathconf_desc; 
-#define VOP_PATHCONF(vp, name, retval) _VOP_PATHCONF(vp, name, retval) -static __inline int _VOP_PATHCONF(struct vnode *vp, int name, register_t *retval) -{ - struct vop_pathconf_args a; - a.a_desc = VDESC(vop_pathconf); - a.a_vp = vp; - a.a_name = name; - a.a_retval = retval; - return (VCALL(vp, VOFFSET(vop_pathconf), &a)); -} - -struct vop_advlock_args { +extern errno_t VNOP_PATHCONF(vnode_t, int, register_t *, vfs_context_t); /* register_t??????? */ + + +/* + *# + *#% advlock vp U U U + *# + */ +struct vnop_advlock_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_advlock_desc; -#define VOP_ADVLOCK(vp, id, op, fl, flags) _VOP_ADVLOCK(vp, id, op, fl, flags) -static __inline int _VOP_ADVLOCK(struct vnode *vp, caddr_t id, int op, struct flock *fl, int flags) -{ - struct vop_advlock_args a; - a.a_desc = VDESC(vop_advlock); - a.a_vp = vp; - a.a_id = id; - a.a_op = op; - a.a_fl = fl; - a.a_flags = flags; - return (VCALL(vp, VOFFSET(vop_advlock), &a)); -} - -struct vop_blkatoff_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - off_t a_offset; - char **a_res; - struct buf **a_bpp; -}; -extern struct vnodeop_desc vop_blkatoff_desc; -#define VOP_BLKATOFF(vp, offset, res, bpp) _VOP_BLKATOFF(vp, offset, res, bpp) -static __inline int _VOP_BLKATOFF(struct vnode *vp, off_t offset, char **res, struct buf **bpp) -{ - struct vop_blkatoff_args a; - a.a_desc = VDESC(vop_blkatoff); - a.a_vp = vp; - a.a_offset = offset; - a.a_res = res; - a.a_bpp = bpp; - return (VCALL(vp, VOFFSET(vop_blkatoff), &a)); -} - -struct vop_valloc_args { - struct vnodeop_desc *a_desc; - struct vnode *a_pvp; - int a_mode; - struct ucred *a_cred; - struct vnode **a_vpp; -}; -extern struct vnodeop_desc vop_valloc_desc; -#define VOP_VALLOC(pvp, mode, cred, vpp) _VOP_VALLOC(pvp, mode, cred, vpp) -static __inline int _VOP_VALLOC(struct vnode *pvp, int mode, struct ucred *cred, struct vnode **vpp) -{ - struct vop_valloc_args a; - a.a_desc = VDESC(vop_valloc); - a.a_pvp = pvp; - a.a_mode = mode; - a.a_cred = cred; - a.a_vpp = vpp; - return (VCALL(pvp, VOFFSET(vop_valloc), &a)); -} - -struct vop_reallocblks_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct cluster_save *a_buflist; -}; -extern struct vnodeop_desc vop_reallocblks_desc; -#define VOP_REALLOCBLKS(vp, buflist) _VOP_REALLOCBLKS(vp, buflist) -static __inline int _VOP_REALLOCBLKS(struct vnode *vp, struct cluster_save *buflist) -{ - struct vop_reallocblks_args a; - a.a_desc = VDESC(vop_reallocblks); - a.a_vp = vp; - a.a_buflist = buflist; - return (VCALL(vp, VOFFSET(vop_reallocblks), &a)); -} - -struct vop_vfree_args { - struct vnodeop_desc *a_desc; - struct vnode *a_pvp; - ino_t a_ino; - int a_mode; -}; -extern struct vnodeop_desc vop_vfree_desc; -#define VOP_VFREE(pvp, ino, mode) _VOP_VFREE(pvp, ino, mode) -static __inline int _VOP_VFREE(struct vnode *pvp, ino_t ino, int mode) -{ - struct vop_vfree_args a; - a.a_desc = VDESC(vop_vfree); - a.a_pvp = pvp; - a.a_ino = ino; - a.a_mode = mode; - return (VCALL(pvp, VOFFSET(vop_vfree), &a)); -} - -struct vop_truncate_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - off_t a_length; - int a_flags; - struct ucred *a_cred; - struct proc *a_p; -}; -extern struct vnodeop_desc vop_truncate_desc; -#define VOP_TRUNCATE(vp, length, flags, cred, p) _VOP_TRUNCATE(vp, length, flags, cred, p) -static __inline int _VOP_TRUNCATE(struct vnode *vp, off_t length, int 
flags, struct ucred *cred, struct proc *p) -{ - struct vop_truncate_args a; - a.a_desc = VDESC(vop_truncate); - a.a_vp = vp; - a.a_length = length; - a.a_flags = flags; - a.a_cred = cred; - a.a_p = p; - { - int _err; - extern int ubc_hold(struct vnode *vp); - extern void ubc_rele(struct vnode *vp); - int _didhold = ubc_hold(vp); - _err = VCALL(vp, VOFFSET(vop_truncate), &a); - if (_didhold) - ubc_rele(vp); - return (_err); - } -} - -struct vop_allocate_args { +extern errno_t VNOP_ADVLOCK(vnode_t, caddr_t, int, struct flock *, int, vfs_context_t); + +/* + *# + *#% allocate vp L L L + *# + */ +struct vnop_allocate_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; off_t a_length; u_int32_t a_flags; off_t *a_bytesallocated; off_t a_offset; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_allocate_desc; -#define VOP_ALLOCATE(vp, length, flags, bytesallocated, offset, cred, p) _VOP_ALLOCATE(vp, length, flags, bytesallocated, offset, cred, p) -static __inline int _VOP_ALLOCATE(struct vnode *vp, off_t length, u_int32_t flags, off_t *bytesallocated, off_t offset, struct ucred *cred, struct proc *p) -{ - struct vop_allocate_args a; - a.a_desc = VDESC(vop_allocate); - a.a_vp = vp; - a.a_length = length; - a.a_flags = flags; - a.a_bytesallocated = bytesallocated; - a.a_offset = offset; - a.a_cred = cred; - a.a_p = p; - return (VCALL(vp, VOFFSET(vop_allocate), &a)); -} - -struct vop_update_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct timeval *a_access; - struct timeval *a_modify; - int a_waitfor; -}; -extern struct vnodeop_desc vop_update_desc; -#define VOP_UPDATE(vp, access, modify, waitfor) _VOP_UPDATE(vp, access, modify, waitfor) -static __inline int _VOP_UPDATE(struct vnode *vp, struct timeval *access, struct timeval *modify, int waitfor) -{ - struct vop_update_args a; - a.a_desc = VDESC(vop_update); - a.a_vp = vp; - a.a_access = access; - a.a_modify = modify; - a.a_waitfor = waitfor; - return (VCALL(vp, VOFFSET(vop_update), &a)); -} - -struct vop_pgrd_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct uio *a_uio; - struct ucred *a_cred; -}; -extern struct vnodeop_desc vop_pgrd_desc; -#define VOP_PGRD(vp, uio, cred) _VOP_PGRD(vp, uio, cred) -static __inline int _VOP_PGRD(struct vnode *vp, struct uio *uio, struct ucred *cred) -{ - struct vop_pgrd_args a; - a.a_desc = VDESC(vop_pgrd); - a.a_vp = vp; - a.a_uio = uio; - a.a_cred = cred; - return (VCALL(vp, VOFFSET(vop_pgrd), &a)); -} - -struct vop_pgwr_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct uio *a_uio; - struct ucred *a_cred; - vm_offset_t a_offset; -}; -extern struct vnodeop_desc vop_pgwr_desc; -#define VOP_PGWR(vp, uio, cred, offset) _VOP_PGWR(vp, uio, cred, offset) -static __inline int _VOP_PGWR(struct vnode *vp, struct uio *uio, struct ucred *cred, vm_offset_t offset) -{ - struct vop_pgwr_args a; - a.a_desc = VDESC(vop_pgwr); - a.a_vp = vp; - a.a_uio = uio; - a.a_cred = cred; - a.a_offset = offset; - return (VCALL(vp, VOFFSET(vop_pgwr), &a)); -} - -struct vop_pagein_args { +extern errno_t VNOP_ALLOCATE(vnode_t, off_t, u_int32_t, off_t *, off_t, vfs_context_t); + +/* + *# + *#% pagein vp = = = + *# + */ +struct vnop_pagein_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; upl_t a_pl; vm_offset_t a_pl_offset; off_t a_f_offset; size_t a_size; - struct ucred *a_cred; int a_flags; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_pagein_desc; -#define VOP_PAGEIN(vp, 
pl, pl_offset, f_offset, size, cred, flags) _VOP_PAGEIN(vp, pl, pl_offset, f_offset, size, cred, flags) -static __inline int _VOP_PAGEIN(struct vnode *vp, upl_t pl, vm_offset_t pl_offset, off_t f_offset, size_t size, struct ucred *cred, int flags) -{ - struct vop_pagein_args a; - a.a_desc = VDESC(vop_pagein); - a.a_vp = vp; - a.a_pl = pl; - a.a_pl_offset = pl_offset; - a.a_f_offset = f_offset; - a.a_size = size; - a.a_cred = cred; - a.a_flags = flags; - return (VCALL(vp, VOFFSET(vop_pagein), &a)); -} - -struct vop_pageout_args { +extern errno_t VNOP_PAGEIN(vnode_t, upl_t, vm_offset_t, off_t, size_t, int, vfs_context_t); /* vm_offset_t ? */ + + +/* + *# + *#% pageout vp = = = + *# + */ +struct vnop_pageout_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; upl_t a_pl; vm_offset_t a_pl_offset; off_t a_f_offset; size_t a_size; - struct ucred *a_cred; int a_flags; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_pageout_desc; -#define VOP_PAGEOUT(vp, pl, pl_offset, f_offset, size, cred, flags) _VOP_PAGEOUT(vp, pl, pl_offset, f_offset, size, cred, flags) -static __inline int _VOP_PAGEOUT(struct vnode *vp, upl_t pl, vm_offset_t pl_offset, off_t f_offset, size_t size, struct ucred *cred, int flags) -{ - struct vop_pageout_args a; - a.a_desc = VDESC(vop_pageout); - a.a_vp = vp; - a.a_pl = pl; - a.a_pl_offset = pl_offset; - a.a_f_offset = f_offset; - a.a_size = size; - a.a_cred = cred; - a.a_flags = flags; - return (VCALL(vp, VOFFSET(vop_pageout), &a)); -} - -struct vop_devblocksize_args { +extern errno_t VNOP_PAGEOUT(vnode_t, upl_t, vm_offset_t, off_t, size_t, int, vfs_context_t); + + +#ifdef BSD_KERNEL_PRIVATE +/* + *#% devblocksize vp = = = + *# + */ +struct vnop_devblocksize_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; register_t *a_retval; }; -extern struct vnodeop_desc vop_devblocksize_desc; -#define VOP_DEVBLOCKSIZE(vp, retval) _VOP_DEVBLOCKSIZE(vp, retval) -static __inline int _VOP_DEVBLOCKSIZE(struct vnode *vp, register_t *retval) -{ - struct vop_devblocksize_args a; - a.a_desc = VDESC(vop_devblocksize); - a.a_vp = vp; - a.a_retval = retval; - return (VCALL(vp, VOFFSET(vop_devblocksize), &a)); -} - -struct vop_searchfs_args { +#endif /* BSD_KERNEL_PRIVATE */ + +/* + *# + *#% searchfs vp L L L + *# + */ +struct vnop_searchfs_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; void *a_searchparams1; void *a_searchparams2; struct attrlist *a_searchattrs; @@ -1239,145 +735,156 @@ struct vop_searchfs_args { u_long a_options; struct uio *a_uio; struct searchstate *a_searchstate; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_searchfs_desc; -#define VOP_SEARCHFS(vp, searchparams1, searchparams2, searchattrs, maxmatches, timelimit, returnattrs, nummatches, scriptcode, options, uio, searchstate) _VOP_SEARCHFS(vp, searchparams1, searchparams2, searchattrs, maxmatches, timelimit, returnattrs, nummatches, scriptcode, options, uio, searchstate) -static __inline int _VOP_SEARCHFS(struct vnode *vp, void *searchparams1, void *searchparams2, struct attrlist *searchattrs, u_long maxmatches, struct timeval *timelimit, struct attrlist *returnattrs, u_long *nummatches, u_long scriptcode, u_long options, struct uio *uio, struct searchstate *searchstate) -{ - struct vop_searchfs_args a; - a.a_desc = VDESC(vop_searchfs); - a.a_vp = vp; - a.a_searchparams1 = searchparams1; - a.a_searchparams2 = searchparams2; - a.a_searchattrs = searchattrs; - a.a_maxmatches = maxmatches; - a.a_timelimit = timelimit; - 
a.a_returnattrs = returnattrs; - a.a_nummatches = nummatches; - a.a_scriptcode = scriptcode; - a.a_options = options; - a.a_uio = uio; - a.a_searchstate = searchstate; - return (VCALL(vp, VOFFSET(vop_searchfs), &a)); -} - -struct vop_copyfile_args { +extern errno_t VNOP_SEARCHFS(vnode_t, void *, void *, struct attrlist *, u_long, struct timeval *, struct attrlist *, u_long *, u_long, u_long, struct uio *, struct searchstate *, vfs_context_t); + + +/* + *# + *#% copyfile fvp U U U + *#% copyfile tdvp L U U + *#% copyfile tvp X U U + *# + */ +struct vnop_copyfile_args { struct vnodeop_desc *a_desc; - struct vnode *a_fvp; - struct vnode *a_tdvp; - struct vnode *a_tvp; + vnode_t a_fvp; + vnode_t a_tdvp; + vnode_t a_tvp; struct componentname *a_tcnp; int a_mode; int a_flags; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_copyfile_desc; -#define VOP_COPYFILE(fvp, tdvp, tvp, tcnp, mode, flags) _VOP_COPYFILE(fvp, tdvp, tvp, tcnp, mode, flags) -static __inline int _VOP_COPYFILE(struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, int mode, int flags) -{ - struct vop_copyfile_args a; - a.a_desc = VDESC(vop_copyfile); - a.a_fvp = fvp; - a.a_tdvp = tdvp; - a.a_tvp = tvp; - a.a_tcnp = tcnp; - a.a_mode = mode; - a.a_flags = flags; - return (VCALL(fvp, VOFFSET(vop_copyfile), &a)); -} - -struct vop_blktooff_args { +extern errno_t VNOP_COPYFILE(vnode_t, vnode_t, vnode_t, struct componentname *, int, int, vfs_context_t); + + +struct vnop_getxattr_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; - daddr_t a_lblkno; + vnode_t a_vp; + char * a_name; + uio_t a_uio; + size_t *a_size; + int a_options; + vfs_context_t a_context; +}; +extern struct vnodeop_desc vnop_getxattr_desc; +extern errno_t VNOP_GETXATTR(vnode_t, const char *, uio_t, size_t *, int, vfs_context_t); + +struct vnop_setxattr_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + char * a_name; + uio_t a_uio; + int a_options; + vfs_context_t a_context; +}; +extern struct vnodeop_desc vnop_setxattr_desc; +extern errno_t VNOP_SETXATTR(vnode_t, const char *, uio_t, int, vfs_context_t); + +struct vnop_removexattr_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + char * a_name; + int a_options; + vfs_context_t a_context; +}; +extern struct vnodeop_desc vnop_removexattr_desc; +extern errno_t VNOP_REMOVEXATTR(vnode_t, const char *, int, vfs_context_t); + +struct vnop_listxattr_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + uio_t a_uio; + size_t *a_size; + int a_options; + vfs_context_t a_context; +}; +extern struct vnodeop_desc vnop_listxattr_desc; +extern errno_t VNOP_LISTXATTR(vnode_t, uio_t, size_t *, int, vfs_context_t); + + +/* + *# + *#% blktooff vp = = = + *# + */ +struct vnop_blktooff_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + daddr64_t a_lblkno; off_t *a_offset; }; -extern struct vnodeop_desc vop_blktooff_desc; -#define VOP_BLKTOOFF(vp, lblkno, offset) _VOP_BLKTOOFF(vp, lblkno, offset) -static __inline int _VOP_BLKTOOFF(struct vnode *vp, daddr_t lblkno, off_t *offset) -{ - struct vop_blktooff_args a; - a.a_desc = VDESC(vop_blktooff); - a.a_vp = vp; - a.a_lblkno = lblkno; - a.a_offset = offset; - return (VCALL(vp, VOFFSET(vop_blktooff), &a)); -} - -struct vop_offtoblk_args { +extern errno_t VNOP_BLKTOOFF(vnode_t, daddr64_t, off_t *); + + +/* + *# + *#% offtoblk vp = = = + *# + */ +struct vnop_offtoblk_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; off_t a_offset; - daddr_t *a_lblkno; + daddr64_t *a_lblkno; }; -extern struct 
vnodeop_desc vop_offtoblk_desc; -#define VOP_OFFTOBLK(vp, offset, lblkno) _VOP_OFFTOBLK(vp, offset, lblkno) -static __inline int _VOP_OFFTOBLK(struct vnode *vp, off_t offset, daddr_t *lblkno) -{ - struct vop_offtoblk_args a; - a.a_desc = VDESC(vop_offtoblk); - a.a_vp = vp; - a.a_offset = offset; - a.a_lblkno = lblkno; - return (VCALL(vp, VOFFSET(vop_offtoblk), &a)); -} - -struct vop_cmap_args { +extern errno_t VNOP_OFFTOBLK(vnode_t, off_t, daddr64_t *); + + +/* + *# + *#% blockmap vp L L L + *# + */ +struct vnop_blockmap_args { struct vnodeop_desc *a_desc; - struct vnode *a_vp; + vnode_t a_vp; off_t a_foffset; size_t a_size; - daddr_t *a_bpn; + daddr64_t *a_bpn; size_t *a_run; void *a_poff; + int a_flags; + vfs_context_t a_context; }; -extern struct vnodeop_desc vop_cmap_desc; -#define VOP_CMAP(vp, foffset, size, bpn, run, poff) _VOP_CMAP(vp, foffset, size, bpn, run, poff) -static __inline int _VOP_CMAP(struct vnode *vp, off_t foffset, size_t size, daddr_t *bpn, size_t *run, void *poff) -{ - struct vop_cmap_args a; - a.a_desc = VDESC(vop_cmap); - a.a_vp = vp; - a.a_foffset = foffset; - a.a_size = size; - a.a_bpn = bpn; - a.a_run = run; - a.a_poff = poff; - return (VCALL(vp, VOFFSET(vop_cmap), &a)); -} - -/* Special cases: */ -#include <sys/buf.h> -#include <sys/vm.h> +extern errno_t VNOP_BLOCKMAP(vnode_t, off_t, size_t, daddr64_t *, size_t *, void *, + int, vfs_context_t); -struct vop_strategy_args { +struct vnop_strategy_args { struct vnodeop_desc *a_desc; struct buf *a_bp; }; -extern struct vnodeop_desc vop_strategy_desc; -#define VOP_STRATEGY(bp) _VOP_STRATEGY(bp) -static __inline int _VOP_STRATEGY(struct buf *bp) -{ - struct vop_strategy_args a; - a.a_desc = VDESC(vop_strategy); - a.a_bp = bp; - return (VCALL(bp->b_vp, VOFFSET(vop_strategy), &a)); -} - -struct vop_bwrite_args { +extern errno_t VNOP_STRATEGY(struct buf *bp); + +struct vnop_bwrite_args { struct vnodeop_desc *a_desc; - struct buf *a_bp; + buf_t a_bp; }; -extern struct vnodeop_desc vop_bwrite_desc; -#define VOP_BWRITE(bp) _VOP_BWRITE(bp) -static __inline int _VOP_BWRITE(struct buf *bp) -{ - struct vop_bwrite_args a; - a.a_desc = VDESC(vop_bwrite); - a.a_bp = bp; - return (VCALL(bp->b_vp, VOFFSET(vop_bwrite), &a)); -} - -/* End of special cases. */ - -#endif /* __APPLE_API_UNSTABLE */ +extern errno_t VNOP_BWRITE(buf_t); + + +struct vnop_kqfilt_add_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + struct knote *a_kn; + vfs_context_t a_context; +}; +extern struct vnodeop_desc vnop_kqfilt_add_desc; +extern errno_t VNOP_KQFILT_ADD(vnode_t , struct knote *, vfs_context_t); + +struct vnop_kqfilt_remove_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + uintptr_t a_ident; + vfs_context_t a_context; +}; +extern struct vnodeop_desc vnop_kqfilt_remove_desc; +errno_t VNOP_KQFILT_REMOVE(vnode_t , uintptr_t , vfs_context_t); + +__END_DECLS + +#endif /* KERNEL */ + #endif /* !_SYS_VNODE_IF_H_ */ diff --git a/bsd/sys/vnode_internal.h b/bsd/sys/vnode_internal.h new file mode 100644 index 000000000..df02742df --- /dev/null +++ b/bsd/sys/vnode_internal.h @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. 
+ * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vnode.h 8.17 (Berkeley) 5/20/95 + */ + +#ifndef _SYS_VNODE_INTERNAL_H_ +#define _SYS_VNODE_INTERNAL_H_ + +#define INTERIM_FSNODE_LOCK 1 + +#include <sys/appleapiopts.h> +#include <sys/cdefs.h> +#include <sys/queue.h> +#include <sys/lock.h> + +#include <sys/time.h> +#include <sys/uio.h> + +#include <sys/vm.h> +#include <sys/systm.h> +#include <kern/locks.h> +#include <vm/vm_kern.h> +#include <sys/vnode.h> +#include <sys/namei.h> +#include <sys/vfs_context.h> + + +struct lockf; + +LIST_HEAD(buflists, buf); + + +struct unsafe_fsnode { + lck_mtx_t fsnodelock; + int32_t fsnode_count; + void * fsnodeowner; +}; + +/* + * Reading or writing any of these items requires holding the appropriate lock. 
+ * v_freelist is locked by the global vnode_list_lock
+ * v_mntvnodes is locked by the mount_lock
+ * v_nclinks and v_ncchildren are protected by the global name_cache_lock
+ * v_cleanblkhd and v_dirtyblkhd and v_iterblkflags are locked via the global buf_mtxp
+ * the rest of the structure is protected by the vnode_lock
+ */
+struct vnode {
+	lck_mtx_t v_lock;			/* vnode mutex */
+	TAILQ_ENTRY(vnode) v_freelist;		/* vnode freelist */
+	TAILQ_ENTRY(vnode) v_mntvnodes;		/* vnodes for mount point */
+	LIST_HEAD(, namecache) v_nclinks;	/* name cache entries that name this vnode */
+	LIST_HEAD(, namecache) v_ncchildren;	/* name cache entries that regard us as their parent */
+	vnode_t	 v_defer_reclaimlist;		/* in case we have to defer the reclaim to avoid recursion */
+	u_long	 v_flag;			/* vnode flags (see below) */
+	u_short	 v_lflag;			/* vnode local and named ref flags */
+	u_char	 v_iterblkflags;		/* buf iterator flags */
+	u_char	 v_references;			/* number of times io_count has been granted */
+	int32_t	 v_kusecount;			/* count of in-kernel refs */
+	int32_t	 v_usecount;			/* reference count of users */
+	int32_t	 v_iocount;			/* iocounters */
+	void *	 v_owner;			/* act that owns the vnode */
+	enum vtype v_type;			/* vnode type */
+	u_long	 v_id;				/* identity of vnode contents */
+	union {
+		struct mount	*vu_mountedhere;/* ptr to mounted vfs (VDIR) */
+		struct socket	*vu_socket;	/* unix ipc (VSOCK) */
+		struct specinfo	*vu_specinfo;	/* device (VCHR, VBLK) */
+		struct fifoinfo	*vu_fifoinfo;	/* fifo (VFIFO) */
+		struct ubc_info *vu_ubcinfo;	/* valid for (VREG) */
+	} v_un;
+	struct buflists v_cleanblkhd;		/* clean blocklist head */
+	struct buflists v_dirtyblkhd;		/* dirty blocklist head */
+	kauth_cred_t v_cred;
+	int	v_cred_timestamp;
+	long	v_numoutput;			/* num of writes in progress */
+	long	v_writecount;			/* reference count of writers */
+	char *	v_name;				/* name component of the vnode */
+	vnode_t	v_parent;			/* pointer to parent vnode */
+#ifdef INTERIM_FSNODE_LOCK
+	struct lockf	*v_lockf;		/* advisory lock list head */
+	struct unsafe_fsnode *v_unsafefs;	/* pointer to struct used to lock */
+#endif						/* vnodes on unsafe filesystems */
+	int	(**v_op)(void *);		/* vnode operations vector */
+	enum vtagtype v_tag;			/* type of underlying data */
+	mount_t	v_mount;			/* ptr to vfs we are in */
+	void *	v_data;				/* private data for fs */
+};
+
+#define	v_mountedhere	v_un.vu_mountedhere
+#define	v_socket	v_un.vu_socket
+#define	v_specinfo	v_un.vu_specinfo
+#define	v_fifoinfo	v_un.vu_fifoinfo
+#define	v_ubcinfo	v_un.vu_ubcinfo
+
+
+/*
+ * v_iterblkflags
+ */
+#define	VBI_ITER	0x1
+#define	VBI_ITERWANT	0x2
+#define	VBI_CLEAN	0x4
+#define	VBI_DIRTY	0x8
+#define	VBI_NEWBUF	0x10
+
+
+/*
+ * v_lflags
+ */
+#define	VL_SUSPENDED	0x0001		/* vnode is suspended */
+#define	VL_DRAIN	0x0002		/* vnode is being drained */
+#define	VL_TERMINATE	0x0004		/* vnode is marked for termination */
+#define	VL_TERMWANT	0x0008		/* there is a waiter for termination to finish */
+#define	VL_DEAD		0x0010		/* vnode is dead and completed recycle */
+#define	VL_MARKTERM	0x0020		/* vnode should be recycled when no longer referenced */
+#define	VL_MOUNTDEAD	0x0040		/* v_mountedhere is dead */
+#define	VL_NEEDINACTIVE	0x0080		/* delay VNOP_INACTIVE until iocount goes to 0 */
+
+#define	VNAMED_UBC	0x2000		/* ubc named reference */
+#define	VNAMED_MOUNT	0x4000		/* mount point named reference */
+#define	VNAMED_FSHASH	0x8000		/* FS hash named reference */
+
+
+/*
+ * v_flags
+ */
+#define	VROOT		0x000001	/* root of its file system */
+#define	VTEXT		0x000002	/* vnode is a pure text prototype */
+#define	VSYSTEM		0x000004	/* vnode being used by kernel */
+#define	VISTTY		0x000008	/* vnode represents a tty */
+#define	VWASMAPPED	0x000010	/* vnode was mapped before */
+#define	VTERMINATE	0x000020	/* terminating memory object */
+#define	VTERMWANT	0x000040	/* waiting for memory object death */
+#define	VMOUNT		0x000080	/* mount operation in progress */
+#define	VBWAIT		0x000100	/* waiting for output to complete */
+#define	VALIASED	0x000200	/* vnode has an alias */
+#define	VNOCACHE_DATA	0x000400	/* don't keep data cached once it's been consumed */
+#define	VSTANDARD	0x000800	/* vnode obtained from common pool */
+#define	VAGE		0x001000	/* Insert vnode at head of free list */
+#define	VRAOFF		0x002000	/* read ahead disabled */
+#define	VNCACHEABLE	0x004000	/* vnode is allowed to be put back in name cache */
+#define	VUINACTIVE	0x008000	/* UBC vnode is on inactive list */
+#define	VSWAP		0x010000	/* vnode is being used as swapfile */
+#define	VTHROTTLED	0x020000	/* writes or pageouts have been throttled */
+					/* wakeup tasks waiting when count falls below threshold */
+#define	VNOFLUSH	0x040000	/* don't vflush() if SKIPSYSTEM */
+#define	VLOCKLOCAL	0x080000	/* this vnode does adv locking in vfs */
+#define	VISHARDLINK	0x100000	/* hard link needs special processing on lookup and in volfs */
+
+#define	VCRED_EXPIRED	2		/* number of seconds to keep cached credential valid */
+
+
+/*
+ * Global vnode data.
+ */
+extern	struct vnode *rootvnode;	/* root (i.e. "/") vnode */
+
+
+/*
+ * Mods for extensibility.
+ */
+
+/*
+ * Flags for vdesc_flags:
+ */
+#define	VDESC_MAX_VPS		16
+/* Low order 16 flag bits are reserved for willrele flags for vp arguments. */
+#define	VDESC_VP0_WILLRELE	0x0001
+#define	VDESC_VP1_WILLRELE	0x0002
+#define	VDESC_VP2_WILLRELE	0x0004
+#define	VDESC_VP3_WILLRELE	0x0008
+#define	VDESC_NOMAP_VPP		0x0100
+#define	VDESC_VPP_WILLRELE	0x0200
+
+/*
+ * VDESC_NO_OFFSET is used to identify the end of the offset list
+ * and in places where no such field exists.
+ */
+#define	VDESC_NO_OFFSET	-1
+
+/*
+ * This structure describes the vnode operation taking place.
+ */
+struct vnodeop_desc {
+	int	vdesc_offset;		/* offset in vector--first for speed */
+	char	*vdesc_name;		/* a readable name for debugging */
+	int	vdesc_flags;		/* VDESC_* flags */
+
+	/*
+	 * These ops are used by bypass routines to map and locate arguments.
+	 * Creds and procs are not needed in bypass routines, but sometimes
+	 * they are useful to (for example) transport layers.
+	 * Nameidata is useful because it has a cred in it.
+	 */
+	int	*vdesc_vp_offsets;	/* list ended by VDESC_NO_OFFSET */
+	int	vdesc_vpp_offset;	/* return vpp location */
+	int	vdesc_cred_offset;	/* cred location, if any */
+	int	vdesc_proc_offset;	/* proc location, if any */
+	int	vdesc_componentname_offset;	/* if any */
+	int	vdesc_context_offset;	/* context location, if any */
+	/*
+	 * Finally, we've got a list of private data (about each operation)
+	 * for each transport layer.  (Support to manage this list is not
+	 * yet part of BSD.)
+	 */
+	caddr_t	*vdesc_transports;
+};
+
+/*
+ * A list of all the operation descs.
+ */
+extern struct vnodeop_desc *vnodeop_descs[];
+
+/*
+ * Interlock for scanning list of vnodes attached to a mountpoint
+ */
+extern void * mntvnode_slock;
+
+/*
+ * This macro is very helpful in defining those offsets in the vdesc struct.
+ *
+ * This is stolen from X11R4.  I ignored all the fancy stuff for
+ * Crays, so if you decide to port this to such a serious machine,
+ * you might want to consult Intrinsic.h's XtOffset{,Of,To}.
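+ *
+ * Illustrative usage (an editorial sketch, not in the original source):
+ * the offset-table entry for the vnode argument of the pageout operation
+ * declared in vnode_if.h could be generated as
+ *
+ *	VOPARG_OFFSETOF(struct vnop_pageout_args, a_vp)
+ *
+ * and a bypass routine holding only a generic args pointer `ap' plus
+ * such an offset can map it back into the argument with
+ *
+ *	vnode_t *vpp = VOPARG_OFFSETTO(vnode_t *, offset, ap);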
+ */ +#define VOPARG_OFFSET(p_type,field) \ + ((int) (((char *) (&(((p_type)NULL)->field))) - ((char *) NULL))) +#define VOPARG_OFFSETOF(s_type,field) \ + VOPARG_OFFSET(s_type*,field) +#define VOPARG_OFFSETTO(S_TYPE,S_OFFSET,STRUCT_P) \ + ((S_TYPE)(((char*)(STRUCT_P))+(S_OFFSET))) + + + +/* + * VOCALL calls an op given an ops vector. We break it out because BSD's + * vclean changes the ops vector and then wants to call ops with the old + * vector. + */ +#define VOCALL(OPSV,OFF,AP) (( *((OPSV)[(OFF)])) (AP)) + +/* + * This call works for vnodes in the kernel. + */ +#define VCALL(VP,OFF,AP) VOCALL((VP)->v_op,(OFF),(AP)) +#define VDESC(OP) (& __CONCAT(OP,_desc)) +#define VOFFSET(OP) (VDESC(OP)->vdesc_offset) + + + +int build_path(vnode_t first_vp, char *buff, int buflen, int *outlen); +int bdevvp(dev_t dev, struct vnode **vpp); +void cvtstat(struct stat *st, struct ostat *ost); +void vprint(const char *label, struct vnode *vp); + + +__private_extern__ int is_package_name(char *name, int len); +__private_extern__ int set_package_extensions_table(void *data, int nentries, int maxwidth); +int vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, + int len, off_t offset, enum uio_seg segflg, int ioflg, + struct ucred *cred, int *aresid, struct proc *p); +int vn_rdwr_64(enum uio_rw rw, struct vnode *vp, uint64_t base, + int64_t len, off_t offset, enum uio_seg segflg, + int ioflg, struct ucred *cred, int *aresid, + struct proc *p); +void fifo_printinfo(struct vnode *vp); +int vn_lock(struct vnode *vp, int flags, struct proc *p); +int vn_open(struct nameidata *ndp, int fmode, int cmode); +int vn_open_modflags(struct nameidata *ndp, int *fmode, int cmode); +int vn_open_auth(struct nameidata *ndp, int *fmode, struct vnode_attr *); +int vn_close(vnode_t, int flags, struct ucred *cred, struct proc *p); + +#define VN_CREATE_NOAUTH (1<<0) +#define VN_CREATE_NOINHERIT (1<<1) +errno_t vn_create(vnode_t, vnode_t *, struct componentname *, struct vnode_attr *, int flags, vfs_context_t); + + +int vn_getxattr(vnode_t, const char *, uio_t, size_t *, int, vfs_context_t); +int vn_setxattr(vnode_t, const char *, uio_t, int, vfs_context_t); +int vn_removexattr(vnode_t, const char *, int, vfs_context_t); +int vn_listxattr(vnode_t, uio_t, size_t *, int, vfs_context_t); + +void name_cache_lock(void); +void name_cache_unlock(void); + +char * vnode_getname(vnode_t vp); +void vnode_putname(char *name); + +vnode_t vnode_getparent(vnode_t vp); + +int vn_pathconf(vnode_t, int, register_t *, vfs_context_t); + +void vnode_list_lock(void); +void vnode_list_unlock(void); +int vnode_ref_ext(vnode_t, int); +void vnode_rele_ext(vnode_t, int, int); +void vnode_rele_internal(vnode_t, int, int, int); +int vnode_getwithref(vnode_t); +int vnode_put_locked(vnode_t); + +int vnode_issock(vnode_t); + +void unlock_fsnode(vnode_t, int *); +int lock_fsnode(vnode_t, int *); + +errno_t vnode_resume(vnode_t); + +errno_t vnode_size(vnode_t, off_t *, vfs_context_t); +errno_t vnode_setsize(vnode_t, off_t, int ioflag, vfs_context_t); +int vnode_setattr_fallback(vnode_t vp, struct vnode_attr *vap, vfs_context_t ctx); + +void SPECHASH_LOCK(void); +void SPECHASH_UNLOCK(void); + +int check_cdevmounted(dev_t, enum vtype, int *); + +void vnode_authorize_init(void); + +#endif /* !_SYS_VNODE_INTERNAL_H_ */ diff --git a/bsd/sys/vstat.h b/bsd/sys/vstat.h index 83b87ed23..4a8817c8d 100644 --- a/bsd/sys/vstat.h +++ b/bsd/sys/vstat.h @@ -36,7 +36,7 @@ #include <sys/time.h> #include <sys/attr.h> -#ifndef _POSIX_SOURCE +#ifndef _POSIX_C_SOURCE struct vstat { 
 	fsid_t		vst_volid;	/* volume identifier */
@@ -49,7 +49,7 @@ struct vstat {
 	gid_t		vst_gid;	/* group ID of the file's group */
 	dev_t		vst_dev;	/* inode's device */
 	dev_t		vst_rdev;	/* device type */
-#ifndef _POSIX_SOURCE
+#ifndef _POSIX_C_SOURCE
 	struct timespec	vst_atimespec;	/* time of last access */
 	struct timespec	vst_mtimespec;	/* time of last data modification */
 	struct timespec	vst_ctimespec;	/* time of last file status change */
@@ -67,7 +67,7 @@ struct vstat {
 	u_int32_t	vst_flags;	/* user defined flags for file */
 };
-#endif	/* ! _POSIX_SOURCE */
+#endif	/* ! _POSIX_C_SOURCE */
 #endif	/* __APPLE_API_OBSOLETE */
 #endif	/* !_SYS_VSTAT_H_ */
diff --git a/bsd/sys/wait.h b/bsd/sys/wait.h
index 76bf41a3b..34aba1bb8 100644
--- a/bsd/sys/wait.h
+++ b/bsd/sys/wait.h
@@ -58,51 +58,119 @@
 #ifndef _SYS_WAIT_H_
 #define _SYS_WAIT_H_
 
+#include <sys/cdefs.h>
+#include <sys/_types.h>
+
 /*
  * This file holds definitions relevant to the wait4 system call
  * and the alternate interfaces that use it (wait, wait3, waitpid).
  */
 
+/*
+ * [XSI] The type idtype_t shall be defined as an enumeration type whose
+ * possible values shall include at least P_ALL, P_PID, and P_PGID.
+ */
+typedef enum {
+	P_ALL,
+	P_PID,
+	P_PGID
+} idtype_t;
+
+/*
+ * [XSI] The id_t and pid_t types shall be defined as described
+ * in <sys/types.h>
+ */
+#ifndef _PID_T
+typedef __darwin_pid_t	pid_t;
+#define _PID_T
+#endif
+
+#ifndef _ID_T
+typedef __darwin_id_t	id_t;
+#define _ID_T
+#endif
+
+/*
+ * [XSI] The siginfo_t type shall be defined as described in <signal.h>
+ * [XSI] The rusage structure shall be defined as described in <sys/resource.h>
+ * [XSI] Inclusion of the <sys/wait.h> header may also make visible all
+ * symbols from <signal.h> and <sys/resource.h>
+ *
+ * NOTE:	This requirement is currently being satisfied by the direct
+ *		inclusion of <sys/signal.h> and <sys/resource.h>, below.
+ *
+ *		Software should not depend on the exposure of anything other
+ *		than the types siginfo_t and struct rusage as a result of
+ *		this inclusion.  If you depend on any types or manifest
+ *		values other than siginfo_t and struct rusage from either of
+ *		those files, you should explicitly include them yourself, as
+ *		well, or in future releases your software may not compile
+ *		without modification.
+ */
+#include <sys/signal.h>		/* [XSI] for siginfo_t */
+#include <sys/resource.h>	/* [XSI] for struct rusage */
+
+/*
+ * Option bits for the third argument of wait4.  WNOHANG causes the
+ * wait to not hang if there are no stopped or terminated processes, rather
+ * returning an error indication in this case (pid==0).  WUNTRACED
+ * indicates that the caller should receive status about untraced children
+ * which stop due to signals.  If children are stopped and a wait without
+ * this option is done, it is as though they were still running... nothing
+ * about them is returned.
+ */
+#define	WNOHANG		0x01	/* [XSI] don't hang in wait/no child to reap */
+#define	WUNTRACED	0x02	/* [XSI] notify on stopped, untraced children */
+
 /*
  * Macros to test the exit status returned by wait
  * and extract the relevant values.
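 *
 * Illustrative usage (an editorial sketch, not part of the original
 * header):
 *
 *	int status, code, sig;
 *	pid_t pid = waitpid(-1, &status, WNOHANG);
 *
 *	if (pid > 0 && WIFEXITED(status))
 *		code = WEXITSTATUS(status);
 *	else if (pid > 0 && WIFSIGNALED(status))
 *		sig = WTERMSIG(status);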
 */
-#ifdef _POSIX_SOURCE
+#ifdef _POSIX_C_SOURCE
 #define	_W_INT(i)	(i)
 #else
 #define	_W_INT(w)	(*(int *)&(w))	/* convert union wait to int */
 #define	WCOREFLAG	0200
+#endif	/* _POSIX_C_SOURCE */
 
-#endif	/* _POSIX_SOURCE */
-
+/* These macros are permitted, as they are in the implementation namespace */
 #define	_WSTATUS(x)	(_W_INT(x) & 0177)
 #define	_WSTOPPED	0177		/* _WSTATUS if process is stopped */
+
+/*
+ * [XSI] The <sys/wait.h> header shall define the following macros for
+ * analysis of process status values
+ */
+#define	WEXITSTATUS(x)	(_W_INT(x) >> 8)
+#define	WIFCONTINUED(x)	(x == 0x13)	/* 0x13 == SIGCONT */
+#define	WIFEXITED(x)	(_WSTATUS(x) == 0)
+#define	WIFSIGNALED(x)	(_WSTATUS(x) != _WSTOPPED && _WSTATUS(x) != 0)
 #define	WIFSTOPPED(x)	(_WSTATUS(x) == _WSTOPPED)
 #define	WSTOPSIG(x)	(_W_INT(x) >> 8)
-#define	WIFSIGNALED(x)	(_WSTATUS(x) != _WSTOPPED && _WSTATUS(x) != 0)
 #define	WTERMSIG(x)	(_WSTATUS(x))
-#define	WIFEXITED(x)	(_WSTATUS(x) == 0)
-#define	WEXITSTATUS(x)	(_W_INT(x) >> 8)
-#if !defined(_POSIX_SOURCE)
+#if !defined(_POSIX_C_SOURCE)
 #define	WCOREDUMP(x)	(_W_INT(x) & WCOREFLAG)
 #define	W_EXITCODE(ret, sig)	((ret) << 8 | (sig))
 #define	W_STOPCODE(sig)		((sig) << 8 | _WSTOPPED)
-#endif /* !defined(_POSIX_SOURCE) */
+#endif /* !defined(_POSIX_C_SOURCE) */
 
 /*
- * Option bits for the third argument of wait4.  WNOHANG causes the
- * wait to not hang if there are no stopped or terminated processes, rather
- * returning an error indication in this case (pid==0).  WUNTRACED
- * indicates that the caller should receive status about untraced children
- * which stop due to signals.  If children are stopped and a wait without
- * this option is done, it is as though they were still running... nothing
- * about them is returned.
+ * [XSI] The following symbolic constants shall be defined as possible
+ * values for the fourth argument to waitid().
  */
-#define WNOHANG		1	/* don't hang in wait */
-#define WUNTRACED	2	/* tell about stopped, untraced children */
+/* WNOHANG already defined for wait4() */
+/* WUNTRACED defined for wait4() but not for waitid() */
+#define	WEXITED		0x04	/* [XSI] Processes which have exited */
+#ifdef _POSIX_C_SOURCE
+/* waitid() parameter */
+#define	WSTOPPED	0x08	/* [XSI] Any child stopped by signal receipt */
+#endif
+#define	WCONTINUED	0x10	/* [XSI] Any child stopped then continued */
+#define	WNOWAIT		0x20	/* [XSI] Leave process returned waitable */
+
 
-#if !defined(_POSIX_SOURCE)
+#if !defined(_POSIX_C_SOURCE)
 /* POSIX extensions and 4.2/4.3 compatibility: */
 
 /*
@@ -125,13 +193,13 @@ union wait {
 	 * Terminated process status.
 	 */
 	struct {
-#if BYTE_ORDER == LITTLE_ENDIAN
+#if __DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN
 		unsigned int	w_Termsig:7,	/* termination signal */
 				w_Coredump:1,	/* core dump indicator */
 				w_Retcode:8,	/* exit code if w_termsig==0 */
 				w_Filler:16;	/* upper bits filler */
 #endif
-#if BYTE_ORDER == BIG_ENDIAN
+#if __DARWIN_BYTE_ORDER == __DARWIN_BIG_ENDIAN
 		unsigned int	w_Filler:16,	/* upper bits filler */
 				w_Retcode:8,	/* exit code if w_termsig==0 */
 				w_Coredump:1,	/* core dump indicator */
@@ -144,12 +212,12 @@ union wait {
 	 * with the WUNTRACED option bit.
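 	 *
 	 * Illustrative usage (an editorial sketch, not part of the original
 	 * header; w_status is the plain integer member of the union):
 	 *
 	 *	union wait w;
 	 *	w.w_status = status;
 	 *	if (w.w_stopval == WSTOPPED)
 	 *		sig = w.w_stopsig;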
*/ struct { -#if BYTE_ORDER == LITTLE_ENDIAN +#if __DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN unsigned int w_Stopval:8, /* == W_STOPPED if stopped */ w_Stopsig:8, /* signal that stopped us */ w_Filler:16; /* upper bits filler */ #endif -#if BYTE_ORDER == BIG_ENDIAN +#if __DARWIN_BYTE_ORDER == __DARWIN_BIG_ENDIAN unsigned int w_Filler:16, /* upper bits filler */ w_Stopsig:8, /* signal that stopped us */ w_Stopval:8; /* == W_STOPPED if stopped */ @@ -162,22 +230,24 @@ union wait { #define w_stopval w_S.w_Stopval #define w_stopsig w_S.w_Stopsig +/* + * Stopped state value; cannot use waitid() parameter of the same name + * in the same scope + */ #define WSTOPPED _WSTOPPED -#endif /* !defined(_POSIX_SOURCE) */ +#endif /* !defined(_POSIX_C_SOURCE) */ #ifndef KERNEL -#include <sys/types.h> -#include <sys/cdefs.h> - __BEGIN_DECLS -struct rusage; /* forward declaration */ - -pid_t wait __P((int *)); -pid_t waitpid __P((pid_t, int *, int)); -#if !defined(_POSIX_SOURCE) -pid_t wait3 __P((int *, int, struct rusage *)); -pid_t wait4 __P((pid_t, int *, int, struct rusage *)); -#endif /* !defined(_POSIX_SOURCE) */ +pid_t wait(int *); +pid_t waitpid(pid_t, int *, int); +#ifndef _ANSI_SOURCE +int waitid(idtype_t, id_t, siginfo_t *, int); +#endif /* !_ANSI_SOURCE */ +#if !defined(_POSIX_C_SOURCE) +pid_t wait3(int *, int, struct rusage *); +pid_t wait4(pid_t, int *, int, struct rusage *); +#endif /* !defined(_POSIX_C_SOURCE) */ __END_DECLS #endif #endif /* !_SYS_WAIT_H_ */ diff --git a/bsd/sys/xattr.h b/bsd/sys/xattr.h new file mode 100644 index 000000000..6628bbeee --- /dev/null +++ b/bsd/sys/xattr.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2004-2005 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef _SYS_XATTR_H_ +#define _SYS_XATTR_H_ + +#include <sys/types.h> + +/* Options for pathname based xattr calls */ +#define XATTR_NOFOLLOW 0x0001 /* Don't follow symbolic links */ + +/* Options for setxattr calls */ +#define XATTR_CREATE 0x0002 /* set the value, fail if attr already exists */ +#define XATTR_REPLACE 0x0004 /* set the value, fail if attr does not exist */ + +/* Set this to bypass authorization checking (eg. 
if doing auth-related work) */ +#define XATTR_NOSECURITY 0x0008 + +#define XATTR_MAXNAMELEN 127 + +#define XATTR_FINDERINFO_NAME "com.apple.FinderInfo" + +#define XATTR_RESOURCEFORK_NAME "com.apple.ResourceFork" + + +#ifdef KERNEL +__BEGIN_DECLS +int xattr_protected(const char *); +int xattr_validatename(const char *); +__END_DECLS +#endif /* KERNEL */ + +#ifndef KERNEL +__BEGIN_DECLS + +ssize_t getxattr(const char *path, const char *name, void *value, size_t size, u_int32_t position, int options); + +ssize_t fgetxattr(int fd, const char *name, void *value, size_t size, u_int32_t position, int options); + +int setxattr(const char *path, const char *name, const void *value, size_t size, u_int32_t position, int options); + +int fsetxattr(int fd, const char *name, const void *value, size_t size, u_int32_t position, int options); + +int removexattr(const char *path, const char *name, int options); + +int fremovexattr(int fd, const char *name, int options); + +ssize_t listxattr(const char *path, char *namebuff, size_t size, int options); + +ssize_t flistxattr(int fd, char *namebuff, size_t size, int options); + +__END_DECLS +#endif /* KERNEL */ + +#endif /* _SYS_XATTR_H_ */ diff --git a/bsd/ufs/ffs/ffs_alloc.c b/bsd/ufs/ffs/ffs_alloc.c index 624d1bdd5..275808fd4 100644 --- a/bsd/ufs/ffs/ffs_alloc.c +++ b/bsd/ufs/ffs/ffs_alloc.c @@ -59,10 +59,11 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/buf.h> +#include <sys/buf_internal.h> #include <sys/proc.h> -#include <sys/vnode.h> -#include <sys/mount.h> +#include <sys/kauth.h> +#include <sys/vnode_internal.h> +#include <sys/mount_internal.h> #include <sys/kernel.h> #include <sys/syslog.h> #include <sys/quota.h> @@ -82,18 +83,18 @@ extern u_long nextgennumber; -static ufs_daddr_t ffs_alloccg __P((struct inode *, int, ufs_daddr_t, int)); -static ufs_daddr_t ffs_alloccgblk __P((struct fs *, struct cg *, ufs_daddr_t)); -static ufs_daddr_t ffs_clusteralloc __P((struct inode *, int, ufs_daddr_t, - int)); -static ino_t ffs_dirpref __P((struct inode *)); -static ufs_daddr_t ffs_fragextend __P((struct inode *, int, long, int, int)); -static void ffs_fserr __P((struct fs *, u_int, char *)); +static ufs_daddr_t ffs_alloccg(struct inode *, int, ufs_daddr_t, int); +static ufs_daddr_t ffs_alloccgblk(struct fs *, struct cg *, ufs_daddr_t); +static ufs_daddr_t ffs_clusteralloc(struct inode *, int, ufs_daddr_t, int); +static ino_t ffs_dirpref(struct inode *); +static ufs_daddr_t ffs_fragextend(struct inode *, int, long, int, int); +static void ffs_fserr(struct fs *, u_int, char *); static u_long ffs_hashalloc - __P((struct inode *, int, long, int, u_int32_t (*)())); -static ino_t ffs_nodealloccg __P((struct inode *, int, ufs_daddr_t, int)); -static ufs_daddr_t ffs_mapsearch __P((struct fs *, struct cg *, ufs_daddr_t, - int)); + (struct inode *, int, long, int, u_int32_t (*)()); +static ino_t ffs_nodealloccg(struct inode *, int, ufs_daddr_t, int); +static ufs_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs_daddr_t, int); +static void ffs_clusteracct + (struct fs *fs, struct cg *cgp, ufs_daddr_t blkno, int cnt); /* * Allocate a block in the file system. 
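
The ffs_alloc.c hunks that follow also migrate credential handling from direct ucred field access to the kauth KPI: privilege checks go through suser() (which returns 0 for the superuser) and uid reporting goes through kauth_cred_getuid(). A minimal sketch of the resulting idiom, using a hypothetical helper name (not a function in this patch); suser(), freespace() and kauth_cred_t are taken from the hunks themselves:

	/* Hypothetical helper; mirrors the suser()/freespace() check below. */
	static int
	ffs_may_use_reserve(kauth_cred_t cred, struct fs *fs)
	{
		/* suser() == 0 means the caller is privileged and alone may
		 * dip into the last fs_minfree percent of free space. */
		if (suser(cred, NULL) != 0 && freespace(fs, fs->fs_minfree) <= 0)
			return (0);
		return (1);
	}
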
@@ -118,7 +119,7 @@ ffs_alloc(ip, lbn, bpref, size, cred, bnp) register struct inode *ip; ufs_daddr_t lbn, bpref; int size; - struct ucred *cred; + kauth_cred_t cred; ufs_daddr_t *bnp; { register struct fs *fs; @@ -138,9 +139,9 @@ ffs_alloc(ip, lbn, bpref, size, cred, bnp) #endif /* DIAGNOSTIC */ if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) goto nospace; - if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0) + if (suser(cred, NULL) && freespace(fs, fs->fs_minfree) <= 0) goto nospace; - VOP_DEVBLOCKSIZE(ip->i_devvp,&devBlockSize); + devBlockSize = vfs_devblocksize(vnode_mount(ITOV(ip))); #if QUOTA if (error = chkdq(ip, (int64_t)size, cred, 0)) return (error); @@ -166,7 +167,7 @@ ffs_alloc(ip, lbn, bpref, size, cred, bnp) (void) chkdq(ip, (int64_t)-size, cred, FORCE); #endif /* QUOTA */ nospace: - ffs_fserr(fs, cred->cr_uid, "file system full"); + ffs_fserr(fs, kauth_cred_getuid(cred), "file system full"); uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); return (ENOSPC); } @@ -184,7 +185,7 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) ufs_daddr_t lbprev; ufs_daddr_t bpref; int osize, nsize; - struct ucred *cred; + kauth_cred_t cred; struct buf **bpp; { register struct fs *fs; @@ -206,7 +207,7 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) if (cred == NOCRED) panic("ffs_realloccg: missing credential\n"); #endif /* DIAGNOSTIC */ - if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0) + if (suser(cred, NULL) != 0 && freespace(fs, fs->fs_minfree) <= 0) goto nospace; if ((bprev = ip->i_db[lbprev]) == 0) { printf("dev = 0x%x, bsize = %d, bprev = %d, fs = %s\n", @@ -216,16 +217,16 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) /* * Allocate the extra space in the buffer. 
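 * (Clarifying note, not in the original source: if ffs_fragextend()
 * below can grow the fragment in place the buffer keeps its disk
 * address; otherwise a new range is allocated and the old fragments
 * are released via ffs_blkfree().)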
*/ - if (error = bread(ITOV(ip), lbprev, osize, NOCRED, &bp)) { - brelse(bp); + if (error = (int)buf_bread(ITOV(ip), (daddr64_t)((unsigned)lbprev), osize, NOCRED, &bp)) { + buf_brelse(bp); return (error); } - VOP_DEVBLOCKSIZE(ip->i_devvp,&devBlockSize); + devBlockSize = vfs_devblocksize(vnode_mount(ITOV(ip))); #if QUOTA if (error = chkdq(ip, (int64_t)(nsize - osize), cred, 0)) { - brelse(bp); + buf_brelse(bp); return (error); } #endif /* QUOTA */ @@ -234,13 +235,13 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) */ cg = dtog(fs, bprev); if (bno = ffs_fragextend(ip, cg, (long)bprev, osize, nsize)) { - if (bp->b_blkno != fsbtodb(fs, bno)) + if ((ufs_daddr_t)buf_blkno(bp) != fsbtodb(fs, bno)) panic("bad blockno"); ip->i_blocks += btodb(nsize - osize, devBlockSize); ip->i_flag |= IN_CHANGE | IN_UPDATE; allocbuf(bp, nsize); - bp->b_flags |= B_DONE; - bzero((char *)bp->b_data + osize, (u_int)bp->b_bufsize - osize); + buf_setflags(bp, B_DONE); + bzero((char *)buf_dataptr(bp) + osize, (u_int)buf_size(bp) - osize); *bpp = bp; return (0); } @@ -295,7 +296,7 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) bno = (ufs_daddr_t)ffs_hashalloc(ip, cg, (long)bpref, request, (u_int32_t (*)())ffs_alloccg); if (bno > 0) { - bp->b_blkno = fsbtodb(fs, bno); + buf_setblkno(bp, (daddr64_t)((unsigned)fsbtodb(fs, bno))); ffs_blkfree(ip, bprev, (long)osize); if (nsize < request) ffs_blkfree(ip, bno + numfrags(fs, nsize), @@ -303,8 +304,8 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) ip->i_blocks += btodb(nsize - osize, devBlockSize); ip->i_flag |= IN_CHANGE | IN_UPDATE; allocbuf(bp, nsize); - bp->b_flags |= B_DONE; - bzero((char *)bp->b_data + osize, (u_int)bp->b_bufsize - osize); + buf_setflags(bp, B_DONE); + bzero((char *)buf_dataptr(bp) + osize, (u_int)buf_size(bp) - osize); *bpp = bp; return (0); } @@ -314,12 +315,12 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) */ (void) chkdq(ip, (int64_t)-(nsize - osize), cred, FORCE); #endif /* QUOTA */ - brelse(bp); + buf_brelse(bp); nospace: /* * no space available */ - ffs_fserr(fs, cred->cr_uid, "file system full"); + ffs_fserr(fs, kauth_cred_getuid(cred), "file system full"); uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); return (ENOSPC); } @@ -343,12 +344,6 @@ nospace: int doasyncfree = 1; int doreallocblks = 1; -int -ffs_reallocblks(ap) - struct vop_reallocblks_args *ap; -{ - return (ENOSPC); -} /* * Allocate an inode in the file system. @@ -366,23 +361,21 @@ ffs_reallocblks(ap) * available inode is located. 
*/ int -ffs_valloc(ap) - struct vop_valloc_args /* { - struct vnode *a_pvp; - int a_mode; - struct ucred *a_cred; - struct vnode **a_vpp; - } */ *ap; +ffs_valloc( + struct vnode *pvp, + mode_t mode, + kauth_cred_t cred, + struct vnode **vpp) + { - register struct vnode *pvp = ap->a_pvp; register struct inode *pip; register struct fs *fs; register struct inode *ip; - mode_t mode = ap->a_mode; + struct timeval tv; ino_t ino, ipref; int cg, error; - *ap->a_vpp = NULL; + *vpp = NULL; pip = VTOI(pvp); fs = pip->i_fs; if (fs->fs_cstotal.cs_nifree == 0) @@ -409,12 +402,14 @@ ffs_valloc(ap) ino = (ino_t)ffs_hashalloc(pip, cg, (long)ipref, mode, ffs_nodealloccg); if (ino == 0) goto noinodes; - error = VFS_VGET(pvp->v_mount, (void *)ino, ap->a_vpp); + + error = ffs_vget_internal(pvp->v_mount, ino, vpp, NULL, NULL, mode, 0); if (error) { - VOP_VFREE(pvp, ino, mode); + ffs_vfree(pvp, ino, mode); return (error); } - ip = VTOI(*ap->a_vpp); + ip = VTOI(*vpp); + if (ip->i_mode) { printf("mode = 0%o, inum = %d, fs = %s\n", ip->i_mode, ip->i_number, fs->fs_fsmnt); @@ -429,12 +424,13 @@ ffs_valloc(ap) /* * Set up a new generation number for this inode. */ - if (++nextgennumber < (u_long)time.tv_sec) - nextgennumber = time.tv_sec; + microtime(&tv); + if (++nextgennumber < (u_long)tv.tv_sec) + nextgennumber = tv.tv_sec; ip->i_gen = nextgennumber; return (0); noinodes: - ffs_fserr(fs, ap->a_cred->cr_uid, "out of inodes"); + ffs_fserr(fs, kauth_cred_getuid(cred), "out of inodes"); uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt); return (ENOSPC); } @@ -753,6 +749,7 @@ ffs_fragextend(ip, cg, bprev, osize, nsize) register struct fs *fs; register struct cg *cgp; struct buf *bp; + struct timeval tv; long bno; int frags, bbase; int i, error; @@ -772,13 +769,13 @@ ffs_fragextend(ip, cg, bprev, osize, nsize) return (NULL); } /* read corresponding cylinder group info */ - error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), - (int)fs->fs_cgsize, NOCRED, &bp); + error = (int)buf_bread(ip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, cg))), + (int)fs->fs_cgsize, NOCRED, &bp); if (error) { - brelse(bp); + buf_brelse(bp); return (NULL); } - cgp = (struct cg *)bp->b_data; + cgp = (struct cg *)buf_dataptr(bp); #if REV_ENDIAN_FS if (rev_endian) { byte_swap_cgin(cgp, fs); @@ -790,10 +787,11 @@ ffs_fragextend(ip, cg, bprev, osize, nsize) if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - brelse(bp); + buf_brelse(bp); return (NULL); } - cgp->cg_time = time.tv_sec; + microtime(&tv); + cgp->cg_time = tv.tv_sec; bno = dtogd(fs, bprev); for (i = numfrags(fs, osize); i < frags; i++) if (isclr(cg_blksfree(cgp), bno + i)) { @@ -801,7 +799,7 @@ ffs_fragextend(ip, cg, bprev, osize, nsize) if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - brelse(bp); + buf_brelse(bp); return (NULL); } /* @@ -827,7 +825,7 @@ ffs_fragextend(ip, cg, bprev, osize, nsize) if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - bdwrite(bp); + buf_bdwrite(bp); return (bprev); } @@ -847,6 +845,7 @@ ffs_alloccg(ip, cg, bpref, size) register struct fs *fs; register struct cg *cgp; struct buf *bp; + struct timeval tv; register int i; int error, bno, frags, allocsiz; #if REV_ENDIAN_FS @@ -858,13 +857,13 @@ ffs_alloccg(ip, cg, bpref, size) fs = ip->i_fs; if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) return (NULL); - error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), - (int)fs->fs_cgsize, NOCRED, &bp); + error = (int)buf_bread(ip->i_devvp, 
(daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, cg))), + (int)fs->fs_cgsize, NOCRED, &bp); if (error) { - brelse(bp); + buf_brelse(bp); return (NULL); } - cgp = (struct cg *)bp->b_data; + cgp = (struct cg *)buf_dataptr(bp); #if REV_ENDIAN_FS if (rev_endian) byte_swap_cgin(cgp,fs); @@ -875,17 +874,18 @@ ffs_alloccg(ip, cg, bpref, size) if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - brelse(bp); + buf_brelse(bp); return (NULL); } - cgp->cg_time = time.tv_sec; + microtime(&tv); + cgp->cg_time = tv.tv_sec; if (size == fs->fs_bsize) { bno = ffs_alloccgblk(fs, cgp, bpref); #if REV_ENDIAN_FS if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - bdwrite(bp); + buf_bdwrite(bp); return (bno); } /* @@ -907,7 +907,7 @@ ffs_alloccg(ip, cg, bpref, size) if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - brelse(bp); + buf_brelse(bp); return (NULL); } bno = ffs_alloccgblk(fs, cgp, bpref); @@ -924,7 +924,7 @@ ffs_alloccg(ip, cg, bpref, size) if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - bdwrite(bp); + buf_bdwrite(bp); return (bno); } bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); @@ -933,7 +933,7 @@ ffs_alloccg(ip, cg, bpref, size) if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - brelse(bp); + buf_brelse(bp); return (NULL); } for (i = 0; i < frags; i++) @@ -949,7 +949,7 @@ ffs_alloccg(ip, cg, bpref, size) if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - bdwrite(bp); + buf_bdwrite(bp); return (cg * fs->fs_fpg + bno); } @@ -1097,10 +1097,10 @@ ffs_clusteralloc(ip, cg, bpref, len) fs = ip->i_fs; if (fs->fs_maxcluster[cg] < len) return (NULL); - if (bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, - NOCRED, &bp)) + if (buf_bread(ip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, cg))), (int)fs->fs_cgsize, + NOCRED, &bp)) goto fail; - cgp = (struct cg *)bp->b_data; + cgp = (struct cg *)buf_dataptr(bp); #if REV_ENDIAN_FS if (rev_endian) byte_swap_cgin(cgp,fs); @@ -1196,11 +1196,11 @@ ffs_clusteralloc(ip, cg, bpref, len) if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - bdwrite(bp); + buf_bdwrite(bp); return (bno); fail: - brelse(bp); + buf_brelse(bp); return (0); } @@ -1223,6 +1223,7 @@ ffs_nodealloccg(ip, cg, ipref, mode) register struct fs *fs; register struct cg *cgp; struct buf *bp; + struct timeval tv; int error, start, len, loc, map, i; #if REV_ENDIAN_FS struct vnode *vp=ITOV(ip); @@ -1233,13 +1234,13 @@ ffs_nodealloccg(ip, cg, ipref, mode) fs = ip->i_fs; if (fs->fs_cs(fs, cg).cs_nifree == 0) return (NULL); - error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), - (int)fs->fs_cgsize, NOCRED, &bp); + error = (int)buf_bread(ip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, cg))), + (int)fs->fs_cgsize, NOCRED, &bp); if (error) { - brelse(bp); + buf_brelse(bp); return (NULL); } - cgp = (struct cg *)bp->b_data; + cgp = (struct cg *)buf_dataptr(bp); #if REV_ENDIAN_FS if (rev_endian) byte_swap_cgin(cgp,fs); @@ -1249,11 +1250,12 @@ ffs_nodealloccg(ip, cg, ipref, mode) if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - brelse(bp); + buf_brelse(bp); return (NULL); } - cgp->cg_time = time.tv_sec; + microtime(&tv); + cgp->cg_time = tv.tv_sec; if (ipref) { ipref %= fs->fs_ipg; if (isclr(cg_inosused(cgp), ipref)) @@ -1300,7 +1302,7 @@ gotit: if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - bdwrite(bp); + buf_bdwrite(bp); return (cg * fs->fs_ipg + ipref); } @@ -1311,6 +1313,7 @@ gotit: * free map. 
If a fragment is deallocated, a possible * block reassembly is checked. */ +void ffs_blkfree(ip, bno, size) register struct inode *ip; ufs_daddr_t bno; @@ -1319,6 +1322,7 @@ ffs_blkfree(ip, bno, size) register struct fs *fs; register struct cg *cgp; struct buf *bp; + struct timeval tv; ufs_daddr_t blkno; int i, error, cg, blk, frags, bbase; #if REV_ENDIAN_FS @@ -1326,6 +1330,7 @@ ffs_blkfree(ip, bno, size) struct mount *mp=vp->v_mount; int rev_endian=(mp->mnt_flag & MNT_REVEND); #endif /* REV_ENDIAN_FS */ + fs = ip->i_fs; if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n", @@ -1338,13 +1343,13 @@ ffs_blkfree(ip, bno, size) ffs_fserr(fs, ip->i_uid, "bad block"); return; } - error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), - (int)fs->fs_cgsize, NOCRED, &bp); + error = (int)buf_bread(ip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, cg))), + (int)fs->fs_cgsize, NOCRED, &bp); if (error) { - brelse(bp); + buf_brelse(bp); return; } - cgp = (struct cg *)bp->b_data; + cgp = (struct cg *)buf_dataptr(bp); #if REV_ENDIAN_FS if (rev_endian) byte_swap_cgin(cgp,fs); @@ -1354,10 +1359,11 @@ ffs_blkfree(ip, bno, size) if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - brelse(bp); + buf_brelse(bp); return; } - cgp->cg_time = time.tv_sec; + microtime(&tv); + cgp->cg_time = tv.tv_sec; bno = dtogd(fs, bno); if (size == fs->fs_bsize) { blkno = fragstoblks(fs, bno); @@ -1423,7 +1429,7 @@ ffs_blkfree(ip, bno, size) if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - bdwrite(bp); + buf_bdwrite(bp); } #if DIAGNOSTIC @@ -1454,13 +1460,13 @@ ffs_checkblk(ip, bno, size) } if ((u_int)bno >= fs->fs_size) panic("checkblk: bad block %d", bno); - error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, dtog(fs, bno))), - (int)fs->fs_cgsize, NOCRED, &bp); + error = (int)buf_bread(ip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, dtog(fs, bno)))), + (int)fs->fs_cgsize, NOCRED, &bp); if (error) { - brelse(bp); + buf_brelse(bp); return; } - cgp = (struct cg *)bp->b_data; + cgp = (struct cg *)buf_dataptr(bp); #if REV_ENDIAN_FS if (rev_endian) byte_swap_cgin(cgp,fs); @@ -1470,7 +1476,7 @@ ffs_checkblk(ip, bno, size) if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - brelse(bp); + buf_brelse(bp); return; } bno = dtogd(fs, bno); @@ -1488,7 +1494,7 @@ ffs_checkblk(ip, bno, size) if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - brelse(bp); + buf_brelse(bp); return (!free); } #endif /* DIAGNOSTIC */ @@ -1499,38 +1505,32 @@ ffs_checkblk(ip, bno, size) * The specified inode is placed back in the free map. 
*/ int -ffs_vfree(ap) - struct vop_vfree_args /* { - struct vnode *a_pvp; - ino_t a_ino; - int a_mode; - } */ *ap; +ffs_vfree(struct vnode *vp, ino_t ino, int mode) { register struct fs *fs; register struct cg *cgp; register struct inode *pip; - ino_t ino = ap->a_ino; struct buf *bp; + struct timeval tv; int error, cg; #if REV_ENDIAN_FS - struct vnode *vp=ap->a_pvp; struct mount *mp=vp->v_mount; int rev_endian=(mp->mnt_flag & MNT_REVEND); #endif /* REV_ENDIAN_FS */ - pip = VTOI(ap->a_pvp); + pip = VTOI(vp); fs = pip->i_fs; if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg) panic("ifree: range: dev = 0x%x, ino = %d, fs = %s\n", pip->i_dev, ino, fs->fs_fsmnt); cg = ino_to_cg(fs, ino); - error = bread(pip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), - (int)fs->fs_cgsize, NOCRED, &bp); + error = (int)buf_bread(pip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, cg))), + (int)fs->fs_cgsize, NOCRED, &bp); if (error) { - brelse(bp); + buf_brelse(bp); return (0); } - cgp = (struct cg *)bp->b_data; + cgp = (struct cg *)buf_dataptr(bp); #if REV_ENDIAN_FS if (rev_endian) byte_swap_cgin(cgp,fs); @@ -1540,10 +1540,11 @@ ffs_vfree(ap) if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - brelse(bp); + buf_brelse(bp); return (0); } - cgp->cg_time = time.tv_sec; + microtime(&tv); + cgp->cg_time = tv.tv_sec; ino %= fs->fs_ipg; if (isclr(cg_inosused(cgp), ino)) { printf("dev = 0x%x, ino = %d, fs = %s\n", @@ -1557,7 +1558,7 @@ ffs_vfree(ap) cgp->cg_cs.cs_nifree++; fs->fs_cstotal.cs_nifree++; fs->fs_cs(fs, cg).cs_nifree++; - if ((ap->a_mode & IFMT) == IFDIR) { + if ((mode & IFMT) == IFDIR) { cgp->cg_cs.cs_ndir--; fs->fs_cstotal.cs_ndir--; fs->fs_cs(fs, cg).cs_ndir--; @@ -1567,7 +1568,7 @@ ffs_vfree(ap) if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - bdwrite(bp); + buf_bdwrite(bp); return (0); } @@ -1641,11 +1642,8 @@ ffs_mapsearch(fs, cgp, bpref, allocsiz) * * Cnt == 1 means free; cnt == -1 means allocating. */ -ffs_clusteracct(fs, cgp, blkno, cnt) - struct fs *fs; - struct cg *cgp; - ufs_daddr_t blkno; - int cnt; +static void +ffs_clusteracct(struct fs *fs, struct cg *cgp, ufs_daddr_t blkno, int cnt) { int32_t *sump; int32_t *lp; diff --git a/bsd/ufs/ffs/ffs_balloc.c b/bsd/ufs/ffs/ffs_balloc.c index 37cf82024..5a0cf7bcf 100644 --- a/bsd/ufs/ffs/ffs_balloc.c +++ b/bsd/ufs/ffs/ffs_balloc.c @@ -58,15 +58,16 @@ #include <rev_endian_fs.h> #include <sys/param.h> #include <sys/systm.h> -#include <sys/buf.h> +#include <sys/buf_internal.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <sys/file.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/ubc.h> #include <sys/quota.h> #if REV_ENDIAN_FS -#include <sys/mount.h> +#include <sys/mount_internal.h> #endif /* REV_ENDIAN_FS */ #include <sys/vm.h> @@ -88,14 +89,14 @@ * by allocating the physical blocks on a device given * the inode and the logical block number in a file. 
*/ -ffs_balloc(ip, lbn, size, cred, bpp, flags, blk_alloc) - register struct inode *ip; - register ufs_daddr_t lbn; - int size; - struct ucred *cred; - struct buf **bpp; - int flags; - int * blk_alloc; +ffs_balloc( + register struct inode *ip, + register ufs_daddr_t lbn, + int size, + kauth_cred_t cred, + struct buf **bpp, + int flags, + int * blk_alloc) { register struct fs *fs; register ufs_daddr_t nb; @@ -107,8 +108,8 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags, blk_alloc) ufs_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1]; int devBlockSize=0; int alloc_buffer = 1; -#if REV_ENDIAN_FS struct mount *mp=vp->v_mount; +#if REV_ENDIAN_FS int rev_endian=(mp->mnt_flag & MNT_REVEND); #endif /* REV_ENDIAN_FS */ @@ -148,19 +149,20 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags, blk_alloc) osize, (int)fs->fs_bsize, cred, &bp); if (error) return (error); - /* adjust the innode size we just grew */ + /* adjust the inode size we just grew */ /* it is in nb+1 as nb starts from 0 */ ip->i_size = (nb + 1) * fs->fs_bsize; - if (UBCISVALID(vp)) - ubc_setsize(vp, (off_t)ip->i_size); /* XXX check error */ - ip->i_db[nb] = dbtofsb(fs, bp->b_blkno); + ubc_setsize(vp, (off_t)ip->i_size); + + ip->i_db[nb] = dbtofsb(fs, (ufs_daddr_t)buf_blkno(bp)); ip->i_flag |= IN_CHANGE | IN_UPDATE; + if ((flags & B_SYNC) || (!alloc_buffer)) { if (!alloc_buffer) - SET(bp->b_flags, B_NOCACHE); - bwrite(bp); + buf_setflags(bp, B_NOCACHE); + buf_bwrite(bp); } else - bdwrite(bp); + buf_bdwrite(bp); /* note that bp is already released here */ } } @@ -171,9 +173,9 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags, blk_alloc) nb = ip->i_db[lbn]; if (nb != 0 && ip->i_size >= (lbn + 1) * fs->fs_bsize) { if (alloc_buffer) { - error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp); + error = (int)buf_bread(vp, (daddr64_t)((unsigned)lbn), fs->fs_bsize, NOCRED, &bp); if (error) { - brelse(bp); + buf_brelse(bp); return (error); } *bpp = bp; @@ -188,9 +190,9 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags, blk_alloc) nsize = fragroundup(fs, size); if (nsize <= osize) { if (alloc_buffer) { - error = bread(vp, lbn, osize, NOCRED, &bp); + error = (int)buf_bread(vp, (daddr64_t)((unsigned)lbn), osize, NOCRED, &bp); if (error) { - brelse(bp); + buf_brelse(bp); return (error); } ip->i_flag |= IN_CHANGE | IN_UPDATE; @@ -207,14 +209,19 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags, blk_alloc) &ip->i_db[0]), osize, nsize, cred, &bp); if (error) return (error); - ip->i_db[lbn] = dbtofsb(fs, bp->b_blkno); + ip->i_db[lbn] = dbtofsb(fs, (ufs_daddr_t)buf_blkno(bp)); ip->i_flag |= IN_CHANGE | IN_UPDATE; - if(!alloc_buffer) { - SET(bp->b_flags, B_NOCACHE); + + /* adjust the inode size we just grew */ + ip->i_size = (lbn * fs->fs_bsize) + size; + ubc_setsize(vp, (off_t)ip->i_size); + + if (!alloc_buffer) { + buf_setflags(bp, B_NOCACHE); if (flags & B_SYNC) - bwrite(bp); + buf_bwrite(bp); else - bdwrite(bp); + buf_bdwrite(bp); } else *bpp = bp; return (0); @@ -231,10 +238,11 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags, blk_alloc) if (error) return (error); if (alloc_buffer) { - bp = getblk(vp, lbn, nsize, 0, 0, BLK_WRITE); - bp->b_blkno = fsbtodb(fs, newb); - if (flags & B_CLRBUF) - clrbuf(bp); + bp = buf_getblk(vp, (daddr64_t)((unsigned)lbn), nsize, 0, 0, BLK_WRITE); + buf_setblkno(bp, (daddr64_t)((unsigned)fsbtodb(fs, newb))); + + if (flags & B_CLRBUF) + buf_clear(bp); } ip->i_db[lbn] = newb; ip->i_flag |= IN_CHANGE | IN_UPDATE; @@ -270,16 +278,16 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags, blk_alloc) return (error); nb = newb; *allocblk++ = nb; - bp = 
getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, BLK_META); - bp->b_blkno = fsbtodb(fs, nb); - clrbuf(bp); + bp = buf_getblk(vp, (daddr64_t)((unsigned)(indirs[1].in_lbn)), fs->fs_bsize, 0, 0, BLK_META); + buf_setblkno(bp, (daddr64_t)((unsigned)fsbtodb(fs, nb))); + buf_clear(bp); /* * Write synchronously conditional on mount flags. */ if ((vp)->v_mount->mnt_flag & MNT_ASYNC) { error = 0; - bdwrite(bp); - } else if ((error = bwrite(bp)) != 0) { + buf_bdwrite(bp); + } else if ((error = buf_bwrite(bp)) != 0) { goto fail; } allocib = &ip->i_ib[indirs[0].in_off]; @@ -290,13 +298,12 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags, blk_alloc) * Fetch through the indirect blocks, allocating as necessary. */ for (i = 1;;) { - error = meta_bread(vp, - indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp); + error = (int)buf_meta_bread(vp, (daddr64_t)((unsigned)(indirs[i].in_lbn)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { - brelse(bp); + buf_brelse(bp); goto fail; } - bap = (ufs_daddr_t *)bp->b_data; + bap = (ufs_daddr_t *)buf_dataptr(bp); #if REV_ENDIAN_FS if (rev_endian) nb = NXSwapLong(bap[indirs[i].in_off]); @@ -310,29 +317,29 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags, blk_alloc) break; i += 1; if (nb != 0) { - brelse(bp); + buf_brelse(bp); continue; } if (pref == 0) pref = ffs_blkpref(ip, lbn, 0, (ufs_daddr_t *)0); if (error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) { - brelse(bp); + buf_brelse(bp); goto fail; } nb = newb; *allocblk++ = nb; - nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, BLK_META); - nbp->b_blkno = fsbtodb(fs, nb); - clrbuf(nbp); + nbp = buf_getblk(vp, (daddr64_t)((unsigned)(indirs[i].in_lbn)), fs->fs_bsize, 0, 0, BLK_META); + buf_setblkno(nbp, (daddr64_t)((unsigned)fsbtodb(fs, nb))); + buf_clear(nbp); /* * Write synchronously conditional on mount flags. */ if ((vp)->v_mount->mnt_flag & MNT_ASYNC) { error = 0; - bdwrite(nbp); - } else if (error = bwrite(nbp)) { - brelse(bp); + buf_bdwrite(nbp); + } else if (error = buf_bwrite(nbp)) { + buf_brelse(bp); goto fail; } #if REV_ENDIAN_FS @@ -349,9 +356,9 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags, blk_alloc) * delayed write. */ if (flags & B_SYNC) { - bwrite(bp); + buf_bwrite(bp); } else { - bdwrite(bp); + buf_bdwrite(bp); } } /* @@ -361,7 +368,7 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags, blk_alloc) pref = ffs_blkpref(ip, lbn, indirs[i].in_off, &bap[0]); if (error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) { - brelse(bp); + buf_brelse(bp); goto fail; } nb = newb; @@ -380,15 +387,16 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags, blk_alloc) * delayed write. 
*/ if ((flags & B_SYNC)) { - bwrite(bp); + buf_bwrite(bp); } else { - bdwrite(bp); + buf_bdwrite(bp); } if(alloc_buffer ) { - nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, BLK_WRITE); - nbp->b_blkno = fsbtodb(fs, nb); + nbp = buf_getblk(vp, (daddr64_t)((unsigned)lbn), fs->fs_bsize, 0, 0, BLK_WRITE); + buf_setblkno(nbp, (daddr64_t)((unsigned)fsbtodb(fs, nb))); + if (flags & B_CLRBUF) - clrbuf(nbp); + buf_clear(nbp); } if (blk_alloc) { *blk_alloc = fs->fs_bsize; @@ -398,19 +406,19 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags, blk_alloc) return (0); } - brelse(bp); + buf_brelse(bp); if (alloc_buffer) { - if (flags & B_CLRBUF) { - error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp); - if (error) { - brelse(nbp); - goto fail; + if (flags & B_CLRBUF) { + error = (int)buf_bread(vp, (daddr64_t)((unsigned)lbn), (int)fs->fs_bsize, NOCRED, &nbp); + if (error) { + buf_brelse(nbp); + goto fail; + } + } else { + nbp = buf_getblk(vp, (daddr64_t)((unsigned)lbn), fs->fs_bsize, 0, 0, BLK_WRITE); + buf_setblkno(nbp, (daddr64_t)((unsigned)fsbtodb(fs, nb))); } - } else { - nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, BLK_WRITE); - nbp->b_blkno = fsbtodb(fs, nb); - } - *bpp = nbp; + *bpp = nbp; } return (0); fail: @@ -425,8 +433,7 @@ fail: if (allocib != NULL) *allocib = 0; if (deallocated) { - VOP_DEVBLOCKSIZE(ip->i_devvp,&devBlockSize); - + devBlockSize = vfs_devblocksize(mp); #if QUOTA /* * Restore user's disk quota because allocation failed. @@ -441,7 +448,7 @@ fail: /* * ffs_blkalloc allocates a disk block for ffs_pageout(), as a consequence - * it does no breads (that could lead to deadblock as the page may be already + * it does no buf_breads (that could lead to deadblock as the page may be already * marked busy as it is being paged out. Also important to note that we are not * growing the file in pageouts. So ip->i_size cannot increase by this call * due to the way UBC works. @@ -450,12 +457,12 @@ fail: * Do not call with B_CLRBUF flags as this should only be called only * from pageouts */ -ffs_blkalloc(ip, lbn, size, cred, flags) - register struct inode *ip; - ufs_daddr_t lbn; - int size; - struct ucred *cred; - int flags; +ffs_blkalloc( + struct inode *ip, + ufs_daddr_t lbn, + int size, + kauth_cred_t cred, + int flags) { register struct fs *fs; register ufs_daddr_t nb; @@ -466,8 +473,8 @@ ffs_blkalloc(ip, lbn, size, cred, flags) int deallocated, osize, nsize, num, i, error; ufs_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1]; int devBlockSize=0; -#if REV_ENDIAN_FS struct mount *mp=vp->v_mount; +#if REV_ENDIAN_FS int rev_endian=(mp->mnt_flag & MNT_REVEND); #endif /* REV_ENDIAN_FS */ @@ -544,16 +551,16 @@ ffs_blkalloc(ip, lbn, size, cred, flags) return (error); nb = newb; *allocblk++ = nb; - bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, BLK_META); - bp->b_blkno = fsbtodb(fs, nb); - clrbuf(bp); + bp = buf_getblk(vp, (daddr64_t)((unsigned)(indirs[1].in_lbn)), fs->fs_bsize, 0, 0, BLK_META); + buf_setblkno(bp, (daddr64_t)((unsigned)fsbtodb(fs, nb))); + buf_clear(bp); /* * Write synchronously conditional on mount flags. */ if ((vp)->v_mount->mnt_flag & MNT_ASYNC) { error = 0; - bdwrite(bp); - } else if (error = bwrite(bp)) { + buf_bdwrite(bp); + } else if (error = buf_bwrite(bp)) { goto fail; } allocib = &ip->i_ib[indirs[0].in_off]; @@ -564,13 +571,12 @@ ffs_blkalloc(ip, lbn, size, cred, flags) * Fetch through the indirect blocks, allocating as necessary. 
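 * (Clarifying note, not in the original source: indirs[] is filled in
 * by ufs_getlbns(); indirs[i].in_lbn is the logical block number of
 * the i'th-level indirect block and indirs[i].in_off the entry within
 * it.)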
*/ for (i = 1;;) { - error = meta_bread(vp, - indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp); + error = (int)buf_meta_bread(vp, (daddr64_t)((unsigned)(indirs[i].in_lbn)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { - brelse(bp); + buf_brelse(bp); goto fail; } - bap = (ufs_daddr_t *)bp->b_data; + bap = (ufs_daddr_t *)buf_dataptr(bp); #if REV_ENDIAN_FS if (rev_endian) nb = NXSwapLong(bap[indirs[i].in_off]); @@ -584,29 +590,29 @@ ffs_blkalloc(ip, lbn, size, cred, flags) break; i += 1; if (nb != 0) { - brelse(bp); + buf_brelse(bp); continue; } if (pref == 0) pref = ffs_blkpref(ip, lbn, 0, (ufs_daddr_t *)0); if (error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) { - brelse(bp); + buf_brelse(bp); goto fail; } nb = newb; *allocblk++ = nb; - nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, BLK_META); - nbp->b_blkno = fsbtodb(fs, nb); - clrbuf(nbp); + nbp = buf_getblk(vp, (daddr64_t)((unsigned)(indirs[i].in_lbn)), fs->fs_bsize, 0, 0, BLK_META); + buf_setblkno(nbp, (daddr64_t)((unsigned)fsbtodb(fs, nb))); + buf_clear(nbp); /* * Write synchronously conditional on mount flags. */ if ((vp)->v_mount->mnt_flag & MNT_ASYNC) { error = 0; - bdwrite(nbp); - } else if (error = bwrite(nbp)) { - brelse(bp); + buf_bdwrite(nbp); + } else if (error = buf_bwrite(nbp)) { + buf_brelse(bp); goto fail; } #if REV_ENDIAN_FS @@ -623,9 +629,9 @@ ffs_blkalloc(ip, lbn, size, cred, flags) * delayed write. */ if (flags & B_SYNC) { - bwrite(bp); + buf_bwrite(bp); } else { - bdwrite(bp); + buf_bdwrite(bp); } } /* @@ -635,7 +641,7 @@ ffs_blkalloc(ip, lbn, size, cred, flags) pref = ffs_blkpref(ip, lbn, indirs[i].in_off, &bap[0]); if (error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) { - brelse(bp); + buf_brelse(bp); goto fail; } nb = newb; @@ -654,13 +660,13 @@ ffs_blkalloc(ip, lbn, size, cred, flags) * delayed write. */ if (flags & B_SYNC) { - bwrite(bp); + buf_bwrite(bp); } else { - bdwrite(bp); + buf_bdwrite(bp); } return (0); } - brelse(bp); + buf_brelse(bp); return (0); fail: /* @@ -674,8 +680,7 @@ fail: if (allocib != NULL) *allocib = 0; if (deallocated) { - VOP_DEVBLOCKSIZE(ip->i_devvp,&devBlockSize); - + devBlockSize = vfs_devblocksize(mp); #if QUOTA /* * Restore user's disk quota because allocation failed. 
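
Throughout the ffs_alloc.c and ffs_balloc.c changes above, one mechanical conversion repeats: bread()/brelse()/bwrite()/bdwrite() become the opaque buf_bread()/buf_brelse()/buf_bwrite()/buf_bdwrite() KPI, block numbers widen to daddr64_t, and direct field access such as bp->b_data and bp->b_blkno goes through buf_dataptr() and buf_blkno()/buf_setblkno(). A minimal sketch of the pattern, using a hypothetical helper (not a function in this patch) that stamps a cylinder-group block; all calls and types appear in the hunks above:

	/* Hypothetical helper illustrating the buf_* KPI conversion. */
	static int
	cg_stamp(vnode_t devvp, daddr64_t blk, int size, time_t now)
	{
		buf_t	bp;
		int	error;

		/* buf_bread() replaces bread(); the block number is 64-bit */
		error = (int)buf_bread(devvp, blk, size, NOCRED, &bp);
		if (error) {
			buf_brelse(bp);		/* release even on error, as above */
			return (error);
		}
		/* buf_dataptr() replaces direct use of bp->b_data */
		((struct cg *)buf_dataptr(bp))->cg_time = now;

		buf_bdwrite(bp);		/* delayed write replaces bdwrite() */
		return (0);
	}
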
diff --git a/bsd/ufs/ffs/ffs_extern.h b/bsd/ufs/ffs/ffs_extern.h index 95fefe238..c833abc20 100644 --- a/bsd/ufs/ffs/ffs_extern.h +++ b/bsd/ufs/ffs/ffs_extern.h @@ -80,13 +80,12 @@ #endif /* __APPLE_API_UNSTABLE */ struct buf; -struct fid; struct fs; struct inode; struct mount; struct nameidata; struct proc; -struct statfs; +struct vfsstatfs; struct timeval; struct ucred; struct uio; @@ -96,51 +95,46 @@ struct vfsconf; #ifdef __APPLE_API_PRIVATE __BEGIN_DECLS -int ffs_alloc __P((struct inode *, - ufs_daddr_t, ufs_daddr_t, int, struct ucred *, ufs_daddr_t *)); -int ffs_balloc __P((struct inode *, - ufs_daddr_t, int, struct ucred *, struct buf **, int, int *)); -int ffs_blkatoff __P((struct vop_blkatoff_args *)); -int ffs_blkfree __P((struct inode *, ufs_daddr_t, long)); -ufs_daddr_t ffs_blkpref __P((struct inode *, ufs_daddr_t, int, ufs_daddr_t *)); -int ffs_bmap __P((struct vop_bmap_args *)); -void ffs_clrblock __P((struct fs *, u_char *, ufs_daddr_t)); -int ffs_fhtovp __P((struct mount *, struct fid *, struct mbuf *, - struct vnode **, int *, struct ucred **)); -void ffs_fragacct __P((struct fs *, int, int32_t [], int)); -int ffs_fsync __P((struct vop_fsync_args *)); -int ffs_init __P((struct vfsconf *)); -int ffs_isblock __P((struct fs *, u_char *, ufs_daddr_t)); -int ffs_mount __P((struct mount *, - char *, caddr_t, struct nameidata *, struct proc *)); -int ffs_mountfs __P((struct vnode *, struct mount *, struct proc *)); -int ffs_mountroot __P((void)); -int ffs_read __P((struct vop_read_args *)); -int ffs_reallocblks __P((struct vop_reallocblks_args *)); -int ffs_realloccg __P((struct inode *, - ufs_daddr_t, ufs_daddr_t, int, int, struct ucred *, struct buf **)); -int ffs_reclaim __P((struct vop_reclaim_args *)); -void ffs_setblock __P((struct fs *, u_char *, ufs_daddr_t)); -int ffs_statfs __P((struct mount *, struct statfs *, struct proc *)); -int ffs_sync __P((struct mount *, int, struct ucred *, struct proc *)); -int ffs_sysctl __P((int *, u_int, void *, size_t *, void *, size_t, - struct proc *)); -int ffs_truncate __P((struct vop_truncate_args *)); -int ffs_unmount __P((struct mount *, int, struct proc *)); -int ffs_update __P((struct vop_update_args *)); -int ffs_valloc __P((struct vop_valloc_args *)); -int ffs_vfree __P((struct vop_vfree_args *)); -int ffs_vget __P((struct mount *, void *, struct vnode **)); -int ffs_vptofh __P((struct vnode *, struct fid *)); -int ffs_write __P((struct vop_write_args *)); -int ffs_pagein __P((struct vop_pagein_args *)); -int ffs_pageout __P((struct vop_pageout_args *)); -int ffs_blktooff __P((struct vop_blktooff_args *)); -int ffs_offtoblk __P((struct vop_offtoblk_args *)); +int ffs_fsync_internal(vnode_t, int); + +int ffs_blkatoff(vnode_t, off_t, char **, buf_t *); + +int ffs_alloc(struct inode *, + ufs_daddr_t, ufs_daddr_t, int, struct ucred *, ufs_daddr_t *); +int ffs_balloc(struct inode *, + ufs_daddr_t, int, struct ucred *, struct buf **, int, int *); +void ffs_blkfree(struct inode *, ufs_daddr_t, long); +ufs_daddr_t ffs_blkpref(struct inode *, ufs_daddr_t, int, ufs_daddr_t *); +void ffs_clrblock(struct fs *, u_char *, ufs_daddr_t); +int ffs_fhtovp(struct mount *, int, unsigned char *, struct vnode **, vfs_context_t); +void ffs_fragacct(struct fs *, int, int32_t [], int); +int ffs_fsync(struct vnop_fsync_args *); +int ffs_init(struct vfsconf *); +int ffs_isblock(struct fs *, u_char *, ufs_daddr_t); +int ffs_mount(struct mount *, vnode_t , user_addr_t, vfs_context_t); +int ffs_mountfs(struct vnode *, struct mount *, vfs_context_t); +int 
ffs_mountroot(mount_t, vnode_t, vfs_context_t); +int ffs_read(struct vnop_read_args *); +int ffs_realloccg(struct inode *, + ufs_daddr_t, ufs_daddr_t, int, int, struct ucred *, struct buf **); +int ffs_reclaim(struct vnop_reclaim_args *); +void ffs_setblock(struct fs *, u_char *, ufs_daddr_t); +int ffs_vfs_getattr(struct mount *, struct vfs_attr *, vfs_context_t); +int ffs_vfs_setattr(struct mount *, struct vfs_attr *, vfs_context_t); +int ffs_sync(struct mount *, int, vfs_context_t); +int ffs_sysctl(int *, u_int, user_addr_t, size_t *, user_addr_t, size_t, vfs_context_t); +int ffs_unmount(struct mount *, int, vfs_context_t); +int ffs_update(struct vnode *, struct timeval *, struct timeval *, int); +int ffs_valloc(vnode_t dvp, mode_t mode, kauth_cred_t cred, vnode_t *vpp); +int ffs_vfree(struct vnode *vp, ino_t ino, int mode); +int ffs_vget(struct mount *, ino64_t, struct vnode **, vfs_context_t); +int ffs_vptofh(struct vnode *, int *, unsigned char *, vfs_context_t); +int ffs_write(struct vnop_write_args *); +int ffs_pagein(struct vnop_pagein_args *); +int ffs_pageout(struct vnop_pageout_args *); +int ffs_blktooff(struct vnop_blktooff_args *); +int ffs_offtoblk(struct vnop_offtoblk_args *); -#if DIAGNOSTIC -void ffs_checkoverlap __P((struct buf *, struct inode *)); -#endif __END_DECLS extern int (**ffs_vnodeop_p)(void *); diff --git a/bsd/ufs/ffs/ffs_inode.c b/bsd/ufs/ffs/ffs_inode.c index f84771f2c..4ee62c22b 100644 --- a/bsd/ufs/ffs/ffs_inode.c +++ b/bsd/ufs/ffs/ffs_inode.c @@ -60,11 +60,11 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/mount.h> -#include <sys/proc.h> +#include <sys/mount_internal.h> +#include <sys/proc_internal.h> /* for accessing p_stats */ #include <sys/file.h> -#include <sys/buf.h> -#include <sys/vnode.h> +#include <sys/buf_internal.h> +#include <sys/vnode_internal.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/trace.h> @@ -87,8 +87,8 @@ #include <architecture/byte_order.h> #endif /* REV_ENDIAN_FS */ -static int ffs_indirtrunc __P((struct inode *, ufs_daddr_t, ufs_daddr_t, - ufs_daddr_t, int, long *)); +static int ffs_indirtrunc(struct inode *, ufs_daddr_t, ufs_daddr_t, + ufs_daddr_t, int, long *); /* * Update the access, modified, and inode change times as specified by the @@ -100,25 +100,20 @@ static int ffs_indirtrunc __P((struct inode *, ufs_daddr_t, ufs_daddr_t, * complete. 
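The ffs_extern.h hunk above is largely mechanical: every prototype sheds the 4.4BSD __P() wrapper. The macro existed so a single header could serve both ANSI and K&R compilers; once K&R support is dropped it is pure noise. A self-contained illustration of what it did (old_decl/new_decl are hypothetical names):

    /* as <sys/cdefs.h> historically defined it */
    #if defined(__STDC__) || defined(__cplusplus)
    #define __P(protos)     protos          /* ANSI C: keep the parameter list */
    #else
    #define __P(protos)     ()              /* K&R C: erase it */
    #endif

    int     old_decl __P((int, char *));    /* pre-patch spelling  */
    int     new_decl(int, char *);          /* post-patch spelling */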
*/ int -ffs_update(ap) - struct vop_update_args /* { - struct vnode *a_vp; - struct timeval *a_access; - struct timeval *a_modify; - int a_waitfor; - } */ *ap; +ffs_update(struct vnode *vp, struct timeval *access, struct timeval *modify, int waitfor) { register struct fs *fs; struct buf *bp; struct inode *ip; - int error; + struct timeval tv; + errno_t error; #if REV_ENDIAN_FS - struct mount *mp=(ap->a_vp)->v_mount; + struct mount *mp=(vp)->v_mount; int rev_endian=(mp->mnt_flag & MNT_REVEND); #endif /* REV_ENDIAN_FS */ - ip = VTOI(ap->a_vp); - if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) { + ip = VTOI(vp); + if (vp->v_mount->mnt_flag & MNT_RDONLY) { ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); return (0); @@ -127,13 +122,15 @@ ffs_update(ap) (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0) return (0); if (ip->i_flag & IN_ACCESS) - ip->i_atime = ap->a_access->tv_sec; + ip->i_atime = access->tv_sec; if (ip->i_flag & IN_UPDATE) { - ip->i_mtime = ap->a_modify->tv_sec; + ip->i_mtime = modify->tv_sec; ip->i_modrev++; } - if (ip->i_flag & IN_CHANGE) - ip->i_ctime = time.tv_sec; + if (ip->i_flag & IN_CHANGE) { + microtime(&tv); + ip->i_ctime = tv.tv_sec; + } ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); fs = ip->i_fs; /* @@ -144,62 +141,50 @@ ffs_update(ap) ip->i_din.di_ouid = ip->i_uid; /* XXX */ ip->i_din.di_ogid = ip->i_gid; /* XXX */ } /* XXX */ - if (error = bread(ip->i_devvp, - fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + if (error = buf_bread(ip->i_devvp, + (daddr64_t)((unsigned)fsbtodb(fs, ino_to_fsba(fs, ip->i_number))), (int)fs->fs_bsize, NOCRED, &bp)) { - brelse(bp); - return (error); + buf_brelse(bp); + return ((int)error); } #if REV_ENDIAN_FS if (rev_endian) - byte_swap_inode_out(ip, ((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number))); + byte_swap_inode_out(ip, ((struct dinode *)buf_dataptr(bp) + ino_to_fsbo(fs, ip->i_number))); else { #endif /* REV_ENDIAN_FS */ - *((struct dinode *)bp->b_data + - ino_to_fsbo(fs, ip->i_number)) = ip->i_din; + *((struct dinode *)buf_dataptr(bp) + ino_to_fsbo(fs, ip->i_number)) = ip->i_din; #if REV_ENDIAN_FS } #endif /* REV_ENDIAN_FS */ - if (ap->a_waitfor && (ap->a_vp->v_mount->mnt_flag & MNT_ASYNC) == 0) - return (bwrite(bp)); + if (waitfor && (vp->v_mount->mnt_flag & MNT_ASYNC) == 0) + return ((int)buf_bwrite(bp)); else { - bdwrite(bp); + buf_bdwrite(bp); return (0); } } + #define SINGLE 0 /* index of single indirect block */ #define DOUBLE 1 /* index of double indirect block */ #define TRIPLE 2 /* index of triple indirect block */ -/* - * Truncate the inode oip to at most length size, freeing the - * disk blocks. 
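Note the timekeeping change inside ffs_update(): the old kernel exported a global struct timeval named time that code read directly, and the KPI kernel removes it, so every consumer now samples the clock explicitly. The substitution as it appears above, shown in isolation:

    struct timeval tv;

    if (ip->i_flag & IN_CHANGE) {
            microtime(&tv);                 /* was: ip->i_ctime = time.tv_sec; */
            ip->i_ctime = tv.tv_sec;
    }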
- */ -ffs_truncate(ap) - struct vop_truncate_args /* { - struct vnode *a_vp; - off_t a_length; - int a_flags; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; + +int +ffs_truncate_internal(vnode_t ovp, off_t length, int flags, ucred_t cred) { - register struct vnode *ovp = ap->a_vp; + struct inode *oip; + struct fs *fs; ufs_daddr_t lastblock; - register struct inode *oip; ufs_daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR]; ufs_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; - off_t length = ap->a_length; - register struct fs *fs; - struct buf *bp; - int offset, size, level; - long count, nblocks, vflags, blocksreleased = 0; - struct timeval tv; - register int i; - int aflags, error, allerror; - off_t osize; - int devBlockSize=0; + buf_t bp; + int offset, size, level, i; + long count, nblocks, vflags, blocksreleased = 0; + struct timeval tv; + int aflags, error, allerror; + off_t osize; + int devBlockSize=0; #if QUOTA int64_t change; /* in bytes */ #endif /* QUOTA */ @@ -213,7 +198,7 @@ ffs_truncate(ap) if (length > fs->fs_maxfilesize) return (EFBIG); - tv = time; + microtime(&tv); if (ovp->v_type == VLNK && oip->i_size < ovp->v_mount->mnt_maxsymlinklen) { #if DIAGNOSTIC @@ -223,12 +208,12 @@ ffs_truncate(ap) bzero((char *)&oip->i_shortlink, (u_int)oip->i_size); oip->i_size = 0; oip->i_flag |= IN_CHANGE | IN_UPDATE; - return (VOP_UPDATE(ovp, &tv, &tv, 1)); + return (ffs_update(ovp, &tv, &tv, 1)); } if (oip->i_size == length) { oip->i_flag |= IN_CHANGE | IN_UPDATE; - return (VOP_UPDATE(ovp, &tv, &tv, 0)); + return (ffs_update(ovp, &tv, &tv, 0)); } #if QUOTA if (error = getinoquota(oip)) @@ -245,25 +230,24 @@ ffs_truncate(ap) offset = blkoff(fs, length - 1); lbn = lblkno(fs, length - 1); aflags = B_CLRBUF; - if (ap->a_flags & IO_SYNC) + if (flags & IO_SYNC) aflags |= B_SYNC; - if (error = ffs_balloc(oip, lbn, offset + 1, ap->a_cred, &bp, - aflags , 0)) + if (error = ffs_balloc(oip, lbn, offset + 1, cred, &bp, aflags, 0)) return (error); oip->i_size = length; if (UBCINFOEXISTS(ovp)) { - bp->b_flags |= B_INVAL; - bwrite(bp); + buf_markinvalid(bp); + buf_bwrite(bp); ubc_setsize(ovp, (off_t)length); } else { if (aflags & B_SYNC) - bwrite(bp); + buf_bwrite(bp); else - bawrite(bp); + buf_bawrite(bp); } oip->i_flag |= IN_CHANGE | IN_UPDATE; - return (VOP_UPDATE(ovp, &tv, &tv, 1)); + return (ffs_update(ovp, &tv, &tv, 1)); } /* * Shorten the size of the file. If the file is not being @@ -275,33 +259,34 @@ ffs_truncate(ap) if (UBCINFOEXISTS(ovp)) ubc_setsize(ovp, (off_t)length); - vflags = ((length > 0) ? V_SAVE : 0) | V_SAVEMETA; - allerror = vinvalbuf(ovp, vflags, ap->a_cred, ap->a_p, 0, 0); - + vflags = ((length > 0) ? 
BUF_WRITE_DATA : 0) | BUF_SKIP_META; + if (vflags & BUF_WRITE_DATA) + ffs_fsync_internal(ovp, MNT_WAIT); + allerror = buf_invalidateblks(ovp, vflags, 0, 0); + offset = blkoff(fs, length); if (offset == 0) { oip->i_size = length; } else { lbn = lblkno(fs, length); aflags = B_CLRBUF; - if (ap->a_flags & IO_SYNC) + if (flags & IO_SYNC) aflags |= B_SYNC; - if (error = ffs_balloc(oip, lbn, offset, ap->a_cred, &bp, - aflags, 0)) + if (error = ffs_balloc(oip, lbn, offset, cred, &bp, aflags, 0)) return (error); oip->i_size = length; size = blksize(fs, oip, lbn); - bzero((char *)bp->b_data + offset, (u_int)(size - offset)); + bzero((char *)buf_dataptr(bp) + offset, (u_int)(size - offset)); allocbuf(bp, size); if (UBCINFOEXISTS(ovp)) { - bp->b_flags |= B_INVAL; - bwrite(bp); + buf_markinvalid(bp); + buf_bwrite(bp); } else { if (aflags & B_SYNC) - bwrite(bp); + buf_bwrite(bp); else - bawrite(bp); + buf_bawrite(bp); } } /* @@ -314,7 +299,8 @@ ffs_truncate(ap) lastiblock[SINGLE] = lastblock - NDADDR; lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); - VOP_DEVBLOCKSIZE(oip->i_devvp,&devBlockSize); + + devBlockSize = vfs_devblocksize(vnode_mount(ovp)); nblocks = btodb(fs->fs_bsize, devBlockSize); /* @@ -332,7 +318,7 @@ ffs_truncate(ap) for (i = NDADDR - 1; i > lastblock; i--) oip->i_db[i] = 0; oip->i_flag |= IN_CHANGE | IN_UPDATE; - if (error = VOP_UPDATE(ovp, &tv, &tv, MNT_WAIT)) + if (error = ffs_update(ovp, &tv, &tv, MNT_WAIT)) allerror = error; /* * Having written the new inode to disk, save its new configuration @@ -343,8 +329,12 @@ ffs_truncate(ap) bcopy((caddr_t)&oip->i_db[0], (caddr_t)newblks, sizeof newblks); bcopy((caddr_t)oldblks, (caddr_t)&oip->i_db[0], sizeof oldblks); oip->i_size = osize; - vflags = ((length > 0) ? V_SAVE : 0) | V_SAVEMETA; - allerror = vinvalbuf(ovp, vflags, ap->a_cred, ap->a_p, 0, 0); + + vflags = ((length > 0) ? BUF_WRITE_DATA : 0) | BUF_SKIP_META; + + if (vflags & BUF_WRITE_DATA) + ffs_fsync_internal(ovp, MNT_WAIT); + allerror = buf_invalidateblks(ovp, vflags, 0, 0); /* * Indirect blocks first. @@ -424,7 +414,7 @@ done: if (newblks[i] != oip->i_db[i]) panic("itrunc2"); if (length == 0 && - (ovp->v_dirtyblkhd.lh_first || ovp->v_cleanblkhd.lh_first)) + (vnode_hasdirtyblks(ovp) || vnode_hascleanblks(ovp))) panic("itrunc3"); #endif /* DIAGNOSTIC */ /* @@ -468,10 +458,10 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) ufs_daddr_t *copy, nb, nlbn, last; long blkcount, factor; int nblocks, blocksreleased = 0; - int error = 0, allerror = 0; + errno_t error = 0, allerror = 0; int devBlockSize=0; -#if REV_ENDIAN_FS struct mount *mp=vp->v_mount; +#if REV_ENDIAN_FS int rev_endian=(mp->mnt_flag & MNT_REVEND); #endif /* REV_ENDIAN_FS */ @@ -486,7 +476,8 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) last = lastbn; if (lastbn > 0) last /= factor; - VOP_DEVBLOCKSIZE(ip->i_devvp,&devBlockSize); + + devBlockSize = vfs_devblocksize(mp); nblocks = btodb(fs->fs_bsize, devBlockSize); /* Doing a MALLOC here is asking for trouble. We can still @@ -494,51 +485,52 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) * low on memory and block in MALLOC */ - tbp = geteblk(fs->fs_bsize); - copy = (ufs_daddr_t *)tbp->b_data; + tbp = buf_geteblk(fs->fs_bsize); + copy = (ufs_daddr_t *)buf_dataptr(tbp); /* * Get buffer of block pointers, zero those entries corresponding * to blocks to be free'd, and update on disk copy first. 
Since * double(triple) indirect before single(double) indirect, calls * to bmap on these blocks will fail. However, we already have - * the on disk address, so we have to set the b_blkno field - * explicitly instead of letting bread do everything for us. + * the on disk address, so we have to set the blkno field + * explicitly instead of letting buf_bread do everything for us. */ vp = ITOV(ip); - bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0, BLK_META); - if (bp->b_flags & (B_DONE | B_DELWRI)) { + bp = buf_getblk(vp, (daddr64_t)((unsigned)lbn), (int)fs->fs_bsize, 0, 0, BLK_META); + + if (buf_valid(bp)) { /* Braces must be here in case trace evaluates to nothing. */ trace(TR_BREADHIT, pack(vp, fs->fs_bsize), lbn); } else { trace(TR_BREADMISS, pack(vp, fs->fs_bsize), lbn); current_proc()->p_stats->p_ru.ru_inblock++; /* pay for read */ - bp->b_flags |= B_READ; - if (bp->b_bcount > bp->b_bufsize) + buf_setflags(bp, B_READ); + if (buf_count(bp) > buf_size(bp)) panic("ffs_indirtrunc: bad buffer size"); - bp->b_blkno = dbn; - VOP_STRATEGY(bp); - error = biowait(bp); + buf_setblkno(bp, (daddr64_t)((unsigned)dbn)); + VNOP_STRATEGY(bp); + error = buf_biowait(bp); } if (error) { - brelse(bp); + buf_brelse(bp); *countp = 0; - brelse(tbp); - return (error); + buf_brelse(tbp); + return ((int)error); } - bap = (ufs_daddr_t *)bp->b_data; + bap = (ufs_daddr_t *)buf_dataptr(bp); bcopy((caddr_t)bap, (caddr_t)copy, (u_int)fs->fs_bsize); bzero((caddr_t)&bap[last + 1], (u_int)(NINDIR(fs) - (last + 1)) * sizeof (ufs_daddr_t)); if (last == -1) - bp->b_flags |= B_INVAL; + buf_markinvalid(bp); if (last != -1 && (vp)->v_mount->mnt_flag & MNT_ASYNC) { error = 0; - bdwrite(bp); + buf_bdwrite(bp); } else { - error = bwrite(bp); + error = buf_bwrite(bp); if (error) allerror = error; } @@ -591,8 +583,8 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) blocksreleased += blkcount; } } - brelse(tbp); + buf_brelse(tbp); *countp = blocksreleased; - return (allerror); + return ((int)allerror); } diff --git a/bsd/ufs/ffs/ffs_subr.c b/bsd/ufs/ffs/ffs_subr.c index c023a273d..5d466f630 100644 --- a/bsd/ufs/ffs/ffs_subr.c +++ b/bsd/ufs/ffs/ffs_subr.c @@ -58,7 +58,7 @@ #include <rev_endian_fs.h> #include <sys/param.h> #if REV_ENDIAN_FS -#include <sys/mount.h> +#include <sys/mount_internal.h> #endif /* REV_ENDIAN_FS */ #ifndef KERNEL @@ -67,7 +67,7 @@ #else #include <sys/systm.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/buf.h> #include <sys/quota.h> #include <ufs/ufs/quota.h> @@ -84,14 +84,9 @@ * directory "ip". If "res" is non-zero, fill it in with a pointer to the * remaining space in the directory. 
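ffs_indirtrunc() above keeps the one place UFS still drives a read by hand: the caller already knows the physical block, and buf_bread() would try to map the logical address through indirect blocks that are in the middle of being freed (the source comment above makes the same point about bmap). So the code stamps the disk address on the buffer and calls the strategy routine itself. The converted path, condensed (xnu KPI assumed):

    bp = buf_getblk(vp, (daddr64_t)((unsigned)lbn), (int)fs->fs_bsize, 0, 0, BLK_META);
    if (buf_valid(bp)) {
            /* cache hit: replaces the (bp->b_flags & (B_DONE | B_DELWRI)) test */
    } else {
            buf_setflags(bp, B_READ);
            buf_setblkno(bp, (daddr64_t)((unsigned)dbn));   /* known disk address  */
            VNOP_STRATEGY(bp);                              /* queue the I/O       */
            error = buf_biowait(bp);                        /* wait for completion */
    }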
*/ +__private_extern__ int -ffs_blkatoff(ap) - struct vop_blkatoff_args /* { - struct vnode *a_vp; - off_t a_offset; - char **a_res; - struct buf **a_bpp; - } */ *ap; +ffs_blkatoff(vnode_t vp, off_t offset, char **res, buf_t *bpp) { struct inode *ip; register struct fs *fs; @@ -99,28 +94,28 @@ ffs_blkatoff(ap) ufs_daddr_t lbn; int bsize, error; #if REV_ENDIAN_FS - struct mount *mp=(ap->a_vp)->v_mount; + struct mount *mp = vnode_mount(vp); int rev_endian=(mp->mnt_flag & MNT_REVEND); #endif /* REV_ENDIAN_FS */ - ip = VTOI(ap->a_vp); + ip = VTOI(vp); fs = ip->i_fs; - lbn = lblkno(fs, ap->a_offset); + lbn = lblkno(fs, offset); bsize = blksize(fs, ip, lbn); - *ap->a_bpp = NULL; - if (error = bread(ap->a_vp, lbn, bsize, NOCRED, &bp)) { - brelse(bp); + *bpp = NULL; + if (error = (int)buf_bread(vp, (daddr64_t)((unsigned)lbn), bsize, NOCRED, &bp)) { + buf_brelse(bp); return (error); } #if REV_ENDIAN_FS if (rev_endian) - byte_swap_dir_block_in(bp->b_data, bp->b_bcount); + byte_swap_dir_block_in((char *)buf_dataptr(bp), buf_count(bp)); #endif /* REV_ENDIAN_FS */ - if (ap->a_res) - *ap->a_res = (char *)bp->b_data + blkoff(fs, ap->a_offset); - *ap->a_bpp = bp; + if (res) + *res = (char *)buf_dataptr(bp) + blkoff(fs, offset); + *bpp = bp; return (0); } #endif @@ -160,59 +155,6 @@ ffs_fragacct(fs, fragmap, fraglist, cnt) } } -#if defined(KERNEL) && DIAGNOSTIC -void -ffs_checkoverlap(bp, ip) - struct buf *bp; - struct inode *ip; -{ - register struct buf *ebp, *ep; - register ufs_daddr_t start, last; - struct vnode *vp; -#ifdef NeXT - int devBlockSize=0; -#endif /* NeXT */ - - ebp = &buf[nbuf]; - start = bp->b_blkno; -#ifdef NeXT - VOP_DEVBLOCKSIZE(ip->i_devvp,&devBlockSize); - last = start + btodb(bp->b_bcount, devBlockSize) - 1; -#else - last = start + btodb(bp->b_bcount) - 1; -#endif /* NeXT */ - for (ep = buf; ep < ebp; ep++) { - if (ep == bp || (ep->b_flags & B_INVAL) || - ep->b_vp == NULLVP) - continue; - if (VOP_BMAP(ep->b_vp, (ufs_daddr_t)0, &vp, (ufs_daddr_t)0, - NULL)) - continue; - if (vp != ip->i_devvp) - continue; - /* look for overlap */ -#ifdef NeXT - if (ep->b_bcount == 0 || ep->b_blkno > last || - ep->b_blkno + btodb(ep->b_bcount, devBlockSize) <= start) - continue; - vprint("Disk overlap", vp); - (void)printf("\tstart %d, end %d overlap start %d, end %d\n", - start, last, ep->b_blkno, - ep->b_blkno + btodb(ep->b_bcount, devBlockSize) - 1); -#else - if (ep->b_bcount == 0 || ep->b_blkno > last || - ep->b_blkno + btodb(ep->b_bcount) <= start) - continue; - vprint("Disk overlap", vp); - (void)printf("\tstart %d, end %d overlap start %d, end %d\n", - start, last, ep->b_blkno, - ep->b_blkno + btodb(ep->b_bcount) - 1); -#endif /* NeXT */ - panic("Disk buffer overlap"); - } -} -#endif /* DIAGNOSTIC */ - /* * block operations * @@ -241,6 +183,8 @@ ffs_isblock(fs, cp, h) default: panic("ffs_isblock"); } + /* NOTREACHED */ + return 0; } /* diff --git a/bsd/ufs/ffs/ffs_vfsops.c b/bsd/ufs/ffs/ffs_vfsops.c index ab6ebe668..06d21d70a 100644 --- a/bsd/ufs/ffs/ffs_vfsops.c +++ b/bsd/ufs/ffs/ffs_vfsops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -60,9 +60,11 @@ #include <sys/systm.h> #include <sys/namei.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <sys/kernel.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/socket.h> +#include <sys/mount_internal.h> #include <sys/mount.h> #include <sys/buf.h> #include <sys/mbuf.h> @@ -88,7 +90,7 @@ #include <architecture/byte_order.h> #endif /* REV_ENDIAN_FS */ -int ffs_sbupdate __P((struct ufsmount *, int)); +int ffs_sbupdate(struct ufsmount *, int); struct vfsops ufs_vfsops = { ffs_mount, @@ -96,68 +98,53 @@ struct vfsops ufs_vfsops = { ffs_unmount, ufs_root, ufs_quotactl, - ffs_statfs, + ffs_vfs_getattr, ffs_sync, ffs_vget, ffs_fhtovp, ffs_vptofh, ffs_init, ffs_sysctl, + ffs_vfs_setattr, + {0} }; extern u_long nextgennumber; +union _qcvt { + int64_t qcvt; + int32_t val[2]; +}; +#define SETHIGH(q, h) { \ + union _qcvt tmp; \ + tmp.qcvt = (q); \ + tmp.val[_QUAD_HIGHWORD] = (h); \ + (q) = tmp.qcvt; \ +} +#define SETLOW(q, l) { \ + union _qcvt tmp; \ + tmp.qcvt = (q); \ + tmp.val[_QUAD_LOWWORD] = (l); \ + (q) = tmp.qcvt; \ +} + /* * Called by main() when ufs is going to be mounted as root. */ -ffs_mountroot() +int +ffs_mountroot(mount_t mp, vnode_t rvp, vfs_context_t context) { - extern struct vnode *rootvp; - struct fs *fs; - struct mount *mp; struct proc *p = current_proc(); /* XXX */ - struct ufsmount *ump; - u_int size; - int error; + int error; - /* - * Get vnode for rootdev. - */ - if (error = bdevvp(rootdev, &rootvp)) { - printf("ffs_mountroot: can't setup bdevvp"); - return (error); - } - if (error = vfs_rootmountalloc("ufs", "root_device", &mp)) { - vrele(rootvp); /* release the reference from bdevvp() */ - return (error); - } - - /* Must set the MNT_ROOTFS flag before doing the actual mount */ - mp->mnt_flag |= MNT_ROOTFS; - /* Set asynchronous flag by default */ - mp->mnt_flag |= MNT_ASYNC; + vfs_setflags(mp, MNT_ASYNC); - if (error = ffs_mountfs(rootvp, mp, p)) { - mp->mnt_vfc->vfc_refcount--; + if (error = ffs_mountfs(rvp, mp, context)) + return (error); - if (mp->mnt_kern_flag & MNTK_IO_XINFO) - FREE(mp->mnt_xinfo_ptr, M_TEMP); - vfs_unbusy(mp, p); + (void)ffs_statfs(mp, vfs_statfs(mp), NULL); - vrele(rootvp); /* release the reference from bdevvp() */ - FREE_ZONE(mp, sizeof (struct mount), M_MOUNT); - return (error); - } - simple_lock(&mountlist_slock); - CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); - simple_unlock(&mountlist_slock); - ump = VFSTOUFS(mp); - fs = ump->um_fs; - (void) copystr(mp->mnt_stat.f_mntonname, fs->fs_fsmnt, MNAMELEN - 1, 0); - (void)ffs_statfs(mp, &mp->mnt_stat, p); - vfs_unbusy(mp, p); - inittodr(fs->fs_time); return (0); } @@ -167,33 +154,33 @@ ffs_mountroot() * mount system call */ int -ffs_mount(mp, path, data, ndp, p) - register struct mount *mp; - char *path; - caddr_t data; - struct nameidata *ndp; - struct proc *p; +ffs_mount(struct mount *mp, vnode_t devvp, __unused user_addr_t data, vfs_context_t context) { - struct vnode *devvp; - struct ufs_args args; + struct proc *p = vfs_context_proc(context); struct ufsmount *ump; register struct fs *fs; u_int size; - int error, flags; + int error = 0, flags; mode_t accessmode; int ronly; int reload = 0; - if (error = copyin(data, (caddr_t)&args, sizeof (struct ufs_args))) - return (error); /* - * If updating, check whether changing from read-only to - * read/write; if there is no device name, that's all we do. + * If updating, check whether changing from read-write to + * read-only; if there is no device name, that's all we do. 
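The SETHIGH/SETLOW macros introduced above assemble a 64-bit value through a union instead of shifts, with _QUAD_HIGHWORD/_QUAD_LOWWORD absorbing the byte-order difference. A small user-space demonstration of the idiom -- the hard-coded indices assume a little-endian host, where the kernel would take them from its endian headers instead:

    #include <stdint.h>
    #include <stdio.h>

    #define _QUAD_LOWWORD   0       /* little-endian assumption */
    #define _QUAD_HIGHWORD  1

    union _qcvt { int64_t qcvt; int32_t val[2]; };

    #define SETHIGH(q, h) { union _qcvt tmp; tmp.qcvt = (q); \
            tmp.val[_QUAD_HIGHWORD] = (h); (q) = tmp.qcvt; }
    #define SETLOW(q, l)  { union _qcvt tmp; tmp.qcvt = (q); \
            tmp.val[_QUAD_LOWWORD] = (l);  (q) = tmp.qcvt; }

    int
    main(void)
    {
            int64_t modrev = 0;

            SETHIGH(modrev, 1114646400);    /* a tv_sec-style value   */
            SETLOW(modrev, 123456 * 4294);  /* tv_usec scaled to 2^32 */
            printf("%lld\n", (long long)modrev);
            return 0;
    }

ffs_vget later in this patch uses exactly this pair to seed the inode's i_modrev from microtime().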
*/ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { + /* + * Flush any dirty data. + */ + VFS_SYNC(mp, MNT_WAIT, context); + /* + * Check for and optionally get rid of files open + * for writing. + */ flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; @@ -212,7 +199,7 @@ ffs_mount(mp, path, data, ndp, p) if ((mp->mnt_flag & MNT_RELOAD) || ronly) reload = 1; if ((reload) && - (error = ffs_reload(mp, ndp->ni_cnd.cn_cred, p))) + (error = ffs_reload(mp, vfs_context_ucred(context), p))) return (error); /* replace the ronly after load */ fs->fs_ronly = ronly; @@ -234,89 +221,91 @@ ffs_mount(mp, path, data, ndp, p) } if (ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) { - /* - * If upgrade to read-write by non-root, then verify - * that user has necessary permissions on the device. - */ - if (p->p_ucred->cr_uid != 0) { - devvp = ump->um_devvp; - vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); - if (error = VOP_ACCESS(devvp, VREAD | VWRITE, - p->p_ucred, p)) { - VOP_UNLOCK(devvp, 0, p); - return (error); - } - VOP_UNLOCK(devvp, 0, p); - } fs->fs_ronly = 0; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT); } - if (args.fspec == 0) { - /* - * Process export requests. - */ - return (vfs_export(mp, &ump->um_export, &args.export)); + if (devvp == 0) { + return(0); } } - /* - * Not an update, or updating the name: look up the name - * and verify that it refers to a sensible block device. - */ - NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); - if (error = namei(ndp)) - return (error); - devvp = ndp->ni_vp; - - if (devvp->v_type != VBLK) { - vrele(devvp); - return (ENOTBLK); - } - if (major(devvp->v_rdev) >= nblkdev) { - vrele(devvp); - return (ENXIO); - } - /* - * If mount by non-root, then verify that user has necessary - * permissions on the device. 
- */ - if (p->p_ucred->cr_uid != 0) { - accessmode = VREAD; - if ((mp->mnt_flag & MNT_RDONLY) == 0) - accessmode |= VWRITE; - vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); - if (error = VOP_ACCESS(devvp, accessmode, p->p_ucred, p)) { - vput(devvp); - return (error); - } - VOP_UNLOCK(devvp, 0, p); - } if ((mp->mnt_flag & MNT_UPDATE) == 0) - error = ffs_mountfs(devvp, mp, p); + error = ffs_mountfs(devvp, mp, context); else { if (devvp != ump->um_devvp) error = EINVAL; /* needs translation */ - else - vrele(devvp); } if (error) { - vrele(devvp); return (error); } ump = VFSTOUFS(mp); fs = ump->um_fs; - (void) copyinstr(path, fs->fs_fsmnt, sizeof(fs->fs_fsmnt) - 1, - (size_t *)&size); - bzero(fs->fs_fsmnt + size, sizeof(fs->fs_fsmnt) - size); - bcopy((caddr_t)fs->fs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, - MNAMELEN); - (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, - (size_t *)&size); - bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); - (void)ffs_statfs(mp, &mp->mnt_stat, p); + bzero(fs->fs_fsmnt , sizeof(fs->fs_fsmnt)); + strncpy(fs->fs_fsmnt, (caddr_t)mp->mnt_vfsstat.f_mntonname, sizeof(fs->fs_fsmnt) - 1); + (void)ffs_statfs(mp, &mp->mnt_vfsstat, p); return (0); } + +struct ffs_reload_cargs { + struct vnode *devvp; + kauth_cred_t cred; + struct fs *fs; + struct proc *p; + int error; +#if REV_ENDIAN_FS + int rev_endian; +#endif /* REV_ENDIAN_FS */ +}; + + +static int +ffs_reload_callback(struct vnode *vp, void *cargs) +{ + struct inode *ip; + struct buf *bp; + struct fs *fs; + struct ffs_reload_cargs *args; + + args = (struct ffs_reload_cargs *)cargs; + + /* + * flush all the buffers associated with this node + */ + if (buf_invalidateblks(vp, 0, 0, 0)) + panic("ffs_reload: dirty2"); + + /* + * Step 6: re-read inode data + */ + ip = VTOI(vp); + fs = args->fs; + + if (args->error = (int)buf_bread(args->devvp, (daddr64_t)((unsigned)fsbtodb(fs, ino_to_fsba(fs, ip->i_number))), + (int)fs->fs_bsize, NOCRED, &bp)) { + buf_brelse(bp); + + return (VNODE_RETURNED_DONE); + } + +#if REV_ENDIAN_FS + if (args->rev_endian) { + byte_swap_inode_in(((struct dinode *)buf_dataptr(bp) + + ino_to_fsbo(fs, ip->i_number)), ip); + } else { +#endif /* REV_ENDIAN_FS */ + ip->i_din = *((struct dinode *)buf_dataptr(bp) + + ino_to_fsbo(fs, ip->i_number)); +#if REV_ENDIAN_FS + } +#endif /* REV_ENDIAN_FS */ + + buf_brelse(bp); + + return (VNODE_RETURNED); +} + + /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must @@ -330,19 +319,16 @@ ffs_mount(mp, path, data, ndp, p) * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. */ -ffs_reload(mountp, cred, p) - register struct mount *mountp; - struct ucred *cred; - struct proc *p; +ffs_reload(struct mount *mountp, kauth_cred_t cred, struct proc *p) { - register struct vnode *vp, *nvp, *devvp; - struct inode *ip; + register struct vnode *devvp; void *space; struct buf *bp; struct fs *fs, *newfs; int i, blks, size, error; u_int64_t maxfilesize; /* XXX */ int32_t *lp; + struct ffs_reload_cargs args; #if REV_ENDIAN_FS int rev_endian = (mountp->mnt_flag & MNT_REVEND); #endif /* REV_ENDIAN_FS */ @@ -353,18 +339,18 @@ ffs_reload(mountp, cred, p) * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mountp)->um_devvp; - if (vinvalbuf(devvp, 0, cred, p, 0, 0)) + if (buf_invalidateblks(devvp, 0, 0, 0)) panic("ffs_reload: dirty1"); /* * Step 2: re-read superblock from disk. 
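ffs_reload() above trades its hand-rolled walk of mnt_vnodelist -- simple_lock juggling, goto-driven loop restarts and all -- for vnode_iterate(), which owns the locking and referencing and calls back once per live vnode; whatever state the callback needs rides in a caller-defined "cargs" struct, as ffs_reload_cargs does above. The shape of the pattern, with hypothetical names my_cargs/my_callback/do_work:

    struct my_cargs {
            int     waitfor;
            int     error;
    };

    static int
    my_callback(vnode_t vp, void *cargs)
    {
            struct my_cargs *args = (struct my_cargs *)cargs;

            if (do_work(vp, args->waitfor)) {       /* hypothetical per-vnode work */
                    args->error = EIO;
                    return (VNODE_RETURNED_DONE);   /* abort the iteration */
            }
            return (VNODE_RETURNED);                /* continue */
    }

    /* caller: */
    struct my_cargs args = { MNT_WAIT, 0 };
    vnode_iterate(mp, VNODE_RELOAD | VNODE_WAIT, my_callback, &args);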
*/ - VOP_DEVBLOCKSIZE(devvp,&size); + size = vfs_devblocksize(mountp); - if (error = bread(devvp, (ufs_daddr_t)(SBOFF/size), SBSIZE, NOCRED,&bp)) { - brelse(bp); + if (error = (int)buf_bread(devvp, (daddr64_t)((unsigned)(SBOFF/size)), SBSIZE, NOCRED,&bp)) { + buf_brelse(bp); return (error); } - newfs = (struct fs *)bp->b_data; + newfs = (struct fs *)buf_dataptr(bp); #if REV_ENDIAN_FS if (rev_endian) { byte_swap_sbin(newfs); @@ -377,7 +363,7 @@ ffs_reload(mountp, cred, p) byte_swap_sbout(newfs); #endif /* REV_ENDIAN_FS */ - brelse(bp); + buf_brelse(bp); return (EIO); /* XXX needs translation */ } fs = VFSTOUFS(mountp)->um_fs; @@ -391,12 +377,12 @@ ffs_reload(mountp, cred, p) newfs->fs_contigdirs = fs->fs_contigdirs; bcopy(newfs, fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBSIZE) - bp->b_flags |= B_INVAL; + buf_markinvalid(bp); #if REV_ENDIAN_FS if (rev_endian) byte_swap_sbout(newfs); #endif /* REV_ENDIAN_FS */ - brelse(bp); + buf_brelse(bp); mountp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; ffs_oldfscompat(fs); maxfilesize = 0x100000000ULL; /* 4GB */ @@ -411,26 +397,26 @@ ffs_reload(mountp, cred, p) size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; - if (error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, - NOCRED, &bp)) { - brelse(bp); + if (error = (int)buf_bread(devvp, (daddr64_t)((unsigned)fsbtodb(fs, fs->fs_csaddr + i)), size, + NOCRED, &bp)) { + buf_brelse(bp); return (error); } #if REV_ENDIAN_FS if (rev_endian) { /* csum swaps */ - byte_swap_ints((int *)bp->b_data, size / sizeof(int)); + byte_swap_ints((int *)buf_dataptr(bp), size / sizeof(int)); } #endif /* REV_ENDIAN_FS */ - bcopy(bp->b_data, space, (u_int)size); + bcopy((char *)buf_dataptr(bp), space, (u_int)size); #if REV_ENDIAN_FS if (rev_endian) { /* csum swaps */ - byte_swap_ints((int *)bp->b_data, size / sizeof(int)); + byte_swap_ints((int *)buf_dataptr(bp), size / sizeof(int)); } #endif /* REV_ENDIAN_FS */ space = (char *) space + size; - brelse(bp); + buf_brelse(bp); } /* * We no longer know anything about clusters per cylinder group. @@ -440,112 +426,72 @@ ffs_reload(mountp, cred, p) for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } - -loop: - simple_lock(&mntvnode_slock); - for (vp = mountp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { - if (vp->v_mount != mountp) { - simple_unlock(&mntvnode_slock); - goto loop; - } - nvp = vp->v_mntvnodes.le_next; - /* - * Step 4: invalidate all inactive vnodes. - */ - if (vrecycle(vp, &mntvnode_slock, p)) - goto loop; - /* - * Step 5: invalidate all cached file data. - */ - simple_lock(&vp->v_interlock); - simple_unlock(&mntvnode_slock); - if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) { - goto loop; - } - if (vinvalbuf(vp, 0, cred, p, 0, 0)) - panic("ffs_reload: dirty2"); - /* - * Step 6: re-read inode data for all active vnodes. 
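One detail in the reload loop above is easy to miss: on REV_ENDIAN_FS mounts the summary data is byte-swapped in place, copied out, then immediately swapped back. The buffer can stay in the cache after buf_brelse(), so it has to be left in on-disk byte order; the bracketing looks like this:

    if (rev_endian)         /* to host order, in place */
            byte_swap_ints((int *)buf_dataptr(bp), size / sizeof(int));
    bcopy((char *)buf_dataptr(bp), space, (u_int)size);
    if (rev_endian)         /* back to disk order before release */
            byte_swap_ints((int *)buf_dataptr(bp), size / sizeof(int));
    buf_brelse(bp);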
- */ - ip = VTOI(vp); - if (error = - bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), - (int)fs->fs_bsize, NOCRED, &bp)) { - brelse(bp); - vput(vp); - return (error); - } #if REV_ENDIAN_FS - if (rev_endian) { - byte_swap_inode_in(((struct dinode *)bp->b_data + - ino_to_fsbo(fs, ip->i_number)), ip); - } else { + args.rev_endian = rev_endian; #endif /* REV_ENDIAN_FS */ - ip->i_din = *((struct dinode *)bp->b_data + - ino_to_fsbo(fs, ip->i_number)); -#if REV_ENDIAN_FS - } -#endif /* REV_ENDIAN_FS */ - brelse(bp); - vput(vp); - simple_lock(&mntvnode_slock); - } - simple_unlock(&mntvnode_slock); - return (0); + args.devvp = devvp; + args.cred = cred; + args.fs = fs; + args.p = p; + args.error = 0; + /* + * ffs_reload_callback will be called for each vnode + * hung off of this mount point that can't be recycled... + * vnode_iterate will recycle those that it can (the VNODE_RELOAD option) + * the vnode will be in an 'unbusy' state (VNODE_WAIT) and + * properly referenced and unreferenced around the callback + */ + vnode_iterate(mountp, VNODE_RELOAD | VNODE_WAIT, ffs_reload_callback, (void *)&args); + + return (args.error); } /* * Common code for mount and mountroot */ int -ffs_mountfs(devvp, mp, p) - register struct vnode *devvp; +ffs_mountfs(devvp, mp, context) + struct vnode *devvp; struct mount *mp; - struct proc *p; + vfs_context_t context; { - register struct ufsmount *ump; + struct ufsmount *ump; struct buf *bp; - register struct fs *fs; + struct fs *fs; dev_t dev; struct buf *cgbp; struct cg *cgp; int32_t clustersumoff; void *space; - int error, i, blks, size, ronly; + int error, i, blks, ronly; + u_int32_t size; int32_t *lp; - struct ucred *cred; - extern struct vnode *rootvp; + kauth_cred_t cred; u_int64_t maxfilesize; /* XXX */ u_int dbsize = DEV_BSIZE; #if REV_ENDIAN_FS int rev_endian=0; #endif /* REV_ENDIAN_FS */ dev = devvp->v_rdev; - cred = p ? p->p_ucred : NOCRED; - /* - * Disallow multiple mounts of the same device. - * Disallow mounting of a device that is currently in use - * (except for root, which might share swap device for miniroot). - * Flush out any old buffers remaining from a previous use. - */ - if (error = vfs_mountedon(devvp)) - return (error); - if (vcount(devvp) > 1 && devvp != rootvp) - return (EBUSY); - if (error = vinvalbuf(devvp, V_SAVE, cred, p, 0, 0)) - return (error); + cred = vfs_context_ucred(context); - ronly = (mp->mnt_flag & MNT_RDONLY) != 0; - if (error = VOP_OPEN(devvp, ronly ? 
FREAD : FREAD|FWRITE, FSCRED, p)) - return (error); + ronly = vfs_isrdonly(mp); + bp = NULL; + ump = NULL; - VOP_DEVBLOCKSIZE(devvp,&size); + /* Advisory locking should be handled at the VFS layer */ + vfs_setlocklocal(mp); - bp = NULL; - ump = NULL; - if (error = bread(devvp, (ufs_daddr_t)(SBOFF/size), SBSIZE, cred, &bp)) + /* Obtain the actual device block size */ + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&size, 0, context)) { + error = ENXIO; goto out; - fs = (struct fs *)bp->b_data; + } + + if (error = (int)buf_bread(devvp, (daddr64_t)((unsigned)(SBOFF/size)), + SBSIZE, cred, &bp)) + goto out; + fs = (struct fs *)buf_dataptr(bp); #if REV_ENDIAN_FS if (fs->fs_magic != FS_MAGIC || fs->fs_bsize > MAXBSIZE || fs->fs_bsize < sizeof(struct fs)) { @@ -610,22 +556,14 @@ ffs_mountfs(devvp, mp, p) if(dbsize <= 0 ) { kprintf("device blocksize computaion failed\n"); } else { - if (VOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&dbsize, - FWRITE, NOCRED, p) != 0) { + if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&dbsize, + FWRITE, context) != 0) { kprintf("failed to set device blocksize\n"); } /* force the specfs to reread blocksize from size() */ set_fsblocksize(devvp); } - /* cache the IO attributes */ - error = vfs_init_io_attributes(devvp, mp); - if (error) { - printf("ffs_mountfs: vfs_init_io_attributes returned %d\n", - error); - goto out; - } - /* XXX updating 4.2 FFS superblocks trashes rotational layout tables */ if (fs->fs_postblformat == FS_42POSTBLFMT && !ronly) { #if REV_ENDIAN_FS @@ -642,12 +580,12 @@ ffs_mountfs(devvp, mp, p) * to avoid further corruption. PR#2216969 */ if (ronly == 0){ - if (error = bread (devvp, fsbtodb(fs, cgtod(fs, 0)), - (int)fs->fs_cgsize, NOCRED, &cgbp)) { - brelse(cgbp); + if (error = (int)buf_bread (devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, 0))), + (int)fs->fs_cgsize, NOCRED, &cgbp)) { + buf_brelse(cgbp); goto out; } - cgp = (struct cg *)cgbp->b_data; + cgp = (struct cg *)buf_dataptr(cgbp); #if REV_ENDIAN_FS if (rev_endian) byte_swap_cgin(cgp,fs); @@ -657,7 +595,7 @@ ffs_mountfs(devvp, mp, p) if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - brelse(cgbp); + buf_brelse(cgbp); goto out; } if (cgp->cg_clustersumoff != 0) { @@ -675,21 +613,21 @@ ffs_mountfs(devvp, mp, p) if (rev_endian) byte_swap_cgout(cgp,fs); #endif /* REV_ENDIAN_FS */ - brelse(cgbp); + buf_brelse(cgbp); } ump = _MALLOC(sizeof *ump, M_UFSMNT, M_WAITOK); bzero((caddr_t)ump, sizeof *ump); ump->um_fs = _MALLOC((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK); - bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize); + bcopy((char *)buf_dataptr(bp), ump->um_fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBSIZE) - bp->b_flags |= B_INVAL; + buf_markinvalid(bp); #if REV_ENDIAN_FS if (rev_endian) byte_swap_sbout(fs); #endif /* REV_ENDIAN_FS */ - brelse(bp); + buf_brelse(bp); bp = NULL; fs = ump->um_fs; fs->fs_ronly = ronly; @@ -704,18 +642,18 @@ ffs_mountfs(devvp, mp, p) size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; - if (error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, - cred, &bp)) { + if (error = (int)buf_bread(devvp, (daddr64_t)((unsigned)fsbtodb(fs, fs->fs_csaddr + i)), + size, cred, &bp)) { _FREE(fs->fs_csp, M_UFSMNT); goto out; } - bcopy(bp->b_data, space, (u_int)size); + bcopy((char *)buf_dataptr(bp), space, (u_int)size); #if REV_ENDIAN_FS if (rev_endian) byte_swap_ints((int *) space, size / sizeof(int)); #endif /* REV_ENDIAN_FS */ space = (char *)space + size; - brelse(bp); + buf_brelse(bp); bp = NULL; } if 
(fs->fs_contigsumsize > 0) { @@ -735,8 +673,8 @@ ffs_mountfs(devvp, mp, p) fs->fs_avgfpdir = AFPDIR; /* XXX End of compatibility */ mp->mnt_data = (qaddr_t)ump; - mp->mnt_stat.f_fsid.val[0] = (long)dev; - mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; + mp->mnt_vfsstat.f_fsid.val[0] = (long)dev; + mp->mnt_vfsstat.f_fsid.val[1] = vfs_typenum(mp); /* XXX warning hardcoded max symlen and not "mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;" */ mp->mnt_maxsymlinklen = 60; #if REV_ENDIAN_FS @@ -750,8 +688,7 @@ ffs_mountfs(devvp, mp, p) ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) - ump->um_qfiles[i].qf_vp = NULLVP; - devvp->v_specflags |= SI_MOUNTEDON; + dqfileinit(&ump->um_qfiles[i]); ffs_oldfscompat(fs); ump->um_savedmaxfilesize = fs->fs_maxfilesize; /* XXX */ maxfilesize = 0x100000000ULL; /* 4GB */ @@ -767,12 +704,10 @@ ffs_mountfs(devvp, mp, p) return (0); out: if (bp) - brelse(bp); - (void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, cred, p); + buf_brelse(bp); if (ump) { _FREE(ump->um_fs, M_UFSMNT); _FREE(ump, M_UFSMNT); - mp->mnt_data = (qaddr_t)0; } return (error); } @@ -809,11 +744,12 @@ ffs_oldfscompat(fs) * unmount system call */ int -ffs_unmount(mp, mntflags, p) +ffs_unmount(mp, mntflags, context) struct mount *mp; int mntflags; - struct proc *p; + vfs_context_t context; { + struct proc *p = vfs_context_proc(context); register struct ufsmount *ump; register struct fs *fs; int error, flags; @@ -829,6 +765,7 @@ ffs_unmount(mp, mntflags, p) return (error); ump = VFSTOUFS(mp); fs = ump->um_fs; + if (fs->fs_ronly == 0) { fs->fs_clean = 1; if (error = ffs_sbupdate(ump, MNT_WAIT)) { @@ -843,20 +780,10 @@ ffs_unmount(mp, mntflags, p) #endif /* notyet */ } } - ump->um_devvp->v_specflags &= ~SI_MOUNTEDON; - error = VOP_CLOSE(ump->um_devvp, fs->fs_ronly ? FREAD : FREAD|FWRITE, - NOCRED, p); - if (error && !force) - return (error); - vrele(ump->um_devvp); - _FREE(fs->fs_csp, M_UFSMNT); _FREE(fs, M_UFSMNT); _FREE(ump, M_UFSMNT); - mp->mnt_data = (qaddr_t)0; -#if REV_ENDIAN_FS - mp->mnt_flag &= ~MNT_REVEND; -#endif /* REV_ENDIAN_FS */ + return (0); } @@ -907,7 +834,7 @@ ffs_flushfiles(mp, flags, p) (rootvp->v_usecount > (1 + quotafilecnt))) { error = EBUSY; /* root dir is still open */ } - vput(rootvp); + vnode_put(rootvp); } if (error && (flags & FORCECLOSE) == 0) return (error); @@ -915,7 +842,7 @@ ffs_flushfiles(mp, flags, p) for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_qfiles[i].qf_vp == NULLVP) continue; - quotaoff(p, mp, i); + quotaoff(mp, i); } /* * Here we fall through to vflush again to ensure @@ -932,10 +859,10 @@ ffs_flushfiles(mp, flags, p) * Get file system statistics. 
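ffs_mountfs() above also changes how the device sector size is learned: VOP_DEVBLOCKSIZE is gone, so the mount path asks the device directly with an ioctl, after which vfs_devblocksize() serves other callers from the mount. The query as the code above performs it, shown as a fragment:

    u_int32_t size;

    if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&size, 0, context)) {
            error = ENXIO;          /* refuse to mount blind */
            goto out;
    }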
*/ int -ffs_statfs(mp, sbp, p) +ffs_statfs(mp, sbp, context) struct mount *mp; - register struct statfs *sbp; - struct proc *p; + register struct vfsstatfs *sbp; + vfs_context_t context; { register struct ufsmount *ump; register struct fs *fs; @@ -946,22 +873,256 @@ ffs_statfs(mp, sbp, p) panic("ffs_statfs"); sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; - sbp->f_blocks = fs->fs_dsize; - sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + - fs->fs_cstotal.cs_nffree; - sbp->f_bavail = freespace(fs, fs->fs_minfree); - sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO; - sbp->f_ffree = fs->fs_cstotal.cs_nifree; - if (sbp != &mp->mnt_stat) { - sbp->f_type = mp->mnt_vfc->vfc_typenum; - bcopy((caddr_t)mp->mnt_stat.f_mntonname, - (caddr_t)&sbp->f_mntonname[0], MNAMELEN); - bcopy((caddr_t)mp->mnt_stat.f_mntfromname, - (caddr_t)&sbp->f_mntfromname[0], MNAMELEN); + sbp->f_blocks = (uint64_t)((unsigned long)fs->fs_dsize); + sbp->f_bfree = (uint64_t) ((unsigned long)(fs->fs_cstotal.cs_nbfree * fs->fs_frag + + fs->fs_cstotal.cs_nffree)); + sbp->f_bavail = (uint64_t) ((unsigned long)freespace(fs, fs->fs_minfree)); + sbp->f_files = (uint64_t) ((unsigned long)(fs->fs_ncg * fs->fs_ipg - ROOTINO)); + sbp->f_ffree = (uint64_t) ((unsigned long)fs->fs_cstotal.cs_nifree); + return (0); +} + +int +ffs_vfs_getattr(mp, fsap, context) + struct mount *mp; + struct vfs_attr *fsap; + vfs_context_t context; +{ + struct ufsmount *ump; + struct fs *fs; + kauth_cred_t cred; + struct vnode *devvp; + struct buf *bp; + struct ufslabel *ulp; + char *offset; + int bs, error, length; + + ump = VFSTOUFS(mp); + fs = ump->um_fs; + cred = vfs_context_ucred(context); + + VFSATTR_RETURN(fsap, f_bsize, fs->fs_fsize); + VFSATTR_RETURN(fsap, f_iosize, fs->fs_bsize); + VFSATTR_RETURN(fsap, f_blocks, (uint64_t)((unsigned long)fs->fs_dsize)); + VFSATTR_RETURN(fsap, f_bfree, (uint64_t)((unsigned long) + (fs->fs_cstotal.cs_nbfree * fs->fs_frag + + fs->fs_cstotal.cs_nffree))); + VFSATTR_RETURN(fsap, f_bavail, (uint64_t)((unsigned long)freespace(fs, + fs->fs_minfree))); + VFSATTR_RETURN(fsap, f_files, (uint64_t)((unsigned long) + (fs->fs_ncg * fs->fs_ipg - ROOTINO))); + VFSATTR_RETURN(fsap, f_ffree, (uint64_t)((unsigned long) + fs->fs_cstotal.cs_nifree)); + + if (VFSATTR_IS_ACTIVE(fsap, f_fsid)) { + fsap->f_fsid.val[0] = mp->mnt_vfsstat.f_fsid.val[0]; + fsap->f_fsid.val[1] = mp->mnt_vfsstat.f_fsid.val[1]; + VFSATTR_SET_SUPPORTED(fsap, f_fsid); + } + + if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) { + devvp = ump->um_devvp; + bs = vfs_devblocksize(mp); + + if (error = (int)buf_meta_bread(devvp, + (daddr64_t)(UFS_LABEL_OFFSET / bs), + MAX(bs, UFS_LABEL_SIZE), cred, &bp)) { + if (bp) + buf_brelse(bp); + return (error); + } + + /* + * Since the disklabel is read directly by older user space + * code, make sure this buffer won't remain in the cache when + * we release it. 
+ */ + buf_setflags(bp, B_NOCACHE); + + offset = buf_dataptr(bp) + (UFS_LABEL_OFFSET % bs); + ulp = (struct ufslabel *)offset; + + if (ufs_label_check(ulp)) { + length = ulp->ul_namelen; +#if REV_ENDIAN_FS + if (mp->mnt_flag & MNT_REVEND) + length = NXSwapShort(length); +#endif + if (length > 0 && length <= UFS_MAX_LABEL_NAME) { + bcopy(ulp->ul_name, fsap->f_vol_name, length); + fsap->f_vol_name[UFS_MAX_LABEL_NAME - 1] = '\0'; + fsap->f_vol_name[length] = '\0'; + } + } + + buf_brelse(bp); + VFSATTR_SET_SUPPORTED(fsap, f_vol_name); + } + + if (VFSATTR_IS_ACTIVE(fsap, f_capabilities)) { + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] = + VOL_CAP_FMT_SYMBOLICLINKS | + VOL_CAP_FMT_HARDLINKS | + VOL_CAP_FMT_SPARSE_FILES | + VOL_CAP_FMT_CASE_SENSITIVE | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS ; + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] + = VOL_CAP_INT_NFSEXPORT | + VOL_CAP_INT_VOL_RENAME | + VOL_CAP_INT_ADVLOCK | + VOL_CAP_INT_FLOCK; + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_RESERVED1] + = 0; + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_RESERVED2] + = 0; + + /* Capabilities we know about: */ + fsap->f_capabilities.valid[VOL_CAPABILITIES_FORMAT] = + VOL_CAP_FMT_PERSISTENTOBJECTIDS | + VOL_CAP_FMT_SYMBOLICLINKS | + VOL_CAP_FMT_HARDLINKS | + VOL_CAP_FMT_JOURNAL | + VOL_CAP_FMT_JOURNAL_ACTIVE | + VOL_CAP_FMT_NO_ROOT_TIMES | + VOL_CAP_FMT_SPARSE_FILES | + VOL_CAP_FMT_ZERO_RUNS | + VOL_CAP_FMT_CASE_SENSITIVE | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS | + VOL_CAP_FMT_2TB_FILESIZE; + fsap->f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] = + VOL_CAP_INT_SEARCHFS | + VOL_CAP_INT_ATTRLIST | + VOL_CAP_INT_NFSEXPORT | + VOL_CAP_INT_READDIRATTR | + VOL_CAP_INT_EXCHANGEDATA | + VOL_CAP_INT_COPYFILE | + VOL_CAP_INT_ALLOCATE | + VOL_CAP_INT_VOL_RENAME | + VOL_CAP_INT_ADVLOCK | + VOL_CAP_INT_FLOCK ; + fsap->f_capabilities.valid[VOL_CAPABILITIES_RESERVED1] = 0; + fsap->f_capabilities.valid[VOL_CAPABILITIES_RESERVED2] = 0; + + VFSATTR_SET_SUPPORTED(fsap, f_capabilities); } + + if (VFSATTR_IS_ACTIVE(fsap, f_attributes)) { + fsap->f_attributes.validattr.commonattr = 0; + fsap->f_attributes.validattr.volattr = + ATTR_VOL_NAME | ATTR_VOL_CAPABILITIES | ATTR_VOL_ATTRIBUTES; + fsap->f_attributes.validattr.dirattr = 0; + fsap->f_attributes.validattr.fileattr = 0; + fsap->f_attributes.validattr.forkattr = 0; + + fsap->f_attributes.nativeattr.commonattr = 0; + fsap->f_attributes.nativeattr.volattr = + ATTR_VOL_NAME | ATTR_VOL_CAPABILITIES | ATTR_VOL_ATTRIBUTES; + fsap->f_attributes.nativeattr.dirattr = 0; + fsap->f_attributes.nativeattr.fileattr = 0; + fsap->f_attributes.nativeattr.forkattr = 0; + + VFSATTR_SET_SUPPORTED(fsap, f_attributes); + } + return (0); } + +int +ffs_vfs_setattr(mp, fsap, context) + struct mount *mp; + struct vfs_attr *fsap; + vfs_context_t context; +{ + struct ufsmount *ump; + struct vnode *devvp; + struct buf *bp; + struct ufslabel *ulp; + kauth_cred_t cred; + char *offset; + int bs, error; + + + ump = VFSTOUFS(mp); + cred = vfs_context_ucred(context); + + if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) { + devvp = ump->um_devvp; + bs = vfs_devblocksize(mp); + if (error = buf_meta_bread(devvp, + (daddr64_t)(UFS_LABEL_OFFSET / bs), + MAX(bs, UFS_LABEL_SIZE), cred, &bp)) { + if (bp) + buf_brelse(bp); + return (error); + } + + /* + * Since the disklabel is read directly by older user space + * code, make sure this buffer won't remain in the cache when + * we release it. 
+ */ + buf_setflags(bp, B_NOCACHE); + + /* Validate the label structure; init if not valid */ + offset = buf_dataptr(bp) + (UFS_LABEL_OFFSET % bs); + ulp = (struct ufslabel *)offset; + if (!ufs_label_check(ulp)) + ufs_label_init(ulp); + + /* Copy new name over existing name */ + ulp->ul_namelen = strlen(fsap->f_vol_name); +#if REV_ENDIAN_FS + if (mp->mnt_flag & MNT_REVEND) + ulp->ul_namelen = NXSwapShort(ulp->ul_namelen); +#endif + bcopy(fsap->f_vol_name, ulp->ul_name, ulp->ul_namelen); + ulp->ul_name[UFS_MAX_LABEL_NAME - 1] = '\0'; + ulp->ul_name[ulp->ul_namelen] = '\0'; + + /* Update the checksum */ + ulp->ul_checksum = 0; + ulp->ul_checksum = ul_cksum(ulp, sizeof(*ulp)); + + /* Write the label back to disk */ + buf_bwrite(bp); + bp = NULL; + + VFSATTR_SET_SUPPORTED(fsap, f_vol_name); + } + + return (0); + } +struct ffs_sync_cargs { + vfs_context_t context; + int waitfor; + int error; +}; + + +static int +ffs_sync_callback(struct vnode *vp, void *cargs) +{ + struct inode *ip; + struct ffs_sync_cargs *args; + int error; + + args = (struct ffs_sync_cargs *)cargs; + + ip = VTOI(vp); + + if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) || vnode_hasdirtyblks(vp)) { + error = VNOP_FSYNC(vp, args->waitfor, args->context); + + if (error) + args->error = error; + + } + return (VNODE_RETURNED); +} + /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; @@ -970,17 +1131,17 @@ ffs_statfs(mp, sbp, p) * Note: we are always called with the filesystem marked `MPBUSY'. */ int -ffs_sync(mp, waitfor, cred, p) +ffs_sync(mp, waitfor, context) struct mount *mp; int waitfor; - struct ucred *cred; - struct proc *p; + vfs_context_t context; { struct vnode *nvp, *vp; - struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; + struct timeval tv; int error, allerror = 0; + struct ffs_sync_cargs args; fs = ump->um_fs; if (fs->fs_fmod != 0 && fs->fs_ronly != 0) { /* XXX */ @@ -990,58 +1151,23 @@ ffs_sync(mp, waitfor, cred, p) /* * Write back each (modified) inode. */ - simple_lock(&mntvnode_slock); -loop: - for (vp = mp->mnt_vnodelist.lh_first; - vp != NULL; - vp = nvp) { - int didhold = 0; + args.context = context; + args.waitfor = waitfor; + args.error = 0; + /* + * ffs_sync_callback will be called for each vnode + * hung off of this mount point... the vnode will be + * properly referenced and unreferenced around the callback + */ + vnode_iterate(mp, 0, ffs_sync_callback, (void *)&args); - /* - * If the vnode that we are about to sync is no longer - * associated with this mount point, start over. - */ - if (vp->v_mount != mp) - goto loop; - simple_lock(&vp->v_interlock); - nvp = vp->v_mntvnodes.le_next; - ip = VTOI(vp); - - // restart our whole search if this guy is locked - // or being reclaimed. 
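The new ffs_vfs_getattr()/ffs_vfs_setattr() pair above follows the vfs_attr protocol: the VFS layer flags which attributes it wants, the filesystem fills in what it can and marks each one supported, with VFSATTR_RETURN as the fill-and-mark shorthand. A minimal getattr skeleton in that style; read_label() is a hypothetical stand-in for the disklabel read above:

    /* cheap attributes: always fill and mark in one step */
    VFSATTR_RETURN(fsap, f_bsize, fs->fs_fsize);
    VFSATTR_RETURN(fsap, f_iosize, fs->fs_bsize);

    /* expensive attributes: compute only when asked */
    if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) {
            if ((error = read_label(fsap->f_vol_name)))
                    return (error);
            VFSATTR_SET_SUPPORTED(fsap, f_vol_name);
    }
    return (0);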
- if (ip == NULL || vp->v_flag & (VXLOCK|VORECLAIM)) { - simple_unlock(&vp->v_interlock); - continue; - } + if (args.error) + allerror = args.error; - if ((vp->v_type == VNON) || - ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && - vp->v_dirtyblkhd.lh_first == NULL && !(vp->v_flag & VHASDIRTY))) { - simple_unlock(&vp->v_interlock); - continue; - } - simple_unlock(&mntvnode_slock); - error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, p); - if (error) { - simple_lock(&mntvnode_slock); - if (error == ENOENT) - goto loop; - continue; - } - didhold = ubc_hold(vp); - if (error = VOP_FSYNC(vp, cred, waitfor, p)) - allerror = error; - VOP_UNLOCK(vp, 0, p); - if (didhold) - ubc_rele(vp); - vrele(vp); - simple_lock(&mntvnode_slock); - } - simple_unlock(&mntvnode_slock); /* * Force stale file system control information to be flushed. */ - if (error = VOP_FSYNC(ump->um_devvp, cred, waitfor, p)) + if (error = VNOP_FSYNC(ump->um_devvp, waitfor, context)) allerror = error; #if QUOTA qsync(mp); @@ -1051,7 +1177,8 @@ loop: */ if (fs->fs_fmod != 0) { fs->fs_fmod = 0; - fs->fs_time = time.tv_sec; + microtime(&tv); + fs->fs_time = tv.tv_sec; if (error = ffs_sbupdate(ump, waitfor)) allerror = error; } @@ -1065,10 +1192,25 @@ loop: * done by the calling routine. */ int -ffs_vget(mp, inop, vpp) - struct mount *mp; - void *inop; - struct vnode **vpp; +ffs_vget(mp, ino, vpp, context) + mount_t mp; + ino64_t ino; + vnode_t *vpp; + vfs_context_t context; +{ + return(ffs_vget_internal(mp, (ino_t)ino, vpp, NULL, NULL, 0, 0)); +} + + +int +ffs_vget_internal(mp, ino, vpp, dvp, cnp, mode, fhwanted) + mount_t mp; + ino_t ino; + vnode_t *vpp; + vnode_t dvp; + struct componentname *cnp; + int mode; + int fhwanted; { struct proc *p = current_proc(); /* XXX */ struct fs *fs; @@ -1076,58 +1218,56 @@ ffs_vget(mp, inop, vpp) struct ufsmount *ump; struct buf *bp; struct vnode *vp; - ino_t ino; + struct vnode_fsparam vfsp; + struct timeval tv; + enum vtype vtype; dev_t dev; int i, type, error = 0; - ino = (ino_t) inop; - ump = VFSTOUFS(mp); - dev = ump->um_dev; - + *vpp = NULL; + ump = VFSTOUFS(mp); + dev = ump->um_dev; +#if 0 /* Check for unmount in progress */ if (mp->mnt_kern_flag & MNTK_UNMOUNT) { - *vpp = NULL; return (EPERM); } +#endif + /* + * Allocate a new inode... do it before we check the + * cache, because the MALLOC_ZONE may block + */ + type = M_FFSNODE; + MALLOC_ZONE(ip, struct inode *, sizeof(struct inode), type, M_WAITOK); - /* check in the inode hash */ + /* + * check in the inode hash + */ if ((*vpp = ufs_ihashget(dev, ino)) != NULL) { + /* + * found it... get rid of the allocation + * that we didn't need and return + * the 'found' vnode + */ + FREE_ZONE(ip, sizeof(struct inode), type); vp = *vpp; - UBCINFOCHECK("ffs_vget", vp); return (0); } - + bzero((caddr_t)ip, sizeof(struct inode)); /* - * Not in inode hash. - * Allocate a new vnode/inode. + * lock the inode */ - type = ump->um_devvp->v_tag == VT_MFS ? 
M_MFSNODE : M_FFSNODE; /* XXX */ - MALLOC_ZONE(ip, struct inode *, sizeof(struct inode), type, M_WAITOK); - bzero((caddr_t)ip, sizeof(struct inode)); - lockinit(&ip->i_lock, PINOD, "inode", 0, 0); - /* lock the inode */ - lockmgr(&ip->i_lock, LK_EXCLUSIVE, (struct slock *)0, p); +// lockinit(&ip->i_lock, PINOD, "inode", 0, 0); +// lockmgr(&ip->i_lock, LK_EXCLUSIVE, (struct slock *)0, p); ip->i_fs = fs = ump->um_fs; ip->i_dev = dev; ip->i_number = ino; - SET(ip->i_flag, IN_ALLOC); #if QUOTA for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; #endif - - /* - * We could have blocked in MALLOC_ZONE. Check for the race. - */ - if ((*vpp = ufs_ihashget(dev, ino)) != NULL) { - /* lost the race, clean up */ - FREE_ZONE(ip, sizeof(struct inode), type); - vp = *vpp; - UBCINFOCHECK("ffs_vget", vp); - return (0); - } - + SET(ip->i_flag, IN_ALLOC); /* * Put it onto its hash chain locked so that other requests for * this inode will block if they arrive while we are sleeping waiting @@ -1137,49 +1277,100 @@ ffs_vget(mp, inop, vpp) ufs_ihashins(ip); /* Read in the disk contents for the inode, copy into the inode. */ - if (error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), - (int)fs->fs_bsize, NOCRED, &bp)) { - brelse(bp); + if (error = (int)buf_bread(ump->um_devvp, (daddr64_t)((unsigned)fsbtodb(fs, ino_to_fsba(fs, ino))), + (int)fs->fs_bsize, NOCRED, &bp)) { + buf_brelse(bp); goto errout; } #if REV_ENDIAN_FS if (mp->mnt_flag & MNT_REVEND) { - byte_swap_inode_in(((struct dinode *)bp->b_data + ino_to_fsbo(fs, ino)),ip); + byte_swap_inode_in(((struct dinode *)buf_dataptr(bp) + ino_to_fsbo(fs, ino)),ip); } else { - ip->i_din = *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ino)); + ip->i_din = *((struct dinode *)buf_dataptr(bp) + ino_to_fsbo(fs, ino)); } #else - ip->i_din = *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ino)); + ip->i_din = *((struct dinode *)buf_dataptr(bp) + ino_to_fsbo(fs, ino)); #endif /* REV_ENDIAN_FS */ - brelse(bp); - - if (error = getnewvnode(VT_UFS, mp, ffs_vnodeop_p, &vp)) - goto errout; + buf_brelse(bp); + + if (mode == 0) + vtype = IFTOVT(ip->i_mode); + else + vtype = IFTOVT(mode); + + if (vtype == VNON) { + if (fhwanted) { + /* NFS is in play */ + error = ESTALE; + goto errout; + } else { + error = ENOENT; + goto errout; + } + } - vp->v_data = ip; - ip->i_vnode = vp; + vfsp.vnfs_mp = mp; + vfsp.vnfs_vtype = vtype; + vfsp.vnfs_str = "ufs"; + vfsp.vnfs_dvp = dvp; + vfsp.vnfs_fsnode = ip; + vfsp.vnfs_cnp = cnp; + + if (mode == 0) + vfsp.vnfs_filesize = ip->i_din.di_size; + else + vfsp.vnfs_filesize = 0; + + if (vtype == VFIFO ) + vfsp.vnfs_vops = FFS_FIFOOPS; + else if (vtype == VBLK || vtype == VCHR) + vfsp.vnfs_vops = ffs_specop_p; + else + vfsp.vnfs_vops = ffs_vnodeop_p; + + if (vtype == VBLK || vtype == VCHR) + vfsp.vnfs_rdev = ip->i_rdev; + else + vfsp.vnfs_rdev = 0; + + if (dvp && cnp && (cnp->cn_flags & MAKEENTRY)) + vfsp.vnfs_flags = 0; + else + vfsp.vnfs_flags = VNFS_NOCACHE; /* - * Initialize the vnode from the inode, check for aliases. - * Note that the underlying vnode may have changed. + * Tag root directory */ - if (error = ufs_vinit(mp, ffs_specop_p, FFS_FIFOOPS, &vp)) { - vput(vp); - *vpp = NULL; - goto out; - } + vfsp.vnfs_markroot = (ip->i_number == ROOTINO); + vfsp.vnfs_marksystem = 0; + + if ((error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &vp))) + goto errout; + /* * Finish inode initialization now that aliasing has been resolved. 
*/ ip->i_devvp = ump->um_devvp; - VREF(ip->i_devvp); + ip->i_vnode = vp; + + vnode_ref(ip->i_devvp); + vnode_addfsref(vp); + vnode_settag(vp, VT_UFS); + + /* + * Initialize modrev times + */ + microtime(&tv); + SETHIGH(ip->i_modrev, tv.tv_sec); + SETLOW(ip->i_modrev, tv.tv_usec * 4294); + /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { - if (++nextgennumber < (u_long)time.tv_sec) - nextgennumber = time.tv_sec; + if (++nextgennumber < (u_long)tv.tv_sec) + nextgennumber = tv.tv_sec; ip->i_gen = nextgennumber; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) ip->i_flag |= IN_MODIFIED; @@ -1192,24 +1383,22 @@ ffs_vget(mp, inop, vpp) ip->i_uid = ip->i_din.di_ouid; /* XXX */ ip->i_gid = ip->i_din.di_ogid; /* XXX */ } /* XXX */ - - if (UBCINFOMISSING(vp) || UBCINFORECLAIMED(vp)) - ubc_info_init(vp); *vpp = vp; -out: CLR(ip->i_flag, IN_ALLOC); + if (ISSET(ip->i_flag, IN_WALLOC)) wakeup(ip); - return (error); + + return (0); errout: ufs_ihashrem(ip); - CLR(ip->i_flag, IN_ALLOC); + if (ISSET(ip->i_flag, IN_WALLOC)) wakeup(ip); FREE_ZONE(ip, sizeof(struct inode), type); - *vpp = NULL; + return (error); } @@ -1218,47 +1407,66 @@ errout: * * Have to be really careful about stale file handles: * - check that the inode number is valid - * - call ffs_vget() to get the locked inode + * - call vget to get the locked inode * - check for an unallocated inode (i_mode == 0) - * - check that the given client host has export rights and return - * those rights via. exflagsp and credanonp */ int -ffs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) +ffs_fhtovp(mp, fhlen, fhp, vpp, context) register struct mount *mp; - struct fid *fhp; - struct mbuf *nam; + int fhlen; + unsigned char *fhp; struct vnode **vpp; - int *exflagsp; - struct ucred **credanonp; + vfs_context_t context; { register struct ufid *ufhp; + register struct inode *ip; + struct vnode *nvp; struct fs *fs; + int error; + if (fhlen < (int)sizeof(struct ufid)) + return (EINVAL); ufhp = (struct ufid *)fhp; fs = VFSTOUFS(mp)->um_fs; if (ufhp->ufid_ino < ROOTINO || ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); - return (ufs_check_export(mp, ufhp, nam, vpp, exflagsp, credanonp)); + error = ffs_vget_internal(mp, ufhp->ufid_ino, &nvp, NULL, NULL, 0, 1); + if (error) { + *vpp = NULLVP; + return (error); + } + ip = VTOI(nvp); + if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen) { + vnode_put(nvp); + *vpp = NULLVP; + return (ESTALE); + } + *vpp = nvp; + return (0); } /* * Vnode pointer to File handle */ /* ARGSUSED */ -ffs_vptofh(vp, fhp) +int +ffs_vptofh(vp, fhlenp, fhp, context) struct vnode *vp; - struct fid *fhp; + int *fhlenp; + unsigned char *fhp; + vfs_context_t context; { register struct inode *ip; register struct ufid *ufhp; + if (*fhlenp < (int)sizeof(struct ufid)) + return (EOVERFLOW); ip = VTOI(vp); ufhp = (struct ufid *)fhp; - ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; + *fhlenp = sizeof(struct ufid); return (0); } @@ -1276,14 +1484,8 @@ ffs_init(vfsp) /* * fast filesystem related variables. 
*/ -ffs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; +ffs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, vfs_context_t context) { extern int doclusterread, doclusterwrite, doreallocblks, doasyncfree; @@ -1304,7 +1506,7 @@ ffs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) case FFS_ASYNCFREE: return (sysctl_int(oldp, oldlenp, newp, newlen, &doasyncfree)); default: - return (EOPNOTSUPP); + return (ENOTSUP); } /* NOTREACHED */ } @@ -1336,18 +1538,18 @@ ffs_sbupdate(mp, waitfor) size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; - bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), - size, 0, 0, BLK_META); - bcopy(space, bp->b_data, (u_int)size); + bp = buf_getblk(mp->um_devvp, (daddr64_t)((unsigned)fsbtodb(fs, fs->fs_csaddr + i)), + size, 0, 0, BLK_META); + bcopy(space, (char *)buf_dataptr(bp), (u_int)size); #if REV_ENDIAN_FS if (rev_endian) { - byte_swap_ints((int *)bp->b_data, size / sizeof(int)); + byte_swap_ints((int *)buf_dataptr(bp), size / sizeof(int)); } #endif /* REV_ENDIAN_FS */ space = (char *)space + size; if (waitfor != MNT_WAIT) - bawrite(bp); - else if (error = bwrite(bp)) + buf_bawrite(bp); + else if (error = (int)buf_bwrite(bp)) allerror = error; } /* @@ -1357,11 +1559,12 @@ ffs_sbupdate(mp, waitfor) */ if (allerror) return (allerror); - VOP_DEVBLOCKSIZE(mp->um_devvp,&devBlockSize); - bp = getblk(mp->um_devvp, (SBOFF/devBlockSize), (int)fs->fs_sbsize, 0, 0, BLK_META); - bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); + devBlockSize = vfs_devblocksize(mp->um_mountp); + + bp = buf_getblk(mp->um_devvp, (daddr64_t)((unsigned)(SBOFF/devBlockSize)), (int)fs->fs_sbsize, 0, 0, BLK_META); + bcopy((caddr_t)fs, (char *)buf_dataptr(bp), (u_int)fs->fs_sbsize); /* Restore compatibility to old file systems. XXX */ - dfs = (struct fs *)bp->b_data; /* XXX */ + dfs = (struct fs *)buf_dataptr(bp); /* XXX */ if (fs->fs_postblformat == FS_42POSTBLFMT) /* XXX */ dfs->fs_nrpos = -1; /* XXX */ #if REV_ENDIAN_FS @@ -1371,7 +1574,7 @@ ffs_sbupdate(mp, waitfor) * fields get moved */ if (rev_endian) { - byte_swap_sbout((struct fs *)bp->b_data); + byte_swap_sbout((struct fs *)buf_dataptr(bp)); } #endif /* REV_ENDIAN_FS */ if (fs->fs_inodefmt < FS_44INODEFMT) { /* XXX */ @@ -1396,8 +1599,8 @@ ffs_sbupdate(mp, waitfor) } #endif /* REV_ENDIAN_FS */ if (waitfor != MNT_WAIT) - bawrite(bp); - else if (error = bwrite(bp)) + buf_bawrite(bp); + else if (error = (int)buf_bwrite(bp)) allerror = error; return (allerror); diff --git a/bsd/ufs/ffs/ffs_vnops.c b/bsd/ufs/ffs/ffs_vnops.c index 2e216c4f6..b8dec359e 100644 --- a/bsd/ufs/ffs/ffs_vnops.c +++ b/bsd/ufs/ffs/ffs_vnops.c @@ -62,11 +62,10 @@ #include <sys/kernel.h> #include <sys/file.h> #include <sys/stat.h> -#include <sys/buf.h> #include <sys/proc.h> #include <sys/conf.h> -#include <sys/mount.h> -#include <sys/vnode.h> +#include <sys/mount_internal.h> +#include <sys/vnode_internal.h> #include <sys/malloc.h> #include <sys/ubc.h> #include <sys/quota.h> @@ -77,7 +76,6 @@ #include <miscfs/specfs/specdev.h> #include <miscfs/fifofs/fifo.h> -#include <ufs/ufs/lockf.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/dir.h> @@ -96,60 +94,43 @@ /* Global vfs data structures for ufs. 
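[Editor's note: the ffs_sbupdate() hunks above show the buffer-cache conversion applied mechanically throughout this patch: direct struct buf field access gives way to opaque accessors (buf_dataptr() for b_data), bread/getblk/brelse/bawrite/bwrite become their buf_-prefixed KPI equivalents, and block numbers widen to daddr64_t. A hedged sketch of the resulting shape; example_read_block is hypothetical:]

#include <sys/buf.h>
#include <sys/ucred.h>

static int
example_read_block(vnode_t devvp, daddr64_t blkno, int size, void *dst)
{
	buf_t	bp = NULL;
	int	error;

	/* old: error = bread(devvp, blkno, size, NOCRED, &bp); */
	error = (int)buf_bread(devvp, blkno, size, NOCRED, &bp);
	if (error) {
		buf_brelse(bp);		/* old: brelse(bp); */
		return (error);
	}
	/* old: bcopy(bp->b_data, dst, size); b_data is opaque now */
	bcopy((char *)buf_dataptr(bp), dst, size);
	buf_brelse(bp);
	return (0);
}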
*/ int (**ffs_vnodeop_p)(void *); struct vnodeopv_entry_desc ffs_vnodeop_entries[] = { - { &vop_default_desc, (VOPFUNC)vn_default_error }, - { &vop_lookup_desc, (VOPFUNC)ufs_lookup }, /* lookup */ - { &vop_create_desc, (VOPFUNC)ufs_create }, /* create */ - { &vop_whiteout_desc, (VOPFUNC)ufs_whiteout }, /* whiteout */ - { &vop_mknod_desc, (VOPFUNC)ufs_mknod }, /* mknod */ - { &vop_open_desc, (VOPFUNC)ufs_open }, /* open */ - { &vop_close_desc, (VOPFUNC)ufs_close }, /* close */ - { &vop_access_desc, (VOPFUNC)ufs_access }, /* access */ - { &vop_getattr_desc, (VOPFUNC)ufs_getattr }, /* getattr */ - { &vop_setattr_desc, (VOPFUNC)ufs_setattr }, /* setattr */ - { &vop_read_desc, (VOPFUNC)ffs_read }, /* read */ - { &vop_write_desc, (VOPFUNC)ffs_write }, /* write */ - { &vop_lease_desc, (VOPFUNC)ufs_lease_check }, /* lease */ - { &vop_ioctl_desc, (VOPFUNC)ufs_ioctl }, /* ioctl */ - { &vop_select_desc, (VOPFUNC)ufs_select }, /* select */ - { &vop_revoke_desc, (VOPFUNC)ufs_revoke }, /* revoke */ - { &vop_mmap_desc, (VOPFUNC)ufs_mmap }, /* mmap */ - { &vop_fsync_desc, (VOPFUNC)ffs_fsync }, /* fsync */ - { &vop_seek_desc, (VOPFUNC)ufs_seek }, /* seek */ - { &vop_remove_desc, (VOPFUNC)ufs_remove }, /* remove */ - { &vop_link_desc, (VOPFUNC)ufs_link }, /* link */ - { &vop_rename_desc, (VOPFUNC)ufs_rename }, /* rename */ - { &vop_mkdir_desc, (VOPFUNC)ufs_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (VOPFUNC)ufs_rmdir }, /* rmdir */ - { &vop_symlink_desc, (VOPFUNC)ufs_symlink }, /* symlink */ - { &vop_readdir_desc, (VOPFUNC)ufs_readdir }, /* readdir */ - { &vop_readlink_desc, (VOPFUNC)ufs_readlink }, /* readlink */ - { &vop_abortop_desc, (VOPFUNC)nop_abortop }, /* abortop */ - { &vop_inactive_desc, (VOPFUNC)ufs_inactive }, /* inactive */ - { &vop_reclaim_desc, (VOPFUNC)ffs_reclaim }, /* reclaim */ - { &vop_lock_desc, (VOPFUNC)ufs_lock }, /* lock */ - { &vop_unlock_desc, (VOPFUNC)ufs_unlock }, /* unlock */ - { &vop_bmap_desc, (VOPFUNC)ufs_bmap }, /* bmap */ - { &vop_strategy_desc, (VOPFUNC)ufs_strategy }, /* strategy */ - { &vop_print_desc, (VOPFUNC)ufs_print }, /* print */ - { &vop_islocked_desc, (VOPFUNC)ufs_islocked }, /* islocked */ - { &vop_pathconf_desc, (VOPFUNC)ufs_pathconf }, /* pathconf */ - { &vop_advlock_desc, (VOPFUNC)ufs_advlock }, /* advlock */ - { &vop_blkatoff_desc, (VOPFUNC)ffs_blkatoff }, /* blkatoff */ - { &vop_valloc_desc, (VOPFUNC)ffs_valloc }, /* valloc */ - { &vop_reallocblks_desc, (VOPFUNC)ffs_reallocblks }, /* reallocblks */ - { &vop_vfree_desc, (VOPFUNC)ffs_vfree }, /* vfree */ - { &vop_truncate_desc, (VOPFUNC)ffs_truncate }, /* truncate */ - { &vop_update_desc, (VOPFUNC)ffs_update }, /* update */ - { &vop_bwrite_desc, (VOPFUNC)vn_bwrite }, - { &vop_pagein_desc, (VOPFUNC)ffs_pagein }, /* Pagein */ - { &vop_pageout_desc, (VOPFUNC)ffs_pageout }, /* Pageout */ - { &vop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copy File */ - { &vop_blktooff_desc, (VOPFUNC)ffs_blktooff }, /* blktooff */ - { &vop_offtoblk_desc, (VOPFUNC)ffs_offtoblk }, /* offtoblk */ - { &vop_cmap_desc, (VOPFUNC)ufs_cmap }, /* cmap */ - { &vop_getattrlist_desc, (VOPFUNC)ufs_getattrlist }, /* getattrlist */ - { &vop_setattrlist_desc, (VOPFUNC)ufs_setattrlist }, /* setattrlist */ - { &vop_kqfilt_add_desc, (VOPFUNC)ufs_kqfilt_add }, /* kqfilt_add */ + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)ufs_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)ufs_create }, /* create */ + { &vnop_whiteout_desc, (VOPFUNC)ufs_whiteout }, /* whiteout */ + { &vnop_mknod_desc, (VOPFUNC)ufs_mknod }, 
/* mknod */ + { &vnop_open_desc, (VOPFUNC)ufs_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)ufs_close }, /* close */ + { &vnop_getattr_desc, (VOPFUNC)ufs_getattr }, /* getattr */ + { &vnop_setattr_desc, (VOPFUNC)ufs_setattr }, /* setattr */ + { &vnop_read_desc, (VOPFUNC)ffs_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)ffs_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)ufs_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)ufs_select }, /* select */ + { &vnop_revoke_desc, (VOPFUNC)ufs_revoke }, /* revoke */ + { &vnop_mmap_desc, (VOPFUNC)ufs_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)ffs_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)ufs_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)ufs_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)ufs_rename }, /* rename */ + { &vnop_mkdir_desc, (VOPFUNC)ufs_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)ufs_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)ufs_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)ufs_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)ufs_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)ufs_inactive }, /* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)ffs_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)ufs_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)ufs_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ + { &vnop_bwrite_desc, (VOPFUNC)vn_bwrite }, + { &vnop_pagein_desc, (VOPFUNC)ffs_pagein }, /* Pagein */ + { &vnop_pageout_desc, (VOPFUNC)ffs_pageout }, /* Pageout */ + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copy File */ + { &vnop_blktooff_desc, (VOPFUNC)ffs_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)ffs_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (VOPFUNC)ufs_blockmap }, /* blockmap */ + { &vnop_kqfilt_add_desc, (VOPFUNC)ufs_kqfilt_add }, /* kqfilt_add */ { (struct vnodeop_desc*)NULL, (int(*)())NULL } }; struct vnodeopv_desc ffs_vnodeop_opv_desc = @@ -157,57 +138,42 @@ struct vnodeopv_desc ffs_vnodeop_opv_desc = int (**ffs_specop_p)(void *); struct vnodeopv_entry_desc ffs_specop_entries[] = { - { &vop_default_desc, (VOPFUNC)vn_default_error }, - { &vop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */ - { &vop_create_desc, (VOPFUNC)spec_create }, /* create */ - { &vop_mknod_desc, (VOPFUNC)spec_mknod }, /* mknod */ - { &vop_open_desc, (VOPFUNC)spec_open }, /* open */ - { &vop_close_desc, (VOPFUNC)ufsspec_close }, /* close */ - { &vop_access_desc, (VOPFUNC)ufs_access }, /* access */ - { &vop_getattr_desc, (VOPFUNC)ufs_getattr }, /* getattr */ - { &vop_setattr_desc, (VOPFUNC)ufs_setattr }, /* setattr */ - { &vop_read_desc, (VOPFUNC)ufsspec_read }, /* read */ - { &vop_write_desc, (VOPFUNC)ufsspec_write }, /* write */ - { &vop_lease_desc, (VOPFUNC)spec_lease_check }, /* lease */ - { &vop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */ - { &vop_select_desc, (VOPFUNC)spec_select }, /* select */ - { &vop_revoke_desc, (VOPFUNC)spec_revoke }, /* revoke */ - { &vop_mmap_desc, (VOPFUNC)spec_mmap }, /* mmap */ - { &vop_fsync_desc, (VOPFUNC)ffs_fsync }, /* fsync */ - { &vop_seek_desc, (VOPFUNC)spec_seek }, /* seek */ - { &vop_remove_desc, (VOPFUNC)spec_remove }, /* remove */ - { &vop_link_desc, (VOPFUNC)spec_link }, /* link */ - { &vop_rename_desc, (VOPFUNC)spec_rename }, /* rename */ - { &vop_mkdir_desc, (VOPFUNC)spec_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (VOPFUNC)spec_rmdir }, /* rmdir */ - { &vop_symlink_desc, (VOPFUNC)spec_symlink }, /* symlink */ - { 
&vop_readdir_desc, (VOPFUNC)spec_readdir }, /* readdir */ - { &vop_readlink_desc, (VOPFUNC)spec_readlink }, /* readlink */ - { &vop_abortop_desc, (VOPFUNC)spec_abortop }, /* abortop */ - { &vop_inactive_desc, (VOPFUNC)ufs_inactive }, /* inactive */ - { &vop_reclaim_desc, (VOPFUNC)ffs_reclaim }, /* reclaim */ - { &vop_lock_desc, (VOPFUNC)ufs_lock }, /* lock */ - { &vop_unlock_desc, (VOPFUNC)ufs_unlock }, /* unlock */ - { &vop_bmap_desc, (VOPFUNC)spec_bmap }, /* bmap */ - { &vop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */ - { &vop_print_desc, (VOPFUNC)ufs_print }, /* print */ - { &vop_islocked_desc, (VOPFUNC)ufs_islocked }, /* islocked */ - { &vop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */ - { &vop_advlock_desc, (VOPFUNC)spec_advlock }, /* advlock */ - { &vop_blkatoff_desc, (VOPFUNC)spec_blkatoff }, /* blkatoff */ - { &vop_valloc_desc, (VOPFUNC)spec_valloc }, /* valloc */ - { &vop_reallocblks_desc, (VOPFUNC)spec_reallocblks }, /* reallocblks */ - { &vop_vfree_desc, (VOPFUNC)ffs_vfree }, /* vfree */ - { &vop_truncate_desc, (VOPFUNC)spec_truncate }, /* truncate */ - { &vop_update_desc, (VOPFUNC)ffs_update }, /* update */ - { &vop_bwrite_desc, (VOPFUNC)vn_bwrite }, - { &vop_devblocksize_desc, (VOPFUNC)spec_devblocksize }, /* devblocksize */ - { &vop_pagein_desc, (VOPFUNC)ffs_pagein }, /* Pagein */ - { &vop_pageout_desc, (VOPFUNC)ffs_pageout }, /* Pageout */ - { &vop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copy File */ - { &vop_blktooff_desc, (VOPFUNC)ffs_blktooff }, /* blktooff */ - { &vop_offtoblk_desc, (VOPFUNC)ffs_offtoblk }, /* offtoblk */ - { &vop_cmap_desc, (VOPFUNC)spec_cmap }, /* cmap */ + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)spec_create }, /* create */ + { &vnop_mknod_desc, (VOPFUNC)spec_mknod }, /* mknod */ + { &vnop_open_desc, (VOPFUNC)spec_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)ufsspec_close }, /* close */ + { &vnop_getattr_desc, (VOPFUNC)ufs_getattr }, /* getattr */ + { &vnop_setattr_desc, (VOPFUNC)ufs_setattr }, /* setattr */ + { &vnop_read_desc, (VOPFUNC)ufsspec_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)ufsspec_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)spec_select }, /* select */ + { &vnop_revoke_desc, (VOPFUNC)spec_revoke }, /* revoke */ + { &vnop_mmap_desc, (VOPFUNC)spec_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)ffs_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)spec_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)spec_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)spec_rename }, /* rename */ + { &vnop_mkdir_desc, (VOPFUNC)spec_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)spec_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)spec_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)spec_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)spec_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)ufs_inactive }, /* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)ffs_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ + { &vnop_bwrite_desc, (VOPFUNC)vn_bwrite }, + { &vnop_devblocksize_desc, (VOPFUNC)spec_devblocksize }, /* devblocksize */ + { &vnop_pagein_desc, (VOPFUNC)ffs_pagein }, /* Pagein */ + { &vnop_pageout_desc, (VOPFUNC)ffs_pageout }, /* 
Pageout */ + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copy File */ + { &vnop_blktooff_desc, (VOPFUNC)ffs_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)ffs_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap }, /* blockmap */ { (struct vnodeop_desc*)NULL, (int(*)())NULL } }; struct vnodeopv_desc ffs_specop_opv_desc = @@ -216,57 +182,42 @@ struct vnodeopv_desc ffs_specop_opv_desc = #if FIFO int (**ffs_fifoop_p)(void *); struct vnodeopv_entry_desc ffs_fifoop_entries[] = { - { &vop_default_desc, (VOPFUNC)vn_default_error }, - { &vop_lookup_desc, (VOPFUNC)fifo_lookup }, /* lookup */ - { &vop_create_desc, (VOPFUNC)fifo_create }, /* create */ - { &vop_mknod_desc, (VOPFUNC)fifo_mknod }, /* mknod */ - { &vop_open_desc, (VOPFUNC)fifo_open }, /* open */ - { &vop_close_desc, (VOPFUNC)ufsfifo_close }, /* close */ - { &vop_access_desc, (VOPFUNC)ufs_access }, /* access */ - { &vop_getattr_desc, (VOPFUNC)ufs_getattr }, /* getattr */ - { &vop_setattr_desc, (VOPFUNC)ufs_setattr }, /* setattr */ - { &vop_read_desc, (VOPFUNC)ufsfifo_read }, /* read */ - { &vop_write_desc, (VOPFUNC)ufsfifo_write }, /* write */ - { &vop_lease_desc, (VOPFUNC)fifo_lease_check }, /* lease */ - { &vop_ioctl_desc, (VOPFUNC)fifo_ioctl }, /* ioctl */ - { &vop_select_desc, (VOPFUNC)fifo_select }, /* select */ - { &vop_revoke_desc, (VOPFUNC)fifo_revoke }, /* revoke */ - { &vop_mmap_desc, (VOPFUNC)fifo_mmap }, /* mmap */ - { &vop_fsync_desc, (VOPFUNC)ffs_fsync }, /* fsync */ - { &vop_seek_desc, (VOPFUNC)fifo_seek }, /* seek */ - { &vop_remove_desc, (VOPFUNC)fifo_remove }, /* remove */ - { &vop_link_desc, (VOPFUNC)fifo_link }, /* link */ - { &vop_rename_desc, (VOPFUNC)fifo_rename }, /* rename */ - { &vop_mkdir_desc, (VOPFUNC)fifo_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (VOPFUNC)fifo_rmdir }, /* rmdir */ - { &vop_symlink_desc, (VOPFUNC)fifo_symlink }, /* symlink */ - { &vop_readdir_desc, (VOPFUNC)fifo_readdir }, /* readdir */ - { &vop_readlink_desc, (VOPFUNC)fifo_readlink }, /* readlink */ - { &vop_abortop_desc, (VOPFUNC)fifo_abortop }, /* abortop */ - { &vop_inactive_desc, (VOPFUNC)ufs_inactive }, /* inactive */ - { &vop_reclaim_desc, (VOPFUNC)ffs_reclaim }, /* reclaim */ - { &vop_lock_desc, (VOPFUNC)ufs_lock }, /* lock */ - { &vop_unlock_desc, (VOPFUNC)ufs_unlock }, /* unlock */ - { &vop_bmap_desc, (VOPFUNC)fifo_bmap }, /* bmap */ - { &vop_strategy_desc, (VOPFUNC)fifo_strategy }, /* strategy */ - { &vop_print_desc, (VOPFUNC)ufs_print }, /* print */ - { &vop_islocked_desc, (VOPFUNC)ufs_islocked }, /* islocked */ - { &vop_pathconf_desc, (VOPFUNC)fifo_pathconf }, /* pathconf */ - { &vop_advlock_desc, (VOPFUNC)fifo_advlock }, /* advlock */ - { &vop_blkatoff_desc, (VOPFUNC)fifo_blkatoff }, /* blkatoff */ - { &vop_valloc_desc, (VOPFUNC)fifo_valloc }, /* valloc */ - { &vop_reallocblks_desc, (VOPFUNC)fifo_reallocblks }, /* reallocblks */ - { &vop_vfree_desc, (VOPFUNC)ffs_vfree }, /* vfree */ - { &vop_truncate_desc, (VOPFUNC)fifo_truncate }, /* truncate */ - { &vop_update_desc, (VOPFUNC)ffs_update }, /* update */ - { &vop_bwrite_desc, (VOPFUNC)vn_bwrite }, - { &vop_pagein_desc, (VOPFUNC)ffs_pagein }, /* Pagein */ - { &vop_pageout_desc, (VOPFUNC)ffs_pageout }, /* Pageout */ - { &vop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copy File */ - { &vop_blktooff_desc, (VOPFUNC)ffs_blktooff }, /* blktooff */ - { &vop_offtoblk_desc, (VOPFUNC)ffs_offtoblk }, /* offtoblk */ - { &vop_cmap_desc, (VOPFUNC)ufs_cmap }, /* cmap */ - { &vop_kqfilt_add_desc, (VOPFUNC)ufsfifo_kqfilt_add }, /* kqfilt_add */ + { 
&vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)fifo_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)fifo_create }, /* create */ + { &vnop_mknod_desc, (VOPFUNC)fifo_mknod }, /* mknod */ + { &vnop_open_desc, (VOPFUNC)fifo_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)ufsfifo_close }, /* close */ + { &vnop_getattr_desc, (VOPFUNC)ufs_getattr }, /* getattr */ + { &vnop_setattr_desc, (VOPFUNC)ufs_setattr }, /* setattr */ + { &vnop_read_desc, (VOPFUNC)ufsfifo_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)ufsfifo_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)fifo_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)fifo_select }, /* select */ + { &vnop_revoke_desc, (VOPFUNC)fifo_revoke }, /* revoke */ + { &vnop_mmap_desc, (VOPFUNC)fifo_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)ffs_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)fifo_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)fifo_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)fifo_rename }, /* rename */ + { &vnop_mkdir_desc, (VOPFUNC)fifo_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)fifo_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)fifo_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)fifo_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)fifo_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)ufs_inactive }, /* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)ffs_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)fifo_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)fifo_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ + { &vnop_bwrite_desc, (VOPFUNC)vn_bwrite }, + { &vnop_pagein_desc, (VOPFUNC)ffs_pagein }, /* Pagein */ + { &vnop_pageout_desc, (VOPFUNC)ffs_pageout }, /* Pageout */ + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copy File */ + { &vnop_blktooff_desc, (VOPFUNC)ffs_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)ffs_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (VOPFUNC)ufs_blockmap }, /* blockmap */ + { &vnop_kqfilt_add_desc, (VOPFUNC)ufsfifo_kqfilt_add }, /* kqfilt_add */ { (struct vnodeop_desc*)NULL, (int(*)())NULL } }; struct vnodeopv_desc ffs_fifoop_opv_desc = @@ -287,82 +238,34 @@ int doclusterwrite = 0; /* ARGSUSED */ int ffs_fsync(ap) - struct vop_fsync_args /* { + struct vnop_fsync_args /* { struct vnode *a_vp; - struct ucred *a_cred; int a_waitfor; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp = ap->a_vp; - register struct buf *bp; + return(ffs_fsync_internal(ap->a_vp, ap->a_waitfor)); +} + + +int +ffs_fsync_internal(vnode_t vp, int waitfor) +{ struct timeval tv; - struct buf *nbp; - int s; - struct inode *ip = VTOI(vp); - int retry = 0; + int wait = (waitfor == MNT_WAIT); /* * Write out any clusters. */ - cluster_push(vp); + cluster_push(vp, 0); /* * Flush all dirty buffers associated with a vnode. */ -loop: - s = splbio(); - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - if ((bp->b_flags & B_BUSY)) - continue; - if ((bp->b_flags & B_DELWRI) == 0) - panic("ffs_fsync: not dirty"); - bremfree(bp); - bp->b_flags |= B_BUSY; - splx(s); - /* - * Wait for I/O associated with indirect blocks to complete, - * since there is no way to quickly wait for them below. 
- */ - if (bp->b_vp == vp || ap->a_waitfor == MNT_NOWAIT) - (void) bawrite(bp); - else - (void) bwrite(bp); - goto loop; - } - - if (ap->a_waitfor == MNT_WAIT) { - while (vp->v_numoutput) { - vp->v_flag |= VBWAIT; - tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "ffs_fsync", 0); - } - - if (vp->v_dirtyblkhd.lh_first) { - /* still have some dirty buffers */ - if (retry++ > 10) { - vprint("ffs_fsync: dirty", vp); - splx(s); - /* - * Looks like the requests are not - * getting queued to the driver. - * Retrying here causes a cpu bound loop. - * Yield to the other threads and hope - * for the best. - */ - (void)tsleep((caddr_t)&vp->v_numoutput, - PRIBIO + 1, "ffs_fsync", hz/10); - retry = 0; - } else { - splx(s); - } - /* try again */ - goto loop; - } - } - splx(s); - tv = time; - return (VOP_UPDATE(ap->a_vp, &tv, &tv, ap->a_waitfor == MNT_WAIT)); + buf_flushdirtyblks(vp, wait, 0, (char *)"ffs_fsync"); + microtime(&tv); + + return (ffs_update(vp, &tv, &tv, wait)); } /* @@ -370,71 +273,63 @@ loop: */ int ffs_reclaim(ap) - struct vop_reclaim_args /* { + struct vnop_reclaim_args /* { struct vnode *a_vp; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { register struct vnode *vp = ap->a_vp; int error; - if (error = ufs_reclaim(vp, ap->a_p)) + if ( (error = ufs_reclaim(vp, vfs_context_proc(ap->a_context))) ) return (error); - FREE_ZONE(vp->v_data, sizeof (struct inode), - VFSTOUFS(vp->v_mount)->um_devvp->v_tag == VT_MFS ? - M_MFSNODE : M_FFSNODE); - vp->v_data = NULL; + + FREE_ZONE(vnode_fsnode(vp), sizeof (struct inode), M_FFSNODE); + + vnode_clearfsnode(vp); + return (0); } /* Blktooff converts a logical block number to a file offset */ int ffs_blktooff(ap) - struct vop_blktooff_args /* { + struct vnop_blktooff_args /* { struct vnode *a_vp; - daddr_t a_lblkno; + daddr64_t a_lblkno; off_t *a_offset; } */ *ap; { register struct inode *ip; register FS *fs; - ufs_daddr_t bn; - if (ap->a_vp == NULL) return (EINVAL); - ip = VTOI(ap->a_vp); - fs = ip->I_FS; - bn = ap->a_lblkno; + fs = VTOI(ap->a_vp)->I_FS; - if ((long)bn < 0) { - panic("-ve blkno in ffs_blktooff"); - bn = -(long)bn; - } + *ap->a_offset = (off_t)lblktosize(fs, ap->a_lblkno); - *ap->a_offset = (off_t)lblktosize(fs, bn); return (0); } /* Blktooff converts a logical block number to a file offset */ int ffs_offtoblk(ap) - struct vop_offtoblk_args /* { + struct vnop_offtoblk_args /* { struct vnode *a_vp; off_t a_offset; - daddr_t *a_lblkno; + daddr64_t *a_lblkno; } */ *ap; { - register struct inode *ip; - register FS *fs; + register FS *fs; if (ap->a_vp == NULL) return (EINVAL); - ip = VTOI(ap->a_vp); - fs = ip->I_FS; + fs = VTOI(ap->a_vp)->I_FS; + + *ap->a_lblkno = (daddr64_t)lblkno(fs, ap->a_offset); - *ap->a_lblkno = (daddr_t)lblkno(fs, ap->a_offset); return (0); } diff --git a/bsd/ufs/ufs/Makefile b/bsd/ufs/ufs/Makefile index 5717ecbea..08c53815e 100644 --- a/bsd/ufs/ufs/Makefile +++ b/bsd/ufs/ufs/Makefile @@ -20,7 +20,7 @@ EXPINC_SUBDIRS_PPC = \ EXPINC_SUBDIRS_I386 = \ DATAFILES = \ - dinode.h dir.h inode.h lockf.h quota.h ufs_extern.h ufsmount.h + dinode.h dir.h inode.h quota.h ufs_extern.h ufsmount.h INSTALL_MI_LIST = ${DATAFILES} diff --git a/bsd/ufs/ufs/inode.h b/bsd/ufs/ufs/inode.h index ef40be739..37a5fc619 100644 --- a/bsd/ufs/ufs/inode.h +++ b/bsd/ufs/ufs/inode.h @@ -67,6 +67,7 @@ #ifdef __APPLE_API_PRIVATE #include <ufs/ufs/dir.h> #include <ufs/ufs/dinode.h> +#include <sys/queue.h> #include <sys/event.h> #include <sys/lock.h> #include <sys/quota.h> @@ -96,8 +97,8 @@ struct inode { struct klist i_knotes; /* knotes 
attached to this vnode */ struct dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */ u_quad_t i_modrev; /* Revision level for NFS lease. */ - struct lockf *i_lockf;/* Head of byte-level lock list. */ - struct lock__bsd__ i_lock; /* Inode lock. */ + void *i_lockf; /* DEPRECATED */ + /* * Side effects; used during directory lookup. */ @@ -107,6 +108,7 @@ struct inode { doff_t i_offset; /* Offset of free space in directory. */ ino_t i_ino; /* Inode number of found directory. */ u_int32_t i_reclen; /* Size of found directory entry. */ + daddr_t i_lastr; /* last read... read-ahead */ /* * The on-disk dinode itself. */ @@ -173,17 +175,15 @@ struct indir { (ip)->i_modrev++; \ } \ if ((ip)->i_flag & IN_CHANGE) \ - (ip)->i_ctime = time.tv_sec; \ + (ip)->i_ctime = (t2)->tv_sec; \ (ip)->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); \ } \ } #define VN_KNOTE(vp, hint) KNOTE(&VTOI(vp)->i_knotes, (hint)) -/* This overlays the fid structure (see mount.h). */ +/* This overlays the FileID portion of NFS file handles. */ struct ufid { - u_int16_t ufid_len; /* Length of structure. */ - u_int16_t ufid_pad; /* Force 32-bit alignment. */ ino_t ufid_ino; /* File number (ino). */ int32_t ufid_gen; /* Generation number. */ }; diff --git a/bsd/ufs/ufs/lockf.h b/bsd/ufs/ufs/lockf.h deleted file mode 100644 index 92121a1f5..000000000 --- a/bsd/ufs/ufs/lockf.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1991, 1993 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Scooter Morris at Genentech Inc. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. 
Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)lockf.h 8.2 (Berkeley) 10/26/94 - */ -#ifndef _UFS_LOCKF_H_ -#define _UFS_LOCKF_H_ - -#include <sys/appleapiopts.h> - -#ifdef __APPLE_API_PRIVATE -/* - * The lockf structure is a kernel structure which contains the information - * associated with a byte range lock. The lockf structures are linked into - * the inode structure. Locks are sorted by the starting byte of the lock for - * efficiency. - */ -TAILQ_HEAD(locklist, lockf); - -struct lockf { - short lf_flags; /* Semantics: F_POSIX, F_FLOCK, F_WAIT */ - short lf_type; /* Lock type: F_RDLCK, F_WRLCK */ - off_t lf_start; /* Byte # of the start of the lock */ - off_t lf_end; /* Byte # of the end of the lock (-1=EOF) */ - caddr_t lf_id; /* Id of the resource holding the lock */ - struct inode *lf_inode; /* Back pointer to the inode */ - struct lockf *lf_next; /* Pointer to the next lock on this inode */ - struct locklist lf_blkhd; /* List of requests blocked on this lock */ - TAILQ_ENTRY(lockf) lf_block;/* A request waiting for a lock */ -}; - -/* Maximum length of sleep chains to traverse to try and detect deadlock. */ -#define MAXDEPTH 50 - -__BEGIN_DECLS -void lf_addblock __P((struct lockf *, struct lockf *)); -int lf_clearlock __P((struct lockf *)); -int lf_findoverlap __P((struct lockf *, - struct lockf *, int, struct lockf ***, struct lockf **)); -struct lockf * - lf_getblock __P((struct lockf *)); -int lf_getlock __P((struct lockf *, struct flock *)); -int lf_setlock __P((struct lockf *)); -void lf_split __P((struct lockf *, struct lockf *)); -void lf_wakelock __P((struct lockf *)); -__END_DECLS - -#ifdef LOCKF_DEBUG -extern int lockf_debug; - -__BEGIN_DECLS -void lf_print __P((char *, struct lockf *)); -void lf_printlist __P((char *, struct lockf *)); -__END_DECLS -#endif - -#endif /* __APPLE_API_PRIVATE */ -#endif /* ! 
_UFS_LOCKF_H_ */ - diff --git a/bsd/ufs/ufs/quota.h b/bsd/ufs/ufs/quota.h index 13de74d26..f48b7f8f5 100644 --- a/bsd/ufs/ufs/quota.h +++ b/bsd/ufs/ufs/quota.h @@ -76,23 +76,23 @@ struct mount; struct proc; struct ucred; __BEGIN_DECLS -int chkdq __P((struct inode *, int64_t, struct ucred *, int)); -int chkdqchg __P((struct inode *, int64_t, struct ucred *, int)); -int chkiq __P((struct inode *, long, struct ucred *, int)); -int chkiqchg __P((struct inode *, long, struct ucred *, int)); -int getinoquota __P((struct inode *)); -int getquota __P((struct mount *, u_long, int, caddr_t)); -int qsync __P((struct mount *mp)); -int quotaoff __P((struct proc *, struct mount *, int)); -int quotaon __P((struct proc *, struct mount *, int, caddr_t, enum uio_seg)); -int setquota __P((struct mount *, u_long, int, caddr_t)); -int setuse __P((struct mount *, u_long, int, caddr_t)); -int ufs_quotactl __P((struct mount *, int, uid_t, caddr_t, struct proc *)); +int chkdq(struct inode *, int64_t, struct ucred *, int); +int chkdqchg(struct inode *, int64_t, struct ucred *, int); +int chkiq(struct inode *, long, struct ucred *, int); +int chkiqchg(struct inode *, long, struct ucred *, int); +int getinoquota(struct inode *); +int getquota(struct mount *, u_long, int, caddr_t); +int qsync(struct mount *mp); +int quotaoff(struct mount *, int); +int quotaon(vfs_context_t, struct mount *, int, caddr_t); +int setquota(struct mount *, u_long, int, caddr_t); +int setuse(struct mount *, u_long, int, caddr_t); +int ufs_quotactl(struct mount *, int, uid_t, caddr_t, vfs_context_t); __END_DECLS #if DIAGNOSTIC __BEGIN_DECLS -void chkdquot __P((struct inode *)); +void chkdquot(struct inode *); __END_DECLS #endif #endif /* KERNEL */ diff --git a/bsd/ufs/ufs/ufs_attrlist.c b/bsd/ufs/ufs/ufs_attrlist.c index 161391c1d..bea11d309 100644 --- a/bsd/ufs/ufs/ufs_attrlist.c +++ b/bsd/ufs/ufs/ufs_attrlist.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved. 
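[Editor's note: the quota.h hunk above also retires the __P() compatibility macro, which expanded to its parenthesized argument list on ANSI compilers and to an empty list on K&R ones. With K&R support gone, the declarations become plain ANSI C prototypes, e.g. for the first entry in the hunk:]

/* old, K&R-compatible form: */
int chkdq __P((struct inode *, int64_t, struct ucred *, int));

/* new, plain ANSI C prototype: */
int chkdq(struct inode *, int64_t, struct ucred *, int);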
* * @APPLE_LICENSE_HEADER_START@ * @@ -28,40 +28,18 @@ #include <sys/types.h> #include <sys/systm.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/malloc.h> #include <sys/attr.h> #include <sys/kernel.h> +#include <sys/kauth.h> #include <architecture/byte_order.h> #include <ufs/ufs/dinode.h> #include <ufs/ffs/fs.h> +#include <sys/mount_internal.h> #include "ufsmount.h" -/* -12345678901234567890123456789012345678901234567890123456789012345678901234567890 -*/ -enum { - UFS_ATTR_CMN_NATIVE = 0, - UFS_ATTR_CMN_SUPPORTED = 0, - UFS_ATTR_VOL_NATIVE = ATTR_VOL_NAME | - ATTR_VOL_CAPABILITIES | - ATTR_VOL_ATTRIBUTES, - UFS_ATTR_VOL_SUPPORTED = UFS_ATTR_VOL_NATIVE, - UFS_ATTR_DIR_NATIVE = 0, - UFS_ATTR_DIR_SUPPORTED = 0, - UFS_ATTR_FILE_NATIVE = 0, - UFS_ATTR_FILE_SUPPORTED = 0, - UFS_ATTR_FORK_NATIVE = 0, - UFS_ATTR_FORK_SUPPORTED = 0, - - UFS_ATTR_CMN_SETTABLE = 0, - UFS_ATTR_VOL_SETTABLE = ATTR_VOL_NAME, - UFS_ATTR_DIR_SETTABLE = 0, - UFS_ATTR_FILE_SETTABLE = 0, - UFS_ATTR_FORK_SETTABLE = 0 -}; - static char ufs_label_magic[4] = UFS_LABEL_MAGIC; /* Copied from diskdev_cmds/disklib/ufslabel.c */ @@ -90,8 +68,8 @@ reduce(int *sum) } /* Copied from diskdev_cmds/disklib/ufslabel.c */ -static unsigned short -in_cksum(void *data, int len) +__private_extern__ unsigned short +ul_cksum(void *data, int len) { u_short *w; int sum; @@ -135,7 +113,7 @@ in_cksum(void *data, int len) } /* Adapted from diskdev_cmds/disklib/ufslabel.c */ -static boolean_t +__private_extern__ boolean_t ufs_label_check(struct ufslabel *ul_p) { u_int16_t calc; @@ -166,7 +144,7 @@ ufs_label_check(struct ufslabel *ul_p) checksum = ul_p->ul_checksum; /* Remember previous checksum. */ ul_p->ul_checksum = 0; - calc = in_cksum(ul_p, sizeof(*ul_p)); + calc = ul_cksum(ul_p, sizeof(*ul_p)); if (calc != checksum) { #ifdef DEBUG printf("ufslabel_check: label checksum %x (should be %x)\n", @@ -177,632 +155,16 @@ ufs_label_check(struct ufslabel *ul_p) return (TRUE); } -static void +__private_extern__ void ufs_label_init(struct ufslabel *ul_p) { + struct timeval tv; + + microtime(&tv); + bzero(ul_p, sizeof(*ul_p)); ul_p->ul_version = htonl(UFS_LABEL_VERSION); bcopy(ufs_label_magic, &ul_p->ul_magic, sizeof(ul_p->ul_magic)); - ul_p->ul_time = htonl(time.tv_sec); -} - -static int -ufs_get_label(struct vnode *vp, struct ucred *cred, char *label, - int *name_length) -{ - int error; - int devBlockSize; - struct mount *mp; - struct vnode *devvp; - struct buf *bp; - struct ufslabel *ulp; - - mp = vp->v_mount; - devvp = VFSTOUFS(mp)->um_devvp; - VOP_DEVBLOCKSIZE(devvp, &devBlockSize); - - if (error = meta_bread(devvp, (ufs_daddr_t)(UFS_LABEL_OFFSET / devBlockSize), - UFS_LABEL_SIZE, cred, &bp)) - goto out; - - /* - * Since the disklabel is read directly by older user space code, - * make sure this buffer won't remain in the cache when we release it. - * - * It would be better if that user space code was modified to get - * at the fields of the disklabel via the filesystem (such as - * getattrlist). 
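[Editor's note: the in_cksum()-to-ul_cksum() rename above exports the label checksum helper. Judging from the visible code (u_short words summed into an int, then reduced), it is the familiar 16-bit one's-complement sum; a standalone sketch of that checksum family, under that assumption (example_cksum16 is hypothetical):]

#include <stdint.h>
#include <stddef.h>

static uint16_t
example_cksum16(const void *data, size_t len)
{
	const uint16_t *w = (const uint16_t *)data;
	uint32_t sum = 0;

	while (len > 1) {		/* sum the 16-bit words */
		sum += *w++;
		len -= 2;
	}
	if (len)			/* odd trailing byte */
		sum += *(const uint8_t *)w;
	while (sum >> 16)		/* fold carries back into 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)~sum);
}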
- */ - SET(bp->b_flags, B_NOCACHE); - - ulp = (struct ufslabel *) bp->b_data; - if (ufs_label_check(ulp)) { - int length; - /* Copy the name out */ - length = ulp->ul_namelen; -#if REV_ENDIAN_FS - if (mp->mnt_flag & MNT_REVEND) - length = NXSwapShort(length); -#endif - if (length > 0 && length <= UFS_MAX_LABEL_NAME) { - bcopy(ulp->ul_name, label, length); - *name_length = length; - } else { - /* Return an empty name */ - *label = '\0'; - *name_length = 0; - } - } - -out: - if (bp) - brelse(bp); - return error; -} - -static int ufs_set_label(struct vnode *vp, struct ucred *cred, - const char *label, int name_length) -{ - int error; - int devBlockSize; - struct mount *mp; - struct vnode *devvp; - struct buf *bp; - struct ufslabel *ulp; - - mp = vp->v_mount; - - /* Validate the new name's length */ - if (name_length < 0 || name_length > UFS_MAX_LABEL_NAME) - return EINVAL; - - /* Read UFS_LABEL_SIZE bytes at UFS_LABEL_OFFSET */ - devvp = VFSTOUFS(mp)->um_devvp; - VOP_DEVBLOCKSIZE(devvp, &devBlockSize); - if (error = meta_bread(devvp, (ufs_daddr_t)(UFS_LABEL_OFFSET / devBlockSize), - UFS_LABEL_SIZE, cred, &bp)) - goto out; - - /* - * Since the disklabel is read directly by older user space code, - * make sure this buffer won't remain in the cache when we release it. - * - * It would be better if that user space code was modified to get - * at the fields of the disklabel via the filesystem (such as - * getattrlist). - */ - SET(bp->b_flags, B_NOCACHE); - - /* Validate the label structure; init if not valid */ - ulp = (struct ufslabel *) bp->b_data; - if (!ufs_label_check(ulp)) - ufs_label_init(ulp); - - /* Copy new name over existing name */ - ulp->ul_namelen = name_length; -#if REV_ENDIAN_FS - if (mp->mnt_flag & MNT_REVEND) - ulp->ul_namelen = NXSwapShort(ulp->ul_namelen); -#endif - bcopy(label, ulp->ul_name, name_length); - - /* Update the checksum */ - ulp->ul_checksum = 0; - ulp->ul_checksum = in_cksum(ulp, sizeof(*ulp)); - - /* Write the label back to disk */ - bwrite(bp); - bp = NULL; - -out: - if (bp) - brelse(bp); - return error; -} - -/* - * Pack a C-style string into an attribute buffer. Returns the new varptr. - */ -static void * -packstr(char *s, void *attrptr, void *varptr) -{ - struct attrreference *ref = attrptr; - u_long length; - - length = strlen(s) + 1; /* String, plus terminator */ - - /* - * In the fixed-length part of buffer, store the offset and length of - * the variable-length data. - */ - ref->attr_dataoffset = (u_int8_t *)varptr - (u_int8_t *)attrptr; - ref->attr_length = length; - - /* Copy the string to variable-length part of buffer */ - (void) strncpy((unsigned char *)varptr, s, length); - - /* Advance pointer past string, and round up to multiple of 4 bytes */ - return (char *)varptr + ((length + 3) & ~3); + ul_p->ul_time = htonl(tv.tv_sec); } -/* - * Pack an unterminated string into an attribute buffer as a C-style - * string. Copies the indicated number of characters followed by a - * terminating '\0'. Returns the new varptr. - */ -static void * -packtext(u_char *text, u_int text_len, void *attrptr, void *varptr) -{ - struct attrreference *ref = attrptr; - u_long length; /* of the attribute, including terminator */ - - length = text_len + 1; /* String, plus terminator */ - - /* - * In the fixed-length part of buffer, store the offset and length of - * the variable-length data. 
- */ - ref->attr_dataoffset = (u_int8_t *) varptr - (u_int8_t *) attrptr; - ref->attr_length = length; - - /* Copy the string to variable-length part of buffer */ - bcopy(text, varptr, text_len); - ((char *) varptr)[text_len] = '\0'; - - /* Advance pointer past string, and round up to multiple of 4 bytes */ - return (char *) varptr + ((length + 3) & ~3); -} - -/* - * ufs_packvolattr - * - * Pack the volume-related attributes from a getattrlist call into result - * buffers. Fields are packed in order based on the bitmap masks. - * Attributes with smaller masks are packed first. - * - * The buffer pointers are updated to point past the data that was returned. - */ -static int ufs_packvolattr( - struct vnode *vp, /* The volume's vnode */ - struct ucred *cred, - struct attrlist *alist, /* Desired attributes */ - void **attrptrptr, /* Fixed-size attributes buffer */ - void **varptrptr) /* Variable-size attributes buffer */ -{ - int error; - attrgroup_t a; - void *attrptr = *attrptrptr; - void *varptr = *varptrptr; - - a = alist->volattr; - if (a) { - if (a & ATTR_VOL_NAME) { - int length; - char name[UFS_MAX_LABEL_NAME]; - - error = ufs_get_label(vp, cred, name, &length); - if (error) - return error; - - varptr = packtext(name, length, attrptr, varptr); - ++((struct attrreference *)attrptr); - } - - if (a & ATTR_VOL_CAPABILITIES) { - vol_capabilities_attr_t *vcapattrptr; - - vcapattrptr = (vol_capabilities_attr_t *) attrptr; - - /* - * Capabilities this volume format has. Note that - * we do not set VOL_CAP_FMT_PERSISTENTOBJECTIDS. - * That's because we can't resolve an inode number - * into a directory entry (parent and name), which - * Carbon would need to support PBResolveFileIDRef. - */ - vcapattrptr->capabilities[VOL_CAPABILITIES_FORMAT] = - VOL_CAP_FMT_SYMBOLICLINKS | - VOL_CAP_FMT_HARDLINKS | - VOL_CAP_FMT_SPARSE_FILES | - VOL_CAP_FMT_CASE_SENSITIVE | - VOL_CAP_FMT_CASE_PRESERVING | - VOL_CAP_FMT_FAST_STATFS ; - vcapattrptr->capabilities[VOL_CAPABILITIES_INTERFACES] - = VOL_CAP_INT_NFSEXPORT | - VOL_CAP_INT_VOL_RENAME | - VOL_CAP_INT_ADVLOCK | - VOL_CAP_INT_FLOCK ; - vcapattrptr->capabilities[VOL_CAPABILITIES_RESERVED1] - = 0; - vcapattrptr->capabilities[VOL_CAPABILITIES_RESERVED2] - = 0; - - /* Capabilities we know about: */ - vcapattrptr->valid[VOL_CAPABILITIES_FORMAT] = - VOL_CAP_FMT_PERSISTENTOBJECTIDS | - VOL_CAP_FMT_SYMBOLICLINKS | - VOL_CAP_FMT_HARDLINKS | - VOL_CAP_FMT_JOURNAL | - VOL_CAP_FMT_JOURNAL_ACTIVE | - VOL_CAP_FMT_NO_ROOT_TIMES | - VOL_CAP_FMT_SPARSE_FILES | - VOL_CAP_FMT_ZERO_RUNS | - VOL_CAP_FMT_CASE_SENSITIVE | - VOL_CAP_FMT_CASE_PRESERVING | - VOL_CAP_FMT_FAST_STATFS ; - vcapattrptr->valid[VOL_CAPABILITIES_INTERFACES] = - VOL_CAP_INT_SEARCHFS | - VOL_CAP_INT_ATTRLIST | - VOL_CAP_INT_NFSEXPORT | - VOL_CAP_INT_READDIRATTR | - VOL_CAP_INT_EXCHANGEDATA | - VOL_CAP_INT_COPYFILE | - VOL_CAP_INT_ALLOCATE | - VOL_CAP_INT_VOL_RENAME | - VOL_CAP_INT_ADVLOCK | - VOL_CAP_INT_FLOCK ; - vcapattrptr->valid[VOL_CAPABILITIES_RESERVED1] = 0; - vcapattrptr->valid[VOL_CAPABILITIES_RESERVED2] = 0; - - ++((vol_capabilities_attr_t *)attrptr); - } - - if (a & ATTR_VOL_ATTRIBUTES) { - vol_attributes_attr_t *volattrptr; - - volattrptr = (vol_attributes_attr_t *)attrptr; - - volattrptr->validattr.commonattr = - UFS_ATTR_CMN_SUPPORTED; - volattrptr->validattr.volattr = - UFS_ATTR_VOL_SUPPORTED; - volattrptr->validattr.dirattr = - UFS_ATTR_DIR_SUPPORTED; - volattrptr->validattr.fileattr = - UFS_ATTR_FILE_SUPPORTED; - volattrptr->validattr.forkattr = - UFS_ATTR_FORK_SUPPORTED; - - 
volattrptr->nativeattr.commonattr = - UFS_ATTR_CMN_NATIVE; - volattrptr->nativeattr.volattr = - UFS_ATTR_VOL_NATIVE; - volattrptr->nativeattr.dirattr = - UFS_ATTR_DIR_NATIVE; - volattrptr->nativeattr.fileattr = - UFS_ATTR_FILE_NATIVE; - volattrptr->nativeattr.forkattr = - UFS_ATTR_FORK_NATIVE; - - ++((vol_attributes_attr_t *)attrptr); - } - } - - /* Update the buffer pointers to point past what we just returned */ - *attrptrptr = attrptr; - *varptrptr = varptr; - - return 0; -} - -/* - * Pack all attributes from a getattrlist or readdirattr call into - * the result buffer. For now, we only support volume attributes. - */ -static int -ufs_packattr(struct vnode *vp, struct ucred *cred, struct attrlist *alist, - void **attrptr, void **varptr) -{ - int error=0; - - if (alist->volattr != 0) - error = ufs_packvolattr(vp, cred, alist, attrptr, varptr); - - return error; -} - -/* - * Calculate the fixed-size space required to hold a set of attributes. - * For variable-length attributes, this will be the size of the - * attribute reference (an offset and length). - */ -static size_t -ufs_attrsize(struct attrlist *attrlist) -{ - size_t size; - attrgroup_t a = 0; - -#if ((ATTR_CMN_NAME | ATTR_CMN_DEVID | ATTR_CMN_FSID | ATTR_CMN_OBJTYPE | \ - ATTR_CMN_OBJTAG | ATTR_CMN_OBJID | ATTR_CMN_OBJPERMANENTID | \ - ATTR_CMN_PAROBJID | ATTR_CMN_SCRIPT | ATTR_CMN_CRTIME | \ - ATTR_CMN_MODTIME | ATTR_CMN_CHGTIME | ATTR_CMN_ACCTIME | \ - ATTR_CMN_BKUPTIME | ATTR_CMN_FNDRINFO | ATTR_CMN_OWNERID | \ - ATTR_CMN_GRPID | ATTR_CMN_ACCESSMASK | ATTR_CMN_NAMEDATTRCOUNT | \ - ATTR_CMN_NAMEDATTRLIST | ATTR_CMN_FLAGS | ATTR_CMN_USERACCESS) \ - != ATTR_CMN_VALIDMASK) -#error ufs_attrsize: Missing bits in common mask computation! -#endif - -#if ((ATTR_VOL_FSTYPE | ATTR_VOL_SIGNATURE | ATTR_VOL_SIZE | \ - ATTR_VOL_SPACEFREE | ATTR_VOL_SPACEAVAIL | ATTR_VOL_MINALLOCATION | \ - ATTR_VOL_ALLOCATIONCLUMP | ATTR_VOL_IOBLOCKSIZE | \ - ATTR_VOL_OBJCOUNT | ATTR_VOL_FILECOUNT | ATTR_VOL_DIRCOUNT | \ - ATTR_VOL_MAXOBJCOUNT | ATTR_VOL_MOUNTPOINT | ATTR_VOL_NAME | \ - ATTR_VOL_MOUNTFLAGS | ATTR_VOL_INFO | ATTR_VOL_MOUNTEDDEVICE | \ - ATTR_VOL_ENCODINGSUSED | ATTR_VOL_CAPABILITIES | ATTR_VOL_ATTRIBUTES) \ - != ATTR_VOL_VALIDMASK) -#error ufs_attrsize: Missing bits in volume mask computation! -#endif - -#if ((ATTR_DIR_LINKCOUNT | ATTR_DIR_ENTRYCOUNT | ATTR_DIR_MOUNTSTATUS) \ - != ATTR_DIR_VALIDMASK) -#error ufs_attrsize: Missing bits in directory mask computation! -#endif - -#if ((ATTR_FILE_LINKCOUNT | ATTR_FILE_TOTALSIZE | ATTR_FILE_ALLOCSIZE | \ - ATTR_FILE_IOBLOCKSIZE | ATTR_FILE_CLUMPSIZE | ATTR_FILE_DEVTYPE | \ - ATTR_FILE_FILETYPE | ATTR_FILE_FORKCOUNT | ATTR_FILE_FORKLIST | \ - ATTR_FILE_DATALENGTH | ATTR_FILE_DATAALLOCSIZE | \ - ATTR_FILE_DATAEXTENTS | ATTR_FILE_RSRCLENGTH | \ - ATTR_FILE_RSRCALLOCSIZE | ATTR_FILE_RSRCEXTENTS) \ - != ATTR_FILE_VALIDMASK) -#error ufs_attrsize: Missing bits in file mask computation! -#endif - -#if ((ATTR_FORK_TOTALSIZE | ATTR_FORK_ALLOCSIZE) != ATTR_FORK_VALIDMASK) -#error ufs_attrsize: Missing bits in fork mask computation! -#endif - - size = 0; - - if ((a = attrlist->volattr) != 0) { - if (a & ATTR_VOL_NAME) - size += sizeof(struct attrreference); - if (a & ATTR_VOL_CAPABILITIES) - size += sizeof(vol_capabilities_attr_t); - if (a & ATTR_VOL_ATTRIBUTES) - size += sizeof(vol_attributes_attr_t); - }; - - /* - * Ignore common, dir, file, and fork attributes since we - * don't support those yet. 
- */ - - return size; -} - -/* -# -#% getattrlist vp = = = -# - vop_getattrlist { - IN struct vnode *vp; - IN struct attrlist *alist; - INOUT struct uio *uio; - IN struct ucred *cred; - IN struct proc *p; - }; - - */ -__private_extern__ int -ufs_getattrlist(struct vop_getattrlist_args *ap) -{ - struct vnode *vp = ap->a_vp; - struct attrlist *alist = ap->a_alist; - size_t fixedblocksize; - size_t attrblocksize; - size_t attrbufsize; - void *attrbufptr; - void *attrptr; - void *varptr; - int error; - - /* - * Check the attrlist for valid inputs (i.e. be sure we understand what - * caller is asking). - */ - if ((alist->bitmapcount != ATTR_BIT_MAP_COUNT) || - ((alist->commonattr & ~ATTR_CMN_VALIDMASK) != 0) || - ((alist->volattr & ~ATTR_VOL_VALIDMASK) != 0) || - ((alist->dirattr & ~ATTR_DIR_VALIDMASK) != 0) || - ((alist->fileattr & ~ATTR_FILE_VALIDMASK) != 0) || - ((alist->forkattr & ~ATTR_FORK_VALIDMASK) != 0)) - return EINVAL; - - /* - * Requesting volume information requires setting the - * ATTR_VOL_INFO bit. Also, volume info requests are - * mutually exclusive with all other info requests. - */ - if ((alist->volattr != 0) && - (((alist->volattr & ATTR_VOL_INFO) == 0) || - (alist->dirattr != 0) || (alist->fileattr != 0) || - alist->forkattr != 0)) - return EINVAL; - - /* - * Make sure caller isn't asking for an attibute we don't support. - */ - if ((alist->commonattr & ~UFS_ATTR_CMN_SUPPORTED) != 0 || - (alist->volattr & ~(UFS_ATTR_VOL_SUPPORTED | ATTR_VOL_INFO)) != 0 || - (alist->dirattr & ~UFS_ATTR_DIR_SUPPORTED) != 0 || - (alist->fileattr & ~UFS_ATTR_FILE_SUPPORTED) != 0 || - (alist->forkattr & ~UFS_ATTR_FORK_SUPPORTED) != 0) - return EOPNOTSUPP; - - /* - * Requesting volume information requires a vnode for the volume root. - */ - if (alist->volattr && (vp->v_flag & VROOT) == 0) - return EINVAL; - - fixedblocksize = ufs_attrsize(alist); - attrblocksize = fixedblocksize + (sizeof(u_long)); - if (alist->volattr & ATTR_VOL_NAME) - attrblocksize += 516; /* 512 + terminator + padding */ - attrbufsize = MIN(ap->a_uio->uio_resid, attrblocksize); - MALLOC(attrbufptr, void *, attrblocksize, M_TEMP, M_WAITOK); - attrptr = attrbufptr; - *((u_long *)attrptr) = 0; /* Set buffer length in case of errors */ - ++((u_long *)attrptr); /* skip over length field */ - varptr = ((char *)attrptr) + fixedblocksize; - - error = ufs_packattr(vp, ap->a_cred, alist, &attrptr, &varptr); - - if (error == 0) { - /* Don't return more data than was generated */ - attrbufsize = MIN(attrbufsize, (size_t) varptr - (size_t) attrbufptr); - - /* Return the actual buffer length */ - *((u_long *) attrbufptr) = attrbufsize; - - error = uiomove((caddr_t) attrbufptr, attrbufsize, ap->a_uio); - } - - FREE(attrbufptr, M_TEMP); - return error; -} - - -/* - * Unpack the volume-related attributes from a setattrlist call into the - * appropriate in-memory and on-disk structures. - */ -static int -ufs_unpackvolattr( - struct vnode *vp, - struct ucred *cred, - attrgroup_t attrs, - void *attrbufptr) -{ - int i; - int error; - attrreference_t *attrref; - - error = 0; - - if (attrs & ATTR_VOL_NAME) { - char *name; - int name_length; - - attrref = attrbufptr; - name = ((char*)attrbufptr) + attrref->attr_dataoffset; - name_length = strlen(name); - ufs_set_label(vp, cred, name, name_length); - - /* Advance buffer pointer past attribute reference */ - attrbufptr = ++attrref; - } - - return error; -} - - - -/* - * Unpack the attributes from a setattrlist call into the - * appropriate in-memory and on-disk structures. 
Right now, - * we only support the volume name. - */ -static int -ufs_unpackattr( - struct vnode *vp, - struct ucred *cred, - struct attrlist *alist, - void *attrbufptr) -{ - int error; - - error = 0; - - if (alist->volattr != 0) { - error = ufs_unpackvolattr(vp, cred, alist->volattr, - attrbufptr); - } - - return error; -} - - - -/* -# -#% setattrlist vp L L L -# -vop_setattrlist { - IN struct vnode *vp; - IN struct attrlist *alist; - INOUT struct uio *uio; - IN struct ucred *cred; - IN struct proc *p; -}; -*/ -__private_extern__ int -ufs_setattrlist(struct vop_setattrlist_args *ap) -{ - struct vnode *vp = ap->a_vp; - struct attrlist *alist = ap->a_alist; - size_t attrblocksize; - void *attrbufptr; - int error; - - if (vp->v_mount->mnt_flag & MNT_RDONLY) - return (EROFS); - - /* - * Check the attrlist for valid inputs (i.e. be sure we understand - * what caller is asking). - */ - if ((alist->bitmapcount != ATTR_BIT_MAP_COUNT) || - ((alist->commonattr & ~ATTR_CMN_SETMASK) != 0) || - ((alist->volattr & ~ATTR_VOL_SETMASK) != 0) || - ((alist->dirattr & ~ATTR_DIR_SETMASK) != 0) || - ((alist->fileattr & ~ATTR_FILE_SETMASK) != 0) || - ((alist->forkattr & ~ATTR_FORK_SETMASK) != 0)) - return EINVAL; - - /* - * Setting volume information requires setting the - * ATTR_VOL_INFO bit. Also, volume info requests are - * mutually exclusive with all other info requests. - */ - if ((alist->volattr != 0) && - (((alist->volattr & ATTR_VOL_INFO) == 0) || - (alist->dirattr != 0) || (alist->fileattr != 0) || - alist->forkattr != 0)) - return EINVAL; - - /* - * Make sure caller isn't asking for an attibute we don't support. - * Right now, all we support is setting the volume name. - */ - if ((alist->commonattr & ~UFS_ATTR_CMN_SETTABLE) != 0 || - (alist->volattr & ~(UFS_ATTR_VOL_SETTABLE | ATTR_VOL_INFO)) != 0 || - (alist->dirattr & ~UFS_ATTR_DIR_SETTABLE) != 0 || - (alist->fileattr & ~UFS_ATTR_FILE_SETTABLE) != 0 || - (alist->forkattr & ~UFS_ATTR_FORK_SETTABLE) != 0) - return EOPNOTSUPP; - - /* - * Setting volume information requires a vnode for the volume root. - */ - if (alist->volattr && (vp->v_flag & VROOT) == 0) - return EINVAL; - - attrblocksize = ap->a_uio->uio_resid; - if (attrblocksize < ufs_attrsize(alist)) - return EINVAL; - - MALLOC(attrbufptr, void *, attrblocksize, M_TEMP, M_WAITOK); - - error = uiomove((caddr_t)attrbufptr, attrblocksize, ap->a_uio); - if (error) - goto ErrorExit; - - error = ufs_unpackattr(vp, ap->a_cred, alist, attrbufptr); - -ErrorExit: - FREE(attrbufptr, M_TEMP); - return error; -} diff --git a/bsd/ufs/ufs/ufs_bmap.c b/bsd/ufs/ufs/ufs_bmap.c index 86cf8a596..ca7fd9352 100644 --- a/bsd/ufs/ufs/ufs_bmap.c +++ b/bsd/ufs/ufs/ufs_bmap.c @@ -67,9 +67,9 @@ #include <rev_endian_fs.h> #include <sys/param.h> #include <sys/buf.h> -#include <sys/proc.h> -#include <sys/vnode.h> -#include <sys/mount.h> +#include <sys/proc_internal.h> /* for p_stats */ +#include <sys/vnode_internal.h> +#include <sys/mount_internal.h> #include <sys/resourcevar.h> #include <sys/trace.h> #include <sys/quota.h> @@ -85,33 +85,6 @@ #include <architecture/byte_order.h> #endif /* REV_ENDIAN_FS */ -/* - * Bmap converts a the logical block number of a file to its physical block - * number on the disk. The conversion is done by using the logical block - * number to index into the array of block pointers described by the dinode. 
- */ -int -ufs_bmap(ap) - struct vop_bmap_args /* { - struct vnode *a_vp; - ufs_daddr_t a_bn; - struct vnode **a_vpp; - ufs_daddr_t *a_bnp; - int *a_runp; - } */ *ap; -{ - /* - * Check for underlying vnode requests and ensure that logical - * to physical mapping is requested. - */ - if (ap->a_vpp != NULL) - *ap->a_vpp = VTOI(ap->a_vp)->i_devvp; - if (ap->a_bnp == NULL) - return (0); - - return (ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL, - ap->a_runp)); -} /* * Indirect blocks are now on the vnode for the file. They are given negative @@ -129,7 +102,7 @@ ufs_bmap(ap) int ufs_bmaparray(vp, bn, bnp, ap, nump, runp) - struct vnode *vp; + vnode_t vp; ufs_daddr_t bn; ufs_daddr_t *bnp; struct indir *ap; @@ -170,7 +143,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp) * don't create a block larger than the device can handle. */ *runp = 0; - maxrun = MAXPHYSIO / mp->mnt_stat.f_iosize - 1; + maxrun = MAXPHYSIO / mp->mnt_vfsstat.f_iosize - 1; } xap = ap == NULL ? a : ap; @@ -197,44 +170,54 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp) devvp = VFSTOUFS(vp->v_mount)->um_devvp; for (bp = NULL, ++xap; --num; ++xap) { - /* - * Exit the loop if there is no disk address assigned yet and - * the indirect block isn't in the cache, or if we were - * looking for an indirect block and we've found it. - */ + ufs_daddr_t *dataptr; + int bop; + + if ((metalbn = xap->in_lbn) == bn) + /* + * found the indirect block we were + * looking for... exit the loop + */ + break; + + if (daddr == 0) + bop = BLK_ONLYVALID | BLK_META; + else + bop = BLK_META; - metalbn = xap->in_lbn; - if (daddr == 0 && !incore(vp, metalbn) || metalbn == bn) + if (bp) + buf_brelse(bp); + bp = buf_getblk(vp, (daddr64_t)((unsigned)metalbn), mp->mnt_vfsstat.f_iosize, 0, 0, bop); + + if (bp == 0) { + /* + * Exit the loop if there is no disk address assigned yet and + * the indirect block isn't in the cache + */ break; + } /* * If we get here, we've either got the block in the cache * or we have a disk address for it, go fetch it. 
*/ - if (bp) - brelse(bp); - xap->in_exists = 1; - bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0, BLK_META); - if (bp->b_flags & (B_DONE | B_DELWRI)) { - trace(TR_BREADHIT, pack(vp, mp->mnt_stat.f_iosize), metalbn); + + if (buf_valid(bp)) { + trace(TR_BREADHIT, pack(vp, mp->mnt_vfsstat.f_iosize), metalbn); } -#if DIAGNOSTIC - else if (!daddr) - panic("ufs_bmaparry: indirect block not in cache"); -#endif else { - trace(TR_BREADMISS, pack(vp, mp->mnt_stat.f_iosize), metalbn); - bp->b_blkno = blkptrtodb(ump, daddr); - bp->b_flags |= B_READ; - VOP_STRATEGY(bp); + trace(TR_BREADMISS, pack(vp, mp->mnt_vfsstat.f_iosize), metalbn); + buf_setblkno(bp, blkptrtodb(ump, (daddr64_t)((unsigned)daddr))); + buf_setflags(bp, B_READ); + VNOP_STRATEGY(bp); current_proc()->p_stats->p_ru.ru_inblock++; /* XXX */ - if (error = biowait(bp)) { - brelse(bp); + if (error = (int)buf_biowait(bp)) { + buf_brelse(bp); return (error); } } - - daddr = ((ufs_daddr_t *)bp->b_data)[xap->in_off]; + dataptr = (ufs_daddr_t *)buf_dataptr(bp); + daddr = dataptr[xap->in_off]; #if REV_ENDIAN_FS if (rev_endian) daddr = NXSwapLong(daddr); @@ -245,16 +228,16 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp) for (bn = xap->in_off + 1; bn < MNINDIR(ump) && *runp < maxrun && is_sequential(ump, - NXSwapLong(((ufs_daddr_t *)bp->b_data)[bn - 1]), - NXSwapLong(((ufs_daddr_t *)bp->b_data)[bn])); + NXSwapLong(dataptr[bn - 1]), + NXSwapLong(dataptr[bn])); ++bn, ++*runp); } else { #endif /* REV_ENDIAN_FS */ for (bn = xap->in_off + 1; bn < MNINDIR(ump) && *runp < maxrun && is_sequential(ump, - ((ufs_daddr_t *)bp->b_data)[bn - 1], - ((ufs_daddr_t *)bp->b_data)[bn]); + dataptr[bn - 1], + dataptr[bn]); ++bn, ++*runp); #if REV_ENDIAN_FS } @@ -262,7 +245,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp) } } if (bp) - brelse(bp); + buf_brelse(bp); daddr = blkptrtodb(ump, daddr); *bnp = daddr == 0 ? -1 : daddr; @@ -352,332 +335,91 @@ ufs_getlbns(vp, bn, ap, nump) return (0); } /* - * Cmap converts a the file offset of a file to its physical block - * number on the disk And returns contiguous size for transfer. + * blockmap converts a file offset to its physical block + * number on the disk... it optionally returns the physically + * contiguous size. 
*/ int -ufs_cmap(ap) - struct vop_cmap_args /* { +ufs_blockmap(ap) + struct vnop_blockmap_args /* { struct vnode *a_vp; off_t a_foffset; size_t a_size; - daddr_t *a_bpn; + daddr64_t *a_bpn; size_t *a_run; void *a_poff; + int a_flags; } */ *ap; { - struct vnode * vp = ap->a_vp; - ufs_daddr_t *bnp = ap->a_bpn; - size_t *runp = ap->a_run; - int size = ap->a_size; - daddr_t bn; - int nblks; - register struct inode *ip; + vnode_t vp = ap->a_vp; + daddr64_t * bnp = ap->a_bpn; + size_t * runp = ap->a_run; + int size = ap->a_size; + struct fs * fs; + struct inode *ip; + ufs_daddr_t lbn; ufs_daddr_t daddr = 0; - int devBlockSize=0; - struct fs *fs; - int retsize=0; - int error=0; + int devBlockSize = 0; + int retsize = 0; + int error = 0; + int nblks; ip = VTOI(vp); fs = ip->i_fs; + lbn = (ufs_daddr_t)lblkno(fs, ap->a_foffset); + devBlockSize = vfs_devblocksize(vnode_mount(vp)); - if (blkoff(fs, ap->a_foffset)) { - panic("ufs_cmap; allocation requested inside a block"); - } + if (blkoff(fs, ap->a_foffset)) + panic("ufs_blockmap: allocation requested inside a block"); - bn = (daddr_t)lblkno(fs, ap->a_foffset); - VOP_DEVBLOCKSIZE(ip->i_devvp, &devBlockSize); + if (size % devBlockSize) + panic("ufs_blockmap: size is not multiple of device block size\n"); - if (size % devBlockSize) { - panic("ufs_cmap: size is not multiple of device block size\n"); - } - - if (error = VOP_BMAP(vp, bn, (struct vnode **) 0, &daddr, &nblks)) { - return(error); - } - - retsize = nblks * fs->fs_bsize; + if ((error = ufs_bmaparray(vp, lbn, &daddr, NULL, NULL, &nblks))) + return (error); if (bnp) - *bnp = daddr; + *bnp = (daddr64_t)daddr; if (ap->a_poff) *(int *)ap->a_poff = 0; - if (daddr == -1) { - if (size < fs->fs_bsize) { - retsize = fragroundup(fs, size); - if(size >= retsize) - *runp = retsize; - else - *runp = size; - } else { - *runp = fs->fs_bsize; - } - return(0); - } - if (runp) { - if ((size < fs->fs_bsize)) { - *runp = size; - return(0); - } - if (retsize) { - retsize += fs->fs_bsize; - if(size >= retsize) - *runp = retsize; - else - *runp = size; + if (lbn < 0) { + /* + * we're dealing with the indirect blocks + * which are always fs_bsize in size + */ + retsize = (nblks + 1) * fs->fs_bsize; + } else if (daddr == -1 || nblks == 0) { + /* + * we're dealing with a 'hole'... UFS doesn't + * have a clean way to determine its size + * or + * there are no physically contiguous blocks + * so + * just return the size of the lbn we started with + */ + retsize = blksize(fs, ip, lbn); } else { - if (size < fs->fs_bsize) { - retsize = fragroundup(fs, size); - if(size >= retsize) - *runp = retsize; - else - *runp = size; - } else { - *runp = fs->fs_bsize; - } - } - } - return (0); -} - - -#if NOTTOBEUSED -/* - * Cmap converts a the file offset of a file to its physical block - * number on the disk And returns contiguous size for transfer.
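/*
 * [sketch] How ufs_blockmap() sizes the run it reports: negative lbns are
 * indirect blocks and always full sized; a hole (or a block with no
 * contiguous neighbors) reports just the starting block; otherwise the run
 * is nblks full blocks plus the possibly-fragmented block at (lbn + nblks),
 * clamped to the requested transfer size.  Worked out with made-up numbers:
 */
static size_t
contig_run(size_t fs_bsize, int nblks, size_t last_blksize, size_t requested)
{
	size_t retsize = (size_t)nblks * fs_bsize + last_blksize;

	/* never report more than the caller asked for */
	return ((retsize < requested) ? retsize : requested);
}
/* e.g. with 8K blocks, contig_run(8192, 3, 8192, 65536) reports 32768 */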
- */ -int -ufs_cmap(ap) - struct vop_cmap_args /* { - struct vnode *a_vp; - off_t a_foffset; - size_t a_size; - daddr_t *a_bpn; - size_t *a_run; - void *a_poff; - } */ *ap; -{ - struct vnode * vp = ap->a_vp; - ufs_daddr_t *bnp = ap->a_bpn; - size_t *runp = ap->a_run; - daddr_t bn; - int nblks, blks; - int *nump; - register struct inode *ip; - struct buf *bp; - struct ufsmount *ump; - struct mount *mp; - struct vnode *devvp; - struct indir a[NIADDR], *xap; - ufs_daddr_t daddr; - long metalbn; - int error, maxrun, num; - int devBlockSize=0; - struct fs *fs; - int size = ap->a_size; - int block_offset=0; - int retsize=0; -#if 1 - daddr_t orig_blkno; - daddr_t orig_bblkno; -#endif /* 1 */ -#if REV_ENDIAN_FS - int rev_endian=0; -#endif /* REV_ENDIAN_FS */ - - ip = VTOI(vp); - fs = ip->i_fs; - - mp = vp->v_mount; - ump = VFSTOUFS(mp); - - VOP_DEVBLOCKSIZE(ip->i_devvp, &devBlockSize); - bn = (daddr_t)lblkno(fs, ap->a_foffset); - - if (size % devBlockSize) { - panic("ufs_cmap: size is not multiple of device block size\n"); - } - - block_offset = blkoff(fs, ap->a_foffset); - if (block_offset) { - panic("ufs_cmap; allocation requested inside a block"); - } - -#if 1 - VOP_OFFTOBLK(vp, ap->a_foffset, & orig_blkno); -#endif /* 1 */ - /* less than block size and not block offset aligned */ - if ( (size < fs->fs_bsize) && fragoff(fs, size) && block_offset ) { - panic("ffs_cmap: size not a mult of fragment\n"); - } -#if 0 - if (size > fs->fs_bsize && fragoff(fs, size)) { - panic("ffs_cmap: more than bsize & not a multiple of fragment\n"); - } -#endif /* 0 */ -#if REV_ENDIAN_FS - rev_endian=(mp->mnt_flag & MNT_REVEND); -#endif /* REV_ENDIAN_FS */ - - if(runp) - *runp = 0; - - if ( size > MAXPHYSIO) - size = MAXPHYSIO; - nblks = (blkroundup(fs, size))/fs->fs_bsize; - - xap = a; - num = 0; - if (error = ufs_getlbns(vp, bn, xap, &num)) - return (error); - - blks = 0; - if (num == 0) { - daddr = blkptrtodb(ump, ip->i_db[bn]); - *bnp = ((daddr == 0) ? -1 : daddr); - if (daddr && runp) { - for (++bn; bn < NDADDR && blks < nblks && - ip->i_db[bn] && - is_sequential(ump, ip->i_db[bn - 1], ip->i_db[bn]); - ++bn, ++blks); - - if (blks) { - retsize = lblktosize(fs, blks); - if(size >= retsize) - *runp = retsize; - else - *runp = size; - } else { - if (size < fs->fs_bsize) { - retsize = fragroundup(fs, size); - if(size >= retsize) - *runp = retsize; - else - *runp = size; - } else { - *runp = fs->fs_bsize; - } - } - if (ap->a_poff) - *(int *)ap->a_poff = 0; - } -#if 1 - if (VOP_BMAP(vp, orig_blkno, NULL, &orig_bblkno, NULL)) { - panic("vop_bmap failed\n"); - } - if(daddr != orig_bblkno) { - panic("vop_bmap and vop_cmap differ\n"); - } -#endif /* 1 */ - return (0); - } - - - /* Get disk address out of indirect block array */ - daddr = ip->i_ib[xap->in_off]; - - devvp = VFSTOUFS(vp->v_mount)->um_devvp; - for (bp = NULL, ++xap; --num; ++xap) { - /* - * Exit the loop if there is no disk address assigned yet - * or if we were looking for an indirect block and we've - * found it. - */ - - metalbn = xap->in_lbn; - if (daddr == 0 || metalbn == bn) - break; - /* - * We have a disk address for it, go fetch it. 
- */ - if (bp) - brelse(bp); - - xap->in_exists = 1; - bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0, BLK_META); - if (bp->b_flags & (B_DONE | B_DELWRI)) { - trace(TR_BREADHIT, pack(vp, mp->mnt_stat.f_iosize), metalbn); - } - else { - trace(TR_BREADMISS, pack(vp, mp->mnt_stat.f_iosize), metalbn); - bp->b_blkno = blkptrtodb(ump, daddr); - bp->b_flags |= B_READ; - VOP_STRATEGY(bp); - current_proc()->p_stats->p_ru.ru_inblock++; /* XXX */ - if (error = biowait(bp)) { - brelse(bp); - return (error); - } - } - - daddr = ((ufs_daddr_t *)bp->b_data)[xap->in_off]; -#if REV_ENDIAN_FS - if (rev_endian) - daddr = NXSwapLong(daddr); -#endif /* REV_ENDIAN_FS */ - if (num == 1 && daddr && runp) { - blks = 0; -#if REV_ENDIAN_FS - if (rev_endian) { - for (bn = xap->in_off + 1; - bn < MNINDIR(ump) && blks < maxrun && - is_sequential(ump, - NXSwapLong(((ufs_daddr_t *)bp->b_data)[bn - 1]), - NXSwapLong(((ufs_daddr_t *)bp->b_data)[bn])); - ++bn, ++blks); - } else { -#endif /* REV_ENDIAN_FS */ - for (bn = xap->in_off + 1; - bn < MNINDIR(ump) && blks < maxrun && - is_sequential(ump, - ((ufs_daddr_t *)bp->b_data)[bn - 1], - ((ufs_daddr_t *)bp->b_data)[bn]); - ++bn, ++blks); -#if REV_ENDIAN_FS - } -#endif /* REV_ENDIAN_FS */ + /* + * we have 1 or more blocks that are physically contiguous + * to our starting block number... the original block + (nblks - 1) + * blocks must be full sized since only the last block can be + * composed of fragments... + */ + retsize = nblks * fs->fs_bsize; + + /* + * now compute the size of the last block and add it in + */ + retsize += blksize(fs, ip, (lbn + nblks)); } + if (retsize < size) + *runp = retsize; + else + *runp = size; } - if (bp) - brelse(bp); - - daddr = blkptrtodb(ump, daddr); - *bnp = ((daddr == 0) ? -1 : daddr); - if (daddr && runp) { - if (blks) { - retsize = lblktosize(fs, blks); - if(size >= retsize) - *runp = retsize; - else - *runp = size; - } else { - if (size < fs->fs_bsize) { - retsize = fragroundup(fs, size); - if(size >= retsize) - *runp = retsize; - else - *runp = size; - } else { - *runp = fs->fs_bsize; - } - } - - } - if (daddr && ap->a_poff) - *(int *)ap->a_poff = 0; -#if 1 - if (VOP_BMAP(vp, orig_blkno, (struct vnode **) 0, &orig_bblkno, 0)) { - panic("vop_bmap failed\n"); - } - if(daddr != orig_bblkno) { - panic("vop_bmap and vop_cmap differ\n"); - } -#endif /* 1 */ return (0); } -#endif /* NOTTOBEUSED */ diff --git a/bsd/ufs/ufs/ufs_byte_order.c b/bsd/ufs/ufs/ufs_byte_order.c index c8cf52d13..86f0b010d 100644 --- a/bsd/ufs/ufs/ufs_byte_order.c +++ b/bsd/ufs/ufs/ufs_byte_order.c @@ -339,11 +339,11 @@ byte_swap_dir_out(char *addr, int count) void byte_swap_dir_block_out(struct buf *bp) { - struct direct *ep = (struct direct *) bp->b_data; + struct direct *ep = (struct direct *) buf_dataptr(bp); int reclen, entryoffsetinblk = 0; - while (entryoffsetinblk < bp->b_bcount) { - ep = (struct direct *) (entryoffsetinblk + bp->b_data); + while (entryoffsetinblk < buf_count(bp)) { + ep = (struct direct *) (entryoffsetinblk + buf_dataptr(bp)); reclen = ep->d_reclen; entryoffsetinblk += reclen; byte_swap_int(ep->d_ino); diff --git a/bsd/ufs/ufs/ufs_byte_order.h b/bsd/ufs/ufs/ufs_byte_order.h index 12dd0badc..fda8614fd 100644 --- a/bsd/ufs/ufs/ufs_byte_order.h +++ b/bsd/ufs/ufs/ufs_byte_order.h @@ -41,26 +41,26 @@ #include <ufs/ufs/inode.h> #include <ufs/ffs/fs.h> -void byte_swap_longlongs __P((unsigned long long *, int)); -void byte_swap_ints __P((int *, int)); -void byte_swap_shorts __P((short *, int)); +void byte_swap_longlongs(unsigned long long *, int);
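/*
 * [reference] The header sweep above retires the 4.4BSD __P() prototype
 * macro.  Its classic <sys/cdefs.h> definition expands to the parameter
 * list under ANSI compilers and to "()" under K&R ones, so with K&R
 * support gone the rewritten declarations are exactly equivalent:
 */
#if defined(__STDC__) || defined(__cplusplus)
#define	__P(protos)	protos		/* full-blown ANSI C */
#else
#define	__P(protos)	()		/* traditional C preprocessor */
#endif
/*
 * hence  "void byte_swap_shorts __P((short *, int));"  and
 *        "void byte_swap_shorts(short *, int);"  compile identically.
 */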
+void byte_swap_ints(int *, int); +void byte_swap_shorts(short *, int); -/* void byte_swap_superblock __P((struct fs *)); */ -void byte_swap_sbin __P((struct fs *)); -void byte_swap_sbout __P((struct fs *)); -void byte_swap_csum __P((struct csum *)); -void byte_swap_ocylgroup __P((struct cg *)); -void byte_swap_cgin __P((struct cg *, struct fs *)); -void byte_swap_cgout __P((struct cg *, struct fs *)); +/* void byte_swap_superblock(struct fs *); */ +void byte_swap_sbin(struct fs *); +void byte_swap_sbout(struct fs *); +void byte_swap_csum(struct csum *); +void byte_swap_ocylgroup(struct cg *); +void byte_swap_cgin(struct cg *, struct fs *); +void byte_swap_cgout(struct cg *, struct fs *); -void byte_swap_inode_in __P((struct dinode *, struct inode *)); -void byte_swap_inode_out __P((struct inode *, struct dinode *)); +void byte_swap_inode_in(struct dinode *, struct inode *); +void byte_swap_inode_out(struct inode *, struct dinode *); -void byte_swap_dir_block_in __P((char *, int)); -void byte_swap_dir_block_out __P((struct buf *)); -void byte_swap_direct __P((struct direct *)); -void byte_swap_dirtemplate_in __P((struct dirtemplate *)); -void byte_swap_minidir_in __P((struct direct *)); +void byte_swap_dir_block_in(char *, int); +void byte_swap_dir_block_out(buf_t); +void byte_swap_direct(struct direct *); +void byte_swap_dirtemplate_in(struct dirtemplate *); +void byte_swap_minidir_in(struct direct *); #endif /* __APPLE_API_PRIVATE */ #endif /* _UFS_BYTE_ORDER_H_ */ diff --git a/bsd/ufs/ufs/ufs_extern.h b/bsd/ufs/ufs/ufs_extern.h index 75469fa99..d7e9815c9 100644 --- a/bsd/ufs/ufs/ufs_extern.h +++ b/bsd/ufs/ufs/ufs_extern.h @@ -63,7 +63,6 @@ struct buf; struct direct; struct disklabel; -struct fid; struct flock; struct inode; struct mbuf; @@ -73,96 +72,84 @@ struct proc; struct ucred; struct ufs_args; struct uio; -struct vattr; +struct vnode_attr; struct vfsconf; struct vnode; __BEGIN_DECLS +int ufs_remove_internal(vnode_t, vnode_t, struct componentname *, int); +int ufs_access_internal(vnode_t, mode_t, ucred_t); + +int ffs_read_internal(vnode_t, struct uio *, int); +int ffs_write_internal(vnode_t, struct uio *, int, ucred_t); +int ffs_truncate_internal(vnode_t, off_t, int, ucred_t); + void diskerr - __P((struct buf *, char *, char *, int, int, struct disklabel *)); -void disksort __P((struct buf *, struct buf *)); -u_int dkcksum __P((struct disklabel *)); -char *readdisklabel __P((dev_t, int (*)(), struct disklabel *)); -int setdisklabel __P((struct disklabel *, struct disklabel *, u_long)); -int writedisklabel __P((dev_t, int (*)(), struct disklabel *)); + (struct buf *, char *, char *, int, int, struct disklabel *); +void disksort(struct buf *, struct buf *); +u_int dkcksum(struct disklabel *); +char *readdisklabel(dev_t, int (*)(), struct disklabel *); +int setdisklabel(struct disklabel *, struct disklabel *, u_long); +int writedisklabel(dev_t, int (*)(), struct disklabel *); -int ufs_access __P((struct vop_access_args *)); -int ufs_advlock __P((struct vop_advlock_args *)); -int ufs_bmap __P((struct vop_bmap_args *)); -int ufs_check_export __P((struct mount *, struct ufid *, struct mbuf *, - struct vnode **, int *exflagsp, struct ucred **)); -int ufs_checkpath __P((struct inode *, struct inode *, struct ucred *)); -int ufs_close __P((struct vop_close_args *)); -int ufs_create __P((struct vop_create_args *)); -void ufs_dirbad __P((struct inode *, doff_t, char *)); -int ufs_dirbadentry __P((struct vnode *, struct direct *, int)); -int ufs_dirempty __P((struct inode *, ino_t, struct 
ucred *)); -int ufs_direnter __P((struct inode *, struct vnode *,struct componentname *)); -int ufs_dirremove __P((struct vnode *, struct componentname*)); +int ufs_access(struct vnop_access_args *); +int ufs_checkpath(struct inode *, struct inode *, struct ucred *); +int ufs_close(struct vnop_close_args *); +int ufs_create(struct vnop_create_args *); +void ufs_dirbad(struct inode *, doff_t, const char *); +int ufs_dirbadentry(struct vnode *, struct direct *, int); +int ufs_dirempty(struct inode *, ino_t, struct ucred *); +int ufs_direnter(struct inode *, struct vnode *,struct componentname *); +int ufs_dirremove(struct vnode *, struct componentname*); int ufs_dirrewrite - __P((struct inode *, struct inode *, struct componentname *)); -int ufs_getattr __P((struct vop_getattr_args *)); -int ufs_getattrlist __P((struct vop_getattrlist_args *)); -int ufs_getlbns __P((struct vnode *, ufs_daddr_t, struct indir *, int *)); + (struct inode *, struct inode *, struct componentname *); +int ufs_getattr(struct vnop_getattr_args *); +int ufs_getlbns(struct vnode *, ufs_daddr_t, struct indir *, int *); struct vnode * - ufs_ihashget __P((dev_t, ino_t)); -void ufs_ihashinit __P((void)); -void ufs_ihashins __P((struct inode *)); + ufs_ihashget(dev_t, ino_t); +void ufs_ihashinit(void); +void ufs_ihashins(struct inode *); struct vnode * - ufs_ihashlookup __P((dev_t, ino_t)); -void ufs_ihashrem __P((struct inode *)); -int ufs_inactive __P((struct vop_inactive_args *)); -int ufs_init __P((struct vfsconf *)); -int ufs_ioctl __P((struct vop_ioctl_args *)); -int ufs_islocked __P((struct vop_islocked_args *)); -#if NFSSERVER -int lease_check __P((struct vop_lease_args *)); -#define ufs_lease_check lease_check -#else -#define ufs_lease_check ((int (*) __P((struct vop_lease_args *)))nullop) -#endif -int ufs_link __P((struct vop_link_args *)); -int ufs_lock __P((struct vop_lock_args *)); -int ufs_lookup __P((struct vop_lookup_args *)); -int ufs_makeinode __P((int mode, struct vnode *, struct vnode **, struct componentname *)); -int ufs_mkdir __P((struct vop_mkdir_args *)); -int ufs_mknod __P((struct vop_mknod_args *)); -int ufs_mmap __P((struct vop_mmap_args *)); -int ufs_open __P((struct vop_open_args *)); -int ufs_pathconf __P((struct vop_pathconf_args *)); -int ufs_print __P((struct vop_print_args *)); -int ufs_readdir __P((struct vop_readdir_args *)); -int ufs_readlink __P((struct vop_readlink_args *)); -int ufs_reclaim __P((struct vnode *, struct proc *)); -int ufs_remove __P((struct vop_remove_args *)); -int ufs_rename __P((struct vop_rename_args *)); -#define ufs_revoke vop_revoke -int ufs_rmdir __P((struct vop_rmdir_args *)); -int ufs_root __P((struct mount *, struct vnode **)); -int ufs_seek __P((struct vop_seek_args *)); -int ufs_select __P((struct vop_select_args *)); -int ufs_kqfilt_add __P((struct vop_kqfilt_add_args *)); -int ufs_setattr __P((struct vop_setattr_args *)); -int ufs_setattrlist __P((struct vop_setattrlist_args *)); -int ufs_start __P((struct mount *, int, struct proc *)); -int ufs_strategy __P((struct vop_strategy_args *)); -int ufs_symlink __P((struct vop_symlink_args *)); -int ufs_unlock __P((struct vop_unlock_args *)); -int ufs_whiteout __P((struct vop_whiteout_args *)); -int ufs_vinit __P((struct mount *, - int (**)(), int (**)(), struct vnode **)); -int ufsspec_close __P((struct vop_close_args *)); -int ufsspec_read __P((struct vop_read_args *)); -int ufsspec_write __P((struct vop_write_args *)); + ufs_ihashlookup(dev_t, ino_t); +void ufs_ihashrem(struct inode *); +int 
ufs_inactive(struct vnop_inactive_args *); +int ufs_init(struct vfsconf *); +int ufs_ioctl(struct vnop_ioctl_args *); +int ufs_link(struct vnop_link_args *); +int ufs_lookup(struct vnop_lookup_args *); +int ufs_makeinode(struct vnode_attr *, struct vnode *, struct vnode **, struct componentname *); +int ufs_mkdir(struct vnop_mkdir_args *); +int ufs_mknod(struct vnop_mknod_args *); +int ufs_mmap(struct vnop_mmap_args *); +int ufs_open(struct vnop_open_args *); +int ufs_pathconf(struct vnop_pathconf_args *); +int ufs_readdir(struct vnop_readdir_args *); +int ufs_readlink(struct vnop_readlink_args *); +int ufs_reclaim(struct vnode *, struct proc *); +int ufs_remove(struct vnop_remove_args *); +int ufs_rename(struct vnop_rename_args *); +#define ufs_revoke nop_revoke +int ufs_rmdir(struct vnop_rmdir_args *); +int ufs_root(struct mount *, struct vnode **, vfs_context_t); +int ufs_select(struct vnop_select_args *); +int ufs_kqfilt_add(struct vnop_kqfilt_add_args *); +int ufs_setattr(struct vnop_setattr_args *); +int ufs_start(struct mount *, int, vfs_context_t); +int ufs_strategy(struct vnop_strategy_args *); +int ufs_symlink(struct vnop_symlink_args *); +int ufs_whiteout(struct vnop_whiteout_args *); +int ufsspec_close(struct vnop_close_args *); +int ufsspec_read(struct vnop_read_args *); +int ufsspec_write(struct vnop_write_args *); #if FIFO -int ufsfifo_read __P((struct vop_read_args *)); -int ufsfifo_write __P((struct vop_write_args *)); -int ufsfifo_close __P((struct vop_close_args *)); -int ufsfifo_kqfilt_add __P((struct vop_kqfilt_add_args *)); +int ufsfifo_read(struct vnop_read_args *); +int ufsfifo_write(struct vnop_write_args *); +int ufsfifo_close(struct vnop_close_args *); +int ufsfifo_kqfilt_add(struct vnop_kqfilt_add_args *); #endif -int ufs_blktooff __P((struct vop_blktooff_args *)); -int ufs_cmap __P((struct vop_cmap_args *)); +int ufs_blktooff(struct vnop_blktooff_args *); +int ufs_blockmap(struct vnop_blockmap_args *); __END_DECLS diff --git a/bsd/ufs/ufs/ufs_ihash.c b/bsd/ufs/ufs/ufs_ihash.c index a3da69ee5..140f8564a 100644 --- a/bsd/ufs/ufs/ufs_ihash.c +++ b/bsd/ufs/ufs/ufs_ihash.c @@ -57,7 +57,7 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/malloc.h> #include <sys/proc.h> #include <sys/quota.h> @@ -72,7 +72,6 @@ LIST_HEAD(ihashhead, inode) *ihashtbl; u_long ihash; /* size of hash table - 1 */ #define INOHASH(device, inum) (&ihashtbl[((device) + (inum)) & ihash]) -struct slock ufs_ihash_slock; /* * Initialize inode hash table. @@ -82,7 +81,6 @@ ufs_ihashinit() { ihashtbl = hashinit(desiredvnodes, M_UFSMNT, &ihash); - simple_lock_init(&ufs_ihash_slock); } /* @@ -96,12 +94,9 @@ ufs_ihashlookup(dev, inum) { struct inode *ip; - simple_lock(&ufs_ihash_slock); for (ip = INOHASH(dev, inum)->lh_first; ip; ip = ip->i_hash.le_next) if (inum == ip->i_number && dev == ip->i_dev) break; - simple_unlock(&ufs_ihash_slock); - if (ip) return (ITOV(ip)); return (NULLVP); @@ -119,19 +114,18 @@ ufs_ihashget(dev, inum) struct proc *p = current_proc(); /* XXX */ struct inode *ip; struct vnode *vp; + uint32_t vid; loop: - simple_lock(&ufs_ihash_slock); for (ip = INOHASH(dev, inum)->lh_first; ip; ip = ip->i_hash.le_next) { if (inum == ip->i_number && dev == ip->i_dev) { - vp = ITOV(ip); + if (ISSET(ip->i_flag, IN_ALLOC)) { /* * inode is being created. 
Wait for it * to finish creation */ SET(ip->i_flag, IN_WALLOC); - simple_unlock(&ufs_ihash_slock); (void)tsleep((caddr_t)ip, PINOD, "ufs_ihashget", 0); goto loop; } @@ -143,18 +137,32 @@ loop: * error */ SET(ip->i_flag, IN_WTRANSIT); - simple_unlock(&ufs_ihash_slock); (void)tsleep((caddr_t)ip, PINOD, "ufs_ihashget1", 0); goto loop; } - simple_lock(&vp->v_interlock); - simple_unlock(&ufs_ihash_slock); - if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) - goto loop; + vp = ITOV(ip); + /* + * the vid needs to be grabbed before we drop + * lock protecting the hash + */ + vid = vnode_vid(vp); + + /* + * we currently depend on running under the FS funnel + * when we do proper locking and advertise ourselves + * as thread safe, we'll need a lock to protect the + * hash lookup... this is where we would drop it + */ + if (vnode_getwithvid(vp, vid)) { + /* + * If vnode is being reclaimed, or has + * already changed identity, no need to wait + */ + return (NULL); + } return (vp); } } - simple_unlock(&ufs_ihash_slock); return (NULL); } @@ -166,13 +174,10 @@ void ufs_ihashins(ip) struct inode *ip; { - struct proc *p = current_proc(); struct ihashhead *ipp; - simple_lock(&ufs_ihash_slock); ipp = INOHASH(ip->i_dev, ip->i_number); LIST_INSERT_HEAD(ipp, ip, i_hash); - simple_unlock(&ufs_ihash_slock); } /* @@ -182,13 +187,9 @@ void ufs_ihashrem(ip) struct inode *ip; { - struct inode *iq; - - simple_lock(&ufs_ihash_slock); LIST_REMOVE(ip, i_hash); #if DIAGNOSTIC ip->i_hash.le_next = NULL; ip->i_hash.le_prev = NULL; #endif - simple_unlock(&ufs_ihash_slock); } diff --git a/bsd/ufs/ufs/ufs_inode.c b/bsd/ufs/ufs/ufs_inode.c index 90e6d8f93..41ae10abe 100644 --- a/bsd/ufs/ufs/ufs_inode.c +++ b/bsd/ufs/ufs/ufs_inode.c @@ -63,8 +63,8 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> -#include <sys/vnode.h> -#include <sys/mount.h> +#include <sys/vnode_internal.h> +#include <sys/mount_internal.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/quota.h> @@ -82,14 +82,14 @@ extern int prtactive; */ int ufs_inactive(ap) - struct vop_inactive_args /* { + struct vnop_inactive_args /* { struct vnode *a_vp; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); - struct proc *p = ap->a_p; + struct proc *p = vfs_context_proc(ap->a_context); struct timeval tv; int mode, error = 0; extern int prtactive; @@ -112,25 +112,24 @@ ufs_inactive(ap) * inode from inodecache */ SET(ip->i_flag, IN_TRANSIT); - error = VOP_TRUNCATE(vp, (off_t)0, 0, NOCRED, p); + error = ffs_truncate_internal(vp, (off_t)0, 0, NOCRED); ip->i_rdev = 0; mode = ip->i_mode; ip->i_mode = 0; ip->i_flag |= IN_CHANGE | IN_UPDATE; - VOP_VFREE(vp, ip->i_number, mode); + ffs_vfree(vp, ip->i_number, mode); } if (ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) { - tv = time; - VOP_UPDATE(vp, &tv, &tv, 0); + microtime(&tv); + ffs_update(vp, &tv, &tv, 0); } out: - VOP_UNLOCK(vp, 0, p); /* * If we are done with the inode, reclaim it * so that it can be reused immediately. */ if (ip->i_mode == 0) - vrecycle(vp, (struct slock *)0, p); + vnode_recycle(vp); return (error); } @@ -148,24 +147,23 @@ ufs_reclaim(vp, p) if (prtactive && vp->v_usecount != 0) vprint("ufs_reclaim: pushing active", vp); + + vnode_removefsref(vp); /* * Remove the inode from its hash chain. */ ip = VTOI(vp); ufs_ihashrem(ip); - /* - * Purge old data structures associated with the inode. 
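/*
 * [sketch] ufs_ihashget() above adopts the vid-revalidation idiom of the
 * new vnode KPI: capture the vnode id while the hash entry is known-good,
 * then let vnode_getwithvid() take an iocount only if the vnode still has
 * that identity.  Reduced to its core (the caller retries the hash lookup
 * on NULL):
 */
static vnode_t
get_validated_vnode(vnode_t vp)
{
	uint32_t vid = vnode_vid(vp);	/* identity at lookup time */

	if (vnode_getwithvid(vp, vid) != 0)
		return (NULLVP);	/* reclaimed or reused: no need to wait */

	return (vp);			/* iocount held; drop via vnode_put() */
}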
- */ - cache_purge(vp); + if (ip->i_devvp) { struct vnode *tvp = ip->i_devvp; ip->i_devvp = NULL; - vrele(tvp); + vnode_rele(tvp); } #if QUOTA for (i = 0; i < MAXQUOTAS; i++) { if (ip->i_dquot[i] != NODQUOT) { - dqrele(vp, ip->i_dquot[i]); + dqrele(ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } } diff --git a/bsd/ufs/ufs/ufs_lookup.c b/bsd/ufs/ufs/ufs_lookup.c index 1e2a64188..48bbde8c5 100644 --- a/bsd/ufs/ufs/ufs_lookup.c +++ b/bsd/ufs/ufs/ufs_lookup.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -64,15 +64,18 @@ #include <sys/namei.h> #include <sys/buf.h> #include <sys/file.h> -#include <sys/mount.h> -#include <sys/vnode.h> +#include <sys/mount_internal.h> +#include <sys/vnode_internal.h> #include <sys/quota.h> +#include <sys/kauth.h> +#include <sys/uio_internal.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/dir.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/ufs_extern.h> +#include <ufs/ffs/ffs_extern.h> #if REV_ENDIAN_FS #include <ufs/ufs/ufs_byte_order.h> #include <architecture/byte_order.h> @@ -101,8 +104,7 @@ int dirchk = 0; * exists, lookup returns both the target and its parent directory locked. * When creating or renaming and LOCKPARENT is specified, the target may * not be ".". When deleting and LOCKPARENT is specified, the target may - * be "."., but the caller must check to ensure it does an vrele and vput - * instead of two vputs. + * be ".". * * Overall outline of ufs_lookup: * @@ -122,10 +124,11 @@ int dirchk = 0; */ int ufs_lookup(ap) - struct vop_lookup_args /* { + struct vnop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; + vfs_context_t a_context } */ *ap; { register struct vnode *vdp; /* vnode for directory being searched */ @@ -145,27 +148,28 @@ ufs_lookup(ap) struct vnode *tdp; /* returned by VFS_VGET */ doff_t enduseful; /* pointer past last used dir slot */ u_long bmask; /* block offset mask */ - int lockparent; /* 1 => lockparent flag is set */ int wantparent; /* 1 => wantparent or lockparent flag */ int namlen, error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; - struct ucred *cred = cnp->cn_cred; int flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; - struct proc *p = cnp->cn_proc; + vfs_context_t context = ap->a_context; + kauth_cred_t cred; #if REV_ENDIAN_FS int rev_endian=0; #endif /* REV_ENDIAN_FS */ + cred = vfs_context_ucred(context); bp = NULL; slotoffset = -1; *vpp = NULL; vdp = ap->a_dvp; dp = VTOI(vdp); - lockparent = flags & LOCKPARENT; + wantparent = flags & (LOCKPARENT|WANTPARENT); + #if REV_ENDIAN_FS rev_endian=(vdp->v_mount->mnt_flag & MNT_REVEND); #endif /* REV_ENDIAN_FS */ @@ -175,11 +179,6 @@ ufs_lookup(ap) */ if ((dp->i_mode & IFMT) != IFDIR) return (ENOTDIR); - if (error = VOP_ACCESS(vdp, VEXEC, cred, cnp->cn_proc)) - return (error); - if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) && - (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) - return (EROFS); /* * We now have a segment name to search for, and a directory to search. @@ -189,50 +188,10 @@ ufs_lookup(ap) * we are looking for is known already. */ if (error = cache_lookup(vdp, vpp, cnp)) { - int vpid; /* capability number of vnode */ - if (error == ENOENT) return (error); - /* - * Get the next vnode in the path. - * See comment below starting `Step through' for - * an explaination of the locking protocol.
- */ - pdp = vdp; - dp = VTOI(*vpp); - vdp = *vpp; - vpid = vdp->v_id; - if (pdp == vdp) { /* lookup on "." */ - VREF(vdp); - error = 0; - } else if (flags & ISDOTDOT) { - VOP_UNLOCK(pdp, 0, p); - error = vget(vdp, LK_EXCLUSIVE, p); - if (!error && lockparent && (flags & ISLASTCN)) - error = vn_lock(pdp, LK_EXCLUSIVE, p); - } else { - error = vget(vdp, LK_EXCLUSIVE, p); - if (!lockparent || error || !(flags & ISLASTCN)) - VOP_UNLOCK(pdp, 0, p); - } - /* - * Check that the capability number did not change - * while we were waiting for the lock. - */ - if (!error) { - if (vpid == vdp->v_id) - return (0); - vput(vdp); - if (lockparent && pdp != vdp && (flags & ISLASTCN)) - VOP_UNLOCK(pdp, 0, p); - } - if (error = vn_lock(pdp, LK_EXCLUSIVE, p)) - return (error); - vdp = pdp; - dp = VTOI(pdp); - *vpp = NULL; + return (0); } - /* * Suppress search for slots unless creating * file and at end of pathname, in which case @@ -247,7 +206,6 @@ ufs_lookup(ap) slotneeded = (sizeof(struct direct) - MAXNAMLEN + cnp->cn_namelen + 3) &~ 3; } - /* * If there is cached information on a previous search of * this directory, pick up where we last left off. @@ -259,7 +217,7 @@ ufs_lookup(ap) * profiling time and hence has been removed in the interest * of simplicity. */ - bmask = VFSTOUFS(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; + bmask = VFSTOUFS(vdp->v_mount)->um_mountp->mnt_vfsstat.f_iosize - 1; if (nameiop != LOOKUP || dp->i_diroff == 0 || dp->i_diroff > dp->i_size) { entryoffsetinblock = 0; @@ -268,8 +226,8 @@ ufs_lookup(ap) } else { dp->i_offset = dp->i_diroff; if ((entryoffsetinblock = dp->i_offset & bmask) && - (error = VOP_BLKATOFF(vdp, (off_t)dp->i_offset, NULL, &bp))) - return (error); + (error = ffs_blkatoff(vdp, (off_t)dp->i_offset, NULL, &bp))) + goto out; numdirpasses = 2; nchstats.ncs_2passes++; } @@ -288,11 +246,10 @@ searchloop: if (rev_endian) byte_swap_dir_block_out(bp); #endif /* REV_ENDIAN_FS */ - brelse(bp); + buf_brelse(bp); } - if (error = - VOP_BLKATOFF(vdp, (off_t)dp->i_offset, NULL, &bp)) - return (error); + if (error = ffs_blkatoff(vdp, (off_t)dp->i_offset, NULL, &bp)) + goto out; entryoffsetinblock = 0; } /* @@ -311,7 +268,7 @@ searchloop: * directory. Complete checks can be run by patching * "dirchk" to be true. */ - ep = (struct direct *)((char *)bp->b_data + entryoffsetinblock); + ep = (struct direct *)((char *)buf_dataptr(bp) + entryoffsetinblock); if (ep->d_reclen == 0 || dirchk && ufs_dirbadentry(vdp, ep, entryoffsetinblock)) { int i; @@ -389,7 +346,7 @@ searchloop: if (rev_endian) byte_swap_dir_block_out(bp); #endif /* REV_ENDIAN_FS */ - brelse(bp); + buf_brelse(bp); goto found; } } @@ -415,7 +372,7 @@ notfound: if (rev_endian) byte_swap_dir_block_out(bp); #endif /* REV_ENDIAN_FS */ - brelse(bp); + buf_brelse(bp); } /* * If creating, and at end of pathname and current @@ -427,12 +384,6 @@ notfound: (ap->a_cnp->cn_flags & DOWHITEOUT) && (ap->a_cnp->cn_flags & ISWHITEOUT))) && (flags & ISLASTCN) && dp->i_nlink != 0) { - /* - * Access for write is interpreted as allowing - * creation of files in the directory. - */ - if (error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_proc)) - return (error); /* * Return an indication of where the new directory * entry should be put. If we didn't find a slot, @@ -467,23 +418,20 @@ notfound: * We return ni_vp == NULL to indicate that the entry * does not currently exist; we leave a pointer to * the (locked) directory inode in ndp->ni_dvp. - * The pathname buffer is saved so that the name - * can be obtained later. 
* * NB - if the directory is unlocked, then this * information cannot be used. */ - cnp->cn_flags |= SAVENAME; - if (!lockparent) - VOP_UNLOCK(vdp, 0, p); - return (EJUSTRETURN); + error = EJUSTRETURN; + goto out; } /* * Insert name into cache (as non-existent) if appropriate. */ if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) cache_enter(vdp, *vpp, cnp); - return (ENOENT); + error = ENOENT; + goto out; } found: if (numdirpasses == 2) @@ -514,11 +462,6 @@ found: * on and lock the inode, being careful with ".". */ if (nameiop == DELETE && (flags & ISLASTCN)) { - /* - * Write access to directory required to delete files. - */ - if (error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_proc)) - return (error); /* * Return pointer to current entry in dp->i_offset, * and distance past previous entry (if there @@ -530,30 +473,15 @@ found: else dp->i_count = dp->i_offset - prevoff; if (dp->i_number == dp->i_ino) { - VREF(vdp); + vnode_get(vdp); *vpp = vdp; - return (0); - } - if (error = VFS_VGET(vdp->v_mount, (void *)dp->i_ino, &tdp)) - return (error); - /* - * If directory is "sticky", then user must own - * the directory, or the file in it, else she - * may not delete it (unless she's root). This - * implements append-only directories. - */ - if ((dp->i_mode & ISVTX) && - cred->cr_uid != 0 && - cred->cr_uid != dp->i_uid && - tdp->v_type != VLNK && - VTOI(tdp)->i_uid != cred->cr_uid) { - vput(tdp); - return (EPERM); + error = 0; + goto out; } + if (error = ffs_vget_internal(vdp->v_mount, dp->i_ino, &tdp, vdp, cnp, 0, 0)) + goto out; *vpp = tdp; - if (!lockparent) - VOP_UNLOCK(vdp, 0, p); - return (0); + goto out; } /* @@ -563,25 +491,23 @@ found: * regular file, or empty directory. */ if (nameiop == RENAME && wantparent && (flags & ISLASTCN)) { - if (error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_proc)) - return (error); /* * Careful about locking second inode. * This can only occur if the target is ".". */ - if (dp->i_number == dp->i_ino) - return (EISDIR); - if (error = VFS_VGET(vdp->v_mount, (void *)dp->i_ino, &tdp)) - return (error); + if (dp->i_number == dp->i_ino) { + error = EISDIR; + goto out; + } + if (error = ffs_vget_internal(vdp->v_mount, dp->i_ino, &tdp, vdp, cnp, 0, 0)) + goto out; *vpp = tdp; - cnp->cn_flags |= SAVENAME; - if (!lockparent) - VOP_UNLOCK(vdp, 0, p); - return (0); + + goto out; } /* - * Step through the translation in the name. We do not `vput' the + * Step through the translation in the name. We do not `vnode_put' the * directory because we may need it again if a symbolic link * is relative to the current directory. Instead we save it * unlocked as "pdp". We must get the target inode before unlocking @@ -601,48 +527,36 @@ found: */ pdp = vdp; if (flags & ISDOTDOT) { - VOP_UNLOCK(pdp, 0, p); /* race to get the inode */ - if (error = VFS_VGET(vdp->v_mount, (void *)dp->i_ino, &tdp)) { - vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY, p); - return (error); - } - if (lockparent && (flags & ISLASTCN) && - (error = vn_lock(pdp, LK_EXCLUSIVE, p))) { - vput(tdp); - return (error); + if (error = ffs_vget_internal(vdp->v_mount, dp->i_ino, &tdp, vdp, cnp, 0, 0)) { + goto out; } *vpp = tdp; } else if (dp->i_number == dp->i_ino) { - VREF(vdp); /* we want ourself, ie "." + vnode_get(vdp); /* we want ourself, ie "."
*/ *vpp = vdp; } else { - if (error = VFS_VGET(vdp->v_mount, (void *)dp->i_ino, &tdp)) - return (error); - if (!lockparent || !(flags & ISLASTCN)) - VOP_UNLOCK(pdp, 0, p); + if (error = ffs_vget_internal(vdp->v_mount, dp->i_ino, &tdp, vdp, cnp, 0, 0)) + goto out; *vpp = tdp; } - /* - * Insert name into cache if appropriate. - */ - if (cnp->cn_flags & MAKEENTRY) - cache_enter(vdp, *vpp, cnp); - return (0); + error = 0; +out: + return (error); } void ufs_dirbad(ip, offset, how) struct inode *ip; doff_t offset; - char *how; + const char *how; { struct mount *mp; mp = ITOV(ip)->v_mount; (void)printf("%s: bad dir ino %d at offset %d: %s\n", - mp->mnt_stat.f_mntonname, ip->i_number, offset, how); - if ((mp->mnt_stat.f_flags & MNT_RDONLY) == 0) + mp->mnt_vfsstat.f_mntonname, ip->i_number, offset, how); + if ((mp->mnt_vfsstat.f_flags & MNT_RDONLY) == 0) panic("bad dir"); } @@ -710,10 +624,6 @@ ufs_direnter(ip, dvp, cnp) register struct inode *dp; struct direct newdir; -#if DIAGNOSTIC - if ((cnp->cn_flags & HASBUF) == 0) - panic("direnter: missing name"); -#endif dp = VTOI(dvp); newdir.d_ino = ip->i_number; newdir.d_namlen = cnp->cn_namelen; @@ -728,28 +638,25 @@ ufs_direnter(ip, dvp, cnp) newdir.d_type = tmp; } # endif } - return (ufs_direnter2(dvp, &newdir, cnp->cn_cred, cnp->cn_proc)); + return (ufs_direnter2(dvp, &newdir, cnp->cn_context)); } /* * Common entry point for directory entry removal used by ufs_direnter * and ufs_whiteout */ -ufs_direnter2(dvp, dirp, cr, p) - struct vnode *dvp; - struct direct *dirp; - struct ucred *cr; - struct proc *p; +int +ufs_direnter2(struct vnode *dvp, struct direct *dirp, vfs_context_t ctx) { int newentrysize; struct inode *dp; struct buf *bp; - struct iovec aiov; - struct uio auio; + uio_t auio; u_int dsize; struct direct *ep, *nep; int error, loc, spacefree; char *dirbuf; + char uio_buf[ UIO_SIZEOF(1) ]; #if REV_ENDIAN_FS struct mount *mp=dvp->v_mount; int rev_endian=(mp->mnt_flag & MNT_REVEND); @@ -767,19 +674,14 @@ ufs_direnter2(dvp, dirp, cr, p) */ if (dp->i_offset & (DIRBLKSIZ - 1)) panic("ufs_direnter2: newblk"); - auio.uio_offset = dp->i_offset; dirp->d_reclen = DIRBLKSIZ; - auio.uio_resid = newentrysize; - aiov.iov_len = newentrysize; - aiov.iov_base = (caddr_t)dirp; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_rw = UIO_WRITE; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_procp = (struct proc *)0; - error = VOP_WRITE(dvp, &auio, IO_SYNC, cr); + auio = uio_createwithbuffer(1, dp->i_offset, UIO_SYSSPACE, UIO_WRITE, + &uio_buf[0], sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(dirp), newentrysize); + + error = ffs_write_internal(dvp, auio, IO_SYNC, vfs_context_ucred(ctx)); if (DIRBLKSIZ > - VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_bsize) + VFSTOUFS(dvp->v_mount)->um_mountp->mnt_vfsstat.f_bsize) /* XXX should grow with balloc() */ panic("ufs_direnter2: frag size"); else if (!error) { @@ -810,7 +712,7 @@ ufs_direnter2(dvp, dirp, cr, p) /* * Get the block containing the space for the new directory entry. */ - if (error = VOP_BLKATOFF(dvp, (off_t)dp->i_offset, &dirbuf, &bp)) + if (error = ffs_blkatoff(dvp, (off_t)dp->i_offset, &dirbuf, &bp)) return (error); /* * Find space for the new entry. 
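/*
 * [sketch] ufs_direnter2() above swaps the hand-rolled iovec/uio setup for
 * the opaque uio_t KPI: UIO_SIZEOF(n) sizes a caller-supplied buffer for an
 * n-iovec uio, uio_createwithbuffer() initializes it without allocating,
 * and uio_addiov() attaches the data.  The same shape in isolation (the
 * helper name and parameters are invented for illustration):
 */
static int
write_kernel_buf(vnode_t vp, void *buf, int len, off_t off, kauth_cred_t cred)
{
	char	uio_buf[UIO_SIZEOF(1)];
	uio_t	auio;

	auio = uio_createwithbuffer(1, off, UIO_SYSSPACE, UIO_WRITE,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, CAST_USER_ADDR_T(buf), len);

	/* ffs_write_internal() is the bypass write entry point added by this patch */
	return (ffs_write_internal(vp, auio, IO_SYNC, cred));
}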
In the simple case, the entry at @@ -861,13 +763,14 @@ ufs_direnter2(dvp, dirp, cr, p) #endif /* REV_ENDIAN_FS */ if (mp->mnt_flag & MNT_ASYNC) { error = 0; - bdwrite(bp); + buf_bdwrite(bp); } else { - error = VOP_BWRITE(bp); + error = VNOP_BWRITE(bp); } dp->i_flag |= IN_CHANGE | IN_UPDATE; if (!error && dp->i_endoff && dp->i_endoff < dp->i_size) - error = VOP_TRUNCATE(dvp, (off_t)dp->i_endoff, IO_SYNC, cr, p); + error = ffs_truncate_internal(dvp, (off_t)dp->i_endoff, IO_SYNC, vfs_context_ucred(ctx)); + return (error); } @@ -903,8 +806,7 @@ ufs_dirremove(dvp, cnp) /* * Whiteout entry: set d_ino to WINO. */ - if (error = - VOP_BLKATOFF(dvp, (off_t)dp->i_offset, (char **)&ep, &bp)) + if (error = ffs_blkatoff(dvp, (off_t)dp->i_offset, (char **)&ep, &bp)) return (error); ep->d_ino = WINO; ep->d_type = DT_WHT; @@ -914,9 +816,9 @@ ufs_dirremove(dvp, cnp) #endif /* REV_ENDIAN_FS */ if (mp->mnt_flag & MNT_ASYNC) { error = 0; - bdwrite(bp); + buf_bdwrite(bp); } else { - error = VOP_BWRITE(bp); + error = VNOP_BWRITE(bp); } dp->i_flag |= IN_CHANGE | IN_UPDATE; return (error); @@ -926,8 +828,7 @@ ufs_dirremove(dvp, cnp) /* * First entry in block: set d_ino to zero. */ - if (error = - VOP_BLKATOFF(dvp, (off_t)dp->i_offset, (char **)&ep, &bp)) + if (error = ffs_blkatoff(dvp, (off_t)dp->i_offset, (char **)&ep, &bp)) return (error); ep->d_ino = 0; #if REV_ENDIAN_FS @@ -936,9 +837,9 @@ ufs_dirremove(dvp, cnp) #endif /* REV_ENDIAN_FS */ if (mp->mnt_flag & MNT_ASYNC) { error = 0; - bdwrite(bp); + buf_bdwrite(bp); } else { - error = VOP_BWRITE(bp); + error = VNOP_BWRITE(bp); } dp->i_flag |= IN_CHANGE | IN_UPDATE; return (error); @@ -946,7 +847,7 @@ ufs_dirremove(dvp, cnp) /* * Collapse new free space into previous entry. */ - if (error = VOP_BLKATOFF(dvp, (off_t)(dp->i_offset - dp->i_count), + if (error = ffs_blkatoff(dvp, (off_t)(dp->i_offset - dp->i_count), (char **)&ep, &bp)) return (error); ep->d_reclen += dp->i_reclen; @@ -956,11 +857,12 @@ ufs_dirremove(dvp, cnp) #endif /* REV_ENDIAN_FS */ if (mp->mnt_flag & MNT_ASYNC) { error = 0; - bdwrite(bp); + buf_bdwrite(bp); } else { - error = VOP_BWRITE(bp); + error = VNOP_BWRITE(bp); } dp->i_flag |= IN_CHANGE | IN_UPDATE; + return (error); } @@ -979,7 +881,7 @@ ufs_dirrewrite(dp, ip, cnp) struct vnode *vdp = ITOV(dp); int error; - if (error = VOP_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp)) + if (error = ffs_blkatoff(vdp, (off_t)dp->i_offset, (char **)&ep, &bp)) return (error); ep->d_ino = ip->i_number; if (vdp->v_mount->mnt_maxsymlinklen > 0) @@ -990,9 +892,9 @@ ufs_dirrewrite(dp, ip, cnp) #endif /* REV_ENDIAN_FS */ if (vdp->v_mount->mnt_flag & MNT_ASYNC) { error = 0; - bdwrite(bp); + buf_bdwrite(bp); } else { - error = VOP_BWRITE(bp); + error = VNOP_BWRITE(bp); } dp->i_flag |= IN_CHANGE | IN_UPDATE; return (error); @@ -1008,10 +910,7 @@ ufs_dirrewrite(dp, ip, cnp) * NB: does not handle corrupted directories. */ int -ufs_dirempty(ip, parentino, cred) - register struct inode *ip; - ino_t parentino; - struct ucred *cred; +ufs_dirempty(struct inode *ip, ino_t parentino, kauth_cred_t cred) { register off_t off; struct dirtemplate dbuf; @@ -1027,7 +926,7 @@ ufs_dirempty(ip, parentino, cred) for (off = 0; off < ip->i_size; off += dp->d_reclen) { error = vn_rdwr(UIO_READ, ITOV(ip), (caddr_t)dp, MINDIRSIZ, off, - UIO_SYSSPACE, IO_NODELOCKED, cred, &count, (struct proc *)0); + UIO_SYSSPACE32, IO_NODELOCKED, cred, &count, (struct proc *)0); /* * Since we read MINDIRSIZ, residual must * be 0 unless we're at end of file. 
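/*
 * [sketch] Every directory-block update above now picks its write policy
 * the same way: on an MNT_ASYNC mount the dirty buffer is queued with
 * buf_bdwrite() and treated as successful, otherwise VNOP_BWRITE() writes
 * it synchronously and returns the real status.  Factored out for clarity:
 */
static int
dirblk_write(struct mount *mp, buf_t bp)
{
	if (mp->mnt_flag & MNT_ASYNC) {
		buf_bdwrite(bp);	/* delayed write; flushed later */
		return (0);
	}
	return (VNOP_BWRITE(bp));	/* synchronous write, real error */
}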
@@ -1074,15 +973,15 @@ ufs_dirempty(ip, parentino, cred) /* * Check if source directory is in the path of the target directory. * Target is supplied locked, source is unlocked. - * The target is always vput before returning. */ int ufs_checkpath(source, target, cred) struct inode *source, *target; - struct ucred *cred; + kauth_cred_t cred; { struct vnode *vp; int error, rootino, namlen; + int need_put = 0; struct dirtemplate dirbuf; vp = ITOV(target); @@ -1101,7 +1000,7 @@ ufs_checkpath(source, target, cred) break; } error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirbuf, - sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, + sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE32, IO_NODELOCKED, cred, (int *)0, (struct proc *)0); if (error != 0) break; @@ -1125,17 +1024,22 @@ ufs_checkpath(source, target, cred) } if (dirbuf.dotdot_ino == rootino) break; - vput(vp); - if (error = VFS_VGET(vp->v_mount, (void *)dirbuf.dotdot_ino, &vp)) { + + if (need_put) + vnode_put(vp); + + if (error = VFS_VGET(vp->v_mount, (ino64_t)dirbuf.dotdot_ino, &vp, NULL)) { /* XXX need context */ vp = NULL; break; } + need_put = 1; } out: if (error == ENOTDIR) printf("checkpath: .. not a directory\n"); - if (vp != NULL) - vput(vp); + if (need_put && vp) + vnode_put(vp); + return (error); } diff --git a/bsd/ufs/ufs/ufs_quota.c b/bsd/ufs/ufs/ufs_quota.c index e17245068..5ae7280be 100644 --- a/bsd/ufs/ufs/ufs_quota.c +++ b/bsd/ufs/ufs/ufs_quota.c @@ -62,9 +62,10 @@ #include <sys/systm.h> #include <sys/malloc.h> #include <sys/file.h> -#include <sys/proc.h> -#include <sys/vnode.h> -#include <sys/mount.h> +#include <sys/proc.h> +#include <sys/kauth.h> +#include <sys/vnode_internal.h> +#include <sys/mount_internal.h> #include <sys/namei.h> #include <sys/quota.h> @@ -101,7 +102,7 @@ getinoquota(ip) */ if (ip->i_dquot[USRQUOTA] == NODQUOT && (error = - dqget(vp, ip->i_uid, &ump->um_qfiles[USRQUOTA], USRQUOTA, &ip->i_dquot[USRQUOTA])) && + dqget(ip->i_uid, &ump->um_qfiles[USRQUOTA], USRQUOTA, &ip->i_dquot[USRQUOTA])) && error != EINVAL) return (error); /* @@ -110,7 +111,7 @@ getinoquota(ip) */ if (ip->i_dquot[GRPQUOTA] == NODQUOT && (error = - dqget(vp, ip->i_gid, &ump->um_qfiles[GRPQUOTA], GRPQUOTA, &ip->i_dquot[GRPQUOTA])) && + dqget(ip->i_gid, &ump->um_qfiles[GRPQUOTA], GRPQUOTA, &ip->i_dquot[GRPQUOTA])) && error != EINVAL) return (error); return (0); @@ -120,11 +121,7 @@ getinoquota(ip) * Update disk usage, and take corrective action. 
*/ int -chkdq(ip, change, cred, flags) - register struct inode *ip; - int64_t change; - struct ucred *cred; - int flags; +chkdq(struct inode *ip, int64_t change, kauth_cred_t cred, int flags) { register struct dquot *dq; register int i; @@ -142,10 +139,8 @@ chkdq(ip, change, cred, flags) for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; - while (dq->dq_flags & DQ_LOCK) { - dq->dq_flags |= DQ_WANT; - sleep((caddr_t)dq, PINOD+1); - } + dqlock(dq); + ncurbytes = dq->dq_curbytes + change; if (ncurbytes >= 0) dq->dq_curbytes = ncurbytes; @@ -153,29 +148,32 @@ chkdq(ip, change, cred, flags) dq->dq_curbytes = 0; dq->dq_flags &= ~DQ_BLKS; dq->dq_flags |= DQ_MOD; + + dqunlock(dq); } return (0); } +#warning "hack for no cred passed to chkdq()" p = current_proc(); if (cred == NOCRED) - cred = kernproc->p_ucred; - if ((flags & FORCE) == 0 && ((cred->cr_uid != 0) || (p->p_flag & P_FORCEQUOTA))) { + cred = proc_ucred(kernproc); + if ((flags & FORCE) == 0 && (suser(cred, NULL) || (proc_forcequota(p)))) { for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; - if (error = chkdqchg(ip, change, cred, i)) + if ( (error = chkdqchg(ip, change, cred, i)) ) return (error); } } for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; - while (dq->dq_flags & DQ_LOCK) { - dq->dq_flags |= DQ_WANT; - sleep((caddr_t)dq, PINOD+1); - } + dqlock(dq); + dq->dq_curbytes += change; dq->dq_flags |= DQ_MOD; + + dqunlock(dq); } return (0); } @@ -185,28 +183,29 @@ chkdq(ip, change, cred, flags) * Issue an error message if appropriate. */ int -chkdqchg(ip, change, cred, type) - struct inode *ip; - int64_t change; - struct ucred *cred; - int type; +chkdqchg(struct inode *ip, int64_t change, kauth_cred_t cred, int type) { register struct dquot *dq = ip->i_dquot[type]; - u_int64_t ncurbytes = dq->dq_curbytes + change; + u_int64_t ncurbytes; + dqlock(dq); + + ncurbytes = dq->dq_curbytes + change; /* * If user would exceed their hard limit, disallow space allocation. */ if (ncurbytes >= dq->dq_bhardlimit && dq->dq_bhardlimit) { if ((dq->dq_flags & DQ_BLKS) == 0 && - ip->i_uid == cred->cr_uid) { + ip->i_uid == kauth_cred_getuid(cred)) { #if 1 printf("\n%s: write failed, %s disk limit reached\n", - ITOV(ip)->v_mount->mnt_stat.f_mntonname, + ITOV(ip)->v_mount->mnt_vfsstat.f_mntonname, quotatypes[type]); #endif dq->dq_flags |= DQ_BLKS; } + dqunlock(dq); + return (EDQUOT); } /* @@ -214,31 +213,40 @@ chkdqchg(ip, change, cred, type) * allocation. Reset time limit as they cross their soft limit. 
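/*
 * [sketch] chkdq()/chkiq() above replace the open-coded DQ_LOCK/DQ_WANT
 * sleep loop with dqlock()/dqunlock() brackets around every dquot field
 * update.  The usage-debit case, reduced to its essentials:
 */
static void
dq_apply_delta(struct dquot *dq, int64_t change)
{
	int64_t ncurbytes;

	dqlock(dq);

	ncurbytes = dq->dq_curbytes + change;
	if (ncurbytes >= 0)
		dq->dq_curbytes = ncurbytes;
	else {
		dq->dq_curbytes = 0;	/* usage never goes negative */
		dq->dq_flags &= ~DQ_BLKS;
	}
	dq->dq_flags |= DQ_MOD;		/* mark dirty for a later dqsync() */

	dqunlock(dq);
}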
*/ if (ncurbytes >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) { + struct timeval tv; + + microtime(&tv); if (dq->dq_curbytes < dq->dq_bsoftlimit) { - dq->dq_btime = time.tv_sec + + dq->dq_btime = tv.tv_sec + VFSTOUFS(ITOV(ip)->v_mount)->um_qfiles[type].qf_btime; #if 1 - if (ip->i_uid == cred->cr_uid) + if (ip->i_uid == kauth_cred_getuid(cred)) printf("\n%s: warning, %s %s\n", - ITOV(ip)->v_mount->mnt_stat.f_mntonname, + ITOV(ip)->v_mount->mnt_vfsstat.f_mntonname, quotatypes[type], "disk quota exceeded"); #endif + dqunlock(dq); + return (0); } - if (time.tv_sec > dq->dq_btime) { + if (tv.tv_sec > dq->dq_btime) { if ((dq->dq_flags & DQ_BLKS) == 0 && - ip->i_uid == cred->cr_uid) { + ip->i_uid == kauth_cred_getuid(cred)) { #if 1 printf("\n%s: write failed, %s %s\n", - ITOV(ip)->v_mount->mnt_stat.f_mntonname, + ITOV(ip)->v_mount->mnt_vfsstat.f_mntonname, quotatypes[type], "disk quota exceeded for too long"); #endif dq->dq_flags |= DQ_BLKS; } + dqunlock(dq); + return (EDQUOT); } } + dqunlock(dq); + return (0); } @@ -246,11 +254,7 @@ chkdqchg(ip, change, cred, type) * Check the inode limit, applying corrective action. */ int -chkiq(ip, change, cred, flags) - register struct inode *ip; - long change; - struct ucred *cred; - int flags; +chkiq(struct inode *ip, long change, kauth_cred_t cred, int flags) { register struct dquot *dq; register int i; @@ -267,10 +271,8 @@ chkiq(ip, change, cred, flags) for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; - while (dq->dq_flags & DQ_LOCK) { - dq->dq_flags |= DQ_WANT; - sleep((caddr_t)dq, PINOD+1); - } + dqlock(dq); + ncurinodes = dq->dq_curinodes + change; if (ncurinodes >= 0) dq->dq_curinodes = ncurinodes; @@ -278,29 +280,32 @@ chkiq(ip, change, cred, flags) dq->dq_curinodes = 0; dq->dq_flags &= ~DQ_INODS; dq->dq_flags |= DQ_MOD; + + dqunlock(dq); } return (0); } +#warning "hack for no cred passed to chkiq()" p = current_proc(); if (cred == NOCRED) - cred = kernproc->p_ucred; - if ((flags & FORCE) == 0 && ((cred->cr_uid != 0) || (p->p_flag & P_FORCEQUOTA))) { + cred = proc_ucred(kernproc); + if ((flags & FORCE) == 0 && (suser(cred, NULL) || (proc_forcequota(p)))) { for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; - if (error = chkiqchg(ip, change, cred, i)) + if ( (error = chkiqchg(ip, change, cred, i)) ) return (error); } } for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; - while (dq->dq_flags & DQ_LOCK) { - dq->dq_flags |= DQ_WANT; - sleep((caddr_t)dq, PINOD+1); - } + dqlock(dq); + dq->dq_curinodes += change; dq->dq_flags |= DQ_MOD; + + dqunlock(dq); } return (0); } @@ -310,28 +315,29 @@ chkiq(ip, change, cred, flags) * Issue an error message if appropriate. */ int -chkiqchg(ip, change, cred, type) - struct inode *ip; - long change; - struct ucred *cred; - int type; +chkiqchg(struct inode *ip, long change, kauth_cred_t cred, int type) { register struct dquot *dq = ip->i_dquot[type]; - long ncurinodes = dq->dq_curinodes + change; + long ncurinodes; + + dqlock(dq); + ncurinodes = dq->dq_curinodes + change; /* * If user would exceed their hard limit, disallow inode allocation. 
*/ if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) { if ((dq->dq_flags & DQ_INODS) == 0 && - ip->i_uid == cred->cr_uid) { + ip->i_uid == kauth_cred_getuid(cred)) { #if 1 printf("\n%s: write failed, %s inode limit reached\n", - ITOV(ip)->v_mount->mnt_stat.f_mntonname, + ITOV(ip)->v_mount->mnt_vfsstat.f_mntonname, quotatypes[type]); #endif dq->dq_flags |= DQ_INODS; } + dqunlock(dq); + return (EDQUOT); } /* @@ -339,31 +345,40 @@ chkiqchg(ip, change, cred, type) * allocation. Reset time limit as they cross their soft limit. */ if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) { + struct timeval tv; + + microtime(&tv); if (dq->dq_curinodes < dq->dq_isoftlimit) { - dq->dq_itime = time.tv_sec + + dq->dq_itime = tv.tv_sec + VFSTOUFS(ITOV(ip)->v_mount)->um_qfiles[type].qf_itime; #if 1 - if (ip->i_uid == cred->cr_uid) + if (ip->i_uid == kauth_cred_getuid(cred)) printf("\n%s: warning, %s %s\n", - ITOV(ip)->v_mount->mnt_stat.f_mntonname, + ITOV(ip)->v_mount->mnt_vfsstat.f_mntonname, quotatypes[type], "inode quota exceeded"); #endif + dqunlock(dq); + return (0); } - if (time.tv_sec > dq->dq_itime) { + if (tv.tv_sec > dq->dq_itime) { if ((dq->dq_flags & DQ_INODS) == 0 && - ip->i_uid == cred->cr_uid) { + ip->i_uid == kauth_cred_getuid(cred)) { #if 1 printf("\n%s: write failed, %s %s\n", - ITOV(ip)->v_mount->mnt_stat.f_mntonname, + ITOV(ip)->v_mount->mnt_vfsstat.f_mntonname, quotatypes[type], "inode quota exceeded for too long"); #endif dq->dq_flags |= DQ_INODS; } + dqunlock(dq); + return (EDQUOT); } } + dqunlock(dq); + return (0); } @@ -380,8 +395,7 @@ chkdquot(ip) register int i; for (i = 0; i < MAXQUOTAS; i++) { - if (ump->um_qfiles[i].qf_vp == NULLVP || - (ump->um_qfiles[i].qf_qflags & (QTF_OPENING|QTF_CLOSING))) + if (ump->um_qfiles[i].qf_vp == NULLVP) continue; if (ip->i_dquot[i] == NODQUOT) { vprint("chkdquot: missing dquot", ITOV(ip)); @@ -395,140 +409,186 @@ chkdquot(ip) * Code to process quotactl commands. */ + +struct ufs_quotaon_cargs { + int error; +}; + + +static int +ufs_quotaon_callback(struct vnode *vp, void *cargs) +{ + struct ufs_quotaon_cargs *args; + + args = (struct ufs_quotaon_cargs *)cargs; + + if ( (args->error = getinoquota(VTOI(vp))) ) + return (VNODE_RETURNED_DONE); + + return (VNODE_RETURNED); +} + + /* * Q_QUOTAON - set up a quota file for a particular file system. 
*/ int -quotaon(p, mp, type, fname, segflg) - struct proc *p; +quotaon(context, mp, type, fnamep) + vfs_context_t context; struct mount *mp; register int type; - caddr_t fname; - enum uio_seg segflg; + caddr_t fnamep; { struct ufsmount *ump = VFSTOUFS(mp); - struct vnode *vp, **vpp; - struct vnode *nextvp; - struct dquot *dq; - int error; - struct nameidata nd; + struct quotafile *qfp; + struct vnode *vp; + int error = 0; + struct ufs_quotaon_cargs args; - vpp = &ump->um_qfiles[type].qf_vp; - NDINIT(&nd, LOOKUP, FOLLOW, segflg, fname, p); - if (error = vn_open(&nd, FREAD|FWRITE, 0)) - return (error); - vp = nd.ni_vp; - VOP_UNLOCK(vp, 0, p); - if (vp->v_type != VREG) { - (void) vn_close(vp, FREAD|FWRITE, p->p_ucred, p); - return (EACCES); + qfp = &ump->um_qfiles[type]; + + if ( (qf_get(qfp, QTF_OPENING)) ) + return (0); + + error = vnode_open(fnamep, FREAD|FWRITE, 0, 0, &vp, NULL); + if (error) { + goto out; } - if (*vpp != vp) - quotaoff(p, mp, type); - ump->um_qfiles[type].qf_qflags |= QTF_OPENING; - mp->mnt_flag |= MNT_QUOTA; - vp->v_flag |= VNOFLUSH; - *vpp = vp; + if (!vnode_isreg(vp)) { + (void) vnode_close(vp, FREAD|FWRITE, NULL); + error = EACCES; + goto out; + } + vfs_setflags(mp, (uint64_t)((unsigned int)MNT_QUOTA)); + vnode_setnoflush(vp); /* * Save the credential of the process that turned on quotas. */ - crhold(p->p_ucred); - ump->um_qfiles[type].qf_cred = p->p_ucred; - /* Finish initializing the quota file */ - if (error = dqfileopen(&ump->um_qfiles[type], type)) - goto exit; -#if 0 - ump->um_qfiles[type].qf_btime = MAX_DQ_TIME; - ump->um_qfiles[type].qf_itime = MAX_IQ_TIME; - if (dqget(NULLVP, 0, &ump->um_qfiles[type], type, &dq) == 0) { - if (dq->dq_btime > 0) - ump->um_qfiles[type].qf_btime = dq->dq_btime; - if (dq->dq_itime > 0) - ump->um_qfiles[type].qf_itime = dq->dq_itime; - dqrele(NULLVP, dq); - } -#endif + qfp->qf_vp = vp; + qfp->qf_cred = vfs_context_ucred(context); + kauth_cred_ref(qfp->qf_cred); + + /* + * Finish initializing the quota file + */ + if ( (error = dqfileopen(&ump->um_qfiles[type], type)) ) { + (void) vnode_close(vp, FREAD|FWRITE, NULL); + + kauth_cred_rele(qfp->qf_cred); + qfp->qf_cred = NOCRED; + qfp->qf_vp = NULLVP; + goto out; + } + qf_put(qfp, QTF_OPENING); + /* * Search vnodes associated with this mount point, * adding references to quota file being opened. * NB: only need to add dquot's for inodes being modified. 
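/*
 * [sketch] The old restart-on-change walks of mp->mnt_vnodelist are gone;
 * quotaon()/quotaoff()/qsync() now hand a callback to vnode_iterate(),
 * which presents each vnode with an iocount held and lets the return value
 * steer the walk.  The skeleton of that protocol:
 */
struct my_cargs {			/* per-walk state, passed through */
	int error;
};

static int
my_callback(struct vnode *vp, void *cargs)
{
	struct my_cargs *args = (struct my_cargs *)cargs;

	if ((args->error = getinoquota(VTOI(vp))))
		return (VNODE_RETURNED_DONE);	/* stop the iteration early */

	return (VNODE_RETURNED);		/* continue with the next vnode */
}
/* used as:  vnode_iterate(mp, VNODE_WRITEABLE | VNODE_WAIT, my_callback, &args); */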
+ * + * ufs_quotaon_callback will be called for each vnode open for + * 'write' (VNODE_WRITEABLE) hung off of this mount point + * the vnode will be in an 'unbusy' state (VNODE_WAIT) and + * properly referenced and unreferenced around the callback */ -again: - for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nextvp) { - nextvp = vp->v_mntvnodes.le_next; - if (vp->v_writecount == 0) - continue; - if (vget(vp, LK_EXCLUSIVE, p)) - goto again; - if (error = getinoquota(VTOI(vp))) { - vput(vp); - break; - } - vput(vp); - if (vp->v_mntvnodes.le_next != nextvp || vp->v_mount != mp) - goto again; - } -exit: - ump->um_qfiles[type].qf_qflags &= ~QTF_OPENING; + args.error = 0; + + vnode_iterate(mp, VNODE_WRITEABLE | VNODE_WAIT, ufs_quotaon_callback, (void *)&args); + + error = args.error; + if (error) - quotaoff(p, mp, type); + quotaoff(mp, type); return (error); +out: + qf_put(qfp, QTF_OPENING); + + return (error); +} + + + +struct ufs_quotaoff_cargs { + int type; +}; + +static int +ufs_quotaoff_callback(struct vnode *vp, void *cargs) +{ + struct ufs_quotaoff_cargs *args; + struct inode *ip; + struct dquot *dq; + + args = (struct ufs_quotaoff_cargs *)cargs; + + ip = VTOI(vp); + + dq = ip->i_dquot[args->type]; + ip->i_dquot[args->type] = NODQUOT; + + dqrele(dq); + + return (VNODE_RETURNED); } /* * Q_QUOTAOFF - turn off disk quotas for a filesystem. */ int -quotaoff(p, mp, type) - struct proc *p; - struct mount *mp; - register int type; +quotaoff(struct mount *mp, register int type) { - struct vnode *vp; - struct vnode *qvp, *nextvp; + struct vnode *qvp; struct ufsmount *ump = VFSTOUFS(mp); - struct dquot *dq; - struct inode *ip; - int error; - struct ucred *cred; + struct quotafile *qfp; + int error = 0; + kauth_cred_t cred; + struct ufs_quotaoff_cargs args; + + qfp = &ump->um_qfiles[type]; + + if ( (qf_get(qfp, QTF_CLOSING)) ) + return (0); + qvp = qfp->qf_vp; + + /* + * Sync out any orphaned dirty dquot entries. + */ + dqsync_orphans(qfp); - if ((qvp = ump->um_qfiles[type].qf_vp) == NULLVP) - return (0); - ump->um_qfiles[type].qf_qflags |= QTF_CLOSING; /* * Search vnodes associated with this mount point, * deleting any references to quota file being closed.
+ * + * ufs_quotaoff_callback will be called for each vnode + * hung off of this mount point + * the vnode will be in an 'unbusy' state (VNODE_WAIT) and + * properly referenced and unreferenced around the callback */ -again: - for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nextvp) { - nextvp = vp->v_mntvnodes.le_next; - if (vget(vp, LK_EXCLUSIVE, p)) - goto again; - ip = VTOI(vp); - dq = ip->i_dquot[type]; - ip->i_dquot[type] = NODQUOT; - dqrele(vp, dq); - vput(vp); - if (vp->v_mntvnodes.le_next != nextvp || vp->v_mount != mp) - goto again; - } + args.type = type; + + vnode_iterate(mp, VNODE_WAIT, ufs_quotaoff_callback, (void *)&args); + dqflush(qvp); /* Finish tearing down the quota file */ - dqfileclose(&ump->um_qfiles[type], type); - qvp->v_flag &= ~VNOFLUSH; - error = vn_close(qvp, FREAD|FWRITE, p->p_ucred, p); - ump->um_qfiles[type].qf_vp = NULLVP; - cred = ump->um_qfiles[type].qf_cred; + dqfileclose(qfp, type); + + vnode_clearnoflush(qvp); + error = vnode_close(qvp, FREAD|FWRITE, NULL); + + qfp->qf_vp = NULLVP; + cred = qfp->qf_cred; if (cred != NOCRED) { - ump->um_qfiles[type].qf_cred = NOCRED; - crfree(cred); + qfp->qf_cred = NOCRED; + kauth_cred_rele(cred); } - ump->um_qfiles[type].qf_qflags &= ~QTF_CLOSING; for (type = 0; type < MAXQUOTAS; type++) if (ump->um_qfiles[type].qf_vp != NULLVP) break; if (type == MAXQUOTAS) mp->mnt_flag &= ~MNT_QUOTA; + + qf_put(qfp, QTF_CLOSING); + return (error); } @@ -536,19 +596,24 @@ again: * Q_GETQUOTA - return current values in a dqblk structure. */ int -getquota(mp, id, type, addr) +getquota(mp, id, type, datap) struct mount *mp; u_long id; int type; - caddr_t addr; + caddr_t datap; { struct dquot *dq; int error; - if (error = dqget(NULLVP, id, &VFSTOUFS(mp)->um_qfiles[type], type, &dq)) + if ( (error = dqget(id, &VFSTOUFS(mp)->um_qfiles[type], type, &dq)) ) return (error); - error = copyout((caddr_t)&dq->dq_dqb, addr, sizeof (struct dqblk)); - dqrele(NULLVP, dq); + dqlock(dq); + + bcopy(&dq->dq_dqb, datap, sizeof(dq->dq_dqb)); + + dqunlock(dq); + dqrele(dq); + return (error); } @@ -556,47 +621,47 @@ getquota(mp, id, type, addr) * Q_SETQUOTA - assign an entire dqblk structure. */ int -setquota(mp, id, type, addr) +setquota(mp, id, type, datap) struct mount *mp; u_long id; int type; - caddr_t addr; + caddr_t datap; { - register struct dquot *dq; - struct dquot *ndq; + struct dquot *dq; struct ufsmount *ump = VFSTOUFS(mp); - struct dqblk newlim; + struct dqblk * newlimp = (struct dqblk *) datap; + struct timeval tv; int error; - if (error = copyin(addr, (caddr_t)&newlim, sizeof (struct dqblk))) - return (error); - if (error = dqget(NULLVP, id, &ump->um_qfiles[type], type, &ndq)) + error = dqget(id, &ump->um_qfiles[type], type, &dq); + if (error) return (error); - dq = ndq; - while (dq->dq_flags & DQ_LOCK) { - dq->dq_flags |= DQ_WANT; - sleep((caddr_t)dq, PINOD+1); - } + dqlock(dq); + /* * Copy all but the current values. * Reset time limit if previously had no soft limit or were * under it, but now have a soft limit and are over it. 
*/ - newlim.dqb_curbytes = dq->dq_curbytes; - newlim.dqb_curinodes = dq->dq_curinodes; + newlimp->dqb_curbytes = dq->dq_curbytes; + newlimp->dqb_curinodes = dq->dq_curinodes; if (dq->dq_id != 0) { - newlim.dqb_btime = dq->dq_btime; - newlim.dqb_itime = dq->dq_itime; + newlimp->dqb_btime = dq->dq_btime; + newlimp->dqb_itime = dq->dq_itime; + } + if (newlimp->dqb_bsoftlimit && + dq->dq_curbytes >= newlimp->dqb_bsoftlimit && + (dq->dq_bsoftlimit == 0 || dq->dq_curbytes < dq->dq_bsoftlimit)) { + microtime(&tv); + newlimp->dqb_btime = tv.tv_sec + ump->um_qfiles[type].qf_btime; } - if (newlim.dqb_bsoftlimit && - dq->dq_curbytes >= newlim.dqb_bsoftlimit && - (dq->dq_bsoftlimit == 0 || dq->dq_curbytes < dq->dq_bsoftlimit)) - newlim.dqb_btime = time.tv_sec + ump->um_qfiles[type].qf_btime; - if (newlim.dqb_isoftlimit && - dq->dq_curinodes >= newlim.dqb_isoftlimit && - (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) - newlim.dqb_itime = time.tv_sec + ump->um_qfiles[type].qf_itime; - dq->dq_dqb = newlim; + if (newlimp->dqb_isoftlimit && + dq->dq_curinodes >= newlimp->dqb_isoftlimit && + (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) { + microtime(&tv); + newlimp->dqb_itime = tv.tv_sec + ump->um_qfiles[type].qf_itime; + } + bcopy(newlimp, &dq->dq_dqb, sizeof(dq->dq_dqb)); if (dq->dq_curbytes < dq->dq_bsoftlimit) dq->dq_flags &= ~DQ_BLKS; if (dq->dq_curinodes < dq->dq_isoftlimit) @@ -607,7 +672,10 @@ setquota(mp, id, type, addr) else dq->dq_flags &= ~DQ_FAKE; dq->dq_flags |= DQ_MOD; - dqrele(NULLVP, dq); + + dqunlock(dq); + dqrele(dq); + return (0); } @@ -615,48 +683,71 @@ setquota(mp, id, type, addr) * Q_SETUSE - set current inode and byte usage. */ int -setuse(mp, id, type, addr) +setuse(mp, id, type, datap) struct mount *mp; u_long id; int type; - caddr_t addr; + caddr_t datap; { - register struct dquot *dq; + struct dquot *dq; struct ufsmount *ump = VFSTOUFS(mp); - struct dquot *ndq; - struct dqblk usage; + struct timeval tv; int error; - - if (error = copyin(addr, (caddr_t)&usage, sizeof (struct dqblk))) - return (error); - if (error = dqget(NULLVP, id, &ump->um_qfiles[type], type, &ndq)) + struct dqblk *quotablkp = (struct dqblk *) datap; + + error = dqget(id, &ump->um_qfiles[type], type, &dq); + if (error) return (error); - dq = ndq; - while (dq->dq_flags & DQ_LOCK) { - dq->dq_flags |= DQ_WANT; - sleep((caddr_t)dq, PINOD+1); - } + dqlock(dq); + /* * Reset time limit if have a soft limit and were * previously under it, but are now over it. 
*/ if (dq->dq_bsoftlimit && dq->dq_curbytes < dq->dq_bsoftlimit && - usage.dqb_curbytes >= dq->dq_bsoftlimit) - dq->dq_btime = time.tv_sec + ump->um_qfiles[type].qf_btime; + quotablkp->dqb_curbytes >= dq->dq_bsoftlimit) { + microtime(&tv); + dq->dq_btime = tv.tv_sec + ump->um_qfiles[type].qf_btime; + } if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit && - usage.dqb_curinodes >= dq->dq_isoftlimit) - dq->dq_itime = time.tv_sec + ump->um_qfiles[type].qf_itime; - dq->dq_curbytes = usage.dqb_curbytes; - dq->dq_curinodes = usage.dqb_curinodes; + quotablkp->dqb_curinodes >= dq->dq_isoftlimit) { + microtime(&tv); + dq->dq_itime = tv.tv_sec + ump->um_qfiles[type].qf_itime; + } + dq->dq_curbytes = quotablkp->dqb_curbytes; + dq->dq_curinodes = quotablkp->dqb_curinodes; if (dq->dq_curbytes < dq->dq_bsoftlimit) dq->dq_flags &= ~DQ_BLKS; if (dq->dq_curinodes < dq->dq_isoftlimit) dq->dq_flags &= ~DQ_INODS; dq->dq_flags |= DQ_MOD; - dqrele(NULLVP, dq); + + dqunlock(dq); + dqrele(dq); + return (0); } + + +static int +ufs_qsync_callback(struct vnode *vp, __unused void *cargs) +{ + struct inode *ip; + struct dquot *dq; + int i; + + ip = VTOI(vp); + + for (i = 0; i < MAXQUOTAS; i++) { + dq = ip->i_dquot[i]; + if (dq != NODQUOT && (dq->dq_flags & DQ_MOD)) + dqsync(dq); + } + return (VNODE_RETURNED); +} + + /* * Q_SYNC - sync quota files to disk. */ @@ -665,10 +756,7 @@ qsync(mp) struct mount *mp; { struct ufsmount *ump = VFSTOUFS(mp); - struct proc *p = current_proc(); /* XXX */ - struct vnode *vp, *nextvp; - struct dquot *dq; - int i, error; + int i; /* * Check if the mount point has any quotas. @@ -682,33 +770,14 @@ qsync(mp) /* * Search vnodes associated with this mount point, * synchronizing any modified dquot structures. + * + * ufs_qsync_callback will be called for each vnode + * hung off of this mount point + * the vnode will be + * properly referenced and unreferenced around the callback */ - simple_lock(&mntvnode_slock); -again: - for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nextvp) { - if (vp->v_mount != mp) - goto again; - nextvp = vp->v_mntvnodes.le_next; - simple_lock(&vp->v_interlock); - simple_unlock(&mntvnode_slock); - error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, p); - if (error) { - simple_lock(&mntvnode_slock); - if (error == ENOENT) - goto again; - continue; - } - for (i = 0; i < MAXQUOTAS; i++) { - dq = VTOI(vp)->i_dquot[i]; - if (dq != NODQUOT && (dq->dq_flags & DQ_MOD)) - dqsync(vp, dq); - } - vput(vp); - simple_lock(&mntvnode_slock); - if (vp->v_mntvnodes.le_next != nextvp) - goto again; - } - simple_unlock(&mntvnode_slock); + vnode_iterate(mp, 0, ufs_qsync_callback, (void *)NULL); + return (0); } @@ -716,10 +785,10 @@ again: * Q_QUOTASTAT - get quota on/off status */ int -quotastat(mp, type, addr) +quotastat(mp, type, datap) struct mount *mp; register int type; - caddr_t addr; + caddr_t datap; { struct ufsmount *ump = VFSTOUFS(mp); int error = 0; @@ -729,8 +798,7 @@ quotastat(mp, type, addr) qstat = 1; /* quotas are on for this type */ else qstat = 0; /* quotas are off for this type */ - - error = copyout ((caddr_t)&qstat, addr, sizeof(qstat)); + *((int *)datap) = qstat; return (error); } diff --git a/bsd/ufs/ufs/ufs_readwrite.c b/bsd/ufs/ufs/ufs_readwrite.c index fc4a4cafe..9aa3b92cd 100644 --- a/bsd/ufs/ufs/ufs_readwrite.c +++ b/bsd/ufs/ufs/ufs_readwrite.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -55,45 +55,49 @@ * @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 */ +#include <sys/buf_internal.h> +#include <sys/uio_internal.h> + + #define BLKSIZE(a, b, c) blksize(a, b, c) #define FS struct fs #define I_FS i_fs -#define PGRD ffs_pgrd -#define PGRD_S "ffs_pgrd" -#define PGWR ffs_pgwr -#define PGWR_S "ffs_pgwr" + + /* * Vnode op for reading. */ /* ARGSUSED */ ffs_read(ap) - struct vop_read_args /* { + struct vnop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp; - register struct inode *ip; - register struct uio *uio; - register FS *fs; - struct buf *bp = (struct buf *)0; + return(ffs_read_internal(ap->a_vp, ap->a_uio, ap->a_ioflag)); +} + + +int +ffs_read_internal(vnode_t vp, struct uio *uio, int ioflag) +{ + struct inode *ip; + FS *fs; + buf_t bp = (struct buf *)0; ufs_daddr_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; - int devBlockSize=0; int error; u_short mode; #if REV_ENDIAN_FS int rev_endian=0; #endif /* REV_ENDIAN_FS */ - vp = ap->a_vp; ip = VTOI(vp); mode = ip->i_mode; - uio = ap->a_uio; #if REV_ENDIAN_FS rev_endian=(vp->v_mount->mnt_flag & MNT_REVEND); @@ -115,14 +119,13 @@ ffs_read(ap) if (uio->uio_offset > fs->fs_maxfilesize) return (EFBIG); - VOP_DEVBLOCKSIZE(ip->i_devvp, &devBlockSize); - - if (UBCISVALID(vp)) { - error = cluster_read(vp, uio, (off_t)ip->i_size, - devBlockSize, 0); + if (UBCINFOEXISTS(vp)) { + error = cluster_read(vp, uio, (off_t)ip->i_size, 0); } else { - for (error = 0, bp = NULL; uio->uio_resid > 0; + for (error = 0, bp = NULL; uio_resid(uio) > 0; bp = NULL) { + char *buf_data; + if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); @@ -130,46 +133,48 @@ ffs_read(ap) size = BLKSIZE(fs, ip, lbn); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; - if (uio->uio_resid < xfersize) - xfersize = uio->uio_resid; + // LP64todo - fix this + if (uio_resid(uio) < xfersize) + xfersize = uio_resid(uio); if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) >= ip->i_size) - error = bread(vp, lbn, size, NOCRED, &bp); - else if (lbn - 1 == vp->v_lastr && !(vp->v_flag & VRAOFF)) { + error = (int)buf_bread(vp, (daddr64_t)((unsigned)lbn), size, NOCRED, &bp); + else if (lbn - 1 == ip->i_lastr && !(vp->v_flag & VRAOFF)) { int nextsize = BLKSIZE(fs, ip, nextlbn); - error = breadn(vp, lbn, - size, &nextlbn, &nextsize, 1, NOCRED, &bp); + error = (int)buf_breadn(vp, (daddr64_t)((unsigned)lbn), + size, &nextlbn, &nextsize, 1, NOCRED, &bp); } else - error = bread(vp, lbn, size, NOCRED, &bp); + error = (int)buf_bread(vp, lbn, size, NOCRED, &bp); if (error) break; - vp->v_lastr = lbn; + ip->i_lastr = lbn; /* - * We should only get non-zero b_resid when an I/O error + * We should only get non-zero buffer resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. 
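 *
 * (Illustrative numbers: on an 8K block where buf_resid(bp) comes
 * back as 4K after a short read, size drops to 4K and an xfersize
 * of 6K gets trimmed to 4K, so only the valid prefix of the buffer
 * is copied out, never 2K of uninitialized data.)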
*/ - size -= bp->b_resid; + size -= buf_resid(bp); if (size < xfersize) { if (size == 0) break; xfersize = size; } + buf_data = (char *)buf_dataptr(bp); #if REV_ENDIAN_FS if (rev_endian && S_ISDIR(mode)) { - byte_swap_dir_block_in((char *)bp->b_data + blkoffset, xfersize); + byte_swap_dir_block_in(buf_data + blkoffset, xfersize); } #endif /* REV_ENDIAN_FS */ if (error = - uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio)) { + uiomove(buf_data + blkoffset, (int)xfersize, uio)) { #if REV_ENDIAN_FS if (rev_endian && S_ISDIR(mode)) { - byte_swap_dir_block_in((char *)bp->b_data + blkoffset, xfersize); + byte_swap_dir_block_in(buf_data + blkoffset, xfersize); } #endif /* REV_ENDIAN_FS */ break; @@ -177,17 +182,17 @@ ffs_read(ap) #if REV_ENDIAN_FS if (rev_endian && S_ISDIR(mode)) { - byte_swap_dir_out((char *)bp->b_data + blkoffset, xfersize); + byte_swap_dir_out(buf_data + blkoffset, xfersize); } #endif /* REV_ENDIAN_FS */ if (S_ISREG(mode) && (xfersize + blkoffset == fs->fs_bsize || uio->uio_offset == ip->i_size)) - bp->b_flags |= B_AGE; - brelse(bp); + buf_markaged(bp); + buf_brelse(bp); } } if (bp != NULL) - brelse(bp); + buf_brelse(bp); ip->i_flag |= IN_ACCESS; return (error); } @@ -196,23 +201,26 @@ ffs_read(ap) * Vnode op for writing. */ ffs_write(ap) - struct vop_write_args /* { + struct vnop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { - register struct vnode *vp; - register struct uio *uio; - register struct inode *ip; - register FS *fs; - struct buf *bp; - struct proc *p; + return(ffs_write_internal(ap->a_vp, ap->a_uio, ap->a_ioflag, vfs_context_ucred(ap->a_context))); +} + + +ffs_write_internal(vnode_t vp, struct uio *uio, int ioflag, ucred_t cred) +{ + buf_t bp; + proc_t p; + struct inode *ip; + FS *fs; ufs_daddr_t lbn; off_t osize; - int blkoffset, flags, ioflag, resid, rsd, size, xfersize; - int devBlockSize=0; + int blkoffset, flags, resid, rsd, size, xfersize; int save_error=0, save_size=0; int blkalloc = 0; int error = 0; @@ -223,9 +231,6 @@ ffs_write(ap) int rev_endian=0; #endif /* REV_ENDIAN_FS */ - ioflag = ap->a_ioflag; - uio = ap->a_uio; - vp = ap->a_vp; ip = VTOI(vp); #if REV_ENDIAN_FS rev_endian=(vp->v_mount->mnt_flag & MNT_REVEND); @@ -256,32 +261,19 @@ ffs_write(ap) fs = ip->I_FS; if (uio->uio_offset < 0 || - (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) + (u_int64_t)uio->uio_offset + uio_resid(uio) > fs->fs_maxfilesize) return (EFBIG); - if (uio->uio_resid == 0) + if (uio_resid(uio) == 0) return (0); - VOP_DEVBLOCKSIZE(ip->i_devvp, &devBlockSize); - - /* - * Maybe this should be above the vnode op call, but so long as - * file servers have no limits, I don't think it matters. 
- */
-	p = uio->uio_procp;
-	if (vp->v_type == VREG && p &&
-	    uio->uio_offset + uio->uio_resid >
-	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
-		psignal(p, SIGXFSZ);
-		return (EFBIG);
-	}
-
-	resid = uio->uio_resid;
+	// LP64todo - fix this
+	resid = uio_resid(uio);
 	osize = ip->i_size;
 	flags = 0;
 	if ((ioflag & IO_SYNC) && !((vp)->v_mount->mnt_flag & MNT_ASYNC))
 		flags = B_SYNC;
-	if (UBCISVALID(vp)) {
+	if (UBCINFOEXISTS(vp)) {
 		off_t filesize;
 		off_t endofwrite;
 		off_t local_offset;
@@ -292,7 +284,8 @@ ffs_write(ap)
 		int fblk;
 		int loopcount;
-		endofwrite = uio->uio_offset + uio->uio_resid;
+		// LP64todo - fix this
+		endofwrite = uio->uio_offset + uio_resid(uio);
 		if (endofwrite > ip->i_size) {
 			filesize = endofwrite;
@@ -303,7 +296,8 @@ ffs_write(ap)
 			head_offset = ip->i_size;
 		/* Go ahead and allocate the blocks that are going to be written */
-		rsd = uio->uio_resid;
+		// LP64todo - fix this
+		rsd = uio_resid(uio);
 		local_offset = uio->uio_offset;
 		local_flags = 0;
 		if ((ioflag & IO_SYNC) && !((vp)->v_mount->mnt_flag & MNT_ASYNC))
@@ -331,7 +325,7 @@ ffs_write(ap)
 			/* Allocate block without reading into a buf */
 			error = ffs_balloc(ip,
-				lbn, blkoffset + xfersize, ap->a_cred,
+				lbn, blkoffset + xfersize, cred,
 				&bp, local_flags, &blkalloc);
 			if (error)
 				break;
@@ -350,9 +344,9 @@ ffs_write(ap)
 		if(error) {
 			save_error = error;
 			save_size = rsd;
-			uio->uio_resid -= rsd;
-			if (file_extended)
-				filesize -= rsd;
+			uio_setresid(uio, (uio_resid(uio) - rsd));
+			if (file_extended)
+				filesize -= rsd;
 		}
 		flags = ioflag & IO_SYNC ? IO_SYNC : 0;
@@ -387,17 +381,16 @@ ffs_write(ap)
 		 * we'll zero fill from the current EOF to where the write begins
 		 */
-		error = cluster_write(vp, uio, osize, filesize, head_offset, local_offset, devBlockSize, flags);
+		error = cluster_write(vp, uio, osize, filesize, head_offset, local_offset, flags);
 		if (uio->uio_offset > osize) {
 			if (error && ((ioflag & IO_UNIT)==0))
-				(void)VOP_TRUNCATE(vp, uio->uio_offset,
-					ioflag & IO_SYNC, ap->a_cred, uio->uio_procp);
+				(void)ffs_truncate_internal(vp, uio->uio_offset, ioflag & IO_SYNC, cred);
 			ip->i_size = uio->uio_offset;
 			ubc_setsize(vp, (off_t)ip->i_size);
 		}
 		if(save_error) {
-			uio->uio_resid += save_size;
+			uio_setresid(uio, (uio_resid(uio) + save_size));
 			if(!error)
 				error = save_error;
 		}
@@ -407,48 +400,49 @@ ffs_write(ap)
 	if ((ioflag & IO_SYNC) && !((vp)->v_mount->mnt_flag & MNT_ASYNC))
 		flags = B_SYNC;
-	for (error = 0; uio->uio_resid > 0;) {
+	for (error = 0; uio_resid(uio) > 0;) {
+		char *buf_data;
+
 		lbn = lblkno(fs, uio->uio_offset);
 		blkoffset = blkoff(fs, uio->uio_offset);
 		xfersize = fs->fs_bsize - blkoffset;
-		if (uio->uio_resid < xfersize)
-			xfersize = uio->uio_resid;
+		if (uio_resid(uio) < xfersize)
+			// LP64todo - fix this
+			xfersize = uio_resid(uio);
 		if (fs->fs_bsize > xfersize)
 			flags |= B_CLRBUF;
 		else
 			flags &= ~B_CLRBUF;
-		error = ffs_balloc(ip,
-			lbn, blkoffset + xfersize, ap->a_cred, &bp, flags, 0);
+		error = ffs_balloc(ip, lbn, blkoffset + xfersize, cred, &bp, flags, 0);
 		if (error)
 			break;
 		if (uio->uio_offset + xfersize > ip->i_size) {
 			ip->i_size = uio->uio_offset + xfersize;
-
-			if (UBCISVALID(vp))
-				ubc_setsize(vp, (u_long)ip->i_size); /* XXX check errors */
+			ubc_setsize(vp, (u_long)ip->i_size);
 		}
 		size = BLKSIZE(fs, ip, lbn) - buf_resid(bp);
 		if (size < xfersize)
 			xfersize = size;
-		error =
-			uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
+		buf_data = (char *)buf_dataptr(bp);
+
+		error = uiomove(buf_data + blkoffset, (int)xfersize, uio);
 #if REV_ENDIAN_FS
 		if (rev_endian &&
S_ISDIR(ip->i_mode)) { - byte_swap_dir_out((char *)bp->b_data + blkoffset, xfersize); + byte_swap_dir_out(buf_data + blkoffset, xfersize); } #endif /* REV_ENDIAN_FS */ if (doingdirectory == 0 && (ioflag & IO_SYNC)) - (void)bwrite(bp); + (void)buf_bwrite(bp); else if (xfersize + blkoffset == fs->fs_bsize) { - bp->b_flags |= B_AGE; - bdwrite(bp); + buf_markaged(bp); + buf_bdwrite(bp); } else - bdwrite(bp); + buf_bdwrite(bp); if (error || xfersize == 0) break; ip->i_flag |= IN_CHANGE | IN_UPDATE; @@ -459,20 +453,23 @@ ffs_write(ap) * we clear the setuid and setgid bits as a precaution against * tampering. */ - if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0) + if (resid > uio_resid(uio) && cred && suser(cred, NULL)) ip->i_mode &= ~(ISUID | ISGID); - if (resid > uio->uio_resid) + if (resid > uio_resid(uio)) VN_KNOTE(vp, NOTE_WRITE | (file_extended ? NOTE_EXTEND : 0)); if (error) { if (ioflag & IO_UNIT) { - (void)VOP_TRUNCATE(vp, osize, - ioflag & IO_SYNC, ap->a_cred, uio->uio_procp); - uio->uio_offset -= resid - uio->uio_resid; - uio->uio_resid = resid; + (void)ffs_truncate_internal(vp, osize, ioflag & IO_SYNC, cred); + // LP64todo - fix this + uio->uio_offset -= resid - uio_resid(uio); + uio_setresid(uio, resid); } - } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) - error = VOP_UPDATE(vp, (struct timeval *)&time, - (struct timeval *)&time, 1); + } else if (resid > uio_resid(uio) && (ioflag & IO_SYNC)) { + struct timeval tv; + + microtime(&tv); + error = ffs_update(vp, &tv, &tv, 1); + } return (error); } @@ -482,14 +479,14 @@ ffs_write(ap) */ /* ARGSUSED */ ffs_pagein(ap) - struct vop_pagein_args /* { + struct vnop_pagein_args /* { struct vnode *a_vp, upl_t a_pl, vm_offset_t a_pl_offset, off_t a_f_offset, size_t a_size, - struct ucred *a_cred, int a_flags + vfs_context_t a_context; } */ *ap; { register struct vnode *vp = ap->a_vp; @@ -499,7 +496,6 @@ ffs_pagein(ap) vm_offset_t pl_offset = ap->a_pl_offset; int flags = ap->a_flags; register struct inode *ip; - int devBlockSize=0; int error; ip = VTOI(vp); @@ -518,10 +514,8 @@ ffs_pagein(ap) panic("%s: type %d", "ffs_pagein", vp->v_type); #endif - VOP_DEVBLOCKSIZE(ip->i_devvp, &devBlockSize); + error = cluster_pagein(vp, pl, pl_offset, f_offset, size, (off_t)ip->i_size, flags); - error = cluster_pagein(vp, pl, pl_offset, f_offset, size, - (off_t)ip->i_size, devBlockSize, flags); /* ip->i_flag |= IN_ACCESS; */ return (error); } @@ -532,14 +526,14 @@ ffs_pagein(ap) * make sure the buf is not in hash queue when you return */ ffs_pageout(ap) - struct vop_pageout_args /* { + struct vnop_pageout_args /* { struct vnode *a_vp, upl_t a_pl, vm_offset_t a_pl_offset, off_t a_f_offset, size_t a_size, - struct ucred *a_cred, int a_flags + vfs_context_t a_context; } */ *ap; { register struct vnode *vp = ap->a_vp; @@ -551,7 +545,6 @@ ffs_pageout(ap) register struct inode *ip; register FS *fs; int error ; - int devBlockSize=0; size_t xfer_size = 0; int local_flags=0; off_t local_offset; @@ -561,6 +554,7 @@ ffs_pageout(ap) int save_error =0, save_size=0; vm_offset_t lupl_offset; int nocommit = flags & UPL_NOCOMMIT; + int devBlockSize = 0; struct buf *bp; ip = VTOI(vp); @@ -596,7 +590,7 @@ ffs_pageout(ap) else xfer_size = size; - VOP_DEVBLOCKSIZE(ip->i_devvp, &devBlockSize); + devBlockSize = vfs_devblocksize(vnode_mount(vp)); if (xfer_size & (PAGE_SIZE - 1)) { /* if not a multiple of page size @@ -607,7 +601,7 @@ ffs_pageout(ap) } /* - * once the block allocation is moved to ufs_cmap + * once the block allocation is moved to 
ufs_blockmap * we can remove all the size and offset checks above * cluster_pageout does all of this now * we need to continue to do it here so as not to @@ -625,7 +619,7 @@ ffs_pageout(ap) xsize = resid; /* Allocate block without reading into a buf */ error = ffs_blkalloc(ip, - lbn, blkoffset + xsize, ap->a_cred, + lbn, blkoffset + xsize, vfs_context_ucred(ap->a_context), local_flags); if (error) break; @@ -640,7 +634,7 @@ ffs_pageout(ap) } - error = cluster_pageout(vp, pl, pl_offset, f_offset, round_page_32(xfer_size), ip->i_size, devBlockSize, flags); + error = cluster_pageout(vp, pl, pl_offset, f_offset, round_page_32(xfer_size), ip->i_size, flags); if(save_error) { lupl_offset = size - save_size; diff --git a/bsd/ufs/ufs/ufs_vfsops.c b/bsd/ufs/ufs/ufs_vfsops.c index 06e006cf3..2285303a6 100644 --- a/bsd/ufs/ufs/ufs_vfsops.c +++ b/bsd/ufs/ufs/ufs_vfsops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -62,10 +62,9 @@ #include <sys/param.h> #include <sys/mbuf.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> #include <sys/proc.h> -#include <sys/buf.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/malloc.h> #include <sys/quota.h> @@ -82,10 +81,10 @@ */ /* ARGSUSED */ int -ufs_start(mp, flags, p) +ufs_start(mp, flags, context) struct mount *mp; int flags; - struct proc *p; + vfs_context_t context; { return (0); @@ -95,14 +94,15 @@ ufs_start(mp, flags, p) * Return the root of a filesystem. */ int -ufs_root(mp, vpp) +ufs_root(mp, vpp, context) struct mount *mp; struct vnode **vpp; + vfs_context_t context; { struct vnode *nvp; int error; - if (error = VFS_VGET(mp, (void *)ROOTINO, &nvp)) + if (error = VFS_VGET(mp, (ino64_t)ROOTINO, &nvp, context)) return (error); *vpp = nvp; return (0); @@ -112,20 +112,21 @@ ufs_root(mp, vpp) * Do operations associated with quotas */ int -ufs_quotactl(mp, cmds, uid, arg, p) +ufs_quotactl(mp, cmds, uid, datap, context) struct mount *mp; int cmds; uid_t uid; - caddr_t arg; - struct proc *p; + caddr_t datap; + vfs_context_t context; { + struct proc *p = vfs_context_proc(context); int cmd, type, error; #if !QUOTA - return (EOPNOTSUPP); + return (ENOTSUP); #else if (uid == -1) - uid = p->p_cred->p_ruid; + uid = vfs_context_ucred(context)->cr_ruid; cmd = cmds >> SUBCMDSHIFT; switch (cmd) { @@ -133,40 +134,40 @@ ufs_quotactl(mp, cmds, uid, arg, p) case Q_QUOTASTAT: break; case Q_GETQUOTA: - if (uid == p->p_cred->p_ruid) + if (uid == vfs_context_ucred(context)->cr_ruid) break; /* fall through */ default: - if (error = suser(p->p_ucred, &p->p_acflag)) + if (error = vfs_context_suser(context)) return (error); } type = cmds & SUBCMDMASK; if ((u_int)type >= MAXQUOTAS) return (EINVAL); - if (vfs_busy(mp, LK_NOWAIT, 0, p)) + if (vfs_busy(mp, LK_NOWAIT)) return (0); switch (cmd) { case Q_QUOTAON: - error = quotaon(p, mp, type, arg, UIO_USERSPACE); + error = quotaon(context, mp, type, datap); break; case Q_QUOTAOFF: - error = quotaoff(p, mp, type); + error = quotaoff(mp, type); break; case Q_SETQUOTA: - error = setquota(mp, uid, type, arg); + error = setquota(mp, uid, type, datap); break; case Q_SETUSE: - error = setuse(mp, uid, type, arg); + error = setuse(mp, uid, type, datap); break; case Q_GETQUOTA: - error = getquota(mp, uid, type, arg); + error = getquota(mp, uid, type, datap); break; case Q_SYNC: @@ -174,14 +175,15 @@ ufs_quotactl(mp, cmds, uid, arg, p) break; case Q_QUOTASTAT: - error = 
quotastat(mp, type, arg); + error = quotastat(mp, type, datap); break; default: error = EINVAL; break; } - vfs_unbusy(mp, p); + vfs_unbusy(mp); + return (error); #endif } @@ -205,49 +207,3 @@ ufs_init(vfsp) return (0); } -/* - * This is the generic part of fhtovp called after the underlying - * filesystem has validated the file handle. - * - * Verify that a host should have access to a filesystem, and if so - * return a vnode for the presented file handle. - */ -int -ufs_check_export(mp, ufhp, nam, vpp, exflagsp, credanonp) - register struct mount *mp; - struct ufid *ufhp; - struct mbuf *nam; - struct vnode **vpp; - int *exflagsp; - struct ucred **credanonp; -{ - register struct inode *ip; - register struct netcred *np; - register struct ufsmount *ump = VFSTOUFS(mp); - struct vnode *nvp; - int error; - - /* - * Get the export permission structure for this <mp, client> tuple. - */ - np = vfs_export_lookup(mp, &ump->um_export, nam); - if (nam && (np == NULL)) - return (EACCES); - - if (error = VFS_VGET(mp, (void *)ufhp->ufid_ino, &nvp)) { - *vpp = NULLVP; - return (error); - } - ip = VTOI(nvp); - if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen) { - vput(nvp); - *vpp = NULLVP; - return (ESTALE); - } - *vpp = nvp; - if (np) { - *exflagsp = np->netc_exflags; - *credanonp = &np->netc_anon; - } - return (0); -} diff --git a/bsd/ufs/ufs/ufs_vnops.c b/bsd/ufs/ufs/ufs_vnops.c index f45e51d62..9e4fe6eda 100644 --- a/bsd/ufs/ufs/ufs_vnops.c +++ b/bsd/ufs/ufs/ufs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -66,25 +66,26 @@ #include <sys/namei.h> #include <sys/resourcevar.h> #include <sys/kernel.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/stat.h> #include <sys/buf.h> #include <sys/proc.h> +#include <sys/kauth.h> #include <sys/conf.h> -#include <sys/mount.h> -#include <sys/vnode.h> +#include <sys/mount_internal.h> +#include <sys/vnode_internal.h> #include <sys/malloc.h> #include <sys/dirent.h> #include <sys/fcntl.h> #include <sys/ubc.h> #include <sys/quota.h> +#include <sys/uio_internal.h> #include <kern/thread.h> #include <sys/vm.h> #include <miscfs/specfs/specdev.h> -#include <ufs/ufs/lockf.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/dir.h> @@ -96,49 +97,40 @@ #include <architecture/byte_order.h> #endif /* REV_ENDIAN_FS */ -static int ufs_chmod __P((struct vnode *, int, struct ucred *, struct proc *)); -static int ufs_chown - __P((struct vnode *, uid_t, gid_t, struct ucred *, struct proc *)); -static int filt_ufsread __P((struct knote *kn, long hint)); -static int filt_ufswrite __P((struct knote *kn, long hint)); -static int filt_ufsvnode __P((struct knote *kn, long hint)); -static void filt_ufsdetach __P((struct knote *kn)); -static int ufs_kqfilter __P((struct vop_kqfilter_args *ap)); - -union _qcvt { - int64_t qcvt; - int32_t val[2]; -}; -#define SETHIGH(q, h) { \ - union _qcvt tmp; \ - tmp.qcvt = (q); \ - tmp.val[_QUAD_HIGHWORD] = (h); \ - (q) = tmp.qcvt; \ -} -#define SETLOW(q, l) { \ - union _qcvt tmp; \ - tmp.qcvt = (q); \ - tmp.val[_QUAD_LOWWORD] = (l); \ - (q) = tmp.qcvt; \ -} + +static int ufs_chmod(struct vnode *, int, kauth_cred_t, struct proc *); +static int ufs_chown(struct vnode *, uid_t, gid_t, kauth_cred_t, + struct proc *); +static int filt_ufsread(struct knote *kn, long hint); +static int filt_ufswrite(struct knote *kn, long hint); +static int filt_ufsvnode(struct 
knote *kn, long hint); +static void filt_ufsdetach(struct knote *kn); + +#if FIFO +extern void fifo_printinfo(struct vnode *vp); +#endif /* FIFO */ +extern int ufs_direnter2(struct vnode *dvp, struct direct *dirp, + vfs_context_t ctx); + +static int ufs_readdirext(vnode_t vp, uio_t uio, int *eofflag, int *numdirent, + vfs_context_t context); /* * Create a regular file */ int ufs_create(ap) - struct vop_create_args /* { + struct vnop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; + struct vnode_vattr *a_vap; + vfs_context_t a_context; } */ *ap; { int error; - if (error = - ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), - ap->a_dvp, ap->a_vpp, ap->a_cnp)) + if ( (error = ufs_makeinode(ap->a_vap, ap->a_dvp, ap->a_vpp, ap->a_cnp)) ) return (error); VN_KNOTE(ap->a_dvp, NOTE_WRITE); return (0); @@ -147,24 +139,37 @@ ufs_create(ap) /* * Mknod vnode call */ -/* ARGSUSED */ int ufs_mknod(ap) - struct vop_mknod_args /* { + struct vnop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; + struct vnode_attr *a_vap; + vfs_context_t a_context; } */ *ap; { - struct vattr *vap = ap->a_vap; + struct vnode_attr *vap = ap->a_vap; struct vnode **vpp = ap->a_vpp; + struct vnode *dvp = ap->a_dvp; + struct vnode *tvp; struct inode *ip; + struct componentname *cnp = ap->a_cnp; int error; - if (error = - ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), - ap->a_dvp, vpp, ap->a_cnp)) + /* use relookup to force correct directory hints */ + cnp->cn_flags &= ~MODMASK; + cnp->cn_flags |= (WANTPARENT | NOCACHE); + cnp->cn_nameiop = CREATE; + + (void) relookup(dvp, &tvp, cnp); + + /* get rid of reference relookup returned */ + if (tvp) + vnode_put(tvp); + + if ( (error = + ufs_makeinode(ap->a_vap, ap->a_dvp, vpp, ap->a_cnp)) ) return (error); VN_KNOTE(ap->a_dvp, NOTE_WRITE); ip = VTOI(*vpp); @@ -176,15 +181,6 @@ ufs_mknod(ap) */ ip->i_rdev = vap->va_rdev; } - /* - * Remove inode so that it will be reloaded by VFS_VGET and - * checked to see if it is an alias of an existing entry in - * the inode cache. - */ - vput(*vpp); - (*vpp)->v_type = VNON; - vgone(*vpp); - *vpp = 0; return (0); } @@ -193,14 +189,12 @@ ufs_mknod(ap) * * Nothing to do. */ -/* ARGSUSED */ int ufs_open(ap) - struct vop_open_args /* { + struct vnop_open_args /* { struct vnode *a_vp; int a_mode; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { @@ -218,160 +212,77 @@ ufs_open(ap) * * Update the times on the inode. 
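 *
 * (Note on the rewrite below, descriptive only: the old
 * UBCISVALID/v_usecount and ubc_isinuse checks collapse into a
 * single vnode_isinuse(vp, 1), which asks whether anyone beyond
 * this closing reference still has the vnode in use, and
 * cluster_push() now takes IO_CLOSE, presumably so the cluster
 * layer can tell that the flush is happening on the close path.)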
*/ -/* ARGSUSED */ int ufs_close(ap) - struct vop_close_args /* { + struct vnop_close_args /* { struct vnode *a_vp; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct inode *ip = VTOI(vp); + struct timeval tv; - simple_lock(&vp->v_interlock); - if ((!UBCISVALID(vp) && vp->v_usecount > 1) - || (UBCISVALID(vp) && ubc_isinuse(vp, 1))) - ITIMES(ip, &time, &time); - simple_unlock(&vp->v_interlock); - - if (!VOP_ISLOCKED(vp)) { - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p); - - cluster_push(vp); - - VOP_UNLOCK(vp, 0, ap->a_p); - } - return (0); -} - -int -ufs_access(ap) - struct vop_access_args /* { - struct vnode *a_vp; - int a_mode; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - struct inode *ip = VTOI(vp); - struct ucred *cred = ap->a_cred; - mode_t mask, mode = ap->a_mode; - register gid_t *gp; - int i, error; - - /* - * Disallow write attempts on read-only file systems; - * unless the file is a socket, fifo, or a block or - * character device resident on the file system. - */ - if (mode & VWRITE) { - switch (vp->v_type) { - case VDIR: - case VLNK: - case VREG: - if (vp->v_mount->mnt_flag & MNT_RDONLY) - return (EROFS); -#if QUOTA - if (error = getinoquota(ip)) - return (error); -#endif - break; - } - } - - /* If immutable bit set, nobody gets to write it. */ - if ((mode & VWRITE) && (ip->i_flags & IMMUTABLE)) - return (EPERM); - - /* Otherwise, user id 0 always gets access. */ - if (cred->cr_uid == 0) - return (0); - - mask = 0; - - /* Otherwise, check the owner. */ - if (cred->cr_uid == ip->i_uid) { - if (mode & VEXEC) - mask |= S_IXUSR; - if (mode & VREAD) - mask |= S_IRUSR; - if (mode & VWRITE) - mask |= S_IWUSR; - return ((ip->i_mode & mask) == mask ? 0 : EACCES); + if (vnode_isinuse(vp, 1)) { + microtime(&tv); + ITIMES(ip, &tv, &tv); } - /* Otherwise, check the groups. */ - for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) - if (ip->i_gid == *gp) { - if (mode & VEXEC) - mask |= S_IXGRP; - if (mode & VREAD) - mask |= S_IRGRP; - if (mode & VWRITE) - mask |= S_IWGRP; - return ((ip->i_mode & mask) == mask ? 0 : EACCES); - } + cluster_push(vp, IO_CLOSE); - /* Otherwise, check everyone else. */ - if (mode & VEXEC) - mask |= S_IXOTH; - if (mode & VREAD) - mask |= S_IROTH; - if (mode & VWRITE) - mask |= S_IWOTH; - return ((ip->i_mode & mask) == mask ? 
0 : EACCES); + return (0); } -/* ARGSUSED */ int ufs_getattr(ap) - struct vop_getattr_args /* { + struct vnop_getattr_args /* { struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct proc *a_p; + struct vnode_attr *a_vap; + vfs_context_t a_context; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct inode *ip = VTOI(vp); - register struct vattr *vap = ap->a_vap; + register struct vnode_attr *vap = ap->a_vap; int devBlockSize=0; + struct timeval tv; - ITIMES(ip, &time, &time); + microtime(&tv); + + ITIMES(ip, &tv, &tv); /* * Copy from inode table */ - vap->va_fsid = ip->i_dev; - vap->va_fileid = ip->i_number; - vap->va_mode = ip->i_mode & ~IFMT; - vap->va_nlink = ip->i_nlink; - vap->va_uid = ip->i_uid; - vap->va_gid = ip->i_gid; - vap->va_rdev = (dev_t)ip->i_rdev; - vap->va_size = ip->i_din.di_size; - vap->va_atime.tv_sec = ip->i_atime; - vap->va_atime.tv_nsec = ip->i_atimensec; - vap->va_mtime.tv_sec = ip->i_mtime; - vap->va_mtime.tv_nsec = ip->i_mtimensec; - vap->va_ctime.tv_sec = ip->i_ctime; - vap->va_ctime.tv_nsec = ip->i_ctimensec; - vap->va_flags = ip->i_flags; - vap->va_gen = ip->i_gen; - /* this doesn't belong here */ + VATTR_RETURN(vap, va_fsid, ip->i_dev); + VATTR_RETURN(vap, va_fileid, ip->i_number); + VATTR_RETURN(vap, va_mode, ip->i_mode & ~IFMT); + VATTR_RETURN(vap, va_nlink, ip->i_nlink); + VATTR_RETURN(vap, va_uid, ip->i_uid); + VATTR_RETURN(vap, va_gid, ip->i_gid); + VATTR_RETURN(vap, va_rdev, (dev_t)ip->i_rdev); + VATTR_RETURN(vap, va_data_size, ip->i_din.di_size); + vap->va_access_time.tv_sec = ip->i_atime; + vap->va_access_time.tv_nsec = ip->i_atimensec; + VATTR_SET_SUPPORTED(vap, va_access_time); + vap->va_modify_time.tv_sec = ip->i_mtime; + vap->va_modify_time.tv_nsec = ip->i_mtimensec; + VATTR_SET_SUPPORTED(vap, va_modify_time); + vap->va_change_time.tv_sec = ip->i_ctime; + vap->va_change_time.tv_nsec = ip->i_ctimensec; + VATTR_SET_SUPPORTED(vap, va_change_time); + VATTR_RETURN(vap, va_flags, ip->i_flags); + VATTR_RETURN(vap, va_gen, ip->i_gen); if (vp->v_type == VBLK) - vap->va_blocksize = BLKDEV_IOSIZE; + VATTR_RETURN(vap, va_iosize, BLKDEV_IOSIZE); else if (vp->v_type == VCHR) - vap->va_blocksize = MAXPHYSIO; + VATTR_RETURN(vap, va_iosize, MAXPHYSIO); else - vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; - VOP_DEVBLOCKSIZE(ip->i_devvp, &devBlockSize); - vap->va_bytes = dbtob((u_quad_t)ip->i_blocks, devBlockSize); - vap->va_type = vp->v_type; - vap->va_filerev = ip->i_modrev; + VATTR_RETURN(vap, va_iosize, vp->v_mount->mnt_vfsstat.f_iosize); + devBlockSize = vfs_devblocksize(vnode_mount(vp)); + VATTR_RETURN(vap, va_data_alloc, dbtob((u_quad_t)ip->i_blocks, devBlockSize)); + VATTR_RETURN(vap, va_type, vp->v_type); + VATTR_RETURN(vap, va_filerev, ip->i_modrev); return (0); } @@ -380,109 +291,72 @@ ufs_getattr(ap) */ int ufs_setattr(ap) - struct vop_setattr_args /* { + struct vnop_setattr_args /* { struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; + struct vnode_attr *a_vap; struct proc *a_p; + vfs_context_t a_context; } */ *ap; { - struct vattr *vap = ap->a_vap; + struct vnode_attr *vap = ap->a_vap; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); - struct ucred *cred = ap->a_cred; - struct proc *p = ap->a_p; + kauth_cred_t cred = vfs_context_ucred(ap->a_context); + struct proc *p = vfs_context_proc(ap->a_context); struct timeval atimeval, mtimeval; int error; + uid_t nuid; + gid_t ngid; /* - * Check for unsettable attributes. + * Go through the fields and update iff set. 
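+ *
+ * (Aside: the general shape of the kauth-era setattr protocol,
+ * sketched with a made-up attribute name -- the real fields follow
+ * below:
+ *
+ *	if (VATTR_IS_ACTIVE(vap, va_xxx)) {
+ *		// apply vap->va_xxx to the inode
+ *	}
+ *	VATTR_SET_SUPPORTED(vap, va_xxx);
+ *
+ * the "supported" bits report back to VFS which attributes the
+ * filesystem actually honored.)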
*/ - if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || - (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || - (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || - ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { - return (EINVAL); - } - if (vap->va_flags != VNOVAL) { - if (vp->v_mount->mnt_flag & MNT_RDONLY) - return (EROFS); - if (cred->cr_uid != ip->i_uid && - (error = suser(cred, &p->p_acflag))) - return (error); - if (cred->cr_uid == 0) { - if ((ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) && - securelevel > 0) - return (EPERM); - ip->i_flags = vap->va_flags; - } else { - if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND) || - (vap->va_flags & UF_SETTABLE) != vap->va_flags) - return (EPERM); - ip->i_flags &= SF_SETTABLE; - ip->i_flags |= (vap->va_flags & UF_SETTABLE); - } + if (VATTR_IS_ACTIVE(vap, va_flags)) { + ip->i_flags = vap->va_flags; ip->i_flag |= IN_CHANGE; - if (vap->va_flags & (IMMUTABLE | APPEND)) - return (0); } - if (ip->i_flags & (IMMUTABLE | APPEND)) - return (EPERM); - /* - * Go through the fields and update iff not VNOVAL. - */ - if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { - if (vp->v_mount->mnt_flag & MNT_RDONLY) - return (EROFS); - if (error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, p)) + VATTR_SET_SUPPORTED(vap, va_flags); + + nuid = VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : (uid_t)VNOVAL; + ngid = VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : (gid_t)VNOVAL; + if (nuid != (uid_t)VNOVAL || ngid != (gid_t)VNOVAL) { + if ( (error = ufs_chown(vp, nuid, ngid, cred, p)) ) return (error); } - if (vap->va_size != VNOVAL) { - /* - * Disallow write attempts on read-only file systems; - * unless the file is a socket, fifo, or a block or - * character device resident on the file system. 
- */ - switch (vp->v_type) { - case VDIR: - return (EISDIR); - case VLNK: - case VREG: - if (vp->v_mount->mnt_flag & MNT_RDONLY) - return (EROFS); - break; - } - if (error = VOP_TRUNCATE(vp, vap->va_size, 0, cred, p)) + VATTR_SET_SUPPORTED(vap, va_uid); + VATTR_SET_SUPPORTED(vap, va_gid); + + if (VATTR_IS_ACTIVE(vap, va_data_size)) { + if ( (error = ffs_truncate_internal(vp, vap->va_data_size, vap->va_vaflags & 0xffff, cred)) ) return (error); } + VATTR_SET_SUPPORTED(vap, va_data_size); + ip = VTOI(vp); - if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { - if (vp->v_mount->mnt_flag & MNT_RDONLY) - return (EROFS); - if (cred->cr_uid != ip->i_uid && - (error = suser(cred, &p->p_acflag)) && - ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || - (error = VOP_ACCESS(vp, VWRITE, cred, p)))) - return (error); - if (vap->va_atime.tv_sec != VNOVAL) + if (VATTR_IS_ACTIVE(vap, va_access_time) || VATTR_IS_ACTIVE(vap, va_modify_time)) { + if (VATTR_IS_ACTIVE(vap, va_access_time)) ip->i_flag |= IN_ACCESS; - if (vap->va_mtime.tv_sec != VNOVAL) + if (VATTR_IS_ACTIVE(vap, va_modify_time)) ip->i_flag |= IN_CHANGE | IN_UPDATE; - atimeval.tv_sec = vap->va_atime.tv_sec; - atimeval.tv_usec = vap->va_atime.tv_nsec / 1000; - mtimeval.tv_sec = vap->va_mtime.tv_sec; - mtimeval.tv_usec = vap->va_mtime.tv_nsec / 1000; - if (error = VOP_UPDATE(vp, &atimeval, &mtimeval, 1)) + atimeval.tv_sec = vap->va_access_time.tv_sec; + atimeval.tv_usec = vap->va_access_time.tv_nsec / 1000; + mtimeval.tv_sec = vap->va_modify_time.tv_sec; + mtimeval.tv_usec = vap->va_modify_time.tv_nsec / 1000; + if ( (error = ffs_update(vp, &atimeval, &mtimeval, 1)) ) return (error); } - error = 0; - if (vap->va_mode != (mode_t)VNOVAL) { - if (vp->v_mount->mnt_flag & MNT_RDONLY) - return (EROFS); - error = ufs_chmod(vp, (int)vap->va_mode, cred, p); + VATTR_SET_SUPPORTED(vap, va_access_time); + VATTR_SET_SUPPORTED(vap, va_modify_time); + + if (VATTR_IS_ACTIVE(vap, va_mode)) { + if ((error = ufs_chmod(vp, (int)vap->va_mode, cred, p))) + return (error); } + VATTR_SET_SUPPORTED(vap, va_mode); + VN_KNOTE(vp, NOTE_ATTRIB); - return (error); + + return (0); } /* @@ -490,24 +364,10 @@ ufs_setattr(ap) * Inode must be locked before calling. */ static int -ufs_chmod(vp, mode, cred, p) - register struct vnode *vp; - register int mode; - register struct ucred *cred; - struct proc *p; +ufs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct proc *p) { register struct inode *ip = VTOI(vp); - int error; - if (cred->cr_uid != ip->i_uid && - (error = suser(cred, &p->p_acflag))) - return (error); - if (cred->cr_uid) { - if (vp->v_type != VDIR && (mode & S_ISTXT)) - return (EFTYPE); - if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) - return (EPERM); - } ip->i_mode &= ~ALLPERMS; ip->i_mode |= (mode & ALLPERMS); ip->i_flag |= IN_CHANGE; @@ -519,17 +379,14 @@ ufs_chmod(vp, mode, cred, p) * inode must be locked prior to call. 
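 *
 * (Aside on the quota hand-off below, descriptive only: the usage
 * charge is first backed out of the old uid/gid via chkdq/chkiq
 * with a negative delta and the CHOWN flag, i_uid/i_gid are
 * switched, and the charge is re-applied against the new ids; if
 * that fails, the old ids are restored and the original charge is
 * re-applied with FORCE|CHOWN so no usage goes unaccounted.)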
*/ static int -ufs_chown(vp, uid, gid, cred, p) - register struct vnode *vp; - uid_t uid; - gid_t gid; - struct ucred *cred; - struct proc *p; +ufs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred, + struct proc *p) { register struct inode *ip = VTOI(vp); uid_t ouid; gid_t ogid; int error = 0; + int is_member; #if QUOTA register int i; int64_t change; /* in bytes */ @@ -540,34 +397,26 @@ ufs_chown(vp, uid, gid, cred, p) uid = ip->i_uid; if (gid == (gid_t)VNOVAL) gid = ip->i_gid; - /* - * If we don't own the file, are trying to change the owner - * of the file, or are not a member of the target group, - * the caller must be superuser or the call fails. - */ - if ((cred->cr_uid != ip->i_uid || uid != ip->i_uid || - (gid != ip->i_gid && !groupmember((gid_t)gid, cred))) && - (error = suser(cred, &p->p_acflag))) - return (error); ogid = ip->i_gid; ouid = ip->i_uid; #if QUOTA - if (error = getinoquota(ip)) + if ( (error = getinoquota(ip)) ) return (error); if (ouid == uid) { - dqrele(vp, ip->i_dquot[USRQUOTA]); + dqrele(ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { - dqrele(vp, ip->i_dquot[GRPQUOTA]); + dqrele(ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } - VOP_DEVBLOCKSIZE(ip->i_devvp, &devBlockSize); + devBlockSize = vfs_devblocksize(vnode_mount(vp)); + change = dbtob((int64_t)ip->i_blocks, devBlockSize); (void) chkdq(ip, -change, cred, CHOWN); (void) chkiq(ip, -1, cred, CHOWN); for (i = 0; i < MAXQUOTAS; i++) { - dqrele(vp, ip->i_dquot[i]); + dqrele(ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } #endif @@ -576,11 +425,11 @@ ufs_chown(vp, uid, gid, cred, p) #if QUOTA if ((error = getinoquota(ip)) == 0) { if (ouid == uid) { - dqrele(vp, ip->i_dquot[USRQUOTA]); + dqrele(ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { - dqrele(vp, ip->i_dquot[GRPQUOTA]); + dqrele(ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } if ((error = chkdq(ip, change, cred, CHOWN)) == 0) { @@ -590,7 +439,7 @@ ufs_chown(vp, uid, gid, cred, p) (void) chkdq(ip, -change, cred, CHOWN|FORCE); } for (i = 0; i < MAXQUOTAS; i++) { - dqrele(vp, ip->i_dquot[i]); + dqrele(ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } } @@ -598,11 +447,11 @@ ufs_chown(vp, uid, gid, cred, p) ip->i_uid = ouid; if (getinoquota(ip) == 0) { if (ouid == uid) { - dqrele(vp, ip->i_dquot[USRQUOTA]); + dqrele(ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { - dqrele(vp, ip->i_dquot[GRPQUOTA]); + dqrele(ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } (void) chkdq(ip, change, cred, FORCE|CHOWN); @@ -616,23 +465,17 @@ good: #endif /* QUOTA */ if (ouid != uid || ogid != gid) ip->i_flag |= IN_CHANGE; - if (ouid != uid && cred->cr_uid != 0) - ip->i_mode &= ~ISUID; - if (ogid != gid && cred->cr_uid != 0) - ip->i_mode &= ~ISGID; return (0); } -/* ARGSUSED */ int ufs_ioctl(ap) - struct vop_ioctl_args /* { + struct vnop_ioctl_args /* { struct vnode *a_vp; int a_command; caddr_t a_data; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { @@ -648,21 +491,17 @@ ufs_ioctl(ap) vp = ap->a_vp; - VOP_LEASE(vp, ap->a_p, ap->a_cred, LEASE_READ); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p); - ra = (struct radvisory *)(ap->a_data); ip = VTOI(vp); fs = ip->i_fs; if ((u_int64_t)ra->ra_offset >= ip->i_size) { - VOP_UNLOCK(vp, 0, ap->a_p); return (EFBIG); } - VOP_DEVBLOCKSIZE(ip->i_devvp, &devBlockSize); + devBlockSize = vfs_devblocksize(vnode_mount(vp)); + + error = advisory_read(vp, ip->i_size, 
ra->ra_offset, ra->ra_count); - error = advisory_read(vp, ip->i_size, ra->ra_offset, ra->ra_count, devBlockSize); - VOP_UNLOCK(vp, 0, ap->a_p); return (error); } default: @@ -670,19 +509,9 @@ ufs_ioctl(ap) } } -/* ARGSUSED */ int -ufs_select(ap) - struct vop_select_args /* { - struct vnode *a_vp; - int a_which; - int a_fflags; - struct ucred *a_cred; - void *a_wql; - struct proc *a_p; - } */ *ap; +ufs_select(__unused struct vnop_select_args *ap) { - /* * We should really check to see if I/O is possible. */ @@ -694,91 +523,65 @@ ufs_select(ap) * * NB Currently unsupported. */ -/* ARGSUSED */ int -ufs_mmap(ap) - struct vop_mmap_args /* { - struct vnode *a_vp; - int a_fflags; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; +ufs_mmap(__unused struct vnop_mmap_args *ap) { - return (EINVAL); } -/* - * Seek on a file - * - * Nothing to do, so just return. - */ -/* ARGSUSED */ int -ufs_seek(ap) - struct vop_seek_args /* { +ufs_remove(ap) + struct vnop_remove_args /* { + struct vnode *a_dvp; struct vnode *a_vp; - off_t a_oldoff; - off_t a_newoff; - struct ucred *a_cred; + struct componentname *a_cnp; + int *a_flags; + vfs_context_t a_context; } */ *ap; { - - return (0); + return(ufs_remove_internal(ap->a_dvp, ap->a_vp, ap->a_cnp, ap->a_flags)); } + int -ufs_remove(ap) - struct vop_remove_args /* { - struct vnode *a_dvp; - struct vnode *a_vp; - struct componentname *a_cnp; - } */ *ap; +ufs_remove_internal(vnode_t dvp, vnode_t vp, struct componentname *cnp, int flags) { struct inode *ip; - struct vnode *vp = ap->a_vp; - struct vnode *dvp = ap->a_dvp; + struct vnode *tvp; int error; - ip = VTOI(vp); - if ((ip->i_flags & (IMMUTABLE | APPEND)) || - (VTOI(dvp)->i_flags & APPEND)) { - error = EPERM; - goto out; - } - - if (ap->a_cnp->cn_flags & NODELETEBUSY) { + if (flags & VNODE_REMOVE_NODELETEBUSY) { /* Caller requested Carbon delete semantics */ - if ((!UBCISVALID(vp) && vp->v_usecount > 1) - || (UBCISVALID(vp) && ubc_isinuse(vp, 1))) { + if (vnode_isinuse(vp, 0)) { error = EBUSY; goto out; } } + cnp->cn_flags &= ~MODMASK; + cnp->cn_flags |= (WANTPARENT | NOCACHE); + cnp->cn_nameiop = DELETE; + + (void) relookup(dvp, &tvp, cnp); + + if (tvp == NULL) + return (ENOENT); + if (tvp != vp) + panic("ufs_remove_internal: relookup returned a different vp"); + /* + * get rid of reference relookup returned + */ + vnode_put(tvp); + + + ip = VTOI(vp); - if ((error = ufs_dirremove(dvp, ap->a_cnp)) == 0) { + if ((error = ufs_dirremove(dvp, cnp)) == 0) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; VN_KNOTE(vp, NOTE_DELETE); VN_KNOTE(dvp, NOTE_WRITE); } - - if (dvp != vp) - VOP_UNLOCK(vp, 0, ap->a_cnp->cn_proc); - - (void) ubc_uncache(vp); - - vrele(vp); - vput(dvp); - - return (error); - out: - if (dvp == vp) - vrele(vp); - else - vput(vp); - vput(dvp); return (error); } @@ -787,98 +590,72 @@ out: */ int ufs_link(ap) - struct vop_link_args /* { + struct vnop_link_args /* { struct vnode *a_vp; struct vnode *a_tdvp; struct componentname *a_cnp; + vfs_context_t a_context; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; - struct proc *p = cnp->cn_proc; + vfs_context_t ctx = cnp->cn_context; + struct proc *p = vfs_context_proc(ctx); struct inode *ip; struct timeval tv; int error; -#if DIAGNOSTIC - if ((cnp->cn_flags & HASBUF) == 0) - panic("ufs_link: no name"); -#endif - if (tdvp->v_mount != vp->v_mount) { - VOP_ABORTOP(tdvp, cnp); - error = EXDEV; - goto out2; - } - if (tdvp != vp && (error = vn_lock(vp, LK_EXCLUSIVE, p))) { - VOP_ABORTOP(tdvp, cnp); - 
goto out2; - } ip = VTOI(vp); + if ((nlink_t)ip->i_nlink >= LINK_MAX) { - VOP_ABORTOP(tdvp, cnp); error = EMLINK; goto out1; } - if (ip->i_flags & (IMMUTABLE | APPEND)) { - VOP_ABORTOP(tdvp, cnp); - error = EPERM; - goto out1; - } ip->i_nlink++; ip->i_flag |= IN_CHANGE; - tv = time; - error = VOP_UPDATE(vp, &tv, &tv, 1); + microtime(&tv); + error = ffs_update(vp, &tv, &tv, 1); if (!error) error = ufs_direnter(ip, tdvp, cnp); if (error) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; } - { - char *tmp = cnp->cn_pnbuf; - cnp->cn_pnbuf = NULL; - cnp->cn_flags &= ~HASBUF; - FREE_ZONE(tmp, cnp->cn_pnlen, M_NAMEI); - } VN_KNOTE(vp, NOTE_LINK); VN_KNOTE(tdvp, NOTE_WRITE); out1: - if (tdvp != vp) - VOP_UNLOCK(vp, 0, p); -out2: - vput(tdvp); return (error); } /* * whiteout vnode call */ + int ufs_whiteout(ap) - struct vop_whiteout_args /* { + struct vnop_whiteout_args /* { struct vnode *a_dvp; struct componentname *a_cnp; int a_flags; + vfs_context_t a_context; } */ *ap; { struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct direct newdir; - int error; + int error = 0; switch (ap->a_flags) { case LOOKUP: /* 4.4 format directories support whiteout operations */ if (dvp->v_mount->mnt_maxsymlinklen > 0) return (0); - return (EOPNOTSUPP); + return (ENOTSUP); case CREATE: /* create a new directory whiteout */ #if DIAGNOSTIC - if ((cnp->cn_flags & HASBUF) == 0) - panic("ufs_whiteout: missing name"); if (dvp->v_mount->mnt_maxsymlinklen <= 0) panic("ufs_whiteout: old format filesystem"); #endif @@ -887,7 +664,7 @@ ufs_whiteout(ap) newdir.d_namlen = cnp->cn_namelen; bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1); newdir.d_type = DT_WHT; - error = ufs_direnter2(dvp, &newdir, cnp->cn_cred, cnp->cn_proc); + error = ufs_direnter2(dvp, &newdir, cnp->cn_context); break; case DELETE: @@ -901,12 +678,6 @@ ufs_whiteout(ap) error = ufs_dirremove(dvp, cnp); break; } - if (cnp->cn_flags & HASBUF) { - char *tmp = cnp->cn_pnbuf; - cnp->cn_pnbuf = NULL; - cnp->cn_flags &= ~HASBUF; - FREE_ZONE(tmp, cnp->cn_pnlen, M_NAMEI); - } return (error); } @@ -937,13 +708,14 @@ ufs_whiteout(ap) */ int ufs_rename(ap) - struct vop_rename_args /* { + struct vnop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; + vfs_context_t a_context; } */ *ap; { struct vnode *tvp = ap->a_tvp; @@ -952,86 +724,78 @@ ufs_rename(ap) struct vnode *fdvp = ap->a_fdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; - struct proc *p = fcnp->cn_proc; + vfs_context_t ctx = fcnp->cn_context; + struct proc *p = vfs_context_proc(ctx); struct inode *ip, *xp, *dp; struct dirtemplate dirbuf; struct timeval tv; - int doingdirectory = 0, oldparent = 0, newparent = 0; + ino_t doingdirectory = 0, oldparent = 0, newparent = 0; int error = 0, ioflag; u_char namlen; + struct vnode *rl_vp = NULL; -#if DIAGNOSTIC - if ((tcnp->cn_flags & HASBUF) == 0 || - (fcnp->cn_flags & HASBUF) == 0) - panic("ufs_rename: no name"); -#endif - /* - * Check for cross-device rename. - */ - if ((fvp->v_mount != tdvp->v_mount) || - (tvp && (fvp->v_mount != tvp->v_mount))) { - error = EXDEV; -abortit: - VOP_ABORTOP(tdvp, tcnp); /* XXX, why not in NFS? */ - if (tdvp == tvp) - vrele(tdvp); - else - vput(tdvp); - if (tvp) - vput(tvp); - VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */ - vrele(fdvp); - vrele(fvp); - return (error); - } /* - * Check if just deleting a link name. 
+ * Check if just deleting a link name or if we've lost a race. + * If another process completes the same rename after we've looked + * up the source and have blocked looking up the target, then the + * source and target inodes may be identical now although the + * names were never linked. */ - if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) || - (VTOI(tdvp)->i_flags & APPEND))) { - error = EPERM; - goto abortit; - } if (fvp == tvp) { if (fvp->v_type == VDIR) { - error = EINVAL; + /* + * Linked directories are impossible, so we must + * have lost the race. Pretend that the rename + * completed before the lookup. + */ +#ifdef UFS_RENAME_DEBUG + printf("ufs_rename: fvp == tvp for directories\n"); +#endif + error = ENOENT; goto abortit; } - /* Release destination completely. */ - VOP_ABORTOP(tdvp, tcnp); - vput(tdvp); - vput(tvp); - - /* Delete source. */ - vrele(fdvp); - vrele(fvp); - fcnp->cn_flags &= ~MODMASK; - fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; - if ((fcnp->cn_flags & SAVESTART) == 0) - panic("ufs_rename: lost from startdir"); - fcnp->cn_nameiop = DELETE; - (void) relookup(fdvp, &fvp, fcnp); - return (VOP_REMOVE(fdvp, fvp, fcnp)); + /* + * don't need to check in here for permissions, must already have been granted + * ufs_remove_internal now does the relookup + */ + error = ufs_remove_internal(fdvp, fvp, fcnp, 0); + + return (error); } - if (error = vn_lock(fvp, LK_EXCLUSIVE, p)) + /* + * because the vnode_authorization code may have looked up in this directory + * between the original lookup and the actual call to VNOP_RENAME, we need + * to reset the directory hints... since we haven't dropped the FSNODELOCK + * on tdvp since this whole thing started, we expect relookup to return + * tvp (which may be NULL) + */ + tcnp->cn_flags &= ~MODMASK; + tcnp->cn_flags |= (WANTPARENT | NOCACHE); + + if ( (error = relookup(tdvp, &rl_vp, tcnp)) ) + panic("ufs_rename: relookup on target returned error"); + if (rl_vp != tvp) { + /* + * Don't panic. The only way this state will be reached is if + * another rename has taken effect. In that case, it's safe + * to restart this rename and let things sort themselves out. + */ + if (rl_vp) + vnode_put(rl_vp); + error = ERESTART; goto abortit; + } + if (rl_vp) { + vnode_put(rl_vp); + rl_vp = NULL; + } dp = VTOI(fdvp); ip = VTOI(fvp); - if ((ip->i_flags & (IMMUTABLE | APPEND)) || (dp->i_flags & APPEND)) { - VOP_UNLOCK(fvp, 0, p); - error = EPERM; - goto abortit; - } + if ((ip->i_mode & IFMT) == IFDIR) { - /* - * Avoid ".", "..", and aliases of "." for obvious reasons. - */ - if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || - dp == ip || (fcnp->cn_flags&ISDOTDOT) || - (ip->i_flag & IN_RENAME)) { - VOP_UNLOCK(fvp, 0, p); + if (ip->i_flag & IN_RENAME) { error = EINVAL; goto abortit; } @@ -1040,7 +804,6 @@ abortit: doingdirectory++; } VN_KNOTE(fdvp, NOTE_WRITE); /* XXX right place? */ - vrele(fdvp); /* * When the target exists, both the directory @@ -1059,9 +822,8 @@ abortit: */ ip->i_nlink++; ip->i_flag |= IN_CHANGE; - tv = time; - if (error = VOP_UPDATE(fvp, &tv, &tv, 1)) { - VOP_UNLOCK(fvp, 0, p); + microtime(&tv); + if ( (error = ffs_update(fvp, &tv, &tv, 1)) ) { goto bad; } @@ -1075,25 +837,26 @@ abortit: * to namei, as the parent directory is unlocked by the * call to checkpath(). 
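 *
 * (Example of what this guards against, illustrative paths only:
 * renaming /a/b onto /a/b/c/d would make 'b' a descendant of
 * itself and orphan the subtree; ufs_checkpath() walks the ".."
 * chain upward from the target directory and fails if it meets
 * the source inode before reaching the root.)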
*/ - error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc); - VOP_UNLOCK(fvp, 0, p); + if (oldparent != dp->i_number) newparent = dp->i_number; + if (doingdirectory && newparent) { if (error) /* write access check above */ goto bad; - if (xp != NULL) - vput(tvp); - if (error = ufs_checkpath(ip, dp, tcnp->cn_cred)) - goto out; - if ((tcnp->cn_flags & SAVESTART) == 0) - panic("ufs_rename: lost to startdir"); - if (error = relookup(tdvp, &tvp, tcnp)) - goto out; + + if ( (error = ufs_checkpath(ip, dp, vfs_context_ucred(tcnp->cn_context))) ) + goto bad; + + if ( (error = relookup(tdvp, &tvp, tcnp)) ) + goto bad; + rl_vp = tvp; + dp = VTOI(tdvp); - xp = NULL; if (tvp) xp = VTOI(tvp); + else + xp = NULL; } /* * 2) If target doesn't exist, link the target @@ -1117,19 +880,18 @@ abortit: } dp->i_nlink++; dp->i_flag |= IN_CHANGE; - if (error = VOP_UPDATE(tdvp, &tv, &tv, 1)) + if ( (error = ffs_update(tdvp, &tv, &tv, 1)) ) goto bad; } - if (error = ufs_direnter(ip, tdvp, tcnp)) { + if ( (error = ufs_direnter(ip, tdvp, tcnp)) ) { if (doingdirectory && newparent) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; - (void)VOP_UPDATE(tdvp, &tv, &tv, 1); + (void)ffs_update(tdvp, &tv, &tv, 1); } goto bad; } VN_KNOTE(tdvp, NOTE_WRITE); - vput(tdvp); } else { if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev) panic("rename: EXDEV"); @@ -1138,25 +900,13 @@ abortit: */ if (xp->i_number == ip->i_number) panic("rename: same file"); - /* - * If the parent directory is "sticky", then the user must - * own the parent directory, or the destination of the rename, - * otherwise the destination may not be changed (except by - * root). This implements append-only directories. - */ - if ((dp->i_mode & S_ISTXT) && tcnp->cn_cred->cr_uid != 0 && - tcnp->cn_cred->cr_uid != dp->i_uid && - xp->i_uid != tcnp->cn_cred->cr_uid) { - error = EPERM; - goto bad; - } /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if ((xp->i_mode&IFMT) == IFDIR) { - if (!ufs_dirempty(xp, dp->i_number, tcnp->cn_cred) || + if (!ufs_dirempty(xp, dp->i_number, vfs_context_ucred(tcnp->cn_context)) || xp->i_nlink > 2) { error = ENOTEMPTY; goto bad; @@ -1170,7 +920,7 @@ abortit: error = EISDIR; goto bad; } - if (error = ufs_dirrewrite(dp, ip, tcnp)) + if ( (error = ufs_dirrewrite(dp, ip, tcnp)) ) goto bad; /* * If the target directory is in the same @@ -1183,7 +933,6 @@ abortit: dp->i_flag |= IN_CHANGE; } VN_KNOTE(tdvp, NOTE_WRITE); - vput(tdvp); /* * Adjust the link count of the target to * reflect the dirrewrite above. If this is @@ -1200,33 +949,35 @@ abortit: panic("rename: linked directory"); ioflag = ((tvp)->v_mount->mnt_flag & MNT_ASYNC) ? 0 : IO_SYNC; - error = VOP_TRUNCATE(tvp, (off_t)0, ioflag, - tcnp->cn_cred, tcnp->cn_proc); + error = ffs_truncate_internal(tvp, (off_t)0, ioflag, vfs_context_ucred(tcnp->cn_context)); } xp->i_flag |= IN_CHANGE; VN_KNOTE(tvp, NOTE_DELETE); - vput(tvp); xp = NULL; } - + if (rl_vp) + vnode_put(rl_vp); + rl_vp = NULL; + /* * 3) Unlink the source. */ fcnp->cn_flags &= ~MODMASK; - fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; - if ((fcnp->cn_flags & SAVESTART) == 0) - panic("ufs_rename: lost from startdir"); + fcnp->cn_flags |= (WANTPARENT | NOCACHE); + (void) relookup(fdvp, &fvp, fcnp); + if (fvp != NULL) { xp = VTOI(fvp); dp = VTOI(fdvp); + rl_vp = fvp; } else { /* * From name has disappeared. 
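 *
 * (i.e. another rename or remove got to the source entry first;
 * for anything but a directory the rename is treated as already
 * complete and 0 is returned.)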
*/ if (doingdirectory) panic("rename: lost dir entry"); - vrele(ap->a_fvp); + return (0); } /* @@ -1236,7 +987,7 @@ abortit: * either case there is no further work to be done. If the source * is a directory then it cannot have been rmdir'ed; its link * count of three would cause a rmdir to fail with ENOTEMPTY. - * The IRENAME flag ensures that it cannot be moved by another + * The IN_RENAME flag ensures that it cannot be moved by another * rename. */ if (xp != ip) { @@ -1254,8 +1005,8 @@ abortit: dp->i_flag |= IN_CHANGE; error = vn_rdwr(UIO_READ, fvp, (caddr_t)&dirbuf, sizeof (struct dirtemplate), (off_t)0, - UIO_SYSSPACE, IO_NODELOCKED, - tcnp->cn_cred, (int *)0, (struct proc *)0); + UIO_SYSSPACE32, IO_NODELOCKED, + vfs_context_ucred(tcnp->cn_context), (int *)0, (struct proc *)0); if (error == 0) { # if (BYTE_ORDER == LITTLE_ENDIAN) if (fvp->v_mount->mnt_maxsymlinklen <= 0) @@ -1275,9 +1026,9 @@ abortit: (void) vn_rdwr(UIO_WRITE, fvp, (caddr_t)&dirbuf, sizeof (struct dirtemplate), - (off_t)0, UIO_SYSSPACE, + (off_t)0, UIO_SYSSPACE32, IO_NODELOCKED|IO_SYNC, - tcnp->cn_cred, (int *)0, + vfs_context_ucred(tcnp->cn_context), (int *)0, (struct proc *)0); cache_purge(fdvp); } @@ -1291,26 +1042,24 @@ abortit: xp->i_flag &= ~IN_RENAME; } VN_KNOTE(fvp, NOTE_RENAME); - if (dp) - vput(fdvp); - if (xp) - vput(fvp); - vrele(ap->a_fvp); + + if (rl_vp) + vnode_put(rl_vp); + return (error); bad: - if (xp) - vput(ITOV(xp)); - vput(ITOV(dp)); -out: + if (rl_vp) + vnode_put(rl_vp); + if (doingdirectory) ip->i_flag &= ~IN_RENAME; - if (vn_lock(fvp, LK_EXCLUSIVE, p) == 0) { - ip->i_nlink--; - ip->i_flag |= IN_CHANGE; - vput(fvp); - } else - vrele(fvp); + + ip->i_nlink--; + ip->i_flag |= IN_CHANGE; + ip->i_flag &= ~IN_RENAME; + +abortit: return (error); } @@ -1331,15 +1080,16 @@ static struct odirtemplate omastertemplate = { */ int ufs_mkdir(ap) - struct vop_mkdir_args /* { + struct vnop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; + struct vnode_attr *a_vap; + vfs_context_t a_context; } */ *ap; { register struct vnode *dvp = ap->a_dvp; - register struct vattr *vap = ap->a_vap; + register struct vnode_attr *vap = ap->a_vap; register struct componentname *cnp = ap->a_cnp; register struct inode *ip, *dp; struct vnode *tvp; @@ -1347,10 +1097,17 @@ ufs_mkdir(ap) struct timeval tv; int error, dmode; -#if DIAGNOSTIC - if ((cnp->cn_flags & HASBUF) == 0) - panic("ufs_mkdir: no name"); -#endif + /* use relookup to force correct directory hints */ + cnp->cn_flags &= ~MODMASK; + cnp->cn_flags |= (WANTPARENT | NOCACHE); + cnp->cn_nameiop = CREATE; + + (void) relookup(dvp, &tvp, cnp); + + /* get rid of reference relookup returned */ + if (tvp) + vnode_put(tvp); + dp = VTOI(dvp); if ((nlink_t)dp->i_nlink >= LINK_MAX) { error = EMLINK; @@ -1358,37 +1115,35 @@ ufs_mkdir(ap) } dmode = vap->va_mode & 0777; dmode |= IFDIR; + /* * Must simulate part of ufs_makeinode here to acquire the inode, * but not have it entered in the parent directory. The entry is * made later after writing "." and ".." entries. 
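 *
 * (Ordering note, descriptive only: the inode is allocated and
 * initialized, the parent's link count is bumped for the ".."
 * reference, and the "." / ".." template is written into the new
 * directory before ufs_direnter() makes the name visible in the
 * parent, so a crash in between leaves at worst an unreachable
 * inode and a stale parent link count for fsck to correct, never
 * a live directory missing "." or "..".)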
*/ - if (error = VOP_VALLOC(dvp, dmode, cnp->cn_cred, &tvp)) + if ( (error = ffs_valloc(dvp, (mode_t)dmode, vfs_context_ucred(cnp->cn_context), &tvp)) ) goto out; ip = VTOI(tvp); - ip->i_uid = cnp->cn_cred->cr_uid; - ip->i_gid = dp->i_gid; + ip->i_uid = ap->a_vap->va_uid; + ip->i_gid = ap->a_vap->va_gid; + VATTR_SET_SUPPORTED(ap->a_vap, va_mode); + VATTR_SET_SUPPORTED(ap->a_vap, va_uid); + VATTR_SET_SUPPORTED(ap->a_vap, va_gid); #if QUOTA if ((error = getinoquota(ip)) || - (error = chkiq(ip, 1, cnp->cn_cred, 0))) { - char *tmp = cnp->cn_pnbuf; - cnp->cn_pnbuf = NULL; - cnp->cn_flags &= ~HASBUF; - FREE_ZONE(tmp, cnp->cn_pnlen, M_NAMEI); - VOP_VFREE(tvp, ip->i_number, dmode); - vput(tvp); - vput(dvp); + (error = chkiq(ip, 1, vfs_context_ucred(cnp->cn_context), 0))) { + ffs_vfree(tvp, ip->i_number, dmode); + vnode_put(tvp); return (error); } #endif ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = dmode; - tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ ip->i_nlink = 2; if (cnp->cn_flags & ISWHITEOUT) ip->i_flags |= UF_OPAQUE; - tv = time; - error = VOP_UPDATE(tvp, &tv, &tv, 1); + microtime(&tv); + error = ffs_update(tvp, &tv, &tv, 1); /* * Bump link count in parent directory @@ -1398,7 +1153,7 @@ ufs_mkdir(ap) */ dp->i_nlink++; dp->i_flag |= IN_CHANGE; - if (error = VOP_UPDATE(dvp, &tv, &tv, 1)) + if ( (error = ffs_update(dvp, &tv, &tv, 1)) ) goto bad; /* Initialize directory with "." and ".." from static template. */ @@ -1410,14 +1165,14 @@ ufs_mkdir(ap) dirtemplate.dot_ino = ip->i_number; dirtemplate.dotdot_ino = dp->i_number; error = vn_rdwr(UIO_WRITE, tvp, (caddr_t)&dirtemplate, - sizeof (dirtemplate), (off_t)0, UIO_SYSSPACE, - IO_NODELOCKED|IO_SYNC, cnp->cn_cred, (int *)0, (struct proc *)0); + sizeof (dirtemplate), (off_t)0, UIO_SYSSPACE32, + IO_NODELOCKED|IO_SYNC, vfs_context_ucred(cnp->cn_context), (int *)0, (struct proc *)0); if (error) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; goto bad; } - if (DIRBLKSIZ > VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_bsize) + if (DIRBLKSIZ > VFSTOUFS(dvp->v_mount)->um_mountp->mnt_vfsstat.f_bsize) panic("ufs_mkdir: blksize"); /* XXX should grow with balloc() */ else { ip->i_size = DIRBLKSIZ; @@ -1425,31 +1180,28 @@ ufs_mkdir(ap) } /* Directory set up, now install it's entry in the parent directory. */ - if (error = ufs_direnter(ip, dvp, cnp)) { + if ( (error = ufs_direnter(ip, dvp, cnp)) ) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; } bad: /* - * No need to do an explicit VOP_TRUNCATE here, vrele will do this + * No need to do an explicit vnop_truncate here, vnode_put will do it * for us because we set the link count to 0. */ if (error) { ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; - vput(tvp); + /* + * since we're not returning tvp due to the error, + * we're responsible for releasing it here + */ + vnode_put(tvp); } else { VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); *ap->a_vpp = tvp; }; out: - { - char *tmp = cnp->cn_pnbuf; - cnp->cn_pnbuf = NULL; - cnp->cn_flags &= ~HASBUF; - FREE_ZONE(tmp, cnp->cn_pnlen, M_NAMEI); - } - vput(dvp); return (error); } @@ -1458,28 +1210,45 @@ out: */ int ufs_rmdir(ap) - struct vop_rmdir_args /* { + struct vnop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; + vfs_context_t a_context; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; + struct vnode *tvp; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; int error, ioflag; + ip = VTOI(vp); dp = VTOI(dvp); /* * No rmdir "." please. 
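Under the vnode_attr scheme used above, a create-class vnop applies what it can from the caller-supplied attributes and must mark each one it honoured, so the VFS layer can emulate the rest. A sketch of that protocol, with an invented helper name and ufs types assumed in scope:

    static void
    apply_create_attrs(struct inode *ip, struct vnode_attr *vap)
    {
        ip->i_uid = vap->va_uid;    /* ownership decided above the FS */
        ip->i_gid = vap->va_gid;

        /* anything left unmarked is emulated generically by the VFS */
        VATTR_SET_SUPPORTED(vap, va_mode);
        VATTR_SET_SUPPORTED(vap, va_uid);
        VATTR_SET_SUPPORTED(vap, va_gid);
    }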
*/ - if (dp == ip) { - vrele(dvp); - vput(vp); + if (dp == ip) return (EINVAL); - } + + + cnp->cn_flags &= ~MODMASK; + cnp->cn_flags |= (WANTPARENT | NOCACHE); + + (void) relookup(dvp, &tvp, cnp); + + if (tvp == NULL) + return (ENOENT); + if (tvp != vp) + panic("ufs_rmdir: relookup returned a different vp"); + /* + * get rid of reference relookup returned + */ + vnode_put(tvp); + + /* * Verify the directory is empty (and valid). * (Rmdir ".." won't be valid since @@ -1489,27 +1258,21 @@ ufs_rmdir(ap) */ error = 0; if (ip->i_nlink != 2 || - !ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { + !ufs_dirempty(ip, dp->i_number, vfs_context_ucred(cnp->cn_context))) { error = ENOTEMPTY; goto out; } - if ((dp->i_flags & APPEND) || (ip->i_flags & (IMMUTABLE | APPEND))) { - error = EPERM; - goto out; - } /* * Delete reference to directory before purging * inode. If we crash in between, the directory * will be reattached to lost+found, */ - if (error = ufs_dirremove(dvp, cnp)) + if ( (error = ufs_dirremove(dvp, cnp)) ) goto out; VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); dp->i_nlink--; dp->i_flag |= IN_CHANGE; cache_purge(dvp); - vput(dvp); - dvp = NULL; /* * Truncate inode. The only stuff left * in the directory is "." and "..". The @@ -1523,14 +1286,10 @@ ufs_rmdir(ap) */ ip->i_nlink -= 2; ioflag = ((vp)->v_mount->mnt_flag & MNT_ASYNC) ? 0 : IO_SYNC; - error = VOP_TRUNCATE(vp, (off_t)0, ioflag, cnp->cn_cred, - cnp->cn_proc); + error = ffs_truncate_internal(vp, (off_t)0, ioflag, vfs_context_ucred(cnp->cn_context)); cache_purge(ITOV(ip)); out: - if (dvp) - vput(dvp); VN_KNOTE(vp, NOTE_DELETE); - vput(vp); return (error); } @@ -1539,20 +1298,20 @@ out: */ int ufs_symlink(ap) - struct vop_symlink_args /* { + struct vnop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; + struct vnode_attr *a_vap; char *a_target; + vfs_context_t a_context; } */ *ap; { register struct vnode *vp, **vpp = ap->a_vpp; register struct inode *ip; int len, error; - if (error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, - vpp, ap->a_cnp)) + if ( (error = ufs_makeinode(ap->a_vap, ap->a_dvp, vpp, ap->a_cnp)) ) return (error); VN_KNOTE(ap->a_dvp, NOTE_WRITE); vp = *vpp; @@ -1564,9 +1323,8 @@ ufs_symlink(ap) ip->i_flag |= IN_CHANGE | IN_UPDATE; } else error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0, - UIO_SYSSPACE, IO_NODELOCKED, ap->a_cnp->cn_cred, (int *)0, + UIO_SYSSPACE32, IO_NODELOCKED, vfs_context_ucred(ap->a_cnp->cn_context), (int *)0, (struct proc *)0); - vput(vp); return (error); } @@ -1581,49 +1339,60 @@ ufs_symlink(ap) */ int ufs_readdir(ap) - struct vop_readdir_args /* { + struct vnop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; - struct ucred *a_cred; + int a_flags; int *a_eofflag; - int *ncookies; - u_long **a_cookies; + int *a_numdirent; + vfs_context_t a_context; } */ *ap; { - register struct uio *uio = ap->a_uio; + struct uio *uio = ap->a_uio; int error; size_t count, lost; - off_t off = uio->uio_offset; - count = uio->uio_resid; + if (ap->a_flags & VNODE_READDIR_EXTENDED) { + return ufs_readdirext(ap->a_vp, uio, ap->a_eofflag, + ap->a_numdirent, ap->a_context); + } + + // LP64todo - fix this + count = uio_resid(uio); /* Make sure we don't return partial entries. 
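The trim on the next line keeps a directory read from ending inside a DIRBLKSIZ block, so no partial entry can ever be returned. A worked example of the arithmetic (the helper name is invented):

    /*
     * With DIRBLKSIZ = 512, uio_offset = 496 and count = 600:
     * (496 + 600) & 511 == 72, so count becomes 528 and the
     * transfer ends exactly at byte 1024, a block boundary.
     */
    static size_t
    dirblk_trim(off_t offset, size_t count)
    {
        return (count - ((offset + count) & (DIRBLKSIZ - 1)));
    }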
*/ count -= (uio->uio_offset + count) & (DIRBLKSIZ -1); if (count <= 0) return (EINVAL); - lost = uio->uio_resid - count; - uio->uio_resid = count; - uio->uio_iov->iov_len = count; + // LP64todo - fix this + lost = uio_resid(uio) - count; + uio_setresid(uio, count); + uio_iov_len_set(uio, count); # if (BYTE_ORDER == LITTLE_ENDIAN) if (ap->a_vp->v_mount->mnt_maxsymlinklen > 0) { - error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); + error = ffs_read_internal(ap->a_vp, uio, 0); } else { struct dirent *dp, *edp; struct uio auio; - struct iovec aiov; + struct iovec_32 aiov; caddr_t dirbuf; int readcnt; u_char tmp; auio = *uio; - auio.uio_iov = &aiov; + auio.uio_iovs.iov32p = &aiov; auio.uio_iovcnt = 1; +#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ auio.uio_segflg = UIO_SYSSPACE; +#else + auio.uio_segflg = UIO_SYSSPACE32; +#endif aiov.iov_len = count; MALLOC(dirbuf, caddr_t, count, M_TEMP, M_WAITOK); - aiov.iov_base = dirbuf; - error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred); + aiov.iov_base = (uintptr_t)dirbuf; + error = ffs_read_internal(ap->a_vp, &auio, 0); if (error == 0) { - readcnt = count - auio.uio_resid; + // LP64todo - fix this + readcnt = count - uio_resid(&auio); edp = (struct dirent *)&dirbuf[readcnt]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { tmp = dp->d_namlen; @@ -1643,56 +1412,137 @@ ufs_readdir(ap) FREE(dirbuf, M_TEMP); } # else - error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); + error = ffs_read_internal(ap->a_vp, uio, 0); # endif - if (!error && ap->a_ncookies != NULL) { - struct dirent* dpStart; - struct dirent* dpEnd; - struct dirent* dp; - int ncookies; - u_long *cookies; - u_long *cookiep; - /* - * Only the NFS server uses cookies, and it loads the - * directory block into system space, so we can just look at - * it directly. - */ - if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) - panic("ufs_readdir: unexpected uio from NFS server"); - dpStart = (struct dirent *) - (uio->uio_iov->iov_base - (uio->uio_offset - off)); - dpEnd = (struct dirent *) uio->uio_iov->iov_base; - for (dp = dpStart, ncookies = 0; - dp < dpEnd && dp->d_reclen != 0; - dp = (struct dirent *)((caddr_t)dp + dp->d_reclen)) - ncookies++; - MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, - M_WAITOK); - for (dp = dpStart, cookiep = cookies; - dp < dpEnd; - dp = (struct dirent *)((caddr_t) dp + dp->d_reclen)) { - off += dp->d_reclen; - *cookiep++ = (u_long) off; - } - *ap->a_ncookies = ncookies; - *ap->a_cookies = cookies; + uio_setresid(uio, (uio_resid(uio) + lost)); + if (ap->a_eofflag) + *ap->a_eofflag = (off_t)VTOI(ap->a_vp)->i_size <= uio->uio_offset; + return (error); +} + + +/* + * ufs_readdirext reads directory entries into the buffer pointed + * to by uio, in a filesystem independent format. Up to uio_resid + * bytes of data can be transferred. The data in the buffer is a + * series of packed direntry structures where each one contains the + * following entries: + * + * d_reclen: length of record + * d_ino: file number of entry + * d_seekoff: seek offset (used by NFS server, aka cookie) + * d_type: file type + * d_namlen: length of string in d_name + * d_name: null terminated file name + * + * The current position (uio_offset) refers to the next block of + * entries. The offset will only be set to a value previously + * returned by ufs_readdirext or zero. This offset does not have + * to match the number of bytes returned (in uio_resid). 
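The EXT_DIRENT_LEN macro defined just below packs each converted entry to a 4-byte boundary. The arithmetic: sizeof(struct direntry) counts a full d_name[MAXPATHLEN] buffer, so subtracting (MAXPATHLEN - 1) leaves the fixed header plus one byte for the terminating NUL; namlen bytes of name are added back, and the "+ 3 ... & ~3" rounds the record up to a multiple of four. For example, assuming a 21-byte fixed header (an assumption about the <sys/dirent.h> layout of the day, not something stated in this patch), a 5-character name yields (21 + 5 + 1 + 3) & ~3 == 28 bytes.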
+ */ +#define EXT_DIRENT_LEN(namlen) \ + ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 3) & ~3) + +static int +ufs_readdirext(vnode_t vp, uio_t uio, int *eofflag, int *numdirent, + __unused vfs_context_t context) +{ + int error; + size_t count, lost; + off_t off = uio->uio_offset; + struct dirent *dp, *edp; + struct uio auio; + struct iovec_32 aiov; + caddr_t dirbuf; + struct direntry *xdp; + int nentries = 0; + + // LP64todo - fix this + count = uio_resid(uio); + /* Make sure we don't return partial entries. */ + count -= (uio->uio_offset + count) & (DIRBLKSIZ -1); + if (count <= 0) + return (EINVAL); + // LP64todo - fix this + lost = uio_resid(uio) - count; + uio_setresid(uio, count); + uio_iov_len_set(uio, count); + + auio = *uio; + auio.uio_iovs.iov32p = &aiov; + auio.uio_iovcnt = 1; + /* LP64todo - can't use new segment flags until the drivers are ready */ + auio.uio_segflg = UIO_SYSSPACE; + aiov.iov_len = count; + MALLOC(dirbuf, caddr_t, count, M_TEMP, M_WAITOK); + aiov.iov_base = (uintptr_t)dirbuf; + + MALLOC(xdp, struct direntry *, sizeof(struct direntry), M_TEMP, M_WAITOK); + + error = ffs_read_internal(vp, &auio, 0); + if (error) + goto out; + + // LP64todo - fix this + edp = (struct dirent *)&dirbuf[count - uio_resid(&auio)]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { + +#if (BYTE_ORDER == LITTLE_ENDIAN) + u_char tmp; + + tmp = dp->d_namlen; + dp->d_namlen = dp->d_type; + dp->d_type = tmp; +#endif + xdp->d_reclen = EXT_DIRENT_LEN(dp->d_namlen); + if (xdp->d_reclen > uio_resid(uio)) { + break; /* user buffer is full */ + } + xdp->d_ino = dp->d_ino; + xdp->d_namlen = dp->d_namlen; + xdp->d_type = dp->d_type; + bcopy(dp->d_name, xdp->d_name, dp->d_namlen + 1); + off += dp->d_reclen; + xdp->d_seekoff = off; + error = uiomove((caddr_t)xdp, xdp->d_reclen, uio); + if (error) { + off -= dp->d_reclen; + break; /* unexpected this error is */ + } + nentries++; + + if (dp->d_reclen > 0) { + dp = (struct dirent *) + ((char *)dp + dp->d_reclen); + } else { + error = EIO; + break; + } } - uio->uio_resid += lost; - if (ap->a_eofflag) - *ap->a_eofflag = VTOI(ap->a_vp)->i_size <= uio->uio_offset; +out: + FREE(dirbuf, M_TEMP); + FREE(xdp, M_TEMP); + + /* Use the on-disk dirent offset */ + uio_setoffset(uio, off); + *numdirent = nentries; + uio_setresid(uio, (uio_resid(uio) + lost)); + if (eofflag) + *eofflag = (off_t)VTOI(vp)->i_size <= uio->uio_offset; return (error); } + /* * Return target name of a symbolic link */ int ufs_readlink(ap) - struct vop_readlink_args /* { + struct vnop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { register struct vnode *vp = ap->a_vp; @@ -1704,149 +1554,23 @@ ufs_readlink(ap) uiomove((char *)ip->i_shortlink, isize, ap->a_uio); return (0); } - return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); -} - -/* - * Lock an inode. If its already locked, set the WANT bit and sleep. - */ -int -ufs_lock(ap) - struct vop_lock_args /* { - struct vnode *a_vp; - int a_flags; - struct proc *a_p; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - - if (VTOI(vp) == (struct inode *)NULL) - panic("inode in vnode is null\n"); - return (lockmgr(&VTOI(vp)->i_lock, ap->a_flags, &vp->v_interlock, - ap->a_p)); -} - -/* - * Unlock an inode. 
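The conversion loop above must also cope with the pre-4.4BSD on-disk dirent, whose 16-bit d_namlen overlaps the new format's d_type byte on little-endian machines; mnt_maxsymlinklen <= 0 is the marker for such old-format filesystems, as the readdir path shows. The fixup, pulled out as a standalone sketch with an invented name:

    #include <sys/dirent.h>

    static void
    swap_dirent_type_namlen(struct dirent *dp)
    {
        /* old format: the low byte of the 16-bit namlen sits where
         * the new format keeps d_type, so the two bytes trade places */
        u_char tmp = dp->d_namlen;

        dp->d_namlen = dp->d_type;
        dp->d_type = tmp;
    }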
- */ -int -ufs_unlock(ap) - struct vop_unlock_args /* { - struct vnode *a_vp; - int a_flags; - struct proc *a_p; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - - return (lockmgr(&VTOI(vp)->i_lock, ap->a_flags | LK_RELEASE, - &vp->v_interlock, ap->a_p)); + return (ffs_read_internal(vp, ap->a_uio, 0)); } /* - * Check for a locked inode. + * prepare and issue the I/O */ -int -ufs_islocked(ap) - struct vop_islocked_args /* { - struct vnode *a_vp; - } */ *ap; -{ - - return (lockstatus(&VTOI(ap->a_vp)->i_lock)); -} - -/* - * Calculate the logical to physical mapping if not done already, - * then call the device strategy routine. - */ -int +errno_t ufs_strategy(ap) - struct vop_strategy_args /* { + struct vnop_strategy_args /* { struct buf *a_bp; } */ *ap; { - register struct buf *bp = ap->a_bp; - register struct vnode *vp = bp->b_vp; - register struct inode *ip; - int error; - - ip = VTOI(vp); - if ( !(bp->b_flags & B_VECTORLIST)) { - if (vp->v_type == VBLK || vp->v_type == VCHR) - panic("ufs_strategy: spec"); - - - if (bp->b_flags & B_PAGELIST) { - /* - * if we have a page list associated with this bp, - * then go through cluste_bp since it knows how to - * deal with a page request that might span non-contiguous - * physical blocks on the disk... - */ -#if 1 - if (bp->b_blkno == bp->b_lblkno) { - if (error = VOP_BMAP(vp, bp->b_lblkno, NULL, - &bp->b_blkno, NULL)) { - bp->b_error = error; - bp->b_flags |= B_ERROR; - biodone(bp); - return (error); - } - } -#endif /* 1 */ - error = cluster_bp(bp); - vp = ip->i_devvp; - bp->b_dev = vp->v_rdev; - - return (error); - } - - if (bp->b_blkno == bp->b_lblkno) { - if (error = - VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL)) { - bp->b_error = error; - bp->b_flags |= B_ERROR; - biodone(bp); - return (error); - } - if ((long)bp->b_blkno == -1) - clrbuf(bp); - } - if ((long)bp->b_blkno == -1) { - biodone(bp); - return (0); - } - - } - - vp = ip->i_devvp; - bp->b_dev = vp->v_rdev; - VOCALL (vp->v_op, VOFFSET(vop_strategy), ap); - return (0); -} - -/* - * Print out the contents of an inode. - */ -int -ufs_print(ap) - struct vop_print_args /* { - struct vnode *a_vp; - } */ *ap; -{ - register struct vnode *vp = ap->a_vp; - register struct inode *ip = VTOI(vp); + buf_t bp = ap->a_bp; + vnode_t vp = buf_vnode(bp); + struct inode *ip = VTOI(vp); - printf("tag VT_UFS, ino %d, on dev %d, %d", ip->i_number, - major(ip->i_dev), minor(ip->i_dev)); -#if FIFO - if (vp->v_type == VFIFO) - fifo_printinfo(vp); -#endif /* FIFO */ - lockmgr_printinfo(&ip->i_lock); - printf("\n"); - return (0); + return (buf_strategy(ip->i_devvp, ap)); } /* @@ -1854,11 +1578,11 @@ ufs_print(ap) */ int ufsspec_read(ap) - struct vop_read_args /* { + struct vnop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { @@ -1866,27 +1590,27 @@ ufsspec_read(ap) * Set access flag. */ VTOI(ap->a_vp)->i_flag |= IN_ACCESS; - return (VOCALL (spec_vnodeop_p, VOFFSET(vop_read), ap)); + return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_read), ap)); } /* * Write wrapper for special devices. */ int -ufsspec_write(ap) - struct vop_write_args /* { +ufsspec_write( + struct vnop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; - } */ *ap; + kauth_cred_t a_cred; + } */ *ap) { /* * Set update and change flags. 
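The ufs_strategy rewrite above is the sharpest example of the KPI shrinkage: the hand-rolled VOP_BMAP translation and cluster_bp dispatch are gone because buf_strategy() now performs the logical-to-physical mapping and issues the I/O itself. A minimal sketch of the resulting entry point ("myfs" is a placeholder name, not part of this patch):

    errno_t
    myfs_strategy(struct vnop_strategy_args *ap)
    {
        buf_t bp = ap->a_bp;

        /* hand the buffer to the device vnode; block mapping
         * happens inside buf_strategy() */
        return (buf_strategy(VTOI(buf_vnode(bp))->i_devvp, ap));
    }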
*/ VTOI(ap->a_vp)->i_flag |= IN_CHANGE | IN_UPDATE; - return (VOCALL (spec_vnodeop_p, VOFFSET(vop_write), ap)); + return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_write), ap)); } /* @@ -1896,21 +1620,21 @@ ufsspec_write(ap) */ int ufsspec_close(ap) - struct vop_close_args /* { + struct vnop_close_args /* { struct vnode *a_vp; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); + struct timeval tv; - simple_lock(&vp->v_interlock); - if (ap->a_vp->v_usecount > 1) - ITIMES(ip, &time, &time); - simple_unlock(&vp->v_interlock); - return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap)); + if (ap->a_vp->v_usecount > 1) { + microtime(&tv); + ITIMES(ip, &tv, &tv); + } + return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_close), ap)); } #if FIFO @@ -1919,11 +1643,11 @@ ufsspec_close(ap) */ int ufsfifo_read(ap) - struct vop_read_args /* { + struct vnop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */ *ap; { extern int (**fifo_vnodeop_p)(void *); @@ -1932,20 +1656,20 @@ ufsfifo_read(ap) * Set access flag. */ VTOI(ap->a_vp)->i_flag |= IN_ACCESS; - return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_read), ap)); + return (VOCALL (fifo_vnodeop_p, VOFFSET(vnop_read), ap)); } /* * Write wrapper for fifo's. */ int -ufsfifo_write(ap) - struct vop_write_args /* { +ufsfifo_write( + struct vnop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; - } */ *ap; + kauth_cred_t a_cred; + } */ *ap) { extern int (**fifo_vnodeop_p)(void *); @@ -1953,7 +1677,7 @@ ufsfifo_write(ap) * Set update and change flags. */ VTOI(ap->a_vp)->i_flag |= IN_CHANGE | IN_UPDATE; - return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_write), ap)); + return (VOCALL (fifo_vnodeop_p, VOFFSET(vnop_write), ap)); } /* @@ -1961,23 +1685,24 @@ ufsfifo_write(ap) * * Update the times on the inode then do device close. 
*/ +int ufsfifo_close(ap) - struct vop_close_args /* { + struct vnop_close_args /* { struct vnode *a_vp; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */ *ap; { extern int (**fifo_vnodeop_p)(void *); struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); + struct timeval tv; - simple_lock(&vp->v_interlock); - if (ap->a_vp->v_usecount > 1) - ITIMES(ip, &time, &time); - simple_unlock(&vp->v_interlock); - return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap)); + if (ap->a_vp->v_usecount > 1) { + microtime(&tv); + ITIMES(ip, &tv, &tv); + } + return (VOCALL (fifo_vnodeop_p, VOFFSET(vnop_close), ap)); } /* @@ -1987,12 +1712,12 @@ ufsfifo_close(ap) */ int ufsfifo_kqfilt_add(ap) - struct vop_kqfilt_add_args *ap; + struct vnop_kqfilt_add_args *ap; { extern int (**fifo_vnodeop_p)(void *); int error; - error = VOCALL(fifo_vnodeop_p, VOFFSET(vop_kqfilt_add), ap); + error = VOCALL(fifo_vnodeop_p, VOFFSET(vnop_kqfilt_add), ap); if (error) error = ufs_kqfilt_add(ap); return (error); @@ -2006,12 +1731,12 @@ ufsfifo_kqfilt_add(ap) */ int ufsfifo_kqfilt_remove(ap) - struct vop_kqfilt_remove_args *ap; + struct vnop_kqfilt_remove_args *ap; { extern int (**fifo_vnodeop_p)(void *); int error; - error = VOCALL(fifo_vnodeop_p, VOFFSET(vop_kqfilt_remove), ap); + error = VOCALL(fifo_vnodeop_p, VOFFSET(vnop_kqfilt_remove), ap); if (error) error = ufs_kqfilt_remove(ap); return (error); @@ -2032,17 +1757,17 @@ static struct filterops ufsvnode_filtops = # #% kqfilt_add vp L L L # - vop_kqfilt_add + vnop_kqfilt_add IN struct vnode *vp; IN struct knote *kn; - IN struct proc *p; + IN vfs_context_t context; */ int ufs_kqfilt_add(ap) - struct vop_kqfilt_add_args /* { + struct vnop_kqfilt_add_args /* { struct vnode *a_vp; struct knote *a_kn; - struct proc *p; + vfs_context_t a_context; } */ *ap; { struct vnode *vp = ap->a_vp; @@ -2063,6 +1788,7 @@ ufs_kqfilt_add(ap) } kn->kn_hook = (caddr_t)vp; + kn->kn_hookid = vnode_vid(vp); KNOTE_ATTACH(&VTOI(vp)->i_knotes, kn); @@ -2077,75 +1803,108 @@ filt_ufsdetach(struct knote *kn) struct proc *p = current_proc(); vp = (struct vnode *)kn->kn_hook; - if (1) { /* ! KNDETACH_VNLOCKED */ - result = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - if (result) return; - }; + + if (vnode_getwithvid(vp, kn->kn_hookid)) + return; result = KNOTE_DETACH(&VTOI(vp)->i_knotes, kn); - - if (1) { /* ! KNDETACH_VNLOCKED */ - VOP_UNLOCK(vp, 0, p); - }; + vnode_put(vp); } -/*ARGSUSED*/ static int filt_ufsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; - struct inode *ip = VTOI(vp); + struct inode *ip; + int dropvp = 0; + int result; - /* - * filesystem is gone, so set the EOF flag and schedule - * the knote for deletion. - */ + if (hint == 0) { + if ((vnode_getwithvid(vp, kn->kn_hookid) != 0)) { + hint = NOTE_REVOKE; + } else + dropvp = 1; + } if (hint == NOTE_REVOKE) { + /* + * filesystem is gone, so set the EOF flag and schedule + * the knote for deletion. 
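The knote changes above introduce a liveness protocol: kn_hookid caches vnode_vid(vp) at attach time, and each later callback revalidates with vnode_getwithvid(), which fails once the vnode has been recycled, so the filter reports a revoke instead of touching reclaimed state. The shape of the check, as an invented standalone helper:

    static int
    knote_vnode_usable(struct knote *kn)
    {
        vnode_t vp = (vnode_t)kn->kn_hook;

        if (vnode_getwithvid(vp, kn->kn_hookid) != 0)
            return (0);    /* recycled: caller treats as NOTE_REVOKE */

        /* ... vp may be dereferenced safely here ... */

        vnode_put(vp);     /* drop the iocount taken above */
        return (1);
    }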
+ */ kn->kn_flags |= (EV_EOF | EV_ONESHOT); return (1); } - kn->kn_data = ip->i_size - kn->kn_fp->f_offset; - return (kn->kn_data != 0); + /* poll(2) semantics dictate always returning true */ + if (kn->kn_flags & EV_POLL) { + kn->kn_data = 1; + result = 1; + } else { + ip = VTOI(vp); + kn->kn_data = ip->i_size - kn->kn_fp->f_fglob->fg_offset; + result = (kn->kn_data != 0); + } + + if (dropvp) + vnode_put(vp); + + return (result); } -/*ARGSUSED*/ static int filt_ufswrite(struct knote *kn, long hint) { - /* - * filesystem is gone, so set the EOF flag and schedule - * the knote for deletion. - */ - if (hint == NOTE_REVOKE) + int dropvp = 0; + + if (hint == 0) { + if ((vnode_getwithvid(kn->kn_hook, kn->kn_hookid) != 0)) { + hint = NOTE_REVOKE; + } else + vnode_put(kn->kn_hook); + } + if (hint == NOTE_REVOKE) { + /* + * filesystem is gone, so set the EOF flag and schedule + * the knote for deletion. + */ + kn->kn_data = 0; kn->kn_flags |= (EV_EOF | EV_ONESHOT); - - kn->kn_data = 0; - return (1); + return (1); + } + kn->kn_data = 0; + return (1); } static int filt_ufsvnode(struct knote *kn, long hint) { + if (hint == 0) { + if ((vnode_getwithvid(kn->kn_hook, kn->kn_hookid) != 0)) { + hint = NOTE_REVOKE; + } else + vnode_put(kn->kn_hook); + } if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; - if (hint == NOTE_REVOKE) { - kn->kn_flags |= EV_EOF; + if ((hint == NOTE_REVOKE)) { + kn->kn_flags |= (EV_EOF | EV_ONESHOT); return (1); } + return (kn->kn_fflags != 0); } /* * Return POSIX pathconf information applicable to ufs filesystems. */ +int ufs_pathconf(ap) - struct vop_pathconf_args /* { + struct vnop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; + vfs_context_t a_context; } */ *ap; { @@ -2174,172 +1933,12 @@ ufs_pathconf(ap) /* NOTREACHED */ } -/* - * Advisory record locking support - */ -int -ufs_advlock(ap) - struct vop_advlock_args /* { - struct vnode *a_vp; - caddr_t a_id; - int a_op; - struct flock *a_fl; - int a_flags; - } */ *ap; -{ - register struct inode *ip = VTOI(ap->a_vp); - register struct flock *fl = ap->a_fl; - register struct lockf *lock; - off_t start, end; - int error; - - /* - * Avoid the common case of unlocking when inode has no locks. - */ - if (ip->i_lockf == (struct lockf *)0) { - if (ap->a_op != F_SETLK) { - fl->l_type = F_UNLCK; - return (0); - } - } - /* - * Convert the flock structure into a start and end. - */ - switch (fl->l_whence) { - - case SEEK_SET: - case SEEK_CUR: - /* - * Caller is responsible for adding any necessary offset - * when SEEK_CUR is used. - */ - start = fl->l_start; - break; - - case SEEK_END: - start = ip->i_size + fl->l_start; - break; - - default: - return (EINVAL); - } - if (fl->l_len == 0) - end = -1; - else if (fl->l_len > 0) - end = start + fl->l_len - 1; - else { /* l_len is negative */ - end = start - 1; - start += fl->l_len; - } - if (start < 0) - return (EINVAL); - /* - * Create the lockf structure - */ - MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); - lock->lf_start = start; - lock->lf_end = end; - lock->lf_id = ap->a_id; - lock->lf_inode = ip; - lock->lf_type = fl->l_type; - lock->lf_next = (struct lockf *)0; - TAILQ_INIT(&lock->lf_blkhd); - lock->lf_flags = ap->a_flags; - /* - * Do the requested operation. 
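For reference, the flock-to-byte-range conversion being removed above follows the POSIX l_len convention; worked cases, taking l_whence = SEEK_SET:

    /*
     * l_start = 100, l_len = 50   -> locks bytes [100, 149]
     * l_start = 100, l_len = -10  -> start += l_len: locks [90, 99]
     * l_start = 100, l_len = 0    -> end = -1, i.e. from 100 to EOF
     */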
- */ - switch(ap->a_op) { - case F_SETLK: - return (lf_setlock(lock)); - - case F_UNLCK: - error = lf_clearlock(lock); - FREE(lock, M_LOCKF); - return (error); - - case F_GETLK: - error = lf_getlock(lock, fl); - FREE(lock, M_LOCKF); - return (error); - - default: - _FREE(lock, M_LOCKF); - return (EINVAL); - } - /* NOTREACHED */ -} - -/* - * Initialize the vnode associated with a new inode, handle aliased - * vnodes. - */ -int -ufs_vinit(mntp, specops, fifoops, vpp) - struct mount *mntp; - int (**specops)(); - int (**fifoops)(); - struct vnode **vpp; -{ - struct proc *p = current_proc(); /* XXX */ - struct inode *ip; - struct vnode *vp, *nvp; - - vp = *vpp; - ip = VTOI(vp); - switch(vp->v_type = IFTOVT(ip->i_mode)) { - case VCHR: - case VBLK: - vp->v_op = specops; - if (nvp = checkalias(vp, ip->i_rdev, mntp)) { - /* - * Discard unneeded vnode, but save its inode. - * Note that the lock is carried over in the inode - * to the replacement vnode. - */ - nvp->v_data = vp->v_data; - vp->v_data = NULL; - vp->v_op = spec_vnodeop_p; - vrele(vp); - vgone(vp); - /* - * Reinitialize aliased inode. - */ - vp = nvp; - ip->i_vnode = vp; - } - break; - case VFIFO: -#if FIFO - vp->v_op = fifoops; - break; -#else - return (EOPNOTSUPP); -#endif - case VREG: -#if 0 - ubc_info_init(vp); -#endif /* 0 */ - break; - default: - break; - } - if (ip->i_number == ROOTINO) - vp->v_flag |= VROOT; - /* - * Initialize modrev times - */ - SETHIGH(ip->i_modrev, time.tv_sec); - SETLOW(ip->i_modrev, time.tv_usec * 4294); - *vpp = vp; - return (0); -} - /* * Allocate a new inode. */ int -ufs_makeinode(mode, dvp, vpp, cnp) - int mode; +ufs_makeinode(vap, dvp, vpp, cnp) + struct vnode_attr *vap; struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; @@ -2348,79 +1947,47 @@ ufs_makeinode(mode, dvp, vpp, cnp) struct timeval tv; struct vnode *tvp; int error; - + int is_member; + int mode; + + mode = MAKEIMODE(vap->va_type, vap->va_mode); pdir = VTOI(dvp); -#if DIAGNOSTIC - if ((cnp->cn_flags & HASBUF) == 0) - panic("ufs_makeinode: no name"); -#endif *vpp = NULL; if ((mode & IFMT) == 0) mode |= IFREG; - if (error = VOP_VALLOC(dvp, mode, cnp->cn_cred, &tvp)) { - char *tmp = cnp->cn_pnbuf; - cnp->cn_pnbuf = NULL; - cnp->cn_flags &= ~HASBUF; - FREE_ZONE(tmp, cnp->cn_pnlen, M_NAMEI); - vput(dvp); + if ( (error = ffs_valloc(dvp, (mode_t)mode, vfs_context_ucred(cnp->cn_context), &tvp)) ) return (error); - } + ip = VTOI(tvp); - ip->i_gid = pdir->i_gid; - if ((mode & IFMT) == IFLNK) - ip->i_uid = pdir->i_uid; - else - ip->i_uid = cnp->cn_cred->cr_uid; + ip->i_gid = vap->va_gid; + ip->i_uid = vap->va_uid; + VATTR_SET_SUPPORTED(vap, va_mode); + VATTR_SET_SUPPORTED(vap, va_uid); + VATTR_SET_SUPPORTED(vap, va_gid); #if QUOTA if ((error = getinoquota(ip)) || - (error = chkiq(ip, 1, cnp->cn_cred, 0))) { - char *tmp = cnp->cn_pnbuf; - cnp->cn_pnbuf = NULL; - cnp->cn_flags &= ~HASBUF; - FREE_ZONE(tmp, cnp->cn_pnlen, M_NAMEI); - VOP_VFREE(tvp, ip->i_number, mode); - vput(tvp); - vput(dvp); + (error = chkiq(ip, 1, vfs_context_ucred(cnp->cn_context), 0))) { + ffs_vfree(tvp, ip->i_number, mode); + vnode_put(tvp); return (error); } #endif ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = mode; - tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). 
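ufs_makeinode now takes the whole vnode_attr instead of a pre-composed mode: MAKEIMODE() folds the vnode type and permission bits back into the on-disk i_mode form (historically VTTOIF(va_type) | va_mode), and policy such as the setgid stripping done by the groupmember()/suser() test deleted just below is now applied generically above the filesystem, before the vnop ever runs. For example:

    mode = MAKEIMODE(vap->va_type, vap->va_mode);  /* VREG + 0644 -> IFREG|0644 */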
*/ ip->i_nlink = 1; - if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) && - suser(cnp->cn_cred, NULL)) - ip->i_mode &= ~ISGID; if (cnp->cn_flags & ISWHITEOUT) ip->i_flags |= UF_OPAQUE; - /* - * initialize UBC before calling VOP_UPDATE and ufs_direnter - * Not doing so introduces probelms in handling error from - * those calls. - * It results in a "vget: stolen ubc_info" panic due to attempt - * to shutdown uninitialized UBC. - */ - if (UBCINFOMISSING(tvp) || UBCINFORECLAIMED(tvp)) - ubc_info_init(tvp); - /* * Make sure inode goes to disk before directory entry. */ - tv = time; - if (error = VOP_UPDATE(tvp, &tv, &tv, 1)) + microtime(&tv); + if ( (error = ffs_update(tvp, &tv, &tv, 1)) ) goto bad; - if (error = ufs_direnter(ip, dvp, cnp)) + if ( (error = ufs_direnter(ip, dvp, cnp)) ) goto bad; - if ((cnp->cn_flags & SAVESTART) == 0) { - char *tmp = cnp->cn_pnbuf; - cnp->cn_pnbuf = NULL; - cnp->cn_flags &= ~HASBUF; - FREE_ZONE(tmp, cnp->cn_pnlen, M_NAMEI); - } - vput(dvp); *vpp = tvp; return (0); @@ -2430,16 +1997,10 @@ bad: * Write error occurred trying to update the inode * or the directory so must deallocate the inode. */ - { - char *tmp = cnp->cn_pnbuf; - cnp->cn_pnbuf = NULL; - cnp->cn_flags &= ~HASBUF; - FREE_ZONE(tmp, cnp->cn_pnlen, M_NAMEI); - } - vput(dvp); ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; - vput(tvp); + vnode_put(tvp); + return (error); } diff --git a/bsd/ufs/ufs/ufsmount.h b/bsd/ufs/ufs/ufsmount.h index a54746310..79a073abd 100644 --- a/bsd/ufs/ufs/ufsmount.h +++ b/bsd/ufs/ufs/ufsmount.h @@ -67,7 +67,6 @@ */ struct ufs_args { char *fspec; /* block special device to mount */ - struct export_args export; /* network export information */ }; #endif /* __APPLE_API_UNSTABLE */ @@ -78,7 +77,6 @@ struct ufs_args { */ struct mfs_args { char *fspec; /* name to export for statfs */ - struct export_args export; /* if exported MFSes are supported */ caddr_t base; /* base of file system in memory */ u_long size; /* size of file system */ }; @@ -90,7 +88,6 @@ struct mfs_args { struct fs; struct mount; struct vnode; -struct netexport; /* This structure describes the UFS specific mount structure data. 
*/ struct ufsmount { @@ -107,7 +104,6 @@ struct ufsmount { u_long um_nindir; /* indirect ptrs per block */ u_long um_bptrtodb; /* indir ptr to disk block */ u_long um_seqinc; /* inc between seq blocks */ - struct netexport um_export; /* export information */ int64_t um_savedmaxfilesize; /* XXX - limit maxfilesize */ }; diff --git a/bsd/uuid/Makefile b/bsd/uuid/Makefile new file mode 100644 index 000000000..8d5af9310 --- /dev/null +++ b/bsd/uuid/Makefile @@ -0,0 +1,60 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + + +include $(MakeInc_cmd) +include $(MakeInc_def) + +INSTINC_SUBDIRS = \ + +INSTINC_SUBDIRS_PPC = \ + +INSTINC_SUBDIRS_I386 = \ + +EXPINC_SUBDIRS = \ + +EXPINC_SUBDIRS_PPC = \ + +EXPINC_SUBDIRS_I386 = \ + +# In both the framework PrivateHeader area and /usr/include/uuid +DATAFILES = \ + uuid.h + +# Only in the framework PrivateHeader area +PRIVATE_DATAFILES = \ + +# KERNELFILES will appear only in the kernel framework +KERNELFILES = \ + uuid.h + + +# Only in the private kernel framework +PRIVATE_KERNELFILES = \ + + +INSTALL_MI_LIST = ${DATAFILES} + +INSTALL_MI_DIR = uuid + +EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} + +EXPORT_MI_DIR = uuid + +# /System/Library/Frameworks/System.framework/PrivateHeaders +INSTALL_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} + +# /System/Library/Frameworks/Kernel.framework/PrivateHeaders + +INSTALL_KF_MI_LCL_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} + +# /System/Library/Frameworks/Kernel.framework/Headers + +INSTALL_KF_MI_LIST = ${KERNELFILES} + +include $(MakeInc_rule) +include $(MakeInc_dir) + + diff --git a/bsd/uuid/uuid.h b/bsd/uuid/uuid.h new file mode 100644 index 000000000..3d172d2f6 --- /dev/null +++ b/bsd/uuid/uuid.h @@ -0,0 +1,74 @@ +/* + * Public include file for the UUID library + * + * Copyright (C) 1996, 1997, 1998 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ * %End-Header% + */ + +#ifndef _UUID_UUID_H +#define _UUID_UUID_H + +#include <sys/_types.h> + +#ifndef _UUID_T +#define _UUID_T +typedef __darwin_uuid_t uuid_t; +#endif /* _UUID_T */ + +#define UUID_DEFINE(name,u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15) \ + static const uuid_t name __attribute__ ((unused)) = {u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15} + +#ifdef __cplusplus +extern "C" { +#endif + +void uuid_clear(uuid_t uu); + +int uuid_compare(const uuid_t uu1, const uuid_t uu2); + +void uuid_copy(uuid_t dst, const uuid_t src); + +void uuid_generate(uuid_t out); +void uuid_generate_random(uuid_t out); +void uuid_generate_time(uuid_t out); + +int uuid_is_null(const uuid_t uu); + +int uuid_parse(const char *in, uuid_t uu); + +void uuid_unparse(const uuid_t uu, char *out); +void uuid_unparse_lower(const uuid_t uu, char *out); +void uuid_unparse_upper(const uuid_t uu, char *out); + +#ifdef __cplusplus +} +#endif + +#endif /* _UUID_UUID_H */ diff --git a/bsd/uxkern/ux_exception.c b/bsd/uxkern/ux_exception.c index 655634a46..e576ad299 100644 --- a/bsd/uxkern/ux_exception.c +++ b/bsd/uxkern/ux_exception.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -39,11 +39,12 @@ #include <mach/kern_return.h> #include <mach/message.h> #include <mach/port.h> +#include <mach/mach_port.h> #include <mach/mig_errors.h> +#include <mach/exc_server.h> #include <kern/task.h> #include <kern/thread.h> #include <kern/sched_prim.h> -#include <kern/thread_act.h> #include <kern/kalloc.h> #include <sys/proc.h> @@ -51,13 +52,35 @@ #include <sys/systm.h> #include <sys/ux_exception.h> +#include <vm/vm_protos.h> /* get_task_ipcspace() */ + +/* + * XXX Things that should be retrieved from Mach headers, but aren't + */ +struct ipc_object; +extern kern_return_t ipc_object_copyin(ipc_space_t space, mach_port_name_t name, + mach_msg_type_name_t msgt_name, struct ipc_object **objectp); +extern mach_msg_return_t mach_msg_receive(mach_msg_header_t *msg, + mach_msg_option_t option, mach_msg_size_t rcv_size, + mach_port_name_t rcv_name, mach_msg_timeout_t rcv_timeout, + void (*continuation)(mach_msg_return_t), + mach_msg_size_t slist_size); +extern mach_msg_return_t mach_msg_send(mach_msg_header_t *msg, + mach_msg_option_t option, mach_msg_size_t send_size, + mach_msg_timeout_t send_timeout, mach_port_name_t notify); +extern thread_t convert_port_to_thread(ipc_port_t port); +extern void ipc_port_release(ipc_port_t); + + + + /* * Unix exception handler. 
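The header above exports the familiar libuuid-style interface. A minimal user-level sketch of the declared calls; the 37-byte buffer holds the 36-character canonical form plus the NUL that uuid_unparse() writes:

    #include <uuid/uuid.h>
    #include <stdio.h>

    int
    main(void)
    {
        uuid_t u, v;
        char str[37];

        uuid_generate(u);         /* random or time-based, as available */
        uuid_unparse(u, str);     /* canonical "xxxxxxxx-xxxx-..." form */
        printf("%s\n", str);

        if (uuid_parse(str, v) == 0 && uuid_compare(u, v) == 0)
            printf("round trip ok\n");
        return (0);
    }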
*/ -static void ux_exception(); +static void ux_exception(int exception, int code, int subcode, + int *ux_signal, int *ux_code); -decl_simple_lock_data(static, ux_handler_init_lock) mach_port_name_t ux_exception_port; static task_t ux_handler_self; @@ -154,37 +177,33 @@ ux_handler(void) void ux_handler_init(void) { - simple_lock_init(&ux_handler_init_lock); ux_exception_port = MACH_PORT_NULL; (void) kernel_thread(kernel_task, ux_handler); - simple_lock(&ux_handler_init_lock); if (ux_exception_port == MACH_PORT_NULL) { - simple_unlock(&ux_handler_init_lock); assert_wait(&ux_exception_port, THREAD_UNINT); thread_block(THREAD_CONTINUE_NULL); } - else - simple_unlock(&ux_handler_init_lock); } kern_return_t catch_exception_raise( - mach_port_name_t exception_port, - mach_port_name_t thread_name, - mach_port_name_t task_name, - int exception, - exception_data_t code, - mach_msg_type_number_t codecnt + __unused mach_port_t exception_port, + mach_port_t thread, + mach_port_t task, + exception_type_t exception, + exception_data_t code, + __unused mach_msg_type_number_t codeCnt ) { task_t self = current_task(); - thread_act_t th_act; + thread_t th_act; ipc_port_t thread_port; - ipc_port_t task_port; kern_return_t result = MACH_MSG_SUCCESS; - int signal = 0; + int ux_signal = 0; u_long ucode = 0; struct uthread *ut; + mach_port_name_t thread_name = (mach_port_name_t)thread; /* XXX */ + mach_port_name_t task_name = (mach_port_name_t)task; /* XXX */ /* * Convert local thread name to global port. @@ -194,31 +213,31 @@ catch_exception_raise( MACH_MSG_TYPE_PORT_SEND, (void *) &thread_port) == MACH_MSG_SUCCESS)) { if (IPC_PORT_VALID(thread_port)) { - th_act = (thread_act_t)convert_port_to_act(thread_port); + th_act = convert_port_to_thread(thread_port); ipc_port_release(thread_port); } else { - th_act = THR_ACT_NULL; + th_act = THREAD_NULL; } /* * Catch bogus ports */ - if (th_act != THR_ACT_NULL) { + if (th_act != THREAD_NULL) { /* * Convert exception to unix signal and code. */ ut = get_bsdthread_info(th_act); ux_exception(exception, code[0], code[1], - &signal, &ucode); + &ux_signal, (int *)&ucode); /* * Send signal. */ - if (signal != 0) - threadsignal(th_act, signal, ucode); + if (ux_signal != 0) + threadsignal(th_act, ux_signal, ucode); - act_deallocate(th_act); + thread_deallocate(th_act); } else result = KERN_INVALID_ARGUMENT; @@ -230,23 +249,43 @@ catch_exception_raise( * Delete our send rights to the task and thread ports. 
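catch_exception_raise() above funnels every Mach exception through ux_exception() before threadsignal() delivers the result to the victim thread. The body of ux_exception() is not shown in this hunk; the sketch below only illustrates the kind of table it implements, with machine-dependent cases deferred to machine_exception():

    #include <mach/exception_types.h>
    #include <sys/signal.h>

    static int
    exception_to_signal(exception_type_t exception)
    {
        switch (exception) {
        case EXC_BAD_ACCESS:        return (SIGSEGV);  /* or SIGBUS */
        case EXC_BAD_INSTRUCTION:   return (SIGILL);
        case EXC_ARITHMETIC:        return (SIGFPE);
        case EXC_EMULATION:         return (SIGEMT);
        case EXC_BREAKPOINT:        return (SIGTRAP);
        default:                    return (0);        /* no signal */
        }
    }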
*/ (void)mach_port_deallocate(get_task_ipcspace(ux_handler_self), task_name); - (void)mach_port_deallocate(get_task_ipcspace(ux_handler_self),thread_name); + (void)mach_port_deallocate(get_task_ipcspace(ux_handler_self), thread_name); return (result); } + kern_return_t -catch_exception_raise_state(mach_port_name_t exception_port, int exception, exception_data_t code, mach_msg_type_number_t codeCnt, int flavor, thread_state_t old_state, int old_stateCnt, thread_state_t new_state, int new_stateCnt) +catch_exception_raise_state( + __unused mach_port_t exception_port, + __unused exception_type_t exception, + __unused const exception_data_t code, + __unused mach_msg_type_number_t codeCnt, + __unused int *flavor, + __unused const thread_state_t old_state, + __unused mach_msg_type_number_t old_stateCnt, + __unused thread_state_t new_state, + __unused mach_msg_type_number_t *new_stateCnt) { return(KERN_INVALID_ARGUMENT); } + kern_return_t -catch_exception_raise_state_identity(mach_port_name_t exception_port, mach_port_t thread, mach_port_t task, int exception, exception_data_t code, mach_msg_type_number_t codeCnt, int flavor, thread_state_t old_state, int old_stateCnt, thread_state_t new_state, int new_stateCnt) +catch_exception_raise_state_identity( + __unused mach_port_t exception_port, + __unused mach_port_t thread, + __unused mach_port_t task, + __unused exception_type_t exception, + __unused exception_data_t code, + __unused mach_msg_type_number_t codeCnt, + __unused int *flavor, + __unused thread_state_t old_state, + __unused mach_msg_type_number_t old_stateCnt, + __unused thread_state_t new_state, + __unused mach_msg_type_number_t *new_stateCnt) { return(KERN_INVALID_ARGUMENT); } -boolean_t machine_exception(); - /* * ux_exception translates a mach exception, code and subcode to * a signal and u.u_code. Calls machine_exception (machine dependent) diff --git a/bsd/vfs/kpi_vfs.c b/bsd/vfs/kpi_vfs.c new file mode 100644 index 000000000..7f72472a9 --- /dev/null +++ b/bsd/vfs/kpi_vfs.c @@ -0,0 +1,4626 @@ +/* + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kpi_vfs.c + */ + +/* + * External virtual filesystem routines + */ + +#undef DIAGNOSTIC +#define DIAGNOSTIC 1 + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> +#include <sys/mount.h> +#include <sys/mount_internal.h> +#include <sys/time.h> +#include <sys/vnode_internal.h> +#include <sys/stat.h> +#include <sys/namei.h> +#include <sys/ucred.h> +#include <sys/buf.h> +#include <sys/errno.h> +#include <sys/malloc.h> +#include <sys/domain.h> +#include <sys/mbuf.h> +#include <sys/syslog.h> +#include <sys/ubc.h> +#include <sys/vm.h> +#include <sys/sysctl.h> +#include <sys/filedesc.h> +#include <sys/fsevents.h> +#include <sys/user.h> +#include <sys/lockf.h> +#include <sys/xattr.h> + +#include <kern/assert.h> +#include <kern/kalloc.h> + +#include <miscfs/specfs/specdev.h> + +#include <mach/mach_types.h> +#include <mach/memory_object_types.h> + +#define ESUCCESS 0 +#undef mount_t +#undef vnode_t + +#define COMPAT_ONLY + + +#define THREAD_SAFE_FS(VP) \ + ((VP)->v_unsafefs ? 0 : 1) + +#define NATIVE_XATTR(VP) \ + ((VP)->v_mount ? 
(VP)->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR : 0) + +static void xattrfile_remove(vnode_t dvp, const char * basename, vfs_context_t context, + int thread_safe, int force); +static void xattrfile_setattr(vnode_t dvp, const char * basename, struct vnode_attr * vap, + vfs_context_t context, int thread_safe); + + +static void +vnode_setneedinactive(vnode_t vp) +{ + cache_purge(vp); + + vnode_lock(vp); + vp->v_lflag |= VL_NEEDINACTIVE; + vnode_unlock(vp); +} + + +int +lock_fsnode(vnode_t vp, int *funnel_state) +{ + if (funnel_state) + *funnel_state = thread_funnel_set(kernel_flock, TRUE); + + if (vp->v_unsafefs) { + if (vp->v_unsafefs->fsnodeowner == current_thread()) { + vp->v_unsafefs->fsnode_count++; + } else { + lck_mtx_lock(&vp->v_unsafefs->fsnodelock); + + if (vp->v_lflag & (VL_TERMWANT | VL_TERMINATE | VL_DEAD)) { + lck_mtx_unlock(&vp->v_unsafefs->fsnodelock); + + if (funnel_state) + (void) thread_funnel_set(kernel_flock, *funnel_state); + return (ENOENT); + } + vp->v_unsafefs->fsnodeowner = current_thread(); + vp->v_unsafefs->fsnode_count = 1; + } + } + return (0); +} + + +void +unlock_fsnode(vnode_t vp, int *funnel_state) +{ + if (vp->v_unsafefs) { + if (--vp->v_unsafefs->fsnode_count == 0) { + vp->v_unsafefs->fsnodeowner = NULL; + lck_mtx_unlock(&vp->v_unsafefs->fsnodelock); + } + } + if (funnel_state) + (void) thread_funnel_set(kernel_flock, *funnel_state); +} + + + +/* ====================================================================== */ +/* ************ EXTERNAL KERNEL APIS ********************************** */ +/* ====================================================================== */ + +/* + * prototypes for exported VFS operations + */ +int +VFS_MOUNT(struct mount * mp, vnode_t devvp, user_addr_t data, vfs_context_t context) +{ + int error; + int thread_safe; + int funnel_state = 0; + + if ((mp == dead_mountp) || (mp->mnt_op->vfs_mount == 0)) + return(ENOTSUP); + + thread_safe = mp->mnt_vtable->vfc_threadsafe; + + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + + if (vfs_context_is64bit(context)) { + if (vfs_64bitready(mp)) { + error = (*mp->mnt_op->vfs_mount)(mp, devvp, data, context); + } + else { + error = ENOTSUP; + } + } + else { + error = (*mp->mnt_op->vfs_mount)(mp, devvp, data, context); + } + + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (error); +} + +int +VFS_START(struct mount * mp, int flags, vfs_context_t context) +{ + int error; + int thread_safe; + int funnel_state = 0; + + if ((mp == dead_mountp) || (mp->mnt_op->vfs_start == 0)) + return(ENOTSUP); + + thread_safe = mp->mnt_vtable->vfc_threadsafe; + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + error = (*mp->mnt_op->vfs_start)(mp, flags, context); + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (error); +} + +int +VFS_UNMOUNT(struct mount *mp, int flags, vfs_context_t context) +{ + int error; + int thread_safe; + int funnel_state = 0; + + if ((mp == dead_mountp) || (mp->mnt_op->vfs_unmount == 0)) + return(ENOTSUP); + + thread_safe = mp->mnt_vtable->vfc_threadsafe; + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + error = (*mp->mnt_op->vfs_unmount)(mp, flags, context); + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (error); +} + +int +VFS_ROOT(struct mount * mp, struct vnode ** vpp, vfs_context_t context) +{ + int error; + int thread_safe; + int 
funnel_state = 0; + struct vfs_context acontext; + + if ((mp == dead_mountp) || (mp->mnt_op->vfs_root == 0)) + return(ENOTSUP); + + if (context == NULL) { + acontext.vc_proc = current_proc(); + acontext.vc_ucred = kauth_cred_get(); + context = &acontext; + } + thread_safe = mp->mnt_vtable->vfc_threadsafe; + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + error = (*mp->mnt_op->vfs_root)(mp, vpp, context); + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (error); +} + +int +VFS_QUOTACTL(struct mount *mp, int cmd, uid_t uid, caddr_t datap, vfs_context_t context) +{ + int error; + int thread_safe; + int funnel_state = 0; + + if ((mp == dead_mountp) || (mp->mnt_op->vfs_quotactl == 0)) + return(ENOTSUP); + + thread_safe = mp->mnt_vtable->vfc_threadsafe; + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + error = (*mp->mnt_op->vfs_quotactl)(mp, cmd, uid, datap, context); + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (error); +} + +int +VFS_GETATTR(struct mount *mp, struct vfs_attr *vfa, vfs_context_t context) +{ + int error; + int thread_safe; + int funnel_state = 0; + struct vfs_context acontext; + + if ((mp == dead_mountp) || (mp->mnt_op->vfs_getattr == 0)) + return(ENOTSUP); + + if (context == NULL) { + acontext.vc_proc = current_proc(); + acontext.vc_ucred = kauth_cred_get(); + context = &acontext; + } + thread_safe = mp->mnt_vtable->vfc_threadsafe; + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + error = (*mp->mnt_op->vfs_getattr)(mp, vfa, context); + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return(error); +} + +int +VFS_SETATTR(struct mount *mp, struct vfs_attr *vfa, vfs_context_t context) +{ + int error; + int thread_safe; + int funnel_state = 0; + struct vfs_context acontext; + + if ((mp == dead_mountp) || (mp->mnt_op->vfs_setattr == 0)) + return(ENOTSUP); + + if (context == NULL) { + acontext.vc_proc = current_proc(); + acontext.vc_ucred = kauth_cred_get(); + context = &acontext; + } + thread_safe = mp->mnt_vtable->vfc_threadsafe; + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + error = (*mp->mnt_op->vfs_setattr)(mp, vfa, context); + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return(error); +} + +int +VFS_SYNC(struct mount *mp, int flags, vfs_context_t context) +{ + int error; + int thread_safe; + int funnel_state = 0; + struct vfs_context acontext; + + if ((mp == dead_mountp) || (mp->mnt_op->vfs_sync == 0)) + return(ENOTSUP); + + if (context == NULL) { + acontext.vc_proc = current_proc(); + acontext.vc_ucred = kauth_cred_get(); + context = &acontext; + } + thread_safe = mp->mnt_vtable->vfc_threadsafe; + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + error = (*mp->mnt_op->vfs_sync)(mp, flags, context); + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return(error); +} + +int +VFS_VGET(struct mount * mp, ino64_t ino, struct vnode **vpp, vfs_context_t context) +{ + int error; + int thread_safe; + int funnel_state = 0; + struct vfs_context acontext; + + if ((mp == dead_mountp) || (mp->mnt_op->vfs_vget == 0)) + return(ENOTSUP); + + if (context == NULL) { + acontext.vc_proc = current_proc(); + acontext.vc_ucred = kauth_cred_get(); + context = &acontext; + } + thread_safe = mp->mnt_vtable->vfc_threadsafe; + + if 
(!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + error = (*mp->mnt_op->vfs_vget)(mp, ino, vpp, context); + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return(error); +} + +int +VFS_FHTOVP(struct mount * mp, int fhlen, unsigned char * fhp, vnode_t * vpp, vfs_context_t context) +{ + int error; + int thread_safe; + int funnel_state = 0; + struct vfs_context acontext; + + if ((mp == dead_mountp) || (mp->mnt_op->vfs_fhtovp == 0)) + return(ENOTSUP); + + if (context == NULL) { + acontext.vc_proc = current_proc(); + acontext.vc_ucred = kauth_cred_get(); + context = &acontext; + } + thread_safe = mp->mnt_vtable->vfc_threadsafe; + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + error = (*mp->mnt_op->vfs_fhtovp)(mp, fhlen, fhp, vpp, context); + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return(error); +} + +int +VFS_VPTOFH(struct vnode * vp, int *fhlenp, unsigned char * fhp, vfs_context_t context) +{ + int error; + int thread_safe; + int funnel_state = 0; + struct vfs_context acontext; + + if ((vp->v_mount == dead_mountp) || (vp->v_mount->mnt_op->vfs_vptofh == 0)) + return(ENOTSUP); + + if (context == NULL) { + acontext.vc_proc = current_proc(); + acontext.vc_ucred = kauth_cred_get(); + context = &acontext; + } + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + error = (*vp->v_mount->mnt_op->vfs_vptofh)(vp, fhlenp, fhp, context); + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return(error); +} + + +/* returns a copy of vfs type name for the mount_t */ +void +vfs_name(mount_t mp, char * buffer) +{ + strncpy(buffer, mp->mnt_vtable->vfc_name, MFSNAMELEN); +} + +/* returns vfs type number for the mount_t */ +int +vfs_typenum(mount_t mp) +{ + return(mp->mnt_vtable->vfc_typenum); +} + + +/* returns command modifier flags of mount_t ie. MNT_CMDFLAGS */ +uint64_t +vfs_flags(mount_t mp) +{ + return((uint64_t)(mp->mnt_flag & (MNT_CMDFLAGS | MNT_VISFLAGMASK))); +} + +/* set any of the command modifier flags(MNT_CMDFLAGS) in mount_t */ +void +vfs_setflags(mount_t mp, uint64_t flags) +{ + uint32_t lflags = (uint32_t)(flags & (MNT_CMDFLAGS | MNT_VISFLAGMASK)); + + mp->mnt_flag |= lflags; +} + +/* clear any of the command modifier flags(MNT_CMDFLAGS) in mount_t */ +void +vfs_clearflags(mount_t mp , uint64_t flags) +{ + uint32_t lflags = (uint32_t)(flags & (MNT_CMDFLAGS | MNT_VISFLAGMASK)); + + mp->mnt_flag &= ~lflags; +} + +/* Is the mount_t ronly and upgrade read/write requested? */ +int +vfs_iswriteupgrade(mount_t mp) /* ronly && MNTK_WANTRDWR */ +{ + return ((mp->mnt_flag & MNT_RDONLY) && (mp->mnt_kern_flag & MNTK_WANTRDWR)); +} + + +/* Is the mount_t mounted ronly */ +int +vfs_isrdonly(mount_t mp) +{ + return (mp->mnt_flag & MNT_RDONLY); +} + +/* Is the mount_t mounted for filesystem synchronous writes? */ +int +vfs_issynchronous(mount_t mp) +{ + return (mp->mnt_flag & MNT_SYNCHRONOUS); +} + +/* Is the mount_t mounted read/write? 
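Every VFS_*() wrapper above repeats the same bracket: if the filesystem did not declare itself thread-safe at registration time, the call runs under the 10.4-era kernel funnel, and the previous funnel state is restored afterwards. The skeleton of the pattern, with an invented helper signature:

    static int
    vfs_call_guarded(mount_t mp, int (*op)(mount_t, int, vfs_context_t),
        int flags, vfs_context_t ctx)
    {
        int funnel_state = 0;
        int thread_safe = mp->mnt_vtable->vfc_threadsafe;
        int error;

        /* non-threadsafe filesystems still run under the funnel */
        if (!thread_safe)
            funnel_state = thread_funnel_set(kernel_flock, TRUE);

        error = (*op)(mp, flags, ctx);

        if (!thread_safe)
            (void) thread_funnel_set(kernel_flock, funnel_state);
        return (error);
    }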
*/ +int +vfs_isrdwr(mount_t mp) +{ + return ((mp->mnt_flag & MNT_RDONLY) == 0); +} + + +/* Is mount_t marked for update (ie MNT_UPDATE) */ +int +vfs_isupdate(mount_t mp) +{ + return (mp->mnt_flag & MNT_UPDATE); +} + + +/* Is mount_t marked for reload (ie MNT_RELOAD) */ +int +vfs_isreload(mount_t mp) +{ + return ((mp->mnt_flag & MNT_UPDATE) && (mp->mnt_flag & MNT_RELOAD)); +} + +/* Is mount_t marked for reload (ie MNT_FORCE) */ +int +vfs_isforce(mount_t mp) +{ + if ((mp->mnt_flag & MNT_FORCE) || (mp->mnt_kern_flag & MNTK_FRCUNMOUNT)) + return(1); + else + return(0); +} + +int +vfs_64bitready(mount_t mp) +{ + if ((mp->mnt_vtable->vfc_64bitready)) + return(1); + else + return(0); +} + +int +vfs_authopaque(mount_t mp) +{ + if ((mp->mnt_kern_flag & MNTK_AUTH_OPAQUE)) + return(1); + else + return(0); +} + +int +vfs_authopaqueaccess(mount_t mp) +{ + if ((mp->mnt_kern_flag & MNTK_AUTH_OPAQUE_ACCESS)) + return(1); + else + return(0); +} + +void +vfs_setauthopaque(mount_t mp) +{ + mount_lock(mp); + mp->mnt_kern_flag |= MNTK_AUTH_OPAQUE; + mount_unlock(mp); +} + +void +vfs_setauthopaqueaccess(mount_t mp) +{ + mount_lock(mp); + mp->mnt_kern_flag |= MNTK_AUTH_OPAQUE_ACCESS; + mount_unlock(mp); +} + +void +vfs_clearauthopaque(mount_t mp) +{ + mount_lock(mp); + mp->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE; + mount_unlock(mp); +} + +void +vfs_clearauthopaqueaccess(mount_t mp) +{ + mount_lock(mp); + mp->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE_ACCESS; + mount_unlock(mp); +} + +void +vfs_setextendedsecurity(mount_t mp) +{ + mount_lock(mp); + mp->mnt_kern_flag |= MNTK_EXTENDED_SECURITY; + mount_unlock(mp); +} + +void +vfs_clearextendedsecurity(mount_t mp) +{ + mount_lock(mp); + mp->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY; + mount_unlock(mp); +} + +int +vfs_extendedsecurity(mount_t mp) +{ + return(mp->mnt_kern_flag & MNTK_EXTENDED_SECURITY); +} + +/* returns the max size of short symlink in this mount_t */ +uint32_t +vfs_maxsymlen(mount_t mp) +{ + return(mp->mnt_maxsymlinklen); +} + +/* set max size of short symlink on mount_t */ +void +vfs_setmaxsymlen(mount_t mp, uint32_t symlen) +{ + mp->mnt_maxsymlinklen = symlen; +} + +/* return a pointer to the RO vfs_statfs associated with mount_t */ +struct vfsstatfs * +vfs_statfs(mount_t mp) +{ + return(&mp->mnt_vfsstat); +} + +int +vfs_getattr(mount_t mp, struct vfs_attr *vfa, vfs_context_t ctx) +{ + int error; + char *vname; + + if ((error = VFS_GETATTR(mp, vfa, ctx)) != 0) + return(error); + + /* + * If we have a filesystem create time, use it to default some others. + */ + if (VFSATTR_IS_SUPPORTED(vfa, f_create_time)) { + if (VFSATTR_IS_ACTIVE(vfa, f_modify_time) && !VFSATTR_IS_SUPPORTED(vfa, f_modify_time)) + VFSATTR_RETURN(vfa, f_modify_time, vfa->f_create_time); + } + + return(0); +} + +int +vfs_setattr(mount_t mp, struct vfs_attr *vfa, vfs_context_t ctx) +{ + int error; + + if (vfs_isrdonly(mp)) + return EROFS; + + error = VFS_SETATTR(mp, vfa, ctx); + + /* + * If we had alternate ways of setting vfs attributes, we'd + * fall back here. 
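vfs_getattr() above shows the defaulting idiom: when the filesystem reported one attribute but could not supply a related requested one, the wrapper synthesizes the latter and marks it returned, so callers cannot tell the difference. The same shape would extend to other fields; the f_backup_time case below is an illustration of the pattern, not something this patch does:

    if (VFSATTR_IS_SUPPORTED(vfa, f_create_time) &&
        VFSATTR_IS_ACTIVE(vfa, f_backup_time) &&
        !VFSATTR_IS_SUPPORTED(vfa, f_backup_time))
            VFSATTR_RETURN(vfa, f_backup_time, vfa->f_create_time);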
+
+/* return the private data handle stored in mount_t */
+void *
+vfs_fsprivate(mount_t mp)
+{
+	return(mp->mnt_data);
+}
+
+/* set the private data handle in mount_t */
+void
+vfs_setfsprivate(mount_t mp, void *mntdata)
+{
+	mp->mnt_data = mntdata;
+}
+
+
+/*
+ * return the block size of the underlying
+ * device associated with mount_t
+ */
+int
+vfs_devblocksize(mount_t mp)
+{
+	return(mp->mnt_devblocksize);
+}
+
+
+/*
+ * return the io attributes associated with mount_t
+ */
+void
+vfs_ioattr(mount_t mp, struct vfsioattr *ioattrp)
+{
+	if (mp == NULL) {
+		ioattrp->io_maxreadcnt  = MAXPHYS;
+		ioattrp->io_maxwritecnt = MAXPHYS;
+		ioattrp->io_segreadcnt  = 32;
+		ioattrp->io_segwritecnt = 32;
+		ioattrp->io_maxsegreadsize = MAXPHYS;
+		ioattrp->io_maxsegwritesize = MAXPHYS;
+		ioattrp->io_devblocksize = DEV_BSIZE;
+	} else {
+		ioattrp->io_maxreadcnt  = mp->mnt_maxreadcnt;
+		ioattrp->io_maxwritecnt = mp->mnt_maxwritecnt;
+		ioattrp->io_segreadcnt  = mp->mnt_segreadcnt;
+		ioattrp->io_segwritecnt = mp->mnt_segwritecnt;
+		ioattrp->io_maxsegreadsize = mp->mnt_maxsegreadsize;
+		ioattrp->io_maxsegwritesize = mp->mnt_maxsegwritesize;
+		ioattrp->io_devblocksize = mp->mnt_devblocksize;
+	}
+	ioattrp->io_reserved[0] = 0;
+	ioattrp->io_reserved[1] = 0;
+	ioattrp->io_reserved[2] = 0;
+}
+
+
+/*
+ * set the IO attributes associated with mount_t
+ */
+void
+vfs_setioattr(mount_t mp, struct vfsioattr * ioattrp)
+{
+	if (mp == NULL)
+		return;
+	mp->mnt_maxreadcnt  = ioattrp->io_maxreadcnt;
+	mp->mnt_maxwritecnt = ioattrp->io_maxwritecnt;
+	mp->mnt_segreadcnt  = ioattrp->io_segreadcnt;
+	mp->mnt_segwritecnt = ioattrp->io_segwritecnt;
+	mp->mnt_maxsegreadsize = ioattrp->io_maxsegreadsize;
+	mp->mnt_maxsegwritesize = ioattrp->io_maxsegwritesize;
+	mp->mnt_devblocksize = ioattrp->io_devblocksize;
+}
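+
+/*
+ * Illustrative sketch (not part of this change): how a filesystem might
+ * register itself with vfs_fsadd() below.  All "example_*" names are
+ * hypothetical; a real filesystem supplies its own vfsops and
+ * vnodeopv_desc tables.
+ */
+#if 0
+extern struct vfsops example_vfsops;			/* hypothetical */
+extern struct vnodeopv_desc example_vnodeop_opv_desc;	/* hypothetical */
+
+static struct vnodeopv_desc *example_opvdescs[] = {
+	&example_vnodeop_opv_desc
+};
+
+static errno_t
+example_register(vfstable_t *handlep)
+{
+	struct vfs_fsentry vfe;
+
+	bzero(&vfe, sizeof(vfe));
+	vfe.vfe_vfsops = &example_vfsops;
+	vfe.vfe_vopcnt = 1;			/* one op vector above */
+	vfe.vfe_opvdescs = example_opvdescs;
+	strncpy(vfe.vfe_fsname, "example", MFSNAMELEN);
+	/* thread-safe, 64-bit ready, and let the kernel pick a type number */
+	vfe.vfe_flags = VFS_TBLTHREADSAFE | VFS_TBL64BITREADY | VFS_TBLNOTYPENUM;
+
+	/* the returned handle is later passed to vfs_fsremove() */
+	return (vfs_fsadd(&vfe, handlep));
+}
+#endif /* 0 */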
+
+/*
+ * Add a new filesystem into the kernel, as specified by the passed-in
+ * vfstable structure.  It fills in the vnode dispatch vector that is
+ * to be used when vnodes are created.  It returns a handle which is
+ * to be used when the FS is removed.
+ */
+typedef int (*PFI)(void *);
+extern int vfs_opv_numops;
+errno_t
+vfs_fsadd(struct vfs_fsentry *vfe, vfstable_t * handle)
+{
+	struct vfstable *newvfstbl = NULL;
+	int	i,j;
+	int	(***opv_desc_vector_p)(void *);
+	int	(**opv_desc_vector)(void *);
+	struct vnodeopv_entry_desc *opve_descp;
+	int desccount;
+	int descsize;
+	PFI *descptr;
+
+	/*
+	 * This routine is responsible for all the initialization that would
+	 * ordinarily be done as part of the system startup.
+	 */
+
+	if (vfe == (struct vfs_fsentry *)0)
+		return(EINVAL);
+
+	desccount = vfe->vfe_vopcnt;
+	if ((desccount <= 0) || (desccount > 5) || (vfe->vfe_vfsops == (struct vfsops *)NULL)
+		|| (vfe->vfe_opvdescs == (struct vnodeopv_desc **)NULL))
+		return(EINVAL);
+
+
+	MALLOC(newvfstbl, void *, sizeof(struct vfstable), M_TEMP,
+	       M_WAITOK);
+	bzero(newvfstbl, sizeof(struct vfstable));
+	newvfstbl->vfc_vfsops = vfe->vfe_vfsops;
+	strncpy(&newvfstbl->vfc_name[0], vfe->vfe_fsname, MFSNAMELEN);
+	if ((vfe->vfe_flags & VFS_TBLNOTYPENUM))
+		newvfstbl->vfc_typenum = maxvfsconf++;
+	else
+		newvfstbl->vfc_typenum = vfe->vfe_fstypenum;
+
+	newvfstbl->vfc_refcount = 0;
+	newvfstbl->vfc_flags = 0;
+	newvfstbl->vfc_mountroot = NULL;
+	newvfstbl->vfc_next = NULL;
+	newvfstbl->vfc_threadsafe = 0;
+	newvfstbl->vfc_vfsflags = 0;
+	if (vfe->vfe_flags & VFS_TBL64BITREADY)
+		newvfstbl->vfc_64bitready = 1;
+	if (vfe->vfe_flags & VFS_TBLTHREADSAFE)
+		newvfstbl->vfc_threadsafe = 1;
+	if (vfe->vfe_flags & VFS_TBLFSNODELOCK)
+		newvfstbl->vfc_threadsafe = 1;
+	if ((vfe->vfe_flags & VFS_TBLLOCALVOL) == VFS_TBLLOCALVOL)
+		newvfstbl->vfc_flags |= MNT_LOCAL;
+	if (vfe->vfe_flags & VFS_TBLLOCALVOL)
+		newvfstbl->vfc_vfsflags |= VFC_VFSLOCALARGS;
+	else
+		newvfstbl->vfc_vfsflags |= VFC_VFSGENERICARGS;
+
+
+	/*
+	 * Allocate and init the vectors.
+	 * Also handle backwards compatibility.
+	 *
+	 * We allocate one large block to hold all <desccount>
+	 * vnode operation vectors stored contiguously.
+	 */
+	/* XXX - shouldn't be M_TEMP */
+
+	descsize = desccount * vfs_opv_numops * sizeof(PFI);
+	MALLOC(descptr, PFI *, descsize,
+	       M_TEMP, M_WAITOK);
+	bzero(descptr, descsize);
+
+	newvfstbl->vfc_descptr = descptr;
+	newvfstbl->vfc_descsize = descsize;
+
+
+	for (i = 0; i < desccount; i++ ) {
+		opv_desc_vector_p = vfe->vfe_opvdescs[i]->opv_desc_vector_p;
+		/*
+		 * Fill in the caller's pointer to the start of the i'th vector.
+		 * They'll need to supply it when calling vnode_create.
+		 */
+		opv_desc_vector = descptr + i * vfs_opv_numops;
+		*opv_desc_vector_p = opv_desc_vector;
+
+		for (j = 0; vfe->vfe_opvdescs[i]->opv_desc_ops[j].opve_op; j++) {
+			opve_descp = &(vfe->vfe_opvdescs[i]->opv_desc_ops[j]);
+
+			/*
+			 * Sanity check:  is this operation listed
+			 * in the list of operations?  We check this
+			 * by seeing if its offset is zero.  Since
+			 * the default routine should always be listed
+			 * first, it should be the only one with a zero
+			 * offset.  Any other operation with a zero
+			 * offset is probably not listed in
+			 * vfs_op_descs, and so is probably an error.
+			 *
+			 * A panic here means the layer programmer
+			 * has committed the all-too-common bug
+			 * of adding a new operation to the layer's
+			 * list of vnode operations but
+			 * not adding the operation to the system-wide
+			 * list of supported operations.
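+			 * (the default routine is registered under
+			 * vnop_default, and every vector is backfilled
+			 * with it further below)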
+ */ + if (opve_descp->opve_op->vdesc_offset == 0 && + opve_descp->opve_op->vdesc_offset != VOFFSET(vnop_default)) { + printf("vfs_fsadd: operation %s not listed in %s.\n", + opve_descp->opve_op->vdesc_name, + "vfs_op_descs"); + panic("vfs_fsadd: bad operation"); + } + /* + * Fill in this entry. + */ + opv_desc_vector[opve_descp->opve_op->vdesc_offset] = + opve_descp->opve_impl; + } + + + /* + * Finally, go back and replace unfilled routines + * with their default. (Sigh, an O(n^3) algorithm. I + * could make it better, but that'd be work, and n is small.) + */ + opv_desc_vector_p = vfe->vfe_opvdescs[i]->opv_desc_vector_p; + + /* + * Force every operations vector to have a default routine. + */ + opv_desc_vector = *opv_desc_vector_p; + if (opv_desc_vector[VOFFSET(vnop_default)] == NULL) + panic("vfs_fsadd: operation vector without default routine."); + for (j = 0; j < vfs_opv_numops; j++) + if (opv_desc_vector[j] == NULL) + opv_desc_vector[j] = + opv_desc_vector[VOFFSET(vnop_default)]; + + } /* end of each vnodeopv_desc parsing */ + + + + *handle = vfstable_add(newvfstbl); + + if (newvfstbl->vfc_typenum <= maxvfsconf ) + maxvfsconf = newvfstbl->vfc_typenum + 1; + numused_vfsslots++; + + if (newvfstbl->vfc_vfsops->vfs_init) + (*newvfstbl->vfc_vfsops->vfs_init)((struct vfsconf *)handle); + + FREE(newvfstbl, M_TEMP); + + return(0); +} + +/* + * Removes the filesystem from kernel. + * The argument passed in is the handle that was given when + * file system was added + */ +errno_t +vfs_fsremove(vfstable_t handle) +{ + struct vfstable * vfstbl = (struct vfstable *)handle; + void *old_desc = NULL; + errno_t err; + + /* Preflight check for any mounts */ + mount_list_lock(); + if ( vfstbl->vfc_refcount != 0 ) { + mount_list_unlock(); + return EBUSY; + } + mount_list_unlock(); + + /* + * save the old descriptor; the free cannot occur unconditionally, + * since vfstable_del() may fail. + */ + if (vfstbl->vfc_descptr && vfstbl->vfc_descsize) { + old_desc = vfstbl->vfc_descptr; + } + err = vfstable_del(vfstbl); + + /* free the descriptor if the delete was successful */ + if (err == 0 && old_desc) { + FREE(old_desc, M_TEMP); + } + + return(err); +} + +/* + * This returns a reference to mount_t + * which should be dropped using vfs_mountrele(). + * Not doing so will leak a mountpoint + * and associated data structures. 
+ */
+errno_t
+vfs_mountref(__unused mount_t mp ) /* gives a reference */
+{
+	return(0);
+}
+
+/* This drops the reference on mount_t that was acquired */
+errno_t
+vfs_mountrele(__unused mount_t mp ) /* drops reference */
+{
+	return(0);
+}
+
+int
+vfs_context_pid(vfs_context_t context)
+{
+	return (context->vc_proc->p_pid);
+}
+
+int
+vfs_context_suser(vfs_context_t context)
+{
+	return (suser(context->vc_ucred, 0));
+}
+
+int
+vfs_context_issignal(vfs_context_t context, sigset_t mask)
+{
+	if (context->vc_proc)
+		return(proc_pendingsignals(context->vc_proc, mask));
+	return(0);
+}
+
+int
+vfs_context_is64bit(vfs_context_t context)
+{
+	if (context->vc_proc)
+		return(proc_is64bit(context->vc_proc));
+	return(0);
+}
+
+proc_t
+vfs_context_proc(vfs_context_t context)
+{
+	return (context->vc_proc);
+}
+
+vfs_context_t
+vfs_context_create(vfs_context_t context)
+{
+	struct vfs_context * newcontext;
+
+	newcontext = (struct vfs_context *)kalloc(sizeof(struct vfs_context));
+
+	if (newcontext) {
+		if (context) {
+			newcontext->vc_proc = context->vc_proc;
+			newcontext->vc_ucred = context->vc_ucred;
+		} else {
+			newcontext->vc_proc = proc_self();
+			newcontext->vc_ucred = kauth_cred_get();
+		}
+		return(newcontext);
+	}
+	return((vfs_context_t)0);
+}
+
+int
+vfs_context_rele(vfs_context_t context)
+{
+	if (context)
+		kfree(context, sizeof(struct vfs_context));
+	return(0);
+}
+
+
+ucred_t
+vfs_context_ucred(vfs_context_t context)
+{
+	return (context->vc_ucred);
+}
+
+/*
+ * Return true if the context is owned by the superuser.
+ */
+int
+vfs_context_issuser(vfs_context_t context)
+{
+	return(context->vc_ucred->cr_uid == 0);
+}
+
+
+/* XXXXXXXXXXXXXX VNODE KAPIS XXXXXXXXXXXXXXXXXXXXXXXXX */
+
+
+/*
+ * Convert between vnode types and inode formats (since POSIX.1
+ * defines mode word of stat structure in terms of inode formats).
+ */
+enum vtype
+vnode_iftovt(int mode)
+{
+	return(iftovt_tab[((mode) & S_IFMT) >> 12]);
+}
+
+int
+vnode_vttoif(enum vtype indx)
+{
+	return(vttoif_tab[(int)(indx)]);
+}
+
+int
+vnode_makeimode(int indx, int mode)
+{
+	return (int)(VTTOIF(indx) | (mode));
+}
+
+
+/*
+ * vnode manipulation functions.
+ */
+
+/* returns system root vnode reference; it should be dropped using vnode_put() */
+vnode_t
+vfs_rootvnode(void)
+{
+	int error;
+
+	error = vnode_get(rootvnode);
+	if (error)
+		return ((vnode_t)0);
+	else
+		return rootvnode;
+}
+
+
+uint32_t
+vnode_vid(vnode_t vp)
+{
+	return ((uint32_t)(vp->v_id));
+}
+
+/* returns a mount reference; drop it with vfs_mountrele() */
+mount_t
+vnode_mount(vnode_t vp)
+{
+	return (vp->v_mount);
+}
+
+/* returns a mount reference iff vnode_t is a dir and is a mount point */
+mount_t
+vnode_mountedhere(vnode_t vp)
+{
+	mount_t mp;
+
+	if ((vp->v_type == VDIR) && ((mp = vp->v_mountedhere) != NULL) &&
+	    (mp->mnt_vnodecovered == vp))
+		return (mp);
+	else
+		return (mount_t)NULL;
+}
+
+/* returns vnode type of vnode_t */
+enum vtype
+vnode_vtype(vnode_t vp)
+{
+	return (vp->v_type);
+}
+
+/* returns FS specific node saved in vnode */
+void *
+vnode_fsnode(vnode_t vp)
+{
+	return (vp->v_data);
+}
+
+void
+vnode_clearfsnode(vnode_t vp)
+{
+	vp->v_data = 0;
+}
+
+dev_t
+vnode_specrdev(vnode_t vp)
+{
+	return(vp->v_rdev);
+}
+
+
+/* Accessor functions */
+/* is vnode_t a root vnode */
+int
+vnode_isvroot(vnode_t vp)
+{
+	return ((vp->v_flag & VROOT)? 1 : 0);
+}
+
+/* is vnode_t a system vnode */
+int
+vnode_issystem(vnode_t vp)
+{
+	return ((vp->v_flag & VSYSTEM)? 1 : 0);
+}
+
+/* is a mount operation in progress on vnode_t */
+int
+vnode_ismount(vnode_t vp)
+{
+	return ((vp->v_flag & VMOUNT)? 1 : 0);
+}
+
+/* is this vnode under recycle now */
+int
+vnode_isrecycled(vnode_t vp)
+{
+	int ret;
+
+	vnode_lock(vp);
+	ret = (vp->v_lflag & (VL_TERMINATE|VL_DEAD))? 1 : 0;
+	vnode_unlock(vp);
+	return(ret);
+}
+
+/* is vnode_t marked to not keep data cached once it's been consumed */
+int
+vnode_isnocache(vnode_t vp)
+{
+	return ((vp->v_flag & VNOCACHE_DATA)? 1 : 0);
+}
+
+/*
+ * has sequential readahead been disabled on this vnode
+ */
+int
+vnode_isnoreadahead(vnode_t vp)
+{
+	return ((vp->v_flag & VRAOFF)? 1 : 0);
+}
+
+/* is vnode_t a standard one? */
+int
+vnode_isstandard(vnode_t vp)
+{
+	return ((vp->v_flag & VSTANDARD)? 1 : 0);
+}
+
+/* don't vflush() if SKIPSYSTEM */
+int
+vnode_isnoflush(vnode_t vp)
+{
+	return ((vp->v_flag & VNOFLUSH)? 1 : 0);
+}
+
+/* is vnode_t a regular file */
+int
+vnode_isreg(vnode_t vp)
+{
+	return ((vp->v_type == VREG)? 1 : 0);
+}
+
+/* is vnode_t a directory? */
+int
+vnode_isdir(vnode_t vp)
+{
+	return ((vp->v_type == VDIR)? 1 : 0);
+}
+
+/* is vnode_t a symbolic link ? */
+int
+vnode_islnk(vnode_t vp)
+{
+	return ((vp->v_type == VLNK)? 1 : 0);
+}
+
+/* is vnode_t a fifo ? */
+int
+vnode_isfifo(vnode_t vp)
+{
+	return ((vp->v_type == VFIFO)? 1 : 0);
+}
+
+/* is vnode_t a block device? */
+int
+vnode_isblk(vnode_t vp)
+{
+	return ((vp->v_type == VBLK)? 1 : 0);
+}
+
+/* is vnode_t a char device? */
+int
+vnode_ischr(vnode_t vp)
+{
+	return ((vp->v_type == VCHR)? 1 : 0);
+}
+
+/* is vnode_t a socket? */
+int
+vnode_issock(vnode_t vp)
+{
+	return ((vp->v_type == VSOCK)? 1 : 0);
+}
+
+
+/* TBD: set vnode_t to not cache data after it is consumed once; used for quota */
+void
+vnode_setnocache(vnode_t vp)
+{
+	vnode_lock(vp);
+	vp->v_flag |= VNOCACHE_DATA;
+	vnode_unlock(vp);
+}
+
+void
+vnode_clearnocache(vnode_t vp)
+{
+	vnode_lock(vp);
+	vp->v_flag &= ~VNOCACHE_DATA;
+	vnode_unlock(vp);
+}
+
+void
+vnode_setnoreadahead(vnode_t vp)
+{
+	vnode_lock(vp);
+	vp->v_flag |= VRAOFF;
+	vnode_unlock(vp);
+}
+
+void
+vnode_clearnoreadahead(vnode_t vp)
+{
+	vnode_lock(vp);
+	vp->v_flag &= ~VRAOFF;
+	vnode_unlock(vp);
+}
+
+
+/* mark vnode_t to skip vflush() if SKIPSYSTEM */
+void
+vnode_setnoflush(vnode_t vp)
+{
+	vnode_lock(vp);
+	vp->v_flag |= VNOFLUSH;
+	vnode_unlock(vp);
+}
+
+void
+vnode_clearnoflush(vnode_t vp)
+{
+	vnode_lock(vp);
+	vp->v_flag &= ~VNOFLUSH;
+	vnode_unlock(vp);
+}
+
+
+/* is vnode_t a blkdevice and has a FS mounted on it */
+int
+vnode_ismountedon(vnode_t vp)
+{
+	return ((vp->v_specflags & SI_MOUNTEDON)? 1 : 0);
+}
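+
+/*
+ * Illustrative sketch (not part of this change): a caller validating a
+ * candidate mount device might combine the accessors above, e.g.:
+ */
+#if 0
+static int
+example_check_mountdev(vnode_t vp)
+{
+	if (!vnode_isblk(vp))		/* must be a block device */
+		return (ENOTBLK);
+	if (vnode_ismountedon(vp))	/* and not already in use */
+		return (EBUSY);
+	return (0);
+}
+#endif /* 0 */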
+
+void
+vnode_setmountedon(vnode_t vp)
+{
+	vnode_lock(vp);
+	vp->v_specflags |= SI_MOUNTEDON;
+	vnode_unlock(vp);
+}
+
+void
+vnode_clearmountedon(vnode_t vp)
+{
+	vnode_lock(vp);
+	vp->v_specflags &= ~SI_MOUNTEDON;
+	vnode_unlock(vp);
+}
+
+
+void
+vnode_settag(vnode_t vp, int tag)
+{
+	vp->v_tag = tag;
+}
+
+int
+vnode_tag(vnode_t vp)
+{
+	return(vp->v_tag);
+}
+
+vnode_t
+vnode_parent(vnode_t vp)
+{
+	return(vp->v_parent);
+}
+
+void
+vnode_setparent(vnode_t vp, vnode_t dvp)
+{
+	vp->v_parent = dvp;
+}
+
+char *
+vnode_name(vnode_t vp)
+{
+	/* we try to keep v_name a reasonable name for the node */
+	return(vp->v_name);
+}
+
+void
+vnode_setname(vnode_t vp, char * name)
+{
+	vp->v_name = name;
+}
+
+/* return the FS name registered when the FS was added to the kernel */
+void
+vnode_vfsname(vnode_t vp, char * buf)
+{
+	strncpy(buf, vp->v_mount->mnt_vtable->vfc_name, MFSNAMELEN);
+}
+
+/* return the FS type number */
+int
+vnode_vfstypenum(vnode_t vp)
+{
+	return(vp->v_mount->mnt_vtable->vfc_typenum);
+}
+
+int
+vnode_vfs64bitready(vnode_t vp)
+{
+	if ((vp->v_mount->mnt_vtable->vfc_64bitready))
+		return(1);
+	else
+		return(0);
+}
+
+
+
+/* return the visible flags on associated mount point of vnode_t */
+uint32_t
+vnode_vfsvisflags(vnode_t vp)
+{
+	return(vp->v_mount->mnt_flag & MNT_VISFLAGMASK);
+}
+
+/* return the command modifier flags on associated mount point of vnode_t */
+uint32_t
+vnode_vfscmdflags(vnode_t vp)
+{
+	return(vp->v_mount->mnt_flag & MNT_CMDFLAGS);
+}
+
+/* return the max size of short symlink on associated mount point of vnode_t */
+uint32_t
+vnode_vfsmaxsymlen(vnode_t vp)
+{
+	return(vp->v_mount->mnt_maxsymlinklen);
+}
+
+/* return a pointer to the RO vfs_statfs associated with vnode_t's mount point */
+struct vfsstatfs *
+vnode_vfsstatfs(vnode_t vp)
+{
+	return(&vp->v_mount->mnt_vfsstat);
+}
+
+/* return a handle to the FSs specific private handle associated with vnode_t's mount point */
+void *
+vnode_vfsfsprivate(vnode_t vp)
+{
+	return(vp->v_mount->mnt_data);
+}
+
+/* is vnode_t in a read-only mounted FS */
+int
+vnode_vfsisrdonly(vnode_t vp)
+{
+	return ((vp->v_mount->mnt_flag & MNT_RDONLY)? 1 : 0);
+}
+
+
+/* returns vnode ref to current working directory */
+vnode_t
+current_workingdir(void)
+{
+	struct proc *p = current_proc();
+	struct vnode * vp ;
+
+	if ( (vp = p->p_fd->fd_cdir) ) {
+		if ( (vnode_getwithref(vp)) )
+			return (NULL);
+	}
+	return vp;
+}
+
+/* returns vnode ref to current root(chroot) directory */
+vnode_t
+current_rootdir(void)
+{
+	struct proc *p = current_proc();
+	struct vnode * vp ;
+
+	if ( (vp = p->p_fd->fd_rdir) ) {
+		if ( (vnode_getwithref(vp)) )
+			return (NULL);
+	}
+	return vp;
+}
+
+static int
+vnode_get_filesec(vnode_t vp, kauth_filesec_t *fsecp, vfs_context_t ctx)
+{
+	kauth_filesec_t fsec;
+	uio_t	fsec_uio;
+	size_t	fsec_size;
+	size_t	xsize, rsize;
+	int	error;
+
+	fsec = NULL;
+	fsec_uio = NULL;
+	error = 0;
+
+	/* find out how big the EA is */
+	if ((error = vn_getxattr(vp, KAUTH_FILESEC_XATTR, NULL, &xsize, XATTR_NOSECURITY, ctx)) != 0) {
+		/* no EA, no filesec */
+		if ((error == ENOATTR) || (error == ENOENT) || (error == EJUSTRETURN))
+			error = 0;
+		/* either way, we are done */
+		goto out;
+	}
+
+	/* how many entries would fit?
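+	 * (xsize is the attribute size in bytes; KAUTH_FILESEC_COUNT is
+	 * assumed to convert that into the ACL entry count used for the
+	 * allocation below)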
*/ + fsec_size = KAUTH_FILESEC_COUNT(xsize); + + /* get buffer and uio */ + if (((fsec = kauth_filesec_alloc(fsec_size)) == NULL) || + ((fsec_uio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ)) == NULL) || + uio_addiov(fsec_uio, CAST_USER_ADDR_T(fsec), xsize)) { + KAUTH_DEBUG(" ERROR - could not allocate iov to read ACL"); + error = ENOMEM; + goto out; + } + + /* read security attribute */ + rsize = xsize; + if ((error = vn_getxattr(vp, + KAUTH_FILESEC_XATTR, + fsec_uio, + &rsize, + XATTR_NOSECURITY, + ctx)) != 0) { + + /* no attribute - no security data */ + if ((error == ENOATTR) || (error == ENOENT) || (error == EJUSTRETURN)) + error = 0; + /* either way, we are done */ + goto out; + } + + /* + * Validate security structure. If it's corrupt, we will + * just ignore it. + */ + if (rsize < KAUTH_FILESEC_SIZE(0)) { + KAUTH_DEBUG("ACL - DATA TOO SMALL (%d)", rsize); + goto out; + } + if (fsec->fsec_magic != KAUTH_FILESEC_MAGIC) { + KAUTH_DEBUG("ACL - BAD MAGIC %x", fsec->fsec_magic); + goto out; + } + if ((fsec->fsec_acl.acl_entrycount != KAUTH_FILESEC_NOACL) && + (fsec->fsec_acl.acl_entrycount > KAUTH_ACL_MAX_ENTRIES)) { + KAUTH_DEBUG("ACL - BAD ENTRYCOUNT %x", fsec->fsec_entrycount); + goto out; + } + if ((fsec->fsec_acl.acl_entrycount != KAUTH_FILESEC_NOACL) && + (KAUTH_FILESEC_SIZE(fsec->fsec_acl.acl_entrycount) > rsize)) { + KAUTH_DEBUG("ACL - BUFFER OVERFLOW (%d entries too big for %d)", fsec->fsec_acl.acl_entrycount, rsize); + goto out; + } + + *fsecp = fsec; + fsec = NULL; + error = 0; +out: + if (fsec != NULL) + kauth_filesec_free(fsec); + if (fsec_uio != NULL) + uio_free(fsec_uio); + if (error) + *fsecp = NULL; + return(error); +} + +static int +vnode_set_filesec(vnode_t vp, kauth_filesec_t fsec, kauth_acl_t acl, vfs_context_t ctx) +{ + uio_t fsec_uio; + int error; + + fsec_uio = NULL; + + if ((fsec_uio = uio_create(2, 0, UIO_SYSSPACE, UIO_WRITE)) == NULL) { + KAUTH_DEBUG(" ERROR - could not allocate iov to write ACL"); + error = ENOMEM; + goto out; + } + uio_addiov(fsec_uio, CAST_USER_ADDR_T(fsec), sizeof(struct kauth_filesec) - sizeof(struct kauth_acl)); + uio_addiov(fsec_uio, CAST_USER_ADDR_T(acl), KAUTH_ACL_COPYSIZE(acl)); + error = vn_setxattr(vp, + KAUTH_FILESEC_XATTR, + fsec_uio, + XATTR_NOSECURITY, /* we have auth'ed already */ + ctx); + VFS_DEBUG(ctx, vp, "SETATTR - set ACL returning %d", error); + +out: + if (fsec_uio != NULL) + uio_free(fsec_uio); + return(error); +} + + +int +vnode_getattr(vnode_t vp, struct vnode_attr *vap, vfs_context_t ctx) +{ + kauth_filesec_t fsec; + kauth_acl_t facl; + int error; + uid_t nuid; + gid_t ngid; + + /* don't ask for extended security data if the filesystem doesn't support it */ + if (!vfs_extendedsecurity(vnode_mount(vp))) { + VATTR_CLEAR_ACTIVE(vap, va_acl); + VATTR_CLEAR_ACTIVE(vap, va_uuuid); + VATTR_CLEAR_ACTIVE(vap, va_guuid); + } + + /* + * If the caller wants size values we might have to synthesise, give the + * filesystem the opportunity to supply better intermediate results. + */ + if (VATTR_IS_ACTIVE(vap, va_data_alloc) || + VATTR_IS_ACTIVE(vap, va_total_size) || + VATTR_IS_ACTIVE(vap, va_total_alloc)) { + VATTR_SET_ACTIVE(vap, va_data_size); + VATTR_SET_ACTIVE(vap, va_data_alloc); + VATTR_SET_ACTIVE(vap, va_total_size); + VATTR_SET_ACTIVE(vap, va_total_alloc); + } + + error = VNOP_GETATTR(vp, vap, ctx); + if (error) { + KAUTH_DEBUG("ERROR - returning %d", error); + goto out; + } + + /* + * If extended security data was requested but not returned, try the fallback + * path. 
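+	 * (i.e. read the filesec extended attribute via vnode_get_filesec()
+	 * and synthesize va_acl/va_uuuid/va_guuid from its contents)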
+ */ + if (VATTR_NOT_RETURNED(vap, va_acl) || VATTR_NOT_RETURNED(vap, va_uuuid) || VATTR_NOT_RETURNED(vap, va_guuid)) { + fsec = NULL; + + if ((vp->v_type == VDIR) || (vp->v_type == VLNK) || (vp->v_type == VREG)) { + /* try to get the filesec */ + if ((error = vnode_get_filesec(vp, &fsec, ctx)) != 0) + goto out; + } + /* if no filesec, no attributes */ + if (fsec == NULL) { + VATTR_RETURN(vap, va_acl, NULL); + VATTR_RETURN(vap, va_uuuid, kauth_null_guid); + VATTR_RETURN(vap, va_guuid, kauth_null_guid); + } else { + + /* looks good, try to return what we were asked for */ + VATTR_RETURN(vap, va_uuuid, fsec->fsec_owner); + VATTR_RETURN(vap, va_guuid, fsec->fsec_group); + + /* only return the ACL if we were actually asked for it */ + if (VATTR_IS_ACTIVE(vap, va_acl)) { + if (fsec->fsec_acl.acl_entrycount == KAUTH_FILESEC_NOACL) { + VATTR_RETURN(vap, va_acl, NULL); + } else { + facl = kauth_acl_alloc(fsec->fsec_acl.acl_entrycount); + if (facl == NULL) { + kauth_filesec_free(fsec); + error = ENOMEM; + goto out; + } + bcopy(&fsec->fsec_acl, facl, KAUTH_ACL_COPYSIZE(&fsec->fsec_acl)); + VATTR_RETURN(vap, va_acl, facl); + } + } + kauth_filesec_free(fsec); + } + } + /* + * If someone gave us an unsolicited filesec, toss it. We promise that + * we're OK with a filesystem giving us anything back, but our callers + * only expect what they asked for. + */ + if (VATTR_IS_SUPPORTED(vap, va_acl) && !VATTR_IS_ACTIVE(vap, va_acl)) { + if (vap->va_acl != NULL) + kauth_acl_free(vap->va_acl); + VATTR_CLEAR_SUPPORTED(vap, va_acl); + } + +#if 0 /* enable when we have a filesystem only supporting UUIDs */ + /* + * Handle the case where we need a UID/GID, but only have extended + * security information. + */ + if (VATTR_NOT_RETURNED(vap, va_uid) && + VATTR_IS_SUPPORTED(vap, va_uuuid) && + !kauth_guid_equal(&vap->va_uuuid, &kauth_null_guid)) { + if ((error = kauth_cred_guid2uid(&vap->va_uuuid, &nuid)) == 0) + VATTR_RETURN(vap, va_uid, nuid); + } + if (VATTR_NOT_RETURNED(vap, va_gid) && + VATTR_IS_SUPPORTED(vap, va_guuid) && + !kauth_guid_equal(&vap->va_guuid, &kauth_null_guid)) { + if ((error = kauth_cred_guid2gid(&vap->va_guuid, &ngid)) == 0) + VATTR_RETURN(vap, va_gid, ngid); + } +#endif + + /* + * Handle uid/gid == 99 and MNT_IGNORE_OWNERSHIP here. + */ + if (VATTR_IS_ACTIVE(vap, va_uid)) { + if (vp->v_mount->mnt_flag & MNT_IGNORE_OWNERSHIP) { + nuid = vp->v_mount->mnt_fsowner; + if (nuid == KAUTH_UID_NONE) + nuid = 99; + } else if (VATTR_IS_SUPPORTED(vap, va_uid)) { + nuid = vap->va_uid; + } else { + /* this will always be something sensible */ + nuid = vp->v_mount->mnt_fsowner; + } + if ((nuid == 99) && !vfs_context_issuser(ctx)) + nuid = kauth_cred_getuid(vfs_context_ucred(ctx)); + VATTR_RETURN(vap, va_uid, nuid); + } + if (VATTR_IS_ACTIVE(vap, va_gid)) { + if (vp->v_mount->mnt_flag & MNT_IGNORE_OWNERSHIP) { + ngid = vp->v_mount->mnt_fsgroup; + if (ngid == KAUTH_GID_NONE) + ngid = 99; + } else if (VATTR_IS_SUPPORTED(vap, va_gid)) { + ngid = vap->va_gid; + } else { + /* this will always be something sensible */ + ngid = vp->v_mount->mnt_fsgroup; + } + if ((ngid == 99) && !vfs_context_issuser(ctx)) + ngid = kauth_cred_getgid(vfs_context_ucred(ctx)); + VATTR_RETURN(vap, va_gid, ngid); + } + + /* + * Synthesise some values that can be reasonably guessed. 
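+	 * (each default below is applied only when the filesystem did not
+	 * itself return the attribute)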
+ */ + if (!VATTR_IS_SUPPORTED(vap, va_iosize)) + VATTR_RETURN(vap, va_iosize, vp->v_mount->mnt_vfsstat.f_iosize); + + if (!VATTR_IS_SUPPORTED(vap, va_flags)) + VATTR_RETURN(vap, va_flags, 0); + + if (!VATTR_IS_SUPPORTED(vap, va_filerev)) + VATTR_RETURN(vap, va_filerev, 0); + + if (!VATTR_IS_SUPPORTED(vap, va_gen)) + VATTR_RETURN(vap, va_gen, 0); + + /* + * Default sizes. Ordering here is important, as later defaults build on earlier ones. + */ + if (!VATTR_IS_SUPPORTED(vap, va_data_size)) + VATTR_RETURN(vap, va_data_size, 0); + + /* do we want any of the possibly-computed values? */ + if (VATTR_IS_ACTIVE(vap, va_data_alloc) || + VATTR_IS_ACTIVE(vap, va_total_size) || + VATTR_IS_ACTIVE(vap, va_total_alloc)) { + /* make sure f_bsize is valid */ + if (vp->v_mount->mnt_vfsstat.f_bsize == 0) { + if ((error = vfs_update_vfsstat(vp->v_mount, ctx)) != 0) + goto out; + } + + /* default va_data_alloc from va_data_size */ + if (!VATTR_IS_SUPPORTED(vap, va_data_alloc)) + VATTR_RETURN(vap, va_data_alloc, roundup(vap->va_data_size, vp->v_mount->mnt_vfsstat.f_bsize)); + + /* default va_total_size from va_data_size */ + if (!VATTR_IS_SUPPORTED(vap, va_total_size)) + VATTR_RETURN(vap, va_total_size, vap->va_data_size); + + /* default va_total_alloc from va_total_size which is guaranteed at this point */ + if (!VATTR_IS_SUPPORTED(vap, va_total_alloc)) + VATTR_RETURN(vap, va_total_alloc, roundup(vap->va_total_size, vp->v_mount->mnt_vfsstat.f_bsize)); + } + + /* + * If we don't have a change time, pull it from the modtime. + */ + if (!VATTR_IS_SUPPORTED(vap, va_change_time) && VATTR_IS_SUPPORTED(vap, va_modify_time)) + VATTR_RETURN(vap, va_change_time, vap->va_modify_time); + + /* + * This is really only supported for the creation VNOPs, but since the field is there + * we should populate it correctly. + */ + VATTR_RETURN(vap, va_type, vp->v_type); + + /* + * The fsid can be obtained from the mountpoint directly. + */ + VATTR_RETURN(vap, va_fsid, vp->v_mount->mnt_vfsstat.f_fsid.val[0]); + +out: + + return(error); +} + +int +vnode_setattr(vnode_t vp, struct vnode_attr *vap, vfs_context_t ctx) +{ + int error, is_ownership_change=0; + + /* + * Make sure the filesystem is mounted R/W. + * If not, return an error. + */ + if (vfs_isrdonly(vp->v_mount)) + return(EROFS); + + /* + * If ownership is being ignored on this volume, we silently discard + * ownership changes. + */ + if (vp->v_mount->mnt_flag & MNT_IGNORE_OWNERSHIP) { + VATTR_CLEAR_ACTIVE(vap, va_uid); + VATTR_CLEAR_ACTIVE(vap, va_gid); + } + + if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) { + is_ownership_change = 1; + } + + /* + * Make sure that extended security is enabled if we're going to try + * to set any. 
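+	 * (if the volume does not support it, a request to set va_acl,
+	 * va_uuuid or va_guuid fails with ENOTSUP below rather than being
+	 * silently dropped)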
+ */ + if (!vfs_extendedsecurity(vnode_mount(vp)) && + (VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid))) { + KAUTH_DEBUG("SETATTR - returning ENOTSUP to request to set extended security"); + return(ENOTSUP); + } + + error = VNOP_SETATTR(vp, vap, ctx); + + if ((error == 0) && !VATTR_ALL_SUPPORTED(vap)) + error = vnode_setattr_fallback(vp, vap, ctx); + + /* + * If we have changed any of the things about the file that are likely + * to result in changes to authorisation results, blow the vnode auth + * cache + */ + if (VATTR_IS_SUPPORTED(vap, va_mode) || + VATTR_IS_SUPPORTED(vap, va_uid) || + VATTR_IS_SUPPORTED(vap, va_gid) || + VATTR_IS_SUPPORTED(vap, va_flags) || + VATTR_IS_SUPPORTED(vap, va_acl) || + VATTR_IS_SUPPORTED(vap, va_uuuid) || + VATTR_IS_SUPPORTED(vap, va_guuid)) + vnode_uncache_credentials(vp); + // only send a stat_changed event if this is more than + // just an access time update + if (error == 0 && (vap->va_active != VNODE_ATTR_BIT(va_access_time))) { + if (need_fsevent(FSE_STAT_CHANGED, vp) || (is_ownership_change && need_fsevent(FSE_CHOWN, vp))) { + if (is_ownership_change == 0) + add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE); + else + add_fsevent(FSE_CHOWN, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE); + } + } + return(error); +} + +/* + * Following an operation which sets attributes (setattr, create, etc.) we may + * need to perform fallback operations to get attributes saved. + */ +int +vnode_setattr_fallback(vnode_t vp, struct vnode_attr *vap, vfs_context_t ctx) +{ + kauth_filesec_t fsec; + kauth_acl_t facl; + struct kauth_filesec lfsec; + int error; + + error = 0; + + /* + * Extended security fallback via extended attributes. + * + * Note that we do not free the filesec; the caller is expected to do this. + */ + if (VATTR_NOT_RETURNED(vap, va_acl) || + VATTR_NOT_RETURNED(vap, va_uuuid) || + VATTR_NOT_RETURNED(vap, va_guuid)) { + VFS_DEBUG(ctx, vp, "SETATTR - doing filesec fallback"); + + /* + * Fail for file types that we don't permit extended security to be set on. + */ + if ((vp->v_type != VDIR) && (vp->v_type != VLNK) && (vp->v_type != VREG)) { + VFS_DEBUG(ctx, vp, "SETATTR - Can't write ACL to file type %d", vnode_vtype(vp)); + error = EINVAL; + goto out; + } + + /* + * If we don't have all the extended security items, we need to fetch the existing + * data to perform a read-modify-write operation. + */ + fsec = NULL; + if (!VATTR_IS_ACTIVE(vap, va_acl) || + !VATTR_IS_ACTIVE(vap, va_uuuid) || + !VATTR_IS_ACTIVE(vap, va_guuid)) { + if ((error = vnode_get_filesec(vp, &fsec, ctx)) != 0) { + KAUTH_DEBUG("SETATTR - ERROR %d fetching filesec for update", error); + goto out; + } + } + /* if we didn't get a filesec, use our local one */ + if (fsec == NULL) { + KAUTH_DEBUG("SETATTR - using local filesec for new/full update"); + fsec = &lfsec; + } else { + KAUTH_DEBUG("SETATTR - updating existing filesec"); + } + /* find the ACL */ + facl = &fsec->fsec_acl; + + /* if we're using the local filesec, we need to initialise it */ + if (fsec == &lfsec) { + fsec->fsec_magic = KAUTH_FILESEC_MAGIC; + fsec->fsec_owner = kauth_null_guid; + fsec->fsec_group = kauth_null_guid; + facl->acl_entrycount = KAUTH_FILESEC_NOACL; + facl->acl_flags = 0; + } + + /* + * Update with the supplied attributes. 
+ */ + if (VATTR_IS_ACTIVE(vap, va_uuuid)) { + KAUTH_DEBUG("SETATTR - updating owner UUID"); + fsec->fsec_owner = vap->va_uuuid; + VATTR_SET_SUPPORTED(vap, va_uuuid); + } + if (VATTR_IS_ACTIVE(vap, va_guuid)) { + KAUTH_DEBUG("SETATTR - updating group UUID"); + fsec->fsec_group = vap->va_guuid; + VATTR_SET_SUPPORTED(vap, va_guuid); + } + if (VATTR_IS_ACTIVE(vap, va_acl)) { + if (vap->va_acl == NULL) { + KAUTH_DEBUG("SETATTR - removing ACL"); + facl->acl_entrycount = KAUTH_FILESEC_NOACL; + } else { + KAUTH_DEBUG("SETATTR - setting ACL with %d entries", vap->va_acl->acl_entrycount); + facl = vap->va_acl; + } + VATTR_SET_SUPPORTED(vap, va_acl); + } + + /* + * If the filesec data is all invalid, we can just remove the EA completely. + */ + if ((facl->acl_entrycount == KAUTH_FILESEC_NOACL) && + kauth_guid_equal(&fsec->fsec_owner, &kauth_null_guid) && + kauth_guid_equal(&fsec->fsec_group, &kauth_null_guid)) { + error = vn_removexattr(vp, KAUTH_FILESEC_XATTR, XATTR_NOSECURITY, ctx); + /* no attribute is ok, nothing to delete */ + if (error == ENOATTR) + error = 0; + VFS_DEBUG(ctx, vp, "SETATTR - remove filesec returning %d", error); + } else { + /* write the EA */ + error = vnode_set_filesec(vp, fsec, facl, ctx); + VFS_DEBUG(ctx, vp, "SETATTR - update filesec returning %d", error); + } + + /* if we fetched a filesec, dispose of the buffer */ + if (fsec != &lfsec) + kauth_filesec_free(fsec); + } +out: + + return(error); +} + +/* + * Definition of vnode operations. + */ + +#if 0 +/* + *# + *#% lookup dvp L ? ? + *#% lookup vpp - L - + */ +struct vnop_lookup_args { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; + struct componentname *a_cnp; + vfs_context_t a_context; +}; +#endif /* 0*/ + +errno_t +VNOP_LOOKUP(vnode_t dvp, vnode_t *vpp, struct componentname *cnp, vfs_context_t context) +{ + int _err; + struct vnop_lookup_args a; + vnode_t vp; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_lookup_desc; + a.a_dvp = dvp; + a.a_vpp = vpp; + a.a_cnp = cnp; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(dvp); + + vnode_cache_credentials(dvp, context); + + if (!thread_safe) { + if ( (_err = lock_fsnode(dvp, &funnel_state)) ) { + return (_err); + } + } + _err = (*dvp->v_op[vnop_lookup_desc.vdesc_offset])(&a); + + vp = *vpp; + + if (!thread_safe) { + if ( (cnp->cn_flags & ISLASTCN) ) { + if ( (cnp->cn_flags & LOCKPARENT) ) { + if ( !(cnp->cn_flags & FSNODELOCKHELD) ) { + /* + * leave the fsnode lock held on + * the directory, but restore the funnel... 
+ * also indicate that we need to drop the + * fsnode_lock when we're done with the + * system call processing for this path + */ + cnp->cn_flags |= FSNODELOCKHELD; + + (void) thread_funnel_set(kernel_flock, funnel_state); + return (_err); + } + } + } + unlock_fsnode(dvp, &funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% create dvp L L L + *#% create vpp - L - + *# + */ + +struct vnop_create_args { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; + struct componentname *a_cnp; + struct vnode_attr *a_vap; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_CREATE(vnode_t dvp, vnode_t * vpp, struct componentname * cnp, struct vnode_attr * vap, vfs_context_t context) +{ + int _err; + struct vnop_create_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_create_desc; + a.a_dvp = dvp; + a.a_vpp = vpp; + a.a_cnp = cnp; + a.a_vap = vap; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(dvp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(dvp, &funnel_state)) ) { + return (_err); + } + } + _err = (*dvp->v_op[vnop_create_desc.vdesc_offset])(&a); + if (_err == 0 && !NATIVE_XATTR(dvp)) { + /* + * Remove stale Apple Double file (if any). + */ + xattrfile_remove(dvp, cnp->cn_nameptr, context, thread_safe, 0); + } + if (!thread_safe) { + unlock_fsnode(dvp, &funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% whiteout dvp L L L + *#% whiteout cnp - - - + *#% whiteout flag - - - + *# + */ +struct vnop_whiteout_args { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + struct componentname *a_cnp; + int a_flags; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_WHITEOUT(vnode_t dvp, struct componentname * cnp, int flags, vfs_context_t context) +{ + int _err; + struct vnop_whiteout_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_whiteout_desc; + a.a_dvp = dvp; + a.a_cnp = cnp; + a.a_flags = flags; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(dvp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(dvp, &funnel_state)) ) { + return (_err); + } + } + _err = (*dvp->v_op[vnop_whiteout_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(dvp, &funnel_state); + } + return (_err); +} + + #if 0 +/* + *# + *#% mknod dvp L U U + *#% mknod vpp - X - + *# + */ +struct vnop_mknod_args { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; + struct componentname *a_cnp; + struct vnode_attr *a_vap; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_MKNOD(vnode_t dvp, vnode_t * vpp, struct componentname * cnp, struct vnode_attr * vap, vfs_context_t context) +{ + + int _err; + struct vnop_mknod_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_mknod_desc; + a.a_dvp = dvp; + a.a_vpp = vpp; + a.a_cnp = cnp; + a.a_vap = vap; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(dvp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(dvp, &funnel_state)) ) { + return (_err); + } + } + _err = (*dvp->v_op[vnop_mknod_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(dvp, &funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% open vp L L L + *# + */ +struct vnop_open_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + int a_mode; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_OPEN(vnode_t vp, int mode, vfs_context_t context) +{ + int _err; + struct vnop_open_args a; + int thread_safe; + int funnel_state = 0; + struct vfs_context acontext; + + if (context == NULL) { + acontext.vc_proc = current_proc(); + 
acontext.vc_ucred = kauth_cred_get(); + context = &acontext; + } + a.a_desc = &vnop_open_desc; + a.a_vp = vp; + a.a_mode = mode; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { + if ( (_err = lock_fsnode(vp, NULL)) ) { + (void) thread_funnel_set(kernel_flock, funnel_state); + return (_err); + } + } + } + _err = (*vp->v_op[vnop_open_desc.vdesc_offset])(&a); + if (!thread_safe) { + if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { + unlock_fsnode(vp, NULL); + } + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% close vp U U U + *# + */ +struct vnop_close_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + int a_fflag; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_CLOSE(vnode_t vp, int fflag, vfs_context_t context) +{ + int _err; + struct vnop_close_args a; + int thread_safe; + int funnel_state = 0; + struct vfs_context acontext; + + if (context == NULL) { + acontext.vc_proc = current_proc(); + acontext.vc_ucred = kauth_cred_get(); + context = &acontext; + } + a.a_desc = &vnop_close_desc; + a.a_vp = vp; + a.a_fflag = fflag; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { + if ( (_err = lock_fsnode(vp, NULL)) ) { + (void) thread_funnel_set(kernel_flock, funnel_state); + return (_err); + } + } + } + _err = (*vp->v_op[vnop_close_desc.vdesc_offset])(&a); + if (!thread_safe) { + if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { + unlock_fsnode(vp, NULL); + } + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% access vp L L L + *# + */ +struct vnop_access_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + int a_action; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_ACCESS(vnode_t vp, int action, vfs_context_t context) +{ + int _err; + struct vnop_access_args a; + int thread_safe; + int funnel_state = 0; + struct vfs_context acontext; + + if (context == NULL) { + acontext.vc_proc = current_proc(); + acontext.vc_ucred = kauth_cred_get(); + context = &acontext; + } + a.a_desc = &vnop_access_desc; + a.a_vp = vp; + a.a_action = action; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + return (_err); + } + } + _err = (*vp->v_op[vnop_access_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% getattr vp = = = + *# + */ +struct vnop_getattr_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + struct vnode_attr *a_vap; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_GETATTR(vnode_t vp, struct vnode_attr * vap, vfs_context_t context) +{ + int _err; + struct vnop_getattr_args a; + int thread_safe; + int funnel_state; + + a.a_desc = &vnop_getattr_desc; + a.a_vp = vp; + a.a_vap = vap; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + return (_err); + } + } + _err = (*vp->v_op[vnop_getattr_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% setattr vp L L L + 
*# + */ +struct vnop_setattr_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + struct vnode_attr *a_vap; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_SETATTR(vnode_t vp, struct vnode_attr * vap, vfs_context_t context) +{ + int _err; + struct vnop_setattr_args a; + int thread_safe; + int funnel_state; + + a.a_desc = &vnop_setattr_desc; + a.a_vp = vp; + a.a_vap = vap; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + return (_err); + } + } + _err = (*vp->v_op[vnop_setattr_desc.vdesc_offset])(&a); + + /* + * Shadow uid/gid/mod change to extended attibute file. + */ + if (_err == 0 && !NATIVE_XATTR(vp)) { + struct vnode_attr va; + int change = 0; + + VATTR_INIT(&va); + if (VATTR_IS_ACTIVE(vap, va_uid)) { + VATTR_SET(&va, va_uid, vap->va_uid); + change = 1; + } + if (VATTR_IS_ACTIVE(vap, va_gid)) { + VATTR_SET(&va, va_gid, vap->va_gid); + change = 1; + } + if (VATTR_IS_ACTIVE(vap, va_mode)) { + VATTR_SET(&va, va_mode, vap->va_mode); + change = 1; + } + if (change) { + vnode_t dvp; + char *vname; + + dvp = vnode_getparent(vp); + vname = vnode_getname(vp); + + xattrfile_setattr(dvp, vname, &va, context, thread_safe); + if (dvp != NULLVP) + vnode_put(dvp); + if (vname != NULL) + vnode_putname(vname); + } + } + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% getattrlist vp = = = + *# + */ +struct vnop_getattrlist_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + struct attrlist *a_alist; + struct uio *a_uio; + int a_options; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_GETATTRLIST(vnode_t vp, struct attrlist * alist, struct uio * uio, int options, vfs_context_t context) +{ + int _err; + struct vnop_getattrlist_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_getattrlist_desc; + a.a_vp = vp; + a.a_alist = alist; + a.a_uio = uio; + a.a_options = options; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + return (_err); + } + } + _err = (*vp->v_op[vnop_getattrlist_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% setattrlist vp L L L + *# + */ +struct vnop_setattrlist_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + struct attrlist *a_alist; + struct uio *a_uio; + int a_options; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_SETATTRLIST(vnode_t vp, struct attrlist * alist, struct uio * uio, int options, vfs_context_t context) +{ + int _err; + struct vnop_setattrlist_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_setattrlist_desc; + a.a_vp = vp; + a.a_alist = alist; + a.a_uio = uio; + a.a_options = options; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + return (_err); + } + } + _err = (*vp->v_op[vnop_setattrlist_desc.vdesc_offset])(&a); + + vnode_uncache_credentials(vp); + + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (_err); +} + + +#if 0 +/* + *# + *#% read vp L L L + *# + */ +struct vnop_read_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + struct uio *a_uio; + int a_ioflag; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_READ(vnode_t vp, struct uio * uio, int ioflag, vfs_context_t context) +{ + int _err; + struct vnop_read_args a; + int 
thread_safe; + int funnel_state = 0; + struct vfs_context acontext; + + if (context == NULL) { + acontext.vc_proc = current_proc(); + acontext.vc_ucred = kauth_cred_get(); + context = &acontext; + } + + a.a_desc = &vnop_read_desc; + a.a_vp = vp; + a.a_uio = uio; + a.a_ioflag = ioflag; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { + if ( (_err = lock_fsnode(vp, NULL)) ) { + (void) thread_funnel_set(kernel_flock, funnel_state); + return (_err); + } + } + } + _err = (*vp->v_op[vnop_read_desc.vdesc_offset])(&a); + + if (!thread_safe) { + if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { + unlock_fsnode(vp, NULL); + } + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (_err); +} + + +#if 0 +/* + *# + *#% write vp L L L + *# + */ +struct vnop_write_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + struct uio *a_uio; + int a_ioflag; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_WRITE(vnode_t vp, struct uio * uio, int ioflag, vfs_context_t context) +{ + struct vnop_write_args a; + int _err; + int thread_safe; + int funnel_state = 0; + struct vfs_context acontext; + + if (context == NULL) { + acontext.vc_proc = current_proc(); + acontext.vc_ucred = kauth_cred_get(); + context = &acontext; + } + + a.a_desc = &vnop_write_desc; + a.a_vp = vp; + a.a_uio = uio; + a.a_ioflag = ioflag; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { + if ( (_err = lock_fsnode(vp, NULL)) ) { + (void) thread_funnel_set(kernel_flock, funnel_state); + return (_err); + } + } + } + _err = (*vp->v_op[vnop_write_desc.vdesc_offset])(&a); + + if (!thread_safe) { + if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { + unlock_fsnode(vp, NULL); + } + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (_err); +} + + +#if 0 +/* + *# + *#% ioctl vp U U U + *# + */ +struct vnop_ioctl_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + u_long a_command; + caddr_t a_data; + int a_fflag; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_IOCTL(vnode_t vp, u_long command, caddr_t data, int fflag, vfs_context_t context) +{ + int _err; + struct vnop_ioctl_args a; + int thread_safe; + int funnel_state = 0; + struct vfs_context acontext; + + if (context == NULL) { + acontext.vc_proc = current_proc(); + acontext.vc_ucred = kauth_cred_get(); + context = &acontext; + } + + if (vfs_context_is64bit(context)) { + if (!vnode_vfs64bitready(vp)) { + return(ENOTTY); + } + } + + a.a_desc = &vnop_ioctl_desc; + a.a_vp = vp; + a.a_command = command; + a.a_data = data; + a.a_fflag = fflag; + a.a_context= context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { + if ( (_err = lock_fsnode(vp, NULL)) ) { + (void) thread_funnel_set(kernel_flock, funnel_state); + return (_err); + } + } + } + _err = (*vp->v_op[vnop_ioctl_desc.vdesc_offset])(&a); + if (!thread_safe) { + if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { + unlock_fsnode(vp, NULL); + } + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (_err); +} + + +#if 0 +/* + *# + *#% select vp U U U 
+ *# + */ +struct vnop_select_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + int a_which; + int a_fflags; + void *a_wql; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_SELECT(vnode_t vp, int which , int fflags, void * wql, vfs_context_t context) +{ + int _err; + struct vnop_select_args a; + int thread_safe; + int funnel_state = 0; + struct vfs_context acontext; + + if (context == NULL) { + acontext.vc_proc = current_proc(); + acontext.vc_ucred = kauth_cred_get(); + context = &acontext; + } + a.a_desc = &vnop_select_desc; + a.a_vp = vp; + a.a_which = which; + a.a_fflags = fflags; + a.a_context = context; + a.a_wql = wql; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { + if ( (_err = lock_fsnode(vp, NULL)) ) { + (void) thread_funnel_set(kernel_flock, funnel_state); + return (_err); + } + } + } + _err = (*vp->v_op[vnop_select_desc.vdesc_offset])(&a); + if (!thread_safe) { + if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { + unlock_fsnode(vp, NULL); + } + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (_err); +} + + +#if 0 +/* + *# + *#% exchange fvp L L L + *#% exchange tvp L L L + *# + */ +struct vnop_exchange_args { + struct vnodeop_desc *a_desc; + vnode_t a_fvp; + vnode_t a_tvp; + int a_options; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_EXCHANGE(vnode_t fvp, vnode_t tvp, int options, vfs_context_t context) +{ + int _err; + struct vnop_exchange_args a; + int thread_safe; + int funnel_state = 0; + vnode_t lock_first = NULL, lock_second = NULL; + + a.a_desc = &vnop_exchange_desc; + a.a_fvp = fvp; + a.a_tvp = tvp; + a.a_options = options; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(fvp); + + if (!thread_safe) { + /* + * Lock in vnode address order to avoid deadlocks + */ + if (fvp < tvp) { + lock_first = fvp; + lock_second = tvp; + } else { + lock_first = tvp; + lock_second = fvp; + } + if ( (_err = lock_fsnode(lock_first, &funnel_state)) ) { + return (_err); + } + if ( (_err = lock_fsnode(lock_second, NULL)) ) { + unlock_fsnode(lock_first, &funnel_state); + return (_err); + } + } + _err = (*fvp->v_op[vnop_exchange_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(lock_second, NULL); + unlock_fsnode(lock_first, &funnel_state); + } + return (_err); +} + + +#if 0 +/* + *# + *#% revoke vp U U U + *# + */ +struct vnop_revoke_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + int a_flags; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_REVOKE(vnode_t vp, int flags, vfs_context_t context) +{ + struct vnop_revoke_args a; + int _err; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_revoke_desc; + a.a_vp = vp; + a.a_flags = flags; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + _err = (*vp->v_op[vnop_revoke_desc.vdesc_offset])(&a); + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (_err); +} + + +#if 0 +/* + *# + *# mmap - vp U U U + *# + */ +struct vnop_mmap_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + int a_fflags; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_MMAP(vnode_t vp, int fflags, vfs_context_t context) +{ + int _err; + struct vnop_mmap_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_mmap_desc; + a.a_vp = vp; + a.a_fflags = 
fflags; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + return (_err); + } + } + _err = (*vp->v_op[vnop_mmap_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (_err); +} + + +#if 0 +/* + *# + *# mnomap - vp U U U + *# + */ +struct vnop_mnomap_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_MNOMAP(vnode_t vp, vfs_context_t context) +{ + int _err; + struct vnop_mnomap_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_mnomap_desc; + a.a_vp = vp; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + return (_err); + } + } + _err = (*vp->v_op[vnop_mnomap_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (_err); +} + + +#if 0 +/* + *# + *#% fsync vp L L L + *# + */ +struct vnop_fsync_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + int a_waitfor; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_FSYNC(vnode_t vp, int waitfor, vfs_context_t context) +{ + struct vnop_fsync_args a; + int _err; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_fsync_desc; + a.a_vp = vp; + a.a_waitfor = waitfor; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + return (_err); + } + } + _err = (*vp->v_op[vnop_fsync_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (_err); +} + + +#if 0 +/* + *# + *#% remove dvp L U U + *#% remove vp L U U + *# + */ +struct vnop_remove_args { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t a_vp; + struct componentname *a_cnp; + int a_flags; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_REMOVE(vnode_t dvp, vnode_t vp, struct componentname * cnp, int flags, vfs_context_t context) +{ + int _err; + struct vnop_remove_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_remove_desc; + a.a_dvp = dvp; + a.a_vp = vp; + a.a_cnp = cnp; + a.a_flags = flags; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(dvp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + return (_err); + } + } + _err = (*dvp->v_op[vnop_remove_desc.vdesc_offset])(&a); + + if (_err == 0) { + vnode_setneedinactive(vp); + + if ( !(NATIVE_XATTR(dvp)) ) { + /* + * Remove any associated extended attibute file (._ AppleDouble file). + */ + xattrfile_remove(dvp, cnp->cn_nameptr, context, thread_safe, 1); + } + } + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (_err); +} + + +#if 0 +/* + *# + *#% link vp U U U + *#% link tdvp L U U + *# + */ +struct vnop_link_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + vnode_t a_tdvp; + struct componentname *a_cnp; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_LINK(vnode_t vp, vnode_t tdvp, struct componentname * cnp, vfs_context_t context) +{ + int _err; + struct vnop_link_args a; + int thread_safe; + int funnel_state = 0; + + /* + * For file systems with non-native extended attributes, + * disallow linking to an existing "._" Apple Double file. + */ + if ( !NATIVE_XATTR(tdvp) && (vp->v_type == VREG)) { + char *vname; + + vname = vnode_getname(vp); + if (vname != NULL) { + _err = 0; + if (vname[0] == '.' 
&& vname[1] == '_' && vname[2] != '\0') {
+				_err = EPERM;
+			}
+			vnode_putname(vname);
+			if (_err)
+				return (_err);
+		}
+	}
+	a.a_desc = &vnop_link_desc;
+	a.a_vp = vp;
+	a.a_tdvp = tdvp;
+	a.a_cnp = cnp;
+	a.a_context = context;
+	thread_safe = THREAD_SAFE_FS(vp);
+
+	if (!thread_safe) {
+		if ( (_err = lock_fsnode(vp, &funnel_state)) ) {
+			return (_err);
+		}
+	}
+	_err = (*tdvp->v_op[vnop_link_desc.vdesc_offset])(&a);
+	if (!thread_safe) {
+		unlock_fsnode(vp, &funnel_state);
+	}
+	return (_err);
+}
+
+
+#if 0
+/*
+ *#
+ *#% rename fdvp U U U
+ *#% rename fvp U U U
+ *#% rename tdvp L U U
+ *#% rename tvp X U U
+ *#
+ */
+struct vnop_rename_args {
+	struct vnodeop_desc *a_desc;
+	vnode_t a_fdvp;
+	vnode_t a_fvp;
+	struct componentname *a_fcnp;
+	vnode_t a_tdvp;
+	vnode_t a_tvp;
+	struct componentname *a_tcnp;
+	vfs_context_t a_context;
+};
+#endif /* 0*/
+errno_t
+VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp,
+            struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp,
+            vfs_context_t context)
+{
+	int _err;
+	struct vnop_rename_args a;
+	int funnel_state = 0;
+	char smallname1[48];
+	char smallname2[48];
+	char *xfromname = NULL;
+	char *xtoname = NULL;
+	vnode_t lock_first = NULL, lock_second = NULL;
+	vnode_t fdvp_unsafe = NULLVP;
+	vnode_t tdvp_unsafe = NULLVP;
+
+	a.a_desc = &vnop_rename_desc;
+	a.a_fdvp = fdvp;
+	a.a_fvp = fvp;
+	a.a_fcnp = fcnp;
+	a.a_tdvp = tdvp;
+	a.a_tvp = tvp;
+	a.a_tcnp = tcnp;
+	a.a_context = context;
+
+	if (!THREAD_SAFE_FS(fdvp))
+		fdvp_unsafe = fdvp;
+	if (!THREAD_SAFE_FS(tdvp))
+		tdvp_unsafe = tdvp;
+
+	if (fdvp_unsafe != NULLVP) {
+		/*
+		 * Lock parents in vnode address order to avoid deadlocks.
+		 * Note that it's possible for the fdvp to be unsafe
+		 * but the tdvp to be safe, because tvp could be a directory
+		 * in the root of a filesystem... in that case, tdvp is in
+		 * the filesystem that this root is mounted on.
+		 */
+		if (tdvp_unsafe == NULL || fdvp_unsafe == tdvp_unsafe) {
+			lock_first  = fdvp_unsafe;
+			lock_second = NULL;
+		} else if (fdvp_unsafe < tdvp_unsafe) {
+			lock_first  = fdvp_unsafe;
+			lock_second = tdvp_unsafe;
+		} else {
+			lock_first  = tdvp_unsafe;
+			lock_second = fdvp_unsafe;
+		}
+		if ( (_err = lock_fsnode(lock_first, &funnel_state)) )
+			return (_err);
+
+		if (lock_second != NULL && (_err = lock_fsnode(lock_second, NULL))) {
+			unlock_fsnode(lock_first, &funnel_state);
+			return (_err);
+		}
+
+		/*
+		 * Lock both children in vnode address order to avoid deadlocks
+		 */
+		if (tvp == NULL || tvp == fvp) {
+			lock_first  = fvp;
+			lock_second = NULL;
+		} else if (fvp < tvp) {
+			lock_first  = fvp;
+			lock_second = tvp;
+		} else {
+			lock_first  = tvp;
+			lock_second = fvp;
+		}
+		if ( (_err = lock_fsnode(lock_first, NULL)) )
+			goto out1;
+
+		if (lock_second != NULL && (_err = lock_fsnode(lock_second, NULL))) {
+			unlock_fsnode(lock_first, NULL);
+			goto out1;
+		}
+	}
+	/*
+	 * Save source and destination names (._ AppleDouble files).
+	 * Skip if source already has a "._" prefix.
+	 */
+	if (!NATIVE_XATTR(fdvp) &&
+	    !(fcnp->cn_nameptr[0] == '.' && fcnp->cn_nameptr[1] == '_')) {
+		size_t len;
+
+		/* Get source attribute file name. */
+		len = fcnp->cn_namelen + 3;
+		if (len > sizeof(smallname1)) {
+			MALLOC(xfromname, char *, len, M_TEMP, M_WAITOK);
+		} else {
+			xfromname = &smallname1[0];
+		}
+		strcpy(xfromname, "._");
+		strncat(xfromname, fcnp->cn_nameptr, fcnp->cn_namelen);
+		xfromname[len-1] = '\0';
+
+		/* Get destination attribute file name.
*/ + len = tcnp->cn_namelen + 3; + if (len > sizeof(smallname2)) { + MALLOC(xtoname, char *, len, M_TEMP, M_WAITOK); + } else { + xtoname = &smallname2[0]; + } + strcpy(xtoname, "._"); + strncat(xtoname, tcnp->cn_nameptr, tcnp->cn_namelen); + xtoname[len-1] = '\0'; + } + + _err = (*fdvp->v_op[vnop_rename_desc.vdesc_offset])(&a); + + if (fdvp_unsafe != NULLVP) { + if (lock_second != NULL) + unlock_fsnode(lock_second, NULL); + unlock_fsnode(lock_first, NULL); + } + if (_err == 0) { + if (tvp && tvp != fvp) + vnode_setneedinactive(tvp); + } + + /* + * Rename any associated extended attribute file (._ AppleDouble file). + */ + if (_err == 0 && !NATIVE_XATTR(fdvp) && xfromname != NULL) { + struct nameidata fromnd, tond; + int killdest = 0; + int error; + + /* + * Get source attribute file vnode. + * Note that fdvp already has an iocount reference and + * using DELETE will take an additional reference. + */ + NDINIT(&fromnd, DELETE, NOFOLLOW | USEDVP, UIO_SYSSPACE, + CAST_USER_ADDR_T(xfromname), context); + fromnd.ni_dvp = fdvp; + error = namei(&fromnd); + + if (error) { + /* When source doesn't exist there still may be a destination. */ + if (error == ENOENT) { + killdest = 1; + } else { + goto out; + } + } else if (fromnd.ni_vp->v_type != VREG) { + vnode_put(fromnd.ni_vp); + nameidone(&fromnd); + killdest = 1; + } + if (killdest) { + struct vnop_remove_args args; + + /* + * Get destination attribute file vnode. + * Note that tdvp already has an iocount reference. + */ + NDINIT(&tond, DELETE, NOFOLLOW | USEDVP, UIO_SYSSPACE, + CAST_USER_ADDR_T(xtoname), context); + tond.ni_dvp = tdvp; + error = namei(&tond); + if (error) { + goto out; + } + if (tond.ni_vp->v_type != VREG) { + vnode_put(tond.ni_vp); + nameidone(&tond); + goto out; + } + args.a_desc = &vnop_remove_desc; + args.a_dvp = tdvp; + args.a_vp = tond.ni_vp; + args.a_cnp = &tond.ni_cnd; + args.a_context = context; + + if (fdvp_unsafe != NULLVP) + error = lock_fsnode(tond.ni_vp, NULL); + if (error == 0) { + error = (*tdvp->v_op[vnop_remove_desc.vdesc_offset])(&args); + + if (fdvp_unsafe != NULLVP) + unlock_fsnode(tond.ni_vp, NULL); + + if (error == 0) + vnode_setneedinactive(tond.ni_vp); + } + vnode_put(tond.ni_vp); + nameidone(&tond); + goto out; + } + + /* + * Get destination attribute file vnode. 
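+ * This lookup is made with RENAME intent and NOCACHE, since the target + * is about to be created or replaced.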
+ */ + NDINIT(&tond, RENAME, + NOCACHE | NOFOLLOW | USEDVP, UIO_SYSSPACE, + CAST_USER_ADDR_T(xtoname), context); + tond.ni_dvp = tdvp; + error = namei(&tond); + + if (error) { + vnode_put(fromnd.ni_vp); + nameidone(&fromnd); + goto out; + } + a.a_desc = &vnop_rename_desc; + a.a_fdvp = fdvp; + a.a_fvp = fromnd.ni_vp; + a.a_fcnp = &fromnd.ni_cnd; + a.a_tdvp = tdvp; + a.a_tvp = tond.ni_vp; + a.a_tcnp = &tond.ni_cnd; + a.a_context = context; + + if (fdvp_unsafe != NULLVP) { + /* + * Lock in vnode address order to avoid deadlocks + */ + if (tond.ni_vp == NULL || tond.ni_vp == fromnd.ni_vp) { + lock_first = fromnd.ni_vp; + lock_second = NULL; + } else if (fromnd.ni_vp < tond.ni_vp) { + lock_first = fromnd.ni_vp; + lock_second = tond.ni_vp; + } else { + lock_first = tond.ni_vp; + lock_second = fromnd.ni_vp; + } + if ( (error = lock_fsnode(lock_first, NULL)) == 0) { + if (lock_second != NULL && (error = lock_fsnode(lock_second, NULL)) ) + unlock_fsnode(lock_first, NULL); + } + } + if (error == 0) { + error = (*fdvp->v_op[vnop_rename_desc.vdesc_offset])(&a); + + if (fdvp_unsafe != NULLVP) { + if (lock_second != NULL) + unlock_fsnode(lock_second, NULL); + unlock_fsnode(lock_first, NULL); + } + if (error == 0) { + vnode_setneedinactive(fromnd.ni_vp); + + if (tond.ni_vp && tond.ni_vp != fromnd.ni_vp) + vnode_setneedinactive(tond.ni_vp); + } + } + vnode_put(fromnd.ni_vp); + if (tond.ni_vp) { + vnode_put(tond.ni_vp); + } + nameidone(&tond); + nameidone(&fromnd); + } +out: + if (xfromname && xfromname != &smallname1[0]) { + FREE(xfromname, M_TEMP); + } + if (xtoname && xtoname != &smallname2[0]) { + FREE(xtoname, M_TEMP); + } +out1: + if (fdvp_unsafe != NULLVP) { + if (tdvp_unsafe != NULLVP) + unlock_fsnode(tdvp_unsafe, NULL); + unlock_fsnode(fdvp_unsafe, &funnel_state); + } + return (_err); +} + + #if 0 +/* + *# + *#% mkdir dvp L U U + *#% mkdir vpp - L - + *# + */ +struct vnop_mkdir_args { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; + struct componentname *a_cnp; + struct vnode_attr *a_vap; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_MKDIR(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, + struct vnode_attr *vap, vfs_context_t context) +{ + int _err; + struct vnop_mkdir_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_mkdir_desc; + a.a_dvp = dvp; + a.a_vpp = vpp; + a.a_cnp = cnp; + a.a_vap = vap; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(dvp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(dvp, &funnel_state)) ) { + return (_err); + } + } + _err = (*dvp->v_op[vnop_mkdir_desc.vdesc_offset])(&a); + if (_err == 0 && !NATIVE_XATTR(dvp)) { + /* + * Remove stale Apple Double file (if any). 
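+ * A non-native-xattr volume may be left with an orphaned "._" entry from + * an earlier object of the same name; the new directory must not inherit it.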
+ */ + xattrfile_remove(dvp, cnp->cn_nameptr, context, thread_safe, 0); + } + if (!thread_safe) { + unlock_fsnode(dvp, &funnel_state); + } + return (_err); +} + + +#if 0 +/* + *# + *#% rmdir dvp L U U + *#% rmdir vp L U U + *# + */ +struct vnop_rmdir_args { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t a_vp; + struct componentname *a_cnp; + vfs_context_t a_context; +}; + +#endif /* 0*/ +errno_t +VNOP_RMDIR(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, vfs_context_t context) +{ + int _err; + struct vnop_rmdir_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_rmdir_desc; + a.a_dvp = dvp; + a.a_vp = vp; + a.a_cnp = cnp; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(dvp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + return (_err); + } + } + _err = (*vp->v_op[vnop_rmdir_desc.vdesc_offset])(&a); + + if (_err == 0) { + vnode_setneedinactive(vp); + + if ( !(NATIVE_XATTR(dvp)) ) { + /* + * Remove any associated extended attribute file (._ AppleDouble file). + */ + xattrfile_remove(dvp, cnp->cn_nameptr, context, thread_safe, 1); + } + } + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (_err); +} + +/* + * Remove a ._ AppleDouble file + */ +#define AD_STALE_SECS (180) +static void +xattrfile_remove(vnode_t dvp, const char * basename, vfs_context_t context, int thread_safe, int force) { + vnode_t xvp; + struct nameidata nd; + char smallname[64]; + char *filename = NULL; + size_t len; + + if ((basename == NULL) || (basename[0] == '\0') || + (basename[0] == '.' && basename[1] == '_')) { + return; + } + filename = &smallname[0]; + len = snprintf(filename, sizeof(smallname), "._%s", basename); + if (len >= sizeof(smallname)) { + len++; /* snprintf result doesn't include '\0' */ + MALLOC(filename, char *, len, M_TEMP, M_WAITOK); + len = snprintf(filename, len, "._%s", basename); + } + NDINIT(&nd, DELETE, LOCKLEAF | NOFOLLOW | USEDVP, UIO_SYSSPACE, + CAST_USER_ADDR_T(filename), context); + nd.ni_dvp = dvp; + if (namei(&nd) != 0) + goto out2; + + xvp = nd.ni_vp; + nameidone(&nd); + if (xvp->v_type != VREG) + goto out1; + + /* + * When creating a new object and a "._" file already + * exists, check to see if it's a stale "._" file. 
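+ * "Stale" here means non-empty and untouched for more than AD_STALE_SECS.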
+ * + */ + if (!force) { + struct vnode_attr va; + + VATTR_INIT(&va); + VATTR_WANTED(&va, va_data_size); + VATTR_WANTED(&va, va_modify_time); + if (VNOP_GETATTR(xvp, &va, context) == 0 && + VATTR_IS_SUPPORTED(&va, va_data_size) && + VATTR_IS_SUPPORTED(&va, va_modify_time) && + va.va_data_size != 0) { + struct timeval tv; + + microtime(&tv); + if ((tv.tv_sec > va.va_modify_time.tv_sec) && + (tv.tv_sec - va.va_modify_time.tv_sec) > AD_STALE_SECS) { + force = 1; /* must be stale */ + } + } + } + if (force) { + struct vnop_remove_args a; + int error; + + a.a_desc = &vnop_remove_desc; + a.a_dvp = nd.ni_dvp; + a.a_vp = xvp; + a.a_cnp = &nd.ni_cnd; + a.a_context = context; + + if (!thread_safe) { + if ( (lock_fsnode(xvp, NULL)) ) + goto out1; + } + error = (*dvp->v_op[vnop_remove_desc.vdesc_offset])(&a); + + if (!thread_safe) + unlock_fsnode(xvp, NULL); + + if (error == 0) + vnode_setneedinactive(xvp); + } +out1: + /* Note: nd.ni_dvp's iocount is dropped by caller of VNOP_XXXX */ + vnode_put(xvp); +out2: + if (filename && filename != &smallname[0]) { + FREE(filename, M_TEMP); + } +} + +/* + * Shadow uid/gid/mod to a ._ AppleDouble file + */ +static void +xattrfile_setattr(vnode_t dvp, const char * basename, struct vnode_attr * vap, + vfs_context_t context, int thread_safe) { + vnode_t xvp; + struct nameidata nd; + char smallname[64]; + char *filename = NULL; + size_t len; + + if ((dvp == NULLVP) || + (basename == NULL) || (basename[0] == '\0') || + (basename[0] == '.' && basename[1] == '_')) { + return; + } + filename = &smallname[0]; + len = snprintf(filename, sizeof(smallname), "._%s", basename); + if (len >= sizeof(smallname)) { + len++; /* snprintf result doesn't include '\0' */ + MALLOC(filename, char *, len, M_TEMP, M_WAITOK); + len = snprintf(filename, len, "._%s", basename); + } + NDINIT(&nd, LOOKUP, NOFOLLOW | USEDVP, UIO_SYSSPACE, + CAST_USER_ADDR_T(filename), context); + nd.ni_dvp = dvp; + if (namei(&nd) != 0) + goto out2; + + xvp = nd.ni_vp; + nameidone(&nd); + + if (xvp->v_type == VREG) { + struct vnop_setattr_args a; + + a.a_desc = &vnop_setattr_desc; + a.a_vp = xvp; + a.a_vap = vap; + a.a_context = context; + + if (!thread_safe) { + if ( (lock_fsnode(xvp, NULL)) ) + goto out1; + } + (void) (*xvp->v_op[vnop_setattr_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(xvp, NULL); + } + } +out1: + vnode_put(xvp); +out2: + if (filename && filename != &smallname[0]) { + FREE(filename, M_TEMP); + } +} + + #if 0 +/* + *# + *#% symlink dvp L U U + *#% symlink vpp - U - + *# + */ +struct vnop_symlink_args { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; + struct componentname *a_cnp; + struct vnode_attr *a_vap; + char *a_target; + vfs_context_t a_context; +}; + +#endif /* 0*/ +errno_t +VNOP_SYMLINK(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, + struct vnode_attr *vap, char *target, vfs_context_t context) +{ + int _err; + struct vnop_symlink_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_symlink_desc; + a.a_dvp = dvp; + a.a_vpp = vpp; + a.a_cnp = cnp; + a.a_vap = vap; + a.a_target = target; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(dvp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(dvp, &funnel_state)) ) { + return (_err); + } + } + _err = (*dvp->v_op[vnop_symlink_desc.vdesc_offset])(&a); + if (_err == 0 && !NATIVE_XATTR(dvp)) { + /* + * Remove stale Apple Double file (if any). 
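+ * Same rationale as VNOP_MKDIR above.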
+ */ + xattrfile_remove(dvp, cnp->cn_nameptr, context, thread_safe, 0); + } + if (!thread_safe) { + unlock_fsnode(dvp, &funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% readdir vp L L L + *# + */ +struct vnop_readdir_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + struct uio *a_uio; + int a_flags; + int *a_eofflag; + int *a_numdirent; + vfs_context_t a_context; +}; + +#endif /* 0*/ +errno_t +VNOP_READDIR(struct vnode *vp, struct uio *uio, int flags, int *eofflag, + int *numdirent, vfs_context_t context) +{ + int _err; + struct vnop_readdir_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_readdir_desc; + a.a_vp = vp; + a.a_uio = uio; + a.a_flags = flags; + a.a_eofflag = eofflag; + a.a_numdirent = numdirent; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + return (_err); + } + } + _err = (*vp->v_op[vnop_readdir_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% readdirattr vp L L L + *# + */ +struct vnop_readdirattr_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + struct attrlist *a_alist; + struct uio *a_uio; + u_long a_maxcount; + u_long a_options; + u_long *a_newstate; + int *a_eofflag; + u_long *a_actualcount; + vfs_context_t a_context; +}; + +#endif /* 0*/ +errno_t +VNOP_READDIRATTR(struct vnode *vp, struct attrlist *alist, struct uio *uio, u_long maxcount, + u_long options, u_long *newstate, int *eofflag, u_long *actualcount, vfs_context_t context) +{ + int _err; + struct vnop_readdirattr_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_readdirattr_desc; + a.a_vp = vp; + a.a_alist = alist; + a.a_uio = uio; + a.a_maxcount = maxcount; + a.a_options = options; + a.a_newstate = newstate; + a.a_eofflag = eofflag; + a.a_actualcount = actualcount; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + return (_err); + } + } + _err = (*vp->v_op[vnop_readdirattr_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% readlink vp L L L + *# + */ +struct vnop_readlink_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + struct uio *a_uio; + vfs_context_t a_context; +}; +#endif /* 0 */ + +errno_t +VNOP_READLINK(struct vnode *vp, struct uio *uio, vfs_context_t context) +{ + int _err; + struct vnop_readlink_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_readlink_desc; + a.a_vp = vp; + a.a_uio = uio; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + return (_err); + } + } + _err = (*vp->v_op[vnop_readlink_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% inactive vp L U U + *# + */ +struct vnop_inactive_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_INACTIVE(struct vnode *vp, vfs_context_t context) +{ + int _err; + struct vnop_inactive_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_inactive_desc; + a.a_vp = vp; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + return (_err); + } + } + _err = 
(*vp->v_op[vnop_inactive_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (_err); +} + + +#if 0 +/* + *# + *#% reclaim vp U U U + *# + */ +struct vnop_reclaim_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_RECLAIM(struct vnode *vp, vfs_context_t context) +{ + int _err; + struct vnop_reclaim_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_reclaim_desc; + a.a_vp = vp; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + _err = (*vp->v_op[vnop_reclaim_desc.vdesc_offset])(&a); + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (_err); +} + + +#if 0 +/* + *# + *#% pathconf vp L L L + *# + */ +struct vnop_pathconf_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + int a_name; + register_t *a_retval; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_PATHCONF(struct vnode *vp, int name, register_t *retval, vfs_context_t context) +{ + int _err; + struct vnop_pathconf_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_pathconf_desc; + a.a_vp = vp; + a.a_name = name; + a.a_retval = retval; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + return (_err); + } + } + _err = (*vp->v_op[vnop_pathconf_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% advlock vp U U U + *# + */ +struct vnop_advlock_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_ADVLOCK(struct vnode *vp, caddr_t id, int op, struct flock *fl, int flags, vfs_context_t context) +{ + int _err; + struct vnop_advlock_args a; + int thread_safe; + int funnel_state = 0; + struct uthread * uth; + + a.a_desc = &vnop_advlock_desc; + a.a_vp = vp; + a.a_id = id; + a.a_op = op; + a.a_fl = fl; + a.a_flags = flags; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + uth = get_bsdthread_info(current_thread()); + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + /* Disallow advisory locking on non-seekable vnodes */ + if (vnode_isfifo(vp)) { + _err = err_advlock(&a); + } else { + if ((vp->v_flag & VLOCKLOCAL)) { + /* Advisory locking done at this layer */ + _err = lf_advlock(&a); + } else { + /* Advisory locking done by underlying filesystem */ + _err = (*vp->v_op[vnop_advlock_desc.vdesc_offset])(&a); + } + } + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (_err); +} + + + +#if 0 +/* + *# + *#% allocate vp L L L + *# + */ +struct vnop_allocate_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + off_t a_length; + u_int32_t a_flags; + off_t *a_bytesallocated; + off_t a_offset; + vfs_context_t a_context; +}; + +#endif /* 0*/ +errno_t +VNOP_ALLOCATE(struct vnode *vp, off_t length, u_int32_t flags, off_t *bytesallocated, off_t offset, vfs_context_t context) +{ + int _err; + struct vnop_allocate_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_allocate_desc; + a.a_vp = vp; + a.a_length = length; + a.a_flags = flags; + a.a_bytesallocated = bytesallocated; + a.a_offset = offset; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if 
(!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + return (_err); + } + } + _err = (*vp->v_op[vnop_allocate_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% pagein vp = = = + *# + */ +struct vnop_pagein_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + upl_t a_pl; + vm_offset_t a_pl_offset; + off_t a_f_offset; + size_t a_size; + int a_flags; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_PAGEIN(struct vnode *vp, upl_t pl, vm_offset_t pl_offset, off_t f_offset, size_t size, int flags, vfs_context_t context) +{ + int _err; + struct vnop_pagein_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_pagein_desc; + a.a_vp = vp; + a.a_pl = pl; + a.a_pl_offset = pl_offset; + a.a_f_offset = f_offset; + a.a_size = size; + a.a_flags = flags; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + _err = (*vp->v_op[vnop_pagein_desc.vdesc_offset])(&a); + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% pageout vp = = = + *# + */ +struct vnop_pageout_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + upl_t a_pl; + vm_offset_t a_pl_offset; + off_t a_f_offset; + size_t a_size; + int a_flags; + vfs_context_t a_context; +}; + +#endif /* 0*/ +errno_t +VNOP_PAGEOUT(struct vnode *vp, upl_t pl, vm_offset_t pl_offset, off_t f_offset, size_t size, int flags, vfs_context_t context) +{ + int _err; + struct vnop_pageout_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_pageout_desc; + a.a_vp = vp; + a.a_pl = pl; + a.a_pl_offset = pl_offset; + a.a_f_offset = f_offset; + a.a_size = size; + a.a_flags = flags; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + _err = (*vp->v_op[vnop_pageout_desc.vdesc_offset])(&a); + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (_err); +} + + +#if 0 +/* + *# + *#% searchfs vp L L L + *# + */ +struct vnop_searchfs_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + void *a_searchparams1; + void *a_searchparams2; + struct attrlist *a_searchattrs; + u_long a_maxmatches; + struct timeval *a_timelimit; + struct attrlist *a_returnattrs; + u_long *a_nummatches; + u_long a_scriptcode; + u_long a_options; + struct uio *a_uio; + struct searchstate *a_searchstate; + vfs_context_t a_context; +}; + +#endif /* 0*/ +errno_t +VNOP_SEARCHFS(struct vnode *vp, void *searchparams1, void *searchparams2, struct attrlist *searchattrs, u_long maxmatches, struct timeval *timelimit, struct attrlist *returnattrs, u_long *nummatches, u_long scriptcode, u_long options, struct uio *uio, struct searchstate *searchstate, vfs_context_t context) +{ + int _err; + struct vnop_searchfs_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_searchfs_desc; + a.a_vp = vp; + a.a_searchparams1 = searchparams1; + a.a_searchparams2 = searchparams2; + a.a_searchattrs = searchattrs; + a.a_maxmatches = maxmatches; + a.a_timelimit = timelimit; + a.a_returnattrs = returnattrs; + a.a_nummatches = nummatches; + a.a_scriptcode = scriptcode; + a.a_options = options; + a.a_uio = uio; + a.a_searchstate = searchstate; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + 
return (_err); + } + } + _err = (*vp->v_op[vnop_searchfs_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% copyfile fvp U U U + *#% copyfile tdvp L U U + *#% copyfile tvp X U U + *# + */ +struct vnop_copyfile_args { + struct vnodeop_desc *a_desc; + vnode_t a_fvp; + vnode_t a_tdvp; + vnode_t a_tvp; + struct componentname *a_tcnp; + int a_mode; + int a_flags; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_COPYFILE(struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, + int mode, int flags, vfs_context_t context) +{ + int _err; + struct vnop_copyfile_args a; + a.a_desc = &vnop_copyfile_desc; + a.a_fvp = fvp; + a.a_tdvp = tdvp; + a.a_tvp = tvp; + a.a_tcnp = tcnp; + a.a_mode = mode; + a.a_flags = flags; + a.a_context = context; + _err = (*fvp->v_op[vnop_copyfile_desc.vdesc_offset])(&a); + return (_err); +} + + +errno_t +VNOP_GETXATTR(vnode_t vp, const char *name, uio_t uio, size_t *size, int options, vfs_context_t context) +{ + struct vnop_getxattr_args a; + int error; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_getxattr_desc; + a.a_vp = vp; + a.a_name = name; + a.a_uio = uio; + a.a_size = size; + a.a_options = options; + a.a_context = context; + + thread_safe = THREAD_SAFE_FS(vp); + if (!thread_safe) { + if ( (error = lock_fsnode(vp, &funnel_state)) ) { + return (error); + } + } + error = (*vp->v_op[vnop_getxattr_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (error); +} + +errno_t +VNOP_SETXATTR(vnode_t vp, const char *name, uio_t uio, int options, vfs_context_t context) +{ + struct vnop_setxattr_args a; + int error; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_setxattr_desc; + a.a_vp = vp; + a.a_name = name; + a.a_uio = uio; + a.a_options = options; + a.a_context = context; + + thread_safe = THREAD_SAFE_FS(vp); + if (!thread_safe) { + if ( (error = lock_fsnode(vp, &funnel_state)) ) { + return (error); + } + } + error = (*vp->v_op[vnop_setxattr_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (error); +} + +errno_t +VNOP_REMOVEXATTR(vnode_t vp, const char *name, int options, vfs_context_t context) +{ + struct vnop_removexattr_args a; + int error; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_removexattr_desc; + a.a_vp = vp; + a.a_name = name; + a.a_options = options; + a.a_context = context; + + thread_safe = THREAD_SAFE_FS(vp); + if (!thread_safe) { + if ( (error = lock_fsnode(vp, &funnel_state)) ) { + return (error); + } + } + error = (*vp->v_op[vnop_removexattr_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (error); +} + +errno_t +VNOP_LISTXATTR(vnode_t vp, uio_t uio, size_t *size, int options, vfs_context_t context) +{ + struct vnop_listxattr_args a; + int error; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_listxattr_desc; + a.a_vp = vp; + a.a_uio = uio; + a.a_size = size; + a.a_options = options; + a.a_context = context; + + thread_safe = THREAD_SAFE_FS(vp); + if (!thread_safe) { + if ( (error = lock_fsnode(vp, &funnel_state)) ) { + return (error); + } + } + error = (*vp->v_op[vnop_listxattr_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return (error); +} + + +#if 0 +/* + *# + *#% blktooff vp = = = + *# + */ +struct vnop_blktooff_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + daddr64_t a_lblkno; 
+ off_t *a_offset; +}; +#endif /* 0*/ +errno_t +VNOP_BLKTOOFF(struct vnode *vp, daddr64_t lblkno, off_t *offset) +{ + int _err; + struct vnop_blktooff_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_blktooff_desc; + a.a_vp = vp; + a.a_lblkno = lblkno; + a.a_offset = offset; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + _err = (*vp->v_op[vnop_blktooff_desc.vdesc_offset])(&a); + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% offtoblk vp = = = + *# + */ +struct vnop_offtoblk_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + off_t a_offset; + daddr64_t *a_lblkno; +}; +#endif /* 0*/ +errno_t +VNOP_OFFTOBLK(struct vnode *vp, off_t offset, daddr64_t *lblkno) +{ + int _err; + struct vnop_offtoblk_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = &vnop_offtoblk_desc; + a.a_vp = vp; + a.a_offset = offset; + a.a_lblkno = lblkno; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + _err = (*vp->v_op[vnop_offtoblk_desc.vdesc_offset])(&a); + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (_err); +} + +#if 0 +/* + *# + *#% blockmap vp L L L + *# + */ +struct vnop_blockmap_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + off_t a_foffset; + size_t a_size; + daddr64_t *a_bpn; + size_t *a_run; + void *a_poff; + int a_flags; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_BLOCKMAP(struct vnode *vp, off_t foffset, size_t size, daddr64_t *bpn, size_t *run, void *poff, int flags, vfs_context_t context) +{ + int _err; + struct vnop_blockmap_args a; + int thread_safe; + int funnel_state = 0; + struct vfs_context acontext; + + if (context == NULL) { + acontext.vc_proc = current_proc(); + acontext.vc_ucred = kauth_cred_get(); + context = &acontext; + } + a.a_desc = &vnop_blockmap_desc; + a.a_vp = vp; + a.a_foffset = foffset; + a.a_size = size; + a.a_bpn = bpn; + a.a_run = run; + a.a_poff = poff; + a.a_flags = flags; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + funnel_state = thread_funnel_set(kernel_flock, TRUE); + } + _err = (*vp->v_op[vnop_blockmap_desc.vdesc_offset])(&a); + if (!thread_safe) { + (void) thread_funnel_set(kernel_flock, funnel_state); + } + return (_err); +} + +#if 0 +struct vnop_strategy_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; + +#endif /* 0*/ +errno_t +VNOP_STRATEGY(struct buf *bp) +{ + int _err; + struct vnop_strategy_args a; + a.a_desc = &vnop_strategy_desc; + a.a_bp = bp; + _err = (*buf_vnode(bp)->v_op[vnop_strategy_desc.vdesc_offset])(&a); + return (_err); +} + +#if 0 +struct vnop_bwrite_args { + struct vnodeop_desc *a_desc; + buf_t a_bp; +}; +#endif /* 0*/ +errno_t +VNOP_BWRITE(struct buf *bp) +{ + int _err; + struct vnop_bwrite_args a; + a.a_desc = &vnop_bwrite_desc; + a.a_bp = bp; + _err = (*buf_vnode(bp)->v_op[vnop_bwrite_desc.vdesc_offset])(&a); + return (_err); +} + +#if 0 +struct vnop_kqfilt_add_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + struct knote *a_kn; + vfs_context_t a_context; +}; +#endif +errno_t +VNOP_KQFILT_ADD(struct vnode *vp, struct knote *kn, vfs_context_t context) +{ + int _err; + struct vnop_kqfilt_add_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = VDESC(vnop_kqfilt_add); + a.a_vp = vp; + a.a_kn = kn; + a.a_context = context; + thread_safe = 
THREAD_SAFE_FS(vp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + return (_err); + } + } + _err = (*vp->v_op[vnop_kqfilt_add_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return(_err); +} + +#if 0 +struct vnop_kqfilt_remove_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + uintptr_t a_ident; + vfs_context_t a_context; +}; +#endif +errno_t +VNOP_KQFILT_REMOVE(struct vnode *vp, uintptr_t ident, vfs_context_t context) +{ + int _err; + struct vnop_kqfilt_remove_args a; + int thread_safe; + int funnel_state = 0; + + a.a_desc = VDESC(vnop_kqfilt_remove); + a.a_vp = vp; + a.a_ident = ident; + a.a_context = context; + thread_safe = THREAD_SAFE_FS(vp); + + if (!thread_safe) { + if ( (_err = lock_fsnode(vp, &funnel_state)) ) { + return (_err); + } + } + _err = (*vp->v_op[vnop_kqfilt_remove_desc.vdesc_offset])(&a); + if (!thread_safe) { + unlock_fsnode(vp, &funnel_state); + } + return(_err); +} + diff --git a/bsd/vfs/vfs_attrlist.c b/bsd/vfs/vfs_attrlist.c new file mode 100644 index 000000000..7716e41e2 --- /dev/null +++ b/bsd/vfs/vfs_attrlist.c @@ -0,0 +1,1632 @@ +/* + * Copyright (c) 1995-2005 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/namei.h> +#include <sys/kernel.h> +#include <sys/stat.h> +#include <sys/vnode_internal.h> +#include <sys/mount_internal.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> +#include <sys/uio_internal.h> +#include <sys/malloc.h> +#include <sys/attr.h> +#include <sys/sysproto.h> +#include <sys/xattr.h> +#include <sys/fsevents.h> +#include <kern/kalloc.h> +#include <miscfs/specfs/specdev.h> +#include <hfs/hfs.h> + +#define ATTR_TIME_SIZE -1 + +/* + * Structure describing the state of an in-progress attrlist operation. + */ +struct _attrlist_buf { + char *base; + char *fixedcursor; + char *varcursor; + ssize_t allocated; + ssize_t needed; +}; + + +/* + * Pack (count) bytes from (source) into (buf). + */ +static void +attrlist_pack_fixed(struct _attrlist_buf *ab, void *source, ssize_t count) +{ + ssize_t fit; + + /* how much room left in the buffer? 
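copies are clipped to the space that remains, while the cursor below always advances by the rounded-up size so later fields keep their 4-byte alignment.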
*/ + fit = imin(count, ab->allocated - (ab->fixedcursor - ab->base)); + if (fit > 0) + bcopy(source, ab->fixedcursor, fit); + + /* always move in increments of 4 */ + ab->fixedcursor += roundup(count, 4); +} +static void +attrlist_pack_variable2(struct _attrlist_buf *ab, const void *source, ssize_t count, const void *ext, ssize_t extcount) +{ + struct attrreference ar; + ssize_t fit; + + /* pack the reference to the variable object */ + ar.attr_dataoffset = ab->varcursor - ab->fixedcursor; + ar.attr_length = count + extcount; + attrlist_pack_fixed(ab, &ar, sizeof(ar)); + + /* calculate space and pack the variable object */ + fit = imin(count, ab->allocated - (ab->varcursor - ab->base)); + if (fit > 0) { + if (source != NULL) + bcopy(source, ab->varcursor, fit); + ab->varcursor += fit; + } + fit = imin(extcount, ab->allocated - (ab->varcursor - ab->base)); + if (fit > 0) { + if (ext != NULL) + bcopy(ext, ab->varcursor, fit); + ab->varcursor += fit; + } + /* always move in increments of 4 */ + ab->varcursor = (char *)roundup((uintptr_t)ab->varcursor, 4); +} +static void +attrlist_pack_variable(struct _attrlist_buf *ab, const void *source, ssize_t count) +{ + attrlist_pack_variable2(ab, source, count, NULL, 0); +} +static void +attrlist_pack_string(struct _attrlist_buf *ab, const char *source, ssize_t count) +{ + struct attrreference ar; + ssize_t fit, space; + + + /* + * Supplied count is character count of string text, excluding trailing nul + * which we always supply here. + */ + if (source == NULL) { + count = 0; + } else if (count == 0) { + count = strlen(source); + } + + /* + * Make the reference and pack it. + * Note that this is entirely independent of how much we get into + * the buffer. + */ + ar.attr_dataoffset = ab->varcursor - ab->fixedcursor; + ar.attr_length = count + 1; + attrlist_pack_fixed(ab, &ar, sizeof(ar)); + + /* calculate how much of the string text we can copy, and do that */ + space = ab->allocated - (ab->varcursor - ab->base); + fit = imin(count, space); + if (fit > 0) + bcopy(source, ab->varcursor, fit); + /* is there room for our trailing nul? */ + if (space > fit) + ab->varcursor[fit] = '\0'; + + /* always move in increments of 4 */ + ab->varcursor += roundup(count + 1, 4); +} + +#define ATTR_PACK(b, v) attrlist_pack_fixed(b, &v, sizeof(v)) +#define ATTR_PACK_CAST(b, t, v) \ + do { \ + t _f = (t)v; \ + ATTR_PACK(b, _f); \ + } while (0) + +#define ATTR_PACK_TIME(b, v, is64) \ + do { \ + if (is64) { \ + struct user_timespec us = {v.tv_sec, v.tv_nsec}; \ + ATTR_PACK(b, us); \ + } else { \ + ATTR_PACK(b, v); \ + } \ + } while(0) + + +/* + * Table-driven setup for all valid common/volume attributes. 
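+ * Each entry maps an ATTR_* bit onto the VFSATTR_* bits it depends on and + * the number of bytes it packs; ATTR_TIME_SIZE is a placeholder resolved + * against the caller's word size at parse time.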
+ */ +struct getvolattrlist_attrtab { + attrgroup_t attr; + uint64_t bits; +#define VFSATTR_BIT(b) (VFSATTR_ ## b) + ssize_t size; +}; +static struct getvolattrlist_attrtab getvolattrlist_common_tab[] = { + {ATTR_CMN_NAME, 0, sizeof(struct attrreference)}, + {ATTR_CMN_DEVID, 0, sizeof(dev_t)}, + {ATTR_CMN_FSID, 0, sizeof(fsid_t)}, + {ATTR_CMN_OBJTYPE, 0, sizeof(fsobj_type_t)}, + {ATTR_CMN_OBJTAG, 0, sizeof(fsobj_tag_t)}, + {ATTR_CMN_OBJID, 0, sizeof(fsobj_id_t)}, + {ATTR_CMN_OBJPERMANENTID, 0, sizeof(fsobj_id_t)}, + {ATTR_CMN_PAROBJID, 0, sizeof(fsobj_id_t)}, + {ATTR_CMN_SCRIPT, 0, sizeof(text_encoding_t)}, + {ATTR_CMN_CRTIME, VFSATTR_BIT(f_create_time), ATTR_TIME_SIZE}, + {ATTR_CMN_MODTIME, VFSATTR_BIT(f_modify_time), ATTR_TIME_SIZE}, + {ATTR_CMN_CHGTIME, VFSATTR_BIT(f_modify_time), ATTR_TIME_SIZE}, + {ATTR_CMN_ACCTIME, VFSATTR_BIT(f_access_time), ATTR_TIME_SIZE}, + {ATTR_CMN_BKUPTIME, VFSATTR_BIT(f_backup_time), ATTR_TIME_SIZE}, + {ATTR_CMN_FNDRINFO, 0, 32}, + {ATTR_CMN_OWNERID, 0, sizeof(uid_t)}, + {ATTR_CMN_GRPID, 0, sizeof(gid_t)}, + {ATTR_CMN_ACCESSMASK, 0, sizeof(uint32_t)}, + {ATTR_CMN_FLAGS, 0, sizeof(uint32_t)}, + {ATTR_CMN_USERACCESS, 0, sizeof(uint32_t)}, + {0, 0, 0} +}; + +static struct getvolattrlist_attrtab getvolattrlist_vol_tab[] = { + {ATTR_VOL_FSTYPE, 0, sizeof(uint32_t)}, + {ATTR_VOL_SIGNATURE, VFSATTR_BIT(f_signature), sizeof(uint32_t)}, + {ATTR_VOL_SIZE, VFSATTR_BIT(f_blocks), sizeof(off_t)}, + {ATTR_VOL_SPACEFREE, VFSATTR_BIT(f_bfree) | VFSATTR_BIT(f_bsize), sizeof(off_t)}, + {ATTR_VOL_SPACEAVAIL, VFSATTR_BIT(f_bavail) | VFSATTR_BIT(f_bsize), sizeof(off_t)}, + {ATTR_VOL_MINALLOCATION, VFSATTR_BIT(f_bsize), sizeof(off_t)}, + {ATTR_VOL_ALLOCATIONCLUMP, VFSATTR_BIT(f_bsize), sizeof(off_t)}, + {ATTR_VOL_IOBLOCKSIZE, VFSATTR_BIT(f_iosize), sizeof(uint32_t)}, + {ATTR_VOL_OBJCOUNT, VFSATTR_BIT(f_objcount), sizeof(uint32_t)}, + {ATTR_VOL_FILECOUNT, VFSATTR_BIT(f_filecount), sizeof(uint32_t)}, + {ATTR_VOL_DIRCOUNT, VFSATTR_BIT(f_dircount), sizeof(uint32_t)}, + {ATTR_VOL_MAXOBJCOUNT, VFSATTR_BIT(f_maxobjcount), sizeof(uint32_t)}, + {ATTR_VOL_MOUNTPOINT, 0, sizeof(struct attrreference)}, + {ATTR_VOL_NAME, VFSATTR_BIT(f_vol_name), sizeof(struct attrreference)}, + {ATTR_VOL_MOUNTFLAGS, 0, sizeof(uint32_t)}, + {ATTR_VOL_MOUNTEDDEVICE, 0, sizeof(struct attrreference)}, + {ATTR_VOL_ENCODINGSUSED, 0, sizeof(uint64_t)}, + {ATTR_VOL_CAPABILITIES, VFSATTR_BIT(f_capabilities), sizeof(vol_capabilities_attr_t)}, + {ATTR_VOL_ATTRIBUTES, VFSATTR_BIT(f_attributes), sizeof(vol_attributes_attr_t)}, + {ATTR_VOL_INFO, 0, 0}, + {0, 0, 0} +}; + +static int +getvolattrlist_parsetab(struct getvolattrlist_attrtab *tab, attrgroup_t attrs, struct vfs_attr *vsp, + ssize_t *sizep, int is_64bit) +{ + attrgroup_t recognised; + + recognised = 0; + do { + /* is this attribute set? */ + if (tab->attr & attrs) { + recognised |= tab->attr; + vsp->f_active |= tab->bits; + if (tab->size == ATTR_TIME_SIZE) { + if (is_64bit) { + *sizep += sizeof(struct user_timespec); + } else { + *sizep += sizeof(struct timespec); + } + } else { + *sizep += tab->size; + } + } + } while ((++tab)->attr != 0); + + /* check to make sure that we recognised all of the passed-in attributes */ + if (attrs & ~recognised) + return(EINVAL); + return(0); +} + +/* + * Given the attributes listed in alp, configure vap to request + * the data from a filesystem. + */ +static int +getvolattrlist_setupvfsattr(struct attrlist *alp, struct vfs_attr *vsp, ssize_t *sizep, int is_64bit) +{ + int error; + + /* + * Parse the above tables. 
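+ * The running total starts with the uint32_t length word that prefixes + * every attrlist result buffer.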
+ */ + *sizep = sizeof(uint32_t); /* length count */ + if (alp->commonattr && + (error = getvolattrlist_parsetab(getvolattrlist_common_tab, alp->commonattr, vsp, sizep, is_64bit)) != 0) + return(error); + if (alp->volattr && + (error = getvolattrlist_parsetab(getvolattrlist_vol_tab, alp->volattr, vsp, sizep, is_64bit)) != 0) + return(error); + + return(0); +} + +/* + * Table-driven setup for all valid common/dir/file/fork attributes against files. + */ +struct getattrlist_attrtab { + attrgroup_t attr; + uint64_t bits; +#define VATTR_BIT(b) (VNODE_ATTR_ ## b) + ssize_t size; + kauth_action_t action; +}; +static struct getattrlist_attrtab getattrlist_common_tab[] = { + {ATTR_CMN_NAME, VATTR_BIT(va_name), sizeof(struct attrreference), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_DEVID, 0, sizeof(dev_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_FSID, VATTR_BIT(va_fsid), sizeof(fsid_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_OBJTYPE, 0, sizeof(fsobj_type_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_OBJTAG, 0, sizeof(fsobj_tag_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_OBJID, VATTR_BIT(va_fileid) | VATTR_BIT(va_linkid), sizeof(fsobj_id_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_OBJPERMANENTID, VATTR_BIT(va_fileid) | VATTR_BIT(va_linkid), sizeof(fsobj_id_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_PAROBJID, VATTR_BIT(va_parentid), sizeof(fsobj_id_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_SCRIPT, VATTR_BIT(va_encoding), sizeof(text_encoding_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_CRTIME, VATTR_BIT(va_create_time), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_MODTIME, VATTR_BIT(va_modify_time), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_CHGTIME, VATTR_BIT(va_change_time), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_ACCTIME, VATTR_BIT(va_access_time), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_BKUPTIME, VATTR_BIT(va_backup_time), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_FNDRINFO, 0, 32, KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_OWNERID, VATTR_BIT(va_uid), sizeof(uid_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_GRPID, VATTR_BIT(va_gid), sizeof(gid_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_ACCESSMASK, VATTR_BIT(va_mode), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_FLAGS, VATTR_BIT(va_flags), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_USERACCESS, 0, sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_EXTENDED_SECURITY, VATTR_BIT(va_acl), sizeof(struct attrreference), KAUTH_VNODE_READ_SECURITY}, + {ATTR_CMN_UUID, VATTR_BIT(va_uuuid), sizeof(guid_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_GRPUUID, VATTR_BIT(va_guuid), sizeof(guid_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {0, 0, 0, 0} +}; +static struct getattrlist_attrtab getattrlist_dir_tab[] = { + {ATTR_DIR_LINKCOUNT, VATTR_BIT(va_nlink), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_DIR_ENTRYCOUNT, VATTR_BIT(va_nchildren), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, + /* ATTR_DIR_ENTRYCOUNT falls back to va_nlink-2 if va_nchildren isn't supported, so request va_nlink just in case */ + {ATTR_DIR_ENTRYCOUNT, VATTR_BIT(va_nlink), 0, KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_DIR_MOUNTSTATUS, 0, sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {0, 0, 0, 0} +}; +static struct getattrlist_attrtab getattrlist_file_tab[] = { + {ATTR_FILE_LINKCOUNT, VATTR_BIT(va_nlink), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_FILE_TOTALSIZE, VATTR_BIT(va_total_size), sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES}, + 
{ATTR_FILE_ALLOCSIZE, VATTR_BIT(va_total_alloc) | VATTR_BIT(va_total_size), sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_FILE_IOBLOCKSIZE, VATTR_BIT(va_iosize), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_FILE_DEVTYPE, VATTR_BIT(va_rdev), sizeof(dev_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_FILE_DATALENGTH, VATTR_BIT(va_total_size) | VATTR_BIT(va_data_size), sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_FILE_DATAALLOCSIZE, VATTR_BIT(va_total_alloc)| VATTR_BIT(va_data_alloc), sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_FILE_RSRCLENGTH, 0, sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_FILE_RSRCALLOCSIZE, 0, sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES}, + {0, 0, 0, 0} +}; + +static int +getattrlist_parsetab(struct getattrlist_attrtab *tab, attrgroup_t attrs, struct vnode_attr *vap, + ssize_t *sizep, kauth_action_t *actionp, int is_64bit) +{ + attrgroup_t recognised; + + recognised = 0; + do { + /* is this attribute set? */ + if (tab->attr & attrs) { + recognised |= tab->attr; + vap->va_active |= tab->bits; + if (tab->size == ATTR_TIME_SIZE) { + if (is_64bit) { + *sizep += sizeof(struct user_timespec); + } else { + *sizep += sizeof(struct timespec); + } + } else { + *sizep += tab->size; + } + *actionp |= tab->action; + } + } while ((++tab)->attr != 0); + + /* check to make sure that we recognised all of the passed-in attributes */ + if (attrs & ~recognised) + return(EINVAL); + return(0); +} + +/* + * Given the attributes listed in alp, configure vap to request + * the data from a filesystem. + */ +static int +getattrlist_setupvattr(struct attrlist *alp, struct vnode_attr *vap, ssize_t *sizep, kauth_action_t *actionp, int is_64bit, int isdir) +{ + int error; + + /* + * Parse the above tables. + */ + *sizep = sizeof(uint32_t); /* length count */ + *actionp = 0; + if (alp->commonattr && + (error = getattrlist_parsetab(getattrlist_common_tab, alp->commonattr, vap, sizep, actionp, is_64bit)) != 0) + return(error); + if (isdir && alp->dirattr && + (error = getattrlist_parsetab(getattrlist_dir_tab, alp->dirattr, vap, sizep, actionp, is_64bit)) != 0) + return(error); + if (!isdir && alp->fileattr && + (error = getattrlist_parsetab(getattrlist_file_tab, alp->fileattr, vap, sizep, actionp, is_64bit)) != 0) + return(error); + + return(0); +} + + +/* + * Find something resembling a terminal component name in the mountedonname for vp + * + */ +static void +getattrlist_findnamecomp(const char *mn, const char **np, ssize_t *nl) +{ + int counting; + const char *cp; + + /* + * We're looking for the last sequence of non / characters, but + * not including any trailing / characters. + */ + *np = NULL; + *nl = 0; + counting = 0; + for (cp = mn; *cp != 0; cp++) { + if (!counting) { + /* start of run of chars */ + if (*cp != '/') { + *np = cp; + counting = 1; + } + } else { + /* end of run of chars */ + if (*cp == '/') { + *nl = cp - *np; + counting = 0; + } + } + } + /* need to close run? */ + if (counting) + *nl = cp - *np; +} + + +static int +getvolattrlist(vnode_t vp, struct getattrlist_args *uap, struct attrlist *alp, vfs_context_t ctx, int is_64bit) +{ + struct vfs_attr vs; + struct vnode_attr va; + struct _attrlist_buf ab; + int error; + ssize_t fixedsize, varsize; + const char *cnp; + ssize_t cnl; + mount_t mnt; + + ab.base = NULL; + VATTR_INIT(&va); + VFSATTR_INIT(&vs); + vs.f_vol_name = NULL; + mnt = vp->v_mount; + + + /* + * For now, the vnode must be the root of its filesystem. 
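+ * (anything else fails with EINVAL just below).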
+ * To relax this, we need to be able to find the root vnode of a filesystem + * from any vnode in the filesystem. + */ + if (!vnode_isvroot(vp)) { + error = EINVAL; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: volume attributes requested but not the root of a filesystem"); + goto out; + } + + /* + * Set up the vfs_attr structure and call the filesystem. + */ + if ((error = getvolattrlist_setupvfsattr(alp, &vs, &fixedsize, is_64bit)) != 0) { + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: setup for request failed"); + goto out; + } + if (vs.f_active != 0) { + /* If we're going to ask for f_vol_name, allocate a buffer to point it at */ + if (VFSATTR_IS_ACTIVE(&vs, f_vol_name)) { + vs.f_vol_name = (char *) kalloc(MAXPATHLEN); + if (vs.f_vol_name == NULL) { + error = ENOMEM; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: could not allocate f_vol_name buffer"); + goto out; + } + } + + VFS_DEBUG(ctx, vp, "ATTRLIST - calling to get %016llx with supported %016llx", vs.f_active, vs.f_supported); + if ((error = vfs_getattr(mnt, &vs, ctx)) != 0) { + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: filesystem returned %d", error); + goto out; + } + + /* + * Did we ask for something the filesystem doesn't support? + */ + if (!VFSATTR_ALL_SUPPORTED(&vs)) { + /* default value for volume subtype */ + if (VFSATTR_IS_ACTIVE(&vs, f_fssubtype) + && !VFSATTR_IS_SUPPORTED(&vs, f_fssubtype)) + VFSATTR_RETURN(&vs, f_fssubtype, 0); + + /* + * If the file system didn't supply f_signature, then + * default it to 'BD', which is the generic signature + * that most Carbon file systems should return. + */ + if (VFSATTR_IS_ACTIVE(&vs, f_signature) + && !VFSATTR_IS_SUPPORTED(&vs, f_signature)) + VFSATTR_RETURN(&vs, f_signature, 0x4244); + + /* default for block size */ + if (VFSATTR_IS_ACTIVE(&vs, f_bsize) + && !VFSATTR_IS_SUPPORTED(&vs, f_bsize)) + VFSATTR_RETURN(&vs, f_bsize, mnt->mnt_devblocksize); + + /* check to see if our fixups were enough */ + if (!VFSATTR_ALL_SUPPORTED(&vs)) { + error = EINVAL; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: could not get all requested volume attributes"); + VFS_DEBUG(ctx, vp, "ATTRLIST - wanted %016llx got %016llx missing %016llx", + vs.f_active, vs.f_supported, vs.f_active & ~vs.f_supported); + goto out; + } + } + } + + /* + * Some fields require data from the root vp + */ + if (alp->commonattr & (ATTR_CMN_OWNERID | ATTR_CMN_GRPID | ATTR_CMN_ACCESSMASK | ATTR_CMN_FLAGS | ATTR_CMN_SCRIPT)) { + VATTR_WANTED(&va, va_uid); + VATTR_WANTED(&va, va_gid); + VATTR_WANTED(&va, va_mode); + VATTR_WANTED(&va, va_flags); + VATTR_WANTED(&va, va_encoding); + + if ((error = vnode_getattr(vp, &va, ctx)) != 0) { + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: could not fetch attributes from root vnode", vp); + goto out; + } + + if (VATTR_IS_ACTIVE(&va, va_encoding) && !VATTR_IS_SUPPORTED(&va, va_encoding)) + VATTR_RETURN(&va, va_encoding, 0x7e /* kTextEncodingMacUnicode */); + } + + /* + * Compute variable-size buffer requirements. + */ + varsize = 0; + if (alp->commonattr & ATTR_CMN_NAME) { + if (vp->v_mount->mnt_vfsstat.f_mntonname[1] == 0x00 && + vp->v_mount->mnt_vfsstat.f_mntonname[0] == '/') { + /* special case for boot volume. Use root name when it's + * available (which is the volume name) or just the mount on + * name of "/". we must do this for binary compatibility with + * pre Tiger code. 
returning nothing for the boot volume name + * breaks installers - 3961058 + */ + cnp = vnode_getname(vp); + if (cnp == NULL) { + /* just use "/" as name */ + cnp = &vp->v_mount->mnt_vfsstat.f_mntonname[0]; + } + cnl = strlen(cnp); + } + else { + getattrlist_findnamecomp(vp->v_mount->mnt_vfsstat.f_mntonname, &cnp, &cnl); + } + if (alp->commonattr & ATTR_CMN_NAME) + varsize += roundup(cnl + 1, 4); + } + if (alp->volattr & ATTR_VOL_MOUNTPOINT) + varsize += roundup(strlen(mnt->mnt_vfsstat.f_mntonname) + 1, 4); + if (alp->volattr & ATTR_VOL_NAME) { + vs.f_vol_name[MAXPATHLEN-1] = '\0'; /* Ensure nul-termination */ + varsize += roundup(strlen(vs.f_vol_name) + 1, 4); + } + if (alp->volattr & ATTR_VOL_MOUNTEDDEVICE) + varsize += roundup(strlen(mnt->mnt_vfsstat.f_mntfromname) + 1, 4); + + /* + * Allocate a target buffer for attribute results. + * Note that since we won't ever copy out more than the caller requested, + * we never need to allocate more than they offer. + */ + ab.allocated = imin(uap->bufferSize, fixedsize + varsize); + if (ab.allocated > ATTR_MAX_BUFFER) { + error = ENOMEM; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: buffer size too large (%d limit %d)", ab.allocated, ATTR_MAX_BUFFER); + goto out; + } + MALLOC(ab.base, char *, ab.allocated, M_TEMP, M_WAITOK); + if (ab.base == NULL) { + error = ENOMEM; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: could not allocate %d for copy buffer", ab.allocated); + goto out; + } + + /* + * Pack results into the destination buffer. + */ + ab.fixedcursor = ab.base + sizeof(uint32_t); + ab.varcursor = ab.base + fixedsize; + ab.needed = fixedsize + varsize; + + /* common attributes **************************************************/ + if (alp->commonattr & ATTR_CMN_NAME) + attrlist_pack_string(&ab, cnp, cnl); + if (alp->commonattr & ATTR_CMN_DEVID) + ATTR_PACK_CAST(&ab, dev_t, mnt->mnt_vfsstat.f_fsid.val[0]); + if (alp->commonattr & ATTR_CMN_FSID) + ATTR_PACK(&ab, mnt->mnt_vfsstat.f_fsid); + if (alp->commonattr & ATTR_CMN_OBJTYPE) + ATTR_PACK_CAST(&ab, fsobj_type_t, 0); + if (alp->commonattr & ATTR_CMN_OBJTAG) + ATTR_PACK_CAST(&ab, fsobj_tag_t, vp->v_tag); + if (alp->commonattr & ATTR_CMN_OBJID) { + fsobj_id_t f = {0, 0}; + ATTR_PACK(&ab, f); + } + if (alp->commonattr & ATTR_CMN_OBJPERMANENTID) { + fsobj_id_t f = {0, 0}; + ATTR_PACK(&ab, f); + } + if (alp->commonattr & ATTR_CMN_PAROBJID) { + fsobj_id_t f = {0, 0}; + ATTR_PACK(&ab, f); + } + /* note that this returns the encoding for the volume name, not the node name */ + if (alp->commonattr & ATTR_CMN_SCRIPT) + ATTR_PACK_CAST(&ab, text_encoding_t, va.va_encoding); + if (alp->commonattr & ATTR_CMN_CRTIME) + ATTR_PACK_TIME(&ab, vs.f_create_time, is_64bit); + if (alp->commonattr & ATTR_CMN_MODTIME) + ATTR_PACK_TIME(&ab, vs.f_modify_time, is_64bit); + if (alp->commonattr & ATTR_CMN_CHGTIME) + ATTR_PACK_TIME(&ab, vs.f_modify_time, is_64bit); + if (alp->commonattr & ATTR_CMN_ACCTIME) + ATTR_PACK_TIME(&ab, vs.f_access_time, is_64bit); + if (alp->commonattr & ATTR_CMN_BKUPTIME) + ATTR_PACK_TIME(&ab, vs.f_backup_time, is_64bit); + if (alp->commonattr & ATTR_CMN_FNDRINFO) { + char f[32]; + /* + * This attribute isn't really Finder Info, at least for HFS. 
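+ * HFS supplies it via the HFS_GET_BOOT_INFO ioctl below; for any other + * filesystem the field is simply zeroed.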
+ */ + if (vp->v_tag == VT_HFS) { + if ((error = VNOP_IOCTL(vp, HFS_GET_BOOT_INFO, (caddr_t)&f, 0, ctx)) != 0) + goto out; + } else { + /* XXX we could at least pass out the volume UUID here */ + bzero(&f, sizeof(f)); + } + attrlist_pack_fixed(&ab, f, sizeof(f)); + } + if (alp->commonattr & ATTR_CMN_OWNERID) + ATTR_PACK(&ab, va.va_uid); + if (alp->commonattr & ATTR_CMN_GRPID) + ATTR_PACK(&ab, va.va_gid); + if (alp->commonattr & ATTR_CMN_ACCESSMASK) + ATTR_PACK_CAST(&ab, uint32_t, va.va_mode); + if (alp->commonattr & ATTR_CMN_FLAGS) + ATTR_PACK(&ab, va.va_flags); + if (alp->commonattr & ATTR_CMN_USERACCESS) { /* XXX this is expensive and also duplicate work */ + uint32_t perms = 0; + if (vnode_isdir(vp)) { + if (vnode_authorize(vp, NULL, + KAUTH_VNODE_ACCESS | KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY | KAUTH_VNODE_DELETE_CHILD, ctx) == 0) + perms |= W_OK; + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_LIST_DIRECTORY, ctx) == 0) + perms |= R_OK; + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_SEARCH, ctx) == 0) + perms |= X_OK; + } else { + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA, ctx) == 0) + perms |= W_OK; + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_READ_DATA, ctx) == 0) + perms |= R_OK; + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_EXECUTE, ctx) == 0) + perms |= X_OK; + } + KAUTH_DEBUG("ATTRLIST - returning user access %x", perms); + ATTR_PACK(&ab, perms); + } + + /* volume attributes **************************************************/ + + if (alp->volattr & ATTR_VOL_FSTYPE) + ATTR_PACK_CAST(&ab, uint32_t, vfs_typenum(mnt)); + if (alp->volattr & ATTR_VOL_SIGNATURE) + ATTR_PACK_CAST(&ab, uint32_t, vs.f_signature); + if (alp->volattr & ATTR_VOL_SIZE) + ATTR_PACK_CAST(&ab, off_t, vs.f_bsize * vs.f_blocks); + if (alp->volattr & ATTR_VOL_SPACEFREE) + ATTR_PACK_CAST(&ab, off_t, vs.f_bsize * vs.f_bfree); + if (alp->volattr & ATTR_VOL_SPACEAVAIL) + ATTR_PACK_CAST(&ab, off_t, vs.f_bsize * vs.f_bavail); + if (alp->volattr & ATTR_VOL_MINALLOCATION) + ATTR_PACK_CAST(&ab, off_t, vs.f_bsize); + if (alp->volattr & ATTR_VOL_ALLOCATIONCLUMP) + ATTR_PACK_CAST(&ab, off_t, vs.f_bsize); /* not strictly true */ + if (alp->volattr & ATTR_VOL_IOBLOCKSIZE) + ATTR_PACK_CAST(&ab, uint32_t, vs.f_iosize); + if (alp->volattr & ATTR_VOL_OBJCOUNT) + ATTR_PACK_CAST(&ab, uint32_t, vs.f_objcount); + if (alp->volattr & ATTR_VOL_FILECOUNT) + ATTR_PACK_CAST(&ab, uint32_t, vs.f_filecount); + if (alp->volattr & ATTR_VOL_DIRCOUNT) + ATTR_PACK_CAST(&ab, uint32_t, vs.f_dircount); + if (alp->volattr & ATTR_VOL_MAXOBJCOUNT) + ATTR_PACK_CAST(&ab, uint32_t, vs.f_maxobjcount); + if (alp->volattr & ATTR_VOL_MOUNTPOINT) + attrlist_pack_string(&ab, mnt->mnt_vfsstat.f_mntonname, 0); + if (alp->volattr & ATTR_VOL_NAME) + attrlist_pack_string(&ab, vs.f_vol_name, 0); + if (alp->volattr & ATTR_VOL_MOUNTFLAGS) + ATTR_PACK_CAST(&ab, uint32_t, mnt->mnt_flag); + if (alp->volattr & ATTR_VOL_MOUNTEDDEVICE) + attrlist_pack_string(&ab, mnt->mnt_vfsstat.f_mntfromname, 0); + if (alp->volattr & ATTR_VOL_ENCODINGSUSED) + ATTR_PACK_CAST(&ab, uint64_t, ~0LL); /* return all encodings */ + if (alp->volattr & ATTR_VOL_CAPABILITIES) { + /* fix up volume capabilities */ + if (vfs_extendedsecurity(mnt)) { + vs.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] |= VOL_CAP_INT_EXTENDED_SECURITY; + } else { + vs.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &= ~VOL_CAP_INT_EXTENDED_SECURITY; + } + 
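/* this bit is decided here at the VFS layer, so it is always reported as valid */ +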
vs.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] |= VOL_CAP_INT_EXTENDED_SECURITY; + ATTR_PACK(&ab, vs.f_capabilities); + } + if (alp->volattr & ATTR_VOL_ATTRIBUTES) { + /* fix up volume attribute information */ + if (vfs_extendedsecurity(mnt)) { + vs.f_attributes.validattr.commonattr |= (ATTR_CMN_EXTENDED_SECURITY | ATTR_CMN_UUID | ATTR_CMN_GRPUUID); + } else { + vs.f_attributes.validattr.commonattr &= ~(ATTR_CMN_EXTENDED_SECURITY | ATTR_CMN_UUID | ATTR_CMN_GRPUUID); + vs.f_attributes.nativeattr.commonattr &= ~(ATTR_CMN_EXTENDED_SECURITY | ATTR_CMN_UUID | ATTR_CMN_GRPUUID); + } + ATTR_PACK(&ab, vs.f_attributes); + } + + /* diagnostic */ + if ((ab.fixedcursor - ab.base) != fixedsize) + panic("packed field size mismatch; allocated %d but packed %d for common %08x vol %08x", + fixedsize, ab.fixedcursor - ab.base, alp->commonattr, alp->volattr); + if (ab.varcursor != (ab.base + ab.needed)) + panic("packed variable field size mismatch; used %d but expected %d", ab.varcursor - ab.base, ab.needed); + + /* + * In the compatible case, we report the smaller of the required and returned sizes. + * If the FSOPT_REPORT_FULLSIZE option is supplied, we report the full (required) size + * of the result buffer, even if we copied less out. The caller knows how big a buffer + * they gave us, so they can always check for truncation themselves. + */ + *(uint32_t *)ab.base = (uap->options & FSOPT_REPORT_FULLSIZE) ? ab.needed : imin(ab.allocated, ab.needed); + + error = copyout(ab.base, uap->attributeBuffer, ab.allocated); + +out: + if (vs.f_vol_name != NULL) + kfree(vs.f_vol_name, MAXPATHLEN); + if (ab.base != NULL) + FREE(ab.base, M_TEMP); + VFS_DEBUG(ctx, vp, "ATTRLIST - returning %d", error); + return(error); +} + +/* + * Obtain attribute information about a filesystem object. + */ +int +getattrlist(struct proc *p, struct getattrlist_args *uap, __unused register_t *retval) +{ + struct attrlist al; + struct vnode_attr va; + struct vfs_context context, *ctx; + struct nameidata nd; + struct _attrlist_buf ab; + vnode_t vp; + u_long nameiflags; + kauth_action_t action; + ssize_t fixedsize, varsize; + const char *cnp; + char *vname = NULL; + ssize_t cnl; + int error; + + context.vc_proc = p; + context.vc_ucred = kauth_cred_get(); + ctx = &context; + vp = NULL; + error = 0; + VATTR_INIT(&va); + va.va_name = NULL; + ab.base = NULL; + cnp = "unknown"; + cnl = 0; + + /* + * Look up the file. + */ + nameiflags = AUDITVNPATH1; + if (!(uap->options & FSOPT_NOFOLLOW)) + nameiflags |= FOLLOW; + NDINIT(&nd, LOOKUP, nameiflags, UIO_USERSPACE, uap->path, &context); + + if ((error = namei(&nd)) != 0) + goto out; + vp = nd.ni_vp; + nameidone(&nd); + + /* + * Fetch the attribute request. + */ + if ((error = copyin(uap->alist, &al, sizeof(al))) != 0) + goto out; + if (al.bitmapcount != ATTR_BIT_MAP_COUNT) { + error = EINVAL; + goto out; + } + + VFS_DEBUG(ctx, vp, "%p ATTRLIST - %s request common %08x vol %08x file %08x dir %08x fork %08x %sfollow on '%s'", + vp, p->p_comm, al.commonattr, al.volattr, al.fileattr, al.dirattr, al.forkattr, + (uap->options & FSOPT_NOFOLLOW) ? "no":"", vp->v_name); + + /* + * It is legal to request volume or file attributes, + * but not both. 
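+ * A volume-attribute request is handed off wholesale to getvolattrlist() + * just below.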
+ */ + if (al.volattr) { + if (al.fileattr || al.dirattr || al.forkattr) { + error = EINVAL; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: mixed volume/file/directory/fork attributes"); + goto out; + } + /* handle volume attribute request */ + error = getvolattrlist(vp, uap, &al, &context, proc_is64bit(p)); + goto out; + } + + /* + * Set up the vnode_attr structure and authorise. + */ + if ((error = getattrlist_setupvattr(&al, &va, &fixedsize, &action, proc_is64bit(p), vnode_isdir(vp))) != 0) { + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: setup for request failed"); + goto out; + } + if ((error = vnode_authorize(vp, NULL, action, &context)) != 0) { + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: authorisation failed/denied"); + goto out; + } + + if (va.va_active != 0) { + /* + * If we're going to ask for va_name, allocate a buffer to point it at + */ + if (VATTR_IS_ACTIVE(&va, va_name)) { + va.va_name = (char *) kalloc(MAXPATHLEN); + if (va.va_name == NULL) { + error = ENOMEM; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: cannot allocate va_name buffer"); + goto out; + } + } + + /* + * Call the filesystem. + */ + if ((error = vnode_getattr(vp, &va, &context)) != 0) { + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: filesystem returned %d", error); + goto out; + } + + /* did we ask for something the filesystem doesn't support? */ + if (!VATTR_ALL_SUPPORTED(&va)) { + + /* + * There are a couple of special cases. If we are after object IDs, + * we can make do with va_fileid. + */ + if ((al.commonattr & (ATTR_CMN_OBJID | ATTR_CMN_OBJPERMANENTID)) && !VATTR_IS_SUPPORTED(&va, va_linkid)) + VATTR_CLEAR_ACTIVE(&va, va_linkid); /* forget we wanted this */ + /* + * Many (most?) filesystems don't know their parent object id. We can get it the + * hard way. + */ + if ((al.commonattr & ATTR_CMN_PAROBJID) && !VATTR_IS_SUPPORTED(&va, va_parentid)) + VATTR_CLEAR_ACTIVE(&va, va_parentid); + /* + * And we can report datasize/alloc from total. + */ + if ((al.fileattr & ATTR_FILE_DATALENGTH) && !VATTR_IS_SUPPORTED(&va, va_data_size)) + VATTR_CLEAR_ACTIVE(&va, va_data_size); + if ((al.fileattr & ATTR_FILE_DATAALLOCSIZE) && !VATTR_IS_SUPPORTED(&va, va_data_alloc)) + VATTR_CLEAR_ACTIVE(&va, va_data_alloc); + + /* + * If we don't have an encoding, go with UTF-8 + */ + if ((al.commonattr & ATTR_CMN_SCRIPT) && !VATTR_IS_SUPPORTED(&va, va_encoding)) + VATTR_RETURN(&va, va_encoding, 0x7e /* kTextEncodingMacUnicode */); + + /* + * If we don't have a name, we'll get one from the vnode or mount point. + */ + if ((al.commonattr & ATTR_CMN_NAME) && !VATTR_IS_SUPPORTED(&va, va_name)) { + VATTR_CLEAR_ACTIVE(&va, va_name); + } + + /* + * We used to return va_nlink-2 for ATTR_DIR_ENTRYCOUNT. The va_nchildren + * field is preferred, but we'll fall back to va_nlink-2 for compatibility + * with file systems which haven't adopted va_nchildren. Note: the "- 2" + * reflects the "." and ".." entries which are reported via POSIX APIs, but + * not via Carbon (since they don't in fact exist in HFS). + */ + if ((al.dirattr & ATTR_DIR_ENTRYCOUNT) && !VATTR_IS_SUPPORTED(&va, va_nchildren) && + VATTR_IS_SUPPORTED(&va, va_nlink)) { + VATTR_RETURN(&va, va_nchildren, va.va_nlink - 2); + } + + /* check again */ + if (!VATTR_ALL_SUPPORTED(&va)) { + error = ENOTSUP; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: could not get all requested file attributes"); + VFS_DEBUG(ctx, vp, "ATTRLIST - have %016llx wanted %016llx missing %016llx", + va.va_supported, va.va_active, va.va_active & ~va.va_supported); + goto out; + } + } + } + + /* + * Compute variable-space requirements. 
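[Editorial note] The fallback logic above leans on the vnode_attr request/response pattern: the caller marks attributes active, the filesystem marks what it actually supplied, and anything still missing is either synthesized or cleared. A compressed sketch of that pattern using the same VATTR_* macros; the function itself is hypothetical, not part of the patch:

    static void
    entrycount_example(vnode_t vp, vfs_context_t ctx)
    {
        struct vnode_attr lva;

        VATTR_INIT(&lva);                  /* nothing requested yet */
        VATTR_WANTED(&lva, va_nchildren);  /* preferred source */
        VATTR_WANTED(&lva, va_nlink);      /* fallback source */

        if (vnode_getattr(vp, &lva, ctx) != 0)
            return;

        if (!VATTR_IS_SUPPORTED(&lva, va_nchildren) &&
            VATTR_IS_SUPPORTED(&lva, va_nlink)) {
            /* same compatibility fallback as above: va_nlink counts
             * "." and "..", which are not real directory entries */
            VATTR_RETURN(&lva, va_nchildren, lva.va_nlink - 2);
        }
    }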
+ */ + varsize = 0; /* length count */ + if (al.commonattr & ATTR_CMN_NAME) { + if (VATTR_IS_SUPPORTED(&va, va_name)) { + va.va_name[MAXPATHLEN-1] = '\0'; /* Ensure nul-termination */ + cnp = va.va_name; + cnl = strlen(cnp); + } else { + if (vnode_isvroot(vp)) { + if (vp->v_mount->mnt_vfsstat.f_mntonname[1] == 0x00 && + vp->v_mount->mnt_vfsstat.f_mntonname[0] == '/') { + /* Special case for the boot volume: use the root name when it's + * available (which is the volume name), or just the mount-on + * name of "/". We must do this for binary compatibility with + * pre-Tiger code; returning nothing for the boot volume name + * breaks installers - 3961058 + */ + cnp = vname = vnode_getname(vp); + if (cnp == NULL) { + /* just use "/" as name */ + cnp = &vp->v_mount->mnt_vfsstat.f_mntonname[0]; + } + cnl = strlen(cnp); + } + else { + getattrlist_findnamecomp(vp->v_mount->mnt_vfsstat.f_mntonname, &cnp, &cnl); + } + } else { + cnp = vname = vnode_getname(vp); + cnl = 0; + if (cnp != NULL) { + cnl = strlen(cnp); + } + } + } + varsize += roundup(cnl + 1, 4); + } + + /* + * We have a kauth_acl_t but we will be returning a kauth_filesec_t. + * + * XXX This needs to change at some point; since the blob is opaque in + * user-space this is OK. + */ + if ((al.commonattr & ATTR_CMN_EXTENDED_SECURITY) && + VATTR_IS_SUPPORTED(&va, va_acl) && + (va.va_acl != NULL)) + varsize += roundup(KAUTH_FILESEC_SIZE(va.va_acl->acl_entrycount), 4); + + /* + * Allocate a target buffer for attribute results. + * Note that since we won't ever copy out more than the caller requested, + * we never need to allocate more than they offer. + */ + ab.allocated = imin(uap->bufferSize, fixedsize + varsize); + if (ab.allocated > ATTR_MAX_BUFFER) { + error = ENOMEM; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: buffer size too large (%d limit %d)", ab.allocated, ATTR_MAX_BUFFER); + goto out; + } + MALLOC(ab.base, char *, ab.allocated, M_TEMP, M_WAITOK); + if (ab.base == NULL) { + error = ENOMEM; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: could not allocate %d for copy buffer", ab.allocated); + goto out; + } + + /* + * Pack results into the destination buffer. + */ + ab.fixedcursor = ab.base + sizeof(uint32_t); + ab.varcursor = ab.base + fixedsize; + ab.needed = fixedsize + varsize; + + /* common attributes **************************************************/ + if (al.commonattr & ATTR_CMN_NAME) + attrlist_pack_string(&ab, cnp, cnl); + if (al.commonattr & ATTR_CMN_DEVID) + ATTR_PACK_CAST(&ab, dev_t, vp->v_mount->mnt_vfsstat.f_fsid.val[0]); + if (al.commonattr & ATTR_CMN_FSID) + ATTR_PACK(&ab, vp->v_mount->mnt_vfsstat.f_fsid); + if (al.commonattr & ATTR_CMN_OBJTYPE) + ATTR_PACK_CAST(&ab, fsobj_type_t, vp->v_type); + if (al.commonattr & ATTR_CMN_OBJTAG) + ATTR_PACK_CAST(&ab, fsobj_tag_t, vp->v_tag); + if (al.commonattr & ATTR_CMN_OBJID) { + fsobj_id_t f; + /* + * Carbon can't deal with us reporting the target ID + * for links. So we ask the filesystem to give us the + * source ID as well, and if it gives us one, we use + * it instead. + */ + if (VATTR_IS_SUPPORTED(&va, va_linkid)) { + f.fid_objno = va.va_linkid; + } else { + f.fid_objno = va.va_fileid; + } + f.fid_generation = 0; + ATTR_PACK(&ab, f); + } + if (al.commonattr & ATTR_CMN_OBJPERMANENTID) { + fsobj_id_t f; + /* + * Carbon can't deal with us reporting the target ID + * for links. So we ask the filesystem to give us the + * source ID as well, and if it gives us one, we use + * it instead. 
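[Editorial note] Variable-length attributes such as the name are accounted for above with roundup(len, 4) and emitted as an attrreference in the fixed-size area pointing into the trailing variable area. From user space the decoding mirrors that layout; a sketch against the public interface, illustrative only:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/attr.h>

    int
    print_name(const char *path)
    {
        struct attrlist al;
        uint32_t buf[128];

        memset(&al, 0, sizeof(al));
        al.bitmapcount = ATTR_BIT_MAP_COUNT;
        al.commonattr = ATTR_CMN_NAME;

        if (getattrlist(path, &al, buf, sizeof(buf), 0) != 0)
            return (-1);

        /* the fixed area begins after the leading size word;
         * attr_dataoffset is relative to the attrreference itself */
        attrreference_t *ref = (attrreference_t *)&buf[1];
        printf("name: %s\n", (char *)ref + ref->attr_dataoffset);
        return (0);
    }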
+ */ + if (VATTR_IS_SUPPORTED(&va, va_linkid)) { + f.fid_objno = va.va_linkid; + } else { + f.fid_objno = va.va_fileid; + } + f.fid_generation = 0; + ATTR_PACK(&ab, f); + } + if (al.commonattr & ATTR_CMN_PAROBJID) { + fsobj_id_t f; + /* + * If the filesystem doesn't know the parent ID, we can + * try to get it via v->v_parent. Don't need to worry + * about links here, as we don't allow hardlinks to + * directories. + */ + if (VATTR_IS_SUPPORTED(&va, va_parentid)) { + f.fid_objno = va.va_parentid; + } else { + struct vnode_attr lva; + vnode_t pvp; + + pvp = vnode_getparent(vp); + + if (pvp == NULLVP) { + error = ENOTSUP; + goto out; + } + VATTR_INIT(&lva); + VATTR_WANTED(&lva, va_fileid); + error = vnode_getattr(pvp, &lva, &context); + vnode_put(pvp); + + if (error != 0) + goto out; + f.fid_objno = lva.va_fileid; + } + f.fid_generation = 0; + ATTR_PACK(&ab, f); + } + if (al.commonattr & ATTR_CMN_SCRIPT) + ATTR_PACK_CAST(&ab, text_encoding_t, va.va_encoding); + if (al.commonattr & ATTR_CMN_CRTIME) + ATTR_PACK_TIME(&ab, va.va_create_time, proc_is64bit(p)); + if (al.commonattr & ATTR_CMN_MODTIME) + ATTR_PACK_TIME(&ab, va.va_modify_time, proc_is64bit(p)); + if (al.commonattr & ATTR_CMN_CHGTIME) + ATTR_PACK_TIME(&ab, va.va_change_time, proc_is64bit(p)); + if (al.commonattr & ATTR_CMN_ACCTIME) + ATTR_PACK_TIME(&ab, va.va_access_time, proc_is64bit(p)); + if (al.commonattr & ATTR_CMN_BKUPTIME) + ATTR_PACK_TIME(&ab, va.va_backup_time, proc_is64bit(p)); + if (al.commonattr & ATTR_CMN_FNDRINFO) { + uio_t auio; + size_t fisize; + char uio_buf[UIO_SIZEOF(1)]; + + fisize = imin(32, ab.allocated - (ab.fixedcursor - ab.base)); + if (fisize > 0) { + if ((auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, uio_buf, sizeof(uio_buf))) == NULL) { + error = ENOMEM; + goto out; + } else { + uio_addiov(auio, CAST_USER_ADDR_T(ab.fixedcursor), fisize); + error = vn_getxattr(vp, XATTR_FINDERINFO_NAME, auio, &fisize, XATTR_NOSECURITY, &context); + uio_free(auio); + } + if (error != 0) { + if ((error == ENOENT) || (error == ENOATTR) || (error == ENOTSUP) || (error == EPERM)) { + VFS_DEBUG(ctx, vp, "ATTRLIST - No system.finderinfo attribute, returning zeroes"); + bzero(ab.fixedcursor, 32); + error = 0; + } else { + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: reading system.finderinfo attribute"); + goto out; + } + } + } else { + VFS_DEBUG(ctx, vp, "ATTRLIST - no room in caller buffer for FINDERINFO"); + } + ab.fixedcursor += 32; + } + if (al.commonattr & ATTR_CMN_OWNERID) + ATTR_PACK(&ab, va.va_uid); + if (al.commonattr & ATTR_CMN_GRPID) + ATTR_PACK(&ab, va.va_gid); + if (al.commonattr & ATTR_CMN_ACCESSMASK) + ATTR_PACK_CAST(&ab, uint32_t, va.va_mode); + if (al.commonattr & ATTR_CMN_FLAGS) + ATTR_PACK(&ab, va.va_flags); + if (al.commonattr & ATTR_CMN_USERACCESS) { /* this is expensive */ + uint32_t perms = 0; + if (vnode_isdir(vp)) { + if (vnode_authorize(vp, NULL, + KAUTH_VNODE_ACCESS | KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY | KAUTH_VNODE_DELETE_CHILD, &context) == 0) + perms |= W_OK; + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_LIST_DIRECTORY, &context) == 0) + perms |= R_OK; + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_SEARCH, &context) == 0) + perms |= X_OK; + } else { + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA, &context) == 0) + perms |= W_OK; + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_READ_DATA, &context) == 0) + perms |= R_OK; + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_EXECUTE, 
&context) == 0) + perms |= X_OK; + } + VFS_DEBUG(ctx, vp, "ATTRLIST - granting perms %d", perms); + ATTR_PACK(&ab, perms); + } + if (al.commonattr & ATTR_CMN_EXTENDED_SECURITY) { + if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) { + struct kauth_filesec fsec; + /* + * We want to return a kauth_filesec (for now), but all we have is a kauth_acl. + */ + fsec.fsec_magic = KAUTH_FILESEC_MAGIC; + fsec.fsec_owner = kauth_null_guid; + fsec.fsec_group = kauth_null_guid; + attrlist_pack_variable2(&ab, &fsec, ((char *)&fsec.fsec_acl - (char *)&fsec), va.va_acl, KAUTH_ACL_COPYSIZE(va.va_acl)); + } else { + attrlist_pack_variable(&ab, NULL, 0); + } + } + if (al.commonattr & ATTR_CMN_UUID) { + if (!VATTR_IS_SUPPORTED(&va, va_uuuid)) { + ATTR_PACK(&ab, kauth_null_guid); + } else { + ATTR_PACK(&ab, va.va_uuuid); + } + } + if (al.commonattr & ATTR_CMN_GRPUUID) { + if (!VATTR_IS_SUPPORTED(&va, va_guuid)) { + ATTR_PACK(&ab, kauth_null_guid); + } else { + ATTR_PACK(&ab, va.va_guuid); + } + } + + /* directory attributes **************************************************/ + if (vnode_isdir(vp)) { + if (al.dirattr & ATTR_DIR_LINKCOUNT) /* full count of entries */ + ATTR_PACK_CAST(&ab, uint32_t, va.va_nlink); + if (al.dirattr & ATTR_DIR_ENTRYCOUNT) + ATTR_PACK_CAST(&ab, uint32_t, va.va_nchildren); + if (al.dirattr & ATTR_DIR_MOUNTSTATUS) + ATTR_PACK_CAST(&ab, uint32_t, (vp->v_flag & VROOT) ? DIR_MNTSTATUS_MNTPOINT : 0); + } + + /* file attributes **************************************************/ + if (!vnode_isdir(vp)) { + if (al.fileattr & ATTR_FILE_LINKCOUNT) + ATTR_PACK_CAST(&ab, uint32_t, va.va_nlink); + if (al.fileattr & ATTR_FILE_TOTALSIZE) + ATTR_PACK(&ab, va.va_total_size); + if (al.fileattr & ATTR_FILE_ALLOCSIZE) + ATTR_PACK(&ab, va.va_total_alloc); + if (al.fileattr & ATTR_FILE_IOBLOCKSIZE) + ATTR_PACK(&ab, va.va_iosize); + if (al.fileattr & ATTR_FILE_CLUMPSIZE) + ATTR_PACK_CAST(&ab, uint32_t, 0); /* XXX value is deprecated */ + if (al.fileattr & ATTR_FILE_DEVTYPE) { + if ((vp->v_type == VCHR) || (vp->v_type == VBLK)) { + ATTR_PACK(&ab, vp->v_specinfo->si_rdev); + } else { + ATTR_PACK_CAST(&ab, uint32_t, 0); + } + } + if (al.fileattr & ATTR_FILE_DATALENGTH) { + if (VATTR_IS_SUPPORTED(&va, va_data_size)) { + ATTR_PACK(&ab, va.va_data_size); + } else { + ATTR_PACK(&ab, va.va_total_size); + } + } + if (al.fileattr & ATTR_FILE_DATAALLOCSIZE) { + if (VATTR_IS_SUPPORTED(&va, va_data_alloc)) { + ATTR_PACK(&ab, va.va_data_alloc); + } else { + ATTR_PACK(&ab, va.va_total_alloc); + } + } + /* fetch resource fork size/allocation via xattr interface */ + if (al.fileattr & (ATTR_FILE_RSRCLENGTH | ATTR_FILE_RSRCALLOCSIZE)) { + size_t rsize; + if ((error = vn_getxattr(vp, XATTR_RESOURCEFORK_NAME, NULL, &rsize, XATTR_NOSECURITY, &context)) != 0) { + if ((error == ENOENT) || (error == ENOATTR) || (error == ENOTSUP) || (error == EPERM)) { + rsize = 0; + error = 0; + } else { + goto out; + } + } + if (al.fileattr & ATTR_FILE_RSRCLENGTH) + ATTR_PACK_CAST(&ab, off_t, rsize); + if (al.fileattr & ATTR_FILE_RSRCALLOCSIZE) { + uint32_t blksize = vp->v_mount->mnt_vfsstat.f_bsize; + if (blksize == 0) + blksize = 512; + ATTR_PACK_CAST(&ab, off_t, (roundup(rsize, blksize))); + } + } + } + + /* diagnostic */ + if ((ab.fixedcursor - ab.base) != fixedsize) + panic("packed field size mismatch; allocated %d but packed %d for common %08x vol %08x", + fixedsize, ab.fixedcursor - ab.base, al.commonattr, al.volattr); + if (ab.varcursor != (ab.base + ab.needed)) + panic("packed variable field size mismatch; used %d but expected 
%d", ab.varcursor - ab.base, ab.needed); + + /* + * In the compatible case, we report the smaller of the required and returned sizes. + * If the FSOPT_REPORT_FULLSIZE option is supplied, we report the full (required) size + * of the result buffer, even if we copied less out. The caller knows how big a buffer + * they gave us, so they can always check for truncation themselves. + */ + *(uint32_t *)ab.base = (uap->options & FSOPT_REPORT_FULLSIZE) ? ab.needed : imin(ab.allocated, ab.needed); + + error = copyout(ab.base, uap->attributeBuffer, ab.allocated); + +out: + if (va.va_name) + kfree(va.va_name, MAXPATHLEN); + if (vname) + vnode_putname(vname); + if (vp) + vnode_put(vp); + if (ab.base != NULL) + FREE(ab.base, M_TEMP); + if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) + kauth_acl_free(va.va_acl); + + VFS_DEBUG(ctx, vp, "ATTRLIST - returning %d", error); + return(error); +} + +static int +attrlist_unpack_fixed(char **cursor, char *end, void *buf, ssize_t size) +{ + /* make sure we have enough source data */ + if ((*cursor) + size > end) + return(EINVAL); + + bcopy(*cursor, buf, size); + *cursor += size; + return(0); +} + +#define ATTR_UNPACK(v) do {if ((error = attrlist_unpack_fixed(&cursor, bufend, &v, sizeof(v))) != 0) goto out;} while(0); +#define ATTR_UNPACK_CAST(t, v) do { t _f; ATTR_UNPACK(_f); v = _f;} while(0) +#define ATTR_UNPACK_TIME(v, is64) \ + do { \ + if (is64) { \ + struct user_timespec us; \ + ATTR_UNPACK(us); \ + v.tv_sec = us.tv_sec; \ + v.tv_nsec = us.tv_nsec; \ + } else { \ + ATTR_UNPACK(v); \ + } \ + } while(0) + + +/* + * Write attributes. + */ +int +setattrlist(struct proc *p, register struct setattrlist_args *uap, __unused register_t *retval) +{ + struct attrlist al; + struct vfs_context context, *ctx; + struct vnode_attr va; + struct attrreference ar; + struct nameidata nd; + vnode_t vp; + u_long nameiflags; + kauth_action_t action; + char *user_buf, *cursor, *bufend, *fndrinfo, *cp, *volname; + int proc_is64, error; + uint32_t nace; + kauth_filesec_t rfsec; + + context.vc_proc = p; + context.vc_ucred = kauth_cred_get(); + ctx = &context; + vp = NULL; + user_buf = NULL; + fndrinfo = NULL; + volname = NULL; + error = 0; + proc_is64 = proc_is64bit(p); + VATTR_INIT(&va); + + + /* + * Look up the file. + */ + nameiflags = 0; + if ((uap->options & FSOPT_NOFOLLOW) == 0) + nameiflags |= FOLLOW; + NDINIT(&nd, LOOKUP, nameiflags | AUDITVNPATH1, UIO_USERSPACE, uap->path, &context); + if ((error = namei(&nd)) != 0) + goto out; + vp = nd.ni_vp; + nameidone(&nd); + + /* + * Fetch the attribute set and validate. + */ + if ((error = copyin(uap->alist, (caddr_t) &al, sizeof (al)))) + goto out; + if (al.bitmapcount != ATTR_BIT_MAP_COUNT) { + error = EINVAL; + goto out; + } + + VFS_DEBUG(ctx, vp, "%p ATTRLIST - %s set common %08x vol %08x file %08x dir %08x fork %08x %sfollow on '%s'", + vp, p->p_comm, al.commonattr, al.volattr, al.fileattr, al.dirattr, al.forkattr, + (uap->options & FSOPT_NOFOLLOW) ? 
"no":"", vp->v_name); + + if (al.volattr) { + if ((al.volattr & ~ATTR_VOL_SETMASK) || + (al.commonattr & ~ATTR_CMN_VOLSETMASK) || + al.fileattr || + al.forkattr) { + error = EINVAL; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: attempt to set invalid volume attributes"); + goto out; + } + } else { + if ((al.commonattr & ~ATTR_CMN_SETMASK) || + (al.fileattr & ~ATTR_FILE_SETMASK) || + (al.dirattr & ~ATTR_DIR_SETMASK) || + (al.forkattr & ~ATTR_FORK_SETMASK)) { + error = EINVAL; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: attempt to set invalid file/folder attributes"); + goto out; + } + } + + /* + * Make the naive assumption that the caller has supplied a reasonable buffer + * size. We could be more careful by pulling in the fixed-size region, checking + * the attrref structures, then pulling in the variable section. + * We need to reconsider this for handling large ACLs, as they should probably be + * brought directly into a buffer. Multiple copyins will make this slower though. + * + * We could also map the user buffer if it is larger than some sensible mimimum. + */ + if (uap->bufferSize > ATTR_MAX_BUFFER) { + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: buffer size %d too large", uap->bufferSize); + error = ENOMEM; + goto out; + } + MALLOC(user_buf, char *, uap->bufferSize, M_TEMP, M_WAITOK); + if (user_buf == NULL) { + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: could not allocate %d bytes for buffer", uap->bufferSize); + error = ENOMEM; + goto out; + } + if ((error = copyin(uap->attributeBuffer, user_buf, uap->bufferSize)) != 0) { + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: buffer copyin failed"); + goto out; + } + VFS_DEBUG(ctx, vp, "ATTRLIST - copied in %d bytes of user attributes to %p", uap->bufferSize, user_buf); + + /* + * Unpack the argument buffer. + */ + cursor = user_buf; + bufend = cursor + uap->bufferSize; + + /* common */ + if (al.commonattr & ATTR_CMN_SCRIPT) { + ATTR_UNPACK(va.va_encoding); + VATTR_SET_ACTIVE(&va, va_encoding); + } + if (al.commonattr & ATTR_CMN_CRTIME) { + ATTR_UNPACK_TIME(va.va_create_time, proc_is64); + VATTR_SET_ACTIVE(&va, va_create_time); + } + if (al.commonattr & ATTR_CMN_MODTIME) { + ATTR_UNPACK_TIME(va.va_modify_time, proc_is64); + VATTR_SET_ACTIVE(&va, va_modify_time); + } + if (al.commonattr & ATTR_CMN_CHGTIME) { + ATTR_UNPACK_TIME(va.va_change_time, proc_is64); + VATTR_SET_ACTIVE(&va, va_change_time); + } + if (al.commonattr & ATTR_CMN_ACCTIME) { + ATTR_UNPACK_TIME(va.va_access_time, proc_is64); + VATTR_SET_ACTIVE(&va, va_access_time); + } + if (al.commonattr & ATTR_CMN_BKUPTIME) { + ATTR_UNPACK_TIME(va.va_backup_time, proc_is64); + VATTR_SET_ACTIVE(&va, va_backup_time); + } + if (al.commonattr & ATTR_CMN_FNDRINFO) { + if ((cursor + 32) > bufend) { + error = EINVAL; + VFS_DEBUG(ctx, vp, "ATTRLIST - not enough data supplied for FINDERINFO"); + goto out; + } + fndrinfo = cursor; + cursor += 32; + } + if (al.commonattr & ATTR_CMN_OWNERID) { + ATTR_UNPACK(va.va_uid); + VATTR_SET_ACTIVE(&va, va_uid); + } + if (al.commonattr & ATTR_CMN_GRPID) { + ATTR_UNPACK(va.va_gid); + VATTR_SET_ACTIVE(&va, va_gid); + } + if (al.commonattr & ATTR_CMN_ACCESSMASK) { + ATTR_UNPACK_CAST(uint32_t, va.va_mode); + VATTR_SET_ACTIVE(&va, va_mode); + } + if (al.commonattr & ATTR_CMN_FLAGS) { + ATTR_UNPACK(va.va_flags); + VATTR_SET_ACTIVE(&va, va_flags); + } + if (al.commonattr & ATTR_CMN_EXTENDED_SECURITY) { + + /* + * We are (for now) passed a kauth_filesec_t, but all we want from + * it is the ACL. 
+ */ + cp = cursor; + ATTR_UNPACK(ar); + cp += ar.attr_dataoffset; + rfsec = (kauth_filesec_t)cp; + if (((char *)(rfsec + 1) > bufend) || /* no space for acl */ + (rfsec->fsec_magic != KAUTH_FILESEC_MAGIC) || /* bad magic */ + (KAUTH_FILESEC_COPYSIZE(rfsec) != ar.attr_length) || /* size does not match */ + ((cp + KAUTH_FILESEC_COPYSIZE(rfsec)) > bufend)) { /* ACEs overrun buffer */ + error = EINVAL; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: bad ACL supplied (length %d)", ar.attr_length); + goto out; + } + nace = rfsec->fsec_entrycount; + if (nace == KAUTH_FILESEC_NOACL) + nace = 0; + if (nace > KAUTH_ACL_MAX_ENTRIES) { /* ACL size invalid */ + error = EINVAL; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: bad ACL supplied"); + goto out; + } + nace = rfsec->fsec_acl.acl_entrycount; + if (nace == KAUTH_FILESEC_NOACL) { + /* deleting ACL */ + VATTR_SET(&va, va_acl, NULL); + } else { + + if (nace > KAUTH_ACL_MAX_ENTRIES) { /* ACL size invalid */ + error = EINVAL; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: supplied ACL is too large"); + goto out; + } + VATTR_SET(&va, va_acl, &rfsec->fsec_acl); + } + } + if (al.commonattr & ATTR_CMN_UUID) { + ATTR_UNPACK(va.va_uuuid); + VATTR_SET_ACTIVE(&va, va_uuuid); + } + if (al.commonattr & ATTR_CMN_GRPUUID) { + ATTR_UNPACK(va.va_guuid); + VATTR_SET_ACTIVE(&va, va_guuid); + } + + /* volume */ + if (al.volattr & ATTR_VOL_INFO) { + if (al.volattr & ATTR_VOL_NAME) { + volname = cursor; + ATTR_UNPACK(ar); + volname += ar.attr_dataoffset; + if ((volname + ar.attr_length) > bufend) { + error = EINVAL; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: volume name too big for caller buffer"); + goto out; + } + /* guarantee NUL termination */ + volname[ar.attr_length - 1] = 0; + } + } + + /* file */ + if (al.fileattr & ATTR_FILE_DEVTYPE) { + /* XXX does it actually make any sense to change this? */ + error = EINVAL; + VFS_DEBUG(ctx, vp, "ATTRLIST - XXX device type change not implemented"); + goto out; + } + + /* + * Validate and authorize. + */ + action = 0; + if ((va.va_active != 0LL) && ((error = vnode_authattr(vp, &va, &action, &context)) != 0)) { + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: attribute changes refused: %d", error); + goto out; + } + /* + * We can auth file Finder Info here. HFS volume FinderInfo is really boot data, + * and will be auth'ed by the FS. + */ + if (fndrinfo != NULL) { + if (al.volattr & ATTR_VOL_INFO) { + if (vp->v_tag != VT_HFS) { + error = EINVAL; + goto out; + } + } else { + action |= KAUTH_VNODE_WRITE_ATTRIBUTES; + } + } + + if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, &context)) != 0)) { + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: authorization failed"); + goto out; + } + + /* + * Write the attributes if we have any. + */ + if ((va.va_active != 0LL) && ((error = vnode_setattr(vp, &va, &context)) != 0)) { + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: filesystem returned %d", error); + goto out; + } + + /* + * Write the Finder Info if we have any. 
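[Editorial note] The volume-name unpack above is the one variable-length set case: an attrreference followed by the string, with the kernel bounds-checking and NUL-terminating it before vfs_setattr() is invoked further below. The matching caller side looks like this; a user-space sketch whose buffer size is illustrative:

    #include <string.h>
    #include <unistd.h>
    #include <sys/attr.h>

    int
    set_volume_name(const char *mountpoint, const char *name)
    {
        struct attrlist al;
        struct {
            attrreference_t ref;
            char            name[256];
        } buf;

        memset(&al, 0, sizeof(al));
        al.bitmapcount = ATTR_BIT_MAP_COUNT;
        al.volattr = ATTR_VOL_INFO | ATTR_VOL_NAME;

        /* attr_dataoffset is relative to the attrreference itself */
        buf.ref.attr_dataoffset = sizeof(attrreference_t);
        buf.ref.attr_length = strlen(name) + 1;   /* include the NUL */
        strlcpy(buf.name, name, sizeof(buf.name));

        return (setattrlist(mountpoint, &al, &buf,
            sizeof(attrreference_t) + buf.ref.attr_length, 0));
    }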
+ */ + if (fndrinfo != NULL) { + if (al.volattr & ATTR_VOL_INFO) { + if (vp->v_tag == VT_HFS) { + error = VNOP_IOCTL(vp, HFS_SET_BOOT_INFO, (caddr_t)fndrinfo, 0, &context); + if (error != 0) + goto out; + } else { + /* XXX should never get here */ + } + } else { + /* write Finder Info EA */ + uio_t auio; + char uio_buf[UIO_SIZEOF(1)]; + + if ((auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE, uio_buf, sizeof(uio_buf))) == NULL) { + error = ENOMEM; + } else { + uio_addiov(auio, CAST_USER_ADDR_T(fndrinfo), 32); + error = vn_setxattr(vp, XATTR_FINDERINFO_NAME, auio, XATTR_NOSECURITY, &context); + uio_free(auio); + } + + if (error == 0 && need_fsevent(FSE_FINDER_INFO_CHANGED, vp)) { + add_fsevent(FSE_FINDER_INFO_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE); + } + + if (error != 0) { + goto out; + } + } + } + + /* + * Set the volume name, if we have one + */ + if (volname != NULL) + { + struct vfs_attr vs; + + VFSATTR_INIT(&vs); + + vs.f_vol_name = volname; /* References the setattrlist buffer directly */ + VFSATTR_WANTED(&vs, f_vol_name); + + if ((error = vfs_setattr(vp->v_mount, &vs, ctx)) != 0) { + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: setting volume name failed"); + goto out; + } + + if (!VFSATTR_ALL_SUPPORTED(&vs)) { + error = EINVAL; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: could not set volume name"); + goto out; + } + } + + /* all done and successful */ + +out: + if (vp != NULL) + vnode_put(vp); + if (user_buf != NULL) + FREE(user_buf, M_TEMP); + VFS_DEBUG(ctx, vp, "ATTRLIST - set returning %d", error); + return(error); +} diff --git a/bsd/vfs/vfs_bio.c b/bsd/vfs/vfs_bio.c index 8919319e4..5371c4b3a 100644 --- a/bsd/vfs/vfs_bio.c +++ b/bsd/vfs/vfs_bio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -58,9 +58,6 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * The NEXTSTEP Software License Agreement specifies the terms - * and conditions for redistribution. 
- * * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 */ @@ -73,48 +70,55 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/proc.h> -#include <sys/buf.h> -#include <sys/vnode.h> -#include <sys/mount.h> +#include <sys/proc_internal.h> +#include <sys/buf_internal.h> +#include <sys/vnode_internal.h> +#include <sys/mount_internal.h> #include <sys/trace.h> #include <sys/malloc.h> #include <sys/resourcevar.h> #include <miscfs/specfs/specdev.h> #include <sys/ubc.h> -#include <vm/vm_pageout.h> +#include <sys/kauth.h> #if DIAGNOSTIC #include <kern/assert.h> #endif /* DIAGNOSTIC */ #include <kern/task.h> #include <kern/zalloc.h> +#include <kern/lock.h> + +#include <vm/vm_kern.h> #include <sys/kdebug.h> #include <machine/spl.h> +#if BALANCE_QUEUES static __inline__ void bufqinc(int q); static __inline__ void bufqdec(int q); +#endif -static int do_breadn_for_type(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks, - int *rasizes, int nrablks, struct ucred *cred, struct buf **bpp, int queuetype); -static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue); -static int bcleanbuf(struct buf *bp); -static int brecover_data(struct buf *bp); -extern void vwakeup(); +static int bcleanbuf(buf_t bp); +static int brecover_data(buf_t bp); +static boolean_t incore(vnode_t vp, daddr64_t blkno); +static buf_t incore_locked(vnode_t vp, daddr64_t blkno); +/* timeout is in msecs */ +static buf_t getnewbuf(int slpflag, int slptimeo, int *queue); +static void bremfree_locked(buf_t bp); +static void buf_reassign(buf_t bp, vnode_t newvp); +static errno_t buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo); +static int buf_iterprepare(vnode_t vp, struct buflists *, int flags); +static void buf_itercomplete(vnode_t vp, struct buflists *, int flags); -extern int niobuf; /* The number of IO buffer headers for cluster IO */ -int blaundrycnt; +__private_extern__ int bdwrite_internal(buf_t, int); /* zone allocated buffer headers */ -static zone_t buf_hdr_zone; -static int buf_hdr_count; +static void bufzoneinit(void); +static void bcleanbuf_thread_init(void); +static void bcleanbuf_thread(void); + +static zone_t buf_hdr_zone; +static int buf_hdr_count; -#if TRACE -struct proc *traceproc; -int tracewhich, tracebuf[TRCSIZ]; -u_int tracex; -char traceflags[TR_NFLAGS]; -#endif /* TRACE */ /* * Definitions for the buffer hash lists. @@ -129,38 +133,60 @@ struct bufstats bufstats; /* Number of delayed write buffers */ int nbdwrite = 0; +int blaundrycnt = 0; -/* - * Insq/Remq for the buffer hash lists. - */ -#if 0 -#define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash) -#define bremhash(bp) LIST_REMOVE(bp, b_hash) -#endif /* 0 */ - -TAILQ_HEAD(ioqueue, buf) iobufqueue; -TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES]; +static TAILQ_HEAD(ioqueue, buf) iobufqueue; +static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES]; static int needbuffer; static int need_iobuffer; +static lck_grp_t *buf_mtx_grp; +static lck_attr_t *buf_mtx_attr; +static lck_grp_attr_t *buf_mtx_grp_attr; +static lck_mtx_t *iobuffer_mtxp; +static lck_mtx_t *buf_mtxp; + +static __inline__ int +buf_timestamp(void) +{ + struct timeval t; + microuptime(&t); + return (t.tv_sec); +} + /* * Insq/Remq for the buffer free lists. 
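[Editorial note] buf_timestamp() above replaces direct reads of the global time in the free-queue macros that follow; every insertion onto a free list stamps the buffer. A sketch of the staleness test this enables, against the thresholds kept in lru_is_stale, age_is_stale and meta_is_stale; the helper name is hypothetical:

    static int
    buf_is_stale(buf_t bp, int threshold_secs)
    {
        /* b_timestamp was set by binsheadfree()/binstailfree() when the
         * buffer went onto its free list; microuptime never goes backwards */
        return ((buf_timestamp() - bp->b_timestamp) > threshold_secs);
    }

A reclaim pass would compare LRU-queue buffers against lru_is_stale, AGE-queue buffers against age_is_stale, and metadata buffers against meta_is_stale.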
*/ +#if BALANCE_QUEUES #define binsheadfree(bp, dp, whichq) do { \ TAILQ_INSERT_HEAD(dp, bp, b_freelist); \ bufqinc((whichq)); \ (bp)->b_whichq = whichq; \ - (bp)->b_timestamp = time.tv_sec; \ + (bp)->b_timestamp = buf_timestamp(); \ } while (0) #define binstailfree(bp, dp, whichq) do { \ TAILQ_INSERT_TAIL(dp, bp, b_freelist); \ bufqinc((whichq)); \ (bp)->b_whichq = whichq; \ - (bp)->b_timestamp = time.tv_sec; \ + (bp)->b_timestamp = buf_timestamp(); \ + } while (0) +#else +#define binsheadfree(bp, dp, whichq) do { \ + TAILQ_INSERT_HEAD(dp, bp, b_freelist); \ + (bp)->b_whichq = whichq; \ + (bp)->b_timestamp = buf_timestamp(); \ } while (0) +#define binstailfree(bp, dp, whichq) do { \ + TAILQ_INSERT_TAIL(dp, bp, b_freelist); \ + (bp)->b_whichq = whichq; \ + (bp)->b_timestamp = buf_timestamp(); \ + } while (0) +#endif + + #define BHASHENTCHECK(bp) \ if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \ panic("%x: b_hash.le_prev is not deadbeef", (bp)); @@ -178,12 +204,6 @@ static int need_iobuffer; (bp)->b_vnbufs.le_next = NOLIST; \ } -simple_lock_data_t bufhashlist_slock; /* lock on buffer hash list */ - -/* number of per vnode, "in flight" buffer writes */ -#define BUFWRITE_THROTTLE 9 - - /* * Time in seconds before a buffer on a list is * considered as a stale buffer @@ -196,9 +216,11 @@ int lru_is_stale = LRU_IS_STALE; int age_is_stale = AGE_IS_STALE; int meta_is_stale = META_IS_STALE; + + /* LIST_INSERT_HEAD() with assertions */ static __inline__ void -blistenterhead(struct bufhashhdr * head, struct buf * bp) +blistenterhead(struct bufhashhdr * head, buf_t bp) { if ((bp->b_hash.le_next = (head)->lh_first) != NULL) (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next; @@ -209,16 +231,9 @@ blistenterhead(struct bufhashhdr * head, struct buf * bp) } static __inline__ void -binshash(struct buf *bp, struct bufhashhdr *dp) +binshash(buf_t bp, struct bufhashhdr *dp) { - struct buf *nbp; - - simple_lock(&bufhashlist_slock); - -#if 0 - if((bad = incore(bp->b_vp, bp->b_lblkno))) - panic("binshash: already incore bp 0x%x, bad 0x%x\n", bp, bad); -#endif /* 0 */ + buf_t nbp; BHASHENTCHECK(bp); @@ -229,13 +244,11 @@ binshash(struct buf *bp, struct bufhashhdr *dp) } blistenterhead(dp, bp); - simple_unlock(&bufhashlist_slock); } static __inline__ void -bremhash(struct buf *bp) +bremhash(buf_t bp) { - simple_lock(&bufhashlist_slock); if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef) panic("bremhash le_prev is deadbeef"); if (bp->b_hash.le_next == bp) @@ -244,324 +257,1262 @@ bremhash(struct buf *bp) if (bp->b_hash.le_next != NULL) bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev; *bp->b_hash.le_prev = (bp)->b_hash.le_next; - simple_unlock(&bufhashlist_slock); } -/* - * Remove a buffer from the free list it's on - */ -void -bremfree(bp) - struct buf *bp; -{ - struct bqueues *dp = NULL; - int whichq = -1; - /* - * We only calculate the head of the freelist when removing - * the last element of the list as that is the only time that - * it is needed (e.g. to reset the tail pointer). - * - * NB: This makes an assumption about how tailq's are implemented. - */ - if (bp->b_freelist.tqe_next == NULL) { - for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) - if (dp->tqh_last == &bp->b_freelist.tqe_next) - break; - if (dp == &bufqueues[BQUEUES]) - panic("bremfree: lost tail"); - } - TAILQ_REMOVE(dp, bp, b_freelist); - whichq = bp->b_whichq; - bufqdec(whichq); - bp->b_whichq = -1; - bp->b_timestamp = 0; -} -/* - * Associate a buffer with a vnode. 
- */ -static void -bgetvp(vp, bp) - register struct vnode *vp; - register struct buf *bp; -{ - if (bp->b_vp != vp) - panic("bgetvp: not free"); - VHOLD(vp); - bp->b_vp = vp; - if (vp->v_type == VBLK || vp->v_type == VCHR) - bp->b_dev = vp->v_rdev; - else - bp->b_dev = NODEV; - /* - * Insert onto list for new vnode. - */ - bufinsvn(bp, &vp->v_cleanblkhd); +int +buf_valid(buf_t bp) { + + if ( (bp->b_flags & (B_DONE | B_DELWRI)) ) + return 1; + return 0; } -/* - * Disassociate a buffer from a vnode. - */ -static void -brelvp(bp) - register struct buf *bp; -{ - struct vnode *vp; +int +buf_fromcache(buf_t bp) { - if (bp->b_vp == (struct vnode *) 0) - panic("brelvp: NULL vp"); - /* - * Delete from old vnode list, if on one. - */ - if (bp->b_vnbufs.le_next != NOLIST) - bufremvn(bp); - vp = bp->b_vp; - bp->b_vp = (struct vnode *) 0; - HOLDRELE(vp); + if ( (bp->b_flags & B_CACHE) ) + return 1; + return 0; } -/* - * Reassign a buffer from one vnode to another. - * Used to assign file specific control information - * (indirect blocks) to the vnode to which they belong. - */ void -reassignbuf(bp, newvp) - register struct buf *bp; - register struct vnode *newvp; -{ - register struct buflists *listheadp; +buf_markinvalid(buf_t bp) { + + SET(bp->b_flags, B_INVAL); +} - if (newvp == NULL) { - printf("reassignbuf: NULL"); - return; - } - /* - * Delete from old vnode list, if on one. - */ - if (bp->b_vnbufs.le_next != NOLIST) - bufremvn(bp); - /* - * If dirty, put on list of dirty buffers; - * otherwise insert onto list of clean buffers. - */ - if (ISSET(bp->b_flags, B_DELWRI)) - listheadp = &newvp->v_dirtyblkhd; - else - listheadp = &newvp->v_cleanblkhd; - bufinsvn(bp, listheadp); +void +buf_markdelayed(buf_t bp) { + + SET(bp->b_flags, B_DELWRI); + buf_reassign(bp, bp->b_vp); } -static __inline__ void -bufhdrinit(struct buf *bp) -{ - bzero((char *)bp, sizeof *bp); - bp->b_dev = NODEV; - bp->b_rcred = NOCRED; - bp->b_wcred = NOCRED; - bp->b_vnbufs.le_next = NOLIST; - bp->b_flags = B_INVAL; +void +buf_markeintr(buf_t bp) { + + SET(bp->b_flags, B_EINTR); +} - return; +void +buf_markaged(buf_t bp) { + + SET(bp->b_flags, B_AGE); } -/* - * Initialize buffers and hash links for buffers. 
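[Editorial note] The functions beginning here form the opaque buf_t accessor KPI: filesystems now read and set buffer state through calls such as buf_valid() and buf_markdelayed() instead of dereferencing struct buf fields directly. A hypothetical client, not part of the patch, would look like:

    static void
    fs_mark_block_dirty(buf_t bp)
    {
        if (!buf_valid(bp))      /* neither B_DONE nor B_DELWRI yet */
            return;
        if (buf_error(bp))       /* clear any stale error before reuse */
            buf_seterror(bp, 0);
        buf_markdelayed(bp);     /* B_DELWRI, plus reassignment to the dirty list */
    }

Keeping struct buf private to vfs_bio.c is what allows the locking changes in this patch (buf_mtxp and friends) without breaking filesystem clients.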
- */ -__private_extern__ void -bufinit() -{ - register struct buf *bp; - register struct bqueues *dp; - register int i; - int metabuf; - long whichq; - static void bufzoneinit(); - static void bcleanbuf_thread_init(); +errno_t +buf_error(buf_t bp) { + + return (bp->b_error); +} - /* Initialize the buffer queues ('freelists') and the hash table */ - for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) - TAILQ_INIT(dp); - bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash); +void +buf_seterror(buf_t bp, errno_t error) { - simple_lock_init(&bufhashlist_slock ); + if ((bp->b_error = error)) + SET(bp->b_flags, B_ERROR); + else + CLR(bp->b_flags, B_ERROR); +} - metabuf = nbuf/8; /* reserved for meta buf */ +void +buf_setflags(buf_t bp, int32_t flags) { - /* Initialize the buffer headers */ - for (i = 0; i < nbuf; i++) { - bp = &buf[i]; - bufhdrinit(bp); + SET(bp->b_flags, (flags & BUF_X_WRFLAGS)); +} - /* - * metabuf buffer headers on the meta-data list and - * rest of the buffer headers on the empty list - */ - if (--metabuf) - whichq = BQ_META; - else - whichq = BQ_EMPTY; +void +buf_clearflags(buf_t bp, int32_t flags) { - BLISTNONE(bp); - dp = &bufqueues[whichq]; - binsheadfree(bp, dp, whichq); - binshash(bp, &invalhash); - } + CLR(bp->b_flags, (flags & BUF_X_WRFLAGS)); +} - for (; i < nbuf + niobuf; i++) { - bp = &buf[i]; - bufhdrinit(bp); - binsheadfree(bp, &iobufqueue, -1); - } +int32_t +buf_flags(buf_t bp) { + + return ((bp->b_flags & BUF_X_RDFLAGS)); +} - printf("using %d buffer headers and %d cluster IO buffer headers\n", - nbuf, niobuf); +void +buf_reset(buf_t bp, int32_t io_flags) { + + CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE)); + SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE))); - /* Set up zones used by the buffer cache */ - bufzoneinit(); + bp->b_error = 0; +} - /* start the bcleanbuf() thread */ - bcleanbuf_thread_init(); +uint32_t +buf_count(buf_t bp) { + + return (bp->b_bcount); +} -#if 0 /* notyet */ - { - static void bufq_balance_thread_init(); - /* create a thread to do dynamic buffer queue balancing */ - bufq_balance_thread_init(); - } -#endif /* notyet */ +void +buf_setcount(buf_t bp, uint32_t bcount) { + + bp->b_bcount = bcount; } -static struct buf * -bio_doread(vp, blkno, size, cred, async, queuetype) - struct vnode *vp; - daddr_t blkno; - int size; - struct ucred *cred; - int async; - int queuetype; -{ - register struct buf *bp; - struct proc *p = current_proc(); +uint32_t +buf_size(buf_t bp) { + + return (bp->b_bufsize); +} - bp = getblk(vp, blkno, size, 0, 0, queuetype); +void +buf_setsize(buf_t bp, uint32_t bufsize) { + + bp->b_bufsize = bufsize; +} - /* - * If buffer does not have data valid, start a read. - * Note that if buffer is B_INVAL, getblk() won't return it. - * Therefore, it's valid if it's I/O has completed or been delayed. - */ - if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) { - /* Start I/O for the buffer (keeping credentials). */ - SET(bp->b_flags, B_READ | async); - if (cred != NOCRED && bp->b_rcred == NOCRED) { - /* - * NFS has embedded ucred. - * Can not crhold() here as that causes zone corruption - */ - bp->b_rcred = crdup(cred); - } +uint32_t +buf_resid(buf_t bp) { + + return (bp->b_resid); +} - VOP_STRATEGY(bp); +void +buf_setresid(buf_t bp, uint32_t resid) { + + bp->b_resid = resid; +} - trace(TR_BREADMISS, pack(vp, size), blkno); +uint32_t +buf_dirtyoff(buf_t bp) { - /* Pay for the read. 
*/ - if (p && p->p_stats) - p->p_stats->p_ru.ru_inblock++; /* XXX */ - } else if (async) { - brelse(bp); - } + return (bp->b_dirtyoff); +} - trace(TR_BREADHIT, pack(vp, size), blkno); +uint32_t +buf_dirtyend(buf_t bp) { - return (bp); + return (bp->b_dirtyend); } -/* - * Read a disk block. - * This algorithm described in Bach (p.54). - */ -int -bread(vp, blkno, size, cred, bpp) - struct vnode *vp; - daddr_t blkno; - int size; - struct ucred *cred; - struct buf **bpp; -{ - register struct buf *bp; - /* Get buffer for block. */ - bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ); +void +buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) { + + bp->b_dirtyoff = dirtyoff; +} - /* Wait for the read to complete, and return result. */ - return (biowait(bp)); +void +buf_setdirtyend(buf_t bp, uint32_t dirtyend) { + + bp->b_dirtyend = dirtyend; } -/* - * Read a disk block. [bread() for meta-data] - * This algorithm described in Bach (p.54). - */ -int -meta_bread(vp, blkno, size, cred, bpp) - struct vnode *vp; - daddr_t blkno; - int size; - struct ucred *cred; - struct buf **bpp; -{ - register struct buf *bp; +uintptr_t +buf_dataptr(buf_t bp) { + + return (bp->b_datap); +} - /* Get buffer for block. */ - bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META); +void +buf_setdataptr(buf_t bp, uintptr_t data) { + + bp->b_datap = data; +} + +vnode_t +buf_vnode(buf_t bp) { + + return (bp->b_vp); +} + +void +buf_setvnode(buf_t bp, vnode_t vp) { + + bp->b_vp = vp; +} + + +void * +buf_callback(buf_t bp) +{ + if ( !(bp->b_lflags & BL_IOBUF) ) + return ((void *) NULL); + if ( !(bp->b_flags & B_CALL) ) + return ((void *) NULL); + + return ((void *)bp->b_iodone); +} + + +errno_t +buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction) +{ + + if ( !(bp->b_lflags & BL_IOBUF) ) + return (EINVAL); + + if (callback) + bp->b_flags |= (B_CALL | B_ASYNC); + else + bp->b_flags &= ~B_CALL; + bp->b_transaction = transaction; + bp->b_iodone = callback; + + return (0); +} + +errno_t +buf_setupl(buf_t bp, upl_t upl, uint32_t offset) +{ + + if ( !(bp->b_lflags & BL_IOBUF) ) + return (EINVAL); + + if (upl) + bp->b_flags |= B_CLUSTER; + else + bp->b_flags &= ~B_CLUSTER; + bp->b_upl = upl; + bp->b_uploffset = offset; + + return (0); +} + +buf_t +buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg) +{ + buf_t io_bp; + + if (io_offset < 0 || io_size < 0) + return (NULL); + + if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount) + return (NULL); + + if (bp->b_flags & B_CLUSTER) { + if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK)) + return (NULL); + + if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount)) + return (NULL); + } + io_bp = alloc_io_buf(bp->b_vp, 0); + + io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_ASYNC | B_READ); + + if (iodone) { + io_bp->b_transaction = arg; + io_bp->b_iodone = iodone; + io_bp->b_flags |= B_CALL; + } + if (bp->b_flags & B_CLUSTER) { + io_bp->b_upl = bp->b_upl; + io_bp->b_uploffset = bp->b_uploffset + io_offset; + } else { + io_bp->b_datap = (uintptr_t)(((char *)bp->b_datap) + io_offset); + } + io_bp->b_bcount = io_size; + + return (io_bp); +} + + + +void +buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction, + void **old_iodone, void **old_transaction) +{ + if (old_iodone) + *old_iodone = (void *)(bp->b_iodone); + if (old_transaction) + *old_transaction = (void *)(bp->b_transaction); + + bp->b_transaction = 
transaction; + bp->b_iodone = filter; + bp->b_flags |= B_FILTER; +} + + +daddr64_t +buf_blkno(buf_t bp) { + + return (bp->b_blkno); +} + +daddr64_t +buf_lblkno(buf_t bp) { + + return (bp->b_lblkno); +} + +void +buf_setblkno(buf_t bp, daddr64_t blkno) { + + bp->b_blkno = blkno; +} + +void +buf_setlblkno(buf_t bp, daddr64_t lblkno) { + + bp->b_lblkno = lblkno; +} + +dev_t +buf_device(buf_t bp) { + + return (bp->b_dev); +} + +errno_t +buf_setdevice(buf_t bp, vnode_t vp) { + + if ((vp->v_type != VBLK) && (vp->v_type != VCHR)) + return EINVAL; + bp->b_dev = vp->v_rdev; + + return 0; +} + + +void * +buf_drvdata(buf_t bp) { + + return (bp->b_drvdata); +} + +void +buf_setdrvdata(buf_t bp, void *drvdata) { + + bp->b_drvdata = drvdata; +} + +void * +buf_fsprivate(buf_t bp) { + + return (bp->b_fsprivate); +} + +void +buf_setfsprivate(buf_t bp, void *fsprivate) { + + bp->b_fsprivate = fsprivate; +} + +ucred_t +buf_rcred(buf_t bp) { + + return (bp->b_rcred); +} + +ucred_t +buf_wcred(buf_t bp) { + + return (bp->b_wcred); +} + +void * +buf_upl(buf_t bp) { + + return (bp->b_upl); +} + +uint32_t +buf_uploffset(buf_t bp) { + + return ((uint32_t)(bp->b_uploffset)); +} + +proc_t +buf_proc(buf_t bp) { + + return (bp->b_proc); +} + + +errno_t +buf_map(buf_t bp, caddr_t *io_addr) +{ + buf_t real_bp; + vm_offset_t vaddr; + kern_return_t kret; + + if ( !(bp->b_flags & B_CLUSTER)) { + *io_addr = (caddr_t)bp->b_datap; + return (0); + } + real_bp = (buf_t)(bp->b_real_bp); + + if (real_bp && real_bp->b_datap) { + /* + * b_real_bp is only valid if B_CLUSTER is SET + * if it's non-zero, then someone did a cluster_bp call + * if the backing physical pages were already mapped + * in before the call to cluster_bp (non-zero b_datap), + * then we just use that mapping + */ + *io_addr = (caddr_t)real_bp->b_datap; + return (0); + } + kret = ubc_upl_map(bp->b_upl, &vaddr); /* Map it in */ + + if (kret != KERN_SUCCESS) { + *io_addr = 0; + + return(ENOMEM); + } + vaddr += bp->b_uploffset; + + *io_addr = (caddr_t)vaddr; + + return (0); +} + +errno_t +buf_unmap(buf_t bp) +{ + buf_t real_bp; + kern_return_t kret; + + if ( !(bp->b_flags & B_CLUSTER)) + return (0); + /* + * see buf_map for the explanation + */ + real_bp = (buf_t)(bp->b_real_bp); + + if (real_bp && real_bp->b_datap) + return (0); + + if (bp->b_lflags & BL_IOBUF) { + /* + * when we commit these pages, we'll hit + * it with UPL_COMMIT_INACTIVE which + * will clear the reference bit that got + * turned on when we touched the mapping + */ + bp->b_flags |= B_AGE; + } + kret = ubc_upl_unmap(bp->b_upl); + + if (kret != KERN_SUCCESS) + return (EINVAL); + return (0); +} + + +void +buf_clear(buf_t bp) { + caddr_t baddr; + + if (buf_map(bp, &baddr) == 0) { + bzero(baddr, bp->b_bcount); + buf_unmap(bp); + } + bp->b_resid = 0; +} + + + +/* + * Read or write a buffer that is not contiguous on disk. + * buffer is marked done/error at the conclusion + */ +static int +buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes) +{ + vnode_t vp = buf_vnode(bp); + buf_t io_bp; /* For reading or writing a single block */ + int io_direction; + int io_resid; + size_t io_contig_bytes; + daddr64_t io_blkno; + int error = 0; + int bmap_flags; + + /* + * save our starting point... the bp was already mapped + * in buf_strategy before we got called + * no sense doing it again. + */ + io_blkno = bp->b_blkno; + /* + * Make sure we redo this mapping for the next I/O + * i.e. 
this can never be a 'permanent' mapping + */ + bp->b_blkno = bp->b_lblkno; + + /* + * Get an io buffer to do the deblocking + */ + io_bp = alloc_io_buf(devvp, 0); + + io_bp->b_lblkno = bp->b_lblkno; + io_bp->b_datap = bp->b_datap; + io_resid = bp->b_bcount; + io_direction = bp->b_flags & B_READ; + io_contig_bytes = contig_bytes; + + if (bp->b_flags & B_READ) + bmap_flags = VNODE_READ; + else + bmap_flags = VNODE_WRITE; + + for (;;) { + if (io_blkno == -1) + /* + * this is unexpected, but we'll allow for it + */ + bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes); + else { + io_bp->b_bcount = io_contig_bytes; + io_bp->b_bufsize = io_contig_bytes; + io_bp->b_resid = io_contig_bytes; + io_bp->b_blkno = io_blkno; + + buf_reset(io_bp, io_direction); + /* + * Call the device to do the I/O and wait for it + */ + if ((error = VNOP_STRATEGY(io_bp))) + break; + if ((error = (int)buf_biowait(io_bp))) + break; + if (io_bp->b_resid) { + io_resid -= (io_contig_bytes - io_bp->b_resid); + break; + } + } + if ((io_resid -= io_contig_bytes) == 0) + break; + f_offset += io_contig_bytes; + io_bp->b_datap += io_contig_bytes; + + /* + * Map the current position to a physical block number + */ + if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL))) + break; + } + buf_free(io_bp); + + if (error) + buf_seterror(bp, error); + bp->b_resid = io_resid; + /* + * This I/O is now complete + */ + buf_biodone(bp); + + return error; +} + + +/* + * struct vnop_strategy_args { + * struct buf *a_bp; + * } *ap; + */ +errno_t +buf_strategy(vnode_t devvp, void *ap) +{ + buf_t bp = ((struct vnop_strategy_args *)ap)->a_bp; + vnode_t vp = bp->b_vp; + int bmap_flags; + errno_t error; + + if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK) + panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n"); + /* + * associate the physical device + * with this buf_t even if we don't + * end up issuing the I/O... + */ + bp->b_dev = devvp->v_rdev; + + if (bp->b_flags & B_READ) + bmap_flags = VNODE_READ; + else + bmap_flags = VNODE_WRITE; + + if ( !(bp->b_flags & B_CLUSTER)) { + + if ( (bp->b_upl) ) { + /* + * we have a UPL associated with this bp + * go through cluster_bp which knows how + * to deal with filesystem block sizes + * that aren't equal to the page size + */ + return (cluster_bp(bp)); + } + if (bp->b_blkno == bp->b_lblkno) { + off_t f_offset; + size_t contig_bytes; + + if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) { + buf_seterror(bp, error); + buf_biodone(bp); + + return (error); + } + if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) { + buf_seterror(bp, error); + buf_biodone(bp); + + return (error); + } + if (bp->b_blkno == -1) + buf_clear(bp); + else if ((long)contig_bytes < bp->b_bcount) + return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes)); + } + if (bp->b_blkno == -1) { + buf_biodone(bp); + return (0); + } + } + /* + * we can issue the I/O because... 
+ * either B_CLUSTER is set which + * means that the I/O is properly set + * up to be a multiple of the page size, or + * we were able to successfully set up the + * physical block mapping + */ + return (VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap)); +} + + + +buf_t +buf_alloc(vnode_t vp) +{ + return(alloc_io_buf(vp, 0)); +} + +void +buf_free(buf_t bp) { + + free_io_buf(bp); +} + + + +void +buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg) { + buf_t bp; + int retval; + struct buflists local_iterblkhd; + int lock_flags = BAC_NOWAIT | BAC_REMOVE; + + if (flags & BUF_SKIP_LOCKED) + lock_flags |= BAC_SKIP_LOCKED; + if (flags & BUF_SKIP_NONLOCKED) + lock_flags |= BAC_SKIP_NONLOCKED; + + lck_mtx_lock(buf_mtxp); + + if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) { + lck_mtx_unlock(buf_mtxp); + return; + } + while (!LIST_EMPTY(&local_iterblkhd)) { + bp = LIST_FIRST(&local_iterblkhd); + LIST_REMOVE(bp, b_vnbufs); + LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs); + + if (buf_acquire_locked(bp, lock_flags, 0, 0)) + continue; + + lck_mtx_unlock(buf_mtxp); + + retval = callout(bp, arg); + + switch (retval) { + case BUF_RETURNED: + buf_brelse(bp); + break; + case BUF_CLAIMED: + break; + case BUF_RETURNED_DONE: + buf_brelse(bp); + lck_mtx_lock(buf_mtxp); + goto out; + case BUF_CLAIMED_DONE: + lck_mtx_lock(buf_mtxp); + goto out; + } + lck_mtx_lock(buf_mtxp); + } +out: + buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY); + + lck_mtx_unlock(buf_mtxp); +} + + +/* + * Flush out and invalidate all buffers associated with a vnode. + */ +int +buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) +{ + buf_t bp; + int error = 0; + int must_rescan = 1; + struct buflists local_iterblkhd; + + lck_mtx_lock(buf_mtxp); + + for (;;) { + if (must_rescan == 0) + /* + * the lists may not be empty, but all that's left at this + * point are metadata or B_LOCKED buffers which are being + * skipped... we know this because we made it through both + * the clean and dirty lists without dropping buf_mtxp... + * each time we drop buf_mtxp we bump "must_rescan" + */ + break; + if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd)) + break; + must_rescan = 0; + /* + * iterate the clean list + */ + if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) { + goto try_dirty_list; + } + while (!LIST_EMPTY(&local_iterblkhd)) { + bp = LIST_FIRST(&local_iterblkhd); + + LIST_REMOVE(bp, b_vnbufs); + LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs); + + /* + * some filesystems distinguish meta data blocks with a negative logical block # + */ + if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META))) + continue; + + if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) { + if (error == EDEADLK) + /* + * this buffer was marked B_LOCKED... + * we didn't drop buf_mtxp, so + * we don't need to rescan + */ + continue; + if (error == EAGAIN) { + /* + * found a busy buffer... 
we blocked and + * dropped buf_mtxp, so we're going to + * need to rescan after this pass is completed + */ + must_rescan++; + continue; + } + /* + * got some kind of 'real' error out of the msleep + * in buf_acquire_locked, terminate the scan and return the error + */ + buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN); + + lck_mtx_unlock(buf_mtxp); + return (error); + } + lck_mtx_unlock(buf_mtxp); + + SET(bp->b_flags, B_INVAL); + buf_brelse(bp); + + lck_mtx_lock(buf_mtxp); + + /* + * by dropping buf_mtxp, we allow new + * buffers to be added to the vnode list(s) + * we'll have to rescan at least once more + * if the queues aren't empty + */ + must_rescan++; + } + buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN); + +try_dirty_list: + /* + * Now iterate on dirty blks + */ + if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) { + continue; + } + while (!LIST_EMPTY(&local_iterblkhd)) { + bp = LIST_FIRST(&local_iterblkhd); + + LIST_REMOVE(bp, b_vnbufs); + LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs); + + /* + * some filesystems distinguish meta data blocks with a negative logical block # + */ + if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META))) + continue; + + if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) { + if (error == EDEADLK) + /* + * this buffer was marked B_LOCKED... + * we didn't drop buf_mtxp, so + * we don't need to rescan + */ + continue; + if (error == EAGAIN) { + /* + * found a busy buffer... we blocked and + * dropped buf_mtxp, so we're going to + * need to rescan after this pass is completed + */ + must_rescan++; + continue; + } + /* + * got some kind of 'real' error out of the msleep + * in buf_acquire_locked, terminate the scan and return the error + */ + buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY); + + lck_mtx_unlock(buf_mtxp); + return (error); + } + lck_mtx_unlock(buf_mtxp); + + SET(bp->b_flags, B_INVAL); + + if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA)) + (void) VNOP_BWRITE(bp); + else + buf_brelse(bp); + + lck_mtx_lock(buf_mtxp); + /* + * by dropping buf_mtxp, we allow new + * buffers to be added to the vnode list(s) + * we'll have to rescan at least once more + * if the queues aren't empty + */ + must_rescan++; + } + buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY); + } + lck_mtx_unlock(buf_mtxp); + + return (0); +} + +void +buf_flushdirtyblks(vnode_t vp, int wait, int flags, char *msg) { + buf_t bp; + int writes_issued = 0; + errno_t error; + int busy = 0; + struct buflists local_iterblkhd; + int lock_flags = BAC_NOWAIT | BAC_REMOVE; + + if (flags & BUF_SKIP_LOCKED) + lock_flags |= BAC_SKIP_LOCKED; + if (flags & BUF_SKIP_NONLOCKED) + lock_flags |= BAC_SKIP_NONLOCKED; +loop: + lck_mtx_lock(buf_mtxp); + + if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0) { + while (!LIST_EMPTY(&local_iterblkhd)) { + bp = LIST_FIRST(&local_iterblkhd); + LIST_REMOVE(bp, b_vnbufs); + LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs); + + if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY) + busy++; + if (error) + continue; + lck_mtx_unlock(buf_mtxp); + + bp->b_flags &= ~B_LOCKED; + + /* + * Wait for I/O associated with indirect blocks to complete, + * since there is no way to quickly wait for them below. 
+ */ + if ((bp->b_vp == vp) || (wait == 0)) + (void) buf_bawrite(bp); + else + (void) VNOP_BWRITE(bp); + writes_issued++; + + lck_mtx_lock(buf_mtxp); + } + buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY); + } + lck_mtx_unlock(buf_mtxp); + + if (wait) { + (void)vnode_waitforwrites(vp, 0, 0, 0, msg); + + if (vp->v_dirtyblkhd.lh_first && busy) { + /* + * we had one or more BUSY buffers on + * the dirtyblock list... most likely + * these are due to delayed writes that + * were moved to the bclean queue but + * have not yet been 'written'. + * if we issued some writes on the + * previous pass, we try again immediately + * if we didn't, we'll sleep for some time + * to allow the state to change... + */ + if (writes_issued == 0) { + (void)tsleep((caddr_t)&vp->v_numoutput, + PRIBIO + 1, "vnode_flushdirtyblks", hz/20); + } + writes_issued = 0; + busy = 0; + + goto loop; + } + } +} + + +/* + * called with buf_mtxp held... + * this lock protects the queue manipulation + */ +static int +buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags) +{ + struct buflists * listheadp; + + if (flags & VBI_DIRTY) + listheadp = &vp->v_dirtyblkhd; + else + listheadp = &vp->v_cleanblkhd; + + while (vp->v_iterblkflags & VBI_ITER) { + vp->v_iterblkflags |= VBI_ITERWANT; + msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", 0); + } + if (LIST_EMPTY(listheadp)) { + LIST_INIT(iterheadp); + return(EINVAL); + } + vp->v_iterblkflags |= VBI_ITER; + + iterheadp->lh_first = listheadp->lh_first; + listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first; + LIST_INIT(listheadp); + + return(0); +} + +/* + * called with buf_mtxp held... + * this lock protects the queue manipulation + */ +static void +buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags) +{ + struct buflists * listheadp; + buf_t bp; + + if (flags & VBI_DIRTY) + listheadp = &vp->v_dirtyblkhd; + else + listheadp = &vp->v_cleanblkhd; + + while (!LIST_EMPTY(iterheadp)) { + bp = LIST_FIRST(iterheadp); + LIST_REMOVE(bp, b_vnbufs); + LIST_INSERT_HEAD(listheadp, bp, b_vnbufs); + } + vp->v_iterblkflags &= ~VBI_ITER; + + if (vp->v_iterblkflags & VBI_ITERWANT) { + vp->v_iterblkflags &= ~VBI_ITERWANT; + wakeup(&vp->v_iterblkflags); + } +} + + +static void +bremfree_locked(buf_t bp) +{ + struct bqueues *dp = NULL; + int whichq = -1; + + /* + * We only calculate the head of the freelist when removing + * the last element of the list as that is the only time that + * it is needed (e.g. to reset the tail pointer). + * + * NB: This makes an assumption about how tailq's are implemented. + */ + if (bp->b_freelist.tqe_next == NULL) { + for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) + if (dp->tqh_last == &bp->b_freelist.tqe_next) + break; + if (dp == &bufqueues[BQUEUES]) + panic("bremfree: lost tail"); + } + TAILQ_REMOVE(dp, bp, b_freelist); + whichq = bp->b_whichq; +#if BALANCE_QUEUES + bufqdec(whichq); +#endif + bp->b_whichq = -1; + bp->b_timestamp = 0; +} + +/* + * Associate a buffer with a vnode. + */ +static void +bgetvp(vnode_t vp, buf_t bp) +{ + + if (bp->b_vp != vp) + panic("bgetvp: not free"); + + if (vp->v_type == VBLK || vp->v_type == VCHR) + bp->b_dev = vp->v_rdev; + else + bp->b_dev = NODEV; + /* + * Insert onto list for new vnode. + */ + lck_mtx_lock(buf_mtxp); + bufinsvn(bp, &vp->v_cleanblkhd); + lck_mtx_unlock(buf_mtxp); +} + +/* + * Disassociate a buffer from a vnode. 
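[Editorial note] buf_iterprepare()/buf_itercomplete() above implement the private-list trick used by buf_iterate(), buf_invalidateblks() and buf_flushdirtyblks(): the vnode's buffer list is moved wholesale onto a local list head under buf_mtxp, walked safely, then spliced back, with VBI_ITER/VBI_ITERWANT serializing concurrent iterators. Filesystems consume this through buf_iterate(); a minimal sketch of a callout, hypothetical client code only:

    static int
    count_dirty_callout(buf_t bp, void *arg)
    {
        int *count = arg;

        (void)bp;
        (*count)++;
        return (BUF_RETURNED);  /* hand the buffer back; buf_iterate() brelse()s it */
    }

    static int
    fs_count_dirty(vnode_t vp)
    {
        int count = 0;

        buf_iterate(vp, count_dirty_callout, BUF_SKIP_LOCKED, &count);
        return (count);
    }

Returning BUF_CLAIMED instead tells buf_iterate() the callout kept the buffer, and the *_DONE variants terminate the walk early.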
+ */ +static void +brelvp(buf_t bp) +{ + vnode_t vp; + + if ((vp = bp->b_vp) == (vnode_t)NULL) + panic("brelvp: NULL vp"); + /* + * Delete from old vnode list, if on one. + */ + lck_mtx_lock(buf_mtxp); + if (bp->b_vnbufs.le_next != NOLIST) + bufremvn(bp); + lck_mtx_unlock(buf_mtxp); + + bp->b_vp = (vnode_t)NULL; +} + +/* + * Reassign a buffer from one vnode to another. + * Used to assign file specific control information + * (indirect blocks) to the vnode to which they belong. + */ +static void +buf_reassign(buf_t bp, vnode_t newvp) +{ + register struct buflists *listheadp; - /* Wait for the read to complete, and return result. */ - return (biowait(bp)); + if (newvp == NULL) { + printf("buf_reassign: NULL"); + return; + } + lck_mtx_lock(buf_mtxp); + + /* + * Delete from old vnode list, if on one. + */ + if (bp->b_vnbufs.le_next != NOLIST) + bufremvn(bp); + /* + * If dirty, put on list of dirty buffers; + * otherwise insert onto list of clean buffers. + */ + if (ISSET(bp->b_flags, B_DELWRI)) + listheadp = &newvp->v_dirtyblkhd; + else + listheadp = &newvp->v_cleanblkhd; + bufinsvn(bp, listheadp); + + lck_mtx_unlock(buf_mtxp); } -/* - * Read-ahead multiple disk blocks. The first is sync, the rest async. - */ -int -breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp) - struct vnode *vp; - daddr_t blkno; int size; - daddr_t rablks[]; int rasizes[]; - int nrablks; - struct ucred *cred; - struct buf **bpp; +static __inline__ void +bufhdrinit(buf_t bp) { - return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ)); + bzero((char *)bp, sizeof *bp); + bp->b_dev = NODEV; + bp->b_rcred = NOCRED; + bp->b_wcred = NOCRED; + bp->b_vnbufs.le_next = NOLIST; + bp->b_flags = B_INVAL; + + return; } /* - * Read-ahead multiple disk blocks. The first is sync, the rest async. - * [breadn() for meta-data] + * Initialize buffers and hash links for buffers. 
*/ -int -meta_breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp) - struct vnode *vp; - daddr_t blkno; int size; - daddr_t rablks[]; int rasizes[]; - int nrablks; - struct ucred *cred; - struct buf **bpp; +__private_extern__ void +bufinit() { - return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META)); + buf_t bp; + struct bqueues *dp; + int i; + int metabuf; + long whichq; + + /* Initialize the buffer queues ('freelists') and the hash table */ + for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) + TAILQ_INIT(dp); + bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash); + + metabuf = nbuf/8; /* reserved for meta buf */ + + /* Initialize the buffer headers */ + for (i = 0; i < nbuf; i++) { + bp = &buf[i]; + bufhdrinit(bp); + + /* + * metabuf buffer headers on the meta-data list and + * rest of the buffer headers on the empty list + */ + if (--metabuf) + whichq = BQ_META; + else + whichq = BQ_EMPTY; + + BLISTNONE(bp); + dp = &bufqueues[whichq]; + binsheadfree(bp, dp, whichq); + binshash(bp, &invalhash); + } + + for (; i < nbuf + niobuf; i++) { + bp = &buf[i]; + bufhdrinit(bp); + binsheadfree(bp, &iobufqueue, -1); + } + + /* + * allocate lock group attribute and group + */ + buf_mtx_grp_attr = lck_grp_attr_alloc_init(); + //lck_grp_attr_setstat(buf_mtx_grp_attr); + buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr); + + /* + * allocate the lock attribute + */ + buf_mtx_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(buf_mtx_attr); + + /* + * allocate and initialize mutex's for the buffer and iobuffer pools + */ + buf_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr); + iobuffer_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr); + + if (iobuffer_mtxp == NULL) + panic("couldn't create iobuffer mutex"); + + if (buf_mtxp == NULL) + panic("couldn't create buf mutex"); + + /* + * allocate and initialize cluster specific global locks... + */ + cluster_init(); + + printf("using %d buffer headers and %d cluster IO buffer headers\n", + nbuf, niobuf); + + /* Set up zones used by the buffer cache */ + bufzoneinit(); + + /* start the bcleanbuf() thread */ + bcleanbuf_thread_init(); + +#if BALANCE_QUEUES + { + static void bufq_balance_thread_init(); + /* create a thread to do dynamic buffer queue balancing */ + bufq_balance_thread_init(); + } +#endif /* notyet */ +} + +static struct buf * +bio_doread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, int async, int queuetype) +{ + buf_t bp; + + bp = buf_getblk(vp, blkno, size, 0, 0, queuetype); + + /* + * If buffer does not have data valid, start a read. + * Note that if buffer is B_INVAL, buf_getblk() won't return it. + * Therefore, it's valid if it's I/O has completed or been delayed. + */ + if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) { + struct proc *p; + + p = current_proc(); + + /* Start I/O for the buffer (keeping credentials). */ + SET(bp->b_flags, B_READ | async); + if (cred != NOCRED && bp->b_rcred == NOCRED) { + kauth_cred_ref(cred); + bp->b_rcred = cred; + } + + VNOP_STRATEGY(bp); + + trace(TR_BREADMISS, pack(vp, size), blkno); + + /* Pay for the read. */ + if (p && p->p_stats) + p->p_stats->p_ru.ru_inblock++; /* XXX */ + + if (async) { + /* + * since we asked for an ASYNC I/O + * the biodone will do the brelse + * we don't want to pass back a bp + * that we don't 'own' + */ + bp = NULL; + } + } else if (async) { + buf_brelse(bp); + bp = NULL; + } + + trace(TR_BREADHIT, pack(vp, size), blkno); + + return (bp); } /* - * Perform the reads for breadn() and meta_breadn(). 
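
/*
 * The key contract in bio_doread() above: once an I/O has been issued with
 * B_ASYNC set, buf_biodone() will buf_brelse() the buffer, so the caller
 * must not keep a pointer it no longer owns.  Returning NULL makes the
 * rule hard to get wrong.  A compilable skeleton of that shape (issue_io()
 * is a hypothetical stand-in for VNOP_STRATEGY()):
 */
struct buf;
extern void issue_io(struct buf *bp);

struct buf *
start_read(struct buf *bp, int async)
{
	issue_io(bp);
	if (async)
		return (NULL);	/* the completion path owns bp now */
	return (bp);		/* sync: caller will buf_biowait() it */
}
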
+ * Perform the reads for buf_breadn() and buf_meta_breadn(). * Trivial modification to the breada algorithm presented in Bach (p.55). */ -static int -do_breadn_for_type(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks, int *rasizes, - int nrablks, struct ucred *cred, struct buf **bpp, int queuetype) +static errno_t +do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, + int nrablks, ucred_t cred, buf_t *bpp, int queuetype) { - register struct buf *bp; - int i; + buf_t bp; + int i; bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype); @@ -578,38 +1529,73 @@ do_breadn_for_type(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks, i } /* Otherwise, we had to start a read for it; wait until it's valid. */ - return (biowait(bp)); + return (buf_biowait(bp)); } + /* - * Read with single-block read-ahead. Defined in Bach (p.55), but - * implemented as a call to breadn(). - * XXX for compatibility with old file systems. + * Read a disk block. + * This algorithm described in Bach (p.54). */ -int -breada(vp, blkno, size, rablkno, rabsize, cred, bpp) - struct vnode *vp; - daddr_t blkno; int size; - daddr_t rablkno; int rabsize; - struct ucred *cred; - struct buf **bpp; +errno_t +buf_bread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, buf_t *bpp) +{ + buf_t bp; + + /* Get buffer for block. */ + bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ); + + /* Wait for the read to complete, and return result. */ + return (buf_biowait(bp)); +} + +/* + * Read a disk block. [bread() for meta-data] + * This algorithm described in Bach (p.54). + */ +errno_t +buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, buf_t *bpp) +{ + buf_t bp; + + /* Get buffer for block. */ + bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META); + + /* Wait for the read to complete, and return result. */ + return (buf_biowait(bp)); +} + +/* + * Read-ahead multiple disk blocks. The first is sync, the rest async. + */ +errno_t +buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, ucred_t cred, buf_t *bpp) { + return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ)); +} - return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp)); +/* + * Read-ahead multiple disk blocks. The first is sync, the rest async. + * [buf_breadn() for meta-data] + */ +errno_t +buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, ucred_t cred, buf_t *bpp) +{ + return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META)); } /* * Block write. Described in Bach (p.56) */ -int -bwrite(bp) - struct buf *bp; +errno_t +buf_bwrite(buf_t bp) { - int rv, sync, wasdelayed; - struct proc *p = current_proc(); - struct vnode *vp = bp->b_vp; + int sync, wasdelayed; + errno_t rv; + proc_t p = current_proc(); + vnode_t vp = bp->b_vp; - if (bp->b_data == 0) { + if (bp->b_datap == 0) { if (brecover_data(bp) == 0) return (0); } @@ -617,10 +1603,9 @@ bwrite(bp) sync = !ISSET(bp->b_flags, B_ASYNC); wasdelayed = ISSET(bp->b_flags, B_DELWRI); CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI)); - if (wasdelayed) { - nbdwrite--; - wakeup((caddr_t)&nbdwrite); - } + + if (wasdelayed) + OSAddAtomic(-1, &nbdwrite); if (!sync) { /* @@ -630,25 +1615,24 @@ bwrite(bp) * be properly notified that its I/O has completed. 
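
/*
 * buf_breadn()/buf_meta_breadn() above implement Bach's breada(): the
 * requested block is read synchronously while the rablks[] reads are
 * merely started.  A loose user-space analogue on platforms that provide
 * posix_fadvise(2) (Mac OS X itself would use fcntl(F_RDADVISE) instead);
 * this illustrates the contract, not the kernel path:
 */
#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <unistd.h>

static ssize_t
read_with_readahead(int fd, off_t blkno, size_t blksz,
    const off_t *rablks, int nrablks, void *buf)
{
	int i;

	for (i = 0; i < nrablks; i++)	/* "async" part: advisory only */
		(void)posix_fadvise(fd, rablks[i] * (off_t)blksz,
		    (off_t)blksz, POSIX_FADV_WILLNEED);

	/* "sync" part: block until the first block is resident */
	return (pread(fd, buf, blksz, blkno * (off_t)blksz));
}
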
 */
 		if (wasdelayed)
-			reassignbuf(bp, vp);
+			buf_reassign(bp, vp);
 		else
 		if (p && p->p_stats)
 			p->p_stats->p_ru.ru_oublock++;	/* XXX */
 	}
-	trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
 
 	/* Initiate disk write.  Make sure the appropriate party is charged. */
-	SET(bp->b_flags, B_WRITEINPROG);
-	vp->v_numoutput++;
+
+	OSAddAtomic(1, &vp->v_numoutput);
 
-	VOP_STRATEGY(bp);
+	VNOP_STRATEGY(bp);
 
 	if (sync) {
 		/*
 		 * If I/O was synchronous, wait for it to complete.
 		 */
-		rv = biowait(bp);
+		rv = buf_biowait(bp);
 
 		/*
 		 * Pay for the I/O operation, if it's not been paid for, and
@@ -656,7 +1640,7 @@ bwrite(bp)
 		 * were paid for above.)
 		 */
 		if (wasdelayed)
-			reassignbuf(bp, vp);
+			buf_reassign(bp, vp);
 		else
 		if (p && p->p_stats)
 			p->p_stats->p_ru.ru_oublock++;	/* XXX */
 
@@ -664,7 +1648,7 @@ bwrite(bp)
 		/* Release the buffer. */
 		// XXXdbg - only if the unused bit is set
 		if (!ISSET(bp->b_flags, B_NORELSE)) {
-			brelse(bp);
+			buf_brelse(bp);
 		} else {
 			CLR(bp->b_flags, B_NORELSE);
 		}
@@ -677,9 +1661,9 @@ bwrite(bp)
 
 int
 vn_bwrite(ap)
-	struct vop_bwrite_args *ap;
+	struct vnop_bwrite_args *ap;
 {
-	return (bwrite(ap->a_bp));
+	return (buf_bwrite(ap->a_bp));
 }
 
 /*
@@ -697,17 +1681,15 @@ vn_bwrite(ap)
 *
 * Note: With the ability to allocate additional buffer
 * headers, we can get into the situation where "too" many
- * bdwrite()s can create a situation where the kernel can create
- * buffers faster than the disks can service. Doing a bawrite() in
- * cases where we have "too many" outstanding bdwrite()s avoids that.
+ * buf_bdwrite()s can create a situation where the kernel can create
+ * buffers faster than the disks can service. Doing a buf_bawrite() in
+ * cases where we have "too many" outstanding buf_bdwrite()s avoids that.
*/
__private_extern__ int
-bdwrite_internal(bp, return_error)
-	struct buf *bp;
-	int return_error;
+bdwrite_internal(buf_t bp, int return_error)
{
-	struct proc *p = current_proc();
-	struct vnode *vp = bp->b_vp;
+	proc_t	p = current_proc();
+	vnode_t	vp = bp->b_vp;
 
 	/*
 	 * If the block hasn't been seen before:
@@ -719,265 +1701,250 @@ bdwrite_internal(bp, return_error)
 		SET(bp->b_flags, B_DELWRI);
 		if (p && p->p_stats)
 			p->p_stats->p_ru.ru_oublock++;	/* XXX */
-		nbdwrite ++;
-		reassignbuf(bp, vp);
+		OSAddAtomic(1, &nbdwrite);
+		buf_reassign(bp, vp);
 	}
 
 	/* If this is a tape block, write the block now. */
 	if (ISSET(bp->b_flags, B_TAPE)) {
-		/* bwrite(bp); */
-		VOP_BWRITE(bp);
+		VNOP_BWRITE(bp);
 		return (0);
 	}
 
 	/*
-	 * If the vnode has "too many" write operations in progress
-	 * wait for them to finish the IO
-	 */
-	while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
-		vp->v_flag |= VTHROTTLED;
-		(void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bdwrite", 0);
-	}
-
-	/*
-	 * If we have too many delayed write buffers,
-	 * more than we can "safely" handle, just fall back to
-	 * doing the async write
+	 * if we're not LOCKED, but the total number of delayed writes
+	 * has climbed above 75% of the total buffers in the system,
+	 * return an error if the caller has indicated that it can
+	 * handle one in this case; otherwise schedule the I/O now.
+	 * this is done to prevent us from allocating tons of extra
+	 * buffers when dealing with virtual disks (i.e. DiskImages),
+	 * because additional buffers are dynamically allocated to prevent
+	 * deadlocks from occurring
+	 *
+	 * however, we can't do a buf_bawrite() if the LOCKED bit is set because the
+	 * buffer is part of a transaction and can't go to disk until
	 * the LOCKED bit is cleared.
*/ - if (nbdwrite < 0) - panic("bdwrite: Negative nbdwrite"); - - // can't do a bawrite() if the LOCKED bit is set because the - // buffer is part of a transaction and can't go to disk until - // the LOCKED bit is cleared. if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) { if (return_error) return (EAGAIN); - else - bawrite(bp); - return (0); + /* + * If the vnode has "too many" write operations in progress + * wait for them to finish the IO + */ + (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (char *)"buf_bdwrite"); + + return (buf_bawrite(bp)); } /* Otherwise, the "write" is done, so mark and release the buffer. */ SET(bp->b_flags, B_DONE); - brelse(bp); + buf_brelse(bp); return (0); } -void -bdwrite(bp) - struct buf *bp; +errno_t +buf_bdwrite(buf_t bp) { - (void) bdwrite_internal(bp, 0); + return (bdwrite_internal(bp, 0)); } /* - * Asynchronous block write; just an asynchronous bwrite(). + * Asynchronous block write; just an asynchronous buf_bwrite(). * * Note: With the abilitty to allocate additional buffer * headers, we can get in to the situation where "too" many - * bawrite()s can create situation where the kernel can create + * buf_bawrite()s can create situation where the kernel can create * buffers faster than the disks can service. * We limit the number of "in flight" writes a vnode can have to * avoid this. */ static int -bawrite_internal(bp, throttle) - struct buf *bp; - int throttle; +bawrite_internal(buf_t bp, int throttle) { - struct vnode *vp = bp->b_vp; + vnode_t vp = bp->b_vp; if (vp) { - /* - * If the vnode has "too many" write operations in progress - * wait for them to finish the IO - */ - while (vp->v_numoutput >= BUFWRITE_THROTTLE) { - if (throttle) { - vp->v_flag |= VTHROTTLED; - (void)tsleep((caddr_t)&vp->v_numoutput, - PRIBIO + 1, "bawrite", 0); - } else - return (EWOULDBLOCK); - } + if (throttle) + /* + * If the vnode has "too many" write operations in progress + * wait for them to finish the IO + */ + (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite"); + else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE) + /* + * return to the caller and + * let him decide what to do + */ + return (EWOULDBLOCK); } - SET(bp->b_flags, B_ASYNC); - VOP_BWRITE(bp); - return (0); -} -void -bawrite(bp) - struct buf *bp; -{ - (void) bawrite_internal(bp, 1); + return (VNOP_BWRITE(bp)); } -/* - * bwillwrite: - * - * Called prior to the locking of any vnodes when we are expecting to - * write. We do not want to starve the buffer cache with too many - * dirty buffers so we block here. By blocking prior to the locking - * of any vnodes we attempt to avoid the situation where a locked vnode - * prevents the various system daemons from flushing related buffers. - */ - -void -bwillwrite(void) +errno_t +buf_bawrite(buf_t bp) { - /* XXX To be implemented later */ + return (bawrite_internal(bp, 1)); } + /* * Release a buffer on to the free lists. * Described in Bach (p. 46). 
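
/*
 * bawrite_internal() above offers the two classic throttle policies: sleep
 * until the per-vnode async write count drains (throttle != 0), or bail
 * out with EWOULDBLOCK and let the caller decide.  A sketch of the same
 * choice with a POSIX counting semaphore standing in for v_numoutput /
 * VNODE_ASYNC_THROTTLE (names invented; the completion path would
 * sem_post() the slot back, much as buf_biodone() calls vnode_writedone()):
 */
#include <errno.h>
#include <semaphore.h>

#define ASYNC_THROTTLE	16

static sem_t write_slots;	/* initialized elsewhere to ASYNC_THROTTLE */

int
start_async_write(int throttle)
{
	if (throttle)
		sem_wait(&write_slots);		/* sleep until a slot drains */
	else if (sem_trywait(&write_slots) != 0)
		return (EWOULDBLOCK);		/* caller decides what to do */

	/* ... issue the write; the I/O completion does sem_post() ... */
	return (0);
}
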
*/ void -brelse(bp) - struct buf *bp; +buf_brelse(buf_t bp) { struct bqueues *bufq; - int s; - long whichq; + long whichq; + upl_t upl; + int need_wakeup = 0; + int need_bp_wakeup = 0; + + + if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY)) + panic("buf_brelse: bad buffer = %x\n", bp); + +#ifdef JOE_DEBUG + bp->b_stackbrelse[0] = __builtin_return_address(0); + bp->b_stackbrelse[1] = __builtin_return_address(1); + bp->b_stackbrelse[2] = __builtin_return_address(2); + bp->b_stackbrelse[3] = __builtin_return_address(3); + bp->b_stackbrelse[4] = __builtin_return_address(4); + bp->b_stackbrelse[5] = __builtin_return_address(5); + + bp->b_lastbrelse = current_thread(); + bp->b_tag = 0; +#endif + if (bp->b_lflags & BL_IOBUF) { + free_io_buf(bp); + return; + } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START, - bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_data, + bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_datap, bp->b_flags, 0); trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); - // if we're invalidating a buffer that has the B_CALL bit - // set then call the b_iodone function so it gets cleaned - // up properly. - // + /* + * if we're invalidating a buffer that has the B_FILTER bit + * set then call the b_iodone function so it gets cleaned + * up properly. + * + * the HFS journal code depends on this + */ if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) { - if (ISSET(bp->b_flags, B_CALL) && !ISSET(bp->b_flags, B_DELWRI)) { - panic("brelse: CALL flag set but not DELWRI! bp 0x%x\n", bp); - } - if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */ - void (*iodone_func)(struct buf *) = bp->b_iodone; + if (ISSET(bp->b_flags, B_FILTER)) { /* if necessary, call out */ + void (*iodone_func)(struct buf *, void *) = bp->b_iodone; + void *arg = (void *)bp->b_transaction; - CLR(bp->b_flags, B_CALL); /* but note callout done */ + CLR(bp->b_flags, B_FILTER); /* but note callout done */ bp->b_iodone = NULL; + bp->b_transaction = NULL; if (iodone_func == NULL) { panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp); } - (*iodone_func)(bp); + (*iodone_func)(bp, arg); } } - - /* IO is done. Cleanup the UPL state */ - if (!ISSET(bp->b_flags, B_META) - && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) { + /* + * I/O is done. 
Cleanup the UPL state + */ + upl = bp->b_upl; + + if ( !ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) { kern_return_t kret; - upl_t upl; int upl_flags; - if ( !ISSET(bp->b_flags, B_PAGELIST)) { + if ( (upl == NULL) ) { if ( !ISSET(bp->b_flags, B_INVAL)) { kret = ubc_create_upl(bp->b_vp, - ubc_blktooff(bp->b_vp, bp->b_lblkno), - bp->b_bufsize, - &upl, - NULL, - UPL_PRECIOUS); + ubc_blktooff(bp->b_vp, bp->b_lblkno), + bp->b_bufsize, + &upl, + NULL, + UPL_PRECIOUS); + if (kret != KERN_SUCCESS) - panic("brelse: Failed to get pagelists"); -#ifdef UBC_DEBUG + panic("brelse: Failed to create UPL"); +#ifdef UPL_DEBUG upl_ubc_alias_set(upl, bp, 5); -#endif /* UBC_DEBUG */ - } else - upl = (upl_t) 0; +#endif /* UPL_DEBUG */ + } } else { - upl = bp->b_pagelist; - - if (bp->b_data) { + if (bp->b_datap) { kret = ubc_upl_unmap(upl); if (kret != KERN_SUCCESS) - panic("kernel_upl_unmap failed"); - bp->b_data = 0; + panic("ubc_upl_unmap failed"); + bp->b_datap = (uintptr_t)NULL; } } if (upl) { if (bp->b_flags & (B_ERROR | B_INVAL)) { - if (bp->b_flags & (B_READ | B_INVAL)) + if (bp->b_flags & (B_READ | B_INVAL)) upl_flags = UPL_ABORT_DUMP_PAGES; else upl_flags = 0; + ubc_upl_abort(upl, upl_flags); } else { - if (ISSET(bp->b_flags, B_NEEDCOMMIT)) - upl_flags = UPL_COMMIT_CLEAR_DIRTY ; - else if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY)) - upl_flags = UPL_COMMIT_SET_DIRTY ; - else - upl_flags = UPL_COMMIT_CLEAR_DIRTY ; + if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY)) + upl_flags = UPL_COMMIT_SET_DIRTY ; + else + upl_flags = UPL_COMMIT_CLEAR_DIRTY ; + ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags | - UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); + UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); } - s = splbio(); - CLR(bp->b_flags, B_PAGELIST); - bp->b_pagelist = 0; - splx(s); + bp->b_upl = NULL; } } else { - if(ISSET(bp->b_flags, B_PAGELIST)) - panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp); + if ( (upl) ) + panic("brelse: UPL set for non VREG; vp=%x", bp->b_vp); } - /* Wake up any processes waiting for any buffer to become free. */ - if (needbuffer) { - needbuffer = 0; - wakeup(&needbuffer); - } - - /* Wake up any proceeses waiting for _this_ buffer to become free. */ - if (ISSET(bp->b_flags, B_WANTED)) { - CLR(bp->b_flags, B_WANTED); - wakeup(bp); - } - - /* Block disk interrupts. */ - s = splbio(); - /* - * Determine which queue the buffer should be on, then put it there. + * If it's locked, don't report an error; try again later. */ - - /* If it's locked, don't report an error; try again later. */ if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR)) CLR(bp->b_flags, B_ERROR); - - /* If it's not cacheable, or an error, mark it invalid. */ + /* + * If it's not cacheable, or an error, mark it invalid. + */ if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR))) SET(bp->b_flags, B_INVAL); - + if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) { /* * If it's invalid or empty, dissociate it from its vnode * and put on the head of the appropriate queue. */ - if (bp->b_vp) - brelvp(bp); - if (ISSET(bp->b_flags, B_DELWRI)) { - CLR(bp->b_flags, B_DELWRI); - nbdwrite--; - wakeup((caddr_t)&nbdwrite); - } + if (bp->b_vp) + brelvp(bp); + + if (ISSET(bp->b_flags, B_DELWRI)) + OSAddAtomic(-1, &nbdwrite); + + CLR(bp->b_flags, (B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE)); + /* + * Determine which queue the buffer should be on, then put it there. 
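
/*
 * The UPL handling above reduces to a small decision table: failed or
 * invalidated buffers abort their pages (dumping them when the contents
 * can no longer be trusted), everything else commits, carrying the dirty
 * bit over to the VM layer.  A self-contained restatement (toy flag
 * values, not the kernel's):
 */
#define B_READ		0x01
#define B_ERROR		0x02
#define B_INVAL		0x04
#define B_DELWRI	0x08
#define B_WASDIRTY	0x10

enum upl_action { COMMIT_CLEAN, COMMIT_DIRTY, ABORT_KEEP, ABORT_DUMP };

static enum upl_action
upl_disposition(int flags)
{
	if (flags & (B_ERROR | B_INVAL))
		return ((flags & (B_READ | B_INVAL)) ?
		    ABORT_DUMP :	/* contents are junk: toss the pages */
		    ABORT_KEEP);	/* failed write: pages stay dirty */
	return ((flags & (B_DELWRI | B_WASDIRTY)) ?
	    COMMIT_DIRTY : COMMIT_CLEAN);
}
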
+ */ if (bp->b_bufsize <= 0) whichq = BQ_EMPTY; /* no data */ else if (ISSET(bp->b_flags, B_META)) whichq = BQ_META; /* meta-data */ else whichq = BQ_AGE; /* invalid data */ - bufq = &bufqueues[whichq]; + + lck_mtx_lock(buf_mtxp); + binsheadfree(bp, bufq, whichq); } else { /* @@ -992,19 +1959,52 @@ brelse(bp) whichq = BQ_AGE; /* stale but valid data */ else whichq = BQ_LRU; /* valid data */ - bufq = &bufqueues[whichq]; + + CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE)); + + lck_mtx_lock(buf_mtxp); + binstailfree(bp, bufq, whichq); } + if (needbuffer) { + /* + * needbuffer is a global + * we're currently using buf_mtxp to protect it + * delay doing the actual wakeup until after + * we drop buf_mtxp + */ + needbuffer = 0; + need_wakeup = 1; + } + if (ISSET(bp->b_lflags, BL_WANTED)) { + /* + * delay the actual wakeup until after we + * clear BL_BUSY and we've dropped buf_mtxp + */ + need_bp_wakeup = 1; + } + /* + * Unlock the buffer. + */ + CLR(bp->b_lflags, (BL_BUSY | BL_WANTED)); - /* Unlock the buffer. */ - CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE)); - - /* Allow disk interrupts. */ - splx(s); + lck_mtx_unlock(buf_mtxp); + if (need_wakeup) { + /* + * Wake up any processes waiting for any buffer to become free. + */ + wakeup(&needbuffer); + } + if (need_bp_wakeup) { + /* + * Wake up any proceeses waiting for _this_ buffer to become free. + */ + wakeup(bp); + } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END, - (int)bp, (int)bp->b_data, bp->b_flags, 0, 0); + (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0); } /* @@ -1014,10 +2014,25 @@ brelse(bp) * we normally don't return the buffer, unless the caller explicitly * wants us to. */ -struct buf * -incore(vp, blkno) - struct vnode *vp; - daddr_t blkno; +static boolean_t +incore(vnode_t vp, daddr64_t blkno) +{ + boolean_t retval; + + lck_mtx_lock(buf_mtxp); + + if (incore_locked(vp, blkno)) + retval = TRUE; + else + retval = FALSE; + lck_mtx_unlock(buf_mtxp); + + return (retval); +} + + +static buf_t +incore_locked(vnode_t vp, daddr64_t blkno) { struct buf *bp; @@ -1026,10 +2041,10 @@ incore(vp, blkno) /* Search hash chain */ for (; bp != NULL; bp = bp->b_hash.le_next) { if (bp->b_lblkno == blkno && bp->b_vp == vp && - !ISSET(bp->b_flags, B_INVAL)) + !ISSET(bp->b_flags, B_INVAL)) { return (bp); + } } - return (0); } @@ -1043,112 +2058,123 @@ incore(vp, blkno) * correct size. It is up to the caller to insure that the * cached blocks be of the correct size. 
*/ -struct buf * -getblk(vp, blkno, size, slpflag, slptimeo, operation) - register struct vnode *vp; - daddr_t blkno; - int size, slpflag, slptimeo, operation; +buf_t +buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation) { - struct buf *bp; - int s, err; + buf_t bp; + int err; upl_t upl; upl_page_info_t *pl; kern_return_t kret; - int error=0; - int pagedirty = 0; + int ret_only_valid; + struct timespec ts; + int upl_flags; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START, - blkno * PAGE_SIZE, size, operation, 0, 0); -start: + (int)(blkno * PAGE_SIZE), size, operation, 0, 0); - s = splbio(); - if ((bp = incore(vp, blkno))) { - /* Found in the Buffer Cache */ - if (ISSET(bp->b_flags, B_BUSY)) { - /* but is busy */ + ret_only_valid = operation & BLK_ONLYVALID; + operation &= ~BLK_ONLYVALID; +start: + lck_mtx_lock(buf_mtxp); +start_locked: + if ((bp = incore_locked(vp, blkno))) { + /* + * Found in the Buffer Cache + */ + if (ISSET(bp->b_lflags, BL_BUSY)) { + /* + * but is busy + */ switch (operation) { case BLK_READ: case BLK_WRITE: case BLK_META: - SET(bp->b_flags, B_WANTED); + SET(bp->b_lflags, BL_WANTED); bufstats.bufs_busyincore++; - err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk", - slptimeo); - splx(s); + + /* + * don't retake the mutex after being awakened... + * the time out is in msecs + */ + ts.tv_sec = (slptimeo/1000); + ts.tv_nsec = (slptimeo % 1000) * 10 * NSEC_PER_USEC * 1000; + + err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts); + /* * Callers who call with PCATCH or timeout are * willing to deal with the NULL pointer */ - if (err && ((slpflag & PCATCH) || - ((err == EWOULDBLOCK) && slptimeo))) + if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo))) return (NULL); goto start; /*NOTREACHED*/ break; - case BLK_PAGEIN: - /* pagein operation must not use getblk */ - panic("getblk: pagein for incore busy buffer"); - splx(s); - /*NOTREACHED*/ - break; - - case BLK_PAGEOUT: - /* pageout operation must not use getblk */ - panic("getblk: pageout for incore busy buffer"); - splx(s); - /*NOTREACHED*/ - break; - default: - panic("getblk: %d unknown operation 1", operation); + /* + * unknown operation requested + */ + panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation); /*NOTREACHED*/ break; } } else { - /* not busy */ - SET(bp->b_flags, (B_BUSY | B_CACHE)); - bremfree(bp); + /* + * buffer in core and not busy + */ + if ( (bp->b_upl) ) + panic("buffer has UPL, but not marked BUSY: %x", bp); + SET(bp->b_lflags, BL_BUSY); + SET(bp->b_flags, B_CACHE); +#ifdef JOE_DEBUG + bp->b_owner = current_thread(); + bp->b_tag = 1; +#endif + bremfree_locked(bp); bufstats.bufs_incore++; - splx(s); + + lck_mtx_unlock(buf_mtxp); - allocbuf(bp, size); - if (ISSET(bp->b_flags, B_PAGELIST)) - panic("pagelist buffer is not busy"); + if ( !ret_only_valid) + allocbuf(bp, size); + upl_flags = 0; switch (operation) { - case BLK_READ: case BLK_WRITE: - if (UBCISVALID(bp->b_vp) && bp->b_bufsize) { + /* + * "write" operation: let the UPL subsystem + * know that we intend to modify the buffer + * cache pages we're gathering. 
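
/*
 * A note on the timeout conversion above: the comment says slptimeo is in
 * msecs, but (slptimeo % 1000) * 10 * NSEC_PER_USEC * 1000 yields ten
 * million nanoseconds per unit, i.e. a 10ms tick (hz == 100), matching the
 * variant used later in buf_acquire_locked().  For genuine milliseconds
 * the conversion would be:
 */
#include <time.h>

#define NSEC_PER_USEC	1000LL

static struct timespec
ms_to_timespec(long ms)
{
	struct timespec ts;

	ts.tv_sec = ms / 1000;
	ts.tv_nsec = (long)((ms % 1000) * NSEC_PER_USEC * 1000);
	return (ts);
}
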
+ */ + upl_flags |= UPL_WILL_MODIFY; + case BLK_READ: + upl_flags |= UPL_PRECIOUS; + if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) { kret = ubc_create_upl(vp, - ubc_blktooff(vp, bp->b_lblkno), - bp->b_bufsize, - &upl, - &pl, - UPL_PRECIOUS); + ubc_blktooff(vp, bp->b_lblkno), + bp->b_bufsize, + &upl, + &pl, + upl_flags); if (kret != KERN_SUCCESS) - panic("Failed to get pagelists"); + panic("Failed to create UPL"); - SET(bp->b_flags, B_PAGELIST); - bp->b_pagelist = upl; + bp->b_upl = upl; - if (!upl_valid_page(pl, 0)) { - if (vp->v_tag != VT_NFS) - panic("getblk: incore buffer without valid page"); - CLR(bp->b_flags, B_CACHE); - } + if (upl_valid_page(pl, 0)) { + if (upl_dirty_page(pl, 0)) + SET(bp->b_flags, B_WASDIRTY); + else + CLR(bp->b_flags, B_WASDIRTY); + } else + CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI)); - if (upl_dirty_page(pl, 0)) - SET(bp->b_flags, B_WASDIRTY); - else - CLR(bp->b_flags, B_WASDIRTY); + kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap)); - kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data)); if (kret != KERN_SUCCESS) - panic("getblk: ubc_upl_map() failed with (%d)", - kret); - if (bp->b_data == 0) - panic("ubc_upl_map mapped 0"); + panic("getblk: ubc_upl_map() failed with (%d)", kret); } break; @@ -1157,35 +2183,42 @@ start: * VM is not involved in IO for the meta data * buffer already has valid data */ - if(bp->b_data == 0) - panic("bp->b_data null incore buf=%x", bp); - break; - - case BLK_PAGEIN: - case BLK_PAGEOUT: - panic("getblk: paging operation 1"); break; default: - panic("getblk: %d unknown operation 2", operation); + panic("getblk: paging or unknown operation for incore buffer- %d\n", operation); /*NOTREACHED*/ break; } } } else { /* not incore() */ int queue = BQ_EMPTY; /* Start with no preference */ - splx(s); - if ((operation == BLK_META) || (UBCINVALID(vp)) || - !(UBCINFOEXISTS(vp))) { - operation = BLK_META; + if (ret_only_valid) { + lck_mtx_unlock(buf_mtxp); + return (NULL); } + + if ((UBCINVALID(vp)) || !(UBCINFOEXISTS(vp))) + operation = BLK_META; + if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL) - goto start; - if (incore(vp, blkno)) { + goto start_locked; + + /* + * getnewbuf may block for a number of different reasons... + * if it does, it's then possible for someone else to + * create a buffer for the same block and insert it into + * the hash... 
if we see it incore at this point we dump + * the buffer we were working on and start over + */ + if (incore_locked(vp, blkno)) { SET(bp->b_flags, B_INVAL); binshash(bp, &invalhash); - brelse(bp); + + lck_mtx_unlock(buf_mtxp); + + buf_brelse(bp); goto start; } /* @@ -1194,15 +2227,11 @@ start: */ /* - * if it is meta, the queue may be set to other - * type so reset as well as mark it to be B_META + * mark the buffer as B_META if indicated * so that when buffer is released it will goto META queue - * Also, if the vnode is not VREG, then it is META */ - if (operation == BLK_META) { - SET(bp->b_flags, B_META); - queue = BQ_META; - } + if (operation == BLK_META) + SET(bp->b_flags, B_META); bp->b_blkno = bp->b_lblkno = blkno; bp->b_vp = vp; @@ -1212,158 +2241,138 @@ start: */ binshash(bp, BUFHASH(vp, blkno)); - s = splbio(); + lck_mtx_unlock(buf_mtxp); + bgetvp(vp, bp); - splx(s); allocbuf(bp, size); + upl_flags = 0; switch (operation) { case BLK_META: - /* buffer data is invalid */ - - if(bp->b_data == 0) - panic("bp->b_data is null %x",bp); - - bufstats.bufs_miss++; - - /* wakeup the buffer */ - CLR(bp->b_flags, B_WANTED); - wakeup(bp); + /* + * buffer data is invalid... + * + * I don't want to have to retake buf_mtxp, + * so the miss and vmhits counters are done + * with Atomic updates... all other counters + * in bufstats are protected with either + * buf_mtxp or iobuffer_mtxp + */ + OSAddAtomic(1, &bufstats.bufs_miss); break; - case BLK_READ: case BLK_WRITE: + /* + * "write" operation: let the UPL subsystem know + * that we intend to modify the buffer cache pages + * we're gathering. + */ + upl_flags |= UPL_WILL_MODIFY; + case BLK_READ: + { off_t f_offset; + size_t contig_bytes; + int bmap_flags; - if (ISSET(bp->b_flags, B_PAGELIST)) - panic("B_PAGELIST in bp=%x",bp); + if ( (bp->b_upl) ) + panic("bp already has UPL: %x",bp); + f_offset = ubc_blktooff(vp, blkno); + + upl_flags |= UPL_PRECIOUS; kret = ubc_create_upl(vp, - ubc_blktooff(vp, blkno), - bp->b_bufsize, - &upl, - &pl, - UPL_PRECIOUS); - if (kret != KERN_SUCCESS) - panic("Failed to get pagelists"); + f_offset, + bp->b_bufsize, + &upl, + &pl, + upl_flags); -#ifdef UBC_DEBUG + if (kret != KERN_SUCCESS) + panic("Failed to create UPL"); +#ifdef UPL_DEBUG upl_ubc_alias_set(upl, bp, 4); -#endif /* UBC_DEBUG */ - bp->b_pagelist = upl; - - SET(bp->b_flags, B_PAGELIST); +#endif /* UPL_DEBUG */ + bp->b_upl = upl; if (upl_valid_page(pl, 0)) { - SET(bp->b_flags, B_CACHE | B_DONE); - bufstats.bufs_vmhits++; - - pagedirty = upl_dirty_page(pl, 0); - if (pagedirty) - SET(bp->b_flags, B_WASDIRTY); - - if (vp->v_tag == VT_NFS) { - off_t f_offset; - int valid_size; + if (operation == BLK_READ) + bmap_flags = VNODE_READ; + else + bmap_flags = VNODE_WRITE; - bp->b_validoff = 0; - bp->b_dirtyoff = 0; + SET(bp->b_flags, B_CACHE | B_DONE); - f_offset = ubc_blktooff(vp, blkno); + OSAddAtomic(1, &bufstats.bufs_vmhits); - if (f_offset > vp->v_ubcinfo->ui_size) { - CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY)); - bp->b_validend = 0; - bp->b_dirtyend = 0; - } else { - valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE); - bp->b_validend = valid_size; + bp->b_validoff = 0; + bp->b_dirtyoff = 0; - if (pagedirty) - bp->b_dirtyend = valid_size; - else - bp->b_dirtyend = 0; + if (upl_dirty_page(pl, 0)) { + /* page is dirty */ + SET(bp->b_flags, B_WASDIRTY); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE, - bp->b_validend, bp->b_dirtyend, - (int)vp->v_ubcinfo->ui_size, 0, 0); - } + bp->b_validend = bp->b_bcount; + 
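
/*
 * The "dump and start over" path above is the standard optimistic-insert
 * pattern: getnewbuf() may drop buf_mtxp, so after it returns the hash
 * must be searched again before inserting.  A compilable skeleton of the
 * pattern (the externs are invented stand-ins for incore_locked(),
 * getnewbuf() and buf_brelse()):
 */
#include <pthread.h>
#include <stddef.h>

extern void	*hash_lookup(long key);
extern void	 hash_insert(long key, void *v);
extern void	*alloc_may_unlock(void);	/* may drop/retake hash_lock */
extern void	 discard(void *v);

static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;

void *
get_or_create(long key)
{
	void *v;

	pthread_mutex_lock(&hash_lock);
	for (;;) {
		if ((v = hash_lookup(key)) != NULL)
			break;			/* someone else created it */
		v = alloc_may_unlock();		/* lock may have been dropped */
		if (hash_lookup(key) == NULL) {
			hash_insert(key, v);	/* we won the race */
			break;
		}
		discard(v);			/* we lost: retry the lookup */
	}
	pthread_mutex_unlock(&hash_lock);
	return (v);
}
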
bp->b_dirtyend = bp->b_bcount; } else { - bp->b_validoff = 0; - bp->b_dirtyoff = 0; - - if (pagedirty) { - /* page is dirty */ - bp->b_validend = bp->b_bcount; - bp->b_dirtyend = bp->b_bcount; - } else { - /* page is clean */ - bp->b_validend = bp->b_bcount; - bp->b_dirtyend = 0; - } - } - error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL); - if(error) { - panic("getblk: VOP_BMAP failed"); - /*NOTREACHED*/ - /* - * XXX: We probably should invalidate the VM Page - */ - bp->b_error = error; - SET(bp->b_flags, (B_ERROR | B_INVAL)); - /* undo B_DONE that was set before upl_commit() */ - CLR(bp->b_flags, B_DONE); - brelse(bp); - return (0); + /* page is clean */ + bp->b_validend = bp->b_bcount; + bp->b_dirtyend = 0; } + /* + * try to recreate the physical block number associated with + * this buffer... + */ + if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL)) + panic("getblk: VNOP_BLOCKMAP failed"); + /* + * if the extent represented by this buffer + * is not completely physically contiguous on + * disk, than we can't cache the physical mapping + * in the buffer header + */ + if ((long)contig_bytes < bp->b_bcount) + bp->b_blkno = bp->b_lblkno; } else { - bufstats.bufs_miss++; - } - kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data)); - if (kret != KERN_SUCCESS) { - panic("getblk: ubc_upl_map() " - "failed with (%d)", kret); + OSAddAtomic(1, &bufstats.bufs_miss); } - if (bp->b_data == 0) - panic("kernel_upl_map mapped 0"); - - break; + kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap)); - case BLK_PAGEIN: - case BLK_PAGEOUT: - panic("getblk: paging operation 2"); + if (kret != KERN_SUCCESS) + panic("getblk: ubc_upl_map() failed with (%d)", kret); break; + } default: - panic("getblk: %d unknown operation 3", operation); + panic("getblk: paging or unknown operation - %x", operation); /*NOTREACHED*/ break; } } - - if (bp->b_data == NULL) - panic("getblk: bp->b_addr is null"); - - if (bp->b_bufsize & 0xfff) { - if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff)) - panic("getblk: bp->b_bufsize = %d", bp->b_bufsize); - } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END, - (int)bp, (int)bp->b_data, bp->b_flags, 3, 0); - + (int)bp, (int)bp->b_datap, bp->b_flags, 3, 0); + +#ifdef JOE_DEBUG + bp->b_stackgetblk[0] = __builtin_return_address(0); + bp->b_stackgetblk[1] = __builtin_return_address(1); + bp->b_stackgetblk[2] = __builtin_return_address(2); + bp->b_stackgetblk[3] = __builtin_return_address(3); + bp->b_stackgetblk[4] = __builtin_return_address(4); + bp->b_stackgetblk[5] = __builtin_return_address(5); +#endif return (bp); } /* * Get an empty, disassociated buffer of given size. 
*/ -struct buf * -geteblk(size) +buf_t +buf_geteblk(size) int size; { - struct buf *bp; - int queue = BQ_EMPTY; + buf_t bp; + int queue = BQ_EMPTY; + + lck_mtx_lock(buf_mtxp); while ((bp = getnewbuf(0, 0, &queue)) == 0) ; @@ -1375,9 +2384,12 @@ geteblk(size) /* XXX need to implement logic to deal with other queues */ binshash(bp, &invalhash); - allocbuf(bp, size); bufstats.bufs_eblk++; + lck_mtx_unlock(buf_mtxp); + + allocbuf(bp, size); + return (bp); } @@ -1429,7 +2441,7 @@ getbufzone(size_t size) if ((size % 512) || (size < MINMETA) || (size > MAXMETA)) panic("getbufzone: incorect size = %d", size); - for (i = 0; meta_zones[i].mz_size != 0; i++) { + for (i = 0; meta_zones[i].mz_size != 0; i++) { if (meta_zones[i].mz_size >= size) break; } @@ -1450,91 +2462,69 @@ getbufzone(size_t size) */ int -allocbuf(bp, size) - struct buf *bp; - int size; +allocbuf(buf_t bp, int size) { vm_size_t desired_size; desired_size = roundup(size, CLBYTES); - if(desired_size < PAGE_SIZE) + if (desired_size < PAGE_SIZE) desired_size = PAGE_SIZE; if (desired_size > MAXBSIZE) panic("allocbuf: buffer larger than MAXBSIZE requested"); if (ISSET(bp->b_flags, B_META)) { - kern_return_t kret; zone_t zprev, z; - size_t nsize = roundup(size, MINMETA); - - if (bp->b_data) { - vm_offset_t elem = (vm_offset_t)bp->b_data; - - if (ISSET(bp->b_flags, B_ZALLOC)) - if (bp->b_bufsize <= MAXMETA) { - if (bp->b_bufsize < nsize) { - /* reallocate to a bigger size */ - - zprev = getbufzone(bp->b_bufsize); - if (nsize <= MAXMETA) { - desired_size = nsize; - z = getbufzone(nsize); - bp->b_data = (caddr_t)zalloc(z); - if(bp->b_data == 0) - panic("allocbuf: zalloc() returned NULL"); - } else { - kret = kmem_alloc(kernel_map, &bp->b_data, desired_size); - if (kret != KERN_SUCCESS) - panic("allocbuf: kmem_alloc() 0 returned %d", kret); - if(bp->b_data == 0) - panic("allocbuf: null b_data 0"); - CLR(bp->b_flags, B_ZALLOC); - } - bcopy((const void *)elem, bp->b_data, bp->b_bufsize); - zfree(zprev, elem); + int nsize = roundup(size, MINMETA); + + if (bp->b_datap) { + vm_offset_t elem = (vm_offset_t)bp->b_datap; + + if (ISSET(bp->b_flags, B_ZALLOC)) { + if (bp->b_bufsize < nsize) { + /* reallocate to a bigger size */ + + zprev = getbufzone(bp->b_bufsize); + if (nsize <= MAXMETA) { + desired_size = nsize; + z = getbufzone(nsize); + bp->b_datap = (uintptr_t)zalloc(z); } else { - desired_size = bp->b_bufsize; + bp->b_datap = (uintptr_t)NULL; + kmem_alloc(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size); + CLR(bp->b_flags, B_ZALLOC); } - } else - panic("allocbuf: B_ZALLOC set incorrectly"); - else - if (bp->b_bufsize < desired_size) { + bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize); + zfree(zprev, (void *)elem); + } else { + desired_size = bp->b_bufsize; + } + + } else { + if ((vm_size_t)bp->b_bufsize < desired_size) { /* reallocate to a bigger size */ - kret = kmem_alloc(kernel_map, &bp->b_data, desired_size); - if (kret != KERN_SUCCESS) - panic("allocbuf: kmem_alloc() returned %d", kret); - if(bp->b_data == 0) - panic("allocbuf: null b_data"); - bcopy((const void *)elem, bp->b_data, bp->b_bufsize); + bp->b_datap = (uintptr_t)NULL; + kmem_alloc(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size); + bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize); kmem_free(kernel_map, elem, bp->b_bufsize); } else { desired_size = bp->b_bufsize; } + } } else { /* new allocation */ if (nsize <= MAXMETA) { desired_size = nsize; z = getbufzone(nsize); - bp->b_data = (caddr_t)zalloc(z); - if(bp->b_data == 0) - panic("allocbuf: 
zalloc() returned NULL 2"); + bp->b_datap = (uintptr_t)zalloc(z); SET(bp->b_flags, B_ZALLOC); - } else { - kret = kmem_alloc(kernel_map, &bp->b_data, desired_size); - if (kret != KERN_SUCCESS) - panic("allocbuf: kmem_alloc() 2 returned %d", kret); - if(bp->b_data == 0) - panic("allocbuf: null b_data 2"); - } + } else + kmem_alloc(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size); } } - - if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0)) - panic("allocbuf: bp->b_data is NULL, buf @ 0x%x", bp); - bp->b_bufsize = desired_size; bp->b_bcount = size; + return (0); } @@ -1554,30 +2544,33 @@ allocbuf(bp, size) * Initialize the fields and disassociate the buffer from the vnode. * Remove the buffer from the hash. Return the buffer and the queue * on which it was found. + * + * buf_mtxp is held upon entry + * returns with buf_mtxp locked */ -static struct buf * -getnewbuf(slpflag, slptimeo, queue) - int slpflag, slptimeo; - int *queue; +static buf_t +getnewbuf(int slpflag, int slptimeo, int * queue) { - register struct buf *bp; - register struct buf *lru_bp; - register struct buf *age_bp; - register struct buf *meta_bp; - register int age_time, lru_time, bp_time, meta_time; - int s; - int req = *queue; /* save it for restarts */ + buf_t bp; + buf_t lru_bp; + buf_t age_bp; + buf_t meta_bp; + int age_time, lru_time, bp_time, meta_time; + int req = *queue; /* save it for restarts */ + struct timespec ts; start: - s = splbio(); - - /* invalid request gets empty queue */ + /* + * invalid request gets empty queue + */ if ((*queue > BQUEUES) || (*queue < 0) || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED)) *queue = BQ_EMPTY; - /* (*queue == BQUEUES) means no preference */ + /* + * (*queue == BQUEUES) means no preference + */ if (*queue != BQUEUES) { /* Try for the requested queue first */ bp = bufqueues[*queue].tqh_first; @@ -1600,10 +2593,13 @@ start: *queue = BQ_EMPTY; goto found; } + lck_mtx_unlock(buf_mtxp); - /* Create a new temparory buffer header */ + /* Create a new temporary buffer header */ bp = (struct buf *)zalloc(buf_hdr_zone); + lck_mtx_lock(buf_mtxp); + if (bp) { bufhdrinit(bp); BLISTNONE(bp); @@ -1614,15 +2610,16 @@ start: buf_hdr_count++; goto found; } - - /* Log this error condition */ - printf("getnewbuf: No useful buffers"); + bufstats.bufs_sleeps++; /* wait for a free buffer of any kind */ needbuffer = 1; - bufstats.bufs_sleeps++; - tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo); - splx(s); + /* hz value is 100 */ + ts.tv_sec = (slptimeo/1000); + /* the hz value is 100; which leads to 10ms */ + ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10; + msleep(&needbuffer, buf_mtxp, slpflag|(PRIBIO+1), (char *)"getnewbuf", &ts); + return (0); } @@ -1638,8 +2635,10 @@ start: bp = age_bp; *queue = BQ_AGE; } else { /* buffer available on both AGE and LRU */ - age_time = time.tv_sec - age_bp->b_timestamp; - lru_time = time.tv_sec - lru_bp->b_timestamp; + int t = buf_timestamp(); + + age_time = t - age_bp->b_timestamp; + lru_time = t - lru_bp->b_timestamp; if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */ bp = age_bp; *queue = BQ_AGE; @@ -1662,8 +2661,10 @@ start: bp = meta_bp; *queue = BQ_META; } else if (meta_bp) { - bp_time = time.tv_sec - bp->b_timestamp; - meta_time = time.tv_sec - meta_bp->b_timestamp; + int t = buf_timestamp(); + + bp_time = t - bp->b_timestamp; + meta_time = t - meta_bp->b_timestamp; if (!(bp_time < 0) && !(meta_time < 0)) { /* time not set backwards */ @@ -1678,138 +2679,256 @@ start: } } } - - if (bp == NULL) - 
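
/*
 * allocbuf()'s meta-data path rounds the request up to a multiple of
 * MINMETA and serves it from the smallest zone that fits, falling back to
 * the page-level allocator (kmem_alloc) above MAXMETA.  A sketch of that
 * size-class choice (the zone table here is assumed, mirroring what
 * getbufzone() searches):
 */
#include <stddef.h>

#define MINMETA	512
#define MAXMETA	8192

static const size_t zone_sizes[] = { 512, 1024, 2048, 4096, 8192, 0 };

/* returns the backing zone size, or 0 meaning "use the page allocator" */
static size_t
pick_zone(size_t size)
{
	size_t nsize = (size + MINMETA - 1) & ~(size_t)(MINMETA - 1);
	int i;

	if (nsize > MAXMETA)
		return (0);
	for (i = 0; zone_sizes[i] != 0; i++)
		if (zone_sizes[i] >= nsize)
			return (zone_sizes[i]);
	return (0);
}
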
panic("getnewbuf: null bp"); - found: - if (ISSET(bp->b_flags, B_LOCKED)) { - panic("getnewbuf: bp @ 0x%x is LOCKED! (flags 0x%x)\n", bp, bp->b_flags); - } - - if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef) - panic("getnewbuf: le_prev is deadbeef, buf @ 0x%x", bp); - - if(ISSET(bp->b_flags, B_BUSY)) - panic("getnewbuf reusing BUSY buf @ 0x%x", bp); + if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY)) + panic("getnewbuf: bp @ 0x%x is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags); /* Clean it */ if (bcleanbuf(bp)) { - /* bawrite() issued, buffer not ready */ - splx(s); + /* + * moved to the laundry thread, buffer not ready + */ *queue = req; goto start; } - splx(s); return (bp); } -#include <mach/mach_types.h> -#include <mach/memory_object_types.h> -#include <kern/sched_prim.h> /* * Clean a buffer. * Returns 0 is buffer is ready to use, - * Returns 1 if issued a bawrite() to indicate + * Returns 1 if issued a buf_bawrite() to indicate * that the buffer is not ready. + * + * buf_mtxp is held upon entry + * returns with buf_mtxp locked */ static int -bcleanbuf(struct buf *bp) +bcleanbuf(buf_t bp) { - int s; - struct ucred *cred; - int hdralloc = 0; + ucred_t cred; - s = splbio(); /* Remove from the queue */ - bremfree(bp); + bremfree_locked(bp); /* Buffer is no longer on free lists. */ - SET(bp->b_flags, B_BUSY); - - /* Check whether the buffer header was "allocated" */ - if (ISSET(bp->b_flags, B_HDRALLOC)) - hdralloc = 1; - - if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef) - panic("bcleanbuf: le_prev is deadbeef"); - + SET(bp->b_lflags, BL_BUSY); +#ifdef JOE_DEBUG + bp->b_owner = current_thread(); + bp->b_tag = 2; +#endif /* * If buffer was a delayed write, start the IO by queuing * it on the LAUNDRY queue, and return 1 */ if (ISSET(bp->b_flags, B_DELWRI)) { - splx(s); binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY); blaundrycnt++; + + lck_mtx_unlock(buf_mtxp); + wakeup(&blaundrycnt); /* and give it a chance to run */ (void)thread_block(THREAD_CONTINUE_NULL); + + lck_mtx_lock(buf_mtxp); return (1); } + bremhash(bp); + + lck_mtx_unlock(buf_mtxp); + + BLISTNONE(bp); + /* + * disassociate us from our vnode, if we had one... + */ + if (bp->b_vp) + brelvp(bp); + + if (ISSET(bp->b_flags, B_META)) { + vm_offset_t elem; + + elem = (vm_offset_t)bp->b_datap; + bp->b_datap = (uintptr_t)0xdeadbeef; + + if (ISSET(bp->b_flags, B_ZALLOC)) { + zone_t z; + + z = getbufzone(bp->b_bufsize); + zfree(z, (void *)elem); + } else + kmem_free(kernel_map, elem, bp->b_bufsize); + } + + trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); + + /* clear out various other fields */ + bp->b_bufsize = 0; + bp->b_datap = (uintptr_t)NULL; + bp->b_upl = (void *)NULL; + /* + * preserve the state of whether this buffer + * was allocated on the fly or not... + * the only other flag that should be set at + * this point is BL_BUSY... 
+ */ +#ifdef JOE_DEBUG + bp->b_owner = current_thread(); + bp->b_tag = 3; +#endif + bp->b_lflags = BL_BUSY; + bp->b_flags = (bp->b_flags & B_HDRALLOC); + bp->b_dev = NODEV; + bp->b_blkno = bp->b_lblkno = 0; + bp->b_iodone = NULL; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_bcount = 0; + bp->b_dirtyoff = bp->b_dirtyend = 0; + bp->b_validoff = bp->b_validend = 0; + + /* nuke any credentials we were holding */ + cred = bp->b_rcred; + if (cred != NOCRED) { + bp->b_rcred = NOCRED; + kauth_cred_rele(cred); + } + cred = bp->b_wcred; + if (cred != NOCRED) { + bp->b_wcred = NOCRED; + kauth_cred_rele(cred); + } + lck_mtx_lock(buf_mtxp); + + return (0); +} + + + +errno_t +buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags) +{ + buf_t bp; + errno_t error; + + lck_mtx_lock(buf_mtxp); +relook: + if ((bp = incore_locked(vp, lblkno)) == (struct buf *)0) { + lck_mtx_unlock(buf_mtxp); + return (0); + } + if (ISSET(bp->b_lflags, BL_BUSY)) { + if ( !ISSET(flags, BUF_WAIT)) { + lck_mtx_unlock(buf_mtxp); + return (EBUSY); + } + SET(bp->b_lflags, BL_WANTED); + + error = msleep((caddr_t)bp, buf_mtxp, (PRIBIO + 1), (char *)"buf_invalblkno", 0); + + if (error) + return (error); + goto relook; + } + bremfree_locked(bp); + SET(bp->b_lflags, BL_BUSY); + SET(bp->b_flags, B_INVAL); +#ifdef JOE_DEBUG + bp->b_owner = current_thread(); + bp->b_tag = 4; +#endif + lck_mtx_unlock(buf_mtxp); + buf_brelse(bp); + + return (0); +} + + +void +buf_drop(buf_t bp) +{ + int need_wakeup = 0; + + lck_mtx_lock(buf_mtxp); + + if (ISSET(bp->b_lflags, BL_WANTED)) { + /* + * delay the actual wakeup until after we + * clear BL_BUSY and we've dropped buf_mtxp + */ + need_wakeup = 1; + } + /* + * Unlock the buffer. + */ + CLR(bp->b_lflags, (BL_BUSY | BL_WANTED)); - if (bp->b_vp) - brelvp(bp); - bremhash(bp); - BLISTNONE(bp); + lck_mtx_unlock(buf_mtxp); - splx(s); + if (need_wakeup) { + /* + * Wake up any proceeses waiting for _this_ buffer to become free. + */ + wakeup(bp); + } +} - if (ISSET(bp->b_flags, B_META)) { - vm_offset_t elem = (vm_offset_t)bp->b_data; - if (elem == 0) - panic("bcleanbuf: NULL bp->b_data B_META buffer"); - if (ISSET(bp->b_flags, B_ZALLOC)) { - if (bp->b_bufsize <= MAXMETA) { - zone_t z; +errno_t +buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) { + errno_t error; - z = getbufzone(bp->b_bufsize); - bp->b_data = (caddr_t)0xdeadbeef; - zfree(z, elem); - CLR(bp->b_flags, B_ZALLOC); - } else - panic("bcleanbuf: B_ZALLOC set incorrectly"); - } else { - bp->b_data = (caddr_t)0xdeadbeef; - kmem_free(kernel_map, elem, bp->b_bufsize); - } - } + lck_mtx_lock(buf_mtxp); - trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); + error = buf_acquire_locked(bp, flags, slpflag, slptimeo); - /* disassociate us from our vnode, if we had one... 
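
/*
 * buf_invalblkno()/buf_drop() above implement the classic BUSY/WANTED
 * handshake: a waiter marks the buffer wanted and sleeps; the owner clears
 * BUSY, drops the mutex, and only then issues the wakeup.  The same shape
 * with pthreads (a hypothetical hbuf type; the kernel additionally has to
 * re-look the buffer up after sleeping, since it can be freed meanwhile):
 */
#include <pthread.h>
#include <stdbool.h>

struct hbuf {
	pthread_mutex_t	lock;		/* plays buf_mtxp */
	pthread_cond_t	cv;		/* plays the msleep/wakeup channel */
	bool		busy;		/* BL_BUSY */
	bool		wanted;		/* BL_WANTED */
};

void
hbuf_acquire(struct hbuf *b)
{
	pthread_mutex_lock(&b->lock);
	while (b->busy) {
		b->wanted = true;
		pthread_cond_wait(&b->cv, &b->lock);
	}
	b->busy = true;
	pthread_mutex_unlock(&b->lock);
}

void
hbuf_drop(struct hbuf *b)
{
	bool need_wakeup;

	pthread_mutex_lock(&b->lock);
	need_wakeup = b->wanted;
	b->busy = b->wanted = false;
	pthread_mutex_unlock(&b->lock);
	if (need_wakeup)		/* wake only after dropping the lock */
		pthread_cond_broadcast(&b->cv);
}
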
*/ - s = splbio(); + lck_mtx_unlock(buf_mtxp); - /* clear out various other fields */ - bp->b_bufsize = 0; - bp->b_data = 0; - bp->b_flags = B_BUSY; - if (hdralloc) - SET(bp->b_flags, B_HDRALLOC); - bp->b_dev = NODEV; - bp->b_blkno = bp->b_lblkno = 0; - bp->b_iodone = 0; - bp->b_error = 0; - bp->b_resid = 0; - bp->b_bcount = 0; - bp->b_dirtyoff = bp->b_dirtyend = 0; - bp->b_validoff = bp->b_validend = 0; + return (error); +} - /* nuke any credentials we were holding */ - cred = bp->b_rcred; - if (cred != NOCRED) { - bp->b_rcred = NOCRED; - crfree(cred); + +static errno_t +buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo) +{ + errno_t error; + struct timespec ts; + + if (ISSET(bp->b_flags, B_LOCKED)) { + if ((flags & BAC_SKIP_LOCKED)) + return (EDEADLK); + } else { + if ((flags & BAC_SKIP_NONLOCKED)) + return (EDEADLK); } - cred = bp->b_wcred; - if (cred != NOCRED) { - bp->b_wcred = NOCRED; - crfree(cred); + if (ISSET(bp->b_lflags, BL_BUSY)) { + /* + * since the mutex_lock may block, the buffer + * may become BUSY, so we need to + * recheck for a NOWAIT request + */ + if (flags & BAC_NOWAIT) + return (EBUSY); + SET(bp->b_lflags, BL_WANTED); + + /* the hz value is 100; which leads to 10ms */ + ts.tv_sec = (slptimeo/100); + ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000; + error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), (char *)"buf_acquire", &ts); + + if (error) + return (error); + return (EAGAIN); } - splx(s); + if (flags & BAC_REMOVE) + bremfree_locked(bp); + SET(bp->b_lflags, BL_BUSY); +#ifdef JOE_DEBUG + bp->b_owner = current_thread(); + bp->b_tag = 5; +#endif return (0); } @@ -1818,16 +2937,15 @@ bcleanbuf(struct buf *bp) * Wait for operations on the buffer to complete. * When they do, extract and return the I/O's error value. */ -int -biowait(bp) - struct buf *bp; +errno_t +buf_biowait(buf_t bp) { - int s; + lck_mtx_lock(buf_mtxp); - s = splbio(); while (!ISSET(bp->b_flags, B_DONE)) - tsleep(bp, PRIBIO + 1, "biowait", 0); - splx(s); + (void) msleep(bp, buf_mtxp, (PRIBIO+1), (char *)"buf_biowait", 0); + + lck_mtx_unlock(buf_mtxp); /* check for interruption of I/O (e.g. via NFS), then errors. */ if (ISSET(bp->b_flags, B_EINTR)) { @@ -1851,102 +2969,138 @@ biowait(bp) * process, invokes a procedure specified in the buffer structure" ] * * In real life, the pagedaemon (or other system processes) wants - * to do async stuff to, and doesn't want the buffer brelse()'d. + * to do async stuff to, and doesn't want the buffer buf_brelse()'d. * (for swap pager, that puts swap buffers on the free lists (!!!), * for the vn device, that puts malloc'd buffers on the free lists!) 
*/ +extern struct timeval priority_IO_timestamp_for_root; +extern int hard_throttle_on_root; + void -biodone(bp) - struct buf *bp; +buf_biodone(buf_t bp) { - boolean_t funnel_state; - struct vnode *vp; - extern struct timeval priority_IO_timestamp_for_root; - extern int hard_throttle_on_root; - - funnel_state = thread_funnel_set(kernel_flock, TRUE); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START, - (int)bp, (int)bp->b_data, bp->b_flags, 0, 0); + (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0); if (ISSET(bp->b_flags, B_DONE)) panic("biodone already"); - SET(bp->b_flags, B_DONE); /* note that it's done */ - /* - * I/O was done, so don't believe - * the DIRTY state from VM anymore - */ - CLR(bp->b_flags, B_WASDIRTY); - if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW)) - vwakeup(bp); /* wake up reader */ - if (kdebug_enable) { - int code = DKIO_DONE; + int code = DKIO_DONE; - if (bp->b_flags & B_READ) - code |= DKIO_READ; - if (bp->b_flags & B_ASYNC) - code |= DKIO_ASYNC; + if (bp->b_flags & B_READ) + code |= DKIO_READ; + if (bp->b_flags & B_ASYNC) + code |= DKIO_ASYNC; - if (bp->b_flags & B_META) - code |= DKIO_META; - else if (bp->b_flags & (B_PGIN | B_PAGEOUT)) - code |= DKIO_PAGING; + if (bp->b_flags & B_META) + code |= DKIO_META; + else if (bp->b_flags & B_PAGEIO) + code |= DKIO_PAGING; - KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, - (unsigned int)bp, (unsigned int)bp->b_vp, - bp->b_resid, bp->b_error, 0); + KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, + (unsigned int)bp, (unsigned int)bp->b_vp, + bp->b_resid, bp->b_error, 0); } - - /* Wakeup the throttled write operations as needed */ - vp = bp->b_vp; - if (vp - && (vp->v_flag & VTHROTTLED) - && (vp->v_numoutput <= (BUFWRITE_THROTTLE / 3))) { - vp->v_flag &= ~VTHROTTLED; - wakeup((caddr_t)&vp->v_numoutput); - } - if ((bp->b_flags & B_PGIN) && (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) { - priority_IO_timestamp_for_root = time; + if ((bp->b_vp != NULLVP) && + ((bp->b_flags & (B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) && + (bp->b_vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) { + microuptime(&priority_IO_timestamp_for_root); hard_throttle_on_root = 0; } - if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */ - void (*iodone_func)(struct buf *) = bp->b_iodone; + /* + * I/O was done, so don't believe + * the DIRTY state from VM anymore + */ + CLR(bp->b_flags, B_WASDIRTY); - CLR(bp->b_flags, B_CALL); /* but note callout done */ + if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW)) + /* + * wake up any writer's blocked + * on throttle or waiting for I/O + * to drain + */ + vnode_writedone(bp->b_vp); + + if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) { /* if necessary, call out */ + void (*iodone_func)(struct buf *, void *) = bp->b_iodone; + void *arg = (void *)bp->b_transaction; + int callout = ISSET(bp->b_flags, B_CALL); + + CLR(bp->b_flags, (B_CALL | B_FILTER)); /* filters and callouts are one-shot */ bp->b_iodone = NULL; + bp->b_transaction = NULL; if (iodone_func == NULL) { panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp); } else { - (*iodone_func)(bp); + if (callout) + SET(bp->b_flags, B_DONE); /* note that it's done */ + (*iodone_func)(bp, arg); } - } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */ - brelse(bp); - else { /* or just wakeup the buffer */ - CLR(bp->b_flags, B_WANTED); - wakeup(bp); + if (callout) + /* + * assumes that the call back function takes + * ownership of the bp and deals with releasing it if necessary + */ + 
		goto biodone_done;
+		/*
+		 * in this case the callback function is acting
+		 * strictly as a filter... it does not take
+		 * ownership of the bp and is expecting us
+		 * to finish cleaning up... this is currently used
+		 * by the HFS journaling code
+		 */
+	}
+	if (ISSET(bp->b_flags, B_ASYNC)) {	/* if async, release it */
+		SET(bp->b_flags, B_DONE);	/* note that it's done */
 
-	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
-		     (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
+		buf_brelse(bp);
+	} else {	/* or just wake up the buffer */
+		/*
+		 * by taking the mutex, we serialize
+		 * the buf owner calling buf_biowait so that we'll
+		 * only see him in one of 2 states...
+		 * state 1: B_DONE wasn't set and he's
+		 * blocked in msleep
+		 * state 2: he's blocked trying to take the
+		 * mutex before looking at B_DONE
+		 * BL_WANTED is cleared in case anyone else
+		 * is blocked waiting for the buffer... note
+		 * that we haven't cleared B_BUSY yet, so if
+		 * they do get to run, they're going to re-set
+		 * BL_WANTED and go back to sleep
+		 */
+		lck_mtx_lock(buf_mtxp);

+		CLR(bp->b_lflags, BL_WANTED);
+		SET(bp->b_flags, B_DONE);	/* note that it's done */
+
+		lck_mtx_unlock(buf_mtxp);
+
+		wakeup(bp);
+	}
+biodone_done:
+	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
+		     (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
 }
 
 /*
  * Return a count of buffers on the "locked" queue.
  */
 int
-count_lock_queue()
+count_lock_queue(void)
 {
-	register struct buf *bp;
-	register int n = 0;
+	buf_t	bp;
+	int	n = 0;
+
+	lck_mtx_lock(buf_mtxp);
 
 	for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
 	    bp = bp->b_freelist.tqe_next)
 		n++;
+	lck_mtx_unlock(buf_mtxp);
+
 	return (n);
 }
 
@@ -1954,13 +3108,13 @@ count_lock_queue()
  * Return a count of 'busy' buffers. Used at the time of shutdown.
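
/*
 * buf_biowait()/buf_biodone() form a completion: a one-shot done flag plus
 * a sleep/wakeup channel.  The mutex taken in buf_biodone() above exists
 * for exactly the reason the flag is set under the lock here: it pins a
 * racing waiter into one of the two observable states the comment lists.
 * A minimal pthread rendition (illustrative, not the kernel code):
 */
#include <pthread.h>
#include <stdbool.h>

struct completion {
	pthread_mutex_t	lock;
	pthread_cond_t	cv;
	bool		done;		/* B_DONE */
};

void
completion_wait(struct completion *c)		/* cf. buf_biowait() */
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cv, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

void
completion_signal(struct completion *c)		/* cf. buf_biodone() */
{
	pthread_mutex_lock(&c->lock);	/* serializes a racing waiter */
	c->done = true;
	pthread_mutex_unlock(&c->lock);
	pthread_cond_broadcast(&c->cv);
}
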
*/ int -count_busy_buffers() +count_busy_buffers(void) { - register struct buf *bp; - register int nbusy = 0; + buf_t bp; + int nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) - if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY) + if (!ISSET(bp->b_flags, B_INVAL) && ISSET(bp->b_lflags, BL_BUSY)) nbusy++; return (nbusy); } @@ -1974,107 +3128,350 @@ count_busy_buffers() void vfs_bufstats() { - int s, i, j, count; - register struct buf *bp; - register struct bqueues *dp; - int counts[MAXBSIZE/CLBYTES+1]; - static char *bname[BQUEUES] = - { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" }; + int i, j, count; + register struct buf *bp; + register struct bqueues *dp; + int counts[MAXBSIZE/CLBYTES+1]; + static char *bname[BQUEUES] = + { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" }; + + for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) { + count = 0; + for (j = 0; j <= MAXBSIZE/CLBYTES; j++) + counts[j] = 0; + + lck_mtx_lock(buf_mtxp); + + for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) { + counts[bp->b_bufsize/CLBYTES]++; + count++; + } + lck_mtx_unlock(buf_mtxp); + + printf("%s: total-%d", bname[i], count); + for (j = 0; j <= MAXBSIZE/CLBYTES; j++) + if (counts[j] != 0) + printf(", %d-%d", j * CLBYTES, counts[j]); + printf("\n"); + } +} +#endif /* DIAGNOSTIC */ + +#define NRESERVEDIOBUFS 64 + + +buf_t +alloc_io_buf(vnode_t vp, int priv) +{ + buf_t bp; + + lck_mtx_lock(iobuffer_mtxp); + + while (((niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) || + (bp = iobufqueue.tqh_first) == NULL) { + bufstats.bufs_iobufsleeps++; + + need_iobuffer = 1; + (void) msleep(&need_iobuffer, iobuffer_mtxp, (PRIBIO+1), (const char *)"alloc_io_buf", 0); + } + TAILQ_REMOVE(&iobufqueue, bp, b_freelist); + + bufstats.bufs_iobufinuse++; + if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax) + bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse; + + lck_mtx_unlock(iobuffer_mtxp); + + /* + * initialize various fields + * we don't need to hold the mutex since the buffer + * is now private... the vp should have a reference + * on it and is not protected by this mutex in any event + */ + bp->b_timestamp = 0; + bp->b_proc = NULL; + + bp->b_datap = 0; + bp->b_flags = 0; + bp->b_lflags = BL_BUSY | BL_IOBUF; + bp->b_blkno = bp->b_lblkno = 0; +#ifdef JOE_DEBUG + bp->b_owner = current_thread(); + bp->b_tag = 6; +#endif + bp->b_iodone = NULL; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_bcount = 0; + bp->b_bufsize = 0; + bp->b_upl = NULL; + bp->b_vp = vp; + + if (vp && (vp->v_type == VBLK || vp->v_type == VCHR)) + bp->b_dev = vp->v_rdev; + else + bp->b_dev = NODEV; + + return (bp); +} + + +void +free_io_buf(buf_t bp) +{ + int need_wakeup = 0; + + /* + * put buffer back on the head of the iobufqueue + */ + bp->b_vp = NULL; + bp->b_flags = B_INVAL; + + lck_mtx_lock(iobuffer_mtxp); + + binsheadfree(bp, &iobufqueue, -1); + + if (need_iobuffer) { + /* + * Wake up any processes waiting because they need an io buffer + * + * do the wakeup after we drop the mutex... it's possible that the + * wakeup will be superfluous if need_iobuffer gets set again and + * another thread runs this path, but it's highly unlikely, doesn't + * hurt, and it means we don't hold up I/O progress if the wakeup blocks + * trying to grab a task related lock... 
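
/*
 * alloc_io_buf() above holds back NRESERVEDIOBUFS headers: unprivileged
 * callers sleep rather than dip into the reserve, which keeps a small pool
 * available for critical I/O (e.g. the paging path) even under pressure.
 * The gate, restated (counters mirror bufstats; the function name is
 * invented):
 */
#define NRESERVEDIOBUFS	64

static int niobuf = 1024;	/* total I/O buffer headers, set at boot */
static int iobufinuse;		/* cf. bufstats.bufs_iobufinuse */

static int
may_allocate_iobuf(int priv)
{
	if (!priv && (niobuf - NRESERVEDIOBUFS < iobufinuse))
		return (0);	/* would eat the reserve: sleep instead */
	return (1);		/* priv callers wait only for an empty queue */
}
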
+ */ + need_iobuffer = 0; + need_wakeup = 1; + } + bufstats.bufs_iobufinuse--; + + lck_mtx_unlock(iobuffer_mtxp); + + if (need_wakeup) + wakeup(&need_iobuffer); +} + + + +/* + * If getnewbuf() calls bcleanbuf() on the same thread + * there is a potential for stack overrun and deadlocks. + * So we always handoff the work to a worker thread for completion + */ +#include <mach/mach_types.h> +#include <mach/memory_object_types.h> +#include <kern/sched_prim.h> + + +static void +bcleanbuf_thread_init(void) +{ + /* create worker thread */ + kernel_thread(kernel_task, bcleanbuf_thread); +} + +static void +bcleanbuf_thread(void) +{ + struct buf *bp; + int error = 0; + int loopcnt = 0; + + for (;;) { + lck_mtx_lock(buf_mtxp); + + while (blaundrycnt == 0) + (void)msleep((void *)&blaundrycnt, buf_mtxp, PRIBIO, "blaundry", 0); + + bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]); + /* + * Remove from the queue + */ + bremfree_locked(bp); + blaundrycnt--; + + lck_mtx_unlock(buf_mtxp); + /* + * do the IO + */ + error = bawrite_internal(bp, 0); + + if (error) { + lck_mtx_lock(buf_mtxp); + + binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY); + blaundrycnt++; + + lck_mtx_unlock(buf_mtxp); + + if (loopcnt > 10) { + (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1); + loopcnt = 0; + } else { + (void)thread_block(THREAD_CONTINUE_NULL); + loopcnt++; + } + } + } +} + + +static int +brecover_data(buf_t bp) +{ + int upl_offset; + upl_t upl; + upl_page_info_t *pl; + kern_return_t kret; + vnode_t vp = bp->b_vp; + int upl_flags; + + + if ( !UBCINFOEXISTS(vp) || bp->b_bufsize == 0) + goto dump_buffer; + + upl_flags = UPL_PRECIOUS; + if (! (buf_flags(bp) & B_READ)) { + /* + * "write" operation: let the UPL subsystem know + * that we intend to modify the buffer cache pages we're + * gathering. + */ + upl_flags |= UPL_WILL_MODIFY; + } + + kret = ubc_create_upl(vp, + ubc_blktooff(vp, bp->b_lblkno), + bp->b_bufsize, + &upl, + &pl, + upl_flags); + if (kret != KERN_SUCCESS) + panic("Failed to create UPL"); + + for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) { + + if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) { + ubc_upl_abort(upl, 0); + goto dump_buffer; + } + } + bp->b_upl = upl; + + kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap)); + + if (kret != KERN_SUCCESS) + panic("getblk: ubc_upl_map() failed with (%d)", kret); + return (1); + +dump_buffer: + bp->b_bufsize = 0; + SET(bp->b_flags, B_INVAL); + buf_brelse(bp); + + return(0); +} + + + +/* + * disabled for now + */ + +#if FLUSH_QUEUES + +#define NFLUSH 32 + +static int +bp_cmp(void *a, void *b) +{ + buf_t *bp_a = *(buf_t **)a, + *bp_b = *(buf_t **)b; + daddr64_t res; - for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) { - count = 0; - for (j = 0; j <= MAXBSIZE/CLBYTES; j++) - counts[j] = 0; - s = splbio(); - for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) { - counts[bp->b_bufsize/CLBYTES]++; - count++; - } - splx(s); - printf("%s: total-%d", bname[i], count); - for (j = 0; j <= MAXBSIZE/CLBYTES; j++) - if (counts[j] != 0) - printf(", %d-%d", j * CLBYTES, counts[j]); - printf("\n"); - } + // don't have to worry about negative block + // numbers so this is ok to do. 
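The deferred wakeup above (latching need_wakeup while holding the mutex, then calling wakeup() only after dropping it) is the same discipline buf_biodone() uses earlier in this patch. For the laundry thread, the producer half of the handshake lives in bcleanbuf() elsewhere in this file; a sketch of its shape under the same locking assumptions (not the verbatim code):

static void
laundry_enqueue_sketch(buf_t bp)
{
        lck_mtx_lock(buf_mtxp);

        binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
        blaundrycnt++;

        lck_mtx_unlock(buf_mtxp);

        /* matches the msleep on &blaundrycnt in bcleanbuf_thread() */
        wakeup(&blaundrycnt);
}
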
+ // + res = (bp_a->b_blkno - bp_b->b_blkno); + + return (int)res; } -#endif /* DIAGNOSTIC */ -#define NRESERVEDIOBUFS 64 -__private_extern__ struct buf * -alloc_io_buf(vp, priv) - struct vnode *vp; - int priv; +int +bflushq(int whichq, mount_t mp) { - register struct buf *bp; - int s; + buf_t bp, next; + int i, buf_count; + int total_writes = 0; + static buf_t flush_table[NFLUSH]; - s = splbio(); - - while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) { - need_iobuffer = 1; - bufstats.bufs_iobufsleeps++; - (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0); + if (whichq < 0 || whichq >= BQUEUES) { + return (0); } - while ((bp = iobufqueue.tqh_first) == NULL) { - need_iobuffer = 1; - bufstats.bufs_iobufsleeps++; - (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0); - } + restart: + lck_mtx_lock(buf_mtxp); - TAILQ_REMOVE(&iobufqueue, bp, b_freelist); - bp->b_timestamp = 0; + bp = TAILQ_FIRST(&bufqueues[whichq]); - /* clear out various fields */ - bp->b_flags = B_BUSY; - bp->b_blkno = bp->b_lblkno = 0; + for (buf_count = 0; bp; bp = next) { + next = bp->b_freelist.tqe_next; + + if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) { + continue; + } - bp->b_iodone = 0; - bp->b_error = 0; - bp->b_resid = 0; - bp->b_bcount = 0; - bp->b_bufsize = 0; - bp->b_vp = vp; + if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) { - if (vp->v_type == VBLK || vp->v_type == VCHR) - bp->b_dev = vp->v_rdev; - else - bp->b_dev = NODEV; - bufstats.bufs_iobufinuse++; - if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax) - bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse; - splx(s); + bremfree_locked(bp); +#ifdef JOE_DEBUG + bp->b_owner = current_thread(); + bp->b_tag = 7; +#endif + SET(bp->b_lflags, BL_BUSY); + flush_table[buf_count] = bp; + buf_count++; + total_writes++; - return (bp); -} + if (buf_count >= NFLUSH) { + lck_mtx_unlock(buf_mtxp); -__private_extern__ void -free_io_buf(bp) - struct buf *bp; -{ - int s; + qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp); - s = splbio(); - /* put buffer back on the head of the iobufqueue */ - bp->b_vp = NULL; - bp->b_flags = B_INVAL; + for (i = 0; i < buf_count; i++) { + buf_bawrite(flush_table[i]); + } + goto restart; + } + } + } + lck_mtx_unlock(buf_mtxp); - binsheadfree(bp, &iobufqueue, -1); + if (buf_count > 0) { + qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp); - /* Wake up any processes waiting for any buffer to become free. 
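Two small things are worth flagging in bp_cmp() above: since buf_t is already a pointer type, the declarations read more naturally as buf_t bp_a = *(buf_t *)a, and casting the daddr64_t difference down to int is only safe while block numbers stay below 2^31. A hypothetical overflow-proof variant compares instead of subtracting:

static int
bp_cmp_safe(void *a, void *b)
{
        buf_t bp_a = *(buf_t *)a;       /* buf_t is struct buf *, so one level of indirection */
        buf_t bp_b = *(buf_t *)b;

        /*
         * comparing directly avoids truncating a 64-bit
         * difference to an int, which could misorder the
         * flush_table once block numbers exceed 31 bits
         */
        if (bp_a->b_blkno < bp_b->b_blkno)
                return (-1);
        if (bp_a->b_blkno > bp_b->b_blkno)
                return (1);
        return (0);
}
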
*/ - if (need_iobuffer) { - need_iobuffer = 0; - wakeup(&need_iobuffer); + for (i = 0; i < buf_count; i++) { + buf_bawrite(flush_table[i]); + } } - bufstats.bufs_iobufinuse--; - splx(s); + + return (total_writes); } +#endif -/* disabled for now */ + + +#if BALANCE_QUEUES /* XXX move this to a separate file */ + +/* + * NOTE: THIS CODE HAS NOT BEEN UPDATED + * WITH RESPECT TO THE NEW LOCKING MODEL + */ + + /* * Dynamic Scaling of the Buffer Queues */ @@ -2170,6 +3567,27 @@ static __inline__ int initbufqscan(void); static __inline__ int nextbufq(int q); static void buqlimprt(int all); + +static __inline__ void +bufqinc(int q) +{ + if ((q < 0) || (q >= BQUEUES)) + return; + + bufqlim[q].bl_num++; + return; +} + +static __inline__ void +bufqdec(int q) +{ + if ((q < 0) || (q >= BQUEUES)) + return; + + bufqlim[q].bl_num--; + return; +} + static void bufq_balance_thread_init() { @@ -2236,11 +3654,8 @@ bufq_balance_thread_init() static void bufqscan_thread() { - boolean_t funnel_state; int moretodo = 0; - funnel_state = thread_funnel_set(kernel_flock, TRUE); - for(;;) { do { int q; /* buffer queue to process */ @@ -2259,8 +3674,6 @@ bufqscan_thread() (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz); moretodo = 0; } - - (void) thread_funnel_set(kernel_flock, FALSE); } /* Seed for the buffer queue balancing */ @@ -2288,7 +3701,7 @@ balancebufq(int q) { int moretodo = 0; int s = splbio(); - int n; + int n, t; /* reject invalid q */ if ((q < 0) || (q >= BQUEUES)) @@ -2314,6 +3727,8 @@ balancebufq(int q) moretodo |= btrimempty(n); goto out; } + + t = buf_timestamp(); for (; n > 0; n--) { struct buf *bp = bufqueues[q].tqh_first; @@ -2321,14 +3736,14 @@ balancebufq(int q) break; /* check if it's stale */ - if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) { + if ((t - bp->b_timestamp) > bufqlim[q].bl_stale) { if (bcleanbuf(bp)) { - /* bawrite() issued, bp not ready */ + /* buf_bawrite() issued, bp not ready */ moretodo = 1; } else { /* release the cleaned buffer to BQ_EMPTY */ SET(bp->b_flags, B_INVAL); - brelse(bp); + buf_brelse(bp); } } else break; @@ -2350,26 +3765,6 @@ btrimempty(int n) return (0); } -static __inline__ void -bufqinc(int q) -{ - if ((q < 0) || (q >= BQUEUES)) - return; - - bufqlim[q].bl_num++; - return; -} - -static __inline__ void -bufqdec(int q) -{ - if ((q < 0) || (q >= BQUEUES)) - return; - - bufqlim[q].bl_num--; - return; -} - static void buqlimprt(int all) { @@ -2393,183 +3788,6 @@ buqlimprt(int all) } } -/* - * If the getnewbuf() calls bcleanbuf() on the same thread - * there is a potential for stack overrun and deadlocks.
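Note that the balancebufq() hunk above also switches the staleness test from wall-clock time.tv_sec to a single buf_timestamp() sample taken before the scan, so every buffer in a pass is aged against the same "now". Restated in isolation (a sketch of the logic above, assuming bl_stale is in the same seconds units buf_timestamp() returns):

static int
buffer_is_stale_sketch(buf_t bp, int q)
{
        int t = buf_timestamp();        /* one consistent "now" for the whole scan */

        return ((t - bp->b_timestamp) > bufqlim[q].bl_stale);
}
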
- * So we always handoff the work to worker thread for completion - */ - -static void -bcleanbuf_thread_init() -{ - static void bcleanbuf_thread(); - - /* create worker thread */ - kernel_thread(kernel_task, bcleanbuf_thread); -} - -static void -bcleanbuf_thread() -{ - boolean_t funnel_state; - struct buf *bp; - int error = 0; - int loopcnt = 0; - - funnel_state = thread_funnel_set(kernel_flock, TRUE); - -doit: - while (blaundrycnt == 0) - (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz); - bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]); - /* Remove from the queue */ - bremfree(bp); - blaundrycnt--; - - /* do the IO */ - error = bawrite_internal(bp, 0); - if (error) { - binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY); - blaundrycnt++; - if (loopcnt > 10) { - (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1); - loopcnt = 0; - } else { - (void)thread_block(THREAD_CONTINUE_NULL); - loopcnt++; - } - } - /* start again */ - goto doit; - - (void) thread_funnel_set(kernel_flock, funnel_state); -} - - -static int -brecover_data(struct buf *bp) -{ - upl_t upl; - upl_page_info_t *pl; - int upl_offset; - kern_return_t kret; - struct vnode *vp = bp->b_vp; - - if (vp->v_tag == VT_NFS) - /* - * NFS currently deals with this case - * in a slightly different manner... - * continue to let it do so - */ - return(1); - - if (!UBCISVALID(vp) || bp->b_bufsize == 0) - goto dump_buffer; - - kret = ubc_create_upl(vp, - ubc_blktooff(vp, bp->b_lblkno), - bp->b_bufsize, - &upl, - &pl, - UPL_PRECIOUS); - if (kret != KERN_SUCCESS) - panic("Failed to get pagelists"); - - for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) { - - if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) { - ubc_upl_abort(upl, 0); - goto dump_buffer; - } - } - SET(bp->b_flags, B_PAGELIST); - bp->b_pagelist = upl; - - kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data)); - if (kret != KERN_SUCCESS) - panic("getblk: ubc_upl_map() failed with (%d)", kret); - if (bp->b_data == 0) - panic("ubc_upl_map mapped 0"); - - return (1); - -dump_buffer: - bp->b_bufsize = 0; - SET(bp->b_flags, B_INVAL); - brelse(bp); - - return(0); -} - - -static int -bp_cmp(void *a, void *b) -{ - struct buf *bp_a = *(struct buf **)a, - *bp_b = *(struct buf **)b; - daddr_t res; - - // don't have to worry about negative block - // numbers so this is ok to do. 
- // - res = (bp_a->b_blkno - bp_b->b_blkno); - - return (int)res; -} - -#define NFLUSH 32 - -int -bflushq(int whichq, struct mount *mp) -{ - struct buf *bp, *next; - int i, buf_count, s; - int counter=0, total_writes=0; - static struct buf *flush_table[NFLUSH]; - - if (whichq < 0 || whichq >= BQUEUES) { - return; - } - - - restart: - bp = TAILQ_FIRST(&bufqueues[whichq]); - for(buf_count=0; bp; bp=next) { - next = bp->b_freelist.tqe_next; - - if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) { - continue; - } - - if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_BUSY) == 0) { - if (whichq != BQ_LOCKED && (bp->b_flags & B_LOCKED)) { - panic("bflushq: bp @ 0x%x is locked!\n", bp); - } - - bremfree(bp); - bp->b_flags |= B_BUSY; - flush_table[buf_count] = bp; - buf_count++; - total_writes++; - - if (buf_count >= NFLUSH) { - qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp); - - for(i=0; i < buf_count; i++) { - bawrite(flush_table[i]); - } - - goto restart; - } - } - } +#endif - if (buf_count > 0) { - qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp); - for(i=0; i < buf_count; i++) { - bawrite(flush_table[i]); - } - } - return total_writes; -} diff --git a/bsd/vfs/vfs_cache.c b/bsd/vfs/vfs_cache.c index 85a9f2c7c..8cb282de6 100644 --- a/bsd/vfs/vfs_cache.c +++ b/bsd/vfs/vfs_cache.c @@ -61,11 +61,13 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/time.h> -#include <sys/mount.h> -#include <sys/vnode.h> +#include <sys/mount_internal.h> +#include <sys/vnode_internal.h> #include <sys/namei.h> #include <sys/errno.h> #include <sys/malloc.h> +#include <sys/kauth.h> +#include <sys/user.h> /* * Name caching works as follows: @@ -79,10 +81,6 @@ * If it is a "negative" entry, (i.e. for a name that is known NOT to * exist) the vnode pointer will be NULL. * - * For simplicity (and economy of storage), names longer than - * a maximum length of NCHNAMLEN are not cached; they occur - * infrequently in any case, and are almost never of interest. - * * Upon reaching the last segment of a path, if the reference * is for DELETE, or NOCACHE is set (rewrite), and the * name is located in the cache, it will be dropped. @@ -91,59 +89,704 @@ /* * Structures associated with name cacheing. 
*/ -#define NCHHASH(dvp, hash_val) \ - (&nchashtbl[((u_long)(dvp) ^ ((dvp)->v_id ^ (hash_val))) & nchash]) + LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ +u_long nchashmask; u_long nchash; /* size of hash table - 1 */ long numcache; /* number of cache entries allocated */ -TAILQ_HEAD(, namecache) nclruhead; /* LRU chain */ +int desiredNodes; +int desiredNegNodes; +TAILQ_HEAD(, namecache) nchead; /* chain of all name cache entries */ +TAILQ_HEAD(, namecache) neghead; /* chain of only negative cache entries */ struct nchstats nchstats; /* cache effectiveness statistics */ -u_long nextvnodeid = 0; -int doingcache = 1; /* 1 => enable the cache */ + +/* vars for name cache list lock */ +lck_grp_t * namecache_lck_grp; +lck_grp_attr_t * namecache_lck_grp_attr; +lck_attr_t * namecache_lck_attr; +lck_mtx_t * namecache_mtx_lock; + +static vnode_t cache_lookup_locked(vnode_t dvp, struct componentname *cnp); +static int remove_name_locked(const char *); +static char *add_name_locked(const char *, size_t, u_int, u_int); +static void init_string_table(void); +static void cache_delete(struct namecache *, int); +static void dump_string_table(void); + +static void init_crc32(void); +static unsigned int crc32tab[256]; + + +#define NCHHASH(dvp, hash_val) \ + (&nchashtbl[(dvp->v_id ^ (hash_val)) & nchashmask]) + + + +// +// This function builds the path to a filename in "buff". The +// length of the buffer *INCLUDING* the trailing zero byte is +// returned in outlen. NOTE: the length includes the trailing +// zero byte and thus the length is one greater than what strlen +// would return. This is important and lots of code elsewhere +// in the kernel assumes this behavior. +// +int +build_path(vnode_t first_vp, char *buff, int buflen, int *outlen) +{ + vnode_t vp = first_vp; + char *end, *str; + int len, ret=0, counter=0; + + end = &buff[buflen-1]; + *end = '\0'; + + /* + * if this is the root dir of a file system... + */ + if (vp && (vp->v_flag & VROOT) && vp->v_mount) { + /* + * then if it's the root fs, just put in a '/' and get out of here + */ + if (vp->v_mount->mnt_flag & MNT_ROOTFS) { + *--end = '/'; + goto out; + } else { + /* + * else just use the covered vnode to get the mount path + */ + vp = vp->v_mount->mnt_vnodecovered; + } + } + name_cache_lock(); + + while (vp && vp->v_parent != vp) { + /* + * the maximum depth of a file system hierarchy is MAXPATHLEN/2 + * (with single-char names separated by slashes). we panic if + * we've ever looped more than that. + */ + if (counter++ > MAXPATHLEN/2) { + panic("build_path: vnode parent chain is too long! vp 0x%x\n", vp); + } + str = vp->v_name; + + if (str == NULL) { + if (vp->v_parent != NULL) { + ret = EINVAL; + } + break; + } + len = strlen(str); + + /* + * check that there's enough space (make sure to include space for the '/') + */ + if ((end - buff) < (len + 1)) { + ret = ENOSPC; + break; + } + /* + * copy it backwards + */ + str += len; + + for (; len > 0; len--) { + *--end = *--str; + } + /* + * put in the path separator + */ + *--end = '/'; + + /* + * walk up the chain (as long as we're not the root) + */ + if (vp == first_vp && (vp->v_flag & VROOT)) { + if (vp->v_mount && vp->v_mount->mnt_vnodecovered) { + vp = vp->v_mount->mnt_vnodecovered->v_parent; + } else { + vp = NULLVP; + } + } else { + vp = vp->v_parent; + } + /* + * check if we're crossing a mount point and + * switch the vp if we are. 
+ */ + if (vp && (vp->v_flag & VROOT) && vp->v_mount) { + vp = vp->v_mount->mnt_vnodecovered; + } + } + name_cache_unlock(); +out: + /* + * slide it down to the beginning of the buffer + */ + memmove(buff, end, &buff[buflen] - end); + + *outlen = &buff[buflen] - end; // length includes the trailing zero byte + + return ret; +} + /* - * Delete an entry from its hash list and move it to the front - * of the LRU list for immediate reuse. - * - * NOTE: THESE MACROS CAN BLOCK (in the call to remove_name()) - * SO BE CAREFUL IF YOU HOLD POINTERS TO nclruhead OR - * nchashtbl. + * return NULLVP if vp's parent doesn't + * exist, or we can't get a valid iocount + * else return the parent of vp */ -#if DIAGNOSTIC -#define PURGE(ncp) { \ - if (ncp->nc_hash.le_prev == 0) \ - panic("namecache purge le_prev"); \ - if (ncp->nc_hash.le_next == ncp) \ - panic("namecache purge le_next"); \ - LIST_REMOVE(ncp, nc_hash); \ - ncp->nc_hash.le_prev = 0; \ - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); \ - TAILQ_INSERT_HEAD(&nclruhead, ncp, nc_lru); \ - /* this has to come last because it could block */ \ - remove_name(ncp->nc_name); \ - ncp->nc_name = NULL; \ +vnode_t +vnode_getparent(vnode_t vp) +{ + vnode_t pvp = NULLVP; + int pvid; + + name_cache_lock(); + /* + * v_parent is stable behind the name_cache lock + * however, the only thing we can really guarantee + * is that we've grabbed a valid iocount on the + * parent of 'vp' at the time we took the name_cache lock... + * once we drop the lock, vp could get re-parented + */ + if ( (pvp = vp->v_parent) != NULLVP ) { + pvid = pvp->v_id; + + name_cache_unlock(); + + if (vnode_getwithvid(pvp, pvid) != 0) + pvp = NULL; + } else + name_cache_unlock(); + + return (pvp); +} + +char * +vnode_getname(vnode_t vp) +{ + char *name = NULL; + + name_cache_lock(); + + if (vp->v_name) + name = add_name_locked(vp->v_name, strlen(vp->v_name), 0, 0); + name_cache_unlock(); + + return (name); } -#else -#define PURGE(ncp) { \ - LIST_REMOVE(ncp, nc_hash); \ - ncp->nc_hash.le_prev = 0; \ - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); \ - TAILQ_INSERT_HEAD(&nclruhead, ncp, nc_lru); \ - /* this has to come last because it could block */ \ - remove_name(ncp->nc_name); \ - ncp->nc_name = NULL; \ + +void +vnode_putname(char *name) +{ + name_cache_lock(); + + remove_name_locked(name); + + name_cache_unlock(); +} + + +/* + * if VNODE_UPDATE_PARENT, and we can take + * a reference on dvp, then update vp with + * it's new parent... if vp already has a parent, + * then drop the reference vp held on it + * + * if VNODE_UPDATE_NAME, + * then drop string ref on v_name if it exists, and if name is non-NULL + * then pick up a string reference on name and record it in v_name... 
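Both accessors above hand back referenced objects: vnode_getname() takes a reference on the interned name string and vnode_getparent() returns the parent holding an iocount. A hypothetical caller has to pair each get with the matching put:

static void
log_vnode_identity_sketch(vnode_t vp)
{
        char    *name = vnode_getname(vp);      /* string ref, or NULL */
        vnode_t  pvp  = vnode_getparent(vp);    /* parent with iocount, or NULLVP */

        printf("vp %p name %s parent %p\n",
            vp, name ? name : "(unnamed)", pvp);

        if (name)
                vnode_putname(name);            /* drop the string reference */
        if (pvp != NULLVP)
                vnode_put(pvp);                 /* drop the iocount from vnode_getparent */
}
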
+ * optionally pass in the length and hashval of name if known + * + * if VNODE_UPDATE_CACHE, flush the name cache entries associated with vp + */ +void +vnode_update_identity(vnode_t vp, vnode_t dvp, char *name, int name_len, int name_hashval, int flags) +{ + struct namecache *ncp; + vnode_t old_parentvp = NULLVP; + + + if (flags & VNODE_UPDATE_PARENT) { + if (dvp && vnode_ref(dvp) != 0) + dvp = NULLVP; + } else + dvp = NULLVP; + name_cache_lock(); + + if ( (flags & VNODE_UPDATE_NAME) && (name != vp->v_name) ) { + if (vp->v_name != NULL) { + remove_name_locked(vp->v_name); + vp->v_name = NULL; + } + if (name && *name) { + if (name_len == 0) + name_len = strlen(name); + vp->v_name = add_name_locked(name, name_len, name_hashval, 0); + } + } + if (flags & VNODE_UPDATE_PARENT) { + if (dvp != vp && dvp != vp->v_parent) { + old_parentvp = vp->v_parent; + vp->v_parent = dvp; + dvp = NULLVP; + + if (old_parentvp) + flags |= VNODE_UPDATE_CACHE; + } + } + if (flags & VNODE_UPDATE_CACHE) { + while ( (ncp = LIST_FIRST(&vp->v_nclinks)) ) + cache_delete(ncp, 1); + } + name_cache_unlock(); + + if (dvp != NULLVP) + vnode_rele(dvp); + + if (old_parentvp) { + struct uthread *ut; + + ut = get_bsdthread_info(current_thread()); + + /* + * indicate to vnode_rele that it shouldn't do a + * vnode_reclaim at this time... instead it will + * chain the vnode to the uu_vreclaims list... + * we'll be responsible for calling vnode_reclaim + * on each of the vnodes in this list... + */ + ut->uu_defer_reclaims = 1; + ut->uu_vreclaims = NULLVP; + + while ( (vp = old_parentvp) != NULLVP ) { + + vnode_lock(vp); + + vnode_rele_internal(vp, 0, 0, 1); + + /* + * check to see if the vnode is now in the state + * that would have triggered a vnode_reclaim in vnode_rele + * if it is, we save its parent pointer and then NULL + * out the v_parent field... we'll drop the reference + * that was held on the next iteration of this loop... + * this short circuits a potential deep recursion if we + * have a long chain of parents in this state... + * we'll sit in this loop until we run into + * a parent in this chain that is not in this state + * + * make our check and the vnode_rele atomic + * with respect to the current vnode we're working on + * by holding the vnode lock + * if vnode_rele deferred the vnode_reclaim and has put + * this vnode on the list to be reaped by us, then + * it has left this vnode with an iocount == 1 + */ + if ( (vp->v_iocount == 1) && (vp->v_usecount == 0) && + ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)) { + /* + * vnode_rele wanted to do a vnode_reclaim on this vnode + * it should be sitting on the head of the uu_vreclaims chain + * pull the parent pointer now so that when we do the + * vnode_reclaim for each of the vnodes in the uu_vreclaims + * list, we won't recurse back through here + */ + name_cache_lock(); + old_parentvp = vp->v_parent; + vp->v_parent = NULLVP; + name_cache_unlock(); + } else { + /* + * we're done... we ran into a vnode that isn't + * being terminated + */ + old_parentvp = NULLVP; + } + vnode_unlock(vp); + } + ut->uu_defer_reclaims = 0; + + while ( (vp = ut->uu_vreclaims) != NULLVP) { + ut->uu_vreclaims = vp->v_defer_reclaimlist; + + /* + * vnode_put will drive the vnode_reclaim if + * we are still the only reference on this vnode + */ + vnode_put(vp); + } + } } + /* - * Move an entry that has been used to the tail of the LRU list - * so that it will be preserved for future use. + * Mark a vnode as having multiple hard links.
HFS makes use of this + * because it keeps track of each link separately, and wants to know + * which link was actually used. + * + * This will cause the name cache to force a VNOP_LOOKUP on the vnode + * so that HFS can post-process the lookup. Also, volfs will call + * VNOP_GETATTR2 to determine the parent, instead of using v_parent. */ -#define TOUCH(ncp) { \ - if (ncp->nc_lru.tqe_next != 0) { \ - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); \ - TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); \ - } \ +void vnode_set_hard_link(vnode_t vp) +{ + vnode_lock(vp); + + /* + * In theory, we're changing the vnode's identity as far as the + * name cache is concerned, so we ought to grab the name cache lock + * here. However, there is already a race, and grabbing the name + * cache lock only makes the race window slightly smaller. + * + * The race happens because the vnode already exists in the name + * cache, and could be found by one thread before another thread + * can set the hard link flag. + */ + + vp->v_flag |= VISHARDLINK; + + vnode_unlock(vp); +} + + +void vnode_uncache_credentials(vnode_t vp) +{ + kauth_cred_t ucred = NULL; + + if (vp->v_cred) { + vnode_lock(vp); + + ucred = vp->v_cred; + vp->v_cred = NULL; + + vnode_unlock(vp); + + if (ucred) + kauth_cred_rele(ucred); + } +} + + +void vnode_cache_credentials(vnode_t vp, vfs_context_t context) +{ + kauth_cred_t ucred; + kauth_cred_t tcred = NOCRED; + struct timeval tv; + + ucred = vfs_context_ucred(context); + + if (vp->v_cred != ucred || (vp->v_mount->mnt_kern_flag & MNTK_AUTH_OPAQUE)) { + vnode_lock(vp); + + microuptime(&tv); + vp->v_cred_timestamp = tv.tv_sec; + + if (vp->v_cred != ucred) { + kauth_cred_ref(ucred); + + tcred = vp->v_cred; + vp->v_cred = ucred; + } + vnode_unlock(vp); + + if (tcred) + kauth_cred_rele(tcred); + } +} + +/* reverse_lookup - lookup by walking back up the parent chain while leveraging + * use of the name cache lock in order to protect our starting vnode. + * NOTE - assumes you already have search access to starting point. + * returns 0 when we have reached the root, current working dir, or chroot root + * + */ +int +reverse_lookup(vnode_t start_vp, vnode_t *lookup_vpp, struct filedesc *fdp, vfs_context_t context, int *dp_authorized) +{ + int vid, done = 0; + int auth_opaque = 0; + vnode_t dp = start_vp; + vnode_t vp = NULLVP; + kauth_cred_t ucred; + struct timeval tv; + + ucred = vfs_context_ucred(context); + *lookup_vpp = start_vp; + + name_cache_lock(); + + if ( dp->v_mount && (dp->v_mount->mnt_kern_flag & MNTK_AUTH_OPAQUE) ) { + auth_opaque = 1; + microuptime(&tv); + } + for (;;) { + *dp_authorized = 0; + + if (auth_opaque && ((tv.tv_sec - dp->v_cred_timestamp) > VCRED_EXPIRED)) + break; + if (dp->v_cred != ucred) + break; + /* + * indicate that we're allowed to traverse this directory... + * even if we bail for some reason, this information is valid and is used + * to avoid doing a vnode_authorize + */ + *dp_authorized = 1; + + if ((dp->v_flag & VROOT) != 0 || /* Hit "/" */ + (dp == fdp->fd_cdir) || /* Hit process's working directory */ + (dp == fdp->fd_rdir)) { /* Hit process chroot()-ed root */ + done = 1; + break; + } + + if ( (vp = dp->v_parent) == NULLVP) + break; + + dp = vp; + *lookup_vpp = dp; + } /* for (;;) */ + + vid = dp->v_id; + + name_cache_unlock(); + + if (done == 0 && dp != start_vp) { + if (vnode_getwithvid(dp, vid) != 0) { + *lookup_vpp = start_vp; + } + } + + return((done == 1) ? 
0 : -1); +} + +int +cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, vfs_context_t context, int *trailing_slash, int *dp_authorized) +{ + char *cp; /* pointer into pathname argument */ + int vid, vvid; + int auth_opaque = 0; + vnode_t vp = NULLVP; + vnode_t tdp = NULLVP; + kauth_cred_t ucred; + struct timeval tv; + unsigned int hash; + + ucred = vfs_context_ucred(context); + *trailing_slash = 0; + + name_cache_lock(); + + + if ( dp->v_mount && (dp->v_mount->mnt_kern_flag & MNTK_AUTH_OPAQUE) ) { + auth_opaque = 1; + microuptime(&tv); + } + for (;;) { + /* + * Search a directory. + * + * The cn_hash value is for use by cache_lookup + * The last component of the filename is left accessible via + * cnp->cn_nameptr for callers that need the name. + */ + hash = 0; + cp = cnp->cn_nameptr; + + while (*cp && (*cp != '/')) { + hash ^= crc32tab[((hash >> 24) ^ (unsigned char)*cp++)]; + } + /* + * the crc generator can legitimately generate + * a 0... however, 0 for us means that we + * haven't computed a hash, so use 1 instead + */ + if (hash == 0) + hash = 1; + cnp->cn_hash = hash; + cnp->cn_namelen = cp - cnp->cn_nameptr; + + ndp->ni_pathlen -= cnp->cn_namelen; + ndp->ni_next = cp; + + /* + * Replace multiple slashes by a single slash and trailing slashes + * by a null. This must be done before VNOP_LOOKUP() because some + * fs's don't know about trailing slashes. Remember if there were + * trailing slashes to handle symlinks, existing non-directories + * and non-existing files that won't be directories specially later. + */ + while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { + cp++; + ndp->ni_pathlen--; + + if (*cp == '\0') { + *trailing_slash = 1; + *ndp->ni_next = '\0'; + } + } + ndp->ni_next = cp; + + cnp->cn_flags &= ~(MAKEENTRY | ISLASTCN | ISDOTDOT); + + if (*cp == '\0') + cnp->cn_flags |= ISLASTCN; + + if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') + cnp->cn_flags |= ISDOTDOT; + + *dp_authorized = 0; + + if (auth_opaque && ((tv.tv_sec - dp->v_cred_timestamp) > VCRED_EXPIRED)) + break; + + if (dp->v_cred != ucred) + break; + /* + * indicate that we're allowed to traverse this directory... + * even if we fail the cache lookup or decide to bail for + * some other reason, this information is valid and is used + * to avoid doing a vnode_authorize before the call to VNOP_LOOKUP + */ + *dp_authorized = 1; + + if ( (cnp->cn_flags & (ISLASTCN | ISDOTDOT)) ) { + if (cnp->cn_nameiop != LOOKUP) + break; + if (cnp->cn_flags & (LOCKPARENT | NOCACHE | ISDOTDOT)) + break; + } + if ( (vp = cache_lookup_locked(dp, cnp)) == NULLVP) + break; + + if ( (cnp->cn_flags & ISLASTCN) ) + break; + + if (vp->v_type != VDIR) { + if (vp->v_type != VLNK) + vp = NULL; + break; + } + if (vp->v_mountedhere && ((cnp->cn_flags & NOCROSSMOUNT) == 0)) + break; + + dp = vp; + vp = NULLVP; + + cnp->cn_nameptr = ndp->ni_next + 1; + ndp->ni_pathlen--; + while (*cnp->cn_nameptr == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + } + if (vp != NULLVP) + vvid = vp->v_id; + vid = dp->v_id; + + name_cache_unlock(); + + + if ((vp != NULLVP) && (vp->v_type != VLNK) && + ((cnp->cn_flags & (ISLASTCN | LOCKPARENT | WANTPARENT | SAVESTART)) == ISLASTCN)) { + /* + * if we've got a child and it's the last component, and + * the lookup doesn't need to return the parent then we + * can skip grabbing an iocount on the parent, since all + * we're going to do with it is a vnode_put just before + * we return from 'lookup'. 
If it's a symbolic link, + * we need the parent in case the link happens to be + * a relative pathname. + */ + tdp = dp; + dp = NULLVP; + } else { +need_dp: + /* + * return the last directory we looked at + * with an io reference held + */ + if (dp == ndp->ni_usedvp) { + /* + * if this vnode matches the one passed in via USEDVP + * than this context already holds an io_count... just + * use vnode_get to get an extra ref for lookup to play + * with... can't use the getwithvid variant here because + * it will block behind a vnode_drain which would result + * in a deadlock (since we already own an io_count that the + * vnode_drain is waiting on)... vnode_get grabs the io_count + * immediately w/o waiting... it always succeeds + */ + vnode_get(dp); + } else if ( (vnode_getwithvid(dp, vid)) ) { + /* + * failure indicates the vnode + * changed identity or is being + * TERMINATED... in either case + * punt this lookup + */ + return (ENOENT); + } + } + if (vp != NULLVP) { + if ( (vnode_getwithvid(vp, vvid)) ) { + vp = NULLVP; + + /* + * can't get reference on the vp we'd like + * to return... if we didn't grab a reference + * on the directory (due to fast path bypass), + * then we need to do it now... we can't return + * with both ni_dvp and ni_vp NULL, and no + * error condition + */ + if (dp == NULLVP) { + dp = tdp; + goto need_dp; + } + } + } + ndp->ni_dvp = dp; + ndp->ni_vp = vp; + + return (0); +} + + +static vnode_t +cache_lookup_locked(vnode_t dvp, struct componentname *cnp) +{ + register struct namecache *ncp; + register struct nchashhead *ncpp; + register long namelen = cnp->cn_namelen; + char *nameptr = cnp->cn_nameptr; + unsigned int hashval = (cnp->cn_hash & NCHASHMASK); + vnode_t vp; + + ncpp = NCHHASH(dvp, cnp->cn_hash); + LIST_FOREACH(ncp, ncpp, nc_hash) { + if ((ncp->nc_dvp == dvp) && (ncp->nc_hashval == hashval)) { + if (memcmp(ncp->nc_name, nameptr, namelen) == 0 && ncp->nc_name[namelen] == 0) + break; + } + } + if (ncp == 0) + /* + * We failed to find an entry + */ + return (NULL); + + vp = ncp->nc_vp; + if (vp && (vp->v_flag & VISHARDLINK)) { + /* + * The file system wants a VNOP_LOOKUP on this vnode + */ + vp = NULL; + } + + return (vp); } @@ -152,26 +795,30 @@ int doingcache = 1; /* 1 => enable the cache */ // hash part of a componentname. // static unsigned int -hash_string(const char *str, int len) +hash_string(const char *cp, int len) { - unsigned int i, hashval = 0; + unsigned hash = 0; - if (len == 0) { - for(i=1; *str != 0; i++, str++) { - hashval += (unsigned char)*str * i; - } + if (len) { + while (len--) { + hash ^= crc32tab[((hash >> 24) ^ (unsigned char)*cp++)]; + } } else { - for(i=len; i > 0; i--, str++) { - hashval += (unsigned char)*str * (len - i + 1); - } + while (*cp != '\0') { + hash ^= crc32tab[((hash >> 24) ^ (unsigned char)*cp++)]; + } } - - return hashval; + /* + * the crc generator can legitimately generate + * a 0... 
however, 0 for us means that we + * haven't computed a hash, so use 1 instead + */ + if (hash == 0) + hash = 1; + return hash; } - - /* * Lookup an entry in the cache * @@ -193,74 +840,73 @@ cache_lookup(dvp, vpp, cnp) struct vnode **vpp; struct componentname *cnp; { - register struct namecache *ncp, *nnp; + register struct namecache *ncp; register struct nchashhead *ncpp; register long namelen = cnp->cn_namelen; char *nameptr = cnp->cn_nameptr; + unsigned int hashval = (cnp->cn_hash & NCHASHMASK); + uint32_t vid; + vnode_t vp; - if (!doingcache) { - cnp->cn_flags &= ~MAKEENTRY; - return (0); - } + name_cache_lock(); ncpp = NCHHASH(dvp, cnp->cn_hash); - for (ncp = ncpp->lh_first; ncp != 0; ncp = nnp) { - nnp = ncp->nc_hash.le_next; - - if (ncp->nc_dvp == dvp && - strncmp(ncp->nc_name, nameptr, namelen) == 0 && - ncp->nc_name[namelen] == 0) { - /* Make sure the vp isn't stale. */ - if ((ncp->nc_dvpid != dvp->v_id) || - (ncp->nc_vp && ncp->nc_vpid != ncp->nc_vp->v_id)) { - nchstats.ncs_falsehits++; - PURGE(ncp); - continue; - } - break; + LIST_FOREACH(ncp, ncpp, nc_hash) { + if ((ncp->nc_dvp == dvp) && (ncp->nc_hashval == hashval)) { + if (memcmp(ncp->nc_name, nameptr, namelen) == 0 && ncp->nc_name[namelen] == 0) + break; } } - /* We failed to find an entry */ if (ncp == 0) { nchstats.ncs_miss++; + name_cache_unlock(); return (0); } /* We don't want to have an entry, so dump it */ if ((cnp->cn_flags & MAKEENTRY) == 0) { nchstats.ncs_badhits++; - PURGE(ncp); + cache_delete(ncp, 1); + name_cache_unlock(); return (0); } + vp = ncp->nc_vp; /* We found a "positive" match, return the vnode */ - if (ncp->nc_vp) { - if (ncp->nc_vp->v_flag & (VUINIT|VXLOCK|VTERMINATE|VORECLAIM)) { - PURGE(ncp); - return (0); - } - + if (vp) { nchstats.ncs_goodhits++; - TOUCH(ncp); - *vpp = ncp->nc_vp; + + vid = vp->v_id; + name_cache_unlock(); + + if (vnode_getwithvid(vp, vid)) { + name_cache_lock(); + nchstats.ncs_badvid++; + name_cache_unlock(); + return (0); + } + *vpp = vp; return (-1); } /* We found a negative match, and want to create it, so purge */ - if (cnp->cn_nameiop == CREATE) { + if (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) { nchstats.ncs_badhits++; - PURGE(ncp); + cache_delete(ncp, 1); + name_cache_unlock(); return (0); } /* * We found a "negative" match, ENOENT notifies client of this match. - * The nc_vpid field records whether this is a whiteout. + * The nc_whiteout field records whether this is a whiteout. */ nchstats.ncs_neghits++; - TOUCH(ncp); - cnp->cn_flags |= ncp->nc_vpid; + + if (ncp->nc_whiteout) + cnp->cn_flags |= ISWHITEOUT; + name_cache_unlock(); return (ENOENT); } @@ -273,57 +919,69 @@ cache_enter(dvp, vp, cnp) struct vnode *vp; struct componentname *cnp; { - register struct namecache *ncp; + register struct namecache *ncp, *negp; register struct nchashhead *ncpp; - if (!doingcache) - return; + if (cnp->cn_hash == 0) + cnp->cn_hash = hash_string(cnp->cn_nameptr, cnp->cn_namelen); + + name_cache_lock(); + /* if the entry is for -ve caching vp is null */ + if ((vp != NULLVP) && (LIST_FIRST(&vp->v_nclinks))) { + /* + * someone beat us to the punch.. + * this vnode is already in the cache + */ + name_cache_unlock(); + return; + } /* * We allocate a new entry if we are less than the maximum - * allowed and the one at the front of the LRU list is in use. - * Otherwise we use the one at the front of the LRU list. + * allowed and the one at the front of the list is in use. + * Otherwise we use the one at the front of the list. 
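Both the inline loop in cache_lookup_path() and hash_string() fold name bytes through crc32tab the same way. Restated as one standalone helper (a sketch; the kernel keeps the two open-coded forms above):

static unsigned int
name_hash_sketch(const char *cp)
{
        unsigned int hash = 0;

        /* table-driven CRC-32, folded one byte at a time */
        while (*cp != '\0')
                hash ^= crc32tab[((hash >> 24) ^ (unsigned char)*cp++)];

        /* 0 is reserved to mean "hash not yet computed", so remap it */
        return (hash == 0) ? 1 : hash;
}
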
*/ - if (numcache < desiredvnodes && - ((ncp = nclruhead.tqh_first) == NULL || - ncp->nc_hash.le_prev != 0)) { - /* Add one more entry */ - ncp = (struct namecache *) - _MALLOC_ZONE((u_long)sizeof *ncp, M_CACHE, M_WAITOK); + if (numcache < desiredNodes && + ((ncp = nchead.tqh_first) == NULL || + ncp->nc_hash.le_prev != 0)) { + /* + * Allocate one more entry + */ + ncp = (struct namecache *)_MALLOC_ZONE((u_long)sizeof *ncp, M_CACHE, M_WAITOK); numcache++; - } else if (ncp = nclruhead.tqh_first) { - /* reuse an old entry */ - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); + } else { + /* + * reuse an old entry + */ + ncp = TAILQ_FIRST(&nchead); + TAILQ_REMOVE(&nchead, ncp, nc_entry); + if (ncp->nc_hash.le_prev != 0) { -#if DIAGNOSTIC - if (ncp->nc_hash.le_next == ncp) - panic("cache_enter: le_next"); -#endif - LIST_REMOVE(ncp, nc_hash); - remove_name(ncp->nc_name); - ncp->nc_name = NULL; - ncp->nc_hash.le_prev = 0; + /* + * still in use... we need to + * delete it before re-using it + */ + nchstats.ncs_stolen++; + cache_delete(ncp, 0); } - } else { - /* give up */ - return; } + nchstats.ncs_enters++; /* * Fill in cache info, if vp is NULL this is a "negative" cache entry. - * For negative entries, we have to record whether it is a whiteout. - * the whiteout flag is stored in the nc_vpid field which is - * otherwise unused. */ ncp->nc_vp = vp; - if (vp) - ncp->nc_vpid = vp->v_id; - else - ncp->nc_vpid = cnp->cn_flags & ISWHITEOUT; ncp->nc_dvp = dvp; - ncp->nc_dvpid = dvp->v_id; - ncp->nc_name = add_name(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0); - TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); + ncp->nc_hashval = cnp->cn_hash; + ncp->nc_whiteout = FALSE; + ncp->nc_name = add_name_locked(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0); + + /* + * make us the newest entry in the cache + * i.e. we'll be the last to be stolen + */ + TAILQ_INSERT_TAIL(&nchead, ncp, nc_entry); + ncpp = NCHHASH(dvp, cnp->cn_hash); #if DIAGNOSTIC { @@ -334,124 +992,294 @@ cache_enter(dvp, vp, cnp) panic("cache_enter: duplicate"); } #endif + /* + * make us available to be found via lookup + */ LIST_INSERT_HEAD(ncpp, ncp, nc_hash); + + if (vp) { + /* + * add to the list of name cache entries + * that point at vp + */ + LIST_INSERT_HEAD(&vp->v_nclinks, ncp, nc_un.nc_link); + } else { + /* + * this is a negative cache entry (vp == NULL) + * stick it on the negative cache list + * and record the whiteout state + */ + TAILQ_INSERT_TAIL(&neghead, ncp, nc_un.nc_negentry); + + if (cnp->cn_flags & ISWHITEOUT) + ncp->nc_whiteout = TRUE; + nchstats.ncs_negtotal++; + + if (nchstats.ncs_negtotal > desiredNegNodes) { + /* + * if we've reached our desired limit + * of negative cache entries, delete + * the oldest + */ + negp = TAILQ_FIRST(&neghead); + TAILQ_REMOVE(&neghead, negp, nc_un.nc_negentry); + + cache_delete(negp, 1); + } + } + /* + * add us to the list of name cache entries that + * are children of dvp + */ + LIST_INSERT_HEAD(&dvp->v_ncchildren, ncp, nc_child); + + name_cache_unlock(); } + +/* + * Initialize CRC-32 remainder table. 
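Note that a NULL vp is a first-class citizen in cache_enter() above: entering one records a negative entry on neghead, with the population capped at desiredNegNodes. A hypothetical file-system lookup path seeding such an entry after a miss might look like:

static void
lookup_miss_sketch(vnode_t dvp, struct componentname *cnp)
{
        /*
         * vp == NULLVP makes this a negative entry... later
         * cache_lookup() hits on this name return ENOENT, and
         * any ISWHITEOUT state in cn_flags is recorded with it
         */
        if (cnp->cn_flags & MAKEENTRY)
                cache_enter(dvp, NULLVP, cnp);
}
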
+ */ +static void init_crc32(void) +{ + /* + * the CRC-32 generator polynomial is: + * x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^10 + * + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1 + */ + unsigned int crc32_polynomial = 0x04c11db7; + unsigned int i,j; + + /* + * pre-calculate the CRC-32 remainder for each possible octet encoding + */ + for (i = 0; i < 256; i++) { + unsigned int crc_rem = i << 24; + + for (j = 0; j < 8; j++) { + if (crc_rem & 0x80000000) + crc_rem = (crc_rem << 1) ^ crc32_polynomial; + else + crc_rem = (crc_rem << 1); + } + crc32tab[i] = crc_rem; + } +} + + /* * Name cache initialization, from vfs_init() when we are booting */ void -nchinit() +nchinit(void) +{ + desiredNegNodes = (desiredvnodes / 10); + desiredNodes = desiredvnodes + desiredNegNodes; + + TAILQ_INIT(&nchead); + TAILQ_INIT(&neghead); + + init_crc32(); + + nchashtbl = hashinit(MAX(4096, (2 *desiredNodes)), M_CACHE, &nchash); + nchashmask = nchash; + nchash++; + + init_string_table(); + + /* Allocate mount list lock group attribute and group */ + namecache_lck_grp_attr= lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(namecache_lck_grp_attr); + + namecache_lck_grp = lck_grp_alloc_init("Name Cache", namecache_lck_grp_attr); + + /* Allocate mount list lock attribute */ + namecache_lck_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(namecache_lck_attr); + + /* Allocate mount list lock */ + namecache_mtx_lock = lck_mtx_alloc_init(namecache_lck_grp, namecache_lck_attr); + + +} + +void +name_cache_lock(void) { - static void init_string_table(void); + lck_mtx_lock(namecache_mtx_lock); +} - TAILQ_INIT(&nclruhead); - nchashtbl = hashinit(MAX(4096, desiredvnodes), M_CACHE, &nchash); +void +name_cache_unlock(void) +{ + lck_mtx_unlock(namecache_mtx_lock); - init_string_table(); } int resize_namecache(u_int newsize) { - struct nchashhead *new_table; - struct nchashhead *old_table; - struct nchashhead *old_head, *head; - struct namecache *entry, *next; - uint32_t i; - u_long new_mask, old_mask; + struct nchashhead *new_table; + struct nchashhead *old_table; + struct nchashhead *old_head, *head; + struct namecache *entry, *next; + uint32_t i, hashval; + int dNodes, dNegNodes; + u_long new_size, old_size; + + dNegNodes = (newsize / 10); + dNodes = newsize + dNegNodes; // we don't support shrinking yet - if (newsize < nchash) { + if (dNodes < desiredNodes) { return 0; } + new_table = hashinit(2 * dNodes, M_CACHE, &nchashmask); + new_size = nchashmask + 1; - new_table = hashinit(newsize, M_CACHE, &new_mask); if (new_table == NULL) { return ENOMEM; } + name_cache_lock(); // do the switch! 
old_table = nchashtbl; nchashtbl = new_table; - old_mask = nchash; - nchash = new_mask; + old_size = nchash; + nchash = new_size; // walk the old table and insert all the entries into // the new table // - for(i=0; i <= old_mask; i++) { + for(i=0; i < old_size; i++) { old_head = &old_table[i]; for (entry=old_head->lh_first; entry != NULL; entry=next) { // // XXXdbg - Beware: this assumes that hash_string() does // the same thing as what happens in // lookup() over in vfs_lookup.c - head = NCHHASH(entry->nc_dvp, hash_string(entry->nc_name, 0)); - + hashval = hash_string(entry->nc_name, 0); + entry->nc_hashval = hashval; + head = NCHHASH(entry->nc_dvp, hashval); + next = entry->nc_hash.le_next; LIST_INSERT_HEAD(head, entry, nc_hash); } } + desiredNodes = dNodes; + desiredNegNodes = dNegNodes; + name_cache_unlock(); FREE(old_table, M_CACHE); return 0; } +static void +cache_delete(struct namecache *ncp, int age_entry) +{ + nchstats.ncs_deletes++; + + if (ncp->nc_vp) { + LIST_REMOVE(ncp, nc_un.nc_link); + } else { + TAILQ_REMOVE(&neghead, ncp, nc_un.nc_negentry); + nchstats.ncs_negtotal--; + } + LIST_REMOVE(ncp, nc_child); + + LIST_REMOVE(ncp, nc_hash); + /* + * this field is used to indicate + * that the entry is in use and + * must be deleted before it can + * be reused... + */ + ncp->nc_hash.le_prev = NULL; + + if (age_entry) { + /* + * make it the next one available + * for cache_enter's use + */ + TAILQ_REMOVE(&nchead, ncp, nc_entry); + TAILQ_INSERT_HEAD(&nchead, ncp, nc_entry); + } + remove_name_locked(ncp->nc_name); + ncp->nc_name = NULL; +} + + +/* + * purge the entry associated with the + * specified vnode from the name cache + */ +void +cache_purge(vnode_t vp) +{ + struct namecache *ncp; + + if ((LIST_FIRST(&vp->v_nclinks) == NULL) && (LIST_FIRST(&vp->v_ncchildren) == NULL)) + return; + + name_cache_lock(); + while ( (ncp = LIST_FIRST(&vp->v_nclinks)) ) + cache_delete(ncp, 1); + while ( (ncp = LIST_FIRST(&vp->v_ncchildren)) ) + cache_delete(ncp, 1); + + name_cache_unlock(); +} /* - * Invalidate a all entries to particular vnode. - * - * We actually just increment the v_id, that will do it. The entries will - * be purged by lookup as they get found. If the v_id wraps around, we - * need to ditch the entire cache, to avoid confusion. No valid vnode will - * ever have (v_id == 0). + * Purge all negative cache entries that are children of the + * given vnode. A case-insensitive file system (or any file + * system that has multiple equivalent names for the same + * directory entry) can use this when creating or renaming + * to remove negative entries that may no longer apply. */ void -cache_purge(vp) - struct vnode *vp; +cache_purge_negatives(vnode_t vp) { struct namecache *ncp; - struct nchashhead *ncpp; - vp->v_id = ++nextvnodeid; - if (nextvnodeid != 0) - return; - for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { - while (ncp = ncpp->lh_first) - PURGE(ncp); - } - vp->v_id = ++nextvnodeid; + name_cache_lock(); + + LIST_FOREACH(ncp, &vp->v_ncchildren, nc_child) + if (ncp->nc_vp == NULL) + cache_delete(ncp , 1); + + name_cache_unlock(); } /* * Flush all entries referencing a particular filesystem. * * Since we need to check it anyway, we will flush all the invalid - * entriess at the same time. + * entries at the same time. 
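A concrete (hypothetical) example of the case the cache_purge_negatives() comment describes: a case-insensitive file system that just created a name must drop any cached "does not exist" answers for equivalent spellings under the same directory:

static void
post_create_sketch(vnode_t dvp, vnode_t new_vp, struct componentname *cnp)
{
        /* drop all negative children of dvp, whatever their spelling */
        cache_purge_negatives(dvp);

        /* optionally cache the newly created name as a positive entry */
        cache_enter(dvp, new_vp, cnp);
}
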
*/ void cache_purgevfs(mp) struct mount *mp; { struct nchashhead *ncpp; - struct namecache *ncp, *nnp; + struct namecache *ncp; + name_cache_lock(); /* Scan hash tables for applicable entries */ - for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { - for (ncp = ncpp->lh_first; ncp != 0; ncp = nnp) { - nnp = ncp->nc_hash.le_next; - if (ncp->nc_dvpid != ncp->nc_dvp->v_id || - (ncp->nc_vp && ncp->nc_vpid != ncp->nc_vp->v_id) || - ncp->nc_dvp->v_mount == mp) { - PURGE(ncp); + for (ncpp = &nchashtbl[nchash - 1]; ncpp >= nchashtbl; ncpp--) { +restart: + for (ncp = ncpp->lh_first; ncp != 0; ncp = ncp->nc_hash.le_next) { + if (ncp->nc_dvp->v_mount == mp) { + cache_delete(ncp, 0); + goto restart; } } } + name_cache_unlock(); } @@ -476,7 +1304,7 @@ typedef struct string_t { static int -resize_string_ref_table() +resize_string_ref_table(void) { struct stringhead *new_table; struct stringhead *old_table; @@ -533,11 +1361,23 @@ init_string_table(void) char * -add_name(const char *name, size_t len, u_int hashval, u_int flags) +vfs_addname(const char *name, size_t len, u_int hashval, u_int flags) +{ + char * ptr; + + name_cache_lock(); + ptr = add_name_locked(name, len, hashval, flags); + name_cache_unlock(); + + return(ptr); +} + +static char * +add_name_locked(const char *name, size_t len, u_int hashval, __unused u_int flags) { struct stringhead *head; string_t *entry; - int chain_len = 0; + uint32_t chain_len = 0; // // If the table gets more than 3/4 full, resize it @@ -547,14 +1387,13 @@ add_name(const char *name, size_t len, u_int hashval, u_int flags) printf("failed to resize the hash table.\n"); } } - if (hashval == 0) { - hashval = hash_string(name, len); + hashval = hash_string(name, 0); } head = &string_ref_table[hashval & string_table_mask]; for (entry=head->lh_first; entry != NULL; chain_len++, entry=entry->hash_chain.le_next) { - if (strncmp(entry->str, name, len) == 0 && entry->str[len] == '\0') { + if (memcmp(entry->str, name, len) == 0 && entry->str[len] == '\0') { entry->refcount++; num_dups++; break; @@ -573,11 +1412,11 @@ add_name(const char *name, size_t len, u_int hashval, u_int flags) filled_buckets++; } - LIST_INSERT_HEAD(head, entry, hash_chain); entry->str = (char *)((char *)entry + sizeof(string_t)); strncpy(entry->str, name, len); entry->str[len] = '\0'; entry->refcount = 1; + LIST_INSERT_HEAD(head, entry, hash_chain); if (chain_len > max_chain_len) { max_chain_len = chain_len; @@ -591,11 +1430,26 @@ add_name(const char *name, size_t len, u_int hashval, u_int flags) } int -remove_name(const char *nameref) +vfs_removename(const char *nameref) +{ + int i; + + name_cache_lock(); + i = remove_name_locked(nameref); + name_cache_unlock(); + + return(i); + +} + + +static int +remove_name_locked(const char *nameref) { struct stringhead *head; string_t *entry; uint32_t hashval; + char * ptr; hashval = hash_string(nameref, 0); head = &string_ref_table[hashval & string_table_mask]; @@ -607,6 +1461,7 @@ remove_name(const char *nameref) if (head->lh_first == NULL) { filled_buckets--; } + ptr = entry->str; entry->str = NULL; nstrings--; @@ -628,12 +1483,14 @@ dump_string_table(void) { struct stringhead *head; string_t *entry; - int i; + u_long i; - for(i=0; i <= string_table_mask; i++) { + name_cache_lock(); + for (i = 0; i <= string_table_mask; i++) { head = &string_ref_table[i]; for (entry=head->lh_first; entry != NULL; entry=entry->hash_chain.le_next) { printf("%6d - %s\n", entry->refcount, entry->str); } } + name_cache_unlock(); } diff --git a/bsd/vfs/vfs_cluster.c 
b/bsd/vfs/vfs_cluster.c index 616f09e1c..29a38b7c2 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -56,26 +56,34 @@ */ #include <sys/param.h> -#include <sys/proc.h> -#include <sys/buf.h> -#include <sys/vnode.h> -#include <sys/mount.h> +#include <sys/proc_internal.h> +#include <sys/buf_internal.h> +#include <sys/mount_internal.h> +#include <sys/vnode_internal.h> #include <sys/trace.h> #include <sys/malloc.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/resourcevar.h> +#include <sys/uio_internal.h> #include <libkern/libkern.h> #include <machine/machine_routines.h> -#include <sys/ubc.h> -#include <vm/vm_pageout.h> +#include <sys/ubc_internal.h> #include <mach/mach_types.h> #include <mach/memory_object_types.h> +#include <mach/vm_map.h> +#include <mach/upl.h> + +#include <vm/vm_kern.h> +#include <vm/vm_map.h> +#include <vm/vm_pageout.h> #include <sys/kdebug.h> + + #define CL_READ 0x01 #define CL_ASYNC 0x02 #define CL_COMMIT 0x04 @@ -87,6 +95,7 @@ #define CL_DEV_MEMORY 0x200 #define CL_PRESERVE 0x400 #define CL_THROTTLE 0x800 +#define CL_KEEPCACHED 0x1000 struct clios { @@ -96,57 +105,188 @@ struct clios { int io_wanted; /* someone is sleeping waiting for a change in state */ }; +static lck_grp_t *cl_mtx_grp; +static lck_attr_t *cl_mtx_attr; +static lck_grp_attr_t *cl_mtx_grp_attr; +static lck_mtx_t *cl_mtxp; + + +static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size, + int flags, buf_t real_bp, struct clios *iostate); +static int cluster_iodone(buf_t bp, void *dummy); +static int cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize); +static int cluster_hard_throttle_on(vnode_t vp); + +static int cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags); +static int cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, + off_t headOff, off_t tailOff, int flags); +static int cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize); +static int cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF); +static int cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize); +static int cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF); +static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags); -static void cluster_zero(upl_t upl, vm_offset_t upl_offset, - int size, struct buf *bp); -static int cluster_read_x(struct vnode *vp, struct uio *uio, - off_t filesize, int devblocksize, int flags); -static int cluster_write_x(struct vnode *vp, struct uio *uio, - off_t oldEOF, off_t newEOF, off_t headOff, - off_t tailOff, int devblocksize, int flags); -static int cluster_nocopy_read(struct vnode *vp, struct uio *uio, - off_t filesize, int devblocksize, int flags); -static int cluster_nocopy_write(struct vnode *vp, struct uio *uio, - off_t newEOF, int devblocksize, int flags); -static int cluster_phys_read(struct vnode *vp, struct uio *uio, - off_t filesize, int devblocksize, int flags); -static int cluster_phys_write(struct vnode *vp, struct uio *uio, - off_t newEOF, int devblocksize, int flags); -static int cluster_align_phys_io(struct vnode *vp, struct uio *uio, - addr64_t usr_paddr, int xsize, int devblocksize, int flags); -static int cluster_push_x(struct vnode *vp, off_t EOF, unsigned int first, 
unsigned int last, int can_delay); -static int cluster_try_push(struct vnode *vp, off_t EOF, int can_delay, int push_all); - -static int sparse_cluster_switch(struct vnode *vp, off_t EOF); -static int sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all); -static int sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last); +static void cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra); + +static int cluster_push_x(vnode_t vp, struct cl_extent *, off_t EOF, int flags); +static void cluster_push_EOF(vnode_t vp, off_t EOF); + +static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int can_delay, int push_all); + +static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF); +static void sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_all); +static void sparse_cluster_add(struct cl_writebehind *, vnode_t vp, struct cl_extent *, off_t EOF); static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp); -static kern_return_t vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length); static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp); static kern_return_t vfs_drt_control(void **cmapp, int op_type); -int ubc_page_op_with_control __P((memory_object_control_t, off_t, int, ppnum_t *, int *)); - +int is_file_clean(vnode_t, off_t); /* * throttle the number of async writes that * can be outstanding on a single vnode * before we issue a synchronous write */ -#define ASYNC_THROTTLE 18 -#define HARD_THROTTLE_MAXCNT 1 -#define HARD_THROTTLE_MAXSIZE (64 * 1024) +#define HARD_THROTTLE_MAXCNT 0 +#define HARD_THROTTLE_MAXSIZE (64 * 1024) int hard_throttle_on_root = 0; struct timeval priority_IO_timestamp_for_root; +void +cluster_init(void) { + /* + * allocate lock group attribute and group + */ + cl_mtx_grp_attr = lck_grp_attr_alloc_init(); + //lck_grp_attr_setstat(cl_mtx_grp_attr); + cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr); + + /* + * allocate the lock attribute + */ + cl_mtx_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(cl_mtx_attr); + + /* + * allocate and initialize mutexes used to protect updates and waits + * on the cluster_io context + */ + cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr); + + if (cl_mtxp == NULL) + panic("cluster_init: failed to allocate cl_mtxp"); +} + + + +#define CLW_ALLOCATE 0x01 +#define CLW_RETURNLOCKED 0x02 +/* + * if the read ahead context doesn't yet exist, + * allocate and initialize it... + * the vnode lock serializes multiple callers + * during the actual assignment... first one + * to grab the lock wins... the other callers + * will release the now unnecessary storage + * + * once the context is present, try to grab (but don't block on) + * the lock associated with it... if someone + * else currently owns it, then the read + * will run without read-ahead. this allows + * multiple readers to run in parallel and + * since there's only 1 read ahead context, + * there's no real loss in only allowing 1 + * reader to have read-ahead enabled.
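The non-blocking lck_mtx_try_lock protocol described here means read-ahead degrades gracefully instead of serializing readers. A sketch of the caller-side shape, assuming the cluster_get_rap() helper defined just below and the cluster_rd_ahead() prototype above:

static void
read_with_optional_ra_sketch(vnode_t vp, struct cl_extent *extent, off_t filesize)
{
        struct cl_readahead *rap;

        if ((rap = cluster_get_rap(vp)) != NULL) {
                /* we own cl_lockr: safe to update the read-ahead state */
                cluster_rd_ahead(vp, extent, filesize, rap);
                lck_mtx_unlock(&rap->cl_lockr);
        }
        /* rap == NULL: another reader owns it; this read simply runs without read-ahead */
}
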
+ */ +static struct cl_readahead * +cluster_get_rap(vnode_t vp) +{ + struct ubc_info *ubc; + struct cl_readahead *rap; + + ubc = vp->v_ubcinfo; + + if ((rap = ubc->cl_rahead) == NULL) { + MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK); + + bzero(rap, sizeof *rap); + rap->cl_lastr = -1; + lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr); + + vnode_lock(vp); + + if (ubc->cl_rahead == NULL) + ubc->cl_rahead = rap; + else { + lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp); + FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD); + rap = ubc->cl_rahead; + } + vnode_unlock(vp); + } + if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE) + return(rap); + + return ((struct cl_readahead *)NULL); +} + + +/* + * if the write behind context doesn't yet exist, + * and CLW_ALLOCATE is specified, allocate and initialize it... + * the vnode lock serializes multiple callers + * during the actual assignment... first one + * to grab the lock wins... the other callers + * will release the now unnecessary storage + * + * if CLW_RETURNLOCKED is set, grab (blocking if necessary) + * the lock associated with the write behind context before + * returning + */ + +static struct cl_writebehind * +cluster_get_wbp(vnode_t vp, int flags) +{ + struct ubc_info *ubc; + struct cl_writebehind *wbp; + + ubc = vp->v_ubcinfo; + + if ((wbp = ubc->cl_wbehind) == NULL) { + + if ( !(flags & CLW_ALLOCATE)) + return ((struct cl_writebehind *)NULL); + + MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK); + + bzero(wbp, sizeof *wbp); + lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr); + + vnode_lock(vp); + + if (ubc->cl_wbehind == NULL) + ubc->cl_wbehind = wbp; + else { + lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp); + FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND); + wbp = ubc->cl_wbehind; + } + vnode_unlock(vp); + } + if (flags & CLW_RETURNLOCKED) + lck_mtx_lock(&wbp->cl_lockw); + + return (wbp); +} + + static int -cluster_hard_throttle_on(vp) - struct vnode *vp; +cluster_hard_throttle_on(vnode_t vp) { - static struct timeval hard_throttle_maxelapsed = { 0, 300000 }; + static struct timeval hard_throttle_maxelapsed = { 0, 200000 }; if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) { struct timeval elapsed; @@ -154,7 +294,7 @@ cluster_hard_throttle_on(vp) if (hard_throttle_on_root) return(1); - elapsed = time; + microuptime(&elapsed); timevalsub(&elapsed, &priority_IO_timestamp_for_root); if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <)) @@ -165,27 +305,24 @@ cluster_hard_throttle_on(vp) static int -cluster_iodone(bp) - struct buf *bp; +cluster_iodone(buf_t bp, __unused void *dummy) { - int b_flags; - int error; - int total_size; - int total_resid; - int upl_offset; - int zero_offset; - upl_t upl; - struct buf *cbp; - struct buf *cbp_head; - struct buf *cbp_next; - struct buf *real_bp; - struct vnode *vp; - struct clios *iostate; - int commit_size; - int pg_offset; - - - cbp_head = (struct buf *)(bp->b_trans_head); + int b_flags; + int error; + int total_size; + int total_resid; + int upl_offset; + int zero_offset; + upl_t upl; + buf_t cbp; + buf_t cbp_head; + buf_t cbp_next; + buf_t real_bp; + struct clios *iostate; + int commit_size; + int pg_offset; + + cbp_head = (buf_t)(bp->b_trans_head); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START, (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); @@ -209,13 +346,15 @@ cluster_iodone(bp) cbp = cbp_head; upl_offset = cbp->b_uploffset; - upl = cbp->b_pagelist; + upl = cbp->b_upl; b_flags = cbp->b_flags; real_bp = 
cbp->b_real_bp; - vp = cbp->b_vp; zero_offset= cbp->b_validend; iostate = (struct clios *)cbp->b_iostate; + if (real_bp) + real_bp->b_dev = cbp->b_dev; + while (cbp) { if ((cbp->b_flags & B_ERROR) && error == 0) error = cbp->b_error; @@ -232,15 +371,15 @@ cluster_iodone(bp) if (zero_offset) cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp); - if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) { - vp->v_flag &= ~VTHROTTLED; - wakeup((caddr_t)&vp->v_numoutput); - } if (iostate) { + int need_wakeup = 0; + /* * someone has issued multiple I/Os asynchrounsly * and is waiting for them to complete (streaming) */ + lck_mtx_lock(cl_mtxp); + if (error && iostate->io_error == 0) iostate->io_error = error; @@ -252,8 +391,12 @@ cluster_iodone(bp) * this io stream to change */ iostate->io_wanted = 0; - wakeup((caddr_t)&iostate->io_wanted); + need_wakeup = 1; } + lck_mtx_unlock(cl_mtxp); + + if (need_wakeup) + wakeup((caddr_t)&iostate->io_wanted); } if ((b_flags & B_NEED_IODONE) && real_bp) { if (error) { @@ -262,7 +405,7 @@ cluster_iodone(bp) } real_bp->b_resid = total_resid; - biodone(real_bp); + buf_biodone(real_bp); } if (error == 0 && total_resid) error = EIO; @@ -273,17 +416,27 @@ cluster_iodone(bp) if (error || (b_flags & B_NOCACHE)) { int upl_abort_code; + int page_in = 0; + int page_out = 0; - if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */ + if (b_flags & B_PAGEIO) { + if (b_flags & B_READ) + page_in = 1; + else + page_out = 1; + } + if (b_flags & B_CACHE) /* leave pages in the cache unchanged on error */ upl_abort_code = UPL_ABORT_FREE_ON_EMPTY; - else if (b_flags & B_PGIN) - upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR; + else if (page_out && (error != ENXIO)) /* transient error */ + upl_abort_code = UPL_ABORT_FREE_ON_EMPTY; + else if (page_in) + upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR; else upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES; ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size, - upl_abort_code); - + upl_abort_code); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, (int)upl, upl_offset - pg_offset, commit_size, 0x80000000|upl_abort_code, 0); @@ -291,11 +444,8 @@ cluster_iodone(bp) } else { int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY; - if (b_flags & B_PHYS) { - if (b_flags & B_READ) - upl_commit_flags |= UPL_COMMIT_SET_DIRTY; - } else if ( !(b_flags & B_PAGEOUT)) - upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY; + if ((b_flags & B_PHYS) && (b_flags & B_READ)) + upl_commit_flags |= UPL_COMMIT_SET_DIRTY; if (b_flags & B_AGE) upl_commit_flags |= UPL_COMMIT_INACTIVATE; @@ -307,27 +457,24 @@ cluster_iodone(bp) (int)upl, upl_offset - pg_offset, commit_size, upl_commit_flags, 0); } - } else + } else { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, (int)upl, upl_offset, 0, error, 0); + } return (error); } -static void -cluster_zero(upl, upl_offset, size, bp) - upl_t upl; - vm_offset_t upl_offset; - int size; - struct buf *bp; +void +cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp) { upl_page_info_t *pl; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START, upl_offset, size, (int)bp, 0, 0); - if (bp == NULL || bp->b_data == NULL) { + if (bp == NULL || bp->b_datap == 0) { pl = ubc_upl_pageinfo(upl); @@ -349,62 +496,83 @@ cluster_zero(upl, upl_offset, size, bp) upl_offset += zero_cnt; } } else - bzero((caddr_t)((vm_offset_t)bp->b_data + upl_offset), size); + bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), 
size); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END, upl_offset, size, 0, 0, 0); } + static int -cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate) - struct vnode *vp; - upl_t upl; - vm_offset_t upl_offset; - off_t f_offset; - int non_rounded_size; - int devblocksize; - int flags; - struct buf *real_bp; - struct clios *iostate; +cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size, + int flags, buf_t real_bp, struct clios *iostate) { - struct buf *cbp; - u_int size; - u_int io_size; - int io_flags; - int error = 0; - int retval = 0; - struct buf *cbp_head = 0; - struct buf *cbp_tail = 0; - int buf_count = 0; - int pg_count; - int pg_offset; - u_int max_iosize; - u_int max_vectors; - int priv; - int zero_offset = 0; - int async_throttle; - - if (devblocksize) - size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1); - else - size = non_rounded_size; + buf_t cbp; + u_int size; + u_int io_size; + int io_flags; + int bmap_flags; + int error = 0; + int retval = 0; + buf_t cbp_head = NULL; + buf_t cbp_tail = NULL; + int trans_count = 0; + u_int pg_count; + int pg_offset; + u_int max_iosize; + u_int max_vectors; + int priv; + int zero_offset = 0; + int async_throttle = 0; + mount_t mp; + + mp = vp->v_mount; + + if (mp->mnt_devblocksize > 1) { + /* + * round the requested size up so that this I/O ends on a + * page boundary in case this is a 'write'... if the filesystem + * has blocks allocated to back the page beyond the EOF, we want to + * make sure to write out the zeros that are sitting beyond the EOF + * so that in case the filesystem doesn't explicitly zero this area + * if a hole is created via a lseek/write beyond the current EOF, + * it will return zeros when it's read back from the disk. If the + * physical allocation doesn't extend for the whole page, we'll + * only write/read from the disk up to the end of this allocation + * via the extent info returned from the VNOP_BLOCKMAP call.
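/*
 * A worked example of the page-rounding arithmetic referred to above,
 * assuming the usual 4K page: the rounded size makes the transfer end
 * on a page boundary without moving the starting offset.  The values
 * are illustrative only.
 */
#include <stdio.h>

#define EX_PAGE_SIZE 4096
#define EX_PAGE_MASK (EX_PAGE_SIZE - 1)

int main(void)
{
        int upl_offset       = 512;     /* transfer starts 512 bytes into a page */
        int non_rounded_size = 6000;    /* caller asked for 6000 bytes */

        int pg_offset = upl_offset & EX_PAGE_MASK;
        int size = (((non_rounded_size + pg_offset) + (EX_PAGE_SIZE - 1)) & ~EX_PAGE_MASK)
            - pg_offset;

        /* prints 7680: 512 + 7680 == 8192, i.e. the I/O ends on a page boundary */
        printf("rounded size = %d\n", size);
        return (0);
}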
+ */ + pg_offset = upl_offset & PAGE_MASK; + size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset; + } else { + /* + * anyone advertising a blocksize of 1 byte probably + * can't deal with us rounding up the request size + * AFP is one such filesystem/device + */ + size = non_rounded_size; + } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0); - if (flags & CL_READ) { - io_flags = (B_VECTORLIST | B_READ); + io_flags = (B_READ); + bmap_flags = VNODE_READ; - vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors); + max_iosize = mp->mnt_maxreadcnt; + max_vectors = mp->mnt_segreadcnt; } else { - io_flags = (B_VECTORLIST | B_WRITEINPROG); + io_flags = 0; + bmap_flags = VNODE_WRITE; - vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors); + max_iosize = mp->mnt_maxwritecnt; + max_vectors = mp->mnt_segwritecnt; } + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0); + /* - * make sure the maximum iosize are at least the size of a page - * and that they are multiples of the page size + * make sure the maximum iosize is a + * multiple of the page size */ max_iosize &= ~PAGE_MASK; @@ -414,20 +582,20 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, max_iosize = HARD_THROTTLE_MAXSIZE; async_throttle = HARD_THROTTLE_MAXCNT; } else - async_throttle = ASYNC_THROTTLE; + async_throttle = VNODE_ASYNC_THROTTLE; } if (flags & CL_AGE) io_flags |= B_AGE; if (flags & CL_DUMP) io_flags |= B_NOCACHE; - if (flags & CL_PAGEIN) - io_flags |= B_PGIN; - if (flags & CL_PAGEOUT) - io_flags |= B_PAGEOUT; + if (flags & (CL_PAGEIN | CL_PAGEOUT)) + io_flags |= B_PAGEIO; if (flags & CL_COMMIT) io_flags |= B_COMMIT_UPL; if (flags & CL_PRESERVE) io_flags |= B_PHYS; + if (flags & CL_KEEPCACHED) + io_flags |= B_CACHE; if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) { /* @@ -440,50 +608,117 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, zero_offset = upl_offset + non_rounded_size; } while (size) { - int vsize; - int i; - int pg_resid; - int num_contig; - daddr_t lblkno; - daddr_t blkno; + int pg_resid; + daddr64_t blkno; + daddr64_t lblkno; if (size > max_iosize) io_size = max_iosize; else io_size = size; - - if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) { - if (error == EOPNOTSUPP) - panic("VOP_CMAP Unimplemented"); + + if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, bmap_flags, NULL))) { break; } + if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) + real_bp->b_blkno = blkno; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE, (int)f_offset, (int)blkno, io_size, zero_offset, 0); - if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) { + if (io_size == 0) { + /* + * vnop_blockmap didn't return an error... however, it did + * return an extent size of 0 which means we can't + * make forward progress on this I/O... 
a hole in the + * file would be returned as a blkno of -1 with a non-zero io_size + * a real extent is returned with a blkno != -1 and a non-zero io_size + */ + error = EINVAL; + break; + } + if ( !(flags & CL_READ) && blkno == -1) { + off_t e_offset; + + /* + * we're writing into a 'hole' + */ if (flags & CL_PAGEOUT) { + /* + * if we got here via cluster_pageout + * then just error the request and return + * the 'hole' should already have been covered + */ error = EINVAL; break; - }; - - /* Try paging out the page individually before - giving up entirely and dumping it (it could - be mapped in a "hole" and require allocation - before the I/O: + } + if ( !(flags & CL_COMMIT)) { + /* + * currently writes always request the commit to happen + * as part of the io completion... however, if the CL_COMMIT + * flag isn't specified, then we can't issue the abort_range + * since the call site is going to abort or commit the same upl... + * in this case we can only return an error + */ + error = EINVAL; + break; + } + /* + * we can get here if the cluster code happens to + * pick up a page that was dirtied via mmap vs + * a 'write' and the page targets a 'hole'... + * i.e. the writes to the cluster were sparse + * and the file was being written for the first time + * + * we can also get here if the filesystem supports + * 'holes' that are less than PAGE_SIZE.... because + * we can't know if the range in the page that covers + * the 'hole' has been dirtied via an mmap or not, + * we have to assume the worst and try to push the + * entire page to storage. + * + * Try paging out the page individually before + * giving up entirely and dumping it (the pageout + * path will ensure that the zero extent accounting + * has been taken care of before we get back into cluster_io) */ - ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY); - if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) { - error = EINVAL; + ubc_upl_abort_range(upl, trunc_page(upl_offset), PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY); + + e_offset = round_page_64(f_offset + 1); + + if (ubc_sync_range(vp, f_offset, e_offset, UBC_PUSHDIRTY) == 0) { + error = EINVAL; break; - }; - - f_offset += PAGE_SIZE_64; - upl_offset += PAGE_SIZE; - size -= PAGE_SIZE; + } + io_size = e_offset - f_offset; + + f_offset += io_size; + upl_offset += io_size; + + if (size >= io_size) + size -= io_size; + else + size = 0; + /* + * keep track of how much of the original request + * that we've actually completed... non_rounded_size + * may go negative due to us rounding the request + * to a page size multiple (i.e. size > non_rounded_size) + */ + non_rounded_size -= io_size; + + if (non_rounded_size <= 0) { + /* + * we've transferred all of the data in the original + * request, but we were unable to complete the tail + * of the last page because the file didn't have + * an allocation to back that portion... this is ok.
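/*
 * The hole-skipping arithmetic above advances the write to the end of
 * the page containing f_offset once that page has been pushed via
 * ubc_sync_range.  A self-contained illustration, assuming 4K pages;
 * ex_round_page_64 is a local stand-in for the kernel's round_page_64
 * and the offsets are made up.
 */
#include <stdio.h>
#include <stdint.h>

#define EX_PAGE_SIZE_64 4096ULL

static uint64_t
ex_round_page_64(uint64_t x)
{
        return ((x + EX_PAGE_SIZE_64 - 1) & ~(EX_PAGE_SIZE_64 - 1));
}

int main(void)
{
        uint64_t f_offset = 13000;                          /* mid-page file offset */
        uint64_t e_offset = ex_round_page_64(f_offset + 1); /* 16384 */
        uint64_t io_size  = e_offset - f_offset;            /* 3384 bytes skipped */

        printf("push [%llu, %llu), advance %llu bytes\n",
            (unsigned long long)f_offset,
            (unsigned long long)e_offset,
            (unsigned long long)io_size);
        return (0);
}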
+ */ + size = 0; + } continue; } - lblkno = (daddr_t)(f_offset / PAGE_SIZE_64); + lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64); /* * we have now figured out how much I/O we can do - this is in 'io_size' * pg_offset is the starting point in the first page for the I/O @@ -495,7 +730,7 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, /* * currently, can't deal with reading 'holes' in file */ - if ((long)blkno == -1) { + if (blkno == -1) { error = EINVAL; break; } @@ -506,7 +741,7 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, } else pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE; - if ((flags & CL_READ) && (long)blkno == -1) { + if ((flags & CL_READ) && blkno == -1) { int bytes_to_zero; /* @@ -518,7 +753,7 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, /* * if this upl contains the EOF and it is not a multiple of PAGE_SIZE * than 'zero_offset' will be non-zero - * if the 'hole' returned by VOP_CMAP extends all the way to the eof + * if the 'hole' returned by vnop_blockmap extends all the way to the eof * (indicated by the io_size finishing off the I/O request for this UPL) * than we're not going to issue an I/O for the * last page in this upl... we need to zero both the hole and the tail @@ -574,26 +809,39 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, upl_offset += io_size; f_offset += io_size; size -= io_size; + /* + * keep track of how much of the original request + * that we've actually completed... non_rounded_size + * may go negative due to us rounding the request + * to a page size multiple (i.e. size > non_rounded_size) + */ + non_rounded_size -= io_size; + if (non_rounded_size <= 0) { + /* + * we've transferred all of the data in the original + * request, but we were unable to complete the tail + * of the last page because the file didn't have + * an allocation to back that portion... this is ok. + */ + size = 0; + } if (cbp_head && pg_count) goto start_io; continue; - } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) { - real_bp->b_blkno = blkno; } - if (pg_count > max_vectors) { - io_size -= (pg_count - max_vectors) * PAGE_SIZE; - - if (io_size < 0) { + if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) { io_size = PAGE_SIZE - pg_offset; pg_count = 1; - } else + } else { + io_size -= (pg_count - max_vectors) * PAGE_SIZE; pg_count = max_vectors; + } } - if ( !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)) + if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV)) /* * if we're not targeting a virtual device i.e. 
a disk image * it's safe to dip into the reserve pool since real devices @@ -611,51 +859,44 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, cbp = alloc_io_buf(vp, priv); - if (flags & CL_PAGEOUT) { + u_int i; + for (i = 0; i < pg_count; i++) { - int s; - struct buf *bp; - - s = splbio(); - if (bp = incore(vp, lblkno + i)) { - if (!ISSET(bp->b_flags, B_BUSY)) { - bremfree(bp); - SET(bp->b_flags, (B_BUSY | B_INVAL)); - splx(s); - brelse(bp); - } else - panic("BUSY bp found in cluster_io"); - } - splx(s); + if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY) + panic("BUSY bp found in cluster_io"); } } if (flags & CL_ASYNC) { - cbp->b_flags |= (B_CALL | B_ASYNC); - cbp->b_iodone = (void *)cluster_iodone; + if (buf_setcallback(cbp, (void *)cluster_iodone, NULL)) + panic("buf_setcallback failed\n"); } cbp->b_flags |= io_flags; cbp->b_lblkno = lblkno; cbp->b_blkno = blkno; cbp->b_bcount = io_size; - cbp->b_pagelist = upl; - cbp->b_uploffset = upl_offset; - cbp->b_trans_next = (struct buf *)0; - if (cbp->b_iostate = (void *)iostate) + if (buf_setupl(cbp, upl, upl_offset)) + panic("buf_setupl failed\n"); + + cbp->b_trans_next = (buf_t)NULL; + + if ((cbp->b_iostate = (void *)iostate)) /* * caller wants to track the state of this * io... bump the amount issued against this stream */ iostate->io_issued += io_size; - if (flags & CL_READ) + if (flags & CL_READ) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE, - cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0); - else + (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0); + } + else { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE, - cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0); + (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0); + } if (cbp_head) { cbp_tail->b_trans_next = cbp; @@ -664,14 +905,30 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, cbp_head = cbp; cbp_tail = cbp; } - (struct buf *)(cbp->b_trans_head) = cbp_head; - buf_count++; + (buf_t)(cbp->b_trans_head) = cbp_head; + trans_count++; upl_offset += io_size; f_offset += io_size; size -= io_size; + /* + * keep track of how much of the original request + * that we've actually completed... non_rounded_size + * may go negative due to us rounding the request + * to a page size multiple (i.e. size > non_rounded_size) + */ + non_rounded_size -= io_size; - if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) { + if (non_rounded_size <= 0) { + /* + * we've transferred all of the data in the original + * request, but we were unable to complete the tail + * of the last page because the file didn't have + * an allocation to back that portion... this is ok. 
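/*
 * The non_rounded_size bookkeeping repeated above is worth a small
 * illustration: 'size' was rounded up to page granularity, so it can
 * remain positive after every caller-requested byte has been moved;
 * the original counter going non-positive is the real termination
 * test.  Values below are illustrative.
 */
#include <stdio.h>

int main(void)
{
        int size             = 8192;    /* page-rounded request (2 pages) */
        int non_rounded_size = 6000;    /* caller's actual request */
        int io_size          = 8192;    /* this pass moved both pages */

        size             -= io_size;    /* 0 */
        non_rounded_size -= io_size;    /* -2192: legitimately negative */

        if (non_rounded_size <= 0)
                size = 0;               /* original request satisfied... stop */

        printf("size=%d non_rounded_size=%d\n", size, non_rounded_size);
        return (0);
}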
+ */ + size = 0; + } + if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || trans_count > 8)) || size == 0) { /* * if we have no more I/O to issue or * the current I/O we've prepared fully @@ -687,7 +944,7 @@ start_io: cbp_head->b_flags |= B_NEED_IODONE; cbp_head->b_real_bp = real_bp; } else - cbp_head->b_real_bp = (struct buf *)NULL; + cbp_head->b_real_bp = (buf_t)NULL; if (size == 0) { /* @@ -700,39 +957,40 @@ start_io: } else cbp_head->b_validend = 0; - if (flags & CL_THROTTLE) { - while (vp->v_numoutput >= async_throttle) { - vp->v_flag |= VTHROTTLED; - tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_io", 0); - } - } + if (flags & CL_THROTTLE) + (void)vnode_waitforwrites(vp, async_throttle, 0, 0, (char *)"cluster_io"); + for (cbp = cbp_head; cbp;) { - struct buf * cbp_next; + buf_t cbp_next; - if (io_flags & B_WRITEINPROG) - cbp->b_vp->v_numoutput++; + if ( !(io_flags & B_READ)) + vnode_startwrite(vp); cbp_next = cbp->b_trans_next; - (void) VOP_STRATEGY(cbp); + (void) VNOP_STRATEGY(cbp); cbp = cbp_next; } if ( !(flags & CL_ASYNC)) { + int dummy; + for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) - biowait(cbp); - - if (error = cluster_iodone(cbp_head)) { - if ((flags & CL_PAGEOUT) && (error == ENXIO)) - retval = 0; /* drop the error */ - else - retval = error; - error = 0; + buf_biowait(cbp); + + if ((error = cluster_iodone(cbp_head, (void *)&dummy))) { + if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) == CL_PAGEOUT) && (error == ENXIO)) + error = 0; /* drop the error */ + else { + if (retval == 0) + retval = error; + error = 0; + } } } - cbp_head = (struct buf *)0; - cbp_tail = (struct buf *)0; + cbp_head = (buf_t)NULL; + cbp_tail = (buf_t)NULL; - buf_count = 0; + trans_count = 0; } } if (error) { @@ -741,7 +999,7 @@ start_io: io_size = 0; for (cbp = cbp_head; cbp;) { - struct buf * cbp_next; + buf_t cbp_next; upl_offset -= cbp->b_bcount; size += cbp->b_bcount; @@ -752,11 +1010,15 @@ start_io: cbp = cbp_next; } if (iostate) { + int need_wakeup = 0; + /* * update the error condition for this stream * since we never really issued the io * just go ahead and adjust it back */ + lck_mtx_lock(cl_mtxp); + if (iostate->io_error == 0) iostate->io_error = error; iostate->io_issued -= io_size; @@ -767,8 +1029,12 @@ start_io: * this io stream to change */ iostate->io_wanted = 0; - wakeup((caddr_t)&iostate->io_wanted); + need_wakeup = 1; } + lck_mtx_unlock(cl_mtxp); + + if (need_wakeup) + wakeup((caddr_t)&iostate->io_wanted); } pg_offset = upl_offset & PAGE_MASK; abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK; @@ -797,7 +1063,7 @@ start_io: real_bp->b_flags |= B_ERROR; real_bp->b_error = error; - biodone(real_bp); + buf_biodone(real_bp); } if (retval == 0) retval = error; @@ -810,12 +1076,7 @@ start_io: static int -cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize) - struct vnode *vp; - off_t f_offset; - u_int size; - off_t filesize; - int devblocksize; +cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize) { int pages_in_prefetch; @@ -836,7 +1097,7 @@ cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize) size = filesize - f_offset; pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE; - advisory_read(vp, filesize, f_offset, size, devblocksize); + advisory_read(vp, filesize, f_offset, size); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END, (int)f_offset + size, pages_in_prefetch, 0, 1, 0); @@ -847,45 +1108,41 @@ cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize) static void
-cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize) - struct vnode *vp; - daddr_t b_lblkno; - daddr_t e_lblkno; - off_t filesize; - int devblocksize; +cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap) { - daddr_t r_lblkno; - off_t f_offset; - int size_of_prefetch; + daddr64_t r_addr; + off_t f_offset; + int size_of_prefetch; + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START, - b_lblkno, e_lblkno, vp->v_lastr, 0, 0); + (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0); - if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) { + if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, - vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0); + rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0); return; } - if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) && - (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) { - vp->v_ralen = 0; - vp->v_maxra = 0; + if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1) && + (extent->b_addr != (rap->cl_maxra + 1) || rap->cl_ralen == 0))) { + rap->cl_ralen = 0; + rap->cl_maxra = 0; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, - vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0); + rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0); return; } - if (e_lblkno < vp->v_maxra) { - if ((vp->v_maxra - e_lblkno) > (MAX_UPL_TRANSFER / 4)) { + if (extent->e_addr < rap->cl_maxra) { + if ((rap->cl_maxra - extent->e_addr) > (MAX_UPL_TRANSFER / 4)) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, - vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0); + rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0); return; } } - r_lblkno = max(e_lblkno, vp->v_maxra) + 1; - f_offset = (off_t)r_lblkno * PAGE_SIZE_64; + r_addr = max(extent->e_addr, rap->cl_maxra) + 1; + f_offset = (off_t)(r_addr * PAGE_SIZE_64); size_of_prefetch = 0; @@ -893,39 +1150,40 @@ cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize) if (size_of_prefetch) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, - vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0); + rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0); return; } if (f_offset < filesize) { - vp->v_ralen = vp->v_ralen ? min(MAX_UPL_TRANSFER, vp->v_ralen << 1) : 1; + daddr64_t read_size; - if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen) - vp->v_ralen = min(MAX_UPL_TRANSFER, (e_lblkno + 1) - b_lblkno); + rap->cl_ralen = rap->cl_ralen ? 
min(MAX_UPL_TRANSFER, rap->cl_ralen << 1) : 1; - size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize); + read_size = (extent->e_addr + 1) - extent->b_addr; + + if (read_size > rap->cl_ralen) { + if (read_size > MAX_UPL_TRANSFER) + rap->cl_ralen = MAX_UPL_TRANSFER; + else + rap->cl_ralen = read_size; + } + size_of_prefetch = cluster_rd_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize); if (size_of_prefetch) - vp->v_maxra = (r_lblkno + size_of_prefetch) - 1; + rap->cl_maxra = (r_addr + size_of_prefetch) - 1; } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, - vp->v_ralen, vp->v_maxra, vp->v_lastr, 4, 0); + rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0); } int -cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags) - struct vnode *vp; - upl_t upl; - vm_offset_t upl_offset; - off_t f_offset; - int size; - off_t filesize; - int devblocksize; - int flags; +cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, + int size, off_t filesize, int flags) { int io_size; int rounded_size; off_t max_size; int local_flags; + struct cl_writebehind *wbp; if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) /* @@ -944,6 +1202,8 @@ cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, fla local_flags |= CL_ASYNC; if ((flags & UPL_NOCOMMIT) == 0) local_flags |= CL_COMMIT; + if ((flags & UPL_KEEPCACHED)) + local_flags |= CL_KEEPCACHED; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE, @@ -988,22 +1248,16 @@ cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, fla ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size, UPL_ABORT_FREE_ON_EMPTY); } - vp->v_flag |= VHASBEENPAGED; + if ((wbp = cluster_get_wbp(vp, 0)) != NULL) + wbp->cl_hasbeenpaged = 1; - return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize, - local_flags, (struct buf *)0, (struct clios *)0)); + return (cluster_io(vp, upl, upl_offset, f_offset, io_size, + local_flags, (buf_t)NULL, (struct clios *)NULL)); } int -cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags) - struct vnode *vp; - upl_t upl; - vm_offset_t upl_offset; - off_t f_offset; - int size; - off_t filesize; - int devblocksize; - int flags; +cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, + int size, off_t filesize, int flags) { u_int io_size; int rounded_size; @@ -1048,42 +1302,45 @@ cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flag ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); - retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize, - local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0); + retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, + local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL); - if (retval == 0) { - int b_lblkno; - int e_lblkno; + if (retval == 0 && !(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) { + struct cl_readahead *rap; - b_lblkno = (int)(f_offset / PAGE_SIZE_64); - e_lblkno = (int) - ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64); + rap = cluster_get_rap(vp); - if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) { - /* - * we haven't read the last page in of the file yet - * so let's try to read ahead if we're in - * a sequential access pattern - */ - cluster_rd_ahead(vp, b_lblkno, 
e_lblkno, filesize, devblocksize); + if (rap != NULL) { + struct cl_extent extent; + + extent.b_addr = (daddr64_t)(f_offset / PAGE_SIZE_64); + extent.e_addr = (daddr64_t)((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64); + + if (rounded_size == PAGE_SIZE) { + /* + * we haven't read in the last page of the file yet + * so let's try to read ahead if we're in + * a sequential access pattern + */ + cluster_rd_ahead(vp, &extent, filesize, rap); + } + rap->cl_lastr = extent.e_addr; + + lck_mtx_unlock(&rap->cl_lockr); } - vp->v_lastr = e_lblkno; } return (retval); } int -cluster_bp(bp) - struct buf *bp; +cluster_bp(buf_t bp) { off_t f_offset; int flags; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START, - (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); + (int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); - if (bp->b_pagelist == (upl_t) 0) - panic("cluster_bp: can't handle NULL upl yet\n"); if (bp->b_flags & B_READ) flags = CL_ASYNC | CL_READ; else @@ -1091,207 +1348,196 @@ cluster_bp(bp) f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno); - return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0)); + return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL)); } int -cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) - struct vnode *vp; - struct uio *uio; - off_t oldEOF; - off_t newEOF; - off_t headOff; - off_t tailOff; - int devblocksize; - int flags; +cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags) { int prev_resid; - int clip_size; + u_int clip_size; off_t max_io_size; - struct iovec *iov; int upl_size; int upl_flags; upl_t upl; int retval = 0; + int flags; + flags = xflags; + + if (vp->v_flag & VNOCACHE_DATA) + flags |= IO_NOCACHE; + + if ( (!(flags & IO_NOCACHE)) || (!uio) || (!UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) { + /* + * go do a write through the cache if one of the following is true.... + * NOCACHE is not true + * there is no uio structure or it doesn't target USERSPACE + */ + return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags)); + } + +#if LP64_DEBUG + if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) { + panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__); + } +#endif /* LP64_DEBUG */ - if (vp->v_flag & VHASBEENPAGED) - { - /* - * this vnode had pages cleaned to it by - * the pager which indicates that either - * it's not very 'hot', or the system is - * being overwhelmed by a lot of dirty - * data being delayed in the VM cache... - * in either event, we'll push our remaining - * delayed data at this point... this will - * be more efficient than paging out 1 page at - * a time, and will also act as a throttle - * by delaying this client from writing any - * more data until all his delayed data has - * at least been queued to the uderlying driver. - */ - cluster_push(vp); - - vp->v_flag &= ~VHASBEENPAGED; - } - - if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE)) - { - /* - * go do a write through the cache if one of the following is true....
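/*
 * Condensed sketch of the dispatch cluster_write performs above:
 * writes go through the cache unless IO_NOCACHE is in effect and the
 * uio targets user space, in which case each vector is steered to the
 * physically-contiguous or direct (nocopy) path.  The enum and helper
 * below are illustrative stand-ins, not kernel interfaces.
 */
enum write_path { WRITE_THROUGH_CACHE, WRITE_PHYS, WRITE_DIRECT };

static enum write_path
write_dispatch_sketch(int io_nocache, int uio_is_user_space, int vector_is_phys_contig)
{
        if (!io_nocache || !uio_is_user_space)
                return (WRITE_THROUGH_CACHE);   /* cluster_write_x() */
        if (vector_is_phys_contig)
                return (WRITE_PHYS);            /* cluster_phys_write() */
        return (WRITE_DIRECT);                  /* cluster_nocopy_write() */
}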
- * NOCACHE is not true - * there is no uio structure or it doesn't target USERSPACE */ - return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)); - } - - while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0) - { - /* - * we know we have a resid, so this is safe - * skip over any emtpy vectors - */ - iov = uio->uio_iov; - - while (iov->iov_len == 0) { - uio->uio_iov++; - uio->uio_iovcnt--; - iov = uio->uio_iov; - } - upl_size = PAGE_SIZE; - upl_flags = UPL_QUERY_OBJECT_TYPE; - - if ((vm_map_get_upl(current_map(), - (vm_offset_t)iov->iov_base & ~PAGE_MASK, - &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) - { + while (uio_resid(uio) && uio->uio_offset < newEOF && retval == 0) { + u_int64_t iov_len; + u_int64_t iov_base; + /* - * the user app must have passed in an invalid address + * we know we have a resid, so this is safe + * skip over any empty vectors */ - return (EFAULT); - } - - /* - * We check every vector target but if it is physically - * contiguous space, we skip the sanity checks. - */ - if (upl_flags & UPL_PHYS_CONTIG) - { - if (flags & IO_HEADZEROFILL) - { - flags &= ~IO_HEADZEROFILL; - - if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL)) - return(retval); - } + iov_len = uio_iov_len(uio); - retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags); + while (iov_len == 0) { + uio_next_iov(uio); + uio->uio_iovcnt--; + iov_len = uio_iov_len(uio); + } + iov_base = uio_iov_base(uio); + + upl_size = PAGE_SIZE; + upl_flags = UPL_QUERY_OBJECT_TYPE; + + // LP64todo - fix this! + if ((vm_map_get_upl(current_map(), + CAST_DOWN(vm_offset_t, iov_base) & ~PAGE_MASK, + &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) { + /* + * the user app must have passed in an invalid address + */ + return (EFAULT); + } - if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL)) - { - return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL)); - } - } - else if ((uio->uio_resid < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL))) - { /* - * we're here because we're don't have a physically contiguous target buffer - * go do a write through the cache if one of the following is true.... - * the total xfer size is less than a page... - * we're being asked to ZEROFILL either the head or the tail of the I/O... + * We check every vector target but if it is physically + * contiguous space, we skip the sanity checks. */ - return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)); - } - else if (((int)uio->uio_offset & PAGE_MASK) || ((int)iov->iov_base & PAGE_MASK)) - { - if (((int)uio->uio_offset & PAGE_MASK) == ((int)iov->iov_base & PAGE_MASK)) - { - /* - * Bring the file offset write up to a pagesize boundary - * this will also bring the base address to a page boundary - * since they both are currently on the same offset within a page - * note: if we get here, uio->uio_resid is greater than PAGE_SIZE - * so the computed clip_size must always be less than the current uio_resid - */ - clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64)); - - /* - * Fake the resid going into the cluster_write_x call - * and restore it on the way out.
- */ - prev_resid = uio->uio_resid; - uio->uio_resid = clip_size; - retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags); - uio->uio_resid = prev_resid - (clip_size - uio->uio_resid); - } - else - { - /* - * can't get both the file offset and the buffer offset aligned to a page boundary - * so fire an I/O through the cache for this entire vector - */ - clip_size = iov->iov_len; - prev_resid = uio->uio_resid; - uio->uio_resid = clip_size; - retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags); - uio->uio_resid = prev_resid - (clip_size - uio->uio_resid); - } - } - else - { - /* - * If we come in here, we know the offset into - * the file is on a pagesize boundary and the - * target buffer address is also on a page boundary - */ - max_io_size = newEOF - uio->uio_offset; - clip_size = uio->uio_resid; - if (iov->iov_len < clip_size) - clip_size = iov->iov_len; - if (max_io_size < clip_size) - clip_size = max_io_size; - - if (clip_size < PAGE_SIZE) - { - /* - * Take care of tail end of write in this vector - */ - prev_resid = uio->uio_resid; - uio->uio_resid = clip_size; - retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags); - uio->uio_resid = prev_resid - (clip_size - uio->uio_resid); - } - else - { - /* round clip_size down to a multiple of pagesize */ - clip_size = clip_size & ~(PAGE_MASK); - prev_resid = uio->uio_resid; - uio->uio_resid = clip_size; - retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags); - if ((retval == 0) && uio->uio_resid) - retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags); - uio->uio_resid = prev_resid - (clip_size - uio->uio_resid); - } - } /* end else */ - } /* end while */ + if (upl_flags & UPL_PHYS_CONTIG) { + int zflags; + + zflags = flags & ~IO_TAILZEROFILL; + zflags |= IO_HEADZEROFILL; + + if (flags & IO_HEADZEROFILL) { + /* + * in case we have additional vectors, we don't want to do this again + */ + flags &= ~IO_HEADZEROFILL; + + if ((retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, zflags))) + return(retval); + } + retval = cluster_phys_write(vp, uio, newEOF); + + if (uio_resid(uio) == 0 && (flags & IO_TAILZEROFILL)) { + return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, zflags)); + } + } + else if ((uio_resid(uio) < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL))) { + /* + * we're here because we don't have a physically contiguous target buffer + * go do a write through the cache if one of the following is true.... + * the total xfer size is less than a page... + * we're being asked to ZEROFILL either the head or the tail of the I/O... + */ + return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags)); + } + // LP64todo - fix this! + else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) { + if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) { + /* + * Bring the file offset write up to a pagesize boundary + * this will also bring the base address to a page boundary + * since they both are currently on the same offset within a page + * note: if we get here, uio->uio_resid is greater than PAGE_SIZE + * so the computed clip_size must always be less than the current uio_resid + */ + clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64)); + + /* + * Fake the resid going into the cluster_write_x call + * and restore it on the way out.
+ */ + // LP64todo - fix this + prev_resid = uio_resid(uio); + uio_setresid(uio, clip_size); + + retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags); + + uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio))); + } else { + /* + * can't get both the file offset and the buffer offset aligned to a page boundary + * so fire an I/O through the cache for this entire vector + */ + // LP64todo - fix this + clip_size = iov_len; + // LP64todo - fix this + prev_resid = uio_resid(uio); + uio_setresid(uio, clip_size); + + retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags); + + uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio))); + } + } else { + /* + * If we come in here, we know the offset into + * the file is on a pagesize boundary and the + * target buffer address is also on a page boundary + */ + max_io_size = newEOF - uio->uio_offset; + // LP64todo - fix this + clip_size = uio_resid(uio); + if (iov_len < clip_size) + // LP64todo - fix this! + clip_size = iov_len; + if (max_io_size < clip_size) + clip_size = max_io_size; + + if (clip_size < PAGE_SIZE) { + /* + * Take care of tail end of write in this vector + */ + // LP64todo - fix this + prev_resid = uio_resid(uio); + uio_setresid(uio, clip_size); + + retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags); + + uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio))); + } else { + /* round clip_size down to a multiple of pagesize */ + clip_size = clip_size & ~(PAGE_MASK); + // LP64todo - fix this + prev_resid = uio_resid(uio); + uio_setresid(uio, clip_size); + + retval = cluster_nocopy_write(vp, uio, newEOF); + + if ((retval == 0) && uio_resid(uio)) + retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags); + + uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio))); + } + } /* end else */ + } /* end while */ + return(retval); } static int -cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags) - struct vnode *vp; - struct uio *uio; - off_t newEOF; - int devblocksize; - int flags; +cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF) { upl_t upl; upl_page_info_t *pl; - off_t upl_f_offset; vm_offset_t upl_offset; - off_t max_io_size; int io_size; int io_flag; int upl_size; @@ -1299,15 +1545,16 @@ cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags) int pages_in_pl; int upl_flags; kern_return_t kret; - struct iovec *iov; int i; int force_data_sync; int error = 0; struct clios iostate; + struct cl_writebehind *wbp; + struct iovec *iov; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START, - (int)uio->uio_offset, (int)uio->uio_resid, - (int)newEOF, devblocksize, 0); + (int)uio->uio_offset, (int)uio_resid(uio), + (int)newEOF, 0, 0); /* * When we enter this routine, we know @@ -1315,8 +1562,13 @@ cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags) * -- the resid is a page multiple * -- the resid will not exceed iov_len */ - cluster_try_push(vp, newEOF, 0, 1); + + if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) { + cluster_try_push(wbp, vp, newEOF, 0, 1); + + lck_mtx_unlock(&wbp->cl_lockw); + } iostate.io_completed = 0; iostate.io_issued = 0; iostate.io_error = 0; @@ -1324,13 +1576,15 @@ cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags) iov = uio->uio_iov; - while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) { - io_size = uio->uio_resid; + while (uio_resid(uio) && uio->uio_offset < newEOF && error == 0) { + io_size = uio_resid(uio); if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE)) 
io_size = MAX_UPL_TRANSFER * PAGE_SIZE; - upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK; + // LP64todo - fix this! + upl_offset = CAST_DOWN(vm_offset_t, iov->iov_base) & PAGE_MASK; + upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START, @@ -1342,8 +1596,9 @@ cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags) upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; + // LP64todo - fix this! kret = vm_map_get_upl(current_map(), - (vm_offset_t)iov->iov_base & ~PAGE_MASK, + CAST_DOWN(vm_offset_t, iov->iov_base) & ~PAGE_MASK, &upl_size, &upl, NULL, @@ -1427,10 +1682,14 @@ cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags) * if there are already too many outstanding writes * wait until some complete before issuing the next */ + lck_mtx_lock(cl_mtxp); + while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) { iostate.io_wanted = 1; - tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0); + msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0); } + lck_mtx_unlock(cl_mtxp); + if (iostate.io_error) { /* * one of the earlier writes we issued ran into a hard error @@ -1450,15 +1709,15 @@ cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags) (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0); error = cluster_io(vp, upl, upl_offset, uio->uio_offset, - io_size, devblocksize, io_flag, (struct buf *)0, &iostate); + io_size, io_flag, (buf_t)NULL, &iostate); iov->iov_len -= io_size; - iov->iov_base += io_size; - uio->uio_resid -= io_size; + ((u_int32_t)iov->iov_base) += io_size; + uio_setresid(uio, (uio_resid(uio) - io_size)); uio->uio_offset += io_size; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END, - (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0); + (int)upl_offset, (int)uio->uio_offset, (int)uio_resid(uio), error, 0); } /* end while */ @@ -1467,10 +1726,14 @@ wait_for_writes: * make sure all async writes issued as part of this stream * have completed before we return */ + lck_mtx_lock(cl_mtxp); + while (iostate.io_issued != iostate.io_completed) { iostate.io_wanted = 1; - tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0); + msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0); } + lck_mtx_unlock(cl_mtxp); + if (iostate.io_error) error = iostate.io_error; @@ -1482,12 +1745,7 @@ wait_for_writes: static int -cluster_phys_write(vp, uio, newEOF, devblocksize, flags) - struct vnode *vp; - struct uio *uio; - off_t newEOF; - int devblocksize; - int flags; +cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF) { upl_page_info_t *pl; addr64_t src_paddr; @@ -1500,19 +1758,33 @@ cluster_phys_write(vp, uio, newEOF, devblocksize, flags) int pages_in_pl; int upl_flags; kern_return_t kret; - struct iovec *iov; int error = 0; + u_int64_t iov_base; + int devblocksize; + struct cl_writebehind *wbp; + devblocksize = vp->v_mount->mnt_devblocksize; /* * When we enter this routine, we know * -- the resid will not exceed iov_len * -- the vector target address is physcially contiguous */ - cluster_try_push(vp, newEOF, 0, 1); + if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) { - iov = uio->uio_iov; - io_size = iov->iov_len; - upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK; + cluster_try_push(wbp, vp, newEOF, 0, 1); + + lck_mtx_unlock(&wbp->cl_lockw); + } 
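/*
 * Sketch of the io_issued/io_completed throttle used by the nocopy
 * paths above: the issuing thread sleeps while too much I/O is in
 * flight and the completion side wakes it, mirroring the msleep/wakeup
 * pairing on cl_mtxp.  Plain pthreads stand in for the kernel
 * primitives and the window constant is illustrative.
 */
#include <pthread.h>

struct clios_sketch {
        pthread_mutex_t lock;           /* plays the role of cl_mtxp */
        pthread_cond_t  cv;
        int             io_issued;
        int             io_completed;
        int             io_wanted;
};

#define EX_IN_FLIGHT_MAX (2 * 1024 * 1024)  /* ~2 * MAX_UPL_TRANSFER * PAGE_SIZE */

static void
throttle_before_issue(struct clios_sketch *s, int next_io_size)
{
        pthread_mutex_lock(&s->lock);
        while ((s->io_issued - s->io_completed) > EX_IN_FLIGHT_MAX) {
                s->io_wanted = 1;
                pthread_cond_wait(&s->cv, &s->lock);    /* msleep analogue */
        }
        s->io_issued += next_io_size;
        pthread_mutex_unlock(&s->lock);
}

static void
complete_io(struct clios_sketch *s, int io_size)
{
        pthread_mutex_lock(&s->lock);
        s->io_completed += io_size;
        if (s->io_wanted) {
                s->io_wanted = 0;
                pthread_cond_broadcast(&s->cv);         /* wakeup analogue */
        }
        pthread_mutex_unlock(&s->lock);
}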
+#if LP64_DEBUG + if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) { + panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__); + } +#endif /* LP64_DEBUG */ + + // LP64todo - fix this! + io_size = uio_iov_len(uio); + iov_base = uio_iov_base(uio); + upl_offset = CAST_DOWN(upl_offset_t, iov_base) & PAGE_MASK; upl_needed_size = upl_offset + io_size; pages_in_pl = 0; @@ -1520,8 +1792,9 @@ cluster_phys_write(vp, uio, newEOF, devblocksize, flags) upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; + // LP64todo - fix this! kret = vm_map_get_upl(current_map(), - (vm_offset_t)iov->iov_base & ~PAGE_MASK, + CAST_DOWN(upl_offset_t, iov_base) & ~PAGE_MASK, &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0); if (kret != KERN_SUCCESS) { @@ -1536,12 +1809,12 @@ cluster_phys_write(vp, uio, newEOF, devblocksize, flags) * This is a failure in the physical memory case. */ if (upl_size < upl_needed_size) { - kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); + ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); return(EINVAL); } pl = ubc_upl_pageinfo(upl); - src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)((u_int)iov->iov_base & PAGE_MASK)); + src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)(iov_base & PAGE_MASK)); while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { int head_size; @@ -1551,7 +1824,7 @@ cluster_phys_write(vp, uio, newEOF, devblocksize, flags) if (head_size > io_size) head_size = io_size; - error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0); + error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0); if (error) { ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); @@ -1570,21 +1843,21 @@ cluster_phys_write(vp, uio, newEOF, devblocksize, flags) * issue a synchronous write to cluster_io */ error = cluster_io(vp, upl, upl_offset, uio->uio_offset, - io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0); + io_size, CL_DEV_MEMORY, (buf_t)NULL, (struct clios *)NULL); } if (error == 0) { /* * The cluster_io write completed successfully, * update the uio structure */ - uio->uio_resid -= io_size; - iov->iov_len -= io_size; - iov->iov_base += io_size; + uio_setresid(uio, (uio_resid(uio) - io_size)); + uio_iov_len_add(uio, -io_size); + uio_iov_base_add(uio, io_size); uio->uio_offset += io_size; src_paddr += io_size; if (tail_size) - error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0); + error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0); } /* * just release our hold on the physically contiguous @@ -1597,55 +1870,71 @@ cluster_phys_write(vp, uio, newEOF, devblocksize, flags) static int -cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) - struct vnode *vp; - struct uio *uio; - off_t oldEOF; - off_t newEOF; - off_t headOff; - off_t tailOff; - int devblocksize; - int flags; +cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int flags) { upl_page_info_t *pl; upl_t upl; - vm_offset_t upl_offset; + vm_offset_t upl_offset = 0; int upl_size; off_t upl_f_offset; int pages_in_upl; int start_offset; int xfer_resid; int io_size; - int io_flags; int io_offset; int bytes_to_zero; int bytes_to_move; kern_return_t kret; int retval = 0; - int uio_resid; + int io_resid; long long total_size; long long zero_cnt; off_t zero_off; long long zero_cnt1; off_t zero_off1; - daddr_t 
start_blkno; - daddr_t last_blkno; + struct cl_extent cl; int intersection; + struct cl_writebehind *wbp; + if ((wbp = cluster_get_wbp(vp, 0)) != NULL) + { + if (wbp->cl_hasbeenpaged) { + /* + * this vnode had pages cleaned to it by + * the pager which indicates that either + * it's not very 'hot', or the system is + * being overwhelmed by a lot of dirty + * data being delayed in the VM cache... + * in either event, we'll push our remaining + * delayed data at this point... this will + * be more efficient than paging out 1 page at + * a time, and will also act as a throttle + * by delaying this client from writing any + * more data until all his delayed data has + * at least been queued to the underlying driver. + */ + if (wbp->cl_number || wbp->cl_scmap) + cluster_push_EOF(vp, newEOF); + wbp->cl_hasbeenpaged = 0; + } + } if (uio) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START, - (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0); + (int)uio->uio_offset, uio_resid(uio), (int)oldEOF, (int)newEOF, 0); - uio_resid = uio->uio_resid; + // LP64todo - fix this + io_resid = uio_resid(uio); } else { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START, 0, 0, (int)oldEOF, (int)newEOF, 0); - uio_resid = 0; + io_resid = 0; } zero_cnt = 0; zero_cnt1 = 0; + zero_off = 0; + zero_off1 = 0; if (flags & IO_HEADZEROFILL) { /* @@ -1667,26 +1956,27 @@ cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) } if (flags & IO_TAILZEROFILL) { if (uio) { - zero_off1 = uio->uio_offset + uio->uio_resid; + // LP64todo - fix this + zero_off1 = uio->uio_offset + uio_resid(uio); if (zero_off1 < tailOff) zero_cnt1 = tailOff - zero_off1; } } if (zero_cnt == 0 && uio == (struct uio *) 0) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, - retval, 0, 0, 0, 0); - return (0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, + retval, 0, 0, 0, 0); + return (0); } - while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) { + while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) { /* * for this iteration of the loop, figure out where our starting point is */ if (zero_cnt) { start_offset = (int)(zero_off & PAGE_MASK_64); upl_f_offset = zero_off - start_offset; - } else if (uio_resid) { + } else if (io_resid) { start_offset = (int)(uio->uio_offset & PAGE_MASK_64); upl_f_offset = uio->uio_offset - start_offset; } else { @@ -1699,12 +1989,11 @@ cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE)) total_size = MAX_UPL_TRANSFER * PAGE_SIZE; - start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64); + cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64); - if (uio && !(vp->v_flag & VNOCACHE_DATA) && - (flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0) { + if (uio && ((flags & (IO_NOCACHE | IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) { /* - * assumption... total_size <= uio_resid + * assumption...
total_size <= io_resid * because IO_HEADZEROFILL and IO_TAILZEROFILL not set */ if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE)) @@ -1716,7 +2005,7 @@ cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) if (retval) break; - uio_resid -= (total_size - xfer_resid); + io_resid -= (total_size - xfer_resid); total_size = xfer_resid; start_offset = (int)(uio->uio_offset & PAGE_MASK_64); upl_f_offset = uio->uio_offset - start_offset; @@ -1760,12 +2049,17 @@ cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0); + /* + * Gather the pages from the buffer cache. + * The UPL_WILL_MODIFY flag lets the UPL subsystem know + * that we intend to modify these pages. + */ kret = ubc_create_upl(vp, - upl_f_offset, - upl_size, - &upl, - &pl, - UPL_SET_LITE); + upl_f_offset, + upl_size, + &upl, + &pl, + UPL_SET_LITE | UPL_WILL_MODIFY); if (kret != KERN_SUCCESS) panic("cluster_write: failed to get pagelist"); @@ -1785,8 +2079,8 @@ cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) if ((upl_f_offset + read_size) > newEOF) read_size = newEOF - upl_f_offset; - retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize, - CL_READ, (struct buf *)0, (struct clios *)0); + retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, + CL_READ, (buf_t)NULL, (struct clios *)NULL); if (retval) { /* * we had an error during the read which causes us to abort @@ -1795,7 +2089,9 @@ cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) * there state and mark the failed page in error */ ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES); - ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); + + if (upl_size > PAGE_SIZE) + ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, (int)upl, 0, 0, retval, 0); @@ -1819,8 +2115,8 @@ cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) if ((upl_f_offset + upl_offset + read_size) > newEOF) read_size = newEOF - (upl_f_offset + upl_offset); - retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize, - CL_READ, (struct buf *)0, (struct clios *)0); + retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, + CL_READ, (buf_t)NULL, (struct clios *)NULL); if (retval) { /* * we had an error during the read which causes us to abort @@ -1829,7 +2125,9 @@ cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) * modifying there state and mark the failed page in error */ ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES); - ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); + + if (upl_size > PAGE_SIZE) + ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, (int)upl, 0, 0, retval, 0); @@ -1868,8 +2166,8 @@ cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) zero_off += bytes_to_zero; io_offset += bytes_to_zero; } - if (xfer_resid && uio_resid) { - bytes_to_move = min(uio_resid, xfer_resid); + if (xfer_resid && io_resid) { + bytes_to_move = min(io_resid, xfer_resid); retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move); @@ -1880,7 +2178,7 @@ cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) 
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, (int)upl, 0, 0, retval, 0); } else { - uio_resid -= bytes_to_move; + io_resid -= bytes_to_move; xfer_resid -= bytes_to_move; io_offset += bytes_to_move; } @@ -1936,15 +2234,21 @@ cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) */ goto issue_io; check_cluster: + /* + * take the lock to protect our accesses + * of the writebehind and sparse cluster state + */ + wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED); + /* * calculate the last logical block number * that this delayed I/O encompassed */ - last_blkno = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64; + cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64); - if (vp->v_flag & VHASDIRTY) { + if (wbp->cl_scmap) { - if ( !(vp->v_flag & VNOCACHE_DATA)) { + if ( !(flags & IO_NOCACHE)) { /* * we've fallen into the sparse * cluster method of delaying dirty pages @@ -1958,7 +2262,9 @@ check_cluster: ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); - sparse_cluster_add(vp, newEOF, start_blkno, last_blkno); + sparse_cluster_add(wbp, vp, &cl, newEOF); + + lck_mtx_unlock(&wbp->cl_lockw); continue; } @@ -1980,8 +2286,9 @@ check_cluster: */ upl_size = 0; } - sparse_cluster_push(vp, ubc_getsize(vp), 1); + sparse_cluster_push(wbp, vp, newEOF, 1); + wbp->cl_number = 0; /* * no clusters of either type present at this point * so just go directly to start_new_cluster since @@ -1993,13 +2300,13 @@ check_cluster: } upl_offset = 0; - if (vp->v_clen == 0) + if (wbp->cl_number == 0) /* * no clusters currently present */ goto start_new_cluster; - for (cl_index = 0; cl_index < vp->v_clen; cl_index++) { + for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { /* * check each cluster that we currently hold * try to merge some or all of this write into @@ -2007,42 +2314,42 @@ check_cluster: * any portion of the write remains, start a * new cluster */ - if (start_blkno >= vp->v_clusters[cl_index].start_pg) { + if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) { /* * the current write starts at or after the current cluster */ - if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) { + if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) { /* * we have a write that fits entirely * within the existing cluster limits */ - if (last_blkno > vp->v_clusters[cl_index].last_pg) + if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) /* * update our idea of where the cluster ends */ - vp->v_clusters[cl_index].last_pg = last_blkno; + wbp->cl_clusters[cl_index].e_addr = cl.e_addr; break; } - if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) { + if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) { /* * we have a write that starts in the middle of the current cluster * but extends beyond the cluster's limit... we know this because * of the previous checks * we'll extend the current cluster to the max - * and update the start_blkno for the current write to reflect that + * and update the b_addr for the current write to reflect that * the head of it was absorbed into this cluster... 
* note that we'll always have a leftover tail in this case since * full absorbtion would have occurred in the clause above */ - vp->v_clusters[cl_index].last_pg = vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER; + wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER; if (upl_size) { - int start_pg_in_upl; + daddr64_t start_pg_in_upl; - start_pg_in_upl = upl_f_offset / PAGE_SIZE_64; + start_pg_in_upl = (daddr64_t)(upl_f_offset / PAGE_SIZE_64); - if (start_pg_in_upl < vp->v_clusters[cl_index].last_pg) { - intersection = (vp->v_clusters[cl_index].last_pg - start_pg_in_upl) * PAGE_SIZE; + if (start_pg_in_upl < wbp->cl_clusters[cl_index].e_addr) { + intersection = (int)((wbp->cl_clusters[cl_index].e_addr - start_pg_in_upl) * PAGE_SIZE); ubc_upl_commit_range(upl, upl_offset, intersection, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); @@ -2051,7 +2358,7 @@ check_cluster: upl_size -= intersection; } } - start_blkno = vp->v_clusters[cl_index].last_pg; + cl.b_addr = wbp->cl_clusters[cl_index].e_addr; } /* * we come here for the case where the current write starts @@ -2065,16 +2372,16 @@ check_cluster: /* * the current write starts in front of the cluster we're currently considering */ - if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) { + if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= MAX_UPL_TRANSFER) { /* * we can just merge the new request into * this cluster and leave it in the cache * since the resulting cluster is still * less than the maximum allowable size */ - vp->v_clusters[cl_index].start_pg = start_blkno; + wbp->cl_clusters[cl_index].b_addr = cl.b_addr; - if (last_blkno > vp->v_clusters[cl_index].last_pg) { + if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) { /* * the current write completely * envelops the existing cluster and since @@ -2082,7 +2389,7 @@ check_cluster: * we can just use the start and last blocknos of the write * to generate the cluster limits */ - vp->v_clusters[cl_index].last_pg = last_blkno; + wbp->cl_clusters[cl_index].e_addr = cl.e_addr; } break; } @@ -2096,16 +2403,16 @@ check_cluster: * get an intersection with the current write * */ - if (last_blkno > vp->v_clusters[cl_index].last_pg - MAX_UPL_TRANSFER) { + if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER) { /* * the current write extends into the proposed cluster * clip the length of the current write after first combining it's * tail with the newly shaped cluster */ - vp->v_clusters[cl_index].start_pg = vp->v_clusters[cl_index].last_pg - MAX_UPL_TRANSFER; + wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER; if (upl_size) { - intersection = (last_blkno - vp->v_clusters[cl_index].start_pg) * PAGE_SIZE; + intersection = (int)((cl.e_addr - wbp->cl_clusters[cl_index].b_addr) * PAGE_SIZE); if (intersection > upl_size) /* @@ -2119,7 +2426,7 @@ check_cluster: UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); upl_size -= intersection; } - last_blkno = vp->v_clusters[cl_index].start_pg; + cl.e_addr = wbp->cl_clusters[cl_index].b_addr; } /* * if we get here, there was no way to merge @@ -2130,14 +2437,14 @@ check_cluster: */ } } - if (cl_index < vp->v_clen) + if (cl_index < wbp->cl_number) /* * we found an existing cluster(s) that we * could entirely merge this I/O into */ goto delay_io; - if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA)) + if (wbp->cl_number < MAX_CLUSTERS && !(flags & IO_NOCACHE)) /* * we didn't find an 
existing cluster to * merge into, but there's room to start @@ -2151,16 +2458,23 @@ check_cluster: * pushing one of the existing ones... if none of * them are able to be pushed, we'll switch * to the sparse cluster mechanism - * cluster_try_push updates v_clen to the + * cluster_try_push updates cl_number to the * number of remaining clusters... and * returns the number of currently unused clusters */ - if (vp->v_flag & VNOCACHE_DATA) - can_delay = 0; - else - can_delay = 1; + int ret_cluster_try_push = 0; + /* if writes are not deferred, call cluster push immediately */ + if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { + if (flags & IO_NOCACHE) + can_delay = 0; + else + can_delay = 1; + + ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, can_delay, 0); + } - if (cluster_try_push(vp, newEOF, can_delay, 0) == 0) { + /* execute following regardless writes are deferred or not */ + if (ret_cluster_try_push == 0) { /* * no more room in the normal cluster mechanism * so let's switch to the more expansive but expensive @@ -2175,8 +2489,10 @@ check_cluster: ubc_upl_commit_range(upl, upl_offset, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); - sparse_cluster_switch(vp, newEOF); - sparse_cluster_add(vp, newEOF, start_blkno, last_blkno); + sparse_cluster_switch(wbp, vp, newEOF); + sparse_cluster_add(wbp, vp, &cl, newEOF); + + lck_mtx_unlock(&wbp->cl_lockw); continue; } @@ -2189,208 +2505,218 @@ check_cluster: * however, we don't want to push so much out that the write throttle kicks in and * hangs this thread up until some of the I/O completes... */ - while (vp->v_clen && (vp->v_numoutput <= (ASYNC_THROTTLE / 2))) - cluster_try_push(vp, newEOF, 0, 0); + if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { + while (wbp->cl_number && (vp->v_numoutput <= (VNODE_ASYNC_THROTTLE / 2))) + cluster_try_push(wbp, vp, newEOF, 0, 0); + } start_new_cluster: - if (vp->v_clen == 0) - vp->v_ciosiz = devblocksize; - - vp->v_clusters[vp->v_clen].start_pg = start_blkno; - vp->v_clusters[vp->v_clen].last_pg = last_blkno; - vp->v_clen++; + wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr; + wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr; + if (flags & IO_NOCACHE) + wbp->cl_clusters[wbp->cl_number].io_nocache = 1; + else + wbp->cl_clusters[wbp->cl_number].io_nocache = 0; + wbp->cl_number++; delay_io: if (upl_size) ubc_upl_commit_range(upl, upl_offset, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); + + lck_mtx_unlock(&wbp->cl_lockw); + continue; issue_io: /* + * we don't hold the vnode lock at this point + * + * because we had to ask for a UPL that provides currenty non-present pages, the + * UPL has been automatically set to clear the dirty flags (both software and hardware) + * upon committing it... this is not the behavior we want since it's possible for + * pages currently present as part of a mapped file to be dirtied while the I/O is in flight. * in order to maintain some semblance of coherency with mapped writes - * we need to write the cluster back out as a multiple of the PAGESIZE - * unless the cluster encompasses the last page of the file... in this - * case we'll round out to the nearest device block boundary + * we need to drop the current upl and pick it back up with COPYOUT_FROM set + * so that we correctly deal with a change in state of the hardware modify bit... + * we do this via cluster_push_x... 
by passing along the IO_SYNC flag, we force + * cluster_push_x to wait until all the I/Os have completed... cluster_push_x is also + * responsible for generating the correct sized I/O(s) */ - io_size = upl_size; - - if ((upl_f_offset + io_size) > newEOF) { - io_size = newEOF - upl_f_offset; - io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1); - } - - if (flags & IO_SYNC) - io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE; - else - io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | CL_ASYNC; + ubc_upl_commit_range(upl, 0, upl_size, + UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); - if (vp->v_flag & VNOCACHE_DATA) - io_flags |= CL_DUMP; + cl.e_addr = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64; - retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize, - io_flags, (struct buf *)0, (struct clios *)0); + retval = cluster_push_x(vp, &cl, newEOF, flags); } } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, - retval, 0, uio_resid, 0, 0); + retval, 0, io_resid, 0, 0); return (retval); } int -cluster_read(vp, uio, filesize, devblocksize, flags) - struct vnode *vp; - struct uio *uio; - off_t filesize; - int devblocksize; - int flags; +cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags) { int prev_resid; - int clip_size; + u_int clip_size; off_t max_io_size; - struct iovec *iov; int upl_size; int upl_flags; upl_t upl; int retval = 0; + int flags; + flags = xflags; - if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE))) - { - /* - * go do a read through the cache if one of the following is true.... - * NOCACHE is not true - * the uio request doesn't target USERSPACE - */ - return (cluster_read_x(vp, uio, filesize, devblocksize, flags)); - } - - while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) - { - /* - * we know we have a resid, so this is safe - * skip over any emtpy vectors - */ - iov = uio->uio_iov; - - while (iov->iov_len == 0) { - uio->uio_iov++; - uio->uio_iovcnt--; - iov = uio->uio_iov; - } - upl_size = PAGE_SIZE; - upl_flags = UPL_QUERY_OBJECT_TYPE; - - if ((vm_map_get_upl(current_map(), - (vm_offset_t)iov->iov_base & ~PAGE_MASK, - &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) - { - /* - * the user app must have passed in an invalid address + if (vp->v_flag & VNOCACHE_DATA) + flags |= IO_NOCACHE; + if (vp->v_flag & VRAOFF) + flags |= IO_RAOFF; + + if (!((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) { + /* + * go do a read through the cache if one of the following is true.... + * NOCACHE is not true + * the uio request doesn't target USERSPACE */ - return (EFAULT); - } - - /* - * We check every vector target but if it is physically - * contiguous space, we skip the sanity checks. - */ - if (upl_flags & UPL_PHYS_CONTIG) - { - retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags); - } - else if (uio->uio_resid < PAGE_SIZE) - { + return (cluster_read_x(vp, uio, filesize, flags)); + } + +#if LP64_DEBUG + if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) { + panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__); + } +#endif /* LP64_DEBUG */ + + while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) { + u_int64_t iov_len; + u_int64_t iov_base; + /* - * we're here because we're don't have a physically contiguous target buffer - * go do a read through the cache if - * the total xfer size is less than a page... 
+ * we know we have a resid, so this is safe + * skip over any emtpy vectors */ - return (cluster_read_x(vp, uio, filesize, devblocksize, flags)); - } - else if (((int)uio->uio_offset & PAGE_MASK) || ((int)iov->iov_base & PAGE_MASK)) - { - if (((int)uio->uio_offset & PAGE_MASK) == ((int)iov->iov_base & PAGE_MASK)) - { - /* - * Bring the file offset read up to a pagesize boundary - * this will also bring the base address to a page boundary - * since they both are currently on the same offset within a page - * note: if we get here, uio->uio_resid is greater than PAGE_SIZE - * so the computed clip_size must always be less than the current uio_resid - */ - clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64)); - - /* - * Fake the resid going into the cluster_read_x call - * and restore it on the way out. - */ - prev_resid = uio->uio_resid; - uio->uio_resid = clip_size; - retval = cluster_read_x(vp, uio, filesize, devblocksize, flags); - uio->uio_resid = prev_resid - (clip_size - uio->uio_resid); - } - else - { - /* - * can't get both the file offset and the buffer offset aligned to a page boundary - * so fire an I/O through the cache for this entire vector - */ - clip_size = iov->iov_len; - prev_resid = uio->uio_resid; - uio->uio_resid = clip_size; - retval = cluster_read_x(vp, uio, filesize, devblocksize, flags); - uio->uio_resid = prev_resid - (clip_size - uio->uio_resid); - } - } - else - { - /* - * If we come in here, we know the offset into - * the file is on a pagesize boundary + iov_len = uio_iov_len(uio); + + while (iov_len == 0) { + uio_next_iov(uio); + uio->uio_iovcnt--; + iov_len = uio_iov_len(uio); + } + iov_base = uio_iov_base(uio); + upl_size = PAGE_SIZE; + upl_flags = UPL_QUERY_OBJECT_TYPE; + + // LP64todo - fix this! + if ((vm_map_get_upl(current_map(), + CAST_DOWN(vm_offset_t, iov_base) & ~PAGE_MASK, + &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) { + /* + * the user app must have passed in an invalid address + */ + return (EFAULT); + } + + /* + * We check every vector target but if it is physically + * contiguous space, we skip the sanity checks. */ + if (upl_flags & UPL_PHYS_CONTIG) { + retval = cluster_phys_read(vp, uio, filesize); + } + else if (uio_resid(uio) < PAGE_SIZE) { + /* + * we're here because we're don't have a physically contiguous target buffer + * go do a read through the cache if + * the total xfer size is less than a page... + */ + return (cluster_read_x(vp, uio, filesize, flags)); + } + // LP64todo - fix this! + else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) { + if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) { + /* + * Bring the file offset read up to a pagesize boundary + * this will also bring the base address to a page boundary + * since they both are currently on the same offset within a page + * note: if we get here, uio->uio_resid is greater than PAGE_SIZE + * so the computed clip_size must always be less than the current uio_resid + */ + clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64)); + + /* + * Fake the resid going into the cluster_read_x call + * and restore it on the way out. 
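The triage in this hunk decides, per user iovec, between the cached and the direct read paths: offsets that are co-aligned within a page can be clipped so the file offset and the buffer reach a page boundary together, while mismatched alignments send the whole vector through the cache. A small model of the classification (illustrative only, not the kernel routine):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096
    #define PAGE_MASK (PAGE_SIZE - 1)

    /* classify one iovec the way cluster_read's slow path does */
    static void triage(int64_t file_off, uintptr_t buf)
    {
        int foff = (int)(file_off & PAGE_MASK);   /* file offset within its page */
        int boff = (int)(buf & PAGE_MASK);        /* buffer offset within its page */

        if (foff == 0 && boff == 0)
            printf("aligned: eligible for the nocopy path\n");
        else if (foff == boff)
            printf("co-aligned: clip %d bytes through the cache, then retry\n",
                   PAGE_SIZE - foff);
        else
            printf("mismatched: send this whole vector through the cache\n");
    }

    int main(void)
    {
        triage(8192, 0x20000);   /* both on page boundaries            */
        triage(8292, 0x20064);   /* both 100 bytes into their pages    */
        triage(8292, 0x20010);   /* incompatible intra-page offsets    */
        return 0;
    }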
+ */ + prev_resid = uio_resid(uio); + // LP64todo - fix this + uio_setresid(uio, clip_size); + + retval = cluster_read_x(vp, uio, filesize, flags); + + uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio))); + } else { + /* + * can't get both the file offset and the buffer offset aligned to a page boundary + * so fire an I/O through the cache for this entire vector + */ + // LP64todo - fix this! + clip_size = iov_len; + prev_resid = uio_resid(uio); + uio_setresid(uio, clip_size); + + retval = cluster_read_x(vp, uio, filesize, flags); + + uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio))); + } + } else { + /* + * If we come in here, we know the offset into + * the file is on a pagesize boundary + */ + max_io_size = filesize - uio->uio_offset; + // LP64todo - fix this + clip_size = uio_resid(uio); + if (iov_len < clip_size) + clip_size = iov_len; + if (max_io_size < clip_size) + clip_size = (int)max_io_size; + + if (clip_size < PAGE_SIZE) { + /* + * Take care of the tail end of the read in this vector. + */ + // LP64todo - fix this + prev_resid = uio_resid(uio); + uio_setresid(uio, clip_size); - max_io_size = filesize - uio->uio_offset; - clip_size = uio->uio_resid; - if (iov->iov_len < clip_size) - clip_size = iov->iov_len; - if (max_io_size < clip_size) - clip_size = (int)max_io_size; - - if (clip_size < PAGE_SIZE) - { - /* - * Take care of the tail end of the read in this vector. - */ - prev_resid = uio->uio_resid; - uio->uio_resid = clip_size; - retval = cluster_read_x(vp, uio, filesize, devblocksize, flags); - uio->uio_resid = prev_resid - (clip_size - uio->uio_resid); - } - else - { - /* round clip_size down to a multiple of pagesize */ - clip_size = clip_size & ~(PAGE_MASK); - prev_resid = uio->uio_resid; - uio->uio_resid = clip_size; - retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags); - if ((retval==0) && uio->uio_resid) - retval = cluster_read_x(vp, uio, filesize, devblocksize, flags); - uio->uio_resid = prev_resid - (clip_size - uio->uio_resid); - } - } /* end else */ - } /* end while */ + retval = cluster_read_x(vp, uio, filesize, flags); + + uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio))); + } else { + /* round clip_size down to a multiple of pagesize */ + clip_size = clip_size & ~(PAGE_MASK); + // LP64todo - fix this + prev_resid = uio_resid(uio); + uio_setresid(uio, clip_size); + + retval = cluster_nocopy_read(vp, uio, filesize); + + if ((retval==0) && uio_resid(uio)) + retval = cluster_read_x(vp, uio, filesize, flags); + + uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio))); + } + } /* end else */ + } /* end while */ return(retval); } static int -cluster_read_x(vp, uio, filesize, devblocksize, flags) - struct vnode *vp; - struct uio *uio; - off_t filesize; - int devblocksize; - int flags; +cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags) { upl_page_info_t *pl; upl_t upl; @@ -2400,44 +2726,49 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) int start_offset; int start_pg; int last_pg; - int uio_last; + int uio_last = 0; int pages_in_upl; off_t max_size; off_t last_ioread_offset; off_t last_request_offset; u_int size_of_prefetch; - int io_size; + u_int io_size; kern_return_t kret; int error = 0; int retval = 0; - u_int b_lblkno; - u_int e_lblkno; - struct clios iostate; u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE; u_int rd_ahead_enabled = 1; u_int prefetch_enabled = 1; - + struct cl_readahead * rap; + struct clios iostate; + struct cl_extent extent; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 
32)) | DBG_FUNC_START, - (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0); + (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0); + + // LP64todo - fix this + last_request_offset = uio->uio_offset + uio_resid(uio); - if (cluster_hard_throttle_on(vp)) { + if ((flags & (IO_RAOFF|IO_NOCACHE)) || + ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) { rd_ahead_enabled = 0; - prefetch_enabled = 0; + rap = NULL; + } else { + if (cluster_hard_throttle_on(vp)) { + rd_ahead_enabled = 0; + prefetch_enabled = 0; - max_rd_size = HARD_THROTTLE_MAXSIZE; + max_rd_size = HARD_THROTTLE_MAXSIZE; + } + if ((rap = cluster_get_rap(vp)) == NULL) + rd_ahead_enabled = 0; } - if (vp->v_flag & (VRAOFF|VNOCACHE_DATA)) - rd_ahead_enabled = 0; - - last_request_offset = uio->uio_offset + uio->uio_resid; - if (last_request_offset > filesize) last_request_offset = filesize; - b_lblkno = (u_int)(uio->uio_offset / PAGE_SIZE_64); - e_lblkno = (u_int)((last_request_offset - 1) / PAGE_SIZE_64); + extent.b_addr = uio->uio_offset / PAGE_SIZE_64; + extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64; - if (vp->v_ralen && (vp->v_lastr == b_lblkno || (vp->v_lastr + 1) == b_lblkno)) { + if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) { /* * determine if we already have a read-ahead in the pipe courtesy of the * last read systemcall that was issued... @@ -2445,7 +2776,7 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) * with respect to any read-ahead that might be necessary to * garner all the data needed to complete this read systemcall */ - last_ioread_offset = (vp->v_maxra * PAGE_SIZE_64) + PAGE_SIZE_64; + last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64; if (last_ioread_offset < uio->uio_offset) last_ioread_offset = (off_t)0; @@ -2454,7 +2785,7 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) } else last_ioread_offset = (off_t)0; - while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) { + while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) { /* * compute the size of the upl needed to encompass * the requested read... limit each call to cluster_io @@ -2467,12 +2798,13 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) upl_f_offset = uio->uio_offset - (off_t)start_offset; max_size = filesize - uio->uio_offset; - if ((off_t)((unsigned int)uio->uio_resid) < max_size) - io_size = uio->uio_resid; + // LP64todo - fix this! 
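The sequential-read test above is the heart of the new per-vnode read-ahead state: a read counts as part of a sequential stream when its first page is the page the previous read ended on, or the one immediately after. A stand-alone sketch; the field names mirror the patch's struct cl_readahead, but the values are invented:

    #include <stdio.h>
    #include <stdint.h>

    typedef int64_t daddr64_t;

    /* sketch of the read-ahead state, not kernel code */
    struct ra_state {
        daddr64_t cl_lastr;   /* last page touched by the previous read */
        int       cl_ralen;   /* pages in the last read-ahead issued    */
    };

    static int is_sequential(const struct ra_state *rap, daddr64_t b_addr)
    {
        return rap->cl_ralen &&
              (rap->cl_lastr == b_addr || rap->cl_lastr + 1 == b_addr);
    }

    int main(void)
    {
        struct ra_state ra = { .cl_lastr = 9, .cl_ralen = 8 };

        printf("page 10: %s\n", is_sequential(&ra, 10) ? "sequential" : "random");
        printf("page 42: %s\n", is_sequential(&ra, 42) ? "sequential" : "random");
        return 0;
    }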
+ if ((off_t)((unsigned int)uio_resid(uio)) < max_size) + io_size = uio_resid(uio); else io_size = max_size; - if (!(vp->v_flag & VNOCACHE_DATA)) { + if (!(flags & IO_NOCACHE)) { while (io_size) { u_int io_resid; @@ -2497,7 +2829,7 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) if (size_of_prefetch > max_rd_size) size_of_prefetch = max_rd_size; - size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, devblocksize); + size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize); last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE); @@ -2534,16 +2866,17 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) * we're already finished the I/O for this read request * let's see if we should do a read-ahead */ - cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize); + cluster_rd_ahead(vp, &extent, filesize, rap); } } if (retval) break; if (io_size == 0) { - if (e_lblkno < vp->v_lastr) - vp->v_maxra = 0; - vp->v_lastr = e_lblkno; - + if (rap != NULL) { + if (extent.e_addr < rap->cl_lastr) + rap->cl_maxra = 0; + rap->cl_lastr = extent.e_addr; + } break; } start_offset = (int)(uio->uio_offset & PAGE_MASK_64); @@ -2563,11 +2896,11 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) (int)upl, (int)upl_f_offset, upl_size, start_offset, 0); kret = ubc_create_upl(vp, - upl_f_offset, - upl_size, - &upl, - &pl, - UPL_SET_LITE); + upl_f_offset, + upl_size, + &upl, + &pl, + UPL_SET_LITE); if (kret != KERN_SUCCESS) panic("cluster_read: failed to get pagelist"); @@ -2618,7 +2951,7 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) */ error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, - io_size, devblocksize, CL_READ | CL_ASYNC, (struct buf *)0, &iostate); + io_size, CL_READ | CL_ASYNC, (buf_t)NULL, &iostate); } if (error == 0) { /* @@ -2643,8 +2976,9 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) if (val_size > max_size) val_size = max_size; - if (val_size > uio->uio_resid) - val_size = uio->uio_resid; + if (val_size > uio_resid(uio)) + // LP64todo - fix this + val_size = uio_resid(uio); if (last_ioread_offset == 0) last_ioread_offset = uio->uio_offset + val_size; @@ -2656,7 +2990,7 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) * pre-fetch I/O... 
the I/O latency will overlap * with the copying of the data */ - size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, devblocksize); + size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize); last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE); @@ -2671,16 +3005,22 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) * explicitly disabled it */ if (rd_ahead_enabled) - cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize); - - if (e_lblkno < vp->v_lastr) - vp->v_maxra = 0; - vp->v_lastr = e_lblkno; + cluster_rd_ahead(vp, &extent, filesize, rap); + + if (rap != NULL) { + if (extent.e_addr < rap->cl_lastr) + rap->cl_maxra = 0; + rap->cl_lastr = extent.e_addr; + } } + lck_mtx_lock(cl_mtxp); + while (iostate.io_issued != iostate.io_completed) { iostate.io_wanted = 1; - tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_read_x", 0); + msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_x", 0); } + lck_mtx_unlock(cl_mtxp); + if (iostate.io_error) error = iostate.io_error; else @@ -2697,7 +3037,7 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, (int)upl, start_pg * PAGE_SIZE, io_size, error, 0); - if (error || (vp->v_flag & VNOCACHE_DATA)) + if (error || (flags & IO_NOCACHE)) ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); else @@ -2737,7 +3077,7 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) if (upl_dirty_page(pl, cur_pg)) commit_flags |= UPL_COMMIT_SET_DIRTY; - if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA)) + if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE)) ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); else @@ -2758,7 +3098,7 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) if (upl_dirty_page(pl, cur_pg)) commit_flags |= UPL_COMMIT_SET_DIRTY; - if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA)) + if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE)) ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); else @@ -2782,21 +3122,38 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) } if (retval == 0) retval = error; + + if ( uio_resid(uio) ) { + if (cluster_hard_throttle_on(vp)) { + rd_ahead_enabled = 0; + prefetch_enabled = 0; + + max_rd_size = HARD_THROTTLE_MAXSIZE; + } else { + if (rap != NULL) + rd_ahead_enabled = 1; + prefetch_enabled = 1; + + max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE; + } + } + } + if (rap != NULL) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END, + (int)uio->uio_offset, uio_resid(uio), rap->cl_lastr, retval, 0); + + lck_mtx_unlock(&rap->cl_lockr); + } else { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END, + (int)uio->uio_offset, uio_resid(uio), 0, retval, 0); } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END, - (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0); return (retval); } static int -cluster_nocopy_read(vp, uio, filesize, devblocksize, flags) - struct vnode *vp; - struct uio *uio; - off_t filesize; - int devblocksize; - int flags; +cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize) { upl_t upl; upl_page_info_t *pl; @@ -2812,13 +3169,15 @@ cluster_nocopy_read(vp, uio, filesize, devblocksize, flags) int i; int force_data_sync; int retval = 0; + int 
no_zero_fill = 0; + int abort_flag = 0; struct clios iostate; u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE; u_int max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START, - (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0); + (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0); /* * When we enter this routine, we know @@ -2834,18 +3193,22 @@ cluster_nocopy_read(vp, uio, filesize, devblocksize, flags) iov = uio->uio_iov; - if (cluster_hard_throttle_on(vp)) { - max_rd_size = HARD_THROTTLE_MAXSIZE; - max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1; - } - while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) { + while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) { + if (cluster_hard_throttle_on(vp)) { + max_rd_size = HARD_THROTTLE_MAXSIZE; + max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1; + } else { + max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE; + max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2; + } max_io_size = filesize - uio->uio_offset; - if (max_io_size < (off_t)((unsigned int)uio->uio_resid)) + // LP64todo - fix this + if (max_io_size < (off_t)((unsigned int)uio_resid(uio))) io_size = max_io_size; else - io_size = uio->uio_resid; + io_size = uio_resid(uio); /* * First look for pages already in the cache @@ -2889,20 +3252,34 @@ cluster_nocopy_read(vp, uio, filesize, devblocksize, flags) */ goto wait_for_reads; - upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK; + // LP64todo - fix this! + upl_offset = CAST_DOWN(vm_offset_t, iov->iov_base) & PAGE_MASK; upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START, (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0); + if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) { + no_zero_fill = 1; + abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY; + } else { + no_zero_fill = 0; + abort_flag = UPL_ABORT_FREE_ON_EMPTY; + } for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { pages_in_pl = 0; upl_size = upl_needed_size; upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; - kret = vm_map_get_upl(current_map(), - (vm_offset_t)iov->iov_base & ~PAGE_MASK, - &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync); + if (no_zero_fill) + upl_flags |= UPL_NOZEROFILL; + if (force_data_sync) + upl_flags |= UPL_FORCE_DATA_SYNC; + + // LP64todo - fix this! 
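Just above, the direct-read path starts tracking whether a transfer is fully page-aligned: such pages will be completely overwritten by the device read, so zero-filling them at UPL creation is wasted work, and on an abort they can simply be dumped. A sketch of that decision; the flag values here are placeholders, not the kernel's:

    #include <stdio.h>

    #define PAGE_MASK                4095
    #define UPL_ABORT_FREE_ON_EMPTY  0x1    /* placeholder values */
    #define UPL_ABORT_DUMP_PAGES     0x2

    static void pick_flags(unsigned upl_offset, unsigned io_size,
                           int *no_zero_fill, int *abort_flag)
    {
        if (upl_offset == 0 && (io_size & PAGE_MASK) == 0) {
            /* every page will be overwritten: skip zero-fill, dump on abort */
            *no_zero_fill = 1;
            *abort_flag   = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY;
        } else {
            *no_zero_fill = 0;
            *abort_flag   = UPL_ABORT_FREE_ON_EMPTY;
        }
    }

    int main(void)
    {
        int nzf, af;

        pick_flags(0, 65536, &nzf, &af);
        printf("aligned:   no_zero_fill=%d abort_flag=0x%x\n", nzf, af);

        pick_flags(512, 65536, &nzf, &af);
        printf("unaligned: no_zero_fill=%d abort_flag=0x%x\n", nzf, af);
        return 0;
    }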
+ kret = vm_map_create_upl(current_map(), + (vm_map_offset_t)(CAST_DOWN(vm_offset_t, iov->iov_base) & ~PAGE_MASK), + &upl_size, &upl, NULL, &pages_in_pl, &upl_flags); if (kret != KERN_SUCCESS) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, @@ -2926,8 +3303,7 @@ cluster_nocopy_read(vp, uio, filesize, devblocksize, flags) if (i == pages_in_pl) break; - ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, - UPL_ABORT_FREE_ON_EMPTY); + ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag); } if (force_data_sync >= 3) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, @@ -2942,8 +3318,7 @@ cluster_nocopy_read(vp, uio, filesize, devblocksize, flags) io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK; if (io_size == 0) { - ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, - UPL_ABORT_FREE_ON_EMPTY); + ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag); goto wait_for_reads; } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, @@ -2955,10 +3330,14 @@ cluster_nocopy_read(vp, uio, filesize, devblocksize, flags) * if there are already too many outstanding reads * wait until some have completed before issuing the next read */ + lck_mtx_lock(cl_mtxp); + while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) { iostate.io_wanted = 1; - tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0); + msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0); } + lck_mtx_unlock(cl_mtxp); + if (iostate.io_error) { /* * one of the earlier reads we issued ran into a hard error @@ -2967,29 +3346,27 @@ cluster_nocopy_read(vp, uio, filesize, devblocksize, flags) * go wait for any other reads to complete before * returning the error to the caller */ - ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, - UPL_ABORT_FREE_ON_EMPTY); + ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag); goto wait_for_reads; } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START, (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0); - retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, - io_size, devblocksize, + retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO, - (struct buf *)0, &iostate); + (buf_t)NULL, &iostate); /* * update the uio structure */ - iov->iov_base += io_size; + ((u_int32_t)iov->iov_base) += io_size; iov->iov_len -= io_size; - uio->uio_resid -= io_size; + uio_setresid(uio, (uio_resid(uio) - io_size)); uio->uio_offset += io_size; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END, - (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, retval, 0); + (int)upl, (int)uio->uio_offset, (int)uio_resid(uio), retval, 0); } /* end while */ @@ -2998,60 +3375,77 @@ wait_for_reads: * make sure all async reads that are part of this stream * have completed before we return */ + lck_mtx_lock(cl_mtxp); + while (iostate.io_issued != iostate.io_completed) { iostate.io_wanted = 1; - tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0); + msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0); } + lck_mtx_unlock(cl_mtxp); + if (iostate.io_error) retval = iostate.io_error; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END, - (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0); + (int)uio->uio_offset, (int)uio_resid(uio), 6, retval, 0); return (retval); } static int -cluster_phys_read(vp, uio, filesize, devblocksize, flags) 
- struct vnode *vp; - struct uio *uio; - off_t filesize; - int devblocksize; - int flags; +cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize) { upl_page_info_t *pl; upl_t upl; vm_offset_t upl_offset; addr64_t dst_paddr; off_t max_size; - int io_size; +#if LP64KERN + int64_t io_size; + u_int64_t iov_len; + u_int64_t iov_base; +#else + int io_size; + uint iov_len; + uint iov_base; +#endif int tail_size; int upl_size; int upl_needed_size; int pages_in_pl; int upl_flags; kern_return_t kret; - struct iovec *iov; struct clios iostate; int error; + int devblocksize; + devblocksize = vp->v_mount->mnt_devblocksize; /* * When we enter this routine, we know * -- the resid will not exceed iov_len * -- the target address is physically contiguous */ - iov = uio->uio_iov; +#if LP64_DEBUG + if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) { + panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__); + } +#endif /* LP64_DEBUG */ + + iov_len = uio_iov_len(uio); + iov_base = uio_iov_base(uio); max_size = filesize - uio->uio_offset; - if (max_size > (off_t)((unsigned int)iov->iov_len)) - io_size = iov->iov_len; + // LP64todo - fix this! + if (max_size < 0 || (u_int64_t)max_size > iov_len) + io_size = iov_len; else io_size = max_size; - upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK; + // LP64todo - fix this! + upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK; upl_needed_size = upl_offset + io_size; error = 0; @@ -3060,7 +3454,7 @@ cluster_phys_read(vp, uio, filesize, devblocksize, flags) upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; kret = vm_map_get_upl(current_map(), - (vm_offset_t)iov->iov_base & ~PAGE_MASK, + CAST_DOWN(vm_offset_t, iov_base) & ~PAGE_MASK, &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0); if (kret != KERN_SUCCESS) { @@ -3079,7 +3473,7 @@ cluster_phys_read(vp, uio, filesize, devblocksize, flags) } pl = ubc_upl_pageinfo(upl); - dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)((u_int)iov->iov_base & PAGE_MASK)); + dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)(iov_base & PAGE_MASK)); while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { int head_size; @@ -3089,7 +3483,7 @@ cluster_phys_read(vp, uio, filesize, devblocksize, flags) if (head_size > io_size) head_size = io_size; - error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ); + error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ); if (error) { ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); @@ -3123,22 +3517,25 @@ cluster_phys_read(vp, uio, filesize, devblocksize, flags) * if there are already too many outstanding reads * wait until some have completed before issuing the next */ + lck_mtx_lock(cl_mtxp); + while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) { iostate.io_wanted = 1; - tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0); + msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0); } + lck_mtx_unlock(cl_mtxp); - error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0, + error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC, - (struct buf *)0, &iostate); + (buf_t)NULL, &iostate); /* * The cluster_io read was issued successfully, * update the uio structure */ if (error == 0) { - uio->uio_resid -= xsize; - iov->iov_len -= xsize; - iov->iov_base += xsize; + 
uio_setresid(uio, (uio_resid(uio) - xsize)); + uio_iov_base_add(uio, xsize); + uio_iov_len_add(uio, -xsize); uio->uio_offset += xsize; dst_paddr += xsize; upl_offset += xsize; @@ -3149,15 +3546,19 @@ cluster_phys_read(vp, uio, filesize, devblocksize, flags) * make sure all async reads that are part of this stream * have completed before we proceed */ + lck_mtx_lock(cl_mtxp); + while (iostate.io_issued != iostate.io_completed) { iostate.io_wanted = 1; - tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0); + msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0); } - if (iostate.io_error) { + lck_mtx_unlock(cl_mtxp); + + if (iostate.io_error) error = iostate.io_error; - } + if (error == 0 && tail_size) - error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ); + error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ); /* * just release our hold on the physically contiguous @@ -3174,12 +3575,7 @@ cluster_phys_read(vp, uio, filesize, devblocksize, flags) * the completed pages will be released into the VM cache */ int -advisory_read(vp, filesize, f_offset, resid, devblocksize) - struct vnode *vp; - off_t filesize; - off_t f_offset; - int resid; - int devblocksize; +advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid) { upl_page_info_t *pl; upl_t upl; @@ -3197,11 +3593,11 @@ advisory_read(vp, filesize, f_offset, resid, devblocksize) int issued_io; int skip_range; - if (!UBCINFOEXISTS(vp)) + if ( !UBCINFOEXISTS(vp)) return(EINVAL); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START, - (int)f_offset, resid, (int)filesize, devblocksize, 0); + (int)f_offset, resid, (int)filesize, 0, 0); while (resid && f_offset < filesize && retval == 0) { /* @@ -3258,11 +3654,11 @@ advisory_read(vp, filesize, f_offset, resid, devblocksize) (int)upl, (int)upl_f_offset, upl_size, start_offset, 0); kret = ubc_create_upl(vp, - upl_f_offset, - upl_size, - &upl, - &pl, - UPL_RET_ONLY_ABSENT | UPL_SET_LITE); + upl_f_offset, + upl_size, + &upl, + &pl, + UPL_RET_ONLY_ABSENT | UPL_SET_LITE); if (kret != KERN_SUCCESS) return(retval); issued_io = 0; @@ -3322,8 +3718,8 @@ advisory_read(vp, filesize, f_offset, resid, devblocksize) /* * issue an asynchronous read to cluster_io */ - retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize, - CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0); + retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, + CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (buf_t)NULL, (struct clios *)NULL); issued_io = 1; } @@ -3347,88 +3743,144 @@ advisory_read(vp, filesize, f_offset, resid, devblocksize) int -cluster_push(vp) - struct vnode *vp; +cluster_push(vnode_t vp, int flags) { - int retval; + int retval; + struct cl_writebehind *wbp; - if (!UBCINFOEXISTS(vp) || (vp->v_clen == 0 && !(vp->v_flag & VHASDIRTY))) - return(0); + if ( !UBCINFOEXISTS(vp)) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0); + return (0); + } + /* return if deferred write is set */ + if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) { + return (0); + } + if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0); + return (0); + } + if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) { + lck_mtx_unlock(&wbp->cl_lockw); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, 
flags, 0, -3, 0); + return(0); + } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START, - vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0); + (int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0); - if (vp->v_flag & VHASDIRTY) { - sparse_cluster_push(vp, ubc_getsize(vp), 1); + if (wbp->cl_scmap) { + sparse_cluster_push(wbp, vp, ubc_getsize(vp), 1); - vp->v_clen = 0; retval = 1; } else - retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1); + retval = cluster_try_push(wbp, vp, ubc_getsize(vp), 0, 1); + + lck_mtx_unlock(&wbp->cl_lockw); + + if (flags & IO_SYNC) + (void)vnode_waitforwrites(vp, 0, 0, 0, (char *)"cluster_push"); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END, - vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0); + (int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0); return (retval); } -int -cluster_release(vp) - struct vnode *vp; +__private_extern__ void +cluster_release(struct ubc_info *ubc) { - off_t offset; - u_int length; + struct cl_writebehind *wbp; + struct cl_readahead *rap; + + if ((wbp = ubc->cl_wbehind)) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0); + + if (wbp->cl_scmap) + vfs_drt_control(&(wbp->cl_scmap), 0); + } else { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, 0, 0, 0, 0); + } - if (vp->v_flag & VHASDIRTY) { - vfs_drt_control(&(vp->v_scmap), 0); + rap = ubc->cl_rahead; - vp->v_flag &= ~VHASDIRTY; + if (wbp != NULL) { + lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp); + FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND); + } + if ((rap = ubc->cl_rahead)) { + lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp); + FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD); } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0); + ubc->cl_rahead = NULL; + ubc->cl_wbehind = NULL; + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0); +} + + +static void +cluster_push_EOF(vnode_t vp, off_t EOF) +{ + struct cl_writebehind *wbp; + + wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START, + (int)wbp->cl_scmap, wbp->cl_number, (int)EOF, 0, 0); + + if (wbp->cl_scmap) + sparse_cluster_push(wbp, vp, EOF, 1); + else + cluster_try_push(wbp, vp, EOF, 0, 1); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END, + (int)wbp->cl_scmap, wbp->cl_number, 0, 0, 0); + + lck_mtx_unlock(&wbp->cl_lockw); } static int -cluster_try_push(vp, EOF, can_delay, push_all) - struct vnode *vp; - off_t EOF; - int can_delay; - int push_all; +cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int can_delay, int push_all) { int cl_index; int cl_index1; int min_index; int cl_len; - int cl_total; int cl_pushed = 0; - struct v_cluster l_clusters[MAX_CLUSTERS]; + struct cl_wextent l_clusters[MAX_CLUSTERS]; /* + * the write behind context exists and has + * already been locked... 
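cluster_push above is now a ladder of cheap early-outs before any pushing happens: no UBC info, a deferred-write mount asked to defer, no write-behind context, or nothing dirty all return immediately. A compact model of the control flow, with stand-in flag values and fields:

    #include <stdio.h>

    #define IO_DEFWRITE   0x1     /* stand-in values, not the kernel's */
    #define MNT_DEFWRITE  0x2

    struct wb {
        int   cl_number;      /* normal clusters held      */
        void *cl_scmap;       /* sparse cluster map, if any */
    };

    static int should_push(int has_ubc_info, unsigned mnt_flags, int flags,
                           struct wb *wbp)
    {
        if (!has_ubc_info)
            return 0;                               /* not a cached file */
        if ((mnt_flags & MNT_DEFWRITE) && (flags & IO_DEFWRITE))
            return 0;                               /* mount defers writes */
        if (wbp == NULL)
            return 0;                               /* never wrote through here */
        if (wbp->cl_number == 0 && wbp->cl_scmap == NULL)
            return 0;                               /* nothing dirty to push */
        return 1;                                   /* do the real push */
    }

    int main(void)
    {
        struct wb wbp = { .cl_number = 2, .cl_scmap = NULL };

        printf("%d\n", should_push(1, 0, 0, &wbp));                       /* 1 */
        printf("%d\n", should_push(1, MNT_DEFWRITE, IO_DEFWRITE, &wbp));  /* 0 */
        return 0;
    }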
+ * * make a local 'sorted' copy of the clusters - * and clear vp->v_clen so that new clusters can + * and clear wbp->cl_number so that new clusters can * be developed */ - for (cl_index = 0; cl_index < vp->v_clen; cl_index++) { - for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) { - if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg) + for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { + for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) { + if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr) continue; if (min_index == -1) min_index = cl_index1; - else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg) + else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr) min_index = cl_index1; } if (min_index == -1) break; - l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg; - l_clusters[cl_index].last_pg = vp->v_clusters[min_index].last_pg; + l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr; + l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr; + l_clusters[cl_index].io_nocache = wbp->cl_clusters[min_index].io_nocache; - vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg; + wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr; } - cl_len = cl_index; - vp->v_clen = 0; + wbp->cl_number = 0; + + cl_len = cl_index; if (can_delay && cl_len == MAX_CLUSTERS) { int i; @@ -3444,8 +3896,8 @@ cluster_try_push(vp, EOF, can_delay, push_all) * * check to make sure that all the clusters except the last one are 'full'... and that each cluster * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above - * so we can just make a simple pass through up, to but not including the last one... - * note that last_pg is not inclusive, so it will be equal to the start_pg of the next cluster if they + * so we can just make a simple pass through, up to, but not including the last one... + * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they * are sequential * * we let the last one be partial as long as it was adjacent to the previous one... @@ -3453,100 +3905,113 @@ cluster_try_push(vp, EOF, can_delay, push_all) * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world... */ for (i = 0; i < MAX_CLUSTERS - 1; i++) { - if ((l_clusters[i].last_pg - l_clusters[i].start_pg) != MAX_UPL_TRANSFER) + if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != MAX_UPL_TRANSFER) goto dont_try; - if (l_clusters[i].last_pg != l_clusters[i+1].start_pg) + if (l_clusters[i].e_addr != l_clusters[i+1].b_addr) goto dont_try; } } + /* + * drop the lock while we're firing off the I/Os... + * this is safe since I'm working off of a private sorted copy + * of the clusters, and I'm going to re-evaluate the public + * state after I retake the lock + */ + lck_mtx_unlock(&wbp->cl_lockw); + for (cl_index = 0; cl_index < cl_len; cl_index++) { + int flags; + struct cl_extent cl; + /* - * try to push each cluster in turn... cluster_push_x may not - * push the cluster if can_delay is TRUE and the cluster doesn't - * meet the critera for an immediate push + * try to push each cluster in turn... 
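cluster_try_push begins by snapshotting shared state: it repeatedly moves the live cluster with the lowest starting page into a private array, emptying each source slot so new clusters can accumulate once the lock is dropped, and, when every slot was in use, checks whether the sorted copy forms one run of full, adjacent clusters before deciding it is safe to keep delaying. A userspace model of both steps, with the constants shrunk for the demo:

    #include <stdio.h>
    #include <stdint.h>

    typedef int64_t daddr64_t;
    #define MAX_CLUSTERS     4      /* shrunk for the demo                 */
    #define MAX_UPL_TRANSFER 256    /* pages per full cluster, per the patch */

    struct wext { daddr64_t b_addr, e_addr; };   /* e_addr is exclusive */

    /* selection pass: lowest b_addr first, consuming source slots */
    static int snapshot_sorted(struct wext *src, int n, struct wext *dst)
    {
        int out = 0;

        for (;;) {
            int min = -1;

            for (int j = 0; j < n; j++) {
                if (src[j].b_addr == src[j].e_addr)
                    continue;                        /* empty slot */
                if (min == -1 || src[j].b_addr < src[min].b_addr)
                    min = j;
            }
            if (min == -1)
                break;
            dst[out++] = src[min];
            src[min].b_addr = src[min].e_addr;       /* consume it */
        }
        return out;
    }

    /* all but the last full, and each ending where the next begins? */
    static int is_sequential_run(const struct wext *cl, int n)
    {
        for (int i = 0; i < n - 1; i++)
            if (cl[i].e_addr - cl[i].b_addr != MAX_UPL_TRANSFER ||
                cl[i].e_addr != cl[i + 1].b_addr)
                return 0;
        return 1;
    }

    int main(void)
    {
        struct wext src[MAX_CLUSTERS] = { {512,768}, {0,256}, {256,512}, {768,900} };
        struct wext dst[MAX_CLUSTERS];
        int n = snapshot_sorted(src, MAX_CLUSTERS, dst);

        printf("sorted %d clusters, sequential=%d\n", n, is_sequential_run(dst, n));
        return 0;
    }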
*/ - if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) { - l_clusters[cl_index].start_pg = 0; - l_clusters[cl_index].last_pg = 0; + if (l_clusters[cl_index].io_nocache) + flags = IO_NOCACHE; + else + flags = 0; + cl.b_addr = l_clusters[cl_index].b_addr; + cl.e_addr = l_clusters[cl_index].e_addr; - cl_pushed++; + cluster_push_x(vp, &cl, EOF, flags); - if (push_all == 0) - break; - } + l_clusters[cl_index].b_addr = 0; + l_clusters[cl_index].e_addr = 0; + + cl_pushed++; + + if (push_all == 0) + break; } + lck_mtx_lock(&wbp->cl_lockw); + dont_try: if (cl_len > cl_pushed) { /* * we didn't push all of the clusters, so * lets try to merge them back in to the vnode */ - if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) { + if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) { /* * we picked up some new clusters while we were trying to - * push the old ones (I don't think this can happen because - * I'm holding the lock, but just in case)... the sum of the + * push the old ones... this can happen because I've dropped + * the vnode lock... the sum of the * leftovers plus the new cluster count exceeds our ability * to represent them, so switch to the sparse cluster mechanism + * + * collect the active public clusters... */ - - /* - * first collect the new clusters sitting in the vp - */ - sparse_cluster_switch(vp, EOF); + sparse_cluster_switch(wbp, vp, EOF); for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) { - if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg) + if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) continue; - vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg; - vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg; + wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr; + wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr; + wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache; cl_index1++; } /* * update the cluster count */ - vp->v_clen = cl_index1; + wbp->cl_number = cl_index1; /* * and collect the original clusters that were moved into the * local storage for sorting purposes */ - sparse_cluster_switch(vp, EOF); + sparse_cluster_switch(wbp, vp, EOF); } else { /* * we've got room to merge the leftovers back in * just append them starting at the next 'hole' - * represented by vp->v_clen + * represented by wbp->cl_number */ - for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) { - if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg) + for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) { + if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) continue; - vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg; - vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg; + wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr; + wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr; + wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache; cl_index1++; } /* * update the cluster count */ - vp->v_clen = cl_index1; + wbp->cl_number = cl_index1; } } - return(MAX_CLUSTERS - vp->v_clen); + return(MAX_CLUSTERS - wbp->cl_number); } static int -cluster_push_x(vp, EOF, first, last, can_delay) - struct vnode *vp; - off_t EOF; - unsigned int first; - unsigned int last; - int can_delay; +cluster_push_x(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags) { upl_page_info_t *pl; upl_t upl; 
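After the pushes complete and the lock is retaken, the dont_try path above re-merges whatever could not be pushed: leftovers go into the free slots of the shared array, unless new clusters arrived while the lock was dropped and there is no longer room, in which case the code switches to the sparse mechanism. A sketch of that bookkeeping (simplified; the io_nocache flag is omitted):

    #include <stdio.h>
    #include <stdint.h>

    typedef int64_t daddr64_t;
    #define MAX_CLUSTERS 4   /* illustrative */

    struct wext { daddr64_t b_addr, e_addr; };

    /* append unpushed leftovers, or report that the caller must go sparse */
    static int merge_back(struct wext *shared, int *count,
                          const struct wext *left, int nleft)
    {
        int live = 0;

        for (int i = 0; i < nleft; i++)
            if (left[i].b_addr != left[i].e_addr)
                live++;

        if (MAX_CLUSTERS - *count < live)
            return -1;                          /* no room: switch to sparse */

        for (int i = 0; i < nleft; i++) {
            if (left[i].b_addr == left[i].e_addr)
                continue;                       /* this one was pushed */
            shared[(*count)++] = left[i];
        }
        return 0;
    }

    int main(void)
    {
        struct wext shared[MAX_CLUSTERS] = { {0,32} };
        int count = 1;
        struct wext left[2] = { {64,96}, {96,96} };   /* second one was pushed */

        if (merge_back(shared, &count, left, 2) == 0)
            printf("now tracking %d clusters\n", count);   /* prints 2 */
        return 0;
    }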
@@ -3560,19 +4025,21 @@ cluster_push_x(vp, EOF, first, last, can_delay) int io_flags; int upl_flags; int size; + int error = 0; + int retval; kern_return_t kret; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START, - vp->v_clen, first, last, EOF, 0); + (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0); - if ((pages_in_upl = last - first) == 0) { + if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0); - return (1); + return (0); } upl_size = pages_in_upl * PAGE_SIZE; - upl_f_offset = (off_t)((unsigned long long)first * PAGE_SIZE_64); + upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64); if (upl_f_offset + upl_size >= EOF) { @@ -3584,7 +4051,7 @@ cluster_push_x(vp, EOF, first, last, can_delay) */ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0); - return(1); + return(0); } size = EOF - upl_f_offset; @@ -3595,7 +4062,19 @@ cluster_push_x(vp, EOF, first, last, can_delay) KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0); - if (vp->v_flag & VNOCACHE_DATA) + /* + * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior + * + * - only pages that are currently dirty are returned... these are the ones we need to clean + * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set + * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page + * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if + * someone dirties this page while the I/O is in progress, we don't lose track of the new state + * + * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard) + */ + + if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE)) upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED; else upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE; @@ -3629,7 +4108,7 @@ cluster_push_x(vp, EOF, first, last, can_delay) ubc_upl_abort(upl, 0); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0); - return(1); + return(0); } for (last_pg = 0; last_pg < pages_in_upl; ) { @@ -3671,116 +4150,128 @@ cluster_push_x(vp, EOF, first, last, can_delay) io_size = min(size, (last_pg - start_pg) * PAGE_SIZE); - if (vp->v_flag & VNOCACHE_DATA) - io_flags = CL_THROTTLE | CL_COMMIT | CL_ASYNC | CL_DUMP; - else - io_flags = CL_THROTTLE | CL_COMMIT | CL_ASYNC; + io_flags = CL_THROTTLE | CL_COMMIT; + + if ( !(flags & IO_SYNC)) + io_flags |= CL_ASYNC; + + retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, + io_flags, (buf_t)NULL, (struct clios *)NULL); - cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0); + if (error == 0 && retval) + error = retval; size -= io_size; } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0); - return(1); + return(error); } -static int -sparse_cluster_switch(struct vnode *vp, off_t EOF) +/* + * sparse_cluster_switch is called with the write behind lock held + */ +static void +sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF) { - int cl_index; + int cl_index; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, 
(int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0); - if ( !(vp->v_flag & VHASDIRTY)) { - vp->v_flag |= VHASDIRTY; - vp->v_scdirty = 0; - vp->v_scmap = 0; - } - for (cl_index = 0; cl_index < vp->v_clen; cl_index++) { - int flags; - int start_pg; - int last_pg; + if (wbp->cl_scmap == NULL) + wbp->cl_scdirty = 0; + + for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { + int flags; + struct cl_extent cl; + + for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) { - for (start_pg = vp->v_clusters[cl_index].start_pg; start_pg < vp->v_clusters[cl_index].last_pg; start_pg++) { + if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) { + if (flags & UPL_POP_DIRTY) { + cl.e_addr = cl.b_addr + 1; - if (ubc_page_op(vp, (off_t)(((off_t)start_pg) * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) { - if (flags & UPL_POP_DIRTY) - sparse_cluster_add(vp, EOF, start_pg, start_pg + 1); + sparse_cluster_add(wbp, vp, &cl, EOF); + } } } } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0); + wbp->cl_number = 0; + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0); } -static int -sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all) +/* + * sparse_cluster_push is called with the write behind lock held + */ +static void +sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_all) { - unsigned int first; - unsigned int last; - off_t offset; - u_int length; + struct cl_extent cl; + off_t offset; + u_int length; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, push_all, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_all, 0); if (push_all) - vfs_drt_control(&(vp->v_scmap), 1); + vfs_drt_control(&(wbp->cl_scmap), 1); for (;;) { - if (vfs_drt_get_cluster(&(vp->v_scmap), &offset, &length) != KERN_SUCCESS) { - vp->v_flag &= ~VHASDIRTY; - vp->v_clen = 0; + if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS) break; - } - first = (unsigned int)(offset / PAGE_SIZE_64); - last = (unsigned int)((offset + length) / PAGE_SIZE_64); - cluster_push_x(vp, EOF, first, last, 0); + cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64); + cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64); + + wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr); - vp->v_scdirty -= (last - first); + cluster_push_x(vp, &cl, EOF, 0); if (push_all == 0) break; } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0); } -static int -sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last) +/* + * sparse_cluster_add is called with the write behind lock held + */ +static void +sparse_cluster_add(struct cl_writebehind *wbp, vnode_t vp, struct cl_extent *cl, off_t EOF) { - u_int new_dirty; - u_int length; - off_t offset; + u_int new_dirty; + u_int length; + off_t offset; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)vp->v_scmap, vp->v_scdirty, first, last, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0); - offset = (off_t)first * PAGE_SIZE_64; - length = (last - first) * PAGE_SIZE; + offset = 
(off_t)(cl->b_addr * PAGE_SIZE_64); + length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE; - while (vfs_drt_mark_pages(&(vp->v_scmap), offset, length, &new_dirty) != KERN_SUCCESS) { + while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) { /* * no room left in the map * only a partial update was done * push out some pages and try again */ - vp->v_scdirty += new_dirty; + wbp->cl_scdirty += new_dirty; - sparse_cluster_push(vp, EOF, 0); + sparse_cluster_push(wbp, vp, EOF, 0); offset += (new_dirty * PAGE_SIZE_64); length -= (new_dirty * PAGE_SIZE); } - vp->v_scdirty += new_dirty; + wbp->cl_scdirty += new_dirty; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0); } static int -cluster_align_phys_io(struct vnode *vp, struct uio *uio, addr64_t usr_paddr, int xsize, int devblocksize, int flags) +cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags) { struct iovec *iov; upl_page_info_t *pl; @@ -3788,15 +4279,28 @@ cluster_align_phys_io(struct vnode *vp, struct uio *uio, addr64_t usr_paddr, int addr64_t ubc_paddr; kern_return_t kret; int error = 0; + int did_read = 0; + int abort_flags; + int upl_flags; iov = uio->uio_iov; + upl_flags = UPL_SET_LITE; + if (! (flags & CL_READ)) { + /* + * "write" operation: let the UPL subsystem know + * that we intend to modify the buffer cache pages + * we're gathering. + */ + upl_flags |= UPL_WILL_MODIFY; + } + kret = ubc_create_upl(vp, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, &upl, &pl, - UPL_SET_LITE); + upl_flags); if (kret != KERN_SUCCESS) return(EINVAL); @@ -3805,13 +4309,14 @@ cluster_align_phys_io(struct vnode *vp, struct uio *uio, addr64_t usr_paddr, int /* * issue a synchronous read to cluster_io */ - error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize, - CL_READ, (struct buf *)0, (struct clios *)0); + error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, + CL_READ, (buf_t)NULL, (struct clios *)NULL); if (error) { ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); return(error); } + did_read = 1; } ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64); @@ -3832,16 +4337,21 @@ cluster_align_phys_io(struct vnode *vp, struct uio *uio, addr64_t usr_paddr, int /* * issue a synchronous write to cluster_io */ - error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize, - 0, (struct buf *)0, (struct clios *)0); + error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, + 0, (buf_t)NULL, (struct clios *)NULL); } if (error == 0) { uio->uio_offset += xsize; - iov->iov_base += xsize; - iov->iov_len -= xsize; - uio->uio_resid -= xsize; + uio_iov_base_add(uio, xsize); + uio_iov_len_add(uio, -xsize); + uio_setresid(uio, (uio_resid(uio) - xsize)); } - ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); + if (did_read) + abort_flags = UPL_ABORT_FREE_ON_EMPTY; + else + abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES; + + ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags); return (error); } @@ -3857,27 +4367,40 @@ cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize) int segflg; int retval = 0; upl_page_info_t *pl; - boolean_t funnel_state = FALSE; - 
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START, - (int)uio->uio_offset, uio->uio_resid, upl_offset, xsize, 0); - - if (xsize >= (16 * 1024)) - funnel_state = thread_funnel_set(kernel_flock, FALSE); + (int)uio->uio_offset, uio_resid(uio), upl_offset, xsize, 0); segflg = uio->uio_segflg; switch(segflg) { + case UIO_USERSPACE32: + case UIO_USERISPACE32: + uio->uio_segflg = UIO_PHYS_USERSPACE32; + break; + case UIO_USERSPACE: case UIO_USERISPACE: uio->uio_segflg = UIO_PHYS_USERSPACE; break; + case UIO_USERSPACE64: + case UIO_USERISPACE64: + uio->uio_segflg = UIO_PHYS_USERSPACE64; + break; + + case UIO_SYSSPACE32: + uio->uio_segflg = UIO_PHYS_SYSSPACE32; + break; + case UIO_SYSSPACE: uio->uio_segflg = UIO_PHYS_SYSSPACE; break; + + case UIO_SYSSPACE64: + uio->uio_segflg = UIO_PHYS_SYSSPACE64; + break; } pl = ubc_upl_pageinfo(upl); @@ -3899,47 +4422,56 @@ cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize) } uio->uio_segflg = segflg; - if (funnel_state == TRUE) - thread_funnel_set(kernel_flock, TRUE); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, - (int)uio->uio_offset, uio->uio_resid, retval, segflg, 0); + (int)uio->uio_offset, uio_resid(uio), retval, segflg, 0); return (retval); } int -cluster_copy_ubc_data(struct vnode *vp, struct uio *uio, int *io_resid, int mark_dirty) +cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty) { int segflg; int io_size; int xsize; int start_offset; - off_t f_offset; int retval = 0; memory_object_control_t control; - int op_flags = UPL_POP_SET | UPL_POP_BUSY; - boolean_t funnel_state = FALSE; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START, - (int)uio->uio_offset, uio->uio_resid, 0, *io_resid, 0); + (int)uio->uio_offset, uio_resid(uio), 0, *io_resid, 0); control = ubc_getobject(vp, UBC_FLAGS_NONE); if (control == MEMORY_OBJECT_CONTROL_NULL) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, - (int)uio->uio_offset, uio->uio_resid, retval, 3, 0); + (int)uio->uio_offset, uio_resid(uio), retval, 3, 0); return(0); } - if (mark_dirty) - op_flags |= UPL_POP_DIRTY; - segflg = uio->uio_segflg; switch(segflg) { + case UIO_USERSPACE32: + case UIO_USERISPACE32: + uio->uio_segflg = UIO_PHYS_USERSPACE32; + break; + + case UIO_USERSPACE64: + case UIO_USERISPACE64: + uio->uio_segflg = UIO_PHYS_USERSPACE64; + break; + + case UIO_SYSSPACE32: + uio->uio_segflg = UIO_PHYS_SYSSPACE32; + break; + + case UIO_SYSSPACE64: + uio->uio_segflg = UIO_PHYS_SYSSPACE64; + break; + case UIO_USERSPACE: case UIO_USERISPACE: uio->uio_segflg = UIO_PHYS_USERSPACE; @@ -3949,44 +4481,28 @@ cluster_copy_ubc_data(struct vnode *vp, struct uio *uio, int *io_resid, int mark uio->uio_segflg = UIO_PHYS_SYSSPACE; break; } - io_size = *io_resid; - start_offset = (int)(uio->uio_offset & PAGE_MASK_64); - f_offset = uio->uio_offset - start_offset; - xsize = min(PAGE_SIZE - start_offset, io_size); - - while (io_size && retval == 0) { - ppnum_t pgframe; - - if (ubc_page_op_with_control(control, f_offset, op_flags, &pgframe, 0) != KERN_SUCCESS) - break; - - if (funnel_state == FALSE && io_size >= (16 * 1024)) - funnel_state = thread_funnel_set(kernel_flock, FALSE); - retval = uiomove64((addr64_t)(((addr64_t)pgframe << 12) + start_offset), xsize, uio); + if ( (io_size = *io_resid) ) { + start_offset = (int)(uio->uio_offset & PAGE_MASK_64); + xsize = uio_resid(uio); - ubc_page_op_with_control(control, f_offset, UPL_POP_CLR | UPL_POP_BUSY, 0, 0); - - io_size -= xsize; - start_offset = 0; - f_offset = uio->uio_offset; - xsize = 
min(PAGE_SIZE, io_size); + retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, + uio, start_offset, io_size, mark_dirty); + xsize -= uio_resid(uio); + io_size -= xsize; } uio->uio_segflg = segflg; *io_resid = io_size; - if (funnel_state == TRUE) - thread_funnel_set(kernel_flock, TRUE); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, - (int)uio->uio_offset, uio->uio_resid, retval, 0x80000000 | segflg, 0); + (int)uio->uio_offset, uio_resid(uio), retval, 0x80000000 | segflg, 0); return(retval); } int -is_file_clean(struct vnode *vp, off_t filesize) +is_file_clean(vnode_t vp, off_t filesize) { off_t f_offset; int flags; @@ -4168,7 +4684,6 @@ struct vfs_drt_clustermap { * lastclean, iskips */ -static void vfs_drt_sanity(struct vfs_drt_clustermap *cmap); static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp); static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap); static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap, @@ -4321,8 +4836,6 @@ vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp) static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap) { - kern_return_t ret; - kmem_free(kernel_map, (vm_offset_t)cmap, (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION); return(KERN_SUCCESS); @@ -4335,8 +4848,7 @@ vfs_drt_free_map(struct vfs_drt_clustermap *cmap) static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp) { - kern_return_t kret; - int index, i, tries; + int index, i; offset = DRT_ALIGN_ADDRESS(offset); index = DRT_HASH(cmap, offset); @@ -4513,7 +5025,7 @@ vfs_drt_do_mark_pages( } } DRT_HASH_SET_COUNT(cmap, index, ecount); -next: + offset += pgcount * PAGE_SIZE; length -= pgcount * PAGE_SIZE; } @@ -4556,11 +5068,13 @@ vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp) return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1)); } +#if 0 static kern_return_t vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length) { return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0)); } +#endif /* * Get a cluster of dirty pages. @@ -4687,12 +5201,22 @@ vfs_drt_control(void **cmapp, int op_type) * Emit a summary of the state of the clustermap into the trace buffer * along with some caller-provided data. */ +#if KDEBUG static void -vfs_drt_trace(struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4) +vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4) { KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0); } +#else +static void +vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code, + __unused int arg1, __unused int arg2, __unused int arg3, + __unused int arg4) +{ +} +#endif +#if 0 /* * Perform basic sanity check on the hash entry summary count * vs. the actual bits set in the entry. @@ -4715,3 +5239,4 @@ vfs_drt_sanity(struct vfs_drt_clustermap *cmap) panic("bits_on = %d, index = %d\n", bits_on, index); } } +#endif diff --git a/bsd/vfs/vfs_conf.c b/bsd/vfs/vfs_conf.c index 7f0274f33..e02553392 100644 --- a/bsd/vfs/vfs_conf.c +++ b/bsd/vfs/vfs_conf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
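
vfs_drt_trace above is compiled in two flavors: a real KERNEL_DEBUG call under KDEBUG, and an empty stub with __unused parameters otherwise, so call sites never need their own conditionals. The same pattern in a freestanding sketch; TRACE_ENABLED and the trace body are assumptions here, not the kernel's macros:

    #include <stdio.h>

    /* flip to 1 to compile the real trace body */
    #define TRACE_ENABLED 0

    #if TRACE_ENABLED
    static void trace(int code, int a1, int a2)
    {
        fprintf(stderr, "trace %d: %d %d\n", code, a1, a2);
    }
    #else
    /* empty stub: callers stay unconditional and the optimizer drops
     * the call; the (void)-casts play the role of xnu's __unused */
    static void trace(int code, int a1, int a2)
    {
        (void)code; (void)a1; (void)a2;
    }
    #endif

    int main(void)
    {
        trace(79, 1, 2);     /* no #if needed at the call site */
        return 0;
    }
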
* * @APPLE_LICENSE_HEADER_START@ * @@ -57,8 +57,8 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/mount.h> -#include <sys/vnode.h> +#include <sys/mount_internal.h> +#include <sys/vnode_internal.h> /* * These define the root filesystem, device, and root filesystem type. @@ -71,7 +71,9 @@ int (*mountroot)() = NULL; * Set up the initial array of known filesystem types. */ extern struct vfsops ufs_vfsops; +#if FFS extern int ffs_mountroot(); +#endif extern struct vfsops mfs_vfsops; extern int mfs_mountroot(); extern struct vfsops hfs_vfsops; @@ -90,60 +92,60 @@ extern struct vfsops devfs_vfsops; /* * Set up the filesystem operations for vnodes. */ -static struct vfsconf vfsconflist[] = { +static struct vfstable vfsconflist[] = { /* HFS/HFS+ Filesystem */ #if HFS - { &hfs_vfsops, "hfs", 17, 0, MNT_LOCAL | MNT_DOVOLFS, hfs_mountroot, NULL }, + { &hfs_vfsops, "hfs", 17, 0, MNT_LOCAL | MNT_DOVOLFS, hfs_mountroot, NULL, 0, {0}, VFC_VFSLOCALARGS, 0, 0 }, #endif /* Fast Filesystem */ #if FFS - { &ufs_vfsops, "ufs", 1, 0, MNT_LOCAL, ffs_mountroot, NULL }, + { &ufs_vfsops, "ufs", 1, 0, MNT_LOCAL, ffs_mountroot, NULL, 0, {0}, VFC_VFSLOCALARGS, 0, 0 }, #endif /* ISO9660 (aka CDROM) Filesystem */ #if CD9660 - { &cd9660_vfsops, "cd9660", 14, 0, MNT_LOCAL, cd9660_mountroot, NULL }, + { &cd9660_vfsops, "cd9660", 14, 0, MNT_LOCAL, cd9660_mountroot, NULL, 0, {0}, VFC_VFSLOCALARGS, 0, 0 }, #endif /* Memory-based Filesystem */ #if MFS - { &mfs_vfsops, "mfs", 3, 0, MNT_LOCAL, mfs_mountroot, NULL }, + { &mfs_vfsops, "mfs", 3, 0, MNT_LOCAL, mfs_mountroot, NULL, 0, {0}, VFC_VFSGENERICARGS , 0, 0}, #endif /* Sun-compatible Network Filesystem */ #if NFSCLIENT - { &nfs_vfsops, "nfs", 2, 0, 0, nfs_mountroot, NULL }, + { &nfs_vfsops, "nfs", 2, 0, 0, nfs_mountroot, NULL, 0, {0}, VFC_VFSGENERICARGS , 0, 0}, #endif /* Andrew Filesystem */ #if AFS - { &afs_vfsops, "andrewfs", 13, 0, 0, afs_mountroot, NULL }, + { &afs_vfsops, "andrewfs", 13, 0, 0, afs_mountroot, NULL, 0, {0}, VFC_VFSGENERICARGS , 0, 0}, #endif /* Loopback (Minimal) Filesystem Layer */ #if NULLFS - { &null_vfsops, "loopback", 9, 0, 0, NULL, NULL }, + { &null_vfsops, "loopback", 9, 0, 0, NULL, NULL, 0, {0}, VFC_VFSGENERICARGS , 0, 0}, #endif /* Union (translucent) Filesystem */ #if UNION - { &union_vfsops, "union", 15, 0, 0, NULL, NULL }, + { &union_vfsops, "union", 15, 0, 0, NULL, NULL, 0, {0}, VFC_VFSGENERICARGS , 0, 0}, #endif /* File Descriptor Filesystem */ #if FDESC - { &fdesc_vfsops, "fdesc", 7, 0, 0, NULL, NULL }, + { &fdesc_vfsops, "fdesc", 7, 0, 0, NULL, NULL, 0, {0}, VFC_VFSGENERICARGS , 0, 0}, #endif /* Volume ID Filesystem */ #if VOLFS - { &volfs_vfsops, "volfs", 18, 0, 0, NULL, NULL }, + { &volfs_vfsops, "volfs", 18, 0, 0, NULL, NULL, 0, {0}, VFC_VFSGENERICARGS , 0, 0}, #endif /* Device Filesystem */ #if DEVFS - { &devfs_vfsops, "devfs", 19, 0, 0, NULL, NULL }, + { &devfs_vfsops, "devfs", 19, 0, 0, NULL, NULL, 0, {0}, VFC_VFSGENERICARGS , 0, 0}, #endif {0}, @@ -169,7 +171,7 @@ static struct vfsconf vfsconflist[] = { int maxvfsslots = sizeof(vfsconflist) / sizeof (struct vfsconf); int numused_vfsslots = 0; int maxvfsconf = sizeof(vfsconflist) / sizeof (struct vfsconf); -struct vfsconf *vfsconf = vfsconflist; +struct vfstable *vfsconf = vfsconflist; /* * @@ -178,9 +180,11 @@ struct vfsconf *vfsconf = vfsconflist; * vectors. It is NULL terminated. 
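
One thing worth flagging in the hunk above: vfsconflist now holds struct vfstable entries, but the unchanged context lines still compute maxvfsslots and maxvfsconf by dividing by sizeof(struct vfsconf), which only yields the right count while the two structs happen to be the same size. Dividing by the size of the array's own element sidesteps that hazard entirely, as in this small sketch:

    #include <stdio.h>

    struct entry {
        const char *name;
        int         typenum;
    };

    static struct entry table[] = {
        { "hfs",    17 },
        { "ufs",     1 },
        { "cd9660", 14 },
    };

    /* derive the count from the array itself so it can never drift
     * from the element type, even if struct entry grows later */
    #define TABLE_SLOTS (sizeof(table) / sizeof(table[0]))

    int main(void)
    {
        printf("%zu slots\n", TABLE_SLOTS);   /* prints 3 */
        return 0;
    }
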
* */ +#if FFS extern struct vnodeopv_desc ffs_vnodeop_opv_desc; extern struct vnodeopv_desc ffs_specop_opv_desc; extern struct vnodeopv_desc ffs_fifoop_opv_desc; +#endif extern struct vnodeopv_desc mfs_vnodeop_opv_desc; extern struct vnodeopv_desc dead_vnodeop_opv_desc; extern struct vnodeopv_desc fifo_vnodeop_opv_desc; @@ -203,10 +207,12 @@ extern struct vnodeopv_desc devfs_vnodeop_opv_desc; extern struct vnodeopv_desc devfs_spec_vnodeop_opv_desc; struct vnodeopv_desc *vfs_opv_descs[] = { +#if FFS &ffs_vnodeop_opv_desc, &ffs_specop_opv_desc, #if FIFO &ffs_fifoop_opv_desc, +#endif #endif &dead_vnodeop_opv_desc, #if FIFO diff --git a/bsd/vfs/vfs_fsevents.c b/bsd/vfs/vfs_fsevents.c new file mode 100644 index 000000000..5949b796a --- /dev/null +++ b/bsd/vfs/vfs_fsevents.c @@ -0,0 +1,1402 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#include <stdarg.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/namei.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/file_internal.h> +#include <sys/stat.h> +#include <sys/vnode_internal.h> +#include <sys/mount_internal.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> +#include <sys/uio.h> +#include <sys/malloc.h> +#include <sys/dirent.h> +#include <sys/attr.h> +#include <sys/sysctl.h> +#include <sys/ubc.h> +#include <machine/cons.h> +#include <miscfs/specfs/specdev.h> +#include <miscfs/devfs/devfs.h> +#include <sys/filio.h> +#include <architecture/byte_order.h> +#include <kern/locks.h> +#include <libkern/OSAtomic.h> + +#include <bsm/audit_kernel.h> +#include <bsm/audit_kevents.h> + +// where all our structs and defines come from +#include <sys/fsevents.h> + + +typedef struct kfs_event_arg { + u_int16_t type; + u_int16_t len; + union { + struct vnode *vp; + char *str; + void *ptr; + int32_t int32; + dev_t dev; + ino_t ino; + int32_t mode; + uid_t uid; + gid_t gid; + } data; +}kfs_event_arg; + +#define KFS_NUM_ARGS FSE_MAX_ARGS +typedef struct kfs_event { + int32_t type; // type code of this event + u_int32_t refcount; // number of clients referencing this + pid_t pid; // pid of the process that did the op + kfs_event_arg args[KFS_NUM_ARGS]; +} kfs_event; + + +typedef struct fs_event_watcher { + SLIST_ENTRY(fs_event_watcher) link; + int8_t *event_list; // the events we're interested in + int32_t num_events; + dev_t *devices_to_watch; // only report events from these devices + uint32_t num_devices; + int32_t flags; + kfs_event **event_queue; + int32_t eventq_size; // number of event pointers in queue + int32_t rd, wr; // indices to the event_queue + int32_t blockers; +} fs_event_watcher; + +// fs_event_watcher flags +#define 
WATCHER_DROPPED_EVENTS 0x0001 +#define WATCHER_CLOSING 0x0002 + +static SLIST_HEAD(watch_list, fs_event_watcher) watch_list_head = { NULL }; + + +#define MAX_KFS_EVENTS 2048 + +// this array holds each pending event +static kfs_event fs_event_buf[MAX_KFS_EVENTS]; +static int free_event_idx = 0; +static int fs_event_init = 0; + +// +// this array records whether anyone is interested in a +// particular type of event. if no one is, we bail out +// early from the event delivery +// +static int16_t fs_event_type_watchers[FSE_MAX_EVENTS]; + +static int watcher_add_event(fs_event_watcher *watcher, kfs_event *kfse); + +// +// Locks +// +static lck_grp_attr_t * fsevent_group_attr; +static lck_attr_t * fsevent_lock_attr; +static lck_grp_t * fsevent_mutex_group; + +static lck_grp_t * fsevent_rw_group; + +static lck_rw_t fsevent_big_lock; // always grab this first +static lck_mtx_t watch_list_lock; +static lck_mtx_t event_buf_lock; + + +static void init_pathbuff(void); + + +static void +fsevents_internal_init(void) +{ + int i; + + if (fs_event_init++ != 0) { + return; + } + + for(i=0; i < FSE_MAX_EVENTS; i++) { + fs_event_type_watchers[i] = 0; + } + + for(i=0; i < MAX_KFS_EVENTS; i++) { + fs_event_buf[i].type = FSE_INVALID; + fs_event_buf[i].refcount = 0; + } + + SLIST_INIT(&watch_list_head); + + fsevent_lock_attr = lck_attr_alloc_init(); + fsevent_group_attr = lck_grp_attr_alloc_init(); + fsevent_mutex_group = lck_grp_alloc_init("fsevent-mutex", fsevent_group_attr); + fsevent_rw_group = lck_grp_alloc_init("fsevent-rw", fsevent_group_attr); + + lck_mtx_init(&watch_list_lock, fsevent_mutex_group, fsevent_lock_attr); + lck_mtx_init(&event_buf_lock, fsevent_mutex_group, fsevent_lock_attr); + + lck_rw_init(&fsevent_big_lock, fsevent_rw_group, fsevent_lock_attr); + + init_pathbuff(); +} + +static void +lock_watch_list(void) +{ + lck_mtx_lock(&watch_list_lock); +} + +static void +unlock_watch_list(void) +{ + lck_mtx_unlock(&watch_list_lock); +} + +static void +lock_fs_event_buf(void) +{ + lck_mtx_lock(&event_buf_lock); +} + +static void +unlock_fs_event_buf(void) +{ + lck_mtx_unlock(&event_buf_lock); +} + +// forward prototype +static void do_free_event(kfs_event *kfse); + +static int +watcher_cares_about_dev(fs_event_watcher *watcher, dev_t dev) +{ + unsigned int i; + + // if there is not list of devices to watch, then always + // say we're interested so we'll report all events from + // all devices + if (watcher->devices_to_watch == NULL) { + return 1; + } + + for(i=0; i < watcher->num_devices; i++) { + if (dev == watcher->devices_to_watch[i]) { + // found a match! that means we want events + // from this device. + return 1; + } + } + + // if we're here it's not in the devices_to_watch[] + // list so that means we do not care about it + return 0; +} + + +int +need_fsevent(int type, vnode_t vp) +{ + fs_event_watcher *watcher; + dev_t dev; + + if (fs_event_type_watchers[type] == 0) + return (0); + dev = (dev_t)(vp->v_mount->mnt_vfsstat.f_fsid.val[0]); + + lock_watch_list(); + + SLIST_FOREACH(watcher, &watch_list_head, link) { + if (watcher->event_list[type] == FSE_REPORT && watcher_cares_about_dev(watcher, dev)) { + unlock_watch_list(); + return (1); + } + } + unlock_watch_list(); + + return (0); +} + + +int +add_fsevent(int type, vfs_context_t ctx, ...) 
+{ + struct proc *p = vfs_context_proc(ctx); + int i, arg_idx, num_deliveries=0; + kfs_event_arg *kea; + kfs_event *kfse; + fs_event_watcher *watcher; + va_list ap; + int error = 0; + dev_t dev = 0; + + va_start(ap, ctx); + + // if no one cares about this type of event, bail out + if (fs_event_type_watchers[type] == 0) { + va_end(ap); + return 0; + } + + lck_rw_lock_shared(&fsevent_big_lock); + + // find a free event and snag it for our use + // NOTE: do not do anything that would block until + // the lock is dropped. + lock_fs_event_buf(); + + for(i=0; i < MAX_KFS_EVENTS; i++) { + if (fs_event_buf[(free_event_idx + i) % MAX_KFS_EVENTS].type == FSE_INVALID) { + break; + } + } + + if (i >= MAX_KFS_EVENTS) { + // yikes! no free slots + unlock_fs_event_buf(); + va_end(ap); + + lock_watch_list(); + SLIST_FOREACH(watcher, &watch_list_head, link) { + watcher->flags |= WATCHER_DROPPED_EVENTS; + wakeup((caddr_t)watcher); + } + unlock_watch_list(); + lck_rw_done(&fsevent_big_lock); + + printf("fs_events: add_event: event queue is full! dropping events.\n"); + return ENOSPC; + } + + kfse = &fs_event_buf[(free_event_idx + i) % MAX_KFS_EVENTS]; + + free_event_idx++; + + kfse->type = type; + kfse->refcount = 0; + kfse->pid = p->p_pid; + + unlock_fs_event_buf(); // at this point it's safe to unlock + + // + // now process the arguments passed in and copy them into + // the kfse + // + arg_idx = 0; + while(arg_idx < KFS_NUM_ARGS) { + kea = &kfse->args[arg_idx++]; + kea->type = va_arg(ap, int32_t); + + if (kea->type == FSE_ARG_DONE) { + break; + } + + switch(kea->type) { + case FSE_ARG_VNODE: { + // this expands out into multiple arguments to the client + struct vnode *vp; + struct vnode_attr va; + + kea->data.vp = vp = va_arg(ap, struct vnode *); + if (kea->data.vp == NULL) { + panic("add_fsevent: you can't pass me a NULL vnode ptr (type %d)!\n", + kfse->type); + } + + if (vnode_ref_ext(kea->data.vp, O_EVTONLY) != 0) { + kea->type = FSE_ARG_DONE; + + error = EINVAL; + goto clean_up; + } + VATTR_INIT(&va); + VATTR_WANTED(&va, va_fsid); + VATTR_WANTED(&va, va_fileid); + VATTR_WANTED(&va, va_mode); + VATTR_WANTED(&va, va_uid); + VATTR_WANTED(&va, va_gid); + if (vnode_getattr(kea->data.vp, &va, ctx) != 0) { + vnode_rele_ext(kea->data.vp, O_EVTONLY, 0); + kea->type = FSE_ARG_DONE; + + error = EINVAL; + goto clean_up; + } + + kea++; + kea->type = FSE_ARG_DEV; + kea->data.dev = dev = (dev_t)va.va_fsid; + + kea++; + kea->type = FSE_ARG_INO; + kea->data.ino = (ino_t)va.va_fileid; + + kea++; + kea->type = FSE_ARG_MODE; + kea->data.mode = (int32_t)vnode_vttoif(vnode_vtype(vp)) | va.va_mode; + + kea++; + kea->type = FSE_ARG_UID; + kea->data.uid = va.va_uid; + + kea++; + kea->type = FSE_ARG_GID; + kea->data.gid = va.va_gid; + arg_idx += 5; + break; + } + + case FSE_ARG_FINFO: { + fse_info *fse; + + fse = va_arg(ap, fse_info *); + + kea->type = FSE_ARG_DEV; + kea->data.dev = dev = (dev_t)fse->dev; + + kea++; + kea->type = FSE_ARG_INO; + kea->data.ino = (ino_t)fse->ino; + + kea++; + kea->type = FSE_ARG_MODE; + kea->data.mode = (int32_t)fse->mode; + + kea++; + kea->type = FSE_ARG_UID; + kea->data.uid = (uid_t)fse->uid; + + kea++; + kea->type = FSE_ARG_GID; + kea->data.gid = (uid_t)fse->gid; + arg_idx += 4; + break; + } + + case FSE_ARG_STRING: + kea->len = (int16_t)(va_arg(ap, int32_t) & 0xffff); + kea->data.str = vfs_addname(va_arg(ap, char *), kea->len, 0, 0); + break; + + case FSE_ARG_INT32: + kea->data.int32 = va_arg(ap, int32_t); + break; + + case FSE_ARG_INT64: + printf("fs_events: 64-bit args not 
implemented.\n"); +// kea->data.int64 = va_arg(ap, int64_t); + break; + + case FSE_ARG_RAW: + kea->len = (int16_t)(va_arg(ap, int32_t) & 0xffff); + MALLOC(kea->data.ptr, void *, kea->len, M_TEMP, M_WAITOK); + memcpy(kea->data.ptr, va_arg(ap, void *), kea->len); + break; + + case FSE_ARG_DEV: + kea->data.dev = dev = va_arg(ap, dev_t); + break; + + case FSE_ARG_MODE: + kea->data.mode = va_arg(ap, int32_t); + break; + + case FSE_ARG_INO: + kea->data.ino = va_arg(ap, ino_t); + break; + + case FSE_ARG_UID: + kea->data.uid = va_arg(ap, uid_t); + break; + + case FSE_ARG_GID: + kea->data.gid = va_arg(ap, gid_t); + break; + + default: + printf("add_fsevent: unknown type %d\n", kea->type); + // just skip one 32-bit word and hope we sync up... + (void)va_arg(ap, int32_t); + } + } + + va_end(ap); + + // + // now we have to go and let everyone know that + // is interested in this type of event... + // + lock_watch_list(); + + SLIST_FOREACH(watcher, &watch_list_head, link) { + if (watcher->event_list[type] == FSE_REPORT && watcher_cares_about_dev(watcher, dev)) { + if (watcher_add_event(watcher, kfse) == 0) { + num_deliveries++; + } + } + } + + unlock_watch_list(); + + clean_up: + // just in case no one was interested after all... + if (num_deliveries == 0) { + do_free_event(kfse); + free_event_idx = (int)(kfse - &fs_event_buf[0]); + } + + lck_rw_done(&fsevent_big_lock); + return error; +} + +static void +do_free_event(kfs_event *kfse) +{ + int i; + kfs_event_arg *kea, all_args[KFS_NUM_ARGS]; + + lock_fs_event_buf(); + + // mark this fsevent as invalid + kfse->type = FSE_INVALID; + + // make a copy of this so we can free things without + // holding the fs_event_buf lock + // + memcpy(&all_args[0], &kfse->args[0], sizeof(all_args)); + + // and just to be anal, set this so that there are no args + kfse->args[0].type = FSE_ARG_DONE; + + free_event_idx = (kfse - fs_event_buf); + + unlock_fs_event_buf(); + + for(i=0; i < KFS_NUM_ARGS; i++) { + kea = &all_args[i]; + if (kea->type == FSE_ARG_DONE) { + break; + } + + switch(kea->type) { + case FSE_ARG_VNODE: + vnode_rele_ext(kea->data.vp, O_EVTONLY, 0); + break; + case FSE_ARG_STRING: + vfs_removename(kea->data.str); + break; + case FSE_ARG_RAW: + FREE(kea->data.ptr, M_TEMP); + break; + } + } +} + + +static int +add_watcher(int8_t *event_list, int32_t num_events, int32_t eventq_size, fs_event_watcher **watcher_out) +{ + int i; + fs_event_watcher *watcher; + + if (eventq_size < 0 || eventq_size > MAX_KFS_EVENTS) { + eventq_size = MAX_KFS_EVENTS; + } + + // Note: the event_queue follows the fs_event_watcher struct + // in memory so we only have to do one allocation + MALLOC(watcher, + fs_event_watcher *, + sizeof(fs_event_watcher) + eventq_size * sizeof(kfs_event *), + M_TEMP, M_WAITOK); + + watcher->event_list = event_list; + watcher->num_events = num_events; + watcher->devices_to_watch = NULL; + watcher->num_devices = 0; + watcher->flags = 0; + watcher->event_queue = (kfs_event **)&watcher[1]; + watcher->eventq_size = eventq_size; + watcher->rd = 0; + watcher->wr = 0; + watcher->blockers = 0; + + lock_watch_list(); + + // now update the global list of who's interested in + // events of a particular type... 
+ for(i=0; i < num_events; i++) { + if (event_list[i] != FSE_IGNORE && i < FSE_MAX_EVENTS) { + fs_event_type_watchers[i]++; + } + } + + SLIST_INSERT_HEAD(&watch_list_head, watcher, link); + + unlock_watch_list(); + + *watcher_out = watcher; + + return 0; +} + +static void +remove_watcher(fs_event_watcher *target) +{ + int i; + fs_event_watcher *watcher; + kfs_event *kfse; + + lck_rw_lock_shared(&fsevent_big_lock); + + lock_watch_list(); + + SLIST_FOREACH(watcher, &watch_list_head, link) { + if (watcher == target) { + SLIST_REMOVE(&watch_list_head, watcher, fs_event_watcher, link); + + for(i=0; i < watcher->num_events; i++) { + if (watcher->event_list[i] != FSE_IGNORE && i < FSE_MAX_EVENTS) { + fs_event_type_watchers[i]--; + } + } + + unlock_watch_list(); + + // drain the event_queue + for(i=watcher->rd; i != watcher->wr; i=(i+1) % watcher->eventq_size) { + kfse = watcher->event_queue[i]; + + if (OSAddAtomic(-1, (SInt32 *)&kfse->refcount) == 1) { + do_free_event(kfse); + } + } + + if (watcher->event_list) { + FREE(watcher->event_list, M_TEMP); + watcher->event_list = NULL; + } + if (watcher->devices_to_watch) { + FREE(watcher->devices_to_watch, M_TEMP); + watcher->devices_to_watch = NULL; + } + FREE(watcher, M_TEMP); + + lck_rw_done(&fsevent_big_lock); + return; + } + } + + unlock_watch_list(); + lck_rw_done(&fsevent_big_lock); +} + + +static int +watcher_add_event(fs_event_watcher *watcher, kfs_event *kfse) +{ + if (((watcher->wr + 1) % watcher->eventq_size) == watcher->rd) { + watcher->flags |= WATCHER_DROPPED_EVENTS; + wakeup((caddr_t)watcher); + return ENOSPC; + } + + watcher->event_queue[watcher->wr] = kfse; + OSAddAtomic(1, (SInt32 *)&kfse->refcount); + watcher->wr = (watcher->wr + 1) % watcher->eventq_size; + + // wake up the watcher if he's waiting! + wakeup((caddr_t)watcher); + + return 0; +} + + +static int +fmod_watch(fs_event_watcher *watcher, struct uio *uio) +{ + int i, error=0, last_full_event_resid; + kfs_event *kfse; + kfs_event_arg *kea; + uint16_t tmp16; + + // LP64todo - fix this + last_full_event_resid = uio_resid(uio); + + // need at least 2048 bytes of space (maxpathlen + 1 event buf) + if (uio_resid(uio) < 2048 || watcher == NULL) { + return EINVAL; + } + + + if (watcher->rd == watcher->wr) { + if (watcher->flags & WATCHER_CLOSING) { + return 0; + } + OSAddAtomic(1, (SInt32 *)&watcher->blockers); + + // there's nothing to do, go to sleep + error = tsleep((caddr_t)watcher, PUSER|PCATCH, "fsevents_empty", 0); + + OSAddAtomic(-1, (SInt32 *)&watcher->blockers); + + if (error != 0 || (watcher->flags & WATCHER_CLOSING)) { + return error; + } + } + + // if we dropped events, return that as an event first + if (watcher->flags & WATCHER_DROPPED_EVENTS) { + int32_t val = FSE_EVENTS_DROPPED; + + error = uiomove((caddr_t)&val, sizeof(int32_t), uio); + if (error == 0) { + val = 0; // a fake pid + error = uiomove((caddr_t)&val, sizeof(int32_t), uio); + + tmp16 = FSE_ARG_DONE; // makes it a consistent msg + error = uiomove((caddr_t)&tmp16, sizeof(int16_t), uio); + } + + if (error) { + return error; + } + + watcher->flags &= ~WATCHER_DROPPED_EVENTS; + } + +// check if the next chunk of data will fit in the user's +// buffer. if not, just goto get_out which will return +// the number of bytes worth of events that we did read. +// this leaves the event that didn't fit in the queue. 
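
watcher_add_event and fmod_watch above share a classic ring-buffer convention: rd == wr means empty, and (wr + 1) % eventq_size == rd means full, so one slot is always sacrificed and at most eventq_size - 1 events are ever queued. Reduced to a sketch:

    #include <stdio.h>

    #define QSIZE 4                      /* holds at most QSIZE - 1 items */

    static int q[QSIZE];
    static int rd = 0, wr = 0;

    static int q_put(int v)
    {
        if ((wr + 1) % QSIZE == rd)      /* full: next write would hit rd */
            return -1;
        q[wr] = v;
        wr = (wr + 1) % QSIZE;
        return 0;
    }

    static int q_get(int *v)
    {
        if (rd == wr)                    /* empty */
            return -1;
        *v = q[rd];
        rd = (rd + 1) % QSIZE;
        return 0;
    }

    int main(void)
    {
        int i, v;

        for (i = 0; i < 5; i++)
            printf("put %d -> %s\n", i, q_put(i) ? "dropped" : "ok");
        while (q_get(&v) == 0)
            printf("got %d\n", v);
        return 0;
    }
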
+// + // LP64todo - fix this +#define CHECK_UPTR(size) if (size > (unsigned)uio_resid(uio)) { \ + uio_setresid(uio, last_full_event_resid); \ + goto get_out; \ + } + + for (; uio_resid(uio) > 0 && watcher->rd != watcher->wr; ) { + kfse = watcher->event_queue[watcher->rd]; + + // copy out the type of the event + CHECK_UPTR(sizeof(int32_t)); + if ((error = uiomove((caddr_t)&kfse->type, sizeof(int32_t), uio)) != 0) { + goto get_out; + } + + // now copy out the pid of the person that changed the file + CHECK_UPTR(sizeof(pid_t)); + if ((error = uiomove((caddr_t)&kfse->pid, sizeof(pid_t), uio)) != 0) { + goto get_out; + } + + error = 0; + for(i=0; i < KFS_NUM_ARGS && error == 0; i++) { + char *pathbuff; + int pathbuff_len; + + kea = &kfse->args[i]; + + tmp16 = (uint16_t)kea->type; + CHECK_UPTR(sizeof(uint16_t)); + error = uiomove((caddr_t)&tmp16, sizeof(uint16_t), uio); + if (error || kea->type == FSE_ARG_DONE) { + break; + } + + switch(kea->type) { + case FSE_ARG_VNODE: + pathbuff = get_pathbuff(); + pathbuff_len = MAXPATHLEN; + if (kea->data.vp == NULL) { + printf("fmod_watch: whoa... vp == NULL (%d)!\n", kfse->type); + i--; + release_pathbuff(pathbuff); + continue; + } + + if (vn_getpath(kea->data.vp, pathbuff, &pathbuff_len) != 0 || pathbuff[0] == '\0') { +// printf("fmod_watch: vn_getpath failed! vp 0x%x vname 0x%x (%s) vparent 0x%x\n", +// kea->data.vp, +// VNAME(kea->data.vp), +// VNAME(kea->data.vp) ? VNAME(kea->data.vp) : "<null>", +// VPARENT(kea->data.vp)); + } + CHECK_UPTR(sizeof(uint16_t)); + tmp16 = (uint16_t)pathbuff_len; + error = uiomove((caddr_t)&tmp16, sizeof(uint16_t), uio); + + CHECK_UPTR((unsigned)pathbuff_len); + error = uiomove((caddr_t)pathbuff, pathbuff_len, uio); + release_pathbuff(pathbuff); + break; + + + case FSE_ARG_STRING: + tmp16 = (int32_t)kea->len; + CHECK_UPTR(sizeof(uint16_t)); + error = uiomove((caddr_t)&tmp16, sizeof(uint16_t), uio); + + CHECK_UPTR(kea->len); + error = uiomove((caddr_t)kea->data.str, kea->len, uio); + break; + + case FSE_ARG_INT32: + CHECK_UPTR(sizeof(uint16_t) + sizeof(int32_t)); + tmp16 = sizeof(int32_t); + error = uiomove((caddr_t)&tmp16, sizeof(uint16_t), uio); + error = uiomove((caddr_t)&kea->data.int32, sizeof(int32_t), uio); + break; + + case FSE_ARG_INT64: + printf("fs_events: 64-bit args not implemented on copyout.\n"); +// CHECK_UPTR(sizeof(uint16_t) + sizeof(int64_t)); +// tmp16 = sizeof(int64_t); +// error = uiomove((caddr_t)&tmp16, sizeof(uint16_t), uio); +// error = uiomove((caddr_t)&kea->data.int64, sizeof(int64_t), uio); + break; + + case FSE_ARG_RAW: + tmp16 = (uint16_t)kea->len; + CHECK_UPTR(sizeof(uint16_t)); + error = uiomove((caddr_t)&tmp16, sizeof(uint16_t), uio); + + CHECK_UPTR(kea->len); + error = uiomove((caddr_t)kea->data.ptr, kea->len, uio); + break; + + case FSE_ARG_DEV: + CHECK_UPTR(sizeof(uint16_t) + sizeof(dev_t)); + tmp16 = sizeof(dev_t); + error = uiomove((caddr_t)&tmp16, sizeof(uint16_t), uio); + error = uiomove((caddr_t)&kea->data.dev, sizeof(dev_t), uio); + break; + + case FSE_ARG_INO: + CHECK_UPTR(sizeof(uint16_t) + sizeof(ino_t)); + tmp16 = sizeof(ino_t); + error = uiomove((caddr_t)&tmp16, sizeof(uint16_t), uio); + error = uiomove((caddr_t)&kea->data.ino, sizeof(ino_t), uio); + break; + + case FSE_ARG_MODE: + // XXXdbg - NOTE: we use 32-bits for the mode, not + // 16-bits like a real mode_t + CHECK_UPTR(sizeof(uint16_t) + sizeof(int32_t)); + tmp16 = sizeof(int32_t); + error = uiomove((caddr_t)&tmp16, sizeof(uint16_t), uio); + error = uiomove((caddr_t)&kea->data.mode, sizeof(int32_t), uio); + break; + + 
case FSE_ARG_UID: + CHECK_UPTR(sizeof(uint16_t) + sizeof(uid_t)); + tmp16 = sizeof(uid_t); + error = uiomove((caddr_t)&tmp16, sizeof(uint16_t), uio); + error = uiomove((caddr_t)&kea->data.uid, sizeof(uid_t), uio); + break; + + case FSE_ARG_GID: + CHECK_UPTR(sizeof(uint16_t) + sizeof(gid_t)); + tmp16 = sizeof(gid_t); + error = uiomove((caddr_t)&tmp16, sizeof(uint16_t), uio); + error = uiomove((caddr_t)&kea->data.gid, sizeof(gid_t), uio); + break; + + default: + printf("fmod_watch: unknown arg type %d.\n", kea->type); + break; + } + } + + // make sure that we always end with a FSE_ARG_DONE + if (i >= KFS_NUM_ARGS) { + tmp16 = FSE_ARG_DONE; + CHECK_UPTR(sizeof(uint16_t)); + error = uiomove((caddr_t)&tmp16, sizeof(uint16_t), uio); + } + + + // LP64todo - fix this + last_full_event_resid = uio_resid(uio); + + watcher->rd = (watcher->rd + 1) % watcher->eventq_size; + + if (OSAddAtomic(-1, (SInt32 *)&kfse->refcount) == 1) { + do_free_event(kfse); + } + } + + get_out: + return error; +} + + +// release any references we might have on vnodes which are +// the mount point passed to us (so that it can be cleanly +// unmounted). +// +// since we don't want to lose the events we'll convert the +// vnode refs to the full path, inode #, and uid. +// +void +fsevent_unmount(struct mount *mp) +{ + int i, j; + kfs_event *kfse; + kfs_event_arg *kea; + + lck_rw_lock_exclusive(&fsevent_big_lock); + lock_fs_event_buf(); + + for(i=0; i < MAX_KFS_EVENTS; i++) { + if (fs_event_buf[i].type == FSE_INVALID) { + continue; + } + + kfse = &fs_event_buf[i]; + for(j=0; j < KFS_NUM_ARGS; j++) { + kea = &kfse->args[j]; + if (kea->type == FSE_ARG_DONE) { + break; + } + + if (kea->type == FSE_ARG_VNODE && kea->data.vp->v_mount == mp) { + struct vnode *vp; + char *pathbuff; + int pathbuff_len; + + vp = kea->data.vp; + pathbuff = get_pathbuff(); + pathbuff_len = MAXPATHLEN; + + if (vn_getpath(vp, pathbuff, &pathbuff_len) != 0 || pathbuff[0] == '\0') { + char *vname; + + vname = vnode_getname(vp); + + printf("fsevent_unmount: vn_getpath failed! vp 0x%x vname 0x%x (%s) vparent 0x%x\n", + vp, vname, vname ? 
vname : "<null>", vp->v_parent); + + if (vname) + vnode_putname(vname); + } + + // switch the type of the string + kea->type = FSE_ARG_STRING; + kea->data.str = vfs_addname(pathbuff, pathbuff_len, 0, 0); + kea->len = pathbuff_len; + release_pathbuff(pathbuff); + + // and finally let go of the reference on the vnode + vnode_rele_ext(vp, O_EVTONLY, 0); + } + } + } + + unlock_fs_event_buf(); + lck_rw_done(&fsevent_big_lock); +} + + +// +// /dev/fsevents device code +// +static int fsevents_installed = 0; +static struct lock__bsd__ fsevents_lck; + +typedef struct fsevent_handle { + fs_event_watcher *watcher; + struct selinfo si; +} fsevent_handle; + + +static int +fseventsf_read(struct fileproc *fp, struct uio *uio, + __unused kauth_cred_t *cred, __unused int flags, + __unused struct proc *p) +{ + fsevent_handle *fseh = (struct fsevent_handle *)fp->f_fglob->fg_data; + int error; + + error = fmod_watch(fseh->watcher, uio); + + return error; +} + +static int +fseventsf_write(__unused struct fileproc *fp, __unused struct uio *uio, + __unused kauth_cred_t *cred, __unused int flags, + __unused struct proc *p) +{ + return EIO; +} + + +static int +fseventsf_ioctl(struct fileproc *fp, u_long cmd, caddr_t data, struct proc *p) +{ + fsevent_handle *fseh = (struct fsevent_handle *)fp->f_fglob->fg_data; + int ret = 0; + pid_t pid = 0; + fsevent_dev_filter_args *devfilt_args=(fsevent_dev_filter_args *)data; + + switch (cmd) { + case FIONBIO: + case FIOASYNC: + return 0; + + case FSEVENTS_DEVICE_FILTER: { + int new_num_devices; + dev_t *devices_to_watch, *tmp=NULL; + + if (devfilt_args->num_devices > 256) { + ret = EINVAL; + break; + } + + new_num_devices = devfilt_args->num_devices; + if (new_num_devices == 0) { + tmp = fseh->watcher->devices_to_watch; + + lock_watch_list(); + fseh->watcher->devices_to_watch = NULL; + fseh->watcher->num_devices = new_num_devices; + unlock_watch_list(); + + if (tmp) { + FREE(tmp, M_TEMP); + } + break; + } + + MALLOC(devices_to_watch, dev_t *, + new_num_devices * sizeof(dev_t), + M_TEMP, M_WAITOK); + if (devices_to_watch == NULL) { + ret = ENOMEM; + break; + } + + ret = copyin(CAST_USER_ADDR_T(devfilt_args->devices), + (void *)devices_to_watch, + new_num_devices * sizeof(dev_t)); + if (ret) { + FREE(devices_to_watch, M_TEMP); + break; + } + + lock_watch_list(); + fseh->watcher->num_devices = new_num_devices; + tmp = fseh->watcher->devices_to_watch; + fseh->watcher->devices_to_watch = devices_to_watch; + unlock_watch_list(); + + if (tmp) { + FREE(tmp, M_TEMP); + } + + break; + } + + default: + ret = EINVAL; + break; + } + + return (ret); +} + + +static int +fseventsf_select(struct fileproc *fp, int which, void *wql, struct proc *p) +{ + fsevent_handle *fseh = (struct fsevent_handle *)fp->f_fglob->fg_data; + int ready = 0; + + if ((which != FREAD) || (fseh->watcher->flags & WATCHER_CLOSING)) { + return 0; + } + + + // if there's nothing in the queue, we're not ready + if (fseh->watcher->rd == fseh->watcher->wr) { + ready = 0; + } else { + ready = 1; + } + + if (!ready) { + selrecord(p, &fseh->si, wql); + } + + return ready; +} + + +static int +fseventsf_stat(struct fileproc *fp, struct stat *sb, struct proc *p) +{ + return ENOTSUP; +} + + +static int +fseventsf_close(struct fileglob *fg, struct proc *p) +{ + fsevent_handle *fseh = (struct fsevent_handle *)fg->fg_data; + + remove_watcher(fseh->watcher); + + fg->fg_data = NULL; + fseh->watcher = NULL; + FREE(fseh, M_TEMP); + + return 0; +} + +int +fseventsf_kqfilter(struct fileproc *fp, struct knote *kn, struct proc *p) +{ + // 
XXXdbg + return 0; +} + + +static int +fseventsf_drain(struct fileproc *fp, struct proc *p) +{ + int counter = 0; + fsevent_handle *fseh = (struct fsevent_handle *)fp->f_fglob->fg_data; + + fseh->watcher->flags |= WATCHER_CLOSING; + + // if there are people still waiting, sleep for 10ms to + // let them clean up and get out of there. however we + // also don't want to get stuck forever so if they don't + // exit after 5 seconds we're tearing things down anyway. + while(fseh->watcher->blockers && counter++ < 500) { + // issue wakeup in case anyone is blocked waiting for an event + // do this each time we wakeup in case the blocker missed + // the wakeup due to the unprotected test of WATCHER_CLOSING + // and decision to tsleep in fmod_watch... this bit of + // latency is a decent tradeoff against not having to + // take and drop a lock in fmod_watch + wakeup((caddr_t)fseh->watcher); + + tsleep((caddr_t)fseh->watcher, PRIBIO, "watcher-close", 1); + } + + return 0; +} + + +static int +fseventsopen(dev_t dev, int flag, int mode, struct proc *p) +{ + if (!is_suser()) { + return EPERM; + } + + return 0; +} + +static int +fseventsclose(dev_t dev, int flag, int mode, struct proc *p) +{ + return 0; +} + +static int +fseventsread(dev_t dev, struct uio *uio, int ioflag) +{ + return EIO; +} + +static int +fseventswrite(dev_t dev, struct uio *uio, int ioflag) +{ + return EIO; +} + + +static struct fileops fsevents_fops = { + fseventsf_read, + fseventsf_write, + fseventsf_ioctl, + fseventsf_select, + fseventsf_close, + fseventsf_kqfilter, + fseventsf_drain +}; + + + +static int +fseventsioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) +{ + struct fileproc *f; + int fd, error; + fsevent_handle *fseh = NULL; + fsevent_clone_args *fse_clone_args=(fsevent_clone_args *)data; + int8_t *event_list; + + switch (cmd) { + case FSEVENTS_CLONE: + if (fse_clone_args->num_events < 0 || fse_clone_args->num_events > 4096) { + return EINVAL; + } + + MALLOC(fseh, fsevent_handle *, sizeof(fsevent_handle), + M_TEMP, M_WAITOK); + memset(fseh, 0, sizeof(fsevent_handle)); + + MALLOC(event_list, int8_t *, + fse_clone_args->num_events * sizeof(int8_t), + M_TEMP, M_WAITOK); + + error = copyin(CAST_USER_ADDR_T(fse_clone_args->event_list), + (void *)event_list, + fse_clone_args->num_events * sizeof(int8_t)); + if (error) { + FREE(event_list, M_TEMP); + FREE(fseh, M_TEMP); + return error; + } + + error = add_watcher(event_list, + fse_clone_args->num_events, + fse_clone_args->event_queue_depth, + &fseh->watcher); + if (error) { + FREE(event_list, M_TEMP); + FREE(fseh, M_TEMP); + return error; + } + + error = falloc(p, &f, &fd); + if (error) { + FREE(event_list, M_TEMP); + FREE(fseh, M_TEMP); + return (error); + } + proc_fdlock(p); + f->f_fglob->fg_flag = FREAD | FWRITE; + f->f_fglob->fg_type = DTYPE_FSEVENTS; + f->f_fglob->fg_ops = &fsevents_fops; + f->f_fglob->fg_data = (caddr_t) fseh; + proc_fdunlock(p); + copyout((void *)&fd, CAST_USER_ADDR_T(fse_clone_args->fd), sizeof(int32_t)); + proc_fdlock(p); + *fdflags(p, fd) &= ~UF_RESERVED; + fp_drop(p, fd, f, 1); + proc_fdunlock(p); + break; + + default: + error = EINVAL; + break; + } + + return error; +} + +static int +fseventsselect(dev_t dev, int rw, struct proc *p) +{ + return 0; +} + +static void +fsevents_wakeup(fsevent_handle *fseh) +{ + wakeup((caddr_t)fseh); + selwakeup(&fseh->si); +} + + +/* + * A struct describing which functions will get invoked for certain + * actions. 
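
fseventsf_drain above deliberately bounds its wait: it keeps waking blocked readers and napping 10ms at a time, but gives up after roughly five seconds (500 polls) so teardown can never hang. A userland approximation of that bounded-polling shape, with nanosleep standing in for tsleep and a fake blockers count:

    #include <stdio.h>
    #include <time.h>

    static volatile int blockers = 3;   /* pretend readers are still inside */

    static void nap_10ms(void)
    {
        struct timespec ts = { 0, 10 * 1000 * 1000 };
        nanosleep(&ts, NULL);
    }

    int main(void)
    {
        int counter = 0;

        /* poll with an upper bound: ~500 * 10ms = 5 seconds, then
         * tear down anyway instead of waiting forever */
        while (blockers && counter++ < 500) {
            blockers--;                  /* stand-in for a reader leaving */
            nap_10ms();
        }
        printf("drained after %d polls (blockers=%d)\n", counter, blockers);
        return 0;
    }
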
+ */ +static struct cdevsw fsevents_cdevsw = +{ + fseventsopen, /* open */ + fseventsclose, /* close */ + fseventsread, /* read */ + fseventswrite, /* write */ + fseventsioctl, /* ioctl */ + nulldev, /* stop */ + nulldev, /* reset */ + NULL, /* tty's */ + eno_select, /* select */ + eno_mmap, /* mmap */ + eno_strat, /* strategy */ + eno_getc, /* getc */ + eno_putc, /* putc */ + 0 /* type */ +}; + + +/* + * Called to initialize our device, + * and to register ourselves with devfs + */ + +void +fsevents_init(void) +{ + int ret; + + if (fsevents_installed) { + return; + } + + fsevents_installed = 1; + + lockinit(&fsevents_lck, PLOCK, "fsevents", 0, 0); + + ret = cdevsw_add(-1, &fsevents_cdevsw); + if (ret < 0) { + fsevents_installed = 0; + return; + } + + devfs_make_node(makedev (ret, 0), DEVFS_CHAR, + UID_ROOT, GID_WHEEL, 0644, "fsevents", 0); + + fsevents_internal_init(); +} + + + +// +// XXXdbg - temporary path buffer handling +// +#define NUM_PATH_BUFFS 16 +static char path_buff[NUM_PATH_BUFFS][MAXPATHLEN]; +static char path_buff_inuse[NUM_PATH_BUFFS]; + +static lck_grp_attr_t * pathbuff_group_attr; +static lck_attr_t * pathbuff_lock_attr; +static lck_grp_t * pathbuff_mutex_group; +static lck_mtx_t pathbuff_lock; + +static void +init_pathbuff(void) +{ + pathbuff_lock_attr = lck_attr_alloc_init(); + pathbuff_group_attr = lck_grp_attr_alloc_init(); + pathbuff_mutex_group = lck_grp_alloc_init("pathbuff-mutex", pathbuff_group_attr); + + lck_mtx_init(&pathbuff_lock, pathbuff_mutex_group, pathbuff_lock_attr); +} + +static void +lock_pathbuff(void) +{ + lck_mtx_lock(&pathbuff_lock); +} + +static void +unlock_pathbuff(void) +{ + lck_mtx_unlock(&pathbuff_lock); +} + + +char * +get_pathbuff(void) +{ + int i; + + lock_pathbuff(); + for(i=0; i < NUM_PATH_BUFFS; i++) { + if (path_buff_inuse[i] == 0) { + break; + } + } + + if (i >= NUM_PATH_BUFFS) { + char *path; + + unlock_pathbuff(); + MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + return path; + } + + path_buff_inuse[i] = 1; + unlock_pathbuff(); + return &path_buff[i][0]; +} + +void +release_pathbuff(char *path) +{ + int i; + + if (path == NULL) { + return; + } + + lock_pathbuff(); + for(i=0; i < NUM_PATH_BUFFS; i++) { + if (path == &path_buff[i][0]) { + path_buff[i][0] = '\0'; + path_buff_inuse[i] = 0; + unlock_pathbuff(); + return; + } + } + + unlock_pathbuff(); + + // if we get here then it wasn't one of our temp buffers + FREE_ZONE(path, MAXPATHLEN, M_NAMEI); +} + +int +get_fse_info(struct vnode *vp, fse_info *fse, vfs_context_t ctx) +{ + struct vnode_attr va; + + VATTR_INIT(&va); + VATTR_WANTED(&va, va_fsid); + VATTR_WANTED(&va, va_fileid); + VATTR_WANTED(&va, va_mode); + VATTR_WANTED(&va, va_uid); + VATTR_WANTED(&va, va_gid); + if (vnode_getattr(vp, &va, ctx) != 0) { + return -1; + } + + fse->dev = (dev_t)va.va_fsid; + fse->ino = (ino_t)va.va_fileid; + fse->mode = (int32_t)vnode_vttoif(vnode_vtype(vp)) | va.va_mode; + fse->uid = (uid_t)va.va_uid; + fse->gid = (gid_t)va.va_gid; + + return 0; +} diff --git a/bsd/vfs/vfs_init.c b/bsd/vfs/vfs_init.c index 5074582f7..4c4aabb22 100644 --- a/bsd/vfs/vfs_init.c +++ b/bsd/vfs/vfs_init.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
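
The get_pathbuff/release_pathbuff helpers above keep a small static pool of MAXPATHLEN buffers and fall back to the zone allocator when every slot is busy; release decides which case it has by checking whether the pointer lands inside the static array. The same idea in a sketch, minus the mutex the kernel version wraps around it:

    #include <stdio.h>
    #include <stdlib.h>

    #define NBUFFS   4
    #define BUFLEN 256

    static char pool[NBUFFS][BUFLEN];
    static char pool_inuse[NBUFFS];

    static char *get_buff(void)
    {
        int i;

        for (i = 0; i < NBUFFS; i++) {
            if (!pool_inuse[i]) {
                pool_inuse[i] = 1;
                return pool[i];
            }
        }
        /* pool exhausted: fall back to the heap */
        return malloc(BUFLEN);
    }

    static void release_buff(char *p)
    {
        int i;

        for (i = 0; i < NBUFFS; i++) {
            if (p == pool[i]) {          /* one of ours: just mark it free */
                pool_inuse[i] = 0;
                return;
            }
        }
        free(p);                         /* heap fallback case */
    }

    int main(void)
    {
        char *a = get_buff(), *b = get_buff();

        printf("a=%p b=%p\n", (void *)a, (void *)b);
        release_buff(b);
        release_buff(a);
        return 0;
    }
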
* * @APPLE_LICENSE_HEADER_START@ * @@ -62,14 +62,13 @@ #include <sys/param.h> -#include <sys/mount.h> +#include <sys/mount_internal.h> #include <sys/time.h> #include <sys/vm.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/stat.h> #include <sys/namei.h> #include <sys/ucred.h> -#include <sys/buf.h> #include <sys/errno.h> #include <sys/malloc.h> @@ -106,7 +105,7 @@ int vn_default_error() { - return (EOPNOTSUPP); + return (ENOTSUP); } /* @@ -172,7 +171,7 @@ vfs_opv_init() */ if (opve_descp->opve_op->vdesc_offset == 0 && opve_descp->opve_op->vdesc_offset != - VOFFSET(vop_default)) { + VOFFSET(vnop_default)) { printf("operation %s not listed in %s.\n", opve_descp->opve_op->vdesc_name, "vfs_op_descs"); @@ -195,13 +194,13 @@ vfs_opv_init() /* * Force every operations vector to have a default routine. */ - if (opv_desc_vector[VOFFSET(vop_default)]==NULL) { + if (opv_desc_vector[VOFFSET(vnop_default)]==NULL) { panic("vfs_opv_init: operation vector without default routine."); } for (k = 0; k<vfs_opv_numops; k++) if (opv_desc_vector[k] == NULL) opv_desc_vector[k] = - opv_desc_vector[VOFFSET(vop_default)]; + opv_desc_vector[VOFFSET(vnop_default)]; } } @@ -235,15 +234,115 @@ vfs_op_init() */ extern struct vnodeops dead_vnodeops; extern struct vnodeops spec_vnodeops; -struct vattr va_null; +/* vars for vnode lock */ +lck_grp_t * vnode_lck_grp; +lck_grp_attr_t * vnode_lck_grp_attr; +lck_attr_t * vnode_lck_attr; + + +/* vars for vnode list lock */ +lck_grp_t * vnode_list_lck_grp; +lck_grp_attr_t * vnode_list_lck_grp_attr; +lck_attr_t * vnode_list_lck_attr; +lck_mtx_t * vnode_list_mtx_lock; +lck_mtx_t * spechash_mtx_lock; +/* Routine to lock and unlock the vnode lists */ +void vnode_list_lock(void); +void vnode_list_unlock(void); + +/* vars for vfsconf lock */ +lck_grp_t * fsconf_lck_grp; +lck_grp_attr_t * fsconf_lck_grp_attr; +lck_attr_t * fsconf_lck_attr; + + +/* vars for mount lock */ +lck_grp_t * mnt_lck_grp; +lck_grp_attr_t * mnt_lck_grp_attr; +lck_attr_t * mnt_lck_attr; + +/* vars for mount list lock */ +lck_grp_t * mnt_list_lck_grp; +lck_grp_attr_t * mnt_list_lck_grp_attr; +lck_attr_t * mnt_list_lck_attr; +lck_mtx_t * mnt_list_mtx_lock; + +extern void journal_init(); + +struct mount * dead_mountp; /* * Initialize the vnode structures and initialize each file system type. 
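
vfs_opv_init above refuses to run with an operations vector that lacks a default routine, then back-fills every NULL slot with that default so dispatch never lands on a null pointer. The back-fill reduces to this sketch, with invented offsets and ops:

    #include <stdio.h>
    #include <stdlib.h>

    typedef int (*op_fn)(void);

    static int default_op(void) { return -1; }   /* "not supported" */
    static int open_op(void)    { return 0; }

    #define OFF_DEFAULT 0
    #define OFF_OPEN    1
    #define OFF_CLOSE   2
    #define NUM_OPS     3

    int main(void)
    {
        op_fn vec[NUM_OPS] = { default_op, open_op, NULL };
        int k;

        if (vec[OFF_DEFAULT] == NULL) {
            fprintf(stderr, "vector without default routine\n");
            exit(1);
        }
        /* back-fill: every unimplemented slot points at the default */
        for (k = 0; k < NUM_OPS; k++)
            if (vec[k] == NULL)
                vec[k] = vec[OFF_DEFAULT];

        printf("close -> %d (came from the default)\n", vec[OFF_CLOSE]());
        return 0;
    }
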
*/ +void vfsinit() { - struct vfsconf *vfsp; + struct vfstable *vfsp; int i, maxtypenum; + struct mount * mp; + + /* Allocate vnode list lock group attribute and group */ + vnode_list_lck_grp_attr= lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(vnode_list_lck_grp_attr); + + vnode_list_lck_grp = lck_grp_alloc_init("vnode list", vnode_list_lck_grp_attr); + + /* Allocate vnode list lock attribute */ + vnode_list_lck_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(vnode_list_lck_attr); + + /* Allocate vnode list lock */ + vnode_list_mtx_lock = lck_mtx_alloc_init(vnode_list_lck_grp, vnode_list_lck_attr); + + /* Allocate spec hash list lock */ + spechash_mtx_lock = lck_mtx_alloc_init(vnode_list_lck_grp, vnode_list_lck_attr); + + /* allocate vnode lock group attribute and group */ + vnode_lck_grp_attr= lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(vnode_lck_grp_attr); + + vnode_lck_grp = lck_grp_alloc_init("vnode", vnode_lck_grp_attr); + + /* Allocate vnode lock attribute */ + vnode_lck_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(vnode_lck_attr); + + /* Allocate fs config lock group attribute and group */ + fsconf_lck_grp_attr= lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(fsconf_lck_grp_attr); + + fsconf_lck_grp = lck_grp_alloc_init("fs conf", fsconf_lck_grp_attr); + + /* Allocate fs config lock attribute */ + fsconf_lck_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(fsconf_lck_attr); + + + /* Allocate mount point related lock structures */ + + /* Allocate mount list lock group attribute and group */ + mnt_list_lck_grp_attr= lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(mnt_list_lck_grp_attr); + + mnt_list_lck_grp = lck_grp_alloc_init("mount list", mnt_list_lck_grp_attr); + + /* Allocate mount list lock attribute */ + mnt_list_lck_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(mnt_list_lck_attr); + + /* Allocate mount list lock */ + mnt_list_mtx_lock = lck_mtx_alloc_init(mnt_list_lck_grp, mnt_list_lck_attr); + + + /* allocate mount lock group attribute and group */ + mnt_lck_grp_attr= lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(mnt_lck_grp_attr); + + mnt_lck_grp = lck_grp_alloc_init("mount", mnt_lck_grp_attr); + + /* Allocate mount lock attribute */ + mnt_lck_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(mnt_lck_attr); /* * Initialize the "console user" for access purposes: @@ -262,6 +361,10 @@ vfsinit() * Initialize the vnode name cache */ nchinit(); + /* + * Initialize the journaling locks + */ + journal_init(); /* * Build vnode operation vectors. */ @@ -271,7 +374,6 @@ vfsinit() * Initialize each file system type in the static list, * until the first NULL ->vfs_vfsops is encountered. */ - vattr_null(&va_null); numused_vfsslots = maxtypenum = 0; for (vfsp = vfsconf, i = 0; i < maxvfsconf; i++, vfsp++) { if (vfsp->vfc_vfsops == (struct vfsops *)0) @@ -280,14 +382,84 @@ vfsinit() if (maxtypenum <= vfsp->vfc_typenum) maxtypenum = vfsp->vfc_typenum + 1; (*vfsp->vfc_vfsops->vfs_init)(vfsp); + + lck_mtx_init(&vfsp->vfc_lock, fsconf_lck_grp, fsconf_lck_attr); + numused_vfsslots++; } /* next vfc_typenum to be used */ maxvfsconf = maxtypenum; + + /* + * Initialize the vnop authorization scope. 
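
The init loop in vfsinit above does double duty: it calls each filesystem's vfs_init and tracks the largest vfc_typenum seen, leaving maxvfsconf as one past the highest registered type number. The scan itself is just this shape:

    #include <stdio.h>

    struct fs { const char *name; int typenum; };

    int main(void)
    {
        struct fs list[] = { { "hfs", 17 }, { "ufs", 1 }, { "cd9660", 14 } };
        int i, maxtypenum = 0;

        for (i = 0; i < (int)(sizeof(list) / sizeof(list[0])); i++) {
            /* keep one past the largest type number seen so far */
            if (maxtypenum <= list[i].typenum)
                maxtypenum = list[i].typenum + 1;
        }
        printf("next free typenum: %d\n", maxtypenum);   /* 18 */
        return 0;
    }
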
+ */ + vnode_authorize_init(); + + /* + * create a mount point for dead vnodes + */ + MALLOC_ZONE(mp, struct mount *, (u_long)sizeof(struct mount), + M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + /* Initialize the default IO constraints */ + mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS; + mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32; + mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt; + mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt; + mp->mnt_devblocksize = DEV_BSIZE; + + TAILQ_INIT(&mp->mnt_vnodelist); + TAILQ_INIT(&mp->mnt_workerqueue); + TAILQ_INIT(&mp->mnt_newvnodes); + mp->mnt_flag = MNT_LOCAL; + mp->mnt_lflag = MNT_LDEAD; + mount_lock_init(mp); + dead_mountp = mp; +} + +void +vnode_list_lock() +{ + lck_mtx_lock(vnode_list_mtx_lock); +} + +void +vnode_list_unlock() +{ + lck_mtx_unlock(vnode_list_mtx_lock); +} + +void +mount_list_lock() +{ + lck_mtx_lock(mnt_list_mtx_lock); +} + +void +mount_list_unlock() +{ + lck_mtx_unlock(mnt_list_mtx_lock); +} + +void +mount_lock_init(mount_t mp) +{ + lck_mtx_init(&mp->mnt_mlock, mnt_lck_grp, mnt_lck_attr); + lck_mtx_init(&mp->mnt_renamelock, mnt_lck_grp, mnt_lck_attr); + lck_rw_init(&mp->mnt_rwlock, mnt_lck_grp, mnt_lck_attr); +} + +void +mount_lock_destroy(mount_t mp) +{ + lck_mtx_destroy(&mp->mnt_mlock, mnt_lck_grp); + lck_mtx_destroy(&mp->mnt_renamelock, mnt_lck_grp); + lck_rw_destroy(&mp->mnt_rwlock, mnt_lck_grp); } + /* - * Name: vfsconf_add + * Name: vfstable_add * * Description: Add a filesystem to the vfsconf list at the first * unused slot. If no slots are available, return an @@ -305,15 +477,12 @@ vfsinit() * * Warning: This code assumes that vfsconf[0] is non-empty. */ -int -vfsconf_add(struct vfsconf *nvfsp) +struct vfstable * +vfstable_add(struct vfstable *nvfsp) { int slot; - struct vfsconf *slotp; + struct vfstable *slotp; - if (nvfsp == NULL) /* overkill */ - return (-1); - /* * Find the next empty slot; we recognize an empty slot by a * NULL-valued ->vfc_vfsops, so if we delete a VFS, we must @@ -325,7 +494,7 @@ vfsconf_add(struct vfsconf *nvfsp) } if (slot == maxvfsslots) { /* out of static slots; allocate one instead */ - MALLOC(slotp, struct vfsconf *, sizeof(struct vfsconf), + MALLOC(slotp, struct vfstable *, sizeof(struct vfstable), M_TEMP, M_WAITOK); } else { slotp = &vfsconf[slot]; @@ -338,7 +507,8 @@ vfsconf_add(struct vfsconf *nvfsp) * Note; Takes advantage of the fact that 'slot' was left * with the value of 'maxvfslots' in the allocation case. */ - bcopy(nvfsp, slotp, sizeof(struct vfsconf)); + bcopy(nvfsp, slotp, sizeof(struct vfstable)); + lck_mtx_init(&slotp->vfc_lock, fsconf_lck_grp, fsconf_lck_attr); if (slot != 0) { slotp->vfc_next = vfsconf[slot - 1].vfc_next; vfsconf[slot - 1].vfc_next = slotp; @@ -347,22 +517,11 @@ vfsconf_add(struct vfsconf *nvfsp) } numused_vfsslots++; - /* - * Call through the ->vfs_init(); use slotp instead of nvfsp, - * so that if the FS cares where it's instance record is, it - * can find it later. - * - * XXX All code that calls ->vfs_init treats it as if it - * XXX returns a "void', and can never fail. - */ - if (nvfsp->vfc_vfsops->vfs_init) - (*nvfsp->vfc_vfsops->vfs_init)(slotp); - - return(0); + return(slotp); } /* - * Name: vfsconf_del + * Name: vfstable_del * * Description: Remove a filesystem from the vfsconf list by name. * If no such filesystem exists, return an error. @@ -375,30 +534,30 @@ vfsconf_add(struct vfsconf *nvfsp) * Notes: Hopefully all filesystems have unique names. 
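
vfstable_add, described above, reuses the first empty static slot (recognized by a NULL vfc_vfsops) and only heap-allocates when the table is full; either way the new entry is spliced into the singly linked list after its predecessor. A compact model of that slot-or-heap strategy, with invented field names:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define NSLOTS 4

    struct tbl {
        const char *name;           /* NULL name marks an empty slot */
        struct tbl *next;
    };

    static struct tbl slots[NSLOTS];

    static struct tbl *tbl_add(const struct tbl *nv)
    {
        int slot;
        struct tbl *slotp;

        for (slot = 0; slot < NSLOTS; slot++)
            if (slots[slot].name == NULL)
                break;
        if (slot == NSLOTS) {
            /* static table full: fall back to the heap; note that
             * 'slot' conveniently still indexes the last static entry */
            slotp = malloc(sizeof(*slotp));
            if (slotp == NULL)
                return NULL;
        } else {
            slotp = &slots[slot];
        }

        memcpy(slotp, nv, sizeof(*slotp));
        if (slot != 0) {                    /* splice in after predecessor */
            slotp->next = slots[slot - 1].next;
            slots[slot - 1].next = slotp;
        }
        return slotp;
    }

    int main(void)
    {
        struct tbl a = { "hfs", NULL }, b = { "ufs", NULL };

        tbl_add(&a);
        tbl_add(&b);
        printf("slot0=%s slot0.next=%s\n", slots[0].name, slots[0].next->name);
        return 0;
    }
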
*/ int -vfsconf_del(char * fs_name) +vfstable_del(struct vfstable * vtbl) { - struct vfsconf **vcpp; - struct vfsconf *vcdelp; + struct vfstable **vcpp; + struct vfstable *vcdelp; /* - * Traverse the list looking for fs_name; if found, *vcpp + * Traverse the list looking for vtbl; if found, *vcpp * will contain the address of the pointer to the entry to * be removed. */ for( vcpp = &vfsconf; *vcpp; vcpp = &(*vcpp)->vfc_next) { - if (strcmp( (*vcpp)->vfc_name, fs_name) == 0) + if (*vcpp == vtbl) break; } - if (*vcpp == NULL) { - /* XXX need real error code for entry not found */ - return(-1); - } + if (*vcpp == NULL) + return(ESRCH); /* vtbl not on vfsconf list */ /* Unlink entry */ vcdelp = *vcpp; *vcpp = (*vcpp)->vfc_next; + lck_mtx_destroy(&vcdelp->vfc_lock, fsconf_lck_grp); + /* * Is this an entry from our static table? We find out by * seeing if the pointer to the object to be deleted places @@ -406,7 +565,7 @@ vfsconf_del(char * fs_name) */ if (vcdelp >= vfsconf && vcdelp < (vfsconf + maxvfsslots)) { /* Y */ /* Mark as empty for vfscon_add() */ - bzero(vcdelp, sizeof(struct vfsconf)); + bzero(vcdelp, sizeof(struct vfstable)); numused_vfsslots--; } else { /* N */ /* @@ -420,3 +579,16 @@ vfsconf_del(char * fs_name) return(0); } + +void +SPECHASH_LOCK(void) +{ + lck_mtx_lock(spechash_mtx_lock); +} + +void +SPECHASH_UNLOCK(void) +{ + lck_mtx_unlock(spechash_mtx_lock); +} + diff --git a/bsd/vfs/vfs_journal.c b/bsd/vfs/vfs_journal.c index 19c28f39a..4389d214b 100644 --- a/bsd/vfs/vfs_journal.c +++ b/bsd/vfs/vfs_journal.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1995-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1995-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -34,19 +34,18 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> -#include <sys/file.h> +#include <sys/file_internal.h> #include <sys/stat.h> -#include <sys/buf.h> -#include <sys/proc.h> -#include <sys/mount.h> +#include <sys/buf_internal.h> +#include <sys/proc_internal.h> +#include <sys/mount_internal.h> #include <sys/namei.h> -#include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/ioctl.h> #include <sys/tty.h> #include <sys/ubc.h> #include <sys/malloc.h> -#include <sys/vnode.h> -#include <kern/thread_act.h> +#include <kern/thread.h> #include <sys/disk.h> #include <miscfs/specfs/specdev.h> @@ -81,6 +80,13 @@ static int end_transaction(transaction *tr, int force_it); static void abort_transaction(journal *jnl, transaction *tr); static void dump_journal(journal *jnl); +static __inline__ void lock_journal(journal *jnl); +static __inline__ void unlock_journal(journal *jnl); +static __inline__ void lock_oldstart(journal *jnl); +static __inline__ void unlock_oldstart(journal *jnl); + + + // // 3105942 - Coalesce writes to the same block on journal replay @@ -178,6 +184,49 @@ calc_checksum(char *ptr, int len) return (~cksum); } +// +// Journal Locking +// +lck_grp_attr_t * jnl_group_attr; +lck_attr_t * jnl_lock_attr; +lck_grp_t * jnl_mutex_group; + +void +journal_init() +{ + jnl_lock_attr = lck_attr_alloc_init(); + jnl_group_attr = lck_grp_attr_alloc_init(); + jnl_mutex_group = lck_grp_alloc_init("jnl-mutex", jnl_group_attr); + + /* Turn on lock debugging */ + //lck_attr_setdebug(jnl_lock_attr); +} + +static __inline__ void +lock_journal(journal *jnl) +{ + lck_mtx_lock(&jnl->jlock); +} + +static __inline__ void +unlock_journal(journal *jnl) +{ + lck_mtx_unlock(&jnl->jlock); +} + +static __inline__ void +lock_oldstart(journal *jnl) +{ + 
lck_mtx_lock(&jnl->old_start_lock); +} + +static __inline__ void +unlock_oldstart(journal *jnl) +{ + lck_mtx_unlock(&jnl->old_start_lock); +} + + #define JNL_WRITE 0x0001 #define JNL_READ 0x0002 @@ -196,29 +245,23 @@ static size_t do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction) { int err, io_sz=0, curlen=len; - struct buf *bp; - int max_iosize=0, max_vectors; + buf_t bp; + int max_iosize = 128 * 1024; + struct vfsioattr ioattr; if (*offset < 0 || *offset > jnl->jhdr->size) { panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size); } + vfs_ioattr(vnode_mount(jnl->jdev), &ioattr); + + if (direction & JNL_WRITE) + max_iosize = ioattr.io_maxwritecnt; + else if (direction & JNL_READ) + max_iosize = ioattr.io_maxreadcnt; again: bp = alloc_io_buf(jnl->jdev, 1); - if (direction & JNL_WRITE) { - bp->b_flags |= 0; // don't have to set any flags (was: B_WRITEINPROG) - jnl->jdev->v_numoutput++; - vfs_io_attributes(jnl->jdev, B_WRITE, &max_iosize, &max_vectors); - } else if (direction & JNL_READ) { - bp->b_flags |= B_READ; - vfs_io_attributes(jnl->jdev, B_READ, &max_iosize, &max_vectors); - } - - if (max_iosize == 0) { - max_iosize = 128 * 1024; - } - if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) { if (*offset == jnl->jhdr->size) { *offset = jnl->jhdr->jhdr_size; @@ -239,21 +282,24 @@ do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen, data); } - bp->b_bufsize = curlen; - bp->b_bcount = curlen; - bp->b_data = data; - bp->b_blkno = (daddr_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size); - bp->b_lblkno = (daddr_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size); - - err = VOP_STRATEGY(bp); + if (direction & JNL_READ) + buf_setflags(bp, B_READ); + else { + /* + * don't have to set any flags + */ + vnode_startwrite(jnl->jdev); + } + buf_setsize(bp, curlen); + buf_setcount(bp, curlen); + buf_setdataptr(bp, (uintptr_t)data); + buf_setblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size)); + buf_setlblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size)); + + err = VNOP_STRATEGY(bp); if (!err) { - err = biowait(bp); + err = (int)buf_biowait(bp); } - - bp->b_data = NULL; - bp->b_bufsize = bp->b_bcount = 0; - bp->b_blkno = bp->b_lblkno = -1; - free_io_buf(bp); if (err) { @@ -303,11 +349,14 @@ write_journal_header(journal *jnl) static int num_err_prints = 0; int ret; off_t jhdr_offset = 0; - + struct vfs_context context; + + context.vc_proc = current_proc(); + context.vc_ucred = NOCRED; // // XXXdbg note: this ioctl doesn't seem to do anything on firewire disks. // - ret = VOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NOCRED, current_proc()); + ret = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context); if (ret != 0) { // // Only print this error if it's a different error than the @@ -345,7 +394,7 @@ write_journal_header(journal *jnl) // on an IDE bus analyzer with Larry Barras so while it // may seem obscure, it's not. 
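
do_journal_io above treats the journal as a circular log: an I/O that would run past the end is clamped at the boundary, and an offset that has reached the end wraps to just past the header, which is never overwritten. The wrap rule in isolation, with made-up sizes standing in for jhdr->size and jhdr_size:

    #include <stdio.h>

    #define LOG_SIZE 1024            /* total circular log size */
    #define HDR_SIZE  128            /* header at the front, never reused */

    /* Clamp one I/O so it never crosses the end of the circular log;
     * callers loop, and an offset that reached the end wraps to just
     * past the header. A sketch of the wrap rule, not the real jnl code. */
    static long next_chunk(long long *offset, long curlen)
    {
        if (*offset == LOG_SIZE)
            *offset = HDR_SIZE;                    /* wrap past the header */
        if (*offset + curlen > LOG_SIZE)
            curlen = (long)(LOG_SIZE - *offset);   /* clamp at the end */
        return curlen;
    }

    int main(void)
    {
        long long off = 900;
        long remaining = 400, chunk;

        while (remaining > 0) {
            chunk = next_chunk(&off, remaining);
            printf("io at %lld len %ld\n", off, chunk);
            off += chunk;
            remaining -= chunk;
        }
        return 0;
    }
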
// - VOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NOCRED, current_proc()); + VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context); return 0; } @@ -365,12 +414,16 @@ free_old_stuff(journal *jnl) { transaction *tr, *next; - for(tr=jnl->tr_freeme; tr; tr=next) { - next = tr->next; - FREE_ZONE(tr, sizeof(transaction), M_JNL_TR); + lock_oldstart(jnl); + tr = jnl->tr_freeme; + jnl->tr_freeme = NULL; + unlock_oldstart(jnl); + + for(; tr; tr=next) { + next = tr->next; + FREE_ZONE(tr, sizeof(transaction), M_JNL_TR); } - jnl->tr_freeme = NULL; } @@ -382,7 +435,7 @@ free_old_stuff(journal *jnl) // not initiate any new i/o's or allocate/free memory. // static void -buffer_flushed_callback(struct buf *bp) +buffer_flushed_callback(struct buf *bp, void *arg) { transaction *tr; journal *jnl; @@ -390,29 +443,12 @@ buffer_flushed_callback(struct buf *bp) int i, bufsize; - //printf("jnl: buf flush: bp @ 0x%x l/blkno %d/%d vp 0x%x tr @ 0x%x\n", - // bp, bp->b_lblkno, bp->b_blkno, bp->b_vp, bp->b_transaction); + //printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n", + // bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg); // snarf out the bits we want - bufsize = bp->b_bufsize; - tr = bp->b_transaction; - - bp->b_iodone = NULL; // don't call us for this guy again - bp->b_transaction = NULL; - - // - // This is what biodone() would do if it didn't call us. - // NOTE: THIS CODE *HAS* TO BE HERE! - // - if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */ - brelse(bp); - } else { /* or just wakeup the buffer */ - CLR(bp->b_flags, B_WANTED); - wakeup(bp); - } - - // NOTE: from here on out we do *NOT* touch bp anymore. - + bufsize = buf_size(bp); + tr = (transaction *)arg; // then we've already seen it if (tr == NULL) { @@ -431,7 +467,7 @@ buffer_flushed_callback(struct buf *bp) // update the number of blocks that have been flushed. // this buf may represent more than one block so take // that into account. - tr->num_flushed += bufsize; + OSAddAtomic(bufsize, &tr->num_flushed); // if this transaction isn't done yet, just return as @@ -440,11 +476,23 @@ buffer_flushed_callback(struct buf *bp) return; } + // this will single thread checking the transaction + lock_oldstart(jnl); + + if (tr->total_bytes == 0xfbadc0de) { + // then someone beat us to it... 
+ unlock_oldstart(jnl); + return; + } + + // mark this so that we're the owner of dealing with the + // cleanup for this transaction + tr->total_bytes = 0xfbadc0de; + //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n", // tr, tr->journal_start, tr->journal_end, jnl); // find this entry in the old_start[] index and mark it completed - simple_lock(&jnl->old_start_lock); for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { if ((jnl->old_start[i] & ~(0x8000000000000000LL)) == tr->journal_start) { @@ -456,7 +504,7 @@ buffer_flushed_callback(struct buf *bp) panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr 0x%x, jnl 0x%x)\n", tr->journal_start, tr, jnl); } - simple_unlock(&jnl->old_start_lock); + unlock_oldstart(jnl); // if we are here then we need to update the journal header @@ -478,10 +526,12 @@ buffer_flushed_callback(struct buf *bp) jnl->completed_trs = ctr->next; } + lock_oldstart(jnl); next = jnl->completed_trs; // this starts us over again ctr->next = jnl->tr_freeme; jnl->tr_freeme = ctr; ctr = NULL; + unlock_oldstart(jnl); } else if (tr->journal_end == ctr->journal_start) { ctr->journal_start = tr->journal_start; next = jnl->completed_trs; // this starts us over again @@ -496,9 +546,6 @@ buffer_flushed_callback(struct buf *bp) } } - // at this point no one should be using this guy anymore - tr->total_bytes = 0xfbadc0de; - // if this is true then we didn't merge with anyone // so link ourselves in at the head of the completed // transaction list. @@ -525,8 +572,10 @@ buffer_flushed_callback(struct buf *bp) } else { // if we're here this tr got merged with someone else so // put it on the list to be free'd + lock_oldstart(jnl); tr->next = jnl->tr_freeme; jnl->tr_freeme = tr; + unlock_oldstart(jnl); } } @@ -578,47 +627,47 @@ swap_block_list_header(journal *jnl, block_list_header *blhdr) static int update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize) { - int ret; + int ret; struct buf *oblock_bp=NULL; // first read the block we want. - ret = meta_bread(jnl->fsdev, (daddr_t)fs_block, bsize, NOCRED, &oblock_bp); + ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp); if (ret != 0) { printf("jnl: update_fs_block: error reading fs block # %lld! (ret %d)\n", fs_block, ret); if (oblock_bp) { - brelse(oblock_bp); + buf_brelse(oblock_bp); oblock_bp = NULL; } // let's try to be aggressive here and just re-write the block - oblock_bp = getblk(jnl->fsdev, (daddr_t)fs_block, bsize, 0, 0, BLK_META); + oblock_bp = buf_getblk(jnl->fsdev, (daddr64_t)fs_block, bsize, 0, 0, BLK_META); if (oblock_bp == NULL) { - printf("jnl: update_fs_block: getblk() for %lld failed! failing update.\n", fs_block); + printf("jnl: update_fs_block: buf_getblk() for %lld failed! failing update.\n", fs_block); return -1; } } // make sure it's the correct size. - if (oblock_bp->b_bufsize != bsize) { - brelse(oblock_bp); + if (buf_size(oblock_bp) != bsize) { + buf_brelse(oblock_bp); return -1; } // copy the journal data over top of it - memcpy(oblock_bp->b_data, block_ptr, bsize); + memcpy((void *)buf_dataptr(oblock_bp), block_ptr, bsize); - if ((ret = VOP_BWRITE(oblock_bp)) != 0) { + if ((ret = VNOP_BWRITE(oblock_bp)) != 0) { printf("jnl: update_fs_block: failed to update block %lld (ret %d)\n", fs_block,ret); return ret; } // and now invalidate it so that if someone else wants to read // it in a different size they'll be able to do it. 
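//
// (Illustrative reduction of the invalidation done just below: in the new
// buf KPI, buf_markinvalid() replaces flipping B_INVAL by hand, and
// buf_brelse() then tosses the buffer; releasing a buffer marked invalid
// also clears its locked and delayed-write state.)
//
//	buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
//	if (oblock_bp) {
//		buf_markinvalid(oblock_bp);
//		buf_brelse(oblock_bp);
//	}
//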
- ret = meta_bread(jnl->fsdev, (daddr_t)fs_block, bsize, NOCRED, &oblock_bp); + ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp); if (oblock_bp) { - oblock_bp->b_flags |= B_INVAL; - brelse(oblock_bp); + buf_markinvalid(oblock_bp); + buf_brelse(oblock_bp); } return 0; @@ -781,7 +830,8 @@ do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num if (prev_block_end > block_end) { off_t new_num = block_end / jhdr_size; size_t new_size = prev_block_end - block_end; - size_t new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start); + + new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start); err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, num_buckets_ptr, num_full_ptr, 0); if (err < 0) { @@ -815,7 +865,7 @@ do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size; if (overlap > 0) { if (overlap % jhdr_size != 0) { - panic("jnl: do_overlap: overlap of %d is not multiple of %d\n", overlap, jhdr_size); + panic("jnl: do_overlap: overlap of %lld is not multiple of %d\n", overlap, jhdr_size); } // if we partially overlap this entry, adjust its block number, jnl offset, and size @@ -873,7 +923,6 @@ static int add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int *num_buckets_ptr, int *num_full_ptr) { int blk_index, overwriting; - size_t jhdr_size = jnl->jhdr->jhdr_size; // on return from lookup_bucket(), blk_index is the index into the table where block_num should be // inserted (or the index of the elem to overwrite). @@ -902,10 +951,9 @@ static int replay_journal(journal *jnl) { int i, ret, orig_checksum, checksum, max_bsize; - struct buf *oblock_bp; block_list_header *blhdr; off_t offset; - char *buf, *block_ptr=NULL; + char *buff, *block_ptr=NULL; struct bucket *co_buf; int num_buckets = STARTING_BUCKETS, num_full; @@ -922,7 +970,7 @@ replay_journal(journal *jnl) } // allocate memory for the header_block. we'll read each blhdr into this - if (kmem_alloc(kernel_map, (vm_offset_t *)&buf, jnl->jhdr->blhdr_size)) { + if (kmem_alloc(kernel_map, (vm_offset_t *)&buff, jnl->jhdr->blhdr_size)) { printf("jnl: replay_journal: no memory for block buffer! 
(%d bytes)\n", jnl->jhdr->blhdr_size); return -1; @@ -946,13 +994,13 @@ replay_journal(journal *jnl) while(jnl->jhdr->start != jnl->jhdr->end) { offset = jnl->jhdr->start; - ret = read_journal_data(jnl, &offset, buf, jnl->jhdr->blhdr_size); + ret = read_journal_data(jnl, &offset, buff, jnl->jhdr->blhdr_size); if (ret != jnl->jhdr->blhdr_size) { printf("jnl: replay_journal: Could not read block list header block @ 0x%llx!\n", offset); goto bad_replay; } - blhdr = (block_list_header *)buf; + blhdr = (block_list_header *)buff; orig_checksum = blhdr->checksum; blhdr->checksum = 0; @@ -966,15 +1014,15 @@ replay_journal(journal *jnl) checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE); } if (checksum != orig_checksum) { - printf("jnl: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n", - offset, orig_checksum, checksum); - goto bad_replay; - } + printf("jnl: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n", + offset, orig_checksum, checksum); + goto bad_replay; + } if ( blhdr->max_blocks <= 0 || blhdr->max_blocks > 2048 || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) { - printf("jnl: replay_journal: bad looking journal entry: max: %d num: %d\n", - blhdr->max_blocks, blhdr->num_blocks); - goto bad_replay; + printf("jnl: replay_journal: bad looking journal entry: max: %d num: %d\n", + blhdr->max_blocks, blhdr->num_blocks); + goto bad_replay; } for(i=1; i < blhdr->num_blocks; i++) { @@ -1094,7 +1142,7 @@ replay_journal(journal *jnl) FREE(co_buf, M_TEMP); co_buf = NULL; - kmem_free(kernel_map, (vm_offset_t)buf, jnl->jhdr->blhdr_size); + kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size); return 0; bad_replay: @@ -1104,7 +1152,7 @@ replay_journal(journal *jnl) if (co_buf) { FREE(co_buf, M_TEMP); } - kmem_free(kernel_map, (vm_offset_t)buf, jnl->jhdr->blhdr_size); + kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size); return -1; } @@ -1190,10 +1238,14 @@ journal_create(struct vnode *jvp, void *arg) { journal *jnl; - int ret, phys_blksz; + int phys_blksz; + struct vfs_context context; + + context.vc_proc = current_proc(); + context.vc_ucred = FSCRED; /* Get the real physical block size. */ - if (VOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, FSCRED, NULL)) { + if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { return NULL; } @@ -1218,7 +1270,7 @@ journal_create(struct vnode *jvp, jnl->flush = flush; jnl->flush_arg = arg; jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); - simple_lock_init(&jnl->old_start_lock); + lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr); if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) { printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz); @@ -1242,7 +1294,7 @@ journal_create(struct vnode *jvp, // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3); // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3); - lockinit(&jnl->jlock, PINOD, "journal", 0, 0); + lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr); if (write_journal_header(jnl) != 0) { printf("jnl: journal_create: failed to write journal header.\n"); @@ -1273,11 +1325,15 @@ journal_open(struct vnode *jvp, void *arg) { journal *jnl; - int orig_blksz=0, phys_blksz, blhdr_size; + int orig_blksz=0, phys_blksz; int orig_checksum, checksum; + struct vfs_context context; + + context.vc_proc = current_proc(); + context.vc_ucred = FSCRED; /* Get the real physical block size. 
*/ - if (VOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, FSCRED, NULL)) { + if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { return NULL; } @@ -1302,7 +1358,7 @@ journal_open(struct vnode *jvp, jnl->flush = flush; jnl->flush_arg = arg; jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); - simple_lock_init(&jnl->old_start_lock); + lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr); if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) { printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz); @@ -1362,7 +1418,7 @@ journal_open(struct vnode *jvp, orig_blksz = phys_blksz; phys_blksz = jnl->jhdr->jhdr_size; - if (VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, FSCRED, NULL)) { + if (VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context)) { printf("jnl: could not set block size to %d bytes.\n", phys_blksz); goto bad_journal; } @@ -1420,7 +1476,7 @@ journal_open(struct vnode *jvp, } if (orig_blksz != 0) { - VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, FSCRED, NULL); + VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context); phys_blksz = orig_blksz; if (orig_blksz < jnl->jhdr->jhdr_size) { printf("jnl: open: jhdr_size is %d but orig phys blk size is %d. switching.\n", @@ -1436,14 +1492,14 @@ journal_open(struct vnode *jvp, // set this now, after we've replayed the journal size_up_tbuffer(jnl, tbuffer_size, phys_blksz); - lockinit(&jnl->jlock, PINOD, "journal", 0, 0); + lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr); return jnl; bad_journal: if (orig_blksz != 0) { phys_blksz = orig_blksz; - VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, FSCRED, NULL); + VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context); } kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz); bad_kmem_alloc: @@ -1464,14 +1520,8 @@ journal_close(journal *jnl) // jnl->flags |= JOURNAL_CLOSE_PENDING; - if (jnl->owner != current_act()) { - int ret; - - ret = lockmgr(&jnl->jlock, LK_EXCLUSIVE|LK_RETRY, NULL, current_proc()); - if (ret != 0) { - printf("jnl: close: locking the journal (0x%x) failed %d.\n", jnl, ret); - return; - } + if (jnl->owner != current_thread()) { + lock_journal(jnl); } // @@ -1619,7 +1669,7 @@ check_free_space(journal *jnl, int desired_size) // entries until there is enough space for the next transaction. 
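//
// (Illustrative reduction of the per-slot wait loop below -- a
// drop-flush-sleep-relock pattern, with the panic counter elided. Note
// that the mutex is likewise dropped around write_journal_header()
// further down, so it is never held across disk I/O:)
//
//	lock_oldstart(jnl);
//	while ((jnl->old_start[i] & 0x8000000000000000LL) != 0) {
//		unlock_oldstart(jnl);
//		if (jnl->flush)
//			jnl->flush(jnl->flush_arg);
//		tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1);
//		lock_oldstart(jnl);
//	}
//	unlock_oldstart(jnl);
//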
// old_start_empty = 1; - simple_lock(&jnl->old_start_lock); + lock_oldstart(jnl); for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { int counter; @@ -1630,12 +1680,12 @@ check_free_space(journal *jnl, int desired_size) jnl->old_start[i], jnl); } - simple_unlock(&jnl->old_start_lock); + unlock_oldstart(jnl); if (jnl->flush) { jnl->flush(jnl->flush_arg); } tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1); - simple_lock(&jnl->old_start_lock); + lock_oldstart(jnl); } if (jnl->old_start[i] == 0) { @@ -1646,11 +1696,13 @@ check_free_space(journal *jnl, int desired_size) jnl->jhdr->start = jnl->old_start[i]; jnl->old_start[i] = 0; if (free_space(jnl) > desired_size) { + unlock_oldstart(jnl); write_journal_header(jnl); + lock_oldstart(jnl); break; } } - simple_unlock(&jnl->old_start_lock); + unlock_oldstart(jnl); // if we bumped the start, loop and try again if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) { @@ -1691,7 +1743,6 @@ journal_start_transaction(journal *jnl) { int ret; transaction *tr; - int prev_priv; CHECK_JOURNAL(jnl); @@ -1699,27 +1750,23 @@ journal_start_transaction(journal *jnl) return EINVAL; } - if (jnl->owner == current_act()) { + if (jnl->owner == current_thread()) { if (jnl->active_tr == NULL) { - panic("jnl: start_tr: active_tr is NULL (jnl @ 0x%x, owner 0x%x, current_act 0x%x\n", - jnl, jnl->owner, current_act()); + panic("jnl: start_tr: active_tr is NULL (jnl @ 0x%x, owner 0x%x, current_thread 0x%x\n", + jnl, jnl->owner, current_thread()); } jnl->nested_count++; return 0; } - ret = lockmgr(&jnl->jlock, LK_EXCLUSIVE|LK_RETRY, NULL, current_proc()); - if (ret != 0) { - printf("jnl: start_tr: locking the journal (0x%x) failed %d.\n", jnl, ret); - return EINVAL; - } + lock_journal(jnl); if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) { panic("jnl: start_tr: owner 0x%x, nested count 0x%x, active_tr 0x%x jnl @ 0x%x\n", jnl->owner, jnl->nested_count, jnl->active_tr, jnl); } - jnl->owner = current_act(); + jnl->owner = current_thread(); jnl->nested_count = 1; free_old_stuff(jnl); @@ -1743,15 +1790,13 @@ journal_start_transaction(journal *jnl) memset(tr, 0, sizeof(transaction)); tr->tbuffer_size = jnl->tbuffer_size; - thread_wire_internal(host_priv_self(), current_act(), TRUE, &prev_priv); + if (kmem_alloc(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) { FREE_ZONE(tr, sizeof(transaction), M_JNL_TR); printf("jnl: start transaction failed: no tbuffer mem\n"); ret = ENOMEM; - thread_wire_internal(host_priv_self(), current_act(), prev_priv, NULL); goto bad_start; } - thread_wire_internal(host_priv_self(), current_act(), prev_priv, NULL); // journal replay code checksum check depends on this. memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE); @@ -1774,7 +1819,7 @@ journal_start_transaction(journal *jnl) bad_start: jnl->owner = NULL; jnl->nested_count = 0; - lockmgr(&jnl->jlock, LK_RELEASE, NULL, current_proc()); + unlock_journal(jnl); return ret; } @@ -1792,35 +1837,35 @@ journal_modify_block_start(journal *jnl, struct buf *bp) // XXXdbg - for debugging I want this to be true. later it may // not be necessary. - if ((bp->b_flags & B_META) == 0) { + if ((buf_flags(bp) & B_META) == 0) { panic("jnl: modify_block_start: bp @ 0x%x is not a meta-data block! (jnl 0x%x)\n", bp, jnl); } tr = jnl->active_tr; CHECK_TRANSACTION(tr); - if (jnl->owner != current_act()) { + if (jnl->owner != current_thread()) { panic("jnl: modify_block_start: called w/out a transaction! 
jnl 0x%x, owner 0x%x, curact 0x%x\n", - jnl, jnl->owner, current_act()); + jnl, jnl->owner, current_thread()); } free_old_stuff(jnl); - //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %d/%d bsz %d; total bytes %d)\n", - // bp, bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_bufsize, tr->total_bytes); + //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n", + // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes); // can't allow blocks that aren't an even multiple of the // underlying block size. - if ((bp->b_bufsize % jnl->jhdr->jhdr_size) != 0) { + if ((buf_size(bp) % jnl->jhdr->jhdr_size) != 0) { panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n", - bp->b_bufsize, jnl->jhdr->jhdr_size); + buf_size(bp), jnl->jhdr->jhdr_size); return -1; } // make sure that this transaction isn't bigger than the whole journal - if (tr->total_bytes+bp->b_bufsize >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) { + if (tr->total_bytes+buf_size(bp) >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) { panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr 0x%x bp 0x%x)\n", - tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), bp->b_bufsize, tr, bp); + tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), buf_size(bp), tr, bp); return -1; } @@ -1828,14 +1873,17 @@ journal_modify_block_start(journal *jnl, struct buf *bp) // it out before we muck with it because it has data that belongs // (presumably) to another transaction. // - if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_LOCKED) == 0) { + if ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI) { - // this will cause it to not be brelse()'d - bp->b_flags |= B_NORELSE; - VOP_BWRITE(bp); - } + if (buf_flags(bp) & B_ASYNC) { + panic("modify_block_start: bp @ 0x% has async flag set!\n", bp); + } - bp->b_flags |= B_LOCKED; + // this will cause it to not be buf_brelse()'d + buf_setflags(bp, B_NORELSE); + VNOP_BWRITE(bp); + } + buf_setflags(bp, B_LOCKED); return 0; } @@ -1853,11 +1901,11 @@ journal_modify_block_abort(journal *jnl, struct buf *bp) // // if there's no active transaction then we just want to - // call brelse() and return since this is just a block + // call buf_brelse() and return since this is just a block // that happened to be modified as part of another tr. // if (tr == NULL) { - brelse(bp); + buf_brelse(bp); return 0; } @@ -1867,9 +1915,9 @@ journal_modify_block_abort(journal *jnl, struct buf *bp) CHECK_TRANSACTION(tr); - if (jnl->owner != current_act()) { + if (jnl->owner != current_thread()) { panic("jnl: modify_block_abort: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n", - jnl, jnl->owner, current_act()); + jnl, jnl->owner, current_thread()); } free_old_stuff(jnl); @@ -1880,9 +1928,9 @@ journal_modify_block_abort(journal *jnl, struct buf *bp) for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) { for(i=1; i < blhdr->num_blocks; i++) { if (bp == blhdr->binfo[i].bp) { - if (bp->b_bufsize != blhdr->binfo[i].bsize) { + if (buf_size(bp) != blhdr->binfo[i].bsize) { panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n", - bp, bp->b_bufsize, blhdr->binfo[i].bsize, jnl); + bp, buf_size(bp), blhdr->binfo[i].bsize, jnl); } break; } @@ -1901,10 +1949,10 @@ journal_modify_block_abort(journal *jnl, struct buf *bp) // on it and so we need to keep it locked in memory. 
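//
// (Illustrative reduction of the delayed-write test in modify_block_start()
// above: the masked compare is true only when B_DELWRI is set and B_LOCKED
// is clear, i.e. the buffer holds dirty data from an earlier transaction
// that must be pushed out before this transaction dirties it again:)
//
//	if ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI) {
//		buf_setflags(bp, B_NORELSE);	// keep bp across the write
//		VNOP_BWRITE(bp);		// synchronous push
//	}
//	buf_setflags(bp, B_LOCKED);		// now owned by this transaction
//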
// if (blhdr == NULL) { - bp->b_flags &= ~(B_LOCKED); + buf_clearflags(bp, B_LOCKED); } - brelse(bp); + buf_brelse(bp); return 0; } @@ -1926,19 +1974,18 @@ journal_modify_block_end(journal *jnl, struct buf *bp) tr = jnl->active_tr; CHECK_TRANSACTION(tr); - if (jnl->owner != current_act()) { + if (jnl->owner != current_thread()) { panic("jnl: modify_block_end: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n", - jnl, jnl->owner, current_act()); + jnl, jnl->owner, current_thread()); } free_old_stuff(jnl); - //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %d/%d bsz %d, total bytes %d)\n", - // bp, bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_bufsize, tr->total_bytes); + //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n", + // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes); - if ((bp->b_flags & B_LOCKED) == 0) { + if ((buf_flags(bp) & B_LOCKED) == 0) { panic("jnl: modify_block_end: bp 0x%x not locked! jnl @ 0x%x\n", bp, jnl); - bp->b_flags |= B_LOCKED; } // first check if it's already part of this transaction @@ -1947,9 +1994,9 @@ journal_modify_block_end(journal *jnl, struct buf *bp) for(i=1; i < blhdr->num_blocks; i++) { if (bp == blhdr->binfo[i].bp) { - if (bp->b_bufsize != blhdr->binfo[i].bsize) { + if (buf_size(bp) != blhdr->binfo[i].bsize) { panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n", - bp, bp->b_bufsize, blhdr->binfo[i].bsize, jnl); + bp, buf_size(bp), blhdr->binfo[i].bsize, jnl); } break; } @@ -1964,11 +2011,10 @@ journal_modify_block_end(journal *jnl, struct buf *bp) if (blhdr == NULL && prev && (prev->num_blocks+1) <= prev->max_blocks - && (prev->bytes_used+bp->b_bufsize) <= tr->tbuffer_size) { + && (prev->bytes_used+buf_size(bp)) <= tr->tbuffer_size) { blhdr = prev; } else if (blhdr == NULL) { block_list_header *nblhdr; - int prev_priv; if (prev == NULL) { panic("jnl: modify block end: no way man, prev == NULL?!?, jnl 0x%x, bp 0x%x\n", jnl, bp); @@ -1981,12 +2027,10 @@ journal_modify_block_end(journal *jnl, struct buf *bp) // through prev->binfo[0].bnum. that's a skanky way to do things but // avoids having yet another linked list of small data structures to manage. - thread_wire_internal(host_priv_self(), current_act(), TRUE, &prev_priv); if (kmem_alloc(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) { panic("jnl: end_tr: no space for new block tr @ 0x%x (total bytes: %d)!\n", tr, tr->total_bytes); } - thread_wire_internal(host_priv_self(), current_act(), prev_priv, NULL); // journal replay code checksum check depends on this. 
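//
// (Illustrative aside on the reference counting used from here on: every
// block that journal_modify_block_end() tracks holds a vnode reference,
// and whichever path stops tracking it -- journal_kill_block() or the
// end/abort transaction paths below -- drops that same reference. This is
// the KPI replacement for the old vget()/vrele() pairing:)
//
//	vp = buf_vnode(bp);
//	vnode_ref(vp);			// taken in journal_modify_block_end()
//	...
//	vnode_rele_ext(vp, 0, 1);	// dropped when tracking stops
//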
memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE); @@ -2015,23 +2059,27 @@ journal_modify_block_end(journal *jnl, struct buf *bp) // copy the data into the in-memory transaction buffer blkptr = (char *)&((char *)blhdr)[tbuffer_offset]; - memcpy(blkptr, bp->b_data, bp->b_bufsize); + memcpy(blkptr, buf_dataptr(bp), buf_size(bp)); // if this is true then this is a new block we haven't seen if (i >= blhdr->num_blocks) { - vget(bp->b_vp, 0, current_proc()); + int bsize; + vnode_t vp; + + vp = buf_vnode(bp); + vnode_ref(vp); + bsize = buf_size(bp); - blhdr->binfo[i].bnum = (off_t)((unsigned)bp->b_blkno); - blhdr->binfo[i].bsize = bp->b_bufsize; + blhdr->binfo[i].bnum = (off_t)(buf_blkno(bp)); + blhdr->binfo[i].bsize = bsize; blhdr->binfo[i].bp = bp; - blhdr->bytes_used += bp->b_bufsize; - tr->total_bytes += bp->b_bufsize; + blhdr->bytes_used += bsize; + tr->total_bytes += bsize; blhdr->num_blocks++; } - - bdwrite(bp); + buf_bdwrite(bp); return 0; } @@ -2040,6 +2088,7 @@ int journal_kill_block(journal *jnl, struct buf *bp) { int i; + int bflags; block_list_header *blhdr; transaction *tr; @@ -2052,44 +2101,49 @@ journal_kill_block(journal *jnl, struct buf *bp) tr = jnl->active_tr; CHECK_TRANSACTION(tr); - if (jnl->owner != current_act()) { + if (jnl->owner != current_thread()) { panic("jnl: modify_block_end: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n", - jnl, jnl->owner, current_act()); + jnl, jnl->owner, current_thread()); } free_old_stuff(jnl); - if ((bp->b_flags & B_LOCKED) == 0) { - panic("jnl: kill block: bp 0x%x not locked! jnl @ 0x%x\n", bp, jnl); - } + bflags = buf_flags(bp); + + if ( !(bflags & B_LOCKED)) + panic("jnl: modify_block_end: called with bp not B_LOCKED"); + /* + * bp must be BL_BUSY and B_LOCKED + */ // first check if it's already part of this transaction for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) { for(i=1; i < blhdr->num_blocks; i++) { if (bp == blhdr->binfo[i].bp) { - bp->b_flags &= ~B_LOCKED; + vnode_t vp; - // this undoes the vget() in journal_modify_block_end() - vrele(bp->b_vp); + buf_clearflags(bp, B_LOCKED); - // if the block has the DELWRI and CALL bits sets, then + // this undoes the vnode_ref() in journal_modify_block_end() + vp = buf_vnode(bp); + vnode_rele_ext(vp, 0, 1); + + // if the block has the DELWRI and FILTER bits sets, then // things are seriously weird. if it was part of another // transaction then journal_modify_block_start() should // have force it to be written. // - if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_CALL)) { - panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp); - } else { - tr->num_killed += bp->b_bufsize; - } - - if (bp->b_flags & B_BUSY) { - brelse(bp); - } - + //if ((bflags & B_DELWRI) && (bflags & B_FILTER)) { + // panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp); + //} else { + tr->num_killed += buf_size(bp); + //} blhdr->binfo[i].bp = NULL; blhdr->binfo[i].bnum = (off_t)-1; + + buf_brelse(bp); + break; } } @@ -2106,9 +2160,9 @@ journal_kill_block(journal *jnl, struct buf *bp) static int journal_binfo_cmp(void *a, void *b) { - block_info *bi_a = (struct block_info *)a, - *bi_b = (struct block_info *)b; - daddr_t res; + block_info *bi_a = (struct block_info *)a; + block_info *bi_b = (struct block_info *)b; + daddr64_t res; if (bi_a->bp == NULL) { return 1; @@ -2120,7 +2174,7 @@ journal_binfo_cmp(void *a, void *b) // don't have to worry about negative block // numbers so this is ok to do. 
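//
// (Sketch of why that holds for the comparator below: the subtraction is
// done in daddr64_t and then truncated, which is only safe under the
// stated assumption that block numbers are non-negative and close enough
// together that their difference fits in an int:)
//
//	daddr64_t res = buf_blkno(bi_a->bp) - buf_blkno(bi_b->bp);
//	return (int)res;
//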
// - res = (bi_a->bp->b_blkno - bi_b->bp->b_blkno); + res = (buf_blkno(bi_a->bp) - buf_blkno(bi_b->bp)); return (int)res; } @@ -2130,6 +2184,7 @@ static int end_transaction(transaction *tr, int force_it) { int i, j, ret, amt; + errno_t errno; off_t end; journal *jnl = tr->jnl; struct buf *bp; @@ -2144,7 +2199,7 @@ end_transaction(transaction *tr, int force_it) // just save off the transaction pointer and return. if (tr->total_bytes == jnl->jhdr->blhdr_size) { jnl->cur_tr = tr; - return; + return 0; } // if our transaction buffer isn't very full, just hang @@ -2159,7 +2214,7 @@ end_transaction(transaction *tr, int force_it) && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))) { jnl->cur_tr = tr; - return; + return 0; } @@ -2182,10 +2237,10 @@ end_transaction(transaction *tr, int force_it) // file system flush routine until it is (or we panic). // i = 0; - simple_lock(&jnl->old_start_lock); + lock_oldstart(jnl); while ((jnl->old_start[0] & 0x8000000000000000LL) != 0) { if (jnl->flush) { - simple_unlock(&jnl->old_start_lock); + unlock_oldstart(jnl); if (jnl->flush) { jnl->flush(jnl->flush_arg); @@ -2194,9 +2249,9 @@ end_transaction(transaction *tr, int force_it) // yield the cpu so others can get in to clear the lock bit (void)tsleep((void *)jnl, PRIBIO, "jnl-old-start-sleep", 1); - simple_lock(&jnl->old_start_lock); + lock_oldstart(jnl); } - if (i++ >= 100) { + if (i++ >= 500) { panic("jnl: transaction that started at 0x%llx is not completing! jnl 0x%x\n", jnl->old_start[0] & (~0x8000000000000000LL), jnl); } @@ -2209,14 +2264,17 @@ end_transaction(transaction *tr, int force_it) memcpy(&jnl->old_start[0], &jnl->old_start[1], sizeof(jnl->old_start)-sizeof(jnl->old_start[0])); jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL; - simple_unlock(&jnl->old_start_lock); + unlock_oldstart(jnl); // for each block, make sure that the physical block # is set for(blhdr=tr->blhdr; blhdr; blhdr=next) { for(i=1; i < blhdr->num_blocks; i++) { - + daddr64_t blkno; + daddr64_t lblkno; + struct vnode *vp; + bp = blhdr->binfo[i].bp; if (bp == NULL) { // only true if a block was "killed" if (blhdr->binfo[i].bnum != (off_t)-1) { @@ -2225,25 +2283,40 @@ end_transaction(transaction *tr, int force_it) } continue; } - - if (bp->b_vp == NULL && bp->b_lblkno == bp->b_blkno) { - panic("jnl: end_tr: DANGER! bp @ 0x%x w/null vp and l/blkno = %d/%d\n", - bp, bp->b_lblkno, bp->b_blkno); + vp = buf_vnode(bp); + blkno = buf_blkno(bp); + lblkno = buf_lblkno(bp); + + if (vp == NULL && lblkno == blkno) { + printf("jnl: end_tr: bad news! bp @ 0x%x w/null vp and l/blkno = %qd/%qd. aborting the transaction (tr 0x%x jnl 0x%x).\n", + bp, lblkno, blkno, tr, jnl); + goto bad_journal; } // if the lblkno is the same as blkno and this bp isn't // associated with the underlying file system device then // we need to call bmap() to get the actual physical block. 
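//
// (Illustrative reduction of the new mapping code below: the single
// VOP_BMAP() becomes a two-step translation -- logical block to file
// offset, then file offset to a physical extent -- and anything that is
// not one physically contiguous run on disk fails the transaction:)
//
//	off_t     f_offset;
//	size_t    contig_bytes;
//	daddr64_t blkno;
//
//	if (VNOP_BLKTOOFF(vp, lblkno, &f_offset))
//		goto bad_journal;
//	if (VNOP_BLOCKMAP(vp, f_offset, buf_count(bp), &blkno,
//			&contig_bytes, NULL, 0, NULL))
//		goto bad_journal;
//	if ((uint32_t)contig_bytes < buf_count(bp))
//		goto bad_journal;	// must be one contiguous extent
//	buf_setblkno(bp, blkno);
//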
// - if ((bp->b_lblkno == bp->b_blkno) && (bp->b_vp != jnl->fsdev)) { - if (VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL) != 0) { - printf("jnl: end_tr: can't bmap the bp @ 0x%x, jnl 0x%x\n", bp, jnl); + if ((lblkno == blkno) && (vp != jnl->fsdev)) { + off_t f_offset; + size_t contig_bytes; + + if (VNOP_BLKTOOFF(vp, lblkno, &f_offset)) { + printf("jnl: end_tr: vnop_blktooff failed @ 0x%x, jnl 0x%x\n", bp, jnl); + goto bad_journal; + } + if (VNOP_BLOCKMAP(vp, f_offset, buf_count(bp), &blkno, &contig_bytes, NULL, 0, NULL)) { + printf("jnl: end_tr: can't blockmap the bp @ 0x%x, jnl 0x%x\n", bp, jnl); goto bad_journal; } + if ((uint32_t)contig_bytes < buf_count(bp)) { + printf("jnl: end_tr: blk not physically contiguous on disk@ 0x%x, jnl 0x%x\n", bp, jnl); + goto bad_journal; + } + buf_setblkno(bp, blkno); } - // update this so we write out the correct physical block number! - blhdr->binfo[i].bnum = (off_t)((unsigned)bp->b_blkno); + blhdr->binfo[i].bnum = (off_t)(blkno); } next = (block_list_header *)((long)blhdr->binfo[0].bnum); @@ -2301,53 +2374,52 @@ end_transaction(transaction *tr, int force_it) continue; } - ret = meta_bread(blhdr->binfo[i].bp->b_vp, - (daddr_t)blhdr->binfo[i].bp->b_lblkno, - blhdr->binfo[i].bp->b_bufsize, + errno = buf_meta_bread(buf_vnode(blhdr->binfo[i].bp), + buf_lblkno(blhdr->binfo[i].bp), + buf_size(blhdr->binfo[i].bp), NOCRED, &bp); - if (ret == 0 && bp != NULL) { + if (errno == 0 && bp != NULL) { struct vnode *save_vp; - + void *cur_filter; + if (bp != blhdr->binfo[i].bp) { panic("jnl: end_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n", bp, blhdr->binfo[i].bp, jnl); } - if ((bp->b_flags & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) { + if ((buf_flags(bp) & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) { if (jnl->flags & JOURNAL_CLOSE_PENDING) { - brelse(bp); + buf_clearflags(bp, B_LOCKED); + buf_brelse(bp); continue; } else { - panic("jnl: end_tr: !!!DANGER!!! bp 0x%x flags (0x%x) not LOCKED & DELWRI\n", bp, bp->b_flags); + panic("jnl: end_tr: !!!DANGER!!! 
bp 0x%x flags (0x%x) not LOCKED & DELWRI\n", bp, buf_flags(bp)); } } + save_vp = buf_vnode(bp); - if (bp->b_iodone != NULL) { - panic("jnl: bp @ 0x%x (blkno %d, vp 0x%x) has non-null iodone (0x%x) buffflushcb 0x%x\n", - bp, bp->b_blkno, bp->b_vp, bp->b_iodone, buffer_flushed_callback); - } - - save_vp = bp->b_vp; + buf_setfilter(bp, buffer_flushed_callback, tr, &cur_filter, NULL); - bp->b_iodone = buffer_flushed_callback; - bp->b_transaction = tr; - bp->b_flags |= B_CALL; - bp->b_flags &= ~(B_LOCKED); + if (cur_filter) { + panic("jnl: bp @ 0x%x (blkno %qd, vp 0x%x) has non-null iodone (0x%x) buffflushcb 0x%x\n", + bp, buf_blkno(bp), save_vp, cur_filter, buffer_flushed_callback); + } + buf_clearflags(bp, B_LOCKED); // kicking off the write here helps performance - bawrite(bp); - // XXXdbg this is good for testing: bdwrite(bp); - //bdwrite(bp); + buf_bawrite(bp); + // XXXdbg this is good for testing: buf_bdwrite(bp); + //buf_bdwrite(bp); - // this undoes the vget() in journal_modify_block_end() - vrele(save_vp); - + // this undoes the vnode_ref() in journal_modify_block_end() + vnode_rele_ext(save_vp, 0, 1); } else { printf("jnl: end_transaction: could not find block %Ld vp 0x%x!\n", blhdr->binfo[i].bnum, blhdr->binfo[i].bp); if (bp) { - brelse(bp); + buf_clearflags(bp, B_LOCKED); + buf_brelse(bp); } } } @@ -2366,6 +2438,7 @@ end_transaction(transaction *tr, int force_it) bad_journal: jnl->flags |= JOURNAL_INVALID; + jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL; abort_transaction(jnl, tr); return -1; } @@ -2373,7 +2446,8 @@ end_transaction(transaction *tr, int force_it) static void abort_transaction(journal *jnl, transaction *tr) { - int i, ret; + int i; + errno_t errno; block_list_header *blhdr, *next; struct buf *bp; struct vnode *save_vp; @@ -2389,33 +2463,35 @@ abort_transaction(journal *jnl, transaction *tr) if (blhdr->binfo[i].bp == NULL) { continue; } - - ret = meta_bread(blhdr->binfo[i].bp->b_vp, - (daddr_t)blhdr->binfo[i].bp->b_lblkno, - blhdr->binfo[i].bp->b_bufsize, + if ( (buf_vnode(blhdr->binfo[i].bp) == NULL) || + !(buf_flags(blhdr->binfo[i].bp) & B_LOCKED) ) { + continue; + } + + errno = buf_meta_bread(buf_vnode(blhdr->binfo[i].bp), + buf_lblkno(blhdr->binfo[i].bp), + buf_size(blhdr->binfo[i].bp), NOCRED, &bp); - if (ret == 0) { + if (errno == 0) { if (bp != blhdr->binfo[i].bp) { panic("jnl: abort_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n", bp, blhdr->binfo[i].bp, jnl); } - // clear the locked bit and the delayed-write bit. we - // don't want these blocks going to disk. - bp->b_flags &= ~(B_LOCKED|B_DELWRI); - bp->b_flags |= B_INVAL; - save_vp = bp->b_vp; - - brelse(bp); + // releasing a bp marked invalid + // also clears the locked and delayed state + buf_markinvalid(bp); + save_vp = buf_vnode(bp); - vrele(save_vp); + buf_brelse(bp); + vnode_rele_ext(save_vp, 0, 1); } else { printf("jnl: abort_tr: could not find block %Ld vp 0x%x!\n", blhdr->binfo[i].bnum, blhdr->binfo[i].bp); if (bp) { - brelse(bp); + buf_brelse(bp); } } } @@ -2438,7 +2514,7 @@ int journal_end_transaction(journal *jnl) { int ret; - transaction *tr; + transaction *tr; CHECK_JOURNAL(jnl); @@ -2446,9 +2522,9 @@ journal_end_transaction(journal *jnl) return 0; } - if (jnl->owner != current_act()) { + if (jnl->owner != current_thread()) { panic("jnl: end_tr: I'm not the owner! 
jnl 0x%x, owner 0x%x, curact 0x%x\n", - jnl, jnl->owner, current_act()); + jnl, jnl->owner, current_thread()); } free_old_stuff(jnl); @@ -2462,8 +2538,6 @@ journal_end_transaction(journal *jnl) if (jnl->flags & JOURNAL_INVALID) { if (jnl->active_tr) { - transaction *tr; - if (jnl->cur_tr != NULL) { panic("jnl: journal @ 0x%x has active tr (0x%x) and cur tr (0x%x)\n", jnl, jnl->active_tr, jnl->cur_tr); @@ -2475,7 +2549,7 @@ journal_end_transaction(journal *jnl) } jnl->owner = NULL; - lockmgr(&jnl->jlock, LK_RELEASE, NULL, current_proc()); + unlock_journal(jnl); return EINVAL; } @@ -2492,7 +2566,7 @@ journal_end_transaction(journal *jnl) ret = end_transaction(tr, 0); jnl->owner = NULL; - lockmgr(&jnl->jlock, LK_RELEASE, NULL, current_proc()); + unlock_journal(jnl); return ret; } @@ -2509,14 +2583,10 @@ journal_flush(journal *jnl) return -1; } - if (jnl->owner != current_act()) { + if (jnl->owner != current_thread()) { int ret; - ret = lockmgr(&jnl->jlock, LK_EXCLUSIVE|LK_RETRY, NULL, current_proc()); - if (ret != 0) { - printf("jnl: flush: locking the journal (0x%x) failed %d.\n", jnl, ret); - return -1; - } + lock_journal(jnl); need_signal = 1; } @@ -2531,7 +2601,7 @@ journal_flush(journal *jnl) } if (need_signal) { - lockmgr(&jnl->jlock, LK_RELEASE, NULL, current_proc()); + unlock_journal(jnl); } return 0; @@ -2546,3 +2616,9 @@ journal_active(journal *jnl) return (jnl->active_tr == NULL) ? 0 : 1; } + +void * +journal_owner(journal *jnl) +{ + return jnl->owner; +} diff --git a/bsd/vfs/vfs_journal.h b/bsd/vfs/vfs_journal.h index 05606b1ba..cf87d421e 100644 --- a/bsd/vfs/vfs_journal.h +++ b/bsd/vfs/vfs_journal.h @@ -30,11 +30,12 @@ #define _SYS_VFS_JOURNAL_H_ #include <sys/appleapiopts.h> +#include <sys/cdefs.h> #ifdef __APPLE_API_UNSTABLE #include <sys/types.h> -#include <sys/lock.h> +#include <kern/locks.h> typedef struct block_info { off_t bnum; // block # on the file system device @@ -94,7 +95,7 @@ typedef struct journal_header { * In memory structure about the journal. */ typedef struct journal { - struct lock__bsd__ jlock; + lck_mtx_t jlock; // protects the struct journal data struct vnode *jdev; // vnode of the device where the journal lives off_t jdev_offset; // byte offset to the start of the journal @@ -118,11 +119,11 @@ typedef struct journal { transaction *tr_freeme; // transaction structs that need to be free'd - volatile off_t active_start; // the active start that we only keep in memory - simple_lock_data_t old_start_lock; // guard access - volatile off_t old_start[16]; // this is how we do lazy start update + volatile off_t active_start; // the active start that we only keep in memory + lck_mtx_t old_start_lock; // protects the old_start + volatile off_t old_start[16]; // this is how we do lazy start update - int last_flush_err; // last error from flushing the cache + int last_flush_err; // last error from flushing the cache } journal; /* internal-only journal flags (top 16 bits) */ @@ -134,10 +135,16 @@ typedef struct journal { /* journal_open/create options are always in the low-16 bits */ #define JOURNAL_OPTION_FLAGS_MASK 0x0000ffff +__BEGIN_DECLS /* * Prototypes. */ +/* + * Call journal_init() to initialize the journaling code (sets up lock attributes) + */ +void journal_init(void); + /* * Call journal_create() to create a new journal. You only * call this once, typically at file system creation time. @@ -200,7 +207,7 @@ journal *journal_open(struct vnode *jvp, * It flushes any outstanding transactions and makes sure the * journal is in a consistent state. 
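 *
 * A minimal end-to-end sketch of this API (illustrative only; error
 * handling is elided, and the journal_open() arguments are abbreviated
 * to the shapes declared earlier in this header):
 *
 *	journal *jnl = journal_open(jvp, offset, journal_size, fsvp,
 *	                            min_fs_block_size, 0, 0, NULL, NULL);
 *
 *	journal_start_transaction(jnl);
 *	journal_modify_block_start(jnl, bp);
 *	... change the contents of bp ...
 *	journal_modify_block_end(jnl, bp);
 *	journal_end_transaction(jnl);
 *
 *	journal_flush(jnl);
 *	journal_close(jnl);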
*/ -void journal_close(journal *journal); +void journal_close(journal *journalp); /* * flags for journal_create/open. only can use @@ -238,6 +245,9 @@ int journal_end_transaction(journal *jnl); int journal_active(journal *jnl); int journal_flush(journal *jnl); +void *journal_owner(journal *jnl); // compare against current_thread() + +__END_DECLS #endif /* __APPLE_API_UNSTABLE */ #endif /* !_SYS_VFS_JOURNAL_H_ */ diff --git a/bsd/vfs/vfs_lookup.c b/bsd/vfs/vfs_lookup.c index d65c10b90..d10ba8fd3 100644 --- a/bsd/vfs/vfs_lookup.c +++ b/bsd/vfs/vfs_lookup.c @@ -66,14 +66,16 @@ #include <sys/time.h> #include <sys/namei.h> #include <sys/vm.h> -#include <sys/vnode.h> -#include <sys/mount.h> +#include <sys/vnode_internal.h> +#include <sys/mount_internal.h> #include <sys/errno.h> #include <sys/malloc.h> #include <sys/filedesc.h> -#include <sys/proc.h> +#include <sys/proc_internal.h> #include <sys/kdebug.h> #include <sys/unistd.h> /* For _PC_NAME_MAX */ +#include <sys/uio_internal.h> +#include <sys/kauth.h> #include <bsm/audit_kernel.h> @@ -81,7 +83,8 @@ #include <sys/ktrace.h> #endif -static void kdebug_lookup(struct vnode *dp, struct componentname *cnp); + +static void kdebug_lookup(struct vnode *dp, struct componentname *cnp); /* * Convert a pathname into a pointer to a locked inode. @@ -103,6 +106,7 @@ static void kdebug_lookup(struct vnode *dp, struct componentname *cnp); * if symbolic link, massage name in buffer and continue * } */ + int namei(ndp) register struct nameidata *ndp; @@ -110,16 +114,16 @@ namei(ndp) register struct filedesc *fdp; /* pointer to file descriptor state */ register char *cp; /* pointer into pathname argument */ register struct vnode *dp; /* the directory we are searching */ - struct iovec aiov; /* uio for reading symbolic links */ - struct uio auio; - int error, linklen; + uio_t auio; + int error; struct componentname *cnp = &ndp->ni_cnd; - struct proc *p = cnp->cn_proc; + vfs_context_t ctx = cnp->cn_context; + struct proc *p = vfs_context_proc(ctx); char *tmppn; + char uio_buf[ UIO_SIZEOF(1) ]; - ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_proc->p_ucred; #if DIAGNOSTIC - if (!cnp->cn_cred || !cnp->cn_proc) + if (!vfs_context_ucred(ctx) || !p) panic ("namei: bad cred/proc"); if (cnp->cn_nameiop & (~OPMASK)) panic ("namei: nameiop contaminated with flags"); @@ -133,17 +137,34 @@ namei(ndp) * name into the buffer. 
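 * (In the reworked body below this is two-tier, sketched here: the copy
 * first targets the small pathbuf embedded in the nameidata, and only
 * falls back to a MAXPATHLEN zone allocation when the name does not fit:)
 *
 *	cnp->cn_pnbuf = &ndp->ni_pathbuf;	// PATHBUFLEN bytes
 *	cnp->cn_pnlen = PATHBUFLEN;
 * retry_copy:
 *	error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, cnp->cn_pnlen,
 *			(size_t *)&ndp->ni_pathlen);
 *	if (error == ENAMETOOLONG && !(cnp->cn_flags & HASBUF)) {
 *		MALLOC_ZONE(cnp->cn_pnbuf, caddr_t, MAXPATHLEN,
 *			M_NAMEI, M_WAITOK);
 *		cnp->cn_flags |= HASBUF;
 *		cnp->cn_pnlen = MAXPATHLEN;
 *		goto retry_copy;
 *	}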
*/ if ((cnp->cn_flags & HASBUF) == 0) { - MALLOC_ZONE(cnp->cn_pnbuf, caddr_t, - MAXPATHLEN, M_NAMEI, M_WAITOK); - cnp->cn_pnlen = MAXPATHLEN; - cnp->cn_flags |= HASBUF; + cnp->cn_pnbuf = &ndp->ni_pathbuf; + cnp->cn_pnlen = PATHBUFLEN; } - if (ndp->ni_segflg == UIO_SYSSPACE) - error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, - MAXPATHLEN, (size_t *)&ndp->ni_pathlen); - else +#if LP64_DEBUG + if (IS_VALID_UIO_SEGFLG(ndp->ni_segflg) == 0) { + panic("%s :%d - invalid ni_segflg\n", __FILE__, __LINE__); + } +#endif /* LP64_DEBUG */ + +retry_copy: + if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, - MAXPATHLEN, (size_t *)&ndp->ni_pathlen); + cnp->cn_pnlen, (size_t *)&ndp->ni_pathlen); + else + error = copystr(CAST_DOWN(void *, ndp->ni_dirp), cnp->cn_pnbuf, + cnp->cn_pnlen, (size_t *)&ndp->ni_pathlen); + + if (error == ENAMETOOLONG && !(cnp->cn_flags & HASBUF)) { + MALLOC_ZONE(cnp->cn_pnbuf, caddr_t, + MAXPATHLEN, M_NAMEI, M_WAITOK); + + cnp->cn_flags |= HASBUF; + cnp->cn_pnlen = MAXPATHLEN; + + goto retry_copy; + } + if (error) + goto error_out; /* If we are auditing the kernel pathname, save the user pathname */ if (cnp->cn_flags & AUDITVNPATH1) @@ -154,19 +175,9 @@ namei(ndp) /* * Do not allow empty pathnames */ - if (!error && *cnp->cn_pnbuf == '\0') + if (*cnp->cn_pnbuf == '\0') { error = ENOENT; - - if (!error && ((dp = fdp->fd_cdir) == NULL)) - error = EPERM; /* 3382843 */ - - if (error) { - tmppn = cnp->cn_pnbuf; - cnp->cn_pnbuf = NULL; - cnp->cn_flags &= ~HASBUF; - FREE_ZONE(tmppn, cnp->cn_pnlen, M_NAMEI); - ndp->ni_vp = NULL; - return (error); + goto error_out; } ndp->ni_loopcnt = 0; #if KTRACE @@ -175,113 +186,155 @@ namei(ndp) #endif /* - * Get starting point for the translation. + * determine the starting point for the translation. */ - if ((ndp->ni_rootdir = fdp->fd_rdir) == NULL) - ndp->ni_rootdir = rootvnode; - if (ndp->ni_cnd.cn_flags & USEDVP) { - dp = ndp->ni_dvp; - ndp->ni_dvp = NULL; - } else { - dp = fdp->fd_cdir; + if ((ndp->ni_rootdir = fdp->fd_rdir) == NULLVP) { + if ( !(fdp->fd_flags & FD_CHROOT)) + ndp->ni_rootdir = rootvnode; } + cnp->cn_nameptr = cnp->cn_pnbuf; - VREF(dp); - for (;;) { - /* - * Check if root directory should replace current directory. - * Done at start of translation and after symbolic link. 
- */ - cnp->cn_nameptr = cnp->cn_pnbuf; - if (*(cnp->cn_nameptr) == '/') { - vrele(dp); - while (*(cnp->cn_nameptr) == '/') { - cnp->cn_nameptr++; - ndp->ni_pathlen--; - } - dp = ndp->ni_rootdir; - VREF(dp); + ndp->ni_usedvp = NULLVP; + + if (*(cnp->cn_nameptr) == '/') { + while (*(cnp->cn_nameptr) == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; } + dp = ndp->ni_rootdir; + } else if (cnp->cn_flags & USEDVP) { + dp = ndp->ni_dvp; + ndp->ni_usedvp = dp; + } else + dp = fdp->fd_cdir; + + if (dp == NULLVP) { + error = ENOENT; + goto error_out; + } + ndp->ni_dvp = NULLVP; + ndp->ni_vp = NULLVP; + + for (;;) { + int need_newpathbuf; + int linklen; + ndp->ni_startdir = dp; - if (error = lookup(ndp)) { - long len = cnp->cn_pnlen; - tmppn = cnp->cn_pnbuf; - cnp->cn_pnbuf = NULL; - cnp->cn_flags &= ~HASBUF; - FREE_ZONE(tmppn, len, M_NAMEI); - return (error); + + if ( (error = lookup(ndp)) ) { + goto error_out; } /* * Check for symbolic link */ if ((cnp->cn_flags & ISSYMLINK) == 0) { - if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) { - tmppn = cnp->cn_pnbuf; - cnp->cn_pnbuf = NULL; - cnp->cn_flags &= ~HASBUF; - FREE_ZONE(tmppn, cnp->cn_pnlen, M_NAMEI); - } else { - cnp->cn_flags |= HASBUF; - } return (0); } - if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) - VOP_UNLOCK(ndp->ni_dvp, 0, p); + if ((cnp->cn_flags & FSNODELOCKHELD)) { + cnp->cn_flags &= ~FSNODELOCKHELD; + unlock_fsnode(ndp->ni_dvp, NULL); + } if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; break; } - if (ndp->ni_pathlen > 1) { + if (ndp->ni_pathlen > 1 || !(cnp->cn_flags & HASBUF)) + need_newpathbuf = 1; + else + need_newpathbuf = 0; + + if (need_newpathbuf) { MALLOC_ZONE(cp, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); } else { cp = cnp->cn_pnbuf; } - aiov.iov_base = cp; - aiov.iov_len = MAXPATHLEN; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_offset = 0; - auio.uio_rw = UIO_READ; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_procp = (struct proc *)0; - auio.uio_resid = MAXPATHLEN; - if (error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred)) { - if (ndp->ni_pathlen > 1) + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); + + uio_addiov(auio, CAST_USER_ADDR_T(cp), MAXPATHLEN); + + error = VNOP_READLINK(ndp->ni_vp, auio, ctx); + if (error) { + if (need_newpathbuf) FREE_ZONE(cp, MAXPATHLEN, M_NAMEI); break; } - linklen = MAXPATHLEN - auio.uio_resid; - if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { - if (ndp->ni_pathlen > 1) + // LP64todo - fix this + linklen = MAXPATHLEN - uio_resid(auio); + if (linklen + ndp->ni_pathlen > MAXPATHLEN) { + if (need_newpathbuf) FREE_ZONE(cp, MAXPATHLEN, M_NAMEI); + error = ENAMETOOLONG; break; } - if (ndp->ni_pathlen > 1) { + if (need_newpathbuf) { long len = cnp->cn_pnlen; + tmppn = cnp->cn_pnbuf; bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); cnp->cn_pnbuf = cp; cnp->cn_pnlen = MAXPATHLEN; - FREE_ZONE(tmppn, len, M_NAMEI); + + if ( (cnp->cn_flags & HASBUF) ) + FREE_ZONE(tmppn, len, M_NAMEI); + else + cnp->cn_flags |= HASBUF; } else cnp->cn_pnbuf[linklen] = '\0'; + ndp->ni_pathlen += linklen; - vput(ndp->ni_vp); + cnp->cn_nameptr = cnp->cn_pnbuf; + + /* + * starting point for 'relative' + * symbolic link path + */ dp = ndp->ni_dvp; - } + /* + * get rid of references returned via 'lookup' + */ + vnode_put(ndp->ni_vp); + vnode_put(ndp->ni_dvp); + + ndp->ni_vp = NULLVP; + ndp->ni_dvp = NULLVP; - tmppn = cnp->cn_pnbuf; + /* + * Check if symbolic link restarts us at the root + */ + if (*(cnp->cn_nameptr) == '/') { + while 
(*(cnp->cn_nameptr) == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + if ((dp = ndp->ni_rootdir) == NULLVP) { + error = ENOENT; + goto error_out; + } + } + } + /* + * only come here if we fail to handle a SYMLINK... + * if either ni_dvp or ni_vp is non-NULL, then + * we need to drop the iocount that was picked + * up in the lookup routine + */ + if (ndp->ni_dvp) + vnode_put(ndp->ni_dvp); + if (ndp->ni_vp) + vnode_put(ndp->ni_vp); + error_out: + if ( (cnp->cn_flags & HASBUF) ) { + cnp->cn_flags &= ~HASBUF; + FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); + } cnp->cn_pnbuf = NULL; - cnp->cn_flags &= ~HASBUF; - FREE_ZONE(tmppn, cnp->cn_pnlen, M_NAMEI); + ndp->ni_vp = NULLVP; - vrele(ndp->ni_dvp); - vput(ndp->ni_vp); - ndp->ni_vp = NULL; return (error); } + /* * Search a pathname. * This is a very central and rather complicated routine. @@ -310,7 +363,7 @@ namei(ndp) * identify next component of name at ndp->ni_ptr * handle degenerate case where name is null string * if .. and crossing mount points and on mounted filesys, find parent - * call VOP_LOOKUP routine for next component name + * call VNOP_LOOKUP routine for next component name * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set * component vnode returned in ni_vp (if it exists), locked. * if result vnode is mounted on and crossing mount points, @@ -324,111 +377,67 @@ int lookup(ndp) register struct nameidata *ndp; { - register char *cp; /* pointer into pathname argument */ - register struct vnode *dp = 0; /* the directory we are searching */ - struct vnode *tdp; /* saved dp */ - struct mount *mp; /* mount table entry */ - int namemax = 0; /* maximun number of bytes for filename returned by pathconf() */ - int docache; /* == 0 do not cache last component */ + register char *cp; /* pointer into pathname argument */ + vnode_t tdp; /* saved dp */ + vnode_t dp; /* the directory we are searching */ + mount_t mp; /* mount table entry */ + int docache = 1; /* == 0 do not cache last component */ int wantparent; /* 1 => wantparent or lockparent flag */ - int dp_unlocked = 0; /* 1 => dp already VOP_UNLOCK()-ed */ int rdonly; /* lookup read-only flag bit */ int trailing_slash = 0; + int dp_authorized = 0; int error = 0; struct componentname *cnp = &ndp->ni_cnd; - struct proc *p = cnp->cn_proc; - int i; + vfs_context_t ctx = cnp->cn_context; /* * Setup: break out flag bits into variables. */ + if (cnp->cn_flags & (NOCACHE | DOWHITEOUT)) { + if ((cnp->cn_flags & NOCACHE) || (cnp->cn_nameiop == DELETE)) + docache = 0; + } wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); - docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; - if (cnp->cn_nameiop == DELETE || - (wantparent && cnp->cn_nameiop != CREATE && - cnp->cn_nameiop != LOOKUP)) - docache = 0; rdonly = cnp->cn_flags & RDONLY; - ndp->ni_dvp = NULL; cnp->cn_flags &= ~ISSYMLINK; + cnp->cn_consume = 0; + dp = ndp->ni_startdir; ndp->ni_startdir = NULLVP; - vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p); - cnp->cn_consume = 0; -dirloop: - /* - * Search a new directory. - * - * The cn_hash value is for use by vfs_cache. - * The last component of the filename is left accessible via - * cnp->cn_nameptr for callers that need the name. Callers needing - * the name set the SAVENAME flag. When done, they assume - * responsibility for freeing the pathname buffer. 
- */ - { - register unsigned int hash; - register unsigned int ch; - register int i; - - hash = 0; - cp = cnp->cn_nameptr; - ch = *cp; - if (ch == '\0') { - cnp->cn_namelen = 0; - goto emptyname; - } + cp = cnp->cn_nameptr; - for (i = 1; (ch != '/') && (ch != '\0'); i++) { - hash += ch * i; - ch = *(++cp); - } - cnp->cn_hash = hash; - } - cnp->cn_namelen = cp - cnp->cn_nameptr; - if (cnp->cn_namelen > NCHNAMLEN) { - if (VOP_PATHCONF(dp, _PC_NAME_MAX, &namemax)) - namemax = NAME_MAX; - if (cnp->cn_namelen > namemax) { - error = ENAMETOOLONG; + if (*cp == '\0') { + if ( (vnode_getwithref(dp)) ) { + dp = NULLVP; + error = ENOENT; goto bad; } + goto emptyname; } -#ifdef NAMEI_DIAGNOSTIC - { char c = *cp; - *cp = '\0'; - printf("{%s}: ", cnp->cn_nameptr); - *cp = c; } -#endif - ndp->ni_pathlen -= cnp->cn_namelen; - ndp->ni_next = cp; +dirloop: + ndp->ni_vp = NULLVP; - /* - * Replace multiple slashes by a single slash and trailing slashes - * by a null. This must be done before VOP_LOOKUP() because some - * fs's don't know about trailing slashes. Remember if there were - * trailing slashes to handle symlinks, existing non-directories - * and non-existing files that won't be directories specially later. - */ - trailing_slash = 0; - while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { - cp++; - ndp->ni_pathlen--; - if (*cp == '\0') { - trailing_slash = 1; - *ndp->ni_next = '\0'; - } + if ( (error = cache_lookup_path(ndp, cnp, dp, ctx, &trailing_slash, &dp_authorized)) ) { + dp = NULLVP; + goto bad; + } + if ((cnp->cn_flags & ISLASTCN)) { + if (docache) + cnp->cn_flags |= MAKEENTRY; + } else + cnp->cn_flags |= MAKEENTRY; + + dp = ndp->ni_dvp; + + if (ndp->ni_vp != NULLVP) { + /* + * cache_lookup_path returned a non-NULL ni_vp then, + * we're guaranteed that the dp is a VDIR, it's + * been authorized, and vp is not ".." + */ + goto returned_from_lookup_path; } - ndp->ni_next = cp; - - cnp->cn_flags |= MAKEENTRY; - if (*cp == '\0' && docache == 0) - cnp->cn_flags &= ~MAKEENTRY; - - if (*ndp->ni_next == 0) - cnp->cn_flags |= ISLASTCN; - else - cnp->cn_flags &= ~ISLASTCN; /* * Handle "..": two special cases. @@ -440,62 +449,91 @@ dirloop: * vnode which was mounted on so we take the * .. in the other file system. */ - if (cnp->cn_namelen == 2 && - cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') { - cnp->cn_flags |= ISDOTDOT; - + if ( (cnp->cn_flags & ISDOTDOT) ) { for (;;) { - if (dp == ndp->ni_rootdir || dp == rootvnode) { - ndp->ni_dvp = dp; + if (dp == ndp->ni_rootdir || dp == rootvnode) { + ndp->ni_dvp = dp; ndp->ni_vp = dp; - VREF(dp); + /* + * we're pinned at the root + * we've already got one reference on 'dp' + * courtesy of cache_lookup_path... take + * another one for the ".." + * if we fail to get the new reference, we'll + * drop our original down in 'bad' + */ + if ( (vnode_get(dp)) ) { + error = ENOENT; + goto bad; + } goto nextname; } if ((dp->v_flag & VROOT) == 0 || (cnp->cn_flags & NOCROSSMOUNT)) - break; + break; if (dp->v_mount == NULL) { /* forced umount */ - error = EBADF; + error = EBADF; goto bad; } - tdp = dp; - dp = dp->v_mount->mnt_vnodecovered; - vput(tdp); - VREF(dp); - vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p); + dp = tdp->v_mount->mnt_vnodecovered; + + vnode_put(tdp); + + if ( (vnode_getwithref(dp)) ) { + dp = NULLVP; + error = ENOENT; + goto bad; + } + ndp->ni_dvp = dp; + dp_authorized = 0; } - } else { - cnp->cn_flags &= ~ISDOTDOT; } /* * We now have a segment name to search for, and a directory to search. 
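 * (New in this pass, sketched from the code below: before VNOP_LOOKUP()
 * is issued, the candidate directory must really be a VDIR and must pass
 * a kauth search check, unless cache_lookup_path() already authorized it:)
 *
 *	int error = 0;
 *
 *	if (dp->v_type != VDIR)
 *		error = ENOTDIR;
 *	else if (!(dp_authorized || (cnp->cn_flags & DONOTAUTH)))
 *		error = vnode_authorize(dp, NULL, KAUTH_VNODE_SEARCH, ctx);
 *	if (error == 0)
 *		error = VNOP_LOOKUP(dp, &ndp->ni_vp, cnp, ctx);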
*/ unionlookup: - ndp->ni_dvp = dp; - ndp->ni_vp = NULL; - if (error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) { -#if DIAGNOSTIC - if (ndp->ni_vp != NULL) - panic("leaf should be empty"); -#endif -#ifdef NAMEI_DIAGNOSTIC - printf("not found\n"); -#endif + ndp->ni_vp = NULLVP; + + if (dp->v_type != VDIR) { + error = ENOTDIR; + goto lookup_error; + } + if ( !(dp_authorized || (cnp->cn_flags & DONOTAUTH)) ) { + if ( (error = vnode_authorize(dp, NULL, KAUTH_VNODE_SEARCH, ctx)) ) + goto lookup_error; + } + if ( (error = VNOP_LOOKUP(dp, &ndp->ni_vp, cnp, ctx)) ) { +lookup_error: if ((error == ENOENT) && (dp->v_flag & VROOT) && (dp->v_mount != NULL) && (dp->v_mount->mnt_flag & MNT_UNION)) { + if ((cnp->cn_flags & FSNODELOCKHELD)) { + cnp->cn_flags &= ~FSNODELOCKHELD; + unlock_fsnode(dp, NULL); + } tdp = dp; - dp = dp->v_mount->mnt_vnodecovered; - vput(tdp); - VREF(dp); - vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p); + dp = tdp->v_mount->mnt_vnodecovered; + + vnode_put(tdp); + + if ( (vnode_getwithref(dp)) ) { + dp = NULLVP; + error = ENOENT; + goto bad; + } + ndp->ni_dvp = dp; + dp_authorized = 0; goto unionlookup; } if (error != EJUSTRETURN) goto bad; + + if (ndp->ni_vp != NULLVP) + panic("leaf should be empty"); + /* * If creating and at end of pathname, then can consider * allowing file to be created. @@ -504,27 +542,31 @@ unionlookup: error = EROFS; goto bad; } - if (*cp == '\0' && trailing_slash && - !(cnp->cn_flags & WILLBEDIR)) { + if ((cnp->cn_flags & ISLASTCN) && trailing_slash && !(cnp->cn_flags & WILLBEDIR)) { error = ENOENT; goto bad; } /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the - * (possibly locked) directory inode in ndp->ni_dvp. + * referenced directory vnode in ndp->ni_dvp. */ if (cnp->cn_flags & SAVESTART) { + if ( (vnode_get(ndp->ni_dvp)) ) { + error = ENOENT; + goto bad; + } ndp->ni_startdir = ndp->ni_dvp; - VREF(ndp->ni_startdir); } + if (!wantparent) + vnode_put(ndp->ni_dvp); + if (kdebug_enable) kdebug_lookup(ndp->ni_dvp, cnp); return (0); } -#ifdef NAMEI_DIAGNOSTIC - printf("found\n"); -#endif +returned_from_lookup_path: + dp = ndp->ni_vp; /* * Take into account any additional components consumed by @@ -536,48 +578,81 @@ unionlookup: ndp->ni_pathlen -= cnp->cn_consume; cnp->cn_consume = 0; } else { - int isdot_or_dotdot; + if (dp->v_name == NULL || dp->v_parent == NULLVP) { + int isdot_or_dotdot; + int update_flags = 0; - isdot_or_dotdot = (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') || (cnp->cn_flags & ISDOTDOT); + isdot_or_dotdot = (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') || (cnp->cn_flags & ISDOTDOT); - if (VNAME(ndp->ni_vp) == NULL && isdot_or_dotdot == 0) { - VNAME(ndp->ni_vp) = add_name(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0); - } - if (VPARENT(ndp->ni_vp) == NULL && isdot_or_dotdot == 0) { - if (vget(ndp->ni_dvp, 0, p) == 0) { - VPARENT(ndp->ni_vp) = ndp->ni_dvp; + if (isdot_or_dotdot == 0) { + if (dp->v_name == NULL) + update_flags |= VNODE_UPDATE_NAME; + if (ndp->ni_dvp != NULLVP && dp->v_parent == NULLVP) + update_flags |= VNODE_UPDATE_PARENT; + + if (update_flags) + vnode_update_identity(dp, ndp->ni_dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, update_flags); + } + } + + if ( (cnp->cn_flags & MAKEENTRY) && (dp->v_flag & VNCACHEABLE) && LIST_FIRST(&dp->v_nclinks) == NULL) { + /* + * missing from name cache, but should + * be in it... this can happen if volfs + * causes the vnode to be created or the + * name cache entry got recycled but the + * vnode didn't... 
+ * check to make sure that ni_dvp is valid + * cache_lookup_path may return a NULL + */ + if (ndp->ni_dvp != NULL) + cache_enter(ndp->ni_dvp, dp, cnp); } - } } - dp = ndp->ni_vp; /* - * Check to see if the vnode has been mounted on; + * Check to see if the vnode has been mounted on... * if so find the root of the mounted file system. */ - while (dp->v_type == VDIR && (mp = dp->v_mountedhere) && - (cnp->cn_flags & NOCROSSMOUNT) == 0) { - if (vfs_busy(mp, LK_NOWAIT, 0, p)) { - error = ENOENT; - goto bad2; - } - VOP_UNLOCK(dp, 0, p); - error = VFS_ROOT(mp, &tdp); - vfs_unbusy(mp, p); - if (error) { - dp_unlocked = 1; /* Signal error path 'dp' has already been unlocked */ - goto bad2; - }; - vrele(dp); - ndp->ni_vp = dp = tdp; +check_mounted_on: + if ((dp->v_type == VDIR) && dp->v_mountedhere && + ((cnp->cn_flags & NOCROSSMOUNT) == 0)) { + + vnode_lock(dp); + + if ((dp->v_type == VDIR) && (mp = dp->v_mountedhere)) { + + mp->mnt_crossref++; + vnode_unlock(dp); + + if (vfs_busy(mp, 0)) { + mount_dropcrossref(mp, dp, 0); + goto check_mounted_on; + } + error = VFS_ROOT(mp, &tdp, ctx); + /* + * mount_dropcrossref does a vnode_put + * on dp if the 3rd arg is non-zero + */ + mount_dropcrossref(mp, dp, 1); + dp = NULL; + vfs_unbusy(mp); + + if (error) { + goto bad2; + } + ndp->ni_vp = dp = tdp; + + goto check_mounted_on; + } + vnode_unlock(dp); } /* * Check for symbolic link */ if ((dp->v_type == VLNK) && - ((cnp->cn_flags & FOLLOW) || trailing_slash || - *ndp->ni_next == '/')) { + ((cnp->cn_flags & FOLLOW) || trailing_slash || *ndp->ni_next == '/')) { cnp->cn_flags |= ISSYMLINK; return (0); } @@ -591,7 +666,7 @@ unionlookup: goto bad2; } trailing_slash = 0; - } + } nextname: /* @@ -605,7 +680,14 @@ nextname: cnp->cn_nameptr++; ndp->ni_pathlen--; } - vrele(ndp->ni_dvp); + vnode_put(ndp->ni_dvp); + + cp = cnp->cn_nameptr; + + if (*cp == '\0') + goto emptyname; + + vnode_put(dp); goto dirloop; } @@ -618,22 +700,32 @@ nextname: goto bad2; } if (cnp->cn_flags & SAVESTART) { + /* + * note that we already hold a reference + * on both dp and ni_dvp, but for some reason + * can't get another one... in this case we + * need to do vnode_put on dp in 'bad2' + */ + if ( (vnode_get(ndp->ni_dvp)) ) { + error = ENOENT; + goto bad2; + } ndp->ni_startdir = ndp->ni_dvp; - VREF(ndp->ni_startdir); } - if (!wantparent) - vrele(ndp->ni_dvp); + if (!wantparent && ndp->ni_dvp) + vnode_put(ndp->ni_dvp); + if (cnp->cn_flags & AUDITVNPATH1) AUDIT_ARG(vnpath, dp, ARG_VNODE1); else if (cnp->cn_flags & AUDITVNPATH2) AUDIT_ARG(vnpath, dp, ARG_VNODE2); - if ((cnp->cn_flags & LOCKLEAF) == 0) - VOP_UNLOCK(dp, 0, p); + if (kdebug_enable) kdebug_lookup(dp, cnp); return (0); emptyname: + cnp->cn_namelen = 0; /* * A degenerate name (e.g. / or "") which is a way of * talking about a directory, e.g. like "/." or ".". @@ -647,34 +739,55 @@ emptyname: goto bad; } if (wantparent) { + /* + * note that we already hold a reference + * on dp, but for some reason can't + * get another one... 
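
The check_mounted_on loop above is an optimistic-retry pattern: the covering mount is pinned with mnt_crossref before the vnode lock is dropped, and a failed vfs_busy releases the pin and retries from the top instead of sleeping with the vnode locked. Reduced to its essentials (a sketch only; dp, mp, tdp, ctx and error are assumed declared as in lookup()):

    for (;;) {
            vnode_lock(dp);
            if (dp->v_type != VDIR || (mp = dp->v_mountedhere) == NULL) {
                    vnode_unlock(dp);
                    break;                          /* nothing mounted here (any more) */
            }
            mp->mnt_crossref++;                     /* pin the mount across the unlock */
            vnode_unlock(dp);

            if (vfs_busy(mp, 0)) {                  /* raced with an unmount... */
                    mount_dropcrossref(mp, dp, 0);
                    continue;                       /* ...re-evaluate from scratch */
            }
            error = VFS_ROOT(mp, &tdp, ctx);
            mount_dropcrossref(mp, dp, 1);          /* non-zero 3rd arg also puts dp */
            vfs_unbusy(mp);

            if (error)
                    break;
            dp = tdp;                               /* descend and check again */
    }
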
in this case we + * need to do vnode_put on dp in 'bad' + */ + if ( (vnode_get(dp)) ) { + error = ENOENT; + goto bad; + } ndp->ni_dvp = dp; - VREF(dp); } cnp->cn_flags &= ~ISDOTDOT; cnp->cn_flags |= ISLASTCN; ndp->ni_next = cp; ndp->ni_vp = dp; + if (cnp->cn_flags & AUDITVNPATH1) AUDIT_ARG(vnpath, dp, ARG_VNODE1); else if (cnp->cn_flags & AUDITVNPATH2) AUDIT_ARG(vnpath, dp, ARG_VNODE2); - if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF))) - VOP_UNLOCK(dp, 0, p); if (cnp->cn_flags & SAVESTART) panic("lookup: SAVESTART"); return (0); bad2: - if ((cnp->cn_flags & LOCKPARENT) && *ndp->ni_next == '\0') - VOP_UNLOCK(ndp->ni_dvp, 0, p); - vrele(ndp->ni_dvp); + if ((cnp->cn_flags & FSNODELOCKHELD)) { + cnp->cn_flags &= ~FSNODELOCKHELD; + unlock_fsnode(ndp->ni_dvp, NULL); + } + if (ndp->ni_dvp) + vnode_put(ndp->ni_dvp); + if (dp) + vnode_put(dp); + ndp->ni_vp = NULLVP; + + if (kdebug_enable) + kdebug_lookup(dp, cnp); + return (error); + bad: - if (dp_unlocked) { - vrele(dp); - } else { - vput(dp); - }; - ndp->ni_vp = NULL; + if ((cnp->cn_flags & FSNODELOCKHELD)) { + cnp->cn_flags &= ~FSNODELOCKHELD; + unlock_fsnode(ndp->ni_dvp, NULL); + } + if (dp) + vnode_put(dp); + ndp->ni_vp = NULLVP; + if (kdebug_enable) kdebug_lookup(dp, cnp); return (error); @@ -689,9 +802,7 @@ relookup(dvp, vpp, cnp) struct vnode *dvp, **vpp; struct componentname *cnp; { - struct proc *p = cnp->cn_proc; struct vnode *dp = 0; /* the directory we are searching */ - int docache; /* == 0 do not cache last component */ int wantparent; /* 1 => wantparent or lockparent flag */ int rdonly; /* lookup read-only flag bit */ int error = 0; @@ -699,41 +810,21 @@ relookup(dvp, vpp, cnp) int i, newhash; /* DEBUG: check name hash */ char *cp; /* DEBUG: check name ptr/len */ #endif + vfs_context_t ctx = cnp->cn_context;; /* * Setup: break out flag bits into variables. */ wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); - docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; - if (cnp->cn_nameiop == DELETE || - (wantparent && cnp->cn_nameiop != CREATE)) - docache = 0; rdonly = cnp->cn_flags & RDONLY; cnp->cn_flags &= ~ISSYMLINK; - dp = dvp; - vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p); -/* dirloop: */ - /* - * Search a new directory. - * - * The cn_hash value is for use by vfs_cache. - * The last component of the filename is left accessible via - * cnp->cn_nameptr for callers that need the name. Callers needing - * the name set the SAVENAME flag. When done, they assume - * responsibility for freeing the pathname buffer. - */ -#ifdef NAMEI_DIAGNOSTIC - for (i=1, newhash = 0, cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) - newhash += (unsigned char)*cp * i; - if (newhash != cnp->cn_hash) - panic("relookup: bad hash"); - if (cnp->cn_namelen != cp - cnp->cn_nameptr) - panic ("relookup: bad len"); - if (*cp != 0) - panic("relookup: not last component"); - printf("{%s}: ", cnp->cn_nameptr); -#endif + if (cnp->cn_flags & NOCACHE) + cnp->cn_flags &= ~MAKEENTRY; + else + cnp->cn_flags |= MAKEENTRY; + + dp = dvp; /* * Check for degenerate name (e.g. / or "") @@ -749,27 +840,26 @@ relookup(dvp, vpp, cnp) error = ENOTDIR; goto bad; } - if (!(cnp->cn_flags & LOCKLEAF)) - VOP_UNLOCK(dp, 0, p); + if ( (vnode_get(dp)) ) { + error = ENOENT; + goto bad; + } *vpp = dp; + if (cnp->cn_flags & SAVESTART) panic("lookup: SAVESTART"); return (0); } - - if (cnp->cn_flags & ISDOTDOT) - panic ("relookup: lookup on dot-dot"); - /* * We now have a segment name to search for, and a directory to search. 
*/ - if (error = VOP_LOOKUP(dp, vpp, cnp)) { + if ( (error = VNOP_LOOKUP(dp, vpp, cnp, ctx)) ) { + if (error != EJUSTRETURN) + goto bad; #if DIAGNOSTIC if (*vpp != NULL) panic("leaf should be empty"); #endif - if (error != EJUSTRETURN) - goto bad; /* * If creating and at end of pathname, then can consider * allowing file to be created. @@ -778,9 +868,6 @@ relookup(dvp, vpp, cnp) error = EROFS; goto bad; } - /* ASSERT(dvp == ndp->ni_startdir) */ - if (cnp->cn_flags & SAVESTART) - VREF(dvp); /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the @@ -807,25 +894,36 @@ relookup(dvp, vpp, cnp) goto bad2; } /* ASSERT(dvp == ndp->ni_startdir) */ - if (cnp->cn_flags & SAVESTART) - VREF(dvp); - if (!wantparent) - vrele(dvp); - if ((cnp->cn_flags & LOCKLEAF) == 0) - VOP_UNLOCK(dp, 0, p); return (0); bad2: - if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN)) - VOP_UNLOCK(dvp, 0, p); - vrele(dvp); -bad: - vput(dp); + vnode_put(dp); +bad: *vpp = NULL; + return (error); } +/* + * Free pathname buffer + */ +void +nameidone(struct nameidata *ndp) +{ + if ((ndp->ni_cnd.cn_flags & FSNODELOCKHELD)) { + ndp->ni_cnd.cn_flags &= ~FSNODELOCKHELD; + unlock_fsnode(ndp->ni_dvp, NULL); + } + if (ndp->ni_cnd.cn_flags & HASBUF) { + char *tmp = ndp->ni_cnd.cn_pnbuf; + + ndp->ni_cnd.cn_pnbuf = NULL; + ndp->ni_cnd.cn_flags &= ~HASBUF; + FREE_ZONE(tmp, ndp->ni_cnd.cn_pnlen, M_NAMEI); + } +} + #define NUMPARMS 23 @@ -834,7 +932,7 @@ kdebug_lookup(dp, cnp) struct vnode *dp; struct componentname *cnp; { - register int i, n; + register unsigned int i, n; register int dbg_namelen; register int save_dbg_namelen; register char *dbg_nameptr; diff --git a/bsd/vfs/vfs_quota.c b/bsd/vfs/vfs_quota.c index 51bd854f8..118b7492d 100644 --- a/bsd/vfs/vfs_quota.c +++ b/bsd/vfs/vfs_quota.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -62,13 +62,32 @@ #include <sys/kernel.h> #include <sys/systm.h> #include <sys/malloc.h> -#include <sys/file.h> -#include <sys/proc.h> -#include <sys/vnode.h> -#include <sys/mount.h> +#include <sys/file_internal.h> +#include <sys/proc_internal.h> +#include <sys/vnode_internal.h> +#include <sys/mount_internal.h> #include <sys/quota.h> +#include <sys/uio_internal.h> +/* vars for quota file lock */ +lck_grp_t * qf_lck_grp; +lck_grp_attr_t * qf_lck_grp_attr; +lck_attr_t * qf_lck_attr; + +/* vars for quota list lock */ +lck_grp_t * quota_list_lck_grp; +lck_grp_attr_t * quota_list_lck_grp_attr; +lck_attr_t * quota_list_lck_attr; +lck_mtx_t * quota_list_mtx_lock; + +/* Routines to lock and unlock the quota global data */ +static void dq_list_lock(void); +static void dq_list_unlock(void); + +static void dq_lock_internal(struct dquot *dq); +static void dq_unlock_internal(struct dquot *dq); + static u_int32_t quotamagic[MAXQUOTAS] = INITQMAGICS; @@ -80,20 +99,26 @@ static u_int32_t quotamagic[MAXQUOTAS] = INITQMAGICS; LIST_HEAD(dqhash, dquot) *dqhashtbl; u_long dqhash; +#define DQUOTINC 5 /* minimum free dquots desired */ +long numdquot, desireddquot = DQUOTINC; + /* * Dquot free list. */ -#define DQUOTINC 5 /* minimum free dquots desired */ TAILQ_HEAD(dqfreelist, dquot) dqfreelist; -long numdquot, desireddquot = DQUOTINC; - /* - * Dquot dirty orphans list. 
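
With lookup() and relookup() no longer returning locked vnodes, callers are expected to pair namei() with the new nameidone() above, which drops a held fsnode lock and frees the pathname buffer. A typical caller, sketched under the usual conventions of this release (error handling trimmed; 'path' and 'ctx' are assumed supplied by the caller):

    struct nameidata nd;
    vnode_t vp;
    int error;

    NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
        CAST_USER_ADDR_T(path), ctx);

    if ((error = namei(&nd)) == 0) {
            vp = nd.ni_vp;          /* carries an iocount reference */
            nameidone(&nd);         /* free path buffer, drop fsnode lock */

            /* ... operate on vp ... */

            vnode_put(vp);          /* drop the reference namei took */
    }
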
+ * Dquot dirty orphans list */ TAILQ_HEAD(dqdirtylist, dquot) dqdirtylist; -static int dqlookup(struct quotafile *, u_long, struct dqblk *, u_int32_t *); +static int dqlookup(struct quotafile *, u_long, struct dqblk *, u_int32_t *); +static int dqsync_locked(struct dquot *dq); + +static void qf_lock(struct quotafile *); +static void qf_unlock(struct quotafile *); +static int qf_ref(struct quotafile *); +static void qf_rele(struct quotafile *); /* @@ -106,11 +131,245 @@ dqinit() dqhashtbl = hashinit(desiredvnodes, M_DQUOT, &dqhash); TAILQ_INIT(&dqfreelist); TAILQ_INIT(&dqdirtylist); + + /* + * Allocate quota list lock group attribute and group + */ + quota_list_lck_grp_attr= lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(quota_list_lck_grp_attr); + quota_list_lck_grp = lck_grp_alloc_init("quota list", quota_list_lck_grp_attr); + + /* + * Allocate qouta list lock attribute + */ + quota_list_lck_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(quota_list_lck_attr); + + /* + * Allocate quota list lock + */ + quota_list_mtx_lock = lck_mtx_alloc_init(quota_list_lck_grp, quota_list_lck_attr); + + + /* + * allocate quota file lock group attribute and group + */ + qf_lck_grp_attr= lck_grp_attr_alloc_init(); + lck_grp_attr_setstat(qf_lck_grp_attr); + qf_lck_grp = lck_grp_alloc_init("quota file", qf_lck_grp_attr); + + /* + * Allocate quota file lock attribute + */ + qf_lck_attr = lck_attr_alloc_init(); + //lck_attr_setdebug(qf_lck_attr); +} + + + +void +dq_list_lock(void) +{ + lck_mtx_lock(quota_list_mtx_lock); +} + +void +dq_list_unlock(void) +{ + lck_mtx_unlock(quota_list_mtx_lock); +} + + +/* + * must be called with the quota_list_lock held + */ +void +dq_lock_internal(struct dquot *dq) +{ + while (dq->dq_lflags & DQ_LLOCK) { + dq->dq_lflags |= DQ_LWANT; + msleep(&dq->dq_lflags, quota_list_mtx_lock, PVFS, "dq_lock_internal", 0); + } + dq->dq_lflags |= DQ_LLOCK; +} + +/* + * must be called with the quota_list_lock held + */ +void +dq_unlock_internal(struct dquot *dq) +{ + int wanted = dq->dq_lflags & DQ_LWANT; + + dq->dq_lflags &= ~(DQ_LLOCK | DQ_LWANT); + + if (wanted) + wakeup(&dq->dq_lflags); +} + +void +dqlock(struct dquot *dq) { + + lck_mtx_lock(quota_list_mtx_lock); + + dq_lock_internal(dq); + + lck_mtx_unlock(quota_list_mtx_lock); +} + +void +dqunlock(struct dquot *dq) { + + lck_mtx_lock(quota_list_mtx_lock); + + dq_unlock_internal(dq); + + lck_mtx_unlock(quota_list_mtx_lock); +} + + + +int +qf_get(struct quotafile *qfp, int type) +{ + int error = 0; + + dq_list_lock(); + + switch (type) { + + case QTF_OPENING: + while ( (qfp->qf_qflags & (QTF_OPENING | QTF_CLOSING)) ) { + if ( (qfp->qf_qflags & QTF_OPENING) ) { + error = EBUSY; + break; + } + if ( (qfp->qf_qflags & QTF_CLOSING) ) { + qfp->qf_qflags |= QTF_WANTED; + msleep(&qfp->qf_qflags, quota_list_mtx_lock, PVFS, "qf_get", 0); + } + } + if (qfp->qf_vp != NULLVP) + error = EBUSY; + if (error == 0) + qfp->qf_qflags |= QTF_OPENING; + break; + + case QTF_CLOSING: + if ( (qfp->qf_qflags & QTF_CLOSING) ) { + error = EBUSY; + break; + } + qfp->qf_qflags |= QTF_CLOSING; + + while ( (qfp->qf_qflags & QTF_OPENING) || qfp->qf_refcnt ) { + qfp->qf_qflags |= QTF_WANTED; + msleep(&qfp->qf_qflags, quota_list_mtx_lock, PVFS, "qf_get", 0); + } + if (qfp->qf_vp == NULLVP) { + qfp->qf_qflags &= ~QTF_CLOSING; + error = EBUSY; + } + break; + } + dq_list_unlock(); + + return (error); +} + +void +qf_put(struct quotafile *qfp, int type) +{ + + dq_list_lock(); + + switch (type) { + + case QTF_OPENING: + case QTF_CLOSING: + qfp->qf_qflags &= ~type; + break; 
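
dq_lock_internal()/dq_unlock_internal() above build a per-dquot sleep lock out of two flag bits and the global list mutex: a waiter marks DQ_LWANT and msleeps on the flags word (msleep atomically drops and retakes the mutex), and the unlock side issues a wakeup only if someone is actually waiting. The same idiom, generalized into a sketch (LOCKED/WANTED are illustrative flag values, not kernel symbols):

    #define LOCKED  0x01
    #define WANTED  0x02

    /* must be called with 'mtx' held */
    static void
    flag_lock(lck_mtx_t *mtx, int *flags)
    {
            while (*flags & LOCKED) {
                    *flags |= WANTED;
                    msleep((caddr_t)flags, mtx, PVFS, "flag_lock", 0);
            }
            *flags |= LOCKED;
    }

    /* must be called with 'mtx' held */
    static void
    flag_unlock(int *flags)
    {
            int wanted = *flags & WANTED;

            *flags &= ~(LOCKED | WANTED);
            if (wanted)
                    wakeup((caddr_t)flags);
    }
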
+ } + if ( (qfp->qf_qflags & QTF_WANTED) ) { + qfp->qf_qflags &= ~QTF_WANTED; + wakeup(&qfp->qf_qflags); + } + dq_list_unlock(); +} + + +static void +qf_lock(struct quotafile *qfp) +{ + lck_mtx_lock(&qfp->qf_lock); +} + +static void +qf_unlock(struct quotafile *qfp) +{ + lck_mtx_unlock(&qfp->qf_lock); +} + + +/* + * take a reference on the quota file while we're + * in dqget... this will prevent a quota_off from + * occurring while we're potentially playing with + * the quota file... the quota_off will stall until + * all the current references 'die'... once we start + * into quoto_off, all new references will be rejected + * we also don't want any dqgets being processed while + * we're in the middle of the quota_on... once we've + * actually got the quota file open and the associated + * struct quotafile inited, we can let them come through + * + * quota list lock must be held on entry + */ +static int +qf_ref(struct quotafile *qfp) +{ + int error = 0; + + if ( (qfp->qf_qflags & (QTF_OPENING | QTF_CLOSING)) || (qfp->qf_vp == NULLVP) ) + error = EINVAL; + else + qfp->qf_refcnt++; + + return (error); +} + +/* + * drop our reference and wakeup any waiters if + * we were the last one holding a ref + * + * quota list lock must be held on entry + */ +static void +qf_rele(struct quotafile *qfp) +{ + qfp->qf_refcnt--; + + if ( (qfp->qf_qflags & QTF_WANTED) && qfp->qf_refcnt == 0) { + qfp->qf_qflags &= ~QTF_WANTED; + wakeup(&qfp->qf_qflags); + } +} + + +void +dqfileinit(struct quotafile *qfp) +{ + qfp->qf_vp = NULLVP; + qfp->qf_qflags = 0; + + lck_mtx_init(&qfp->qf_lock, qf_lck_grp, qf_lck_attr); } /* * Initialize a quota file + * + * must be called with the quota file lock held */ int dqfileopen(qfp, type) @@ -118,39 +377,38 @@ dqfileopen(qfp, type) int type; { struct dqfilehdr header; - struct vattr vattr; - struct iovec aiov; - struct uio auio; - int error; + struct vfs_context context; + off_t file_size; + uio_t auio; + int error = 0; + char uio_buf[ UIO_SIZEOF(1) ]; + context.vc_proc = current_proc(); + context.vc_ucred = qfp->qf_cred; + /* Obtain the file size */ - error = VOP_GETATTR(qfp->qf_vp, &vattr, qfp->qf_cred, current_proc()); - if (error) - return (error); + if ((error = vnode_size(qfp->qf_vp, &file_size, &context)) != 0) + goto out; /* Read the file header */ - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - aiov.iov_base = (caddr_t)&header; - aiov.iov_len = sizeof (header); - auio.uio_resid = sizeof (header); - auio.uio_offset = (off_t)(0); - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_READ; - auio.uio_procp = (struct proc *)0; - error = VOP_READ(qfp->qf_vp, &auio, 0, qfp->qf_cred); + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, + &uio_buf[0], sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(&header), sizeof (header)); + error = VNOP_READ(qfp->qf_vp, auio, 0, &context); if (error) - return (error); - else if (auio.uio_resid) - return (EINVAL); - + goto out; + else if (uio_resid(auio)) { + error = EINVAL; + goto out; + } /* Sanity check the quota file header. */ if ((header.dqh_magic != quotamagic[type]) || (header.dqh_version > QF_VERSION) || (!powerof2(header.dqh_maxentries)) || - (header.dqh_maxentries > (vattr.va_size / sizeof(struct dqblk)))) - return (EINVAL); - + (header.dqh_maxentries > (file_size / sizeof(struct dqblk)))) { + error = EINVAL; + goto out; + } /* Set up the time limits for this quota. 
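
qf_ref()/qf_rele() above gate dqget() against a concurrent quota_on/quota_off: new references are refused while QTF_OPENING or QTF_CLOSING is set, and the closing side waits for qf_refcnt to drain to zero. The caller-side shape, as a sketch:

    dq_list_lock();
    if (qf_ref(qfp)) {              /* quota file opening, closing, or gone */
            dq_list_unlock();
            return (EINVAL);
    }
    /* ... qfp->qf_vp is safe to use here ... */
    qf_rele(qfp);                   /* wakes a waiting quota_off at refcnt 0 */
    dq_list_unlock();
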
*/ if (header.dqh_btime > 0) qfp->qf_btime = header.dqh_btime; @@ -165,44 +423,33 @@ dqfileopen(qfp, type) qfp->qf_maxentries = header.dqh_maxentries; qfp->qf_entrycnt = header.dqh_entrycnt; qfp->qf_shift = dqhashshift(header.dqh_maxentries); - - return (0); +out: + return (error); } /* * Close down a quota file */ void -dqfileclose(qfp, type) - struct quotafile *qfp; - int type; +dqfileclose(struct quotafile *qfp, __unused int type) { struct dqfilehdr header; - struct iovec aiov; - struct uio auio; - - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - aiov.iov_base = (caddr_t)&header; - aiov.iov_len = sizeof (header); - auio.uio_resid = sizeof (header); - auio.uio_offset = (off_t)(0); - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_READ; - auio.uio_procp = (struct proc *)0; - if (VOP_READ(qfp->qf_vp, &auio, 0, qfp->qf_cred) == 0) { - header.dqh_entrycnt = qfp->qf_entrycnt; + struct vfs_context context; + uio_t auio; + char uio_buf[ UIO_SIZEOF(1) ]; + + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, + &uio_buf[0], sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(&header), sizeof (header)); - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - aiov.iov_base = (caddr_t)&header; - aiov.iov_len = sizeof (header); - auio.uio_resid = sizeof (header); - auio.uio_offset = (off_t)(0); - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_WRITE; - auio.uio_procp = (struct proc *)0; - (void) VOP_WRITE(qfp->qf_vp, &auio, 0, qfp->qf_cred); + context.vc_proc = current_proc(); + context.vc_ucred = qfp->qf_cred; + + if (VNOP_READ(qfp->qf_vp, auio, 0, &context) == 0) { + header.dqh_entrycnt = qfp->qf_entrycnt; + uio_reset(auio, 0, UIO_SYSSPACE, UIO_WRITE); + uio_addiov(auio, CAST_USER_ADDR_T(&header), sizeof (header)); + (void) VNOP_WRITE(qfp->qf_vp, auio, 0, &context); } } @@ -212,92 +459,249 @@ dqfileclose(qfp, type) * reading the information from the file if necessary. */ int -dqget(vp, id, qfp, type, dqp) - struct vnode *vp; +dqget(id, qfp, type, dqp) u_long id; struct quotafile *qfp; register int type; struct dquot **dqp; { - struct proc *p = current_proc(); /* XXX */ struct dquot *dq; + struct dquot *ndq = NULL; + struct dquot *fdq = NULL; struct dqhash *dqh; struct vnode *dqvp; int error = 0; - dqvp = qfp->qf_vp; - if (id == 0 || dqvp == NULLVP || (qfp->qf_qflags & QTF_CLOSING)) { + if ( id == 0 || qfp->qf_vp == NULLVP ) { + *dqp = NODQUOT; + return (EINVAL); + } + dq_list_lock(); + + if ( (qf_ref(qfp)) ) { + dq_list_unlock(); + + *dqp = NODQUOT; + return (EINVAL); + } + if ( (dqvp = qfp->qf_vp) == NULLVP ) { + qf_rele(qfp); + dq_list_unlock(); + *dqp = NODQUOT; return (EINVAL); } + dqh = DQHASH(dqvp, id); + +relookup: /* * Check the cache first. */ - dqh = DQHASH(dqvp, id); for (dq = dqh->lh_first; dq; dq = dq->dq_hash.le_next) { if (dq->dq_id != id || dq->dq_qfile->qf_vp != dqvp) continue; + + dq_lock_internal(dq); + /* + * dq_lock_internal may drop the quota_list_lock to msleep, so + * we need to re-evaluate the identity of this dq + */ + if (dq->dq_id != id || dq->dq_qfile == NULL || + dq->dq_qfile->qf_vp != dqvp) { + dq_unlock_internal(dq); + goto relookup; + } /* * Cache hit with no references. Take * the structure off the free list. 
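
The uio_createwithbuffer() conversions above avoid a heap allocation by building the uio inside a caller-supplied stack buffer sized with UIO_SIZEOF(iovcnt). The read-a-header shape they all share boils down to this sketch (assumes a kernel-space 'hdr' buffer and an initialized 'context'):

    char uio_buf[UIO_SIZEOF(1)];
    uio_t auio;
    int error;

    auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
        &uio_buf[0], sizeof(uio_buf));
    uio_addiov(auio, CAST_USER_ADDR_T(&hdr), sizeof(hdr));

    error = VNOP_READ(vp, auio, 0, &context);
    if (error == 0 && uio_resid(auio))
            error = EINVAL;         /* short read: treat the file as invalid */

uio_reset() then reuses the same buffer for a follow-up transfer, as dqfileclose() does for its read-then-write of the header.
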
*/ - if (dq->dq_cnt == 0) { + if (dq->dq_cnt++ == 0) { if (dq->dq_flags & DQ_MOD) TAILQ_REMOVE(&dqdirtylist, dq, dq_freelist); else TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); } - DQREF(dq); + dq_unlock_internal(dq); + + if (fdq != NULL) { + /* + * we grabbed this from the free list in the first pass + * but we found the dq we were looking for in + * the cache the 2nd time through + * so stick it back on the free list and return the cached entry + */ + TAILQ_INSERT_HEAD(&dqfreelist, fdq, dq_freelist); + } + qf_rele(qfp); + dq_list_unlock(); + + if (ndq != NULL) { + /* + * we allocated this in the first pass + * but we found the dq we were looking for in + * the cache the 2nd time through so free it + */ + _FREE(ndq, M_DQUOT); + } *dqp = dq; + return (0); } /* * Not in cache, allocate a new one. */ - if (dqfreelist.tqh_first == NODQUOT && + if (TAILQ_EMPTY(&dqfreelist) && numdquot < MAXQUOTAS * desiredvnodes) desireddquot += DQUOTINC; - if (numdquot < desireddquot) { - dq = (struct dquot *)_MALLOC(sizeof *dq, M_DQUOT, M_WAITOK); - bzero((char *)dq, sizeof *dq); - numdquot++; + + if (fdq != NULL) { + /* + * we captured this from the free list + * in the first pass through, so go + * ahead and use it + */ + dq = fdq; + fdq = NULL; + } else if (numdquot < desireddquot) { + if (ndq == NULL) { + /* + * drop the quota list lock since MALLOC may block + */ + dq_list_unlock(); + + ndq = (struct dquot *)_MALLOC(sizeof *dq, M_DQUOT, M_WAITOK); + bzero((char *)ndq, sizeof *dq); + + dq_list_lock(); + /* + * need to look for the entry again in the cache + * since we dropped the quota list lock and + * someone else may have beaten us to creating it + */ + goto relookup; + } else { + /* + * we allocated this in the first pass through + * and we're still under out target, so go + * ahead and use it + */ + dq = ndq; + ndq = NULL; + numdquot++; + } } else { - if ((dq = dqfreelist.tqh_first) == NULL) { + if (TAILQ_EMPTY(&dqfreelist)) { + qf_rele(qfp); + dq_list_unlock(); + + if (ndq) { + /* + * we allocated this in the first pass through + * but we're now at the limit of our cache size + * so free it + */ + _FREE(ndq, M_DQUOT); + } tablefull("dquot"); *dqp = NODQUOT; return (EUSERS); } - if (dq->dq_cnt || (dq->dq_flags & DQ_MOD)) - panic("free dquot isn't"); + dq = TAILQ_FIRST(&dqfreelist); + + dq_lock_internal(dq); + + if (dq->dq_cnt || (dq->dq_flags & DQ_MOD)) { + /* + * we lost the race while we weren't holding + * the quota list lock... dq_lock_internal + * will drop it to msleep... this dq has been + * reclaimed... go find another + */ + dq_unlock_internal(dq); + + /* + * need to look for the entry again in the cache + * since we dropped the quota list lock and + * someone else may have beaten us to creating it + */ + goto relookup; + } TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); - LIST_REMOVE(dq, dq_hash); + + if (dq->dq_qfile != NULL) { + LIST_REMOVE(dq, dq_hash); + dq->dq_qfile = NULL; + dq->dq_id = 0; + } + dq_unlock_internal(dq); + + /* + * because we may have dropped the quota list lock + * in the call to dq_lock_internal, we need to + * relookup in the hash in case someone else + * caused a dq with this identity to be created... + * if we don't find it, we'll use this one + */ + fdq = dq; + goto relookup; } + /* + * we've either freshly allocated a dq + * or we've atomically pulled it out of + * the hash and freelists... no one else + * can have a reference, which means no + * one else can be trying to use this dq + */ + dq_lock_internal(dq); + /* * Initialize the contents of the dquot structure. 
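
The reworked dqget() above repeatedly drops the quota list lock (around _MALLOC, and inside dq_lock_internal) and then jumps back to relookup, because another thread may have installed an identical dquot in the window. That drop-allocate-recheck idiom, reduced to a sketch (hash_find/hash_insert are hypothetical helpers):

    struct entry *ep, *nep = NULL;

    lck_mtx_lock(list_mtx);
    for (;;) {
            if ((ep = hash_find(id)) != NULL)
                    break;                          /* someone else created it */
            if (nep == NULL) {
                    lck_mtx_unlock(list_mtx);       /* _MALLOC may block... */
                    nep = _MALLOC(sizeof (*nep), M_TEMP, M_WAITOK);
                    lck_mtx_lock(list_mtx);
                    continue;                       /* ...so probe again */
            }
            ep = nep;                               /* we won the race: install ours */
            nep = NULL;
            hash_insert(id, ep);
            break;
    }
    lck_mtx_unlock(list_mtx);

    if (nep != NULL)
            _FREE(nep, M_TEMP);                     /* lost the race: discard */
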
*/ - if (vp != dqvp) - vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY, p); - LIST_INSERT_HEAD(dqh, dq, dq_hash); - DQREF(dq); - dq->dq_flags = DQ_LOCK; + dq->dq_cnt = 1; + dq->dq_flags = 0; dq->dq_id = id; dq->dq_qfile = qfp; dq->dq_type = type; + /* + * once we insert it in the hash and + * drop the quota_list_lock, it can be + * 'found'... however, we're still holding + * the dq_lock which will keep us from doing + * anything with it until we've finished + * initializing it... + */ + LIST_INSERT_HEAD(dqh, dq, dq_hash); + dq_list_unlock(); + + if (ndq) { + /* + * we allocated this in the first pass through + * but we didn't need it, so free it after + * we've droped the quota list lock + */ + _FREE(ndq, M_DQUOT); + } + error = dqlookup(qfp, id, &dq->dq_dqb, &dq->dq_index); - if (vp != dqvp) - VOP_UNLOCK(dqvp, 0, p); - if (dq->dq_flags & DQ_WANT) - wakeup((caddr_t)dq); - dq->dq_flags = 0; /* * I/O error in reading quota file, release * quota structure and reflect problem to caller. */ if (error) { + dq_list_lock(); + + dq->dq_id = 0; + dq->dq_qfile = NULL; LIST_REMOVE(dq, dq_hash); - dqrele(vp, dq); + + dq_unlock_internal(dq); + qf_rele(qfp); + dq_list_unlock(); + + dqrele(dq); + *dqp = NODQUOT; return (error); } @@ -309,18 +713,26 @@ dqget(vp, id, qfp, type, dqp) dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) dq->dq_flags |= DQ_FAKE; if (dq->dq_id != 0) { + struct timeval tv; + + microtime(&tv); if (dq->dq_btime == 0) - dq->dq_btime = time.tv_sec + qfp->qf_btime; + dq->dq_btime = tv.tv_sec + qfp->qf_btime; if (dq->dq_itime == 0) - dq->dq_itime = time.tv_sec + qfp->qf_itime; + dq->dq_itime = tv.tv_sec + qfp->qf_itime; } + dq_list_lock(); + dq_unlock_internal(dq); + qf_rele(qfp); + dq_list_unlock(); + *dqp = dq; return (0); } /* * Lookup a dqblk structure for the specified identifier and - * quota file. If there is no enetry for this identifier then + * quota file. If there is no entry for this identifier then * one is inserted. The actual hash table index is returned. 
*/ static int @@ -331,22 +743,20 @@ dqlookup(qfp, id, dqb, index) u_int32_t *index; { struct vnode *dqvp; - struct ucred *cred; - struct iovec aiov; - struct uio auio; + struct vfs_context context; + uio_t auio; int i, skip, last; u_long mask; int error = 0; + char uio_buf[ UIO_SIZEOF(1) ]; + + + qf_lock(qfp); - if (id == 0) - return (EINVAL); dqvp = qfp->qf_vp; - cred = qfp->qf_cred; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_procp = (struct proc *)0; + context.vc_proc = current_proc(); + context.vc_ucred = qfp->qf_cred; mask = qfp->qf_maxentries - 1; i = dqhash1(id, qfp->qf_shift, mask); @@ -355,17 +765,14 @@ dqlookup(qfp, id, dqb, index) for (last = (i + (qfp->qf_maxentries-1) * skip) & mask; i != last; i = (i + skip) & mask) { - - aiov.iov_base = (caddr_t)dqb; - aiov.iov_len = sizeof (struct dqblk); - auio.uio_resid = sizeof (struct dqblk); - auio.uio_offset = (off_t)dqoffset(i); - auio.uio_rw = UIO_READ; - error = VOP_READ(dqvp, &auio, 0, cred); + auio = uio_createwithbuffer(1, dqoffset(i), UIO_SYSSPACE, UIO_READ, + &uio_buf[0], sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(dqb), sizeof (struct dqblk)); + error = VNOP_READ(dqvp, auio, 0, &context); if (error) { printf("dqlookup: error %d looking up id %d at index %d\n", error, id, i); break; - } else if (auio.uio_resid) { + } else if (uio_resid(auio)) { error = EIO; printf("dqlookup: error looking up id %d at index %d\n", id, i); break; @@ -381,13 +788,10 @@ dqlookup(qfp, id, dqb, index) /* * Write back to reserve entry for this id */ - aiov.iov_base = (caddr_t)dqb; - aiov.iov_len = sizeof (struct dqblk); - auio.uio_resid = sizeof (struct dqblk); - auio.uio_offset = (off_t)dqoffset(i); - auio.uio_rw = UIO_WRITE; - error = VOP_WRITE(dqvp, &auio, 0, cred); - if (auio.uio_resid && error == 0) + uio_reset(auio, dqoffset(i), UIO_SYSSPACE, UIO_WRITE); + uio_addiov(auio, CAST_USER_ADDR_T(dqb), sizeof (struct dqblk)); + error = VNOP_WRITE(dqvp, auio, 0, &context); + if (uio_resid(auio) && error == 0) error = EIO; if (error == 0) ++qfp->qf_entrycnt; @@ -397,62 +801,66 @@ dqlookup(qfp, id, dqb, index) if (dqb->dqb_id == id) break; } - + qf_unlock(qfp); + *index = i; /* remember index so we don't have to recompute it later */ + return (error); } -/* - * Obtain a reference to a dquot. - */ -void -dqref(dq) - struct dquot *dq; -{ - - dq->dq_cnt++; -} /* * Release a reference to a dquot. */ void -dqrele(vp, dq) - struct vnode *vp; - register struct dquot *dq; +dqrele(struct dquot *dq) { if (dq == NODQUOT) return; + dqlock(dq); + if (dq->dq_cnt > 1) { dq->dq_cnt--; + + dqunlock(dq); return; } if (dq->dq_flags & DQ_MOD) - (void) dqsync(vp, dq); - if (--dq->dq_cnt > 0) - return; + (void) dqsync_locked(dq); + dq->dq_cnt--; + + dq_list_lock(); TAILQ_INSERT_TAIL(&dqfreelist, dq, dq_freelist); + dq_unlock_internal(dq); + dq_list_unlock(); } /* * Release a reference to a dquot but don't do any I/O. 
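
dqlookup() above probes the on-disk hash file with double hashing: a primary index from dqhash1 plus a per-id skip, wrapping with a power-of-two mask so each slot is visited at most once per search. The in-memory equivalent, as a sketch (h1/h2 are illustrative hash functions; the skip must be odd to cycle through a power-of-two table):

    u_long mask = maxentries - 1;           /* maxentries is a power of two */
    int i = h1(id) & mask;
    int skip = h2(id) | 1;                  /* odd skip => full cycle */
    int last = (i + (maxentries - 1) * skip) & mask;

    for (; i != last; i = (i + skip) & mask) {
            if (table[i].id == id)          /* hit */
                    break;
            if (table[i].id == 0)           /* free slot: claim it for 'id' */
                    break;
    }
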
*/ void -dqreclaim(vp, dq) - struct vnode *vp; - register struct dquot *dq; +dqreclaim(register struct dquot *dq) { + if (dq == NODQUOT) return; - if (--dq->dq_cnt > 0) - return; + dq_list_lock(); + dq_lock_internal(dq); + if (--dq->dq_cnt > 0) { + dq_unlock_internal(dq); + dq_list_unlock(); + return; + } if (dq->dq_flags & DQ_MOD) TAILQ_INSERT_TAIL(&dqdirtylist, dq, dq_freelist); else TAILQ_INSERT_TAIL(&dqfreelist, dq, dq_freelist); + + dq_unlock_internal(dq); + dq_list_unlock(); } /* @@ -463,80 +871,108 @@ dqsync_orphans(qfp) struct quotafile *qfp; { struct dquot *dq; - + + dq_list_lock(); loop: TAILQ_FOREACH(dq, &dqdirtylist, dq_freelist) { - if ((dq->dq_flags & DQ_MOD) == 0) - panic("dqsync_orphans: dirty dquot isn't"); + if (dq->dq_qfile != qfp) + continue; + + dq_lock_internal(dq); + + if (dq->dq_qfile != qfp) { + /* + * the identity of this dq changed while + * the quota_list_lock was dropped + * dq_lock_internal can drop it to msleep + */ + dq_unlock_internal(dq); + goto loop; + } + if ((dq->dq_flags & DQ_MOD) == 0) { + /* + * someone cleaned and removed this from + * the dq from the dirty list while the + * quota_list_lock was dropped + */ + dq_unlock_internal(dq); + goto loop; + } if (dq->dq_cnt != 0) panic("dqsync_orphans: dquot in use"); - if (dq->dq_qfile == qfp) { - TAILQ_REMOVE(&dqdirtylist, dq, dq_freelist); + TAILQ_REMOVE(&dqdirtylist, dq, dq_freelist); - dq->dq_cnt++; - (void) dqsync(NULLVP, dq); - dq->dq_cnt--; + dq_list_unlock(); + /* + * we're still holding the dqlock at this point + * with the reference count == 0 + * we shouldn't be able + * to pick up another one since we hold dqlock + */ + (void) dqsync_locked(dq); + + dq_list_lock(); - if ((dq->dq_cnt == 0) && (dq->dq_flags & DQ_MOD) == 0) - TAILQ_INSERT_TAIL(&dqfreelist, dq, dq_freelist); + TAILQ_INSERT_TAIL(&dqfreelist, dq, dq_freelist); - goto loop; - } + dq_unlock_internal(dq); + goto loop; + } + dq_list_unlock(); +} + +int +dqsync(struct dquot *dq) +{ + int error = 0; + + if (dq != NODQUOT) { + dqlock(dq); + + if ( (dq->dq_flags & DQ_MOD) ) + error = dqsync_locked(dq); + + dqunlock(dq); } + return (error); } + /* * Update the disk quota in the quota file. 
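
dqsync() above is now a thin wrapper: take the dquot lock, test DQ_MOD, and delegate to dqsync_locked(), which assumes the lock is held. The foo()/foo_locked() split lets paths that already hold the lock (dqrele, dqsync_orphans) reuse the I/O code without recursing on the lock. The convention, sketched with hypothetical names:

    int
    foo_sync(struct foo *fp)
    {
            int error = 0;

            foo_lock(fp);
            if (fp->f_flags & F_DIRTY)
                    error = foo_sync_locked(fp);    /* caller holds the lock */
            foo_unlock(fp);

            return (error);
    }
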
*/ int -dqsync(vp, dq) - struct vnode *vp; - struct dquot *dq; +dqsync_locked(struct dquot *dq) { struct proc *p = current_proc(); /* XXX */ + struct vfs_context context; struct vnode *dqvp; - struct iovec aiov; - struct uio auio; + uio_t auio; int error; + char uio_buf[ UIO_SIZEOF(1) ]; - if (dq == NODQUOT) - panic("dqsync: dquot"); - if ((dq->dq_flags & DQ_MOD) == 0) + if (dq->dq_id == 0) { + dq->dq_flags &= ~DQ_MOD; return (0); - if (dq->dq_id == 0) - return(0); - if ((dqvp = dq->dq_qfile->qf_vp) == NULLVP) - panic("dqsync: file"); - if (vp != dqvp) - vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY, p); - while (dq->dq_flags & DQ_LOCK) { - dq->dq_flags |= DQ_WANT; - sleep((caddr_t)dq, PINOD+2); - if ((dq->dq_flags & DQ_MOD) == 0) { - if (vp != dqvp) - VOP_UNLOCK(dqvp, 0, p); - return (0); - } } - dq->dq_flags |= DQ_LOCK; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - aiov.iov_base = (caddr_t)&dq->dq_dqb; - aiov.iov_len = sizeof (struct dqblk); - auio.uio_resid = sizeof (struct dqblk); - auio.uio_offset = (off_t)dqoffset(dq->dq_index); - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_WRITE; - auio.uio_procp = (struct proc *)0; - error = VOP_WRITE(dqvp, &auio, 0, dq->dq_qfile->qf_cred); - if (auio.uio_resid && error == 0) + if (dq->dq_qfile == NULL) + panic("dqsync: NULL dq_qfile"); + if ((dqvp = dq->dq_qfile->qf_vp) == NULLVP) + panic("dqsync: NULL qf_vp"); + + auio = uio_createwithbuffer(1, dqoffset(dq->dq_index), UIO_SYSSPACE, + UIO_WRITE, &uio_buf[0], sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(&dq->dq_dqb), sizeof (struct dqblk)); + + context.vc_proc = p; + context.vc_ucred = dq->dq_qfile->qf_cred; + + error = VNOP_WRITE(dqvp, auio, 0, &context); + if (uio_resid(auio) && error == 0) error = EIO; - if (dq->dq_flags & DQ_WANT) - wakeup((caddr_t)dq); - dq->dq_flags &= ~(DQ_MOD|DQ_LOCK|DQ_WANT); - if (vp != dqvp) - VOP_UNLOCK(dqvp, 0, p); + dq->dq_flags &= ~DQ_MOD; + return (error); } @@ -555,6 +991,8 @@ dqflush(vp) * file off their hash chains (they will eventually * fall off the head of the free list and be re-used). */ + dq_list_lock(); + for (dqh = &dqhashtbl[dqhash]; dqh >= dqhashtbl; dqh--) { for (dq = dqh->lh_first; dq; dq = nextdq) { nextdq = dq->dq_hash.le_next; @@ -563,7 +1001,32 @@ dqflush(vp) if (dq->dq_cnt) panic("dqflush: stray dquot"); LIST_REMOVE(dq, dq_hash); - dq->dq_qfile = (struct quotafile *)0; + dq->dq_qfile = NULL; } } + dq_list_unlock(); +} + +/* + * LP64 support for munging dqblk structure. + * XXX conversion of user_time_t to time_t loses precision; not an issue for + * XXX us now, since we are only ever setting 32 bits worth of time into it. 
+ */ +__private_extern__ void +munge_dqblk(struct dqblk *dqblkp, struct user_dqblk *user_dqblkp, boolean_t to64) +{ + if (to64) { + /* munge kernel (32 bit) dqblk into user (64 bit) dqblk */ + bcopy((caddr_t)dqblkp, (caddr_t)user_dqblkp, offsetof(struct dqblk, dqb_btime)); + user_dqblkp->dqb_id = dqblkp->dqb_id; + user_dqblkp->dqb_itime = dqblkp->dqb_itime; + user_dqblkp->dqb_btime = dqblkp->dqb_btime; + } + else { + /* munge user (64 bit) dqblk into kernel (32 bit) dqblk */ + bcopy((caddr_t)user_dqblkp, (caddr_t)dqblkp, offsetof(struct dqblk, dqb_btime)); + dqblkp->dqb_id = user_dqblkp->dqb_id; + dqblkp->dqb_itime = user_dqblkp->dqb_itime; /* XXX - lose precision */ + dqblkp->dqb_btime = user_dqblkp->dqb_btime; /* XXX - lose precision */ + } } diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index e91eab165..a01ac6c45 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -69,24 +69,30 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/proc.h> -#include <sys/mount.h> +#include <sys/proc_internal.h> +#include <sys/kauth.h> +#include <sys/mount_internal.h> #include <sys/time.h> -#include <sys/vnode.h> +#include <sys/lock.h> +#include <sys/vnode_internal.h> #include <sys/stat.h> #include <sys/namei.h> #include <sys/ucred.h> -#include <sys/buf.h> +#include <sys/buf_internal.h> #include <sys/errno.h> #include <sys/malloc.h> #include <sys/domain.h> #include <sys/mbuf.h> #include <sys/syslog.h> -#include <sys/ubc.h> +#include <sys/ubc_internal.h> #include <sys/vm.h> #include <sys/sysctl.h> #include <sys/filedesc.h> #include <sys/event.h> +#include <sys/kdebug.h> +#include <sys/kauth.h> +#include <sys/user.h> +#include <miscfs/fifofs/fifo.h> #include <string.h> #include <machine/spl.h> @@ -99,6 +105,11 @@ #include <mach/mach_types.h> #include <mach/memory_object_types.h> +extern lck_grp_t *vnode_lck_grp; +extern lck_attr_t *vnode_lck_attr; + + +extern lck_mtx_t * mnt_list_mtx_lock; enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, @@ -109,15 +120,39 @@ int vttoif_tab[9] = { S_IFSOCK, S_IFIFO, S_IFMT, }; -static void vfree(struct vnode *vp); -static void vinactive(struct vnode *vp); -static int vnreclaim(int count); -extern kern_return_t - adjust_vm_object_cache(vm_size_t oval, vm_size_t nval); +extern int ubc_isinuse_locked(vnode_t, int, int); +extern kern_return_t adjust_vm_object_cache(vm_size_t oval, vm_size_t nval); + +static void vnode_list_add(vnode_t); +static void vnode_list_remove(vnode_t); + +static errno_t vnode_drain(vnode_t); +static void vgone(vnode_t); +static void vclean(vnode_t vp, int flag, proc_t p); +static void vnode_reclaim_internal(vnode_t, int, int); + +static void vnode_dropiocount (vnode_t, int); +static errno_t vnode_getiocount(vnode_t vp, int locked, int vid, int vflags); +static int vget_internal(vnode_t, int, int); + +static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev); +static int vnode_reload(vnode_t); +static int vnode_isinuse_locked(vnode_t, int, int); + +static void insmntque(vnode_t vp, mount_t mp); +mount_t mount_list_lookupby_fsid(fsid_t *, int, int); +static int mount_getvfscnt(void); +static int mount_fillfsids(fsid_t *, int ); +static void vnode_iterate_setup(mount_t); +static int vnode_umount_preflight(mount_t, vnode_t, int); +static int vnode_iterate_prepare(mount_t); +static int vnode_iterate_reloadq(mount_t); 
+static void vnode_iterate_clear(mount_t); TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ TAILQ_HEAD(inactivelst, vnode) vnode_inactive_list; /* vnode inactive list */ struct mntlist mountlist; /* mounted filesystem list */ +static int nummounts = 0; #if DIAGNOSTIC #define VLISTCHECK(fun, vp, list) \ @@ -166,31 +201,13 @@ struct mntlist mountlist; /* mounted filesystem list */ inactivevnodes--; \ } while(0) -#define VORECLAIM_ENABLE(vp) \ - do { \ - if (ISSET((vp)->v_flag, VORECLAIM)) \ - panic("vm_object_reclaim already"); \ - SET((vp)->v_flag, VORECLAIM); \ - } while(0) - -#define VORECLAIM_DISABLE(vp) \ - do { \ - CLR((vp)->v_flag, VORECLAIM); \ - if (ISSET((vp)->v_flag, VXWANT)) { \ - CLR((vp)->v_flag, VXWANT); \ - wakeup((caddr_t)(vp)); \ - } \ - } while(0) - /* * Have to declare first two locks as actual data even if !MACH_SLOCKS, since * a pointers to them get passed around. */ -simple_lock_data_t mountlist_slock; -simple_lock_data_t mntvnode_slock; -decl_simple_lock_data(,mntid_slock); -decl_simple_lock_data(,vnode_free_list_slock); -decl_simple_lock_data(,spechash_slock); +void * mntvnode_slock; +void * mntid_slock; +void * spechash_slock; /* * vnodetarget is the amount of vnodes we expect to get back @@ -198,7 +215,7 @@ decl_simple_lock_data(,spechash_slock); * As vnreclaim() is a mainly cpu bound operation for faster * processers this number could be higher. * Having this number too high introduces longer delays in - * the execution of getnewvnode(). + * the execution of new_vnode(). */ unsigned long vnodetarget; /* target for vnreclaim() */ #define VNODE_FREE_TARGET 20 /* Default value for vnodetarget */ @@ -229,19 +246,11 @@ unsigned long vnodetarget; /* target for vnreclaim() */ * Initialize the vnode management data structures. */ __private_extern__ void -vntblinit() +vntblinit(void) { - extern struct lock__bsd__ exchangelock; - - simple_lock_init(&mountlist_slock); - simple_lock_init(&mntvnode_slock); - simple_lock_init(&mntid_slock); - simple_lock_init(&spechash_slock); TAILQ_INIT(&vnode_free_list); - simple_lock_init(&vnode_free_list_slock); TAILQ_INIT(&vnode_inactive_list); - CIRCLEQ_INIT(&mountlist); - lockinit(&exchangelock, PVFS, "exchange", 0, 0); + TAILQ_INIT(&mountlist); if (!vnodetarget) vnodetarget = VNODE_FREE_TARGET; @@ -268,531 +277,858 @@ reset_vmobjectcache(unsigned int val1, unsigned int val2) return(adjust_vm_object_cache(oval, nval)); } -/* - * Mark a mount point as busy. Used to synchronize access and to delay - * unmounting. Interlock is not released on failure. - */ + +/* the timeout is in 10 msecs */ int -vfs_busy(mp, flags, interlkp, p) - struct mount *mp; - int flags; - struct slock *interlkp; - struct proc *p; -{ - int lkflags; +vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, char *msg) { + int error = 0; + struct timespec ts; - if (mp->mnt_kern_flag & MNTK_UNMOUNT) { - if (flags & LK_NOWAIT) - return (ENOENT); - mp->mnt_kern_flag |= MNTK_MWAIT; - if (interlkp) - simple_unlock(interlkp); - /* - * Since all busy locks are shared except the exclusive - * lock granted when unmounting, the only place that a - * wakeup needs to be done is at the release of the - * exclusive lock at the end of dounmount. 
- */ - sleep((caddr_t)mp, PVFS); - if (interlkp) - simple_lock(interlkp); - return (ENOENT); + KERNEL_DEBUG(0x3010280 | DBG_FUNC_START, (int)vp, output_target, vp->v_numoutput, 0, 0); + + if (vp->v_numoutput > output_target) { + + slpflag &= ~PDROP; + + vnode_lock(vp); + + while ((vp->v_numoutput > output_target) && error == 0) { + if (output_target) + vp->v_flag |= VTHROTTLED; + else + vp->v_flag |= VBWAIT; + ts.tv_sec = (slptimeout/100); + ts.tv_nsec = (slptimeout % 1000) * 10 * NSEC_PER_USEC * 1000 ; + error = msleep((caddr_t)&vp->v_numoutput, &vp->v_lock, (slpflag | (PRIBIO + 1)), msg, &ts); + } + vnode_unlock(vp); } - lkflags = LK_SHARED; - if (interlkp) - lkflags |= LK_INTERLOCK; - if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p)) - panic("vfs_busy: unexpected lock failure"); - return (0); + KERNEL_DEBUG(0x3010280 | DBG_FUNC_END, (int)vp, output_target, vp->v_numoutput, error, 0); + + return error; } -/* - * Free a busy filesystem. - */ + void -vfs_unbusy(mp, p) - struct mount *mp; - struct proc *p; +vnode_startwrite(vnode_t vp) { + + OSAddAtomic(1, &vp->v_numoutput); +} + + +void +vnode_writedone(vnode_t vp) { + if (vp) { + int need_wakeup = 0; + + OSAddAtomic(-1, &vp->v_numoutput); + + vnode_lock(vp); - lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p); + if (vp->v_numoutput < 0) + panic("vnode_writedone: numoutput < 0"); + + if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput < (VNODE_ASYNC_THROTTLE / 3))) { + vp->v_flag &= ~VTHROTTLED; + need_wakeup = 1; + } + if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) { + vp->v_flag &= ~VBWAIT; + need_wakeup = 1; + } + vnode_unlock(vp); + + if (need_wakeup) + wakeup((caddr_t)&vp->v_numoutput); + } } -/* - * Lookup a filesystem type, and if found allocate and initialize - * a mount structure for it. - * - * Devname is usually updated by mount(8) after booting. - */ + + int -vfs_rootmountalloc(fstypename, devname, mpp) - char *fstypename; - char *devname; - struct mount **mpp; +vnode_hasdirtyblks(vnode_t vp) { - struct proc *p = current_proc(); /* XXX */ - struct vfsconf *vfsp; - struct mount *mp; + struct cl_writebehind *wbp; - for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) - if (!strcmp(vfsp->vfc_name, fstypename)) - break; - if (vfsp == NULL) - return (ENODEV); - mp = _MALLOC_ZONE((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); - bzero((char *)mp, (u_long)sizeof(struct mount)); + /* + * Not taking the buf_mtxp as there is little + * point doing it. Even if the lock is taken the + * state can change right after that. 
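
vnode_startwrite()/vnode_writedone() above keep v_numoutput consistent with OSAddAtomic, taking the vnode lock only on completion to resolve VTHROTTLED/VBWAIT wakeups. The pairing a writer is expected to follow, as a sketch (issue_async_write is a hypothetical submission routine):

    vnode_startwrite(vp);                   /* account the I/O before issuing it */

    if ((error = issue_async_write(vp, bp)) != 0)
            vnode_writedone(vp);            /* failed to issue: undo the count */

    /* on success the completion path calls vnode_writedone(vp); a
     * drainer can then wait for the count to reach its target: */
    error = vnode_waitforwrites(vp, 0, 0, 0, "example_drain");
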
If their + * needs to be a synchronization, it must be driven + * by the caller + */ + if (vp->v_dirtyblkhd.lh_first) + return (1); + + if (!UBCINFOEXISTS(vp)) + return (0); - /* Initialize the default IO constraints */ - mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS; - mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32; + wbp = vp->v_ubcinfo->cl_wbehind; + + if (wbp && (wbp->cl_number || wbp->cl_scmap)) + return (1); - lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0); - (void)vfs_busy(mp, LK_NOWAIT, 0, p); - LIST_INIT(&mp->mnt_vnodelist); - mp->mnt_vfc = vfsp; - mp->mnt_op = vfsp->vfc_vfsops; - mp->mnt_flag = MNT_RDONLY; - mp->mnt_vnodecovered = NULLVP; - vfsp->vfc_refcount++; - mp->mnt_stat.f_type = vfsp->vfc_typenum; - mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; - strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); - mp->mnt_stat.f_mntonname[0] = '/'; - (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); - *mpp = mp; return (0); } -/* - * Find an appropriate filesystem to use for the root. If a filesystem - * has not been preselected, walk through the list of known filesystems - * trying those that have mountroot routines, and try them until one - * works or we have tried them all. - */ int -vfs_mountroot() +vnode_hascleanblks(vnode_t vp) { - struct vfsconf *vfsp; - extern int (*mountroot)(void); - int error; + /* + * Not taking the buf_mtxp as there is little + * point doing it. Even if the lock is taken the + * state can change right after that. If their + * needs to be a synchronization, it must be driven + * by the caller + */ + if (vp->v_cleanblkhd.lh_first) + return (1); + return (0); +} - if (mountroot != NULL) { - error = (*mountroot)(); - return (error); - } - - for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { - if (vfsp->vfc_mountroot == NULL) - continue; - if ((error = (*vfsp->vfc_mountroot)()) == 0) - return (0); - if (error != EINVAL) - printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); +void +vnode_iterate_setup(mount_t mp) +{ + while (mp->mnt_lflag & MNT_LITER) { + mp->mnt_lflag |= MNT_LITERWAIT; + msleep((caddr_t)mp, &mp->mnt_mlock, PVFS, "vnode_iterate_setup", 0); } - return (ENODEV); + + mp->mnt_lflag |= MNT_LITER; + } -/* - * Lookup a mount point by filesystem identifier. 
- */ -struct mount * -vfs_getvfs(fsid) - fsid_t *fsid; +static int +vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags) { - register struct mount *mp; + vnode_t vp; - simple_lock(&mountlist_slock); - CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) { - if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && - mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { - simple_unlock(&mountlist_slock); - return (mp); + TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { + if (vp->v_type == VDIR) + continue; + if (vp == skipvp) + continue; + if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || + (vp->v_flag & VNOFLUSH))) + continue; + if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) + continue; + if ((flags & WRITECLOSE) && + (vp->v_writecount == 0 || vp->v_type != VREG)) + continue; + /* Look for busy vnode */ + if (((vp->v_usecount != 0) && + ((vp->v_usecount - vp->v_kusecount) != 0))) + return(1); } - } - simple_unlock(&mountlist_slock); - return ((struct mount *)0); + + return(0); } -/* - * Get a new unique fsid +/* + * This routine prepares iteration by moving all the vnodes to worker queue + * called with mount lock held */ -void -vfs_getnewfsid(mp) - struct mount *mp; +int +vnode_iterate_prepare(mount_t mp) { -static u_short xxxfs_mntid; + vnode_t vp; - fsid_t tfsid; - int mtype; + if (TAILQ_EMPTY(&mp->mnt_vnodelist)) { + /* nothing to do */ + return (0); + } - simple_lock(&mntid_slock); - mtype = mp->mnt_vfc->vfc_typenum; - mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0); - mp->mnt_stat.f_fsid.val[1] = mtype; - if (xxxfs_mntid == 0) - ++xxxfs_mntid; - tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid); - tfsid.val[1] = mtype; - if (!CIRCLEQ_EMPTY(&mountlist)) { - while (vfs_getvfs(&tfsid)) { - tfsid.val[0]++; - xxxfs_mntid++; - } - } - mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; - simple_unlock(&mntid_slock); + vp = TAILQ_FIRST(&mp->mnt_vnodelist); + vp->v_mntvnodes.tqe_prev = &(mp->mnt_workerqueue.tqh_first); + mp->mnt_workerqueue.tqh_first = mp->mnt_vnodelist.tqh_first; + mp->mnt_workerqueue.tqh_last = mp->mnt_vnodelist.tqh_last; + + TAILQ_INIT(&mp->mnt_vnodelist); + if (mp->mnt_newvnodes.tqh_first != NULL) + panic("vnode_iterate_prepare: newvnode when entering vnode"); + TAILQ_INIT(&mp->mnt_newvnodes); + + return (1); } -/* - * Set vnode attributes to VNOVAL - */ -void -vattr_null(vap) - register struct vattr *vap; + +/* called with mount lock held */ +int +vnode_iterate_reloadq(mount_t mp) { + int moved = 0; + + /* add the remaining entries in workerq to the end of mount vnode list */ + if (!TAILQ_EMPTY(&mp->mnt_workerqueue)) { + struct vnode * mvp; + mvp = TAILQ_LAST(&mp->mnt_vnodelist, vnodelst); + + /* Joining the workerque entities to mount vnode list */ + if (mvp) + mvp->v_mntvnodes.tqe_next = mp->mnt_workerqueue.tqh_first; + else + mp->mnt_vnodelist.tqh_first = mp->mnt_workerqueue.tqh_first; + mp->mnt_workerqueue.tqh_first->v_mntvnodes.tqe_prev = mp->mnt_vnodelist.tqh_last; + mp->mnt_vnodelist.tqh_last = mp->mnt_workerqueue.tqh_last; + TAILQ_INIT(&mp->mnt_workerqueue); + } + + /* add the newvnodes to the head of mount vnode list */ + if (!TAILQ_EMPTY(&mp->mnt_newvnodes)) { + struct vnode * nlvp; + nlvp = TAILQ_LAST(&mp->mnt_newvnodes, vnodelst); + + mp->mnt_newvnodes.tqh_first->v_mntvnodes.tqe_prev = &mp->mnt_vnodelist.tqh_first; + nlvp->v_mntvnodes.tqe_next = mp->mnt_vnodelist.tqh_first; + if(mp->mnt_vnodelist.tqh_first) + mp->mnt_vnodelist.tqh_first->v_mntvnodes.tqe_prev = &nlvp->v_mntvnodes.tqe_next; + else + mp->mnt_vnodelist.tqh_last = mp->mnt_newvnodes.tqh_last; + 
mp->mnt_vnodelist.tqh_first = mp->mnt_newvnodes.tqh_first; + TAILQ_INIT(&mp->mnt_newvnodes); + moved = 1; + } - vap->va_type = VNON; - vap->va_size = vap->va_bytes = VNOVAL; - vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid = - vap->va_fsid = vap->va_fileid = - vap->va_blocksize = vap->va_rdev = - vap->va_atime.tv_sec = vap->va_atime.tv_nsec = - vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec = - vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec = - vap->va_flags = vap->va_gen = VNOVAL; - vap->va_vaflags = 0; + return(moved); } -/* - * Routines having to do with the management of the vnode table. - */ -extern int (**dead_vnodeop_p)(void *); -static void vclean __P((struct vnode *vp, int flag, struct proc *p)); -extern void vgonel __P((struct vnode *vp, struct proc *p)); -long numvnodes, freevnodes; -long inactivevnodes; -long vnode_reclaim_tried; -long vnode_objects_reclaimed; +void +vnode_iterate_clear(mount_t mp) +{ + mp->mnt_lflag &= ~MNT_LITER; + if (mp->mnt_lflag & MNT_LITERWAIT) { + mp->mnt_lflag &= ~MNT_LITERWAIT; + wakeup(mp); + } +} -extern struct vattr va_null; -/* - * Return the next vnode from the free list. - */ int -getnewvnode(tag, mp, vops, vpp) - enum vtagtype tag; - struct mount *mp; - int (**vops)(void *); - struct vnode **vpp; +vnode_iterate(mp, flags, callout, arg) + mount_t mp; + int flags; + int (*callout)(struct vnode *, void *); + void * arg; { - struct proc *p = current_proc(); /* XXX */ struct vnode *vp; - int cnt, didretry = 0; - static int reused = 0; /* track the reuse rate */ - int reclaimhits = 0; - -retry: - simple_lock(&vnode_free_list_slock); - /* - * MALLOC a vnode if the number of vnodes has not reached the desired - * value and the number on the free list is still reasonable... - * reuse from the freelist even though we may evict a name cache entry - * to reduce the number of vnodes that accumulate.... vnodes tie up - * wired memory and are never garbage collected - */ - if (numvnodes < desiredvnodes && (freevnodes < (2 * VNODE_FREE_MIN))) { - numvnodes++; - simple_unlock(&vnode_free_list_slock); - MALLOC_ZONE(vp, struct vnode *, sizeof *vp, M_VNODE, M_WAITOK); - bzero((char *)vp, sizeof *vp); - VLISTNONE(vp); /* avoid double queue removal */ - simple_lock_init(&vp->v_interlock); - goto done; - } + int vid, retval; + int ret = 0; - /* - * Once the desired number of vnodes are allocated, - * we start reusing the vnodes. - */ - if (freevnodes < VNODE_FREE_MIN) { - /* - * if we are low on vnodes on the freelist attempt to get - * some back from the inactive list and VM object cache - */ - simple_unlock(&vnode_free_list_slock); - (void)vnreclaim(vnodetarget); - simple_lock(&vnode_free_list_slock); - } - if (numvnodes >= desiredvnodes && reused > VNODE_TOOMANY_REUSED) { - reused = 0; - if (freevnodes < VNODE_FREE_ENOUGH) { - simple_unlock(&vnode_free_list_slock); - (void)vnreclaim(vnodetarget); - simple_lock(&vnode_free_list_slock); - } - } - - for (cnt = 0, vp = vnode_free_list.tqh_first; - vp != NULLVP; cnt++, vp = vp->v_freelist.tqe_next) { - if (simple_lock_try(&vp->v_interlock)) { - /* got the interlock */ - if (ISSET(vp->v_flag, VORECLAIM)) { - /* skip over the vnodes that are being reclaimed */ - simple_unlock(&vp->v_interlock); - reclaimhits++; - } else - break; - } - } + mount_lock(mp); - /* - * Unless this is a bad time of the month, at most - * the first NCPUS items on the free list are - * locked, so this is close enough to being empty. 
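
vnode_iterate_prepare()/vnode_iterate_reloadq() above splice whole lists by hand: vnodes created during iteration accumulate on mnt_newvnodes and are prepended afterwards, while unvisited leftovers on mnt_workerqueue are appended. Where TAILQ_CONCAT is available in queue.h, the reload step is essentially this sketch (the hand-rolled version exists to fix up the head/tail pointers explicitly):

    /* leftovers from the worker queue rejoin the tail of the main list */
    TAILQ_CONCAT(&mp->mnt_vnodelist, &mp->mnt_workerqueue, v_mntvnodes);

    /* new vnodes go to the head: append the old list to the new one,
     * then move the combined list back */
    if (!TAILQ_EMPTY(&mp->mnt_newvnodes)) {
            TAILQ_CONCAT(&mp->mnt_newvnodes, &mp->mnt_vnodelist, v_mntvnodes);
            TAILQ_CONCAT(&mp->mnt_vnodelist, &mp->mnt_newvnodes, v_mntvnodes);
    }
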
- */ - if (vp == NULLVP) { - simple_unlock(&vnode_free_list_slock); - if (!(didretry++) && (vnreclaim(vnodetarget) > 0)) - goto retry; - tablefull("vnode"); - log(LOG_EMERG, "%d vnodes locked, %d desired, %d numvnodes, " - "%d free, %d inactive, %d being reclaimed\n", - cnt, desiredvnodes, numvnodes, freevnodes, inactivevnodes, - reclaimhits); - *vpp = 0; - return (ENFILE); - } + vnode_iterate_setup(mp); - if (vp->v_usecount) - panic("free vnode isn't: v_type = %d, v_usecount = %d?", - vp->v_type, vp->v_usecount); + /* it is returns 0 then there is nothing to do */ + retval = vnode_iterate_prepare(mp); - VREMFREE("getnewvnode", vp); - reused++; - simple_unlock(&vnode_free_list_slock); - vp->v_lease = NULL; - cache_purge(vp); - if (vp->v_type != VBAD) - vgonel(vp, p); /* clean and reclaim the vnode */ - else - simple_unlock(&vp->v_interlock); -#if DIAGNOSTIC - if (vp->v_data) - panic("cleaned vnode isn't"); - { - int s = splbio(); - if (vp->v_numoutput) - panic("Clean vnode has pending I/O's"); - splx(s); + if (retval == 0) { + vnode_iterate_clear(mp); + mount_unlock(mp); + return(ret); } -#endif - if (UBCINFOEXISTS(vp)) - panic("getnewvnode: ubcinfo not cleaned"); - else - vp->v_ubcinfo = UBC_INFO_NULL; + + /* iterate over all the vnodes */ + while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) { + vp = TAILQ_FIRST(&mp->mnt_workerqueue); + TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes); + TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes); + vid = vp->v_id; + if ((vp->v_data == NULL) || (vp->v_type == VNON) || (vp->v_mount != mp)) { + continue; + } + mount_unlock(mp); - if (vp->v_flag & VHASDIRTY) - cluster_release(vp); + if ( vget_internal(vp, vid, (flags | VNODE_NODEAD| VNODE_WITHID | VNODE_NOSUSPEND))) { + mount_lock(mp); + continue; + } + if (flags & VNODE_RELOAD) { + /* + * we're reloading the filesystem + * cast out any inactive vnodes... + */ + if (vnode_reload(vp)) { + /* vnode will be recycled on the refcount drop */ + vnode_put(vp); + mount_lock(mp); + continue; + } + } - // make sure all these fields are cleared out as the - // name/parent stuff uses them and assumes they're - // cleared to null/0. - if (vp->v_scmap != NULL) { - panic("getnewvnode: vp @ 0x%x has non-null scmap.\n", vp); + retval = callout(vp, arg); + + switch (retval) { + case VNODE_RETURNED: + case VNODE_RETURNED_DONE: + vnode_put(vp); + if (retval == VNODE_RETURNED_DONE) { + mount_lock(mp); + ret = 0; + goto out; + } + break; + + case VNODE_CLAIMED_DONE: + mount_lock(mp); + ret = 0; + goto out; + case VNODE_CLAIMED: + default: + break; + } + mount_lock(mp); } - vp->v_un.vu_name = NULL; - vp->v_scdirty = 0; - vp->v_un1.v_cl.v_pad = 0; - - - vp->v_lastr = -1; - vp->v_ralen = 0; - vp->v_maxra = 0; - vp->v_ciosiz = 0; - vp->v_clen = 0; - vp->v_socket = 0; - /* we may have blocked, re-evaluate state */ - simple_lock(&vnode_free_list_slock); - if (VONLIST(vp)) { - if (vp->v_usecount == 0) - VREMFREE("getnewvnode", vp); - else if (ISSET((vp)->v_flag, VUINACTIVE)) - VREMINACTIVE("getnewvnode", vp); - } - simple_unlock(&vnode_free_list_slock); +out: + (void)vnode_iterate_reloadq(mp); + vnode_iterate_clear(mp); + mount_unlock(mp); + return (ret); +} -done: - vp->v_flag = VSTANDARD; - vp->v_type = VNON; - vp->v_tag = tag; - vp->v_op = vops; - insmntque(vp, mp); - *vpp = vp; - vp->v_usecount = 1; - vp->v_data = 0; - return (0); +void +mount_lock_renames(mount_t mp) +{ + lck_mtx_lock(&mp->mnt_renamelock); } -/* - * Move a vnode from one mount queue to another. 
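
The callout contract in vnode_iterate() above: the callback runs with an iocount held on vp and steers the loop by its return value. VNODE_RETURNED hands the reference back to the iterator (which does the vnode_put), the CLAIMED variants mean the callback kept or consumed it, and the *_DONE forms stop the walk. A minimal callback, as a sketch:

    static int
    count_regular_files(vnode_t vp, void *arg)
    {
            int *countp = (int *)arg;

            if (vp->v_type == VREG)
                    (*countp)++;

            return (VNODE_RETURNED);        /* iterator drops the iocount for us */
    }

    /* usage: error = vnode_iterate(mp, 0, count_regular_files, &count); */
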
- */ void -insmntque(vp, mp) - struct vnode *vp; - struct mount *mp; +mount_unlock_renames(mount_t mp) { + lck_mtx_unlock(&mp->mnt_renamelock); +} - simple_lock(&mntvnode_slock); - /* - * Delete from old mount point vnode list, if on one. - */ - if (vp->v_mount != NULL) - LIST_REMOVE(vp, v_mntvnodes); - /* - * Insert into list of vnodes for the new mount point, if available. - */ - if ((vp->v_mount = mp) != NULL) - LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); - simple_unlock(&mntvnode_slock); +void +mount_lock(mount_t mp) +{ + lck_mtx_lock(&mp->mnt_mlock); } -__inline void -vpwakeup(struct vnode *vp) +void +mount_unlock(mount_t mp) { - if (vp) { - if (--vp->v_numoutput < 0) - panic("vpwakeup: neg numoutput"); - if ((vp->v_flag & VBWAIT || vp->v_flag & VTHROTTLED) - && vp->v_numoutput <= 0) { - vp->v_flag &= ~(VBWAIT|VTHROTTLED); - wakeup((caddr_t)&vp->v_numoutput); - } - } + lck_mtx_unlock(&mp->mnt_mlock); } -/* - * Update outstanding I/O count and do wakeup if requested. - */ + void -vwakeup(bp) - register struct buf *bp; +mount_ref(mount_t mp, int locked) { - CLR(bp->b_flags, B_WRITEINPROG); - vpwakeup(bp->b_vp); + if ( !locked) + mount_lock(mp); + + mp->mnt_count++; + + if ( !locked) + mount_unlock(mp); } -/* - * Flush out and invalidate all buffers associated with a vnode. - * Called with the underlying object locked. - */ + +void +mount_drop(mount_t mp, int locked) +{ + if ( !locked) + mount_lock(mp); + + mp->mnt_count--; + + if (mp->mnt_count == 0 && (mp->mnt_lflag & MNT_LDRAIN)) + wakeup(&mp->mnt_lflag); + + if ( !locked) + mount_unlock(mp); +} + + int -vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) - register struct vnode *vp; - int flags; - struct ucred *cred; - struct proc *p; - int slpflag, slptimeo; +mount_iterref(mount_t mp, int locked) { - register struct buf *bp; - struct buf *nbp, *blist; - int s, error = 0; + int retval = 0; - if (flags & V_SAVE) { - if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) { - return (error); - } - if (vp->v_dirtyblkhd.lh_first) - panic("vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)", vp, vp->v_dirtyblkhd.lh_first); + if (!locked) + mount_list_lock(); + if (mp->mnt_iterref < 0) { + retval = 1; + } else { + mp->mnt_iterref++; } + if (!locked) + mount_list_unlock(); + return(retval); +} - for (;;) { - if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA)) - while (blist && blist->b_lblkno < 0) - blist = blist->b_vnbufs.le_next; - if (!blist && (blist = vp->v_dirtyblkhd.lh_first) && - (flags & V_SAVEMETA)) - while (blist && blist->b_lblkno < 0) - blist = blist->b_vnbufs.le_next; - if (!blist) - break; +int +mount_isdrained(mount_t mp, int locked) +{ + int retval; - for (bp = blist; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - if ((flags & V_SAVEMETA) && bp->b_lblkno < 0) - continue; - s = splbio(); - if (ISSET(bp->b_flags, B_BUSY)) { - SET(bp->b_flags, B_WANTED); - error = tsleep((caddr_t)bp, - slpflag | (PRIBIO + 1), "vinvalbuf", - slptimeo); - splx(s); - if (error) { - return (error); - } - break; - } - bremfree(bp); - SET(bp->b_flags, B_BUSY); - splx(s); + if (!locked) + mount_list_lock(); + if (mp->mnt_iterref < 0) + retval = 1; + else + retval = 0; + if (!locked) + mount_list_unlock(); + return(retval); +} + +void +mount_iterdrop(mount_t mp) +{ + mount_list_lock(); + mp->mnt_iterref--; + wakeup(&mp->mnt_iterref); + mount_list_unlock(); +} + +void +mount_iterdrain(mount_t mp) +{ + mount_list_lock(); + while (mp->mnt_iterref) + msleep((caddr_t)&mp->mnt_iterref, mnt_list_mtx_lock, PVFS, "mount_iterdrain", 0 ); + /* mount iterations 
drained */ + mp->mnt_iterref = -1; + mount_list_unlock(); +} +void +mount_iterreset(mount_t mp) +{ + mount_list_lock(); + if (mp->mnt_iterref == -1) + mp->mnt_iterref = 0; + mount_list_unlock(); +} + +/* always called with mount lock held */ +int +mount_refdrain(mount_t mp) +{ + if (mp->mnt_lflag & MNT_LDRAIN) + panic("already in drain"); + mp->mnt_lflag |= MNT_LDRAIN; + + while (mp->mnt_count) + msleep((caddr_t)&mp->mnt_lflag, &mp->mnt_mlock, PVFS, "mount_drain", 0 ); + + if (mp->mnt_vnodelist.tqh_first != NULL) + panic("mount_refdrain: dangling vnode"); + + mp->mnt_lflag &= ~MNT_LDRAIN; + + return(0); +} + + +/* + * Mark a mount point as busy. Used to synchronize access and to delay + * unmounting. + */ +int +vfs_busy(mount_t mp, int flags) +{ + +restart: + if (mp->mnt_lflag & MNT_LDEAD) + return(ENOENT); + + if (mp->mnt_lflag & MNT_LUNMOUNT) { + if (flags & LK_NOWAIT) + return (ENOENT); + + mount_lock(mp); + + if (mp->mnt_lflag & MNT_LDEAD) { + mount_unlock(mp); + return(ENOENT); + } + if (mp->mnt_lflag & MNT_LUNMOUNT) { + mp->mnt_lflag |= MNT_LWAIT; /* - * XXX Since there are no node locks for NFS, I believe - * there is a slight chance that a delayed write will - * occur while sleeping just above, so check for it. + * Since all busy locks are shared except the exclusive + * lock granted when unmounting, the only place that a + * wakeup needs to be done is at the release of the + * exclusive lock at the end of dounmount. */ - if (ISSET(bp->b_flags, B_DELWRI) && (flags & V_SAVE)) { - (void) VOP_BWRITE(bp); - break; - } - - if (bp->b_flags & B_LOCKED) { - panic("vinvalbuf: bp @ 0x%x is locked!", bp); - break; - } else { - SET(bp->b_flags, B_INVAL); - } - brelse(bp); + msleep((caddr_t)mp, &mp->mnt_mlock, (PVFS | PDROP), "vfsbusy", 0 ); + return (ENOENT); } + mount_unlock(mp); + } + + lck_rw_lock_shared(&mp->mnt_rwlock); + + /* + * until we are granted the rwlock, it's possible for the mount point to + * change state, so reevaluate before granting the vfs_busy + */ + if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) { + lck_rw_done(&mp->mnt_rwlock); + goto restart; } - if (!(flags & V_SAVEMETA) && - (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first)) - panic("vinvalbuf: flush failed"); return (0); } +/* + * Free a busy filesystem. + */ + +void +vfs_unbusy(mount_t mp) +{ + lck_rw_done(&mp->mnt_rwlock); +} + + + +static void +vfs_rootmountfailed(mount_t mp) { + + mount_list_lock(); + mp->mnt_vtable->vfc_refcount--; + mount_list_unlock(); + + vfs_unbusy(mp); + + mount_lock_destroy(mp); + + FREE_ZONE(mp, sizeof(struct mount), M_MOUNT); +} + +/* + * Lookup a filesystem type, and if found allocate and initialize + * a mount structure for it. + * + * Devname is usually updated by mount(8) after booting. 
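Editorial note: mount_ref()/mount_drop() above count short-lived references against the mount, and mount_refdrain() (like the mnt_iterref drain) parks in msleep() until the count reaches zero, relying on the final drop to issue the wakeup. Below is a minimal userspace sketch of that drain handshake, with a pthread condition variable standing in for msleep()/wakeup(); every name in it is invented for illustration, not a kernel API.

/*
 * Userspace sketch of the mount_ref/mount_drop/mount_refdrain handshake.
 */
#include <pthread.h>
#include <assert.h>

struct refdrain {
	pthread_mutex_t lock;
	pthread_cond_t cv;
	int count;          /* outstanding short-term references */
	int draining;       /* analogue of MNT_LDRAIN */
};
#define REFDRAIN_INITIALIZER \
	{ PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0 }

void ref_acquire(struct refdrain *r)
{
	pthread_mutex_lock(&r->lock);
	r->count++;
	pthread_mutex_unlock(&r->lock);
}

void ref_release(struct refdrain *r)
{
	pthread_mutex_lock(&r->lock);
	/* the last reference out wakes the drainer, as mount_drop does */
	if (--r->count == 0 && r->draining)
		pthread_cond_signal(&r->cv);
	pthread_mutex_unlock(&r->lock);
}

void ref_drain(struct refdrain *r)
{
	pthread_mutex_lock(&r->lock);
	assert(!r->draining);       /* mirrors the "already in drain" panic */
	r->draining = 1;
	while (r->count != 0)
		pthread_cond_wait(&r->cv, &r->lock);
	r->draining = 0;
	pthread_mutex_unlock(&r->lock);
}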
+ */ +static mount_t +vfs_rootmountalloc_internal(struct vfstable *vfsp, const char *devname) +{ + mount_t mp; + + mp = _MALLOC_ZONE((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + + /* Initialize the default IO constraints */ + mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS; + mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32; + mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt; + mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt; + mp->mnt_devblocksize = DEV_BSIZE; + + mount_lock_init(mp); + (void)vfs_busy(mp, LK_NOWAIT); + + TAILQ_INIT(&mp->mnt_vnodelist); + TAILQ_INIT(&mp->mnt_workerqueue); + TAILQ_INIT(&mp->mnt_newvnodes); + + mp->mnt_vtable = vfsp; + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_flag = MNT_RDONLY | MNT_ROOTFS; + mp->mnt_vnodecovered = NULLVP; + //mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + + mount_list_lock(); + vfsp->vfc_refcount++; + mount_list_unlock(); + + strncpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN); + mp->mnt_vfsstat.f_mntonname[0] = '/'; + (void) copystr((char *)devname, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, 0); + + return (mp); +} + +errno_t +vfs_rootmountalloc(const char *fstypename, const char *devname, mount_t *mpp) +{ + struct vfstable *vfsp; + + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) + return (ENODEV); + + *mpp = vfs_rootmountalloc_internal(vfsp, devname); + + if (*mpp) + return (0); + + return (ENOMEM); +} + + +/* + * Find an appropriate filesystem to use for the root. If a filesystem + * has not been preselected, walk through the list of known filesystems + * trying those that have mountroot routines, and try them until one + * works or we have tried them all. + */ +extern int (*mountroot)(void); + +int +vfs_mountroot() +{ + struct vfstable *vfsp; + struct vfs_context context; + int error; + mount_t mp; + + if (mountroot != NULL) { + /* + * used for netboot which follows a different set of rules + */ + error = (*mountroot)(); + return (error); + } + if ((error = bdevvp(rootdev, &rootvp))) { + printf("vfs_mountroot: can't setup bdevvp\n"); + return (error); + } + context.vc_proc = current_proc(); + context.vc_ucred = kauth_cred_get(); + + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { + if (vfsp->vfc_mountroot == NULL) + continue; + + mp = vfs_rootmountalloc_internal(vfsp, "root_device"); + mp->mnt_devvp = rootvp; + + if ((error = (*vfsp->vfc_mountroot)(mp, rootvp, &context)) == 0) { + mp->mnt_devvp->v_specflags |= SI_MOUNTEDON; + + vfs_unbusy(mp); + + mount_list_add(mp); + + /* + * cache the IO attributes for the underlying physical media... + * an error return indicates the underlying driver doesn't + * support all the queries necessary... however, reasonable + * defaults will have been set, so no reason to bail or care + */ + vfs_init_io_attributes(rootvp, mp); + /* + * get rid of iocount reference returned + * by bdevvp... it will have also taken + * a usecount reference which we want to keep + */ + vnode_put(rootvp); + + return (0); + } + vfs_rootmountfailed(mp); + + if (error != EINVAL) + printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); + } + return (ENODEV); +} + +/* + * Lookup a mount point by filesystem identifier. 
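Editorial note: vfs_mountroot() above walks the vfsconf list and tries each filesystem's mountroot routine until one succeeds, treating EINVAL as the quiet "wrong filesystem" answer and reporting anything else. A minimal sketch of that probe loop, assuming a hypothetical table of probe functions:

/*
 * Probe candidates in order; stop at the first that mounts.
 */
#include <stdio.h>
#include <errno.h>

typedef int (*mountroot_fn)(void);      /* 0 on success, errno otherwise */

static int try_hfs(void) { return EINVAL; }     /* "not my volume" */
static int try_ufs(void) { return 0; }          /* pretend this mounts */

static const struct {
	const char *name;
	mountroot_fn fn;
} probes[] = {
	{ "hfs", try_hfs },
	{ "ufs", try_ufs },
};

int mount_root(void)
{
	int i, error;

	for (i = 0; i < (int)(sizeof(probes) / sizeof(probes[0])); i++) {
		error = probes[i].fn();
		if (error == 0)
			return 0;       /* mounted; stop probing */
		if (error != EINVAL)    /* a real failure is worth reporting */
			fprintf(stderr, "%s_mountroot failed: %d\n",
			    probes[i].name, error);
	}
	return ENODEV;                  /* nothing claimed the root device */
}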
+ */ +extern mount_t vfs_getvfs_locked(fsid_t *); + +struct mount * +vfs_getvfs(fsid) + fsid_t *fsid; +{ + return (mount_list_lookupby_fsid(fsid, 0, 0)); +} + +struct mount * +vfs_getvfs_locked(fsid) + fsid_t *fsid; +{ + return(mount_list_lookupby_fsid(fsid, 1, 0)); +} + +struct mount * +vfs_getvfs_by_mntonname(u_char *path) +{ + mount_t retmp = (mount_t)0; + mount_t mp; + + mount_list_lock(); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + if (!strcmp(mp->mnt_vfsstat.f_mntonname, path)) { + retmp = mp; + goto out; + } + } +out: + mount_list_unlock(); + return (retmp); +} + +/* generation number for creation of new fsids */ +u_short mntid_gen = 0; +/* + * Get a new unique fsid + */ +void +vfs_getnewfsid(mp) + struct mount *mp; +{ + + fsid_t tfsid; + int mtype; + mount_t nmp; + + mount_list_lock(); + + /* generate a new fsid */ + mtype = mp->mnt_vtable->vfc_typenum; + if (++mntid_gen == 0) + mntid_gen++; + tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen); + tfsid.val[1] = mtype; + + TAILQ_FOREACH(nmp, &mountlist, mnt_list) { + while (vfs_getvfs_locked(&tfsid)) { + if (++mntid_gen == 0) + mntid_gen++; + tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen); + } + } + mp->mnt_vfsstat.f_fsid.val[0] = tfsid.val[0]; + mp->mnt_vfsstat.f_fsid.val[1] = tfsid.val[1]; + mount_list_unlock(); +} + +/* + * Routines having to do with the management of the vnode table. + */ +extern int (**dead_vnodeop_p)(void *); +long numvnodes, freevnodes; +long inactivevnodes; + + +/* + * Move a vnode from one mount queue to another. + */ +static void +insmntque(vnode_t vp, mount_t mp) +{ + mount_t lmp; + /* + * Delete from old mount point vnode list, if on one. + */ + if ( (lmp = vp->v_mount) != NULL) { + if ((vp->v_lflag & VNAMED_MOUNT) == 0) + panic("insmntque: vp not in mount vnode list"); + vp->v_lflag &= ~VNAMED_MOUNT; + + mount_lock(lmp); + + mount_drop(lmp, 1); + + if (vp->v_mntvnodes.tqe_next == NULL) { + if (TAILQ_LAST(&lmp->mnt_vnodelist, vnodelst) == vp) + TAILQ_REMOVE(&lmp->mnt_vnodelist, vp, v_mntvnodes); + else if (TAILQ_LAST(&lmp->mnt_newvnodes, vnodelst) == vp) + TAILQ_REMOVE(&lmp->mnt_newvnodes, vp, v_mntvnodes); + else if (TAILQ_LAST(&lmp->mnt_workerqueue, vnodelst) == vp) + TAILQ_REMOVE(&lmp->mnt_workerqueue, vp, v_mntvnodes); + } else { + vp->v_mntvnodes.tqe_next->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_prev; + *vp->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_next; + } + vp->v_mntvnodes.tqe_next = 0; + vp->v_mntvnodes.tqe_prev = 0; + mount_unlock(lmp); + return; + } + + /* + * Insert into list of vnodes for the new mount point, if available. + */ + if ((vp->v_mount = mp) != NULL) { + mount_lock(mp); + if ((vp->v_mntvnodes.tqe_next != 0) && (vp->v_mntvnodes.tqe_prev != 0)) + panic("vp already in mount list"); + if (mp->mnt_lflag & MNT_LITER) + TAILQ_INSERT_HEAD(&mp->mnt_newvnodes, vp, v_mntvnodes); + else + TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); + if (vp->v_lflag & VNAMED_MOUNT) + panic("insmntque: vp already in mount vnode list"); + if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) + panic("insmntque: vp on the free list\n"); + vp->v_lflag |= VNAMED_MOUNT; + mount_ref(mp, 1); + mount_unlock(mp); + } +} + + /* * Create a vnode for a block device. * Used for root filesystem, argdev, and swap areas. * Also used for memory file system special devices. 
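Editorial note: the rewritten insmntque() above keeps each vnode on a per-mount TAILQ and, while an iteration is in progress (MNT_LITER), parks newly attached vnodes on mnt_newvnodes so the walk sees a stable snapshot. A compact sketch of that queue discipline using the same <sys/queue.h> machinery; "node" and "owner" are hypothetical stand-ins for vnode and mount, and the real code's check of the staged and worker queues before removal is elided:

#include <sys/queue.h>
#include <stddef.h>

struct node {
	TAILQ_ENTRY(node) entries;
	struct owner *owner;
};
TAILQ_HEAD(nodelist, node);

struct owner {
	struct nodelist active;    /* analogue of mnt_vnodelist */
	struct nodelist staged;    /* analogue of mnt_newvnodes */
	int iterating;             /* analogue of MNT_LITER     */
};

void node_setowner(struct node *np, struct owner *to)
{
	struct owner *from = np->owner;

	if (from != NULL)
		TAILQ_REMOVE(&from->active, np, entries);
	np->owner = to;
	if (to == NULL)
		return;
	/*
	 * While an iterator walks the active list, park newcomers on the
	 * staged list; the iterator splices them back when it finishes.
	 */
	if (to->iterating)
		TAILQ_INSERT_HEAD(&to->staged, np, entries);
	else
		TAILQ_INSERT_HEAD(&to->active, np, entries);
}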
*/ int -bdevvp(dev, vpp) - dev_t dev; - struct vnode **vpp; +bdevvp(dev_t dev, vnode_t *vpp) { - register struct vnode *vp; - struct vnode *nvp; - int error; + vnode_t nvp; + int error; + struct vnode_fsparam vfsp; + struct vfs_context context; if (dev == NODEV) { *vpp = NULLVP; return (ENODEV); } - error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); - if (error) { + + context.vc_proc = current_proc(); + context.vc_ucred = FSCRED; + + vfsp.vnfs_mp = (struct mount *)0; + vfsp.vnfs_vtype = VBLK; + vfsp.vnfs_str = "bdevvp"; + vfsp.vnfs_dvp = 0; + vfsp.vnfs_fsnode = 0; + vfsp.vnfs_cnp = 0; + vfsp.vnfs_vops = spec_vnodeop_p; + vfsp.vnfs_rdev = dev; + vfsp.vnfs_filesize = 0; + + vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE; + + vfsp.vnfs_marksystem = 0; + vfsp.vnfs_markroot = 0; + + if ( (error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &nvp)) ) { *vpp = NULLVP; return (error); } - vp = nvp; - vp->v_type = VBLK; - if (nvp = checkalias(vp, dev, (struct mount *)0)) { - vput(vp); - vp = nvp; + if ( (error = vnode_ref(nvp)) ) { + panic("bdevvp failed: vnode_ref"); + return (error); } - *vpp = vp; + if ( (error = VNOP_FSYNC(nvp, MNT_WAIT, &context)) ) { + panic("bdevvp failed: fsync"); + return (error); + } + if ( (error = buf_invalidateblks(nvp, BUF_WRITE_DATA, 0, 0)) ) { + panic("bdevvp failed: invalidateblks"); + return (error); + } + if ( (error = VNOP_OPEN(nvp, FREAD, &context)) ) { + panic("bdevvp failed: open"); + return (error); + } + *vpp = nvp; + return (0); } @@ -804,74 +1140,72 @@ bdevvp(dev, vpp) * the existing contents and return the aliased vnode. The * caller is responsible for filling it with its new contents. */ -struct vnode * -checkalias(nvp, nvp_rdev, mp) +static vnode_t +checkalias(nvp, nvp_rdev) register struct vnode *nvp; dev_t nvp_rdev; - struct mount *mp; { - struct proc *p = current_proc(); /* XXX */ struct vnode *vp; struct vnode **vpp; - struct specinfo *specinfop; - - if (nvp->v_type != VBLK && nvp->v_type != VCHR) - return (NULLVP); + int vid = 0; - MALLOC_ZONE(specinfop, struct specinfo *, sizeof(struct specinfo), - M_SPECINFO, M_WAITOK); vpp = &speclisth[SPECHASH(nvp_rdev)]; loop: - simple_lock(&spechash_slock); + SPECHASH_LOCK(); + for (vp = *vpp; vp; vp = vp->v_specnext) { - if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type) - continue; + if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) { + vid = vp->v_id; + break; + } + } + SPECHASH_UNLOCK(); + + if (vp) { + if (vnode_getwithvid(vp,vid)) { + goto loop; + } + /* + * Termination state is checked in vnode_getwithvid + */ + vnode_lock(vp); + /* * Alias, but not in use, so flush it out. 
*/ - simple_lock(&vp->v_interlock); - if (vp->v_usecount == 0) { - simple_unlock(&spechash_slock); - vgonel(vp, p); + if ((vp->v_iocount == 1) && (vp->v_usecount == 0)) { + vnode_reclaim_internal(vp, 1, 0); + vnode_unlock(vp); + vnode_put(vp); goto loop; } - if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) { - simple_unlock(&spechash_slock); - goto loop; - } - break; } if (vp == NULL || vp->v_tag != VT_NON) { - nvp->v_specinfo = specinfop; - specinfop = 0; /* buffer used */ + MALLOC_ZONE(nvp->v_specinfo, struct specinfo *, sizeof(struct specinfo), + M_SPECINFO, M_WAITOK); bzero(nvp->v_specinfo, sizeof(struct specinfo)); nvp->v_rdev = nvp_rdev; + nvp->v_specflags = 0; + nvp->v_speclastr = -1; + + SPECHASH_LOCK(); nvp->v_hashchain = vpp; nvp->v_specnext = *vpp; - nvp->v_specflags = 0; - simple_unlock(&spechash_slock); *vpp = nvp; + SPECHASH_UNLOCK(); + if (vp != NULLVP) { nvp->v_flag |= VALIASED; vp->v_flag |= VALIASED; - vput(vp); + vnode_unlock(vp); + vnode_put(vp); } - /* Since buffer is used just return */ return (NULLVP); } - simple_unlock(&spechash_slock); - VOP_UNLOCK(vp, 0, p); - simple_lock(&vp->v_interlock); - vclean(vp, 0, p); - vp->v_op = nvp->v_op; - vp->v_tag = nvp->v_tag; - nvp->v_type = VNON; - insmntque(vp, mp); - if (specinfop) - FREE_ZONE((void *)specinfop, sizeof(struct specinfo), M_SPECINFO); return (vp); } + /* * Get a reference on a particular vnode and lock it if requested. * If the vnode was on the inactive list, remove it from the list. @@ -882,571 +1216,264 @@ loop: * and an error returned to indicate that the vnode is no longer * usable (possibly having been changed to a new file system type). */ -int -vget(vp, flags, p) - struct vnode *vp; - int flags; - struct proc *p; +static int +vget_internal(vnode_t vp, int vid, int vflags) { int error = 0; u_long vpid; - vpid = vp->v_id; // save off the original v_id - -retry: - - /* - * If the vnode is in the process of being cleaned out for - * another use, we wait for the cleaning to finish and then - * return failure. Cleaning is determined by checking that - * the VXLOCK flag is set. - */ - if ((flags & LK_INTERLOCK) == 0) - simple_lock(&vp->v_interlock); - if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) { - vp->v_flag |= VXWANT; - simple_unlock(&vp->v_interlock); - (void)tsleep((caddr_t)vp, PINOD, "vget", 0); - return (ENOENT); - } - - /* - * vnode is being terminated. 
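Editorial note: the loop in checkalias above (and in check_mountedon and vcount later in this file) depends on vnode_getwithvid(): record v_id while the hash lock is held, drop the lock, then take the vnode only if its id is unchanged, restarting on a mismatch. An illustrative userspace analogue of that generation-number revalidation; everything below is a hypothetical stand-in, not the kernel API:

#include <pthread.h>
#include <stddef.h>

struct obj {
	pthread_mutex_t lock;
	unsigned gen;              /* bumped each time the slot is recycled */
	int busy;                  /* analogue of the vnode iocount */
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj slot = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

/* toy lookup; a real one would walk a hash chain under table_lock */
static struct obj *table_lookup(int key)
{
	return key == 0 ? &slot : NULL;
}

/* take a busy reference iff the generation is still the one we saw */
static int obj_getwithgen(struct obj *o, unsigned gen)
{
	int stale;

	pthread_mutex_lock(&o->lock);
	stale = (o->gen != gen);
	if (!stale)
		o->busy++;
	pthread_mutex_unlock(&o->lock);
	return stale;
}

struct obj *lookup_stable(int key)
{
	struct obj *o;
	unsigned gen;

	for (;;) {
		pthread_mutex_lock(&table_lock);
		o = table_lookup(key);
		gen = o ? o->gen : 0;
		pthread_mutex_unlock(&table_lock);

		if (o == NULL || obj_getwithgen(o, gen) == 0)
			return o;  /* NULL, or referenced and validated */
		/* lost a race with a recycle; search again */
	}
}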
- * wait for vnode_pager_no_senders() to clear VTERMINATE - */ - if (ISSET(vp->v_flag, VTERMINATE)) { - SET(vp->v_flag, VTERMWANT); - simple_unlock(&vp->v_interlock); - (void)tsleep((caddr_t)&vp->v_ubcinfo, PINOD, "vget1", 0); - return (ENOENT); - } - - /* - * if the vnode is being initialized, - * wait for it to finish initialization - */ - if (ISSET(vp->v_flag, VUINIT)) { - SET(vp->v_flag, VUWANT); - simple_unlock(&vp->v_interlock); - (void) tsleep((caddr_t)vp, PINOD, "vget2", 0); - goto retry; - } - - simple_lock(&vnode_free_list_slock); - if (VONLIST(vp)) { - if (vp->v_usecount == 0) - VREMFREE("vget", vp); - else if (ISSET((vp)->v_flag, VUINACTIVE)) - VREMINACTIVE("vget", vp); - } - simple_unlock(&vnode_free_list_slock); - - if (++vp->v_usecount <= 0) - panic("vget: v_usecount"); - - /* - * Recover named reference as needed - */ - if (UBCISVALID(vp) && !UBCINFOMISSING(vp) && !ubc_issetflags(vp, UI_HASOBJREF)) { - simple_unlock(&vp->v_interlock); - if (ubc_getobject(vp, UBC_HOLDOBJECT) == MEMORY_OBJECT_CONTROL_NULL) { - error = ENOENT; - goto errout; - } - simple_lock(&vp->v_interlock); - } - - if (flags & LK_TYPE_MASK) { - if (error = vn_lock(vp, flags | LK_INTERLOCK, p)) - goto errout; - if (vpid != vp->v_id) { // make sure it's still the same vnode - vput(vp); - return ENOENT; - } - return (0); - } - - if ((flags & LK_INTERLOCK) == 0) - simple_unlock(&vp->v_interlock); + vnode_lock(vp); - if (vpid != vp->v_id) { // make sure it's still the same vnode - vrele(vp); - return ENOENT; - } - - return (0); + if (vflags & VNODE_WITHID) + vpid = vid; + else + vpid = vp->v_id; // save off the original v_id -errout: - simple_lock(&vp->v_interlock); + if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == 0)) + /* + * vnode to be returned only if it has writers opened + */ + error = EINVAL; + else + error = vnode_getiocount(vp, 1, vpid, vflags); - /* - * we may have blocked. Re-evaluate the state - */ - simple_lock(&vnode_free_list_slock); - if (VONLIST(vp)) { - if (vp->v_usecount == 0) - VREMFREE("vget", vp); - else if (ISSET((vp)->v_flag, VUINACTIVE)) - VREMINACTIVE("vget", vp); - } - simple_unlock(&vnode_free_list_slock); + vnode_unlock(vp); - /* - * If the vnode was not active in the first place - * must not call vrele() as VOP_INACTIVE() is not - * required. - * So inlined part of vrele() here. - */ - if (--vp->v_usecount == 1) { - if (UBCINFOEXISTS(vp)) { - vinactive(vp); - simple_unlock(&vp->v_interlock); - return (error); - } - } - if (vp->v_usecount > 0) { - simple_unlock(&vp->v_interlock); - return (error); - } - if (vp->v_usecount < 0) - panic("vget: negative usecount (%d)", vp->v_usecount); - vfree(vp); - simple_unlock(&vp->v_interlock); return (error); } -/* - * Get a pager reference on the particular vnode. - * - * This is called from ubc_info_init() and it is asumed that - * the vnode is not on the free list. - * It is also assumed that the vnode is neither being recycled - * by vgonel nor being terminated by vnode_pager_vrele(). - * - * The vnode interlock is NOT held by the caller. 
- */ -__private_extern__ int -vnode_pager_vget(vp) - struct vnode *vp; -{ - simple_lock(&vp->v_interlock); - - UBCINFOCHECK("vnode_pager_vget", vp); - - if (ISSET(vp->v_flag, (VXLOCK|VORECLAIM|VTERMINATE))) - panic("%s: dying vnode", "vnode_pager_vget"); - - simple_lock(&vnode_free_list_slock); - /* The vnode should not be on free list */ - if (VONLIST(vp)) { - if (vp->v_usecount == 0) - panic("%s: still on list", "vnode_pager_vget"); - else if (ISSET((vp)->v_flag, VUINACTIVE)) - VREMINACTIVE("vnode_pager_vget", vp); - } - - /* The vnode should not be on the inactive list here */ - simple_unlock(&vnode_free_list_slock); - - /* After all those checks, now do the real work :-) */ - if (++vp->v_usecount <= 0) - panic("vnode_pager_vget: v_usecount"); - simple_unlock(&vp->v_interlock); - - return (0); -} - -/* - * Stubs to use when there is no locking to be done on the underlying object. - * A minimal shared lock is necessary to ensure that the underlying object - * is not revoked while an operation is in progress. So, an active shared - * count is maintained in an auxillary vnode lock structure. - */ -int -vop_nolock(ap) - struct vop_lock_args /* { - struct vnode *a_vp; - int a_flags; - struct proc *a_p; - } */ *ap; -{ -#ifdef notyet - /* - * This code cannot be used until all the non-locking filesystems - * (notably NFS) are converted to properly lock and release nodes. - * Also, certain vnode operations change the locking state within - * the operation (create, mknod, remove, link, rename, mkdir, rmdir, - * and symlink). Ideally these operations should not change the - * lock state, but should be changed to let the caller of the - * function unlock them. Otherwise all intermediate vnode layers - * (such as union, umapfs, etc) must catch these functions to do - * the necessary locking at their layer. Note that the inactive - * and lookup operations also change their lock state, but this - * cannot be avoided, so these two operations will always need - * to be handled in intermediate layers. - */ - struct vnode *vp = ap->a_vp; - int vnflags, flags = ap->a_flags; - - if (vp->v_vnlock == NULL) { - if ((flags & LK_TYPE_MASK) == LK_DRAIN) - return (0); - MALLOC(vp->v_vnlock, struct lock__bsd__ *, - sizeof(struct lock__bsd__), M_TEMP, M_WAITOK); - lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0); - } - switch (flags & LK_TYPE_MASK) { - case LK_DRAIN: - vnflags = LK_DRAIN; - break; - case LK_EXCLUSIVE: - case LK_SHARED: - vnflags = LK_SHARED; - break; - case LK_UPGRADE: - case LK_EXCLUPGRADE: - case LK_DOWNGRADE: - return (0); - case LK_RELEASE: - default: - panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK); - } - if (flags & LK_INTERLOCK) - vnflags |= LK_INTERLOCK; - return(lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p)); -#else /* for now */ - /* - * Since we are not using the lock manager, we must clear - * the interlock here. - */ - if (ap->a_flags & LK_INTERLOCK) - simple_unlock(&ap->a_vp->v_interlock); - return (0); -#endif -} - -/* - * Decrement the active use count. - */ int -vop_nounlock(ap) - struct vop_unlock_args /* { - struct vnode *a_vp; - int a_flags; - struct proc *a_p; - } */ *ap; +vnode_ref(vnode_t vp) { - struct vnode *vp = ap->a_vp; - if (vp->v_vnlock == NULL) - return (0); - return (lockmgr(vp->v_vnlock, LK_RELEASE, NULL, ap->a_p)); + return (vnode_ref_ext(vp, 0)); } -/* - * Return whether or not the node is in use. 
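Editorial note: vnode_ref_ext above participates in a split reference scheme, with a short-term count held across each operation (v_iocount) and a long-term count for opens and similar holds (v_usecount, plus v_kusecount for O_EVTONLY). A minimal sketch of the split, assuming a hypothetical "file" type; the retain check mirrors the invariant the comment in vnode_ref_ext says it would like to assert unconditionally:

#include <pthread.h>
#include <assert.h>

struct file {
	pthread_mutex_t lock;
	int iocount;    /* short-term: bracketed around one operation */
	int usecount;   /* long-term: an open file, a mounted-on node */
};
#define FILE_INITIALIZER { PTHREAD_MUTEX_INITIALIZER, 0, 0 }

void file_op_begin(struct file *f)
{
	pthread_mutex_lock(&f->lock);
	f->iocount++;
	pthread_mutex_unlock(&f->lock);
}

void file_op_end(struct file *f)
{
	pthread_mutex_lock(&f->lock);
	assert(f->iocount > 0);
	f->iocount--;
	pthread_mutex_unlock(&f->lock);
}

/* a long-term reference is only valid if some reference is already held */
int file_retain(struct file *f)
{
	int error = 0;

	pthread_mutex_lock(&f->lock);
	if (f->iocount <= 0 && f->usecount <= 0)
		error = -1;     /* caller holds no valid reference */
	else
		f->usecount++;
	pthread_mutex_unlock(&f->lock);
	return error;
}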
- */ int -vop_noislocked(ap) - struct vop_islocked_args /* { - struct vnode *a_vp; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - - if (vp->v_vnlock == NULL) - return (0); - return (lockstatus(vp->v_vnlock)); -} - -/* - * Vnode reference. - */ -void -vref(vp) - struct vnode *vp; +vnode_ref_ext(vnode_t vp, int fmode) { + int error = 0; - simple_lock(&vp->v_interlock); - if (vp->v_usecount <= 0) - panic("vref used where vget required"); + vnode_lock(vp); - /* If on the inactive list, remove it from there */ - simple_lock(&vnode_free_list_slock); - if (ISSET((vp)->v_flag, VUINACTIVE)) - VREMINACTIVE("vref", vp); - simple_unlock(&vnode_free_list_slock); + /* + * once all the current call sites have been fixed to insure they have + * taken an iocount, we can toughen this assert up and insist that the + * iocount is non-zero... a non-zero usecount doesn't insure correctness + */ + if (vp->v_iocount <= 0 && vp->v_usecount <= 0) + panic("vnode_ref_ext: vp %x has no valid reference %d, %d", vp, vp->v_iocount, vp->v_usecount); - if (++vp->v_usecount <= 0) - panic("vref v_usecount"); - simple_unlock(&vp->v_interlock); -} + /* + * if you are the owner of drain/termination, can acquire usecount + */ + if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) { + if (vp->v_owner != current_thread()) { + error = ENOENT; + goto out; + } + } + vp->v_usecount++; -static void -clean_up_name_parent_ptrs(struct vnode *vp) -{ - if (VNAME(vp) || VPARENT(vp)) { - char *tmp1; - struct vnode *tmp2; - - // do it this way so we don't block before clearing - // these fields. - tmp1 = VNAME(vp); - tmp2 = VPARENT(vp); - VNAME(vp) = NULL; - VPARENT(vp) = NULL; - - if (tmp1) { - remove_name(tmp1); + if (fmode & FWRITE) { + if (++vp->v_writecount <= 0) + panic("vnode_ref_ext: v_writecount"); } - - if (tmp2) { - vrele(tmp2); + if (fmode & O_EVTONLY) { + if (++vp->v_kusecount <= 0) + panic("vnode_ref_ext: v_kusecount"); } - } +out: + vnode_unlock(vp); + + return (error); } /* * put the vnode on appropriate free list. - * called with v_interlock held. + * called with vnode LOCKED */ static void -vfree(vp) - struct vnode *vp; +vnode_list_add(vnode_t vp) { - funnel_t *curflock; - extern int disable_funnel; - - if ((curflock = thread_funnel_get()) != kernel_flock && - !(disable_funnel && curflock != THR_FUNNEL_NULL)) - panic("Entering vfree() without kernel funnel"); /* - * if the vnode is not obtained by calling getnewvnode() we - * are not responsible for the cleanup. Just return. 
+ * if it is already on a list or non-zero references return */ - if (!(vp->v_flag & VSTANDARD)) { + if (VONLIST(vp) || (vp->v_usecount != 0) || (vp->v_iocount != 0)) return; - } - - if (vp->v_usecount != 0) - panic("vfree: v_usecount"); - - /* insert at tail of LRU list or at head if VAGE is set */ - simple_lock(&vnode_free_list_slock); + vnode_list_lock(); - // make sure the name & parent pointers get cleared out -// clean_up_name_parent_ptrs(vp); - - if (VONLIST(vp)) - panic("%s: vnode still on list", "vfree"); - - if (vp->v_flag & VAGE) { - TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + /* + * insert at tail of LRU list or at head if VAGE or VL_DEAD is set + */ + if ((vp->v_flag & VAGE) || (vp->v_lflag & VL_DEAD)) { + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); vp->v_flag &= ~VAGE; - } else - TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + } else { + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + } freevnodes++; - simple_unlock(&vnode_free_list_slock); - return; + + vnode_list_unlock(); } /* - * put the vnode on the inactive list. - * called with v_interlock held + * remove the vnode from appropriate free list. */ static void -vinactive(vp) - struct vnode *vp; +vnode_list_remove(vnode_t vp) { - funnel_t *curflock; - extern int disable_funnel; - - if ((curflock = thread_funnel_get()) != kernel_flock && - !(disable_funnel && curflock != THR_FUNNEL_NULL)) - panic("Entering vinactive() without kernel funnel"); - - if (!UBCINFOEXISTS(vp)) - panic("vinactive: not a UBC vnode"); - - if (vp->v_usecount != 1) - panic("vinactive: v_usecount"); - - simple_lock(&vnode_free_list_slock); - - if (VONLIST(vp)) - panic("%s: vnode still on list", "vinactive"); - VINACTIVECHECK("vinactive", vp, 0); - - TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_freelist); - SET(vp->v_flag, VUINACTIVE); - CLR(vp->v_flag, (VNOCACHE_DATA | VRAOFF)); - - inactivevnodes++; - simple_unlock(&vnode_free_list_slock); - return; + /* + * we want to avoid taking the list lock + * in the case where we're not on the free + * list... this will be true for most + * directories and any currently in use files + * + * we're guaranteed that we can't go from + * the not-on-list state to the on-list + * state since we hold the vnode lock... + * all calls to vnode_list_add are done + * under the vnode lock... so we can + * check for that condition (the prevalent one) + * without taking the list lock + */ + if (VONLIST(vp)) { + vnode_list_lock(); + /* + * however, we're not guaranteed that + * we won't go from the on-list state + * to the non-on-list state until we + * hold the vnode_list_lock... this + * is due to new_vnode removing vnodes + * from the free list under the list_lock + * w/o the vnode lock...
so we need to + * check again whether we're currently + * on the free list + */ + if (VONLIST(vp)) { + VREMFREE("vnode_list_remove", vp); + VLISTNONE(vp); + } + vnode_list_unlock(); + } } -/* - * vput(), just unlock and vrele() - */ void -vput(vp) - struct vnode *vp; +vnode_rele(vnode_t vp) { - struct proc *p = current_proc(); /* XXX */ + vnode_rele_internal(vp, 0, 0, 0); +} - simple_lock(&vp->v_interlock); - if (--vp->v_usecount == 1) { - if (UBCINFOEXISTS(vp)) { - vinactive(vp); - simple_unlock(&vp->v_interlock); - VOP_UNLOCK(vp, 0, p); - return; - } - } - if (vp->v_usecount > 0) { - simple_unlock(&vp->v_interlock); - VOP_UNLOCK(vp, 0, p); - return; - } -#if DIAGNOSTIC - if (vp->v_usecount < 0 || vp->v_writecount != 0) { - vprint("vput: bad ref count", vp); - panic("vput: v_usecount = %d, v_writecount = %d", - vp->v_usecount, vp->v_writecount); - } -#endif - simple_lock(&vnode_free_list_slock); - if (ISSET((vp)->v_flag, VUINACTIVE)) - VREMINACTIVE("vref", vp); - simple_unlock(&vnode_free_list_slock); - simple_unlock(&vp->v_interlock); - VOP_INACTIVE(vp, p); - /* - * The interlock is not held and - * VOP_INCATIVE releases the vnode lock. - * We could block and the vnode might get reactivated - * Can not just call vfree without checking the state - */ - simple_lock(&vp->v_interlock); - if (!VONLIST(vp)) { - if (vp->v_usecount == 0) - vfree(vp); - else if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp)) - vinactive(vp); - } - simple_unlock(&vp->v_interlock); +void +vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter) +{ + vnode_rele_internal(vp, fmode, dont_reenter, 0); } -/* - * Vnode release. - * If count drops to zero, call inactive routine and return to freelist. - */ + void -vrele(vp) - struct vnode *vp; +vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked) { - struct proc *p = current_proc(); /* XXX */ - funnel_t *curflock; - extern int disable_funnel; + struct vfs_context context; - if ((curflock = thread_funnel_get()) != kernel_flock && - !(disable_funnel && curflock != THR_FUNNEL_NULL)) - panic("Entering vrele() without kernel funnel"); + if ( !locked) + vnode_lock(vp); - simple_lock(&vp->v_interlock); - if (--vp->v_usecount == 1) { - if (UBCINFOEXISTS(vp)) { - if ((vp->v_flag & VXLOCK) == 0) - vinactive(vp); - simple_unlock(&vp->v_interlock); - return; - } - } - if (vp->v_usecount > 0) { - simple_unlock(&vp->v_interlock); - return; + if (--vp->v_usecount < 0) + panic("vnode_rele_ext: vp %x usecount -ve : %d", vp, vp->v_usecount); + + if (fmode & FWRITE) { + if (--vp->v_writecount < 0) + panic("vnode_rele_ext: vp %x writecount -ve : %d", vp, vp->v_writecount); } -#if DIAGNOSTIC - if (vp->v_usecount < 0 || vp->v_writecount != 0) { - vprint("vrele: bad ref count", vp); - panic("vrele: ref cnt"); + if (fmode & O_EVTONLY) { + if (--vp->v_kusecount < 0) + panic("vnode_rele_ext: vp %x kusecount -ve : %d", vp, vp->v_kusecount); } -#endif - - if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) { - /* vnode is being cleaned, just return */ - vfree(vp); - simple_unlock(&vp->v_interlock); + if ((vp->v_iocount > 0) || (vp->v_usecount > 0)) { + /* + * vnode is still busy... 
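Editorial note: vnode_list_remove above is a double-checked test. The per-object lock (held by the caller) forbids the off-list to on-list transition, so a negative unlocked check is final; a positive one must be rechecked under the list lock because a scavenger may remove entries holding only that lock. A userspace analogue follows; the kernel relies on its locking rules to make the unlocked read safe, where strictly portable C would want an atomic, and all names are hypothetical:

#include <pthread.h>
#include <sys/queue.h>

struct item {
	TAILQ_ENTRY(item) link;
	int onlist;                /* written only under list_lock */
};
static TAILQ_HEAD(itemlist, item) freelist = TAILQ_HEAD_INITIALIZER(freelist);
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* caller holds the per-item lock, so the item cannot be added meanwhile */
void item_list_remove(struct item *ip)
{
	if (!ip->onlist)     /* cheap check; usually true for busy items */
		return;
	pthread_mutex_lock(&list_lock);
	if (ip->onlist) {    /* recheck: a remover may have raced us */
		TAILQ_REMOVE(&freelist, ip, link);
		ip->onlist = 0;
	}
	pthread_mutex_unlock(&list_lock);
}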
if we're the last + * usecount, mark for a future call to VNOP_INACTIVE + * when the iocount finally drops to 0 + */ + if (vp->v_usecount == 0) { + vp->v_lflag |= VL_NEEDINACTIVE; + vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF); + } + if ( !locked) + vnode_unlock(vp); return; } - - if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) { - VOP_INACTIVE(vp, p); - /* - * vn_lock releases the interlock and - * VOP_INCATIVE releases the vnode lock. - * We could block and the vnode might get reactivated - * Can not just call vfree without checking the state + vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF); + + if ( (vp->v_lflag & (VL_TERMINATE | VL_DEAD)) || dont_reenter) { + /* + * vnode is being cleaned, or + * we've requested that we don't reenter + * the filesystem on this release... in + * this case, we'll mark the vnode aged + * if it's been marked for termination */ - simple_lock(&vp->v_interlock); - if (!VONLIST(vp)) { - if (vp->v_usecount == 0) - vfree(vp); - else if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp)) - vinactive(vp); + if (dont_reenter) { + if ( !(vp->v_lflag & (VL_TERMINATE | VL_DEAD | VL_MARKTERM)) ) + vp->v_lflag |= VL_NEEDINACTIVE; + vp->v_flag |= VAGE; } - simple_unlock(&vp->v_interlock); - } -#if 0 - else { - vfree(vp); - simple_unlock(&vp->v_interlock); - kprintf("vrele: vn_lock() failed for vp = 0x%08x\n", vp); + vnode_list_add(vp); + if ( !locked) + vnode_unlock(vp); + return; } + /* + * at this point both the iocount and usecount + * are zero + * pick up an iocount so that we can call + * VNOP_INACTIVE with the vnode lock unheld + */ + vp->v_iocount++; +#ifdef JOE_DEBUG + record_vp(vp, 1); #endif -} - -void -vagevp(vp) - struct vnode *vp; -{ - simple_lock(&vp->v_interlock); - vp->v_flag |= VAGE; - simple_unlock(&vp->v_interlock); - return; -} - -/* - * Page or buffer structure gets a reference. - */ -void -vhold(vp) - register struct vnode *vp; -{ + vp->v_lflag &= ~VL_NEEDINACTIVE; + vnode_unlock(vp); - simple_lock(&vp->v_interlock); - vp->v_holdcnt++; - simple_unlock(&vp->v_interlock); -} + context.vc_proc = current_proc(); + context.vc_ucred = kauth_cred_get(); + VNOP_INACTIVE(vp, &context); -/* - * Page or buffer structure frees a reference. - */ -void -holdrele(vp) - register struct vnode *vp; -{ + vnode_lock(vp); + /* + * because we dropped the vnode lock to call VNOP_INACTIVE + * the state of the vnode may have changed... we may have + * picked up an iocount, usecount or the MARKTERM may have + * been set... we need to reevaluate the reference counts + * to determine if we can call vnode_reclaim_internal at + * this point... 
if the reference counts are up, we'll pick + * up the MARKTERM state when they get subsequently dropped + */ + if ( (vp->v_iocount == 1) && (vp->v_usecount == 0) && + ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)) { + struct uthread *ut; - simple_lock(&vp->v_interlock); - if (vp->v_holdcnt <= 0) - panic("holdrele: holdcnt"); - vp->v_holdcnt--; - simple_unlock(&vp->v_interlock); + ut = get_bsdthread_info(current_thread()); + + if (ut->uu_defer_reclaims) { + vp->v_defer_reclaimlist = ut->uu_vreclaims; + ut->uu_vreclaims = vp; + goto defer_reclaim; + } + vnode_reclaim_internal(vp, 1, 0); + } + vnode_dropiocount(vp, 1); + vnode_list_add(vp); +defer_reclaim: + if ( !locked) + vnode_unlock(vp); + return; } /* @@ -1471,34 +1498,79 @@ vflush(mp, skipvp, flags) int flags; { struct proc *p = current_proc(); - struct vnode *vp, *nvp; + struct vnode *vp; int busy = 0; + int reclaimed = 0; + int vid, retval; - simple_lock(&mntvnode_slock); + mount_lock(mp); + vnode_iterate_setup(mp); + /* + * On regular unmounts(not forced) do a + * quick check for vnodes to be in use. This + * preserves the caching of vnodes. automounter + * tries unmounting every so often to see whether + * it is still busy or not. + */ + if ((flags & FORCECLOSE)==0) { + if (vnode_umount_preflight(mp, skipvp, flags)) { + vnode_iterate_clear(mp); + mount_unlock(mp); + return(EBUSY); + } + } loop: - for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { - if (vp->v_mount != mp) - goto loop; - nvp = vp->v_mntvnodes.le_next; + /* it is returns 0 then there is nothing to do */ + retval = vnode_iterate_prepare(mp); + + if (retval == 0) { + vnode_iterate_clear(mp); + mount_unlock(mp); + return(retval); + } + + /* iterate over all the vnodes */ + while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) { + vp = TAILQ_FIRST(&mp->mnt_workerqueue); + TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes); + TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes); + if ( (vp->v_mount != mp) || (vp == skipvp)) { + continue; + } + vid = vp->v_id; + mount_unlock(mp); + vnode_lock(vp); + + if ((vp->v_id != vid) || ((vp->v_lflag & (VL_DEAD | VL_TERMINATE)))) { + vnode_unlock(vp); + mount_lock(mp); + continue; + } + /* - * Skip over a selected vnode. - */ - if (vp == skipvp) + * If requested, skip over vnodes marked VSYSTEM. + * Skip over all vnodes marked VNOFLUSH. + */ + if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || + (vp->v_flag & VNOFLUSH))) { + vnode_unlock(vp); + mount_lock(mp); continue; - - simple_lock(&vp->v_interlock); + } /* - * Skip over a vnodes marked VSYSTEM or VNOFLUSH. + * If requested, skip over vnodes marked VSWAP. */ - if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || (vp->v_flag & VNOFLUSH))) { - simple_unlock(&vp->v_interlock); + if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) { + vnode_unlock(vp); + mount_lock(mp); continue; } /* - * Skip over a vnodes marked VSWAP. + * If requested, skip over vnodes marked VSWAP. */ - if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) { - simple_unlock(&vp->v_interlock); + if ((flags & SKIPROOT) && (vp->v_flag & VROOT)) { + vnode_unlock(vp); + mount_lock(mp); continue; } /* @@ -1507,17 +1579,27 @@ loop: */ if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) { - simple_unlock(&vp->v_interlock); + vnode_unlock(vp); + mount_lock(mp); continue; } /* - * With v_usecount == 0, all we need to do is clear + * If the real usecount is 0, all we need to do is clear * out the vnode data structures and we are done. 
*/ - if (vp->v_usecount == 0) { - simple_unlock(&mntvnode_slock); - vgonel(vp, p); - simple_lock(&mntvnode_slock); + if (((vp->v_usecount == 0) || + ((vp->v_usecount - vp->v_kusecount) == 0))) { + vp->v_iocount++; /* so that drain waits for * other iocounts */ +#ifdef JOE_DEBUG + record_vp(vp, 1); +#endif + vnode_reclaim_internal(vp, 1, 0); + vnode_dropiocount(vp, 1); + vnode_list_add(vp); + + vnode_unlock(vp); + reclaimed++; + mount_lock(mp); continue; } /* @@ -1526,51 +1608,73 @@ loop: * anonymous device. For all other files, just kill them. */ if (flags & FORCECLOSE) { - simple_unlock(&mntvnode_slock); if (vp->v_type != VBLK && vp->v_type != VCHR) { - vgonel(vp, p); + vp->v_iocount++; /* so that drain waits * for other iocounts */ +#ifdef JOE_DEBUG + record_vp(vp, 1); +#endif + vnode_reclaim_internal(vp, 1, 0); + vnode_dropiocount(vp, 1); + vnode_list_add(vp); + vnode_unlock(vp); } else { vclean(vp, 0, p); + vp->v_mount = 0; /*override any dead_mountp */ + vp->v_lflag &= ~VL_DEAD; vp->v_op = spec_vnodeop_p; insmntque(vp, (struct mount *)0); + vnode_unlock(vp); } - simple_lock(&mntvnode_slock); + mount_lock(mp); continue; } #if DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif - simple_unlock(&vp->v_interlock); + vnode_unlock(vp); + mount_lock(mp); busy++; } - simple_unlock(&mntvnode_slock); + + /* At this point the worker queue is completed */ + if (busy && ((flags & FORCECLOSE)==0) && reclaimed) { + busy = 0; + reclaimed = 0; + (void)vnode_iterate_reloadq(mp); + /* returned with mount lock held */ + goto loop; + } + + /* if new vnodes were created in between retry the reclaim */ + if ( vnode_iterate_reloadq(mp) != 0) { + if (!(busy && ((flags & FORCECLOSE)==0))) + goto loop; + } + vnode_iterate_clear(mp); + mount_unlock(mp); + if (busy && ((flags & FORCECLOSE)==0)) return (EBUSY); return (0); } +int num_recycledvnodes=0; /* * Disassociate the underlying file system from a vnode. - * The vnode interlock is held on entry. + * The vnode lock is held on entry. */ static void -vclean(vp, flags, p) - struct vnode *vp; - int flags; - struct proc *p; +vclean(vnode_t vp, int flags, proc_t p) { + struct vfs_context context; int active; - int didhold; + int need_inactive; + int already_terminating; + kauth_cred_t ucred = NULL; - /* - * if the vnode is not obtained by calling getnewvnode() we - * are not responsible for the cleanup. Just return. - */ - if (!(vp->v_flag & VSTANDARD)) { - simple_unlock(&vp->v_interlock); - return; - } + context.vc_proc = p; + context.vc_ucred = kauth_cred_get(); /* * Check to see if the vnode is in use. @@ -1578,124 +1682,103 @@ vclean(vp, flags, p) * so that its count cannot fall to zero and generate a * race against ourselves to recycle it. */ - if (active = vp->v_usecount) { - /* - * active vnode can not be on the free list. - * we are about to take an extra reference on this vnode - * do the queue management as needed - * Not doing so can cause "still on list" or - * "vnreclaim: v_usecount" panic if VOP_LOCK() blocks. - */ - simple_lock(&vnode_free_list_slock); - if (ISSET((vp)->v_flag, VUINACTIVE)) - VREMINACTIVE("vclean", vp); - simple_unlock(&vnode_free_list_slock); + active = vp->v_usecount; - if (++vp->v_usecount <= 0) - panic("vclean: v_usecount"); - } + /* + * just in case we missed sending a needed + * VNOP_INACTIVE, we'll do it now + */ + need_inactive = (vp->v_lflag & VL_NEEDINACTIVE); + + vp->v_lflag &= ~VL_NEEDINACTIVE; /* * Prevent the vnode from being recycled or * brought into use while we clean it out. 
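Editorial note: stripped of the vnode specifics, vflush above follows a simple shape, shown in the sketch below: cheap-fail the preflight on ordinary unmounts, then reclaim what is idle and retry for as long as progress is being made, since reclaiming one vnode can release references that were holding others busy. any_busy() and reclaim_idle() here are hypothetical:

#include <errno.h>

extern int any_busy(void);      /* quick scan: is anything still in use? */
extern int reclaim_idle(void);  /* reclaim idle objects, return how many */

int flush_all(int force)
{
	if (!force && any_busy())
		return EBUSY;           /* don't disturb the cache */

	for (;;) {
		int reclaimed = reclaim_idle();
		int busy = !force && any_busy();

		if (busy && reclaimed > 0)
			continue;       /* made progress: try another pass */
		return busy ? EBUSY : 0;
	}
}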
*/ - if (vp->v_flag & VXLOCK) - panic("vclean: deadlock"); - vp->v_flag |= VXLOCK; + already_terminating = (vp->v_lflag & VL_TERMINATE); + + vp->v_lflag |= VL_TERMINATE; /* - * Even if the count is zero, the VOP_INACTIVE routine may still - * have the object locked while it cleans it out. The VOP_LOCK - * ensures that the VOP_INACTIVE routine is done with its work. - * For active vnodes, it ensures that no other activity can - * occur while the underlying object is being cleaned out. + * remove the vnode from any mount list + * it might be on... */ - VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p); + insmntque(vp, (struct mount *)0); + ucred = vp->v_cred; + vp->v_cred = NULL; + + vnode_unlock(vp); + + if (ucred) + kauth_cred_rele(ucred); + + OSAddAtomic(1, &num_recycledvnodes); /* - * While blocked in VOP_LOCK() someone could have dropped - * reference[s] and we could land on the inactive list. - * if this vnode is on the inactive list - * take it off the list. + * purge from the name cache as early as possible... */ - simple_lock(&vnode_free_list_slock); - if (ISSET((vp)->v_flag, VUINACTIVE)) - VREMINACTIVE("vclean", vp); - simple_unlock(&vnode_free_list_slock); + cache_purge(vp); - /* Clean the pages in VM. */ if (active && (flags & DOCLOSE)) - VOP_CLOSE(vp, IO_NDELAY, NOCRED, p); - - /* Clean the pages in VM. */ - didhold = ubc_hold(vp); - if ((active) && (didhold)) - (void)ubc_clean(vp, 0); /* do not invalidate */ + VNOP_CLOSE(vp, IO_NDELAY, &context); /* * Clean out any buffers associated with the vnode. */ if (flags & DOCLOSE) { +#if NFSCLIENT if (vp->v_tag == VT_NFS) nfs_vinvalbuf(vp, V_SAVE, NOCRED, p, 0); else - vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0); +#endif + { + VNOP_FSYNC(vp, MNT_WAIT, &context); + buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0); + } + if (UBCINFOEXISTS(vp)) + /* + * Clean the pages in VM. + */ + (void)ubc_sync_range(vp, (off_t)0, ubc_getsize(vp), UBC_PUSHALL); } + if (UBCINFOEXISTS(vp)) + cluster_release(vp->v_ubcinfo); - if (active) - VOP_INACTIVE(vp, p); - else - VOP_UNLOCK(vp, 0, p); + if (active || need_inactive) + VNOP_INACTIVE(vp, &context); /* Destroy ubc named reference */ - if (didhold) { - ubc_rele(vp); - ubc_destroy_named(vp); - } - /* - * Make sure vp isn't on the inactive list. - */ - simple_lock(&vnode_free_list_slock); - if (ISSET((vp)->v_flag, VUINACTIVE)) { - VREMINACTIVE("vclean", vp); - } - simple_unlock(&vnode_free_list_slock); + ubc_destroy_named(vp); /* * Reclaim the vnode. */ - if (VOP_RECLAIM(vp, p)) + if (VNOP_RECLAIM(vp, &context)) panic("vclean: cannot reclaim"); // make sure the name & parent ptrs get cleaned out! - clean_up_name_parent_ptrs(vp); + vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME); - cache_purge(vp); - if (vp->v_vnlock) { - struct lock__bsd__ *tmp = vp->v_vnlock; - if ((tmp->lk_flags & LK_DRAINED) == 0) - vprint("vclean: lock not drained", vp); - vp->v_vnlock = NULL; - FREE(tmp, M_TEMP); - } + vnode_lock(vp); - /* It's dead, Jim! */ + vp->v_mount = dead_mountp; vp->v_op = dead_vnodeop_p; vp->v_tag = VT_NON; + vp->v_data = NULL; - insmntque(vp, (struct mount *)0); + vp->v_lflag |= VL_DEAD; - /* - * Done with purge, notify sleepers of the grim news. - */ - vp->v_flag &= ~VXLOCK; - if (vp->v_flag & VXWANT) { - vp->v_flag &= ~VXWANT; - wakeup((caddr_t)vp); + if (already_terminating == 0) { + vp->v_lflag &= ~VL_TERMINATE; + /* + * Done with purge, notify sleepers of the grim news. 
+ */ + if (vp->v_lflag & VL_TERMWANT) { + vp->v_lflag &= ~VL_TERMWANT; + wakeup(&vp->v_lflag); + } } - - if (active) - vrele(vp); } /* @@ -1703,64 +1786,53 @@ vclean(vp, flags, p) * and with all vnodes aliased to the requested vnode. */ int -vop_revoke(ap) - struct vop_revoke_args /* { - struct vnode *a_vp; - int a_flags; - } */ *ap; +vn_revoke(vnode_t vp, int flags, __unused vfs_context_t a_context) { - struct vnode *vp, *vq; - struct proc *p = current_proc(); + struct vnode *vq; + int vid; #if DIAGNOSTIC - if ((ap->a_flags & REVOKEALL) == 0) - panic("vop_revoke"); + if ((flags & REVOKEALL) == 0) + panic("vnop_revoke"); #endif - vp = ap->a_vp; - simple_lock(&vp->v_interlock); - if (vp->v_flag & VALIASED) { /* * If a vgone (or vclean) is already in progress, * wait until it is done and return. */ - if (vp->v_flag & VXLOCK) { - while (vp->v_flag & VXLOCK) { - vp->v_flag |= VXWANT; - simple_unlock(&vp->v_interlock); - (void)tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0); - } - return (0); + vnode_lock(vp); + if (vp->v_lflag & VL_TERMINATE) { + vnode_unlock(vp); + return(ENOENT); } + vnode_unlock(vp); /* * Ensure that vp will not be vgone'd while we * are eliminating its aliases. */ - vp->v_flag |= VXLOCK; - simple_unlock(&vp->v_interlock); + SPECHASH_LOCK(); while (vp->v_flag & VALIASED) { - simple_lock(&spechash_slock); for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type || vp == vq) continue; - simple_unlock(&spechash_slock); - vgone(vq); + vid = vq->v_id; + SPECHASH_UNLOCK(); + if (vnode_getwithvid(vq,vid)){ + SPECHASH_LOCK(); + break; + } + vnode_reclaim_internal(vq, 0, 0); + vnode_put(vq); + SPECHASH_LOCK(); break; } - if (vq == NULLVP) - simple_unlock(&spechash_slock); } - /* - * Remove the lock so that vgone below will - * really eliminate the vnode after which time - * vgone will awaken any sleepers. - */ - simple_lock(&vp->v_interlock); - vp->v_flag &= ~VXLOCK; + SPECHASH_UNLOCK(); } - vgonel(vp, p); + vnode_reclaim_internal(vp, 0, 0); + return (0); } @@ -1769,197 +1841,174 @@ vop_revoke(ap) * Release the passed interlock if the vnode will be recycled. */ int -vrecycle(vp, inter_lkp, p) +vnode_recycle(vp) struct vnode *vp; - struct slock *inter_lkp; - struct proc *p; { + vnode_lock(vp); - simple_lock(&vp->v_interlock); - if (vp->v_usecount == 0) { - if (inter_lkp) - simple_unlock(inter_lkp); - vgonel(vp, p); - return (1); - } - simple_unlock(&vp->v_interlock); - return (0); + if (vp->v_iocount || vp->v_usecount) { + vp->v_lflag |= VL_MARKTERM; + vnode_unlock(vp); + return(0); + } + vnode_reclaim_internal(vp, 1, 0); + vnode_unlock(vp); + + return (1); } -/* - * Eliminate all activity associated with a vnode - * in preparation for reuse. - */ -void -vgone(vp) - struct vnode *vp; +static int +vnode_reload(vnode_t vp) { - struct proc *p = current_proc(); + vnode_lock(vp); - simple_lock(&vp->v_interlock); - vgonel(vp, p); + if ((vp->v_iocount > 1) || vp->v_usecount) { + vnode_unlock(vp); + return(0); + } + if (vp->v_iocount <= 0) + panic("vnode_reload with no iocount %d", vp->v_iocount); + + /* mark for release when iocount is dopped */ + vp->v_lflag |= VL_MARKTERM; + vnode_unlock(vp); + + return (1); } -/* - * vgone, with the vp interlock held. - */ -void -vgonel(vp, p) - struct vnode *vp; - struct proc *p; + +static void +vgone(vnode_t vp) { struct vnode *vq; struct vnode *vx; - /* - * if the vnode is not obtained by calling getnewvnode() we - * are not responsible for the cleanup. Just return. 
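Editorial note: vnode_recycle above (and vnode_reload after it) reclaims on the spot only when no references remain; otherwise it sets VL_MARKTERM so the holder of the last reference performs the reclaim on its way out, as seen in vnode_rele_internal earlier. A minimal sketch of that decision, assuming a hypothetical object type and reclaim() routine, with the caller holding the object's lock:

struct object {
	int iocount;
	int usecount;
	int markterm;       /* analogue of VL_MARKTERM */
};

extern void reclaim(struct object *);

/* returns 1 if reclaimed immediately, 0 if deferred to the last dropper */
int object_recycle(struct object *o)
{
	if (o->iocount != 0 || o->usecount != 0) {
		o->markterm = 1;        /* the final drop will reclaim */
		return 0;
	}
	reclaim(o);
	return 1;
}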
- */ - if (!(vp->v_flag & VSTANDARD)) { - simple_unlock(&vp->v_interlock); - return; - } - - /* - * If a vgone (or vclean) is already in progress, - * wait until it is done and return. - */ - if (vp->v_flag & VXLOCK) { - while (vp->v_flag & VXLOCK) { - vp->v_flag |= VXWANT; - simple_unlock(&vp->v_interlock); - (void)tsleep((caddr_t)vp, PINOD, "vgone", 0); - } - return; - } /* * Clean out the filesystem specific data. + * vclean also takes care of removing the + * vnode from any mount list it might be on */ - vclean(vp, DOCLOSE, p); - /* - * Delete from old mount point vnode list, if on one. - */ - if (vp->v_mount != NULL) - insmntque(vp, (struct mount *)0); + vclean(vp, DOCLOSE, current_proc()); + /* * If special device, remove it from special device alias list * if it is on one. */ if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) { - simple_lock(&spechash_slock); - if (*vp->v_hashchain == vp) { - *vp->v_hashchain = vp->v_specnext; - } else { - for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { - if (vq->v_specnext != vp) - continue; - vq->v_specnext = vp->v_specnext; - break; - } + SPECHASH_LOCK(); + if (*vp->v_hashchain == vp) { + *vp->v_hashchain = vp->v_specnext; + } else { + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_specnext != vp) + continue; + vq->v_specnext = vp->v_specnext; + break; + } if (vq == NULL) panic("missing bdev"); - } - if (vp->v_flag & VALIASED) { - vx = NULL; - for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { - if (vq->v_rdev != vp->v_rdev || - vq->v_type != vp->v_type) - continue; - if (vx) - break; - vx = vq; } - if (vx == NULL) - panic("missing alias"); - if (vq == NULL) - vx->v_flag &= ~VALIASED; - vp->v_flag &= ~VALIASED; - } - simple_unlock(&spechash_slock); - { - struct specinfo *tmp = vp->v_specinfo; - vp->v_specinfo = NULL; - FREE_ZONE((void *)tmp, sizeof(struct specinfo), M_SPECINFO); - } - } - /* - * If it is on the freelist and not already at the head, - * move it to the head of the list. The test of the back - * pointer and the reference count of zero is because - * it will be removed from the free list by getnewvnode, - * but will not have its reference count incremented until - * after calling vgone. If the reference count were - * incremented first, vgone would (incorrectly) try to - * close the previous instance of the underlying object. - * So, the back pointer is explicitly set to `0xdeadb' in - * getnewvnode after removing it from the freelist to ensure - * that we do not try to move it here. - */ - if (vp->v_usecount == 0 && (vp->v_flag & VUINACTIVE) == 0) { - simple_lock(&vnode_free_list_slock); - if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) && - vnode_free_list.tqh_first != vp) { - TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); - TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); - } - simple_unlock(&vnode_free_list_slock); + if (vp->v_flag & VALIASED) { + vx = NULL; + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type) + continue; + if (vx) + break; + vx = vq; + } + if (vx == NULL) + panic("missing alias"); + if (vq == NULL) + vx->v_flag &= ~VALIASED; + vp->v_flag &= ~VALIASED; + } + SPECHASH_UNLOCK(); + { + struct specinfo *tmp = vp->v_specinfo; + vp->v_specinfo = NULL; + FREE_ZONE((void *)tmp, sizeof(struct specinfo), M_SPECINFO); + } } - vp->v_type = VBAD; } /* * Lookup a vnode by device number. 
*/ int -vfinddev(dev, type, vpp) - dev_t dev; - enum vtype type; - struct vnode **vpp; +check_mountedon(dev_t dev, enum vtype type, int *errorp) { - struct vnode *vp; + vnode_t vp; int rc = 0; + int vid; - simple_lock(&spechash_slock); +loop: + SPECHASH_LOCK(); for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) { if (dev != vp->v_rdev || type != vp->v_type) continue; - *vpp = vp; - rc = 1; - break; + vid = vp->v_id; + SPECHASH_UNLOCK(); + if (vnode_getwithvid(vp,vid)) + goto loop; + vnode_lock(vp); + if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) { + vnode_unlock(vp); + if ((*errorp = vfs_mountedon(vp)) != 0) + rc = 1; + } else + vnode_unlock(vp); + vnode_put(vp); + return(rc); } - simple_unlock(&spechash_slock); - return (rc); + SPECHASH_UNLOCK(); + return (0); } /* * Calculate the total number of references to a special device. */ int -vcount(vp) - struct vnode *vp; +vcount(vnode_t vp) { - struct vnode *vq, *vnext; + vnode_t vq, vnext; int count; + int vid; loop: if ((vp->v_flag & VALIASED) == 0) - return (vp->v_usecount); - simple_lock(&spechash_slock); + return (vp->v_usecount - vp->v_kusecount); + + SPECHASH_LOCK(); for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) { vnext = vq->v_specnext; if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) continue; + vid = vq->v_id; + SPECHASH_UNLOCK(); + + if (vnode_getwithvid(vq, vid)) { + goto loop; + } /* * Alias, but not in use, so flush it out. */ - if (vq->v_usecount == 0 && vq != vp) { - simple_unlock(&spechash_slock); - vgone(vq); + vnode_lock(vq); + if ((vq->v_usecount == 0) && (vq->v_iocount == 1) && vq != vp) { + vnode_reclaim_internal(vq, 1, 0); + vnode_unlock(vq); + vnode_put(vq); goto loop; } - count += vq->v_usecount; + count += (vq->v_usecount - vq->v_kusecount); + vnode_unlock(vq); + vnode_put(vq); + + SPECHASH_LOCK(); } - simple_unlock(&spechash_slock); + SPECHASH_UNLOCK(); + return (count); } @@ -1972,162 +2021,188 @@ static char *typename[] = { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" }; void -vprint(label, vp) - char *label; - register struct vnode *vp; +vprint(const char *label, struct vnode *vp) { - char buf[64]; + char sbuf[64]; if (label != NULL) printf("%s: ", label); - printf("type %s, usecount %d, writecount %d, refcount %d,", - typename[vp->v_type], vp->v_usecount, vp->v_writecount, - vp->v_holdcnt); - buf[0] = '\0'; + printf("type %s, usecount %d, writecount %d", + typename[vp->v_type], vp->v_usecount, vp->v_writecount); + sbuf[0] = '\0'; if (vp->v_flag & VROOT) - strcat(buf, "|VROOT"); + strcat(sbuf, "|VROOT"); if (vp->v_flag & VTEXT) - strcat(buf, "|VTEXT"); + strcat(sbuf, "|VTEXT"); if (vp->v_flag & VSYSTEM) - strcat(buf, "|VSYSTEM"); + strcat(sbuf, "|VSYSTEM"); if (vp->v_flag & VNOFLUSH) - strcat(buf, "|VNOFLUSH"); - if (vp->v_flag & VXLOCK) - strcat(buf, "|VXLOCK"); - if (vp->v_flag & VXWANT) - strcat(buf, "|VXWANT"); + strcat(sbuf, "|VNOFLUSH"); if (vp->v_flag & VBWAIT) - strcat(buf, "|VBWAIT"); + strcat(sbuf, "|VBWAIT"); if (vp->v_flag & VALIASED) - strcat(buf, "|VALIASED"); - if (buf[0] != '\0') - printf(" flags (%s)", &buf[1]); - if (vp->v_data == NULL) { - printf("\n"); - } else { - printf("\n\t"); - VOP_PRINT(vp); - } + strcat(sbuf, "|VALIASED"); + if (sbuf[0] != '\0') + printf(" flags (%s)", &sbuf[1]); } -#ifdef DEBUG -/* - * List all of the locked vnodes in the system. - * Called when debugging the kernel. 
- */ -void -printlockedvnodes() -{ - struct proc *p = current_proc(); - struct mount *mp, *nmp; - struct vnode *vp; - printf("Locked vnodes\n"); - simple_lock(&mountlist_slock); - for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { - if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { - nmp = mp->mnt_list.cqe_next; - continue; - } - for (vp = mp->mnt_vnodelist.lh_first; - vp != NULL; - vp = vp->v_mntvnodes.le_next) { - if (VOP_ISLOCKED(vp)) - vprint((char *)0, vp); - } - simple_lock(&mountlist_slock); - nmp = mp->mnt_list.cqe_next; - vfs_unbusy(mp, p); - } - simple_unlock(&mountlist_slock); +int +vn_getpath(struct vnode *vp, char *pathbuf, int *len) +{ + return build_path(vp, pathbuf, *len, len); } -#endif + + +static char *extension_table=NULL; +static int nexts; +static int max_ext_width; static int -build_path(struct vnode *vp, char *buff, int buflen, int *outlen) -{ - char *end, *str; - int i, len, ret=0, counter=0; - - end = &buff[buflen-1]; - *--end = '\0'; - - while(vp && VPARENT(vp) != vp) { - // the maximum depth of a file system hierarchy is MAXPATHLEN/2 - // (with single-char names separated by slashes). we panic if - // we've ever looped more than that. - if (counter++ > MAXPATHLEN/2) { - panic("build_path: vnode parent chain is too long! vp 0x%x\n", vp); - } - str = VNAME(vp); - if (VNAME(vp) == NULL) { - if (VPARENT(vp) != NULL) { - ret = EINVAL; - } - break; - } - - // count how long the string is - for(len=0; *str; str++, len++) - /* nothing */; +extension_cmp(void *a, void *b) +{ + return (strlen((char *)a) - strlen((char *)b)); +} - // check that there's enough space - if ((end - buff) < len) { - ret = ENOSPC; - break; - } - // copy it backwards - for(; len > 0; len--) { - *--end = *--str; +// +// This is the api LaunchServices uses to inform the kernel +// the list of package extensions to ignore. +// +// Internally we keep the list sorted by the length of the +// the extension (from longest to shortest). We sort the +// list of extensions so that we can speed up our searches +// when comparing file names -- we only compare extensions +// that could possibly fit into the file name, not all of +// them (i.e. a short 8 character name can't have an 8 +// character extension). +// +__private_extern__ int +set_package_extensions_table(void *data, int nentries, int maxwidth) +{ + char *new_exts, *ptr; + int error, i, len; + + if (nentries <= 0 || nentries > 1024 || maxwidth <= 0 || maxwidth > 255) { + return EINVAL; + } + + MALLOC(new_exts, char *, nentries * maxwidth, M_TEMP, M_WAITOK); + + error = copyin(CAST_USER_ADDR_T(data), new_exts, nentries * maxwidth); + if (error) { + FREE(new_exts, M_TEMP); + return error; + } + + if (extension_table) { + FREE(extension_table, M_TEMP); + } + extension_table = new_exts; + nexts = nentries; + max_ext_width = maxwidth; + + qsort(extension_table, nexts, maxwidth, extension_cmp); + + return 0; +} + + +__private_extern__ int +is_package_name(char *name, int len) +{ + int i, extlen; + char *ptr, *name_ext; + + if (len <= 3) { + return 0; + } + + name_ext = NULL; + for(ptr=name; *ptr != '\0'; ptr++) { + if (*ptr == '.') { + name_ext = ptr; } + } - // put in the path separator - *--end = '/'; + // if there is no "." extension, it can't match + if (name_ext == NULL) { + return 0; + } - // walk up the chain. - vp = VPARENT(vp); + // advance over the "." + name_ext++; - // check if we're crossing a mount point and - // switch the vp if we are. 
- if (vp && (vp->v_flag & VROOT)) { - vp = vp->v_mount->mnt_vnodecovered; + // now iterate over all the extensions to see if any match + ptr = &extension_table[0]; + for(i=0; i < nexts; i++, ptr+=max_ext_width) { + extlen = strlen(ptr); + if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') { + // aha, a match! + return 1; } } - // slide it down to the beginning of the buffer - memmove(buff, end, &buff[buflen] - end); - - *outlen = &buff[buflen] - end; - - return ret; + // if we get here, no extension matched + return 0; } -__private_extern__ int -vn_getpath(struct vnode *vp, char *pathbuf, int *len) +int +vn_path_package_check(__unused vnode_t vp, char *path, int pathlen, int *component) { - return build_path(vp, pathbuf, *len, len); -} + char *ptr, *end; + int comp=0; + + *component = -1; + if (*path != '/') { + return EINVAL; + } + + end = path + 1; + while(end < path + pathlen && *end != '\0') { + while(end < path + pathlen && *end == '/' && *end != '\0') { + end++; + } + + ptr = end; + + while(end < path + pathlen && *end != '/' && *end != '\0') { + end++; + } + + if (end > path + pathlen) { + // hmm, string wasn't null terminated + return EINVAL; + } + + *end = '\0'; + if (is_package_name(ptr, end - ptr)) { + *component = comp; + break; + } + end++; + comp++; + } + + return 0; +} /* * Top level filesystem related information gathering. */ +extern unsigned int vfs_nummntops; + int -vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; +vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, struct proc *p) { - struct vfsconf *vfsp; + struct vfstable *vfsp; int *username; u_int usernamelen; int error; + struct vfsconf *vfsc; /* * The VFS_NUMMNTOPS shouldn't be at name[0] since @@ -2140,7 +2215,6 @@ vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) * name[1]: VFS_NUMMNTOPS */ if (namelen == 1 && name[0] == VFS_NUMMNTOPS) { - extern unsigned int vfs_nummntops; return (sysctl_rdint(oldp, oldlenp, newp, vfs_nummntops)); } @@ -2148,13 +2222,18 @@ vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) if (namelen < 2) return (EISDIR); /* overloaded */ if (name[0] != VFS_GENERIC) { + struct vfs_context context; + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[0]) break; if (vfsp == NULL) - return (EOPNOTSUPP); + return (ENOTSUP); + context.vc_proc = p; + context.vc_ucred = kauth_cred_get(); + return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, - oldp, oldlenp, newp, newlen, p)); + oldp, oldlenp, newp, newlen, &context)); } switch (name[1]) { case VFS_MAXTYPENUM: @@ -2166,9 +2245,27 @@ vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) - return (EOPNOTSUPP); - return (sysctl_rdstruct(oldp, oldlenp, newp, vfsp, - sizeof(struct vfsconf))); + return (ENOTSUP); + vfsc = (struct vfsconf *)vfsp; + if (proc_is64bit(p)) { + struct user_vfsconf usr_vfsc; + usr_vfsc.vfc_vfsops = CAST_USER_ADDR_T(vfsc->vfc_vfsops); + bcopy(vfsc->vfc_name, usr_vfsc.vfc_name, sizeof(usr_vfsc.vfc_name)); + usr_vfsc.vfc_typenum = vfsc->vfc_typenum; + usr_vfsc.vfc_refcount = vfsc->vfc_refcount; + usr_vfsc.vfc_flags = vfsc->vfc_flags; + usr_vfsc.vfc_mountroot = CAST_USER_ADDR_T(vfsc->vfc_mountroot); + usr_vfsc.vfc_next = CAST_USER_ADDR_T(vfsc->vfc_next); + return (sysctl_rdstruct(oldp, oldlenp, newp, &usr_vfsc, + 
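Editorial note: set_package_extensions_table and is_package_name above keep a fixed-width table of extensions, sort it with qsort, and match the text after the last '.' case-insensitively. Below is a self-contained userspace rendition of that scheme; note that the committed extension_cmp sorts by ascending length even though the block comment describes a longest-to-shortest order, and the sketch keeps the committed behavior:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#define MAXWIDTH 16
static char table[][MAXWIDTH] = { "app", "bundle", "framework", "kext" };
static const int nexts = sizeof(table) / sizeof(table[0]);

static int ext_cmp(const void *a, const void *b)
{
	return (int)(strlen((const char *)a) - strlen((const char *)b));
}

static int is_package(const char *name)
{
	const char *dot = strrchr(name, '.');   /* last '.' wins */

	if (dot == NULL || dot == name)
		return 0;                       /* no extension to match */
	dot++;
	for (int i = 0; i < nexts; i++) {
		size_t extlen = strlen(table[i]);
		if (strncasecmp(dot, table[i], extlen) == 0 &&
		    dot[extlen] == '\0')
			return 1;               /* a match */
	}
	return 0;
}

int main(void)
{
	qsort(table, nexts, MAXWIDTH, ext_cmp);
	printf("%d %d\n", is_package("Foo.app"),     /* prints 1 */
	    is_package("notes.txt"));                /* prints 0 */
	return 0;
}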
sizeof(usr_vfsc))); + } + else { + return (sysctl_rdstruct(oldp, oldlenp, newp, vfsc, + sizeof(struct vfsconf))); + } + + case VFS_SET_PACKAGE_EXTS: + return set_package_extensions_table((void *)name[1], name[2], name[3]); } /* * We need to get back into the general MIB, so we need to re-prepend @@ -2179,8 +2276,8 @@ vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) M_TEMP, M_WAITOK); bcopy(name, username + 1, namelen * sizeof(*name)); username[0] = CTL_VFS; - error = userland_sysctl(p, username, usernamelen, oldp, oldlenp, 1, - newp, newlen, oldlenp); + error = userland_sysctl(p, username, usernamelen, oldp, + oldlenp, 1, newp, newlen, oldlenp); FREE(username, M_TEMP); return (error); } @@ -2193,11 +2290,9 @@ int kinfo_vdebug = 1; */ /* ARGSUSED */ int -sysctl_vnode(where, sizep, p) - char *where; - size_t *sizep; - struct proc *p; +sysctl_vnode(__unused user_addr_t where, __unused size_t *sizep) { +#if 0 struct mount *mp, *nmp; struct vnode *nvp, *vp; char *bp = where, *savebp; @@ -2212,55 +2307,46 @@ sysctl_vnode(where, sizep, p) } ewhere = where + *sizep; - simple_lock(&mountlist_slock); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { - if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + if (vfs_busy(mp, LK_NOWAIT)) { nmp = mp->mnt_list.cqe_next; continue; } savebp = bp; again: - simple_lock(&mntvnode_slock); - for (vp = mp->mnt_vnodelist.lh_first; - vp != NULL; - vp = nvp) { + TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { /* * Check that the vp is still associated with * this filesystem. RACE: could have been * recycled onto the same filesystem. */ if (vp->v_mount != mp) { - simple_unlock(&mntvnode_slock); if (kinfo_vdebug) printf("kinfo: vp changed\n"); bp = savebp; goto again; } - nvp = vp->v_mntvnodes.le_next; if (bp + VPTRSZ + VNODESZ > ewhere) { - simple_unlock(&mntvnode_slock); - vfs_unbusy(mp, p); + vfs_unbusy(mp); *sizep = bp - where; return (ENOMEM); } - simple_unlock(&mntvnode_slock); if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) || (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ))) { - vfs_unbusy(mp, p); + vfs_unbusy(mp); return (error); } bp += VPTRSZ + VNODESZ; - simple_lock(&mntvnode_slock); } - simple_unlock(&mntvnode_slock); - simple_lock(&mountlist_slock); nmp = mp->mnt_list.cqe_next; - vfs_unbusy(mp, p); + vfs_unbusy(mp); } - simple_unlock(&mountlist_slock); *sizep = bp - where; return (0); +#else + return(EINVAL); +#endif } /* @@ -2273,10 +2359,12 @@ vfs_mountedon(vp) struct vnode *vq; int error = 0; - if (vp->v_specflags & SI_MOUNTEDON) - return (EBUSY); + SPECHASH_LOCK(); + if (vp->v_specflags & SI_MOUNTEDON) { + error = EBUSY; + goto out; + } if (vp->v_flag & VALIASED) { - simple_lock(&spechash_slock); for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) @@ -2286,8 +2374,9 @@ vfs_mountedon(vp) break; } } - simple_unlock(&spechash_slock); } +out: + SPECHASH_UNLOCK(); return (error); } @@ -2298,635 +2387,89 @@ vfs_mountedon(vp) __private_extern__ void vfs_unmountall() { - struct mount *mp, *nmp; + struct mount *mp; struct proc *p = current_proc(); + int error; /* * Since this only runs when rebooting, it is not interlocked. */ - for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { - nmp = mp->mnt_list.cqe_prev; - (void) dounmount(mp, MNT_FORCE, p); - } -} - -/* - * Build hash lists of net addresses and hang them off the mount point. - * Called by vfs_export() to set up the lists of export addresses. 
- */ -static int -vfs_hang_addrlist(mp, nep, argp) - struct mount *mp; - struct netexport *nep; - struct export_args *argp; -{ - register struct netcred *np; - register struct radix_node_head *rnh; - register int i; - struct radix_node *rn; - struct sockaddr *saddr, *smask = 0; - struct domain *dom; - int error; - - if (argp->ex_addrlen == 0) { - if (mp->mnt_flag & MNT_DEFEXPORTED) - return (EPERM); - np = &nep->ne_defexported; - np->netc_exflags = argp->ex_flags; - np->netc_anon = argp->ex_anon; - np->netc_anon.cr_ref = 1; - mp->mnt_flag |= MNT_DEFEXPORTED; - return (0); - } - i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; - MALLOC(np, struct netcred *, i, M_NETADDR, M_WAITOK); - bzero((caddr_t)np, i); - saddr = (struct sockaddr *)(np + 1); - if (error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen)) - goto out; - if (saddr->sa_len > argp->ex_addrlen) - saddr->sa_len = argp->ex_addrlen; - if (argp->ex_masklen) { - smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen); - error = copyin(argp->ex_addr, (caddr_t)smask, argp->ex_masklen); - if (error) - goto out; - if (smask->sa_len > argp->ex_masklen) - smask->sa_len = argp->ex_masklen; - } - i = saddr->sa_family; - if ((rnh = nep->ne_rtable[i]) == 0) { - /* - * Seems silly to initialize every AF when most are not - * used, do so on demand here - */ - for (dom = domains; dom; dom = dom->dom_next) - if (dom->dom_family == i && dom->dom_rtattach) { - dom->dom_rtattach((void **)&nep->ne_rtable[i], - dom->dom_rtoffset); - break; - } - if ((rnh = nep->ne_rtable[i]) == 0) { - error = ENOBUFS; - goto out; + mount_list_lock(); + while(!TAILQ_EMPTY(&mountlist)) { + mp = TAILQ_LAST(&mountlist, mntlist); + mount_list_unlock(); + error = dounmount(mp, MNT_FORCE, p); + if (error) { + mount_list_lock(); + TAILQ_REMOVE(&mountlist, mp, mnt_list); + printf("unmount of %s failed (", mp->mnt_vfsstat.f_mntonname); + if (error == EBUSY) + printf("BUSY)\n"); + else + printf("%d)\n", error); + continue; } + mount_list_lock(); } - rn = (*rnh->rnh_addaddr)((caddr_t)saddr, (caddr_t)smask, rnh, - np->netc_rnodes); - if (rn == 0) { - /* - * One of the reasons that rnh_addaddr may fail is that - * the entry already exists. To check for this case, we - * look up the entry to see if it is there. If so, we - * do not need to make a new entry but do return success. - */ - _FREE(np, M_NETADDR); - rn = (*rnh->rnh_matchaddr)((caddr_t)saddr, rnh); - if (rn != 0 && (rn->rn_flags & RNF_ROOT) == 0 && - ((struct netcred *)rn)->netc_exflags == argp->ex_flags && - !bcmp((caddr_t)&((struct netcred *)rn)->netc_anon, - (caddr_t)&argp->ex_anon, sizeof(struct ucred))) - return (0); - return (EPERM); - } - np->netc_exflags = argp->ex_flags; - np->netc_anon = argp->ex_anon; - np->netc_anon.cr_ref = 1; - return (0); -out: - _FREE(np, M_NETADDR); - return (error); + mount_list_unlock(); } -/* ARGSUSED */ -static int -vfs_free_netcred(rn, w) - struct radix_node *rn; - caddr_t w; -{ - register struct radix_node_head *rnh = (struct radix_node_head *)w; - - (*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh); - _FREE((caddr_t)rn, M_NETADDR); - return (0); -} -/* - * Free the net address hash lists that are hanging off the mount points. +/* + * This routine is called from vnode_pager_no_senders() + * which in turn can be called with vnode locked by vnode_uncache() + * But it could also get called as a result of vm_object_cache_trim(). + * In that case lock state is unknown. + * AGE the vnode so that it gets recycled quickly. 
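+ * The wakeup(&vp->v_lflag) on the way out pairs with the msleep() in
+ * vnode_getiocount(): a thread that finds VL_TERMINATE set marks the
+ * vnode VL_TERMWANT and sleeps on v_lflag until the teardown here is done.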
*/ -static void -vfs_free_addrlist(nep) - struct netexport *nep; +__private_extern__ void +vnode_pager_vrele(struct vnode *vp) { - register int i; - register struct radix_node_head *rnh; + vnode_lock(vp); - for (i = 0; i <= AF_MAX; i++) - if (rnh = nep->ne_rtable[i]) { - (*rnh->rnh_walktree)(rnh, vfs_free_netcred, - (caddr_t)rnh); - _FREE((caddr_t)rnh, M_RTABLE); - nep->ne_rtable[i] = 0; - } -} + if (!ISSET(vp->v_lflag, VL_TERMINATE)) + panic("vnode_pager_vrele: vp not in termination"); + vp->v_lflag &= ~VNAMED_UBC; -int -vfs_export(mp, nep, argp) - struct mount *mp; - struct netexport *nep; - struct export_args *argp; -{ - int error; + if (UBCINFOEXISTS(vp)) { + struct ubc_info *uip = vp->v_ubcinfo; - if (argp->ex_flags & MNT_DELEXPORT) { - vfs_free_addrlist(nep); - mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); - } - if (argp->ex_flags & MNT_EXPORTED) { - if (error = vfs_hang_addrlist(mp, nep, argp)) - return (error); - mp->mnt_flag |= MNT_EXPORTED; + if (ISSET(uip->ui_flags, UI_WASMAPPED)) + SET(vp->v_flag, VWASMAPPED); + vp->v_ubcinfo = UBC_INFO_NULL; + + ubc_info_deallocate(uip); + } else { + panic("NO ubcinfo in vnode_pager_vrele"); } - return (0); + vnode_unlock(vp); + + wakeup(&vp->v_lflag); } -struct netcred * -vfs_export_lookup(mp, nep, nam) - register struct mount *mp; - struct netexport *nep; - struct mbuf *nam; + +#include <sys/disk.h> + +errno_t +vfs_init_io_attributes(vnode_t devvp, mount_t mp) { - register struct netcred *np; - register struct radix_node_head *rnh; - struct sockaddr *saddr; + int error; + off_t readblockcnt; + off_t writeblockcnt; + off_t readmaxcnt; + off_t writemaxcnt; + off_t readsegcnt; + off_t writesegcnt; + off_t readsegsize; + off_t writesegsize; + u_long blksize; + u_int64_t temp; + struct vfs_context context; - np = NULL; - if (mp->mnt_flag & MNT_EXPORTED) { - /* - * Lookup in the export list first. - */ - if (nam != NULL) { - saddr = mtod(nam, struct sockaddr *); - rnh = nep->ne_rtable[saddr->sa_family]; - if (rnh != NULL) { - np = (struct netcred *) - (*rnh->rnh_matchaddr)((caddr_t)saddr, - rnh); - if (np && np->netc_rnodes->rn_flags & RNF_ROOT) - np = NULL; - } - } - /* - * If no address match, use the default if it exists. - */ - if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) - np = &nep->ne_defexported; - } - return (np); -} - -/* - * try to reclaim vnodes from the memory - * object cache - */ -static int -vm_object_cache_reclaim(int count) -{ - int cnt; - void vnode_pager_release_from_cache(int *); - - /* attempt to reclaim vnodes from VM object cache */ - cnt = count; - vnode_pager_release_from_cache(&cnt); - return(cnt); -} - -/* - * Release memory object reference held by inactive vnodes - * and then try to reclaim some vnodes from the memory - * object cache - */ -static int -vnreclaim(int count) -{ - int i, loopcnt; - struct vnode *vp; - int err; - struct proc *p; - - i = 0; - loopcnt = 0; - - /* Try to release "count" vnodes from the inactive list */ -restart: - if (++loopcnt > inactivevnodes) { - /* - * I did my best trying to reclaim the vnodes. - * Do not try any more as that would only lead to - * long latencies. Also in the worst case - * this can get totally CPU bound. 
- * Just fall though and attempt a reclaim of VM - * object cache - */ - goto out; - } - - simple_lock(&vnode_free_list_slock); - for (vp = TAILQ_FIRST(&vnode_inactive_list); - (vp != NULLVP) && (i < count); - vp = TAILQ_NEXT(vp, v_freelist)) { - - if (!simple_lock_try(&vp->v_interlock)) - continue; - - if (vp->v_usecount != 1) - panic("vnreclaim: v_usecount"); - - if(!UBCINFOEXISTS(vp)) { - if (vp->v_type == VBAD) { - VREMINACTIVE("vnreclaim", vp); - simple_unlock(&vp->v_interlock); - continue; - } else - panic("non UBC vnode on inactive list"); - /* Should not reach here */ - } - - /* If vnode is already being reclaimed, wait */ - if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) { - vp->v_flag |= VXWANT; - simple_unlock(&vp->v_interlock); - simple_unlock(&vnode_free_list_slock); - (void)tsleep((caddr_t)vp, PINOD, "vocr", 0); - goto restart; - } - - /* - * if the vnode is being initialized, - * skip over it - */ - if (ISSET(vp->v_flag, VUINIT)) { - SET(vp->v_flag, VUWANT); - simple_unlock(&vp->v_interlock); - continue; - } - - VREMINACTIVE("vnreclaim", vp); - simple_unlock(&vnode_free_list_slock); - - if (ubc_issetflags(vp, UI_WASMAPPED)) { - /* - * We should not reclaim as it is likely - * to be in use. Let it die a natural death. - * Release the UBC reference if one exists - * and put it back at the tail. - */ - simple_unlock(&vp->v_interlock); - if (ubc_release_named(vp)) { - if (UBCINFOEXISTS(vp)) { - simple_lock(&vp->v_interlock); - if (vp->v_usecount == 1 && !VONLIST(vp)) - vinactive(vp); - simple_unlock(&vp->v_interlock); - } - } else { - simple_lock(&vp->v_interlock); - vinactive(vp); - simple_unlock(&vp->v_interlock); - } - } else { - int didhold; - - VORECLAIM_ENABLE(vp); - - /* - * scrub the dirty pages and invalidate the buffers - */ - p = current_proc(); - err = vn_lock(vp, LK_EXCLUSIVE|LK_INTERLOCK, p); - if (err) { - /* cannot reclaim */ - simple_lock(&vp->v_interlock); - vinactive(vp); - VORECLAIM_DISABLE(vp); - i++; - simple_unlock(&vp->v_interlock); - goto restart; - } - - /* keep the vnode alive so we can kill it */ - simple_lock(&vp->v_interlock); - if(vp->v_usecount != 1) - panic("VOCR: usecount race"); - vp->v_usecount++; - simple_unlock(&vp->v_interlock); - - /* clean up the state in VM without invalidating */ - didhold = ubc_hold(vp); - if (didhold) - (void)ubc_clean(vp, 0); - - /* flush and invalidate buffers associated with the vnode */ - if (vp->v_tag == VT_NFS) - nfs_vinvalbuf(vp, V_SAVE, NOCRED, p, 0); - else - vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0); - - /* - * Note: for the v_usecount == 2 case, VOP_INACTIVE - * has not yet been called. Call it now while vp is - * still locked, it will also release the lock. - */ - if (vp->v_usecount == 2) - VOP_INACTIVE(vp, p); - else - VOP_UNLOCK(vp, 0, p); - - if (didhold) - ubc_rele(vp); - - /* - * destroy the ubc named reference. - * If we can't because it is held for I/Os - * in progress, just put it back on the inactive - * list and move on. Otherwise, the paging reference - * is toast (and so is this vnode?). 
- */ - if (ubc_destroy_named(vp)) { - i++; - } - simple_lock(&vp->v_interlock); - VORECLAIM_DISABLE(vp); - simple_unlock(&vp->v_interlock); - vrele(vp); /* release extra use we added here */ - } - /* inactive list lock was released, must restart */ - goto restart; - } - simple_unlock(&vnode_free_list_slock); - - vnode_reclaim_tried += i; -out: - i = vm_object_cache_reclaim(count); - vnode_objects_reclaimed += i; - - return(i); -} - -/* - * This routine is called from vnode_pager_no_senders() - * which in turn can be called with vnode locked by vnode_uncache() - * But it could also get called as a result of vm_object_cache_trim(). - * In that case lock state is unknown. - * AGE the vnode so that it gets recycled quickly. - * Check lock status to decide whether to call vput() or vrele(). - */ -__private_extern__ void -vnode_pager_vrele(struct vnode *vp) -{ - - boolean_t funnel_state; - int isvnreclaim = 1; - - funnel_state = thread_funnel_set(kernel_flock, TRUE); - - /* Mark the vnode to be recycled */ - vagevp(vp); - - simple_lock(&vp->v_interlock); - /* - * If a vgone (or vclean) is already in progress, - * Do not bother with the ubc_info cleanup. - * Let the vclean deal with it. - */ - if (vp->v_flag & VXLOCK) { - CLR(vp->v_flag, VTERMINATE); - if (ISSET(vp->v_flag, VTERMWANT)) { - CLR(vp->v_flag, VTERMWANT); - wakeup((caddr_t)&vp->v_ubcinfo); - } - simple_unlock(&vp->v_interlock); - vrele(vp); - (void) thread_funnel_set(kernel_flock, funnel_state); - return; - } - - /* It's dead, Jim! */ - if (!ISSET(vp->v_flag, VORECLAIM)) { - /* - * called as a result of eviction of the memory - * object from the memory object cache - */ - isvnreclaim = 0; - - /* So serialize vnode operations */ - VORECLAIM_ENABLE(vp); - } - if (!ISSET(vp->v_flag, VTERMINATE)) - SET(vp->v_flag, VTERMINATE); - - cache_purge(vp); - - if (UBCINFOEXISTS(vp)) { - struct ubc_info *uip = vp->v_ubcinfo; - - if (ubc_issetflags(vp, UI_WASMAPPED)) - SET(vp->v_flag, VWASMAPPED); - - vp->v_ubcinfo = UBC_NOINFO; /* catch bad accesses */ - simple_unlock(&vp->v_interlock); - ubc_info_deallocate(uip); - } else { - if ((vp->v_type == VBAD) && ((vp)->v_ubcinfo != UBC_INFO_NULL) - && ((vp)->v_ubcinfo != UBC_NOINFO)) { - struct ubc_info *uip = vp->v_ubcinfo; - - vp->v_ubcinfo = UBC_NOINFO; /* catch bad accesses */ - simple_unlock(&vp->v_interlock); - ubc_info_deallocate(uip); - } else { - simple_unlock(&vp->v_interlock); - } - } - - CLR(vp->v_flag, VTERMINATE); - - if (vp->v_type != VBAD){ - vgone(vp); /* revoke the vnode */ - vrele(vp); /* and drop the reference */ - } else - vrele(vp); - - if (ISSET(vp->v_flag, VTERMWANT)) { - CLR(vp->v_flag, VTERMWANT); - wakeup((caddr_t)&vp->v_ubcinfo); - } - if (!isvnreclaim) - VORECLAIM_DISABLE(vp); - (void) thread_funnel_set(kernel_flock, funnel_state); - return; -} - - -#if DIAGNOSTIC -int walk_vnodes_debug=0; - -void -walk_allvnodes() -{ - struct mount *mp, *nmp; - struct vnode *vp; - int cnt = 0; - - for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { - for (vp = mp->mnt_vnodelist.lh_first; - vp != NULL; - vp = vp->v_mntvnodes.le_next) { - if (vp->v_usecount < 0){ - if(walk_vnodes_debug) { - printf("vp is %x\n",vp); - } - } - } - nmp = mp->mnt_list.cqe_next; - } - for (cnt = 0, vp = vnode_free_list.tqh_first; - vp != NULLVP; cnt++, vp = vp->v_freelist.tqe_next) { - if ((vp->v_usecount < 0) && walk_vnodes_debug) { - if(walk_vnodes_debug) { - printf("vp is %x\n",vp); - } - } - } - printf("%d - free\n", cnt); - - for (cnt = 0, vp = vnode_inactive_list.tqh_first; - vp != NULLVP; 
cnt++, vp = vp->v_freelist.tqe_next) { - if ((vp->v_usecount < 0) && walk_vnodes_debug) { - if(walk_vnodes_debug) { - printf("vp is %x\n",vp); - } - } - } - printf("%d - inactive\n", cnt); -} -#endif /* DIAGNOSTIC */ - - -struct x_constraints { - u_int32_t x_maxreadcnt; - u_int32_t x_maxsegreadsize; - u_int32_t x_maxsegwritesize; -}; - - -void -vfs_io_attributes(vp, flags, iosize, vectors) - struct vnode *vp; - int flags; /* B_READ or B_WRITE */ - int *iosize; - int *vectors; -{ - struct mount *mp; - - /* start with "reasonable" defaults */ - *iosize = MAXPHYS; - *vectors = 32; - - mp = vp->v_mount; - if (mp != NULL) { - switch (flags) { - case B_READ: - if (mp->mnt_kern_flag & MNTK_IO_XINFO) - *iosize = ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxreadcnt; - else - *iosize = mp->mnt_maxreadcnt; - *vectors = mp->mnt_segreadcnt; - break; - case B_WRITE: - *iosize = mp->mnt_maxwritecnt; - *vectors = mp->mnt_segwritecnt; - break; - default: - break; - } - if (*iosize == 0) - *iosize = MAXPHYS; - if (*vectors == 0) - *vectors = 32; - } - return; -} + proc_t p = current_proc(); -__private_extern__ -void -vfs_io_maxsegsize(vp, flags, maxsegsize) - struct vnode *vp; - int flags; /* B_READ or B_WRITE */ - int *maxsegsize; -{ - struct mount *mp; - - /* start with "reasonable" default */ - *maxsegsize = MAXPHYS; - - mp = vp->v_mount; - if (mp != NULL) { - switch (flags) { - case B_READ: - if (mp->mnt_kern_flag & MNTK_IO_XINFO) - *maxsegsize = ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxsegreadsize; - else - /* - * if the extended info doesn't exist - * then use the maxread I/O size as the - * max segment size... this is the previous behavior - */ - *maxsegsize = mp->mnt_maxreadcnt; - break; - case B_WRITE: - if (mp->mnt_kern_flag & MNTK_IO_XINFO) - *maxsegsize = ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxsegwritesize; - else - /* - * if the extended info doesn't exist - * then use the maxwrite I/O size as the - * max segment size... 
this is the previous behavior - */ - *maxsegsize = mp->mnt_maxwritecnt; - break; - default: - break; - } - if (*maxsegsize == 0) - *maxsegsize = MAXPHYS; - } -} - - -#include <sys/disk.h> - - -int -vfs_init_io_attributes(devvp, mp) - struct vnode *devvp; - struct mount *mp; -{ - int error; - off_t readblockcnt; - off_t writeblockcnt; - off_t readmaxcnt; - off_t writemaxcnt; - off_t readsegcnt; - off_t writesegcnt; - off_t readsegsize; - off_t writesegsize; - u_long blksize; - - u_int64_t temp; - - struct proc *p = current_proc(); - struct ucred *cred = p->p_ucred; + context.vc_proc = p; + context.vc_ucred = kauth_cred_get(); int isvirtual = 0; /* @@ -2935,67 +2478,70 @@ vfs_init_io_attributes(devvp, mp) */ int thisunit = -1; static int rootunit = -1; - extern struct vnode *rootvp; if (rootunit == -1) { - if (VOP_IOCTL(rootvp, DKIOCGETBSDUNIT, (caddr_t)&rootunit, 0, cred, p)) + if (VNOP_IOCTL(rootvp, DKIOCGETBSDUNIT, (caddr_t)&rootunit, 0, &context)) rootunit = -1; else if (rootvp == devvp) mp->mnt_kern_flag |= MNTK_ROOTDEV; } if (devvp != rootvp && rootunit != -1) { - if (VOP_IOCTL(devvp, DKIOCGETBSDUNIT, (caddr_t)&thisunit, 0, cred, p) == 0) { + if (VNOP_IOCTL(devvp, DKIOCGETBSDUNIT, (caddr_t)&thisunit, 0, &context) == 0) { if (thisunit == rootunit) mp->mnt_kern_flag |= MNTK_ROOTDEV; } } - if (VOP_IOCTL(devvp, DKIOCGETISVIRTUAL, (caddr_t)&isvirtual, 0, cred, p) == 0) { + /* + * force the spec device to re-cache + * the underlying block size in case + * the filesystem overrode the initial value + */ + set_fsblocksize(devvp); + + + if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, + (caddr_t)&blksize, 0, &context))) + return (error); + + mp->mnt_devblocksize = blksize; + + if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, &context) == 0) { if (isvirtual) mp->mnt_kern_flag |= MNTK_VIRTUALDEV; } - if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, - (caddr_t)&readblockcnt, 0, cred, p))) - return (error); - - if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, - (caddr_t)&writeblockcnt, 0, cred, p))) + if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, + (caddr_t)&readblockcnt, 0, &context))) return (error); - if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, - (caddr_t)&readmaxcnt, 0, cred, p))) + if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, + (caddr_t)&writeblockcnt, 0, &context))) return (error); - if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, - (caddr_t)&writemaxcnt, 0, cred, p))) + if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, + (caddr_t)&readmaxcnt, 0, &context))) return (error); - if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD, - (caddr_t)&readsegcnt, 0, cred, p))) + if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, + (caddr_t)&writemaxcnt, 0, &context))) return (error); - if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE, - (caddr_t)&writesegcnt, 0, cred, p))) + if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD, + (caddr_t)&readsegcnt, 0, &context))) return (error); - if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD, - (caddr_t)&readsegsize, 0, cred, p))) + if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE, + (caddr_t)&writesegcnt, 0, &context))) return (error); - if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE, - (caddr_t)&writesegsize, 0, cred, p))) + if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD, + (caddr_t)&readsegsize, 0, &context))) return (error); - if ((error = VOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, - 
(caddr_t)&blksize, 0, cred, p)))
+ if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE,
+ (caddr_t)&writesegsize, 0, &context)))
 return (error);
 
- if ( !(mp->mnt_kern_flag & MNTK_IO_XINFO)) {
- MALLOC(mp->mnt_xinfo_ptr, void *, sizeof(struct x_constraints), M_TEMP, M_WAITOK);
- mp->mnt_kern_flag |= MNTK_IO_XINFO;
- }
-
 if (readmaxcnt)
 temp = (readmaxcnt > UINT32_MAX) ? UINT32_MAX : readmaxcnt;
 else {
@@ -3005,7 +2551,7 @@ vfs_init_io_attributes(devvp, mp)
 } else
 temp = MAXPHYS;
 }
- ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxreadcnt = (u_int32_t)temp;
+ mp->mnt_maxreadcnt = (u_int32_t)temp;
 
 if (writemaxcnt)
 temp = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : writemaxcnt;
@@ -3030,13 +2576,13 @@ vfs_init_io_attributes(devvp, mp)
 temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize;
 else
 temp = mp->mnt_maxreadcnt;
- ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxsegreadsize = (u_int32_t)temp;
+ mp->mnt_maxsegreadsize = (u_int32_t)temp;
 
 if (writesegsize)
 temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize;
 else
 temp = mp->mnt_maxwritecnt;
- ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxsegwritesize = (u_int32_t)temp;
+ mp->mnt_maxsegwritesize = (u_int32_t)temp;
 
 return (error);
 }
@@ -3051,7 +2597,7 @@ vfs_event_init(void)
 }
 
 void
-vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data)
+vfs_event_signal(__unused fsid_t *fsid, u_int32_t event, __unused intptr_t data)
 {
 
 KNOTE(&fs_klist, event);
@@ -3063,14 +2609,41 @@ vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data)
 static int
 sysctl_vfs_getvfscnt(void)
 {
- struct mount *mp;
- int ret = 0;
+ return(mount_getvfscnt());
+}
+
 
- simple_lock(&mountlist_slock);
- CIRCLEQ_FOREACH(mp, &mountlist, mnt_list)
- ret++;
- simple_unlock(&mountlist_slock);
+static int
+mount_getvfscnt(void)
+{
+ int ret;
+
+ mount_list_lock();
+ ret = nummounts;
+ mount_list_unlock();
 return (ret);
+
+}
+
+
+
+static int
+mount_fillfsids(fsid_t *fsidlst, int count)
+{
+ struct mount *mp;
+ int actual = 0;
+
+ mount_list_lock();
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ if (actual < count) {
+ fsidlst[actual] = mp->mnt_vfsstat.f_fsid;
+ actual++;
+ }
+ }
+ mount_list_unlock();
+ return (actual);
+
 }
 
 /*
@@ -3086,13 +2659,13 @@ sysctl_vfs_getvfslist(fsid_t *fsidlst, int count, int *actual)
 struct mount *mp;
 
 *actual = 0;
- simple_lock(&mountlist_slock);
- CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
+ mount_list_lock();
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 (*actual)++;
 if (*actual <= count)
- fsidlst[(*actual) - 1] = mp->mnt_stat.f_fsid;
+ fsidlst[(*actual) - 1] = mp->mnt_vfsstat.f_fsid;
 }
- simple_unlock(&mountlist_slock);
+ mount_list_unlock();
 return (*actual <= count ? 0 : ENOMEM);
 }
 
@@ -3104,11 +2677,11 @@ sysctl_vfs_vfslist SYSCTL_HANDLER_ARGS
 fsid_t *fsidlst;
 
 /* This is a readonly node. */
- if (req->newptr != NULL)
+ if (req->newptr != USER_ADDR_NULL)
 return (EPERM);
 
 /* they are querying us so just return the space required. 
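+ * A sketch of the usual two-step pattern from user space (hypothetical
+ * snippet; it assumes this handler is reachable through the vfs.generic
+ * sysctl node that the registrations further below hang off of):
+ *
+ *	size_t len = 0;
+ *	sysctlbyname("vfs.generic.vfslist", NULL, &len, NULL, 0);
+ *	fsid_t *ids = malloc(len);
+ *	sysctlbyname("vfs.generic.vfslist", ids, &len, NULL, 0);
+ *
+ * The mount count can grow between the two calls, which is why the
+ * fetch path below returns ENOMEM when the supplied buffer is too small.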
*/ - if (req->oldptr == NULL) { + if (req->oldptr == USER_ADDR_NULL) { req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t); return 0; } @@ -3150,22 +2723,38 @@ static int sysctl_vfs_ctlbyfsid SYSCTL_HANDLER_ARGS { struct vfsidctl vc; + struct user_vfsidctl user_vc; struct mount *mp; - struct statfs *sp; + struct vfsstatfs *sp; struct proc *p; int *name; int error, flags, namelen; + struct vfs_context context; + boolean_t is_64_bit; name = arg1; namelen = arg2; p = req->p; + context.vc_proc = p; + context.vc_ucred = kauth_cred_get(); + is_64_bit = proc_is64bit(p); - error = SYSCTL_IN(req, &vc, sizeof(vc)); - if (error) - return (error); - if (vc.vc_vers != VFS_CTL_VERS1) - return (EINVAL); - mp = vfs_getvfs(&vc.vc_fsid); + if (is_64_bit) { + error = SYSCTL_IN(req, &user_vc, sizeof(user_vc)); + if (error) + return (error); + if (user_vc.vc_vers != VFS_CTL_VERS1) + return (EINVAL); + mp = mount_list_lookupby_fsid(&user_vc.vc_fsid, 0, 0); + } + else { + error = SYSCTL_IN(req, &vc, sizeof(vc)); + if (error) + return (error); + if (vc.vc_vers != VFS_CTL_VERS1) + return (EINVAL); + mp = mount_list_lookupby_fsid(&vc.vc_fsid, 0, 0); + } if (mp == NULL) return (ENOENT); /* reset so that the fs specific code can fetch it. */ @@ -3176,33 +2765,137 @@ sysctl_vfs_ctlbyfsid SYSCTL_HANDLER_ARGS * SYSCTL_IN/OUT routines. */ if (mp->mnt_op->vfs_sysctl != NULL) { - error = mp->mnt_op->vfs_sysctl(name, namelen, - req, NULL, NULL, 0, req->p); - if (error != EOPNOTSUPP) + if (is_64_bit) { + if (vfs_64bitready(mp)) { + error = mp->mnt_op->vfs_sysctl(name, namelen, + CAST_USER_ADDR_T(req), + NULL, USER_ADDR_NULL, 0, + &context); + } + else { + error = ENOTSUP; + } + } + else { + error = mp->mnt_op->vfs_sysctl(name, namelen, + CAST_USER_ADDR_T(req), + NULL, USER_ADDR_NULL, 0, + &context); + } + if (error != ENOTSUP) return (error); } switch (name[0]) { case VFS_CTL_UMOUNT: - VCTLTOREQ(&vc, req); + req->newidx = 0; + if (is_64_bit) { + req->newptr = user_vc.vc_ptr; + req->newlen = (size_t)user_vc.vc_len; + } + else { + req->newptr = CAST_USER_ADDR_T(vc.vc_ptr); + req->newlen = vc.vc_len; + } error = SYSCTL_IN(req, &flags, sizeof(flags)); if (error) break; error = safedounmount(mp, flags, p); break; case VFS_CTL_STATFS: - VCTLTOREQ(&vc, req); + req->newidx = 0; + if (is_64_bit) { + req->newptr = user_vc.vc_ptr; + req->newlen = (size_t)user_vc.vc_len; + } + else { + req->newptr = CAST_USER_ADDR_T(vc.vc_ptr); + req->newlen = vc.vc_len; + } error = SYSCTL_IN(req, &flags, sizeof(flags)); if (error) break; - sp = &mp->mnt_stat; + sp = &mp->mnt_vfsstat; if (((flags & MNT_NOWAIT) == 0 || (flags & MNT_WAIT)) && - (error = VFS_STATFS(mp, sp, p))) + (error = vfs_update_vfsstat(mp, &context))) return (error); - sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; - error = SYSCTL_OUT(req, sp, sizeof(*sp)); + if (is_64_bit) { + struct user_statfs sfs; + bzero(&sfs, sizeof(sfs)); + sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + sfs.f_type = mp->mnt_vtable->vfc_typenum; + sfs.f_bsize = (user_long_t)sp->f_bsize; + sfs.f_iosize = (user_long_t)sp->f_iosize; + sfs.f_blocks = (user_long_t)sp->f_blocks; + sfs.f_bfree = (user_long_t)sp->f_bfree; + sfs.f_bavail = (user_long_t)sp->f_bavail; + sfs.f_files = (user_long_t)sp->f_files; + sfs.f_ffree = (user_long_t)sp->f_ffree; + sfs.f_fsid = sp->f_fsid; + sfs.f_owner = sp->f_owner; + + strncpy(&sfs.f_fstypename, &sp->f_fstypename, MFSNAMELEN-1); + strncpy(&sfs.f_mntonname, &sp->f_mntonname, MNAMELEN-1); + strncpy(&sfs.f_mntfromname, &sp->f_mntfromname, MNAMELEN-1); + + error = SYSCTL_OUT(req, 
&sfs, sizeof(sfs));
+ }
+ else {
+ struct statfs sfs;
+ bzero(&sfs, sizeof(struct statfs));
+ sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ sfs.f_type = mp->mnt_vtable->vfc_typenum;
+
+ /*
+ * It's possible for there to be more than 2^31 blocks in the filesystem, so we
+ * have to fudge the numbers here in that case. We inflate the blocksize in order
+ * to reflect the filesystem size as best we can.
+ */
+ if (sp->f_blocks > LONG_MAX) {
+ int shift;
+
+ /*
+ * Work out how far we have to shift the block count down to make it fit.
+ * Note that it's possible to have to shift so far that the resulting
+ * blocksize would be unreportably large. At that point, we will clip
+ * any values that don't fit.
+ *
+ * For safety's sake, we also ensure that f_iosize is never reported as
+ * being smaller than f_bsize.
+ */
+ for (shift = 0; shift < 32; shift++) {
+ if ((sp->f_blocks >> shift) <= LONG_MAX)
+ break;
+ if ((sp->f_bsize << (shift + 1)) > LONG_MAX)
+ break;
+ }
+#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > LONG_MAX) ? LONG_MAX : ((x) >> (s)))
+ sfs.f_blocks = (long)__SHIFT_OR_CLIP(sp->f_blocks, shift);
+ sfs.f_bfree = (long)__SHIFT_OR_CLIP(sp->f_bfree, shift);
+ sfs.f_bavail = (long)__SHIFT_OR_CLIP(sp->f_bavail, shift);
+#undef __SHIFT_OR_CLIP
+ sfs.f_bsize = (long)(sp->f_bsize << shift);
+ sfs.f_iosize = lmax(sp->f_iosize, sp->f_bsize);
+ } else {
+ sfs.f_bsize = (long)sp->f_bsize;
+ sfs.f_iosize = (long)sp->f_iosize;
+ sfs.f_blocks = (long)sp->f_blocks;
+ sfs.f_bfree = (long)sp->f_bfree;
+ sfs.f_bavail = (long)sp->f_bavail;
+ }
+ sfs.f_files = (long)sp->f_files;
+ sfs.f_ffree = (long)sp->f_ffree;
+ sfs.f_fsid = sp->f_fsid;
+ sfs.f_owner = sp->f_owner;
+
+ strncpy(&sfs.f_fstypename, &sp->f_fstypename, MFSNAMELEN-1);
+ strncpy(&sfs.f_mntonname, &sp->f_mntonname, MNAMELEN-1);
+ strncpy(&sfs.f_mntfromname, &sp->f_mntfromname, MNAMELEN-1);
+
+ error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
+ }
 break;
 default:
- return (EOPNOTSUPP);
+ return (ENOTSUP);
 }
 return (error);
 }
@@ -3247,7 +2940,7 @@ sysctl_vfs_noremotehang SYSCTL_HANDLER_ARGS
 struct proc *p;
 
 /* We need a pid. */
- if (req->newptr == NULL)
+ if (req->newptr == USER_ADDR_NULL)
 return (EINVAL);
 
 error = SYSCTL_IN(req, &pid, sizeof(pid));
@@ -3262,12 +2955,13 @@ sysctl_vfs_noremotehang SYSCTL_HANDLER_ARGS
 * Fetching the value is ok, but we only fetch if the old
 * pointer is given.
 */
- if (req->oldptr != NULL) {
+ if (req->oldptr != USER_ADDR_NULL) {
 out = !((p->p_flag & P_NOREMOTEHANG) == 0);
 error = SYSCTL_OUT(req, &out, sizeof(out));
 return (error);
 }
 
+ /* XXX req->p->p_ucred -> kauth_cred_get() ??? */
 /* cansignal offers us enough security. */
 if (p != req->p && suser(req->p->p_ucred, &req->p->p_acflag) != 0)
 return (EPERM);
@@ -3289,4 +2983,2811 @@
 SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW,
 sysctl_vfs_ctlbyfsid, "ctlbyfsid");
 SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW,
 0, 0, sysctl_vfs_noremotehang, "I", "noremotehang");
+
+
+int num_reusedvnodes=0;
+static int
+new_vnode(vnode_t *vpp)
+{
+ vnode_t vp;
+ int retries = 0; /* retry in case of tablefull */
+ int vpid;
+ struct timespec ts;
+
+retry:
+ vnode_list_lock();
+
+ if ( !TAILQ_EMPTY(&vnode_free_list)) {
+ /*
+ * Pick the first vp for possible reuse
+ */
+ vp = TAILQ_FIRST(&vnode_free_list);
+
+ if (vp->v_lflag & VL_DEAD)
+ goto steal_this_vp;
+ } else
+ vp = NULL;
+
+ /*
+ * we're either empty, or the next guy on the
+ * list is a valid vnode... 
if we're under the
+ * limit, we'll create a new vnode
+ */
+ if (numvnodes < desiredvnodes) {
+ numvnodes++;
+ vnode_list_unlock();
+ MALLOC_ZONE(vp, struct vnode *, sizeof *vp, M_VNODE, M_WAITOK);
+ bzero((char *)vp, sizeof *vp);
+ VLISTNONE(vp); /* avoid double queue removal */
+ lck_mtx_init(&vp->v_lock, vnode_lck_grp, vnode_lck_attr);
+
+ nanouptime(&ts);
+ vp->v_id = ts.tv_nsec;
+ vp->v_flag = VSTANDARD;
+
+ goto done;
+ }
+ if (vp == NULL) {
+ /*
+ * we've reached the system-imposed maximum number of vnodes
+ * but there isn't a single one available
+ * wait a bit and then retry... if we can't get a vnode
+ * after 100 retries, then log a complaint
+ */
+ if (++retries <= 100) {
+ vnode_list_unlock();
+ IOSleep(1);
+ goto retry;
+ }
+
+ vnode_list_unlock();
+ tablefull("vnode");
+ log(LOG_EMERG, "%d desired, %d numvnodes, "
+ "%d free, %d inactive\n",
+ desiredvnodes, numvnodes, freevnodes, inactivevnodes);
+ *vpp = 0;
+ return (ENFILE);
+ }
+steal_this_vp:
+ vpid = vp->v_id;
+
+ VREMFREE("new_vnode", vp);
+ VLISTNONE(vp);
+
+ vnode_list_unlock();
+ vnode_lock(vp);
+
+ /*
+ * We could wait for the vnode_lock after removing the vp from the freelist
+ * and the vid is bumped only at the very end of reclaim. So it is possible
+ * that we are looking at a vnode that is being terminated. If so skip it.
+ */
+ if ((vpid != vp->v_id) || (vp->v_usecount != 0) || (vp->v_iocount != 0) ||
+ VONLIST(vp) || (vp->v_lflag & VL_TERMINATE)) {
+ /*
+ * we lost the race between dropping the list lock
+ * and picking up the vnode_lock... someone else
+ * used this vnode and it is now in a new state
+ * so we need to go back and try again
+ */
+ vnode_unlock(vp);
+ goto retry;
+ }
+ if ( (vp->v_lflag & (VL_NEEDINACTIVE | VL_MARKTERM)) == VL_NEEDINACTIVE ) {
+ /*
+ * we did a vnode_rele_ext that asked for
+ * us not to reenter the filesystem during
+ * the release even though VL_NEEDINACTIVE was
+ * set... we'll do it here by doing a
+ * vnode_get/vnode_put
+ *
+ * pick up an iocount so that we can call
+ * vnode_put and drive the VNOP_INACTIVE...
+ * vnode_put will either leave us off
+ * the freelist if a new ref comes in,
+ * or put us back on the end of the freelist
+ * or recycle us if we were marked for termination... 
+ * so we'll just go grab a new candidate
+ */
+ vp->v_iocount++;
+#ifdef JOE_DEBUG
+ record_vp(vp, 1);
+#endif
+ vnode_put_locked(vp);
+ vnode_unlock(vp);
+ goto retry;
+ }
+ OSAddAtomic(1, &num_reusedvnodes);
+
+ /* Checks for anyone racing us for recycle */
+ if (vp->v_type != VBAD) {
+ if (vp->v_lflag & VL_DEAD)
+ panic("new_vnode: the vnode is VL_DEAD but not VBAD");
+
+ (void)vnode_reclaim_internal(vp, 1, 1);
+
+ if ((VONLIST(vp)))
+ panic("new_vnode: vp on list ");
+ if (vp->v_usecount || vp->v_iocount || vp->v_kusecount ||
+ (vp->v_lflag & (VNAMED_UBC | VNAMED_MOUNT | VNAMED_FSHASH)))
+ panic("new_vnode: free vnode still referenced\n");
+ if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0))
+ panic("new_vnode: vnode seems to be on mount list ");
+ if ( !LIST_EMPTY(&vp->v_nclinks) || !LIST_EMPTY(&vp->v_ncchildren))
+ panic("new_vnode: vnode still hooked into the name cache");
+ }
+ if (vp->v_unsafefs) {
+ lck_mtx_destroy(&vp->v_unsafefs->fsnodelock, vnode_lck_grp);
+ FREE_ZONE((void *)vp->v_unsafefs, sizeof(struct unsafe_fsnode), M_UNSAFEFS);
+ vp->v_unsafefs = (struct unsafe_fsnode *)NULL;
+ }
+ vp->v_lflag = 0;
+ vp->v_writecount = 0;
+ vp->v_references = 0;
+ vp->v_iterblkflags = 0;
+ vp->v_flag = VSTANDARD;
+ /* vbad vnodes can point to dead_mountp */
+ vp->v_mount = 0;
+ vp->v_defer_reclaimlist = (vnode_t)0;
+
+ vnode_unlock(vp);
+done:
+ *vpp = vp;
+
+ return (0);
+}
+
+void
+vnode_lock(vnode_t vp)
+{
+ lck_mtx_lock(&vp->v_lock);
+}
+
+void
+vnode_unlock(vnode_t vp)
+{
+ lck_mtx_unlock(&vp->v_lock);
+}
+
+
+
+int
+vnode_get(struct vnode *vp)
+{
+ vnode_lock(vp);
+
+ if ( (vp->v_iocount == 0) && (vp->v_lflag & (VL_TERMINATE | VL_DEAD)) ) {
+ vnode_unlock(vp);
+ return(ENOENT);
+ }
+ vp->v_iocount++;
+#ifdef JOE_DEBUG
+ record_vp(vp, 1);
+#endif
+ vnode_unlock(vp);
+
+ return(0);
+}
+
+int
+vnode_getwithvid(vnode_t vp, int vid)
+{
+ return(vget_internal(vp, vid, ( VNODE_NODEAD| VNODE_WITHID)));
+}
+
+int
+vnode_getwithref(vnode_t vp)
+{
+ return(vget_internal(vp, 0, 0));
+}
+
+
+int
+vnode_put(vnode_t vp)
+{
+ int retval;
+
+ vnode_lock(vp);
+ retval = vnode_put_locked(vp);
+ vnode_unlock(vp);
+
+ return(retval);
+}
+
+int
+vnode_put_locked(vnode_t vp)
+{
+ struct vfs_context context;
+
+retry:
+ if (vp->v_iocount < 1)
+ panic("vnode_put(%x): iocount < 1", vp);
+
+ if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
+ vnode_dropiocount(vp, 1);
+ return(0);
+ }
+ if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE) {
+
+ vp->v_lflag &= ~VL_NEEDINACTIVE;
+ vnode_unlock(vp);
+
+ context.vc_proc = current_proc();
+ context.vc_ucred = kauth_cred_get();
+ VNOP_INACTIVE(vp, &context);
+
+ vnode_lock(vp);
+ /*
+ * because we had to drop the vnode lock before calling
+ * VNOP_INACTIVE, the state of this vnode may have changed...
+ * we may pick up both VL_MARKTERM and either
+ * an iocount or a usecount while in the VNOP_INACTIVE call
+ * we don't want to call vnode_reclaim_internal on a vnode
+ * that has active references on it... so loop back around
+ * and reevaluate the state
+ */
+ goto retry;
+ }
+ vp->v_lflag &= ~VL_NEEDINACTIVE;
+
+ if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)
+ vnode_reclaim_internal(vp, 1, 0);
+
+ vnode_dropiocount(vp, 1);
+ vnode_list_add(vp);
+
+ return(0);
+}
+
+/* is vnode_t in use by others? 
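+ * refcnt is the number of usecount references the caller itself is
+ * already accounting for. A minimal usage sketch (hypothetical caller
+ * holding one reference of its own):
+ *
+ *	if (vnode_isinuse(vp, 1))
+ *		handle_busy_vnode(vp);	// hypothetical helper
+ *
+ * For VREG vnodes the answer comes from ubc_isinuse_locked(), which
+ * consults the UBC layer's accounting rather than v_usecount alone.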
*/
+int
+vnode_isinuse(vnode_t vp, int refcnt)
+{
+ return(vnode_isinuse_locked(vp, refcnt, 0));
+}
+
+
+static int
+vnode_isinuse_locked(vnode_t vp, int refcnt, int locked)
+{
+ int retval = 0;
+
+ if (!locked)
+ vnode_lock(vp);
+ if ((vp->v_type != VREG) && (vp->v_usecount > refcnt)) {
+ retval = 1;
+ goto out;
+ }
+ if (vp->v_type == VREG) {
+ retval = ubc_isinuse_locked(vp, refcnt, 1);
+ }
+
+out:
+ if (!locked)
+ vnode_unlock(vp);
+ return(retval);
+}
+
+
+/* resume vnode_t */
+errno_t
+vnode_resume(vnode_t vp)
+{
+
+ vnode_lock(vp);
+
+ if (vp->v_owner == current_thread()) {
+ vp->v_lflag &= ~VL_SUSPENDED;
+ vp->v_owner = 0;
+ vnode_unlock(vp);
+ wakeup(&vp->v_iocount);
+ } else
+ vnode_unlock(vp);
+
+ return(0);
+}
+
+static errno_t
+vnode_drain(vnode_t vp)
+{
+
+ if (vp->v_lflag & VL_DRAIN) {
+ panic("vnode_drain: recursive drain");
+ return(ENOENT);
+ }
+ vp->v_lflag |= VL_DRAIN;
+ vp->v_owner = current_thread();
+
+ while (vp->v_iocount > 1)
+ msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain", 0);
+ return(0);
+}
+
+
+/*
+ * if the number of recent references via vnode_getwithvid or vnode_getwithref
+ * exceeds this threshold, then 'UN-AGE' the vnode by removing it from
+ * the LRU list if it's currently on it... once the iocount and usecount both drop
+ * to 0, it will get put back on the end of the list, effectively making it younger
+ * this allows us to keep actively referenced vnodes in the list without having
+ * to constantly remove and add to the list each time a vnode w/o a usecount is
+ * referenced which costs us taking and dropping a global lock twice.
+ */
+#define UNAGE_THRESHHOLD 10
+
+errno_t
+vnode_getiocount(vnode_t vp, int locked, int vid, int vflags)
+{
+ int nodead = vflags & VNODE_NODEAD;
+ int nosusp = vflags & VNODE_NOSUSPEND;
+
+ if (!locked)
+ vnode_lock(vp);
+
+ for (;;) {
+ /*
+ * if it is a dead vnode with deadfs
+ */
+ if (nodead && (vp->v_lflag & VL_DEAD) && ((vp->v_type == VBAD) || (vp->v_data == 0))) {
+ if (!locked)
+ vnode_unlock(vp);
+ return(ENOENT);
+ }
+ /*
+ * will return VL_DEAD ones
+ */
+ if ((vp->v_lflag & (VL_SUSPENDED | VL_DRAIN | VL_TERMINATE)) == 0 ) {
+ break;
+ }
+ /*
+ * if suspended vnodes are to be failed
+ */
+ if (nosusp && (vp->v_lflag & VL_SUSPENDED)) {
+ if (!locked)
+ vnode_unlock(vp);
+ return(ENOENT);
+ }
+ /*
+ * if you are the owner of drain/suspend/termination, you can acquire the iocount
+ * check for VL_TERMINATE; it does not set owner
+ */
+ if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE)) &&
+ (vp->v_owner == current_thread())) {
+ break;
+ }
+ if (vp->v_lflag & VL_TERMINATE) {
+ vp->v_lflag |= VL_TERMWANT;
+
+ msleep(&vp->v_lflag, &vp->v_lock, PVFS, "vnode getiocount", 0);
+ } else
+ msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_getiocount", 0);
+ }
+ if (vid != vp->v_id) {
+ if (!locked)
+ vnode_unlock(vp);
+ return(ENOENT);
+ }
+ if (++vp->v_references >= UNAGE_THRESHHOLD) {
+ vp->v_references = 0;
+ vnode_list_remove(vp);
+ }
+ vp->v_iocount++;
+#ifdef JOE_DEBUG
+ record_vp(vp, 1);
+#endif
+ if (!locked)
+ vnode_unlock(vp);
+ return(0);
+}
+
+static void
+vnode_dropiocount (vnode_t vp, int locked)
+{
+ if (!locked)
+ vnode_lock(vp);
+ if (vp->v_iocount < 1)
+ panic("vnode_dropiocount(%x): v_iocount < 1", vp);
+
+ vp->v_iocount--;
+#ifdef JOE_DEBUG
+ record_vp(vp, -1);
+#endif
+ if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED)) && (vp->v_iocount <= 1))
+ wakeup(&vp->v_iocount);
+
+ if (!locked)
+ vnode_unlock(vp);
+}
+
+
+void
+vnode_reclaim(struct vnode * vp)
+{
+ vnode_reclaim_internal(vp, 0, 0);
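+ /* the call above passes locked == 0, so vnode_reclaim_internal takes
+ * the vnode lock itself, and reuse == 0, so the vnode may go back on
+ * the free list once it has been torn down */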
+} + +__private_extern__ +void +vnode_reclaim_internal(struct vnode * vp, int locked, int reuse) +{ + int isfifo = 0; + + if (!locked) + vnode_lock(vp); + + if (vp->v_lflag & VL_TERMINATE) { + panic("vnode reclaim in progress"); + } + vp->v_lflag |= VL_TERMINATE; + + if (vnode_drain(vp)) { + panic("vnode drain failed"); + vnode_unlock(vp); + return; + } + isfifo = (vp->v_type == VFIFO); + + if (vp->v_type != VBAD) + vgone(vp); /* clean and reclaim the vnode */ + + /* + * give the vnode a new identity so + * that vnode_getwithvid will fail + * on any stale cache accesses + */ + vp->v_id++; + if (isfifo) { + struct fifoinfo * fip; + + fip = vp->v_fifoinfo; + vp->v_fifoinfo = NULL; + FREE(fip, M_TEMP); + } + + vp->v_type = VBAD; + + if (vp->v_data) + panic("vnode_reclaim_internal: cleaned vnode isn't"); + if (vp->v_numoutput) + panic("vnode_reclaim_internal: Clean vnode has pending I/O's"); + if (UBCINFOEXISTS(vp)) + panic("vnode_reclaim_internal: ubcinfo not cleaned"); + if (vp->v_parent) + panic("vnode_reclaim_internal: vparent not removed"); + if (vp->v_name) + panic("vnode_reclaim_internal: vname not removed"); + + vp->v_socket = 0; + + vp->v_lflag &= ~VL_TERMINATE; + vp->v_lflag &= ~VL_DRAIN; + vp->v_owner = 0; + + if (vp->v_lflag & VL_TERMWANT) { + vp->v_lflag &= ~VL_TERMWANT; + wakeup(&vp->v_lflag); + } + if (!reuse && vp->v_usecount == 0) + vnode_list_add(vp); + if (!locked) + vnode_unlock(vp); +} + +/* USAGE: + * The following api creates a vnode and associates all the parameter specified in vnode_fsparam + * structure and returns a vnode handle with a reference. device aliasing is handled here so checkalias + * is obsoleted by this. + * vnode_create(int flavor, size_t size, void * param, vnode_t *vp) + */ +int +vnode_create(int flavor, size_t size, void *data, vnode_t *vpp) +{ + int error; + int insert = 1; + vnode_t vp; + vnode_t nvp; + vnode_t dvp; + struct componentname *cnp; + struct vnode_fsparam *param = (struct vnode_fsparam *)data; + + if (flavor == VNCREATE_FLAVOR && (size == VCREATESIZE) && param) { + if ( (error = new_vnode(&vp)) ) { + return(error); + } else { + dvp = param->vnfs_dvp; + cnp = param->vnfs_cnp; + + vp->v_op = param->vnfs_vops; + vp->v_type = param->vnfs_vtype; + vp->v_data = param->vnfs_fsnode; + vp->v_iocount = 1; + + if (param->vnfs_markroot) + vp->v_flag |= VROOT; + if (param->vnfs_marksystem) + vp->v_flag |= VSYSTEM; + else if (vp->v_type == VREG) { + /* + * only non SYSTEM vp + */ + error = ubc_info_init_withsize(vp, param->vnfs_filesize); + if (error) { +#ifdef JOE_DEBUG + record_vp(vp, 1); +#endif + vp->v_mount = 0; + vp->v_op = dead_vnodeop_p; + vp->v_tag = VT_NON; + vp->v_data = NULL; + vp->v_type = VBAD; + vp->v_lflag |= VL_DEAD; + + vnode_put(vp); + return(error); + } + } +#ifdef JOE_DEBUG + record_vp(vp, 1); +#endif + if (vp->v_type == VCHR || vp->v_type == VBLK) { + + if ( (nvp = checkalias(vp, param->vnfs_rdev)) ) { + /* + * if checkalias returns a vnode, it will be locked + * + * first get rid of the unneeded vnode we acquired + */ + vp->v_data = NULL; + vp->v_op = spec_vnodeop_p; + vp->v_type = VBAD; + vp->v_lflag = VL_DEAD; + vp->v_data = NULL; + vp->v_tag = VT_NON; + vnode_put(vp); + + /* + * switch to aliased vnode and finish + * preparing it + */ + vp = nvp; + + vclean(vp, 0, current_proc()); + vp->v_op = param->vnfs_vops; + vp->v_type = param->vnfs_vtype; + vp->v_data = param->vnfs_fsnode; + vp->v_lflag = 0; + vp->v_mount = NULL; + insmntque(vp, param->vnfs_mp); + insert = 0; + vnode_unlock(vp); + } + } + + if (vp->v_type == VFIFO) { + 
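+ /* FIFO vnodes carry a private fifoinfo; it is allocated here and
+ * freed by vnode_reclaim_internal() when the vnode is torn down */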
struct fifoinfo *fip;
+
+ MALLOC(fip, struct fifoinfo *,
+ sizeof(*fip), M_TEMP, M_WAITOK);
+ bzero(fip, sizeof(struct fifoinfo));
+ vp->v_fifoinfo = fip;
+ }
+ /* The file systems usually pass the address of the location where
+ * they store the vnode pointer. Once we add the vnode to the mount
+ * point and the name cache it becomes discoverable, so the file
+ * system node will have its connection to the vnode set up by then.
+ */
+ *vpp = vp;
+
+ if (param->vnfs_mp) {
+ if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL)
+ vp->v_flag |= VLOCKLOCAL;
+ if (insert) {
+ /*
+ * enter in mount vnode list
+ */
+ insmntque(vp, param->vnfs_mp);
+ }
+#ifdef INTERIM_FSNODE_LOCK
+ if (param->vnfs_mp->mnt_vtable->vfc_threadsafe == 0) {
+ MALLOC_ZONE(vp->v_unsafefs, struct unsafe_fsnode *,
+ sizeof(struct unsafe_fsnode), M_UNSAFEFS, M_WAITOK);
+ vp->v_unsafefs->fsnode_count = 0;
+ vp->v_unsafefs->fsnodeowner = (void *)NULL;
+ lck_mtx_init(&vp->v_unsafefs->fsnodelock, vnode_lck_grp, vnode_lck_attr);
+ }
+#endif /* INTERIM_FSNODE_LOCK */
+ }
+ if (dvp && vnode_ref(dvp) == 0) {
+ vp->v_parent = dvp;
+ }
+ if (cnp) {
+ if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) {
+ /*
+ * enter into name cache
+ * we've got the info to enter it into the name cache now
+ */
+ cache_enter(dvp, vp, cnp);
+ }
+ vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0);
+ }
+ if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) {
+ /*
+ * this vnode is being created as cacheable in the name cache
+ * this allows us to re-enter it in the cache
+ */
+ vp->v_flag |= VNCACHEABLE;
+ }
+ if ((vp->v_flag & VSYSTEM) && (vp->v_type != VREG))
+ panic("incorrect vnode setup");
+
+ return(0);
+ }
+ }
+ return (EINVAL);
+}
+
+int
+vnode_addfsref(vnode_t vp)
+{
+ vnode_lock(vp);
+ if (vp->v_lflag & VNAMED_FSHASH)
+ panic("add_fsref: vp already has named reference");
+ if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb))
+ panic("addfsref: vp on the free list\n");
+ vp->v_lflag |= VNAMED_FSHASH;
+ vnode_unlock(vp);
+ return(0);
+
+}
+int
+vnode_removefsref(vnode_t vp)
+{
+ vnode_lock(vp);
+ if ((vp->v_lflag & VNAMED_FSHASH) == 0)
+ panic("remove_fsref: no named reference");
+ vp->v_lflag &= ~VNAMED_FSHASH;
+ vnode_unlock(vp);
+ return(0);
+
+}
+
+
+int
+vfs_iterate(__unused int flags, int (*callout)(mount_t, void *), void *arg)
+{
+ mount_t mp;
+ int ret = 0;
+ fsid_t * fsid_list;
+ int count, actualcount, i;
+ void * allocmem;
+
+ count = mount_getvfscnt();
+ count += 10;
+
+ fsid_list = (fsid_t *)kalloc(count * sizeof(fsid_t));
+ allocmem = (void *)fsid_list;
+
+ actualcount = mount_fillfsids(fsid_list, count);
+
+ for (i=0; i< actualcount; i++) {
+
+ /* obtain the mount point with iteration reference */
+ mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1);
+
+ if(mp == (struct mount *)0)
+ continue;
+ mount_lock(mp);
+ if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
+ mount_unlock(mp);
+ mount_iterdrop(mp);
+ continue;
+
+ }
+ mount_unlock(mp);
+
+ /* invoke the callout for this mount point */
+ ret = callout(mp, arg);
+
+ mount_iterdrop(mp);
+
+ switch (ret) {
+ case VFS_RETURNED:
+ case VFS_RETURNED_DONE:
+ if (ret == VFS_RETURNED_DONE) {
+ ret = 0;
+ goto out;
+ }
+ break;
+
+ case VFS_CLAIMED_DONE:
+ ret = 0;
+ goto out;
+ case VFS_CLAIMED:
+ default:
+ break;
+ }
+ ret = 0;
+ }
+
+out:
+ kfree(allocmem, (count * sizeof(fsid_t)));
+ return (ret);
+}
+
+/*
+ * Update the vfsstatfs structure in the mountpoint. 
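+ * Only the attributes requested via VFSATTR_WANTED() below are
+ * refreshed; fields the filesystem does not support fall back to the
+ * defaults derived here (e.g. the device block size for f_bsize).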
+ */ +int +vfs_update_vfsstat(mount_t mp, vfs_context_t ctx) +{ + struct vfs_attr va; + int error; + + /* + * Request the attributes we want to propagate into + * the per-mount vfsstat structure. + */ + VFSATTR_INIT(&va); + VFSATTR_WANTED(&va, f_iosize); + VFSATTR_WANTED(&va, f_blocks); + VFSATTR_WANTED(&va, f_bfree); + VFSATTR_WANTED(&va, f_bavail); + VFSATTR_WANTED(&va, f_bused); + VFSATTR_WANTED(&va, f_files); + VFSATTR_WANTED(&va, f_ffree); + VFSATTR_WANTED(&va, f_bsize); + VFSATTR_WANTED(&va, f_fssubtype); + if ((error = vfs_getattr(mp, &va, ctx)) != 0) { + KAUTH_DEBUG("STAT - filesystem returned error %d", error); + return(error); + } + + /* + * Unpack into the per-mount structure. + * + * We only overwrite these fields, which are likely to change: + * f_blocks + * f_bfree + * f_bavail + * f_bused + * f_files + * f_ffree + * + * And these which are not, but which the FS has no other way + * of providing to us: + * f_bsize + * f_iosize + * f_fssubtype + * + */ + if (VFSATTR_IS_SUPPORTED(&va, f_bsize)) { + mp->mnt_vfsstat.f_bsize = va.f_bsize; + } else { + mp->mnt_vfsstat.f_bsize = mp->mnt_devblocksize; /* default from the device block size */ + } + if (VFSATTR_IS_SUPPORTED(&va, f_iosize)) { + mp->mnt_vfsstat.f_iosize = va.f_iosize; + } else { + mp->mnt_vfsstat.f_iosize = 1024 * 1024; /* 1MB sensible I/O size */ + } + if (VFSATTR_IS_SUPPORTED(&va, f_blocks)) + mp->mnt_vfsstat.f_blocks = va.f_blocks; + if (VFSATTR_IS_SUPPORTED(&va, f_bfree)) + mp->mnt_vfsstat.f_bfree = va.f_bfree; + if (VFSATTR_IS_SUPPORTED(&va, f_bavail)) + mp->mnt_vfsstat.f_bavail = va.f_bavail; + if (VFSATTR_IS_SUPPORTED(&va, f_bused)) + mp->mnt_vfsstat.f_bused = va.f_bused; + if (VFSATTR_IS_SUPPORTED(&va, f_files)) + mp->mnt_vfsstat.f_files = va.f_files; + if (VFSATTR_IS_SUPPORTED(&va, f_ffree)) + mp->mnt_vfsstat.f_ffree = va.f_ffree; + + /* this is unlikely to change, but has to be queried for */ + if (VFSATTR_IS_SUPPORTED(&va, f_fssubtype)) + mp->mnt_vfsstat.f_fssubtype = va.f_fssubtype; + + return(0); +} + +void +mount_list_add(mount_t mp) +{ + mount_list_lock(); + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + nummounts++; + mount_list_unlock(); +} + +void +mount_list_remove(mount_t mp) +{ + mount_list_lock(); + TAILQ_REMOVE(&mountlist, mp, mnt_list); + nummounts--; + mp->mnt_list.tqe_next = 0; + mp->mnt_list.tqe_prev = 0; + mount_list_unlock(); +} + +mount_t +mount_lookupby_volfsid(int volfs_id, int withref) +{ + mount_t cur_mount = (mount_t)0; + mount_t mp ; + + mount_list_lock(); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + if (validfsnode(mp) && mp->mnt_vfsstat.f_fsid.val[0] == volfs_id) { + cur_mount = mp; + if (withref) { + if (mount_iterref(cur_mount, 1)) { + cur_mount = (mount_t)0; + mount_list_unlock(); + goto out; + } + } + break; + } + } + mount_list_unlock(); + if (withref && (cur_mount != (mount_t)0)) { + mp = cur_mount; + if (vfs_busy(mp, LK_NOWAIT) != 0) { + cur_mount = (mount_t)0; + } + mount_iterdrop(mp); + } +out: + return(cur_mount); +} + + +mount_t +mount_list_lookupby_fsid(fsid, locked, withref) + fsid_t *fsid; + int locked; + int withref; +{ + mount_t retmp = (mount_t)0; + mount_t mp; + + if (!locked) + mount_list_lock(); + TAILQ_FOREACH(mp, &mountlist, mnt_list) + if (mp->mnt_vfsstat.f_fsid.val[0] == fsid->val[0] && + mp->mnt_vfsstat.f_fsid.val[1] == fsid->val[1]) { + retmp = mp; + if (withref) { + if (mount_iterref(retmp, 1)) + retmp = (mount_t)0; + } + goto out; + } +out: + if (!locked) + mount_list_unlock(); + return (retmp); +} + +errno_t +vnode_lookup(const char *path, int flags, 
vnode_t *vpp, vfs_context_t context) +{ + struct nameidata nd; + int error; + struct vfs_context context2; + vfs_context_t ctx = context; + u_long ndflags = 0; + + if (context == NULL) { /* XXX technically an error */ + context2.vc_proc = current_proc(); + context2.vc_ucred = kauth_cred_get(); + ctx = &context2; + } + + if (flags & VNODE_LOOKUP_NOFOLLOW) + ndflags = NOFOLLOW; + else + ndflags = FOLLOW; + + if (flags & VNODE_LOOKUP_NOCROSSMOUNT) + ndflags |= NOCROSSMOUNT; + if (flags & VNODE_LOOKUP_DOWHITEOUT) + ndflags |= DOWHITEOUT; + + /* XXX AUDITVNPATH1 needed ? */ + NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); + + if ((error = namei(&nd))) + return (error); + *vpp = nd.ni_vp; + nameidone(&nd); + + return (0); +} + +errno_t +vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t context) +{ + struct nameidata nd; + int error; + struct vfs_context context2; + vfs_context_t ctx = context; + u_long ndflags = 0; + + if (context == NULL) { /* XXX technically an error */ + context2.vc_proc = current_proc(); + context2.vc_ucred = kauth_cred_get(); + ctx = &context2; + } + + if (flags & VNODE_LOOKUP_NOFOLLOW) + ndflags = NOFOLLOW; + else + ndflags = FOLLOW; + + if (flags & VNODE_LOOKUP_NOCROSSMOUNT) + ndflags |= NOCROSSMOUNT; + if (flags & VNODE_LOOKUP_DOWHITEOUT) + ndflags |= DOWHITEOUT; + + /* XXX AUDITVNPATH1 needed ? */ + NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); + + if ((error = vn_open(&nd, fmode, cmode))) + *vpp = NULL; + else + *vpp = nd.ni_vp; + + return (error); +} + +errno_t +vnode_close(vnode_t vp, int flags, vfs_context_t context) +{ + kauth_cred_t cred; + struct proc *p; + int error; + + if (context) { + p = context->vc_proc; + cred = context->vc_ucred; + } else { + p = current_proc(); + cred = kauth_cred_get(); + } + + error = vn_close(vp, flags, cred, p); + vnode_put(vp); + return (error); +} + +errno_t +vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx) +{ + struct vnode_attr va; + int error; + + VATTR_INIT(&va); + VATTR_WANTED(&va, va_data_size); + error = vnode_getattr(vp, &va, ctx); + if (!error) + *sizep = va.va_data_size; + return(error); +} + +errno_t +vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx) +{ + struct vnode_attr va; + + VATTR_INIT(&va); + VATTR_SET(&va, va_data_size, size); + va.va_vaflags = ioflag & 0xffff; + return(vnode_setattr(vp, &va, ctx)); +} + +errno_t +vn_create(vnode_t dvp, vnode_t *vpp, struct componentname *cnp, struct vnode_attr *vap, int flags, vfs_context_t ctx) +{ + kauth_acl_t oacl, nacl; + int initial_acl; + errno_t error; + vnode_t vp = (vnode_t)0; + + error = 0; + oacl = nacl = NULL; + initial_acl = 0; + + KAUTH_DEBUG("%p CREATE - '%s'", dvp, cnp->cn_nameptr); + + /* + * Handle ACL inheritance. + */ + if (!(flags & VN_CREATE_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) { + /* save the original filesec */ + if (VATTR_IS_ACTIVE(vap, va_acl)) { + initial_acl = 1; + oacl = vap->va_acl; + } + + vap->va_acl = NULL; + if ((error = kauth_acl_inherit(dvp, + oacl, + &nacl, + vap->va_type == VDIR, + ctx)) != 0) { + KAUTH_DEBUG("%p CREATE - error %d processing inheritance", dvp, error); + return(error); + } + + /* + * If the generated ACL is NULL, then we can save ourselves some effort + * by clearing the active bit. + */ + if (nacl == NULL) { + VATTR_CLEAR_ACTIVE(vap, va_acl); + } else { + VATTR_SET(vap, va_acl, nacl); + } + } + + /* + * Check and default new attributes. 
+ * This will set va_uid, va_gid, va_mode and va_create_time at least, if the caller
+ * hasn't supplied them.
+ */
+ if ((error = vnode_authattr_new(dvp, vap, flags & VN_CREATE_NOAUTH, ctx)) != 0) {
+ KAUTH_DEBUG("%p CREATE - error %d handling/defaulting attributes", dvp, error);
+ goto out;
+ }
+
+
+ /*
+ * Create the requested node.
+ */
+ switch(vap->va_type) {
+ case VREG:
+ error = VNOP_CREATE(dvp, vpp, cnp, vap, ctx);
+ break;
+ case VDIR:
+ error = VNOP_MKDIR(dvp, vpp, cnp, vap, ctx);
+ break;
+ case VSOCK:
+ case VFIFO:
+ case VBLK:
+ case VCHR:
+ error = VNOP_MKNOD(dvp, vpp, cnp, vap, ctx);
+ break;
+ default:
+ panic("vn_create: unknown vtype %d", vap->va_type);
+ }
+ if (error != 0) {
+ KAUTH_DEBUG("%p CREATE - error %d returned by filesystem", dvp, error);
+ goto out;
+ }
+
+ vp = *vpp;
+ /*
+ * If some of the requested attributes weren't handled by the VNOP,
+ * use our fallback code.
+ */
+ if (!VATTR_ALL_SUPPORTED(vap) && *vpp) {
+ KAUTH_DEBUG(" CREATE - doing fallback with ACL %p", vap->va_acl);
+ error = vnode_setattr_fallback(*vpp, vap, ctx);
+ }
+ if ((error != 0 ) && (vp != (vnode_t)0)) {
+ *vpp = (vnode_t) 0;
+ vnode_put(vp);
+ }
+
+out:
+ /*
+ * If the caller supplied a filesec in vap, it has been replaced
+ * now by the post-inheritance copy. We need to put the original back
+ * and free the inherited product.
+ */
+ if (initial_acl) {
+ VATTR_SET(vap, va_acl, oacl);
+ } else {
+ VATTR_CLEAR_ACTIVE(vap, va_acl);
+ }
+ if (nacl != NULL)
+ kauth_acl_free(nacl);
+
+ return(error);
+}
+
+static kauth_scope_t vnode_scope;
+static int vnode_authorize_callback(kauth_cred_t credential, __unused void *idata, kauth_action_t action,
+ uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
+
+typedef struct _vnode_authorize_context {
+ vnode_t vp;
+ struct vnode_attr *vap;
+ vnode_t dvp;
+ struct vnode_attr *dvap;
+ vfs_context_t ctx;
+ int flags;
+ int flags_valid;
+#define _VAC_IS_OWNER (1<<0)
+#define _VAC_IN_GROUP (1<<1)
+#define _VAC_IS_DIR_OWNER (1<<2)
+#define _VAC_IN_DIR_GROUP (1<<3)
+} *vauth_ctx;
+
+void
+vnode_authorize_init(void)
+{
+ vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL);
+}
+
+/*
+ * Authorize an operation on a vnode.
+ *
+ * This is KPI, but here because it needs vnode_scope.
+ */
+int
+vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t context)
+{
+ int error, result;
+
+ /*
+ * We can't authorize against a dead vnode; allow all operations through so that
+ * the correct error can be returned.
+ */
+ if (vp->v_type == VBAD)
+ return(0);
+
+ error = 0;
+ result = kauth_authorize_action(vnode_scope, vfs_context_ucred(context), action,
+ (uintptr_t)context, (uintptr_t)vp, (uintptr_t)dvp, (uintptr_t)&error);
+ if (result == EPERM) /* traditional behaviour */
+ result = EACCES;
+ /* did the lower layers give a better error return? */
+ if ((result != 0) && (error != 0))
+ return(error);
+ return(result);
+}
+
+/*
+ * Test for vnode immutability.
+ *
+ * The 'append' flag is set when the authorization request is constrained
+ * to operations which only request the right to append to a file.
+ *
+ * The 'ignore' flag is set when an operation modifying the immutability flags
+ * is being authorized. We check the system securelevel to determine which
+ * immutability flags we can ignore. 
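+ *
+ * A worked example of the mask logic below (assuming IMMUTABLE and
+ * APPEND cover both their SF_ and UF_ variants, as in <sys/stat.h>):
+ * with 'ignore' set and securelevel > 0, only the system flags stay
+ * in the mask, so a node carrying just UF_IMMUTABLE is allowed through
+ * while one carrying SF_IMMUTABLE still gets EPERM; with securelevel
+ * <= 0 the mask drops to zero and nothing is inhibited.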
+ */ +static int +vnode_immutable(struct vnode_attr *vap, int append, int ignore) +{ + int mask; + + /* start with all bits precluding the operation */ + mask = IMMUTABLE | APPEND; + + /* if appending only, remove the append-only bits */ + if (append) + mask &= ~APPEND; + + /* ignore only set when authorizing flags changes */ + if (ignore) { + if (securelevel <= 0) { + /* in insecure state, flags do not inhibit changes */ + mask = 0; + } else { + /* in secure state, user flags don't inhibit */ + mask &= ~(UF_IMMUTABLE | UF_APPEND); + } + } + KAUTH_DEBUG("IMMUTABLE - file flags 0x%x mask 0x%x append = %d ignore = %d", vap->va_flags, mask, append, ignore); + if ((vap->va_flags & mask) != 0) + return(EPERM); + return(0); +} + +static int +vauth_node_owner(struct vnode_attr *vap, kauth_cred_t cred) +{ + int result; + + /* default assumption is not-owner */ + result = 0; + + /* + * If the filesystem has given us a UID, we treat this as authoritative. + */ + if (vap && VATTR_IS_SUPPORTED(vap, va_uid)) { + result = (vap->va_uid == kauth_cred_getuid(cred)) ? 1 : 0; + } + /* we could test the owner UUID here if we had a policy for it */ + + return(result); +} + +static int +vauth_node_group(struct vnode_attr *vap, kauth_cred_t cred, int *ismember) +{ + int error; + int result; + + error = 0; + result = 0; + + /* the caller is expected to have asked the filesystem for a group at some point */ + if (vap && VATTR_IS_SUPPORTED(vap, va_gid)) { + error = kauth_cred_ismember_gid(cred, vap->va_gid, &result); + } + /* we could test the group UUID here if we had a policy for it */ + + if (!error) + *ismember = result; + return(error); +} + +static int +vauth_file_owner(vauth_ctx vcp) +{ + int result; + + if (vcp->flags_valid & _VAC_IS_OWNER) { + result = (vcp->flags & _VAC_IS_OWNER) ? 1 : 0; + } else { + result = vauth_node_owner(vcp->vap, vcp->ctx->vc_ucred); + + /* cache our result */ + vcp->flags_valid |= _VAC_IS_OWNER; + if (result) { + vcp->flags |= _VAC_IS_OWNER; + } else { + vcp->flags &= ~_VAC_IS_OWNER; + } + } + return(result); +} + +static int +vauth_file_ingroup(vauth_ctx vcp, int *ismember) +{ + int error; + + if (vcp->flags_valid & _VAC_IN_GROUP) { + *ismember = (vcp->flags & _VAC_IN_GROUP) ? 1 : 0; + error = 0; + } else { + error = vauth_node_group(vcp->vap, vcp->ctx->vc_ucred, ismember); + + if (!error) { + /* cache our result */ + vcp->flags_valid |= _VAC_IN_GROUP; + if (*ismember) { + vcp->flags |= _VAC_IN_GROUP; + } else { + vcp->flags &= ~_VAC_IN_GROUP; + } + } + + } + return(error); +} + +static int +vauth_dir_owner(vauth_ctx vcp) +{ + int result; + + if (vcp->flags_valid & _VAC_IS_DIR_OWNER) { + result = (vcp->flags & _VAC_IS_DIR_OWNER) ? 1 : 0; + } else { + result = vauth_node_owner(vcp->dvap, vcp->ctx->vc_ucred); + + /* cache our result */ + vcp->flags_valid |= _VAC_IS_DIR_OWNER; + if (result) { + vcp->flags |= _VAC_IS_DIR_OWNER; + } else { + vcp->flags &= ~_VAC_IS_DIR_OWNER; + } + } + return(result); +} + +static int +vauth_dir_ingroup(vauth_ctx vcp, int *ismember) +{ + int error; + + if (vcp->flags_valid & _VAC_IN_DIR_GROUP) { + *ismember = (vcp->flags & _VAC_IN_DIR_GROUP) ? 
1 : 0; + error = 0; + } else { + error = vauth_node_group(vcp->dvap, vcp->ctx->vc_ucred, ismember); + + if (!error) { + /* cache our result */ + vcp->flags_valid |= _VAC_IN_DIR_GROUP; + if (*ismember) { + vcp->flags |= _VAC_IN_DIR_GROUP; + } else { + vcp->flags &= ~_VAC_IN_DIR_GROUP; + } + } + } + return(error); +} + +/* + * Test the posix permissions in (vap) to determine whether (credential) + * may perform (action) + */ +static int +vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir) +{ + struct vnode_attr *vap; + int needed, error, owner_ok, group_ok, world_ok, ismember; +#ifdef KAUTH_DEBUG_ENABLE + const char *where; +# define _SETWHERE(c) where = c; +#else +# define _SETWHERE(c) +#endif + + /* checking file or directory? */ + if (on_dir) { + vap = vcp->dvap; + } else { + vap = vcp->vap; + } + + error = 0; + + /* + * We want to do as little work here as possible. So first we check + * which sets of permissions grant us the access we need, and avoid checking + * whether specific permissions grant access when more generic ones would. + */ + + /* owner permissions */ + needed = 0; + if (action & VREAD) + needed |= S_IRUSR; + if (action & VWRITE) + needed |= S_IWUSR; + if (action & VEXEC) + needed |= S_IXUSR; + owner_ok = (needed & vap->va_mode) == needed; + + /* group permissions */ + needed = 0; + if (action & VREAD) + needed |= S_IRGRP; + if (action & VWRITE) + needed |= S_IWGRP; + if (action & VEXEC) + needed |= S_IXGRP; + group_ok = (needed & vap->va_mode) == needed; + + /* world permissions */ + needed = 0; + if (action & VREAD) + needed |= S_IROTH; + if (action & VWRITE) + needed |= S_IWOTH; + if (action & VEXEC) + needed |= S_IXOTH; + world_ok = (needed & vap->va_mode) == needed; + + /* If granted/denied by all three, we're done */ + if (owner_ok && group_ok && world_ok) { + _SETWHERE("all"); + goto out; + } + if (!owner_ok && !group_ok && !world_ok) { + _SETWHERE("all"); + error = EACCES; + goto out; + } + + /* Check ownership (relatively cheap) */ + if ((on_dir && vauth_dir_owner(vcp)) || + (!on_dir && vauth_file_owner(vcp))) { + _SETWHERE("user"); + if (!owner_ok) + error = EACCES; + goto out; + } + + /* Not owner; if group and world both grant it we're done */ + if (group_ok && world_ok) { + _SETWHERE("group/world"); + goto out; + } + if (!group_ok && !world_ok) { + _SETWHERE("group/world"); + error = EACCES; + goto out; + } + + /* Check group membership (most expensive) */ + ismember = 0; + if (on_dir) { + error = vauth_dir_ingroup(vcp, &ismember); + } else { + error = vauth_file_ingroup(vcp, &ismember); + } + if (error) + goto out; + if (ismember) { + _SETWHERE("group"); + if (!group_ok) + error = EACCES; + goto out; + } + + /* Not owner, not in group, use world result */ + _SETWHERE("world"); + if (!world_ok) + error = EACCES; + + /* FALLTHROUGH */ + +out: + KAUTH_DEBUG("%p %s - posix %s permissions : need %s%s%s %x have %s%s%s%s%s%s%s%s%s UID = %d file = %d,%d", + vcp->vp, (error == 0) ? "ALLOWED" : "DENIED", where, + (action & VREAD) ? "r" : "-", + (action & VWRITE) ? "w" : "-", + (action & VEXEC) ? "x" : "-", + needed, + (vap->va_mode & S_IRUSR) ? "r" : "-", + (vap->va_mode & S_IWUSR) ? "w" : "-", + (vap->va_mode & S_IXUSR) ? "x" : "-", + (vap->va_mode & S_IRGRP) ? "r" : "-", + (vap->va_mode & S_IWGRP) ? "w" : "-", + (vap->va_mode & S_IXGRP) ? "x" : "-", + (vap->va_mode & S_IROTH) ? "r" : "-", + (vap->va_mode & S_IWOTH) ? "w" : "-", + (vap->va_mode & S_IXOTH) ? "x" : "-", + kauth_cred_getuid(vcp->ctx->vc_ucred), + on_dir ? 
vcp->dvap->va_uid : vcp->vap->va_uid,
+	    on_dir ? vcp->dvap->va_gid : vcp->vap->va_gid);
+	return(error);
+}
+
+/*
+ * Authorize the deletion of the node vp from the directory dvp.
+ *
+ * We assume that:
+ * - Neither the node nor the directory are immutable.
+ * - The user is not the superuser.
+ *
+ * Deletion is not permitted if the directory is sticky and the caller is not owner of the
+ * node or directory.
+ *
+ * If either the node grants DELETE, or the directory grants DELETE_CHILD, the node may be
+ * deleted.  If neither denies the permission, and the caller has Posix write access to the
+ * directory, then the node may be deleted.
+ */
+static int
+vnode_authorize_delete(vauth_ctx vcp)
+{
+	struct vnode_attr	*vap = vcp->vap;
+	struct vnode_attr	*dvap = vcp->dvap;
+	kauth_cred_t		cred = vcp->ctx->vc_ucred;
+	struct kauth_acl_eval	eval;
+	int			error, delete_denied, delete_child_denied, ismember;
+
+	/* check the ACL on the directory */
+	delete_child_denied = 0;
+	if (VATTR_IS_NOT(dvap, va_acl, NULL)) {
+		eval.ae_requested = KAUTH_VNODE_DELETE_CHILD;
+		eval.ae_acl = &dvap->va_acl->acl_ace[0];
+		eval.ae_count = dvap->va_acl->acl_entrycount;
+		eval.ae_options = 0;
+		if (vauth_dir_owner(vcp))
+			eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
+		if ((error = vauth_dir_ingroup(vcp, &ismember)) != 0)
+			return(error);
+		if (ismember)
+			eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
+		eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
+		eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
+		eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
+		eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
+
+		error = kauth_acl_evaluate(cred, &eval);
+
+		if (error != 0) {
+			KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
+			return(error);
+		}
+		if (eval.ae_result == KAUTH_RESULT_DENY)
+			delete_child_denied = 1;
+		if (eval.ae_result == KAUTH_RESULT_ALLOW) {
+			KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp);
+			return(0);
+		}
+	}
+
+	/* check the ACL on the node */
+	delete_denied = 0;
+	if (VATTR_IS_NOT(vap, va_acl, NULL)) {
+		eval.ae_requested = KAUTH_VNODE_DELETE;
+		eval.ae_acl = &vap->va_acl->acl_ace[0];
+		eval.ae_count = vap->va_acl->acl_entrycount;
+		eval.ae_options = 0;
+		if (vauth_file_owner(vcp))
+			eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
+		if ((error = vauth_file_ingroup(vcp, &ismember)) != 0)
+			return(error);
+		if (ismember)
+			eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
+		eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
+		eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
+		eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
+		eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
+
+		if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
+			KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
+			return(error);
+		}
+		if (eval.ae_result == KAUTH_RESULT_DENY)
+			delete_denied = 1;
+		if (eval.ae_result == KAUTH_RESULT_ALLOW) {
+			KAUTH_DEBUG("%p ALLOWED - granted by file ACL", vcp->vp);
+			return(0);
+		}
+	}
+
+	/* if denied by ACL on directory or node, return denial */
+	if (delete_denied || delete_child_denied) {
+		KAUTH_DEBUG("%p DENIED - denied by ACL", vcp->vp);
+		return(EACCES);
+	}
+
+	/* enforce sticky bit behaviour */
+	if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) {
+		KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)",
+		    vcp->vp, cred->cr_uid, vap->va_uid, dvap->va_uid);
+		return(EACCES);
+	}
+
+	/* check the directory */
+	if ((error = vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */)) != 0) {
+		KAUTH_DEBUG("%p DENIED - denied by posix permissions", vcp->vp);
+		return(error);
+	}
+
+	/* not denied, must be OK */
+	return(0);
+}
+
+
+/*
+ * Authorize an operation based on the node's attributes.
+ */
+static int
+vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_rights_t preauth_rights)
+{
+	struct vnode_attr	*vap = vcp->vap;
+	kauth_cred_t		cred = vcp->ctx->vc_ucred;
+	struct kauth_acl_eval	eval;
+	int			error, ismember;
+	mode_t			posix_action;
+
+	/*
+	 * If we are the file owner, we automatically have some rights.
+	 *
+	 * Do we need to expand this to support group ownership?
+	 */
+	if (vauth_file_owner(vcp))
+		acl_rights &= ~(KAUTH_VNODE_WRITE_SECURITY);
+
+	/*
+	 * If we are checking both TAKE_OWNERSHIP and WRITE_SECURITY, we can
+	 * mask the latter.  If TAKE_OWNERSHIP is requested the caller is about to
+	 * change ownership to themselves, and WRITE_SECURITY is implicitly
+	 * granted to the owner.  We need to do this because at this point
+	 * WRITE_SECURITY may not be granted as the caller is not currently
+	 * the owner.
+	 */
+	if ((acl_rights & KAUTH_VNODE_TAKE_OWNERSHIP) &&
+	    (acl_rights & KAUTH_VNODE_WRITE_SECURITY))
+		acl_rights &= ~KAUTH_VNODE_WRITE_SECURITY;
+
+	if (acl_rights == 0) {
+		KAUTH_DEBUG("%p ALLOWED - implicit or no rights required", vcp->vp);
+		return(0);
+	}
+
+	/* if we have an ACL, evaluate it */
+	if (VATTR_IS_NOT(vap, va_acl, NULL)) {
+		eval.ae_requested = acl_rights;
+		eval.ae_acl = &vap->va_acl->acl_ace[0];
+		eval.ae_count = vap->va_acl->acl_entrycount;
+		eval.ae_options = 0;
+		if (vauth_file_owner(vcp))
+			eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
+		if ((error = vauth_file_ingroup(vcp, &ismember)) != 0)
+			return(error);
+		if (ismember)
+			eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
+		eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
+		eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
+		eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
+		eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
+
+		if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
+			KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
+			return(error);
+		}
+
+		if (eval.ae_result == KAUTH_RESULT_DENY) {
+			KAUTH_DEBUG("%p DENIED - by ACL", vcp->vp);
+			return(EACCES);		/* deny, deny, counter-allege */
+		}
+		if (eval.ae_result == KAUTH_RESULT_ALLOW) {
+			KAUTH_DEBUG("%p ALLOWED - all rights granted by ACL", vcp->vp);
+			return(0);
+		}
+		/* fall through and evaluate residual rights */
+	} else {
+		/* no ACL, everything is residual */
+		eval.ae_residual = acl_rights;
+	}
+
+	/*
+	 * Grant residual rights that have been pre-authorized.
+	 */
+	eval.ae_residual &= ~preauth_rights;
+
+	/*
+	 * We grant WRITE_ATTRIBUTES to the owner if it hasn't been denied.
+	 */
+	if (vauth_file_owner(vcp))
+		eval.ae_residual &= ~KAUTH_VNODE_WRITE_ATTRIBUTES;
+
+	if (eval.ae_residual == 0) {
+		KAUTH_DEBUG("%p ALLOWED - rights already authorized", vcp->vp);
+		return(0);
+	}
+
+	/*
+	 * Bail if we have residual rights that can't be granted by posix permissions,
+	 * or aren't presumed granted at this point.
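+	 *
+	 * For example (an editorial illustration, not from the original sources):
+	 * a plain write-open of an ACL-less file arrives here with
+	 * KAUTH_VNODE_WRITE_DATA as the only residual right, which the mapping
+	 * below folds into a single posix check:
+	 *
+	 *	posix_action |= VWRITE;
+	 *	vnode_authorize_posix(vcp, VWRITE, 0);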
+ * + * XXX these can be collapsed for performance + */ + if (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) { + KAUTH_DEBUG("%p DENIED - CHANGE_OWNER not permitted", vcp->vp); + return(EACCES); + } + if (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) { + KAUTH_DEBUG("%p DENIED - WRITE_SECURITY not permitted", vcp->vp); + return(EACCES); + } + +#if DIAGNOSTIC + if (eval.ae_residual & KAUTH_VNODE_DELETE) + panic("vnode_authorize: can't be checking delete permission here"); +#endif + + /* + * Compute the fallback posix permissions that will satisfy the remaining + * rights. + */ + posix_action = 0; + if (eval.ae_residual & (KAUTH_VNODE_READ_DATA | + KAUTH_VNODE_LIST_DIRECTORY | + KAUTH_VNODE_READ_EXTATTRIBUTES)) + posix_action |= VREAD; + if (eval.ae_residual & (KAUTH_VNODE_WRITE_DATA | + KAUTH_VNODE_ADD_FILE | + KAUTH_VNODE_ADD_SUBDIRECTORY | + KAUTH_VNODE_DELETE_CHILD | + KAUTH_VNODE_WRITE_ATTRIBUTES | + KAUTH_VNODE_WRITE_EXTATTRIBUTES)) + posix_action |= VWRITE; + if (eval.ae_residual & (KAUTH_VNODE_EXECUTE | + KAUTH_VNODE_SEARCH)) + posix_action |= VEXEC; + + if (posix_action != 0) { + return(vnode_authorize_posix(vcp, posix_action, 0 /* !on_dir */)); + } else { + KAUTH_DEBUG("%p ALLOWED - residual rights %s%s%s%s%s%s%s%s%s%s%s%s%s%s granted due to no posix mapping", + vcp->vp, + (eval.ae_residual & KAUTH_VNODE_READ_DATA) + ? vnode_isdir(vcp->vp) ? " LIST_DIRECTORY" : " READ_DATA" : "", + (eval.ae_residual & KAUTH_VNODE_WRITE_DATA) + ? vnode_isdir(vcp->vp) ? " ADD_FILE" : " WRITE_DATA" : "", + (eval.ae_residual & KAUTH_VNODE_EXECUTE) + ? vnode_isdir(vcp->vp) ? " SEARCH" : " EXECUTE" : "", + (eval.ae_residual & KAUTH_VNODE_DELETE) + ? " DELETE" : "", + (eval.ae_residual & KAUTH_VNODE_APPEND_DATA) + ? vnode_isdir(vcp->vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "", + (eval.ae_residual & KAUTH_VNODE_DELETE_CHILD) + ? " DELETE_CHILD" : "", + (eval.ae_residual & KAUTH_VNODE_READ_ATTRIBUTES) + ? " READ_ATTRIBUTES" : "", + (eval.ae_residual & KAUTH_VNODE_WRITE_ATTRIBUTES) + ? " WRITE_ATTRIBUTES" : "", + (eval.ae_residual & KAUTH_VNODE_READ_EXTATTRIBUTES) + ? " READ_EXTATTRIBUTES" : "", + (eval.ae_residual & KAUTH_VNODE_WRITE_EXTATTRIBUTES) + ? " WRITE_EXTATTRIBUTES" : "", + (eval.ae_residual & KAUTH_VNODE_READ_SECURITY) + ? " READ_SECURITY" : "", + (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) + ? " WRITE_SECURITY" : "", + (eval.ae_residual & KAUTH_VNODE_CHECKIMMUTABLE) + ? " CHECKIMMUTABLE" : "", + (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) + ? " CHANGE_OWNER" : ""); + } + + /* + * Lack of required Posix permissions implies no reason to deny access. + */ + return(0); +} + +/* + * Check for file immutability. + */ +static int +vnode_authorize_checkimmutable(vnode_t vp, struct vnode_attr *vap, int rights, int ignore) +{ + mount_t mp; + int error; + int append; + + /* + * Perform immutability checks for operations that change data. + * + * Sockets, fifos and devices require special handling. + */ + switch(vp->v_type) { + case VSOCK: + case VFIFO: + case VBLK: + case VCHR: + /* + * Writing to these nodes does not change the filesystem data, + * so forget that it's being tried. 
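+		 * (For instance, a write to a block device node such as a disk
+		 * entry under /dev modifies the device it names, not the volume
+		 * holding the node itself, so immutability of the node should
+		 * not veto the write.)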
+		 */
+		rights &= ~KAUTH_VNODE_WRITE_DATA;
+		break;
+	default:
+		break;
+	}
+
+	error = 0;
+	if (rights & KAUTH_VNODE_WRITE_RIGHTS) {
+
+		/* check per-filesystem options if possible */
+		mp = vnode_mount(vp);
+		if (mp != NULL) {
+
+			/* check for no-EA filesystems */
+			if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) &&
+			    (vfs_flags(mp) & MNT_NOUSERXATTR)) {
+				KAUTH_DEBUG("%p DENIED - filesystem disallowed extended attributes", vp);
+				error = EACCES;		/* User attributes disabled */
+				goto out;
+			}
+		}
+
+		/* check for file immutability */
+		append = 0;
+		if (vp->v_type == VDIR) {
+			if ((rights & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY)) == rights)
+				append = 1;
+		} else {
+			if ((rights & KAUTH_VNODE_APPEND_DATA) == rights)
+				append = 1;
+		}
+		if ((error = vnode_immutable(vap, append, ignore)) != 0) {
+			KAUTH_DEBUG("%p DENIED - file is immutable", vp);
+			goto out;
+		}
+	}
+out:
+	return(error);
+}
+
+/*
+ * Handle authorization actions for filesystems that advertise that the server will
+ * be enforcing.
+ */
+static int
+vnode_authorize_opaque(vnode_t vp, int *resultp, kauth_action_t action, vfs_context_t ctx)
+{
+	int	error;
+
+	/*
+	 * If the vp is a device node, socket or FIFO it actually represents a local
+	 * endpoint, so we need to handle it locally.
+	 */
+	switch(vp->v_type) {
+	case VBLK:
+	case VCHR:
+	case VSOCK:
+	case VFIFO:
+		return(0);
+	default:
+		break;
+	}
+
+	/*
+	 * In the advisory request case, if the filesystem doesn't think it's reliable
+	 * we will attempt to formulate a result ourselves based on VNOP_GETATTR data.
+	 */
+	if ((action & KAUTH_VNODE_ACCESS) && !vfs_authopaqueaccess(vnode_mount(vp)))
+		return(0);
+
+	/*
+	 * Let the filesystem have a say in the matter.  It's OK for it to not implement
+	 * VNOP_ACCESS, as most will authorise inline with the actual request.
+	 */
+	if ((error = VNOP_ACCESS(vp, action, ctx)) != ENOTSUP) {
+		*resultp = error;
+		KAUTH_DEBUG("%p DENIED - opaque filesystem VNOP_ACCESS denied access", vp);
+		return(1);
+	}
+
+	/*
+	 * Typically opaque filesystems do authorisation in-line, but exec is a special case.  In
+	 * order to be reasonably sure that exec will be permitted, we try a bit harder here.
+	 */
+	if ((action & KAUTH_VNODE_EXECUTE) && vnode_isreg(vp)) {
+		/* try a VNOP_OPEN for readonly access */
+		if ((error = VNOP_OPEN(vp, FREAD, ctx)) != 0) {
+			*resultp = error;
+			KAUTH_DEBUG("%p DENIED - EXECUTE denied because file could not be opened readonly", vp);
+			return(1);
+		}
+		VNOP_CLOSE(vp, FREAD, ctx);
+	}
+
+	/*
+	 * We don't have any reason to believe that the request has to be denied at this point,
+	 * so go ahead and allow it.
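+	 * (On an opaque filesystem the remote end is the final authority in any
+	 * case; a network server will refuse the actual operation itself if the
+	 * advisory answer computed here turns out to be too permissive.)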
+ */ + *resultp = 0; + KAUTH_DEBUG("%p ALLOWED - bypassing access check for non-local filesystem", vp); + return(1); +} + +static int +vnode_authorize_callback(__unused kauth_cred_t unused_cred, __unused void *idata, kauth_action_t action, + uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3) +{ + struct _vnode_authorize_context auth_context; + vauth_ctx vcp; + vfs_context_t ctx; + vnode_t vp, dvp; + kauth_cred_t cred; + kauth_ace_rights_t rights; + struct vnode_attr va, dva; + int result; + int *errorp; + int noimmutable; + + vcp = &auth_context; + ctx = vcp->ctx = (vfs_context_t)arg0; + vp = vcp->vp = (vnode_t)arg1; + dvp = vcp->dvp = (vnode_t)arg2; + errorp = (int *)arg3; + /* note that we authorize against the context, not the passed cred (the same thing anyway) */ + cred = ctx->vc_ucred; + + VATTR_INIT(&va); + vcp->vap = &va; + VATTR_INIT(&dva); + vcp->dvap = &dva; + + vcp->flags = vcp->flags_valid = 0; + +#if DIAGNOSTIC + if ((ctx == NULL) || (vp == NULL) || (cred == NULL)) + panic("vnode_authorize: bad arguments (context %p vp %p cred %p)", ctx, vp, cred); +#endif + + KAUTH_DEBUG("%p AUTH - %s %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s on %s '%s' (0x%x:%p/%p)", + vp, vfs_context_proc(ctx)->p_comm, + (action & KAUTH_VNODE_ACCESS) ? "access" : "auth", + (action & KAUTH_VNODE_READ_DATA) ? vnode_isdir(vp) ? " LIST_DIRECTORY" : " READ_DATA" : "", + (action & KAUTH_VNODE_WRITE_DATA) ? vnode_isdir(vp) ? " ADD_FILE" : " WRITE_DATA" : "", + (action & KAUTH_VNODE_EXECUTE) ? vnode_isdir(vp) ? " SEARCH" : " EXECUTE" : "", + (action & KAUTH_VNODE_DELETE) ? " DELETE" : "", + (action & KAUTH_VNODE_APPEND_DATA) ? vnode_isdir(vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "", + (action & KAUTH_VNODE_DELETE_CHILD) ? " DELETE_CHILD" : "", + (action & KAUTH_VNODE_READ_ATTRIBUTES) ? " READ_ATTRIBUTES" : "", + (action & KAUTH_VNODE_WRITE_ATTRIBUTES) ? " WRITE_ATTRIBUTES" : "", + (action & KAUTH_VNODE_READ_EXTATTRIBUTES) ? " READ_EXTATTRIBUTES" : "", + (action & KAUTH_VNODE_WRITE_EXTATTRIBUTES) ? " WRITE_EXTATTRIBUTES" : "", + (action & KAUTH_VNODE_READ_SECURITY) ? " READ_SECURITY" : "", + (action & KAUTH_VNODE_WRITE_SECURITY) ? " WRITE_SECURITY" : "", + (action & KAUTH_VNODE_CHANGE_OWNER) ? " CHANGE_OWNER" : "", + (action & KAUTH_VNODE_NOIMMUTABLE) ? " (noimmutable)" : "", + vnode_isdir(vp) ? "directory" : "file", + vp->v_name ? vp->v_name : "<NULL>", action, vp, dvp); + + /* + * Extract the control bits from the action, everything else is + * requested rights. + */ + noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0; + rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE); + + if (rights & KAUTH_VNODE_DELETE) { +#if DIAGNOSTIC + if (dvp == NULL) + panic("vnode_authorize: KAUTH_VNODE_DELETE test requires a directory"); +#endif + } else { + dvp = NULL; + } + + /* + * Check for read-only filesystems. + */ + if ((rights & KAUTH_VNODE_WRITE_RIGHTS) && + (vp->v_mount->mnt_flag & MNT_RDONLY) && + ((vp->v_type == VREG) || (vp->v_type == VDIR) || + (vp->v_type == VLNK) || (vp->v_type == VCPLX) || + (rights & KAUTH_VNODE_DELETE) || (rights & KAUTH_VNODE_DELETE_CHILD))) { + result = EROFS; + goto out; + } + + /* + * Check for noexec filesystems. + */ + if ((rights & KAUTH_VNODE_EXECUTE) && vnode_isreg(vp) && (vp->v_mount->mnt_flag & MNT_NOEXEC)) { + result = EACCES; + goto out; + } + + /* + * Handle cases related to filesystems with non-local enforcement. + * This call can return 0, in which case we will fall through to perform a + * check based on VNOP_GETATTR data. 
Otherwise it returns 1 and sets + * an appropriate result, at which point we can return immediately. + */ + if (vfs_authopaque(vp->v_mount) && vnode_authorize_opaque(vp, &result, action, ctx)) + goto out; + + /* + * Get vnode attributes and extended security information for the vnode + * and directory if required. + */ + VATTR_WANTED(&va, va_mode); + VATTR_WANTED(&va, va_uid); + VATTR_WANTED(&va, va_gid); + VATTR_WANTED(&va, va_flags); + VATTR_WANTED(&va, va_acl); + if ((result = vnode_getattr(vp, &va, ctx)) != 0) { + KAUTH_DEBUG("%p ERROR - failed to get vnode attributes - %d", vp, result); + goto out; + } + if (dvp) { + VATTR_WANTED(&dva, va_mode); + VATTR_WANTED(&dva, va_uid); + VATTR_WANTED(&dva, va_gid); + VATTR_WANTED(&dva, va_flags); + VATTR_WANTED(&dva, va_acl); + if ((result = vnode_getattr(dvp, &dva, ctx)) != 0) { + KAUTH_DEBUG("%p ERROR - failed to get directory vnode attributes - %d", vp, result); + goto out; + } + } + + /* + * If the vnode is an extended attribute data vnode (eg. a resource fork), *_DATA becomes + * *_EXTATTRIBUTES. + */ + if (S_ISXATTR(va.va_mode)) { + if (rights & KAUTH_VNODE_READ_DATA) { + rights &= ~KAUTH_VNODE_READ_DATA; + rights |= KAUTH_VNODE_READ_EXTATTRIBUTES; + } + if (rights & KAUTH_VNODE_WRITE_DATA) { + rights &= ~KAUTH_VNODE_WRITE_DATA; + rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; + } + } + + /* + * Check for immutability. + * + * In the deletion case, parent directory immutability vetoes specific + * file rights. + */ + if ((result = vnode_authorize_checkimmutable(vp, &va, rights, noimmutable)) != 0) + goto out; + if ((rights & KAUTH_VNODE_DELETE) && + ((result = vnode_authorize_checkimmutable(dvp, &dva, KAUTH_VNODE_DELETE_CHILD, 0)) != 0)) + goto out; + + /* + * Clear rights that have been authorized by reaching this point, bail if nothing left to + * check. + */ + rights &= ~(KAUTH_VNODE_LINKTARGET | KAUTH_VNODE_CHECKIMMUTABLE); + if (rights == 0) + goto out; + + /* + * If we're not the superuser, authorize based on file properties. + */ + if (!vfs_context_issuser(ctx)) { + /* process delete rights */ + if ((rights & KAUTH_VNODE_DELETE) && + ((result = vnode_authorize_delete(vcp)) != 0)) + goto out; + + /* process remaining rights */ + if ((rights & ~KAUTH_VNODE_DELETE) && + ((result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE)) != 0)) + goto out; + } else { + + /* + * Execute is only granted to root if one of the x bits is set. This check only + * makes sense if the posix mode bits are actually supported. + */ + if ((rights & KAUTH_VNODE_EXECUTE) && + (vp->v_type == VREG) && + VATTR_IS_SUPPORTED(&va, va_mode) && + !(va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) { + result = EPERM; + KAUTH_DEBUG("%p DENIED - root execute requires at least one x bit in 0x%x", vp, va.va_mode); + goto out; + } + + KAUTH_DEBUG("%p ALLOWED - caller is superuser", vp); + } + +out: + if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) + kauth_acl_free(va.va_acl); + if (VATTR_IS_SUPPORTED(&dva, va_acl) && (dva.va_acl != NULL)) + kauth_acl_free(dva.va_acl); + if (result) { + *errorp = result; + KAUTH_DEBUG("%p DENIED - auth denied", vp); + return(KAUTH_RESULT_DENY); + } + + /* + * Note that this implies that we will allow requests for no rights, as well as + * for rights that we do not recognise. There should be none of these. + */ + KAUTH_DEBUG("%p ALLOWED - auth granted", vp); + return(KAUTH_RESULT_ALLOW); +} + +/* + * Check that the attribute information in vattr can be legally applied to + * a new file by the context. 
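+ *
+ * A minimal caller sketch (editorial, mirroring what vn_create() above does;
+ * names and values are illustrative only):
+ *
+ *	VATTR_INIT(&va);
+ *	VATTR_SET(&va, va_type, VREG);
+ *	VATTR_SET(&va, va_mode, 0644);
+ *	if ((error = vnode_authattr_new(dvp, &va, 0, ctx)) == 0)
+ *		error = VNOP_CREATE(dvp, &vp, &cnp, &va, ctx);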
+ */
+int
+vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx)
+{
+	int		error;
+	int		is_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode;
+	kauth_cred_t	cred;
+	guid_t		changer;
+	mount_t		dmp;
+
+	error = 0;
+	defaulted_owner = defaulted_group = defaulted_mode = 0;
+
+	/*
+	 * Require that the filesystem support extended security to apply any of it.
+	 */
+	if (!vfs_extendedsecurity(dvp->v_mount) &&
+	    (VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid))) {
+		error = EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Default some fields.
+	 */
+	dmp = dvp->v_mount;
+
+	/*
+	 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit owner is set, that
+	 * owner takes ownership of all new files.
+	 */
+	if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsowner != KAUTH_UID_NONE)) {
+		VATTR_SET(vap, va_uid, dmp->mnt_fsowner);
+		defaulted_owner = 1;
+	} else {
+		if (!VATTR_IS_ACTIVE(vap, va_uid)) {
+			/* default owner is current user */
+			VATTR_SET(vap, va_uid, kauth_cred_getuid(vfs_context_ucred(ctx)));
+			defaulted_owner = 1;
+		}
+	}
+
+	/*
+	 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit group is set, that
+	 * group takes ownership of all new files.
+	 */
+	if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsgroup != KAUTH_GID_NONE)) {
+		VATTR_SET(vap, va_gid, dmp->mnt_fsgroup);
+		defaulted_group = 1;
+	} else {
+		if (!VATTR_IS_ACTIVE(vap, va_gid)) {
+			/* default group comes from parent object, fallback to current user */
+			struct vnode_attr dva;
+			VATTR_INIT(&dva);
+			VATTR_WANTED(&dva, va_gid);
+			if ((error = vnode_getattr(dvp, &dva, ctx)) != 0)
+				goto out;
+			if (VATTR_IS_SUPPORTED(&dva, va_gid)) {
+				VATTR_SET(vap, va_gid, dva.va_gid);
+			} else {
+				VATTR_SET(vap, va_gid, kauth_cred_getgid(vfs_context_ucred(ctx)));
+			}
+			defaulted_group = 1;
+		}
+	}
+
+	if (!VATTR_IS_ACTIVE(vap, va_flags))
+		VATTR_SET(vap, va_flags, 0);
+
+	/* default mode is everything, masked with current umask */
+	if (!VATTR_IS_ACTIVE(vap, va_mode)) {
+		VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd->fd_cmask);
+		KAUTH_DEBUG("ATTR - defaulting new file mode to %o from umask %o", vap->va_mode, vfs_context_proc(ctx)->p_fd->fd_cmask);
+		defaulted_mode = 1;
+	}
+	/* set timestamps to now */
+	if (!VATTR_IS_ACTIVE(vap, va_create_time)) {
+		nanotime(&vap->va_create_time);
+		VATTR_SET_ACTIVE(vap, va_create_time);
+	}
+
+	/*
+	 * Check for attempts to set nonsensical fields.
+	 */
+	if (vap->va_active & ~VNODE_ATTR_NEWOBJ) {
+		error = EINVAL;
+		KAUTH_DEBUG("ATTR - ERROR - attempt to set unsupported new-file attributes %llx",
+		    vap->va_active & ~VNODE_ATTR_NEWOBJ);
+		goto out;
+	}
+
+	/*
+	 * Quickly check for the applicability of any enforcement here.
+	 * Tests below maintain the integrity of the local security model.
+	 */
+	if (vfs_authopaque(vnode_mount(dvp)))
+		goto out;
+
+	/*
+	 * We need to know if the caller is the superuser, or if the work is
+	 * otherwise already authorised.
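+	 * (For instance, vn_create() above passes its VN_CREATE_NOAUTH flag
+	 * through as 'noauth' when the kernel itself is creating the node,
+	 * which short-circuits the ownership and membership checks below.)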
+ */ + cred = vfs_context_ucred(ctx); + if (noauth) { + /* doing work for the kernel */ + is_suser = 1; + } else { + is_suser = vfs_context_issuser(ctx); + } + + + if (VATTR_IS_ACTIVE(vap, va_flags)) { + if (is_suser) { + if ((vap->va_flags & (UF_SETTABLE | SF_SETTABLE)) != vap->va_flags) { + error = EPERM; + KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)"); + goto out; + } + } else { + if ((vap->va_flags & UF_SETTABLE) != vap->va_flags) { + error = EPERM; + KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)"); + goto out; + } + } + } + + /* if not superuser, validate legality of new-item attributes */ + if (!is_suser) { + if (!defaulted_mode && VATTR_IS_ACTIVE(vap, va_mode)) { + /* setgid? */ + if (vap->va_mode & S_ISGID) { + if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) { + KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid); + goto out; + } + if (!ismember) { + KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", vap->va_gid); + error = EPERM; + goto out; + } + } + + /* setuid? */ + if ((vap->va_mode & S_ISUID) && (vap->va_uid != kauth_cred_getuid(cred))) { + KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit"); + error = EPERM; + goto out; + } + } + if (!defaulted_owner && (vap->va_uid != kauth_cred_getuid(cred))) { + KAUTH_DEBUG(" DENIED - cannot create new item owned by %d", vap->va_uid); + error = EPERM; + goto out; + } + if (!defaulted_group) { + if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) { + KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid); + goto out; + } + if (!ismember) { + KAUTH_DEBUG(" DENIED - cannot create new item with group %d - not a member", vap->va_gid); + error = EPERM; + goto out; + } + } + + /* initialising owner/group UUID */ + if (VATTR_IS_ACTIVE(vap, va_uuuid)) { + if ((error = kauth_cred_getguid(cred, &changer)) != 0) { + KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error); + /* XXX ENOENT here - no GUID - should perhaps become EPERM */ + goto out; + } + if (!kauth_guid_equal(&vap->va_uuuid, &changer)) { + KAUTH_DEBUG(" ERROR - cannot create item with supplied owner UUID - not us"); + error = EPERM; + goto out; + } + } + if (VATTR_IS_ACTIVE(vap, va_guuid)) { + if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) { + KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error); + goto out; + } + if (!ismember) { + KAUTH_DEBUG(" ERROR - cannot create item with supplied group UUID - not a member"); + error = EPERM; + goto out; + } + } + } +out: + return(error); +} + +/* + * Check that the attribute information in vap can be legally written by the context. + * + * Call this when you're not sure about the vnode_attr; either its contents have come + * from an unknown source, or when they are variable. + * + * Returns errno, or zero and sets *actionp to the KAUTH_VNODE_* actions that + * must be authorized to be permitted to write the vattr. + */ +int +vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_context_t ctx) +{ + struct vnode_attr ova; + kauth_action_t required_action; + int error, is_suser, ismember, chowner, chgroup; + guid_t changer; + gid_t group; + uid_t owner; + mode_t newmode; + kauth_cred_t cred; + uint32_t fdelta; + + VATTR_INIT(&ova); + required_action = 0; + error = 0; + + /* + * Quickly check for enforcement applicability. 
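+	 * (On an opaque filesystem the server enforces security, so none of
+	 * the local checks below apply; we return with an empty action set.)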
+ */ + if (vfs_authopaque(vnode_mount(vp))) + goto out; + + /* + * Check for attempts to set nonsensical fields. + */ + if (vap->va_active & VNODE_ATTR_RDONLY) { + KAUTH_DEBUG("ATTR - ERROR: attempt to set readonly attribute(s)"); + error = EINVAL; + goto out; + } + + /* + * We need to know if the caller is the superuser. + */ + cred = vfs_context_ucred(ctx); + is_suser = kauth_cred_issuser(cred); + + /* + * If any of the following are changing, we need information from the old file: + * va_uid + * va_gid + * va_mode + * va_uuuid + * va_guuid + */ + if (VATTR_IS_ACTIVE(vap, va_uid) || + VATTR_IS_ACTIVE(vap, va_gid) || + VATTR_IS_ACTIVE(vap, va_mode) || + VATTR_IS_ACTIVE(vap, va_uuuid) || + VATTR_IS_ACTIVE(vap, va_guuid)) { + VATTR_WANTED(&ova, va_mode); + VATTR_WANTED(&ova, va_uid); + VATTR_WANTED(&ova, va_gid); + VATTR_WANTED(&ova, va_uuuid); + VATTR_WANTED(&ova, va_guuid); + KAUTH_DEBUG("ATTR - security information changing, fetching existing attributes"); + } + + /* + * If timestamps are being changed, we need to know who the file is owned + * by. + */ + if (VATTR_IS_ACTIVE(vap, va_create_time) || + VATTR_IS_ACTIVE(vap, va_change_time) || + VATTR_IS_ACTIVE(vap, va_modify_time) || + VATTR_IS_ACTIVE(vap, va_access_time) || + VATTR_IS_ACTIVE(vap, va_backup_time)) { + + VATTR_WANTED(&ova, va_uid); +#if 0 /* enable this when we support UUIDs as official owners */ + VATTR_WANTED(&ova, va_uuuid); +#endif + KAUTH_DEBUG("ATTR - timestamps changing, fetching uid and GUID"); + } + + /* + * If flags are being changed, we need the old flags. + */ + if (VATTR_IS_ACTIVE(vap, va_flags)) { + KAUTH_DEBUG("ATTR - flags changing, fetching old flags"); + VATTR_WANTED(&ova, va_flags); + } + + /* + * If the size is being set, make sure it's not a directory. + */ + if (VATTR_IS_ACTIVE(vap, va_data_size)) { + /* size is meaningless on a directory, don't permit this */ + if (vnode_isdir(vp)) { + KAUTH_DEBUG("ATTR - ERROR: size change requested on a directory"); + error = EISDIR; + goto out; + } + } + + /* + * Get old data. + */ + KAUTH_DEBUG("ATTR - fetching old attributes %016llx", ova.va_active); + if ((error = vnode_getattr(vp, &ova, ctx)) != 0) { + KAUTH_DEBUG(" ERROR - got %d trying to get attributes", error); + goto out; + } + + /* + * Size changes require write access to the file data. + */ + if (VATTR_IS_ACTIVE(vap, va_data_size)) { + /* if we can't get the size, or it's different, we need write access */ + KAUTH_DEBUG("ATTR - size change, requiring WRITE_DATA"); + required_action |= KAUTH_VNODE_WRITE_DATA; + } + + /* + * Changing timestamps? + * + * Note that we are only called to authorize user-requested time changes; + * side-effect time changes are not authorized. Authorisation is only + * required for existing files. + * + * Non-owners are not permitted to change the time on an existing + * file to anything other than the current time. + */ + if (VATTR_IS_ACTIVE(vap, va_create_time) || + VATTR_IS_ACTIVE(vap, va_change_time) || + VATTR_IS_ACTIVE(vap, va_modify_time) || + VATTR_IS_ACTIVE(vap, va_access_time) || + VATTR_IS_ACTIVE(vap, va_backup_time)) { + /* + * The owner and root may set any timestamps they like, + * provided that the file is not immutable. The owner still needs + * WRITE_ATTRIBUTES (implied by ownership but still deniable). + */ + if (is_suser || vauth_node_owner(&ova, cred)) { + KAUTH_DEBUG("ATTR - root or owner changing timestamps"); + required_action |= KAUTH_VNODE_CHECKIMMUTABLE | KAUTH_VNODE_WRITE_ATTRIBUTES; + } else { + /* just setting the current time? 
*/ + if (vap->va_vaflags & VA_UTIMES_NULL) { + KAUTH_DEBUG("ATTR - non-root/owner changing timestamps, requiring WRITE_ATTRIBUTES"); + required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES; + } else { + KAUTH_DEBUG("ATTR - ERROR: illegal timestamp modification attempted"); + error = EACCES; + goto out; + } + } + } + + /* + * Changing file mode? + */ + if (VATTR_IS_ACTIVE(vap, va_mode) && VATTR_IS_SUPPORTED(&ova, va_mode) && (ova.va_mode != vap->va_mode)) { + KAUTH_DEBUG("ATTR - mode change from %06o to %06o", ova.va_mode, vap->va_mode); + + /* + * Mode changes always have the same basic auth requirements. + */ + if (is_suser) { + KAUTH_DEBUG("ATTR - superuser mode change, requiring immutability check"); + required_action |= KAUTH_VNODE_CHECKIMMUTABLE; + } else { + /* need WRITE_SECURITY */ + KAUTH_DEBUG("ATTR - non-superuser mode change, requiring WRITE_SECURITY"); + required_action |= KAUTH_VNODE_WRITE_SECURITY; + } + + /* + * Can't set the setgid bit if you're not in the group and not root. Have to have + * existing group information in the case we're not setting it right now. + */ + if (vap->va_mode & S_ISGID) { + required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */ + if (!is_suser) { + if (VATTR_IS_ACTIVE(vap, va_gid)) { + group = vap->va_gid; + } else if (VATTR_IS_SUPPORTED(&ova, va_gid)) { + group = ova.va_gid; + } else { + KAUTH_DEBUG("ATTR - ERROR: setgid but no gid available"); + error = EINVAL; + goto out; + } + /* + * This might be too restrictive; WRITE_SECURITY might be implied by + * membership in this case, rather than being an additional requirement. + */ + if ((error = kauth_cred_ismember_gid(cred, group, &ismember)) != 0) { + KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid); + goto out; + } + if (!ismember) { + KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", group); + error = EPERM; + goto out; + } + } + } + + /* + * Can't set the setuid bit unless you're root or the file's owner. + */ + if (vap->va_mode & S_ISUID) { + required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */ + if (!is_suser) { + if (VATTR_IS_ACTIVE(vap, va_uid)) { + owner = vap->va_uid; + } else if (VATTR_IS_SUPPORTED(&ova, va_uid)) { + owner = ova.va_uid; + } else { + KAUTH_DEBUG("ATTR - ERROR: setuid but no uid available"); + error = EINVAL; + goto out; + } + if (owner != kauth_cred_getuid(cred)) { + /* + * We could allow this if WRITE_SECURITY is permitted, perhaps. + */ + KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit"); + error = EPERM; + goto out; + } + } + } + } + + /* + * Validate/mask flags changes. This checks that only the flags in + * the UF_SETTABLE mask are being set, and preserves the flags in + * the SF_SETTABLE case. + * + * Since flags changes may be made in conjunction with other changes, + * we will ask the auth code to ignore immutability in the case that + * the SF_* flags are not set and we are only manipulating the file flags. + * + */ + if (VATTR_IS_ACTIVE(vap, va_flags)) { + /* compute changing flags bits */ + if (VATTR_IS_SUPPORTED(&ova, va_flags)) { + fdelta = vap->va_flags ^ ova.va_flags; + } else { + fdelta = vap->va_flags; + } + + if (fdelta != 0) { + KAUTH_DEBUG("ATTR - flags changing, requiring WRITE_SECURITY"); + required_action |= KAUTH_VNODE_WRITE_SECURITY; + + /* check that changing bits are legal */ + if (is_suser) { + /* + * The immutability check will prevent us from clearing the SF_* + * flags unless the system securelevel permits it, so just check + * for legal flags here. 
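+				 * (Example: root clearing SF_IMMUTABLE arrives with
+				 * fdelta == SF_IMMUTABLE; that is legal here, but the
+				 * securelevel-gated immutability check made at
+				 * authorization time can still refuse it.)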
+				 */
+				if (fdelta & ~(UF_SETTABLE | SF_SETTABLE)) {
+					error = EPERM;
+					KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)");
+					goto out;
+				}
+			} else {
+				if (fdelta & ~UF_SETTABLE) {
+					error = EPERM;
+					KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)");
+					goto out;
+				}
+			}
+			/*
+			 * If the caller has the ability to manipulate file flags,
+			 * security is not reduced by ignoring them for this operation.
+			 *
+			 * A more complete test here would consider the 'after' states of the flags
+			 * to determine whether it would permit the operation, but this becomes
+			 * very complex.
+			 *
+			 * Ignoring immutability is conditional on securelevel; this does not bypass
+			 * the SF_* flags if securelevel > 0.
+			 */
+			required_action |= KAUTH_VNODE_NOIMMUTABLE;
+		}
+	}
+
+	/*
+	 * Validate ownership information.
+	 */
+	chowner = 0;
+	chgroup = 0;
+
+	/*
+	 * uid changing
+	 * Note that if the filesystem didn't give us a UID, we expect that it doesn't
+	 * support them in general, and will ignore it if/when we try to set it.
+	 * We might want to clear the uid out of vap completely here.
+	 */
+	if (VATTR_IS_ACTIVE(vap, va_uid) && VATTR_IS_SUPPORTED(&ova, va_uid) && (vap->va_uid != ova.va_uid)) {
+		if (!is_suser && (kauth_cred_getuid(cred) != vap->va_uid)) {
+			KAUTH_DEBUG(" DENIED - non-superuser cannot change ownership to a third party");
+			error = EPERM;
+			goto out;
+		}
+		chowner = 1;
+	}
+
+	/*
+	 * gid changing
+	 * Note that if the filesystem didn't give us a GID, we expect that it doesn't
+	 * support them in general, and will ignore it if/when we try to set it.
+	 * We might want to clear the gid out of vap completely here.
+	 */
+	if (VATTR_IS_ACTIVE(vap, va_gid) && VATTR_IS_SUPPORTED(&ova, va_gid) && (vap->va_gid != ova.va_gid)) {
+		if (!is_suser) {
+			if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
+				KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid);
+				goto out;
+			}
+			if (!ismember) {
+				KAUTH_DEBUG(" DENIED - group change from %d to %d but not a member of target group",
+				    ova.va_gid, vap->va_gid);
+				error = EPERM;
+				goto out;
+			}
+		}
+		chgroup = 1;
+	}
+
+	/*
+	 * Owner UUID being set or changed.
+	 */
+	if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
+		/* if the owner UUID is not actually changing ... */
+		if (VATTR_IS_SUPPORTED(&ova, va_uuuid) && kauth_guid_equal(&vap->va_uuuid, &ova.va_uuuid))
+			goto no_uuuid_change;
+
+		/*
+		 * The owner UUID cannot be set by a non-superuser to anything other than
+		 * their own.
+		 */
+		if (!is_suser) {
+			if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
+				KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error);
+				/* XXX ENOENT here - no UUID - should perhaps become EPERM */
+				goto out;
+			}
+			if (!kauth_guid_equal(&vap->va_uuuid, &changer)) {
+				KAUTH_DEBUG(" ERROR - cannot set supplied owner UUID - not us");
+				error = EPERM;
+				goto out;
+			}
+		}
+		chowner = 1;
+	}
+no_uuuid_change:
+	/*
+	 * Group UUID being set or changed.
+	 */
+	if (VATTR_IS_ACTIVE(vap, va_guuid)) {
+		/* if the group UUID is not actually changing ... */
+		if (VATTR_IS_SUPPORTED(&ova, va_guuid) && kauth_guid_equal(&vap->va_guuid, &ova.va_guuid))
+			goto no_guuid_change;
+
+		/*
+		 * The group UUID cannot be set by a non-superuser to anything other than
+		 * one of which they are a member.
+		 */
+		if (!is_suser) {
+			if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
+				KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error);
+				goto out;
+			}
+			if (!ismember) {
+				KAUTH_DEBUG(" ERROR - cannot set supplied group UUID - not a member");
+				error = EPERM;
+				goto out;
+			}
+		}
+		chgroup = 1;
+	}
+no_guuid_change:
+
+	/*
+	 * Compute authorisation for group/ownership changes.
+	 */
+	if (chowner || chgroup) {
+		if (is_suser) {
+			KAUTH_DEBUG("ATTR - superuser changing file owner/group, requiring immutability check");
+			required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
+		} else {
+			if (chowner) {
+				KAUTH_DEBUG("ATTR - ownership change, requiring TAKE_OWNERSHIP");
+				required_action |= KAUTH_VNODE_TAKE_OWNERSHIP;
+			}
+			if (chgroup && !chowner) {
+				KAUTH_DEBUG("ATTR - group change, requiring WRITE_SECURITY");
+				required_action |= KAUTH_VNODE_WRITE_SECURITY;
+			}
+
+			/* clear set-uid and set-gid bits as required by Posix */
+			if (VATTR_IS_ACTIVE(vap, va_mode)) {
+				newmode = vap->va_mode;
+			} else if (VATTR_IS_SUPPORTED(&ova, va_mode)) {
+				newmode = ova.va_mode;
+			} else {
+				KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits");
+				newmode = 0;
+			}
+			if (newmode & (S_ISUID | S_ISGID)) {
+				VATTR_SET(vap, va_mode, newmode & ~(S_ISUID | S_ISGID));
+				KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o", newmode, vap->va_mode);
+			}
+		}
+	}
+
+	/*
+	 * Authorise changes in the ACL.
+	 */
+	if (VATTR_IS_ACTIVE(vap, va_acl)) {
+
+		/* no existing ACL */
+		if (!VATTR_IS_ACTIVE(&ova, va_acl) || (ova.va_acl == NULL)) {
+
+			/* adding an ACL */
+			if (vap->va_acl != NULL) {
+				required_action |= KAUTH_VNODE_WRITE_SECURITY;
+				KAUTH_DEBUG("CHMOD - adding ACL");
+			}
+
+			/* removing an existing ACL */
+		} else if (vap->va_acl == NULL) {
+			required_action |= KAUTH_VNODE_WRITE_SECURITY;
+			KAUTH_DEBUG("CHMOD - removing ACL");
+
+			/* updating an existing ACL */
+		} else {
+			if (vap->va_acl->acl_entrycount != ova.va_acl->acl_entrycount) {
+				/* entry count changed, must be different */
+				required_action |= KAUTH_VNODE_WRITE_SECURITY;
+				KAUTH_DEBUG("CHMOD - adding/removing ACL entries");
+			} else if (vap->va_acl->acl_entrycount > 0) {
+				/* both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs */
+				if (memcmp(&vap->va_acl->acl_ace[0], &ova.va_acl->acl_ace[0],
+				    sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount) != 0) {
+					required_action |= KAUTH_VNODE_WRITE_SECURITY;
+					KAUTH_DEBUG("CHMOD - changing ACL entries");
+				}
+			}
+		}
+	}
+
+	/*
+	 * Other attributes that require authorisation.
+	 */
+	if (VATTR_IS_ACTIVE(vap, va_encoding))
+		required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
+
+out:
+	if (VATTR_IS_SUPPORTED(&ova, va_acl) && (ova.va_acl != NULL))
+		kauth_acl_free(ova.va_acl);
+	if (error == 0)
+		*actionp = required_action;
+	return(error);
+}
+
+
+void
+vfs_setlocklocal(mount_t mp)
+{
+	vnode_t	vp;
+
+	mount_lock(mp);
+	mp->mnt_kern_flag |= MNTK_LOCK_LOCAL;
+
+	/*
+	 * We do not expect anyone to be using any vnodes at the
+	 * time this routine is called.
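+	 * (An editorial sketch of the expected call site: a filesystem that
+	 * handles advisory locking itself would invoke this once from its
+	 * mount entry point, e.g.
+	 *
+	 *	vfs_setlocklocal(mp);
+	 *
+	 * before the first vnode for the mount is created.)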
So no need for vnode locking + */ + TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { + vp->v_flag |= VLOCKLOCAL; + } + TAILQ_FOREACH(vp, &mp->mnt_workerqueue, v_mntvnodes) { + vp->v_flag |= VLOCKLOCAL; + } + TAILQ_FOREACH(vp, &mp->mnt_newvnodes, v_mntvnodes) { + vp->v_flag |= VLOCKLOCAL; + } + mount_unlock(mp); +} + + +#ifdef JOE_DEBUG + +record_vp(vnode_t vp, int count) { + struct uthread *ut; + int i; + + if ((vp->v_flag & VSYSTEM)) + return; + + ut = get_bsdthread_info(current_thread()); + ut->uu_iocount += count; + + if (ut->uu_vpindex < 32) { + for (i = 0; i < ut->uu_vpindex; i++) { + if (ut->uu_vps[i] == vp) + return; + } + ut->uu_vps[ut->uu_vpindex] = vp; + ut->uu_vpindex++; + } +} +#endif diff --git a/bsd/vfs/vfs_support.c b/bsd/vfs/vfs_support.c index 3ab24eb6f..0bf329efe 100644 --- a/bsd/vfs/vfs_support.c +++ b/bsd/vfs/vfs_support.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -30,7 +30,7 @@ * supposed to. * * nop_* routines always return 0 [success] - * err_* routines always return EOPNOTSUPP + * err_* routines always return ENOTSUP * * This file could be auto-generated from vnode_if.src. but that needs * support for freeing cnp. @@ -43,643 +43,539 @@ */ #include <vfs/vfs_support.h> +#include <sys/kauth.h> -struct vop_create_args /* { +struct vnop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; + struct vnode_vattr *a_vap; + vfs_context_t a_context; } */; int -nop_create(struct vop_create_args *ap) +nop_create(struct vnop_create_args *ap) { #if DIAGNOSTIC if ((ap->a_cnp->cn_flags & HASBUF) == 0) panic("nop_create: no name"); #endif - VOP_ABORTOP(ap->a_dvp, ap->a_cnp); - vput(ap->a_dvp); return (0); } int -err_create(struct vop_create_args *ap) +err_create(struct vnop_create_args *ap) { (void)nop_create(ap); - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_whiteout_args /* { +struct vnop_whiteout_args /* { struct vnode *a_dvp; struct componentname *a_cnp; int a_flags; + vfs_context_t a_context; } */; int -nop_whiteout(struct vop_whiteout_args *ap) +nop_whiteout(struct vnop_whiteout_args *ap) { return (0); } int -err_whiteout(struct vop_whiteout_args *ap) +err_whiteout(struct vnop_whiteout_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_mknod_args /* { +struct vnop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; + struct vnode_vattr *a_vap; + vfs_context_t a_context; } */; int -nop_mknod(struct vop_mknod_args *ap) +nop_mknod(struct vnop_mknod_args *ap) { #if DIAGNOSTIC if ((ap->a_cnp->cn_flags & HASBUF) == 0) panic("nop_mknod: no name"); #endif - VOP_ABORTOP(ap->a_dvp, ap->a_cnp); - vput(ap->a_dvp); return (0); } int -err_mknod(struct vop_mknod_args *ap) +err_mknod(struct vnop_mknod_args *ap) { (void)nop_mknod(ap); - return (EOPNOTSUPP); + return (ENOTSUP); } - -struct vop_mkcomplex_args /* { - struct vnode *a_dvp, - struct vnode **a_vpp, - struct componentname *a_cnp, - struct vattr *a_vap, - u_long a_type) -} */; - -int -nop_mkcomplex(struct vop_mkcomplex_args *ap) -{ -#if DIAGNOSTIC - if ((ap->a_cnp->cn_flags & HASBUF) == 0) - panic("nop_mkcomplex: no name"); -#endif - VOP_ABORTOP(ap->a_dvp, ap->a_cnp); - vput(ap->a_dvp); - return (0); -} - -int -err_mkcomplex(struct vop_mkcomplex_args *ap) -{ - (void)nop_mkcomplex(ap); - return (EOPNOTSUPP); -} - - -struct vop_open_args 
/* { +struct vnop_open_args /* { struct vnode *a_vp; int a_mode; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */; int -nop_open(struct vop_open_args *ap) +nop_open(struct vnop_open_args *ap) { return (0); } int -err_open(struct vop_open_args *ap) +err_open(struct vnop_open_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_close_args /* { +struct vnop_close_args /* { struct vnode *a_vp; int a_fflag; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */; int -nop_close(struct vop_close_args *ap) +nop_close(struct vnop_close_args *ap) { return (0); } int -err_close(struct vop_close_args *ap) +err_close(struct vnop_close_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_access_args /* { +struct vnop_access_args /* { struct vnode *a_vp; int a_mode; - struct ucred *a_cred; - struct proc *a_p; + vfs_context_t a_context; } */; int -nop_access(struct vop_access_args *ap) +nop_access(struct vnop_access_args *ap) { return (0); } int -err_access(struct vop_access_args *ap) +err_access(struct vnop_access_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_getattr_args /* { +struct vnop_getattr_args /* { struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct proc *a_p; + struct vnode_vattr *a_vap; + vfs_context_t a_context; } */; int -nop_getattr(struct vop_getattr_args *ap) +nop_getattr(struct vnop_getattr_args *ap) { return (0); } int -err_getattr(struct vop_getattr_args *ap) +err_getattr(struct vnop_getattr_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_setattr_args /* { +struct vnop_setattr_args /* { struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - struct proc *a_p; + struct vnode_vattr *a_vap; + vfs_context_t a_context; } */; int -nop_setattr(struct vop_setattr_args *ap) +nop_setattr(struct vnop_setattr_args *ap) { return (0); } int -err_setattr(struct vop_setattr_args *ap) +err_setattr(struct vnop_setattr_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_getattrlist_args /* { +struct vnop_getattrlist_args /* { struct vnode *a_vp; struct attrlist *a_alist; struct uio *a_uio; - struct ucred *a_cred; - struct proc *a_p; + int a_options; + vfs_context a_context; } */; int -nop_getattrlist(struct vop_getattrlist_args *ap) +nop_getattrlist(struct vnop_getattrlist_args *ap) { return (0); } int -err_getattrlist(struct vop_getattrlist_args *ap) +err_getattrlist(struct vnop_getattrlist_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_setattrlist_args /* { +struct vnop_setattrlist_args /* { struct vnode *a_vp; struct attrlist *a_alist; struct uio *a_uio; - struct ucred *a_cred; - struct proc *a_p; + int a_options; + vfs_context_t a_context; } */; int -nop_setattrlist(struct vop_setattrlist_args *ap) +nop_setattrlist(struct vnop_setattrlist_args *ap) { return (0); } int -err_setattrlist(struct vop_setattrlist_args *ap) +err_setattrlist(struct vnop_setattrlist_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_read_args /* { +struct vnop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct ucred *a_cred; + vfs_context_t a_context; } */; int -nop_read(struct vop_read_args *ap) +nop_read(struct vnop_read_args *ap) { return (0); } int -err_read(struct vop_read_args *ap) +err_read(struct vnop_read_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_write_args /* { +struct vnop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; - struct 
ucred *a_cred; + vfs_context_t a_context; } */; int -nop_write(struct vop_write_args *ap) +nop_write(struct vnop_write_args *ap) { return (0); } int -err_write(struct vop_write_args *ap) +err_write(struct vnop_write_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_lease_args /* { - struct vnode *a_vp; - struct proc *a_p; - struct ucred *a_cred; - int a_flag; -} */; - -int -nop_lease(struct vop_lease_args *ap) -{ - return (0); -} - -int -err_lease(struct vop_lease_args *ap) -{ - return (EOPNOTSUPP); -} - - -struct vop_ioctl_args /* { +struct vnop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; - struct ucred *a_cred; + kauth_cred_t a_cred; struct proc *a_p; } */; int -nop_ioctl(struct vop_ioctl_args *ap) +nop_ioctl(__unused struct vnop_ioctl_args *ap) { return (0); } int -err_ioctl(struct vop_ioctl_args *ap) +err_ioctl(struct vnop_ioctl_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_select_args /* { +struct vnop_select_args /* { struct vnode *a_vp; int a_which; int a_fflags; - struct ucred *a_cred; + kauth_cred_t a_cred; void *a_wql; struct proc *a_p; } */; int -nop_select(struct vop_select_args *ap) +nop_select(__unused struct vnop_select_args *ap) { return (0); } int -err_select(struct vop_select_args *ap) +err_select(struct vnop_select_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_exchange_args /* { +struct vnop_exchange_args /* { struct vnode *a_fvp; struct vnode *a_tvp; - struct ucred *a_cred; - struct proc *a_p; + int a_options; + vfs_context_t a_context; } */; int -nop_exchange(struct vop_exchange_args *ap) +nop_exchange(struct vnop_exchange_args *ap) { return (0); } int -err_exchange(struct vop_exchange_args *ap) +err_exchange(struct vnop_exchange_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_revoke_args /* { +struct vnop_revoke_args /* { struct vnode *a_vp; int a_flags; + vfs_context_t a_context; } */; int -nop_revoke(struct vop_revoke_args *ap) +nop_revoke(struct vnop_revoke_args *ap) { - return (vop_revoke(ap)); + return vn_revoke(ap->a_vp, ap->a_flags, ap->a_context); } int -err_revoke(struct vop_revoke_args *ap) +err_revoke(struct vnop_revoke_args *ap) { (void)nop_revoke(ap); - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_mmap_args /* { +struct vnop_mmap_args /* { struct vnode *a_vp; int a_fflags; - struct ucred *a_cred; + kauth_cred_t a_cred; struct proc *a_p; } */; int -nop_mmap(struct vop_mmap_args *ap) +nop_mmap(__unused struct vnop_mmap_args *ap) { return (0); } int -err_mmap(struct vop_mmap_args *ap) +err_mmap(struct vnop_mmap_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_fsync_args /* { +struct vnop_fsync_args /* { struct vnode *a_vp; - struct ucred *a_cred; int a_waitfor; - struct proc *a_p; -} */; - -int -nop_fsync(struct vop_fsync_args *ap) -{ - return (0); -} - -int -err_fsync(struct vop_fsync_args *ap) -{ - return (EOPNOTSUPP); -} - - -struct vop_seek_args /* { - struct vnode *a_vp; - off_t a_oldoff; - off_t a_newoff; - struct ucred *a_cred; + vfs_context_t a_context; } */; int -nop_seek(struct vop_seek_args *ap) +nop_fsync(struct vnop_fsync_args *ap) { return (0); } int -err_seek(struct vop_seek_args *ap) +err_fsync(struct vnop_fsync_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_remove_args /* { +struct vnop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; + int a_flags; + vfs_context_t a_context; } */; int -nop_remove(struct vop_remove_args *ap) 
+nop_remove(struct vnop_remove_args *ap) { - if (ap->a_dvp == ap->a_vp) - vrele(ap->a_vp); - else - vput(ap->a_vp); - vput(ap->a_dvp); return (0); } int -err_remove(struct vop_remove_args *ap) +err_remove(struct vnop_remove_args *ap) { (void)nop_remove(ap); - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_link_args /* { +struct vnop_link_args /* { struct vnode *a_vp; struct vnode *a_tdvp; struct componentname *a_cnp; + vfs_context_t a_context; } */; int -nop_link(struct vop_link_args *ap) +nop_link(struct vnop_link_args *ap) { -#if DIAGNOSTIC - if ((ap->a_cnp->cn_flags & HASBUF) == 0) - panic("nop_link: no name"); -#endif - VOP_ABORTOP(ap->a_tdvp, ap->a_cnp); - vput(ap->a_tdvp); return (0); } int -err_link(struct vop_link_args *ap) +err_link(struct vnop_link_args *ap) { (void)nop_link(ap); - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_rename_args /* { +struct vnop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; + vfs_context_t a_context; } */; int -nop_rename(struct vop_rename_args *ap) +nop_rename(struct vnop_rename_args *ap) { -#if DIAGNOSTIC - if ((ap->a_tcnp->cn_flags & HASBUF) == 0 || - (ap->a_fcnp->cn_flags & HASBUF) == 0) - panic("nop_rename: no name"); -#endif - VOP_ABORTOP(ap->a_tdvp, ap->a_tcnp); - if (ap->a_tdvp == ap->a_tvp) - vrele(ap->a_tdvp); - else - vput(ap->a_tdvp); - if (ap->a_tvp) - vput(ap->a_tvp); - VOP_ABORTOP(ap->a_fdvp, ap->a_fcnp); - vrele(ap->a_fdvp); - vrele(ap->a_fvp); return (0); } int -err_rename(struct vop_rename_args *ap) +err_rename(struct vnop_rename_args *ap) { (void)nop_rename(ap); - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_mkdir_args /* { +struct vnop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; + struct vnode_vattr *a_vap; + vfs_context_t a_context; } */; int -nop_mkdir(struct vop_mkdir_args *ap) +nop_mkdir(struct vnop_mkdir_args *ap) { -#if DIAGNOSTIC - if ((ap->a_cnp->cn_flags & HASBUF) == 0) - panic("nop_mkdir: no name"); -#endif - VOP_ABORTOP(ap->a_dvp, ap->a_cnp); - vput(ap->a_dvp); return (0); } int -err_mkdir(struct vop_mkdir_args *ap) +err_mkdir(struct vnop_mkdir_args *ap) { - (void)nop_mkdir(ap); - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_rmdir_args /* { +struct vnop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; + vfs_context_t a_context; } */; int -nop_rmdir(struct vop_rmdir_args *ap) +nop_rmdir(struct vnop_rmdir_args *ap) { - vput(ap->a_dvp); - vput(ap->a_vp); return (0); } int -err_rmdir(struct vop_rmdir_args *ap) +err_rmdir(struct vnop_rmdir_args *ap) { (void)nop_rmdir(ap); - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_symlink_args /* { +struct vnop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; - struct vattr *a_vap; + struct vnode_vattr *a_vap; char *a_target; + vfs_context_t a_context; } */; int -nop_symlink(struct vop_symlink_args *ap) +nop_symlink(struct vnop_symlink_args *ap) { #if DIAGNOSTIC if ((ap->a_cnp->cn_flags & HASBUF) == 0) panic("nop_symlink: no name"); #endif - VOP_ABORTOP(ap->a_dvp, ap->a_cnp); - vput(ap->a_dvp); return (0); } int -err_symlink(struct vop_symlink_args *ap) +err_symlink(struct vnop_symlink_args *ap) { (void)nop_symlink(ap); - return (EOPNOTSUPP); + return (ENOTSUP); } -struct vop_readdir_args /* { - struct vnode *a_vp; +struct vnop_readdir_args /* { + vnode_t a_vp; struct uio *a_uio; - 


-struct vop_readdir_args /* {
-	struct vnode *a_vp;
+struct vnop_readdir_args /* {
+	vnode_t a_vp;
 	struct uio *a_uio;
-	struct ucred *a_cred;
+	int a_flags;
 	int *a_eofflag;
-	int *a_ncookies;
-	u_long **a_cookies;
+	int *a_numdirent;
+	vfs_context_t a_context;
 } */;

 int
-nop_readdir(struct vop_readdir_args *ap)
+nop_readdir(struct vnop_readdir_args *ap)
 {
 	return (0);
 }

 int
-err_readdir(struct vop_readdir_args *ap)
+err_readdir(struct vnop_readdir_args *ap)
 {
-	return (EOPNOTSUPP);
+	return (ENOTSUP);
 }


-struct vop_readdirattr_args /* {
+struct vnop_readdirattr_args /* {
 	struct vnode *a_vp;
 	struct attrlist *a_alist;
 	struct uio *a_uio;
@@ -688,12 +584,11 @@ struct vop_readdirattr_args /* {
 	int *a_newstate;
 	int *a_eofflag;
 	u_long *a_actualcount;
-	u_long **a_cookies;
-	struct ucred *a_cred;
+	vfs_context_t a_context;
 } */;

 int
-nop_readdirattr(struct vop_readdirattr_args *ap)
+nop_readdirattr(struct vnop_readdirattr_args *ap)
 {
 	*(ap->a_actualcount) = 0;
 	*(ap->a_eofflag) = 0;
@@ -701,509 +596,230 @@ nop_readdirattr(struct vop_readdirattr_args *ap)
 }

 int
-err_readdirattr(struct vop_readdirattr_args *ap)
+err_readdirattr(struct vnop_readdirattr_args *ap)
 {
 	(void)nop_readdirattr(ap);
-	return (EOPNOTSUPP);
+	return (ENOTSUP);
 }


-struct vop_readlink_args /* {
+struct vnop_readlink_args /* {
 	struct vnode *vp;
 	struct uio *uio;
-	struct ucred *cred;
-} */;
-
-int
-nop_readlink(struct vop_readlink_args *ap)
-{
-	return (0);
-}
-
-int
-err_readlink(struct vop_readlink_args *ap)
-{
-	return (EOPNOTSUPP);
-}
-
-
-struct vop_abortop_args /* {
-	struct vnode *a_dvp;
-	struct componentname *a_cnp;
+	vfs_context_t a_context;
 } */;

 int
-nop_abortop(struct vop_abortop_args *ap)
+nop_readlink(struct vnop_readlink_args *ap)
 {
-	if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) {
-		char *tmp = ap->a_cnp->cn_pnbuf;
-		ap->a_cnp->cn_pnbuf = NULL;
-		ap->a_cnp->cn_flags &= ~HASBUF;
-		FREE_ZONE(tmp, ap->a_cnp->cn_pnlen, M_NAMEI);
-	}
-
 	return (0);
 }

 int
-err_abortop(struct vop_abortop_args *ap)
+err_readlink(struct vnop_readlink_args *ap)
 {
-	(void)nop_abortop(ap);
-	return (EOPNOTSUPP);
+	return (ENOTSUP);
 }


-struct vop_inactive_args /* {
+struct vnop_inactive_args /* {
 	struct vnode *a_vp;
-	struct proc *a_p;
+	vfs_context_t a_context;
 } */;

 int
-nop_inactive(struct vop_inactive_args *ap)
+nop_inactive(struct vnop_inactive_args *ap)
 {
-	VOP_UNLOCK(ap->a_vp, 0, ap->a_p);
 	return (0);
 }

 int
-err_inactive(struct vop_inactive_args *ap)
+err_inactive(struct vnop_inactive_args *ap)
 {
 	(void)nop_inactive(ap);
-	return (EOPNOTSUPP);
+	return (ENOTSUP);
 }


-struct vop_reclaim_args /* {
+struct vnop_reclaim_args /* {
 	struct vnode *a_vp;
-	struct proc *a_p;
+	vfs_context_t a_context;
 } */;

 int
-nop_reclaim(struct vop_reclaim_args *ap)
+nop_reclaim(struct vnop_reclaim_args *ap)
 {
 	return (0);
 }

 int
-err_reclaim(struct vop_reclaim_args *ap)
+err_reclaim(struct vnop_reclaim_args *ap)
 {
-	return (EOPNOTSUPP);
+	return (ENOTSUP);
 }


-struct vop_lock_args /* {
-	struct vnode *a_vp;
-	int a_flags;
-	struct proc *a_p;
-} */;
-
-int
-nop_lock(struct vop_lock_args *ap)
-{
-	return (vop_nolock(ap));
-}
-
-int
-err_lock(struct vop_lock_args *ap)
-{
-	(void)nop_lock(ap);
-	return (EOPNOTSUPP);
-}
-
-
-struct vop_unlock_args /* {
-	struct vnode *a_vp;
-	int a_flags;
-	struct proc *a_p;
-} */;
-
-int
-nop_unlock(struct vop_unlock_args *ap)
-{
-	return (vop_nounlock(ap));
-}
-
-int
-err_unlock(struct vop_unlock_args *ap)
-{
-	(void)nop_unlock(ap);
-	return (EOPNOTSUPP);
-}
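The vop_lock/vop_unlock (and, below, vop_islocked) operations disappear entirely: vnode locking is now managed inside the VFS layer, and callers take short-term iocount references instead of holding a vnode lock across calls. A sketch of the caller-side idiom, assuming the vnode_get()/vnode_put() KPI:

#include <sys/vnode.h>

int
example_touch_vnode(vnode_t vp)
{
	int error;

	/* Take an iocount; this fails if the vnode is being reclaimed. */
	if ((error = vnode_get(vp)) != 0)
		return (error);

	/* ... issue VNOP_* calls against vp here ... */

	vnode_put(vp);		/* drop the iocount */
	return (0);
}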
-
-
-struct vop_bmap_args /* {
-	struct vnode *vp;
-	daddr_t bn;
-	struct vnode **vpp;
-	daddr_t *bnp;
-	int *runp;
-} */;
-
-int
-nop_bmap(struct vop_bmap_args *ap)
-{
-	return (0);
-}
-
-int
-err_bmap(struct vop_bmap_args *ap)
-{
-	return (EOPNOTSUPP);
-}
-
-
-struct vop_strategy_args /* {
+struct vnop_strategy_args /* {
 	struct buf *a_bp;
 } */;

 int
-nop_strategy(struct vop_strategy_args *ap)
-{
-	return (0);
-}
-
-int
-err_strategy(struct vop_strategy_args *ap)
-{
-	return (EOPNOTSUPP);
-}
-
-
-struct vop_print_args /* {
-	struct vnode *a_vp;
-} */;
-
-int
-nop_print(struct vop_print_args *ap)
+nop_strategy(struct vnop_strategy_args *ap)
 {
 	return (0);
 }

 int
-err_print(struct vop_print_args *ap)
+err_strategy(struct vnop_strategy_args *ap)
 {
-	return (EOPNOTSUPP);
+	return (ENOTSUP);
 }


-struct vop_islocked_args /* {
-	struct vnode *a_vp;
-} */;
-
-int
-nop_islocked(struct vop_islocked_args *ap)
-{
-	return (vop_noislocked(ap));
-}
-
-int
-err_islocked(struct vop_islocked_args *ap)
-{
-	(void)nop_islocked(ap);
-	return (EOPNOTSUPP);
-}
-
-
-struct vop_pathconf_args /* {
+struct vnop_pathconf_args /* {
 	struct vnode *a_vp;
 	int a_name;
 	register_t *a_retval;
+	vfs_context_t a_context;
 } */;

 int
-nop_pathconf(struct vop_pathconf_args *ap)
+nop_pathconf(struct vnop_pathconf_args *ap)
 {
 	return (0);
 }

 int
-err_pathconf(struct vop_pathconf_args *ap)
+err_pathconf(struct vnop_pathconf_args *ap)
 {
-	return (EOPNOTSUPP);
+	return (ENOTSUP);
 }


-struct vop_advlock_args /* {
+struct vnop_advlock_args /* {
 	struct vnode *a_vp;
 	caddr_t a_id;
 	int a_op;
 	struct flock *a_fl;
 	int a_flags;
+	vfs_context_t a_context;
 } */;

 int
-nop_advlock(struct vop_advlock_args *ap)
-{
-	return (0);
-}
-
-int
-err_advlock(struct vop_advlock_args *ap)
-{
-	return (EOPNOTSUPP);
-}
-
-
-struct vop_blkatoff_args /* {
-	struct vnode *a_vp;
-	off_t a_offset;
-	char **a_res;
-	struct buf **a_bpp;
-} */;
-
-int
-nop_blkatoff(struct vop_blkatoff_args *ap)
-{
-	*ap->a_bpp = NULL;
-	return (0);
-}
-
-int
-err_blkatoff(struct vop_blkatoff_args *ap)
-{
-	(void)nop_blkatoff(ap);
-	return (EOPNOTSUPP);
-}
-
-
-struct vop_valloc_args /* {
-	struct vnode *a_pvp;
-	int a_mode;
-	struct ucred *a_cred;
-	struct vnode **a_vpp;
-} */;
-
-int
-nop_valloc(struct vop_valloc_args *ap)
-{
-	*ap->a_vpp = NULL;
-	return (0);
-}
-
-int
-err_valloc(struct vop_valloc_args *ap)
-{
-	(void)nop_valloc(ap);
-	return (EOPNOTSUPP);
-}
-
-
-struct vop_reallocblks_args /* {
-	struct vnode *a_vp;
-	struct cluster_save *a_buflist;
-} */;
-
-int
-nop_reallocblks(struct vop_reallocblks_args *ap)
-{
-	return (0);
-}
-
-int
-err_reallocblks(struct vop_reallocblks_args *ap)
-{
-	return (EOPNOTSUPP);
-}
-
-
-struct vop_vfree_args /* {
-	struct vnode *a_pvp;
-	ino_t a_ino;
-	int a_mode;
-} */;
-
-int
-nop_vfree(struct vop_vfree_args *ap)
+nop_advlock(struct vnop_advlock_args *ap)
 {
 	return (0);
 }

 int
-err_vfree(struct vop_vfree_args *ap)
+err_advlock(struct vnop_advlock_args *ap)
 {
-	return (EOPNOTSUPP);
+	return (ENOTSUP);
 }


-struct vop_truncate_args /* {
-	struct vnode *a_vp;
-	off_t a_length;
-	int a_flags;
-	struct ucred *a_cred;
-	struct proc *a_p;
-} */;
-
-int
-nop_truncate(struct vop_truncate_args *ap)
-{
-	return (0);
-}
-
-int
-err_truncate(struct vop_truncate_args *ap)
-{
-	return (EOPNOTSUPP);
-}
-
-struct vop_allocate_args /* {
+struct vnop_allocate_args /* {
 	struct vnode *a_vp;
 	off_t a_length;
 	u_int32_t a_flags;
 	off_t *a_bytesallocated;
 	off_t a_offset;
-	struct ucred *a_cred;
-	struct proc *a_p;
+	vfs_context_t a_context;
 } */;

 int
-nop_allocate(struct vop_allocate_args *ap)
+nop_allocate(struct vnop_allocate_args *ap)
 {
 	*(ap->a_bytesallocated) = 0;
 	return (0);
 }

 int
-err_allocate(struct vop_allocate_args *ap)
+err_allocate(struct vnop_allocate_args *ap)
 {
 	(void)nop_allocate(ap);
-	return (EOPNOTSUPP);
+	return (ENOTSUP);
 }

-
-struct vop_update_args /* {
-	struct vnode *a_vp;
-	struct timeval *a_access;
-	struct timeval *a_modify;
-	int a_waitfor;
-} */;
-
-int
-nop_update(struct vop_update_args *ap)
-{
-	return (0);
-}
-
-int
-err_update(struct vop_update_args *ap)
-{
-	return (EOPNOTSUPP);
-}
-
-
-struct vop_pgrd_args /* {
-	struct vnode *a_vp;
-	struct uio *a_uio;
-	struct ucred *a_cred;
-} */;
-
-int
-nop_pgrd(struct vop_pgrd_args *ap)
-{
-	return (0);
-}
-
-int
-err_pgrd(struct vop_pgrd_args *ap)
-{
-	return (EOPNOTSUPP);
-}
-
-
-struct vop_pgwr_args /* {
-	struct vnode *a_vp;
-	struct uio *a_uio;
-	struct ucred *a_cred;
-	vm_offset_t a_offset;
-} */;
-
-int
-nop_pgwr(struct vop_pgwr_args *ap)
-{
-	return (0);
-}
-
-int
-err_pgwr(struct vop_pgwr_args *ap)
-{
-	return (EOPNOTSUPP);
-}
-
-
-struct vop_bwrite_args /* {
+struct vnop_bwrite_args /* {
 	struct buf *a_bp;
 } */;

 int
-nop_bwrite(struct vop_bwrite_args *ap)
+nop_bwrite(struct vnop_bwrite_args *ap)
 {
-	return (bwrite(ap->a_bp));
+	return ((int)buf_bwrite(ap->a_bp));
 }

 int
-err_bwrite(struct vop_bwrite_args *ap)
+err_bwrite(struct vnop_bwrite_args *ap)
 {
-	return (EOPNOTSUPP);
+	return (ENOTSUP);
 }


-struct vop_pagein_args /* {
+struct vnop_pagein_args /* {
 	struct vnode *a_vp,
 	upl_t a_pl,
 	vm_offset_t a_pl_offset,
 	off_t a_foffset,
 	size_t a_size,
-	struct ucred *a_cred,
 	int a_flags
+	vfs_context_t a_context;
 } */;

 int
-nop_pagein(struct vop_pagein_args *ap)
+nop_pagein(struct vnop_pagein_args *ap)
 {
-	ubc_upl_abort(ap->a_pl, UPL_ABORT_ERROR);
-	return (0);
+	if ( !(ap->a_flags & UPL_NOCOMMIT))
+		ubc_upl_abort_range(ap->a_pl, ap->a_pl_offset, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
+	return (EINVAL);
 }

 int
-err_pagein(struct vop_pagein_args *ap)
+err_pagein(struct vnop_pagein_args *ap)
 {
-	ubc_upl_abort(ap->a_pl, UPL_ABORT_ERROR);
-	return (EOPNOTSUPP);
+	if ( !(ap->a_flags & UPL_NOCOMMIT))
+		ubc_upl_abort_range(ap->a_pl, ap->a_pl_offset, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
+	return (ENOTSUP);
 }


-struct vop_pageout_args /* {
+struct vnop_pageout_args /* {
 	struct vnode *a_vp,
 	upl_t a_pl,
 	vm_offset_t a_pl_offset,
 	off_t a_foffset,
 	size_t a_size,
-	struct ucred *a_cred,
 	int a_flags
+	vfs_context_t a_context;
 } */;

 int
-nop_pageout(struct vop_pageout_args *ap)
+nop_pageout(struct vnop_pageout_args *ap)
 {
-	ubc_upl_abort(ap->a_pl, UPL_ABORT_ERROR);
-	return (0);
+	if ( !(ap->a_flags & UPL_NOCOMMIT))
+		ubc_upl_abort_range(ap->a_pl, ap->a_pl_offset, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
+	return (EINVAL);
 }

 int
-err_pageout(struct vop_pageout_args *ap)
+err_pageout(struct vnop_pageout_args *ap)
 {
-	ubc_upl_abort(ap->a_pl, UPL_ABORT_ERROR);
-	return (EOPNOTSUPP);
+	if ( !(ap->a_flags & UPL_NOCOMMIT))
+		ubc_upl_abort_range(ap->a_pl, ap->a_pl_offset, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
+	return (ENOTSUP);
 }


-struct vop_devblocksize_args /* {
+struct vnop_devblocksize_args /* {
 	struct vnode *a_vp;
 	register_t *a_retval;
 } */;

 int
-nop_devblocksize(struct vop_devblocksize_args *ap)
+nop_devblocksize(struct vnop_devblocksize_args *ap)
 {
 	/* XXX default value because the call sites do not check error */
 	*ap->a_retval = 512;
@@ -1211,14 +827,14 @@ nop_devblocksize(struct vop_devblocksize_args *ap)
 }

 int
-err_devblocksize(struct vop_devblocksize_args *ap)
+err_devblocksize(struct vnop_devblocksize_args *ap)
 {
 	(void)nop_devblocksize(ap);
-	return (EOPNOTSUPP);
+	return (ENOTSUP);
 }
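The pagein/pageout stubs above now respect UPL ownership: unless the caller passed UPL_NOCOMMIT, the callee is responsible for disposing of the page list, so the stubs abort exactly the range they were handed rather than the whole UPL, and they report failure instead of silently succeeding. The same pattern for a filesystem that declines the operation; example_vnop_pagein is a hypothetical name:

#include <sys/errno.h>
#include <sys/ubc.h>

static int
example_vnop_pagein(struct vnop_pagein_args *ap)
{
	/* We own the UPL unless the caller asked to keep it. */
	if (!(ap->a_flags & UPL_NOCOMMIT))
		ubc_upl_abort_range(ap->a_pl, ap->a_pl_offset, ap->a_size,
		    UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
	return (ENOTSUP);
}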


-struct vop_searchfs
+struct vnop_searchfs /* {
 	struct vnode *a_vp;
 	void *a_searchparams1;
 	void *a_searchparams2;
@@ -1231,23 +847,24 @@ struct vop_searchfs {
 	u_long a_options;
 	struct uio *a_uio;
 	struct searchstate *a_searchstate;
+	vfs_context_t a_context;
 } */;

 int
-nop_searchfs(struct vop_searchfs_args *ap)
+nop_searchfs(struct vnop_searchfs_args *ap)
 {
 	*(ap->a_nummatches) = 0;
 	return (0);
 }

 int
-err_searchfs(struct vop_searchfs_args *ap)
+err_searchfs(struct vnop_searchfs_args *ap)
 {
 	(void)nop_searchfs(ap);
-	return (EOPNOTSUPP);
+	return (ENOTSUP);
 }


-struct vop_copyfile_args /*{
+struct vnop_copyfile_args /*{
 	struct vnodeop_desc *a_desc;
 	struct vnode *a_fvp;
 	struct vnode *a_tdvp;
@@ -1257,83 +874,77 @@ struct vop_copyfile_args /*{
 }*/;

 int
-nop_copyfile(struct vop_copyfile_args *ap)
+nop_copyfile(struct vnop_copyfile_args *ap)
 {
-	if (ap->a_tdvp == ap->a_tvp)
-		vrele(ap->a_tdvp);
-	else
-		vput(ap->a_tdvp);
-	if (ap->a_tvp)
-		vput(ap->a_tvp);
-	vrele(ap->a_fvp);
 	return (0);
 }

 int
-err_copyfile(struct vop_copyfile_args *ap)
+err_copyfile(struct vnop_copyfile_args *ap)
 {
 	(void)nop_copyfile(ap);
-	return (EOPNOTSUPP);
+	return (ENOTSUP);
 }


-struct vop_blktooff_args /* {
+struct vnop_blktooff_args /* {
 	struct vnode *a_vp;
-	daddr_t a_lblkno;
+	daddr64_t a_lblkno;
 	off_t *a_offset;
 } */;

 int
-nop_blktooff(struct vop_blktooff_args *ap)
+nop_blktooff(struct vnop_blktooff_args *ap)
 {
 	*ap->a_offset = (off_t)-1;	/* failure */
 	return (0);
 }

 int
-err_blktooff(struct vop_blktooff_args *ap)
+err_blktooff(struct vnop_blktooff_args *ap)
 {
 	(void)nop_blktooff(ap);
-	return (EOPNOTSUPP);
+	return (ENOTSUP);
 }


-struct vop_offtoblk_args /* {
+struct vnop_offtoblk_args /* {
 	struct vnode *a_vp;
 	off_t a_offset;
-	daddr_t *a_lblkno;
+	daddr64_t *a_lblkno;
 } */;

 int
-nop_offtoblk(struct vop_offtoblk_args *ap)
+nop_offtoblk(struct vnop_offtoblk_args *ap)
 {
-	*ap->a_lblkno = (daddr_t)-1;	/* failure */
+	*ap->a_lblkno = (daddr64_t)-1;	/* failure */
 	return (0);
 }

 int
-err_offtoblk(struct vop_offtoblk_args *ap)
+err_offtoblk(struct vnop_offtoblk_args *ap)
 {
 	(void)nop_offtoblk(ap);
-	return (EOPNOTSUPP);
+	return (ENOTSUP);
 }


-struct vop_cmap_args /* {
+struct vnop_blockmap_args /* {
 	struct vnode *a_vp;
 	off_t a_foffset;
 	size_t a_size;
-	daddr_t *a_bpn;
+	daddr64_t *a_bpn;
 	size_t *a_run;
 	void *a_poff;
+	int a_flags;
 } */;

-int nop_cmap(struct vop_cmap_args *ap)
+int nop_blockmap(struct vnop_blockmap_args *ap)
 {
 	return (0);
 }

-int err_cmap(struct vop_cmap_args *ap)
+int err_blockmap(struct vnop_blockmap_args *ap)
 {
-	return (EOPNOTSUPP);
+	return (ENOTSUP);
 }
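This ends the vfs_support.c conversion. Note that vop_cmap is renamed vnop_blockmap, block numbers widen from daddr_t to 64-bit daddr64_t, and a flags word is added. A caller-side sketch of mapping a file offset to a physical block; the VNOP_BLOCKMAP() wrapper signature used here is an assumption that it mirrors the argument structure plus a context, not something this patch shows:

#include <sys/errno.h>
#include <sys/vnode.h>

static int
example_map_offset(vnode_t vp, off_t foffset, size_t size, vfs_context_t ctx)
{
	daddr64_t blkno;	/* physical block number, now 64-bit */
	size_t run;		/* contiguous bytes available at blkno */
	int error;

	error = VNOP_BLOCKMAP(vp, foffset, size, &blkno, &run, NULL, 0, ctx);
	if (error)
		return (error);
	/* blkno/run now describe the on-device extent backing foffset */
	return (0);
}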
diff --git a/bsd/vfs/vfs_support.h b/bsd/vfs/vfs_support.h
index 7eac9f21e..9e49a68a1 100644
--- a/bsd/vfs/vfs_support.h
+++ b/bsd/vfs/vfs_support.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
@@ -37,196 +37,150 @@
 #ifndef _VFS_VFS_SUPPORT_H_
 #define _VFS_VFS_SUPPORT_H_

+#include <sys/cdefs.h>
 #include <sys/param.h>
 #include <sys/systm.h>
-#include <sys/namei.h>
 #include <sys/resourcevar.h>
 #include <sys/kernel.h>
 #include <sys/file.h>
 #include <sys/stat.h>
-#include <sys/buf.h>
 #include <sys/proc.h>
 #include <sys/conf.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
+#include <sys/vnode_if.h>
 #include <sys/malloc.h>
 #include <sys/dirent.h>
 #include <vm/vm_pageout.h>

-extern int nop_create(struct vop_create_args *ap);
-extern int err_create(struct vop_create_args *ap);
+__BEGIN_DECLS
+extern int nop_create(struct vnop_create_args *ap);
+extern int err_create(struct vnop_create_args *ap);

-extern int nop_whiteout(struct vop_whiteout_args *ap);
-extern int err_whiteout(struct vop_whiteout_args *ap);
+extern int nop_whiteout(struct vnop_whiteout_args *ap);
+extern int err_whiteout(struct vnop_whiteout_args *ap);

-extern int nop_mknod(struct vop_mknod_args *ap);
-extern int err_mknod(struct vop_mknod_args *ap);
+extern int nop_mknod(struct vnop_mknod_args *ap);
+extern int err_mknod(struct vnop_mknod_args *ap);

-extern int nop_mkcomplex(struct vop_mkcomplex_args *ap);
-extern int err_mkcomplex(struct vop_mkcomplex_args *ap);
+extern int nop_open(struct vnop_open_args *ap);
+extern int err_open(struct vnop_open_args *ap);

-extern int nop_open(struct vop_open_args *ap);
-extern int err_open(struct vop_open_args *ap);
+extern int nop_close(struct vnop_close_args *ap);
+extern int err_close(struct vnop_close_args *ap);

-extern int nop_close(struct vop_close_args *ap);
-extern int err_close(struct vop_close_args *ap);
+extern int nop_access(struct vnop_access_args *ap);
+extern int err_access(struct vnop_access_args *ap);

-extern int nop_access(struct vop_access_args *ap);
-extern int err_access(struct vop_access_args *ap);
+extern int nop_getattr(struct vnop_getattr_args *ap);
+extern int err_getattr(struct vnop_getattr_args *ap);

-extern int nop_getattr(struct vop_getattr_args *ap);
-extern int err_getattr(struct vop_getattr_args *ap);
+extern int nop_setattr(struct vnop_setattr_args *ap);
+extern int err_setattr(struct vnop_setattr_args *ap);

-extern int nop_setattr(struct vop_setattr_args *ap);
-extern int err_setattr(struct vop_setattr_args *ap);
+extern int nop_getattrlist(struct vnop_getattrlist_args *ap);
+extern int err_getattrlist(struct vnop_getattrlist_args *ap);

-extern int nop_getattrlist(struct vop_getattrlist_args *ap);
-extern int err_getattrlist(struct vop_getattrlist_args *ap);
+extern int nop_setattrlist(struct vnop_setattrlist_args *ap);
+extern int err_setattrlist(struct vnop_setattrlist_args *ap);

-extern int nop_setattrlist(struct vop_setattrlist_args *ap);
-extern int err_setattrlist(struct vop_setattrlist_args *ap);
+extern int nop_read(struct vnop_read_args *ap);
+extern int err_read(struct vnop_read_args *ap);

-extern int nop_read(struct vop_read_args *ap);
-extern int err_read(struct vop_read_args *ap);
+extern int nop_write(struct vnop_write_args *ap);
+extern int err_write(struct vnop_write_args *ap);

-extern int nop_write(struct vop_write_args *ap);
-extern int err_write(struct vop_write_args *ap);
+extern int nop_ioctl(struct vnop_ioctl_args *ap);
+extern int err_ioctl(struct vnop_ioctl_args *ap);

-extern int nop_lease(struct vop_lease_args *ap);
-extern int err_lease(struct vop_lease_args *ap);
+extern int nop_select(struct vnop_select_args *ap);
+extern int err_select(struct vnop_select_args *ap);
-extern int nop_ioctl(struct vop_ioctl_args *ap);
-extern int err_ioctl(struct vop_ioctl_args *ap);
+extern int nop_exchange(struct vnop_exchange_args *ap);
+extern int err_exchange(struct vnop_exchange_args *ap);

-extern int nop_select(struct vop_select_args *ap);
-extern int err_select(struct vop_select_args *ap);
+extern int nop_revoke(struct vnop_revoke_args *ap);
+extern int err_revoke(struct vnop_revoke_args *ap);

-extern int nop_exchange(struct vop_exchange_args *ap);
-extern int err_exchange(struct vop_exchange_args *ap);
+extern int nop_mmap(struct vnop_mmap_args *ap);
+extern int err_mmap(struct vnop_mmap_args *ap);

-extern int nop_revoke(struct vop_revoke_args *ap);
-extern int err_revoke(struct vop_revoke_args *ap);
+extern int nop_fsync(struct vnop_fsync_args *ap);
+extern int err_fsync(struct vnop_fsync_args *ap);

-extern int nop_mmap(struct vop_mmap_args *ap);
-extern int err_mmap(struct vop_mmap_args *ap);
+extern int nop_remove(struct vnop_remove_args *ap);
+extern int err_remove(struct vnop_remove_args *ap);

-extern int nop_fsync(struct vop_fsync_args *ap);
-extern int err_fsync(struct vop_fsync_args *ap);
+extern int nop_link(struct vnop_link_args *ap);
+extern int err_link(struct vnop_link_args *ap);

-extern int nop_seek(struct vop_seek_args *ap);
-extern int err_seek(struct vop_seek_args *ap);
+extern int nop_rename(struct vnop_rename_args *ap);
+extern int err_rename(struct vnop_rename_args *ap);

-extern int nop_remove(struct vop_remove_args *ap);
-extern int err_remove(struct vop_remove_args *ap);
+extern int nop_mkdir(struct vnop_mkdir_args *ap);
+extern int err_mkdir(struct vnop_mkdir_args *ap);

-extern int nop_link(struct vop_link_args *ap);
-extern int err_link(struct vop_link_args *ap);
+extern int nop_rmdir(struct vnop_rmdir_args *ap);
+extern int err_rmdir(struct vnop_rmdir_args *ap);

-extern int nop_rename(struct vop_rename_args *ap);
-extern int err_rename(struct vop_rename_args *ap);
+extern int nop_symlink(struct vnop_symlink_args *ap);
+extern int err_symlink(struct vnop_symlink_args *ap);

-extern int nop_mkdir(struct vop_mkdir_args *ap);
-extern int err_mkdir(struct vop_mkdir_args *ap);
+extern int nop_readdir(struct vnop_readdir_args *ap);
+extern int err_readdir(struct vnop_readdir_args *ap);

-extern int nop_rmdir(struct vop_rmdir_args *ap);
-extern int err_rmdir(struct vop_rmdir_args *ap);
+extern int nop_readdirattr(struct vnop_readdirattr_args *ap);
+extern int err_readdirattr(struct vnop_readdirattr_args *ap);

-extern int nop_symlink(struct vop_symlink_args *ap);
-extern int err_symlink(struct vop_symlink_args *ap);
+extern int nop_readlink(struct vnop_readlink_args *ap);
+extern int err_readlink(struct vnop_readlink_args *ap);

-extern int nop_readdir(struct vop_readdir_args *ap);
-extern int err_readdir(struct vop_readdir_args *ap);
+extern int nop_inactive(struct vnop_inactive_args *ap);
+extern int err_inactive(struct vnop_inactive_args *ap);

-extern int nop_readdirattr(struct vop_readdirattr_args *ap);
-extern int err_readdirattr(struct vop_readdirattr_args *ap);
+extern int nop_reclaim(struct vnop_reclaim_args *ap);
+extern int err_reclaim(struct vnop_reclaim_args *ap);

-extern int nop_readlink(struct vop_readlink_args *ap);
-extern int err_readlink(struct vop_readlink_args *ap);
-extern int nop_abortop(struct vop_abortop_args *ap);
-extern int err_abortop(struct vop_abortop_args *ap);
+extern int nop_strategy(struct vnop_strategy_args *ap);
+extern int err_strategy(struct vnop_strategy_args *ap);
-extern int nop_inactive(struct vop_inactive_args *ap);
-extern int err_inactive(struct vop_inactive_args *ap);
+extern int nop_pathconf(struct vnop_pathconf_args *ap);
+extern int err_pathconf(struct vnop_pathconf_args *ap);

-extern int nop_reclaim(struct vop_reclaim_args *ap);
-extern int err_reclaim(struct vop_reclaim_args *ap);
+extern int nop_advlock(struct vnop_advlock_args *ap);
+extern int err_advlock(struct vnop_advlock_args *ap);

-extern int nop_lock(struct vop_lock_args *ap);
-extern int err_lock(struct vop_lock_args *ap);
-extern int nop_unlock(struct vop_unlock_args *ap);
-extern int err_unlock(struct vop_unlock_args *ap);
+extern int nop_allocate(struct vnop_allocate_args *ap);
+extern int err_allocate(struct vnop_allocate_args *ap);

-extern int nop_bmap(struct vop_bmap_args *ap);
-extern int err_bmap(struct vop_bmap_args *ap);
+extern int nop_bwrite(struct vnop_bwrite_args *ap);
+extern int err_bwrite(struct vnop_bwrite_args *ap);

-extern int nop_strategy(struct vop_strategy_args *ap);
-extern int err_strategy(struct vop_strategy_args *ap);
+extern int nop_pagein(struct vnop_pagein_args *ap);
+extern int err_pagein(struct vnop_pagein_args *ap);

-extern int nop_print(struct vop_print_args *ap);
-extern int err_print(struct vop_print_args *ap);
+extern int nop_pageout(struct vnop_pageout_args *ap);
+extern int err_pageout(struct vnop_pageout_args *ap);

-extern int nop_islocked(struct vop_islocked_args *ap);
-extern int err_islocked(struct vop_islocked_args *ap);
+extern int nop_devblocksize(struct vnop_devblocksize_args *ap);
+extern int err_devblocksize(struct vnop_devblocksize_args *ap);

-extern int nop_pathconf(struct vop_pathconf_args *ap);
-extern int err_pathconf(struct vop_pathconf_args *ap);
+extern int nop_searchfs(struct vnop_searchfs_args *ap);
+extern int err_searchfs(struct vnop_searchfs_args *ap);

-extern int nop_advlock(struct vop_advlock_args *ap);
-extern int err_advlock(struct vop_advlock_args *ap);
+extern int nop_copyfile(struct vnop_copyfile_args *ap);
+extern int err_copyfile(struct vnop_copyfile_args *ap);

-extern int nop_blkatoff(struct vop_blkatoff_args *ap);
-extern int err_blkatoff(struct vop_blkatoff_args *ap);
+extern int nop_blktooff(struct vnop_blktooff_args *ap);
+extern int err_blktooff(struct vnop_blktooff_args *ap);

-extern int nop_valloc(struct vop_valloc_args *ap);
-extern int err_valloc(struct vop_valloc_args *ap);
+extern int nop_offtoblk(struct vnop_offtoblk_args *ap);
+extern int err_offtoblk(struct vnop_offtoblk_args *ap);

-extern int nop_reallocblks(struct vop_reallocblks_args *ap);
-extern int err_reallocblks(struct vop_reallocblks_args *ap);
+extern int nop_blockmap(struct vnop_blockmap_args *ap);
+extern int err_blockmap(struct vnop_blockmap_args *ap);
+__END_DECLS

-extern int nop_vfree(struct vop_vfree_args *ap);
-extern int err_vfree(struct vop_vfree_args *ap);
-
-extern int nop_truncate(struct vop_truncate_args *ap);
-extern int err_truncate(struct vop_truncate_args *ap);
-
-extern int nop_allocate(struct vop_allocate_args *ap);
-extern int err_allocate(struct vop_allocate_args *ap);
-
-extern int nop_update(struct vop_update_args *ap);
-extern int err_update(struct vop_update_args *ap);
-
-extern int nop_pgrd(struct vop_pgrd_args *ap);
-extern int err_pgrd(struct vop_pgrd_args *ap);
-
-extern int nop_pgwr(struct vop_pgwr_args *ap);
-extern int err_pgwr(struct vop_pgwr_args *ap);
-
-extern int nop_bwrite(struct vop_bwrite_args *ap);
-extern int err_bwrite(struct vop_bwrite_args *ap);
-
-extern int nop_pagein(struct vop_pagein_args *ap);
-extern int err_pagein(struct vop_pagein_args *ap);
-
-extern int nop_pageout(struct vop_pageout_args *ap);
-extern int err_pageout(struct vop_pageout_args *ap);
-
-extern int nop_devblocksize(struct vop_devblocksize_args *ap);
-extern int err_devblocksize(struct vop_devblocksize_args *ap);
-
-extern int nop_searchfs(struct vop_searchfs_args *ap);
-extern int err_searchfs(struct vop_searchfs_args *ap);
-
-extern int nop_copyfile(struct vop_copyfile_args *ap);
-extern int err_copyfile(struct vop_copyfile_args *ap);
-
-extern int nop_blktooff(struct vop_blktooff_args *ap);
-extern int err_blktooff(struct vop_blktooff_args *ap);
-
-extern int nop_offtoblk(struct vop_offtoblk_args *ap);
-extern int err_offtoblk(struct vop_offtoblk_args *ap);
-
-extern int nop_cmap(struct vop_cmap_args *ap);
-extern int err_cmap(struct vop_cmap_args *ap);
 #endif /* _VFS_VFS_SUPPORT_H_ */
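With the declarations now wrapped in __BEGIN_DECLS/__END_DECLS, a filesystem builds its operation vector from these defaults much as before. A sketch of a minimal table in the style of the in-tree specfs/deadfs vectors; the vnodeopv_* table types and vnop_*_desc descriptor names are assumed to come from <sys/vnode.h> and <sys/vnode_if.h>, and "example" names are invented:

#include <sys/vnode.h>
#include <sys/vnode_if.h>
#include <vfs/vfs_support.h>

typedef int (*VOPFUNC)(void *);

int (**example_vnodeop_p)(void *);

struct vnodeopv_entry_desc example_vnodeop_entries[] = {
	{ &vnop_ioctl_desc, (VOPFUNC)nop_ioctl },	/* accept ioctls, do nothing */
	{ &vnop_allocate_desc, (VOPFUNC)err_allocate },	/* preallocation unsupported */
	{ &vnop_blockmap_desc, (VOPFUNC)err_blockmap },	/* no device-block mapping */
	{ (struct vnodeop_desc *)NULL, (VOPFUNC)NULL }
};

struct vnodeopv_desc example_vnodeop_opv_desc =
	{ &example_vnodeop_p, example_vnodeop_entries };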
diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c
index a44f963af..5675ce21f 100644
--- a/bsd/vfs/vfs_syscalls.c
+++ b/bsd/vfs/vfs_syscalls.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1995-2004 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1995-2005 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
@@ -64,28 +64,41 @@
 #include <sys/namei.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
-#include <sys/file.h>
+#include <sys/file_internal.h>
 #include <sys/stat.h>
-#include <sys/vnode.h>
-#include <sys/mount.h>
-#include <sys/proc.h>
-#include <sys/uio.h>
+#include <sys/vnode_internal.h>
+#include <sys/mount_internal.h>
+#include <sys/proc_internal.h>
+#include <sys/kauth.h>
+#include <sys/uio_internal.h>
 #include <sys/malloc.h>
+#include <sys/mman.h>
 #include <sys/dirent.h>
 #include <sys/attr.h>
 #include <sys/sysctl.h>
 #include <sys/ubc.h>
 #include <sys/quota.h>
+#include <sys/kdebug.h>
+#include <sys/fsevents.h>
+#include <sys/sysproto.h>
+#include <sys/xattr.h>
+#include <sys/ubc_internal.h>
+#include <machine/cons.h>
+#include <machine/limits.h>
+#include <miscfs/specfs/specdev.h>

 #include <bsm/audit_kernel.h>
 #include <bsm/audit_kevents.h>

-#include <machine/cons.h>
-#include <miscfs/specfs/specdev.h>
+#include <mach/mach_types.h>
+#include <kern/kern_types.h>
+#include <kern/kalloc.h>
+
+#include <vm/vm_pageout.h>

 #include <architecture/byte_order.h>
+#include <libkern/OSAtomic.h>

-struct lock__bsd__ exchangelock;

 /*
  * The currently logged-in user, for ownership of files/directories whose on-disk
@@ -93,14 +106,61 @@ struct lock__bsd__ exchangelock;
  */
 uid_t console_user;

-static int change_dir __P((struct nameidata *ndp, struct proc *p));
-static void checkdirs __P((struct vnode *olddp));
-static void enablequotas __P((struct proc *p, struct mount *mp));
-void notify_filemod_watchers(struct vnode *vp, struct proc *p);
+static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
+static void checkdirs(struct vnode *olddp, vfs_context_t ctx);
+void enablequotas(struct mount *mp, vfs_context_t ctx);
+static int getfsstat_callback(mount_t mp, void * arg);
+static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
+static int setutimes(vfs_context_t ctx, struct vnode *vp, const struct timespec *ts, int nullflag);
+static int sync_callback(mount_t, void *);
+static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
+			user_addr_t bufp, int *sizep, boolean_t is_64_bit,
+			boolean_t partial_copy);
+
+__private_extern__ int sync_internal(void);
+
+#ifdef __APPLE_API_OBSOLETE
+struct fstatv_args {
+	int fd;			/* file descriptor of the target file */
+	struct vstat *vsb;	/* vstat structure for returned info */
+};
+struct lstatv_args {
+	const char *path;	/* pathname of the target file */
+	struct vstat *vsb;	/* vstat structure for returned info */
+};
+struct mkcomplex_args {
+	const char *path;	/* pathname of the file to be created */
+	mode_t mode;		/* access mode for the newly created file */
+	u_long type;		/* format of the complex file */
+};
+struct statv_args {
+	const char *path;	/* pathname of the target file */
+	struct vstat *vsb;	/* vstat structure for returned info */
+};
+
+int fstatv(struct proc *p, struct fstatv_args *uap, register_t *retval);
+int lstatv(struct proc *p, struct lstatv_args *uap, register_t *retval);
+int mkcomplex(struct proc *p, struct mkcomplex_args *uap, register_t *retval);
+int statv(struct proc *p, struct statv_args *uap, register_t *retval);
+
+#endif /* __APPLE_API_OBSOLETE */
+
+#if UNION
+extern int (**union_vnodeop_p)(void *);
+extern struct vnode *union_dircache(struct vnode*, struct proc*);
+#endif /* UNION */

 /* counts number of mount and unmount operations */
 unsigned int vfs_nummntops=0;

+extern struct fileops vnops;
+
+extern void mount_list_add(mount_t mp);
+extern void mount_list_remove(mount_t mp);
+extern int mount_refdrain(mount_t mp);
+extern int vcount(struct vnode *vp);
+
+
 /*
  * Virtual File System System Calls
  */
@@ -108,36 +168,40 @@ unsigned int vfs_nummntops=0;
 /*
  * Mount a file system.
  */
-struct mount_args {
-	char	*type;
-	char	*path;
-	int	flags;
-	caddr_t	data;
-};
 /* ARGSUSED */
 int
-mount(p, uap, retval)
-	struct proc *p;
-	register struct mount_args *uap;
-	register_t *retval;
+mount(struct proc *p, register struct mount_args *uap, __unused register_t *retval)
 {
 	struct vnode *vp;
+	struct vnode *devvp = NULLVP;
+	struct vnode *device_vnode = NULLVP;
 	struct mount *mp;
-	struct vfsconf *vfsp;
-	int error, flag, err2;
-	struct vattr va;
-	u_long fstypenum;
+	struct vfstable *vfsp;
+	int error, flag = 0;
+	struct vnode_attr va;
+	struct vfs_context context;
 	struct nameidata nd;
+	struct nameidata nd1;
 	char fstypename[MFSNAMELEN];
 	size_t dummy=0;
+	user_addr_t devpath = USER_ADDR_NULL;
+	user_addr_t fsmountargs = uap->data;
+	int ronly = 0;
+	int mntalloc = 0;
+	mode_t accessmode;
+	boolean_t is_64bit;

 	AUDIT_ARG(fflags, uap->flags);

+	context.vc_proc = p;
+	context.vc_ucred = kauth_cred_get();
+	is_64bit = proc_is64bit(p);
+
 	/*
 	 * Get vnode to be covered
 	 */
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, UIO_USERSPACE,
-	    uap->path, p);
+	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1,
+	       UIO_USERSPACE, uap->path, &context);
 	error = namei(&nd);
 	if (error)
 		return (error);
@@ -149,46 +213,44 @@ mount(p, uap, retval)

 	if (uap->flags & MNT_UPDATE) {
 		if ((vp->v_flag & VROOT) == 0) {
-			vput(vp);
-			return (EINVAL);
+			error = EINVAL;
+			goto out1;
 		}
 		mp = vp->v_mount;

-		if (vfs_busy(mp, LK_NOWAIT, 0, p)) {
-			vput(vp);
-			return (EBUSY);
+		/* unmount in progress return error */
+		mount_lock(mp);
+		if (mp->mnt_lflag & MNT_LUNMOUNT) {
+			mount_unlock(mp);
+			error = EBUSY;
+			goto out1;
 		}
+		mount_unlock(mp);
+		lck_rw_lock_exclusive(&mp->mnt_rwlock);
+
 		/*
 		 * We only allow the filesystem to be reloaded if it
 		 * is currently mounted read-only.
 		 */
 		if ((uap->flags & MNT_RELOAD) &&
 		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
-			vfs_unbusy(mp, p);
-			vput(vp);
-			return (EOPNOTSUPP);	/* Needs translation */
+			lck_rw_done(&mp->mnt_rwlock);
+			error = ENOTSUP;
+			goto out1;
 		}
 		/*
 		 * Only root, or the user that did the original mount is
 		 * permitted to update it.
		 */
-		if (mp->mnt_stat.f_owner != p->p_ucred->cr_uid &&
-		    (error = suser(p->p_ucred, &p->p_acflag))) {
-			vfs_unbusy(mp, p);
-			vput(vp);
-			return (error);
+		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(context.vc_ucred) &&
+		    (error = suser(context.vc_ucred, &p->p_acflag))) {
+			lck_rw_done(&mp->mnt_rwlock);
+			goto out1;
 		}
 		/*
-		 * Do not allow NFS export by non-root users. FOr non-root
-		 * users, silently enforce MNT_NOSUID and MNT_NODEV, and
-		 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
+		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
+		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
 		 */
-		if (p->p_ucred->cr_uid != 0) {
-			if (uap->flags & MNT_EXPORTED) {
-				vfs_unbusy(mp, p);
-				vput(vp);
-				return (EPERM);
-			}
+		if (suser(context.vc_ucred, NULL)) {
 			uap->flags |= MNT_NOSUID | MNT_NODEV;
 			if (mp->mnt_flag & MNT_NOEXEC)
 				uap->flags |= MNT_NOEXEC;
@@ -198,81 +260,58 @@ mount(p, uap, retval)
 		mp->mnt_flag |=
 		    uap->flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);

-		VOP_UNLOCK(vp, 0, p);
-
+		vfsp = mp->mnt_vtable;
 		goto update;
 	}
 	/*
 	 * If the user is not root, ensure that they own the directory
 	 * onto which we are attempting to mount.
 	 */
-	if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) ||
-	    (va.va_uid != p->p_ucred->cr_uid &&
-	     (error = suser(p->p_ucred, &p->p_acflag)))) {
-		vput(vp);
-		return (error);
+	VATTR_INIT(&va);
+	VATTR_WANTED(&va, va_uid);
+	if ((error = vnode_getattr(vp, &va, &context)) ||
+	    (va.va_uid != kauth_cred_getuid(context.vc_ucred) &&
+	     (error = suser(context.vc_ucred, &p->p_acflag)))) {
+		goto out1;
 	}
 	/*
-	 * Do not allow NFS export by non-root users. FOr non-root
-	 * users, silently enforce MNT_NOSUID and MNT_NODEV, and
+	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
 	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
 	 */
-	if (p->p_ucred->cr_uid != 0) {
-		if (uap->flags & MNT_EXPORTED) {
-			vput(vp);
-			return (EPERM);
-		}
+	if (suser(context.vc_ucred, NULL)) {
 		uap->flags |= MNT_NOSUID | MNT_NODEV;
 		if (vp->v_mount->mnt_flag & MNT_NOEXEC)
 			uap->flags |= MNT_NOEXEC;
 	}
-	if (error = vinvalbuf(vp, V_SAVE, p->p_ucred, p, 0, 0)) {
-		vput(vp);
-		return (error);
-	}
+	if ( (error = VNOP_FSYNC(vp, MNT_WAIT, &context)) )
+		goto out1;
+
+	if ( (error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)) )
+		goto out1;
+
 	if (vp->v_type != VDIR) {
-		vput(vp);
-		return (ENOTDIR);
-	}
-#if COMPAT_43
-	/*
-	 * Historically filesystem types were identified by number. If we
-	 * get an integer for the filesystem type instead of a string, we
-	 * check to see if it matches one of the historic filesystem types.
-	 */
-	fstypenum = (u_long)uap->type;
-	if (fstypenum < maxvfsconf) {
-		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
-			if (vfsp->vfc_typenum == fstypenum)
-				break;
-		if (vfsp == NULL) {
-			vput(vp);
-			return (ENODEV);
-		}
-		strncpy(fstypename, vfsp->vfc_name, MFSNAMELEN);
-	} else
-#endif /* COMPAT_43 */
-	if (error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy)) {
-		vput(vp);
-		return (error);
+		error = ENOTDIR;
+		goto out1;
 	}
+	if ( (error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy)) )
+		goto out1;
+
 	/* XXXAUDIT: Should we capture the type on the error path as well? */
 	AUDIT_ARG(text, fstypename);
+	mount_list_lock();
 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
 		if (!strcmp(vfsp->vfc_name, fstypename))
 			break;
+	mount_list_unlock();
 	if (vfsp == NULL) {
-		vput(vp);
-		return (ENODEV);
+		error = ENODEV;
+		goto out1;
 	}
-	simple_lock(&vp->v_interlock);
 	if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
-		simple_unlock(&vp->v_interlock);
-		vput(vp);
-		return (EBUSY);
+		error = EBUSY;
+		goto out1;
 	}
 	SET(vp->v_flag, VMOUNT);
-	simple_unlock(&vp->v_interlock);

 	/*
 	 * Allocate and initialize the filesystem.
@@ -280,23 +319,35 @@ mount(p, uap, retval)
 	MALLOC_ZONE(mp, struct mount *, (u_long)sizeof(struct mount),
 		M_MOUNT, M_WAITOK);
 	bzero((char *)mp, (u_long)sizeof(struct mount));
+	mntalloc = 1;

 	/* Initialize the default IO constraints */
 	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
 	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
-
-	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
-	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
+	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
+	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
+	mp->mnt_devblocksize = DEV_BSIZE;
+
+	TAILQ_INIT(&mp->mnt_vnodelist);
+	TAILQ_INIT(&mp->mnt_workerqueue);
+	TAILQ_INIT(&mp->mnt_newvnodes);
+	mount_lock_init(mp);
+	lck_rw_lock_exclusive(&mp->mnt_rwlock);
 	mp->mnt_op = vfsp->vfc_vfsops;
-	mp->mnt_vfc = vfsp;
+	mp->mnt_vtable = vfsp;
+	mount_list_lock();
 	vfsp->vfc_refcount++;
-	mp->mnt_stat.f_type = vfsp->vfc_typenum;
+	mount_list_unlock();
+	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
-	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
+	strncpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
+	strncpy(mp->mnt_vfsstat.f_mntonname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
 	mp->mnt_vnodecovered = vp;
-	mp->mnt_stat.f_owner = p->p_ucred->cr_uid;
-	VOP_UNLOCK(vp, 0, p);
+	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(context.vc_ucred);
+
+	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
+	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);

 update:
 	/*
 	 * Set the mount level flags.
@@ -310,14 +361,107 @@ update:
 			  MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE | MNT_AUTOMOUNTED);
 	mp->mnt_flag |= uap->flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
 			  MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
-			  MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE | MNT_AUTOMOUNTED);
+			  MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE | MNT_AUTOMOUNTED |
+			  MNT_DEFWRITE);
+
+	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
+		if (is_64bit) {
+			if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
+				goto out1;
+			fsmountargs += sizeof(devpath);
+		} else {
+			char *tmp;
+			if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
+				goto out1;
+			/* munge into LP64 addr */
+			devpath = CAST_USER_ADDR_T(tmp);
+			fsmountargs += sizeof(tmp);
+		}
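The copyin sequence just above is the new mixed-width userland idiom: the mount arguments begin with a user pointer whose size depends on the calling process, and a 32-bit value is widened into the LP64-safe user_addr_t. The same idiom in isolation; this is a sketch, and uint32_t stands in for the kernel-pointer-sized `char *tmp` used in the patch:

#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/types.h>

static int
example_copyin_user_pointer(proc_t p, user_addr_t uaddr, user_addr_t *out)
{
	int error;

	if (proc_is64bit(p)) {
		/* 64-bit caller: the field is already a full user_addr_t. */
		error = copyin(uaddr, (caddr_t)out, sizeof(user_addr_t));
	} else {
		uint32_t tmp32;

		/* 32-bit caller: fetch 4 bytes, then zero-extend. */
		error = copyin(uaddr, (caddr_t)&tmp32, sizeof(tmp32));
		if (error == 0)
			*out = CAST_USER_ADDR_T(tmp32);
	}
	return (error);
}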
+
+		/* if it is not update and device name needs to be parsed */
+		if ((devpath)) {
+			NDINIT(&nd1, LOOKUP, FOLLOW, UIO_USERSPACE, devpath, &context);
+			if ( (error = namei(&nd1)) )
+				goto out1;
+
+			strncpy(mp->mnt_vfsstat.f_mntfromname, nd1.ni_cnd.cn_pnbuf, MAXPATHLEN);
+			devvp = nd1.ni_vp;
+
+			nameidone(&nd1);
+
+			if (devvp->v_type != VBLK) {
+				error = ENOTBLK;
+				goto out2;
+			}
+			if (major(devvp->v_rdev) >= nblkdev) {
+				error = ENXIO;
+				goto out2;
+			}
+			/*
+			 * If mount by non-root, then verify that user has necessary
+			 * permissions on the device.
+			 */
+			if (suser(context.vc_ucred, NULL) != 0) {
+				accessmode = KAUTH_VNODE_READ_DATA;
+				if ((mp->mnt_flag & MNT_RDONLY) == 0)
+					accessmode |= KAUTH_VNODE_WRITE_DATA;
+				if ((error = vnode_authorize(devvp, NULL, accessmode, &context)) != 0)
+					goto out2;
+			}
+		}
+		if (devpath && ((uap->flags & MNT_UPDATE) == 0)) {
+			if ( (error = vnode_ref(devvp)) )
+				goto out2;
+			/*
+			 * Disallow multiple mounts of the same device.
+			 * Disallow mounting of a device that is currently in use
+			 * (except for root, which might share swap device for miniroot).
+			 * Flush out any old buffers remaining from a previous use.
+			 */
+			if ( (error = vfs_mountedon(devvp)) )
+				goto out3;
+
+			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
+				error = EBUSY;
+				goto out3;
+			}
+			if ( (error = VNOP_FSYNC(devvp, MNT_WAIT, &context)) ) {
+				error = ENOTBLK;
+				goto out3;
+			}
+			if ( (error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0)) )
+				goto out3;
+
+			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
+			if ( (error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, &context)) )
+				goto out3;
+
+			mp->mnt_devvp = devvp;
+			device_vnode = devvp;
+		} else {
+			if ((mp->mnt_flag & MNT_RDONLY) && (mp->mnt_kern_flag & MNTK_WANTRDWR)) {
+				/*
+				 * If upgrade to read-write by non-root, then verify
+				 * that user has necessary permissions on the device.
+				 */
+				device_vnode = mp->mnt_devvp;
+				if (device_vnode && suser(context.vc_ucred, NULL)) {
+					if ((error = vnode_authorize(device_vnode, NULL,
+						 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, &context)) != 0)
+						goto out2;
+				}
+			}
+			device_vnode = NULLVP;
+		}
+	}
+
+
 	/*
 	 * Mount the filesystem.
 	 */
-	error = VFS_MOUNT(mp, uap->path, uap->data, &nd, p);
+	error = VFS_MOUNT(mp, device_vnode, fsmountargs, &context);

 	if (uap->flags & MNT_UPDATE) {
-		vrele(vp);
 		if (mp->mnt_kern_flag & MNTK_WANTRDWR)
 			mp->mnt_flag &= ~MNT_RDONLY;
 		mp->mnt_flag &=~
@@ -325,73 +469,102 @@ update:
 		mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
 		if (error)
 			mp->mnt_flag = flag;
-		vfs_unbusy(mp, p);
+		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
+		lck_rw_done(&mp->mnt_rwlock);
 		if (!error)
-			enablequotas(p, mp);
-		return (error);
+			enablequotas(mp,&context);
+		goto out2;
 	}
-
-	/* get the vnode lock */
-	err2 = vn_lock(vp, LK_EXCLUSIVE|LK_RETRY, p);
-
 	/*
 	 * Put the new filesystem on the mount list after root.
 	 */
-	cache_purge(vp);
-	if (!error && !err2) {
-		simple_lock(&vp->v_interlock);
+	if (!error) {
 		CLR(vp->v_flag, VMOUNT);
-		vp->v_mountedhere =mp;
-		simple_unlock(&vp->v_interlock);
-		simple_lock(&mountlist_slock);
-		CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
-		simple_unlock(&mountlist_slock);
-		vfs_event_signal(NULL, VQ_MOUNT, NULL);
-		checkdirs(vp);
-		VOP_UNLOCK(vp, 0, p);
-		vfs_unbusy(mp, p);
-		if (error = VFS_START(mp, 0, p))
-			vrele(vp);
+
+		vnode_lock(vp);
+		vp->v_mountedhere = mp;
+		vnode_unlock(vp);
+
+		vnode_ref(vp);
+
+		vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
+		checkdirs(vp, &context);
+		lck_rw_done(&mp->